{ "best_global_step": 15962, "best_metric": 0.46267828345298767, "best_model_checkpoint": "saves_multiple/prefix-tuning/llama-3-8b-instruct/train_hellaswag_456_1760637853/checkpoint-15962", "epoch": 20.0, "eval_steps": 15962, "global_step": 159620, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0006264879087833605, "grad_norm": 222.64315795898438, "learning_rate": 2.505951635133442e-09, "loss": 9.4635, "num_input_tokens_seen": 6016, "step": 5 }, { "epoch": 0.001252975817566721, "grad_norm": 228.58648681640625, "learning_rate": 5.638391179050244e-09, "loss": 9.4542, "num_input_tokens_seen": 12256, "step": 10 }, { "epoch": 0.0018794637263500815, "grad_norm": 219.58265686035156, "learning_rate": 8.770830722967047e-09, "loss": 9.4314, "num_input_tokens_seen": 18496, "step": 15 }, { "epoch": 0.002505951635133442, "grad_norm": 225.51806640625, "learning_rate": 1.190327026688385e-08, "loss": 9.661, "num_input_tokens_seen": 24672, "step": 20 }, { "epoch": 0.0031324395439168025, "grad_norm": 189.77232360839844, "learning_rate": 1.5035709810800653e-08, "loss": 9.4882, "num_input_tokens_seen": 30880, "step": 25 }, { "epoch": 0.003758927452700163, "grad_norm": 218.5586700439453, "learning_rate": 1.8168149354717455e-08, "loss": 9.5404, "num_input_tokens_seen": 37024, "step": 30 }, { "epoch": 0.004385415361483523, "grad_norm": 239.15696716308594, "learning_rate": 2.1300588898634258e-08, "loss": 9.5128, "num_input_tokens_seen": 43264, "step": 35 }, { "epoch": 0.005011903270266884, "grad_norm": 218.03456115722656, "learning_rate": 2.443302844255106e-08, "loss": 9.1796, "num_input_tokens_seen": 49024, "step": 40 }, { "epoch": 0.005638391179050245, "grad_norm": 215.14500427246094, "learning_rate": 2.7565467986467864e-08, "loss": 9.1563, "num_input_tokens_seen": 54912, "step": 45 }, { "epoch": 0.006264879087833605, "grad_norm": 194.5288848876953, "learning_rate": 3.069790753038467e-08, "loss": 9.2362, "num_input_tokens_seen": 61248, "step": 50 }, { "epoch": 0.0068913669966169655, "grad_norm": 208.7165069580078, "learning_rate": 3.3830347074301466e-08, "loss": 9.3237, "num_input_tokens_seen": 67488, "step": 55 }, { "epoch": 0.007517854905400326, "grad_norm": 193.67079162597656, "learning_rate": 3.696278661821827e-08, "loss": 9.4766, "num_input_tokens_seen": 73600, "step": 60 }, { "epoch": 0.008144342814183686, "grad_norm": 195.70245361328125, "learning_rate": 4.009522616213507e-08, "loss": 9.2325, "num_input_tokens_seen": 79936, "step": 65 }, { "epoch": 0.008770830722967046, "grad_norm": 214.2272491455078, "learning_rate": 4.322766570605188e-08, "loss": 9.2168, "num_input_tokens_seen": 86112, "step": 70 }, { "epoch": 0.009397318631750407, "grad_norm": 195.04568481445312, "learning_rate": 4.636010524996868e-08, "loss": 9.0704, "num_input_tokens_seen": 92192, "step": 75 }, { "epoch": 0.010023806540533768, "grad_norm": 195.9908905029297, "learning_rate": 4.9492544793885487e-08, "loss": 9.2227, "num_input_tokens_seen": 98400, "step": 80 }, { "epoch": 0.010650294449317128, "grad_norm": 190.04541015625, "learning_rate": 5.262498433780228e-08, "loss": 8.6287, "num_input_tokens_seen": 104512, "step": 85 }, { "epoch": 0.01127678235810049, "grad_norm": 228.77049255371094, "learning_rate": 5.575742388171909e-08, "loss": 8.9181, "num_input_tokens_seen": 109952, "step": 90 }, { "epoch": 0.011903270266883849, "grad_norm": 178.7974853515625, "learning_rate": 5.888986342563589e-08, "loss": 8.8369, "num_input_tokens_seen": 116384, "step": 95 }, { "epoch": 0.01252975817566721, "grad_norm": 182.3853759765625, "learning_rate": 6.20223029695527e-08, "loss": 8.5497, "num_input_tokens_seen": 122496, "step": 100 }, { "epoch": 0.01315624608445057, "grad_norm": 181.8325958251953, "learning_rate": 6.515474251346949e-08, "loss": 8.4464, "num_input_tokens_seen": 128224, "step": 105 }, { "epoch": 0.013782733993233931, "grad_norm": 172.76138305664062, "learning_rate": 6.82871820573863e-08, "loss": 8.5179, "num_input_tokens_seen": 134528, "step": 110 }, { "epoch": 0.01440922190201729, "grad_norm": 180.55270385742188, "learning_rate": 7.141962160130309e-08, "loss": 8.5963, "num_input_tokens_seen": 140704, "step": 115 }, { "epoch": 0.015035709810800652, "grad_norm": 172.9241943359375, "learning_rate": 7.455206114521991e-08, "loss": 8.1082, "num_input_tokens_seen": 146688, "step": 120 }, { "epoch": 0.015662197719584013, "grad_norm": 182.11325073242188, "learning_rate": 7.76845006891367e-08, "loss": 8.4413, "num_input_tokens_seen": 152768, "step": 125 }, { "epoch": 0.016288685628367373, "grad_norm": 176.47850036621094, "learning_rate": 8.08169402330535e-08, "loss": 8.1213, "num_input_tokens_seen": 158688, "step": 130 }, { "epoch": 0.016915173537150732, "grad_norm": 174.80455017089844, "learning_rate": 8.394937977697032e-08, "loss": 8.2597, "num_input_tokens_seen": 164992, "step": 135 }, { "epoch": 0.017541661445934092, "grad_norm": 166.89193725585938, "learning_rate": 8.708181932088712e-08, "loss": 7.9136, "num_input_tokens_seen": 170368, "step": 140 }, { "epoch": 0.018168149354717455, "grad_norm": 166.81422424316406, "learning_rate": 9.021425886480391e-08, "loss": 8.0085, "num_input_tokens_seen": 176640, "step": 145 }, { "epoch": 0.018794637263500814, "grad_norm": 151.4792938232422, "learning_rate": 9.334669840872073e-08, "loss": 8.0992, "num_input_tokens_seen": 182880, "step": 150 }, { "epoch": 0.019421125172284174, "grad_norm": 154.0270233154297, "learning_rate": 9.647913795263753e-08, "loss": 8.0138, "num_input_tokens_seen": 189152, "step": 155 }, { "epoch": 0.020047613081067537, "grad_norm": 169.66554260253906, "learning_rate": 9.961157749655432e-08, "loss": 7.733, "num_input_tokens_seen": 195200, "step": 160 }, { "epoch": 0.020674100989850897, "grad_norm": 162.11216735839844, "learning_rate": 1.0274401704047112e-07, "loss": 7.5372, "num_input_tokens_seen": 201440, "step": 165 }, { "epoch": 0.021300588898634256, "grad_norm": 155.13035583496094, "learning_rate": 1.0587645658438794e-07, "loss": 7.5105, "num_input_tokens_seen": 207456, "step": 170 }, { "epoch": 0.021927076807417616, "grad_norm": 143.98956298828125, "learning_rate": 1.0900889612830474e-07, "loss": 7.1749, "num_input_tokens_seen": 213856, "step": 175 }, { "epoch": 0.02255356471620098, "grad_norm": 135.77914428710938, "learning_rate": 1.1214133567222153e-07, "loss": 7.1499, "num_input_tokens_seen": 219936, "step": 180 }, { "epoch": 0.023180052624984338, "grad_norm": 146.36996459960938, "learning_rate": 1.1527377521613833e-07, "loss": 7.1531, "num_input_tokens_seen": 226208, "step": 185 }, { "epoch": 0.023806540533767698, "grad_norm": 146.31796264648438, "learning_rate": 1.1840621476005515e-07, "loss": 7.0643, "num_input_tokens_seen": 232256, "step": 190 }, { "epoch": 0.024433028442551057, "grad_norm": 132.727783203125, "learning_rate": 1.2153865430397194e-07, "loss": 6.968, "num_input_tokens_seen": 238496, "step": 195 }, { "epoch": 0.02505951635133442, "grad_norm": 142.85699462890625, "learning_rate": 1.2467109384788875e-07, "loss": 6.8027, "num_input_tokens_seen": 244352, "step": 200 }, { "epoch": 0.02568600426011778, "grad_norm": 132.93508911132812, "learning_rate": 1.2780353339180557e-07, "loss": 6.3112, "num_input_tokens_seen": 250208, "step": 205 }, { "epoch": 0.02631249216890114, "grad_norm": 150.3761444091797, "learning_rate": 1.3093597293572236e-07, "loss": 6.6275, "num_input_tokens_seen": 256576, "step": 210 }, { "epoch": 0.0269389800776845, "grad_norm": 129.2598419189453, "learning_rate": 1.3406841247963915e-07, "loss": 6.3812, "num_input_tokens_seen": 262784, "step": 215 }, { "epoch": 0.027565467986467862, "grad_norm": 118.01812744140625, "learning_rate": 1.3720085202355596e-07, "loss": 6.1037, "num_input_tokens_seen": 268224, "step": 220 }, { "epoch": 0.02819195589525122, "grad_norm": 130.34352111816406, "learning_rate": 1.4033329156747275e-07, "loss": 5.8142, "num_input_tokens_seen": 274176, "step": 225 }, { "epoch": 0.02881844380403458, "grad_norm": 123.91333770751953, "learning_rate": 1.4346573111138957e-07, "loss": 6.0871, "num_input_tokens_seen": 280224, "step": 230 }, { "epoch": 0.029444931712817944, "grad_norm": 127.60879516601562, "learning_rate": 1.4659817065530636e-07, "loss": 5.8009, "num_input_tokens_seen": 286272, "step": 235 }, { "epoch": 0.030071419621601304, "grad_norm": 126.22982025146484, "learning_rate": 1.4973061019922315e-07, "loss": 5.7613, "num_input_tokens_seen": 292576, "step": 240 }, { "epoch": 0.030697907530384663, "grad_norm": 124.39469146728516, "learning_rate": 1.5286304974313997e-07, "loss": 5.6407, "num_input_tokens_seen": 298656, "step": 245 }, { "epoch": 0.031324395439168026, "grad_norm": 116.06964874267578, "learning_rate": 1.5599548928705678e-07, "loss": 5.2087, "num_input_tokens_seen": 304832, "step": 250 }, { "epoch": 0.03195088334795138, "grad_norm": 124.47612762451172, "learning_rate": 1.591279288309736e-07, "loss": 5.5553, "num_input_tokens_seen": 311136, "step": 255 }, { "epoch": 0.032577371256734745, "grad_norm": 137.56930541992188, "learning_rate": 1.6226036837489036e-07, "loss": 5.2333, "num_input_tokens_seen": 317632, "step": 260 }, { "epoch": 0.03320385916551811, "grad_norm": 135.53575134277344, "learning_rate": 1.6539280791880718e-07, "loss": 5.1714, "num_input_tokens_seen": 323808, "step": 265 }, { "epoch": 0.033830347074301464, "grad_norm": 112.84645080566406, "learning_rate": 1.6852524746272397e-07, "loss": 5.0026, "num_input_tokens_seen": 330272, "step": 270 }, { "epoch": 0.03445683498308483, "grad_norm": 119.58861541748047, "learning_rate": 1.7165768700664078e-07, "loss": 5.0008, "num_input_tokens_seen": 336384, "step": 275 }, { "epoch": 0.035083322891868184, "grad_norm": 114.77474212646484, "learning_rate": 1.747901265505576e-07, "loss": 4.8432, "num_input_tokens_seen": 342592, "step": 280 }, { "epoch": 0.03570981080065155, "grad_norm": 113.01918029785156, "learning_rate": 1.779225660944744e-07, "loss": 4.407, "num_input_tokens_seen": 348768, "step": 285 }, { "epoch": 0.03633629870943491, "grad_norm": 126.70811462402344, "learning_rate": 1.810550056383912e-07, "loss": 4.5516, "num_input_tokens_seen": 354624, "step": 290 }, { "epoch": 0.036962786618218266, "grad_norm": 107.17869567871094, "learning_rate": 1.8418744518230802e-07, "loss": 4.4063, "num_input_tokens_seen": 360160, "step": 295 }, { "epoch": 0.03758927452700163, "grad_norm": 99.74629211425781, "learning_rate": 1.8731988472622478e-07, "loss": 4.2367, "num_input_tokens_seen": 366272, "step": 300 }, { "epoch": 0.03821576243578499, "grad_norm": 118.44390106201172, "learning_rate": 1.904523242701416e-07, "loss": 4.0183, "num_input_tokens_seen": 372480, "step": 305 }, { "epoch": 0.03884225034456835, "grad_norm": 101.59062194824219, "learning_rate": 1.9358476381405841e-07, "loss": 3.934, "num_input_tokens_seen": 378624, "step": 310 }, { "epoch": 0.03946873825335171, "grad_norm": 103.8401107788086, "learning_rate": 1.967172033579752e-07, "loss": 3.9271, "num_input_tokens_seen": 384960, "step": 315 }, { "epoch": 0.040095226162135074, "grad_norm": 99.08815002441406, "learning_rate": 1.9984964290189202e-07, "loss": 3.8118, "num_input_tokens_seen": 391264, "step": 320 }, { "epoch": 0.04072171407091843, "grad_norm": 111.7527847290039, "learning_rate": 2.029820824458088e-07, "loss": 3.5804, "num_input_tokens_seen": 397504, "step": 325 }, { "epoch": 0.04134820197970179, "grad_norm": 90.0573959350586, "learning_rate": 2.061145219897256e-07, "loss": 3.3528, "num_input_tokens_seen": 403488, "step": 330 }, { "epoch": 0.04197468988848515, "grad_norm": 93.15338134765625, "learning_rate": 2.0924696153364242e-07, "loss": 3.3083, "num_input_tokens_seen": 409664, "step": 335 }, { "epoch": 0.04260117779726851, "grad_norm": 91.78150177001953, "learning_rate": 2.123794010775592e-07, "loss": 3.1042, "num_input_tokens_seen": 415808, "step": 340 }, { "epoch": 0.043227665706051875, "grad_norm": 87.71438598632812, "learning_rate": 2.1551184062147602e-07, "loss": 3.1313, "num_input_tokens_seen": 421888, "step": 345 }, { "epoch": 0.04385415361483523, "grad_norm": 84.88099670410156, "learning_rate": 2.1864428016539284e-07, "loss": 2.9453, "num_input_tokens_seen": 428032, "step": 350 }, { "epoch": 0.044480641523618594, "grad_norm": 89.1611099243164, "learning_rate": 2.2177671970930963e-07, "loss": 2.8971, "num_input_tokens_seen": 434208, "step": 355 }, { "epoch": 0.04510712943240196, "grad_norm": 91.11591339111328, "learning_rate": 2.2490915925322644e-07, "loss": 2.696, "num_input_tokens_seen": 440096, "step": 360 }, { "epoch": 0.04573361734118531, "grad_norm": 83.5699462890625, "learning_rate": 2.2804159879714326e-07, "loss": 2.4973, "num_input_tokens_seen": 446080, "step": 365 }, { "epoch": 0.046360105249968676, "grad_norm": 82.11884307861328, "learning_rate": 2.3117403834106002e-07, "loss": 2.4986, "num_input_tokens_seen": 452224, "step": 370 }, { "epoch": 0.04698659315875204, "grad_norm": 91.9319076538086, "learning_rate": 2.3430647788497684e-07, "loss": 2.2055, "num_input_tokens_seen": 458144, "step": 375 }, { "epoch": 0.047613081067535395, "grad_norm": 104.39422607421875, "learning_rate": 2.3743891742889363e-07, "loss": 2.1897, "num_input_tokens_seen": 463968, "step": 380 }, { "epoch": 0.04823956897631876, "grad_norm": 75.97245025634766, "learning_rate": 2.4057135697281044e-07, "loss": 2.1656, "num_input_tokens_seen": 470080, "step": 385 }, { "epoch": 0.048866056885102115, "grad_norm": 91.62303924560547, "learning_rate": 2.4370379651672726e-07, "loss": 1.9978, "num_input_tokens_seen": 476288, "step": 390 }, { "epoch": 0.04949254479388548, "grad_norm": 66.57998657226562, "learning_rate": 2.46836236060644e-07, "loss": 1.9337, "num_input_tokens_seen": 482112, "step": 395 }, { "epoch": 0.05011903270266884, "grad_norm": 73.30902099609375, "learning_rate": 2.4996867560456084e-07, "loss": 1.8764, "num_input_tokens_seen": 488288, "step": 400 }, { "epoch": 0.0507455206114522, "grad_norm": 65.71503448486328, "learning_rate": 2.5310111514847765e-07, "loss": 1.7185, "num_input_tokens_seen": 493568, "step": 405 }, { "epoch": 0.05137200852023556, "grad_norm": 75.19815826416016, "learning_rate": 2.5623355469239447e-07, "loss": 1.5748, "num_input_tokens_seen": 499584, "step": 410 }, { "epoch": 0.05199849642901892, "grad_norm": 59.189910888671875, "learning_rate": 2.593659942363113e-07, "loss": 1.4034, "num_input_tokens_seen": 504896, "step": 415 }, { "epoch": 0.05262498433780228, "grad_norm": 57.2137451171875, "learning_rate": 2.6249843378022805e-07, "loss": 1.5026, "num_input_tokens_seen": 511136, "step": 420 }, { "epoch": 0.05325147224658564, "grad_norm": 57.15275573730469, "learning_rate": 2.6563087332414487e-07, "loss": 1.3944, "num_input_tokens_seen": 517376, "step": 425 }, { "epoch": 0.053877960155369, "grad_norm": 55.23196029663086, "learning_rate": 2.687633128680617e-07, "loss": 1.2784, "num_input_tokens_seen": 523552, "step": 430 }, { "epoch": 0.05450444806415236, "grad_norm": 70.77559661865234, "learning_rate": 2.7189575241197844e-07, "loss": 1.2548, "num_input_tokens_seen": 530080, "step": 435 }, { "epoch": 0.055130935972935724, "grad_norm": 66.21360778808594, "learning_rate": 2.7502819195589526e-07, "loss": 1.0438, "num_input_tokens_seen": 535968, "step": 440 }, { "epoch": 0.05575742388171908, "grad_norm": 64.93487548828125, "learning_rate": 2.781606314998121e-07, "loss": 1.1344, "num_input_tokens_seen": 541984, "step": 445 }, { "epoch": 0.05638391179050244, "grad_norm": 43.45860290527344, "learning_rate": 2.812930710437289e-07, "loss": 1.0407, "num_input_tokens_seen": 548000, "step": 450 }, { "epoch": 0.057010399699285806, "grad_norm": 36.24368667602539, "learning_rate": 2.844255105876457e-07, "loss": 0.9932, "num_input_tokens_seen": 554400, "step": 455 }, { "epoch": 0.05763688760806916, "grad_norm": 33.39188766479492, "learning_rate": 2.875579501315625e-07, "loss": 0.8958, "num_input_tokens_seen": 560448, "step": 460 }, { "epoch": 0.058263375516852525, "grad_norm": 49.13450241088867, "learning_rate": 2.906903896754793e-07, "loss": 0.9027, "num_input_tokens_seen": 566592, "step": 465 }, { "epoch": 0.05888986342563589, "grad_norm": 32.44865798950195, "learning_rate": 2.938228292193961e-07, "loss": 0.7879, "num_input_tokens_seen": 572640, "step": 470 }, { "epoch": 0.059516351334419244, "grad_norm": 39.50494384765625, "learning_rate": 2.9695526876331287e-07, "loss": 0.8894, "num_input_tokens_seen": 578912, "step": 475 }, { "epoch": 0.06014283924320261, "grad_norm": 33.205406188964844, "learning_rate": 3.000877083072297e-07, "loss": 0.7604, "num_input_tokens_seen": 585152, "step": 480 }, { "epoch": 0.06076932715198596, "grad_norm": 39.930320739746094, "learning_rate": 3.032201478511465e-07, "loss": 0.7433, "num_input_tokens_seen": 591296, "step": 485 }, { "epoch": 0.061395815060769326, "grad_norm": 41.53486633300781, "learning_rate": 3.0635258739506326e-07, "loss": 0.6976, "num_input_tokens_seen": 597440, "step": 490 }, { "epoch": 0.06202230296955269, "grad_norm": 46.13853454589844, "learning_rate": 3.094850269389801e-07, "loss": 0.6489, "num_input_tokens_seen": 603648, "step": 495 }, { "epoch": 0.06264879087833605, "grad_norm": 34.06405258178711, "learning_rate": 3.126174664828969e-07, "loss": 0.6761, "num_input_tokens_seen": 609696, "step": 500 }, { "epoch": 0.0632752787871194, "grad_norm": 21.57476806640625, "learning_rate": 3.157499060268137e-07, "loss": 0.7154, "num_input_tokens_seen": 615776, "step": 505 }, { "epoch": 0.06390176669590276, "grad_norm": 26.224761962890625, "learning_rate": 3.188823455707305e-07, "loss": 0.664, "num_input_tokens_seen": 621856, "step": 510 }, { "epoch": 0.06452825460468613, "grad_norm": 33.287437438964844, "learning_rate": 3.2201478511464734e-07, "loss": 0.6734, "num_input_tokens_seen": 628000, "step": 515 }, { "epoch": 0.06515474251346949, "grad_norm": 42.83485412597656, "learning_rate": 3.2514722465856416e-07, "loss": 0.6762, "num_input_tokens_seen": 633984, "step": 520 }, { "epoch": 0.06578123042225285, "grad_norm": 41.54075622558594, "learning_rate": 3.2827966420248087e-07, "loss": 0.6445, "num_input_tokens_seen": 640416, "step": 525 }, { "epoch": 0.06640771833103622, "grad_norm": 40.839996337890625, "learning_rate": 3.314121037463977e-07, "loss": 0.6441, "num_input_tokens_seen": 646432, "step": 530 }, { "epoch": 0.06703420623981957, "grad_norm": 26.813610076904297, "learning_rate": 3.345445432903145e-07, "loss": 0.6235, "num_input_tokens_seen": 652512, "step": 535 }, { "epoch": 0.06766069414860293, "grad_norm": 33.96189498901367, "learning_rate": 3.376769828342313e-07, "loss": 0.5853, "num_input_tokens_seen": 658400, "step": 540 }, { "epoch": 0.06828718205738629, "grad_norm": 36.48435592651367, "learning_rate": 3.4080942237814813e-07, "loss": 0.5949, "num_input_tokens_seen": 664288, "step": 545 }, { "epoch": 0.06891366996616966, "grad_norm": 37.39591979980469, "learning_rate": 3.439418619220649e-07, "loss": 0.6108, "num_input_tokens_seen": 670336, "step": 550 }, { "epoch": 0.06954015787495302, "grad_norm": 27.386489868164062, "learning_rate": 3.470743014659817e-07, "loss": 0.5911, "num_input_tokens_seen": 676320, "step": 555 }, { "epoch": 0.07016664578373637, "grad_norm": 27.639785766601562, "learning_rate": 3.5020674100989853e-07, "loss": 0.5462, "num_input_tokens_seen": 682560, "step": 560 }, { "epoch": 0.07079313369251973, "grad_norm": 31.611108779907227, "learning_rate": 3.5333918055381534e-07, "loss": 0.5814, "num_input_tokens_seen": 688672, "step": 565 }, { "epoch": 0.0714196216013031, "grad_norm": 32.82149124145508, "learning_rate": 3.5647162009773216e-07, "loss": 0.5699, "num_input_tokens_seen": 694112, "step": 570 }, { "epoch": 0.07204610951008646, "grad_norm": 38.891544342041016, "learning_rate": 3.59604059641649e-07, "loss": 0.6142, "num_input_tokens_seen": 699840, "step": 575 }, { "epoch": 0.07267259741886982, "grad_norm": 32.49098587036133, "learning_rate": 3.6273649918556574e-07, "loss": 0.5524, "num_input_tokens_seen": 706144, "step": 580 }, { "epoch": 0.07329908532765318, "grad_norm": 46.85624313354492, "learning_rate": 3.6586893872948255e-07, "loss": 0.5696, "num_input_tokens_seen": 712256, "step": 585 }, { "epoch": 0.07392557323643653, "grad_norm": 20.053483963012695, "learning_rate": 3.6900137827339937e-07, "loss": 0.5631, "num_input_tokens_seen": 718368, "step": 590 }, { "epoch": 0.0745520611452199, "grad_norm": 30.71172523498535, "learning_rate": 3.721338178173162e-07, "loss": 0.5335, "num_input_tokens_seen": 724704, "step": 595 }, { "epoch": 0.07517854905400326, "grad_norm": 33.323036193847656, "learning_rate": 3.75266257361233e-07, "loss": 0.5382, "num_input_tokens_seen": 730592, "step": 600 }, { "epoch": 0.07580503696278662, "grad_norm": 25.358970642089844, "learning_rate": 3.783986969051497e-07, "loss": 0.5443, "num_input_tokens_seen": 736672, "step": 605 }, { "epoch": 0.07643152487156998, "grad_norm": 23.391801834106445, "learning_rate": 3.8153113644906653e-07, "loss": 0.5441, "num_input_tokens_seen": 742560, "step": 610 }, { "epoch": 0.07705801278035333, "grad_norm": 34.380882263183594, "learning_rate": 3.8466357599298334e-07, "loss": 0.5391, "num_input_tokens_seen": 748416, "step": 615 }, { "epoch": 0.0776845006891367, "grad_norm": 31.19721221923828, "learning_rate": 3.8779601553690016e-07, "loss": 0.5399, "num_input_tokens_seen": 754528, "step": 620 }, { "epoch": 0.07831098859792006, "grad_norm": 29.844133377075195, "learning_rate": 3.90928455080817e-07, "loss": 0.533, "num_input_tokens_seen": 760576, "step": 625 }, { "epoch": 0.07893747650670342, "grad_norm": 34.21921920776367, "learning_rate": 3.940608946247338e-07, "loss": 0.5357, "num_input_tokens_seen": 766816, "step": 630 }, { "epoch": 0.07956396441548678, "grad_norm": 32.41375732421875, "learning_rate": 3.9719333416865056e-07, "loss": 0.5208, "num_input_tokens_seen": 772608, "step": 635 }, { "epoch": 0.08019045232427015, "grad_norm": 21.379487991333008, "learning_rate": 4.0032577371256737e-07, "loss": 0.5013, "num_input_tokens_seen": 778624, "step": 640 }, { "epoch": 0.0808169402330535, "grad_norm": 35.953487396240234, "learning_rate": 4.034582132564842e-07, "loss": 0.5423, "num_input_tokens_seen": 784832, "step": 645 }, { "epoch": 0.08144342814183686, "grad_norm": 18.199064254760742, "learning_rate": 4.06590652800401e-07, "loss": 0.5361, "num_input_tokens_seen": 791232, "step": 650 }, { "epoch": 0.08206991605062022, "grad_norm": 39.40612030029297, "learning_rate": 4.097230923443178e-07, "loss": 0.5703, "num_input_tokens_seen": 796800, "step": 655 }, { "epoch": 0.08269640395940359, "grad_norm": 36.42975616455078, "learning_rate": 4.128555318882346e-07, "loss": 0.5397, "num_input_tokens_seen": 802304, "step": 660 }, { "epoch": 0.08332289186818695, "grad_norm": 22.044940948486328, "learning_rate": 4.159879714321514e-07, "loss": 0.4739, "num_input_tokens_seen": 808512, "step": 665 }, { "epoch": 0.0839493797769703, "grad_norm": 21.399011611938477, "learning_rate": 4.191204109760682e-07, "loss": 0.5032, "num_input_tokens_seen": 814496, "step": 670 }, { "epoch": 0.08457586768575366, "grad_norm": 22.888439178466797, "learning_rate": 4.2225285051998503e-07, "loss": 0.486, "num_input_tokens_seen": 820672, "step": 675 }, { "epoch": 0.08520235559453702, "grad_norm": 42.067344665527344, "learning_rate": 4.2538529006390185e-07, "loss": 0.5313, "num_input_tokens_seen": 826880, "step": 680 }, { "epoch": 0.08582884350332039, "grad_norm": 33.20975112915039, "learning_rate": 4.285177296078186e-07, "loss": 0.5837, "num_input_tokens_seen": 832928, "step": 685 }, { "epoch": 0.08645533141210375, "grad_norm": 31.4372615814209, "learning_rate": 4.3165016915173537e-07, "loss": 0.4955, "num_input_tokens_seen": 839104, "step": 690 }, { "epoch": 0.08708181932088711, "grad_norm": 24.342304229736328, "learning_rate": 4.347826086956522e-07, "loss": 0.5106, "num_input_tokens_seen": 845344, "step": 695 }, { "epoch": 0.08770830722967046, "grad_norm": 39.95219039916992, "learning_rate": 4.37915048239569e-07, "loss": 0.5187, "num_input_tokens_seen": 851616, "step": 700 }, { "epoch": 0.08833479513845383, "grad_norm": 32.78697204589844, "learning_rate": 4.410474877834858e-07, "loss": 0.508, "num_input_tokens_seen": 857600, "step": 705 }, { "epoch": 0.08896128304723719, "grad_norm": 27.297208786010742, "learning_rate": 4.4417992732740264e-07, "loss": 0.497, "num_input_tokens_seen": 863808, "step": 710 }, { "epoch": 0.08958777095602055, "grad_norm": 31.731361389160156, "learning_rate": 4.473123668713194e-07, "loss": 0.5238, "num_input_tokens_seen": 869792, "step": 715 }, { "epoch": 0.09021425886480391, "grad_norm": 25.66058921813965, "learning_rate": 4.504448064152362e-07, "loss": 0.5209, "num_input_tokens_seen": 876000, "step": 720 }, { "epoch": 0.09084074677358726, "grad_norm": 32.39864730834961, "learning_rate": 4.5357724595915303e-07, "loss": 0.4836, "num_input_tokens_seen": 882112, "step": 725 }, { "epoch": 0.09146723468237063, "grad_norm": 21.423538208007812, "learning_rate": 4.5670968550306985e-07, "loss": 0.5177, "num_input_tokens_seen": 888256, "step": 730 }, { "epoch": 0.09209372259115399, "grad_norm": 32.48574447631836, "learning_rate": 4.5984212504698666e-07, "loss": 0.5199, "num_input_tokens_seen": 894048, "step": 735 }, { "epoch": 0.09272021049993735, "grad_norm": 37.552276611328125, "learning_rate": 4.629745645909035e-07, "loss": 0.5046, "num_input_tokens_seen": 900224, "step": 740 }, { "epoch": 0.09334669840872072, "grad_norm": 23.061370849609375, "learning_rate": 4.661070041348202e-07, "loss": 0.5081, "num_input_tokens_seen": 906336, "step": 745 }, { "epoch": 0.09397318631750408, "grad_norm": 18.51679039001465, "learning_rate": 4.69239443678737e-07, "loss": 0.4877, "num_input_tokens_seen": 912448, "step": 750 }, { "epoch": 0.09459967422628743, "grad_norm": 36.133140563964844, "learning_rate": 4.723718832226538e-07, "loss": 0.5057, "num_input_tokens_seen": 918816, "step": 755 }, { "epoch": 0.09522616213507079, "grad_norm": 23.716386795043945, "learning_rate": 4.7550432276657064e-07, "loss": 0.5021, "num_input_tokens_seen": 925248, "step": 760 }, { "epoch": 0.09585265004385415, "grad_norm": 27.526662826538086, "learning_rate": 4.786367623104875e-07, "loss": 0.5578, "num_input_tokens_seen": 931392, "step": 765 }, { "epoch": 0.09647913795263752, "grad_norm": 16.98016929626465, "learning_rate": 4.817692018544042e-07, "loss": 0.4359, "num_input_tokens_seen": 937216, "step": 770 }, { "epoch": 0.09710562586142088, "grad_norm": 24.391141891479492, "learning_rate": 4.849016413983211e-07, "loss": 0.4705, "num_input_tokens_seen": 943552, "step": 775 }, { "epoch": 0.09773211377020423, "grad_norm": 21.514814376831055, "learning_rate": 4.880340809422378e-07, "loss": 0.4707, "num_input_tokens_seen": 949536, "step": 780 }, { "epoch": 0.09835860167898759, "grad_norm": 35.80907440185547, "learning_rate": 4.911665204861547e-07, "loss": 0.4915, "num_input_tokens_seen": 955584, "step": 785 }, { "epoch": 0.09898508958777096, "grad_norm": 25.666879653930664, "learning_rate": 4.942989600300715e-07, "loss": 0.5027, "num_input_tokens_seen": 961440, "step": 790 }, { "epoch": 0.09961157749655432, "grad_norm": 34.85185623168945, "learning_rate": 4.974313995739882e-07, "loss": 0.5226, "num_input_tokens_seen": 967392, "step": 795 }, { "epoch": 0.10023806540533768, "grad_norm": 23.1767635345459, "learning_rate": 5.00563839117905e-07, "loss": 0.5046, "num_input_tokens_seen": 973408, "step": 800 }, { "epoch": 0.10086455331412104, "grad_norm": 34.94982147216797, "learning_rate": 5.036962786618219e-07, "loss": 0.5065, "num_input_tokens_seen": 978784, "step": 805 }, { "epoch": 0.1014910412229044, "grad_norm": 33.201438903808594, "learning_rate": 5.068287182057386e-07, "loss": 0.5008, "num_input_tokens_seen": 985152, "step": 810 }, { "epoch": 0.10211752913168776, "grad_norm": 22.156248092651367, "learning_rate": 5.099611577496555e-07, "loss": 0.4682, "num_input_tokens_seen": 991168, "step": 815 }, { "epoch": 0.10274401704047112, "grad_norm": 28.442279815673828, "learning_rate": 5.130935972935723e-07, "loss": 0.472, "num_input_tokens_seen": 997504, "step": 820 }, { "epoch": 0.10337050494925448, "grad_norm": 40.399940490722656, "learning_rate": 5.16226036837489e-07, "loss": 0.4682, "num_input_tokens_seen": 1003968, "step": 825 }, { "epoch": 0.10399699285803785, "grad_norm": 22.8817138671875, "learning_rate": 5.193584763814059e-07, "loss": 0.4749, "num_input_tokens_seen": 1009792, "step": 830 }, { "epoch": 0.1046234807668212, "grad_norm": 20.94745635986328, "learning_rate": 5.224909159253227e-07, "loss": 0.502, "num_input_tokens_seen": 1015808, "step": 835 }, { "epoch": 0.10524996867560456, "grad_norm": 21.39569854736328, "learning_rate": 5.256233554692395e-07, "loss": 0.5004, "num_input_tokens_seen": 1022048, "step": 840 }, { "epoch": 0.10587645658438792, "grad_norm": 25.43179702758789, "learning_rate": 5.287557950131563e-07, "loss": 0.4691, "num_input_tokens_seen": 1028192, "step": 845 }, { "epoch": 0.10650294449317128, "grad_norm": 35.45829772949219, "learning_rate": 5.318882345570731e-07, "loss": 0.5183, "num_input_tokens_seen": 1034464, "step": 850 }, { "epoch": 0.10712943240195465, "grad_norm": 29.493743896484375, "learning_rate": 5.350206741009898e-07, "loss": 0.4912, "num_input_tokens_seen": 1040288, "step": 855 }, { "epoch": 0.107755920310738, "grad_norm": 36.014427185058594, "learning_rate": 5.381531136449067e-07, "loss": 0.5647, "num_input_tokens_seen": 1046560, "step": 860 }, { "epoch": 0.10838240821952136, "grad_norm": 16.331418991088867, "learning_rate": 5.412855531888235e-07, "loss": 0.4872, "num_input_tokens_seen": 1052608, "step": 865 }, { "epoch": 0.10900889612830472, "grad_norm": 22.80329704284668, "learning_rate": 5.444179927327403e-07, "loss": 0.4848, "num_input_tokens_seen": 1058688, "step": 870 }, { "epoch": 0.10963538403708808, "grad_norm": 24.751358032226562, "learning_rate": 5.475504322766571e-07, "loss": 0.478, "num_input_tokens_seen": 1064768, "step": 875 }, { "epoch": 0.11026187194587145, "grad_norm": 19.632503509521484, "learning_rate": 5.506828718205739e-07, "loss": 0.4725, "num_input_tokens_seen": 1070976, "step": 880 }, { "epoch": 0.11088835985465481, "grad_norm": 25.185800552368164, "learning_rate": 5.538153113644907e-07, "loss": 0.4939, "num_input_tokens_seen": 1077088, "step": 885 }, { "epoch": 0.11151484776343816, "grad_norm": 45.245704650878906, "learning_rate": 5.569477509084075e-07, "loss": 0.495, "num_input_tokens_seen": 1083232, "step": 890 }, { "epoch": 0.11214133567222152, "grad_norm": 19.465120315551758, "learning_rate": 5.600801904523244e-07, "loss": 0.4873, "num_input_tokens_seen": 1089344, "step": 895 }, { "epoch": 0.11276782358100489, "grad_norm": 42.52298355102539, "learning_rate": 5.632126299962411e-07, "loss": 0.4864, "num_input_tokens_seen": 1095488, "step": 900 }, { "epoch": 0.11339431148978825, "grad_norm": 22.81599998474121, "learning_rate": 5.663450695401579e-07, "loss": 0.5168, "num_input_tokens_seen": 1101408, "step": 905 }, { "epoch": 0.11402079939857161, "grad_norm": 26.570985794067383, "learning_rate": 5.694775090840747e-07, "loss": 0.4805, "num_input_tokens_seen": 1107744, "step": 910 }, { "epoch": 0.11464728730735496, "grad_norm": 21.14463233947754, "learning_rate": 5.726099486279915e-07, "loss": 0.4625, "num_input_tokens_seen": 1113120, "step": 915 }, { "epoch": 0.11527377521613832, "grad_norm": 28.141958236694336, "learning_rate": 5.757423881719084e-07, "loss": 0.4502, "num_input_tokens_seen": 1119232, "step": 920 }, { "epoch": 0.11590026312492169, "grad_norm": 57.08580780029297, "learning_rate": 5.788748277158251e-07, "loss": 0.56, "num_input_tokens_seen": 1125344, "step": 925 }, { "epoch": 0.11652675103370505, "grad_norm": 18.96742820739746, "learning_rate": 5.82007267259742e-07, "loss": 0.4569, "num_input_tokens_seen": 1131200, "step": 930 }, { "epoch": 0.11715323894248841, "grad_norm": 36.73490905761719, "learning_rate": 5.851397068036587e-07, "loss": 0.5147, "num_input_tokens_seen": 1137632, "step": 935 }, { "epoch": 0.11777972685127178, "grad_norm": 31.39521026611328, "learning_rate": 5.882721463475755e-07, "loss": 0.4738, "num_input_tokens_seen": 1143392, "step": 940 }, { "epoch": 0.11840621476005513, "grad_norm": 32.72727584838867, "learning_rate": 5.914045858914923e-07, "loss": 0.4854, "num_input_tokens_seen": 1149824, "step": 945 }, { "epoch": 0.11903270266883849, "grad_norm": 32.75657272338867, "learning_rate": 5.945370254354092e-07, "loss": 0.4763, "num_input_tokens_seen": 1156064, "step": 950 }, { "epoch": 0.11965919057762185, "grad_norm": 17.764263153076172, "learning_rate": 5.976694649793259e-07, "loss": 0.4689, "num_input_tokens_seen": 1162272, "step": 955 }, { "epoch": 0.12028567848640521, "grad_norm": 35.22453308105469, "learning_rate": 6.008019045232427e-07, "loss": 0.5585, "num_input_tokens_seen": 1168416, "step": 960 }, { "epoch": 0.12091216639518858, "grad_norm": 32.55640411376953, "learning_rate": 6.039343440671596e-07, "loss": 0.4721, "num_input_tokens_seen": 1174240, "step": 965 }, { "epoch": 0.12153865430397193, "grad_norm": 29.944812774658203, "learning_rate": 6.070667836110763e-07, "loss": 0.4885, "num_input_tokens_seen": 1180128, "step": 970 }, { "epoch": 0.12216514221275529, "grad_norm": 37.6554069519043, "learning_rate": 6.101992231549932e-07, "loss": 0.5103, "num_input_tokens_seen": 1186304, "step": 975 }, { "epoch": 0.12279163012153865, "grad_norm": 18.104007720947266, "learning_rate": 6.1333166269891e-07, "loss": 0.4886, "num_input_tokens_seen": 1192608, "step": 980 }, { "epoch": 0.12341811803032202, "grad_norm": 24.77501678466797, "learning_rate": 6.164641022428268e-07, "loss": 0.4927, "num_input_tokens_seen": 1198912, "step": 985 }, { "epoch": 0.12404460593910538, "grad_norm": 22.163997650146484, "learning_rate": 6.195965417867436e-07, "loss": 0.4882, "num_input_tokens_seen": 1204736, "step": 990 }, { "epoch": 0.12467109384788874, "grad_norm": 25.695526123046875, "learning_rate": 6.227289813306604e-07, "loss": 0.4738, "num_input_tokens_seen": 1210880, "step": 995 }, { "epoch": 0.1252975817566721, "grad_norm": 30.270658493041992, "learning_rate": 6.258614208745771e-07, "loss": 0.4811, "num_input_tokens_seen": 1216512, "step": 1000 }, { "epoch": 0.12592406966545547, "grad_norm": 20.2186336517334, "learning_rate": 6.289938604184939e-07, "loss": 0.4885, "num_input_tokens_seen": 1222464, "step": 1005 }, { "epoch": 0.1265505575742388, "grad_norm": 19.66001319885254, "learning_rate": 6.321262999624108e-07, "loss": 0.4885, "num_input_tokens_seen": 1228448, "step": 1010 }, { "epoch": 0.12717704548302217, "grad_norm": 16.76427459716797, "learning_rate": 6.352587395063275e-07, "loss": 0.4827, "num_input_tokens_seen": 1234848, "step": 1015 }, { "epoch": 0.12780353339180553, "grad_norm": 38.76410675048828, "learning_rate": 6.383911790502444e-07, "loss": 0.4919, "num_input_tokens_seen": 1241344, "step": 1020 }, { "epoch": 0.1284300213005889, "grad_norm": 21.25130271911621, "learning_rate": 6.415236185941611e-07, "loss": 0.4784, "num_input_tokens_seen": 1247520, "step": 1025 }, { "epoch": 0.12905650920937226, "grad_norm": 19.313648223876953, "learning_rate": 6.44656058138078e-07, "loss": 0.4561, "num_input_tokens_seen": 1253408, "step": 1030 }, { "epoch": 0.12968299711815562, "grad_norm": 29.115407943725586, "learning_rate": 6.477884976819948e-07, "loss": 0.4884, "num_input_tokens_seen": 1258752, "step": 1035 }, { "epoch": 0.13030948502693898, "grad_norm": 25.311847686767578, "learning_rate": 6.509209372259116e-07, "loss": 0.4878, "num_input_tokens_seen": 1264704, "step": 1040 }, { "epoch": 0.13093597293572234, "grad_norm": 16.43429946899414, "learning_rate": 6.540533767698284e-07, "loss": 0.4554, "num_input_tokens_seen": 1270592, "step": 1045 }, { "epoch": 0.1315624608445057, "grad_norm": 26.30427360534668, "learning_rate": 6.571858163137453e-07, "loss": 0.5071, "num_input_tokens_seen": 1276608, "step": 1050 }, { "epoch": 0.13218894875328907, "grad_norm": 17.477651596069336, "learning_rate": 6.603182558576619e-07, "loss": 0.5338, "num_input_tokens_seen": 1282656, "step": 1055 }, { "epoch": 0.13281543666207243, "grad_norm": 17.94588279724121, "learning_rate": 6.634506954015787e-07, "loss": 0.4722, "num_input_tokens_seen": 1288768, "step": 1060 }, { "epoch": 0.13344192457085577, "grad_norm": 25.307809829711914, "learning_rate": 6.665831349454956e-07, "loss": 0.5216, "num_input_tokens_seen": 1294880, "step": 1065 }, { "epoch": 0.13406841247963913, "grad_norm": 17.52459716796875, "learning_rate": 6.697155744894123e-07, "loss": 0.4968, "num_input_tokens_seen": 1301344, "step": 1070 }, { "epoch": 0.1346949003884225, "grad_norm": 20.420923233032227, "learning_rate": 6.728480140333292e-07, "loss": 0.4943, "num_input_tokens_seen": 1307680, "step": 1075 }, { "epoch": 0.13532138829720586, "grad_norm": 13.339408874511719, "learning_rate": 6.75980453577246e-07, "loss": 0.4904, "num_input_tokens_seen": 1313536, "step": 1080 }, { "epoch": 0.13594787620598922, "grad_norm": 14.359772682189941, "learning_rate": 6.791128931211628e-07, "loss": 0.4728, "num_input_tokens_seen": 1319808, "step": 1085 }, { "epoch": 0.13657436411477258, "grad_norm": 21.322219848632812, "learning_rate": 6.822453326650796e-07, "loss": 0.4636, "num_input_tokens_seen": 1326176, "step": 1090 }, { "epoch": 0.13720085202355595, "grad_norm": 24.0727481842041, "learning_rate": 6.853777722089965e-07, "loss": 0.4777, "num_input_tokens_seen": 1332576, "step": 1095 }, { "epoch": 0.1378273399323393, "grad_norm": 22.732633590698242, "learning_rate": 6.885102117529132e-07, "loss": 0.5088, "num_input_tokens_seen": 1338720, "step": 1100 }, { "epoch": 0.13845382784112267, "grad_norm": 23.263113021850586, "learning_rate": 6.916426512968301e-07, "loss": 0.4697, "num_input_tokens_seen": 1344992, "step": 1105 }, { "epoch": 0.13908031574990604, "grad_norm": 15.313148498535156, "learning_rate": 6.947750908407468e-07, "loss": 0.4893, "num_input_tokens_seen": 1351104, "step": 1110 }, { "epoch": 0.1397068036586894, "grad_norm": 18.125988006591797, "learning_rate": 6.979075303846636e-07, "loss": 0.4661, "num_input_tokens_seen": 1357632, "step": 1115 }, { "epoch": 0.14033329156747273, "grad_norm": 25.5230712890625, "learning_rate": 7.010399699285804e-07, "loss": 0.474, "num_input_tokens_seen": 1363808, "step": 1120 }, { "epoch": 0.1409597794762561, "grad_norm": 22.953298568725586, "learning_rate": 7.041724094724973e-07, "loss": 0.4727, "num_input_tokens_seen": 1369920, "step": 1125 }, { "epoch": 0.14158626738503946, "grad_norm": 17.274658203125, "learning_rate": 7.07304849016414e-07, "loss": 0.4987, "num_input_tokens_seen": 1375712, "step": 1130 }, { "epoch": 0.14221275529382282, "grad_norm": 21.5816593170166, "learning_rate": 7.104372885603309e-07, "loss": 0.4642, "num_input_tokens_seen": 1381984, "step": 1135 }, { "epoch": 0.1428392432026062, "grad_norm": 21.95441436767578, "learning_rate": 7.135697281042476e-07, "loss": 0.4481, "num_input_tokens_seen": 1387968, "step": 1140 }, { "epoch": 0.14346573111138955, "grad_norm": 22.821531295776367, "learning_rate": 7.167021676481645e-07, "loss": 0.4806, "num_input_tokens_seen": 1393952, "step": 1145 }, { "epoch": 0.1440922190201729, "grad_norm": 20.39767837524414, "learning_rate": 7.198346071920813e-07, "loss": 0.456, "num_input_tokens_seen": 1400160, "step": 1150 }, { "epoch": 0.14471870692895628, "grad_norm": 16.130165100097656, "learning_rate": 7.229670467359982e-07, "loss": 0.4783, "num_input_tokens_seen": 1406304, "step": 1155 }, { "epoch": 0.14534519483773964, "grad_norm": 27.551265716552734, "learning_rate": 7.260994862799149e-07, "loss": 0.5028, "num_input_tokens_seen": 1412576, "step": 1160 }, { "epoch": 0.145971682746523, "grad_norm": 31.609996795654297, "learning_rate": 7.292319258238316e-07, "loss": 0.5196, "num_input_tokens_seen": 1418592, "step": 1165 }, { "epoch": 0.14659817065530636, "grad_norm": 17.771413803100586, "learning_rate": 7.323643653677484e-07, "loss": 0.4639, "num_input_tokens_seen": 1424928, "step": 1170 }, { "epoch": 0.1472246585640897, "grad_norm": 13.124558448791504, "learning_rate": 7.354968049116652e-07, "loss": 0.4348, "num_input_tokens_seen": 1431296, "step": 1175 }, { "epoch": 0.14785114647287306, "grad_norm": 30.449539184570312, "learning_rate": 7.386292444555821e-07, "loss": 0.4939, "num_input_tokens_seen": 1437152, "step": 1180 }, { "epoch": 0.14847763438165643, "grad_norm": 21.926172256469727, "learning_rate": 7.417616839994988e-07, "loss": 0.5144, "num_input_tokens_seen": 1443136, "step": 1185 }, { "epoch": 0.1491041222904398, "grad_norm": 24.407367706298828, "learning_rate": 7.448941235434157e-07, "loss": 0.5224, "num_input_tokens_seen": 1448832, "step": 1190 }, { "epoch": 0.14973061019922315, "grad_norm": 23.550983428955078, "learning_rate": 7.480265630873325e-07, "loss": 0.5122, "num_input_tokens_seen": 1454112, "step": 1195 }, { "epoch": 0.15035709810800651, "grad_norm": 15.464835166931152, "learning_rate": 7.511590026312493e-07, "loss": 0.5008, "num_input_tokens_seen": 1460640, "step": 1200 }, { "epoch": 0.15098358601678988, "grad_norm": 16.451862335205078, "learning_rate": 7.542914421751661e-07, "loss": 0.4717, "num_input_tokens_seen": 1466816, "step": 1205 }, { "epoch": 0.15161007392557324, "grad_norm": 23.464611053466797, "learning_rate": 7.57423881719083e-07, "loss": 0.4735, "num_input_tokens_seen": 1473248, "step": 1210 }, { "epoch": 0.1522365618343566, "grad_norm": 19.607892990112305, "learning_rate": 7.605563212629997e-07, "loss": 0.4577, "num_input_tokens_seen": 1479328, "step": 1215 }, { "epoch": 0.15286304974313997, "grad_norm": 14.054795265197754, "learning_rate": 7.636887608069164e-07, "loss": 0.4581, "num_input_tokens_seen": 1484480, "step": 1220 }, { "epoch": 0.15348953765192333, "grad_norm": 13.854641914367676, "learning_rate": 7.668212003508333e-07, "loss": 0.4831, "num_input_tokens_seen": 1490624, "step": 1225 }, { "epoch": 0.15411602556070667, "grad_norm": 9.92822265625, "learning_rate": 7.6995363989475e-07, "loss": 0.4772, "num_input_tokens_seen": 1496832, "step": 1230 }, { "epoch": 0.15474251346949003, "grad_norm": 16.796581268310547, "learning_rate": 7.730860794386669e-07, "loss": 0.4973, "num_input_tokens_seen": 1502944, "step": 1235 }, { "epoch": 0.1553690013782734, "grad_norm": 15.715682983398438, "learning_rate": 7.762185189825837e-07, "loss": 0.4623, "num_input_tokens_seen": 1509280, "step": 1240 }, { "epoch": 0.15599548928705675, "grad_norm": 17.726221084594727, "learning_rate": 7.793509585265005e-07, "loss": 0.4552, "num_input_tokens_seen": 1515296, "step": 1245 }, { "epoch": 0.15662197719584012, "grad_norm": 16.8480224609375, "learning_rate": 7.824833980704173e-07, "loss": 0.49, "num_input_tokens_seen": 1521536, "step": 1250 }, { "epoch": 0.15724846510462348, "grad_norm": 15.482765197753906, "learning_rate": 7.856158376143342e-07, "loss": 0.4581, "num_input_tokens_seen": 1527840, "step": 1255 }, { "epoch": 0.15787495301340684, "grad_norm": 32.33808517456055, "learning_rate": 7.887482771582509e-07, "loss": 0.4861, "num_input_tokens_seen": 1533856, "step": 1260 }, { "epoch": 0.1585014409221902, "grad_norm": 10.30781364440918, "learning_rate": 7.918807167021678e-07, "loss": 0.4745, "num_input_tokens_seen": 1540032, "step": 1265 }, { "epoch": 0.15912792883097357, "grad_norm": 23.83780860900879, "learning_rate": 7.950131562460845e-07, "loss": 0.4812, "num_input_tokens_seen": 1546176, "step": 1270 }, { "epoch": 0.15975441673975693, "grad_norm": 17.815399169921875, "learning_rate": 7.981455957900012e-07, "loss": 0.4853, "num_input_tokens_seen": 1552480, "step": 1275 }, { "epoch": 0.1603809046485403, "grad_norm": 17.140583038330078, "learning_rate": 8.012780353339181e-07, "loss": 0.4943, "num_input_tokens_seen": 1558656, "step": 1280 }, { "epoch": 0.16100739255732363, "grad_norm": 19.71062469482422, "learning_rate": 8.044104748778348e-07, "loss": 0.4755, "num_input_tokens_seen": 1564416, "step": 1285 }, { "epoch": 0.161633880466107, "grad_norm": 18.711589813232422, "learning_rate": 8.075429144217517e-07, "loss": 0.4477, "num_input_tokens_seen": 1570432, "step": 1290 }, { "epoch": 0.16226036837489036, "grad_norm": 17.754554748535156, "learning_rate": 8.106753539656685e-07, "loss": 0.4595, "num_input_tokens_seen": 1576512, "step": 1295 }, { "epoch": 0.16288685628367372, "grad_norm": 17.779190063476562, "learning_rate": 8.138077935095853e-07, "loss": 0.4694, "num_input_tokens_seen": 1582720, "step": 1300 }, { "epoch": 0.16351334419245708, "grad_norm": 21.87968635559082, "learning_rate": 8.169402330535021e-07, "loss": 0.4812, "num_input_tokens_seen": 1588448, "step": 1305 }, { "epoch": 0.16413983210124045, "grad_norm": 14.914999961853027, "learning_rate": 8.20072672597419e-07, "loss": 0.4592, "num_input_tokens_seen": 1594400, "step": 1310 }, { "epoch": 0.1647663200100238, "grad_norm": 14.527460098266602, "learning_rate": 8.232051121413357e-07, "loss": 0.4376, "num_input_tokens_seen": 1600576, "step": 1315 }, { "epoch": 0.16539280791880717, "grad_norm": 14.775908470153809, "learning_rate": 8.263375516852526e-07, "loss": 0.4395, "num_input_tokens_seen": 1606912, "step": 1320 }, { "epoch": 0.16601929582759054, "grad_norm": 35.038246154785156, "learning_rate": 8.294699912291694e-07, "loss": 0.5027, "num_input_tokens_seen": 1613184, "step": 1325 }, { "epoch": 0.1666457837363739, "grad_norm": 23.77020835876465, "learning_rate": 8.326024307730861e-07, "loss": 0.4976, "num_input_tokens_seen": 1619328, "step": 1330 }, { "epoch": 0.16727227164515726, "grad_norm": 33.54347229003906, "learning_rate": 8.357348703170029e-07, "loss": 0.5222, "num_input_tokens_seen": 1625344, "step": 1335 }, { "epoch": 0.1678987595539406, "grad_norm": 28.027210235595703, "learning_rate": 8.388673098609198e-07, "loss": 0.4795, "num_input_tokens_seen": 1631552, "step": 1340 }, { "epoch": 0.16852524746272396, "grad_norm": 22.94502067565918, "learning_rate": 8.419997494048365e-07, "loss": 0.4593, "num_input_tokens_seen": 1637504, "step": 1345 }, { "epoch": 0.16915173537150732, "grad_norm": 18.621835708618164, "learning_rate": 8.451321889487534e-07, "loss": 0.4818, "num_input_tokens_seen": 1644096, "step": 1350 }, { "epoch": 0.16977822328029069, "grad_norm": 31.185922622680664, "learning_rate": 8.482646284926702e-07, "loss": 0.5011, "num_input_tokens_seen": 1650336, "step": 1355 }, { "epoch": 0.17040471118907405, "grad_norm": 16.485118865966797, "learning_rate": 8.51397068036587e-07, "loss": 0.4705, "num_input_tokens_seen": 1656736, "step": 1360 }, { "epoch": 0.1710311990978574, "grad_norm": 11.98579216003418, "learning_rate": 8.545295075805038e-07, "loss": 0.5093, "num_input_tokens_seen": 1662944, "step": 1365 }, { "epoch": 0.17165768700664077, "grad_norm": 17.533615112304688, "learning_rate": 8.576619471244207e-07, "loss": 0.4681, "num_input_tokens_seen": 1668832, "step": 1370 }, { "epoch": 0.17228417491542414, "grad_norm": 17.229883193969727, "learning_rate": 8.607943866683374e-07, "loss": 0.4787, "num_input_tokens_seen": 1675136, "step": 1375 }, { "epoch": 0.1729106628242075, "grad_norm": 15.77431583404541, "learning_rate": 8.639268262122543e-07, "loss": 0.463, "num_input_tokens_seen": 1681056, "step": 1380 }, { "epoch": 0.17353715073299086, "grad_norm": 11.183327674865723, "learning_rate": 8.670592657561709e-07, "loss": 0.4558, "num_input_tokens_seen": 1686656, "step": 1385 }, { "epoch": 0.17416363864177423, "grad_norm": 24.635883331298828, "learning_rate": 8.701917053000877e-07, "loss": 0.4909, "num_input_tokens_seen": 1692768, "step": 1390 }, { "epoch": 0.17479012655055756, "grad_norm": 14.840117454528809, "learning_rate": 8.733241448440046e-07, "loss": 0.5052, "num_input_tokens_seen": 1698208, "step": 1395 }, { "epoch": 0.17541661445934092, "grad_norm": 20.050416946411133, "learning_rate": 8.764565843879213e-07, "loss": 0.4901, "num_input_tokens_seen": 1704672, "step": 1400 }, { "epoch": 0.1760431023681243, "grad_norm": 12.771857261657715, "learning_rate": 8.795890239318382e-07, "loss": 0.4492, "num_input_tokens_seen": 1710880, "step": 1405 }, { "epoch": 0.17666959027690765, "grad_norm": 19.39531707763672, "learning_rate": 8.82721463475755e-07, "loss": 0.4935, "num_input_tokens_seen": 1716704, "step": 1410 }, { "epoch": 0.17729607818569101, "grad_norm": 11.942909240722656, "learning_rate": 8.858539030196718e-07, "loss": 0.4813, "num_input_tokens_seen": 1723008, "step": 1415 }, { "epoch": 0.17792256609447438, "grad_norm": 10.721324920654297, "learning_rate": 8.889863425635886e-07, "loss": 0.4781, "num_input_tokens_seen": 1729184, "step": 1420 }, { "epoch": 0.17854905400325774, "grad_norm": 14.694358825683594, "learning_rate": 8.921187821075055e-07, "loss": 0.4843, "num_input_tokens_seen": 1735360, "step": 1425 }, { "epoch": 0.1791755419120411, "grad_norm": 24.39377784729004, "learning_rate": 8.952512216514222e-07, "loss": 0.4748, "num_input_tokens_seen": 1741600, "step": 1430 }, { "epoch": 0.17980202982082447, "grad_norm": 13.67094612121582, "learning_rate": 8.983836611953391e-07, "loss": 0.4429, "num_input_tokens_seen": 1747808, "step": 1435 }, { "epoch": 0.18042851772960783, "grad_norm": 13.21165657043457, "learning_rate": 9.015161007392558e-07, "loss": 0.4871, "num_input_tokens_seen": 1754112, "step": 1440 }, { "epoch": 0.1810550056383912, "grad_norm": 20.52340316772461, "learning_rate": 9.046485402831725e-07, "loss": 0.4855, "num_input_tokens_seen": 1760096, "step": 1445 }, { "epoch": 0.18168149354717453, "grad_norm": 15.756925582885742, "learning_rate": 9.077809798270894e-07, "loss": 0.4722, "num_input_tokens_seen": 1766240, "step": 1450 }, { "epoch": 0.1823079814559579, "grad_norm": 12.059937477111816, "learning_rate": 9.109134193710062e-07, "loss": 0.4858, "num_input_tokens_seen": 1772032, "step": 1455 }, { "epoch": 0.18293446936474125, "grad_norm": 18.911012649536133, "learning_rate": 9.14045858914923e-07, "loss": 0.4801, "num_input_tokens_seen": 1778112, "step": 1460 }, { "epoch": 0.18356095727352462, "grad_norm": 13.02237606048584, "learning_rate": 9.171782984588398e-07, "loss": 0.4728, "num_input_tokens_seen": 1784320, "step": 1465 }, { "epoch": 0.18418744518230798, "grad_norm": 20.84585189819336, "learning_rate": 9.203107380027567e-07, "loss": 0.4496, "num_input_tokens_seen": 1790528, "step": 1470 }, { "epoch": 0.18481393309109134, "grad_norm": 14.995793342590332, "learning_rate": 9.234431775466734e-07, "loss": 0.4732, "num_input_tokens_seen": 1796576, "step": 1475 }, { "epoch": 0.1854404209998747, "grad_norm": 17.932292938232422, "learning_rate": 9.265756170905903e-07, "loss": 0.4506, "num_input_tokens_seen": 1802816, "step": 1480 }, { "epoch": 0.18606690890865807, "grad_norm": 16.02126121520996, "learning_rate": 9.297080566345071e-07, "loss": 0.4605, "num_input_tokens_seen": 1808896, "step": 1485 }, { "epoch": 0.18669339681744143, "grad_norm": 12.997295379638672, "learning_rate": 9.328404961784239e-07, "loss": 0.4859, "num_input_tokens_seen": 1814880, "step": 1490 }, { "epoch": 0.1873198847262248, "grad_norm": 17.365312576293945, "learning_rate": 9.359729357223406e-07, "loss": 0.4515, "num_input_tokens_seen": 1821024, "step": 1495 }, { "epoch": 0.18794637263500816, "grad_norm": 16.53742790222168, "learning_rate": 9.391053752662573e-07, "loss": 0.4496, "num_input_tokens_seen": 1827136, "step": 1500 }, { "epoch": 0.1885728605437915, "grad_norm": 18.842544555664062, "learning_rate": 9.422378148101742e-07, "loss": 0.4742, "num_input_tokens_seen": 1832896, "step": 1505 }, { "epoch": 0.18919934845257486, "grad_norm": 16.022628784179688, "learning_rate": 9.45370254354091e-07, "loss": 0.512, "num_input_tokens_seen": 1838624, "step": 1510 }, { "epoch": 0.18982583636135822, "grad_norm": 11.976082801818848, "learning_rate": 9.485026938980078e-07, "loss": 0.471, "num_input_tokens_seen": 1844704, "step": 1515 }, { "epoch": 0.19045232427014158, "grad_norm": 22.923208236694336, "learning_rate": 9.516351334419246e-07, "loss": 0.4886, "num_input_tokens_seen": 1850880, "step": 1520 }, { "epoch": 0.19107881217892494, "grad_norm": 14.84964370727539, "learning_rate": 9.547675729858414e-07, "loss": 0.4579, "num_input_tokens_seen": 1855968, "step": 1525 }, { "epoch": 0.1917053000877083, "grad_norm": 15.984855651855469, "learning_rate": 9.579000125297583e-07, "loss": 0.4833, "num_input_tokens_seen": 1861888, "step": 1530 }, { "epoch": 0.19233178799649167, "grad_norm": 12.189555168151855, "learning_rate": 9.610324520736751e-07, "loss": 0.4484, "num_input_tokens_seen": 1868256, "step": 1535 }, { "epoch": 0.19295827590527503, "grad_norm": 14.869606971740723, "learning_rate": 9.641648916175919e-07, "loss": 0.4763, "num_input_tokens_seen": 1874336, "step": 1540 }, { "epoch": 0.1935847638140584, "grad_norm": 14.166979789733887, "learning_rate": 9.672973311615086e-07, "loss": 0.4886, "num_input_tokens_seen": 1880512, "step": 1545 }, { "epoch": 0.19421125172284176, "grad_norm": 17.40867042541504, "learning_rate": 9.704297707054254e-07, "loss": 0.505, "num_input_tokens_seen": 1886304, "step": 1550 }, { "epoch": 0.19483773963162512, "grad_norm": 10.904388427734375, "learning_rate": 9.735622102493422e-07, "loss": 0.4615, "num_input_tokens_seen": 1892640, "step": 1555 }, { "epoch": 0.19546422754040846, "grad_norm": 19.44397735595703, "learning_rate": 9.766946497932591e-07, "loss": 0.4658, "num_input_tokens_seen": 1898912, "step": 1560 }, { "epoch": 0.19609071544919182, "grad_norm": 17.74071502685547, "learning_rate": 9.79827089337176e-07, "loss": 0.4788, "num_input_tokens_seen": 1905152, "step": 1565 }, { "epoch": 0.19671720335797518, "grad_norm": 11.467246055603027, "learning_rate": 9.829595288810927e-07, "loss": 0.4604, "num_input_tokens_seen": 1911040, "step": 1570 }, { "epoch": 0.19734369126675855, "grad_norm": 13.346708297729492, "learning_rate": 9.860919684250094e-07, "loss": 0.4624, "num_input_tokens_seen": 1916960, "step": 1575 }, { "epoch": 0.1979701791755419, "grad_norm": 16.7341365814209, "learning_rate": 9.892244079689264e-07, "loss": 0.4694, "num_input_tokens_seen": 1923296, "step": 1580 }, { "epoch": 0.19859666708432527, "grad_norm": 13.17404842376709, "learning_rate": 9.923568475128432e-07, "loss": 0.4565, "num_input_tokens_seen": 1929344, "step": 1585 }, { "epoch": 0.19922315499310864, "grad_norm": 10.193241119384766, "learning_rate": 9.9548928705676e-07, "loss": 0.4855, "num_input_tokens_seen": 1935296, "step": 1590 }, { "epoch": 0.199849642901892, "grad_norm": 20.153343200683594, "learning_rate": 9.986217266006767e-07, "loss": 0.4725, "num_input_tokens_seen": 1941024, "step": 1595 }, { "epoch": 0.20047613081067536, "grad_norm": 11.836256980895996, "learning_rate": 1.0017541661445937e-06, "loss": 0.4687, "num_input_tokens_seen": 1947168, "step": 1600 }, { "epoch": 0.20110261871945873, "grad_norm": 13.836121559143066, "learning_rate": 1.0048866056885102e-06, "loss": 0.4898, "num_input_tokens_seen": 1953216, "step": 1605 }, { "epoch": 0.2017291066282421, "grad_norm": 13.78735637664795, "learning_rate": 1.008019045232427e-06, "loss": 0.4654, "num_input_tokens_seen": 1959456, "step": 1610 }, { "epoch": 0.20235559453702542, "grad_norm": 19.824514389038086, "learning_rate": 1.011151484776344e-06, "loss": 0.4841, "num_input_tokens_seen": 1965280, "step": 1615 }, { "epoch": 0.2029820824458088, "grad_norm": 12.939146041870117, "learning_rate": 1.0142839243202607e-06, "loss": 0.4767, "num_input_tokens_seen": 1971648, "step": 1620 }, { "epoch": 0.20360857035459215, "grad_norm": 13.112632751464844, "learning_rate": 1.0174163638641775e-06, "loss": 0.4857, "num_input_tokens_seen": 1977664, "step": 1625 }, { "epoch": 0.2042350582633755, "grad_norm": 9.84666919708252, "learning_rate": 1.0205488034080942e-06, "loss": 0.4896, "num_input_tokens_seen": 1983744, "step": 1630 }, { "epoch": 0.20486154617215888, "grad_norm": 13.924190521240234, "learning_rate": 1.0236812429520112e-06, "loss": 0.4738, "num_input_tokens_seen": 1989760, "step": 1635 }, { "epoch": 0.20548803408094224, "grad_norm": 12.036764144897461, "learning_rate": 1.026813682495928e-06, "loss": 0.4835, "num_input_tokens_seen": 1995808, "step": 1640 }, { "epoch": 0.2061145219897256, "grad_norm": 18.629383087158203, "learning_rate": 1.0299461220398447e-06, "loss": 0.4728, "num_input_tokens_seen": 2001824, "step": 1645 }, { "epoch": 0.20674100989850897, "grad_norm": 12.19594955444336, "learning_rate": 1.0330785615837615e-06, "loss": 0.4683, "num_input_tokens_seen": 2007520, "step": 1650 }, { "epoch": 0.20736749780729233, "grad_norm": 10.741127014160156, "learning_rate": 1.0362110011276785e-06, "loss": 0.4928, "num_input_tokens_seen": 2014144, "step": 1655 }, { "epoch": 0.2079939857160757, "grad_norm": 10.26583480834961, "learning_rate": 1.039343440671595e-06, "loss": 0.473, "num_input_tokens_seen": 2020512, "step": 1660 }, { "epoch": 0.20862047362485903, "grad_norm": 8.771820068359375, "learning_rate": 1.0424758802155118e-06, "loss": 0.4808, "num_input_tokens_seen": 2026528, "step": 1665 }, { "epoch": 0.2092469615336424, "grad_norm": 14.101149559020996, "learning_rate": 1.0456083197594288e-06, "loss": 0.4735, "num_input_tokens_seen": 2032960, "step": 1670 }, { "epoch": 0.20987344944242575, "grad_norm": 10.452010154724121, "learning_rate": 1.0487407593033455e-06, "loss": 0.4801, "num_input_tokens_seen": 2038976, "step": 1675 }, { "epoch": 0.21049993735120912, "grad_norm": 9.013350486755371, "learning_rate": 1.0518731988472623e-06, "loss": 0.4509, "num_input_tokens_seen": 2045024, "step": 1680 }, { "epoch": 0.21112642525999248, "grad_norm": 10.445574760437012, "learning_rate": 1.055005638391179e-06, "loss": 0.4773, "num_input_tokens_seen": 2050912, "step": 1685 }, { "epoch": 0.21175291316877584, "grad_norm": 6.556990146636963, "learning_rate": 1.058138077935096e-06, "loss": 0.4737, "num_input_tokens_seen": 2057248, "step": 1690 }, { "epoch": 0.2123794010775592, "grad_norm": 12.931622505187988, "learning_rate": 1.0612705174790128e-06, "loss": 0.4549, "num_input_tokens_seen": 2063648, "step": 1695 }, { "epoch": 0.21300588898634257, "grad_norm": 12.17298698425293, "learning_rate": 1.0644029570229296e-06, "loss": 0.4911, "num_input_tokens_seen": 2070016, "step": 1700 }, { "epoch": 0.21363237689512593, "grad_norm": 19.216686248779297, "learning_rate": 1.0675353965668463e-06, "loss": 0.4601, "num_input_tokens_seen": 2076096, "step": 1705 }, { "epoch": 0.2142588648039093, "grad_norm": 8.271466255187988, "learning_rate": 1.070667836110763e-06, "loss": 0.4585, "num_input_tokens_seen": 2081600, "step": 1710 }, { "epoch": 0.21488535271269266, "grad_norm": 9.717873573303223, "learning_rate": 1.0738002756546798e-06, "loss": 0.4591, "num_input_tokens_seen": 2087712, "step": 1715 }, { "epoch": 0.215511840621476, "grad_norm": 14.62285041809082, "learning_rate": 1.0769327151985966e-06, "loss": 0.4962, "num_input_tokens_seen": 2092960, "step": 1720 }, { "epoch": 0.21613832853025935, "grad_norm": 19.028121948242188, "learning_rate": 1.0800651547425136e-06, "loss": 0.5149, "num_input_tokens_seen": 2098784, "step": 1725 }, { "epoch": 0.21676481643904272, "grad_norm": 11.532195091247559, "learning_rate": 1.0831975942864304e-06, "loss": 0.4837, "num_input_tokens_seen": 2104800, "step": 1730 }, { "epoch": 0.21739130434782608, "grad_norm": 10.633116722106934, "learning_rate": 1.0863300338303471e-06, "loss": 0.4768, "num_input_tokens_seen": 2110784, "step": 1735 }, { "epoch": 0.21801779225660944, "grad_norm": 11.413786888122559, "learning_rate": 1.0894624733742639e-06, "loss": 0.4702, "num_input_tokens_seen": 2116736, "step": 1740 }, { "epoch": 0.2186442801653928, "grad_norm": 9.299090385437012, "learning_rate": 1.0925949129181809e-06, "loss": 0.4488, "num_input_tokens_seen": 2123072, "step": 1745 }, { "epoch": 0.21927076807417617, "grad_norm": 11.533432960510254, "learning_rate": 1.0957273524620976e-06, "loss": 0.4798, "num_input_tokens_seen": 2129248, "step": 1750 }, { "epoch": 0.21989725598295953, "grad_norm": 11.551376342773438, "learning_rate": 1.0988597920060144e-06, "loss": 0.4658, "num_input_tokens_seen": 2135200, "step": 1755 }, { "epoch": 0.2205237438917429, "grad_norm": 12.329853057861328, "learning_rate": 1.1019922315499311e-06, "loss": 0.4908, "num_input_tokens_seen": 2141248, "step": 1760 }, { "epoch": 0.22115023180052626, "grad_norm": 8.81024169921875, "learning_rate": 1.105124671093848e-06, "loss": 0.4725, "num_input_tokens_seen": 2147488, "step": 1765 }, { "epoch": 0.22177671970930962, "grad_norm": 10.148378372192383, "learning_rate": 1.1082571106377647e-06, "loss": 0.4692, "num_input_tokens_seen": 2153792, "step": 1770 }, { "epoch": 0.22240320761809296, "grad_norm": 12.154644012451172, "learning_rate": 1.1113895501816816e-06, "loss": 0.4777, "num_input_tokens_seen": 2159840, "step": 1775 }, { "epoch": 0.22302969552687632, "grad_norm": 12.29324722290039, "learning_rate": 1.1145219897255984e-06, "loss": 0.489, "num_input_tokens_seen": 2165728, "step": 1780 }, { "epoch": 0.22365618343565968, "grad_norm": 8.904973983764648, "learning_rate": 1.1176544292695152e-06, "loss": 0.472, "num_input_tokens_seen": 2172096, "step": 1785 }, { "epoch": 0.22428267134444305, "grad_norm": 11.162693977355957, "learning_rate": 1.120786868813432e-06, "loss": 0.4624, "num_input_tokens_seen": 2178272, "step": 1790 }, { "epoch": 0.2249091592532264, "grad_norm": 8.830912590026855, "learning_rate": 1.123919308357349e-06, "loss": 0.4718, "num_input_tokens_seen": 2183104, "step": 1795 }, { "epoch": 0.22553564716200977, "grad_norm": 7.735280990600586, "learning_rate": 1.1270517479012657e-06, "loss": 0.4691, "num_input_tokens_seen": 2189440, "step": 1800 }, { "epoch": 0.22616213507079314, "grad_norm": 6.879495620727539, "learning_rate": 1.1301841874451824e-06, "loss": 0.4656, "num_input_tokens_seen": 2195392, "step": 1805 }, { "epoch": 0.2267886229795765, "grad_norm": 7.670830249786377, "learning_rate": 1.1333166269890992e-06, "loss": 0.481, "num_input_tokens_seen": 2200800, "step": 1810 }, { "epoch": 0.22741511088835986, "grad_norm": 30.086721420288086, "learning_rate": 1.1364490665330162e-06, "loss": 0.485, "num_input_tokens_seen": 2206496, "step": 1815 }, { "epoch": 0.22804159879714322, "grad_norm": 12.156604766845703, "learning_rate": 1.1395815060769327e-06, "loss": 0.4478, "num_input_tokens_seen": 2212672, "step": 1820 }, { "epoch": 0.2286680867059266, "grad_norm": 18.17355728149414, "learning_rate": 1.1427139456208495e-06, "loss": 0.4739, "num_input_tokens_seen": 2218432, "step": 1825 }, { "epoch": 0.22929457461470992, "grad_norm": 20.326065063476562, "learning_rate": 1.1458463851647665e-06, "loss": 0.492, "num_input_tokens_seen": 2224256, "step": 1830 }, { "epoch": 0.22992106252349329, "grad_norm": 8.532760620117188, "learning_rate": 1.1489788247086832e-06, "loss": 0.473, "num_input_tokens_seen": 2230304, "step": 1835 }, { "epoch": 0.23054755043227665, "grad_norm": 9.6052827835083, "learning_rate": 1.1521112642526e-06, "loss": 0.4754, "num_input_tokens_seen": 2236384, "step": 1840 }, { "epoch": 0.23117403834106, "grad_norm": 11.56068229675293, "learning_rate": 1.1552437037965167e-06, "loss": 0.4654, "num_input_tokens_seen": 2242400, "step": 1845 }, { "epoch": 0.23180052624984337, "grad_norm": 14.784561157226562, "learning_rate": 1.1583761433404337e-06, "loss": 0.4824, "num_input_tokens_seen": 2248384, "step": 1850 }, { "epoch": 0.23242701415862674, "grad_norm": 12.323617935180664, "learning_rate": 1.1615085828843505e-06, "loss": 0.4687, "num_input_tokens_seen": 2254720, "step": 1855 }, { "epoch": 0.2330535020674101, "grad_norm": 8.880786895751953, "learning_rate": 1.1646410224282672e-06, "loss": 0.4708, "num_input_tokens_seen": 2260416, "step": 1860 }, { "epoch": 0.23367998997619346, "grad_norm": 8.766907691955566, "learning_rate": 1.167773461972184e-06, "loss": 0.4785, "num_input_tokens_seen": 2266464, "step": 1865 }, { "epoch": 0.23430647788497683, "grad_norm": 7.988321304321289, "learning_rate": 1.170905901516101e-06, "loss": 0.4697, "num_input_tokens_seen": 2272736, "step": 1870 }, { "epoch": 0.2349329657937602, "grad_norm": 10.556381225585938, "learning_rate": 1.1740383410600175e-06, "loss": 0.4486, "num_input_tokens_seen": 2278240, "step": 1875 }, { "epoch": 0.23555945370254355, "grad_norm": 8.633042335510254, "learning_rate": 1.1771707806039343e-06, "loss": 0.4702, "num_input_tokens_seen": 2284384, "step": 1880 }, { "epoch": 0.2361859416113269, "grad_norm": 9.434280395507812, "learning_rate": 1.1803032201478513e-06, "loss": 0.4647, "num_input_tokens_seen": 2289856, "step": 1885 }, { "epoch": 0.23681242952011025, "grad_norm": 16.30852699279785, "learning_rate": 1.183435659691768e-06, "loss": 0.4775, "num_input_tokens_seen": 2295584, "step": 1890 }, { "epoch": 0.23743891742889361, "grad_norm": 12.417708396911621, "learning_rate": 1.1865680992356848e-06, "loss": 0.4645, "num_input_tokens_seen": 2301632, "step": 1895 }, { "epoch": 0.23806540533767698, "grad_norm": 8.46765422821045, "learning_rate": 1.1897005387796016e-06, "loss": 0.4369, "num_input_tokens_seen": 2307872, "step": 1900 }, { "epoch": 0.23869189324646034, "grad_norm": 12.961664199829102, "learning_rate": 1.1928329783235185e-06, "loss": 0.4752, "num_input_tokens_seen": 2314144, "step": 1905 }, { "epoch": 0.2393183811552437, "grad_norm": 11.033329963684082, "learning_rate": 1.1959654178674353e-06, "loss": 0.4736, "num_input_tokens_seen": 2320320, "step": 1910 }, { "epoch": 0.23994486906402707, "grad_norm": 7.846789836883545, "learning_rate": 1.199097857411352e-06, "loss": 0.4604, "num_input_tokens_seen": 2326496, "step": 1915 }, { "epoch": 0.24057135697281043, "grad_norm": 11.240324020385742, "learning_rate": 1.2022302969552688e-06, "loss": 0.4818, "num_input_tokens_seen": 2332704, "step": 1920 }, { "epoch": 0.2411978448815938, "grad_norm": 8.717928886413574, "learning_rate": 1.2053627364991858e-06, "loss": 0.4636, "num_input_tokens_seen": 2338592, "step": 1925 }, { "epoch": 0.24182433279037716, "grad_norm": 14.736968994140625, "learning_rate": 1.2084951760431024e-06, "loss": 0.4813, "num_input_tokens_seen": 2344544, "step": 1930 }, { "epoch": 0.24245082069916052, "grad_norm": 7.200821876525879, "learning_rate": 1.2116276155870191e-06, "loss": 0.4593, "num_input_tokens_seen": 2350816, "step": 1935 }, { "epoch": 0.24307730860794385, "grad_norm": 9.955575942993164, "learning_rate": 1.214760055130936e-06, "loss": 0.4633, "num_input_tokens_seen": 2357024, "step": 1940 }, { "epoch": 0.24370379651672722, "grad_norm": 13.428146362304688, "learning_rate": 1.2178924946748529e-06, "loss": 0.4579, "num_input_tokens_seen": 2362720, "step": 1945 }, { "epoch": 0.24433028442551058, "grad_norm": 6.8517351150512695, "learning_rate": 1.2210249342187696e-06, "loss": 0.4821, "num_input_tokens_seen": 2369152, "step": 1950 }, { "epoch": 0.24495677233429394, "grad_norm": 12.180150985717773, "learning_rate": 1.2241573737626864e-06, "loss": 0.4678, "num_input_tokens_seen": 2375296, "step": 1955 }, { "epoch": 0.2455832602430773, "grad_norm": 10.643255233764648, "learning_rate": 1.2272898133066034e-06, "loss": 0.4668, "num_input_tokens_seen": 2381376, "step": 1960 }, { "epoch": 0.24620974815186067, "grad_norm": 13.727449417114258, "learning_rate": 1.2304222528505201e-06, "loss": 0.4695, "num_input_tokens_seen": 2387424, "step": 1965 }, { "epoch": 0.24683623606064403, "grad_norm": 8.295743942260742, "learning_rate": 1.2335546923944369e-06, "loss": 0.4669, "num_input_tokens_seen": 2393312, "step": 1970 }, { "epoch": 0.2474627239694274, "grad_norm": 9.16917896270752, "learning_rate": 1.2366871319383536e-06, "loss": 0.4691, "num_input_tokens_seen": 2399264, "step": 1975 }, { "epoch": 0.24808921187821076, "grad_norm": 7.857784748077393, "learning_rate": 1.2398195714822706e-06, "loss": 0.4636, "num_input_tokens_seen": 2405184, "step": 1980 }, { "epoch": 0.24871569978699412, "grad_norm": 9.050460815429688, "learning_rate": 1.2429520110261872e-06, "loss": 0.4779, "num_input_tokens_seen": 2411424, "step": 1985 }, { "epoch": 0.24934218769577748, "grad_norm": 5.8908796310424805, "learning_rate": 1.2460844505701041e-06, "loss": 0.4449, "num_input_tokens_seen": 2417536, "step": 1990 }, { "epoch": 0.24996867560456082, "grad_norm": 8.795228958129883, "learning_rate": 1.249216890114021e-06, "loss": 0.4504, "num_input_tokens_seen": 2423968, "step": 1995 }, { "epoch": 0.2505951635133442, "grad_norm": 7.957924842834473, "learning_rate": 1.2523493296579379e-06, "loss": 0.4839, "num_input_tokens_seen": 2430016, "step": 2000 }, { "epoch": 0.25122165142212755, "grad_norm": 7.557647705078125, "learning_rate": 1.2554817692018544e-06, "loss": 0.4581, "num_input_tokens_seen": 2435520, "step": 2005 }, { "epoch": 0.25184813933091094, "grad_norm": 16.007949829101562, "learning_rate": 1.2586142087457712e-06, "loss": 0.4832, "num_input_tokens_seen": 2441760, "step": 2010 }, { "epoch": 0.25247462723969427, "grad_norm": 6.55971622467041, "learning_rate": 1.2617466482896882e-06, "loss": 0.4757, "num_input_tokens_seen": 2448096, "step": 2015 }, { "epoch": 0.2531011151484776, "grad_norm": 15.702923774719238, "learning_rate": 1.2648790878336047e-06, "loss": 0.5018, "num_input_tokens_seen": 2453888, "step": 2020 }, { "epoch": 0.253727603057261, "grad_norm": 6.82427453994751, "learning_rate": 1.2680115273775217e-06, "loss": 0.4664, "num_input_tokens_seen": 2459712, "step": 2025 }, { "epoch": 0.25435409096604433, "grad_norm": 7.6348981857299805, "learning_rate": 1.2711439669214385e-06, "loss": 0.4503, "num_input_tokens_seen": 2465792, "step": 2030 }, { "epoch": 0.2549805788748277, "grad_norm": 6.896665573120117, "learning_rate": 1.2742764064653554e-06, "loss": 0.465, "num_input_tokens_seen": 2472096, "step": 2035 }, { "epoch": 0.25560706678361106, "grad_norm": 13.585664749145508, "learning_rate": 1.277408846009272e-06, "loss": 0.4824, "num_input_tokens_seen": 2478208, "step": 2040 }, { "epoch": 0.25623355469239445, "grad_norm": 16.11211585998535, "learning_rate": 1.280541285553189e-06, "loss": 0.485, "num_input_tokens_seen": 2484544, "step": 2045 }, { "epoch": 0.2568600426011778, "grad_norm": 6.5040154457092285, "learning_rate": 1.2836737250971057e-06, "loss": 0.465, "num_input_tokens_seen": 2490720, "step": 2050 }, { "epoch": 0.2574865305099612, "grad_norm": 7.560153484344482, "learning_rate": 1.2868061646410227e-06, "loss": 0.4642, "num_input_tokens_seen": 2496896, "step": 2055 }, { "epoch": 0.2581130184187445, "grad_norm": 14.710986137390137, "learning_rate": 1.2899386041849393e-06, "loss": 0.4513, "num_input_tokens_seen": 2503040, "step": 2060 }, { "epoch": 0.2587395063275279, "grad_norm": 19.12196922302246, "learning_rate": 1.293071043728856e-06, "loss": 0.476, "num_input_tokens_seen": 2509504, "step": 2065 }, { "epoch": 0.25936599423631124, "grad_norm": 9.192066192626953, "learning_rate": 1.296203483272773e-06, "loss": 0.4617, "num_input_tokens_seen": 2515424, "step": 2070 }, { "epoch": 0.25999248214509457, "grad_norm": 9.04504680633545, "learning_rate": 1.2993359228166895e-06, "loss": 0.4696, "num_input_tokens_seen": 2521728, "step": 2075 }, { "epoch": 0.26061897005387796, "grad_norm": 8.642287254333496, "learning_rate": 1.3024683623606065e-06, "loss": 0.4611, "num_input_tokens_seen": 2527712, "step": 2080 }, { "epoch": 0.2612454579626613, "grad_norm": 6.930182456970215, "learning_rate": 1.3056008019045233e-06, "loss": 0.4597, "num_input_tokens_seen": 2534176, "step": 2085 }, { "epoch": 0.2618719458714447, "grad_norm": 9.256510734558105, "learning_rate": 1.3087332414484403e-06, "loss": 0.4474, "num_input_tokens_seen": 2540416, "step": 2090 }, { "epoch": 0.262498433780228, "grad_norm": 8.483951568603516, "learning_rate": 1.3118656809923568e-06, "loss": 0.4591, "num_input_tokens_seen": 2546528, "step": 2095 }, { "epoch": 0.2631249216890114, "grad_norm": 12.062129020690918, "learning_rate": 1.3149981205362738e-06, "loss": 0.4664, "num_input_tokens_seen": 2552640, "step": 2100 }, { "epoch": 0.26375140959779475, "grad_norm": 10.404544830322266, "learning_rate": 1.3181305600801905e-06, "loss": 0.4564, "num_input_tokens_seen": 2559072, "step": 2105 }, { "epoch": 0.26437789750657814, "grad_norm": 11.294517517089844, "learning_rate": 1.3212629996241075e-06, "loss": 0.5033, "num_input_tokens_seen": 2564576, "step": 2110 }, { "epoch": 0.2650043854153615, "grad_norm": 11.584867477416992, "learning_rate": 1.324395439168024e-06, "loss": 0.4904, "num_input_tokens_seen": 2571040, "step": 2115 }, { "epoch": 0.26563087332414487, "grad_norm": 6.679898262023926, "learning_rate": 1.3275278787119408e-06, "loss": 0.4511, "num_input_tokens_seen": 2577120, "step": 2120 }, { "epoch": 0.2662573612329282, "grad_norm": 6.945197582244873, "learning_rate": 1.3306603182558578e-06, "loss": 0.469, "num_input_tokens_seen": 2583296, "step": 2125 }, { "epoch": 0.26688384914171154, "grad_norm": 9.351788520812988, "learning_rate": 1.3337927577997744e-06, "loss": 0.4832, "num_input_tokens_seen": 2589568, "step": 2130 }, { "epoch": 0.26751033705049493, "grad_norm": 8.942502975463867, "learning_rate": 1.3369251973436913e-06, "loss": 0.4626, "num_input_tokens_seen": 2595808, "step": 2135 }, { "epoch": 0.26813682495927826, "grad_norm": 9.339611053466797, "learning_rate": 1.340057636887608e-06, "loss": 0.4861, "num_input_tokens_seen": 2602272, "step": 2140 }, { "epoch": 0.26876331286806165, "grad_norm": 9.069661140441895, "learning_rate": 1.343190076431525e-06, "loss": 0.4583, "num_input_tokens_seen": 2608192, "step": 2145 }, { "epoch": 0.269389800776845, "grad_norm": 8.349671363830566, "learning_rate": 1.3463225159754416e-06, "loss": 0.4656, "num_input_tokens_seen": 2614432, "step": 2150 }, { "epoch": 0.2700162886856284, "grad_norm": 10.549155235290527, "learning_rate": 1.3494549555193586e-06, "loss": 0.4872, "num_input_tokens_seen": 2620768, "step": 2155 }, { "epoch": 0.2706427765944117, "grad_norm": 10.76445484161377, "learning_rate": 1.3525873950632754e-06, "loss": 0.4635, "num_input_tokens_seen": 2626912, "step": 2160 }, { "epoch": 0.2712692645031951, "grad_norm": 7.5833282470703125, "learning_rate": 1.3557198346071923e-06, "loss": 0.4568, "num_input_tokens_seen": 2633088, "step": 2165 }, { "epoch": 0.27189575241197844, "grad_norm": 5.732034683227539, "learning_rate": 1.3588522741511089e-06, "loss": 0.4665, "num_input_tokens_seen": 2638944, "step": 2170 }, { "epoch": 0.27252224032076183, "grad_norm": 5.14511775970459, "learning_rate": 1.3619847136950257e-06, "loss": 0.4732, "num_input_tokens_seen": 2644448, "step": 2175 }, { "epoch": 0.27314872822954517, "grad_norm": 10.233145713806152, "learning_rate": 1.3651171532389426e-06, "loss": 0.4664, "num_input_tokens_seen": 2650368, "step": 2180 }, { "epoch": 0.2737752161383285, "grad_norm": 6.072805404663086, "learning_rate": 1.3682495927828594e-06, "loss": 0.4593, "num_input_tokens_seen": 2656256, "step": 2185 }, { "epoch": 0.2744017040471119, "grad_norm": 5.148373603820801, "learning_rate": 1.3713820323267762e-06, "loss": 0.4761, "num_input_tokens_seen": 2662496, "step": 2190 }, { "epoch": 0.27502819195589523, "grad_norm": 9.705360412597656, "learning_rate": 1.374514471870693e-06, "loss": 0.4801, "num_input_tokens_seen": 2668288, "step": 2195 }, { "epoch": 0.2756546798646786, "grad_norm": 7.604836940765381, "learning_rate": 1.3776469114146099e-06, "loss": 0.4659, "num_input_tokens_seen": 2674528, "step": 2200 }, { "epoch": 0.27628116777346196, "grad_norm": 9.940526008605957, "learning_rate": 1.3807793509585267e-06, "loss": 0.4701, "num_input_tokens_seen": 2680640, "step": 2205 }, { "epoch": 0.27690765568224535, "grad_norm": 8.382550239562988, "learning_rate": 1.3839117905024434e-06, "loss": 0.4579, "num_input_tokens_seen": 2686816, "step": 2210 }, { "epoch": 0.2775341435910287, "grad_norm": 7.127007484436035, "learning_rate": 1.3870442300463602e-06, "loss": 0.4615, "num_input_tokens_seen": 2692928, "step": 2215 }, { "epoch": 0.27816063149981207, "grad_norm": 8.226227760314941, "learning_rate": 1.3901766695902772e-06, "loss": 0.4685, "num_input_tokens_seen": 2699296, "step": 2220 }, { "epoch": 0.2787871194085954, "grad_norm": 8.812602996826172, "learning_rate": 1.393309109134194e-06, "loss": 0.4842, "num_input_tokens_seen": 2705472, "step": 2225 }, { "epoch": 0.2794136073173788, "grad_norm": 6.266550064086914, "learning_rate": 1.3964415486781105e-06, "loss": 0.4645, "num_input_tokens_seen": 2711776, "step": 2230 }, { "epoch": 0.28004009522616213, "grad_norm": 10.354814529418945, "learning_rate": 1.3995739882220274e-06, "loss": 0.472, "num_input_tokens_seen": 2717920, "step": 2235 }, { "epoch": 0.28066658313494547, "grad_norm": 6.99669885635376, "learning_rate": 1.4027064277659442e-06, "loss": 0.4724, "num_input_tokens_seen": 2723584, "step": 2240 }, { "epoch": 0.28129307104372886, "grad_norm": 6.058269500732422, "learning_rate": 1.4058388673098612e-06, "loss": 0.4708, "num_input_tokens_seen": 2729568, "step": 2245 }, { "epoch": 0.2819195589525122, "grad_norm": 5.644102573394775, "learning_rate": 1.4089713068537777e-06, "loss": 0.467, "num_input_tokens_seen": 2734752, "step": 2250 }, { "epoch": 0.2825460468612956, "grad_norm": 8.423958778381348, "learning_rate": 1.4121037463976947e-06, "loss": 0.4606, "num_input_tokens_seen": 2740736, "step": 2255 }, { "epoch": 0.2831725347700789, "grad_norm": 6.136109352111816, "learning_rate": 1.4152361859416115e-06, "loss": 0.459, "num_input_tokens_seen": 2746784, "step": 2260 }, { "epoch": 0.2837990226788623, "grad_norm": 6.984126091003418, "learning_rate": 1.4183686254855282e-06, "loss": 0.4592, "num_input_tokens_seen": 2753280, "step": 2265 }, { "epoch": 0.28442551058764565, "grad_norm": 8.209537506103516, "learning_rate": 1.421501065029445e-06, "loss": 0.4752, "num_input_tokens_seen": 2759424, "step": 2270 }, { "epoch": 0.28505199849642904, "grad_norm": 8.281280517578125, "learning_rate": 1.424633504573362e-06, "loss": 0.4259, "num_input_tokens_seen": 2765760, "step": 2275 }, { "epoch": 0.2856784864052124, "grad_norm": 12.200156211853027, "learning_rate": 1.4277659441172787e-06, "loss": 0.5061, "num_input_tokens_seen": 2771936, "step": 2280 }, { "epoch": 0.28630497431399576, "grad_norm": 5.94399356842041, "learning_rate": 1.4308983836611953e-06, "loss": 0.4849, "num_input_tokens_seen": 2777920, "step": 2285 }, { "epoch": 0.2869314622227791, "grad_norm": 5.319188117980957, "learning_rate": 1.4340308232051123e-06, "loss": 0.4525, "num_input_tokens_seen": 2784160, "step": 2290 }, { "epoch": 0.28755795013156243, "grad_norm": 8.934842109680176, "learning_rate": 1.437163262749029e-06, "loss": 0.4705, "num_input_tokens_seen": 2790304, "step": 2295 }, { "epoch": 0.2881844380403458, "grad_norm": 6.248721599578857, "learning_rate": 1.440295702292946e-06, "loss": 0.4697, "num_input_tokens_seen": 2796288, "step": 2300 }, { "epoch": 0.28881092594912916, "grad_norm": 6.623419284820557, "learning_rate": 1.4434281418368626e-06, "loss": 0.4583, "num_input_tokens_seen": 2802176, "step": 2305 }, { "epoch": 0.28943741385791255, "grad_norm": 8.28623104095459, "learning_rate": 1.4465605813807795e-06, "loss": 0.5205, "num_input_tokens_seen": 2808160, "step": 2310 }, { "epoch": 0.2900639017666959, "grad_norm": 9.400063514709473, "learning_rate": 1.4496930209246963e-06, "loss": 0.4766, "num_input_tokens_seen": 2814144, "step": 2315 }, { "epoch": 0.2906903896754793, "grad_norm": 12.495382308959961, "learning_rate": 1.4528254604686133e-06, "loss": 0.4477, "num_input_tokens_seen": 2820736, "step": 2320 }, { "epoch": 0.2913168775842626, "grad_norm": 6.577590465545654, "learning_rate": 1.4559579000125298e-06, "loss": 0.4492, "num_input_tokens_seen": 2826720, "step": 2325 }, { "epoch": 0.291943365493046, "grad_norm": 6.355686187744141, "learning_rate": 1.4590903395564468e-06, "loss": 0.4557, "num_input_tokens_seen": 2832832, "step": 2330 }, { "epoch": 0.29256985340182934, "grad_norm": 6.60156774520874, "learning_rate": 1.4622227791003636e-06, "loss": 0.4778, "num_input_tokens_seen": 2838496, "step": 2335 }, { "epoch": 0.29319634131061273, "grad_norm": 4.782873630523682, "learning_rate": 1.46535521864428e-06, "loss": 0.4738, "num_input_tokens_seen": 2844704, "step": 2340 }, { "epoch": 0.29382282921939606, "grad_norm": 7.035074234008789, "learning_rate": 1.468487658188197e-06, "loss": 0.4627, "num_input_tokens_seen": 2851168, "step": 2345 }, { "epoch": 0.2944493171281794, "grad_norm": 6.381319999694824, "learning_rate": 1.4716200977321138e-06, "loss": 0.4611, "num_input_tokens_seen": 2856608, "step": 2350 }, { "epoch": 0.2950758050369628, "grad_norm": 7.576340198516846, "learning_rate": 1.4747525372760308e-06, "loss": 0.459, "num_input_tokens_seen": 2862560, "step": 2355 }, { "epoch": 0.2957022929457461, "grad_norm": 7.566466331481934, "learning_rate": 1.4778849768199474e-06, "loss": 0.4585, "num_input_tokens_seen": 2869024, "step": 2360 }, { "epoch": 0.2963287808545295, "grad_norm": 12.626879692077637, "learning_rate": 1.4810174163638643e-06, "loss": 0.478, "num_input_tokens_seen": 2874848, "step": 2365 }, { "epoch": 0.29695526876331285, "grad_norm": 6.497006893157959, "learning_rate": 1.4841498559077811e-06, "loss": 0.4858, "num_input_tokens_seen": 2881120, "step": 2370 }, { "epoch": 0.29758175667209624, "grad_norm": 9.605722427368164, "learning_rate": 1.487282295451698e-06, "loss": 0.4684, "num_input_tokens_seen": 2887392, "step": 2375 }, { "epoch": 0.2982082445808796, "grad_norm": 7.327104568481445, "learning_rate": 1.4904147349956146e-06, "loss": 0.4781, "num_input_tokens_seen": 2893568, "step": 2380 }, { "epoch": 0.29883473248966297, "grad_norm": 6.57773494720459, "learning_rate": 1.4935471745395316e-06, "loss": 0.4649, "num_input_tokens_seen": 2899200, "step": 2385 }, { "epoch": 0.2994612203984463, "grad_norm": 7.187891483306885, "learning_rate": 1.4966796140834484e-06, "loss": 0.4569, "num_input_tokens_seen": 2905408, "step": 2390 }, { "epoch": 0.3000877083072297, "grad_norm": 11.22510051727295, "learning_rate": 1.499812053627365e-06, "loss": 0.4773, "num_input_tokens_seen": 2911520, "step": 2395 }, { "epoch": 0.30071419621601303, "grad_norm": 11.554910659790039, "learning_rate": 1.502944493171282e-06, "loss": 0.4535, "num_input_tokens_seen": 2917120, "step": 2400 }, { "epoch": 0.30134068412479637, "grad_norm": 6.017333030700684, "learning_rate": 1.5060769327151987e-06, "loss": 0.4691, "num_input_tokens_seen": 2922976, "step": 2405 }, { "epoch": 0.30196717203357976, "grad_norm": 11.296586036682129, "learning_rate": 1.5092093722591156e-06, "loss": 0.4635, "num_input_tokens_seen": 2929056, "step": 2410 }, { "epoch": 0.3025936599423631, "grad_norm": 8.032004356384277, "learning_rate": 1.5123418118030322e-06, "loss": 0.4409, "num_input_tokens_seen": 2935296, "step": 2415 }, { "epoch": 0.3032201478511465, "grad_norm": 9.901954650878906, "learning_rate": 1.5154742513469492e-06, "loss": 0.4972, "num_input_tokens_seen": 2941312, "step": 2420 }, { "epoch": 0.3038466357599298, "grad_norm": 10.95311164855957, "learning_rate": 1.518606690890866e-06, "loss": 0.5039, "num_input_tokens_seen": 2947488, "step": 2425 }, { "epoch": 0.3044731236687132, "grad_norm": 5.868508338928223, "learning_rate": 1.521739130434783e-06, "loss": 0.4785, "num_input_tokens_seen": 2953312, "step": 2430 }, { "epoch": 0.30509961157749654, "grad_norm": 4.8908162117004395, "learning_rate": 1.5248715699786995e-06, "loss": 0.4599, "num_input_tokens_seen": 2958912, "step": 2435 }, { "epoch": 0.30572609948627993, "grad_norm": 6.059788703918457, "learning_rate": 1.5280040095226164e-06, "loss": 0.4603, "num_input_tokens_seen": 2965120, "step": 2440 }, { "epoch": 0.30635258739506327, "grad_norm": 5.500459671020508, "learning_rate": 1.5311364490665332e-06, "loss": 0.4662, "num_input_tokens_seen": 2971136, "step": 2445 }, { "epoch": 0.30697907530384666, "grad_norm": 7.3190507888793945, "learning_rate": 1.5342688886104497e-06, "loss": 0.4509, "num_input_tokens_seen": 2977056, "step": 2450 }, { "epoch": 0.30760556321263, "grad_norm": 7.404897689819336, "learning_rate": 1.5374013281543667e-06, "loss": 0.4872, "num_input_tokens_seen": 2983200, "step": 2455 }, { "epoch": 0.30823205112141333, "grad_norm": 6.526014804840088, "learning_rate": 1.5405337676982835e-06, "loss": 0.4633, "num_input_tokens_seen": 2989248, "step": 2460 }, { "epoch": 0.3088585390301967, "grad_norm": 5.260727882385254, "learning_rate": 1.5436662072422005e-06, "loss": 0.4742, "num_input_tokens_seen": 2995424, "step": 2465 }, { "epoch": 0.30948502693898006, "grad_norm": 6.5559163093566895, "learning_rate": 1.546798646786117e-06, "loss": 0.4816, "num_input_tokens_seen": 3001568, "step": 2470 }, { "epoch": 0.31011151484776345, "grad_norm": 5.012643337249756, "learning_rate": 1.549931086330034e-06, "loss": 0.4805, "num_input_tokens_seen": 3007616, "step": 2475 }, { "epoch": 0.3107380027565468, "grad_norm": 4.682682037353516, "learning_rate": 1.5530635258739507e-06, "loss": 0.4556, "num_input_tokens_seen": 3014080, "step": 2480 }, { "epoch": 0.3113644906653302, "grad_norm": 6.5731964111328125, "learning_rate": 1.5561959654178677e-06, "loss": 0.4744, "num_input_tokens_seen": 3020096, "step": 2485 }, { "epoch": 0.3119909785741135, "grad_norm": 7.130102157592773, "learning_rate": 1.5593284049617843e-06, "loss": 0.4753, "num_input_tokens_seen": 3026208, "step": 2490 }, { "epoch": 0.3126174664828969, "grad_norm": 8.767916679382324, "learning_rate": 1.5624608445057012e-06, "loss": 0.4582, "num_input_tokens_seen": 3032640, "step": 2495 }, { "epoch": 0.31324395439168023, "grad_norm": 6.622191905975342, "learning_rate": 1.565593284049618e-06, "loss": 0.4683, "num_input_tokens_seen": 3038592, "step": 2500 }, { "epoch": 0.3138704423004636, "grad_norm": 8.262925148010254, "learning_rate": 1.5687257235935346e-06, "loss": 0.4678, "num_input_tokens_seen": 3044576, "step": 2505 }, { "epoch": 0.31449693020924696, "grad_norm": 9.419342041015625, "learning_rate": 1.5718581631374515e-06, "loss": 0.4504, "num_input_tokens_seen": 3050656, "step": 2510 }, { "epoch": 0.3151234181180303, "grad_norm": 9.991503715515137, "learning_rate": 1.5749906026813683e-06, "loss": 0.4868, "num_input_tokens_seen": 3056992, "step": 2515 }, { "epoch": 0.3157499060268137, "grad_norm": 6.635571002960205, "learning_rate": 1.5781230422252853e-06, "loss": 0.4652, "num_input_tokens_seen": 3063040, "step": 2520 }, { "epoch": 0.316376393935597, "grad_norm": 8.849898338317871, "learning_rate": 1.5812554817692018e-06, "loss": 0.4646, "num_input_tokens_seen": 3069056, "step": 2525 }, { "epoch": 0.3170028818443804, "grad_norm": 11.442172050476074, "learning_rate": 1.5843879213131188e-06, "loss": 0.4629, "num_input_tokens_seen": 3075552, "step": 2530 }, { "epoch": 0.31762936975316375, "grad_norm": 7.259812831878662, "learning_rate": 1.5875203608570356e-06, "loss": 0.4737, "num_input_tokens_seen": 3081824, "step": 2535 }, { "epoch": 0.31825585766194714, "grad_norm": 7.989844799041748, "learning_rate": 1.5906528004009525e-06, "loss": 0.459, "num_input_tokens_seen": 3088032, "step": 2540 }, { "epoch": 0.3188823455707305, "grad_norm": 6.140463352203369, "learning_rate": 1.593785239944869e-06, "loss": 0.4715, "num_input_tokens_seen": 3093920, "step": 2545 }, { "epoch": 0.31950883347951387, "grad_norm": 3.599010944366455, "learning_rate": 1.596917679488786e-06, "loss": 0.46, "num_input_tokens_seen": 3099872, "step": 2550 }, { "epoch": 0.3201353213882972, "grad_norm": 6.50638484954834, "learning_rate": 1.6000501190327028e-06, "loss": 0.4692, "num_input_tokens_seen": 3106176, "step": 2555 }, { "epoch": 0.3207618092970806, "grad_norm": 12.114567756652832, "learning_rate": 1.6031825585766194e-06, "loss": 0.4697, "num_input_tokens_seen": 3112416, "step": 2560 }, { "epoch": 0.3213882972058639, "grad_norm": 10.44322681427002, "learning_rate": 1.6063149981205363e-06, "loss": 0.4661, "num_input_tokens_seen": 3118624, "step": 2565 }, { "epoch": 0.32201478511464726, "grad_norm": 8.146123886108398, "learning_rate": 1.6094474376644531e-06, "loss": 0.4758, "num_input_tokens_seen": 3124736, "step": 2570 }, { "epoch": 0.32264127302343065, "grad_norm": 6.8038105964660645, "learning_rate": 1.61257987720837e-06, "loss": 0.4509, "num_input_tokens_seen": 3130816, "step": 2575 }, { "epoch": 0.323267760932214, "grad_norm": 7.451169013977051, "learning_rate": 1.6157123167522866e-06, "loss": 0.4688, "num_input_tokens_seen": 3137120, "step": 2580 }, { "epoch": 0.3238942488409974, "grad_norm": 8.993637084960938, "learning_rate": 1.6188447562962036e-06, "loss": 0.461, "num_input_tokens_seen": 3143232, "step": 2585 }, { "epoch": 0.3245207367497807, "grad_norm": 6.228075981140137, "learning_rate": 1.6219771958401204e-06, "loss": 0.4545, "num_input_tokens_seen": 3149344, "step": 2590 }, { "epoch": 0.3251472246585641, "grad_norm": 6.031264781951904, "learning_rate": 1.6251096353840374e-06, "loss": 0.4816, "num_input_tokens_seen": 3155904, "step": 2595 }, { "epoch": 0.32577371256734744, "grad_norm": 5.59621524810791, "learning_rate": 1.628242074927954e-06, "loss": 0.4811, "num_input_tokens_seen": 3161792, "step": 2600 }, { "epoch": 0.32640020047613083, "grad_norm": 8.603852272033691, "learning_rate": 1.6313745144718709e-06, "loss": 0.5015, "num_input_tokens_seen": 3168032, "step": 2605 }, { "epoch": 0.32702668838491417, "grad_norm": 6.318688869476318, "learning_rate": 1.6345069540157876e-06, "loss": 0.4565, "num_input_tokens_seen": 3174016, "step": 2610 }, { "epoch": 0.32765317629369756, "grad_norm": 5.274566650390625, "learning_rate": 1.6376393935597044e-06, "loss": 0.477, "num_input_tokens_seen": 3180096, "step": 2615 }, { "epoch": 0.3282796642024809, "grad_norm": 4.291835308074951, "learning_rate": 1.6407718331036212e-06, "loss": 0.4704, "num_input_tokens_seen": 3186368, "step": 2620 }, { "epoch": 0.3289061521112642, "grad_norm": 6.385632514953613, "learning_rate": 1.643904272647538e-06, "loss": 0.454, "num_input_tokens_seen": 3192000, "step": 2625 }, { "epoch": 0.3295326400200476, "grad_norm": 5.287576675415039, "learning_rate": 1.647036712191455e-06, "loss": 0.4574, "num_input_tokens_seen": 3198528, "step": 2630 }, { "epoch": 0.33015912792883095, "grad_norm": 7.367983341217041, "learning_rate": 1.6501691517353717e-06, "loss": 0.4763, "num_input_tokens_seen": 3204608, "step": 2635 }, { "epoch": 0.33078561583761434, "grad_norm": 6.7435221672058105, "learning_rate": 1.6533015912792884e-06, "loss": 0.4568, "num_input_tokens_seen": 3210592, "step": 2640 }, { "epoch": 0.3314121037463977, "grad_norm": 7.723214626312256, "learning_rate": 1.6564340308232052e-06, "loss": 0.4782, "num_input_tokens_seen": 3216704, "step": 2645 }, { "epoch": 0.33203859165518107, "grad_norm": 7.5385003089904785, "learning_rate": 1.6595664703671222e-06, "loss": 0.4832, "num_input_tokens_seen": 3223168, "step": 2650 }, { "epoch": 0.3326650795639644, "grad_norm": 7.168532848358154, "learning_rate": 1.662698909911039e-06, "loss": 0.4744, "num_input_tokens_seen": 3229056, "step": 2655 }, { "epoch": 0.3332915674727478, "grad_norm": 4.892106056213379, "learning_rate": 1.6658313494549557e-06, "loss": 0.4495, "num_input_tokens_seen": 3235168, "step": 2660 }, { "epoch": 0.33391805538153113, "grad_norm": 4.249660968780518, "learning_rate": 1.6689637889988725e-06, "loss": 0.4657, "num_input_tokens_seen": 3241120, "step": 2665 }, { "epoch": 0.3345445432903145, "grad_norm": 6.1600141525268555, "learning_rate": 1.6720962285427892e-06, "loss": 0.4832, "num_input_tokens_seen": 3247392, "step": 2670 }, { "epoch": 0.33517103119909786, "grad_norm": 4.330707550048828, "learning_rate": 1.675228668086706e-06, "loss": 0.471, "num_input_tokens_seen": 3253536, "step": 2675 }, { "epoch": 0.3357975191078812, "grad_norm": 7.301163196563721, "learning_rate": 1.6783611076306227e-06, "loss": 0.4621, "num_input_tokens_seen": 3259648, "step": 2680 }, { "epoch": 0.3364240070166646, "grad_norm": 9.511617660522461, "learning_rate": 1.6814935471745397e-06, "loss": 0.4655, "num_input_tokens_seen": 3265280, "step": 2685 }, { "epoch": 0.3370504949254479, "grad_norm": 5.562314510345459, "learning_rate": 1.6846259867184565e-06, "loss": 0.4695, "num_input_tokens_seen": 3271264, "step": 2690 }, { "epoch": 0.3376769828342313, "grad_norm": 5.025343894958496, "learning_rate": 1.6877584262623732e-06, "loss": 0.4885, "num_input_tokens_seen": 3277184, "step": 2695 }, { "epoch": 0.33830347074301464, "grad_norm": 5.050377368927002, "learning_rate": 1.69089086580629e-06, "loss": 0.4686, "num_input_tokens_seen": 3283392, "step": 2700 }, { "epoch": 0.33892995865179804, "grad_norm": 6.0004987716674805, "learning_rate": 1.694023305350207e-06, "loss": 0.4685, "num_input_tokens_seen": 3289472, "step": 2705 }, { "epoch": 0.33955644656058137, "grad_norm": 5.535054683685303, "learning_rate": 1.6971557448941237e-06, "loss": 0.4625, "num_input_tokens_seen": 3295712, "step": 2710 }, { "epoch": 0.34018293446936476, "grad_norm": 5.140888214111328, "learning_rate": 1.7002881844380405e-06, "loss": 0.4535, "num_input_tokens_seen": 3300992, "step": 2715 }, { "epoch": 0.3408094223781481, "grad_norm": 4.212894916534424, "learning_rate": 1.7034206239819573e-06, "loss": 0.4617, "num_input_tokens_seen": 3306304, "step": 2720 }, { "epoch": 0.3414359102869315, "grad_norm": 6.805813789367676, "learning_rate": 1.706553063525874e-06, "loss": 0.4687, "num_input_tokens_seen": 3312480, "step": 2725 }, { "epoch": 0.3420623981957148, "grad_norm": 7.074794292449951, "learning_rate": 1.709685503069791e-06, "loss": 0.4834, "num_input_tokens_seen": 3318656, "step": 2730 }, { "epoch": 0.34268888610449816, "grad_norm": 6.334695816040039, "learning_rate": 1.7128179426137076e-06, "loss": 0.4628, "num_input_tokens_seen": 3324480, "step": 2735 }, { "epoch": 0.34331537401328155, "grad_norm": 5.424689769744873, "learning_rate": 1.7159503821576245e-06, "loss": 0.4741, "num_input_tokens_seen": 3330176, "step": 2740 }, { "epoch": 0.3439418619220649, "grad_norm": 6.120943546295166, "learning_rate": 1.7190828217015413e-06, "loss": 0.4743, "num_input_tokens_seen": 3336224, "step": 2745 }, { "epoch": 0.3445683498308483, "grad_norm": 5.431140422821045, "learning_rate": 1.7222152612454583e-06, "loss": 0.4639, "num_input_tokens_seen": 3342464, "step": 2750 }, { "epoch": 0.3451948377396316, "grad_norm": 6.564249038696289, "learning_rate": 1.7253477007893748e-06, "loss": 0.4636, "num_input_tokens_seen": 3348800, "step": 2755 }, { "epoch": 0.345821325648415, "grad_norm": 11.583648681640625, "learning_rate": 1.7284801403332918e-06, "loss": 0.468, "num_input_tokens_seen": 3355104, "step": 2760 }, { "epoch": 0.34644781355719834, "grad_norm": 5.24657678604126, "learning_rate": 1.7316125798772086e-06, "loss": 0.4683, "num_input_tokens_seen": 3360800, "step": 2765 }, { "epoch": 0.3470743014659817, "grad_norm": 5.229267597198486, "learning_rate": 1.7347450194211255e-06, "loss": 0.4675, "num_input_tokens_seen": 3366912, "step": 2770 }, { "epoch": 0.34770078937476506, "grad_norm": 4.194070816040039, "learning_rate": 1.737877458965042e-06, "loss": 0.4621, "num_input_tokens_seen": 3372864, "step": 2775 }, { "epoch": 0.34832727728354845, "grad_norm": 4.222793102264404, "learning_rate": 1.7410098985089589e-06, "loss": 0.4603, "num_input_tokens_seen": 3378880, "step": 2780 }, { "epoch": 0.3489537651923318, "grad_norm": 9.517505645751953, "learning_rate": 1.7441423380528758e-06, "loss": 0.4758, "num_input_tokens_seen": 3384992, "step": 2785 }, { "epoch": 0.3495802531011151, "grad_norm": 4.566517353057861, "learning_rate": 1.7472747775967924e-06, "loss": 0.4547, "num_input_tokens_seen": 3391296, "step": 2790 }, { "epoch": 0.3502067410098985, "grad_norm": 5.3896098136901855, "learning_rate": 1.7504072171407094e-06, "loss": 0.4483, "num_input_tokens_seen": 3397632, "step": 2795 }, { "epoch": 0.35083322891868185, "grad_norm": 9.367375373840332, "learning_rate": 1.7535396566846261e-06, "loss": 0.466, "num_input_tokens_seen": 3403904, "step": 2800 }, { "epoch": 0.35145971682746524, "grad_norm": 7.033849716186523, "learning_rate": 1.756672096228543e-06, "loss": 0.4766, "num_input_tokens_seen": 3409312, "step": 2805 }, { "epoch": 0.3520862047362486, "grad_norm": 4.133486747741699, "learning_rate": 1.7598045357724596e-06, "loss": 0.4808, "num_input_tokens_seen": 3415616, "step": 2810 }, { "epoch": 0.35271269264503197, "grad_norm": 4.687460422515869, "learning_rate": 1.7629369753163766e-06, "loss": 0.4666, "num_input_tokens_seen": 3422080, "step": 2815 }, { "epoch": 0.3533391805538153, "grad_norm": 4.345351219177246, "learning_rate": 1.7660694148602934e-06, "loss": 0.4685, "num_input_tokens_seen": 3428416, "step": 2820 }, { "epoch": 0.3539656684625987, "grad_norm": 7.236639499664307, "learning_rate": 1.7692018544042104e-06, "loss": 0.4634, "num_input_tokens_seen": 3434656, "step": 2825 }, { "epoch": 0.35459215637138203, "grad_norm": 5.486993312835693, "learning_rate": 1.772334293948127e-06, "loss": 0.4737, "num_input_tokens_seen": 3440128, "step": 2830 }, { "epoch": 0.3552186442801654, "grad_norm": 6.679039001464844, "learning_rate": 1.7754667334920437e-06, "loss": 0.4671, "num_input_tokens_seen": 3446336, "step": 2835 }, { "epoch": 0.35584513218894875, "grad_norm": 10.969685554504395, "learning_rate": 1.7785991730359606e-06, "loss": 0.4754, "num_input_tokens_seen": 3452384, "step": 2840 }, { "epoch": 0.3564716200977321, "grad_norm": 3.4168713092803955, "learning_rate": 1.7817316125798772e-06, "loss": 0.4668, "num_input_tokens_seen": 3458560, "step": 2845 }, { "epoch": 0.3570981080065155, "grad_norm": 4.0375847816467285, "learning_rate": 1.7848640521237942e-06, "loss": 0.469, "num_input_tokens_seen": 3464640, "step": 2850 }, { "epoch": 0.3577245959152988, "grad_norm": 5.427146911621094, "learning_rate": 1.787996491667711e-06, "loss": 0.464, "num_input_tokens_seen": 3470656, "step": 2855 }, { "epoch": 0.3583510838240822, "grad_norm": 3.9616377353668213, "learning_rate": 1.791128931211628e-06, "loss": 0.4613, "num_input_tokens_seen": 3476864, "step": 2860 }, { "epoch": 0.35897757173286554, "grad_norm": 6.497121810913086, "learning_rate": 1.7942613707555445e-06, "loss": 0.4703, "num_input_tokens_seen": 3483168, "step": 2865 }, { "epoch": 0.35960405964164893, "grad_norm": 4.322838306427002, "learning_rate": 1.7973938102994614e-06, "loss": 0.4576, "num_input_tokens_seen": 3489248, "step": 2870 }, { "epoch": 0.36023054755043227, "grad_norm": 3.955615758895874, "learning_rate": 1.8005262498433782e-06, "loss": 0.4671, "num_input_tokens_seen": 3495264, "step": 2875 }, { "epoch": 0.36085703545921566, "grad_norm": 4.8577775955200195, "learning_rate": 1.8036586893872952e-06, "loss": 0.4739, "num_input_tokens_seen": 3501632, "step": 2880 }, { "epoch": 0.361483523367999, "grad_norm": 3.976456880569458, "learning_rate": 1.8067911289312117e-06, "loss": 0.448, "num_input_tokens_seen": 3507360, "step": 2885 }, { "epoch": 0.3621100112767824, "grad_norm": 5.314515590667725, "learning_rate": 1.8099235684751285e-06, "loss": 0.4609, "num_input_tokens_seen": 3513472, "step": 2890 }, { "epoch": 0.3627364991855657, "grad_norm": 4.270474433898926, "learning_rate": 1.8130560080190455e-06, "loss": 0.4524, "num_input_tokens_seen": 3519840, "step": 2895 }, { "epoch": 0.36336298709434905, "grad_norm": 4.731741428375244, "learning_rate": 1.816188447562962e-06, "loss": 0.4707, "num_input_tokens_seen": 3526080, "step": 2900 }, { "epoch": 0.36398947500313245, "grad_norm": 5.959033489227295, "learning_rate": 1.819320887106879e-06, "loss": 0.4798, "num_input_tokens_seen": 3532160, "step": 2905 }, { "epoch": 0.3646159629119158, "grad_norm": 5.074324131011963, "learning_rate": 1.8224533266507958e-06, "loss": 0.4514, "num_input_tokens_seen": 3538528, "step": 2910 }, { "epoch": 0.36524245082069917, "grad_norm": 3.2641282081604004, "learning_rate": 1.8255857661947127e-06, "loss": 0.4835, "num_input_tokens_seen": 3544896, "step": 2915 }, { "epoch": 0.3658689387294825, "grad_norm": 6.897470474243164, "learning_rate": 1.8287182057386293e-06, "loss": 0.4404, "num_input_tokens_seen": 3551136, "step": 2920 }, { "epoch": 0.3664954266382659, "grad_norm": 8.880521774291992, "learning_rate": 1.8318506452825463e-06, "loss": 0.4827, "num_input_tokens_seen": 3557248, "step": 2925 }, { "epoch": 0.36712191454704923, "grad_norm": 4.621381759643555, "learning_rate": 1.834983084826463e-06, "loss": 0.463, "num_input_tokens_seen": 3563200, "step": 2930 }, { "epoch": 0.3677484024558326, "grad_norm": 3.0601649284362793, "learning_rate": 1.83811552437038e-06, "loss": 0.4665, "num_input_tokens_seen": 3569376, "step": 2935 }, { "epoch": 0.36837489036461596, "grad_norm": 3.5063915252685547, "learning_rate": 1.8412479639142965e-06, "loss": 0.4703, "num_input_tokens_seen": 3575584, "step": 2940 }, { "epoch": 0.36900137827339935, "grad_norm": 4.6919426918029785, "learning_rate": 1.8443804034582133e-06, "loss": 0.4495, "num_input_tokens_seen": 3581728, "step": 2945 }, { "epoch": 0.3696278661821827, "grad_norm": 6.571033000946045, "learning_rate": 1.8475128430021303e-06, "loss": 0.4809, "num_input_tokens_seen": 3587936, "step": 2950 }, { "epoch": 0.370254354090966, "grad_norm": 4.979684829711914, "learning_rate": 1.8506452825460468e-06, "loss": 0.4891, "num_input_tokens_seen": 3594240, "step": 2955 }, { "epoch": 0.3708808419997494, "grad_norm": 7.004514217376709, "learning_rate": 1.8537777220899638e-06, "loss": 0.4598, "num_input_tokens_seen": 3600288, "step": 2960 }, { "epoch": 0.37150732990853275, "grad_norm": 6.743952751159668, "learning_rate": 1.8569101616338806e-06, "loss": 0.4664, "num_input_tokens_seen": 3606368, "step": 2965 }, { "epoch": 0.37213381781731614, "grad_norm": 2.873061418533325, "learning_rate": 1.8600426011777975e-06, "loss": 0.4481, "num_input_tokens_seen": 3612608, "step": 2970 }, { "epoch": 0.3727603057260995, "grad_norm": 6.341067314147949, "learning_rate": 1.863175040721714e-06, "loss": 0.485, "num_input_tokens_seen": 3618784, "step": 2975 }, { "epoch": 0.37338679363488286, "grad_norm": 4.04984712600708, "learning_rate": 1.866307480265631e-06, "loss": 0.5027, "num_input_tokens_seen": 3624864, "step": 2980 }, { "epoch": 0.3740132815436662, "grad_norm": 6.403581619262695, "learning_rate": 1.8694399198095478e-06, "loss": 0.4868, "num_input_tokens_seen": 3631488, "step": 2985 }, { "epoch": 0.3746397694524496, "grad_norm": 4.747465133666992, "learning_rate": 1.8725723593534648e-06, "loss": 0.4798, "num_input_tokens_seen": 3637600, "step": 2990 }, { "epoch": 0.3752662573612329, "grad_norm": 4.266440391540527, "learning_rate": 1.8757047988973814e-06, "loss": 0.4677, "num_input_tokens_seen": 3643328, "step": 2995 }, { "epoch": 0.3758927452700163, "grad_norm": 7.078805923461914, "learning_rate": 1.8788372384412981e-06, "loss": 0.4805, "num_input_tokens_seen": 3649440, "step": 3000 }, { "epoch": 0.37651923317879965, "grad_norm": 3.756160020828247, "learning_rate": 1.881969677985215e-06, "loss": 0.4645, "num_input_tokens_seen": 3655712, "step": 3005 }, { "epoch": 0.377145721087583, "grad_norm": 2.562373161315918, "learning_rate": 1.8851021175291317e-06, "loss": 0.4641, "num_input_tokens_seen": 3661664, "step": 3010 }, { "epoch": 0.3777722089963664, "grad_norm": 4.30961275100708, "learning_rate": 1.8882345570730486e-06, "loss": 0.464, "num_input_tokens_seen": 3667520, "step": 3015 }, { "epoch": 0.3783986969051497, "grad_norm": 7.766035079956055, "learning_rate": 1.8913669966169654e-06, "loss": 0.4739, "num_input_tokens_seen": 3673760, "step": 3020 }, { "epoch": 0.3790251848139331, "grad_norm": 4.981832027435303, "learning_rate": 1.8944994361608824e-06, "loss": 0.4565, "num_input_tokens_seen": 3680000, "step": 3025 }, { "epoch": 0.37965167272271644, "grad_norm": 4.622098445892334, "learning_rate": 1.897631875704799e-06, "loss": 0.4718, "num_input_tokens_seen": 3686400, "step": 3030 }, { "epoch": 0.38027816063149983, "grad_norm": 4.090660095214844, "learning_rate": 1.9007643152487159e-06, "loss": 0.4756, "num_input_tokens_seen": 3692512, "step": 3035 }, { "epoch": 0.38090464854028316, "grad_norm": 5.537451267242432, "learning_rate": 1.9038967547926327e-06, "loss": 0.4751, "num_input_tokens_seen": 3697856, "step": 3040 }, { "epoch": 0.38153113644906655, "grad_norm": 7.38533878326416, "learning_rate": 1.9070291943365496e-06, "loss": 0.4709, "num_input_tokens_seen": 3704160, "step": 3045 }, { "epoch": 0.3821576243578499, "grad_norm": 5.303448677062988, "learning_rate": 1.910161633880466e-06, "loss": 0.4524, "num_input_tokens_seen": 3710176, "step": 3050 }, { "epoch": 0.3827841122666333, "grad_norm": 6.850648880004883, "learning_rate": 1.913294073424383e-06, "loss": 0.4655, "num_input_tokens_seen": 3715712, "step": 3055 }, { "epoch": 0.3834106001754166, "grad_norm": 4.845042705535889, "learning_rate": 1.9164265129682997e-06, "loss": 0.4715, "num_input_tokens_seen": 3721600, "step": 3060 }, { "epoch": 0.38403708808419995, "grad_norm": 5.232320308685303, "learning_rate": 1.9195589525122165e-06, "loss": 0.4688, "num_input_tokens_seen": 3727808, "step": 3065 }, { "epoch": 0.38466357599298334, "grad_norm": 6.243874549865723, "learning_rate": 1.9226913920561337e-06, "loss": 0.4462, "num_input_tokens_seen": 3733984, "step": 3070 }, { "epoch": 0.3852900639017667, "grad_norm": 3.8435873985290527, "learning_rate": 1.9258238316000504e-06, "loss": 0.4566, "num_input_tokens_seen": 3740064, "step": 3075 }, { "epoch": 0.38591655181055007, "grad_norm": 3.7921762466430664, "learning_rate": 1.928956271143967e-06, "loss": 0.4612, "num_input_tokens_seen": 3746368, "step": 3080 }, { "epoch": 0.3865430397193334, "grad_norm": 5.384672164916992, "learning_rate": 1.932088710687884e-06, "loss": 0.4701, "num_input_tokens_seen": 3752544, "step": 3085 }, { "epoch": 0.3871695276281168, "grad_norm": 4.316971778869629, "learning_rate": 1.9352211502318007e-06, "loss": 0.4468, "num_input_tokens_seen": 3758528, "step": 3090 }, { "epoch": 0.38779601553690013, "grad_norm": 5.915549278259277, "learning_rate": 1.9383535897757175e-06, "loss": 0.4656, "num_input_tokens_seen": 3764512, "step": 3095 }, { "epoch": 0.3884225034456835, "grad_norm": 7.5600666999816895, "learning_rate": 1.9414860293196342e-06, "loss": 0.4674, "num_input_tokens_seen": 3770432, "step": 3100 }, { "epoch": 0.38904899135446686, "grad_norm": 7.936491012573242, "learning_rate": 1.944618468863551e-06, "loss": 0.4563, "num_input_tokens_seen": 3776736, "step": 3105 }, { "epoch": 0.38967547926325025, "grad_norm": 3.0439646244049072, "learning_rate": 1.9477509084074678e-06, "loss": 0.4807, "num_input_tokens_seen": 3782912, "step": 3110 }, { "epoch": 0.3903019671720336, "grad_norm": 13.778010368347168, "learning_rate": 1.9508833479513845e-06, "loss": 0.4964, "num_input_tokens_seen": 3788640, "step": 3115 }, { "epoch": 0.3909284550808169, "grad_norm": 5.040391445159912, "learning_rate": 1.9540157874953013e-06, "loss": 0.4761, "num_input_tokens_seen": 3793952, "step": 3120 }, { "epoch": 0.3915549429896003, "grad_norm": 3.211547613143921, "learning_rate": 1.9571482270392185e-06, "loss": 0.4726, "num_input_tokens_seen": 3799392, "step": 3125 }, { "epoch": 0.39218143089838364, "grad_norm": 4.538845062255859, "learning_rate": 1.9602806665831352e-06, "loss": 0.4602, "num_input_tokens_seen": 3805760, "step": 3130 }, { "epoch": 0.39280791880716703, "grad_norm": 3.5543880462646484, "learning_rate": 1.963413106127052e-06, "loss": 0.4673, "num_input_tokens_seen": 3811744, "step": 3135 }, { "epoch": 0.39343440671595037, "grad_norm": 3.9276394844055176, "learning_rate": 1.9665455456709688e-06, "loss": 0.4655, "num_input_tokens_seen": 3817472, "step": 3140 }, { "epoch": 0.39406089462473376, "grad_norm": 4.427441120147705, "learning_rate": 1.9696779852148855e-06, "loss": 0.4517, "num_input_tokens_seen": 3823936, "step": 3145 }, { "epoch": 0.3946873825335171, "grad_norm": 2.992065191268921, "learning_rate": 1.9728104247588023e-06, "loss": 0.4636, "num_input_tokens_seen": 3829728, "step": 3150 }, { "epoch": 0.3953138704423005, "grad_norm": 4.674420356750488, "learning_rate": 1.975942864302719e-06, "loss": 0.443, "num_input_tokens_seen": 3836000, "step": 3155 }, { "epoch": 0.3959403583510838, "grad_norm": 7.3628387451171875, "learning_rate": 1.979075303846636e-06, "loss": 0.5033, "num_input_tokens_seen": 3842176, "step": 3160 }, { "epoch": 0.3965668462598672, "grad_norm": 8.792458534240723, "learning_rate": 1.9822077433905526e-06, "loss": 0.4709, "num_input_tokens_seen": 3847648, "step": 3165 }, { "epoch": 0.39719333416865055, "grad_norm": 5.413067817687988, "learning_rate": 1.9853401829344698e-06, "loss": 0.443, "num_input_tokens_seen": 3853952, "step": 3170 }, { "epoch": 0.3978198220774339, "grad_norm": 6.288811206817627, "learning_rate": 1.988472622478386e-06, "loss": 0.483, "num_input_tokens_seen": 3859936, "step": 3175 }, { "epoch": 0.3984463099862173, "grad_norm": 3.470763921737671, "learning_rate": 1.9916050620223033e-06, "loss": 0.4676, "num_input_tokens_seen": 3865856, "step": 3180 }, { "epoch": 0.3990727978950006, "grad_norm": 7.2137274742126465, "learning_rate": 1.99473750156622e-06, "loss": 0.4631, "num_input_tokens_seen": 3872320, "step": 3185 }, { "epoch": 0.399699285803784, "grad_norm": 8.102096557617188, "learning_rate": 1.997869941110137e-06, "loss": 0.4634, "num_input_tokens_seen": 3878400, "step": 3190 }, { "epoch": 0.40032577371256733, "grad_norm": 21.14093589782715, "learning_rate": 2.0010023806540536e-06, "loss": 0.4774, "num_input_tokens_seen": 3884640, "step": 3195 }, { "epoch": 0.4009522616213507, "grad_norm": 14.606161117553711, "learning_rate": 2.0041348201979703e-06, "loss": 0.5068, "num_input_tokens_seen": 3890752, "step": 3200 }, { "epoch": 0.40157874953013406, "grad_norm": 6.689561367034912, "learning_rate": 2.007267259741887e-06, "loss": 0.4756, "num_input_tokens_seen": 3896864, "step": 3205 }, { "epoch": 0.40220523743891745, "grad_norm": 6.093937397003174, "learning_rate": 2.0103996992858043e-06, "loss": 0.4659, "num_input_tokens_seen": 3902944, "step": 3210 }, { "epoch": 0.4028317253477008, "grad_norm": 8.005853652954102, "learning_rate": 2.0135321388297206e-06, "loss": 0.4753, "num_input_tokens_seen": 3909120, "step": 3215 }, { "epoch": 0.4034582132564842, "grad_norm": 8.766292572021484, "learning_rate": 2.0166645783736374e-06, "loss": 0.4561, "num_input_tokens_seen": 3915200, "step": 3220 }, { "epoch": 0.4040847011652675, "grad_norm": 9.298173904418945, "learning_rate": 2.0197970179175546e-06, "loss": 0.4719, "num_input_tokens_seen": 3921440, "step": 3225 }, { "epoch": 0.40471118907405085, "grad_norm": 7.295738220214844, "learning_rate": 2.022929457461471e-06, "loss": 0.4596, "num_input_tokens_seen": 3927488, "step": 3230 }, { "epoch": 0.40533767698283424, "grad_norm": 5.1631550788879395, "learning_rate": 2.026061897005388e-06, "loss": 0.4847, "num_input_tokens_seen": 3933856, "step": 3235 }, { "epoch": 0.4059641648916176, "grad_norm": 7.431179523468018, "learning_rate": 2.029194336549305e-06, "loss": 0.4626, "num_input_tokens_seen": 3939744, "step": 3240 }, { "epoch": 0.40659065280040096, "grad_norm": 6.625420093536377, "learning_rate": 2.0323267760932216e-06, "loss": 0.4646, "num_input_tokens_seen": 3945888, "step": 3245 }, { "epoch": 0.4072171407091843, "grad_norm": 5.328295707702637, "learning_rate": 2.0354592156371384e-06, "loss": 0.4764, "num_input_tokens_seen": 3952128, "step": 3250 }, { "epoch": 0.4078436286179677, "grad_norm": 7.959784507751465, "learning_rate": 2.038591655181055e-06, "loss": 0.4812, "num_input_tokens_seen": 3958432, "step": 3255 }, { "epoch": 0.408470116526751, "grad_norm": 3.8637006282806396, "learning_rate": 2.041724094724972e-06, "loss": 0.4538, "num_input_tokens_seen": 3964512, "step": 3260 }, { "epoch": 0.4090966044355344, "grad_norm": 5.522642135620117, "learning_rate": 2.044856534268889e-06, "loss": 0.4651, "num_input_tokens_seen": 3970624, "step": 3265 }, { "epoch": 0.40972309234431775, "grad_norm": 3.0232460498809814, "learning_rate": 2.0479889738128054e-06, "loss": 0.4874, "num_input_tokens_seen": 3976960, "step": 3270 }, { "epoch": 0.41034958025310114, "grad_norm": 6.034719944000244, "learning_rate": 2.051121413356722e-06, "loss": 0.4669, "num_input_tokens_seen": 3983104, "step": 3275 }, { "epoch": 0.4109760681618845, "grad_norm": 3.128183126449585, "learning_rate": 2.0542538529006394e-06, "loss": 0.4736, "num_input_tokens_seen": 3989280, "step": 3280 }, { "epoch": 0.4116025560706678, "grad_norm": 4.100358486175537, "learning_rate": 2.0573862924445557e-06, "loss": 0.4756, "num_input_tokens_seen": 3995424, "step": 3285 }, { "epoch": 0.4122290439794512, "grad_norm": 3.9037232398986816, "learning_rate": 2.060518731988473e-06, "loss": 0.4601, "num_input_tokens_seen": 4001664, "step": 3290 }, { "epoch": 0.41285553188823454, "grad_norm": 3.758028268814087, "learning_rate": 2.0636511715323897e-06, "loss": 0.4645, "num_input_tokens_seen": 4007616, "step": 3295 }, { "epoch": 0.41348201979701793, "grad_norm": 3.2729997634887695, "learning_rate": 2.0667836110763065e-06, "loss": 0.4582, "num_input_tokens_seen": 4013856, "step": 3300 }, { "epoch": 0.41410850770580127, "grad_norm": 6.7328410148620605, "learning_rate": 2.0699160506202232e-06, "loss": 0.4858, "num_input_tokens_seen": 4020064, "step": 3305 }, { "epoch": 0.41473499561458466, "grad_norm": 5.428465843200684, "learning_rate": 2.07304849016414e-06, "loss": 0.4753, "num_input_tokens_seen": 4025728, "step": 3310 }, { "epoch": 0.415361483523368, "grad_norm": 3.821103811264038, "learning_rate": 2.0761809297080567e-06, "loss": 0.4663, "num_input_tokens_seen": 4031648, "step": 3315 }, { "epoch": 0.4159879714321514, "grad_norm": 3.565420627593994, "learning_rate": 2.079313369251974e-06, "loss": 0.4674, "num_input_tokens_seen": 4037760, "step": 3320 }, { "epoch": 0.4166144593409347, "grad_norm": 4.056554317474365, "learning_rate": 2.0824458087958903e-06, "loss": 0.4499, "num_input_tokens_seen": 4043104, "step": 3325 }, { "epoch": 0.41724094724971805, "grad_norm": 3.773610830307007, "learning_rate": 2.085578248339807e-06, "loss": 0.4535, "num_input_tokens_seen": 4048768, "step": 3330 }, { "epoch": 0.41786743515850144, "grad_norm": 3.7234973907470703, "learning_rate": 2.0887106878837242e-06, "loss": 0.4723, "num_input_tokens_seen": 4054912, "step": 3335 }, { "epoch": 0.4184939230672848, "grad_norm": 4.561150550842285, "learning_rate": 2.0918431274276406e-06, "loss": 0.4848, "num_input_tokens_seen": 4060992, "step": 3340 }, { "epoch": 0.41912041097606817, "grad_norm": 3.3666434288024902, "learning_rate": 2.0949755669715577e-06, "loss": 0.46, "num_input_tokens_seen": 4067296, "step": 3345 }, { "epoch": 0.4197468988848515, "grad_norm": 3.85316801071167, "learning_rate": 2.0981080065154745e-06, "loss": 0.4626, "num_input_tokens_seen": 4073440, "step": 3350 }, { "epoch": 0.4203733867936349, "grad_norm": 3.950249433517456, "learning_rate": 2.1012404460593913e-06, "loss": 0.4607, "num_input_tokens_seen": 4079488, "step": 3355 }, { "epoch": 0.42099987470241823, "grad_norm": 3.8057358264923096, "learning_rate": 2.104372885603308e-06, "loss": 0.4752, "num_input_tokens_seen": 4085440, "step": 3360 }, { "epoch": 0.4216263626112016, "grad_norm": 5.852189064025879, "learning_rate": 2.107505325147225e-06, "loss": 0.4742, "num_input_tokens_seen": 4091584, "step": 3365 }, { "epoch": 0.42225285051998496, "grad_norm": 4.720715522766113, "learning_rate": 2.1106377646911416e-06, "loss": 0.466, "num_input_tokens_seen": 4097888, "step": 3370 }, { "epoch": 0.42287933842876835, "grad_norm": 4.083423137664795, "learning_rate": 2.1137702042350583e-06, "loss": 0.4743, "num_input_tokens_seen": 4103808, "step": 3375 }, { "epoch": 0.4235058263375517, "grad_norm": 4.909229278564453, "learning_rate": 2.116902643778975e-06, "loss": 0.4717, "num_input_tokens_seen": 4110144, "step": 3380 }, { "epoch": 0.424132314246335, "grad_norm": 2.8546319007873535, "learning_rate": 2.120035083322892e-06, "loss": 0.4707, "num_input_tokens_seen": 4116480, "step": 3385 }, { "epoch": 0.4247588021551184, "grad_norm": 4.155975818634033, "learning_rate": 2.123167522866809e-06, "loss": 0.4666, "num_input_tokens_seen": 4122528, "step": 3390 }, { "epoch": 0.42538529006390174, "grad_norm": 5.222706317901611, "learning_rate": 2.1262999624107254e-06, "loss": 0.4778, "num_input_tokens_seen": 4128608, "step": 3395 }, { "epoch": 0.42601177797268514, "grad_norm": 4.200812339782715, "learning_rate": 2.1294324019546426e-06, "loss": 0.4609, "num_input_tokens_seen": 4134688, "step": 3400 }, { "epoch": 0.42663826588146847, "grad_norm": 3.367659330368042, "learning_rate": 2.1325648414985593e-06, "loss": 0.4657, "num_input_tokens_seen": 4141152, "step": 3405 }, { "epoch": 0.42726475379025186, "grad_norm": 4.918159484863281, "learning_rate": 2.135697281042476e-06, "loss": 0.4804, "num_input_tokens_seen": 4147424, "step": 3410 }, { "epoch": 0.4278912416990352, "grad_norm": 2.888634443283081, "learning_rate": 2.138829720586393e-06, "loss": 0.464, "num_input_tokens_seen": 4153376, "step": 3415 }, { "epoch": 0.4285177296078186, "grad_norm": 2.5279767513275146, "learning_rate": 2.1419621601303096e-06, "loss": 0.4657, "num_input_tokens_seen": 4159488, "step": 3420 }, { "epoch": 0.4291442175166019, "grad_norm": 3.1368212699890137, "learning_rate": 2.1450945996742264e-06, "loss": 0.4752, "num_input_tokens_seen": 4165824, "step": 3425 }, { "epoch": 0.4297707054253853, "grad_norm": 1.853352427482605, "learning_rate": 2.148227039218143e-06, "loss": 0.4638, "num_input_tokens_seen": 4171680, "step": 3430 }, { "epoch": 0.43039719333416865, "grad_norm": 2.933814287185669, "learning_rate": 2.15135947876206e-06, "loss": 0.4736, "num_input_tokens_seen": 4177664, "step": 3435 }, { "epoch": 0.431023681242952, "grad_norm": 3.1255087852478027, "learning_rate": 2.1544919183059767e-06, "loss": 0.4628, "num_input_tokens_seen": 4183776, "step": 3440 }, { "epoch": 0.4316501691517354, "grad_norm": 3.340178966522217, "learning_rate": 2.157624357849894e-06, "loss": 0.4627, "num_input_tokens_seen": 4189216, "step": 3445 }, { "epoch": 0.4322766570605187, "grad_norm": 3.6405396461486816, "learning_rate": 2.16075679739381e-06, "loss": 0.4602, "num_input_tokens_seen": 4195392, "step": 3450 }, { "epoch": 0.4329031449693021, "grad_norm": 2.5087592601776123, "learning_rate": 2.1638892369377274e-06, "loss": 0.4542, "num_input_tokens_seen": 4201312, "step": 3455 }, { "epoch": 0.43352963287808544, "grad_norm": 2.6265172958374023, "learning_rate": 2.167021676481644e-06, "loss": 0.4635, "num_input_tokens_seen": 4207296, "step": 3460 }, { "epoch": 0.4341561207868688, "grad_norm": 2.3109450340270996, "learning_rate": 2.170154116025561e-06, "loss": 0.4772, "num_input_tokens_seen": 4213280, "step": 3465 }, { "epoch": 0.43478260869565216, "grad_norm": 2.740180253982544, "learning_rate": 2.1732865555694777e-06, "loss": 0.4614, "num_input_tokens_seen": 4219392, "step": 3470 }, { "epoch": 0.43540909660443555, "grad_norm": 3.2397842407226562, "learning_rate": 2.1764189951133944e-06, "loss": 0.4661, "num_input_tokens_seen": 4225280, "step": 3475 }, { "epoch": 0.4360355845132189, "grad_norm": 3.337172031402588, "learning_rate": 2.179551434657311e-06, "loss": 0.4584, "num_input_tokens_seen": 4231264, "step": 3480 }, { "epoch": 0.4366620724220023, "grad_norm": 2.883998155593872, "learning_rate": 2.182683874201228e-06, "loss": 0.4699, "num_input_tokens_seen": 4237088, "step": 3485 }, { "epoch": 0.4372885603307856, "grad_norm": 4.948882579803467, "learning_rate": 2.1858163137451447e-06, "loss": 0.4559, "num_input_tokens_seen": 4243328, "step": 3490 }, { "epoch": 0.43791504823956895, "grad_norm": 3.6462910175323486, "learning_rate": 2.1889487532890615e-06, "loss": 0.467, "num_input_tokens_seen": 4249408, "step": 3495 }, { "epoch": 0.43854153614835234, "grad_norm": 2.8261210918426514, "learning_rate": 2.1920811928329787e-06, "loss": 0.4636, "num_input_tokens_seen": 4255744, "step": 3500 }, { "epoch": 0.4391680240571357, "grad_norm": 5.620125770568848, "learning_rate": 2.1952136323768954e-06, "loss": 0.4647, "num_input_tokens_seen": 4261888, "step": 3505 }, { "epoch": 0.43979451196591907, "grad_norm": 4.350405216217041, "learning_rate": 2.198346071920812e-06, "loss": 0.4658, "num_input_tokens_seen": 4268480, "step": 3510 }, { "epoch": 0.4404209998747024, "grad_norm": 6.445950508117676, "learning_rate": 2.201478511464729e-06, "loss": 0.4714, "num_input_tokens_seen": 4274624, "step": 3515 }, { "epoch": 0.4410474877834858, "grad_norm": 3.5513551235198975, "learning_rate": 2.2046109510086457e-06, "loss": 0.4593, "num_input_tokens_seen": 4280928, "step": 3520 }, { "epoch": 0.4416739756922691, "grad_norm": 8.487303733825684, "learning_rate": 2.2077433905525625e-06, "loss": 0.4655, "num_input_tokens_seen": 4286848, "step": 3525 }, { "epoch": 0.4423004636010525, "grad_norm": 4.110949993133545, "learning_rate": 2.2108758300964792e-06, "loss": 0.5042, "num_input_tokens_seen": 4292800, "step": 3530 }, { "epoch": 0.44292695150983585, "grad_norm": 3.0452513694763184, "learning_rate": 2.214008269640396e-06, "loss": 0.4621, "num_input_tokens_seen": 4298816, "step": 3535 }, { "epoch": 0.44355343941861924, "grad_norm": 3.3401317596435547, "learning_rate": 2.2171407091843128e-06, "loss": 0.4569, "num_input_tokens_seen": 4305120, "step": 3540 }, { "epoch": 0.4441799273274026, "grad_norm": 3.495713710784912, "learning_rate": 2.2202731487282295e-06, "loss": 0.4496, "num_input_tokens_seen": 4311264, "step": 3545 }, { "epoch": 0.4448064152361859, "grad_norm": 5.17857551574707, "learning_rate": 2.2234055882721463e-06, "loss": 0.4742, "num_input_tokens_seen": 4317504, "step": 3550 }, { "epoch": 0.4454329031449693, "grad_norm": 4.120763301849365, "learning_rate": 2.2265380278160635e-06, "loss": 0.4664, "num_input_tokens_seen": 4324064, "step": 3555 }, { "epoch": 0.44605939105375264, "grad_norm": 2.7252793312072754, "learning_rate": 2.2296704673599802e-06, "loss": 0.4749, "num_input_tokens_seen": 4329856, "step": 3560 }, { "epoch": 0.44668587896253603, "grad_norm": 7.57357120513916, "learning_rate": 2.232802906903897e-06, "loss": 0.4955, "num_input_tokens_seen": 4336384, "step": 3565 }, { "epoch": 0.44731236687131937, "grad_norm": 2.919436454772949, "learning_rate": 2.2359353464478138e-06, "loss": 0.4676, "num_input_tokens_seen": 4342432, "step": 3570 }, { "epoch": 0.44793885478010276, "grad_norm": 3.8940861225128174, "learning_rate": 2.2390677859917305e-06, "loss": 0.4764, "num_input_tokens_seen": 4348352, "step": 3575 }, { "epoch": 0.4485653426888861, "grad_norm": 2.7951040267944336, "learning_rate": 2.2422002255356473e-06, "loss": 0.4649, "num_input_tokens_seen": 4354624, "step": 3580 }, { "epoch": 0.4491918305976695, "grad_norm": 2.4162168502807617, "learning_rate": 2.245332665079564e-06, "loss": 0.4702, "num_input_tokens_seen": 4360224, "step": 3585 }, { "epoch": 0.4498183185064528, "grad_norm": 2.738987922668457, "learning_rate": 2.248465104623481e-06, "loss": 0.4622, "num_input_tokens_seen": 4366272, "step": 3590 }, { "epoch": 0.4504448064152362, "grad_norm": 3.3963029384613037, "learning_rate": 2.2515975441673976e-06, "loss": 0.4636, "num_input_tokens_seen": 4372640, "step": 3595 }, { "epoch": 0.45107129432401954, "grad_norm": 2.8500382900238037, "learning_rate": 2.2547299837113148e-06, "loss": 0.474, "num_input_tokens_seen": 4378560, "step": 3600 }, { "epoch": 0.4516977822328029, "grad_norm": 3.7836086750030518, "learning_rate": 2.257862423255231e-06, "loss": 0.4618, "num_input_tokens_seen": 4384992, "step": 3605 }, { "epoch": 0.45232427014158627, "grad_norm": 2.8836910724639893, "learning_rate": 2.2609948627991483e-06, "loss": 0.4689, "num_input_tokens_seen": 4391200, "step": 3610 }, { "epoch": 0.4529507580503696, "grad_norm": 4.659081935882568, "learning_rate": 2.264127302343065e-06, "loss": 0.4602, "num_input_tokens_seen": 4397408, "step": 3615 }, { "epoch": 0.453577245959153, "grad_norm": 3.052285671234131, "learning_rate": 2.267259741886982e-06, "loss": 0.4621, "num_input_tokens_seen": 4403456, "step": 3620 }, { "epoch": 0.45420373386793633, "grad_norm": 5.774023056030273, "learning_rate": 2.2703921814308986e-06, "loss": 0.4622, "num_input_tokens_seen": 4409984, "step": 3625 }, { "epoch": 0.4548302217767197, "grad_norm": 5.400131702423096, "learning_rate": 2.2735246209748154e-06, "loss": 0.47, "num_input_tokens_seen": 4414912, "step": 3630 }, { "epoch": 0.45545670968550306, "grad_norm": 4.862954139709473, "learning_rate": 2.276657060518732e-06, "loss": 0.458, "num_input_tokens_seen": 4421024, "step": 3635 }, { "epoch": 0.45608319759428645, "grad_norm": 5.215742588043213, "learning_rate": 2.2797895000626493e-06, "loss": 0.4567, "num_input_tokens_seen": 4427360, "step": 3640 }, { "epoch": 0.4567096855030698, "grad_norm": 2.9262075424194336, "learning_rate": 2.2829219396065656e-06, "loss": 0.4712, "num_input_tokens_seen": 4433664, "step": 3645 }, { "epoch": 0.4573361734118532, "grad_norm": 3.8372817039489746, "learning_rate": 2.2860543791504824e-06, "loss": 0.4769, "num_input_tokens_seen": 4439776, "step": 3650 }, { "epoch": 0.4579626613206365, "grad_norm": 3.438995838165283, "learning_rate": 2.2891868186943996e-06, "loss": 0.4582, "num_input_tokens_seen": 4446144, "step": 3655 }, { "epoch": 0.45858914922941985, "grad_norm": 2.296025037765503, "learning_rate": 2.292319258238316e-06, "loss": 0.4658, "num_input_tokens_seen": 4452352, "step": 3660 }, { "epoch": 0.45921563713820324, "grad_norm": 4.105533599853516, "learning_rate": 2.295451697782233e-06, "loss": 0.4624, "num_input_tokens_seen": 4458752, "step": 3665 }, { "epoch": 0.45984212504698657, "grad_norm": 3.2223927974700928, "learning_rate": 2.29858413732615e-06, "loss": 0.4872, "num_input_tokens_seen": 4464768, "step": 3670 }, { "epoch": 0.46046861295576996, "grad_norm": 2.3214097023010254, "learning_rate": 2.3017165768700666e-06, "loss": 0.4729, "num_input_tokens_seen": 4470752, "step": 3675 }, { "epoch": 0.4610951008645533, "grad_norm": 1.9208648204803467, "learning_rate": 2.3048490164139834e-06, "loss": 0.4649, "num_input_tokens_seen": 4477088, "step": 3680 }, { "epoch": 0.4617215887733367, "grad_norm": 3.7648751735687256, "learning_rate": 2.3079814559579e-06, "loss": 0.4736, "num_input_tokens_seen": 4483200, "step": 3685 }, { "epoch": 0.46234807668212, "grad_norm": 3.515852689743042, "learning_rate": 2.311113895501817e-06, "loss": 0.4444, "num_input_tokens_seen": 4489472, "step": 3690 }, { "epoch": 0.4629745645909034, "grad_norm": 3.325610637664795, "learning_rate": 2.314246335045734e-06, "loss": 0.4546, "num_input_tokens_seen": 4495648, "step": 3695 }, { "epoch": 0.46360105249968675, "grad_norm": 5.7719340324401855, "learning_rate": 2.3173787745896505e-06, "loss": 0.4796, "num_input_tokens_seen": 4501728, "step": 3700 }, { "epoch": 0.46422754040847014, "grad_norm": 6.884254455566406, "learning_rate": 2.3205112141335672e-06, "loss": 0.4726, "num_input_tokens_seen": 4507840, "step": 3705 }, { "epoch": 0.4648540283172535, "grad_norm": 4.055144309997559, "learning_rate": 2.3236436536774844e-06, "loss": 0.4635, "num_input_tokens_seen": 4513504, "step": 3710 }, { "epoch": 0.4654805162260368, "grad_norm": 4.233413219451904, "learning_rate": 2.3267760932214008e-06, "loss": 0.4308, "num_input_tokens_seen": 4519552, "step": 3715 }, { "epoch": 0.4661070041348202, "grad_norm": 4.126357078552246, "learning_rate": 2.329908532765318e-06, "loss": 0.4911, "num_input_tokens_seen": 4525088, "step": 3720 }, { "epoch": 0.46673349204360354, "grad_norm": 4.517475605010986, "learning_rate": 2.3330409723092347e-06, "loss": 0.5008, "num_input_tokens_seen": 4531232, "step": 3725 }, { "epoch": 0.46735997995238693, "grad_norm": 4.611202239990234, "learning_rate": 2.3361734118531515e-06, "loss": 0.4895, "num_input_tokens_seen": 4537920, "step": 3730 }, { "epoch": 0.46798646786117026, "grad_norm": 1.4853172302246094, "learning_rate": 2.3393058513970682e-06, "loss": 0.4633, "num_input_tokens_seen": 4544000, "step": 3735 }, { "epoch": 0.46861295576995365, "grad_norm": 2.0050415992736816, "learning_rate": 2.342438290940985e-06, "loss": 0.4582, "num_input_tokens_seen": 4550240, "step": 3740 }, { "epoch": 0.469239443678737, "grad_norm": 4.92007303237915, "learning_rate": 2.3455707304849018e-06, "loss": 0.4775, "num_input_tokens_seen": 4556416, "step": 3745 }, { "epoch": 0.4698659315875204, "grad_norm": 2.7294209003448486, "learning_rate": 2.348703170028819e-06, "loss": 0.4639, "num_input_tokens_seen": 4562368, "step": 3750 }, { "epoch": 0.4704924194963037, "grad_norm": 3.664163827896118, "learning_rate": 2.3518356095727353e-06, "loss": 0.4618, "num_input_tokens_seen": 4568672, "step": 3755 }, { "epoch": 0.4711189074050871, "grad_norm": 2.855962038040161, "learning_rate": 2.354968049116652e-06, "loss": 0.4577, "num_input_tokens_seen": 4574880, "step": 3760 }, { "epoch": 0.47174539531387044, "grad_norm": 3.256833553314209, "learning_rate": 2.3581004886605692e-06, "loss": 0.4653, "num_input_tokens_seen": 4580864, "step": 3765 }, { "epoch": 0.4723718832226538, "grad_norm": 4.477430820465088, "learning_rate": 2.3612329282044856e-06, "loss": 0.4694, "num_input_tokens_seen": 4587104, "step": 3770 }, { "epoch": 0.47299837113143717, "grad_norm": 2.9995064735412598, "learning_rate": 2.3643653677484028e-06, "loss": 0.459, "num_input_tokens_seen": 4593088, "step": 3775 }, { "epoch": 0.4736248590402205, "grad_norm": 5.318706035614014, "learning_rate": 2.3674978072923195e-06, "loss": 0.4684, "num_input_tokens_seen": 4599104, "step": 3780 }, { "epoch": 0.4742513469490039, "grad_norm": 2.569349527359009, "learning_rate": 2.3706302468362363e-06, "loss": 0.4591, "num_input_tokens_seen": 4605312, "step": 3785 }, { "epoch": 0.47487783485778723, "grad_norm": 2.760662317276001, "learning_rate": 2.373762686380153e-06, "loss": 0.4559, "num_input_tokens_seen": 4611648, "step": 3790 }, { "epoch": 0.4755043227665706, "grad_norm": 3.017850160598755, "learning_rate": 2.37689512592407e-06, "loss": 0.4819, "num_input_tokens_seen": 4617024, "step": 3795 }, { "epoch": 0.47613081067535395, "grad_norm": 4.396942615509033, "learning_rate": 2.3800275654679866e-06, "loss": 0.4703, "num_input_tokens_seen": 4623168, "step": 3800 }, { "epoch": 0.47675729858413735, "grad_norm": 4.1818461418151855, "learning_rate": 2.3831600050119038e-06, "loss": 0.4704, "num_input_tokens_seen": 4629248, "step": 3805 }, { "epoch": 0.4773837864929207, "grad_norm": 2.404844284057617, "learning_rate": 2.38629244455582e-06, "loss": 0.4701, "num_input_tokens_seen": 4635136, "step": 3810 }, { "epoch": 0.47801027440170407, "grad_norm": 3.3108181953430176, "learning_rate": 2.389424884099737e-06, "loss": 0.4626, "num_input_tokens_seen": 4641088, "step": 3815 }, { "epoch": 0.4786367623104874, "grad_norm": 6.878119468688965, "learning_rate": 2.392557323643654e-06, "loss": 0.4637, "num_input_tokens_seen": 4647008, "step": 3820 }, { "epoch": 0.47926325021927074, "grad_norm": 9.054509162902832, "learning_rate": 2.3956897631875704e-06, "loss": 0.4699, "num_input_tokens_seen": 4653152, "step": 3825 }, { "epoch": 0.47988973812805413, "grad_norm": 4.969636917114258, "learning_rate": 2.3988222027314876e-06, "loss": 0.4632, "num_input_tokens_seen": 4659360, "step": 3830 }, { "epoch": 0.48051622603683747, "grad_norm": 4.9237871170043945, "learning_rate": 2.4019546422754043e-06, "loss": 0.4496, "num_input_tokens_seen": 4665664, "step": 3835 }, { "epoch": 0.48114271394562086, "grad_norm": 9.50482177734375, "learning_rate": 2.405087081819321e-06, "loss": 0.5083, "num_input_tokens_seen": 4671680, "step": 3840 }, { "epoch": 0.4817692018544042, "grad_norm": 7.373294830322266, "learning_rate": 2.408219521363238e-06, "loss": 0.4764, "num_input_tokens_seen": 4677984, "step": 3845 }, { "epoch": 0.4823956897631876, "grad_norm": 4.792428016662598, "learning_rate": 2.4113519609071546e-06, "loss": 0.4635, "num_input_tokens_seen": 4683552, "step": 3850 }, { "epoch": 0.4830221776719709, "grad_norm": 4.242553234100342, "learning_rate": 2.4144844004510714e-06, "loss": 0.458, "num_input_tokens_seen": 4690016, "step": 3855 }, { "epoch": 0.4836486655807543, "grad_norm": 4.418268203735352, "learning_rate": 2.4176168399949886e-06, "loss": 0.4612, "num_input_tokens_seen": 4696352, "step": 3860 }, { "epoch": 0.48427515348953765, "grad_norm": 4.2054057121276855, "learning_rate": 2.420749279538905e-06, "loss": 0.4649, "num_input_tokens_seen": 4702464, "step": 3865 }, { "epoch": 0.48490164139832104, "grad_norm": 4.86564302444458, "learning_rate": 2.4238817190828217e-06, "loss": 0.465, "num_input_tokens_seen": 4708192, "step": 3870 }, { "epoch": 0.4855281293071044, "grad_norm": 5.129112243652344, "learning_rate": 2.427014158626739e-06, "loss": 0.4349, "num_input_tokens_seen": 4714144, "step": 3875 }, { "epoch": 0.4861546172158877, "grad_norm": 5.301564693450928, "learning_rate": 2.430146598170655e-06, "loss": 0.4523, "num_input_tokens_seen": 4720576, "step": 3880 }, { "epoch": 0.4867811051246711, "grad_norm": 10.727825164794922, "learning_rate": 2.4332790377145724e-06, "loss": 0.4787, "num_input_tokens_seen": 4726624, "step": 3885 }, { "epoch": 0.48740759303345443, "grad_norm": 5.90886926651001, "learning_rate": 2.436411477258489e-06, "loss": 0.4912, "num_input_tokens_seen": 4732736, "step": 3890 }, { "epoch": 0.4880340809422378, "grad_norm": 7.355247974395752, "learning_rate": 2.439543916802406e-06, "loss": 0.4699, "num_input_tokens_seen": 4739104, "step": 3895 }, { "epoch": 0.48866056885102116, "grad_norm": 3.925206422805786, "learning_rate": 2.4426763563463227e-06, "loss": 0.4775, "num_input_tokens_seen": 4745472, "step": 3900 }, { "epoch": 0.48928705675980455, "grad_norm": 5.490139007568359, "learning_rate": 2.4458087958902394e-06, "loss": 0.4688, "num_input_tokens_seen": 4751584, "step": 3905 }, { "epoch": 0.4899135446685879, "grad_norm": 3.736891508102417, "learning_rate": 2.448941235434156e-06, "loss": 0.4721, "num_input_tokens_seen": 4757728, "step": 3910 }, { "epoch": 0.4905400325773713, "grad_norm": 4.802195072174072, "learning_rate": 2.4520736749780734e-06, "loss": 0.474, "num_input_tokens_seen": 4763584, "step": 3915 }, { "epoch": 0.4911665204861546, "grad_norm": 5.351466655731201, "learning_rate": 2.4552061145219897e-06, "loss": 0.4729, "num_input_tokens_seen": 4769536, "step": 3920 }, { "epoch": 0.491793008394938, "grad_norm": 6.395874977111816, "learning_rate": 2.4583385540659065e-06, "loss": 0.4653, "num_input_tokens_seen": 4775616, "step": 3925 }, { "epoch": 0.49241949630372134, "grad_norm": 3.6641876697540283, "learning_rate": 2.4614709936098237e-06, "loss": 0.4748, "num_input_tokens_seen": 4781536, "step": 3930 }, { "epoch": 0.4930459842125047, "grad_norm": 3.8443329334259033, "learning_rate": 2.4646034331537404e-06, "loss": 0.4659, "num_input_tokens_seen": 4787872, "step": 3935 }, { "epoch": 0.49367247212128806, "grad_norm": 4.642219066619873, "learning_rate": 2.467735872697657e-06, "loss": 0.4715, "num_input_tokens_seen": 4793920, "step": 3940 }, { "epoch": 0.4942989600300714, "grad_norm": 3.9120829105377197, "learning_rate": 2.470868312241574e-06, "loss": 0.4691, "num_input_tokens_seen": 4800288, "step": 3945 }, { "epoch": 0.4949254479388548, "grad_norm": 3.1132712364196777, "learning_rate": 2.4740007517854907e-06, "loss": 0.4572, "num_input_tokens_seen": 4806336, "step": 3950 }, { "epoch": 0.4955519358476381, "grad_norm": 2.9249064922332764, "learning_rate": 2.4771331913294075e-06, "loss": 0.4675, "num_input_tokens_seen": 4812608, "step": 3955 }, { "epoch": 0.4961784237564215, "grad_norm": 3.1992642879486084, "learning_rate": 2.4802656308733243e-06, "loss": 0.4594, "num_input_tokens_seen": 4817920, "step": 3960 }, { "epoch": 0.49680491166520485, "grad_norm": 3.270420789718628, "learning_rate": 2.483398070417241e-06, "loss": 0.4758, "num_input_tokens_seen": 4824384, "step": 3965 }, { "epoch": 0.49743139957398824, "grad_norm": 3.348637342453003, "learning_rate": 2.486530509961158e-06, "loss": 0.4511, "num_input_tokens_seen": 4830176, "step": 3970 }, { "epoch": 0.4980578874827716, "grad_norm": 4.703303337097168, "learning_rate": 2.4896629495050745e-06, "loss": 0.4679, "num_input_tokens_seen": 4836288, "step": 3975 }, { "epoch": 0.49868437539155497, "grad_norm": 3.659255027770996, "learning_rate": 2.4927953890489913e-06, "loss": 0.4605, "num_input_tokens_seen": 4842336, "step": 3980 }, { "epoch": 0.4993108633003383, "grad_norm": 3.1675450801849365, "learning_rate": 2.4959278285929085e-06, "loss": 0.4733, "num_input_tokens_seen": 4848320, "step": 3985 }, { "epoch": 0.49993735120912164, "grad_norm": 2.610966444015503, "learning_rate": 2.4990602681368253e-06, "loss": 0.4651, "num_input_tokens_seen": 4853568, "step": 3990 }, { "epoch": 0.500563839117905, "grad_norm": 3.978473663330078, "learning_rate": 2.5021927076807416e-06, "loss": 0.4623, "num_input_tokens_seen": 4859616, "step": 3995 }, { "epoch": 0.5011903270266884, "grad_norm": 3.834299325942993, "learning_rate": 2.5053251472246588e-06, "loss": 0.4666, "num_input_tokens_seen": 4865248, "step": 4000 }, { "epoch": 0.5018168149354717, "grad_norm": 3.7246909141540527, "learning_rate": 2.5084575867685755e-06, "loss": 0.4681, "num_input_tokens_seen": 4871552, "step": 4005 }, { "epoch": 0.5024433028442551, "grad_norm": 4.352261543273926, "learning_rate": 2.5115900263124927e-06, "loss": 0.4677, "num_input_tokens_seen": 4877600, "step": 4010 }, { "epoch": 0.5030697907530385, "grad_norm": 3.5673139095306396, "learning_rate": 2.514722465856409e-06, "loss": 0.4616, "num_input_tokens_seen": 4883872, "step": 4015 }, { "epoch": 0.5036962786618219, "grad_norm": 3.17667293548584, "learning_rate": 2.517854905400326e-06, "loss": 0.4653, "num_input_tokens_seen": 4890048, "step": 4020 }, { "epoch": 0.5043227665706052, "grad_norm": 2.644500732421875, "learning_rate": 2.520987344944243e-06, "loss": 0.4617, "num_input_tokens_seen": 4896448, "step": 4025 }, { "epoch": 0.5049492544793885, "grad_norm": 2.550870895385742, "learning_rate": 2.5241197844881594e-06, "loss": 0.4733, "num_input_tokens_seen": 4902752, "step": 4030 }, { "epoch": 0.5055757423881719, "grad_norm": 3.3588614463806152, "learning_rate": 2.527252224032076e-06, "loss": 0.4721, "num_input_tokens_seen": 4908608, "step": 4035 }, { "epoch": 0.5062022302969552, "grad_norm": 3.1725008487701416, "learning_rate": 2.5303846635759933e-06, "loss": 0.4702, "num_input_tokens_seen": 4914816, "step": 4040 }, { "epoch": 0.5068287182057386, "grad_norm": 3.385061025619507, "learning_rate": 2.53351710311991e-06, "loss": 0.4712, "num_input_tokens_seen": 4920832, "step": 4045 }, { "epoch": 0.507455206114522, "grad_norm": 4.209146976470947, "learning_rate": 2.5366495426638264e-06, "loss": 0.4645, "num_input_tokens_seen": 4926752, "step": 4050 }, { "epoch": 0.5080816940233054, "grad_norm": 3.113900899887085, "learning_rate": 2.5397819822077436e-06, "loss": 0.4614, "num_input_tokens_seen": 4932384, "step": 4055 }, { "epoch": 0.5087081819320887, "grad_norm": 3.519585371017456, "learning_rate": 2.5429144217516604e-06, "loss": 0.4665, "num_input_tokens_seen": 4938528, "step": 4060 }, { "epoch": 0.5093346698408721, "grad_norm": 2.5971522331237793, "learning_rate": 2.5460468612955776e-06, "loss": 0.4748, "num_input_tokens_seen": 4944416, "step": 4065 }, { "epoch": 0.5099611577496554, "grad_norm": 3.2321763038635254, "learning_rate": 2.549179300839494e-06, "loss": 0.465, "num_input_tokens_seen": 4950848, "step": 4070 }, { "epoch": 0.5105876456584388, "grad_norm": 2.8523600101470947, "learning_rate": 2.5523117403834107e-06, "loss": 0.4691, "num_input_tokens_seen": 4956960, "step": 4075 }, { "epoch": 0.5112141335672221, "grad_norm": 4.035209655761719, "learning_rate": 2.555444179927328e-06, "loss": 0.4655, "num_input_tokens_seen": 4962848, "step": 4080 }, { "epoch": 0.5118406214760055, "grad_norm": 4.681708335876465, "learning_rate": 2.558576619471244e-06, "loss": 0.4582, "num_input_tokens_seen": 4969056, "step": 4085 }, { "epoch": 0.5124671093847889, "grad_norm": 4.158288478851318, "learning_rate": 2.561709059015161e-06, "loss": 0.473, "num_input_tokens_seen": 4975168, "step": 4090 }, { "epoch": 0.5130935972935722, "grad_norm": 2.9858815670013428, "learning_rate": 2.564841498559078e-06, "loss": 0.4618, "num_input_tokens_seen": 4981344, "step": 4095 }, { "epoch": 0.5137200852023556, "grad_norm": 2.5377888679504395, "learning_rate": 2.567973938102995e-06, "loss": 0.4827, "num_input_tokens_seen": 4987584, "step": 4100 }, { "epoch": 0.514346573111139, "grad_norm": 3.7750604152679443, "learning_rate": 2.5711063776469112e-06, "loss": 0.4628, "num_input_tokens_seen": 4993632, "step": 4105 }, { "epoch": 0.5149730610199224, "grad_norm": 2.815878391265869, "learning_rate": 2.5742388171908284e-06, "loss": 0.4678, "num_input_tokens_seen": 5000064, "step": 4110 }, { "epoch": 0.5155995489287056, "grad_norm": 2.072521209716797, "learning_rate": 2.577371256734745e-06, "loss": 0.4618, "num_input_tokens_seen": 5006240, "step": 4115 }, { "epoch": 0.516226036837489, "grad_norm": 2.4357399940490723, "learning_rate": 2.5805036962786624e-06, "loss": 0.4673, "num_input_tokens_seen": 5012064, "step": 4120 }, { "epoch": 0.5168525247462724, "grad_norm": 4.616705417633057, "learning_rate": 2.5836361358225787e-06, "loss": 0.4478, "num_input_tokens_seen": 5018272, "step": 4125 }, { "epoch": 0.5174790126550558, "grad_norm": 3.720438003540039, "learning_rate": 2.5867685753664955e-06, "loss": 0.4571, "num_input_tokens_seen": 5024704, "step": 4130 }, { "epoch": 0.5181055005638391, "grad_norm": 4.682794094085693, "learning_rate": 2.5899010149104127e-06, "loss": 0.4423, "num_input_tokens_seen": 5030656, "step": 4135 }, { "epoch": 0.5187319884726225, "grad_norm": 4.2444562911987305, "learning_rate": 2.593033454454329e-06, "loss": 0.4861, "num_input_tokens_seen": 5036896, "step": 4140 }, { "epoch": 0.5193584763814059, "grad_norm": 3.652545213699341, "learning_rate": 2.5961658939982458e-06, "loss": 0.4954, "num_input_tokens_seen": 5043104, "step": 4145 }, { "epoch": 0.5199849642901891, "grad_norm": 4.608511924743652, "learning_rate": 2.599298333542163e-06, "loss": 0.4707, "num_input_tokens_seen": 5049184, "step": 4150 }, { "epoch": 0.5206114521989725, "grad_norm": 3.268127679824829, "learning_rate": 2.6024307730860797e-06, "loss": 0.4583, "num_input_tokens_seen": 5054560, "step": 4155 }, { "epoch": 0.5212379401077559, "grad_norm": 4.136885643005371, "learning_rate": 2.605563212629996e-06, "loss": 0.4733, "num_input_tokens_seen": 5060448, "step": 4160 }, { "epoch": 0.5218644280165393, "grad_norm": 5.947231292724609, "learning_rate": 2.6086956521739132e-06, "loss": 0.4632, "num_input_tokens_seen": 5066880, "step": 4165 }, { "epoch": 0.5224909159253226, "grad_norm": 8.473905563354492, "learning_rate": 2.61182809171783e-06, "loss": 0.4604, "num_input_tokens_seen": 5073248, "step": 4170 }, { "epoch": 0.523117403834106, "grad_norm": 9.779873847961426, "learning_rate": 2.614960531261747e-06, "loss": 0.4901, "num_input_tokens_seen": 5079488, "step": 4175 }, { "epoch": 0.5237438917428894, "grad_norm": 5.918787479400635, "learning_rate": 2.6180929708056635e-06, "loss": 0.4602, "num_input_tokens_seen": 5085664, "step": 4180 }, { "epoch": 0.5243703796516728, "grad_norm": 3.131930112838745, "learning_rate": 2.6212254103495803e-06, "loss": 0.4712, "num_input_tokens_seen": 5091648, "step": 4185 }, { "epoch": 0.524996867560456, "grad_norm": 2.0060980319976807, "learning_rate": 2.6243578498934975e-06, "loss": 0.4622, "num_input_tokens_seen": 5097696, "step": 4190 }, { "epoch": 0.5256233554692394, "grad_norm": 4.453126907348633, "learning_rate": 2.627490289437414e-06, "loss": 0.4674, "num_input_tokens_seen": 5103488, "step": 4195 }, { "epoch": 0.5262498433780228, "grad_norm": 2.681790590286255, "learning_rate": 2.6306227289813306e-06, "loss": 0.4632, "num_input_tokens_seen": 5109824, "step": 4200 }, { "epoch": 0.5268763312868061, "grad_norm": 1.9019272327423096, "learning_rate": 2.6337551685252478e-06, "loss": 0.4677, "num_input_tokens_seen": 5115872, "step": 4205 }, { "epoch": 0.5275028191955895, "grad_norm": 2.4681761264801025, "learning_rate": 2.6368876080691645e-06, "loss": 0.4615, "num_input_tokens_seen": 5120512, "step": 4210 }, { "epoch": 0.5281293071043729, "grad_norm": 2.046565055847168, "learning_rate": 2.640020047613081e-06, "loss": 0.463, "num_input_tokens_seen": 5126784, "step": 4215 }, { "epoch": 0.5287557950131563, "grad_norm": 2.1731066703796387, "learning_rate": 2.643152487156998e-06, "loss": 0.4708, "num_input_tokens_seen": 5132608, "step": 4220 }, { "epoch": 0.5293822829219396, "grad_norm": 2.6440227031707764, "learning_rate": 2.646284926700915e-06, "loss": 0.4623, "num_input_tokens_seen": 5138976, "step": 4225 }, { "epoch": 0.530008770830723, "grad_norm": 2.0767674446105957, "learning_rate": 2.649417366244832e-06, "loss": 0.4533, "num_input_tokens_seen": 5144896, "step": 4230 }, { "epoch": 0.5306352587395063, "grad_norm": 3.3348217010498047, "learning_rate": 2.6525498057887483e-06, "loss": 0.4562, "num_input_tokens_seen": 5150976, "step": 4235 }, { "epoch": 0.5312617466482897, "grad_norm": 2.3956801891326904, "learning_rate": 2.655682245332665e-06, "loss": 0.4583, "num_input_tokens_seen": 5157280, "step": 4240 }, { "epoch": 0.531888234557073, "grad_norm": 2.675344228744507, "learning_rate": 2.6588146848765823e-06, "loss": 0.4756, "num_input_tokens_seen": 5163232, "step": 4245 }, { "epoch": 0.5325147224658564, "grad_norm": 2.4905788898468018, "learning_rate": 2.6619471244204986e-06, "loss": 0.4594, "num_input_tokens_seen": 5168896, "step": 4250 }, { "epoch": 0.5331412103746398, "grad_norm": 2.2905917167663574, "learning_rate": 2.6650795639644154e-06, "loss": 0.4555, "num_input_tokens_seen": 5175264, "step": 4255 }, { "epoch": 0.5337676982834231, "grad_norm": 3.073962450027466, "learning_rate": 2.6682120035083326e-06, "loss": 0.4656, "num_input_tokens_seen": 5181600, "step": 4260 }, { "epoch": 0.5343941861922065, "grad_norm": 1.9296374320983887, "learning_rate": 2.6713444430522493e-06, "loss": 0.4707, "num_input_tokens_seen": 5187744, "step": 4265 }, { "epoch": 0.5350206741009899, "grad_norm": 2.3262460231781006, "learning_rate": 2.6744768825961657e-06, "loss": 0.4598, "num_input_tokens_seen": 5194176, "step": 4270 }, { "epoch": 0.5356471620097732, "grad_norm": 2.8238282203674316, "learning_rate": 2.677609322140083e-06, "loss": 0.4679, "num_input_tokens_seen": 5200544, "step": 4275 }, { "epoch": 0.5362736499185565, "grad_norm": 1.9579746723175049, "learning_rate": 2.6807417616839996e-06, "loss": 0.4577, "num_input_tokens_seen": 5206688, "step": 4280 }, { "epoch": 0.5369001378273399, "grad_norm": 4.298897743225098, "learning_rate": 2.683874201227917e-06, "loss": 0.4592, "num_input_tokens_seen": 5211744, "step": 4285 }, { "epoch": 0.5375266257361233, "grad_norm": 3.1096365451812744, "learning_rate": 2.687006640771833e-06, "loss": 0.447, "num_input_tokens_seen": 5218048, "step": 4290 }, { "epoch": 0.5381531136449067, "grad_norm": 3.661853075027466, "learning_rate": 2.69013908031575e-06, "loss": 0.4468, "num_input_tokens_seen": 5223840, "step": 4295 }, { "epoch": 0.53877960155369, "grad_norm": 2.992501735687256, "learning_rate": 2.693271519859667e-06, "loss": 0.4686, "num_input_tokens_seen": 5229920, "step": 4300 }, { "epoch": 0.5394060894624734, "grad_norm": 4.233214378356934, "learning_rate": 2.6964039594035835e-06, "loss": 0.4976, "num_input_tokens_seen": 5236192, "step": 4305 }, { "epoch": 0.5400325773712568, "grad_norm": 4.64241886138916, "learning_rate": 2.6995363989475002e-06, "loss": 0.4906, "num_input_tokens_seen": 5242560, "step": 4310 }, { "epoch": 0.54065906528004, "grad_norm": 2.219965696334839, "learning_rate": 2.7026688384914174e-06, "loss": 0.4746, "num_input_tokens_seen": 5248896, "step": 4315 }, { "epoch": 0.5412855531888234, "grad_norm": 1.9625486135482788, "learning_rate": 2.705801278035334e-06, "loss": 0.4641, "num_input_tokens_seen": 5254976, "step": 4320 }, { "epoch": 0.5419120410976068, "grad_norm": 3.223358631134033, "learning_rate": 2.708933717579251e-06, "loss": 0.4764, "num_input_tokens_seen": 5261184, "step": 4325 }, { "epoch": 0.5425385290063902, "grad_norm": 4.259035587310791, "learning_rate": 2.7120661571231677e-06, "loss": 0.48, "num_input_tokens_seen": 5267072, "step": 4330 }, { "epoch": 0.5431650169151735, "grad_norm": 1.340096116065979, "learning_rate": 2.7151985966670845e-06, "loss": 0.4527, "num_input_tokens_seen": 5272576, "step": 4335 }, { "epoch": 0.5437915048239569, "grad_norm": 4.371307849884033, "learning_rate": 2.7183310362110016e-06, "loss": 0.4841, "num_input_tokens_seen": 5278688, "step": 4340 }, { "epoch": 0.5444179927327403, "grad_norm": 2.3833019733428955, "learning_rate": 2.721463475754918e-06, "loss": 0.4696, "num_input_tokens_seen": 5284704, "step": 4345 }, { "epoch": 0.5450444806415237, "grad_norm": 1.5946285724639893, "learning_rate": 2.7245959152988347e-06, "loss": 0.4603, "num_input_tokens_seen": 5290176, "step": 4350 }, { "epoch": 0.545670968550307, "grad_norm": 2.2405669689178467, "learning_rate": 2.727728354842752e-06, "loss": 0.4701, "num_input_tokens_seen": 5296416, "step": 4355 }, { "epoch": 0.5462974564590903, "grad_norm": 2.7023043632507324, "learning_rate": 2.7308607943866683e-06, "loss": 0.4548, "num_input_tokens_seen": 5302944, "step": 4360 }, { "epoch": 0.5469239443678737, "grad_norm": 3.237680196762085, "learning_rate": 2.7339932339305855e-06, "loss": 0.4651, "num_input_tokens_seen": 5309344, "step": 4365 }, { "epoch": 0.547550432276657, "grad_norm": 2.1258227825164795, "learning_rate": 2.7371256734745022e-06, "loss": 0.4678, "num_input_tokens_seen": 5315520, "step": 4370 }, { "epoch": 0.5481769201854404, "grad_norm": 1.9387351274490356, "learning_rate": 2.740258113018419e-06, "loss": 0.4682, "num_input_tokens_seen": 5321664, "step": 4375 }, { "epoch": 0.5488034080942238, "grad_norm": 2.233375310897827, "learning_rate": 2.7433905525623357e-06, "loss": 0.4679, "num_input_tokens_seen": 5327584, "step": 4380 }, { "epoch": 0.5494298960030072, "grad_norm": 1.8419897556304932, "learning_rate": 2.7465229921062525e-06, "loss": 0.4568, "num_input_tokens_seen": 5333504, "step": 4385 }, { "epoch": 0.5500563839117905, "grad_norm": 2.100130081176758, "learning_rate": 2.7496554316501693e-06, "loss": 0.4621, "num_input_tokens_seen": 5339680, "step": 4390 }, { "epoch": 0.5506828718205738, "grad_norm": 1.6706838607788086, "learning_rate": 2.7527878711940865e-06, "loss": 0.4701, "num_input_tokens_seen": 5345216, "step": 4395 }, { "epoch": 0.5513093597293572, "grad_norm": 3.3451080322265625, "learning_rate": 2.755920310738003e-06, "loss": 0.4626, "num_input_tokens_seen": 5351456, "step": 4400 }, { "epoch": 0.5519358476381406, "grad_norm": 2.2801427841186523, "learning_rate": 2.7590527502819196e-06, "loss": 0.4661, "num_input_tokens_seen": 5357440, "step": 4405 }, { "epoch": 0.5525623355469239, "grad_norm": 3.01706862449646, "learning_rate": 2.7621851898258367e-06, "loss": 0.4674, "num_input_tokens_seen": 5363552, "step": 4410 }, { "epoch": 0.5531888234557073, "grad_norm": 2.0493667125701904, "learning_rate": 2.765317629369753e-06, "loss": 0.4606, "num_input_tokens_seen": 5369920, "step": 4415 }, { "epoch": 0.5538153113644907, "grad_norm": 2.3020129203796387, "learning_rate": 2.7684500689136703e-06, "loss": 0.457, "num_input_tokens_seen": 5376128, "step": 4420 }, { "epoch": 0.554441799273274, "grad_norm": 3.983337879180908, "learning_rate": 2.771582508457587e-06, "loss": 0.4656, "num_input_tokens_seen": 5381888, "step": 4425 }, { "epoch": 0.5550682871820574, "grad_norm": 2.819784164428711, "learning_rate": 2.774714948001504e-06, "loss": 0.467, "num_input_tokens_seen": 5388224, "step": 4430 }, { "epoch": 0.5556947750908408, "grad_norm": 1.7474879026412964, "learning_rate": 2.7778473875454206e-06, "loss": 0.4683, "num_input_tokens_seen": 5394656, "step": 4435 }, { "epoch": 0.5563212629996241, "grad_norm": 3.903773784637451, "learning_rate": 2.7809798270893373e-06, "loss": 0.4467, "num_input_tokens_seen": 5400416, "step": 4440 }, { "epoch": 0.5569477509084074, "grad_norm": 4.243218898773193, "learning_rate": 2.784112266633254e-06, "loss": 0.4681, "num_input_tokens_seen": 5406592, "step": 4445 }, { "epoch": 0.5575742388171908, "grad_norm": 3.200655937194824, "learning_rate": 2.7872447061771713e-06, "loss": 0.4566, "num_input_tokens_seen": 5412544, "step": 4450 }, { "epoch": 0.5582007267259742, "grad_norm": 4.27252197265625, "learning_rate": 2.7903771457210876e-06, "loss": 0.4409, "num_input_tokens_seen": 5418912, "step": 4455 }, { "epoch": 0.5588272146347576, "grad_norm": 4.499093055725098, "learning_rate": 2.793509585265005e-06, "loss": 0.4848, "num_input_tokens_seen": 5424512, "step": 4460 }, { "epoch": 0.5594537025435409, "grad_norm": 2.261150360107422, "learning_rate": 2.7966420248089216e-06, "loss": 0.4809, "num_input_tokens_seen": 5430240, "step": 4465 }, { "epoch": 0.5600801904523243, "grad_norm": 6.3654584884643555, "learning_rate": 2.799774464352838e-06, "loss": 0.4878, "num_input_tokens_seen": 5436480, "step": 4470 }, { "epoch": 0.5607066783611077, "grad_norm": 3.888028621673584, "learning_rate": 2.802906903896755e-06, "loss": 0.4628, "num_input_tokens_seen": 5442496, "step": 4475 }, { "epoch": 0.5613331662698909, "grad_norm": 2.2797999382019043, "learning_rate": 2.806039343440672e-06, "loss": 0.4508, "num_input_tokens_seen": 5448736, "step": 4480 }, { "epoch": 0.5619596541786743, "grad_norm": 1.817173719406128, "learning_rate": 2.8091717829845886e-06, "loss": 0.4732, "num_input_tokens_seen": 5454880, "step": 4485 }, { "epoch": 0.5625861420874577, "grad_norm": 1.8701452016830444, "learning_rate": 2.8123042225285054e-06, "loss": 0.4586, "num_input_tokens_seen": 5461056, "step": 4490 }, { "epoch": 0.5632126299962411, "grad_norm": 2.139232873916626, "learning_rate": 2.815436662072422e-06, "loss": 0.4564, "num_input_tokens_seen": 5467264, "step": 4495 }, { "epoch": 0.5638391179050244, "grad_norm": 3.428925037384033, "learning_rate": 2.8185691016163393e-06, "loss": 0.4681, "num_input_tokens_seen": 5473632, "step": 4500 }, { "epoch": 0.5644656058138078, "grad_norm": 2.8112611770629883, "learning_rate": 2.821701541160256e-06, "loss": 0.4601, "num_input_tokens_seen": 5479072, "step": 4505 }, { "epoch": 0.5650920937225912, "grad_norm": 2.339782953262329, "learning_rate": 2.8248339807041724e-06, "loss": 0.4621, "num_input_tokens_seen": 5485184, "step": 4510 }, { "epoch": 0.5657185816313746, "grad_norm": 1.7779514789581299, "learning_rate": 2.8279664202480896e-06, "loss": 0.4743, "num_input_tokens_seen": 5491264, "step": 4515 }, { "epoch": 0.5663450695401578, "grad_norm": 1.894032597541809, "learning_rate": 2.8310988597920064e-06, "loss": 0.4827, "num_input_tokens_seen": 5496832, "step": 4520 }, { "epoch": 0.5669715574489412, "grad_norm": 1.8878474235534668, "learning_rate": 2.8342312993359227e-06, "loss": 0.4684, "num_input_tokens_seen": 5502848, "step": 4525 }, { "epoch": 0.5675980453577246, "grad_norm": 2.311461925506592, "learning_rate": 2.83736373887984e-06, "loss": 0.4574, "num_input_tokens_seen": 5508672, "step": 4530 }, { "epoch": 0.5682245332665079, "grad_norm": 2.2606191635131836, "learning_rate": 2.8404961784237567e-06, "loss": 0.457, "num_input_tokens_seen": 5514176, "step": 4535 }, { "epoch": 0.5688510211752913, "grad_norm": 2.494579553604126, "learning_rate": 2.8436286179676734e-06, "loss": 0.4557, "num_input_tokens_seen": 5520192, "step": 4540 }, { "epoch": 0.5694775090840747, "grad_norm": 3.76419997215271, "learning_rate": 2.84676105751159e-06, "loss": 0.4699, "num_input_tokens_seen": 5526240, "step": 4545 }, { "epoch": 0.5701039969928581, "grad_norm": 2.304328203201294, "learning_rate": 2.849893497055507e-06, "loss": 0.4631, "num_input_tokens_seen": 5532352, "step": 4550 }, { "epoch": 0.5707304849016414, "grad_norm": 3.511235237121582, "learning_rate": 2.853025936599424e-06, "loss": 0.4649, "num_input_tokens_seen": 5537888, "step": 4555 }, { "epoch": 0.5713569728104247, "grad_norm": 2.649352550506592, "learning_rate": 2.856158376143341e-06, "loss": 0.4729, "num_input_tokens_seen": 5543840, "step": 4560 }, { "epoch": 0.5719834607192081, "grad_norm": 2.6471328735351562, "learning_rate": 2.8592908156872572e-06, "loss": 0.4616, "num_input_tokens_seen": 5550080, "step": 4565 }, { "epoch": 0.5726099486279915, "grad_norm": 1.9915720224380493, "learning_rate": 2.8624232552311744e-06, "loss": 0.4651, "num_input_tokens_seen": 5555776, "step": 4570 }, { "epoch": 0.5732364365367748, "grad_norm": 3.5480661392211914, "learning_rate": 2.865555694775091e-06, "loss": 0.4618, "num_input_tokens_seen": 5561728, "step": 4575 }, { "epoch": 0.5738629244455582, "grad_norm": 2.073329448699951, "learning_rate": 2.8686881343190075e-06, "loss": 0.4678, "num_input_tokens_seen": 5568064, "step": 4580 }, { "epoch": 0.5744894123543416, "grad_norm": 3.329150438308716, "learning_rate": 2.8718205738629247e-06, "loss": 0.4601, "num_input_tokens_seen": 5573792, "step": 4585 }, { "epoch": 0.5751159002631249, "grad_norm": 2.0070557594299316, "learning_rate": 2.8749530134068415e-06, "loss": 0.4669, "num_input_tokens_seen": 5579616, "step": 4590 }, { "epoch": 0.5757423881719083, "grad_norm": 1.9183168411254883, "learning_rate": 2.8780854529507587e-06, "loss": 0.4703, "num_input_tokens_seen": 5585824, "step": 4595 }, { "epoch": 0.5763688760806917, "grad_norm": 2.6080174446105957, "learning_rate": 2.881217892494675e-06, "loss": 0.4625, "num_input_tokens_seen": 5591808, "step": 4600 }, { "epoch": 0.576995363989475, "grad_norm": 4.17172908782959, "learning_rate": 2.8843503320385918e-06, "loss": 0.466, "num_input_tokens_seen": 5597728, "step": 4605 }, { "epoch": 0.5776218518982583, "grad_norm": 1.5479689836502075, "learning_rate": 2.887482771582509e-06, "loss": 0.4556, "num_input_tokens_seen": 5603520, "step": 4610 }, { "epoch": 0.5782483398070417, "grad_norm": 3.0982725620269775, "learning_rate": 2.8906152111264257e-06, "loss": 0.4635, "num_input_tokens_seen": 5609600, "step": 4615 }, { "epoch": 0.5788748277158251, "grad_norm": 4.44560432434082, "learning_rate": 2.893747650670342e-06, "loss": 0.4582, "num_input_tokens_seen": 5615936, "step": 4620 }, { "epoch": 0.5795013156246085, "grad_norm": 1.6504868268966675, "learning_rate": 2.8968800902142593e-06, "loss": 0.4581, "num_input_tokens_seen": 5621408, "step": 4625 }, { "epoch": 0.5801278035333918, "grad_norm": 1.98882257938385, "learning_rate": 2.900012529758176e-06, "loss": 0.4671, "num_input_tokens_seen": 5627872, "step": 4630 }, { "epoch": 0.5807542914421752, "grad_norm": 2.993670701980591, "learning_rate": 2.9031449693020924e-06, "loss": 0.4608, "num_input_tokens_seen": 5634208, "step": 4635 }, { "epoch": 0.5813807793509586, "grad_norm": 3.1165711879730225, "learning_rate": 2.9062774088460095e-06, "loss": 0.4838, "num_input_tokens_seen": 5639808, "step": 4640 }, { "epoch": 0.5820072672597418, "grad_norm": 1.8731732368469238, "learning_rate": 2.9094098483899263e-06, "loss": 0.4611, "num_input_tokens_seen": 5645792, "step": 4645 }, { "epoch": 0.5826337551685252, "grad_norm": 1.8608876466751099, "learning_rate": 2.9125422879338435e-06, "loss": 0.4808, "num_input_tokens_seen": 5651104, "step": 4650 }, { "epoch": 0.5832602430773086, "grad_norm": 1.8195446729660034, "learning_rate": 2.91567472747776e-06, "loss": 0.4567, "num_input_tokens_seen": 5657408, "step": 4655 }, { "epoch": 0.583886730986092, "grad_norm": 2.5731828212738037, "learning_rate": 2.9188071670216766e-06, "loss": 0.4673, "num_input_tokens_seen": 5663360, "step": 4660 }, { "epoch": 0.5845132188948753, "grad_norm": 3.154407024383545, "learning_rate": 2.9219396065655938e-06, "loss": 0.4593, "num_input_tokens_seen": 5669440, "step": 4665 }, { "epoch": 0.5851397068036587, "grad_norm": 3.243769407272339, "learning_rate": 2.9250720461095105e-06, "loss": 0.4575, "num_input_tokens_seen": 5674880, "step": 4670 }, { "epoch": 0.5857661947124421, "grad_norm": 5.457956790924072, "learning_rate": 2.928204485653427e-06, "loss": 0.466, "num_input_tokens_seen": 5681056, "step": 4675 }, { "epoch": 0.5863926826212255, "grad_norm": 4.222486972808838, "learning_rate": 2.931336925197344e-06, "loss": 0.4587, "num_input_tokens_seen": 5687232, "step": 4680 }, { "epoch": 0.5870191705300087, "grad_norm": 6.189016342163086, "learning_rate": 2.934469364741261e-06, "loss": 0.4477, "num_input_tokens_seen": 5693312, "step": 4685 }, { "epoch": 0.5876456584387921, "grad_norm": 3.489954948425293, "learning_rate": 2.937601804285177e-06, "loss": 0.4699, "num_input_tokens_seen": 5699488, "step": 4690 }, { "epoch": 0.5882721463475755, "grad_norm": 4.336996555328369, "learning_rate": 2.9407342438290944e-06, "loss": 0.4796, "num_input_tokens_seen": 5705632, "step": 4695 }, { "epoch": 0.5888986342563588, "grad_norm": 2.8488612174987793, "learning_rate": 2.943866683373011e-06, "loss": 0.4532, "num_input_tokens_seen": 5711552, "step": 4700 }, { "epoch": 0.5895251221651422, "grad_norm": 7.912120342254639, "learning_rate": 2.9469991229169283e-06, "loss": 0.4659, "num_input_tokens_seen": 5718048, "step": 4705 }, { "epoch": 0.5901516100739256, "grad_norm": 4.009976863861084, "learning_rate": 2.9501315624608446e-06, "loss": 0.421, "num_input_tokens_seen": 5724320, "step": 4710 }, { "epoch": 0.590778097982709, "grad_norm": 4.955232620239258, "learning_rate": 2.9532640020047614e-06, "loss": 0.4995, "num_input_tokens_seen": 5730688, "step": 4715 }, { "epoch": 0.5914045858914923, "grad_norm": 3.963615894317627, "learning_rate": 2.9563964415486786e-06, "loss": 0.5178, "num_input_tokens_seen": 5736352, "step": 4720 }, { "epoch": 0.5920310738002756, "grad_norm": 4.34879207611084, "learning_rate": 2.9595288810925954e-06, "loss": 0.4751, "num_input_tokens_seen": 5742016, "step": 4725 }, { "epoch": 0.592657561709059, "grad_norm": 2.186953067779541, "learning_rate": 2.9626613206365117e-06, "loss": 0.4726, "num_input_tokens_seen": 5748064, "step": 4730 }, { "epoch": 0.5932840496178424, "grad_norm": 2.349637985229492, "learning_rate": 2.965793760180429e-06, "loss": 0.4629, "num_input_tokens_seen": 5754272, "step": 4735 }, { "epoch": 0.5939105375266257, "grad_norm": 3.1834700107574463, "learning_rate": 2.9689261997243457e-06, "loss": 0.4667, "num_input_tokens_seen": 5760448, "step": 4740 }, { "epoch": 0.5945370254354091, "grad_norm": 0.9949160218238831, "learning_rate": 2.972058639268262e-06, "loss": 0.4676, "num_input_tokens_seen": 5765632, "step": 4745 }, { "epoch": 0.5951635133441925, "grad_norm": 2.4831676483154297, "learning_rate": 2.975191078812179e-06, "loss": 0.4611, "num_input_tokens_seen": 5771872, "step": 4750 }, { "epoch": 0.5957900012529758, "grad_norm": 1.8477874994277954, "learning_rate": 2.978323518356096e-06, "loss": 0.4723, "num_input_tokens_seen": 5777888, "step": 4755 }, { "epoch": 0.5964164891617592, "grad_norm": 1.6636685132980347, "learning_rate": 2.981455957900013e-06, "loss": 0.4639, "num_input_tokens_seen": 5783776, "step": 4760 }, { "epoch": 0.5970429770705425, "grad_norm": 1.4923990964889526, "learning_rate": 2.9845883974439295e-06, "loss": 0.4701, "num_input_tokens_seen": 5789504, "step": 4765 }, { "epoch": 0.5976694649793259, "grad_norm": 2.6128580570220947, "learning_rate": 2.9877208369878462e-06, "loss": 0.4613, "num_input_tokens_seen": 5795616, "step": 4770 }, { "epoch": 0.5982959528881092, "grad_norm": 3.133617639541626, "learning_rate": 2.9908532765317634e-06, "loss": 0.4715, "num_input_tokens_seen": 5801824, "step": 4775 }, { "epoch": 0.5989224407968926, "grad_norm": 1.8761134147644043, "learning_rate": 2.99398571607568e-06, "loss": 0.463, "num_input_tokens_seen": 5807840, "step": 4780 }, { "epoch": 0.599548928705676, "grad_norm": 1.645296573638916, "learning_rate": 2.9971181556195965e-06, "loss": 0.4643, "num_input_tokens_seen": 5814080, "step": 4785 }, { "epoch": 0.6001754166144594, "grad_norm": 2.460846185684204, "learning_rate": 3.0002505951635137e-06, "loss": 0.4704, "num_input_tokens_seen": 5820000, "step": 4790 }, { "epoch": 0.6008019045232427, "grad_norm": 2.4197537899017334, "learning_rate": 3.0033830347074305e-06, "loss": 0.4654, "num_input_tokens_seen": 5825792, "step": 4795 }, { "epoch": 0.6014283924320261, "grad_norm": 1.679879069328308, "learning_rate": 3.006515474251347e-06, "loss": 0.458, "num_input_tokens_seen": 5832032, "step": 4800 }, { "epoch": 0.6020548803408095, "grad_norm": 1.1846237182617188, "learning_rate": 3.009647913795264e-06, "loss": 0.4628, "num_input_tokens_seen": 5838144, "step": 4805 }, { "epoch": 0.6026813682495927, "grad_norm": 1.6705286502838135, "learning_rate": 3.0127803533391808e-06, "loss": 0.452, "num_input_tokens_seen": 5844576, "step": 4810 }, { "epoch": 0.6033078561583761, "grad_norm": 2.1609866619110107, "learning_rate": 3.015912792883098e-06, "loss": 0.4957, "num_input_tokens_seen": 5850528, "step": 4815 }, { "epoch": 0.6039343440671595, "grad_norm": 1.5594476461410522, "learning_rate": 3.0190452324270143e-06, "loss": 0.4715, "num_input_tokens_seen": 5856672, "step": 4820 }, { "epoch": 0.6045608319759429, "grad_norm": 1.198371410369873, "learning_rate": 3.022177671970931e-06, "loss": 0.4686, "num_input_tokens_seen": 5863136, "step": 4825 }, { "epoch": 0.6051873198847262, "grad_norm": 3.1214804649353027, "learning_rate": 3.0253101115148482e-06, "loss": 0.474, "num_input_tokens_seen": 5869408, "step": 4830 }, { "epoch": 0.6058138077935096, "grad_norm": 1.7684088945388794, "learning_rate": 3.028442551058765e-06, "loss": 0.4638, "num_input_tokens_seen": 5875488, "step": 4835 }, { "epoch": 0.606440295702293, "grad_norm": 1.0690205097198486, "learning_rate": 3.0315749906026813e-06, "loss": 0.4639, "num_input_tokens_seen": 5881600, "step": 4840 }, { "epoch": 0.6070667836110764, "grad_norm": 1.6045331954956055, "learning_rate": 3.0347074301465985e-06, "loss": 0.4698, "num_input_tokens_seen": 5887488, "step": 4845 }, { "epoch": 0.6076932715198596, "grad_norm": 2.2367336750030518, "learning_rate": 3.0378398696905153e-06, "loss": 0.4566, "num_input_tokens_seen": 5893728, "step": 4850 }, { "epoch": 0.608319759428643, "grad_norm": 1.5200879573822021, "learning_rate": 3.0409723092344316e-06, "loss": 0.4588, "num_input_tokens_seen": 5899872, "step": 4855 }, { "epoch": 0.6089462473374264, "grad_norm": 2.096876382827759, "learning_rate": 3.044104748778349e-06, "loss": 0.4636, "num_input_tokens_seen": 5905888, "step": 4860 }, { "epoch": 0.6095727352462097, "grad_norm": 3.1015982627868652, "learning_rate": 3.0472371883222656e-06, "loss": 0.4743, "num_input_tokens_seen": 5911808, "step": 4865 }, { "epoch": 0.6101992231549931, "grad_norm": 1.9176030158996582, "learning_rate": 3.0503696278661828e-06, "loss": 0.452, "num_input_tokens_seen": 5917888, "step": 4870 }, { "epoch": 0.6108257110637765, "grad_norm": 1.861043095588684, "learning_rate": 3.053502067410099e-06, "loss": 0.4716, "num_input_tokens_seen": 5923616, "step": 4875 }, { "epoch": 0.6114521989725599, "grad_norm": 1.696899175643921, "learning_rate": 3.056634506954016e-06, "loss": 0.4627, "num_input_tokens_seen": 5929792, "step": 4880 }, { "epoch": 0.6120786868813431, "grad_norm": 1.7601335048675537, "learning_rate": 3.059766946497933e-06, "loss": 0.465, "num_input_tokens_seen": 5936000, "step": 4885 }, { "epoch": 0.6127051747901265, "grad_norm": 1.7279514074325562, "learning_rate": 3.06289938604185e-06, "loss": 0.4766, "num_input_tokens_seen": 5942144, "step": 4890 }, { "epoch": 0.6133316626989099, "grad_norm": 2.042634963989258, "learning_rate": 3.066031825585766e-06, "loss": 0.4614, "num_input_tokens_seen": 5948320, "step": 4895 }, { "epoch": 0.6139581506076933, "grad_norm": 1.7775005102157593, "learning_rate": 3.0691642651296833e-06, "loss": 0.4604, "num_input_tokens_seen": 5954400, "step": 4900 }, { "epoch": 0.6145846385164766, "grad_norm": 2.3783674240112305, "learning_rate": 3.0722967046736e-06, "loss": 0.4555, "num_input_tokens_seen": 5960448, "step": 4905 }, { "epoch": 0.61521112642526, "grad_norm": 2.7488436698913574, "learning_rate": 3.0754291442175164e-06, "loss": 0.476, "num_input_tokens_seen": 5966624, "step": 4910 }, { "epoch": 0.6158376143340434, "grad_norm": 1.291185975074768, "learning_rate": 3.0785615837614336e-06, "loss": 0.4778, "num_input_tokens_seen": 5972640, "step": 4915 }, { "epoch": 0.6164641022428267, "grad_norm": 2.113471031188965, "learning_rate": 3.0816940233053504e-06, "loss": 0.451, "num_input_tokens_seen": 5978880, "step": 4920 }, { "epoch": 0.61709059015161, "grad_norm": 3.6094112396240234, "learning_rate": 3.0848264628492676e-06, "loss": 0.4622, "num_input_tokens_seen": 5984704, "step": 4925 }, { "epoch": 0.6177170780603934, "grad_norm": 2.112147331237793, "learning_rate": 3.087958902393184e-06, "loss": 0.4592, "num_input_tokens_seen": 5990848, "step": 4930 }, { "epoch": 0.6183435659691768, "grad_norm": 1.1468397378921509, "learning_rate": 3.0910913419371007e-06, "loss": 0.4815, "num_input_tokens_seen": 5996736, "step": 4935 }, { "epoch": 0.6189700538779601, "grad_norm": 1.8838402032852173, "learning_rate": 3.094223781481018e-06, "loss": 0.4658, "num_input_tokens_seen": 6002560, "step": 4940 }, { "epoch": 0.6195965417867435, "grad_norm": 2.1676480770111084, "learning_rate": 3.0973562210249346e-06, "loss": 0.4724, "num_input_tokens_seen": 6008800, "step": 4945 }, { "epoch": 0.6202230296955269, "grad_norm": 5.29171895980835, "learning_rate": 3.100488660568851e-06, "loss": 0.4557, "num_input_tokens_seen": 6015136, "step": 4950 }, { "epoch": 0.6208495176043103, "grad_norm": 3.23061466217041, "learning_rate": 3.103621100112768e-06, "loss": 0.4659, "num_input_tokens_seen": 6021504, "step": 4955 }, { "epoch": 0.6214760055130936, "grad_norm": 2.0252792835235596, "learning_rate": 3.106753539656685e-06, "loss": 0.4595, "num_input_tokens_seen": 6027648, "step": 4960 }, { "epoch": 0.622102493421877, "grad_norm": 2.250870704650879, "learning_rate": 3.1098859792006013e-06, "loss": 0.4588, "num_input_tokens_seen": 6033728, "step": 4965 }, { "epoch": 0.6227289813306603, "grad_norm": 2.180636405944824, "learning_rate": 3.1130184187445184e-06, "loss": 0.4481, "num_input_tokens_seen": 6040032, "step": 4970 }, { "epoch": 0.6233554692394436, "grad_norm": 5.458287239074707, "learning_rate": 3.116150858288435e-06, "loss": 0.4753, "num_input_tokens_seen": 6046528, "step": 4975 }, { "epoch": 0.623981957148227, "grad_norm": 4.447821617126465, "learning_rate": 3.1192832978323524e-06, "loss": 0.4665, "num_input_tokens_seen": 6052384, "step": 4980 }, { "epoch": 0.6246084450570104, "grad_norm": 7.841136932373047, "learning_rate": 3.1224157373762687e-06, "loss": 0.506, "num_input_tokens_seen": 6058432, "step": 4985 }, { "epoch": 0.6252349329657938, "grad_norm": 6.728048324584961, "learning_rate": 3.1255481769201855e-06, "loss": 0.4687, "num_input_tokens_seen": 6064832, "step": 4990 }, { "epoch": 0.6258614208745771, "grad_norm": 2.1659815311431885, "learning_rate": 3.1286806164641027e-06, "loss": 0.4601, "num_input_tokens_seen": 6070784, "step": 4995 }, { "epoch": 0.6264879087833605, "grad_norm": 1.7956598997116089, "learning_rate": 3.1318130560080194e-06, "loss": 0.472, "num_input_tokens_seen": 6076896, "step": 5000 }, { "epoch": 0.6271143966921439, "grad_norm": 2.5287041664123535, "learning_rate": 3.1349454955519358e-06, "loss": 0.4529, "num_input_tokens_seen": 6083360, "step": 5005 }, { "epoch": 0.6277408846009273, "grad_norm": 4.38686466217041, "learning_rate": 3.138077935095853e-06, "loss": 0.4652, "num_input_tokens_seen": 6089568, "step": 5010 }, { "epoch": 0.6283673725097105, "grad_norm": 2.720506429672241, "learning_rate": 3.1412103746397697e-06, "loss": 0.4682, "num_input_tokens_seen": 6095200, "step": 5015 }, { "epoch": 0.6289938604184939, "grad_norm": 3.231361150741577, "learning_rate": 3.144342814183686e-06, "loss": 0.4828, "num_input_tokens_seen": 6101248, "step": 5020 }, { "epoch": 0.6296203483272773, "grad_norm": 2.410983085632324, "learning_rate": 3.1474752537276033e-06, "loss": 0.4608, "num_input_tokens_seen": 6107648, "step": 5025 }, { "epoch": 0.6302468362360606, "grad_norm": 2.328281879425049, "learning_rate": 3.15060769327152e-06, "loss": 0.458, "num_input_tokens_seen": 6113760, "step": 5030 }, { "epoch": 0.630873324144844, "grad_norm": 2.5674428939819336, "learning_rate": 3.1537401328154372e-06, "loss": 0.457, "num_input_tokens_seen": 6120032, "step": 5035 }, { "epoch": 0.6314998120536274, "grad_norm": 5.216761589050293, "learning_rate": 3.1568725723593536e-06, "loss": 0.4673, "num_input_tokens_seen": 6126272, "step": 5040 }, { "epoch": 0.6321262999624108, "grad_norm": 2.461474895477295, "learning_rate": 3.1600050119032703e-06, "loss": 0.4586, "num_input_tokens_seen": 6132352, "step": 5045 }, { "epoch": 0.632752787871194, "grad_norm": 10.648930549621582, "learning_rate": 3.1631374514471875e-06, "loss": 0.4974, "num_input_tokens_seen": 6138432, "step": 5050 }, { "epoch": 0.6333792757799774, "grad_norm": 3.5050106048583984, "learning_rate": 3.1662698909911043e-06, "loss": 0.464, "num_input_tokens_seen": 6144192, "step": 5055 }, { "epoch": 0.6340057636887608, "grad_norm": 2.67954683303833, "learning_rate": 3.1694023305350206e-06, "loss": 0.4641, "num_input_tokens_seen": 6150304, "step": 5060 }, { "epoch": 0.6346322515975442, "grad_norm": 2.514127731323242, "learning_rate": 3.172534770078938e-06, "loss": 0.4464, "num_input_tokens_seen": 6156576, "step": 5065 }, { "epoch": 0.6352587395063275, "grad_norm": 1.9765082597732544, "learning_rate": 3.1756672096228546e-06, "loss": 0.4688, "num_input_tokens_seen": 6162656, "step": 5070 }, { "epoch": 0.6358852274151109, "grad_norm": 2.1451830863952637, "learning_rate": 3.178799649166771e-06, "loss": 0.4724, "num_input_tokens_seen": 6168768, "step": 5075 }, { "epoch": 0.6365117153238943, "grad_norm": 2.506002187728882, "learning_rate": 3.181932088710688e-06, "loss": 0.4564, "num_input_tokens_seen": 6174624, "step": 5080 }, { "epoch": 0.6371382032326776, "grad_norm": 2.261579751968384, "learning_rate": 3.185064528254605e-06, "loss": 0.4675, "num_input_tokens_seen": 6180288, "step": 5085 }, { "epoch": 0.637764691141461, "grad_norm": 2.725990056991577, "learning_rate": 3.188196967798522e-06, "loss": 0.4811, "num_input_tokens_seen": 6186464, "step": 5090 }, { "epoch": 0.6383911790502443, "grad_norm": 2.770843982696533, "learning_rate": 3.1913294073424384e-06, "loss": 0.4616, "num_input_tokens_seen": 6192608, "step": 5095 }, { "epoch": 0.6390176669590277, "grad_norm": 1.7969242334365845, "learning_rate": 3.194461846886355e-06, "loss": 0.4588, "num_input_tokens_seen": 6198560, "step": 5100 }, { "epoch": 0.639644154867811, "grad_norm": 6.531905174255371, "learning_rate": 3.1975942864302723e-06, "loss": 0.466, "num_input_tokens_seen": 6204736, "step": 5105 }, { "epoch": 0.6402706427765944, "grad_norm": 1.7840423583984375, "learning_rate": 3.200726725974189e-06, "loss": 0.4721, "num_input_tokens_seen": 6210848, "step": 5110 }, { "epoch": 0.6408971306853778, "grad_norm": 1.50011146068573, "learning_rate": 3.2038591655181054e-06, "loss": 0.4595, "num_input_tokens_seen": 6216864, "step": 5115 }, { "epoch": 0.6415236185941612, "grad_norm": 1.698697805404663, "learning_rate": 3.2069916050620226e-06, "loss": 0.4682, "num_input_tokens_seen": 6223232, "step": 5120 }, { "epoch": 0.6421501065029445, "grad_norm": 1.3598051071166992, "learning_rate": 3.2101240446059394e-06, "loss": 0.4642, "num_input_tokens_seen": 6228800, "step": 5125 }, { "epoch": 0.6427765944117279, "grad_norm": 1.5842927694320679, "learning_rate": 3.2132564841498557e-06, "loss": 0.4664, "num_input_tokens_seen": 6235040, "step": 5130 }, { "epoch": 0.6434030823205112, "grad_norm": 1.2738465070724487, "learning_rate": 3.216388923693773e-06, "loss": 0.458, "num_input_tokens_seen": 6241312, "step": 5135 }, { "epoch": 0.6440295702292945, "grad_norm": 3.711275815963745, "learning_rate": 3.2195213632376897e-06, "loss": 0.4535, "num_input_tokens_seen": 6247648, "step": 5140 }, { "epoch": 0.6446560581380779, "grad_norm": 1.4140264987945557, "learning_rate": 3.222653802781607e-06, "loss": 0.4472, "num_input_tokens_seen": 6253984, "step": 5145 }, { "epoch": 0.6452825460468613, "grad_norm": 2.034822702407837, "learning_rate": 3.225786242325523e-06, "loss": 0.4754, "num_input_tokens_seen": 6259872, "step": 5150 }, { "epoch": 0.6459090339556447, "grad_norm": 2.9229655265808105, "learning_rate": 3.22891868186944e-06, "loss": 0.4646, "num_input_tokens_seen": 6265792, "step": 5155 }, { "epoch": 0.646535521864428, "grad_norm": 1.2579591274261475, "learning_rate": 3.232051121413357e-06, "loss": 0.4552, "num_input_tokens_seen": 6271968, "step": 5160 }, { "epoch": 0.6471620097732114, "grad_norm": 3.4683785438537598, "learning_rate": 3.235183560957274e-06, "loss": 0.5013, "num_input_tokens_seen": 6278208, "step": 5165 }, { "epoch": 0.6477884976819948, "grad_norm": 1.7739534378051758, "learning_rate": 3.2383160005011902e-06, "loss": 0.4445, "num_input_tokens_seen": 6284544, "step": 5170 }, { "epoch": 0.6484149855907781, "grad_norm": 1.9115225076675415, "learning_rate": 3.2414484400451074e-06, "loss": 0.4516, "num_input_tokens_seen": 6290848, "step": 5175 }, { "epoch": 0.6490414734995614, "grad_norm": 2.1059460639953613, "learning_rate": 3.244580879589024e-06, "loss": 0.4658, "num_input_tokens_seen": 6296832, "step": 5180 }, { "epoch": 0.6496679614083448, "grad_norm": 1.7853072881698608, "learning_rate": 3.247713319132941e-06, "loss": 0.49, "num_input_tokens_seen": 6302752, "step": 5185 }, { "epoch": 0.6502944493171282, "grad_norm": 1.4301748275756836, "learning_rate": 3.2508457586768577e-06, "loss": 0.4595, "num_input_tokens_seen": 6308832, "step": 5190 }, { "epoch": 0.6509209372259115, "grad_norm": 2.494384288787842, "learning_rate": 3.2539781982207745e-06, "loss": 0.4872, "num_input_tokens_seen": 6314560, "step": 5195 }, { "epoch": 0.6515474251346949, "grad_norm": 1.8862311840057373, "learning_rate": 3.2571106377646917e-06, "loss": 0.4645, "num_input_tokens_seen": 6320992, "step": 5200 }, { "epoch": 0.6521739130434783, "grad_norm": 2.0439505577087402, "learning_rate": 3.260243077308608e-06, "loss": 0.4733, "num_input_tokens_seen": 6326720, "step": 5205 }, { "epoch": 0.6528004009522617, "grad_norm": 2.0130107402801514, "learning_rate": 3.2633755168525248e-06, "loss": 0.4572, "num_input_tokens_seen": 6332512, "step": 5210 }, { "epoch": 0.6534268888610449, "grad_norm": 2.0164551734924316, "learning_rate": 3.266507956396442e-06, "loss": 0.462, "num_input_tokens_seen": 6338368, "step": 5215 }, { "epoch": 0.6540533767698283, "grad_norm": 1.436408519744873, "learning_rate": 3.2696403959403587e-06, "loss": 0.4629, "num_input_tokens_seen": 6344064, "step": 5220 }, { "epoch": 0.6546798646786117, "grad_norm": 1.42613685131073, "learning_rate": 3.272772835484275e-06, "loss": 0.4708, "num_input_tokens_seen": 6350048, "step": 5225 }, { "epoch": 0.6553063525873951, "grad_norm": 1.263667345046997, "learning_rate": 3.2759052750281922e-06, "loss": 0.4596, "num_input_tokens_seen": 6356160, "step": 5230 }, { "epoch": 0.6559328404961784, "grad_norm": 1.2130507230758667, "learning_rate": 3.279037714572109e-06, "loss": 0.4636, "num_input_tokens_seen": 6361728, "step": 5235 }, { "epoch": 0.6565593284049618, "grad_norm": 1.7649203538894653, "learning_rate": 3.2821701541160258e-06, "loss": 0.4587, "num_input_tokens_seen": 6367744, "step": 5240 }, { "epoch": 0.6571858163137452, "grad_norm": 1.1725475788116455, "learning_rate": 3.2853025936599425e-06, "loss": 0.4663, "num_input_tokens_seen": 6373696, "step": 5245 }, { "epoch": 0.6578123042225285, "grad_norm": 1.1562975645065308, "learning_rate": 3.2884350332038593e-06, "loss": 0.4804, "num_input_tokens_seen": 6379168, "step": 5250 }, { "epoch": 0.6584387921313118, "grad_norm": 1.0141444206237793, "learning_rate": 3.2915674727477765e-06, "loss": 0.4568, "num_input_tokens_seen": 6385408, "step": 5255 }, { "epoch": 0.6590652800400952, "grad_norm": 0.7266896367073059, "learning_rate": 3.294699912291693e-06, "loss": 0.4622, "num_input_tokens_seen": 6391456, "step": 5260 }, { "epoch": 0.6596917679488786, "grad_norm": 0.8366338610649109, "learning_rate": 3.2978323518356096e-06, "loss": 0.4702, "num_input_tokens_seen": 6397312, "step": 5265 }, { "epoch": 0.6603182558576619, "grad_norm": 1.252393126487732, "learning_rate": 3.3009647913795268e-06, "loss": 0.4846, "num_input_tokens_seen": 6403744, "step": 5270 }, { "epoch": 0.6609447437664453, "grad_norm": 1.1327824592590332, "learning_rate": 3.3040972309234435e-06, "loss": 0.4623, "num_input_tokens_seen": 6409888, "step": 5275 }, { "epoch": 0.6615712316752287, "grad_norm": 1.1451164484024048, "learning_rate": 3.3072296704673603e-06, "loss": 0.4672, "num_input_tokens_seen": 6416064, "step": 5280 }, { "epoch": 0.6621977195840121, "grad_norm": 1.3924751281738281, "learning_rate": 3.310362110011277e-06, "loss": 0.4676, "num_input_tokens_seen": 6422368, "step": 5285 }, { "epoch": 0.6628242074927954, "grad_norm": 1.8114713430404663, "learning_rate": 3.313494549555194e-06, "loss": 0.4625, "num_input_tokens_seen": 6428544, "step": 5290 }, { "epoch": 0.6634506954015787, "grad_norm": 1.935884714126587, "learning_rate": 3.3166269890991106e-06, "loss": 0.4657, "num_input_tokens_seen": 6434592, "step": 5295 }, { "epoch": 0.6640771833103621, "grad_norm": 1.3324261903762817, "learning_rate": 3.3197594286430274e-06, "loss": 0.4625, "num_input_tokens_seen": 6440096, "step": 5300 }, { "epoch": 0.6647036712191454, "grad_norm": 0.7641839981079102, "learning_rate": 3.322891868186944e-06, "loss": 0.4577, "num_input_tokens_seen": 6445952, "step": 5305 }, { "epoch": 0.6653301591279288, "grad_norm": 0.5623354315757751, "learning_rate": 3.3260243077308613e-06, "loss": 0.4613, "num_input_tokens_seen": 6452416, "step": 5310 }, { "epoch": 0.6659566470367122, "grad_norm": 1.3568589687347412, "learning_rate": 3.3291567472747776e-06, "loss": 0.4597, "num_input_tokens_seen": 6458400, "step": 5315 }, { "epoch": 0.6665831349454956, "grad_norm": 1.2392988204956055, "learning_rate": 3.332289186818695e-06, "loss": 0.4596, "num_input_tokens_seen": 6464832, "step": 5320 }, { "epoch": 0.6672096228542789, "grad_norm": 2.4339802265167236, "learning_rate": 3.3354216263626116e-06, "loss": 0.4792, "num_input_tokens_seen": 6471232, "step": 5325 }, { "epoch": 0.6678361107630623, "grad_norm": 1.4291564226150513, "learning_rate": 3.3385540659065284e-06, "loss": 0.4586, "num_input_tokens_seen": 6477152, "step": 5330 }, { "epoch": 0.6684625986718457, "grad_norm": 1.9478750228881836, "learning_rate": 3.341686505450445e-06, "loss": 0.4588, "num_input_tokens_seen": 6483200, "step": 5335 }, { "epoch": 0.669089086580629, "grad_norm": 2.2393088340759277, "learning_rate": 3.344818944994362e-06, "loss": 0.4664, "num_input_tokens_seen": 6489280, "step": 5340 }, { "epoch": 0.6697155744894123, "grad_norm": 1.3257571458816528, "learning_rate": 3.3479513845382786e-06, "loss": 0.4608, "num_input_tokens_seen": 6495520, "step": 5345 }, { "epoch": 0.6703420623981957, "grad_norm": 1.6038122177124023, "learning_rate": 3.3510838240821954e-06, "loss": 0.4685, "num_input_tokens_seen": 6501536, "step": 5350 }, { "epoch": 0.6709685503069791, "grad_norm": 1.075229287147522, "learning_rate": 3.354216263626112e-06, "loss": 0.4667, "num_input_tokens_seen": 6507808, "step": 5355 }, { "epoch": 0.6715950382157624, "grad_norm": 1.0851255655288696, "learning_rate": 3.3573487031700294e-06, "loss": 0.4676, "num_input_tokens_seen": 6513824, "step": 5360 }, { "epoch": 0.6722215261245458, "grad_norm": 1.2680424451828003, "learning_rate": 3.360481142713946e-06, "loss": 0.469, "num_input_tokens_seen": 6520256, "step": 5365 }, { "epoch": 0.6728480140333292, "grad_norm": 0.9061691761016846, "learning_rate": 3.3636135822578625e-06, "loss": 0.453, "num_input_tokens_seen": 6526400, "step": 5370 }, { "epoch": 0.6734745019421126, "grad_norm": 0.9827490448951721, "learning_rate": 3.3667460218017796e-06, "loss": 0.4595, "num_input_tokens_seen": 6532736, "step": 5375 }, { "epoch": 0.6741009898508958, "grad_norm": 1.2266148328781128, "learning_rate": 3.3698784613456964e-06, "loss": 0.4654, "num_input_tokens_seen": 6539136, "step": 5380 }, { "epoch": 0.6747274777596792, "grad_norm": 1.447406530380249, "learning_rate": 3.373010900889613e-06, "loss": 0.4825, "num_input_tokens_seen": 6545408, "step": 5385 }, { "epoch": 0.6753539656684626, "grad_norm": 0.9692156314849854, "learning_rate": 3.37614334043353e-06, "loss": 0.4714, "num_input_tokens_seen": 6551552, "step": 5390 }, { "epoch": 0.675980453577246, "grad_norm": 1.382262945175171, "learning_rate": 3.3792757799774467e-06, "loss": 0.4505, "num_input_tokens_seen": 6557632, "step": 5395 }, { "epoch": 0.6766069414860293, "grad_norm": 1.4849447011947632, "learning_rate": 3.3824082195213635e-06, "loss": 0.468, "num_input_tokens_seen": 6563616, "step": 5400 }, { "epoch": 0.6772334293948127, "grad_norm": 1.0675127506256104, "learning_rate": 3.3855406590652802e-06, "loss": 0.4593, "num_input_tokens_seen": 6569536, "step": 5405 }, { "epoch": 0.6778599173035961, "grad_norm": 1.790581464767456, "learning_rate": 3.388673098609197e-06, "loss": 0.4695, "num_input_tokens_seen": 6575744, "step": 5410 }, { "epoch": 0.6784864052123794, "grad_norm": 0.664862334728241, "learning_rate": 3.391805538153114e-06, "loss": 0.4729, "num_input_tokens_seen": 6582304, "step": 5415 }, { "epoch": 0.6791128931211627, "grad_norm": 0.6972739100456238, "learning_rate": 3.394937977697031e-06, "loss": 0.4674, "num_input_tokens_seen": 6588544, "step": 5420 }, { "epoch": 0.6797393810299461, "grad_norm": 1.3447377681732178, "learning_rate": 3.3980704172409473e-06, "loss": 0.4539, "num_input_tokens_seen": 6594208, "step": 5425 }, { "epoch": 0.6803658689387295, "grad_norm": 0.6452199220657349, "learning_rate": 3.4012028567848645e-06, "loss": 0.4613, "num_input_tokens_seen": 6600288, "step": 5430 }, { "epoch": 0.6809923568475128, "grad_norm": 1.2896901369094849, "learning_rate": 3.4043352963287812e-06, "loss": 0.4721, "num_input_tokens_seen": 6606464, "step": 5435 }, { "epoch": 0.6816188447562962, "grad_norm": 2.2117092609405518, "learning_rate": 3.407467735872698e-06, "loss": 0.4464, "num_input_tokens_seen": 6612736, "step": 5440 }, { "epoch": 0.6822453326650796, "grad_norm": 0.8459197282791138, "learning_rate": 3.4106001754166148e-06, "loss": 0.4735, "num_input_tokens_seen": 6619104, "step": 5445 }, { "epoch": 0.682871820573863, "grad_norm": 0.7118391990661621, "learning_rate": 3.4137326149605315e-06, "loss": 0.4624, "num_input_tokens_seen": 6624992, "step": 5450 }, { "epoch": 0.6834983084826463, "grad_norm": 1.3291172981262207, "learning_rate": 3.4168650545044487e-06, "loss": 0.4593, "num_input_tokens_seen": 6631552, "step": 5455 }, { "epoch": 0.6841247963914296, "grad_norm": 2.0404109954833984, "learning_rate": 3.419997494048365e-06, "loss": 0.4741, "num_input_tokens_seen": 6637568, "step": 5460 }, { "epoch": 0.684751284300213, "grad_norm": 1.3344570398330688, "learning_rate": 3.423129933592282e-06, "loss": 0.4714, "num_input_tokens_seen": 6644032, "step": 5465 }, { "epoch": 0.6853777722089963, "grad_norm": 1.2026015520095825, "learning_rate": 3.426262373136199e-06, "loss": 0.4687, "num_input_tokens_seen": 6650048, "step": 5470 }, { "epoch": 0.6860042601177797, "grad_norm": 1.4049469232559204, "learning_rate": 3.4293948126801158e-06, "loss": 0.459, "num_input_tokens_seen": 6656576, "step": 5475 }, { "epoch": 0.6866307480265631, "grad_norm": 1.688076376914978, "learning_rate": 3.432527252224032e-06, "loss": 0.4693, "num_input_tokens_seen": 6662304, "step": 5480 }, { "epoch": 0.6872572359353465, "grad_norm": 1.3934366703033447, "learning_rate": 3.4356596917679493e-06, "loss": 0.4689, "num_input_tokens_seen": 6668896, "step": 5485 }, { "epoch": 0.6878837238441298, "grad_norm": 0.6982900500297546, "learning_rate": 3.438792131311866e-06, "loss": 0.466, "num_input_tokens_seen": 6675168, "step": 5490 }, { "epoch": 0.6885102117529132, "grad_norm": 1.677100419998169, "learning_rate": 3.4419245708557832e-06, "loss": 0.4663, "num_input_tokens_seen": 6681344, "step": 5495 }, { "epoch": 0.6891366996616966, "grad_norm": 1.1922389268875122, "learning_rate": 3.4450570103996996e-06, "loss": 0.4612, "num_input_tokens_seen": 6687584, "step": 5500 }, { "epoch": 0.6897631875704799, "grad_norm": 1.1347169876098633, "learning_rate": 3.4481894499436163e-06, "loss": 0.4691, "num_input_tokens_seen": 6693536, "step": 5505 }, { "epoch": 0.6903896754792632, "grad_norm": 0.9914090037345886, "learning_rate": 3.4513218894875335e-06, "loss": 0.4607, "num_input_tokens_seen": 6699712, "step": 5510 }, { "epoch": 0.6910161633880466, "grad_norm": 1.7671310901641846, "learning_rate": 3.45445432903145e-06, "loss": 0.4619, "num_input_tokens_seen": 6705984, "step": 5515 }, { "epoch": 0.69164265129683, "grad_norm": 1.1571027040481567, "learning_rate": 3.4575867685753666e-06, "loss": 0.4603, "num_input_tokens_seen": 6711968, "step": 5520 }, { "epoch": 0.6922691392056133, "grad_norm": 1.9462025165557861, "learning_rate": 3.460719208119284e-06, "loss": 0.4704, "num_input_tokens_seen": 6718208, "step": 5525 }, { "epoch": 0.6928956271143967, "grad_norm": 1.752455234527588, "learning_rate": 3.4638516476632006e-06, "loss": 0.4621, "num_input_tokens_seen": 6724288, "step": 5530 }, { "epoch": 0.6935221150231801, "grad_norm": 2.3210365772247314, "learning_rate": 3.466984087207117e-06, "loss": 0.4622, "num_input_tokens_seen": 6730400, "step": 5535 }, { "epoch": 0.6941486029319635, "grad_norm": 1.9073052406311035, "learning_rate": 3.470116526751034e-06, "loss": 0.462, "num_input_tokens_seen": 6736320, "step": 5540 }, { "epoch": 0.6947750908407467, "grad_norm": 1.8105024099349976, "learning_rate": 3.473248966294951e-06, "loss": 0.4647, "num_input_tokens_seen": 6741952, "step": 5545 }, { "epoch": 0.6954015787495301, "grad_norm": 2.7159461975097656, "learning_rate": 3.476381405838868e-06, "loss": 0.4793, "num_input_tokens_seen": 6747936, "step": 5550 }, { "epoch": 0.6960280666583135, "grad_norm": 4.461328506469727, "learning_rate": 3.4795138453827844e-06, "loss": 0.4667, "num_input_tokens_seen": 6754176, "step": 5555 }, { "epoch": 0.6966545545670969, "grad_norm": 2.683979034423828, "learning_rate": 3.482646284926701e-06, "loss": 0.4875, "num_input_tokens_seen": 6759520, "step": 5560 }, { "epoch": 0.6972810424758802, "grad_norm": 2.5152759552001953, "learning_rate": 3.4857787244706183e-06, "loss": 0.4627, "num_input_tokens_seen": 6765600, "step": 5565 }, { "epoch": 0.6979075303846636, "grad_norm": 3.5667724609375, "learning_rate": 3.4889111640145347e-06, "loss": 0.4584, "num_input_tokens_seen": 6771872, "step": 5570 }, { "epoch": 0.698534018293447, "grad_norm": 3.569964647293091, "learning_rate": 3.4920436035584514e-06, "loss": 0.4801, "num_input_tokens_seen": 6778272, "step": 5575 }, { "epoch": 0.6991605062022302, "grad_norm": 3.6138665676116943, "learning_rate": 3.4951760431023686e-06, "loss": 0.4736, "num_input_tokens_seen": 6784064, "step": 5580 }, { "epoch": 0.6997869941110136, "grad_norm": 3.268152952194214, "learning_rate": 3.4983084826462854e-06, "loss": 0.4597, "num_input_tokens_seen": 6790336, "step": 5585 }, { "epoch": 0.700413482019797, "grad_norm": 2.0147573947906494, "learning_rate": 3.5014409221902017e-06, "loss": 0.4536, "num_input_tokens_seen": 6796480, "step": 5590 }, { "epoch": 0.7010399699285804, "grad_norm": 3.022967576980591, "learning_rate": 3.504573361734119e-06, "loss": 0.4612, "num_input_tokens_seen": 6802656, "step": 5595 }, { "epoch": 0.7016664578373637, "grad_norm": 2.3131346702575684, "learning_rate": 3.5077058012780357e-06, "loss": 0.4644, "num_input_tokens_seen": 6808608, "step": 5600 }, { "epoch": 0.7022929457461471, "grad_norm": 2.940260887145996, "learning_rate": 3.510838240821953e-06, "loss": 0.4471, "num_input_tokens_seen": 6814528, "step": 5605 }, { "epoch": 0.7029194336549305, "grad_norm": 5.311794757843018, "learning_rate": 3.513970680365869e-06, "loss": 0.4758, "num_input_tokens_seen": 6820768, "step": 5610 }, { "epoch": 0.7035459215637139, "grad_norm": 2.7292709350585938, "learning_rate": 3.517103119909786e-06, "loss": 0.4676, "num_input_tokens_seen": 6827008, "step": 5615 }, { "epoch": 0.7041724094724972, "grad_norm": 6.525243759155273, "learning_rate": 3.520235559453703e-06, "loss": 0.4379, "num_input_tokens_seen": 6832960, "step": 5620 }, { "epoch": 0.7047988973812805, "grad_norm": 2.9283573627471924, "learning_rate": 3.5233679989976195e-06, "loss": 0.5056, "num_input_tokens_seen": 6839232, "step": 5625 }, { "epoch": 0.7054253852900639, "grad_norm": 1.6976981163024902, "learning_rate": 3.5265004385415363e-06, "loss": 0.4478, "num_input_tokens_seen": 6845344, "step": 5630 }, { "epoch": 0.7060518731988472, "grad_norm": 1.728721261024475, "learning_rate": 3.5296328780854534e-06, "loss": 0.4898, "num_input_tokens_seen": 6851424, "step": 5635 }, { "epoch": 0.7066783611076306, "grad_norm": 1.7051976919174194, "learning_rate": 3.53276531762937e-06, "loss": 0.465, "num_input_tokens_seen": 6857760, "step": 5640 }, { "epoch": 0.707304849016414, "grad_norm": 0.9993337988853455, "learning_rate": 3.5358977571732865e-06, "loss": 0.4649, "num_input_tokens_seen": 6863904, "step": 5645 }, { "epoch": 0.7079313369251974, "grad_norm": 1.3202866315841675, "learning_rate": 3.5390301967172037e-06, "loss": 0.4632, "num_input_tokens_seen": 6870048, "step": 5650 }, { "epoch": 0.7085578248339807, "grad_norm": 1.3137428760528564, "learning_rate": 3.5421626362611205e-06, "loss": 0.4636, "num_input_tokens_seen": 6876544, "step": 5655 }, { "epoch": 0.7091843127427641, "grad_norm": 0.8851814270019531, "learning_rate": 3.5452950758050377e-06, "loss": 0.4534, "num_input_tokens_seen": 6882528, "step": 5660 }, { "epoch": 0.7098108006515474, "grad_norm": 1.6292461156845093, "learning_rate": 3.548427515348954e-06, "loss": 0.4627, "num_input_tokens_seen": 6888736, "step": 5665 }, { "epoch": 0.7104372885603308, "grad_norm": 1.5151498317718506, "learning_rate": 3.5515599548928708e-06, "loss": 0.463, "num_input_tokens_seen": 6894752, "step": 5670 }, { "epoch": 0.7110637764691141, "grad_norm": 1.813291072845459, "learning_rate": 3.554692394436788e-06, "loss": 0.4614, "num_input_tokens_seen": 6901152, "step": 5675 }, { "epoch": 0.7116902643778975, "grad_norm": 1.4027445316314697, "learning_rate": 3.5578248339807043e-06, "loss": 0.4466, "num_input_tokens_seen": 6907616, "step": 5680 }, { "epoch": 0.7123167522866809, "grad_norm": 0.8691561818122864, "learning_rate": 3.560957273524621e-06, "loss": 0.466, "num_input_tokens_seen": 6913760, "step": 5685 }, { "epoch": 0.7129432401954642, "grad_norm": 1.3978931903839111, "learning_rate": 3.5640897130685383e-06, "loss": 0.439, "num_input_tokens_seen": 6919744, "step": 5690 }, { "epoch": 0.7135697281042476, "grad_norm": 2.5965306758880615, "learning_rate": 3.567222152612455e-06, "loss": 0.4715, "num_input_tokens_seen": 6925920, "step": 5695 }, { "epoch": 0.714196216013031, "grad_norm": 1.4249467849731445, "learning_rate": 3.5703545921563714e-06, "loss": 0.4714, "num_input_tokens_seen": 6932128, "step": 5700 }, { "epoch": 0.7148227039218144, "grad_norm": 1.4524868726730347, "learning_rate": 3.5734870317002885e-06, "loss": 0.4699, "num_input_tokens_seen": 6937984, "step": 5705 }, { "epoch": 0.7154491918305976, "grad_norm": 1.2273043394088745, "learning_rate": 3.5766194712442053e-06, "loss": 0.4555, "num_input_tokens_seen": 6944160, "step": 5710 }, { "epoch": 0.716075679739381, "grad_norm": 1.3947603702545166, "learning_rate": 3.5797519107881225e-06, "loss": 0.4607, "num_input_tokens_seen": 6949824, "step": 5715 }, { "epoch": 0.7167021676481644, "grad_norm": 1.3381474018096924, "learning_rate": 3.582884350332039e-06, "loss": 0.4839, "num_input_tokens_seen": 6955680, "step": 5720 }, { "epoch": 0.7173286555569478, "grad_norm": 1.6870602369308472, "learning_rate": 3.5860167898759556e-06, "loss": 0.4638, "num_input_tokens_seen": 6961152, "step": 5725 }, { "epoch": 0.7179551434657311, "grad_norm": 1.6583044528961182, "learning_rate": 3.5891492294198728e-06, "loss": 0.4681, "num_input_tokens_seen": 6967392, "step": 5730 }, { "epoch": 0.7185816313745145, "grad_norm": 1.3812631368637085, "learning_rate": 3.592281668963789e-06, "loss": 0.4876, "num_input_tokens_seen": 6973216, "step": 5735 }, { "epoch": 0.7192081192832979, "grad_norm": 1.1335357427597046, "learning_rate": 3.595414108507706e-06, "loss": 0.4641, "num_input_tokens_seen": 6979264, "step": 5740 }, { "epoch": 0.7198346071920811, "grad_norm": 1.1054099798202515, "learning_rate": 3.598546548051623e-06, "loss": 0.4626, "num_input_tokens_seen": 6985536, "step": 5745 }, { "epoch": 0.7204610951008645, "grad_norm": 1.6080291271209717, "learning_rate": 3.60167898759554e-06, "loss": 0.4607, "num_input_tokens_seen": 6991552, "step": 5750 }, { "epoch": 0.7210875830096479, "grad_norm": 0.9151008725166321, "learning_rate": 3.604811427139456e-06, "loss": 0.4684, "num_input_tokens_seen": 6997792, "step": 5755 }, { "epoch": 0.7217140709184313, "grad_norm": 1.2436360120773315, "learning_rate": 3.6079438666833734e-06, "loss": 0.4616, "num_input_tokens_seen": 7003904, "step": 5760 }, { "epoch": 0.7223405588272146, "grad_norm": 1.6995152235031128, "learning_rate": 3.61107630622729e-06, "loss": 0.4652, "num_input_tokens_seen": 7010080, "step": 5765 }, { "epoch": 0.722967046735998, "grad_norm": 1.3789864778518677, "learning_rate": 3.6142087457712073e-06, "loss": 0.4638, "num_input_tokens_seen": 7016320, "step": 5770 }, { "epoch": 0.7235935346447814, "grad_norm": 1.2996665239334106, "learning_rate": 3.6173411853151237e-06, "loss": 0.4642, "num_input_tokens_seen": 7022208, "step": 5775 }, { "epoch": 0.7242200225535648, "grad_norm": 0.9225378632545471, "learning_rate": 3.6204736248590404e-06, "loss": 0.4604, "num_input_tokens_seen": 7028192, "step": 5780 }, { "epoch": 0.724846510462348, "grad_norm": 2.061629056930542, "learning_rate": 3.6236060644029576e-06, "loss": 0.4716, "num_input_tokens_seen": 7033888, "step": 5785 }, { "epoch": 0.7254729983711314, "grad_norm": 1.5078986883163452, "learning_rate": 3.626738503946874e-06, "loss": 0.4721, "num_input_tokens_seen": 7040128, "step": 5790 }, { "epoch": 0.7260994862799148, "grad_norm": 1.7351441383361816, "learning_rate": 3.6298709434907907e-06, "loss": 0.4658, "num_input_tokens_seen": 7046368, "step": 5795 }, { "epoch": 0.7267259741886981, "grad_norm": 1.017013669013977, "learning_rate": 3.633003383034708e-06, "loss": 0.4681, "num_input_tokens_seen": 7052640, "step": 5800 }, { "epoch": 0.7273524620974815, "grad_norm": 1.0710018873214722, "learning_rate": 3.6361358225786247e-06, "loss": 0.4623, "num_input_tokens_seen": 7058464, "step": 5805 }, { "epoch": 0.7279789500062649, "grad_norm": 0.5959440469741821, "learning_rate": 3.639268262122541e-06, "loss": 0.4604, "num_input_tokens_seen": 7064704, "step": 5810 }, { "epoch": 0.7286054379150483, "grad_norm": 1.0253627300262451, "learning_rate": 3.642400701666458e-06, "loss": 0.4621, "num_input_tokens_seen": 7070496, "step": 5815 }, { "epoch": 0.7292319258238316, "grad_norm": 1.035975456237793, "learning_rate": 3.645533141210375e-06, "loss": 0.4651, "num_input_tokens_seen": 7076448, "step": 5820 }, { "epoch": 0.729858413732615, "grad_norm": 0.7796387076377869, "learning_rate": 3.648665580754292e-06, "loss": 0.4597, "num_input_tokens_seen": 7082560, "step": 5825 }, { "epoch": 0.7304849016413983, "grad_norm": 0.9611402750015259, "learning_rate": 3.6517980202982085e-06, "loss": 0.4571, "num_input_tokens_seen": 7088640, "step": 5830 }, { "epoch": 0.7311113895501817, "grad_norm": 1.9811288118362427, "learning_rate": 3.6549304598421252e-06, "loss": 0.4619, "num_input_tokens_seen": 7093952, "step": 5835 }, { "epoch": 0.731737877458965, "grad_norm": 1.659798502922058, "learning_rate": 3.6580628993860424e-06, "loss": 0.4716, "num_input_tokens_seen": 7099968, "step": 5840 }, { "epoch": 0.7323643653677484, "grad_norm": 1.0668751001358032, "learning_rate": 3.6611953389299588e-06, "loss": 0.4529, "num_input_tokens_seen": 7106240, "step": 5845 }, { "epoch": 0.7329908532765318, "grad_norm": 1.3976436853408813, "learning_rate": 3.6643277784738755e-06, "loss": 0.4636, "num_input_tokens_seen": 7112512, "step": 5850 }, { "epoch": 0.7336173411853151, "grad_norm": 1.2798875570297241, "learning_rate": 3.6674602180177927e-06, "loss": 0.4747, "num_input_tokens_seen": 7118400, "step": 5855 }, { "epoch": 0.7342438290940985, "grad_norm": 1.8169795274734497, "learning_rate": 3.6705926575617095e-06, "loss": 0.4721, "num_input_tokens_seen": 7123520, "step": 5860 }, { "epoch": 0.7348703170028819, "grad_norm": 0.5685651302337646, "learning_rate": 3.673725097105626e-06, "loss": 0.4808, "num_input_tokens_seen": 7129504, "step": 5865 }, { "epoch": 0.7354968049116652, "grad_norm": 0.9633081555366516, "learning_rate": 3.676857536649543e-06, "loss": 0.4706, "num_input_tokens_seen": 7135264, "step": 5870 }, { "epoch": 0.7361232928204485, "grad_norm": 1.1195409297943115, "learning_rate": 3.6799899761934598e-06, "loss": 0.4622, "num_input_tokens_seen": 7141024, "step": 5875 }, { "epoch": 0.7367497807292319, "grad_norm": 1.2879456281661987, "learning_rate": 3.683122415737377e-06, "loss": 0.4664, "num_input_tokens_seen": 7147296, "step": 5880 }, { "epoch": 0.7373762686380153, "grad_norm": 2.4504213333129883, "learning_rate": 3.6862548552812933e-06, "loss": 0.4602, "num_input_tokens_seen": 7152896, "step": 5885 }, { "epoch": 0.7380027565467987, "grad_norm": 2.2493579387664795, "learning_rate": 3.68938729482521e-06, "loss": 0.4785, "num_input_tokens_seen": 7158944, "step": 5890 }, { "epoch": 0.738629244455582, "grad_norm": 1.0875225067138672, "learning_rate": 3.6925197343691272e-06, "loss": 0.4642, "num_input_tokens_seen": 7164480, "step": 5895 }, { "epoch": 0.7392557323643654, "grad_norm": 1.4516935348510742, "learning_rate": 3.6956521739130436e-06, "loss": 0.4556, "num_input_tokens_seen": 7170720, "step": 5900 }, { "epoch": 0.7398822202731488, "grad_norm": 2.0907466411590576, "learning_rate": 3.6987846134569603e-06, "loss": 0.4574, "num_input_tokens_seen": 7176768, "step": 5905 }, { "epoch": 0.740508708181932, "grad_norm": 0.8136513233184814, "learning_rate": 3.7019170530008775e-06, "loss": 0.456, "num_input_tokens_seen": 7182848, "step": 5910 }, { "epoch": 0.7411351960907154, "grad_norm": 2.0212676525115967, "learning_rate": 3.7050494925447943e-06, "loss": 0.4528, "num_input_tokens_seen": 7189024, "step": 5915 }, { "epoch": 0.7417616839994988, "grad_norm": 4.566936492919922, "learning_rate": 3.7081819320887106e-06, "loss": 0.4832, "num_input_tokens_seen": 7195104, "step": 5920 }, { "epoch": 0.7423881719082822, "grad_norm": 10.008139610290527, "learning_rate": 3.711314371632628e-06, "loss": 0.4495, "num_input_tokens_seen": 7201184, "step": 5925 }, { "epoch": 0.7430146598170655, "grad_norm": 17.118356704711914, "learning_rate": 3.7144468111765446e-06, "loss": 0.4617, "num_input_tokens_seen": 7206688, "step": 5930 }, { "epoch": 0.7436411477258489, "grad_norm": 17.694124221801758, "learning_rate": 3.7175792507204618e-06, "loss": 0.4809, "num_input_tokens_seen": 7212704, "step": 5935 }, { "epoch": 0.7442676356346323, "grad_norm": 2.903068780899048, "learning_rate": 3.720711690264378e-06, "loss": 0.4588, "num_input_tokens_seen": 7218720, "step": 5940 }, { "epoch": 0.7448941235434157, "grad_norm": 4.093738079071045, "learning_rate": 3.723844129808295e-06, "loss": 0.4413, "num_input_tokens_seen": 7224896, "step": 5945 }, { "epoch": 0.745520611452199, "grad_norm": 1.8037835359573364, "learning_rate": 3.726976569352212e-06, "loss": 0.4591, "num_input_tokens_seen": 7231072, "step": 5950 }, { "epoch": 0.7461470993609823, "grad_norm": 1.3758559226989746, "learning_rate": 3.7301090088961284e-06, "loss": 0.4716, "num_input_tokens_seen": 7237184, "step": 5955 }, { "epoch": 0.7467735872697657, "grad_norm": 1.9336742162704468, "learning_rate": 3.733241448440045e-06, "loss": 0.4684, "num_input_tokens_seen": 7243328, "step": 5960 }, { "epoch": 0.747400075178549, "grad_norm": 5.34736967086792, "learning_rate": 3.7363738879839623e-06, "loss": 0.467, "num_input_tokens_seen": 7249472, "step": 5965 }, { "epoch": 0.7480265630873324, "grad_norm": 3.061384916305542, "learning_rate": 3.739506327527879e-06, "loss": 0.5352, "num_input_tokens_seen": 7255904, "step": 5970 }, { "epoch": 0.7486530509961158, "grad_norm": 3.3381872177124023, "learning_rate": 3.7426387670717954e-06, "loss": 0.4246, "num_input_tokens_seen": 7262176, "step": 5975 }, { "epoch": 0.7492795389048992, "grad_norm": 3.4658913612365723, "learning_rate": 3.7457712066157126e-06, "loss": 0.4391, "num_input_tokens_seen": 7267936, "step": 5980 }, { "epoch": 0.7499060268136825, "grad_norm": 27.017019271850586, "learning_rate": 3.7489036461596294e-06, "loss": 0.7901, "num_input_tokens_seen": 7273376, "step": 5985 }, { "epoch": 0.7505325147224658, "grad_norm": 33.03109359741211, "learning_rate": 3.7520360857035466e-06, "loss": 0.7531, "num_input_tokens_seen": 7279200, "step": 5990 }, { "epoch": 0.7511590026312492, "grad_norm": 37.29126739501953, "learning_rate": 3.755168525247463e-06, "loss": 0.7237, "num_input_tokens_seen": 7285440, "step": 5995 }, { "epoch": 0.7517854905400326, "grad_norm": 1.135636329650879, "learning_rate": 3.7583009647913797e-06, "loss": 0.4724, "num_input_tokens_seen": 7291424, "step": 6000 }, { "epoch": 0.7524119784488159, "grad_norm": 1.1752755641937256, "learning_rate": 3.761433404335297e-06, "loss": 0.4601, "num_input_tokens_seen": 7297600, "step": 6005 }, { "epoch": 0.7530384663575993, "grad_norm": 1.18716299533844, "learning_rate": 3.7645658438792132e-06, "loss": 0.4617, "num_input_tokens_seen": 7303808, "step": 6010 }, { "epoch": 0.7536649542663827, "grad_norm": 1.1687755584716797, "learning_rate": 3.76769828342313e-06, "loss": 0.4631, "num_input_tokens_seen": 7309984, "step": 6015 }, { "epoch": 0.754291442175166, "grad_norm": 0.5838687419891357, "learning_rate": 3.770830722967047e-06, "loss": 0.4579, "num_input_tokens_seen": 7315520, "step": 6020 }, { "epoch": 0.7549179300839494, "grad_norm": 0.7947584390640259, "learning_rate": 3.773963162510964e-06, "loss": 0.4588, "num_input_tokens_seen": 7321728, "step": 6025 }, { "epoch": 0.7555444179927328, "grad_norm": 0.9567726850509644, "learning_rate": 3.7770956020548803e-06, "loss": 0.4635, "num_input_tokens_seen": 7327584, "step": 6030 }, { "epoch": 0.7561709059015161, "grad_norm": 1.1951208114624023, "learning_rate": 3.7802280415987975e-06, "loss": 0.4716, "num_input_tokens_seen": 7333600, "step": 6035 }, { "epoch": 0.7567973938102994, "grad_norm": 0.7822927236557007, "learning_rate": 3.7833604811427142e-06, "loss": 0.4668, "num_input_tokens_seen": 7340032, "step": 6040 }, { "epoch": 0.7574238817190828, "grad_norm": 1.6863031387329102, "learning_rate": 3.7864929206866314e-06, "loss": 0.4694, "num_input_tokens_seen": 7346336, "step": 6045 }, { "epoch": 0.7580503696278662, "grad_norm": 0.8882563710212708, "learning_rate": 3.7896253602305477e-06, "loss": 0.4598, "num_input_tokens_seen": 7352448, "step": 6050 }, { "epoch": 0.7586768575366496, "grad_norm": 0.8671595454216003, "learning_rate": 3.7927577997744645e-06, "loss": 0.4682, "num_input_tokens_seen": 7358624, "step": 6055 }, { "epoch": 0.7593033454454329, "grad_norm": 0.8702561855316162, "learning_rate": 3.7958902393183817e-06, "loss": 0.4749, "num_input_tokens_seen": 7365088, "step": 6060 }, { "epoch": 0.7599298333542163, "grad_norm": 1.091588020324707, "learning_rate": 3.799022678862298e-06, "loss": 0.4539, "num_input_tokens_seen": 7371104, "step": 6065 }, { "epoch": 0.7605563212629997, "grad_norm": 1.2733203172683716, "learning_rate": 3.802155118406215e-06, "loss": 0.4651, "num_input_tokens_seen": 7377024, "step": 6070 }, { "epoch": 0.7611828091717829, "grad_norm": 0.6299408674240112, "learning_rate": 3.805287557950132e-06, "loss": 0.4612, "num_input_tokens_seen": 7383136, "step": 6075 }, { "epoch": 0.7618092970805663, "grad_norm": 0.9563533067703247, "learning_rate": 3.8084199974940487e-06, "loss": 0.4572, "num_input_tokens_seen": 7389504, "step": 6080 }, { "epoch": 0.7624357849893497, "grad_norm": 0.942264199256897, "learning_rate": 3.811552437037965e-06, "loss": 0.4629, "num_input_tokens_seen": 7395648, "step": 6085 }, { "epoch": 0.7630622728981331, "grad_norm": 0.8745871782302856, "learning_rate": 3.8146848765818823e-06, "loss": 0.4596, "num_input_tokens_seen": 7401792, "step": 6090 }, { "epoch": 0.7636887608069164, "grad_norm": 0.9850392937660217, "learning_rate": 3.8178173161257995e-06, "loss": 0.479, "num_input_tokens_seen": 7407776, "step": 6095 }, { "epoch": 0.7643152487156998, "grad_norm": 0.5673009157180786, "learning_rate": 3.820949755669716e-06, "loss": 0.4705, "num_input_tokens_seen": 7413920, "step": 6100 }, { "epoch": 0.7649417366244832, "grad_norm": 1.4413567781448364, "learning_rate": 3.824082195213632e-06, "loss": 0.4462, "num_input_tokens_seen": 7420320, "step": 6105 }, { "epoch": 0.7655682245332666, "grad_norm": 0.8295035362243652, "learning_rate": 3.82721463475755e-06, "loss": 0.4487, "num_input_tokens_seen": 7426080, "step": 6110 }, { "epoch": 0.7661947124420498, "grad_norm": 0.7687625288963318, "learning_rate": 3.8303470743014665e-06, "loss": 0.4834, "num_input_tokens_seen": 7431648, "step": 6115 }, { "epoch": 0.7668212003508332, "grad_norm": 1.2289323806762695, "learning_rate": 3.833479513845383e-06, "loss": 0.4726, "num_input_tokens_seen": 7437120, "step": 6120 }, { "epoch": 0.7674476882596166, "grad_norm": 1.735982060432434, "learning_rate": 3.8366119533893e-06, "loss": 0.4652, "num_input_tokens_seen": 7443264, "step": 6125 }, { "epoch": 0.7680741761683999, "grad_norm": 0.964600145816803, "learning_rate": 3.839744392933217e-06, "loss": 0.4553, "num_input_tokens_seen": 7449536, "step": 6130 }, { "epoch": 0.7687006640771833, "grad_norm": 0.7020222544670105, "learning_rate": 3.8428768324771336e-06, "loss": 0.4652, "num_input_tokens_seen": 7455424, "step": 6135 }, { "epoch": 0.7693271519859667, "grad_norm": 0.905868649482727, "learning_rate": 3.84600927202105e-06, "loss": 0.4644, "num_input_tokens_seen": 7461344, "step": 6140 }, { "epoch": 0.7699536398947501, "grad_norm": 0.6087691783905029, "learning_rate": 3.849141711564967e-06, "loss": 0.4563, "num_input_tokens_seen": 7467488, "step": 6145 }, { "epoch": 0.7705801278035334, "grad_norm": 0.8756998777389526, "learning_rate": 3.852274151108884e-06, "loss": 0.4691, "num_input_tokens_seen": 7473664, "step": 6150 }, { "epoch": 0.7712066157123167, "grad_norm": 1.2039422988891602, "learning_rate": 3.855406590652801e-06, "loss": 0.4594, "num_input_tokens_seen": 7479904, "step": 6155 }, { "epoch": 0.7718331036211001, "grad_norm": 1.2888628244400024, "learning_rate": 3.858539030196717e-06, "loss": 0.4572, "num_input_tokens_seen": 7486112, "step": 6160 }, { "epoch": 0.7724595915298835, "grad_norm": 0.6188322901725769, "learning_rate": 3.861671469740634e-06, "loss": 0.4659, "num_input_tokens_seen": 7492064, "step": 6165 }, { "epoch": 0.7730860794386668, "grad_norm": 1.1210371255874634, "learning_rate": 3.864803909284551e-06, "loss": 0.4781, "num_input_tokens_seen": 7498048, "step": 6170 }, { "epoch": 0.7737125673474502, "grad_norm": 1.2147818803787231, "learning_rate": 3.867936348828468e-06, "loss": 0.4593, "num_input_tokens_seen": 7504352, "step": 6175 }, { "epoch": 0.7743390552562336, "grad_norm": 1.5232936143875122, "learning_rate": 3.8710687883723844e-06, "loss": 0.465, "num_input_tokens_seen": 7510592, "step": 6180 }, { "epoch": 0.7749655431650169, "grad_norm": 1.0699304342269897, "learning_rate": 3.874201227916301e-06, "loss": 0.4714, "num_input_tokens_seen": 7517024, "step": 6185 }, { "epoch": 0.7755920310738003, "grad_norm": 0.555325984954834, "learning_rate": 3.877333667460219e-06, "loss": 0.4657, "num_input_tokens_seen": 7523232, "step": 6190 }, { "epoch": 0.7762185189825836, "grad_norm": 0.9150404930114746, "learning_rate": 3.880466107004135e-06, "loss": 0.4655, "num_input_tokens_seen": 7529440, "step": 6195 }, { "epoch": 0.776845006891367, "grad_norm": 0.5372653603553772, "learning_rate": 3.883598546548052e-06, "loss": 0.4667, "num_input_tokens_seen": 7535648, "step": 6200 }, { "epoch": 0.7774714948001503, "grad_norm": 0.5284656286239624, "learning_rate": 3.886730986091969e-06, "loss": 0.4552, "num_input_tokens_seen": 7541888, "step": 6205 }, { "epoch": 0.7780979827089337, "grad_norm": 0.8176357746124268, "learning_rate": 3.889863425635886e-06, "loss": 0.4649, "num_input_tokens_seen": 7547648, "step": 6210 }, { "epoch": 0.7787244706177171, "grad_norm": 0.7662605047225952, "learning_rate": 3.892995865179803e-06, "loss": 0.4812, "num_input_tokens_seen": 7553728, "step": 6215 }, { "epoch": 0.7793509585265005, "grad_norm": 1.0453331470489502, "learning_rate": 3.896128304723719e-06, "loss": 0.4724, "num_input_tokens_seen": 7559168, "step": 6220 }, { "epoch": 0.7799774464352838, "grad_norm": 1.6141154766082764, "learning_rate": 3.899260744267636e-06, "loss": 0.4597, "num_input_tokens_seen": 7565152, "step": 6225 }, { "epoch": 0.7806039343440672, "grad_norm": 0.8008542060852051, "learning_rate": 3.902393183811553e-06, "loss": 0.459, "num_input_tokens_seen": 7570336, "step": 6230 }, { "epoch": 0.7812304222528506, "grad_norm": 1.1405855417251587, "learning_rate": 3.90552562335547e-06, "loss": 0.4598, "num_input_tokens_seen": 7576288, "step": 6235 }, { "epoch": 0.7818569101616338, "grad_norm": 1.3547331094741821, "learning_rate": 3.9086580628993864e-06, "loss": 0.4644, "num_input_tokens_seen": 7582400, "step": 6240 }, { "epoch": 0.7824833980704172, "grad_norm": 2.536699056625366, "learning_rate": 3.911790502443303e-06, "loss": 0.474, "num_input_tokens_seen": 7588256, "step": 6245 }, { "epoch": 0.7831098859792006, "grad_norm": 1.8737424612045288, "learning_rate": 3.91492294198722e-06, "loss": 0.4563, "num_input_tokens_seen": 7593888, "step": 6250 }, { "epoch": 0.783736373887984, "grad_norm": 2.248889684677124, "learning_rate": 3.918055381531137e-06, "loss": 0.4643, "num_input_tokens_seen": 7599776, "step": 6255 }, { "epoch": 0.7843628617967673, "grad_norm": 2.290198564529419, "learning_rate": 3.9211878210750535e-06, "loss": 0.4682, "num_input_tokens_seen": 7605920, "step": 6260 }, { "epoch": 0.7849893497055507, "grad_norm": 1.3963537216186523, "learning_rate": 3.92432026061897e-06, "loss": 0.4589, "num_input_tokens_seen": 7611680, "step": 6265 }, { "epoch": 0.7856158376143341, "grad_norm": 1.394201397895813, "learning_rate": 3.927452700162887e-06, "loss": 0.4626, "num_input_tokens_seen": 7617696, "step": 6270 }, { "epoch": 0.7862423255231175, "grad_norm": 0.8121210336685181, "learning_rate": 3.930585139706804e-06, "loss": 0.4647, "num_input_tokens_seen": 7624128, "step": 6275 }, { "epoch": 0.7868688134319007, "grad_norm": 1.3154616355895996, "learning_rate": 3.9337175792507205e-06, "loss": 0.4727, "num_input_tokens_seen": 7630112, "step": 6280 }, { "epoch": 0.7874953013406841, "grad_norm": 0.7740370035171509, "learning_rate": 3.936850018794637e-06, "loss": 0.4771, "num_input_tokens_seen": 7636608, "step": 6285 }, { "epoch": 0.7881217892494675, "grad_norm": 0.9312741756439209, "learning_rate": 3.939982458338554e-06, "loss": 0.4571, "num_input_tokens_seen": 7642176, "step": 6290 }, { "epoch": 0.7887482771582508, "grad_norm": 0.7713744640350342, "learning_rate": 3.943114897882472e-06, "loss": 0.471, "num_input_tokens_seen": 7648352, "step": 6295 }, { "epoch": 0.7893747650670342, "grad_norm": 0.8703476190567017, "learning_rate": 3.9462473374263884e-06, "loss": 0.4645, "num_input_tokens_seen": 7654464, "step": 6300 }, { "epoch": 0.7900012529758176, "grad_norm": 0.9158875346183777, "learning_rate": 3.949379776970304e-06, "loss": 0.4663, "num_input_tokens_seen": 7660256, "step": 6305 }, { "epoch": 0.790627740884601, "grad_norm": 0.70354163646698, "learning_rate": 3.952512216514222e-06, "loss": 0.4634, "num_input_tokens_seen": 7666464, "step": 6310 }, { "epoch": 0.7912542287933843, "grad_norm": 1.448798656463623, "learning_rate": 3.955644656058139e-06, "loss": 0.4669, "num_input_tokens_seen": 7671584, "step": 6315 }, { "epoch": 0.7918807167021676, "grad_norm": 0.7721143960952759, "learning_rate": 3.9587770956020555e-06, "loss": 0.4633, "num_input_tokens_seen": 7677984, "step": 6320 }, { "epoch": 0.792507204610951, "grad_norm": 0.8220639228820801, "learning_rate": 3.961909535145972e-06, "loss": 0.4669, "num_input_tokens_seen": 7684000, "step": 6325 }, { "epoch": 0.7931336925197344, "grad_norm": 0.7058441042900085, "learning_rate": 3.965041974689889e-06, "loss": 0.4572, "num_input_tokens_seen": 7690304, "step": 6330 }, { "epoch": 0.7937601804285177, "grad_norm": 0.8534722328186035, "learning_rate": 3.968174414233806e-06, "loss": 0.4544, "num_input_tokens_seen": 7696320, "step": 6335 }, { "epoch": 0.7943866683373011, "grad_norm": 0.8968515992164612, "learning_rate": 3.9713068537777225e-06, "loss": 0.4782, "num_input_tokens_seen": 7702528, "step": 6340 }, { "epoch": 0.7950131562460845, "grad_norm": 0.975752055644989, "learning_rate": 3.974439293321639e-06, "loss": 0.4626, "num_input_tokens_seen": 7708672, "step": 6345 }, { "epoch": 0.7956396441548678, "grad_norm": 0.9179965853691101, "learning_rate": 3.977571732865556e-06, "loss": 0.4617, "num_input_tokens_seen": 7714656, "step": 6350 }, { "epoch": 0.7962661320636512, "grad_norm": 0.6822329163551331, "learning_rate": 3.980704172409473e-06, "loss": 0.4557, "num_input_tokens_seen": 7720864, "step": 6355 }, { "epoch": 0.7968926199724345, "grad_norm": 1.169169545173645, "learning_rate": 3.98383661195339e-06, "loss": 0.4644, "num_input_tokens_seen": 7727040, "step": 6360 }, { "epoch": 0.7975191078812179, "grad_norm": 1.0271257162094116, "learning_rate": 3.986969051497306e-06, "loss": 0.4722, "num_input_tokens_seen": 7733376, "step": 6365 }, { "epoch": 0.7981455957900012, "grad_norm": 1.0956021547317505, "learning_rate": 3.990101491041223e-06, "loss": 0.4689, "num_input_tokens_seen": 7739392, "step": 6370 }, { "epoch": 0.7987720836987846, "grad_norm": 1.382063865661621, "learning_rate": 3.993233930585141e-06, "loss": 0.441, "num_input_tokens_seen": 7745408, "step": 6375 }, { "epoch": 0.799398571607568, "grad_norm": 0.9415664076805115, "learning_rate": 3.996366370129057e-06, "loss": 0.4589, "num_input_tokens_seen": 7751392, "step": 6380 }, { "epoch": 0.8000250595163514, "grad_norm": 0.9620749950408936, "learning_rate": 3.999498809672973e-06, "loss": 0.4741, "num_input_tokens_seen": 7756928, "step": 6385 }, { "epoch": 0.8006515474251347, "grad_norm": 0.7893787026405334, "learning_rate": 4.002631249216891e-06, "loss": 0.4649, "num_input_tokens_seen": 7763104, "step": 6390 }, { "epoch": 0.8012780353339181, "grad_norm": 0.9418577551841736, "learning_rate": 4.005763688760807e-06, "loss": 0.4727, "num_input_tokens_seen": 7769472, "step": 6395 }, { "epoch": 0.8019045232427015, "grad_norm": 0.8858726620674133, "learning_rate": 4.008896128304724e-06, "loss": 0.469, "num_input_tokens_seen": 7775552, "step": 6400 }, { "epoch": 0.8025310111514847, "grad_norm": 0.7616958022117615, "learning_rate": 4.012028567848641e-06, "loss": 0.4655, "num_input_tokens_seen": 7781664, "step": 6405 }, { "epoch": 0.8031574990602681, "grad_norm": 0.6206940412521362, "learning_rate": 4.015161007392558e-06, "loss": 0.4606, "num_input_tokens_seen": 7787488, "step": 6410 }, { "epoch": 0.8037839869690515, "grad_norm": 0.6904027462005615, "learning_rate": 4.018293446936474e-06, "loss": 0.4591, "num_input_tokens_seen": 7793472, "step": 6415 }, { "epoch": 0.8044104748778349, "grad_norm": 0.68558269739151, "learning_rate": 4.021425886480392e-06, "loss": 0.4608, "num_input_tokens_seen": 7799392, "step": 6420 }, { "epoch": 0.8050369627866182, "grad_norm": 1.2924251556396484, "learning_rate": 4.024558326024308e-06, "loss": 0.4602, "num_input_tokens_seen": 7805568, "step": 6425 }, { "epoch": 0.8056634506954016, "grad_norm": 1.0276031494140625, "learning_rate": 4.027690765568225e-06, "loss": 0.4574, "num_input_tokens_seen": 7811552, "step": 6430 }, { "epoch": 0.806289938604185, "grad_norm": 0.884544312953949, "learning_rate": 4.030823205112142e-06, "loss": 0.4585, "num_input_tokens_seen": 7817824, "step": 6435 }, { "epoch": 0.8069164265129684, "grad_norm": 0.9715198278427124, "learning_rate": 4.033955644656059e-06, "loss": 0.4628, "num_input_tokens_seen": 7823552, "step": 6440 }, { "epoch": 0.8075429144217516, "grad_norm": 1.4816642999649048, "learning_rate": 4.037088084199975e-06, "loss": 0.4591, "num_input_tokens_seen": 7829760, "step": 6445 }, { "epoch": 0.808169402330535, "grad_norm": 1.6636353731155396, "learning_rate": 4.040220523743892e-06, "loss": 0.4825, "num_input_tokens_seen": 7836000, "step": 6450 }, { "epoch": 0.8087958902393184, "grad_norm": 0.7387264370918274, "learning_rate": 4.043352963287809e-06, "loss": 0.4684, "num_input_tokens_seen": 7841792, "step": 6455 }, { "epoch": 0.8094223781481017, "grad_norm": 1.8489809036254883, "learning_rate": 4.046485402831726e-06, "loss": 0.458, "num_input_tokens_seen": 7848128, "step": 6460 }, { "epoch": 0.8100488660568851, "grad_norm": 1.4089246988296509, "learning_rate": 4.0496178423756425e-06, "loss": 0.4629, "num_input_tokens_seen": 7854144, "step": 6465 }, { "epoch": 0.8106753539656685, "grad_norm": 1.349690318107605, "learning_rate": 4.052750281919559e-06, "loss": 0.4696, "num_input_tokens_seen": 7860160, "step": 6470 }, { "epoch": 0.8113018418744519, "grad_norm": 1.54990816116333, "learning_rate": 4.055882721463476e-06, "loss": 0.4619, "num_input_tokens_seen": 7865632, "step": 6475 }, { "epoch": 0.8119283297832351, "grad_norm": 1.6125292778015137, "learning_rate": 4.059015161007393e-06, "loss": 0.4665, "num_input_tokens_seen": 7871968, "step": 6480 }, { "epoch": 0.8125548176920185, "grad_norm": 1.231950283050537, "learning_rate": 4.06214760055131e-06, "loss": 0.4637, "num_input_tokens_seen": 7878304, "step": 6485 }, { "epoch": 0.8131813056008019, "grad_norm": 1.3876947164535522, "learning_rate": 4.065280040095226e-06, "loss": 0.4615, "num_input_tokens_seen": 7884320, "step": 6490 }, { "epoch": 0.8138077935095853, "grad_norm": 2.2658777236938477, "learning_rate": 4.068412479639143e-06, "loss": 0.4767, "num_input_tokens_seen": 7890368, "step": 6495 }, { "epoch": 0.8144342814183686, "grad_norm": 0.94569993019104, "learning_rate": 4.071544919183061e-06, "loss": 0.4593, "num_input_tokens_seen": 7896448, "step": 6500 }, { "epoch": 0.815060769327152, "grad_norm": 2.7546935081481934, "learning_rate": 4.0746773587269766e-06, "loss": 0.469, "num_input_tokens_seen": 7902304, "step": 6505 }, { "epoch": 0.8156872572359354, "grad_norm": 1.1363955736160278, "learning_rate": 4.077809798270893e-06, "loss": 0.4593, "num_input_tokens_seen": 7908544, "step": 6510 }, { "epoch": 0.8163137451447187, "grad_norm": 1.0607727766036987, "learning_rate": 4.080942237814811e-06, "loss": 0.4579, "num_input_tokens_seen": 7914272, "step": 6515 }, { "epoch": 0.816940233053502, "grad_norm": 1.0253198146820068, "learning_rate": 4.084074677358728e-06, "loss": 0.4624, "num_input_tokens_seen": 7920736, "step": 6520 }, { "epoch": 0.8175667209622854, "grad_norm": 1.5633660554885864, "learning_rate": 4.087207116902644e-06, "loss": 0.4593, "num_input_tokens_seen": 7926816, "step": 6525 }, { "epoch": 0.8181932088710688, "grad_norm": 0.4355519413948059, "learning_rate": 4.090339556446561e-06, "loss": 0.4652, "num_input_tokens_seen": 7932960, "step": 6530 }, { "epoch": 0.8188196967798521, "grad_norm": 0.7749466896057129, "learning_rate": 4.093471995990478e-06, "loss": 0.4668, "num_input_tokens_seen": 7939104, "step": 6535 }, { "epoch": 0.8194461846886355, "grad_norm": 0.8468030691146851, "learning_rate": 4.096604435534395e-06, "loss": 0.4697, "num_input_tokens_seen": 7944992, "step": 6540 }, { "epoch": 0.8200726725974189, "grad_norm": 0.7002502083778381, "learning_rate": 4.0997368750783115e-06, "loss": 0.4718, "num_input_tokens_seen": 7951104, "step": 6545 }, { "epoch": 0.8206991605062023, "grad_norm": 1.2397491931915283, "learning_rate": 4.102869314622228e-06, "loss": 0.4726, "num_input_tokens_seen": 7957024, "step": 6550 }, { "epoch": 0.8213256484149856, "grad_norm": 0.7723267674446106, "learning_rate": 4.106001754166145e-06, "loss": 0.4611, "num_input_tokens_seen": 7963520, "step": 6555 }, { "epoch": 0.821952136323769, "grad_norm": 1.335972785949707, "learning_rate": 4.109134193710062e-06, "loss": 0.4623, "num_input_tokens_seen": 7969504, "step": 6560 }, { "epoch": 0.8225786242325523, "grad_norm": 0.7362011075019836, "learning_rate": 4.1122666332539786e-06, "loss": 0.4592, "num_input_tokens_seen": 7975936, "step": 6565 }, { "epoch": 0.8232051121413356, "grad_norm": 0.7894134521484375, "learning_rate": 4.115399072797895e-06, "loss": 0.464, "num_input_tokens_seen": 7981536, "step": 6570 }, { "epoch": 0.823831600050119, "grad_norm": 0.7462196946144104, "learning_rate": 4.118531512341812e-06, "loss": 0.4619, "num_input_tokens_seen": 7987712, "step": 6575 }, { "epoch": 0.8244580879589024, "grad_norm": 1.1882516145706177, "learning_rate": 4.121663951885729e-06, "loss": 0.4657, "num_input_tokens_seen": 7993792, "step": 6580 }, { "epoch": 0.8250845758676858, "grad_norm": 0.4647921621799469, "learning_rate": 4.124796391429646e-06, "loss": 0.4641, "num_input_tokens_seen": 7999936, "step": 6585 }, { "epoch": 0.8257110637764691, "grad_norm": 0.6561231017112732, "learning_rate": 4.127928830973562e-06, "loss": 0.457, "num_input_tokens_seen": 8005888, "step": 6590 }, { "epoch": 0.8263375516852525, "grad_norm": 0.6631225347518921, "learning_rate": 4.13106127051748e-06, "loss": 0.4622, "num_input_tokens_seen": 8012192, "step": 6595 }, { "epoch": 0.8269640395940359, "grad_norm": 0.6242045760154724, "learning_rate": 4.134193710061396e-06, "loss": 0.4623, "num_input_tokens_seen": 8018528, "step": 6600 }, { "epoch": 0.8275905275028193, "grad_norm": 0.6838914155960083, "learning_rate": 4.137326149605313e-06, "loss": 0.465, "num_input_tokens_seen": 8024864, "step": 6605 }, { "epoch": 0.8282170154116025, "grad_norm": 0.6687288880348206, "learning_rate": 4.14045858914923e-06, "loss": 0.4574, "num_input_tokens_seen": 8031072, "step": 6610 }, { "epoch": 0.8288435033203859, "grad_norm": 1.888916015625, "learning_rate": 4.143591028693146e-06, "loss": 0.4728, "num_input_tokens_seen": 8036928, "step": 6615 }, { "epoch": 0.8294699912291693, "grad_norm": 0.9202989339828491, "learning_rate": 4.146723468237063e-06, "loss": 0.4681, "num_input_tokens_seen": 8043104, "step": 6620 }, { "epoch": 0.8300964791379526, "grad_norm": 1.3406703472137451, "learning_rate": 4.149855907780981e-06, "loss": 0.4593, "num_input_tokens_seen": 8049568, "step": 6625 }, { "epoch": 0.830722967046736, "grad_norm": 1.242172360420227, "learning_rate": 4.152988347324897e-06, "loss": 0.465, "num_input_tokens_seen": 8055840, "step": 6630 }, { "epoch": 0.8313494549555194, "grad_norm": 1.5796414613723755, "learning_rate": 4.156120786868813e-06, "loss": 0.4585, "num_input_tokens_seen": 8061888, "step": 6635 }, { "epoch": 0.8319759428643028, "grad_norm": 2.133742570877075, "learning_rate": 4.159253226412731e-06, "loss": 0.4651, "num_input_tokens_seen": 8068384, "step": 6640 }, { "epoch": 0.832602430773086, "grad_norm": 0.7462928295135498, "learning_rate": 4.162385665956648e-06, "loss": 0.4471, "num_input_tokens_seen": 8074560, "step": 6645 }, { "epoch": 0.8332289186818694, "grad_norm": 0.9056297540664673, "learning_rate": 4.165518105500564e-06, "loss": 0.4666, "num_input_tokens_seen": 8080480, "step": 6650 }, { "epoch": 0.8338554065906528, "grad_norm": 1.184759497642517, "learning_rate": 4.168650545044481e-06, "loss": 0.4782, "num_input_tokens_seen": 8086176, "step": 6655 }, { "epoch": 0.8344818944994361, "grad_norm": 1.0501145124435425, "learning_rate": 4.171782984588398e-06, "loss": 0.4544, "num_input_tokens_seen": 8092384, "step": 6660 }, { "epoch": 0.8351083824082195, "grad_norm": 0.8913512229919434, "learning_rate": 4.174915424132315e-06, "loss": 0.4558, "num_input_tokens_seen": 8098304, "step": 6665 }, { "epoch": 0.8357348703170029, "grad_norm": 0.8181647658348083, "learning_rate": 4.1780478636762314e-06, "loss": 0.4661, "num_input_tokens_seen": 8104384, "step": 6670 }, { "epoch": 0.8363613582257863, "grad_norm": 0.9829070568084717, "learning_rate": 4.181180303220148e-06, "loss": 0.4904, "num_input_tokens_seen": 8110112, "step": 6675 }, { "epoch": 0.8369878461345696, "grad_norm": 1.0267654657363892, "learning_rate": 4.184312742764065e-06, "loss": 0.4664, "num_input_tokens_seen": 8116256, "step": 6680 }, { "epoch": 0.837614334043353, "grad_norm": 0.8966061472892761, "learning_rate": 4.187445182307982e-06, "loss": 0.4738, "num_input_tokens_seen": 8122112, "step": 6685 }, { "epoch": 0.8382408219521363, "grad_norm": 1.1007040739059448, "learning_rate": 4.1905776218518985e-06, "loss": 0.4685, "num_input_tokens_seen": 8128416, "step": 6690 }, { "epoch": 0.8388673098609197, "grad_norm": 0.8459490537643433, "learning_rate": 4.193710061395815e-06, "loss": 0.4498, "num_input_tokens_seen": 8134784, "step": 6695 }, { "epoch": 0.839493797769703, "grad_norm": 0.7479050755500793, "learning_rate": 4.196842500939732e-06, "loss": 0.4712, "num_input_tokens_seen": 8140416, "step": 6700 }, { "epoch": 0.8401202856784864, "grad_norm": 0.6403502225875854, "learning_rate": 4.199974940483649e-06, "loss": 0.4607, "num_input_tokens_seen": 8146528, "step": 6705 }, { "epoch": 0.8407467735872698, "grad_norm": 1.6552213430404663, "learning_rate": 4.2031073800275655e-06, "loss": 0.4637, "num_input_tokens_seen": 8152288, "step": 6710 }, { "epoch": 0.8413732614960531, "grad_norm": 0.6765273213386536, "learning_rate": 4.206239819571482e-06, "loss": 0.4716, "num_input_tokens_seen": 8158816, "step": 6715 }, { "epoch": 0.8419997494048365, "grad_norm": 0.8421820998191833, "learning_rate": 4.2093722591154e-06, "loss": 0.4678, "num_input_tokens_seen": 8164928, "step": 6720 }, { "epoch": 0.8426262373136199, "grad_norm": 0.7759631872177124, "learning_rate": 4.212504698659316e-06, "loss": 0.458, "num_input_tokens_seen": 8170912, "step": 6725 }, { "epoch": 0.8432527252224032, "grad_norm": 0.8747381567955017, "learning_rate": 4.215637138203233e-06, "loss": 0.4598, "num_input_tokens_seen": 8176416, "step": 6730 }, { "epoch": 0.8438792131311865, "grad_norm": 1.5700112581253052, "learning_rate": 4.21876957774715e-06, "loss": 0.473, "num_input_tokens_seen": 8182048, "step": 6735 }, { "epoch": 0.8445057010399699, "grad_norm": 0.5988540649414062, "learning_rate": 4.221902017291067e-06, "loss": 0.4541, "num_input_tokens_seen": 8187936, "step": 6740 }, { "epoch": 0.8451321889487533, "grad_norm": 0.6145793199539185, "learning_rate": 4.225034456834983e-06, "loss": 0.4577, "num_input_tokens_seen": 8194368, "step": 6745 }, { "epoch": 0.8457586768575367, "grad_norm": 1.4179890155792236, "learning_rate": 4.2281668963789005e-06, "loss": 0.471, "num_input_tokens_seen": 8200544, "step": 6750 }, { "epoch": 0.84638516476632, "grad_norm": 1.4485793113708496, "learning_rate": 4.231299335922817e-06, "loss": 0.4718, "num_input_tokens_seen": 8206464, "step": 6755 }, { "epoch": 0.8470116526751034, "grad_norm": 0.5997931361198425, "learning_rate": 4.234431775466733e-06, "loss": 0.459, "num_input_tokens_seen": 8212480, "step": 6760 }, { "epoch": 0.8476381405838868, "grad_norm": 1.389685034751892, "learning_rate": 4.237564215010651e-06, "loss": 0.466, "num_input_tokens_seen": 8218976, "step": 6765 }, { "epoch": 0.84826462849267, "grad_norm": 0.28792595863342285, "learning_rate": 4.2406966545545676e-06, "loss": 0.4627, "num_input_tokens_seen": 8225184, "step": 6770 }, { "epoch": 0.8488911164014534, "grad_norm": 0.942619264125824, "learning_rate": 4.243829094098484e-06, "loss": 0.451, "num_input_tokens_seen": 8231296, "step": 6775 }, { "epoch": 0.8495176043102368, "grad_norm": 0.4060033857822418, "learning_rate": 4.246961533642401e-06, "loss": 0.4687, "num_input_tokens_seen": 8237312, "step": 6780 }, { "epoch": 0.8501440922190202, "grad_norm": 0.5095406174659729, "learning_rate": 4.250093973186318e-06, "loss": 0.4568, "num_input_tokens_seen": 8243328, "step": 6785 }, { "epoch": 0.8507705801278035, "grad_norm": 0.708234965801239, "learning_rate": 4.253226412730235e-06, "loss": 0.4646, "num_input_tokens_seen": 8249472, "step": 6790 }, { "epoch": 0.8513970680365869, "grad_norm": 0.8841362595558167, "learning_rate": 4.256358852274151e-06, "loss": 0.4666, "num_input_tokens_seen": 8255584, "step": 6795 }, { "epoch": 0.8520235559453703, "grad_norm": 0.8065303564071655, "learning_rate": 4.259491291818068e-06, "loss": 0.4573, "num_input_tokens_seen": 8261728, "step": 6800 }, { "epoch": 0.8526500438541537, "grad_norm": 0.5430068969726562, "learning_rate": 4.262623731361985e-06, "loss": 0.4586, "num_input_tokens_seen": 8267904, "step": 6805 }, { "epoch": 0.8532765317629369, "grad_norm": 0.38007721304893494, "learning_rate": 4.265756170905902e-06, "loss": 0.4632, "num_input_tokens_seen": 8274112, "step": 6810 }, { "epoch": 0.8539030196717203, "grad_norm": 1.1562811136245728, "learning_rate": 4.268888610449818e-06, "loss": 0.467, "num_input_tokens_seen": 8280096, "step": 6815 }, { "epoch": 0.8545295075805037, "grad_norm": 1.214715600013733, "learning_rate": 4.272021049993735e-06, "loss": 0.461, "num_input_tokens_seen": 8285888, "step": 6820 }, { "epoch": 0.855155995489287, "grad_norm": 0.8541649580001831, "learning_rate": 4.275153489537652e-06, "loss": 0.4607, "num_input_tokens_seen": 8291840, "step": 6825 }, { "epoch": 0.8557824833980704, "grad_norm": 0.775073230266571, "learning_rate": 4.2782859290815696e-06, "loss": 0.4649, "num_input_tokens_seen": 8298208, "step": 6830 }, { "epoch": 0.8564089713068538, "grad_norm": 0.7824276089668274, "learning_rate": 4.2814183686254855e-06, "loss": 0.4647, "num_input_tokens_seen": 8304288, "step": 6835 }, { "epoch": 0.8570354592156372, "grad_norm": 0.3855088949203491, "learning_rate": 4.284550808169402e-06, "loss": 0.4574, "num_input_tokens_seen": 8310240, "step": 6840 }, { "epoch": 0.8576619471244205, "grad_norm": 0.7167187333106995, "learning_rate": 4.28768324771332e-06, "loss": 0.462, "num_input_tokens_seen": 8316384, "step": 6845 }, { "epoch": 0.8582884350332038, "grad_norm": 1.0627747774124146, "learning_rate": 4.290815687257237e-06, "loss": 0.4638, "num_input_tokens_seen": 8322400, "step": 6850 }, { "epoch": 0.8589149229419872, "grad_norm": 0.7470921277999878, "learning_rate": 4.2939481268011525e-06, "loss": 0.4508, "num_input_tokens_seen": 8328352, "step": 6855 }, { "epoch": 0.8595414108507706, "grad_norm": 0.8915612101554871, "learning_rate": 4.29708056634507e-06, "loss": 0.4717, "num_input_tokens_seen": 8334016, "step": 6860 }, { "epoch": 0.8601678987595539, "grad_norm": 0.807310163974762, "learning_rate": 4.300213005888987e-06, "loss": 0.4543, "num_input_tokens_seen": 8340000, "step": 6865 }, { "epoch": 0.8607943866683373, "grad_norm": 0.3924940526485443, "learning_rate": 4.303345445432903e-06, "loss": 0.4615, "num_input_tokens_seen": 8346112, "step": 6870 }, { "epoch": 0.8614208745771207, "grad_norm": 0.8244190812110901, "learning_rate": 4.3064778849768204e-06, "loss": 0.4587, "num_input_tokens_seen": 8352256, "step": 6875 }, { "epoch": 0.862047362485904, "grad_norm": 0.9423044919967651, "learning_rate": 4.309610324520737e-06, "loss": 0.4331, "num_input_tokens_seen": 8357536, "step": 6880 }, { "epoch": 0.8626738503946874, "grad_norm": 2.0252416133880615, "learning_rate": 4.312742764064654e-06, "loss": 0.4584, "num_input_tokens_seen": 8363680, "step": 6885 }, { "epoch": 0.8633003383034707, "grad_norm": 0.9190636277198792, "learning_rate": 4.315875203608571e-06, "loss": 0.4641, "num_input_tokens_seen": 8369856, "step": 6890 }, { "epoch": 0.8639268262122541, "grad_norm": 1.4552181959152222, "learning_rate": 4.3190076431524875e-06, "loss": 0.4823, "num_input_tokens_seen": 8375904, "step": 6895 }, { "epoch": 0.8645533141210374, "grad_norm": 1.3253471851348877, "learning_rate": 4.322140082696404e-06, "loss": 0.4555, "num_input_tokens_seen": 8381376, "step": 6900 }, { "epoch": 0.8651798020298208, "grad_norm": 1.250669240951538, "learning_rate": 4.325272522240321e-06, "loss": 0.4803, "num_input_tokens_seen": 8387104, "step": 6905 }, { "epoch": 0.8658062899386042, "grad_norm": 0.7176247835159302, "learning_rate": 4.328404961784238e-06, "loss": 0.4562, "num_input_tokens_seen": 8393312, "step": 6910 }, { "epoch": 0.8664327778473876, "grad_norm": 0.7406830787658691, "learning_rate": 4.3315374013281545e-06, "loss": 0.4573, "num_input_tokens_seen": 8399552, "step": 6915 }, { "epoch": 0.8670592657561709, "grad_norm": 0.8121256232261658, "learning_rate": 4.334669840872071e-06, "loss": 0.4578, "num_input_tokens_seen": 8405728, "step": 6920 }, { "epoch": 0.8676857536649543, "grad_norm": 0.7395899891853333, "learning_rate": 4.337802280415988e-06, "loss": 0.4611, "num_input_tokens_seen": 8411648, "step": 6925 }, { "epoch": 0.8683122415737377, "grad_norm": 1.2582106590270996, "learning_rate": 4.340934719959905e-06, "loss": 0.4807, "num_input_tokens_seen": 8417856, "step": 6930 }, { "epoch": 0.8689387294825209, "grad_norm": 0.6473917365074158, "learning_rate": 4.344067159503822e-06, "loss": 0.4634, "num_input_tokens_seen": 8424032, "step": 6935 }, { "epoch": 0.8695652173913043, "grad_norm": 0.3995186686515808, "learning_rate": 4.347199599047739e-06, "loss": 0.4623, "num_input_tokens_seen": 8430272, "step": 6940 }, { "epoch": 0.8701917053000877, "grad_norm": 0.7183041572570801, "learning_rate": 4.350332038591655e-06, "loss": 0.4633, "num_input_tokens_seen": 8436192, "step": 6945 }, { "epoch": 0.8708181932088711, "grad_norm": 0.75644850730896, "learning_rate": 4.353464478135572e-06, "loss": 0.4653, "num_input_tokens_seen": 8442400, "step": 6950 }, { "epoch": 0.8714446811176544, "grad_norm": 1.0226281881332397, "learning_rate": 4.3565969176794895e-06, "loss": 0.4671, "num_input_tokens_seen": 8448992, "step": 6955 }, { "epoch": 0.8720711690264378, "grad_norm": 0.7523447275161743, "learning_rate": 4.359729357223406e-06, "loss": 0.4651, "num_input_tokens_seen": 8455168, "step": 6960 }, { "epoch": 0.8726976569352212, "grad_norm": 0.8550127744674683, "learning_rate": 4.362861796767322e-06, "loss": 0.4659, "num_input_tokens_seen": 8460960, "step": 6965 }, { "epoch": 0.8733241448440046, "grad_norm": 0.8295575380325317, "learning_rate": 4.36599423631124e-06, "loss": 0.467, "num_input_tokens_seen": 8467200, "step": 6970 }, { "epoch": 0.8739506327527878, "grad_norm": 0.6648386120796204, "learning_rate": 4.3691266758551565e-06, "loss": 0.467, "num_input_tokens_seen": 8472832, "step": 6975 }, { "epoch": 0.8745771206615712, "grad_norm": 0.583457887172699, "learning_rate": 4.372259115399073e-06, "loss": 0.4611, "num_input_tokens_seen": 8479232, "step": 6980 }, { "epoch": 0.8752036085703546, "grad_norm": 0.7172260880470276, "learning_rate": 4.37539155494299e-06, "loss": 0.4546, "num_input_tokens_seen": 8485344, "step": 6985 }, { "epoch": 0.8758300964791379, "grad_norm": 0.4003543257713318, "learning_rate": 4.378523994486907e-06, "loss": 0.4661, "num_input_tokens_seen": 8491776, "step": 6990 }, { "epoch": 0.8764565843879213, "grad_norm": 0.7832958698272705, "learning_rate": 4.381656434030824e-06, "loss": 0.4717, "num_input_tokens_seen": 8498304, "step": 6995 }, { "epoch": 0.8770830722967047, "grad_norm": 0.5568743944168091, "learning_rate": 4.38478887357474e-06, "loss": 0.4666, "num_input_tokens_seen": 8504352, "step": 7000 }, { "epoch": 0.8777095602054881, "grad_norm": 0.8757283687591553, "learning_rate": 4.387921313118657e-06, "loss": 0.4711, "num_input_tokens_seen": 8510432, "step": 7005 }, { "epoch": 0.8783360481142714, "grad_norm": 1.3176308870315552, "learning_rate": 4.391053752662574e-06, "loss": 0.477, "num_input_tokens_seen": 8516576, "step": 7010 }, { "epoch": 0.8789625360230547, "grad_norm": 0.6921993494033813, "learning_rate": 4.394186192206491e-06, "loss": 0.4654, "num_input_tokens_seen": 8522528, "step": 7015 }, { "epoch": 0.8795890239318381, "grad_norm": 0.5196986198425293, "learning_rate": 4.397318631750407e-06, "loss": 0.4669, "num_input_tokens_seen": 8528864, "step": 7020 }, { "epoch": 0.8802155118406215, "grad_norm": 0.43576064705848694, "learning_rate": 4.400451071294324e-06, "loss": 0.4575, "num_input_tokens_seen": 8535008, "step": 7025 }, { "epoch": 0.8808419997494048, "grad_norm": 0.35992494225502014, "learning_rate": 4.403583510838241e-06, "loss": 0.4604, "num_input_tokens_seen": 8540896, "step": 7030 }, { "epoch": 0.8814684876581882, "grad_norm": 0.7134178280830383, "learning_rate": 4.406715950382158e-06, "loss": 0.4677, "num_input_tokens_seen": 8546976, "step": 7035 }, { "epoch": 0.8820949755669716, "grad_norm": 0.6087140440940857, "learning_rate": 4.4098483899260745e-06, "loss": 0.4678, "num_input_tokens_seen": 8553120, "step": 7040 }, { "epoch": 0.8827214634757549, "grad_norm": 0.9216071367263794, "learning_rate": 4.412980829469991e-06, "loss": 0.4603, "num_input_tokens_seen": 8559200, "step": 7045 }, { "epoch": 0.8833479513845383, "grad_norm": 0.5589913725852966, "learning_rate": 4.416113269013909e-06, "loss": 0.4607, "num_input_tokens_seen": 8565280, "step": 7050 }, { "epoch": 0.8839744392933216, "grad_norm": 0.48779189586639404, "learning_rate": 4.419245708557825e-06, "loss": 0.4646, "num_input_tokens_seen": 8570880, "step": 7055 }, { "epoch": 0.884600927202105, "grad_norm": 0.9082843661308289, "learning_rate": 4.422378148101742e-06, "loss": 0.4693, "num_input_tokens_seen": 8576672, "step": 7060 }, { "epoch": 0.8852274151108883, "grad_norm": 0.5251635909080505, "learning_rate": 4.425510587645659e-06, "loss": 0.4583, "num_input_tokens_seen": 8581824, "step": 7065 }, { "epoch": 0.8858539030196717, "grad_norm": 0.6207118630409241, "learning_rate": 4.428643027189576e-06, "loss": 0.4648, "num_input_tokens_seen": 8587296, "step": 7070 }, { "epoch": 0.8864803909284551, "grad_norm": 0.6499198079109192, "learning_rate": 4.431775466733493e-06, "loss": 0.4609, "num_input_tokens_seen": 8593632, "step": 7075 }, { "epoch": 0.8871068788372385, "grad_norm": 0.5703948736190796, "learning_rate": 4.434907906277409e-06, "loss": 0.4561, "num_input_tokens_seen": 8600032, "step": 7080 }, { "epoch": 0.8877333667460218, "grad_norm": 0.6789901256561279, "learning_rate": 4.438040345821326e-06, "loss": 0.4666, "num_input_tokens_seen": 8606240, "step": 7085 }, { "epoch": 0.8883598546548052, "grad_norm": 0.6040149927139282, "learning_rate": 4.441172785365243e-06, "loss": 0.4637, "num_input_tokens_seen": 8612416, "step": 7090 }, { "epoch": 0.8889863425635885, "grad_norm": 0.972006618976593, "learning_rate": 4.44430522490916e-06, "loss": 0.4643, "num_input_tokens_seen": 8618400, "step": 7095 }, { "epoch": 0.8896128304723718, "grad_norm": 1.2302379608154297, "learning_rate": 4.4474376644530765e-06, "loss": 0.4575, "num_input_tokens_seen": 8624672, "step": 7100 }, { "epoch": 0.8902393183811552, "grad_norm": 0.7411768436431885, "learning_rate": 4.450570103996993e-06, "loss": 0.4537, "num_input_tokens_seen": 8630592, "step": 7105 }, { "epoch": 0.8908658062899386, "grad_norm": 0.4578804075717926, "learning_rate": 4.45370254354091e-06, "loss": 0.4628, "num_input_tokens_seen": 8636608, "step": 7110 }, { "epoch": 0.891492294198722, "grad_norm": 0.9639644622802734, "learning_rate": 4.456834983084827e-06, "loss": 0.467, "num_input_tokens_seen": 8643104, "step": 7115 }, { "epoch": 0.8921187821075053, "grad_norm": 0.6158509254455566, "learning_rate": 4.4599674226287435e-06, "loss": 0.4578, "num_input_tokens_seen": 8648928, "step": 7120 }, { "epoch": 0.8927452700162887, "grad_norm": 0.44799482822418213, "learning_rate": 4.46309986217266e-06, "loss": 0.4683, "num_input_tokens_seen": 8655104, "step": 7125 }, { "epoch": 0.8933717579250721, "grad_norm": 1.3951711654663086, "learning_rate": 4.466232301716577e-06, "loss": 0.4732, "num_input_tokens_seen": 8661632, "step": 7130 }, { "epoch": 0.8939982458338555, "grad_norm": 1.0531423091888428, "learning_rate": 4.469364741260494e-06, "loss": 0.4511, "num_input_tokens_seen": 8667296, "step": 7135 }, { "epoch": 0.8946247337426387, "grad_norm": 0.6116869449615479, "learning_rate": 4.4724971808044106e-06, "loss": 0.4532, "num_input_tokens_seen": 8673088, "step": 7140 }, { "epoch": 0.8952512216514221, "grad_norm": 0.5982307195663452, "learning_rate": 4.475629620348327e-06, "loss": 0.4781, "num_input_tokens_seen": 8678912, "step": 7145 }, { "epoch": 0.8958777095602055, "grad_norm": 0.7497063875198364, "learning_rate": 4.478762059892244e-06, "loss": 0.4658, "num_input_tokens_seen": 8685024, "step": 7150 }, { "epoch": 0.8965041974689888, "grad_norm": 0.9417251944541931, "learning_rate": 4.481894499436162e-06, "loss": 0.4583, "num_input_tokens_seen": 8691168, "step": 7155 }, { "epoch": 0.8971306853777722, "grad_norm": 0.7942788004875183, "learning_rate": 4.4850269389800785e-06, "loss": 0.4753, "num_input_tokens_seen": 8696544, "step": 7160 }, { "epoch": 0.8977571732865556, "grad_norm": 0.44034209847450256, "learning_rate": 4.488159378523994e-06, "loss": 0.4679, "num_input_tokens_seen": 8702848, "step": 7165 }, { "epoch": 0.898383661195339, "grad_norm": 0.6517677307128906, "learning_rate": 4.491291818067912e-06, "loss": 0.461, "num_input_tokens_seen": 8709344, "step": 7170 }, { "epoch": 0.8990101491041222, "grad_norm": 0.43625935912132263, "learning_rate": 4.494424257611829e-06, "loss": 0.4606, "num_input_tokens_seen": 8715456, "step": 7175 }, { "epoch": 0.8996366370129056, "grad_norm": 0.6542662382125854, "learning_rate": 4.4975566971557455e-06, "loss": 0.4563, "num_input_tokens_seen": 8721152, "step": 7180 }, { "epoch": 0.900263124921689, "grad_norm": 0.6838401556015015, "learning_rate": 4.500689136699662e-06, "loss": 0.4551, "num_input_tokens_seen": 8727264, "step": 7185 }, { "epoch": 0.9008896128304724, "grad_norm": 0.95448237657547, "learning_rate": 4.503821576243579e-06, "loss": 0.481, "num_input_tokens_seen": 8733184, "step": 7190 }, { "epoch": 0.9015161007392557, "grad_norm": 1.3574542999267578, "learning_rate": 4.506954015787496e-06, "loss": 0.466, "num_input_tokens_seen": 8738592, "step": 7195 }, { "epoch": 0.9021425886480391, "grad_norm": 0.8036932945251465, "learning_rate": 4.5100864553314126e-06, "loss": 0.4593, "num_input_tokens_seen": 8744864, "step": 7200 }, { "epoch": 0.9027690765568225, "grad_norm": 0.8709162473678589, "learning_rate": 4.513218894875329e-06, "loss": 0.4697, "num_input_tokens_seen": 8751200, "step": 7205 }, { "epoch": 0.9033955644656058, "grad_norm": 0.44469448924064636, "learning_rate": 4.516351334419246e-06, "loss": 0.4769, "num_input_tokens_seen": 8757632, "step": 7210 }, { "epoch": 0.9040220523743892, "grad_norm": 0.62397301197052, "learning_rate": 4.519483773963163e-06, "loss": 0.4493, "num_input_tokens_seen": 8763744, "step": 7215 }, { "epoch": 0.9046485402831725, "grad_norm": 0.5018338561058044, "learning_rate": 4.52261621350708e-06, "loss": 0.4602, "num_input_tokens_seen": 8770144, "step": 7220 }, { "epoch": 0.9052750281919559, "grad_norm": 0.7235221862792969, "learning_rate": 4.525748653050996e-06, "loss": 0.4588, "num_input_tokens_seen": 8776256, "step": 7225 }, { "epoch": 0.9059015161007392, "grad_norm": 0.9472614526748657, "learning_rate": 4.528881092594913e-06, "loss": 0.4572, "num_input_tokens_seen": 8782528, "step": 7230 }, { "epoch": 0.9065280040095226, "grad_norm": 0.5119396448135376, "learning_rate": 4.53201353213883e-06, "loss": 0.4559, "num_input_tokens_seen": 8788512, "step": 7235 }, { "epoch": 0.907154491918306, "grad_norm": 1.2751221656799316, "learning_rate": 4.535145971682747e-06, "loss": 0.4729, "num_input_tokens_seen": 8794816, "step": 7240 }, { "epoch": 0.9077809798270894, "grad_norm": 0.38067737221717834, "learning_rate": 4.5382784112266634e-06, "loss": 0.4703, "num_input_tokens_seen": 8801056, "step": 7245 }, { "epoch": 0.9084074677358727, "grad_norm": 0.6313326954841614, "learning_rate": 4.541410850770581e-06, "loss": 0.466, "num_input_tokens_seen": 8807264, "step": 7250 }, { "epoch": 0.909033955644656, "grad_norm": 0.746897280216217, "learning_rate": 4.544543290314497e-06, "loss": 0.4647, "num_input_tokens_seen": 8813376, "step": 7255 }, { "epoch": 0.9096604435534394, "grad_norm": 0.4484128952026367, "learning_rate": 4.547675729858414e-06, "loss": 0.4611, "num_input_tokens_seen": 8819392, "step": 7260 }, { "epoch": 0.9102869314622227, "grad_norm": 1.1022740602493286, "learning_rate": 4.550808169402331e-06, "loss": 0.4681, "num_input_tokens_seen": 8825504, "step": 7265 }, { "epoch": 0.9109134193710061, "grad_norm": 0.9900858402252197, "learning_rate": 4.553940608946248e-06, "loss": 0.4689, "num_input_tokens_seen": 8831328, "step": 7270 }, { "epoch": 0.9115399072797895, "grad_norm": 0.7198054790496826, "learning_rate": 4.557073048490164e-06, "loss": 0.4713, "num_input_tokens_seen": 8837088, "step": 7275 }, { "epoch": 0.9121663951885729, "grad_norm": 0.9387272000312805, "learning_rate": 4.560205488034082e-06, "loss": 0.4603, "num_input_tokens_seen": 8843264, "step": 7280 }, { "epoch": 0.9127928830973562, "grad_norm": 0.8412942886352539, "learning_rate": 4.563337927577998e-06, "loss": 0.4573, "num_input_tokens_seen": 8849312, "step": 7285 }, { "epoch": 0.9134193710061396, "grad_norm": 0.6351922154426575, "learning_rate": 4.566470367121915e-06, "loss": 0.4625, "num_input_tokens_seen": 8855296, "step": 7290 }, { "epoch": 0.914045858914923, "grad_norm": 0.5250082612037659, "learning_rate": 4.569602806665832e-06, "loss": 0.4575, "num_input_tokens_seen": 8860928, "step": 7295 }, { "epoch": 0.9146723468237064, "grad_norm": 0.9579510688781738, "learning_rate": 4.572735246209749e-06, "loss": 0.4517, "num_input_tokens_seen": 8866944, "step": 7300 }, { "epoch": 0.9152988347324896, "grad_norm": 0.6900564432144165, "learning_rate": 4.5758676857536654e-06, "loss": 0.4632, "num_input_tokens_seen": 8873152, "step": 7305 }, { "epoch": 0.915925322641273, "grad_norm": 0.8868797421455383, "learning_rate": 4.579000125297582e-06, "loss": 0.4548, "num_input_tokens_seen": 8879072, "step": 7310 }, { "epoch": 0.9165518105500564, "grad_norm": 0.7276254296302795, "learning_rate": 4.582132564841499e-06, "loss": 0.4469, "num_input_tokens_seen": 8885472, "step": 7315 }, { "epoch": 0.9171782984588397, "grad_norm": 0.882649838924408, "learning_rate": 4.585265004385416e-06, "loss": 0.4521, "num_input_tokens_seen": 8891616, "step": 7320 }, { "epoch": 0.9178047863676231, "grad_norm": 0.7282215356826782, "learning_rate": 4.5883974439293325e-06, "loss": 0.4723, "num_input_tokens_seen": 8897536, "step": 7325 }, { "epoch": 0.9184312742764065, "grad_norm": 1.1623823642730713, "learning_rate": 4.591529883473249e-06, "loss": 0.474, "num_input_tokens_seen": 8903776, "step": 7330 }, { "epoch": 0.9190577621851899, "grad_norm": 0.6890787482261658, "learning_rate": 4.594662323017166e-06, "loss": 0.4964, "num_input_tokens_seen": 8909472, "step": 7335 }, { "epoch": 0.9196842500939731, "grad_norm": 0.6073508858680725, "learning_rate": 4.597794762561083e-06, "loss": 0.4671, "num_input_tokens_seen": 8915712, "step": 7340 }, { "epoch": 0.9203107380027565, "grad_norm": 0.6210227012634277, "learning_rate": 4.600927202105e-06, "loss": 0.4555, "num_input_tokens_seen": 8921824, "step": 7345 }, { "epoch": 0.9209372259115399, "grad_norm": 0.5638537406921387, "learning_rate": 4.604059641648916e-06, "loss": 0.4705, "num_input_tokens_seen": 8927872, "step": 7350 }, { "epoch": 0.9215637138203233, "grad_norm": 0.4472627341747284, "learning_rate": 4.607192081192833e-06, "loss": 0.4669, "num_input_tokens_seen": 8934176, "step": 7355 }, { "epoch": 0.9221902017291066, "grad_norm": 0.6049366593360901, "learning_rate": 4.610324520736751e-06, "loss": 0.4685, "num_input_tokens_seen": 8940096, "step": 7360 }, { "epoch": 0.92281668963789, "grad_norm": 0.580738365650177, "learning_rate": 4.613456960280667e-06, "loss": 0.4574, "num_input_tokens_seen": 8946624, "step": 7365 }, { "epoch": 0.9234431775466734, "grad_norm": 0.5496968626976013, "learning_rate": 4.616589399824583e-06, "loss": 0.4573, "num_input_tokens_seen": 8952896, "step": 7370 }, { "epoch": 0.9240696654554567, "grad_norm": 0.5619196891784668, "learning_rate": 4.619721839368501e-06, "loss": 0.4654, "num_input_tokens_seen": 8958976, "step": 7375 }, { "epoch": 0.92469615336424, "grad_norm": 0.5535270571708679, "learning_rate": 4.622854278912418e-06, "loss": 0.4644, "num_input_tokens_seen": 8964832, "step": 7380 }, { "epoch": 0.9253226412730234, "grad_norm": 0.709813117980957, "learning_rate": 4.625986718456334e-06, "loss": 0.4562, "num_input_tokens_seen": 8970304, "step": 7385 }, { "epoch": 0.9259491291818068, "grad_norm": 0.7080422043800354, "learning_rate": 4.629119158000251e-06, "loss": 0.4695, "num_input_tokens_seen": 8976512, "step": 7390 }, { "epoch": 0.9265756170905901, "grad_norm": 0.6725238561630249, "learning_rate": 4.632251597544168e-06, "loss": 0.4579, "num_input_tokens_seen": 8982592, "step": 7395 }, { "epoch": 0.9272021049993735, "grad_norm": 0.5047391653060913, "learning_rate": 4.635384037088085e-06, "loss": 0.4625, "num_input_tokens_seen": 8988768, "step": 7400 }, { "epoch": 0.9278285929081569, "grad_norm": 1.3084036111831665, "learning_rate": 4.6385164766320015e-06, "loss": 0.4718, "num_input_tokens_seen": 8994432, "step": 7405 }, { "epoch": 0.9284550808169403, "grad_norm": 0.31444787979125977, "learning_rate": 4.641648916175918e-06, "loss": 0.4608, "num_input_tokens_seen": 9000704, "step": 7410 }, { "epoch": 0.9290815687257236, "grad_norm": 0.36583995819091797, "learning_rate": 4.644781355719835e-06, "loss": 0.4625, "num_input_tokens_seen": 9006784, "step": 7415 }, { "epoch": 0.929708056634507, "grad_norm": 0.8298815488815308, "learning_rate": 4.647913795263752e-06, "loss": 0.4681, "num_input_tokens_seen": 9012928, "step": 7420 }, { "epoch": 0.9303345445432903, "grad_norm": 0.4979163408279419, "learning_rate": 4.651046234807669e-06, "loss": 0.4763, "num_input_tokens_seen": 9019040, "step": 7425 }, { "epoch": 0.9309610324520736, "grad_norm": 0.5467909574508667, "learning_rate": 4.654178674351585e-06, "loss": 0.4645, "num_input_tokens_seen": 9025120, "step": 7430 }, { "epoch": 0.931587520360857, "grad_norm": 0.7424190640449524, "learning_rate": 4.657311113895502e-06, "loss": 0.4664, "num_input_tokens_seen": 9031488, "step": 7435 }, { "epoch": 0.9322140082696404, "grad_norm": 0.48962390422821045, "learning_rate": 4.660443553439419e-06, "loss": 0.4624, "num_input_tokens_seen": 9037824, "step": 7440 }, { "epoch": 0.9328404961784238, "grad_norm": 0.646307647228241, "learning_rate": 4.663575992983336e-06, "loss": 0.4676, "num_input_tokens_seen": 9043936, "step": 7445 }, { "epoch": 0.9334669840872071, "grad_norm": 0.5462340712547302, "learning_rate": 4.666708432527252e-06, "loss": 0.4556, "num_input_tokens_seen": 9049920, "step": 7450 }, { "epoch": 0.9340934719959905, "grad_norm": 0.8321511745452881, "learning_rate": 4.66984087207117e-06, "loss": 0.473, "num_input_tokens_seen": 9055936, "step": 7455 }, { "epoch": 0.9347199599047739, "grad_norm": 0.7353525161743164, "learning_rate": 4.672973311615086e-06, "loss": 0.4716, "num_input_tokens_seen": 9062240, "step": 7460 }, { "epoch": 0.9353464478135572, "grad_norm": 1.0301052331924438, "learning_rate": 4.676105751159003e-06, "loss": 0.4655, "num_input_tokens_seen": 9068064, "step": 7465 }, { "epoch": 0.9359729357223405, "grad_norm": 0.603150486946106, "learning_rate": 4.67923819070292e-06, "loss": 0.4627, "num_input_tokens_seen": 9074592, "step": 7470 }, { "epoch": 0.9365994236311239, "grad_norm": 0.6503348350524902, "learning_rate": 4.682370630246836e-06, "loss": 0.4633, "num_input_tokens_seen": 9081184, "step": 7475 }, { "epoch": 0.9372259115399073, "grad_norm": 0.6620173454284668, "learning_rate": 4.685503069790753e-06, "loss": 0.4596, "num_input_tokens_seen": 9087424, "step": 7480 }, { "epoch": 0.9378523994486906, "grad_norm": 0.7596690058708191, "learning_rate": 4.688635509334671e-06, "loss": 0.4653, "num_input_tokens_seen": 9093568, "step": 7485 }, { "epoch": 0.938478887357474, "grad_norm": 0.3569457232952118, "learning_rate": 4.691767948878587e-06, "loss": 0.4648, "num_input_tokens_seen": 9099648, "step": 7490 }, { "epoch": 0.9391053752662574, "grad_norm": 0.6574710607528687, "learning_rate": 4.694900388422503e-06, "loss": 0.4667, "num_input_tokens_seen": 9106112, "step": 7495 }, { "epoch": 0.9397318631750408, "grad_norm": 0.9326300621032715, "learning_rate": 4.698032827966421e-06, "loss": 0.4624, "num_input_tokens_seen": 9112192, "step": 7500 }, { "epoch": 0.940358351083824, "grad_norm": 0.7200841307640076, "learning_rate": 4.701165267510338e-06, "loss": 0.4642, "num_input_tokens_seen": 9118528, "step": 7505 }, { "epoch": 0.9409848389926074, "grad_norm": 1.0743865966796875, "learning_rate": 4.704297707054254e-06, "loss": 0.4661, "num_input_tokens_seen": 9124672, "step": 7510 }, { "epoch": 0.9416113269013908, "grad_norm": 0.7223617434501648, "learning_rate": 4.707430146598171e-06, "loss": 0.4617, "num_input_tokens_seen": 9130656, "step": 7515 }, { "epoch": 0.9422378148101742, "grad_norm": 1.050255537033081, "learning_rate": 4.710562586142088e-06, "loss": 0.444, "num_input_tokens_seen": 9137376, "step": 7520 }, { "epoch": 0.9428643027189575, "grad_norm": 0.8637580871582031, "learning_rate": 4.713695025686005e-06, "loss": 0.4756, "num_input_tokens_seen": 9143360, "step": 7525 }, { "epoch": 0.9434907906277409, "grad_norm": 0.5773609280586243, "learning_rate": 4.7168274652299215e-06, "loss": 0.4479, "num_input_tokens_seen": 9149632, "step": 7530 }, { "epoch": 0.9441172785365243, "grad_norm": 1.2799921035766602, "learning_rate": 4.719959904773838e-06, "loss": 0.4651, "num_input_tokens_seen": 9155744, "step": 7535 }, { "epoch": 0.9447437664453076, "grad_norm": 1.113827109336853, "learning_rate": 4.723092344317755e-06, "loss": 0.4865, "num_input_tokens_seen": 9161792, "step": 7540 }, { "epoch": 0.9453702543540909, "grad_norm": 1.4958747625350952, "learning_rate": 4.726224783861672e-06, "loss": 0.4625, "num_input_tokens_seen": 9167936, "step": 7545 }, { "epoch": 0.9459967422628743, "grad_norm": 0.7663899064064026, "learning_rate": 4.7293572234055885e-06, "loss": 0.4712, "num_input_tokens_seen": 9174112, "step": 7550 }, { "epoch": 0.9466232301716577, "grad_norm": 0.5541816353797913, "learning_rate": 4.732489662949505e-06, "loss": 0.4576, "num_input_tokens_seen": 9179808, "step": 7555 }, { "epoch": 0.947249718080441, "grad_norm": 0.5841342210769653, "learning_rate": 4.735622102493422e-06, "loss": 0.4642, "num_input_tokens_seen": 9185984, "step": 7560 }, { "epoch": 0.9478762059892244, "grad_norm": 0.6385080218315125, "learning_rate": 4.73875454203734e-06, "loss": 0.4694, "num_input_tokens_seen": 9191808, "step": 7565 }, { "epoch": 0.9485026938980078, "grad_norm": 0.4354327917098999, "learning_rate": 4.7418869815812556e-06, "loss": 0.4597, "num_input_tokens_seen": 9198112, "step": 7570 }, { "epoch": 0.9491291818067912, "grad_norm": 0.4715420603752136, "learning_rate": 4.745019421125172e-06, "loss": 0.4598, "num_input_tokens_seen": 9204288, "step": 7575 }, { "epoch": 0.9497556697155745, "grad_norm": 0.690398097038269, "learning_rate": 4.74815186066909e-06, "loss": 0.4659, "num_input_tokens_seen": 9210400, "step": 7580 }, { "epoch": 0.9503821576243578, "grad_norm": 0.6553381085395813, "learning_rate": 4.751284300213006e-06, "loss": 0.462, "num_input_tokens_seen": 9216384, "step": 7585 }, { "epoch": 0.9510086455331412, "grad_norm": 0.48882368206977844, "learning_rate": 4.754416739756923e-06, "loss": 0.4674, "num_input_tokens_seen": 9222592, "step": 7590 }, { "epoch": 0.9516351334419245, "grad_norm": 0.7075697183609009, "learning_rate": 4.75754917930084e-06, "loss": 0.4577, "num_input_tokens_seen": 9228704, "step": 7595 }, { "epoch": 0.9522616213507079, "grad_norm": 0.25285571813583374, "learning_rate": 4.760681618844757e-06, "loss": 0.4733, "num_input_tokens_seen": 9234816, "step": 7600 }, { "epoch": 0.9528881092594913, "grad_norm": 0.4224819839000702, "learning_rate": 4.763814058388673e-06, "loss": 0.4636, "num_input_tokens_seen": 9241056, "step": 7605 }, { "epoch": 0.9535145971682747, "grad_norm": 0.5001590251922607, "learning_rate": 4.7669464979325905e-06, "loss": 0.4616, "num_input_tokens_seen": 9247136, "step": 7610 }, { "epoch": 0.954141085077058, "grad_norm": 0.8041774034500122, "learning_rate": 4.770078937476507e-06, "loss": 0.4638, "num_input_tokens_seen": 9253408, "step": 7615 }, { "epoch": 0.9547675729858414, "grad_norm": 0.7198663353919983, "learning_rate": 4.773211377020424e-06, "loss": 0.4604, "num_input_tokens_seen": 9259104, "step": 7620 }, { "epoch": 0.9553940608946248, "grad_norm": 0.5300973057746887, "learning_rate": 4.776343816564341e-06, "loss": 0.4605, "num_input_tokens_seen": 9265152, "step": 7625 }, { "epoch": 0.9560205488034081, "grad_norm": 0.4394083619117737, "learning_rate": 4.779476256108258e-06, "loss": 0.4671, "num_input_tokens_seen": 9271104, "step": 7630 }, { "epoch": 0.9566470367121914, "grad_norm": 0.5191137194633484, "learning_rate": 4.782608695652174e-06, "loss": 0.4462, "num_input_tokens_seen": 9277376, "step": 7635 }, { "epoch": 0.9572735246209748, "grad_norm": 0.28366580605506897, "learning_rate": 4.785741135196091e-06, "loss": 0.4668, "num_input_tokens_seen": 9283776, "step": 7640 }, { "epoch": 0.9579000125297582, "grad_norm": 0.5857676863670349, "learning_rate": 4.788873574740008e-06, "loss": 0.4653, "num_input_tokens_seen": 9290144, "step": 7645 }, { "epoch": 0.9585265004385415, "grad_norm": 0.8491039872169495, "learning_rate": 4.792006014283925e-06, "loss": 0.4608, "num_input_tokens_seen": 9296352, "step": 7650 }, { "epoch": 0.9591529883473249, "grad_norm": 0.8770455718040466, "learning_rate": 4.795138453827841e-06, "loss": 0.4622, "num_input_tokens_seen": 9302336, "step": 7655 }, { "epoch": 0.9597794762561083, "grad_norm": 0.24350020289421082, "learning_rate": 4.798270893371758e-06, "loss": 0.4758, "num_input_tokens_seen": 9308672, "step": 7660 }, { "epoch": 0.9604059641648917, "grad_norm": 0.5927849411964417, "learning_rate": 4.801403332915675e-06, "loss": 0.4651, "num_input_tokens_seen": 9314720, "step": 7665 }, { "epoch": 0.9610324520736749, "grad_norm": 0.6014406681060791, "learning_rate": 4.804535772459592e-06, "loss": 0.4632, "num_input_tokens_seen": 9320544, "step": 7670 }, { "epoch": 0.9616589399824583, "grad_norm": 0.5727958679199219, "learning_rate": 4.807668212003509e-06, "loss": 0.4632, "num_input_tokens_seen": 9326880, "step": 7675 }, { "epoch": 0.9622854278912417, "grad_norm": 0.9228606224060059, "learning_rate": 4.810800651547425e-06, "loss": 0.4639, "num_input_tokens_seen": 9332832, "step": 7680 }, { "epoch": 0.9629119158000251, "grad_norm": 0.9369730949401855, "learning_rate": 4.813933091091342e-06, "loss": 0.4651, "num_input_tokens_seen": 9338848, "step": 7685 }, { "epoch": 0.9635384037088084, "grad_norm": 0.5482061505317688, "learning_rate": 4.81706553063526e-06, "loss": 0.4671, "num_input_tokens_seen": 9344416, "step": 7690 }, { "epoch": 0.9641648916175918, "grad_norm": 0.813299834728241, "learning_rate": 4.8201979701791755e-06, "loss": 0.456, "num_input_tokens_seen": 9350656, "step": 7695 }, { "epoch": 0.9647913795263752, "grad_norm": 0.5137706995010376, "learning_rate": 4.823330409723092e-06, "loss": 0.4675, "num_input_tokens_seen": 9356704, "step": 7700 }, { "epoch": 0.9654178674351584, "grad_norm": 0.5585986971855164, "learning_rate": 4.82646284926701e-06, "loss": 0.4728, "num_input_tokens_seen": 9363200, "step": 7705 }, { "epoch": 0.9660443553439418, "grad_norm": 0.537692129611969, "learning_rate": 4.829595288810927e-06, "loss": 0.4594, "num_input_tokens_seen": 9368640, "step": 7710 }, { "epoch": 0.9666708432527252, "grad_norm": 0.514863133430481, "learning_rate": 4.8327277283548426e-06, "loss": 0.46, "num_input_tokens_seen": 9373760, "step": 7715 }, { "epoch": 0.9672973311615086, "grad_norm": 0.751953125, "learning_rate": 4.83586016789876e-06, "loss": 0.4741, "num_input_tokens_seen": 9379968, "step": 7720 }, { "epoch": 0.9679238190702919, "grad_norm": 0.5576709508895874, "learning_rate": 4.838992607442677e-06, "loss": 0.4672, "num_input_tokens_seen": 9386080, "step": 7725 }, { "epoch": 0.9685503069790753, "grad_norm": 0.4906425178050995, "learning_rate": 4.842125046986594e-06, "loss": 0.4614, "num_input_tokens_seen": 9392384, "step": 7730 }, { "epoch": 0.9691767948878587, "grad_norm": 0.5972575545310974, "learning_rate": 4.8452574865305104e-06, "loss": 0.4605, "num_input_tokens_seen": 9398336, "step": 7735 }, { "epoch": 0.9698032827966421, "grad_norm": 0.40563124418258667, "learning_rate": 4.848389926074427e-06, "loss": 0.4675, "num_input_tokens_seen": 9404352, "step": 7740 }, { "epoch": 0.9704297707054254, "grad_norm": 0.34674251079559326, "learning_rate": 4.851522365618344e-06, "loss": 0.46, "num_input_tokens_seen": 9410880, "step": 7745 }, { "epoch": 0.9710562586142087, "grad_norm": 0.45829057693481445, "learning_rate": 4.854654805162261e-06, "loss": 0.4708, "num_input_tokens_seen": 9417184, "step": 7750 }, { "epoch": 0.9716827465229921, "grad_norm": 0.7212744355201721, "learning_rate": 4.8577872447061775e-06, "loss": 0.4596, "num_input_tokens_seen": 9423168, "step": 7755 }, { "epoch": 0.9723092344317754, "grad_norm": 0.5342515707015991, "learning_rate": 4.860919684250094e-06, "loss": 0.4673, "num_input_tokens_seen": 9429280, "step": 7760 }, { "epoch": 0.9729357223405588, "grad_norm": 0.4671695828437805, "learning_rate": 4.864052123794011e-06, "loss": 0.4645, "num_input_tokens_seen": 9435616, "step": 7765 }, { "epoch": 0.9735622102493422, "grad_norm": 0.4198295772075653, "learning_rate": 4.867184563337928e-06, "loss": 0.4588, "num_input_tokens_seen": 9442112, "step": 7770 }, { "epoch": 0.9741886981581256, "grad_norm": 0.42121872305870056, "learning_rate": 4.8703170028818446e-06, "loss": 0.4625, "num_input_tokens_seen": 9447872, "step": 7775 }, { "epoch": 0.9748151860669089, "grad_norm": 0.5710822939872742, "learning_rate": 4.873449442425761e-06, "loss": 0.4583, "num_input_tokens_seen": 9453888, "step": 7780 }, { "epoch": 0.9754416739756923, "grad_norm": 0.9674049615859985, "learning_rate": 4.876581881969679e-06, "loss": 0.4685, "num_input_tokens_seen": 9460064, "step": 7785 }, { "epoch": 0.9760681618844756, "grad_norm": 0.4814642071723938, "learning_rate": 4.879714321513595e-06, "loss": 0.4662, "num_input_tokens_seen": 9466336, "step": 7790 }, { "epoch": 0.976694649793259, "grad_norm": 0.8873690962791443, "learning_rate": 4.882846761057512e-06, "loss": 0.461, "num_input_tokens_seen": 9472448, "step": 7795 }, { "epoch": 0.9773211377020423, "grad_norm": 0.5574537515640259, "learning_rate": 4.885979200601429e-06, "loss": 0.4669, "num_input_tokens_seen": 9478048, "step": 7800 }, { "epoch": 0.9779476256108257, "grad_norm": 0.7256521582603455, "learning_rate": 4.889111640145345e-06, "loss": 0.4721, "num_input_tokens_seen": 9484288, "step": 7805 }, { "epoch": 0.9785741135196091, "grad_norm": 1.2216418981552124, "learning_rate": 4.892244079689262e-06, "loss": 0.4605, "num_input_tokens_seen": 9490272, "step": 7810 }, { "epoch": 0.9792006014283924, "grad_norm": 0.6075568795204163, "learning_rate": 4.8953765192331795e-06, "loss": 0.4716, "num_input_tokens_seen": 9496160, "step": 7815 }, { "epoch": 0.9798270893371758, "grad_norm": 0.5959070920944214, "learning_rate": 4.898508958777096e-06, "loss": 0.4608, "num_input_tokens_seen": 9502208, "step": 7820 }, { "epoch": 0.9804535772459592, "grad_norm": 0.9007444977760315, "learning_rate": 4.901641398321012e-06, "loss": 0.4633, "num_input_tokens_seen": 9508576, "step": 7825 }, { "epoch": 0.9810800651547426, "grad_norm": 0.5868531465530396, "learning_rate": 4.90477383786493e-06, "loss": 0.4688, "num_input_tokens_seen": 9513984, "step": 7830 }, { "epoch": 0.9817065530635258, "grad_norm": 0.7186844348907471, "learning_rate": 4.9079062774088466e-06, "loss": 0.466, "num_input_tokens_seen": 9519360, "step": 7835 }, { "epoch": 0.9823330409723092, "grad_norm": 0.5310972929000854, "learning_rate": 4.911038716952763e-06, "loss": 0.4596, "num_input_tokens_seen": 9524576, "step": 7840 }, { "epoch": 0.9829595288810926, "grad_norm": 0.6571756601333618, "learning_rate": 4.91417115649668e-06, "loss": 0.4584, "num_input_tokens_seen": 9530848, "step": 7845 }, { "epoch": 0.983586016789876, "grad_norm": 0.5483787059783936, "learning_rate": 4.917303596040597e-06, "loss": 0.4652, "num_input_tokens_seen": 9536704, "step": 7850 }, { "epoch": 0.9842125046986593, "grad_norm": 0.5354673266410828, "learning_rate": 4.920436035584514e-06, "loss": 0.4652, "num_input_tokens_seen": 9543136, "step": 7855 }, { "epoch": 0.9848389926074427, "grad_norm": 0.7056474685668945, "learning_rate": 4.92356847512843e-06, "loss": 0.4638, "num_input_tokens_seen": 9549024, "step": 7860 }, { "epoch": 0.9854654805162261, "grad_norm": 0.4566287696361542, "learning_rate": 4.926700914672347e-06, "loss": 0.4648, "num_input_tokens_seen": 9555008, "step": 7865 }, { "epoch": 0.9860919684250093, "grad_norm": 0.27242177724838257, "learning_rate": 4.929833354216264e-06, "loss": 0.4633, "num_input_tokens_seen": 9560896, "step": 7870 }, { "epoch": 0.9867184563337927, "grad_norm": 0.457589715719223, "learning_rate": 4.932965793760181e-06, "loss": 0.4643, "num_input_tokens_seen": 9567072, "step": 7875 }, { "epoch": 0.9873449442425761, "grad_norm": 0.5249862670898438, "learning_rate": 4.9360982333040974e-06, "loss": 0.4638, "num_input_tokens_seen": 9573248, "step": 7880 }, { "epoch": 0.9879714321513595, "grad_norm": 0.5653167366981506, "learning_rate": 4.939230672848014e-06, "loss": 0.4587, "num_input_tokens_seen": 9579520, "step": 7885 }, { "epoch": 0.9885979200601428, "grad_norm": 0.45031383633613586, "learning_rate": 4.942363112391931e-06, "loss": 0.4609, "num_input_tokens_seen": 9585504, "step": 7890 }, { "epoch": 0.9892244079689262, "grad_norm": 0.6165633797645569, "learning_rate": 4.9454955519358486e-06, "loss": 0.4626, "num_input_tokens_seen": 9591776, "step": 7895 }, { "epoch": 0.9898508958777096, "grad_norm": 0.3051553964614868, "learning_rate": 4.9486279914797645e-06, "loss": 0.4635, "num_input_tokens_seen": 9597792, "step": 7900 }, { "epoch": 0.990477383786493, "grad_norm": 0.28887611627578735, "learning_rate": 4.951760431023681e-06, "loss": 0.4631, "num_input_tokens_seen": 9603872, "step": 7905 }, { "epoch": 0.9911038716952763, "grad_norm": 0.5013357996940613, "learning_rate": 4.954892870567599e-06, "loss": 0.4561, "num_input_tokens_seen": 9610176, "step": 7910 }, { "epoch": 0.9917303596040596, "grad_norm": 1.0343495607376099, "learning_rate": 4.958025310111515e-06, "loss": 0.4659, "num_input_tokens_seen": 9616032, "step": 7915 }, { "epoch": 0.992356847512843, "grad_norm": 0.5344735980033875, "learning_rate": 4.9611577496554315e-06, "loss": 0.4668, "num_input_tokens_seen": 9621600, "step": 7920 }, { "epoch": 0.9929833354216263, "grad_norm": 1.0090014934539795, "learning_rate": 4.964290189199349e-06, "loss": 0.4741, "num_input_tokens_seen": 9627872, "step": 7925 }, { "epoch": 0.9936098233304097, "grad_norm": 0.44117245078086853, "learning_rate": 4.967422628743266e-06, "loss": 0.4639, "num_input_tokens_seen": 9633536, "step": 7930 }, { "epoch": 0.9942363112391931, "grad_norm": 0.5217496752738953, "learning_rate": 4.970555068287183e-06, "loss": 0.4665, "num_input_tokens_seen": 9639904, "step": 7935 }, { "epoch": 0.9948627991479765, "grad_norm": 0.6242879033088684, "learning_rate": 4.9736875078310994e-06, "loss": 0.4605, "num_input_tokens_seen": 9646368, "step": 7940 }, { "epoch": 0.9954892870567598, "grad_norm": 0.2555505037307739, "learning_rate": 4.976819947375016e-06, "loss": 0.4582, "num_input_tokens_seen": 9652320, "step": 7945 }, { "epoch": 0.9961157749655432, "grad_norm": 0.8656823039054871, "learning_rate": 4.979952386918933e-06, "loss": 0.4672, "num_input_tokens_seen": 9658400, "step": 7950 }, { "epoch": 0.9967422628743265, "grad_norm": 0.35428109765052795, "learning_rate": 4.98308482646285e-06, "loss": 0.46, "num_input_tokens_seen": 9664160, "step": 7955 }, { "epoch": 0.9973687507831099, "grad_norm": 0.4384917914867401, "learning_rate": 4.9862172660067665e-06, "loss": 0.4629, "num_input_tokens_seen": 9670080, "step": 7960 }, { "epoch": 0.9979952386918932, "grad_norm": 0.6150863766670227, "learning_rate": 4.989349705550683e-06, "loss": 0.4642, "num_input_tokens_seen": 9676160, "step": 7965 }, { "epoch": 0.9986217266006766, "grad_norm": 0.5150481462478638, "learning_rate": 4.9924821450946e-06, "loss": 0.4638, "num_input_tokens_seen": 9682592, "step": 7970 }, { "epoch": 0.99924821450946, "grad_norm": 0.6008914113044739, "learning_rate": 4.995614584638517e-06, "loss": 0.4648, "num_input_tokens_seen": 9688832, "step": 7975 }, { "epoch": 0.9998747024182433, "grad_norm": 0.5665209293365479, "learning_rate": 4.9987470241824335e-06, "loss": 0.4675, "num_input_tokens_seen": 9695040, "step": 7980 }, { "epoch": 1.0005011903270267, "grad_norm": 0.7253996133804321, "learning_rate": 5.00187946372635e-06, "loss": 0.4634, "num_input_tokens_seen": 9701344, "step": 7985 }, { "epoch": 1.00112767823581, "grad_norm": 0.5393932461738586, "learning_rate": 5.005011903270267e-06, "loss": 0.4593, "num_input_tokens_seen": 9707776, "step": 7990 }, { "epoch": 1.0017541661445934, "grad_norm": 0.4513845443725586, "learning_rate": 5.008144342814184e-06, "loss": 0.457, "num_input_tokens_seen": 9713888, "step": 7995 }, { "epoch": 1.0023806540533768, "grad_norm": 0.47937217354774475, "learning_rate": 5.011276782358101e-06, "loss": 0.4681, "num_input_tokens_seen": 9719584, "step": 8000 }, { "epoch": 1.0030071419621602, "grad_norm": 0.5119696855545044, "learning_rate": 5.014409221902018e-06, "loss": 0.4614, "num_input_tokens_seen": 9725824, "step": 8005 }, { "epoch": 1.0036336298709434, "grad_norm": 0.550232470035553, "learning_rate": 5.017541661445935e-06, "loss": 0.4592, "num_input_tokens_seen": 9731936, "step": 8010 }, { "epoch": 1.0042601177797268, "grad_norm": 0.8878443837165833, "learning_rate": 5.020674100989851e-06, "loss": 0.4593, "num_input_tokens_seen": 9738080, "step": 8015 }, { "epoch": 1.0048866056885102, "grad_norm": 0.6433702111244202, "learning_rate": 5.023806540533768e-06, "loss": 0.472, "num_input_tokens_seen": 9744512, "step": 8020 }, { "epoch": 1.0055130935972936, "grad_norm": 0.6466675400733948, "learning_rate": 5.026938980077684e-06, "loss": 0.4615, "num_input_tokens_seen": 9750560, "step": 8025 }, { "epoch": 1.006139581506077, "grad_norm": 0.44727158546447754, "learning_rate": 5.030071419621602e-06, "loss": 0.4633, "num_input_tokens_seen": 9756512, "step": 8030 }, { "epoch": 1.0067660694148604, "grad_norm": 0.2846938669681549, "learning_rate": 5.033203859165519e-06, "loss": 0.4691, "num_input_tokens_seen": 9762880, "step": 8035 }, { "epoch": 1.0073925573236437, "grad_norm": 0.4998496472835541, "learning_rate": 5.0363362987094355e-06, "loss": 0.4563, "num_input_tokens_seen": 9769280, "step": 8040 }, { "epoch": 1.008019045232427, "grad_norm": 0.6962228417396545, "learning_rate": 5.039468738253352e-06, "loss": 0.4565, "num_input_tokens_seen": 9775744, "step": 8045 }, { "epoch": 1.0086455331412103, "grad_norm": 0.48802557587623596, "learning_rate": 5.042601177797269e-06, "loss": 0.4681, "num_input_tokens_seen": 9781632, "step": 8050 }, { "epoch": 1.0092720210499937, "grad_norm": 0.7157297134399414, "learning_rate": 5.045733617341185e-06, "loss": 0.4683, "num_input_tokens_seen": 9787552, "step": 8055 }, { "epoch": 1.009898508958777, "grad_norm": 0.6948915719985962, "learning_rate": 5.048866056885103e-06, "loss": 0.4637, "num_input_tokens_seen": 9793856, "step": 8060 }, { "epoch": 1.0105249968675605, "grad_norm": 0.5415530800819397, "learning_rate": 5.051998496429019e-06, "loss": 0.4527, "num_input_tokens_seen": 9800064, "step": 8065 }, { "epoch": 1.0111514847763439, "grad_norm": 0.5660191774368286, "learning_rate": 5.055130935972936e-06, "loss": 0.4598, "num_input_tokens_seen": 9806048, "step": 8070 }, { "epoch": 1.0117779726851273, "grad_norm": 0.5157898664474487, "learning_rate": 5.058263375516853e-06, "loss": 0.4646, "num_input_tokens_seen": 9812032, "step": 8075 }, { "epoch": 1.0124044605939106, "grad_norm": 0.3854438364505768, "learning_rate": 5.06139581506077e-06, "loss": 0.4572, "num_input_tokens_seen": 9817984, "step": 8080 }, { "epoch": 1.0130309485026938, "grad_norm": 0.44255053997039795, "learning_rate": 5.064528254604687e-06, "loss": 0.4486, "num_input_tokens_seen": 9824000, "step": 8085 }, { "epoch": 1.0136574364114772, "grad_norm": 1.130235195159912, "learning_rate": 5.067660694148603e-06, "loss": 0.4653, "num_input_tokens_seen": 9829952, "step": 8090 }, { "epoch": 1.0142839243202606, "grad_norm": 0.7351718544960022, "learning_rate": 5.07079313369252e-06, "loss": 0.4609, "num_input_tokens_seen": 9836288, "step": 8095 }, { "epoch": 1.014910412229044, "grad_norm": 0.604134202003479, "learning_rate": 5.073925573236437e-06, "loss": 0.4677, "num_input_tokens_seen": 9842560, "step": 8100 }, { "epoch": 1.0155369001378274, "grad_norm": 0.8909326195716858, "learning_rate": 5.0770580127803535e-06, "loss": 0.4638, "num_input_tokens_seen": 9848800, "step": 8105 }, { "epoch": 1.0161633880466108, "grad_norm": 0.3995985686779022, "learning_rate": 5.080190452324271e-06, "loss": 0.4495, "num_input_tokens_seen": 9855040, "step": 8110 }, { "epoch": 1.0167898759553942, "grad_norm": 0.8465175032615662, "learning_rate": 5.083322891868188e-06, "loss": 0.4808, "num_input_tokens_seen": 9860960, "step": 8115 }, { "epoch": 1.0174163638641773, "grad_norm": 0.34995558857917786, "learning_rate": 5.086455331412105e-06, "loss": 0.4696, "num_input_tokens_seen": 9867264, "step": 8120 }, { "epoch": 1.0180428517729607, "grad_norm": 0.7387290000915527, "learning_rate": 5.0895877709560205e-06, "loss": 0.4603, "num_input_tokens_seen": 9873728, "step": 8125 }, { "epoch": 1.0186693396817441, "grad_norm": 0.32489341497421265, "learning_rate": 5.092720210499937e-06, "loss": 0.4647, "num_input_tokens_seen": 9879872, "step": 8130 }, { "epoch": 1.0192958275905275, "grad_norm": 0.39720940589904785, "learning_rate": 5.095852650043854e-06, "loss": 0.4492, "num_input_tokens_seen": 9885824, "step": 8135 }, { "epoch": 1.019922315499311, "grad_norm": 0.9232814908027649, "learning_rate": 5.098985089587772e-06, "loss": 0.4651, "num_input_tokens_seen": 9891776, "step": 8140 }, { "epoch": 1.0205488034080943, "grad_norm": 1.2799490690231323, "learning_rate": 5.102117529131688e-06, "loss": 0.4671, "num_input_tokens_seen": 9897408, "step": 8145 }, { "epoch": 1.0211752913168777, "grad_norm": 1.574371576309204, "learning_rate": 5.105249968675605e-06, "loss": 0.475, "num_input_tokens_seen": 9903712, "step": 8150 }, { "epoch": 1.0218017792256608, "grad_norm": 0.39894306659698486, "learning_rate": 5.108382408219522e-06, "loss": 0.4735, "num_input_tokens_seen": 9909632, "step": 8155 }, { "epoch": 1.0224282671344442, "grad_norm": 0.6646936535835266, "learning_rate": 5.111514847763439e-06, "loss": 0.4726, "num_input_tokens_seen": 9915840, "step": 8160 }, { "epoch": 1.0230547550432276, "grad_norm": 0.38355058431625366, "learning_rate": 5.114647287307355e-06, "loss": 0.4631, "num_input_tokens_seen": 9921504, "step": 8165 }, { "epoch": 1.023681242952011, "grad_norm": 0.4538065791130066, "learning_rate": 5.117779726851272e-06, "loss": 0.4594, "num_input_tokens_seen": 9927744, "step": 8170 }, { "epoch": 1.0243077308607944, "grad_norm": 0.7378444075584412, "learning_rate": 5.120912166395189e-06, "loss": 0.4637, "num_input_tokens_seen": 9933984, "step": 8175 }, { "epoch": 1.0249342187695778, "grad_norm": 0.9692432284355164, "learning_rate": 5.124044605939106e-06, "loss": 0.4708, "num_input_tokens_seen": 9940192, "step": 8180 }, { "epoch": 1.0255607066783612, "grad_norm": 0.484328031539917, "learning_rate": 5.1271770454830225e-06, "loss": 0.4566, "num_input_tokens_seen": 9946016, "step": 8185 }, { "epoch": 1.0261871945871444, "grad_norm": 0.7106317281723022, "learning_rate": 5.13030948502694e-06, "loss": 0.4597, "num_input_tokens_seen": 9952000, "step": 8190 }, { "epoch": 1.0268136824959277, "grad_norm": 0.2891746759414673, "learning_rate": 5.133441924570857e-06, "loss": 0.4649, "num_input_tokens_seen": 9958208, "step": 8195 }, { "epoch": 1.0274401704047111, "grad_norm": 0.5565407276153564, "learning_rate": 5.136574364114773e-06, "loss": 0.4587, "num_input_tokens_seen": 9963680, "step": 8200 }, { "epoch": 1.0280666583134945, "grad_norm": 0.6408119201660156, "learning_rate": 5.1397068036586896e-06, "loss": 0.4629, "num_input_tokens_seen": 9969760, "step": 8205 }, { "epoch": 1.028693146222278, "grad_norm": 0.4880157709121704, "learning_rate": 5.142839243202606e-06, "loss": 0.4654, "num_input_tokens_seen": 9975840, "step": 8210 }, { "epoch": 1.0293196341310613, "grad_norm": 0.5818421840667725, "learning_rate": 5.145971682746523e-06, "loss": 0.4668, "num_input_tokens_seen": 9982176, "step": 8215 }, { "epoch": 1.0299461220398447, "grad_norm": 0.7287420034408569, "learning_rate": 5.149104122290441e-06, "loss": 0.4592, "num_input_tokens_seen": 9988160, "step": 8220 }, { "epoch": 1.030572609948628, "grad_norm": 0.6517755389213562, "learning_rate": 5.1522365618343575e-06, "loss": 0.4606, "num_input_tokens_seen": 9994368, "step": 8225 }, { "epoch": 1.0311990978574113, "grad_norm": 0.8261822462081909, "learning_rate": 5.155369001378274e-06, "loss": 0.46, "num_input_tokens_seen": 10000480, "step": 8230 }, { "epoch": 1.0318255857661947, "grad_norm": 0.8954787850379944, "learning_rate": 5.15850144092219e-06, "loss": 0.4711, "num_input_tokens_seen": 10006496, "step": 8235 }, { "epoch": 1.032452073674978, "grad_norm": 0.5480203032493591, "learning_rate": 5.161633880466107e-06, "loss": 0.4658, "num_input_tokens_seen": 10012256, "step": 8240 }, { "epoch": 1.0330785615837614, "grad_norm": 0.7183709740638733, "learning_rate": 5.164766320010024e-06, "loss": 0.4611, "num_input_tokens_seen": 10018240, "step": 8245 }, { "epoch": 1.0337050494925448, "grad_norm": 0.8600131869316101, "learning_rate": 5.167898759553941e-06, "loss": 0.4661, "num_input_tokens_seen": 10024384, "step": 8250 }, { "epoch": 1.0343315374013282, "grad_norm": 0.8133218288421631, "learning_rate": 5.171031199097858e-06, "loss": 0.4593, "num_input_tokens_seen": 10030464, "step": 8255 }, { "epoch": 1.0349580253101116, "grad_norm": 0.3972315788269043, "learning_rate": 5.174163638641775e-06, "loss": 0.4641, "num_input_tokens_seen": 10036864, "step": 8260 }, { "epoch": 1.0355845132188948, "grad_norm": 0.8007798194885254, "learning_rate": 5.1772960781856916e-06, "loss": 0.4588, "num_input_tokens_seen": 10043008, "step": 8265 }, { "epoch": 1.0362110011276782, "grad_norm": 0.923457682132721, "learning_rate": 5.180428517729608e-06, "loss": 0.4612, "num_input_tokens_seen": 10048608, "step": 8270 }, { "epoch": 1.0368374890364616, "grad_norm": 2.6346685886383057, "learning_rate": 5.183560957273524e-06, "loss": 0.4755, "num_input_tokens_seen": 10054688, "step": 8275 }, { "epoch": 1.037463976945245, "grad_norm": 0.8563520312309265, "learning_rate": 5.186693396817442e-06, "loss": 0.4623, "num_input_tokens_seen": 10060896, "step": 8280 }, { "epoch": 1.0380904648540283, "grad_norm": 0.616197943687439, "learning_rate": 5.189825836361359e-06, "loss": 0.4634, "num_input_tokens_seen": 10066944, "step": 8285 }, { "epoch": 1.0387169527628117, "grad_norm": 0.4770943224430084, "learning_rate": 5.192958275905275e-06, "loss": 0.467, "num_input_tokens_seen": 10072800, "step": 8290 }, { "epoch": 1.0393434406715951, "grad_norm": 0.5581151843070984, "learning_rate": 5.196090715449192e-06, "loss": 0.4622, "num_input_tokens_seen": 10078944, "step": 8295 }, { "epoch": 1.0399699285803785, "grad_norm": 0.7762377858161926, "learning_rate": 5.19922315499311e-06, "loss": 0.4597, "num_input_tokens_seen": 10085056, "step": 8300 }, { "epoch": 1.0405964164891617, "grad_norm": 0.8595024347305298, "learning_rate": 5.2023555945370265e-06, "loss": 0.4655, "num_input_tokens_seen": 10091328, "step": 8305 }, { "epoch": 1.041222904397945, "grad_norm": 0.5002902150154114, "learning_rate": 5.2054880340809424e-06, "loss": 0.4522, "num_input_tokens_seen": 10097280, "step": 8310 }, { "epoch": 1.0418493923067285, "grad_norm": 0.9419837594032288, "learning_rate": 5.208620473624859e-06, "loss": 0.4633, "num_input_tokens_seen": 10103040, "step": 8315 }, { "epoch": 1.0424758802155119, "grad_norm": 0.7206225395202637, "learning_rate": 5.211752913168776e-06, "loss": 0.4663, "num_input_tokens_seen": 10109376, "step": 8320 }, { "epoch": 1.0431023681242952, "grad_norm": 0.8561010360717773, "learning_rate": 5.214885352712693e-06, "loss": 0.4587, "num_input_tokens_seen": 10115520, "step": 8325 }, { "epoch": 1.0437288560330786, "grad_norm": 0.588443398475647, "learning_rate": 5.21801779225661e-06, "loss": 0.4768, "num_input_tokens_seen": 10121376, "step": 8330 }, { "epoch": 1.044355343941862, "grad_norm": 0.9021419882774353, "learning_rate": 5.221150231800527e-06, "loss": 0.4649, "num_input_tokens_seen": 10127552, "step": 8335 }, { "epoch": 1.0449818318506452, "grad_norm": 0.6785757541656494, "learning_rate": 5.224282671344444e-06, "loss": 0.4642, "num_input_tokens_seen": 10133216, "step": 8340 }, { "epoch": 1.0456083197594286, "grad_norm": 1.1416786909103394, "learning_rate": 5.22741511088836e-06, "loss": 0.4627, "num_input_tokens_seen": 10139264, "step": 8345 }, { "epoch": 1.046234807668212, "grad_norm": 0.821962833404541, "learning_rate": 5.2305475504322765e-06, "loss": 0.4661, "num_input_tokens_seen": 10145664, "step": 8350 }, { "epoch": 1.0468612955769954, "grad_norm": 0.5578033328056335, "learning_rate": 5.233679989976193e-06, "loss": 0.4592, "num_input_tokens_seen": 10151520, "step": 8355 }, { "epoch": 1.0474877834857788, "grad_norm": 0.4171784222126007, "learning_rate": 5.236812429520111e-06, "loss": 0.4609, "num_input_tokens_seen": 10157440, "step": 8360 }, { "epoch": 1.0481142713945621, "grad_norm": 1.2049132585525513, "learning_rate": 5.239944869064028e-06, "loss": 0.4659, "num_input_tokens_seen": 10163488, "step": 8365 }, { "epoch": 1.0487407593033455, "grad_norm": 1.2663068771362305, "learning_rate": 5.2430773086079444e-06, "loss": 0.4555, "num_input_tokens_seen": 10169120, "step": 8370 }, { "epoch": 1.0493672472121287, "grad_norm": 0.5100350975990295, "learning_rate": 5.246209748151861e-06, "loss": 0.4633, "num_input_tokens_seen": 10175168, "step": 8375 }, { "epoch": 1.049993735120912, "grad_norm": 0.42195838689804077, "learning_rate": 5.249342187695779e-06, "loss": 0.4775, "num_input_tokens_seen": 10180160, "step": 8380 }, { "epoch": 1.0506202230296955, "grad_norm": 0.47164440155029297, "learning_rate": 5.252474627239694e-06, "loss": 0.4576, "num_input_tokens_seen": 10186432, "step": 8385 }, { "epoch": 1.0512467109384789, "grad_norm": 0.5073176622390747, "learning_rate": 5.2556070667836115e-06, "loss": 0.4668, "num_input_tokens_seen": 10192800, "step": 8390 }, { "epoch": 1.0518731988472623, "grad_norm": 0.5047920942306519, "learning_rate": 5.258739506327528e-06, "loss": 0.456, "num_input_tokens_seen": 10198944, "step": 8395 }, { "epoch": 1.0524996867560457, "grad_norm": 0.3824922740459442, "learning_rate": 5.261871945871445e-06, "loss": 0.4665, "num_input_tokens_seen": 10205120, "step": 8400 }, { "epoch": 1.053126174664829, "grad_norm": 0.5917315483093262, "learning_rate": 5.265004385415362e-06, "loss": 0.4597, "num_input_tokens_seen": 10211424, "step": 8405 }, { "epoch": 1.0537526625736122, "grad_norm": 0.5362745523452759, "learning_rate": 5.268136824959279e-06, "loss": 0.4613, "num_input_tokens_seen": 10217344, "step": 8410 }, { "epoch": 1.0543791504823956, "grad_norm": 0.6028074026107788, "learning_rate": 5.271269264503196e-06, "loss": 0.4696, "num_input_tokens_seen": 10223232, "step": 8415 }, { "epoch": 1.055005638391179, "grad_norm": 0.5976576805114746, "learning_rate": 5.274401704047112e-06, "loss": 0.4636, "num_input_tokens_seen": 10229536, "step": 8420 }, { "epoch": 1.0556321262999624, "grad_norm": 0.7926720976829529, "learning_rate": 5.277534143591029e-06, "loss": 0.4659, "num_input_tokens_seen": 10235712, "step": 8425 }, { "epoch": 1.0562586142087458, "grad_norm": 0.5671725869178772, "learning_rate": 5.280666583134946e-06, "loss": 0.4676, "num_input_tokens_seen": 10241856, "step": 8430 }, { "epoch": 1.0568851021175292, "grad_norm": 0.5400941967964172, "learning_rate": 5.283799022678862e-06, "loss": 0.4634, "num_input_tokens_seen": 10247712, "step": 8435 }, { "epoch": 1.0575115900263126, "grad_norm": 0.7835063934326172, "learning_rate": 5.28693146222278e-06, "loss": 0.4618, "num_input_tokens_seen": 10253696, "step": 8440 }, { "epoch": 1.058138077935096, "grad_norm": 0.5768709778785706, "learning_rate": 5.290063901766697e-06, "loss": 0.4651, "num_input_tokens_seen": 10259648, "step": 8445 }, { "epoch": 1.0587645658438791, "grad_norm": 0.4335508346557617, "learning_rate": 5.2931963413106135e-06, "loss": 0.4652, "num_input_tokens_seen": 10265600, "step": 8450 }, { "epoch": 1.0593910537526625, "grad_norm": 0.467989444732666, "learning_rate": 5.296328780854529e-06, "loss": 0.4665, "num_input_tokens_seen": 10271744, "step": 8455 }, { "epoch": 1.060017541661446, "grad_norm": 0.45778489112854004, "learning_rate": 5.299461220398446e-06, "loss": 0.4605, "num_input_tokens_seen": 10277216, "step": 8460 }, { "epoch": 1.0606440295702293, "grad_norm": 0.39432457089424133, "learning_rate": 5.302593659942363e-06, "loss": 0.4576, "num_input_tokens_seen": 10283136, "step": 8465 }, { "epoch": 1.0612705174790127, "grad_norm": 0.5072248578071594, "learning_rate": 5.3057260994862806e-06, "loss": 0.463, "num_input_tokens_seen": 10289280, "step": 8470 }, { "epoch": 1.061897005387796, "grad_norm": 0.7770843505859375, "learning_rate": 5.308858539030197e-06, "loss": 0.4617, "num_input_tokens_seen": 10295328, "step": 8475 }, { "epoch": 1.0625234932965795, "grad_norm": 0.5904134511947632, "learning_rate": 5.311990978574114e-06, "loss": 0.4569, "num_input_tokens_seen": 10300896, "step": 8480 }, { "epoch": 1.0631499812053626, "grad_norm": 1.4599764347076416, "learning_rate": 5.315123418118031e-06, "loss": 0.4682, "num_input_tokens_seen": 10306912, "step": 8485 }, { "epoch": 1.063776469114146, "grad_norm": 1.8792577981948853, "learning_rate": 5.3182558576619485e-06, "loss": 0.4717, "num_input_tokens_seen": 10313472, "step": 8490 }, { "epoch": 1.0644029570229294, "grad_norm": 0.32333213090896606, "learning_rate": 5.3213882972058635e-06, "loss": 0.4534, "num_input_tokens_seen": 10319904, "step": 8495 }, { "epoch": 1.0650294449317128, "grad_norm": 0.8785341382026672, "learning_rate": 5.324520736749781e-06, "loss": 0.4681, "num_input_tokens_seen": 10325920, "step": 8500 }, { "epoch": 1.0656559328404962, "grad_norm": 1.0732980966567993, "learning_rate": 5.327653176293698e-06, "loss": 0.4623, "num_input_tokens_seen": 10332128, "step": 8505 }, { "epoch": 1.0662824207492796, "grad_norm": 0.9811587929725647, "learning_rate": 5.330785615837615e-06, "loss": 0.4499, "num_input_tokens_seen": 10338208, "step": 8510 }, { "epoch": 1.066908908658063, "grad_norm": 1.5572038888931274, "learning_rate": 5.333918055381531e-06, "loss": 0.4914, "num_input_tokens_seen": 10344256, "step": 8515 }, { "epoch": 1.0675353965668464, "grad_norm": 0.8734914660453796, "learning_rate": 5.337050494925449e-06, "loss": 0.4646, "num_input_tokens_seen": 10350432, "step": 8520 }, { "epoch": 1.0681618844756295, "grad_norm": 0.364907443523407, "learning_rate": 5.340182934469366e-06, "loss": 0.4697, "num_input_tokens_seen": 10356416, "step": 8525 }, { "epoch": 1.068788372384413, "grad_norm": 0.3963775634765625, "learning_rate": 5.343315374013282e-06, "loss": 0.4642, "num_input_tokens_seen": 10362368, "step": 8530 }, { "epoch": 1.0694148602931963, "grad_norm": 0.4290875196456909, "learning_rate": 5.3464478135571985e-06, "loss": 0.4672, "num_input_tokens_seen": 10368672, "step": 8535 }, { "epoch": 1.0700413482019797, "grad_norm": 0.4311801791191101, "learning_rate": 5.349580253101115e-06, "loss": 0.4669, "num_input_tokens_seen": 10374400, "step": 8540 }, { "epoch": 1.070667836110763, "grad_norm": 0.45127901434898376, "learning_rate": 5.352712692645032e-06, "loss": 0.4553, "num_input_tokens_seen": 10380160, "step": 8545 }, { "epoch": 1.0712943240195465, "grad_norm": 0.6082035303115845, "learning_rate": 5.35584513218895e-06, "loss": 0.4622, "num_input_tokens_seen": 10385824, "step": 8550 }, { "epoch": 1.0719208119283299, "grad_norm": 0.38012370467185974, "learning_rate": 5.358977571732866e-06, "loss": 0.4668, "num_input_tokens_seen": 10391872, "step": 8555 }, { "epoch": 1.072547299837113, "grad_norm": 0.5111009478569031, "learning_rate": 5.362110011276783e-06, "loss": 0.4557, "num_input_tokens_seen": 10398016, "step": 8560 }, { "epoch": 1.0731737877458964, "grad_norm": 0.2866117060184479, "learning_rate": 5.365242450820699e-06, "loss": 0.4615, "num_input_tokens_seen": 10404128, "step": 8565 }, { "epoch": 1.0738002756546798, "grad_norm": 0.43405112624168396, "learning_rate": 5.368374890364616e-06, "loss": 0.4636, "num_input_tokens_seen": 10410208, "step": 8570 }, { "epoch": 1.0744267635634632, "grad_norm": 0.47385725378990173, "learning_rate": 5.3715073299085326e-06, "loss": 0.4647, "num_input_tokens_seen": 10416384, "step": 8575 }, { "epoch": 1.0750532514722466, "grad_norm": 0.5021646022796631, "learning_rate": 5.37463976945245e-06, "loss": 0.4603, "num_input_tokens_seen": 10422752, "step": 8580 }, { "epoch": 1.07567973938103, "grad_norm": 0.4427041709423065, "learning_rate": 5.377772208996367e-06, "loss": 0.4607, "num_input_tokens_seen": 10428864, "step": 8585 }, { "epoch": 1.0763062272898134, "grad_norm": 0.6506779789924622, "learning_rate": 5.380904648540284e-06, "loss": 0.4516, "num_input_tokens_seen": 10435168, "step": 8590 }, { "epoch": 1.0769327151985966, "grad_norm": 0.5186444520950317, "learning_rate": 5.3840370880842005e-06, "loss": 0.4597, "num_input_tokens_seen": 10440928, "step": 8595 }, { "epoch": 1.07755920310738, "grad_norm": 0.7297033071517944, "learning_rate": 5.387169527628118e-06, "loss": 0.4586, "num_input_tokens_seen": 10447232, "step": 8600 }, { "epoch": 1.0781856910161634, "grad_norm": 0.45029935240745544, "learning_rate": 5.390301967172034e-06, "loss": 0.4749, "num_input_tokens_seen": 10453152, "step": 8605 }, { "epoch": 1.0788121789249467, "grad_norm": 0.6652331352233887, "learning_rate": 5.393434406715951e-06, "loss": 0.4523, "num_input_tokens_seen": 10459200, "step": 8610 }, { "epoch": 1.0794386668337301, "grad_norm": 0.804847240447998, "learning_rate": 5.3965668462598675e-06, "loss": 0.461, "num_input_tokens_seen": 10465312, "step": 8615 }, { "epoch": 1.0800651547425135, "grad_norm": 0.544691264629364, "learning_rate": 5.399699285803784e-06, "loss": 0.4659, "num_input_tokens_seen": 10471296, "step": 8620 }, { "epoch": 1.080691642651297, "grad_norm": 0.9531120657920837, "learning_rate": 5.402831725347701e-06, "loss": 0.4606, "num_input_tokens_seen": 10477056, "step": 8625 }, { "epoch": 1.08131813056008, "grad_norm": 0.3952122926712036, "learning_rate": 5.405964164891619e-06, "loss": 0.4661, "num_input_tokens_seen": 10483296, "step": 8630 }, { "epoch": 1.0819446184688635, "grad_norm": 0.7547417879104614, "learning_rate": 5.4090966044355354e-06, "loss": 0.4738, "num_input_tokens_seen": 10489152, "step": 8635 }, { "epoch": 1.0825711063776469, "grad_norm": 0.3936152756214142, "learning_rate": 5.412229043979451e-06, "loss": 0.4525, "num_input_tokens_seen": 10495296, "step": 8640 }, { "epoch": 1.0831975942864303, "grad_norm": 0.5279892086982727, "learning_rate": 5.415361483523368e-06, "loss": 0.4707, "num_input_tokens_seen": 10501664, "step": 8645 }, { "epoch": 1.0838240821952136, "grad_norm": 0.7185470461845398, "learning_rate": 5.418493923067285e-06, "loss": 0.4671, "num_input_tokens_seen": 10507424, "step": 8650 }, { "epoch": 1.084450570103997, "grad_norm": 0.6713447570800781, "learning_rate": 5.421626362611202e-06, "loss": 0.4645, "num_input_tokens_seen": 10514048, "step": 8655 }, { "epoch": 1.0850770580127804, "grad_norm": 0.9018558263778687, "learning_rate": 5.424758802155119e-06, "loss": 0.4613, "num_input_tokens_seen": 10519968, "step": 8660 }, { "epoch": 1.0857035459215638, "grad_norm": 0.8648956418037415, "learning_rate": 5.427891241699036e-06, "loss": 0.4573, "num_input_tokens_seen": 10526176, "step": 8665 }, { "epoch": 1.086330033830347, "grad_norm": 0.8147851824760437, "learning_rate": 5.431023681242953e-06, "loss": 0.4604, "num_input_tokens_seen": 10532160, "step": 8670 }, { "epoch": 1.0869565217391304, "grad_norm": 0.3367502987384796, "learning_rate": 5.434156120786869e-06, "loss": 0.458, "num_input_tokens_seen": 10538368, "step": 8675 }, { "epoch": 1.0875830096479138, "grad_norm": 0.8674522638320923, "learning_rate": 5.4372885603307854e-06, "loss": 0.4603, "num_input_tokens_seen": 10544512, "step": 8680 }, { "epoch": 1.0882094975566972, "grad_norm": 1.2580267190933228, "learning_rate": 5.440420999874702e-06, "loss": 0.4718, "num_input_tokens_seen": 10550816, "step": 8685 }, { "epoch": 1.0888359854654805, "grad_norm": 0.866889476776123, "learning_rate": 5.44355343941862e-06, "loss": 0.4596, "num_input_tokens_seen": 10557024, "step": 8690 }, { "epoch": 1.089462473374264, "grad_norm": 0.78083735704422, "learning_rate": 5.446685878962537e-06, "loss": 0.4721, "num_input_tokens_seen": 10562880, "step": 8695 }, { "epoch": 1.0900889612830473, "grad_norm": 0.24817359447479248, "learning_rate": 5.449818318506453e-06, "loss": 0.4676, "num_input_tokens_seen": 10568992, "step": 8700 }, { "epoch": 1.0907154491918305, "grad_norm": 0.5169346332550049, "learning_rate": 5.45295075805037e-06, "loss": 0.4647, "num_input_tokens_seen": 10575168, "step": 8705 }, { "epoch": 1.091341937100614, "grad_norm": 0.37195295095443726, "learning_rate": 5.456083197594288e-06, "loss": 0.4654, "num_input_tokens_seen": 10581632, "step": 8710 }, { "epoch": 1.0919684250093973, "grad_norm": 0.37325483560562134, "learning_rate": 5.459215637138204e-06, "loss": 0.4576, "num_input_tokens_seen": 10587232, "step": 8715 }, { "epoch": 1.0925949129181807, "grad_norm": 0.5020374655723572, "learning_rate": 5.46234807668212e-06, "loss": 0.4595, "num_input_tokens_seen": 10593248, "step": 8720 }, { "epoch": 1.093221400826964, "grad_norm": 0.4610462486743927, "learning_rate": 5.465480516226037e-06, "loss": 0.4617, "num_input_tokens_seen": 10599808, "step": 8725 }, { "epoch": 1.0938478887357475, "grad_norm": 0.5196598172187805, "learning_rate": 5.468612955769954e-06, "loss": 0.4586, "num_input_tokens_seen": 10605920, "step": 8730 }, { "epoch": 1.0944743766445308, "grad_norm": 0.524523913860321, "learning_rate": 5.471745395313871e-06, "loss": 0.4576, "num_input_tokens_seen": 10611936, "step": 8735 }, { "epoch": 1.0951008645533142, "grad_norm": 0.29186782240867615, "learning_rate": 5.474877834857788e-06, "loss": 0.4655, "num_input_tokens_seen": 10618112, "step": 8740 }, { "epoch": 1.0957273524620974, "grad_norm": 0.9108135104179382, "learning_rate": 5.478010274401705e-06, "loss": 0.4578, "num_input_tokens_seen": 10624256, "step": 8745 }, { "epoch": 1.0963538403708808, "grad_norm": 0.7758235335350037, "learning_rate": 5.481142713945621e-06, "loss": 0.4573, "num_input_tokens_seen": 10630240, "step": 8750 }, { "epoch": 1.0969803282796642, "grad_norm": 1.2961465120315552, "learning_rate": 5.484275153489538e-06, "loss": 0.4615, "num_input_tokens_seen": 10636640, "step": 8755 }, { "epoch": 1.0976068161884476, "grad_norm": 1.3017607927322388, "learning_rate": 5.4874075930334545e-06, "loss": 0.4618, "num_input_tokens_seen": 10643008, "step": 8760 }, { "epoch": 1.098233304097231, "grad_norm": 1.253177285194397, "learning_rate": 5.490540032577371e-06, "loss": 0.4875, "num_input_tokens_seen": 10648928, "step": 8765 }, { "epoch": 1.0988597920060144, "grad_norm": 0.4860451817512512, "learning_rate": 5.493672472121289e-06, "loss": 0.464, "num_input_tokens_seen": 10655168, "step": 8770 }, { "epoch": 1.0994862799147977, "grad_norm": 0.8598628044128418, "learning_rate": 5.496804911665206e-06, "loss": 0.4689, "num_input_tokens_seen": 10661280, "step": 8775 }, { "epoch": 1.100112767823581, "grad_norm": 0.40296468138694763, "learning_rate": 5.499937351209122e-06, "loss": 0.4687, "num_input_tokens_seen": 10667072, "step": 8780 }, { "epoch": 1.1007392557323643, "grad_norm": 1.4251391887664795, "learning_rate": 5.503069790753038e-06, "loss": 0.4719, "num_input_tokens_seen": 10672480, "step": 8785 }, { "epoch": 1.1013657436411477, "grad_norm": 2.19209623336792, "learning_rate": 5.506202230296955e-06, "loss": 0.4668, "num_input_tokens_seen": 10678080, "step": 8790 }, { "epoch": 1.101992231549931, "grad_norm": 34.91228103637695, "learning_rate": 5.509334669840873e-06, "loss": 0.5653, "num_input_tokens_seen": 10684192, "step": 8795 }, { "epoch": 1.1026187194587145, "grad_norm": 1.3424797058105469, "learning_rate": 5.5124671093847895e-06, "loss": 0.9047, "num_input_tokens_seen": 10690592, "step": 8800 }, { "epoch": 1.1032452073674979, "grad_norm": 11.517434120178223, "learning_rate": 5.515599548928706e-06, "loss": 0.4914, "num_input_tokens_seen": 10696896, "step": 8805 }, { "epoch": 1.1038716952762813, "grad_norm": 4.207785129547119, "learning_rate": 5.518731988472623e-06, "loss": 0.4895, "num_input_tokens_seen": 10703456, "step": 8810 }, { "epoch": 1.1044981831850644, "grad_norm": 1.1900274753570557, "learning_rate": 5.52186442801654e-06, "loss": 0.4611, "num_input_tokens_seen": 10709632, "step": 8815 }, { "epoch": 1.1051246710938478, "grad_norm": 1.082553505897522, "learning_rate": 5.524996867560457e-06, "loss": 0.4676, "num_input_tokens_seen": 10715744, "step": 8820 }, { "epoch": 1.1057511590026312, "grad_norm": 1.14365553855896, "learning_rate": 5.528129307104373e-06, "loss": 0.4623, "num_input_tokens_seen": 10722272, "step": 8825 }, { "epoch": 1.1063776469114146, "grad_norm": 0.6717883944511414, "learning_rate": 5.53126174664829e-06, "loss": 0.4605, "num_input_tokens_seen": 10728128, "step": 8830 }, { "epoch": 1.107004134820198, "grad_norm": 0.63397616147995, "learning_rate": 5.534394186192207e-06, "loss": 0.4616, "num_input_tokens_seen": 10734240, "step": 8835 }, { "epoch": 1.1076306227289814, "grad_norm": 0.6593836545944214, "learning_rate": 5.5375266257361236e-06, "loss": 0.4598, "num_input_tokens_seen": 10740320, "step": 8840 }, { "epoch": 1.1082571106377648, "grad_norm": 0.9450501799583435, "learning_rate": 5.54065906528004e-06, "loss": 0.466, "num_input_tokens_seen": 10746720, "step": 8845 }, { "epoch": 1.108883598546548, "grad_norm": 0.29769665002822876, "learning_rate": 5.543791504823958e-06, "loss": 0.4629, "num_input_tokens_seen": 10753184, "step": 8850 }, { "epoch": 1.1095100864553313, "grad_norm": 0.718936026096344, "learning_rate": 5.546923944367875e-06, "loss": 0.4614, "num_input_tokens_seen": 10758816, "step": 8855 }, { "epoch": 1.1101365743641147, "grad_norm": 0.7981929183006287, "learning_rate": 5.550056383911791e-06, "loss": 0.4584, "num_input_tokens_seen": 10765024, "step": 8860 }, { "epoch": 1.1107630622728981, "grad_norm": 0.46697232127189636, "learning_rate": 5.553188823455707e-06, "loss": 0.4734, "num_input_tokens_seen": 10771040, "step": 8865 }, { "epoch": 1.1113895501816815, "grad_norm": 0.6867861747741699, "learning_rate": 5.556321262999624e-06, "loss": 0.4634, "num_input_tokens_seen": 10777184, "step": 8870 }, { "epoch": 1.112016038090465, "grad_norm": 0.8710335493087769, "learning_rate": 5.559453702543542e-06, "loss": 0.468, "num_input_tokens_seen": 10783584, "step": 8875 }, { "epoch": 1.1126425259992483, "grad_norm": 0.4235104024410248, "learning_rate": 5.5625861420874585e-06, "loss": 0.4637, "num_input_tokens_seen": 10789664, "step": 8880 }, { "epoch": 1.1132690139080315, "grad_norm": 0.7335804104804993, "learning_rate": 5.565718581631375e-06, "loss": 0.4599, "num_input_tokens_seen": 10795296, "step": 8885 }, { "epoch": 1.1138955018168148, "grad_norm": 0.649377167224884, "learning_rate": 5.568851021175292e-06, "loss": 0.4637, "num_input_tokens_seen": 10801408, "step": 8890 }, { "epoch": 1.1145219897255982, "grad_norm": 0.7197245955467224, "learning_rate": 5.571983460719208e-06, "loss": 0.4608, "num_input_tokens_seen": 10807488, "step": 8895 }, { "epoch": 1.1151484776343816, "grad_norm": 0.9666827321052551, "learning_rate": 5.575115900263125e-06, "loss": 0.4561, "num_input_tokens_seen": 10813632, "step": 8900 }, { "epoch": 1.115774965543165, "grad_norm": 0.7481439113616943, "learning_rate": 5.578248339807042e-06, "loss": 0.4568, "num_input_tokens_seen": 10819008, "step": 8905 }, { "epoch": 1.1164014534519484, "grad_norm": 0.7767863273620605, "learning_rate": 5.581380779350959e-06, "loss": 0.4553, "num_input_tokens_seen": 10825056, "step": 8910 }, { "epoch": 1.1170279413607318, "grad_norm": 1.6069881916046143, "learning_rate": 5.584513218894876e-06, "loss": 0.4701, "num_input_tokens_seen": 10830880, "step": 8915 }, { "epoch": 1.1176544292695152, "grad_norm": 6.470424652099609, "learning_rate": 5.587645658438793e-06, "loss": 0.5168, "num_input_tokens_seen": 10837248, "step": 8920 }, { "epoch": 1.1182809171782984, "grad_norm": 0.7911848425865173, "learning_rate": 5.590778097982709e-06, "loss": 0.4792, "num_input_tokens_seen": 10843232, "step": 8925 }, { "epoch": 1.1189074050870818, "grad_norm": 0.8772517442703247, "learning_rate": 5.593910537526627e-06, "loss": 0.4731, "num_input_tokens_seen": 10848928, "step": 8930 }, { "epoch": 1.1195338929958651, "grad_norm": 0.5174048542976379, "learning_rate": 5.597042977070543e-06, "loss": 0.4691, "num_input_tokens_seen": 10855136, "step": 8935 }, { "epoch": 1.1201603809046485, "grad_norm": 0.19547760486602783, "learning_rate": 5.60017541661446e-06, "loss": 0.4624, "num_input_tokens_seen": 10861120, "step": 8940 }, { "epoch": 1.120786868813432, "grad_norm": 0.3561297655105591, "learning_rate": 5.6033078561583764e-06, "loss": 0.4627, "num_input_tokens_seen": 10867648, "step": 8945 }, { "epoch": 1.1214133567222153, "grad_norm": 0.67095547914505, "learning_rate": 5.606440295702293e-06, "loss": 0.4616, "num_input_tokens_seen": 10873664, "step": 8950 }, { "epoch": 1.1220398446309987, "grad_norm": 0.3314657211303711, "learning_rate": 5.60957273524621e-06, "loss": 0.4625, "num_input_tokens_seen": 10879712, "step": 8955 }, { "epoch": 1.122666332539782, "grad_norm": 0.5969526171684265, "learning_rate": 5.6127051747901276e-06, "loss": 0.4705, "num_input_tokens_seen": 10886176, "step": 8960 }, { "epoch": 1.1232928204485653, "grad_norm": 0.39081689715385437, "learning_rate": 5.615837614334044e-06, "loss": 0.4582, "num_input_tokens_seen": 10892192, "step": 8965 }, { "epoch": 1.1239193083573487, "grad_norm": 0.4267687201499939, "learning_rate": 5.61897005387796e-06, "loss": 0.4639, "num_input_tokens_seen": 10898176, "step": 8970 }, { "epoch": 1.124545796266132, "grad_norm": 0.39824312925338745, "learning_rate": 5.622102493421877e-06, "loss": 0.4613, "num_input_tokens_seen": 10904128, "step": 8975 }, { "epoch": 1.1251722841749154, "grad_norm": 0.9656322002410889, "learning_rate": 5.625234932965794e-06, "loss": 0.453, "num_input_tokens_seen": 10910016, "step": 8980 }, { "epoch": 1.1257987720836988, "grad_norm": 0.3634132742881775, "learning_rate": 5.628367372509711e-06, "loss": 0.4632, "num_input_tokens_seen": 10916480, "step": 8985 }, { "epoch": 1.1264252599924822, "grad_norm": 0.5131824612617493, "learning_rate": 5.631499812053628e-06, "loss": 0.4582, "num_input_tokens_seen": 10922816, "step": 8990 }, { "epoch": 1.1270517479012656, "grad_norm": 0.545979917049408, "learning_rate": 5.634632251597545e-06, "loss": 0.4787, "num_input_tokens_seen": 10928768, "step": 8995 }, { "epoch": 1.1276782358100488, "grad_norm": 0.4759480059146881, "learning_rate": 5.637764691141462e-06, "loss": 0.4688, "num_input_tokens_seen": 10935136, "step": 9000 }, { "epoch": 1.1283047237188322, "grad_norm": 0.4168413579463959, "learning_rate": 5.640897130685378e-06, "loss": 0.4613, "num_input_tokens_seen": 10940960, "step": 9005 }, { "epoch": 1.1289312116276156, "grad_norm": 0.559065580368042, "learning_rate": 5.644029570229294e-06, "loss": 0.4649, "num_input_tokens_seen": 10947072, "step": 9010 }, { "epoch": 1.129557699536399, "grad_norm": 0.3995177745819092, "learning_rate": 5.647162009773212e-06, "loss": 0.4645, "num_input_tokens_seen": 10953184, "step": 9015 }, { "epoch": 1.1301841874451823, "grad_norm": 0.5301981568336487, "learning_rate": 5.650294449317129e-06, "loss": 0.4646, "num_input_tokens_seen": 10958944, "step": 9020 }, { "epoch": 1.1308106753539657, "grad_norm": 0.4173884093761444, "learning_rate": 5.6534268888610455e-06, "loss": 0.4698, "num_input_tokens_seen": 10964960, "step": 9025 }, { "epoch": 1.1314371632627491, "grad_norm": 0.5391045212745667, "learning_rate": 5.656559328404962e-06, "loss": 0.4629, "num_input_tokens_seen": 10971424, "step": 9030 }, { "epoch": 1.1320636511715323, "grad_norm": 0.3805372416973114, "learning_rate": 5.659691767948879e-06, "loss": 0.4624, "num_input_tokens_seen": 10977504, "step": 9035 }, { "epoch": 1.1326901390803157, "grad_norm": 0.3894384801387787, "learning_rate": 5.662824207492797e-06, "loss": 0.4617, "num_input_tokens_seen": 10983392, "step": 9040 }, { "epoch": 1.133316626989099, "grad_norm": 0.46115532517433167, "learning_rate": 5.6659566470367125e-06, "loss": 0.4675, "num_input_tokens_seen": 10989344, "step": 9045 }, { "epoch": 1.1339431148978825, "grad_norm": 0.49353158473968506, "learning_rate": 5.669089086580629e-06, "loss": 0.4586, "num_input_tokens_seen": 10995168, "step": 9050 }, { "epoch": 1.1345696028066659, "grad_norm": 0.364602655172348, "learning_rate": 5.672221526124546e-06, "loss": 0.4642, "num_input_tokens_seen": 11001024, "step": 9055 }, { "epoch": 1.1351960907154492, "grad_norm": 0.41130438446998596, "learning_rate": 5.675353965668463e-06, "loss": 0.4617, "num_input_tokens_seen": 11007072, "step": 9060 }, { "epoch": 1.1358225786242326, "grad_norm": 0.6012611985206604, "learning_rate": 5.6784864052123804e-06, "loss": 0.4627, "num_input_tokens_seen": 11013216, "step": 9065 }, { "epoch": 1.1364490665330158, "grad_norm": 0.602841317653656, "learning_rate": 5.681618844756297e-06, "loss": 0.4607, "num_input_tokens_seen": 11019072, "step": 9070 }, { "epoch": 1.1370755544417992, "grad_norm": 0.3337688446044922, "learning_rate": 5.684751284300214e-06, "loss": 0.4628, "num_input_tokens_seen": 11025440, "step": 9075 }, { "epoch": 1.1377020423505826, "grad_norm": 0.43326207995414734, "learning_rate": 5.68788372384413e-06, "loss": 0.4626, "num_input_tokens_seen": 11031680, "step": 9080 }, { "epoch": 1.138328530259366, "grad_norm": 0.19131025671958923, "learning_rate": 5.691016163388047e-06, "loss": 0.4635, "num_input_tokens_seen": 11038048, "step": 9085 }, { "epoch": 1.1389550181681494, "grad_norm": 0.3490130603313446, "learning_rate": 5.694148602931963e-06, "loss": 0.4634, "num_input_tokens_seen": 11044160, "step": 9090 }, { "epoch": 1.1395815060769328, "grad_norm": 0.3104141652584076, "learning_rate": 5.697281042475881e-06, "loss": 0.4706, "num_input_tokens_seen": 11050144, "step": 9095 }, { "epoch": 1.1402079939857162, "grad_norm": 0.4882359504699707, "learning_rate": 5.700413482019798e-06, "loss": 0.4677, "num_input_tokens_seen": 11055936, "step": 9100 }, { "epoch": 1.1408344818944993, "grad_norm": 0.6651474833488464, "learning_rate": 5.7035459215637145e-06, "loss": 0.4584, "num_input_tokens_seen": 11062240, "step": 9105 }, { "epoch": 1.1414609698032827, "grad_norm": 0.6380296349525452, "learning_rate": 5.706678361107631e-06, "loss": 0.4639, "num_input_tokens_seen": 11068320, "step": 9110 }, { "epoch": 1.142087457712066, "grad_norm": 0.25385722517967224, "learning_rate": 5.709810800651547e-06, "loss": 0.4638, "num_input_tokens_seen": 11073984, "step": 9115 }, { "epoch": 1.1427139456208495, "grad_norm": 0.4979151785373688, "learning_rate": 5.712943240195464e-06, "loss": 0.4727, "num_input_tokens_seen": 11080000, "step": 9120 }, { "epoch": 1.1433404335296329, "grad_norm": 0.32621052861213684, "learning_rate": 5.716075679739382e-06, "loss": 0.4591, "num_input_tokens_seen": 11085376, "step": 9125 }, { "epoch": 1.1439669214384163, "grad_norm": 0.37527987360954285, "learning_rate": 5.719208119283298e-06, "loss": 0.4529, "num_input_tokens_seen": 11091232, "step": 9130 }, { "epoch": 1.1445934093471997, "grad_norm": 0.4693933129310608, "learning_rate": 5.722340558827215e-06, "loss": 0.4624, "num_input_tokens_seen": 11097120, "step": 9135 }, { "epoch": 1.145219897255983, "grad_norm": 0.3394898474216461, "learning_rate": 5.725472998371132e-06, "loss": 0.4642, "num_input_tokens_seen": 11103072, "step": 9140 }, { "epoch": 1.1458463851647662, "grad_norm": 0.5447177290916443, "learning_rate": 5.7286054379150495e-06, "loss": 0.4652, "num_input_tokens_seen": 11109312, "step": 9145 }, { "epoch": 1.1464728730735496, "grad_norm": 0.5059005618095398, "learning_rate": 5.731737877458966e-06, "loss": 0.4648, "num_input_tokens_seen": 11115040, "step": 9150 }, { "epoch": 1.147099360982333, "grad_norm": 0.20889940857887268, "learning_rate": 5.734870317002882e-06, "loss": 0.462, "num_input_tokens_seen": 11119776, "step": 9155 }, { "epoch": 1.1477258488911164, "grad_norm": 0.44369885325431824, "learning_rate": 5.738002756546799e-06, "loss": 0.4613, "num_input_tokens_seen": 11126016, "step": 9160 }, { "epoch": 1.1483523367998998, "grad_norm": 0.3646300137042999, "learning_rate": 5.741135196090716e-06, "loss": 0.4639, "num_input_tokens_seen": 11132320, "step": 9165 }, { "epoch": 1.1489788247086832, "grad_norm": 0.6381648182868958, "learning_rate": 5.7442676356346325e-06, "loss": 0.4695, "num_input_tokens_seen": 11137920, "step": 9170 }, { "epoch": 1.1496053126174666, "grad_norm": 0.3906558156013489, "learning_rate": 5.74740007517855e-06, "loss": 0.4559, "num_input_tokens_seen": 11144096, "step": 9175 }, { "epoch": 1.15023180052625, "grad_norm": 0.6538789868354797, "learning_rate": 5.750532514722467e-06, "loss": 0.46, "num_input_tokens_seen": 11150496, "step": 9180 }, { "epoch": 1.1508582884350331, "grad_norm": 0.4722007215023041, "learning_rate": 5.753664954266384e-06, "loss": 0.4659, "num_input_tokens_seen": 11156704, "step": 9185 }, { "epoch": 1.1514847763438165, "grad_norm": 0.3849225640296936, "learning_rate": 5.7567973938102995e-06, "loss": 0.4541, "num_input_tokens_seen": 11162688, "step": 9190 }, { "epoch": 1.1521112642526, "grad_norm": 0.39023005962371826, "learning_rate": 5.759929833354216e-06, "loss": 0.4651, "num_input_tokens_seen": 11168960, "step": 9195 }, { "epoch": 1.1527377521613833, "grad_norm": 0.413021981716156, "learning_rate": 5.763062272898133e-06, "loss": 0.4577, "num_input_tokens_seen": 11175136, "step": 9200 }, { "epoch": 1.1533642400701667, "grad_norm": 0.41878795623779297, "learning_rate": 5.766194712442051e-06, "loss": 0.456, "num_input_tokens_seen": 11180736, "step": 9205 }, { "epoch": 1.15399072797895, "grad_norm": 0.4506671130657196, "learning_rate": 5.769327151985967e-06, "loss": 0.4661, "num_input_tokens_seen": 11186912, "step": 9210 }, { "epoch": 1.1546172158877335, "grad_norm": 0.6916871666908264, "learning_rate": 5.772459591529884e-06, "loss": 0.4435, "num_input_tokens_seen": 11192832, "step": 9215 }, { "epoch": 1.1552437037965166, "grad_norm": 0.663657546043396, "learning_rate": 5.775592031073801e-06, "loss": 0.4541, "num_input_tokens_seen": 11199008, "step": 9220 }, { "epoch": 1.1558701917053, "grad_norm": 0.768674373626709, "learning_rate": 5.778724470617717e-06, "loss": 0.4663, "num_input_tokens_seen": 11205152, "step": 9225 }, { "epoch": 1.1564966796140834, "grad_norm": 0.30323299765586853, "learning_rate": 5.781856910161634e-06, "loss": 0.4574, "num_input_tokens_seen": 11211232, "step": 9230 }, { "epoch": 1.1571231675228668, "grad_norm": 0.6326631903648376, "learning_rate": 5.784989349705551e-06, "loss": 0.4694, "num_input_tokens_seen": 11217280, "step": 9235 }, { "epoch": 1.1577496554316502, "grad_norm": 0.5138139128684998, "learning_rate": 5.788121789249468e-06, "loss": 0.4602, "num_input_tokens_seen": 11223744, "step": 9240 }, { "epoch": 1.1583761433404336, "grad_norm": 0.3676725924015045, "learning_rate": 5.791254228793385e-06, "loss": 0.4646, "num_input_tokens_seen": 11229792, "step": 9245 }, { "epoch": 1.159002631249217, "grad_norm": 0.536630392074585, "learning_rate": 5.7943866683373015e-06, "loss": 0.4772, "num_input_tokens_seen": 11235872, "step": 9250 }, { "epoch": 1.1596291191580002, "grad_norm": 0.3960692882537842, "learning_rate": 5.797519107881219e-06, "loss": 0.4612, "num_input_tokens_seen": 11241120, "step": 9255 }, { "epoch": 1.1602556070667835, "grad_norm": 0.5568857789039612, "learning_rate": 5.800651547425136e-06, "loss": 0.4609, "num_input_tokens_seen": 11247648, "step": 9260 }, { "epoch": 1.160882094975567, "grad_norm": 1.6329466104507446, "learning_rate": 5.803783986969052e-06, "loss": 0.4472, "num_input_tokens_seen": 11254208, "step": 9265 }, { "epoch": 1.1615085828843503, "grad_norm": 19.858243942260742, "learning_rate": 5.8069164265129686e-06, "loss": 0.5244, "num_input_tokens_seen": 11260352, "step": 9270 }, { "epoch": 1.1621350707931337, "grad_norm": 4.670903205871582, "learning_rate": 5.810048866056885e-06, "loss": 0.4158, "num_input_tokens_seen": 11266112, "step": 9275 }, { "epoch": 1.162761558701917, "grad_norm": 4.043981075286865, "learning_rate": 5.813181305600802e-06, "loss": 0.5889, "num_input_tokens_seen": 11272256, "step": 9280 }, { "epoch": 1.1633880466107005, "grad_norm": 4.330509662628174, "learning_rate": 5.81631374514472e-06, "loss": 0.491, "num_input_tokens_seen": 11278464, "step": 9285 }, { "epoch": 1.1640145345194837, "grad_norm": 1.1644763946533203, "learning_rate": 5.8194461846886365e-06, "loss": 0.4577, "num_input_tokens_seen": 11283840, "step": 9290 }, { "epoch": 1.164641022428267, "grad_norm": 1.7846804857254028, "learning_rate": 5.822578624232553e-06, "loss": 0.4437, "num_input_tokens_seen": 11289888, "step": 9295 }, { "epoch": 1.1652675103370504, "grad_norm": 3.2389979362487793, "learning_rate": 5.825711063776469e-06, "loss": 0.4825, "num_input_tokens_seen": 11295936, "step": 9300 }, { "epoch": 1.1658939982458338, "grad_norm": 0.925080418586731, "learning_rate": 5.828843503320386e-06, "loss": 0.5094, "num_input_tokens_seen": 11301824, "step": 9305 }, { "epoch": 1.1665204861546172, "grad_norm": 0.607063889503479, "learning_rate": 5.831975942864303e-06, "loss": 0.4567, "num_input_tokens_seen": 11308096, "step": 9310 }, { "epoch": 1.1671469740634006, "grad_norm": 0.6075187921524048, "learning_rate": 5.83510838240822e-06, "loss": 0.4697, "num_input_tokens_seen": 11314112, "step": 9315 }, { "epoch": 1.167773461972184, "grad_norm": 0.4389842450618744, "learning_rate": 5.838240821952137e-06, "loss": 0.4579, "num_input_tokens_seen": 11320192, "step": 9320 }, { "epoch": 1.1683999498809672, "grad_norm": 0.4104185104370117, "learning_rate": 5.841373261496054e-06, "loss": 0.4611, "num_input_tokens_seen": 11326208, "step": 9325 }, { "epoch": 1.1690264377897506, "grad_norm": 0.4180091321468353, "learning_rate": 5.8445057010399706e-06, "loss": 0.4614, "num_input_tokens_seen": 11332416, "step": 9330 }, { "epoch": 1.169652925698534, "grad_norm": 0.640485405921936, "learning_rate": 5.8476381405838865e-06, "loss": 0.4552, "num_input_tokens_seen": 11338272, "step": 9335 }, { "epoch": 1.1702794136073174, "grad_norm": 0.38421374559402466, "learning_rate": 5.850770580127803e-06, "loss": 0.4546, "num_input_tokens_seen": 11344672, "step": 9340 }, { "epoch": 1.1709059015161007, "grad_norm": 0.4010031819343567, "learning_rate": 5.853903019671721e-06, "loss": 0.4719, "num_input_tokens_seen": 11350560, "step": 9345 }, { "epoch": 1.1715323894248841, "grad_norm": 0.4044342637062073, "learning_rate": 5.857035459215638e-06, "loss": 0.4628, "num_input_tokens_seen": 11356832, "step": 9350 }, { "epoch": 1.1721588773336675, "grad_norm": 0.5793952941894531, "learning_rate": 5.860167898759554e-06, "loss": 0.4517, "num_input_tokens_seen": 11362880, "step": 9355 }, { "epoch": 1.172785365242451, "grad_norm": 0.44520312547683716, "learning_rate": 5.863300338303471e-06, "loss": 0.4572, "num_input_tokens_seen": 11369056, "step": 9360 }, { "epoch": 1.173411853151234, "grad_norm": 0.5209718942642212, "learning_rate": 5.866432777847389e-06, "loss": 0.46, "num_input_tokens_seen": 11374912, "step": 9365 }, { "epoch": 1.1740383410600175, "grad_norm": 0.5382845997810364, "learning_rate": 5.8695652173913055e-06, "loss": 0.4643, "num_input_tokens_seen": 11380960, "step": 9370 }, { "epoch": 1.1746648289688009, "grad_norm": 0.5163856148719788, "learning_rate": 5.8726976569352214e-06, "loss": 0.4594, "num_input_tokens_seen": 11386848, "step": 9375 }, { "epoch": 1.1752913168775843, "grad_norm": 0.5304778218269348, "learning_rate": 5.875830096479138e-06, "loss": 0.4464, "num_input_tokens_seen": 11392896, "step": 9380 }, { "epoch": 1.1759178047863676, "grad_norm": 0.44549959897994995, "learning_rate": 5.878962536023055e-06, "loss": 0.4702, "num_input_tokens_seen": 11398848, "step": 9385 }, { "epoch": 1.176544292695151, "grad_norm": 0.37513428926467896, "learning_rate": 5.882094975566972e-06, "loss": 0.4677, "num_input_tokens_seen": 11405056, "step": 9390 }, { "epoch": 1.1771707806039344, "grad_norm": 0.44313979148864746, "learning_rate": 5.885227415110889e-06, "loss": 0.4694, "num_input_tokens_seen": 11410944, "step": 9395 }, { "epoch": 1.1777972685127178, "grad_norm": 0.8284593820571899, "learning_rate": 5.888359854654806e-06, "loss": 0.4566, "num_input_tokens_seen": 11417280, "step": 9400 }, { "epoch": 1.178423756421501, "grad_norm": 0.4827063977718353, "learning_rate": 5.891492294198723e-06, "loss": 0.4598, "num_input_tokens_seen": 11423584, "step": 9405 }, { "epoch": 1.1790502443302844, "grad_norm": 0.6252625584602356, "learning_rate": 5.894624733742639e-06, "loss": 0.4665, "num_input_tokens_seen": 11429696, "step": 9410 }, { "epoch": 1.1796767322390678, "grad_norm": 0.4518119692802429, "learning_rate": 5.8977571732865555e-06, "loss": 0.4578, "num_input_tokens_seen": 11435744, "step": 9415 }, { "epoch": 1.1803032201478512, "grad_norm": 0.48540252447128296, "learning_rate": 5.900889612830472e-06, "loss": 0.4784, "num_input_tokens_seen": 11441760, "step": 9420 }, { "epoch": 1.1809297080566346, "grad_norm": 0.5150968432426453, "learning_rate": 5.90402205237439e-06, "loss": 0.4601, "num_input_tokens_seen": 11448032, "step": 9425 }, { "epoch": 1.181556195965418, "grad_norm": 0.5297408699989319, "learning_rate": 5.907154491918307e-06, "loss": 0.4602, "num_input_tokens_seen": 11454048, "step": 9430 }, { "epoch": 1.1821826838742013, "grad_norm": 0.3898475468158722, "learning_rate": 5.9102869314622234e-06, "loss": 0.466, "num_input_tokens_seen": 11460064, "step": 9435 }, { "epoch": 1.1828091717829845, "grad_norm": 0.4236370921134949, "learning_rate": 5.91341937100614e-06, "loss": 0.4616, "num_input_tokens_seen": 11466112, "step": 9440 }, { "epoch": 1.183435659691768, "grad_norm": 0.36504611372947693, "learning_rate": 5.916551810550056e-06, "loss": 0.4647, "num_input_tokens_seen": 11472224, "step": 9445 }, { "epoch": 1.1840621476005513, "grad_norm": 0.5591530799865723, "learning_rate": 5.919684250093973e-06, "loss": 0.4617, "num_input_tokens_seen": 11478272, "step": 9450 }, { "epoch": 1.1846886355093347, "grad_norm": 0.39809244871139526, "learning_rate": 5.9228166896378905e-06, "loss": 0.4664, "num_input_tokens_seen": 11484416, "step": 9455 }, { "epoch": 1.185315123418118, "grad_norm": 0.4929426610469818, "learning_rate": 5.925949129181807e-06, "loss": 0.469, "num_input_tokens_seen": 11490528, "step": 9460 }, { "epoch": 1.1859416113269015, "grad_norm": 0.31802159547805786, "learning_rate": 5.929081568725724e-06, "loss": 0.46, "num_input_tokens_seen": 11496608, "step": 9465 }, { "epoch": 1.1865680992356848, "grad_norm": 0.40287768840789795, "learning_rate": 5.932214008269641e-06, "loss": 0.465, "num_input_tokens_seen": 11502368, "step": 9470 }, { "epoch": 1.187194587144468, "grad_norm": 0.3810296058654785, "learning_rate": 5.935346447813558e-06, "loss": 0.4617, "num_input_tokens_seen": 11507968, "step": 9475 }, { "epoch": 1.1878210750532514, "grad_norm": 0.29391083121299744, "learning_rate": 5.938478887357475e-06, "loss": 0.4612, "num_input_tokens_seen": 11513408, "step": 9480 }, { "epoch": 1.1884475629620348, "grad_norm": 0.7506564259529114, "learning_rate": 5.941611326901391e-06, "loss": 0.4575, "num_input_tokens_seen": 11519744, "step": 9485 }, { "epoch": 1.1890740508708182, "grad_norm": 0.26329582929611206, "learning_rate": 5.944743766445308e-06, "loss": 0.4692, "num_input_tokens_seen": 11526176, "step": 9490 }, { "epoch": 1.1897005387796016, "grad_norm": 0.3444090485572815, "learning_rate": 5.947876205989225e-06, "loss": 0.4598, "num_input_tokens_seen": 11532416, "step": 9495 }, { "epoch": 1.190327026688385, "grad_norm": 0.5265139937400818, "learning_rate": 5.951008645533141e-06, "loss": 0.4673, "num_input_tokens_seen": 11538432, "step": 9500 }, { "epoch": 1.1909535145971684, "grad_norm": 0.48366254568099976, "learning_rate": 5.954141085077059e-06, "loss": 0.4597, "num_input_tokens_seen": 11544416, "step": 9505 }, { "epoch": 1.1915800025059515, "grad_norm": 0.38757580518722534, "learning_rate": 5.957273524620976e-06, "loss": 0.4675, "num_input_tokens_seen": 11550656, "step": 9510 }, { "epoch": 1.192206490414735, "grad_norm": 0.22885194420814514, "learning_rate": 5.9604059641648925e-06, "loss": 0.4653, "num_input_tokens_seen": 11557184, "step": 9515 }, { "epoch": 1.1928329783235183, "grad_norm": 0.22808298468589783, "learning_rate": 5.963538403708808e-06, "loss": 0.4658, "num_input_tokens_seen": 11563104, "step": 9520 }, { "epoch": 1.1934594662323017, "grad_norm": 0.5363896489143372, "learning_rate": 5.966670843252725e-06, "loss": 0.4599, "num_input_tokens_seen": 11569408, "step": 9525 }, { "epoch": 1.194085954141085, "grad_norm": 0.49945068359375, "learning_rate": 5.969803282796642e-06, "loss": 0.4706, "num_input_tokens_seen": 11575488, "step": 9530 }, { "epoch": 1.1947124420498685, "grad_norm": 0.3393298089504242, "learning_rate": 5.9729357223405596e-06, "loss": 0.4677, "num_input_tokens_seen": 11581536, "step": 9535 }, { "epoch": 1.1953389299586519, "grad_norm": 0.9092710614204407, "learning_rate": 5.976068161884476e-06, "loss": 0.4547, "num_input_tokens_seen": 11587520, "step": 9540 }, { "epoch": 1.195965417867435, "grad_norm": 0.3701185882091522, "learning_rate": 5.979200601428393e-06, "loss": 0.4647, "num_input_tokens_seen": 11593344, "step": 9545 }, { "epoch": 1.1965919057762184, "grad_norm": 0.4271925389766693, "learning_rate": 5.98233304097231e-06, "loss": 0.461, "num_input_tokens_seen": 11599360, "step": 9550 }, { "epoch": 1.1972183936850018, "grad_norm": 0.5096514821052551, "learning_rate": 5.985465480516226e-06, "loss": 0.47, "num_input_tokens_seen": 11605504, "step": 9555 }, { "epoch": 1.1978448815937852, "grad_norm": 0.3704387843608856, "learning_rate": 5.988597920060143e-06, "loss": 0.4657, "num_input_tokens_seen": 11611456, "step": 9560 }, { "epoch": 1.1984713695025686, "grad_norm": 0.34823471307754517, "learning_rate": 5.99173035960406e-06, "loss": 0.4755, "num_input_tokens_seen": 11617568, "step": 9565 }, { "epoch": 1.199097857411352, "grad_norm": 0.3846196234226227, "learning_rate": 5.994862799147977e-06, "loss": 0.4604, "num_input_tokens_seen": 11623872, "step": 9570 }, { "epoch": 1.1997243453201354, "grad_norm": 0.49081969261169434, "learning_rate": 5.997995238691894e-06, "loss": 0.4585, "num_input_tokens_seen": 11629696, "step": 9575 }, { "epoch": 1.2003508332289188, "grad_norm": 0.4155828058719635, "learning_rate": 6.0011276782358104e-06, "loss": 0.4605, "num_input_tokens_seen": 11635616, "step": 9580 }, { "epoch": 1.200977321137702, "grad_norm": 0.486222505569458, "learning_rate": 6.004260117779728e-06, "loss": 0.4559, "num_input_tokens_seen": 11641728, "step": 9585 }, { "epoch": 1.2016038090464853, "grad_norm": 0.5549615621566772, "learning_rate": 6.007392557323645e-06, "loss": 0.4769, "num_input_tokens_seen": 11647776, "step": 9590 }, { "epoch": 1.2022302969552687, "grad_norm": 0.15436013042926788, "learning_rate": 6.010524996867561e-06, "loss": 0.4633, "num_input_tokens_seen": 11654080, "step": 9595 }, { "epoch": 1.2028567848640521, "grad_norm": 0.31546884775161743, "learning_rate": 6.0136574364114775e-06, "loss": 0.4617, "num_input_tokens_seen": 11660032, "step": 9600 }, { "epoch": 1.2034832727728355, "grad_norm": 0.3339388370513916, "learning_rate": 6.016789875955394e-06, "loss": 0.4578, "num_input_tokens_seen": 11666560, "step": 9605 }, { "epoch": 1.204109760681619, "grad_norm": 0.4307096302509308, "learning_rate": 6.019922315499311e-06, "loss": 0.4608, "num_input_tokens_seen": 11672640, "step": 9610 }, { "epoch": 1.2047362485904023, "grad_norm": 0.2927047312259674, "learning_rate": 6.023054755043229e-06, "loss": 0.4557, "num_input_tokens_seen": 11678912, "step": 9615 }, { "epoch": 1.2053627364991857, "grad_norm": 0.40101492404937744, "learning_rate": 6.026187194587145e-06, "loss": 0.459, "num_input_tokens_seen": 11685184, "step": 9620 }, { "epoch": 1.2059892244079689, "grad_norm": 0.42860084772109985, "learning_rate": 6.029319634131062e-06, "loss": 0.4727, "num_input_tokens_seen": 11691520, "step": 9625 }, { "epoch": 1.2066157123167522, "grad_norm": 0.5147110819816589, "learning_rate": 6.032452073674978e-06, "loss": 0.4691, "num_input_tokens_seen": 11697728, "step": 9630 }, { "epoch": 1.2072422002255356, "grad_norm": 0.42125269770622253, "learning_rate": 6.035584513218895e-06, "loss": 0.4609, "num_input_tokens_seen": 11704256, "step": 9635 }, { "epoch": 1.207868688134319, "grad_norm": 0.38409143686294556, "learning_rate": 6.038716952762812e-06, "loss": 0.4702, "num_input_tokens_seen": 11710528, "step": 9640 }, { "epoch": 1.2084951760431024, "grad_norm": 0.3528250753879547, "learning_rate": 6.041849392306729e-06, "loss": 0.4629, "num_input_tokens_seen": 11716928, "step": 9645 }, { "epoch": 1.2091216639518858, "grad_norm": 0.4357388913631439, "learning_rate": 6.044981831850646e-06, "loss": 0.4624, "num_input_tokens_seen": 11723008, "step": 9650 }, { "epoch": 1.2097481518606692, "grad_norm": 0.6046527028083801, "learning_rate": 6.048114271394563e-06, "loss": 0.4722, "num_input_tokens_seen": 11728960, "step": 9655 }, { "epoch": 1.2103746397694524, "grad_norm": 0.35837140679359436, "learning_rate": 6.0512467109384795e-06, "loss": 0.4588, "num_input_tokens_seen": 11735072, "step": 9660 }, { "epoch": 1.2110011276782358, "grad_norm": 0.3958148658275604, "learning_rate": 6.054379150482395e-06, "loss": 0.4644, "num_input_tokens_seen": 11741216, "step": 9665 }, { "epoch": 1.2116276155870191, "grad_norm": 0.37377873063087463, "learning_rate": 6.057511590026313e-06, "loss": 0.4659, "num_input_tokens_seen": 11746976, "step": 9670 }, { "epoch": 1.2122541034958025, "grad_norm": 0.32407552003860474, "learning_rate": 6.06064402957023e-06, "loss": 0.4612, "num_input_tokens_seen": 11753248, "step": 9675 }, { "epoch": 1.212880591404586, "grad_norm": 0.42263469099998474, "learning_rate": 6.0637764691141465e-06, "loss": 0.4691, "num_input_tokens_seen": 11759648, "step": 9680 }, { "epoch": 1.2135070793133693, "grad_norm": 0.6820544600486755, "learning_rate": 6.066908908658063e-06, "loss": 0.4654, "num_input_tokens_seen": 11765952, "step": 9685 }, { "epoch": 1.2141335672221527, "grad_norm": 0.49008405208587646, "learning_rate": 6.07004134820198e-06, "loss": 0.4514, "num_input_tokens_seen": 11772288, "step": 9690 }, { "epoch": 1.2147600551309359, "grad_norm": 0.6201921105384827, "learning_rate": 6.073173787745898e-06, "loss": 0.4639, "num_input_tokens_seen": 11778784, "step": 9695 }, { "epoch": 1.2153865430397193, "grad_norm": 0.926801860332489, "learning_rate": 6.0763062272898144e-06, "loss": 0.4722, "num_input_tokens_seen": 11785024, "step": 9700 }, { "epoch": 1.2160130309485027, "grad_norm": 0.3695645332336426, "learning_rate": 6.07943866683373e-06, "loss": 0.4619, "num_input_tokens_seen": 11790976, "step": 9705 }, { "epoch": 1.216639518857286, "grad_norm": 0.6191138029098511, "learning_rate": 6.082571106377647e-06, "loss": 0.4551, "num_input_tokens_seen": 11797152, "step": 9710 }, { "epoch": 1.2172660067660694, "grad_norm": 0.1967829316854477, "learning_rate": 6.085703545921564e-06, "loss": 0.4699, "num_input_tokens_seen": 11803104, "step": 9715 }, { "epoch": 1.2178924946748528, "grad_norm": 0.39541518688201904, "learning_rate": 6.088835985465481e-06, "loss": 0.4578, "num_input_tokens_seen": 11809280, "step": 9720 }, { "epoch": 1.2185189825836362, "grad_norm": 0.5204310417175293, "learning_rate": 6.091968425009398e-06, "loss": 0.4631, "num_input_tokens_seen": 11816032, "step": 9725 }, { "epoch": 1.2191454704924194, "grad_norm": 0.424515962600708, "learning_rate": 6.095100864553315e-06, "loss": 0.4681, "num_input_tokens_seen": 11822048, "step": 9730 }, { "epoch": 1.2197719584012028, "grad_norm": 0.39961180090904236, "learning_rate": 6.098233304097232e-06, "loss": 0.4533, "num_input_tokens_seen": 11828224, "step": 9735 }, { "epoch": 1.2203984463099862, "grad_norm": 0.6936893463134766, "learning_rate": 6.101365743641148e-06, "loss": 0.4572, "num_input_tokens_seen": 11834464, "step": 9740 }, { "epoch": 1.2210249342187696, "grad_norm": 0.46321603655815125, "learning_rate": 6.1044981831850645e-06, "loss": 0.4536, "num_input_tokens_seen": 11840512, "step": 9745 }, { "epoch": 1.221651422127553, "grad_norm": 0.5114815831184387, "learning_rate": 6.107630622728982e-06, "loss": 0.4663, "num_input_tokens_seen": 11846944, "step": 9750 }, { "epoch": 1.2222779100363363, "grad_norm": 0.5346404314041138, "learning_rate": 6.110763062272899e-06, "loss": 0.4583, "num_input_tokens_seen": 11853344, "step": 9755 }, { "epoch": 1.2229043979451197, "grad_norm": 0.5384147763252258, "learning_rate": 6.113895501816816e-06, "loss": 0.4636, "num_input_tokens_seen": 11859520, "step": 9760 }, { "epoch": 1.223530885853903, "grad_norm": 0.4416428804397583, "learning_rate": 6.117027941360732e-06, "loss": 0.4711, "num_input_tokens_seen": 11865504, "step": 9765 }, { "epoch": 1.2241573737626863, "grad_norm": 0.3364766836166382, "learning_rate": 6.120160380904649e-06, "loss": 0.4534, "num_input_tokens_seen": 11871744, "step": 9770 }, { "epoch": 1.2247838616714697, "grad_norm": 0.39729318022727966, "learning_rate": 6.123292820448565e-06, "loss": 0.4666, "num_input_tokens_seen": 11877408, "step": 9775 }, { "epoch": 1.225410349580253, "grad_norm": 0.4508865773677826, "learning_rate": 6.126425259992483e-06, "loss": 0.4746, "num_input_tokens_seen": 11884096, "step": 9780 }, { "epoch": 1.2260368374890365, "grad_norm": 0.16481539607048035, "learning_rate": 6.129557699536399e-06, "loss": 0.4573, "num_input_tokens_seen": 11890272, "step": 9785 }, { "epoch": 1.2266633253978199, "grad_norm": 0.4043736755847931, "learning_rate": 6.132690139080316e-06, "loss": 0.4625, "num_input_tokens_seen": 11896320, "step": 9790 }, { "epoch": 1.2272898133066032, "grad_norm": 0.29326337575912476, "learning_rate": 6.135822578624233e-06, "loss": 0.4604, "num_input_tokens_seen": 11902688, "step": 9795 }, { "epoch": 1.2279163012153864, "grad_norm": 0.3004530668258667, "learning_rate": 6.13895501816815e-06, "loss": 0.4672, "num_input_tokens_seen": 11908640, "step": 9800 }, { "epoch": 1.2285427891241698, "grad_norm": 0.3207574188709259, "learning_rate": 6.142087457712067e-06, "loss": 0.4635, "num_input_tokens_seen": 11914592, "step": 9805 }, { "epoch": 1.2291692770329532, "grad_norm": 0.38687342405319214, "learning_rate": 6.145219897255984e-06, "loss": 0.449, "num_input_tokens_seen": 11920928, "step": 9810 }, { "epoch": 1.2297957649417366, "grad_norm": 0.3269575238227844, "learning_rate": 6.1483523367999e-06, "loss": 0.4646, "num_input_tokens_seen": 11927264, "step": 9815 }, { "epoch": 1.23042225285052, "grad_norm": 0.36731424927711487, "learning_rate": 6.151484776343817e-06, "loss": 0.468, "num_input_tokens_seen": 11933248, "step": 9820 }, { "epoch": 1.2310487407593034, "grad_norm": 0.42601799964904785, "learning_rate": 6.1546172158877335e-06, "loss": 0.4499, "num_input_tokens_seen": 11938848, "step": 9825 }, { "epoch": 1.2316752286680868, "grad_norm": 0.46901756525039673, "learning_rate": 6.157749655431651e-06, "loss": 0.4673, "num_input_tokens_seen": 11945152, "step": 9830 }, { "epoch": 1.2323017165768702, "grad_norm": 0.45729953050613403, "learning_rate": 6.160882094975568e-06, "loss": 0.4631, "num_input_tokens_seen": 11950848, "step": 9835 }, { "epoch": 1.2329282044856535, "grad_norm": 0.42106199264526367, "learning_rate": 6.164014534519485e-06, "loss": 0.4638, "num_input_tokens_seen": 11957152, "step": 9840 }, { "epoch": 1.2335546923944367, "grad_norm": 0.6978309154510498, "learning_rate": 6.167146974063401e-06, "loss": 0.4711, "num_input_tokens_seen": 11963328, "step": 9845 }, { "epoch": 1.23418118030322, "grad_norm": 0.36591947078704834, "learning_rate": 6.170279413607317e-06, "loss": 0.4628, "num_input_tokens_seen": 11969408, "step": 9850 }, { "epoch": 1.2348076682120035, "grad_norm": 0.30824723839759827, "learning_rate": 6.173411853151234e-06, "loss": 0.4696, "num_input_tokens_seen": 11975552, "step": 9855 }, { "epoch": 1.2354341561207869, "grad_norm": 0.6424552798271179, "learning_rate": 6.176544292695152e-06, "loss": 0.4547, "num_input_tokens_seen": 11981888, "step": 9860 }, { "epoch": 1.2360606440295703, "grad_norm": 0.41308319568634033, "learning_rate": 6.1796767322390685e-06, "loss": 0.456, "num_input_tokens_seen": 11987712, "step": 9865 }, { "epoch": 1.2366871319383537, "grad_norm": 0.293210506439209, "learning_rate": 6.182809171782985e-06, "loss": 0.4583, "num_input_tokens_seen": 11993888, "step": 9870 }, { "epoch": 1.237313619847137, "grad_norm": 0.4413658678531647, "learning_rate": 6.185941611326902e-06, "loss": 0.4665, "num_input_tokens_seen": 11999904, "step": 9875 }, { "epoch": 1.2379401077559202, "grad_norm": 0.4158181846141815, "learning_rate": 6.189074050870819e-06, "loss": 0.4655, "num_input_tokens_seen": 12005888, "step": 9880 }, { "epoch": 1.2385665956647036, "grad_norm": 0.42920687794685364, "learning_rate": 6.192206490414735e-06, "loss": 0.4573, "num_input_tokens_seen": 12011776, "step": 9885 }, { "epoch": 1.239193083573487, "grad_norm": 0.3073367774486542, "learning_rate": 6.195338929958652e-06, "loss": 0.4673, "num_input_tokens_seen": 12017888, "step": 9890 }, { "epoch": 1.2398195714822704, "grad_norm": 0.4167298972606659, "learning_rate": 6.198471369502569e-06, "loss": 0.4692, "num_input_tokens_seen": 12023552, "step": 9895 }, { "epoch": 1.2404460593910538, "grad_norm": 0.3088783025741577, "learning_rate": 6.201603809046486e-06, "loss": 0.4625, "num_input_tokens_seen": 12029568, "step": 9900 }, { "epoch": 1.2410725472998372, "grad_norm": 0.2709609568119049, "learning_rate": 6.2047362485904026e-06, "loss": 0.4623, "num_input_tokens_seen": 12035552, "step": 9905 }, { "epoch": 1.2416990352086206, "grad_norm": 0.4197843670845032, "learning_rate": 6.20786868813432e-06, "loss": 0.4655, "num_input_tokens_seen": 12041376, "step": 9910 }, { "epoch": 1.2423255231174037, "grad_norm": 0.16437502205371857, "learning_rate": 6.211001127678237e-06, "loss": 0.4678, "num_input_tokens_seen": 12047744, "step": 9915 }, { "epoch": 1.2429520110261871, "grad_norm": 0.505615234375, "learning_rate": 6.214133567222154e-06, "loss": 0.4649, "num_input_tokens_seen": 12053664, "step": 9920 }, { "epoch": 1.2435784989349705, "grad_norm": 0.3004973828792572, "learning_rate": 6.21726600676607e-06, "loss": 0.4573, "num_input_tokens_seen": 12059936, "step": 9925 }, { "epoch": 1.244204986843754, "grad_norm": 0.4345720708370209, "learning_rate": 6.220398446309986e-06, "loss": 0.4654, "num_input_tokens_seen": 12066112, "step": 9930 }, { "epoch": 1.2448314747525373, "grad_norm": 0.3092924952507019, "learning_rate": 6.223530885853903e-06, "loss": 0.4652, "num_input_tokens_seen": 12072128, "step": 9935 }, { "epoch": 1.2454579626613207, "grad_norm": 0.278592973947525, "learning_rate": 6.226663325397821e-06, "loss": 0.4482, "num_input_tokens_seen": 12078336, "step": 9940 }, { "epoch": 1.246084450570104, "grad_norm": 0.43487393856048584, "learning_rate": 6.2297957649417375e-06, "loss": 0.4636, "num_input_tokens_seen": 12084384, "step": 9945 }, { "epoch": 1.2467109384788873, "grad_norm": 0.8529083728790283, "learning_rate": 6.232928204485654e-06, "loss": 0.4698, "num_input_tokens_seen": 12090176, "step": 9950 }, { "epoch": 1.2473374263876706, "grad_norm": 0.7177672386169434, "learning_rate": 6.236060644029571e-06, "loss": 0.454, "num_input_tokens_seen": 12096544, "step": 9955 }, { "epoch": 1.247963914296454, "grad_norm": 0.44681406021118164, "learning_rate": 6.239193083573487e-06, "loss": 0.4619, "num_input_tokens_seen": 12102368, "step": 9960 }, { "epoch": 1.2485904022052374, "grad_norm": 0.4865899980068207, "learning_rate": 6.242325523117404e-06, "loss": 0.4534, "num_input_tokens_seen": 12108672, "step": 9965 }, { "epoch": 1.2492168901140208, "grad_norm": 0.35977470874786377, "learning_rate": 6.245457962661321e-06, "loss": 0.4617, "num_input_tokens_seen": 12114784, "step": 9970 }, { "epoch": 1.2498433780228042, "grad_norm": 0.3724845051765442, "learning_rate": 6.248590402205238e-06, "loss": 0.4879, "num_input_tokens_seen": 12120928, "step": 9975 }, { "epoch": 1.2504698659315876, "grad_norm": 0.36581671237945557, "learning_rate": 6.251722841749155e-06, "loss": 0.4721, "num_input_tokens_seen": 12127488, "step": 9980 }, { "epoch": 1.2510963538403708, "grad_norm": 0.3823220431804657, "learning_rate": 6.254855281293072e-06, "loss": 0.462, "num_input_tokens_seen": 12133760, "step": 9985 }, { "epoch": 1.2517228417491542, "grad_norm": 0.36221006512641907, "learning_rate": 6.257987720836988e-06, "loss": 0.4571, "num_input_tokens_seen": 12140000, "step": 9990 }, { "epoch": 1.2523493296579375, "grad_norm": 0.31518203020095825, "learning_rate": 6.261120160380904e-06, "loss": 0.4619, "num_input_tokens_seen": 12146304, "step": 9995 }, { "epoch": 1.252975817566721, "grad_norm": 0.3623907268047333, "learning_rate": 6.264252599924822e-06, "loss": 0.4619, "num_input_tokens_seen": 12152320, "step": 10000 }, { "epoch": 1.2536023054755043, "grad_norm": 0.519658625125885, "learning_rate": 6.267385039468739e-06, "loss": 0.4644, "num_input_tokens_seen": 12158368, "step": 10005 }, { "epoch": 1.2542287933842877, "grad_norm": 0.27351781725883484, "learning_rate": 6.2705174790126554e-06, "loss": 0.4701, "num_input_tokens_seen": 12164288, "step": 10010 }, { "epoch": 1.2548552812930711, "grad_norm": 0.3081611096858978, "learning_rate": 6.273649918556572e-06, "loss": 0.458, "num_input_tokens_seen": 12170688, "step": 10015 }, { "epoch": 1.2554817692018543, "grad_norm": 0.3033650517463684, "learning_rate": 6.27678235810049e-06, "loss": 0.4617, "num_input_tokens_seen": 12176768, "step": 10020 }, { "epoch": 1.256108257110638, "grad_norm": 0.33208075165748596, "learning_rate": 6.2799147976444066e-06, "loss": 0.4613, "num_input_tokens_seen": 12182976, "step": 10025 }, { "epoch": 1.256734745019421, "grad_norm": 0.32248592376708984, "learning_rate": 6.283047237188323e-06, "loss": 0.4624, "num_input_tokens_seen": 12189344, "step": 10030 }, { "epoch": 1.2573612329282045, "grad_norm": 0.6812340617179871, "learning_rate": 6.286179676732239e-06, "loss": 0.4642, "num_input_tokens_seen": 12195424, "step": 10035 }, { "epoch": 1.2579877208369878, "grad_norm": 0.1873571127653122, "learning_rate": 6.289312116276156e-06, "loss": 0.477, "num_input_tokens_seen": 12200864, "step": 10040 }, { "epoch": 1.2586142087457712, "grad_norm": 0.425643652677536, "learning_rate": 6.292444555820073e-06, "loss": 0.4552, "num_input_tokens_seen": 12207072, "step": 10045 }, { "epoch": 1.2592406966545546, "grad_norm": 0.2974642515182495, "learning_rate": 6.29557699536399e-06, "loss": 0.4532, "num_input_tokens_seen": 12213024, "step": 10050 }, { "epoch": 1.2598671845633378, "grad_norm": 0.29509928822517395, "learning_rate": 6.298709434907907e-06, "loss": 0.4562, "num_input_tokens_seen": 12219040, "step": 10055 }, { "epoch": 1.2604936724721214, "grad_norm": 0.29161158204078674, "learning_rate": 6.301841874451824e-06, "loss": 0.4642, "num_input_tokens_seen": 12225280, "step": 10060 }, { "epoch": 1.2611201603809046, "grad_norm": 0.16247834265232086, "learning_rate": 6.304974313995741e-06, "loss": 0.4626, "num_input_tokens_seen": 12231296, "step": 10065 }, { "epoch": 1.261746648289688, "grad_norm": 0.3140160143375397, "learning_rate": 6.308106753539657e-06, "loss": 0.4778, "num_input_tokens_seen": 12237632, "step": 10070 }, { "epoch": 1.2623731361984714, "grad_norm": 0.1985677033662796, "learning_rate": 6.311239193083573e-06, "loss": 0.4613, "num_input_tokens_seen": 12243904, "step": 10075 }, { "epoch": 1.2629996241072547, "grad_norm": 0.32527586817741394, "learning_rate": 6.314371632627491e-06, "loss": 0.4566, "num_input_tokens_seen": 12249632, "step": 10080 }, { "epoch": 1.2636261120160381, "grad_norm": 0.2896025478839874, "learning_rate": 6.317504072171408e-06, "loss": 0.4658, "num_input_tokens_seen": 12255808, "step": 10085 }, { "epoch": 1.2642525999248215, "grad_norm": 0.12361978739500046, "learning_rate": 6.3206365117153245e-06, "loss": 0.4666, "num_input_tokens_seen": 12261664, "step": 10090 }, { "epoch": 1.264879087833605, "grad_norm": 0.48677313327789307, "learning_rate": 6.323768951259241e-06, "loss": 0.464, "num_input_tokens_seen": 12267616, "step": 10095 }, { "epoch": 1.265505575742388, "grad_norm": 0.5084523558616638, "learning_rate": 6.326901390803159e-06, "loss": 0.4672, "num_input_tokens_seen": 12272896, "step": 10100 }, { "epoch": 1.2661320636511715, "grad_norm": 0.3052109181880951, "learning_rate": 6.330033830347074e-06, "loss": 0.4629, "num_input_tokens_seen": 12278496, "step": 10105 }, { "epoch": 1.2667585515599549, "grad_norm": 0.12470072507858276, "learning_rate": 6.3331662698909915e-06, "loss": 0.4591, "num_input_tokens_seen": 12284704, "step": 10110 }, { "epoch": 1.2673850394687383, "grad_norm": 0.6615132689476013, "learning_rate": 6.336298709434908e-06, "loss": 0.4674, "num_input_tokens_seen": 12290112, "step": 10115 }, { "epoch": 1.2680115273775217, "grad_norm": 0.25842946767807007, "learning_rate": 6.339431148978825e-06, "loss": 0.4637, "num_input_tokens_seen": 12296224, "step": 10120 }, { "epoch": 1.268638015286305, "grad_norm": 0.2777446508407593, "learning_rate": 6.342563588522742e-06, "loss": 0.4643, "num_input_tokens_seen": 12302592, "step": 10125 }, { "epoch": 1.2692645031950884, "grad_norm": 0.14006327092647552, "learning_rate": 6.3456960280666594e-06, "loss": 0.4577, "num_input_tokens_seen": 12308928, "step": 10130 }, { "epoch": 1.2698909911038716, "grad_norm": 0.312757670879364, "learning_rate": 6.348828467610576e-06, "loss": 0.4599, "num_input_tokens_seen": 12314848, "step": 10135 }, { "epoch": 1.270517479012655, "grad_norm": 0.3047421872615814, "learning_rate": 6.351960907154493e-06, "loss": 0.4638, "num_input_tokens_seen": 12320896, "step": 10140 }, { "epoch": 1.2711439669214384, "grad_norm": 0.2793825566768646, "learning_rate": 6.355093346698409e-06, "loss": 0.4643, "num_input_tokens_seen": 12327392, "step": 10145 }, { "epoch": 1.2717704548302218, "grad_norm": 0.36323922872543335, "learning_rate": 6.358225786242326e-06, "loss": 0.4591, "num_input_tokens_seen": 12333568, "step": 10150 }, { "epoch": 1.2723969427390052, "grad_norm": 0.3766813576221466, "learning_rate": 6.361358225786242e-06, "loss": 0.4604, "num_input_tokens_seen": 12339776, "step": 10155 }, { "epoch": 1.2730234306477886, "grad_norm": 0.4673970639705658, "learning_rate": 6.36449066533016e-06, "loss": 0.4675, "num_input_tokens_seen": 12345760, "step": 10160 }, { "epoch": 1.273649918556572, "grad_norm": 0.25060421228408813, "learning_rate": 6.367623104874077e-06, "loss": 0.4601, "num_input_tokens_seen": 12352032, "step": 10165 }, { "epoch": 1.2742764064653551, "grad_norm": 0.2819884121417999, "learning_rate": 6.3707555444179935e-06, "loss": 0.4621, "num_input_tokens_seen": 12358464, "step": 10170 }, { "epoch": 1.2749028943741385, "grad_norm": 0.09459830075502396, "learning_rate": 6.37388798396191e-06, "loss": 0.4657, "num_input_tokens_seen": 12364128, "step": 10175 }, { "epoch": 1.275529382282922, "grad_norm": 0.3791668117046356, "learning_rate": 6.377020423505826e-06, "loss": 0.4729, "num_input_tokens_seen": 12370272, "step": 10180 }, { "epoch": 1.2761558701917053, "grad_norm": 0.2684851586818695, "learning_rate": 6.380152863049743e-06, "loss": 0.4661, "num_input_tokens_seen": 12376352, "step": 10185 }, { "epoch": 1.2767823581004887, "grad_norm": 0.24031378328800201, "learning_rate": 6.383285302593661e-06, "loss": 0.4626, "num_input_tokens_seen": 12382528, "step": 10190 }, { "epoch": 1.277408846009272, "grad_norm": 0.26519954204559326, "learning_rate": 6.386417742137577e-06, "loss": 0.4598, "num_input_tokens_seen": 12388672, "step": 10195 }, { "epoch": 1.2780353339180555, "grad_norm": 0.14491309225559235, "learning_rate": 6.389550181681494e-06, "loss": 0.4633, "num_input_tokens_seen": 12394944, "step": 10200 }, { "epoch": 1.2786618218268386, "grad_norm": 0.3866775333881378, "learning_rate": 6.392682621225411e-06, "loss": 0.4679, "num_input_tokens_seen": 12400320, "step": 10205 }, { "epoch": 1.279288309735622, "grad_norm": 0.3396012485027313, "learning_rate": 6.3958150607693285e-06, "loss": 0.4668, "num_input_tokens_seen": 12405376, "step": 10210 }, { "epoch": 1.2799147976444054, "grad_norm": 0.4808851182460785, "learning_rate": 6.3989475003132436e-06, "loss": 0.4627, "num_input_tokens_seen": 12411680, "step": 10215 }, { "epoch": 1.2805412855531888, "grad_norm": 0.27598628401756287, "learning_rate": 6.402079939857161e-06, "loss": 0.4622, "num_input_tokens_seen": 12417760, "step": 10220 }, { "epoch": 1.2811677734619722, "grad_norm": 0.233274444937706, "learning_rate": 6.405212379401078e-06, "loss": 0.4571, "num_input_tokens_seen": 12423904, "step": 10225 }, { "epoch": 1.2817942613707556, "grad_norm": 0.35461026430130005, "learning_rate": 6.408344818944995e-06, "loss": 0.4638, "num_input_tokens_seen": 12429920, "step": 10230 }, { "epoch": 1.282420749279539, "grad_norm": 0.4627736508846283, "learning_rate": 6.4114772584889115e-06, "loss": 0.4621, "num_input_tokens_seen": 12436352, "step": 10235 }, { "epoch": 1.2830472371883221, "grad_norm": 0.4857056438922882, "learning_rate": 6.414609698032829e-06, "loss": 0.4528, "num_input_tokens_seen": 12442432, "step": 10240 }, { "epoch": 1.2836737250971058, "grad_norm": 0.6163725852966309, "learning_rate": 6.417742137576746e-06, "loss": 0.4721, "num_input_tokens_seen": 12448576, "step": 10245 }, { "epoch": 1.284300213005889, "grad_norm": 0.36302658915519714, "learning_rate": 6.420874577120663e-06, "loss": 0.4598, "num_input_tokens_seen": 12454720, "step": 10250 }, { "epoch": 1.2849267009146723, "grad_norm": 0.37621036171913147, "learning_rate": 6.4240070166645785e-06, "loss": 0.4623, "num_input_tokens_seen": 12461024, "step": 10255 }, { "epoch": 1.2855531888234557, "grad_norm": 0.5057052969932556, "learning_rate": 6.427139456208495e-06, "loss": 0.4666, "num_input_tokens_seen": 12467072, "step": 10260 }, { "epoch": 1.286179676732239, "grad_norm": 0.47577163577079773, "learning_rate": 6.430271895752412e-06, "loss": 0.4787, "num_input_tokens_seen": 12472992, "step": 10265 }, { "epoch": 1.2868061646410225, "grad_norm": 0.4554196000099182, "learning_rate": 6.43340433529633e-06, "loss": 0.4619, "num_input_tokens_seen": 12479136, "step": 10270 }, { "epoch": 1.2874326525498057, "grad_norm": 0.5860764980316162, "learning_rate": 6.436536774840246e-06, "loss": 0.4639, "num_input_tokens_seen": 12485216, "step": 10275 }, { "epoch": 1.2880591404585893, "grad_norm": 0.1563487946987152, "learning_rate": 6.439669214384163e-06, "loss": 0.4604, "num_input_tokens_seen": 12491488, "step": 10280 }, { "epoch": 1.2886856283673724, "grad_norm": 0.4063849449157715, "learning_rate": 6.44280165392808e-06, "loss": 0.4647, "num_input_tokens_seen": 12497888, "step": 10285 }, { "epoch": 1.2893121162761558, "grad_norm": 0.2981390357017517, "learning_rate": 6.445934093471996e-06, "loss": 0.4611, "num_input_tokens_seen": 12504192, "step": 10290 }, { "epoch": 1.2899386041849392, "grad_norm": 0.3822430968284607, "learning_rate": 6.449066533015913e-06, "loss": 0.4604, "num_input_tokens_seen": 12510368, "step": 10295 }, { "epoch": 1.2905650920937226, "grad_norm": 0.2642672061920166, "learning_rate": 6.45219897255983e-06, "loss": 0.473, "num_input_tokens_seen": 12516640, "step": 10300 }, { "epoch": 1.291191580002506, "grad_norm": 0.2512097954750061, "learning_rate": 6.455331412103747e-06, "loss": 0.4594, "num_input_tokens_seen": 12523008, "step": 10305 }, { "epoch": 1.2918180679112894, "grad_norm": 0.5094199180603027, "learning_rate": 6.458463851647664e-06, "loss": 0.4739, "num_input_tokens_seen": 12529152, "step": 10310 }, { "epoch": 1.2924445558200728, "grad_norm": 0.30584967136383057, "learning_rate": 6.4615962911915805e-06, "loss": 0.4657, "num_input_tokens_seen": 12535488, "step": 10315 }, { "epoch": 1.293071043728856, "grad_norm": 0.277985155582428, "learning_rate": 6.464728730735498e-06, "loss": 0.4716, "num_input_tokens_seen": 12541856, "step": 10320 }, { "epoch": 1.2936975316376393, "grad_norm": 0.3569045960903168, "learning_rate": 6.467861170279413e-06, "loss": 0.4626, "num_input_tokens_seen": 12547968, "step": 10325 }, { "epoch": 1.2943240195464227, "grad_norm": 0.3303588628768921, "learning_rate": 6.470993609823331e-06, "loss": 0.4621, "num_input_tokens_seen": 12554080, "step": 10330 }, { "epoch": 1.2949505074552061, "grad_norm": 0.40408840775489807, "learning_rate": 6.474126049367248e-06, "loss": 0.4653, "num_input_tokens_seen": 12560256, "step": 10335 }, { "epoch": 1.2955769953639895, "grad_norm": 0.27112728357315063, "learning_rate": 6.477258488911164e-06, "loss": 0.4611, "num_input_tokens_seen": 12566336, "step": 10340 }, { "epoch": 1.296203483272773, "grad_norm": 0.13020573556423187, "learning_rate": 6.480390928455081e-06, "loss": 0.4616, "num_input_tokens_seen": 12572576, "step": 10345 }, { "epoch": 1.2968299711815563, "grad_norm": 0.3220938742160797, "learning_rate": 6.483523367998999e-06, "loss": 0.4671, "num_input_tokens_seen": 12578784, "step": 10350 }, { "epoch": 1.2974564590903395, "grad_norm": 0.34023016691207886, "learning_rate": 6.4866558075429155e-06, "loss": 0.4664, "num_input_tokens_seen": 12585056, "step": 10355 }, { "epoch": 1.2980829469991229, "grad_norm": 0.5467774271965027, "learning_rate": 6.489788247086832e-06, "loss": 0.4714, "num_input_tokens_seen": 12591296, "step": 10360 }, { "epoch": 1.2987094349079062, "grad_norm": 0.3098849952220917, "learning_rate": 6.492920686630748e-06, "loss": 0.4669, "num_input_tokens_seen": 12596896, "step": 10365 }, { "epoch": 1.2993359228166896, "grad_norm": 0.2210836410522461, "learning_rate": 6.496053126174665e-06, "loss": 0.4621, "num_input_tokens_seen": 12603008, "step": 10370 }, { "epoch": 1.299962410725473, "grad_norm": 0.30762478709220886, "learning_rate": 6.499185565718582e-06, "loss": 0.4606, "num_input_tokens_seen": 12609216, "step": 10375 }, { "epoch": 1.3005888986342564, "grad_norm": 0.4362759292125702, "learning_rate": 6.502318005262499e-06, "loss": 0.4617, "num_input_tokens_seen": 12615328, "step": 10380 }, { "epoch": 1.3012153865430398, "grad_norm": 0.2601485848426819, "learning_rate": 6.505450444806416e-06, "loss": 0.4588, "num_input_tokens_seen": 12621152, "step": 10385 }, { "epoch": 1.301841874451823, "grad_norm": 0.5376319289207458, "learning_rate": 6.508582884350333e-06, "loss": 0.4629, "num_input_tokens_seen": 12627520, "step": 10390 }, { "epoch": 1.3024683623606064, "grad_norm": 0.28499945998191833, "learning_rate": 6.51171532389425e-06, "loss": 0.4682, "num_input_tokens_seen": 12633504, "step": 10395 }, { "epoch": 1.3030948502693898, "grad_norm": 0.2553873062133789, "learning_rate": 6.5148477634381655e-06, "loss": 0.4686, "num_input_tokens_seen": 12639776, "step": 10400 }, { "epoch": 1.3037213381781732, "grad_norm": 0.2939710021018982, "learning_rate": 6.517980202982082e-06, "loss": 0.4631, "num_input_tokens_seen": 12645632, "step": 10405 }, { "epoch": 1.3043478260869565, "grad_norm": 0.40871095657348633, "learning_rate": 6.521112642526e-06, "loss": 0.4673, "num_input_tokens_seen": 12652192, "step": 10410 }, { "epoch": 1.30497431399574, "grad_norm": 0.4663410186767578, "learning_rate": 6.524245082069917e-06, "loss": 0.4657, "num_input_tokens_seen": 12658112, "step": 10415 }, { "epoch": 1.3056008019045233, "grad_norm": 0.2766273617744446, "learning_rate": 6.527377521613833e-06, "loss": 0.4651, "num_input_tokens_seen": 12663264, "step": 10420 }, { "epoch": 1.3062272898133065, "grad_norm": 0.22556446492671967, "learning_rate": 6.53050996115775e-06, "loss": 0.4635, "num_input_tokens_seen": 12669504, "step": 10425 }, { "epoch": 1.3068537777220899, "grad_norm": 0.22315432131290436, "learning_rate": 6.533642400701668e-06, "loss": 0.4613, "num_input_tokens_seen": 12675872, "step": 10430 }, { "epoch": 1.3074802656308733, "grad_norm": 0.41653716564178467, "learning_rate": 6.536774840245584e-06, "loss": 0.4655, "num_input_tokens_seen": 12681952, "step": 10435 }, { "epoch": 1.3081067535396567, "grad_norm": 0.2371828705072403, "learning_rate": 6.5399072797895004e-06, "loss": 0.4645, "num_input_tokens_seen": 12688448, "step": 10440 }, { "epoch": 1.30873324144844, "grad_norm": 0.10200830549001694, "learning_rate": 6.543039719333417e-06, "loss": 0.463, "num_input_tokens_seen": 12694112, "step": 10445 }, { "epoch": 1.3093597293572234, "grad_norm": 0.3156339228153229, "learning_rate": 6.546172158877334e-06, "loss": 0.4671, "num_input_tokens_seen": 12700096, "step": 10450 }, { "epoch": 1.3099862172660068, "grad_norm": 0.21371135115623474, "learning_rate": 6.549304598421251e-06, "loss": 0.4624, "num_input_tokens_seen": 12705792, "step": 10455 }, { "epoch": 1.31061270517479, "grad_norm": 0.31601545214653015, "learning_rate": 6.552437037965168e-06, "loss": 0.4676, "num_input_tokens_seen": 12711584, "step": 10460 }, { "epoch": 1.3112391930835736, "grad_norm": 0.21312355995178223, "learning_rate": 6.555569477509085e-06, "loss": 0.4608, "num_input_tokens_seen": 12717696, "step": 10465 }, { "epoch": 1.3118656809923568, "grad_norm": 0.24864894151687622, "learning_rate": 6.558701917053002e-06, "loss": 0.4654, "num_input_tokens_seen": 12723808, "step": 10470 }, { "epoch": 1.3124921689011402, "grad_norm": 0.14739753305912018, "learning_rate": 6.561834356596918e-06, "loss": 0.4647, "num_input_tokens_seen": 12730080, "step": 10475 }, { "epoch": 1.3131186568099236, "grad_norm": 0.2588900923728943, "learning_rate": 6.5649667961408346e-06, "loss": 0.4559, "num_input_tokens_seen": 12735648, "step": 10480 }, { "epoch": 1.313745144718707, "grad_norm": 0.3075890839099884, "learning_rate": 6.568099235684751e-06, "loss": 0.4604, "num_input_tokens_seen": 12741696, "step": 10485 }, { "epoch": 1.3143716326274903, "grad_norm": 0.28565821051597595, "learning_rate": 6.571231675228669e-06, "loss": 0.4623, "num_input_tokens_seen": 12747168, "step": 10490 }, { "epoch": 1.3149981205362735, "grad_norm": 0.48078063130378723, "learning_rate": 6.574364114772586e-06, "loss": 0.4635, "num_input_tokens_seen": 12753152, "step": 10495 }, { "epoch": 1.3156246084450571, "grad_norm": 0.3967600166797638, "learning_rate": 6.5774965543165025e-06, "loss": 0.4579, "num_input_tokens_seen": 12759168, "step": 10500 }, { "epoch": 1.3162510963538403, "grad_norm": 0.4412720203399658, "learning_rate": 6.580628993860419e-06, "loss": 0.4637, "num_input_tokens_seen": 12765120, "step": 10505 }, { "epoch": 1.3168775842626237, "grad_norm": 0.5137041807174683, "learning_rate": 6.583761433404335e-06, "loss": 0.466, "num_input_tokens_seen": 12771200, "step": 10510 }, { "epoch": 1.317504072171407, "grad_norm": 0.4372444450855255, "learning_rate": 6.586893872948253e-06, "loss": 0.4584, "num_input_tokens_seen": 12776960, "step": 10515 }, { "epoch": 1.3181305600801905, "grad_norm": 0.29541051387786865, "learning_rate": 6.5900263124921695e-06, "loss": 0.4711, "num_input_tokens_seen": 12783168, "step": 10520 }, { "epoch": 1.3187570479889739, "grad_norm": 0.1844222992658615, "learning_rate": 6.593158752036086e-06, "loss": 0.4618, "num_input_tokens_seen": 12789280, "step": 10525 }, { "epoch": 1.3193835358977573, "grad_norm": 0.3745502233505249, "learning_rate": 6.596291191580003e-06, "loss": 0.4598, "num_input_tokens_seen": 12795616, "step": 10530 }, { "epoch": 1.3200100238065406, "grad_norm": 0.332769513130188, "learning_rate": 6.59942363112392e-06, "loss": 0.4558, "num_input_tokens_seen": 12801984, "step": 10535 }, { "epoch": 1.3206365117153238, "grad_norm": 0.3941710591316223, "learning_rate": 6.602556070667837e-06, "loss": 0.4683, "num_input_tokens_seen": 12808544, "step": 10540 }, { "epoch": 1.3212629996241072, "grad_norm": 0.4001084864139557, "learning_rate": 6.605688510211753e-06, "loss": 0.4655, "num_input_tokens_seen": 12814816, "step": 10545 }, { "epoch": 1.3218894875328906, "grad_norm": 0.43949785828590393, "learning_rate": 6.60882094975567e-06, "loss": 0.473, "num_input_tokens_seen": 12820544, "step": 10550 }, { "epoch": 1.322515975441674, "grad_norm": 0.4753691852092743, "learning_rate": 6.611953389299587e-06, "loss": 0.4603, "num_input_tokens_seen": 12826720, "step": 10555 }, { "epoch": 1.3231424633504574, "grad_norm": 0.5450232028961182, "learning_rate": 6.615085828843504e-06, "loss": 0.464, "num_input_tokens_seen": 12832544, "step": 10560 }, { "epoch": 1.3237689512592408, "grad_norm": 0.5172916650772095, "learning_rate": 6.61821826838742e-06, "loss": 0.4651, "num_input_tokens_seen": 12838112, "step": 10565 }, { "epoch": 1.3243954391680242, "grad_norm": 0.5821904540061951, "learning_rate": 6.621350707931338e-06, "loss": 0.461, "num_input_tokens_seen": 12844416, "step": 10570 }, { "epoch": 1.3250219270768073, "grad_norm": 0.4391019344329834, "learning_rate": 6.624483147475255e-06, "loss": 0.4605, "num_input_tokens_seen": 12850304, "step": 10575 }, { "epoch": 1.3256484149855907, "grad_norm": 0.6164627075195312, "learning_rate": 6.6276155870191715e-06, "loss": 0.4428, "num_input_tokens_seen": 12855904, "step": 10580 }, { "epoch": 1.326274902894374, "grad_norm": 0.29103633761405945, "learning_rate": 6.6307480265630874e-06, "loss": 0.4716, "num_input_tokens_seen": 12862016, "step": 10585 }, { "epoch": 1.3269013908031575, "grad_norm": 0.7590137124061584, "learning_rate": 6.633880466107004e-06, "loss": 0.4539, "num_input_tokens_seen": 12867968, "step": 10590 }, { "epoch": 1.327527878711941, "grad_norm": 0.5853791236877441, "learning_rate": 6.637012905650922e-06, "loss": 0.4581, "num_input_tokens_seen": 12874176, "step": 10595 }, { "epoch": 1.3281543666207243, "grad_norm": 0.7004625797271729, "learning_rate": 6.6401453451948386e-06, "loss": 0.461, "num_input_tokens_seen": 12880416, "step": 10600 }, { "epoch": 1.3287808545295077, "grad_norm": 0.38664335012435913, "learning_rate": 6.643277784738755e-06, "loss": 0.4728, "num_input_tokens_seen": 12886656, "step": 10605 }, { "epoch": 1.3294073424382908, "grad_norm": 0.38862013816833496, "learning_rate": 6.646410224282672e-06, "loss": 0.4779, "num_input_tokens_seen": 12893088, "step": 10610 }, { "epoch": 1.3300338303470742, "grad_norm": 0.180806964635849, "learning_rate": 6.649542663826589e-06, "loss": 0.4651, "num_input_tokens_seen": 12899136, "step": 10615 }, { "epoch": 1.3306603182558576, "grad_norm": 0.49353498220443726, "learning_rate": 6.652675103370505e-06, "loss": 0.4654, "num_input_tokens_seen": 12905600, "step": 10620 }, { "epoch": 1.331286806164641, "grad_norm": 0.5657689571380615, "learning_rate": 6.655807542914422e-06, "loss": 0.4641, "num_input_tokens_seen": 12911392, "step": 10625 }, { "epoch": 1.3319132940734244, "grad_norm": 0.3346996009349823, "learning_rate": 6.658939982458339e-06, "loss": 0.4631, "num_input_tokens_seen": 12917504, "step": 10630 }, { "epoch": 1.3325397819822078, "grad_norm": 0.6549965143203735, "learning_rate": 6.662072422002256e-06, "loss": 0.463, "num_input_tokens_seen": 12923200, "step": 10635 }, { "epoch": 1.3331662698909912, "grad_norm": 0.5937651991844177, "learning_rate": 6.665204861546173e-06, "loss": 0.4706, "num_input_tokens_seen": 12929024, "step": 10640 }, { "epoch": 1.3337927577997744, "grad_norm": 0.3884553909301758, "learning_rate": 6.6683373010900894e-06, "loss": 0.4623, "num_input_tokens_seen": 12934400, "step": 10645 }, { "epoch": 1.3344192457085577, "grad_norm": 0.3867802917957306, "learning_rate": 6.671469740634007e-06, "loss": 0.4613, "num_input_tokens_seen": 12940704, "step": 10650 }, { "epoch": 1.3350457336173411, "grad_norm": 0.5442631244659424, "learning_rate": 6.674602180177923e-06, "loss": 0.4589, "num_input_tokens_seen": 12946336, "step": 10655 }, { "epoch": 1.3356722215261245, "grad_norm": 0.4645408093929291, "learning_rate": 6.67773461972184e-06, "loss": 0.4666, "num_input_tokens_seen": 12952480, "step": 10660 }, { "epoch": 1.336298709434908, "grad_norm": 0.5655879974365234, "learning_rate": 6.6808670592657565e-06, "loss": 0.4566, "num_input_tokens_seen": 12958752, "step": 10665 }, { "epoch": 1.3369251973436913, "grad_norm": 0.5640931129455566, "learning_rate": 6.683999498809673e-06, "loss": 0.4353, "num_input_tokens_seen": 12964512, "step": 10670 }, { "epoch": 1.3375516852524747, "grad_norm": 0.8190934062004089, "learning_rate": 6.68713193835359e-06, "loss": 0.4893, "num_input_tokens_seen": 12970432, "step": 10675 }, { "epoch": 1.3381781731612579, "grad_norm": 1.3633874654769897, "learning_rate": 6.690264377897508e-06, "loss": 0.4836, "num_input_tokens_seen": 12976608, "step": 10680 }, { "epoch": 1.3388046610700415, "grad_norm": 0.16029101610183716, "learning_rate": 6.693396817441424e-06, "loss": 0.4759, "num_input_tokens_seen": 12982816, "step": 10685 }, { "epoch": 1.3394311489788246, "grad_norm": 0.2383754700422287, "learning_rate": 6.69652925698534e-06, "loss": 0.463, "num_input_tokens_seen": 12989152, "step": 10690 }, { "epoch": 1.340057636887608, "grad_norm": 0.2873256802558899, "learning_rate": 6.699661696529257e-06, "loss": 0.4649, "num_input_tokens_seen": 12995712, "step": 10695 }, { "epoch": 1.3406841247963914, "grad_norm": 0.3047155737876892, "learning_rate": 6.702794136073174e-06, "loss": 0.4692, "num_input_tokens_seen": 13001856, "step": 10700 }, { "epoch": 1.3413106127051748, "grad_norm": 0.395948588848114, "learning_rate": 6.7059265756170914e-06, "loss": 0.4692, "num_input_tokens_seen": 13007808, "step": 10705 }, { "epoch": 1.3419371006139582, "grad_norm": 0.13704706728458405, "learning_rate": 6.709059015161008e-06, "loss": 0.459, "num_input_tokens_seen": 13013952, "step": 10710 }, { "epoch": 1.3425635885227414, "grad_norm": 0.36248987913131714, "learning_rate": 6.712191454704925e-06, "loss": 0.4615, "num_input_tokens_seen": 13019968, "step": 10715 }, { "epoch": 1.343190076431525, "grad_norm": 0.2772578001022339, "learning_rate": 6.715323894248842e-06, "loss": 0.4688, "num_input_tokens_seen": 13026080, "step": 10720 }, { "epoch": 1.3438165643403082, "grad_norm": 0.26267480850219727, "learning_rate": 6.7184563337927585e-06, "loss": 0.4589, "num_input_tokens_seen": 13032192, "step": 10725 }, { "epoch": 1.3444430522490916, "grad_norm": 0.3251713216304779, "learning_rate": 6.721588773336674e-06, "loss": 0.4668, "num_input_tokens_seen": 13038368, "step": 10730 }, { "epoch": 1.345069540157875, "grad_norm": 0.25924357771873474, "learning_rate": 6.724721212880592e-06, "loss": 0.4628, "num_input_tokens_seen": 13044224, "step": 10735 }, { "epoch": 1.3456960280666583, "grad_norm": 0.3305717408657074, "learning_rate": 6.727853652424509e-06, "loss": 0.4579, "num_input_tokens_seen": 13050048, "step": 10740 }, { "epoch": 1.3463225159754417, "grad_norm": 0.5542201399803162, "learning_rate": 6.7309860919684255e-06, "loss": 0.4613, "num_input_tokens_seen": 13056160, "step": 10745 }, { "epoch": 1.3469490038842251, "grad_norm": 0.37493836879730225, "learning_rate": 6.734118531512342e-06, "loss": 0.4573, "num_input_tokens_seen": 13062784, "step": 10750 }, { "epoch": 1.3475754917930085, "grad_norm": 0.3362503945827484, "learning_rate": 6.737250971056259e-06, "loss": 0.4709, "num_input_tokens_seen": 13069504, "step": 10755 }, { "epoch": 1.3482019797017917, "grad_norm": 0.4793878197669983, "learning_rate": 6.740383410600177e-06, "loss": 0.4779, "num_input_tokens_seen": 13075712, "step": 10760 }, { "epoch": 1.348828467610575, "grad_norm": 0.32021278142929077, "learning_rate": 6.743515850144093e-06, "loss": 0.4642, "num_input_tokens_seen": 13081824, "step": 10765 }, { "epoch": 1.3494549555193585, "grad_norm": 0.3508448302745819, "learning_rate": 6.746648289688009e-06, "loss": 0.4564, "num_input_tokens_seen": 13087296, "step": 10770 }, { "epoch": 1.3500814434281418, "grad_norm": 0.43442025780677795, "learning_rate": 6.749780729231926e-06, "loss": 0.456, "num_input_tokens_seen": 13093472, "step": 10775 }, { "epoch": 1.3507079313369252, "grad_norm": 0.2791994512081146, "learning_rate": 6.752913168775843e-06, "loss": 0.464, "num_input_tokens_seen": 13099552, "step": 10780 }, { "epoch": 1.3513344192457086, "grad_norm": 0.5059632062911987, "learning_rate": 6.7560456083197605e-06, "loss": 0.4563, "num_input_tokens_seen": 13104832, "step": 10785 }, { "epoch": 1.351960907154492, "grad_norm": 0.13552895188331604, "learning_rate": 6.759178047863677e-06, "loss": 0.4696, "num_input_tokens_seen": 13110784, "step": 10790 }, { "epoch": 1.3525873950632752, "grad_norm": 0.1705123633146286, "learning_rate": 6.762310487407594e-06, "loss": 0.459, "num_input_tokens_seen": 13116896, "step": 10795 }, { "epoch": 1.3532138829720586, "grad_norm": 0.15871094167232513, "learning_rate": 6.76544292695151e-06, "loss": 0.4605, "num_input_tokens_seen": 13122976, "step": 10800 }, { "epoch": 1.353840370880842, "grad_norm": 0.48901426792144775, "learning_rate": 6.768575366495427e-06, "loss": 0.4675, "num_input_tokens_seen": 13128864, "step": 10805 }, { "epoch": 1.3544668587896254, "grad_norm": 0.5147598385810852, "learning_rate": 6.7717078060393435e-06, "loss": 0.4671, "num_input_tokens_seen": 13134912, "step": 10810 }, { "epoch": 1.3550933466984088, "grad_norm": 0.13844440877437592, "learning_rate": 6.774840245583261e-06, "loss": 0.4547, "num_input_tokens_seen": 13141056, "step": 10815 }, { "epoch": 1.3557198346071921, "grad_norm": 0.22405949234962463, "learning_rate": 6.777972685127178e-06, "loss": 0.4583, "num_input_tokens_seen": 13146688, "step": 10820 }, { "epoch": 1.3563463225159755, "grad_norm": 0.23326945304870605, "learning_rate": 6.781105124671095e-06, "loss": 0.4631, "num_input_tokens_seen": 13152576, "step": 10825 }, { "epoch": 1.3569728104247587, "grad_norm": 0.3135417401790619, "learning_rate": 6.784237564215011e-06, "loss": 0.4683, "num_input_tokens_seen": 13158784, "step": 10830 }, { "epoch": 1.357599298333542, "grad_norm": 0.274705708026886, "learning_rate": 6.787370003758928e-06, "loss": 0.4775, "num_input_tokens_seen": 13165024, "step": 10835 }, { "epoch": 1.3582257862423255, "grad_norm": 0.3014810383319855, "learning_rate": 6.790502443302844e-06, "loss": 0.4587, "num_input_tokens_seen": 13171072, "step": 10840 }, { "epoch": 1.3588522741511089, "grad_norm": 0.36257368326187134, "learning_rate": 6.793634882846762e-06, "loss": 0.4596, "num_input_tokens_seen": 13176512, "step": 10845 }, { "epoch": 1.3594787620598923, "grad_norm": 0.26971864700317383, "learning_rate": 6.796767322390678e-06, "loss": 0.4578, "num_input_tokens_seen": 13182560, "step": 10850 }, { "epoch": 1.3601052499686757, "grad_norm": 0.5049517154693604, "learning_rate": 6.799899761934595e-06, "loss": 0.4671, "num_input_tokens_seen": 13188672, "step": 10855 }, { "epoch": 1.360731737877459, "grad_norm": 0.28170332312583923, "learning_rate": 6.803032201478512e-06, "loss": 0.4649, "num_input_tokens_seen": 13195008, "step": 10860 }, { "epoch": 1.3613582257862422, "grad_norm": 0.5516728758811951, "learning_rate": 6.8061646410224295e-06, "loss": 0.4584, "num_input_tokens_seen": 13201824, "step": 10865 }, { "epoch": 1.3619847136950256, "grad_norm": 0.27497342228889465, "learning_rate": 6.809297080566346e-06, "loss": 0.4725, "num_input_tokens_seen": 13207712, "step": 10870 }, { "epoch": 1.362611201603809, "grad_norm": 0.2488885074853897, "learning_rate": 6.812429520110262e-06, "loss": 0.4593, "num_input_tokens_seen": 13213856, "step": 10875 }, { "epoch": 1.3632376895125924, "grad_norm": 0.34055861830711365, "learning_rate": 6.815561959654179e-06, "loss": 0.4635, "num_input_tokens_seen": 13219520, "step": 10880 }, { "epoch": 1.3638641774213758, "grad_norm": 0.2694382667541504, "learning_rate": 6.818694399198096e-06, "loss": 0.4655, "num_input_tokens_seen": 13225472, "step": 10885 }, { "epoch": 1.3644906653301592, "grad_norm": 0.32880768179893494, "learning_rate": 6.8218268387420125e-06, "loss": 0.4633, "num_input_tokens_seen": 13231520, "step": 10890 }, { "epoch": 1.3651171532389426, "grad_norm": 0.624138593673706, "learning_rate": 6.82495927828593e-06, "loss": 0.4643, "num_input_tokens_seen": 13237664, "step": 10895 }, { "epoch": 1.3657436411477257, "grad_norm": 0.31463566422462463, "learning_rate": 6.828091717829847e-06, "loss": 0.4617, "num_input_tokens_seen": 13243936, "step": 10900 }, { "epoch": 1.3663701290565093, "grad_norm": 0.2514200210571289, "learning_rate": 6.831224157373764e-06, "loss": 0.4568, "num_input_tokens_seen": 13249984, "step": 10905 }, { "epoch": 1.3669966169652925, "grad_norm": 0.6760395765304565, "learning_rate": 6.8343565969176796e-06, "loss": 0.4584, "num_input_tokens_seen": 13256224, "step": 10910 }, { "epoch": 1.367623104874076, "grad_norm": 0.2614772915840149, "learning_rate": 6.837489036461596e-06, "loss": 0.4667, "num_input_tokens_seen": 13262368, "step": 10915 }, { "epoch": 1.3682495927828593, "grad_norm": 0.32478901743888855, "learning_rate": 6.840621476005513e-06, "loss": 0.4753, "num_input_tokens_seen": 13268512, "step": 10920 }, { "epoch": 1.3688760806916427, "grad_norm": 0.26543882489204407, "learning_rate": 6.843753915549431e-06, "loss": 0.4609, "num_input_tokens_seen": 13274688, "step": 10925 }, { "epoch": 1.369502568600426, "grad_norm": 0.604121208190918, "learning_rate": 6.8468863550933475e-06, "loss": 0.4707, "num_input_tokens_seen": 13280928, "step": 10930 }, { "epoch": 1.3701290565092092, "grad_norm": 0.3621136248111725, "learning_rate": 6.850018794637264e-06, "loss": 0.4573, "num_input_tokens_seen": 13287072, "step": 10935 }, { "epoch": 1.3707555444179929, "grad_norm": 0.14722493290901184, "learning_rate": 6.853151234181181e-06, "loss": 0.4659, "num_input_tokens_seen": 13292544, "step": 10940 }, { "epoch": 1.371382032326776, "grad_norm": 0.44628795981407166, "learning_rate": 6.856283673725098e-06, "loss": 0.4649, "num_input_tokens_seen": 13299264, "step": 10945 }, { "epoch": 1.3720085202355594, "grad_norm": 0.280239462852478, "learning_rate": 6.859416113269014e-06, "loss": 0.47, "num_input_tokens_seen": 13305568, "step": 10950 }, { "epoch": 1.3726350081443428, "grad_norm": 0.40476250648498535, "learning_rate": 6.862548552812931e-06, "loss": 0.4624, "num_input_tokens_seen": 13311328, "step": 10955 }, { "epoch": 1.3732614960531262, "grad_norm": 0.2849908471107483, "learning_rate": 6.865680992356848e-06, "loss": 0.4652, "num_input_tokens_seen": 13317024, "step": 10960 }, { "epoch": 1.3738879839619096, "grad_norm": 0.12074318528175354, "learning_rate": 6.868813431900765e-06, "loss": 0.4726, "num_input_tokens_seen": 13322784, "step": 10965 }, { "epoch": 1.374514471870693, "grad_norm": 0.3542729318141937, "learning_rate": 6.8719458714446816e-06, "loss": 0.4532, "num_input_tokens_seen": 13329152, "step": 10970 }, { "epoch": 1.3751409597794764, "grad_norm": 0.23045918345451355, "learning_rate": 6.875078310988599e-06, "loss": 0.464, "num_input_tokens_seen": 13335136, "step": 10975 }, { "epoch": 1.3757674476882595, "grad_norm": 0.09341698884963989, "learning_rate": 6.878210750532516e-06, "loss": 0.4615, "num_input_tokens_seen": 13341728, "step": 10980 }, { "epoch": 1.376393935597043, "grad_norm": 0.28153014183044434, "learning_rate": 6.881343190076432e-06, "loss": 0.4594, "num_input_tokens_seen": 13347840, "step": 10985 }, { "epoch": 1.3770204235058263, "grad_norm": 0.24810978770256042, "learning_rate": 6.884475629620349e-06, "loss": 0.4651, "num_input_tokens_seen": 13353760, "step": 10990 }, { "epoch": 1.3776469114146097, "grad_norm": 0.22621193528175354, "learning_rate": 6.887608069164265e-06, "loss": 0.4552, "num_input_tokens_seen": 13360192, "step": 10995 }, { "epoch": 1.378273399323393, "grad_norm": 0.29044055938720703, "learning_rate": 6.890740508708182e-06, "loss": 0.4655, "num_input_tokens_seen": 13366528, "step": 11000 }, { "epoch": 1.3788998872321765, "grad_norm": 0.24951450526714325, "learning_rate": 6.8938729482521e-06, "loss": 0.4659, "num_input_tokens_seen": 13372640, "step": 11005 }, { "epoch": 1.3795263751409599, "grad_norm": 0.2502317428588867, "learning_rate": 6.8970053877960165e-06, "loss": 0.465, "num_input_tokens_seen": 13378560, "step": 11010 }, { "epoch": 1.380152863049743, "grad_norm": 0.2642916142940521, "learning_rate": 6.900137827339933e-06, "loss": 0.4639, "num_input_tokens_seen": 13384416, "step": 11015 }, { "epoch": 1.3807793509585264, "grad_norm": 0.30962687730789185, "learning_rate": 6.903270266883849e-06, "loss": 0.4616, "num_input_tokens_seen": 13390336, "step": 11020 }, { "epoch": 1.3814058388673098, "grad_norm": 0.23570944368839264, "learning_rate": 6.906402706427766e-06, "loss": 0.4661, "num_input_tokens_seen": 13396576, "step": 11025 }, { "epoch": 1.3820323267760932, "grad_norm": 0.4360361397266388, "learning_rate": 6.909535145971683e-06, "loss": 0.4616, "num_input_tokens_seen": 13402432, "step": 11030 }, { "epoch": 1.3826588146848766, "grad_norm": 0.21278204023838043, "learning_rate": 6.9126675855156e-06, "loss": 0.4638, "num_input_tokens_seen": 13408384, "step": 11035 }, { "epoch": 1.38328530259366, "grad_norm": 0.3588014543056488, "learning_rate": 6.915800025059517e-06, "loss": 0.4612, "num_input_tokens_seen": 13414272, "step": 11040 }, { "epoch": 1.3839117905024434, "grad_norm": 0.2304113507270813, "learning_rate": 6.918932464603434e-06, "loss": 0.4607, "num_input_tokens_seen": 13420448, "step": 11045 }, { "epoch": 1.3845382784112266, "grad_norm": 0.3343488276004791, "learning_rate": 6.922064904147351e-06, "loss": 0.4592, "num_input_tokens_seen": 13426336, "step": 11050 }, { "epoch": 1.38516476632001, "grad_norm": 0.24767819046974182, "learning_rate": 6.925197343691268e-06, "loss": 0.4543, "num_input_tokens_seen": 13432448, "step": 11055 }, { "epoch": 1.3857912542287933, "grad_norm": 0.41894546151161194, "learning_rate": 6.928329783235183e-06, "loss": 0.4521, "num_input_tokens_seen": 13438688, "step": 11060 }, { "epoch": 1.3864177421375767, "grad_norm": 0.539293110370636, "learning_rate": 6.931462222779101e-06, "loss": 0.4667, "num_input_tokens_seen": 13445088, "step": 11065 }, { "epoch": 1.3870442300463601, "grad_norm": 0.9815070033073425, "learning_rate": 6.934594662323018e-06, "loss": 0.4766, "num_input_tokens_seen": 13451712, "step": 11070 }, { "epoch": 1.3876707179551435, "grad_norm": 0.27909865975379944, "learning_rate": 6.9377271018669344e-06, "loss": 0.4769, "num_input_tokens_seen": 13457632, "step": 11075 }, { "epoch": 1.388297205863927, "grad_norm": 0.19169019162654877, "learning_rate": 6.940859541410851e-06, "loss": 0.4552, "num_input_tokens_seen": 13463808, "step": 11080 }, { "epoch": 1.38892369377271, "grad_norm": 0.23891174793243408, "learning_rate": 6.943991980954769e-06, "loss": 0.4731, "num_input_tokens_seen": 13469760, "step": 11085 }, { "epoch": 1.3895501816814935, "grad_norm": 0.23105497658252716, "learning_rate": 6.947124420498686e-06, "loss": 0.4669, "num_input_tokens_seen": 13475712, "step": 11090 }, { "epoch": 1.3901766695902769, "grad_norm": 0.1139693334698677, "learning_rate": 6.9502568600426015e-06, "loss": 0.4633, "num_input_tokens_seen": 13481408, "step": 11095 }, { "epoch": 1.3908031574990602, "grad_norm": 0.27736425399780273, "learning_rate": 6.953389299586518e-06, "loss": 0.4554, "num_input_tokens_seen": 13487168, "step": 11100 }, { "epoch": 1.3914296454078436, "grad_norm": 0.27929309010505676, "learning_rate": 6.956521739130435e-06, "loss": 0.4525, "num_input_tokens_seen": 13493344, "step": 11105 }, { "epoch": 1.392056133316627, "grad_norm": 0.24048541486263275, "learning_rate": 6.959654178674352e-06, "loss": 0.4578, "num_input_tokens_seen": 13499168, "step": 11110 }, { "epoch": 1.3926826212254104, "grad_norm": 0.2302355170249939, "learning_rate": 6.962786618218269e-06, "loss": 0.4659, "num_input_tokens_seen": 13505600, "step": 11115 }, { "epoch": 1.3933091091341936, "grad_norm": 0.37019720673561096, "learning_rate": 6.965919057762186e-06, "loss": 0.4575, "num_input_tokens_seen": 13511616, "step": 11120 }, { "epoch": 1.3939355970429772, "grad_norm": 0.20251300930976868, "learning_rate": 6.969051497306103e-06, "loss": 0.4639, "num_input_tokens_seen": 13517536, "step": 11125 }, { "epoch": 1.3945620849517604, "grad_norm": 0.30175817012786865, "learning_rate": 6.972183936850019e-06, "loss": 0.4691, "num_input_tokens_seen": 13523840, "step": 11130 }, { "epoch": 1.3951885728605438, "grad_norm": 0.40200161933898926, "learning_rate": 6.975316376393936e-06, "loss": 0.4687, "num_input_tokens_seen": 13529920, "step": 11135 }, { "epoch": 1.3958150607693272, "grad_norm": 0.22182609140872955, "learning_rate": 6.978448815937852e-06, "loss": 0.4631, "num_input_tokens_seen": 13536000, "step": 11140 }, { "epoch": 1.3964415486781105, "grad_norm": 0.2835766077041626, "learning_rate": 6.98158125548177e-06, "loss": 0.463, "num_input_tokens_seen": 13541376, "step": 11145 }, { "epoch": 1.397068036586894, "grad_norm": 0.21925662457942963, "learning_rate": 6.984713695025687e-06, "loss": 0.4634, "num_input_tokens_seen": 13547360, "step": 11150 }, { "epoch": 1.397694524495677, "grad_norm": 0.26774802803993225, "learning_rate": 6.9878461345696035e-06, "loss": 0.4608, "num_input_tokens_seen": 13553696, "step": 11155 }, { "epoch": 1.3983210124044607, "grad_norm": 0.2027648389339447, "learning_rate": 6.99097857411352e-06, "loss": 0.4636, "num_input_tokens_seen": 13559872, "step": 11160 }, { "epoch": 1.3989475003132439, "grad_norm": 0.26262181997299194, "learning_rate": 6.994111013657438e-06, "loss": 0.4724, "num_input_tokens_seen": 13565824, "step": 11165 }, { "epoch": 1.3995739882220273, "grad_norm": 0.2595096528530121, "learning_rate": 6.997243453201353e-06, "loss": 0.4677, "num_input_tokens_seen": 13571840, "step": 11170 }, { "epoch": 1.4002004761308107, "grad_norm": 0.362810879945755, "learning_rate": 7.0003758927452706e-06, "loss": 0.4708, "num_input_tokens_seen": 13577920, "step": 11175 }, { "epoch": 1.400826964039594, "grad_norm": 0.3820532560348511, "learning_rate": 7.003508332289187e-06, "loss": 0.4688, "num_input_tokens_seen": 13583808, "step": 11180 }, { "epoch": 1.4014534519483774, "grad_norm": 0.24354101717472076, "learning_rate": 7.006640771833104e-06, "loss": 0.4604, "num_input_tokens_seen": 13589952, "step": 11185 }, { "epoch": 1.4020799398571608, "grad_norm": 0.19389225542545319, "learning_rate": 7.009773211377021e-06, "loss": 0.4635, "num_input_tokens_seen": 13595808, "step": 11190 }, { "epoch": 1.4027064277659442, "grad_norm": 0.25097841024398804, "learning_rate": 7.0129056509209384e-06, "loss": 0.469, "num_input_tokens_seen": 13601696, "step": 11195 }, { "epoch": 1.4033329156747274, "grad_norm": 0.35011032223701477, "learning_rate": 7.016038090464855e-06, "loss": 0.4606, "num_input_tokens_seen": 13608096, "step": 11200 }, { "epoch": 1.4039594035835108, "grad_norm": 0.517379105091095, "learning_rate": 7.019170530008771e-06, "loss": 0.4582, "num_input_tokens_seen": 13614176, "step": 11205 }, { "epoch": 1.4045858914922942, "grad_norm": 0.4717448353767395, "learning_rate": 7.022302969552688e-06, "loss": 0.4774, "num_input_tokens_seen": 13620320, "step": 11210 }, { "epoch": 1.4052123794010776, "grad_norm": 0.3166687786579132, "learning_rate": 7.025435409096605e-06, "loss": 0.4657, "num_input_tokens_seen": 13626688, "step": 11215 }, { "epoch": 1.405838867309861, "grad_norm": 0.41445624828338623, "learning_rate": 7.028567848640521e-06, "loss": 0.4618, "num_input_tokens_seen": 13632864, "step": 11220 }, { "epoch": 1.4064653552186444, "grad_norm": 0.22281695902347565, "learning_rate": 7.031700288184439e-06, "loss": 0.4586, "num_input_tokens_seen": 13638720, "step": 11225 }, { "epoch": 1.4070918431274277, "grad_norm": 0.21784700453281403, "learning_rate": 7.034832727728356e-06, "loss": 0.4618, "num_input_tokens_seen": 13644480, "step": 11230 }, { "epoch": 1.407718331036211, "grad_norm": 0.4280955195426941, "learning_rate": 7.0379651672722726e-06, "loss": 0.4595, "num_input_tokens_seen": 13650656, "step": 11235 }, { "epoch": 1.4083448189449943, "grad_norm": 0.2346849888563156, "learning_rate": 7.0410976068161885e-06, "loss": 0.4603, "num_input_tokens_seen": 13656576, "step": 11240 }, { "epoch": 1.4089713068537777, "grad_norm": 0.2878721356391907, "learning_rate": 7.044230046360105e-06, "loss": 0.4641, "num_input_tokens_seen": 13662720, "step": 11245 }, { "epoch": 1.409597794762561, "grad_norm": 0.37701451778411865, "learning_rate": 7.047362485904022e-06, "loss": 0.4635, "num_input_tokens_seen": 13668768, "step": 11250 }, { "epoch": 1.4102242826713445, "grad_norm": 0.2241935431957245, "learning_rate": 7.05049492544794e-06, "loss": 0.4598, "num_input_tokens_seen": 13675040, "step": 11255 }, { "epoch": 1.4108507705801279, "grad_norm": 0.31929147243499756, "learning_rate": 7.053627364991856e-06, "loss": 0.4602, "num_input_tokens_seen": 13681120, "step": 11260 }, { "epoch": 1.4114772584889113, "grad_norm": 0.34488609433174133, "learning_rate": 7.056759804535773e-06, "loss": 0.4585, "num_input_tokens_seen": 13687488, "step": 11265 }, { "epoch": 1.4121037463976944, "grad_norm": 0.43211889266967773, "learning_rate": 7.05989224407969e-06, "loss": 0.4549, "num_input_tokens_seen": 13693696, "step": 11270 }, { "epoch": 1.4127302343064778, "grad_norm": 0.3837173879146576, "learning_rate": 7.0630246836236075e-06, "loss": 0.457, "num_input_tokens_seen": 13700224, "step": 11275 }, { "epoch": 1.4133567222152612, "grad_norm": 0.501804530620575, "learning_rate": 7.066157123167523e-06, "loss": 0.4545, "num_input_tokens_seen": 13706272, "step": 11280 }, { "epoch": 1.4139832101240446, "grad_norm": 0.24175837635993958, "learning_rate": 7.06928956271144e-06, "loss": 0.497, "num_input_tokens_seen": 13712320, "step": 11285 }, { "epoch": 1.414609698032828, "grad_norm": 0.26990363001823425, "learning_rate": 7.072422002255357e-06, "loss": 0.4664, "num_input_tokens_seen": 13717568, "step": 11290 }, { "epoch": 1.4152361859416114, "grad_norm": 0.3001055419445038, "learning_rate": 7.075554441799274e-06, "loss": 0.4686, "num_input_tokens_seen": 13723872, "step": 11295 }, { "epoch": 1.4158626738503948, "grad_norm": 0.2870355248451233, "learning_rate": 7.0786868813431905e-06, "loss": 0.4689, "num_input_tokens_seen": 13729600, "step": 11300 }, { "epoch": 1.416489161759178, "grad_norm": 0.23256708681583405, "learning_rate": 7.081819320887108e-06, "loss": 0.4655, "num_input_tokens_seen": 13735840, "step": 11305 }, { "epoch": 1.4171156496679613, "grad_norm": 0.21949970722198486, "learning_rate": 7.084951760431025e-06, "loss": 0.4622, "num_input_tokens_seen": 13742016, "step": 11310 }, { "epoch": 1.4177421375767447, "grad_norm": 0.14316539466381073, "learning_rate": 7.088084199974941e-06, "loss": 0.4622, "num_input_tokens_seen": 13748288, "step": 11315 }, { "epoch": 1.4183686254855281, "grad_norm": 0.3196842670440674, "learning_rate": 7.0912166395188575e-06, "loss": 0.4599, "num_input_tokens_seen": 13754432, "step": 11320 }, { "epoch": 1.4189951133943115, "grad_norm": 0.3152945637702942, "learning_rate": 7.094349079062774e-06, "loss": 0.4637, "num_input_tokens_seen": 13760800, "step": 11325 }, { "epoch": 1.419621601303095, "grad_norm": 0.2776073217391968, "learning_rate": 7.097481518606691e-06, "loss": 0.4637, "num_input_tokens_seen": 13766784, "step": 11330 }, { "epoch": 1.4202480892118783, "grad_norm": 0.2457842081785202, "learning_rate": 7.100613958150609e-06, "loss": 0.4553, "num_input_tokens_seen": 13773280, "step": 11335 }, { "epoch": 1.4208745771206615, "grad_norm": 0.316855251789093, "learning_rate": 7.1037463976945254e-06, "loss": 0.4666, "num_input_tokens_seen": 13779424, "step": 11340 }, { "epoch": 1.421501065029445, "grad_norm": 0.2674032747745514, "learning_rate": 7.106878837238442e-06, "loss": 0.4666, "num_input_tokens_seen": 13785536, "step": 11345 }, { "epoch": 1.4221275529382282, "grad_norm": 0.42863160371780396, "learning_rate": 7.110011276782358e-06, "loss": 0.4678, "num_input_tokens_seen": 13791872, "step": 11350 }, { "epoch": 1.4227540408470116, "grad_norm": 0.2712749242782593, "learning_rate": 7.113143716326275e-06, "loss": 0.4687, "num_input_tokens_seen": 13797952, "step": 11355 }, { "epoch": 1.423380528755795, "grad_norm": 0.38955265283584595, "learning_rate": 7.116276155870192e-06, "loss": 0.461, "num_input_tokens_seen": 13803808, "step": 11360 }, { "epoch": 1.4240070166645784, "grad_norm": 0.2349899560213089, "learning_rate": 7.119408595414109e-06, "loss": 0.4671, "num_input_tokens_seen": 13810112, "step": 11365 }, { "epoch": 1.4246335045733618, "grad_norm": 0.356027752161026, "learning_rate": 7.122541034958026e-06, "loss": 0.4699, "num_input_tokens_seen": 13816288, "step": 11370 }, { "epoch": 1.425259992482145, "grad_norm": 0.2785666584968567, "learning_rate": 7.125673474501943e-06, "loss": 0.4656, "num_input_tokens_seen": 13822592, "step": 11375 }, { "epoch": 1.4258864803909286, "grad_norm": 0.5282872915267944, "learning_rate": 7.1288059140458595e-06, "loss": 0.4658, "num_input_tokens_seen": 13828448, "step": 11380 }, { "epoch": 1.4265129682997117, "grad_norm": 0.3331582844257355, "learning_rate": 7.131938353589777e-06, "loss": 0.4626, "num_input_tokens_seen": 13834304, "step": 11385 }, { "epoch": 1.4271394562084951, "grad_norm": 0.13847649097442627, "learning_rate": 7.135070793133693e-06, "loss": 0.4637, "num_input_tokens_seen": 13840576, "step": 11390 }, { "epoch": 1.4277659441172785, "grad_norm": 0.4221263527870178, "learning_rate": 7.13820323267761e-06, "loss": 0.4653, "num_input_tokens_seen": 13846752, "step": 11395 }, { "epoch": 1.428392432026062, "grad_norm": 0.19531406462192535, "learning_rate": 7.141335672221527e-06, "loss": 0.4617, "num_input_tokens_seen": 13852800, "step": 11400 }, { "epoch": 1.4290189199348453, "grad_norm": 0.21896328032016754, "learning_rate": 7.144468111765443e-06, "loss": 0.4571, "num_input_tokens_seen": 13859040, "step": 11405 }, { "epoch": 1.4296454078436285, "grad_norm": 0.25023117661476135, "learning_rate": 7.14760055130936e-06, "loss": 0.4598, "num_input_tokens_seen": 13865024, "step": 11410 }, { "epoch": 1.430271895752412, "grad_norm": 0.36568745970726013, "learning_rate": 7.150732990853278e-06, "loss": 0.4689, "num_input_tokens_seen": 13871168, "step": 11415 }, { "epoch": 1.4308983836611953, "grad_norm": 0.33689218759536743, "learning_rate": 7.1538654303971945e-06, "loss": 0.4623, "num_input_tokens_seen": 13877408, "step": 11420 }, { "epoch": 1.4315248715699787, "grad_norm": 0.29567232728004456, "learning_rate": 7.15699786994111e-06, "loss": 0.4587, "num_input_tokens_seen": 13881600, "step": 11425 }, { "epoch": 1.432151359478762, "grad_norm": 0.4056971073150635, "learning_rate": 7.160130309485027e-06, "loss": 0.4578, "num_input_tokens_seen": 13887392, "step": 11430 }, { "epoch": 1.4327778473875454, "grad_norm": 0.1942177265882492, "learning_rate": 7.163262749028944e-06, "loss": 0.4619, "num_input_tokens_seen": 13893536, "step": 11435 }, { "epoch": 1.4334043352963288, "grad_norm": 0.2666701674461365, "learning_rate": 7.166395188572861e-06, "loss": 0.4739, "num_input_tokens_seen": 13899712, "step": 11440 }, { "epoch": 1.4340308232051122, "grad_norm": 0.30472633242607117, "learning_rate": 7.169527628116778e-06, "loss": 0.4602, "num_input_tokens_seen": 13905728, "step": 11445 }, { "epoch": 1.4346573111138956, "grad_norm": 0.23494131863117218, "learning_rate": 7.172660067660695e-06, "loss": 0.4647, "num_input_tokens_seen": 13912000, "step": 11450 }, { "epoch": 1.4352837990226788, "grad_norm": 0.24859878420829773, "learning_rate": 7.175792507204612e-06, "loss": 0.4607, "num_input_tokens_seen": 13917984, "step": 11455 }, { "epoch": 1.4359102869314622, "grad_norm": 0.1213144063949585, "learning_rate": 7.178924946748528e-06, "loss": 0.4597, "num_input_tokens_seen": 13923680, "step": 11460 }, { "epoch": 1.4365367748402456, "grad_norm": 0.2574186325073242, "learning_rate": 7.1820573862924445e-06, "loss": 0.4623, "num_input_tokens_seen": 13929920, "step": 11465 }, { "epoch": 1.437163262749029, "grad_norm": 0.30163222551345825, "learning_rate": 7.185189825836362e-06, "loss": 0.4541, "num_input_tokens_seen": 13935424, "step": 11470 }, { "epoch": 1.4377897506578123, "grad_norm": 0.4353431463241577, "learning_rate": 7.188322265380279e-06, "loss": 0.4642, "num_input_tokens_seen": 13941504, "step": 11475 }, { "epoch": 1.4384162385665957, "grad_norm": 0.3389384150505066, "learning_rate": 7.191454704924196e-06, "loss": 0.4779, "num_input_tokens_seen": 13947552, "step": 11480 }, { "epoch": 1.4390427264753791, "grad_norm": 0.4567767381668091, "learning_rate": 7.194587144468112e-06, "loss": 0.4705, "num_input_tokens_seen": 13953632, "step": 11485 }, { "epoch": 1.4396692143841623, "grad_norm": 0.21591129899024963, "learning_rate": 7.197719584012029e-06, "loss": 0.4544, "num_input_tokens_seen": 13959584, "step": 11490 }, { "epoch": 1.4402957022929457, "grad_norm": 0.3327401876449585, "learning_rate": 7.200852023555947e-06, "loss": 0.4649, "num_input_tokens_seen": 13966016, "step": 11495 }, { "epoch": 1.440922190201729, "grad_norm": 0.33597540855407715, "learning_rate": 7.203984463099863e-06, "loss": 0.4633, "num_input_tokens_seen": 13971040, "step": 11500 }, { "epoch": 1.4415486781105125, "grad_norm": 0.21866877377033234, "learning_rate": 7.2071169026437795e-06, "loss": 0.4665, "num_input_tokens_seen": 13977216, "step": 11505 }, { "epoch": 1.4421751660192959, "grad_norm": 0.3560805320739746, "learning_rate": 7.210249342187696e-06, "loss": 0.4626, "num_input_tokens_seen": 13983040, "step": 11510 }, { "epoch": 1.4428016539280792, "grad_norm": 0.2947852313518524, "learning_rate": 7.213381781731613e-06, "loss": 0.4607, "num_input_tokens_seen": 13989408, "step": 11515 }, { "epoch": 1.4434281418368626, "grad_norm": 0.3576369881629944, "learning_rate": 7.21651422127553e-06, "loss": 0.4624, "num_input_tokens_seen": 13995648, "step": 11520 }, { "epoch": 1.4440546297456458, "grad_norm": 0.2677977681159973, "learning_rate": 7.219646660819447e-06, "loss": 0.4616, "num_input_tokens_seen": 14001696, "step": 11525 }, { "epoch": 1.4446811176544292, "grad_norm": 0.3083258271217346, "learning_rate": 7.222779100363364e-06, "loss": 0.4658, "num_input_tokens_seen": 14008000, "step": 11530 }, { "epoch": 1.4453076055632126, "grad_norm": 0.346035361289978, "learning_rate": 7.22591153990728e-06, "loss": 0.4668, "num_input_tokens_seen": 14014080, "step": 11535 }, { "epoch": 1.445934093471996, "grad_norm": 0.2637096047401428, "learning_rate": 7.229043979451197e-06, "loss": 0.4588, "num_input_tokens_seen": 14020000, "step": 11540 }, { "epoch": 1.4465605813807794, "grad_norm": 0.37590521574020386, "learning_rate": 7.2321764189951136e-06, "loss": 0.4542, "num_input_tokens_seen": 14026592, "step": 11545 }, { "epoch": 1.4471870692895628, "grad_norm": 0.24395498633384705, "learning_rate": 7.235308858539031e-06, "loss": 0.4668, "num_input_tokens_seen": 14032672, "step": 11550 }, { "epoch": 1.4478135571983461, "grad_norm": 0.387225866317749, "learning_rate": 7.238441298082948e-06, "loss": 0.4531, "num_input_tokens_seen": 14039008, "step": 11555 }, { "epoch": 1.4484400451071293, "grad_norm": 0.4161038100719452, "learning_rate": 7.241573737626865e-06, "loss": 0.4517, "num_input_tokens_seen": 14045216, "step": 11560 }, { "epoch": 1.449066533015913, "grad_norm": 0.4969582259654999, "learning_rate": 7.2447061771707815e-06, "loss": 0.4462, "num_input_tokens_seen": 14051744, "step": 11565 }, { "epoch": 1.449693020924696, "grad_norm": 0.45442843437194824, "learning_rate": 7.247838616714697e-06, "loss": 0.4851, "num_input_tokens_seen": 14057856, "step": 11570 }, { "epoch": 1.4503195088334795, "grad_norm": 0.3729415237903595, "learning_rate": 7.250971056258614e-06, "loss": 0.4723, "num_input_tokens_seen": 14064352, "step": 11575 }, { "epoch": 1.4509459967422629, "grad_norm": 0.6868278384208679, "learning_rate": 7.254103495802532e-06, "loss": 0.4684, "num_input_tokens_seen": 14070112, "step": 11580 }, { "epoch": 1.4515724846510463, "grad_norm": 0.4692116677761078, "learning_rate": 7.2572359353464485e-06, "loss": 0.4686, "num_input_tokens_seen": 14076288, "step": 11585 }, { "epoch": 1.4521989725598297, "grad_norm": 0.4094351530075073, "learning_rate": 7.260368374890365e-06, "loss": 0.4627, "num_input_tokens_seen": 14082368, "step": 11590 }, { "epoch": 1.4528254604686128, "grad_norm": 0.9101272225379944, "learning_rate": 7.263500814434282e-06, "loss": 0.4586, "num_input_tokens_seen": 14088544, "step": 11595 }, { "epoch": 1.4534519483773964, "grad_norm": 0.38553252816200256, "learning_rate": 7.266633253978199e-06, "loss": 0.4768, "num_input_tokens_seen": 14094304, "step": 11600 }, { "epoch": 1.4540784362861796, "grad_norm": 1.0356919765472412, "learning_rate": 7.269765693522116e-06, "loss": 0.4743, "num_input_tokens_seen": 14100448, "step": 11605 }, { "epoch": 1.454704924194963, "grad_norm": 0.31201061606407166, "learning_rate": 7.272898133066032e-06, "loss": 0.4634, "num_input_tokens_seen": 14106368, "step": 11610 }, { "epoch": 1.4553314121037464, "grad_norm": 0.29248046875, "learning_rate": 7.276030572609949e-06, "loss": 0.465, "num_input_tokens_seen": 14112768, "step": 11615 }, { "epoch": 1.4559579000125298, "grad_norm": 0.6072439551353455, "learning_rate": 7.279163012153866e-06, "loss": 0.4607, "num_input_tokens_seen": 14119232, "step": 11620 }, { "epoch": 1.4565843879213132, "grad_norm": 0.6985169053077698, "learning_rate": 7.282295451697783e-06, "loss": 0.4622, "num_input_tokens_seen": 14125408, "step": 11625 }, { "epoch": 1.4572108758300963, "grad_norm": 0.4268665313720703, "learning_rate": 7.285427891241699e-06, "loss": 0.469, "num_input_tokens_seen": 14131616, "step": 11630 }, { "epoch": 1.45783736373888, "grad_norm": 0.4785500764846802, "learning_rate": 7.288560330785617e-06, "loss": 0.4605, "num_input_tokens_seen": 14137888, "step": 11635 }, { "epoch": 1.4584638516476631, "grad_norm": 0.3131517171859741, "learning_rate": 7.291692770329534e-06, "loss": 0.4543, "num_input_tokens_seen": 14144160, "step": 11640 }, { "epoch": 1.4590903395564465, "grad_norm": 0.0955745279788971, "learning_rate": 7.29482520987345e-06, "loss": 0.4641, "num_input_tokens_seen": 14150080, "step": 11645 }, { "epoch": 1.45971682746523, "grad_norm": 0.2497706115245819, "learning_rate": 7.2979576494173664e-06, "loss": 0.4662, "num_input_tokens_seen": 14156224, "step": 11650 }, { "epoch": 1.4603433153740133, "grad_norm": 0.22815735638141632, "learning_rate": 7.301090088961283e-06, "loss": 0.4656, "num_input_tokens_seen": 14162528, "step": 11655 }, { "epoch": 1.4609698032827967, "grad_norm": 0.23020632565021515, "learning_rate": 7.304222528505201e-06, "loss": 0.4697, "num_input_tokens_seen": 14168480, "step": 11660 }, { "epoch": 1.46159629119158, "grad_norm": 0.30691075325012207, "learning_rate": 7.3073549680491176e-06, "loss": 0.4592, "num_input_tokens_seen": 14174752, "step": 11665 }, { "epoch": 1.4622227791003635, "grad_norm": 0.09751061350107193, "learning_rate": 7.310487407593034e-06, "loss": 0.4586, "num_input_tokens_seen": 14180768, "step": 11670 }, { "epoch": 1.4628492670091466, "grad_norm": 0.31463372707366943, "learning_rate": 7.313619847136951e-06, "loss": 0.4675, "num_input_tokens_seen": 14186624, "step": 11675 }, { "epoch": 1.46347575491793, "grad_norm": 0.2734684348106384, "learning_rate": 7.316752286680867e-06, "loss": 0.4612, "num_input_tokens_seen": 14192480, "step": 11680 }, { "epoch": 1.4641022428267134, "grad_norm": 0.43101444840431213, "learning_rate": 7.319884726224784e-06, "loss": 0.4586, "num_input_tokens_seen": 14198720, "step": 11685 }, { "epoch": 1.4647287307354968, "grad_norm": 0.2574869990348816, "learning_rate": 7.323017165768701e-06, "loss": 0.4635, "num_input_tokens_seen": 14204992, "step": 11690 }, { "epoch": 1.4653552186442802, "grad_norm": 0.2682919204235077, "learning_rate": 7.326149605312618e-06, "loss": 0.4576, "num_input_tokens_seen": 14210944, "step": 11695 }, { "epoch": 1.4659817065530636, "grad_norm": 0.28169485926628113, "learning_rate": 7.329282044856535e-06, "loss": 0.4541, "num_input_tokens_seen": 14217056, "step": 11700 }, { "epoch": 1.466608194461847, "grad_norm": 0.4215472638607025, "learning_rate": 7.332414484400452e-06, "loss": 0.4635, "num_input_tokens_seen": 14223392, "step": 11705 }, { "epoch": 1.4672346823706301, "grad_norm": 0.5988978147506714, "learning_rate": 7.3355469239443684e-06, "loss": 0.4788, "num_input_tokens_seen": 14229504, "step": 11710 }, { "epoch": 1.4678611702794135, "grad_norm": 0.6556450724601746, "learning_rate": 7.338679363488286e-06, "loss": 0.4681, "num_input_tokens_seen": 14235456, "step": 11715 }, { "epoch": 1.468487658188197, "grad_norm": 0.38015320897102356, "learning_rate": 7.341811803032202e-06, "loss": 0.4587, "num_input_tokens_seen": 14241952, "step": 11720 }, { "epoch": 1.4691141460969803, "grad_norm": 0.7846178412437439, "learning_rate": 7.344944242576119e-06, "loss": 0.4614, "num_input_tokens_seen": 14248192, "step": 11725 }, { "epoch": 1.4697406340057637, "grad_norm": 0.8009018898010254, "learning_rate": 7.3480766821200355e-06, "loss": 0.4725, "num_input_tokens_seen": 14253888, "step": 11730 }, { "epoch": 1.470367121914547, "grad_norm": 0.28462234139442444, "learning_rate": 7.351209121663952e-06, "loss": 0.4718, "num_input_tokens_seen": 14260256, "step": 11735 }, { "epoch": 1.4709936098233305, "grad_norm": 0.20648537576198578, "learning_rate": 7.35434156120787e-06, "loss": 0.4627, "num_input_tokens_seen": 14266304, "step": 11740 }, { "epoch": 1.4716200977321137, "grad_norm": 1.1569668054580688, "learning_rate": 7.357474000751787e-06, "loss": 0.4599, "num_input_tokens_seen": 14272448, "step": 11745 }, { "epoch": 1.472246585640897, "grad_norm": 0.6909807920455933, "learning_rate": 7.360606440295703e-06, "loss": 0.4695, "num_input_tokens_seen": 14278464, "step": 11750 }, { "epoch": 1.4728730735496804, "grad_norm": 0.678742527961731, "learning_rate": 7.363738879839619e-06, "loss": 0.4675, "num_input_tokens_seen": 14284448, "step": 11755 }, { "epoch": 1.4734995614584638, "grad_norm": 0.3323591649532318, "learning_rate": 7.366871319383536e-06, "loss": 0.4678, "num_input_tokens_seen": 14289984, "step": 11760 }, { "epoch": 1.4741260493672472, "grad_norm": 0.18469256162643433, "learning_rate": 7.370003758927453e-06, "loss": 0.4645, "num_input_tokens_seen": 14296256, "step": 11765 }, { "epoch": 1.4747525372760306, "grad_norm": 0.32308095693588257, "learning_rate": 7.3731361984713704e-06, "loss": 0.4619, "num_input_tokens_seen": 14302560, "step": 11770 }, { "epoch": 1.475379025184814, "grad_norm": 0.11545554548501968, "learning_rate": 7.376268638015287e-06, "loss": 0.4632, "num_input_tokens_seen": 14308864, "step": 11775 }, { "epoch": 1.4760055130935972, "grad_norm": 0.320017009973526, "learning_rate": 7.379401077559204e-06, "loss": 0.4652, "num_input_tokens_seen": 14315328, "step": 11780 }, { "epoch": 1.4766320010023808, "grad_norm": 0.3878757357597351, "learning_rate": 7.382533517103121e-06, "loss": 0.4684, "num_input_tokens_seen": 14321248, "step": 11785 }, { "epoch": 1.477258488911164, "grad_norm": 0.25131261348724365, "learning_rate": 7.385665956647037e-06, "loss": 0.4615, "num_input_tokens_seen": 14327680, "step": 11790 }, { "epoch": 1.4778849768199473, "grad_norm": 0.21923154592514038, "learning_rate": 7.388798396190953e-06, "loss": 0.4633, "num_input_tokens_seen": 14333632, "step": 11795 }, { "epoch": 1.4785114647287307, "grad_norm": 0.21928410232067108, "learning_rate": 7.391930835734871e-06, "loss": 0.4674, "num_input_tokens_seen": 14340064, "step": 11800 }, { "epoch": 1.4791379526375141, "grad_norm": 0.2389729768037796, "learning_rate": 7.395063275278788e-06, "loss": 0.4626, "num_input_tokens_seen": 14346336, "step": 11805 }, { "epoch": 1.4797644405462975, "grad_norm": 0.2675762474536896, "learning_rate": 7.3981957148227045e-06, "loss": 0.4702, "num_input_tokens_seen": 14352480, "step": 11810 }, { "epoch": 1.4803909284550807, "grad_norm": 0.21243441104888916, "learning_rate": 7.401328154366621e-06, "loss": 0.4625, "num_input_tokens_seen": 14358816, "step": 11815 }, { "epoch": 1.4810174163638643, "grad_norm": 0.27451232075691223, "learning_rate": 7.404460593910539e-06, "loss": 0.464, "num_input_tokens_seen": 14364544, "step": 11820 }, { "epoch": 1.4816439042726475, "grad_norm": 0.28162163496017456, "learning_rate": 7.407593033454456e-06, "loss": 0.4591, "num_input_tokens_seen": 14370336, "step": 11825 }, { "epoch": 1.4822703921814309, "grad_norm": 0.24934294819831848, "learning_rate": 7.410725472998372e-06, "loss": 0.4655, "num_input_tokens_seen": 14376608, "step": 11830 }, { "epoch": 1.4828968800902143, "grad_norm": 0.22175167500972748, "learning_rate": 7.413857912542288e-06, "loss": 0.465, "num_input_tokens_seen": 14382624, "step": 11835 }, { "epoch": 1.4835233679989976, "grad_norm": 0.2533148527145386, "learning_rate": 7.416990352086205e-06, "loss": 0.4649, "num_input_tokens_seen": 14388992, "step": 11840 }, { "epoch": 1.484149855907781, "grad_norm": 0.18724118173122406, "learning_rate": 7.420122791630122e-06, "loss": 0.4648, "num_input_tokens_seen": 14394912, "step": 11845 }, { "epoch": 1.4847763438165642, "grad_norm": 0.225862517952919, "learning_rate": 7.4232552311740395e-06, "loss": 0.4631, "num_input_tokens_seen": 14400736, "step": 11850 }, { "epoch": 1.4854028317253478, "grad_norm": 0.22025321424007416, "learning_rate": 7.426387670717956e-06, "loss": 0.4605, "num_input_tokens_seen": 14406880, "step": 11855 }, { "epoch": 1.486029319634131, "grad_norm": 0.3471226096153259, "learning_rate": 7.429520110261873e-06, "loss": 0.4577, "num_input_tokens_seen": 14413120, "step": 11860 }, { "epoch": 1.4866558075429144, "grad_norm": 0.27580609917640686, "learning_rate": 7.432652549805789e-06, "loss": 0.464, "num_input_tokens_seen": 14419296, "step": 11865 }, { "epoch": 1.4872822954516978, "grad_norm": 0.2367912083864212, "learning_rate": 7.435784989349706e-06, "loss": 0.4669, "num_input_tokens_seen": 14424864, "step": 11870 }, { "epoch": 1.4879087833604812, "grad_norm": 0.2233940213918686, "learning_rate": 7.4389174288936225e-06, "loss": 0.4737, "num_input_tokens_seen": 14430880, "step": 11875 }, { "epoch": 1.4885352712692645, "grad_norm": 0.09848277270793915, "learning_rate": 7.44204986843754e-06, "loss": 0.4606, "num_input_tokens_seen": 14436896, "step": 11880 }, { "epoch": 1.489161759178048, "grad_norm": 0.1980871856212616, "learning_rate": 7.445182307981457e-06, "loss": 0.4617, "num_input_tokens_seen": 14443296, "step": 11885 }, { "epoch": 1.4897882470868313, "grad_norm": 0.2834572196006775, "learning_rate": 7.448314747525374e-06, "loss": 0.4653, "num_input_tokens_seen": 14449408, "step": 11890 }, { "epoch": 1.4904147349956145, "grad_norm": 0.08063667267560959, "learning_rate": 7.45144718706929e-06, "loss": 0.4584, "num_input_tokens_seen": 14455488, "step": 11895 }, { "epoch": 1.491041222904398, "grad_norm": 0.24566707015037537, "learning_rate": 7.454579626613206e-06, "loss": 0.4606, "num_input_tokens_seen": 14461312, "step": 11900 }, { "epoch": 1.4916677108131813, "grad_norm": 0.3665737211704254, "learning_rate": 7.457712066157123e-06, "loss": 0.4625, "num_input_tokens_seen": 14467776, "step": 11905 }, { "epoch": 1.4922941987219647, "grad_norm": 0.2139035016298294, "learning_rate": 7.460844505701041e-06, "loss": 0.4649, "num_input_tokens_seen": 14473888, "step": 11910 }, { "epoch": 1.492920686630748, "grad_norm": 0.2702404856681824, "learning_rate": 7.463976945244957e-06, "loss": 0.4617, "num_input_tokens_seen": 14479616, "step": 11915 }, { "epoch": 1.4935471745395315, "grad_norm": 0.247589111328125, "learning_rate": 7.467109384788874e-06, "loss": 0.4593, "num_input_tokens_seen": 14485696, "step": 11920 }, { "epoch": 1.4941736624483148, "grad_norm": 0.09980762004852295, "learning_rate": 7.470241824332791e-06, "loss": 0.468, "num_input_tokens_seen": 14491776, "step": 11925 }, { "epoch": 1.494800150357098, "grad_norm": 0.2501462399959564, "learning_rate": 7.4733742638767086e-06, "loss": 0.4586, "num_input_tokens_seen": 14497696, "step": 11930 }, { "epoch": 1.4954266382658814, "grad_norm": 0.1505238264799118, "learning_rate": 7.476506703420625e-06, "loss": 0.4666, "num_input_tokens_seen": 14503808, "step": 11935 }, { "epoch": 1.4960531261746648, "grad_norm": 0.5060830116271973, "learning_rate": 7.479639142964541e-06, "loss": 0.4594, "num_input_tokens_seen": 14510016, "step": 11940 }, { "epoch": 1.4966796140834482, "grad_norm": 0.5029605031013489, "learning_rate": 7.482771582508458e-06, "loss": 0.4597, "num_input_tokens_seen": 14516000, "step": 11945 }, { "epoch": 1.4973061019922316, "grad_norm": 0.40406373143196106, "learning_rate": 7.485904022052375e-06, "loss": 0.4577, "num_input_tokens_seen": 14522240, "step": 11950 }, { "epoch": 1.497932589901015, "grad_norm": 0.36906152963638306, "learning_rate": 7.4890364615962915e-06, "loss": 0.4633, "num_input_tokens_seen": 14527648, "step": 11955 }, { "epoch": 1.4985590778097984, "grad_norm": 0.5246551632881165, "learning_rate": 7.492168901140209e-06, "loss": 0.4688, "num_input_tokens_seen": 14534240, "step": 11960 }, { "epoch": 1.4991855657185815, "grad_norm": 0.22580742835998535, "learning_rate": 7.495301340684126e-06, "loss": 0.4565, "num_input_tokens_seen": 14540288, "step": 11965 }, { "epoch": 1.499812053627365, "grad_norm": 0.31356629729270935, "learning_rate": 7.498433780228043e-06, "loss": 0.4604, "num_input_tokens_seen": 14546368, "step": 11970 }, { "epoch": 1.5004385415361483, "grad_norm": 0.38448581099510193, "learning_rate": 7.5015662197719586e-06, "loss": 0.4656, "num_input_tokens_seen": 14552480, "step": 11975 }, { "epoch": 1.5010650294449317, "grad_norm": 0.25836285948753357, "learning_rate": 7.504698659315875e-06, "loss": 0.463, "num_input_tokens_seen": 14558784, "step": 11980 }, { "epoch": 1.501691517353715, "grad_norm": 0.39428454637527466, "learning_rate": 7.507831098859792e-06, "loss": 0.4526, "num_input_tokens_seen": 14564672, "step": 11985 }, { "epoch": 1.5023180052624985, "grad_norm": 0.36295825242996216, "learning_rate": 7.51096353840371e-06, "loss": 0.4541, "num_input_tokens_seen": 14570880, "step": 11990 }, { "epoch": 1.5029444931712819, "grad_norm": 0.5682852864265442, "learning_rate": 7.5140959779476265e-06, "loss": 0.4709, "num_input_tokens_seen": 14576832, "step": 11995 }, { "epoch": 1.503570981080065, "grad_norm": 0.38970330357551575, "learning_rate": 7.517228417491543e-06, "loss": 0.4753, "num_input_tokens_seen": 14583104, "step": 12000 }, { "epoch": 1.5041974689888487, "grad_norm": 0.2748112380504608, "learning_rate": 7.52036085703546e-06, "loss": 0.4659, "num_input_tokens_seen": 14589408, "step": 12005 }, { "epoch": 1.5048239568976318, "grad_norm": 0.3353411555290222, "learning_rate": 7.523493296579376e-06, "loss": 0.4587, "num_input_tokens_seen": 14595456, "step": 12010 }, { "epoch": 1.5054504448064152, "grad_norm": 0.27381524443626404, "learning_rate": 7.526625736123293e-06, "loss": 0.467, "num_input_tokens_seen": 14601504, "step": 12015 }, { "epoch": 1.5060769327151986, "grad_norm": 0.3230890929698944, "learning_rate": 7.52975817566721e-06, "loss": 0.4577, "num_input_tokens_seen": 14607552, "step": 12020 }, { "epoch": 1.506703420623982, "grad_norm": 0.2724043130874634, "learning_rate": 7.532890615211127e-06, "loss": 0.468, "num_input_tokens_seen": 14613376, "step": 12025 }, { "epoch": 1.5073299085327654, "grad_norm": 0.18271160125732422, "learning_rate": 7.536023054755044e-06, "loss": 0.4576, "num_input_tokens_seen": 14619328, "step": 12030 }, { "epoch": 1.5079563964415486, "grad_norm": 0.42316925525665283, "learning_rate": 7.5391554942989606e-06, "loss": 0.4701, "num_input_tokens_seen": 14625568, "step": 12035 }, { "epoch": 1.5085828843503322, "grad_norm": 0.2214910238981247, "learning_rate": 7.542287933842878e-06, "loss": 0.4577, "num_input_tokens_seen": 14632096, "step": 12040 }, { "epoch": 1.5092093722591153, "grad_norm": 0.2347855418920517, "learning_rate": 7.545420373386795e-06, "loss": 0.4649, "num_input_tokens_seen": 14638080, "step": 12045 }, { "epoch": 1.5098358601678987, "grad_norm": 0.2789234519004822, "learning_rate": 7.548552812930711e-06, "loss": 0.4677, "num_input_tokens_seen": 14644384, "step": 12050 }, { "epoch": 1.5104623480766821, "grad_norm": 0.07727477699518204, "learning_rate": 7.551685252474628e-06, "loss": 0.4701, "num_input_tokens_seen": 14650912, "step": 12055 }, { "epoch": 1.5110888359854655, "grad_norm": 0.20891745388507843, "learning_rate": 7.554817692018544e-06, "loss": 0.4637, "num_input_tokens_seen": 14657376, "step": 12060 }, { "epoch": 1.511715323894249, "grad_norm": 0.2702287435531616, "learning_rate": 7.557950131562461e-06, "loss": 0.4619, "num_input_tokens_seen": 14663488, "step": 12065 }, { "epoch": 1.512341811803032, "grad_norm": 0.09710916131734848, "learning_rate": 7.561082571106379e-06, "loss": 0.4552, "num_input_tokens_seen": 14669856, "step": 12070 }, { "epoch": 1.5129682997118157, "grad_norm": 0.22792628407478333, "learning_rate": 7.5642150106502955e-06, "loss": 0.4617, "num_input_tokens_seen": 14676224, "step": 12075 }, { "epoch": 1.5135947876205988, "grad_norm": 0.21324442327022552, "learning_rate": 7.567347450194212e-06, "loss": 0.4639, "num_input_tokens_seen": 14682432, "step": 12080 }, { "epoch": 1.5142212755293822, "grad_norm": 0.08929959684610367, "learning_rate": 7.570479889738128e-06, "loss": 0.4648, "num_input_tokens_seen": 14688736, "step": 12085 }, { "epoch": 1.5148477634381656, "grad_norm": 0.23526908457279205, "learning_rate": 7.573612329282045e-06, "loss": 0.4578, "num_input_tokens_seen": 14694880, "step": 12090 }, { "epoch": 1.515474251346949, "grad_norm": 0.31135013699531555, "learning_rate": 7.576744768825962e-06, "loss": 0.4656, "num_input_tokens_seen": 14700768, "step": 12095 }, { "epoch": 1.5161007392557324, "grad_norm": 0.11432620137929916, "learning_rate": 7.579877208369879e-06, "loss": 0.467, "num_input_tokens_seen": 14707008, "step": 12100 }, { "epoch": 1.5167272271645156, "grad_norm": 0.3621331751346588, "learning_rate": 7.583009647913796e-06, "loss": 0.4584, "num_input_tokens_seen": 14713088, "step": 12105 }, { "epoch": 1.5173537150732992, "grad_norm": 0.22827361524105072, "learning_rate": 7.586142087457713e-06, "loss": 0.4613, "num_input_tokens_seen": 14718976, "step": 12110 }, { "epoch": 1.5179802029820824, "grad_norm": 0.2793721854686737, "learning_rate": 7.58927452700163e-06, "loss": 0.452, "num_input_tokens_seen": 14725120, "step": 12115 }, { "epoch": 1.5186066908908658, "grad_norm": 0.3660728931427002, "learning_rate": 7.5924069665455455e-06, "loss": 0.4646, "num_input_tokens_seen": 14731008, "step": 12120 }, { "epoch": 1.5192331787996491, "grad_norm": 0.2285906821489334, "learning_rate": 7.595539406089462e-06, "loss": 0.478, "num_input_tokens_seen": 14736320, "step": 12125 }, { "epoch": 1.5198596667084325, "grad_norm": 0.29397282004356384, "learning_rate": 7.59867184563338e-06, "loss": 0.4599, "num_input_tokens_seen": 14742240, "step": 12130 }, { "epoch": 1.520486154617216, "grad_norm": 0.2245175689458847, "learning_rate": 7.601804285177297e-06, "loss": 0.459, "num_input_tokens_seen": 14748096, "step": 12135 }, { "epoch": 1.521112642525999, "grad_norm": 0.18329329788684845, "learning_rate": 7.6049367247212134e-06, "loss": 0.4504, "num_input_tokens_seen": 14754144, "step": 12140 }, { "epoch": 1.5217391304347827, "grad_norm": 0.23777632415294647, "learning_rate": 7.60806916426513e-06, "loss": 0.4634, "num_input_tokens_seen": 14760416, "step": 12145 }, { "epoch": 1.5223656183435659, "grad_norm": 0.25084295868873596, "learning_rate": 7.611201603809048e-06, "loss": 0.4615, "num_input_tokens_seen": 14766752, "step": 12150 }, { "epoch": 1.5229921062523495, "grad_norm": 0.26337185502052307, "learning_rate": 7.614334043352965e-06, "loss": 0.458, "num_input_tokens_seen": 14773152, "step": 12155 }, { "epoch": 1.5236185941611327, "grad_norm": 0.25325170159339905, "learning_rate": 7.6174664828968805e-06, "loss": 0.4679, "num_input_tokens_seen": 14779168, "step": 12160 }, { "epoch": 1.524245082069916, "grad_norm": 0.2717793583869934, "learning_rate": 7.620598922440797e-06, "loss": 0.4737, "num_input_tokens_seen": 14785056, "step": 12165 }, { "epoch": 1.5248715699786994, "grad_norm": 0.2963012456893921, "learning_rate": 7.623731361984714e-06, "loss": 0.4552, "num_input_tokens_seen": 14791232, "step": 12170 }, { "epoch": 1.5254980578874826, "grad_norm": 0.47112154960632324, "learning_rate": 7.626863801528631e-06, "loss": 0.4603, "num_input_tokens_seen": 14797312, "step": 12175 }, { "epoch": 1.5261245457962662, "grad_norm": 0.10634017735719681, "learning_rate": 7.629996241072548e-06, "loss": 0.4414, "num_input_tokens_seen": 14802880, "step": 12180 }, { "epoch": 1.5267510337050494, "grad_norm": 0.23091337084770203, "learning_rate": 7.633128680616465e-06, "loss": 0.4566, "num_input_tokens_seen": 14808768, "step": 12185 }, { "epoch": 1.527377521613833, "grad_norm": 0.215068057179451, "learning_rate": 7.636261120160382e-06, "loss": 0.4584, "num_input_tokens_seen": 14815136, "step": 12190 }, { "epoch": 1.5280040095226162, "grad_norm": 0.1849289834499359, "learning_rate": 7.639393559704299e-06, "loss": 0.4615, "num_input_tokens_seen": 14821024, "step": 12195 }, { "epoch": 1.5286304974313996, "grad_norm": 0.3009972870349884, "learning_rate": 7.642525999248215e-06, "loss": 0.4841, "num_input_tokens_seen": 14827200, "step": 12200 }, { "epoch": 1.529256985340183, "grad_norm": 0.21381919085979462, "learning_rate": 7.645658438792132e-06, "loss": 0.4646, "num_input_tokens_seen": 14833440, "step": 12205 }, { "epoch": 1.5298834732489663, "grad_norm": 0.2209121137857437, "learning_rate": 7.648790878336049e-06, "loss": 0.4752, "num_input_tokens_seen": 14839680, "step": 12210 }, { "epoch": 1.5305099611577497, "grad_norm": 0.14946627616882324, "learning_rate": 7.651923317879966e-06, "loss": 0.4607, "num_input_tokens_seen": 14845856, "step": 12215 }, { "epoch": 1.531136449066533, "grad_norm": 0.2525082230567932, "learning_rate": 7.655055757423883e-06, "loss": 0.4647, "num_input_tokens_seen": 14851680, "step": 12220 }, { "epoch": 1.5317629369753165, "grad_norm": 0.22091779112815857, "learning_rate": 7.6581881969678e-06, "loss": 0.4572, "num_input_tokens_seen": 14857888, "step": 12225 }, { "epoch": 1.5323894248840997, "grad_norm": 0.37356826663017273, "learning_rate": 7.661320636511716e-06, "loss": 0.4482, "num_input_tokens_seen": 14863168, "step": 12230 }, { "epoch": 1.533015912792883, "grad_norm": 0.18279771506786346, "learning_rate": 7.664453076055633e-06, "loss": 0.4555, "num_input_tokens_seen": 14869216, "step": 12235 }, { "epoch": 1.5336424007016665, "grad_norm": 0.20872224867343903, "learning_rate": 7.66758551559955e-06, "loss": 0.4535, "num_input_tokens_seen": 14875616, "step": 12240 }, { "epoch": 1.5342688886104499, "grad_norm": 0.3921787142753601, "learning_rate": 7.670717955143466e-06, "loss": 0.4758, "num_input_tokens_seen": 14881600, "step": 12245 }, { "epoch": 1.5348953765192332, "grad_norm": 0.22791963815689087, "learning_rate": 7.673850394687383e-06, "loss": 0.4556, "num_input_tokens_seen": 14887456, "step": 12250 }, { "epoch": 1.5355218644280164, "grad_norm": 0.21718965470790863, "learning_rate": 7.6769828342313e-06, "loss": 0.4674, "num_input_tokens_seen": 14893632, "step": 12255 }, { "epoch": 1.5361483523368, "grad_norm": 0.10866646468639374, "learning_rate": 7.680115273775217e-06, "loss": 0.4672, "num_input_tokens_seen": 14899776, "step": 12260 }, { "epoch": 1.5367748402455832, "grad_norm": 0.2607765793800354, "learning_rate": 7.683247713319133e-06, "loss": 0.4583, "num_input_tokens_seen": 14905792, "step": 12265 }, { "epoch": 1.5374013281543666, "grad_norm": 0.27238398790359497, "learning_rate": 7.68638015286305e-06, "loss": 0.4715, "num_input_tokens_seen": 14911360, "step": 12270 }, { "epoch": 1.53802781606315, "grad_norm": 0.25158679485321045, "learning_rate": 7.689512592406967e-06, "loss": 0.4696, "num_input_tokens_seen": 14917408, "step": 12275 }, { "epoch": 1.5386543039719334, "grad_norm": 0.2692113220691681, "learning_rate": 7.692645031950884e-06, "loss": 0.4665, "num_input_tokens_seen": 14923648, "step": 12280 }, { "epoch": 1.5392807918807168, "grad_norm": 0.087190642952919, "learning_rate": 7.6957774714948e-06, "loss": 0.4635, "num_input_tokens_seen": 14929568, "step": 12285 }, { "epoch": 1.5399072797895, "grad_norm": 0.08271688222885132, "learning_rate": 7.698909911038717e-06, "loss": 0.4653, "num_input_tokens_seen": 14935744, "step": 12290 }, { "epoch": 1.5405337676982835, "grad_norm": 0.2201094627380371, "learning_rate": 7.702042350582634e-06, "loss": 0.4622, "num_input_tokens_seen": 14942048, "step": 12295 }, { "epoch": 1.5411602556070667, "grad_norm": 0.23608890175819397, "learning_rate": 7.705174790126552e-06, "loss": 0.4632, "num_input_tokens_seen": 14948000, "step": 12300 }, { "epoch": 1.54178674351585, "grad_norm": 0.19026751816272736, "learning_rate": 7.708307229670467e-06, "loss": 0.4688, "num_input_tokens_seen": 14953760, "step": 12305 }, { "epoch": 1.5424132314246335, "grad_norm": 0.23615916073322296, "learning_rate": 7.711439669214384e-06, "loss": 0.464, "num_input_tokens_seen": 14959904, "step": 12310 }, { "epoch": 1.5430397193334169, "grad_norm": 0.1875632256269455, "learning_rate": 7.714572108758301e-06, "loss": 0.465, "num_input_tokens_seen": 14965696, "step": 12315 }, { "epoch": 1.5436662072422003, "grad_norm": 0.2803148329257965, "learning_rate": 7.717704548302218e-06, "loss": 0.4587, "num_input_tokens_seen": 14971904, "step": 12320 }, { "epoch": 1.5442926951509834, "grad_norm": 0.26473429799079895, "learning_rate": 7.720836987846135e-06, "loss": 0.4641, "num_input_tokens_seen": 14978080, "step": 12325 }, { "epoch": 1.544919183059767, "grad_norm": 0.194733664393425, "learning_rate": 7.723969427390053e-06, "loss": 0.4616, "num_input_tokens_seen": 14984256, "step": 12330 }, { "epoch": 1.5455456709685502, "grad_norm": 0.18479719758033752, "learning_rate": 7.72710186693397e-06, "loss": 0.458, "num_input_tokens_seen": 14990304, "step": 12335 }, { "epoch": 1.5461721588773336, "grad_norm": 0.38380667567253113, "learning_rate": 7.730234306477885e-06, "loss": 0.4669, "num_input_tokens_seen": 14996384, "step": 12340 }, { "epoch": 1.546798646786117, "grad_norm": 0.30124375224113464, "learning_rate": 7.733366746021802e-06, "loss": 0.4467, "num_input_tokens_seen": 15002176, "step": 12345 }, { "epoch": 1.5474251346949004, "grad_norm": 0.5021612644195557, "learning_rate": 7.736499185565718e-06, "loss": 0.4754, "num_input_tokens_seen": 15008288, "step": 12350 }, { "epoch": 1.5480516226036838, "grad_norm": 0.45832887291908264, "learning_rate": 7.739631625109635e-06, "loss": 0.4636, "num_input_tokens_seen": 15014240, "step": 12355 }, { "epoch": 1.548678110512467, "grad_norm": 0.19859687983989716, "learning_rate": 7.742764064653554e-06, "loss": 0.4509, "num_input_tokens_seen": 15020544, "step": 12360 }, { "epoch": 1.5493045984212506, "grad_norm": 0.7164822220802307, "learning_rate": 7.74589650419747e-06, "loss": 0.4738, "num_input_tokens_seen": 15027040, "step": 12365 }, { "epoch": 1.5499310863300337, "grad_norm": 0.3160513937473297, "learning_rate": 7.749028943741387e-06, "loss": 0.4717, "num_input_tokens_seen": 15033216, "step": 12370 }, { "epoch": 1.5505575742388173, "grad_norm": 0.377022385597229, "learning_rate": 7.752161383285304e-06, "loss": 0.4612, "num_input_tokens_seen": 15039488, "step": 12375 }, { "epoch": 1.5511840621476005, "grad_norm": 0.3176846504211426, "learning_rate": 7.755293822829219e-06, "loss": 0.4531, "num_input_tokens_seen": 15045824, "step": 12380 }, { "epoch": 1.551810550056384, "grad_norm": 0.2638205885887146, "learning_rate": 7.758426262373137e-06, "loss": 0.4629, "num_input_tokens_seen": 15052192, "step": 12385 }, { "epoch": 1.5524370379651673, "grad_norm": 0.2562654912471771, "learning_rate": 7.761558701917054e-06, "loss": 0.4718, "num_input_tokens_seen": 15058336, "step": 12390 }, { "epoch": 1.5530635258739505, "grad_norm": 0.3261367976665497, "learning_rate": 7.764691141460971e-06, "loss": 0.4638, "num_input_tokens_seen": 15063808, "step": 12395 }, { "epoch": 1.553690013782734, "grad_norm": 0.2281530648469925, "learning_rate": 7.767823581004888e-06, "loss": 0.4657, "num_input_tokens_seen": 15069952, "step": 12400 }, { "epoch": 1.5543165016915172, "grad_norm": 0.29917630553245544, "learning_rate": 7.770956020548804e-06, "loss": 0.4641, "num_input_tokens_seen": 15076032, "step": 12405 }, { "epoch": 1.5549429896003009, "grad_norm": 0.2869762182235718, "learning_rate": 7.774088460092721e-06, "loss": 0.4662, "num_input_tokens_seen": 15082400, "step": 12410 }, { "epoch": 1.555569477509084, "grad_norm": 0.22465172410011292, "learning_rate": 7.777220899636638e-06, "loss": 0.4656, "num_input_tokens_seen": 15088160, "step": 12415 }, { "epoch": 1.5561959654178674, "grad_norm": 0.2729545533657074, "learning_rate": 7.780353339180555e-06, "loss": 0.4622, "num_input_tokens_seen": 15093952, "step": 12420 }, { "epoch": 1.5568224533266508, "grad_norm": 0.23651264607906342, "learning_rate": 7.783485778724471e-06, "loss": 0.4661, "num_input_tokens_seen": 15100608, "step": 12425 }, { "epoch": 1.5574489412354342, "grad_norm": 0.06704001873731613, "learning_rate": 7.786618218268388e-06, "loss": 0.4604, "num_input_tokens_seen": 15106624, "step": 12430 }, { "epoch": 1.5580754291442176, "grad_norm": 0.18309271335601807, "learning_rate": 7.789750657812305e-06, "loss": 0.4611, "num_input_tokens_seen": 15112928, "step": 12435 }, { "epoch": 1.5587019170530008, "grad_norm": 0.36549243330955505, "learning_rate": 7.792883097356222e-06, "loss": 0.4585, "num_input_tokens_seen": 15119040, "step": 12440 }, { "epoch": 1.5593284049617844, "grad_norm": 0.2684706151485443, "learning_rate": 7.796015536900139e-06, "loss": 0.4685, "num_input_tokens_seen": 15124992, "step": 12445 }, { "epoch": 1.5599548928705675, "grad_norm": 0.2214876115322113, "learning_rate": 7.799147976444055e-06, "loss": 0.459, "num_input_tokens_seen": 15130912, "step": 12450 }, { "epoch": 1.560581380779351, "grad_norm": 0.2692906856536865, "learning_rate": 7.802280415987972e-06, "loss": 0.4587, "num_input_tokens_seen": 15137184, "step": 12455 }, { "epoch": 1.5612078686881343, "grad_norm": 0.21805556118488312, "learning_rate": 7.805412855531889e-06, "loss": 0.4676, "num_input_tokens_seen": 15143648, "step": 12460 }, { "epoch": 1.5618343565969177, "grad_norm": 0.36782529950141907, "learning_rate": 7.808545295075806e-06, "loss": 0.4686, "num_input_tokens_seen": 15149088, "step": 12465 }, { "epoch": 1.562460844505701, "grad_norm": 0.19574157893657684, "learning_rate": 7.811677734619722e-06, "loss": 0.4676, "num_input_tokens_seen": 15155040, "step": 12470 }, { "epoch": 1.5630873324144843, "grad_norm": 0.3134431540966034, "learning_rate": 7.814810174163639e-06, "loss": 0.4587, "num_input_tokens_seen": 15161664, "step": 12475 }, { "epoch": 1.563713820323268, "grad_norm": 0.2053396850824356, "learning_rate": 7.817942613707556e-06, "loss": 0.4665, "num_input_tokens_seen": 15167904, "step": 12480 }, { "epoch": 1.564340308232051, "grad_norm": 0.20270079374313354, "learning_rate": 7.821075053251473e-06, "loss": 0.4607, "num_input_tokens_seen": 15173888, "step": 12485 }, { "epoch": 1.5649667961408344, "grad_norm": 0.3229086399078369, "learning_rate": 7.82420749279539e-06, "loss": 0.4737, "num_input_tokens_seen": 15179424, "step": 12490 }, { "epoch": 1.5655932840496178, "grad_norm": 0.38853755593299866, "learning_rate": 7.827339932339306e-06, "loss": 0.4658, "num_input_tokens_seen": 15185024, "step": 12495 }, { "epoch": 1.5662197719584012, "grad_norm": 0.379414439201355, "learning_rate": 7.830472371883223e-06, "loss": 0.4621, "num_input_tokens_seen": 15190944, "step": 12500 }, { "epoch": 1.5668462598671846, "grad_norm": 0.07801718264818192, "learning_rate": 7.83360481142714e-06, "loss": 0.4683, "num_input_tokens_seen": 15196608, "step": 12505 }, { "epoch": 1.5674727477759678, "grad_norm": 0.3332430422306061, "learning_rate": 7.836737250971056e-06, "loss": 0.4615, "num_input_tokens_seen": 15202432, "step": 12510 }, { "epoch": 1.5680992356847514, "grad_norm": 0.19665083289146423, "learning_rate": 7.839869690514973e-06, "loss": 0.4625, "num_input_tokens_seen": 15208384, "step": 12515 }, { "epoch": 1.5687257235935346, "grad_norm": 0.2611668109893799, "learning_rate": 7.843002130058892e-06, "loss": 0.459, "num_input_tokens_seen": 15214304, "step": 12520 }, { "epoch": 1.569352211502318, "grad_norm": 0.31007081270217896, "learning_rate": 7.846134569602807e-06, "loss": 0.4635, "num_input_tokens_seen": 15219712, "step": 12525 }, { "epoch": 1.5699786994111014, "grad_norm": 0.27473384141921997, "learning_rate": 7.849267009146724e-06, "loss": 0.458, "num_input_tokens_seen": 15225856, "step": 12530 }, { "epoch": 1.5706051873198847, "grad_norm": 0.15126118063926697, "learning_rate": 7.85239944869064e-06, "loss": 0.4631, "num_input_tokens_seen": 15231648, "step": 12535 }, { "epoch": 1.5712316752286681, "grad_norm": 0.09490079432725906, "learning_rate": 7.855531888234557e-06, "loss": 0.4595, "num_input_tokens_seen": 15237664, "step": 12540 }, { "epoch": 1.5718581631374513, "grad_norm": 0.3482178747653961, "learning_rate": 7.858664327778475e-06, "loss": 0.4609, "num_input_tokens_seen": 15243584, "step": 12545 }, { "epoch": 1.572484651046235, "grad_norm": 0.15091116726398468, "learning_rate": 7.861796767322392e-06, "loss": 0.4732, "num_input_tokens_seen": 15249664, "step": 12550 }, { "epoch": 1.573111138955018, "grad_norm": 0.18591473996639252, "learning_rate": 7.864929206866309e-06, "loss": 0.4652, "num_input_tokens_seen": 15256096, "step": 12555 }, { "epoch": 1.5737376268638015, "grad_norm": 0.188689723610878, "learning_rate": 7.868061646410224e-06, "loss": 0.4604, "num_input_tokens_seen": 15261952, "step": 12560 }, { "epoch": 1.5743641147725849, "grad_norm": 0.26542016863822937, "learning_rate": 7.871194085954141e-06, "loss": 0.4692, "num_input_tokens_seen": 15267968, "step": 12565 }, { "epoch": 1.5749906026813683, "grad_norm": 0.25112858414649963, "learning_rate": 7.874326525498058e-06, "loss": 0.4667, "num_input_tokens_seen": 15274208, "step": 12570 }, { "epoch": 1.5756170905901516, "grad_norm": 0.32691219449043274, "learning_rate": 7.877458965041976e-06, "loss": 0.4613, "num_input_tokens_seen": 15279968, "step": 12575 }, { "epoch": 1.5762435784989348, "grad_norm": 0.1854635328054428, "learning_rate": 7.880591404585893e-06, "loss": 0.4703, "num_input_tokens_seen": 15286080, "step": 12580 }, { "epoch": 1.5768700664077184, "grad_norm": 0.173902690410614, "learning_rate": 7.88372384412981e-06, "loss": 0.4675, "num_input_tokens_seen": 15292160, "step": 12585 }, { "epoch": 1.5774965543165016, "grad_norm": 0.17762450873851776, "learning_rate": 7.886856283673726e-06, "loss": 0.4694, "num_input_tokens_seen": 15298048, "step": 12590 }, { "epoch": 1.5781230422252852, "grad_norm": 0.06994430720806122, "learning_rate": 7.889988723217643e-06, "loss": 0.4629, "num_input_tokens_seen": 15304256, "step": 12595 }, { "epoch": 1.5787495301340684, "grad_norm": 0.07652650028467178, "learning_rate": 7.893121162761558e-06, "loss": 0.465, "num_input_tokens_seen": 15310208, "step": 12600 }, { "epoch": 1.5793760180428518, "grad_norm": 0.16815641522407532, "learning_rate": 7.896253602305477e-06, "loss": 0.4622, "num_input_tokens_seen": 15316128, "step": 12605 }, { "epoch": 1.5800025059516352, "grad_norm": 0.1904037892818451, "learning_rate": 7.899386041849393e-06, "loss": 0.4596, "num_input_tokens_seen": 15322496, "step": 12610 }, { "epoch": 1.5806289938604183, "grad_norm": 0.340348482131958, "learning_rate": 7.90251848139331e-06, "loss": 0.4661, "num_input_tokens_seen": 15328704, "step": 12615 }, { "epoch": 1.581255481769202, "grad_norm": 0.08867605030536652, "learning_rate": 7.905650920937227e-06, "loss": 0.4615, "num_input_tokens_seen": 15334720, "step": 12620 }, { "epoch": 1.5818819696779851, "grad_norm": 0.3372219204902649, "learning_rate": 7.908783360481144e-06, "loss": 0.4637, "num_input_tokens_seen": 15340384, "step": 12625 }, { "epoch": 1.5825084575867687, "grad_norm": 0.090458944439888, "learning_rate": 7.91191580002506e-06, "loss": 0.4639, "num_input_tokens_seen": 15346432, "step": 12630 }, { "epoch": 1.583134945495552, "grad_norm": 0.15401054918766022, "learning_rate": 7.915048239568977e-06, "loss": 0.4614, "num_input_tokens_seen": 15352320, "step": 12635 }, { "epoch": 1.5837614334043353, "grad_norm": 0.30992576479911804, "learning_rate": 7.918180679112894e-06, "loss": 0.4653, "num_input_tokens_seen": 15358016, "step": 12640 }, { "epoch": 1.5843879213131187, "grad_norm": 0.1887609213590622, "learning_rate": 7.92131311865681e-06, "loss": 0.4596, "num_input_tokens_seen": 15363616, "step": 12645 }, { "epoch": 1.585014409221902, "grad_norm": 0.3875580132007599, "learning_rate": 7.924445558200728e-06, "loss": 0.4541, "num_input_tokens_seen": 15369664, "step": 12650 }, { "epoch": 1.5856408971306855, "grad_norm": 0.27962222695350647, "learning_rate": 7.927577997744644e-06, "loss": 0.4618, "num_input_tokens_seen": 15375680, "step": 12655 }, { "epoch": 1.5862673850394686, "grad_norm": 0.31865349411964417, "learning_rate": 7.930710437288561e-06, "loss": 0.4624, "num_input_tokens_seen": 15382048, "step": 12660 }, { "epoch": 1.5868938729482522, "grad_norm": 0.2851105332374573, "learning_rate": 7.933842876832478e-06, "loss": 0.4547, "num_input_tokens_seen": 15388224, "step": 12665 }, { "epoch": 1.5875203608570354, "grad_norm": 0.2162598967552185, "learning_rate": 7.936975316376395e-06, "loss": 0.4595, "num_input_tokens_seen": 15394400, "step": 12670 }, { "epoch": 1.5881468487658188, "grad_norm": 0.2487480193376541, "learning_rate": 7.940107755920311e-06, "loss": 0.463, "num_input_tokens_seen": 15400192, "step": 12675 }, { "epoch": 1.5887733366746022, "grad_norm": 0.23042097687721252, "learning_rate": 7.943240195464228e-06, "loss": 0.4506, "num_input_tokens_seen": 15406688, "step": 12680 }, { "epoch": 1.5893998245833856, "grad_norm": 0.20550715923309326, "learning_rate": 7.946372635008145e-06, "loss": 0.453, "num_input_tokens_seen": 15412512, "step": 12685 }, { "epoch": 1.590026312492169, "grad_norm": 0.1973976045846939, "learning_rate": 7.949505074552062e-06, "loss": 0.4739, "num_input_tokens_seen": 15418752, "step": 12690 }, { "epoch": 1.5906528004009521, "grad_norm": 0.26059490442276, "learning_rate": 7.952637514095978e-06, "loss": 0.465, "num_input_tokens_seen": 15425088, "step": 12695 }, { "epoch": 1.5912792883097358, "grad_norm": 0.2583775520324707, "learning_rate": 7.955769953639895e-06, "loss": 0.4693, "num_input_tokens_seen": 15431392, "step": 12700 }, { "epoch": 1.591905776218519, "grad_norm": 0.1896762251853943, "learning_rate": 7.958902393183812e-06, "loss": 0.4638, "num_input_tokens_seen": 15436832, "step": 12705 }, { "epoch": 1.5925322641273023, "grad_norm": 0.2522101104259491, "learning_rate": 7.962034832727729e-06, "loss": 0.4677, "num_input_tokens_seen": 15442880, "step": 12710 }, { "epoch": 1.5931587520360857, "grad_norm": 0.2398802787065506, "learning_rate": 7.965167272271645e-06, "loss": 0.4659, "num_input_tokens_seen": 15449056, "step": 12715 }, { "epoch": 1.593785239944869, "grad_norm": 0.22457312047481537, "learning_rate": 7.968299711815562e-06, "loss": 0.4583, "num_input_tokens_seen": 15455200, "step": 12720 }, { "epoch": 1.5944117278536525, "grad_norm": 0.22839415073394775, "learning_rate": 7.971432151359479e-06, "loss": 0.4588, "num_input_tokens_seen": 15461088, "step": 12725 }, { "epoch": 1.5950382157624357, "grad_norm": 0.3093472719192505, "learning_rate": 7.974564590903396e-06, "loss": 0.4618, "num_input_tokens_seen": 15467168, "step": 12730 }, { "epoch": 1.5956647036712193, "grad_norm": 0.2430899441242218, "learning_rate": 7.977697030447314e-06, "loss": 0.4668, "num_input_tokens_seen": 15472800, "step": 12735 }, { "epoch": 1.5962911915800024, "grad_norm": 0.25177204608917236, "learning_rate": 7.980829469991231e-06, "loss": 0.4689, "num_input_tokens_seen": 15478624, "step": 12740 }, { "epoch": 1.5969176794887858, "grad_norm": 0.23045790195465088, "learning_rate": 7.983961909535146e-06, "loss": 0.461, "num_input_tokens_seen": 15484352, "step": 12745 }, { "epoch": 1.5975441673975692, "grad_norm": 0.18812517821788788, "learning_rate": 7.987094349079063e-06, "loss": 0.4604, "num_input_tokens_seen": 15490368, "step": 12750 }, { "epoch": 1.5981706553063526, "grad_norm": 0.36391061544418335, "learning_rate": 7.99022678862298e-06, "loss": 0.4717, "num_input_tokens_seen": 15496640, "step": 12755 }, { "epoch": 1.598797143215136, "grad_norm": 0.27880212664604187, "learning_rate": 7.993359228166896e-06, "loss": 0.4637, "num_input_tokens_seen": 15502816, "step": 12760 }, { "epoch": 1.5994236311239192, "grad_norm": 0.39119812846183777, "learning_rate": 7.996491667710815e-06, "loss": 0.4649, "num_input_tokens_seen": 15508672, "step": 12765 }, { "epoch": 1.6000501190327028, "grad_norm": 0.26793837547302246, "learning_rate": 7.999624107254732e-06, "loss": 0.4598, "num_input_tokens_seen": 15514848, "step": 12770 }, { "epoch": 1.600676606941486, "grad_norm": 0.27402380108833313, "learning_rate": 8.002756546798648e-06, "loss": 0.4665, "num_input_tokens_seen": 15520800, "step": 12775 }, { "epoch": 1.6013030948502693, "grad_norm": 0.25012239813804626, "learning_rate": 8.005888986342563e-06, "loss": 0.4639, "num_input_tokens_seen": 15526976, "step": 12780 }, { "epoch": 1.6019295827590527, "grad_norm": 0.0977645069360733, "learning_rate": 8.00902142588648e-06, "loss": 0.4658, "num_input_tokens_seen": 15533152, "step": 12785 }, { "epoch": 1.6025560706678361, "grad_norm": 0.2551387846469879, "learning_rate": 8.012153865430397e-06, "loss": 0.4642, "num_input_tokens_seen": 15539520, "step": 12790 }, { "epoch": 1.6031825585766195, "grad_norm": 0.19204948842525482, "learning_rate": 8.015286304974315e-06, "loss": 0.4578, "num_input_tokens_seen": 15545824, "step": 12795 }, { "epoch": 1.6038090464854027, "grad_norm": 0.17957256734371185, "learning_rate": 8.018418744518232e-06, "loss": 0.4594, "num_input_tokens_seen": 15551232, "step": 12800 }, { "epoch": 1.6044355343941863, "grad_norm": 0.19779011607170105, "learning_rate": 8.021551184062149e-06, "loss": 0.4609, "num_input_tokens_seen": 15557760, "step": 12805 }, { "epoch": 1.6050620223029695, "grad_norm": 0.22188840806484222, "learning_rate": 8.024683623606066e-06, "loss": 0.4638, "num_input_tokens_seen": 15563904, "step": 12810 }, { "epoch": 1.605688510211753, "grad_norm": 0.17182189226150513, "learning_rate": 8.027816063149982e-06, "loss": 0.4617, "num_input_tokens_seen": 15570048, "step": 12815 }, { "epoch": 1.6063149981205362, "grad_norm": 0.1875155121088028, "learning_rate": 8.030948502693897e-06, "loss": 0.4633, "num_input_tokens_seen": 15576192, "step": 12820 }, { "epoch": 1.6069414860293196, "grad_norm": 0.18333187699317932, "learning_rate": 8.034080942237816e-06, "loss": 0.4655, "num_input_tokens_seen": 15582464, "step": 12825 }, { "epoch": 1.607567973938103, "grad_norm": 0.07761004567146301, "learning_rate": 8.037213381781733e-06, "loss": 0.4628, "num_input_tokens_seen": 15588544, "step": 12830 }, { "epoch": 1.6081944618468862, "grad_norm": 0.2582165002822876, "learning_rate": 8.04034582132565e-06, "loss": 0.468, "num_input_tokens_seen": 15594368, "step": 12835 }, { "epoch": 1.6088209497556698, "grad_norm": 0.21990109980106354, "learning_rate": 8.043478260869566e-06, "loss": 0.4602, "num_input_tokens_seen": 15600672, "step": 12840 }, { "epoch": 1.609447437664453, "grad_norm": 0.3037578761577606, "learning_rate": 8.046610700413483e-06, "loss": 0.4576, "num_input_tokens_seen": 15606688, "step": 12845 }, { "epoch": 1.6100739255732366, "grad_norm": 0.16699771583080292, "learning_rate": 8.0497431399574e-06, "loss": 0.4581, "num_input_tokens_seen": 15612672, "step": 12850 }, { "epoch": 1.6107004134820198, "grad_norm": 0.24368180334568024, "learning_rate": 8.052875579501316e-06, "loss": 0.4595, "num_input_tokens_seen": 15618400, "step": 12855 }, { "epoch": 1.6113269013908031, "grad_norm": 0.22665071487426758, "learning_rate": 8.056008019045233e-06, "loss": 0.4695, "num_input_tokens_seen": 15624384, "step": 12860 }, { "epoch": 1.6119533892995865, "grad_norm": 0.09998884797096252, "learning_rate": 8.05914045858915e-06, "loss": 0.4693, "num_input_tokens_seen": 15630688, "step": 12865 }, { "epoch": 1.61257987720837, "grad_norm": 0.24792474508285522, "learning_rate": 8.062272898133067e-06, "loss": 0.4579, "num_input_tokens_seen": 15636896, "step": 12870 }, { "epoch": 1.6132063651171533, "grad_norm": 0.21339774131774902, "learning_rate": 8.065405337676984e-06, "loss": 0.4631, "num_input_tokens_seen": 15642720, "step": 12875 }, { "epoch": 1.6138328530259365, "grad_norm": 0.11442393064498901, "learning_rate": 8.0685377772209e-06, "loss": 0.4633, "num_input_tokens_seen": 15649056, "step": 12880 }, { "epoch": 1.61445934093472, "grad_norm": 0.21302703022956848, "learning_rate": 8.071670216764817e-06, "loss": 0.4591, "num_input_tokens_seen": 15655168, "step": 12885 }, { "epoch": 1.6150858288435033, "grad_norm": 0.2438880205154419, "learning_rate": 8.074802656308734e-06, "loss": 0.4633, "num_input_tokens_seen": 15661120, "step": 12890 }, { "epoch": 1.6157123167522867, "grad_norm": 0.2361435741186142, "learning_rate": 8.07793509585265e-06, "loss": 0.462, "num_input_tokens_seen": 15667168, "step": 12895 }, { "epoch": 1.61633880466107, "grad_norm": 0.3264104425907135, "learning_rate": 8.081067535396567e-06, "loss": 0.4627, "num_input_tokens_seen": 15673088, "step": 12900 }, { "epoch": 1.6169652925698534, "grad_norm": 0.36059796810150146, "learning_rate": 8.084199974940484e-06, "loss": 0.4647, "num_input_tokens_seen": 15679168, "step": 12905 }, { "epoch": 1.6175917804786368, "grad_norm": 0.2214708775281906, "learning_rate": 8.087332414484401e-06, "loss": 0.4619, "num_input_tokens_seen": 15685024, "step": 12910 }, { "epoch": 1.61821826838742, "grad_norm": 0.19759418070316315, "learning_rate": 8.090464854028318e-06, "loss": 0.4657, "num_input_tokens_seen": 15691168, "step": 12915 }, { "epoch": 1.6188447562962036, "grad_norm": 0.38759785890579224, "learning_rate": 8.093597293572234e-06, "loss": 0.4674, "num_input_tokens_seen": 15696992, "step": 12920 }, { "epoch": 1.6194712442049868, "grad_norm": 0.2779008746147156, "learning_rate": 8.096729733116153e-06, "loss": 0.4644, "num_input_tokens_seen": 15703168, "step": 12925 }, { "epoch": 1.6200977321137702, "grad_norm": 0.3341507315635681, "learning_rate": 8.099862172660068e-06, "loss": 0.4675, "num_input_tokens_seen": 15708864, "step": 12930 }, { "epoch": 1.6207242200225536, "grad_norm": 0.16595369577407837, "learning_rate": 8.102994612203985e-06, "loss": 0.4577, "num_input_tokens_seen": 15714688, "step": 12935 }, { "epoch": 1.621350707931337, "grad_norm": 0.08123953640460968, "learning_rate": 8.106127051747901e-06, "loss": 0.4645, "num_input_tokens_seen": 15720704, "step": 12940 }, { "epoch": 1.6219771958401203, "grad_norm": 0.3318946063518524, "learning_rate": 8.109259491291818e-06, "loss": 0.4676, "num_input_tokens_seen": 15726880, "step": 12945 }, { "epoch": 1.6226036837489035, "grad_norm": 0.2995567321777344, "learning_rate": 8.112391930835735e-06, "loss": 0.4681, "num_input_tokens_seen": 15733248, "step": 12950 }, { "epoch": 1.6232301716576871, "grad_norm": 0.22595834732055664, "learning_rate": 8.115524370379653e-06, "loss": 0.4622, "num_input_tokens_seen": 15739552, "step": 12955 }, { "epoch": 1.6238566595664703, "grad_norm": 0.2245444655418396, "learning_rate": 8.11865680992357e-06, "loss": 0.4638, "num_input_tokens_seen": 15745696, "step": 12960 }, { "epoch": 1.6244831474752537, "grad_norm": 0.22743044793605804, "learning_rate": 8.121789249467485e-06, "loss": 0.4631, "num_input_tokens_seen": 15751840, "step": 12965 }, { "epoch": 1.625109635384037, "grad_norm": 0.27227503061294556, "learning_rate": 8.124921689011402e-06, "loss": 0.459, "num_input_tokens_seen": 15758144, "step": 12970 }, { "epoch": 1.6257361232928205, "grad_norm": 0.1574118286371231, "learning_rate": 8.128054128555319e-06, "loss": 0.4664, "num_input_tokens_seen": 15764096, "step": 12975 }, { "epoch": 1.6263626112016039, "grad_norm": 0.1866026073694229, "learning_rate": 8.131186568099236e-06, "loss": 0.4695, "num_input_tokens_seen": 15770208, "step": 12980 }, { "epoch": 1.626989099110387, "grad_norm": 0.2644459307193756, "learning_rate": 8.134319007643154e-06, "loss": 0.4596, "num_input_tokens_seen": 15776608, "step": 12985 }, { "epoch": 1.6276155870191706, "grad_norm": 0.1719454824924469, "learning_rate": 8.13745144718707e-06, "loss": 0.466, "num_input_tokens_seen": 15782304, "step": 12990 }, { "epoch": 1.6282420749279538, "grad_norm": 0.17649151384830475, "learning_rate": 8.140583886730988e-06, "loss": 0.4536, "num_input_tokens_seen": 15788448, "step": 12995 }, { "epoch": 1.6288685628367372, "grad_norm": 0.0803588405251503, "learning_rate": 8.143716326274903e-06, "loss": 0.4576, "num_input_tokens_seen": 15794176, "step": 13000 }, { "epoch": 1.6294950507455206, "grad_norm": 0.23100310564041138, "learning_rate": 8.14684876581882e-06, "loss": 0.4698, "num_input_tokens_seen": 15800640, "step": 13005 }, { "epoch": 1.630121538654304, "grad_norm": 0.2858688533306122, "learning_rate": 8.149981205362736e-06, "loss": 0.4675, "num_input_tokens_seen": 15806944, "step": 13010 }, { "epoch": 1.6307480265630874, "grad_norm": 0.19954358041286469, "learning_rate": 8.153113644906655e-06, "loss": 0.4624, "num_input_tokens_seen": 15813152, "step": 13015 }, { "epoch": 1.6313745144718705, "grad_norm": 0.3312489092350006, "learning_rate": 8.156246084450571e-06, "loss": 0.471, "num_input_tokens_seen": 15819392, "step": 13020 }, { "epoch": 1.6320010023806542, "grad_norm": 0.4584304392337799, "learning_rate": 8.159378523994488e-06, "loss": 0.4738, "num_input_tokens_seen": 15824416, "step": 13025 }, { "epoch": 1.6326274902894373, "grad_norm": 0.23645640909671783, "learning_rate": 8.162510963538405e-06, "loss": 0.464, "num_input_tokens_seen": 15830688, "step": 13030 }, { "epoch": 1.633253978198221, "grad_norm": 0.14776524901390076, "learning_rate": 8.165643403082322e-06, "loss": 0.4643, "num_input_tokens_seen": 15836896, "step": 13035 }, { "epoch": 1.633880466107004, "grad_norm": 0.14828816056251526, "learning_rate": 8.168775842626237e-06, "loss": 0.4638, "num_input_tokens_seen": 15843264, "step": 13040 }, { "epoch": 1.6345069540157875, "grad_norm": 0.2133111208677292, "learning_rate": 8.171908282170155e-06, "loss": 0.4591, "num_input_tokens_seen": 15849408, "step": 13045 }, { "epoch": 1.6351334419245709, "grad_norm": 0.17122876644134521, "learning_rate": 8.175040721714072e-06, "loss": 0.4608, "num_input_tokens_seen": 15855424, "step": 13050 }, { "epoch": 1.635759929833354, "grad_norm": 0.302094042301178, "learning_rate": 8.178173161257989e-06, "loss": 0.4592, "num_input_tokens_seen": 15861696, "step": 13055 }, { "epoch": 1.6363864177421377, "grad_norm": 0.22433602809906006, "learning_rate": 8.181305600801905e-06, "loss": 0.4597, "num_input_tokens_seen": 15867936, "step": 13060 }, { "epoch": 1.6370129056509208, "grad_norm": 0.15728594362735748, "learning_rate": 8.184438040345822e-06, "loss": 0.4618, "num_input_tokens_seen": 15874208, "step": 13065 }, { "epoch": 1.6376393935597044, "grad_norm": 0.17385661602020264, "learning_rate": 8.187570479889739e-06, "loss": 0.4608, "num_input_tokens_seen": 15880192, "step": 13070 }, { "epoch": 1.6382658814684876, "grad_norm": 0.17646196484565735, "learning_rate": 8.190702919433656e-06, "loss": 0.4536, "num_input_tokens_seen": 15886816, "step": 13075 }, { "epoch": 1.638892369377271, "grad_norm": 0.3089113235473633, "learning_rate": 8.193835358977573e-06, "loss": 0.461, "num_input_tokens_seen": 15893056, "step": 13080 }, { "epoch": 1.6395188572860544, "grad_norm": 0.17823739349842072, "learning_rate": 8.19696779852149e-06, "loss": 0.4561, "num_input_tokens_seen": 15899264, "step": 13085 }, { "epoch": 1.6401453451948378, "grad_norm": 0.1700691431760788, "learning_rate": 8.200100238065406e-06, "loss": 0.46, "num_input_tokens_seen": 15905472, "step": 13090 }, { "epoch": 1.6407718331036212, "grad_norm": 0.21008990705013275, "learning_rate": 8.203232677609323e-06, "loss": 0.4638, "num_input_tokens_seen": 15911744, "step": 13095 }, { "epoch": 1.6413983210124043, "grad_norm": 0.2216845005750656, "learning_rate": 8.20636511715324e-06, "loss": 0.4562, "num_input_tokens_seen": 15918208, "step": 13100 }, { "epoch": 1.642024808921188, "grad_norm": 0.08117195963859558, "learning_rate": 8.209497556697156e-06, "loss": 0.4545, "num_input_tokens_seen": 15924608, "step": 13105 }, { "epoch": 1.6426512968299711, "grad_norm": 0.19905632734298706, "learning_rate": 8.212629996241073e-06, "loss": 0.4591, "num_input_tokens_seen": 15929984, "step": 13110 }, { "epoch": 1.6432777847387545, "grad_norm": 0.08943729847669601, "learning_rate": 8.21576243578499e-06, "loss": 0.4688, "num_input_tokens_seen": 15936288, "step": 13115 }, { "epoch": 1.643904272647538, "grad_norm": 0.1970319002866745, "learning_rate": 8.218894875328907e-06, "loss": 0.4709, "num_input_tokens_seen": 15942560, "step": 13120 }, { "epoch": 1.6445307605563213, "grad_norm": 0.3638760447502136, "learning_rate": 8.222027314872823e-06, "loss": 0.4585, "num_input_tokens_seen": 15948896, "step": 13125 }, { "epoch": 1.6451572484651047, "grad_norm": 0.06149204820394516, "learning_rate": 8.22515975441674e-06, "loss": 0.4575, "num_input_tokens_seen": 15955104, "step": 13130 }, { "epoch": 1.6457837363738879, "grad_norm": 0.30402180552482605, "learning_rate": 8.228292193960657e-06, "loss": 0.4558, "num_input_tokens_seen": 15961408, "step": 13135 }, { "epoch": 1.6464102242826715, "grad_norm": 0.20677584409713745, "learning_rate": 8.231424633504574e-06, "loss": 0.4664, "num_input_tokens_seen": 15967648, "step": 13140 }, { "epoch": 1.6470367121914546, "grad_norm": 0.2988409101963043, "learning_rate": 8.234557073048492e-06, "loss": 0.4803, "num_input_tokens_seen": 15973216, "step": 13145 }, { "epoch": 1.647663200100238, "grad_norm": 0.16940565407276154, "learning_rate": 8.237689512592407e-06, "loss": 0.4702, "num_input_tokens_seen": 15979520, "step": 13150 }, { "epoch": 1.6482896880090214, "grad_norm": 0.19377024471759796, "learning_rate": 8.240821952136324e-06, "loss": 0.4579, "num_input_tokens_seen": 15985440, "step": 13155 }, { "epoch": 1.6489161759178048, "grad_norm": 0.21250241994857788, "learning_rate": 8.24395439168024e-06, "loss": 0.4604, "num_input_tokens_seen": 15991712, "step": 13160 }, { "epoch": 1.6495426638265882, "grad_norm": 0.18390890955924988, "learning_rate": 8.247086831224158e-06, "loss": 0.4647, "num_input_tokens_seen": 15998112, "step": 13165 }, { "epoch": 1.6501691517353714, "grad_norm": 0.18566888570785522, "learning_rate": 8.250219270768074e-06, "loss": 0.4594, "num_input_tokens_seen": 16004448, "step": 13170 }, { "epoch": 1.650795639644155, "grad_norm": 0.23609815537929535, "learning_rate": 8.253351710311993e-06, "loss": 0.4652, "num_input_tokens_seen": 16010528, "step": 13175 }, { "epoch": 1.6514221275529382, "grad_norm": 0.092585988342762, "learning_rate": 8.25648414985591e-06, "loss": 0.4598, "num_input_tokens_seen": 16016960, "step": 13180 }, { "epoch": 1.6520486154617215, "grad_norm": 0.1896883249282837, "learning_rate": 8.259616589399825e-06, "loss": 0.461, "num_input_tokens_seen": 16023328, "step": 13185 }, { "epoch": 1.652675103370505, "grad_norm": 0.2444959431886673, "learning_rate": 8.262749028943741e-06, "loss": 0.4594, "num_input_tokens_seen": 16029216, "step": 13190 }, { "epoch": 1.6533015912792883, "grad_norm": 0.1836143136024475, "learning_rate": 8.265881468487658e-06, "loss": 0.4583, "num_input_tokens_seen": 16035488, "step": 13195 }, { "epoch": 1.6539280791880717, "grad_norm": 0.1870044767856598, "learning_rate": 8.269013908031575e-06, "loss": 0.4664, "num_input_tokens_seen": 16041152, "step": 13200 }, { "epoch": 1.654554567096855, "grad_norm": 0.2798924148082733, "learning_rate": 8.272146347575493e-06, "loss": 0.4646, "num_input_tokens_seen": 16047392, "step": 13205 }, { "epoch": 1.6551810550056385, "grad_norm": 0.5520912408828735, "learning_rate": 8.27527878711941e-06, "loss": 0.4485, "num_input_tokens_seen": 16053600, "step": 13210 }, { "epoch": 1.6558075429144217, "grad_norm": 0.4044063687324524, "learning_rate": 8.278411226663327e-06, "loss": 0.4777, "num_input_tokens_seen": 16059776, "step": 13215 }, { "epoch": 1.656434030823205, "grad_norm": 0.39754217863082886, "learning_rate": 8.281543666207242e-06, "loss": 0.4517, "num_input_tokens_seen": 16065920, "step": 13220 }, { "epoch": 1.6570605187319885, "grad_norm": 0.40033620595932007, "learning_rate": 8.284676105751159e-06, "loss": 0.4628, "num_input_tokens_seen": 16071904, "step": 13225 }, { "epoch": 1.6576870066407718, "grad_norm": 0.2969355881214142, "learning_rate": 8.287808545295077e-06, "loss": 0.4642, "num_input_tokens_seen": 16077920, "step": 13230 }, { "epoch": 1.6583134945495552, "grad_norm": 0.3666814863681793, "learning_rate": 8.290940984838994e-06, "loss": 0.4627, "num_input_tokens_seen": 16084064, "step": 13235 }, { "epoch": 1.6589399824583384, "grad_norm": 0.3765205442905426, "learning_rate": 8.29407342438291e-06, "loss": 0.4718, "num_input_tokens_seen": 16090016, "step": 13240 }, { "epoch": 1.659566470367122, "grad_norm": 0.4374752640724182, "learning_rate": 8.297205863926827e-06, "loss": 0.4655, "num_input_tokens_seen": 16095904, "step": 13245 }, { "epoch": 1.6601929582759052, "grad_norm": 0.31713399291038513, "learning_rate": 8.300338303470744e-06, "loss": 0.4539, "num_input_tokens_seen": 16102048, "step": 13250 }, { "epoch": 1.6608194461846888, "grad_norm": 0.18281161785125732, "learning_rate": 8.303470743014661e-06, "loss": 0.4638, "num_input_tokens_seen": 16108000, "step": 13255 }, { "epoch": 1.661445934093472, "grad_norm": 0.20335303246974945, "learning_rate": 8.306603182558578e-06, "loss": 0.462, "num_input_tokens_seen": 16114240, "step": 13260 }, { "epoch": 1.6620724220022554, "grad_norm": 0.22172048687934875, "learning_rate": 8.309735622102494e-06, "loss": 0.4693, "num_input_tokens_seen": 16120480, "step": 13265 }, { "epoch": 1.6626989099110387, "grad_norm": 0.2848634719848633, "learning_rate": 8.312868061646411e-06, "loss": 0.4537, "num_input_tokens_seen": 16126528, "step": 13270 }, { "epoch": 1.663325397819822, "grad_norm": 0.31237488985061646, "learning_rate": 8.316000501190328e-06, "loss": 0.4801, "num_input_tokens_seen": 16133184, "step": 13275 }, { "epoch": 1.6639518857286055, "grad_norm": 0.07855445146560669, "learning_rate": 8.319132940734245e-06, "loss": 0.4593, "num_input_tokens_seen": 16139104, "step": 13280 }, { "epoch": 1.6645783736373887, "grad_norm": 0.18651661276817322, "learning_rate": 8.322265380278162e-06, "loss": 0.4611, "num_input_tokens_seen": 16145600, "step": 13285 }, { "epoch": 1.6652048615461723, "grad_norm": 0.15951351821422577, "learning_rate": 8.325397819822078e-06, "loss": 0.4648, "num_input_tokens_seen": 16151808, "step": 13290 }, { "epoch": 1.6658313494549555, "grad_norm": 0.18329527974128723, "learning_rate": 8.328530259365995e-06, "loss": 0.4558, "num_input_tokens_seen": 16158048, "step": 13295 }, { "epoch": 1.6664578373637389, "grad_norm": 0.07980923354625702, "learning_rate": 8.331662698909912e-06, "loss": 0.4657, "num_input_tokens_seen": 16163936, "step": 13300 }, { "epoch": 1.6670843252725223, "grad_norm": 0.2212291657924652, "learning_rate": 8.334795138453829e-06, "loss": 0.4601, "num_input_tokens_seen": 16169984, "step": 13305 }, { "epoch": 1.6677108131813057, "grad_norm": 0.08577409386634827, "learning_rate": 8.337927577997745e-06, "loss": 0.4714, "num_input_tokens_seen": 16176064, "step": 13310 }, { "epoch": 1.668337301090089, "grad_norm": 0.18624168634414673, "learning_rate": 8.341060017541662e-06, "loss": 0.4626, "num_input_tokens_seen": 16182016, "step": 13315 }, { "epoch": 1.6689637889988722, "grad_norm": 0.3448936343193054, "learning_rate": 8.344192457085579e-06, "loss": 0.4609, "num_input_tokens_seen": 16188512, "step": 13320 }, { "epoch": 1.6695902769076558, "grad_norm": 0.16421563923358917, "learning_rate": 8.347324896629496e-06, "loss": 0.4669, "num_input_tokens_seen": 16194560, "step": 13325 }, { "epoch": 1.670216764816439, "grad_norm": 0.1794273555278778, "learning_rate": 8.350457336173412e-06, "loss": 0.4651, "num_input_tokens_seen": 16200864, "step": 13330 }, { "epoch": 1.6708432527252224, "grad_norm": 0.16905374825000763, "learning_rate": 8.353589775717329e-06, "loss": 0.4688, "num_input_tokens_seen": 16207360, "step": 13335 }, { "epoch": 1.6714697406340058, "grad_norm": 0.1635100394487381, "learning_rate": 8.356722215261246e-06, "loss": 0.4618, "num_input_tokens_seen": 16213440, "step": 13340 }, { "epoch": 1.6720962285427892, "grad_norm": 0.23360057175159454, "learning_rate": 8.359854654805163e-06, "loss": 0.4695, "num_input_tokens_seen": 16219456, "step": 13345 }, { "epoch": 1.6727227164515726, "grad_norm": 0.30251336097717285, "learning_rate": 8.36298709434908e-06, "loss": 0.4646, "num_input_tokens_seen": 16225920, "step": 13350 }, { "epoch": 1.6733492043603557, "grad_norm": 0.1541629284620285, "learning_rate": 8.366119533892996e-06, "loss": 0.4537, "num_input_tokens_seen": 16232096, "step": 13355 }, { "epoch": 1.6739756922691393, "grad_norm": 0.20354199409484863, "learning_rate": 8.369251973436913e-06, "loss": 0.4554, "num_input_tokens_seen": 16238048, "step": 13360 }, { "epoch": 1.6746021801779225, "grad_norm": 0.27428850531578064, "learning_rate": 8.37238441298083e-06, "loss": 0.4586, "num_input_tokens_seen": 16244224, "step": 13365 }, { "epoch": 1.675228668086706, "grad_norm": 0.19047173857688904, "learning_rate": 8.375516852524746e-06, "loss": 0.4702, "num_input_tokens_seen": 16250304, "step": 13370 }, { "epoch": 1.6758551559954893, "grad_norm": 0.08335322141647339, "learning_rate": 8.378649292068663e-06, "loss": 0.467, "num_input_tokens_seen": 16256256, "step": 13375 }, { "epoch": 1.6764816439042727, "grad_norm": 0.1438176929950714, "learning_rate": 8.38178173161258e-06, "loss": 0.4626, "num_input_tokens_seen": 16262464, "step": 13380 }, { "epoch": 1.677108131813056, "grad_norm": 0.3503116965293884, "learning_rate": 8.384914171156497e-06, "loss": 0.4647, "num_input_tokens_seen": 16268256, "step": 13385 }, { "epoch": 1.6777346197218392, "grad_norm": 0.07864799350500107, "learning_rate": 8.388046610700414e-06, "loss": 0.4546, "num_input_tokens_seen": 16274400, "step": 13390 }, { "epoch": 1.6783611076306229, "grad_norm": 0.29956722259521484, "learning_rate": 8.391179050244332e-06, "loss": 0.4675, "num_input_tokens_seen": 16280192, "step": 13395 }, { "epoch": 1.678987595539406, "grad_norm": 0.09345828741788864, "learning_rate": 8.394311489788249e-06, "loss": 0.4641, "num_input_tokens_seen": 16286176, "step": 13400 }, { "epoch": 1.6796140834481894, "grad_norm": 0.21928314864635468, "learning_rate": 8.397443929332164e-06, "loss": 0.4667, "num_input_tokens_seen": 16292288, "step": 13405 }, { "epoch": 1.6802405713569728, "grad_norm": 0.20297959446907043, "learning_rate": 8.40057636887608e-06, "loss": 0.4676, "num_input_tokens_seen": 16298528, "step": 13410 }, { "epoch": 1.6808670592657562, "grad_norm": 0.1617611199617386, "learning_rate": 8.403708808419997e-06, "loss": 0.4592, "num_input_tokens_seen": 16305024, "step": 13415 }, { "epoch": 1.6814935471745396, "grad_norm": 0.1781066358089447, "learning_rate": 8.406841247963916e-06, "loss": 0.4738, "num_input_tokens_seen": 16311488, "step": 13420 }, { "epoch": 1.6821200350833228, "grad_norm": 0.18431922793388367, "learning_rate": 8.409973687507833e-06, "loss": 0.4676, "num_input_tokens_seen": 16317728, "step": 13425 }, { "epoch": 1.6827465229921064, "grad_norm": 0.2154279500246048, "learning_rate": 8.41310612705175e-06, "loss": 0.4639, "num_input_tokens_seen": 16324000, "step": 13430 }, { "epoch": 1.6833730109008895, "grad_norm": 0.18573948740959167, "learning_rate": 8.416238566595666e-06, "loss": 0.4603, "num_input_tokens_seen": 16329888, "step": 13435 }, { "epoch": 1.683999498809673, "grad_norm": 0.07337921857833862, "learning_rate": 8.419371006139581e-06, "loss": 0.4588, "num_input_tokens_seen": 16335200, "step": 13440 }, { "epoch": 1.6846259867184563, "grad_norm": 0.19606667757034302, "learning_rate": 8.422503445683498e-06, "loss": 0.4672, "num_input_tokens_seen": 16341024, "step": 13445 }, { "epoch": 1.6852524746272397, "grad_norm": 0.19860540330410004, "learning_rate": 8.425635885227416e-06, "loss": 0.4677, "num_input_tokens_seen": 16346656, "step": 13450 }, { "epoch": 1.685878962536023, "grad_norm": 0.0787627324461937, "learning_rate": 8.428768324771333e-06, "loss": 0.4618, "num_input_tokens_seen": 16352576, "step": 13455 }, { "epoch": 1.6865054504448063, "grad_norm": 0.1801789253950119, "learning_rate": 8.43190076431525e-06, "loss": 0.4706, "num_input_tokens_seen": 16358720, "step": 13460 }, { "epoch": 1.6871319383535899, "grad_norm": 0.1472969949245453, "learning_rate": 8.435033203859167e-06, "loss": 0.4647, "num_input_tokens_seen": 16364800, "step": 13465 }, { "epoch": 1.687758426262373, "grad_norm": 0.20915961265563965, "learning_rate": 8.438165643403083e-06, "loss": 0.4627, "num_input_tokens_seen": 16370752, "step": 13470 }, { "epoch": 1.6883849141711567, "grad_norm": 0.21019528806209564, "learning_rate": 8.441298082946999e-06, "loss": 0.4579, "num_input_tokens_seen": 16376864, "step": 13475 }, { "epoch": 1.6890114020799398, "grad_norm": 0.17487813532352448, "learning_rate": 8.444430522490917e-06, "loss": 0.4623, "num_input_tokens_seen": 16382976, "step": 13480 }, { "epoch": 1.6896378899887232, "grad_norm": 0.21538862586021423, "learning_rate": 8.447562962034834e-06, "loss": 0.4639, "num_input_tokens_seen": 16389088, "step": 13485 }, { "epoch": 1.6902643778975066, "grad_norm": 0.21524037420749664, "learning_rate": 8.45069540157875e-06, "loss": 0.4639, "num_input_tokens_seen": 16395168, "step": 13490 }, { "epoch": 1.6908908658062898, "grad_norm": 0.20493975281715393, "learning_rate": 8.453827841122667e-06, "loss": 0.4618, "num_input_tokens_seen": 16401408, "step": 13495 }, { "epoch": 1.6915173537150734, "grad_norm": 0.06556988507509232, "learning_rate": 8.456960280666584e-06, "loss": 0.4691, "num_input_tokens_seen": 16407456, "step": 13500 }, { "epoch": 1.6921438416238566, "grad_norm": 0.06294706463813782, "learning_rate": 8.4600927202105e-06, "loss": 0.4571, "num_input_tokens_seen": 16413344, "step": 13505 }, { "epoch": 1.6927703295326402, "grad_norm": 0.23523671925067902, "learning_rate": 8.463225159754418e-06, "loss": 0.4654, "num_input_tokens_seen": 16419488, "step": 13510 }, { "epoch": 1.6933968174414233, "grad_norm": 0.1487245410680771, "learning_rate": 8.466357599298334e-06, "loss": 0.4636, "num_input_tokens_seen": 16425376, "step": 13515 }, { "epoch": 1.6940233053502067, "grad_norm": 0.17546489834785461, "learning_rate": 8.469490038842251e-06, "loss": 0.4614, "num_input_tokens_seen": 16431744, "step": 13520 }, { "epoch": 1.6946497932589901, "grad_norm": 0.3019871711730957, "learning_rate": 8.472622478386168e-06, "loss": 0.4587, "num_input_tokens_seen": 16437632, "step": 13525 }, { "epoch": 1.6952762811677735, "grad_norm": 0.2439056634902954, "learning_rate": 8.475754917930085e-06, "loss": 0.4619, "num_input_tokens_seen": 16443744, "step": 13530 }, { "epoch": 1.695902769076557, "grad_norm": 0.38067957758903503, "learning_rate": 8.478887357474001e-06, "loss": 0.462, "num_input_tokens_seen": 16450176, "step": 13535 }, { "epoch": 1.69652925698534, "grad_norm": 0.17081542313098907, "learning_rate": 8.482019797017918e-06, "loss": 0.4684, "num_input_tokens_seen": 16456448, "step": 13540 }, { "epoch": 1.6971557448941237, "grad_norm": 0.16331492364406586, "learning_rate": 8.485152236561835e-06, "loss": 0.4672, "num_input_tokens_seen": 16462400, "step": 13545 }, { "epoch": 1.6977822328029069, "grad_norm": 0.16760799288749695, "learning_rate": 8.488284676105752e-06, "loss": 0.4654, "num_input_tokens_seen": 16468736, "step": 13550 }, { "epoch": 1.6984087207116902, "grad_norm": 0.1808309704065323, "learning_rate": 8.491417115649668e-06, "loss": 0.4621, "num_input_tokens_seen": 16474496, "step": 13555 }, { "epoch": 1.6990352086204736, "grad_norm": 0.2100546658039093, "learning_rate": 8.494549555193585e-06, "loss": 0.458, "num_input_tokens_seen": 16480608, "step": 13560 }, { "epoch": 1.699661696529257, "grad_norm": 0.18423427641391754, "learning_rate": 8.497681994737502e-06, "loss": 0.4611, "num_input_tokens_seen": 16486560, "step": 13565 }, { "epoch": 1.7002881844380404, "grad_norm": 0.195944681763649, "learning_rate": 8.500814434281419e-06, "loss": 0.4687, "num_input_tokens_seen": 16492704, "step": 13570 }, { "epoch": 1.7009146723468236, "grad_norm": 0.1498965471982956, "learning_rate": 8.503946873825335e-06, "loss": 0.4594, "num_input_tokens_seen": 16499040, "step": 13575 }, { "epoch": 1.7015411602556072, "grad_norm": 0.1799010932445526, "learning_rate": 8.507079313369254e-06, "loss": 0.4601, "num_input_tokens_seen": 16505312, "step": 13580 }, { "epoch": 1.7021676481643904, "grad_norm": 0.16569429636001587, "learning_rate": 8.510211752913169e-06, "loss": 0.4582, "num_input_tokens_seen": 16511360, "step": 13585 }, { "epoch": 1.7027941360731738, "grad_norm": 0.06982207298278809, "learning_rate": 8.513344192457086e-06, "loss": 0.4641, "num_input_tokens_seen": 16517568, "step": 13590 }, { "epoch": 1.7034206239819571, "grad_norm": 0.2859538197517395, "learning_rate": 8.516476632001003e-06, "loss": 0.4634, "num_input_tokens_seen": 16523776, "step": 13595 }, { "epoch": 1.7040471118907405, "grad_norm": 0.15115603804588318, "learning_rate": 8.51960907154492e-06, "loss": 0.4613, "num_input_tokens_seen": 16529216, "step": 13600 }, { "epoch": 1.704673599799524, "grad_norm": 0.16100266575813293, "learning_rate": 8.522741511088836e-06, "loss": 0.4639, "num_input_tokens_seen": 16535488, "step": 13605 }, { "epoch": 1.705300087708307, "grad_norm": 0.24448584020137787, "learning_rate": 8.525873950632754e-06, "loss": 0.4598, "num_input_tokens_seen": 16541344, "step": 13610 }, { "epoch": 1.7059265756170907, "grad_norm": 0.223598912358284, "learning_rate": 8.529006390176671e-06, "loss": 0.4621, "num_input_tokens_seen": 16547712, "step": 13615 }, { "epoch": 1.7065530635258739, "grad_norm": 0.1878308653831482, "learning_rate": 8.532138829720588e-06, "loss": 0.4683, "num_input_tokens_seen": 16553920, "step": 13620 }, { "epoch": 1.7071795514346573, "grad_norm": 0.22428393363952637, "learning_rate": 8.535271269264503e-06, "loss": 0.4536, "num_input_tokens_seen": 16559616, "step": 13625 }, { "epoch": 1.7078060393434407, "grad_norm": 0.1387363076210022, "learning_rate": 8.53840370880842e-06, "loss": 0.46, "num_input_tokens_seen": 16565760, "step": 13630 }, { "epoch": 1.708432527252224, "grad_norm": 0.14499633014202118, "learning_rate": 8.541536148352337e-06, "loss": 0.4721, "num_input_tokens_seen": 16572000, "step": 13635 }, { "epoch": 1.7090590151610074, "grad_norm": 0.15761937201023102, "learning_rate": 8.544668587896255e-06, "loss": 0.4656, "num_input_tokens_seen": 16577984, "step": 13640 }, { "epoch": 1.7096855030697906, "grad_norm": 0.15636146068572998, "learning_rate": 8.547801027440172e-06, "loss": 0.4594, "num_input_tokens_seen": 16583808, "step": 13645 }, { "epoch": 1.7103119909785742, "grad_norm": 0.1480567455291748, "learning_rate": 8.550933466984089e-06, "loss": 0.4634, "num_input_tokens_seen": 16590112, "step": 13650 }, { "epoch": 1.7109384788873574, "grad_norm": 0.2791091799736023, "learning_rate": 8.554065906528005e-06, "loss": 0.4617, "num_input_tokens_seen": 16596160, "step": 13655 }, { "epoch": 1.7115649667961408, "grad_norm": 0.16199392080307007, "learning_rate": 8.55719834607192e-06, "loss": 0.4641, "num_input_tokens_seen": 16602304, "step": 13660 }, { "epoch": 1.7121914547049242, "grad_norm": 0.165628582239151, "learning_rate": 8.560330785615837e-06, "loss": 0.4648, "num_input_tokens_seen": 16607776, "step": 13665 }, { "epoch": 1.7128179426137076, "grad_norm": 0.05965235456824303, "learning_rate": 8.563463225159756e-06, "loss": 0.4594, "num_input_tokens_seen": 16613664, "step": 13670 }, { "epoch": 1.713444430522491, "grad_norm": 0.07147663086652756, "learning_rate": 8.566595664703672e-06, "loss": 0.4608, "num_input_tokens_seen": 16619136, "step": 13675 }, { "epoch": 1.7140709184312741, "grad_norm": 0.1531137079000473, "learning_rate": 8.56972810424759e-06, "loss": 0.4639, "num_input_tokens_seen": 16625120, "step": 13680 }, { "epoch": 1.7146974063400577, "grad_norm": 0.21315793693065643, "learning_rate": 8.572860543791506e-06, "loss": 0.4639, "num_input_tokens_seen": 16631136, "step": 13685 }, { "epoch": 1.715323894248841, "grad_norm": 0.17877666652202606, "learning_rate": 8.575992983335423e-06, "loss": 0.4585, "num_input_tokens_seen": 16637280, "step": 13690 }, { "epoch": 1.7159503821576245, "grad_norm": 0.23444534838199615, "learning_rate": 8.579125422879338e-06, "loss": 0.4627, "num_input_tokens_seen": 16643584, "step": 13695 }, { "epoch": 1.7165768700664077, "grad_norm": 0.24575074017047882, "learning_rate": 8.582257862423256e-06, "loss": 0.4616, "num_input_tokens_seen": 16650048, "step": 13700 }, { "epoch": 1.717203357975191, "grad_norm": 0.16088621318340302, "learning_rate": 8.585390301967173e-06, "loss": 0.4651, "num_input_tokens_seen": 16656096, "step": 13705 }, { "epoch": 1.7178298458839745, "grad_norm": 0.24597199261188507, "learning_rate": 8.58852274151109e-06, "loss": 0.4649, "num_input_tokens_seen": 16661984, "step": 13710 }, { "epoch": 1.7184563337927576, "grad_norm": 0.26336777210235596, "learning_rate": 8.591655181055007e-06, "loss": 0.4623, "num_input_tokens_seen": 16667872, "step": 13715 }, { "epoch": 1.7190828217015413, "grad_norm": 0.31744951009750366, "learning_rate": 8.594787620598923e-06, "loss": 0.462, "num_input_tokens_seen": 16674304, "step": 13720 }, { "epoch": 1.7197093096103244, "grad_norm": 0.3484782874584198, "learning_rate": 8.59792006014284e-06, "loss": 0.4598, "num_input_tokens_seen": 16680512, "step": 13725 }, { "epoch": 1.720335797519108, "grad_norm": 0.18045790493488312, "learning_rate": 8.601052499686757e-06, "loss": 0.463, "num_input_tokens_seen": 16686912, "step": 13730 }, { "epoch": 1.7209622854278912, "grad_norm": 0.3368564248085022, "learning_rate": 8.604184939230674e-06, "loss": 0.4556, "num_input_tokens_seen": 16693088, "step": 13735 }, { "epoch": 1.7215887733366746, "grad_norm": 0.5927166938781738, "learning_rate": 8.60731737877459e-06, "loss": 0.468, "num_input_tokens_seen": 16699136, "step": 13740 }, { "epoch": 1.722215261245458, "grad_norm": 0.28868478536605835, "learning_rate": 8.610449818318507e-06, "loss": 0.4762, "num_input_tokens_seen": 16705472, "step": 13745 }, { "epoch": 1.7228417491542414, "grad_norm": 0.2057899534702301, "learning_rate": 8.613582257862424e-06, "loss": 0.4522, "num_input_tokens_seen": 16711616, "step": 13750 }, { "epoch": 1.7234682370630248, "grad_norm": 0.22459372878074646, "learning_rate": 8.61671469740634e-06, "loss": 0.4654, "num_input_tokens_seen": 16717920, "step": 13755 }, { "epoch": 1.724094724971808, "grad_norm": 0.09015724062919617, "learning_rate": 8.619847136950257e-06, "loss": 0.4666, "num_input_tokens_seen": 16724064, "step": 13760 }, { "epoch": 1.7247212128805915, "grad_norm": 0.08278228342533112, "learning_rate": 8.622979576494174e-06, "loss": 0.4654, "num_input_tokens_seen": 16730048, "step": 13765 }, { "epoch": 1.7253477007893747, "grad_norm": 0.2596481144428253, "learning_rate": 8.626112016038091e-06, "loss": 0.4595, "num_input_tokens_seen": 16736320, "step": 13770 }, { "epoch": 1.725974188698158, "grad_norm": 0.16885653138160706, "learning_rate": 8.629244455582008e-06, "loss": 0.4594, "num_input_tokens_seen": 16742784, "step": 13775 }, { "epoch": 1.7266006766069415, "grad_norm": 0.20517347753047943, "learning_rate": 8.632376895125924e-06, "loss": 0.4643, "num_input_tokens_seen": 16748800, "step": 13780 }, { "epoch": 1.7272271645157249, "grad_norm": 0.24327728152275085, "learning_rate": 8.635509334669841e-06, "loss": 0.467, "num_input_tokens_seen": 16754944, "step": 13785 }, { "epoch": 1.7278536524245083, "grad_norm": 0.24602974951267242, "learning_rate": 8.638641774213758e-06, "loss": 0.4587, "num_input_tokens_seen": 16760576, "step": 13790 }, { "epoch": 1.7284801403332914, "grad_norm": 0.10861354321241379, "learning_rate": 8.641774213757675e-06, "loss": 0.4633, "num_input_tokens_seen": 16766944, "step": 13795 }, { "epoch": 1.729106628242075, "grad_norm": 0.32019782066345215, "learning_rate": 8.644906653301593e-06, "loss": 0.4603, "num_input_tokens_seen": 16772896, "step": 13800 }, { "epoch": 1.7297331161508582, "grad_norm": 0.25864604115486145, "learning_rate": 8.648039092845508e-06, "loss": 0.4598, "num_input_tokens_seen": 16779104, "step": 13805 }, { "epoch": 1.7303596040596416, "grad_norm": 0.18959416449069977, "learning_rate": 8.651171532389425e-06, "loss": 0.4614, "num_input_tokens_seen": 16783872, "step": 13810 }, { "epoch": 1.730986091968425, "grad_norm": 0.2350301593542099, "learning_rate": 8.654303971933342e-06, "loss": 0.4651, "num_input_tokens_seen": 16789888, "step": 13815 }, { "epoch": 1.7316125798772084, "grad_norm": 0.3379882276058197, "learning_rate": 8.657436411477259e-06, "loss": 0.4593, "num_input_tokens_seen": 16796000, "step": 13820 }, { "epoch": 1.7322390677859918, "grad_norm": 0.48169851303100586, "learning_rate": 8.660568851021175e-06, "loss": 0.4689, "num_input_tokens_seen": 16801920, "step": 13825 }, { "epoch": 1.732865555694775, "grad_norm": 0.2965616285800934, "learning_rate": 8.663701290565094e-06, "loss": 0.4581, "num_input_tokens_seen": 16808160, "step": 13830 }, { "epoch": 1.7334920436035586, "grad_norm": 0.17246013879776, "learning_rate": 8.66683373010901e-06, "loss": 0.4683, "num_input_tokens_seen": 16814336, "step": 13835 }, { "epoch": 1.7341185315123417, "grad_norm": 0.17139826714992523, "learning_rate": 8.669966169652927e-06, "loss": 0.4619, "num_input_tokens_seen": 16820416, "step": 13840 }, { "epoch": 1.7347450194211251, "grad_norm": 0.17979677021503448, "learning_rate": 8.673098609196842e-06, "loss": 0.4712, "num_input_tokens_seen": 16826720, "step": 13845 }, { "epoch": 1.7353715073299085, "grad_norm": 0.22156526148319244, "learning_rate": 8.676231048740759e-06, "loss": 0.4624, "num_input_tokens_seen": 16832800, "step": 13850 }, { "epoch": 1.735997995238692, "grad_norm": 0.06290637701749802, "learning_rate": 8.679363488284676e-06, "loss": 0.458, "num_input_tokens_seen": 16838048, "step": 13855 }, { "epoch": 1.7366244831474753, "grad_norm": 0.09106411784887314, "learning_rate": 8.682495927828594e-06, "loss": 0.4653, "num_input_tokens_seen": 16844000, "step": 13860 }, { "epoch": 1.7372509710562585, "grad_norm": 0.16008153557777405, "learning_rate": 8.685628367372511e-06, "loss": 0.4617, "num_input_tokens_seen": 16849504, "step": 13865 }, { "epoch": 1.737877458965042, "grad_norm": 0.22744688391685486, "learning_rate": 8.688760806916428e-06, "loss": 0.4729, "num_input_tokens_seen": 16855648, "step": 13870 }, { "epoch": 1.7385039468738253, "grad_norm": 0.2621496915817261, "learning_rate": 8.691893246460345e-06, "loss": 0.4623, "num_input_tokens_seen": 16861952, "step": 13875 }, { "epoch": 1.7391304347826086, "grad_norm": 0.08173736929893494, "learning_rate": 8.69502568600426e-06, "loss": 0.4569, "num_input_tokens_seen": 16868192, "step": 13880 }, { "epoch": 1.739756922691392, "grad_norm": 0.2368379533290863, "learning_rate": 8.698158125548176e-06, "loss": 0.4721, "num_input_tokens_seen": 16874112, "step": 13885 }, { "epoch": 1.7403834106001754, "grad_norm": 0.3607334792613983, "learning_rate": 8.701290565092095e-06, "loss": 0.4563, "num_input_tokens_seen": 16880800, "step": 13890 }, { "epoch": 1.7410098985089588, "grad_norm": 0.2006751298904419, "learning_rate": 8.704423004636012e-06, "loss": 0.4767, "num_input_tokens_seen": 16886944, "step": 13895 }, { "epoch": 1.741636386417742, "grad_norm": 0.292637437582016, "learning_rate": 8.707555444179928e-06, "loss": 0.468, "num_input_tokens_seen": 16893568, "step": 13900 }, { "epoch": 1.7422628743265256, "grad_norm": 0.2708602845668793, "learning_rate": 8.710687883723845e-06, "loss": 0.4612, "num_input_tokens_seen": 16900000, "step": 13905 }, { "epoch": 1.7428893622353088, "grad_norm": 0.16548343002796173, "learning_rate": 8.713820323267762e-06, "loss": 0.4597, "num_input_tokens_seen": 16905664, "step": 13910 }, { "epoch": 1.7435158501440924, "grad_norm": 0.1480773389339447, "learning_rate": 8.716952762811679e-06, "loss": 0.468, "num_input_tokens_seen": 16911872, "step": 13915 }, { "epoch": 1.7441423380528756, "grad_norm": 0.2529357969760895, "learning_rate": 8.720085202355596e-06, "loss": 0.467, "num_input_tokens_seen": 16918016, "step": 13920 }, { "epoch": 1.744768825961659, "grad_norm": 0.2126006782054901, "learning_rate": 8.723217641899512e-06, "loss": 0.4685, "num_input_tokens_seen": 16924608, "step": 13925 }, { "epoch": 1.7453953138704423, "grad_norm": 0.18240775167942047, "learning_rate": 8.726350081443429e-06, "loss": 0.4627, "num_input_tokens_seen": 16931104, "step": 13930 }, { "epoch": 1.7460218017792255, "grad_norm": 0.18541616201400757, "learning_rate": 8.729482520987346e-06, "loss": 0.4624, "num_input_tokens_seen": 16936896, "step": 13935 }, { "epoch": 1.7466482896880091, "grad_norm": 0.17371191084384918, "learning_rate": 8.732614960531263e-06, "loss": 0.4621, "num_input_tokens_seen": 16943008, "step": 13940 }, { "epoch": 1.7472747775967923, "grad_norm": 0.1412949562072754, "learning_rate": 8.73574740007518e-06, "loss": 0.4659, "num_input_tokens_seen": 16949088, "step": 13945 }, { "epoch": 1.747901265505576, "grad_norm": 0.1580745279788971, "learning_rate": 8.738879839619096e-06, "loss": 0.4597, "num_input_tokens_seen": 16954400, "step": 13950 }, { "epoch": 1.748527753414359, "grad_norm": 0.06958682090044022, "learning_rate": 8.742012279163013e-06, "loss": 0.4701, "num_input_tokens_seen": 16960768, "step": 13955 }, { "epoch": 1.7491542413231425, "grad_norm": 0.06255701184272766, "learning_rate": 8.74514471870693e-06, "loss": 0.4618, "num_input_tokens_seen": 16966976, "step": 13960 }, { "epoch": 1.7497807292319258, "grad_norm": 0.17988015711307526, "learning_rate": 8.748277158250846e-06, "loss": 0.4659, "num_input_tokens_seen": 16973184, "step": 13965 }, { "epoch": 1.7504072171407092, "grad_norm": 0.15510323643684387, "learning_rate": 8.751409597794763e-06, "loss": 0.4555, "num_input_tokens_seen": 16979072, "step": 13970 }, { "epoch": 1.7510337050494926, "grad_norm": 0.15517881512641907, "learning_rate": 8.75454203733868e-06, "loss": 0.4484, "num_input_tokens_seen": 16984736, "step": 13975 }, { "epoch": 1.7516601929582758, "grad_norm": 0.19253769516944885, "learning_rate": 8.757674476882597e-06, "loss": 0.4678, "num_input_tokens_seen": 16990848, "step": 13980 }, { "epoch": 1.7522866808670594, "grad_norm": 0.23051483929157257, "learning_rate": 8.760806916426513e-06, "loss": 0.4722, "num_input_tokens_seen": 16996896, "step": 13985 }, { "epoch": 1.7529131687758426, "grad_norm": 0.15178930759429932, "learning_rate": 8.76393935597043e-06, "loss": 0.4582, "num_input_tokens_seen": 17003104, "step": 13990 }, { "epoch": 1.753539656684626, "grad_norm": 0.1919512301683426, "learning_rate": 8.767071795514347e-06, "loss": 0.4637, "num_input_tokens_seen": 17009120, "step": 13995 }, { "epoch": 1.7541661445934094, "grad_norm": 0.28592976927757263, "learning_rate": 8.770204235058264e-06, "loss": 0.4712, "num_input_tokens_seen": 17015360, "step": 14000 }, { "epoch": 1.7547926325021928, "grad_norm": 0.1818930208683014, "learning_rate": 8.77333667460218e-06, "loss": 0.4637, "num_input_tokens_seen": 17021216, "step": 14005 }, { "epoch": 1.7554191204109761, "grad_norm": 0.15317398309707642, "learning_rate": 8.776469114146097e-06, "loss": 0.4643, "num_input_tokens_seen": 17027200, "step": 14010 }, { "epoch": 1.7560456083197593, "grad_norm": 0.162711501121521, "learning_rate": 8.779601553690014e-06, "loss": 0.464, "num_input_tokens_seen": 17032960, "step": 14015 }, { "epoch": 1.756672096228543, "grad_norm": 0.3544572591781616, "learning_rate": 8.782733993233932e-06, "loss": 0.4665, "num_input_tokens_seen": 17039328, "step": 14020 }, { "epoch": 1.757298584137326, "grad_norm": 0.30072659254074097, "learning_rate": 8.785866432777848e-06, "loss": 0.465, "num_input_tokens_seen": 17045824, "step": 14025 }, { "epoch": 1.7579250720461095, "grad_norm": 0.21539191901683807, "learning_rate": 8.788998872321764e-06, "loss": 0.4626, "num_input_tokens_seen": 17051968, "step": 14030 }, { "epoch": 1.7585515599548929, "grad_norm": 0.2940903604030609, "learning_rate": 8.792131311865681e-06, "loss": 0.4647, "num_input_tokens_seen": 17057632, "step": 14035 }, { "epoch": 1.7591780478636763, "grad_norm": 0.2284764051437378, "learning_rate": 8.795263751409598e-06, "loss": 0.4658, "num_input_tokens_seen": 17063776, "step": 14040 }, { "epoch": 1.7598045357724597, "grad_norm": 0.17117653787136078, "learning_rate": 8.798396190953515e-06, "loss": 0.4722, "num_input_tokens_seen": 17069888, "step": 14045 }, { "epoch": 1.7604310236812428, "grad_norm": 0.20503857731819153, "learning_rate": 8.801528630497433e-06, "loss": 0.4653, "num_input_tokens_seen": 17076160, "step": 14050 }, { "epoch": 1.7610575115900264, "grad_norm": 0.31519290804862976, "learning_rate": 8.80466107004135e-06, "loss": 0.4573, "num_input_tokens_seen": 17082176, "step": 14055 }, { "epoch": 1.7616839994988096, "grad_norm": 0.1878160685300827, "learning_rate": 8.807793509585267e-06, "loss": 0.4635, "num_input_tokens_seen": 17088736, "step": 14060 }, { "epoch": 1.762310487407593, "grad_norm": 0.14736787974834442, "learning_rate": 8.810925949129182e-06, "loss": 0.4623, "num_input_tokens_seen": 17095040, "step": 14065 }, { "epoch": 1.7629369753163764, "grad_norm": 0.16949543356895447, "learning_rate": 8.814058388673098e-06, "loss": 0.464, "num_input_tokens_seen": 17101280, "step": 14070 }, { "epoch": 1.7635634632251598, "grad_norm": 0.06262147426605225, "learning_rate": 8.817190828217015e-06, "loss": 0.4546, "num_input_tokens_seen": 17107584, "step": 14075 }, { "epoch": 1.7641899511339432, "grad_norm": 0.19254544377326965, "learning_rate": 8.820323267760934e-06, "loss": 0.4679, "num_input_tokens_seen": 17114048, "step": 14080 }, { "epoch": 1.7648164390427263, "grad_norm": 0.22632461786270142, "learning_rate": 8.82345570730485e-06, "loss": 0.4613, "num_input_tokens_seen": 17120064, "step": 14085 }, { "epoch": 1.76544292695151, "grad_norm": 0.1956227868795395, "learning_rate": 8.826588146848767e-06, "loss": 0.4683, "num_input_tokens_seen": 17126240, "step": 14090 }, { "epoch": 1.7660694148602931, "grad_norm": 0.1449766457080841, "learning_rate": 8.829720586392684e-06, "loss": 0.4584, "num_input_tokens_seen": 17132608, "step": 14095 }, { "epoch": 1.7666959027690765, "grad_norm": 0.1630115658044815, "learning_rate": 8.832853025936599e-06, "loss": 0.4678, "num_input_tokens_seen": 17139072, "step": 14100 }, { "epoch": 1.76732239067786, "grad_norm": 0.2902574837207794, "learning_rate": 8.835985465480517e-06, "loss": 0.465, "num_input_tokens_seen": 17145152, "step": 14105 }, { "epoch": 1.7679488785866433, "grad_norm": 0.15041615068912506, "learning_rate": 8.839117905024434e-06, "loss": 0.4645, "num_input_tokens_seen": 17151072, "step": 14110 }, { "epoch": 1.7685753664954267, "grad_norm": 0.20195147395133972, "learning_rate": 8.842250344568351e-06, "loss": 0.4543, "num_input_tokens_seen": 17156928, "step": 14115 }, { "epoch": 1.7692018544042099, "grad_norm": 0.27254095673561096, "learning_rate": 8.845382784112268e-06, "loss": 0.4651, "num_input_tokens_seen": 17162880, "step": 14120 }, { "epoch": 1.7698283423129935, "grad_norm": 0.2531352639198303, "learning_rate": 8.848515223656184e-06, "loss": 0.4761, "num_input_tokens_seen": 17169056, "step": 14125 }, { "epoch": 1.7704548302217766, "grad_norm": 0.30416494607925415, "learning_rate": 8.851647663200101e-06, "loss": 0.4593, "num_input_tokens_seen": 17175232, "step": 14130 }, { "epoch": 1.7710813181305602, "grad_norm": 0.15404728055000305, "learning_rate": 8.854780102744018e-06, "loss": 0.4568, "num_input_tokens_seen": 17181728, "step": 14135 }, { "epoch": 1.7717078060393434, "grad_norm": 0.1784365475177765, "learning_rate": 8.857912542287935e-06, "loss": 0.4588, "num_input_tokens_seen": 17187744, "step": 14140 }, { "epoch": 1.7723342939481268, "grad_norm": 0.207167387008667, "learning_rate": 8.861044981831852e-06, "loss": 0.4663, "num_input_tokens_seen": 17193984, "step": 14145 }, { "epoch": 1.7729607818569102, "grad_norm": 0.2904326319694519, "learning_rate": 8.864177421375768e-06, "loss": 0.4634, "num_input_tokens_seen": 17199968, "step": 14150 }, { "epoch": 1.7735872697656934, "grad_norm": 0.22514751553535461, "learning_rate": 8.867309860919685e-06, "loss": 0.4739, "num_input_tokens_seen": 17205952, "step": 14155 }, { "epoch": 1.774213757674477, "grad_norm": 0.16697625815868378, "learning_rate": 8.870442300463602e-06, "loss": 0.4597, "num_input_tokens_seen": 17212160, "step": 14160 }, { "epoch": 1.7748402455832601, "grad_norm": 0.15531626343727112, "learning_rate": 8.873574740007519e-06, "loss": 0.4756, "num_input_tokens_seen": 17217440, "step": 14165 }, { "epoch": 1.7754667334920438, "grad_norm": 0.15890470147132874, "learning_rate": 8.876707179551435e-06, "loss": 0.4522, "num_input_tokens_seen": 17223712, "step": 14170 }, { "epoch": 1.776093221400827, "grad_norm": 0.37633994221687317, "learning_rate": 8.879839619095352e-06, "loss": 0.446, "num_input_tokens_seen": 17230048, "step": 14175 }, { "epoch": 1.7767197093096103, "grad_norm": 0.22032485902309418, "learning_rate": 8.882972058639269e-06, "loss": 0.4657, "num_input_tokens_seen": 17236512, "step": 14180 }, { "epoch": 1.7773461972183937, "grad_norm": 0.22340616583824158, "learning_rate": 8.886104498183186e-06, "loss": 0.4707, "num_input_tokens_seen": 17242688, "step": 14185 }, { "epoch": 1.777972685127177, "grad_norm": 0.2845585346221924, "learning_rate": 8.889236937727102e-06, "loss": 0.4693, "num_input_tokens_seen": 17249120, "step": 14190 }, { "epoch": 1.7785991730359605, "grad_norm": 0.36879193782806396, "learning_rate": 8.89236937727102e-06, "loss": 0.4709, "num_input_tokens_seen": 17255200, "step": 14195 }, { "epoch": 1.7792256609447437, "grad_norm": 0.2397022396326065, "learning_rate": 8.895501816814936e-06, "loss": 0.4661, "num_input_tokens_seen": 17261216, "step": 14200 }, { "epoch": 1.7798521488535273, "grad_norm": 0.26828670501708984, "learning_rate": 8.898634256358853e-06, "loss": 0.4583, "num_input_tokens_seen": 17267328, "step": 14205 }, { "epoch": 1.7804786367623104, "grad_norm": 0.1850849837064743, "learning_rate": 8.90176669590277e-06, "loss": 0.4671, "num_input_tokens_seen": 17273472, "step": 14210 }, { "epoch": 1.7811051246710938, "grad_norm": 0.2966001629829407, "learning_rate": 8.904899135446686e-06, "loss": 0.4638, "num_input_tokens_seen": 17279232, "step": 14215 }, { "epoch": 1.7817316125798772, "grad_norm": 0.17764881253242493, "learning_rate": 8.908031574990603e-06, "loss": 0.4622, "num_input_tokens_seen": 17285696, "step": 14220 }, { "epoch": 1.7823581004886606, "grad_norm": 0.17715752124786377, "learning_rate": 8.91116401453452e-06, "loss": 0.4612, "num_input_tokens_seen": 17291840, "step": 14225 }, { "epoch": 1.782984588397444, "grad_norm": 0.1514531672000885, "learning_rate": 8.914296454078437e-06, "loss": 0.4587, "num_input_tokens_seen": 17297856, "step": 14230 }, { "epoch": 1.7836110763062272, "grad_norm": 0.2500179409980774, "learning_rate": 8.917428893622353e-06, "loss": 0.4748, "num_input_tokens_seen": 17304160, "step": 14235 }, { "epoch": 1.7842375642150108, "grad_norm": 0.2604103982448578, "learning_rate": 8.920561333166272e-06, "loss": 0.4614, "num_input_tokens_seen": 17310208, "step": 14240 }, { "epoch": 1.784864052123794, "grad_norm": 0.19024915993213654, "learning_rate": 8.923693772710187e-06, "loss": 0.4618, "num_input_tokens_seen": 17316224, "step": 14245 }, { "epoch": 1.7854905400325773, "grad_norm": 0.14957155287265778, "learning_rate": 8.926826212254104e-06, "loss": 0.454, "num_input_tokens_seen": 17322560, "step": 14250 }, { "epoch": 1.7861170279413607, "grad_norm": 0.15419748425483704, "learning_rate": 8.92995865179802e-06, "loss": 0.464, "num_input_tokens_seen": 17328992, "step": 14255 }, { "epoch": 1.7867435158501441, "grad_norm": 0.31896939873695374, "learning_rate": 8.933091091341937e-06, "loss": 0.4503, "num_input_tokens_seen": 17334848, "step": 14260 }, { "epoch": 1.7873700037589275, "grad_norm": 0.17633122205734253, "learning_rate": 8.936223530885856e-06, "loss": 0.4574, "num_input_tokens_seen": 17340576, "step": 14265 }, { "epoch": 1.7879964916677107, "grad_norm": 0.19784709811210632, "learning_rate": 8.939355970429772e-06, "loss": 0.466, "num_input_tokens_seen": 17347040, "step": 14270 }, { "epoch": 1.7886229795764943, "grad_norm": 0.17068266868591309, "learning_rate": 8.942488409973689e-06, "loss": 0.458, "num_input_tokens_seen": 17353248, "step": 14275 }, { "epoch": 1.7892494674852775, "grad_norm": 0.40285471081733704, "learning_rate": 8.945620849517606e-06, "loss": 0.4575, "num_input_tokens_seen": 17359232, "step": 14280 }, { "epoch": 1.7898759553940609, "grad_norm": 0.1603369414806366, "learning_rate": 8.948753289061521e-06, "loss": 0.469, "num_input_tokens_seen": 17365600, "step": 14285 }, { "epoch": 1.7905024433028442, "grad_norm": 0.10405080765485764, "learning_rate": 8.951885728605438e-06, "loss": 0.4634, "num_input_tokens_seen": 17372000, "step": 14290 }, { "epoch": 1.7911289312116276, "grad_norm": 0.20304526388645172, "learning_rate": 8.955018168149356e-06, "loss": 0.4718, "num_input_tokens_seen": 17377344, "step": 14295 }, { "epoch": 1.791755419120411, "grad_norm": 0.30224019289016724, "learning_rate": 8.958150607693273e-06, "loss": 0.4817, "num_input_tokens_seen": 17383744, "step": 14300 }, { "epoch": 1.7923819070291942, "grad_norm": 0.3257933557033539, "learning_rate": 8.96128304723719e-06, "loss": 0.4595, "num_input_tokens_seen": 17389888, "step": 14305 }, { "epoch": 1.7930083949379778, "grad_norm": 0.16660475730895996, "learning_rate": 8.964415486781106e-06, "loss": 0.4617, "num_input_tokens_seen": 17396032, "step": 14310 }, { "epoch": 1.793634882846761, "grad_norm": 0.263719767332077, "learning_rate": 8.967547926325023e-06, "loss": 0.4621, "num_input_tokens_seen": 17402400, "step": 14315 }, { "epoch": 1.7942613707555444, "grad_norm": 0.298432856798172, "learning_rate": 8.970680365868938e-06, "loss": 0.4642, "num_input_tokens_seen": 17408480, "step": 14320 }, { "epoch": 1.7948878586643278, "grad_norm": 0.14718389511108398, "learning_rate": 8.973812805412857e-06, "loss": 0.4617, "num_input_tokens_seen": 17414336, "step": 14325 }, { "epoch": 1.7955143465731112, "grad_norm": 0.05939515307545662, "learning_rate": 8.976945244956773e-06, "loss": 0.4602, "num_input_tokens_seen": 17420672, "step": 14330 }, { "epoch": 1.7961408344818945, "grad_norm": 0.21284204721450806, "learning_rate": 8.98007768450069e-06, "loss": 0.4702, "num_input_tokens_seen": 17426528, "step": 14335 }, { "epoch": 1.7967673223906777, "grad_norm": 0.06435025483369827, "learning_rate": 8.983210124044607e-06, "loss": 0.4678, "num_input_tokens_seen": 17432736, "step": 14340 }, { "epoch": 1.7973938102994613, "grad_norm": 0.23488159477710724, "learning_rate": 8.986342563588524e-06, "loss": 0.4701, "num_input_tokens_seen": 17438752, "step": 14345 }, { "epoch": 1.7980202982082445, "grad_norm": 0.2114100158214569, "learning_rate": 8.98947500313244e-06, "loss": 0.4659, "num_input_tokens_seen": 17444544, "step": 14350 }, { "epoch": 1.798646786117028, "grad_norm": 0.3560263514518738, "learning_rate": 8.992607442676357e-06, "loss": 0.4621, "num_input_tokens_seen": 17450368, "step": 14355 }, { "epoch": 1.7992732740258113, "grad_norm": 0.16454826295375824, "learning_rate": 8.995739882220274e-06, "loss": 0.4667, "num_input_tokens_seen": 17455904, "step": 14360 }, { "epoch": 1.7998997619345947, "grad_norm": 0.23995284736156464, "learning_rate": 8.99887232176419e-06, "loss": 0.4675, "num_input_tokens_seen": 17461824, "step": 14365 }, { "epoch": 1.800526249843378, "grad_norm": 0.19084271788597107, "learning_rate": 9.002004761308108e-06, "loss": 0.4661, "num_input_tokens_seen": 17467552, "step": 14370 }, { "epoch": 1.8011527377521612, "grad_norm": 0.2827588617801666, "learning_rate": 9.005137200852024e-06, "loss": 0.4618, "num_input_tokens_seen": 17473888, "step": 14375 }, { "epoch": 1.8017792256609448, "grad_norm": 0.19090677797794342, "learning_rate": 9.008269640395941e-06, "loss": 0.4703, "num_input_tokens_seen": 17480192, "step": 14380 }, { "epoch": 1.802405713569728, "grad_norm": 0.05774856358766556, "learning_rate": 9.011402079939858e-06, "loss": 0.4682, "num_input_tokens_seen": 17486016, "step": 14385 }, { "epoch": 1.8030322014785116, "grad_norm": 0.1463773250579834, "learning_rate": 9.014534519483775e-06, "loss": 0.4606, "num_input_tokens_seen": 17492064, "step": 14390 }, { "epoch": 1.8036586893872948, "grad_norm": 0.2119075506925583, "learning_rate": 9.017666959027691e-06, "loss": 0.4643, "num_input_tokens_seen": 17498272, "step": 14395 }, { "epoch": 1.8042851772960782, "grad_norm": 0.28366827964782715, "learning_rate": 9.020799398571608e-06, "loss": 0.4663, "num_input_tokens_seen": 17504416, "step": 14400 }, { "epoch": 1.8049116652048616, "grad_norm": 0.16838927567005157, "learning_rate": 9.023931838115525e-06, "loss": 0.4617, "num_input_tokens_seen": 17510720, "step": 14405 }, { "epoch": 1.805538153113645, "grad_norm": 0.17636696994304657, "learning_rate": 9.027064277659442e-06, "loss": 0.4635, "num_input_tokens_seen": 17516928, "step": 14410 }, { "epoch": 1.8061646410224284, "grad_norm": 0.14527803659439087, "learning_rate": 9.030196717203358e-06, "loss": 0.4654, "num_input_tokens_seen": 17522944, "step": 14415 }, { "epoch": 1.8067911289312115, "grad_norm": 0.06280773878097534, "learning_rate": 9.033329156747275e-06, "loss": 0.4664, "num_input_tokens_seen": 17529344, "step": 14420 }, { "epoch": 1.8074176168399951, "grad_norm": 0.18515171110630035, "learning_rate": 9.036461596291192e-06, "loss": 0.4654, "num_input_tokens_seen": 17535328, "step": 14425 }, { "epoch": 1.8080441047487783, "grad_norm": 0.1708693504333496, "learning_rate": 9.039594035835109e-06, "loss": 0.4674, "num_input_tokens_seen": 17541536, "step": 14430 }, { "epoch": 1.8086705926575617, "grad_norm": 0.0565776601433754, "learning_rate": 9.042726475379025e-06, "loss": 0.4651, "num_input_tokens_seen": 17547360, "step": 14435 }, { "epoch": 1.809297080566345, "grad_norm": 0.22442194819450378, "learning_rate": 9.045858914922942e-06, "loss": 0.4679, "num_input_tokens_seen": 17553440, "step": 14440 }, { "epoch": 1.8099235684751285, "grad_norm": 0.14002633094787598, "learning_rate": 9.048991354466859e-06, "loss": 0.4648, "num_input_tokens_seen": 17559648, "step": 14445 }, { "epoch": 1.8105500563839119, "grad_norm": 0.24601148068904877, "learning_rate": 9.052123794010776e-06, "loss": 0.4591, "num_input_tokens_seen": 17565920, "step": 14450 }, { "epoch": 1.811176544292695, "grad_norm": 0.15554048120975494, "learning_rate": 9.055256233554694e-06, "loss": 0.4697, "num_input_tokens_seen": 17572064, "step": 14455 }, { "epoch": 1.8118030322014786, "grad_norm": 0.06367423385381699, "learning_rate": 9.058388673098611e-06, "loss": 0.4638, "num_input_tokens_seen": 17578368, "step": 14460 }, { "epoch": 1.8124295201102618, "grad_norm": 0.12549681961536407, "learning_rate": 9.061521112642526e-06, "loss": 0.4602, "num_input_tokens_seen": 17584640, "step": 14465 }, { "epoch": 1.8130560080190452, "grad_norm": 0.15030211210250854, "learning_rate": 9.064653552186443e-06, "loss": 0.4614, "num_input_tokens_seen": 17590816, "step": 14470 }, { "epoch": 1.8136824959278286, "grad_norm": 0.15320079028606415, "learning_rate": 9.06778599173036e-06, "loss": 0.4585, "num_input_tokens_seen": 17596960, "step": 14475 }, { "epoch": 1.814308983836612, "grad_norm": 0.15417836606502533, "learning_rate": 9.070918431274276e-06, "loss": 0.4637, "num_input_tokens_seen": 17602784, "step": 14480 }, { "epoch": 1.8149354717453954, "grad_norm": 0.13937309384346008, "learning_rate": 9.074050870818195e-06, "loss": 0.4669, "num_input_tokens_seen": 17608704, "step": 14485 }, { "epoch": 1.8155619596541785, "grad_norm": 0.1311245858669281, "learning_rate": 9.077183310362112e-06, "loss": 0.4633, "num_input_tokens_seen": 17614624, "step": 14490 }, { "epoch": 1.8161884475629622, "grad_norm": 0.13455262780189514, "learning_rate": 9.080315749906028e-06, "loss": 0.4612, "num_input_tokens_seen": 17620928, "step": 14495 }, { "epoch": 1.8168149354717453, "grad_norm": 0.21108782291412354, "learning_rate": 9.083448189449945e-06, "loss": 0.4572, "num_input_tokens_seen": 17626880, "step": 14500 }, { "epoch": 1.8174414233805287, "grad_norm": 0.24864429235458374, "learning_rate": 9.08658062899386e-06, "loss": 0.462, "num_input_tokens_seen": 17633184, "step": 14505 }, { "epoch": 1.818067911289312, "grad_norm": 0.23214392364025116, "learning_rate": 9.089713068537777e-06, "loss": 0.469, "num_input_tokens_seen": 17639072, "step": 14510 }, { "epoch": 1.8186943991980955, "grad_norm": 0.1844492405653, "learning_rate": 9.092845508081695e-06, "loss": 0.4567, "num_input_tokens_seen": 17645184, "step": 14515 }, { "epoch": 1.819320887106879, "grad_norm": 0.19469879567623138, "learning_rate": 9.095977947625612e-06, "loss": 0.4628, "num_input_tokens_seen": 17651232, "step": 14520 }, { "epoch": 1.819947375015662, "grad_norm": 0.13334323465824127, "learning_rate": 9.099110387169529e-06, "loss": 0.4673, "num_input_tokens_seen": 17657472, "step": 14525 }, { "epoch": 1.8205738629244457, "grad_norm": 0.2854202687740326, "learning_rate": 9.102242826713446e-06, "loss": 0.468, "num_input_tokens_seen": 17663712, "step": 14530 }, { "epoch": 1.8212003508332288, "grad_norm": 0.1376998871564865, "learning_rate": 9.105375266257362e-06, "loss": 0.4651, "num_input_tokens_seen": 17669792, "step": 14535 }, { "epoch": 1.8218268387420122, "grad_norm": 0.24529613554477692, "learning_rate": 9.108507705801278e-06, "loss": 0.459, "num_input_tokens_seen": 17675552, "step": 14540 }, { "epoch": 1.8224533266507956, "grad_norm": 0.1349528580904007, "learning_rate": 9.111640145345196e-06, "loss": 0.4653, "num_input_tokens_seen": 17681408, "step": 14545 }, { "epoch": 1.823079814559579, "grad_norm": 0.13768431544303894, "learning_rate": 9.114772584889113e-06, "loss": 0.4591, "num_input_tokens_seen": 17687584, "step": 14550 }, { "epoch": 1.8237063024683624, "grad_norm": 0.049042362719774246, "learning_rate": 9.11790502443303e-06, "loss": 0.4634, "num_input_tokens_seen": 17694016, "step": 14555 }, { "epoch": 1.8243327903771456, "grad_norm": 0.19389386475086212, "learning_rate": 9.121037463976946e-06, "loss": 0.4622, "num_input_tokens_seen": 17700000, "step": 14560 }, { "epoch": 1.8249592782859292, "grad_norm": 0.15382887423038483, "learning_rate": 9.124169903520863e-06, "loss": 0.4657, "num_input_tokens_seen": 17706400, "step": 14565 }, { "epoch": 1.8255857661947124, "grad_norm": 0.1443953961133957, "learning_rate": 9.12730234306478e-06, "loss": 0.463, "num_input_tokens_seen": 17712384, "step": 14570 }, { "epoch": 1.826212254103496, "grad_norm": 0.12755540013313293, "learning_rate": 9.130434782608697e-06, "loss": 0.4651, "num_input_tokens_seen": 17718496, "step": 14575 }, { "epoch": 1.8268387420122791, "grad_norm": 0.13102132081985474, "learning_rate": 9.133567222152613e-06, "loss": 0.469, "num_input_tokens_seen": 17724352, "step": 14580 }, { "epoch": 1.8274652299210625, "grad_norm": 0.21229052543640137, "learning_rate": 9.13669966169653e-06, "loss": 0.4601, "num_input_tokens_seen": 17730016, "step": 14585 }, { "epoch": 1.828091717829846, "grad_norm": 0.23702120780944824, "learning_rate": 9.139832101240447e-06, "loss": 0.4582, "num_input_tokens_seen": 17736160, "step": 14590 }, { "epoch": 1.828718205738629, "grad_norm": 0.14944827556610107, "learning_rate": 9.142964540784364e-06, "loss": 0.4613, "num_input_tokens_seen": 17742496, "step": 14595 }, { "epoch": 1.8293446936474127, "grad_norm": 0.15416443347930908, "learning_rate": 9.14609698032828e-06, "loss": 0.4628, "num_input_tokens_seen": 17748544, "step": 14600 }, { "epoch": 1.8299711815561959, "grad_norm": 0.23613406717777252, "learning_rate": 9.149229419872197e-06, "loss": 0.4708, "num_input_tokens_seen": 17754624, "step": 14605 }, { "epoch": 1.8305976694649795, "grad_norm": 0.2118421196937561, "learning_rate": 9.152361859416114e-06, "loss": 0.4606, "num_input_tokens_seen": 17760800, "step": 14610 }, { "epoch": 1.8312241573737627, "grad_norm": 0.13974027335643768, "learning_rate": 9.15549429896003e-06, "loss": 0.4604, "num_input_tokens_seen": 17766944, "step": 14615 }, { "epoch": 1.831850645282546, "grad_norm": 0.137041836977005, "learning_rate": 9.158626738503947e-06, "loss": 0.4673, "num_input_tokens_seen": 17772576, "step": 14620 }, { "epoch": 1.8324771331913294, "grad_norm": 0.19909442961215973, "learning_rate": 9.161759178047864e-06, "loss": 0.4574, "num_input_tokens_seen": 17778656, "step": 14625 }, { "epoch": 1.8331036211001128, "grad_norm": 0.13974730670452118, "learning_rate": 9.164891617591781e-06, "loss": 0.4595, "num_input_tokens_seen": 17784736, "step": 14630 }, { "epoch": 1.8337301090088962, "grad_norm": 0.1381826251745224, "learning_rate": 9.168024057135698e-06, "loss": 0.4636, "num_input_tokens_seen": 17791168, "step": 14635 }, { "epoch": 1.8343565969176794, "grad_norm": 0.28173965215682983, "learning_rate": 9.171156496679614e-06, "loss": 0.4654, "num_input_tokens_seen": 17797696, "step": 14640 }, { "epoch": 1.834983084826463, "grad_norm": 0.23102104663848877, "learning_rate": 9.174288936223533e-06, "loss": 0.4623, "num_input_tokens_seen": 17803808, "step": 14645 }, { "epoch": 1.8356095727352462, "grad_norm": 0.1460943967103958, "learning_rate": 9.177421375767448e-06, "loss": 0.4615, "num_input_tokens_seen": 17810048, "step": 14650 }, { "epoch": 1.8362360606440296, "grad_norm": 0.21176382899284363, "learning_rate": 9.180553815311365e-06, "loss": 0.4657, "num_input_tokens_seen": 17816224, "step": 14655 }, { "epoch": 1.836862548552813, "grad_norm": 0.18375033140182495, "learning_rate": 9.183686254855282e-06, "loss": 0.4638, "num_input_tokens_seen": 17822464, "step": 14660 }, { "epoch": 1.8374890364615963, "grad_norm": 0.1933431327342987, "learning_rate": 9.186818694399198e-06, "loss": 0.4643, "num_input_tokens_seen": 17828448, "step": 14665 }, { "epoch": 1.8381155243703797, "grad_norm": 0.2086055427789688, "learning_rate": 9.189951133943115e-06, "loss": 0.4631, "num_input_tokens_seen": 17834784, "step": 14670 }, { "epoch": 1.838742012279163, "grad_norm": 0.1975691169500351, "learning_rate": 9.193083573487034e-06, "loss": 0.4629, "num_input_tokens_seen": 17840768, "step": 14675 }, { "epoch": 1.8393685001879465, "grad_norm": 0.14878061413764954, "learning_rate": 9.19621601303095e-06, "loss": 0.4658, "num_input_tokens_seen": 17846944, "step": 14680 }, { "epoch": 1.8399949880967297, "grad_norm": 0.18315565586090088, "learning_rate": 9.199348452574865e-06, "loss": 0.468, "num_input_tokens_seen": 17853408, "step": 14685 }, { "epoch": 1.840621476005513, "grad_norm": 0.059696320444345474, "learning_rate": 9.202480892118782e-06, "loss": 0.4685, "num_input_tokens_seen": 17859424, "step": 14690 }, { "epoch": 1.8412479639142965, "grad_norm": 0.18723903596401215, "learning_rate": 9.205613331662699e-06, "loss": 0.463, "num_input_tokens_seen": 17865664, "step": 14695 }, { "epoch": 1.8418744518230799, "grad_norm": 0.13524679839611053, "learning_rate": 9.208745771206616e-06, "loss": 0.4586, "num_input_tokens_seen": 17871808, "step": 14700 }, { "epoch": 1.8425009397318632, "grad_norm": 0.14057856798171997, "learning_rate": 9.211878210750534e-06, "loss": 0.4656, "num_input_tokens_seen": 17877888, "step": 14705 }, { "epoch": 1.8431274276406464, "grad_norm": 0.1426813006401062, "learning_rate": 9.21501065029445e-06, "loss": 0.4672, "num_input_tokens_seen": 17884032, "step": 14710 }, { "epoch": 1.84375391554943, "grad_norm": 0.056222815066576004, "learning_rate": 9.218143089838368e-06, "loss": 0.4655, "num_input_tokens_seen": 17889376, "step": 14715 }, { "epoch": 1.8443804034582132, "grad_norm": 0.12298920005559921, "learning_rate": 9.221275529382284e-06, "loss": 0.4605, "num_input_tokens_seen": 17894624, "step": 14720 }, { "epoch": 1.8450068913669966, "grad_norm": 0.1424633115530014, "learning_rate": 9.2244079689262e-06, "loss": 0.4658, "num_input_tokens_seen": 17900768, "step": 14725 }, { "epoch": 1.84563337927578, "grad_norm": 0.26188990473747253, "learning_rate": 9.227540408470116e-06, "loss": 0.4617, "num_input_tokens_seen": 17906976, "step": 14730 }, { "epoch": 1.8462598671845634, "grad_norm": 0.15973040461540222, "learning_rate": 9.230672848014035e-06, "loss": 0.4658, "num_input_tokens_seen": 17913056, "step": 14735 }, { "epoch": 1.8468863550933468, "grad_norm": 0.20732322335243225, "learning_rate": 9.233805287557951e-06, "loss": 0.4577, "num_input_tokens_seen": 17919488, "step": 14740 }, { "epoch": 1.84751284300213, "grad_norm": 0.13013528287410736, "learning_rate": 9.236937727101868e-06, "loss": 0.47, "num_input_tokens_seen": 17925760, "step": 14745 }, { "epoch": 1.8481393309109135, "grad_norm": 0.2328932136297226, "learning_rate": 9.240070166645785e-06, "loss": 0.464, "num_input_tokens_seen": 17931712, "step": 14750 }, { "epoch": 1.8487658188196967, "grad_norm": 0.14201238751411438, "learning_rate": 9.243202606189702e-06, "loss": 0.4569, "num_input_tokens_seen": 17938016, "step": 14755 }, { "epoch": 1.84939230672848, "grad_norm": 0.16242532432079315, "learning_rate": 9.246335045733617e-06, "loss": 0.4665, "num_input_tokens_seen": 17944032, "step": 14760 }, { "epoch": 1.8500187946372635, "grad_norm": 0.046322498470544815, "learning_rate": 9.249467485277535e-06, "loss": 0.4602, "num_input_tokens_seen": 17950368, "step": 14765 }, { "epoch": 1.8506452825460469, "grad_norm": 0.302860826253891, "learning_rate": 9.252599924821452e-06, "loss": 0.4633, "num_input_tokens_seen": 17956352, "step": 14770 }, { "epoch": 1.8512717704548303, "grad_norm": 0.21884161233901978, "learning_rate": 9.255732364365369e-06, "loss": 0.4633, "num_input_tokens_seen": 17962496, "step": 14775 }, { "epoch": 1.8518982583636134, "grad_norm": 0.1589672565460205, "learning_rate": 9.258864803909286e-06, "loss": 0.4621, "num_input_tokens_seen": 17968832, "step": 14780 }, { "epoch": 1.852524746272397, "grad_norm": 0.147097647190094, "learning_rate": 9.261997243453202e-06, "loss": 0.464, "num_input_tokens_seen": 17975072, "step": 14785 }, { "epoch": 1.8531512341811802, "grad_norm": 0.11511474847793579, "learning_rate": 9.265129682997119e-06, "loss": 0.4601, "num_input_tokens_seen": 17981184, "step": 14790 }, { "epoch": 1.8537777220899636, "grad_norm": 0.17227664589881897, "learning_rate": 9.268262122541036e-06, "loss": 0.4656, "num_input_tokens_seen": 17987392, "step": 14795 }, { "epoch": 1.854404209998747, "grad_norm": 0.19774098694324493, "learning_rate": 9.271394562084953e-06, "loss": 0.4604, "num_input_tokens_seen": 17993376, "step": 14800 }, { "epoch": 1.8550306979075304, "grad_norm": 0.21390686929225922, "learning_rate": 9.27452700162887e-06, "loss": 0.4586, "num_input_tokens_seen": 17999296, "step": 14805 }, { "epoch": 1.8556571858163138, "grad_norm": 0.22353535890579224, "learning_rate": 9.277659441172786e-06, "loss": 0.463, "num_input_tokens_seen": 18005472, "step": 14810 }, { "epoch": 1.856283673725097, "grad_norm": 0.14837932586669922, "learning_rate": 9.280791880716703e-06, "loss": 0.462, "num_input_tokens_seen": 18011520, "step": 14815 }, { "epoch": 1.8569101616338806, "grad_norm": 0.1270608752965927, "learning_rate": 9.28392432026062e-06, "loss": 0.464, "num_input_tokens_seen": 18017536, "step": 14820 }, { "epoch": 1.8575366495426637, "grad_norm": 0.05198376625776291, "learning_rate": 9.287056759804536e-06, "loss": 0.4628, "num_input_tokens_seen": 18023040, "step": 14825 }, { "epoch": 1.8581631374514473, "grad_norm": 0.187421977519989, "learning_rate": 9.290189199348453e-06, "loss": 0.4613, "num_input_tokens_seen": 18028416, "step": 14830 }, { "epoch": 1.8587896253602305, "grad_norm": 0.14390282332897186, "learning_rate": 9.29332163889237e-06, "loss": 0.4704, "num_input_tokens_seen": 18034528, "step": 14835 }, { "epoch": 1.859416113269014, "grad_norm": 0.1300816535949707, "learning_rate": 9.296454078436287e-06, "loss": 0.4659, "num_input_tokens_seen": 18040480, "step": 14840 }, { "epoch": 1.8600426011777973, "grad_norm": 0.14219622313976288, "learning_rate": 9.299586517980203e-06, "loss": 0.4643, "num_input_tokens_seen": 18046080, "step": 14845 }, { "epoch": 1.8606690890865805, "grad_norm": 0.05106164887547493, "learning_rate": 9.30271895752412e-06, "loss": 0.4659, "num_input_tokens_seen": 18052160, "step": 14850 }, { "epoch": 1.861295576995364, "grad_norm": 0.26032501459121704, "learning_rate": 9.305851397068037e-06, "loss": 0.4634, "num_input_tokens_seen": 18058400, "step": 14855 }, { "epoch": 1.8619220649041472, "grad_norm": 0.04783771187067032, "learning_rate": 9.308983836611954e-06, "loss": 0.466, "num_input_tokens_seen": 18064608, "step": 14860 }, { "epoch": 1.8625485528129309, "grad_norm": 0.1469356268644333, "learning_rate": 9.312116276155872e-06, "loss": 0.457, "num_input_tokens_seen": 18070656, "step": 14865 }, { "epoch": 1.863175040721714, "grad_norm": 0.1825316995382309, "learning_rate": 9.315248715699787e-06, "loss": 0.4648, "num_input_tokens_seen": 18076448, "step": 14870 }, { "epoch": 1.8638015286304974, "grad_norm": 0.15806886553764343, "learning_rate": 9.318381155243704e-06, "loss": 0.4685, "num_input_tokens_seen": 18081792, "step": 14875 }, { "epoch": 1.8644280165392808, "grad_norm": 0.05732009932398796, "learning_rate": 9.32151359478762e-06, "loss": 0.4587, "num_input_tokens_seen": 18087584, "step": 14880 }, { "epoch": 1.8650545044480642, "grad_norm": 0.20536507666110992, "learning_rate": 9.324646034331538e-06, "loss": 0.4595, "num_input_tokens_seen": 18093376, "step": 14885 }, { "epoch": 1.8656809923568476, "grad_norm": 0.1317838430404663, "learning_rate": 9.327778473875454e-06, "loss": 0.4606, "num_input_tokens_seen": 18099680, "step": 14890 }, { "epoch": 1.8663074802656308, "grad_norm": 0.1299627423286438, "learning_rate": 9.330910913419373e-06, "loss": 0.4597, "num_input_tokens_seen": 18106176, "step": 14895 }, { "epoch": 1.8669339681744144, "grad_norm": 0.13652263581752777, "learning_rate": 9.33404335296329e-06, "loss": 0.4577, "num_input_tokens_seen": 18112448, "step": 14900 }, { "epoch": 1.8675604560831975, "grad_norm": 0.24765391647815704, "learning_rate": 9.337175792507205e-06, "loss": 0.4584, "num_input_tokens_seen": 18118752, "step": 14905 }, { "epoch": 1.868186943991981, "grad_norm": 0.20992746949195862, "learning_rate": 9.340308232051121e-06, "loss": 0.4714, "num_input_tokens_seen": 18125056, "step": 14910 }, { "epoch": 1.8688134319007643, "grad_norm": 0.16679809987545013, "learning_rate": 9.343440671595038e-06, "loss": 0.4617, "num_input_tokens_seen": 18131008, "step": 14915 }, { "epoch": 1.8694399198095477, "grad_norm": 0.22643668949604034, "learning_rate": 9.346573111138955e-06, "loss": 0.4598, "num_input_tokens_seen": 18137280, "step": 14920 }, { "epoch": 1.870066407718331, "grad_norm": 0.18857069313526154, "learning_rate": 9.349705550682873e-06, "loss": 0.4715, "num_input_tokens_seen": 18143104, "step": 14925 }, { "epoch": 1.8706928956271143, "grad_norm": 0.23779235780239105, "learning_rate": 9.35283799022679e-06, "loss": 0.4629, "num_input_tokens_seen": 18149184, "step": 14930 }, { "epoch": 1.8713193835358979, "grad_norm": 0.1596134901046753, "learning_rate": 9.355970429770707e-06, "loss": 0.4607, "num_input_tokens_seen": 18155296, "step": 14935 }, { "epoch": 1.871945871444681, "grad_norm": 0.1505988985300064, "learning_rate": 9.359102869314624e-06, "loss": 0.4665, "num_input_tokens_seen": 18161888, "step": 14940 }, { "epoch": 1.8725723593534644, "grad_norm": 0.1324342042207718, "learning_rate": 9.362235308858539e-06, "loss": 0.4575, "num_input_tokens_seen": 18168160, "step": 14945 }, { "epoch": 1.8731988472622478, "grad_norm": 0.1273866891860962, "learning_rate": 9.365367748402457e-06, "loss": 0.4652, "num_input_tokens_seen": 18174272, "step": 14950 }, { "epoch": 1.8738253351710312, "grad_norm": 0.13523215055465698, "learning_rate": 9.368500187946374e-06, "loss": 0.4583, "num_input_tokens_seen": 18180320, "step": 14955 }, { "epoch": 1.8744518230798146, "grad_norm": 0.1594100147485733, "learning_rate": 9.37163262749029e-06, "loss": 0.4681, "num_input_tokens_seen": 18186400, "step": 14960 }, { "epoch": 1.8750783109885978, "grad_norm": 0.2876097559928894, "learning_rate": 9.374765067034207e-06, "loss": 0.4626, "num_input_tokens_seen": 18192704, "step": 14965 }, { "epoch": 1.8757047988973814, "grad_norm": 0.054999228566884995, "learning_rate": 9.377897506578124e-06, "loss": 0.4548, "num_input_tokens_seen": 18198720, "step": 14970 }, { "epoch": 1.8763312868061646, "grad_norm": 0.15580254793167114, "learning_rate": 9.381029946122041e-06, "loss": 0.4662, "num_input_tokens_seen": 18204672, "step": 14975 }, { "epoch": 1.876957774714948, "grad_norm": 0.1641157567501068, "learning_rate": 9.384162385665958e-06, "loss": 0.4532, "num_input_tokens_seen": 18211072, "step": 14980 }, { "epoch": 1.8775842626237313, "grad_norm": 0.13624894618988037, "learning_rate": 9.387294825209875e-06, "loss": 0.4548, "num_input_tokens_seen": 18216736, "step": 14985 }, { "epoch": 1.8782107505325147, "grad_norm": 0.17615698277950287, "learning_rate": 9.390427264753791e-06, "loss": 0.4764, "num_input_tokens_seen": 18222848, "step": 14990 }, { "epoch": 1.8788372384412981, "grad_norm": 0.06084864214062691, "learning_rate": 9.393559704297708e-06, "loss": 0.4568, "num_input_tokens_seen": 18228992, "step": 14995 }, { "epoch": 1.8794637263500813, "grad_norm": 0.2774895131587982, "learning_rate": 9.396692143841625e-06, "loss": 0.4704, "num_input_tokens_seen": 18235008, "step": 15000 }, { "epoch": 1.880090214258865, "grad_norm": 0.18816377222537994, "learning_rate": 9.399824583385542e-06, "loss": 0.4683, "num_input_tokens_seen": 18241024, "step": 15005 }, { "epoch": 1.880716702167648, "grad_norm": 0.05800464376807213, "learning_rate": 9.402957022929458e-06, "loss": 0.4617, "num_input_tokens_seen": 18247264, "step": 15010 }, { "epoch": 1.8813431900764315, "grad_norm": 0.1598588526248932, "learning_rate": 9.406089462473375e-06, "loss": 0.4668, "num_input_tokens_seen": 18253216, "step": 15015 }, { "epoch": 1.8819696779852149, "grad_norm": 0.2368467152118683, "learning_rate": 9.409221902017292e-06, "loss": 0.4604, "num_input_tokens_seen": 18258880, "step": 15020 }, { "epoch": 1.8825961658939983, "grad_norm": 0.13542230427265167, "learning_rate": 9.412354341561209e-06, "loss": 0.463, "num_input_tokens_seen": 18264768, "step": 15025 }, { "epoch": 1.8832226538027816, "grad_norm": 0.23521257936954498, "learning_rate": 9.415486781105125e-06, "loss": 0.471, "num_input_tokens_seen": 18270400, "step": 15030 }, { "epoch": 1.8838491417115648, "grad_norm": 0.2730936110019684, "learning_rate": 9.418619220649042e-06, "loss": 0.4617, "num_input_tokens_seen": 18276416, "step": 15035 }, { "epoch": 1.8844756296203484, "grad_norm": 0.1288815438747406, "learning_rate": 9.421751660192959e-06, "loss": 0.4589, "num_input_tokens_seen": 18282432, "step": 15040 }, { "epoch": 1.8851021175291316, "grad_norm": 0.17383840680122375, "learning_rate": 9.424884099736876e-06, "loss": 0.4661, "num_input_tokens_seen": 18288672, "step": 15045 }, { "epoch": 1.8857286054379152, "grad_norm": 0.14365875720977783, "learning_rate": 9.428016539280792e-06, "loss": 0.4472, "num_input_tokens_seen": 18295072, "step": 15050 }, { "epoch": 1.8863550933466984, "grad_norm": 0.234235480427742, "learning_rate": 9.43114897882471e-06, "loss": 0.4685, "num_input_tokens_seen": 18300768, "step": 15055 }, { "epoch": 1.8869815812554818, "grad_norm": 0.24898016452789307, "learning_rate": 9.434281418368626e-06, "loss": 0.4609, "num_input_tokens_seen": 18306976, "step": 15060 }, { "epoch": 1.8876080691642652, "grad_norm": 0.2129259556531906, "learning_rate": 9.437413857912543e-06, "loss": 0.458, "num_input_tokens_seen": 18312992, "step": 15065 }, { "epoch": 1.8882345570730483, "grad_norm": 0.17074702680110931, "learning_rate": 9.44054629745646e-06, "loss": 0.4625, "num_input_tokens_seen": 18319232, "step": 15070 }, { "epoch": 1.888861044981832, "grad_norm": 0.28741633892059326, "learning_rate": 9.443678737000376e-06, "loss": 0.4519, "num_input_tokens_seen": 18325472, "step": 15075 }, { "epoch": 1.889487532890615, "grad_norm": 0.24216683208942413, "learning_rate": 9.446811176544293e-06, "loss": 0.4614, "num_input_tokens_seen": 18331584, "step": 15080 }, { "epoch": 1.8901140207993987, "grad_norm": 0.19474010169506073, "learning_rate": 9.449943616088211e-06, "loss": 0.4717, "num_input_tokens_seen": 18337600, "step": 15085 }, { "epoch": 1.8907405087081819, "grad_norm": 0.27384284138679504, "learning_rate": 9.453076055632127e-06, "loss": 0.4556, "num_input_tokens_seen": 18343360, "step": 15090 }, { "epoch": 1.8913669966169653, "grad_norm": 0.21280652284622192, "learning_rate": 9.456208495176043e-06, "loss": 0.4589, "num_input_tokens_seen": 18349376, "step": 15095 }, { "epoch": 1.8919934845257487, "grad_norm": 0.1456485539674759, "learning_rate": 9.45934093471996e-06, "loss": 0.4618, "num_input_tokens_seen": 18355456, "step": 15100 }, { "epoch": 1.892619972434532, "grad_norm": 0.14476311206817627, "learning_rate": 9.462473374263877e-06, "loss": 0.458, "num_input_tokens_seen": 18361440, "step": 15105 }, { "epoch": 1.8932464603433155, "grad_norm": 0.2730676233768463, "learning_rate": 9.465605813807794e-06, "loss": 0.4602, "num_input_tokens_seen": 18367552, "step": 15110 }, { "epoch": 1.8938729482520986, "grad_norm": 0.189190074801445, "learning_rate": 9.468738253351712e-06, "loss": 0.4569, "num_input_tokens_seen": 18373504, "step": 15115 }, { "epoch": 1.8944994361608822, "grad_norm": 0.27849122881889343, "learning_rate": 9.471870692895629e-06, "loss": 0.4674, "num_input_tokens_seen": 18379776, "step": 15120 }, { "epoch": 1.8951259240696654, "grad_norm": 0.14044047892093658, "learning_rate": 9.475003132439544e-06, "loss": 0.4528, "num_input_tokens_seen": 18385920, "step": 15125 }, { "epoch": 1.8957524119784488, "grad_norm": 0.2490188479423523, "learning_rate": 9.47813557198346e-06, "loss": 0.4712, "num_input_tokens_seen": 18392128, "step": 15130 }, { "epoch": 1.8963788998872322, "grad_norm": 0.20831497013568878, "learning_rate": 9.481268011527377e-06, "loss": 0.4612, "num_input_tokens_seen": 18398560, "step": 15135 }, { "epoch": 1.8970053877960156, "grad_norm": 0.19298531115055084, "learning_rate": 9.484400451071296e-06, "loss": 0.4589, "num_input_tokens_seen": 18404768, "step": 15140 }, { "epoch": 1.897631875704799, "grad_norm": 0.3098183274269104, "learning_rate": 9.487532890615213e-06, "loss": 0.4595, "num_input_tokens_seen": 18410560, "step": 15145 }, { "epoch": 1.8982583636135821, "grad_norm": 0.2190920114517212, "learning_rate": 9.49066533015913e-06, "loss": 0.4582, "num_input_tokens_seen": 18416736, "step": 15150 }, { "epoch": 1.8988848515223657, "grad_norm": 0.3203887939453125, "learning_rate": 9.493797769703046e-06, "loss": 0.4649, "num_input_tokens_seen": 18422720, "step": 15155 }, { "epoch": 1.899511339431149, "grad_norm": 0.24543476104736328, "learning_rate": 9.496930209246963e-06, "loss": 0.4557, "num_input_tokens_seen": 18428576, "step": 15160 }, { "epoch": 1.9001378273399323, "grad_norm": 0.16080676019191742, "learning_rate": 9.500062648790878e-06, "loss": 0.4664, "num_input_tokens_seen": 18434624, "step": 15165 }, { "epoch": 1.9007643152487157, "grad_norm": 0.1400405764579773, "learning_rate": 9.503195088334796e-06, "loss": 0.4632, "num_input_tokens_seen": 18440128, "step": 15170 }, { "epoch": 1.901390803157499, "grad_norm": 0.2787781059741974, "learning_rate": 9.506327527878713e-06, "loss": 0.462, "num_input_tokens_seen": 18446048, "step": 15175 }, { "epoch": 1.9020172910662825, "grad_norm": 0.1875811368227005, "learning_rate": 9.50945996742263e-06, "loss": 0.4679, "num_input_tokens_seen": 18452128, "step": 15180 }, { "epoch": 1.9026437789750656, "grad_norm": 0.29381197690963745, "learning_rate": 9.512592406966547e-06, "loss": 0.4622, "num_input_tokens_seen": 18458368, "step": 15185 }, { "epoch": 1.9032702668838493, "grad_norm": 0.18317964673042297, "learning_rate": 9.515724846510463e-06, "loss": 0.4586, "num_input_tokens_seen": 18463936, "step": 15190 }, { "epoch": 1.9038967547926324, "grad_norm": 0.2626500427722931, "learning_rate": 9.51885728605438e-06, "loss": 0.4707, "num_input_tokens_seen": 18469792, "step": 15195 }, { "epoch": 1.9045232427014158, "grad_norm": 0.16278666257858276, "learning_rate": 9.521989725598297e-06, "loss": 0.4685, "num_input_tokens_seen": 18475872, "step": 15200 }, { "epoch": 1.9051497306101992, "grad_norm": 0.184330552816391, "learning_rate": 9.525122165142214e-06, "loss": 0.46, "num_input_tokens_seen": 18481984, "step": 15205 }, { "epoch": 1.9057762185189826, "grad_norm": 0.1719488799571991, "learning_rate": 9.52825460468613e-06, "loss": 0.463, "num_input_tokens_seen": 18487872, "step": 15210 }, { "epoch": 1.906402706427766, "grad_norm": 0.24404598772525787, "learning_rate": 9.531387044230047e-06, "loss": 0.4639, "num_input_tokens_seen": 18494240, "step": 15215 }, { "epoch": 1.9070291943365492, "grad_norm": 0.15626850724220276, "learning_rate": 9.534519483773964e-06, "loss": 0.4614, "num_input_tokens_seen": 18500768, "step": 15220 }, { "epoch": 1.9076556822453328, "grad_norm": 0.1943909227848053, "learning_rate": 9.53765192331788e-06, "loss": 0.4589, "num_input_tokens_seen": 18507200, "step": 15225 }, { "epoch": 1.908282170154116, "grad_norm": 0.2393455058336258, "learning_rate": 9.540784362861798e-06, "loss": 0.4641, "num_input_tokens_seen": 18513600, "step": 15230 }, { "epoch": 1.9089086580628993, "grad_norm": 0.14770552515983582, "learning_rate": 9.543916802405714e-06, "loss": 0.4582, "num_input_tokens_seen": 18519648, "step": 15235 }, { "epoch": 1.9095351459716827, "grad_norm": 0.18977786600589752, "learning_rate": 9.547049241949631e-06, "loss": 0.4666, "num_input_tokens_seen": 18526080, "step": 15240 }, { "epoch": 1.9101616338804661, "grad_norm": 0.28025224804878235, "learning_rate": 9.550181681493548e-06, "loss": 0.4625, "num_input_tokens_seen": 18532128, "step": 15245 }, { "epoch": 1.9107881217892495, "grad_norm": 0.2663964629173279, "learning_rate": 9.553314121037465e-06, "loss": 0.4605, "num_input_tokens_seen": 18538176, "step": 15250 }, { "epoch": 1.9114146096980327, "grad_norm": 1.3485163450241089, "learning_rate": 9.556446560581381e-06, "loss": 0.4676, "num_input_tokens_seen": 18544672, "step": 15255 }, { "epoch": 1.9120410976068163, "grad_norm": 0.44755294919013977, "learning_rate": 9.559579000125298e-06, "loss": 0.4793, "num_input_tokens_seen": 18550944, "step": 15260 }, { "epoch": 1.9126675855155995, "grad_norm": 0.22178581357002258, "learning_rate": 9.562711439669215e-06, "loss": 0.4629, "num_input_tokens_seen": 18556992, "step": 15265 }, { "epoch": 1.913294073424383, "grad_norm": 0.18036244809627533, "learning_rate": 9.565843879213132e-06, "loss": 0.4593, "num_input_tokens_seen": 18563264, "step": 15270 }, { "epoch": 1.9139205613331662, "grad_norm": 0.13918131589889526, "learning_rate": 9.568976318757048e-06, "loss": 0.4611, "num_input_tokens_seen": 18569216, "step": 15275 }, { "epoch": 1.9145470492419496, "grad_norm": 0.2636522352695465, "learning_rate": 9.572108758300965e-06, "loss": 0.4654, "num_input_tokens_seen": 18575520, "step": 15280 }, { "epoch": 1.915173537150733, "grad_norm": 0.1579716056585312, "learning_rate": 9.575241197844882e-06, "loss": 0.4627, "num_input_tokens_seen": 18581248, "step": 15285 }, { "epoch": 1.9158000250595162, "grad_norm": 0.20452821254730225, "learning_rate": 9.578373637388799e-06, "loss": 0.4566, "num_input_tokens_seen": 18588032, "step": 15290 }, { "epoch": 1.9164265129682998, "grad_norm": 0.3150944709777832, "learning_rate": 9.581506076932716e-06, "loss": 0.4699, "num_input_tokens_seen": 18594112, "step": 15295 }, { "epoch": 1.917053000877083, "grad_norm": 0.1766045093536377, "learning_rate": 9.584638516476634e-06, "loss": 0.463, "num_input_tokens_seen": 18600320, "step": 15300 }, { "epoch": 1.9176794887858666, "grad_norm": 0.14004987478256226, "learning_rate": 9.58777095602055e-06, "loss": 0.4505, "num_input_tokens_seen": 18606400, "step": 15305 }, { "epoch": 1.9183059766946498, "grad_norm": 0.16100458800792694, "learning_rate": 9.590903395564466e-06, "loss": 0.4656, "num_input_tokens_seen": 18612544, "step": 15310 }, { "epoch": 1.9189324646034331, "grad_norm": 0.21686524152755737, "learning_rate": 9.594035835108383e-06, "loss": 0.4632, "num_input_tokens_seen": 18618464, "step": 15315 }, { "epoch": 1.9195589525122165, "grad_norm": 0.06539318710565567, "learning_rate": 9.5971682746523e-06, "loss": 0.4568, "num_input_tokens_seen": 18624192, "step": 15320 }, { "epoch": 1.920185440421, "grad_norm": 0.20009486377239227, "learning_rate": 9.600300714196216e-06, "loss": 0.4598, "num_input_tokens_seen": 18630432, "step": 15325 }, { "epoch": 1.9208119283297833, "grad_norm": 0.16819711029529572, "learning_rate": 9.603433153740135e-06, "loss": 0.4616, "num_input_tokens_seen": 18636672, "step": 15330 }, { "epoch": 1.9214384162385665, "grad_norm": 0.2698647081851959, "learning_rate": 9.606565593284051e-06, "loss": 0.4655, "num_input_tokens_seen": 18642304, "step": 15335 }, { "epoch": 1.92206490414735, "grad_norm": 0.06706046313047409, "learning_rate": 9.609698032827968e-06, "loss": 0.4643, "num_input_tokens_seen": 18648544, "step": 15340 }, { "epoch": 1.9226913920561333, "grad_norm": 0.1349196881055832, "learning_rate": 9.612830472371883e-06, "loss": 0.4541, "num_input_tokens_seen": 18654496, "step": 15345 }, { "epoch": 1.9233178799649167, "grad_norm": 0.20962537825107574, "learning_rate": 9.6159629119158e-06, "loss": 0.4795, "num_input_tokens_seen": 18660896, "step": 15350 }, { "epoch": 1.9239443678737, "grad_norm": 0.3023225665092468, "learning_rate": 9.619095351459717e-06, "loss": 0.4648, "num_input_tokens_seen": 18667008, "step": 15355 }, { "epoch": 1.9245708557824834, "grad_norm": 0.15593324601650238, "learning_rate": 9.622227791003635e-06, "loss": 0.4709, "num_input_tokens_seen": 18672928, "step": 15360 }, { "epoch": 1.9251973436912668, "grad_norm": 0.16640737652778625, "learning_rate": 9.625360230547552e-06, "loss": 0.4665, "num_input_tokens_seen": 18679264, "step": 15365 }, { "epoch": 1.92582383160005, "grad_norm": 0.19379335641860962, "learning_rate": 9.628492670091469e-06, "loss": 0.4696, "num_input_tokens_seen": 18685344, "step": 15370 }, { "epoch": 1.9264503195088336, "grad_norm": 0.12475939840078354, "learning_rate": 9.631625109635385e-06, "loss": 0.4623, "num_input_tokens_seen": 18691680, "step": 15375 }, { "epoch": 1.9270768074176168, "grad_norm": 0.1816606968641281, "learning_rate": 9.634757549179302e-06, "loss": 0.463, "num_input_tokens_seen": 18697472, "step": 15380 }, { "epoch": 1.9277032953264002, "grad_norm": 0.2573234438896179, "learning_rate": 9.637889988723217e-06, "loss": 0.4666, "num_input_tokens_seen": 18703744, "step": 15385 }, { "epoch": 1.9283297832351836, "grad_norm": 0.0568726621568203, "learning_rate": 9.641022428267136e-06, "loss": 0.4675, "num_input_tokens_seen": 18710176, "step": 15390 }, { "epoch": 1.928956271143967, "grad_norm": 0.16796302795410156, "learning_rate": 9.644154867811052e-06, "loss": 0.4596, "num_input_tokens_seen": 18716128, "step": 15395 }, { "epoch": 1.9295827590527503, "grad_norm": 0.16646763682365417, "learning_rate": 9.64728730735497e-06, "loss": 0.4706, "num_input_tokens_seen": 18722208, "step": 15400 }, { "epoch": 1.9302092469615335, "grad_norm": 0.257123202085495, "learning_rate": 9.650419746898886e-06, "loss": 0.4696, "num_input_tokens_seen": 18727904, "step": 15405 }, { "epoch": 1.9308357348703171, "grad_norm": 0.06100841611623764, "learning_rate": 9.653552186442803e-06, "loss": 0.4625, "num_input_tokens_seen": 18734304, "step": 15410 }, { "epoch": 1.9314622227791003, "grad_norm": 0.22131259739398956, "learning_rate": 9.65668462598672e-06, "loss": 0.465, "num_input_tokens_seen": 18740864, "step": 15415 }, { "epoch": 1.9320887106878837, "grad_norm": 0.22715115547180176, "learning_rate": 9.659817065530636e-06, "loss": 0.4656, "num_input_tokens_seen": 18746688, "step": 15420 }, { "epoch": 1.932715198596667, "grad_norm": 0.06590378284454346, "learning_rate": 9.662949505074553e-06, "loss": 0.4638, "num_input_tokens_seen": 18753024, "step": 15425 }, { "epoch": 1.9333416865054505, "grad_norm": 0.25306037068367004, "learning_rate": 9.66608194461847e-06, "loss": 0.4643, "num_input_tokens_seen": 18759200, "step": 15430 }, { "epoch": 1.9339681744142339, "grad_norm": 0.48281046748161316, "learning_rate": 9.669214384162387e-06, "loss": 0.4648, "num_input_tokens_seen": 18765088, "step": 15435 }, { "epoch": 1.934594662323017, "grad_norm": 0.18256494402885437, "learning_rate": 9.672346823706303e-06, "loss": 0.4622, "num_input_tokens_seen": 18771328, "step": 15440 }, { "epoch": 1.9352211502318006, "grad_norm": 0.28080469369888306, "learning_rate": 9.67547926325022e-06, "loss": 0.4616, "num_input_tokens_seen": 18777408, "step": 15445 }, { "epoch": 1.9358476381405838, "grad_norm": 0.28390806913375854, "learning_rate": 9.678611702794137e-06, "loss": 0.4669, "num_input_tokens_seen": 18783360, "step": 15450 }, { "epoch": 1.9364741260493672, "grad_norm": 0.13192522525787354, "learning_rate": 9.681744142338054e-06, "loss": 0.4664, "num_input_tokens_seen": 18789632, "step": 15455 }, { "epoch": 1.9371006139581506, "grad_norm": 0.13356992602348328, "learning_rate": 9.68487658188197e-06, "loss": 0.4679, "num_input_tokens_seen": 18795744, "step": 15460 }, { "epoch": 1.937727101866934, "grad_norm": 0.1566224843263626, "learning_rate": 9.688009021425887e-06, "loss": 0.4596, "num_input_tokens_seen": 18801536, "step": 15465 }, { "epoch": 1.9383535897757174, "grad_norm": 0.12766627967357635, "learning_rate": 9.691141460969804e-06, "loss": 0.4623, "num_input_tokens_seen": 18807744, "step": 15470 }, { "epoch": 1.9389800776845005, "grad_norm": 0.1545533686876297, "learning_rate": 9.69427390051372e-06, "loss": 0.4651, "num_input_tokens_seen": 18813888, "step": 15475 }, { "epoch": 1.9396065655932841, "grad_norm": 0.1458830088376999, "learning_rate": 9.697406340057637e-06, "loss": 0.4616, "num_input_tokens_seen": 18820160, "step": 15480 }, { "epoch": 1.9402330535020673, "grad_norm": 0.15990860760211945, "learning_rate": 9.700538779601554e-06, "loss": 0.4614, "num_input_tokens_seen": 18825664, "step": 15485 }, { "epoch": 1.940859541410851, "grad_norm": 0.05881378799676895, "learning_rate": 9.703671219145473e-06, "loss": 0.4539, "num_input_tokens_seen": 18831520, "step": 15490 }, { "epoch": 1.941486029319634, "grad_norm": 0.13011768460273743, "learning_rate": 9.706803658689388e-06, "loss": 0.4648, "num_input_tokens_seen": 18837376, "step": 15495 }, { "epoch": 1.9421125172284175, "grad_norm": 0.21513710916042328, "learning_rate": 9.709936098233304e-06, "loss": 0.4658, "num_input_tokens_seen": 18843584, "step": 15500 }, { "epoch": 1.9427390051372009, "grad_norm": 0.1685640513896942, "learning_rate": 9.713068537777221e-06, "loss": 0.4643, "num_input_tokens_seen": 18849760, "step": 15505 }, { "epoch": 1.943365493045984, "grad_norm": 0.17136478424072266, "learning_rate": 9.716200977321138e-06, "loss": 0.4669, "num_input_tokens_seen": 18855872, "step": 15510 }, { "epoch": 1.9439919809547677, "grad_norm": 0.07080327719449997, "learning_rate": 9.719333416865055e-06, "loss": 0.4628, "num_input_tokens_seen": 18861952, "step": 15515 }, { "epoch": 1.9446184688635508, "grad_norm": 0.1817293018102646, "learning_rate": 9.722465856408973e-06, "loss": 0.4738, "num_input_tokens_seen": 18868032, "step": 15520 }, { "epoch": 1.9452449567723344, "grad_norm": 0.24796803295612335, "learning_rate": 9.72559829595289e-06, "loss": 0.4548, "num_input_tokens_seen": 18873920, "step": 15525 }, { "epoch": 1.9458714446811176, "grad_norm": 0.14769376814365387, "learning_rate": 9.728730735496805e-06, "loss": 0.4653, "num_input_tokens_seen": 18880096, "step": 15530 }, { "epoch": 1.946497932589901, "grad_norm": 0.2179819792509079, "learning_rate": 9.731863175040722e-06, "loss": 0.4679, "num_input_tokens_seen": 18886144, "step": 15535 }, { "epoch": 1.9471244204986844, "grad_norm": 0.2785071134567261, "learning_rate": 9.734995614584639e-06, "loss": 0.4698, "num_input_tokens_seen": 18892224, "step": 15540 }, { "epoch": 1.9477509084074678, "grad_norm": 0.1722409874200821, "learning_rate": 9.738128054128555e-06, "loss": 0.4561, "num_input_tokens_seen": 18898496, "step": 15545 }, { "epoch": 1.9483773963162512, "grad_norm": 0.20224681496620178, "learning_rate": 9.741260493672474e-06, "loss": 0.4577, "num_input_tokens_seen": 18904896, "step": 15550 }, { "epoch": 1.9490038842250343, "grad_norm": 0.19661536812782288, "learning_rate": 9.74439293321639e-06, "loss": 0.4609, "num_input_tokens_seen": 18910752, "step": 15555 }, { "epoch": 1.949630372133818, "grad_norm": 0.13128279149532318, "learning_rate": 9.747525372760307e-06, "loss": 0.4585, "num_input_tokens_seen": 18916896, "step": 15560 }, { "epoch": 1.9502568600426011, "grad_norm": 0.22536292672157288, "learning_rate": 9.750657812304222e-06, "loss": 0.4447, "num_input_tokens_seen": 18922944, "step": 15565 }, { "epoch": 1.9508833479513845, "grad_norm": 0.13316890597343445, "learning_rate": 9.75379025184814e-06, "loss": 0.4683, "num_input_tokens_seen": 18928960, "step": 15570 }, { "epoch": 1.951509835860168, "grad_norm": 0.2080231010913849, "learning_rate": 9.756922691392056e-06, "loss": 0.4703, "num_input_tokens_seen": 18935008, "step": 15575 }, { "epoch": 1.9521363237689513, "grad_norm": 0.16592034697532654, "learning_rate": 9.760055130935974e-06, "loss": 0.461, "num_input_tokens_seen": 18941120, "step": 15580 }, { "epoch": 1.9527628116777347, "grad_norm": 0.254919171333313, "learning_rate": 9.763187570479891e-06, "loss": 0.4555, "num_input_tokens_seen": 18947168, "step": 15585 }, { "epoch": 1.9533892995865179, "grad_norm": 0.06284666806459427, "learning_rate": 9.766320010023808e-06, "loss": 0.4655, "num_input_tokens_seen": 18953792, "step": 15590 }, { "epoch": 1.9540157874953015, "grad_norm": 0.06100161746144295, "learning_rate": 9.769452449567725e-06, "loss": 0.4604, "num_input_tokens_seen": 18959808, "step": 15595 }, { "epoch": 1.9546422754040846, "grad_norm": 0.05543383210897446, "learning_rate": 9.772584889111641e-06, "loss": 0.4559, "num_input_tokens_seen": 18965056, "step": 15600 }, { "epoch": 1.955268763312868, "grad_norm": 0.13581420481204987, "learning_rate": 9.775717328655557e-06, "loss": 0.4649, "num_input_tokens_seen": 18971520, "step": 15605 }, { "epoch": 1.9558952512216514, "grad_norm": 0.20802287757396698, "learning_rate": 9.778849768199475e-06, "loss": 0.4595, "num_input_tokens_seen": 18977280, "step": 15610 }, { "epoch": 1.9565217391304348, "grad_norm": 0.22721266746520996, "learning_rate": 9.781982207743392e-06, "loss": 0.4572, "num_input_tokens_seen": 18983168, "step": 15615 }, { "epoch": 1.9571482270392182, "grad_norm": 0.20296111702919006, "learning_rate": 9.785114647287308e-06, "loss": 0.4686, "num_input_tokens_seen": 18989536, "step": 15620 }, { "epoch": 1.9577747149480014, "grad_norm": 0.14371031522750854, "learning_rate": 9.788247086831225e-06, "loss": 0.465, "num_input_tokens_seen": 18994912, "step": 15625 }, { "epoch": 1.958401202856785, "grad_norm": 0.1759534776210785, "learning_rate": 9.791379526375142e-06, "loss": 0.4644, "num_input_tokens_seen": 19001056, "step": 15630 }, { "epoch": 1.9590276907655682, "grad_norm": 0.1161351352930069, "learning_rate": 9.794511965919059e-06, "loss": 0.4562, "num_input_tokens_seen": 19007040, "step": 15635 }, { "epoch": 1.9596541786743515, "grad_norm": 0.11695601046085358, "learning_rate": 9.797644405462976e-06, "loss": 0.4595, "num_input_tokens_seen": 19013504, "step": 15640 }, { "epoch": 1.960280666583135, "grad_norm": 0.0562477707862854, "learning_rate": 9.800776845006892e-06, "loss": 0.468, "num_input_tokens_seen": 19018816, "step": 15645 }, { "epoch": 1.9609071544919183, "grad_norm": 0.18705101311206818, "learning_rate": 9.803909284550809e-06, "loss": 0.4567, "num_input_tokens_seen": 19024992, "step": 15650 }, { "epoch": 1.9615336424007017, "grad_norm": 0.0645265132188797, "learning_rate": 9.807041724094726e-06, "loss": 0.4616, "num_input_tokens_seen": 19030848, "step": 15655 }, { "epoch": 1.9621601303094849, "grad_norm": 0.19494006037712097, "learning_rate": 9.810174163638643e-06, "loss": 0.4709, "num_input_tokens_seen": 19036896, "step": 15660 }, { "epoch": 1.9627866182182685, "grad_norm": 0.13728521764278412, "learning_rate": 9.81330660318256e-06, "loss": 0.4627, "num_input_tokens_seen": 19043072, "step": 15665 }, { "epoch": 1.9634131061270517, "grad_norm": 0.16218328475952148, "learning_rate": 9.816439042726476e-06, "loss": 0.4707, "num_input_tokens_seen": 19049120, "step": 15670 }, { "epoch": 1.964039594035835, "grad_norm": 0.21963191032409668, "learning_rate": 9.819571482270393e-06, "loss": 0.4574, "num_input_tokens_seen": 19055232, "step": 15675 }, { "epoch": 1.9646660819446184, "grad_norm": 0.20230025053024292, "learning_rate": 9.82270392181431e-06, "loss": 0.4682, "num_input_tokens_seen": 19061280, "step": 15680 }, { "epoch": 1.9652925698534018, "grad_norm": 0.2373475730419159, "learning_rate": 9.825836361358226e-06, "loss": 0.4597, "num_input_tokens_seen": 19067584, "step": 15685 }, { "epoch": 1.9659190577621852, "grad_norm": 0.18490387499332428, "learning_rate": 9.828968800902143e-06, "loss": 0.4608, "num_input_tokens_seen": 19073728, "step": 15690 }, { "epoch": 1.9665455456709684, "grad_norm": 0.13664697110652924, "learning_rate": 9.83210124044606e-06, "loss": 0.4645, "num_input_tokens_seen": 19080032, "step": 15695 }, { "epoch": 1.967172033579752, "grad_norm": 0.17600536346435547, "learning_rate": 9.835233679989977e-06, "loss": 0.4734, "num_input_tokens_seen": 19086240, "step": 15700 }, { "epoch": 1.9677985214885352, "grad_norm": 0.14818428456783295, "learning_rate": 9.838366119533893e-06, "loss": 0.4634, "num_input_tokens_seen": 19092544, "step": 15705 }, { "epoch": 1.9684250093973188, "grad_norm": 0.17210343480110168, "learning_rate": 9.841498559077812e-06, "loss": 0.4564, "num_input_tokens_seen": 19098464, "step": 15710 }, { "epoch": 1.969051497306102, "grad_norm": 0.37455835938453674, "learning_rate": 9.844630998621727e-06, "loss": 0.4796, "num_input_tokens_seen": 19104544, "step": 15715 }, { "epoch": 1.9696779852148854, "grad_norm": 0.4008581340312958, "learning_rate": 9.847763438165644e-06, "loss": 0.4551, "num_input_tokens_seen": 19110624, "step": 15720 }, { "epoch": 1.9703044731236687, "grad_norm": 0.27092763781547546, "learning_rate": 9.85089587770956e-06, "loss": 0.4658, "num_input_tokens_seen": 19117088, "step": 15725 }, { "epoch": 1.970930961032452, "grad_norm": 0.17712770402431488, "learning_rate": 9.854028317253477e-06, "loss": 0.4722, "num_input_tokens_seen": 19123200, "step": 15730 }, { "epoch": 1.9715574489412355, "grad_norm": 0.12359675019979477, "learning_rate": 9.857160756797394e-06, "loss": 0.459, "num_input_tokens_seen": 19129440, "step": 15735 }, { "epoch": 1.9721839368500187, "grad_norm": 0.14715516567230225, "learning_rate": 9.860293196341313e-06, "loss": 0.4605, "num_input_tokens_seen": 19135456, "step": 15740 }, { "epoch": 1.9728104247588023, "grad_norm": 0.06318893283605576, "learning_rate": 9.86342563588523e-06, "loss": 0.4641, "num_input_tokens_seen": 19140832, "step": 15745 }, { "epoch": 1.9734369126675855, "grad_norm": 0.15822985768318176, "learning_rate": 9.866558075429144e-06, "loss": 0.4643, "num_input_tokens_seen": 19146560, "step": 15750 }, { "epoch": 1.9740634005763689, "grad_norm": 0.19930565357208252, "learning_rate": 9.869690514973061e-06, "loss": 0.4661, "num_input_tokens_seen": 19152800, "step": 15755 }, { "epoch": 1.9746898884851523, "grad_norm": 0.14163647592067719, "learning_rate": 9.872822954516978e-06, "loss": 0.4592, "num_input_tokens_seen": 19158944, "step": 15760 }, { "epoch": 1.9753163763939356, "grad_norm": 0.2371249496936798, "learning_rate": 9.875955394060895e-06, "loss": 0.4636, "num_input_tokens_seen": 19164288, "step": 15765 }, { "epoch": 1.975942864302719, "grad_norm": 0.17617762088775635, "learning_rate": 9.879087833604813e-06, "loss": 0.4659, "num_input_tokens_seen": 19170528, "step": 15770 }, { "epoch": 1.9765693522115022, "grad_norm": 0.15962721407413483, "learning_rate": 9.88222027314873e-06, "loss": 0.4665, "num_input_tokens_seen": 19176768, "step": 15775 }, { "epoch": 1.9771958401202858, "grad_norm": 0.14703448116779327, "learning_rate": 9.885352712692647e-06, "loss": 0.4632, "num_input_tokens_seen": 19183040, "step": 15780 }, { "epoch": 1.977822328029069, "grad_norm": 0.12204058468341827, "learning_rate": 9.888485152236562e-06, "loss": 0.4595, "num_input_tokens_seen": 19188832, "step": 15785 }, { "epoch": 1.9784488159378524, "grad_norm": 0.11645597964525223, "learning_rate": 9.891617591780478e-06, "loss": 0.4607, "num_input_tokens_seen": 19195168, "step": 15790 }, { "epoch": 1.9790753038466358, "grad_norm": 0.1655888557434082, "learning_rate": 9.894750031324395e-06, "loss": 0.4604, "num_input_tokens_seen": 19201376, "step": 15795 }, { "epoch": 1.9797017917554192, "grad_norm": 0.11533360928297043, "learning_rate": 9.897882470868314e-06, "loss": 0.4682, "num_input_tokens_seen": 19207712, "step": 15800 }, { "epoch": 1.9803282796642026, "grad_norm": 0.05161488056182861, "learning_rate": 9.90101491041223e-06, "loss": 0.4602, "num_input_tokens_seen": 19213760, "step": 15805 }, { "epoch": 1.9809547675729857, "grad_norm": 0.1463264524936676, "learning_rate": 9.904147349956147e-06, "loss": 0.4581, "num_input_tokens_seen": 19219488, "step": 15810 }, { "epoch": 1.9815812554817693, "grad_norm": 0.1314115673303604, "learning_rate": 9.907279789500064e-06, "loss": 0.4639, "num_input_tokens_seen": 19225888, "step": 15815 }, { "epoch": 1.9822077433905525, "grad_norm": 0.15500804781913757, "learning_rate": 9.91041222904398e-06, "loss": 0.4493, "num_input_tokens_seen": 19232064, "step": 15820 }, { "epoch": 1.982834231299336, "grad_norm": 0.06402425467967987, "learning_rate": 9.913544668587897e-06, "loss": 0.4621, "num_input_tokens_seen": 19238336, "step": 15825 }, { "epoch": 1.9834607192081193, "grad_norm": 0.14234264194965363, "learning_rate": 9.916677108131814e-06, "loss": 0.4751, "num_input_tokens_seen": 19244672, "step": 15830 }, { "epoch": 1.9840872071169027, "grad_norm": 0.3394390344619751, "learning_rate": 9.919809547675731e-06, "loss": 0.4576, "num_input_tokens_seen": 19250944, "step": 15835 }, { "epoch": 1.984713695025686, "grad_norm": 0.20377562940120697, "learning_rate": 9.922941987219648e-06, "loss": 0.4567, "num_input_tokens_seen": 19256928, "step": 15840 }, { "epoch": 1.9853401829344692, "grad_norm": 0.2767501175403595, "learning_rate": 9.926074426763565e-06, "loss": 0.4776, "num_input_tokens_seen": 19262944, "step": 15845 }, { "epoch": 1.9859666708432528, "grad_norm": 0.1994771957397461, "learning_rate": 9.929206866307481e-06, "loss": 0.4719, "num_input_tokens_seen": 19268480, "step": 15850 }, { "epoch": 1.986593158752036, "grad_norm": 0.34826478362083435, "learning_rate": 9.932339305851398e-06, "loss": 0.4673, "num_input_tokens_seen": 19274656, "step": 15855 }, { "epoch": 1.9872196466608194, "grad_norm": 0.30314627289772034, "learning_rate": 9.935471745395315e-06, "loss": 0.4651, "num_input_tokens_seen": 19280960, "step": 15860 }, { "epoch": 1.9878461345696028, "grad_norm": 0.397114098072052, "learning_rate": 9.938604184939232e-06, "loss": 0.4614, "num_input_tokens_seen": 19286816, "step": 15865 }, { "epoch": 1.9884726224783862, "grad_norm": 0.455738365650177, "learning_rate": 9.941736624483148e-06, "loss": 0.4731, "num_input_tokens_seen": 19293152, "step": 15870 }, { "epoch": 1.9890991103871696, "grad_norm": 0.27946335077285767, "learning_rate": 9.944869064027065e-06, "loss": 0.4613, "num_input_tokens_seen": 19299456, "step": 15875 }, { "epoch": 1.9897255982959527, "grad_norm": 0.09810630232095718, "learning_rate": 9.948001503570982e-06, "loss": 0.4599, "num_input_tokens_seen": 19305792, "step": 15880 }, { "epoch": 1.9903520862047364, "grad_norm": 0.220864400267601, "learning_rate": 9.951133943114899e-06, "loss": 0.46, "num_input_tokens_seen": 19311552, "step": 15885 }, { "epoch": 1.9909785741135195, "grad_norm": 0.35210728645324707, "learning_rate": 9.954266382658815e-06, "loss": 0.4687, "num_input_tokens_seen": 19317856, "step": 15890 }, { "epoch": 1.991605062022303, "grad_norm": 0.23026371002197266, "learning_rate": 9.957398822202732e-06, "loss": 0.4685, "num_input_tokens_seen": 19323744, "step": 15895 }, { "epoch": 1.9922315499310863, "grad_norm": 0.14793722331523895, "learning_rate": 9.960531261746649e-06, "loss": 0.4603, "num_input_tokens_seen": 19329504, "step": 15900 }, { "epoch": 1.9928580378398697, "grad_norm": 0.15883883833885193, "learning_rate": 9.963663701290566e-06, "loss": 0.4594, "num_input_tokens_seen": 19335584, "step": 15905 }, { "epoch": 1.993484525748653, "grad_norm": 0.12291543930768967, "learning_rate": 9.966796140834482e-06, "loss": 0.4681, "num_input_tokens_seen": 19341696, "step": 15910 }, { "epoch": 1.9941110136574363, "grad_norm": 0.0593867264688015, "learning_rate": 9.9699285803784e-06, "loss": 0.4689, "num_input_tokens_seen": 19347680, "step": 15915 }, { "epoch": 1.9947375015662199, "grad_norm": 0.18364174664020538, "learning_rate": 9.973061019922316e-06, "loss": 0.4627, "num_input_tokens_seen": 19354016, "step": 15920 }, { "epoch": 1.995363989475003, "grad_norm": 0.14409886300563812, "learning_rate": 9.976193459466233e-06, "loss": 0.456, "num_input_tokens_seen": 19359712, "step": 15925 }, { "epoch": 1.9959904773837867, "grad_norm": 0.16118109226226807, "learning_rate": 9.979325899010151e-06, "loss": 0.4636, "num_input_tokens_seen": 19365984, "step": 15930 }, { "epoch": 1.9966169652925698, "grad_norm": 0.1809287667274475, "learning_rate": 9.982458338554066e-06, "loss": 0.4665, "num_input_tokens_seen": 19371872, "step": 15935 }, { "epoch": 1.9972434532013532, "grad_norm": 0.1358356773853302, "learning_rate": 9.985590778097983e-06, "loss": 0.4686, "num_input_tokens_seen": 19378208, "step": 15940 }, { "epoch": 1.9978699411101366, "grad_norm": 0.11856305599212646, "learning_rate": 9.9887232176419e-06, "loss": 0.4605, "num_input_tokens_seen": 19384032, "step": 15945 }, { "epoch": 1.9984964290189198, "grad_norm": 0.23055870831012726, "learning_rate": 9.991855657185817e-06, "loss": 0.46, "num_input_tokens_seen": 19389664, "step": 15950 }, { "epoch": 1.9991229169277034, "grad_norm": 0.32719147205352783, "learning_rate": 9.994988096729733e-06, "loss": 0.4741, "num_input_tokens_seen": 19395584, "step": 15955 }, { "epoch": 1.9997494048364866, "grad_norm": 0.128075510263443, "learning_rate": 9.998120536273652e-06, "loss": 0.4696, "num_input_tokens_seen": 19402080, "step": 15960 }, { "epoch": 2.0, "eval_loss": 0.46267828345298767, "eval_runtime": 223.079, "eval_samples_per_second": 35.777, "eval_steps_per_second": 8.948, "num_input_tokens_seen": 19404608, "step": 15962 }, { "epoch": 2.00037589274527, "grad_norm": 0.1648460179567337, "learning_rate": 9.999999995217665e-06, "loss": 0.4527, "num_input_tokens_seen": 19408320, "step": 15965 }, { "epoch": 2.0010023806540533, "grad_norm": 0.13854753971099854, "learning_rate": 9.99999994141638e-06, "loss": 0.4722, "num_input_tokens_seen": 19414720, "step": 15970 }, { "epoch": 2.001628868562837, "grad_norm": 0.05335504189133644, "learning_rate": 9.999999827835894e-06, "loss": 0.4664, "num_input_tokens_seen": 19420928, "step": 15975 }, { "epoch": 2.00225535647162, "grad_norm": 0.13934452831745148, "learning_rate": 9.999999654476207e-06, "loss": 0.466, "num_input_tokens_seen": 19426656, "step": 15980 }, { "epoch": 2.0028818443804033, "grad_norm": 0.19663001596927643, "learning_rate": 9.999999421337318e-06, "loss": 0.4616, "num_input_tokens_seen": 19432608, "step": 15985 }, { "epoch": 2.003508332289187, "grad_norm": 0.25389590859413147, "learning_rate": 9.999999128419234e-06, "loss": 0.4636, "num_input_tokens_seen": 19438880, "step": 15990 }, { "epoch": 2.00413482019797, "grad_norm": 0.13150997459888458, "learning_rate": 9.999998775721957e-06, "loss": 0.4651, "num_input_tokens_seen": 19444768, "step": 15995 }, { "epoch": 2.0047613081067537, "grad_norm": 0.12371671199798584, "learning_rate": 9.999998363245489e-06, "loss": 0.4662, "num_input_tokens_seen": 19451104, "step": 16000 }, { "epoch": 2.005387796015537, "grad_norm": 0.15804307162761688, "learning_rate": 9.999997890989838e-06, "loss": 0.4617, "num_input_tokens_seen": 19457088, "step": 16005 }, { "epoch": 2.0060142839243205, "grad_norm": 0.13143619894981384, "learning_rate": 9.999997358955008e-06, "loss": 0.4581, "num_input_tokens_seen": 19463232, "step": 16010 }, { "epoch": 2.0066407718331036, "grad_norm": 0.2327149659395218, "learning_rate": 9.999996767141006e-06, "loss": 0.4618, "num_input_tokens_seen": 19469408, "step": 16015 }, { "epoch": 2.007267259741887, "grad_norm": 0.14726445078849792, "learning_rate": 9.999996115547838e-06, "loss": 0.4653, "num_input_tokens_seen": 19475904, "step": 16020 }, { "epoch": 2.0078937476506704, "grad_norm": 0.1628454476594925, "learning_rate": 9.999995404175514e-06, "loss": 0.4641, "num_input_tokens_seen": 19481920, "step": 16025 }, { "epoch": 2.0085202355594536, "grad_norm": 0.16352200508117676, "learning_rate": 9.999994633024042e-06, "loss": 0.4601, "num_input_tokens_seen": 19488256, "step": 16030 }, { "epoch": 2.009146723468237, "grad_norm": 0.30363228917121887, "learning_rate": 9.999993802093428e-06, "loss": 0.465, "num_input_tokens_seen": 19494304, "step": 16035 }, { "epoch": 2.0097732113770204, "grad_norm": 0.3081705868244171, "learning_rate": 9.999992911383686e-06, "loss": 0.4597, "num_input_tokens_seen": 19500512, "step": 16040 }, { "epoch": 2.010399699285804, "grad_norm": 0.2322661131620407, "learning_rate": 9.999991960894824e-06, "loss": 0.4845, "num_input_tokens_seen": 19506240, "step": 16045 }, { "epoch": 2.011026187194587, "grad_norm": 0.41826820373535156, "learning_rate": 9.999990950626854e-06, "loss": 0.4569, "num_input_tokens_seen": 19512448, "step": 16050 }, { "epoch": 2.0116526751033703, "grad_norm": 0.4940820038318634, "learning_rate": 9.99998988057979e-06, "loss": 0.4594, "num_input_tokens_seen": 19518592, "step": 16055 }, { "epoch": 2.012279163012154, "grad_norm": 0.2317737340927124, "learning_rate": 9.999988750753643e-06, "loss": 0.4588, "num_input_tokens_seen": 19524736, "step": 16060 }, { "epoch": 2.012905650920937, "grad_norm": 0.43133610486984253, "learning_rate": 9.999987561148425e-06, "loss": 0.4378, "num_input_tokens_seen": 19531104, "step": 16065 }, { "epoch": 2.0135321388297207, "grad_norm": 0.7188762426376343, "learning_rate": 9.999986311764153e-06, "loss": 0.4561, "num_input_tokens_seen": 19537312, "step": 16070 }, { "epoch": 2.014158626738504, "grad_norm": 0.47994476556777954, "learning_rate": 9.999985002600842e-06, "loss": 0.509, "num_input_tokens_seen": 19543520, "step": 16075 }, { "epoch": 2.0147851146472875, "grad_norm": 0.13480477035045624, "learning_rate": 9.999983633658505e-06, "loss": 0.4644, "num_input_tokens_seen": 19549184, "step": 16080 }, { "epoch": 2.0154116025560707, "grad_norm": 0.15965741872787476, "learning_rate": 9.999982204937162e-06, "loss": 0.4674, "num_input_tokens_seen": 19555328, "step": 16085 }, { "epoch": 2.016038090464854, "grad_norm": 0.160999596118927, "learning_rate": 9.999980716436826e-06, "loss": 0.4566, "num_input_tokens_seen": 19561888, "step": 16090 }, { "epoch": 2.0166645783736374, "grad_norm": 0.17945273220539093, "learning_rate": 9.999979168157516e-06, "loss": 0.4607, "num_input_tokens_seen": 19568032, "step": 16095 }, { "epoch": 2.0172910662824206, "grad_norm": 0.3025516867637634, "learning_rate": 9.999977560099252e-06, "loss": 0.4613, "num_input_tokens_seen": 19573536, "step": 16100 }, { "epoch": 2.017917554191204, "grad_norm": 0.2528477907180786, "learning_rate": 9.999975892262054e-06, "loss": 0.4598, "num_input_tokens_seen": 19579328, "step": 16105 }, { "epoch": 2.0185440420999874, "grad_norm": 0.12541471421718597, "learning_rate": 9.999974164645939e-06, "loss": 0.464, "num_input_tokens_seen": 19585056, "step": 16110 }, { "epoch": 2.019170530008771, "grad_norm": 0.22382254898548126, "learning_rate": 9.99997237725093e-06, "loss": 0.4697, "num_input_tokens_seen": 19591296, "step": 16115 }, { "epoch": 2.019797017917554, "grad_norm": 0.05328422039747238, "learning_rate": 9.999970530077047e-06, "loss": 0.4664, "num_input_tokens_seen": 19597408, "step": 16120 }, { "epoch": 2.0204235058263373, "grad_norm": 0.052836887538433075, "learning_rate": 9.999968623124312e-06, "loss": 0.4638, "num_input_tokens_seen": 19602656, "step": 16125 }, { "epoch": 2.021049993735121, "grad_norm": 0.1841534823179245, "learning_rate": 9.99996665639275e-06, "loss": 0.4596, "num_input_tokens_seen": 19609120, "step": 16130 }, { "epoch": 2.021676481643904, "grad_norm": 0.13756173849105835, "learning_rate": 9.99996462988238e-06, "loss": 0.4663, "num_input_tokens_seen": 19614720, "step": 16135 }, { "epoch": 2.0223029695526877, "grad_norm": 0.16341952979564667, "learning_rate": 9.999962543593232e-06, "loss": 0.4676, "num_input_tokens_seen": 19620768, "step": 16140 }, { "epoch": 2.022929457461471, "grad_norm": 0.11823750287294388, "learning_rate": 9.999960397525328e-06, "loss": 0.4688, "num_input_tokens_seen": 19626752, "step": 16145 }, { "epoch": 2.0235559453702545, "grad_norm": 0.24045373499393463, "learning_rate": 9.999958191678692e-06, "loss": 0.4587, "num_input_tokens_seen": 19632928, "step": 16150 }, { "epoch": 2.0241824332790377, "grad_norm": 0.2633492648601532, "learning_rate": 9.999955926053353e-06, "loss": 0.4592, "num_input_tokens_seen": 19639424, "step": 16155 }, { "epoch": 2.0248089211878213, "grad_norm": 0.14011572301387787, "learning_rate": 9.999953600649337e-06, "loss": 0.4614, "num_input_tokens_seen": 19645184, "step": 16160 }, { "epoch": 2.0254354090966045, "grad_norm": 0.15346132218837738, "learning_rate": 9.999951215466673e-06, "loss": 0.4592, "num_input_tokens_seen": 19651360, "step": 16165 }, { "epoch": 2.0260618970053876, "grad_norm": 0.25438573956489563, "learning_rate": 9.999948770505386e-06, "loss": 0.47, "num_input_tokens_seen": 19656704, "step": 16170 }, { "epoch": 2.0266883849141712, "grad_norm": 0.35362204909324646, "learning_rate": 9.99994626576551e-06, "loss": 0.4637, "num_input_tokens_seen": 19662976, "step": 16175 }, { "epoch": 2.0273148728229544, "grad_norm": 0.24290309846401215, "learning_rate": 9.99994370124707e-06, "loss": 0.4555, "num_input_tokens_seen": 19669120, "step": 16180 }, { "epoch": 2.027941360731738, "grad_norm": 0.23974384367465973, "learning_rate": 9.999941076950103e-06, "loss": 0.4642, "num_input_tokens_seen": 19675264, "step": 16185 }, { "epoch": 2.028567848640521, "grad_norm": 0.05435016006231308, "learning_rate": 9.999938392874634e-06, "loss": 0.4598, "num_input_tokens_seen": 19681472, "step": 16190 }, { "epoch": 2.029194336549305, "grad_norm": 0.13574479520320892, "learning_rate": 9.9999356490207e-06, "loss": 0.4565, "num_input_tokens_seen": 19687232, "step": 16195 }, { "epoch": 2.029820824458088, "grad_norm": 0.14872531592845917, "learning_rate": 9.99993284538833e-06, "loss": 0.474, "num_input_tokens_seen": 19692864, "step": 16200 }, { "epoch": 2.030447312366871, "grad_norm": 0.27996039390563965, "learning_rate": 9.999929981977559e-06, "loss": 0.4756, "num_input_tokens_seen": 19698624, "step": 16205 }, { "epoch": 2.0310738002756548, "grad_norm": 0.12208328396081924, "learning_rate": 9.999927058788421e-06, "loss": 0.4698, "num_input_tokens_seen": 19704448, "step": 16210 }, { "epoch": 2.031700288184438, "grad_norm": 0.13288292288780212, "learning_rate": 9.999924075820953e-06, "loss": 0.4589, "num_input_tokens_seen": 19710368, "step": 16215 }, { "epoch": 2.0323267760932215, "grad_norm": 0.040514688938856125, "learning_rate": 9.999921033075188e-06, "loss": 0.4564, "num_input_tokens_seen": 19716384, "step": 16220 }, { "epoch": 2.0329532640020047, "grad_norm": 0.13755150139331818, "learning_rate": 9.999917930551164e-06, "loss": 0.4599, "num_input_tokens_seen": 19722656, "step": 16225 }, { "epoch": 2.0335797519107883, "grad_norm": 0.13479028642177582, "learning_rate": 9.999914768248916e-06, "loss": 0.4635, "num_input_tokens_seen": 19729120, "step": 16230 }, { "epoch": 2.0342062398195715, "grad_norm": 0.06402271240949631, "learning_rate": 9.999911546168483e-06, "loss": 0.4697, "num_input_tokens_seen": 19735072, "step": 16235 }, { "epoch": 2.0348327277283547, "grad_norm": 0.19404560327529907, "learning_rate": 9.999908264309905e-06, "loss": 0.4551, "num_input_tokens_seen": 19740992, "step": 16240 }, { "epoch": 2.0354592156371383, "grad_norm": 0.13698796927928925, "learning_rate": 9.99990492267322e-06, "loss": 0.4623, "num_input_tokens_seen": 19746944, "step": 16245 }, { "epoch": 2.0360857035459214, "grad_norm": 0.11589556932449341, "learning_rate": 9.999901521258468e-06, "loss": 0.4664, "num_input_tokens_seen": 19752544, "step": 16250 }, { "epoch": 2.036712191454705, "grad_norm": 0.2189057618379593, "learning_rate": 9.99989806006569e-06, "loss": 0.4627, "num_input_tokens_seen": 19758560, "step": 16255 }, { "epoch": 2.0373386793634882, "grad_norm": 0.1577228307723999, "learning_rate": 9.999894539094927e-06, "loss": 0.4632, "num_input_tokens_seen": 19764896, "step": 16260 }, { "epoch": 2.037965167272272, "grad_norm": 0.16966699063777924, "learning_rate": 9.99989095834622e-06, "loss": 0.4625, "num_input_tokens_seen": 19771104, "step": 16265 }, { "epoch": 2.038591655181055, "grad_norm": 0.04669661074876785, "learning_rate": 9.999887317819615e-06, "loss": 0.4682, "num_input_tokens_seen": 19776960, "step": 16270 }, { "epoch": 2.039218143089838, "grad_norm": 0.1311042606830597, "learning_rate": 9.999883617515151e-06, "loss": 0.4625, "num_input_tokens_seen": 19783200, "step": 16275 }, { "epoch": 2.039844630998622, "grad_norm": 0.1449069231748581, "learning_rate": 9.999879857432875e-06, "loss": 0.4629, "num_input_tokens_seen": 19789184, "step": 16280 }, { "epoch": 2.040471118907405, "grad_norm": 0.1128024309873581, "learning_rate": 9.999876037572833e-06, "loss": 0.4598, "num_input_tokens_seen": 19794656, "step": 16285 }, { "epoch": 2.0410976068161886, "grad_norm": 0.03981523588299751, "learning_rate": 9.999872157935069e-06, "loss": 0.4626, "num_input_tokens_seen": 19801120, "step": 16290 }, { "epoch": 2.0417240947249717, "grad_norm": 0.14038683474063873, "learning_rate": 9.999868218519628e-06, "loss": 0.4631, "num_input_tokens_seen": 19807392, "step": 16295 }, { "epoch": 2.0423505826337554, "grad_norm": 0.14271773397922516, "learning_rate": 9.99986421932656e-06, "loss": 0.462, "num_input_tokens_seen": 19813248, "step": 16300 }, { "epoch": 2.0429770705425385, "grad_norm": 0.13175424933433533, "learning_rate": 9.999860160355911e-06, "loss": 0.4641, "num_input_tokens_seen": 19818816, "step": 16305 }, { "epoch": 2.0436035584513217, "grad_norm": 0.2096216231584549, "learning_rate": 9.999856041607732e-06, "loss": 0.4646, "num_input_tokens_seen": 19825120, "step": 16310 }, { "epoch": 2.0442300463601053, "grad_norm": 0.1728668063879013, "learning_rate": 9.999851863082068e-06, "loss": 0.462, "num_input_tokens_seen": 19831616, "step": 16315 }, { "epoch": 2.0448565342688885, "grad_norm": 0.15313304960727692, "learning_rate": 9.999847624778973e-06, "loss": 0.4638, "num_input_tokens_seen": 19837504, "step": 16320 }, { "epoch": 2.045483022177672, "grad_norm": 0.04530135169625282, "learning_rate": 9.999843326698495e-06, "loss": 0.4605, "num_input_tokens_seen": 19843872, "step": 16325 }, { "epoch": 2.0461095100864553, "grad_norm": 0.19427278637886047, "learning_rate": 9.999838968840687e-06, "loss": 0.462, "num_input_tokens_seen": 19850144, "step": 16330 }, { "epoch": 2.046735997995239, "grad_norm": 0.13238036632537842, "learning_rate": 9.999834551205601e-06, "loss": 0.4614, "num_input_tokens_seen": 19856352, "step": 16335 }, { "epoch": 2.047362485904022, "grad_norm": 0.1417704075574875, "learning_rate": 9.99983007379329e-06, "loss": 0.4667, "num_input_tokens_seen": 19862368, "step": 16340 }, { "epoch": 2.047988973812805, "grad_norm": 0.043082404881715775, "learning_rate": 9.999825536603804e-06, "loss": 0.472, "num_input_tokens_seen": 19868544, "step": 16345 }, { "epoch": 2.048615461721589, "grad_norm": 0.13607776165008545, "learning_rate": 9.999820939637201e-06, "loss": 0.448, "num_input_tokens_seen": 19873536, "step": 16350 }, { "epoch": 2.049241949630372, "grad_norm": 0.13210612535476685, "learning_rate": 9.999816282893537e-06, "loss": 0.4633, "num_input_tokens_seen": 19879072, "step": 16355 }, { "epoch": 2.0498684375391556, "grad_norm": 0.119796521961689, "learning_rate": 9.999811566372863e-06, "loss": 0.4607, "num_input_tokens_seen": 19885088, "step": 16360 }, { "epoch": 2.0504949254479388, "grad_norm": 0.2502748966217041, "learning_rate": 9.99980679007524e-06, "loss": 0.4825, "num_input_tokens_seen": 19891552, "step": 16365 }, { "epoch": 2.0511214133567224, "grad_norm": 0.24411170184612274, "learning_rate": 9.999801954000723e-06, "loss": 0.4724, "num_input_tokens_seen": 19897440, "step": 16370 }, { "epoch": 2.0517479012655055, "grad_norm": 0.04504348337650299, "learning_rate": 9.99979705814937e-06, "loss": 0.4696, "num_input_tokens_seen": 19903680, "step": 16375 }, { "epoch": 2.0523743891742887, "grad_norm": 0.03551282733678818, "learning_rate": 9.99979210252124e-06, "loss": 0.4716, "num_input_tokens_seen": 19909472, "step": 16380 }, { "epoch": 2.0530008770830723, "grad_norm": 0.16672074794769287, "learning_rate": 9.99978708711639e-06, "loss": 0.4624, "num_input_tokens_seen": 19915744, "step": 16385 }, { "epoch": 2.0536273649918555, "grad_norm": 0.1447891741991043, "learning_rate": 9.999782011934883e-06, "loss": 0.4695, "num_input_tokens_seen": 19921664, "step": 16390 }, { "epoch": 2.054253852900639, "grad_norm": 0.1089296042919159, "learning_rate": 9.99977687697678e-06, "loss": 0.4638, "num_input_tokens_seen": 19927968, "step": 16395 }, { "epoch": 2.0548803408094223, "grad_norm": 0.13243718445301056, "learning_rate": 9.999771682242138e-06, "loss": 0.4631, "num_input_tokens_seen": 19933888, "step": 16400 }, { "epoch": 2.055506828718206, "grad_norm": 0.2305489182472229, "learning_rate": 9.999766427731024e-06, "loss": 0.4615, "num_input_tokens_seen": 19939616, "step": 16405 }, { "epoch": 2.056133316626989, "grad_norm": 0.18420253694057465, "learning_rate": 9.999761113443497e-06, "loss": 0.46, "num_input_tokens_seen": 19946112, "step": 16410 }, { "epoch": 2.0567598045357727, "grad_norm": 0.11452984809875488, "learning_rate": 9.999755739379624e-06, "loss": 0.4631, "num_input_tokens_seen": 19952256, "step": 16415 }, { "epoch": 2.057386292444556, "grad_norm": 0.1139223650097847, "learning_rate": 9.999750305539467e-06, "loss": 0.461, "num_input_tokens_seen": 19958496, "step": 16420 }, { "epoch": 2.058012780353339, "grad_norm": 0.15915989875793457, "learning_rate": 9.999744811923092e-06, "loss": 0.4674, "num_input_tokens_seen": 19963872, "step": 16425 }, { "epoch": 2.0586392682621226, "grad_norm": 0.11047378182411194, "learning_rate": 9.999739258530563e-06, "loss": 0.4606, "num_input_tokens_seen": 19969920, "step": 16430 }, { "epoch": 2.059265756170906, "grad_norm": 0.21523238718509674, "learning_rate": 9.999733645361948e-06, "loss": 0.4586, "num_input_tokens_seen": 19975904, "step": 16435 }, { "epoch": 2.0598922440796894, "grad_norm": 0.047382012009620667, "learning_rate": 9.999727972417314e-06, "loss": 0.4637, "num_input_tokens_seen": 19981920, "step": 16440 }, { "epoch": 2.0605187319884726, "grad_norm": 0.043278131633996964, "learning_rate": 9.999722239696728e-06, "loss": 0.4665, "num_input_tokens_seen": 19988128, "step": 16445 }, { "epoch": 2.061145219897256, "grad_norm": 0.19887112081050873, "learning_rate": 9.999716447200259e-06, "loss": 0.4603, "num_input_tokens_seen": 19994272, "step": 16450 }, { "epoch": 2.0617717078060394, "grad_norm": 0.21741852164268494, "learning_rate": 9.999710594927979e-06, "loss": 0.4666, "num_input_tokens_seen": 20000288, "step": 16455 }, { "epoch": 2.0623981957148225, "grad_norm": 0.2337714284658432, "learning_rate": 9.999704682879952e-06, "loss": 0.461, "num_input_tokens_seen": 20006112, "step": 16460 }, { "epoch": 2.063024683623606, "grad_norm": 0.11951173841953278, "learning_rate": 9.999698711056254e-06, "loss": 0.4605, "num_input_tokens_seen": 20012448, "step": 16465 }, { "epoch": 2.0636511715323893, "grad_norm": 0.23160915076732635, "learning_rate": 9.999692679456953e-06, "loss": 0.4617, "num_input_tokens_seen": 20018848, "step": 16470 }, { "epoch": 2.064277659441173, "grad_norm": 0.12316714227199554, "learning_rate": 9.999686588082123e-06, "loss": 0.4614, "num_input_tokens_seen": 20024928, "step": 16475 }, { "epoch": 2.064904147349956, "grad_norm": 0.1393737941980362, "learning_rate": 9.999680436931836e-06, "loss": 0.4677, "num_input_tokens_seen": 20031200, "step": 16480 }, { "epoch": 2.0655306352587397, "grad_norm": 0.14124298095703125, "learning_rate": 9.999674226006167e-06, "loss": 0.4618, "num_input_tokens_seen": 20037344, "step": 16485 }, { "epoch": 2.066157123167523, "grad_norm": 0.13002009689807892, "learning_rate": 9.999667955305188e-06, "loss": 0.4648, "num_input_tokens_seen": 20043392, "step": 16490 }, { "epoch": 2.066783611076306, "grad_norm": 0.12877829372882843, "learning_rate": 9.999661624828975e-06, "loss": 0.4647, "num_input_tokens_seen": 20049472, "step": 16495 }, { "epoch": 2.0674100989850897, "grad_norm": 0.1723538637161255, "learning_rate": 9.999655234577605e-06, "loss": 0.4575, "num_input_tokens_seen": 20055296, "step": 16500 }, { "epoch": 2.068036586893873, "grad_norm": 0.12413660436868668, "learning_rate": 9.999648784551153e-06, "loss": 0.4616, "num_input_tokens_seen": 20061152, "step": 16505 }, { "epoch": 2.0686630748026564, "grad_norm": 0.16822141408920288, "learning_rate": 9.999642274749697e-06, "loss": 0.4663, "num_input_tokens_seen": 20067200, "step": 16510 }, { "epoch": 2.0692895627114396, "grad_norm": 0.13116240501403809, "learning_rate": 9.999635705173312e-06, "loss": 0.4643, "num_input_tokens_seen": 20073344, "step": 16515 }, { "epoch": 2.069916050620223, "grad_norm": 0.12871801853179932, "learning_rate": 9.999629075822079e-06, "loss": 0.4595, "num_input_tokens_seen": 20079584, "step": 16520 }, { "epoch": 2.0705425385290064, "grad_norm": 0.2065943479537964, "learning_rate": 9.999622386696079e-06, "loss": 0.4654, "num_input_tokens_seen": 20085952, "step": 16525 }, { "epoch": 2.0711690264377896, "grad_norm": 0.19415068626403809, "learning_rate": 9.999615637795388e-06, "loss": 0.4596, "num_input_tokens_seen": 20092000, "step": 16530 }, { "epoch": 2.071795514346573, "grad_norm": 0.1621103286743164, "learning_rate": 9.99960882912009e-06, "loss": 0.4643, "num_input_tokens_seen": 20098144, "step": 16535 }, { "epoch": 2.0724220022553563, "grad_norm": 0.24395841360092163, "learning_rate": 9.999601960670263e-06, "loss": 0.4571, "num_input_tokens_seen": 20104032, "step": 16540 }, { "epoch": 2.07304849016414, "grad_norm": 0.11857016384601593, "learning_rate": 9.999595032445993e-06, "loss": 0.4613, "num_input_tokens_seen": 20110336, "step": 16545 }, { "epoch": 2.073674978072923, "grad_norm": 0.13107505440711975, "learning_rate": 9.99958804444736e-06, "loss": 0.4613, "num_input_tokens_seen": 20116384, "step": 16550 }, { "epoch": 2.0743014659817067, "grad_norm": 0.12201765179634094, "learning_rate": 9.99958099667445e-06, "loss": 0.4599, "num_input_tokens_seen": 20122688, "step": 16555 }, { "epoch": 2.07492795389049, "grad_norm": 0.15175551176071167, "learning_rate": 9.999573889127343e-06, "loss": 0.4682, "num_input_tokens_seen": 20128416, "step": 16560 }, { "epoch": 2.075554441799273, "grad_norm": 0.05077182129025459, "learning_rate": 9.99956672180613e-06, "loss": 0.4617, "num_input_tokens_seen": 20134752, "step": 16565 }, { "epoch": 2.0761809297080567, "grad_norm": 0.24764126539230347, "learning_rate": 9.999559494710892e-06, "loss": 0.4622, "num_input_tokens_seen": 20140992, "step": 16570 }, { "epoch": 2.07680741761684, "grad_norm": 0.11649824678897858, "learning_rate": 9.999552207841714e-06, "loss": 0.4654, "num_input_tokens_seen": 20147328, "step": 16575 }, { "epoch": 2.0774339055256235, "grad_norm": 0.18090437352657318, "learning_rate": 9.999544861198689e-06, "loss": 0.4683, "num_input_tokens_seen": 20153248, "step": 16580 }, { "epoch": 2.0780603934344066, "grad_norm": 0.12780870497226715, "learning_rate": 9.999537454781902e-06, "loss": 0.4577, "num_input_tokens_seen": 20159616, "step": 16585 }, { "epoch": 2.0786868813431902, "grad_norm": 0.12402280420064926, "learning_rate": 9.99952998859144e-06, "loss": 0.4656, "num_input_tokens_seen": 20166112, "step": 16590 }, { "epoch": 2.0793133692519734, "grad_norm": 0.18473933637142181, "learning_rate": 9.999522462627395e-06, "loss": 0.4551, "num_input_tokens_seen": 20172256, "step": 16595 }, { "epoch": 2.079939857160757, "grad_norm": 0.12907755374908447, "learning_rate": 9.999514876889855e-06, "loss": 0.4703, "num_input_tokens_seen": 20178560, "step": 16600 }, { "epoch": 2.08056634506954, "grad_norm": 0.11812291294336319, "learning_rate": 9.99950723137891e-06, "loss": 0.4552, "num_input_tokens_seen": 20184608, "step": 16605 }, { "epoch": 2.0811928329783234, "grad_norm": 0.12703518569469452, "learning_rate": 9.999499526094654e-06, "loss": 0.4608, "num_input_tokens_seen": 20190848, "step": 16610 }, { "epoch": 2.081819320887107, "grad_norm": 0.15080438554286957, "learning_rate": 9.999491761037179e-06, "loss": 0.4636, "num_input_tokens_seen": 20196928, "step": 16615 }, { "epoch": 2.08244580879589, "grad_norm": 0.22334857285022736, "learning_rate": 9.999483936206575e-06, "loss": 0.4557, "num_input_tokens_seen": 20203008, "step": 16620 }, { "epoch": 2.0830722967046738, "grad_norm": 0.14458614587783813, "learning_rate": 9.999476051602937e-06, "loss": 0.4692, "num_input_tokens_seen": 20209408, "step": 16625 }, { "epoch": 2.083698784613457, "grad_norm": 0.2392345517873764, "learning_rate": 9.99946810722636e-06, "loss": 0.4672, "num_input_tokens_seen": 20214464, "step": 16630 }, { "epoch": 2.0843252725222405, "grad_norm": 0.10829094797372818, "learning_rate": 9.999460103076939e-06, "loss": 0.4707, "num_input_tokens_seen": 20220672, "step": 16635 }, { "epoch": 2.0849517604310237, "grad_norm": 0.16477127373218536, "learning_rate": 9.999452039154769e-06, "loss": 0.46, "num_input_tokens_seen": 20226720, "step": 16640 }, { "epoch": 2.085578248339807, "grad_norm": 0.11374803632497787, "learning_rate": 9.999443915459945e-06, "loss": 0.4602, "num_input_tokens_seen": 20232992, "step": 16645 }, { "epoch": 2.0862047362485905, "grad_norm": 0.11568620055913925, "learning_rate": 9.999435731992568e-06, "loss": 0.4664, "num_input_tokens_seen": 20238784, "step": 16650 }, { "epoch": 2.0868312241573737, "grad_norm": 0.12187424302101135, "learning_rate": 9.999427488752732e-06, "loss": 0.4632, "num_input_tokens_seen": 20245088, "step": 16655 }, { "epoch": 2.0874577120661573, "grad_norm": 0.12435980886220932, "learning_rate": 9.999419185740537e-06, "loss": 0.4656, "num_input_tokens_seen": 20251328, "step": 16660 }, { "epoch": 2.0880841999749404, "grad_norm": 0.1943441480398178, "learning_rate": 9.999410822956083e-06, "loss": 0.4648, "num_input_tokens_seen": 20257056, "step": 16665 }, { "epoch": 2.088710687883724, "grad_norm": 0.2118920534849167, "learning_rate": 9.999402400399469e-06, "loss": 0.4601, "num_input_tokens_seen": 20263296, "step": 16670 }, { "epoch": 2.089337175792507, "grad_norm": 0.27182820439338684, "learning_rate": 9.999393918070797e-06, "loss": 0.4609, "num_input_tokens_seen": 20269792, "step": 16675 }, { "epoch": 2.0899636637012904, "grad_norm": 0.12064393609762192, "learning_rate": 9.999385375970168e-06, "loss": 0.4611, "num_input_tokens_seen": 20275840, "step": 16680 }, { "epoch": 2.090590151610074, "grad_norm": 0.11763840913772583, "learning_rate": 9.999376774097682e-06, "loss": 0.4582, "num_input_tokens_seen": 20281216, "step": 16685 }, { "epoch": 2.091216639518857, "grad_norm": 0.0487716943025589, "learning_rate": 9.999368112453443e-06, "loss": 0.4612, "num_input_tokens_seen": 20287424, "step": 16690 }, { "epoch": 2.091843127427641, "grad_norm": 0.108405202627182, "learning_rate": 9.999359391037555e-06, "loss": 0.4631, "num_input_tokens_seen": 20293088, "step": 16695 }, { "epoch": 2.092469615336424, "grad_norm": 0.10642548650503159, "learning_rate": 9.999350609850123e-06, "loss": 0.4598, "num_input_tokens_seen": 20298880, "step": 16700 }, { "epoch": 2.0930961032452076, "grad_norm": 0.11678009480237961, "learning_rate": 9.99934176889125e-06, "loss": 0.4595, "num_input_tokens_seen": 20304864, "step": 16705 }, { "epoch": 2.0937225911539907, "grad_norm": 0.11679008603096008, "learning_rate": 9.999332868161046e-06, "loss": 0.4563, "num_input_tokens_seen": 20311328, "step": 16710 }, { "epoch": 2.094349079062774, "grad_norm": 0.12524887919425964, "learning_rate": 9.999323907659612e-06, "loss": 0.4712, "num_input_tokens_seen": 20317216, "step": 16715 }, { "epoch": 2.0949755669715575, "grad_norm": 0.12452389299869537, "learning_rate": 9.999314887387058e-06, "loss": 0.473, "num_input_tokens_seen": 20323264, "step": 16720 }, { "epoch": 2.0956020548803407, "grad_norm": 0.2186392992734909, "learning_rate": 9.999305807343492e-06, "loss": 0.4643, "num_input_tokens_seen": 20328800, "step": 16725 }, { "epoch": 2.0962285427891243, "grad_norm": 0.12000146508216858, "learning_rate": 9.99929666752902e-06, "loss": 0.4573, "num_input_tokens_seen": 20334784, "step": 16730 }, { "epoch": 2.0968550306979075, "grad_norm": 0.045022524893283844, "learning_rate": 9.999287467943755e-06, "loss": 0.4595, "num_input_tokens_seen": 20340640, "step": 16735 }, { "epoch": 2.097481518606691, "grad_norm": 0.12026828527450562, "learning_rate": 9.999278208587803e-06, "loss": 0.4556, "num_input_tokens_seen": 20346720, "step": 16740 }, { "epoch": 2.0981080065154742, "grad_norm": 0.04590427502989769, "learning_rate": 9.99926888946128e-06, "loss": 0.4591, "num_input_tokens_seen": 20353088, "step": 16745 }, { "epoch": 2.0987344944242574, "grad_norm": 0.18570075929164886, "learning_rate": 9.999259510564292e-06, "loss": 0.4694, "num_input_tokens_seen": 20359264, "step": 16750 }, { "epoch": 2.099360982333041, "grad_norm": 0.17402684688568115, "learning_rate": 9.999250071896954e-06, "loss": 0.4644, "num_input_tokens_seen": 20365504, "step": 16755 }, { "epoch": 2.099987470241824, "grad_norm": 0.1445757895708084, "learning_rate": 9.99924057345938e-06, "loss": 0.4516, "num_input_tokens_seen": 20371520, "step": 16760 }, { "epoch": 2.100613958150608, "grad_norm": 0.15752698481082916, "learning_rate": 9.999231015251677e-06, "loss": 0.4633, "num_input_tokens_seen": 20377728, "step": 16765 }, { "epoch": 2.101240446059391, "grad_norm": 0.21362963318824768, "learning_rate": 9.999221397273968e-06, "loss": 0.4663, "num_input_tokens_seen": 20383072, "step": 16770 }, { "epoch": 2.1018669339681746, "grad_norm": 0.22570902109146118, "learning_rate": 9.999211719526365e-06, "loss": 0.4725, "num_input_tokens_seen": 20388736, "step": 16775 }, { "epoch": 2.1024934218769578, "grad_norm": 0.3550397753715515, "learning_rate": 9.999201982008979e-06, "loss": 0.4466, "num_input_tokens_seen": 20394976, "step": 16780 }, { "epoch": 2.103119909785741, "grad_norm": 0.3575442433357239, "learning_rate": 9.999192184721933e-06, "loss": 0.469, "num_input_tokens_seen": 20400896, "step": 16785 }, { "epoch": 2.1037463976945245, "grad_norm": 0.07965048402547836, "learning_rate": 9.99918232766534e-06, "loss": 0.4656, "num_input_tokens_seen": 20406592, "step": 16790 }, { "epoch": 2.1043728856033077, "grad_norm": 0.0591605119407177, "learning_rate": 9.99917241083932e-06, "loss": 0.4727, "num_input_tokens_seen": 20412576, "step": 16795 }, { "epoch": 2.1049993735120913, "grad_norm": 0.12059810012578964, "learning_rate": 9.99916243424399e-06, "loss": 0.4707, "num_input_tokens_seen": 20418624, "step": 16800 }, { "epoch": 2.1056258614208745, "grad_norm": 0.16242390871047974, "learning_rate": 9.99915239787947e-06, "loss": 0.4604, "num_input_tokens_seen": 20424736, "step": 16805 }, { "epoch": 2.106252349329658, "grad_norm": 0.12357597798109055, "learning_rate": 9.999142301745883e-06, "loss": 0.4617, "num_input_tokens_seen": 20431072, "step": 16810 }, { "epoch": 2.1068788372384413, "grad_norm": 0.05153968557715416, "learning_rate": 9.999132145843344e-06, "loss": 0.4595, "num_input_tokens_seen": 20436544, "step": 16815 }, { "epoch": 2.1075053251472244, "grad_norm": 0.22474168241024017, "learning_rate": 9.999121930171977e-06, "loss": 0.468, "num_input_tokens_seen": 20442848, "step": 16820 }, { "epoch": 2.108131813056008, "grad_norm": 0.22595573961734772, "learning_rate": 9.999111654731905e-06, "loss": 0.469, "num_input_tokens_seen": 20449088, "step": 16825 }, { "epoch": 2.108758300964791, "grad_norm": 0.2604062855243683, "learning_rate": 9.99910131952325e-06, "loss": 0.4648, "num_input_tokens_seen": 20455104, "step": 16830 }, { "epoch": 2.109384788873575, "grad_norm": 0.18344081938266754, "learning_rate": 9.999090924546135e-06, "loss": 0.4639, "num_input_tokens_seen": 20461120, "step": 16835 }, { "epoch": 2.110011276782358, "grad_norm": 0.2205272912979126, "learning_rate": 9.999080469800686e-06, "loss": 0.4578, "num_input_tokens_seen": 20466912, "step": 16840 }, { "epoch": 2.1106377646911416, "grad_norm": 0.31142833828926086, "learning_rate": 9.999069955287027e-06, "loss": 0.4736, "num_input_tokens_seen": 20473152, "step": 16845 }, { "epoch": 2.111264252599925, "grad_norm": 0.11856786906719208, "learning_rate": 9.999059381005283e-06, "loss": 0.463, "num_input_tokens_seen": 20479200, "step": 16850 }, { "epoch": 2.1118907405087084, "grad_norm": 0.16114480793476105, "learning_rate": 9.999048746955581e-06, "loss": 0.4628, "num_input_tokens_seen": 20485376, "step": 16855 }, { "epoch": 2.1125172284174916, "grad_norm": 0.1185884028673172, "learning_rate": 9.999038053138049e-06, "loss": 0.457, "num_input_tokens_seen": 20491680, "step": 16860 }, { "epoch": 2.1131437163262747, "grad_norm": 0.1499987244606018, "learning_rate": 9.999027299552813e-06, "loss": 0.4689, "num_input_tokens_seen": 20497088, "step": 16865 }, { "epoch": 2.1137702042350583, "grad_norm": 0.23131008446216583, "learning_rate": 9.999016486200004e-06, "loss": 0.4595, "num_input_tokens_seen": 20503296, "step": 16870 }, { "epoch": 2.1143966921438415, "grad_norm": 0.19976553320884705, "learning_rate": 9.99900561307975e-06, "loss": 0.4668, "num_input_tokens_seen": 20509280, "step": 16875 }, { "epoch": 2.115023180052625, "grad_norm": 0.12638308107852936, "learning_rate": 9.99899468019218e-06, "loss": 0.4637, "num_input_tokens_seen": 20515520, "step": 16880 }, { "epoch": 2.1156496679614083, "grad_norm": 0.22320838272571564, "learning_rate": 9.998983687537426e-06, "loss": 0.4565, "num_input_tokens_seen": 20521472, "step": 16885 }, { "epoch": 2.116276155870192, "grad_norm": 0.14138944447040558, "learning_rate": 9.99897263511562e-06, "loss": 0.4577, "num_input_tokens_seen": 20527424, "step": 16890 }, { "epoch": 2.116902643778975, "grad_norm": 0.15873025357723236, "learning_rate": 9.998961522926892e-06, "loss": 0.4674, "num_input_tokens_seen": 20533504, "step": 16895 }, { "epoch": 2.1175291316877582, "grad_norm": 0.14301051199436188, "learning_rate": 9.998950350971377e-06, "loss": 0.4689, "num_input_tokens_seen": 20539744, "step": 16900 }, { "epoch": 2.118155619596542, "grad_norm": 0.14226457476615906, "learning_rate": 9.998939119249207e-06, "loss": 0.4621, "num_input_tokens_seen": 20545728, "step": 16905 }, { "epoch": 2.118782107505325, "grad_norm": 0.1384759098291397, "learning_rate": 9.998927827760516e-06, "loss": 0.4672, "num_input_tokens_seen": 20551424, "step": 16910 }, { "epoch": 2.1194085954141086, "grad_norm": 0.12381807714700699, "learning_rate": 9.99891647650544e-06, "loss": 0.4655, "num_input_tokens_seen": 20557696, "step": 16915 }, { "epoch": 2.120035083322892, "grad_norm": 0.19071076810359955, "learning_rate": 9.998905065484118e-06, "loss": 0.4571, "num_input_tokens_seen": 20564000, "step": 16920 }, { "epoch": 2.1206615712316754, "grad_norm": 0.20863030850887299, "learning_rate": 9.99889359469668e-06, "loss": 0.4592, "num_input_tokens_seen": 20570304, "step": 16925 }, { "epoch": 2.1212880591404586, "grad_norm": 0.1233576312661171, "learning_rate": 9.998882064143267e-06, "loss": 0.4587, "num_input_tokens_seen": 20575936, "step": 16930 }, { "epoch": 2.1219145470492418, "grad_norm": 0.15314631164073944, "learning_rate": 9.998870473824017e-06, "loss": 0.4572, "num_input_tokens_seen": 20582240, "step": 16935 }, { "epoch": 2.1225410349580254, "grad_norm": 0.16153065860271454, "learning_rate": 9.998858823739065e-06, "loss": 0.4656, "num_input_tokens_seen": 20588768, "step": 16940 }, { "epoch": 2.1231675228668085, "grad_norm": 0.16651016473770142, "learning_rate": 9.998847113888557e-06, "loss": 0.4663, "num_input_tokens_seen": 20595008, "step": 16945 }, { "epoch": 2.123794010775592, "grad_norm": 0.16488836705684662, "learning_rate": 9.998835344272625e-06, "loss": 0.4692, "num_input_tokens_seen": 20601408, "step": 16950 }, { "epoch": 2.1244204986843753, "grad_norm": 0.12709614634513855, "learning_rate": 9.998823514891416e-06, "loss": 0.4619, "num_input_tokens_seen": 20606720, "step": 16955 }, { "epoch": 2.125046986593159, "grad_norm": 0.28598013520240784, "learning_rate": 9.998811625745068e-06, "loss": 0.4726, "num_input_tokens_seen": 20612704, "step": 16960 }, { "epoch": 2.125673474501942, "grad_norm": 0.1405428647994995, "learning_rate": 9.998799676833725e-06, "loss": 0.4692, "num_input_tokens_seen": 20618624, "step": 16965 }, { "epoch": 2.1262999624107253, "grad_norm": 0.1240641176700592, "learning_rate": 9.99878766815753e-06, "loss": 0.4602, "num_input_tokens_seen": 20624704, "step": 16970 }, { "epoch": 2.126926450319509, "grad_norm": 0.1462957262992859, "learning_rate": 9.998775599716623e-06, "loss": 0.4694, "num_input_tokens_seen": 20630976, "step": 16975 }, { "epoch": 2.127552938228292, "grad_norm": 0.1345985382795334, "learning_rate": 9.998763471511153e-06, "loss": 0.4626, "num_input_tokens_seen": 20637152, "step": 16980 }, { "epoch": 2.1281794261370757, "grad_norm": 0.041721925139427185, "learning_rate": 9.998751283541265e-06, "loss": 0.4642, "num_input_tokens_seen": 20643392, "step": 16985 }, { "epoch": 2.128805914045859, "grad_norm": 0.12788987159729004, "learning_rate": 9.9987390358071e-06, "loss": 0.46, "num_input_tokens_seen": 20649728, "step": 16990 }, { "epoch": 2.1294324019546425, "grad_norm": 0.11757326871156693, "learning_rate": 9.998726728308805e-06, "loss": 0.4641, "num_input_tokens_seen": 20655712, "step": 16995 }, { "epoch": 2.1300588898634256, "grad_norm": 0.12007109820842743, "learning_rate": 9.998714361046531e-06, "loss": 0.4668, "num_input_tokens_seen": 20661696, "step": 17000 }, { "epoch": 2.130685377772209, "grad_norm": 0.14084456861019135, "learning_rate": 9.998701934020426e-06, "loss": 0.4599, "num_input_tokens_seen": 20667872, "step": 17005 }, { "epoch": 2.1313118656809924, "grad_norm": 0.1338714212179184, "learning_rate": 9.998689447230634e-06, "loss": 0.4653, "num_input_tokens_seen": 20673792, "step": 17010 }, { "epoch": 2.1319383535897756, "grad_norm": 0.24045659601688385, "learning_rate": 9.99867690067731e-06, "loss": 0.4592, "num_input_tokens_seen": 20679936, "step": 17015 }, { "epoch": 2.132564841498559, "grad_norm": 0.05081053823232651, "learning_rate": 9.998664294360598e-06, "loss": 0.465, "num_input_tokens_seen": 20686432, "step": 17020 }, { "epoch": 2.1331913294073424, "grad_norm": 0.2655032277107239, "learning_rate": 9.998651628280653e-06, "loss": 0.4563, "num_input_tokens_seen": 20692928, "step": 17025 }, { "epoch": 2.133817817316126, "grad_norm": 0.1461685746908188, "learning_rate": 9.998638902437627e-06, "loss": 0.4528, "num_input_tokens_seen": 20698816, "step": 17030 }, { "epoch": 2.134444305224909, "grad_norm": 0.5100637674331665, "learning_rate": 9.998626116831668e-06, "loss": 0.4619, "num_input_tokens_seen": 20705024, "step": 17035 }, { "epoch": 2.1350707931336927, "grad_norm": 0.31808462738990784, "learning_rate": 9.998613271462931e-06, "loss": 0.4889, "num_input_tokens_seen": 20711008, "step": 17040 }, { "epoch": 2.135697281042476, "grad_norm": 0.11594141274690628, "learning_rate": 9.998600366331571e-06, "loss": 0.4681, "num_input_tokens_seen": 20716928, "step": 17045 }, { "epoch": 2.136323768951259, "grad_norm": 0.03317626565694809, "learning_rate": 9.99858740143774e-06, "loss": 0.466, "num_input_tokens_seen": 20723200, "step": 17050 }, { "epoch": 2.1369502568600427, "grad_norm": 0.1612161546945572, "learning_rate": 9.998574376781595e-06, "loss": 0.4617, "num_input_tokens_seen": 20729280, "step": 17055 }, { "epoch": 2.137576744768826, "grad_norm": 0.1232733502984047, "learning_rate": 9.99856129236329e-06, "loss": 0.4607, "num_input_tokens_seen": 20735200, "step": 17060 }, { "epoch": 2.1382032326776095, "grad_norm": 0.05718972906470299, "learning_rate": 9.998548148182983e-06, "loss": 0.4571, "num_input_tokens_seen": 20741248, "step": 17065 }, { "epoch": 2.1388297205863926, "grad_norm": 0.20568488538265228, "learning_rate": 9.998534944240829e-06, "loss": 0.4604, "num_input_tokens_seen": 20747488, "step": 17070 }, { "epoch": 2.139456208495176, "grad_norm": 0.216760516166687, "learning_rate": 9.998521680536988e-06, "loss": 0.4593, "num_input_tokens_seen": 20753728, "step": 17075 }, { "epoch": 2.1400826964039594, "grad_norm": 0.1290162056684494, "learning_rate": 9.998508357071619e-06, "loss": 0.4644, "num_input_tokens_seen": 20759936, "step": 17080 }, { "epoch": 2.1407091843127426, "grad_norm": 0.13816072046756744, "learning_rate": 9.998494973844878e-06, "loss": 0.4635, "num_input_tokens_seen": 20765984, "step": 17085 }, { "epoch": 2.141335672221526, "grad_norm": 0.13604120910167694, "learning_rate": 9.998481530856928e-06, "loss": 0.4684, "num_input_tokens_seen": 20771424, "step": 17090 }, { "epoch": 2.1419621601303094, "grad_norm": 0.17641673982143402, "learning_rate": 9.998468028107928e-06, "loss": 0.4573, "num_input_tokens_seen": 20777920, "step": 17095 }, { "epoch": 2.142588648039093, "grad_norm": 0.16634315252304077, "learning_rate": 9.998454465598041e-06, "loss": 0.4613, "num_input_tokens_seen": 20783648, "step": 17100 }, { "epoch": 2.143215135947876, "grad_norm": 0.2422746866941452, "learning_rate": 9.998440843327429e-06, "loss": 0.4546, "num_input_tokens_seen": 20789792, "step": 17105 }, { "epoch": 2.1438416238566598, "grad_norm": 0.17281481623649597, "learning_rate": 9.998427161296254e-06, "loss": 0.4744, "num_input_tokens_seen": 20796032, "step": 17110 }, { "epoch": 2.144468111765443, "grad_norm": 0.1365155577659607, "learning_rate": 9.99841341950468e-06, "loss": 0.4665, "num_input_tokens_seen": 20801952, "step": 17115 }, { "epoch": 2.145094599674226, "grad_norm": 0.17350277304649353, "learning_rate": 9.99839961795287e-06, "loss": 0.4551, "num_input_tokens_seen": 20807808, "step": 17120 }, { "epoch": 2.1457210875830097, "grad_norm": 0.05163774639368057, "learning_rate": 9.99838575664099e-06, "loss": 0.4693, "num_input_tokens_seen": 20814208, "step": 17125 }, { "epoch": 2.146347575491793, "grad_norm": 0.2559031844139099, "learning_rate": 9.998371835569206e-06, "loss": 0.4644, "num_input_tokens_seen": 20820224, "step": 17130 }, { "epoch": 2.1469740634005765, "grad_norm": 0.19617539644241333, "learning_rate": 9.998357854737686e-06, "loss": 0.4652, "num_input_tokens_seen": 20826400, "step": 17135 }, { "epoch": 2.1476005513093597, "grad_norm": 0.14207124710083008, "learning_rate": 9.998343814146595e-06, "loss": 0.4597, "num_input_tokens_seen": 20832832, "step": 17140 }, { "epoch": 2.1482270392181433, "grad_norm": 0.21952030062675476, "learning_rate": 9.9983297137961e-06, "loss": 0.4701, "num_input_tokens_seen": 20839104, "step": 17145 }, { "epoch": 2.1488535271269265, "grad_norm": 0.10899651050567627, "learning_rate": 9.998315553686372e-06, "loss": 0.4689, "num_input_tokens_seen": 20845344, "step": 17150 }, { "epoch": 2.1494800150357096, "grad_norm": 0.21042175590991974, "learning_rate": 9.99830133381758e-06, "loss": 0.4615, "num_input_tokens_seen": 20851712, "step": 17155 }, { "epoch": 2.1501065029444932, "grad_norm": 0.22053730487823486, "learning_rate": 9.99828705418989e-06, "loss": 0.4609, "num_input_tokens_seen": 20857824, "step": 17160 }, { "epoch": 2.1507329908532764, "grad_norm": 0.11523759365081787, "learning_rate": 9.998272714803478e-06, "loss": 0.4567, "num_input_tokens_seen": 20863744, "step": 17165 }, { "epoch": 2.15135947876206, "grad_norm": 0.20279540121555328, "learning_rate": 9.998258315658514e-06, "loss": 0.4579, "num_input_tokens_seen": 20870304, "step": 17170 }, { "epoch": 2.151985966670843, "grad_norm": 0.045974694192409515, "learning_rate": 9.99824385675517e-06, "loss": 0.4638, "num_input_tokens_seen": 20876416, "step": 17175 }, { "epoch": 2.152612454579627, "grad_norm": 0.10262840241193771, "learning_rate": 9.998229338093617e-06, "loss": 0.4623, "num_input_tokens_seen": 20882368, "step": 17180 }, { "epoch": 2.15323894248841, "grad_norm": 0.10072227567434311, "learning_rate": 9.99821475967403e-06, "loss": 0.4634, "num_input_tokens_seen": 20888384, "step": 17185 }, { "epoch": 2.153865430397193, "grad_norm": 0.1343054324388504, "learning_rate": 9.998200121496583e-06, "loss": 0.4598, "num_input_tokens_seen": 20894304, "step": 17190 }, { "epoch": 2.1544919183059767, "grad_norm": 0.19824905693531036, "learning_rate": 9.998185423561454e-06, "loss": 0.4605, "num_input_tokens_seen": 20900384, "step": 17195 }, { "epoch": 2.15511840621476, "grad_norm": 0.21584050357341766, "learning_rate": 9.998170665868812e-06, "loss": 0.4656, "num_input_tokens_seen": 20906368, "step": 17200 }, { "epoch": 2.1557448941235435, "grad_norm": 0.1461768001317978, "learning_rate": 9.99815584841884e-06, "loss": 0.4524, "num_input_tokens_seen": 20912736, "step": 17205 }, { "epoch": 2.1563713820323267, "grad_norm": 0.11530175060033798, "learning_rate": 9.998140971211712e-06, "loss": 0.4656, "num_input_tokens_seen": 20918976, "step": 17210 }, { "epoch": 2.1569978699411103, "grad_norm": 0.19920210540294647, "learning_rate": 9.998126034247607e-06, "loss": 0.4697, "num_input_tokens_seen": 20925184, "step": 17215 }, { "epoch": 2.1576243578498935, "grad_norm": 0.10857325047254562, "learning_rate": 9.9981110375267e-06, "loss": 0.4598, "num_input_tokens_seen": 20931168, "step": 17220 }, { "epoch": 2.1582508457586767, "grad_norm": 0.11168529838323593, "learning_rate": 9.998095981049177e-06, "loss": 0.4566, "num_input_tokens_seen": 20937440, "step": 17225 }, { "epoch": 2.1588773336674603, "grad_norm": 0.1333993524312973, "learning_rate": 9.998080864815213e-06, "loss": 0.4716, "num_input_tokens_seen": 20943200, "step": 17230 }, { "epoch": 2.1595038215762434, "grad_norm": 0.1037943884730339, "learning_rate": 9.99806568882499e-06, "loss": 0.4534, "num_input_tokens_seen": 20949088, "step": 17235 }, { "epoch": 2.160130309485027, "grad_norm": 0.1180235743522644, "learning_rate": 9.99805045307869e-06, "loss": 0.4632, "num_input_tokens_seen": 20955296, "step": 17240 }, { "epoch": 2.16075679739381, "grad_norm": 0.11525899916887283, "learning_rate": 9.998035157576496e-06, "loss": 0.4678, "num_input_tokens_seen": 20961504, "step": 17245 }, { "epoch": 2.161383285302594, "grad_norm": 0.18241912126541138, "learning_rate": 9.998019802318588e-06, "loss": 0.4585, "num_input_tokens_seen": 20967296, "step": 17250 }, { "epoch": 2.162009773211377, "grad_norm": 0.1378258615732193, "learning_rate": 9.998004387305152e-06, "loss": 0.4564, "num_input_tokens_seen": 20973504, "step": 17255 }, { "epoch": 2.16263626112016, "grad_norm": 0.1232762560248375, "learning_rate": 9.997988912536369e-06, "loss": 0.4579, "num_input_tokens_seen": 20979904, "step": 17260 }, { "epoch": 2.1632627490289438, "grad_norm": 0.19475072622299194, "learning_rate": 9.997973378012428e-06, "loss": 0.4612, "num_input_tokens_seen": 20986080, "step": 17265 }, { "epoch": 2.163889236937727, "grad_norm": 0.053404685109853745, "learning_rate": 9.997957783733513e-06, "loss": 0.4734, "num_input_tokens_seen": 20992320, "step": 17270 }, { "epoch": 2.1645157248465106, "grad_norm": 0.17908325791358948, "learning_rate": 9.997942129699812e-06, "loss": 0.4572, "num_input_tokens_seen": 20998400, "step": 17275 }, { "epoch": 2.1651422127552937, "grad_norm": 0.04840745031833649, "learning_rate": 9.99792641591151e-06, "loss": 0.4618, "num_input_tokens_seen": 21004000, "step": 17280 }, { "epoch": 2.1657687006640773, "grad_norm": 0.13284215331077576, "learning_rate": 9.997910642368796e-06, "loss": 0.4603, "num_input_tokens_seen": 21010464, "step": 17285 }, { "epoch": 2.1663951885728605, "grad_norm": 0.12855130434036255, "learning_rate": 9.997894809071858e-06, "loss": 0.4589, "num_input_tokens_seen": 21016736, "step": 17290 }, { "epoch": 2.167021676481644, "grad_norm": 0.1856578141450882, "learning_rate": 9.997878916020885e-06, "loss": 0.4593, "num_input_tokens_seen": 21022944, "step": 17295 }, { "epoch": 2.1676481643904273, "grad_norm": 0.05170920118689537, "learning_rate": 9.997862963216068e-06, "loss": 0.4611, "num_input_tokens_seen": 21029152, "step": 17300 }, { "epoch": 2.1682746522992105, "grad_norm": 0.110777847468853, "learning_rate": 9.997846950657596e-06, "loss": 0.4561, "num_input_tokens_seen": 21035104, "step": 17305 }, { "epoch": 2.168901140207994, "grad_norm": 0.20676131546497345, "learning_rate": 9.997830878345665e-06, "loss": 0.4625, "num_input_tokens_seen": 21041440, "step": 17310 }, { "epoch": 2.1695276281167772, "grad_norm": 0.20335890352725983, "learning_rate": 9.997814746280461e-06, "loss": 0.4589, "num_input_tokens_seen": 21047424, "step": 17315 }, { "epoch": 2.170154116025561, "grad_norm": 0.15175427496433258, "learning_rate": 9.99779855446218e-06, "loss": 0.4558, "num_input_tokens_seen": 21053600, "step": 17320 }, { "epoch": 2.170780603934344, "grad_norm": 0.15668658912181854, "learning_rate": 9.997782302891017e-06, "loss": 0.4633, "num_input_tokens_seen": 21060032, "step": 17325 }, { "epoch": 2.1714070918431276, "grad_norm": 0.07898233085870743, "learning_rate": 9.997765991567163e-06, "loss": 0.4594, "num_input_tokens_seen": 21066304, "step": 17330 }, { "epoch": 2.172033579751911, "grad_norm": 0.19643855094909668, "learning_rate": 9.997749620490816e-06, "loss": 0.4625, "num_input_tokens_seen": 21072608, "step": 17335 }, { "epoch": 2.172660067660694, "grad_norm": 0.3034975230693817, "learning_rate": 9.99773318966217e-06, "loss": 0.4554, "num_input_tokens_seen": 21078976, "step": 17340 }, { "epoch": 2.1732865555694776, "grad_norm": 0.11101550608873367, "learning_rate": 9.99771669908142e-06, "loss": 0.4576, "num_input_tokens_seen": 21085280, "step": 17345 }, { "epoch": 2.1739130434782608, "grad_norm": 0.24300988018512726, "learning_rate": 9.997700148748768e-06, "loss": 0.4615, "num_input_tokens_seen": 21091584, "step": 17350 }, { "epoch": 2.1745395313870444, "grad_norm": 0.14130809903144836, "learning_rate": 9.997683538664407e-06, "loss": 0.4617, "num_input_tokens_seen": 21097152, "step": 17355 }, { "epoch": 2.1751660192958275, "grad_norm": 0.2722122073173523, "learning_rate": 9.99766686882854e-06, "loss": 0.487, "num_input_tokens_seen": 21103264, "step": 17360 }, { "epoch": 2.175792507204611, "grad_norm": 0.11678989976644516, "learning_rate": 9.997650139241362e-06, "loss": 0.4602, "num_input_tokens_seen": 21109632, "step": 17365 }, { "epoch": 2.1764189951133943, "grad_norm": 0.1151825562119484, "learning_rate": 9.997633349903074e-06, "loss": 0.4674, "num_input_tokens_seen": 21115392, "step": 17370 }, { "epoch": 2.1770454830221775, "grad_norm": 0.1911526322364807, "learning_rate": 9.997616500813877e-06, "loss": 0.4667, "num_input_tokens_seen": 21121504, "step": 17375 }, { "epoch": 2.177671970930961, "grad_norm": 0.13805396854877472, "learning_rate": 9.997599591973974e-06, "loss": 0.4629, "num_input_tokens_seen": 21127872, "step": 17380 }, { "epoch": 2.1782984588397443, "grad_norm": 0.11220674216747284, "learning_rate": 9.997582623383567e-06, "loss": 0.4614, "num_input_tokens_seen": 21134176, "step": 17385 }, { "epoch": 2.178924946748528, "grad_norm": 0.12215554714202881, "learning_rate": 9.997565595042858e-06, "loss": 0.4613, "num_input_tokens_seen": 21140224, "step": 17390 }, { "epoch": 2.179551434657311, "grad_norm": 0.11134830117225647, "learning_rate": 9.997548506952048e-06, "loss": 0.4655, "num_input_tokens_seen": 21145952, "step": 17395 }, { "epoch": 2.1801779225660947, "grad_norm": 0.1925407201051712, "learning_rate": 9.997531359111346e-06, "loss": 0.459, "num_input_tokens_seen": 21151488, "step": 17400 }, { "epoch": 2.180804410474878, "grad_norm": 0.19332510232925415, "learning_rate": 9.997514151520956e-06, "loss": 0.4628, "num_input_tokens_seen": 21157440, "step": 17405 }, { "epoch": 2.181430898383661, "grad_norm": 0.11628251522779465, "learning_rate": 9.997496884181081e-06, "loss": 0.4601, "num_input_tokens_seen": 21163648, "step": 17410 }, { "epoch": 2.1820573862924446, "grad_norm": 0.11643176525831223, "learning_rate": 9.99747955709193e-06, "loss": 0.4674, "num_input_tokens_seen": 21169760, "step": 17415 }, { "epoch": 2.182683874201228, "grad_norm": 0.12044021487236023, "learning_rate": 9.997462170253709e-06, "loss": 0.4653, "num_input_tokens_seen": 21175424, "step": 17420 }, { "epoch": 2.1833103621100114, "grad_norm": 0.11793725192546844, "learning_rate": 9.997444723666624e-06, "loss": 0.4672, "num_input_tokens_seen": 21181376, "step": 17425 }, { "epoch": 2.1839368500187946, "grad_norm": 0.14557458460330963, "learning_rate": 9.997427217330888e-06, "loss": 0.4618, "num_input_tokens_seen": 21187680, "step": 17430 }, { "epoch": 2.184563337927578, "grad_norm": 0.10229413211345673, "learning_rate": 9.997409651246707e-06, "loss": 0.4621, "num_input_tokens_seen": 21194176, "step": 17435 }, { "epoch": 2.1851898258363613, "grad_norm": 0.1457105576992035, "learning_rate": 9.997392025414292e-06, "loss": 0.4673, "num_input_tokens_seen": 21200096, "step": 17440 }, { "epoch": 2.1858163137451445, "grad_norm": 0.34232354164123535, "learning_rate": 9.997374339833854e-06, "loss": 0.4604, "num_input_tokens_seen": 21206080, "step": 17445 }, { "epoch": 2.186442801653928, "grad_norm": 0.15987294912338257, "learning_rate": 9.997356594505605e-06, "loss": 0.4631, "num_input_tokens_seen": 21212288, "step": 17450 }, { "epoch": 2.1870692895627113, "grad_norm": 0.17178812623023987, "learning_rate": 9.997338789429756e-06, "loss": 0.4642, "num_input_tokens_seen": 21217824, "step": 17455 }, { "epoch": 2.187695777471495, "grad_norm": 0.1510254442691803, "learning_rate": 9.99732092460652e-06, "loss": 0.4647, "num_input_tokens_seen": 21223808, "step": 17460 }, { "epoch": 2.188322265380278, "grad_norm": 0.16145263612270355, "learning_rate": 9.997303000036112e-06, "loss": 0.4747, "num_input_tokens_seen": 21230144, "step": 17465 }, { "epoch": 2.1889487532890617, "grad_norm": 0.1318567544221878, "learning_rate": 9.997285015718743e-06, "loss": 0.4569, "num_input_tokens_seen": 21236160, "step": 17470 }, { "epoch": 2.189575241197845, "grad_norm": 0.1332343965768814, "learning_rate": 9.997266971654632e-06, "loss": 0.4653, "num_input_tokens_seen": 21241952, "step": 17475 }, { "epoch": 2.1902017291066285, "grad_norm": 0.1330355703830719, "learning_rate": 9.997248867843992e-06, "loss": 0.4685, "num_input_tokens_seen": 21248224, "step": 17480 }, { "epoch": 2.1908282170154116, "grad_norm": 0.13653425872325897, "learning_rate": 9.99723070428704e-06, "loss": 0.4581, "num_input_tokens_seen": 21254208, "step": 17485 }, { "epoch": 2.191454704924195, "grad_norm": 0.11337153613567352, "learning_rate": 9.997212480983992e-06, "loss": 0.4638, "num_input_tokens_seen": 21260448, "step": 17490 }, { "epoch": 2.1920811928329784, "grad_norm": 0.04548248276114464, "learning_rate": 9.99719419793507e-06, "loss": 0.4624, "num_input_tokens_seen": 21266368, "step": 17495 }, { "epoch": 2.1927076807417616, "grad_norm": 0.11477374285459518, "learning_rate": 9.997175855140489e-06, "loss": 0.463, "num_input_tokens_seen": 21272256, "step": 17500 }, { "epoch": 2.193334168650545, "grad_norm": 0.04160049557685852, "learning_rate": 9.99715745260047e-06, "loss": 0.4597, "num_input_tokens_seen": 21278688, "step": 17505 }, { "epoch": 2.1939606565593284, "grad_norm": 0.14275656640529633, "learning_rate": 9.997138990315232e-06, "loss": 0.4696, "num_input_tokens_seen": 21284320, "step": 17510 }, { "epoch": 2.1945871444681115, "grad_norm": 0.1075429916381836, "learning_rate": 9.997120468284995e-06, "loss": 0.469, "num_input_tokens_seen": 21290400, "step": 17515 }, { "epoch": 2.195213632376895, "grad_norm": 0.09965042024850845, "learning_rate": 9.997101886509984e-06, "loss": 0.4631, "num_input_tokens_seen": 21296512, "step": 17520 }, { "epoch": 2.1958401202856783, "grad_norm": 0.10806460678577423, "learning_rate": 9.997083244990415e-06, "loss": 0.4642, "num_input_tokens_seen": 21302432, "step": 17525 }, { "epoch": 2.196466608194462, "grad_norm": 0.13722267746925354, "learning_rate": 9.997064543726516e-06, "loss": 0.4589, "num_input_tokens_seen": 21308736, "step": 17530 }, { "epoch": 2.197093096103245, "grad_norm": 0.17129573225975037, "learning_rate": 9.99704578271851e-06, "loss": 0.4667, "num_input_tokens_seen": 21314912, "step": 17535 }, { "epoch": 2.1977195840120287, "grad_norm": 0.04451070725917816, "learning_rate": 9.997026961966621e-06, "loss": 0.4572, "num_input_tokens_seen": 21321152, "step": 17540 }, { "epoch": 2.198346071920812, "grad_norm": 0.11092448234558105, "learning_rate": 9.99700808147107e-06, "loss": 0.4639, "num_input_tokens_seen": 21327136, "step": 17545 }, { "epoch": 2.1989725598295955, "grad_norm": 0.16490618884563446, "learning_rate": 9.99698914123209e-06, "loss": 0.4663, "num_input_tokens_seen": 21333344, "step": 17550 }, { "epoch": 2.1995990477383787, "grad_norm": 0.20842845737934113, "learning_rate": 9.996970141249902e-06, "loss": 0.4668, "num_input_tokens_seen": 21339584, "step": 17555 }, { "epoch": 2.200225535647162, "grad_norm": 0.127633735537529, "learning_rate": 9.996951081524734e-06, "loss": 0.4627, "num_input_tokens_seen": 21345856, "step": 17560 }, { "epoch": 2.2008520235559454, "grad_norm": 0.16791541874408722, "learning_rate": 9.996931962056815e-06, "loss": 0.4656, "num_input_tokens_seen": 21352032, "step": 17565 }, { "epoch": 2.2014785114647286, "grad_norm": 0.13193769752979279, "learning_rate": 9.996912782846373e-06, "loss": 0.457, "num_input_tokens_seen": 21357888, "step": 17570 }, { "epoch": 2.2021049993735122, "grad_norm": 0.12380603700876236, "learning_rate": 9.996893543893638e-06, "loss": 0.4664, "num_input_tokens_seen": 21364064, "step": 17575 }, { "epoch": 2.2027314872822954, "grad_norm": 0.11924334615468979, "learning_rate": 9.99687424519884e-06, "loss": 0.4652, "num_input_tokens_seen": 21370304, "step": 17580 }, { "epoch": 2.203357975191079, "grad_norm": 0.13947793841362, "learning_rate": 9.996854886762208e-06, "loss": 0.4621, "num_input_tokens_seen": 21376544, "step": 17585 }, { "epoch": 2.203984463099862, "grad_norm": 0.14163276553153992, "learning_rate": 9.996835468583976e-06, "loss": 0.4637, "num_input_tokens_seen": 21382816, "step": 17590 }, { "epoch": 2.2046109510086453, "grad_norm": 0.18097150325775146, "learning_rate": 9.996815990664375e-06, "loss": 0.4573, "num_input_tokens_seen": 21389184, "step": 17595 }, { "epoch": 2.205237438917429, "grad_norm": 0.18988564610481262, "learning_rate": 9.996796453003637e-06, "loss": 0.4638, "num_input_tokens_seen": 21394976, "step": 17600 }, { "epoch": 2.205863926826212, "grad_norm": 0.13832393288612366, "learning_rate": 9.996776855601995e-06, "loss": 0.4571, "num_input_tokens_seen": 21401056, "step": 17605 }, { "epoch": 2.2064904147349957, "grad_norm": 0.17471401393413544, "learning_rate": 9.996757198459687e-06, "loss": 0.4601, "num_input_tokens_seen": 21407296, "step": 17610 }, { "epoch": 2.207116902643779, "grad_norm": 0.13858482241630554, "learning_rate": 9.996737481576943e-06, "loss": 0.4583, "num_input_tokens_seen": 21413216, "step": 17615 }, { "epoch": 2.2077433905525625, "grad_norm": 0.13189277052879333, "learning_rate": 9.996717704954004e-06, "loss": 0.465, "num_input_tokens_seen": 21418752, "step": 17620 }, { "epoch": 2.2083698784613457, "grad_norm": 0.20566892623901367, "learning_rate": 9.996697868591103e-06, "loss": 0.4539, "num_input_tokens_seen": 21424576, "step": 17625 }, { "epoch": 2.208996366370129, "grad_norm": 0.15248306095600128, "learning_rate": 9.996677972488478e-06, "loss": 0.4645, "num_input_tokens_seen": 21431168, "step": 17630 }, { "epoch": 2.2096228542789125, "grad_norm": 0.06986342370510101, "learning_rate": 9.996658016646366e-06, "loss": 0.4695, "num_input_tokens_seen": 21437216, "step": 17635 }, { "epoch": 2.2102493421876956, "grad_norm": 0.07697618752717972, "learning_rate": 9.996638001065007e-06, "loss": 0.4605, "num_input_tokens_seen": 21443424, "step": 17640 }, { "epoch": 2.2108758300964793, "grad_norm": 0.14432834088802338, "learning_rate": 9.996617925744641e-06, "loss": 0.4619, "num_input_tokens_seen": 21449696, "step": 17645 }, { "epoch": 2.2115023180052624, "grad_norm": 0.19977639615535736, "learning_rate": 9.996597790685505e-06, "loss": 0.4559, "num_input_tokens_seen": 21456064, "step": 17650 }, { "epoch": 2.212128805914046, "grad_norm": 0.2773340940475464, "learning_rate": 9.996577595887842e-06, "loss": 0.465, "num_input_tokens_seen": 21462464, "step": 17655 }, { "epoch": 2.212755293822829, "grad_norm": 0.3040256202220917, "learning_rate": 9.996557341351893e-06, "loss": 0.4624, "num_input_tokens_seen": 21468416, "step": 17660 }, { "epoch": 2.2133817817316124, "grad_norm": 0.2278309464454651, "learning_rate": 9.996537027077901e-06, "loss": 0.4499, "num_input_tokens_seen": 21474624, "step": 17665 }, { "epoch": 2.214008269640396, "grad_norm": 0.21623390913009644, "learning_rate": 9.996516653066106e-06, "loss": 0.4628, "num_input_tokens_seen": 21480800, "step": 17670 }, { "epoch": 2.214634757549179, "grad_norm": 0.19528831541538239, "learning_rate": 9.996496219316756e-06, "loss": 0.4474, "num_input_tokens_seen": 21486880, "step": 17675 }, { "epoch": 2.2152612454579628, "grad_norm": 0.18334755301475525, "learning_rate": 9.996475725830091e-06, "loss": 0.4696, "num_input_tokens_seen": 21493568, "step": 17680 }, { "epoch": 2.215887733366746, "grad_norm": 0.26016736030578613, "learning_rate": 9.996455172606358e-06, "loss": 0.4822, "num_input_tokens_seen": 21499552, "step": 17685 }, { "epoch": 2.2165142212755296, "grad_norm": 0.149249866604805, "learning_rate": 9.996434559645804e-06, "loss": 0.4571, "num_input_tokens_seen": 21505408, "step": 17690 }, { "epoch": 2.2171407091843127, "grad_norm": 0.1804840862751007, "learning_rate": 9.996413886948672e-06, "loss": 0.4578, "num_input_tokens_seen": 21511264, "step": 17695 }, { "epoch": 2.217767197093096, "grad_norm": 0.13427849113941193, "learning_rate": 9.996393154515214e-06, "loss": 0.459, "num_input_tokens_seen": 21516736, "step": 17700 }, { "epoch": 2.2183936850018795, "grad_norm": 0.08368620276451111, "learning_rate": 9.996372362345673e-06, "loss": 0.4576, "num_input_tokens_seen": 21522816, "step": 17705 }, { "epoch": 2.2190201729106627, "grad_norm": 0.31289345026016235, "learning_rate": 9.996351510440301e-06, "loss": 0.4826, "num_input_tokens_seen": 21529024, "step": 17710 }, { "epoch": 2.2196466608194463, "grad_norm": 0.18687595427036285, "learning_rate": 9.996330598799344e-06, "loss": 0.4665, "num_input_tokens_seen": 21535488, "step": 17715 }, { "epoch": 2.2202731487282295, "grad_norm": 0.13856452703475952, "learning_rate": 9.996309627423056e-06, "loss": 0.4522, "num_input_tokens_seen": 21541472, "step": 17720 }, { "epoch": 2.220899636637013, "grad_norm": 0.13405752182006836, "learning_rate": 9.996288596311684e-06, "loss": 0.4602, "num_input_tokens_seen": 21547648, "step": 17725 }, { "epoch": 2.2215261245457962, "grad_norm": 0.14262036979198456, "learning_rate": 9.996267505465484e-06, "loss": 0.467, "num_input_tokens_seen": 21553664, "step": 17730 }, { "epoch": 2.22215261245458, "grad_norm": 0.17697946727275848, "learning_rate": 9.996246354884703e-06, "loss": 0.4664, "num_input_tokens_seen": 21559104, "step": 17735 }, { "epoch": 2.222779100363363, "grad_norm": 0.1466817706823349, "learning_rate": 9.996225144569597e-06, "loss": 0.4614, "num_input_tokens_seen": 21565024, "step": 17740 }, { "epoch": 2.223405588272146, "grad_norm": 0.2471211552619934, "learning_rate": 9.996203874520417e-06, "loss": 0.4696, "num_input_tokens_seen": 21570976, "step": 17745 }, { "epoch": 2.22403207618093, "grad_norm": 0.13731901347637177, "learning_rate": 9.996182544737422e-06, "loss": 0.4618, "num_input_tokens_seen": 21577312, "step": 17750 }, { "epoch": 2.224658564089713, "grad_norm": 0.13788262009620667, "learning_rate": 9.996161155220862e-06, "loss": 0.4597, "num_input_tokens_seen": 21583520, "step": 17755 }, { "epoch": 2.2252850519984966, "grad_norm": 0.1572444885969162, "learning_rate": 9.996139705970996e-06, "loss": 0.4655, "num_input_tokens_seen": 21589568, "step": 17760 }, { "epoch": 2.2259115399072797, "grad_norm": 0.11784375458955765, "learning_rate": 9.996118196988078e-06, "loss": 0.4634, "num_input_tokens_seen": 21595360, "step": 17765 }, { "epoch": 2.226538027816063, "grad_norm": 0.17230072617530823, "learning_rate": 9.996096628272368e-06, "loss": 0.4622, "num_input_tokens_seen": 21601728, "step": 17770 }, { "epoch": 2.2271645157248465, "grad_norm": 0.14237968623638153, "learning_rate": 9.996074999824121e-06, "loss": 0.458, "num_input_tokens_seen": 21607968, "step": 17775 }, { "epoch": 2.2277910036336297, "grad_norm": 0.18856677412986755, "learning_rate": 9.996053311643598e-06, "loss": 0.4609, "num_input_tokens_seen": 21614144, "step": 17780 }, { "epoch": 2.2284174915424133, "grad_norm": 0.24075058102607727, "learning_rate": 9.996031563731057e-06, "loss": 0.4627, "num_input_tokens_seen": 21619840, "step": 17785 }, { "epoch": 2.2290439794511965, "grad_norm": 0.19610901176929474, "learning_rate": 9.996009756086759e-06, "loss": 0.4636, "num_input_tokens_seen": 21626240, "step": 17790 }, { "epoch": 2.22967046735998, "grad_norm": 0.0562535785138607, "learning_rate": 9.995987888710963e-06, "loss": 0.469, "num_input_tokens_seen": 21632864, "step": 17795 }, { "epoch": 2.2302969552687633, "grad_norm": 0.11380761116743088, "learning_rate": 9.995965961603931e-06, "loss": 0.4564, "num_input_tokens_seen": 21639008, "step": 17800 }, { "epoch": 2.230923443177547, "grad_norm": 0.20098960399627686, "learning_rate": 9.995943974765925e-06, "loss": 0.4455, "num_input_tokens_seen": 21645056, "step": 17805 }, { "epoch": 2.23154993108633, "grad_norm": 0.39726781845092773, "learning_rate": 9.995921928197209e-06, "loss": 0.4815, "num_input_tokens_seen": 21651168, "step": 17810 }, { "epoch": 2.232176418995113, "grad_norm": 0.18669043481349945, "learning_rate": 9.995899821898047e-06, "loss": 0.4713, "num_input_tokens_seen": 21657280, "step": 17815 }, { "epoch": 2.232802906903897, "grad_norm": 0.048139482736587524, "learning_rate": 9.995877655868703e-06, "loss": 0.4674, "num_input_tokens_seen": 21662816, "step": 17820 }, { "epoch": 2.23342939481268, "grad_norm": 0.16980955004692078, "learning_rate": 9.99585543010944e-06, "loss": 0.4642, "num_input_tokens_seen": 21669216, "step": 17825 }, { "epoch": 2.2340558827214636, "grad_norm": 0.1318349391222, "learning_rate": 9.995833144620524e-06, "loss": 0.462, "num_input_tokens_seen": 21674976, "step": 17830 }, { "epoch": 2.2346823706302468, "grad_norm": 0.25555944442749023, "learning_rate": 9.995810799402224e-06, "loss": 0.4614, "num_input_tokens_seen": 21681120, "step": 17835 }, { "epoch": 2.2353088585390304, "grad_norm": 0.16614724695682526, "learning_rate": 9.995788394454804e-06, "loss": 0.454, "num_input_tokens_seen": 21686784, "step": 17840 }, { "epoch": 2.2359353464478136, "grad_norm": 0.2454485446214676, "learning_rate": 9.995765929778536e-06, "loss": 0.4682, "num_input_tokens_seen": 21692832, "step": 17845 }, { "epoch": 2.2365618343565967, "grad_norm": 0.2376011610031128, "learning_rate": 9.995743405373683e-06, "loss": 0.4667, "num_input_tokens_seen": 21698912, "step": 17850 }, { "epoch": 2.2371883222653803, "grad_norm": 0.14956659078598022, "learning_rate": 9.995720821240519e-06, "loss": 0.462, "num_input_tokens_seen": 21705184, "step": 17855 }, { "epoch": 2.2378148101741635, "grad_norm": 0.12509958446025848, "learning_rate": 9.995698177379314e-06, "loss": 0.4628, "num_input_tokens_seen": 21711232, "step": 17860 }, { "epoch": 2.238441298082947, "grad_norm": 0.11840270459651947, "learning_rate": 9.995675473790335e-06, "loss": 0.4744, "num_input_tokens_seen": 21716928, "step": 17865 }, { "epoch": 2.2390677859917303, "grad_norm": 0.13102637231349945, "learning_rate": 9.995652710473857e-06, "loss": 0.4618, "num_input_tokens_seen": 21723296, "step": 17870 }, { "epoch": 2.239694273900514, "grad_norm": 0.11383817344903946, "learning_rate": 9.99562988743015e-06, "loss": 0.4674, "num_input_tokens_seen": 21729312, "step": 17875 }, { "epoch": 2.240320761809297, "grad_norm": 0.18459837138652802, "learning_rate": 9.995607004659488e-06, "loss": 0.4553, "num_input_tokens_seen": 21735520, "step": 17880 }, { "epoch": 2.2409472497180802, "grad_norm": 0.18004536628723145, "learning_rate": 9.995584062162143e-06, "loss": 0.4599, "num_input_tokens_seen": 21741664, "step": 17885 }, { "epoch": 2.241573737626864, "grad_norm": 0.15414127707481384, "learning_rate": 9.995561059938392e-06, "loss": 0.4646, "num_input_tokens_seen": 21748160, "step": 17890 }, { "epoch": 2.242200225535647, "grad_norm": 0.15272800624370575, "learning_rate": 9.995537997988507e-06, "loss": 0.4641, "num_input_tokens_seen": 21754368, "step": 17895 }, { "epoch": 2.2428267134444306, "grad_norm": 0.188457652926445, "learning_rate": 9.995514876312767e-06, "loss": 0.4605, "num_input_tokens_seen": 21760480, "step": 17900 }, { "epoch": 2.243453201353214, "grad_norm": 0.17715229094028473, "learning_rate": 9.995491694911446e-06, "loss": 0.4673, "num_input_tokens_seen": 21766656, "step": 17905 }, { "epoch": 2.2440796892619974, "grad_norm": 0.11337131261825562, "learning_rate": 9.99546845378482e-06, "loss": 0.4568, "num_input_tokens_seen": 21772576, "step": 17910 }, { "epoch": 2.2447061771707806, "grad_norm": 0.03997572138905525, "learning_rate": 9.995445152933171e-06, "loss": 0.4625, "num_input_tokens_seen": 21778656, "step": 17915 }, { "epoch": 2.245332665079564, "grad_norm": 0.17010214924812317, "learning_rate": 9.995421792356774e-06, "loss": 0.4641, "num_input_tokens_seen": 21784992, "step": 17920 }, { "epoch": 2.2459591529883474, "grad_norm": 0.12308476120233536, "learning_rate": 9.99539837205591e-06, "loss": 0.465, "num_input_tokens_seen": 21790944, "step": 17925 }, { "epoch": 2.2465856408971305, "grad_norm": 0.15717680752277374, "learning_rate": 9.995374892030859e-06, "loss": 0.4599, "num_input_tokens_seen": 21797088, "step": 17930 }, { "epoch": 2.247212128805914, "grad_norm": 0.172735333442688, "learning_rate": 9.9953513522819e-06, "loss": 0.4657, "num_input_tokens_seen": 21802944, "step": 17935 }, { "epoch": 2.2478386167146973, "grad_norm": 0.11480175703763962, "learning_rate": 9.995327752809317e-06, "loss": 0.4612, "num_input_tokens_seen": 21808896, "step": 17940 }, { "epoch": 2.248465104623481, "grad_norm": 0.1034465879201889, "learning_rate": 9.99530409361339e-06, "loss": 0.4584, "num_input_tokens_seen": 21814880, "step": 17945 }, { "epoch": 2.249091592532264, "grad_norm": 0.16624945402145386, "learning_rate": 9.995280374694404e-06, "loss": 0.4688, "num_input_tokens_seen": 21820768, "step": 17950 }, { "epoch": 2.2497180804410473, "grad_norm": 0.12723957002162933, "learning_rate": 9.99525659605264e-06, "loss": 0.4631, "num_input_tokens_seen": 21827072, "step": 17955 }, { "epoch": 2.250344568349831, "grad_norm": 0.14982959628105164, "learning_rate": 9.995232757688384e-06, "loss": 0.4632, "num_input_tokens_seen": 21832832, "step": 17960 }, { "epoch": 2.250971056258614, "grad_norm": 0.12234707176685333, "learning_rate": 9.99520885960192e-06, "loss": 0.4684, "num_input_tokens_seen": 21838784, "step": 17965 }, { "epoch": 2.2515975441673977, "grad_norm": 0.2028123438358307, "learning_rate": 9.995184901793536e-06, "loss": 0.4599, "num_input_tokens_seen": 21844736, "step": 17970 }, { "epoch": 2.252224032076181, "grad_norm": 0.17259076237678528, "learning_rate": 9.995160884263516e-06, "loss": 0.4604, "num_input_tokens_seen": 21850272, "step": 17975 }, { "epoch": 2.2528505199849644, "grad_norm": 0.19872580468654633, "learning_rate": 9.995136807012149e-06, "loss": 0.4629, "num_input_tokens_seen": 21856256, "step": 17980 }, { "epoch": 2.2534770078937476, "grad_norm": 0.10975752770900726, "learning_rate": 9.995112670039719e-06, "loss": 0.4676, "num_input_tokens_seen": 21862560, "step": 17985 }, { "epoch": 2.254103495802531, "grad_norm": 0.1121976375579834, "learning_rate": 9.99508847334652e-06, "loss": 0.4613, "num_input_tokens_seen": 21868384, "step": 17990 }, { "epoch": 2.2547299837113144, "grad_norm": 0.10505691915750504, "learning_rate": 9.995064216932837e-06, "loss": 0.4665, "num_input_tokens_seen": 21874400, "step": 17995 }, { "epoch": 2.2553564716200976, "grad_norm": 0.12792205810546875, "learning_rate": 9.995039900798962e-06, "loss": 0.4634, "num_input_tokens_seen": 21880512, "step": 18000 }, { "epoch": 2.255982959528881, "grad_norm": 0.19708672165870667, "learning_rate": 9.995015524945187e-06, "loss": 0.461, "num_input_tokens_seen": 21886336, "step": 18005 }, { "epoch": 2.2566094474376643, "grad_norm": 0.11104511469602585, "learning_rate": 9.9949910893718e-06, "loss": 0.4605, "num_input_tokens_seen": 21892512, "step": 18010 }, { "epoch": 2.257235935346448, "grad_norm": 0.12695026397705078, "learning_rate": 9.994966594079094e-06, "loss": 0.4642, "num_input_tokens_seen": 21898464, "step": 18015 }, { "epoch": 2.257862423255231, "grad_norm": 0.2038278728723526, "learning_rate": 9.994942039067365e-06, "loss": 0.4649, "num_input_tokens_seen": 21904576, "step": 18020 }, { "epoch": 2.2584889111640143, "grad_norm": 0.13160382211208344, "learning_rate": 9.994917424336903e-06, "loss": 0.4667, "num_input_tokens_seen": 21910560, "step": 18025 }, { "epoch": 2.259115399072798, "grad_norm": 0.1817283183336258, "learning_rate": 9.994892749888005e-06, "loss": 0.4669, "num_input_tokens_seen": 21916224, "step": 18030 }, { "epoch": 2.259741886981581, "grad_norm": 0.10341234505176544, "learning_rate": 9.994868015720964e-06, "loss": 0.4622, "num_input_tokens_seen": 21922336, "step": 18035 }, { "epoch": 2.2603683748903647, "grad_norm": 0.10812335461378098, "learning_rate": 9.994843221836076e-06, "loss": 0.4652, "num_input_tokens_seen": 21928384, "step": 18040 }, { "epoch": 2.260994862799148, "grad_norm": 0.10392769426107407, "learning_rate": 9.994818368233639e-06, "loss": 0.4683, "num_input_tokens_seen": 21934688, "step": 18045 }, { "epoch": 2.2616213507079315, "grad_norm": 0.1407812386751175, "learning_rate": 9.994793454913947e-06, "loss": 0.4646, "num_input_tokens_seen": 21940832, "step": 18050 }, { "epoch": 2.2622478386167146, "grad_norm": 0.11671409010887146, "learning_rate": 9.994768481877302e-06, "loss": 0.4635, "num_input_tokens_seen": 21947008, "step": 18055 }, { "epoch": 2.2628743265254982, "grad_norm": 0.10076100379228592, "learning_rate": 9.994743449124001e-06, "loss": 0.4627, "num_input_tokens_seen": 21953216, "step": 18060 }, { "epoch": 2.2635008144342814, "grad_norm": 0.04060709848999977, "learning_rate": 9.994718356654341e-06, "loss": 0.4609, "num_input_tokens_seen": 21959264, "step": 18065 }, { "epoch": 2.2641273023430646, "grad_norm": 0.1342824548482895, "learning_rate": 9.994693204468622e-06, "loss": 0.4578, "num_input_tokens_seen": 21965536, "step": 18070 }, { "epoch": 2.264753790251848, "grad_norm": 0.2668721675872803, "learning_rate": 9.99466799256715e-06, "loss": 0.4699, "num_input_tokens_seen": 21971616, "step": 18075 }, { "epoch": 2.2653802781606314, "grad_norm": 0.13908524811267853, "learning_rate": 9.99464272095022e-06, "loss": 0.4553, "num_input_tokens_seen": 21977472, "step": 18080 }, { "epoch": 2.266006766069415, "grad_norm": 0.14333713054656982, "learning_rate": 9.99461738961814e-06, "loss": 0.464, "num_input_tokens_seen": 21983296, "step": 18085 }, { "epoch": 2.266633253978198, "grad_norm": 0.12360288202762604, "learning_rate": 9.994591998571208e-06, "loss": 0.4653, "num_input_tokens_seen": 21989440, "step": 18090 }, { "epoch": 2.2672597418869818, "grad_norm": 0.12370221316814423, "learning_rate": 9.99456654780973e-06, "loss": 0.4695, "num_input_tokens_seen": 21995424, "step": 18095 }, { "epoch": 2.267886229795765, "grad_norm": 0.04384174197912216, "learning_rate": 9.994541037334008e-06, "loss": 0.4647, "num_input_tokens_seen": 22001312, "step": 18100 }, { "epoch": 2.2685127177045485, "grad_norm": 0.044538866728544235, "learning_rate": 9.99451546714435e-06, "loss": 0.4703, "num_input_tokens_seen": 22007744, "step": 18105 }, { "epoch": 2.2691392056133317, "grad_norm": 0.11451216042041779, "learning_rate": 9.994489837241062e-06, "loss": 0.466, "num_input_tokens_seen": 22013536, "step": 18110 }, { "epoch": 2.269765693522115, "grad_norm": 0.17953597009181976, "learning_rate": 9.994464147624447e-06, "loss": 0.4659, "num_input_tokens_seen": 22019456, "step": 18115 }, { "epoch": 2.2703921814308985, "grad_norm": 0.0972440242767334, "learning_rate": 9.994438398294815e-06, "loss": 0.4595, "num_input_tokens_seen": 22025760, "step": 18120 }, { "epoch": 2.2710186693396817, "grad_norm": 0.16529981791973114, "learning_rate": 9.994412589252471e-06, "loss": 0.4638, "num_input_tokens_seen": 22032096, "step": 18125 }, { "epoch": 2.2716451572484653, "grad_norm": 0.11587367206811905, "learning_rate": 9.994386720497726e-06, "loss": 0.4612, "num_input_tokens_seen": 22038336, "step": 18130 }, { "epoch": 2.2722716451572484, "grad_norm": 0.09635777026414871, "learning_rate": 9.994360792030891e-06, "loss": 0.4646, "num_input_tokens_seen": 22044416, "step": 18135 }, { "epoch": 2.2728981330660316, "grad_norm": 0.17668557167053223, "learning_rate": 9.994334803852271e-06, "loss": 0.4635, "num_input_tokens_seen": 22050720, "step": 18140 }, { "epoch": 2.2735246209748152, "grad_norm": 0.16729585826396942, "learning_rate": 9.994308755962181e-06, "loss": 0.4619, "num_input_tokens_seen": 22056928, "step": 18145 }, { "epoch": 2.2741511088835984, "grad_norm": 0.10243887454271317, "learning_rate": 9.99428264836093e-06, "loss": 0.4599, "num_input_tokens_seen": 22062720, "step": 18150 }, { "epoch": 2.274777596792382, "grad_norm": 0.09844999015331268, "learning_rate": 9.994256481048832e-06, "loss": 0.4556, "num_input_tokens_seen": 22068928, "step": 18155 }, { "epoch": 2.275404084701165, "grad_norm": 0.164193257689476, "learning_rate": 9.994230254026198e-06, "loss": 0.4602, "num_input_tokens_seen": 22074976, "step": 18160 }, { "epoch": 2.276030572609949, "grad_norm": 0.12817323207855225, "learning_rate": 9.994203967293342e-06, "loss": 0.4602, "num_input_tokens_seen": 22080928, "step": 18165 }, { "epoch": 2.276657060518732, "grad_norm": 0.14360296726226807, "learning_rate": 9.99417762085058e-06, "loss": 0.4593, "num_input_tokens_seen": 22087104, "step": 18170 }, { "epoch": 2.2772835484275156, "grad_norm": 0.17871445417404175, "learning_rate": 9.994151214698225e-06, "loss": 0.4719, "num_input_tokens_seen": 22093536, "step": 18175 }, { "epoch": 2.2779100363362987, "grad_norm": 0.15911443531513214, "learning_rate": 9.994124748836593e-06, "loss": 0.4676, "num_input_tokens_seen": 22099648, "step": 18180 }, { "epoch": 2.278536524245082, "grad_norm": 0.15385538339614868, "learning_rate": 9.994098223266e-06, "loss": 0.456, "num_input_tokens_seen": 22105984, "step": 18185 }, { "epoch": 2.2791630121538655, "grad_norm": 0.11948059499263763, "learning_rate": 9.994071637986765e-06, "loss": 0.4603, "num_input_tokens_seen": 22111936, "step": 18190 }, { "epoch": 2.2797895000626487, "grad_norm": 0.1613464504480362, "learning_rate": 9.994044992999206e-06, "loss": 0.4712, "num_input_tokens_seen": 22117984, "step": 18195 }, { "epoch": 2.2804159879714323, "grad_norm": 0.14355769753456116, "learning_rate": 9.994018288303637e-06, "loss": 0.4513, "num_input_tokens_seen": 22124288, "step": 18200 }, { "epoch": 2.2810424758802155, "grad_norm": 0.28801780939102173, "learning_rate": 9.993991523900384e-06, "loss": 0.461, "num_input_tokens_seen": 22130336, "step": 18205 }, { "epoch": 2.2816689637889986, "grad_norm": 0.09863272309303284, "learning_rate": 9.993964699789762e-06, "loss": 0.4608, "num_input_tokens_seen": 22136544, "step": 18210 }, { "epoch": 2.2822954516977823, "grad_norm": 0.13713428378105164, "learning_rate": 9.993937815972094e-06, "loss": 0.4608, "num_input_tokens_seen": 22142272, "step": 18215 }, { "epoch": 2.2829219396065654, "grad_norm": 0.39291617274284363, "learning_rate": 9.993910872447702e-06, "loss": 0.4645, "num_input_tokens_seen": 22148448, "step": 18220 }, { "epoch": 2.283548427515349, "grad_norm": 0.15632082521915436, "learning_rate": 9.993883869216904e-06, "loss": 0.4703, "num_input_tokens_seen": 22154432, "step": 18225 }, { "epoch": 2.284174915424132, "grad_norm": 0.11190215498209, "learning_rate": 9.993856806280026e-06, "loss": 0.4707, "num_input_tokens_seen": 22160544, "step": 18230 }, { "epoch": 2.284801403332916, "grad_norm": 0.16066007316112518, "learning_rate": 9.993829683637393e-06, "loss": 0.4584, "num_input_tokens_seen": 22166208, "step": 18235 }, { "epoch": 2.285427891241699, "grad_norm": 0.11198785901069641, "learning_rate": 9.993802501289327e-06, "loss": 0.4604, "num_input_tokens_seen": 22172384, "step": 18240 }, { "epoch": 2.2860543791504826, "grad_norm": 0.11059258133172989, "learning_rate": 9.993775259236154e-06, "loss": 0.4696, "num_input_tokens_seen": 22178592, "step": 18245 }, { "epoch": 2.2866808670592658, "grad_norm": 0.11280962824821472, "learning_rate": 9.993747957478197e-06, "loss": 0.4574, "num_input_tokens_seen": 22184896, "step": 18250 }, { "epoch": 2.287307354968049, "grad_norm": 0.1111413836479187, "learning_rate": 9.993720596015787e-06, "loss": 0.4627, "num_input_tokens_seen": 22191296, "step": 18255 }, { "epoch": 2.2879338428768325, "grad_norm": 0.11024697870016098, "learning_rate": 9.993693174849249e-06, "loss": 0.4627, "num_input_tokens_seen": 22197504, "step": 18260 }, { "epoch": 2.2885603307856157, "grad_norm": 0.11751354485750198, "learning_rate": 9.99366569397891e-06, "loss": 0.4636, "num_input_tokens_seen": 22203200, "step": 18265 }, { "epoch": 2.2891868186943993, "grad_norm": 0.10935568064451218, "learning_rate": 9.993638153405098e-06, "loss": 0.4683, "num_input_tokens_seen": 22209280, "step": 18270 }, { "epoch": 2.2898133066031825, "grad_norm": 0.16980214416980743, "learning_rate": 9.993610553128143e-06, "loss": 0.4609, "num_input_tokens_seen": 22215424, "step": 18275 }, { "epoch": 2.290439794511966, "grad_norm": 0.11765986680984497, "learning_rate": 9.993582893148376e-06, "loss": 0.4651, "num_input_tokens_seen": 22221824, "step": 18280 }, { "epoch": 2.2910662824207493, "grad_norm": 0.11447696387767792, "learning_rate": 9.99355517346613e-06, "loss": 0.4652, "num_input_tokens_seen": 22227808, "step": 18285 }, { "epoch": 2.2916927703295324, "grad_norm": 0.11339415609836578, "learning_rate": 9.99352739408173e-06, "loss": 0.4635, "num_input_tokens_seen": 22233600, "step": 18290 }, { "epoch": 2.292319258238316, "grad_norm": 0.12189047038555145, "learning_rate": 9.993499554995514e-06, "loss": 0.4609, "num_input_tokens_seen": 22239872, "step": 18295 }, { "epoch": 2.2929457461470992, "grad_norm": 0.1155950129032135, "learning_rate": 9.993471656207811e-06, "loss": 0.4626, "num_input_tokens_seen": 22246112, "step": 18300 }, { "epoch": 2.293572234055883, "grad_norm": 0.16097261011600494, "learning_rate": 9.993443697718958e-06, "loss": 0.4626, "num_input_tokens_seen": 22252384, "step": 18305 }, { "epoch": 2.294198721964666, "grad_norm": 0.128106027841568, "learning_rate": 9.993415679529286e-06, "loss": 0.4652, "num_input_tokens_seen": 22258432, "step": 18310 }, { "epoch": 2.2948252098734496, "grad_norm": 0.2002551108598709, "learning_rate": 9.993387601639131e-06, "loss": 0.4612, "num_input_tokens_seen": 22264864, "step": 18315 }, { "epoch": 2.295451697782233, "grad_norm": 0.20249439775943756, "learning_rate": 9.993359464048829e-06, "loss": 0.4663, "num_input_tokens_seen": 22270496, "step": 18320 }, { "epoch": 2.296078185691016, "grad_norm": 0.09622316807508469, "learning_rate": 9.993331266758716e-06, "loss": 0.4653, "num_input_tokens_seen": 22276608, "step": 18325 }, { "epoch": 2.2967046735997996, "grad_norm": 0.10245531052350998, "learning_rate": 9.99330300976913e-06, "loss": 0.4605, "num_input_tokens_seen": 22282528, "step": 18330 }, { "epoch": 2.2973311615085827, "grad_norm": 0.10164721310138702, "learning_rate": 9.99327469308041e-06, "loss": 0.4605, "num_input_tokens_seen": 22288672, "step": 18335 }, { "epoch": 2.2979576494173664, "grad_norm": 0.15202058851718903, "learning_rate": 9.993246316692892e-06, "loss": 0.4605, "num_input_tokens_seen": 22294848, "step": 18340 }, { "epoch": 2.2985841373261495, "grad_norm": 0.09814688563346863, "learning_rate": 9.993217880606915e-06, "loss": 0.4609, "num_input_tokens_seen": 22300896, "step": 18345 }, { "epoch": 2.299210625234933, "grad_norm": 0.0932154431939125, "learning_rate": 9.99318938482282e-06, "loss": 0.4647, "num_input_tokens_seen": 22307200, "step": 18350 }, { "epoch": 2.2998371131437163, "grad_norm": 0.12088900059461594, "learning_rate": 9.993160829340952e-06, "loss": 0.4646, "num_input_tokens_seen": 22313472, "step": 18355 }, { "epoch": 2.3004636010525, "grad_norm": 0.27685487270355225, "learning_rate": 9.993132214161644e-06, "loss": 0.469, "num_input_tokens_seen": 22319712, "step": 18360 }, { "epoch": 2.301090088961283, "grad_norm": 0.15599291026592255, "learning_rate": 9.993103539285243e-06, "loss": 0.4621, "num_input_tokens_seen": 22325760, "step": 18365 }, { "epoch": 2.3017165768700663, "grad_norm": 0.09522390365600586, "learning_rate": 9.993074804712091e-06, "loss": 0.4613, "num_input_tokens_seen": 22331840, "step": 18370 }, { "epoch": 2.30234306477885, "grad_norm": 0.14521194994449615, "learning_rate": 9.993046010442534e-06, "loss": 0.4638, "num_input_tokens_seen": 22338016, "step": 18375 }, { "epoch": 2.302969552687633, "grad_norm": 0.10445456951856613, "learning_rate": 9.993017156476912e-06, "loss": 0.4622, "num_input_tokens_seen": 22344192, "step": 18380 }, { "epoch": 2.3035960405964166, "grad_norm": 0.11697284132242203, "learning_rate": 9.992988242815572e-06, "loss": 0.4662, "num_input_tokens_seen": 22350336, "step": 18385 }, { "epoch": 2.3042225285052, "grad_norm": 0.1450105905532837, "learning_rate": 9.99295926945886e-06, "loss": 0.46, "num_input_tokens_seen": 22356384, "step": 18390 }, { "epoch": 2.304849016413983, "grad_norm": 0.09710405766963959, "learning_rate": 9.992930236407123e-06, "loss": 0.462, "num_input_tokens_seen": 22362592, "step": 18395 }, { "epoch": 2.3054755043227666, "grad_norm": 0.117705799639225, "learning_rate": 9.992901143660708e-06, "loss": 0.4611, "num_input_tokens_seen": 22369152, "step": 18400 }, { "epoch": 2.3061019922315498, "grad_norm": 0.28453126549720764, "learning_rate": 9.992871991219959e-06, "loss": 0.4615, "num_input_tokens_seen": 22375360, "step": 18405 }, { "epoch": 2.3067284801403334, "grad_norm": 0.1427011936903, "learning_rate": 9.992842779085228e-06, "loss": 0.4607, "num_input_tokens_seen": 22381568, "step": 18410 }, { "epoch": 2.3073549680491166, "grad_norm": 0.0541977696120739, "learning_rate": 9.992813507256867e-06, "loss": 0.4664, "num_input_tokens_seen": 22387808, "step": 18415 }, { "epoch": 2.3079814559579, "grad_norm": 0.11137236654758453, "learning_rate": 9.99278417573522e-06, "loss": 0.4656, "num_input_tokens_seen": 22394240, "step": 18420 }, { "epoch": 2.3086079438666833, "grad_norm": 0.18732593953609467, "learning_rate": 9.992754784520641e-06, "loss": 0.4614, "num_input_tokens_seen": 22400000, "step": 18425 }, { "epoch": 2.309234431775467, "grad_norm": 0.20209363102912903, "learning_rate": 9.992725333613481e-06, "loss": 0.4615, "num_input_tokens_seen": 22406016, "step": 18430 }, { "epoch": 2.30986091968425, "grad_norm": 0.21577031910419464, "learning_rate": 9.992695823014091e-06, "loss": 0.4583, "num_input_tokens_seen": 22412320, "step": 18435 }, { "epoch": 2.3104874075930333, "grad_norm": 0.19664132595062256, "learning_rate": 9.992666252722827e-06, "loss": 0.4674, "num_input_tokens_seen": 22418400, "step": 18440 }, { "epoch": 2.311113895501817, "grad_norm": 0.14042414724826813, "learning_rate": 9.992636622740039e-06, "loss": 0.457, "num_input_tokens_seen": 22424192, "step": 18445 }, { "epoch": 2.3117403834106, "grad_norm": 0.11185251176357269, "learning_rate": 9.992606933066082e-06, "loss": 0.4577, "num_input_tokens_seen": 22430496, "step": 18450 }, { "epoch": 2.3123668713193837, "grad_norm": 0.16950707137584686, "learning_rate": 9.992577183701313e-06, "loss": 0.4764, "num_input_tokens_seen": 22436960, "step": 18455 }, { "epoch": 2.312993359228167, "grad_norm": 0.2132691740989685, "learning_rate": 9.992547374646086e-06, "loss": 0.4704, "num_input_tokens_seen": 22443104, "step": 18460 }, { "epoch": 2.31361984713695, "grad_norm": 0.23729869723320007, "learning_rate": 9.992517505900758e-06, "loss": 0.4703, "num_input_tokens_seen": 22448448, "step": 18465 }, { "epoch": 2.3142463350457336, "grad_norm": 0.17169372737407684, "learning_rate": 9.992487577465683e-06, "loss": 0.4572, "num_input_tokens_seen": 22454528, "step": 18470 }, { "epoch": 2.314872822954517, "grad_norm": 0.1520482450723648, "learning_rate": 9.992457589341225e-06, "loss": 0.4691, "num_input_tokens_seen": 22460672, "step": 18475 }, { "epoch": 2.3154993108633004, "grad_norm": 0.10199427604675293, "learning_rate": 9.992427541527736e-06, "loss": 0.4654, "num_input_tokens_seen": 22466816, "step": 18480 }, { "epoch": 2.3161257987720836, "grad_norm": 0.10355152934789658, "learning_rate": 9.992397434025582e-06, "loss": 0.4653, "num_input_tokens_seen": 22472544, "step": 18485 }, { "epoch": 2.316752286680867, "grad_norm": 0.11379563808441162, "learning_rate": 9.992367266835117e-06, "loss": 0.4668, "num_input_tokens_seen": 22478816, "step": 18490 }, { "epoch": 2.3173787745896504, "grad_norm": 0.15415005385875702, "learning_rate": 9.992337039956703e-06, "loss": 0.4633, "num_input_tokens_seen": 22484608, "step": 18495 }, { "epoch": 2.318005262498434, "grad_norm": 0.11542368680238724, "learning_rate": 9.992306753390704e-06, "loss": 0.4633, "num_input_tokens_seen": 22491008, "step": 18500 }, { "epoch": 2.318631750407217, "grad_norm": 0.14589565992355347, "learning_rate": 9.99227640713748e-06, "loss": 0.4695, "num_input_tokens_seen": 22497248, "step": 18505 }, { "epoch": 2.3192582383160003, "grad_norm": 0.17774276435375214, "learning_rate": 9.992246001197394e-06, "loss": 0.4599, "num_input_tokens_seen": 22503008, "step": 18510 }, { "epoch": 2.319884726224784, "grad_norm": 0.10148713737726212, "learning_rate": 9.992215535570811e-06, "loss": 0.4606, "num_input_tokens_seen": 22509088, "step": 18515 }, { "epoch": 2.320511214133567, "grad_norm": 0.09330612421035767, "learning_rate": 9.992185010258094e-06, "loss": 0.4583, "num_input_tokens_seen": 22515232, "step": 18520 }, { "epoch": 2.3211377020423507, "grad_norm": 0.09787097573280334, "learning_rate": 9.992154425259606e-06, "loss": 0.4688, "num_input_tokens_seen": 22521344, "step": 18525 }, { "epoch": 2.321764189951134, "grad_norm": 0.15221209824085236, "learning_rate": 9.992123780575715e-06, "loss": 0.4604, "num_input_tokens_seen": 22527424, "step": 18530 }, { "epoch": 2.3223906778599175, "grad_norm": 0.107613205909729, "learning_rate": 9.99209307620679e-06, "loss": 0.4656, "num_input_tokens_seen": 22533504, "step": 18535 }, { "epoch": 2.3230171657687007, "grad_norm": 0.10698207467794418, "learning_rate": 9.992062312153193e-06, "loss": 0.4688, "num_input_tokens_seen": 22539936, "step": 18540 }, { "epoch": 2.3236436536774843, "grad_norm": 0.16013740003108978, "learning_rate": 9.992031488415293e-06, "loss": 0.46, "num_input_tokens_seen": 22546464, "step": 18545 }, { "epoch": 2.3242701415862674, "grad_norm": 0.19215375185012817, "learning_rate": 9.99200060499346e-06, "loss": 0.46, "num_input_tokens_seen": 22552800, "step": 18550 }, { "epoch": 2.3248966294950506, "grad_norm": 0.11726342141628265, "learning_rate": 9.991969661888064e-06, "loss": 0.4642, "num_input_tokens_seen": 22559168, "step": 18555 }, { "epoch": 2.325523117403834, "grad_norm": 0.18775674700737, "learning_rate": 9.991938659099473e-06, "loss": 0.4574, "num_input_tokens_seen": 22565344, "step": 18560 }, { "epoch": 2.3261496053126174, "grad_norm": 0.13591808080673218, "learning_rate": 9.991907596628059e-06, "loss": 0.4612, "num_input_tokens_seen": 22571552, "step": 18565 }, { "epoch": 2.326776093221401, "grad_norm": 0.12269286811351776, "learning_rate": 9.991876474474192e-06, "loss": 0.4623, "num_input_tokens_seen": 22577888, "step": 18570 }, { "epoch": 2.327402581130184, "grad_norm": 0.17148348689079285, "learning_rate": 9.991845292638245e-06, "loss": 0.4503, "num_input_tokens_seen": 22583808, "step": 18575 }, { "epoch": 2.3280290690389673, "grad_norm": 0.12150608748197556, "learning_rate": 9.99181405112059e-06, "loss": 0.4614, "num_input_tokens_seen": 22590272, "step": 18580 }, { "epoch": 2.328655556947751, "grad_norm": 0.21154433488845825, "learning_rate": 9.991782749921601e-06, "loss": 0.4764, "num_input_tokens_seen": 22595552, "step": 18585 }, { "epoch": 2.329282044856534, "grad_norm": 0.1589193195104599, "learning_rate": 9.991751389041652e-06, "loss": 0.4671, "num_input_tokens_seen": 22601792, "step": 18590 }, { "epoch": 2.3299085327653177, "grad_norm": 0.17480260133743286, "learning_rate": 9.991719968481119e-06, "loss": 0.4522, "num_input_tokens_seen": 22608032, "step": 18595 }, { "epoch": 2.330535020674101, "grad_norm": 0.052213914692401886, "learning_rate": 9.991688488240379e-06, "loss": 0.4635, "num_input_tokens_seen": 22613984, "step": 18600 }, { "epoch": 2.3311615085828845, "grad_norm": 0.3320530354976654, "learning_rate": 9.991656948319803e-06, "loss": 0.4486, "num_input_tokens_seen": 22620384, "step": 18605 }, { "epoch": 2.3317879964916677, "grad_norm": 0.06501737982034683, "learning_rate": 9.991625348719774e-06, "loss": 0.47, "num_input_tokens_seen": 22626240, "step": 18610 }, { "epoch": 2.3324144844004513, "grad_norm": 0.2478114366531372, "learning_rate": 9.991593689440667e-06, "loss": 0.4683, "num_input_tokens_seen": 22632320, "step": 18615 }, { "epoch": 2.3330409723092345, "grad_norm": 0.1139349713921547, "learning_rate": 9.991561970482859e-06, "loss": 0.4697, "num_input_tokens_seen": 22638208, "step": 18620 }, { "epoch": 2.3336674602180176, "grad_norm": 0.14765074849128723, "learning_rate": 9.991530191846733e-06, "loss": 0.4759, "num_input_tokens_seen": 22644640, "step": 18625 }, { "epoch": 2.3342939481268012, "grad_norm": 0.13345293700695038, "learning_rate": 9.991498353532665e-06, "loss": 0.457, "num_input_tokens_seen": 22650624, "step": 18630 }, { "epoch": 2.3349204360355844, "grad_norm": 0.12855908274650574, "learning_rate": 9.991466455541037e-06, "loss": 0.4645, "num_input_tokens_seen": 22656544, "step": 18635 }, { "epoch": 2.335546923944368, "grad_norm": 0.187803253531456, "learning_rate": 9.991434497872233e-06, "loss": 0.4696, "num_input_tokens_seen": 22663040, "step": 18640 }, { "epoch": 2.336173411853151, "grad_norm": 0.1917746514081955, "learning_rate": 9.991402480526634e-06, "loss": 0.4602, "num_input_tokens_seen": 22669280, "step": 18645 }, { "epoch": 2.3367998997619344, "grad_norm": 0.2152077555656433, "learning_rate": 9.991370403504619e-06, "loss": 0.4613, "num_input_tokens_seen": 22675424, "step": 18650 }, { "epoch": 2.337426387670718, "grad_norm": 0.18561479449272156, "learning_rate": 9.991338266806575e-06, "loss": 0.464, "num_input_tokens_seen": 22681504, "step": 18655 }, { "epoch": 2.338052875579501, "grad_norm": 0.04866379126906395, "learning_rate": 9.991306070432887e-06, "loss": 0.4598, "num_input_tokens_seen": 22687584, "step": 18660 }, { "epoch": 2.3386793634882848, "grad_norm": 0.1243969053030014, "learning_rate": 9.991273814383937e-06, "loss": 0.4619, "num_input_tokens_seen": 22693760, "step": 18665 }, { "epoch": 2.339305851397068, "grad_norm": 0.20318207144737244, "learning_rate": 9.991241498660111e-06, "loss": 0.4659, "num_input_tokens_seen": 22700224, "step": 18670 }, { "epoch": 2.3399323393058515, "grad_norm": 0.1355745941400528, "learning_rate": 9.9912091232618e-06, "loss": 0.4669, "num_input_tokens_seen": 22706176, "step": 18675 }, { "epoch": 2.3405588272146347, "grad_norm": 0.1312664896249771, "learning_rate": 9.991176688189385e-06, "loss": 0.463, "num_input_tokens_seen": 22712288, "step": 18680 }, { "epoch": 2.3411853151234183, "grad_norm": 0.15110087394714355, "learning_rate": 9.991144193443257e-06, "loss": 0.4646, "num_input_tokens_seen": 22718272, "step": 18685 }, { "epoch": 2.3418118030322015, "grad_norm": 0.149977445602417, "learning_rate": 9.991111639023802e-06, "loss": 0.4638, "num_input_tokens_seen": 22724448, "step": 18690 }, { "epoch": 2.3424382909409847, "grad_norm": 0.13229508697986603, "learning_rate": 9.991079024931414e-06, "loss": 0.4585, "num_input_tokens_seen": 22730464, "step": 18695 }, { "epoch": 2.3430647788497683, "grad_norm": 0.04116341844201088, "learning_rate": 9.991046351166479e-06, "loss": 0.4638, "num_input_tokens_seen": 22736576, "step": 18700 }, { "epoch": 2.3436912667585514, "grad_norm": 0.15451720356941223, "learning_rate": 9.991013617729389e-06, "loss": 0.4622, "num_input_tokens_seen": 22742624, "step": 18705 }, { "epoch": 2.344317754667335, "grad_norm": 0.11236818134784698, "learning_rate": 9.990980824620533e-06, "loss": 0.4556, "num_input_tokens_seen": 22748576, "step": 18710 }, { "epoch": 2.344944242576118, "grad_norm": 0.04144907742738724, "learning_rate": 9.990947971840307e-06, "loss": 0.4697, "num_input_tokens_seen": 22754464, "step": 18715 }, { "epoch": 2.345570730484902, "grad_norm": 0.11728674918413162, "learning_rate": 9.9909150593891e-06, "loss": 0.4579, "num_input_tokens_seen": 22760544, "step": 18720 }, { "epoch": 2.346197218393685, "grad_norm": 0.0942782312631607, "learning_rate": 9.990882087267311e-06, "loss": 0.4606, "num_input_tokens_seen": 22766912, "step": 18725 }, { "epoch": 2.346823706302468, "grad_norm": 0.13143397867679596, "learning_rate": 9.990849055475327e-06, "loss": 0.4664, "num_input_tokens_seen": 22772928, "step": 18730 }, { "epoch": 2.347450194211252, "grad_norm": 0.09013920277357101, "learning_rate": 9.990815964013547e-06, "loss": 0.4617, "num_input_tokens_seen": 22779296, "step": 18735 }, { "epoch": 2.348076682120035, "grad_norm": 0.17505337297916412, "learning_rate": 9.990782812882368e-06, "loss": 0.4485, "num_input_tokens_seen": 22785664, "step": 18740 }, { "epoch": 2.3487031700288186, "grad_norm": 0.13633060455322266, "learning_rate": 9.990749602082184e-06, "loss": 0.4611, "num_input_tokens_seen": 22792128, "step": 18745 }, { "epoch": 2.3493296579376017, "grad_norm": 0.09639191627502441, "learning_rate": 9.99071633161339e-06, "loss": 0.4559, "num_input_tokens_seen": 22798560, "step": 18750 }, { "epoch": 2.3499561458463853, "grad_norm": 0.15257421135902405, "learning_rate": 9.99068300147639e-06, "loss": 0.4569, "num_input_tokens_seen": 22804704, "step": 18755 }, { "epoch": 2.3505826337551685, "grad_norm": 0.18436507880687714, "learning_rate": 9.990649611671576e-06, "loss": 0.4643, "num_input_tokens_seen": 22811008, "step": 18760 }, { "epoch": 2.3512091216639517, "grad_norm": 0.18898694217205048, "learning_rate": 9.990616162199351e-06, "loss": 0.468, "num_input_tokens_seen": 22817504, "step": 18765 }, { "epoch": 2.3518356095727353, "grad_norm": 0.28308597207069397, "learning_rate": 9.990582653060113e-06, "loss": 0.4598, "num_input_tokens_seen": 22823136, "step": 18770 }, { "epoch": 2.3524620974815185, "grad_norm": 0.12180577218532562, "learning_rate": 9.990549084254263e-06, "loss": 0.4773, "num_input_tokens_seen": 22829440, "step": 18775 }, { "epoch": 2.353088585390302, "grad_norm": 0.03929324820637703, "learning_rate": 9.990515455782206e-06, "loss": 0.4716, "num_input_tokens_seen": 22835456, "step": 18780 }, { "epoch": 2.3537150732990852, "grad_norm": 0.09483779221773148, "learning_rate": 9.990481767644339e-06, "loss": 0.4755, "num_input_tokens_seen": 22841600, "step": 18785 }, { "epoch": 2.354341561207869, "grad_norm": 0.09531346708536148, "learning_rate": 9.990448019841066e-06, "loss": 0.4642, "num_input_tokens_seen": 22847552, "step": 18790 }, { "epoch": 2.354968049116652, "grad_norm": 0.08975214511156082, "learning_rate": 9.990414212372791e-06, "loss": 0.468, "num_input_tokens_seen": 22853568, "step": 18795 }, { "epoch": 2.3555945370254356, "grad_norm": 0.15190663933753967, "learning_rate": 9.99038034523992e-06, "loss": 0.4561, "num_input_tokens_seen": 22859744, "step": 18800 }, { "epoch": 2.356221024934219, "grad_norm": 0.13894405961036682, "learning_rate": 9.990346418442857e-06, "loss": 0.4607, "num_input_tokens_seen": 22866048, "step": 18805 }, { "epoch": 2.356847512843002, "grad_norm": 0.08692540228366852, "learning_rate": 9.990312431982005e-06, "loss": 0.4703, "num_input_tokens_seen": 22872160, "step": 18810 }, { "epoch": 2.3574740007517856, "grad_norm": 0.08703149855136871, "learning_rate": 9.990278385857773e-06, "loss": 0.4602, "num_input_tokens_seen": 22878112, "step": 18815 }, { "epoch": 2.3581004886605688, "grad_norm": 0.11910996586084366, "learning_rate": 9.990244280070567e-06, "loss": 0.4628, "num_input_tokens_seen": 22884064, "step": 18820 }, { "epoch": 2.3587269765693524, "grad_norm": 0.13879558444023132, "learning_rate": 9.990210114620795e-06, "loss": 0.461, "num_input_tokens_seen": 22890240, "step": 18825 }, { "epoch": 2.3593534644781355, "grad_norm": 0.04053289443254471, "learning_rate": 9.990175889508866e-06, "loss": 0.4602, "num_input_tokens_seen": 22896800, "step": 18830 }, { "epoch": 2.3599799523869187, "grad_norm": 0.1554335653781891, "learning_rate": 9.990141604735188e-06, "loss": 0.4613, "num_input_tokens_seen": 22902080, "step": 18835 }, { "epoch": 2.3606064402957023, "grad_norm": 0.1660347878932953, "learning_rate": 9.990107260300172e-06, "loss": 0.4649, "num_input_tokens_seen": 22907936, "step": 18840 }, { "epoch": 2.3612329282044855, "grad_norm": 0.1552780419588089, "learning_rate": 9.990072856204228e-06, "loss": 0.4718, "num_input_tokens_seen": 22914080, "step": 18845 }, { "epoch": 2.361859416113269, "grad_norm": 0.09193066507577896, "learning_rate": 9.990038392447769e-06, "loss": 0.453, "num_input_tokens_seen": 22920256, "step": 18850 }, { "epoch": 2.3624859040220523, "grad_norm": 0.17075128853321075, "learning_rate": 9.990003869031203e-06, "loss": 0.4612, "num_input_tokens_seen": 22926528, "step": 18855 }, { "epoch": 2.363112391930836, "grad_norm": 0.04504033550620079, "learning_rate": 9.989969285954948e-06, "loss": 0.4654, "num_input_tokens_seen": 22932640, "step": 18860 }, { "epoch": 2.363738879839619, "grad_norm": 0.10604974627494812, "learning_rate": 9.989934643219414e-06, "loss": 0.4581, "num_input_tokens_seen": 22938784, "step": 18865 }, { "epoch": 2.3643653677484027, "grad_norm": 0.08956553041934967, "learning_rate": 9.989899940825015e-06, "loss": 0.4586, "num_input_tokens_seen": 22945216, "step": 18870 }, { "epoch": 2.364991855657186, "grad_norm": 0.14833520352840424, "learning_rate": 9.989865178772168e-06, "loss": 0.4727, "num_input_tokens_seen": 22951360, "step": 18875 }, { "epoch": 2.365618343565969, "grad_norm": 0.20194900035858154, "learning_rate": 9.989830357061288e-06, "loss": 0.4618, "num_input_tokens_seen": 22957152, "step": 18880 }, { "epoch": 2.3662448314747526, "grad_norm": 0.1013956367969513, "learning_rate": 9.98979547569279e-06, "loss": 0.4568, "num_input_tokens_seen": 22963040, "step": 18885 }, { "epoch": 2.366871319383536, "grad_norm": 0.18194539844989777, "learning_rate": 9.98976053466709e-06, "loss": 0.4476, "num_input_tokens_seen": 22969344, "step": 18890 }, { "epoch": 2.3674978072923194, "grad_norm": 0.0451049767434597, "learning_rate": 9.989725533984612e-06, "loss": 0.4666, "num_input_tokens_seen": 22975488, "step": 18895 }, { "epoch": 2.3681242952011026, "grad_norm": 0.122224360704422, "learning_rate": 9.989690473645767e-06, "loss": 0.4586, "num_input_tokens_seen": 22981600, "step": 18900 }, { "epoch": 2.3687507831098857, "grad_norm": 0.1288735270500183, "learning_rate": 9.989655353650977e-06, "loss": 0.4639, "num_input_tokens_seen": 22988096, "step": 18905 }, { "epoch": 2.3693772710186694, "grad_norm": 0.20967602729797363, "learning_rate": 9.989620174000664e-06, "loss": 0.4672, "num_input_tokens_seen": 22994144, "step": 18910 }, { "epoch": 2.3700037589274525, "grad_norm": 0.10613353550434113, "learning_rate": 9.989584934695246e-06, "loss": 0.4622, "num_input_tokens_seen": 23000000, "step": 18915 }, { "epoch": 2.370630246836236, "grad_norm": 0.09373965859413147, "learning_rate": 9.989549635735145e-06, "loss": 0.4596, "num_input_tokens_seen": 23006240, "step": 18920 }, { "epoch": 2.3712567347450193, "grad_norm": 0.0931125208735466, "learning_rate": 9.989514277120782e-06, "loss": 0.4605, "num_input_tokens_seen": 23012384, "step": 18925 }, { "epoch": 2.371883222653803, "grad_norm": 0.14361603558063507, "learning_rate": 9.989478858852584e-06, "loss": 0.4641, "num_input_tokens_seen": 23018656, "step": 18930 }, { "epoch": 2.372509710562586, "grad_norm": 0.10111194103956223, "learning_rate": 9.98944338093097e-06, "loss": 0.4658, "num_input_tokens_seen": 23025056, "step": 18935 }, { "epoch": 2.3731361984713697, "grad_norm": 0.10749276727437973, "learning_rate": 9.989407843356364e-06, "loss": 0.4645, "num_input_tokens_seen": 23030848, "step": 18940 }, { "epoch": 2.373762686380153, "grad_norm": 0.15029749274253845, "learning_rate": 9.989372246129194e-06, "loss": 0.4636, "num_input_tokens_seen": 23036864, "step": 18945 }, { "epoch": 2.374389174288936, "grad_norm": 0.11092200875282288, "learning_rate": 9.989336589249882e-06, "loss": 0.4584, "num_input_tokens_seen": 23042784, "step": 18950 }, { "epoch": 2.3750156621977196, "grad_norm": 0.1710892766714096, "learning_rate": 9.989300872718858e-06, "loss": 0.4705, "num_input_tokens_seen": 23048960, "step": 18955 }, { "epoch": 2.375642150106503, "grad_norm": 0.10699926316738129, "learning_rate": 9.989265096536547e-06, "loss": 0.4713, "num_input_tokens_seen": 23055008, "step": 18960 }, { "epoch": 2.3762686380152864, "grad_norm": 0.13477306067943573, "learning_rate": 9.989229260703377e-06, "loss": 0.4602, "num_input_tokens_seen": 23061216, "step": 18965 }, { "epoch": 2.3768951259240696, "grad_norm": 0.1287706047296524, "learning_rate": 9.989193365219777e-06, "loss": 0.4679, "num_input_tokens_seen": 23067296, "step": 18970 }, { "epoch": 2.377521613832853, "grad_norm": 0.13817422091960907, "learning_rate": 9.989157410086173e-06, "loss": 0.4646, "num_input_tokens_seen": 23073504, "step": 18975 }, { "epoch": 2.3781481017416364, "grad_norm": 0.11284198611974716, "learning_rate": 9.989121395303e-06, "loss": 0.4671, "num_input_tokens_seen": 23079552, "step": 18980 }, { "epoch": 2.37877458965042, "grad_norm": 0.03907787427306175, "learning_rate": 9.989085320870686e-06, "loss": 0.4633, "num_input_tokens_seen": 23085472, "step": 18985 }, { "epoch": 2.379401077559203, "grad_norm": 0.104335255920887, "learning_rate": 9.989049186789662e-06, "loss": 0.4673, "num_input_tokens_seen": 23091616, "step": 18990 }, { "epoch": 2.3800275654679863, "grad_norm": 0.14200058579444885, "learning_rate": 9.98901299306036e-06, "loss": 0.4693, "num_input_tokens_seen": 23097504, "step": 18995 }, { "epoch": 2.38065405337677, "grad_norm": 0.19182604551315308, "learning_rate": 9.988976739683213e-06, "loss": 0.4594, "num_input_tokens_seen": 23103616, "step": 19000 }, { "epoch": 2.381280541285553, "grad_norm": 0.09271278977394104, "learning_rate": 9.988940426658655e-06, "loss": 0.462, "num_input_tokens_seen": 23109984, "step": 19005 }, { "epoch": 2.3819070291943367, "grad_norm": 0.09756623953580856, "learning_rate": 9.98890405398712e-06, "loss": 0.4684, "num_input_tokens_seen": 23116032, "step": 19010 }, { "epoch": 2.38253351710312, "grad_norm": 0.03929702565073967, "learning_rate": 9.988867621669042e-06, "loss": 0.4564, "num_input_tokens_seen": 23122208, "step": 19015 }, { "epoch": 2.383160005011903, "grad_norm": 0.11151473969221115, "learning_rate": 9.988831129704857e-06, "loss": 0.467, "num_input_tokens_seen": 23128288, "step": 19020 }, { "epoch": 2.3837864929206867, "grad_norm": 0.09722252190113068, "learning_rate": 9.988794578095003e-06, "loss": 0.4685, "num_input_tokens_seen": 23134528, "step": 19025 }, { "epoch": 2.38441298082947, "grad_norm": 0.1056714728474617, "learning_rate": 9.988757966839916e-06, "loss": 0.4596, "num_input_tokens_seen": 23140512, "step": 19030 }, { "epoch": 2.3850394687382535, "grad_norm": 0.15955893695354462, "learning_rate": 9.98872129594003e-06, "loss": 0.4565, "num_input_tokens_seen": 23146624, "step": 19035 }, { "epoch": 2.3856659566470366, "grad_norm": 0.10972240567207336, "learning_rate": 9.98868456539579e-06, "loss": 0.4614, "num_input_tokens_seen": 23152096, "step": 19040 }, { "epoch": 2.3862924445558202, "grad_norm": 0.12005430459976196, "learning_rate": 9.98864777520763e-06, "loss": 0.4643, "num_input_tokens_seen": 23158080, "step": 19045 }, { "epoch": 2.3869189324646034, "grad_norm": 0.17206227779388428, "learning_rate": 9.98861092537599e-06, "loss": 0.4705, "num_input_tokens_seen": 23164032, "step": 19050 }, { "epoch": 2.387545420373387, "grad_norm": 0.11326458305120468, "learning_rate": 9.988574015901314e-06, "loss": 0.4674, "num_input_tokens_seen": 23170432, "step": 19055 }, { "epoch": 2.38817190828217, "grad_norm": 0.12200362980365753, "learning_rate": 9.988537046784041e-06, "loss": 0.4615, "num_input_tokens_seen": 23176320, "step": 19060 }, { "epoch": 2.3887983961909534, "grad_norm": 0.09958887100219727, "learning_rate": 9.988500018024613e-06, "loss": 0.4624, "num_input_tokens_seen": 23182208, "step": 19065 }, { "epoch": 2.389424884099737, "grad_norm": 0.12220913171768188, "learning_rate": 9.988462929623475e-06, "loss": 0.4592, "num_input_tokens_seen": 23188480, "step": 19070 }, { "epoch": 2.39005137200852, "grad_norm": 0.17401978373527527, "learning_rate": 9.988425781581067e-06, "loss": 0.4604, "num_input_tokens_seen": 23194560, "step": 19075 }, { "epoch": 2.3906778599173037, "grad_norm": 0.09211117774248123, "learning_rate": 9.988388573897835e-06, "loss": 0.4628, "num_input_tokens_seen": 23200544, "step": 19080 }, { "epoch": 2.391304347826087, "grad_norm": 0.09782563894987106, "learning_rate": 9.988351306574226e-06, "loss": 0.4636, "num_input_tokens_seen": 23206720, "step": 19085 }, { "epoch": 2.39193083573487, "grad_norm": 0.1036725714802742, "learning_rate": 9.988313979610679e-06, "loss": 0.4675, "num_input_tokens_seen": 23213120, "step": 19090 }, { "epoch": 2.3925573236436537, "grad_norm": 0.14081014692783356, "learning_rate": 9.98827659300765e-06, "loss": 0.4566, "num_input_tokens_seen": 23218976, "step": 19095 }, { "epoch": 2.393183811552437, "grad_norm": 0.11067993193864822, "learning_rate": 9.988239146765575e-06, "loss": 0.4606, "num_input_tokens_seen": 23224960, "step": 19100 }, { "epoch": 2.3938102994612205, "grad_norm": 0.09902159124612808, "learning_rate": 9.98820164088491e-06, "loss": 0.4647, "num_input_tokens_seen": 23231040, "step": 19105 }, { "epoch": 2.3944367873700036, "grad_norm": 0.12683996558189392, "learning_rate": 9.988164075366098e-06, "loss": 0.4633, "num_input_tokens_seen": 23236960, "step": 19110 }, { "epoch": 2.3950632752787873, "grad_norm": 0.16780845820903778, "learning_rate": 9.988126450209593e-06, "loss": 0.4633, "num_input_tokens_seen": 23243104, "step": 19115 }, { "epoch": 2.3956897631875704, "grad_norm": 0.1523779034614563, "learning_rate": 9.988088765415841e-06, "loss": 0.4706, "num_input_tokens_seen": 23249536, "step": 19120 }, { "epoch": 2.396316251096354, "grad_norm": 0.1627325862646103, "learning_rate": 9.988051020985295e-06, "loss": 0.4667, "num_input_tokens_seen": 23255744, "step": 19125 }, { "epoch": 2.396942739005137, "grad_norm": 0.0895199105143547, "learning_rate": 9.988013216918406e-06, "loss": 0.4602, "num_input_tokens_seen": 23261568, "step": 19130 }, { "epoch": 2.3975692269139204, "grad_norm": 0.16103972494602203, "learning_rate": 9.987975353215624e-06, "loss": 0.462, "num_input_tokens_seen": 23267904, "step": 19135 }, { "epoch": 2.398195714822704, "grad_norm": 0.09612194448709488, "learning_rate": 9.987937429877404e-06, "loss": 0.4606, "num_input_tokens_seen": 23274080, "step": 19140 }, { "epoch": 2.398822202731487, "grad_norm": 0.10749653726816177, "learning_rate": 9.987899446904197e-06, "loss": 0.4636, "num_input_tokens_seen": 23280512, "step": 19145 }, { "epoch": 2.3994486906402708, "grad_norm": 0.10154370218515396, "learning_rate": 9.987861404296459e-06, "loss": 0.4629, "num_input_tokens_seen": 23286560, "step": 19150 }, { "epoch": 2.400075178549054, "grad_norm": 0.09299169480800629, "learning_rate": 9.987823302054645e-06, "loss": 0.4571, "num_input_tokens_seen": 23293024, "step": 19155 }, { "epoch": 2.4007016664578376, "grad_norm": 0.09497451037168503, "learning_rate": 9.98778514017921e-06, "loss": 0.4624, "num_input_tokens_seen": 23298752, "step": 19160 }, { "epoch": 2.4013281543666207, "grad_norm": 0.09858886897563934, "learning_rate": 9.98774691867061e-06, "loss": 0.4609, "num_input_tokens_seen": 23304512, "step": 19165 }, { "epoch": 2.401954642275404, "grad_norm": 0.038613591343164444, "learning_rate": 9.987708637529302e-06, "loss": 0.4675, "num_input_tokens_seen": 23310816, "step": 19170 }, { "epoch": 2.4025811301841875, "grad_norm": 0.11530603468418121, "learning_rate": 9.987670296755744e-06, "loss": 0.4596, "num_input_tokens_seen": 23316544, "step": 19175 }, { "epoch": 2.4032076180929707, "grad_norm": 0.10423322767019272, "learning_rate": 9.987631896350394e-06, "loss": 0.464, "num_input_tokens_seen": 23322816, "step": 19180 }, { "epoch": 2.4038341060017543, "grad_norm": 0.17834174633026123, "learning_rate": 9.98759343631371e-06, "loss": 0.4601, "num_input_tokens_seen": 23328736, "step": 19185 }, { "epoch": 2.4044605939105375, "grad_norm": 0.09659905731678009, "learning_rate": 9.987554916646155e-06, "loss": 0.4619, "num_input_tokens_seen": 23335008, "step": 19190 }, { "epoch": 2.405087081819321, "grad_norm": 0.18787731230258942, "learning_rate": 9.987516337348187e-06, "loss": 0.4634, "num_input_tokens_seen": 23341152, "step": 19195 }, { "epoch": 2.4057135697281042, "grad_norm": 0.09870664030313492, "learning_rate": 9.987477698420269e-06, "loss": 0.4618, "num_input_tokens_seen": 23346784, "step": 19200 }, { "epoch": 2.4063400576368874, "grad_norm": 0.19069109857082367, "learning_rate": 9.987438999862859e-06, "loss": 0.4657, "num_input_tokens_seen": 23352832, "step": 19205 }, { "epoch": 2.406966545545671, "grad_norm": 0.10301605612039566, "learning_rate": 9.987400241676425e-06, "loss": 0.4639, "num_input_tokens_seen": 23358368, "step": 19210 }, { "epoch": 2.407593033454454, "grad_norm": 0.04269185662269592, "learning_rate": 9.987361423861428e-06, "loss": 0.4596, "num_input_tokens_seen": 23364448, "step": 19215 }, { "epoch": 2.408219521363238, "grad_norm": 0.09696222841739655, "learning_rate": 9.98732254641833e-06, "loss": 0.4602, "num_input_tokens_seen": 23370496, "step": 19220 }, { "epoch": 2.408846009272021, "grad_norm": 0.10697964578866959, "learning_rate": 9.9872836093476e-06, "loss": 0.4631, "num_input_tokens_seen": 23376736, "step": 19225 }, { "epoch": 2.4094724971808046, "grad_norm": 0.1917629837989807, "learning_rate": 9.9872446126497e-06, "loss": 0.462, "num_input_tokens_seen": 23382208, "step": 19230 }, { "epoch": 2.4100989850895878, "grad_norm": 0.14420387148857117, "learning_rate": 9.987205556325099e-06, "loss": 0.4569, "num_input_tokens_seen": 23388704, "step": 19235 }, { "epoch": 2.4107254729983714, "grad_norm": 0.13414666056632996, "learning_rate": 9.98716644037426e-06, "loss": 0.4587, "num_input_tokens_seen": 23394752, "step": 19240 }, { "epoch": 2.4113519609071545, "grad_norm": 0.10129144042730331, "learning_rate": 9.987127264797656e-06, "loss": 0.4663, "num_input_tokens_seen": 23401088, "step": 19245 }, { "epoch": 2.4119784488159377, "grad_norm": 0.18239569664001465, "learning_rate": 9.987088029595751e-06, "loss": 0.465, "num_input_tokens_seen": 23407520, "step": 19250 }, { "epoch": 2.4126049367247213, "grad_norm": 0.12601792812347412, "learning_rate": 9.987048734769017e-06, "loss": 0.4597, "num_input_tokens_seen": 23413440, "step": 19255 }, { "epoch": 2.4132314246335045, "grad_norm": 0.09797767549753189, "learning_rate": 9.987009380317921e-06, "loss": 0.4662, "num_input_tokens_seen": 23419584, "step": 19260 }, { "epoch": 2.413857912542288, "grad_norm": 0.1697457730770111, "learning_rate": 9.986969966242936e-06, "loss": 0.458, "num_input_tokens_seen": 23425696, "step": 19265 }, { "epoch": 2.4144844004510713, "grad_norm": 0.15553659200668335, "learning_rate": 9.986930492544532e-06, "loss": 0.4633, "num_input_tokens_seen": 23431616, "step": 19270 }, { "epoch": 2.4151108883598544, "grad_norm": 0.08807146549224854, "learning_rate": 9.986890959223181e-06, "loss": 0.4613, "num_input_tokens_seen": 23437856, "step": 19275 }, { "epoch": 2.415737376268638, "grad_norm": 0.10003502666950226, "learning_rate": 9.986851366279356e-06, "loss": 0.4594, "num_input_tokens_seen": 23443808, "step": 19280 }, { "epoch": 2.416363864177421, "grad_norm": 0.10606130212545395, "learning_rate": 9.986811713713532e-06, "loss": 0.4653, "num_input_tokens_seen": 23450048, "step": 19285 }, { "epoch": 2.416990352086205, "grad_norm": 0.12009920179843903, "learning_rate": 9.986772001526178e-06, "loss": 0.4641, "num_input_tokens_seen": 23456288, "step": 19290 }, { "epoch": 2.417616839994988, "grad_norm": 0.09368966519832611, "learning_rate": 9.986732229717774e-06, "loss": 0.4684, "num_input_tokens_seen": 23462464, "step": 19295 }, { "epoch": 2.4182433279037716, "grad_norm": 0.15506769716739655, "learning_rate": 9.986692398288794e-06, "loss": 0.4642, "num_input_tokens_seen": 23468512, "step": 19300 }, { "epoch": 2.418869815812555, "grad_norm": 0.0941539779305458, "learning_rate": 9.986652507239711e-06, "loss": 0.4685, "num_input_tokens_seen": 23474784, "step": 19305 }, { "epoch": 2.4194963037213384, "grad_norm": 0.1775941550731659, "learning_rate": 9.986612556571007e-06, "loss": 0.4581, "num_input_tokens_seen": 23480832, "step": 19310 }, { "epoch": 2.4201227916301216, "grad_norm": 0.10715661942958832, "learning_rate": 9.986572546283158e-06, "loss": 0.4658, "num_input_tokens_seen": 23486848, "step": 19315 }, { "epoch": 2.4207492795389047, "grad_norm": 0.16593223810195923, "learning_rate": 9.98653247637664e-06, "loss": 0.4627, "num_input_tokens_seen": 23492608, "step": 19320 }, { "epoch": 2.4213757674476883, "grad_norm": 0.04268553853034973, "learning_rate": 9.986492346851935e-06, "loss": 0.4584, "num_input_tokens_seen": 23498752, "step": 19325 }, { "epoch": 2.4220022553564715, "grad_norm": 0.15499839186668396, "learning_rate": 9.98645215770952e-06, "loss": 0.4598, "num_input_tokens_seen": 23504896, "step": 19330 }, { "epoch": 2.422628743265255, "grad_norm": 0.18502336740493774, "learning_rate": 9.986411908949878e-06, "loss": 0.463, "num_input_tokens_seen": 23510880, "step": 19335 }, { "epoch": 2.4232552311740383, "grad_norm": 0.19028940796852112, "learning_rate": 9.98637160057349e-06, "loss": 0.4638, "num_input_tokens_seen": 23516992, "step": 19340 }, { "epoch": 2.4238817190828215, "grad_norm": 0.04342293366789818, "learning_rate": 9.986331232580837e-06, "loss": 0.4616, "num_input_tokens_seen": 23523168, "step": 19345 }, { "epoch": 2.424508206991605, "grad_norm": 0.18178784847259521, "learning_rate": 9.986290804972402e-06, "loss": 0.4585, "num_input_tokens_seen": 23529504, "step": 19350 }, { "epoch": 2.4251346949003882, "grad_norm": 0.10323180258274078, "learning_rate": 9.986250317748667e-06, "loss": 0.4624, "num_input_tokens_seen": 23535520, "step": 19355 }, { "epoch": 2.425761182809172, "grad_norm": 0.18294371664524078, "learning_rate": 9.986209770910117e-06, "loss": 0.461, "num_input_tokens_seen": 23541792, "step": 19360 }, { "epoch": 2.426387670717955, "grad_norm": 0.1925622522830963, "learning_rate": 9.98616916445724e-06, "loss": 0.4526, "num_input_tokens_seen": 23548096, "step": 19365 }, { "epoch": 2.4270141586267386, "grad_norm": 0.04029633104801178, "learning_rate": 9.986128498390516e-06, "loss": 0.4564, "num_input_tokens_seen": 23554240, "step": 19370 }, { "epoch": 2.427640646535522, "grad_norm": 0.17400960624217987, "learning_rate": 9.986087772710433e-06, "loss": 0.4518, "num_input_tokens_seen": 23560352, "step": 19375 }, { "epoch": 2.4282671344443054, "grad_norm": 0.11830637603998184, "learning_rate": 9.986046987417482e-06, "loss": 0.4779, "num_input_tokens_seen": 23566368, "step": 19380 }, { "epoch": 2.4288936223530886, "grad_norm": 0.22227415442466736, "learning_rate": 9.986006142512143e-06, "loss": 0.4707, "num_input_tokens_seen": 23571968, "step": 19385 }, { "epoch": 2.4295201102618718, "grad_norm": 0.05098562315106392, "learning_rate": 9.985965237994911e-06, "loss": 0.4703, "num_input_tokens_seen": 23578176, "step": 19390 }, { "epoch": 2.4301465981706554, "grad_norm": 0.1610489785671234, "learning_rate": 9.985924273866271e-06, "loss": 0.4628, "num_input_tokens_seen": 23584544, "step": 19395 }, { "epoch": 2.4307730860794385, "grad_norm": 0.13675448298454285, "learning_rate": 9.985883250126717e-06, "loss": 0.4638, "num_input_tokens_seen": 23590784, "step": 19400 }, { "epoch": 2.431399573988222, "grad_norm": 0.10650932043790817, "learning_rate": 9.985842166776735e-06, "loss": 0.4588, "num_input_tokens_seen": 23597184, "step": 19405 }, { "epoch": 2.4320260618970053, "grad_norm": 0.22677239775657654, "learning_rate": 9.98580102381682e-06, "loss": 0.4625, "num_input_tokens_seen": 23603264, "step": 19410 }, { "epoch": 2.432652549805789, "grad_norm": 0.2015860676765442, "learning_rate": 9.98575982124746e-06, "loss": 0.4571, "num_input_tokens_seen": 23609536, "step": 19415 }, { "epoch": 2.433279037714572, "grad_norm": 0.19418203830718994, "learning_rate": 9.98571855906915e-06, "loss": 0.4705, "num_input_tokens_seen": 23615808, "step": 19420 }, { "epoch": 2.4339055256233557, "grad_norm": 0.16238060593605042, "learning_rate": 9.985677237282382e-06, "loss": 0.4687, "num_input_tokens_seen": 23621952, "step": 19425 }, { "epoch": 2.434532013532139, "grad_norm": 0.14841315150260925, "learning_rate": 9.985635855887652e-06, "loss": 0.4716, "num_input_tokens_seen": 23628128, "step": 19430 }, { "epoch": 2.435158501440922, "grad_norm": 0.12341544777154922, "learning_rate": 9.985594414885454e-06, "loss": 0.4636, "num_input_tokens_seen": 23634048, "step": 19435 }, { "epoch": 2.4357849893497057, "grad_norm": 0.11697202920913696, "learning_rate": 9.985552914276282e-06, "loss": 0.4625, "num_input_tokens_seen": 23640288, "step": 19440 }, { "epoch": 2.436411477258489, "grad_norm": 0.05512024089694023, "learning_rate": 9.985511354060632e-06, "loss": 0.4667, "num_input_tokens_seen": 23646336, "step": 19445 }, { "epoch": 2.4370379651672724, "grad_norm": 0.04812953621149063, "learning_rate": 9.985469734239006e-06, "loss": 0.4579, "num_input_tokens_seen": 23652576, "step": 19450 }, { "epoch": 2.4376644530760556, "grad_norm": 0.1288839727640152, "learning_rate": 9.985428054811894e-06, "loss": 0.4617, "num_input_tokens_seen": 23658912, "step": 19455 }, { "epoch": 2.438290940984839, "grad_norm": 0.17624600231647491, "learning_rate": 9.985386315779801e-06, "loss": 0.4597, "num_input_tokens_seen": 23664704, "step": 19460 }, { "epoch": 2.4389174288936224, "grad_norm": 0.28276577591896057, "learning_rate": 9.985344517143221e-06, "loss": 0.4682, "num_input_tokens_seen": 23670208, "step": 19465 }, { "epoch": 2.4395439168024056, "grad_norm": 0.14093521237373352, "learning_rate": 9.985302658902657e-06, "loss": 0.4592, "num_input_tokens_seen": 23676032, "step": 19470 }, { "epoch": 2.440170404711189, "grad_norm": 0.15711383521556854, "learning_rate": 9.985260741058608e-06, "loss": 0.4603, "num_input_tokens_seen": 23682144, "step": 19475 }, { "epoch": 2.4407968926199723, "grad_norm": 0.16383472084999084, "learning_rate": 9.985218763611576e-06, "loss": 0.4624, "num_input_tokens_seen": 23688448, "step": 19480 }, { "epoch": 2.441423380528756, "grad_norm": 0.11836745589971542, "learning_rate": 9.985176726562062e-06, "loss": 0.4592, "num_input_tokens_seen": 23694720, "step": 19485 }, { "epoch": 2.442049868437539, "grad_norm": 0.0948348343372345, "learning_rate": 9.98513462991057e-06, "loss": 0.4575, "num_input_tokens_seen": 23700800, "step": 19490 }, { "epoch": 2.4426763563463227, "grad_norm": 0.14061030745506287, "learning_rate": 9.985092473657599e-06, "loss": 0.4582, "num_input_tokens_seen": 23706944, "step": 19495 }, { "epoch": 2.443302844255106, "grad_norm": 0.13671262562274933, "learning_rate": 9.98505025780366e-06, "loss": 0.4612, "num_input_tokens_seen": 23712704, "step": 19500 }, { "epoch": 2.443929332163889, "grad_norm": 0.10685709118843079, "learning_rate": 9.98500798234925e-06, "loss": 0.4624, "num_input_tokens_seen": 23718848, "step": 19505 }, { "epoch": 2.4445558200726727, "grad_norm": 0.12741906940937042, "learning_rate": 9.984965647294882e-06, "loss": 0.4574, "num_input_tokens_seen": 23725024, "step": 19510 }, { "epoch": 2.445182307981456, "grad_norm": 0.18953481316566467, "learning_rate": 9.984923252641057e-06, "loss": 0.4612, "num_input_tokens_seen": 23730592, "step": 19515 }, { "epoch": 2.4458087958902395, "grad_norm": 0.11198081076145172, "learning_rate": 9.984880798388284e-06, "loss": 0.462, "num_input_tokens_seen": 23735968, "step": 19520 }, { "epoch": 2.4464352837990226, "grad_norm": 0.1559477001428604, "learning_rate": 9.984838284537068e-06, "loss": 0.4697, "num_input_tokens_seen": 23741984, "step": 19525 }, { "epoch": 2.447061771707806, "grad_norm": 0.09993237257003784, "learning_rate": 9.984795711087921e-06, "loss": 0.465, "num_input_tokens_seen": 23748192, "step": 19530 }, { "epoch": 2.4476882596165894, "grad_norm": 0.16941823065280914, "learning_rate": 9.984753078041351e-06, "loss": 0.4627, "num_input_tokens_seen": 23754240, "step": 19535 }, { "epoch": 2.4483147475253726, "grad_norm": 0.10322807729244232, "learning_rate": 9.984710385397865e-06, "loss": 0.4664, "num_input_tokens_seen": 23760288, "step": 19540 }, { "epoch": 2.448941235434156, "grad_norm": 0.1461474746465683, "learning_rate": 9.984667633157978e-06, "loss": 0.467, "num_input_tokens_seen": 23766496, "step": 19545 }, { "epoch": 2.4495677233429394, "grad_norm": 0.21386398375034332, "learning_rate": 9.984624821322197e-06, "loss": 0.4596, "num_input_tokens_seen": 23772736, "step": 19550 }, { "epoch": 2.450194211251723, "grad_norm": 0.14423507452011108, "learning_rate": 9.984581949891035e-06, "loss": 0.4659, "num_input_tokens_seen": 23778848, "step": 19555 }, { "epoch": 2.450820699160506, "grad_norm": 0.1979122906923294, "learning_rate": 9.984539018865005e-06, "loss": 0.4648, "num_input_tokens_seen": 23784832, "step": 19560 }, { "epoch": 2.4514471870692898, "grad_norm": 0.18937356770038605, "learning_rate": 9.984496028244622e-06, "loss": 0.4644, "num_input_tokens_seen": 23790976, "step": 19565 }, { "epoch": 2.452073674978073, "grad_norm": 0.16944286227226257, "learning_rate": 9.984452978030397e-06, "loss": 0.4631, "num_input_tokens_seen": 23797248, "step": 19570 }, { "epoch": 2.452700162886856, "grad_norm": 0.055641259998083115, "learning_rate": 9.984409868222845e-06, "loss": 0.4629, "num_input_tokens_seen": 23803328, "step": 19575 }, { "epoch": 2.4533266507956397, "grad_norm": 0.045695047825574875, "learning_rate": 9.984366698822482e-06, "loss": 0.4644, "num_input_tokens_seen": 23809280, "step": 19580 }, { "epoch": 2.453953138704423, "grad_norm": 0.13612686097621918, "learning_rate": 9.984323469829828e-06, "loss": 0.4576, "num_input_tokens_seen": 23815392, "step": 19585 }, { "epoch": 2.4545796266132065, "grad_norm": 0.10213258862495422, "learning_rate": 9.984280181245393e-06, "loss": 0.462, "num_input_tokens_seen": 23820768, "step": 19590 }, { "epoch": 2.4552061145219897, "grad_norm": 0.1149546280503273, "learning_rate": 9.9842368330697e-06, "loss": 0.4659, "num_input_tokens_seen": 23826880, "step": 19595 }, { "epoch": 2.455832602430773, "grad_norm": 0.12645328044891357, "learning_rate": 9.984193425303263e-06, "loss": 0.4641, "num_input_tokens_seen": 23832896, "step": 19600 }, { "epoch": 2.4564590903395565, "grad_norm": 0.11428554356098175, "learning_rate": 9.984149957946605e-06, "loss": 0.4639, "num_input_tokens_seen": 23839168, "step": 19605 }, { "epoch": 2.4570855782483396, "grad_norm": 0.09959156811237335, "learning_rate": 9.984106431000243e-06, "loss": 0.4652, "num_input_tokens_seen": 23845504, "step": 19610 }, { "epoch": 2.4577120661571232, "grad_norm": 0.10678993910551071, "learning_rate": 9.984062844464699e-06, "loss": 0.4618, "num_input_tokens_seen": 23851040, "step": 19615 }, { "epoch": 2.4583385540659064, "grad_norm": 0.10456451028585434, "learning_rate": 9.984019198340493e-06, "loss": 0.4601, "num_input_tokens_seen": 23857280, "step": 19620 }, { "epoch": 2.45896504197469, "grad_norm": 0.16481946408748627, "learning_rate": 9.983975492628146e-06, "loss": 0.4644, "num_input_tokens_seen": 23863424, "step": 19625 }, { "epoch": 2.459591529883473, "grad_norm": 0.20465891063213348, "learning_rate": 9.983931727328182e-06, "loss": 0.4701, "num_input_tokens_seen": 23869408, "step": 19630 }, { "epoch": 2.460218017792257, "grad_norm": 0.10677296668291092, "learning_rate": 9.983887902441125e-06, "loss": 0.4605, "num_input_tokens_seen": 23875328, "step": 19635 }, { "epoch": 2.46084450570104, "grad_norm": 0.11580908298492432, "learning_rate": 9.983844017967498e-06, "loss": 0.4608, "num_input_tokens_seen": 23880672, "step": 19640 }, { "epoch": 2.461470993609823, "grad_norm": 0.046687155961990356, "learning_rate": 9.983800073907825e-06, "loss": 0.4654, "num_input_tokens_seen": 23886880, "step": 19645 }, { "epoch": 2.4620974815186067, "grad_norm": 0.1405911147594452, "learning_rate": 9.983756070262632e-06, "loss": 0.4632, "num_input_tokens_seen": 23892928, "step": 19650 }, { "epoch": 2.46272396942739, "grad_norm": 0.12884831428527832, "learning_rate": 9.983712007032444e-06, "loss": 0.4639, "num_input_tokens_seen": 23899008, "step": 19655 }, { "epoch": 2.4633504573361735, "grad_norm": 0.12379411607980728, "learning_rate": 9.98366788421779e-06, "loss": 0.4647, "num_input_tokens_seen": 23905312, "step": 19660 }, { "epoch": 2.4639769452449567, "grad_norm": 0.14440855383872986, "learning_rate": 9.983623701819196e-06, "loss": 0.466, "num_input_tokens_seen": 23911648, "step": 19665 }, { "epoch": 2.4646034331537403, "grad_norm": 0.21315397322177887, "learning_rate": 9.983579459837192e-06, "loss": 0.4612, "num_input_tokens_seen": 23917824, "step": 19670 }, { "epoch": 2.4652299210625235, "grad_norm": 0.20367541909217834, "learning_rate": 9.983535158272303e-06, "loss": 0.4637, "num_input_tokens_seen": 23923840, "step": 19675 }, { "epoch": 2.465856408971307, "grad_norm": 0.13273434340953827, "learning_rate": 9.983490797125064e-06, "loss": 0.4651, "num_input_tokens_seen": 23929728, "step": 19680 }, { "epoch": 2.4664828968800903, "grad_norm": 0.16844341158866882, "learning_rate": 9.983446376396003e-06, "loss": 0.461, "num_input_tokens_seen": 23936192, "step": 19685 }, { "epoch": 2.4671093847888734, "grad_norm": 0.22148890793323517, "learning_rate": 9.983401896085648e-06, "loss": 0.4549, "num_input_tokens_seen": 23942496, "step": 19690 }, { "epoch": 2.467735872697657, "grad_norm": 0.17640122771263123, "learning_rate": 9.983357356194535e-06, "loss": 0.4626, "num_input_tokens_seen": 23948768, "step": 19695 }, { "epoch": 2.46836236060644, "grad_norm": 0.04227181151509285, "learning_rate": 9.983312756723193e-06, "loss": 0.4651, "num_input_tokens_seen": 23955136, "step": 19700 }, { "epoch": 2.468988848515224, "grad_norm": 0.1273549646139145, "learning_rate": 9.98326809767216e-06, "loss": 0.4704, "num_input_tokens_seen": 23961312, "step": 19705 }, { "epoch": 2.469615336424007, "grad_norm": 0.1201724112033844, "learning_rate": 9.983223379041966e-06, "loss": 0.4648, "num_input_tokens_seen": 23967392, "step": 19710 }, { "epoch": 2.47024182433279, "grad_norm": 0.0506010428071022, "learning_rate": 9.983178600833147e-06, "loss": 0.4719, "num_input_tokens_seen": 23973440, "step": 19715 }, { "epoch": 2.4708683122415738, "grad_norm": 0.1117841824889183, "learning_rate": 9.983133763046237e-06, "loss": 0.4622, "num_input_tokens_seen": 23979520, "step": 19720 }, { "epoch": 2.471494800150357, "grad_norm": 0.11464717239141464, "learning_rate": 9.983088865681777e-06, "loss": 0.4623, "num_input_tokens_seen": 23985568, "step": 19725 }, { "epoch": 2.4721212880591406, "grad_norm": 0.04502442851662636, "learning_rate": 9.983043908740295e-06, "loss": 0.4602, "num_input_tokens_seen": 23991616, "step": 19730 }, { "epoch": 2.4727477759679237, "grad_norm": 0.11754456907510757, "learning_rate": 9.982998892222338e-06, "loss": 0.4631, "num_input_tokens_seen": 23997888, "step": 19735 }, { "epoch": 2.4733742638767073, "grad_norm": 0.11585211008787155, "learning_rate": 9.982953816128437e-06, "loss": 0.4581, "num_input_tokens_seen": 24004128, "step": 19740 }, { "epoch": 2.4740007517854905, "grad_norm": 0.12229299545288086, "learning_rate": 9.982908680459135e-06, "loss": 0.461, "num_input_tokens_seen": 24009856, "step": 19745 }, { "epoch": 2.474627239694274, "grad_norm": 0.13137729465961456, "learning_rate": 9.982863485214968e-06, "loss": 0.4642, "num_input_tokens_seen": 24015616, "step": 19750 }, { "epoch": 2.4752537276030573, "grad_norm": 0.15264040231704712, "learning_rate": 9.982818230396482e-06, "loss": 0.4694, "num_input_tokens_seen": 24021728, "step": 19755 }, { "epoch": 2.4758802155118405, "grad_norm": 0.13721919059753418, "learning_rate": 9.982772916004213e-06, "loss": 0.4616, "num_input_tokens_seen": 24027904, "step": 19760 }, { "epoch": 2.476506703420624, "grad_norm": 0.11641436070203781, "learning_rate": 9.982727542038705e-06, "loss": 0.4699, "num_input_tokens_seen": 24034176, "step": 19765 }, { "epoch": 2.4771331913294072, "grad_norm": 0.1405896693468094, "learning_rate": 9.982682108500498e-06, "loss": 0.4704, "num_input_tokens_seen": 24040384, "step": 19770 }, { "epoch": 2.477759679238191, "grad_norm": 0.1266387403011322, "learning_rate": 9.98263661539014e-06, "loss": 0.461, "num_input_tokens_seen": 24046592, "step": 19775 }, { "epoch": 2.478386167146974, "grad_norm": 0.13269637525081635, "learning_rate": 9.982591062708172e-06, "loss": 0.4739, "num_input_tokens_seen": 24052896, "step": 19780 }, { "epoch": 2.479012655055757, "grad_norm": 0.054733045399188995, "learning_rate": 9.982545450455136e-06, "loss": 0.4618, "num_input_tokens_seen": 24058688, "step": 19785 }, { "epoch": 2.479639142964541, "grad_norm": 0.127854123711586, "learning_rate": 9.982499778631583e-06, "loss": 0.4633, "num_input_tokens_seen": 24064800, "step": 19790 }, { "epoch": 2.480265630873324, "grad_norm": 0.1720288246870041, "learning_rate": 9.982454047238056e-06, "loss": 0.4539, "num_input_tokens_seen": 24071328, "step": 19795 }, { "epoch": 2.4808921187821076, "grad_norm": 0.13067203760147095, "learning_rate": 9.9824082562751e-06, "loss": 0.4634, "num_input_tokens_seen": 24077312, "step": 19800 }, { "epoch": 2.4815186066908907, "grad_norm": 0.1831950545310974, "learning_rate": 9.982362405743265e-06, "loss": 0.4528, "num_input_tokens_seen": 24083456, "step": 19805 }, { "epoch": 2.4821450945996744, "grad_norm": 0.13136076927185059, "learning_rate": 9.982316495643099e-06, "loss": 0.4571, "num_input_tokens_seen": 24089376, "step": 19810 }, { "epoch": 2.4827715825084575, "grad_norm": 0.11950468271970749, "learning_rate": 9.982270525975148e-06, "loss": 0.4612, "num_input_tokens_seen": 24095296, "step": 19815 }, { "epoch": 2.483398070417241, "grad_norm": 0.1397198587656021, "learning_rate": 9.982224496739968e-06, "loss": 0.4479, "num_input_tokens_seen": 24101568, "step": 19820 }, { "epoch": 2.4840245583260243, "grad_norm": 0.22008833289146423, "learning_rate": 9.982178407938102e-06, "loss": 0.4709, "num_input_tokens_seen": 24107456, "step": 19825 }, { "epoch": 2.4846510462348075, "grad_norm": 0.2245652675628662, "learning_rate": 9.982132259570106e-06, "loss": 0.4547, "num_input_tokens_seen": 24113632, "step": 19830 }, { "epoch": 2.485277534143591, "grad_norm": 0.28687915205955505, "learning_rate": 9.98208605163653e-06, "loss": 0.4594, "num_input_tokens_seen": 24119520, "step": 19835 }, { "epoch": 2.4859040220523743, "grad_norm": 0.20364245772361755, "learning_rate": 9.982039784137926e-06, "loss": 0.4647, "num_input_tokens_seen": 24125536, "step": 19840 }, { "epoch": 2.486530509961158, "grad_norm": 0.07331468164920807, "learning_rate": 9.981993457074848e-06, "loss": 0.4547, "num_input_tokens_seen": 24131776, "step": 19845 }, { "epoch": 2.487156997869941, "grad_norm": 0.22511020302772522, "learning_rate": 9.981947070447847e-06, "loss": 0.4656, "num_input_tokens_seen": 24138144, "step": 19850 }, { "epoch": 2.4877834857787247, "grad_norm": 0.2078687846660614, "learning_rate": 9.981900624257485e-06, "loss": 0.4736, "num_input_tokens_seen": 24144352, "step": 19855 }, { "epoch": 2.488409973687508, "grad_norm": 0.05816151201725006, "learning_rate": 9.98185411850431e-06, "loss": 0.4697, "num_input_tokens_seen": 24150368, "step": 19860 }, { "epoch": 2.4890364615962914, "grad_norm": 0.1692294180393219, "learning_rate": 9.98180755318888e-06, "loss": 0.471, "num_input_tokens_seen": 24156544, "step": 19865 }, { "epoch": 2.4896629495050746, "grad_norm": 0.12568821012973785, "learning_rate": 9.981760928311753e-06, "loss": 0.4692, "num_input_tokens_seen": 24162720, "step": 19870 }, { "epoch": 2.4902894374138578, "grad_norm": 0.12064064294099808, "learning_rate": 9.981714243873486e-06, "loss": 0.4515, "num_input_tokens_seen": 24168704, "step": 19875 }, { "epoch": 2.4909159253226414, "grad_norm": 0.08951077610254288, "learning_rate": 9.981667499874637e-06, "loss": 0.4645, "num_input_tokens_seen": 24174880, "step": 19880 }, { "epoch": 2.4915424132314246, "grad_norm": 0.160641148686409, "learning_rate": 9.981620696315763e-06, "loss": 0.4582, "num_input_tokens_seen": 24180544, "step": 19885 }, { "epoch": 2.492168901140208, "grad_norm": 0.11066853255033493, "learning_rate": 9.981573833197427e-06, "loss": 0.4578, "num_input_tokens_seen": 24186560, "step": 19890 }, { "epoch": 2.4927953890489913, "grad_norm": 0.14908570051193237, "learning_rate": 9.981526910520188e-06, "loss": 0.4644, "num_input_tokens_seen": 24192352, "step": 19895 }, { "epoch": 2.4934218769577745, "grad_norm": 0.14377298951148987, "learning_rate": 9.981479928284605e-06, "loss": 0.4647, "num_input_tokens_seen": 24198784, "step": 19900 }, { "epoch": 2.494048364866558, "grad_norm": 0.1272333413362503, "learning_rate": 9.981432886491242e-06, "loss": 0.4694, "num_input_tokens_seen": 24205152, "step": 19905 }, { "epoch": 2.4946748527753413, "grad_norm": 0.1702970564365387, "learning_rate": 9.981385785140661e-06, "loss": 0.4564, "num_input_tokens_seen": 24211200, "step": 19910 }, { "epoch": 2.495301340684125, "grad_norm": 0.12094590812921524, "learning_rate": 9.981338624233424e-06, "loss": 0.4605, "num_input_tokens_seen": 24217344, "step": 19915 }, { "epoch": 2.495927828592908, "grad_norm": 0.1709168404340744, "learning_rate": 9.981291403770095e-06, "loss": 0.4638, "num_input_tokens_seen": 24223232, "step": 19920 }, { "epoch": 2.4965543165016917, "grad_norm": 0.11964410543441772, "learning_rate": 9.98124412375124e-06, "loss": 0.4567, "num_input_tokens_seen": 24229440, "step": 19925 }, { "epoch": 2.497180804410475, "grad_norm": 0.16555488109588623, "learning_rate": 9.981196784177423e-06, "loss": 0.4689, "num_input_tokens_seen": 24235296, "step": 19930 }, { "epoch": 2.4978072923192585, "grad_norm": 0.26881080865859985, "learning_rate": 9.981149385049212e-06, "loss": 0.4568, "num_input_tokens_seen": 24241504, "step": 19935 }, { "epoch": 2.4984337802280416, "grad_norm": 0.20608851313591003, "learning_rate": 9.981101926367172e-06, "loss": 0.4778, "num_input_tokens_seen": 24247648, "step": 19940 }, { "epoch": 2.499060268136825, "grad_norm": 0.19719786942005157, "learning_rate": 9.98105440813187e-06, "loss": 0.4477, "num_input_tokens_seen": 24253984, "step": 19945 }, { "epoch": 2.4996867560456084, "grad_norm": 0.12193260341882706, "learning_rate": 9.981006830343874e-06, "loss": 0.4607, "num_input_tokens_seen": 24259936, "step": 19950 }, { "epoch": 2.5003132439543916, "grad_norm": 0.19322547316551208, "learning_rate": 9.980959193003754e-06, "loss": 0.4604, "num_input_tokens_seen": 24265376, "step": 19955 }, { "epoch": 2.500939731863175, "grad_norm": 0.06099037453532219, "learning_rate": 9.980911496112077e-06, "loss": 0.4625, "num_input_tokens_seen": 24271456, "step": 19960 }, { "epoch": 2.5015662197719584, "grad_norm": 0.18439336121082306, "learning_rate": 9.980863739669419e-06, "loss": 0.4638, "num_input_tokens_seen": 24277120, "step": 19965 }, { "epoch": 2.5021927076807415, "grad_norm": 0.17794568836688995, "learning_rate": 9.980815923676346e-06, "loss": 0.4675, "num_input_tokens_seen": 24283232, "step": 19970 }, { "epoch": 2.502819195589525, "grad_norm": 0.3138396739959717, "learning_rate": 9.980768048133432e-06, "loss": 0.4607, "num_input_tokens_seen": 24289216, "step": 19975 }, { "epoch": 2.5034456834983083, "grad_norm": 0.46755626797676086, "learning_rate": 9.980720113041247e-06, "loss": 0.4659, "num_input_tokens_seen": 24295264, "step": 19980 }, { "epoch": 2.504072171407092, "grad_norm": 0.277183473110199, "learning_rate": 9.980672118400365e-06, "loss": 0.4565, "num_input_tokens_seen": 24301024, "step": 19985 }, { "epoch": 2.504698659315875, "grad_norm": 0.07188200950622559, "learning_rate": 9.980624064211362e-06, "loss": 0.467, "num_input_tokens_seen": 24307008, "step": 19990 }, { "epoch": 2.5053251472246587, "grad_norm": 0.1563742607831955, "learning_rate": 9.980575950474809e-06, "loss": 0.4832, "num_input_tokens_seen": 24312320, "step": 19995 }, { "epoch": 2.505951635133442, "grad_norm": 0.16289052367210388, "learning_rate": 9.980527777191284e-06, "loss": 0.4582, "num_input_tokens_seen": 24318304, "step": 20000 }, { "epoch": 2.5065781230422255, "grad_norm": 0.1390310823917389, "learning_rate": 9.980479544361361e-06, "loss": 0.4574, "num_input_tokens_seen": 24324352, "step": 20005 }, { "epoch": 2.5072046109510087, "grad_norm": 0.16635175049304962, "learning_rate": 9.980431251985619e-06, "loss": 0.4595, "num_input_tokens_seen": 24330336, "step": 20010 }, { "epoch": 2.507831098859792, "grad_norm": 0.054279934614896774, "learning_rate": 9.980382900064634e-06, "loss": 0.4656, "num_input_tokens_seen": 24336832, "step": 20015 }, { "epoch": 2.5084575867685754, "grad_norm": 0.10948936641216278, "learning_rate": 9.980334488598983e-06, "loss": 0.4614, "num_input_tokens_seen": 24343008, "step": 20020 }, { "epoch": 2.5090840746773586, "grad_norm": 0.13782988488674164, "learning_rate": 9.980286017589247e-06, "loss": 0.4672, "num_input_tokens_seen": 24349184, "step": 20025 }, { "epoch": 2.5097105625861422, "grad_norm": 0.169255793094635, "learning_rate": 9.980237487036006e-06, "loss": 0.4571, "num_input_tokens_seen": 24355328, "step": 20030 }, { "epoch": 2.5103370504949254, "grad_norm": 0.15918205678462982, "learning_rate": 9.980188896939836e-06, "loss": 0.4786, "num_input_tokens_seen": 24361312, "step": 20035 }, { "epoch": 2.5109635384037086, "grad_norm": 0.14657631516456604, "learning_rate": 9.980140247301322e-06, "loss": 0.465, "num_input_tokens_seen": 24367808, "step": 20040 }, { "epoch": 2.511590026312492, "grad_norm": 0.09809455275535583, "learning_rate": 9.980091538121044e-06, "loss": 0.4645, "num_input_tokens_seen": 24374080, "step": 20045 }, { "epoch": 2.512216514221276, "grad_norm": 0.12268359959125519, "learning_rate": 9.980042769399585e-06, "loss": 0.4628, "num_input_tokens_seen": 24379840, "step": 20050 }, { "epoch": 2.512843002130059, "grad_norm": 0.09639797359704971, "learning_rate": 9.979993941137527e-06, "loss": 0.4633, "num_input_tokens_seen": 24385728, "step": 20055 }, { "epoch": 2.513469490038842, "grad_norm": 0.19182713329792023, "learning_rate": 9.979945053335453e-06, "loss": 0.4598, "num_input_tokens_seen": 24391488, "step": 20060 }, { "epoch": 2.5140959779476257, "grad_norm": 0.03844529390335083, "learning_rate": 9.979896105993952e-06, "loss": 0.4749, "num_input_tokens_seen": 24397632, "step": 20065 }, { "epoch": 2.514722465856409, "grad_norm": 0.09773708879947662, "learning_rate": 9.979847099113604e-06, "loss": 0.4588, "num_input_tokens_seen": 24403584, "step": 20070 }, { "epoch": 2.5153489537651925, "grad_norm": 0.12125058472156525, "learning_rate": 9.979798032694998e-06, "loss": 0.4651, "num_input_tokens_seen": 24409760, "step": 20075 }, { "epoch": 2.5159754416739757, "grad_norm": 0.03932621702551842, "learning_rate": 9.979748906738719e-06, "loss": 0.4593, "num_input_tokens_seen": 24415712, "step": 20080 }, { "epoch": 2.516601929582759, "grad_norm": 0.1135832890868187, "learning_rate": 9.979699721245357e-06, "loss": 0.4629, "num_input_tokens_seen": 24421920, "step": 20085 }, { "epoch": 2.5172284174915425, "grad_norm": 0.12541569769382477, "learning_rate": 9.979650476215496e-06, "loss": 0.4586, "num_input_tokens_seen": 24427840, "step": 20090 }, { "epoch": 2.5178549054003256, "grad_norm": 0.186288520693779, "learning_rate": 9.979601171649729e-06, "loss": 0.4593, "num_input_tokens_seen": 24433696, "step": 20095 }, { "epoch": 2.5184813933091093, "grad_norm": 0.0984216183423996, "learning_rate": 9.979551807548641e-06, "loss": 0.4629, "num_input_tokens_seen": 24440320, "step": 20100 }, { "epoch": 2.5191078812178924, "grad_norm": 0.09728843718767166, "learning_rate": 9.979502383912827e-06, "loss": 0.4627, "num_input_tokens_seen": 24445952, "step": 20105 }, { "epoch": 2.5197343691266756, "grad_norm": 0.09424148499965668, "learning_rate": 9.979452900742875e-06, "loss": 0.4693, "num_input_tokens_seen": 24452064, "step": 20110 }, { "epoch": 2.520360857035459, "grad_norm": 0.11071665585041046, "learning_rate": 9.979403358039375e-06, "loss": 0.458, "num_input_tokens_seen": 24458208, "step": 20115 }, { "epoch": 2.520987344944243, "grad_norm": 0.13140295445919037, "learning_rate": 9.979353755802923e-06, "loss": 0.4605, "num_input_tokens_seen": 24464352, "step": 20120 }, { "epoch": 2.521613832853026, "grad_norm": 0.09947854280471802, "learning_rate": 9.97930409403411e-06, "loss": 0.4562, "num_input_tokens_seen": 24470368, "step": 20125 }, { "epoch": 2.522240320761809, "grad_norm": 0.1440226435661316, "learning_rate": 9.979254372733532e-06, "loss": 0.4634, "num_input_tokens_seen": 24476480, "step": 20130 }, { "epoch": 2.5228668086705928, "grad_norm": 0.15045106410980225, "learning_rate": 9.979204591901779e-06, "loss": 0.4596, "num_input_tokens_seen": 24482432, "step": 20135 }, { "epoch": 2.523493296579376, "grad_norm": 0.11028808355331421, "learning_rate": 9.979154751539449e-06, "loss": 0.456, "num_input_tokens_seen": 24488704, "step": 20140 }, { "epoch": 2.5241197844881595, "grad_norm": 0.09346652030944824, "learning_rate": 9.979104851647139e-06, "loss": 0.4663, "num_input_tokens_seen": 24495168, "step": 20145 }, { "epoch": 2.5247462723969427, "grad_norm": 0.22924771904945374, "learning_rate": 9.979054892225443e-06, "loss": 0.4684, "num_input_tokens_seen": 24501120, "step": 20150 }, { "epoch": 2.525372760305726, "grad_norm": 0.11264832317829132, "learning_rate": 9.979004873274961e-06, "loss": 0.462, "num_input_tokens_seen": 24507488, "step": 20155 }, { "epoch": 2.5259992482145095, "grad_norm": 0.11715000122785568, "learning_rate": 9.97895479479629e-06, "loss": 0.4673, "num_input_tokens_seen": 24513728, "step": 20160 }, { "epoch": 2.5266257361232927, "grad_norm": 0.11889460682868958, "learning_rate": 9.978904656790028e-06, "loss": 0.4721, "num_input_tokens_seen": 24519936, "step": 20165 }, { "epoch": 2.5272522240320763, "grad_norm": 0.1507100611925125, "learning_rate": 9.978854459256772e-06, "loss": 0.4713, "num_input_tokens_seen": 24525184, "step": 20170 }, { "epoch": 2.5278787119408594, "grad_norm": 0.12995019555091858, "learning_rate": 9.978804202197128e-06, "loss": 0.4538, "num_input_tokens_seen": 24531584, "step": 20175 }, { "epoch": 2.528505199849643, "grad_norm": 0.13817629218101501, "learning_rate": 9.978753885611693e-06, "loss": 0.463, "num_input_tokens_seen": 24537632, "step": 20180 }, { "epoch": 2.5291316877584262, "grad_norm": 0.17015919089317322, "learning_rate": 9.978703509501068e-06, "loss": 0.4552, "num_input_tokens_seen": 24543680, "step": 20185 }, { "epoch": 2.52975817566721, "grad_norm": 0.12042445689439774, "learning_rate": 9.97865307386586e-06, "loss": 0.4627, "num_input_tokens_seen": 24550016, "step": 20190 }, { "epoch": 2.530384663575993, "grad_norm": 0.14360885322093964, "learning_rate": 9.978602578706667e-06, "loss": 0.4672, "num_input_tokens_seen": 24556000, "step": 20195 }, { "epoch": 2.531011151484776, "grad_norm": 0.10916268080472946, "learning_rate": 9.978552024024092e-06, "loss": 0.4702, "num_input_tokens_seen": 24562048, "step": 20200 }, { "epoch": 2.53163763939356, "grad_norm": 0.1158793717622757, "learning_rate": 9.978501409818745e-06, "loss": 0.4709, "num_input_tokens_seen": 24568096, "step": 20205 }, { "epoch": 2.532264127302343, "grad_norm": 0.16620899736881256, "learning_rate": 9.978450736091226e-06, "loss": 0.4672, "num_input_tokens_seen": 24574304, "step": 20210 }, { "epoch": 2.5328906152111266, "grad_norm": 0.12615427374839783, "learning_rate": 9.978400002842144e-06, "loss": 0.4592, "num_input_tokens_seen": 24580448, "step": 20215 }, { "epoch": 2.5335171031199097, "grad_norm": 0.08503129333257675, "learning_rate": 9.978349210072104e-06, "loss": 0.4618, "num_input_tokens_seen": 24586912, "step": 20220 }, { "epoch": 2.534143591028693, "grad_norm": 0.12564902007579803, "learning_rate": 9.978298357781715e-06, "loss": 0.4602, "num_input_tokens_seen": 24592800, "step": 20225 }, { "epoch": 2.5347700789374765, "grad_norm": 0.043529510498046875, "learning_rate": 9.97824744597158e-06, "loss": 0.4562, "num_input_tokens_seen": 24599264, "step": 20230 }, { "epoch": 2.53539656684626, "grad_norm": 0.10528112947940826, "learning_rate": 9.978196474642315e-06, "loss": 0.459, "num_input_tokens_seen": 24604736, "step": 20235 }, { "epoch": 2.5360230547550433, "grad_norm": 0.1545579880475998, "learning_rate": 9.978145443794525e-06, "loss": 0.4534, "num_input_tokens_seen": 24610944, "step": 20240 }, { "epoch": 2.5366495426638265, "grad_norm": 0.1803065538406372, "learning_rate": 9.97809435342882e-06, "loss": 0.4698, "num_input_tokens_seen": 24616736, "step": 20245 }, { "epoch": 2.53727603057261, "grad_norm": 0.1347932517528534, "learning_rate": 9.978043203545811e-06, "loss": 0.4719, "num_input_tokens_seen": 24623008, "step": 20250 }, { "epoch": 2.5379025184813933, "grad_norm": 0.10199129581451416, "learning_rate": 9.97799199414611e-06, "loss": 0.4752, "num_input_tokens_seen": 24628608, "step": 20255 }, { "epoch": 2.538529006390177, "grad_norm": 0.09303292632102966, "learning_rate": 9.97794072523033e-06, "loss": 0.4544, "num_input_tokens_seen": 24634240, "step": 20260 }, { "epoch": 2.53915549429896, "grad_norm": 0.10747811943292618, "learning_rate": 9.977889396799086e-06, "loss": 0.4595, "num_input_tokens_seen": 24640544, "step": 20265 }, { "epoch": 2.539781982207743, "grad_norm": 0.10580495744943619, "learning_rate": 9.977838008852985e-06, "loss": 0.4734, "num_input_tokens_seen": 24646816, "step": 20270 }, { "epoch": 2.540408470116527, "grad_norm": 0.18060320615768433, "learning_rate": 9.97778656139265e-06, "loss": 0.4607, "num_input_tokens_seen": 24652928, "step": 20275 }, { "epoch": 2.54103495802531, "grad_norm": 0.10195913910865784, "learning_rate": 9.977735054418687e-06, "loss": 0.461, "num_input_tokens_seen": 24659296, "step": 20280 }, { "epoch": 2.5416614459340936, "grad_norm": 0.11676985770463943, "learning_rate": 9.97768348793172e-06, "loss": 0.4617, "num_input_tokens_seen": 24665472, "step": 20285 }, { "epoch": 2.5422879338428768, "grad_norm": 0.17911405861377716, "learning_rate": 9.977631861932359e-06, "loss": 0.4653, "num_input_tokens_seen": 24671776, "step": 20290 }, { "epoch": 2.54291442175166, "grad_norm": 0.15905869007110596, "learning_rate": 9.977580176421224e-06, "loss": 0.4661, "num_input_tokens_seen": 24677760, "step": 20295 }, { "epoch": 2.5435409096604435, "grad_norm": 0.1100868359208107, "learning_rate": 9.977528431398936e-06, "loss": 0.4637, "num_input_tokens_seen": 24684032, "step": 20300 }, { "epoch": 2.544167397569227, "grad_norm": 0.11473122984170914, "learning_rate": 9.977476626866108e-06, "loss": 0.4649, "num_input_tokens_seen": 24690112, "step": 20305 }, { "epoch": 2.5447938854780103, "grad_norm": 0.18235039710998535, "learning_rate": 9.977424762823364e-06, "loss": 0.4606, "num_input_tokens_seen": 24696288, "step": 20310 }, { "epoch": 2.5454203733867935, "grad_norm": 0.11132939159870148, "learning_rate": 9.97737283927132e-06, "loss": 0.4681, "num_input_tokens_seen": 24702592, "step": 20315 }, { "epoch": 2.546046861295577, "grad_norm": 0.10974361002445221, "learning_rate": 9.977320856210602e-06, "loss": 0.4671, "num_input_tokens_seen": 24708928, "step": 20320 }, { "epoch": 2.5466733492043603, "grad_norm": 0.15297943353652954, "learning_rate": 9.977268813641825e-06, "loss": 0.4701, "num_input_tokens_seen": 24715200, "step": 20325 }, { "epoch": 2.547299837113144, "grad_norm": 0.16675014793872833, "learning_rate": 9.977216711565618e-06, "loss": 0.4579, "num_input_tokens_seen": 24721408, "step": 20330 }, { "epoch": 2.547926325021927, "grad_norm": 0.041963618248701096, "learning_rate": 9.977164549982599e-06, "loss": 0.4655, "num_input_tokens_seen": 24727520, "step": 20335 }, { "epoch": 2.5485528129307102, "grad_norm": 0.12230521440505981, "learning_rate": 9.977112328893392e-06, "loss": 0.4535, "num_input_tokens_seen": 24733760, "step": 20340 }, { "epoch": 2.549179300839494, "grad_norm": 0.04017561674118042, "learning_rate": 9.977060048298625e-06, "loss": 0.465, "num_input_tokens_seen": 24739840, "step": 20345 }, { "epoch": 2.549805788748277, "grad_norm": 0.18603233993053436, "learning_rate": 9.97700770819892e-06, "loss": 0.461, "num_input_tokens_seen": 24745600, "step": 20350 }, { "epoch": 2.5504322766570606, "grad_norm": 0.1372673064470291, "learning_rate": 9.976955308594902e-06, "loss": 0.4751, "num_input_tokens_seen": 24751552, "step": 20355 }, { "epoch": 2.551058764565844, "grad_norm": 0.16939228773117065, "learning_rate": 9.9769028494872e-06, "loss": 0.4582, "num_input_tokens_seen": 24757856, "step": 20360 }, { "epoch": 2.551685252474627, "grad_norm": 0.1407344788312912, "learning_rate": 9.97685033087644e-06, "loss": 0.4712, "num_input_tokens_seen": 24763936, "step": 20365 }, { "epoch": 2.5523117403834106, "grad_norm": 0.09146390855312347, "learning_rate": 9.97679775276325e-06, "loss": 0.4654, "num_input_tokens_seen": 24769920, "step": 20370 }, { "epoch": 2.552938228292194, "grad_norm": 0.10730264335870743, "learning_rate": 9.976745115148258e-06, "loss": 0.4679, "num_input_tokens_seen": 24776064, "step": 20375 }, { "epoch": 2.5535647162009774, "grad_norm": 0.08144085854291916, "learning_rate": 9.976692418032095e-06, "loss": 0.4713, "num_input_tokens_seen": 24782336, "step": 20380 }, { "epoch": 2.5541912041097605, "grad_norm": 0.25177374482154846, "learning_rate": 9.97663966141539e-06, "loss": 0.4664, "num_input_tokens_seen": 24788640, "step": 20385 }, { "epoch": 2.554817692018544, "grad_norm": 0.09653473645448685, "learning_rate": 9.976586845298773e-06, "loss": 0.4603, "num_input_tokens_seen": 24795200, "step": 20390 }, { "epoch": 2.5554441799273273, "grad_norm": 0.08054883778095245, "learning_rate": 9.976533969682876e-06, "loss": 0.466, "num_input_tokens_seen": 24801056, "step": 20395 }, { "epoch": 2.556070667836111, "grad_norm": 0.10960410535335541, "learning_rate": 9.976481034568331e-06, "loss": 0.4678, "num_input_tokens_seen": 24807200, "step": 20400 }, { "epoch": 2.556697155744894, "grad_norm": 0.0990254133939743, "learning_rate": 9.976428039955772e-06, "loss": 0.4618, "num_input_tokens_seen": 24813504, "step": 20405 }, { "epoch": 2.5573236436536773, "grad_norm": 0.10728964954614639, "learning_rate": 9.976374985845832e-06, "loss": 0.4616, "num_input_tokens_seen": 24819808, "step": 20410 }, { "epoch": 2.557950131562461, "grad_norm": 0.1253831386566162, "learning_rate": 9.976321872239146e-06, "loss": 0.4632, "num_input_tokens_seen": 24825952, "step": 20415 }, { "epoch": 2.558576619471244, "grad_norm": 0.09672669321298599, "learning_rate": 9.976268699136345e-06, "loss": 0.4612, "num_input_tokens_seen": 24831968, "step": 20420 }, { "epoch": 2.5592031073800277, "grad_norm": 0.11879108846187592, "learning_rate": 9.97621546653807e-06, "loss": 0.4699, "num_input_tokens_seen": 24838272, "step": 20425 }, { "epoch": 2.559829595288811, "grad_norm": 0.12197921425104141, "learning_rate": 9.976162174444957e-06, "loss": 0.4638, "num_input_tokens_seen": 24844320, "step": 20430 }, { "epoch": 2.5604560831975944, "grad_norm": 0.1392882615327835, "learning_rate": 9.976108822857639e-06, "loss": 0.4621, "num_input_tokens_seen": 24850144, "step": 20435 }, { "epoch": 2.5610825711063776, "grad_norm": 0.14352810382843018, "learning_rate": 9.976055411776757e-06, "loss": 0.4668, "num_input_tokens_seen": 24856160, "step": 20440 }, { "epoch": 2.561709059015161, "grad_norm": 0.09401712566614151, "learning_rate": 9.976001941202948e-06, "loss": 0.4643, "num_input_tokens_seen": 24862496, "step": 20445 }, { "epoch": 2.5623355469239444, "grad_norm": 0.07693897187709808, "learning_rate": 9.975948411136853e-06, "loss": 0.4616, "num_input_tokens_seen": 24868672, "step": 20450 }, { "epoch": 2.5629620348327276, "grad_norm": 0.08697935938835144, "learning_rate": 9.975894821579113e-06, "loss": 0.4608, "num_input_tokens_seen": 24875040, "step": 20455 }, { "epoch": 2.563588522741511, "grad_norm": 0.09615308791399002, "learning_rate": 9.975841172530365e-06, "loss": 0.4629, "num_input_tokens_seen": 24881184, "step": 20460 }, { "epoch": 2.5642150106502943, "grad_norm": 0.07645338773727417, "learning_rate": 9.975787463991252e-06, "loss": 0.4666, "num_input_tokens_seen": 24887552, "step": 20465 }, { "epoch": 2.564841498559078, "grad_norm": 0.08298476040363312, "learning_rate": 9.975733695962417e-06, "loss": 0.4661, "num_input_tokens_seen": 24892576, "step": 20470 }, { "epoch": 2.565467986467861, "grad_norm": 0.0747910737991333, "learning_rate": 9.975679868444503e-06, "loss": 0.4587, "num_input_tokens_seen": 24898624, "step": 20475 }, { "epoch": 2.5660944743766443, "grad_norm": 0.10081354528665543, "learning_rate": 9.975625981438151e-06, "loss": 0.4616, "num_input_tokens_seen": 24904480, "step": 20480 }, { "epoch": 2.566720962285428, "grad_norm": 0.10252393037080765, "learning_rate": 9.975572034944009e-06, "loss": 0.459, "num_input_tokens_seen": 24910368, "step": 20485 }, { "epoch": 2.5673474501942115, "grad_norm": 0.15975607931613922, "learning_rate": 9.97551802896272e-06, "loss": 0.4592, "num_input_tokens_seen": 24916672, "step": 20490 }, { "epoch": 2.5679739381029947, "grad_norm": 0.09028485417366028, "learning_rate": 9.97546396349493e-06, "loss": 0.4686, "num_input_tokens_seen": 24922656, "step": 20495 }, { "epoch": 2.568600426011778, "grad_norm": 0.12539604306221008, "learning_rate": 9.975409838541283e-06, "loss": 0.4671, "num_input_tokens_seen": 24928992, "step": 20500 }, { "epoch": 2.5692269139205615, "grad_norm": 0.08348177373409271, "learning_rate": 9.97535565410243e-06, "loss": 0.4563, "num_input_tokens_seen": 24935104, "step": 20505 }, { "epoch": 2.5698534018293446, "grad_norm": 0.0352632999420166, "learning_rate": 9.975301410179017e-06, "loss": 0.4619, "num_input_tokens_seen": 24941088, "step": 20510 }, { "epoch": 2.5704798897381282, "grad_norm": 0.03209543600678444, "learning_rate": 9.975247106771692e-06, "loss": 0.4729, "num_input_tokens_seen": 24947072, "step": 20515 }, { "epoch": 2.5711063776469114, "grad_norm": 0.11376255750656128, "learning_rate": 9.975192743881106e-06, "loss": 0.4633, "num_input_tokens_seen": 24953280, "step": 20520 }, { "epoch": 2.5717328655556946, "grad_norm": 0.10206210613250732, "learning_rate": 9.975138321507907e-06, "loss": 0.4535, "num_input_tokens_seen": 24959584, "step": 20525 }, { "epoch": 2.572359353464478, "grad_norm": 0.20926432311534882, "learning_rate": 9.975083839652746e-06, "loss": 0.4697, "num_input_tokens_seen": 24965696, "step": 20530 }, { "epoch": 2.5729858413732614, "grad_norm": 0.10354314744472504, "learning_rate": 9.975029298316274e-06, "loss": 0.4628, "num_input_tokens_seen": 24971904, "step": 20535 }, { "epoch": 2.573612329282045, "grad_norm": 0.15120536088943481, "learning_rate": 9.974974697499147e-06, "loss": 0.4653, "num_input_tokens_seen": 24978400, "step": 20540 }, { "epoch": 2.574238817190828, "grad_norm": 0.04127798601984978, "learning_rate": 9.974920037202012e-06, "loss": 0.4643, "num_input_tokens_seen": 24984384, "step": 20545 }, { "epoch": 2.5748653050996113, "grad_norm": 0.10904106497764587, "learning_rate": 9.974865317425527e-06, "loss": 0.4628, "num_input_tokens_seen": 24990240, "step": 20550 }, { "epoch": 2.575491793008395, "grad_norm": 0.08735624700784683, "learning_rate": 9.974810538170344e-06, "loss": 0.4631, "num_input_tokens_seen": 24996480, "step": 20555 }, { "epoch": 2.5761182809171785, "grad_norm": 0.12764906883239746, "learning_rate": 9.974755699437119e-06, "loss": 0.4611, "num_input_tokens_seen": 25001824, "step": 20560 }, { "epoch": 2.5767447688259617, "grad_norm": 0.08303480595350266, "learning_rate": 9.974700801226506e-06, "loss": 0.4673, "num_input_tokens_seen": 25007936, "step": 20565 }, { "epoch": 2.577371256734745, "grad_norm": 0.08550631254911423, "learning_rate": 9.974645843539164e-06, "loss": 0.4667, "num_input_tokens_seen": 25014208, "step": 20570 }, { "epoch": 2.5779977446435285, "grad_norm": 0.1066231057047844, "learning_rate": 9.974590826375747e-06, "loss": 0.4636, "num_input_tokens_seen": 25019968, "step": 20575 }, { "epoch": 2.5786242325523117, "grad_norm": 0.12624038755893707, "learning_rate": 9.974535749736915e-06, "loss": 0.4667, "num_input_tokens_seen": 25026080, "step": 20580 }, { "epoch": 2.5792507204610953, "grad_norm": 0.2086566835641861, "learning_rate": 9.974480613623325e-06, "loss": 0.4671, "num_input_tokens_seen": 25032192, "step": 20585 }, { "epoch": 2.5798772083698784, "grad_norm": 0.09135118126869202, "learning_rate": 9.974425418035637e-06, "loss": 0.4568, "num_input_tokens_seen": 25038144, "step": 20590 }, { "epoch": 2.5805036962786616, "grad_norm": 0.11051279306411743, "learning_rate": 9.974370162974511e-06, "loss": 0.4568, "num_input_tokens_seen": 25044704, "step": 20595 }, { "epoch": 2.581130184187445, "grad_norm": 0.15351182222366333, "learning_rate": 9.974314848440608e-06, "loss": 0.4652, "num_input_tokens_seen": 25050816, "step": 20600 }, { "epoch": 2.5817566720962284, "grad_norm": 0.1468425691127777, "learning_rate": 9.974259474434589e-06, "loss": 0.4605, "num_input_tokens_seen": 25056576, "step": 20605 }, { "epoch": 2.582383160005012, "grad_norm": 0.09770818799734116, "learning_rate": 9.974204040957114e-06, "loss": 0.4582, "num_input_tokens_seen": 25062720, "step": 20610 }, { "epoch": 2.583009647913795, "grad_norm": 0.11979695409536362, "learning_rate": 9.97414854800885e-06, "loss": 0.4608, "num_input_tokens_seen": 25068800, "step": 20615 }, { "epoch": 2.583636135822579, "grad_norm": 0.04949159547686577, "learning_rate": 9.974092995590456e-06, "loss": 0.4638, "num_input_tokens_seen": 25074784, "step": 20620 }, { "epoch": 2.584262623731362, "grad_norm": 0.18048246204853058, "learning_rate": 9.974037383702599e-06, "loss": 0.4613, "num_input_tokens_seen": 25080864, "step": 20625 }, { "epoch": 2.5848891116401456, "grad_norm": 0.09467962384223938, "learning_rate": 9.973981712345943e-06, "loss": 0.4679, "num_input_tokens_seen": 25087264, "step": 20630 }, { "epoch": 2.5855155995489287, "grad_norm": 0.11230382323265076, "learning_rate": 9.973925981521155e-06, "loss": 0.4632, "num_input_tokens_seen": 25093504, "step": 20635 }, { "epoch": 2.586142087457712, "grad_norm": 0.11563916504383087, "learning_rate": 9.9738701912289e-06, "loss": 0.461, "num_input_tokens_seen": 25099776, "step": 20640 }, { "epoch": 2.5867685753664955, "grad_norm": 0.16418901085853577, "learning_rate": 9.973814341469844e-06, "loss": 0.4666, "num_input_tokens_seen": 25106304, "step": 20645 }, { "epoch": 2.5873950632752787, "grad_norm": 0.1368062049150467, "learning_rate": 9.973758432244656e-06, "loss": 0.4671, "num_input_tokens_seen": 25112640, "step": 20650 }, { "epoch": 2.5880215511840623, "grad_norm": 0.16788466274738312, "learning_rate": 9.973702463554004e-06, "loss": 0.4651, "num_input_tokens_seen": 25118432, "step": 20655 }, { "epoch": 2.5886480390928455, "grad_norm": 0.10902775079011917, "learning_rate": 9.973646435398558e-06, "loss": 0.4562, "num_input_tokens_seen": 25124576, "step": 20660 }, { "epoch": 2.5892745270016286, "grad_norm": 0.09404098242521286, "learning_rate": 9.973590347778987e-06, "loss": 0.4664, "num_input_tokens_seen": 25130912, "step": 20665 }, { "epoch": 2.5899010149104122, "grad_norm": 0.25168684124946594, "learning_rate": 9.973534200695963e-06, "loss": 0.466, "num_input_tokens_seen": 25137056, "step": 20670 }, { "epoch": 2.590527502819196, "grad_norm": 0.15222465991973877, "learning_rate": 9.973477994150153e-06, "loss": 0.4706, "num_input_tokens_seen": 25143072, "step": 20675 }, { "epoch": 2.591153990727979, "grad_norm": 0.10184868425130844, "learning_rate": 9.973421728142235e-06, "loss": 0.4673, "num_input_tokens_seen": 25149376, "step": 20680 }, { "epoch": 2.591780478636762, "grad_norm": 0.11984571814537048, "learning_rate": 9.973365402672878e-06, "loss": 0.4596, "num_input_tokens_seen": 25155520, "step": 20685 }, { "epoch": 2.592406966545546, "grad_norm": 0.08810772001743317, "learning_rate": 9.973309017742756e-06, "loss": 0.4642, "num_input_tokens_seen": 25161632, "step": 20690 }, { "epoch": 2.593033454454329, "grad_norm": 0.14964668452739716, "learning_rate": 9.973252573352543e-06, "loss": 0.4601, "num_input_tokens_seen": 25167680, "step": 20695 }, { "epoch": 2.5936599423631126, "grad_norm": 0.1837393045425415, "learning_rate": 9.973196069502914e-06, "loss": 0.4598, "num_input_tokens_seen": 25173824, "step": 20700 }, { "epoch": 2.5942864302718958, "grad_norm": 0.0980064794421196, "learning_rate": 9.973139506194548e-06, "loss": 0.4623, "num_input_tokens_seen": 25179744, "step": 20705 }, { "epoch": 2.594912918180679, "grad_norm": 0.14469268918037415, "learning_rate": 9.973082883428113e-06, "loss": 0.4613, "num_input_tokens_seen": 25185600, "step": 20710 }, { "epoch": 2.5955394060894625, "grad_norm": 0.16246193647384644, "learning_rate": 9.973026201204295e-06, "loss": 0.4656, "num_input_tokens_seen": 25191552, "step": 20715 }, { "epoch": 2.5961658939982457, "grad_norm": 0.08503780514001846, "learning_rate": 9.972969459523764e-06, "loss": 0.4571, "num_input_tokens_seen": 25197824, "step": 20720 }, { "epoch": 2.5967923819070293, "grad_norm": 0.11543326079845428, "learning_rate": 9.972912658387206e-06, "loss": 0.4619, "num_input_tokens_seen": 25203872, "step": 20725 }, { "epoch": 2.5974188698158125, "grad_norm": 0.0938359722495079, "learning_rate": 9.972855797795294e-06, "loss": 0.4589, "num_input_tokens_seen": 25210048, "step": 20730 }, { "epoch": 2.5980453577245957, "grad_norm": 0.16397836804389954, "learning_rate": 9.97279887774871e-06, "loss": 0.4587, "num_input_tokens_seen": 25216032, "step": 20735 }, { "epoch": 2.5986718456333793, "grad_norm": 0.16868145763874054, "learning_rate": 9.972741898248134e-06, "loss": 0.464, "num_input_tokens_seen": 25222016, "step": 20740 }, { "epoch": 2.599298333542163, "grad_norm": 0.08944334089756012, "learning_rate": 9.972684859294249e-06, "loss": 0.4647, "num_input_tokens_seen": 25228128, "step": 20745 }, { "epoch": 2.599924821450946, "grad_norm": 0.19202476739883423, "learning_rate": 9.972627760887735e-06, "loss": 0.4617, "num_input_tokens_seen": 25233696, "step": 20750 }, { "epoch": 2.6005513093597292, "grad_norm": 0.09311605989933014, "learning_rate": 9.972570603029277e-06, "loss": 0.466, "num_input_tokens_seen": 25239072, "step": 20755 }, { "epoch": 2.601177797268513, "grad_norm": 0.0971500352025032, "learning_rate": 9.972513385719555e-06, "loss": 0.4644, "num_input_tokens_seen": 25245120, "step": 20760 }, { "epoch": 2.601804285177296, "grad_norm": 0.0916970744729042, "learning_rate": 9.972456108959255e-06, "loss": 0.4609, "num_input_tokens_seen": 25251392, "step": 20765 }, { "epoch": 2.6024307730860796, "grad_norm": 0.1071038544178009, "learning_rate": 9.972398772749062e-06, "loss": 0.4627, "num_input_tokens_seen": 25257728, "step": 20770 }, { "epoch": 2.603057260994863, "grad_norm": 0.13194429874420166, "learning_rate": 9.97234137708966e-06, "loss": 0.4689, "num_input_tokens_seen": 25263808, "step": 20775 }, { "epoch": 2.603683748903646, "grad_norm": 0.12808427214622498, "learning_rate": 9.972283921981737e-06, "loss": 0.4604, "num_input_tokens_seen": 25269920, "step": 20780 }, { "epoch": 2.6043102368124296, "grad_norm": 0.10490215569734573, "learning_rate": 9.97222640742598e-06, "loss": 0.4664, "num_input_tokens_seen": 25276096, "step": 20785 }, { "epoch": 2.6049367247212127, "grad_norm": 0.03756275027990341, "learning_rate": 9.972168833423074e-06, "loss": 0.4594, "num_input_tokens_seen": 25282080, "step": 20790 }, { "epoch": 2.6055632126299964, "grad_norm": 0.09584597498178482, "learning_rate": 9.972111199973712e-06, "loss": 0.4604, "num_input_tokens_seen": 25288096, "step": 20795 }, { "epoch": 2.6061897005387795, "grad_norm": 0.0866684690117836, "learning_rate": 9.972053507078578e-06, "loss": 0.4622, "num_input_tokens_seen": 25294336, "step": 20800 }, { "epoch": 2.6068161884475627, "grad_norm": 0.10381794720888138, "learning_rate": 9.971995754738365e-06, "loss": 0.4648, "num_input_tokens_seen": 25300288, "step": 20805 }, { "epoch": 2.6074426763563463, "grad_norm": 0.0920976996421814, "learning_rate": 9.971937942953763e-06, "loss": 0.4588, "num_input_tokens_seen": 25306528, "step": 20810 }, { "epoch": 2.60806916426513, "grad_norm": 0.11538825929164886, "learning_rate": 9.971880071725463e-06, "loss": 0.4687, "num_input_tokens_seen": 25312448, "step": 20815 }, { "epoch": 2.608695652173913, "grad_norm": 0.09832626581192017, "learning_rate": 9.971822141054156e-06, "loss": 0.4647, "num_input_tokens_seen": 25318848, "step": 20820 }, { "epoch": 2.6093221400826963, "grad_norm": 0.1055319532752037, "learning_rate": 9.971764150940536e-06, "loss": 0.4627, "num_input_tokens_seen": 25324640, "step": 20825 }, { "epoch": 2.60994862799148, "grad_norm": 0.09709934890270233, "learning_rate": 9.971706101385293e-06, "loss": 0.4667, "num_input_tokens_seen": 25330656, "step": 20830 }, { "epoch": 2.610575115900263, "grad_norm": 0.0952838882803917, "learning_rate": 9.971647992389124e-06, "loss": 0.4654, "num_input_tokens_seen": 25336544, "step": 20835 }, { "epoch": 2.6112016038090466, "grad_norm": 0.09552351385354996, "learning_rate": 9.971589823952725e-06, "loss": 0.4643, "num_input_tokens_seen": 25342592, "step": 20840 }, { "epoch": 2.61182809171783, "grad_norm": 0.13109616935253143, "learning_rate": 9.97153159607679e-06, "loss": 0.4627, "num_input_tokens_seen": 25348672, "step": 20845 }, { "epoch": 2.612454579626613, "grad_norm": 0.08751996606588364, "learning_rate": 9.971473308762014e-06, "loss": 0.4695, "num_input_tokens_seen": 25354912, "step": 20850 }, { "epoch": 2.6130810675353966, "grad_norm": 0.0980302020907402, "learning_rate": 9.971414962009094e-06, "loss": 0.4683, "num_input_tokens_seen": 25360672, "step": 20855 }, { "epoch": 2.6137075554441798, "grad_norm": 0.1384032666683197, "learning_rate": 9.97135655581873e-06, "loss": 0.4607, "num_input_tokens_seen": 25366816, "step": 20860 }, { "epoch": 2.6143340433529634, "grad_norm": 0.10257121175527573, "learning_rate": 9.971298090191618e-06, "loss": 0.4617, "num_input_tokens_seen": 25372992, "step": 20865 }, { "epoch": 2.6149605312617465, "grad_norm": 0.13976311683654785, "learning_rate": 9.971239565128458e-06, "loss": 0.4628, "num_input_tokens_seen": 25379232, "step": 20870 }, { "epoch": 2.61558701917053, "grad_norm": 0.04701594263315201, "learning_rate": 9.971180980629949e-06, "loss": 0.4644, "num_input_tokens_seen": 25385120, "step": 20875 }, { "epoch": 2.6162135070793133, "grad_norm": 0.08879119902849197, "learning_rate": 9.971122336696793e-06, "loss": 0.4673, "num_input_tokens_seen": 25391072, "step": 20880 }, { "epoch": 2.616839994988097, "grad_norm": 0.08747569471597672, "learning_rate": 9.971063633329688e-06, "loss": 0.4617, "num_input_tokens_seen": 25397216, "step": 20885 }, { "epoch": 2.61746648289688, "grad_norm": 0.09188994020223618, "learning_rate": 9.971004870529339e-06, "loss": 0.4641, "num_input_tokens_seen": 25402592, "step": 20890 }, { "epoch": 2.6180929708056633, "grad_norm": 0.08878976851701736, "learning_rate": 9.970946048296447e-06, "loss": 0.4602, "num_input_tokens_seen": 25408704, "step": 20895 }, { "epoch": 2.618719458714447, "grad_norm": 0.09971985220909119, "learning_rate": 9.970887166631715e-06, "loss": 0.4625, "num_input_tokens_seen": 25415136, "step": 20900 }, { "epoch": 2.61934594662323, "grad_norm": 0.16821037232875824, "learning_rate": 9.970828225535849e-06, "loss": 0.4685, "num_input_tokens_seen": 25421184, "step": 20905 }, { "epoch": 2.6199724345320137, "grad_norm": 0.11375903338193893, "learning_rate": 9.970769225009552e-06, "loss": 0.4645, "num_input_tokens_seen": 25427296, "step": 20910 }, { "epoch": 2.620598922440797, "grad_norm": 0.08774487674236298, "learning_rate": 9.97071016505353e-06, "loss": 0.4631, "num_input_tokens_seen": 25433664, "step": 20915 }, { "epoch": 2.62122541034958, "grad_norm": 0.10682674497365952, "learning_rate": 9.970651045668488e-06, "loss": 0.4631, "num_input_tokens_seen": 25439328, "step": 20920 }, { "epoch": 2.6218518982583636, "grad_norm": 0.1547805517911911, "learning_rate": 9.970591866855134e-06, "loss": 0.4578, "num_input_tokens_seen": 25445184, "step": 20925 }, { "epoch": 2.6224783861671472, "grad_norm": 0.10386286675930023, "learning_rate": 9.970532628614175e-06, "loss": 0.4688, "num_input_tokens_seen": 25450880, "step": 20930 }, { "epoch": 2.6231048740759304, "grad_norm": 0.09662698954343796, "learning_rate": 9.97047333094632e-06, "loss": 0.4632, "num_input_tokens_seen": 25456704, "step": 20935 }, { "epoch": 2.6237313619847136, "grad_norm": 0.1554943472146988, "learning_rate": 9.970413973852276e-06, "loss": 0.459, "num_input_tokens_seen": 25462432, "step": 20940 }, { "epoch": 2.624357849893497, "grad_norm": 0.12997928261756897, "learning_rate": 9.970354557332756e-06, "loss": 0.4625, "num_input_tokens_seen": 25468640, "step": 20945 }, { "epoch": 2.6249843378022804, "grad_norm": 0.04268570989370346, "learning_rate": 9.970295081388467e-06, "loss": 0.4613, "num_input_tokens_seen": 25474656, "step": 20950 }, { "epoch": 2.625610825711064, "grad_norm": 0.14589187502861023, "learning_rate": 9.970235546020122e-06, "loss": 0.4667, "num_input_tokens_seen": 25480768, "step": 20955 }, { "epoch": 2.626237313619847, "grad_norm": 0.08900418877601624, "learning_rate": 9.970175951228432e-06, "loss": 0.4576, "num_input_tokens_seen": 25487168, "step": 20960 }, { "epoch": 2.6268638015286303, "grad_norm": 0.09980294108390808, "learning_rate": 9.97011629701411e-06, "loss": 0.4696, "num_input_tokens_seen": 25493312, "step": 20965 }, { "epoch": 2.627490289437414, "grad_norm": 0.16087880730628967, "learning_rate": 9.97005658337787e-06, "loss": 0.4628, "num_input_tokens_seen": 25499264, "step": 20970 }, { "epoch": 2.628116777346197, "grad_norm": 0.09978929907083511, "learning_rate": 9.969996810320424e-06, "loss": 0.4571, "num_input_tokens_seen": 25505536, "step": 20975 }, { "epoch": 2.6287432652549807, "grad_norm": 0.1265464872121811, "learning_rate": 9.969936977842488e-06, "loss": 0.46, "num_input_tokens_seen": 25511328, "step": 20980 }, { "epoch": 2.629369753163764, "grad_norm": 0.12207645177841187, "learning_rate": 9.969877085944775e-06, "loss": 0.4709, "num_input_tokens_seen": 25517600, "step": 20985 }, { "epoch": 2.629996241072547, "grad_norm": 0.16987313330173492, "learning_rate": 9.969817134628007e-06, "loss": 0.4587, "num_input_tokens_seen": 25523712, "step": 20990 }, { "epoch": 2.6306227289813306, "grad_norm": 0.14257505536079407, "learning_rate": 9.969757123892893e-06, "loss": 0.4619, "num_input_tokens_seen": 25529952, "step": 20995 }, { "epoch": 2.6312492168901143, "grad_norm": 0.12886473536491394, "learning_rate": 9.969697053740155e-06, "loss": 0.4694, "num_input_tokens_seen": 25535872, "step": 21000 }, { "epoch": 2.6318757047988974, "grad_norm": 0.17960380017757416, "learning_rate": 9.969636924170512e-06, "loss": 0.4694, "num_input_tokens_seen": 25542016, "step": 21005 }, { "epoch": 2.6325021927076806, "grad_norm": 0.037165749818086624, "learning_rate": 9.969576735184679e-06, "loss": 0.4671, "num_input_tokens_seen": 25548224, "step": 21010 }, { "epoch": 2.633128680616464, "grad_norm": 0.1072583720088005, "learning_rate": 9.969516486783379e-06, "loss": 0.4596, "num_input_tokens_seen": 25554688, "step": 21015 }, { "epoch": 2.6337551685252474, "grad_norm": 0.1563037484884262, "learning_rate": 9.969456178967331e-06, "loss": 0.4653, "num_input_tokens_seen": 25560864, "step": 21020 }, { "epoch": 2.634381656434031, "grad_norm": 0.19530369341373444, "learning_rate": 9.969395811737258e-06, "loss": 0.4668, "num_input_tokens_seen": 25566976, "step": 21025 }, { "epoch": 2.635008144342814, "grad_norm": 0.1188831701874733, "learning_rate": 9.969335385093879e-06, "loss": 0.4605, "num_input_tokens_seen": 25573248, "step": 21030 }, { "epoch": 2.6356346322515973, "grad_norm": 0.10303925722837448, "learning_rate": 9.969274899037918e-06, "loss": 0.46, "num_input_tokens_seen": 25579648, "step": 21035 }, { "epoch": 2.636261120160381, "grad_norm": 0.09625479578971863, "learning_rate": 9.969214353570096e-06, "loss": 0.4673, "num_input_tokens_seen": 25585760, "step": 21040 }, { "epoch": 2.636887608069164, "grad_norm": 0.10015319287776947, "learning_rate": 9.96915374869114e-06, "loss": 0.4626, "num_input_tokens_seen": 25592192, "step": 21045 }, { "epoch": 2.6375140959779477, "grad_norm": 0.025759514421224594, "learning_rate": 9.969093084401773e-06, "loss": 0.4626, "num_input_tokens_seen": 25597664, "step": 21050 }, { "epoch": 2.638140583886731, "grad_norm": 0.17310293018817902, "learning_rate": 9.969032360702719e-06, "loss": 0.4576, "num_input_tokens_seen": 25603520, "step": 21055 }, { "epoch": 2.6387670717955145, "grad_norm": 0.09530290961265564, "learning_rate": 9.968971577594707e-06, "loss": 0.4567, "num_input_tokens_seen": 25609376, "step": 21060 }, { "epoch": 2.6393935597042977, "grad_norm": 0.09778880327939987, "learning_rate": 9.96891073507846e-06, "loss": 0.4622, "num_input_tokens_seen": 25615584, "step": 21065 }, { "epoch": 2.6400200476130813, "grad_norm": 0.14460545778274536, "learning_rate": 9.96884983315471e-06, "loss": 0.4628, "num_input_tokens_seen": 25621920, "step": 21070 }, { "epoch": 2.6406465355218645, "grad_norm": 0.08754619210958481, "learning_rate": 9.968788871824183e-06, "loss": 0.4662, "num_input_tokens_seen": 25628032, "step": 21075 }, { "epoch": 2.6412730234306476, "grad_norm": 0.14474475383758545, "learning_rate": 9.968727851087605e-06, "loss": 0.4629, "num_input_tokens_seen": 25634240, "step": 21080 }, { "epoch": 2.6418995113394312, "grad_norm": 0.11881259828805923, "learning_rate": 9.96866677094571e-06, "loss": 0.4663, "num_input_tokens_seen": 25640160, "step": 21085 }, { "epoch": 2.6425259992482144, "grad_norm": 0.16557016968727112, "learning_rate": 9.968605631399225e-06, "loss": 0.4571, "num_input_tokens_seen": 25646688, "step": 21090 }, { "epoch": 2.643152487156998, "grad_norm": 0.1315304934978485, "learning_rate": 9.968544432448883e-06, "loss": 0.4622, "num_input_tokens_seen": 25652864, "step": 21095 }, { "epoch": 2.643778975065781, "grad_norm": 0.16545432806015015, "learning_rate": 9.968483174095416e-06, "loss": 0.4617, "num_input_tokens_seen": 25659104, "step": 21100 }, { "epoch": 2.6444054629745644, "grad_norm": 0.1102292463183403, "learning_rate": 9.968421856339554e-06, "loss": 0.4671, "num_input_tokens_seen": 25665024, "step": 21105 }, { "epoch": 2.645031950883348, "grad_norm": 0.16345493495464325, "learning_rate": 9.96836047918203e-06, "loss": 0.4599, "num_input_tokens_seen": 25671200, "step": 21110 }, { "epoch": 2.6456584387921316, "grad_norm": 0.17552721500396729, "learning_rate": 9.968299042623582e-06, "loss": 0.4643, "num_input_tokens_seen": 25677312, "step": 21115 }, { "epoch": 2.6462849267009148, "grad_norm": 0.08816490322351456, "learning_rate": 9.96823754666494e-06, "loss": 0.4673, "num_input_tokens_seen": 25683264, "step": 21120 }, { "epoch": 2.646911414609698, "grad_norm": 0.0937870666384697, "learning_rate": 9.968175991306843e-06, "loss": 0.4644, "num_input_tokens_seen": 25689312, "step": 21125 }, { "epoch": 2.6475379025184815, "grad_norm": 0.1267489194869995, "learning_rate": 9.968114376550025e-06, "loss": 0.4615, "num_input_tokens_seen": 25695712, "step": 21130 }, { "epoch": 2.6481643904272647, "grad_norm": 0.09275488555431366, "learning_rate": 9.968052702395221e-06, "loss": 0.46, "num_input_tokens_seen": 25701664, "step": 21135 }, { "epoch": 2.6487908783360483, "grad_norm": 0.0936717540025711, "learning_rate": 9.967990968843171e-06, "loss": 0.4618, "num_input_tokens_seen": 25707360, "step": 21140 }, { "epoch": 2.6494173662448315, "grad_norm": 0.08990876376628876, "learning_rate": 9.96792917589461e-06, "loss": 0.4616, "num_input_tokens_seen": 25713600, "step": 21145 }, { "epoch": 2.6500438541536147, "grad_norm": 0.09727190434932709, "learning_rate": 9.967867323550282e-06, "loss": 0.4621, "num_input_tokens_seen": 25719456, "step": 21150 }, { "epoch": 2.6506703420623983, "grad_norm": 0.18078044056892395, "learning_rate": 9.967805411810922e-06, "loss": 0.4631, "num_input_tokens_seen": 25725600, "step": 21155 }, { "epoch": 2.6512968299711814, "grad_norm": 0.0810031145811081, "learning_rate": 9.967743440677272e-06, "loss": 0.4632, "num_input_tokens_seen": 25731712, "step": 21160 }, { "epoch": 2.651923317879965, "grad_norm": 0.08266621083021164, "learning_rate": 9.967681410150073e-06, "loss": 0.4634, "num_input_tokens_seen": 25737824, "step": 21165 }, { "epoch": 2.652549805788748, "grad_norm": 0.0367148220539093, "learning_rate": 9.967619320230065e-06, "loss": 0.4548, "num_input_tokens_seen": 25743968, "step": 21170 }, { "epoch": 2.6531762936975314, "grad_norm": 0.09301336109638214, "learning_rate": 9.967557170917991e-06, "loss": 0.465, "num_input_tokens_seen": 25750400, "step": 21175 }, { "epoch": 2.653802781606315, "grad_norm": 0.0929204672574997, "learning_rate": 9.967494962214596e-06, "loss": 0.466, "num_input_tokens_seen": 25756544, "step": 21180 }, { "epoch": 2.6544292695150986, "grad_norm": 0.13919731974601746, "learning_rate": 9.967432694120622e-06, "loss": 0.4653, "num_input_tokens_seen": 25762656, "step": 21185 }, { "epoch": 2.655055757423882, "grad_norm": 0.0882272869348526, "learning_rate": 9.967370366636812e-06, "loss": 0.4621, "num_input_tokens_seen": 25768960, "step": 21190 }, { "epoch": 2.655682245332665, "grad_norm": 0.09057953208684921, "learning_rate": 9.967307979763914e-06, "loss": 0.4648, "num_input_tokens_seen": 25774880, "step": 21195 }, { "epoch": 2.6563087332414486, "grad_norm": 0.102776899933815, "learning_rate": 9.967245533502673e-06, "loss": 0.4619, "num_input_tokens_seen": 25780704, "step": 21200 }, { "epoch": 2.6569352211502317, "grad_norm": 0.13766832649707794, "learning_rate": 9.967183027853837e-06, "loss": 0.4633, "num_input_tokens_seen": 25787008, "step": 21205 }, { "epoch": 2.6575617090590153, "grad_norm": 0.1272912174463272, "learning_rate": 9.96712046281815e-06, "loss": 0.4635, "num_input_tokens_seen": 25793024, "step": 21210 }, { "epoch": 2.6581881969677985, "grad_norm": 0.03971352428197861, "learning_rate": 9.967057838396361e-06, "loss": 0.4624, "num_input_tokens_seen": 25799360, "step": 21215 }, { "epoch": 2.6588146848765817, "grad_norm": 0.08764366060495377, "learning_rate": 9.966995154589221e-06, "loss": 0.4627, "num_input_tokens_seen": 25805376, "step": 21220 }, { "epoch": 2.6594411727853653, "grad_norm": 0.12337271124124527, "learning_rate": 9.966932411397477e-06, "loss": 0.4678, "num_input_tokens_seen": 25811776, "step": 21225 }, { "epoch": 2.6600676606941485, "grad_norm": 0.098708376288414, "learning_rate": 9.966869608821881e-06, "loss": 0.4626, "num_input_tokens_seen": 25817280, "step": 21230 }, { "epoch": 2.660694148602932, "grad_norm": 0.14731299877166748, "learning_rate": 9.966806746863182e-06, "loss": 0.4556, "num_input_tokens_seen": 25823424, "step": 21235 }, { "epoch": 2.6613206365117152, "grad_norm": 0.11123915016651154, "learning_rate": 9.966743825522134e-06, "loss": 0.4561, "num_input_tokens_seen": 25829696, "step": 21240 }, { "epoch": 2.6619471244204984, "grad_norm": 0.09554906189441681, "learning_rate": 9.966680844799485e-06, "loss": 0.4671, "num_input_tokens_seen": 25835712, "step": 21245 }, { "epoch": 2.662573612329282, "grad_norm": 0.10989314317703247, "learning_rate": 9.966617804695993e-06, "loss": 0.4675, "num_input_tokens_seen": 25842016, "step": 21250 }, { "epoch": 2.6632001002380656, "grad_norm": 0.10276056081056595, "learning_rate": 9.96655470521241e-06, "loss": 0.4683, "num_input_tokens_seen": 25848256, "step": 21255 }, { "epoch": 2.663826588146849, "grad_norm": 0.12385386228561401, "learning_rate": 9.966491546349489e-06, "loss": 0.4565, "num_input_tokens_seen": 25853632, "step": 21260 }, { "epoch": 2.664453076055632, "grad_norm": 0.11256548762321472, "learning_rate": 9.966428328107985e-06, "loss": 0.4622, "num_input_tokens_seen": 25859968, "step": 21265 }, { "epoch": 2.6650795639644156, "grad_norm": 0.03952011093497276, "learning_rate": 9.966365050488655e-06, "loss": 0.4596, "num_input_tokens_seen": 25865952, "step": 21270 }, { "epoch": 2.6657060518731988, "grad_norm": 0.10061609745025635, "learning_rate": 9.966301713492256e-06, "loss": 0.4587, "num_input_tokens_seen": 25872384, "step": 21275 }, { "epoch": 2.6663325397819824, "grad_norm": 0.14733433723449707, "learning_rate": 9.966238317119544e-06, "loss": 0.4586, "num_input_tokens_seen": 25878528, "step": 21280 }, { "epoch": 2.6669590276907655, "grad_norm": 0.08919528871774673, "learning_rate": 9.966174861371279e-06, "loss": 0.4685, "num_input_tokens_seen": 25884640, "step": 21285 }, { "epoch": 2.6675855155995487, "grad_norm": 0.08145692944526672, "learning_rate": 9.966111346248217e-06, "loss": 0.4627, "num_input_tokens_seen": 25890848, "step": 21290 }, { "epoch": 2.6682120035083323, "grad_norm": 0.08417624980211258, "learning_rate": 9.966047771751118e-06, "loss": 0.4621, "num_input_tokens_seen": 25897088, "step": 21295 }, { "epoch": 2.6688384914171155, "grad_norm": 0.04024873301386833, "learning_rate": 9.965984137880743e-06, "loss": 0.4562, "num_input_tokens_seen": 25902656, "step": 21300 }, { "epoch": 2.669464979325899, "grad_norm": 0.09782478958368301, "learning_rate": 9.965920444637854e-06, "loss": 0.4584, "num_input_tokens_seen": 25908672, "step": 21305 }, { "epoch": 2.6700914672346823, "grad_norm": 0.10229815542697906, "learning_rate": 9.965856692023208e-06, "loss": 0.4609, "num_input_tokens_seen": 25914560, "step": 21310 }, { "epoch": 2.670717955143466, "grad_norm": 0.11397653818130493, "learning_rate": 9.965792880037573e-06, "loss": 0.4668, "num_input_tokens_seen": 25920832, "step": 21315 }, { "epoch": 2.671344443052249, "grad_norm": 0.09671124815940857, "learning_rate": 9.965729008681708e-06, "loss": 0.4693, "num_input_tokens_seen": 25927008, "step": 21320 }, { "epoch": 2.6719709309610327, "grad_norm": 0.10185109078884125, "learning_rate": 9.965665077956377e-06, "loss": 0.4717, "num_input_tokens_seen": 25933120, "step": 21325 }, { "epoch": 2.672597418869816, "grad_norm": 0.10917871445417404, "learning_rate": 9.965601087862346e-06, "loss": 0.4607, "num_input_tokens_seen": 25938944, "step": 21330 }, { "epoch": 2.673223906778599, "grad_norm": 0.08076857775449753, "learning_rate": 9.965537038400379e-06, "loss": 0.4534, "num_input_tokens_seen": 25944480, "step": 21335 }, { "epoch": 2.6738503946873826, "grad_norm": 0.12348712980747223, "learning_rate": 9.965472929571242e-06, "loss": 0.4606, "num_input_tokens_seen": 25950528, "step": 21340 }, { "epoch": 2.674476882596166, "grad_norm": 0.0925460159778595, "learning_rate": 9.965408761375702e-06, "loss": 0.4601, "num_input_tokens_seen": 25956896, "step": 21345 }, { "epoch": 2.6751033705049494, "grad_norm": 0.16615881025791168, "learning_rate": 9.965344533814525e-06, "loss": 0.4648, "num_input_tokens_seen": 25963232, "step": 21350 }, { "epoch": 2.6757298584137326, "grad_norm": 0.08963603526353836, "learning_rate": 9.965280246888477e-06, "loss": 0.4604, "num_input_tokens_seen": 25969312, "step": 21355 }, { "epoch": 2.6763563463225157, "grad_norm": 0.13765540719032288, "learning_rate": 9.965215900598333e-06, "loss": 0.4622, "num_input_tokens_seen": 25975456, "step": 21360 }, { "epoch": 2.6769828342312993, "grad_norm": 0.09917911142110825, "learning_rate": 9.965151494944856e-06, "loss": 0.4679, "num_input_tokens_seen": 25981536, "step": 21365 }, { "epoch": 2.677609322140083, "grad_norm": 0.15080055594444275, "learning_rate": 9.96508702992882e-06, "loss": 0.4528, "num_input_tokens_seen": 25987680, "step": 21370 }, { "epoch": 2.678235810048866, "grad_norm": 0.044869884848594666, "learning_rate": 9.965022505550993e-06, "loss": 0.4634, "num_input_tokens_seen": 25993856, "step": 21375 }, { "epoch": 2.6788622979576493, "grad_norm": 0.09380343556404114, "learning_rate": 9.96495792181215e-06, "loss": 0.4639, "num_input_tokens_seen": 26000128, "step": 21380 }, { "epoch": 2.679488785866433, "grad_norm": 0.10539114475250244, "learning_rate": 9.964893278713057e-06, "loss": 0.4663, "num_input_tokens_seen": 26006336, "step": 21385 }, { "epoch": 2.680115273775216, "grad_norm": 0.03838184103369713, "learning_rate": 9.964828576254493e-06, "loss": 0.4587, "num_input_tokens_seen": 26012352, "step": 21390 }, { "epoch": 2.6807417616839997, "grad_norm": 0.03489040583372116, "learning_rate": 9.964763814437228e-06, "loss": 0.4598, "num_input_tokens_seen": 26018528, "step": 21395 }, { "epoch": 2.681368249592783, "grad_norm": 0.08451632410287857, "learning_rate": 9.964698993262038e-06, "loss": 0.4642, "num_input_tokens_seen": 26024480, "step": 21400 }, { "epoch": 2.681994737501566, "grad_norm": 0.09346002340316772, "learning_rate": 9.964634112729696e-06, "loss": 0.4637, "num_input_tokens_seen": 26030304, "step": 21405 }, { "epoch": 2.6826212254103496, "grad_norm": 0.13634420931339264, "learning_rate": 9.96456917284098e-06, "loss": 0.4604, "num_input_tokens_seen": 26036512, "step": 21410 }, { "epoch": 2.683247713319133, "grad_norm": 0.10053244233131409, "learning_rate": 9.964504173596667e-06, "loss": 0.4615, "num_input_tokens_seen": 26042720, "step": 21415 }, { "epoch": 2.6838742012279164, "grad_norm": 0.039700672030448914, "learning_rate": 9.964439114997532e-06, "loss": 0.4587, "num_input_tokens_seen": 26048640, "step": 21420 }, { "epoch": 2.6845006891366996, "grad_norm": 0.11561830341815948, "learning_rate": 9.964373997044354e-06, "loss": 0.4586, "num_input_tokens_seen": 26054016, "step": 21425 }, { "epoch": 2.6851271770454828, "grad_norm": 0.0934528112411499, "learning_rate": 9.964308819737911e-06, "loss": 0.4599, "num_input_tokens_seen": 26060160, "step": 21430 }, { "epoch": 2.6857536649542664, "grad_norm": 0.13665641844272614, "learning_rate": 9.964243583078981e-06, "loss": 0.4692, "num_input_tokens_seen": 26066496, "step": 21435 }, { "epoch": 2.68638015286305, "grad_norm": 0.09944740682840347, "learning_rate": 9.964178287068344e-06, "loss": 0.4579, "num_input_tokens_seen": 26072672, "step": 21440 }, { "epoch": 2.687006640771833, "grad_norm": 0.09264646470546722, "learning_rate": 9.964112931706785e-06, "loss": 0.4599, "num_input_tokens_seen": 26078528, "step": 21445 }, { "epoch": 2.6876331286806163, "grad_norm": 0.0856647863984108, "learning_rate": 9.96404751699508e-06, "loss": 0.4551, "num_input_tokens_seen": 26084832, "step": 21450 }, { "epoch": 2.6882596165894, "grad_norm": 0.0962395891547203, "learning_rate": 9.963982042934015e-06, "loss": 0.4563, "num_input_tokens_seen": 26090176, "step": 21455 }, { "epoch": 2.688886104498183, "grad_norm": 0.10210639238357544, "learning_rate": 9.963916509524372e-06, "loss": 0.4749, "num_input_tokens_seen": 26096288, "step": 21460 }, { "epoch": 2.6895125924069667, "grad_norm": 0.13505885004997253, "learning_rate": 9.963850916766931e-06, "loss": 0.4623, "num_input_tokens_seen": 26102144, "step": 21465 }, { "epoch": 2.69013908031575, "grad_norm": 0.10520809888839722, "learning_rate": 9.963785264662482e-06, "loss": 0.4657, "num_input_tokens_seen": 26108160, "step": 21470 }, { "epoch": 2.690765568224533, "grad_norm": 0.14407485723495483, "learning_rate": 9.963719553211803e-06, "loss": 0.4735, "num_input_tokens_seen": 26114560, "step": 21475 }, { "epoch": 2.6913920561333167, "grad_norm": 0.14169375598430634, "learning_rate": 9.963653782415686e-06, "loss": 0.4681, "num_input_tokens_seen": 26120672, "step": 21480 }, { "epoch": 2.6920185440421, "grad_norm": 0.08831450343132019, "learning_rate": 9.963587952274916e-06, "loss": 0.4706, "num_input_tokens_seen": 26126720, "step": 21485 }, { "epoch": 2.6926450319508834, "grad_norm": 0.05226457118988037, "learning_rate": 9.963522062790277e-06, "loss": 0.4665, "num_input_tokens_seen": 26132416, "step": 21490 }, { "epoch": 2.6932715198596666, "grad_norm": 0.12232279032468796, "learning_rate": 9.963456113962559e-06, "loss": 0.4543, "num_input_tokens_seen": 26138272, "step": 21495 }, { "epoch": 2.6938980077684502, "grad_norm": 0.1995241492986679, "learning_rate": 9.96339010579255e-06, "loss": 0.4674, "num_input_tokens_seen": 26144448, "step": 21500 }, { "epoch": 2.6945244956772334, "grad_norm": 0.21145692467689514, "learning_rate": 9.96332403828104e-06, "loss": 0.456, "num_input_tokens_seen": 26150848, "step": 21505 }, { "epoch": 2.695150983586017, "grad_norm": 0.14688530564308167, "learning_rate": 9.963257911428816e-06, "loss": 0.4572, "num_input_tokens_seen": 26157408, "step": 21510 }, { "epoch": 2.6957774714948, "grad_norm": 0.16983579099178314, "learning_rate": 9.963191725236672e-06, "loss": 0.4638, "num_input_tokens_seen": 26163680, "step": 21515 }, { "epoch": 2.6964039594035834, "grad_norm": 0.10751243680715561, "learning_rate": 9.963125479705398e-06, "loss": 0.4618, "num_input_tokens_seen": 26169728, "step": 21520 }, { "epoch": 2.697030447312367, "grad_norm": 0.10585718601942062, "learning_rate": 9.963059174835787e-06, "loss": 0.4655, "num_input_tokens_seen": 26175296, "step": 21525 }, { "epoch": 2.69765693522115, "grad_norm": 0.16046194732189178, "learning_rate": 9.96299281062863e-06, "loss": 0.4572, "num_input_tokens_seen": 26181312, "step": 21530 }, { "epoch": 2.6982834231299337, "grad_norm": 0.22009047865867615, "learning_rate": 9.962926387084723e-06, "loss": 0.4597, "num_input_tokens_seen": 26187488, "step": 21535 }, { "epoch": 2.698909911038717, "grad_norm": 0.328232079744339, "learning_rate": 9.962859904204858e-06, "loss": 0.451, "num_input_tokens_seen": 26193504, "step": 21540 }, { "epoch": 2.6995363989475, "grad_norm": 0.2566068768501282, "learning_rate": 9.962793361989828e-06, "loss": 0.4573, "num_input_tokens_seen": 26199904, "step": 21545 }, { "epoch": 2.7001628868562837, "grad_norm": 0.19978705048561096, "learning_rate": 9.962726760440433e-06, "loss": 0.4589, "num_input_tokens_seen": 26205888, "step": 21550 }, { "epoch": 2.7007893747650673, "grad_norm": 0.29080548882484436, "learning_rate": 9.962660099557467e-06, "loss": 0.473, "num_input_tokens_seen": 26212128, "step": 21555 }, { "epoch": 2.7014158626738505, "grad_norm": 0.1210251972079277, "learning_rate": 9.962593379341724e-06, "loss": 0.4748, "num_input_tokens_seen": 26218368, "step": 21560 }, { "epoch": 2.7020423505826336, "grad_norm": 0.15289151668548584, "learning_rate": 9.962526599794009e-06, "loss": 0.4746, "num_input_tokens_seen": 26224320, "step": 21565 }, { "epoch": 2.7026688384914173, "grad_norm": 0.10633192211389542, "learning_rate": 9.962459760915115e-06, "loss": 0.4547, "num_input_tokens_seen": 26230528, "step": 21570 }, { "epoch": 2.7032953264002004, "grad_norm": 0.09982770681381226, "learning_rate": 9.962392862705841e-06, "loss": 0.4544, "num_input_tokens_seen": 26236608, "step": 21575 }, { "epoch": 2.703921814308984, "grad_norm": 0.04400720074772835, "learning_rate": 9.96232590516699e-06, "loss": 0.4577, "num_input_tokens_seen": 26242880, "step": 21580 }, { "epoch": 2.704548302217767, "grad_norm": 0.1275578737258911, "learning_rate": 9.96225888829936e-06, "loss": 0.4768, "num_input_tokens_seen": 26248960, "step": 21585 }, { "epoch": 2.7051747901265504, "grad_norm": 0.04448442533612251, "learning_rate": 9.962191812103753e-06, "loss": 0.4614, "num_input_tokens_seen": 26255200, "step": 21590 }, { "epoch": 2.705801278035334, "grad_norm": 0.10242116451263428, "learning_rate": 9.962124676580971e-06, "loss": 0.4633, "num_input_tokens_seen": 26261504, "step": 21595 }, { "epoch": 2.706427765944117, "grad_norm": 0.08263784646987915, "learning_rate": 9.962057481731816e-06, "loss": 0.4627, "num_input_tokens_seen": 26267424, "step": 21600 }, { "epoch": 2.7070542538529008, "grad_norm": 0.08723168075084686, "learning_rate": 9.961990227557092e-06, "loss": 0.4742, "num_input_tokens_seen": 26273568, "step": 21605 }, { "epoch": 2.707680741761684, "grad_norm": 0.09954213351011276, "learning_rate": 9.961922914057603e-06, "loss": 0.4638, "num_input_tokens_seen": 26279840, "step": 21610 }, { "epoch": 2.708307229670467, "grad_norm": 0.098641037940979, "learning_rate": 9.961855541234154e-06, "loss": 0.4575, "num_input_tokens_seen": 26285952, "step": 21615 }, { "epoch": 2.7089337175792507, "grad_norm": 0.12641744315624237, "learning_rate": 9.961788109087551e-06, "loss": 0.464, "num_input_tokens_seen": 26292064, "step": 21620 }, { "epoch": 2.7095602054880343, "grad_norm": 0.09771227091550827, "learning_rate": 9.9617206176186e-06, "loss": 0.4644, "num_input_tokens_seen": 26297792, "step": 21625 }, { "epoch": 2.7101866933968175, "grad_norm": 0.08820661157369614, "learning_rate": 9.961653066828106e-06, "loss": 0.465, "num_input_tokens_seen": 26303872, "step": 21630 }, { "epoch": 2.7108131813056007, "grad_norm": 0.10424107313156128, "learning_rate": 9.961585456716877e-06, "loss": 0.4649, "num_input_tokens_seen": 26310080, "step": 21635 }, { "epoch": 2.7114396692143843, "grad_norm": 0.20196771621704102, "learning_rate": 9.961517787285724e-06, "loss": 0.4648, "num_input_tokens_seen": 26316128, "step": 21640 }, { "epoch": 2.7120661571231675, "grad_norm": 0.18281744420528412, "learning_rate": 9.961450058535455e-06, "loss": 0.4694, "num_input_tokens_seen": 26322176, "step": 21645 }, { "epoch": 2.712692645031951, "grad_norm": 0.21506436169147491, "learning_rate": 9.961382270466876e-06, "loss": 0.4578, "num_input_tokens_seen": 26328128, "step": 21650 }, { "epoch": 2.7133191329407342, "grad_norm": 0.10887658596038818, "learning_rate": 9.961314423080806e-06, "loss": 0.4614, "num_input_tokens_seen": 26334688, "step": 21655 }, { "epoch": 2.7139456208495174, "grad_norm": 0.12790104746818542, "learning_rate": 9.961246516378047e-06, "loss": 0.4572, "num_input_tokens_seen": 26340544, "step": 21660 }, { "epoch": 2.714572108758301, "grad_norm": 0.15993477404117584, "learning_rate": 9.961178550359415e-06, "loss": 0.4582, "num_input_tokens_seen": 26346720, "step": 21665 }, { "epoch": 2.715198596667084, "grad_norm": 0.20682911574840546, "learning_rate": 9.961110525025723e-06, "loss": 0.4613, "num_input_tokens_seen": 26352800, "step": 21670 }, { "epoch": 2.715825084575868, "grad_norm": 0.13027839362621307, "learning_rate": 9.961042440377784e-06, "loss": 0.4634, "num_input_tokens_seen": 26358432, "step": 21675 }, { "epoch": 2.716451572484651, "grad_norm": 0.04950319603085518, "learning_rate": 9.96097429641641e-06, "loss": 0.4689, "num_input_tokens_seen": 26364544, "step": 21680 }, { "epoch": 2.717078060393434, "grad_norm": 0.08988217264413834, "learning_rate": 9.960906093142419e-06, "loss": 0.4615, "num_input_tokens_seen": 26370400, "step": 21685 }, { "epoch": 2.7177045483022177, "grad_norm": 0.17152725160121918, "learning_rate": 9.960837830556624e-06, "loss": 0.4617, "num_input_tokens_seen": 26376256, "step": 21690 }, { "epoch": 2.7183310362110014, "grad_norm": 0.10038367658853531, "learning_rate": 9.960769508659844e-06, "loss": 0.4617, "num_input_tokens_seen": 26382368, "step": 21695 }, { "epoch": 2.7189575241197845, "grad_norm": 0.08941870927810669, "learning_rate": 9.96070112745289e-06, "loss": 0.4586, "num_input_tokens_seen": 26388352, "step": 21700 }, { "epoch": 2.7195840120285677, "grad_norm": 0.08472678810358047, "learning_rate": 9.960632686936587e-06, "loss": 0.472, "num_input_tokens_seen": 26394176, "step": 21705 }, { "epoch": 2.7202104999373513, "grad_norm": 0.19350090622901917, "learning_rate": 9.960564187111747e-06, "loss": 0.4681, "num_input_tokens_seen": 26400544, "step": 21710 }, { "epoch": 2.7208369878461345, "grad_norm": 0.0844283476471901, "learning_rate": 9.960495627979192e-06, "loss": 0.4608, "num_input_tokens_seen": 26406624, "step": 21715 }, { "epoch": 2.721463475754918, "grad_norm": 0.08393839746713638, "learning_rate": 9.960427009539742e-06, "loss": 0.4618, "num_input_tokens_seen": 26412768, "step": 21720 }, { "epoch": 2.7220899636637013, "grad_norm": 0.10230031609535217, "learning_rate": 9.960358331794215e-06, "loss": 0.4616, "num_input_tokens_seen": 26418752, "step": 21725 }, { "epoch": 2.7227164515724844, "grad_norm": 0.13912373781204224, "learning_rate": 9.960289594743435e-06, "loss": 0.4579, "num_input_tokens_seen": 26424992, "step": 21730 }, { "epoch": 2.723342939481268, "grad_norm": 0.15861031413078308, "learning_rate": 9.960220798388223e-06, "loss": 0.4505, "num_input_tokens_seen": 26430848, "step": 21735 }, { "epoch": 2.723969427390051, "grad_norm": 0.13280068337917328, "learning_rate": 9.960151942729399e-06, "loss": 0.4649, "num_input_tokens_seen": 26437024, "step": 21740 }, { "epoch": 2.724595915298835, "grad_norm": 0.15290674567222595, "learning_rate": 9.960083027767791e-06, "loss": 0.4646, "num_input_tokens_seen": 26443232, "step": 21745 }, { "epoch": 2.725222403207618, "grad_norm": 0.12002093344926834, "learning_rate": 9.960014053504218e-06, "loss": 0.4602, "num_input_tokens_seen": 26449344, "step": 21750 }, { "epoch": 2.7258488911164016, "grad_norm": 0.16211871802806854, "learning_rate": 9.959945019939507e-06, "loss": 0.4658, "num_input_tokens_seen": 26455744, "step": 21755 }, { "epoch": 2.7264753790251848, "grad_norm": 0.1112070232629776, "learning_rate": 9.959875927074485e-06, "loss": 0.4619, "num_input_tokens_seen": 26461408, "step": 21760 }, { "epoch": 2.7271018669339684, "grad_norm": 0.15294331312179565, "learning_rate": 9.959806774909974e-06, "loss": 0.4671, "num_input_tokens_seen": 26467488, "step": 21765 }, { "epoch": 2.7277283548427516, "grad_norm": 0.04477344825863838, "learning_rate": 9.959737563446804e-06, "loss": 0.4783, "num_input_tokens_seen": 26473792, "step": 21770 }, { "epoch": 2.7283548427515347, "grad_norm": 0.16547341644763947, "learning_rate": 9.959668292685802e-06, "loss": 0.4688, "num_input_tokens_seen": 26480032, "step": 21775 }, { "epoch": 2.7289813306603183, "grad_norm": 0.11045604944229126, "learning_rate": 9.959598962627796e-06, "loss": 0.4542, "num_input_tokens_seen": 26486336, "step": 21780 }, { "epoch": 2.7296078185691015, "grad_norm": 0.11715541034936905, "learning_rate": 9.959529573273613e-06, "loss": 0.4606, "num_input_tokens_seen": 26491648, "step": 21785 }, { "epoch": 2.730234306477885, "grad_norm": 0.1829223483800888, "learning_rate": 9.959460124624085e-06, "loss": 0.4619, "num_input_tokens_seen": 26497792, "step": 21790 }, { "epoch": 2.7308607943866683, "grad_norm": 0.04967295750975609, "learning_rate": 9.959390616680043e-06, "loss": 0.4662, "num_input_tokens_seen": 26503712, "step": 21795 }, { "epoch": 2.7314872822954515, "grad_norm": 0.044159792363643646, "learning_rate": 9.959321049442315e-06, "loss": 0.4593, "num_input_tokens_seen": 26510048, "step": 21800 }, { "epoch": 2.732113770204235, "grad_norm": 0.1650070995092392, "learning_rate": 9.959251422911733e-06, "loss": 0.4623, "num_input_tokens_seen": 26516480, "step": 21805 }, { "epoch": 2.7327402581130187, "grad_norm": 0.12484236061573029, "learning_rate": 9.959181737089132e-06, "loss": 0.4638, "num_input_tokens_seen": 26522624, "step": 21810 }, { "epoch": 2.733366746021802, "grad_norm": 0.31397560238838196, "learning_rate": 9.959111991975346e-06, "loss": 0.465, "num_input_tokens_seen": 26528640, "step": 21815 }, { "epoch": 2.733993233930585, "grad_norm": 0.12770076096057892, "learning_rate": 9.959042187571205e-06, "loss": 0.4639, "num_input_tokens_seen": 26534848, "step": 21820 }, { "epoch": 2.7346197218393686, "grad_norm": 0.11159274727106094, "learning_rate": 9.958972323877544e-06, "loss": 0.4718, "num_input_tokens_seen": 26540992, "step": 21825 }, { "epoch": 2.735246209748152, "grad_norm": 0.04642574116587639, "learning_rate": 9.958902400895201e-06, "loss": 0.4598, "num_input_tokens_seen": 26547200, "step": 21830 }, { "epoch": 2.7358726976569354, "grad_norm": 0.10905222594738007, "learning_rate": 9.95883241862501e-06, "loss": 0.478, "num_input_tokens_seen": 26553248, "step": 21835 }, { "epoch": 2.7364991855657186, "grad_norm": 0.09341595321893692, "learning_rate": 9.958762377067807e-06, "loss": 0.4633, "num_input_tokens_seen": 26559648, "step": 21840 }, { "epoch": 2.7371256734745018, "grad_norm": 0.09202532470226288, "learning_rate": 9.958692276224434e-06, "loss": 0.4528, "num_input_tokens_seen": 26565888, "step": 21845 }, { "epoch": 2.7377521613832854, "grad_norm": 0.17342256009578705, "learning_rate": 9.958622116095725e-06, "loss": 0.4679, "num_input_tokens_seen": 26572160, "step": 21850 }, { "epoch": 2.7383786492920685, "grad_norm": 0.1480560451745987, "learning_rate": 9.958551896682518e-06, "loss": 0.4601, "num_input_tokens_seen": 26578208, "step": 21855 }, { "epoch": 2.739005137200852, "grad_norm": 0.08492688834667206, "learning_rate": 9.958481617985654e-06, "loss": 0.461, "num_input_tokens_seen": 26584096, "step": 21860 }, { "epoch": 2.7396316251096353, "grad_norm": 0.084974505007267, "learning_rate": 9.958411280005975e-06, "loss": 0.4701, "num_input_tokens_seen": 26590208, "step": 21865 }, { "epoch": 2.7402581130184185, "grad_norm": 0.24573035538196564, "learning_rate": 9.958340882744319e-06, "loss": 0.4648, "num_input_tokens_seen": 26596352, "step": 21870 }, { "epoch": 2.740884600927202, "grad_norm": 0.23050054907798767, "learning_rate": 9.95827042620153e-06, "loss": 0.4691, "num_input_tokens_seen": 26602688, "step": 21875 }, { "epoch": 2.7415110888359857, "grad_norm": 0.09413480013608932, "learning_rate": 9.958199910378448e-06, "loss": 0.4572, "num_input_tokens_seen": 26608896, "step": 21880 }, { "epoch": 2.742137576744769, "grad_norm": 0.08940903097391129, "learning_rate": 9.958129335275918e-06, "loss": 0.4681, "num_input_tokens_seen": 26614752, "step": 21885 }, { "epoch": 2.742764064653552, "grad_norm": 0.07752977311611176, "learning_rate": 9.958058700894784e-06, "loss": 0.4611, "num_input_tokens_seen": 26620736, "step": 21890 }, { "epoch": 2.7433905525623357, "grad_norm": 0.0791444480419159, "learning_rate": 9.95798800723589e-06, "loss": 0.4596, "num_input_tokens_seen": 26626432, "step": 21895 }, { "epoch": 2.744017040471119, "grad_norm": 0.1426590532064438, "learning_rate": 9.95791725430008e-06, "loss": 0.4628, "num_input_tokens_seen": 26632576, "step": 21900 }, { "epoch": 2.7446435283799024, "grad_norm": 0.15348881483078003, "learning_rate": 9.957846442088202e-06, "loss": 0.4666, "num_input_tokens_seen": 26638880, "step": 21905 }, { "epoch": 2.7452700162886856, "grad_norm": 0.07753526419401169, "learning_rate": 9.9577755706011e-06, "loss": 0.4647, "num_input_tokens_seen": 26644992, "step": 21910 }, { "epoch": 2.745896504197469, "grad_norm": 0.03395268693566322, "learning_rate": 9.957704639839622e-06, "loss": 0.4636, "num_input_tokens_seen": 26650848, "step": 21915 }, { "epoch": 2.7465229921062524, "grad_norm": 0.09460121393203735, "learning_rate": 9.957633649804618e-06, "loss": 0.4636, "num_input_tokens_seen": 26656992, "step": 21920 }, { "epoch": 2.7471494800150356, "grad_norm": 0.036048922687768936, "learning_rate": 9.957562600496937e-06, "loss": 0.4628, "num_input_tokens_seen": 26663232, "step": 21925 }, { "epoch": 2.747775967923819, "grad_norm": 0.08753052353858948, "learning_rate": 9.957491491917426e-06, "loss": 0.4614, "num_input_tokens_seen": 26669216, "step": 21930 }, { "epoch": 2.7484024558326023, "grad_norm": 0.07905711233615875, "learning_rate": 9.957420324066935e-06, "loss": 0.462, "num_input_tokens_seen": 26674400, "step": 21935 }, { "epoch": 2.749028943741386, "grad_norm": 0.09232733398675919, "learning_rate": 9.957349096946318e-06, "loss": 0.4648, "num_input_tokens_seen": 26680512, "step": 21940 }, { "epoch": 2.749655431650169, "grad_norm": 0.10157494992017746, "learning_rate": 9.957277810556425e-06, "loss": 0.4605, "num_input_tokens_seen": 26686880, "step": 21945 }, { "epoch": 2.7502819195589527, "grad_norm": 0.08241080492734909, "learning_rate": 9.957206464898106e-06, "loss": 0.469, "num_input_tokens_seen": 26693248, "step": 21950 }, { "epoch": 2.750908407467736, "grad_norm": 0.10201413184404373, "learning_rate": 9.957135059972215e-06, "loss": 0.4617, "num_input_tokens_seen": 26699552, "step": 21955 }, { "epoch": 2.751534895376519, "grad_norm": 0.13276119530200958, "learning_rate": 9.957063595779608e-06, "loss": 0.4633, "num_input_tokens_seen": 26706112, "step": 21960 }, { "epoch": 2.7521613832853027, "grad_norm": 0.07781802117824554, "learning_rate": 9.95699207232114e-06, "loss": 0.464, "num_input_tokens_seen": 26712256, "step": 21965 }, { "epoch": 2.752787871194086, "grad_norm": 0.15055404603481293, "learning_rate": 9.956920489597662e-06, "loss": 0.4596, "num_input_tokens_seen": 26718240, "step": 21970 }, { "epoch": 2.7534143591028695, "grad_norm": 0.09849751740694046, "learning_rate": 9.956848847610031e-06, "loss": 0.4628, "num_input_tokens_seen": 26724320, "step": 21975 }, { "epoch": 2.7540408470116526, "grad_norm": 0.09441318362951279, "learning_rate": 9.956777146359107e-06, "loss": 0.4634, "num_input_tokens_seen": 26730528, "step": 21980 }, { "epoch": 2.754667334920436, "grad_norm": 0.08293602615594864, "learning_rate": 9.956705385845743e-06, "loss": 0.4634, "num_input_tokens_seen": 26735840, "step": 21985 }, { "epoch": 2.7552938228292194, "grad_norm": 0.09035439789295197, "learning_rate": 9.9566335660708e-06, "loss": 0.4654, "num_input_tokens_seen": 26742304, "step": 21990 }, { "epoch": 2.755920310738003, "grad_norm": 0.11989076435565948, "learning_rate": 9.956561687035134e-06, "loss": 0.4612, "num_input_tokens_seen": 26748704, "step": 21995 }, { "epoch": 2.756546798646786, "grad_norm": 0.1018831729888916, "learning_rate": 9.956489748739605e-06, "loss": 0.4677, "num_input_tokens_seen": 26754816, "step": 22000 }, { "epoch": 2.7571732865555694, "grad_norm": 0.08100784569978714, "learning_rate": 9.956417751185077e-06, "loss": 0.4661, "num_input_tokens_seen": 26760896, "step": 22005 }, { "epoch": 2.757799774464353, "grad_norm": 0.09932482987642288, "learning_rate": 9.956345694372407e-06, "loss": 0.4598, "num_input_tokens_seen": 26767008, "step": 22010 }, { "epoch": 2.758426262373136, "grad_norm": 0.07942304760217667, "learning_rate": 9.956273578302456e-06, "loss": 0.4615, "num_input_tokens_seen": 26772800, "step": 22015 }, { "epoch": 2.7590527502819198, "grad_norm": 0.09415001422166824, "learning_rate": 9.956201402976086e-06, "loss": 0.4587, "num_input_tokens_seen": 26778944, "step": 22020 }, { "epoch": 2.759679238190703, "grad_norm": 0.13389725983142853, "learning_rate": 9.956129168394164e-06, "loss": 0.4623, "num_input_tokens_seen": 26785120, "step": 22025 }, { "epoch": 2.760305726099486, "grad_norm": 0.10118948668241501, "learning_rate": 9.95605687455755e-06, "loss": 0.4624, "num_input_tokens_seen": 26791296, "step": 22030 }, { "epoch": 2.7609322140082697, "grad_norm": 0.07782663404941559, "learning_rate": 9.955984521467108e-06, "loss": 0.4536, "num_input_tokens_seen": 26797632, "step": 22035 }, { "epoch": 2.761558701917053, "grad_norm": 0.16610004007816315, "learning_rate": 9.955912109123707e-06, "loss": 0.4688, "num_input_tokens_seen": 26803616, "step": 22040 }, { "epoch": 2.7621851898258365, "grad_norm": 0.13861390948295593, "learning_rate": 9.955839637528208e-06, "loss": 0.4652, "num_input_tokens_seen": 26809312, "step": 22045 }, { "epoch": 2.7628116777346197, "grad_norm": 0.08377324044704437, "learning_rate": 9.955767106681479e-06, "loss": 0.4597, "num_input_tokens_seen": 26814336, "step": 22050 }, { "epoch": 2.763438165643403, "grad_norm": 0.09020067006349564, "learning_rate": 9.955694516584389e-06, "loss": 0.461, "num_input_tokens_seen": 26820448, "step": 22055 }, { "epoch": 2.7640646535521864, "grad_norm": 0.1087024137377739, "learning_rate": 9.955621867237804e-06, "loss": 0.4633, "num_input_tokens_seen": 26826464, "step": 22060 }, { "epoch": 2.76469114146097, "grad_norm": 0.08759602904319763, "learning_rate": 9.955549158642594e-06, "loss": 0.4576, "num_input_tokens_seen": 26832736, "step": 22065 }, { "epoch": 2.7653176293697532, "grad_norm": 0.14386385679244995, "learning_rate": 9.955476390799625e-06, "loss": 0.4735, "num_input_tokens_seen": 26838752, "step": 22070 }, { "epoch": 2.7659441172785364, "grad_norm": 0.08994284272193909, "learning_rate": 9.955403563709772e-06, "loss": 0.4674, "num_input_tokens_seen": 26845088, "step": 22075 }, { "epoch": 2.76657060518732, "grad_norm": 0.11504022777080536, "learning_rate": 9.955330677373903e-06, "loss": 0.4672, "num_input_tokens_seen": 26851200, "step": 22080 }, { "epoch": 2.767197093096103, "grad_norm": 0.03350165858864784, "learning_rate": 9.955257731792888e-06, "loss": 0.4661, "num_input_tokens_seen": 26857504, "step": 22085 }, { "epoch": 2.767823581004887, "grad_norm": 0.0970810055732727, "learning_rate": 9.955184726967601e-06, "loss": 0.4538, "num_input_tokens_seen": 26863648, "step": 22090 }, { "epoch": 2.76845006891367, "grad_norm": 0.10165343433618546, "learning_rate": 9.955111662898915e-06, "loss": 0.4663, "num_input_tokens_seen": 26869568, "step": 22095 }, { "epoch": 2.769076556822453, "grad_norm": 0.09151492267847061, "learning_rate": 9.955038539587702e-06, "loss": 0.4631, "num_input_tokens_seen": 26875680, "step": 22100 }, { "epoch": 2.7697030447312367, "grad_norm": 0.13250569999217987, "learning_rate": 9.95496535703484e-06, "loss": 0.4674, "num_input_tokens_seen": 26881664, "step": 22105 }, { "epoch": 2.77032953264002, "grad_norm": 0.07902473956346512, "learning_rate": 9.954892115241199e-06, "loss": 0.4531, "num_input_tokens_seen": 26887872, "step": 22110 }, { "epoch": 2.7709560205488035, "grad_norm": 0.09175839275121689, "learning_rate": 9.954818814207657e-06, "loss": 0.4625, "num_input_tokens_seen": 26894080, "step": 22115 }, { "epoch": 2.7715825084575867, "grad_norm": 0.07527182996273041, "learning_rate": 9.954745453935092e-06, "loss": 0.4645, "num_input_tokens_seen": 26900064, "step": 22120 }, { "epoch": 2.77220899636637, "grad_norm": 0.09132906794548035, "learning_rate": 9.954672034424378e-06, "loss": 0.4687, "num_input_tokens_seen": 26906368, "step": 22125 }, { "epoch": 2.7728354842751535, "grad_norm": 0.13667401671409607, "learning_rate": 9.954598555676394e-06, "loss": 0.4635, "num_input_tokens_seen": 26912128, "step": 22130 }, { "epoch": 2.773461972183937, "grad_norm": 0.18150083720684052, "learning_rate": 9.954525017692019e-06, "loss": 0.4602, "num_input_tokens_seen": 26918400, "step": 22135 }, { "epoch": 2.7740884600927203, "grad_norm": 0.10500681400299072, "learning_rate": 9.95445142047213e-06, "loss": 0.466, "num_input_tokens_seen": 26924448, "step": 22140 }, { "epoch": 2.7747149480015034, "grad_norm": 0.10768086463212967, "learning_rate": 9.95437776401761e-06, "loss": 0.4654, "num_input_tokens_seen": 26930624, "step": 22145 }, { "epoch": 2.775341435910287, "grad_norm": 0.10292523354291916, "learning_rate": 9.95430404832934e-06, "loss": 0.4647, "num_input_tokens_seen": 26936832, "step": 22150 }, { "epoch": 2.77596792381907, "grad_norm": 0.0770641416311264, "learning_rate": 9.954230273408199e-06, "loss": 0.4532, "num_input_tokens_seen": 26943232, "step": 22155 }, { "epoch": 2.776594411727854, "grad_norm": 0.1443019062280655, "learning_rate": 9.954156439255069e-06, "loss": 0.4671, "num_input_tokens_seen": 26949344, "step": 22160 }, { "epoch": 2.777220899636637, "grad_norm": 0.1145666167140007, "learning_rate": 9.954082545870834e-06, "loss": 0.4594, "num_input_tokens_seen": 26955552, "step": 22165 }, { "epoch": 2.77784738754542, "grad_norm": 0.13433051109313965, "learning_rate": 9.954008593256377e-06, "loss": 0.4652, "num_input_tokens_seen": 26961728, "step": 22170 }, { "epoch": 2.7784738754542038, "grad_norm": 0.09133546054363251, "learning_rate": 9.953934581412583e-06, "loss": 0.4651, "num_input_tokens_seen": 26967328, "step": 22175 }, { "epoch": 2.779100363362987, "grad_norm": 0.08043175935745239, "learning_rate": 9.953860510340334e-06, "loss": 0.4703, "num_input_tokens_seen": 26972672, "step": 22180 }, { "epoch": 2.7797268512717705, "grad_norm": 0.09887619316577911, "learning_rate": 9.95378638004052e-06, "loss": 0.4624, "num_input_tokens_seen": 26978816, "step": 22185 }, { "epoch": 2.7803533391805537, "grad_norm": 0.09264972060918808, "learning_rate": 9.953712190514022e-06, "loss": 0.4624, "num_input_tokens_seen": 26984896, "step": 22190 }, { "epoch": 2.7809798270893373, "grad_norm": 0.09066178649663925, "learning_rate": 9.953637941761733e-06, "loss": 0.4592, "num_input_tokens_seen": 26990944, "step": 22195 }, { "epoch": 2.7816063149981205, "grad_norm": 0.09096504002809525, "learning_rate": 9.953563633784537e-06, "loss": 0.4645, "num_input_tokens_seen": 26997472, "step": 22200 }, { "epoch": 2.782232802906904, "grad_norm": 0.08610124886035919, "learning_rate": 9.953489266583322e-06, "loss": 0.4676, "num_input_tokens_seen": 27004096, "step": 22205 }, { "epoch": 2.7828592908156873, "grad_norm": 0.10145321488380432, "learning_rate": 9.953414840158979e-06, "loss": 0.4624, "num_input_tokens_seen": 27010368, "step": 22210 }, { "epoch": 2.7834857787244704, "grad_norm": 0.08039391040802002, "learning_rate": 9.953340354512395e-06, "loss": 0.4616, "num_input_tokens_seen": 27016416, "step": 22215 }, { "epoch": 2.784112266633254, "grad_norm": 0.07526987791061401, "learning_rate": 9.953265809644465e-06, "loss": 0.4591, "num_input_tokens_seen": 27022176, "step": 22220 }, { "epoch": 2.7847387545420372, "grad_norm": 0.17738041281700134, "learning_rate": 9.953191205556077e-06, "loss": 0.4696, "num_input_tokens_seen": 27027616, "step": 22225 }, { "epoch": 2.785365242450821, "grad_norm": 0.09333530813455582, "learning_rate": 9.953116542248123e-06, "loss": 0.4649, "num_input_tokens_seen": 27033312, "step": 22230 }, { "epoch": 2.785991730359604, "grad_norm": 0.08750540763139725, "learning_rate": 9.953041819721496e-06, "loss": 0.4591, "num_input_tokens_seen": 27039680, "step": 22235 }, { "epoch": 2.786618218268387, "grad_norm": 0.09954514354467392, "learning_rate": 9.952967037977092e-06, "loss": 0.4701, "num_input_tokens_seen": 27045824, "step": 22240 }, { "epoch": 2.787244706177171, "grad_norm": 0.08048571646213531, "learning_rate": 9.952892197015802e-06, "loss": 0.4663, "num_input_tokens_seen": 27052224, "step": 22245 }, { "epoch": 2.7878711940859544, "grad_norm": 0.08609549701213837, "learning_rate": 9.95281729683852e-06, "loss": 0.4559, "num_input_tokens_seen": 27058560, "step": 22250 }, { "epoch": 2.7884976819947376, "grad_norm": 0.10417405515909195, "learning_rate": 9.952742337446144e-06, "loss": 0.4648, "num_input_tokens_seen": 27064448, "step": 22255 }, { "epoch": 2.7891241699035207, "grad_norm": 0.09810531884431839, "learning_rate": 9.952667318839571e-06, "loss": 0.4649, "num_input_tokens_seen": 27070560, "step": 22260 }, { "epoch": 2.7897506578123044, "grad_norm": 0.10615552961826324, "learning_rate": 9.952592241019695e-06, "loss": 0.4639, "num_input_tokens_seen": 27076896, "step": 22265 }, { "epoch": 2.7903771457210875, "grad_norm": 0.1482490748167038, "learning_rate": 9.952517103987414e-06, "loss": 0.4655, "num_input_tokens_seen": 27082944, "step": 22270 }, { "epoch": 2.791003633629871, "grad_norm": 0.09257402271032333, "learning_rate": 9.952441907743626e-06, "loss": 0.4712, "num_input_tokens_seen": 27088832, "step": 22275 }, { "epoch": 2.7916301215386543, "grad_norm": 0.10978072881698608, "learning_rate": 9.952366652289234e-06, "loss": 0.4613, "num_input_tokens_seen": 27094912, "step": 22280 }, { "epoch": 2.7922566094474375, "grad_norm": 0.12306967377662659, "learning_rate": 9.952291337625134e-06, "loss": 0.4649, "num_input_tokens_seen": 27100800, "step": 22285 }, { "epoch": 2.792883097356221, "grad_norm": 0.08741164207458496, "learning_rate": 9.952215963752228e-06, "loss": 0.4627, "num_input_tokens_seen": 27106944, "step": 22290 }, { "epoch": 2.7935095852650043, "grad_norm": 0.08156599849462509, "learning_rate": 9.952140530671415e-06, "loss": 0.4617, "num_input_tokens_seen": 27113184, "step": 22295 }, { "epoch": 2.794136073173788, "grad_norm": 0.08961956202983856, "learning_rate": 9.9520650383836e-06, "loss": 0.4611, "num_input_tokens_seen": 27119488, "step": 22300 }, { "epoch": 2.794762561082571, "grad_norm": 0.09304852783679962, "learning_rate": 9.951989486889684e-06, "loss": 0.457, "num_input_tokens_seen": 27125792, "step": 22305 }, { "epoch": 2.795389048991354, "grad_norm": 0.10455765575170517, "learning_rate": 9.951913876190568e-06, "loss": 0.4686, "num_input_tokens_seen": 27132128, "step": 22310 }, { "epoch": 2.796015536900138, "grad_norm": 0.12709036469459534, "learning_rate": 9.95183820628716e-06, "loss": 0.4554, "num_input_tokens_seen": 27137920, "step": 22315 }, { "epoch": 2.7966420248089214, "grad_norm": 0.09591519832611084, "learning_rate": 9.951762477180364e-06, "loss": 0.4582, "num_input_tokens_seen": 27144064, "step": 22320 }, { "epoch": 2.7972685127177046, "grad_norm": 0.09679756313562393, "learning_rate": 9.951686688871083e-06, "loss": 0.4641, "num_input_tokens_seen": 27150272, "step": 22325 }, { "epoch": 2.7978950006264878, "grad_norm": 0.09133841097354889, "learning_rate": 9.951610841360224e-06, "loss": 0.463, "num_input_tokens_seen": 27156128, "step": 22330 }, { "epoch": 2.7985214885352714, "grad_norm": 0.15888653695583344, "learning_rate": 9.951534934648694e-06, "loss": 0.4635, "num_input_tokens_seen": 27162240, "step": 22335 }, { "epoch": 2.7991479764440546, "grad_norm": 0.1422697752714157, "learning_rate": 9.951458968737404e-06, "loss": 0.4684, "num_input_tokens_seen": 27168384, "step": 22340 }, { "epoch": 2.799774464352838, "grad_norm": 0.09383099526166916, "learning_rate": 9.951382943627256e-06, "loss": 0.4657, "num_input_tokens_seen": 27174208, "step": 22345 }, { "epoch": 2.8004009522616213, "grad_norm": 0.0969289168715477, "learning_rate": 9.951306859319162e-06, "loss": 0.4648, "num_input_tokens_seen": 27180448, "step": 22350 }, { "epoch": 2.8010274401704045, "grad_norm": 0.09020919352769852, "learning_rate": 9.951230715814033e-06, "loss": 0.468, "num_input_tokens_seen": 27186272, "step": 22355 }, { "epoch": 2.801653928079188, "grad_norm": 0.08946254849433899, "learning_rate": 9.951154513112778e-06, "loss": 0.462, "num_input_tokens_seen": 27192384, "step": 22360 }, { "epoch": 2.8022804159879713, "grad_norm": 0.09632772952318192, "learning_rate": 9.951078251216308e-06, "loss": 0.4584, "num_input_tokens_seen": 27198720, "step": 22365 }, { "epoch": 2.802906903896755, "grad_norm": 0.09400884062051773, "learning_rate": 9.951001930125534e-06, "loss": 0.4659, "num_input_tokens_seen": 27204512, "step": 22370 }, { "epoch": 2.803533391805538, "grad_norm": 0.0896502137184143, "learning_rate": 9.95092554984137e-06, "loss": 0.4648, "num_input_tokens_seen": 27210688, "step": 22375 }, { "epoch": 2.8041598797143217, "grad_norm": 0.09482383728027344, "learning_rate": 9.950849110364729e-06, "loss": 0.465, "num_input_tokens_seen": 27216864, "step": 22380 }, { "epoch": 2.804786367623105, "grad_norm": 0.08827119320631027, "learning_rate": 9.950772611696525e-06, "loss": 0.4632, "num_input_tokens_seen": 27223072, "step": 22385 }, { "epoch": 2.8054128555318885, "grad_norm": 0.1366358995437622, "learning_rate": 9.95069605383767e-06, "loss": 0.4629, "num_input_tokens_seen": 27228320, "step": 22390 }, { "epoch": 2.8060393434406716, "grad_norm": 0.0900682806968689, "learning_rate": 9.950619436789081e-06, "loss": 0.4622, "num_input_tokens_seen": 27234400, "step": 22395 }, { "epoch": 2.806665831349455, "grad_norm": 0.08603230863809586, "learning_rate": 9.950542760551677e-06, "loss": 0.4631, "num_input_tokens_seen": 27240576, "step": 22400 }, { "epoch": 2.8072923192582384, "grad_norm": 0.08470624685287476, "learning_rate": 9.950466025126371e-06, "loss": 0.462, "num_input_tokens_seen": 27246560, "step": 22405 }, { "epoch": 2.8079188071670216, "grad_norm": 0.1142909899353981, "learning_rate": 9.950389230514081e-06, "loss": 0.4633, "num_input_tokens_seen": 27252928, "step": 22410 }, { "epoch": 2.808545295075805, "grad_norm": 0.03958804905414581, "learning_rate": 9.950312376715726e-06, "loss": 0.4617, "num_input_tokens_seen": 27259360, "step": 22415 }, { "epoch": 2.8091717829845884, "grad_norm": 0.11285647004842758, "learning_rate": 9.950235463732225e-06, "loss": 0.4661, "num_input_tokens_seen": 27265760, "step": 22420 }, { "epoch": 2.8097982708933715, "grad_norm": 0.15927527844905853, "learning_rate": 9.950158491564497e-06, "loss": 0.4575, "num_input_tokens_seen": 27271648, "step": 22425 }, { "epoch": 2.810424758802155, "grad_norm": 0.1294921487569809, "learning_rate": 9.95008146021346e-06, "loss": 0.4575, "num_input_tokens_seen": 27278208, "step": 22430 }, { "epoch": 2.8110512467109388, "grad_norm": 0.09341154992580414, "learning_rate": 9.950004369680041e-06, "loss": 0.4653, "num_input_tokens_seen": 27284096, "step": 22435 }, { "epoch": 2.811677734619722, "grad_norm": 0.045171547681093216, "learning_rate": 9.949927219965156e-06, "loss": 0.4648, "num_input_tokens_seen": 27289984, "step": 22440 }, { "epoch": 2.812304222528505, "grad_norm": 0.17806774377822876, "learning_rate": 9.94985001106973e-06, "loss": 0.45, "num_input_tokens_seen": 27296128, "step": 22445 }, { "epoch": 2.8129307104372887, "grad_norm": 0.11684809625148773, "learning_rate": 9.949772742994685e-06, "loss": 0.4644, "num_input_tokens_seen": 27301760, "step": 22450 }, { "epoch": 2.813557198346072, "grad_norm": 0.16472254693508148, "learning_rate": 9.949695415740945e-06, "loss": 0.4613, "num_input_tokens_seen": 27307904, "step": 22455 }, { "epoch": 2.8141836862548555, "grad_norm": 0.1095987856388092, "learning_rate": 9.949618029309435e-06, "loss": 0.4609, "num_input_tokens_seen": 27313888, "step": 22460 }, { "epoch": 2.8148101741636387, "grad_norm": 0.12267487496137619, "learning_rate": 9.949540583701083e-06, "loss": 0.4832, "num_input_tokens_seen": 27320128, "step": 22465 }, { "epoch": 2.815436662072422, "grad_norm": 0.11357125639915466, "learning_rate": 9.949463078916808e-06, "loss": 0.4679, "num_input_tokens_seen": 27326304, "step": 22470 }, { "epoch": 2.8160631499812054, "grad_norm": 0.09903652220964432, "learning_rate": 9.949385514957543e-06, "loss": 0.456, "num_input_tokens_seen": 27332512, "step": 22475 }, { "epoch": 2.8166896378899886, "grad_norm": 0.09404244273900986, "learning_rate": 9.949307891824211e-06, "loss": 0.4623, "num_input_tokens_seen": 27338752, "step": 22480 }, { "epoch": 2.817316125798772, "grad_norm": 0.08200079202651978, "learning_rate": 9.949230209517745e-06, "loss": 0.4622, "num_input_tokens_seen": 27344928, "step": 22485 }, { "epoch": 2.8179426137075554, "grad_norm": 0.09020554274320602, "learning_rate": 9.949152468039068e-06, "loss": 0.4621, "num_input_tokens_seen": 27351072, "step": 22490 }, { "epoch": 2.8185691016163386, "grad_norm": 0.08594880253076553, "learning_rate": 9.949074667389114e-06, "loss": 0.4621, "num_input_tokens_seen": 27357184, "step": 22495 }, { "epoch": 2.819195589525122, "grad_norm": 0.13858181238174438, "learning_rate": 9.94899680756881e-06, "loss": 0.455, "num_input_tokens_seen": 27363456, "step": 22500 }, { "epoch": 2.819822077433906, "grad_norm": 0.1647244542837143, "learning_rate": 9.948918888579088e-06, "loss": 0.4644, "num_input_tokens_seen": 27369728, "step": 22505 }, { "epoch": 2.820448565342689, "grad_norm": 0.12673474848270416, "learning_rate": 9.948840910420881e-06, "loss": 0.4563, "num_input_tokens_seen": 27375808, "step": 22510 }, { "epoch": 2.821075053251472, "grad_norm": 0.13334310054779053, "learning_rate": 9.948762873095119e-06, "loss": 0.4642, "num_input_tokens_seen": 27382144, "step": 22515 }, { "epoch": 2.8217015411602557, "grad_norm": 0.09088028967380524, "learning_rate": 9.948684776602737e-06, "loss": 0.4591, "num_input_tokens_seen": 27388480, "step": 22520 }, { "epoch": 2.822328029069039, "grad_norm": 0.09009415656328201, "learning_rate": 9.94860662094467e-06, "loss": 0.4597, "num_input_tokens_seen": 27394720, "step": 22525 }, { "epoch": 2.8229545169778225, "grad_norm": 0.09272686392068863, "learning_rate": 9.948528406121847e-06, "loss": 0.4532, "num_input_tokens_seen": 27401024, "step": 22530 }, { "epoch": 2.8235810048866057, "grad_norm": 0.07655511051416397, "learning_rate": 9.948450132135208e-06, "loss": 0.4581, "num_input_tokens_seen": 27407424, "step": 22535 }, { "epoch": 2.824207492795389, "grad_norm": 0.04792889207601547, "learning_rate": 9.948371798985686e-06, "loss": 0.4712, "num_input_tokens_seen": 27413856, "step": 22540 }, { "epoch": 2.8248339807041725, "grad_norm": 0.11015694588422775, "learning_rate": 9.94829340667422e-06, "loss": 0.4739, "num_input_tokens_seen": 27419936, "step": 22545 }, { "epoch": 2.8254604686129556, "grad_norm": 0.15161491930484772, "learning_rate": 9.948214955201745e-06, "loss": 0.461, "num_input_tokens_seen": 27425472, "step": 22550 }, { "epoch": 2.8260869565217392, "grad_norm": 0.0878337100148201, "learning_rate": 9.9481364445692e-06, "loss": 0.4655, "num_input_tokens_seen": 27431424, "step": 22555 }, { "epoch": 2.8267134444305224, "grad_norm": 0.09549553692340851, "learning_rate": 9.948057874777525e-06, "loss": 0.4636, "num_input_tokens_seen": 27437280, "step": 22560 }, { "epoch": 2.8273399323393056, "grad_norm": 0.10269704461097717, "learning_rate": 9.947979245827657e-06, "loss": 0.4574, "num_input_tokens_seen": 27443200, "step": 22565 }, { "epoch": 2.827966420248089, "grad_norm": 0.10781975090503693, "learning_rate": 9.947900557720538e-06, "loss": 0.4613, "num_input_tokens_seen": 27448608, "step": 22570 }, { "epoch": 2.828592908156873, "grad_norm": 0.12744732201099396, "learning_rate": 9.947821810457106e-06, "loss": 0.4629, "num_input_tokens_seen": 27454816, "step": 22575 }, { "epoch": 2.829219396065656, "grad_norm": 0.08759179711341858, "learning_rate": 9.947743004038306e-06, "loss": 0.4643, "num_input_tokens_seen": 27460608, "step": 22580 }, { "epoch": 2.829845883974439, "grad_norm": 0.09084055572748184, "learning_rate": 9.947664138465078e-06, "loss": 0.4581, "num_input_tokens_seen": 27466464, "step": 22585 }, { "epoch": 2.8304723718832228, "grad_norm": 0.0821838304400444, "learning_rate": 9.947585213738366e-06, "loss": 0.4677, "num_input_tokens_seen": 27471392, "step": 22590 }, { "epoch": 2.831098859792006, "grad_norm": 0.08607269823551178, "learning_rate": 9.947506229859111e-06, "loss": 0.4708, "num_input_tokens_seen": 27476896, "step": 22595 }, { "epoch": 2.8317253477007895, "grad_norm": 0.12554533779621124, "learning_rate": 9.947427186828263e-06, "loss": 0.4669, "num_input_tokens_seen": 27483104, "step": 22600 }, { "epoch": 2.8323518356095727, "grad_norm": 0.07980867475271225, "learning_rate": 9.94734808464676e-06, "loss": 0.4704, "num_input_tokens_seen": 27489536, "step": 22605 }, { "epoch": 2.832978323518356, "grad_norm": 0.15949077904224396, "learning_rate": 9.947268923315553e-06, "loss": 0.4587, "num_input_tokens_seen": 27495520, "step": 22610 }, { "epoch": 2.8336048114271395, "grad_norm": 0.0859086811542511, "learning_rate": 9.947189702835587e-06, "loss": 0.4634, "num_input_tokens_seen": 27501632, "step": 22615 }, { "epoch": 2.8342312993359227, "grad_norm": 0.03694799542427063, "learning_rate": 9.947110423207807e-06, "loss": 0.4643, "num_input_tokens_seen": 27507808, "step": 22620 }, { "epoch": 2.8348577872447063, "grad_norm": 0.08353842794895172, "learning_rate": 9.947031084433164e-06, "loss": 0.4605, "num_input_tokens_seen": 27513568, "step": 22625 }, { "epoch": 2.8354842751534894, "grad_norm": 0.09045396000146866, "learning_rate": 9.946951686512605e-06, "loss": 0.4721, "num_input_tokens_seen": 27519488, "step": 22630 }, { "epoch": 2.836110763062273, "grad_norm": 0.11025422811508179, "learning_rate": 9.946872229447078e-06, "loss": 0.4669, "num_input_tokens_seen": 27524864, "step": 22635 }, { "epoch": 2.8367372509710562, "grad_norm": 0.21042388677597046, "learning_rate": 9.946792713237536e-06, "loss": 0.4653, "num_input_tokens_seen": 27531072, "step": 22640 }, { "epoch": 2.83736373887984, "grad_norm": 0.09577140212059021, "learning_rate": 9.946713137884927e-06, "loss": 0.4653, "num_input_tokens_seen": 27537696, "step": 22645 }, { "epoch": 2.837990226788623, "grad_norm": 0.08246969431638718, "learning_rate": 9.946633503390204e-06, "loss": 0.4637, "num_input_tokens_seen": 27543744, "step": 22650 }, { "epoch": 2.838616714697406, "grad_norm": 0.07450774312019348, "learning_rate": 9.94655380975432e-06, "loss": 0.4622, "num_input_tokens_seen": 27549856, "step": 22655 }, { "epoch": 2.83924320260619, "grad_norm": 0.07487266510725021, "learning_rate": 9.946474056978223e-06, "loss": 0.4651, "num_input_tokens_seen": 27555776, "step": 22660 }, { "epoch": 2.839869690514973, "grad_norm": 0.11992606520652771, "learning_rate": 9.946394245062873e-06, "loss": 0.4566, "num_input_tokens_seen": 27561760, "step": 22665 }, { "epoch": 2.8404961784237566, "grad_norm": 0.03071417845785618, "learning_rate": 9.946314374009219e-06, "loss": 0.4656, "num_input_tokens_seen": 27567648, "step": 22670 }, { "epoch": 2.8411226663325397, "grad_norm": 0.0851423367857933, "learning_rate": 9.94623444381822e-06, "loss": 0.463, "num_input_tokens_seen": 27573856, "step": 22675 }, { "epoch": 2.841749154241323, "grad_norm": 0.0923745185136795, "learning_rate": 9.946154454490828e-06, "loss": 0.4629, "num_input_tokens_seen": 27579840, "step": 22680 }, { "epoch": 2.8423756421501065, "grad_norm": 0.08799970895051956, "learning_rate": 9.946074406028001e-06, "loss": 0.4652, "num_input_tokens_seen": 27585504, "step": 22685 }, { "epoch": 2.84300213005889, "grad_norm": 0.13360747694969177, "learning_rate": 9.945994298430696e-06, "loss": 0.4602, "num_input_tokens_seen": 27591680, "step": 22690 }, { "epoch": 2.8436286179676733, "grad_norm": 0.1401178240776062, "learning_rate": 9.945914131699872e-06, "loss": 0.4628, "num_input_tokens_seen": 27597632, "step": 22695 }, { "epoch": 2.8442551058764565, "grad_norm": 0.11757609993219376, "learning_rate": 9.945833905836486e-06, "loss": 0.4589, "num_input_tokens_seen": 27603520, "step": 22700 }, { "epoch": 2.84488159378524, "grad_norm": 0.1371510773897171, "learning_rate": 9.945753620841498e-06, "loss": 0.4594, "num_input_tokens_seen": 27609888, "step": 22705 }, { "epoch": 2.8455080816940233, "grad_norm": 0.1362132728099823, "learning_rate": 9.945673276715865e-06, "loss": 0.4566, "num_input_tokens_seen": 27616288, "step": 22710 }, { "epoch": 2.846134569602807, "grad_norm": 0.11451579630374908, "learning_rate": 9.945592873460553e-06, "loss": 0.4577, "num_input_tokens_seen": 27622432, "step": 22715 }, { "epoch": 2.84676105751159, "grad_norm": 0.08015234023332596, "learning_rate": 9.945512411076519e-06, "loss": 0.4643, "num_input_tokens_seen": 27628480, "step": 22720 }, { "epoch": 2.847387545420373, "grad_norm": 0.0921771228313446, "learning_rate": 9.945431889564724e-06, "loss": 0.4592, "num_input_tokens_seen": 27634432, "step": 22725 }, { "epoch": 2.848014033329157, "grad_norm": 0.10851091891527176, "learning_rate": 9.945351308926134e-06, "loss": 0.4685, "num_input_tokens_seen": 27640736, "step": 22730 }, { "epoch": 2.84864052123794, "grad_norm": 0.07958351075649261, "learning_rate": 9.945270669161713e-06, "loss": 0.461, "num_input_tokens_seen": 27647008, "step": 22735 }, { "epoch": 2.8492670091467236, "grad_norm": 0.08650415390729904, "learning_rate": 9.945189970272422e-06, "loss": 0.4616, "num_input_tokens_seen": 27652544, "step": 22740 }, { "epoch": 2.8498934970555068, "grad_norm": 0.035523537546396255, "learning_rate": 9.945109212259227e-06, "loss": 0.4602, "num_input_tokens_seen": 27658528, "step": 22745 }, { "epoch": 2.85051998496429, "grad_norm": 0.11334212124347687, "learning_rate": 9.945028395123094e-06, "loss": 0.4727, "num_input_tokens_seen": 27664672, "step": 22750 }, { "epoch": 2.8511464728730735, "grad_norm": 0.10594472289085388, "learning_rate": 9.94494751886499e-06, "loss": 0.4672, "num_input_tokens_seen": 27670624, "step": 22755 }, { "epoch": 2.851772960781857, "grad_norm": 0.08776576071977615, "learning_rate": 9.94486658348588e-06, "loss": 0.464, "num_input_tokens_seen": 27676640, "step": 22760 }, { "epoch": 2.8523994486906403, "grad_norm": 0.0902569517493248, "learning_rate": 9.944785588986732e-06, "loss": 0.4611, "num_input_tokens_seen": 27682944, "step": 22765 }, { "epoch": 2.8530259365994235, "grad_norm": 0.09020555019378662, "learning_rate": 9.944704535368515e-06, "loss": 0.4653, "num_input_tokens_seen": 27688704, "step": 22770 }, { "epoch": 2.853652424508207, "grad_norm": 0.07201030105352402, "learning_rate": 9.9446234226322e-06, "loss": 0.4621, "num_input_tokens_seen": 27694976, "step": 22775 }, { "epoch": 2.8542789124169903, "grad_norm": 0.07846228033304214, "learning_rate": 9.944542250778752e-06, "loss": 0.4653, "num_input_tokens_seen": 27701184, "step": 22780 }, { "epoch": 2.854905400325774, "grad_norm": 0.08384569734334946, "learning_rate": 9.944461019809146e-06, "loss": 0.466, "num_input_tokens_seen": 27707360, "step": 22785 }, { "epoch": 2.855531888234557, "grad_norm": 0.11307836323976517, "learning_rate": 9.944379729724351e-06, "loss": 0.4603, "num_input_tokens_seen": 27713248, "step": 22790 }, { "epoch": 2.8561583761433402, "grad_norm": 0.07999885082244873, "learning_rate": 9.944298380525339e-06, "loss": 0.4635, "num_input_tokens_seen": 27719552, "step": 22795 }, { "epoch": 2.856784864052124, "grad_norm": 0.1092611625790596, "learning_rate": 9.944216972213083e-06, "loss": 0.4612, "num_input_tokens_seen": 27725696, "step": 22800 }, { "epoch": 2.857411351960907, "grad_norm": 0.0930512398481369, "learning_rate": 9.944135504788557e-06, "loss": 0.4596, "num_input_tokens_seen": 27731776, "step": 22805 }, { "epoch": 2.8580378398696906, "grad_norm": 0.08499911427497864, "learning_rate": 9.944053978252735e-06, "loss": 0.4659, "num_input_tokens_seen": 27738112, "step": 22810 }, { "epoch": 2.858664327778474, "grad_norm": 0.16086065769195557, "learning_rate": 9.94397239260659e-06, "loss": 0.4702, "num_input_tokens_seen": 27744096, "step": 22815 }, { "epoch": 2.859290815687257, "grad_norm": 0.16629137098789215, "learning_rate": 9.943890747851097e-06, "loss": 0.4585, "num_input_tokens_seen": 27750528, "step": 22820 }, { "epoch": 2.8599173035960406, "grad_norm": 0.1150948703289032, "learning_rate": 9.943809043987235e-06, "loss": 0.4719, "num_input_tokens_seen": 27756544, "step": 22825 }, { "epoch": 2.860543791504824, "grad_norm": 0.0844598338007927, "learning_rate": 9.943727281015979e-06, "loss": 0.4634, "num_input_tokens_seen": 27762144, "step": 22830 }, { "epoch": 2.8611702794136074, "grad_norm": 0.11608228087425232, "learning_rate": 9.943645458938306e-06, "loss": 0.4623, "num_input_tokens_seen": 27768000, "step": 22835 }, { "epoch": 2.8617967673223905, "grad_norm": 0.15346843004226685, "learning_rate": 9.943563577755196e-06, "loss": 0.4674, "num_input_tokens_seen": 27774272, "step": 22840 }, { "epoch": 2.862423255231174, "grad_norm": 0.094376340508461, "learning_rate": 9.943481637467627e-06, "loss": 0.4653, "num_input_tokens_seen": 27780544, "step": 22845 }, { "epoch": 2.8630497431399573, "grad_norm": 0.14215581119060516, "learning_rate": 9.94339963807658e-06, "loss": 0.4652, "num_input_tokens_seen": 27786368, "step": 22850 }, { "epoch": 2.863676231048741, "grad_norm": 0.03415214642882347, "learning_rate": 9.943317579583031e-06, "loss": 0.46, "num_input_tokens_seen": 27792096, "step": 22855 }, { "epoch": 2.864302718957524, "grad_norm": 0.08487672358751297, "learning_rate": 9.943235461987967e-06, "loss": 0.4584, "num_input_tokens_seen": 27797984, "step": 22860 }, { "epoch": 2.8649292068663073, "grad_norm": 0.14792582392692566, "learning_rate": 9.943153285292367e-06, "loss": 0.4658, "num_input_tokens_seen": 27804384, "step": 22865 }, { "epoch": 2.865555694775091, "grad_norm": 0.16359512507915497, "learning_rate": 9.943071049497213e-06, "loss": 0.4641, "num_input_tokens_seen": 27810144, "step": 22870 }, { "epoch": 2.866182182683874, "grad_norm": 0.08355133980512619, "learning_rate": 9.942988754603489e-06, "loss": 0.4608, "num_input_tokens_seen": 27816448, "step": 22875 }, { "epoch": 2.8668086705926576, "grad_norm": 0.1452482044696808, "learning_rate": 9.942906400612178e-06, "loss": 0.4626, "num_input_tokens_seen": 27822720, "step": 22880 }, { "epoch": 2.867435158501441, "grad_norm": 0.10617800801992416, "learning_rate": 9.942823987524263e-06, "loss": 0.4683, "num_input_tokens_seen": 27828192, "step": 22885 }, { "epoch": 2.8680616464102244, "grad_norm": 0.09865046292543411, "learning_rate": 9.942741515340735e-06, "loss": 0.458, "num_input_tokens_seen": 27834368, "step": 22890 }, { "epoch": 2.8686881343190076, "grad_norm": 0.07789669185876846, "learning_rate": 9.942658984062577e-06, "loss": 0.4663, "num_input_tokens_seen": 27840352, "step": 22895 }, { "epoch": 2.869314622227791, "grad_norm": 0.13560879230499268, "learning_rate": 9.942576393690773e-06, "loss": 0.4589, "num_input_tokens_seen": 27846464, "step": 22900 }, { "epoch": 2.8699411101365744, "grad_norm": 0.08304444700479507, "learning_rate": 9.942493744226313e-06, "loss": 0.4608, "num_input_tokens_seen": 27852576, "step": 22905 }, { "epoch": 2.8705675980453575, "grad_norm": 0.03966105356812477, "learning_rate": 9.942411035670185e-06, "loss": 0.4585, "num_input_tokens_seen": 27858528, "step": 22910 }, { "epoch": 2.871194085954141, "grad_norm": 0.09154241532087326, "learning_rate": 9.942328268023377e-06, "loss": 0.46, "num_input_tokens_seen": 27865024, "step": 22915 }, { "epoch": 2.8718205738629243, "grad_norm": 0.1375543624162674, "learning_rate": 9.942245441286882e-06, "loss": 0.4649, "num_input_tokens_seen": 27871072, "step": 22920 }, { "epoch": 2.872447061771708, "grad_norm": 0.13637612760066986, "learning_rate": 9.942162555461685e-06, "loss": 0.4638, "num_input_tokens_seen": 27877248, "step": 22925 }, { "epoch": 2.873073549680491, "grad_norm": 0.11911821365356445, "learning_rate": 9.94207961054878e-06, "loss": 0.4667, "num_input_tokens_seen": 27883232, "step": 22930 }, { "epoch": 2.8737000375892743, "grad_norm": 0.09545610845088959, "learning_rate": 9.941996606549158e-06, "loss": 0.4571, "num_input_tokens_seen": 27889184, "step": 22935 }, { "epoch": 2.874326525498058, "grad_norm": 0.08671557158231735, "learning_rate": 9.941913543463813e-06, "loss": 0.4621, "num_input_tokens_seen": 27894848, "step": 22940 }, { "epoch": 2.8749530134068415, "grad_norm": 0.07777685672044754, "learning_rate": 9.941830421293736e-06, "loss": 0.4665, "num_input_tokens_seen": 27900832, "step": 22945 }, { "epoch": 2.8755795013156247, "grad_norm": 0.04221109673380852, "learning_rate": 9.94174724003992e-06, "loss": 0.4638, "num_input_tokens_seen": 27906816, "step": 22950 }, { "epoch": 2.876205989224408, "grad_norm": 0.08788050711154938, "learning_rate": 9.941663999703362e-06, "loss": 0.4586, "num_input_tokens_seen": 27913216, "step": 22955 }, { "epoch": 2.8768324771331915, "grad_norm": 0.09486454725265503, "learning_rate": 9.941580700285057e-06, "loss": 0.4701, "num_input_tokens_seen": 27919296, "step": 22960 }, { "epoch": 2.8774589650419746, "grad_norm": 0.03635130822658539, "learning_rate": 9.941497341786e-06, "loss": 0.4654, "num_input_tokens_seen": 27925472, "step": 22965 }, { "epoch": 2.8780854529507582, "grad_norm": 0.03857390210032463, "learning_rate": 9.941413924207186e-06, "loss": 0.4632, "num_input_tokens_seen": 27931488, "step": 22970 }, { "epoch": 2.8787119408595414, "grad_norm": 0.14557334780693054, "learning_rate": 9.941330447549615e-06, "loss": 0.4631, "num_input_tokens_seen": 27937536, "step": 22975 }, { "epoch": 2.8793384287683246, "grad_norm": 0.09288207441568375, "learning_rate": 9.941246911814284e-06, "loss": 0.4688, "num_input_tokens_seen": 27943616, "step": 22980 }, { "epoch": 2.879964916677108, "grad_norm": 0.07215078175067902, "learning_rate": 9.941163317002193e-06, "loss": 0.4627, "num_input_tokens_seen": 27949984, "step": 22985 }, { "epoch": 2.8805914045858914, "grad_norm": 0.13172534108161926, "learning_rate": 9.941079663114338e-06, "loss": 0.4635, "num_input_tokens_seen": 27955744, "step": 22990 }, { "epoch": 2.881217892494675, "grad_norm": 0.09010134637355804, "learning_rate": 9.940995950151724e-06, "loss": 0.4661, "num_input_tokens_seen": 27961856, "step": 22995 }, { "epoch": 2.881844380403458, "grad_norm": 0.12803062796592712, "learning_rate": 9.940912178115348e-06, "loss": 0.4602, "num_input_tokens_seen": 27968192, "step": 23000 }, { "epoch": 2.8824708683122413, "grad_norm": 0.08709786087274551, "learning_rate": 9.940828347006211e-06, "loss": 0.4663, "num_input_tokens_seen": 27974304, "step": 23005 }, { "epoch": 2.883097356221025, "grad_norm": 0.035606417804956436, "learning_rate": 9.940744456825319e-06, "loss": 0.458, "num_input_tokens_seen": 27980224, "step": 23010 }, { "epoch": 2.8837238441298085, "grad_norm": 0.1428920179605484, "learning_rate": 9.940660507573672e-06, "loss": 0.464, "num_input_tokens_seen": 27986304, "step": 23015 }, { "epoch": 2.8843503320385917, "grad_norm": 0.09328901767730713, "learning_rate": 9.940576499252277e-06, "loss": 0.4601, "num_input_tokens_seen": 27992192, "step": 23020 }, { "epoch": 2.884976819947375, "grad_norm": 0.15380901098251343, "learning_rate": 9.940492431862134e-06, "loss": 0.4716, "num_input_tokens_seen": 27997856, "step": 23025 }, { "epoch": 2.8856033078561585, "grad_norm": 0.07809077948331833, "learning_rate": 9.94040830540425e-06, "loss": 0.4585, "num_input_tokens_seen": 28004096, "step": 23030 }, { "epoch": 2.8862297957649417, "grad_norm": 0.09074472635984421, "learning_rate": 9.940324119879632e-06, "loss": 0.4606, "num_input_tokens_seen": 28010336, "step": 23035 }, { "epoch": 2.8868562836737253, "grad_norm": 0.03779370337724686, "learning_rate": 9.940239875289286e-06, "loss": 0.4624, "num_input_tokens_seen": 28016928, "step": 23040 }, { "epoch": 2.8874827715825084, "grad_norm": 0.0868171751499176, "learning_rate": 9.940155571634218e-06, "loss": 0.4606, "num_input_tokens_seen": 28022816, "step": 23045 }, { "epoch": 2.8881092594912916, "grad_norm": 0.09266819804906845, "learning_rate": 9.940071208915439e-06, "loss": 0.4669, "num_input_tokens_seen": 28028832, "step": 23050 }, { "epoch": 2.888735747400075, "grad_norm": 0.08061588555574417, "learning_rate": 9.939986787133953e-06, "loss": 0.4593, "num_input_tokens_seen": 28034688, "step": 23055 }, { "epoch": 2.8893622353088584, "grad_norm": 0.12398305535316467, "learning_rate": 9.939902306290771e-06, "loss": 0.4678, "num_input_tokens_seen": 28040992, "step": 23060 }, { "epoch": 2.889988723217642, "grad_norm": 0.09509289264678955, "learning_rate": 9.939817766386906e-06, "loss": 0.4612, "num_input_tokens_seen": 28046944, "step": 23065 }, { "epoch": 2.890615211126425, "grad_norm": 0.03107939101755619, "learning_rate": 9.939733167423364e-06, "loss": 0.4617, "num_input_tokens_seen": 28053088, "step": 23070 }, { "epoch": 2.891241699035209, "grad_norm": 0.11513927578926086, "learning_rate": 9.93964850940116e-06, "loss": 0.4623, "num_input_tokens_seen": 28059232, "step": 23075 }, { "epoch": 2.891868186943992, "grad_norm": 0.09461337327957153, "learning_rate": 9.939563792321305e-06, "loss": 0.469, "num_input_tokens_seen": 28065120, "step": 23080 }, { "epoch": 2.8924946748527756, "grad_norm": 0.13051699101924896, "learning_rate": 9.939479016184812e-06, "loss": 0.4644, "num_input_tokens_seen": 28071168, "step": 23085 }, { "epoch": 2.8931211627615587, "grad_norm": 0.08315031230449677, "learning_rate": 9.939394180992694e-06, "loss": 0.462, "num_input_tokens_seen": 28076960, "step": 23090 }, { "epoch": 2.893747650670342, "grad_norm": 0.09895097464323044, "learning_rate": 9.939309286745967e-06, "loss": 0.4649, "num_input_tokens_seen": 28082304, "step": 23095 }, { "epoch": 2.8943741385791255, "grad_norm": 0.08171173930168152, "learning_rate": 9.939224333445643e-06, "loss": 0.4636, "num_input_tokens_seen": 28088672, "step": 23100 }, { "epoch": 2.8950006264879087, "grad_norm": 0.033015016466379166, "learning_rate": 9.939139321092741e-06, "loss": 0.4607, "num_input_tokens_seen": 28094848, "step": 23105 }, { "epoch": 2.8956271143966923, "grad_norm": 0.08855600655078888, "learning_rate": 9.939054249688274e-06, "loss": 0.4681, "num_input_tokens_seen": 28100928, "step": 23110 }, { "epoch": 2.8962536023054755, "grad_norm": 0.0868101641535759, "learning_rate": 9.938969119233261e-06, "loss": 0.4636, "num_input_tokens_seen": 28106528, "step": 23115 }, { "epoch": 2.8968800902142586, "grad_norm": 0.08557143807411194, "learning_rate": 9.938883929728722e-06, "loss": 0.4631, "num_input_tokens_seen": 28112512, "step": 23120 }, { "epoch": 2.8975065781230422, "grad_norm": 0.10244698822498322, "learning_rate": 9.938798681175671e-06, "loss": 0.4636, "num_input_tokens_seen": 28118464, "step": 23125 }, { "epoch": 2.898133066031826, "grad_norm": 0.09217500686645508, "learning_rate": 9.93871337357513e-06, "loss": 0.4633, "num_input_tokens_seen": 28124704, "step": 23130 }, { "epoch": 2.898759553940609, "grad_norm": 0.10968326032161713, "learning_rate": 9.93862800692812e-06, "loss": 0.4658, "num_input_tokens_seen": 28130848, "step": 23135 }, { "epoch": 2.899386041849392, "grad_norm": 0.0809563398361206, "learning_rate": 9.93854258123566e-06, "loss": 0.4627, "num_input_tokens_seen": 28136928, "step": 23140 }, { "epoch": 2.900012529758176, "grad_norm": 0.0970086008310318, "learning_rate": 9.93845709649877e-06, "loss": 0.4608, "num_input_tokens_seen": 28143136, "step": 23145 }, { "epoch": 2.900639017666959, "grad_norm": 0.07446471601724625, "learning_rate": 9.938371552718473e-06, "loss": 0.4644, "num_input_tokens_seen": 28149248, "step": 23150 }, { "epoch": 2.9012655055757426, "grad_norm": 0.0841773971915245, "learning_rate": 9.938285949895792e-06, "loss": 0.4587, "num_input_tokens_seen": 28155488, "step": 23155 }, { "epoch": 2.9018919934845258, "grad_norm": 0.03390311822295189, "learning_rate": 9.938200288031752e-06, "loss": 0.4619, "num_input_tokens_seen": 28161216, "step": 23160 }, { "epoch": 2.902518481393309, "grad_norm": 0.14614641666412354, "learning_rate": 9.938114567127375e-06, "loss": 0.4654, "num_input_tokens_seen": 28167456, "step": 23165 }, { "epoch": 2.9031449693020925, "grad_norm": 0.07346615195274353, "learning_rate": 9.938028787183687e-06, "loss": 0.4582, "num_input_tokens_seen": 28173440, "step": 23170 }, { "epoch": 2.9037714572108757, "grad_norm": 0.07334215939044952, "learning_rate": 9.937942948201712e-06, "loss": 0.4608, "num_input_tokens_seen": 28179616, "step": 23175 }, { "epoch": 2.9043979451196593, "grad_norm": 0.12875153124332428, "learning_rate": 9.937857050182478e-06, "loss": 0.4651, "num_input_tokens_seen": 28185856, "step": 23180 }, { "epoch": 2.9050244330284425, "grad_norm": 0.07724378257989883, "learning_rate": 9.937771093127011e-06, "loss": 0.4635, "num_input_tokens_seen": 28192032, "step": 23185 }, { "epoch": 2.9056509209372257, "grad_norm": 0.07695626467466354, "learning_rate": 9.93768507703634e-06, "loss": 0.4613, "num_input_tokens_seen": 28198496, "step": 23190 }, { "epoch": 2.9062774088460093, "grad_norm": 0.0337139293551445, "learning_rate": 9.937599001911493e-06, "loss": 0.4627, "num_input_tokens_seen": 28204608, "step": 23195 }, { "epoch": 2.906903896754793, "grad_norm": 0.08623659610748291, "learning_rate": 9.937512867753498e-06, "loss": 0.4648, "num_input_tokens_seen": 28210688, "step": 23200 }, { "epoch": 2.907530384663576, "grad_norm": 0.1325150728225708, "learning_rate": 9.937426674563385e-06, "loss": 0.4647, "num_input_tokens_seen": 28216832, "step": 23205 }, { "epoch": 2.908156872572359, "grad_norm": 0.13939444720745087, "learning_rate": 9.937340422342186e-06, "loss": 0.4642, "num_input_tokens_seen": 28223232, "step": 23210 }, { "epoch": 2.908783360481143, "grad_norm": 0.13665176928043365, "learning_rate": 9.93725411109093e-06, "loss": 0.4604, "num_input_tokens_seen": 28229248, "step": 23215 }, { "epoch": 2.909409848389926, "grad_norm": 0.10991211980581284, "learning_rate": 9.937167740810652e-06, "loss": 0.4615, "num_input_tokens_seen": 28235520, "step": 23220 }, { "epoch": 2.9100363362987096, "grad_norm": 0.08667060732841492, "learning_rate": 9.93708131150238e-06, "loss": 0.4643, "num_input_tokens_seen": 28241664, "step": 23225 }, { "epoch": 2.910662824207493, "grad_norm": 0.09372514486312866, "learning_rate": 9.936994823167152e-06, "loss": 0.4631, "num_input_tokens_seen": 28247904, "step": 23230 }, { "epoch": 2.911289312116276, "grad_norm": 0.08816161751747131, "learning_rate": 9.936908275806e-06, "loss": 0.4691, "num_input_tokens_seen": 28253824, "step": 23235 }, { "epoch": 2.9119158000250596, "grad_norm": 0.08339866250753403, "learning_rate": 9.936821669419957e-06, "loss": 0.4593, "num_input_tokens_seen": 28260032, "step": 23240 }, { "epoch": 2.9125422879338427, "grad_norm": 0.11942756175994873, "learning_rate": 9.936735004010062e-06, "loss": 0.4653, "num_input_tokens_seen": 28266496, "step": 23245 }, { "epoch": 2.9131687758426263, "grad_norm": 0.08662204444408417, "learning_rate": 9.93664827957735e-06, "loss": 0.4585, "num_input_tokens_seen": 28272480, "step": 23250 }, { "epoch": 2.9137952637514095, "grad_norm": 0.07663505524396896, "learning_rate": 9.936561496122854e-06, "loss": 0.4574, "num_input_tokens_seen": 28278560, "step": 23255 }, { "epoch": 2.9144217516601927, "grad_norm": 0.21324795484542847, "learning_rate": 9.936474653647618e-06, "loss": 0.461, "num_input_tokens_seen": 28284448, "step": 23260 }, { "epoch": 2.9150482395689763, "grad_norm": 0.08835086226463318, "learning_rate": 9.936387752152677e-06, "loss": 0.4673, "num_input_tokens_seen": 28290624, "step": 23265 }, { "epoch": 2.91567472747776, "grad_norm": 0.08728916198015213, "learning_rate": 9.93630079163907e-06, "loss": 0.4591, "num_input_tokens_seen": 28296064, "step": 23270 }, { "epoch": 2.916301215386543, "grad_norm": 0.09930571168661118, "learning_rate": 9.936213772107835e-06, "loss": 0.4645, "num_input_tokens_seen": 28301504, "step": 23275 }, { "epoch": 2.9169277032953262, "grad_norm": 0.08121050894260406, "learning_rate": 9.936126693560017e-06, "loss": 0.4601, "num_input_tokens_seen": 28307520, "step": 23280 }, { "epoch": 2.91755419120411, "grad_norm": 0.1292877197265625, "learning_rate": 9.936039555996653e-06, "loss": 0.4679, "num_input_tokens_seen": 28313984, "step": 23285 }, { "epoch": 2.918180679112893, "grad_norm": 0.0401134118437767, "learning_rate": 9.935952359418785e-06, "loss": 0.4654, "num_input_tokens_seen": 28320320, "step": 23290 }, { "epoch": 2.9188071670216766, "grad_norm": 0.08912820369005203, "learning_rate": 9.935865103827457e-06, "loss": 0.4638, "num_input_tokens_seen": 28326624, "step": 23295 }, { "epoch": 2.91943365493046, "grad_norm": 0.0820188969373703, "learning_rate": 9.935777789223712e-06, "loss": 0.4658, "num_input_tokens_seen": 28332864, "step": 23300 }, { "epoch": 2.920060142839243, "grad_norm": 0.14860506355762482, "learning_rate": 9.935690415608596e-06, "loss": 0.4708, "num_input_tokens_seen": 28338464, "step": 23305 }, { "epoch": 2.9206866307480266, "grad_norm": 0.08251495659351349, "learning_rate": 9.935602982983149e-06, "loss": 0.4661, "num_input_tokens_seen": 28344192, "step": 23310 }, { "epoch": 2.9213131186568098, "grad_norm": 0.10657037049531937, "learning_rate": 9.93551549134842e-06, "loss": 0.4666, "num_input_tokens_seen": 28350400, "step": 23315 }, { "epoch": 2.9219396065655934, "grad_norm": 0.10421601682901382, "learning_rate": 9.935427940705454e-06, "loss": 0.4562, "num_input_tokens_seen": 28356512, "step": 23320 }, { "epoch": 2.9225660944743765, "grad_norm": 0.03674913942813873, "learning_rate": 9.935340331055295e-06, "loss": 0.4623, "num_input_tokens_seen": 28362560, "step": 23325 }, { "epoch": 2.92319258238316, "grad_norm": 0.08792999386787415, "learning_rate": 9.935252662398994e-06, "loss": 0.4607, "num_input_tokens_seen": 28368224, "step": 23330 }, { "epoch": 2.9238190702919433, "grad_norm": 0.13859611749649048, "learning_rate": 9.935164934737599e-06, "loss": 0.4602, "num_input_tokens_seen": 28374016, "step": 23335 }, { "epoch": 2.924445558200727, "grad_norm": 0.08702805638313293, "learning_rate": 9.935077148072157e-06, "loss": 0.4572, "num_input_tokens_seen": 28379872, "step": 23340 }, { "epoch": 2.92507204610951, "grad_norm": 0.08362285792827606, "learning_rate": 9.934989302403719e-06, "loss": 0.4626, "num_input_tokens_seen": 28386112, "step": 23345 }, { "epoch": 2.9256985340182933, "grad_norm": 0.10773090273141861, "learning_rate": 9.934901397733336e-06, "loss": 0.4482, "num_input_tokens_seen": 28392064, "step": 23350 }, { "epoch": 2.926325021927077, "grad_norm": 0.03497455641627312, "learning_rate": 9.934813434062057e-06, "loss": 0.4585, "num_input_tokens_seen": 28398400, "step": 23355 }, { "epoch": 2.92695150983586, "grad_norm": 0.08822953701019287, "learning_rate": 9.934725411390933e-06, "loss": 0.4598, "num_input_tokens_seen": 28405024, "step": 23360 }, { "epoch": 2.9275779977446437, "grad_norm": 0.17531569302082062, "learning_rate": 9.934637329721018e-06, "loss": 0.4624, "num_input_tokens_seen": 28411040, "step": 23365 }, { "epoch": 2.928204485653427, "grad_norm": 0.07873024791479111, "learning_rate": 9.934549189053366e-06, "loss": 0.4703, "num_input_tokens_seen": 28416992, "step": 23370 }, { "epoch": 2.92883097356221, "grad_norm": 0.11855149269104004, "learning_rate": 9.93446098938903e-06, "loss": 0.4731, "num_input_tokens_seen": 28423072, "step": 23375 }, { "epoch": 2.9294574614709936, "grad_norm": 0.15377305448055267, "learning_rate": 9.934372730729063e-06, "loss": 0.4564, "num_input_tokens_seen": 28429312, "step": 23380 }, { "epoch": 2.9300839493797772, "grad_norm": 0.16807550191879272, "learning_rate": 9.934284413074521e-06, "loss": 0.4577, "num_input_tokens_seen": 28435360, "step": 23385 }, { "epoch": 2.9307104372885604, "grad_norm": 0.12370608747005463, "learning_rate": 9.934196036426462e-06, "loss": 0.4648, "num_input_tokens_seen": 28441248, "step": 23390 }, { "epoch": 2.9313369251973436, "grad_norm": 0.12216708809137344, "learning_rate": 9.93410760078594e-06, "loss": 0.4651, "num_input_tokens_seen": 28447392, "step": 23395 }, { "epoch": 2.931963413106127, "grad_norm": 0.12368924170732498, "learning_rate": 9.934019106154015e-06, "loss": 0.456, "num_input_tokens_seen": 28453312, "step": 23400 }, { "epoch": 2.9325899010149103, "grad_norm": 0.1350182443857193, "learning_rate": 9.933930552531742e-06, "loss": 0.4657, "num_input_tokens_seen": 28459424, "step": 23405 }, { "epoch": 2.933216388923694, "grad_norm": 0.10442892462015152, "learning_rate": 9.933841939920181e-06, "loss": 0.4505, "num_input_tokens_seen": 28465280, "step": 23410 }, { "epoch": 2.933842876832477, "grad_norm": 0.11389541625976562, "learning_rate": 9.933753268320391e-06, "loss": 0.4647, "num_input_tokens_seen": 28471648, "step": 23415 }, { "epoch": 2.9344693647412603, "grad_norm": 0.13234345614910126, "learning_rate": 9.933664537733433e-06, "loss": 0.4615, "num_input_tokens_seen": 28477216, "step": 23420 }, { "epoch": 2.935095852650044, "grad_norm": 0.12757055461406708, "learning_rate": 9.933575748160366e-06, "loss": 0.4719, "num_input_tokens_seen": 28483104, "step": 23425 }, { "epoch": 2.935722340558827, "grad_norm": 0.1549425721168518, "learning_rate": 9.933486899602256e-06, "loss": 0.4648, "num_input_tokens_seen": 28489088, "step": 23430 }, { "epoch": 2.9363488284676107, "grad_norm": 0.04669235274195671, "learning_rate": 9.93339799206016e-06, "loss": 0.458, "num_input_tokens_seen": 28495392, "step": 23435 }, { "epoch": 2.936975316376394, "grad_norm": 0.1358361840248108, "learning_rate": 9.933309025535145e-06, "loss": 0.4628, "num_input_tokens_seen": 28501152, "step": 23440 }, { "epoch": 2.937601804285177, "grad_norm": 0.09943385422229767, "learning_rate": 9.93322000002827e-06, "loss": 0.4576, "num_input_tokens_seen": 28507232, "step": 23445 }, { "epoch": 2.9382282921939606, "grad_norm": 0.09663765132427216, "learning_rate": 9.933130915540605e-06, "loss": 0.4655, "num_input_tokens_seen": 28513344, "step": 23450 }, { "epoch": 2.9388547801027443, "grad_norm": 0.14853261411190033, "learning_rate": 9.933041772073212e-06, "loss": 0.4612, "num_input_tokens_seen": 28519392, "step": 23455 }, { "epoch": 2.9394812680115274, "grad_norm": 0.12025769054889679, "learning_rate": 9.932952569627157e-06, "loss": 0.4645, "num_input_tokens_seen": 28525472, "step": 23460 }, { "epoch": 2.9401077559203106, "grad_norm": 0.11327707022428513, "learning_rate": 9.932863308203505e-06, "loss": 0.4589, "num_input_tokens_seen": 28531808, "step": 23465 }, { "epoch": 2.940734243829094, "grad_norm": 0.12713468074798584, "learning_rate": 9.932773987803325e-06, "loss": 0.461, "num_input_tokens_seen": 28537760, "step": 23470 }, { "epoch": 2.9413607317378774, "grad_norm": 0.15490570664405823, "learning_rate": 9.932684608427688e-06, "loss": 0.4714, "num_input_tokens_seen": 28543936, "step": 23475 }, { "epoch": 2.941987219646661, "grad_norm": 0.046564556658267975, "learning_rate": 9.932595170077656e-06, "loss": 0.461, "num_input_tokens_seen": 28550368, "step": 23480 }, { "epoch": 2.942613707555444, "grad_norm": 0.09598319977521896, "learning_rate": 9.932505672754303e-06, "loss": 0.4656, "num_input_tokens_seen": 28556704, "step": 23485 }, { "epoch": 2.9432401954642273, "grad_norm": 0.16856950521469116, "learning_rate": 9.932416116458698e-06, "loss": 0.4627, "num_input_tokens_seen": 28562976, "step": 23490 }, { "epoch": 2.943866683373011, "grad_norm": 0.03673694282770157, "learning_rate": 9.93232650119191e-06, "loss": 0.4638, "num_input_tokens_seen": 28569312, "step": 23495 }, { "epoch": 2.944493171281794, "grad_norm": 0.030561208724975586, "learning_rate": 9.932236826955013e-06, "loss": 0.4624, "num_input_tokens_seen": 28575424, "step": 23500 }, { "epoch": 2.9451196591905777, "grad_norm": 0.08268679678440094, "learning_rate": 9.932147093749078e-06, "loss": 0.4618, "num_input_tokens_seen": 28581632, "step": 23505 }, { "epoch": 2.945746147099361, "grad_norm": 0.09190922230482101, "learning_rate": 9.932057301575176e-06, "loss": 0.4635, "num_input_tokens_seen": 28587808, "step": 23510 }, { "epoch": 2.9463726350081445, "grad_norm": 0.10042889416217804, "learning_rate": 9.931967450434385e-06, "loss": 0.4648, "num_input_tokens_seen": 28593952, "step": 23515 }, { "epoch": 2.9469991229169277, "grad_norm": 0.08569735288619995, "learning_rate": 9.931877540327775e-06, "loss": 0.4575, "num_input_tokens_seen": 28600160, "step": 23520 }, { "epoch": 2.9476256108257113, "grad_norm": 0.11389047652482986, "learning_rate": 9.931787571256421e-06, "loss": 0.4662, "num_input_tokens_seen": 28606112, "step": 23525 }, { "epoch": 2.9482520987344945, "grad_norm": 0.12441518902778625, "learning_rate": 9.931697543221403e-06, "loss": 0.4571, "num_input_tokens_seen": 28612160, "step": 23530 }, { "epoch": 2.9488785866432776, "grad_norm": 0.12142418324947357, "learning_rate": 9.931607456223793e-06, "loss": 0.4646, "num_input_tokens_seen": 28618272, "step": 23535 }, { "epoch": 2.9495050745520612, "grad_norm": 0.14352600276470184, "learning_rate": 9.931517310264672e-06, "loss": 0.458, "num_input_tokens_seen": 28624096, "step": 23540 }, { "epoch": 2.9501315624608444, "grad_norm": 0.14474928379058838, "learning_rate": 9.931427105345112e-06, "loss": 0.477, "num_input_tokens_seen": 28629984, "step": 23545 }, { "epoch": 2.950758050369628, "grad_norm": 0.12115449458360672, "learning_rate": 9.931336841466195e-06, "loss": 0.4538, "num_input_tokens_seen": 28636224, "step": 23550 }, { "epoch": 2.951384538278411, "grad_norm": 0.13990215957164764, "learning_rate": 9.931246518629002e-06, "loss": 0.453, "num_input_tokens_seen": 28642272, "step": 23555 }, { "epoch": 2.9520110261871944, "grad_norm": 0.11960191279649734, "learning_rate": 9.93115613683461e-06, "loss": 0.4535, "num_input_tokens_seen": 28648416, "step": 23560 }, { "epoch": 2.952637514095978, "grad_norm": 0.11817967146635056, "learning_rate": 9.931065696084102e-06, "loss": 0.4787, "num_input_tokens_seen": 28654752, "step": 23565 }, { "epoch": 2.9532640020047616, "grad_norm": 0.09707687795162201, "learning_rate": 9.930975196378555e-06, "loss": 0.4561, "num_input_tokens_seen": 28660704, "step": 23570 }, { "epoch": 2.9538904899135447, "grad_norm": 0.10316469520330429, "learning_rate": 9.930884637719056e-06, "loss": 0.4679, "num_input_tokens_seen": 28666720, "step": 23575 }, { "epoch": 2.954516977822328, "grad_norm": 0.15261797606945038, "learning_rate": 9.930794020106684e-06, "loss": 0.4714, "num_input_tokens_seen": 28672672, "step": 23580 }, { "epoch": 2.9551434657311115, "grad_norm": 0.11802000552415848, "learning_rate": 9.930703343542524e-06, "loss": 0.4714, "num_input_tokens_seen": 28679008, "step": 23585 }, { "epoch": 2.9557699536398947, "grad_norm": 0.0873308777809143, "learning_rate": 9.930612608027662e-06, "loss": 0.4635, "num_input_tokens_seen": 28685056, "step": 23590 }, { "epoch": 2.9563964415486783, "grad_norm": 0.16044451296329498, "learning_rate": 9.930521813563179e-06, "loss": 0.4651, "num_input_tokens_seen": 28691040, "step": 23595 }, { "epoch": 2.9570229294574615, "grad_norm": 0.13742949068546295, "learning_rate": 9.930430960150163e-06, "loss": 0.4697, "num_input_tokens_seen": 28696576, "step": 23600 }, { "epoch": 2.9576494173662446, "grad_norm": 0.042299553751945496, "learning_rate": 9.930340047789699e-06, "loss": 0.467, "num_input_tokens_seen": 28702784, "step": 23605 }, { "epoch": 2.9582759052750283, "grad_norm": 0.10261780768632889, "learning_rate": 9.930249076482875e-06, "loss": 0.4654, "num_input_tokens_seen": 28709376, "step": 23610 }, { "epoch": 2.9589023931838114, "grad_norm": 0.129255011677742, "learning_rate": 9.930158046230777e-06, "loss": 0.4586, "num_input_tokens_seen": 28715712, "step": 23615 }, { "epoch": 2.959528881092595, "grad_norm": 0.13169489800930023, "learning_rate": 9.930066957034496e-06, "loss": 0.4611, "num_input_tokens_seen": 28721920, "step": 23620 }, { "epoch": 2.960155369001378, "grad_norm": 0.14271797239780426, "learning_rate": 9.92997580889512e-06, "loss": 0.4612, "num_input_tokens_seen": 28727584, "step": 23625 }, { "epoch": 2.9607818569101614, "grad_norm": 0.11519873142242432, "learning_rate": 9.929884601813737e-06, "loss": 0.4655, "num_input_tokens_seen": 28733376, "step": 23630 }, { "epoch": 2.961408344818945, "grad_norm": 0.07637994736433029, "learning_rate": 9.92979333579144e-06, "loss": 0.4638, "num_input_tokens_seen": 28739392, "step": 23635 }, { "epoch": 2.9620348327277286, "grad_norm": 0.08748990297317505, "learning_rate": 9.929702010829318e-06, "loss": 0.4626, "num_input_tokens_seen": 28745440, "step": 23640 }, { "epoch": 2.9626613206365118, "grad_norm": 0.08878259360790253, "learning_rate": 9.929610626928462e-06, "loss": 0.4613, "num_input_tokens_seen": 28751264, "step": 23645 }, { "epoch": 2.963287808545295, "grad_norm": 0.08679588884115219, "learning_rate": 9.92951918408997e-06, "loss": 0.4668, "num_input_tokens_seen": 28757376, "step": 23650 }, { "epoch": 2.9639142964540786, "grad_norm": 0.07375451922416687, "learning_rate": 9.92942768231493e-06, "loss": 0.4634, "num_input_tokens_seen": 28763296, "step": 23655 }, { "epoch": 2.9645407843628617, "grad_norm": 0.029236450791358948, "learning_rate": 9.929336121604438e-06, "loss": 0.4631, "num_input_tokens_seen": 28769632, "step": 23660 }, { "epoch": 2.9651672722716453, "grad_norm": 0.0800214633345604, "learning_rate": 9.929244501959587e-06, "loss": 0.4657, "num_input_tokens_seen": 28775648, "step": 23665 }, { "epoch": 2.9657937601804285, "grad_norm": 0.1296110451221466, "learning_rate": 9.929152823381475e-06, "loss": 0.4605, "num_input_tokens_seen": 28781696, "step": 23670 }, { "epoch": 2.9664202480892117, "grad_norm": 0.1113588735461235, "learning_rate": 9.929061085871197e-06, "loss": 0.46, "num_input_tokens_seen": 28787936, "step": 23675 }, { "epoch": 2.9670467359979953, "grad_norm": 0.03635712340474129, "learning_rate": 9.928969289429848e-06, "loss": 0.4623, "num_input_tokens_seen": 28794112, "step": 23680 }, { "epoch": 2.9676732239067785, "grad_norm": 0.15973645448684692, "learning_rate": 9.928877434058528e-06, "loss": 0.4685, "num_input_tokens_seen": 28800096, "step": 23685 }, { "epoch": 2.968299711815562, "grad_norm": 0.08476077765226364, "learning_rate": 9.928785519758335e-06, "loss": 0.4597, "num_input_tokens_seen": 28806144, "step": 23690 }, { "epoch": 2.9689261997243452, "grad_norm": 0.08509311825037003, "learning_rate": 9.928693546530365e-06, "loss": 0.46, "num_input_tokens_seen": 28812224, "step": 23695 }, { "epoch": 2.9695526876331284, "grad_norm": 0.1304292380809784, "learning_rate": 9.928601514375724e-06, "loss": 0.4561, "num_input_tokens_seen": 28818336, "step": 23700 }, { "epoch": 2.970179175541912, "grad_norm": 0.03447437286376953, "learning_rate": 9.928509423295504e-06, "loss": 0.4619, "num_input_tokens_seen": 28824480, "step": 23705 }, { "epoch": 2.9708056634506956, "grad_norm": 0.08802583813667297, "learning_rate": 9.928417273290813e-06, "loss": 0.4642, "num_input_tokens_seen": 28830176, "step": 23710 }, { "epoch": 2.971432151359479, "grad_norm": 0.0911392942070961, "learning_rate": 9.928325064362747e-06, "loss": 0.4641, "num_input_tokens_seen": 28835968, "step": 23715 }, { "epoch": 2.972058639268262, "grad_norm": 0.0385020449757576, "learning_rate": 9.928232796512414e-06, "loss": 0.4628, "num_input_tokens_seen": 28842304, "step": 23720 }, { "epoch": 2.9726851271770456, "grad_norm": 0.17165762186050415, "learning_rate": 9.928140469740913e-06, "loss": 0.4651, "num_input_tokens_seen": 28848512, "step": 23725 }, { "epoch": 2.9733116150858288, "grad_norm": 0.1127389445900917, "learning_rate": 9.928048084049352e-06, "loss": 0.4636, "num_input_tokens_seen": 28854080, "step": 23730 }, { "epoch": 2.9739381029946124, "grad_norm": 0.0755431279540062, "learning_rate": 9.927955639438828e-06, "loss": 0.4634, "num_input_tokens_seen": 28860256, "step": 23735 }, { "epoch": 2.9745645909033955, "grad_norm": 0.1297692060470581, "learning_rate": 9.927863135910455e-06, "loss": 0.4614, "num_input_tokens_seen": 28866368, "step": 23740 }, { "epoch": 2.9751910788121787, "grad_norm": 0.1372380256652832, "learning_rate": 9.927770573465334e-06, "loss": 0.462, "num_input_tokens_seen": 28872672, "step": 23745 }, { "epoch": 2.9758175667209623, "grad_norm": 0.11612176895141602, "learning_rate": 9.927677952104572e-06, "loss": 0.4628, "num_input_tokens_seen": 28879072, "step": 23750 }, { "epoch": 2.9764440546297455, "grad_norm": 0.08882761001586914, "learning_rate": 9.927585271829278e-06, "loss": 0.462, "num_input_tokens_seen": 28884928, "step": 23755 }, { "epoch": 2.977070542538529, "grad_norm": 0.08772940188646317, "learning_rate": 9.92749253264056e-06, "loss": 0.461, "num_input_tokens_seen": 28890560, "step": 23760 }, { "epoch": 2.9776970304473123, "grad_norm": 0.14505282044410706, "learning_rate": 9.927399734539525e-06, "loss": 0.4612, "num_input_tokens_seen": 28896576, "step": 23765 }, { "epoch": 2.978323518356096, "grad_norm": 0.08847019821405411, "learning_rate": 9.927306877527282e-06, "loss": 0.4705, "num_input_tokens_seen": 28901920, "step": 23770 }, { "epoch": 2.978950006264879, "grad_norm": 0.11061878502368927, "learning_rate": 9.927213961604944e-06, "loss": 0.4622, "num_input_tokens_seen": 28908128, "step": 23775 }, { "epoch": 2.9795764941736627, "grad_norm": 0.1344558149576187, "learning_rate": 9.927120986773622e-06, "loss": 0.4712, "num_input_tokens_seen": 28914560, "step": 23780 }, { "epoch": 2.980202982082446, "grad_norm": 0.07369881868362427, "learning_rate": 9.927027953034425e-06, "loss": 0.4596, "num_input_tokens_seen": 28920704, "step": 23785 }, { "epoch": 2.980829469991229, "grad_norm": 0.10848165303468704, "learning_rate": 9.926934860388466e-06, "loss": 0.4626, "num_input_tokens_seen": 28927040, "step": 23790 }, { "epoch": 2.9814559579000126, "grad_norm": 0.13156096637248993, "learning_rate": 9.926841708836859e-06, "loss": 0.4639, "num_input_tokens_seen": 28932992, "step": 23795 }, { "epoch": 2.982082445808796, "grad_norm": 0.09709841012954712, "learning_rate": 9.926748498380716e-06, "loss": 0.4621, "num_input_tokens_seen": 28939232, "step": 23800 }, { "epoch": 2.9827089337175794, "grad_norm": 0.15815433859825134, "learning_rate": 9.926655229021152e-06, "loss": 0.4555, "num_input_tokens_seen": 28945344, "step": 23805 }, { "epoch": 2.9833354216263626, "grad_norm": 0.09108249098062515, "learning_rate": 9.926561900759284e-06, "loss": 0.4669, "num_input_tokens_seen": 28951520, "step": 23810 }, { "epoch": 2.9839619095351457, "grad_norm": 0.17627593874931335, "learning_rate": 9.926468513596225e-06, "loss": 0.4607, "num_input_tokens_seen": 28957696, "step": 23815 }, { "epoch": 2.9845883974439293, "grad_norm": 0.035693082958459854, "learning_rate": 9.926375067533097e-06, "loss": 0.4649, "num_input_tokens_seen": 28963584, "step": 23820 }, { "epoch": 2.985214885352713, "grad_norm": 0.07684499770402908, "learning_rate": 9.92628156257101e-06, "loss": 0.465, "num_input_tokens_seen": 28969792, "step": 23825 }, { "epoch": 2.985841373261496, "grad_norm": 0.12735222280025482, "learning_rate": 9.926187998711086e-06, "loss": 0.4636, "num_input_tokens_seen": 28976000, "step": 23830 }, { "epoch": 2.9864678611702793, "grad_norm": 0.09063099324703217, "learning_rate": 9.92609437595444e-06, "loss": 0.4633, "num_input_tokens_seen": 28982048, "step": 23835 }, { "epoch": 2.987094349079063, "grad_norm": 0.18619699776172638, "learning_rate": 9.926000694302198e-06, "loss": 0.4685, "num_input_tokens_seen": 28988160, "step": 23840 }, { "epoch": 2.987720836987846, "grad_norm": 0.12685571610927582, "learning_rate": 9.925906953755472e-06, "loss": 0.4544, "num_input_tokens_seen": 28994048, "step": 23845 }, { "epoch": 2.9883473248966297, "grad_norm": 0.13226847350597382, "learning_rate": 9.925813154315391e-06, "loss": 0.4681, "num_input_tokens_seen": 29000032, "step": 23850 }, { "epoch": 2.988973812805413, "grad_norm": 0.10968603193759918, "learning_rate": 9.925719295983072e-06, "loss": 0.4657, "num_input_tokens_seen": 29006304, "step": 23855 }, { "epoch": 2.989600300714196, "grad_norm": 0.08222369849681854, "learning_rate": 9.925625378759636e-06, "loss": 0.4611, "num_input_tokens_seen": 29012320, "step": 23860 }, { "epoch": 2.9902267886229796, "grad_norm": 0.0819263756275177, "learning_rate": 9.925531402646207e-06, "loss": 0.4657, "num_input_tokens_seen": 29017120, "step": 23865 }, { "epoch": 2.990853276531763, "grad_norm": 0.11028563976287842, "learning_rate": 9.925437367643908e-06, "loss": 0.4662, "num_input_tokens_seen": 29022848, "step": 23870 }, { "epoch": 2.9914797644405464, "grad_norm": 0.07566147297620773, "learning_rate": 9.925343273753866e-06, "loss": 0.4626, "num_input_tokens_seen": 29028832, "step": 23875 }, { "epoch": 2.9921062523493296, "grad_norm": 0.0826629176735878, "learning_rate": 9.925249120977205e-06, "loss": 0.4605, "num_input_tokens_seen": 29035008, "step": 23880 }, { "epoch": 2.9927327402581128, "grad_norm": 0.09344995766878128, "learning_rate": 9.92515490931505e-06, "loss": 0.4636, "num_input_tokens_seen": 29040352, "step": 23885 }, { "epoch": 2.9933592281668964, "grad_norm": 0.13354100286960602, "learning_rate": 9.925060638768525e-06, "loss": 0.4631, "num_input_tokens_seen": 29046656, "step": 23890 }, { "epoch": 2.99398571607568, "grad_norm": 0.09474976360797882, "learning_rate": 9.92496630933876e-06, "loss": 0.4622, "num_input_tokens_seen": 29052672, "step": 23895 }, { "epoch": 2.994612203984463, "grad_norm": 0.0837300568819046, "learning_rate": 9.924871921026882e-06, "loss": 0.4668, "num_input_tokens_seen": 29058592, "step": 23900 }, { "epoch": 2.9952386918932463, "grad_norm": 0.14545948803424835, "learning_rate": 9.92477747383402e-06, "loss": 0.4591, "num_input_tokens_seen": 29064352, "step": 23905 }, { "epoch": 2.99586517980203, "grad_norm": 0.08506704866886139, "learning_rate": 9.924682967761303e-06, "loss": 0.4631, "num_input_tokens_seen": 29070880, "step": 23910 }, { "epoch": 2.996491667710813, "grad_norm": 0.11651729792356491, "learning_rate": 9.92458840280986e-06, "loss": 0.4655, "num_input_tokens_seen": 29076640, "step": 23915 }, { "epoch": 2.9971181556195967, "grad_norm": 0.08383883535861969, "learning_rate": 9.924493778980823e-06, "loss": 0.4598, "num_input_tokens_seen": 29083008, "step": 23920 }, { "epoch": 2.99774464352838, "grad_norm": 0.0809021145105362, "learning_rate": 9.924399096275322e-06, "loss": 0.4595, "num_input_tokens_seen": 29089120, "step": 23925 }, { "epoch": 2.998371131437163, "grad_norm": 0.12659314274787903, "learning_rate": 9.92430435469449e-06, "loss": 0.4607, "num_input_tokens_seen": 29094528, "step": 23930 }, { "epoch": 2.9989976193459467, "grad_norm": 0.08203128725290298, "learning_rate": 9.924209554239458e-06, "loss": 0.4674, "num_input_tokens_seen": 29100704, "step": 23935 }, { "epoch": 2.99962410725473, "grad_norm": 0.07981671392917633, "learning_rate": 9.924114694911363e-06, "loss": 0.4694, "num_input_tokens_seen": 29106592, "step": 23940 }, { "epoch": 3.0002505951635134, "grad_norm": 0.12445279210805893, "learning_rate": 9.924019776711336e-06, "loss": 0.4632, "num_input_tokens_seen": 29112768, "step": 23945 }, { "epoch": 3.0008770830722966, "grad_norm": 0.12745703756809235, "learning_rate": 9.923924799640513e-06, "loss": 0.4607, "num_input_tokens_seen": 29118816, "step": 23950 }, { "epoch": 3.0015035709810802, "grad_norm": 0.08260907232761383, "learning_rate": 9.923829763700027e-06, "loss": 0.4664, "num_input_tokens_seen": 29124704, "step": 23955 }, { "epoch": 3.0021300588898634, "grad_norm": 0.034204691648483276, "learning_rate": 9.923734668891018e-06, "loss": 0.4613, "num_input_tokens_seen": 29131072, "step": 23960 }, { "epoch": 3.002756546798647, "grad_norm": 0.0777648314833641, "learning_rate": 9.923639515214623e-06, "loss": 0.4607, "num_input_tokens_seen": 29137184, "step": 23965 }, { "epoch": 3.00338303470743, "grad_norm": 0.032577697187662125, "learning_rate": 9.923544302671977e-06, "loss": 0.4623, "num_input_tokens_seen": 29143200, "step": 23970 }, { "epoch": 3.0040095226162133, "grad_norm": 0.11490917205810547, "learning_rate": 9.923449031264218e-06, "loss": 0.4629, "num_input_tokens_seen": 29149248, "step": 23975 }, { "epoch": 3.004636010524997, "grad_norm": 0.08235976845026016, "learning_rate": 9.923353700992487e-06, "loss": 0.4582, "num_input_tokens_seen": 29155136, "step": 23980 }, { "epoch": 3.00526249843378, "grad_norm": 0.12998077273368835, "learning_rate": 9.923258311857925e-06, "loss": 0.4583, "num_input_tokens_seen": 29160992, "step": 23985 }, { "epoch": 3.0058889863425637, "grad_norm": 0.11602428555488586, "learning_rate": 9.923162863861667e-06, "loss": 0.4676, "num_input_tokens_seen": 29167104, "step": 23990 }, { "epoch": 3.006515474251347, "grad_norm": 0.16209153831005096, "learning_rate": 9.923067357004861e-06, "loss": 0.4552, "num_input_tokens_seen": 29172736, "step": 23995 }, { "epoch": 3.0071419621601305, "grad_norm": 0.11352405697107315, "learning_rate": 9.922971791288644e-06, "loss": 0.4591, "num_input_tokens_seen": 29178912, "step": 24000 }, { "epoch": 3.0077684500689137, "grad_norm": 0.09562305361032486, "learning_rate": 9.92287616671416e-06, "loss": 0.4627, "num_input_tokens_seen": 29185504, "step": 24005 }, { "epoch": 3.008394937977697, "grad_norm": 0.1108407974243164, "learning_rate": 9.922780483282554e-06, "loss": 0.4666, "num_input_tokens_seen": 29191520, "step": 24010 }, { "epoch": 3.0090214258864805, "grad_norm": 0.14158891141414642, "learning_rate": 9.922684740994968e-06, "loss": 0.4528, "num_input_tokens_seen": 29197664, "step": 24015 }, { "epoch": 3.0096479137952636, "grad_norm": 0.10410627722740173, "learning_rate": 9.922588939852547e-06, "loss": 0.4646, "num_input_tokens_seen": 29203904, "step": 24020 }, { "epoch": 3.0102744017040473, "grad_norm": 0.12411580234766006, "learning_rate": 9.922493079856436e-06, "loss": 0.4546, "num_input_tokens_seen": 29209888, "step": 24025 }, { "epoch": 3.0109008896128304, "grad_norm": 0.17587515711784363, "learning_rate": 9.922397161007782e-06, "loss": 0.4568, "num_input_tokens_seen": 29216256, "step": 24030 }, { "epoch": 3.011527377521614, "grad_norm": 0.18054383993148804, "learning_rate": 9.922301183307733e-06, "loss": 0.4755, "num_input_tokens_seen": 29222176, "step": 24035 }, { "epoch": 3.012153865430397, "grad_norm": 0.16458958387374878, "learning_rate": 9.922205146757432e-06, "loss": 0.4727, "num_input_tokens_seen": 29228448, "step": 24040 }, { "epoch": 3.0127803533391804, "grad_norm": 0.10058226436376572, "learning_rate": 9.92210905135803e-06, "loss": 0.4537, "num_input_tokens_seen": 29234432, "step": 24045 }, { "epoch": 3.013406841247964, "grad_norm": 0.03397374972701073, "learning_rate": 9.922012897110679e-06, "loss": 0.4657, "num_input_tokens_seen": 29240992, "step": 24050 }, { "epoch": 3.014033329156747, "grad_norm": 0.08934175968170166, "learning_rate": 9.921916684016525e-06, "loss": 0.4614, "num_input_tokens_seen": 29247136, "step": 24055 }, { "epoch": 3.0146598170655308, "grad_norm": 0.12773706018924713, "learning_rate": 9.921820412076716e-06, "loss": 0.4569, "num_input_tokens_seen": 29253184, "step": 24060 }, { "epoch": 3.015286304974314, "grad_norm": 0.10569209605455399, "learning_rate": 9.921724081292409e-06, "loss": 0.4552, "num_input_tokens_seen": 29259104, "step": 24065 }, { "epoch": 3.0159127928830975, "grad_norm": 0.09952830523252487, "learning_rate": 9.921627691664752e-06, "loss": 0.4705, "num_input_tokens_seen": 29265024, "step": 24070 }, { "epoch": 3.0165392807918807, "grad_norm": 0.0938916876912117, "learning_rate": 9.921531243194897e-06, "loss": 0.4609, "num_input_tokens_seen": 29271200, "step": 24075 }, { "epoch": 3.017165768700664, "grad_norm": 0.038955677300691605, "learning_rate": 9.921434735883999e-06, "loss": 0.467, "num_input_tokens_seen": 29277184, "step": 24080 }, { "epoch": 3.0177922566094475, "grad_norm": 0.07836582511663437, "learning_rate": 9.92133816973321e-06, "loss": 0.4621, "num_input_tokens_seen": 29283136, "step": 24085 }, { "epoch": 3.0184187445182307, "grad_norm": 0.12830494344234467, "learning_rate": 9.921241544743686e-06, "loss": 0.4574, "num_input_tokens_seen": 29288672, "step": 24090 }, { "epoch": 3.0190452324270143, "grad_norm": 0.10721205174922943, "learning_rate": 9.921144860916584e-06, "loss": 0.4643, "num_input_tokens_seen": 29294944, "step": 24095 }, { "epoch": 3.0196717203357974, "grad_norm": 0.13952971994876862, "learning_rate": 9.921048118253057e-06, "loss": 0.4501, "num_input_tokens_seen": 29301152, "step": 24100 }, { "epoch": 3.020298208244581, "grad_norm": 0.042994752526283264, "learning_rate": 9.920951316754259e-06, "loss": 0.4597, "num_input_tokens_seen": 29307328, "step": 24105 }, { "epoch": 3.0209246961533642, "grad_norm": 0.09616006910800934, "learning_rate": 9.920854456421353e-06, "loss": 0.465, "num_input_tokens_seen": 29313440, "step": 24110 }, { "epoch": 3.0215511840621474, "grad_norm": 0.03633410111069679, "learning_rate": 9.920757537255496e-06, "loss": 0.4604, "num_input_tokens_seen": 29319584, "step": 24115 }, { "epoch": 3.022177671970931, "grad_norm": 0.1271398365497589, "learning_rate": 9.920660559257844e-06, "loss": 0.4746, "num_input_tokens_seen": 29325632, "step": 24120 }, { "epoch": 3.022804159879714, "grad_norm": 0.18918097019195557, "learning_rate": 9.920563522429557e-06, "loss": 0.4716, "num_input_tokens_seen": 29331360, "step": 24125 }, { "epoch": 3.023430647788498, "grad_norm": 0.12296313792467117, "learning_rate": 9.920466426771799e-06, "loss": 0.4576, "num_input_tokens_seen": 29337344, "step": 24130 }, { "epoch": 3.024057135697281, "grad_norm": 0.09522844105958939, "learning_rate": 9.920369272285726e-06, "loss": 0.4571, "num_input_tokens_seen": 29343168, "step": 24135 }, { "epoch": 3.0246836236060646, "grad_norm": 0.0953555554151535, "learning_rate": 9.920272058972503e-06, "loss": 0.4582, "num_input_tokens_seen": 29349088, "step": 24140 }, { "epoch": 3.0253101115148477, "grad_norm": 0.14413656294345856, "learning_rate": 9.920174786833288e-06, "loss": 0.4662, "num_input_tokens_seen": 29355136, "step": 24145 }, { "epoch": 3.025936599423631, "grad_norm": 0.219157874584198, "learning_rate": 9.920077455869248e-06, "loss": 0.448, "num_input_tokens_seen": 29361472, "step": 24150 }, { "epoch": 3.0265630873324145, "grad_norm": 0.08774826675653458, "learning_rate": 9.919980066081547e-06, "loss": 0.4691, "num_input_tokens_seen": 29367552, "step": 24155 }, { "epoch": 3.0271895752411977, "grad_norm": 0.14386285841464996, "learning_rate": 9.919882617471345e-06, "loss": 0.4633, "num_input_tokens_seen": 29373760, "step": 24160 }, { "epoch": 3.0278160631499813, "grad_norm": 0.1341376155614853, "learning_rate": 9.919785110039812e-06, "loss": 0.4675, "num_input_tokens_seen": 29379840, "step": 24165 }, { "epoch": 3.0284425510587645, "grad_norm": 0.11686854064464569, "learning_rate": 9.919687543788112e-06, "loss": 0.4671, "num_input_tokens_seen": 29385888, "step": 24170 }, { "epoch": 3.029069038967548, "grad_norm": 0.14048565924167633, "learning_rate": 9.91958991871741e-06, "loss": 0.4528, "num_input_tokens_seen": 29392096, "step": 24175 }, { "epoch": 3.0296955268763313, "grad_norm": 0.08894901722669601, "learning_rate": 9.919492234828875e-06, "loss": 0.4676, "num_input_tokens_seen": 29397696, "step": 24180 }, { "epoch": 3.0303220147851144, "grad_norm": 0.13721832633018494, "learning_rate": 9.919394492123675e-06, "loss": 0.4706, "num_input_tokens_seen": 29403392, "step": 24185 }, { "epoch": 3.030948502693898, "grad_norm": 0.1739969402551651, "learning_rate": 9.919296690602974e-06, "loss": 0.4693, "num_input_tokens_seen": 29409536, "step": 24190 }, { "epoch": 3.031574990602681, "grad_norm": 0.09793590009212494, "learning_rate": 9.91919883026795e-06, "loss": 0.4623, "num_input_tokens_seen": 29415584, "step": 24195 }, { "epoch": 3.032201478511465, "grad_norm": 0.0490647554397583, "learning_rate": 9.919100911119766e-06, "loss": 0.4575, "num_input_tokens_seen": 29421696, "step": 24200 }, { "epoch": 3.032827966420248, "grad_norm": 0.09616425633430481, "learning_rate": 9.919002933159594e-06, "loss": 0.4595, "num_input_tokens_seen": 29427776, "step": 24205 }, { "epoch": 3.0334544543290316, "grad_norm": 0.0823572501540184, "learning_rate": 9.918904896388609e-06, "loss": 0.4615, "num_input_tokens_seen": 29433760, "step": 24210 }, { "epoch": 3.0340809422378148, "grad_norm": 0.11612134426832199, "learning_rate": 9.918806800807978e-06, "loss": 0.4652, "num_input_tokens_seen": 29439936, "step": 24215 }, { "epoch": 3.0347074301465984, "grad_norm": 0.0976700559258461, "learning_rate": 9.918708646418876e-06, "loss": 0.4599, "num_input_tokens_seen": 29446048, "step": 24220 }, { "epoch": 3.0353339180553816, "grad_norm": 0.09722914546728134, "learning_rate": 9.918610433222478e-06, "loss": 0.4634, "num_input_tokens_seen": 29452160, "step": 24225 }, { "epoch": 3.0359604059641647, "grad_norm": 0.09226784855127335, "learning_rate": 9.918512161219958e-06, "loss": 0.4641, "num_input_tokens_seen": 29458528, "step": 24230 }, { "epoch": 3.0365868938729483, "grad_norm": 0.1469654142856598, "learning_rate": 9.918413830412487e-06, "loss": 0.4625, "num_input_tokens_seen": 29464672, "step": 24235 }, { "epoch": 3.0372133817817315, "grad_norm": 0.11292295157909393, "learning_rate": 9.918315440801245e-06, "loss": 0.4622, "num_input_tokens_seen": 29470848, "step": 24240 }, { "epoch": 3.037839869690515, "grad_norm": 0.13985596597194672, "learning_rate": 9.918216992387406e-06, "loss": 0.4499, "num_input_tokens_seen": 29477184, "step": 24245 }, { "epoch": 3.0384663575992983, "grad_norm": 0.07797788828611374, "learning_rate": 9.91811848517215e-06, "loss": 0.4631, "num_input_tokens_seen": 29483104, "step": 24250 }, { "epoch": 3.039092845508082, "grad_norm": 0.09712488204240799, "learning_rate": 9.91801991915665e-06, "loss": 0.4656, "num_input_tokens_seen": 29489280, "step": 24255 }, { "epoch": 3.039719333416865, "grad_norm": 0.0789286345243454, "learning_rate": 9.917921294342089e-06, "loss": 0.4621, "num_input_tokens_seen": 29495520, "step": 24260 }, { "epoch": 3.0403458213256482, "grad_norm": 0.11966873705387115, "learning_rate": 9.91782261072964e-06, "loss": 0.4651, "num_input_tokens_seen": 29501760, "step": 24265 }, { "epoch": 3.040972309234432, "grad_norm": 0.07940058410167694, "learning_rate": 9.917723868320492e-06, "loss": 0.4594, "num_input_tokens_seen": 29508064, "step": 24270 }, { "epoch": 3.041598797143215, "grad_norm": 0.0936005711555481, "learning_rate": 9.91762506711582e-06, "loss": 0.4666, "num_input_tokens_seen": 29513152, "step": 24275 }, { "epoch": 3.0422252850519986, "grad_norm": 0.0948001816868782, "learning_rate": 9.917526207116804e-06, "loss": 0.4603, "num_input_tokens_seen": 29519328, "step": 24280 }, { "epoch": 3.042851772960782, "grad_norm": 0.07803946733474731, "learning_rate": 9.917427288324628e-06, "loss": 0.4641, "num_input_tokens_seen": 29525504, "step": 24285 }, { "epoch": 3.0434782608695654, "grad_norm": 0.08366196602582932, "learning_rate": 9.917328310740475e-06, "loss": 0.4604, "num_input_tokens_seen": 29531840, "step": 24290 }, { "epoch": 3.0441047487783486, "grad_norm": 0.12880286574363708, "learning_rate": 9.917229274365527e-06, "loss": 0.4687, "num_input_tokens_seen": 29537312, "step": 24295 }, { "epoch": 3.0447312366871317, "grad_norm": 0.08996909856796265, "learning_rate": 9.91713017920097e-06, "loss": 0.469, "num_input_tokens_seen": 29543264, "step": 24300 }, { "epoch": 3.0453577245959154, "grad_norm": 0.08686283230781555, "learning_rate": 9.917031025247987e-06, "loss": 0.4607, "num_input_tokens_seen": 29549536, "step": 24305 }, { "epoch": 3.0459842125046985, "grad_norm": 0.12103703618049622, "learning_rate": 9.916931812507764e-06, "loss": 0.4632, "num_input_tokens_seen": 29555392, "step": 24310 }, { "epoch": 3.046610700413482, "grad_norm": 0.19129973649978638, "learning_rate": 9.916832540981487e-06, "loss": 0.4602, "num_input_tokens_seen": 29561120, "step": 24315 }, { "epoch": 3.0472371883222653, "grad_norm": 0.16648909449577332, "learning_rate": 9.916733210670345e-06, "loss": 0.4654, "num_input_tokens_seen": 29567168, "step": 24320 }, { "epoch": 3.047863676231049, "grad_norm": 0.15862798690795898, "learning_rate": 9.91663382157552e-06, "loss": 0.459, "num_input_tokens_seen": 29572736, "step": 24325 }, { "epoch": 3.048490164139832, "grad_norm": 0.10341664403676987, "learning_rate": 9.916534373698207e-06, "loss": 0.4676, "num_input_tokens_seen": 29578976, "step": 24330 }, { "epoch": 3.0491166520486153, "grad_norm": 0.08619147539138794, "learning_rate": 9.916434867039591e-06, "loss": 0.4622, "num_input_tokens_seen": 29584992, "step": 24335 }, { "epoch": 3.049743139957399, "grad_norm": 0.16099593043327332, "learning_rate": 9.916335301600863e-06, "loss": 0.4637, "num_input_tokens_seen": 29591168, "step": 24340 }, { "epoch": 3.050369627866182, "grad_norm": 0.08328071236610413, "learning_rate": 9.916235677383214e-06, "loss": 0.46, "num_input_tokens_seen": 29597280, "step": 24345 }, { "epoch": 3.0509961157749657, "grad_norm": 0.1357356309890747, "learning_rate": 9.916135994387832e-06, "loss": 0.4572, "num_input_tokens_seen": 29602688, "step": 24350 }, { "epoch": 3.051622603683749, "grad_norm": 0.12969304621219635, "learning_rate": 9.916036252615913e-06, "loss": 0.458, "num_input_tokens_seen": 29608736, "step": 24355 }, { "epoch": 3.0522490915925324, "grad_norm": 0.0961378961801529, "learning_rate": 9.915936452068646e-06, "loss": 0.4611, "num_input_tokens_seen": 29615072, "step": 24360 }, { "epoch": 3.0528755795013156, "grad_norm": 0.04280318692326546, "learning_rate": 9.915836592747225e-06, "loss": 0.4646, "num_input_tokens_seen": 29621376, "step": 24365 }, { "epoch": 3.0535020674100988, "grad_norm": 0.11032138019800186, "learning_rate": 9.915736674652846e-06, "loss": 0.4583, "num_input_tokens_seen": 29626464, "step": 24370 }, { "epoch": 3.0541285553188824, "grad_norm": 0.1633802205324173, "learning_rate": 9.915636697786701e-06, "loss": 0.4556, "num_input_tokens_seen": 29632576, "step": 24375 }, { "epoch": 3.0547550432276656, "grad_norm": 0.1506597250699997, "learning_rate": 9.915536662149987e-06, "loss": 0.4666, "num_input_tokens_seen": 29638720, "step": 24380 }, { "epoch": 3.055381531136449, "grad_norm": 0.11892179399728775, "learning_rate": 9.915436567743901e-06, "loss": 0.4638, "num_input_tokens_seen": 29644992, "step": 24385 }, { "epoch": 3.0560080190452323, "grad_norm": 0.08527432382106781, "learning_rate": 9.915336414569636e-06, "loss": 0.466, "num_input_tokens_seen": 29651040, "step": 24390 }, { "epoch": 3.056634506954016, "grad_norm": 0.09806093573570251, "learning_rate": 9.915236202628392e-06, "loss": 0.4587, "num_input_tokens_seen": 29657152, "step": 24395 }, { "epoch": 3.057260994862799, "grad_norm": 0.10624795407056808, "learning_rate": 9.915135931921366e-06, "loss": 0.4779, "num_input_tokens_seen": 29663712, "step": 24400 }, { "epoch": 3.0578874827715827, "grad_norm": 0.0941372737288475, "learning_rate": 9.91503560244976e-06, "loss": 0.4534, "num_input_tokens_seen": 29670144, "step": 24405 }, { "epoch": 3.058513970680366, "grad_norm": 0.10083182156085968, "learning_rate": 9.91493521421477e-06, "loss": 0.462, "num_input_tokens_seen": 29676384, "step": 24410 }, { "epoch": 3.059140458589149, "grad_norm": 0.1432267129421234, "learning_rate": 9.914834767217597e-06, "loss": 0.4567, "num_input_tokens_seen": 29682464, "step": 24415 }, { "epoch": 3.0597669464979327, "grad_norm": 0.09203674644231796, "learning_rate": 9.914734261459441e-06, "loss": 0.4627, "num_input_tokens_seen": 29688352, "step": 24420 }, { "epoch": 3.060393434406716, "grad_norm": 0.0825885534286499, "learning_rate": 9.914633696941506e-06, "loss": 0.4629, "num_input_tokens_seen": 29694464, "step": 24425 }, { "epoch": 3.0610199223154995, "grad_norm": 0.1365789920091629, "learning_rate": 9.914533073664992e-06, "loss": 0.4598, "num_input_tokens_seen": 29700768, "step": 24430 }, { "epoch": 3.0616464102242826, "grad_norm": 0.14074833691120148, "learning_rate": 9.914432391631105e-06, "loss": 0.4634, "num_input_tokens_seen": 29706880, "step": 24435 }, { "epoch": 3.0622728981330662, "grad_norm": 0.1481972187757492, "learning_rate": 9.914331650841046e-06, "loss": 0.4633, "num_input_tokens_seen": 29712864, "step": 24440 }, { "epoch": 3.0628993860418494, "grad_norm": 0.11927804350852966, "learning_rate": 9.914230851296021e-06, "loss": 0.4555, "num_input_tokens_seen": 29718368, "step": 24445 }, { "epoch": 3.0635258739506326, "grad_norm": 0.11365072429180145, "learning_rate": 9.914129992997232e-06, "loss": 0.4649, "num_input_tokens_seen": 29724768, "step": 24450 }, { "epoch": 3.064152361859416, "grad_norm": 0.042805951088666916, "learning_rate": 9.91402907594589e-06, "loss": 0.4673, "num_input_tokens_seen": 29731040, "step": 24455 }, { "epoch": 3.0647788497681994, "grad_norm": 0.053674038499593735, "learning_rate": 9.9139281001432e-06, "loss": 0.4683, "num_input_tokens_seen": 29736896, "step": 24460 }, { "epoch": 3.065405337676983, "grad_norm": 0.044335346668958664, "learning_rate": 9.913827065590364e-06, "loss": 0.4549, "num_input_tokens_seen": 29743040, "step": 24465 }, { "epoch": 3.066031825585766, "grad_norm": 0.18835268914699554, "learning_rate": 9.913725972288596e-06, "loss": 0.4621, "num_input_tokens_seen": 29748864, "step": 24470 }, { "epoch": 3.0666583134945498, "grad_norm": 0.12662865221500397, "learning_rate": 9.913624820239105e-06, "loss": 0.457, "num_input_tokens_seen": 29754944, "step": 24475 }, { "epoch": 3.067284801403333, "grad_norm": 0.1623491495847702, "learning_rate": 9.913523609443096e-06, "loss": 0.4636, "num_input_tokens_seen": 29760800, "step": 24480 }, { "epoch": 3.067911289312116, "grad_norm": 0.27725502848625183, "learning_rate": 9.913422339901781e-06, "loss": 0.4631, "num_input_tokens_seen": 29766720, "step": 24485 }, { "epoch": 3.0685377772208997, "grad_norm": 0.8884960412979126, "learning_rate": 9.913321011616372e-06, "loss": 0.4587, "num_input_tokens_seen": 29772640, "step": 24490 }, { "epoch": 3.069164265129683, "grad_norm": 0.5861693620681763, "learning_rate": 9.913219624588079e-06, "loss": 0.4712, "num_input_tokens_seen": 29778208, "step": 24495 }, { "epoch": 3.0697907530384665, "grad_norm": 0.20762060582637787, "learning_rate": 9.913118178818116e-06, "loss": 0.4733, "num_input_tokens_seen": 29784512, "step": 24500 }, { "epoch": 3.0704172409472497, "grad_norm": 0.11600557714700699, "learning_rate": 9.913016674307691e-06, "loss": 0.4606, "num_input_tokens_seen": 29790784, "step": 24505 }, { "epoch": 3.0710437288560333, "grad_norm": 0.18433290719985962, "learning_rate": 9.912915111058024e-06, "loss": 0.4601, "num_input_tokens_seen": 29796928, "step": 24510 }, { "epoch": 3.0716702167648164, "grad_norm": 0.12520091235637665, "learning_rate": 9.912813489070325e-06, "loss": 0.4642, "num_input_tokens_seen": 29803552, "step": 24515 }, { "epoch": 3.0722967046735996, "grad_norm": 0.16901329159736633, "learning_rate": 9.91271180834581e-06, "loss": 0.4646, "num_input_tokens_seen": 29809792, "step": 24520 }, { "epoch": 3.0729231925823832, "grad_norm": 0.12151265144348145, "learning_rate": 9.912610068885696e-06, "loss": 0.4606, "num_input_tokens_seen": 29815936, "step": 24525 }, { "epoch": 3.0735496804911664, "grad_norm": 0.1984081268310547, "learning_rate": 9.912508270691198e-06, "loss": 0.4636, "num_input_tokens_seen": 29822176, "step": 24530 }, { "epoch": 3.07417616839995, "grad_norm": 0.13270407915115356, "learning_rate": 9.912406413763533e-06, "loss": 0.4591, "num_input_tokens_seen": 29828064, "step": 24535 }, { "epoch": 3.074802656308733, "grad_norm": 0.14510534703731537, "learning_rate": 9.912304498103919e-06, "loss": 0.4636, "num_input_tokens_seen": 29833984, "step": 24540 }, { "epoch": 3.075429144217517, "grad_norm": 0.18708355724811554, "learning_rate": 9.912202523713575e-06, "loss": 0.4592, "num_input_tokens_seen": 29840288, "step": 24545 }, { "epoch": 3.0760556321263, "grad_norm": 0.6577258110046387, "learning_rate": 9.912100490593722e-06, "loss": 0.4521, "num_input_tokens_seen": 29846048, "step": 24550 }, { "epoch": 3.076682120035083, "grad_norm": 0.5906305909156799, "learning_rate": 9.911998398745575e-06, "loss": 0.4733, "num_input_tokens_seen": 29851680, "step": 24555 }, { "epoch": 3.0773086079438667, "grad_norm": 0.2743186056613922, "learning_rate": 9.911896248170357e-06, "loss": 0.4536, "num_input_tokens_seen": 29857824, "step": 24560 }, { "epoch": 3.07793509585265, "grad_norm": 0.3348071277141571, "learning_rate": 9.91179403886929e-06, "loss": 0.4521, "num_input_tokens_seen": 29863648, "step": 24565 }, { "epoch": 3.0785615837614335, "grad_norm": 0.10561323910951614, "learning_rate": 9.911691770843598e-06, "loss": 0.4839, "num_input_tokens_seen": 29869728, "step": 24570 }, { "epoch": 3.0791880716702167, "grad_norm": 0.12249670177698135, "learning_rate": 9.911589444094499e-06, "loss": 0.4659, "num_input_tokens_seen": 29876128, "step": 24575 }, { "epoch": 3.0798145595790003, "grad_norm": 0.1373286098241806, "learning_rate": 9.911487058623219e-06, "loss": 0.466, "num_input_tokens_seen": 29881696, "step": 24580 }, { "epoch": 3.0804410474877835, "grad_norm": 0.03752009943127632, "learning_rate": 9.911384614430984e-06, "loss": 0.4675, "num_input_tokens_seen": 29887968, "step": 24585 }, { "epoch": 3.0810675353965666, "grad_norm": 0.09063538163900375, "learning_rate": 9.911282111519015e-06, "loss": 0.4636, "num_input_tokens_seen": 29894080, "step": 24590 }, { "epoch": 3.0816940233053502, "grad_norm": 0.09572752565145493, "learning_rate": 9.91117954988854e-06, "loss": 0.4631, "num_input_tokens_seen": 29900192, "step": 24595 }, { "epoch": 3.0823205112141334, "grad_norm": 0.16748900711536407, "learning_rate": 9.911076929540783e-06, "loss": 0.4626, "num_input_tokens_seen": 29906048, "step": 24600 }, { "epoch": 3.082946999122917, "grad_norm": 0.1634751558303833, "learning_rate": 9.910974250476973e-06, "loss": 0.4663, "num_input_tokens_seen": 29912000, "step": 24605 }, { "epoch": 3.0835734870317, "grad_norm": 0.08659029752016068, "learning_rate": 9.910871512698337e-06, "loss": 0.4689, "num_input_tokens_seen": 29918208, "step": 24610 }, { "epoch": 3.084199974940484, "grad_norm": 0.1396142542362213, "learning_rate": 9.910768716206102e-06, "loss": 0.4603, "num_input_tokens_seen": 29924512, "step": 24615 }, { "epoch": 3.084826462849267, "grad_norm": 0.08251733332872391, "learning_rate": 9.9106658610015e-06, "loss": 0.4583, "num_input_tokens_seen": 29930752, "step": 24620 }, { "epoch": 3.08545295075805, "grad_norm": 0.16200929880142212, "learning_rate": 9.910562947085759e-06, "loss": 0.4594, "num_input_tokens_seen": 29937056, "step": 24625 }, { "epoch": 3.0860794386668338, "grad_norm": 0.09333450347185135, "learning_rate": 9.910459974460107e-06, "loss": 0.468, "num_input_tokens_seen": 29942912, "step": 24630 }, { "epoch": 3.086705926575617, "grad_norm": 0.10757079720497131, "learning_rate": 9.910356943125781e-06, "loss": 0.4562, "num_input_tokens_seen": 29949248, "step": 24635 }, { "epoch": 3.0873324144844005, "grad_norm": 0.1115892231464386, "learning_rate": 9.910253853084007e-06, "loss": 0.4595, "num_input_tokens_seen": 29955552, "step": 24640 }, { "epoch": 3.0879589023931837, "grad_norm": 0.03797109052538872, "learning_rate": 9.91015070433602e-06, "loss": 0.4612, "num_input_tokens_seen": 29961664, "step": 24645 }, { "epoch": 3.0885853903019673, "grad_norm": 0.08937133103609085, "learning_rate": 9.910047496883053e-06, "loss": 0.4598, "num_input_tokens_seen": 29967776, "step": 24650 }, { "epoch": 3.0892118782107505, "grad_norm": 0.09341650456190109, "learning_rate": 9.90994423072634e-06, "loss": 0.4645, "num_input_tokens_seen": 29973856, "step": 24655 }, { "epoch": 3.089838366119534, "grad_norm": 0.1011814996600151, "learning_rate": 9.909840905867118e-06, "loss": 0.465, "num_input_tokens_seen": 29979744, "step": 24660 }, { "epoch": 3.0904648540283173, "grad_norm": 0.036917075514793396, "learning_rate": 9.909737522306618e-06, "loss": 0.4679, "num_input_tokens_seen": 29985824, "step": 24665 }, { "epoch": 3.0910913419371004, "grad_norm": 0.09844180941581726, "learning_rate": 9.909634080046078e-06, "loss": 0.4589, "num_input_tokens_seen": 29991936, "step": 24670 }, { "epoch": 3.091717829845884, "grad_norm": 0.08096276223659515, "learning_rate": 9.909530579086735e-06, "loss": 0.4621, "num_input_tokens_seen": 29998144, "step": 24675 }, { "epoch": 3.0923443177546672, "grad_norm": 0.10323913395404816, "learning_rate": 9.909427019429827e-06, "loss": 0.4643, "num_input_tokens_seen": 30004384, "step": 24680 }, { "epoch": 3.092970805663451, "grad_norm": 0.08463029563426971, "learning_rate": 9.90932340107659e-06, "loss": 0.4698, "num_input_tokens_seen": 30010656, "step": 24685 }, { "epoch": 3.093597293572234, "grad_norm": 0.08562961220741272, "learning_rate": 9.909219724028265e-06, "loss": 0.4645, "num_input_tokens_seen": 30016992, "step": 24690 }, { "epoch": 3.0942237814810176, "grad_norm": 0.08423151075839996, "learning_rate": 9.909115988286092e-06, "loss": 0.4603, "num_input_tokens_seen": 30022816, "step": 24695 }, { "epoch": 3.094850269389801, "grad_norm": 0.08755970001220703, "learning_rate": 9.909012193851307e-06, "loss": 0.4625, "num_input_tokens_seen": 30029152, "step": 24700 }, { "epoch": 3.095476757298584, "grad_norm": 0.09198768436908722, "learning_rate": 9.908908340725157e-06, "loss": 0.467, "num_input_tokens_seen": 30035040, "step": 24705 }, { "epoch": 3.0961032452073676, "grad_norm": 0.10554520785808563, "learning_rate": 9.908804428908878e-06, "loss": 0.4592, "num_input_tokens_seen": 30041088, "step": 24710 }, { "epoch": 3.0967297331161507, "grad_norm": 0.11661221086978912, "learning_rate": 9.908700458403717e-06, "loss": 0.4603, "num_input_tokens_seen": 30047328, "step": 24715 }, { "epoch": 3.0973562210249344, "grad_norm": 0.18793249130249023, "learning_rate": 9.908596429210913e-06, "loss": 0.4632, "num_input_tokens_seen": 30053472, "step": 24720 }, { "epoch": 3.0979827089337175, "grad_norm": 0.10789866000413895, "learning_rate": 9.908492341331714e-06, "loss": 0.4653, "num_input_tokens_seen": 30059360, "step": 24725 }, { "epoch": 3.098609196842501, "grad_norm": 0.14580388367176056, "learning_rate": 9.90838819476736e-06, "loss": 0.4669, "num_input_tokens_seen": 30065440, "step": 24730 }, { "epoch": 3.0992356847512843, "grad_norm": 0.11054300516843796, "learning_rate": 9.908283989519102e-06, "loss": 0.4632, "num_input_tokens_seen": 30071936, "step": 24735 }, { "epoch": 3.0998621726600675, "grad_norm": 0.09917598962783813, "learning_rate": 9.908179725588179e-06, "loss": 0.4587, "num_input_tokens_seen": 30077888, "step": 24740 }, { "epoch": 3.100488660568851, "grad_norm": 0.10656709223985672, "learning_rate": 9.90807540297584e-06, "loss": 0.46, "num_input_tokens_seen": 30084000, "step": 24745 }, { "epoch": 3.1011151484776343, "grad_norm": 0.14478947222232819, "learning_rate": 9.907971021683335e-06, "loss": 0.4661, "num_input_tokens_seen": 30090272, "step": 24750 }, { "epoch": 3.101741636386418, "grad_norm": 0.09241843223571777, "learning_rate": 9.907866581711909e-06, "loss": 0.4534, "num_input_tokens_seen": 30096544, "step": 24755 }, { "epoch": 3.102368124295201, "grad_norm": 0.19514645636081696, "learning_rate": 9.907762083062813e-06, "loss": 0.4605, "num_input_tokens_seen": 30102240, "step": 24760 }, { "epoch": 3.1029946122039846, "grad_norm": 0.11944673210382462, "learning_rate": 9.907657525737295e-06, "loss": 0.4579, "num_input_tokens_seen": 30108576, "step": 24765 }, { "epoch": 3.103621100112768, "grad_norm": 0.11842481791973114, "learning_rate": 9.907552909736604e-06, "loss": 0.461, "num_input_tokens_seen": 30114848, "step": 24770 }, { "epoch": 3.104247588021551, "grad_norm": 0.09114132821559906, "learning_rate": 9.907448235061993e-06, "loss": 0.4579, "num_input_tokens_seen": 30120672, "step": 24775 }, { "epoch": 3.1048740759303346, "grad_norm": 0.14495013654232025, "learning_rate": 9.907343501714713e-06, "loss": 0.4517, "num_input_tokens_seen": 30126688, "step": 24780 }, { "epoch": 3.1055005638391178, "grad_norm": 0.05252288281917572, "learning_rate": 9.907238709696012e-06, "loss": 0.4585, "num_input_tokens_seen": 30132800, "step": 24785 }, { "epoch": 3.1061270517479014, "grad_norm": 0.0563511922955513, "learning_rate": 9.90713385900715e-06, "loss": 0.4573, "num_input_tokens_seen": 30139136, "step": 24790 }, { "epoch": 3.1067535396566845, "grad_norm": 0.17310969531536102, "learning_rate": 9.907028949649376e-06, "loss": 0.453, "num_input_tokens_seen": 30145408, "step": 24795 }, { "epoch": 3.107380027565468, "grad_norm": 0.12588199973106384, "learning_rate": 9.906923981623946e-06, "loss": 0.4607, "num_input_tokens_seen": 30151616, "step": 24800 }, { "epoch": 3.1080065154742513, "grad_norm": 0.13978351652622223, "learning_rate": 9.906818954932113e-06, "loss": 0.4534, "num_input_tokens_seen": 30157760, "step": 24805 }, { "epoch": 3.1086330033830345, "grad_norm": 0.15683263540267944, "learning_rate": 9.906713869575135e-06, "loss": 0.4614, "num_input_tokens_seen": 30163904, "step": 24810 }, { "epoch": 3.109259491291818, "grad_norm": 0.11380700767040253, "learning_rate": 9.906608725554268e-06, "loss": 0.4586, "num_input_tokens_seen": 30170144, "step": 24815 }, { "epoch": 3.1098859792006013, "grad_norm": 0.07291727513074875, "learning_rate": 9.906503522870766e-06, "loss": 0.468, "num_input_tokens_seen": 30176416, "step": 24820 }, { "epoch": 3.110512467109385, "grad_norm": 0.16364553570747375, "learning_rate": 9.906398261525891e-06, "loss": 0.4628, "num_input_tokens_seen": 30182688, "step": 24825 }, { "epoch": 3.111138955018168, "grad_norm": 0.1797892451286316, "learning_rate": 9.906292941520899e-06, "loss": 0.4595, "num_input_tokens_seen": 30189120, "step": 24830 }, { "epoch": 3.1117654429269517, "grad_norm": 0.1516513079404831, "learning_rate": 9.90618756285705e-06, "loss": 0.4727, "num_input_tokens_seen": 30195200, "step": 24835 }, { "epoch": 3.112391930835735, "grad_norm": 0.20443867146968842, "learning_rate": 9.906082125535605e-06, "loss": 0.47, "num_input_tokens_seen": 30201184, "step": 24840 }, { "epoch": 3.1130184187445185, "grad_norm": 0.10584520548582077, "learning_rate": 9.905976629557822e-06, "loss": 0.4659, "num_input_tokens_seen": 30207104, "step": 24845 }, { "epoch": 3.1136449066533016, "grad_norm": 0.14513267576694489, "learning_rate": 9.905871074924964e-06, "loss": 0.4547, "num_input_tokens_seen": 30213312, "step": 24850 }, { "epoch": 3.114271394562085, "grad_norm": 0.10186506062746048, "learning_rate": 9.905765461638293e-06, "loss": 0.4536, "num_input_tokens_seen": 30219456, "step": 24855 }, { "epoch": 3.1148978824708684, "grad_norm": 0.09472600370645523, "learning_rate": 9.90565978969907e-06, "loss": 0.4586, "num_input_tokens_seen": 30225824, "step": 24860 }, { "epoch": 3.1155243703796516, "grad_norm": 0.10442767292261124, "learning_rate": 9.905554059108562e-06, "loss": 0.4805, "num_input_tokens_seen": 30232032, "step": 24865 }, { "epoch": 3.116150858288435, "grad_norm": 0.1102103590965271, "learning_rate": 9.905448269868029e-06, "loss": 0.46, "num_input_tokens_seen": 30238368, "step": 24870 }, { "epoch": 3.1167773461972184, "grad_norm": 0.17375953495502472, "learning_rate": 9.90534242197874e-06, "loss": 0.4707, "num_input_tokens_seen": 30244608, "step": 24875 }, { "epoch": 3.1174038341060015, "grad_norm": 0.1029336079955101, "learning_rate": 9.905236515441954e-06, "loss": 0.4593, "num_input_tokens_seen": 30250816, "step": 24880 }, { "epoch": 3.118030322014785, "grad_norm": 0.14230166375637054, "learning_rate": 9.905130550258945e-06, "loss": 0.4691, "num_input_tokens_seen": 30256960, "step": 24885 }, { "epoch": 3.1186568099235683, "grad_norm": 0.09910426288843155, "learning_rate": 9.905024526430975e-06, "loss": 0.4615, "num_input_tokens_seen": 30263008, "step": 24890 }, { "epoch": 3.119283297832352, "grad_norm": 0.09007154405117035, "learning_rate": 9.904918443959312e-06, "loss": 0.4567, "num_input_tokens_seen": 30268960, "step": 24895 }, { "epoch": 3.119909785741135, "grad_norm": 0.1636812537908554, "learning_rate": 9.904812302845227e-06, "loss": 0.4619, "num_input_tokens_seen": 30275136, "step": 24900 }, { "epoch": 3.1205362736499187, "grad_norm": 0.11371272802352905, "learning_rate": 9.904706103089986e-06, "loss": 0.4656, "num_input_tokens_seen": 30280928, "step": 24905 }, { "epoch": 3.121162761558702, "grad_norm": 0.08953709155321121, "learning_rate": 9.904599844694859e-06, "loss": 0.4527, "num_input_tokens_seen": 30287136, "step": 24910 }, { "epoch": 3.1217892494674855, "grad_norm": 0.041909437626600266, "learning_rate": 9.90449352766112e-06, "loss": 0.4641, "num_input_tokens_seen": 30293408, "step": 24915 }, { "epoch": 3.1224157373762687, "grad_norm": 0.09394429624080658, "learning_rate": 9.904387151990036e-06, "loss": 0.4647, "num_input_tokens_seen": 30299616, "step": 24920 }, { "epoch": 3.123042225285052, "grad_norm": 0.10477962344884872, "learning_rate": 9.904280717682881e-06, "loss": 0.4601, "num_input_tokens_seen": 30306080, "step": 24925 }, { "epoch": 3.1236687131938354, "grad_norm": 0.1819547414779663, "learning_rate": 9.904174224740926e-06, "loss": 0.4667, "num_input_tokens_seen": 30312256, "step": 24930 }, { "epoch": 3.1242952011026186, "grad_norm": 0.12315447628498077, "learning_rate": 9.904067673165444e-06, "loss": 0.4696, "num_input_tokens_seen": 30318464, "step": 24935 }, { "epoch": 3.124921689011402, "grad_norm": 0.1813105344772339, "learning_rate": 9.90396106295771e-06, "loss": 0.4612, "num_input_tokens_seen": 30324640, "step": 24940 }, { "epoch": 3.1255481769201854, "grad_norm": 0.16613233089447021, "learning_rate": 9.903854394118998e-06, "loss": 0.4532, "num_input_tokens_seen": 30330464, "step": 24945 }, { "epoch": 3.126174664828969, "grad_norm": 0.1737840175628662, "learning_rate": 9.903747666650588e-06, "loss": 0.4612, "num_input_tokens_seen": 30336608, "step": 24950 }, { "epoch": 3.126801152737752, "grad_norm": 0.13835059106349945, "learning_rate": 9.903640880553747e-06, "loss": 0.4661, "num_input_tokens_seen": 30343168, "step": 24955 }, { "epoch": 3.1274276406465353, "grad_norm": 0.1385970115661621, "learning_rate": 9.903534035829759e-06, "loss": 0.4521, "num_input_tokens_seen": 30348928, "step": 24960 }, { "epoch": 3.128054128555319, "grad_norm": 0.105280302464962, "learning_rate": 9.903427132479896e-06, "loss": 0.4557, "num_input_tokens_seen": 30355008, "step": 24965 }, { "epoch": 3.128680616464102, "grad_norm": 0.21110723912715912, "learning_rate": 9.903320170505444e-06, "loss": 0.4648, "num_input_tokens_seen": 30361056, "step": 24970 }, { "epoch": 3.1293071043728857, "grad_norm": 0.12410356104373932, "learning_rate": 9.903213149907673e-06, "loss": 0.4496, "num_input_tokens_seen": 30366912, "step": 24975 }, { "epoch": 3.129933592281669, "grad_norm": 0.18154527246952057, "learning_rate": 9.903106070687867e-06, "loss": 0.4655, "num_input_tokens_seen": 30373280, "step": 24980 }, { "epoch": 3.1305600801904525, "grad_norm": 0.26807165145874023, "learning_rate": 9.902998932847308e-06, "loss": 0.4624, "num_input_tokens_seen": 30379584, "step": 24985 }, { "epoch": 3.1311865680992357, "grad_norm": 0.050621528178453445, "learning_rate": 9.902891736387273e-06, "loss": 0.4626, "num_input_tokens_seen": 30385856, "step": 24990 }, { "epoch": 3.131813056008019, "grad_norm": 0.10799911618232727, "learning_rate": 9.902784481309043e-06, "loss": 0.463, "num_input_tokens_seen": 30392032, "step": 24995 }, { "epoch": 3.1324395439168025, "grad_norm": 0.12945495545864105, "learning_rate": 9.902677167613905e-06, "loss": 0.4624, "num_input_tokens_seen": 30397824, "step": 25000 }, { "epoch": 3.1330660318255856, "grad_norm": 0.10979510843753815, "learning_rate": 9.902569795303138e-06, "loss": 0.4525, "num_input_tokens_seen": 30404000, "step": 25005 }, { "epoch": 3.1336925197343692, "grad_norm": 0.1070290207862854, "learning_rate": 9.902462364378029e-06, "loss": 0.4616, "num_input_tokens_seen": 30409984, "step": 25010 }, { "epoch": 3.1343190076431524, "grad_norm": 0.17204777896404266, "learning_rate": 9.90235487483986e-06, "loss": 0.4708, "num_input_tokens_seen": 30415872, "step": 25015 }, { "epoch": 3.134945495551936, "grad_norm": 0.11655440181493759, "learning_rate": 9.902247326689915e-06, "loss": 0.4572, "num_input_tokens_seen": 30421568, "step": 25020 }, { "epoch": 3.135571983460719, "grad_norm": 0.1187014952301979, "learning_rate": 9.902139719929485e-06, "loss": 0.4586, "num_input_tokens_seen": 30427520, "step": 25025 }, { "epoch": 3.1361984713695024, "grad_norm": 0.09219358116388321, "learning_rate": 9.902032054559852e-06, "loss": 0.4794, "num_input_tokens_seen": 30433504, "step": 25030 }, { "epoch": 3.136824959278286, "grad_norm": 0.135555237531662, "learning_rate": 9.901924330582302e-06, "loss": 0.4563, "num_input_tokens_seen": 30439232, "step": 25035 }, { "epoch": 3.137451447187069, "grad_norm": 0.037989165633916855, "learning_rate": 9.901816547998127e-06, "loss": 0.4564, "num_input_tokens_seen": 30445152, "step": 25040 }, { "epoch": 3.1380779350958528, "grad_norm": 0.09243132919073105, "learning_rate": 9.901708706808615e-06, "loss": 0.4635, "num_input_tokens_seen": 30451424, "step": 25045 }, { "epoch": 3.138704423004636, "grad_norm": 0.10728944838047028, "learning_rate": 9.901600807015052e-06, "loss": 0.4537, "num_input_tokens_seen": 30456384, "step": 25050 }, { "epoch": 3.1393309109134195, "grad_norm": 0.0876428484916687, "learning_rate": 9.901492848618732e-06, "loss": 0.4624, "num_input_tokens_seen": 30462432, "step": 25055 }, { "epoch": 3.1399573988222027, "grad_norm": 0.09227535873651505, "learning_rate": 9.901384831620943e-06, "loss": 0.4533, "num_input_tokens_seen": 30468288, "step": 25060 }, { "epoch": 3.140583886730986, "grad_norm": 0.08486435562372208, "learning_rate": 9.90127675602298e-06, "loss": 0.4601, "num_input_tokens_seen": 30474432, "step": 25065 }, { "epoch": 3.1412103746397695, "grad_norm": 0.15377353131771088, "learning_rate": 9.90116862182613e-06, "loss": 0.4644, "num_input_tokens_seen": 30480736, "step": 25070 }, { "epoch": 3.1418368625485527, "grad_norm": 0.12177964299917221, "learning_rate": 9.90106042903169e-06, "loss": 0.4623, "num_input_tokens_seen": 30486944, "step": 25075 }, { "epoch": 3.1424633504573363, "grad_norm": 0.11465953290462494, "learning_rate": 9.90095217764095e-06, "loss": 0.4752, "num_input_tokens_seen": 30493088, "step": 25080 }, { "epoch": 3.1430898383661194, "grad_norm": 0.1510157287120819, "learning_rate": 9.900843867655207e-06, "loss": 0.4579, "num_input_tokens_seen": 30499360, "step": 25085 }, { "epoch": 3.143716326274903, "grad_norm": 0.08798765391111374, "learning_rate": 9.900735499075757e-06, "loss": 0.459, "num_input_tokens_seen": 30505536, "step": 25090 }, { "epoch": 3.144342814183686, "grad_norm": 0.09094754606485367, "learning_rate": 9.900627071903892e-06, "loss": 0.4639, "num_input_tokens_seen": 30511904, "step": 25095 }, { "epoch": 3.14496930209247, "grad_norm": 0.04056521877646446, "learning_rate": 9.90051858614091e-06, "loss": 0.4608, "num_input_tokens_seen": 30518016, "step": 25100 }, { "epoch": 3.145595790001253, "grad_norm": 0.1101459488272667, "learning_rate": 9.90041004178811e-06, "loss": 0.4726, "num_input_tokens_seen": 30524320, "step": 25105 }, { "epoch": 3.146222277910036, "grad_norm": 0.09805832803249359, "learning_rate": 9.900301438846786e-06, "loss": 0.4671, "num_input_tokens_seen": 30530560, "step": 25110 }, { "epoch": 3.14684876581882, "grad_norm": 0.08700515329837799, "learning_rate": 9.900192777318238e-06, "loss": 0.4637, "num_input_tokens_seen": 30536544, "step": 25115 }, { "epoch": 3.147475253727603, "grad_norm": 0.10203808546066284, "learning_rate": 9.900084057203767e-06, "loss": 0.4632, "num_input_tokens_seen": 30542432, "step": 25120 }, { "epoch": 3.1481017416363866, "grad_norm": 0.10941405594348907, "learning_rate": 9.89997527850467e-06, "loss": 0.4622, "num_input_tokens_seen": 30548544, "step": 25125 }, { "epoch": 3.1487282295451697, "grad_norm": 0.03632061928510666, "learning_rate": 9.899866441222251e-06, "loss": 0.4637, "num_input_tokens_seen": 30554432, "step": 25130 }, { "epoch": 3.1493547174539533, "grad_norm": 0.08050843328237534, "learning_rate": 9.899757545357806e-06, "loss": 0.4623, "num_input_tokens_seen": 30560320, "step": 25135 }, { "epoch": 3.1499812053627365, "grad_norm": 0.07662758231163025, "learning_rate": 9.899648590912643e-06, "loss": 0.456, "num_input_tokens_seen": 30566528, "step": 25140 }, { "epoch": 3.1506076932715197, "grad_norm": 0.10515625774860382, "learning_rate": 9.89953957788806e-06, "loss": 0.4526, "num_input_tokens_seen": 30572576, "step": 25145 }, { "epoch": 3.1512341811803033, "grad_norm": 0.12079267203807831, "learning_rate": 9.899430506285363e-06, "loss": 0.4664, "num_input_tokens_seen": 30578784, "step": 25150 }, { "epoch": 3.1518606690890865, "grad_norm": 0.08066827803850174, "learning_rate": 9.899321376105855e-06, "loss": 0.4647, "num_input_tokens_seen": 30584288, "step": 25155 }, { "epoch": 3.15248715699787, "grad_norm": 0.0830063745379448, "learning_rate": 9.89921218735084e-06, "loss": 0.4589, "num_input_tokens_seen": 30590624, "step": 25160 }, { "epoch": 3.1531136449066532, "grad_norm": 0.08255507797002792, "learning_rate": 9.899102940021624e-06, "loss": 0.4661, "num_input_tokens_seen": 30596480, "step": 25165 }, { "epoch": 3.153740132815437, "grad_norm": 0.08576882630586624, "learning_rate": 9.898993634119515e-06, "loss": 0.4599, "num_input_tokens_seen": 30602880, "step": 25170 }, { "epoch": 3.15436662072422, "grad_norm": 0.08842582255601883, "learning_rate": 9.898884269645816e-06, "loss": 0.4578, "num_input_tokens_seen": 30609152, "step": 25175 }, { "epoch": 3.154993108633003, "grad_norm": 0.07822653651237488, "learning_rate": 9.89877484660184e-06, "loss": 0.461, "num_input_tokens_seen": 30615648, "step": 25180 }, { "epoch": 3.155619596541787, "grad_norm": 0.15507592260837555, "learning_rate": 9.898665364988889e-06, "loss": 0.4634, "num_input_tokens_seen": 30622080, "step": 25185 }, { "epoch": 3.15624608445057, "grad_norm": 0.03493048623204231, "learning_rate": 9.898555824808277e-06, "loss": 0.4706, "num_input_tokens_seen": 30628320, "step": 25190 }, { "epoch": 3.1568725723593536, "grad_norm": 0.14965952932834625, "learning_rate": 9.89844622606131e-06, "loss": 0.4628, "num_input_tokens_seen": 30634496, "step": 25195 }, { "epoch": 3.1574990602681368, "grad_norm": 0.03947697579860687, "learning_rate": 9.898336568749302e-06, "loss": 0.4623, "num_input_tokens_seen": 30640512, "step": 25200 }, { "epoch": 3.1581255481769204, "grad_norm": 0.14789429306983948, "learning_rate": 9.89822685287356e-06, "loss": 0.4666, "num_input_tokens_seen": 30646624, "step": 25205 }, { "epoch": 3.1587520360857035, "grad_norm": 0.13306303322315216, "learning_rate": 9.898117078435399e-06, "loss": 0.4641, "num_input_tokens_seen": 30652096, "step": 25210 }, { "epoch": 3.1593785239944867, "grad_norm": 0.03594023734331131, "learning_rate": 9.89800724543613e-06, "loss": 0.4564, "num_input_tokens_seen": 30658272, "step": 25215 }, { "epoch": 3.1600050119032703, "grad_norm": 0.09233448654413223, "learning_rate": 9.897897353877066e-06, "loss": 0.4609, "num_input_tokens_seen": 30664384, "step": 25220 }, { "epoch": 3.1606314998120535, "grad_norm": 0.15924392640590668, "learning_rate": 9.897787403759521e-06, "loss": 0.462, "num_input_tokens_seen": 30670848, "step": 25225 }, { "epoch": 3.161257987720837, "grad_norm": 0.09576819837093353, "learning_rate": 9.89767739508481e-06, "loss": 0.4606, "num_input_tokens_seen": 30677056, "step": 25230 }, { "epoch": 3.1618844756296203, "grad_norm": 0.04412512108683586, "learning_rate": 9.89756732785425e-06, "loss": 0.4552, "num_input_tokens_seen": 30682944, "step": 25235 }, { "epoch": 3.162510963538404, "grad_norm": 0.08925393968820572, "learning_rate": 9.897457202069151e-06, "loss": 0.4671, "num_input_tokens_seen": 30689184, "step": 25240 }, { "epoch": 3.163137451447187, "grad_norm": 0.1405973732471466, "learning_rate": 9.897347017730837e-06, "loss": 0.4629, "num_input_tokens_seen": 30695360, "step": 25245 }, { "epoch": 3.1637639393559702, "grad_norm": 0.14878274500370026, "learning_rate": 9.89723677484062e-06, "loss": 0.462, "num_input_tokens_seen": 30701632, "step": 25250 }, { "epoch": 3.164390427264754, "grad_norm": 0.08818748593330383, "learning_rate": 9.897126473399821e-06, "loss": 0.4624, "num_input_tokens_seen": 30707776, "step": 25255 }, { "epoch": 3.165016915173537, "grad_norm": 0.08323461562395096, "learning_rate": 9.897016113409757e-06, "loss": 0.4644, "num_input_tokens_seen": 30713888, "step": 25260 }, { "epoch": 3.1656434030823206, "grad_norm": 0.08297938108444214, "learning_rate": 9.896905694871749e-06, "loss": 0.4546, "num_input_tokens_seen": 30719968, "step": 25265 }, { "epoch": 3.166269890991104, "grad_norm": 0.1558241844177246, "learning_rate": 9.896795217787117e-06, "loss": 0.4647, "num_input_tokens_seen": 30725984, "step": 25270 }, { "epoch": 3.1668963788998874, "grad_norm": 0.08872334659099579, "learning_rate": 9.896684682157179e-06, "loss": 0.4647, "num_input_tokens_seen": 30732160, "step": 25275 }, { "epoch": 3.1675228668086706, "grad_norm": 0.08450298756361008, "learning_rate": 9.896574087983259e-06, "loss": 0.4556, "num_input_tokens_seen": 30737952, "step": 25280 }, { "epoch": 3.168149354717454, "grad_norm": 0.0940500795841217, "learning_rate": 9.89646343526668e-06, "loss": 0.4655, "num_input_tokens_seen": 30744128, "step": 25285 }, { "epoch": 3.1687758426262373, "grad_norm": 0.041833583265542984, "learning_rate": 9.896352724008765e-06, "loss": 0.4607, "num_input_tokens_seen": 30750112, "step": 25290 }, { "epoch": 3.1694023305350205, "grad_norm": 0.08587004244327545, "learning_rate": 9.896241954210834e-06, "loss": 0.4602, "num_input_tokens_seen": 30756096, "step": 25295 }, { "epoch": 3.170028818443804, "grad_norm": 0.12030580639839172, "learning_rate": 9.896131125874215e-06, "loss": 0.4617, "num_input_tokens_seen": 30762144, "step": 25300 }, { "epoch": 3.1706553063525873, "grad_norm": 0.09485651552677155, "learning_rate": 9.896020239000232e-06, "loss": 0.4653, "num_input_tokens_seen": 30768224, "step": 25305 }, { "epoch": 3.171281794261371, "grad_norm": 0.08643781393766403, "learning_rate": 9.895909293590211e-06, "loss": 0.4632, "num_input_tokens_seen": 30774368, "step": 25310 }, { "epoch": 3.171908282170154, "grad_norm": 0.14530499279499054, "learning_rate": 9.895798289645477e-06, "loss": 0.4613, "num_input_tokens_seen": 30780480, "step": 25315 }, { "epoch": 3.1725347700789372, "grad_norm": 0.23116280138492584, "learning_rate": 9.89568722716736e-06, "loss": 0.4631, "num_input_tokens_seen": 30786368, "step": 25320 }, { "epoch": 3.173161257987721, "grad_norm": 0.09028445184230804, "learning_rate": 9.895576106157186e-06, "loss": 0.4636, "num_input_tokens_seen": 30792960, "step": 25325 }, { "epoch": 3.173787745896504, "grad_norm": 0.13720938563346863, "learning_rate": 9.895464926616282e-06, "loss": 0.4607, "num_input_tokens_seen": 30799296, "step": 25330 }, { "epoch": 3.1744142338052876, "grad_norm": 0.10027632862329483, "learning_rate": 9.895353688545981e-06, "loss": 0.4641, "num_input_tokens_seen": 30805408, "step": 25335 }, { "epoch": 3.175040721714071, "grad_norm": 0.09876806288957596, "learning_rate": 9.89524239194761e-06, "loss": 0.4659, "num_input_tokens_seen": 30811488, "step": 25340 }, { "epoch": 3.1756672096228544, "grad_norm": 0.07763873040676117, "learning_rate": 9.8951310368225e-06, "loss": 0.4658, "num_input_tokens_seen": 30817824, "step": 25345 }, { "epoch": 3.1762936975316376, "grad_norm": 0.09641645848751068, "learning_rate": 9.895019623171983e-06, "loss": 0.4549, "num_input_tokens_seen": 30823936, "step": 25350 }, { "epoch": 3.176920185440421, "grad_norm": 0.09430793672800064, "learning_rate": 9.894908150997391e-06, "loss": 0.4626, "num_input_tokens_seen": 30829920, "step": 25355 }, { "epoch": 3.1775466733492044, "grad_norm": 0.10721098631620407, "learning_rate": 9.894796620300057e-06, "loss": 0.4646, "num_input_tokens_seen": 30836416, "step": 25360 }, { "epoch": 3.1781731612579875, "grad_norm": 0.08142521232366562, "learning_rate": 9.894685031081313e-06, "loss": 0.4642, "num_input_tokens_seen": 30842688, "step": 25365 }, { "epoch": 3.178799649166771, "grad_norm": 0.15383200347423553, "learning_rate": 9.894573383342495e-06, "loss": 0.4608, "num_input_tokens_seen": 30848576, "step": 25370 }, { "epoch": 3.1794261370755543, "grad_norm": 0.1610068529844284, "learning_rate": 9.894461677084936e-06, "loss": 0.4591, "num_input_tokens_seen": 30854624, "step": 25375 }, { "epoch": 3.180052624984338, "grad_norm": 0.1096084862947464, "learning_rate": 9.894349912309975e-06, "loss": 0.4627, "num_input_tokens_seen": 30861088, "step": 25380 }, { "epoch": 3.180679112893121, "grad_norm": 0.11824101209640503, "learning_rate": 9.894238089018944e-06, "loss": 0.4637, "num_input_tokens_seen": 30867328, "step": 25385 }, { "epoch": 3.1813056008019047, "grad_norm": 0.08946478366851807, "learning_rate": 9.894126207213184e-06, "loss": 0.4609, "num_input_tokens_seen": 30873632, "step": 25390 }, { "epoch": 3.181932088710688, "grad_norm": 0.16055990755558014, "learning_rate": 9.894014266894028e-06, "loss": 0.4681, "num_input_tokens_seen": 30879616, "step": 25395 }, { "epoch": 3.182558576619471, "grad_norm": 0.1113634705543518, "learning_rate": 9.893902268062816e-06, "loss": 0.4632, "num_input_tokens_seen": 30885824, "step": 25400 }, { "epoch": 3.1831850645282547, "grad_norm": 0.08266900479793549, "learning_rate": 9.89379021072089e-06, "loss": 0.4594, "num_input_tokens_seen": 30892352, "step": 25405 }, { "epoch": 3.183811552437038, "grad_norm": 0.08183590322732925, "learning_rate": 9.893678094869587e-06, "loss": 0.4636, "num_input_tokens_seen": 30898432, "step": 25410 }, { "epoch": 3.1844380403458215, "grad_norm": 0.1280210018157959, "learning_rate": 9.893565920510249e-06, "loss": 0.462, "num_input_tokens_seen": 30904544, "step": 25415 }, { "epoch": 3.1850645282546046, "grad_norm": 0.14797545969486237, "learning_rate": 9.893453687644214e-06, "loss": 0.4594, "num_input_tokens_seen": 30910816, "step": 25420 }, { "epoch": 3.1856910161633882, "grad_norm": 0.08887200057506561, "learning_rate": 9.893341396272827e-06, "loss": 0.4639, "num_input_tokens_seen": 30916864, "step": 25425 }, { "epoch": 3.1863175040721714, "grad_norm": 0.15806573629379272, "learning_rate": 9.89322904639743e-06, "loss": 0.4537, "num_input_tokens_seen": 30923232, "step": 25430 }, { "epoch": 3.1869439919809546, "grad_norm": 0.090908482670784, "learning_rate": 9.893116638019365e-06, "loss": 0.4621, "num_input_tokens_seen": 30929248, "step": 25435 }, { "epoch": 3.187570479889738, "grad_norm": 0.09308335930109024, "learning_rate": 9.893004171139977e-06, "loss": 0.4602, "num_input_tokens_seen": 30935392, "step": 25440 }, { "epoch": 3.1881969677985214, "grad_norm": 0.0957922637462616, "learning_rate": 9.892891645760611e-06, "loss": 0.4631, "num_input_tokens_seen": 30941664, "step": 25445 }, { "epoch": 3.188823455707305, "grad_norm": 0.05105777829885483, "learning_rate": 9.892779061882611e-06, "loss": 0.4637, "num_input_tokens_seen": 30947840, "step": 25450 }, { "epoch": 3.189449943616088, "grad_norm": 0.10010260343551636, "learning_rate": 9.892666419507324e-06, "loss": 0.462, "num_input_tokens_seen": 30953568, "step": 25455 }, { "epoch": 3.1900764315248717, "grad_norm": 0.09718289226293564, "learning_rate": 9.892553718636095e-06, "loss": 0.4648, "num_input_tokens_seen": 30959584, "step": 25460 }, { "epoch": 3.190702919433655, "grad_norm": 0.09503541141748428, "learning_rate": 9.892440959270276e-06, "loss": 0.4607, "num_input_tokens_seen": 30965152, "step": 25465 }, { "epoch": 3.191329407342438, "grad_norm": 0.09019113332033157, "learning_rate": 9.892328141411209e-06, "loss": 0.4598, "num_input_tokens_seen": 30971392, "step": 25470 }, { "epoch": 3.1919558952512217, "grad_norm": 0.13284888863563538, "learning_rate": 9.892215265060246e-06, "loss": 0.4592, "num_input_tokens_seen": 30977504, "step": 25475 }, { "epoch": 3.192582383160005, "grad_norm": 0.16357877850532532, "learning_rate": 9.892102330218737e-06, "loss": 0.4645, "num_input_tokens_seen": 30983936, "step": 25480 }, { "epoch": 3.1932088710687885, "grad_norm": 0.09151078015565872, "learning_rate": 9.891989336888033e-06, "loss": 0.4655, "num_input_tokens_seen": 30990272, "step": 25485 }, { "epoch": 3.1938353589775716, "grad_norm": 0.1467888057231903, "learning_rate": 9.89187628506948e-06, "loss": 0.465, "num_input_tokens_seen": 30996416, "step": 25490 }, { "epoch": 3.1944618468863553, "grad_norm": 0.10936480015516281, "learning_rate": 9.891763174764437e-06, "loss": 0.4735, "num_input_tokens_seen": 31002144, "step": 25495 }, { "epoch": 3.1950883347951384, "grad_norm": 0.09886964410543442, "learning_rate": 9.89165000597425e-06, "loss": 0.4708, "num_input_tokens_seen": 31008480, "step": 25500 }, { "epoch": 3.1957148227039216, "grad_norm": 0.09628184884786606, "learning_rate": 9.891536778700277e-06, "loss": 0.4686, "num_input_tokens_seen": 31014624, "step": 25505 }, { "epoch": 3.196341310612705, "grad_norm": 0.08781208842992783, "learning_rate": 9.891423492943867e-06, "loss": 0.4653, "num_input_tokens_seen": 31020448, "step": 25510 }, { "epoch": 3.1969677985214884, "grad_norm": 0.09002922475337982, "learning_rate": 9.891310148706377e-06, "loss": 0.46, "num_input_tokens_seen": 31026688, "step": 25515 }, { "epoch": 3.197594286430272, "grad_norm": 0.07995912432670593, "learning_rate": 9.891196745989162e-06, "loss": 0.4627, "num_input_tokens_seen": 31032736, "step": 25520 }, { "epoch": 3.198220774339055, "grad_norm": 0.08901164680719376, "learning_rate": 9.891083284793578e-06, "loss": 0.4664, "num_input_tokens_seen": 31038752, "step": 25525 }, { "epoch": 3.1988472622478388, "grad_norm": 0.12840110063552856, "learning_rate": 9.890969765120981e-06, "loss": 0.4558, "num_input_tokens_seen": 31044768, "step": 25530 }, { "epoch": 3.199473750156622, "grad_norm": 0.09802860021591187, "learning_rate": 9.890856186972728e-06, "loss": 0.458, "num_input_tokens_seen": 31050912, "step": 25535 }, { "epoch": 3.2001002380654056, "grad_norm": 0.09839827567338943, "learning_rate": 9.890742550350176e-06, "loss": 0.4627, "num_input_tokens_seen": 31057056, "step": 25540 }, { "epoch": 3.2007267259741887, "grad_norm": 0.04368661344051361, "learning_rate": 9.890628855254688e-06, "loss": 0.4596, "num_input_tokens_seen": 31063360, "step": 25545 }, { "epoch": 3.201353213882972, "grad_norm": 0.11389674246311188, "learning_rate": 9.890515101687619e-06, "loss": 0.4603, "num_input_tokens_seen": 31069824, "step": 25550 }, { "epoch": 3.2019797017917555, "grad_norm": 0.1260642409324646, "learning_rate": 9.890401289650328e-06, "loss": 0.4669, "num_input_tokens_seen": 31076064, "step": 25555 }, { "epoch": 3.2026061897005387, "grad_norm": 0.09495026618242264, "learning_rate": 9.89028741914418e-06, "loss": 0.4576, "num_input_tokens_seen": 31081920, "step": 25560 }, { "epoch": 3.2032326776093223, "grad_norm": 0.11881737411022186, "learning_rate": 9.890173490170535e-06, "loss": 0.463, "num_input_tokens_seen": 31088256, "step": 25565 }, { "epoch": 3.2038591655181055, "grad_norm": 0.11116918921470642, "learning_rate": 9.890059502730753e-06, "loss": 0.4596, "num_input_tokens_seen": 31094624, "step": 25570 }, { "epoch": 3.204485653426889, "grad_norm": 0.039804376661777496, "learning_rate": 9.8899454568262e-06, "loss": 0.4555, "num_input_tokens_seen": 31100960, "step": 25575 }, { "epoch": 3.2051121413356722, "grad_norm": 0.15144263207912445, "learning_rate": 9.889831352458237e-06, "loss": 0.4679, "num_input_tokens_seen": 31107200, "step": 25580 }, { "epoch": 3.2057386292444554, "grad_norm": 0.09272675961256027, "learning_rate": 9.889717189628228e-06, "loss": 0.4597, "num_input_tokens_seen": 31113376, "step": 25585 }, { "epoch": 3.206365117153239, "grad_norm": 0.15293852984905243, "learning_rate": 9.88960296833754e-06, "loss": 0.4707, "num_input_tokens_seen": 31119072, "step": 25590 }, { "epoch": 3.206991605062022, "grad_norm": 0.11307055503129959, "learning_rate": 9.889488688587536e-06, "loss": 0.4587, "num_input_tokens_seen": 31125056, "step": 25595 }, { "epoch": 3.207618092970806, "grad_norm": 0.141157865524292, "learning_rate": 9.889374350379586e-06, "loss": 0.4608, "num_input_tokens_seen": 31131072, "step": 25600 }, { "epoch": 3.208244580879589, "grad_norm": 0.14264975488185883, "learning_rate": 9.889259953715053e-06, "loss": 0.4651, "num_input_tokens_seen": 31137024, "step": 25605 }, { "epoch": 3.2088710687883726, "grad_norm": 0.11166083067655563, "learning_rate": 9.889145498595309e-06, "loss": 0.4609, "num_input_tokens_seen": 31143168, "step": 25610 }, { "epoch": 3.2094975566971558, "grad_norm": 0.10752341896295547, "learning_rate": 9.88903098502172e-06, "loss": 0.4641, "num_input_tokens_seen": 31149152, "step": 25615 }, { "epoch": 3.210124044605939, "grad_norm": 0.09509432315826416, "learning_rate": 9.888916412995653e-06, "loss": 0.4641, "num_input_tokens_seen": 31154784, "step": 25620 }, { "epoch": 3.2107505325147225, "grad_norm": 0.12775667011737823, "learning_rate": 9.88880178251848e-06, "loss": 0.4792, "num_input_tokens_seen": 31160768, "step": 25625 }, { "epoch": 3.2113770204235057, "grad_norm": 0.08274281769990921, "learning_rate": 9.88868709359157e-06, "loss": 0.4711, "num_input_tokens_seen": 31166752, "step": 25630 }, { "epoch": 3.2120035083322893, "grad_norm": 0.13110122084617615, "learning_rate": 9.8885723462163e-06, "loss": 0.4659, "num_input_tokens_seen": 31172736, "step": 25635 }, { "epoch": 3.2126299962410725, "grad_norm": 0.09563755244016647, "learning_rate": 9.888457540394035e-06, "loss": 0.4634, "num_input_tokens_seen": 31178592, "step": 25640 }, { "epoch": 3.213256484149856, "grad_norm": 0.08487033098936081, "learning_rate": 9.88834267612615e-06, "loss": 0.4663, "num_input_tokens_seen": 31184768, "step": 25645 }, { "epoch": 3.2138829720586393, "grad_norm": 0.13244837522506714, "learning_rate": 9.88822775341402e-06, "loss": 0.4574, "num_input_tokens_seen": 31191072, "step": 25650 }, { "epoch": 3.2145094599674224, "grad_norm": 0.08553440123796463, "learning_rate": 9.888112772259015e-06, "loss": 0.4621, "num_input_tokens_seen": 31197440, "step": 25655 }, { "epoch": 3.215135947876206, "grad_norm": 0.1254957914352417, "learning_rate": 9.887997732662513e-06, "loss": 0.466, "num_input_tokens_seen": 31203456, "step": 25660 }, { "epoch": 3.215762435784989, "grad_norm": 0.08478618413209915, "learning_rate": 9.88788263462589e-06, "loss": 0.4657, "num_input_tokens_seen": 31209152, "step": 25665 }, { "epoch": 3.216388923693773, "grad_norm": 0.03857513144612312, "learning_rate": 9.887767478150519e-06, "loss": 0.4584, "num_input_tokens_seen": 31215264, "step": 25670 }, { "epoch": 3.217015411602556, "grad_norm": 0.1357773393392563, "learning_rate": 9.887652263237778e-06, "loss": 0.4657, "num_input_tokens_seen": 31221440, "step": 25675 }, { "epoch": 3.2176418995113396, "grad_norm": 0.10283933579921722, "learning_rate": 9.887536989889047e-06, "loss": 0.462, "num_input_tokens_seen": 31227616, "step": 25680 }, { "epoch": 3.218268387420123, "grad_norm": 0.09537681937217712, "learning_rate": 9.8874216581057e-06, "loss": 0.4593, "num_input_tokens_seen": 31233472, "step": 25685 }, { "epoch": 3.218894875328906, "grad_norm": 0.14756768941879272, "learning_rate": 9.887306267889119e-06, "loss": 0.4666, "num_input_tokens_seen": 31239808, "step": 25690 }, { "epoch": 3.2195213632376896, "grad_norm": 0.0807717889547348, "learning_rate": 9.887190819240682e-06, "loss": 0.4645, "num_input_tokens_seen": 31245952, "step": 25695 }, { "epoch": 3.2201478511464727, "grad_norm": 0.08938434720039368, "learning_rate": 9.887075312161769e-06, "loss": 0.4665, "num_input_tokens_seen": 31251552, "step": 25700 }, { "epoch": 3.2207743390552563, "grad_norm": 0.10437879711389542, "learning_rate": 9.886959746653764e-06, "loss": 0.4603, "num_input_tokens_seen": 31257728, "step": 25705 }, { "epoch": 3.2214008269640395, "grad_norm": 0.09164708852767944, "learning_rate": 9.886844122718044e-06, "loss": 0.4619, "num_input_tokens_seen": 31264064, "step": 25710 }, { "epoch": 3.222027314872823, "grad_norm": 0.1514325886964798, "learning_rate": 9.886728440355996e-06, "loss": 0.4607, "num_input_tokens_seen": 31270176, "step": 25715 }, { "epoch": 3.2226538027816063, "grad_norm": 0.07538611441850662, "learning_rate": 9.886612699569001e-06, "loss": 0.4574, "num_input_tokens_seen": 31276192, "step": 25720 }, { "epoch": 3.22328029069039, "grad_norm": 0.10701998323202133, "learning_rate": 9.886496900358442e-06, "loss": 0.4622, "num_input_tokens_seen": 31282112, "step": 25725 }, { "epoch": 3.223906778599173, "grad_norm": 0.08486573398113251, "learning_rate": 9.886381042725703e-06, "loss": 0.454, "num_input_tokens_seen": 31288352, "step": 25730 }, { "epoch": 3.2245332665079562, "grad_norm": 0.09756847470998764, "learning_rate": 9.886265126672173e-06, "loss": 0.4649, "num_input_tokens_seen": 31294656, "step": 25735 }, { "epoch": 3.22515975441674, "grad_norm": 0.12828665971755981, "learning_rate": 9.886149152199233e-06, "loss": 0.4576, "num_input_tokens_seen": 31300832, "step": 25740 }, { "epoch": 3.225786242325523, "grad_norm": 0.155330628156662, "learning_rate": 9.886033119308272e-06, "loss": 0.4527, "num_input_tokens_seen": 31307296, "step": 25745 }, { "epoch": 3.2264127302343066, "grad_norm": 0.15359704196453094, "learning_rate": 9.885917028000678e-06, "loss": 0.4614, "num_input_tokens_seen": 31313088, "step": 25750 }, { "epoch": 3.22703921814309, "grad_norm": 0.09588725864887238, "learning_rate": 9.885800878277837e-06, "loss": 0.4675, "num_input_tokens_seen": 31319200, "step": 25755 }, { "epoch": 3.227665706051873, "grad_norm": 0.12929342687129974, "learning_rate": 9.88568467014114e-06, "loss": 0.4599, "num_input_tokens_seen": 31325184, "step": 25760 }, { "epoch": 3.2282921939606566, "grad_norm": 0.11760628968477249, "learning_rate": 9.885568403591975e-06, "loss": 0.4688, "num_input_tokens_seen": 31331296, "step": 25765 }, { "epoch": 3.2289186818694398, "grad_norm": 0.09435093402862549, "learning_rate": 9.885452078631733e-06, "loss": 0.4551, "num_input_tokens_seen": 31337600, "step": 25770 }, { "epoch": 3.2295451697782234, "grad_norm": 0.04218556359410286, "learning_rate": 9.885335695261801e-06, "loss": 0.4557, "num_input_tokens_seen": 31343616, "step": 25775 }, { "epoch": 3.2301716576870065, "grad_norm": 0.12012999504804611, "learning_rate": 9.885219253483576e-06, "loss": 0.4683, "num_input_tokens_seen": 31349120, "step": 25780 }, { "epoch": 3.23079814559579, "grad_norm": 0.16422820091247559, "learning_rate": 9.885102753298447e-06, "loss": 0.4631, "num_input_tokens_seen": 31355072, "step": 25785 }, { "epoch": 3.2314246335045733, "grad_norm": 0.08421380817890167, "learning_rate": 9.884986194707807e-06, "loss": 0.4553, "num_input_tokens_seen": 31361024, "step": 25790 }, { "epoch": 3.232051121413357, "grad_norm": 0.11334297806024551, "learning_rate": 9.88486957771305e-06, "loss": 0.4531, "num_input_tokens_seen": 31367232, "step": 25795 }, { "epoch": 3.23267760932214, "grad_norm": 0.07858184725046158, "learning_rate": 9.88475290231557e-06, "loss": 0.4557, "num_input_tokens_seen": 31373312, "step": 25800 }, { "epoch": 3.2333040972309233, "grad_norm": 0.08234405517578125, "learning_rate": 9.884636168516763e-06, "loss": 0.4681, "num_input_tokens_seen": 31379296, "step": 25805 }, { "epoch": 3.233930585139707, "grad_norm": 0.12205907702445984, "learning_rate": 9.884519376318023e-06, "loss": 0.4526, "num_input_tokens_seen": 31385344, "step": 25810 }, { "epoch": 3.23455707304849, "grad_norm": 0.11217615753412247, "learning_rate": 9.884402525720748e-06, "loss": 0.459, "num_input_tokens_seen": 31391264, "step": 25815 }, { "epoch": 3.2351835609572737, "grad_norm": 0.04143383726477623, "learning_rate": 9.884285616726334e-06, "loss": 0.4719, "num_input_tokens_seen": 31397440, "step": 25820 }, { "epoch": 3.235810048866057, "grad_norm": 0.04592730104923248, "learning_rate": 9.88416864933618e-06, "loss": 0.4632, "num_input_tokens_seen": 31403744, "step": 25825 }, { "epoch": 3.2364365367748404, "grad_norm": 0.13957035541534424, "learning_rate": 9.88405162355168e-06, "loss": 0.468, "num_input_tokens_seen": 31409824, "step": 25830 }, { "epoch": 3.2370630246836236, "grad_norm": 0.10964582860469818, "learning_rate": 9.88393453937424e-06, "loss": 0.4555, "num_input_tokens_seen": 31415936, "step": 25835 }, { "epoch": 3.237689512592407, "grad_norm": 0.1334148645401001, "learning_rate": 9.883817396805254e-06, "loss": 0.465, "num_input_tokens_seen": 31421984, "step": 25840 }, { "epoch": 3.2383160005011904, "grad_norm": 0.08930104225873947, "learning_rate": 9.883700195846127e-06, "loss": 0.4601, "num_input_tokens_seen": 31427776, "step": 25845 }, { "epoch": 3.2389424884099736, "grad_norm": 0.08517409861087799, "learning_rate": 9.883582936498257e-06, "loss": 0.4592, "num_input_tokens_seen": 31433856, "step": 25850 }, { "epoch": 3.239568976318757, "grad_norm": 0.09937205165624619, "learning_rate": 9.883465618763047e-06, "loss": 0.4577, "num_input_tokens_seen": 31440160, "step": 25855 }, { "epoch": 3.2401954642275403, "grad_norm": 0.1090407744050026, "learning_rate": 9.883348242641899e-06, "loss": 0.4617, "num_input_tokens_seen": 31445856, "step": 25860 }, { "epoch": 3.240821952136324, "grad_norm": 0.10058687627315521, "learning_rate": 9.88323080813622e-06, "loss": 0.4628, "num_input_tokens_seen": 31452288, "step": 25865 }, { "epoch": 3.241448440045107, "grad_norm": 0.16732977330684662, "learning_rate": 9.883113315247409e-06, "loss": 0.4584, "num_input_tokens_seen": 31458272, "step": 25870 }, { "epoch": 3.2420749279538903, "grad_norm": 0.1066804751753807, "learning_rate": 9.882995763976872e-06, "loss": 0.4634, "num_input_tokens_seen": 31464384, "step": 25875 }, { "epoch": 3.242701415862674, "grad_norm": 0.1107652336359024, "learning_rate": 9.882878154326016e-06, "loss": 0.4745, "num_input_tokens_seen": 31470272, "step": 25880 }, { "epoch": 3.243327903771457, "grad_norm": 0.0442051999270916, "learning_rate": 9.882760486296249e-06, "loss": 0.4647, "num_input_tokens_seen": 31476384, "step": 25885 }, { "epoch": 3.2439543916802407, "grad_norm": 0.16283829510211945, "learning_rate": 9.882642759888972e-06, "loss": 0.4683, "num_input_tokens_seen": 31482688, "step": 25890 }, { "epoch": 3.244580879589024, "grad_norm": 0.08242897689342499, "learning_rate": 9.882524975105596e-06, "loss": 0.4619, "num_input_tokens_seen": 31488480, "step": 25895 }, { "epoch": 3.2452073674978075, "grad_norm": 0.12714186310768127, "learning_rate": 9.88240713194753e-06, "loss": 0.4612, "num_input_tokens_seen": 31494784, "step": 25900 }, { "epoch": 3.2458338554065906, "grad_norm": 0.1641698032617569, "learning_rate": 9.88228923041618e-06, "loss": 0.4577, "num_input_tokens_seen": 31501056, "step": 25905 }, { "epoch": 3.246460343315374, "grad_norm": 0.10081471502780914, "learning_rate": 9.882171270512959e-06, "loss": 0.4518, "num_input_tokens_seen": 31506560, "step": 25910 }, { "epoch": 3.2470868312241574, "grad_norm": 0.10043313354253769, "learning_rate": 9.882053252239276e-06, "loss": 0.4615, "num_input_tokens_seen": 31512640, "step": 25915 }, { "epoch": 3.2477133191329406, "grad_norm": 0.12232907116413116, "learning_rate": 9.88193517559654e-06, "loss": 0.4598, "num_input_tokens_seen": 31518784, "step": 25920 }, { "epoch": 3.248339807041724, "grad_norm": 0.0882125124335289, "learning_rate": 9.881817040586165e-06, "loss": 0.4688, "num_input_tokens_seen": 31525152, "step": 25925 }, { "epoch": 3.2489662949505074, "grad_norm": 0.04445970803499222, "learning_rate": 9.881698847209566e-06, "loss": 0.4695, "num_input_tokens_seen": 31530656, "step": 25930 }, { "epoch": 3.249592782859291, "grad_norm": 0.08354146033525467, "learning_rate": 9.88158059546815e-06, "loss": 0.4651, "num_input_tokens_seen": 31536768, "step": 25935 }, { "epoch": 3.250219270768074, "grad_norm": 0.09783037006855011, "learning_rate": 9.881462285363334e-06, "loss": 0.4661, "num_input_tokens_seen": 31542944, "step": 25940 }, { "epoch": 3.2508457586768573, "grad_norm": 0.14100876450538635, "learning_rate": 9.881343916896534e-06, "loss": 0.4658, "num_input_tokens_seen": 31549344, "step": 25945 }, { "epoch": 3.251472246585641, "grad_norm": 0.08816881477832794, "learning_rate": 9.881225490069161e-06, "loss": 0.4621, "num_input_tokens_seen": 31555488, "step": 25950 }, { "epoch": 3.252098734494424, "grad_norm": 0.08803724497556686, "learning_rate": 9.881107004882633e-06, "loss": 0.4609, "num_input_tokens_seen": 31561760, "step": 25955 }, { "epoch": 3.2527252224032077, "grad_norm": 0.131035715341568, "learning_rate": 9.88098846133837e-06, "loss": 0.463, "num_input_tokens_seen": 31568128, "step": 25960 }, { "epoch": 3.253351710311991, "grad_norm": 0.13321895897388458, "learning_rate": 9.880869859437784e-06, "loss": 0.4629, "num_input_tokens_seen": 31574464, "step": 25965 }, { "epoch": 3.2539781982207745, "grad_norm": 0.0889110267162323, "learning_rate": 9.880751199182298e-06, "loss": 0.4634, "num_input_tokens_seen": 31579904, "step": 25970 }, { "epoch": 3.2546046861295577, "grad_norm": 0.04378895089030266, "learning_rate": 9.880632480573325e-06, "loss": 0.4605, "num_input_tokens_seen": 31586016, "step": 25975 }, { "epoch": 3.2552311740383413, "grad_norm": 0.16764113306999207, "learning_rate": 9.880513703612287e-06, "loss": 0.4563, "num_input_tokens_seen": 31592128, "step": 25980 }, { "epoch": 3.2558576619471244, "grad_norm": 0.08713552355766296, "learning_rate": 9.880394868300606e-06, "loss": 0.4652, "num_input_tokens_seen": 31598304, "step": 25985 }, { "epoch": 3.2564841498559076, "grad_norm": 0.040707167237997055, "learning_rate": 9.8802759746397e-06, "loss": 0.4595, "num_input_tokens_seen": 31604192, "step": 25990 }, { "epoch": 3.2571106377646912, "grad_norm": 0.07786423712968826, "learning_rate": 9.880157022630992e-06, "loss": 0.4632, "num_input_tokens_seen": 31610304, "step": 25995 }, { "epoch": 3.2577371256734744, "grad_norm": 0.10564471781253815, "learning_rate": 9.880038012275904e-06, "loss": 0.4612, "num_input_tokens_seen": 31616256, "step": 26000 }, { "epoch": 3.258363613582258, "grad_norm": 0.09138642251491547, "learning_rate": 9.87991894357586e-06, "loss": 0.4571, "num_input_tokens_seen": 31622304, "step": 26005 }, { "epoch": 3.258990101491041, "grad_norm": 0.04010132700204849, "learning_rate": 9.879799816532279e-06, "loss": 0.4639, "num_input_tokens_seen": 31628224, "step": 26010 }, { "epoch": 3.2596165893998243, "grad_norm": 0.04687037691473961, "learning_rate": 9.87968063114659e-06, "loss": 0.4623, "num_input_tokens_seen": 31634400, "step": 26015 }, { "epoch": 3.260243077308608, "grad_norm": 0.09190573543310165, "learning_rate": 9.879561387420218e-06, "loss": 0.463, "num_input_tokens_seen": 31640288, "step": 26020 }, { "epoch": 3.260869565217391, "grad_norm": 0.0860191211104393, "learning_rate": 9.879442085354584e-06, "loss": 0.467, "num_input_tokens_seen": 31646464, "step": 26025 }, { "epoch": 3.2614960531261747, "grad_norm": 0.0830894410610199, "learning_rate": 9.879322724951121e-06, "loss": 0.4665, "num_input_tokens_seen": 31652608, "step": 26030 }, { "epoch": 3.262122541034958, "grad_norm": 0.08526360988616943, "learning_rate": 9.879203306211252e-06, "loss": 0.465, "num_input_tokens_seen": 31658368, "step": 26035 }, { "epoch": 3.2627490289437415, "grad_norm": 0.10975279659032822, "learning_rate": 9.879083829136402e-06, "loss": 0.4628, "num_input_tokens_seen": 31664672, "step": 26040 }, { "epoch": 3.2633755168525247, "grad_norm": 0.08702429383993149, "learning_rate": 9.878964293728007e-06, "loss": 0.4638, "num_input_tokens_seen": 31670240, "step": 26045 }, { "epoch": 3.2640020047613083, "grad_norm": 0.07914512604475021, "learning_rate": 9.878844699987489e-06, "loss": 0.4653, "num_input_tokens_seen": 31676320, "step": 26050 }, { "epoch": 3.2646284926700915, "grad_norm": 0.11201237142086029, "learning_rate": 9.87872504791628e-06, "loss": 0.4563, "num_input_tokens_seen": 31681984, "step": 26055 }, { "epoch": 3.2652549805788746, "grad_norm": 0.11674094200134277, "learning_rate": 9.878605337515813e-06, "loss": 0.4663, "num_input_tokens_seen": 31688256, "step": 26060 }, { "epoch": 3.2658814684876583, "grad_norm": 0.03587963059544563, "learning_rate": 9.878485568787518e-06, "loss": 0.462, "num_input_tokens_seen": 31694432, "step": 26065 }, { "epoch": 3.2665079563964414, "grad_norm": 0.09446932375431061, "learning_rate": 9.878365741732824e-06, "loss": 0.4618, "num_input_tokens_seen": 31700480, "step": 26070 }, { "epoch": 3.267134444305225, "grad_norm": 0.08597995340824127, "learning_rate": 9.878245856353169e-06, "loss": 0.4672, "num_input_tokens_seen": 31706400, "step": 26075 }, { "epoch": 3.267760932214008, "grad_norm": 0.152430921792984, "learning_rate": 9.87812591264998e-06, "loss": 0.4634, "num_input_tokens_seen": 31712512, "step": 26080 }, { "epoch": 3.268387420122792, "grad_norm": 0.11299367994070053, "learning_rate": 9.878005910624698e-06, "loss": 0.4628, "num_input_tokens_seen": 31718560, "step": 26085 }, { "epoch": 3.269013908031575, "grad_norm": 0.04038483649492264, "learning_rate": 9.877885850278752e-06, "loss": 0.4614, "num_input_tokens_seen": 31724640, "step": 26090 }, { "epoch": 3.269640395940358, "grad_norm": 0.03339887782931328, "learning_rate": 9.87776573161358e-06, "loss": 0.4582, "num_input_tokens_seen": 31730400, "step": 26095 }, { "epoch": 3.2702668838491418, "grad_norm": 0.03877159208059311, "learning_rate": 9.877645554630618e-06, "loss": 0.4558, "num_input_tokens_seen": 31736448, "step": 26100 }, { "epoch": 3.270893371757925, "grad_norm": 0.0936686247587204, "learning_rate": 9.877525319331302e-06, "loss": 0.464, "num_input_tokens_seen": 31742304, "step": 26105 }, { "epoch": 3.2715198596667086, "grad_norm": 0.07536394894123077, "learning_rate": 9.87740502571707e-06, "loss": 0.4565, "num_input_tokens_seen": 31748704, "step": 26110 }, { "epoch": 3.2721463475754917, "grad_norm": 0.1405269354581833, "learning_rate": 9.87728467378936e-06, "loss": 0.4594, "num_input_tokens_seen": 31754656, "step": 26115 }, { "epoch": 3.2727728354842753, "grad_norm": 0.1396639496088028, "learning_rate": 9.877164263549613e-06, "loss": 0.461, "num_input_tokens_seen": 31760736, "step": 26120 }, { "epoch": 3.2733993233930585, "grad_norm": 0.13483306765556335, "learning_rate": 9.877043794999264e-06, "loss": 0.468, "num_input_tokens_seen": 31767136, "step": 26125 }, { "epoch": 3.2740258113018417, "grad_norm": 0.14497403800487518, "learning_rate": 9.876923268139758e-06, "loss": 0.464, "num_input_tokens_seen": 31773536, "step": 26130 }, { "epoch": 3.2746522992106253, "grad_norm": 0.08609353005886078, "learning_rate": 9.876802682972534e-06, "loss": 0.4731, "num_input_tokens_seen": 31779744, "step": 26135 }, { "epoch": 3.2752787871194085, "grad_norm": 0.10685797780752182, "learning_rate": 9.876682039499032e-06, "loss": 0.4687, "num_input_tokens_seen": 31786048, "step": 26140 }, { "epoch": 3.275905275028192, "grad_norm": 0.09479270875453949, "learning_rate": 9.876561337720697e-06, "loss": 0.4559, "num_input_tokens_seen": 31792192, "step": 26145 }, { "epoch": 3.2765317629369752, "grad_norm": 0.041759688407182693, "learning_rate": 9.876440577638972e-06, "loss": 0.4583, "num_input_tokens_seen": 31798496, "step": 26150 }, { "epoch": 3.277158250845759, "grad_norm": 0.04071735963225365, "learning_rate": 9.8763197592553e-06, "loss": 0.4549, "num_input_tokens_seen": 31804288, "step": 26155 }, { "epoch": 3.277784738754542, "grad_norm": 0.13727392256259918, "learning_rate": 9.876198882571126e-06, "loss": 0.4573, "num_input_tokens_seen": 31810432, "step": 26160 }, { "epoch": 3.2784112266633256, "grad_norm": 0.11596619337797165, "learning_rate": 9.876077947587892e-06, "loss": 0.4757, "num_input_tokens_seen": 31816320, "step": 26165 }, { "epoch": 3.279037714572109, "grad_norm": 0.0802551880478859, "learning_rate": 9.875956954307049e-06, "loss": 0.465, "num_input_tokens_seen": 31822528, "step": 26170 }, { "epoch": 3.279664202480892, "grad_norm": 0.08733034133911133, "learning_rate": 9.87583590273004e-06, "loss": 0.4712, "num_input_tokens_seen": 31828736, "step": 26175 }, { "epoch": 3.2802906903896756, "grad_norm": 0.07990729063749313, "learning_rate": 9.875714792858313e-06, "loss": 0.4644, "num_input_tokens_seen": 31834784, "step": 26180 }, { "epoch": 3.2809171782984587, "grad_norm": 0.08027619123458862, "learning_rate": 9.875593624693317e-06, "loss": 0.467, "num_input_tokens_seen": 31841024, "step": 26185 }, { "epoch": 3.2815436662072424, "grad_norm": 0.0718112587928772, "learning_rate": 9.875472398236502e-06, "loss": 0.4585, "num_input_tokens_seen": 31847136, "step": 26190 }, { "epoch": 3.2821701541160255, "grad_norm": 0.11162744462490082, "learning_rate": 9.875351113489313e-06, "loss": 0.4599, "num_input_tokens_seen": 31853056, "step": 26195 }, { "epoch": 3.2827966420248087, "grad_norm": 0.07575424015522003, "learning_rate": 9.875229770453203e-06, "loss": 0.4663, "num_input_tokens_seen": 31859264, "step": 26200 }, { "epoch": 3.2834231299335923, "grad_norm": 0.03592057153582573, "learning_rate": 9.875108369129623e-06, "loss": 0.4564, "num_input_tokens_seen": 31865312, "step": 26205 }, { "epoch": 3.2840496178423755, "grad_norm": 0.08434689044952393, "learning_rate": 9.874986909520022e-06, "loss": 0.466, "num_input_tokens_seen": 31871296, "step": 26210 }, { "epoch": 3.284676105751159, "grad_norm": 0.11086947470903397, "learning_rate": 9.874865391625854e-06, "loss": 0.4612, "num_input_tokens_seen": 31877312, "step": 26215 }, { "epoch": 3.2853025936599423, "grad_norm": 0.08724066615104675, "learning_rate": 9.874743815448572e-06, "loss": 0.4619, "num_input_tokens_seen": 31883360, "step": 26220 }, { "epoch": 3.285929081568726, "grad_norm": 0.09557858854532242, "learning_rate": 9.874622180989632e-06, "loss": 0.4614, "num_input_tokens_seen": 31889760, "step": 26225 }, { "epoch": 3.286555569477509, "grad_norm": 0.1022062823176384, "learning_rate": 9.874500488250482e-06, "loss": 0.463, "num_input_tokens_seen": 31895776, "step": 26230 }, { "epoch": 3.2871820573862927, "grad_norm": 0.13879911601543427, "learning_rate": 9.874378737232581e-06, "loss": 0.4645, "num_input_tokens_seen": 31901856, "step": 26235 }, { "epoch": 3.287808545295076, "grad_norm": 0.15640604496002197, "learning_rate": 9.874256927937387e-06, "loss": 0.4652, "num_input_tokens_seen": 31907520, "step": 26240 }, { "epoch": 3.288435033203859, "grad_norm": 0.0786905288696289, "learning_rate": 9.874135060366349e-06, "loss": 0.4671, "num_input_tokens_seen": 31913504, "step": 26245 }, { "epoch": 3.2890615211126426, "grad_norm": 0.21295088529586792, "learning_rate": 9.87401313452093e-06, "loss": 0.4689, "num_input_tokens_seen": 31919712, "step": 26250 }, { "epoch": 3.2896880090214258, "grad_norm": 0.12862640619277954, "learning_rate": 9.873891150402588e-06, "loss": 0.4638, "num_input_tokens_seen": 31925504, "step": 26255 }, { "epoch": 3.2903144969302094, "grad_norm": 0.09261125326156616, "learning_rate": 9.873769108012778e-06, "loss": 0.4653, "num_input_tokens_seen": 31931680, "step": 26260 }, { "epoch": 3.2909409848389926, "grad_norm": 0.16080456972122192, "learning_rate": 9.873647007352961e-06, "loss": 0.4642, "num_input_tokens_seen": 31938016, "step": 26265 }, { "epoch": 3.2915674727477757, "grad_norm": 0.08844875544309616, "learning_rate": 9.873524848424596e-06, "loss": 0.4631, "num_input_tokens_seen": 31944224, "step": 26270 }, { "epoch": 3.2921939606565593, "grad_norm": 0.09125825762748718, "learning_rate": 9.873402631229144e-06, "loss": 0.4588, "num_input_tokens_seen": 31950368, "step": 26275 }, { "epoch": 3.2928204485653425, "grad_norm": 0.12021351605653763, "learning_rate": 9.873280355768067e-06, "loss": 0.4635, "num_input_tokens_seen": 31956576, "step": 26280 }, { "epoch": 3.293446936474126, "grad_norm": 0.10678383708000183, "learning_rate": 9.873158022042825e-06, "loss": 0.4625, "num_input_tokens_seen": 31962752, "step": 26285 }, { "epoch": 3.2940734243829093, "grad_norm": 0.09300535172224045, "learning_rate": 9.873035630054883e-06, "loss": 0.4667, "num_input_tokens_seen": 31968768, "step": 26290 }, { "epoch": 3.294699912291693, "grad_norm": 0.15191048383712769, "learning_rate": 9.872913179805702e-06, "loss": 0.4655, "num_input_tokens_seen": 31974368, "step": 26295 }, { "epoch": 3.295326400200476, "grad_norm": 0.0925278440117836, "learning_rate": 9.872790671296747e-06, "loss": 0.4627, "num_input_tokens_seen": 31980672, "step": 26300 }, { "epoch": 3.2959528881092597, "grad_norm": 0.17956626415252686, "learning_rate": 9.872668104529484e-06, "loss": 0.4554, "num_input_tokens_seen": 31986656, "step": 26305 }, { "epoch": 3.296579376018043, "grad_norm": 0.13037213683128357, "learning_rate": 9.872545479505376e-06, "loss": 0.471, "num_input_tokens_seen": 31992096, "step": 26310 }, { "epoch": 3.297205863926826, "grad_norm": 0.11033344268798828, "learning_rate": 9.87242279622589e-06, "loss": 0.4638, "num_input_tokens_seen": 31997632, "step": 26315 }, { "epoch": 3.2978323518356096, "grad_norm": 0.14542438089847565, "learning_rate": 9.872300054692494e-06, "loss": 0.4608, "num_input_tokens_seen": 32003808, "step": 26320 }, { "epoch": 3.298458839744393, "grad_norm": 0.14731040596961975, "learning_rate": 9.872177254906653e-06, "loss": 0.4634, "num_input_tokens_seen": 32009856, "step": 26325 }, { "epoch": 3.2990853276531764, "grad_norm": 0.1256447732448578, "learning_rate": 9.872054396869836e-06, "loss": 0.4653, "num_input_tokens_seen": 32016288, "step": 26330 }, { "epoch": 3.2997118155619596, "grad_norm": 0.09229264408349991, "learning_rate": 9.871931480583515e-06, "loss": 0.4684, "num_input_tokens_seen": 32022528, "step": 26335 }, { "epoch": 3.300338303470743, "grad_norm": 0.21209749579429626, "learning_rate": 9.871808506049156e-06, "loss": 0.4614, "num_input_tokens_seen": 32028256, "step": 26340 }, { "epoch": 3.3009647913795264, "grad_norm": 0.10224524885416031, "learning_rate": 9.87168547326823e-06, "loss": 0.463, "num_input_tokens_seen": 32034240, "step": 26345 }, { "epoch": 3.30159127928831, "grad_norm": 0.07688411325216293, "learning_rate": 9.87156238224221e-06, "loss": 0.4558, "num_input_tokens_seen": 32040288, "step": 26350 }, { "epoch": 3.302217767197093, "grad_norm": 0.08258458226919174, "learning_rate": 9.871439232972563e-06, "loss": 0.4641, "num_input_tokens_seen": 32046176, "step": 26355 }, { "epoch": 3.3028442551058763, "grad_norm": 0.11469455063343048, "learning_rate": 9.871316025460767e-06, "loss": 0.4568, "num_input_tokens_seen": 32052352, "step": 26360 }, { "epoch": 3.30347074301466, "grad_norm": 0.13816602528095245, "learning_rate": 9.87119275970829e-06, "loss": 0.4651, "num_input_tokens_seen": 32058528, "step": 26365 }, { "epoch": 3.304097230923443, "grad_norm": 0.09341759234666824, "learning_rate": 9.871069435716609e-06, "loss": 0.4585, "num_input_tokens_seen": 32064544, "step": 26370 }, { "epoch": 3.3047237188322267, "grad_norm": 0.03865397721529007, "learning_rate": 9.870946053487198e-06, "loss": 0.4693, "num_input_tokens_seen": 32070816, "step": 26375 }, { "epoch": 3.30535020674101, "grad_norm": 0.08408914506435394, "learning_rate": 9.870822613021531e-06, "loss": 0.462, "num_input_tokens_seen": 32076800, "step": 26380 }, { "epoch": 3.305976694649793, "grad_norm": 0.09222789108753204, "learning_rate": 9.870699114321085e-06, "loss": 0.4607, "num_input_tokens_seen": 32083008, "step": 26385 }, { "epoch": 3.3066031825585767, "grad_norm": 0.09119687974452972, "learning_rate": 9.870575557387336e-06, "loss": 0.4619, "num_input_tokens_seen": 32088768, "step": 26390 }, { "epoch": 3.30722967046736, "grad_norm": 0.10038291662931442, "learning_rate": 9.870451942221762e-06, "loss": 0.4619, "num_input_tokens_seen": 32094944, "step": 26395 }, { "epoch": 3.3078561583761434, "grad_norm": 0.13601401448249817, "learning_rate": 9.870328268825839e-06, "loss": 0.4711, "num_input_tokens_seen": 32100896, "step": 26400 }, { "epoch": 3.3084826462849266, "grad_norm": 0.03394802287220955, "learning_rate": 9.870204537201046e-06, "loss": 0.464, "num_input_tokens_seen": 32107424, "step": 26405 }, { "epoch": 3.3091091341937102, "grad_norm": 0.04112378880381584, "learning_rate": 9.870080747348864e-06, "loss": 0.4659, "num_input_tokens_seen": 32114016, "step": 26410 }, { "epoch": 3.3097356221024934, "grad_norm": 0.08862698823213577, "learning_rate": 9.869956899270774e-06, "loss": 0.4619, "num_input_tokens_seen": 32120032, "step": 26415 }, { "epoch": 3.310362110011277, "grad_norm": 0.07597963511943817, "learning_rate": 9.869832992968253e-06, "loss": 0.4629, "num_input_tokens_seen": 32126304, "step": 26420 }, { "epoch": 3.31098859792006, "grad_norm": 0.031382229179143906, "learning_rate": 9.869709028442785e-06, "loss": 0.464, "num_input_tokens_seen": 32132608, "step": 26425 }, { "epoch": 3.3116150858288433, "grad_norm": 0.13534066081047058, "learning_rate": 9.86958500569585e-06, "loss": 0.4615, "num_input_tokens_seen": 32139040, "step": 26430 }, { "epoch": 3.312241573737627, "grad_norm": 0.04310554265975952, "learning_rate": 9.869460924728933e-06, "loss": 0.4657, "num_input_tokens_seen": 32145024, "step": 26435 }, { "epoch": 3.31286806164641, "grad_norm": 0.12579278647899628, "learning_rate": 9.869336785543516e-06, "loss": 0.4601, "num_input_tokens_seen": 32151296, "step": 26440 }, { "epoch": 3.3134945495551937, "grad_norm": 0.08367646485567093, "learning_rate": 9.869212588141084e-06, "loss": 0.4635, "num_input_tokens_seen": 32157216, "step": 26445 }, { "epoch": 3.314121037463977, "grad_norm": 0.15147744119167328, "learning_rate": 9.869088332523123e-06, "loss": 0.4587, "num_input_tokens_seen": 32163776, "step": 26450 }, { "epoch": 3.31474752537276, "grad_norm": 0.07642605155706406, "learning_rate": 9.868964018691117e-06, "loss": 0.4651, "num_input_tokens_seen": 32169600, "step": 26455 }, { "epoch": 3.3153740132815437, "grad_norm": 0.1057719811797142, "learning_rate": 9.86883964664655e-06, "loss": 0.4638, "num_input_tokens_seen": 32175776, "step": 26460 }, { "epoch": 3.316000501190327, "grad_norm": 0.08329122513532639, "learning_rate": 9.868715216390914e-06, "loss": 0.4579, "num_input_tokens_seen": 32181152, "step": 26465 }, { "epoch": 3.3166269890991105, "grad_norm": 0.08758527785539627, "learning_rate": 9.868590727925695e-06, "loss": 0.462, "num_input_tokens_seen": 32187008, "step": 26470 }, { "epoch": 3.3172534770078936, "grad_norm": 0.13352535665035248, "learning_rate": 9.868466181252378e-06, "loss": 0.4622, "num_input_tokens_seen": 32193376, "step": 26475 }, { "epoch": 3.3178799649166772, "grad_norm": 0.0920633003115654, "learning_rate": 9.868341576372455e-06, "loss": 0.4667, "num_input_tokens_seen": 32199552, "step": 26480 }, { "epoch": 3.3185064528254604, "grad_norm": 0.07321047782897949, "learning_rate": 9.868216913287417e-06, "loss": 0.4587, "num_input_tokens_seen": 32205568, "step": 26485 }, { "epoch": 3.319132940734244, "grad_norm": 0.11165787279605865, "learning_rate": 9.868092191998753e-06, "loss": 0.4623, "num_input_tokens_seen": 32211744, "step": 26490 }, { "epoch": 3.319759428643027, "grad_norm": 0.09635739773511887, "learning_rate": 9.867967412507953e-06, "loss": 0.4668, "num_input_tokens_seen": 32218080, "step": 26495 }, { "epoch": 3.3203859165518104, "grad_norm": 0.040816530585289, "learning_rate": 9.867842574816508e-06, "loss": 0.4619, "num_input_tokens_seen": 32224064, "step": 26500 }, { "epoch": 3.321012404460594, "grad_norm": 0.10960926860570908, "learning_rate": 9.867717678925915e-06, "loss": 0.4681, "num_input_tokens_seen": 32230240, "step": 26505 }, { "epoch": 3.321638892369377, "grad_norm": 0.08152543753385544, "learning_rate": 9.867592724837665e-06, "loss": 0.4641, "num_input_tokens_seen": 32236224, "step": 26510 }, { "epoch": 3.3222653802781608, "grad_norm": 0.15093541145324707, "learning_rate": 9.86746771255325e-06, "loss": 0.4582, "num_input_tokens_seen": 32242208, "step": 26515 }, { "epoch": 3.322891868186944, "grad_norm": 0.10591559112071991, "learning_rate": 9.867342642074165e-06, "loss": 0.4723, "num_input_tokens_seen": 32248416, "step": 26520 }, { "epoch": 3.3235183560957275, "grad_norm": 0.07559625059366226, "learning_rate": 9.867217513401908e-06, "loss": 0.4644, "num_input_tokens_seen": 32254688, "step": 26525 }, { "epoch": 3.3241448440045107, "grad_norm": 0.07463984191417694, "learning_rate": 9.867092326537975e-06, "loss": 0.4679, "num_input_tokens_seen": 32261024, "step": 26530 }, { "epoch": 3.324771331913294, "grad_norm": 0.03431686758995056, "learning_rate": 9.866967081483858e-06, "loss": 0.4658, "num_input_tokens_seen": 32267008, "step": 26535 }, { "epoch": 3.3253978198220775, "grad_norm": 0.08390884846448898, "learning_rate": 9.86684177824106e-06, "loss": 0.4599, "num_input_tokens_seen": 32273280, "step": 26540 }, { "epoch": 3.3260243077308607, "grad_norm": 0.09232435375452042, "learning_rate": 9.866716416811075e-06, "loss": 0.4641, "num_input_tokens_seen": 32279744, "step": 26545 }, { "epoch": 3.3266507956396443, "grad_norm": 0.07484500855207443, "learning_rate": 9.866590997195406e-06, "loss": 0.4673, "num_input_tokens_seen": 32285856, "step": 26550 }, { "epoch": 3.3272772835484274, "grad_norm": 0.0786178931593895, "learning_rate": 9.866465519395547e-06, "loss": 0.4615, "num_input_tokens_seen": 32291072, "step": 26555 }, { "epoch": 3.327903771457211, "grad_norm": 0.09289363771677017, "learning_rate": 9.866339983413003e-06, "loss": 0.4619, "num_input_tokens_seen": 32297184, "step": 26560 }, { "epoch": 3.3285302593659942, "grad_norm": 0.14896491169929504, "learning_rate": 9.866214389249273e-06, "loss": 0.4598, "num_input_tokens_seen": 32303072, "step": 26565 }, { "epoch": 3.3291567472747774, "grad_norm": 0.11315406113862991, "learning_rate": 9.86608873690586e-06, "loss": 0.4661, "num_input_tokens_seen": 32309280, "step": 26570 }, { "epoch": 3.329783235183561, "grad_norm": 0.0964232012629509, "learning_rate": 9.865963026384262e-06, "loss": 0.461, "num_input_tokens_seen": 32314880, "step": 26575 }, { "epoch": 3.330409723092344, "grad_norm": 0.0813891589641571, "learning_rate": 9.865837257685987e-06, "loss": 0.469, "num_input_tokens_seen": 32320960, "step": 26580 }, { "epoch": 3.331036211001128, "grad_norm": 0.11799611896276474, "learning_rate": 9.865711430812537e-06, "loss": 0.4627, "num_input_tokens_seen": 32327328, "step": 26585 }, { "epoch": 3.331662698909911, "grad_norm": 0.10450339317321777, "learning_rate": 9.865585545765415e-06, "loss": 0.4564, "num_input_tokens_seen": 32333440, "step": 26590 }, { "epoch": 3.3322891868186946, "grad_norm": 0.12190397083759308, "learning_rate": 9.865459602546125e-06, "loss": 0.46, "num_input_tokens_seen": 32339648, "step": 26595 }, { "epoch": 3.3329156747274777, "grad_norm": 0.13639721274375916, "learning_rate": 9.865333601156178e-06, "loss": 0.4633, "num_input_tokens_seen": 32345856, "step": 26600 }, { "epoch": 3.3335421626362614, "grad_norm": 0.07243894040584564, "learning_rate": 9.865207541597075e-06, "loss": 0.4646, "num_input_tokens_seen": 32351904, "step": 26605 }, { "epoch": 3.3341686505450445, "grad_norm": 0.08907163888216019, "learning_rate": 9.865081423870328e-06, "loss": 0.4608, "num_input_tokens_seen": 32358208, "step": 26610 }, { "epoch": 3.3347951384538277, "grad_norm": 0.038530655205249786, "learning_rate": 9.864955247977442e-06, "loss": 0.4637, "num_input_tokens_seen": 32364320, "step": 26615 }, { "epoch": 3.3354216263626113, "grad_norm": 0.08179962635040283, "learning_rate": 9.864829013919923e-06, "loss": 0.4665, "num_input_tokens_seen": 32370752, "step": 26620 }, { "epoch": 3.3360481142713945, "grad_norm": 0.09040939807891846, "learning_rate": 9.864702721699284e-06, "loss": 0.4608, "num_input_tokens_seen": 32376672, "step": 26625 }, { "epoch": 3.336674602180178, "grad_norm": 0.08744176477193832, "learning_rate": 9.864576371317035e-06, "loss": 0.4618, "num_input_tokens_seen": 32382880, "step": 26630 }, { "epoch": 3.3373010900889613, "grad_norm": 0.0897766724228859, "learning_rate": 9.864449962774684e-06, "loss": 0.4593, "num_input_tokens_seen": 32389088, "step": 26635 }, { "epoch": 3.3379275779977444, "grad_norm": 0.11765100806951523, "learning_rate": 9.864323496073747e-06, "loss": 0.4579, "num_input_tokens_seen": 32395008, "step": 26640 }, { "epoch": 3.338554065906528, "grad_norm": 0.08128402382135391, "learning_rate": 9.86419697121573e-06, "loss": 0.4576, "num_input_tokens_seen": 32400864, "step": 26645 }, { "epoch": 3.339180553815311, "grad_norm": 0.0993279367685318, "learning_rate": 9.86407038820215e-06, "loss": 0.4706, "num_input_tokens_seen": 32406976, "step": 26650 }, { "epoch": 3.339807041724095, "grad_norm": 0.0801912173628807, "learning_rate": 9.863943747034518e-06, "loss": 0.4586, "num_input_tokens_seen": 32413056, "step": 26655 }, { "epoch": 3.340433529632878, "grad_norm": 0.11036606878042221, "learning_rate": 9.86381704771435e-06, "loss": 0.4655, "num_input_tokens_seen": 32419456, "step": 26660 }, { "epoch": 3.3410600175416616, "grad_norm": 0.12070214748382568, "learning_rate": 9.86369029024316e-06, "loss": 0.4568, "num_input_tokens_seen": 32425760, "step": 26665 }, { "epoch": 3.3416865054504448, "grad_norm": 0.03891308233141899, "learning_rate": 9.863563474622461e-06, "loss": 0.4652, "num_input_tokens_seen": 32431776, "step": 26670 }, { "epoch": 3.3423129933592284, "grad_norm": 0.08760735392570496, "learning_rate": 9.863436600853775e-06, "loss": 0.4728, "num_input_tokens_seen": 32437984, "step": 26675 }, { "epoch": 3.3429394812680115, "grad_norm": 0.12138959765434265, "learning_rate": 9.863309668938614e-06, "loss": 0.4607, "num_input_tokens_seen": 32444032, "step": 26680 }, { "epoch": 3.3435659691767947, "grad_norm": 0.0868685245513916, "learning_rate": 9.863182678878498e-06, "loss": 0.4617, "num_input_tokens_seen": 32450208, "step": 26685 }, { "epoch": 3.3441924570855783, "grad_norm": 0.08136041462421417, "learning_rate": 9.863055630674943e-06, "loss": 0.4676, "num_input_tokens_seen": 32456448, "step": 26690 }, { "epoch": 3.3448189449943615, "grad_norm": 0.10112544149160385, "learning_rate": 9.86292852432947e-06, "loss": 0.464, "num_input_tokens_seen": 32462464, "step": 26695 }, { "epoch": 3.345445432903145, "grad_norm": 0.07785531133413315, "learning_rate": 9.862801359843597e-06, "loss": 0.4619, "num_input_tokens_seen": 32468736, "step": 26700 }, { "epoch": 3.3460719208119283, "grad_norm": 0.09132128208875656, "learning_rate": 9.862674137218846e-06, "loss": 0.4589, "num_input_tokens_seen": 32474816, "step": 26705 }, { "epoch": 3.3466984087207114, "grad_norm": 0.0765305683016777, "learning_rate": 9.862546856456738e-06, "loss": 0.4664, "num_input_tokens_seen": 32480864, "step": 26710 }, { "epoch": 3.347324896629495, "grad_norm": 0.08467262983322144, "learning_rate": 9.862419517558794e-06, "loss": 0.4591, "num_input_tokens_seen": 32487008, "step": 26715 }, { "epoch": 3.3479513845382782, "grad_norm": 0.11682184785604477, "learning_rate": 9.862292120526536e-06, "loss": 0.4596, "num_input_tokens_seen": 32493280, "step": 26720 }, { "epoch": 3.348577872447062, "grad_norm": 0.1712821125984192, "learning_rate": 9.862164665361487e-06, "loss": 0.4633, "num_input_tokens_seen": 32499136, "step": 26725 }, { "epoch": 3.349204360355845, "grad_norm": 0.08358869701623917, "learning_rate": 9.862037152065173e-06, "loss": 0.4628, "num_input_tokens_seen": 32505312, "step": 26730 }, { "epoch": 3.3498308482646286, "grad_norm": 0.0843629539012909, "learning_rate": 9.861909580639116e-06, "loss": 0.468, "num_input_tokens_seen": 32510752, "step": 26735 }, { "epoch": 3.350457336173412, "grad_norm": 0.15383002161979675, "learning_rate": 9.861781951084844e-06, "loss": 0.4655, "num_input_tokens_seen": 32517024, "step": 26740 }, { "epoch": 3.3510838240821954, "grad_norm": 0.1080518290400505, "learning_rate": 9.861654263403879e-06, "loss": 0.4647, "num_input_tokens_seen": 32523264, "step": 26745 }, { "epoch": 3.3517103119909786, "grad_norm": 0.087700754404068, "learning_rate": 9.861526517597751e-06, "loss": 0.4576, "num_input_tokens_seen": 32529216, "step": 26750 }, { "epoch": 3.3523367998997617, "grad_norm": 0.08541008085012436, "learning_rate": 9.861398713667985e-06, "loss": 0.4579, "num_input_tokens_seen": 32535584, "step": 26755 }, { "epoch": 3.3529632878085454, "grad_norm": 0.15890488028526306, "learning_rate": 9.861270851616112e-06, "loss": 0.4625, "num_input_tokens_seen": 32541952, "step": 26760 }, { "epoch": 3.3535897757173285, "grad_norm": 0.09386413544416428, "learning_rate": 9.861142931443658e-06, "loss": 0.4654, "num_input_tokens_seen": 32548096, "step": 26765 }, { "epoch": 3.354216263626112, "grad_norm": 0.11281358450651169, "learning_rate": 9.861014953152153e-06, "loss": 0.4655, "num_input_tokens_seen": 32554592, "step": 26770 }, { "epoch": 3.3548427515348953, "grad_norm": 0.030842341482639313, "learning_rate": 9.860886916743126e-06, "loss": 0.458, "num_input_tokens_seen": 32560480, "step": 26775 }, { "epoch": 3.355469239443679, "grad_norm": 0.08961968868970871, "learning_rate": 9.860758822218111e-06, "loss": 0.4661, "num_input_tokens_seen": 32566816, "step": 26780 }, { "epoch": 3.356095727352462, "grad_norm": 0.08590291440486908, "learning_rate": 9.860630669578637e-06, "loss": 0.4575, "num_input_tokens_seen": 32572608, "step": 26785 }, { "epoch": 3.3567222152612457, "grad_norm": 0.10829363763332367, "learning_rate": 9.860502458826238e-06, "loss": 0.4704, "num_input_tokens_seen": 32579072, "step": 26790 }, { "epoch": 3.357348703170029, "grad_norm": 0.0816570296883583, "learning_rate": 9.860374189962445e-06, "loss": 0.4559, "num_input_tokens_seen": 32585120, "step": 26795 }, { "epoch": 3.357975191078812, "grad_norm": 0.12903714179992676, "learning_rate": 9.86024586298879e-06, "loss": 0.4534, "num_input_tokens_seen": 32591168, "step": 26800 }, { "epoch": 3.3586016789875957, "grad_norm": 0.1833149790763855, "learning_rate": 9.860117477906813e-06, "loss": 0.477, "num_input_tokens_seen": 32597376, "step": 26805 }, { "epoch": 3.359228166896379, "grad_norm": 0.11065491288900375, "learning_rate": 9.859989034718043e-06, "loss": 0.4606, "num_input_tokens_seen": 32603360, "step": 26810 }, { "epoch": 3.3598546548051624, "grad_norm": 0.1423536092042923, "learning_rate": 9.859860533424017e-06, "loss": 0.4617, "num_input_tokens_seen": 32609856, "step": 26815 }, { "epoch": 3.3604811427139456, "grad_norm": 0.08289378881454468, "learning_rate": 9.859731974026274e-06, "loss": 0.4663, "num_input_tokens_seen": 32615904, "step": 26820 }, { "epoch": 3.3611076306227288, "grad_norm": 0.15626472234725952, "learning_rate": 9.85960335652635e-06, "loss": 0.4538, "num_input_tokens_seen": 32622112, "step": 26825 }, { "epoch": 3.3617341185315124, "grad_norm": 0.08397229015827179, "learning_rate": 9.859474680925783e-06, "loss": 0.4501, "num_input_tokens_seen": 32628416, "step": 26830 }, { "epoch": 3.3623606064402956, "grad_norm": 0.10670129209756851, "learning_rate": 9.859345947226108e-06, "loss": 0.4594, "num_input_tokens_seen": 32634464, "step": 26835 }, { "epoch": 3.362987094349079, "grad_norm": 0.07532637566328049, "learning_rate": 9.859217155428867e-06, "loss": 0.4635, "num_input_tokens_seen": 32640672, "step": 26840 }, { "epoch": 3.3636135822578623, "grad_norm": 0.07567291706800461, "learning_rate": 9.8590883055356e-06, "loss": 0.4616, "num_input_tokens_seen": 32646816, "step": 26845 }, { "epoch": 3.364240070166646, "grad_norm": 0.08581326901912689, "learning_rate": 9.858959397547847e-06, "loss": 0.4654, "num_input_tokens_seen": 32653152, "step": 26850 }, { "epoch": 3.364866558075429, "grad_norm": 0.041020844131708145, "learning_rate": 9.85883043146715e-06, "loss": 0.4629, "num_input_tokens_seen": 32658528, "step": 26855 }, { "epoch": 3.3654930459842127, "grad_norm": 0.14116349816322327, "learning_rate": 9.858701407295051e-06, "loss": 0.4571, "num_input_tokens_seen": 32664928, "step": 26860 }, { "epoch": 3.366119533892996, "grad_norm": 0.11786594241857529, "learning_rate": 9.858572325033089e-06, "loss": 0.4584, "num_input_tokens_seen": 32671136, "step": 26865 }, { "epoch": 3.366746021801779, "grad_norm": 0.08903612941503525, "learning_rate": 9.858443184682811e-06, "loss": 0.4732, "num_input_tokens_seen": 32677184, "step": 26870 }, { "epoch": 3.3673725097105627, "grad_norm": 0.043533872812986374, "learning_rate": 9.85831398624576e-06, "loss": 0.4649, "num_input_tokens_seen": 32683136, "step": 26875 }, { "epoch": 3.367998997619346, "grad_norm": 0.13820917904376984, "learning_rate": 9.858184729723481e-06, "loss": 0.47, "num_input_tokens_seen": 32689568, "step": 26880 }, { "epoch": 3.3686254855281295, "grad_norm": 0.08834534138441086, "learning_rate": 9.85805541511752e-06, "loss": 0.4569, "num_input_tokens_seen": 32695680, "step": 26885 }, { "epoch": 3.3692519734369126, "grad_norm": 0.10481510311365128, "learning_rate": 9.85792604242942e-06, "loss": 0.4678, "num_input_tokens_seen": 32701216, "step": 26890 }, { "epoch": 3.369878461345696, "grad_norm": 0.08799593895673752, "learning_rate": 9.857796611660731e-06, "loss": 0.4548, "num_input_tokens_seen": 32707520, "step": 26895 }, { "epoch": 3.3705049492544794, "grad_norm": 0.0973237082362175, "learning_rate": 9.857667122813e-06, "loss": 0.4601, "num_input_tokens_seen": 32713600, "step": 26900 }, { "epoch": 3.3711314371632626, "grad_norm": 0.12961159646511078, "learning_rate": 9.857537575887773e-06, "loss": 0.4562, "num_input_tokens_seen": 32719424, "step": 26905 }, { "epoch": 3.371757925072046, "grad_norm": 0.08168281614780426, "learning_rate": 9.857407970886602e-06, "loss": 0.4483, "num_input_tokens_seen": 32725568, "step": 26910 }, { "epoch": 3.3723844129808294, "grad_norm": 0.043043240904808044, "learning_rate": 9.857278307811034e-06, "loss": 0.4719, "num_input_tokens_seen": 32731360, "step": 26915 }, { "epoch": 3.373010900889613, "grad_norm": 0.10591115057468414, "learning_rate": 9.85714858666262e-06, "loss": 0.4751, "num_input_tokens_seen": 32737632, "step": 26920 }, { "epoch": 3.373637388798396, "grad_norm": 0.10180234163999557, "learning_rate": 9.85701880744291e-06, "loss": 0.4752, "num_input_tokens_seen": 32743232, "step": 26925 }, { "epoch": 3.3742638767071798, "grad_norm": 0.10694071650505066, "learning_rate": 9.856888970153456e-06, "loss": 0.4703, "num_input_tokens_seen": 32749216, "step": 26930 }, { "epoch": 3.374890364615963, "grad_norm": 0.031639065593481064, "learning_rate": 9.856759074795813e-06, "loss": 0.4587, "num_input_tokens_seen": 32755456, "step": 26935 }, { "epoch": 3.375516852524746, "grad_norm": 0.09971126914024353, "learning_rate": 9.85662912137153e-06, "loss": 0.467, "num_input_tokens_seen": 32761664, "step": 26940 }, { "epoch": 3.3761433404335297, "grad_norm": 0.06991668045520782, "learning_rate": 9.856499109882166e-06, "loss": 0.4605, "num_input_tokens_seen": 32767872, "step": 26945 }, { "epoch": 3.376769828342313, "grad_norm": 0.10877853631973267, "learning_rate": 9.85636904032927e-06, "loss": 0.4638, "num_input_tokens_seen": 32773984, "step": 26950 }, { "epoch": 3.3773963162510965, "grad_norm": 0.12346822023391724, "learning_rate": 9.856238912714397e-06, "loss": 0.4605, "num_input_tokens_seen": 32779872, "step": 26955 }, { "epoch": 3.3780228041598797, "grad_norm": 0.0733761116862297, "learning_rate": 9.856108727039108e-06, "loss": 0.4627, "num_input_tokens_seen": 32786016, "step": 26960 }, { "epoch": 3.3786492920686633, "grad_norm": 0.1970096230506897, "learning_rate": 9.855978483304955e-06, "loss": 0.4569, "num_input_tokens_seen": 32792320, "step": 26965 }, { "epoch": 3.3792757799774464, "grad_norm": 0.13161851465702057, "learning_rate": 9.855848181513497e-06, "loss": 0.4547, "num_input_tokens_seen": 32798336, "step": 26970 }, { "epoch": 3.3799022678862296, "grad_norm": 0.07434294372797012, "learning_rate": 9.85571782166629e-06, "loss": 0.4548, "num_input_tokens_seen": 32804320, "step": 26975 }, { "epoch": 3.380528755795013, "grad_norm": 0.09718851745128632, "learning_rate": 9.855587403764896e-06, "loss": 0.4647, "num_input_tokens_seen": 32810528, "step": 26980 }, { "epoch": 3.3811552437037964, "grad_norm": 0.1296650469303131, "learning_rate": 9.855456927810871e-06, "loss": 0.4648, "num_input_tokens_seen": 32816480, "step": 26985 }, { "epoch": 3.38178173161258, "grad_norm": 0.13855376839637756, "learning_rate": 9.855326393805778e-06, "loss": 0.4612, "num_input_tokens_seen": 32822080, "step": 26990 }, { "epoch": 3.382408219521363, "grad_norm": 0.08389333635568619, "learning_rate": 9.855195801751174e-06, "loss": 0.4691, "num_input_tokens_seen": 32827776, "step": 26995 }, { "epoch": 3.383034707430147, "grad_norm": 0.09931245446205139, "learning_rate": 9.855065151648622e-06, "loss": 0.4632, "num_input_tokens_seen": 32833600, "step": 27000 }, { "epoch": 3.38366119533893, "grad_norm": 0.08322262018918991, "learning_rate": 9.854934443499684e-06, "loss": 0.4625, "num_input_tokens_seen": 32840224, "step": 27005 }, { "epoch": 3.384287683247713, "grad_norm": 0.0923025831580162, "learning_rate": 9.854803677305924e-06, "loss": 0.4614, "num_input_tokens_seen": 32845248, "step": 27010 }, { "epoch": 3.3849141711564967, "grad_norm": 0.03599080070853233, "learning_rate": 9.854672853068905e-06, "loss": 0.4647, "num_input_tokens_seen": 32851360, "step": 27015 }, { "epoch": 3.38554065906528, "grad_norm": 0.07621506601572037, "learning_rate": 9.854541970790188e-06, "loss": 0.4644, "num_input_tokens_seen": 32857792, "step": 27020 }, { "epoch": 3.3861671469740635, "grad_norm": 0.07874622941017151, "learning_rate": 9.854411030471343e-06, "loss": 0.4605, "num_input_tokens_seen": 32863680, "step": 27025 }, { "epoch": 3.3867936348828467, "grad_norm": 0.0895712822675705, "learning_rate": 9.854280032113931e-06, "loss": 0.4612, "num_input_tokens_seen": 32869760, "step": 27030 }, { "epoch": 3.3874201227916303, "grad_norm": 0.0798560157418251, "learning_rate": 9.85414897571952e-06, "loss": 0.46, "num_input_tokens_seen": 32875936, "step": 27035 }, { "epoch": 3.3880466107004135, "grad_norm": 0.08443786948919296, "learning_rate": 9.854017861289678e-06, "loss": 0.4587, "num_input_tokens_seen": 32881568, "step": 27040 }, { "epoch": 3.388673098609197, "grad_norm": 0.12535975873470306, "learning_rate": 9.85388668882597e-06, "loss": 0.4607, "num_input_tokens_seen": 32887776, "step": 27045 }, { "epoch": 3.3892995865179802, "grad_norm": 0.13607901334762573, "learning_rate": 9.853755458329967e-06, "loss": 0.4711, "num_input_tokens_seen": 32893760, "step": 27050 }, { "epoch": 3.3899260744267634, "grad_norm": 0.08126724511384964, "learning_rate": 9.853624169803237e-06, "loss": 0.4563, "num_input_tokens_seen": 32899872, "step": 27055 }, { "epoch": 3.390552562335547, "grad_norm": 0.07371550798416138, "learning_rate": 9.853492823247349e-06, "loss": 0.4532, "num_input_tokens_seen": 32906176, "step": 27060 }, { "epoch": 3.39117905024433, "grad_norm": 0.07303760200738907, "learning_rate": 9.853361418663872e-06, "loss": 0.46, "num_input_tokens_seen": 32911552, "step": 27065 }, { "epoch": 3.391805538153114, "grad_norm": 0.18051110208034515, "learning_rate": 9.85322995605438e-06, "loss": 0.4717, "num_input_tokens_seen": 32917504, "step": 27070 }, { "epoch": 3.392432026061897, "grad_norm": 0.08764728903770447, "learning_rate": 9.853098435420443e-06, "loss": 0.4649, "num_input_tokens_seen": 32923328, "step": 27075 }, { "epoch": 3.39305851397068, "grad_norm": 0.10017763823270798, "learning_rate": 9.852966856763635e-06, "loss": 0.4648, "num_input_tokens_seen": 32929504, "step": 27080 }, { "epoch": 3.3936850018794638, "grad_norm": 0.07165884226560593, "learning_rate": 9.852835220085527e-06, "loss": 0.4637, "num_input_tokens_seen": 32935616, "step": 27085 }, { "epoch": 3.394311489788247, "grad_norm": 0.0734499990940094, "learning_rate": 9.852703525387694e-06, "loss": 0.4626, "num_input_tokens_seen": 32941312, "step": 27090 }, { "epoch": 3.3949379776970305, "grad_norm": 0.11072976887226105, "learning_rate": 9.852571772671712e-06, "loss": 0.465, "num_input_tokens_seen": 32947552, "step": 27095 }, { "epoch": 3.3955644656058137, "grad_norm": 0.11899320036172867, "learning_rate": 9.852439961939152e-06, "loss": 0.4579, "num_input_tokens_seen": 32953824, "step": 27100 }, { "epoch": 3.3961909535145973, "grad_norm": 0.132233664393425, "learning_rate": 9.852308093191596e-06, "loss": 0.4691, "num_input_tokens_seen": 32960288, "step": 27105 }, { "epoch": 3.3968174414233805, "grad_norm": 0.09353366494178772, "learning_rate": 9.852176166430613e-06, "loss": 0.4654, "num_input_tokens_seen": 32965664, "step": 27110 }, { "epoch": 3.397443929332164, "grad_norm": 0.07492021471261978, "learning_rate": 9.852044181657788e-06, "loss": 0.4632, "num_input_tokens_seen": 32971904, "step": 27115 }, { "epoch": 3.3980704172409473, "grad_norm": 0.0923595055937767, "learning_rate": 9.851912138874695e-06, "loss": 0.4633, "num_input_tokens_seen": 32977920, "step": 27120 }, { "epoch": 3.3986969051497304, "grad_norm": 0.10580708831548691, "learning_rate": 9.851780038082911e-06, "loss": 0.4673, "num_input_tokens_seen": 32983936, "step": 27125 }, { "epoch": 3.399323393058514, "grad_norm": 0.1343872994184494, "learning_rate": 9.851647879284019e-06, "loss": 0.4681, "num_input_tokens_seen": 32989472, "step": 27130 }, { "epoch": 3.3999498809672972, "grad_norm": 0.1310368776321411, "learning_rate": 9.851515662479598e-06, "loss": 0.4645, "num_input_tokens_seen": 32995616, "step": 27135 }, { "epoch": 3.400576368876081, "grad_norm": 0.07122471183538437, "learning_rate": 9.851383387671227e-06, "loss": 0.4663, "num_input_tokens_seen": 33001728, "step": 27140 }, { "epoch": 3.401202856784864, "grad_norm": 0.06830458343029022, "learning_rate": 9.851251054860489e-06, "loss": 0.4617, "num_input_tokens_seen": 33008128, "step": 27145 }, { "epoch": 3.401829344693647, "grad_norm": 0.07912347465753555, "learning_rate": 9.851118664048966e-06, "loss": 0.461, "num_input_tokens_seen": 33013920, "step": 27150 }, { "epoch": 3.402455832602431, "grad_norm": 0.07882734388113022, "learning_rate": 9.85098621523824e-06, "loss": 0.4631, "num_input_tokens_seen": 33019968, "step": 27155 }, { "epoch": 3.403082320511214, "grad_norm": 0.08017712086439133, "learning_rate": 9.850853708429897e-06, "loss": 0.4597, "num_input_tokens_seen": 33026048, "step": 27160 }, { "epoch": 3.4037088084199976, "grad_norm": 0.08017000555992126, "learning_rate": 9.850721143625518e-06, "loss": 0.4668, "num_input_tokens_seen": 33032032, "step": 27165 }, { "epoch": 3.4043352963287807, "grad_norm": 0.1334177553653717, "learning_rate": 9.850588520826692e-06, "loss": 0.4623, "num_input_tokens_seen": 33038208, "step": 27170 }, { "epoch": 3.4049617842375643, "grad_norm": 0.0791802704334259, "learning_rate": 9.850455840035e-06, "loss": 0.4563, "num_input_tokens_seen": 33044000, "step": 27175 }, { "epoch": 3.4055882721463475, "grad_norm": 0.03704002872109413, "learning_rate": 9.85032310125203e-06, "loss": 0.4596, "num_input_tokens_seen": 33049920, "step": 27180 }, { "epoch": 3.406214760055131, "grad_norm": 0.14394427835941315, "learning_rate": 9.850190304479371e-06, "loss": 0.4598, "num_input_tokens_seen": 33055776, "step": 27185 }, { "epoch": 3.4068412479639143, "grad_norm": 0.13514858484268188, "learning_rate": 9.85005744971861e-06, "loss": 0.4644, "num_input_tokens_seen": 33062304, "step": 27190 }, { "epoch": 3.4074677358726975, "grad_norm": 0.10050142556428909, "learning_rate": 9.849924536971332e-06, "loss": 0.459, "num_input_tokens_seen": 33068512, "step": 27195 }, { "epoch": 3.408094223781481, "grad_norm": 0.10733287036418915, "learning_rate": 9.849791566239129e-06, "loss": 0.4606, "num_input_tokens_seen": 33074688, "step": 27200 }, { "epoch": 3.4087207116902642, "grad_norm": 0.09004431962966919, "learning_rate": 9.84965853752359e-06, "loss": 0.4633, "num_input_tokens_seen": 33080064, "step": 27205 }, { "epoch": 3.409347199599048, "grad_norm": 0.08928142488002777, "learning_rate": 9.849525450826308e-06, "loss": 0.4678, "num_input_tokens_seen": 33085856, "step": 27210 }, { "epoch": 3.409973687507831, "grad_norm": 0.08673632889986038, "learning_rate": 9.84939230614887e-06, "loss": 0.4659, "num_input_tokens_seen": 33091872, "step": 27215 }, { "epoch": 3.4106001754166146, "grad_norm": 0.07209470123052597, "learning_rate": 9.84925910349287e-06, "loss": 0.467, "num_input_tokens_seen": 33097888, "step": 27220 }, { "epoch": 3.411226663325398, "grad_norm": 0.1462557166814804, "learning_rate": 9.849125842859901e-06, "loss": 0.4705, "num_input_tokens_seen": 33104128, "step": 27225 }, { "epoch": 3.4118531512341814, "grad_norm": 0.10210768133401871, "learning_rate": 9.848992524251556e-06, "loss": 0.4605, "num_input_tokens_seen": 33110080, "step": 27230 }, { "epoch": 3.4124796391429646, "grad_norm": 0.1212398111820221, "learning_rate": 9.848859147669428e-06, "loss": 0.461, "num_input_tokens_seen": 33116672, "step": 27235 }, { "epoch": 3.4131061270517478, "grad_norm": 0.0791943296790123, "learning_rate": 9.84872571311511e-06, "loss": 0.46, "num_input_tokens_seen": 33121824, "step": 27240 }, { "epoch": 3.4137326149605314, "grad_norm": 0.10131248831748962, "learning_rate": 9.848592220590203e-06, "loss": 0.4626, "num_input_tokens_seen": 33128064, "step": 27245 }, { "epoch": 3.4143591028693145, "grad_norm": 0.037292417138814926, "learning_rate": 9.848458670096297e-06, "loss": 0.4551, "num_input_tokens_seen": 33134304, "step": 27250 }, { "epoch": 3.414985590778098, "grad_norm": 0.07380542904138565, "learning_rate": 9.848325061634993e-06, "loss": 0.4682, "num_input_tokens_seen": 33140512, "step": 27255 }, { "epoch": 3.4156120786868813, "grad_norm": 0.153591588139534, "learning_rate": 9.848191395207888e-06, "loss": 0.4676, "num_input_tokens_seen": 33146752, "step": 27260 }, { "epoch": 3.4162385665956645, "grad_norm": 0.0879773423075676, "learning_rate": 9.848057670816576e-06, "loss": 0.4721, "num_input_tokens_seen": 33153024, "step": 27265 }, { "epoch": 3.416865054504448, "grad_norm": 0.10424448549747467, "learning_rate": 9.847923888462657e-06, "loss": 0.4643, "num_input_tokens_seen": 33159168, "step": 27270 }, { "epoch": 3.4174915424132313, "grad_norm": 0.07730089128017426, "learning_rate": 9.847790048147736e-06, "loss": 0.4607, "num_input_tokens_seen": 33165248, "step": 27275 }, { "epoch": 3.418118030322015, "grad_norm": 0.07469963282346725, "learning_rate": 9.847656149873407e-06, "loss": 0.4607, "num_input_tokens_seen": 33171264, "step": 27280 }, { "epoch": 3.418744518230798, "grad_norm": 0.07232046127319336, "learning_rate": 9.847522193641273e-06, "loss": 0.4595, "num_input_tokens_seen": 33177504, "step": 27285 }, { "epoch": 3.4193710061395817, "grad_norm": 0.08264204859733582, "learning_rate": 9.847388179452938e-06, "loss": 0.4636, "num_input_tokens_seen": 33183616, "step": 27290 }, { "epoch": 3.419997494048365, "grad_norm": 0.10785123705863953, "learning_rate": 9.847254107309998e-06, "loss": 0.4663, "num_input_tokens_seen": 33189600, "step": 27295 }, { "epoch": 3.4206239819571485, "grad_norm": 0.12979543209075928, "learning_rate": 9.847119977214062e-06, "loss": 0.4624, "num_input_tokens_seen": 33195776, "step": 27300 }, { "epoch": 3.4212504698659316, "grad_norm": 0.0818934217095375, "learning_rate": 9.846985789166732e-06, "loss": 0.4656, "num_input_tokens_seen": 33201056, "step": 27305 }, { "epoch": 3.421876957774715, "grad_norm": 0.12150393426418304, "learning_rate": 9.846851543169611e-06, "loss": 0.4652, "num_input_tokens_seen": 33207168, "step": 27310 }, { "epoch": 3.4225034456834984, "grad_norm": 0.12007676810026169, "learning_rate": 9.846717239224303e-06, "loss": 0.4531, "num_input_tokens_seen": 33213152, "step": 27315 }, { "epoch": 3.4231299335922816, "grad_norm": 0.09871801733970642, "learning_rate": 9.846582877332418e-06, "loss": 0.457, "num_input_tokens_seen": 33219328, "step": 27320 }, { "epoch": 3.423756421501065, "grad_norm": 0.09085948020219803, "learning_rate": 9.84644845749556e-06, "loss": 0.4666, "num_input_tokens_seen": 33225440, "step": 27325 }, { "epoch": 3.4243829094098484, "grad_norm": 0.08563869446516037, "learning_rate": 9.846313979715335e-06, "loss": 0.4609, "num_input_tokens_seen": 33231776, "step": 27330 }, { "epoch": 3.4250093973186315, "grad_norm": 0.07728582620620728, "learning_rate": 9.846179443993353e-06, "loss": 0.4664, "num_input_tokens_seen": 33237824, "step": 27335 }, { "epoch": 3.425635885227415, "grad_norm": 0.09122562408447266, "learning_rate": 9.84604485033122e-06, "loss": 0.4633, "num_input_tokens_seen": 33244096, "step": 27340 }, { "epoch": 3.4262623731361983, "grad_norm": 0.09149318188428879, "learning_rate": 9.845910198730546e-06, "loss": 0.4569, "num_input_tokens_seen": 33250592, "step": 27345 }, { "epoch": 3.426888861044982, "grad_norm": 0.12047889083623886, "learning_rate": 9.845775489192943e-06, "loss": 0.4563, "num_input_tokens_seen": 33256352, "step": 27350 }, { "epoch": 3.427515348953765, "grad_norm": 0.1732727587223053, "learning_rate": 9.845640721720019e-06, "loss": 0.4636, "num_input_tokens_seen": 33262304, "step": 27355 }, { "epoch": 3.4281418368625487, "grad_norm": 0.08464144170284271, "learning_rate": 9.845505896313385e-06, "loss": 0.4671, "num_input_tokens_seen": 33268544, "step": 27360 }, { "epoch": 3.428768324771332, "grad_norm": 0.1184559091925621, "learning_rate": 9.845371012974655e-06, "loss": 0.4612, "num_input_tokens_seen": 33274240, "step": 27365 }, { "epoch": 3.4293948126801155, "grad_norm": 0.17176279425621033, "learning_rate": 9.845236071705442e-06, "loss": 0.4602, "num_input_tokens_seen": 33280352, "step": 27370 }, { "epoch": 3.4300213005888986, "grad_norm": 0.04172565042972565, "learning_rate": 9.845101072507358e-06, "loss": 0.4693, "num_input_tokens_seen": 33286848, "step": 27375 }, { "epoch": 3.430647788497682, "grad_norm": 0.0415702685713768, "learning_rate": 9.844966015382015e-06, "loss": 0.4638, "num_input_tokens_seen": 33292800, "step": 27380 }, { "epoch": 3.4312742764064654, "grad_norm": 0.0769534558057785, "learning_rate": 9.844830900331033e-06, "loss": 0.4624, "num_input_tokens_seen": 33298336, "step": 27385 }, { "epoch": 3.4319007643152486, "grad_norm": 0.10605315864086151, "learning_rate": 9.844695727356022e-06, "loss": 0.4642, "num_input_tokens_seen": 33304256, "step": 27390 }, { "epoch": 3.432527252224032, "grad_norm": 0.07939151674509048, "learning_rate": 9.844560496458602e-06, "loss": 0.4635, "num_input_tokens_seen": 33310336, "step": 27395 }, { "epoch": 3.4331537401328154, "grad_norm": 0.0899476706981659, "learning_rate": 9.844425207640386e-06, "loss": 0.463, "num_input_tokens_seen": 33316352, "step": 27400 }, { "epoch": 3.4337802280415985, "grad_norm": 0.16097332537174225, "learning_rate": 9.844289860902996e-06, "loss": 0.4562, "num_input_tokens_seen": 33321728, "step": 27405 }, { "epoch": 3.434406715950382, "grad_norm": 0.10194958001375198, "learning_rate": 9.844154456248046e-06, "loss": 0.4626, "num_input_tokens_seen": 33327968, "step": 27410 }, { "epoch": 3.4350332038591653, "grad_norm": 0.08179017156362534, "learning_rate": 9.844018993677158e-06, "loss": 0.4557, "num_input_tokens_seen": 33334144, "step": 27415 }, { "epoch": 3.435659691767949, "grad_norm": 0.09709879755973816, "learning_rate": 9.843883473191948e-06, "loss": 0.4586, "num_input_tokens_seen": 33340224, "step": 27420 }, { "epoch": 3.436286179676732, "grad_norm": 0.09419925510883331, "learning_rate": 9.84374789479404e-06, "loss": 0.4616, "num_input_tokens_seen": 33346304, "step": 27425 }, { "epoch": 3.4369126675855157, "grad_norm": 0.1337645798921585, "learning_rate": 9.843612258485056e-06, "loss": 0.4403, "num_input_tokens_seen": 33352384, "step": 27430 }, { "epoch": 3.437539155494299, "grad_norm": 0.07953981310129166, "learning_rate": 9.843476564266611e-06, "loss": 0.4596, "num_input_tokens_seen": 33358304, "step": 27435 }, { "epoch": 3.4381656434030825, "grad_norm": 0.050093621015548706, "learning_rate": 9.843340812140334e-06, "loss": 0.4646, "num_input_tokens_seen": 33364416, "step": 27440 }, { "epoch": 3.4387921313118657, "grad_norm": 0.04369590803980827, "learning_rate": 9.843205002107844e-06, "loss": 0.4586, "num_input_tokens_seen": 33370432, "step": 27445 }, { "epoch": 3.439418619220649, "grad_norm": 0.17335757613182068, "learning_rate": 9.843069134170769e-06, "loss": 0.4705, "num_input_tokens_seen": 33376672, "step": 27450 }, { "epoch": 3.4400451071294325, "grad_norm": 0.09325871616601944, "learning_rate": 9.842933208330729e-06, "loss": 0.4624, "num_input_tokens_seen": 33382880, "step": 27455 }, { "epoch": 3.4406715950382156, "grad_norm": 0.2025085985660553, "learning_rate": 9.842797224589351e-06, "loss": 0.4745, "num_input_tokens_seen": 33389056, "step": 27460 }, { "epoch": 3.4412980829469992, "grad_norm": 0.12495600432157516, "learning_rate": 9.842661182948259e-06, "loss": 0.473, "num_input_tokens_seen": 33395136, "step": 27465 }, { "epoch": 3.4419245708557824, "grad_norm": 0.09345187246799469, "learning_rate": 9.842525083409084e-06, "loss": 0.4699, "num_input_tokens_seen": 33401088, "step": 27470 }, { "epoch": 3.442551058764566, "grad_norm": 0.07934994995594025, "learning_rate": 9.842388925973448e-06, "loss": 0.4665, "num_input_tokens_seen": 33407104, "step": 27475 }, { "epoch": 3.443177546673349, "grad_norm": 0.08874289691448212, "learning_rate": 9.842252710642981e-06, "loss": 0.4601, "num_input_tokens_seen": 33413248, "step": 27480 }, { "epoch": 3.443804034582133, "grad_norm": 0.045615196228027344, "learning_rate": 9.842116437419314e-06, "loss": 0.4558, "num_input_tokens_seen": 33419008, "step": 27485 }, { "epoch": 3.444430522490916, "grad_norm": 0.1264505237340927, "learning_rate": 9.841980106304072e-06, "loss": 0.4662, "num_input_tokens_seen": 33425120, "step": 27490 }, { "epoch": 3.445057010399699, "grad_norm": 0.08187467604875565, "learning_rate": 9.841843717298886e-06, "loss": 0.462, "num_input_tokens_seen": 33430944, "step": 27495 }, { "epoch": 3.4456834983084828, "grad_norm": 0.0965864509344101, "learning_rate": 9.841707270405389e-06, "loss": 0.4632, "num_input_tokens_seen": 33437216, "step": 27500 }, { "epoch": 3.446309986217266, "grad_norm": 0.041464876383543015, "learning_rate": 9.841570765625212e-06, "loss": 0.4597, "num_input_tokens_seen": 33443232, "step": 27505 }, { "epoch": 3.4469364741260495, "grad_norm": 0.08391410112380981, "learning_rate": 9.841434202959985e-06, "loss": 0.4608, "num_input_tokens_seen": 33449728, "step": 27510 }, { "epoch": 3.4475629620348327, "grad_norm": 0.08102978765964508, "learning_rate": 9.84129758241134e-06, "loss": 0.4725, "num_input_tokens_seen": 33455328, "step": 27515 }, { "epoch": 3.448189449943616, "grad_norm": 0.14878784120082855, "learning_rate": 9.841160903980916e-06, "loss": 0.4682, "num_input_tokens_seen": 33461248, "step": 27520 }, { "epoch": 3.4488159378523995, "grad_norm": 0.14108169078826904, "learning_rate": 9.841024167670338e-06, "loss": 0.4649, "num_input_tokens_seen": 33467168, "step": 27525 }, { "epoch": 3.4494424257611827, "grad_norm": 0.13761748373508453, "learning_rate": 9.84088737348125e-06, "loss": 0.4609, "num_input_tokens_seen": 33473280, "step": 27530 }, { "epoch": 3.4500689136699663, "grad_norm": 0.11302470415830612, "learning_rate": 9.84075052141528e-06, "loss": 0.4637, "num_input_tokens_seen": 33479392, "step": 27535 }, { "epoch": 3.4506954015787494, "grad_norm": 0.10076330602169037, "learning_rate": 9.840613611474071e-06, "loss": 0.4615, "num_input_tokens_seen": 33485344, "step": 27540 }, { "epoch": 3.451321889487533, "grad_norm": 0.1019807979464531, "learning_rate": 9.840476643659255e-06, "loss": 0.4623, "num_input_tokens_seen": 33491328, "step": 27545 }, { "epoch": 3.451948377396316, "grad_norm": 0.09501589834690094, "learning_rate": 9.840339617972471e-06, "loss": 0.4625, "num_input_tokens_seen": 33497568, "step": 27550 }, { "epoch": 3.4525748653051, "grad_norm": 0.14625588059425354, "learning_rate": 9.840202534415358e-06, "loss": 0.462, "num_input_tokens_seen": 33503872, "step": 27555 }, { "epoch": 3.453201353213883, "grad_norm": 0.1480390876531601, "learning_rate": 9.840065392989555e-06, "loss": 0.4688, "num_input_tokens_seen": 33510016, "step": 27560 }, { "epoch": 3.453827841122666, "grad_norm": 0.11779657751321793, "learning_rate": 9.8399281936967e-06, "loss": 0.4651, "num_input_tokens_seen": 33516160, "step": 27565 }, { "epoch": 3.4544543290314498, "grad_norm": 0.12216639518737793, "learning_rate": 9.839790936538434e-06, "loss": 0.4585, "num_input_tokens_seen": 33522144, "step": 27570 }, { "epoch": 3.455080816940233, "grad_norm": 0.043633051216602325, "learning_rate": 9.839653621516398e-06, "loss": 0.4646, "num_input_tokens_seen": 33528576, "step": 27575 }, { "epoch": 3.4557073048490166, "grad_norm": 0.1075797900557518, "learning_rate": 9.839516248632234e-06, "loss": 0.4715, "num_input_tokens_seen": 33535136, "step": 27580 }, { "epoch": 3.4563337927577997, "grad_norm": 0.1117272824048996, "learning_rate": 9.839378817887585e-06, "loss": 0.4642, "num_input_tokens_seen": 33541440, "step": 27585 }, { "epoch": 3.456960280666583, "grad_norm": 0.10307101160287857, "learning_rate": 9.839241329284093e-06, "loss": 0.4632, "num_input_tokens_seen": 33547904, "step": 27590 }, { "epoch": 3.4575867685753665, "grad_norm": 0.08573098480701447, "learning_rate": 9.839103782823403e-06, "loss": 0.4618, "num_input_tokens_seen": 33553920, "step": 27595 }, { "epoch": 3.4582132564841497, "grad_norm": 0.09741005301475525, "learning_rate": 9.838966178507158e-06, "loss": 0.4652, "num_input_tokens_seen": 33559584, "step": 27600 }, { "epoch": 3.4588397443929333, "grad_norm": 0.22602295875549316, "learning_rate": 9.838828516337005e-06, "loss": 0.4647, "num_input_tokens_seen": 33565664, "step": 27605 }, { "epoch": 3.4594662323017165, "grad_norm": 0.0914953425526619, "learning_rate": 9.838690796314587e-06, "loss": 0.4592, "num_input_tokens_seen": 33571680, "step": 27610 }, { "epoch": 3.4600927202105, "grad_norm": 0.17091204226016998, "learning_rate": 9.838553018441555e-06, "loss": 0.4616, "num_input_tokens_seen": 33577696, "step": 27615 }, { "epoch": 3.4607192081192832, "grad_norm": 0.03453827649354935, "learning_rate": 9.838415182719552e-06, "loss": 0.4649, "num_input_tokens_seen": 33583744, "step": 27620 }, { "epoch": 3.461345696028067, "grad_norm": 0.13289326429367065, "learning_rate": 9.838277289150227e-06, "loss": 0.4606, "num_input_tokens_seen": 33590176, "step": 27625 }, { "epoch": 3.46197218393685, "grad_norm": 0.14086633920669556, "learning_rate": 9.83813933773523e-06, "loss": 0.463, "num_input_tokens_seen": 33596128, "step": 27630 }, { "epoch": 3.462598671845633, "grad_norm": 0.0702214315533638, "learning_rate": 9.83800132847621e-06, "loss": 0.4669, "num_input_tokens_seen": 33602144, "step": 27635 }, { "epoch": 3.463225159754417, "grad_norm": 0.11328206211328506, "learning_rate": 9.837863261374816e-06, "loss": 0.4698, "num_input_tokens_seen": 33607552, "step": 27640 }, { "epoch": 3.4638516476632, "grad_norm": 0.13953059911727905, "learning_rate": 9.837725136432699e-06, "loss": 0.4621, "num_input_tokens_seen": 33613728, "step": 27645 }, { "epoch": 3.4644781355719836, "grad_norm": 0.08922328054904938, "learning_rate": 9.83758695365151e-06, "loss": 0.4611, "num_input_tokens_seen": 33619712, "step": 27650 }, { "epoch": 3.4651046234807668, "grad_norm": 0.03523951396346092, "learning_rate": 9.837448713032904e-06, "loss": 0.4615, "num_input_tokens_seen": 33625728, "step": 27655 }, { "epoch": 3.4657311113895504, "grad_norm": 0.0757734403014183, "learning_rate": 9.83731041457853e-06, "loss": 0.4686, "num_input_tokens_seen": 33630944, "step": 27660 }, { "epoch": 3.4663575992983335, "grad_norm": 0.09741808474063873, "learning_rate": 9.837172058290042e-06, "loss": 0.4602, "num_input_tokens_seen": 33637344, "step": 27665 }, { "epoch": 3.466984087207117, "grad_norm": 0.10149127244949341, "learning_rate": 9.837033644169096e-06, "loss": 0.4688, "num_input_tokens_seen": 33643584, "step": 27670 }, { "epoch": 3.4676105751159003, "grad_norm": 0.07862041890621185, "learning_rate": 9.836895172217347e-06, "loss": 0.4673, "num_input_tokens_seen": 33649376, "step": 27675 }, { "epoch": 3.4682370630246835, "grad_norm": 0.03321608901023865, "learning_rate": 9.836756642436449e-06, "loss": 0.4637, "num_input_tokens_seen": 33655808, "step": 27680 }, { "epoch": 3.468863550933467, "grad_norm": 0.07488274574279785, "learning_rate": 9.83661805482806e-06, "loss": 0.4669, "num_input_tokens_seen": 33661888, "step": 27685 }, { "epoch": 3.4694900388422503, "grad_norm": 0.0850134864449501, "learning_rate": 9.836479409393833e-06, "loss": 0.4687, "num_input_tokens_seen": 33667808, "step": 27690 }, { "epoch": 3.470116526751034, "grad_norm": 0.07787703722715378, "learning_rate": 9.83634070613543e-06, "loss": 0.4611, "num_input_tokens_seen": 33673888, "step": 27695 }, { "epoch": 3.470743014659817, "grad_norm": 0.06788337230682373, "learning_rate": 9.836201945054509e-06, "loss": 0.4649, "num_input_tokens_seen": 33679424, "step": 27700 }, { "epoch": 3.4713695025686, "grad_norm": 0.08742355555295944, "learning_rate": 9.836063126152726e-06, "loss": 0.4623, "num_input_tokens_seen": 33685344, "step": 27705 }, { "epoch": 3.471995990477384, "grad_norm": 0.11148196458816528, "learning_rate": 9.835924249431742e-06, "loss": 0.4634, "num_input_tokens_seen": 33691424, "step": 27710 }, { "epoch": 3.472622478386167, "grad_norm": 0.11466572433710098, "learning_rate": 9.835785314893218e-06, "loss": 0.4616, "num_input_tokens_seen": 33697632, "step": 27715 }, { "epoch": 3.4732489662949506, "grad_norm": 0.07743988931179047, "learning_rate": 9.835646322538817e-06, "loss": 0.4638, "num_input_tokens_seen": 33703904, "step": 27720 }, { "epoch": 3.473875454203734, "grad_norm": 0.12837257981300354, "learning_rate": 9.835507272370197e-06, "loss": 0.4606, "num_input_tokens_seen": 33710144, "step": 27725 }, { "epoch": 3.4745019421125174, "grad_norm": 0.07833176106214523, "learning_rate": 9.83536816438902e-06, "loss": 0.4569, "num_input_tokens_seen": 33716320, "step": 27730 }, { "epoch": 3.4751284300213006, "grad_norm": 0.03411947935819626, "learning_rate": 9.835228998596956e-06, "loss": 0.4591, "num_input_tokens_seen": 33722912, "step": 27735 }, { "epoch": 3.475754917930084, "grad_norm": 0.0723273903131485, "learning_rate": 9.83508977499566e-06, "loss": 0.4664, "num_input_tokens_seen": 33729504, "step": 27740 }, { "epoch": 3.4763814058388673, "grad_norm": 0.03157813102006912, "learning_rate": 9.834950493586801e-06, "loss": 0.4612, "num_input_tokens_seen": 33735584, "step": 27745 }, { "epoch": 3.4770078937476505, "grad_norm": 0.10002373903989792, "learning_rate": 9.834811154372046e-06, "loss": 0.4644, "num_input_tokens_seen": 33741760, "step": 27750 }, { "epoch": 3.477634381656434, "grad_norm": 0.07394328713417053, "learning_rate": 9.834671757353058e-06, "loss": 0.4608, "num_input_tokens_seen": 33747968, "step": 27755 }, { "epoch": 3.4782608695652173, "grad_norm": 0.06709521263837814, "learning_rate": 9.834532302531503e-06, "loss": 0.4612, "num_input_tokens_seen": 33754272, "step": 27760 }, { "epoch": 3.478887357474001, "grad_norm": 0.07204436510801315, "learning_rate": 9.83439278990905e-06, "loss": 0.4555, "num_input_tokens_seen": 33760320, "step": 27765 }, { "epoch": 3.479513845382784, "grad_norm": 0.03493453934788704, "learning_rate": 9.834253219487368e-06, "loss": 0.4642, "num_input_tokens_seen": 33766368, "step": 27770 }, { "epoch": 3.4801403332915672, "grad_norm": 0.10333003848791122, "learning_rate": 9.834113591268123e-06, "loss": 0.4661, "num_input_tokens_seen": 33772768, "step": 27775 }, { "epoch": 3.480766821200351, "grad_norm": 0.0869799554347992, "learning_rate": 9.833973905252987e-06, "loss": 0.4646, "num_input_tokens_seen": 33778720, "step": 27780 }, { "epoch": 3.481393309109134, "grad_norm": 0.10610578209161758, "learning_rate": 9.833834161443628e-06, "loss": 0.4603, "num_input_tokens_seen": 33784256, "step": 27785 }, { "epoch": 3.4820197970179176, "grad_norm": 0.08524426817893982, "learning_rate": 9.833694359841718e-06, "loss": 0.4599, "num_input_tokens_seen": 33789472, "step": 27790 }, { "epoch": 3.482646284926701, "grad_norm": 0.14493109285831451, "learning_rate": 9.833554500448928e-06, "loss": 0.4671, "num_input_tokens_seen": 33795584, "step": 27795 }, { "epoch": 3.4832727728354844, "grad_norm": 0.07843966782093048, "learning_rate": 9.83341458326693e-06, "loss": 0.4663, "num_input_tokens_seen": 33801984, "step": 27800 }, { "epoch": 3.4838992607442676, "grad_norm": 0.09344606101512909, "learning_rate": 9.833274608297396e-06, "loss": 0.4596, "num_input_tokens_seen": 33807808, "step": 27805 }, { "epoch": 3.484525748653051, "grad_norm": 0.03586236387491226, "learning_rate": 9.833134575542002e-06, "loss": 0.465, "num_input_tokens_seen": 33814144, "step": 27810 }, { "epoch": 3.4851522365618344, "grad_norm": 0.08235670626163483, "learning_rate": 9.83299448500242e-06, "loss": 0.4614, "num_input_tokens_seen": 33820064, "step": 27815 }, { "epoch": 3.4857787244706175, "grad_norm": 0.13961344957351685, "learning_rate": 9.832854336680324e-06, "loss": 0.4572, "num_input_tokens_seen": 33826080, "step": 27820 }, { "epoch": 3.486405212379401, "grad_norm": 0.06537219882011414, "learning_rate": 9.832714130577393e-06, "loss": 0.4638, "num_input_tokens_seen": 33832288, "step": 27825 }, { "epoch": 3.4870317002881843, "grad_norm": 0.11655053496360779, "learning_rate": 9.8325738666953e-06, "loss": 0.4635, "num_input_tokens_seen": 33838368, "step": 27830 }, { "epoch": 3.487658188196968, "grad_norm": 0.0912596732378006, "learning_rate": 9.832433545035726e-06, "loss": 0.4571, "num_input_tokens_seen": 33844128, "step": 27835 }, { "epoch": 3.488284676105751, "grad_norm": 0.11438734829425812, "learning_rate": 9.832293165600343e-06, "loss": 0.4604, "num_input_tokens_seen": 33850272, "step": 27840 }, { "epoch": 3.4889111640145343, "grad_norm": 0.11687443405389786, "learning_rate": 9.832152728390836e-06, "loss": 0.4615, "num_input_tokens_seen": 33856448, "step": 27845 }, { "epoch": 3.489537651923318, "grad_norm": 0.08298094570636749, "learning_rate": 9.832012233408878e-06, "loss": 0.4551, "num_input_tokens_seen": 33862336, "step": 27850 }, { "epoch": 3.490164139832101, "grad_norm": 0.11026813089847565, "learning_rate": 9.831871680656151e-06, "loss": 0.4652, "num_input_tokens_seen": 33868416, "step": 27855 }, { "epoch": 3.4907906277408847, "grad_norm": 0.08561263233423233, "learning_rate": 9.831731070134337e-06, "loss": 0.4605, "num_input_tokens_seen": 33874624, "step": 27860 }, { "epoch": 3.491417115649668, "grad_norm": 0.18750032782554626, "learning_rate": 9.831590401845116e-06, "loss": 0.4657, "num_input_tokens_seen": 33880672, "step": 27865 }, { "epoch": 3.4920436035584514, "grad_norm": 0.080157570540905, "learning_rate": 9.831449675790168e-06, "loss": 0.469, "num_input_tokens_seen": 33886912, "step": 27870 }, { "epoch": 3.4926700914672346, "grad_norm": 0.15335430204868317, "learning_rate": 9.831308891971178e-06, "loss": 0.4821, "num_input_tokens_seen": 33893056, "step": 27875 }, { "epoch": 3.4932965793760182, "grad_norm": 0.13906456530094147, "learning_rate": 9.831168050389828e-06, "loss": 0.4615, "num_input_tokens_seen": 33899296, "step": 27880 }, { "epoch": 3.4939230672848014, "grad_norm": 0.08501012623310089, "learning_rate": 9.831027151047801e-06, "loss": 0.4634, "num_input_tokens_seen": 33905248, "step": 27885 }, { "epoch": 3.4945495551935846, "grad_norm": 0.11142690479755402, "learning_rate": 9.830886193946785e-06, "loss": 0.4644, "num_input_tokens_seen": 33911104, "step": 27890 }, { "epoch": 3.495176043102368, "grad_norm": 0.14956101775169373, "learning_rate": 9.830745179088461e-06, "loss": 0.4623, "num_input_tokens_seen": 33917280, "step": 27895 }, { "epoch": 3.4958025310111513, "grad_norm": 0.09713254868984222, "learning_rate": 9.830604106474518e-06, "loss": 0.4633, "num_input_tokens_seen": 33923520, "step": 27900 }, { "epoch": 3.496429018919935, "grad_norm": 0.09289035201072693, "learning_rate": 9.830462976106641e-06, "loss": 0.4619, "num_input_tokens_seen": 33929664, "step": 27905 }, { "epoch": 3.497055506828718, "grad_norm": 0.033609967678785324, "learning_rate": 9.830321787986518e-06, "loss": 0.4603, "num_input_tokens_seen": 33935808, "step": 27910 }, { "epoch": 3.4976819947375017, "grad_norm": 0.11520450562238693, "learning_rate": 9.830180542115836e-06, "loss": 0.4593, "num_input_tokens_seen": 33941312, "step": 27915 }, { "epoch": 3.498308482646285, "grad_norm": 0.07744887471199036, "learning_rate": 9.830039238496288e-06, "loss": 0.4663, "num_input_tokens_seen": 33947296, "step": 27920 }, { "epoch": 3.4989349705550685, "grad_norm": 0.042608071118593216, "learning_rate": 9.829897877129557e-06, "loss": 0.4601, "num_input_tokens_seen": 33953632, "step": 27925 }, { "epoch": 3.4995614584638517, "grad_norm": 0.0796567052602768, "learning_rate": 9.829756458017337e-06, "loss": 0.4566, "num_input_tokens_seen": 33959456, "step": 27930 }, { "epoch": 3.500187946372635, "grad_norm": 0.0808025598526001, "learning_rate": 9.829614981161317e-06, "loss": 0.4581, "num_input_tokens_seen": 33964864, "step": 27935 }, { "epoch": 3.5008144342814185, "grad_norm": 0.14269575476646423, "learning_rate": 9.82947344656319e-06, "loss": 0.4618, "num_input_tokens_seen": 33970880, "step": 27940 }, { "epoch": 3.5014409221902016, "grad_norm": 0.11355449259281158, "learning_rate": 9.829331854224649e-06, "loss": 0.4713, "num_input_tokens_seen": 33976992, "step": 27945 }, { "epoch": 3.5020674100989853, "grad_norm": 0.07514674961566925, "learning_rate": 9.829190204147383e-06, "loss": 0.4675, "num_input_tokens_seen": 33982592, "step": 27950 }, { "epoch": 3.5026938980077684, "grad_norm": 0.1203206405043602, "learning_rate": 9.829048496333089e-06, "loss": 0.4606, "num_input_tokens_seen": 33988864, "step": 27955 }, { "epoch": 3.5033203859165516, "grad_norm": 0.08981424570083618, "learning_rate": 9.82890673078346e-06, "loss": 0.4616, "num_input_tokens_seen": 33994688, "step": 27960 }, { "epoch": 3.503946873825335, "grad_norm": 0.1044333353638649, "learning_rate": 9.828764907500193e-06, "loss": 0.4663, "num_input_tokens_seen": 34000864, "step": 27965 }, { "epoch": 3.5045733617341184, "grad_norm": 0.08171747624874115, "learning_rate": 9.82862302648498e-06, "loss": 0.4627, "num_input_tokens_seen": 34006720, "step": 27970 }, { "epoch": 3.505199849642902, "grad_norm": 0.10306553542613983, "learning_rate": 9.828481087739519e-06, "loss": 0.4699, "num_input_tokens_seen": 34012960, "step": 27975 }, { "epoch": 3.505826337551685, "grad_norm": 0.07373268902301788, "learning_rate": 9.828339091265507e-06, "loss": 0.46, "num_input_tokens_seen": 34019136, "step": 27980 }, { "epoch": 3.5064528254604688, "grad_norm": 0.07273641973733902, "learning_rate": 9.828197037064643e-06, "loss": 0.4633, "num_input_tokens_seen": 34024608, "step": 27985 }, { "epoch": 3.507079313369252, "grad_norm": 0.11650238186120987, "learning_rate": 9.828054925138622e-06, "loss": 0.462, "num_input_tokens_seen": 34030080, "step": 27990 }, { "epoch": 3.5077058012780356, "grad_norm": 0.10108926147222519, "learning_rate": 9.827912755489146e-06, "loss": 0.4647, "num_input_tokens_seen": 34036128, "step": 27995 }, { "epoch": 3.5083322891868187, "grad_norm": 0.09380259364843369, "learning_rate": 9.827770528117913e-06, "loss": 0.4667, "num_input_tokens_seen": 34042336, "step": 28000 }, { "epoch": 3.508958777095602, "grad_norm": 0.08594498038291931, "learning_rate": 9.827628243026626e-06, "loss": 0.4674, "num_input_tokens_seen": 34048256, "step": 28005 }, { "epoch": 3.5095852650043855, "grad_norm": 0.07773078978061676, "learning_rate": 9.827485900216985e-06, "loss": 0.4634, "num_input_tokens_seen": 34054496, "step": 28010 }, { "epoch": 3.5102117529131687, "grad_norm": 0.13029265403747559, "learning_rate": 9.82734349969069e-06, "loss": 0.4625, "num_input_tokens_seen": 34060832, "step": 28015 }, { "epoch": 3.5108382408219523, "grad_norm": 0.07606883347034454, "learning_rate": 9.827201041449445e-06, "loss": 0.4557, "num_input_tokens_seen": 34066880, "step": 28020 }, { "epoch": 3.5114647287307355, "grad_norm": 0.16154751181602478, "learning_rate": 9.827058525494952e-06, "loss": 0.4596, "num_input_tokens_seen": 34072352, "step": 28025 }, { "epoch": 3.5120912166395186, "grad_norm": 0.08531562983989716, "learning_rate": 9.826915951828917e-06, "loss": 0.4598, "num_input_tokens_seen": 34079040, "step": 28030 }, { "epoch": 3.5127177045483022, "grad_norm": 0.029462773352861404, "learning_rate": 9.826773320453045e-06, "loss": 0.4592, "num_input_tokens_seen": 34085152, "step": 28035 }, { "epoch": 3.513344192457086, "grad_norm": 0.07719258219003677, "learning_rate": 9.826630631369039e-06, "loss": 0.4556, "num_input_tokens_seen": 34091584, "step": 28040 }, { "epoch": 3.513970680365869, "grad_norm": 0.11583573371171951, "learning_rate": 9.826487884578603e-06, "loss": 0.4613, "num_input_tokens_seen": 34097664, "step": 28045 }, { "epoch": 3.514597168274652, "grad_norm": 0.11623804271221161, "learning_rate": 9.82634508008345e-06, "loss": 0.4629, "num_input_tokens_seen": 34103776, "step": 28050 }, { "epoch": 3.515223656183436, "grad_norm": 0.033669613301754, "learning_rate": 9.826202217885281e-06, "loss": 0.4581, "num_input_tokens_seen": 34109952, "step": 28055 }, { "epoch": 3.515850144092219, "grad_norm": 0.0336809940636158, "learning_rate": 9.82605929798581e-06, "loss": 0.4675, "num_input_tokens_seen": 34115872, "step": 28060 }, { "epoch": 3.5164766320010026, "grad_norm": 0.07366762310266495, "learning_rate": 9.825916320386742e-06, "loss": 0.4586, "num_input_tokens_seen": 34121856, "step": 28065 }, { "epoch": 3.5171031199097857, "grad_norm": 0.08650390058755875, "learning_rate": 9.825773285089785e-06, "loss": 0.4648, "num_input_tokens_seen": 34127840, "step": 28070 }, { "epoch": 3.517729607818569, "grad_norm": 0.0859927386045456, "learning_rate": 9.82563019209665e-06, "loss": 0.4627, "num_input_tokens_seen": 34133920, "step": 28075 }, { "epoch": 3.5183560957273525, "grad_norm": 0.14263615012168884, "learning_rate": 9.825487041409053e-06, "loss": 0.4601, "num_input_tokens_seen": 34139424, "step": 28080 }, { "epoch": 3.5189825836361357, "grad_norm": 0.1506296694278717, "learning_rate": 9.8253438330287e-06, "loss": 0.4717, "num_input_tokens_seen": 34145760, "step": 28085 }, { "epoch": 3.5196090715449193, "grad_norm": 0.07732547074556351, "learning_rate": 9.825200566957302e-06, "loss": 0.4546, "num_input_tokens_seen": 34151744, "step": 28090 }, { "epoch": 3.5202355594537025, "grad_norm": 0.08446294814348221, "learning_rate": 9.825057243196577e-06, "loss": 0.4608, "num_input_tokens_seen": 34157952, "step": 28095 }, { "epoch": 3.5208620473624856, "grad_norm": 0.08899151533842087, "learning_rate": 9.824913861748236e-06, "loss": 0.4607, "num_input_tokens_seen": 34163936, "step": 28100 }, { "epoch": 3.5214885352712693, "grad_norm": 0.07196106016635895, "learning_rate": 9.824770422613991e-06, "loss": 0.4599, "num_input_tokens_seen": 34170208, "step": 28105 }, { "epoch": 3.522115023180053, "grad_norm": 0.030481578782200813, "learning_rate": 9.824626925795561e-06, "loss": 0.4667, "num_input_tokens_seen": 34176512, "step": 28110 }, { "epoch": 3.522741511088836, "grad_norm": 0.1204291582107544, "learning_rate": 9.82448337129466e-06, "loss": 0.4609, "num_input_tokens_seen": 34182912, "step": 28115 }, { "epoch": 3.523367998997619, "grad_norm": 0.035997409373521805, "learning_rate": 9.824339759113004e-06, "loss": 0.473, "num_input_tokens_seen": 34189184, "step": 28120 }, { "epoch": 3.523994486906403, "grad_norm": 0.12126441299915314, "learning_rate": 9.82419608925231e-06, "loss": 0.4568, "num_input_tokens_seen": 34195232, "step": 28125 }, { "epoch": 3.524620974815186, "grad_norm": 0.038656171411275864, "learning_rate": 9.824052361714295e-06, "loss": 0.4648, "num_input_tokens_seen": 34201504, "step": 28130 }, { "epoch": 3.5252474627239696, "grad_norm": 0.13220474123954773, "learning_rate": 9.823908576500678e-06, "loss": 0.4617, "num_input_tokens_seen": 34207520, "step": 28135 }, { "epoch": 3.5258739506327528, "grad_norm": 0.08339368551969528, "learning_rate": 9.82376473361318e-06, "loss": 0.4652, "num_input_tokens_seen": 34213504, "step": 28140 }, { "epoch": 3.526500438541536, "grad_norm": 0.1387133002281189, "learning_rate": 9.823620833053517e-06, "loss": 0.4609, "num_input_tokens_seen": 34219424, "step": 28145 }, { "epoch": 3.5271269264503196, "grad_norm": 0.08464451879262924, "learning_rate": 9.823476874823412e-06, "loss": 0.4639, "num_input_tokens_seen": 34225344, "step": 28150 }, { "epoch": 3.5277534143591027, "grad_norm": 0.07014524936676025, "learning_rate": 9.823332858924586e-06, "loss": 0.4643, "num_input_tokens_seen": 34231168, "step": 28155 }, { "epoch": 3.5283799022678863, "grad_norm": 0.13463270664215088, "learning_rate": 9.823188785358759e-06, "loss": 0.4665, "num_input_tokens_seen": 34237120, "step": 28160 }, { "epoch": 3.5290063901766695, "grad_norm": 0.09675223380327225, "learning_rate": 9.823044654127657e-06, "loss": 0.4628, "num_input_tokens_seen": 34243360, "step": 28165 }, { "epoch": 3.5296328780854527, "grad_norm": 0.11760341376066208, "learning_rate": 9.822900465233001e-06, "loss": 0.4646, "num_input_tokens_seen": 34249152, "step": 28170 }, { "epoch": 3.5302593659942363, "grad_norm": 0.11873061954975128, "learning_rate": 9.822756218676515e-06, "loss": 0.4614, "num_input_tokens_seen": 34255520, "step": 28175 }, { "epoch": 3.53088585390302, "grad_norm": 0.08296361565589905, "learning_rate": 9.822611914459922e-06, "loss": 0.4629, "num_input_tokens_seen": 34261920, "step": 28180 }, { "epoch": 3.531512341811803, "grad_norm": 0.11280506104230881, "learning_rate": 9.822467552584951e-06, "loss": 0.4626, "num_input_tokens_seen": 34267936, "step": 28185 }, { "epoch": 3.5321388297205862, "grad_norm": 0.07245547324419022, "learning_rate": 9.822323133053325e-06, "loss": 0.4668, "num_input_tokens_seen": 34274208, "step": 28190 }, { "epoch": 3.53276531762937, "grad_norm": 0.09251391887664795, "learning_rate": 9.822178655866771e-06, "loss": 0.4623, "num_input_tokens_seen": 34280416, "step": 28195 }, { "epoch": 3.533391805538153, "grad_norm": 0.09895186126232147, "learning_rate": 9.822034121027018e-06, "loss": 0.4696, "num_input_tokens_seen": 34286624, "step": 28200 }, { "epoch": 3.5340182934469366, "grad_norm": 0.09536222368478775, "learning_rate": 9.821889528535792e-06, "loss": 0.4633, "num_input_tokens_seen": 34292992, "step": 28205 }, { "epoch": 3.53464478135572, "grad_norm": 0.03596639260649681, "learning_rate": 9.821744878394823e-06, "loss": 0.4647, "num_input_tokens_seen": 34298720, "step": 28210 }, { "epoch": 3.535271269264503, "grad_norm": 0.08459299057722092, "learning_rate": 9.82160017060584e-06, "loss": 0.4605, "num_input_tokens_seen": 34304832, "step": 28215 }, { "epoch": 3.5358977571732866, "grad_norm": 0.11369876563549042, "learning_rate": 9.821455405170573e-06, "loss": 0.4602, "num_input_tokens_seen": 34311136, "step": 28220 }, { "epoch": 3.5365242450820698, "grad_norm": 0.14407256245613098, "learning_rate": 9.821310582090754e-06, "loss": 0.4587, "num_input_tokens_seen": 34317504, "step": 28225 }, { "epoch": 3.5371507329908534, "grad_norm": 0.03352075815200806, "learning_rate": 9.821165701368113e-06, "loss": 0.4656, "num_input_tokens_seen": 34323520, "step": 28230 }, { "epoch": 3.5377772208996365, "grad_norm": 0.11838693916797638, "learning_rate": 9.821020763004381e-06, "loss": 0.4653, "num_input_tokens_seen": 34329696, "step": 28235 }, { "epoch": 3.53840370880842, "grad_norm": 0.1361381858587265, "learning_rate": 9.820875767001294e-06, "loss": 0.4614, "num_input_tokens_seen": 34335712, "step": 28240 }, { "epoch": 3.5390301967172033, "grad_norm": 0.1436336487531662, "learning_rate": 9.820730713360585e-06, "loss": 0.4614, "num_input_tokens_seen": 34341728, "step": 28245 }, { "epoch": 3.539656684625987, "grad_norm": 0.1319398283958435, "learning_rate": 9.820585602083985e-06, "loss": 0.4636, "num_input_tokens_seen": 34347712, "step": 28250 }, { "epoch": 3.54028317253477, "grad_norm": 0.0771871879696846, "learning_rate": 9.820440433173233e-06, "loss": 0.4583, "num_input_tokens_seen": 34353984, "step": 28255 }, { "epoch": 3.5409096604435533, "grad_norm": 0.07921120524406433, "learning_rate": 9.820295206630063e-06, "loss": 0.4652, "num_input_tokens_seen": 34360160, "step": 28260 }, { "epoch": 3.541536148352337, "grad_norm": 0.07147211581468582, "learning_rate": 9.82014992245621e-06, "loss": 0.469, "num_input_tokens_seen": 34366048, "step": 28265 }, { "epoch": 3.54216263626112, "grad_norm": 0.0853739008307457, "learning_rate": 9.820004580653413e-06, "loss": 0.4654, "num_input_tokens_seen": 34372448, "step": 28270 }, { "epoch": 3.5427891241699037, "grad_norm": 0.0349302776157856, "learning_rate": 9.819859181223407e-06, "loss": 0.4621, "num_input_tokens_seen": 34378624, "step": 28275 }, { "epoch": 3.543415612078687, "grad_norm": 0.06805257499217987, "learning_rate": 9.819713724167934e-06, "loss": 0.4627, "num_input_tokens_seen": 34384896, "step": 28280 }, { "epoch": 3.54404209998747, "grad_norm": 0.08225645124912262, "learning_rate": 9.81956820948873e-06, "loss": 0.4654, "num_input_tokens_seen": 34391296, "step": 28285 }, { "epoch": 3.5446685878962536, "grad_norm": 0.07299911975860596, "learning_rate": 9.819422637187538e-06, "loss": 0.4617, "num_input_tokens_seen": 34397408, "step": 28290 }, { "epoch": 3.545295075805037, "grad_norm": 0.08356424421072006, "learning_rate": 9.819277007266094e-06, "loss": 0.4653, "num_input_tokens_seen": 34403584, "step": 28295 }, { "epoch": 3.5459215637138204, "grad_norm": 0.0845429003238678, "learning_rate": 9.819131319726142e-06, "loss": 0.4579, "num_input_tokens_seen": 34408416, "step": 28300 }, { "epoch": 3.5465480516226036, "grad_norm": 0.0381290502846241, "learning_rate": 9.818985574569425e-06, "loss": 0.4597, "num_input_tokens_seen": 34414496, "step": 28305 }, { "epoch": 3.547174539531387, "grad_norm": 0.06827589869499207, "learning_rate": 9.818839771797683e-06, "loss": 0.4587, "num_input_tokens_seen": 34420256, "step": 28310 }, { "epoch": 3.5478010274401703, "grad_norm": 0.07817481458187103, "learning_rate": 9.81869391141266e-06, "loss": 0.4602, "num_input_tokens_seen": 34426368, "step": 28315 }, { "epoch": 3.548427515348954, "grad_norm": 0.06922587007284164, "learning_rate": 9.8185479934161e-06, "loss": 0.4591, "num_input_tokens_seen": 34432576, "step": 28320 }, { "epoch": 3.549054003257737, "grad_norm": 0.11697188764810562, "learning_rate": 9.818402017809746e-06, "loss": 0.4748, "num_input_tokens_seen": 34438208, "step": 28325 }, { "epoch": 3.5496804911665203, "grad_norm": 0.034406889230012894, "learning_rate": 9.818255984595347e-06, "loss": 0.4688, "num_input_tokens_seen": 34443808, "step": 28330 }, { "epoch": 3.550306979075304, "grad_norm": 0.09291763603687286, "learning_rate": 9.818109893774646e-06, "loss": 0.458, "num_input_tokens_seen": 34450272, "step": 28335 }, { "epoch": 3.550933466984087, "grad_norm": 0.1534697264432907, "learning_rate": 9.81796374534939e-06, "loss": 0.4535, "num_input_tokens_seen": 34456416, "step": 28340 }, { "epoch": 3.5515599548928707, "grad_norm": 0.08893147855997086, "learning_rate": 9.817817539321328e-06, "loss": 0.4624, "num_input_tokens_seen": 34462464, "step": 28345 }, { "epoch": 3.552186442801654, "grad_norm": 0.10473435372114182, "learning_rate": 9.817671275692206e-06, "loss": 0.4644, "num_input_tokens_seen": 34468448, "step": 28350 }, { "epoch": 3.552812930710437, "grad_norm": 0.0720377042889595, "learning_rate": 9.817524954463774e-06, "loss": 0.4643, "num_input_tokens_seen": 34474656, "step": 28355 }, { "epoch": 3.5534394186192206, "grad_norm": 0.09421593695878983, "learning_rate": 9.817378575637779e-06, "loss": 0.4601, "num_input_tokens_seen": 34480672, "step": 28360 }, { "epoch": 3.5540659065280042, "grad_norm": 0.079062819480896, "learning_rate": 9.817232139215973e-06, "loss": 0.4555, "num_input_tokens_seen": 34486336, "step": 28365 }, { "epoch": 3.5546923944367874, "grad_norm": 0.07244662940502167, "learning_rate": 9.817085645200109e-06, "loss": 0.469, "num_input_tokens_seen": 34492672, "step": 28370 }, { "epoch": 3.5553188823455706, "grad_norm": 0.08721397072076797, "learning_rate": 9.816939093591932e-06, "loss": 0.4635, "num_input_tokens_seen": 34498944, "step": 28375 }, { "epoch": 3.555945370254354, "grad_norm": 0.10971684008836746, "learning_rate": 9.816792484393203e-06, "loss": 0.4594, "num_input_tokens_seen": 34505344, "step": 28380 }, { "epoch": 3.5565718581631374, "grad_norm": 0.03716529533267021, "learning_rate": 9.816645817605667e-06, "loss": 0.4678, "num_input_tokens_seen": 34511520, "step": 28385 }, { "epoch": 3.557198346071921, "grad_norm": 0.08920835703611374, "learning_rate": 9.816499093231082e-06, "loss": 0.4678, "num_input_tokens_seen": 34517920, "step": 28390 }, { "epoch": 3.557824833980704, "grad_norm": 0.03404819965362549, "learning_rate": 9.816352311271201e-06, "loss": 0.4631, "num_input_tokens_seen": 34524320, "step": 28395 }, { "epoch": 3.5584513218894873, "grad_norm": 0.08835349231958389, "learning_rate": 9.816205471727778e-06, "loss": 0.4594, "num_input_tokens_seen": 34530208, "step": 28400 }, { "epoch": 3.559077809798271, "grad_norm": 0.10410821437835693, "learning_rate": 9.81605857460257e-06, "loss": 0.4704, "num_input_tokens_seen": 34536448, "step": 28405 }, { "epoch": 3.559704297707054, "grad_norm": 0.08216921240091324, "learning_rate": 9.81591161989733e-06, "loss": 0.4673, "num_input_tokens_seen": 34542400, "step": 28410 }, { "epoch": 3.5603307856158377, "grad_norm": 0.09192031621932983, "learning_rate": 9.815764607613823e-06, "loss": 0.462, "num_input_tokens_seen": 34548704, "step": 28415 }, { "epoch": 3.560957273524621, "grad_norm": 0.12022989988327026, "learning_rate": 9.815617537753796e-06, "loss": 0.463, "num_input_tokens_seen": 34554816, "step": 28420 }, { "epoch": 3.5615837614334045, "grad_norm": 0.08989132195711136, "learning_rate": 9.815470410319015e-06, "loss": 0.459, "num_input_tokens_seen": 34561088, "step": 28425 }, { "epoch": 3.5622102493421877, "grad_norm": 0.12824730575084686, "learning_rate": 9.815323225311237e-06, "loss": 0.4619, "num_input_tokens_seen": 34566880, "step": 28430 }, { "epoch": 3.5628367372509713, "grad_norm": 0.08278040587902069, "learning_rate": 9.815175982732222e-06, "loss": 0.4657, "num_input_tokens_seen": 34573056, "step": 28435 }, { "epoch": 3.5634632251597544, "grad_norm": 0.14180169999599457, "learning_rate": 9.815028682583727e-06, "loss": 0.4646, "num_input_tokens_seen": 34579520, "step": 28440 }, { "epoch": 3.5640897130685376, "grad_norm": 0.11762885749340057, "learning_rate": 9.814881324867518e-06, "loss": 0.463, "num_input_tokens_seen": 34585760, "step": 28445 }, { "epoch": 3.5647162009773212, "grad_norm": 0.07991170138120651, "learning_rate": 9.814733909585353e-06, "loss": 0.467, "num_input_tokens_seen": 34591584, "step": 28450 }, { "epoch": 3.5653426888861044, "grad_norm": 0.0784299373626709, "learning_rate": 9.814586436738998e-06, "loss": 0.4605, "num_input_tokens_seen": 34597472, "step": 28455 }, { "epoch": 3.565969176794888, "grad_norm": 0.08716738969087601, "learning_rate": 9.814438906330213e-06, "loss": 0.4578, "num_input_tokens_seen": 34603680, "step": 28460 }, { "epoch": 3.566595664703671, "grad_norm": 0.10310377180576324, "learning_rate": 9.814291318360763e-06, "loss": 0.4683, "num_input_tokens_seen": 34609952, "step": 28465 }, { "epoch": 3.5672221526124543, "grad_norm": 0.07775231450796127, "learning_rate": 9.814143672832413e-06, "loss": 0.4658, "num_input_tokens_seen": 34615296, "step": 28470 }, { "epoch": 3.567848640521238, "grad_norm": 0.11408831924200058, "learning_rate": 9.813995969746928e-06, "loss": 0.4666, "num_input_tokens_seen": 34621248, "step": 28475 }, { "epoch": 3.5684751284300216, "grad_norm": 0.07349057495594025, "learning_rate": 9.813848209106074e-06, "loss": 0.4689, "num_input_tokens_seen": 34627168, "step": 28480 }, { "epoch": 3.5691016163388047, "grad_norm": 0.08914079517126083, "learning_rate": 9.813700390911616e-06, "loss": 0.4613, "num_input_tokens_seen": 34633344, "step": 28485 }, { "epoch": 3.569728104247588, "grad_norm": 0.08859580010175705, "learning_rate": 9.813552515165324e-06, "loss": 0.4672, "num_input_tokens_seen": 34639296, "step": 28490 }, { "epoch": 3.5703545921563715, "grad_norm": 0.11547547578811646, "learning_rate": 9.813404581868966e-06, "loss": 0.456, "num_input_tokens_seen": 34645504, "step": 28495 }, { "epoch": 3.5709810800651547, "grad_norm": 0.08712521940469742, "learning_rate": 9.813256591024308e-06, "loss": 0.4636, "num_input_tokens_seen": 34651872, "step": 28500 }, { "epoch": 3.5716075679739383, "grad_norm": 0.16613292694091797, "learning_rate": 9.813108542633119e-06, "loss": 0.4642, "num_input_tokens_seen": 34657664, "step": 28505 }, { "epoch": 3.5722340558827215, "grad_norm": 0.07841881364583969, "learning_rate": 9.812960436697172e-06, "loss": 0.4659, "num_input_tokens_seen": 34664128, "step": 28510 }, { "epoch": 3.5728605437915046, "grad_norm": 0.1100444570183754, "learning_rate": 9.812812273218237e-06, "loss": 0.4622, "num_input_tokens_seen": 34670272, "step": 28515 }, { "epoch": 3.5734870317002883, "grad_norm": 0.06749825924634933, "learning_rate": 9.812664052198083e-06, "loss": 0.4606, "num_input_tokens_seen": 34676544, "step": 28520 }, { "epoch": 3.5741135196090714, "grad_norm": 0.09961053729057312, "learning_rate": 9.812515773638487e-06, "loss": 0.4664, "num_input_tokens_seen": 34682752, "step": 28525 }, { "epoch": 3.574740007517855, "grad_norm": 0.0774359330534935, "learning_rate": 9.812367437541217e-06, "loss": 0.4593, "num_input_tokens_seen": 34688960, "step": 28530 }, { "epoch": 3.575366495426638, "grad_norm": 0.08733958005905151, "learning_rate": 9.81221904390805e-06, "loss": 0.4635, "num_input_tokens_seen": 34694464, "step": 28535 }, { "epoch": 3.5759929833354214, "grad_norm": 0.0753849446773529, "learning_rate": 9.812070592740756e-06, "loss": 0.4641, "num_input_tokens_seen": 34700608, "step": 28540 }, { "epoch": 3.576619471244205, "grad_norm": 0.07503212988376617, "learning_rate": 9.811922084041115e-06, "loss": 0.4594, "num_input_tokens_seen": 34706720, "step": 28545 }, { "epoch": 3.5772459591529886, "grad_norm": 0.07641003280878067, "learning_rate": 9.811773517810896e-06, "loss": 0.4592, "num_input_tokens_seen": 34712992, "step": 28550 }, { "epoch": 3.5778724470617718, "grad_norm": 0.07354238629341125, "learning_rate": 9.811624894051882e-06, "loss": 0.4614, "num_input_tokens_seen": 34719040, "step": 28555 }, { "epoch": 3.578498934970555, "grad_norm": 0.07239175587892532, "learning_rate": 9.811476212765847e-06, "loss": 0.458, "num_input_tokens_seen": 34725056, "step": 28560 }, { "epoch": 3.5791254228793385, "grad_norm": 0.09473241120576859, "learning_rate": 9.811327473954568e-06, "loss": 0.4624, "num_input_tokens_seen": 34731360, "step": 28565 }, { "epoch": 3.5797519107881217, "grad_norm": 0.07994550466537476, "learning_rate": 9.811178677619824e-06, "loss": 0.4649, "num_input_tokens_seen": 34737568, "step": 28570 }, { "epoch": 3.5803783986969053, "grad_norm": 0.07772664725780487, "learning_rate": 9.811029823763393e-06, "loss": 0.4622, "num_input_tokens_seen": 34743520, "step": 28575 }, { "epoch": 3.5810048866056885, "grad_norm": 0.06786023080348969, "learning_rate": 9.810880912387057e-06, "loss": 0.4517, "num_input_tokens_seen": 34749568, "step": 28580 }, { "epoch": 3.5816313745144717, "grad_norm": 0.10390587896108627, "learning_rate": 9.810731943492592e-06, "loss": 0.4657, "num_input_tokens_seen": 34755520, "step": 28585 }, { "epoch": 3.5822578624232553, "grad_norm": 0.07051243633031845, "learning_rate": 9.810582917081786e-06, "loss": 0.4677, "num_input_tokens_seen": 34761632, "step": 28590 }, { "epoch": 3.5828843503320384, "grad_norm": 0.07000365853309631, "learning_rate": 9.810433833156413e-06, "loss": 0.4638, "num_input_tokens_seen": 34767520, "step": 28595 }, { "epoch": 3.583510838240822, "grad_norm": 0.0874708965420723, "learning_rate": 9.810284691718261e-06, "loss": 0.4544, "num_input_tokens_seen": 34773760, "step": 28600 }, { "epoch": 3.5841373261496052, "grad_norm": 0.12216009944677353, "learning_rate": 9.810135492769113e-06, "loss": 0.4587, "num_input_tokens_seen": 34779744, "step": 28605 }, { "epoch": 3.5847638140583884, "grad_norm": 0.08007309585809708, "learning_rate": 9.809986236310748e-06, "loss": 0.4597, "num_input_tokens_seen": 34786144, "step": 28610 }, { "epoch": 3.585390301967172, "grad_norm": 0.08781784772872925, "learning_rate": 9.809836922344955e-06, "loss": 0.469, "num_input_tokens_seen": 34792416, "step": 28615 }, { "epoch": 3.5860167898759556, "grad_norm": 0.08265221863985062, "learning_rate": 9.809687550873518e-06, "loss": 0.4653, "num_input_tokens_seen": 34798528, "step": 28620 }, { "epoch": 3.586643277784739, "grad_norm": 0.0810893252491951, "learning_rate": 9.809538121898223e-06, "loss": 0.4576, "num_input_tokens_seen": 34804800, "step": 28625 }, { "epoch": 3.587269765693522, "grad_norm": 0.0341968834400177, "learning_rate": 9.809388635420854e-06, "loss": 0.4528, "num_input_tokens_seen": 34811136, "step": 28630 }, { "epoch": 3.5878962536023056, "grad_norm": 0.1219792440533638, "learning_rate": 9.809239091443201e-06, "loss": 0.4606, "num_input_tokens_seen": 34817312, "step": 28635 }, { "epoch": 3.5885227415110887, "grad_norm": 0.11272735893726349, "learning_rate": 9.809089489967053e-06, "loss": 0.4606, "num_input_tokens_seen": 34823616, "step": 28640 }, { "epoch": 3.5891492294198724, "grad_norm": 0.07917693257331848, "learning_rate": 9.808939830994196e-06, "loss": 0.4614, "num_input_tokens_seen": 34829536, "step": 28645 }, { "epoch": 3.5897757173286555, "grad_norm": 0.079068124294281, "learning_rate": 9.80879011452642e-06, "loss": 0.4651, "num_input_tokens_seen": 34835712, "step": 28650 }, { "epoch": 3.5904022052374387, "grad_norm": 0.09665130078792572, "learning_rate": 9.808640340565515e-06, "loss": 0.4557, "num_input_tokens_seen": 34841312, "step": 28655 }, { "epoch": 3.5910286931462223, "grad_norm": 0.0994129553437233, "learning_rate": 9.808490509113272e-06, "loss": 0.4668, "num_input_tokens_seen": 34847584, "step": 28660 }, { "epoch": 3.5916551810550055, "grad_norm": 0.10594040900468826, "learning_rate": 9.808340620171481e-06, "loss": 0.4451, "num_input_tokens_seen": 34853920, "step": 28665 }, { "epoch": 3.592281668963789, "grad_norm": 0.09160519391298294, "learning_rate": 9.808190673741937e-06, "loss": 0.4692, "num_input_tokens_seen": 34860160, "step": 28670 }, { "epoch": 3.5929081568725723, "grad_norm": 0.08245597779750824, "learning_rate": 9.80804066982643e-06, "loss": 0.4713, "num_input_tokens_seen": 34866528, "step": 28675 }, { "epoch": 3.593534644781356, "grad_norm": 0.14654333889484406, "learning_rate": 9.807890608426755e-06, "loss": 0.4823, "num_input_tokens_seen": 34872544, "step": 28680 }, { "epoch": 3.594161132690139, "grad_norm": 0.12988965213298798, "learning_rate": 9.807740489544707e-06, "loss": 0.4597, "num_input_tokens_seen": 34878432, "step": 28685 }, { "epoch": 3.5947876205989227, "grad_norm": 0.03744857758283615, "learning_rate": 9.807590313182075e-06, "loss": 0.4587, "num_input_tokens_seen": 34884576, "step": 28690 }, { "epoch": 3.595414108507706, "grad_norm": 0.16139310598373413, "learning_rate": 9.807440079340663e-06, "loss": 0.458, "num_input_tokens_seen": 34890432, "step": 28695 }, { "epoch": 3.596040596416489, "grad_norm": 0.08103714138269424, "learning_rate": 9.80728978802226e-06, "loss": 0.4597, "num_input_tokens_seen": 34896640, "step": 28700 }, { "epoch": 3.5966670843252726, "grad_norm": 0.09598289430141449, "learning_rate": 9.807139439228669e-06, "loss": 0.4547, "num_input_tokens_seen": 34902848, "step": 28705 }, { "epoch": 3.5972935722340558, "grad_norm": 0.07103171944618225, "learning_rate": 9.806989032961682e-06, "loss": 0.4544, "num_input_tokens_seen": 34908672, "step": 28710 }, { "epoch": 3.5979200601428394, "grad_norm": 0.21171116828918457, "learning_rate": 9.806838569223101e-06, "loss": 0.47, "num_input_tokens_seen": 34914592, "step": 28715 }, { "epoch": 3.5985465480516226, "grad_norm": 0.08174970746040344, "learning_rate": 9.806688048014723e-06, "loss": 0.4817, "num_input_tokens_seen": 34921056, "step": 28720 }, { "epoch": 3.5991730359604057, "grad_norm": 0.11493664979934692, "learning_rate": 9.806537469338347e-06, "loss": 0.4553, "num_input_tokens_seen": 34927136, "step": 28725 }, { "epoch": 3.5997995238691893, "grad_norm": 0.149053156375885, "learning_rate": 9.806386833195776e-06, "loss": 0.4697, "num_input_tokens_seen": 34932736, "step": 28730 }, { "epoch": 3.600426011777973, "grad_norm": 0.1500530242919922, "learning_rate": 9.806236139588808e-06, "loss": 0.4674, "num_input_tokens_seen": 34938176, "step": 28735 }, { "epoch": 3.601052499686756, "grad_norm": 0.14262370765209198, "learning_rate": 9.806085388519247e-06, "loss": 0.4595, "num_input_tokens_seen": 34944544, "step": 28740 }, { "epoch": 3.6016789875955393, "grad_norm": 0.10464996099472046, "learning_rate": 9.805934579988894e-06, "loss": 0.461, "num_input_tokens_seen": 34950624, "step": 28745 }, { "epoch": 3.602305475504323, "grad_norm": 0.07199563086032867, "learning_rate": 9.805783713999552e-06, "loss": 0.4616, "num_input_tokens_seen": 34956576, "step": 28750 }, { "epoch": 3.602931963413106, "grad_norm": 0.10243424773216248, "learning_rate": 9.805632790553028e-06, "loss": 0.4603, "num_input_tokens_seen": 34962720, "step": 28755 }, { "epoch": 3.6035584513218897, "grad_norm": 0.09017635881900787, "learning_rate": 9.805481809651121e-06, "loss": 0.4632, "num_input_tokens_seen": 34968608, "step": 28760 }, { "epoch": 3.604184939230673, "grad_norm": 0.08566708862781525, "learning_rate": 9.80533077129564e-06, "loss": 0.4685, "num_input_tokens_seen": 34974720, "step": 28765 }, { "epoch": 3.604811427139456, "grad_norm": 0.08155524730682373, "learning_rate": 9.805179675488387e-06, "loss": 0.4669, "num_input_tokens_seen": 34980864, "step": 28770 }, { "epoch": 3.6054379150482396, "grad_norm": 0.0882306843996048, "learning_rate": 9.805028522231174e-06, "loss": 0.4641, "num_input_tokens_seen": 34986784, "step": 28775 }, { "epoch": 3.606064402957023, "grad_norm": 0.08067858219146729, "learning_rate": 9.804877311525804e-06, "loss": 0.4631, "num_input_tokens_seen": 34992704, "step": 28780 }, { "epoch": 3.6066908908658064, "grad_norm": 0.12467740476131439, "learning_rate": 9.804726043374086e-06, "loss": 0.4637, "num_input_tokens_seen": 34998176, "step": 28785 }, { "epoch": 3.6073173787745896, "grad_norm": 0.09536346048116684, "learning_rate": 9.80457471777783e-06, "loss": 0.4625, "num_input_tokens_seen": 35004128, "step": 28790 }, { "epoch": 3.6079438666833727, "grad_norm": 0.12956322729587555, "learning_rate": 9.804423334738842e-06, "loss": 0.4606, "num_input_tokens_seen": 35010240, "step": 28795 }, { "epoch": 3.6085703545921564, "grad_norm": 0.14095328748226166, "learning_rate": 9.804271894258935e-06, "loss": 0.4685, "num_input_tokens_seen": 35016608, "step": 28800 }, { "epoch": 3.60919684250094, "grad_norm": 0.06946232169866562, "learning_rate": 9.804120396339917e-06, "loss": 0.4674, "num_input_tokens_seen": 35022528, "step": 28805 }, { "epoch": 3.609823330409723, "grad_norm": 0.07590937614440918, "learning_rate": 9.803968840983601e-06, "loss": 0.4604, "num_input_tokens_seen": 35028768, "step": 28810 }, { "epoch": 3.6104498183185063, "grad_norm": 0.08993571251630783, "learning_rate": 9.803817228191798e-06, "loss": 0.4679, "num_input_tokens_seen": 35034368, "step": 28815 }, { "epoch": 3.61107630622729, "grad_norm": 0.07328900694847107, "learning_rate": 9.803665557966322e-06, "loss": 0.4609, "num_input_tokens_seen": 35040672, "step": 28820 }, { "epoch": 3.611702794136073, "grad_norm": 0.04514436423778534, "learning_rate": 9.803513830308986e-06, "loss": 0.4651, "num_input_tokens_seen": 35046752, "step": 28825 }, { "epoch": 3.6123292820448567, "grad_norm": 0.09786536544561386, "learning_rate": 9.803362045221603e-06, "loss": 0.4624, "num_input_tokens_seen": 35052864, "step": 28830 }, { "epoch": 3.61295576995364, "grad_norm": 0.13397309184074402, "learning_rate": 9.803210202705988e-06, "loss": 0.4581, "num_input_tokens_seen": 35058880, "step": 28835 }, { "epoch": 3.613582257862423, "grad_norm": 0.07970572263002396, "learning_rate": 9.803058302763958e-06, "loss": 0.462, "num_input_tokens_seen": 35064800, "step": 28840 }, { "epoch": 3.6142087457712067, "grad_norm": 0.1575041264295578, "learning_rate": 9.802906345397326e-06, "loss": 0.4551, "num_input_tokens_seen": 35070912, "step": 28845 }, { "epoch": 3.61483523367999, "grad_norm": 0.0870450958609581, "learning_rate": 9.802754330607911e-06, "loss": 0.4714, "num_input_tokens_seen": 35077056, "step": 28850 }, { "epoch": 3.6154617215887734, "grad_norm": 0.08917292207479477, "learning_rate": 9.802602258397532e-06, "loss": 0.4703, "num_input_tokens_seen": 35082880, "step": 28855 }, { "epoch": 3.6160882094975566, "grad_norm": 0.09349974989891052, "learning_rate": 9.802450128768001e-06, "loss": 0.4661, "num_input_tokens_seen": 35088704, "step": 28860 }, { "epoch": 3.61671469740634, "grad_norm": 0.10223997384309769, "learning_rate": 9.802297941721144e-06, "loss": 0.4601, "num_input_tokens_seen": 35094880, "step": 28865 }, { "epoch": 3.6173411853151234, "grad_norm": 0.145018070936203, "learning_rate": 9.802145697258778e-06, "loss": 0.4637, "num_input_tokens_seen": 35100928, "step": 28870 }, { "epoch": 3.617967673223907, "grad_norm": 0.03634550794959068, "learning_rate": 9.801993395382721e-06, "loss": 0.4685, "num_input_tokens_seen": 35106816, "step": 28875 }, { "epoch": 3.61859416113269, "grad_norm": 0.08649231493473053, "learning_rate": 9.801841036094797e-06, "loss": 0.4612, "num_input_tokens_seen": 35113216, "step": 28880 }, { "epoch": 3.6192206490414733, "grad_norm": 0.08468662947416306, "learning_rate": 9.801688619396826e-06, "loss": 0.4603, "num_input_tokens_seen": 35119360, "step": 28885 }, { "epoch": 3.619847136950257, "grad_norm": 0.12038573622703552, "learning_rate": 9.80153614529063e-06, "loss": 0.4582, "num_input_tokens_seen": 35125120, "step": 28890 }, { "epoch": 3.62047362485904, "grad_norm": 0.09585423022508621, "learning_rate": 9.801383613778033e-06, "loss": 0.4639, "num_input_tokens_seen": 35131072, "step": 28895 }, { "epoch": 3.6211001127678237, "grad_norm": 0.08576572686433792, "learning_rate": 9.801231024860858e-06, "loss": 0.4629, "num_input_tokens_seen": 35137472, "step": 28900 }, { "epoch": 3.621726600676607, "grad_norm": 0.05336104705929756, "learning_rate": 9.80107837854093e-06, "loss": 0.4603, "num_input_tokens_seen": 35143264, "step": 28905 }, { "epoch": 3.62235308858539, "grad_norm": 0.11600630730390549, "learning_rate": 9.800925674820071e-06, "loss": 0.4627, "num_input_tokens_seen": 35149856, "step": 28910 }, { "epoch": 3.6229795764941737, "grad_norm": 0.03668120130896568, "learning_rate": 9.800772913700113e-06, "loss": 0.4632, "num_input_tokens_seen": 35156256, "step": 28915 }, { "epoch": 3.6236060644029573, "grad_norm": 0.072036013007164, "learning_rate": 9.800620095182876e-06, "loss": 0.4618, "num_input_tokens_seen": 35162368, "step": 28920 }, { "epoch": 3.6242325523117405, "grad_norm": 0.08832443505525589, "learning_rate": 9.800467219270191e-06, "loss": 0.4589, "num_input_tokens_seen": 35168480, "step": 28925 }, { "epoch": 3.6248590402205236, "grad_norm": 0.04324957728385925, "learning_rate": 9.800314285963883e-06, "loss": 0.4633, "num_input_tokens_seen": 35174432, "step": 28930 }, { "epoch": 3.6254855281293072, "grad_norm": 0.07852870970964432, "learning_rate": 9.800161295265782e-06, "loss": 0.4659, "num_input_tokens_seen": 35180672, "step": 28935 }, { "epoch": 3.6261120160380904, "grad_norm": 0.14619363844394684, "learning_rate": 9.800008247177719e-06, "loss": 0.4582, "num_input_tokens_seen": 35186336, "step": 28940 }, { "epoch": 3.626738503946874, "grad_norm": 0.1396292895078659, "learning_rate": 9.79985514170152e-06, "loss": 0.4721, "num_input_tokens_seen": 35192160, "step": 28945 }, { "epoch": 3.627364991855657, "grad_norm": 0.1187327578663826, "learning_rate": 9.799701978839017e-06, "loss": 0.4649, "num_input_tokens_seen": 35198144, "step": 28950 }, { "epoch": 3.6279914797644404, "grad_norm": 0.07234138995409012, "learning_rate": 9.799548758592041e-06, "loss": 0.4627, "num_input_tokens_seen": 35204448, "step": 28955 }, { "epoch": 3.628617967673224, "grad_norm": 0.1472240835428238, "learning_rate": 9.799395480962425e-06, "loss": 0.4608, "num_input_tokens_seen": 35210656, "step": 28960 }, { "epoch": 3.629244455582007, "grad_norm": 0.07790843397378922, "learning_rate": 9.799242145952e-06, "loss": 0.4536, "num_input_tokens_seen": 35217056, "step": 28965 }, { "epoch": 3.6298709434907908, "grad_norm": 0.08046247810125351, "learning_rate": 9.799088753562605e-06, "loss": 0.4624, "num_input_tokens_seen": 35223040, "step": 28970 }, { "epoch": 3.630497431399574, "grad_norm": 0.09148497879505157, "learning_rate": 9.798935303796064e-06, "loss": 0.4643, "num_input_tokens_seen": 35229536, "step": 28975 }, { "epoch": 3.631123919308357, "grad_norm": 0.08372663706541061, "learning_rate": 9.798781796654217e-06, "loss": 0.4582, "num_input_tokens_seen": 35235520, "step": 28980 }, { "epoch": 3.6317504072171407, "grad_norm": 0.0853174477815628, "learning_rate": 9.798628232138901e-06, "loss": 0.464, "num_input_tokens_seen": 35241920, "step": 28985 }, { "epoch": 3.6323768951259243, "grad_norm": 0.1159500703215599, "learning_rate": 9.79847461025195e-06, "loss": 0.4593, "num_input_tokens_seen": 35247968, "step": 28990 }, { "epoch": 3.6330033830347075, "grad_norm": 0.11925835907459259, "learning_rate": 9.7983209309952e-06, "loss": 0.4602, "num_input_tokens_seen": 35254208, "step": 28995 }, { "epoch": 3.6336298709434907, "grad_norm": 0.13054318726062775, "learning_rate": 9.798167194370489e-06, "loss": 0.4515, "num_input_tokens_seen": 35260320, "step": 29000 }, { "epoch": 3.6342563588522743, "grad_norm": 0.1083046942949295, "learning_rate": 9.798013400379656e-06, "loss": 0.4619, "num_input_tokens_seen": 35266528, "step": 29005 }, { "epoch": 3.6348828467610574, "grad_norm": 0.09580251574516296, "learning_rate": 9.797859549024536e-06, "loss": 0.462, "num_input_tokens_seen": 35272096, "step": 29010 }, { "epoch": 3.635509334669841, "grad_norm": 0.08625321835279465, "learning_rate": 9.797705640306974e-06, "loss": 0.4502, "num_input_tokens_seen": 35278240, "step": 29015 }, { "epoch": 3.636135822578624, "grad_norm": 0.11719213426113129, "learning_rate": 9.797551674228807e-06, "loss": 0.4782, "num_input_tokens_seen": 35284416, "step": 29020 }, { "epoch": 3.6367623104874074, "grad_norm": 0.08408153802156448, "learning_rate": 9.797397650791875e-06, "loss": 0.4643, "num_input_tokens_seen": 35290880, "step": 29025 }, { "epoch": 3.637388798396191, "grad_norm": 0.09206020832061768, "learning_rate": 9.797243569998022e-06, "loss": 0.4541, "num_input_tokens_seen": 35297344, "step": 29030 }, { "epoch": 3.638015286304974, "grad_norm": 0.14185672998428345, "learning_rate": 9.797089431849089e-06, "loss": 0.4844, "num_input_tokens_seen": 35303552, "step": 29035 }, { "epoch": 3.638641774213758, "grad_norm": 0.0648871585726738, "learning_rate": 9.796935236346917e-06, "loss": 0.4721, "num_input_tokens_seen": 35309696, "step": 29040 }, { "epoch": 3.639268262122541, "grad_norm": 0.15992727875709534, "learning_rate": 9.796780983493353e-06, "loss": 0.4665, "num_input_tokens_seen": 35315808, "step": 29045 }, { "epoch": 3.639894750031324, "grad_norm": 0.10401483625173569, "learning_rate": 9.796626673290237e-06, "loss": 0.4551, "num_input_tokens_seen": 35321824, "step": 29050 }, { "epoch": 3.6405212379401077, "grad_norm": 0.18159300088882446, "learning_rate": 9.796472305739419e-06, "loss": 0.468, "num_input_tokens_seen": 35327680, "step": 29055 }, { "epoch": 3.6411477258488913, "grad_norm": 0.053160410374403, "learning_rate": 9.796317880842741e-06, "loss": 0.4679, "num_input_tokens_seen": 35333888, "step": 29060 }, { "epoch": 3.6417742137576745, "grad_norm": 0.09230543673038483, "learning_rate": 9.79616339860205e-06, "loss": 0.4652, "num_input_tokens_seen": 35340512, "step": 29065 }, { "epoch": 3.6424007016664577, "grad_norm": 0.08846262842416763, "learning_rate": 9.796008859019193e-06, "loss": 0.4589, "num_input_tokens_seen": 35346560, "step": 29070 }, { "epoch": 3.6430271895752413, "grad_norm": 0.1132974699139595, "learning_rate": 9.795854262096019e-06, "loss": 0.458, "num_input_tokens_seen": 35352416, "step": 29075 }, { "epoch": 3.6436536774840245, "grad_norm": 0.11143292486667633, "learning_rate": 9.795699607834372e-06, "loss": 0.4622, "num_input_tokens_seen": 35358496, "step": 29080 }, { "epoch": 3.644280165392808, "grad_norm": 0.0869554802775383, "learning_rate": 9.795544896236107e-06, "loss": 0.4611, "num_input_tokens_seen": 35364416, "step": 29085 }, { "epoch": 3.6449066533015912, "grad_norm": 0.0745585635304451, "learning_rate": 9.79539012730307e-06, "loss": 0.4599, "num_input_tokens_seen": 35370208, "step": 29090 }, { "epoch": 3.6455331412103744, "grad_norm": 0.08153438568115234, "learning_rate": 9.795235301037113e-06, "loss": 0.4588, "num_input_tokens_seen": 35376128, "step": 29095 }, { "epoch": 3.646159629119158, "grad_norm": 0.08082225173711777, "learning_rate": 9.795080417440085e-06, "loss": 0.4641, "num_input_tokens_seen": 35382144, "step": 29100 }, { "epoch": 3.646786117027941, "grad_norm": 0.08603674173355103, "learning_rate": 9.794925476513841e-06, "loss": 0.4624, "num_input_tokens_seen": 35387776, "step": 29105 }, { "epoch": 3.647412604936725, "grad_norm": 0.09634716808795929, "learning_rate": 9.794770478260231e-06, "loss": 0.4641, "num_input_tokens_seen": 35393728, "step": 29110 }, { "epoch": 3.648039092845508, "grad_norm": 0.06631669402122498, "learning_rate": 9.794615422681108e-06, "loss": 0.4629, "num_input_tokens_seen": 35399584, "step": 29115 }, { "epoch": 3.6486655807542916, "grad_norm": 0.042585693299770355, "learning_rate": 9.794460309778327e-06, "loss": 0.4636, "num_input_tokens_seen": 35405664, "step": 29120 }, { "epoch": 3.6492920686630748, "grad_norm": 0.10666881501674652, "learning_rate": 9.794305139553742e-06, "loss": 0.4651, "num_input_tokens_seen": 35411648, "step": 29125 }, { "epoch": 3.6499185565718584, "grad_norm": 0.07601243257522583, "learning_rate": 9.79414991200921e-06, "loss": 0.4683, "num_input_tokens_seen": 35417952, "step": 29130 }, { "epoch": 3.6505450444806415, "grad_norm": 0.0810556411743164, "learning_rate": 9.79399462714658e-06, "loss": 0.4621, "num_input_tokens_seen": 35423904, "step": 29135 }, { "epoch": 3.6511715323894247, "grad_norm": 0.13293400406837463, "learning_rate": 9.793839284967719e-06, "loss": 0.4574, "num_input_tokens_seen": 35430208, "step": 29140 }, { "epoch": 3.6517980202982083, "grad_norm": 0.12699997425079346, "learning_rate": 9.793683885474476e-06, "loss": 0.4579, "num_input_tokens_seen": 35436416, "step": 29145 }, { "epoch": 3.6524245082069915, "grad_norm": 0.08170274645090103, "learning_rate": 9.793528428668713e-06, "loss": 0.4575, "num_input_tokens_seen": 35442368, "step": 29150 }, { "epoch": 3.653050996115775, "grad_norm": 0.09978698194026947, "learning_rate": 9.793372914552288e-06, "loss": 0.4607, "num_input_tokens_seen": 35448576, "step": 29155 }, { "epoch": 3.6536774840245583, "grad_norm": 0.1713232696056366, "learning_rate": 9.793217343127058e-06, "loss": 0.4666, "num_input_tokens_seen": 35454432, "step": 29160 }, { "epoch": 3.6543039719333414, "grad_norm": 0.07926449924707413, "learning_rate": 9.793061714394887e-06, "loss": 0.4696, "num_input_tokens_seen": 35459872, "step": 29165 }, { "epoch": 3.654930459842125, "grad_norm": 0.08100039511919022, "learning_rate": 9.792906028357632e-06, "loss": 0.4622, "num_input_tokens_seen": 35466112, "step": 29170 }, { "epoch": 3.6555569477509087, "grad_norm": 0.08126331120729446, "learning_rate": 9.792750285017155e-06, "loss": 0.4634, "num_input_tokens_seen": 35472288, "step": 29175 }, { "epoch": 3.656183435659692, "grad_norm": 0.11011213064193726, "learning_rate": 9.79259448437532e-06, "loss": 0.4656, "num_input_tokens_seen": 35478272, "step": 29180 }, { "epoch": 3.656809923568475, "grad_norm": 0.08696785569190979, "learning_rate": 9.792438626433989e-06, "loss": 0.4564, "num_input_tokens_seen": 35484640, "step": 29185 }, { "epoch": 3.6574364114772586, "grad_norm": 0.12180768698453903, "learning_rate": 9.792282711195022e-06, "loss": 0.4617, "num_input_tokens_seen": 35490784, "step": 29190 }, { "epoch": 3.658062899386042, "grad_norm": 0.13536906242370605, "learning_rate": 9.79212673866029e-06, "loss": 0.4586, "num_input_tokens_seen": 35497152, "step": 29195 }, { "epoch": 3.6586893872948254, "grad_norm": 0.07072190940380096, "learning_rate": 9.791970708831651e-06, "loss": 0.4549, "num_input_tokens_seen": 35503264, "step": 29200 }, { "epoch": 3.6593158752036086, "grad_norm": 0.042952701449394226, "learning_rate": 9.791814621710975e-06, "loss": 0.4654, "num_input_tokens_seen": 35509120, "step": 29205 }, { "epoch": 3.6599423631123917, "grad_norm": 0.10781718045473099, "learning_rate": 9.791658477300125e-06, "loss": 0.4662, "num_input_tokens_seen": 35515008, "step": 29210 }, { "epoch": 3.6605688510211754, "grad_norm": 0.07762890309095383, "learning_rate": 9.79150227560097e-06, "loss": 0.462, "num_input_tokens_seen": 35520832, "step": 29215 }, { "epoch": 3.6611953389299585, "grad_norm": 0.04329250752925873, "learning_rate": 9.791346016615378e-06, "loss": 0.4585, "num_input_tokens_seen": 35527040, "step": 29220 }, { "epoch": 3.661821826838742, "grad_norm": 0.093755804002285, "learning_rate": 9.791189700345215e-06, "loss": 0.4609, "num_input_tokens_seen": 35533216, "step": 29225 }, { "epoch": 3.6624483147475253, "grad_norm": 0.041970688849687576, "learning_rate": 9.79103332679235e-06, "loss": 0.4661, "num_input_tokens_seen": 35539424, "step": 29230 }, { "epoch": 3.6630748026563085, "grad_norm": 0.03168255835771561, "learning_rate": 9.790876895958655e-06, "loss": 0.4681, "num_input_tokens_seen": 35545088, "step": 29235 }, { "epoch": 3.663701290565092, "grad_norm": 0.07802300900220871, "learning_rate": 9.790720407845999e-06, "loss": 0.4618, "num_input_tokens_seen": 35550752, "step": 29240 }, { "epoch": 3.6643277784738757, "grad_norm": 0.1143193319439888, "learning_rate": 9.790563862456251e-06, "loss": 0.4582, "num_input_tokens_seen": 35556704, "step": 29245 }, { "epoch": 3.664954266382659, "grad_norm": 0.0776645839214325, "learning_rate": 9.790407259791286e-06, "loss": 0.4611, "num_input_tokens_seen": 35562816, "step": 29250 }, { "epoch": 3.665580754291442, "grad_norm": 0.09467543661594391, "learning_rate": 9.790250599852973e-06, "loss": 0.4703, "num_input_tokens_seen": 35568544, "step": 29255 }, { "epoch": 3.6662072422002256, "grad_norm": 0.03932735696434975, "learning_rate": 9.790093882643189e-06, "loss": 0.4607, "num_input_tokens_seen": 35574784, "step": 29260 }, { "epoch": 3.666833730109009, "grad_norm": 0.0908164530992508, "learning_rate": 9.789937108163802e-06, "loss": 0.4622, "num_input_tokens_seen": 35580832, "step": 29265 }, { "epoch": 3.6674602180177924, "grad_norm": 0.07453887909650803, "learning_rate": 9.789780276416693e-06, "loss": 0.4598, "num_input_tokens_seen": 35587104, "step": 29270 }, { "epoch": 3.6680867059265756, "grad_norm": 0.13835422694683075, "learning_rate": 9.789623387403732e-06, "loss": 0.4646, "num_input_tokens_seen": 35593088, "step": 29275 }, { "epoch": 3.6687131938353588, "grad_norm": 0.12786202132701874, "learning_rate": 9.789466441126797e-06, "loss": 0.4619, "num_input_tokens_seen": 35599232, "step": 29280 }, { "epoch": 3.6693396817441424, "grad_norm": 0.11624765396118164, "learning_rate": 9.789309437587764e-06, "loss": 0.4622, "num_input_tokens_seen": 35605120, "step": 29285 }, { "epoch": 3.6699661696529255, "grad_norm": 0.06857195496559143, "learning_rate": 9.789152376788508e-06, "loss": 0.4587, "num_input_tokens_seen": 35610720, "step": 29290 }, { "epoch": 3.670592657561709, "grad_norm": 0.07578078657388687, "learning_rate": 9.788995258730912e-06, "loss": 0.4617, "num_input_tokens_seen": 35616832, "step": 29295 }, { "epoch": 3.6712191454704923, "grad_norm": 0.12347672134637833, "learning_rate": 9.788838083416849e-06, "loss": 0.4672, "num_input_tokens_seen": 35622784, "step": 29300 }, { "epoch": 3.671845633379276, "grad_norm": 0.16513191163539886, "learning_rate": 9.788680850848202e-06, "loss": 0.4592, "num_input_tokens_seen": 35628832, "step": 29305 }, { "epoch": 3.672472121288059, "grad_norm": 0.11103348433971405, "learning_rate": 9.78852356102685e-06, "loss": 0.456, "num_input_tokens_seen": 35635136, "step": 29310 }, { "epoch": 3.6730986091968427, "grad_norm": 0.06810864061117172, "learning_rate": 9.78836621395467e-06, "loss": 0.4639, "num_input_tokens_seen": 35641056, "step": 29315 }, { "epoch": 3.673725097105626, "grad_norm": 0.08877958357334137, "learning_rate": 9.788208809633548e-06, "loss": 0.473, "num_input_tokens_seen": 35647296, "step": 29320 }, { "epoch": 3.674351585014409, "grad_norm": 0.08250266313552856, "learning_rate": 9.788051348065365e-06, "loss": 0.463, "num_input_tokens_seen": 35653312, "step": 29325 }, { "epoch": 3.6749780729231927, "grad_norm": 0.10441279411315918, "learning_rate": 9.787893829252e-06, "loss": 0.467, "num_input_tokens_seen": 35659520, "step": 29330 }, { "epoch": 3.675604560831976, "grad_norm": 0.06975588947534561, "learning_rate": 9.78773625319534e-06, "loss": 0.4549, "num_input_tokens_seen": 35665920, "step": 29335 }, { "epoch": 3.6762310487407595, "grad_norm": 0.14279966056346893, "learning_rate": 9.787578619897269e-06, "loss": 0.4616, "num_input_tokens_seen": 35672256, "step": 29340 }, { "epoch": 3.6768575366495426, "grad_norm": 0.033184558153152466, "learning_rate": 9.78742092935967e-06, "loss": 0.4617, "num_input_tokens_seen": 35678496, "step": 29345 }, { "epoch": 3.677484024558326, "grad_norm": 0.041869066655635834, "learning_rate": 9.787263181584427e-06, "loss": 0.4616, "num_input_tokens_seen": 35684928, "step": 29350 }, { "epoch": 3.6781105124671094, "grad_norm": 0.10462094098329544, "learning_rate": 9.787105376573428e-06, "loss": 0.4527, "num_input_tokens_seen": 35690368, "step": 29355 }, { "epoch": 3.678737000375893, "grad_norm": 0.08461961150169373, "learning_rate": 9.786947514328561e-06, "loss": 0.4689, "num_input_tokens_seen": 35696512, "step": 29360 }, { "epoch": 3.679363488284676, "grad_norm": 0.07805205136537552, "learning_rate": 9.78678959485171e-06, "loss": 0.4673, "num_input_tokens_seen": 35702912, "step": 29365 }, { "epoch": 3.6799899761934594, "grad_norm": 0.03975199535489082, "learning_rate": 9.786631618144769e-06, "loss": 0.47, "num_input_tokens_seen": 35709440, "step": 29370 }, { "epoch": 3.680616464102243, "grad_norm": 0.07610902935266495, "learning_rate": 9.786473584209618e-06, "loss": 0.4578, "num_input_tokens_seen": 35715840, "step": 29375 }, { "epoch": 3.681242952011026, "grad_norm": 0.14506031572818756, "learning_rate": 9.786315493048154e-06, "loss": 0.4608, "num_input_tokens_seen": 35721888, "step": 29380 }, { "epoch": 3.6818694399198097, "grad_norm": 0.07580409944057465, "learning_rate": 9.786157344662262e-06, "loss": 0.4661, "num_input_tokens_seen": 35728032, "step": 29385 }, { "epoch": 3.682495927828593, "grad_norm": 0.1281980723142624, "learning_rate": 9.785999139053836e-06, "loss": 0.462, "num_input_tokens_seen": 35734112, "step": 29390 }, { "epoch": 3.683122415737376, "grad_norm": 0.08741386234760284, "learning_rate": 9.785840876224765e-06, "loss": 0.4732, "num_input_tokens_seen": 35740032, "step": 29395 }, { "epoch": 3.6837489036461597, "grad_norm": 0.09641791880130768, "learning_rate": 9.785682556176945e-06, "loss": 0.4591, "num_input_tokens_seen": 35746400, "step": 29400 }, { "epoch": 3.684375391554943, "grad_norm": 0.10055436193943024, "learning_rate": 9.785524178912265e-06, "loss": 0.4611, "num_input_tokens_seen": 35752480, "step": 29405 }, { "epoch": 3.6850018794637265, "grad_norm": 0.07371658831834793, "learning_rate": 9.78536574443262e-06, "loss": 0.4614, "num_input_tokens_seen": 35758848, "step": 29410 }, { "epoch": 3.6856283673725097, "grad_norm": 0.08920559287071228, "learning_rate": 9.785207252739905e-06, "loss": 0.4633, "num_input_tokens_seen": 35765024, "step": 29415 }, { "epoch": 3.686254855281293, "grad_norm": 0.03608911857008934, "learning_rate": 9.785048703836014e-06, "loss": 0.4539, "num_input_tokens_seen": 35770144, "step": 29420 }, { "epoch": 3.6868813431900764, "grad_norm": 0.11282835900783539, "learning_rate": 9.784890097722841e-06, "loss": 0.462, "num_input_tokens_seen": 35776192, "step": 29425 }, { "epoch": 3.68750783109886, "grad_norm": 0.1224113255739212, "learning_rate": 9.784731434402286e-06, "loss": 0.4658, "num_input_tokens_seen": 35782240, "step": 29430 }, { "epoch": 3.688134319007643, "grad_norm": 0.16604074835777283, "learning_rate": 9.784572713876244e-06, "loss": 0.4645, "num_input_tokens_seen": 35788320, "step": 29435 }, { "epoch": 3.6887608069164264, "grad_norm": 0.039597004652023315, "learning_rate": 9.78441393614661e-06, "loss": 0.4648, "num_input_tokens_seen": 35794368, "step": 29440 }, { "epoch": 3.68938729482521, "grad_norm": 0.07401543855667114, "learning_rate": 9.784255101215288e-06, "loss": 0.4647, "num_input_tokens_seen": 35800544, "step": 29445 }, { "epoch": 3.690013782733993, "grad_norm": 0.07665064930915833, "learning_rate": 9.784096209084173e-06, "loss": 0.4639, "num_input_tokens_seen": 35806912, "step": 29450 }, { "epoch": 3.6906402706427768, "grad_norm": 0.0395168736577034, "learning_rate": 9.783937259755165e-06, "loss": 0.4625, "num_input_tokens_seen": 35813088, "step": 29455 }, { "epoch": 3.69126675855156, "grad_norm": 0.1631547063589096, "learning_rate": 9.783778253230165e-06, "loss": 0.4546, "num_input_tokens_seen": 35818944, "step": 29460 }, { "epoch": 3.691893246460343, "grad_norm": 0.16616258025169373, "learning_rate": 9.783619189511073e-06, "loss": 0.474, "num_input_tokens_seen": 35825248, "step": 29465 }, { "epoch": 3.6925197343691267, "grad_norm": 0.07669072598218918, "learning_rate": 9.783460068599793e-06, "loss": 0.4638, "num_input_tokens_seen": 35831424, "step": 29470 }, { "epoch": 3.69314622227791, "grad_norm": 0.1554194986820221, "learning_rate": 9.783300890498226e-06, "loss": 0.4612, "num_input_tokens_seen": 35837568, "step": 29475 }, { "epoch": 3.6937727101866935, "grad_norm": 0.039221085608005524, "learning_rate": 9.783141655208276e-06, "loss": 0.4666, "num_input_tokens_seen": 35844032, "step": 29480 }, { "epoch": 3.6943991980954767, "grad_norm": 0.10797885805368423, "learning_rate": 9.782982362731846e-06, "loss": 0.4653, "num_input_tokens_seen": 35850208, "step": 29485 }, { "epoch": 3.69502568600426, "grad_norm": 0.04312686249613762, "learning_rate": 9.782823013070838e-06, "loss": 0.4627, "num_input_tokens_seen": 35856544, "step": 29490 }, { "epoch": 3.6956521739130435, "grad_norm": 0.04496985673904419, "learning_rate": 9.782663606227163e-06, "loss": 0.4637, "num_input_tokens_seen": 35862624, "step": 29495 }, { "epoch": 3.696278661821827, "grad_norm": 0.08541540056467056, "learning_rate": 9.782504142202723e-06, "loss": 0.4613, "num_input_tokens_seen": 35868832, "step": 29500 }, { "epoch": 3.6969051497306102, "grad_norm": 0.08830703049898148, "learning_rate": 9.782344620999422e-06, "loss": 0.4576, "num_input_tokens_seen": 35875232, "step": 29505 }, { "epoch": 3.6975316376393934, "grad_norm": 0.11779561638832092, "learning_rate": 9.782185042619174e-06, "loss": 0.4625, "num_input_tokens_seen": 35880832, "step": 29510 }, { "epoch": 3.698158125548177, "grad_norm": 0.08100371062755585, "learning_rate": 9.78202540706388e-06, "loss": 0.4644, "num_input_tokens_seen": 35887008, "step": 29515 }, { "epoch": 3.69878461345696, "grad_norm": 0.07995513081550598, "learning_rate": 9.781865714335454e-06, "loss": 0.4675, "num_input_tokens_seen": 35893056, "step": 29520 }, { "epoch": 3.699411101365744, "grad_norm": 0.14717353880405426, "learning_rate": 9.781705964435802e-06, "loss": 0.4609, "num_input_tokens_seen": 35899072, "step": 29525 }, { "epoch": 3.700037589274527, "grad_norm": 0.07365436106920242, "learning_rate": 9.781546157366836e-06, "loss": 0.4674, "num_input_tokens_seen": 35905312, "step": 29530 }, { "epoch": 3.70066407718331, "grad_norm": 0.06924466788768768, "learning_rate": 9.781386293130465e-06, "loss": 0.4618, "num_input_tokens_seen": 35911136, "step": 29535 }, { "epoch": 3.7012905650920938, "grad_norm": 0.0392116941511631, "learning_rate": 9.7812263717286e-06, "loss": 0.4668, "num_input_tokens_seen": 35917152, "step": 29540 }, { "epoch": 3.701917053000877, "grad_norm": 0.09110503643751144, "learning_rate": 9.781066393163155e-06, "loss": 0.4586, "num_input_tokens_seen": 35923392, "step": 29545 }, { "epoch": 3.7025435409096605, "grad_norm": 0.08607850968837738, "learning_rate": 9.780906357436041e-06, "loss": 0.4647, "num_input_tokens_seen": 35929632, "step": 29550 }, { "epoch": 3.7031700288184437, "grad_norm": 0.08003892004489899, "learning_rate": 9.780746264549174e-06, "loss": 0.461, "num_input_tokens_seen": 35935648, "step": 29555 }, { "epoch": 3.7037965167272273, "grad_norm": 0.11439741402864456, "learning_rate": 9.780586114504463e-06, "loss": 0.4657, "num_input_tokens_seen": 35941376, "step": 29560 }, { "epoch": 3.7044230046360105, "grad_norm": 0.03807487338781357, "learning_rate": 9.780425907303828e-06, "loss": 0.4605, "num_input_tokens_seen": 35948000, "step": 29565 }, { "epoch": 3.705049492544794, "grad_norm": 0.0712510347366333, "learning_rate": 9.78026564294918e-06, "loss": 0.4682, "num_input_tokens_seen": 35954080, "step": 29570 }, { "epoch": 3.7056759804535773, "grad_norm": 0.09762091189622879, "learning_rate": 9.780105321442439e-06, "loss": 0.4609, "num_input_tokens_seen": 35960064, "step": 29575 }, { "epoch": 3.7063024683623604, "grad_norm": 0.07438108325004578, "learning_rate": 9.77994494278552e-06, "loss": 0.4585, "num_input_tokens_seen": 35966464, "step": 29580 }, { "epoch": 3.706928956271144, "grad_norm": 0.07927905768156052, "learning_rate": 9.77978450698034e-06, "loss": 0.4708, "num_input_tokens_seen": 35971968, "step": 29585 }, { "epoch": 3.707555444179927, "grad_norm": 0.14447098970413208, "learning_rate": 9.779624014028816e-06, "loss": 0.4603, "num_input_tokens_seen": 35978112, "step": 29590 }, { "epoch": 3.708181932088711, "grad_norm": 0.08750820904970169, "learning_rate": 9.77946346393287e-06, "loss": 0.468, "num_input_tokens_seen": 35984064, "step": 29595 }, { "epoch": 3.708808419997494, "grad_norm": 0.07803245633840561, "learning_rate": 9.779302856694419e-06, "loss": 0.4664, "num_input_tokens_seen": 35990208, "step": 29600 }, { "epoch": 3.709434907906277, "grad_norm": 0.07843317091464996, "learning_rate": 9.779142192315386e-06, "loss": 0.4578, "num_input_tokens_seen": 35996000, "step": 29605 }, { "epoch": 3.710061395815061, "grad_norm": 0.13478468358516693, "learning_rate": 9.778981470797689e-06, "loss": 0.4638, "num_input_tokens_seen": 36001536, "step": 29610 }, { "epoch": 3.7106878837238444, "grad_norm": 0.12272487580776215, "learning_rate": 9.778820692143249e-06, "loss": 0.4691, "num_input_tokens_seen": 36007776, "step": 29615 }, { "epoch": 3.7113143716326276, "grad_norm": 0.0893409252166748, "learning_rate": 9.778659856353992e-06, "loss": 0.4653, "num_input_tokens_seen": 36013760, "step": 29620 }, { "epoch": 3.7119408595414107, "grad_norm": 0.0803670734167099, "learning_rate": 9.778498963431838e-06, "loss": 0.4718, "num_input_tokens_seen": 36019712, "step": 29625 }, { "epoch": 3.7125673474501943, "grad_norm": 0.1277429312467575, "learning_rate": 9.77833801337871e-06, "loss": 0.4621, "num_input_tokens_seen": 36026176, "step": 29630 }, { "epoch": 3.7131938353589775, "grad_norm": 0.12266834080219269, "learning_rate": 9.778177006196537e-06, "loss": 0.4666, "num_input_tokens_seen": 36032576, "step": 29635 }, { "epoch": 3.713820323267761, "grad_norm": 0.0345231294631958, "learning_rate": 9.778015941887239e-06, "loss": 0.462, "num_input_tokens_seen": 36038688, "step": 29640 }, { "epoch": 3.7144468111765443, "grad_norm": 0.08211177587509155, "learning_rate": 9.77785482045274e-06, "loss": 0.4579, "num_input_tokens_seen": 36045024, "step": 29645 }, { "epoch": 3.7150732990853275, "grad_norm": 0.08403573930263519, "learning_rate": 9.777693641894975e-06, "loss": 0.4576, "num_input_tokens_seen": 36051104, "step": 29650 }, { "epoch": 3.715699786994111, "grad_norm": 0.06532410532236099, "learning_rate": 9.777532406215861e-06, "loss": 0.4612, "num_input_tokens_seen": 36057056, "step": 29655 }, { "epoch": 3.7163262749028942, "grad_norm": 0.08618395030498505, "learning_rate": 9.777371113417333e-06, "loss": 0.4636, "num_input_tokens_seen": 36062848, "step": 29660 }, { "epoch": 3.716952762811678, "grad_norm": 0.07398801296949387, "learning_rate": 9.777209763501316e-06, "loss": 0.461, "num_input_tokens_seen": 36069184, "step": 29665 }, { "epoch": 3.717579250720461, "grad_norm": 0.07638254761695862, "learning_rate": 9.777048356469741e-06, "loss": 0.4612, "num_input_tokens_seen": 36075136, "step": 29670 }, { "epoch": 3.718205738629244, "grad_norm": 0.11184941232204437, "learning_rate": 9.776886892324535e-06, "loss": 0.4589, "num_input_tokens_seen": 36081376, "step": 29675 }, { "epoch": 3.718832226538028, "grad_norm": 0.08540984988212585, "learning_rate": 9.77672537106763e-06, "loss": 0.4592, "num_input_tokens_seen": 36087296, "step": 29680 }, { "epoch": 3.7194587144468114, "grad_norm": 0.08371230959892273, "learning_rate": 9.776563792700958e-06, "loss": 0.4554, "num_input_tokens_seen": 36093728, "step": 29685 }, { "epoch": 3.7200852023555946, "grad_norm": 0.03884131833910942, "learning_rate": 9.77640215722645e-06, "loss": 0.4642, "num_input_tokens_seen": 36100064, "step": 29690 }, { "epoch": 3.7207116902643778, "grad_norm": 0.06836981326341629, "learning_rate": 9.776240464646038e-06, "loss": 0.4609, "num_input_tokens_seen": 36106272, "step": 29695 }, { "epoch": 3.7213381781731614, "grad_norm": 0.07370118796825409, "learning_rate": 9.776078714961657e-06, "loss": 0.4552, "num_input_tokens_seen": 36112736, "step": 29700 }, { "epoch": 3.7219646660819445, "grad_norm": 0.10479903966188431, "learning_rate": 9.775916908175237e-06, "loss": 0.4688, "num_input_tokens_seen": 36119168, "step": 29705 }, { "epoch": 3.722591153990728, "grad_norm": 0.0695377066731453, "learning_rate": 9.775755044288716e-06, "loss": 0.4664, "num_input_tokens_seen": 36125376, "step": 29710 }, { "epoch": 3.7232176418995113, "grad_norm": 0.10013659298419952, "learning_rate": 9.775593123304029e-06, "loss": 0.4678, "num_input_tokens_seen": 36131488, "step": 29715 }, { "epoch": 3.7238441298082945, "grad_norm": 0.11493632942438126, "learning_rate": 9.77543114522311e-06, "loss": 0.4628, "num_input_tokens_seen": 36137472, "step": 29720 }, { "epoch": 3.724470617717078, "grad_norm": 0.11307349056005478, "learning_rate": 9.775269110047897e-06, "loss": 0.4632, "num_input_tokens_seen": 36143936, "step": 29725 }, { "epoch": 3.7250971056258613, "grad_norm": 0.03816290572285652, "learning_rate": 9.775107017780328e-06, "loss": 0.4572, "num_input_tokens_seen": 36150176, "step": 29730 }, { "epoch": 3.725723593534645, "grad_norm": 0.1251250058412552, "learning_rate": 9.774944868422337e-06, "loss": 0.4561, "num_input_tokens_seen": 36156192, "step": 29735 }, { "epoch": 3.726350081443428, "grad_norm": 0.03666979447007179, "learning_rate": 9.77478266197587e-06, "loss": 0.4632, "num_input_tokens_seen": 36162176, "step": 29740 }, { "epoch": 3.7269765693522117, "grad_norm": 0.07082537561655045, "learning_rate": 9.77462039844286e-06, "loss": 0.4645, "num_input_tokens_seen": 36167840, "step": 29745 }, { "epoch": 3.727603057260995, "grad_norm": 0.1451917439699173, "learning_rate": 9.774458077825248e-06, "loss": 0.4654, "num_input_tokens_seen": 36174176, "step": 29750 }, { "epoch": 3.7282295451697784, "grad_norm": 0.09022406488656998, "learning_rate": 9.774295700124974e-06, "loss": 0.4609, "num_input_tokens_seen": 36180352, "step": 29755 }, { "epoch": 3.7288560330785616, "grad_norm": 0.12265366315841675, "learning_rate": 9.774133265343984e-06, "loss": 0.4563, "num_input_tokens_seen": 36186496, "step": 29760 }, { "epoch": 3.729482520987345, "grad_norm": 0.07294722646474838, "learning_rate": 9.773970773484215e-06, "loss": 0.466, "num_input_tokens_seen": 36192608, "step": 29765 }, { "epoch": 3.7301090088961284, "grad_norm": 0.12727932631969452, "learning_rate": 9.773808224547612e-06, "loss": 0.4691, "num_input_tokens_seen": 36199008, "step": 29770 }, { "epoch": 3.7307354968049116, "grad_norm": 0.07900724560022354, "learning_rate": 9.77364561853612e-06, "loss": 0.4581, "num_input_tokens_seen": 36205120, "step": 29775 }, { "epoch": 3.731361984713695, "grad_norm": 0.18237893283367157, "learning_rate": 9.773482955451678e-06, "loss": 0.4599, "num_input_tokens_seen": 36210816, "step": 29780 }, { "epoch": 3.7319884726224783, "grad_norm": 0.07432878017425537, "learning_rate": 9.773320235296237e-06, "loss": 0.4617, "num_input_tokens_seen": 36217024, "step": 29785 }, { "epoch": 3.7326149605312615, "grad_norm": 0.12767747044563293, "learning_rate": 9.773157458071737e-06, "loss": 0.4655, "num_input_tokens_seen": 36223520, "step": 29790 }, { "epoch": 3.733241448440045, "grad_norm": 0.07644554227590561, "learning_rate": 9.772994623780127e-06, "loss": 0.4634, "num_input_tokens_seen": 36229408, "step": 29795 }, { "epoch": 3.7338679363488287, "grad_norm": 0.07656744867563248, "learning_rate": 9.772831732423356e-06, "loss": 0.4649, "num_input_tokens_seen": 36235712, "step": 29800 }, { "epoch": 3.734494424257612, "grad_norm": 0.06638786196708679, "learning_rate": 9.772668784003365e-06, "loss": 0.4531, "num_input_tokens_seen": 36241632, "step": 29805 }, { "epoch": 3.735120912166395, "grad_norm": 0.06962791830301285, "learning_rate": 9.772505778522107e-06, "loss": 0.4696, "num_input_tokens_seen": 36247616, "step": 29810 }, { "epoch": 3.7357474000751787, "grad_norm": 0.07831615209579468, "learning_rate": 9.772342715981532e-06, "loss": 0.4653, "num_input_tokens_seen": 36253632, "step": 29815 }, { "epoch": 3.736373887983962, "grad_norm": 0.07337120175361633, "learning_rate": 9.772179596383586e-06, "loss": 0.4625, "num_input_tokens_seen": 36259744, "step": 29820 }, { "epoch": 3.7370003758927455, "grad_norm": 0.06764040887355804, "learning_rate": 9.772016419730222e-06, "loss": 0.4616, "num_input_tokens_seen": 36265984, "step": 29825 }, { "epoch": 3.7376268638015286, "grad_norm": 0.10840670764446259, "learning_rate": 9.771853186023387e-06, "loss": 0.4565, "num_input_tokens_seen": 36271968, "step": 29830 }, { "epoch": 3.738253351710312, "grad_norm": 0.07357960939407349, "learning_rate": 9.771689895265038e-06, "loss": 0.4621, "num_input_tokens_seen": 36278080, "step": 29835 }, { "epoch": 3.7388798396190954, "grad_norm": 0.1012539193034172, "learning_rate": 9.771526547457122e-06, "loss": 0.4538, "num_input_tokens_seen": 36283872, "step": 29840 }, { "epoch": 3.7395063275278786, "grad_norm": 0.13902147114276886, "learning_rate": 9.771363142601596e-06, "loss": 0.472, "num_input_tokens_seen": 36289664, "step": 29845 }, { "epoch": 3.740132815436662, "grad_norm": 0.07176724821329117, "learning_rate": 9.771199680700411e-06, "loss": 0.462, "num_input_tokens_seen": 36296064, "step": 29850 }, { "epoch": 3.7407593033454454, "grad_norm": 0.07163460552692413, "learning_rate": 9.771036161755526e-06, "loss": 0.4565, "num_input_tokens_seen": 36302176, "step": 29855 }, { "epoch": 3.7413857912542285, "grad_norm": 0.07571857422590256, "learning_rate": 9.770872585768887e-06, "loss": 0.4617, "num_input_tokens_seen": 36308384, "step": 29860 }, { "epoch": 3.742012279163012, "grad_norm": 0.12142820656299591, "learning_rate": 9.770708952742459e-06, "loss": 0.472, "num_input_tokens_seen": 36314560, "step": 29865 }, { "epoch": 3.7426387670717958, "grad_norm": 0.09736408293247223, "learning_rate": 9.770545262678192e-06, "loss": 0.4604, "num_input_tokens_seen": 36320640, "step": 29870 }, { "epoch": 3.743265254980579, "grad_norm": 0.08445612341165543, "learning_rate": 9.770381515578047e-06, "loss": 0.4586, "num_input_tokens_seen": 36326976, "step": 29875 }, { "epoch": 3.743891742889362, "grad_norm": 0.11685633659362793, "learning_rate": 9.77021771144398e-06, "loss": 0.452, "num_input_tokens_seen": 36333056, "step": 29880 }, { "epoch": 3.7445182307981457, "grad_norm": 0.08492836356163025, "learning_rate": 9.77005385027795e-06, "loss": 0.4562, "num_input_tokens_seen": 36339584, "step": 29885 }, { "epoch": 3.745144718706929, "grad_norm": 0.0932098776102066, "learning_rate": 9.769889932081915e-06, "loss": 0.4667, "num_input_tokens_seen": 36345920, "step": 29890 }, { "epoch": 3.7457712066157125, "grad_norm": 0.08379829674959183, "learning_rate": 9.769725956857836e-06, "loss": 0.461, "num_input_tokens_seen": 36352032, "step": 29895 }, { "epoch": 3.7463976945244957, "grad_norm": 0.17568272352218628, "learning_rate": 9.769561924607674e-06, "loss": 0.4609, "num_input_tokens_seen": 36358720, "step": 29900 }, { "epoch": 3.747024182433279, "grad_norm": 0.13483686745166779, "learning_rate": 9.769397835333388e-06, "loss": 0.4666, "num_input_tokens_seen": 36364512, "step": 29905 }, { "epoch": 3.7476506703420625, "grad_norm": 0.1946694254875183, "learning_rate": 9.769233689036942e-06, "loss": 0.4613, "num_input_tokens_seen": 36370720, "step": 29910 }, { "epoch": 3.7482771582508456, "grad_norm": 0.1269778311252594, "learning_rate": 9.769069485720294e-06, "loss": 0.4563, "num_input_tokens_seen": 36376992, "step": 29915 }, { "epoch": 3.7489036461596292, "grad_norm": 0.08976256847381592, "learning_rate": 9.768905225385414e-06, "loss": 0.4724, "num_input_tokens_seen": 36383232, "step": 29920 }, { "epoch": 3.7495301340684124, "grad_norm": 0.09495388716459274, "learning_rate": 9.768740908034262e-06, "loss": 0.4607, "num_input_tokens_seen": 36389376, "step": 29925 }, { "epoch": 3.7501566219771956, "grad_norm": 0.09218853712081909, "learning_rate": 9.768576533668803e-06, "loss": 0.4678, "num_input_tokens_seen": 36395584, "step": 29930 }, { "epoch": 3.750783109885979, "grad_norm": 0.05103639140725136, "learning_rate": 9.768412102291003e-06, "loss": 0.4656, "num_input_tokens_seen": 36401824, "step": 29935 }, { "epoch": 3.751409597794763, "grad_norm": 0.09387136250734329, "learning_rate": 9.768247613902827e-06, "loss": 0.4563, "num_input_tokens_seen": 36407808, "step": 29940 }, { "epoch": 3.752036085703546, "grad_norm": 0.08822941780090332, "learning_rate": 9.76808306850624e-06, "loss": 0.4692, "num_input_tokens_seen": 36413952, "step": 29945 }, { "epoch": 3.752662573612329, "grad_norm": 0.12934672832489014, "learning_rate": 9.767918466103213e-06, "loss": 0.4663, "num_input_tokens_seen": 36420064, "step": 29950 }, { "epoch": 3.7532890615211127, "grad_norm": 0.04061301052570343, "learning_rate": 9.767753806695712e-06, "loss": 0.4596, "num_input_tokens_seen": 36426400, "step": 29955 }, { "epoch": 3.753915549429896, "grad_norm": 0.15643325448036194, "learning_rate": 9.767589090285705e-06, "loss": 0.4642, "num_input_tokens_seen": 36432640, "step": 29960 }, { "epoch": 3.7545420373386795, "grad_norm": 0.17420336604118347, "learning_rate": 9.767424316875162e-06, "loss": 0.4694, "num_input_tokens_seen": 36439008, "step": 29965 }, { "epoch": 3.7551685252474627, "grad_norm": 0.14466553926467896, "learning_rate": 9.767259486466055e-06, "loss": 0.4621, "num_input_tokens_seen": 36445472, "step": 29970 }, { "epoch": 3.755795013156246, "grad_norm": 0.08908678591251373, "learning_rate": 9.767094599060349e-06, "loss": 0.465, "num_input_tokens_seen": 36451680, "step": 29975 }, { "epoch": 3.7564215010650295, "grad_norm": 0.09060999751091003, "learning_rate": 9.766929654660022e-06, "loss": 0.462, "num_input_tokens_seen": 36457152, "step": 29980 }, { "epoch": 3.7570479889738126, "grad_norm": 0.09490913152694702, "learning_rate": 9.766764653267043e-06, "loss": 0.4754, "num_input_tokens_seen": 36462976, "step": 29985 }, { "epoch": 3.7576744768825963, "grad_norm": 0.08164790272712708, "learning_rate": 9.766599594883383e-06, "loss": 0.4596, "num_input_tokens_seen": 36468736, "step": 29990 }, { "epoch": 3.7583009647913794, "grad_norm": 0.08292829245328903, "learning_rate": 9.76643447951102e-06, "loss": 0.4684, "num_input_tokens_seen": 36474816, "step": 29995 }, { "epoch": 3.758927452700163, "grad_norm": 0.09340325742959976, "learning_rate": 9.766269307151924e-06, "loss": 0.462, "num_input_tokens_seen": 36480992, "step": 30000 }, { "epoch": 3.759553940608946, "grad_norm": 0.03778033331036568, "learning_rate": 9.766104077808071e-06, "loss": 0.4632, "num_input_tokens_seen": 36487488, "step": 30005 }, { "epoch": 3.76018042851773, "grad_norm": 0.12922073900699615, "learning_rate": 9.765938791481438e-06, "loss": 0.4611, "num_input_tokens_seen": 36493248, "step": 30010 }, { "epoch": 3.760806916426513, "grad_norm": 0.12479933351278305, "learning_rate": 9.765773448174e-06, "loss": 0.4647, "num_input_tokens_seen": 36499392, "step": 30015 }, { "epoch": 3.761433404335296, "grad_norm": 0.04134577885270119, "learning_rate": 9.765608047887731e-06, "loss": 0.4654, "num_input_tokens_seen": 36505376, "step": 30020 }, { "epoch": 3.7620598922440798, "grad_norm": 0.1323491334915161, "learning_rate": 9.765442590624612e-06, "loss": 0.4576, "num_input_tokens_seen": 36511328, "step": 30025 }, { "epoch": 3.762686380152863, "grad_norm": 0.12821413576602936, "learning_rate": 9.76527707638662e-06, "loss": 0.4707, "num_input_tokens_seen": 36517632, "step": 30030 }, { "epoch": 3.7633128680616466, "grad_norm": 0.06766380369663239, "learning_rate": 9.765111505175734e-06, "loss": 0.4654, "num_input_tokens_seen": 36523872, "step": 30035 }, { "epoch": 3.7639393559704297, "grad_norm": 0.0711369663476944, "learning_rate": 9.764945876993936e-06, "loss": 0.4551, "num_input_tokens_seen": 36529984, "step": 30040 }, { "epoch": 3.764565843879213, "grad_norm": 0.08157332241535187, "learning_rate": 9.764780191843202e-06, "loss": 0.4665, "num_input_tokens_seen": 36536064, "step": 30045 }, { "epoch": 3.7651923317879965, "grad_norm": 0.1276116520166397, "learning_rate": 9.764614449725516e-06, "loss": 0.4567, "num_input_tokens_seen": 36542336, "step": 30050 }, { "epoch": 3.76581881969678, "grad_norm": 0.03661849722266197, "learning_rate": 9.764448650642857e-06, "loss": 0.462, "num_input_tokens_seen": 36548416, "step": 30055 }, { "epoch": 3.7664453076055633, "grad_norm": 0.0358116440474987, "learning_rate": 9.764282794597208e-06, "loss": 0.47, "num_input_tokens_seen": 36554560, "step": 30060 }, { "epoch": 3.7670717955143465, "grad_norm": 0.08462806046009064, "learning_rate": 9.764116881590556e-06, "loss": 0.475, "num_input_tokens_seen": 36560480, "step": 30065 }, { "epoch": 3.76769828342313, "grad_norm": 0.10399162769317627, "learning_rate": 9.763950911624878e-06, "loss": 0.4606, "num_input_tokens_seen": 36566688, "step": 30070 }, { "epoch": 3.7683247713319132, "grad_norm": 0.0717008113861084, "learning_rate": 9.763784884702162e-06, "loss": 0.462, "num_input_tokens_seen": 36572896, "step": 30075 }, { "epoch": 3.768951259240697, "grad_norm": 0.1047564148902893, "learning_rate": 9.763618800824393e-06, "loss": 0.462, "num_input_tokens_seen": 36579200, "step": 30080 }, { "epoch": 3.76957774714948, "grad_norm": 0.0867842435836792, "learning_rate": 9.763452659993557e-06, "loss": 0.4584, "num_input_tokens_seen": 36584864, "step": 30085 }, { "epoch": 3.770204235058263, "grad_norm": 0.07377869635820389, "learning_rate": 9.76328646221164e-06, "loss": 0.4595, "num_input_tokens_seen": 36590624, "step": 30090 }, { "epoch": 3.770830722967047, "grad_norm": 0.08264719694852829, "learning_rate": 9.763120207480625e-06, "loss": 0.4636, "num_input_tokens_seen": 36596800, "step": 30095 }, { "epoch": 3.77145721087583, "grad_norm": 0.09196934103965759, "learning_rate": 9.762953895802507e-06, "loss": 0.461, "num_input_tokens_seen": 36603136, "step": 30100 }, { "epoch": 3.7720836987846136, "grad_norm": 0.08837004750967026, "learning_rate": 9.76278752717927e-06, "loss": 0.4622, "num_input_tokens_seen": 36608704, "step": 30105 }, { "epoch": 3.7727101866933968, "grad_norm": 0.11402586847543716, "learning_rate": 9.762621101612903e-06, "loss": 0.4699, "num_input_tokens_seen": 36614656, "step": 30110 }, { "epoch": 3.77333667460218, "grad_norm": 0.09596092998981476, "learning_rate": 9.762454619105396e-06, "loss": 0.4636, "num_input_tokens_seen": 36620544, "step": 30115 }, { "epoch": 3.7739631625109635, "grad_norm": 0.08323447406291962, "learning_rate": 9.762288079658741e-06, "loss": 0.4609, "num_input_tokens_seen": 36626592, "step": 30120 }, { "epoch": 3.774589650419747, "grad_norm": 0.10257494449615479, "learning_rate": 9.762121483274928e-06, "loss": 0.4588, "num_input_tokens_seen": 36633024, "step": 30125 }, { "epoch": 3.7752161383285303, "grad_norm": 0.07716763019561768, "learning_rate": 9.761954829955948e-06, "loss": 0.4589, "num_input_tokens_seen": 36638944, "step": 30130 }, { "epoch": 3.7758426262373135, "grad_norm": 0.08044818788766861, "learning_rate": 9.761788119703796e-06, "loss": 0.4602, "num_input_tokens_seen": 36645088, "step": 30135 }, { "epoch": 3.776469114146097, "grad_norm": 0.12948352098464966, "learning_rate": 9.761621352520463e-06, "loss": 0.4623, "num_input_tokens_seen": 36651552, "step": 30140 }, { "epoch": 3.7770956020548803, "grad_norm": 0.09349144995212555, "learning_rate": 9.761454528407942e-06, "loss": 0.4613, "num_input_tokens_seen": 36658016, "step": 30145 }, { "epoch": 3.777722089963664, "grad_norm": 0.08570186048746109, "learning_rate": 9.761287647368229e-06, "loss": 0.4665, "num_input_tokens_seen": 36664288, "step": 30150 }, { "epoch": 3.778348577872447, "grad_norm": 0.1838042438030243, "learning_rate": 9.76112070940332e-06, "loss": 0.4634, "num_input_tokens_seen": 36670176, "step": 30155 }, { "epoch": 3.77897506578123, "grad_norm": 0.07893810421228409, "learning_rate": 9.760953714515212e-06, "loss": 0.467, "num_input_tokens_seen": 36675872, "step": 30160 }, { "epoch": 3.779601553690014, "grad_norm": 0.11436312645673752, "learning_rate": 9.760786662705896e-06, "loss": 0.4674, "num_input_tokens_seen": 36682336, "step": 30165 }, { "epoch": 3.780228041598797, "grad_norm": 0.03629956766963005, "learning_rate": 9.760619553977376e-06, "loss": 0.4638, "num_input_tokens_seen": 36688544, "step": 30170 }, { "epoch": 3.7808545295075806, "grad_norm": 0.1257452368736267, "learning_rate": 9.760452388331644e-06, "loss": 0.4668, "num_input_tokens_seen": 36694784, "step": 30175 }, { "epoch": 3.7814810174163638, "grad_norm": 0.13881337642669678, "learning_rate": 9.760285165770702e-06, "loss": 0.46, "num_input_tokens_seen": 36700896, "step": 30180 }, { "epoch": 3.7821075053251474, "grad_norm": 0.032391563057899475, "learning_rate": 9.76011788629655e-06, "loss": 0.4614, "num_input_tokens_seen": 36706848, "step": 30185 }, { "epoch": 3.7827339932339306, "grad_norm": 0.14833612740039825, "learning_rate": 9.759950549911185e-06, "loss": 0.4611, "num_input_tokens_seen": 36712832, "step": 30190 }, { "epoch": 3.783360481142714, "grad_norm": 0.07557941973209381, "learning_rate": 9.75978315661661e-06, "loss": 0.461, "num_input_tokens_seen": 36719104, "step": 30195 }, { "epoch": 3.7839869690514973, "grad_norm": 0.11008267104625702, "learning_rate": 9.759615706414826e-06, "loss": 0.462, "num_input_tokens_seen": 36725344, "step": 30200 }, { "epoch": 3.7846134569602805, "grad_norm": 0.08569968491792679, "learning_rate": 9.759448199307833e-06, "loss": 0.4625, "num_input_tokens_seen": 36731328, "step": 30205 }, { "epoch": 3.785239944869064, "grad_norm": 0.06450507044792175, "learning_rate": 9.759280635297636e-06, "loss": 0.4593, "num_input_tokens_seen": 36737536, "step": 30210 }, { "epoch": 3.7858664327778473, "grad_norm": 0.07921823859214783, "learning_rate": 9.75911301438624e-06, "loss": 0.4615, "num_input_tokens_seen": 36743488, "step": 30215 }, { "epoch": 3.786492920686631, "grad_norm": 0.08077870309352875, "learning_rate": 9.758945336575642e-06, "loss": 0.4603, "num_input_tokens_seen": 36749376, "step": 30220 }, { "epoch": 3.787119408595414, "grad_norm": 0.0942002534866333, "learning_rate": 9.758777601867856e-06, "loss": 0.4635, "num_input_tokens_seen": 36755360, "step": 30225 }, { "epoch": 3.7877458965041972, "grad_norm": 0.08009948581457138, "learning_rate": 9.758609810264881e-06, "loss": 0.4637, "num_input_tokens_seen": 36761568, "step": 30230 }, { "epoch": 3.788372384412981, "grad_norm": 0.08820999413728714, "learning_rate": 9.758441961768724e-06, "loss": 0.4636, "num_input_tokens_seen": 36767680, "step": 30235 }, { "epoch": 3.7889988723217645, "grad_norm": 0.10961107164621353, "learning_rate": 9.758274056381393e-06, "loss": 0.4636, "num_input_tokens_seen": 36773984, "step": 30240 }, { "epoch": 3.7896253602305476, "grad_norm": 0.1430457979440689, "learning_rate": 9.758106094104895e-06, "loss": 0.4621, "num_input_tokens_seen": 36780160, "step": 30245 }, { "epoch": 3.790251848139331, "grad_norm": 0.10441213846206665, "learning_rate": 9.757938074941239e-06, "loss": 0.4637, "num_input_tokens_seen": 36786432, "step": 30250 }, { "epoch": 3.7908783360481144, "grad_norm": 0.08504285663366318, "learning_rate": 9.757769998892433e-06, "loss": 0.4644, "num_input_tokens_seen": 36792960, "step": 30255 }, { "epoch": 3.7915048239568976, "grad_norm": 0.03591226413846016, "learning_rate": 9.757601865960487e-06, "loss": 0.459, "num_input_tokens_seen": 36798400, "step": 30260 }, { "epoch": 3.792131311865681, "grad_norm": 0.07546018809080124, "learning_rate": 9.75743367614741e-06, "loss": 0.4622, "num_input_tokens_seen": 36804032, "step": 30265 }, { "epoch": 3.7927577997744644, "grad_norm": 0.042151015251874924, "learning_rate": 9.757265429455213e-06, "loss": 0.4692, "num_input_tokens_seen": 36810400, "step": 30270 }, { "epoch": 3.7933842876832475, "grad_norm": 0.10982227325439453, "learning_rate": 9.757097125885908e-06, "loss": 0.4554, "num_input_tokens_seen": 36816576, "step": 30275 }, { "epoch": 3.794010775592031, "grad_norm": 0.050718653947114944, "learning_rate": 9.756928765441509e-06, "loss": 0.4626, "num_input_tokens_seen": 36822464, "step": 30280 }, { "epoch": 3.7946372635008143, "grad_norm": 0.1012769490480423, "learning_rate": 9.756760348124026e-06, "loss": 0.4704, "num_input_tokens_seen": 36828512, "step": 30285 }, { "epoch": 3.795263751409598, "grad_norm": 0.1123034805059433, "learning_rate": 9.756591873935474e-06, "loss": 0.4653, "num_input_tokens_seen": 36834528, "step": 30290 }, { "epoch": 3.795890239318381, "grad_norm": 0.07695945352315903, "learning_rate": 9.756423342877866e-06, "loss": 0.4624, "num_input_tokens_seen": 36840736, "step": 30295 }, { "epoch": 3.7965167272271643, "grad_norm": 0.08216974139213562, "learning_rate": 9.75625475495322e-06, "loss": 0.4653, "num_input_tokens_seen": 36847040, "step": 30300 }, { "epoch": 3.797143215135948, "grad_norm": 0.09007388353347778, "learning_rate": 9.756086110163548e-06, "loss": 0.4618, "num_input_tokens_seen": 36853216, "step": 30305 }, { "epoch": 3.7977697030447315, "grad_norm": 0.07444711774587631, "learning_rate": 9.755917408510869e-06, "loss": 0.4636, "num_input_tokens_seen": 36859296, "step": 30310 }, { "epoch": 3.7983961909535147, "grad_norm": 0.06619519740343094, "learning_rate": 9.755748649997197e-06, "loss": 0.4646, "num_input_tokens_seen": 36865216, "step": 30315 }, { "epoch": 3.799022678862298, "grad_norm": 0.06453549861907959, "learning_rate": 9.755579834624553e-06, "loss": 0.4646, "num_input_tokens_seen": 36871328, "step": 30320 }, { "epoch": 3.7996491667710814, "grad_norm": 0.07442791014909744, "learning_rate": 9.755410962394954e-06, "loss": 0.4647, "num_input_tokens_seen": 36877536, "step": 30325 }, { "epoch": 3.8002756546798646, "grad_norm": 0.12629349529743195, "learning_rate": 9.755242033310419e-06, "loss": 0.4618, "num_input_tokens_seen": 36883424, "step": 30330 }, { "epoch": 3.8009021425886482, "grad_norm": 0.07178789377212524, "learning_rate": 9.755073047372966e-06, "loss": 0.4645, "num_input_tokens_seen": 36889664, "step": 30335 }, { "epoch": 3.8015286304974314, "grad_norm": 0.10922196507453918, "learning_rate": 9.754904004584618e-06, "loss": 0.4604, "num_input_tokens_seen": 36895488, "step": 30340 }, { "epoch": 3.8021551184062146, "grad_norm": 0.1135164126753807, "learning_rate": 9.754734904947393e-06, "loss": 0.4614, "num_input_tokens_seen": 36901728, "step": 30345 }, { "epoch": 3.802781606314998, "grad_norm": 0.08530556410551071, "learning_rate": 9.754565748463317e-06, "loss": 0.4613, "num_input_tokens_seen": 36907712, "step": 30350 }, { "epoch": 3.8034080942237813, "grad_norm": 0.1266617476940155, "learning_rate": 9.75439653513441e-06, "loss": 0.4644, "num_input_tokens_seen": 36914240, "step": 30355 }, { "epoch": 3.804034582132565, "grad_norm": 0.08753615617752075, "learning_rate": 9.754227264962695e-06, "loss": 0.4643, "num_input_tokens_seen": 36920544, "step": 30360 }, { "epoch": 3.804661070041348, "grad_norm": 0.11750606447458267, "learning_rate": 9.754057937950194e-06, "loss": 0.4618, "num_input_tokens_seen": 36926464, "step": 30365 }, { "epoch": 3.8052875579501313, "grad_norm": 0.14296025037765503, "learning_rate": 9.753888554098936e-06, "loss": 0.4636, "num_input_tokens_seen": 36932672, "step": 30370 }, { "epoch": 3.805914045858915, "grad_norm": 0.07216832041740417, "learning_rate": 9.753719113410942e-06, "loss": 0.4583, "num_input_tokens_seen": 36938752, "step": 30375 }, { "epoch": 3.8065405337676985, "grad_norm": 0.07948753237724304, "learning_rate": 9.753549615888239e-06, "loss": 0.4609, "num_input_tokens_seen": 36945056, "step": 30380 }, { "epoch": 3.8071670216764817, "grad_norm": 0.07945086807012558, "learning_rate": 9.753380061532855e-06, "loss": 0.4552, "num_input_tokens_seen": 36951648, "step": 30385 }, { "epoch": 3.807793509585265, "grad_norm": 0.15930645167827606, "learning_rate": 9.753210450346816e-06, "loss": 0.45, "num_input_tokens_seen": 36957664, "step": 30390 }, { "epoch": 3.8084199974940485, "grad_norm": 0.07715453207492828, "learning_rate": 9.753040782332147e-06, "loss": 0.462, "num_input_tokens_seen": 36963744, "step": 30395 }, { "epoch": 3.8090464854028316, "grad_norm": 0.14204449951648712, "learning_rate": 9.752871057490882e-06, "loss": 0.4586, "num_input_tokens_seen": 36969856, "step": 30400 }, { "epoch": 3.8096729733116153, "grad_norm": 0.07811912894248962, "learning_rate": 9.752701275825048e-06, "loss": 0.4623, "num_input_tokens_seen": 36976032, "step": 30405 }, { "epoch": 3.8102994612203984, "grad_norm": 0.14601555466651917, "learning_rate": 9.752531437336672e-06, "loss": 0.4692, "num_input_tokens_seen": 36982016, "step": 30410 }, { "epoch": 3.8109259491291816, "grad_norm": 0.08677604794502258, "learning_rate": 9.752361542027789e-06, "loss": 0.4733, "num_input_tokens_seen": 36988000, "step": 30415 }, { "epoch": 3.811552437037965, "grad_norm": 0.10261351615190506, "learning_rate": 9.752191589900426e-06, "loss": 0.4555, "num_input_tokens_seen": 36993920, "step": 30420 }, { "epoch": 3.8121789249467484, "grad_norm": 0.16168369352817535, "learning_rate": 9.752021580956619e-06, "loss": 0.4495, "num_input_tokens_seen": 37000064, "step": 30425 }, { "epoch": 3.812805412855532, "grad_norm": 0.1471690833568573, "learning_rate": 9.751851515198398e-06, "loss": 0.4578, "num_input_tokens_seen": 37006272, "step": 30430 }, { "epoch": 3.813431900764315, "grad_norm": 0.1216726154088974, "learning_rate": 9.751681392627798e-06, "loss": 0.4646, "num_input_tokens_seen": 37012032, "step": 30435 }, { "epoch": 3.8140583886730988, "grad_norm": 0.11821331083774567, "learning_rate": 9.75151121324685e-06, "loss": 0.4791, "num_input_tokens_seen": 37018112, "step": 30440 }, { "epoch": 3.814684876581882, "grad_norm": 0.04953275993466377, "learning_rate": 9.751340977057592e-06, "loss": 0.4691, "num_input_tokens_seen": 37023936, "step": 30445 }, { "epoch": 3.8153113644906655, "grad_norm": 0.11976177990436554, "learning_rate": 9.751170684062059e-06, "loss": 0.4648, "num_input_tokens_seen": 37030272, "step": 30450 }, { "epoch": 3.8159378523994487, "grad_norm": 0.09363850206136703, "learning_rate": 9.751000334262285e-06, "loss": 0.4635, "num_input_tokens_seen": 37035648, "step": 30455 }, { "epoch": 3.816564340308232, "grad_norm": 0.03857824206352234, "learning_rate": 9.750829927660305e-06, "loss": 0.4559, "num_input_tokens_seen": 37041760, "step": 30460 }, { "epoch": 3.8171908282170155, "grad_norm": 0.12183228135108948, "learning_rate": 9.750659464258161e-06, "loss": 0.467, "num_input_tokens_seen": 37047712, "step": 30465 }, { "epoch": 3.8178173161257987, "grad_norm": 0.08338393270969391, "learning_rate": 9.75048894405789e-06, "loss": 0.4637, "num_input_tokens_seen": 37053824, "step": 30470 }, { "epoch": 3.8184438040345823, "grad_norm": 0.10128836333751678, "learning_rate": 9.75031836706153e-06, "loss": 0.4591, "num_input_tokens_seen": 37059840, "step": 30475 }, { "epoch": 3.8190702919433654, "grad_norm": 0.07469779998064041, "learning_rate": 9.750147733271119e-06, "loss": 0.4574, "num_input_tokens_seen": 37065888, "step": 30480 }, { "epoch": 3.8196967798521486, "grad_norm": 0.10862994939088821, "learning_rate": 9.749977042688696e-06, "loss": 0.4647, "num_input_tokens_seen": 37071872, "step": 30485 }, { "epoch": 3.8203232677609322, "grad_norm": 0.07453437149524689, "learning_rate": 9.749806295316307e-06, "loss": 0.4723, "num_input_tokens_seen": 37078176, "step": 30490 }, { "epoch": 3.820949755669716, "grad_norm": 0.09608887135982513, "learning_rate": 9.74963549115599e-06, "loss": 0.4634, "num_input_tokens_seen": 37084064, "step": 30495 }, { "epoch": 3.821576243578499, "grad_norm": 0.06945820152759552, "learning_rate": 9.749464630209786e-06, "loss": 0.4616, "num_input_tokens_seen": 37090304, "step": 30500 }, { "epoch": 3.822202731487282, "grad_norm": 0.07371725887060165, "learning_rate": 9.74929371247974e-06, "loss": 0.4565, "num_input_tokens_seen": 37096416, "step": 30505 }, { "epoch": 3.822829219396066, "grad_norm": 0.0712088868021965, "learning_rate": 9.749122737967894e-06, "loss": 0.4574, "num_input_tokens_seen": 37102560, "step": 30510 }, { "epoch": 3.823455707304849, "grad_norm": 0.11263247579336166, "learning_rate": 9.748951706676293e-06, "loss": 0.4638, "num_input_tokens_seen": 37108736, "step": 30515 }, { "epoch": 3.8240821952136326, "grad_norm": 0.08305264264345169, "learning_rate": 9.748780618606982e-06, "loss": 0.4731, "num_input_tokens_seen": 37114848, "step": 30520 }, { "epoch": 3.8247086831224157, "grad_norm": 0.10168018192052841, "learning_rate": 9.748609473762007e-06, "loss": 0.4669, "num_input_tokens_seen": 37120416, "step": 30525 }, { "epoch": 3.825335171031199, "grad_norm": 0.15421518683433533, "learning_rate": 9.748438272143412e-06, "loss": 0.4674, "num_input_tokens_seen": 37126528, "step": 30530 }, { "epoch": 3.8259616589399825, "grad_norm": 0.04529736563563347, "learning_rate": 9.748267013753246e-06, "loss": 0.4624, "num_input_tokens_seen": 37132480, "step": 30535 }, { "epoch": 3.8265881468487657, "grad_norm": 0.039278000593185425, "learning_rate": 9.748095698593554e-06, "loss": 0.4627, "num_input_tokens_seen": 37138432, "step": 30540 }, { "epoch": 3.8272146347575493, "grad_norm": 0.0883333832025528, "learning_rate": 9.747924326666389e-06, "loss": 0.4631, "num_input_tokens_seen": 37144576, "step": 30545 }, { "epoch": 3.8278411226663325, "grad_norm": 0.09337612986564636, "learning_rate": 9.747752897973794e-06, "loss": 0.459, "num_input_tokens_seen": 37150976, "step": 30550 }, { "epoch": 3.8284676105751156, "grad_norm": 0.1057676374912262, "learning_rate": 9.747581412517823e-06, "loss": 0.4658, "num_input_tokens_seen": 37157248, "step": 30555 }, { "epoch": 3.8290940984838993, "grad_norm": 0.03511582687497139, "learning_rate": 9.747409870300524e-06, "loss": 0.4621, "num_input_tokens_seen": 37163296, "step": 30560 }, { "epoch": 3.829720586392683, "grad_norm": 0.08641975373029709, "learning_rate": 9.74723827132395e-06, "loss": 0.4631, "num_input_tokens_seen": 37169792, "step": 30565 }, { "epoch": 3.830347074301466, "grad_norm": 0.14800454676151276, "learning_rate": 9.747066615590148e-06, "loss": 0.4637, "num_input_tokens_seen": 37176256, "step": 30570 }, { "epoch": 3.830973562210249, "grad_norm": 0.0808115154504776, "learning_rate": 9.746894903101177e-06, "loss": 0.4625, "num_input_tokens_seen": 37182080, "step": 30575 }, { "epoch": 3.831600050119033, "grad_norm": 0.07954096049070358, "learning_rate": 9.746723133859084e-06, "loss": 0.4584, "num_input_tokens_seen": 37187872, "step": 30580 }, { "epoch": 3.832226538027816, "grad_norm": 0.13787834346294403, "learning_rate": 9.746551307865927e-06, "loss": 0.4624, "num_input_tokens_seen": 37194144, "step": 30585 }, { "epoch": 3.8328530259365996, "grad_norm": 0.07205133885145187, "learning_rate": 9.746379425123757e-06, "loss": 0.4635, "num_input_tokens_seen": 37200512, "step": 30590 }, { "epoch": 3.8334795138453828, "grad_norm": 0.07447727769613266, "learning_rate": 9.746207485634632e-06, "loss": 0.4662, "num_input_tokens_seen": 37205728, "step": 30595 }, { "epoch": 3.834106001754166, "grad_norm": 0.06468839943408966, "learning_rate": 9.746035489400604e-06, "loss": 0.4667, "num_input_tokens_seen": 37211392, "step": 30600 }, { "epoch": 3.8347324896629496, "grad_norm": 0.0706387311220169, "learning_rate": 9.745863436423733e-06, "loss": 0.4596, "num_input_tokens_seen": 37217792, "step": 30605 }, { "epoch": 3.8353589775717327, "grad_norm": 0.11283563077449799, "learning_rate": 9.745691326706075e-06, "loss": 0.4644, "num_input_tokens_seen": 37224160, "step": 30610 }, { "epoch": 3.8359854654805163, "grad_norm": 0.09997022151947021, "learning_rate": 9.745519160249686e-06, "loss": 0.4607, "num_input_tokens_seen": 37230144, "step": 30615 }, { "epoch": 3.8366119533892995, "grad_norm": 0.08561200648546219, "learning_rate": 9.745346937056626e-06, "loss": 0.4639, "num_input_tokens_seen": 37236096, "step": 30620 }, { "epoch": 3.8372384412980827, "grad_norm": 0.10185052454471588, "learning_rate": 9.745174657128953e-06, "loss": 0.4571, "num_input_tokens_seen": 37242304, "step": 30625 }, { "epoch": 3.8378649292068663, "grad_norm": 0.085199274122715, "learning_rate": 9.745002320468728e-06, "loss": 0.4592, "num_input_tokens_seen": 37248448, "step": 30630 }, { "epoch": 3.83849141711565, "grad_norm": 0.07532636821269989, "learning_rate": 9.744829927078011e-06, "loss": 0.4569, "num_input_tokens_seen": 37254624, "step": 30635 }, { "epoch": 3.839117905024433, "grad_norm": 0.08325264602899551, "learning_rate": 9.744657476958863e-06, "loss": 0.4674, "num_input_tokens_seen": 37260320, "step": 30640 }, { "epoch": 3.8397443929332162, "grad_norm": 0.10893111675977707, "learning_rate": 9.744484970113346e-06, "loss": 0.4677, "num_input_tokens_seen": 37266144, "step": 30645 }, { "epoch": 3.840370880842, "grad_norm": 0.10022687166929245, "learning_rate": 9.744312406543521e-06, "loss": 0.4562, "num_input_tokens_seen": 37272384, "step": 30650 }, { "epoch": 3.840997368750783, "grad_norm": 0.0380149707198143, "learning_rate": 9.744139786251454e-06, "loss": 0.4648, "num_input_tokens_seen": 37278304, "step": 30655 }, { "epoch": 3.8416238566595666, "grad_norm": 0.10659518837928772, "learning_rate": 9.743967109239207e-06, "loss": 0.4629, "num_input_tokens_seen": 37283776, "step": 30660 }, { "epoch": 3.84225034456835, "grad_norm": 0.06541275233030319, "learning_rate": 9.743794375508843e-06, "loss": 0.4661, "num_input_tokens_seen": 37290144, "step": 30665 }, { "epoch": 3.842876832477133, "grad_norm": 0.033848345279693604, "learning_rate": 9.743621585062431e-06, "loss": 0.4619, "num_input_tokens_seen": 37296544, "step": 30670 }, { "epoch": 3.8435033203859166, "grad_norm": 0.10479306429624557, "learning_rate": 9.743448737902034e-06, "loss": 0.4594, "num_input_tokens_seen": 37302592, "step": 30675 }, { "epoch": 3.8441298082946997, "grad_norm": 0.14307135343551636, "learning_rate": 9.74327583402972e-06, "loss": 0.4648, "num_input_tokens_seen": 37308832, "step": 30680 }, { "epoch": 3.8447562962034834, "grad_norm": 0.10094940662384033, "learning_rate": 9.743102873447555e-06, "loss": 0.471, "num_input_tokens_seen": 37315200, "step": 30685 }, { "epoch": 3.8453827841122665, "grad_norm": 0.117573581635952, "learning_rate": 9.742929856157605e-06, "loss": 0.4584, "num_input_tokens_seen": 37321728, "step": 30690 }, { "epoch": 3.84600927202105, "grad_norm": 0.06721024215221405, "learning_rate": 9.742756782161944e-06, "loss": 0.4684, "num_input_tokens_seen": 37327488, "step": 30695 }, { "epoch": 3.8466357599298333, "grad_norm": 0.10807450860738754, "learning_rate": 9.742583651462639e-06, "loss": 0.46, "num_input_tokens_seen": 37333920, "step": 30700 }, { "epoch": 3.847262247838617, "grad_norm": 0.06325359642505646, "learning_rate": 9.742410464061756e-06, "loss": 0.4619, "num_input_tokens_seen": 37340064, "step": 30705 }, { "epoch": 3.8478887357474, "grad_norm": 0.10911595076322556, "learning_rate": 9.74223721996137e-06, "loss": 0.4621, "num_input_tokens_seen": 37346624, "step": 30710 }, { "epoch": 3.8485152236561833, "grad_norm": 0.06384934484958649, "learning_rate": 9.74206391916355e-06, "loss": 0.4686, "num_input_tokens_seen": 37352832, "step": 30715 }, { "epoch": 3.849141711564967, "grad_norm": 0.07773337513208389, "learning_rate": 9.741890561670371e-06, "loss": 0.4609, "num_input_tokens_seen": 37359200, "step": 30720 }, { "epoch": 3.84976819947375, "grad_norm": 0.07260917872190475, "learning_rate": 9.741717147483903e-06, "loss": 0.4708, "num_input_tokens_seen": 37365504, "step": 30725 }, { "epoch": 3.8503946873825337, "grad_norm": 0.07693307101726532, "learning_rate": 9.741543676606218e-06, "loss": 0.464, "num_input_tokens_seen": 37371488, "step": 30730 }, { "epoch": 3.851021175291317, "grad_norm": 0.029694734141230583, "learning_rate": 9.741370149039393e-06, "loss": 0.4646, "num_input_tokens_seen": 37377664, "step": 30735 }, { "epoch": 3.8516476632001, "grad_norm": 0.08647314459085464, "learning_rate": 9.741196564785503e-06, "loss": 0.4567, "num_input_tokens_seen": 37383552, "step": 30740 }, { "epoch": 3.8522741511088836, "grad_norm": 0.07031770795583725, "learning_rate": 9.741022923846619e-06, "loss": 0.4617, "num_input_tokens_seen": 37389536, "step": 30745 }, { "epoch": 3.852900639017667, "grad_norm": 0.0727853998541832, "learning_rate": 9.740849226224822e-06, "loss": 0.4624, "num_input_tokens_seen": 37395904, "step": 30750 }, { "epoch": 3.8535271269264504, "grad_norm": 0.13116949796676636, "learning_rate": 9.740675471922187e-06, "loss": 0.4642, "num_input_tokens_seen": 37402080, "step": 30755 }, { "epoch": 3.8541536148352336, "grad_norm": 0.07455848902463913, "learning_rate": 9.740501660940788e-06, "loss": 0.464, "num_input_tokens_seen": 37408224, "step": 30760 }, { "epoch": 3.854780102744017, "grad_norm": 0.07393762469291687, "learning_rate": 9.74032779328271e-06, "loss": 0.4597, "num_input_tokens_seen": 37414272, "step": 30765 }, { "epoch": 3.8554065906528003, "grad_norm": 0.03496202081441879, "learning_rate": 9.740153868950027e-06, "loss": 0.4594, "num_input_tokens_seen": 37419936, "step": 30770 }, { "epoch": 3.856033078561584, "grad_norm": 0.09796915203332901, "learning_rate": 9.739979887944817e-06, "loss": 0.4631, "num_input_tokens_seen": 37426144, "step": 30775 }, { "epoch": 3.856659566470367, "grad_norm": 0.06748604029417038, "learning_rate": 9.739805850269164e-06, "loss": 0.4676, "num_input_tokens_seen": 37432384, "step": 30780 }, { "epoch": 3.8572860543791503, "grad_norm": 0.11458711326122284, "learning_rate": 9.739631755925145e-06, "loss": 0.4594, "num_input_tokens_seen": 37438304, "step": 30785 }, { "epoch": 3.857912542287934, "grad_norm": 0.07299922406673431, "learning_rate": 9.739457604914845e-06, "loss": 0.4594, "num_input_tokens_seen": 37444448, "step": 30790 }, { "epoch": 3.858539030196717, "grad_norm": 0.07574116438627243, "learning_rate": 9.739283397240346e-06, "loss": 0.46, "num_input_tokens_seen": 37450592, "step": 30795 }, { "epoch": 3.8591655181055007, "grad_norm": 0.08690618723630905, "learning_rate": 9.739109132903728e-06, "loss": 0.4604, "num_input_tokens_seen": 37456800, "step": 30800 }, { "epoch": 3.859792006014284, "grad_norm": 0.08809006959199905, "learning_rate": 9.738934811907074e-06, "loss": 0.4599, "num_input_tokens_seen": 37462912, "step": 30805 }, { "epoch": 3.860418493923067, "grad_norm": 0.12189933657646179, "learning_rate": 9.738760434252474e-06, "loss": 0.46, "num_input_tokens_seen": 37469376, "step": 30810 }, { "epoch": 3.8610449818318506, "grad_norm": 0.0787183865904808, "learning_rate": 9.738585999942006e-06, "loss": 0.4649, "num_input_tokens_seen": 37475360, "step": 30815 }, { "epoch": 3.8616714697406342, "grad_norm": 0.13229505717754364, "learning_rate": 9.73841150897776e-06, "loss": 0.4622, "num_input_tokens_seen": 37481568, "step": 30820 }, { "epoch": 3.8622979576494174, "grad_norm": 0.09244433045387268, "learning_rate": 9.738236961361819e-06, "loss": 0.4601, "num_input_tokens_seen": 37488000, "step": 30825 }, { "epoch": 3.8629244455582006, "grad_norm": 0.07842240482568741, "learning_rate": 9.738062357096272e-06, "loss": 0.4569, "num_input_tokens_seen": 37493856, "step": 30830 }, { "epoch": 3.863550933466984, "grad_norm": 0.0932362824678421, "learning_rate": 9.737887696183206e-06, "loss": 0.4673, "num_input_tokens_seen": 37499840, "step": 30835 }, { "epoch": 3.8641774213757674, "grad_norm": 0.0792432501912117, "learning_rate": 9.737712978624709e-06, "loss": 0.4634, "num_input_tokens_seen": 37506304, "step": 30840 }, { "epoch": 3.864803909284551, "grad_norm": 0.08913331478834152, "learning_rate": 9.737538204422872e-06, "loss": 0.4574, "num_input_tokens_seen": 37512512, "step": 30845 }, { "epoch": 3.865430397193334, "grad_norm": 0.06942086666822433, "learning_rate": 9.73736337357978e-06, "loss": 0.4674, "num_input_tokens_seen": 37518560, "step": 30850 }, { "epoch": 3.8660568851021173, "grad_norm": 0.09040168672800064, "learning_rate": 9.737188486097529e-06, "loss": 0.4678, "num_input_tokens_seen": 37524448, "step": 30855 }, { "epoch": 3.866683373010901, "grad_norm": 0.06875058263540268, "learning_rate": 9.737013541978204e-06, "loss": 0.4577, "num_input_tokens_seen": 37530528, "step": 30860 }, { "epoch": 3.867309860919684, "grad_norm": 0.03830292075872421, "learning_rate": 9.7368385412239e-06, "loss": 0.4628, "num_input_tokens_seen": 37536704, "step": 30865 }, { "epoch": 3.8679363488284677, "grad_norm": 0.12118551880121231, "learning_rate": 9.736663483836709e-06, "loss": 0.4485, "num_input_tokens_seen": 37542592, "step": 30870 }, { "epoch": 3.868562836737251, "grad_norm": 0.10950641334056854, "learning_rate": 9.736488369818725e-06, "loss": 0.463, "num_input_tokens_seen": 37548736, "step": 30875 }, { "epoch": 3.8691893246460345, "grad_norm": 0.09876889735460281, "learning_rate": 9.736313199172038e-06, "loss": 0.4583, "num_input_tokens_seen": 37555072, "step": 30880 }, { "epoch": 3.8698158125548177, "grad_norm": 0.08082568645477295, "learning_rate": 9.736137971898746e-06, "loss": 0.4735, "num_input_tokens_seen": 37561184, "step": 30885 }, { "epoch": 3.8704423004636013, "grad_norm": 0.12301602959632874, "learning_rate": 9.735962688000942e-06, "loss": 0.4711, "num_input_tokens_seen": 37567136, "step": 30890 }, { "epoch": 3.8710687883723844, "grad_norm": 0.09962604194879532, "learning_rate": 9.735787347480723e-06, "loss": 0.4558, "num_input_tokens_seen": 37573344, "step": 30895 }, { "epoch": 3.8716952762811676, "grad_norm": 0.15154927968978882, "learning_rate": 9.735611950340184e-06, "loss": 0.4541, "num_input_tokens_seen": 37579552, "step": 30900 }, { "epoch": 3.872321764189951, "grad_norm": 0.08582769334316254, "learning_rate": 9.735436496581422e-06, "loss": 0.4728, "num_input_tokens_seen": 37585856, "step": 30905 }, { "epoch": 3.8729482520987344, "grad_norm": 0.14581598341464996, "learning_rate": 9.735260986206537e-06, "loss": 0.4679, "num_input_tokens_seen": 37592032, "step": 30910 }, { "epoch": 3.873574740007518, "grad_norm": 0.04279899224638939, "learning_rate": 9.735085419217626e-06, "loss": 0.4599, "num_input_tokens_seen": 37598368, "step": 30915 }, { "epoch": 3.874201227916301, "grad_norm": 0.08661921322345734, "learning_rate": 9.734909795616785e-06, "loss": 0.4669, "num_input_tokens_seen": 37603680, "step": 30920 }, { "epoch": 3.8748277158250843, "grad_norm": 0.0718497782945633, "learning_rate": 9.73473411540612e-06, "loss": 0.4589, "num_input_tokens_seen": 37609760, "step": 30925 }, { "epoch": 3.875454203733868, "grad_norm": 0.08874759078025818, "learning_rate": 9.734558378587725e-06, "loss": 0.471, "num_input_tokens_seen": 37615904, "step": 30930 }, { "epoch": 3.8760806916426516, "grad_norm": 0.14981776475906372, "learning_rate": 9.734382585163705e-06, "loss": 0.4714, "num_input_tokens_seen": 37622080, "step": 30935 }, { "epoch": 3.8767071795514347, "grad_norm": 0.06408035010099411, "learning_rate": 9.734206735136162e-06, "loss": 0.4731, "num_input_tokens_seen": 37628448, "step": 30940 }, { "epoch": 3.877333667460218, "grad_norm": 0.031788550317287445, "learning_rate": 9.734030828507196e-06, "loss": 0.4681, "num_input_tokens_seen": 37634368, "step": 30945 }, { "epoch": 3.8779601553690015, "grad_norm": 0.07302078604698181, "learning_rate": 9.73385486527891e-06, "loss": 0.4659, "num_input_tokens_seen": 37640512, "step": 30950 }, { "epoch": 3.8785866432777847, "grad_norm": 0.12595680356025696, "learning_rate": 9.733678845453412e-06, "loss": 0.4592, "num_input_tokens_seen": 37646848, "step": 30955 }, { "epoch": 3.8792131311865683, "grad_norm": 0.11076367646455765, "learning_rate": 9.733502769032802e-06, "loss": 0.4571, "num_input_tokens_seen": 37653216, "step": 30960 }, { "epoch": 3.8798396190953515, "grad_norm": 0.06684452295303345, "learning_rate": 9.733326636019186e-06, "loss": 0.4669, "num_input_tokens_seen": 37659488, "step": 30965 }, { "epoch": 3.8804661070041346, "grad_norm": 0.07821600884199142, "learning_rate": 9.733150446414671e-06, "loss": 0.4603, "num_input_tokens_seen": 37665696, "step": 30970 }, { "epoch": 3.8810925949129182, "grad_norm": 0.06123621389269829, "learning_rate": 9.732974200221364e-06, "loss": 0.4556, "num_input_tokens_seen": 37672032, "step": 30975 }, { "epoch": 3.8817190828217014, "grad_norm": 0.13762186467647552, "learning_rate": 9.73279789744137e-06, "loss": 0.4514, "num_input_tokens_seen": 37677536, "step": 30980 }, { "epoch": 3.882345570730485, "grad_norm": 0.15544621646404266, "learning_rate": 9.732621538076799e-06, "loss": 0.4636, "num_input_tokens_seen": 37683776, "step": 30985 }, { "epoch": 3.882972058639268, "grad_norm": 0.08191709965467453, "learning_rate": 9.732445122129757e-06, "loss": 0.4584, "num_input_tokens_seen": 37689856, "step": 30990 }, { "epoch": 3.8835985465480514, "grad_norm": 0.11538206040859222, "learning_rate": 9.732268649602355e-06, "loss": 0.4618, "num_input_tokens_seen": 37695872, "step": 30995 }, { "epoch": 3.884225034456835, "grad_norm": 0.07471016049385071, "learning_rate": 9.732092120496704e-06, "loss": 0.4551, "num_input_tokens_seen": 37702144, "step": 31000 }, { "epoch": 3.8848515223656186, "grad_norm": 0.06656446307897568, "learning_rate": 9.731915534814912e-06, "loss": 0.4647, "num_input_tokens_seen": 37707392, "step": 31005 }, { "epoch": 3.8854780102744018, "grad_norm": 0.08265762031078339, "learning_rate": 9.731738892559092e-06, "loss": 0.4586, "num_input_tokens_seen": 37713280, "step": 31010 }, { "epoch": 3.886104498183185, "grad_norm": 0.0942799374461174, "learning_rate": 9.731562193731354e-06, "loss": 0.473, "num_input_tokens_seen": 37719520, "step": 31015 }, { "epoch": 3.8867309860919685, "grad_norm": 0.07409530878067017, "learning_rate": 9.731385438333813e-06, "loss": 0.466, "num_input_tokens_seen": 37725792, "step": 31020 }, { "epoch": 3.8873574740007517, "grad_norm": 0.07815267890691757, "learning_rate": 9.731208626368582e-06, "loss": 0.4654, "num_input_tokens_seen": 37732128, "step": 31025 }, { "epoch": 3.8879839619095353, "grad_norm": 0.16060100495815277, "learning_rate": 9.731031757837774e-06, "loss": 0.4783, "num_input_tokens_seen": 37738336, "step": 31030 }, { "epoch": 3.8886104498183185, "grad_norm": 0.07669224590063095, "learning_rate": 9.730854832743503e-06, "loss": 0.4598, "num_input_tokens_seen": 37744704, "step": 31035 }, { "epoch": 3.8892369377271017, "grad_norm": 0.1365191489458084, "learning_rate": 9.730677851087884e-06, "loss": 0.4587, "num_input_tokens_seen": 37751040, "step": 31040 }, { "epoch": 3.8898634256358853, "grad_norm": 0.14685645699501038, "learning_rate": 9.730500812873036e-06, "loss": 0.4645, "num_input_tokens_seen": 37757152, "step": 31045 }, { "epoch": 3.8904899135446684, "grad_norm": 0.03306098282337189, "learning_rate": 9.730323718101072e-06, "loss": 0.4649, "num_input_tokens_seen": 37763232, "step": 31050 }, { "epoch": 3.891116401453452, "grad_norm": 0.07691485434770584, "learning_rate": 9.730146566774112e-06, "loss": 0.4576, "num_input_tokens_seen": 37769472, "step": 31055 }, { "epoch": 3.8917428893622352, "grad_norm": 0.08998873829841614, "learning_rate": 9.729969358894272e-06, "loss": 0.466, "num_input_tokens_seen": 37775872, "step": 31060 }, { "epoch": 3.8923693772710184, "grad_norm": 0.08190581202507019, "learning_rate": 9.729792094463673e-06, "loss": 0.4649, "num_input_tokens_seen": 37782368, "step": 31065 }, { "epoch": 3.892995865179802, "grad_norm": 0.09172847121953964, "learning_rate": 9.72961477348443e-06, "loss": 0.471, "num_input_tokens_seen": 37787680, "step": 31070 }, { "epoch": 3.8936223530885856, "grad_norm": 0.1655503511428833, "learning_rate": 9.729437395958668e-06, "loss": 0.4641, "num_input_tokens_seen": 37793664, "step": 31075 }, { "epoch": 3.894248840997369, "grad_norm": 0.08898184448480606, "learning_rate": 9.729259961888507e-06, "loss": 0.4619, "num_input_tokens_seen": 37799616, "step": 31080 }, { "epoch": 3.894875328906152, "grad_norm": 0.15189969539642334, "learning_rate": 9.729082471276064e-06, "loss": 0.463, "num_input_tokens_seen": 37805760, "step": 31085 }, { "epoch": 3.8955018168149356, "grad_norm": 0.09310988336801529, "learning_rate": 9.728904924123466e-06, "loss": 0.4579, "num_input_tokens_seen": 37811648, "step": 31090 }, { "epoch": 3.8961283047237187, "grad_norm": 0.07150894403457642, "learning_rate": 9.728727320432834e-06, "loss": 0.4601, "num_input_tokens_seen": 37817312, "step": 31095 }, { "epoch": 3.8967547926325024, "grad_norm": 0.12468873709440231, "learning_rate": 9.728549660206291e-06, "loss": 0.4623, "num_input_tokens_seen": 37823616, "step": 31100 }, { "epoch": 3.8973812805412855, "grad_norm": 0.030543439090251923, "learning_rate": 9.72837194344596e-06, "loss": 0.4652, "num_input_tokens_seen": 37829856, "step": 31105 }, { "epoch": 3.8980077684500687, "grad_norm": 0.08624792098999023, "learning_rate": 9.728194170153967e-06, "loss": 0.4643, "num_input_tokens_seen": 37835872, "step": 31110 }, { "epoch": 3.8986342563588523, "grad_norm": 0.08659933507442474, "learning_rate": 9.728016340332439e-06, "loss": 0.4616, "num_input_tokens_seen": 37841952, "step": 31115 }, { "epoch": 3.8992607442676355, "grad_norm": 0.07477754354476929, "learning_rate": 9.7278384539835e-06, "loss": 0.4611, "num_input_tokens_seen": 37847712, "step": 31120 }, { "epoch": 3.899887232176419, "grad_norm": 0.08470894396305084, "learning_rate": 9.727660511109276e-06, "loss": 0.4626, "num_input_tokens_seen": 37854080, "step": 31125 }, { "epoch": 3.9005137200852023, "grad_norm": 0.1353670209646225, "learning_rate": 9.727482511711898e-06, "loss": 0.4563, "num_input_tokens_seen": 37860032, "step": 31130 }, { "epoch": 3.901140207993986, "grad_norm": 0.12856264412403107, "learning_rate": 9.727304455793491e-06, "loss": 0.4657, "num_input_tokens_seen": 37866080, "step": 31135 }, { "epoch": 3.901766695902769, "grad_norm": 0.09201977401971817, "learning_rate": 9.727126343356184e-06, "loss": 0.4631, "num_input_tokens_seen": 37872192, "step": 31140 }, { "epoch": 3.9023931838115526, "grad_norm": 0.0755116418004036, "learning_rate": 9.726948174402108e-06, "loss": 0.4672, "num_input_tokens_seen": 37878400, "step": 31145 }, { "epoch": 3.903019671720336, "grad_norm": 0.1113307848572731, "learning_rate": 9.726769948933393e-06, "loss": 0.4658, "num_input_tokens_seen": 37884480, "step": 31150 }, { "epoch": 3.903646159629119, "grad_norm": 0.10663076490163803, "learning_rate": 9.726591666952167e-06, "loss": 0.4637, "num_input_tokens_seen": 37890528, "step": 31155 }, { "epoch": 3.9042726475379026, "grad_norm": 0.06894838809967041, "learning_rate": 9.726413328460564e-06, "loss": 0.4619, "num_input_tokens_seen": 37896320, "step": 31160 }, { "epoch": 3.9048991354466858, "grad_norm": 0.07803954184055328, "learning_rate": 9.726234933460718e-06, "loss": 0.4626, "num_input_tokens_seen": 37902752, "step": 31165 }, { "epoch": 3.9055256233554694, "grad_norm": 0.1346934735774994, "learning_rate": 9.726056481954759e-06, "loss": 0.4647, "num_input_tokens_seen": 37909088, "step": 31170 }, { "epoch": 3.9061521112642525, "grad_norm": 0.08545918762683868, "learning_rate": 9.725877973944822e-06, "loss": 0.4615, "num_input_tokens_seen": 37915232, "step": 31175 }, { "epoch": 3.9067785991730357, "grad_norm": 0.06736447662115097, "learning_rate": 9.725699409433038e-06, "loss": 0.4682, "num_input_tokens_seen": 37921504, "step": 31180 }, { "epoch": 3.9074050870818193, "grad_norm": 0.06689966470003128, "learning_rate": 9.725520788421547e-06, "loss": 0.4589, "num_input_tokens_seen": 37927680, "step": 31185 }, { "epoch": 3.908031574990603, "grad_norm": 0.06273233890533447, "learning_rate": 9.725342110912481e-06, "loss": 0.4643, "num_input_tokens_seen": 37933600, "step": 31190 }, { "epoch": 3.908658062899386, "grad_norm": 0.08837272226810455, "learning_rate": 9.725163376907976e-06, "loss": 0.4649, "num_input_tokens_seen": 37939456, "step": 31195 }, { "epoch": 3.9092845508081693, "grad_norm": 0.0736892968416214, "learning_rate": 9.724984586410173e-06, "loss": 0.4648, "num_input_tokens_seen": 37945120, "step": 31200 }, { "epoch": 3.909911038716953, "grad_norm": 0.07233356684446335, "learning_rate": 9.724805739421203e-06, "loss": 0.4614, "num_input_tokens_seen": 37951104, "step": 31205 }, { "epoch": 3.910537526625736, "grad_norm": 0.07733573764562607, "learning_rate": 9.724626835943212e-06, "loss": 0.4651, "num_input_tokens_seen": 37957280, "step": 31210 }, { "epoch": 3.9111640145345197, "grad_norm": 0.07562083750963211, "learning_rate": 9.724447875978333e-06, "loss": 0.4635, "num_input_tokens_seen": 37963392, "step": 31215 }, { "epoch": 3.911790502443303, "grad_norm": 0.07374928146600723, "learning_rate": 9.724268859528708e-06, "loss": 0.4625, "num_input_tokens_seen": 37969504, "step": 31220 }, { "epoch": 3.912416990352086, "grad_norm": 0.0731629803776741, "learning_rate": 9.724089786596476e-06, "loss": 0.4617, "num_input_tokens_seen": 37975584, "step": 31225 }, { "epoch": 3.9130434782608696, "grad_norm": 0.06643825024366379, "learning_rate": 9.723910657183778e-06, "loss": 0.4615, "num_input_tokens_seen": 37981824, "step": 31230 }, { "epoch": 3.913669966169653, "grad_norm": 0.11767271906137466, "learning_rate": 9.723731471292758e-06, "loss": 0.462, "num_input_tokens_seen": 37987840, "step": 31235 }, { "epoch": 3.9142964540784364, "grad_norm": 0.07949427515268326, "learning_rate": 9.723552228925557e-06, "loss": 0.462, "num_input_tokens_seen": 37994272, "step": 31240 }, { "epoch": 3.9149229419872196, "grad_norm": 0.07277325540781021, "learning_rate": 9.723372930084317e-06, "loss": 0.4633, "num_input_tokens_seen": 38000288, "step": 31245 }, { "epoch": 3.9155494298960027, "grad_norm": 0.03509830683469772, "learning_rate": 9.723193574771182e-06, "loss": 0.4628, "num_input_tokens_seen": 38006720, "step": 31250 }, { "epoch": 3.9161759178047864, "grad_norm": 0.1126357764005661, "learning_rate": 9.723014162988297e-06, "loss": 0.4623, "num_input_tokens_seen": 38012384, "step": 31255 }, { "epoch": 3.91680240571357, "grad_norm": 0.0925292894244194, "learning_rate": 9.722834694737807e-06, "loss": 0.4603, "num_input_tokens_seen": 38018592, "step": 31260 }, { "epoch": 3.917428893622353, "grad_norm": 0.08558154106140137, "learning_rate": 9.722655170021857e-06, "loss": 0.453, "num_input_tokens_seen": 38024512, "step": 31265 }, { "epoch": 3.9180553815311363, "grad_norm": 0.03585004061460495, "learning_rate": 9.722475588842594e-06, "loss": 0.4638, "num_input_tokens_seen": 38030784, "step": 31270 }, { "epoch": 3.91868186943992, "grad_norm": 0.09256163239479065, "learning_rate": 9.722295951202167e-06, "loss": 0.4639, "num_input_tokens_seen": 38036992, "step": 31275 }, { "epoch": 3.919308357348703, "grad_norm": 0.13717640936374664, "learning_rate": 9.72211625710272e-06, "loss": 0.4499, "num_input_tokens_seen": 38043040, "step": 31280 }, { "epoch": 3.9199348452574867, "grad_norm": 0.07321973145008087, "learning_rate": 9.721936506546403e-06, "loss": 0.4675, "num_input_tokens_seen": 38049280, "step": 31285 }, { "epoch": 3.92056133316627, "grad_norm": 0.07409069687128067, "learning_rate": 9.721756699535365e-06, "loss": 0.4651, "num_input_tokens_seen": 38055488, "step": 31290 }, { "epoch": 3.921187821075053, "grad_norm": 0.079219751060009, "learning_rate": 9.721576836071756e-06, "loss": 0.4527, "num_input_tokens_seen": 38061888, "step": 31295 }, { "epoch": 3.9218143089838366, "grad_norm": 0.10808955132961273, "learning_rate": 9.721396916157726e-06, "loss": 0.4631, "num_input_tokens_seen": 38068032, "step": 31300 }, { "epoch": 3.92244079689262, "grad_norm": 0.09722046554088593, "learning_rate": 9.721216939795427e-06, "loss": 0.4525, "num_input_tokens_seen": 38074176, "step": 31305 }, { "epoch": 3.9230672848014034, "grad_norm": 0.08287458121776581, "learning_rate": 9.721036906987009e-06, "loss": 0.4658, "num_input_tokens_seen": 38080032, "step": 31310 }, { "epoch": 3.9236937727101866, "grad_norm": 0.07785981893539429, "learning_rate": 9.720856817734625e-06, "loss": 0.4617, "num_input_tokens_seen": 38085504, "step": 31315 }, { "epoch": 3.92432026061897, "grad_norm": 0.14381562173366547, "learning_rate": 9.72067667204043e-06, "loss": 0.473, "num_input_tokens_seen": 38091648, "step": 31320 }, { "epoch": 3.9249467485277534, "grad_norm": 0.03677980974316597, "learning_rate": 9.720496469906578e-06, "loss": 0.4651, "num_input_tokens_seen": 38097312, "step": 31325 }, { "epoch": 3.925573236436537, "grad_norm": 0.1187257319688797, "learning_rate": 9.72031621133522e-06, "loss": 0.462, "num_input_tokens_seen": 38103552, "step": 31330 }, { "epoch": 3.92619972434532, "grad_norm": 0.11418057978153229, "learning_rate": 9.720135896328513e-06, "loss": 0.4641, "num_input_tokens_seen": 38109216, "step": 31335 }, { "epoch": 3.9268262122541033, "grad_norm": 0.077190101146698, "learning_rate": 9.719955524888613e-06, "loss": 0.4604, "num_input_tokens_seen": 38115168, "step": 31340 }, { "epoch": 3.927452700162887, "grad_norm": 0.15784990787506104, "learning_rate": 9.719775097017676e-06, "loss": 0.4704, "num_input_tokens_seen": 38121184, "step": 31345 }, { "epoch": 3.92807918807167, "grad_norm": 0.036906253546476364, "learning_rate": 9.71959461271786e-06, "loss": 0.4677, "num_input_tokens_seen": 38127328, "step": 31350 }, { "epoch": 3.9287056759804537, "grad_norm": 0.02958044409751892, "learning_rate": 9.719414071991323e-06, "loss": 0.468, "num_input_tokens_seen": 38133408, "step": 31355 }, { "epoch": 3.929332163889237, "grad_norm": 0.06625451892614365, "learning_rate": 9.719233474840221e-06, "loss": 0.461, "num_input_tokens_seen": 38139392, "step": 31360 }, { "epoch": 3.92995865179802, "grad_norm": 0.06686321645975113, "learning_rate": 9.719052821266717e-06, "loss": 0.4572, "num_input_tokens_seen": 38145632, "step": 31365 }, { "epoch": 3.9305851397068037, "grad_norm": 0.0814194306731224, "learning_rate": 9.718872111272968e-06, "loss": 0.4676, "num_input_tokens_seen": 38151840, "step": 31370 }, { "epoch": 3.9312116276155873, "grad_norm": 0.10789132863283157, "learning_rate": 9.718691344861136e-06, "loss": 0.4551, "num_input_tokens_seen": 38157920, "step": 31375 }, { "epoch": 3.9318381155243705, "grad_norm": 0.06860338151454926, "learning_rate": 9.718510522033382e-06, "loss": 0.4663, "num_input_tokens_seen": 38164192, "step": 31380 }, { "epoch": 3.9324646034331536, "grad_norm": 0.06833166629076004, "learning_rate": 9.718329642791868e-06, "loss": 0.4688, "num_input_tokens_seen": 38170400, "step": 31385 }, { "epoch": 3.9330910913419372, "grad_norm": 0.06669294089078903, "learning_rate": 9.718148707138755e-06, "loss": 0.4622, "num_input_tokens_seen": 38176416, "step": 31390 }, { "epoch": 3.9337175792507204, "grad_norm": 0.06628765165805817, "learning_rate": 9.717967715076207e-06, "loss": 0.4674, "num_input_tokens_seen": 38182688, "step": 31395 }, { "epoch": 3.934344067159504, "grad_norm": 0.11184727400541306, "learning_rate": 9.717786666606388e-06, "loss": 0.464, "num_input_tokens_seen": 38188896, "step": 31400 }, { "epoch": 3.934970555068287, "grad_norm": 0.09888490289449692, "learning_rate": 9.717605561731465e-06, "loss": 0.461, "num_input_tokens_seen": 38194784, "step": 31405 }, { "epoch": 3.9355970429770704, "grad_norm": 0.08135496824979782, "learning_rate": 9.717424400453601e-06, "loss": 0.4579, "num_input_tokens_seen": 38201024, "step": 31410 }, { "epoch": 3.936223530885854, "grad_norm": 0.13169598579406738, "learning_rate": 9.71724318277496e-06, "loss": 0.4613, "num_input_tokens_seen": 38206368, "step": 31415 }, { "epoch": 3.936850018794637, "grad_norm": 0.073857881128788, "learning_rate": 9.717061908697712e-06, "loss": 0.4633, "num_input_tokens_seen": 38212224, "step": 31420 }, { "epoch": 3.9374765067034208, "grad_norm": 0.12168435007333755, "learning_rate": 9.716880578224025e-06, "loss": 0.4606, "num_input_tokens_seen": 38218208, "step": 31425 }, { "epoch": 3.938102994612204, "grad_norm": 0.10911011695861816, "learning_rate": 9.716699191356061e-06, "loss": 0.4625, "num_input_tokens_seen": 38223776, "step": 31430 }, { "epoch": 3.938729482520987, "grad_norm": 0.13801349699497223, "learning_rate": 9.716517748095995e-06, "loss": 0.4635, "num_input_tokens_seen": 38229856, "step": 31435 }, { "epoch": 3.9393559704297707, "grad_norm": 0.1365717053413391, "learning_rate": 9.716336248445994e-06, "loss": 0.4596, "num_input_tokens_seen": 38235744, "step": 31440 }, { "epoch": 3.9399824583385543, "grad_norm": 0.06779497861862183, "learning_rate": 9.716154692408226e-06, "loss": 0.4597, "num_input_tokens_seen": 38241760, "step": 31445 }, { "epoch": 3.9406089462473375, "grad_norm": 0.08635815232992172, "learning_rate": 9.715973079984866e-06, "loss": 0.4612, "num_input_tokens_seen": 38247776, "step": 31450 }, { "epoch": 3.9412354341561207, "grad_norm": 0.1286524087190628, "learning_rate": 9.715791411178082e-06, "loss": 0.4621, "num_input_tokens_seen": 38254048, "step": 31455 }, { "epoch": 3.9418619220649043, "grad_norm": 0.07293135672807693, "learning_rate": 9.715609685990046e-06, "loss": 0.4662, "num_input_tokens_seen": 38260160, "step": 31460 }, { "epoch": 3.9424884099736874, "grad_norm": 0.11823204159736633, "learning_rate": 9.715427904422932e-06, "loss": 0.4654, "num_input_tokens_seen": 38266208, "step": 31465 }, { "epoch": 3.943114897882471, "grad_norm": 0.07614833116531372, "learning_rate": 9.715246066478912e-06, "loss": 0.4628, "num_input_tokens_seen": 38272736, "step": 31470 }, { "epoch": 3.943741385791254, "grad_norm": 0.09948959946632385, "learning_rate": 9.715064172160163e-06, "loss": 0.4634, "num_input_tokens_seen": 38278752, "step": 31475 }, { "epoch": 3.9443678737000374, "grad_norm": 0.1817345917224884, "learning_rate": 9.714882221468856e-06, "loss": 0.4657, "num_input_tokens_seen": 38285056, "step": 31480 }, { "epoch": 3.944994361608821, "grad_norm": 0.17203673720359802, "learning_rate": 9.714700214407168e-06, "loss": 0.4624, "num_input_tokens_seen": 38290976, "step": 31485 }, { "epoch": 3.945620849517604, "grad_norm": 0.0730181559920311, "learning_rate": 9.714518150977276e-06, "loss": 0.4633, "num_input_tokens_seen": 38296992, "step": 31490 }, { "epoch": 3.946247337426388, "grad_norm": 0.06432870775461197, "learning_rate": 9.714336031181354e-06, "loss": 0.4575, "num_input_tokens_seen": 38303072, "step": 31495 }, { "epoch": 3.946873825335171, "grad_norm": 0.0732041448354721, "learning_rate": 9.714153855021583e-06, "loss": 0.4587, "num_input_tokens_seen": 38309504, "step": 31500 }, { "epoch": 3.947500313243954, "grad_norm": 0.07137461751699448, "learning_rate": 9.71397162250014e-06, "loss": 0.4621, "num_input_tokens_seen": 38315392, "step": 31505 }, { "epoch": 3.9481268011527377, "grad_norm": 0.07339830696582794, "learning_rate": 9.713789333619202e-06, "loss": 0.4623, "num_input_tokens_seen": 38321600, "step": 31510 }, { "epoch": 3.9487532890615213, "grad_norm": 0.07119917124509811, "learning_rate": 9.713606988380948e-06, "loss": 0.47, "num_input_tokens_seen": 38327936, "step": 31515 }, { "epoch": 3.9493797769703045, "grad_norm": 0.07972010970115662, "learning_rate": 9.713424586787561e-06, "loss": 0.4663, "num_input_tokens_seen": 38334528, "step": 31520 }, { "epoch": 3.9500062648790877, "grad_norm": 0.07574446499347687, "learning_rate": 9.713242128841218e-06, "loss": 0.462, "num_input_tokens_seen": 38340768, "step": 31525 }, { "epoch": 3.9506327527878713, "grad_norm": 0.030514685437083244, "learning_rate": 9.713059614544104e-06, "loss": 0.4699, "num_input_tokens_seen": 38346720, "step": 31530 }, { "epoch": 3.9512592406966545, "grad_norm": 0.07744821161031723, "learning_rate": 9.712877043898401e-06, "loss": 0.4656, "num_input_tokens_seen": 38352480, "step": 31535 }, { "epoch": 3.951885728605438, "grad_norm": 0.07035641372203827, "learning_rate": 9.712694416906289e-06, "loss": 0.4615, "num_input_tokens_seen": 38358752, "step": 31540 }, { "epoch": 3.9525122165142212, "grad_norm": 0.07842393219470978, "learning_rate": 9.712511733569956e-06, "loss": 0.4567, "num_input_tokens_seen": 38364928, "step": 31545 }, { "epoch": 3.9531387044230044, "grad_norm": 0.07152716815471649, "learning_rate": 9.71232899389158e-06, "loss": 0.4631, "num_input_tokens_seen": 38371232, "step": 31550 }, { "epoch": 3.953765192331788, "grad_norm": 0.10404501110315323, "learning_rate": 9.712146197873348e-06, "loss": 0.4651, "num_input_tokens_seen": 38377408, "step": 31555 }, { "epoch": 3.954391680240571, "grad_norm": 0.03531914949417114, "learning_rate": 9.71196334551745e-06, "loss": 0.4648, "num_input_tokens_seen": 38383072, "step": 31560 }, { "epoch": 3.955018168149355, "grad_norm": 0.033273495733737946, "learning_rate": 9.711780436826068e-06, "loss": 0.4621, "num_input_tokens_seen": 38389216, "step": 31565 }, { "epoch": 3.955644656058138, "grad_norm": 0.07579762488603592, "learning_rate": 9.711597471801387e-06, "loss": 0.4655, "num_input_tokens_seen": 38395072, "step": 31570 }, { "epoch": 3.9562711439669216, "grad_norm": 0.06136225536465645, "learning_rate": 9.711414450445599e-06, "loss": 0.4583, "num_input_tokens_seen": 38400384, "step": 31575 }, { "epoch": 3.9568976318757048, "grad_norm": 0.11737386137247086, "learning_rate": 9.71123137276089e-06, "loss": 0.4596, "num_input_tokens_seen": 38406496, "step": 31580 }, { "epoch": 3.9575241197844884, "grad_norm": 0.12474345415830612, "learning_rate": 9.711048238749447e-06, "loss": 0.4654, "num_input_tokens_seen": 38412448, "step": 31585 }, { "epoch": 3.9581506076932715, "grad_norm": 0.07342267781496048, "learning_rate": 9.710865048413462e-06, "loss": 0.4671, "num_input_tokens_seen": 38418496, "step": 31590 }, { "epoch": 3.9587770956020547, "grad_norm": 0.08423944562673569, "learning_rate": 9.710681801755126e-06, "loss": 0.4719, "num_input_tokens_seen": 38424576, "step": 31595 }, { "epoch": 3.9594035835108383, "grad_norm": 0.0673508569598198, "learning_rate": 9.710498498776627e-06, "loss": 0.4694, "num_input_tokens_seen": 38430656, "step": 31600 }, { "epoch": 3.9600300714196215, "grad_norm": 0.07354728132486343, "learning_rate": 9.71031513948016e-06, "loss": 0.4634, "num_input_tokens_seen": 38436416, "step": 31605 }, { "epoch": 3.960656559328405, "grad_norm": 0.06989777833223343, "learning_rate": 9.710131723867914e-06, "loss": 0.4646, "num_input_tokens_seen": 38442304, "step": 31610 }, { "epoch": 3.9612830472371883, "grad_norm": 0.07528731971979141, "learning_rate": 9.709948251942086e-06, "loss": 0.461, "num_input_tokens_seen": 38448320, "step": 31615 }, { "epoch": 3.9619095351459714, "grad_norm": 0.07528568059206009, "learning_rate": 9.709764723704863e-06, "loss": 0.4618, "num_input_tokens_seen": 38453888, "step": 31620 }, { "epoch": 3.962536023054755, "grad_norm": 0.11464183777570724, "learning_rate": 9.709581139158444e-06, "loss": 0.4632, "num_input_tokens_seen": 38459968, "step": 31625 }, { "epoch": 3.9631625109635387, "grad_norm": 0.07843591272830963, "learning_rate": 9.709397498305025e-06, "loss": 0.4639, "num_input_tokens_seen": 38466336, "step": 31630 }, { "epoch": 3.963788998872322, "grad_norm": 0.17025679349899292, "learning_rate": 9.709213801146796e-06, "loss": 0.4625, "num_input_tokens_seen": 38472160, "step": 31635 }, { "epoch": 3.964415486781105, "grad_norm": 0.0928058847784996, "learning_rate": 9.70903004768596e-06, "loss": 0.4644, "num_input_tokens_seen": 38478208, "step": 31640 }, { "epoch": 3.9650419746898886, "grad_norm": 0.0935615673661232, "learning_rate": 9.708846237924708e-06, "loss": 0.4592, "num_input_tokens_seen": 38484576, "step": 31645 }, { "epoch": 3.965668462598672, "grad_norm": 0.08119456470012665, "learning_rate": 9.70866237186524e-06, "loss": 0.4634, "num_input_tokens_seen": 38490848, "step": 31650 }, { "epoch": 3.9662949505074554, "grad_norm": 0.13947488367557526, "learning_rate": 9.708478449509757e-06, "loss": 0.4574, "num_input_tokens_seen": 38497184, "step": 31655 }, { "epoch": 3.9669214384162386, "grad_norm": 0.033338580280542374, "learning_rate": 9.708294470860454e-06, "loss": 0.4639, "num_input_tokens_seen": 38503200, "step": 31660 }, { "epoch": 3.9675479263250217, "grad_norm": 0.1040705218911171, "learning_rate": 9.708110435919531e-06, "loss": 0.4613, "num_input_tokens_seen": 38508992, "step": 31665 }, { "epoch": 3.9681744142338053, "grad_norm": 0.1287248283624649, "learning_rate": 9.70792634468919e-06, "loss": 0.4615, "num_input_tokens_seen": 38515136, "step": 31670 }, { "epoch": 3.9688009021425885, "grad_norm": 0.1208401545882225, "learning_rate": 9.707742197171635e-06, "loss": 0.4577, "num_input_tokens_seen": 38521312, "step": 31675 }, { "epoch": 3.969427390051372, "grad_norm": 0.0692354217171669, "learning_rate": 9.70755799336906e-06, "loss": 0.4618, "num_input_tokens_seen": 38527520, "step": 31680 }, { "epoch": 3.9700538779601553, "grad_norm": 0.06890084594488144, "learning_rate": 9.707373733283672e-06, "loss": 0.4622, "num_input_tokens_seen": 38533440, "step": 31685 }, { "epoch": 3.9706803658689385, "grad_norm": 0.0754457488656044, "learning_rate": 9.707189416917674e-06, "loss": 0.4653, "num_input_tokens_seen": 38538976, "step": 31690 }, { "epoch": 3.971306853777722, "grad_norm": 0.03807438910007477, "learning_rate": 9.707005044273268e-06, "loss": 0.4576, "num_input_tokens_seen": 38544896, "step": 31695 }, { "epoch": 3.9719333416865057, "grad_norm": 0.0706319659948349, "learning_rate": 9.70682061535266e-06, "loss": 0.4614, "num_input_tokens_seen": 38551232, "step": 31700 }, { "epoch": 3.972559829595289, "grad_norm": 0.08123648911714554, "learning_rate": 9.706636130158054e-06, "loss": 0.4623, "num_input_tokens_seen": 38557376, "step": 31705 }, { "epoch": 3.973186317504072, "grad_norm": 0.0796508640050888, "learning_rate": 9.706451588691656e-06, "loss": 0.4649, "num_input_tokens_seen": 38563456, "step": 31710 }, { "epoch": 3.9738128054128556, "grad_norm": 0.10706720501184464, "learning_rate": 9.706266990955672e-06, "loss": 0.4551, "num_input_tokens_seen": 38569632, "step": 31715 }, { "epoch": 3.974439293321639, "grad_norm": 0.11748898774385452, "learning_rate": 9.70608233695231e-06, "loss": 0.459, "num_input_tokens_seen": 38575840, "step": 31720 }, { "epoch": 3.9750657812304224, "grad_norm": 0.1204604059457779, "learning_rate": 9.705897626683777e-06, "loss": 0.4642, "num_input_tokens_seen": 38582208, "step": 31725 }, { "epoch": 3.9756922691392056, "grad_norm": 0.07781191915273666, "learning_rate": 9.70571286015228e-06, "loss": 0.4624, "num_input_tokens_seen": 38588384, "step": 31730 }, { "epoch": 3.9763187570479888, "grad_norm": 0.12024197727441788, "learning_rate": 9.70552803736003e-06, "loss": 0.4679, "num_input_tokens_seen": 38594336, "step": 31735 }, { "epoch": 3.9769452449567724, "grad_norm": 0.12454681843519211, "learning_rate": 9.705343158309237e-06, "loss": 0.4605, "num_input_tokens_seen": 38600256, "step": 31740 }, { "epoch": 3.9775717328655555, "grad_norm": 0.12423526495695114, "learning_rate": 9.70515822300211e-06, "loss": 0.4644, "num_input_tokens_seen": 38606400, "step": 31745 }, { "epoch": 3.978198220774339, "grad_norm": 0.15796752274036407, "learning_rate": 9.704973231440862e-06, "loss": 0.4645, "num_input_tokens_seen": 38612192, "step": 31750 }, { "epoch": 3.9788247086831223, "grad_norm": 0.12324582785367966, "learning_rate": 9.7047881836277e-06, "loss": 0.4568, "num_input_tokens_seen": 38617824, "step": 31755 }, { "epoch": 3.979451196591906, "grad_norm": 0.11597904562950134, "learning_rate": 9.704603079564842e-06, "loss": 0.4507, "num_input_tokens_seen": 38623872, "step": 31760 }, { "epoch": 3.980077684500689, "grad_norm": 0.08112603425979614, "learning_rate": 9.704417919254498e-06, "loss": 0.4698, "num_input_tokens_seen": 38629792, "step": 31765 }, { "epoch": 3.9807041724094727, "grad_norm": 0.10637591779232025, "learning_rate": 9.704232702698884e-06, "loss": 0.4639, "num_input_tokens_seen": 38635744, "step": 31770 }, { "epoch": 3.981330660318256, "grad_norm": 0.12667423486709595, "learning_rate": 9.704047429900211e-06, "loss": 0.4681, "num_input_tokens_seen": 38641824, "step": 31775 }, { "epoch": 3.981957148227039, "grad_norm": 0.11623550206422806, "learning_rate": 9.703862100860697e-06, "loss": 0.4672, "num_input_tokens_seen": 38647936, "step": 31780 }, { "epoch": 3.9825836361358227, "grad_norm": 0.07006141543388367, "learning_rate": 9.703676715582557e-06, "loss": 0.4561, "num_input_tokens_seen": 38654144, "step": 31785 }, { "epoch": 3.983210124044606, "grad_norm": 0.08504752069711685, "learning_rate": 9.703491274068006e-06, "loss": 0.4627, "num_input_tokens_seen": 38660224, "step": 31790 }, { "epoch": 3.9838366119533895, "grad_norm": 0.03405974432826042, "learning_rate": 9.703305776319263e-06, "loss": 0.467, "num_input_tokens_seen": 38665952, "step": 31795 }, { "epoch": 3.9844630998621726, "grad_norm": 0.06811732053756714, "learning_rate": 9.703120222338547e-06, "loss": 0.4613, "num_input_tokens_seen": 38671712, "step": 31800 }, { "epoch": 3.985089587770956, "grad_norm": 0.03676620125770569, "learning_rate": 9.702934612128073e-06, "loss": 0.4626, "num_input_tokens_seen": 38678432, "step": 31805 }, { "epoch": 3.9857160756797394, "grad_norm": 0.06920341402292252, "learning_rate": 9.702748945690062e-06, "loss": 0.4621, "num_input_tokens_seen": 38684480, "step": 31810 }, { "epoch": 3.986342563588523, "grad_norm": 0.1322880983352661, "learning_rate": 9.702563223026733e-06, "loss": 0.46, "num_input_tokens_seen": 38690944, "step": 31815 }, { "epoch": 3.986969051497306, "grad_norm": 0.09305381774902344, "learning_rate": 9.702377444140305e-06, "loss": 0.4587, "num_input_tokens_seen": 38696864, "step": 31820 }, { "epoch": 3.9875955394060894, "grad_norm": 0.18387643992900848, "learning_rate": 9.702191609033005e-06, "loss": 0.4676, "num_input_tokens_seen": 38702816, "step": 31825 }, { "epoch": 3.988222027314873, "grad_norm": 0.1187451109290123, "learning_rate": 9.702005717707049e-06, "loss": 0.4618, "num_input_tokens_seen": 38708800, "step": 31830 }, { "epoch": 3.988848515223656, "grad_norm": 0.08938208967447281, "learning_rate": 9.70181977016466e-06, "loss": 0.46, "num_input_tokens_seen": 38714976, "step": 31835 }, { "epoch": 3.9894750031324397, "grad_norm": 0.094459168612957, "learning_rate": 9.701633766408064e-06, "loss": 0.4648, "num_input_tokens_seen": 38721120, "step": 31840 }, { "epoch": 3.990101491041223, "grad_norm": 0.0694703534245491, "learning_rate": 9.701447706439485e-06, "loss": 0.4681, "num_input_tokens_seen": 38727104, "step": 31845 }, { "epoch": 3.990727978950006, "grad_norm": 0.06529349833726883, "learning_rate": 9.701261590261143e-06, "loss": 0.4603, "num_input_tokens_seen": 38733056, "step": 31850 }, { "epoch": 3.9913544668587897, "grad_norm": 0.07461190968751907, "learning_rate": 9.701075417875267e-06, "loss": 0.4624, "num_input_tokens_seen": 38739104, "step": 31855 }, { "epoch": 3.991980954767573, "grad_norm": 0.06301277130842209, "learning_rate": 9.700889189284083e-06, "loss": 0.4614, "num_input_tokens_seen": 38745280, "step": 31860 }, { "epoch": 3.9926074426763565, "grad_norm": 0.07109420001506805, "learning_rate": 9.700702904489817e-06, "loss": 0.4613, "num_input_tokens_seen": 38751776, "step": 31865 }, { "epoch": 3.9932339305851396, "grad_norm": 0.07056170701980591, "learning_rate": 9.700516563494694e-06, "loss": 0.4649, "num_input_tokens_seen": 38757536, "step": 31870 }, { "epoch": 3.993860418493923, "grad_norm": 0.1086689680814743, "learning_rate": 9.700330166300945e-06, "loss": 0.4638, "num_input_tokens_seen": 38763520, "step": 31875 }, { "epoch": 3.9944869064027064, "grad_norm": 0.11449404060840607, "learning_rate": 9.700143712910794e-06, "loss": 0.4621, "num_input_tokens_seen": 38769792, "step": 31880 }, { "epoch": 3.99511339431149, "grad_norm": 0.07431783527135849, "learning_rate": 9.699957203326477e-06, "loss": 0.4611, "num_input_tokens_seen": 38775424, "step": 31885 }, { "epoch": 3.995739882220273, "grad_norm": 0.0781836062669754, "learning_rate": 9.699770637550218e-06, "loss": 0.4603, "num_input_tokens_seen": 38781056, "step": 31890 }, { "epoch": 3.9963663701290564, "grad_norm": 0.0768953338265419, "learning_rate": 9.69958401558425e-06, "loss": 0.4676, "num_input_tokens_seen": 38786656, "step": 31895 }, { "epoch": 3.99699285803784, "grad_norm": 0.10696320980787277, "learning_rate": 9.699397337430805e-06, "loss": 0.4655, "num_input_tokens_seen": 38792384, "step": 31900 }, { "epoch": 3.997619345946623, "grad_norm": 0.0962643250823021, "learning_rate": 9.699210603092111e-06, "loss": 0.4606, "num_input_tokens_seen": 38798304, "step": 31905 }, { "epoch": 3.9982458338554068, "grad_norm": 0.031268149614334106, "learning_rate": 9.699023812570407e-06, "loss": 0.4598, "num_input_tokens_seen": 38804608, "step": 31910 }, { "epoch": 3.99887232176419, "grad_norm": 0.07455730438232422, "learning_rate": 9.698836965867919e-06, "loss": 0.4634, "num_input_tokens_seen": 38810016, "step": 31915 }, { "epoch": 3.999498809672973, "grad_norm": 0.07091870158910751, "learning_rate": 9.698650062986887e-06, "loss": 0.4645, "num_input_tokens_seen": 38815968, "step": 31920 }, { "epoch": 4.0, "eval_loss": 0.462841659784317, "eval_runtime": 222.8576, "eval_samples_per_second": 35.812, "eval_steps_per_second": 8.956, "num_input_tokens_seen": 38820832, "step": 31924 }, { "epoch": 4.000125297581756, "grad_norm": 0.0640908032655716, "learning_rate": 9.698463103929542e-06, "loss": 0.4601, "num_input_tokens_seen": 38822080, "step": 31925 }, { "epoch": 4.00075178549054, "grad_norm": 0.07797807455062866, "learning_rate": 9.698276088698122e-06, "loss": 0.4564, "num_input_tokens_seen": 38827520, "step": 31930 }, { "epoch": 4.0013782733993235, "grad_norm": 0.06399679183959961, "learning_rate": 9.698089017294859e-06, "loss": 0.4631, "num_input_tokens_seen": 38833600, "step": 31935 }, { "epoch": 4.002004761308107, "grad_norm": 0.07298894971609116, "learning_rate": 9.697901889721993e-06, "loss": 0.4665, "num_input_tokens_seen": 38839488, "step": 31940 }, { "epoch": 4.00263124921689, "grad_norm": 0.06305588781833649, "learning_rate": 9.69771470598176e-06, "loss": 0.4681, "num_input_tokens_seen": 38845472, "step": 31945 }, { "epoch": 4.003257737125674, "grad_norm": 0.06770988553762436, "learning_rate": 9.697527466076398e-06, "loss": 0.4596, "num_input_tokens_seen": 38851648, "step": 31950 }, { "epoch": 4.003884225034457, "grad_norm": 0.07128237932920456, "learning_rate": 9.697340170008146e-06, "loss": 0.4601, "num_input_tokens_seen": 38857856, "step": 31955 }, { "epoch": 4.00451071294324, "grad_norm": 0.11371168494224548, "learning_rate": 9.697152817779243e-06, "loss": 0.4636, "num_input_tokens_seen": 38864064, "step": 31960 }, { "epoch": 4.005137200852023, "grad_norm": 0.10265578329563141, "learning_rate": 9.696965409391928e-06, "loss": 0.4582, "num_input_tokens_seen": 38870336, "step": 31965 }, { "epoch": 4.005763688760807, "grad_norm": 0.0683608278632164, "learning_rate": 9.696777944848443e-06, "loss": 0.4643, "num_input_tokens_seen": 38876672, "step": 31970 }, { "epoch": 4.006390176669591, "grad_norm": 0.15521953999996185, "learning_rate": 9.696590424151029e-06, "loss": 0.4539, "num_input_tokens_seen": 38882240, "step": 31975 }, { "epoch": 4.007016664578374, "grad_norm": 0.11751853674650192, "learning_rate": 9.696402847301929e-06, "loss": 0.4557, "num_input_tokens_seen": 38888224, "step": 31980 }, { "epoch": 4.007643152487157, "grad_norm": 0.08097466826438904, "learning_rate": 9.696215214303383e-06, "loss": 0.4565, "num_input_tokens_seen": 38894464, "step": 31985 }, { "epoch": 4.00826964039594, "grad_norm": 0.08117374777793884, "learning_rate": 9.696027525157636e-06, "loss": 0.4652, "num_input_tokens_seen": 38900576, "step": 31990 }, { "epoch": 4.008896128304723, "grad_norm": 0.18930964171886444, "learning_rate": 9.69583977986693e-06, "loss": 0.458, "num_input_tokens_seen": 38906720, "step": 31995 }, { "epoch": 4.009522616213507, "grad_norm": 0.07360337674617767, "learning_rate": 9.695651978433514e-06, "loss": 0.4527, "num_input_tokens_seen": 38912800, "step": 32000 }, { "epoch": 4.0101491041222905, "grad_norm": 0.10686881095170975, "learning_rate": 9.69546412085963e-06, "loss": 0.4737, "num_input_tokens_seen": 38918624, "step": 32005 }, { "epoch": 4.010775592031074, "grad_norm": 0.03733163699507713, "learning_rate": 9.695276207147524e-06, "loss": 0.4663, "num_input_tokens_seen": 38924384, "step": 32010 }, { "epoch": 4.011402079939857, "grad_norm": 0.03257475793361664, "learning_rate": 9.695088237299445e-06, "loss": 0.4638, "num_input_tokens_seen": 38930688, "step": 32015 }, { "epoch": 4.012028567848641, "grad_norm": 0.07865159958600998, "learning_rate": 9.694900211317637e-06, "loss": 0.4521, "num_input_tokens_seen": 38936224, "step": 32020 }, { "epoch": 4.012655055757424, "grad_norm": 0.06310538202524185, "learning_rate": 9.694712129204351e-06, "loss": 0.4629, "num_input_tokens_seen": 38942464, "step": 32025 }, { "epoch": 4.013281543666207, "grad_norm": 0.09659598022699356, "learning_rate": 9.694523990961832e-06, "loss": 0.4783, "num_input_tokens_seen": 38948704, "step": 32030 }, { "epoch": 4.01390803157499, "grad_norm": 0.07733584195375443, "learning_rate": 9.694335796592334e-06, "loss": 0.4556, "num_input_tokens_seen": 38954752, "step": 32035 }, { "epoch": 4.014534519483774, "grad_norm": 0.07000415027141571, "learning_rate": 9.694147546098105e-06, "loss": 0.4656, "num_input_tokens_seen": 38960640, "step": 32040 }, { "epoch": 4.015161007392558, "grad_norm": 0.10611172765493393, "learning_rate": 9.693959239481395e-06, "loss": 0.4586, "num_input_tokens_seen": 38966752, "step": 32045 }, { "epoch": 4.015787495301341, "grad_norm": 0.06973414868116379, "learning_rate": 9.693770876744456e-06, "loss": 0.473, "num_input_tokens_seen": 38972704, "step": 32050 }, { "epoch": 4.016413983210124, "grad_norm": 0.07107765972614288, "learning_rate": 9.693582457889539e-06, "loss": 0.4655, "num_input_tokens_seen": 38978304, "step": 32055 }, { "epoch": 4.017040471118907, "grad_norm": 0.09849461913108826, "learning_rate": 9.693393982918898e-06, "loss": 0.4611, "num_input_tokens_seen": 38984320, "step": 32060 }, { "epoch": 4.01766695902769, "grad_norm": 0.10286599397659302, "learning_rate": 9.693205451834787e-06, "loss": 0.459, "num_input_tokens_seen": 38990592, "step": 32065 }, { "epoch": 4.018293446936474, "grad_norm": 0.08239470422267914, "learning_rate": 9.693016864639456e-06, "loss": 0.4581, "num_input_tokens_seen": 38996576, "step": 32070 }, { "epoch": 4.018919934845258, "grad_norm": 0.03220893442630768, "learning_rate": 9.692828221335164e-06, "loss": 0.4606, "num_input_tokens_seen": 39003008, "step": 32075 }, { "epoch": 4.019546422754041, "grad_norm": 0.07416348904371262, "learning_rate": 9.692639521924168e-06, "loss": 0.4626, "num_input_tokens_seen": 39009376, "step": 32080 }, { "epoch": 4.020172910662824, "grad_norm": 0.10734999924898148, "learning_rate": 9.692450766408718e-06, "loss": 0.4635, "num_input_tokens_seen": 39015616, "step": 32085 }, { "epoch": 4.020799398571608, "grad_norm": 0.07176526635885239, "learning_rate": 9.692261954791075e-06, "loss": 0.4577, "num_input_tokens_seen": 39021408, "step": 32090 }, { "epoch": 4.021425886480391, "grad_norm": 0.06875313073396683, "learning_rate": 9.692073087073494e-06, "loss": 0.4546, "num_input_tokens_seen": 39027168, "step": 32095 }, { "epoch": 4.022052374389174, "grad_norm": 0.06525330990552902, "learning_rate": 9.691884163258236e-06, "loss": 0.4464, "num_input_tokens_seen": 39033504, "step": 32100 }, { "epoch": 4.0226788622979575, "grad_norm": 0.09211140125989914, "learning_rate": 9.691695183347558e-06, "loss": 0.4663, "num_input_tokens_seen": 39040128, "step": 32105 }, { "epoch": 4.023305350206741, "grad_norm": 0.07608575373888016, "learning_rate": 9.691506147343719e-06, "loss": 0.457, "num_input_tokens_seen": 39045952, "step": 32110 }, { "epoch": 4.023931838115525, "grad_norm": 0.13695982098579407, "learning_rate": 9.691317055248979e-06, "loss": 0.4702, "num_input_tokens_seen": 39051968, "step": 32115 }, { "epoch": 4.024558326024308, "grad_norm": 0.07019998133182526, "learning_rate": 9.691127907065601e-06, "loss": 0.4728, "num_input_tokens_seen": 39057952, "step": 32120 }, { "epoch": 4.025184813933091, "grad_norm": 0.12478435784578323, "learning_rate": 9.690938702795843e-06, "loss": 0.4589, "num_input_tokens_seen": 39064032, "step": 32125 }, { "epoch": 4.025811301841874, "grad_norm": 0.08194861561059952, "learning_rate": 9.690749442441969e-06, "loss": 0.4596, "num_input_tokens_seen": 39069984, "step": 32130 }, { "epoch": 4.026437789750658, "grad_norm": 0.09012721478939056, "learning_rate": 9.690560126006243e-06, "loss": 0.4601, "num_input_tokens_seen": 39075680, "step": 32135 }, { "epoch": 4.027064277659441, "grad_norm": 0.07565265148878098, "learning_rate": 9.690370753490927e-06, "loss": 0.455, "num_input_tokens_seen": 39082272, "step": 32140 }, { "epoch": 4.027690765568225, "grad_norm": 0.0925629585981369, "learning_rate": 9.690181324898285e-06, "loss": 0.4628, "num_input_tokens_seen": 39088256, "step": 32145 }, { "epoch": 4.028317253477008, "grad_norm": 0.03934106603264809, "learning_rate": 9.689991840230582e-06, "loss": 0.4626, "num_input_tokens_seen": 39094560, "step": 32150 }, { "epoch": 4.028943741385791, "grad_norm": 0.0418388657271862, "learning_rate": 9.689802299490081e-06, "loss": 0.4615, "num_input_tokens_seen": 39100928, "step": 32155 }, { "epoch": 4.029570229294575, "grad_norm": 0.14104671776294708, "learning_rate": 9.689612702679054e-06, "loss": 0.4547, "num_input_tokens_seen": 39107456, "step": 32160 }, { "epoch": 4.030196717203358, "grad_norm": 0.0838480293750763, "learning_rate": 9.689423049799763e-06, "loss": 0.4579, "num_input_tokens_seen": 39113376, "step": 32165 }, { "epoch": 4.030823205112141, "grad_norm": 0.08420538902282715, "learning_rate": 9.689233340854476e-06, "loss": 0.4676, "num_input_tokens_seen": 39119136, "step": 32170 }, { "epoch": 4.0314496930209245, "grad_norm": 0.1637333184480667, "learning_rate": 9.689043575845462e-06, "loss": 0.4739, "num_input_tokens_seen": 39125152, "step": 32175 }, { "epoch": 4.032076180929708, "grad_norm": 0.04410835728049278, "learning_rate": 9.688853754774992e-06, "loss": 0.4667, "num_input_tokens_seen": 39131328, "step": 32180 }, { "epoch": 4.032702668838492, "grad_norm": 0.0879431888461113, "learning_rate": 9.688663877645331e-06, "loss": 0.4512, "num_input_tokens_seen": 39136928, "step": 32185 }, { "epoch": 4.033329156747275, "grad_norm": 0.14808529615402222, "learning_rate": 9.688473944458751e-06, "loss": 0.4705, "num_input_tokens_seen": 39143104, "step": 32190 }, { "epoch": 4.033955644656058, "grad_norm": 0.07815414667129517, "learning_rate": 9.688283955217525e-06, "loss": 0.4632, "num_input_tokens_seen": 39149216, "step": 32195 }, { "epoch": 4.034582132564841, "grad_norm": 0.0863436684012413, "learning_rate": 9.688093909923921e-06, "loss": 0.4568, "num_input_tokens_seen": 39155296, "step": 32200 }, { "epoch": 4.035208620473625, "grad_norm": 0.07618542015552521, "learning_rate": 9.687903808580214e-06, "loss": 0.4616, "num_input_tokens_seen": 39161536, "step": 32205 }, { "epoch": 4.035835108382408, "grad_norm": 0.04058158025145531, "learning_rate": 9.687713651188675e-06, "loss": 0.463, "num_input_tokens_seen": 39167552, "step": 32210 }, { "epoch": 4.036461596291192, "grad_norm": 0.1293392777442932, "learning_rate": 9.687523437751578e-06, "loss": 0.4567, "num_input_tokens_seen": 39173440, "step": 32215 }, { "epoch": 4.037088084199975, "grad_norm": 0.07377217710018158, "learning_rate": 9.687333168271198e-06, "loss": 0.464, "num_input_tokens_seen": 39179488, "step": 32220 }, { "epoch": 4.037714572108758, "grad_norm": 0.08898214250802994, "learning_rate": 9.687142842749807e-06, "loss": 0.4609, "num_input_tokens_seen": 39185504, "step": 32225 }, { "epoch": 4.038341060017542, "grad_norm": 0.0827888548374176, "learning_rate": 9.686952461189685e-06, "loss": 0.4629, "num_input_tokens_seen": 39191840, "step": 32230 }, { "epoch": 4.038967547926325, "grad_norm": 0.08807798475027084, "learning_rate": 9.686762023593104e-06, "loss": 0.463, "num_input_tokens_seen": 39197760, "step": 32235 }, { "epoch": 4.039594035835108, "grad_norm": 0.08033466339111328, "learning_rate": 9.686571529962344e-06, "loss": 0.461, "num_input_tokens_seen": 39204064, "step": 32240 }, { "epoch": 4.0402205237438915, "grad_norm": 0.043459076434373856, "learning_rate": 9.68638098029968e-06, "loss": 0.4594, "num_input_tokens_seen": 39210400, "step": 32245 }, { "epoch": 4.040847011652675, "grad_norm": 0.08187061548233032, "learning_rate": 9.686190374607392e-06, "loss": 0.4616, "num_input_tokens_seen": 39216736, "step": 32250 }, { "epoch": 4.041473499561459, "grad_norm": 0.14739033579826355, "learning_rate": 9.685999712887757e-06, "loss": 0.4651, "num_input_tokens_seen": 39222880, "step": 32255 }, { "epoch": 4.042099987470242, "grad_norm": 0.09010867029428482, "learning_rate": 9.685808995143056e-06, "loss": 0.4694, "num_input_tokens_seen": 39228928, "step": 32260 }, { "epoch": 4.042726475379025, "grad_norm": 0.16737376153469086, "learning_rate": 9.68561822137557e-06, "loss": 0.4623, "num_input_tokens_seen": 39235168, "step": 32265 }, { "epoch": 4.043352963287808, "grad_norm": 0.10379546135663986, "learning_rate": 9.685427391587578e-06, "loss": 0.4624, "num_input_tokens_seen": 39241504, "step": 32270 }, { "epoch": 4.043979451196592, "grad_norm": 0.14890260994434357, "learning_rate": 9.685236505781364e-06, "loss": 0.4632, "num_input_tokens_seen": 39247424, "step": 32275 }, { "epoch": 4.0446059391053755, "grad_norm": 0.1527351438999176, "learning_rate": 9.685045563959206e-06, "loss": 0.4617, "num_input_tokens_seen": 39253824, "step": 32280 }, { "epoch": 4.045232427014159, "grad_norm": 0.18497249484062195, "learning_rate": 9.68485456612339e-06, "loss": 0.4638, "num_input_tokens_seen": 39260032, "step": 32285 }, { "epoch": 4.045858914922942, "grad_norm": 0.1263047754764557, "learning_rate": 9.684663512276198e-06, "loss": 0.467, "num_input_tokens_seen": 39266048, "step": 32290 }, { "epoch": 4.046485402831725, "grad_norm": 0.05239718407392502, "learning_rate": 9.684472402419917e-06, "loss": 0.4536, "num_input_tokens_seen": 39272160, "step": 32295 }, { "epoch": 4.047111890740509, "grad_norm": 0.11063054203987122, "learning_rate": 9.684281236556828e-06, "loss": 0.4584, "num_input_tokens_seen": 39278560, "step": 32300 }, { "epoch": 4.047738378649292, "grad_norm": 0.04921065270900726, "learning_rate": 9.68409001468922e-06, "loss": 0.4629, "num_input_tokens_seen": 39284704, "step": 32305 }, { "epoch": 4.048364866558075, "grad_norm": 0.11387331038713455, "learning_rate": 9.683898736819377e-06, "loss": 0.464, "num_input_tokens_seen": 39290848, "step": 32310 }, { "epoch": 4.0489913544668585, "grad_norm": 0.14355483651161194, "learning_rate": 9.683707402949586e-06, "loss": 0.4651, "num_input_tokens_seen": 39296992, "step": 32315 }, { "epoch": 4.049617842375643, "grad_norm": 0.17205572128295898, "learning_rate": 9.683516013082138e-06, "loss": 0.4724, "num_input_tokens_seen": 39303328, "step": 32320 }, { "epoch": 4.050244330284426, "grad_norm": 0.10892952233552933, "learning_rate": 9.683324567219315e-06, "loss": 0.4549, "num_input_tokens_seen": 39309344, "step": 32325 }, { "epoch": 4.050870818193209, "grad_norm": 0.08225342631340027, "learning_rate": 9.68313306536341e-06, "loss": 0.4577, "num_input_tokens_seen": 39315776, "step": 32330 }, { "epoch": 4.051497306101992, "grad_norm": 0.1028333529829979, "learning_rate": 9.682941507516713e-06, "loss": 0.467, "num_input_tokens_seen": 39321792, "step": 32335 }, { "epoch": 4.052123794010775, "grad_norm": 0.1334703415632248, "learning_rate": 9.682749893681513e-06, "loss": 0.4648, "num_input_tokens_seen": 39328192, "step": 32340 }, { "epoch": 4.052750281919559, "grad_norm": 0.22503213584423065, "learning_rate": 9.6825582238601e-06, "loss": 0.453, "num_input_tokens_seen": 39333984, "step": 32345 }, { "epoch": 4.0533767698283425, "grad_norm": 1.6548857688903809, "learning_rate": 9.682366498054768e-06, "loss": 0.4655, "num_input_tokens_seen": 39340224, "step": 32350 }, { "epoch": 4.054003257737126, "grad_norm": 0.20505905151367188, "learning_rate": 9.682174716267805e-06, "loss": 0.4721, "num_input_tokens_seen": 39346048, "step": 32355 }, { "epoch": 4.054629745645909, "grad_norm": 0.09199707955121994, "learning_rate": 9.681982878501511e-06, "loss": 0.4543, "num_input_tokens_seen": 39352320, "step": 32360 }, { "epoch": 4.055256233554692, "grad_norm": 0.09219656884670258, "learning_rate": 9.681790984758172e-06, "loss": 0.4691, "num_input_tokens_seen": 39358368, "step": 32365 }, { "epoch": 4.055882721463476, "grad_norm": 0.08555581420660019, "learning_rate": 9.681599035040086e-06, "loss": 0.456, "num_input_tokens_seen": 39364224, "step": 32370 }, { "epoch": 4.056509209372259, "grad_norm": 0.07912751287221909, "learning_rate": 9.681407029349547e-06, "loss": 0.4694, "num_input_tokens_seen": 39370624, "step": 32375 }, { "epoch": 4.057135697281042, "grad_norm": 0.07648500800132751, "learning_rate": 9.681214967688852e-06, "loss": 0.4655, "num_input_tokens_seen": 39376256, "step": 32380 }, { "epoch": 4.057762185189826, "grad_norm": 0.15394112467765808, "learning_rate": 9.681022850060297e-06, "loss": 0.4627, "num_input_tokens_seen": 39382496, "step": 32385 }, { "epoch": 4.05838867309861, "grad_norm": 0.1028287410736084, "learning_rate": 9.680830676466178e-06, "loss": 0.4606, "num_input_tokens_seen": 39388672, "step": 32390 }, { "epoch": 4.059015161007393, "grad_norm": 0.09606762230396271, "learning_rate": 9.680638446908792e-06, "loss": 0.4591, "num_input_tokens_seen": 39394976, "step": 32395 }, { "epoch": 4.059641648916176, "grad_norm": 0.08323194831609726, "learning_rate": 9.680446161390438e-06, "loss": 0.4596, "num_input_tokens_seen": 39400736, "step": 32400 }, { "epoch": 4.060268136824959, "grad_norm": 0.08365485817193985, "learning_rate": 9.680253819913415e-06, "loss": 0.458, "num_input_tokens_seen": 39406880, "step": 32405 }, { "epoch": 4.060894624733742, "grad_norm": 0.07229704409837723, "learning_rate": 9.680061422480023e-06, "loss": 0.4634, "num_input_tokens_seen": 39413152, "step": 32410 }, { "epoch": 4.061521112642526, "grad_norm": 0.12398294359445572, "learning_rate": 9.679868969092561e-06, "loss": 0.4689, "num_input_tokens_seen": 39418944, "step": 32415 }, { "epoch": 4.0621476005513095, "grad_norm": 0.07911625504493713, "learning_rate": 9.679676459753333e-06, "loss": 0.4647, "num_input_tokens_seen": 39425056, "step": 32420 }, { "epoch": 4.062774088460093, "grad_norm": 0.08008282631635666, "learning_rate": 9.679483894464637e-06, "loss": 0.4658, "num_input_tokens_seen": 39431296, "step": 32425 }, { "epoch": 4.063400576368876, "grad_norm": 0.08224853873252869, "learning_rate": 9.679291273228777e-06, "loss": 0.4635, "num_input_tokens_seen": 39437088, "step": 32430 }, { "epoch": 4.064027064277659, "grad_norm": 0.07596288621425629, "learning_rate": 9.679098596048055e-06, "loss": 0.4624, "num_input_tokens_seen": 39443328, "step": 32435 }, { "epoch": 4.064653552186443, "grad_norm": 0.07715681195259094, "learning_rate": 9.678905862924776e-06, "loss": 0.4622, "num_input_tokens_seen": 39449472, "step": 32440 }, { "epoch": 4.065280040095226, "grad_norm": 0.11501741409301758, "learning_rate": 9.678713073861244e-06, "loss": 0.4597, "num_input_tokens_seen": 39455584, "step": 32445 }, { "epoch": 4.065906528004009, "grad_norm": 0.08326340466737747, "learning_rate": 9.678520228859765e-06, "loss": 0.4625, "num_input_tokens_seen": 39461600, "step": 32450 }, { "epoch": 4.066533015912793, "grad_norm": 0.07086948305368423, "learning_rate": 9.678327327922642e-06, "loss": 0.4617, "num_input_tokens_seen": 39467712, "step": 32455 }, { "epoch": 4.067159503821577, "grad_norm": 0.07693570852279663, "learning_rate": 9.67813437105218e-06, "loss": 0.4634, "num_input_tokens_seen": 39473504, "step": 32460 }, { "epoch": 4.06778599173036, "grad_norm": 0.07198678702116013, "learning_rate": 9.677941358250691e-06, "loss": 0.458, "num_input_tokens_seen": 39479616, "step": 32465 }, { "epoch": 4.068412479639143, "grad_norm": 0.0835595577955246, "learning_rate": 9.67774828952048e-06, "loss": 0.4646, "num_input_tokens_seen": 39485728, "step": 32470 }, { "epoch": 4.069038967547926, "grad_norm": 0.0963926613330841, "learning_rate": 9.677555164863854e-06, "loss": 0.458, "num_input_tokens_seen": 39491584, "step": 32475 }, { "epoch": 4.069665455456709, "grad_norm": 0.09020418673753738, "learning_rate": 9.677361984283123e-06, "loss": 0.4648, "num_input_tokens_seen": 39496768, "step": 32480 }, { "epoch": 4.070291943365493, "grad_norm": 0.08385640382766724, "learning_rate": 9.677168747780598e-06, "loss": 0.4659, "num_input_tokens_seen": 39502816, "step": 32485 }, { "epoch": 4.0709184312742765, "grad_norm": 0.0733472928404808, "learning_rate": 9.676975455358589e-06, "loss": 0.4678, "num_input_tokens_seen": 39508928, "step": 32490 }, { "epoch": 4.07154491918306, "grad_norm": 0.11891470104455948, "learning_rate": 9.676782107019405e-06, "loss": 0.463, "num_input_tokens_seen": 39514784, "step": 32495 }, { "epoch": 4.072171407091843, "grad_norm": 0.07202093303203583, "learning_rate": 9.67658870276536e-06, "loss": 0.4631, "num_input_tokens_seen": 39520224, "step": 32500 }, { "epoch": 4.072797895000626, "grad_norm": 0.0831182673573494, "learning_rate": 9.676395242598765e-06, "loss": 0.4606, "num_input_tokens_seen": 39526080, "step": 32505 }, { "epoch": 4.07342438290941, "grad_norm": 0.13529737293720245, "learning_rate": 9.676201726521933e-06, "loss": 0.4623, "num_input_tokens_seen": 39532064, "step": 32510 }, { "epoch": 4.074050870818193, "grad_norm": 0.07155104726552963, "learning_rate": 9.676008154537177e-06, "loss": 0.4621, "num_input_tokens_seen": 39538400, "step": 32515 }, { "epoch": 4.0746773587269765, "grad_norm": 0.07474707067012787, "learning_rate": 9.675814526646812e-06, "loss": 0.4653, "num_input_tokens_seen": 39544640, "step": 32520 }, { "epoch": 4.07530384663576, "grad_norm": 0.0806889459490776, "learning_rate": 9.675620842853155e-06, "loss": 0.4721, "num_input_tokens_seen": 39550720, "step": 32525 }, { "epoch": 4.075930334544544, "grad_norm": 0.08000010251998901, "learning_rate": 9.675427103158519e-06, "loss": 0.4657, "num_input_tokens_seen": 39556384, "step": 32530 }, { "epoch": 4.076556822453327, "grad_norm": 0.0720096156001091, "learning_rate": 9.67523330756522e-06, "loss": 0.4686, "num_input_tokens_seen": 39562496, "step": 32535 }, { "epoch": 4.07718331036211, "grad_norm": 0.13476194441318512, "learning_rate": 9.675039456075576e-06, "loss": 0.4675, "num_input_tokens_seen": 39568608, "step": 32540 }, { "epoch": 4.077809798270893, "grad_norm": 0.07866349816322327, "learning_rate": 9.674845548691906e-06, "loss": 0.4615, "num_input_tokens_seen": 39574944, "step": 32545 }, { "epoch": 4.078436286179676, "grad_norm": 0.08188071846961975, "learning_rate": 9.674651585416528e-06, "loss": 0.4636, "num_input_tokens_seen": 39580768, "step": 32550 }, { "epoch": 4.07906277408846, "grad_norm": 0.0817512646317482, "learning_rate": 9.674457566251758e-06, "loss": 0.463, "num_input_tokens_seen": 39586848, "step": 32555 }, { "epoch": 4.079689261997244, "grad_norm": 0.06498519331216812, "learning_rate": 9.674263491199919e-06, "loss": 0.4646, "num_input_tokens_seen": 39592736, "step": 32560 }, { "epoch": 4.080315749906027, "grad_norm": 0.07003287225961685, "learning_rate": 9.674069360263329e-06, "loss": 0.4617, "num_input_tokens_seen": 39598688, "step": 32565 }, { "epoch": 4.08094223781481, "grad_norm": 0.11758922040462494, "learning_rate": 9.673875173444311e-06, "loss": 0.463, "num_input_tokens_seen": 39604768, "step": 32570 }, { "epoch": 4.081568725723594, "grad_norm": 0.034690484404563904, "learning_rate": 9.673680930745185e-06, "loss": 0.4605, "num_input_tokens_seen": 39610912, "step": 32575 }, { "epoch": 4.082195213632377, "grad_norm": 0.07051576673984528, "learning_rate": 9.673486632168275e-06, "loss": 0.464, "num_input_tokens_seen": 39617088, "step": 32580 }, { "epoch": 4.08282170154116, "grad_norm": 0.037380918860435486, "learning_rate": 9.673292277715902e-06, "loss": 0.4557, "num_input_tokens_seen": 39623520, "step": 32585 }, { "epoch": 4.0834481894499435, "grad_norm": 0.03698824718594551, "learning_rate": 9.67309786739039e-06, "loss": 0.4619, "num_input_tokens_seen": 39629664, "step": 32590 }, { "epoch": 4.084074677358727, "grad_norm": 0.09590593725442886, "learning_rate": 9.672903401194067e-06, "loss": 0.4611, "num_input_tokens_seen": 39635136, "step": 32595 }, { "epoch": 4.084701165267511, "grad_norm": 0.06541987508535385, "learning_rate": 9.672708879129253e-06, "loss": 0.4679, "num_input_tokens_seen": 39641408, "step": 32600 }, { "epoch": 4.085327653176294, "grad_norm": 0.09391766786575317, "learning_rate": 9.672514301198276e-06, "loss": 0.4598, "num_input_tokens_seen": 39647712, "step": 32605 }, { "epoch": 4.085954141085077, "grad_norm": 0.07803657650947571, "learning_rate": 9.672319667403462e-06, "loss": 0.4582, "num_input_tokens_seen": 39653504, "step": 32610 }, { "epoch": 4.08658062899386, "grad_norm": 0.09452900290489197, "learning_rate": 9.672124977747139e-06, "loss": 0.4657, "num_input_tokens_seen": 39659712, "step": 32615 }, { "epoch": 4.087207116902643, "grad_norm": 0.07988213002681732, "learning_rate": 9.671930232231631e-06, "loss": 0.4562, "num_input_tokens_seen": 39665984, "step": 32620 }, { "epoch": 4.087833604811427, "grad_norm": 0.07863093912601471, "learning_rate": 9.671735430859272e-06, "loss": 0.4613, "num_input_tokens_seen": 39672288, "step": 32625 }, { "epoch": 4.088460092720211, "grad_norm": 0.07654229551553726, "learning_rate": 9.671540573632387e-06, "loss": 0.4659, "num_input_tokens_seen": 39678272, "step": 32630 }, { "epoch": 4.089086580628994, "grad_norm": 0.14042243361473083, "learning_rate": 9.671345660553309e-06, "loss": 0.4676, "num_input_tokens_seen": 39684352, "step": 32635 }, { "epoch": 4.089713068537777, "grad_norm": 0.07932119816541672, "learning_rate": 9.671150691624363e-06, "loss": 0.4618, "num_input_tokens_seen": 39690560, "step": 32640 }, { "epoch": 4.090339556446561, "grad_norm": 0.06990894675254822, "learning_rate": 9.670955666847885e-06, "loss": 0.4664, "num_input_tokens_seen": 39696864, "step": 32645 }, { "epoch": 4.090966044355344, "grad_norm": 0.12000678479671478, "learning_rate": 9.670760586226205e-06, "loss": 0.4675, "num_input_tokens_seen": 39702848, "step": 32650 }, { "epoch": 4.091592532264127, "grad_norm": 0.08305563032627106, "learning_rate": 9.670565449761653e-06, "loss": 0.4716, "num_input_tokens_seen": 39709120, "step": 32655 }, { "epoch": 4.0922190201729105, "grad_norm": 0.06510838866233826, "learning_rate": 9.670370257456567e-06, "loss": 0.458, "num_input_tokens_seen": 39715200, "step": 32660 }, { "epoch": 4.092845508081694, "grad_norm": 0.06384309381246567, "learning_rate": 9.670175009313276e-06, "loss": 0.4629, "num_input_tokens_seen": 39721248, "step": 32665 }, { "epoch": 4.093471995990478, "grad_norm": 0.0695314034819603, "learning_rate": 9.669979705334118e-06, "loss": 0.4657, "num_input_tokens_seen": 39727136, "step": 32670 }, { "epoch": 4.094098483899261, "grad_norm": 0.0775817334651947, "learning_rate": 9.669784345521425e-06, "loss": 0.4644, "num_input_tokens_seen": 39733440, "step": 32675 }, { "epoch": 4.094724971808044, "grad_norm": 0.11616364121437073, "learning_rate": 9.669588929877535e-06, "loss": 0.4636, "num_input_tokens_seen": 39739552, "step": 32680 }, { "epoch": 4.095351459716827, "grad_norm": 0.07131514698266983, "learning_rate": 9.669393458404783e-06, "loss": 0.4646, "num_input_tokens_seen": 39745760, "step": 32685 }, { "epoch": 4.09597794762561, "grad_norm": 0.07208164036273956, "learning_rate": 9.669197931105505e-06, "loss": 0.4577, "num_input_tokens_seen": 39751936, "step": 32690 }, { "epoch": 4.0966044355343945, "grad_norm": 0.10265087336301804, "learning_rate": 9.669002347982042e-06, "loss": 0.4652, "num_input_tokens_seen": 39758080, "step": 32695 }, { "epoch": 4.097230923443178, "grad_norm": 0.08226969093084335, "learning_rate": 9.66880670903673e-06, "loss": 0.4678, "num_input_tokens_seen": 39764448, "step": 32700 }, { "epoch": 4.097857411351961, "grad_norm": 0.0821433737874031, "learning_rate": 9.668611014271908e-06, "loss": 0.4647, "num_input_tokens_seen": 39770624, "step": 32705 }, { "epoch": 4.098483899260744, "grad_norm": 0.17556002736091614, "learning_rate": 9.668415263689916e-06, "loss": 0.4586, "num_input_tokens_seen": 39776736, "step": 32710 }, { "epoch": 4.099110387169528, "grad_norm": 0.08377011865377426, "learning_rate": 9.668219457293095e-06, "loss": 0.467, "num_input_tokens_seen": 39782912, "step": 32715 }, { "epoch": 4.099736875078311, "grad_norm": 0.11950651556253433, "learning_rate": 9.668023595083786e-06, "loss": 0.4654, "num_input_tokens_seen": 39788800, "step": 32720 }, { "epoch": 4.100363362987094, "grad_norm": 0.07849473506212234, "learning_rate": 9.667827677064329e-06, "loss": 0.4625, "num_input_tokens_seen": 39794688, "step": 32725 }, { "epoch": 4.1009898508958775, "grad_norm": 0.0929611399769783, "learning_rate": 9.66763170323707e-06, "loss": 0.4623, "num_input_tokens_seen": 39801024, "step": 32730 }, { "epoch": 4.101616338804661, "grad_norm": 0.07374614477157593, "learning_rate": 9.667435673604348e-06, "loss": 0.4636, "num_input_tokens_seen": 39807104, "step": 32735 }, { "epoch": 4.102242826713445, "grad_norm": 0.035392675548791885, "learning_rate": 9.667239588168507e-06, "loss": 0.46, "num_input_tokens_seen": 39813344, "step": 32740 }, { "epoch": 4.102869314622228, "grad_norm": 0.035959988832473755, "learning_rate": 9.667043446931895e-06, "loss": 0.4655, "num_input_tokens_seen": 39819584, "step": 32745 }, { "epoch": 4.103495802531011, "grad_norm": 0.07058499753475189, "learning_rate": 9.666847249896853e-06, "loss": 0.4631, "num_input_tokens_seen": 39825632, "step": 32750 }, { "epoch": 4.104122290439794, "grad_norm": 0.07999768108129501, "learning_rate": 9.66665099706573e-06, "loss": 0.4646, "num_input_tokens_seen": 39832064, "step": 32755 }, { "epoch": 4.104748778348577, "grad_norm": 0.06569231301546097, "learning_rate": 9.66645468844087e-06, "loss": 0.461, "num_input_tokens_seen": 39837984, "step": 32760 }, { "epoch": 4.1053752662573615, "grad_norm": 0.12680959701538086, "learning_rate": 9.666258324024622e-06, "loss": 0.4628, "num_input_tokens_seen": 39843936, "step": 32765 }, { "epoch": 4.106001754166145, "grad_norm": 0.08715534210205078, "learning_rate": 9.66606190381933e-06, "loss": 0.4626, "num_input_tokens_seen": 39850176, "step": 32770 }, { "epoch": 4.106628242074928, "grad_norm": 0.06927136331796646, "learning_rate": 9.665865427827347e-06, "loss": 0.4654, "num_input_tokens_seen": 39856544, "step": 32775 }, { "epoch": 4.107254729983711, "grad_norm": 0.09904477000236511, "learning_rate": 9.66566889605102e-06, "loss": 0.4549, "num_input_tokens_seen": 39863008, "step": 32780 }, { "epoch": 4.107881217892495, "grad_norm": 0.06663971394300461, "learning_rate": 9.665472308492699e-06, "loss": 0.4544, "num_input_tokens_seen": 39869280, "step": 32785 }, { "epoch": 4.108507705801278, "grad_norm": 0.029722947627305984, "learning_rate": 9.665275665154731e-06, "loss": 0.4632, "num_input_tokens_seen": 39875264, "step": 32790 }, { "epoch": 4.109134193710061, "grad_norm": 0.03733581677079201, "learning_rate": 9.665078966039474e-06, "loss": 0.4577, "num_input_tokens_seen": 39881632, "step": 32795 }, { "epoch": 4.109760681618845, "grad_norm": 0.0710102766752243, "learning_rate": 9.664882211149273e-06, "loss": 0.4591, "num_input_tokens_seen": 39887520, "step": 32800 }, { "epoch": 4.110387169527628, "grad_norm": 0.14415206015110016, "learning_rate": 9.664685400486486e-06, "loss": 0.4568, "num_input_tokens_seen": 39893472, "step": 32805 }, { "epoch": 4.111013657436412, "grad_norm": 0.14184865355491638, "learning_rate": 9.66448853405346e-06, "loss": 0.4712, "num_input_tokens_seen": 39899808, "step": 32810 }, { "epoch": 4.111640145345195, "grad_norm": 0.0669759213924408, "learning_rate": 9.664291611852554e-06, "loss": 0.4586, "num_input_tokens_seen": 39905888, "step": 32815 }, { "epoch": 4.112266633253978, "grad_norm": 0.08065623044967651, "learning_rate": 9.66409463388612e-06, "loss": 0.4653, "num_input_tokens_seen": 39911520, "step": 32820 }, { "epoch": 4.112893121162761, "grad_norm": 0.10833395272493362, "learning_rate": 9.663897600156512e-06, "loss": 0.4683, "num_input_tokens_seen": 39917408, "step": 32825 }, { "epoch": 4.113519609071545, "grad_norm": 0.1401567906141281, "learning_rate": 9.663700510666088e-06, "loss": 0.4618, "num_input_tokens_seen": 39923488, "step": 32830 }, { "epoch": 4.1141460969803285, "grad_norm": 0.10555564612150192, "learning_rate": 9.663503365417205e-06, "loss": 0.4552, "num_input_tokens_seen": 39929408, "step": 32835 }, { "epoch": 4.114772584889112, "grad_norm": 0.09055907279253006, "learning_rate": 9.663306164412215e-06, "loss": 0.4635, "num_input_tokens_seen": 39935488, "step": 32840 }, { "epoch": 4.115399072797895, "grad_norm": 0.07765832543373108, "learning_rate": 9.66310890765348e-06, "loss": 0.4617, "num_input_tokens_seen": 39941696, "step": 32845 }, { "epoch": 4.116025560706678, "grad_norm": 0.09781863540410995, "learning_rate": 9.662911595143359e-06, "loss": 0.4667, "num_input_tokens_seen": 39948000, "step": 32850 }, { "epoch": 4.116652048615462, "grad_norm": 0.07965624332427979, "learning_rate": 9.662714226884209e-06, "loss": 0.4629, "num_input_tokens_seen": 39953824, "step": 32855 }, { "epoch": 4.117278536524245, "grad_norm": 0.1393968164920807, "learning_rate": 9.66251680287839e-06, "loss": 0.4592, "num_input_tokens_seen": 39960192, "step": 32860 }, { "epoch": 4.117905024433028, "grad_norm": 0.06557948142290115, "learning_rate": 9.662319323128263e-06, "loss": 0.463, "num_input_tokens_seen": 39966464, "step": 32865 }, { "epoch": 4.118531512341812, "grad_norm": 0.07401301711797714, "learning_rate": 9.662121787636188e-06, "loss": 0.466, "num_input_tokens_seen": 39972832, "step": 32870 }, { "epoch": 4.119158000250595, "grad_norm": 0.09186612069606781, "learning_rate": 9.661924196404527e-06, "loss": 0.4653, "num_input_tokens_seen": 39979232, "step": 32875 }, { "epoch": 4.119784488159379, "grad_norm": 0.09696121513843536, "learning_rate": 9.661726549435643e-06, "loss": 0.4604, "num_input_tokens_seen": 39985504, "step": 32880 }, { "epoch": 4.120410976068162, "grad_norm": 0.1274109184741974, "learning_rate": 9.661528846731899e-06, "loss": 0.4716, "num_input_tokens_seen": 39991648, "step": 32885 }, { "epoch": 4.121037463976945, "grad_norm": 0.03275197744369507, "learning_rate": 9.661331088295658e-06, "loss": 0.467, "num_input_tokens_seen": 39997728, "step": 32890 }, { "epoch": 4.121663951885728, "grad_norm": 0.0623779259622097, "learning_rate": 9.661133274129286e-06, "loss": 0.4611, "num_input_tokens_seen": 40003968, "step": 32895 }, { "epoch": 4.122290439794512, "grad_norm": 0.02984052337706089, "learning_rate": 9.660935404235146e-06, "loss": 0.4659, "num_input_tokens_seen": 40009600, "step": 32900 }, { "epoch": 4.1229169277032955, "grad_norm": 0.06603798270225525, "learning_rate": 9.660737478615604e-06, "loss": 0.464, "num_input_tokens_seen": 40015328, "step": 32905 }, { "epoch": 4.123543415612079, "grad_norm": 0.0644012913107872, "learning_rate": 9.660539497273028e-06, "loss": 0.4619, "num_input_tokens_seen": 40021312, "step": 32910 }, { "epoch": 4.124169903520862, "grad_norm": 0.07991495728492737, "learning_rate": 9.660341460209783e-06, "loss": 0.4686, "num_input_tokens_seen": 40027456, "step": 32915 }, { "epoch": 4.124796391429645, "grad_norm": 0.10720867663621902, "learning_rate": 9.660143367428239e-06, "loss": 0.4637, "num_input_tokens_seen": 40033600, "step": 32920 }, { "epoch": 4.125422879338429, "grad_norm": 0.06362423300743103, "learning_rate": 9.659945218930761e-06, "loss": 0.4618, "num_input_tokens_seen": 40039424, "step": 32925 }, { "epoch": 4.126049367247212, "grad_norm": 0.11670316010713577, "learning_rate": 9.659747014719722e-06, "loss": 0.4621, "num_input_tokens_seen": 40045664, "step": 32930 }, { "epoch": 4.126675855155995, "grad_norm": 0.12041199207305908, "learning_rate": 9.65954875479749e-06, "loss": 0.4641, "num_input_tokens_seen": 40051264, "step": 32935 }, { "epoch": 4.127302343064779, "grad_norm": 0.06296247988939285, "learning_rate": 9.659350439166433e-06, "loss": 0.4605, "num_input_tokens_seen": 40057472, "step": 32940 }, { "epoch": 4.127928830973562, "grad_norm": 0.0742548406124115, "learning_rate": 9.659152067828926e-06, "loss": 0.461, "num_input_tokens_seen": 40063168, "step": 32945 }, { "epoch": 4.128555318882346, "grad_norm": 0.040833890438079834, "learning_rate": 9.658953640787336e-06, "loss": 0.4618, "num_input_tokens_seen": 40069248, "step": 32950 }, { "epoch": 4.129181806791129, "grad_norm": 0.0688372403383255, "learning_rate": 9.65875515804404e-06, "loss": 0.4599, "num_input_tokens_seen": 40075264, "step": 32955 }, { "epoch": 4.129808294699912, "grad_norm": 0.08732917904853821, "learning_rate": 9.65855661960141e-06, "loss": 0.4628, "num_input_tokens_seen": 40081696, "step": 32960 }, { "epoch": 4.130434782608695, "grad_norm": 0.07935898005962372, "learning_rate": 9.658358025461818e-06, "loss": 0.4581, "num_input_tokens_seen": 40087808, "step": 32965 }, { "epoch": 4.131061270517479, "grad_norm": 0.03816810995340347, "learning_rate": 9.658159375627641e-06, "loss": 0.459, "num_input_tokens_seen": 40093920, "step": 32970 }, { "epoch": 4.131687758426263, "grad_norm": 0.07484220713376999, "learning_rate": 9.65796067010125e-06, "loss": 0.4653, "num_input_tokens_seen": 40099296, "step": 32975 }, { "epoch": 4.132314246335046, "grad_norm": 0.0648096576333046, "learning_rate": 9.657761908885024e-06, "loss": 0.4617, "num_input_tokens_seen": 40104800, "step": 32980 }, { "epoch": 4.132940734243829, "grad_norm": 0.06352439522743225, "learning_rate": 9.657563091981338e-06, "loss": 0.4612, "num_input_tokens_seen": 40110944, "step": 32985 }, { "epoch": 4.133567222152612, "grad_norm": 0.07453668117523193, "learning_rate": 9.65736421939257e-06, "loss": 0.4646, "num_input_tokens_seen": 40117152, "step": 32990 }, { "epoch": 4.134193710061396, "grad_norm": 0.09169205278158188, "learning_rate": 9.657165291121096e-06, "loss": 0.469, "num_input_tokens_seen": 40123296, "step": 32995 }, { "epoch": 4.134820197970179, "grad_norm": 0.11959951370954514, "learning_rate": 9.656966307169296e-06, "loss": 0.4598, "num_input_tokens_seen": 40129376, "step": 33000 }, { "epoch": 4.1354466858789625, "grad_norm": 0.13152307271957397, "learning_rate": 9.656767267539548e-06, "loss": 0.4594, "num_input_tokens_seen": 40135520, "step": 33005 }, { "epoch": 4.136073173787746, "grad_norm": 0.08435894548892975, "learning_rate": 9.656568172234234e-06, "loss": 0.4648, "num_input_tokens_seen": 40141696, "step": 33010 }, { "epoch": 4.136699661696529, "grad_norm": 0.03863812983036041, "learning_rate": 9.65636902125573e-06, "loss": 0.4587, "num_input_tokens_seen": 40147648, "step": 33015 }, { "epoch": 4.137326149605313, "grad_norm": 0.08337559551000595, "learning_rate": 9.656169814606421e-06, "loss": 0.4617, "num_input_tokens_seen": 40153664, "step": 33020 }, { "epoch": 4.137952637514096, "grad_norm": 0.12442953139543533, "learning_rate": 9.655970552288687e-06, "loss": 0.4685, "num_input_tokens_seen": 40159904, "step": 33025 }, { "epoch": 4.138579125422879, "grad_norm": 0.14457322657108307, "learning_rate": 9.65577123430491e-06, "loss": 0.4637, "num_input_tokens_seen": 40165888, "step": 33030 }, { "epoch": 4.139205613331662, "grad_norm": 0.06963276118040085, "learning_rate": 9.655571860657474e-06, "loss": 0.4603, "num_input_tokens_seen": 40171872, "step": 33035 }, { "epoch": 4.139832101240446, "grad_norm": 0.07710011303424835, "learning_rate": 9.655372431348761e-06, "loss": 0.4709, "num_input_tokens_seen": 40178272, "step": 33040 }, { "epoch": 4.14045858914923, "grad_norm": 0.07114087045192719, "learning_rate": 9.655172946381159e-06, "loss": 0.4656, "num_input_tokens_seen": 40184384, "step": 33045 }, { "epoch": 4.141085077058013, "grad_norm": 0.07562193274497986, "learning_rate": 9.654973405757048e-06, "loss": 0.4604, "num_input_tokens_seen": 40190400, "step": 33050 }, { "epoch": 4.141711564966796, "grad_norm": 0.0720343217253685, "learning_rate": 9.654773809478817e-06, "loss": 0.4577, "num_input_tokens_seen": 40196704, "step": 33055 }, { "epoch": 4.142338052875579, "grad_norm": 0.08102907985448837, "learning_rate": 9.654574157548853e-06, "loss": 0.47, "num_input_tokens_seen": 40202784, "step": 33060 }, { "epoch": 4.142964540784363, "grad_norm": 0.13347749412059784, "learning_rate": 9.654374449969542e-06, "loss": 0.4618, "num_input_tokens_seen": 40209056, "step": 33065 }, { "epoch": 4.143591028693146, "grad_norm": 0.0809725970029831, "learning_rate": 9.654174686743269e-06, "loss": 0.4545, "num_input_tokens_seen": 40215296, "step": 33070 }, { "epoch": 4.1442175166019295, "grad_norm": 0.10966005176305771, "learning_rate": 9.653974867872424e-06, "loss": 0.4608, "num_input_tokens_seen": 40221440, "step": 33075 }, { "epoch": 4.144844004510713, "grad_norm": 0.07287750393152237, "learning_rate": 9.6537749933594e-06, "loss": 0.4687, "num_input_tokens_seen": 40227616, "step": 33080 }, { "epoch": 4.145470492419497, "grad_norm": 0.06784097850322723, "learning_rate": 9.653575063206581e-06, "loss": 0.4612, "num_input_tokens_seen": 40233696, "step": 33085 }, { "epoch": 4.14609698032828, "grad_norm": 0.0732761099934578, "learning_rate": 9.65337507741636e-06, "loss": 0.4602, "num_input_tokens_seen": 40239616, "step": 33090 }, { "epoch": 4.146723468237063, "grad_norm": 0.06483650207519531, "learning_rate": 9.653175035991127e-06, "loss": 0.4648, "num_input_tokens_seen": 40245952, "step": 33095 }, { "epoch": 4.147349956145846, "grad_norm": 0.04724687710404396, "learning_rate": 9.652974938933275e-06, "loss": 0.4665, "num_input_tokens_seen": 40251616, "step": 33100 }, { "epoch": 4.147976444054629, "grad_norm": 0.12171722203493118, "learning_rate": 9.652774786245195e-06, "loss": 0.4601, "num_input_tokens_seen": 40257536, "step": 33105 }, { "epoch": 4.1486029319634135, "grad_norm": 0.13621866703033447, "learning_rate": 9.65257457792928e-06, "loss": 0.4575, "num_input_tokens_seen": 40263424, "step": 33110 }, { "epoch": 4.149229419872197, "grad_norm": 0.07731418311595917, "learning_rate": 9.652374313987927e-06, "loss": 0.4669, "num_input_tokens_seen": 40269408, "step": 33115 }, { "epoch": 4.14985590778098, "grad_norm": 0.06666959822177887, "learning_rate": 9.652173994423526e-06, "loss": 0.4601, "num_input_tokens_seen": 40275296, "step": 33120 }, { "epoch": 4.150482395689763, "grad_norm": 0.07233579456806183, "learning_rate": 9.651973619238472e-06, "loss": 0.46, "num_input_tokens_seen": 40281440, "step": 33125 }, { "epoch": 4.151108883598546, "grad_norm": 0.06920164823532104, "learning_rate": 9.651773188435165e-06, "loss": 0.4637, "num_input_tokens_seen": 40287008, "step": 33130 }, { "epoch": 4.15173537150733, "grad_norm": 0.06609921157360077, "learning_rate": 9.651572702015997e-06, "loss": 0.4528, "num_input_tokens_seen": 40293216, "step": 33135 }, { "epoch": 4.152361859416113, "grad_norm": 0.0762842521071434, "learning_rate": 9.651372159983365e-06, "loss": 0.4618, "num_input_tokens_seen": 40299136, "step": 33140 }, { "epoch": 4.1529883473248965, "grad_norm": 0.0382051095366478, "learning_rate": 9.651171562339672e-06, "loss": 0.4648, "num_input_tokens_seen": 40305280, "step": 33145 }, { "epoch": 4.15361483523368, "grad_norm": 0.040598295629024506, "learning_rate": 9.650970909087309e-06, "loss": 0.4584, "num_input_tokens_seen": 40311456, "step": 33150 }, { "epoch": 4.154241323142464, "grad_norm": 0.0895342156291008, "learning_rate": 9.65077020022868e-06, "loss": 0.4642, "num_input_tokens_seen": 40317568, "step": 33155 }, { "epoch": 4.154867811051247, "grad_norm": 0.07372969388961792, "learning_rate": 9.650569435766183e-06, "loss": 0.464, "num_input_tokens_seen": 40323872, "step": 33160 }, { "epoch": 4.15549429896003, "grad_norm": 0.1108294278383255, "learning_rate": 9.650368615702218e-06, "loss": 0.4619, "num_input_tokens_seen": 40329760, "step": 33165 }, { "epoch": 4.156120786868813, "grad_norm": 0.07885927706956863, "learning_rate": 9.650167740039187e-06, "loss": 0.4642, "num_input_tokens_seen": 40335680, "step": 33170 }, { "epoch": 4.156747274777596, "grad_norm": 0.07885489612817764, "learning_rate": 9.64996680877949e-06, "loss": 0.4586, "num_input_tokens_seen": 40341632, "step": 33175 }, { "epoch": 4.1573737626863805, "grad_norm": 0.07442783564329147, "learning_rate": 9.649765821925532e-06, "loss": 0.4653, "num_input_tokens_seen": 40347744, "step": 33180 }, { "epoch": 4.158000250595164, "grad_norm": 0.06895078718662262, "learning_rate": 9.649564779479713e-06, "loss": 0.4571, "num_input_tokens_seen": 40353344, "step": 33185 }, { "epoch": 4.158626738503947, "grad_norm": 0.08401983976364136, "learning_rate": 9.649363681444438e-06, "loss": 0.4585, "num_input_tokens_seen": 40359104, "step": 33190 }, { "epoch": 4.15925322641273, "grad_norm": 0.068961001932621, "learning_rate": 9.649162527822112e-06, "loss": 0.4606, "num_input_tokens_seen": 40365344, "step": 33195 }, { "epoch": 4.159879714321514, "grad_norm": 0.07770317792892456, "learning_rate": 9.648961318615138e-06, "loss": 0.4595, "num_input_tokens_seen": 40371616, "step": 33200 }, { "epoch": 4.160506202230297, "grad_norm": 0.13297680020332336, "learning_rate": 9.648760053825924e-06, "loss": 0.4694, "num_input_tokens_seen": 40378208, "step": 33205 }, { "epoch": 4.16113269013908, "grad_norm": 0.06848481297492981, "learning_rate": 9.648558733456874e-06, "loss": 0.463, "num_input_tokens_seen": 40384320, "step": 33210 }, { "epoch": 4.1617591780478635, "grad_norm": 0.0765657052397728, "learning_rate": 9.648357357510398e-06, "loss": 0.46, "num_input_tokens_seen": 40390752, "step": 33215 }, { "epoch": 4.162385665956647, "grad_norm": 0.08327893167734146, "learning_rate": 9.6481559259889e-06, "loss": 0.4657, "num_input_tokens_seen": 40396704, "step": 33220 }, { "epoch": 4.163012153865431, "grad_norm": 0.08342490345239639, "learning_rate": 9.64795443889479e-06, "loss": 0.4633, "num_input_tokens_seen": 40402848, "step": 33225 }, { "epoch": 4.163638641774214, "grad_norm": 0.08817464113235474, "learning_rate": 9.647752896230477e-06, "loss": 0.4628, "num_input_tokens_seen": 40408960, "step": 33230 }, { "epoch": 4.164265129682997, "grad_norm": 0.04204966872930527, "learning_rate": 9.64755129799837e-06, "loss": 0.4609, "num_input_tokens_seen": 40415072, "step": 33235 }, { "epoch": 4.16489161759178, "grad_norm": 0.0720173567533493, "learning_rate": 9.647349644200882e-06, "loss": 0.462, "num_input_tokens_seen": 40421152, "step": 33240 }, { "epoch": 4.1655181055005635, "grad_norm": 0.0836564302444458, "learning_rate": 9.64714793484042e-06, "loss": 0.4616, "num_input_tokens_seen": 40426816, "step": 33245 }, { "epoch": 4.1661445934093475, "grad_norm": 0.03958045691251755, "learning_rate": 9.646946169919395e-06, "loss": 0.4621, "num_input_tokens_seen": 40433184, "step": 33250 }, { "epoch": 4.166771081318131, "grad_norm": 0.09541281312704086, "learning_rate": 9.646744349440224e-06, "loss": 0.4587, "num_input_tokens_seen": 40439584, "step": 33255 }, { "epoch": 4.167397569226914, "grad_norm": 0.0795540064573288, "learning_rate": 9.64654247340532e-06, "loss": 0.4573, "num_input_tokens_seen": 40445760, "step": 33260 }, { "epoch": 4.168024057135697, "grad_norm": 0.07519067823886871, "learning_rate": 9.64634054181709e-06, "loss": 0.4621, "num_input_tokens_seen": 40451968, "step": 33265 }, { "epoch": 4.168650545044481, "grad_norm": 0.08264613151550293, "learning_rate": 9.646138554677955e-06, "loss": 0.4614, "num_input_tokens_seen": 40458176, "step": 33270 }, { "epoch": 4.169277032953264, "grad_norm": 0.08509417623281479, "learning_rate": 9.645936511990326e-06, "loss": 0.4638, "num_input_tokens_seen": 40464576, "step": 33275 }, { "epoch": 4.169903520862047, "grad_norm": 0.09429135918617249, "learning_rate": 9.64573441375662e-06, "loss": 0.4692, "num_input_tokens_seen": 40470816, "step": 33280 }, { "epoch": 4.170530008770831, "grad_norm": 0.1361287534236908, "learning_rate": 9.645532259979255e-06, "loss": 0.4621, "num_input_tokens_seen": 40476928, "step": 33285 }, { "epoch": 4.171156496679614, "grad_norm": 0.09538811445236206, "learning_rate": 9.645330050660644e-06, "loss": 0.4631, "num_input_tokens_seen": 40483136, "step": 33290 }, { "epoch": 4.171782984588398, "grad_norm": 0.044922441244125366, "learning_rate": 9.645127785803208e-06, "loss": 0.4599, "num_input_tokens_seen": 40489184, "step": 33295 }, { "epoch": 4.172409472497181, "grad_norm": 0.1272597461938858, "learning_rate": 9.644925465409363e-06, "loss": 0.4583, "num_input_tokens_seen": 40495168, "step": 33300 }, { "epoch": 4.173035960405964, "grad_norm": 0.1417139172554016, "learning_rate": 9.644723089481528e-06, "loss": 0.4671, "num_input_tokens_seen": 40500672, "step": 33305 }, { "epoch": 4.173662448314747, "grad_norm": 0.09873604029417038, "learning_rate": 9.644520658022124e-06, "loss": 0.4688, "num_input_tokens_seen": 40506624, "step": 33310 }, { "epoch": 4.1742889362235305, "grad_norm": 0.07509023696184158, "learning_rate": 9.644318171033571e-06, "loss": 0.4566, "num_input_tokens_seen": 40512672, "step": 33315 }, { "epoch": 4.1749154241323145, "grad_norm": 0.059081993997097015, "learning_rate": 9.64411562851829e-06, "loss": 0.4631, "num_input_tokens_seen": 40519072, "step": 33320 }, { "epoch": 4.175541912041098, "grad_norm": 0.0916154682636261, "learning_rate": 9.643913030478702e-06, "loss": 0.4633, "num_input_tokens_seen": 40525344, "step": 33325 }, { "epoch": 4.176168399949881, "grad_norm": 0.10085310786962509, "learning_rate": 9.643710376917228e-06, "loss": 0.4624, "num_input_tokens_seen": 40531488, "step": 33330 }, { "epoch": 4.176794887858664, "grad_norm": 0.09294933080673218, "learning_rate": 9.643507667836294e-06, "loss": 0.4668, "num_input_tokens_seen": 40537760, "step": 33335 }, { "epoch": 4.177421375767448, "grad_norm": 0.12321028858423233, "learning_rate": 9.643304903238322e-06, "loss": 0.4645, "num_input_tokens_seen": 40544128, "step": 33340 }, { "epoch": 4.178047863676231, "grad_norm": 0.12365732342004776, "learning_rate": 9.643102083125735e-06, "loss": 0.4634, "num_input_tokens_seen": 40550272, "step": 33345 }, { "epoch": 4.178674351585014, "grad_norm": 0.08624038845300674, "learning_rate": 9.642899207500957e-06, "loss": 0.4617, "num_input_tokens_seen": 40556320, "step": 33350 }, { "epoch": 4.179300839493798, "grad_norm": 0.08667368441820145, "learning_rate": 9.64269627636642e-06, "loss": 0.4687, "num_input_tokens_seen": 40562432, "step": 33355 }, { "epoch": 4.179927327402581, "grad_norm": 0.07232996821403503, "learning_rate": 9.642493289724542e-06, "loss": 0.4571, "num_input_tokens_seen": 40568480, "step": 33360 }, { "epoch": 4.180553815311365, "grad_norm": 0.11962398886680603, "learning_rate": 9.642290247577754e-06, "loss": 0.4628, "num_input_tokens_seen": 40574592, "step": 33365 }, { "epoch": 4.181180303220148, "grad_norm": 0.1421913057565689, "learning_rate": 9.642087149928484e-06, "loss": 0.4669, "num_input_tokens_seen": 40580512, "step": 33370 }, { "epoch": 4.181806791128931, "grad_norm": 0.03903266414999962, "learning_rate": 9.641883996779158e-06, "loss": 0.4639, "num_input_tokens_seen": 40586496, "step": 33375 }, { "epoch": 4.182433279037714, "grad_norm": 0.06757195293903351, "learning_rate": 9.641680788132209e-06, "loss": 0.4576, "num_input_tokens_seen": 40592224, "step": 33380 }, { "epoch": 4.1830597669464975, "grad_norm": 0.14315280318260193, "learning_rate": 9.641477523990061e-06, "loss": 0.4631, "num_input_tokens_seen": 40598336, "step": 33385 }, { "epoch": 4.183686254855282, "grad_norm": 0.07826583087444305, "learning_rate": 9.641274204355146e-06, "loss": 0.4579, "num_input_tokens_seen": 40604544, "step": 33390 }, { "epoch": 4.184312742764065, "grad_norm": 0.07332658767700195, "learning_rate": 9.641070829229898e-06, "loss": 0.4606, "num_input_tokens_seen": 40610816, "step": 33395 }, { "epoch": 4.184939230672848, "grad_norm": 0.09426950663328171, "learning_rate": 9.640867398616745e-06, "loss": 0.4634, "num_input_tokens_seen": 40617216, "step": 33400 }, { "epoch": 4.185565718581631, "grad_norm": 0.07830415666103363, "learning_rate": 9.640663912518123e-06, "loss": 0.4608, "num_input_tokens_seen": 40623168, "step": 33405 }, { "epoch": 4.186192206490415, "grad_norm": 0.08764997869729996, "learning_rate": 9.64046037093646e-06, "loss": 0.4658, "num_input_tokens_seen": 40629632, "step": 33410 }, { "epoch": 4.186818694399198, "grad_norm": 0.07685869187116623, "learning_rate": 9.640256773874191e-06, "loss": 0.4591, "num_input_tokens_seen": 40635584, "step": 33415 }, { "epoch": 4.1874451823079815, "grad_norm": 0.1033926010131836, "learning_rate": 9.640053121333752e-06, "loss": 0.4595, "num_input_tokens_seen": 40641760, "step": 33420 }, { "epoch": 4.188071670216765, "grad_norm": 0.12485978752374649, "learning_rate": 9.639849413317576e-06, "loss": 0.4648, "num_input_tokens_seen": 40647104, "step": 33425 }, { "epoch": 4.188698158125548, "grad_norm": 0.08986087143421173, "learning_rate": 9.6396456498281e-06, "loss": 0.4639, "num_input_tokens_seen": 40652928, "step": 33430 }, { "epoch": 4.189324646034332, "grad_norm": 0.09655705839395523, "learning_rate": 9.639441830867759e-06, "loss": 0.462, "num_input_tokens_seen": 40659136, "step": 33435 }, { "epoch": 4.189951133943115, "grad_norm": 0.0746568962931633, "learning_rate": 9.639237956438991e-06, "loss": 0.4595, "num_input_tokens_seen": 40664672, "step": 33440 }, { "epoch": 4.190577621851898, "grad_norm": 0.13921861350536346, "learning_rate": 9.639034026544234e-06, "loss": 0.4627, "num_input_tokens_seen": 40671168, "step": 33445 }, { "epoch": 4.191204109760681, "grad_norm": 0.0897083431482315, "learning_rate": 9.638830041185922e-06, "loss": 0.4531, "num_input_tokens_seen": 40677120, "step": 33450 }, { "epoch": 4.191830597669465, "grad_norm": 0.14091244339942932, "learning_rate": 9.6386260003665e-06, "loss": 0.4639, "num_input_tokens_seen": 40683584, "step": 33455 }, { "epoch": 4.192457085578249, "grad_norm": 0.08202093094587326, "learning_rate": 9.638421904088402e-06, "loss": 0.4591, "num_input_tokens_seen": 40689408, "step": 33460 }, { "epoch": 4.193083573487032, "grad_norm": 0.12926064431667328, "learning_rate": 9.638217752354071e-06, "loss": 0.4652, "num_input_tokens_seen": 40695712, "step": 33465 }, { "epoch": 4.193710061395815, "grad_norm": 0.09519098699092865, "learning_rate": 9.638013545165947e-06, "loss": 0.4702, "num_input_tokens_seen": 40701568, "step": 33470 }, { "epoch": 4.194336549304598, "grad_norm": 0.12803083658218384, "learning_rate": 9.637809282526471e-06, "loss": 0.462, "num_input_tokens_seen": 40707040, "step": 33475 }, { "epoch": 4.194963037213382, "grad_norm": 0.10079722851514816, "learning_rate": 9.637604964438087e-06, "loss": 0.4659, "num_input_tokens_seen": 40712832, "step": 33480 }, { "epoch": 4.195589525122165, "grad_norm": 0.13377061486244202, "learning_rate": 9.637400590903235e-06, "loss": 0.464, "num_input_tokens_seen": 40718944, "step": 33485 }, { "epoch": 4.1962160130309485, "grad_norm": 0.076447032392025, "learning_rate": 9.637196161924362e-06, "loss": 0.4614, "num_input_tokens_seen": 40724768, "step": 33490 }, { "epoch": 4.196842500939732, "grad_norm": 0.08505752682685852, "learning_rate": 9.636991677503909e-06, "loss": 0.468, "num_input_tokens_seen": 40731040, "step": 33495 }, { "epoch": 4.197468988848515, "grad_norm": 0.12524956464767456, "learning_rate": 9.636787137644321e-06, "loss": 0.4684, "num_input_tokens_seen": 40737216, "step": 33500 }, { "epoch": 4.198095476757299, "grad_norm": 0.1466747373342514, "learning_rate": 9.636582542348045e-06, "loss": 0.4619, "num_input_tokens_seen": 40743584, "step": 33505 }, { "epoch": 4.198721964666082, "grad_norm": 0.038832489401102066, "learning_rate": 9.636377891617528e-06, "loss": 0.4658, "num_input_tokens_seen": 40749632, "step": 33510 }, { "epoch": 4.199348452574865, "grad_norm": 0.15108570456504822, "learning_rate": 9.636173185455213e-06, "loss": 0.4643, "num_input_tokens_seen": 40755072, "step": 33515 }, { "epoch": 4.199974940483648, "grad_norm": 0.040911078453063965, "learning_rate": 9.635968423863552e-06, "loss": 0.4752, "num_input_tokens_seen": 40761184, "step": 33520 }, { "epoch": 4.2006014283924324, "grad_norm": 0.03600238636136055, "learning_rate": 9.63576360684499e-06, "loss": 0.457, "num_input_tokens_seen": 40767328, "step": 33525 }, { "epoch": 4.201227916301216, "grad_norm": 0.08261014521121979, "learning_rate": 9.635558734401975e-06, "loss": 0.4612, "num_input_tokens_seen": 40773440, "step": 33530 }, { "epoch": 4.201854404209999, "grad_norm": 0.08524678647518158, "learning_rate": 9.63535380653696e-06, "loss": 0.4602, "num_input_tokens_seen": 40779424, "step": 33535 }, { "epoch": 4.202480892118782, "grad_norm": 0.1052304357290268, "learning_rate": 9.635148823252392e-06, "loss": 0.4693, "num_input_tokens_seen": 40785792, "step": 33540 }, { "epoch": 4.203107380027565, "grad_norm": 0.07742089033126831, "learning_rate": 9.634943784550722e-06, "loss": 0.4639, "num_input_tokens_seen": 40792064, "step": 33545 }, { "epoch": 4.203733867936349, "grad_norm": 0.07546436786651611, "learning_rate": 9.634738690434404e-06, "loss": 0.4627, "num_input_tokens_seen": 40798112, "step": 33550 }, { "epoch": 4.204360355845132, "grad_norm": 0.1127467006444931, "learning_rate": 9.634533540905888e-06, "loss": 0.4634, "num_input_tokens_seen": 40804416, "step": 33555 }, { "epoch": 4.2049868437539155, "grad_norm": 0.06992695480585098, "learning_rate": 9.634328335967626e-06, "loss": 0.4606, "num_input_tokens_seen": 40810880, "step": 33560 }, { "epoch": 4.205613331662699, "grad_norm": 0.08254321664571762, "learning_rate": 9.634123075622073e-06, "loss": 0.4608, "num_input_tokens_seen": 40816896, "step": 33565 }, { "epoch": 4.206239819571482, "grad_norm": 0.06857620179653168, "learning_rate": 9.633917759871682e-06, "loss": 0.4658, "num_input_tokens_seen": 40823200, "step": 33570 }, { "epoch": 4.206866307480266, "grad_norm": 0.07983061671257019, "learning_rate": 9.633712388718908e-06, "loss": 0.4643, "num_input_tokens_seen": 40829440, "step": 33575 }, { "epoch": 4.207492795389049, "grad_norm": 0.07351355999708176, "learning_rate": 9.633506962166209e-06, "loss": 0.4659, "num_input_tokens_seen": 40835488, "step": 33580 }, { "epoch": 4.208119283297832, "grad_norm": 0.07480475306510925, "learning_rate": 9.633301480216035e-06, "loss": 0.4584, "num_input_tokens_seen": 40840736, "step": 33585 }, { "epoch": 4.208745771206615, "grad_norm": 0.0715295597910881, "learning_rate": 9.63309594287085e-06, "loss": 0.4586, "num_input_tokens_seen": 40846560, "step": 33590 }, { "epoch": 4.2093722591153995, "grad_norm": 0.08383622020483017, "learning_rate": 9.632890350133105e-06, "loss": 0.4628, "num_input_tokens_seen": 40852768, "step": 33595 }, { "epoch": 4.209998747024183, "grad_norm": 0.10800129175186157, "learning_rate": 9.632684702005262e-06, "loss": 0.4655, "num_input_tokens_seen": 40858944, "step": 33600 }, { "epoch": 4.210625234932966, "grad_norm": 0.07137037068605423, "learning_rate": 9.632478998489778e-06, "loss": 0.4656, "num_input_tokens_seen": 40864928, "step": 33605 }, { "epoch": 4.211251722841749, "grad_norm": 0.03637329116463661, "learning_rate": 9.632273239589112e-06, "loss": 0.4641, "num_input_tokens_seen": 40871168, "step": 33610 }, { "epoch": 4.211878210750532, "grad_norm": 0.12153435498476028, "learning_rate": 9.632067425305726e-06, "loss": 0.4611, "num_input_tokens_seen": 40877536, "step": 33615 }, { "epoch": 4.212504698659316, "grad_norm": 0.1229454055428505, "learning_rate": 9.631861555642079e-06, "loss": 0.4639, "num_input_tokens_seen": 40883744, "step": 33620 }, { "epoch": 4.213131186568099, "grad_norm": 0.08315238356590271, "learning_rate": 9.631655630600633e-06, "loss": 0.4641, "num_input_tokens_seen": 40889984, "step": 33625 }, { "epoch": 4.2137576744768825, "grad_norm": 0.07162006944417953, "learning_rate": 9.63144965018385e-06, "loss": 0.4628, "num_input_tokens_seen": 40896256, "step": 33630 }, { "epoch": 4.214384162385666, "grad_norm": 0.07863644510507584, "learning_rate": 9.631243614394192e-06, "loss": 0.461, "num_input_tokens_seen": 40901792, "step": 33635 }, { "epoch": 4.215010650294449, "grad_norm": 0.0794287845492363, "learning_rate": 9.631037523234123e-06, "loss": 0.4633, "num_input_tokens_seen": 40907424, "step": 33640 }, { "epoch": 4.215637138203233, "grad_norm": 0.07294017821550369, "learning_rate": 9.630831376706108e-06, "loss": 0.4612, "num_input_tokens_seen": 40913376, "step": 33645 }, { "epoch": 4.216263626112016, "grad_norm": 0.11023569852113724, "learning_rate": 9.630625174812608e-06, "loss": 0.4659, "num_input_tokens_seen": 40919552, "step": 33650 }, { "epoch": 4.216890114020799, "grad_norm": 0.08722478896379471, "learning_rate": 9.630418917556093e-06, "loss": 0.462, "num_input_tokens_seen": 40925536, "step": 33655 }, { "epoch": 4.217516601929582, "grad_norm": 0.10088454186916351, "learning_rate": 9.630212604939026e-06, "loss": 0.4662, "num_input_tokens_seen": 40931552, "step": 33660 }, { "epoch": 4.2181430898383665, "grad_norm": 0.08074033260345459, "learning_rate": 9.630006236963876e-06, "loss": 0.4665, "num_input_tokens_seen": 40937600, "step": 33665 }, { "epoch": 4.21876957774715, "grad_norm": 0.11995278298854828, "learning_rate": 9.629799813633108e-06, "loss": 0.4654, "num_input_tokens_seen": 40943936, "step": 33670 }, { "epoch": 4.219396065655933, "grad_norm": 0.07251673936843872, "learning_rate": 9.629593334949188e-06, "loss": 0.4628, "num_input_tokens_seen": 40950176, "step": 33675 }, { "epoch": 4.220022553564716, "grad_norm": 0.11175991594791412, "learning_rate": 9.629386800914592e-06, "loss": 0.4644, "num_input_tokens_seen": 40956128, "step": 33680 }, { "epoch": 4.220649041473499, "grad_norm": 0.1081269159913063, "learning_rate": 9.62918021153178e-06, "loss": 0.463, "num_input_tokens_seen": 40962400, "step": 33685 }, { "epoch": 4.221275529382283, "grad_norm": 0.11650165170431137, "learning_rate": 9.628973566803228e-06, "loss": 0.4652, "num_input_tokens_seen": 40968416, "step": 33690 }, { "epoch": 4.221902017291066, "grad_norm": 0.06227128207683563, "learning_rate": 9.628766866731407e-06, "loss": 0.4622, "num_input_tokens_seen": 40974720, "step": 33695 }, { "epoch": 4.22252850519985, "grad_norm": 0.06512721627950668, "learning_rate": 9.628560111318786e-06, "loss": 0.4573, "num_input_tokens_seen": 40980832, "step": 33700 }, { "epoch": 4.223154993108633, "grad_norm": 0.07415900379419327, "learning_rate": 9.628353300567836e-06, "loss": 0.4641, "num_input_tokens_seen": 40986912, "step": 33705 }, { "epoch": 4.223781481017417, "grad_norm": 0.08715503662824631, "learning_rate": 9.628146434481032e-06, "loss": 0.4646, "num_input_tokens_seen": 40992960, "step": 33710 }, { "epoch": 4.2244079689262, "grad_norm": 0.030468512326478958, "learning_rate": 9.627939513060846e-06, "loss": 0.4607, "num_input_tokens_seen": 40999168, "step": 33715 }, { "epoch": 4.225034456834983, "grad_norm": 0.08452760428190231, "learning_rate": 9.627732536309751e-06, "loss": 0.4625, "num_input_tokens_seen": 41005472, "step": 33720 }, { "epoch": 4.225660944743766, "grad_norm": 0.1181560754776001, "learning_rate": 9.627525504230223e-06, "loss": 0.462, "num_input_tokens_seen": 41011488, "step": 33725 }, { "epoch": 4.2262874326525495, "grad_norm": 0.16990239918231964, "learning_rate": 9.62731841682474e-06, "loss": 0.4625, "num_input_tokens_seen": 41017504, "step": 33730 }, { "epoch": 4.2269139205613335, "grad_norm": 0.11705654859542847, "learning_rate": 9.627111274095772e-06, "loss": 0.457, "num_input_tokens_seen": 41023552, "step": 33735 }, { "epoch": 4.227540408470117, "grad_norm": 0.0707092434167862, "learning_rate": 9.626904076045798e-06, "loss": 0.461, "num_input_tokens_seen": 41030112, "step": 33740 }, { "epoch": 4.2281668963789, "grad_norm": 0.08176256716251373, "learning_rate": 9.626696822677297e-06, "loss": 0.4639, "num_input_tokens_seen": 41035904, "step": 33745 }, { "epoch": 4.228793384287683, "grad_norm": 0.10646089911460876, "learning_rate": 9.626489513992747e-06, "loss": 0.4649, "num_input_tokens_seen": 41042112, "step": 33750 }, { "epoch": 4.229419872196466, "grad_norm": 0.03711222484707832, "learning_rate": 9.626282149994623e-06, "loss": 0.4631, "num_input_tokens_seen": 41048032, "step": 33755 }, { "epoch": 4.23004636010525, "grad_norm": 0.07878082990646362, "learning_rate": 9.626074730685406e-06, "loss": 0.4603, "num_input_tokens_seen": 41053664, "step": 33760 }, { "epoch": 4.230672848014033, "grad_norm": 0.12142080813646317, "learning_rate": 9.625867256067577e-06, "loss": 0.4686, "num_input_tokens_seen": 41059712, "step": 33765 }, { "epoch": 4.231299335922817, "grad_norm": 0.11648488789796829, "learning_rate": 9.625659726143617e-06, "loss": 0.4602, "num_input_tokens_seen": 41065728, "step": 33770 }, { "epoch": 4.2319258238316, "grad_norm": 0.1216009333729744, "learning_rate": 9.625452140916004e-06, "loss": 0.4588, "num_input_tokens_seen": 41072288, "step": 33775 }, { "epoch": 4.232552311740384, "grad_norm": 0.09005168080329895, "learning_rate": 9.625244500387223e-06, "loss": 0.463, "num_input_tokens_seen": 41078400, "step": 33780 }, { "epoch": 4.233178799649167, "grad_norm": 0.06989358365535736, "learning_rate": 9.625036804559754e-06, "loss": 0.4583, "num_input_tokens_seen": 41084416, "step": 33785 }, { "epoch": 4.23380528755795, "grad_norm": 0.08320572227239609, "learning_rate": 9.624829053436082e-06, "loss": 0.4578, "num_input_tokens_seen": 41090208, "step": 33790 }, { "epoch": 4.234431775466733, "grad_norm": 0.09507221728563309, "learning_rate": 9.62462124701869e-06, "loss": 0.462, "num_input_tokens_seen": 41096192, "step": 33795 }, { "epoch": 4.2350582633755165, "grad_norm": 0.09294585138559341, "learning_rate": 9.624413385310064e-06, "loss": 0.4628, "num_input_tokens_seen": 41102304, "step": 33800 }, { "epoch": 4.2356847512843006, "grad_norm": 0.12706255912780762, "learning_rate": 9.624205468312689e-06, "loss": 0.4588, "num_input_tokens_seen": 41108640, "step": 33805 }, { "epoch": 4.236311239193084, "grad_norm": 0.06366157531738281, "learning_rate": 9.623997496029046e-06, "loss": 0.4642, "num_input_tokens_seen": 41114144, "step": 33810 }, { "epoch": 4.236937727101867, "grad_norm": 0.06972511112689972, "learning_rate": 9.623789468461628e-06, "loss": 0.46, "num_input_tokens_seen": 41120128, "step": 33815 }, { "epoch": 4.23756421501065, "grad_norm": 0.06816630065441132, "learning_rate": 9.623581385612918e-06, "loss": 0.462, "num_input_tokens_seen": 41125728, "step": 33820 }, { "epoch": 4.238190702919433, "grad_norm": 0.07580669224262238, "learning_rate": 9.623373247485408e-06, "loss": 0.4635, "num_input_tokens_seen": 41131648, "step": 33825 }, { "epoch": 4.238817190828217, "grad_norm": 0.08013936877250671, "learning_rate": 9.623165054081582e-06, "loss": 0.4676, "num_input_tokens_seen": 41137664, "step": 33830 }, { "epoch": 4.2394436787370005, "grad_norm": 0.06913500279188156, "learning_rate": 9.62295680540393e-06, "loss": 0.464, "num_input_tokens_seen": 41144128, "step": 33835 }, { "epoch": 4.240070166645784, "grad_norm": 0.0690913125872612, "learning_rate": 9.622748501454943e-06, "loss": 0.4607, "num_input_tokens_seen": 41150368, "step": 33840 }, { "epoch": 4.240696654554567, "grad_norm": 0.07605140656232834, "learning_rate": 9.622540142237112e-06, "loss": 0.4638, "num_input_tokens_seen": 41156576, "step": 33845 }, { "epoch": 4.241323142463351, "grad_norm": 0.1315145641565323, "learning_rate": 9.622331727752926e-06, "loss": 0.4648, "num_input_tokens_seen": 41162912, "step": 33850 }, { "epoch": 4.241949630372134, "grad_norm": 0.09443879127502441, "learning_rate": 9.622123258004879e-06, "loss": 0.463, "num_input_tokens_seen": 41169088, "step": 33855 }, { "epoch": 4.242576118280917, "grad_norm": 0.15056492388248444, "learning_rate": 9.621914732995462e-06, "loss": 0.4639, "num_input_tokens_seen": 41175168, "step": 33860 }, { "epoch": 4.2432026061897, "grad_norm": 0.11472547799348831, "learning_rate": 9.621706152727168e-06, "loss": 0.4676, "num_input_tokens_seen": 41181440, "step": 33865 }, { "epoch": 4.2438290940984835, "grad_norm": 0.03989164158701897, "learning_rate": 9.621497517202493e-06, "loss": 0.4622, "num_input_tokens_seen": 41187648, "step": 33870 }, { "epoch": 4.244455582007268, "grad_norm": 0.11961526423692703, "learning_rate": 9.621288826423929e-06, "loss": 0.4575, "num_input_tokens_seen": 41194048, "step": 33875 }, { "epoch": 4.245082069916051, "grad_norm": 0.08102034777402878, "learning_rate": 9.62108008039397e-06, "loss": 0.4624, "num_input_tokens_seen": 41200448, "step": 33880 }, { "epoch": 4.245708557824834, "grad_norm": 0.10760746151208878, "learning_rate": 9.620871279115114e-06, "loss": 0.4572, "num_input_tokens_seen": 41206368, "step": 33885 }, { "epoch": 4.246335045733617, "grad_norm": 0.0766424834728241, "learning_rate": 9.620662422589857e-06, "loss": 0.4677, "num_input_tokens_seen": 41212736, "step": 33890 }, { "epoch": 4.2469615336424, "grad_norm": 0.08370348066091537, "learning_rate": 9.620453510820697e-06, "loss": 0.4608, "num_input_tokens_seen": 41218240, "step": 33895 }, { "epoch": 4.247588021551184, "grad_norm": 0.13507382571697235, "learning_rate": 9.62024454381013e-06, "loss": 0.4664, "num_input_tokens_seen": 41224032, "step": 33900 }, { "epoch": 4.2482145094599675, "grad_norm": 0.13549435138702393, "learning_rate": 9.620035521560654e-06, "loss": 0.4638, "num_input_tokens_seen": 41230304, "step": 33905 }, { "epoch": 4.248840997368751, "grad_norm": 0.0744364857673645, "learning_rate": 9.61982644407477e-06, "loss": 0.466, "num_input_tokens_seen": 41236544, "step": 33910 }, { "epoch": 4.249467485277534, "grad_norm": 0.0962323546409607, "learning_rate": 9.619617311354976e-06, "loss": 0.4637, "num_input_tokens_seen": 41242848, "step": 33915 }, { "epoch": 4.250093973186318, "grad_norm": 0.07238292694091797, "learning_rate": 9.619408123403774e-06, "loss": 0.4647, "num_input_tokens_seen": 41248960, "step": 33920 }, { "epoch": 4.250720461095101, "grad_norm": 0.09025755524635315, "learning_rate": 9.619198880223663e-06, "loss": 0.4635, "num_input_tokens_seen": 41254912, "step": 33925 }, { "epoch": 4.251346949003884, "grad_norm": 0.08661298453807831, "learning_rate": 9.618989581817147e-06, "loss": 0.4571, "num_input_tokens_seen": 41260448, "step": 33930 }, { "epoch": 4.251973436912667, "grad_norm": 0.07762547582387924, "learning_rate": 9.618780228186728e-06, "loss": 0.4645, "num_input_tokens_seen": 41266592, "step": 33935 }, { "epoch": 4.2525999248214505, "grad_norm": 0.08601053059101105, "learning_rate": 9.618570819334906e-06, "loss": 0.4645, "num_input_tokens_seen": 41272800, "step": 33940 }, { "epoch": 4.253226412730235, "grad_norm": 0.0841594859957695, "learning_rate": 9.618361355264188e-06, "loss": 0.4587, "num_input_tokens_seen": 41279136, "step": 33945 }, { "epoch": 4.253852900639018, "grad_norm": 0.09521325677633286, "learning_rate": 9.618151835977077e-06, "loss": 0.4563, "num_input_tokens_seen": 41285408, "step": 33950 }, { "epoch": 4.254479388547801, "grad_norm": 0.10000577569007874, "learning_rate": 9.617942261476077e-06, "loss": 0.4634, "num_input_tokens_seen": 41291584, "step": 33955 }, { "epoch": 4.255105876456584, "grad_norm": 0.12718579173088074, "learning_rate": 9.617732631763695e-06, "loss": 0.4687, "num_input_tokens_seen": 41297696, "step": 33960 }, { "epoch": 4.255732364365368, "grad_norm": 0.07013077288866043, "learning_rate": 9.617522946842437e-06, "loss": 0.4651, "num_input_tokens_seen": 41303488, "step": 33965 }, { "epoch": 4.256358852274151, "grad_norm": 0.09060361236333847, "learning_rate": 9.61731320671481e-06, "loss": 0.4671, "num_input_tokens_seen": 41309536, "step": 33970 }, { "epoch": 4.2569853401829345, "grad_norm": 0.09990566968917847, "learning_rate": 9.617103411383323e-06, "loss": 0.4698, "num_input_tokens_seen": 41315392, "step": 33975 }, { "epoch": 4.257611828091718, "grad_norm": 0.09062959998846054, "learning_rate": 9.616893560850483e-06, "loss": 0.4656, "num_input_tokens_seen": 41321504, "step": 33980 }, { "epoch": 4.258238316000501, "grad_norm": 0.07415014505386353, "learning_rate": 9.616683655118798e-06, "loss": 0.4642, "num_input_tokens_seen": 41327456, "step": 33985 }, { "epoch": 4.258864803909285, "grad_norm": 0.03708857670426369, "learning_rate": 9.616473694190776e-06, "loss": 0.4653, "num_input_tokens_seen": 41333664, "step": 33990 }, { "epoch": 4.259491291818068, "grad_norm": 0.08905769884586334, "learning_rate": 9.616263678068933e-06, "loss": 0.4632, "num_input_tokens_seen": 41339488, "step": 33995 }, { "epoch": 4.260117779726851, "grad_norm": 0.08878464251756668, "learning_rate": 9.616053606755776e-06, "loss": 0.4649, "num_input_tokens_seen": 41345440, "step": 34000 }, { "epoch": 4.260744267635634, "grad_norm": 0.07235082238912582, "learning_rate": 9.615843480253815e-06, "loss": 0.4622, "num_input_tokens_seen": 41351328, "step": 34005 }, { "epoch": 4.261370755544418, "grad_norm": 0.07123343646526337, "learning_rate": 9.615633298565565e-06, "loss": 0.4626, "num_input_tokens_seen": 41357504, "step": 34010 }, { "epoch": 4.261997243453202, "grad_norm": 0.10868746042251587, "learning_rate": 9.61542306169354e-06, "loss": 0.466, "num_input_tokens_seen": 41363552, "step": 34015 }, { "epoch": 4.262623731361985, "grad_norm": 0.11154161393642426, "learning_rate": 9.615212769640252e-06, "loss": 0.4575, "num_input_tokens_seen": 41369376, "step": 34020 }, { "epoch": 4.263250219270768, "grad_norm": 0.08945374935865402, "learning_rate": 9.615002422408213e-06, "loss": 0.4659, "num_input_tokens_seen": 41375456, "step": 34025 }, { "epoch": 4.263876707179551, "grad_norm": 0.03411213681101799, "learning_rate": 9.614792019999942e-06, "loss": 0.4674, "num_input_tokens_seen": 41381792, "step": 34030 }, { "epoch": 4.264503195088335, "grad_norm": 0.06801018863916397, "learning_rate": 9.61458156241795e-06, "loss": 0.4579, "num_input_tokens_seen": 41386816, "step": 34035 }, { "epoch": 4.265129682997118, "grad_norm": 0.0638682097196579, "learning_rate": 9.614371049664758e-06, "loss": 0.4628, "num_input_tokens_seen": 41392704, "step": 34040 }, { "epoch": 4.2657561709059015, "grad_norm": 0.06438654661178589, "learning_rate": 9.61416048174288e-06, "loss": 0.4619, "num_input_tokens_seen": 41399296, "step": 34045 }, { "epoch": 4.266382658814685, "grad_norm": 0.06787704676389694, "learning_rate": 9.613949858654834e-06, "loss": 0.468, "num_input_tokens_seen": 41405504, "step": 34050 }, { "epoch": 4.267009146723468, "grad_norm": 0.07434902340173721, "learning_rate": 9.61373918040314e-06, "loss": 0.4692, "num_input_tokens_seen": 41411712, "step": 34055 }, { "epoch": 4.267635634632252, "grad_norm": 0.1095598042011261, "learning_rate": 9.613528446990312e-06, "loss": 0.4661, "num_input_tokens_seen": 41417664, "step": 34060 }, { "epoch": 4.268262122541035, "grad_norm": 0.08906147629022598, "learning_rate": 9.613317658418874e-06, "loss": 0.4617, "num_input_tokens_seen": 41423744, "step": 34065 }, { "epoch": 4.268888610449818, "grad_norm": 0.12488716840744019, "learning_rate": 9.613106814691345e-06, "loss": 0.4628, "num_input_tokens_seen": 41429760, "step": 34070 }, { "epoch": 4.269515098358601, "grad_norm": 0.09020771831274033, "learning_rate": 9.612895915810244e-06, "loss": 0.4652, "num_input_tokens_seen": 41435840, "step": 34075 }, { "epoch": 4.2701415862673855, "grad_norm": 0.11305335909128189, "learning_rate": 9.612684961778095e-06, "loss": 0.4561, "num_input_tokens_seen": 41441792, "step": 34080 }, { "epoch": 4.270768074176169, "grad_norm": 0.028257407248020172, "learning_rate": 9.61247395259742e-06, "loss": 0.4638, "num_input_tokens_seen": 41447968, "step": 34085 }, { "epoch": 4.271394562084952, "grad_norm": 0.0721578598022461, "learning_rate": 9.61226288827074e-06, "loss": 0.4596, "num_input_tokens_seen": 41454176, "step": 34090 }, { "epoch": 4.272021049993735, "grad_norm": 0.06614948064088821, "learning_rate": 9.61205176880058e-06, "loss": 0.4638, "num_input_tokens_seen": 41460192, "step": 34095 }, { "epoch": 4.272647537902518, "grad_norm": 0.06866832822561264, "learning_rate": 9.611840594189461e-06, "loss": 0.4641, "num_input_tokens_seen": 41466304, "step": 34100 }, { "epoch": 4.273274025811302, "grad_norm": 0.03162631392478943, "learning_rate": 9.611629364439914e-06, "loss": 0.4631, "num_input_tokens_seen": 41472576, "step": 34105 }, { "epoch": 4.273900513720085, "grad_norm": 0.07561745494604111, "learning_rate": 9.61141807955446e-06, "loss": 0.4697, "num_input_tokens_seen": 41478880, "step": 34110 }, { "epoch": 4.274527001628869, "grad_norm": 0.12297341972589493, "learning_rate": 9.611206739535623e-06, "loss": 0.459, "num_input_tokens_seen": 41484992, "step": 34115 }, { "epoch": 4.275153489537652, "grad_norm": 0.10712514817714691, "learning_rate": 9.610995344385935e-06, "loss": 0.4637, "num_input_tokens_seen": 41490720, "step": 34120 }, { "epoch": 4.275779977446435, "grad_norm": 0.06427393853664398, "learning_rate": 9.61078389410792e-06, "loss": 0.463, "num_input_tokens_seen": 41496960, "step": 34125 }, { "epoch": 4.276406465355219, "grad_norm": 0.0657765194773674, "learning_rate": 9.610572388704108e-06, "loss": 0.4598, "num_input_tokens_seen": 41503104, "step": 34130 }, { "epoch": 4.277032953264002, "grad_norm": 0.034453049302101135, "learning_rate": 9.610360828177026e-06, "loss": 0.4653, "num_input_tokens_seen": 41508992, "step": 34135 }, { "epoch": 4.277659441172785, "grad_norm": 0.06500030308961868, "learning_rate": 9.610149212529203e-06, "loss": 0.4664, "num_input_tokens_seen": 41515136, "step": 34140 }, { "epoch": 4.2782859290815685, "grad_norm": 0.08032473921775818, "learning_rate": 9.609937541763173e-06, "loss": 0.4679, "num_input_tokens_seen": 41521408, "step": 34145 }, { "epoch": 4.278912416990352, "grad_norm": 0.06846361607313156, "learning_rate": 9.609725815881461e-06, "loss": 0.4619, "num_input_tokens_seen": 41527520, "step": 34150 }, { "epoch": 4.279538904899136, "grad_norm": 0.06378382444381714, "learning_rate": 9.609514034886604e-06, "loss": 0.46, "num_input_tokens_seen": 41533696, "step": 34155 }, { "epoch": 4.280165392807919, "grad_norm": 0.06322525441646576, "learning_rate": 9.60930219878113e-06, "loss": 0.4599, "num_input_tokens_seen": 41539104, "step": 34160 }, { "epoch": 4.280791880716702, "grad_norm": 0.034087248146533966, "learning_rate": 9.609090307567572e-06, "loss": 0.4607, "num_input_tokens_seen": 41545152, "step": 34165 }, { "epoch": 4.281418368625485, "grad_norm": 0.10745936632156372, "learning_rate": 9.608878361248463e-06, "loss": 0.4625, "num_input_tokens_seen": 41550976, "step": 34170 }, { "epoch": 4.282044856534269, "grad_norm": 0.07842402160167694, "learning_rate": 9.608666359826342e-06, "loss": 0.4648, "num_input_tokens_seen": 41557024, "step": 34175 }, { "epoch": 4.282671344443052, "grad_norm": 0.06633727252483368, "learning_rate": 9.608454303303739e-06, "loss": 0.4635, "num_input_tokens_seen": 41562944, "step": 34180 }, { "epoch": 4.283297832351836, "grad_norm": 0.11987153440713882, "learning_rate": 9.608242191683189e-06, "loss": 0.4629, "num_input_tokens_seen": 41569248, "step": 34185 }, { "epoch": 4.283924320260619, "grad_norm": 0.09682420641183853, "learning_rate": 9.608030024967228e-06, "loss": 0.4686, "num_input_tokens_seen": 41574848, "step": 34190 }, { "epoch": 4.284550808169402, "grad_norm": 0.11606699228286743, "learning_rate": 9.607817803158395e-06, "loss": 0.4623, "num_input_tokens_seen": 41580320, "step": 34195 }, { "epoch": 4.285177296078186, "grad_norm": 0.1108008548617363, "learning_rate": 9.607605526259227e-06, "loss": 0.4674, "num_input_tokens_seen": 41586688, "step": 34200 }, { "epoch": 4.285803783986969, "grad_norm": 0.07606088370084763, "learning_rate": 9.607393194272261e-06, "loss": 0.4622, "num_input_tokens_seen": 41592832, "step": 34205 }, { "epoch": 4.286430271895752, "grad_norm": 0.067206010222435, "learning_rate": 9.607180807200036e-06, "loss": 0.4636, "num_input_tokens_seen": 41599136, "step": 34210 }, { "epoch": 4.2870567598045355, "grad_norm": 0.06295209378004074, "learning_rate": 9.60696836504509e-06, "loss": 0.4636, "num_input_tokens_seen": 41605536, "step": 34215 }, { "epoch": 4.2876832477133195, "grad_norm": 0.1010679081082344, "learning_rate": 9.606755867809962e-06, "loss": 0.4638, "num_input_tokens_seen": 41611616, "step": 34220 }, { "epoch": 4.288309735622103, "grad_norm": 0.06726241111755371, "learning_rate": 9.606543315497197e-06, "loss": 0.4623, "num_input_tokens_seen": 41617664, "step": 34225 }, { "epoch": 4.288936223530886, "grad_norm": 0.07453780621290207, "learning_rate": 9.606330708109334e-06, "loss": 0.46, "num_input_tokens_seen": 41623072, "step": 34230 }, { "epoch": 4.289562711439669, "grad_norm": 0.11942366510629654, "learning_rate": 9.606118045648913e-06, "loss": 0.4603, "num_input_tokens_seen": 41628992, "step": 34235 }, { "epoch": 4.290189199348452, "grad_norm": 0.08229999244213104, "learning_rate": 9.60590532811848e-06, "loss": 0.4666, "num_input_tokens_seen": 41635168, "step": 34240 }, { "epoch": 4.290815687257236, "grad_norm": 0.06991101056337357, "learning_rate": 9.605692555520577e-06, "loss": 0.4592, "num_input_tokens_seen": 41641248, "step": 34245 }, { "epoch": 4.2914421751660194, "grad_norm": 0.07031529396772385, "learning_rate": 9.605479727857743e-06, "loss": 0.4627, "num_input_tokens_seen": 41646976, "step": 34250 }, { "epoch": 4.292068663074803, "grad_norm": 0.07564234733581543, "learning_rate": 9.60526684513253e-06, "loss": 0.4569, "num_input_tokens_seen": 41653312, "step": 34255 }, { "epoch": 4.292695150983586, "grad_norm": 0.07542785257101059, "learning_rate": 9.605053907347479e-06, "loss": 0.4582, "num_input_tokens_seen": 41659616, "step": 34260 }, { "epoch": 4.293321638892369, "grad_norm": 0.09089240431785583, "learning_rate": 9.604840914505138e-06, "loss": 0.4625, "num_input_tokens_seen": 41665824, "step": 34265 }, { "epoch": 4.293948126801153, "grad_norm": 0.12961800396442413, "learning_rate": 9.604627866608051e-06, "loss": 0.4689, "num_input_tokens_seen": 41672128, "step": 34270 }, { "epoch": 4.294574614709936, "grad_norm": 0.07508489489555359, "learning_rate": 9.604414763658767e-06, "loss": 0.4552, "num_input_tokens_seen": 41677984, "step": 34275 }, { "epoch": 4.295201102618719, "grad_norm": 0.12842896580696106, "learning_rate": 9.604201605659833e-06, "loss": 0.4689, "num_input_tokens_seen": 41684576, "step": 34280 }, { "epoch": 4.2958275905275025, "grad_norm": 0.09339843690395355, "learning_rate": 9.603988392613798e-06, "loss": 0.4583, "num_input_tokens_seen": 41690784, "step": 34285 }, { "epoch": 4.296454078436287, "grad_norm": 0.08373651653528214, "learning_rate": 9.60377512452321e-06, "loss": 0.4644, "num_input_tokens_seen": 41697152, "step": 34290 }, { "epoch": 4.29708056634507, "grad_norm": 0.08520927280187607, "learning_rate": 9.603561801390622e-06, "loss": 0.4635, "num_input_tokens_seen": 41703328, "step": 34295 }, { "epoch": 4.297707054253853, "grad_norm": 0.09940695017576218, "learning_rate": 9.603348423218582e-06, "loss": 0.4689, "num_input_tokens_seen": 41709376, "step": 34300 }, { "epoch": 4.298333542162636, "grad_norm": 0.07964877784252167, "learning_rate": 9.60313499000964e-06, "loss": 0.4554, "num_input_tokens_seen": 41715328, "step": 34305 }, { "epoch": 4.298960030071419, "grad_norm": 0.07288576662540436, "learning_rate": 9.60292150176635e-06, "loss": 0.4585, "num_input_tokens_seen": 41721344, "step": 34310 }, { "epoch": 4.299586517980203, "grad_norm": 0.1349761188030243, "learning_rate": 9.602707958491261e-06, "loss": 0.4683, "num_input_tokens_seen": 41727424, "step": 34315 }, { "epoch": 4.3002130058889865, "grad_norm": 0.10571645945310593, "learning_rate": 9.60249436018693e-06, "loss": 0.4608, "num_input_tokens_seen": 41733376, "step": 34320 }, { "epoch": 4.30083949379777, "grad_norm": 0.10071656852960587, "learning_rate": 9.602280706855911e-06, "loss": 0.4636, "num_input_tokens_seen": 41739744, "step": 34325 }, { "epoch": 4.301465981706553, "grad_norm": 0.1280045062303543, "learning_rate": 9.602066998500756e-06, "loss": 0.4531, "num_input_tokens_seen": 41745568, "step": 34330 }, { "epoch": 4.302092469615337, "grad_norm": 0.04258681833744049, "learning_rate": 9.60185323512402e-06, "loss": 0.4636, "num_input_tokens_seen": 41751648, "step": 34335 }, { "epoch": 4.30271895752412, "grad_norm": 0.12248322367668152, "learning_rate": 9.60163941672826e-06, "loss": 0.4705, "num_input_tokens_seen": 41757920, "step": 34340 }, { "epoch": 4.303345445432903, "grad_norm": 0.06999281793832779, "learning_rate": 9.601425543316032e-06, "loss": 0.4629, "num_input_tokens_seen": 41763424, "step": 34345 }, { "epoch": 4.303971933341686, "grad_norm": 0.0389522984623909, "learning_rate": 9.601211614889892e-06, "loss": 0.4668, "num_input_tokens_seen": 41769472, "step": 34350 }, { "epoch": 4.3045984212504695, "grad_norm": 0.15426801145076752, "learning_rate": 9.6009976314524e-06, "loss": 0.4679, "num_input_tokens_seen": 41775584, "step": 34355 }, { "epoch": 4.305224909159254, "grad_norm": 0.07538174092769623, "learning_rate": 9.600783593006113e-06, "loss": 0.4611, "num_input_tokens_seen": 41781984, "step": 34360 }, { "epoch": 4.305851397068037, "grad_norm": 0.07880088686943054, "learning_rate": 9.600569499553589e-06, "loss": 0.4565, "num_input_tokens_seen": 41788480, "step": 34365 }, { "epoch": 4.30647788497682, "grad_norm": 0.10441283136606216, "learning_rate": 9.60035535109739e-06, "loss": 0.4675, "num_input_tokens_seen": 41794240, "step": 34370 }, { "epoch": 4.307104372885603, "grad_norm": 0.09394795447587967, "learning_rate": 9.600141147640074e-06, "loss": 0.4643, "num_input_tokens_seen": 41800672, "step": 34375 }, { "epoch": 4.307730860794386, "grad_norm": 0.09697243571281433, "learning_rate": 9.599926889184204e-06, "loss": 0.4612, "num_input_tokens_seen": 41806592, "step": 34380 }, { "epoch": 4.30835734870317, "grad_norm": 0.18355675041675568, "learning_rate": 9.59971257573234e-06, "loss": 0.4632, "num_input_tokens_seen": 41812640, "step": 34385 }, { "epoch": 4.3089838366119535, "grad_norm": 0.07458792626857758, "learning_rate": 9.599498207287044e-06, "loss": 0.4608, "num_input_tokens_seen": 41818752, "step": 34390 }, { "epoch": 4.309610324520737, "grad_norm": 0.0354808047413826, "learning_rate": 9.599283783850881e-06, "loss": 0.4639, "num_input_tokens_seen": 41825024, "step": 34395 }, { "epoch": 4.31023681242952, "grad_norm": 0.10330523550510406, "learning_rate": 9.599069305426414e-06, "loss": 0.4656, "num_input_tokens_seen": 41831136, "step": 34400 }, { "epoch": 4.310863300338303, "grad_norm": 0.10351289808750153, "learning_rate": 9.598854772016206e-06, "loss": 0.4566, "num_input_tokens_seen": 41837376, "step": 34405 }, { "epoch": 4.311489788247087, "grad_norm": 0.07486461848020554, "learning_rate": 9.598640183622822e-06, "loss": 0.4627, "num_input_tokens_seen": 41843328, "step": 34410 }, { "epoch": 4.31211627615587, "grad_norm": 0.07040085643529892, "learning_rate": 9.59842554024883e-06, "loss": 0.4722, "num_input_tokens_seen": 41849280, "step": 34415 }, { "epoch": 4.312742764064653, "grad_norm": 0.06401165574789047, "learning_rate": 9.598210841896792e-06, "loss": 0.4653, "num_input_tokens_seen": 41855392, "step": 34420 }, { "epoch": 4.313369251973437, "grad_norm": 0.03531232848763466, "learning_rate": 9.59799608856928e-06, "loss": 0.4653, "num_input_tokens_seen": 41861504, "step": 34425 }, { "epoch": 4.313995739882221, "grad_norm": 0.08556094765663147, "learning_rate": 9.597781280268858e-06, "loss": 0.459, "num_input_tokens_seen": 41867744, "step": 34430 }, { "epoch": 4.314622227791004, "grad_norm": 0.1152380034327507, "learning_rate": 9.597566416998094e-06, "loss": 0.4612, "num_input_tokens_seen": 41874016, "step": 34435 }, { "epoch": 4.315248715699787, "grad_norm": 0.12439798563718796, "learning_rate": 9.597351498759559e-06, "loss": 0.4652, "num_input_tokens_seen": 41879840, "step": 34440 }, { "epoch": 4.31587520360857, "grad_norm": 0.1042412742972374, "learning_rate": 9.597136525555822e-06, "loss": 0.4654, "num_input_tokens_seen": 41885696, "step": 34445 }, { "epoch": 4.316501691517353, "grad_norm": 0.09842506796121597, "learning_rate": 9.596921497389453e-06, "loss": 0.4617, "num_input_tokens_seen": 41891968, "step": 34450 }, { "epoch": 4.317128179426137, "grad_norm": 0.0682404488325119, "learning_rate": 9.596706414263022e-06, "loss": 0.4665, "num_input_tokens_seen": 41897856, "step": 34455 }, { "epoch": 4.3177546673349205, "grad_norm": 0.0969056785106659, "learning_rate": 9.596491276179101e-06, "loss": 0.4683, "num_input_tokens_seen": 41903968, "step": 34460 }, { "epoch": 4.318381155243704, "grad_norm": 0.0664348155260086, "learning_rate": 9.596276083140264e-06, "loss": 0.4566, "num_input_tokens_seen": 41910208, "step": 34465 }, { "epoch": 4.319007643152487, "grad_norm": 0.03196105360984802, "learning_rate": 9.596060835149078e-06, "loss": 0.4645, "num_input_tokens_seen": 41916256, "step": 34470 }, { "epoch": 4.319634131061271, "grad_norm": 0.07807888090610504, "learning_rate": 9.595845532208125e-06, "loss": 0.4604, "num_input_tokens_seen": 41922272, "step": 34475 }, { "epoch": 4.320260618970054, "grad_norm": 0.06631731986999512, "learning_rate": 9.595630174319972e-06, "loss": 0.4609, "num_input_tokens_seen": 41928352, "step": 34480 }, { "epoch": 4.320887106878837, "grad_norm": 0.06608816981315613, "learning_rate": 9.595414761487198e-06, "loss": 0.4635, "num_input_tokens_seen": 41934816, "step": 34485 }, { "epoch": 4.32151359478762, "grad_norm": 0.0656154528260231, "learning_rate": 9.595199293712376e-06, "loss": 0.4633, "num_input_tokens_seen": 41940928, "step": 34490 }, { "epoch": 4.322140082696404, "grad_norm": 0.08068562299013138, "learning_rate": 9.594983770998083e-06, "loss": 0.4623, "num_input_tokens_seen": 41947392, "step": 34495 }, { "epoch": 4.322766570605188, "grad_norm": 0.03423527628183365, "learning_rate": 9.594768193346897e-06, "loss": 0.4631, "num_input_tokens_seen": 41953632, "step": 34500 }, { "epoch": 4.323393058513971, "grad_norm": 0.13480830192565918, "learning_rate": 9.594552560761391e-06, "loss": 0.4654, "num_input_tokens_seen": 41958912, "step": 34505 }, { "epoch": 4.324019546422754, "grad_norm": 0.08193333446979523, "learning_rate": 9.59433687324415e-06, "loss": 0.4613, "num_input_tokens_seen": 41964960, "step": 34510 }, { "epoch": 4.324646034331537, "grad_norm": 0.06944973766803741, "learning_rate": 9.594121130797748e-06, "loss": 0.4582, "num_input_tokens_seen": 41971008, "step": 34515 }, { "epoch": 4.32527252224032, "grad_norm": 0.0880090743303299, "learning_rate": 9.593905333424764e-06, "loss": 0.4641, "num_input_tokens_seen": 41977216, "step": 34520 }, { "epoch": 4.325899010149104, "grad_norm": 0.06764505058526993, "learning_rate": 9.593689481127781e-06, "loss": 0.4669, "num_input_tokens_seen": 41983648, "step": 34525 }, { "epoch": 4.3265254980578876, "grad_norm": 0.03674843907356262, "learning_rate": 9.593473573909377e-06, "loss": 0.4653, "num_input_tokens_seen": 41989504, "step": 34530 }, { "epoch": 4.327151985966671, "grad_norm": 0.05951736494898796, "learning_rate": 9.593257611772136e-06, "loss": 0.4652, "num_input_tokens_seen": 41995616, "step": 34535 }, { "epoch": 4.327778473875454, "grad_norm": 0.1745349019765854, "learning_rate": 9.593041594718636e-06, "loss": 0.4615, "num_input_tokens_seen": 42001952, "step": 34540 }, { "epoch": 4.328404961784238, "grad_norm": 0.0887228325009346, "learning_rate": 9.592825522751463e-06, "loss": 0.4617, "num_input_tokens_seen": 42008256, "step": 34545 }, { "epoch": 4.329031449693021, "grad_norm": 0.07897506654262543, "learning_rate": 9.592609395873201e-06, "loss": 0.4653, "num_input_tokens_seen": 42014240, "step": 34550 }, { "epoch": 4.329657937601804, "grad_norm": 0.07083305716514587, "learning_rate": 9.592393214086431e-06, "loss": 0.4623, "num_input_tokens_seen": 42020224, "step": 34555 }, { "epoch": 4.3302844255105875, "grad_norm": 0.07772593945264816, "learning_rate": 9.592176977393739e-06, "loss": 0.4615, "num_input_tokens_seen": 42026176, "step": 34560 }, { "epoch": 4.330910913419371, "grad_norm": 0.031399793922901154, "learning_rate": 9.59196068579771e-06, "loss": 0.4661, "num_input_tokens_seen": 42032384, "step": 34565 }, { "epoch": 4.331537401328155, "grad_norm": 0.09277622401714325, "learning_rate": 9.591744339300931e-06, "loss": 0.4657, "num_input_tokens_seen": 42038752, "step": 34570 }, { "epoch": 4.332163889236938, "grad_norm": 0.07204662263393402, "learning_rate": 9.591527937905988e-06, "loss": 0.4638, "num_input_tokens_seen": 42044576, "step": 34575 }, { "epoch": 4.332790377145721, "grad_norm": 0.06625396013259888, "learning_rate": 9.591311481615469e-06, "loss": 0.4615, "num_input_tokens_seen": 42050880, "step": 34580 }, { "epoch": 4.333416865054504, "grad_norm": 0.0753830298781395, "learning_rate": 9.591094970431959e-06, "loss": 0.466, "num_input_tokens_seen": 42056992, "step": 34585 }, { "epoch": 4.334043352963288, "grad_norm": 0.0720059722661972, "learning_rate": 9.59087840435805e-06, "loss": 0.4631, "num_input_tokens_seen": 42063680, "step": 34590 }, { "epoch": 4.334669840872071, "grad_norm": 0.06837198138237, "learning_rate": 9.590661783396329e-06, "loss": 0.4656, "num_input_tokens_seen": 42069824, "step": 34595 }, { "epoch": 4.335296328780855, "grad_norm": 0.06327444314956665, "learning_rate": 9.590445107549386e-06, "loss": 0.4594, "num_input_tokens_seen": 42075680, "step": 34600 }, { "epoch": 4.335922816689638, "grad_norm": 0.031240956857800484, "learning_rate": 9.590228376819815e-06, "loss": 0.4633, "num_input_tokens_seen": 42082144, "step": 34605 }, { "epoch": 4.336549304598421, "grad_norm": 0.09929721802473068, "learning_rate": 9.590011591210203e-06, "loss": 0.4646, "num_input_tokens_seen": 42088224, "step": 34610 }, { "epoch": 4.337175792507205, "grad_norm": 0.06972332298755646, "learning_rate": 9.589794750723144e-06, "loss": 0.4593, "num_input_tokens_seen": 42094272, "step": 34615 }, { "epoch": 4.337802280415988, "grad_norm": 0.10540642589330673, "learning_rate": 9.58957785536123e-06, "loss": 0.4597, "num_input_tokens_seen": 42100448, "step": 34620 }, { "epoch": 4.338428768324771, "grad_norm": 0.06779135763645172, "learning_rate": 9.589360905127052e-06, "loss": 0.4603, "num_input_tokens_seen": 42106880, "step": 34625 }, { "epoch": 4.3390552562335545, "grad_norm": 0.07136521488428116, "learning_rate": 9.589143900023209e-06, "loss": 0.4698, "num_input_tokens_seen": 42112928, "step": 34630 }, { "epoch": 4.339681744142338, "grad_norm": 0.035964686423540115, "learning_rate": 9.588926840052288e-06, "loss": 0.4622, "num_input_tokens_seen": 42118976, "step": 34635 }, { "epoch": 4.340308232051122, "grad_norm": 0.06129498779773712, "learning_rate": 9.588709725216892e-06, "loss": 0.4646, "num_input_tokens_seen": 42125152, "step": 34640 }, { "epoch": 4.340934719959905, "grad_norm": 0.0842256024479866, "learning_rate": 9.588492555519612e-06, "loss": 0.4637, "num_input_tokens_seen": 42131104, "step": 34645 }, { "epoch": 4.341561207868688, "grad_norm": 0.09883466362953186, "learning_rate": 9.588275330963046e-06, "loss": 0.4595, "num_input_tokens_seen": 42137536, "step": 34650 }, { "epoch": 4.342187695777471, "grad_norm": 0.1305074244737625, "learning_rate": 9.58805805154979e-06, "loss": 0.4627, "num_input_tokens_seen": 42143552, "step": 34655 }, { "epoch": 4.342814183686255, "grad_norm": 0.0861089825630188, "learning_rate": 9.587840717282444e-06, "loss": 0.4656, "num_input_tokens_seen": 42149728, "step": 34660 }, { "epoch": 4.343440671595038, "grad_norm": 0.09029834717512131, "learning_rate": 9.587623328163604e-06, "loss": 0.4653, "num_input_tokens_seen": 42155808, "step": 34665 }, { "epoch": 4.344067159503822, "grad_norm": 0.07038451731204987, "learning_rate": 9.58740588419587e-06, "loss": 0.464, "num_input_tokens_seen": 42161888, "step": 34670 }, { "epoch": 4.344693647412605, "grad_norm": 0.06270334869623184, "learning_rate": 9.587188385381843e-06, "loss": 0.4629, "num_input_tokens_seen": 42167904, "step": 34675 }, { "epoch": 4.345320135321388, "grad_norm": 0.08643430471420288, "learning_rate": 9.586970831724122e-06, "loss": 0.4626, "num_input_tokens_seen": 42174208, "step": 34680 }, { "epoch": 4.345946623230172, "grad_norm": 0.08849956840276718, "learning_rate": 9.586753223225307e-06, "loss": 0.4618, "num_input_tokens_seen": 42180192, "step": 34685 }, { "epoch": 4.346573111138955, "grad_norm": 0.1038399264216423, "learning_rate": 9.586535559888002e-06, "loss": 0.4666, "num_input_tokens_seen": 42186336, "step": 34690 }, { "epoch": 4.347199599047738, "grad_norm": 0.10151316970586777, "learning_rate": 9.586317841714807e-06, "loss": 0.4595, "num_input_tokens_seen": 42192576, "step": 34695 }, { "epoch": 4.3478260869565215, "grad_norm": 0.07862550765275955, "learning_rate": 9.586100068708327e-06, "loss": 0.4649, "num_input_tokens_seen": 42198560, "step": 34700 }, { "epoch": 4.348452574865306, "grad_norm": 0.07712366431951523, "learning_rate": 9.585882240871166e-06, "loss": 0.4615, "num_input_tokens_seen": 42204608, "step": 34705 }, { "epoch": 4.349079062774089, "grad_norm": 0.07907203584909439, "learning_rate": 9.585664358205926e-06, "loss": 0.4602, "num_input_tokens_seen": 42210784, "step": 34710 }, { "epoch": 4.349705550682872, "grad_norm": 0.03202693164348602, "learning_rate": 9.585446420715213e-06, "loss": 0.462, "num_input_tokens_seen": 42216864, "step": 34715 }, { "epoch": 4.350332038591655, "grad_norm": 0.06852928549051285, "learning_rate": 9.585228428401632e-06, "loss": 0.4571, "num_input_tokens_seen": 42223200, "step": 34720 }, { "epoch": 4.350958526500438, "grad_norm": 0.03202126920223236, "learning_rate": 9.585010381267791e-06, "loss": 0.4596, "num_input_tokens_seen": 42229312, "step": 34725 }, { "epoch": 4.351585014409222, "grad_norm": 0.08227821439504623, "learning_rate": 9.584792279316295e-06, "loss": 0.4649, "num_input_tokens_seen": 42235520, "step": 34730 }, { "epoch": 4.3522115023180055, "grad_norm": 0.10761849582195282, "learning_rate": 9.584574122549755e-06, "loss": 0.4713, "num_input_tokens_seen": 42241280, "step": 34735 }, { "epoch": 4.352837990226789, "grad_norm": 0.06748931854963303, "learning_rate": 9.584355910970775e-06, "loss": 0.4634, "num_input_tokens_seen": 42247328, "step": 34740 }, { "epoch": 4.353464478135572, "grad_norm": 0.07384998351335526, "learning_rate": 9.584137644581968e-06, "loss": 0.4629, "num_input_tokens_seen": 42253312, "step": 34745 }, { "epoch": 4.354090966044355, "grad_norm": 0.0659211277961731, "learning_rate": 9.58391932338594e-06, "loss": 0.4613, "num_input_tokens_seen": 42259296, "step": 34750 }, { "epoch": 4.354717453953139, "grad_norm": 0.06723391264677048, "learning_rate": 9.583700947385302e-06, "loss": 0.4636, "num_input_tokens_seen": 42265408, "step": 34755 }, { "epoch": 4.355343941861922, "grad_norm": 0.023736216127872467, "learning_rate": 9.583482516582665e-06, "loss": 0.4597, "num_input_tokens_seen": 42271360, "step": 34760 }, { "epoch": 4.355970429770705, "grad_norm": 0.06338954716920853, "learning_rate": 9.583264030980642e-06, "loss": 0.464, "num_input_tokens_seen": 42277632, "step": 34765 }, { "epoch": 4.3565969176794885, "grad_norm": 0.10670020431280136, "learning_rate": 9.583045490581843e-06, "loss": 0.4641, "num_input_tokens_seen": 42283968, "step": 34770 }, { "epoch": 4.357223405588272, "grad_norm": 0.03036070242524147, "learning_rate": 9.582826895388883e-06, "loss": 0.4667, "num_input_tokens_seen": 42289856, "step": 34775 }, { "epoch": 4.357849893497056, "grad_norm": 0.029645146802067757, "learning_rate": 9.582608245404373e-06, "loss": 0.463, "num_input_tokens_seen": 42296000, "step": 34780 }, { "epoch": 4.358476381405839, "grad_norm": 0.059759341180324554, "learning_rate": 9.58238954063093e-06, "loss": 0.4651, "num_input_tokens_seen": 42302112, "step": 34785 }, { "epoch": 4.359102869314622, "grad_norm": 0.0700811818242073, "learning_rate": 9.582170781071167e-06, "loss": 0.4614, "num_input_tokens_seen": 42308224, "step": 34790 }, { "epoch": 4.359729357223405, "grad_norm": 0.05843394622206688, "learning_rate": 9.5819519667277e-06, "loss": 0.4646, "num_input_tokens_seen": 42314016, "step": 34795 }, { "epoch": 4.360355845132189, "grad_norm": 0.0650511384010315, "learning_rate": 9.581733097603146e-06, "loss": 0.4657, "num_input_tokens_seen": 42320480, "step": 34800 }, { "epoch": 4.3609823330409725, "grad_norm": 0.07782098650932312, "learning_rate": 9.581514173700118e-06, "loss": 0.4686, "num_input_tokens_seen": 42326464, "step": 34805 }, { "epoch": 4.361608820949756, "grad_norm": 0.02862718142569065, "learning_rate": 9.581295195021239e-06, "loss": 0.4624, "num_input_tokens_seen": 42332544, "step": 34810 }, { "epoch": 4.362235308858539, "grad_norm": 0.08460269868373871, "learning_rate": 9.581076161569122e-06, "loss": 0.4622, "num_input_tokens_seen": 42338720, "step": 34815 }, { "epoch": 4.362861796767322, "grad_norm": 0.07005968689918518, "learning_rate": 9.580857073346388e-06, "loss": 0.462, "num_input_tokens_seen": 42344928, "step": 34820 }, { "epoch": 4.363488284676106, "grad_norm": 0.07834424823522568, "learning_rate": 9.580637930355659e-06, "loss": 0.4607, "num_input_tokens_seen": 42351136, "step": 34825 }, { "epoch": 4.364114772584889, "grad_norm": 0.0899554118514061, "learning_rate": 9.58041873259955e-06, "loss": 0.4647, "num_input_tokens_seen": 42357056, "step": 34830 }, { "epoch": 4.364741260493672, "grad_norm": 0.07360029965639114, "learning_rate": 9.580199480080682e-06, "loss": 0.465, "num_input_tokens_seen": 42363264, "step": 34835 }, { "epoch": 4.365367748402456, "grad_norm": 0.07288514077663422, "learning_rate": 9.579980172801682e-06, "loss": 0.4692, "num_input_tokens_seen": 42369504, "step": 34840 }, { "epoch": 4.36599423631124, "grad_norm": 0.09179525077342987, "learning_rate": 9.579760810765168e-06, "loss": 0.4566, "num_input_tokens_seen": 42375808, "step": 34845 }, { "epoch": 4.366620724220023, "grad_norm": 0.06967130303382874, "learning_rate": 9.579541393973763e-06, "loss": 0.4594, "num_input_tokens_seen": 42382304, "step": 34850 }, { "epoch": 4.367247212128806, "grad_norm": 0.07310649007558823, "learning_rate": 9.57932192243009e-06, "loss": 0.4563, "num_input_tokens_seen": 42388288, "step": 34855 }, { "epoch": 4.367873700037589, "grad_norm": 0.06890949606895447, "learning_rate": 9.579102396136772e-06, "loss": 0.4611, "num_input_tokens_seen": 42394688, "step": 34860 }, { "epoch": 4.368500187946372, "grad_norm": 0.07195806503295898, "learning_rate": 9.578882815096437e-06, "loss": 0.4542, "num_input_tokens_seen": 42400992, "step": 34865 }, { "epoch": 4.369126675855156, "grad_norm": 0.0980442613363266, "learning_rate": 9.578663179311707e-06, "loss": 0.4635, "num_input_tokens_seen": 42407104, "step": 34870 }, { "epoch": 4.3697531637639395, "grad_norm": 0.062405336648225784, "learning_rate": 9.578443488785208e-06, "loss": 0.4532, "num_input_tokens_seen": 42412992, "step": 34875 }, { "epoch": 4.370379651672723, "grad_norm": 0.06604988127946854, "learning_rate": 9.57822374351957e-06, "loss": 0.4649, "num_input_tokens_seen": 42419232, "step": 34880 }, { "epoch": 4.371006139581506, "grad_norm": 0.08457434922456741, "learning_rate": 9.578003943517417e-06, "loss": 0.4744, "num_input_tokens_seen": 42425120, "step": 34885 }, { "epoch": 4.371632627490289, "grad_norm": 0.12277436256408691, "learning_rate": 9.577784088781378e-06, "loss": 0.4728, "num_input_tokens_seen": 42431360, "step": 34890 }, { "epoch": 4.372259115399073, "grad_norm": 0.07364922016859055, "learning_rate": 9.577564179314081e-06, "loss": 0.4605, "num_input_tokens_seen": 42437664, "step": 34895 }, { "epoch": 4.372885603307856, "grad_norm": 0.06804119050502777, "learning_rate": 9.577344215118157e-06, "loss": 0.468, "num_input_tokens_seen": 42443808, "step": 34900 }, { "epoch": 4.373512091216639, "grad_norm": 0.07028276473283768, "learning_rate": 9.577124196196233e-06, "loss": 0.4632, "num_input_tokens_seen": 42449632, "step": 34905 }, { "epoch": 4.374138579125423, "grad_norm": 0.0637776106595993, "learning_rate": 9.576904122550942e-06, "loss": 0.4633, "num_input_tokens_seen": 42455488, "step": 34910 }, { "epoch": 4.374765067034207, "grad_norm": 0.07082168757915497, "learning_rate": 9.576683994184914e-06, "loss": 0.4675, "num_input_tokens_seen": 42461600, "step": 34915 }, { "epoch": 4.37539155494299, "grad_norm": 0.1109643429517746, "learning_rate": 9.57646381110078e-06, "loss": 0.4587, "num_input_tokens_seen": 42467776, "step": 34920 }, { "epoch": 4.376018042851773, "grad_norm": 0.0677177757024765, "learning_rate": 9.576243573301173e-06, "loss": 0.4633, "num_input_tokens_seen": 42473952, "step": 34925 }, { "epoch": 4.376644530760556, "grad_norm": 0.09455761313438416, "learning_rate": 9.57602328078873e-06, "loss": 0.4655, "num_input_tokens_seen": 42480000, "step": 34930 }, { "epoch": 4.377271018669339, "grad_norm": 0.11362512409687042, "learning_rate": 9.575802933566078e-06, "loss": 0.4645, "num_input_tokens_seen": 42486560, "step": 34935 }, { "epoch": 4.377897506578123, "grad_norm": 0.07369554042816162, "learning_rate": 9.575582531635856e-06, "loss": 0.4681, "num_input_tokens_seen": 42492896, "step": 34940 }, { "epoch": 4.3785239944869065, "grad_norm": 0.05744427815079689, "learning_rate": 9.575362075000698e-06, "loss": 0.4607, "num_input_tokens_seen": 42499200, "step": 34945 }, { "epoch": 4.37915048239569, "grad_norm": 0.06858856976032257, "learning_rate": 9.57514156366324e-06, "loss": 0.4585, "num_input_tokens_seen": 42504768, "step": 34950 }, { "epoch": 4.379776970304473, "grad_norm": 0.0819917768239975, "learning_rate": 9.574920997626117e-06, "loss": 0.4627, "num_input_tokens_seen": 42510976, "step": 34955 }, { "epoch": 4.380403458213257, "grad_norm": 0.06483523547649384, "learning_rate": 9.574700376891969e-06, "loss": 0.471, "num_input_tokens_seen": 42517056, "step": 34960 }, { "epoch": 4.38102994612204, "grad_norm": 0.06476402282714844, "learning_rate": 9.574479701463429e-06, "loss": 0.4602, "num_input_tokens_seen": 42523520, "step": 34965 }, { "epoch": 4.381656434030823, "grad_norm": 0.06379975378513336, "learning_rate": 9.57425897134314e-06, "loss": 0.4652, "num_input_tokens_seen": 42529632, "step": 34970 }, { "epoch": 4.3822829219396064, "grad_norm": 0.07082607597112656, "learning_rate": 9.574038186533739e-06, "loss": 0.4625, "num_input_tokens_seen": 42535936, "step": 34975 }, { "epoch": 4.38290940984839, "grad_norm": 0.06119411811232567, "learning_rate": 9.573817347037867e-06, "loss": 0.4623, "num_input_tokens_seen": 42541696, "step": 34980 }, { "epoch": 4.383535897757174, "grad_norm": 0.12339158356189728, "learning_rate": 9.57359645285816e-06, "loss": 0.4568, "num_input_tokens_seen": 42547776, "step": 34985 }, { "epoch": 4.384162385665957, "grad_norm": 0.0786186158657074, "learning_rate": 9.573375503997264e-06, "loss": 0.4649, "num_input_tokens_seen": 42553600, "step": 34990 }, { "epoch": 4.38478887357474, "grad_norm": 0.11500933766365051, "learning_rate": 9.573154500457819e-06, "loss": 0.4578, "num_input_tokens_seen": 42559936, "step": 34995 }, { "epoch": 4.385415361483523, "grad_norm": 0.10259950906038284, "learning_rate": 9.572933442242468e-06, "loss": 0.4701, "num_input_tokens_seen": 42565792, "step": 35000 }, { "epoch": 4.386041849392306, "grad_norm": 0.0720491036772728, "learning_rate": 9.57271232935385e-06, "loss": 0.4719, "num_input_tokens_seen": 42572256, "step": 35005 }, { "epoch": 4.38666833730109, "grad_norm": 0.06099523976445198, "learning_rate": 9.572491161794613e-06, "loss": 0.4598, "num_input_tokens_seen": 42577984, "step": 35010 }, { "epoch": 4.387294825209874, "grad_norm": 0.07445354759693146, "learning_rate": 9.5722699395674e-06, "loss": 0.4615, "num_input_tokens_seen": 42584288, "step": 35015 }, { "epoch": 4.387921313118657, "grad_norm": 0.06695085763931274, "learning_rate": 9.572048662674856e-06, "loss": 0.4691, "num_input_tokens_seen": 42590432, "step": 35020 }, { "epoch": 4.38854780102744, "grad_norm": 0.030724018812179565, "learning_rate": 9.571827331119627e-06, "loss": 0.4619, "num_input_tokens_seen": 42596544, "step": 35025 }, { "epoch": 4.389174288936223, "grad_norm": 0.0755738914012909, "learning_rate": 9.571605944904357e-06, "loss": 0.4612, "num_input_tokens_seen": 42602560, "step": 35030 }, { "epoch": 4.389800776845007, "grad_norm": 0.09879233688116074, "learning_rate": 9.571384504031694e-06, "loss": 0.4656, "num_input_tokens_seen": 42608800, "step": 35035 }, { "epoch": 4.39042726475379, "grad_norm": 0.06412852555513382, "learning_rate": 9.571163008504287e-06, "loss": 0.4688, "num_input_tokens_seen": 42614944, "step": 35040 }, { "epoch": 4.3910537526625735, "grad_norm": 0.13295112550258636, "learning_rate": 9.570941458324783e-06, "loss": 0.4578, "num_input_tokens_seen": 42621120, "step": 35045 }, { "epoch": 4.391680240571357, "grad_norm": 0.07233360409736633, "learning_rate": 9.57071985349583e-06, "loss": 0.4657, "num_input_tokens_seen": 42627136, "step": 35050 }, { "epoch": 4.392306728480141, "grad_norm": 0.06212259829044342, "learning_rate": 9.57049819402008e-06, "loss": 0.4636, "num_input_tokens_seen": 42633248, "step": 35055 }, { "epoch": 4.392933216388924, "grad_norm": 0.07079694420099258, "learning_rate": 9.57027647990018e-06, "loss": 0.4617, "num_input_tokens_seen": 42639360, "step": 35060 }, { "epoch": 4.393559704297707, "grad_norm": 0.0679423063993454, "learning_rate": 9.570054711138782e-06, "loss": 0.4657, "num_input_tokens_seen": 42645376, "step": 35065 }, { "epoch": 4.39418619220649, "grad_norm": 0.08682288974523544, "learning_rate": 9.569832887738539e-06, "loss": 0.4589, "num_input_tokens_seen": 42651232, "step": 35070 }, { "epoch": 4.394812680115273, "grad_norm": 0.07486750930547714, "learning_rate": 9.5696110097021e-06, "loss": 0.4557, "num_input_tokens_seen": 42657632, "step": 35075 }, { "epoch": 4.395439168024057, "grad_norm": 0.11615349352359772, "learning_rate": 9.56938907703212e-06, "loss": 0.4625, "num_input_tokens_seen": 42663744, "step": 35080 }, { "epoch": 4.396065655932841, "grad_norm": 0.12961196899414062, "learning_rate": 9.569167089731253e-06, "loss": 0.4544, "num_input_tokens_seen": 42670144, "step": 35085 }, { "epoch": 4.396692143841624, "grad_norm": 0.07924388349056244, "learning_rate": 9.56894504780215e-06, "loss": 0.4557, "num_input_tokens_seen": 42676192, "step": 35090 }, { "epoch": 4.397318631750407, "grad_norm": 0.07288971543312073, "learning_rate": 9.568722951247469e-06, "loss": 0.4728, "num_input_tokens_seen": 42682272, "step": 35095 }, { "epoch": 4.397945119659191, "grad_norm": 0.100041963160038, "learning_rate": 9.568500800069864e-06, "loss": 0.4715, "num_input_tokens_seen": 42688256, "step": 35100 }, { "epoch": 4.398571607567974, "grad_norm": 0.06613588333129883, "learning_rate": 9.568278594271991e-06, "loss": 0.4624, "num_input_tokens_seen": 42694496, "step": 35105 }, { "epoch": 4.399198095476757, "grad_norm": 0.07266831398010254, "learning_rate": 9.568056333856505e-06, "loss": 0.4628, "num_input_tokens_seen": 42700352, "step": 35110 }, { "epoch": 4.3998245833855405, "grad_norm": 0.06336004287004471, "learning_rate": 9.567834018826068e-06, "loss": 0.459, "num_input_tokens_seen": 42706496, "step": 35115 }, { "epoch": 4.400451071294324, "grad_norm": 0.15543889999389648, "learning_rate": 9.567611649183333e-06, "loss": 0.4647, "num_input_tokens_seen": 42712480, "step": 35120 }, { "epoch": 4.401077559203108, "grad_norm": 0.03748306259512901, "learning_rate": 9.56738922493096e-06, "loss": 0.4632, "num_input_tokens_seen": 42718560, "step": 35125 }, { "epoch": 4.401704047111891, "grad_norm": 0.04101948440074921, "learning_rate": 9.567166746071608e-06, "loss": 0.4584, "num_input_tokens_seen": 42724992, "step": 35130 }, { "epoch": 4.402330535020674, "grad_norm": 0.07368247210979462, "learning_rate": 9.56694421260794e-06, "loss": 0.4705, "num_input_tokens_seen": 42730720, "step": 35135 }, { "epoch": 4.402957022929457, "grad_norm": 0.06844768673181534, "learning_rate": 9.566721624542613e-06, "loss": 0.4651, "num_input_tokens_seen": 42736800, "step": 35140 }, { "epoch": 4.40358351083824, "grad_norm": 0.0363670215010643, "learning_rate": 9.566498981878289e-06, "loss": 0.4594, "num_input_tokens_seen": 42743072, "step": 35145 }, { "epoch": 4.4042099987470245, "grad_norm": 0.09823257476091385, "learning_rate": 9.566276284617632e-06, "loss": 0.4609, "num_input_tokens_seen": 42749216, "step": 35150 }, { "epoch": 4.404836486655808, "grad_norm": 0.08047481626272202, "learning_rate": 9.566053532763301e-06, "loss": 0.4603, "num_input_tokens_seen": 42754368, "step": 35155 }, { "epoch": 4.405462974564591, "grad_norm": 0.07959003001451492, "learning_rate": 9.565830726317963e-06, "loss": 0.4583, "num_input_tokens_seen": 42760736, "step": 35160 }, { "epoch": 4.406089462473374, "grad_norm": 0.07276931405067444, "learning_rate": 9.565607865284279e-06, "loss": 0.4609, "num_input_tokens_seen": 42766464, "step": 35165 }, { "epoch": 4.406715950382158, "grad_norm": 0.07894225418567657, "learning_rate": 9.565384949664914e-06, "loss": 0.4625, "num_input_tokens_seen": 42772480, "step": 35170 }, { "epoch": 4.407342438290941, "grad_norm": 0.07884905487298965, "learning_rate": 9.565161979462533e-06, "loss": 0.4631, "num_input_tokens_seen": 42778496, "step": 35175 }, { "epoch": 4.407968926199724, "grad_norm": 0.07714197784662247, "learning_rate": 9.564938954679804e-06, "loss": 0.4594, "num_input_tokens_seen": 42784992, "step": 35180 }, { "epoch": 4.4085954141085075, "grad_norm": 0.07250147312879562, "learning_rate": 9.564715875319392e-06, "loss": 0.461, "num_input_tokens_seen": 42791232, "step": 35185 }, { "epoch": 4.409221902017291, "grad_norm": 0.0980834886431694, "learning_rate": 9.564492741383962e-06, "loss": 0.4619, "num_input_tokens_seen": 42797184, "step": 35190 }, { "epoch": 4.409848389926075, "grad_norm": 0.07983308285474777, "learning_rate": 9.564269552876185e-06, "loss": 0.4636, "num_input_tokens_seen": 42803392, "step": 35195 }, { "epoch": 4.410474877834858, "grad_norm": 0.08092311769723892, "learning_rate": 9.564046309798729e-06, "loss": 0.4653, "num_input_tokens_seen": 42809408, "step": 35200 }, { "epoch": 4.411101365743641, "grad_norm": 0.08818463236093521, "learning_rate": 9.563823012154261e-06, "loss": 0.4628, "num_input_tokens_seen": 42815584, "step": 35205 }, { "epoch": 4.411727853652424, "grad_norm": 0.1113540455698967, "learning_rate": 9.563599659945451e-06, "loss": 0.4652, "num_input_tokens_seen": 42821792, "step": 35210 }, { "epoch": 4.412354341561208, "grad_norm": 0.03489319235086441, "learning_rate": 9.563376253174972e-06, "loss": 0.4602, "num_input_tokens_seen": 42827328, "step": 35215 }, { "epoch": 4.4129808294699915, "grad_norm": 0.11364366114139557, "learning_rate": 9.563152791845492e-06, "loss": 0.4583, "num_input_tokens_seen": 42833440, "step": 35220 }, { "epoch": 4.413607317378775, "grad_norm": 0.08637281507253647, "learning_rate": 9.562929275959682e-06, "loss": 0.4629, "num_input_tokens_seen": 42839712, "step": 35225 }, { "epoch": 4.414233805287558, "grad_norm": 0.08467978239059448, "learning_rate": 9.56270570552022e-06, "loss": 0.4584, "num_input_tokens_seen": 42845952, "step": 35230 }, { "epoch": 4.414860293196341, "grad_norm": 0.03430246561765671, "learning_rate": 9.562482080529773e-06, "loss": 0.4636, "num_input_tokens_seen": 42851456, "step": 35235 }, { "epoch": 4.415486781105125, "grad_norm": 0.09235025197267532, "learning_rate": 9.562258400991016e-06, "loss": 0.4691, "num_input_tokens_seen": 42857024, "step": 35240 }, { "epoch": 4.416113269013908, "grad_norm": 0.09905071556568146, "learning_rate": 9.562034666906626e-06, "loss": 0.467, "num_input_tokens_seen": 42862976, "step": 35245 }, { "epoch": 4.416739756922691, "grad_norm": 0.08106325566768646, "learning_rate": 9.561810878279276e-06, "loss": 0.4586, "num_input_tokens_seen": 42869088, "step": 35250 }, { "epoch": 4.4173662448314746, "grad_norm": 0.10434189438819885, "learning_rate": 9.56158703511164e-06, "loss": 0.4614, "num_input_tokens_seen": 42875328, "step": 35255 }, { "epoch": 4.417992732740258, "grad_norm": 0.1015489250421524, "learning_rate": 9.561363137406396e-06, "loss": 0.4578, "num_input_tokens_seen": 42881408, "step": 35260 }, { "epoch": 4.418619220649042, "grad_norm": 0.08235945552587509, "learning_rate": 9.561139185166223e-06, "loss": 0.446, "num_input_tokens_seen": 42887264, "step": 35265 }, { "epoch": 4.419245708557825, "grad_norm": 0.16820506751537323, "learning_rate": 9.560915178393795e-06, "loss": 0.4573, "num_input_tokens_seen": 42893408, "step": 35270 }, { "epoch": 4.419872196466608, "grad_norm": 0.0962517261505127, "learning_rate": 9.56069111709179e-06, "loss": 0.4719, "num_input_tokens_seen": 42899392, "step": 35275 }, { "epoch": 4.420498684375391, "grad_norm": 0.13512717187404633, "learning_rate": 9.56046700126289e-06, "loss": 0.4761, "num_input_tokens_seen": 42905856, "step": 35280 }, { "epoch": 4.4211251722841745, "grad_norm": 0.11182296276092529, "learning_rate": 9.560242830909772e-06, "loss": 0.4604, "num_input_tokens_seen": 42912064, "step": 35285 }, { "epoch": 4.4217516601929585, "grad_norm": 0.0839901939034462, "learning_rate": 9.560018606035117e-06, "loss": 0.4662, "num_input_tokens_seen": 42918368, "step": 35290 }, { "epoch": 4.422378148101742, "grad_norm": 0.12545324862003326, "learning_rate": 9.559794326641607e-06, "loss": 0.4677, "num_input_tokens_seen": 42924512, "step": 35295 }, { "epoch": 4.423004636010525, "grad_norm": 0.1446174681186676, "learning_rate": 9.55956999273192e-06, "loss": 0.4663, "num_input_tokens_seen": 42930880, "step": 35300 }, { "epoch": 4.423631123919308, "grad_norm": 0.14220759272575378, "learning_rate": 9.559345604308741e-06, "loss": 0.4614, "num_input_tokens_seen": 42937056, "step": 35305 }, { "epoch": 4.424257611828092, "grad_norm": 0.14238819479942322, "learning_rate": 9.559121161374752e-06, "loss": 0.4644, "num_input_tokens_seen": 42943360, "step": 35310 }, { "epoch": 4.424884099736875, "grad_norm": 0.10839438438415527, "learning_rate": 9.558896663932637e-06, "loss": 0.4617, "num_input_tokens_seen": 42949504, "step": 35315 }, { "epoch": 4.425510587645658, "grad_norm": 0.09736427664756775, "learning_rate": 9.55867211198508e-06, "loss": 0.4561, "num_input_tokens_seen": 42955456, "step": 35320 }, { "epoch": 4.426137075554442, "grad_norm": 0.0819852277636528, "learning_rate": 9.558447505534764e-06, "loss": 0.457, "num_input_tokens_seen": 42961632, "step": 35325 }, { "epoch": 4.426763563463225, "grad_norm": 0.08524155616760254, "learning_rate": 9.558222844584374e-06, "loss": 0.465, "num_input_tokens_seen": 42967776, "step": 35330 }, { "epoch": 4.427390051372009, "grad_norm": 0.09644428640604019, "learning_rate": 9.5579981291366e-06, "loss": 0.4688, "num_input_tokens_seen": 42973632, "step": 35335 }, { "epoch": 4.428016539280792, "grad_norm": 0.1080193817615509, "learning_rate": 9.557773359194126e-06, "loss": 0.4668, "num_input_tokens_seen": 42978496, "step": 35340 }, { "epoch": 4.428643027189575, "grad_norm": 0.079546257853508, "learning_rate": 9.557548534759639e-06, "loss": 0.4662, "num_input_tokens_seen": 42984768, "step": 35345 }, { "epoch": 4.429269515098358, "grad_norm": 0.10868830978870392, "learning_rate": 9.557323655835829e-06, "loss": 0.4673, "num_input_tokens_seen": 42990976, "step": 35350 }, { "epoch": 4.429896003007142, "grad_norm": 0.07370629906654358, "learning_rate": 9.557098722425381e-06, "loss": 0.4639, "num_input_tokens_seen": 42997088, "step": 35355 }, { "epoch": 4.4305224909159255, "grad_norm": 0.08394742012023926, "learning_rate": 9.556873734530988e-06, "loss": 0.463, "num_input_tokens_seen": 43003136, "step": 35360 }, { "epoch": 4.431148978824709, "grad_norm": 0.03561076894402504, "learning_rate": 9.556648692155337e-06, "loss": 0.4634, "num_input_tokens_seen": 43009376, "step": 35365 }, { "epoch": 4.431775466733492, "grad_norm": 0.05978712439537048, "learning_rate": 9.55642359530112e-06, "loss": 0.4654, "num_input_tokens_seen": 43015680, "step": 35370 }, { "epoch": 4.432401954642275, "grad_norm": 0.07113008201122284, "learning_rate": 9.556198443971028e-06, "loss": 0.4633, "num_input_tokens_seen": 43021760, "step": 35375 }, { "epoch": 4.433028442551059, "grad_norm": 0.11157166957855225, "learning_rate": 9.555973238167753e-06, "loss": 0.4664, "num_input_tokens_seen": 43028128, "step": 35380 }, { "epoch": 4.433654930459842, "grad_norm": 0.08840273320674896, "learning_rate": 9.555747977893987e-06, "loss": 0.4611, "num_input_tokens_seen": 43034112, "step": 35385 }, { "epoch": 4.434281418368625, "grad_norm": 0.15110225975513458, "learning_rate": 9.555522663152425e-06, "loss": 0.4594, "num_input_tokens_seen": 43040384, "step": 35390 }, { "epoch": 4.434907906277409, "grad_norm": 0.10156626254320145, "learning_rate": 9.55529729394576e-06, "loss": 0.4616, "num_input_tokens_seen": 43046336, "step": 35395 }, { "epoch": 4.435534394186192, "grad_norm": 0.07174884527921677, "learning_rate": 9.555071870276685e-06, "loss": 0.4612, "num_input_tokens_seen": 43052448, "step": 35400 }, { "epoch": 4.436160882094976, "grad_norm": 0.10305945575237274, "learning_rate": 9.554846392147895e-06, "loss": 0.4625, "num_input_tokens_seen": 43059040, "step": 35405 }, { "epoch": 4.436787370003759, "grad_norm": 0.07509361207485199, "learning_rate": 9.554620859562089e-06, "loss": 0.4671, "num_input_tokens_seen": 43064704, "step": 35410 }, { "epoch": 4.437413857912542, "grad_norm": 0.0967857763171196, "learning_rate": 9.554395272521959e-06, "loss": 0.4595, "num_input_tokens_seen": 43070848, "step": 35415 }, { "epoch": 4.438040345821325, "grad_norm": 0.07828795909881592, "learning_rate": 9.554169631030206e-06, "loss": 0.4652, "num_input_tokens_seen": 43077056, "step": 35420 }, { "epoch": 4.438666833730109, "grad_norm": 0.07597272098064423, "learning_rate": 9.553943935089526e-06, "loss": 0.4631, "num_input_tokens_seen": 43083456, "step": 35425 }, { "epoch": 4.439293321638893, "grad_norm": 0.10298343747854233, "learning_rate": 9.553718184702618e-06, "loss": 0.4623, "num_input_tokens_seen": 43089568, "step": 35430 }, { "epoch": 4.439919809547676, "grad_norm": 0.11141643673181534, "learning_rate": 9.553492379872178e-06, "loss": 0.4626, "num_input_tokens_seen": 43095232, "step": 35435 }, { "epoch": 4.440546297456459, "grad_norm": 0.07338587194681168, "learning_rate": 9.55326652060091e-06, "loss": 0.465, "num_input_tokens_seen": 43101344, "step": 35440 }, { "epoch": 4.441172785365242, "grad_norm": 0.1265747845172882, "learning_rate": 9.553040606891513e-06, "loss": 0.4587, "num_input_tokens_seen": 43107264, "step": 35445 }, { "epoch": 4.441799273274026, "grad_norm": 0.07171527296304703, "learning_rate": 9.552814638746688e-06, "loss": 0.4647, "num_input_tokens_seen": 43113472, "step": 35450 }, { "epoch": 4.442425761182809, "grad_norm": 0.09537433087825775, "learning_rate": 9.552588616169135e-06, "loss": 0.4658, "num_input_tokens_seen": 43119392, "step": 35455 }, { "epoch": 4.4430522490915925, "grad_norm": 0.08052996546030045, "learning_rate": 9.552362539161558e-06, "loss": 0.4592, "num_input_tokens_seen": 43125696, "step": 35460 }, { "epoch": 4.443678737000376, "grad_norm": 0.13057206571102142, "learning_rate": 9.55213640772666e-06, "loss": 0.4568, "num_input_tokens_seen": 43131648, "step": 35465 }, { "epoch": 4.44430522490916, "grad_norm": 0.1945594996213913, "learning_rate": 9.551910221867143e-06, "loss": 0.4548, "num_input_tokens_seen": 43138304, "step": 35470 }, { "epoch": 4.444931712817943, "grad_norm": 0.0751764252781868, "learning_rate": 9.551683981585712e-06, "loss": 0.4717, "num_input_tokens_seen": 43144512, "step": 35475 }, { "epoch": 4.445558200726726, "grad_norm": 0.13098014891147614, "learning_rate": 9.551457686885072e-06, "loss": 0.4545, "num_input_tokens_seen": 43150496, "step": 35480 }, { "epoch": 4.446184688635509, "grad_norm": 0.21034158766269684, "learning_rate": 9.55123133776793e-06, "loss": 0.4693, "num_input_tokens_seen": 43155840, "step": 35485 }, { "epoch": 4.446811176544292, "grad_norm": 0.04812047258019447, "learning_rate": 9.55100493423699e-06, "loss": 0.4547, "num_input_tokens_seen": 43162016, "step": 35490 }, { "epoch": 4.447437664453076, "grad_norm": 0.12226065993309021, "learning_rate": 9.550778476294961e-06, "loss": 0.4646, "num_input_tokens_seen": 43168256, "step": 35495 }, { "epoch": 4.44806415236186, "grad_norm": 0.08243311941623688, "learning_rate": 9.550551963944548e-06, "loss": 0.4596, "num_input_tokens_seen": 43173856, "step": 35500 }, { "epoch": 4.448690640270643, "grad_norm": 0.0952141284942627, "learning_rate": 9.550325397188461e-06, "loss": 0.4538, "num_input_tokens_seen": 43180064, "step": 35505 }, { "epoch": 4.449317128179426, "grad_norm": 0.1487453281879425, "learning_rate": 9.550098776029407e-06, "loss": 0.468, "num_input_tokens_seen": 43186304, "step": 35510 }, { "epoch": 4.449943616088209, "grad_norm": 0.11649515479803085, "learning_rate": 9.549872100470098e-06, "loss": 0.4586, "num_input_tokens_seen": 43192544, "step": 35515 }, { "epoch": 4.450570103996993, "grad_norm": 0.1371256411075592, "learning_rate": 9.549645370513242e-06, "loss": 0.4642, "num_input_tokens_seen": 43198592, "step": 35520 }, { "epoch": 4.451196591905776, "grad_norm": 0.131510928273201, "learning_rate": 9.54941858616155e-06, "loss": 0.4663, "num_input_tokens_seen": 43204704, "step": 35525 }, { "epoch": 4.4518230798145595, "grad_norm": 0.08885776996612549, "learning_rate": 9.549191747417734e-06, "loss": 0.4578, "num_input_tokens_seen": 43211008, "step": 35530 }, { "epoch": 4.452449567723343, "grad_norm": 0.05169431492686272, "learning_rate": 9.548964854284508e-06, "loss": 0.4604, "num_input_tokens_seen": 43217184, "step": 35535 }, { "epoch": 4.453076055632126, "grad_norm": 0.117607980966568, "learning_rate": 9.548737906764581e-06, "loss": 0.4542, "num_input_tokens_seen": 43222432, "step": 35540 }, { "epoch": 4.45370254354091, "grad_norm": 0.1386728733778, "learning_rate": 9.548510904860667e-06, "loss": 0.4742, "num_input_tokens_seen": 43228608, "step": 35545 }, { "epoch": 4.454329031449693, "grad_norm": 0.10065417736768723, "learning_rate": 9.548283848575482e-06, "loss": 0.4555, "num_input_tokens_seen": 43234848, "step": 35550 }, { "epoch": 4.454955519358476, "grad_norm": 0.13618794083595276, "learning_rate": 9.54805673791174e-06, "loss": 0.468, "num_input_tokens_seen": 43241280, "step": 35555 }, { "epoch": 4.455582007267259, "grad_norm": 0.17613403499126434, "learning_rate": 9.547829572872155e-06, "loss": 0.4605, "num_input_tokens_seen": 43247488, "step": 35560 }, { "epoch": 4.4562084951760434, "grad_norm": 0.13788893818855286, "learning_rate": 9.547602353459444e-06, "loss": 0.461, "num_input_tokens_seen": 43253728, "step": 35565 }, { "epoch": 4.456834983084827, "grad_norm": 0.07281794399023056, "learning_rate": 9.547375079676325e-06, "loss": 0.4624, "num_input_tokens_seen": 43259872, "step": 35570 }, { "epoch": 4.45746147099361, "grad_norm": 0.11277210712432861, "learning_rate": 9.547147751525513e-06, "loss": 0.4578, "num_input_tokens_seen": 43265984, "step": 35575 }, { "epoch": 4.458087958902393, "grad_norm": 0.06874623894691467, "learning_rate": 9.546920369009725e-06, "loss": 0.4613, "num_input_tokens_seen": 43272320, "step": 35580 }, { "epoch": 4.458714446811177, "grad_norm": 0.07159298658370972, "learning_rate": 9.546692932131682e-06, "loss": 0.4688, "num_input_tokens_seen": 43278176, "step": 35585 }, { "epoch": 4.45934093471996, "grad_norm": 0.06764642894268036, "learning_rate": 9.546465440894102e-06, "loss": 0.4601, "num_input_tokens_seen": 43283616, "step": 35590 }, { "epoch": 4.459967422628743, "grad_norm": 0.08369175344705582, "learning_rate": 9.546237895299707e-06, "loss": 0.4612, "num_input_tokens_seen": 43289376, "step": 35595 }, { "epoch": 4.4605939105375265, "grad_norm": 0.07184521853923798, "learning_rate": 9.546010295351214e-06, "loss": 0.4618, "num_input_tokens_seen": 43295520, "step": 35600 }, { "epoch": 4.46122039844631, "grad_norm": 0.03578760102391243, "learning_rate": 9.545782641051345e-06, "loss": 0.4635, "num_input_tokens_seen": 43301760, "step": 35605 }, { "epoch": 4.461846886355094, "grad_norm": 0.10926678031682968, "learning_rate": 9.545554932402824e-06, "loss": 0.4607, "num_input_tokens_seen": 43307936, "step": 35610 }, { "epoch": 4.462473374263877, "grad_norm": 0.034777428954839706, "learning_rate": 9.545327169408372e-06, "loss": 0.4639, "num_input_tokens_seen": 43313824, "step": 35615 }, { "epoch": 4.46309986217266, "grad_norm": 0.08855970948934555, "learning_rate": 9.545099352070712e-06, "loss": 0.463, "num_input_tokens_seen": 43319680, "step": 35620 }, { "epoch": 4.463726350081443, "grad_norm": 0.11989520490169525, "learning_rate": 9.544871480392567e-06, "loss": 0.4717, "num_input_tokens_seen": 43326048, "step": 35625 }, { "epoch": 4.464352837990226, "grad_norm": 0.11712254583835602, "learning_rate": 9.544643554376665e-06, "loss": 0.4571, "num_input_tokens_seen": 43332256, "step": 35630 }, { "epoch": 4.4649793258990105, "grad_norm": 0.09275540709495544, "learning_rate": 9.544415574025727e-06, "loss": 0.4638, "num_input_tokens_seen": 43338112, "step": 35635 }, { "epoch": 4.465605813807794, "grad_norm": 0.08983203768730164, "learning_rate": 9.54418753934248e-06, "loss": 0.4638, "num_input_tokens_seen": 43344160, "step": 35640 }, { "epoch": 4.466232301716577, "grad_norm": 0.06525351852178574, "learning_rate": 9.543959450329649e-06, "loss": 0.468, "num_input_tokens_seen": 43350240, "step": 35645 }, { "epoch": 4.46685878962536, "grad_norm": 0.07022884488105774, "learning_rate": 9.543731306989964e-06, "loss": 0.4664, "num_input_tokens_seen": 43355776, "step": 35650 }, { "epoch": 4.467485277534143, "grad_norm": 0.06615109741687775, "learning_rate": 9.54350310932615e-06, "loss": 0.4606, "num_input_tokens_seen": 43361408, "step": 35655 }, { "epoch": 4.468111765442927, "grad_norm": 0.09359746426343918, "learning_rate": 9.543274857340938e-06, "loss": 0.4695, "num_input_tokens_seen": 43367744, "step": 35660 }, { "epoch": 4.46873825335171, "grad_norm": 0.0705951601266861, "learning_rate": 9.543046551037056e-06, "loss": 0.4695, "num_input_tokens_seen": 43373952, "step": 35665 }, { "epoch": 4.4693647412604935, "grad_norm": 0.07316280156373978, "learning_rate": 9.542818190417232e-06, "loss": 0.4684, "num_input_tokens_seen": 43379936, "step": 35670 }, { "epoch": 4.469991229169277, "grad_norm": 0.07433570176362991, "learning_rate": 9.542589775484195e-06, "loss": 0.4626, "num_input_tokens_seen": 43386112, "step": 35675 }, { "epoch": 4.470617717078061, "grad_norm": 0.07333499193191528, "learning_rate": 9.54236130624068e-06, "loss": 0.4616, "num_input_tokens_seen": 43392000, "step": 35680 }, { "epoch": 4.471244204986844, "grad_norm": 0.09081675112247467, "learning_rate": 9.542132782689415e-06, "loss": 0.461, "num_input_tokens_seen": 43398048, "step": 35685 }, { "epoch": 4.471870692895627, "grad_norm": 0.03381926566362381, "learning_rate": 9.541904204833136e-06, "loss": 0.4636, "num_input_tokens_seen": 43403872, "step": 35690 }, { "epoch": 4.47249718080441, "grad_norm": 0.11859046667814255, "learning_rate": 9.541675572674573e-06, "loss": 0.4711, "num_input_tokens_seen": 43410176, "step": 35695 }, { "epoch": 4.4731236687131934, "grad_norm": 0.10305003821849823, "learning_rate": 9.541446886216458e-06, "loss": 0.4673, "num_input_tokens_seen": 43416224, "step": 35700 }, { "epoch": 4.4737501566219775, "grad_norm": 0.1032625362277031, "learning_rate": 9.54121814546153e-06, "loss": 0.4631, "num_input_tokens_seen": 43422432, "step": 35705 }, { "epoch": 4.474376644530761, "grad_norm": 0.06730905920267105, "learning_rate": 9.54098935041252e-06, "loss": 0.4615, "num_input_tokens_seen": 43428480, "step": 35710 }, { "epoch": 4.475003132439544, "grad_norm": 0.09122034162282944, "learning_rate": 9.540760501072165e-06, "loss": 0.4575, "num_input_tokens_seen": 43434496, "step": 35715 }, { "epoch": 4.475629620348327, "grad_norm": 0.06883430480957031, "learning_rate": 9.5405315974432e-06, "loss": 0.4632, "num_input_tokens_seen": 43440960, "step": 35720 }, { "epoch": 4.476256108257111, "grad_norm": 0.0782000869512558, "learning_rate": 9.54030263952836e-06, "loss": 0.4552, "num_input_tokens_seen": 43447136, "step": 35725 }, { "epoch": 4.476882596165894, "grad_norm": 0.06441303342580795, "learning_rate": 9.540073627330388e-06, "loss": 0.4582, "num_input_tokens_seen": 43452896, "step": 35730 }, { "epoch": 4.477509084074677, "grad_norm": 0.0798897072672844, "learning_rate": 9.539844560852017e-06, "loss": 0.4621, "num_input_tokens_seen": 43458976, "step": 35735 }, { "epoch": 4.478135571983461, "grad_norm": 0.07281655073165894, "learning_rate": 9.539615440095989e-06, "loss": 0.462, "num_input_tokens_seen": 43464704, "step": 35740 }, { "epoch": 4.478762059892244, "grad_norm": 0.07585001736879349, "learning_rate": 9.53938626506504e-06, "loss": 0.4615, "num_input_tokens_seen": 43470784, "step": 35745 }, { "epoch": 4.479388547801028, "grad_norm": 0.09188675135374069, "learning_rate": 9.539157035761913e-06, "loss": 0.4677, "num_input_tokens_seen": 43476832, "step": 35750 }, { "epoch": 4.480015035709811, "grad_norm": 0.06926289945840836, "learning_rate": 9.538927752189347e-06, "loss": 0.4622, "num_input_tokens_seen": 43482784, "step": 35755 }, { "epoch": 4.480641523618594, "grad_norm": 0.08543477952480316, "learning_rate": 9.538698414350083e-06, "loss": 0.4667, "num_input_tokens_seen": 43489056, "step": 35760 }, { "epoch": 4.481268011527377, "grad_norm": 0.07286456972360611, "learning_rate": 9.538469022246863e-06, "loss": 0.4683, "num_input_tokens_seen": 43495264, "step": 35765 }, { "epoch": 4.4818944994361605, "grad_norm": 0.07617337256669998, "learning_rate": 9.53823957588243e-06, "loss": 0.4618, "num_input_tokens_seen": 43501568, "step": 35770 }, { "epoch": 4.4825209873449445, "grad_norm": 0.07552603632211685, "learning_rate": 9.53801007525953e-06, "loss": 0.4693, "num_input_tokens_seen": 43507744, "step": 35775 }, { "epoch": 4.483147475253728, "grad_norm": 0.08460187911987305, "learning_rate": 9.537780520380902e-06, "loss": 0.4626, "num_input_tokens_seen": 43512992, "step": 35780 }, { "epoch": 4.483773963162511, "grad_norm": 0.03938588872551918, "learning_rate": 9.537550911249293e-06, "loss": 0.4632, "num_input_tokens_seen": 43518784, "step": 35785 }, { "epoch": 4.484400451071294, "grad_norm": 0.0817461758852005, "learning_rate": 9.537321247867448e-06, "loss": 0.465, "num_input_tokens_seen": 43524928, "step": 35790 }, { "epoch": 4.485026938980078, "grad_norm": 0.07425472885370255, "learning_rate": 9.537091530238111e-06, "loss": 0.462, "num_input_tokens_seen": 43531136, "step": 35795 }, { "epoch": 4.485653426888861, "grad_norm": 0.0958053469657898, "learning_rate": 9.536861758364034e-06, "loss": 0.4654, "num_input_tokens_seen": 43537120, "step": 35800 }, { "epoch": 4.486279914797644, "grad_norm": 0.10501356422901154, "learning_rate": 9.536631932247958e-06, "loss": 0.4597, "num_input_tokens_seen": 43543264, "step": 35805 }, { "epoch": 4.486906402706428, "grad_norm": 0.09315831959247589, "learning_rate": 9.536402051892633e-06, "loss": 0.4674, "num_input_tokens_seen": 43549504, "step": 35810 }, { "epoch": 4.487532890615211, "grad_norm": 0.07658621668815613, "learning_rate": 9.536172117300807e-06, "loss": 0.4629, "num_input_tokens_seen": 43555968, "step": 35815 }, { "epoch": 4.488159378523995, "grad_norm": 0.06850381195545197, "learning_rate": 9.535942128475231e-06, "loss": 0.4584, "num_input_tokens_seen": 43562176, "step": 35820 }, { "epoch": 4.488785866432778, "grad_norm": 0.11601143330335617, "learning_rate": 9.535712085418652e-06, "loss": 0.4637, "num_input_tokens_seen": 43568416, "step": 35825 }, { "epoch": 4.489412354341561, "grad_norm": 0.13741298019886017, "learning_rate": 9.535481988133822e-06, "loss": 0.4606, "num_input_tokens_seen": 43574656, "step": 35830 }, { "epoch": 4.490038842250344, "grad_norm": 0.06847726553678513, "learning_rate": 9.535251836623491e-06, "loss": 0.4615, "num_input_tokens_seen": 43580768, "step": 35835 }, { "epoch": 4.490665330159128, "grad_norm": 0.07043582946062088, "learning_rate": 9.535021630890414e-06, "loss": 0.4588, "num_input_tokens_seen": 43586880, "step": 35840 }, { "epoch": 4.491291818067912, "grad_norm": 0.084734246134758, "learning_rate": 9.534791370937338e-06, "loss": 0.4591, "num_input_tokens_seen": 43593120, "step": 35845 }, { "epoch": 4.491918305976695, "grad_norm": 0.08216657489538193, "learning_rate": 9.534561056767018e-06, "loss": 0.4649, "num_input_tokens_seen": 43598912, "step": 35850 }, { "epoch": 4.492544793885478, "grad_norm": 0.06938213109970093, "learning_rate": 9.534330688382209e-06, "loss": 0.4644, "num_input_tokens_seen": 43605184, "step": 35855 }, { "epoch": 4.493171281794261, "grad_norm": 0.0678289458155632, "learning_rate": 9.534100265785665e-06, "loss": 0.4652, "num_input_tokens_seen": 43611200, "step": 35860 }, { "epoch": 4.493797769703045, "grad_norm": 0.07789760082960129, "learning_rate": 9.53386978898014e-06, "loss": 0.4665, "num_input_tokens_seen": 43617696, "step": 35865 }, { "epoch": 4.494424257611828, "grad_norm": 0.11694508790969849, "learning_rate": 9.53363925796839e-06, "loss": 0.4646, "num_input_tokens_seen": 43624032, "step": 35870 }, { "epoch": 4.4950507455206115, "grad_norm": 0.034386396408081055, "learning_rate": 9.53340867275317e-06, "loss": 0.4622, "num_input_tokens_seen": 43630336, "step": 35875 }, { "epoch": 4.495677233429395, "grad_norm": 0.08456971496343613, "learning_rate": 9.533178033337238e-06, "loss": 0.4614, "num_input_tokens_seen": 43636384, "step": 35880 }, { "epoch": 4.496303721338178, "grad_norm": 0.03135312721133232, "learning_rate": 9.532947339723353e-06, "loss": 0.4627, "num_input_tokens_seen": 43642784, "step": 35885 }, { "epoch": 4.496930209246962, "grad_norm": 0.08635780960321426, "learning_rate": 9.53271659191427e-06, "loss": 0.462, "num_input_tokens_seen": 43648832, "step": 35890 }, { "epoch": 4.497556697155745, "grad_norm": 0.06849481910467148, "learning_rate": 9.532485789912748e-06, "loss": 0.4627, "num_input_tokens_seen": 43655008, "step": 35895 }, { "epoch": 4.498183185064528, "grad_norm": 0.1174086257815361, "learning_rate": 9.53225493372155e-06, "loss": 0.4623, "num_input_tokens_seen": 43661312, "step": 35900 }, { "epoch": 4.498809672973311, "grad_norm": 0.11433679610490799, "learning_rate": 9.532024023343434e-06, "loss": 0.4628, "num_input_tokens_seen": 43667552, "step": 35905 }, { "epoch": 4.4994361608820945, "grad_norm": 0.06914924830198288, "learning_rate": 9.53179305878116e-06, "loss": 0.4646, "num_input_tokens_seen": 43673760, "step": 35910 }, { "epoch": 4.500062648790879, "grad_norm": 0.06571295857429504, "learning_rate": 9.531562040037487e-06, "loss": 0.4607, "num_input_tokens_seen": 43679648, "step": 35915 }, { "epoch": 4.500689136699662, "grad_norm": 0.07401121407747269, "learning_rate": 9.531330967115184e-06, "loss": 0.4659, "num_input_tokens_seen": 43685792, "step": 35920 }, { "epoch": 4.501315624608445, "grad_norm": 0.06854245066642761, "learning_rate": 9.531099840017007e-06, "loss": 0.4632, "num_input_tokens_seen": 43692256, "step": 35925 }, { "epoch": 4.501942112517228, "grad_norm": 0.06893859058618546, "learning_rate": 9.530868658745722e-06, "loss": 0.461, "num_input_tokens_seen": 43698432, "step": 35930 }, { "epoch": 4.502568600426012, "grad_norm": 0.12039536237716675, "learning_rate": 9.530637423304095e-06, "loss": 0.4627, "num_input_tokens_seen": 43704032, "step": 35935 }, { "epoch": 4.503195088334795, "grad_norm": 0.06180822104215622, "learning_rate": 9.530406133694887e-06, "loss": 0.4624, "num_input_tokens_seen": 43710080, "step": 35940 }, { "epoch": 4.5038215762435785, "grad_norm": 0.06400884687900543, "learning_rate": 9.530174789920863e-06, "loss": 0.4639, "num_input_tokens_seen": 43716224, "step": 35945 }, { "epoch": 4.504448064152362, "grad_norm": 0.11997261643409729, "learning_rate": 9.529943391984792e-06, "loss": 0.4644, "num_input_tokens_seen": 43722336, "step": 35950 }, { "epoch": 4.505074552061146, "grad_norm": 0.08067876845598221, "learning_rate": 9.52971193988944e-06, "loss": 0.4656, "num_input_tokens_seen": 43728512, "step": 35955 }, { "epoch": 4.505701039969929, "grad_norm": 0.1119520366191864, "learning_rate": 9.529480433637573e-06, "loss": 0.463, "num_input_tokens_seen": 43734304, "step": 35960 }, { "epoch": 4.506327527878712, "grad_norm": 0.06523289531469345, "learning_rate": 9.529248873231959e-06, "loss": 0.4635, "num_input_tokens_seen": 43740544, "step": 35965 }, { "epoch": 4.506954015787495, "grad_norm": 0.06770218163728714, "learning_rate": 9.529017258675366e-06, "loss": 0.4645, "num_input_tokens_seen": 43746688, "step": 35970 }, { "epoch": 4.507580503696278, "grad_norm": 0.08289032429456711, "learning_rate": 9.528785589970565e-06, "loss": 0.4622, "num_input_tokens_seen": 43752864, "step": 35975 }, { "epoch": 4.508206991605062, "grad_norm": 0.10250452160835266, "learning_rate": 9.528553867120325e-06, "loss": 0.4658, "num_input_tokens_seen": 43759104, "step": 35980 }, { "epoch": 4.508833479513846, "grad_norm": 0.10548559576272964, "learning_rate": 9.528322090127416e-06, "loss": 0.462, "num_input_tokens_seen": 43765088, "step": 35985 }, { "epoch": 4.509459967422629, "grad_norm": 0.09959259629249573, "learning_rate": 9.528090258994608e-06, "loss": 0.4585, "num_input_tokens_seen": 43771136, "step": 35990 }, { "epoch": 4.510086455331412, "grad_norm": 0.07573636621236801, "learning_rate": 9.527858373724674e-06, "loss": 0.4665, "num_input_tokens_seen": 43777056, "step": 35995 }, { "epoch": 4.510712943240195, "grad_norm": 0.06996145844459534, "learning_rate": 9.527626434320387e-06, "loss": 0.469, "num_input_tokens_seen": 43782976, "step": 36000 }, { "epoch": 4.511339431148979, "grad_norm": 0.07789143919944763, "learning_rate": 9.527394440784519e-06, "loss": 0.459, "num_input_tokens_seen": 43788960, "step": 36005 }, { "epoch": 4.511965919057762, "grad_norm": 0.06647787243127823, "learning_rate": 9.527162393119844e-06, "loss": 0.4605, "num_input_tokens_seen": 43795264, "step": 36010 }, { "epoch": 4.5125924069665455, "grad_norm": 0.07134516537189484, "learning_rate": 9.526930291329138e-06, "loss": 0.4621, "num_input_tokens_seen": 43801696, "step": 36015 }, { "epoch": 4.513218894875329, "grad_norm": 0.09759584814310074, "learning_rate": 9.526698135415174e-06, "loss": 0.4594, "num_input_tokens_seen": 43807808, "step": 36020 }, { "epoch": 4.513845382784112, "grad_norm": 0.07737160474061966, "learning_rate": 9.526465925380725e-06, "loss": 0.463, "num_input_tokens_seen": 43813856, "step": 36025 }, { "epoch": 4.514471870692896, "grad_norm": 0.16151371598243713, "learning_rate": 9.526233661228571e-06, "loss": 0.462, "num_input_tokens_seen": 43819840, "step": 36030 }, { "epoch": 4.515098358601679, "grad_norm": 0.06797324866056442, "learning_rate": 9.52600134296149e-06, "loss": 0.4594, "num_input_tokens_seen": 43825792, "step": 36035 }, { "epoch": 4.515724846510462, "grad_norm": 0.0675811916589737, "learning_rate": 9.525768970582259e-06, "loss": 0.4651, "num_input_tokens_seen": 43831968, "step": 36040 }, { "epoch": 4.516351334419245, "grad_norm": 0.06994718313217163, "learning_rate": 9.525536544093652e-06, "loss": 0.4623, "num_input_tokens_seen": 43838080, "step": 36045 }, { "epoch": 4.516977822328029, "grad_norm": 0.10151498019695282, "learning_rate": 9.525304063498451e-06, "loss": 0.463, "num_input_tokens_seen": 43844352, "step": 36050 }, { "epoch": 4.517604310236813, "grad_norm": 0.08485136926174164, "learning_rate": 9.525071528799437e-06, "loss": 0.4654, "num_input_tokens_seen": 43849824, "step": 36055 }, { "epoch": 4.518230798145596, "grad_norm": 0.10995804518461227, "learning_rate": 9.524838939999387e-06, "loss": 0.459, "num_input_tokens_seen": 43855936, "step": 36060 }, { "epoch": 4.518857286054379, "grad_norm": 0.06544807553291321, "learning_rate": 9.524606297101084e-06, "loss": 0.4684, "num_input_tokens_seen": 43861920, "step": 36065 }, { "epoch": 4.519483773963162, "grad_norm": 0.11391900479793549, "learning_rate": 9.524373600107308e-06, "loss": 0.4633, "num_input_tokens_seen": 43868160, "step": 36070 }, { "epoch": 4.520110261871946, "grad_norm": 0.07088565081357956, "learning_rate": 9.524140849020842e-06, "loss": 0.4605, "num_input_tokens_seen": 43873856, "step": 36075 }, { "epoch": 4.520736749780729, "grad_norm": 0.07999268919229507, "learning_rate": 9.523908043844467e-06, "loss": 0.4675, "num_input_tokens_seen": 43880064, "step": 36080 }, { "epoch": 4.5213632376895125, "grad_norm": 0.07029502838850021, "learning_rate": 9.52367518458097e-06, "loss": 0.4612, "num_input_tokens_seen": 43886336, "step": 36085 }, { "epoch": 4.521989725598296, "grad_norm": 0.06793497502803802, "learning_rate": 9.523442271233132e-06, "loss": 0.4602, "num_input_tokens_seen": 43892224, "step": 36090 }, { "epoch": 4.52261621350708, "grad_norm": 0.125984787940979, "learning_rate": 9.523209303803739e-06, "loss": 0.4579, "num_input_tokens_seen": 43898368, "step": 36095 }, { "epoch": 4.523242701415863, "grad_norm": 0.1029929667711258, "learning_rate": 9.522976282295576e-06, "loss": 0.4644, "num_input_tokens_seen": 43904288, "step": 36100 }, { "epoch": 4.523869189324646, "grad_norm": 0.07082144916057587, "learning_rate": 9.522743206711429e-06, "loss": 0.4642, "num_input_tokens_seen": 43910368, "step": 36105 }, { "epoch": 4.524495677233429, "grad_norm": 0.07782186567783356, "learning_rate": 9.522510077054082e-06, "loss": 0.4682, "num_input_tokens_seen": 43916608, "step": 36110 }, { "epoch": 4.525122165142212, "grad_norm": 0.0721421167254448, "learning_rate": 9.522276893326326e-06, "loss": 0.4661, "num_input_tokens_seen": 43922880, "step": 36115 }, { "epoch": 4.5257486530509965, "grad_norm": 0.0391758494079113, "learning_rate": 9.522043655530948e-06, "loss": 0.4667, "num_input_tokens_seen": 43929056, "step": 36120 }, { "epoch": 4.52637514095978, "grad_norm": 0.07265286147594452, "learning_rate": 9.521810363670738e-06, "loss": 0.4648, "num_input_tokens_seen": 43935136, "step": 36125 }, { "epoch": 4.527001628868563, "grad_norm": 0.10606931149959564, "learning_rate": 9.52157701774848e-06, "loss": 0.4669, "num_input_tokens_seen": 43941408, "step": 36130 }, { "epoch": 4.527628116777346, "grad_norm": 0.099638931453228, "learning_rate": 9.521343617766969e-06, "loss": 0.4685, "num_input_tokens_seen": 43947488, "step": 36135 }, { "epoch": 4.528254604686129, "grad_norm": 0.06290014833211899, "learning_rate": 9.521110163728995e-06, "loss": 0.463, "num_input_tokens_seen": 43954048, "step": 36140 }, { "epoch": 4.528881092594913, "grad_norm": 0.10749122500419617, "learning_rate": 9.520876655637345e-06, "loss": 0.4581, "num_input_tokens_seen": 43960320, "step": 36145 }, { "epoch": 4.529507580503696, "grad_norm": 0.07505729794502258, "learning_rate": 9.520643093494815e-06, "loss": 0.4601, "num_input_tokens_seen": 43966368, "step": 36150 }, { "epoch": 4.53013406841248, "grad_norm": 0.06902162730693817, "learning_rate": 9.520409477304198e-06, "loss": 0.463, "num_input_tokens_seen": 43972288, "step": 36155 }, { "epoch": 4.530760556321263, "grad_norm": 0.06612695753574371, "learning_rate": 9.520175807068282e-06, "loss": 0.4643, "num_input_tokens_seen": 43978464, "step": 36160 }, { "epoch": 4.531387044230046, "grad_norm": 0.06746464967727661, "learning_rate": 9.519942082789866e-06, "loss": 0.4618, "num_input_tokens_seen": 43983680, "step": 36165 }, { "epoch": 4.53201353213883, "grad_norm": 0.0710972473025322, "learning_rate": 9.51970830447174e-06, "loss": 0.4657, "num_input_tokens_seen": 43990016, "step": 36170 }, { "epoch": 4.532640020047613, "grad_norm": 0.03300592303276062, "learning_rate": 9.519474472116703e-06, "loss": 0.4645, "num_input_tokens_seen": 43996032, "step": 36175 }, { "epoch": 4.533266507956396, "grad_norm": 0.10220419615507126, "learning_rate": 9.51924058572755e-06, "loss": 0.4596, "num_input_tokens_seen": 44002176, "step": 36180 }, { "epoch": 4.5338929958651795, "grad_norm": 0.06896989047527313, "learning_rate": 9.519006645307073e-06, "loss": 0.4585, "num_input_tokens_seen": 44008160, "step": 36185 }, { "epoch": 4.5345194837739635, "grad_norm": 0.10970786958932877, "learning_rate": 9.518772650858075e-06, "loss": 0.4689, "num_input_tokens_seen": 44013888, "step": 36190 }, { "epoch": 4.535145971682747, "grad_norm": 0.100513756275177, "learning_rate": 9.51853860238335e-06, "loss": 0.4639, "num_input_tokens_seen": 44019936, "step": 36195 }, { "epoch": 4.53577245959153, "grad_norm": 0.06930246204137802, "learning_rate": 9.518304499885697e-06, "loss": 0.4605, "num_input_tokens_seen": 44025952, "step": 36200 }, { "epoch": 4.536398947500313, "grad_norm": 0.0698762834072113, "learning_rate": 9.518070343367913e-06, "loss": 0.4612, "num_input_tokens_seen": 44031872, "step": 36205 }, { "epoch": 4.537025435409097, "grad_norm": 0.11018511652946472, "learning_rate": 9.517836132832804e-06, "loss": 0.4608, "num_input_tokens_seen": 44038272, "step": 36210 }, { "epoch": 4.53765192331788, "grad_norm": 0.06473439186811447, "learning_rate": 9.517601868283162e-06, "loss": 0.4614, "num_input_tokens_seen": 44044704, "step": 36215 }, { "epoch": 4.538278411226663, "grad_norm": 0.0852159783244133, "learning_rate": 9.517367549721792e-06, "loss": 0.4629, "num_input_tokens_seen": 44050880, "step": 36220 }, { "epoch": 4.538904899135447, "grad_norm": 0.12010329961776733, "learning_rate": 9.517133177151494e-06, "loss": 0.4656, "num_input_tokens_seen": 44057184, "step": 36225 }, { "epoch": 4.53953138704423, "grad_norm": 0.03776268661022186, "learning_rate": 9.516898750575073e-06, "loss": 0.4629, "num_input_tokens_seen": 44063360, "step": 36230 }, { "epoch": 4.540157874953014, "grad_norm": 0.07563948631286621, "learning_rate": 9.51666426999533e-06, "loss": 0.462, "num_input_tokens_seen": 44068768, "step": 36235 }, { "epoch": 4.540784362861797, "grad_norm": 0.10836827009916306, "learning_rate": 9.516429735415067e-06, "loss": 0.46, "num_input_tokens_seen": 44074336, "step": 36240 }, { "epoch": 4.54141085077058, "grad_norm": 0.10214801132678986, "learning_rate": 9.51619514683709e-06, "loss": 0.4626, "num_input_tokens_seen": 44080416, "step": 36245 }, { "epoch": 4.542037338679363, "grad_norm": 0.1061331182718277, "learning_rate": 9.515960504264201e-06, "loss": 0.4549, "num_input_tokens_seen": 44086784, "step": 36250 }, { "epoch": 4.5426638265881465, "grad_norm": 0.09797714650630951, "learning_rate": 9.51572580769921e-06, "loss": 0.4554, "num_input_tokens_seen": 44092640, "step": 36255 }, { "epoch": 4.5432903144969305, "grad_norm": 0.24487093091011047, "learning_rate": 9.51549105714492e-06, "loss": 0.4567, "num_input_tokens_seen": 44098176, "step": 36260 }, { "epoch": 4.543916802405714, "grad_norm": 0.13443443179130554, "learning_rate": 9.515256252604137e-06, "loss": 0.4792, "num_input_tokens_seen": 44104256, "step": 36265 }, { "epoch": 4.544543290314497, "grad_norm": 0.10104026645421982, "learning_rate": 9.51502139407967e-06, "loss": 0.4581, "num_input_tokens_seen": 44109632, "step": 36270 }, { "epoch": 4.54516977822328, "grad_norm": 0.1626080721616745, "learning_rate": 9.514786481574325e-06, "loss": 0.4623, "num_input_tokens_seen": 44115776, "step": 36275 }, { "epoch": 4.545796266132063, "grad_norm": 0.12704448401927948, "learning_rate": 9.514551515090913e-06, "loss": 0.4651, "num_input_tokens_seen": 44122016, "step": 36280 }, { "epoch": 4.546422754040847, "grad_norm": 0.08842842280864716, "learning_rate": 9.514316494632242e-06, "loss": 0.4683, "num_input_tokens_seen": 44128256, "step": 36285 }, { "epoch": 4.5470492419496304, "grad_norm": 0.11000759154558182, "learning_rate": 9.514081420201123e-06, "loss": 0.4594, "num_input_tokens_seen": 44134080, "step": 36290 }, { "epoch": 4.547675729858414, "grad_norm": 0.1122826337814331, "learning_rate": 9.513846291800363e-06, "loss": 0.4624, "num_input_tokens_seen": 44140064, "step": 36295 }, { "epoch": 4.548302217767197, "grad_norm": 0.07810576260089874, "learning_rate": 9.513611109432777e-06, "loss": 0.4553, "num_input_tokens_seen": 44146304, "step": 36300 }, { "epoch": 4.54892870567598, "grad_norm": 0.10872667282819748, "learning_rate": 9.513375873101173e-06, "loss": 0.4632, "num_input_tokens_seen": 44152192, "step": 36305 }, { "epoch": 4.549555193584764, "grad_norm": 0.10068360716104507, "learning_rate": 9.51314058280837e-06, "loss": 0.4671, "num_input_tokens_seen": 44158400, "step": 36310 }, { "epoch": 4.550181681493547, "grad_norm": 0.09867314249277115, "learning_rate": 9.512905238557174e-06, "loss": 0.4712, "num_input_tokens_seen": 44164512, "step": 36315 }, { "epoch": 4.55080816940233, "grad_norm": 0.15802423655986786, "learning_rate": 9.512669840350401e-06, "loss": 0.4619, "num_input_tokens_seen": 44170496, "step": 36320 }, { "epoch": 4.5514346573111135, "grad_norm": 0.12304199486970901, "learning_rate": 9.512434388190868e-06, "loss": 0.468, "num_input_tokens_seen": 44176832, "step": 36325 }, { "epoch": 4.552061145219898, "grad_norm": 0.10063665360212326, "learning_rate": 9.512198882081386e-06, "loss": 0.4576, "num_input_tokens_seen": 44182560, "step": 36330 }, { "epoch": 4.552687633128681, "grad_norm": 0.09901326149702072, "learning_rate": 9.511963322024776e-06, "loss": 0.4659, "num_input_tokens_seen": 44188640, "step": 36335 }, { "epoch": 4.553314121037464, "grad_norm": 0.11141746491193771, "learning_rate": 9.511727708023847e-06, "loss": 0.4622, "num_input_tokens_seen": 44195328, "step": 36340 }, { "epoch": 4.553940608946247, "grad_norm": 0.12728504836559296, "learning_rate": 9.511492040081423e-06, "loss": 0.4685, "num_input_tokens_seen": 44201344, "step": 36345 }, { "epoch": 4.554567096855031, "grad_norm": 0.12238800525665283, "learning_rate": 9.511256318200318e-06, "loss": 0.4626, "num_input_tokens_seen": 44207232, "step": 36350 }, { "epoch": 4.555193584763814, "grad_norm": 0.0661453902721405, "learning_rate": 9.51102054238335e-06, "loss": 0.462, "num_input_tokens_seen": 44213504, "step": 36355 }, { "epoch": 4.5558200726725975, "grad_norm": 0.06714674830436707, "learning_rate": 9.51078471263334e-06, "loss": 0.4636, "num_input_tokens_seen": 44219744, "step": 36360 }, { "epoch": 4.556446560581381, "grad_norm": 0.0756683498620987, "learning_rate": 9.510548828953106e-06, "loss": 0.4611, "num_input_tokens_seen": 44226080, "step": 36365 }, { "epoch": 4.557073048490164, "grad_norm": 0.05980809032917023, "learning_rate": 9.510312891345468e-06, "loss": 0.4631, "num_input_tokens_seen": 44232000, "step": 36370 }, { "epoch": 4.557699536398948, "grad_norm": 0.07226544618606567, "learning_rate": 9.510076899813247e-06, "loss": 0.459, "num_input_tokens_seen": 44238144, "step": 36375 }, { "epoch": 4.558326024307731, "grad_norm": 0.10211125016212463, "learning_rate": 9.509840854359265e-06, "loss": 0.4614, "num_input_tokens_seen": 44244096, "step": 36380 }, { "epoch": 4.558952512216514, "grad_norm": 0.07508382946252823, "learning_rate": 9.509604754986343e-06, "loss": 0.4593, "num_input_tokens_seen": 44250432, "step": 36385 }, { "epoch": 4.559579000125297, "grad_norm": 0.03481394797563553, "learning_rate": 9.509368601697306e-06, "loss": 0.4585, "num_input_tokens_seen": 44256512, "step": 36390 }, { "epoch": 4.5602054880340805, "grad_norm": 0.06629309058189392, "learning_rate": 9.509132394494976e-06, "loss": 0.4561, "num_input_tokens_seen": 44262624, "step": 36395 }, { "epoch": 4.560831975942865, "grad_norm": 0.07386219501495361, "learning_rate": 9.508896133382177e-06, "loss": 0.4666, "num_input_tokens_seen": 44268064, "step": 36400 }, { "epoch": 4.561458463851648, "grad_norm": 0.07250747084617615, "learning_rate": 9.508659818361735e-06, "loss": 0.4602, "num_input_tokens_seen": 44274560, "step": 36405 }, { "epoch": 4.562084951760431, "grad_norm": 0.0679154023528099, "learning_rate": 9.508423449436473e-06, "loss": 0.4571, "num_input_tokens_seen": 44280704, "step": 36410 }, { "epoch": 4.562711439669214, "grad_norm": 0.03132656589150429, "learning_rate": 9.508187026609217e-06, "loss": 0.465, "num_input_tokens_seen": 44286656, "step": 36415 }, { "epoch": 4.563337927577997, "grad_norm": 0.0662442222237587, "learning_rate": 9.507950549882798e-06, "loss": 0.4628, "num_input_tokens_seen": 44292160, "step": 36420 }, { "epoch": 4.563964415486781, "grad_norm": 0.06404873728752136, "learning_rate": 9.507714019260037e-06, "loss": 0.4608, "num_input_tokens_seen": 44298336, "step": 36425 }, { "epoch": 4.5645909033955645, "grad_norm": 0.08286773413419724, "learning_rate": 9.507477434743765e-06, "loss": 0.4629, "num_input_tokens_seen": 44304512, "step": 36430 }, { "epoch": 4.565217391304348, "grad_norm": 0.07266219705343246, "learning_rate": 9.507240796336815e-06, "loss": 0.4629, "num_input_tokens_seen": 44310784, "step": 36435 }, { "epoch": 4.565843879213131, "grad_norm": 0.07943198829889297, "learning_rate": 9.507004104042007e-06, "loss": 0.4589, "num_input_tokens_seen": 44317056, "step": 36440 }, { "epoch": 4.566470367121915, "grad_norm": 0.07756534963846207, "learning_rate": 9.506767357862177e-06, "loss": 0.4651, "num_input_tokens_seen": 44322496, "step": 36445 }, { "epoch": 4.567096855030698, "grad_norm": 0.09763862937688828, "learning_rate": 9.506530557800156e-06, "loss": 0.4647, "num_input_tokens_seen": 44328704, "step": 36450 }, { "epoch": 4.567723342939481, "grad_norm": 0.13549983501434326, "learning_rate": 9.506293703858772e-06, "loss": 0.4459, "num_input_tokens_seen": 44334656, "step": 36455 }, { "epoch": 4.568349830848264, "grad_norm": 0.15889084339141846, "learning_rate": 9.506056796040857e-06, "loss": 0.4504, "num_input_tokens_seen": 44340416, "step": 36460 }, { "epoch": 4.5689763187570485, "grad_norm": 0.08864753693342209, "learning_rate": 9.505819834349246e-06, "loss": 0.4696, "num_input_tokens_seen": 44346944, "step": 36465 }, { "epoch": 4.569602806665832, "grad_norm": 0.08315768092870712, "learning_rate": 9.505582818786768e-06, "loss": 0.4618, "num_input_tokens_seen": 44353024, "step": 36470 }, { "epoch": 4.570229294574615, "grad_norm": 0.09632895886898041, "learning_rate": 9.505345749356262e-06, "loss": 0.447, "num_input_tokens_seen": 44359392, "step": 36475 }, { "epoch": 4.570855782483398, "grad_norm": 0.07641895860433578, "learning_rate": 9.505108626060559e-06, "loss": 0.4656, "num_input_tokens_seen": 44365472, "step": 36480 }, { "epoch": 4.571482270392181, "grad_norm": 0.0924353376030922, "learning_rate": 9.504871448902493e-06, "loss": 0.4673, "num_input_tokens_seen": 44371744, "step": 36485 }, { "epoch": 4.572108758300965, "grad_norm": 0.09983295947313309, "learning_rate": 9.504634217884904e-06, "loss": 0.4675, "num_input_tokens_seen": 44378144, "step": 36490 }, { "epoch": 4.572735246209748, "grad_norm": 0.08711396157741547, "learning_rate": 9.504396933010624e-06, "loss": 0.4563, "num_input_tokens_seen": 44384160, "step": 36495 }, { "epoch": 4.5733617341185315, "grad_norm": 0.08138197660446167, "learning_rate": 9.504159594282494e-06, "loss": 0.4716, "num_input_tokens_seen": 44390304, "step": 36500 }, { "epoch": 4.573988222027315, "grad_norm": 0.11335105448961258, "learning_rate": 9.503922201703347e-06, "loss": 0.4529, "num_input_tokens_seen": 44396928, "step": 36505 }, { "epoch": 4.574614709936098, "grad_norm": 0.07636934518814087, "learning_rate": 9.503684755276024e-06, "loss": 0.4648, "num_input_tokens_seen": 44403168, "step": 36510 }, { "epoch": 4.575241197844882, "grad_norm": 0.10826486349105835, "learning_rate": 9.503447255003361e-06, "loss": 0.4668, "num_input_tokens_seen": 44409248, "step": 36515 }, { "epoch": 4.575867685753665, "grad_norm": 0.08328580111265182, "learning_rate": 9.503209700888204e-06, "loss": 0.4584, "num_input_tokens_seen": 44414912, "step": 36520 }, { "epoch": 4.576494173662448, "grad_norm": 0.08388075977563858, "learning_rate": 9.502972092933384e-06, "loss": 0.4728, "num_input_tokens_seen": 44421312, "step": 36525 }, { "epoch": 4.577120661571231, "grad_norm": 0.11080560833215714, "learning_rate": 9.502734431141752e-06, "loss": 0.471, "num_input_tokens_seen": 44427328, "step": 36530 }, { "epoch": 4.577747149480015, "grad_norm": 0.14366158843040466, "learning_rate": 9.50249671551614e-06, "loss": 0.4672, "num_input_tokens_seen": 44433568, "step": 36535 }, { "epoch": 4.578373637388799, "grad_norm": 0.09725537151098251, "learning_rate": 9.502258946059397e-06, "loss": 0.4654, "num_input_tokens_seen": 44439552, "step": 36540 }, { "epoch": 4.579000125297582, "grad_norm": 0.0749427005648613, "learning_rate": 9.50202112277436e-06, "loss": 0.4585, "num_input_tokens_seen": 44445632, "step": 36545 }, { "epoch": 4.579626613206365, "grad_norm": 0.08802694082260132, "learning_rate": 9.501783245663878e-06, "loss": 0.4518, "num_input_tokens_seen": 44451904, "step": 36550 }, { "epoch": 4.580253101115148, "grad_norm": 0.0797843486070633, "learning_rate": 9.501545314730792e-06, "loss": 0.4644, "num_input_tokens_seen": 44458336, "step": 36555 }, { "epoch": 4.580879589023932, "grad_norm": 0.07227008044719696, "learning_rate": 9.501307329977947e-06, "loss": 0.4635, "num_input_tokens_seen": 44464320, "step": 36560 }, { "epoch": 4.581506076932715, "grad_norm": 0.07998353242874146, "learning_rate": 9.501069291408189e-06, "loss": 0.4597, "num_input_tokens_seen": 44470400, "step": 36565 }, { "epoch": 4.582132564841499, "grad_norm": 0.0464574433863163, "learning_rate": 9.500831199024362e-06, "loss": 0.4619, "num_input_tokens_seen": 44476512, "step": 36570 }, { "epoch": 4.582759052750282, "grad_norm": 0.0744822546839714, "learning_rate": 9.500593052829314e-06, "loss": 0.4666, "num_input_tokens_seen": 44481952, "step": 36575 }, { "epoch": 4.583385540659065, "grad_norm": 0.0964348167181015, "learning_rate": 9.500354852825893e-06, "loss": 0.4781, "num_input_tokens_seen": 44487872, "step": 36580 }, { "epoch": 4.584012028567849, "grad_norm": 0.08575574308633804, "learning_rate": 9.500116599016945e-06, "loss": 0.4701, "num_input_tokens_seen": 44493760, "step": 36585 }, { "epoch": 4.584638516476632, "grad_norm": 0.07492964714765549, "learning_rate": 9.49987829140532e-06, "loss": 0.4625, "num_input_tokens_seen": 44500096, "step": 36590 }, { "epoch": 4.585265004385415, "grad_norm": 0.07292871177196503, "learning_rate": 9.499639929993866e-06, "loss": 0.4673, "num_input_tokens_seen": 44505760, "step": 36595 }, { "epoch": 4.5858914922941985, "grad_norm": 0.06511002779006958, "learning_rate": 9.499401514785436e-06, "loss": 0.4639, "num_input_tokens_seen": 44511872, "step": 36600 }, { "epoch": 4.5865179802029825, "grad_norm": 0.12788215279579163, "learning_rate": 9.499163045782877e-06, "loss": 0.4583, "num_input_tokens_seen": 44518016, "step": 36605 }, { "epoch": 4.587144468111766, "grad_norm": 0.11775465309619904, "learning_rate": 9.498924522989038e-06, "loss": 0.4682, "num_input_tokens_seen": 44524064, "step": 36610 }, { "epoch": 4.587770956020549, "grad_norm": 0.06643540412187576, "learning_rate": 9.498685946406778e-06, "loss": 0.4682, "num_input_tokens_seen": 44529984, "step": 36615 }, { "epoch": 4.588397443929332, "grad_norm": 0.1132320910692215, "learning_rate": 9.498447316038944e-06, "loss": 0.4658, "num_input_tokens_seen": 44535712, "step": 36620 }, { "epoch": 4.589023931838115, "grad_norm": 0.07593559473752975, "learning_rate": 9.49820863188839e-06, "loss": 0.4595, "num_input_tokens_seen": 44541088, "step": 36625 }, { "epoch": 4.589650419746899, "grad_norm": 0.11844434589147568, "learning_rate": 9.497969893957971e-06, "loss": 0.4629, "num_input_tokens_seen": 44546976, "step": 36630 }, { "epoch": 4.590276907655682, "grad_norm": 0.0781630277633667, "learning_rate": 9.497731102250538e-06, "loss": 0.4649, "num_input_tokens_seen": 44553280, "step": 36635 }, { "epoch": 4.590903395564466, "grad_norm": 0.0809953510761261, "learning_rate": 9.49749225676895e-06, "loss": 0.4657, "num_input_tokens_seen": 44559520, "step": 36640 }, { "epoch": 4.591529883473249, "grad_norm": 0.11119664460420609, "learning_rate": 9.497253357516059e-06, "loss": 0.4662, "num_input_tokens_seen": 44565792, "step": 36645 }, { "epoch": 4.592156371382032, "grad_norm": 0.0833410769701004, "learning_rate": 9.497014404494725e-06, "loss": 0.4635, "num_input_tokens_seen": 44571744, "step": 36650 }, { "epoch": 4.592782859290816, "grad_norm": 0.06809142231941223, "learning_rate": 9.4967753977078e-06, "loss": 0.461, "num_input_tokens_seen": 44578176, "step": 36655 }, { "epoch": 4.593409347199599, "grad_norm": 0.07728612422943115, "learning_rate": 9.496536337158146e-06, "loss": 0.4609, "num_input_tokens_seen": 44584544, "step": 36660 }, { "epoch": 4.594035835108382, "grad_norm": 0.06843075156211853, "learning_rate": 9.49629722284862e-06, "loss": 0.4571, "num_input_tokens_seen": 44590560, "step": 36665 }, { "epoch": 4.5946623230171655, "grad_norm": 0.06858085095882416, "learning_rate": 9.496058054782082e-06, "loss": 0.458, "num_input_tokens_seen": 44596768, "step": 36670 }, { "epoch": 4.595288810925949, "grad_norm": 0.07462809979915619, "learning_rate": 9.495818832961388e-06, "loss": 0.4593, "num_input_tokens_seen": 44603104, "step": 36675 }, { "epoch": 4.595915298834733, "grad_norm": 0.11294609308242798, "learning_rate": 9.495579557389401e-06, "loss": 0.4613, "num_input_tokens_seen": 44609472, "step": 36680 }, { "epoch": 4.596541786743516, "grad_norm": 0.06892121583223343, "learning_rate": 9.49534022806898e-06, "loss": 0.4629, "num_input_tokens_seen": 44615136, "step": 36685 }, { "epoch": 4.597168274652299, "grad_norm": 0.1005842536687851, "learning_rate": 9.495100845002985e-06, "loss": 0.4644, "num_input_tokens_seen": 44620992, "step": 36690 }, { "epoch": 4.597794762561082, "grad_norm": 0.09317360073328018, "learning_rate": 9.494861408194282e-06, "loss": 0.4715, "num_input_tokens_seen": 44627488, "step": 36695 }, { "epoch": 4.598421250469866, "grad_norm": 0.0982384905219078, "learning_rate": 9.494621917645733e-06, "loss": 0.4609, "num_input_tokens_seen": 44633856, "step": 36700 }, { "epoch": 4.599047738378649, "grad_norm": 0.09289027750492096, "learning_rate": 9.4943823733602e-06, "loss": 0.4712, "num_input_tokens_seen": 44639840, "step": 36705 }, { "epoch": 4.599674226287433, "grad_norm": 0.09549344331026077, "learning_rate": 9.494142775340544e-06, "loss": 0.4653, "num_input_tokens_seen": 44645792, "step": 36710 }, { "epoch": 4.600300714196216, "grad_norm": 0.03718920424580574, "learning_rate": 9.493903123589636e-06, "loss": 0.457, "num_input_tokens_seen": 44651680, "step": 36715 }, { "epoch": 4.600927202105, "grad_norm": 0.07894529402256012, "learning_rate": 9.493663418110337e-06, "loss": 0.4549, "num_input_tokens_seen": 44657824, "step": 36720 }, { "epoch": 4.601553690013783, "grad_norm": 0.1423255205154419, "learning_rate": 9.493423658905514e-06, "loss": 0.4723, "num_input_tokens_seen": 44664128, "step": 36725 }, { "epoch": 4.602180177922566, "grad_norm": 0.07047328352928162, "learning_rate": 9.493183845978033e-06, "loss": 0.4568, "num_input_tokens_seen": 44670176, "step": 36730 }, { "epoch": 4.602806665831349, "grad_norm": 0.09149055927991867, "learning_rate": 9.49294397933076e-06, "loss": 0.4673, "num_input_tokens_seen": 44675808, "step": 36735 }, { "epoch": 4.6034331537401325, "grad_norm": 0.07065156102180481, "learning_rate": 9.492704058966567e-06, "loss": 0.4614, "num_input_tokens_seen": 44681472, "step": 36740 }, { "epoch": 4.604059641648917, "grad_norm": 0.08964347094297409, "learning_rate": 9.492464084888318e-06, "loss": 0.4634, "num_input_tokens_seen": 44687520, "step": 36745 }, { "epoch": 4.6046861295577, "grad_norm": 0.07104220986366272, "learning_rate": 9.492224057098883e-06, "loss": 0.4635, "num_input_tokens_seen": 44693760, "step": 36750 }, { "epoch": 4.605312617466483, "grad_norm": 0.07915333658456802, "learning_rate": 9.491983975601134e-06, "loss": 0.459, "num_input_tokens_seen": 44700064, "step": 36755 }, { "epoch": 4.605939105375266, "grad_norm": 0.10379264503717422, "learning_rate": 9.49174384039794e-06, "loss": 0.4621, "num_input_tokens_seen": 44706368, "step": 36760 }, { "epoch": 4.606565593284049, "grad_norm": 0.08223076164722443, "learning_rate": 9.491503651492171e-06, "loss": 0.4641, "num_input_tokens_seen": 44712320, "step": 36765 }, { "epoch": 4.607192081192833, "grad_norm": 0.1349756419658661, "learning_rate": 9.491263408886699e-06, "loss": 0.4636, "num_input_tokens_seen": 44718176, "step": 36770 }, { "epoch": 4.6078185691016165, "grad_norm": 0.07453160732984543, "learning_rate": 9.4910231125844e-06, "loss": 0.4612, "num_input_tokens_seen": 44723840, "step": 36775 }, { "epoch": 4.6084450570104, "grad_norm": 0.0791812315583229, "learning_rate": 9.49078276258814e-06, "loss": 0.4679, "num_input_tokens_seen": 44729824, "step": 36780 }, { "epoch": 4.609071544919183, "grad_norm": 0.10414119064807892, "learning_rate": 9.490542358900799e-06, "loss": 0.4579, "num_input_tokens_seen": 44736192, "step": 36785 }, { "epoch": 4.609698032827966, "grad_norm": 0.06685462594032288, "learning_rate": 9.490301901525247e-06, "loss": 0.4588, "num_input_tokens_seen": 44742432, "step": 36790 }, { "epoch": 4.61032452073675, "grad_norm": 0.12401488423347473, "learning_rate": 9.490061390464363e-06, "loss": 0.4575, "num_input_tokens_seen": 44748288, "step": 36795 }, { "epoch": 4.610951008645533, "grad_norm": 0.0653565376996994, "learning_rate": 9.489820825721017e-06, "loss": 0.4606, "num_input_tokens_seen": 44754592, "step": 36800 }, { "epoch": 4.611577496554316, "grad_norm": 0.08786294609308243, "learning_rate": 9.48958020729809e-06, "loss": 0.4583, "num_input_tokens_seen": 44760736, "step": 36805 }, { "epoch": 4.6122039844630995, "grad_norm": 0.07208812981843948, "learning_rate": 9.489339535198456e-06, "loss": 0.4656, "num_input_tokens_seen": 44766848, "step": 36810 }, { "epoch": 4.612830472371884, "grad_norm": 0.06968056410551071, "learning_rate": 9.489098809424994e-06, "loss": 0.4645, "num_input_tokens_seen": 44772576, "step": 36815 }, { "epoch": 4.613456960280667, "grad_norm": 0.08897766470909119, "learning_rate": 9.488858029980582e-06, "loss": 0.4617, "num_input_tokens_seen": 44778720, "step": 36820 }, { "epoch": 4.61408344818945, "grad_norm": 0.11736007034778595, "learning_rate": 9.488617196868098e-06, "loss": 0.4677, "num_input_tokens_seen": 44784704, "step": 36825 }, { "epoch": 4.614709936098233, "grad_norm": 0.08213254809379578, "learning_rate": 9.488376310090422e-06, "loss": 0.4668, "num_input_tokens_seen": 44791168, "step": 36830 }, { "epoch": 4.615336424007017, "grad_norm": 0.10011831670999527, "learning_rate": 9.488135369650434e-06, "loss": 0.4651, "num_input_tokens_seen": 44797376, "step": 36835 }, { "epoch": 4.6159629119158, "grad_norm": 0.07654248178005219, "learning_rate": 9.487894375551012e-06, "loss": 0.4634, "num_input_tokens_seen": 44803488, "step": 36840 }, { "epoch": 4.6165893998245835, "grad_norm": 0.08970165997743607, "learning_rate": 9.48765332779504e-06, "loss": 0.4599, "num_input_tokens_seen": 44809760, "step": 36845 }, { "epoch": 4.617215887733367, "grad_norm": 0.082011878490448, "learning_rate": 9.487412226385401e-06, "loss": 0.4604, "num_input_tokens_seen": 44816224, "step": 36850 }, { "epoch": 4.61784237564215, "grad_norm": 0.06914306432008743, "learning_rate": 9.487171071324974e-06, "loss": 0.4634, "num_input_tokens_seen": 44822400, "step": 36855 }, { "epoch": 4.618468863550934, "grad_norm": 0.070733442902565, "learning_rate": 9.486929862616646e-06, "loss": 0.4599, "num_input_tokens_seen": 44828672, "step": 36860 }, { "epoch": 4.619095351459717, "grad_norm": 0.10473965108394623, "learning_rate": 9.486688600263299e-06, "loss": 0.4601, "num_input_tokens_seen": 44834816, "step": 36865 }, { "epoch": 4.6197218393685, "grad_norm": 0.10963499546051025, "learning_rate": 9.486447284267816e-06, "loss": 0.4607, "num_input_tokens_seen": 44840896, "step": 36870 }, { "epoch": 4.620348327277283, "grad_norm": 0.07212530821561813, "learning_rate": 9.486205914633086e-06, "loss": 0.456, "num_input_tokens_seen": 44847328, "step": 36875 }, { "epoch": 4.620974815186067, "grad_norm": 0.03959818184375763, "learning_rate": 9.48596449136199e-06, "loss": 0.4604, "num_input_tokens_seen": 44853728, "step": 36880 }, { "epoch": 4.621601303094851, "grad_norm": 0.08907955139875412, "learning_rate": 9.48572301445742e-06, "loss": 0.4593, "num_input_tokens_seen": 44859872, "step": 36885 }, { "epoch": 4.622227791003634, "grad_norm": 0.07302947342395782, "learning_rate": 9.485481483922256e-06, "loss": 0.469, "num_input_tokens_seen": 44865952, "step": 36890 }, { "epoch": 4.622854278912417, "grad_norm": 0.11713803559541702, "learning_rate": 9.485239899759393e-06, "loss": 0.4667, "num_input_tokens_seen": 44872224, "step": 36895 }, { "epoch": 4.6234807668212, "grad_norm": 0.042097579687833786, "learning_rate": 9.484998261971715e-06, "loss": 0.4652, "num_input_tokens_seen": 44878176, "step": 36900 }, { "epoch": 4.624107254729983, "grad_norm": 0.08086767047643661, "learning_rate": 9.484756570562112e-06, "loss": 0.4641, "num_input_tokens_seen": 44884224, "step": 36905 }, { "epoch": 4.624733742638767, "grad_norm": 0.08400961011648178, "learning_rate": 9.484514825533473e-06, "loss": 0.4632, "num_input_tokens_seen": 44890528, "step": 36910 }, { "epoch": 4.6253602305475505, "grad_norm": 0.07385702431201935, "learning_rate": 9.484273026888691e-06, "loss": 0.4617, "num_input_tokens_seen": 44896288, "step": 36915 }, { "epoch": 4.625986718456334, "grad_norm": 0.045208889991045, "learning_rate": 9.484031174630652e-06, "loss": 0.4581, "num_input_tokens_seen": 44902592, "step": 36920 }, { "epoch": 4.626613206365117, "grad_norm": 0.08660072088241577, "learning_rate": 9.483789268762252e-06, "loss": 0.4705, "num_input_tokens_seen": 44908576, "step": 36925 }, { "epoch": 4.6272396942739, "grad_norm": 0.04032454639673233, "learning_rate": 9.483547309286381e-06, "loss": 0.4632, "num_input_tokens_seen": 44914656, "step": 36930 }, { "epoch": 4.627866182182684, "grad_norm": 0.06750497967004776, "learning_rate": 9.483305296205932e-06, "loss": 0.466, "num_input_tokens_seen": 44920800, "step": 36935 }, { "epoch": 4.628492670091467, "grad_norm": 0.11273480206727982, "learning_rate": 9.483063229523799e-06, "loss": 0.4665, "num_input_tokens_seen": 44926592, "step": 36940 }, { "epoch": 4.62911915800025, "grad_norm": 0.0770585909485817, "learning_rate": 9.482821109242877e-06, "loss": 0.4717, "num_input_tokens_seen": 44932544, "step": 36945 }, { "epoch": 4.629745645909034, "grad_norm": 0.08501186966896057, "learning_rate": 9.482578935366059e-06, "loss": 0.457, "num_input_tokens_seen": 44938176, "step": 36950 }, { "epoch": 4.630372133817818, "grad_norm": 0.0784914419054985, "learning_rate": 9.48233670789624e-06, "loss": 0.462, "num_input_tokens_seen": 44944192, "step": 36955 }, { "epoch": 4.630998621726601, "grad_norm": 0.03645557910203934, "learning_rate": 9.48209442683632e-06, "loss": 0.462, "num_input_tokens_seen": 44950208, "step": 36960 }, { "epoch": 4.631625109635384, "grad_norm": 0.11334754526615143, "learning_rate": 9.481852092189189e-06, "loss": 0.461, "num_input_tokens_seen": 44956480, "step": 36965 }, { "epoch": 4.632251597544167, "grad_norm": 0.07508838176727295, "learning_rate": 9.48160970395775e-06, "loss": 0.4656, "num_input_tokens_seen": 44962656, "step": 36970 }, { "epoch": 4.632878085452951, "grad_norm": 0.11241935193538666, "learning_rate": 9.4813672621449e-06, "loss": 0.4619, "num_input_tokens_seen": 44968576, "step": 36975 }, { "epoch": 4.633504573361734, "grad_norm": 0.08633572608232498, "learning_rate": 9.481124766753535e-06, "loss": 0.4618, "num_input_tokens_seen": 44974304, "step": 36980 }, { "epoch": 4.6341310612705175, "grad_norm": 0.06467307358980179, "learning_rate": 9.480882217786559e-06, "loss": 0.4602, "num_input_tokens_seen": 44980256, "step": 36985 }, { "epoch": 4.634757549179301, "grad_norm": 0.07264818251132965, "learning_rate": 9.480639615246864e-06, "loss": 0.4613, "num_input_tokens_seen": 44986432, "step": 36990 }, { "epoch": 4.635384037088084, "grad_norm": 0.08775452524423599, "learning_rate": 9.480396959137359e-06, "loss": 0.4639, "num_input_tokens_seen": 44992640, "step": 36995 }, { "epoch": 4.636010524996868, "grad_norm": 0.04901232197880745, "learning_rate": 9.48015424946094e-06, "loss": 0.4608, "num_input_tokens_seen": 44998592, "step": 37000 }, { "epoch": 4.636637012905651, "grad_norm": 0.0700637698173523, "learning_rate": 9.47991148622051e-06, "loss": 0.4618, "num_input_tokens_seen": 45004480, "step": 37005 }, { "epoch": 4.637263500814434, "grad_norm": 0.07001281529664993, "learning_rate": 9.47966866941897e-06, "loss": 0.4644, "num_input_tokens_seen": 45010464, "step": 37010 }, { "epoch": 4.6378899887232174, "grad_norm": 0.08227121829986572, "learning_rate": 9.479425799059228e-06, "loss": 0.4571, "num_input_tokens_seen": 45016352, "step": 37015 }, { "epoch": 4.638516476632001, "grad_norm": 0.07165083289146423, "learning_rate": 9.479182875144183e-06, "loss": 0.4628, "num_input_tokens_seen": 45022368, "step": 37020 }, { "epoch": 4.639142964540785, "grad_norm": 0.12022016942501068, "learning_rate": 9.478939897676742e-06, "loss": 0.4607, "num_input_tokens_seen": 45028128, "step": 37025 }, { "epoch": 4.639769452449568, "grad_norm": 0.13577240705490112, "learning_rate": 9.478696866659807e-06, "loss": 0.4623, "num_input_tokens_seen": 45034368, "step": 37030 }, { "epoch": 4.640395940358351, "grad_norm": 0.0808820053935051, "learning_rate": 9.478453782096287e-06, "loss": 0.4652, "num_input_tokens_seen": 45040576, "step": 37035 }, { "epoch": 4.641022428267134, "grad_norm": 0.07575643807649612, "learning_rate": 9.478210643989085e-06, "loss": 0.4597, "num_input_tokens_seen": 45046720, "step": 37040 }, { "epoch": 4.641648916175917, "grad_norm": 0.10374484956264496, "learning_rate": 9.477967452341113e-06, "loss": 0.4691, "num_input_tokens_seen": 45052928, "step": 37045 }, { "epoch": 4.642275404084701, "grad_norm": 0.06590019166469574, "learning_rate": 9.477724207155271e-06, "loss": 0.4615, "num_input_tokens_seen": 45059264, "step": 37050 }, { "epoch": 4.642901891993485, "grad_norm": 0.0713239312171936, "learning_rate": 9.477480908434475e-06, "loss": 0.4678, "num_input_tokens_seen": 45065888, "step": 37055 }, { "epoch": 4.643528379902268, "grad_norm": 0.08588018268346786, "learning_rate": 9.47723755618163e-06, "loss": 0.4663, "num_input_tokens_seen": 45071424, "step": 37060 }, { "epoch": 4.644154867811051, "grad_norm": 0.07809297740459442, "learning_rate": 9.476994150399644e-06, "loss": 0.4553, "num_input_tokens_seen": 45077536, "step": 37065 }, { "epoch": 4.644781355719835, "grad_norm": 0.09053100645542145, "learning_rate": 9.47675069109143e-06, "loss": 0.4646, "num_input_tokens_seen": 45083136, "step": 37070 }, { "epoch": 4.645407843628618, "grad_norm": 0.08292023837566376, "learning_rate": 9.476507178259895e-06, "loss": 0.4586, "num_input_tokens_seen": 45089312, "step": 37075 }, { "epoch": 4.646034331537401, "grad_norm": 0.09884490072727203, "learning_rate": 9.476263611907955e-06, "loss": 0.4604, "num_input_tokens_seen": 45094560, "step": 37080 }, { "epoch": 4.6466608194461845, "grad_norm": 0.10679929703474045, "learning_rate": 9.47601999203852e-06, "loss": 0.4647, "num_input_tokens_seen": 45100736, "step": 37085 }, { "epoch": 4.6472873073549685, "grad_norm": 0.07095217704772949, "learning_rate": 9.475776318654504e-06, "loss": 0.4573, "num_input_tokens_seen": 45106976, "step": 37090 }, { "epoch": 4.647913795263752, "grad_norm": 0.13501133024692535, "learning_rate": 9.475532591758818e-06, "loss": 0.4626, "num_input_tokens_seen": 45113568, "step": 37095 }, { "epoch": 4.648540283172535, "grad_norm": 0.0957709401845932, "learning_rate": 9.475288811354377e-06, "loss": 0.465, "num_input_tokens_seen": 45119936, "step": 37100 }, { "epoch": 4.649166771081318, "grad_norm": 0.1281467080116272, "learning_rate": 9.475044977444095e-06, "loss": 0.4537, "num_input_tokens_seen": 45126048, "step": 37105 }, { "epoch": 4.649793258990101, "grad_norm": 0.10894567519426346, "learning_rate": 9.47480109003089e-06, "loss": 0.457, "num_input_tokens_seen": 45132288, "step": 37110 }, { "epoch": 4.650419746898885, "grad_norm": 0.12794195115566254, "learning_rate": 9.474557149117673e-06, "loss": 0.4681, "num_input_tokens_seen": 45138720, "step": 37115 }, { "epoch": 4.651046234807668, "grad_norm": 0.08502582460641861, "learning_rate": 9.474313154707367e-06, "loss": 0.4576, "num_input_tokens_seen": 45144832, "step": 37120 }, { "epoch": 4.651672722716452, "grad_norm": 0.11539983004331589, "learning_rate": 9.47406910680288e-06, "loss": 0.4746, "num_input_tokens_seen": 45150912, "step": 37125 }, { "epoch": 4.652299210625235, "grad_norm": 0.07521658390760422, "learning_rate": 9.47382500540714e-06, "loss": 0.4713, "num_input_tokens_seen": 45157056, "step": 37130 }, { "epoch": 4.652925698534018, "grad_norm": 0.10424647480249405, "learning_rate": 9.473580850523059e-06, "loss": 0.4535, "num_input_tokens_seen": 45163360, "step": 37135 }, { "epoch": 4.653552186442802, "grad_norm": 0.10288194566965103, "learning_rate": 9.473336642153558e-06, "loss": 0.4515, "num_input_tokens_seen": 45169408, "step": 37140 }, { "epoch": 4.654178674351585, "grad_norm": 0.0505196712911129, "learning_rate": 9.473092380301556e-06, "loss": 0.458, "num_input_tokens_seen": 45175168, "step": 37145 }, { "epoch": 4.654805162260368, "grad_norm": 0.1458057165145874, "learning_rate": 9.472848064969975e-06, "loss": 0.4586, "num_input_tokens_seen": 45181280, "step": 37150 }, { "epoch": 4.6554316501691515, "grad_norm": 0.17187698185443878, "learning_rate": 9.472603696161734e-06, "loss": 0.4634, "num_input_tokens_seen": 45187424, "step": 37155 }, { "epoch": 4.656058138077935, "grad_norm": 0.12035930901765823, "learning_rate": 9.472359273879754e-06, "loss": 0.466, "num_input_tokens_seen": 45193280, "step": 37160 }, { "epoch": 4.656684625986719, "grad_norm": 0.09133835136890411, "learning_rate": 9.472114798126961e-06, "loss": 0.4643, "num_input_tokens_seen": 45199040, "step": 37165 }, { "epoch": 4.657311113895502, "grad_norm": 0.10503856837749481, "learning_rate": 9.471870268906274e-06, "loss": 0.4654, "num_input_tokens_seen": 45205280, "step": 37170 }, { "epoch": 4.657937601804285, "grad_norm": 0.07634387165307999, "learning_rate": 9.47162568622062e-06, "loss": 0.4604, "num_input_tokens_seen": 45211392, "step": 37175 }, { "epoch": 4.658564089713068, "grad_norm": 0.07548205554485321, "learning_rate": 9.47138105007292e-06, "loss": 0.4618, "num_input_tokens_seen": 45217216, "step": 37180 }, { "epoch": 4.659190577621851, "grad_norm": 0.12747058272361755, "learning_rate": 9.4711363604661e-06, "loss": 0.4674, "num_input_tokens_seen": 45223360, "step": 37185 }, { "epoch": 4.6598170655306355, "grad_norm": 0.09762846678495407, "learning_rate": 9.470891617403086e-06, "loss": 0.457, "num_input_tokens_seen": 45229632, "step": 37190 }, { "epoch": 4.660443553439419, "grad_norm": 0.09047678858041763, "learning_rate": 9.470646820886803e-06, "loss": 0.458, "num_input_tokens_seen": 45235904, "step": 37195 }, { "epoch": 4.661070041348202, "grad_norm": 0.10345634818077087, "learning_rate": 9.47040197092018e-06, "loss": 0.461, "num_input_tokens_seen": 45241952, "step": 37200 }, { "epoch": 4.661696529256985, "grad_norm": 0.14396104216575623, "learning_rate": 9.470157067506143e-06, "loss": 0.4628, "num_input_tokens_seen": 45247968, "step": 37205 }, { "epoch": 4.662323017165769, "grad_norm": 0.08403430134057999, "learning_rate": 9.46991211064762e-06, "loss": 0.4663, "num_input_tokens_seen": 45254048, "step": 37210 }, { "epoch": 4.662949505074552, "grad_norm": 0.128220796585083, "learning_rate": 9.469667100347539e-06, "loss": 0.4637, "num_input_tokens_seen": 45260384, "step": 37215 }, { "epoch": 4.663575992983335, "grad_norm": 0.08623921126127243, "learning_rate": 9.46942203660883e-06, "loss": 0.4562, "num_input_tokens_seen": 45266368, "step": 37220 }, { "epoch": 4.6642024808921185, "grad_norm": 0.07832358032464981, "learning_rate": 9.469176919434423e-06, "loss": 0.4526, "num_input_tokens_seen": 45272480, "step": 37225 }, { "epoch": 4.664828968800903, "grad_norm": 0.07744524627923965, "learning_rate": 9.468931748827248e-06, "loss": 0.4548, "num_input_tokens_seen": 45278432, "step": 37230 }, { "epoch": 4.665455456709686, "grad_norm": 0.08319658041000366, "learning_rate": 9.468686524790236e-06, "loss": 0.4636, "num_input_tokens_seen": 45284224, "step": 37235 }, { "epoch": 4.666081944618469, "grad_norm": 0.09110274165868759, "learning_rate": 9.46844124732632e-06, "loss": 0.4635, "num_input_tokens_seen": 45290176, "step": 37240 }, { "epoch": 4.666708432527252, "grad_norm": 0.08007460832595825, "learning_rate": 9.468195916438434e-06, "loss": 0.4612, "num_input_tokens_seen": 45296096, "step": 37245 }, { "epoch": 4.667334920436035, "grad_norm": 0.08816280961036682, "learning_rate": 9.46795053212951e-06, "loss": 0.4657, "num_input_tokens_seen": 45302240, "step": 37250 }, { "epoch": 4.667961408344819, "grad_norm": 0.10391970723867416, "learning_rate": 9.467705094402475e-06, "loss": 0.462, "num_input_tokens_seen": 45308512, "step": 37255 }, { "epoch": 4.6685878962536025, "grad_norm": 0.08698762208223343, "learning_rate": 9.467459603260273e-06, "loss": 0.4721, "num_input_tokens_seen": 45314560, "step": 37260 }, { "epoch": 4.669214384162386, "grad_norm": 0.052244771271944046, "learning_rate": 9.467214058705837e-06, "loss": 0.4591, "num_input_tokens_seen": 45320864, "step": 37265 }, { "epoch": 4.669840872071169, "grad_norm": 0.13075503706932068, "learning_rate": 9.466968460742098e-06, "loss": 0.4609, "num_input_tokens_seen": 45326944, "step": 37270 }, { "epoch": 4.670467359979952, "grad_norm": 0.04782446101307869, "learning_rate": 9.466722809371998e-06, "loss": 0.4657, "num_input_tokens_seen": 45333216, "step": 37275 }, { "epoch": 4.671093847888736, "grad_norm": 0.07302553951740265, "learning_rate": 9.466477104598471e-06, "loss": 0.4654, "num_input_tokens_seen": 45339104, "step": 37280 }, { "epoch": 4.671720335797519, "grad_norm": 0.12010959535837173, "learning_rate": 9.466231346424455e-06, "loss": 0.459, "num_input_tokens_seen": 45345312, "step": 37285 }, { "epoch": 4.672346823706302, "grad_norm": 0.08673495799303055, "learning_rate": 9.465985534852886e-06, "loss": 0.459, "num_input_tokens_seen": 45351552, "step": 37290 }, { "epoch": 4.672973311615086, "grad_norm": 0.07726853340864182, "learning_rate": 9.465739669886706e-06, "loss": 0.4664, "num_input_tokens_seen": 45357600, "step": 37295 }, { "epoch": 4.673599799523869, "grad_norm": 0.09665881842374802, "learning_rate": 9.465493751528856e-06, "loss": 0.4648, "num_input_tokens_seen": 45363776, "step": 37300 }, { "epoch": 4.674226287432653, "grad_norm": 0.10301312059164047, "learning_rate": 9.465247779782272e-06, "loss": 0.4629, "num_input_tokens_seen": 45370016, "step": 37305 }, { "epoch": 4.674852775341436, "grad_norm": 0.08872976899147034, "learning_rate": 9.465001754649896e-06, "loss": 0.4666, "num_input_tokens_seen": 45376128, "step": 37310 }, { "epoch": 4.675479263250219, "grad_norm": 0.11493576318025589, "learning_rate": 9.464755676134672e-06, "loss": 0.465, "num_input_tokens_seen": 45382400, "step": 37315 }, { "epoch": 4.676105751159002, "grad_norm": 0.12163509428501129, "learning_rate": 9.464509544239538e-06, "loss": 0.46, "num_input_tokens_seen": 45388512, "step": 37320 }, { "epoch": 4.676732239067786, "grad_norm": 0.07613608241081238, "learning_rate": 9.46426335896744e-06, "loss": 0.4605, "num_input_tokens_seen": 45395008, "step": 37325 }, { "epoch": 4.6773587269765695, "grad_norm": 0.07679448276758194, "learning_rate": 9.464017120321321e-06, "loss": 0.459, "num_input_tokens_seen": 45401376, "step": 37330 }, { "epoch": 4.677985214885353, "grad_norm": 0.1188717931509018, "learning_rate": 9.463770828304122e-06, "loss": 0.4608, "num_input_tokens_seen": 45407712, "step": 37335 }, { "epoch": 4.678611702794136, "grad_norm": 0.05149342492222786, "learning_rate": 9.463524482918792e-06, "loss": 0.4624, "num_input_tokens_seen": 45413824, "step": 37340 }, { "epoch": 4.67923819070292, "grad_norm": 0.08138341456651688, "learning_rate": 9.463278084168273e-06, "loss": 0.4618, "num_input_tokens_seen": 45419936, "step": 37345 }, { "epoch": 4.679864678611703, "grad_norm": 0.11442841589450836, "learning_rate": 9.463031632055513e-06, "loss": 0.4688, "num_input_tokens_seen": 45426048, "step": 37350 }, { "epoch": 4.680491166520486, "grad_norm": 0.10460924357175827, "learning_rate": 9.462785126583456e-06, "loss": 0.4697, "num_input_tokens_seen": 45431904, "step": 37355 }, { "epoch": 4.681117654429269, "grad_norm": 0.07682802528142929, "learning_rate": 9.462538567755051e-06, "loss": 0.4595, "num_input_tokens_seen": 45438016, "step": 37360 }, { "epoch": 4.681744142338053, "grad_norm": 0.08877701312303543, "learning_rate": 9.462291955573246e-06, "loss": 0.4626, "num_input_tokens_seen": 45444192, "step": 37365 }, { "epoch": 4.682370630246837, "grad_norm": 0.08512405306100845, "learning_rate": 9.462045290040988e-06, "loss": 0.4618, "num_input_tokens_seen": 45450240, "step": 37370 }, { "epoch": 4.68299711815562, "grad_norm": 0.07994008809328079, "learning_rate": 9.461798571161229e-06, "loss": 0.4676, "num_input_tokens_seen": 45456352, "step": 37375 }, { "epoch": 4.683623606064403, "grad_norm": 0.11090182512998581, "learning_rate": 9.461551798936916e-06, "loss": 0.463, "num_input_tokens_seen": 45462528, "step": 37380 }, { "epoch": 4.684250093973186, "grad_norm": 0.12142636626958847, "learning_rate": 9.461304973371e-06, "loss": 0.4594, "num_input_tokens_seen": 45468672, "step": 37385 }, { "epoch": 4.684876581881969, "grad_norm": 0.0730186179280281, "learning_rate": 9.461058094466432e-06, "loss": 0.4635, "num_input_tokens_seen": 45474848, "step": 37390 }, { "epoch": 4.685503069790753, "grad_norm": 0.08134705573320389, "learning_rate": 9.460811162226165e-06, "loss": 0.4619, "num_input_tokens_seen": 45481280, "step": 37395 }, { "epoch": 4.6861295576995365, "grad_norm": 0.05453117936849594, "learning_rate": 9.460564176653148e-06, "loss": 0.4674, "num_input_tokens_seen": 45487520, "step": 37400 }, { "epoch": 4.68675604560832, "grad_norm": 0.07373417913913727, "learning_rate": 9.460317137750337e-06, "loss": 0.463, "num_input_tokens_seen": 45493536, "step": 37405 }, { "epoch": 4.687382533517103, "grad_norm": 0.10702227056026459, "learning_rate": 9.460070045520685e-06, "loss": 0.4618, "num_input_tokens_seen": 45499680, "step": 37410 }, { "epoch": 4.688009021425886, "grad_norm": 0.10614204406738281, "learning_rate": 9.459822899967145e-06, "loss": 0.4664, "num_input_tokens_seen": 45505856, "step": 37415 }, { "epoch": 4.68863550933467, "grad_norm": 0.09793411195278168, "learning_rate": 9.459575701092674e-06, "loss": 0.4662, "num_input_tokens_seen": 45511840, "step": 37420 }, { "epoch": 4.689261997243453, "grad_norm": 0.07467786967754364, "learning_rate": 9.459328448900224e-06, "loss": 0.4603, "num_input_tokens_seen": 45517728, "step": 37425 }, { "epoch": 4.689888485152236, "grad_norm": 0.10248104482889175, "learning_rate": 9.459081143392754e-06, "loss": 0.4643, "num_input_tokens_seen": 45523680, "step": 37430 }, { "epoch": 4.69051497306102, "grad_norm": 0.07048189640045166, "learning_rate": 9.45883378457322e-06, "loss": 0.4566, "num_input_tokens_seen": 45530112, "step": 37435 }, { "epoch": 4.691141460969804, "grad_norm": 0.14119145274162292, "learning_rate": 9.458586372444581e-06, "loss": 0.4593, "num_input_tokens_seen": 45536448, "step": 37440 }, { "epoch": 4.691767948878587, "grad_norm": 0.09312345832586288, "learning_rate": 9.458338907009791e-06, "loss": 0.4684, "num_input_tokens_seen": 45542752, "step": 37445 }, { "epoch": 4.69239443678737, "grad_norm": 0.04529643431305885, "learning_rate": 9.458091388271813e-06, "loss": 0.4602, "num_input_tokens_seen": 45548800, "step": 37450 }, { "epoch": 4.693020924696153, "grad_norm": 0.08004648983478546, "learning_rate": 9.457843816233603e-06, "loss": 0.4635, "num_input_tokens_seen": 45554912, "step": 37455 }, { "epoch": 4.693647412604936, "grad_norm": 0.10761838406324387, "learning_rate": 9.457596190898122e-06, "loss": 0.4652, "num_input_tokens_seen": 45561280, "step": 37460 }, { "epoch": 4.69427390051372, "grad_norm": 0.03934374079108238, "learning_rate": 9.457348512268331e-06, "loss": 0.465, "num_input_tokens_seen": 45567552, "step": 37465 }, { "epoch": 4.694900388422504, "grad_norm": 0.0731043890118599, "learning_rate": 9.457100780347193e-06, "loss": 0.4609, "num_input_tokens_seen": 45573696, "step": 37470 }, { "epoch": 4.695526876331287, "grad_norm": 0.07439403235912323, "learning_rate": 9.456852995137666e-06, "loss": 0.4633, "num_input_tokens_seen": 45579968, "step": 37475 }, { "epoch": 4.69615336424007, "grad_norm": 0.10864076763391495, "learning_rate": 9.456605156642715e-06, "loss": 0.4691, "num_input_tokens_seen": 45586304, "step": 37480 }, { "epoch": 4.696779852148854, "grad_norm": 0.07573612779378891, "learning_rate": 9.4563572648653e-06, "loss": 0.4658, "num_input_tokens_seen": 45591840, "step": 37485 }, { "epoch": 4.697406340057637, "grad_norm": 0.0757564827799797, "learning_rate": 9.456109319808391e-06, "loss": 0.4623, "num_input_tokens_seen": 45597312, "step": 37490 }, { "epoch": 4.69803282796642, "grad_norm": 0.040892478078603745, "learning_rate": 9.455861321474947e-06, "loss": 0.4639, "num_input_tokens_seen": 45603456, "step": 37495 }, { "epoch": 4.6986593158752035, "grad_norm": 0.09193449467420578, "learning_rate": 9.455613269867933e-06, "loss": 0.4587, "num_input_tokens_seen": 45609152, "step": 37500 }, { "epoch": 4.699285803783987, "grad_norm": 0.14464086294174194, "learning_rate": 9.455365164990317e-06, "loss": 0.4587, "num_input_tokens_seen": 45615808, "step": 37505 }, { "epoch": 4.699912291692771, "grad_norm": 0.09765412658452988, "learning_rate": 9.455117006845067e-06, "loss": 0.474, "num_input_tokens_seen": 45621856, "step": 37510 }, { "epoch": 4.700538779601554, "grad_norm": 0.14458055794239044, "learning_rate": 9.454868795435145e-06, "loss": 0.4574, "num_input_tokens_seen": 45628000, "step": 37515 }, { "epoch": 4.701165267510337, "grad_norm": 0.09393901377916336, "learning_rate": 9.454620530763522e-06, "loss": 0.4568, "num_input_tokens_seen": 45633952, "step": 37520 }, { "epoch": 4.70179175541912, "grad_norm": 0.14236615598201752, "learning_rate": 9.454372212833165e-06, "loss": 0.4551, "num_input_tokens_seen": 45640352, "step": 37525 }, { "epoch": 4.702418243327903, "grad_norm": 0.1230228915810585, "learning_rate": 9.454123841647042e-06, "loss": 0.4698, "num_input_tokens_seen": 45645888, "step": 37530 }, { "epoch": 4.703044731236687, "grad_norm": 0.08864403516054153, "learning_rate": 9.453875417208126e-06, "loss": 0.4589, "num_input_tokens_seen": 45651712, "step": 37535 }, { "epoch": 4.703671219145471, "grad_norm": 0.08606527000665665, "learning_rate": 9.453626939519385e-06, "loss": 0.4688, "num_input_tokens_seen": 45656832, "step": 37540 }, { "epoch": 4.704297707054254, "grad_norm": 0.07264281809329987, "learning_rate": 9.453378408583786e-06, "loss": 0.4651, "num_input_tokens_seen": 45663040, "step": 37545 }, { "epoch": 4.704924194963037, "grad_norm": 0.0873480960726738, "learning_rate": 9.453129824404308e-06, "loss": 0.4648, "num_input_tokens_seen": 45668736, "step": 37550 }, { "epoch": 4.70555068287182, "grad_norm": 0.05196072906255722, "learning_rate": 9.452881186983916e-06, "loss": 0.4602, "num_input_tokens_seen": 45674944, "step": 37555 }, { "epoch": 4.706177170780604, "grad_norm": 0.05336625874042511, "learning_rate": 9.452632496325589e-06, "loss": 0.4643, "num_input_tokens_seen": 45681344, "step": 37560 }, { "epoch": 4.706803658689387, "grad_norm": 0.10473895072937012, "learning_rate": 9.452383752432294e-06, "loss": 0.4633, "num_input_tokens_seen": 45687168, "step": 37565 }, { "epoch": 4.7074301465981705, "grad_norm": 0.12509870529174805, "learning_rate": 9.45213495530701e-06, "loss": 0.4659, "num_input_tokens_seen": 45692320, "step": 37570 }, { "epoch": 4.708056634506954, "grad_norm": 0.07125933468341827, "learning_rate": 9.451886104952705e-06, "loss": 0.4633, "num_input_tokens_seen": 45698528, "step": 37575 }, { "epoch": 4.708683122415738, "grad_norm": 0.08674579858779907, "learning_rate": 9.451637201372363e-06, "loss": 0.4587, "num_input_tokens_seen": 45704704, "step": 37580 }, { "epoch": 4.709309610324521, "grad_norm": 0.10958686470985413, "learning_rate": 9.451388244568954e-06, "loss": 0.463, "num_input_tokens_seen": 45711008, "step": 37585 }, { "epoch": 4.709936098233304, "grad_norm": 0.044604942202568054, "learning_rate": 9.451139234545456e-06, "loss": 0.4582, "num_input_tokens_seen": 45716928, "step": 37590 }, { "epoch": 4.710562586142087, "grad_norm": 0.047137584537267685, "learning_rate": 9.450890171304847e-06, "loss": 0.4516, "num_input_tokens_seen": 45722944, "step": 37595 }, { "epoch": 4.711189074050871, "grad_norm": 0.07538106292486191, "learning_rate": 9.450641054850102e-06, "loss": 0.4587, "num_input_tokens_seen": 45729248, "step": 37600 }, { "epoch": 4.7118155619596545, "grad_norm": 0.11598695069551468, "learning_rate": 9.450391885184205e-06, "loss": 0.4693, "num_input_tokens_seen": 45735392, "step": 37605 }, { "epoch": 4.712442049868438, "grad_norm": 0.10772673785686493, "learning_rate": 9.450142662310128e-06, "loss": 0.4594, "num_input_tokens_seen": 45741568, "step": 37610 }, { "epoch": 4.713068537777221, "grad_norm": 0.07540911436080933, "learning_rate": 9.449893386230856e-06, "loss": 0.4599, "num_input_tokens_seen": 45747872, "step": 37615 }, { "epoch": 4.713695025686004, "grad_norm": 0.06648826599121094, "learning_rate": 9.449644056949365e-06, "loss": 0.4528, "num_input_tokens_seen": 45754176, "step": 37620 }, { "epoch": 4.714321513594788, "grad_norm": 0.12203417718410492, "learning_rate": 9.44939467446864e-06, "loss": 0.4658, "num_input_tokens_seen": 45760288, "step": 37625 }, { "epoch": 4.714948001503571, "grad_norm": 0.0895799845457077, "learning_rate": 9.44914523879166e-06, "loss": 0.4632, "num_input_tokens_seen": 45766400, "step": 37630 }, { "epoch": 4.715574489412354, "grad_norm": 0.08155898749828339, "learning_rate": 9.44889574992141e-06, "loss": 0.4679, "num_input_tokens_seen": 45772800, "step": 37635 }, { "epoch": 4.7162009773211375, "grad_norm": 0.06801038980484009, "learning_rate": 9.448646207860868e-06, "loss": 0.4668, "num_input_tokens_seen": 45778848, "step": 37640 }, { "epoch": 4.716827465229921, "grad_norm": 0.04473031684756279, "learning_rate": 9.448396612613021e-06, "loss": 0.4593, "num_input_tokens_seen": 45784736, "step": 37645 }, { "epoch": 4.717453953138705, "grad_norm": 0.07221389561891556, "learning_rate": 9.448146964180852e-06, "loss": 0.4593, "num_input_tokens_seen": 45790624, "step": 37650 }, { "epoch": 4.718080441047488, "grad_norm": 0.0877918004989624, "learning_rate": 9.447897262567349e-06, "loss": 0.4598, "num_input_tokens_seen": 45796736, "step": 37655 }, { "epoch": 4.718706928956271, "grad_norm": 0.047885552048683167, "learning_rate": 9.447647507775492e-06, "loss": 0.463, "num_input_tokens_seen": 45802880, "step": 37660 }, { "epoch": 4.719333416865054, "grad_norm": 0.05085672810673714, "learning_rate": 9.447397699808272e-06, "loss": 0.4628, "num_input_tokens_seen": 45809024, "step": 37665 }, { "epoch": 4.719959904773837, "grad_norm": 0.08206890523433685, "learning_rate": 9.447147838668671e-06, "loss": 0.4562, "num_input_tokens_seen": 45815264, "step": 37670 }, { "epoch": 4.7205863926826215, "grad_norm": 0.08382906019687653, "learning_rate": 9.44689792435968e-06, "loss": 0.4626, "num_input_tokens_seen": 45821504, "step": 37675 }, { "epoch": 4.721212880591405, "grad_norm": 0.0759488120675087, "learning_rate": 9.446647956884285e-06, "loss": 0.463, "num_input_tokens_seen": 45827872, "step": 37680 }, { "epoch": 4.721839368500188, "grad_norm": 0.11812429875135422, "learning_rate": 9.446397936245476e-06, "loss": 0.4588, "num_input_tokens_seen": 45833536, "step": 37685 }, { "epoch": 4.722465856408971, "grad_norm": 0.12356822192668915, "learning_rate": 9.44614786244624e-06, "loss": 0.4567, "num_input_tokens_seen": 45839872, "step": 37690 }, { "epoch": 4.723092344317755, "grad_norm": 0.0765601098537445, "learning_rate": 9.44589773548957e-06, "loss": 0.4593, "num_input_tokens_seen": 45845632, "step": 37695 }, { "epoch": 4.723718832226538, "grad_norm": 0.07776223123073578, "learning_rate": 9.445647555378453e-06, "loss": 0.4572, "num_input_tokens_seen": 45851808, "step": 37700 }, { "epoch": 4.724345320135321, "grad_norm": 0.13076084852218628, "learning_rate": 9.445397322115882e-06, "loss": 0.4573, "num_input_tokens_seen": 45857824, "step": 37705 }, { "epoch": 4.7249718080441045, "grad_norm": 0.08267766237258911, "learning_rate": 9.445147035704848e-06, "loss": 0.4581, "num_input_tokens_seen": 45863968, "step": 37710 }, { "epoch": 4.725598295952889, "grad_norm": 0.177016943693161, "learning_rate": 9.444896696148347e-06, "loss": 0.4529, "num_input_tokens_seen": 45870464, "step": 37715 }, { "epoch": 4.726224783861672, "grad_norm": 0.097599558532238, "learning_rate": 9.444646303449365e-06, "loss": 0.4619, "num_input_tokens_seen": 45876608, "step": 37720 }, { "epoch": 4.726851271770455, "grad_norm": 0.13500268757343292, "learning_rate": 9.444395857610902e-06, "loss": 0.47, "num_input_tokens_seen": 45882880, "step": 37725 }, { "epoch": 4.727477759679238, "grad_norm": 0.08892882615327835, "learning_rate": 9.44414535863595e-06, "loss": 0.463, "num_input_tokens_seen": 45889056, "step": 37730 }, { "epoch": 4.728104247588021, "grad_norm": 0.09042850136756897, "learning_rate": 9.443894806527503e-06, "loss": 0.4642, "num_input_tokens_seen": 45895200, "step": 37735 }, { "epoch": 4.728730735496805, "grad_norm": 0.04904584214091301, "learning_rate": 9.443644201288558e-06, "loss": 0.4599, "num_input_tokens_seen": 45901504, "step": 37740 }, { "epoch": 4.7293572234055885, "grad_norm": 0.11704444140195847, "learning_rate": 9.443393542922108e-06, "loss": 0.4762, "num_input_tokens_seen": 45907264, "step": 37745 }, { "epoch": 4.729983711314372, "grad_norm": 0.07091523706912994, "learning_rate": 9.443142831431155e-06, "loss": 0.4628, "num_input_tokens_seen": 45912768, "step": 37750 }, { "epoch": 4.730610199223155, "grad_norm": 0.12408274412155151, "learning_rate": 9.442892066818693e-06, "loss": 0.4679, "num_input_tokens_seen": 45919200, "step": 37755 }, { "epoch": 4.731236687131938, "grad_norm": 0.11677782237529755, "learning_rate": 9.442641249087721e-06, "loss": 0.4675, "num_input_tokens_seen": 45925216, "step": 37760 }, { "epoch": 4.731863175040722, "grad_norm": 0.08922113478183746, "learning_rate": 9.442390378241237e-06, "loss": 0.4571, "num_input_tokens_seen": 45930976, "step": 37765 }, { "epoch": 4.732489662949505, "grad_norm": 0.1112523004412651, "learning_rate": 9.442139454282242e-06, "loss": 0.4612, "num_input_tokens_seen": 45937376, "step": 37770 }, { "epoch": 4.733116150858288, "grad_norm": 0.06755641847848892, "learning_rate": 9.441888477213737e-06, "loss": 0.4701, "num_input_tokens_seen": 45942848, "step": 37775 }, { "epoch": 4.733742638767072, "grad_norm": 0.10245067626237869, "learning_rate": 9.441637447038718e-06, "loss": 0.4596, "num_input_tokens_seen": 45948448, "step": 37780 }, { "epoch": 4.734369126675855, "grad_norm": 0.10323340445756912, "learning_rate": 9.44138636376019e-06, "loss": 0.4648, "num_input_tokens_seen": 45954656, "step": 37785 }, { "epoch": 4.734995614584639, "grad_norm": 0.11349902302026749, "learning_rate": 9.441135227381152e-06, "loss": 0.4653, "num_input_tokens_seen": 45960640, "step": 37790 }, { "epoch": 4.735622102493422, "grad_norm": 0.08691129833459854, "learning_rate": 9.440884037904612e-06, "loss": 0.4654, "num_input_tokens_seen": 45966464, "step": 37795 }, { "epoch": 4.736248590402205, "grad_norm": 0.09800390154123306, "learning_rate": 9.440632795333566e-06, "loss": 0.4636, "num_input_tokens_seen": 45972672, "step": 37800 }, { "epoch": 4.736875078310988, "grad_norm": 0.07020165026187897, "learning_rate": 9.440381499671024e-06, "loss": 0.4617, "num_input_tokens_seen": 45978720, "step": 37805 }, { "epoch": 4.7375015662197715, "grad_norm": 0.07436805963516235, "learning_rate": 9.440130150919987e-06, "loss": 0.4621, "num_input_tokens_seen": 45984096, "step": 37810 }, { "epoch": 4.7381280541285555, "grad_norm": 0.07160792499780655, "learning_rate": 9.43987874908346e-06, "loss": 0.4625, "num_input_tokens_seen": 45990400, "step": 37815 }, { "epoch": 4.738754542037339, "grad_norm": 0.06951496750116348, "learning_rate": 9.439627294164452e-06, "loss": 0.4698, "num_input_tokens_seen": 45996640, "step": 37820 }, { "epoch": 4.739381029946122, "grad_norm": 0.04009561240673065, "learning_rate": 9.439375786165966e-06, "loss": 0.47, "num_input_tokens_seen": 46002432, "step": 37825 }, { "epoch": 4.740007517854905, "grad_norm": 0.10751210153102875, "learning_rate": 9.43912422509101e-06, "loss": 0.4684, "num_input_tokens_seen": 46008416, "step": 37830 }, { "epoch": 4.740634005763689, "grad_norm": 0.11082803457975388, "learning_rate": 9.438872610942592e-06, "loss": 0.4692, "num_input_tokens_seen": 46014784, "step": 37835 }, { "epoch": 4.741260493672472, "grad_norm": 0.07667262852191925, "learning_rate": 9.43862094372372e-06, "loss": 0.4572, "num_input_tokens_seen": 46020928, "step": 37840 }, { "epoch": 4.741886981581255, "grad_norm": 0.04046894609928131, "learning_rate": 9.438369223437401e-06, "loss": 0.4611, "num_input_tokens_seen": 46027136, "step": 37845 }, { "epoch": 4.742513469490039, "grad_norm": 0.06906058639287949, "learning_rate": 9.438117450086647e-06, "loss": 0.4613, "num_input_tokens_seen": 46033440, "step": 37850 }, { "epoch": 4.743139957398823, "grad_norm": 0.08315135538578033, "learning_rate": 9.43786562367447e-06, "loss": 0.463, "num_input_tokens_seen": 46039456, "step": 37855 }, { "epoch": 4.743766445307606, "grad_norm": 0.07343561202287674, "learning_rate": 9.437613744203876e-06, "loss": 0.459, "num_input_tokens_seen": 46045888, "step": 37860 }, { "epoch": 4.744392933216389, "grad_norm": 0.07435151934623718, "learning_rate": 9.437361811677879e-06, "loss": 0.4681, "num_input_tokens_seen": 46051936, "step": 37865 }, { "epoch": 4.745019421125172, "grad_norm": 0.0744514986872673, "learning_rate": 9.437109826099491e-06, "loss": 0.4638, "num_input_tokens_seen": 46057984, "step": 37870 }, { "epoch": 4.745645909033955, "grad_norm": 0.1125224381685257, "learning_rate": 9.436857787471725e-06, "loss": 0.468, "num_input_tokens_seen": 46064288, "step": 37875 }, { "epoch": 4.746272396942739, "grad_norm": 0.10943856090307236, "learning_rate": 9.436605695797594e-06, "loss": 0.4597, "num_input_tokens_seen": 46070400, "step": 37880 }, { "epoch": 4.746898884851523, "grad_norm": 0.14026762545108795, "learning_rate": 9.436353551080111e-06, "loss": 0.4613, "num_input_tokens_seen": 46076672, "step": 37885 }, { "epoch": 4.747525372760306, "grad_norm": 0.10333272814750671, "learning_rate": 9.436101353322291e-06, "loss": 0.4672, "num_input_tokens_seen": 46083040, "step": 37890 }, { "epoch": 4.748151860669089, "grad_norm": 0.07330815494060516, "learning_rate": 9.435849102527151e-06, "loss": 0.4661, "num_input_tokens_seen": 46088608, "step": 37895 }, { "epoch": 4.748778348577872, "grad_norm": 0.0845404639840126, "learning_rate": 9.435596798697704e-06, "loss": 0.4624, "num_input_tokens_seen": 46094816, "step": 37900 }, { "epoch": 4.749404836486656, "grad_norm": 0.07520420849323273, "learning_rate": 9.43534444183697e-06, "loss": 0.4637, "num_input_tokens_seen": 46100704, "step": 37905 }, { "epoch": 4.750031324395439, "grad_norm": 0.08529151231050491, "learning_rate": 9.435092031947962e-06, "loss": 0.4653, "num_input_tokens_seen": 46106496, "step": 37910 }, { "epoch": 4.7506578123042225, "grad_norm": 0.06977786123752594, "learning_rate": 9.434839569033701e-06, "loss": 0.4657, "num_input_tokens_seen": 46112512, "step": 37915 }, { "epoch": 4.751284300213006, "grad_norm": 0.08032047748565674, "learning_rate": 9.434587053097204e-06, "loss": 0.4542, "num_input_tokens_seen": 46117984, "step": 37920 }, { "epoch": 4.751910788121789, "grad_norm": 0.10760517418384552, "learning_rate": 9.434334484141492e-06, "loss": 0.4621, "num_input_tokens_seen": 46124064, "step": 37925 }, { "epoch": 4.752537276030573, "grad_norm": 0.10628875344991684, "learning_rate": 9.434081862169582e-06, "loss": 0.4691, "num_input_tokens_seen": 46129824, "step": 37930 }, { "epoch": 4.753163763939356, "grad_norm": 0.06470029056072235, "learning_rate": 9.433829187184495e-06, "loss": 0.4661, "num_input_tokens_seen": 46135680, "step": 37935 }, { "epoch": 4.753790251848139, "grad_norm": 0.1306789368391037, "learning_rate": 9.433576459189251e-06, "loss": 0.465, "num_input_tokens_seen": 46142080, "step": 37940 }, { "epoch": 4.754416739756922, "grad_norm": 0.09313260763883591, "learning_rate": 9.433323678186875e-06, "loss": 0.4696, "num_input_tokens_seen": 46148256, "step": 37945 }, { "epoch": 4.755043227665706, "grad_norm": 0.07711433619260788, "learning_rate": 9.433070844180387e-06, "loss": 0.4601, "num_input_tokens_seen": 46154784, "step": 37950 }, { "epoch": 4.75566971557449, "grad_norm": 0.07141841948032379, "learning_rate": 9.432817957172808e-06, "loss": 0.4642, "num_input_tokens_seen": 46161056, "step": 37955 }, { "epoch": 4.756296203483273, "grad_norm": 0.07100511342287064, "learning_rate": 9.432565017167165e-06, "loss": 0.4601, "num_input_tokens_seen": 46167424, "step": 37960 }, { "epoch": 4.756922691392056, "grad_norm": 0.1131167784333229, "learning_rate": 9.43231202416648e-06, "loss": 0.4678, "num_input_tokens_seen": 46173472, "step": 37965 }, { "epoch": 4.75754917930084, "grad_norm": 0.09258679300546646, "learning_rate": 9.432058978173779e-06, "loss": 0.4608, "num_input_tokens_seen": 46179456, "step": 37970 }, { "epoch": 4.758175667209623, "grad_norm": 0.06799466907978058, "learning_rate": 9.431805879192087e-06, "loss": 0.4643, "num_input_tokens_seen": 46184896, "step": 37975 }, { "epoch": 4.758802155118406, "grad_norm": 0.09913712739944458, "learning_rate": 9.43155272722443e-06, "loss": 0.4591, "num_input_tokens_seen": 46190976, "step": 37980 }, { "epoch": 4.7594286430271895, "grad_norm": 0.10176732391119003, "learning_rate": 9.431299522273832e-06, "loss": 0.4604, "num_input_tokens_seen": 46197088, "step": 37985 }, { "epoch": 4.760055130935973, "grad_norm": 0.07410356402397156, "learning_rate": 9.431046264343324e-06, "loss": 0.4571, "num_input_tokens_seen": 46202848, "step": 37990 }, { "epoch": 4.760681618844757, "grad_norm": 0.06807061284780502, "learning_rate": 9.430792953435933e-06, "loss": 0.4528, "num_input_tokens_seen": 46209088, "step": 37995 }, { "epoch": 4.76130810675354, "grad_norm": 0.07170342653989792, "learning_rate": 9.430539589554686e-06, "loss": 0.4695, "num_input_tokens_seen": 46215360, "step": 38000 }, { "epoch": 4.761934594662323, "grad_norm": 0.0881914421916008, "learning_rate": 9.430286172702612e-06, "loss": 0.465, "num_input_tokens_seen": 46221888, "step": 38005 }, { "epoch": 4.762561082571106, "grad_norm": 0.08289743959903717, "learning_rate": 9.430032702882744e-06, "loss": 0.4608, "num_input_tokens_seen": 46228224, "step": 38010 }, { "epoch": 4.763187570479889, "grad_norm": 0.10006141662597656, "learning_rate": 9.42977918009811e-06, "loss": 0.4658, "num_input_tokens_seen": 46234240, "step": 38015 }, { "epoch": 4.7638140583886734, "grad_norm": 0.06879681348800659, "learning_rate": 9.429525604351742e-06, "loss": 0.4623, "num_input_tokens_seen": 46240416, "step": 38020 }, { "epoch": 4.764440546297457, "grad_norm": 0.06926406174898148, "learning_rate": 9.42927197564667e-06, "loss": 0.4639, "num_input_tokens_seen": 46246080, "step": 38025 }, { "epoch": 4.76506703420624, "grad_norm": 0.0877465307712555, "learning_rate": 9.429018293985929e-06, "loss": 0.4649, "num_input_tokens_seen": 46252448, "step": 38030 }, { "epoch": 4.765693522115023, "grad_norm": 0.06652269512414932, "learning_rate": 9.428764559372549e-06, "loss": 0.4626, "num_input_tokens_seen": 46258464, "step": 38035 }, { "epoch": 4.766320010023806, "grad_norm": 0.12938129901885986, "learning_rate": 9.428510771809568e-06, "loss": 0.4609, "num_input_tokens_seen": 46264992, "step": 38040 }, { "epoch": 4.76694649793259, "grad_norm": 0.0667264461517334, "learning_rate": 9.428256931300015e-06, "loss": 0.4578, "num_input_tokens_seen": 46271264, "step": 38045 }, { "epoch": 4.767572985841373, "grad_norm": 0.0779460221529007, "learning_rate": 9.428003037846928e-06, "loss": 0.4615, "num_input_tokens_seen": 46277408, "step": 38050 }, { "epoch": 4.7681994737501565, "grad_norm": 0.0822979286313057, "learning_rate": 9.427749091453343e-06, "loss": 0.4633, "num_input_tokens_seen": 46283744, "step": 38055 }, { "epoch": 4.76882596165894, "grad_norm": 0.07613062113523483, "learning_rate": 9.427495092122292e-06, "loss": 0.4642, "num_input_tokens_seen": 46289728, "step": 38060 }, { "epoch": 4.769452449567723, "grad_norm": 0.07409976422786713, "learning_rate": 9.427241039856816e-06, "loss": 0.4643, "num_input_tokens_seen": 46296064, "step": 38065 }, { "epoch": 4.770078937476507, "grad_norm": 0.060144390910863876, "learning_rate": 9.426986934659952e-06, "loss": 0.4652, "num_input_tokens_seen": 46302176, "step": 38070 }, { "epoch": 4.77070542538529, "grad_norm": 0.09275529533624649, "learning_rate": 9.426732776534737e-06, "loss": 0.4694, "num_input_tokens_seen": 46308576, "step": 38075 }, { "epoch": 4.771331913294073, "grad_norm": 0.10653931647539139, "learning_rate": 9.42647856548421e-06, "loss": 0.4652, "num_input_tokens_seen": 46313664, "step": 38080 }, { "epoch": 4.771958401202856, "grad_norm": 0.09805013239383698, "learning_rate": 9.42622430151141e-06, "loss": 0.4631, "num_input_tokens_seen": 46319744, "step": 38085 }, { "epoch": 4.7725848891116405, "grad_norm": 0.03498854488134384, "learning_rate": 9.425969984619376e-06, "loss": 0.469, "num_input_tokens_seen": 46326144, "step": 38090 }, { "epoch": 4.773211377020424, "grad_norm": 0.07574720680713654, "learning_rate": 9.425715614811151e-06, "loss": 0.4613, "num_input_tokens_seen": 46332448, "step": 38095 }, { "epoch": 4.773837864929207, "grad_norm": 0.073776014149189, "learning_rate": 9.425461192089776e-06, "loss": 0.4596, "num_input_tokens_seen": 46338848, "step": 38100 }, { "epoch": 4.77446435283799, "grad_norm": 0.07360997051000595, "learning_rate": 9.425206716458289e-06, "loss": 0.4696, "num_input_tokens_seen": 46345344, "step": 38105 }, { "epoch": 4.775090840746774, "grad_norm": 0.037677101790905, "learning_rate": 9.424952187919735e-06, "loss": 0.4604, "num_input_tokens_seen": 46351296, "step": 38110 }, { "epoch": 4.775717328655557, "grad_norm": 0.08772493153810501, "learning_rate": 9.42469760647716e-06, "loss": 0.4665, "num_input_tokens_seen": 46357120, "step": 38115 }, { "epoch": 4.77634381656434, "grad_norm": 0.033269017934799194, "learning_rate": 9.424442972133604e-06, "loss": 0.4651, "num_input_tokens_seen": 46362976, "step": 38120 }, { "epoch": 4.7769703044731235, "grad_norm": 0.08773594349622726, "learning_rate": 9.424188284892111e-06, "loss": 0.4582, "num_input_tokens_seen": 46368928, "step": 38125 }, { "epoch": 4.777596792381907, "grad_norm": 0.10093901306390762, "learning_rate": 9.42393354475573e-06, "loss": 0.4548, "num_input_tokens_seen": 46375008, "step": 38130 }, { "epoch": 4.778223280290691, "grad_norm": 0.06746397167444229, "learning_rate": 9.423678751727501e-06, "loss": 0.4638, "num_input_tokens_seen": 46381184, "step": 38135 }, { "epoch": 4.778849768199474, "grad_norm": 0.07361133396625519, "learning_rate": 9.423423905810476e-06, "loss": 0.4603, "num_input_tokens_seen": 46386624, "step": 38140 }, { "epoch": 4.779476256108257, "grad_norm": 0.07465063780546188, "learning_rate": 9.423169007007697e-06, "loss": 0.4656, "num_input_tokens_seen": 46392928, "step": 38145 }, { "epoch": 4.78010274401704, "grad_norm": 0.12394062429666519, "learning_rate": 9.422914055322216e-06, "loss": 0.465, "num_input_tokens_seen": 46399552, "step": 38150 }, { "epoch": 4.780729231925823, "grad_norm": 0.06841392070055008, "learning_rate": 9.422659050757077e-06, "loss": 0.4587, "num_input_tokens_seen": 46405536, "step": 38155 }, { "epoch": 4.7813557198346075, "grad_norm": 0.07078598439693451, "learning_rate": 9.422403993315332e-06, "loss": 0.4631, "num_input_tokens_seen": 46411520, "step": 38160 }, { "epoch": 4.781982207743391, "grad_norm": 0.07567225396633148, "learning_rate": 9.422148883000028e-06, "loss": 0.4605, "num_input_tokens_seen": 46417600, "step": 38165 }, { "epoch": 4.782608695652174, "grad_norm": 0.07696184515953064, "learning_rate": 9.421893719814217e-06, "loss": 0.466, "num_input_tokens_seen": 46423872, "step": 38170 }, { "epoch": 4.783235183560957, "grad_norm": 0.04007914289832115, "learning_rate": 9.421638503760948e-06, "loss": 0.4646, "num_input_tokens_seen": 46430368, "step": 38175 }, { "epoch": 4.78386167146974, "grad_norm": 0.035140346735715866, "learning_rate": 9.421383234843276e-06, "loss": 0.4595, "num_input_tokens_seen": 46436224, "step": 38180 }, { "epoch": 4.784488159378524, "grad_norm": 0.04014810547232628, "learning_rate": 9.421127913064247e-06, "loss": 0.4618, "num_input_tokens_seen": 46442368, "step": 38185 }, { "epoch": 4.785114647287307, "grad_norm": 0.08378108590841293, "learning_rate": 9.420872538426916e-06, "loss": 0.466, "num_input_tokens_seen": 46448512, "step": 38190 }, { "epoch": 4.785741135196091, "grad_norm": 0.03714780509471893, "learning_rate": 9.420617110934339e-06, "loss": 0.4673, "num_input_tokens_seen": 46454592, "step": 38195 }, { "epoch": 4.786367623104874, "grad_norm": 0.09723876416683197, "learning_rate": 9.420361630589567e-06, "loss": 0.4594, "num_input_tokens_seen": 46460640, "step": 38200 }, { "epoch": 4.786994111013658, "grad_norm": 0.06485006213188171, "learning_rate": 9.420106097395656e-06, "loss": 0.4615, "num_input_tokens_seen": 46466752, "step": 38205 }, { "epoch": 4.787620598922441, "grad_norm": 0.07065214961767197, "learning_rate": 9.41985051135566e-06, "loss": 0.4604, "num_input_tokens_seen": 46472736, "step": 38210 }, { "epoch": 4.788247086831224, "grad_norm": 0.08657541871070862, "learning_rate": 9.419594872472636e-06, "loss": 0.4611, "num_input_tokens_seen": 46478688, "step": 38215 }, { "epoch": 4.788873574740007, "grad_norm": 0.11303211748600006, "learning_rate": 9.419339180749638e-06, "loss": 0.4626, "num_input_tokens_seen": 46484928, "step": 38220 }, { "epoch": 4.789500062648791, "grad_norm": 0.032583605498075485, "learning_rate": 9.419083436189724e-06, "loss": 0.4648, "num_input_tokens_seen": 46490880, "step": 38225 }, { "epoch": 4.7901265505575745, "grad_norm": 0.04333280771970749, "learning_rate": 9.418827638795954e-06, "loss": 0.4686, "num_input_tokens_seen": 46496704, "step": 38230 }, { "epoch": 4.790753038466358, "grad_norm": 0.07274211198091507, "learning_rate": 9.418571788571384e-06, "loss": 0.46, "num_input_tokens_seen": 46502912, "step": 38235 }, { "epoch": 4.791379526375141, "grad_norm": 0.074284628033638, "learning_rate": 9.418315885519074e-06, "loss": 0.4582, "num_input_tokens_seen": 46509120, "step": 38240 }, { "epoch": 4.792006014283924, "grad_norm": 0.0683220624923706, "learning_rate": 9.41805992964208e-06, "loss": 0.4583, "num_input_tokens_seen": 46515264, "step": 38245 }, { "epoch": 4.792632502192708, "grad_norm": 0.035409752279520035, "learning_rate": 9.417803920943468e-06, "loss": 0.4586, "num_input_tokens_seen": 46520896, "step": 38250 }, { "epoch": 4.793258990101491, "grad_norm": 0.07017814368009567, "learning_rate": 9.417547859426294e-06, "loss": 0.4635, "num_input_tokens_seen": 46526784, "step": 38255 }, { "epoch": 4.793885478010274, "grad_norm": 0.0711405873298645, "learning_rate": 9.417291745093622e-06, "loss": 0.4624, "num_input_tokens_seen": 46533152, "step": 38260 }, { "epoch": 4.794511965919058, "grad_norm": 0.11875206232070923, "learning_rate": 9.417035577948514e-06, "loss": 0.4655, "num_input_tokens_seen": 46539360, "step": 38265 }, { "epoch": 4.795138453827841, "grad_norm": 0.09171395003795624, "learning_rate": 9.416779357994032e-06, "loss": 0.4676, "num_input_tokens_seen": 46545728, "step": 38270 }, { "epoch": 4.795764941736625, "grad_norm": 0.06695599853992462, "learning_rate": 9.416523085233237e-06, "loss": 0.4625, "num_input_tokens_seen": 46551840, "step": 38275 }, { "epoch": 4.796391429645408, "grad_norm": 0.07551950961351395, "learning_rate": 9.416266759669199e-06, "loss": 0.4623, "num_input_tokens_seen": 46558304, "step": 38280 }, { "epoch": 4.797017917554191, "grad_norm": 0.06921868771314621, "learning_rate": 9.416010381304976e-06, "loss": 0.4645, "num_input_tokens_seen": 46564448, "step": 38285 }, { "epoch": 4.797644405462974, "grad_norm": 0.1409938484430313, "learning_rate": 9.415753950143639e-06, "loss": 0.4693, "num_input_tokens_seen": 46570368, "step": 38290 }, { "epoch": 4.7982708933717575, "grad_norm": 0.06897177547216415, "learning_rate": 9.415497466188249e-06, "loss": 0.4591, "num_input_tokens_seen": 46576928, "step": 38295 }, { "epoch": 4.7988973812805416, "grad_norm": 0.0697011724114418, "learning_rate": 9.415240929441874e-06, "loss": 0.4601, "num_input_tokens_seen": 46583168, "step": 38300 }, { "epoch": 4.799523869189325, "grad_norm": 0.06695318222045898, "learning_rate": 9.41498433990758e-06, "loss": 0.4643, "num_input_tokens_seen": 46589728, "step": 38305 }, { "epoch": 4.800150357098108, "grad_norm": 0.07034377753734589, "learning_rate": 9.41472769758844e-06, "loss": 0.4585, "num_input_tokens_seen": 46595776, "step": 38310 }, { "epoch": 4.800776845006891, "grad_norm": 0.10726764798164368, "learning_rate": 9.414471002487516e-06, "loss": 0.4674, "num_input_tokens_seen": 46601888, "step": 38315 }, { "epoch": 4.801403332915675, "grad_norm": 0.036552198231220245, "learning_rate": 9.41421425460788e-06, "loss": 0.4696, "num_input_tokens_seen": 46607776, "step": 38320 }, { "epoch": 4.802029820824458, "grad_norm": 0.03877710551023483, "learning_rate": 9.413957453952603e-06, "loss": 0.4623, "num_input_tokens_seen": 46613760, "step": 38325 }, { "epoch": 4.8026563087332415, "grad_norm": 0.06771307438611984, "learning_rate": 9.413700600524753e-06, "loss": 0.4611, "num_input_tokens_seen": 46619680, "step": 38330 }, { "epoch": 4.803282796642025, "grad_norm": 0.0760393738746643, "learning_rate": 9.413443694327401e-06, "loss": 0.4552, "num_input_tokens_seen": 46625568, "step": 38335 }, { "epoch": 4.803909284550808, "grad_norm": 0.08969348669052124, "learning_rate": 9.413186735363618e-06, "loss": 0.463, "num_input_tokens_seen": 46631328, "step": 38340 }, { "epoch": 4.804535772459592, "grad_norm": 0.17217549681663513, "learning_rate": 9.41292972363648e-06, "loss": 0.4584, "num_input_tokens_seen": 46637664, "step": 38345 }, { "epoch": 4.805162260368375, "grad_norm": 0.0984269455075264, "learning_rate": 9.412672659149053e-06, "loss": 0.4617, "num_input_tokens_seen": 46643904, "step": 38350 }, { "epoch": 4.805788748277158, "grad_norm": 0.1286652535200119, "learning_rate": 9.412415541904418e-06, "loss": 0.4573, "num_input_tokens_seen": 46649984, "step": 38355 }, { "epoch": 4.806415236185941, "grad_norm": 0.07746197283267975, "learning_rate": 9.412158371905644e-06, "loss": 0.4676, "num_input_tokens_seen": 46656416, "step": 38360 }, { "epoch": 4.807041724094725, "grad_norm": 0.0803847387433052, "learning_rate": 9.411901149155807e-06, "loss": 0.4586, "num_input_tokens_seen": 46662656, "step": 38365 }, { "epoch": 4.807668212003509, "grad_norm": 0.07292231172323227, "learning_rate": 9.411643873657984e-06, "loss": 0.4657, "num_input_tokens_seen": 46668960, "step": 38370 }, { "epoch": 4.808294699912292, "grad_norm": 0.06914711743593216, "learning_rate": 9.411386545415246e-06, "loss": 0.4651, "num_input_tokens_seen": 46675040, "step": 38375 }, { "epoch": 4.808921187821075, "grad_norm": 0.1048225611448288, "learning_rate": 9.411129164430675e-06, "loss": 0.4655, "num_input_tokens_seen": 46681024, "step": 38380 }, { "epoch": 4.809547675729858, "grad_norm": 0.08101359754800797, "learning_rate": 9.410871730707347e-06, "loss": 0.4659, "num_input_tokens_seen": 46687136, "step": 38385 }, { "epoch": 4.810174163638642, "grad_norm": 0.08115671575069427, "learning_rate": 9.410614244248338e-06, "loss": 0.4624, "num_input_tokens_seen": 46693280, "step": 38390 }, { "epoch": 4.810800651547425, "grad_norm": 0.07174395024776459, "learning_rate": 9.410356705056727e-06, "loss": 0.4656, "num_input_tokens_seen": 46699424, "step": 38395 }, { "epoch": 4.8114271394562085, "grad_norm": 0.09180781990289688, "learning_rate": 9.410099113135596e-06, "loss": 0.4619, "num_input_tokens_seen": 46705312, "step": 38400 }, { "epoch": 4.812053627364992, "grad_norm": 0.08209196478128433, "learning_rate": 9.409841468488019e-06, "loss": 0.4657, "num_input_tokens_seen": 46711456, "step": 38405 }, { "epoch": 4.812680115273775, "grad_norm": 0.03570057824254036, "learning_rate": 9.409583771117081e-06, "loss": 0.4708, "num_input_tokens_seen": 46717504, "step": 38410 }, { "epoch": 4.813306603182559, "grad_norm": 0.1063452661037445, "learning_rate": 9.409326021025863e-06, "loss": 0.4674, "num_input_tokens_seen": 46723776, "step": 38415 }, { "epoch": 4.813933091091342, "grad_norm": 0.10304854065179825, "learning_rate": 9.409068218217443e-06, "loss": 0.4596, "num_input_tokens_seen": 46730336, "step": 38420 }, { "epoch": 4.814559579000125, "grad_norm": 0.10101635754108429, "learning_rate": 9.408810362694906e-06, "loss": 0.4558, "num_input_tokens_seen": 46736800, "step": 38425 }, { "epoch": 4.815186066908908, "grad_norm": 0.06543401628732681, "learning_rate": 9.408552454461335e-06, "loss": 0.4601, "num_input_tokens_seen": 46743008, "step": 38430 }, { "epoch": 4.8158125548176915, "grad_norm": 0.07177101820707321, "learning_rate": 9.408294493519814e-06, "loss": 0.4674, "num_input_tokens_seen": 46749248, "step": 38435 }, { "epoch": 4.816439042726476, "grad_norm": 0.035834744572639465, "learning_rate": 9.408036479873425e-06, "loss": 0.4583, "num_input_tokens_seen": 46755488, "step": 38440 }, { "epoch": 4.817065530635259, "grad_norm": 0.07911307364702225, "learning_rate": 9.407778413525254e-06, "loss": 0.4682, "num_input_tokens_seen": 46761888, "step": 38445 }, { "epoch": 4.817692018544042, "grad_norm": 0.11456368863582611, "learning_rate": 9.407520294478384e-06, "loss": 0.4616, "num_input_tokens_seen": 46767936, "step": 38450 }, { "epoch": 4.818318506452825, "grad_norm": 0.10264378786087036, "learning_rate": 9.407262122735907e-06, "loss": 0.4616, "num_input_tokens_seen": 46773536, "step": 38455 }, { "epoch": 4.818944994361609, "grad_norm": 0.06670019775629044, "learning_rate": 9.407003898300904e-06, "loss": 0.4725, "num_input_tokens_seen": 46779392, "step": 38460 }, { "epoch": 4.819571482270392, "grad_norm": 0.09632059931755066, "learning_rate": 9.406745621176464e-06, "loss": 0.4582, "num_input_tokens_seen": 46785152, "step": 38465 }, { "epoch": 4.8201979701791755, "grad_norm": 0.12810386717319489, "learning_rate": 9.406487291365676e-06, "loss": 0.4597, "num_input_tokens_seen": 46791328, "step": 38470 }, { "epoch": 4.820824458087959, "grad_norm": 0.03841743245720863, "learning_rate": 9.406228908871626e-06, "loss": 0.4622, "num_input_tokens_seen": 46797344, "step": 38475 }, { "epoch": 4.821450945996743, "grad_norm": 0.0715206041932106, "learning_rate": 9.405970473697405e-06, "loss": 0.4628, "num_input_tokens_seen": 46803520, "step": 38480 }, { "epoch": 4.822077433905526, "grad_norm": 0.03881359100341797, "learning_rate": 9.405711985846105e-06, "loss": 0.4637, "num_input_tokens_seen": 46809632, "step": 38485 }, { "epoch": 4.822703921814309, "grad_norm": 0.03465527296066284, "learning_rate": 9.405453445320812e-06, "loss": 0.4593, "num_input_tokens_seen": 46816256, "step": 38490 }, { "epoch": 4.823330409723092, "grad_norm": 0.070797860622406, "learning_rate": 9.40519485212462e-06, "loss": 0.4617, "num_input_tokens_seen": 46821920, "step": 38495 }, { "epoch": 4.823956897631875, "grad_norm": 0.09566781669855118, "learning_rate": 9.404936206260619e-06, "loss": 0.4567, "num_input_tokens_seen": 46828064, "step": 38500 }, { "epoch": 4.8245833855406595, "grad_norm": 0.08619052171707153, "learning_rate": 9.404677507731903e-06, "loss": 0.4714, "num_input_tokens_seen": 46834176, "step": 38505 }, { "epoch": 4.825209873449443, "grad_norm": 0.06494680047035217, "learning_rate": 9.404418756541565e-06, "loss": 0.4573, "num_input_tokens_seen": 46840192, "step": 38510 }, { "epoch": 4.825836361358226, "grad_norm": 0.05979245528578758, "learning_rate": 9.404159952692696e-06, "loss": 0.4608, "num_input_tokens_seen": 46846112, "step": 38515 }, { "epoch": 4.826462849267009, "grad_norm": 0.07201540470123291, "learning_rate": 9.403901096188393e-06, "loss": 0.4639, "num_input_tokens_seen": 46852352, "step": 38520 }, { "epoch": 4.827089337175792, "grad_norm": 0.09981115162372589, "learning_rate": 9.40364218703175e-06, "loss": 0.4641, "num_input_tokens_seen": 46858208, "step": 38525 }, { "epoch": 4.827715825084576, "grad_norm": 0.08933353424072266, "learning_rate": 9.403383225225861e-06, "loss": 0.4629, "num_input_tokens_seen": 46864544, "step": 38530 }, { "epoch": 4.828342312993359, "grad_norm": 0.0699932649731636, "learning_rate": 9.403124210773824e-06, "loss": 0.4658, "num_input_tokens_seen": 46870592, "step": 38535 }, { "epoch": 4.8289688009021425, "grad_norm": 0.07334651052951813, "learning_rate": 9.402865143678735e-06, "loss": 0.4623, "num_input_tokens_seen": 46876736, "step": 38540 }, { "epoch": 4.829595288810926, "grad_norm": 0.1094316691160202, "learning_rate": 9.402606023943692e-06, "loss": 0.4587, "num_input_tokens_seen": 46882944, "step": 38545 }, { "epoch": 4.830221776719709, "grad_norm": 0.06334121525287628, "learning_rate": 9.402346851571793e-06, "loss": 0.46, "num_input_tokens_seen": 46889088, "step": 38550 }, { "epoch": 4.830848264628493, "grad_norm": 0.07396820932626724, "learning_rate": 9.402087626566135e-06, "loss": 0.459, "num_input_tokens_seen": 46895328, "step": 38555 }, { "epoch": 4.831474752537276, "grad_norm": 0.061360299587249756, "learning_rate": 9.401828348929817e-06, "loss": 0.4669, "num_input_tokens_seen": 46901376, "step": 38560 }, { "epoch": 4.832101240446059, "grad_norm": 0.07105445116758347, "learning_rate": 9.401569018665943e-06, "loss": 0.459, "num_input_tokens_seen": 46907456, "step": 38565 }, { "epoch": 4.832727728354842, "grad_norm": 0.0401201993227005, "learning_rate": 9.40130963577761e-06, "loss": 0.4581, "num_input_tokens_seen": 46913568, "step": 38570 }, { "epoch": 4.8333542162636265, "grad_norm": 0.1183810755610466, "learning_rate": 9.401050200267918e-06, "loss": 0.4588, "num_input_tokens_seen": 46919168, "step": 38575 }, { "epoch": 4.83398070417241, "grad_norm": 0.10528529435396194, "learning_rate": 9.400790712139973e-06, "loss": 0.4597, "num_input_tokens_seen": 46925024, "step": 38580 }, { "epoch": 4.834607192081193, "grad_norm": 0.11723369359970093, "learning_rate": 9.400531171396874e-06, "loss": 0.4671, "num_input_tokens_seen": 46931168, "step": 38585 }, { "epoch": 4.835233679989976, "grad_norm": 0.10360495746135712, "learning_rate": 9.400271578041725e-06, "loss": 0.465, "num_input_tokens_seen": 46937024, "step": 38590 }, { "epoch": 4.835860167898759, "grad_norm": 0.09314201027154922, "learning_rate": 9.40001193207763e-06, "loss": 0.4695, "num_input_tokens_seen": 46942784, "step": 38595 }, { "epoch": 4.836486655807543, "grad_norm": 0.0866214856505394, "learning_rate": 9.399752233507693e-06, "loss": 0.4653, "num_input_tokens_seen": 46949216, "step": 38600 }, { "epoch": 4.837113143716326, "grad_norm": 0.06278403103351593, "learning_rate": 9.39949248233502e-06, "loss": 0.463, "num_input_tokens_seen": 46955264, "step": 38605 }, { "epoch": 4.83773963162511, "grad_norm": 0.08311568200588226, "learning_rate": 9.399232678562714e-06, "loss": 0.4615, "num_input_tokens_seen": 46961376, "step": 38610 }, { "epoch": 4.838366119533893, "grad_norm": 0.07229528576135635, "learning_rate": 9.398972822193883e-06, "loss": 0.4614, "num_input_tokens_seen": 46967360, "step": 38615 }, { "epoch": 4.838992607442677, "grad_norm": 0.18398591876029968, "learning_rate": 9.398712913231633e-06, "loss": 0.4738, "num_input_tokens_seen": 46972864, "step": 38620 }, { "epoch": 4.83961909535146, "grad_norm": 0.08862127363681793, "learning_rate": 9.398452951679074e-06, "loss": 0.4651, "num_input_tokens_seen": 46978528, "step": 38625 }, { "epoch": 4.840245583260243, "grad_norm": 0.07817947119474411, "learning_rate": 9.39819293753931e-06, "loss": 0.459, "num_input_tokens_seen": 46984608, "step": 38630 }, { "epoch": 4.840872071169026, "grad_norm": 0.09231369942426682, "learning_rate": 9.397932870815452e-06, "loss": 0.4598, "num_input_tokens_seen": 46990976, "step": 38635 }, { "epoch": 4.8414985590778095, "grad_norm": 0.07242485880851746, "learning_rate": 9.397672751510609e-06, "loss": 0.4622, "num_input_tokens_seen": 46996832, "step": 38640 }, { "epoch": 4.8421250469865935, "grad_norm": 0.078276127576828, "learning_rate": 9.397412579627892e-06, "loss": 0.4715, "num_input_tokens_seen": 47002720, "step": 38645 }, { "epoch": 4.842751534895377, "grad_norm": 0.03806599974632263, "learning_rate": 9.397152355170407e-06, "loss": 0.4637, "num_input_tokens_seen": 47008640, "step": 38650 }, { "epoch": 4.84337802280416, "grad_norm": 0.11567272245883942, "learning_rate": 9.396892078141272e-06, "loss": 0.466, "num_input_tokens_seen": 47014848, "step": 38655 }, { "epoch": 4.844004510712943, "grad_norm": 0.09897184371948242, "learning_rate": 9.396631748543594e-06, "loss": 0.4556, "num_input_tokens_seen": 47020640, "step": 38660 }, { "epoch": 4.844630998621726, "grad_norm": 0.08584557473659515, "learning_rate": 9.396371366380487e-06, "loss": 0.4655, "num_input_tokens_seen": 47026752, "step": 38665 }, { "epoch": 4.84525748653051, "grad_norm": 0.07885085791349411, "learning_rate": 9.396110931655066e-06, "loss": 0.4593, "num_input_tokens_seen": 47032864, "step": 38670 }, { "epoch": 4.845883974439293, "grad_norm": 0.06819213926792145, "learning_rate": 9.395850444370441e-06, "loss": 0.4628, "num_input_tokens_seen": 47039232, "step": 38675 }, { "epoch": 4.846510462348077, "grad_norm": 0.0790422186255455, "learning_rate": 9.395589904529729e-06, "loss": 0.4683, "num_input_tokens_seen": 47045376, "step": 38680 }, { "epoch": 4.84713695025686, "grad_norm": 0.06890752166509628, "learning_rate": 9.395329312136043e-06, "loss": 0.46, "num_input_tokens_seen": 47051680, "step": 38685 }, { "epoch": 4.847763438165643, "grad_norm": 0.07535183429718018, "learning_rate": 9.3950686671925e-06, "loss": 0.4621, "num_input_tokens_seen": 47057696, "step": 38690 }, { "epoch": 4.848389926074427, "grad_norm": 0.11051265895366669, "learning_rate": 9.394807969702216e-06, "loss": 0.4623, "num_input_tokens_seen": 47063840, "step": 38695 }, { "epoch": 4.84901641398321, "grad_norm": 0.08144189417362213, "learning_rate": 9.394547219668308e-06, "loss": 0.4589, "num_input_tokens_seen": 47070176, "step": 38700 }, { "epoch": 4.849642901891993, "grad_norm": 0.041384149342775345, "learning_rate": 9.394286417093895e-06, "loss": 0.4631, "num_input_tokens_seen": 47076032, "step": 38705 }, { "epoch": 4.8502693898007765, "grad_norm": 0.12967714667320251, "learning_rate": 9.39402556198209e-06, "loss": 0.4551, "num_input_tokens_seen": 47082176, "step": 38710 }, { "epoch": 4.8508958777095605, "grad_norm": 0.04386515915393829, "learning_rate": 9.393764654336018e-06, "loss": 0.4688, "num_input_tokens_seen": 47088544, "step": 38715 }, { "epoch": 4.851522365618344, "grad_norm": 0.07798810303211212, "learning_rate": 9.393503694158792e-06, "loss": 0.4611, "num_input_tokens_seen": 47094720, "step": 38720 }, { "epoch": 4.852148853527127, "grad_norm": 0.07673434913158417, "learning_rate": 9.393242681453539e-06, "loss": 0.4658, "num_input_tokens_seen": 47100992, "step": 38725 }, { "epoch": 4.85277534143591, "grad_norm": 0.08980443328619003, "learning_rate": 9.392981616223374e-06, "loss": 0.4679, "num_input_tokens_seen": 47106912, "step": 38730 }, { "epoch": 4.853401829344694, "grad_norm": 0.06881499290466309, "learning_rate": 9.39272049847142e-06, "loss": 0.4745, "num_input_tokens_seen": 47112960, "step": 38735 }, { "epoch": 4.854028317253477, "grad_norm": 0.06899644434452057, "learning_rate": 9.392459328200801e-06, "loss": 0.4656, "num_input_tokens_seen": 47119136, "step": 38740 }, { "epoch": 4.8546548051622604, "grad_norm": 0.10447157919406891, "learning_rate": 9.392198105414637e-06, "loss": 0.4607, "num_input_tokens_seen": 47125152, "step": 38745 }, { "epoch": 4.855281293071044, "grad_norm": 0.10177533328533173, "learning_rate": 9.391936830116053e-06, "loss": 0.4642, "num_input_tokens_seen": 47130656, "step": 38750 }, { "epoch": 4.855907780979827, "grad_norm": 0.09747371077537537, "learning_rate": 9.391675502308171e-06, "loss": 0.4605, "num_input_tokens_seen": 47136800, "step": 38755 }, { "epoch": 4.856534268888611, "grad_norm": 0.06748297065496445, "learning_rate": 9.391414121994115e-06, "loss": 0.4639, "num_input_tokens_seen": 47142208, "step": 38760 }, { "epoch": 4.857160756797394, "grad_norm": 0.06473562866449356, "learning_rate": 9.391152689177012e-06, "loss": 0.4606, "num_input_tokens_seen": 47148256, "step": 38765 }, { "epoch": 4.857787244706177, "grad_norm": 0.11347733438014984, "learning_rate": 9.390891203859987e-06, "loss": 0.4621, "num_input_tokens_seen": 47154016, "step": 38770 }, { "epoch": 4.85841373261496, "grad_norm": 0.08897903561592102, "learning_rate": 9.390629666046164e-06, "loss": 0.4591, "num_input_tokens_seen": 47160160, "step": 38775 }, { "epoch": 4.8590402205237435, "grad_norm": 0.04280906543135643, "learning_rate": 9.390368075738674e-06, "loss": 0.4631, "num_input_tokens_seen": 47166400, "step": 38780 }, { "epoch": 4.859666708432528, "grad_norm": 0.07573670148849487, "learning_rate": 9.390106432940642e-06, "loss": 0.4657, "num_input_tokens_seen": 47172288, "step": 38785 }, { "epoch": 4.860293196341311, "grad_norm": 0.07812624424695969, "learning_rate": 9.389844737655196e-06, "loss": 0.4571, "num_input_tokens_seen": 47178592, "step": 38790 }, { "epoch": 4.860919684250094, "grad_norm": 0.07711755484342575, "learning_rate": 9.389582989885467e-06, "loss": 0.4642, "num_input_tokens_seen": 47184608, "step": 38795 }, { "epoch": 4.861546172158877, "grad_norm": 0.08355873078107834, "learning_rate": 9.38932118963458e-06, "loss": 0.4656, "num_input_tokens_seen": 47190400, "step": 38800 }, { "epoch": 4.86217266006766, "grad_norm": 0.04408540576696396, "learning_rate": 9.38905933690567e-06, "loss": 0.4655, "num_input_tokens_seen": 47196288, "step": 38805 }, { "epoch": 4.862799147976444, "grad_norm": 0.10085254907608032, "learning_rate": 9.388797431701865e-06, "loss": 0.4658, "num_input_tokens_seen": 47202464, "step": 38810 }, { "epoch": 4.8634256358852275, "grad_norm": 0.1284085214138031, "learning_rate": 9.388535474026298e-06, "loss": 0.4618, "num_input_tokens_seen": 47208704, "step": 38815 }, { "epoch": 4.864052123794011, "grad_norm": 0.10985539853572845, "learning_rate": 9.388273463882097e-06, "loss": 0.4634, "num_input_tokens_seen": 47215200, "step": 38820 }, { "epoch": 4.864678611702794, "grad_norm": 0.062347982078790665, "learning_rate": 9.388011401272398e-06, "loss": 0.4603, "num_input_tokens_seen": 47221312, "step": 38825 }, { "epoch": 4.865305099611578, "grad_norm": 0.10558606684207916, "learning_rate": 9.387749286200334e-06, "loss": 0.46, "num_input_tokens_seen": 47227488, "step": 38830 }, { "epoch": 4.865931587520361, "grad_norm": 0.03970468416810036, "learning_rate": 9.387487118669039e-06, "loss": 0.4623, "num_input_tokens_seen": 47233664, "step": 38835 }, { "epoch": 4.866558075429144, "grad_norm": 0.12625610828399658, "learning_rate": 9.387224898681646e-06, "loss": 0.466, "num_input_tokens_seen": 47240064, "step": 38840 }, { "epoch": 4.867184563337927, "grad_norm": 0.1121276319026947, "learning_rate": 9.386962626241292e-06, "loss": 0.4623, "num_input_tokens_seen": 47246112, "step": 38845 }, { "epoch": 4.867811051246711, "grad_norm": 0.07541809231042862, "learning_rate": 9.38670030135111e-06, "loss": 0.4707, "num_input_tokens_seen": 47252096, "step": 38850 }, { "epoch": 4.868437539155495, "grad_norm": 0.12052223831415176, "learning_rate": 9.386437924014237e-06, "loss": 0.4693, "num_input_tokens_seen": 47258368, "step": 38855 }, { "epoch": 4.869064027064278, "grad_norm": 0.07741378247737885, "learning_rate": 9.386175494233813e-06, "loss": 0.4676, "num_input_tokens_seen": 47264672, "step": 38860 }, { "epoch": 4.869690514973061, "grad_norm": 0.11642517894506454, "learning_rate": 9.385913012012972e-06, "loss": 0.4637, "num_input_tokens_seen": 47270880, "step": 38865 }, { "epoch": 4.870317002881844, "grad_norm": 0.06743960827589035, "learning_rate": 9.385650477354852e-06, "loss": 0.4645, "num_input_tokens_seen": 47277024, "step": 38870 }, { "epoch": 4.870943490790628, "grad_norm": 0.10555150359869003, "learning_rate": 9.385387890262596e-06, "loss": 0.4632, "num_input_tokens_seen": 47283200, "step": 38875 }, { "epoch": 4.871569978699411, "grad_norm": 0.08476412296295166, "learning_rate": 9.38512525073934e-06, "loss": 0.4655, "num_input_tokens_seen": 47289184, "step": 38880 }, { "epoch": 4.8721964666081945, "grad_norm": 0.07121250033378601, "learning_rate": 9.384862558788225e-06, "loss": 0.4691, "num_input_tokens_seen": 47295296, "step": 38885 }, { "epoch": 4.872822954516978, "grad_norm": 0.07750122994184494, "learning_rate": 9.384599814412391e-06, "loss": 0.4647, "num_input_tokens_seen": 47301024, "step": 38890 }, { "epoch": 4.873449442425761, "grad_norm": 0.10787630081176758, "learning_rate": 9.384337017614979e-06, "loss": 0.4631, "num_input_tokens_seen": 47306784, "step": 38895 }, { "epoch": 4.874075930334545, "grad_norm": 0.07456856220960617, "learning_rate": 9.384074168399133e-06, "loss": 0.4668, "num_input_tokens_seen": 47312896, "step": 38900 }, { "epoch": 4.874702418243328, "grad_norm": 0.03260797634720802, "learning_rate": 9.383811266767994e-06, "loss": 0.4627, "num_input_tokens_seen": 47319040, "step": 38905 }, { "epoch": 4.875328906152111, "grad_norm": 0.06629268825054169, "learning_rate": 9.383548312724707e-06, "loss": 0.4667, "num_input_tokens_seen": 47325184, "step": 38910 }, { "epoch": 4.875955394060894, "grad_norm": 0.07281696051359177, "learning_rate": 9.383285306272412e-06, "loss": 0.4549, "num_input_tokens_seen": 47331040, "step": 38915 }, { "epoch": 4.876581881969678, "grad_norm": 0.10620585829019547, "learning_rate": 9.383022247414258e-06, "loss": 0.4593, "num_input_tokens_seen": 47337152, "step": 38920 }, { "epoch": 4.877208369878462, "grad_norm": 0.10167186707258224, "learning_rate": 9.382759136153386e-06, "loss": 0.462, "num_input_tokens_seen": 47343232, "step": 38925 }, { "epoch": 4.877834857787245, "grad_norm": 0.07151458412408829, "learning_rate": 9.382495972492945e-06, "loss": 0.4619, "num_input_tokens_seen": 47349600, "step": 38930 }, { "epoch": 4.878461345696028, "grad_norm": 0.06673400849103928, "learning_rate": 9.382232756436078e-06, "loss": 0.4565, "num_input_tokens_seen": 47355680, "step": 38935 }, { "epoch": 4.879087833604811, "grad_norm": 0.06643624603748322, "learning_rate": 9.381969487985936e-06, "loss": 0.4602, "num_input_tokens_seen": 47361760, "step": 38940 }, { "epoch": 4.879714321513594, "grad_norm": 0.0831664577126503, "learning_rate": 9.381706167145665e-06, "loss": 0.4636, "num_input_tokens_seen": 47368032, "step": 38945 }, { "epoch": 4.880340809422378, "grad_norm": 0.11581306904554367, "learning_rate": 9.381442793918411e-06, "loss": 0.463, "num_input_tokens_seen": 47374112, "step": 38950 }, { "epoch": 4.8809672973311615, "grad_norm": 0.12433885782957077, "learning_rate": 9.381179368307327e-06, "loss": 0.4583, "num_input_tokens_seen": 47380352, "step": 38955 }, { "epoch": 4.881593785239945, "grad_norm": 0.08394749462604523, "learning_rate": 9.380915890315558e-06, "loss": 0.4748, "num_input_tokens_seen": 47386688, "step": 38960 }, { "epoch": 4.882220273148728, "grad_norm": 0.08107221126556396, "learning_rate": 9.380652359946258e-06, "loss": 0.4652, "num_input_tokens_seen": 47392544, "step": 38965 }, { "epoch": 4.882846761057512, "grad_norm": 0.13451765477657318, "learning_rate": 9.380388777202577e-06, "loss": 0.4749, "num_input_tokens_seen": 47398784, "step": 38970 }, { "epoch": 4.883473248966295, "grad_norm": 0.036710504442453384, "learning_rate": 9.380125142087661e-06, "loss": 0.4626, "num_input_tokens_seen": 47404960, "step": 38975 }, { "epoch": 4.884099736875078, "grad_norm": 0.07533305883407593, "learning_rate": 9.37986145460467e-06, "loss": 0.4582, "num_input_tokens_seen": 47410912, "step": 38980 }, { "epoch": 4.884726224783861, "grad_norm": 0.06014590710401535, "learning_rate": 9.379597714756751e-06, "loss": 0.4653, "num_input_tokens_seen": 47416672, "step": 38985 }, { "epoch": 4.8853527126926455, "grad_norm": 0.09131162613630295, "learning_rate": 9.37933392254706e-06, "loss": 0.4664, "num_input_tokens_seen": 47422720, "step": 38990 }, { "epoch": 4.885979200601429, "grad_norm": 0.0776834562420845, "learning_rate": 9.379070077978752e-06, "loss": 0.4668, "num_input_tokens_seen": 47428640, "step": 38995 }, { "epoch": 4.886605688510212, "grad_norm": 0.1259717494249344, "learning_rate": 9.378806181054975e-06, "loss": 0.4642, "num_input_tokens_seen": 47434560, "step": 39000 }, { "epoch": 4.887232176418995, "grad_norm": 0.11410239338874817, "learning_rate": 9.378542231778892e-06, "loss": 0.4647, "num_input_tokens_seen": 47440832, "step": 39005 }, { "epoch": 4.887858664327778, "grad_norm": 0.05945320427417755, "learning_rate": 9.378278230153654e-06, "loss": 0.4626, "num_input_tokens_seen": 47446336, "step": 39010 }, { "epoch": 4.888485152236562, "grad_norm": 0.06610144674777985, "learning_rate": 9.37801417618242e-06, "loss": 0.4645, "num_input_tokens_seen": 47452512, "step": 39015 }, { "epoch": 4.889111640145345, "grad_norm": 0.10672686994075775, "learning_rate": 9.377750069868346e-06, "loss": 0.4693, "num_input_tokens_seen": 47458400, "step": 39020 }, { "epoch": 4.8897381280541286, "grad_norm": 0.09012969583272934, "learning_rate": 9.377485911214587e-06, "loss": 0.4651, "num_input_tokens_seen": 47464544, "step": 39025 }, { "epoch": 4.890364615962912, "grad_norm": 0.06949541717767715, "learning_rate": 9.377221700224305e-06, "loss": 0.4628, "num_input_tokens_seen": 47470688, "step": 39030 }, { "epoch": 4.890991103871695, "grad_norm": 0.06168801337480545, "learning_rate": 9.376957436900657e-06, "loss": 0.4638, "num_input_tokens_seen": 47476768, "step": 39035 }, { "epoch": 4.891617591780479, "grad_norm": 0.15745873749256134, "learning_rate": 9.376693121246802e-06, "loss": 0.4635, "num_input_tokens_seen": 47482848, "step": 39040 }, { "epoch": 4.892244079689262, "grad_norm": 0.0609106719493866, "learning_rate": 9.376428753265902e-06, "loss": 0.4617, "num_input_tokens_seen": 47488832, "step": 39045 }, { "epoch": 4.892870567598045, "grad_norm": 0.05858626216650009, "learning_rate": 9.376164332961118e-06, "loss": 0.4604, "num_input_tokens_seen": 47495104, "step": 39050 }, { "epoch": 4.8934970555068285, "grad_norm": 0.10613930225372314, "learning_rate": 9.375899860335608e-06, "loss": 0.4604, "num_input_tokens_seen": 47501056, "step": 39055 }, { "epoch": 4.894123543415612, "grad_norm": 0.06237185373902321, "learning_rate": 9.375635335392536e-06, "loss": 0.4625, "num_input_tokens_seen": 47507264, "step": 39060 }, { "epoch": 4.894750031324396, "grad_norm": 0.029286805540323257, "learning_rate": 9.375370758135067e-06, "loss": 0.4641, "num_input_tokens_seen": 47512864, "step": 39065 }, { "epoch": 4.895376519233179, "grad_norm": 0.13613176345825195, "learning_rate": 9.37510612856636e-06, "loss": 0.4545, "num_input_tokens_seen": 47519104, "step": 39070 }, { "epoch": 4.896003007141962, "grad_norm": 0.11033744364976883, "learning_rate": 9.374841446689582e-06, "loss": 0.4582, "num_input_tokens_seen": 47524960, "step": 39075 }, { "epoch": 4.896629495050745, "grad_norm": 0.07446654886007309, "learning_rate": 9.374576712507897e-06, "loss": 0.46, "num_input_tokens_seen": 47530848, "step": 39080 }, { "epoch": 4.897255982959529, "grad_norm": 0.16521042585372925, "learning_rate": 9.374311926024468e-06, "loss": 0.4613, "num_input_tokens_seen": 47537280, "step": 39085 }, { "epoch": 4.897882470868312, "grad_norm": 0.11131968349218369, "learning_rate": 9.374047087242461e-06, "loss": 0.4689, "num_input_tokens_seen": 47543392, "step": 39090 }, { "epoch": 4.898508958777096, "grad_norm": 0.07977454364299774, "learning_rate": 9.373782196165047e-06, "loss": 0.468, "num_input_tokens_seen": 47549504, "step": 39095 }, { "epoch": 4.899135446685879, "grad_norm": 0.03774441033601761, "learning_rate": 9.373517252795388e-06, "loss": 0.4589, "num_input_tokens_seen": 47555872, "step": 39100 }, { "epoch": 4.899761934594663, "grad_norm": 0.1398659348487854, "learning_rate": 9.373252257136652e-06, "loss": 0.4655, "num_input_tokens_seen": 47562048, "step": 39105 }, { "epoch": 4.900388422503446, "grad_norm": 0.0658569410443306, "learning_rate": 9.372987209192011e-06, "loss": 0.4706, "num_input_tokens_seen": 47568000, "step": 39110 }, { "epoch": 4.901014910412229, "grad_norm": 0.1310206949710846, "learning_rate": 9.372722108964628e-06, "loss": 0.4618, "num_input_tokens_seen": 47574208, "step": 39115 }, { "epoch": 4.901641398321012, "grad_norm": 0.11665359884500504, "learning_rate": 9.372456956457678e-06, "loss": 0.4624, "num_input_tokens_seen": 47580416, "step": 39120 }, { "epoch": 4.9022678862297955, "grad_norm": 0.08034955710172653, "learning_rate": 9.372191751674329e-06, "loss": 0.4652, "num_input_tokens_seen": 47586176, "step": 39125 }, { "epoch": 4.9028943741385795, "grad_norm": 0.0726117268204689, "learning_rate": 9.37192649461775e-06, "loss": 0.466, "num_input_tokens_seen": 47592160, "step": 39130 }, { "epoch": 4.903520862047363, "grad_norm": 0.09044762700796127, "learning_rate": 9.371661185291116e-06, "loss": 0.4615, "num_input_tokens_seen": 47598272, "step": 39135 }, { "epoch": 4.904147349956146, "grad_norm": 0.06576288491487503, "learning_rate": 9.371395823697596e-06, "loss": 0.4656, "num_input_tokens_seen": 47604384, "step": 39140 }, { "epoch": 4.904773837864929, "grad_norm": 0.10453856736421585, "learning_rate": 9.371130409840364e-06, "loss": 0.4634, "num_input_tokens_seen": 47610496, "step": 39145 }, { "epoch": 4.905400325773712, "grad_norm": 0.07381154596805573, "learning_rate": 9.370864943722592e-06, "loss": 0.4682, "num_input_tokens_seen": 47616608, "step": 39150 }, { "epoch": 4.906026813682496, "grad_norm": 0.06201060116291046, "learning_rate": 9.370599425347456e-06, "loss": 0.459, "num_input_tokens_seen": 47622880, "step": 39155 }, { "epoch": 4.906653301591279, "grad_norm": 0.0833498165011406, "learning_rate": 9.370333854718127e-06, "loss": 0.459, "num_input_tokens_seen": 47629088, "step": 39160 }, { "epoch": 4.907279789500063, "grad_norm": 0.07080046832561493, "learning_rate": 9.370068231837786e-06, "loss": 0.46, "num_input_tokens_seen": 47635072, "step": 39165 }, { "epoch": 4.907906277408846, "grad_norm": 0.06583590805530548, "learning_rate": 9.369802556709603e-06, "loss": 0.4635, "num_input_tokens_seen": 47641056, "step": 39170 }, { "epoch": 4.908532765317629, "grad_norm": 0.11786827445030212, "learning_rate": 9.369536829336756e-06, "loss": 0.4637, "num_input_tokens_seen": 47647168, "step": 39175 }, { "epoch": 4.909159253226413, "grad_norm": 0.09332097321748734, "learning_rate": 9.369271049722424e-06, "loss": 0.4628, "num_input_tokens_seen": 47653152, "step": 39180 }, { "epoch": 4.909785741135196, "grad_norm": 0.10394471883773804, "learning_rate": 9.369005217869783e-06, "loss": 0.4619, "num_input_tokens_seen": 47659520, "step": 39185 }, { "epoch": 4.910412229043979, "grad_norm": 0.06979387998580933, "learning_rate": 9.36873933378201e-06, "loss": 0.4554, "num_input_tokens_seen": 47665312, "step": 39190 }, { "epoch": 4.9110387169527625, "grad_norm": 0.06962334364652634, "learning_rate": 9.368473397462285e-06, "loss": 0.4711, "num_input_tokens_seen": 47671424, "step": 39195 }, { "epoch": 4.911665204861546, "grad_norm": 0.06553029268980026, "learning_rate": 9.36820740891379e-06, "loss": 0.4689, "num_input_tokens_seen": 47677504, "step": 39200 }, { "epoch": 4.91229169277033, "grad_norm": 0.07982606440782547, "learning_rate": 9.367941368139702e-06, "loss": 0.4668, "num_input_tokens_seen": 47683712, "step": 39205 }, { "epoch": 4.912918180679113, "grad_norm": 0.07884970307350159, "learning_rate": 9.367675275143201e-06, "loss": 0.4586, "num_input_tokens_seen": 47689888, "step": 39210 }, { "epoch": 4.913544668587896, "grad_norm": 0.07221689820289612, "learning_rate": 9.367409129927472e-06, "loss": 0.4612, "num_input_tokens_seen": 47696160, "step": 39215 }, { "epoch": 4.914171156496679, "grad_norm": 0.10640634596347809, "learning_rate": 9.367142932495694e-06, "loss": 0.4665, "num_input_tokens_seen": 47702400, "step": 39220 }, { "epoch": 4.914797644405463, "grad_norm": 0.12825767695903778, "learning_rate": 9.366876682851051e-06, "loss": 0.4593, "num_input_tokens_seen": 47708352, "step": 39225 }, { "epoch": 4.9154241323142465, "grad_norm": 0.056849997490644455, "learning_rate": 9.366610380996724e-06, "loss": 0.4636, "num_input_tokens_seen": 47712896, "step": 39230 }, { "epoch": 4.91605062022303, "grad_norm": 0.06141606345772743, "learning_rate": 9.366344026935901e-06, "loss": 0.4631, "num_input_tokens_seen": 47718912, "step": 39235 }, { "epoch": 4.916677108131813, "grad_norm": 0.06496275216341019, "learning_rate": 9.366077620671763e-06, "loss": 0.4649, "num_input_tokens_seen": 47725088, "step": 39240 }, { "epoch": 4.917303596040597, "grad_norm": 0.07383958995342255, "learning_rate": 9.365811162207497e-06, "loss": 0.4607, "num_input_tokens_seen": 47731328, "step": 39245 }, { "epoch": 4.91793008394938, "grad_norm": 0.12886586785316467, "learning_rate": 9.365544651546287e-06, "loss": 0.4657, "num_input_tokens_seen": 47737440, "step": 39250 }, { "epoch": 4.918556571858163, "grad_norm": 0.07441854476928711, "learning_rate": 9.36527808869132e-06, "loss": 0.4635, "num_input_tokens_seen": 47743104, "step": 39255 }, { "epoch": 4.919183059766946, "grad_norm": 0.09921000152826309, "learning_rate": 9.365011473645784e-06, "loss": 0.4641, "num_input_tokens_seen": 47749440, "step": 39260 }, { "epoch": 4.9198095476757295, "grad_norm": 0.03361018747091293, "learning_rate": 9.364744806412866e-06, "loss": 0.4641, "num_input_tokens_seen": 47755808, "step": 39265 }, { "epoch": 4.920436035584514, "grad_norm": 0.033306196331977844, "learning_rate": 9.364478086995754e-06, "loss": 0.4642, "num_input_tokens_seen": 47762048, "step": 39270 }, { "epoch": 4.921062523493297, "grad_norm": 0.0634264275431633, "learning_rate": 9.364211315397638e-06, "loss": 0.462, "num_input_tokens_seen": 47768416, "step": 39275 }, { "epoch": 4.92168901140208, "grad_norm": 0.06932021677494049, "learning_rate": 9.363944491621706e-06, "loss": 0.4662, "num_input_tokens_seen": 47774560, "step": 39280 }, { "epoch": 4.922315499310863, "grad_norm": 0.06896961480379105, "learning_rate": 9.363677615671148e-06, "loss": 0.4626, "num_input_tokens_seen": 47780704, "step": 39285 }, { "epoch": 4.922941987219646, "grad_norm": 0.0634237751364708, "learning_rate": 9.363410687549156e-06, "loss": 0.463, "num_input_tokens_seen": 47786400, "step": 39290 }, { "epoch": 4.92356847512843, "grad_norm": 0.06873705983161926, "learning_rate": 9.363143707258919e-06, "loss": 0.4695, "num_input_tokens_seen": 47792608, "step": 39295 }, { "epoch": 4.9241949630372135, "grad_norm": 0.06457185000181198, "learning_rate": 9.362876674803633e-06, "loss": 0.4599, "num_input_tokens_seen": 47798880, "step": 39300 }, { "epoch": 4.924821450945997, "grad_norm": 0.07300090044736862, "learning_rate": 9.362609590186486e-06, "loss": 0.4657, "num_input_tokens_seen": 47804800, "step": 39305 }, { "epoch": 4.92544793885478, "grad_norm": 0.11193867027759552, "learning_rate": 9.362342453410675e-06, "loss": 0.4589, "num_input_tokens_seen": 47810944, "step": 39310 }, { "epoch": 4.926074426763563, "grad_norm": 0.0634332224726677, "learning_rate": 9.362075264479393e-06, "loss": 0.4622, "num_input_tokens_seen": 47817344, "step": 39315 }, { "epoch": 4.926700914672347, "grad_norm": 0.03158195689320564, "learning_rate": 9.361808023395834e-06, "loss": 0.4622, "num_input_tokens_seen": 47823904, "step": 39320 }, { "epoch": 4.92732740258113, "grad_norm": 0.06107798218727112, "learning_rate": 9.36154073016319e-06, "loss": 0.462, "num_input_tokens_seen": 47830336, "step": 39325 }, { "epoch": 4.927953890489913, "grad_norm": 0.07647901773452759, "learning_rate": 9.361273384784661e-06, "loss": 0.4636, "num_input_tokens_seen": 47836512, "step": 39330 }, { "epoch": 4.928580378398697, "grad_norm": 0.06610209494829178, "learning_rate": 9.361005987263442e-06, "loss": 0.4612, "num_input_tokens_seen": 47842688, "step": 39335 }, { "epoch": 4.929206866307481, "grad_norm": 0.08622424304485321, "learning_rate": 9.36073853760273e-06, "loss": 0.4641, "num_input_tokens_seen": 47848928, "step": 39340 }, { "epoch": 4.929833354216264, "grad_norm": 0.08604118973016739, "learning_rate": 9.360471035805723e-06, "loss": 0.4664, "num_input_tokens_seen": 47855104, "step": 39345 }, { "epoch": 4.930459842125047, "grad_norm": 0.07112066447734833, "learning_rate": 9.360203481875618e-06, "loss": 0.4587, "num_input_tokens_seen": 47861088, "step": 39350 }, { "epoch": 4.93108633003383, "grad_norm": 0.06259658187627792, "learning_rate": 9.359935875815615e-06, "loss": 0.4632, "num_input_tokens_seen": 47866784, "step": 39355 }, { "epoch": 4.931712817942614, "grad_norm": 0.06704916805028915, "learning_rate": 9.359668217628912e-06, "loss": 0.4606, "num_input_tokens_seen": 47873088, "step": 39360 }, { "epoch": 4.932339305851397, "grad_norm": 0.06769206374883652, "learning_rate": 9.35940050731871e-06, "loss": 0.4589, "num_input_tokens_seen": 47879296, "step": 39365 }, { "epoch": 4.9329657937601805, "grad_norm": 0.15152651071548462, "learning_rate": 9.359132744888212e-06, "loss": 0.4661, "num_input_tokens_seen": 47885440, "step": 39370 }, { "epoch": 4.933592281668964, "grad_norm": 0.10325842350721359, "learning_rate": 9.358864930340616e-06, "loss": 0.4691, "num_input_tokens_seen": 47891328, "step": 39375 }, { "epoch": 4.934218769577747, "grad_norm": 0.08947806805372238, "learning_rate": 9.358597063679124e-06, "loss": 0.4576, "num_input_tokens_seen": 47897376, "step": 39380 }, { "epoch": 4.934845257486531, "grad_norm": 0.11940504610538483, "learning_rate": 9.35832914490694e-06, "loss": 0.4639, "num_input_tokens_seen": 47903360, "step": 39385 }, { "epoch": 4.935471745395314, "grad_norm": 0.07411791384220123, "learning_rate": 9.358061174027267e-06, "loss": 0.468, "num_input_tokens_seen": 47909440, "step": 39390 }, { "epoch": 4.936098233304097, "grad_norm": 0.10702551901340485, "learning_rate": 9.357793151043308e-06, "loss": 0.4646, "num_input_tokens_seen": 47915040, "step": 39395 }, { "epoch": 4.93672472121288, "grad_norm": 0.09761958569288254, "learning_rate": 9.357525075958269e-06, "loss": 0.4569, "num_input_tokens_seen": 47920928, "step": 39400 }, { "epoch": 4.937351209121664, "grad_norm": 0.12419714778661728, "learning_rate": 9.357256948775354e-06, "loss": 0.4652, "num_input_tokens_seen": 47926880, "step": 39405 }, { "epoch": 4.937977697030448, "grad_norm": 0.1258164495229721, "learning_rate": 9.356988769497768e-06, "loss": 0.4545, "num_input_tokens_seen": 47932672, "step": 39410 }, { "epoch": 4.938604184939231, "grad_norm": 0.08388365805149078, "learning_rate": 9.35672053812872e-06, "loss": 0.4699, "num_input_tokens_seen": 47938752, "step": 39415 }, { "epoch": 4.939230672848014, "grad_norm": 0.07233881205320358, "learning_rate": 9.356452254671412e-06, "loss": 0.4585, "num_input_tokens_seen": 47944832, "step": 39420 }, { "epoch": 4.939857160756797, "grad_norm": 0.0814719945192337, "learning_rate": 9.356183919129057e-06, "loss": 0.4653, "num_input_tokens_seen": 47951040, "step": 39425 }, { "epoch": 4.94048364866558, "grad_norm": 0.11773581802845001, "learning_rate": 9.35591553150486e-06, "loss": 0.4636, "num_input_tokens_seen": 47957312, "step": 39430 }, { "epoch": 4.941110136574364, "grad_norm": 0.11445167660713196, "learning_rate": 9.355647091802031e-06, "loss": 0.4566, "num_input_tokens_seen": 47963808, "step": 39435 }, { "epoch": 4.9417366244831475, "grad_norm": 0.11452485620975494, "learning_rate": 9.35537860002378e-06, "loss": 0.4611, "num_input_tokens_seen": 47969536, "step": 39440 }, { "epoch": 4.942363112391931, "grad_norm": 0.03765375167131424, "learning_rate": 9.355110056173313e-06, "loss": 0.4624, "num_input_tokens_seen": 47975360, "step": 39445 }, { "epoch": 4.942989600300714, "grad_norm": 0.06660926342010498, "learning_rate": 9.354841460253845e-06, "loss": 0.4578, "num_input_tokens_seen": 47981664, "step": 39450 }, { "epoch": 4.943616088209498, "grad_norm": 0.10229909420013428, "learning_rate": 9.354572812268588e-06, "loss": 0.4615, "num_input_tokens_seen": 47987072, "step": 39455 }, { "epoch": 4.944242576118281, "grad_norm": 0.08633504062891006, "learning_rate": 9.35430411222075e-06, "loss": 0.4703, "num_input_tokens_seen": 47993408, "step": 39460 }, { "epoch": 4.944869064027064, "grad_norm": 0.06567937880754471, "learning_rate": 9.354035360113545e-06, "loss": 0.4672, "num_input_tokens_seen": 47999392, "step": 39465 }, { "epoch": 4.9454955519358474, "grad_norm": 0.12115131318569183, "learning_rate": 9.353766555950187e-06, "loss": 0.4601, "num_input_tokens_seen": 48005344, "step": 39470 }, { "epoch": 4.946122039844631, "grad_norm": 0.08230150490999222, "learning_rate": 9.353497699733889e-06, "loss": 0.4615, "num_input_tokens_seen": 48011584, "step": 39475 }, { "epoch": 4.946748527753415, "grad_norm": 0.09880723804235458, "learning_rate": 9.353228791467868e-06, "loss": 0.4597, "num_input_tokens_seen": 48017632, "step": 39480 }, { "epoch": 4.947375015662198, "grad_norm": 0.08139586448669434, "learning_rate": 9.352959831155335e-06, "loss": 0.4644, "num_input_tokens_seen": 48023904, "step": 39485 }, { "epoch": 4.948001503570981, "grad_norm": 0.10375571995973587, "learning_rate": 9.352690818799507e-06, "loss": 0.4544, "num_input_tokens_seen": 48030208, "step": 39490 }, { "epoch": 4.948627991479764, "grad_norm": 0.07362174242734909, "learning_rate": 9.352421754403602e-06, "loss": 0.464, "num_input_tokens_seen": 48036416, "step": 39495 }, { "epoch": 4.949254479388548, "grad_norm": 0.08842092007398605, "learning_rate": 9.352152637970835e-06, "loss": 0.4651, "num_input_tokens_seen": 48042432, "step": 39500 }, { "epoch": 4.949880967297331, "grad_norm": 0.07999411970376968, "learning_rate": 9.351883469504423e-06, "loss": 0.4629, "num_input_tokens_seen": 48048704, "step": 39505 }, { "epoch": 4.950507455206115, "grad_norm": 0.06300417333841324, "learning_rate": 9.351614249007588e-06, "loss": 0.4629, "num_input_tokens_seen": 48054880, "step": 39510 }, { "epoch": 4.951133943114898, "grad_norm": 0.03405623883008957, "learning_rate": 9.351344976483544e-06, "loss": 0.4596, "num_input_tokens_seen": 48060544, "step": 39515 }, { "epoch": 4.951760431023681, "grad_norm": 0.06480821222066879, "learning_rate": 9.351075651935512e-06, "loss": 0.4598, "num_input_tokens_seen": 48066912, "step": 39520 }, { "epoch": 4.952386918932465, "grad_norm": 0.0830160453915596, "learning_rate": 9.350806275366713e-06, "loss": 0.4623, "num_input_tokens_seen": 48071616, "step": 39525 }, { "epoch": 4.953013406841248, "grad_norm": 0.1159113347530365, "learning_rate": 9.350536846780369e-06, "loss": 0.4682, "num_input_tokens_seen": 48077376, "step": 39530 }, { "epoch": 4.953639894750031, "grad_norm": 0.06836376339197159, "learning_rate": 9.350267366179698e-06, "loss": 0.4629, "num_input_tokens_seen": 48083424, "step": 39535 }, { "epoch": 4.9542663826588145, "grad_norm": 0.11398565769195557, "learning_rate": 9.34999783356792e-06, "loss": 0.458, "num_input_tokens_seen": 48089472, "step": 39540 }, { "epoch": 4.954892870567598, "grad_norm": 0.07152359932661057, "learning_rate": 9.349728248948265e-06, "loss": 0.4587, "num_input_tokens_seen": 48095712, "step": 39545 }, { "epoch": 4.955519358476382, "grad_norm": 0.07506255060434341, "learning_rate": 9.349458612323949e-06, "loss": 0.4604, "num_input_tokens_seen": 48101856, "step": 39550 }, { "epoch": 4.956145846385165, "grad_norm": 0.12304381281137466, "learning_rate": 9.3491889236982e-06, "loss": 0.4634, "num_input_tokens_seen": 48107936, "step": 39555 }, { "epoch": 4.956772334293948, "grad_norm": 0.07489479333162308, "learning_rate": 9.34891918307424e-06, "loss": 0.4604, "num_input_tokens_seen": 48114016, "step": 39560 }, { "epoch": 4.957398822202731, "grad_norm": 0.07489821314811707, "learning_rate": 9.348649390455295e-06, "loss": 0.4644, "num_input_tokens_seen": 48120192, "step": 39565 }, { "epoch": 4.958025310111514, "grad_norm": 0.07559210062026978, "learning_rate": 9.34837954584459e-06, "loss": 0.455, "num_input_tokens_seen": 48126528, "step": 39570 }, { "epoch": 4.958651798020298, "grad_norm": 0.09740887582302094, "learning_rate": 9.348109649245352e-06, "loss": 0.4621, "num_input_tokens_seen": 48132576, "step": 39575 }, { "epoch": 4.959278285929082, "grad_norm": 0.06729231774806976, "learning_rate": 9.347839700660808e-06, "loss": 0.4412, "num_input_tokens_seen": 48138560, "step": 39580 }, { "epoch": 4.959904773837865, "grad_norm": 0.1071876585483551, "learning_rate": 9.347569700094185e-06, "loss": 0.4709, "num_input_tokens_seen": 48144224, "step": 39585 }, { "epoch": 4.960531261746648, "grad_norm": 0.09798769652843475, "learning_rate": 9.347299647548711e-06, "loss": 0.4611, "num_input_tokens_seen": 48150176, "step": 39590 }, { "epoch": 4.961157749655432, "grad_norm": 0.13852530717849731, "learning_rate": 9.347029543027613e-06, "loss": 0.4656, "num_input_tokens_seen": 48156416, "step": 39595 }, { "epoch": 4.961784237564215, "grad_norm": 0.0870039239525795, "learning_rate": 9.346759386534124e-06, "loss": 0.4658, "num_input_tokens_seen": 48162752, "step": 39600 }, { "epoch": 4.962410725472998, "grad_norm": 0.08814197033643723, "learning_rate": 9.346489178071472e-06, "loss": 0.468, "num_input_tokens_seen": 48168736, "step": 39605 }, { "epoch": 4.9630372133817815, "grad_norm": 0.09442689269781113, "learning_rate": 9.346218917642886e-06, "loss": 0.4661, "num_input_tokens_seen": 48174816, "step": 39610 }, { "epoch": 4.9636637012905656, "grad_norm": 0.07515045255422592, "learning_rate": 9.3459486052516e-06, "loss": 0.4604, "num_input_tokens_seen": 48180736, "step": 39615 }, { "epoch": 4.964290189199349, "grad_norm": 0.06490160524845123, "learning_rate": 9.345678240900843e-06, "loss": 0.4635, "num_input_tokens_seen": 48186464, "step": 39620 }, { "epoch": 4.964916677108132, "grad_norm": 0.07062316685914993, "learning_rate": 9.345407824593851e-06, "loss": 0.4649, "num_input_tokens_seen": 48192768, "step": 39625 }, { "epoch": 4.965543165016915, "grad_norm": 0.07376472651958466, "learning_rate": 9.345137356333853e-06, "loss": 0.4599, "num_input_tokens_seen": 48198496, "step": 39630 }, { "epoch": 4.966169652925698, "grad_norm": 0.08950933814048767, "learning_rate": 9.344866836124086e-06, "loss": 0.4588, "num_input_tokens_seen": 48204864, "step": 39635 }, { "epoch": 4.966796140834482, "grad_norm": 0.038390763103961945, "learning_rate": 9.344596263967782e-06, "loss": 0.4658, "num_input_tokens_seen": 48210880, "step": 39640 }, { "epoch": 4.9674226287432655, "grad_norm": 0.09749521315097809, "learning_rate": 9.34432563986818e-06, "loss": 0.4701, "num_input_tokens_seen": 48216768, "step": 39645 }, { "epoch": 4.968049116652049, "grad_norm": 0.07306000590324402, "learning_rate": 9.344054963828508e-06, "loss": 0.4719, "num_input_tokens_seen": 48222560, "step": 39650 }, { "epoch": 4.968675604560832, "grad_norm": 0.12150649726390839, "learning_rate": 9.34378423585201e-06, "loss": 0.4531, "num_input_tokens_seen": 48228608, "step": 39655 }, { "epoch": 4.969302092469615, "grad_norm": 0.08534659445285797, "learning_rate": 9.343513455941917e-06, "loss": 0.4531, "num_input_tokens_seen": 48234848, "step": 39660 }, { "epoch": 4.969928580378399, "grad_norm": 0.19309945404529572, "learning_rate": 9.34324262410147e-06, "loss": 0.464, "num_input_tokens_seen": 48240864, "step": 39665 }, { "epoch": 4.970555068287182, "grad_norm": 0.12594765424728394, "learning_rate": 9.342971740333905e-06, "loss": 0.4609, "num_input_tokens_seen": 48246208, "step": 39670 }, { "epoch": 4.971181556195965, "grad_norm": 0.14741764962673187, "learning_rate": 9.342700804642465e-06, "loss": 0.4683, "num_input_tokens_seen": 48252544, "step": 39675 }, { "epoch": 4.9718080441047485, "grad_norm": 0.1301506906747818, "learning_rate": 9.342429817030381e-06, "loss": 0.468, "num_input_tokens_seen": 48257632, "step": 39680 }, { "epoch": 4.972434532013532, "grad_norm": 0.10514815896749496, "learning_rate": 9.3421587775009e-06, "loss": 0.4641, "num_input_tokens_seen": 48263552, "step": 39685 }, { "epoch": 4.973061019922316, "grad_norm": 0.07176563888788223, "learning_rate": 9.34188768605726e-06, "loss": 0.4627, "num_input_tokens_seen": 48269792, "step": 39690 }, { "epoch": 4.973687507831099, "grad_norm": 0.0313209667801857, "learning_rate": 9.341616542702702e-06, "loss": 0.454, "num_input_tokens_seen": 48276000, "step": 39695 }, { "epoch": 4.974313995739882, "grad_norm": 0.07205697894096375, "learning_rate": 9.34134534744047e-06, "loss": 0.4617, "num_input_tokens_seen": 48281664, "step": 39700 }, { "epoch": 4.974940483648665, "grad_norm": 0.08392824977636337, "learning_rate": 9.341074100273802e-06, "loss": 0.4563, "num_input_tokens_seen": 48288064, "step": 39705 }, { "epoch": 4.975566971557449, "grad_norm": 0.11227257549762726, "learning_rate": 9.340802801205943e-06, "loss": 0.4664, "num_input_tokens_seen": 48294368, "step": 39710 }, { "epoch": 4.9761934594662325, "grad_norm": 0.10252845287322998, "learning_rate": 9.340531450240137e-06, "loss": 0.47, "num_input_tokens_seen": 48300480, "step": 39715 }, { "epoch": 4.976819947375016, "grad_norm": 0.13428130745887756, "learning_rate": 9.34026004737963e-06, "loss": 0.4702, "num_input_tokens_seen": 48306688, "step": 39720 }, { "epoch": 4.977446435283799, "grad_norm": 0.07310524582862854, "learning_rate": 9.339988592627663e-06, "loss": 0.465, "num_input_tokens_seen": 48312800, "step": 39725 }, { "epoch": 4.978072923192583, "grad_norm": 0.11725963652133942, "learning_rate": 9.339717085987486e-06, "loss": 0.4647, "num_input_tokens_seen": 48319008, "step": 39730 }, { "epoch": 4.978699411101366, "grad_norm": 0.12488497048616409, "learning_rate": 9.33944552746234e-06, "loss": 0.4584, "num_input_tokens_seen": 48325536, "step": 39735 }, { "epoch": 4.979325899010149, "grad_norm": 0.07121150940656662, "learning_rate": 9.339173917055475e-06, "loss": 0.4652, "num_input_tokens_seen": 48331680, "step": 39740 }, { "epoch": 4.979952386918932, "grad_norm": 0.06208806112408638, "learning_rate": 9.33890225477014e-06, "loss": 0.4539, "num_input_tokens_seen": 48336896, "step": 39745 }, { "epoch": 4.9805788748277156, "grad_norm": 0.07205915451049805, "learning_rate": 9.338630540609578e-06, "loss": 0.4658, "num_input_tokens_seen": 48343072, "step": 39750 }, { "epoch": 4.9812053627365, "grad_norm": 0.062271058559417725, "learning_rate": 9.33835877457704e-06, "loss": 0.4617, "num_input_tokens_seen": 48349088, "step": 39755 }, { "epoch": 4.981831850645283, "grad_norm": 0.06866521388292313, "learning_rate": 9.338086956675777e-06, "loss": 0.4687, "num_input_tokens_seen": 48354816, "step": 39760 }, { "epoch": 4.982458338554066, "grad_norm": 0.09372232109308243, "learning_rate": 9.337815086909036e-06, "loss": 0.4617, "num_input_tokens_seen": 48360736, "step": 39765 }, { "epoch": 4.983084826462849, "grad_norm": 0.03622663393616676, "learning_rate": 9.337543165280068e-06, "loss": 0.4636, "num_input_tokens_seen": 48366784, "step": 39770 }, { "epoch": 4.983711314371632, "grad_norm": 0.09986115247011185, "learning_rate": 9.337271191792125e-06, "loss": 0.462, "num_input_tokens_seen": 48372576, "step": 39775 }, { "epoch": 4.984337802280416, "grad_norm": 0.030355028808116913, "learning_rate": 9.33699916644846e-06, "loss": 0.4613, "num_input_tokens_seen": 48378496, "step": 39780 }, { "epoch": 4.9849642901891995, "grad_norm": 0.11968973278999329, "learning_rate": 9.336727089252323e-06, "loss": 0.4654, "num_input_tokens_seen": 48384960, "step": 39785 }, { "epoch": 4.985590778097983, "grad_norm": 0.06984115391969681, "learning_rate": 9.336454960206966e-06, "loss": 0.4625, "num_input_tokens_seen": 48390624, "step": 39790 }, { "epoch": 4.986217266006766, "grad_norm": 0.06409601867198944, "learning_rate": 9.336182779315646e-06, "loss": 0.464, "num_input_tokens_seen": 48396544, "step": 39795 }, { "epoch": 4.986843753915549, "grad_norm": 0.0640636458992958, "learning_rate": 9.335910546581615e-06, "loss": 0.4579, "num_input_tokens_seen": 48402816, "step": 39800 }, { "epoch": 4.987470241824333, "grad_norm": 0.033046916127204895, "learning_rate": 9.335638262008126e-06, "loss": 0.4636, "num_input_tokens_seen": 48408896, "step": 39805 }, { "epoch": 4.988096729733116, "grad_norm": 0.06800977885723114, "learning_rate": 9.33536592559844e-06, "loss": 0.4664, "num_input_tokens_seen": 48414944, "step": 39810 }, { "epoch": 4.988723217641899, "grad_norm": 0.03233341872692108, "learning_rate": 9.335093537355807e-06, "loss": 0.469, "num_input_tokens_seen": 48421024, "step": 39815 }, { "epoch": 4.989349705550683, "grad_norm": 0.09798551350831985, "learning_rate": 9.334821097283484e-06, "loss": 0.466, "num_input_tokens_seen": 48426976, "step": 39820 }, { "epoch": 4.989976193459466, "grad_norm": 0.06508460640907288, "learning_rate": 9.334548605384732e-06, "loss": 0.4563, "num_input_tokens_seen": 48433216, "step": 39825 }, { "epoch": 4.99060268136825, "grad_norm": 0.1205703392624855, "learning_rate": 9.334276061662808e-06, "loss": 0.4548, "num_input_tokens_seen": 48438496, "step": 39830 }, { "epoch": 4.991229169277033, "grad_norm": 0.07533171027898788, "learning_rate": 9.33400346612097e-06, "loss": 0.4564, "num_input_tokens_seen": 48444768, "step": 39835 }, { "epoch": 4.991855657185816, "grad_norm": 0.14235475659370422, "learning_rate": 9.333730818762476e-06, "loss": 0.4657, "num_input_tokens_seen": 48450944, "step": 39840 }, { "epoch": 4.992482145094599, "grad_norm": 0.08291850239038467, "learning_rate": 9.333458119590586e-06, "loss": 0.464, "num_input_tokens_seen": 48456992, "step": 39845 }, { "epoch": 4.993108633003383, "grad_norm": 0.10293015837669373, "learning_rate": 9.333185368608562e-06, "loss": 0.464, "num_input_tokens_seen": 48463072, "step": 39850 }, { "epoch": 4.9937351209121665, "grad_norm": 0.1100001260638237, "learning_rate": 9.332912565819662e-06, "loss": 0.4678, "num_input_tokens_seen": 48469152, "step": 39855 }, { "epoch": 4.99436160882095, "grad_norm": 0.07313167303800583, "learning_rate": 9.33263971122715e-06, "loss": 0.4535, "num_input_tokens_seen": 48475296, "step": 39860 }, { "epoch": 4.994988096729733, "grad_norm": 0.06675127893686295, "learning_rate": 9.332366804834287e-06, "loss": 0.4669, "num_input_tokens_seen": 48481344, "step": 39865 }, { "epoch": 4.995614584638517, "grad_norm": 0.07051888108253479, "learning_rate": 9.33209384664434e-06, "loss": 0.4617, "num_input_tokens_seen": 48487488, "step": 39870 }, { "epoch": 4.9962410725473, "grad_norm": 0.07456955313682556, "learning_rate": 9.331820836660564e-06, "loss": 0.4573, "num_input_tokens_seen": 48493280, "step": 39875 }, { "epoch": 4.996867560456083, "grad_norm": 0.11299916356801987, "learning_rate": 9.33154777488623e-06, "loss": 0.4626, "num_input_tokens_seen": 48499264, "step": 39880 }, { "epoch": 4.997494048364866, "grad_norm": 0.07348200678825378, "learning_rate": 9.331274661324601e-06, "loss": 0.4592, "num_input_tokens_seen": 48505120, "step": 39885 }, { "epoch": 4.99812053627365, "grad_norm": 0.09258321672677994, "learning_rate": 9.331001495978942e-06, "loss": 0.4676, "num_input_tokens_seen": 48511264, "step": 39890 }, { "epoch": 4.998747024182434, "grad_norm": 0.0736832246184349, "learning_rate": 9.330728278852518e-06, "loss": 0.462, "num_input_tokens_seen": 48516704, "step": 39895 }, { "epoch": 4.999373512091217, "grad_norm": 0.07734208554029465, "learning_rate": 9.330455009948595e-06, "loss": 0.4658, "num_input_tokens_seen": 48522656, "step": 39900 }, { "epoch": 5.0, "grad_norm": 0.09162705391645432, "learning_rate": 9.330181689270444e-06, "loss": 0.4625, "num_input_tokens_seen": 48528832, "step": 39905 }, { "epoch": 5.000626487908783, "grad_norm": 0.07316049188375473, "learning_rate": 9.32990831682133e-06, "loss": 0.4636, "num_input_tokens_seen": 48534816, "step": 39910 }, { "epoch": 5.001252975817566, "grad_norm": 0.07603053003549576, "learning_rate": 9.329634892604522e-06, "loss": 0.4525, "num_input_tokens_seen": 48540800, "step": 39915 }, { "epoch": 5.00187946372635, "grad_norm": 0.04064207524061203, "learning_rate": 9.329361416623286e-06, "loss": 0.4641, "num_input_tokens_seen": 48546560, "step": 39920 }, { "epoch": 5.002505951635134, "grad_norm": 0.06489446759223938, "learning_rate": 9.329087888880896e-06, "loss": 0.4648, "num_input_tokens_seen": 48552736, "step": 39925 }, { "epoch": 5.003132439543917, "grad_norm": 0.08629737049341202, "learning_rate": 9.328814309380622e-06, "loss": 0.4672, "num_input_tokens_seen": 48558944, "step": 39930 }, { "epoch": 5.0037589274527, "grad_norm": 0.03560153767466545, "learning_rate": 9.328540678125733e-06, "loss": 0.4543, "num_input_tokens_seen": 48565024, "step": 39935 }, { "epoch": 5.004385415361484, "grad_norm": 0.0629233792424202, "learning_rate": 9.3282669951195e-06, "loss": 0.461, "num_input_tokens_seen": 48570944, "step": 39940 }, { "epoch": 5.005011903270267, "grad_norm": 0.06811552494764328, "learning_rate": 9.327993260365196e-06, "loss": 0.4641, "num_input_tokens_seen": 48577120, "step": 39945 }, { "epoch": 5.00563839117905, "grad_norm": 0.0880395695567131, "learning_rate": 9.327719473866093e-06, "loss": 0.4651, "num_input_tokens_seen": 48583520, "step": 39950 }, { "epoch": 5.0062648790878335, "grad_norm": 0.0382920578122139, "learning_rate": 9.327445635625468e-06, "loss": 0.4586, "num_input_tokens_seen": 48589376, "step": 39955 }, { "epoch": 5.006891366996617, "grad_norm": 0.06690475344657898, "learning_rate": 9.327171745646588e-06, "loss": 0.4567, "num_input_tokens_seen": 48595776, "step": 39960 }, { "epoch": 5.007517854905401, "grad_norm": 0.11375173181295395, "learning_rate": 9.326897803932735e-06, "loss": 0.4614, "num_input_tokens_seen": 48602080, "step": 39965 }, { "epoch": 5.008144342814184, "grad_norm": 0.07488857209682465, "learning_rate": 9.32662381048718e-06, "loss": 0.4543, "num_input_tokens_seen": 48608224, "step": 39970 }, { "epoch": 5.008770830722967, "grad_norm": 0.06525259464979172, "learning_rate": 9.326349765313199e-06, "loss": 0.4564, "num_input_tokens_seen": 48614464, "step": 39975 }, { "epoch": 5.00939731863175, "grad_norm": 0.09206714481115341, "learning_rate": 9.32607566841407e-06, "loss": 0.4552, "num_input_tokens_seen": 48621152, "step": 39980 }, { "epoch": 5.010023806540533, "grad_norm": 0.14213411509990692, "learning_rate": 9.32580151979307e-06, "loss": 0.4664, "num_input_tokens_seen": 48627648, "step": 39985 }, { "epoch": 5.010650294449317, "grad_norm": 0.08595200628042221, "learning_rate": 9.325527319453473e-06, "loss": 0.4571, "num_input_tokens_seen": 48633696, "step": 39990 }, { "epoch": 5.011276782358101, "grad_norm": 0.12955033779144287, "learning_rate": 9.325253067398562e-06, "loss": 0.4589, "num_input_tokens_seen": 48640096, "step": 39995 }, { "epoch": 5.011903270266884, "grad_norm": 0.17800092697143555, "learning_rate": 9.324978763631616e-06, "loss": 0.46, "num_input_tokens_seen": 48646080, "step": 40000 }, { "epoch": 5.012529758175667, "grad_norm": 0.050504934042692184, "learning_rate": 9.32470440815591e-06, "loss": 0.4659, "num_input_tokens_seen": 48652160, "step": 40005 }, { "epoch": 5.013156246084451, "grad_norm": 0.16213609278202057, "learning_rate": 9.324430000974727e-06, "loss": 0.4695, "num_input_tokens_seen": 48658112, "step": 40010 }, { "epoch": 5.013782733993234, "grad_norm": 0.09168156236410141, "learning_rate": 9.324155542091349e-06, "loss": 0.4584, "num_input_tokens_seen": 48663936, "step": 40015 }, { "epoch": 5.014409221902017, "grad_norm": 0.09121609479188919, "learning_rate": 9.323881031509055e-06, "loss": 0.4676, "num_input_tokens_seen": 48669536, "step": 40020 }, { "epoch": 5.0150357098108005, "grad_norm": 0.08958309143781662, "learning_rate": 9.323606469231128e-06, "loss": 0.4568, "num_input_tokens_seen": 48675840, "step": 40025 }, { "epoch": 5.015662197719584, "grad_norm": 0.038825660943984985, "learning_rate": 9.323331855260852e-06, "loss": 0.4676, "num_input_tokens_seen": 48681152, "step": 40030 }, { "epoch": 5.016288685628368, "grad_norm": 0.09318764507770538, "learning_rate": 9.323057189601506e-06, "loss": 0.4587, "num_input_tokens_seen": 48687232, "step": 40035 }, { "epoch": 5.016915173537151, "grad_norm": 0.07954955846071243, "learning_rate": 9.322782472256378e-06, "loss": 0.4571, "num_input_tokens_seen": 48693376, "step": 40040 }, { "epoch": 5.017541661445934, "grad_norm": 0.07298888266086578, "learning_rate": 9.322507703228752e-06, "loss": 0.457, "num_input_tokens_seen": 48699104, "step": 40045 }, { "epoch": 5.018168149354717, "grad_norm": 0.11105576902627945, "learning_rate": 9.322232882521911e-06, "loss": 0.4636, "num_input_tokens_seen": 48705280, "step": 40050 }, { "epoch": 5.0187946372635, "grad_norm": 0.09511848539113998, "learning_rate": 9.321958010139142e-06, "loss": 0.462, "num_input_tokens_seen": 48711488, "step": 40055 }, { "epoch": 5.0194211251722844, "grad_norm": 0.14110690355300903, "learning_rate": 9.321683086083732e-06, "loss": 0.4714, "num_input_tokens_seen": 48717792, "step": 40060 }, { "epoch": 5.020047613081068, "grad_norm": 0.0894516333937645, "learning_rate": 9.321408110358968e-06, "loss": 0.4607, "num_input_tokens_seen": 48723904, "step": 40065 }, { "epoch": 5.020674100989851, "grad_norm": 0.06811992824077606, "learning_rate": 9.321133082968134e-06, "loss": 0.4639, "num_input_tokens_seen": 48729984, "step": 40070 }, { "epoch": 5.021300588898634, "grad_norm": 0.10098675638437271, "learning_rate": 9.320858003914523e-06, "loss": 0.4523, "num_input_tokens_seen": 48736448, "step": 40075 }, { "epoch": 5.021927076807418, "grad_norm": 0.0788043886423111, "learning_rate": 9.320582873201423e-06, "loss": 0.4626, "num_input_tokens_seen": 48742880, "step": 40080 }, { "epoch": 5.022553564716201, "grad_norm": 0.07953206449747086, "learning_rate": 9.32030769083212e-06, "loss": 0.4622, "num_input_tokens_seen": 48748864, "step": 40085 }, { "epoch": 5.023180052624984, "grad_norm": 0.07681997120380402, "learning_rate": 9.320032456809906e-06, "loss": 0.4658, "num_input_tokens_seen": 48754880, "step": 40090 }, { "epoch": 5.0238065405337675, "grad_norm": 0.10009893774986267, "learning_rate": 9.319757171138073e-06, "loss": 0.4574, "num_input_tokens_seen": 48761120, "step": 40095 }, { "epoch": 5.024433028442551, "grad_norm": 0.08381283283233643, "learning_rate": 9.319481833819912e-06, "loss": 0.4652, "num_input_tokens_seen": 48767360, "step": 40100 }, { "epoch": 5.025059516351335, "grad_norm": 0.13393577933311462, "learning_rate": 9.319206444858712e-06, "loss": 0.4603, "num_input_tokens_seen": 48773728, "step": 40105 }, { "epoch": 5.025686004260118, "grad_norm": 0.14674170315265656, "learning_rate": 9.31893100425777e-06, "loss": 0.4613, "num_input_tokens_seen": 48779616, "step": 40110 }, { "epoch": 5.026312492168901, "grad_norm": 0.07529885321855545, "learning_rate": 9.318655512020374e-06, "loss": 0.4609, "num_input_tokens_seen": 48785248, "step": 40115 }, { "epoch": 5.026938980077684, "grad_norm": 0.07885758578777313, "learning_rate": 9.318379968149822e-06, "loss": 0.4682, "num_input_tokens_seen": 48791616, "step": 40120 }, { "epoch": 5.027565467986468, "grad_norm": 0.07168187946081161, "learning_rate": 9.318104372649408e-06, "loss": 0.4583, "num_input_tokens_seen": 48797984, "step": 40125 }, { "epoch": 5.0281919558952515, "grad_norm": 0.13822950422763824, "learning_rate": 9.317828725522424e-06, "loss": 0.4614, "num_input_tokens_seen": 48804224, "step": 40130 }, { "epoch": 5.028818443804035, "grad_norm": 0.0809122771024704, "learning_rate": 9.317553026772169e-06, "loss": 0.4557, "num_input_tokens_seen": 48810816, "step": 40135 }, { "epoch": 5.029444931712818, "grad_norm": 0.07710719108581543, "learning_rate": 9.317277276401935e-06, "loss": 0.4703, "num_input_tokens_seen": 48816896, "step": 40140 }, { "epoch": 5.030071419621601, "grad_norm": 0.07793599367141724, "learning_rate": 9.317001474415022e-06, "loss": 0.4597, "num_input_tokens_seen": 48823296, "step": 40145 }, { "epoch": 5.030697907530385, "grad_norm": 0.09308412671089172, "learning_rate": 9.316725620814728e-06, "loss": 0.4669, "num_input_tokens_seen": 48829632, "step": 40150 }, { "epoch": 5.031324395439168, "grad_norm": 0.07387151569128036, "learning_rate": 9.31644971560435e-06, "loss": 0.4581, "num_input_tokens_seen": 48835712, "step": 40155 }, { "epoch": 5.031950883347951, "grad_norm": 0.08235767483711243, "learning_rate": 9.316173758787186e-06, "loss": 0.458, "num_input_tokens_seen": 48841792, "step": 40160 }, { "epoch": 5.0325773712567345, "grad_norm": 0.0927613154053688, "learning_rate": 9.315897750366535e-06, "loss": 0.4556, "num_input_tokens_seen": 48847936, "step": 40165 }, { "epoch": 5.033203859165518, "grad_norm": 0.07989974319934845, "learning_rate": 9.3156216903457e-06, "loss": 0.4598, "num_input_tokens_seen": 48854048, "step": 40170 }, { "epoch": 5.033830347074302, "grad_norm": 0.12859797477722168, "learning_rate": 9.315345578727977e-06, "loss": 0.4564, "num_input_tokens_seen": 48860064, "step": 40175 }, { "epoch": 5.034456834983085, "grad_norm": 0.08823690563440323, "learning_rate": 9.31506941551667e-06, "loss": 0.4611, "num_input_tokens_seen": 48866144, "step": 40180 }, { "epoch": 5.035083322891868, "grad_norm": 0.11689230799674988, "learning_rate": 9.31479320071508e-06, "loss": 0.4658, "num_input_tokens_seen": 48871616, "step": 40185 }, { "epoch": 5.035709810800651, "grad_norm": 0.10805033147335052, "learning_rate": 9.314516934326512e-06, "loss": 0.4598, "num_input_tokens_seen": 48877568, "step": 40190 }, { "epoch": 5.036336298709435, "grad_norm": 0.12195058912038803, "learning_rate": 9.314240616354264e-06, "loss": 0.4514, "num_input_tokens_seen": 48883584, "step": 40195 }, { "epoch": 5.0369627866182185, "grad_norm": 0.15236404538154602, "learning_rate": 9.313964246801644e-06, "loss": 0.4656, "num_input_tokens_seen": 48889280, "step": 40200 }, { "epoch": 5.037589274527002, "grad_norm": 0.14568805694580078, "learning_rate": 9.313687825671952e-06, "loss": 0.4588, "num_input_tokens_seen": 48895744, "step": 40205 }, { "epoch": 5.038215762435785, "grad_norm": 0.10927369445562363, "learning_rate": 9.313411352968498e-06, "loss": 0.4652, "num_input_tokens_seen": 48901920, "step": 40210 }, { "epoch": 5.038842250344568, "grad_norm": 0.1129998192191124, "learning_rate": 9.313134828694583e-06, "loss": 0.4595, "num_input_tokens_seen": 48907744, "step": 40215 }, { "epoch": 5.039468738253352, "grad_norm": 0.2196814864873886, "learning_rate": 9.312858252853514e-06, "loss": 0.4621, "num_input_tokens_seen": 48914016, "step": 40220 }, { "epoch": 5.040095226162135, "grad_norm": 333.12701416015625, "learning_rate": 9.3125816254486e-06, "loss": 2.2287, "num_input_tokens_seen": 48920064, "step": 40225 }, { "epoch": 5.040721714070918, "grad_norm": 0.25675976276397705, "learning_rate": 9.312304946483148e-06, "loss": 0.458, "num_input_tokens_seen": 48926528, "step": 40230 }, { "epoch": 5.041348201979702, "grad_norm": 0.16758054494857788, "learning_rate": 9.312028215960464e-06, "loss": 0.4598, "num_input_tokens_seen": 48932800, "step": 40235 }, { "epoch": 5.041974689888485, "grad_norm": 0.09495382010936737, "learning_rate": 9.311751433883856e-06, "loss": 0.4507, "num_input_tokens_seen": 48938752, "step": 40240 }, { "epoch": 5.042601177797269, "grad_norm": 0.14940735697746277, "learning_rate": 9.311474600256636e-06, "loss": 0.4604, "num_input_tokens_seen": 48945216, "step": 40245 }, { "epoch": 5.043227665706052, "grad_norm": 0.09162232279777527, "learning_rate": 9.311197715082112e-06, "loss": 0.4681, "num_input_tokens_seen": 48951296, "step": 40250 }, { "epoch": 5.043854153614835, "grad_norm": 0.06713790446519852, "learning_rate": 9.310920778363594e-06, "loss": 0.4666, "num_input_tokens_seen": 48957408, "step": 40255 }, { "epoch": 5.044480641523618, "grad_norm": 0.09245414286851883, "learning_rate": 9.310643790104393e-06, "loss": 0.4598, "num_input_tokens_seen": 48963456, "step": 40260 }, { "epoch": 5.045107129432402, "grad_norm": 0.1796598583459854, "learning_rate": 9.310366750307822e-06, "loss": 0.4598, "num_input_tokens_seen": 48969600, "step": 40265 }, { "epoch": 5.0457336173411855, "grad_norm": 0.0888991430401802, "learning_rate": 9.310089658977194e-06, "loss": 0.4587, "num_input_tokens_seen": 48975808, "step": 40270 }, { "epoch": 5.046360105249969, "grad_norm": 0.10770872980356216, "learning_rate": 9.30981251611582e-06, "loss": 0.4572, "num_input_tokens_seen": 48981792, "step": 40275 }, { "epoch": 5.046986593158752, "grad_norm": 0.1477431207895279, "learning_rate": 9.309535321727012e-06, "loss": 0.4671, "num_input_tokens_seen": 48987968, "step": 40280 }, { "epoch": 5.047613081067535, "grad_norm": 0.11842463910579681, "learning_rate": 9.309258075814086e-06, "loss": 0.4731, "num_input_tokens_seen": 48993248, "step": 40285 }, { "epoch": 5.048239568976319, "grad_norm": 0.13220341503620148, "learning_rate": 9.308980778380358e-06, "loss": 0.4463, "num_input_tokens_seen": 48999136, "step": 40290 }, { "epoch": 5.048866056885102, "grad_norm": 0.09475463628768921, "learning_rate": 9.308703429429142e-06, "loss": 0.4699, "num_input_tokens_seen": 49005376, "step": 40295 }, { "epoch": 5.049492544793885, "grad_norm": 0.10706750303506851, "learning_rate": 9.308426028963755e-06, "loss": 0.4613, "num_input_tokens_seen": 49011584, "step": 40300 }, { "epoch": 5.050119032702669, "grad_norm": 0.10479845851659775, "learning_rate": 9.30814857698751e-06, "loss": 0.4586, "num_input_tokens_seen": 49018240, "step": 40305 }, { "epoch": 5.050745520611452, "grad_norm": 0.13074244558811188, "learning_rate": 9.307871073503729e-06, "loss": 0.4566, "num_input_tokens_seen": 49024672, "step": 40310 }, { "epoch": 5.051372008520236, "grad_norm": 0.18867981433868408, "learning_rate": 9.307593518515725e-06, "loss": 0.456, "num_input_tokens_seen": 49030784, "step": 40315 }, { "epoch": 5.051998496429019, "grad_norm": 0.20224492251873016, "learning_rate": 9.307315912026821e-06, "loss": 0.469, "num_input_tokens_seen": 49036960, "step": 40320 }, { "epoch": 5.052624984337802, "grad_norm": 0.1431134194135666, "learning_rate": 9.307038254040332e-06, "loss": 0.4621, "num_input_tokens_seen": 49043040, "step": 40325 }, { "epoch": 5.053251472246585, "grad_norm": 0.18792641162872314, "learning_rate": 9.306760544559582e-06, "loss": 0.4617, "num_input_tokens_seen": 49049312, "step": 40330 }, { "epoch": 5.053877960155369, "grad_norm": 0.17201317846775055, "learning_rate": 9.306482783587886e-06, "loss": 0.4637, "num_input_tokens_seen": 49055552, "step": 40335 }, { "epoch": 5.0545044480641526, "grad_norm": 0.2228466421365738, "learning_rate": 9.30620497112857e-06, "loss": 0.4532, "num_input_tokens_seen": 49061600, "step": 40340 }, { "epoch": 5.055130935972936, "grad_norm": 0.15611153841018677, "learning_rate": 9.305927107184952e-06, "loss": 0.4668, "num_input_tokens_seen": 49067520, "step": 40345 }, { "epoch": 5.055757423881719, "grad_norm": 0.28214552998542786, "learning_rate": 9.305649191760355e-06, "loss": 0.447, "num_input_tokens_seen": 49073696, "step": 40350 }, { "epoch": 5.056383911790502, "grad_norm": 0.4048755466938019, "learning_rate": 9.305371224858103e-06, "loss": 0.4582, "num_input_tokens_seen": 49080032, "step": 40355 }, { "epoch": 5.057010399699286, "grad_norm": 0.2924560010433197, "learning_rate": 9.305093206481517e-06, "loss": 0.4653, "num_input_tokens_seen": 49086208, "step": 40360 }, { "epoch": 5.057636887608069, "grad_norm": 0.3119800388813019, "learning_rate": 9.304815136633923e-06, "loss": 0.464, "num_input_tokens_seen": 49092192, "step": 40365 }, { "epoch": 5.0582633755168525, "grad_norm": 0.14799556136131287, "learning_rate": 9.304537015318645e-06, "loss": 0.4618, "num_input_tokens_seen": 49098336, "step": 40370 }, { "epoch": 5.058889863425636, "grad_norm": 0.21792499721050262, "learning_rate": 9.304258842539007e-06, "loss": 0.4498, "num_input_tokens_seen": 49104448, "step": 40375 }, { "epoch": 5.05951635133442, "grad_norm": 0.16435739398002625, "learning_rate": 9.303980618298337e-06, "loss": 0.4656, "num_input_tokens_seen": 49110752, "step": 40380 }, { "epoch": 5.060142839243203, "grad_norm": 0.11450830847024918, "learning_rate": 9.303702342599957e-06, "loss": 0.4623, "num_input_tokens_seen": 49116640, "step": 40385 }, { "epoch": 5.060769327151986, "grad_norm": 0.09954848140478134, "learning_rate": 9.3034240154472e-06, "loss": 0.4708, "num_input_tokens_seen": 49122752, "step": 40390 }, { "epoch": 5.061395815060769, "grad_norm": 0.10158693045377731, "learning_rate": 9.30314563684339e-06, "loss": 0.468, "num_input_tokens_seen": 49128896, "step": 40395 }, { "epoch": 5.062022302969552, "grad_norm": 0.08608365058898926, "learning_rate": 9.302867206791857e-06, "loss": 0.4617, "num_input_tokens_seen": 49134880, "step": 40400 }, { "epoch": 5.062648790878336, "grad_norm": 0.055813100188970566, "learning_rate": 9.302588725295927e-06, "loss": 0.4662, "num_input_tokens_seen": 49140832, "step": 40405 }, { "epoch": 5.06327527878712, "grad_norm": 0.08744321763515472, "learning_rate": 9.302310192358932e-06, "loss": 0.4658, "num_input_tokens_seen": 49146944, "step": 40410 }, { "epoch": 5.063901766695903, "grad_norm": 0.11966855823993683, "learning_rate": 9.302031607984203e-06, "loss": 0.4565, "num_input_tokens_seen": 49152704, "step": 40415 }, { "epoch": 5.064528254604686, "grad_norm": 0.09875357896089554, "learning_rate": 9.301752972175067e-06, "loss": 0.4658, "num_input_tokens_seen": 49158752, "step": 40420 }, { "epoch": 5.065154742513469, "grad_norm": 0.09408614039421082, "learning_rate": 9.30147428493486e-06, "loss": 0.4581, "num_input_tokens_seen": 49164864, "step": 40425 }, { "epoch": 5.065781230422253, "grad_norm": 0.061251431703567505, "learning_rate": 9.301195546266909e-06, "loss": 0.459, "num_input_tokens_seen": 49170880, "step": 40430 }, { "epoch": 5.066407718331036, "grad_norm": 0.08701871335506439, "learning_rate": 9.30091675617455e-06, "loss": 0.4604, "num_input_tokens_seen": 49176896, "step": 40435 }, { "epoch": 5.0670342062398195, "grad_norm": 0.093665711581707, "learning_rate": 9.300637914661115e-06, "loss": 0.4665, "num_input_tokens_seen": 49183168, "step": 40440 }, { "epoch": 5.067660694148603, "grad_norm": 0.11283581703901291, "learning_rate": 9.300359021729939e-06, "loss": 0.4616, "num_input_tokens_seen": 49188992, "step": 40445 }, { "epoch": 5.068287182057387, "grad_norm": 0.06824523210525513, "learning_rate": 9.300080077384354e-06, "loss": 0.468, "num_input_tokens_seen": 49195040, "step": 40450 }, { "epoch": 5.06891366996617, "grad_norm": 0.13575424253940582, "learning_rate": 9.299801081627696e-06, "loss": 0.4626, "num_input_tokens_seen": 49201344, "step": 40455 }, { "epoch": 5.069540157874953, "grad_norm": 0.15351469814777374, "learning_rate": 9.299522034463302e-06, "loss": 0.4516, "num_input_tokens_seen": 49207040, "step": 40460 }, { "epoch": 5.070166645783736, "grad_norm": 0.20650199055671692, "learning_rate": 9.299242935894509e-06, "loss": 0.453, "num_input_tokens_seen": 49213408, "step": 40465 }, { "epoch": 5.070793133692519, "grad_norm": 0.1708788275718689, "learning_rate": 9.298963785924649e-06, "loss": 0.4634, "num_input_tokens_seen": 49219552, "step": 40470 }, { "epoch": 5.071419621601303, "grad_norm": 0.1307975947856903, "learning_rate": 9.298684584557063e-06, "loss": 0.4738, "num_input_tokens_seen": 49225760, "step": 40475 }, { "epoch": 5.072046109510087, "grad_norm": 0.134197399020195, "learning_rate": 9.29840533179509e-06, "loss": 0.4686, "num_input_tokens_seen": 49232032, "step": 40480 }, { "epoch": 5.07267259741887, "grad_norm": 0.1909290850162506, "learning_rate": 9.298126027642066e-06, "loss": 0.4676, "num_input_tokens_seen": 49238176, "step": 40485 }, { "epoch": 5.073299085327653, "grad_norm": 0.11070414632558823, "learning_rate": 9.297846672101333e-06, "loss": 0.4632, "num_input_tokens_seen": 49244224, "step": 40490 }, { "epoch": 5.073925573236436, "grad_norm": 0.04203353077173233, "learning_rate": 9.297567265176228e-06, "loss": 0.4613, "num_input_tokens_seen": 49249632, "step": 40495 }, { "epoch": 5.07455206114522, "grad_norm": 0.0829155370593071, "learning_rate": 9.297287806870095e-06, "loss": 0.4633, "num_input_tokens_seen": 49255936, "step": 40500 }, { "epoch": 5.075178549054003, "grad_norm": 0.08070100843906403, "learning_rate": 9.297008297186271e-06, "loss": 0.4622, "num_input_tokens_seen": 49262016, "step": 40505 }, { "epoch": 5.0758050369627865, "grad_norm": 0.11476251482963562, "learning_rate": 9.296728736128103e-06, "loss": 0.4622, "num_input_tokens_seen": 49268256, "step": 40510 }, { "epoch": 5.07643152487157, "grad_norm": 0.07893432676792145, "learning_rate": 9.29644912369893e-06, "loss": 0.4686, "num_input_tokens_seen": 49274272, "step": 40515 }, { "epoch": 5.077058012780354, "grad_norm": 0.13935333490371704, "learning_rate": 9.296169459902093e-06, "loss": 0.4676, "num_input_tokens_seen": 49279936, "step": 40520 }, { "epoch": 5.077684500689137, "grad_norm": 0.13876193761825562, "learning_rate": 9.29588974474094e-06, "loss": 0.4637, "num_input_tokens_seen": 49286208, "step": 40525 }, { "epoch": 5.07831098859792, "grad_norm": 0.0898834690451622, "learning_rate": 9.295609978218814e-06, "loss": 0.4701, "num_input_tokens_seen": 49292160, "step": 40530 }, { "epoch": 5.078937476506703, "grad_norm": 0.0935160294175148, "learning_rate": 9.295330160339057e-06, "loss": 0.4613, "num_input_tokens_seen": 49298080, "step": 40535 }, { "epoch": 5.079563964415486, "grad_norm": 0.12100803107023239, "learning_rate": 9.295050291105017e-06, "loss": 0.4575, "num_input_tokens_seen": 49304224, "step": 40540 }, { "epoch": 5.0801904523242705, "grad_norm": 0.08334992080926895, "learning_rate": 9.29477037052004e-06, "loss": 0.4628, "num_input_tokens_seen": 49310560, "step": 40545 }, { "epoch": 5.080816940233054, "grad_norm": 0.05490754172205925, "learning_rate": 9.294490398587474e-06, "loss": 0.4699, "num_input_tokens_seen": 49316736, "step": 40550 }, { "epoch": 5.081443428141837, "grad_norm": 0.072983019053936, "learning_rate": 9.294210375310662e-06, "loss": 0.4689, "num_input_tokens_seen": 49321632, "step": 40555 }, { "epoch": 5.08206991605062, "grad_norm": 0.07975217700004578, "learning_rate": 9.293930300692957e-06, "loss": 0.4684, "num_input_tokens_seen": 49327680, "step": 40560 }, { "epoch": 5.082696403959403, "grad_norm": 0.09401433169841766, "learning_rate": 9.293650174737705e-06, "loss": 0.4611, "num_input_tokens_seen": 49333952, "step": 40565 }, { "epoch": 5.083322891868187, "grad_norm": 0.08278769254684448, "learning_rate": 9.293369997448253e-06, "loss": 0.4626, "num_input_tokens_seen": 49339840, "step": 40570 }, { "epoch": 5.08394937977697, "grad_norm": 0.14427517354488373, "learning_rate": 9.293089768827955e-06, "loss": 0.4632, "num_input_tokens_seen": 49345920, "step": 40575 }, { "epoch": 5.0845758676857535, "grad_norm": 0.07212769985198975, "learning_rate": 9.29280948888016e-06, "loss": 0.4621, "num_input_tokens_seen": 49351936, "step": 40580 }, { "epoch": 5.085202355594537, "grad_norm": 0.06754381209611893, "learning_rate": 9.292529157608217e-06, "loss": 0.4623, "num_input_tokens_seen": 49357888, "step": 40585 }, { "epoch": 5.085828843503321, "grad_norm": 0.08241615444421768, "learning_rate": 9.292248775015477e-06, "loss": 0.4637, "num_input_tokens_seen": 49363968, "step": 40590 }, { "epoch": 5.086455331412104, "grad_norm": 0.11411795765161514, "learning_rate": 9.291968341105297e-06, "loss": 0.4592, "num_input_tokens_seen": 49369888, "step": 40595 }, { "epoch": 5.087081819320887, "grad_norm": 0.08548591285943985, "learning_rate": 9.291687855881027e-06, "loss": 0.4597, "num_input_tokens_seen": 49376032, "step": 40600 }, { "epoch": 5.08770830722967, "grad_norm": 0.0693168044090271, "learning_rate": 9.29140731934602e-06, "loss": 0.4617, "num_input_tokens_seen": 49382368, "step": 40605 }, { "epoch": 5.088334795138453, "grad_norm": 0.12029831856489182, "learning_rate": 9.29112673150363e-06, "loss": 0.4626, "num_input_tokens_seen": 49388224, "step": 40610 }, { "epoch": 5.0889612830472375, "grad_norm": 0.07009264081716537, "learning_rate": 9.290846092357212e-06, "loss": 0.4614, "num_input_tokens_seen": 49394304, "step": 40615 }, { "epoch": 5.089587770956021, "grad_norm": 0.09269608557224274, "learning_rate": 9.290565401910121e-06, "loss": 0.4624, "num_input_tokens_seen": 49399968, "step": 40620 }, { "epoch": 5.090214258864804, "grad_norm": 0.09331532567739487, "learning_rate": 9.290284660165715e-06, "loss": 0.4574, "num_input_tokens_seen": 49406144, "step": 40625 }, { "epoch": 5.090840746773587, "grad_norm": 0.04844937473535538, "learning_rate": 9.290003867127348e-06, "loss": 0.4596, "num_input_tokens_seen": 49412640, "step": 40630 }, { "epoch": 5.091467234682371, "grad_norm": 0.06839849799871445, "learning_rate": 9.289723022798378e-06, "loss": 0.4569, "num_input_tokens_seen": 49418880, "step": 40635 }, { "epoch": 5.092093722591154, "grad_norm": 0.07068566232919693, "learning_rate": 9.289442127182162e-06, "loss": 0.4694, "num_input_tokens_seen": 49424928, "step": 40640 }, { "epoch": 5.092720210499937, "grad_norm": 0.11182279139757156, "learning_rate": 9.28916118028206e-06, "loss": 0.4651, "num_input_tokens_seen": 49430784, "step": 40645 }, { "epoch": 5.093346698408721, "grad_norm": 0.11524593830108643, "learning_rate": 9.288880182101428e-06, "loss": 0.4619, "num_input_tokens_seen": 49436960, "step": 40650 }, { "epoch": 5.093973186317504, "grad_norm": 0.07668156176805496, "learning_rate": 9.28859913264363e-06, "loss": 0.454, "num_input_tokens_seen": 49443104, "step": 40655 }, { "epoch": 5.094599674226288, "grad_norm": 0.07629817724227905, "learning_rate": 9.288318031912022e-06, "loss": 0.4604, "num_input_tokens_seen": 49448864, "step": 40660 }, { "epoch": 5.095226162135071, "grad_norm": 0.08003169298171997, "learning_rate": 9.288036879909967e-06, "loss": 0.4634, "num_input_tokens_seen": 49455008, "step": 40665 }, { "epoch": 5.095852650043854, "grad_norm": 0.14598016440868378, "learning_rate": 9.287755676640826e-06, "loss": 0.4524, "num_input_tokens_seen": 49460960, "step": 40670 }, { "epoch": 5.096479137952637, "grad_norm": 0.10227590054273605, "learning_rate": 9.287474422107962e-06, "loss": 0.4639, "num_input_tokens_seen": 49466944, "step": 40675 }, { "epoch": 5.0971056258614205, "grad_norm": 0.11732817441225052, "learning_rate": 9.287193116314734e-06, "loss": 0.4492, "num_input_tokens_seen": 49472928, "step": 40680 }, { "epoch": 5.0977321137702045, "grad_norm": 0.10313914716243744, "learning_rate": 9.28691175926451e-06, "loss": 0.4591, "num_input_tokens_seen": 49479072, "step": 40685 }, { "epoch": 5.098358601678988, "grad_norm": 0.1528875082731247, "learning_rate": 9.286630350960652e-06, "loss": 0.4618, "num_input_tokens_seen": 49484992, "step": 40690 }, { "epoch": 5.098985089587771, "grad_norm": 0.17903509736061096, "learning_rate": 9.286348891406522e-06, "loss": 0.4706, "num_input_tokens_seen": 49491328, "step": 40695 }, { "epoch": 5.099611577496554, "grad_norm": 0.1395144909620285, "learning_rate": 9.286067380605489e-06, "loss": 0.4723, "num_input_tokens_seen": 49497344, "step": 40700 }, { "epoch": 5.100238065405338, "grad_norm": 0.05264393612742424, "learning_rate": 9.285785818560918e-06, "loss": 0.4707, "num_input_tokens_seen": 49503648, "step": 40705 }, { "epoch": 5.100864553314121, "grad_norm": 0.08121326565742493, "learning_rate": 9.285504205276172e-06, "loss": 0.468, "num_input_tokens_seen": 49509568, "step": 40710 }, { "epoch": 5.101491041222904, "grad_norm": 0.1211298257112503, "learning_rate": 9.28522254075462e-06, "loss": 0.464, "num_input_tokens_seen": 49515424, "step": 40715 }, { "epoch": 5.102117529131688, "grad_norm": 0.07678845524787903, "learning_rate": 9.28494082499963e-06, "loss": 0.4612, "num_input_tokens_seen": 49521696, "step": 40720 }, { "epoch": 5.102744017040471, "grad_norm": 0.09874885529279709, "learning_rate": 9.284659058014571e-06, "loss": 0.4504, "num_input_tokens_seen": 49528032, "step": 40725 }, { "epoch": 5.103370504949255, "grad_norm": 0.1248941570520401, "learning_rate": 9.28437723980281e-06, "loss": 0.4648, "num_input_tokens_seen": 49534240, "step": 40730 }, { "epoch": 5.103996992858038, "grad_norm": 0.09064600616693497, "learning_rate": 9.284095370367717e-06, "loss": 0.4607, "num_input_tokens_seen": 49540128, "step": 40735 }, { "epoch": 5.104623480766821, "grad_norm": 0.0738450214266777, "learning_rate": 9.283813449712661e-06, "loss": 0.4583, "num_input_tokens_seen": 49546272, "step": 40740 }, { "epoch": 5.105249968675604, "grad_norm": 0.08640274405479431, "learning_rate": 9.283531477841016e-06, "loss": 0.4597, "num_input_tokens_seen": 49552384, "step": 40745 }, { "epoch": 5.1058764565843875, "grad_norm": 0.16318552196025848, "learning_rate": 9.283249454756148e-06, "loss": 0.4673, "num_input_tokens_seen": 49558208, "step": 40750 }, { "epoch": 5.1065029444931715, "grad_norm": 0.08631236851215363, "learning_rate": 9.282967380461434e-06, "loss": 0.4661, "num_input_tokens_seen": 49564256, "step": 40755 }, { "epoch": 5.107129432401955, "grad_norm": 0.08153769373893738, "learning_rate": 9.282685254960245e-06, "loss": 0.4569, "num_input_tokens_seen": 49570336, "step": 40760 }, { "epoch": 5.107755920310738, "grad_norm": 0.1313190758228302, "learning_rate": 9.282403078255949e-06, "loss": 0.4592, "num_input_tokens_seen": 49576384, "step": 40765 }, { "epoch": 5.108382408219521, "grad_norm": 0.04628051444888115, "learning_rate": 9.282120850351926e-06, "loss": 0.4627, "num_input_tokens_seen": 49582720, "step": 40770 }, { "epoch": 5.109008896128305, "grad_norm": 0.1318650096654892, "learning_rate": 9.28183857125155e-06, "loss": 0.4612, "num_input_tokens_seen": 49589056, "step": 40775 }, { "epoch": 5.109635384037088, "grad_norm": 0.11702878028154373, "learning_rate": 9.28155624095819e-06, "loss": 0.4655, "num_input_tokens_seen": 49595360, "step": 40780 }, { "epoch": 5.1102618719458714, "grad_norm": 0.08311792463064194, "learning_rate": 9.281273859475228e-06, "loss": 0.462, "num_input_tokens_seen": 49601184, "step": 40785 }, { "epoch": 5.110888359854655, "grad_norm": 0.07097712904214859, "learning_rate": 9.280991426806036e-06, "loss": 0.4555, "num_input_tokens_seen": 49607104, "step": 40790 }, { "epoch": 5.111514847763438, "grad_norm": 0.0933135449886322, "learning_rate": 9.280708942953993e-06, "loss": 0.4577, "num_input_tokens_seen": 49613152, "step": 40795 }, { "epoch": 5.112141335672222, "grad_norm": 0.10565619170665741, "learning_rate": 9.280426407922476e-06, "loss": 0.4605, "num_input_tokens_seen": 49619456, "step": 40800 }, { "epoch": 5.112767823581005, "grad_norm": 0.08497609198093414, "learning_rate": 9.280143821714862e-06, "loss": 0.4683, "num_input_tokens_seen": 49625664, "step": 40805 }, { "epoch": 5.113394311489788, "grad_norm": 0.07625498622655869, "learning_rate": 9.27986118433453e-06, "loss": 0.4637, "num_input_tokens_seen": 49632000, "step": 40810 }, { "epoch": 5.114020799398571, "grad_norm": 0.0758817046880722, "learning_rate": 9.279578495784859e-06, "loss": 0.4657, "num_input_tokens_seen": 49638208, "step": 40815 }, { "epoch": 5.1146472873073545, "grad_norm": 0.12341196089982986, "learning_rate": 9.27929575606923e-06, "loss": 0.4638, "num_input_tokens_seen": 49643616, "step": 40820 }, { "epoch": 5.115273775216139, "grad_norm": 0.08037170022726059, "learning_rate": 9.279012965191021e-06, "loss": 0.4644, "num_input_tokens_seen": 49649952, "step": 40825 }, { "epoch": 5.115900263124922, "grad_norm": 0.1065780371427536, "learning_rate": 9.278730123153615e-06, "loss": 0.4592, "num_input_tokens_seen": 49656288, "step": 40830 }, { "epoch": 5.116526751033705, "grad_norm": 0.09501427412033081, "learning_rate": 9.278447229960393e-06, "loss": 0.4623, "num_input_tokens_seen": 49662592, "step": 40835 }, { "epoch": 5.117153238942488, "grad_norm": 0.10109225660562515, "learning_rate": 9.278164285614738e-06, "loss": 0.459, "num_input_tokens_seen": 49668640, "step": 40840 }, { "epoch": 5.117779726851272, "grad_norm": 0.0755300298333168, "learning_rate": 9.277881290120032e-06, "loss": 0.4669, "num_input_tokens_seen": 49674720, "step": 40845 }, { "epoch": 5.118406214760055, "grad_norm": 0.09173960983753204, "learning_rate": 9.277598243479658e-06, "loss": 0.4622, "num_input_tokens_seen": 49680704, "step": 40850 }, { "epoch": 5.1190327026688385, "grad_norm": 0.08123775571584702, "learning_rate": 9.277315145697e-06, "loss": 0.4595, "num_input_tokens_seen": 49686816, "step": 40855 }, { "epoch": 5.119659190577622, "grad_norm": 0.06898919492959976, "learning_rate": 9.277031996775444e-06, "loss": 0.4595, "num_input_tokens_seen": 49692992, "step": 40860 }, { "epoch": 5.120285678486405, "grad_norm": 0.08440445363521576, "learning_rate": 9.276748796718374e-06, "loss": 0.4664, "num_input_tokens_seen": 49699328, "step": 40865 }, { "epoch": 5.120912166395189, "grad_norm": 0.0750993862748146, "learning_rate": 9.276465545529177e-06, "loss": 0.4641, "num_input_tokens_seen": 49705280, "step": 40870 }, { "epoch": 5.121538654303972, "grad_norm": 0.07624327391386032, "learning_rate": 9.276182243211238e-06, "loss": 0.457, "num_input_tokens_seen": 49711680, "step": 40875 }, { "epoch": 5.122165142212755, "grad_norm": 0.11156900972127914, "learning_rate": 9.275898889767946e-06, "loss": 0.4609, "num_input_tokens_seen": 49717824, "step": 40880 }, { "epoch": 5.122791630121538, "grad_norm": 0.08588439971208572, "learning_rate": 9.275615485202689e-06, "loss": 0.4614, "num_input_tokens_seen": 49723648, "step": 40885 }, { "epoch": 5.123418118030322, "grad_norm": 0.07891903817653656, "learning_rate": 9.275332029518853e-06, "loss": 0.4632, "num_input_tokens_seen": 49729888, "step": 40890 }, { "epoch": 5.124044605939106, "grad_norm": 0.07742326706647873, "learning_rate": 9.275048522719827e-06, "loss": 0.4665, "num_input_tokens_seen": 49735936, "step": 40895 }, { "epoch": 5.124671093847889, "grad_norm": 0.08214250206947327, "learning_rate": 9.274764964809004e-06, "loss": 0.4641, "num_input_tokens_seen": 49741312, "step": 40900 }, { "epoch": 5.125297581756672, "grad_norm": 0.046584371477365494, "learning_rate": 9.27448135578977e-06, "loss": 0.4651, "num_input_tokens_seen": 49747648, "step": 40905 }, { "epoch": 5.125924069665455, "grad_norm": 0.07547491788864136, "learning_rate": 9.274197695665517e-06, "loss": 0.4636, "num_input_tokens_seen": 49754016, "step": 40910 }, { "epoch": 5.126550557574239, "grad_norm": 0.07697120308876038, "learning_rate": 9.273913984439638e-06, "loss": 0.4621, "num_input_tokens_seen": 49759968, "step": 40915 }, { "epoch": 5.127177045483022, "grad_norm": 0.06052357703447342, "learning_rate": 9.273630222115524e-06, "loss": 0.4636, "num_input_tokens_seen": 49765856, "step": 40920 }, { "epoch": 5.1278035333918055, "grad_norm": 0.07192859053611755, "learning_rate": 9.273346408696569e-06, "loss": 0.4607, "num_input_tokens_seen": 49771648, "step": 40925 }, { "epoch": 5.128430021300589, "grad_norm": 0.10651164501905441, "learning_rate": 9.273062544186162e-06, "loss": 0.4637, "num_input_tokens_seen": 49778144, "step": 40930 }, { "epoch": 5.129056509209372, "grad_norm": 0.07965787500143051, "learning_rate": 9.272778628587702e-06, "loss": 0.4593, "num_input_tokens_seen": 49784448, "step": 40935 }, { "epoch": 5.129682997118156, "grad_norm": 0.07163353264331818, "learning_rate": 9.27249466190458e-06, "loss": 0.4621, "num_input_tokens_seen": 49790624, "step": 40940 }, { "epoch": 5.130309485026939, "grad_norm": 0.08723173290491104, "learning_rate": 9.27221064414019e-06, "loss": 0.4616, "num_input_tokens_seen": 49796768, "step": 40945 }, { "epoch": 5.130935972935722, "grad_norm": 0.06705817580223083, "learning_rate": 9.271926575297935e-06, "loss": 0.471, "num_input_tokens_seen": 49802944, "step": 40950 }, { "epoch": 5.131562460844505, "grad_norm": 0.07575935870409012, "learning_rate": 9.271642455381204e-06, "loss": 0.4656, "num_input_tokens_seen": 49809152, "step": 40955 }, { "epoch": 5.1321889487532895, "grad_norm": 0.0701141506433487, "learning_rate": 9.271358284393395e-06, "loss": 0.4564, "num_input_tokens_seen": 49815040, "step": 40960 }, { "epoch": 5.132815436662073, "grad_norm": 0.07760898023843765, "learning_rate": 9.271074062337907e-06, "loss": 0.4678, "num_input_tokens_seen": 49821408, "step": 40965 }, { "epoch": 5.133441924570856, "grad_norm": 0.10686890035867691, "learning_rate": 9.270789789218137e-06, "loss": 0.4568, "num_input_tokens_seen": 49827680, "step": 40970 }, { "epoch": 5.134068412479639, "grad_norm": 0.10427138209342957, "learning_rate": 9.270505465037486e-06, "loss": 0.4619, "num_input_tokens_seen": 49833760, "step": 40975 }, { "epoch": 5.134694900388422, "grad_norm": 0.04354977235198021, "learning_rate": 9.270221089799352e-06, "loss": 0.4668, "num_input_tokens_seen": 49840000, "step": 40980 }, { "epoch": 5.135321388297206, "grad_norm": 0.042197514325380325, "learning_rate": 9.269936663507133e-06, "loss": 0.4581, "num_input_tokens_seen": 49846208, "step": 40985 }, { "epoch": 5.135947876205989, "grad_norm": 0.09209185093641281, "learning_rate": 9.26965218616423e-06, "loss": 0.4639, "num_input_tokens_seen": 49852480, "step": 40990 }, { "epoch": 5.1365743641147725, "grad_norm": 0.05820250138640404, "learning_rate": 9.269367657774048e-06, "loss": 0.458, "num_input_tokens_seen": 49858496, "step": 40995 }, { "epoch": 5.137200852023556, "grad_norm": 0.1019623875617981, "learning_rate": 9.269083078339986e-06, "loss": 0.4657, "num_input_tokens_seen": 49864800, "step": 41000 }, { "epoch": 5.13782733993234, "grad_norm": 0.11766874045133591, "learning_rate": 9.268798447865445e-06, "loss": 0.4619, "num_input_tokens_seen": 49871040, "step": 41005 }, { "epoch": 5.138453827841123, "grad_norm": 0.07989881932735443, "learning_rate": 9.26851376635383e-06, "loss": 0.4692, "num_input_tokens_seen": 49877248, "step": 41010 }, { "epoch": 5.139080315749906, "grad_norm": 0.07682649046182632, "learning_rate": 9.268229033808546e-06, "loss": 0.4599, "num_input_tokens_seen": 49883456, "step": 41015 }, { "epoch": 5.139706803658689, "grad_norm": 0.04069165140390396, "learning_rate": 9.267944250232994e-06, "loss": 0.4688, "num_input_tokens_seen": 49889024, "step": 41020 }, { "epoch": 5.140333291567472, "grad_norm": 0.08848390728235245, "learning_rate": 9.26765941563058e-06, "loss": 0.4644, "num_input_tokens_seen": 49895168, "step": 41025 }, { "epoch": 5.1409597794762565, "grad_norm": 0.0685223713517189, "learning_rate": 9.26737453000471e-06, "loss": 0.4637, "num_input_tokens_seen": 49901184, "step": 41030 }, { "epoch": 5.14158626738504, "grad_norm": 0.08369149267673492, "learning_rate": 9.267089593358788e-06, "loss": 0.4631, "num_input_tokens_seen": 49906912, "step": 41035 }, { "epoch": 5.142212755293823, "grad_norm": 0.0866420790553093, "learning_rate": 9.266804605696225e-06, "loss": 0.4557, "num_input_tokens_seen": 49913248, "step": 41040 }, { "epoch": 5.142839243202606, "grad_norm": 0.0657707080245018, "learning_rate": 9.266519567020425e-06, "loss": 0.468, "num_input_tokens_seen": 49919456, "step": 41045 }, { "epoch": 5.143465731111389, "grad_norm": 0.07656297832727432, "learning_rate": 9.266234477334795e-06, "loss": 0.4609, "num_input_tokens_seen": 49925440, "step": 41050 }, { "epoch": 5.144092219020173, "grad_norm": 0.09200505912303925, "learning_rate": 9.265949336642746e-06, "loss": 0.4601, "num_input_tokens_seen": 49931680, "step": 41055 }, { "epoch": 5.144718706928956, "grad_norm": 0.07387354224920273, "learning_rate": 9.265664144947686e-06, "loss": 0.464, "num_input_tokens_seen": 49937984, "step": 41060 }, { "epoch": 5.1453451948377396, "grad_norm": 0.1350683569908142, "learning_rate": 9.265378902253025e-06, "loss": 0.4685, "num_input_tokens_seen": 49943808, "step": 41065 }, { "epoch": 5.145971682746523, "grad_norm": 0.04265790060162544, "learning_rate": 9.265093608562173e-06, "loss": 0.4571, "num_input_tokens_seen": 49949792, "step": 41070 }, { "epoch": 5.146598170655307, "grad_norm": 0.077320896089077, "learning_rate": 9.264808263878539e-06, "loss": 0.4695, "num_input_tokens_seen": 49956160, "step": 41075 }, { "epoch": 5.14722465856409, "grad_norm": 0.12625722587108612, "learning_rate": 9.26452286820554e-06, "loss": 0.4603, "num_input_tokens_seen": 49962048, "step": 41080 }, { "epoch": 5.147851146472873, "grad_norm": 0.038666144013404846, "learning_rate": 9.264237421546583e-06, "loss": 0.465, "num_input_tokens_seen": 49968448, "step": 41085 }, { "epoch": 5.148477634381656, "grad_norm": 0.16781842708587646, "learning_rate": 9.263951923905084e-06, "loss": 0.4625, "num_input_tokens_seen": 49974240, "step": 41090 }, { "epoch": 5.1491041222904395, "grad_norm": 0.044626329094171524, "learning_rate": 9.263666375284452e-06, "loss": 0.4633, "num_input_tokens_seen": 49980416, "step": 41095 }, { "epoch": 5.1497306101992235, "grad_norm": 0.07214254140853882, "learning_rate": 9.263380775688105e-06, "loss": 0.4634, "num_input_tokens_seen": 49986240, "step": 41100 }, { "epoch": 5.150357098108007, "grad_norm": 0.1011563315987587, "learning_rate": 9.263095125119457e-06, "loss": 0.4561, "num_input_tokens_seen": 49991840, "step": 41105 }, { "epoch": 5.15098358601679, "grad_norm": 0.09990492463111877, "learning_rate": 9.262809423581922e-06, "loss": 0.4666, "num_input_tokens_seen": 49997984, "step": 41110 }, { "epoch": 5.151610073925573, "grad_norm": 0.08420785516500473, "learning_rate": 9.262523671078916e-06, "loss": 0.4641, "num_input_tokens_seen": 50004160, "step": 41115 }, { "epoch": 5.152236561834356, "grad_norm": 0.10523374378681183, "learning_rate": 9.262237867613856e-06, "loss": 0.4587, "num_input_tokens_seen": 50010240, "step": 41120 }, { "epoch": 5.15286304974314, "grad_norm": 0.07790673524141312, "learning_rate": 9.261952013190159e-06, "loss": 0.4647, "num_input_tokens_seen": 50016256, "step": 41125 }, { "epoch": 5.153489537651923, "grad_norm": 0.07111041992902756, "learning_rate": 9.261666107811243e-06, "loss": 0.4626, "num_input_tokens_seen": 50022464, "step": 41130 }, { "epoch": 5.154116025560707, "grad_norm": 0.11673790961503983, "learning_rate": 9.261380151480524e-06, "loss": 0.4611, "num_input_tokens_seen": 50028544, "step": 41135 }, { "epoch": 5.15474251346949, "grad_norm": 0.04424264281988144, "learning_rate": 9.261094144201425e-06, "loss": 0.4642, "num_input_tokens_seen": 50034816, "step": 41140 }, { "epoch": 5.155369001378274, "grad_norm": 0.038315411657094955, "learning_rate": 9.260808085977362e-06, "loss": 0.4575, "num_input_tokens_seen": 50040800, "step": 41145 }, { "epoch": 5.155995489287057, "grad_norm": 0.13170531392097473, "learning_rate": 9.260521976811755e-06, "loss": 0.4653, "num_input_tokens_seen": 50046912, "step": 41150 }, { "epoch": 5.15662197719584, "grad_norm": 0.07075410336256027, "learning_rate": 9.260235816708027e-06, "loss": 0.4624, "num_input_tokens_seen": 50053248, "step": 41155 }, { "epoch": 5.157248465104623, "grad_norm": 0.17857970297336578, "learning_rate": 9.259949605669596e-06, "loss": 0.4671, "num_input_tokens_seen": 50059232, "step": 41160 }, { "epoch": 5.1578749530134065, "grad_norm": 0.07240886986255646, "learning_rate": 9.259663343699887e-06, "loss": 0.4578, "num_input_tokens_seen": 50065408, "step": 41165 }, { "epoch": 5.1585014409221905, "grad_norm": 0.07851291447877884, "learning_rate": 9.259377030802323e-06, "loss": 0.4615, "num_input_tokens_seen": 50071584, "step": 41170 }, { "epoch": 5.159127928830974, "grad_norm": 0.0862322673201561, "learning_rate": 9.259090666980324e-06, "loss": 0.4651, "num_input_tokens_seen": 50077760, "step": 41175 }, { "epoch": 5.159754416739757, "grad_norm": 0.1251676380634308, "learning_rate": 9.258804252237314e-06, "loss": 0.4662, "num_input_tokens_seen": 50083904, "step": 41180 }, { "epoch": 5.16038090464854, "grad_norm": 0.08358125388622284, "learning_rate": 9.258517786576722e-06, "loss": 0.4673, "num_input_tokens_seen": 50089952, "step": 41185 }, { "epoch": 5.161007392557323, "grad_norm": 0.06569821387529373, "learning_rate": 9.258231270001968e-06, "loss": 0.4611, "num_input_tokens_seen": 50096032, "step": 41190 }, { "epoch": 5.161633880466107, "grad_norm": 0.08675217628479004, "learning_rate": 9.257944702516477e-06, "loss": 0.4628, "num_input_tokens_seen": 50102016, "step": 41195 }, { "epoch": 5.16226036837489, "grad_norm": 0.04781540855765343, "learning_rate": 9.25765808412368e-06, "loss": 0.465, "num_input_tokens_seen": 50108128, "step": 41200 }, { "epoch": 5.162886856283674, "grad_norm": 0.06797385215759277, "learning_rate": 9.257371414827e-06, "loss": 0.4621, "num_input_tokens_seen": 50114464, "step": 41205 }, { "epoch": 5.163513344192457, "grad_norm": 0.1168072298169136, "learning_rate": 9.257084694629863e-06, "loss": 0.4624, "num_input_tokens_seen": 50120640, "step": 41210 }, { "epoch": 5.164139832101241, "grad_norm": 0.04640403389930725, "learning_rate": 9.256797923535702e-06, "loss": 0.4586, "num_input_tokens_seen": 50126400, "step": 41215 }, { "epoch": 5.164766320010024, "grad_norm": 0.10770171880722046, "learning_rate": 9.256511101547943e-06, "loss": 0.4636, "num_input_tokens_seen": 50132800, "step": 41220 }, { "epoch": 5.165392807918807, "grad_norm": 0.05585316941142082, "learning_rate": 9.256224228670014e-06, "loss": 0.4635, "num_input_tokens_seen": 50139456, "step": 41225 }, { "epoch": 5.16601929582759, "grad_norm": 0.06168999895453453, "learning_rate": 9.255937304905344e-06, "loss": 0.4611, "num_input_tokens_seen": 50144832, "step": 41230 }, { "epoch": 5.1666457837363735, "grad_norm": 0.1673327535390854, "learning_rate": 9.255650330257369e-06, "loss": 0.464, "num_input_tokens_seen": 50150944, "step": 41235 }, { "epoch": 5.167272271645158, "grad_norm": 0.07133873552083969, "learning_rate": 9.255363304729515e-06, "loss": 0.4618, "num_input_tokens_seen": 50156512, "step": 41240 }, { "epoch": 5.167898759553941, "grad_norm": 0.07513255625963211, "learning_rate": 9.255076228325215e-06, "loss": 0.4657, "num_input_tokens_seen": 50162592, "step": 41245 }, { "epoch": 5.168525247462724, "grad_norm": 0.07637662440538406, "learning_rate": 9.254789101047901e-06, "loss": 0.4553, "num_input_tokens_seen": 50168800, "step": 41250 }, { "epoch": 5.169151735371507, "grad_norm": 0.0667724758386612, "learning_rate": 9.254501922901008e-06, "loss": 0.4643, "num_input_tokens_seen": 50174368, "step": 41255 }, { "epoch": 5.169778223280291, "grad_norm": 0.06396903842687607, "learning_rate": 9.254214693887964e-06, "loss": 0.4641, "num_input_tokens_seen": 50180416, "step": 41260 }, { "epoch": 5.170404711189074, "grad_norm": 0.07995876669883728, "learning_rate": 9.253927414012208e-06, "loss": 0.4597, "num_input_tokens_seen": 50186336, "step": 41265 }, { "epoch": 5.1710311990978575, "grad_norm": 0.048751216381788254, "learning_rate": 9.253640083277173e-06, "loss": 0.4678, "num_input_tokens_seen": 50192416, "step": 41270 }, { "epoch": 5.171657687006641, "grad_norm": 0.08339045196771622, "learning_rate": 9.253352701686295e-06, "loss": 0.4641, "num_input_tokens_seen": 50198368, "step": 41275 }, { "epoch": 5.172284174915424, "grad_norm": 0.07547621428966522, "learning_rate": 9.253065269243011e-06, "loss": 0.4633, "num_input_tokens_seen": 50204416, "step": 41280 }, { "epoch": 5.172910662824208, "grad_norm": 0.07677073031663895, "learning_rate": 9.252777785950755e-06, "loss": 0.4639, "num_input_tokens_seen": 50210752, "step": 41285 }, { "epoch": 5.173537150732991, "grad_norm": 0.04950582981109619, "learning_rate": 9.252490251812964e-06, "loss": 0.4603, "num_input_tokens_seen": 50216768, "step": 41290 }, { "epoch": 5.174163638641774, "grad_norm": 0.07228084653615952, "learning_rate": 9.252202666833077e-06, "loss": 0.4694, "num_input_tokens_seen": 50222848, "step": 41295 }, { "epoch": 5.174790126550557, "grad_norm": 0.09836872667074203, "learning_rate": 9.251915031014533e-06, "loss": 0.4652, "num_input_tokens_seen": 50228928, "step": 41300 }, { "epoch": 5.1754166144593405, "grad_norm": 0.043292708694934845, "learning_rate": 9.251627344360767e-06, "loss": 0.4645, "num_input_tokens_seen": 50235168, "step": 41305 }, { "epoch": 5.176043102368125, "grad_norm": 0.07538879662752151, "learning_rate": 9.251339606875224e-06, "loss": 0.4616, "num_input_tokens_seen": 50241472, "step": 41310 }, { "epoch": 5.176669590276908, "grad_norm": 0.07858259230852127, "learning_rate": 9.251051818561344e-06, "loss": 0.4664, "num_input_tokens_seen": 50247424, "step": 41315 }, { "epoch": 5.177296078185691, "grad_norm": 0.11162441223859787, "learning_rate": 9.25076397942256e-06, "loss": 0.4579, "num_input_tokens_seen": 50252800, "step": 41320 }, { "epoch": 5.177922566094474, "grad_norm": 0.06878085434436798, "learning_rate": 9.250476089462324e-06, "loss": 0.4699, "num_input_tokens_seen": 50258336, "step": 41325 }, { "epoch": 5.178549054003258, "grad_norm": 0.0766584649682045, "learning_rate": 9.25018814868407e-06, "loss": 0.4605, "num_input_tokens_seen": 50264416, "step": 41330 }, { "epoch": 5.179175541912041, "grad_norm": 0.15859968960285187, "learning_rate": 9.249900157091243e-06, "loss": 0.4628, "num_input_tokens_seen": 50270848, "step": 41335 }, { "epoch": 5.1798020298208245, "grad_norm": 0.11358003318309784, "learning_rate": 9.249612114687287e-06, "loss": 0.4658, "num_input_tokens_seen": 50276992, "step": 41340 }, { "epoch": 5.180428517729608, "grad_norm": 0.11379300802946091, "learning_rate": 9.249324021475645e-06, "loss": 0.4666, "num_input_tokens_seen": 50282880, "step": 41345 }, { "epoch": 5.181055005638391, "grad_norm": 0.07647431641817093, "learning_rate": 9.249035877459763e-06, "loss": 0.4622, "num_input_tokens_seen": 50288992, "step": 41350 }, { "epoch": 5.181681493547175, "grad_norm": 0.05113467574119568, "learning_rate": 9.248747682643085e-06, "loss": 0.4619, "num_input_tokens_seen": 50294944, "step": 41355 }, { "epoch": 5.182307981455958, "grad_norm": 0.08669305592775345, "learning_rate": 9.248459437029054e-06, "loss": 0.4572, "num_input_tokens_seen": 50300896, "step": 41360 }, { "epoch": 5.182934469364741, "grad_norm": 0.07629168033599854, "learning_rate": 9.248171140621121e-06, "loss": 0.4624, "num_input_tokens_seen": 50307104, "step": 41365 }, { "epoch": 5.183560957273524, "grad_norm": 0.052312977612018585, "learning_rate": 9.247882793422728e-06, "loss": 0.463, "num_input_tokens_seen": 50313536, "step": 41370 }, { "epoch": 5.184187445182308, "grad_norm": 0.0988592803478241, "learning_rate": 9.247594395437326e-06, "loss": 0.4609, "num_input_tokens_seen": 50319488, "step": 41375 }, { "epoch": 5.184813933091092, "grad_norm": 0.05754087120294571, "learning_rate": 9.247305946668361e-06, "loss": 0.4681, "num_input_tokens_seen": 50325856, "step": 41380 }, { "epoch": 5.185440420999875, "grad_norm": 0.12684312462806702, "learning_rate": 9.247017447119283e-06, "loss": 0.4529, "num_input_tokens_seen": 50331008, "step": 41385 }, { "epoch": 5.186066908908658, "grad_norm": 0.0699407234787941, "learning_rate": 9.246728896793542e-06, "loss": 0.4604, "num_input_tokens_seen": 50337728, "step": 41390 }, { "epoch": 5.186693396817441, "grad_norm": 0.08267678320407867, "learning_rate": 9.246440295694585e-06, "loss": 0.4651, "num_input_tokens_seen": 50343968, "step": 41395 }, { "epoch": 5.187319884726225, "grad_norm": 0.04776919260621071, "learning_rate": 9.246151643825864e-06, "loss": 0.4538, "num_input_tokens_seen": 50349920, "step": 41400 }, { "epoch": 5.187946372635008, "grad_norm": 0.0784623771905899, "learning_rate": 9.24586294119083e-06, "loss": 0.4589, "num_input_tokens_seen": 50356032, "step": 41405 }, { "epoch": 5.1885728605437915, "grad_norm": 0.049050040543079376, "learning_rate": 9.245574187792936e-06, "loss": 0.4625, "num_input_tokens_seen": 50361888, "step": 41410 }, { "epoch": 5.189199348452575, "grad_norm": 0.1060623824596405, "learning_rate": 9.245285383635633e-06, "loss": 0.4594, "num_input_tokens_seen": 50368000, "step": 41415 }, { "epoch": 5.189825836361358, "grad_norm": 0.07794956862926483, "learning_rate": 9.244996528722375e-06, "loss": 0.4566, "num_input_tokens_seen": 50374144, "step": 41420 }, { "epoch": 5.190452324270142, "grad_norm": 0.10178148001432419, "learning_rate": 9.244707623056613e-06, "loss": 0.4666, "num_input_tokens_seen": 50380480, "step": 41425 }, { "epoch": 5.191078812178925, "grad_norm": 0.07890065759420395, "learning_rate": 9.244418666641802e-06, "loss": 0.4659, "num_input_tokens_seen": 50386496, "step": 41430 }, { "epoch": 5.191705300087708, "grad_norm": 0.05054185166954994, "learning_rate": 9.244129659481397e-06, "loss": 0.4482, "num_input_tokens_seen": 50392576, "step": 41435 }, { "epoch": 5.192331787996491, "grad_norm": 0.11064942926168442, "learning_rate": 9.243840601578854e-06, "loss": 0.4662, "num_input_tokens_seen": 50398592, "step": 41440 }, { "epoch": 5.192958275905275, "grad_norm": 0.1063932254910469, "learning_rate": 9.24355149293763e-06, "loss": 0.4638, "num_input_tokens_seen": 50404960, "step": 41445 }, { "epoch": 5.193584763814059, "grad_norm": 0.08410708606243134, "learning_rate": 9.243262333561181e-06, "loss": 0.4551, "num_input_tokens_seen": 50411168, "step": 41450 }, { "epoch": 5.194211251722842, "grad_norm": 0.07639729976654053, "learning_rate": 9.24297312345296e-06, "loss": 0.4583, "num_input_tokens_seen": 50417120, "step": 41455 }, { "epoch": 5.194837739631625, "grad_norm": 0.10543227195739746, "learning_rate": 9.242683862616431e-06, "loss": 0.4626, "num_input_tokens_seen": 50423648, "step": 41460 }, { "epoch": 5.195464227540408, "grad_norm": 0.10478854179382324, "learning_rate": 9.242394551055048e-06, "loss": 0.4662, "num_input_tokens_seen": 50429824, "step": 41465 }, { "epoch": 5.196090715449192, "grad_norm": 0.11924256384372711, "learning_rate": 9.24210518877227e-06, "loss": 0.4631, "num_input_tokens_seen": 50435808, "step": 41470 }, { "epoch": 5.196717203357975, "grad_norm": 0.08405465632677078, "learning_rate": 9.24181577577156e-06, "loss": 0.4594, "num_input_tokens_seen": 50442112, "step": 41475 }, { "epoch": 5.1973436912667585, "grad_norm": 0.1005626991391182, "learning_rate": 9.241526312056374e-06, "loss": 0.4677, "num_input_tokens_seen": 50447968, "step": 41480 }, { "epoch": 5.197970179175542, "grad_norm": 0.06975492089986801, "learning_rate": 9.241236797630178e-06, "loss": 0.4603, "num_input_tokens_seen": 50454112, "step": 41485 }, { "epoch": 5.198596667084325, "grad_norm": 0.06707121431827545, "learning_rate": 9.24094723249643e-06, "loss": 0.4621, "num_input_tokens_seen": 50460160, "step": 41490 }, { "epoch": 5.199223154993109, "grad_norm": 0.1192045509815216, "learning_rate": 9.240657616658588e-06, "loss": 0.4691, "num_input_tokens_seen": 50465856, "step": 41495 }, { "epoch": 5.199849642901892, "grad_norm": 0.10861386358737946, "learning_rate": 9.240367950120122e-06, "loss": 0.458, "num_input_tokens_seen": 50472064, "step": 41500 }, { "epoch": 5.200476130810675, "grad_norm": 0.0655113086104393, "learning_rate": 9.240078232884492e-06, "loss": 0.4577, "num_input_tokens_seen": 50478016, "step": 41505 }, { "epoch": 5.2011026187194584, "grad_norm": 0.107912078499794, "learning_rate": 9.239788464955161e-06, "loss": 0.4573, "num_input_tokens_seen": 50483680, "step": 41510 }, { "epoch": 5.2017291066282425, "grad_norm": 0.07200582325458527, "learning_rate": 9.239498646335595e-06, "loss": 0.4584, "num_input_tokens_seen": 50489344, "step": 41515 }, { "epoch": 5.202355594537026, "grad_norm": 0.07768413424491882, "learning_rate": 9.239208777029259e-06, "loss": 0.4616, "num_input_tokens_seen": 50495488, "step": 41520 }, { "epoch": 5.202982082445809, "grad_norm": 0.0956677570939064, "learning_rate": 9.238918857039616e-06, "loss": 0.4597, "num_input_tokens_seen": 50501472, "step": 41525 }, { "epoch": 5.203608570354592, "grad_norm": 0.1291135996580124, "learning_rate": 9.238628886370136e-06, "loss": 0.458, "num_input_tokens_seen": 50507424, "step": 41530 }, { "epoch": 5.204235058263375, "grad_norm": 0.08790049701929092, "learning_rate": 9.238338865024283e-06, "loss": 0.4601, "num_input_tokens_seen": 50513216, "step": 41535 }, { "epoch": 5.204861546172159, "grad_norm": 0.0821298360824585, "learning_rate": 9.238048793005525e-06, "loss": 0.463, "num_input_tokens_seen": 50518752, "step": 41540 }, { "epoch": 5.205488034080942, "grad_norm": 0.04535220190882683, "learning_rate": 9.23775867031733e-06, "loss": 0.4603, "num_input_tokens_seen": 50524704, "step": 41545 }, { "epoch": 5.206114521989726, "grad_norm": 0.11581587046384811, "learning_rate": 9.23746849696317e-06, "loss": 0.4652, "num_input_tokens_seen": 50530944, "step": 41550 }, { "epoch": 5.206741009898509, "grad_norm": 0.06971881538629532, "learning_rate": 9.23717827294651e-06, "loss": 0.4597, "num_input_tokens_seen": 50536800, "step": 41555 }, { "epoch": 5.207367497807292, "grad_norm": 0.08125267177820206, "learning_rate": 9.23688799827082e-06, "loss": 0.4568, "num_input_tokens_seen": 50542944, "step": 41560 }, { "epoch": 5.207993985716076, "grad_norm": 0.08708550035953522, "learning_rate": 9.236597672939573e-06, "loss": 0.4571, "num_input_tokens_seen": 50549184, "step": 41565 }, { "epoch": 5.208620473624859, "grad_norm": 0.08512531965970993, "learning_rate": 9.23630729695624e-06, "loss": 0.4634, "num_input_tokens_seen": 50555360, "step": 41570 }, { "epoch": 5.209246961533642, "grad_norm": 0.07325194776058197, "learning_rate": 9.23601687032429e-06, "loss": 0.4654, "num_input_tokens_seen": 50561536, "step": 41575 }, { "epoch": 5.2098734494424255, "grad_norm": 0.06384984403848648, "learning_rate": 9.235726393047197e-06, "loss": 0.4657, "num_input_tokens_seen": 50567360, "step": 41580 }, { "epoch": 5.2104999373512095, "grad_norm": 0.041855357587337494, "learning_rate": 9.235435865128434e-06, "loss": 0.4644, "num_input_tokens_seen": 50573632, "step": 41585 }, { "epoch": 5.211126425259993, "grad_norm": 0.07492292672395706, "learning_rate": 9.235145286571474e-06, "loss": 0.4592, "num_input_tokens_seen": 50579424, "step": 41590 }, { "epoch": 5.211752913168776, "grad_norm": 0.08069845288991928, "learning_rate": 9.234854657379793e-06, "loss": 0.4736, "num_input_tokens_seen": 50585504, "step": 41595 }, { "epoch": 5.212379401077559, "grad_norm": 0.11319006234407425, "learning_rate": 9.234563977556862e-06, "loss": 0.4707, "num_input_tokens_seen": 50592288, "step": 41600 }, { "epoch": 5.213005888986342, "grad_norm": 0.06850003451108932, "learning_rate": 9.234273247106159e-06, "loss": 0.4576, "num_input_tokens_seen": 50598336, "step": 41605 }, { "epoch": 5.213632376895126, "grad_norm": 0.09038128703832626, "learning_rate": 9.233982466031159e-06, "loss": 0.4651, "num_input_tokens_seen": 50604096, "step": 41610 }, { "epoch": 5.214258864803909, "grad_norm": 0.05835970491170883, "learning_rate": 9.23369163433534e-06, "loss": 0.4609, "num_input_tokens_seen": 50610528, "step": 41615 }, { "epoch": 5.214885352712693, "grad_norm": 0.1392950564622879, "learning_rate": 9.233400752022177e-06, "loss": 0.4621, "num_input_tokens_seen": 50616768, "step": 41620 }, { "epoch": 5.215511840621476, "grad_norm": 0.10442505031824112, "learning_rate": 9.233109819095149e-06, "loss": 0.4725, "num_input_tokens_seen": 50622944, "step": 41625 }, { "epoch": 5.216138328530259, "grad_norm": 0.08749309927225113, "learning_rate": 9.232818835557735e-06, "loss": 0.46, "num_input_tokens_seen": 50628960, "step": 41630 }, { "epoch": 5.216764816439043, "grad_norm": 0.12959438562393188, "learning_rate": 9.232527801413412e-06, "loss": 0.4712, "num_input_tokens_seen": 50635072, "step": 41635 }, { "epoch": 5.217391304347826, "grad_norm": 0.06583993136882782, "learning_rate": 9.23223671666566e-06, "loss": 0.4665, "num_input_tokens_seen": 50641312, "step": 41640 }, { "epoch": 5.218017792256609, "grad_norm": 0.0754953920841217, "learning_rate": 9.231945581317963e-06, "loss": 0.463, "num_input_tokens_seen": 50647744, "step": 41645 }, { "epoch": 5.2186442801653925, "grad_norm": 0.10610750317573547, "learning_rate": 9.231654395373796e-06, "loss": 0.4622, "num_input_tokens_seen": 50654048, "step": 41650 }, { "epoch": 5.219270768074177, "grad_norm": 0.091944120824337, "learning_rate": 9.231363158836643e-06, "loss": 0.4705, "num_input_tokens_seen": 50660320, "step": 41655 }, { "epoch": 5.21989725598296, "grad_norm": 0.11701786518096924, "learning_rate": 9.231071871709986e-06, "loss": 0.4524, "num_input_tokens_seen": 50666400, "step": 41660 }, { "epoch": 5.220523743891743, "grad_norm": 0.05067913979291916, "learning_rate": 9.230780533997307e-06, "loss": 0.46, "num_input_tokens_seen": 50672000, "step": 41665 }, { "epoch": 5.221150231800526, "grad_norm": 0.11138032376766205, "learning_rate": 9.23048914570209e-06, "loss": 0.4477, "num_input_tokens_seen": 50678208, "step": 41670 }, { "epoch": 5.221776719709309, "grad_norm": 0.11276302486658096, "learning_rate": 9.230197706827819e-06, "loss": 0.4623, "num_input_tokens_seen": 50684288, "step": 41675 }, { "epoch": 5.222403207618093, "grad_norm": 0.08483158051967621, "learning_rate": 9.229906217377978e-06, "loss": 0.464, "num_input_tokens_seen": 50690464, "step": 41680 }, { "epoch": 5.2230296955268765, "grad_norm": 0.07412054389715195, "learning_rate": 9.229614677356051e-06, "loss": 0.4612, "num_input_tokens_seen": 50696448, "step": 41685 }, { "epoch": 5.22365618343566, "grad_norm": 0.07281040400266647, "learning_rate": 9.229323086765526e-06, "loss": 0.459, "num_input_tokens_seen": 50702688, "step": 41690 }, { "epoch": 5.224282671344443, "grad_norm": 0.07456208020448685, "learning_rate": 9.229031445609885e-06, "loss": 0.4574, "num_input_tokens_seen": 50708448, "step": 41695 }, { "epoch": 5.224909159253226, "grad_norm": 0.07895208150148392, "learning_rate": 9.22873975389262e-06, "loss": 0.4682, "num_input_tokens_seen": 50714368, "step": 41700 }, { "epoch": 5.22553564716201, "grad_norm": 0.09054679423570633, "learning_rate": 9.228448011617215e-06, "loss": 0.4642, "num_input_tokens_seen": 50720416, "step": 41705 }, { "epoch": 5.226162135070793, "grad_norm": 0.06814690679311752, "learning_rate": 9.22815621878716e-06, "loss": 0.4599, "num_input_tokens_seen": 50726592, "step": 41710 }, { "epoch": 5.226788622979576, "grad_norm": 0.07141800224781036, "learning_rate": 9.22786437540594e-06, "loss": 0.4686, "num_input_tokens_seen": 50732448, "step": 41715 }, { "epoch": 5.2274151108883595, "grad_norm": 0.049143556505441666, "learning_rate": 9.227572481477049e-06, "loss": 0.4621, "num_input_tokens_seen": 50738528, "step": 41720 }, { "epoch": 5.228041598797144, "grad_norm": 0.09085046499967575, "learning_rate": 9.227280537003975e-06, "loss": 0.4643, "num_input_tokens_seen": 50744864, "step": 41725 }, { "epoch": 5.228668086705927, "grad_norm": 0.08317574858665466, "learning_rate": 9.226988541990207e-06, "loss": 0.4687, "num_input_tokens_seen": 50751072, "step": 41730 }, { "epoch": 5.22929457461471, "grad_norm": 0.122828409075737, "learning_rate": 9.226696496439238e-06, "loss": 0.4647, "num_input_tokens_seen": 50757472, "step": 41735 }, { "epoch": 5.229921062523493, "grad_norm": 0.10019543021917343, "learning_rate": 9.226404400354557e-06, "loss": 0.4623, "num_input_tokens_seen": 50762784, "step": 41740 }, { "epoch": 5.230547550432276, "grad_norm": 0.09194742888212204, "learning_rate": 9.226112253739659e-06, "loss": 0.4619, "num_input_tokens_seen": 50769088, "step": 41745 }, { "epoch": 5.23117403834106, "grad_norm": 0.09122379124164581, "learning_rate": 9.225820056598035e-06, "loss": 0.4655, "num_input_tokens_seen": 50775200, "step": 41750 }, { "epoch": 5.2318005262498435, "grad_norm": 0.09877696633338928, "learning_rate": 9.22552780893318e-06, "loss": 0.4555, "num_input_tokens_seen": 50781120, "step": 41755 }, { "epoch": 5.232427014158627, "grad_norm": 0.07419215887784958, "learning_rate": 9.225235510748589e-06, "loss": 0.464, "num_input_tokens_seen": 50787520, "step": 41760 }, { "epoch": 5.23305350206741, "grad_norm": 0.09084606170654297, "learning_rate": 9.224943162047753e-06, "loss": 0.4712, "num_input_tokens_seen": 50793504, "step": 41765 }, { "epoch": 5.233679989976194, "grad_norm": 0.1097780242562294, "learning_rate": 9.22465076283417e-06, "loss": 0.4674, "num_input_tokens_seen": 50799072, "step": 41770 }, { "epoch": 5.234306477884977, "grad_norm": 0.08434198796749115, "learning_rate": 9.224358313111336e-06, "loss": 0.4643, "num_input_tokens_seen": 50805024, "step": 41775 }, { "epoch": 5.23493296579376, "grad_norm": 0.11919302493333817, "learning_rate": 9.224065812882745e-06, "loss": 0.456, "num_input_tokens_seen": 50811040, "step": 41780 }, { "epoch": 5.235559453702543, "grad_norm": 0.10437508672475815, "learning_rate": 9.223773262151899e-06, "loss": 0.4598, "num_input_tokens_seen": 50816544, "step": 41785 }, { "epoch": 5.2361859416113266, "grad_norm": 0.1333928406238556, "learning_rate": 9.22348066092229e-06, "loss": 0.4608, "num_input_tokens_seen": 50822528, "step": 41790 }, { "epoch": 5.236812429520111, "grad_norm": 0.06998232007026672, "learning_rate": 9.22318800919742e-06, "loss": 0.4683, "num_input_tokens_seen": 50828480, "step": 41795 }, { "epoch": 5.237438917428894, "grad_norm": 0.07218439131975174, "learning_rate": 9.222895306980785e-06, "loss": 0.4681, "num_input_tokens_seen": 50834880, "step": 41800 }, { "epoch": 5.238065405337677, "grad_norm": 0.09073281288146973, "learning_rate": 9.222602554275886e-06, "loss": 0.467, "num_input_tokens_seen": 50840768, "step": 41805 }, { "epoch": 5.23869189324646, "grad_norm": 0.06964551657438278, "learning_rate": 9.222309751086225e-06, "loss": 0.4612, "num_input_tokens_seen": 50846880, "step": 41810 }, { "epoch": 5.239318381155243, "grad_norm": 0.13800741732120514, "learning_rate": 9.2220168974153e-06, "loss": 0.4567, "num_input_tokens_seen": 50853056, "step": 41815 }, { "epoch": 5.239944869064027, "grad_norm": 0.0662863701581955, "learning_rate": 9.221723993266614e-06, "loss": 0.4588, "num_input_tokens_seen": 50859168, "step": 41820 }, { "epoch": 5.2405713569728105, "grad_norm": 0.03976093605160713, "learning_rate": 9.221431038643668e-06, "loss": 0.4595, "num_input_tokens_seen": 50865152, "step": 41825 }, { "epoch": 5.241197844881594, "grad_norm": 0.06521187722682953, "learning_rate": 9.221138033549965e-06, "loss": 0.4654, "num_input_tokens_seen": 50871392, "step": 41830 }, { "epoch": 5.241824332790377, "grad_norm": 0.12780097126960754, "learning_rate": 9.220844977989008e-06, "loss": 0.4619, "num_input_tokens_seen": 50877504, "step": 41835 }, { "epoch": 5.242450820699161, "grad_norm": 0.08638394623994827, "learning_rate": 9.220551871964299e-06, "loss": 0.4624, "num_input_tokens_seen": 50883520, "step": 41840 }, { "epoch": 5.243077308607944, "grad_norm": 0.08548908680677414, "learning_rate": 9.220258715479346e-06, "loss": 0.4648, "num_input_tokens_seen": 50889472, "step": 41845 }, { "epoch": 5.243703796516727, "grad_norm": 0.13316379487514496, "learning_rate": 9.219965508537649e-06, "loss": 0.4627, "num_input_tokens_seen": 50895808, "step": 41850 }, { "epoch": 5.24433028442551, "grad_norm": 0.06855055689811707, "learning_rate": 9.21967225114272e-06, "loss": 0.4648, "num_input_tokens_seen": 50901984, "step": 41855 }, { "epoch": 5.244956772334294, "grad_norm": 0.07288292795419693, "learning_rate": 9.219378943298059e-06, "loss": 0.4536, "num_input_tokens_seen": 50908160, "step": 41860 }, { "epoch": 5.245583260243078, "grad_norm": 0.11965936422348022, "learning_rate": 9.219085585007176e-06, "loss": 0.4633, "num_input_tokens_seen": 50914368, "step": 41865 }, { "epoch": 5.246209748151861, "grad_norm": 0.07283122092485428, "learning_rate": 9.218792176273579e-06, "loss": 0.4588, "num_input_tokens_seen": 50920800, "step": 41870 }, { "epoch": 5.246836236060644, "grad_norm": 0.13111922144889832, "learning_rate": 9.218498717100773e-06, "loss": 0.4609, "num_input_tokens_seen": 50926880, "step": 41875 }, { "epoch": 5.247462723969427, "grad_norm": 0.08176995813846588, "learning_rate": 9.218205207492268e-06, "loss": 0.4723, "num_input_tokens_seen": 50933184, "step": 41880 }, { "epoch": 5.248089211878211, "grad_norm": 0.045171041041612625, "learning_rate": 9.217911647451573e-06, "loss": 0.4649, "num_input_tokens_seen": 50939552, "step": 41885 }, { "epoch": 5.248715699786994, "grad_norm": 0.07036887109279633, "learning_rate": 9.2176180369822e-06, "loss": 0.465, "num_input_tokens_seen": 50945664, "step": 41890 }, { "epoch": 5.2493421876957775, "grad_norm": 0.09092298150062561, "learning_rate": 9.217324376087655e-06, "loss": 0.4627, "num_input_tokens_seen": 50951264, "step": 41895 }, { "epoch": 5.249968675604561, "grad_norm": 0.08766930550336838, "learning_rate": 9.217030664771452e-06, "loss": 0.4651, "num_input_tokens_seen": 50957280, "step": 41900 }, { "epoch": 5.250595163513344, "grad_norm": 0.09525341540575027, "learning_rate": 9.216736903037102e-06, "loss": 0.4641, "num_input_tokens_seen": 50963424, "step": 41905 }, { "epoch": 5.251221651422128, "grad_norm": 0.07982253283262253, "learning_rate": 9.216443090888119e-06, "loss": 0.4657, "num_input_tokens_seen": 50969952, "step": 41910 }, { "epoch": 5.251848139330911, "grad_norm": 0.08702312409877777, "learning_rate": 9.216149228328013e-06, "loss": 0.4665, "num_input_tokens_seen": 50976064, "step": 41915 }, { "epoch": 5.252474627239694, "grad_norm": 0.07854694128036499, "learning_rate": 9.215855315360296e-06, "loss": 0.4604, "num_input_tokens_seen": 50981984, "step": 41920 }, { "epoch": 5.253101115148477, "grad_norm": 0.09766292572021484, "learning_rate": 9.215561351988487e-06, "loss": 0.4661, "num_input_tokens_seen": 50988128, "step": 41925 }, { "epoch": 5.253727603057261, "grad_norm": 0.08054012805223465, "learning_rate": 9.215267338216096e-06, "loss": 0.463, "num_input_tokens_seen": 50994176, "step": 41930 }, { "epoch": 5.254354090966045, "grad_norm": 0.06480178982019424, "learning_rate": 9.21497327404664e-06, "loss": 0.4674, "num_input_tokens_seen": 50999744, "step": 41935 }, { "epoch": 5.254980578874828, "grad_norm": 0.11638044565916061, "learning_rate": 9.214679159483635e-06, "loss": 0.465, "num_input_tokens_seen": 51006016, "step": 41940 }, { "epoch": 5.255607066783611, "grad_norm": 0.10483341664075851, "learning_rate": 9.214384994530598e-06, "loss": 0.4585, "num_input_tokens_seen": 51012288, "step": 41945 }, { "epoch": 5.256233554692394, "grad_norm": 0.0788094699382782, "learning_rate": 9.214090779191044e-06, "loss": 0.4625, "num_input_tokens_seen": 51018464, "step": 41950 }, { "epoch": 5.256860042601177, "grad_norm": 0.09532327950000763, "learning_rate": 9.213796513468493e-06, "loss": 0.4641, "num_input_tokens_seen": 51024768, "step": 41955 }, { "epoch": 5.257486530509961, "grad_norm": 0.10245799273252487, "learning_rate": 9.21350219736646e-06, "loss": 0.4569, "num_input_tokens_seen": 51031200, "step": 41960 }, { "epoch": 5.258113018418745, "grad_norm": 0.06295272707939148, "learning_rate": 9.213207830888468e-06, "loss": 0.4532, "num_input_tokens_seen": 51036800, "step": 41965 }, { "epoch": 5.258739506327528, "grad_norm": 0.08395003527402878, "learning_rate": 9.212913414038033e-06, "loss": 0.4686, "num_input_tokens_seen": 51043104, "step": 41970 }, { "epoch": 5.259365994236311, "grad_norm": 0.11621236056089401, "learning_rate": 9.212618946818675e-06, "loss": 0.4546, "num_input_tokens_seen": 51049280, "step": 41975 }, { "epoch": 5.259992482145095, "grad_norm": 0.08001997321844101, "learning_rate": 9.212324429233918e-06, "loss": 0.4691, "num_input_tokens_seen": 51055296, "step": 41980 }, { "epoch": 5.260618970053878, "grad_norm": 0.10624340921640396, "learning_rate": 9.212029861287279e-06, "loss": 0.4632, "num_input_tokens_seen": 51061536, "step": 41985 }, { "epoch": 5.261245457962661, "grad_norm": 0.09548144042491913, "learning_rate": 9.211735242982283e-06, "loss": 0.4656, "num_input_tokens_seen": 51067776, "step": 41990 }, { "epoch": 5.2618719458714445, "grad_norm": 0.10495046526193619, "learning_rate": 9.21144057432245e-06, "loss": 0.4586, "num_input_tokens_seen": 51073984, "step": 41995 }, { "epoch": 5.262498433780228, "grad_norm": 0.09541298449039459, "learning_rate": 9.211145855311306e-06, "loss": 0.4641, "num_input_tokens_seen": 51079424, "step": 42000 }, { "epoch": 5.263124921689012, "grad_norm": 0.06956324726343155, "learning_rate": 9.21085108595237e-06, "loss": 0.4613, "num_input_tokens_seen": 51085408, "step": 42005 }, { "epoch": 5.263751409597795, "grad_norm": 0.08333228528499603, "learning_rate": 9.210556266249172e-06, "loss": 0.455, "num_input_tokens_seen": 51091616, "step": 42010 }, { "epoch": 5.264377897506578, "grad_norm": 0.034803565591573715, "learning_rate": 9.210261396205232e-06, "loss": 0.4526, "num_input_tokens_seen": 51097536, "step": 42015 }, { "epoch": 5.265004385415361, "grad_norm": 0.08682282269001007, "learning_rate": 9.209966475824077e-06, "loss": 0.467, "num_input_tokens_seen": 51103680, "step": 42020 }, { "epoch": 5.265630873324145, "grad_norm": 0.06316085904836655, "learning_rate": 9.209671505109235e-06, "loss": 0.4641, "num_input_tokens_seen": 51109728, "step": 42025 }, { "epoch": 5.266257361232928, "grad_norm": 0.08950390666723251, "learning_rate": 9.209376484064228e-06, "loss": 0.4548, "num_input_tokens_seen": 51115168, "step": 42030 }, { "epoch": 5.266883849141712, "grad_norm": 0.07363609969615936, "learning_rate": 9.209081412692588e-06, "loss": 0.4635, "num_input_tokens_seen": 51121216, "step": 42035 }, { "epoch": 5.267510337050495, "grad_norm": 0.08113092184066772, "learning_rate": 9.208786290997841e-06, "loss": 0.4592, "num_input_tokens_seen": 51127296, "step": 42040 }, { "epoch": 5.268136824959278, "grad_norm": 0.07680485397577286, "learning_rate": 9.208491118983515e-06, "loss": 0.4572, "num_input_tokens_seen": 51133280, "step": 42045 }, { "epoch": 5.268763312868062, "grad_norm": 0.08907434344291687, "learning_rate": 9.208195896653138e-06, "loss": 0.4609, "num_input_tokens_seen": 51138912, "step": 42050 }, { "epoch": 5.269389800776845, "grad_norm": 0.12278224527835846, "learning_rate": 9.207900624010243e-06, "loss": 0.4691, "num_input_tokens_seen": 51145088, "step": 42055 }, { "epoch": 5.270016288685628, "grad_norm": 0.11791697889566422, "learning_rate": 9.207605301058357e-06, "loss": 0.4578, "num_input_tokens_seen": 51151008, "step": 42060 }, { "epoch": 5.2706427765944115, "grad_norm": 0.09788515418767929, "learning_rate": 9.207309927801014e-06, "loss": 0.4598, "num_input_tokens_seen": 51157472, "step": 42065 }, { "epoch": 5.271269264503195, "grad_norm": 0.13207311928272247, "learning_rate": 9.207014504241743e-06, "loss": 0.4666, "num_input_tokens_seen": 51163456, "step": 42070 }, { "epoch": 5.271895752411979, "grad_norm": 0.0776190459728241, "learning_rate": 9.206719030384074e-06, "loss": 0.4616, "num_input_tokens_seen": 51169792, "step": 42075 }, { "epoch": 5.272522240320762, "grad_norm": 0.03668661043047905, "learning_rate": 9.206423506231545e-06, "loss": 0.4566, "num_input_tokens_seen": 51175776, "step": 42080 }, { "epoch": 5.273148728229545, "grad_norm": 0.09455107152462006, "learning_rate": 9.206127931787684e-06, "loss": 0.4647, "num_input_tokens_seen": 51182048, "step": 42085 }, { "epoch": 5.273775216138328, "grad_norm": 0.05825582891702652, "learning_rate": 9.205832307056028e-06, "loss": 0.4726, "num_input_tokens_seen": 51188000, "step": 42090 }, { "epoch": 5.274401704047112, "grad_norm": 0.03220488876104355, "learning_rate": 9.205536632040112e-06, "loss": 0.4606, "num_input_tokens_seen": 51194112, "step": 42095 }, { "epoch": 5.2750281919558955, "grad_norm": 0.12540242075920105, "learning_rate": 9.205240906743468e-06, "loss": 0.4685, "num_input_tokens_seen": 51199840, "step": 42100 }, { "epoch": 5.275654679864679, "grad_norm": 0.10407353937625885, "learning_rate": 9.204945131169635e-06, "loss": 0.4642, "num_input_tokens_seen": 51206400, "step": 42105 }, { "epoch": 5.276281167773462, "grad_norm": 0.0861298069357872, "learning_rate": 9.204649305322147e-06, "loss": 0.4631, "num_input_tokens_seen": 51212448, "step": 42110 }, { "epoch": 5.276907655682245, "grad_norm": 0.06548698991537094, "learning_rate": 9.204353429204543e-06, "loss": 0.4621, "num_input_tokens_seen": 51218784, "step": 42115 }, { "epoch": 5.277534143591029, "grad_norm": 0.07392624765634537, "learning_rate": 9.204057502820356e-06, "loss": 0.4676, "num_input_tokens_seen": 51224704, "step": 42120 }, { "epoch": 5.278160631499812, "grad_norm": 0.09115517139434814, "learning_rate": 9.203761526173128e-06, "loss": 0.4624, "num_input_tokens_seen": 51231008, "step": 42125 }, { "epoch": 5.278787119408595, "grad_norm": 0.0582476370036602, "learning_rate": 9.203465499266398e-06, "loss": 0.4637, "num_input_tokens_seen": 51237088, "step": 42130 }, { "epoch": 5.2794136073173785, "grad_norm": 0.09521641582250595, "learning_rate": 9.203169422103701e-06, "loss": 0.4682, "num_input_tokens_seen": 51243200, "step": 42135 }, { "epoch": 5.280040095226163, "grad_norm": 0.0690341666340828, "learning_rate": 9.202873294688584e-06, "loss": 0.4688, "num_input_tokens_seen": 51249216, "step": 42140 }, { "epoch": 5.280666583134946, "grad_norm": 0.0879841148853302, "learning_rate": 9.20257711702458e-06, "loss": 0.4627, "num_input_tokens_seen": 51255424, "step": 42145 }, { "epoch": 5.281293071043729, "grad_norm": 0.10428303480148315, "learning_rate": 9.202280889115233e-06, "loss": 0.4569, "num_input_tokens_seen": 51261312, "step": 42150 }, { "epoch": 5.281919558952512, "grad_norm": 0.06946452707052231, "learning_rate": 9.201984610964086e-06, "loss": 0.4626, "num_input_tokens_seen": 51267776, "step": 42155 }, { "epoch": 5.282546046861295, "grad_norm": 0.08122581988573074, "learning_rate": 9.201688282574681e-06, "loss": 0.4542, "num_input_tokens_seen": 51273984, "step": 42160 }, { "epoch": 5.283172534770079, "grad_norm": 0.07242961972951889, "learning_rate": 9.201391903950558e-06, "loss": 0.4606, "num_input_tokens_seen": 51280256, "step": 42165 }, { "epoch": 5.2837990226788625, "grad_norm": 0.07578220218420029, "learning_rate": 9.201095475095265e-06, "loss": 0.4612, "num_input_tokens_seen": 51286304, "step": 42170 }, { "epoch": 5.284425510587646, "grad_norm": 0.10880911350250244, "learning_rate": 9.20079899601234e-06, "loss": 0.4694, "num_input_tokens_seen": 51292544, "step": 42175 }, { "epoch": 5.285051998496429, "grad_norm": 0.09971821308135986, "learning_rate": 9.200502466705335e-06, "loss": 0.4615, "num_input_tokens_seen": 51298816, "step": 42180 }, { "epoch": 5.285678486405212, "grad_norm": 0.07227994501590729, "learning_rate": 9.200205887177788e-06, "loss": 0.4638, "num_input_tokens_seen": 51304992, "step": 42185 }, { "epoch": 5.286304974313996, "grad_norm": 0.07503065466880798, "learning_rate": 9.199909257433251e-06, "loss": 0.4625, "num_input_tokens_seen": 51311072, "step": 42190 }, { "epoch": 5.286931462222779, "grad_norm": 0.07446867227554321, "learning_rate": 9.199612577475266e-06, "loss": 0.4568, "num_input_tokens_seen": 51317600, "step": 42195 }, { "epoch": 5.287557950131562, "grad_norm": 0.10554960370063782, "learning_rate": 9.199315847307384e-06, "loss": 0.466, "num_input_tokens_seen": 51324032, "step": 42200 }, { "epoch": 5.2881844380403455, "grad_norm": 0.13205456733703613, "learning_rate": 9.199019066933148e-06, "loss": 0.4566, "num_input_tokens_seen": 51329536, "step": 42205 }, { "epoch": 5.288810925949129, "grad_norm": 0.04042414203286171, "learning_rate": 9.198722236356108e-06, "loss": 0.4635, "num_input_tokens_seen": 51335584, "step": 42210 }, { "epoch": 5.289437413857913, "grad_norm": 0.08077485859394073, "learning_rate": 9.198425355579817e-06, "loss": 0.4574, "num_input_tokens_seen": 51342176, "step": 42215 }, { "epoch": 5.290063901766696, "grad_norm": 0.0692054033279419, "learning_rate": 9.198128424607819e-06, "loss": 0.4632, "num_input_tokens_seen": 51348128, "step": 42220 }, { "epoch": 5.290690389675479, "grad_norm": 0.078761525452137, "learning_rate": 9.197831443443666e-06, "loss": 0.466, "num_input_tokens_seen": 51354560, "step": 42225 }, { "epoch": 5.291316877584262, "grad_norm": 0.09093521535396576, "learning_rate": 9.19753441209091e-06, "loss": 0.4644, "num_input_tokens_seen": 51360576, "step": 42230 }, { "epoch": 5.291943365493046, "grad_norm": 0.06727501004934311, "learning_rate": 9.1972373305531e-06, "loss": 0.4676, "num_input_tokens_seen": 51366592, "step": 42235 }, { "epoch": 5.2925698534018295, "grad_norm": 0.0924781784415245, "learning_rate": 9.196940198833787e-06, "loss": 0.4553, "num_input_tokens_seen": 51372832, "step": 42240 }, { "epoch": 5.293196341310613, "grad_norm": 0.08607152849435806, "learning_rate": 9.196643016936529e-06, "loss": 0.4615, "num_input_tokens_seen": 51379232, "step": 42245 }, { "epoch": 5.293822829219396, "grad_norm": 0.06027205288410187, "learning_rate": 9.196345784864873e-06, "loss": 0.462, "num_input_tokens_seen": 51384384, "step": 42250 }, { "epoch": 5.294449317128179, "grad_norm": 0.08169393986463547, "learning_rate": 9.196048502622374e-06, "loss": 0.4604, "num_input_tokens_seen": 51390400, "step": 42255 }, { "epoch": 5.295075805036963, "grad_norm": 0.11298384517431259, "learning_rate": 9.19575117021259e-06, "loss": 0.4594, "num_input_tokens_seen": 51396320, "step": 42260 }, { "epoch": 5.295702292945746, "grad_norm": 0.06083787605166435, "learning_rate": 9.195453787639073e-06, "loss": 0.4659, "num_input_tokens_seen": 51402496, "step": 42265 }, { "epoch": 5.296328780854529, "grad_norm": 0.08965189754962921, "learning_rate": 9.195156354905376e-06, "loss": 0.4613, "num_input_tokens_seen": 51408736, "step": 42270 }, { "epoch": 5.296955268763313, "grad_norm": 0.12240138649940491, "learning_rate": 9.19485887201506e-06, "loss": 0.4692, "num_input_tokens_seen": 51414944, "step": 42275 }, { "epoch": 5.297581756672097, "grad_norm": 0.09576898068189621, "learning_rate": 9.194561338971679e-06, "loss": 0.4629, "num_input_tokens_seen": 51420704, "step": 42280 }, { "epoch": 5.29820824458088, "grad_norm": 0.06569075584411621, "learning_rate": 9.194263755778791e-06, "loss": 0.4673, "num_input_tokens_seen": 51426816, "step": 42285 }, { "epoch": 5.298834732489663, "grad_norm": 0.07570058852434158, "learning_rate": 9.193966122439952e-06, "loss": 0.4623, "num_input_tokens_seen": 51432704, "step": 42290 }, { "epoch": 5.299461220398446, "grad_norm": 0.038753692060709, "learning_rate": 9.193668438958723e-06, "loss": 0.4621, "num_input_tokens_seen": 51438944, "step": 42295 }, { "epoch": 5.300087708307229, "grad_norm": 0.10202781111001968, "learning_rate": 9.193370705338662e-06, "loss": 0.4547, "num_input_tokens_seen": 51444992, "step": 42300 }, { "epoch": 5.300714196216013, "grad_norm": 0.08953477442264557, "learning_rate": 9.193072921583327e-06, "loss": 0.4569, "num_input_tokens_seen": 51451104, "step": 42305 }, { "epoch": 5.3013406841247965, "grad_norm": 0.037574995309114456, "learning_rate": 9.192775087696282e-06, "loss": 0.473, "num_input_tokens_seen": 51457120, "step": 42310 }, { "epoch": 5.30196717203358, "grad_norm": 0.040106114000082016, "learning_rate": 9.192477203681086e-06, "loss": 0.4576, "num_input_tokens_seen": 51463104, "step": 42315 }, { "epoch": 5.302593659942363, "grad_norm": 0.10098300874233246, "learning_rate": 9.192179269541299e-06, "loss": 0.4615, "num_input_tokens_seen": 51469120, "step": 42320 }, { "epoch": 5.303220147851146, "grad_norm": 0.09998391568660736, "learning_rate": 9.191881285280484e-06, "loss": 0.4646, "num_input_tokens_seen": 51475104, "step": 42325 }, { "epoch": 5.30384663575993, "grad_norm": 0.07339855283498764, "learning_rate": 9.191583250902205e-06, "loss": 0.4656, "num_input_tokens_seen": 51481120, "step": 42330 }, { "epoch": 5.304473123668713, "grad_norm": 0.062240153551101685, "learning_rate": 9.191285166410023e-06, "loss": 0.4589, "num_input_tokens_seen": 51487232, "step": 42335 }, { "epoch": 5.305099611577496, "grad_norm": 0.1418454498052597, "learning_rate": 9.190987031807503e-06, "loss": 0.462, "num_input_tokens_seen": 51493536, "step": 42340 }, { "epoch": 5.30572609948628, "grad_norm": 0.045307859778404236, "learning_rate": 9.19068884709821e-06, "loss": 0.4607, "num_input_tokens_seen": 51499584, "step": 42345 }, { "epoch": 5.306352587395064, "grad_norm": 0.08245988935232162, "learning_rate": 9.19039061228571e-06, "loss": 0.4516, "num_input_tokens_seen": 51505696, "step": 42350 }, { "epoch": 5.306979075303847, "grad_norm": 0.1274726241827011, "learning_rate": 9.190092327373564e-06, "loss": 0.4655, "num_input_tokens_seen": 51512000, "step": 42355 }, { "epoch": 5.30760556321263, "grad_norm": 0.12434153258800507, "learning_rate": 9.189793992365343e-06, "loss": 0.4588, "num_input_tokens_seen": 51518272, "step": 42360 }, { "epoch": 5.308232051121413, "grad_norm": 0.04156837984919548, "learning_rate": 9.189495607264612e-06, "loss": 0.4608, "num_input_tokens_seen": 51524128, "step": 42365 }, { "epoch": 5.308858539030196, "grad_norm": 0.05487200245261192, "learning_rate": 9.189197172074938e-06, "loss": 0.463, "num_input_tokens_seen": 51530048, "step": 42370 }, { "epoch": 5.30948502693898, "grad_norm": 0.12652012705802917, "learning_rate": 9.188898686799891e-06, "loss": 0.4644, "num_input_tokens_seen": 51536352, "step": 42375 }, { "epoch": 5.310111514847764, "grad_norm": 0.093802310526371, "learning_rate": 9.188600151443039e-06, "loss": 0.4621, "num_input_tokens_seen": 51542304, "step": 42380 }, { "epoch": 5.310738002756547, "grad_norm": 0.09072982519865036, "learning_rate": 9.188301566007948e-06, "loss": 0.4598, "num_input_tokens_seen": 51547904, "step": 42385 }, { "epoch": 5.31136449066533, "grad_norm": 0.08004976809024811, "learning_rate": 9.188002930498193e-06, "loss": 0.4601, "num_input_tokens_seen": 51554016, "step": 42390 }, { "epoch": 5.311990978574114, "grad_norm": 0.13331998884677887, "learning_rate": 9.18770424491734e-06, "loss": 0.4575, "num_input_tokens_seen": 51559584, "step": 42395 }, { "epoch": 5.312617466482897, "grad_norm": 0.11478269845247269, "learning_rate": 9.187405509268962e-06, "loss": 0.4578, "num_input_tokens_seen": 51565504, "step": 42400 }, { "epoch": 5.31324395439168, "grad_norm": 0.0438392348587513, "learning_rate": 9.187106723556633e-06, "loss": 0.4613, "num_input_tokens_seen": 51571008, "step": 42405 }, { "epoch": 5.3138704423004635, "grad_norm": 0.13947102427482605, "learning_rate": 9.18680788778392e-06, "loss": 0.4612, "num_input_tokens_seen": 51577088, "step": 42410 }, { "epoch": 5.314496930209247, "grad_norm": 0.1619788557291031, "learning_rate": 9.1865090019544e-06, "loss": 0.4694, "num_input_tokens_seen": 51583296, "step": 42415 }, { "epoch": 5.315123418118031, "grad_norm": 0.13254643976688385, "learning_rate": 9.186210066071646e-06, "loss": 0.4558, "num_input_tokens_seen": 51589280, "step": 42420 }, { "epoch": 5.315749906026814, "grad_norm": 0.11251474916934967, "learning_rate": 9.185911080139228e-06, "loss": 0.4637, "num_input_tokens_seen": 51595392, "step": 42425 }, { "epoch": 5.316376393935597, "grad_norm": 0.15971063077449799, "learning_rate": 9.185612044160727e-06, "loss": 0.4555, "num_input_tokens_seen": 51601440, "step": 42430 }, { "epoch": 5.31700288184438, "grad_norm": 0.12097875028848648, "learning_rate": 9.185312958139714e-06, "loss": 0.4645, "num_input_tokens_seen": 51607456, "step": 42435 }, { "epoch": 5.317629369753163, "grad_norm": 0.1673383116722107, "learning_rate": 9.185013822079765e-06, "loss": 0.4459, "num_input_tokens_seen": 51613792, "step": 42440 }, { "epoch": 5.318255857661947, "grad_norm": 0.11316484957933426, "learning_rate": 9.184714635984458e-06, "loss": 0.4628, "num_input_tokens_seen": 51619712, "step": 42445 }, { "epoch": 5.318882345570731, "grad_norm": 0.15666742622852325, "learning_rate": 9.184415399857369e-06, "loss": 0.4581, "num_input_tokens_seen": 51626144, "step": 42450 }, { "epoch": 5.319508833479514, "grad_norm": 0.15975697338581085, "learning_rate": 9.184116113702075e-06, "loss": 0.4644, "num_input_tokens_seen": 51632608, "step": 42455 }, { "epoch": 5.320135321388297, "grad_norm": 0.23905165493488312, "learning_rate": 9.183816777522155e-06, "loss": 0.4426, "num_input_tokens_seen": 51639008, "step": 42460 }, { "epoch": 5.320761809297081, "grad_norm": 0.06504765152931213, "learning_rate": 9.183517391321188e-06, "loss": 0.4639, "num_input_tokens_seen": 51645120, "step": 42465 }, { "epoch": 5.321388297205864, "grad_norm": 0.19680263102054596, "learning_rate": 9.183217955102754e-06, "loss": 0.4839, "num_input_tokens_seen": 51651008, "step": 42470 }, { "epoch": 5.322014785114647, "grad_norm": 0.15755978226661682, "learning_rate": 9.182918468870432e-06, "loss": 0.4823, "num_input_tokens_seen": 51656992, "step": 42475 }, { "epoch": 5.3226412730234305, "grad_norm": 0.12337286025285721, "learning_rate": 9.182618932627803e-06, "loss": 0.7374, "num_input_tokens_seen": 51663392, "step": 42480 }, { "epoch": 5.323267760932214, "grad_norm": 0.1260334998369217, "learning_rate": 9.182319346378449e-06, "loss": 0.4651, "num_input_tokens_seen": 51669632, "step": 42485 }, { "epoch": 5.323894248840998, "grad_norm": 0.054684750735759735, "learning_rate": 9.18201971012595e-06, "loss": 0.4659, "num_input_tokens_seen": 51675936, "step": 42490 }, { "epoch": 5.324520736749781, "grad_norm": 0.08090610057115555, "learning_rate": 9.181720023873888e-06, "loss": 0.4653, "num_input_tokens_seen": 51681888, "step": 42495 }, { "epoch": 5.325147224658564, "grad_norm": 0.08177080005407333, "learning_rate": 9.18142028762585e-06, "loss": 0.4673, "num_input_tokens_seen": 51688256, "step": 42500 }, { "epoch": 5.325773712567347, "grad_norm": 0.08443815261125565, "learning_rate": 9.181120501385414e-06, "loss": 0.4643, "num_input_tokens_seen": 51694176, "step": 42505 }, { "epoch": 5.326400200476131, "grad_norm": 0.05269038304686546, "learning_rate": 9.18082066515617e-06, "loss": 0.465, "num_input_tokens_seen": 51700736, "step": 42510 }, { "epoch": 5.327026688384914, "grad_norm": 0.11067909002304077, "learning_rate": 9.180520778941698e-06, "loss": 0.465, "num_input_tokens_seen": 51707264, "step": 42515 }, { "epoch": 5.327653176293698, "grad_norm": 0.08035808801651001, "learning_rate": 9.180220842745587e-06, "loss": 0.4701, "num_input_tokens_seen": 51713280, "step": 42520 }, { "epoch": 5.328279664202481, "grad_norm": 0.13494764268398285, "learning_rate": 9.179920856571419e-06, "loss": 0.4677, "num_input_tokens_seen": 51719488, "step": 42525 }, { "epoch": 5.328906152111264, "grad_norm": 0.142798513174057, "learning_rate": 9.179620820422785e-06, "loss": 0.4626, "num_input_tokens_seen": 51725600, "step": 42530 }, { "epoch": 5.329532640020048, "grad_norm": 0.07888861000537872, "learning_rate": 9.179320734303269e-06, "loss": 0.4681, "num_input_tokens_seen": 51731968, "step": 42535 }, { "epoch": 5.330159127928831, "grad_norm": 0.08379533886909485, "learning_rate": 9.17902059821646e-06, "loss": 0.4638, "num_input_tokens_seen": 51738112, "step": 42540 }, { "epoch": 5.330785615837614, "grad_norm": 0.12728451192378998, "learning_rate": 9.178720412165946e-06, "loss": 0.457, "num_input_tokens_seen": 51744096, "step": 42545 }, { "epoch": 5.3314121037463975, "grad_norm": 0.14156673848628998, "learning_rate": 9.178420176155316e-06, "loss": 0.4622, "num_input_tokens_seen": 51750368, "step": 42550 }, { "epoch": 5.332038591655181, "grad_norm": 0.08213863521814346, "learning_rate": 9.17811989018816e-06, "loss": 0.4653, "num_input_tokens_seen": 51756704, "step": 42555 }, { "epoch": 5.332665079563965, "grad_norm": 0.11623606085777283, "learning_rate": 9.177819554268068e-06, "loss": 0.4675, "num_input_tokens_seen": 51763200, "step": 42560 }, { "epoch": 5.333291567472748, "grad_norm": 0.07808580249547958, "learning_rate": 9.17751916839863e-06, "loss": 0.4602, "num_input_tokens_seen": 51769248, "step": 42565 }, { "epoch": 5.333918055381531, "grad_norm": 0.0706486627459526, "learning_rate": 9.17721873258344e-06, "loss": 0.4613, "num_input_tokens_seen": 51775456, "step": 42570 }, { "epoch": 5.334544543290314, "grad_norm": 0.09946034848690033, "learning_rate": 9.176918246826086e-06, "loss": 0.4644, "num_input_tokens_seen": 51781792, "step": 42575 }, { "epoch": 5.335171031199097, "grad_norm": 0.07758835703134537, "learning_rate": 9.176617711130164e-06, "loss": 0.4608, "num_input_tokens_seen": 51787904, "step": 42580 }, { "epoch": 5.3357975191078815, "grad_norm": 0.12474393844604492, "learning_rate": 9.176317125499265e-06, "loss": 0.4609, "num_input_tokens_seen": 51793856, "step": 42585 }, { "epoch": 5.336424007016665, "grad_norm": 0.08469732850790024, "learning_rate": 9.176016489936984e-06, "loss": 0.4597, "num_input_tokens_seen": 51800320, "step": 42590 }, { "epoch": 5.337050494925448, "grad_norm": 0.07656237483024597, "learning_rate": 9.175715804446914e-06, "loss": 0.4641, "num_input_tokens_seen": 51806368, "step": 42595 }, { "epoch": 5.337676982834231, "grad_norm": 0.12190201133489609, "learning_rate": 9.175415069032651e-06, "loss": 0.4678, "num_input_tokens_seen": 51812256, "step": 42600 }, { "epoch": 5.338303470743015, "grad_norm": 0.09402426332235336, "learning_rate": 9.175114283697793e-06, "loss": 0.4593, "num_input_tokens_seen": 51818304, "step": 42605 }, { "epoch": 5.338929958651798, "grad_norm": 0.07760754972696304, "learning_rate": 9.17481344844593e-06, "loss": 0.4609, "num_input_tokens_seen": 51824096, "step": 42610 }, { "epoch": 5.339556446560581, "grad_norm": 0.07422620803117752, "learning_rate": 9.174512563280663e-06, "loss": 0.4598, "num_input_tokens_seen": 51830048, "step": 42615 }, { "epoch": 5.3401829344693645, "grad_norm": 0.09567703306674957, "learning_rate": 9.17421162820559e-06, "loss": 0.4607, "num_input_tokens_seen": 51836096, "step": 42620 }, { "epoch": 5.340809422378148, "grad_norm": 0.09302721172571182, "learning_rate": 9.173910643224307e-06, "loss": 0.4611, "num_input_tokens_seen": 51842464, "step": 42625 }, { "epoch": 5.341435910286932, "grad_norm": 0.08914902061223984, "learning_rate": 9.173609608340412e-06, "loss": 0.4634, "num_input_tokens_seen": 51848480, "step": 42630 }, { "epoch": 5.342062398195715, "grad_norm": 0.11562909930944443, "learning_rate": 9.173308523557505e-06, "loss": 0.4655, "num_input_tokens_seen": 51854720, "step": 42635 }, { "epoch": 5.342688886104498, "grad_norm": 0.08415627479553223, "learning_rate": 9.173007388879187e-06, "loss": 0.4626, "num_input_tokens_seen": 51860608, "step": 42640 }, { "epoch": 5.343315374013281, "grad_norm": 0.12616725265979767, "learning_rate": 9.172706204309057e-06, "loss": 0.4589, "num_input_tokens_seen": 51866688, "step": 42645 }, { "epoch": 5.343941861922065, "grad_norm": 0.08549924939870834, "learning_rate": 9.172404969850715e-06, "loss": 0.4639, "num_input_tokens_seen": 51872736, "step": 42650 }, { "epoch": 5.3445683498308485, "grad_norm": 0.1322091668844223, "learning_rate": 9.172103685507765e-06, "loss": 0.4629, "num_input_tokens_seen": 51878976, "step": 42655 }, { "epoch": 5.345194837739632, "grad_norm": 0.0893554612994194, "learning_rate": 9.171802351283807e-06, "loss": 0.4592, "num_input_tokens_seen": 51884704, "step": 42660 }, { "epoch": 5.345821325648415, "grad_norm": 0.09079981595277786, "learning_rate": 9.171500967182445e-06, "loss": 0.4583, "num_input_tokens_seen": 51890560, "step": 42665 }, { "epoch": 5.346447813557198, "grad_norm": 0.0753522515296936, "learning_rate": 9.171199533207283e-06, "loss": 0.4632, "num_input_tokens_seen": 51896992, "step": 42670 }, { "epoch": 5.347074301465982, "grad_norm": 0.08502820879220963, "learning_rate": 9.17089804936192e-06, "loss": 0.4649, "num_input_tokens_seen": 51903232, "step": 42675 }, { "epoch": 5.347700789374765, "grad_norm": 0.08442569524049759, "learning_rate": 9.170596515649967e-06, "loss": 0.4664, "num_input_tokens_seen": 51909344, "step": 42680 }, { "epoch": 5.348327277283548, "grad_norm": 0.09097529947757721, "learning_rate": 9.170294932075026e-06, "loss": 0.4598, "num_input_tokens_seen": 51914400, "step": 42685 }, { "epoch": 5.348953765192332, "grad_norm": 0.06871701776981354, "learning_rate": 9.169993298640703e-06, "loss": 0.4658, "num_input_tokens_seen": 51920416, "step": 42690 }, { "epoch": 5.349580253101115, "grad_norm": 0.08487015962600708, "learning_rate": 9.169691615350602e-06, "loss": 0.4614, "num_input_tokens_seen": 51926688, "step": 42695 }, { "epoch": 5.350206741009899, "grad_norm": 0.04715970903635025, "learning_rate": 9.169389882208334e-06, "loss": 0.4536, "num_input_tokens_seen": 51932768, "step": 42700 }, { "epoch": 5.350833228918682, "grad_norm": 0.11554044485092163, "learning_rate": 9.169088099217503e-06, "loss": 0.4661, "num_input_tokens_seen": 51938944, "step": 42705 }, { "epoch": 5.351459716827465, "grad_norm": 0.11084838211536407, "learning_rate": 9.16878626638172e-06, "loss": 0.4736, "num_input_tokens_seen": 51944896, "step": 42710 }, { "epoch": 5.352086204736248, "grad_norm": 0.07793782651424408, "learning_rate": 9.168484383704592e-06, "loss": 0.4583, "num_input_tokens_seen": 51951008, "step": 42715 }, { "epoch": 5.352712692645032, "grad_norm": 0.06741039454936981, "learning_rate": 9.168182451189728e-06, "loss": 0.4676, "num_input_tokens_seen": 51957088, "step": 42720 }, { "epoch": 5.3533391805538155, "grad_norm": 0.07526874542236328, "learning_rate": 9.16788046884074e-06, "loss": 0.467, "num_input_tokens_seen": 51963072, "step": 42725 }, { "epoch": 5.353965668462599, "grad_norm": 0.06462894380092621, "learning_rate": 9.167578436661236e-06, "loss": 0.4658, "num_input_tokens_seen": 51969216, "step": 42730 }, { "epoch": 5.354592156371382, "grad_norm": 0.08162277191877365, "learning_rate": 9.167276354654827e-06, "loss": 0.4588, "num_input_tokens_seen": 51975424, "step": 42735 }, { "epoch": 5.355218644280165, "grad_norm": 0.09951084852218628, "learning_rate": 9.166974222825127e-06, "loss": 0.4572, "num_input_tokens_seen": 51981600, "step": 42740 }, { "epoch": 5.355845132188949, "grad_norm": 0.08815328776836395, "learning_rate": 9.166672041175745e-06, "loss": 0.4544, "num_input_tokens_seen": 51987840, "step": 42745 }, { "epoch": 5.356471620097732, "grad_norm": 0.03980400040745735, "learning_rate": 9.166369809710297e-06, "loss": 0.4628, "num_input_tokens_seen": 51994272, "step": 42750 }, { "epoch": 5.357098108006515, "grad_norm": 0.11388693749904633, "learning_rate": 9.166067528432394e-06, "loss": 0.4583, "num_input_tokens_seen": 52000256, "step": 42755 }, { "epoch": 5.357724595915299, "grad_norm": 0.09948732703924179, "learning_rate": 9.165765197345652e-06, "loss": 0.4638, "num_input_tokens_seen": 52006688, "step": 42760 }, { "epoch": 5.358351083824083, "grad_norm": 0.09155230224132538, "learning_rate": 9.165462816453682e-06, "loss": 0.4636, "num_input_tokens_seen": 52012928, "step": 42765 }, { "epoch": 5.358977571732866, "grad_norm": 0.08096138387918472, "learning_rate": 9.165160385760106e-06, "loss": 0.4594, "num_input_tokens_seen": 52018880, "step": 42770 }, { "epoch": 5.359604059641649, "grad_norm": 0.09074408560991287, "learning_rate": 9.164857905268534e-06, "loss": 0.4609, "num_input_tokens_seen": 52025088, "step": 42775 }, { "epoch": 5.360230547550432, "grad_norm": 0.08605729788541794, "learning_rate": 9.164555374982584e-06, "loss": 0.4629, "num_input_tokens_seen": 52031232, "step": 42780 }, { "epoch": 5.360857035459215, "grad_norm": 0.08863453567028046, "learning_rate": 9.164252794905872e-06, "loss": 0.4689, "num_input_tokens_seen": 52037504, "step": 42785 }, { "epoch": 5.361483523367999, "grad_norm": 0.06756016612052917, "learning_rate": 9.163950165042019e-06, "loss": 0.465, "num_input_tokens_seen": 52043680, "step": 42790 }, { "epoch": 5.3621100112767826, "grad_norm": 0.07496363669633865, "learning_rate": 9.16364748539464e-06, "loss": 0.4638, "num_input_tokens_seen": 52049888, "step": 42795 }, { "epoch": 5.362736499185566, "grad_norm": 0.1278352439403534, "learning_rate": 9.163344755967354e-06, "loss": 0.4585, "num_input_tokens_seen": 52056064, "step": 42800 }, { "epoch": 5.363362987094349, "grad_norm": 0.10629116743803024, "learning_rate": 9.16304197676378e-06, "loss": 0.464, "num_input_tokens_seen": 52061888, "step": 42805 }, { "epoch": 5.363989475003132, "grad_norm": 0.0748627707362175, "learning_rate": 9.162739147787542e-06, "loss": 0.4592, "num_input_tokens_seen": 52067904, "step": 42810 }, { "epoch": 5.364615962911916, "grad_norm": 0.08339117467403412, "learning_rate": 9.162436269042256e-06, "loss": 0.4588, "num_input_tokens_seen": 52074048, "step": 42815 }, { "epoch": 5.365242450820699, "grad_norm": 0.08746795356273651, "learning_rate": 9.162133340531545e-06, "loss": 0.4618, "num_input_tokens_seen": 52079680, "step": 42820 }, { "epoch": 5.3658689387294825, "grad_norm": 0.07912745326757431, "learning_rate": 9.16183036225903e-06, "loss": 0.4609, "num_input_tokens_seen": 52085696, "step": 42825 }, { "epoch": 5.366495426638266, "grad_norm": 0.07571809738874435, "learning_rate": 9.161527334228334e-06, "loss": 0.4551, "num_input_tokens_seen": 52091904, "step": 42830 }, { "epoch": 5.367121914547049, "grad_norm": 0.08293842524290085, "learning_rate": 9.161224256443082e-06, "loss": 0.464, "num_input_tokens_seen": 52098240, "step": 42835 }, { "epoch": 5.367748402455833, "grad_norm": 0.08292209357023239, "learning_rate": 9.160921128906895e-06, "loss": 0.4557, "num_input_tokens_seen": 52104192, "step": 42840 }, { "epoch": 5.368374890364616, "grad_norm": 0.08703414350748062, "learning_rate": 9.160617951623396e-06, "loss": 0.4649, "num_input_tokens_seen": 52110336, "step": 42845 }, { "epoch": 5.369001378273399, "grad_norm": 0.10767551511526108, "learning_rate": 9.160314724596211e-06, "loss": 0.4657, "num_input_tokens_seen": 52116576, "step": 42850 }, { "epoch": 5.369627866182182, "grad_norm": 0.12575684487819672, "learning_rate": 9.160011447828968e-06, "loss": 0.46, "num_input_tokens_seen": 52122688, "step": 42855 }, { "epoch": 5.370254354090966, "grad_norm": 0.16267132759094238, "learning_rate": 9.159708121325288e-06, "loss": 0.4542, "num_input_tokens_seen": 52128928, "step": 42860 }, { "epoch": 5.37088084199975, "grad_norm": 0.13102905452251434, "learning_rate": 9.159404745088802e-06, "loss": 0.4779, "num_input_tokens_seen": 52134592, "step": 42865 }, { "epoch": 5.371507329908533, "grad_norm": 0.12853555381298065, "learning_rate": 9.159101319123134e-06, "loss": 0.4585, "num_input_tokens_seen": 52140800, "step": 42870 }, { "epoch": 5.372133817817316, "grad_norm": 0.0926654264330864, "learning_rate": 9.158797843431913e-06, "loss": 0.4542, "num_input_tokens_seen": 52146816, "step": 42875 }, { "epoch": 5.372760305726099, "grad_norm": 0.07509994506835938, "learning_rate": 9.158494318018768e-06, "loss": 0.4675, "num_input_tokens_seen": 52152896, "step": 42880 }, { "epoch": 5.373386793634883, "grad_norm": 0.1813770830631256, "learning_rate": 9.158190742887327e-06, "loss": 0.4744, "num_input_tokens_seen": 52158752, "step": 42885 }, { "epoch": 5.374013281543666, "grad_norm": 0.11785905808210373, "learning_rate": 9.15788711804122e-06, "loss": 0.4474, "num_input_tokens_seen": 52165152, "step": 42890 }, { "epoch": 5.3746397694524495, "grad_norm": 0.12175535410642624, "learning_rate": 9.157583443484075e-06, "loss": 0.4736, "num_input_tokens_seen": 52171296, "step": 42895 }, { "epoch": 5.375266257361233, "grad_norm": 0.09157656878232956, "learning_rate": 9.157279719219527e-06, "loss": 0.4606, "num_input_tokens_seen": 52176320, "step": 42900 }, { "epoch": 5.375892745270017, "grad_norm": 0.04713911563158035, "learning_rate": 9.156975945251204e-06, "loss": 0.4723, "num_input_tokens_seen": 52182208, "step": 42905 }, { "epoch": 5.3765192331788, "grad_norm": 0.08027001470327377, "learning_rate": 9.156672121582737e-06, "loss": 0.4591, "num_input_tokens_seen": 52188160, "step": 42910 }, { "epoch": 5.377145721087583, "grad_norm": 0.08127285540103912, "learning_rate": 9.15636824821776e-06, "loss": 0.4624, "num_input_tokens_seen": 52194144, "step": 42915 }, { "epoch": 5.377772208996366, "grad_norm": 0.08859851956367493, "learning_rate": 9.156064325159908e-06, "loss": 0.4609, "num_input_tokens_seen": 52199904, "step": 42920 }, { "epoch": 5.378398696905149, "grad_norm": 0.0821819007396698, "learning_rate": 9.155760352412813e-06, "loss": 0.4648, "num_input_tokens_seen": 52206400, "step": 42925 }, { "epoch": 5.379025184813933, "grad_norm": 0.09072179347276688, "learning_rate": 9.155456329980106e-06, "loss": 0.4627, "num_input_tokens_seen": 52212768, "step": 42930 }, { "epoch": 5.379651672722717, "grad_norm": 0.05501185357570648, "learning_rate": 9.155152257865427e-06, "loss": 0.4697, "num_input_tokens_seen": 52219168, "step": 42935 }, { "epoch": 5.3802781606315, "grad_norm": 0.11037731170654297, "learning_rate": 9.15484813607241e-06, "loss": 0.468, "num_input_tokens_seen": 52225120, "step": 42940 }, { "epoch": 5.380904648540283, "grad_norm": 0.08303563296794891, "learning_rate": 9.15454396460469e-06, "loss": 0.4552, "num_input_tokens_seen": 52231008, "step": 42945 }, { "epoch": 5.381531136449066, "grad_norm": 0.08200905472040176, "learning_rate": 9.154239743465903e-06, "loss": 0.4565, "num_input_tokens_seen": 52237088, "step": 42950 }, { "epoch": 5.38215762435785, "grad_norm": 0.05573135241866112, "learning_rate": 9.153935472659687e-06, "loss": 0.4625, "num_input_tokens_seen": 52243168, "step": 42955 }, { "epoch": 5.382784112266633, "grad_norm": 0.14473673701286316, "learning_rate": 9.15363115218968e-06, "loss": 0.4666, "num_input_tokens_seen": 52248384, "step": 42960 }, { "epoch": 5.3834106001754165, "grad_norm": 0.0869864970445633, "learning_rate": 9.153326782059522e-06, "loss": 0.4599, "num_input_tokens_seen": 52254624, "step": 42965 }, { "epoch": 5.3840370880842, "grad_norm": 0.07612684369087219, "learning_rate": 9.15302236227285e-06, "loss": 0.4587, "num_input_tokens_seen": 52260864, "step": 42970 }, { "epoch": 5.384663575992984, "grad_norm": 0.09983103722333908, "learning_rate": 9.152717892833302e-06, "loss": 0.4624, "num_input_tokens_seen": 52267072, "step": 42975 }, { "epoch": 5.385290063901767, "grad_norm": 0.054560184478759766, "learning_rate": 9.152413373744522e-06, "loss": 0.464, "num_input_tokens_seen": 52273344, "step": 42980 }, { "epoch": 5.38591655181055, "grad_norm": 0.11135492473840714, "learning_rate": 9.152108805010147e-06, "loss": 0.4633, "num_input_tokens_seen": 52279104, "step": 42985 }, { "epoch": 5.386543039719333, "grad_norm": 0.0963582843542099, "learning_rate": 9.151804186633822e-06, "loss": 0.4584, "num_input_tokens_seen": 52284928, "step": 42990 }, { "epoch": 5.387169527628116, "grad_norm": 0.09335675090551376, "learning_rate": 9.151499518619188e-06, "loss": 0.4548, "num_input_tokens_seen": 52290720, "step": 42995 }, { "epoch": 5.3877960155369005, "grad_norm": 0.11719881743192673, "learning_rate": 9.151194800969886e-06, "loss": 0.4663, "num_input_tokens_seen": 52296992, "step": 43000 }, { "epoch": 5.388422503445684, "grad_norm": 0.07541126757860184, "learning_rate": 9.15089003368956e-06, "loss": 0.4581, "num_input_tokens_seen": 52303136, "step": 43005 }, { "epoch": 5.389048991354467, "grad_norm": 0.1339116096496582, "learning_rate": 9.150585216781853e-06, "loss": 0.4673, "num_input_tokens_seen": 52309056, "step": 43010 }, { "epoch": 5.38967547926325, "grad_norm": 0.16471166908740997, "learning_rate": 9.15028035025041e-06, "loss": 0.4745, "num_input_tokens_seen": 52314528, "step": 43015 }, { "epoch": 5.390301967172034, "grad_norm": 0.14862239360809326, "learning_rate": 9.149975434098878e-06, "loss": 0.4763, "num_input_tokens_seen": 52320448, "step": 43020 }, { "epoch": 5.390928455080817, "grad_norm": 0.0806434229016304, "learning_rate": 9.149670468330901e-06, "loss": 0.4649, "num_input_tokens_seen": 52326592, "step": 43025 }, { "epoch": 5.3915549429896, "grad_norm": 0.0895097553730011, "learning_rate": 9.149365452950123e-06, "loss": 0.4676, "num_input_tokens_seen": 52332800, "step": 43030 }, { "epoch": 5.3921814308983835, "grad_norm": 0.09252690523862839, "learning_rate": 9.149060387960191e-06, "loss": 0.4613, "num_input_tokens_seen": 52339136, "step": 43035 }, { "epoch": 5.392807918807167, "grad_norm": 0.09474358707666397, "learning_rate": 9.148755273364756e-06, "loss": 0.4593, "num_input_tokens_seen": 52344928, "step": 43040 }, { "epoch": 5.393434406715951, "grad_norm": 0.1122763454914093, "learning_rate": 9.148450109167463e-06, "loss": 0.4645, "num_input_tokens_seen": 52351040, "step": 43045 }, { "epoch": 5.394060894624734, "grad_norm": 0.0784652978181839, "learning_rate": 9.14814489537196e-06, "loss": 0.4607, "num_input_tokens_seen": 52356800, "step": 43050 }, { "epoch": 5.394687382533517, "grad_norm": 0.06967364996671677, "learning_rate": 9.1478396319819e-06, "loss": 0.462, "num_input_tokens_seen": 52362752, "step": 43055 }, { "epoch": 5.3953138704423, "grad_norm": 0.08491125702857971, "learning_rate": 9.147534319000927e-06, "loss": 0.4594, "num_input_tokens_seen": 52368768, "step": 43060 }, { "epoch": 5.395940358351083, "grad_norm": 0.07737530022859573, "learning_rate": 9.147228956432696e-06, "loss": 0.4716, "num_input_tokens_seen": 52374880, "step": 43065 }, { "epoch": 5.3965668462598675, "grad_norm": 0.09402385354042053, "learning_rate": 9.146923544280854e-06, "loss": 0.4649, "num_input_tokens_seen": 52381088, "step": 43070 }, { "epoch": 5.397193334168651, "grad_norm": 0.07482905685901642, "learning_rate": 9.146618082549057e-06, "loss": 0.4604, "num_input_tokens_seen": 52387296, "step": 43075 }, { "epoch": 5.397819822077434, "grad_norm": 0.045651424676179886, "learning_rate": 9.146312571240955e-06, "loss": 0.465, "num_input_tokens_seen": 52393440, "step": 43080 }, { "epoch": 5.398446309986217, "grad_norm": 0.12473472952842712, "learning_rate": 9.146007010360197e-06, "loss": 0.4723, "num_input_tokens_seen": 52399520, "step": 43085 }, { "epoch": 5.399072797895, "grad_norm": 0.07553046941757202, "learning_rate": 9.14570139991044e-06, "loss": 0.4627, "num_input_tokens_seen": 52405696, "step": 43090 }, { "epoch": 5.399699285803784, "grad_norm": 0.09668358415365219, "learning_rate": 9.14539573989534e-06, "loss": 0.4669, "num_input_tokens_seen": 52412160, "step": 43095 }, { "epoch": 5.400325773712567, "grad_norm": 0.0760180652141571, "learning_rate": 9.145090030318546e-06, "loss": 0.4635, "num_input_tokens_seen": 52418272, "step": 43100 }, { "epoch": 5.400952261621351, "grad_norm": 0.0769992396235466, "learning_rate": 9.144784271183716e-06, "loss": 0.4661, "num_input_tokens_seen": 52424448, "step": 43105 }, { "epoch": 5.401578749530134, "grad_norm": 0.07364591956138611, "learning_rate": 9.144478462494506e-06, "loss": 0.46, "num_input_tokens_seen": 52430368, "step": 43110 }, { "epoch": 5.402205237438918, "grad_norm": 0.09733720868825912, "learning_rate": 9.144172604254571e-06, "loss": 0.4629, "num_input_tokens_seen": 52436544, "step": 43115 }, { "epoch": 5.402831725347701, "grad_norm": 0.09977633506059647, "learning_rate": 9.14386669646757e-06, "loss": 0.4592, "num_input_tokens_seen": 52442880, "step": 43120 }, { "epoch": 5.403458213256484, "grad_norm": 0.07790064066648483, "learning_rate": 9.143560739137157e-06, "loss": 0.461, "num_input_tokens_seen": 52449152, "step": 43125 }, { "epoch": 5.404084701165267, "grad_norm": 0.07035644352436066, "learning_rate": 9.143254732266993e-06, "loss": 0.4641, "num_input_tokens_seen": 52454112, "step": 43130 }, { "epoch": 5.4047111890740505, "grad_norm": 0.11772158741950989, "learning_rate": 9.142948675860733e-06, "loss": 0.4604, "num_input_tokens_seen": 52460224, "step": 43135 }, { "epoch": 5.4053376769828345, "grad_norm": 0.06904234737157822, "learning_rate": 9.14264256992204e-06, "loss": 0.4628, "num_input_tokens_seen": 52465152, "step": 43140 }, { "epoch": 5.405964164891618, "grad_norm": 0.04162805154919624, "learning_rate": 9.142336414454572e-06, "loss": 0.462, "num_input_tokens_seen": 52471520, "step": 43145 }, { "epoch": 5.406590652800401, "grad_norm": 0.08747568726539612, "learning_rate": 9.14203020946199e-06, "loss": 0.4624, "num_input_tokens_seen": 52477728, "step": 43150 }, { "epoch": 5.407217140709184, "grad_norm": 0.07425304502248764, "learning_rate": 9.141723954947952e-06, "loss": 0.4611, "num_input_tokens_seen": 52483904, "step": 43155 }, { "epoch": 5.407843628617968, "grad_norm": 0.08926782011985779, "learning_rate": 9.141417650916124e-06, "loss": 0.4594, "num_input_tokens_seen": 52489888, "step": 43160 }, { "epoch": 5.408470116526751, "grad_norm": 0.08531283587217331, "learning_rate": 9.141111297370167e-06, "loss": 0.4634, "num_input_tokens_seen": 52496128, "step": 43165 }, { "epoch": 5.409096604435534, "grad_norm": 0.10166812688112259, "learning_rate": 9.14080489431374e-06, "loss": 0.4509, "num_input_tokens_seen": 52501696, "step": 43170 }, { "epoch": 5.409723092344318, "grad_norm": 0.08852394670248032, "learning_rate": 9.14049844175051e-06, "loss": 0.4695, "num_input_tokens_seen": 52507584, "step": 43175 }, { "epoch": 5.410349580253101, "grad_norm": 0.08789672702550888, "learning_rate": 9.140191939684142e-06, "loss": 0.4572, "num_input_tokens_seen": 52513792, "step": 43180 }, { "epoch": 5.410976068161885, "grad_norm": 0.06095001474022865, "learning_rate": 9.139885388118297e-06, "loss": 0.4627, "num_input_tokens_seen": 52519936, "step": 43185 }, { "epoch": 5.411602556070668, "grad_norm": 0.12344246357679367, "learning_rate": 9.13957878705664e-06, "loss": 0.4681, "num_input_tokens_seen": 52526176, "step": 43190 }, { "epoch": 5.412229043979451, "grad_norm": 0.08422678709030151, "learning_rate": 9.139272136502842e-06, "loss": 0.4614, "num_input_tokens_seen": 52532096, "step": 43195 }, { "epoch": 5.412855531888234, "grad_norm": 0.0979454442858696, "learning_rate": 9.138965436460561e-06, "loss": 0.4604, "num_input_tokens_seen": 52538144, "step": 43200 }, { "epoch": 5.4134820197970175, "grad_norm": 0.10652663558721542, "learning_rate": 9.13865868693347e-06, "loss": 0.4672, "num_input_tokens_seen": 52543680, "step": 43205 }, { "epoch": 5.4141085077058015, "grad_norm": 0.10003361850976944, "learning_rate": 9.138351887925236e-06, "loss": 0.4717, "num_input_tokens_seen": 52549856, "step": 43210 }, { "epoch": 5.414734995614585, "grad_norm": 0.07592424005270004, "learning_rate": 9.138045039439525e-06, "loss": 0.4659, "num_input_tokens_seen": 52555840, "step": 43215 }, { "epoch": 5.415361483523368, "grad_norm": 0.09302037954330444, "learning_rate": 9.137738141480008e-06, "loss": 0.4622, "num_input_tokens_seen": 52562080, "step": 43220 }, { "epoch": 5.415987971432151, "grad_norm": 0.12277403473854065, "learning_rate": 9.137431194050352e-06, "loss": 0.4622, "num_input_tokens_seen": 52568320, "step": 43225 }, { "epoch": 5.416614459340935, "grad_norm": 0.11996354907751083, "learning_rate": 9.137124197154225e-06, "loss": 0.4626, "num_input_tokens_seen": 52574592, "step": 43230 }, { "epoch": 5.417240947249718, "grad_norm": 0.05039655789732933, "learning_rate": 9.136817150795302e-06, "loss": 0.4599, "num_input_tokens_seen": 52580608, "step": 43235 }, { "epoch": 5.417867435158501, "grad_norm": 0.13305629789829254, "learning_rate": 9.136510054977253e-06, "loss": 0.472, "num_input_tokens_seen": 52585824, "step": 43240 }, { "epoch": 5.418493923067285, "grad_norm": 0.12426541745662689, "learning_rate": 9.136202909703747e-06, "loss": 0.4587, "num_input_tokens_seen": 52592192, "step": 43245 }, { "epoch": 5.419120410976068, "grad_norm": 0.11257163435220718, "learning_rate": 9.135895714978459e-06, "loss": 0.4566, "num_input_tokens_seen": 52598176, "step": 43250 }, { "epoch": 5.419746898884852, "grad_norm": 0.09105806797742844, "learning_rate": 9.13558847080506e-06, "loss": 0.4585, "num_input_tokens_seen": 52604192, "step": 43255 }, { "epoch": 5.420373386793635, "grad_norm": 0.07114694267511368, "learning_rate": 9.135281177187222e-06, "loss": 0.4639, "num_input_tokens_seen": 52610528, "step": 43260 }, { "epoch": 5.420999874702418, "grad_norm": 0.042988259345293045, "learning_rate": 9.134973834128621e-06, "loss": 0.461, "num_input_tokens_seen": 52616800, "step": 43265 }, { "epoch": 5.421626362611201, "grad_norm": 0.08293097466230392, "learning_rate": 9.134666441632934e-06, "loss": 0.4643, "num_input_tokens_seen": 52622720, "step": 43270 }, { "epoch": 5.422252850519985, "grad_norm": 0.08429723232984543, "learning_rate": 9.134358999703831e-06, "loss": 0.4602, "num_input_tokens_seen": 52628576, "step": 43275 }, { "epoch": 5.422879338428769, "grad_norm": 0.08103273808956146, "learning_rate": 9.134051508344992e-06, "loss": 0.4599, "num_input_tokens_seen": 52634240, "step": 43280 }, { "epoch": 5.423505826337552, "grad_norm": 0.08192720264196396, "learning_rate": 9.13374396756009e-06, "loss": 0.4653, "num_input_tokens_seen": 52640288, "step": 43285 }, { "epoch": 5.424132314246335, "grad_norm": 0.07713839411735535, "learning_rate": 9.133436377352805e-06, "loss": 0.4656, "num_input_tokens_seen": 52646464, "step": 43290 }, { "epoch": 5.424758802155118, "grad_norm": 0.08092856407165527, "learning_rate": 9.13312873772681e-06, "loss": 0.462, "num_input_tokens_seen": 52652640, "step": 43295 }, { "epoch": 5.425385290063902, "grad_norm": 0.0975157842040062, "learning_rate": 9.13282104868579e-06, "loss": 0.4638, "num_input_tokens_seen": 52657984, "step": 43300 }, { "epoch": 5.426011777972685, "grad_norm": 0.07745189964771271, "learning_rate": 9.132513310233416e-06, "loss": 0.4617, "num_input_tokens_seen": 52664032, "step": 43305 }, { "epoch": 5.4266382658814685, "grad_norm": 0.11052374541759491, "learning_rate": 9.132205522373373e-06, "loss": 0.4649, "num_input_tokens_seen": 52670144, "step": 43310 }, { "epoch": 5.427264753790252, "grad_norm": 0.10718879848718643, "learning_rate": 9.131897685109338e-06, "loss": 0.4593, "num_input_tokens_seen": 52676416, "step": 43315 }, { "epoch": 5.427891241699035, "grad_norm": 0.08932453393936157, "learning_rate": 9.131589798444993e-06, "loss": 0.4539, "num_input_tokens_seen": 52682880, "step": 43320 }, { "epoch": 5.428517729607819, "grad_norm": 0.06629551202058792, "learning_rate": 9.131281862384017e-06, "loss": 0.4641, "num_input_tokens_seen": 52688896, "step": 43325 }, { "epoch": 5.429144217516602, "grad_norm": 0.03975584730505943, "learning_rate": 9.130973876930094e-06, "loss": 0.4664, "num_input_tokens_seen": 52694976, "step": 43330 }, { "epoch": 5.429770705425385, "grad_norm": 0.10663498938083649, "learning_rate": 9.130665842086905e-06, "loss": 0.4647, "num_input_tokens_seen": 52700512, "step": 43335 }, { "epoch": 5.430397193334168, "grad_norm": 0.0840250700712204, "learning_rate": 9.130357757858132e-06, "loss": 0.4631, "num_input_tokens_seen": 52706176, "step": 43340 }, { "epoch": 5.4310236812429515, "grad_norm": 0.10337696224451065, "learning_rate": 9.130049624247461e-06, "loss": 0.4646, "num_input_tokens_seen": 52712192, "step": 43345 }, { "epoch": 5.431650169151736, "grad_norm": 0.09313724935054779, "learning_rate": 9.129741441258575e-06, "loss": 0.468, "num_input_tokens_seen": 52718336, "step": 43350 }, { "epoch": 5.432276657060519, "grad_norm": 0.08345204591751099, "learning_rate": 9.129433208895155e-06, "loss": 0.4607, "num_input_tokens_seen": 52724288, "step": 43355 }, { "epoch": 5.432903144969302, "grad_norm": 0.10857195407152176, "learning_rate": 9.129124927160891e-06, "loss": 0.4575, "num_input_tokens_seen": 52730656, "step": 43360 }, { "epoch": 5.433529632878085, "grad_norm": 0.08298805356025696, "learning_rate": 9.128816596059467e-06, "loss": 0.4588, "num_input_tokens_seen": 52736576, "step": 43365 }, { "epoch": 5.434156120786869, "grad_norm": 0.06862059235572815, "learning_rate": 9.128508215594569e-06, "loss": 0.4579, "num_input_tokens_seen": 52742560, "step": 43370 }, { "epoch": 5.434782608695652, "grad_norm": 0.12908503413200378, "learning_rate": 9.128199785769884e-06, "loss": 0.4642, "num_input_tokens_seen": 52748768, "step": 43375 }, { "epoch": 5.4354090966044355, "grad_norm": 0.09800057858228683, "learning_rate": 9.127891306589101e-06, "loss": 0.4635, "num_input_tokens_seen": 52754848, "step": 43380 }, { "epoch": 5.436035584513219, "grad_norm": 0.12027348577976227, "learning_rate": 9.127582778055906e-06, "loss": 0.4619, "num_input_tokens_seen": 52760480, "step": 43385 }, { "epoch": 5.436662072422003, "grad_norm": 0.0687539353966713, "learning_rate": 9.127274200173988e-06, "loss": 0.4589, "num_input_tokens_seen": 52766592, "step": 43390 }, { "epoch": 5.437288560330786, "grad_norm": 0.10736455023288727, "learning_rate": 9.126965572947038e-06, "loss": 0.4626, "num_input_tokens_seen": 52772960, "step": 43395 }, { "epoch": 5.437915048239569, "grad_norm": 0.06653456389904022, "learning_rate": 9.126656896378744e-06, "loss": 0.458, "num_input_tokens_seen": 52779072, "step": 43400 }, { "epoch": 5.438541536148352, "grad_norm": 0.1006971150636673, "learning_rate": 9.126348170472798e-06, "loss": 0.458, "num_input_tokens_seen": 52785248, "step": 43405 }, { "epoch": 5.439168024057135, "grad_norm": 0.0717511996626854, "learning_rate": 9.12603939523289e-06, "loss": 0.4601, "num_input_tokens_seen": 52791296, "step": 43410 }, { "epoch": 5.4397945119659195, "grad_norm": 0.07807579636573792, "learning_rate": 9.125730570662712e-06, "loss": 0.4668, "num_input_tokens_seen": 52797216, "step": 43415 }, { "epoch": 5.440420999874703, "grad_norm": 0.07270745933055878, "learning_rate": 9.125421696765956e-06, "loss": 0.4603, "num_input_tokens_seen": 52803552, "step": 43420 }, { "epoch": 5.441047487783486, "grad_norm": 0.07603579759597778, "learning_rate": 9.125112773546315e-06, "loss": 0.4626, "num_input_tokens_seen": 52809472, "step": 43425 }, { "epoch": 5.441673975692269, "grad_norm": 0.06854642927646637, "learning_rate": 9.124803801007484e-06, "loss": 0.4659, "num_input_tokens_seen": 52815712, "step": 43430 }, { "epoch": 5.442300463601052, "grad_norm": 0.10518772155046463, "learning_rate": 9.124494779153154e-06, "loss": 0.4558, "num_input_tokens_seen": 52821856, "step": 43435 }, { "epoch": 5.442926951509836, "grad_norm": 0.0857359915971756, "learning_rate": 9.124185707987023e-06, "loss": 0.4658, "num_input_tokens_seen": 52828288, "step": 43440 }, { "epoch": 5.443553439418619, "grad_norm": 0.07147509604692459, "learning_rate": 9.123876587512783e-06, "loss": 0.4667, "num_input_tokens_seen": 52834560, "step": 43445 }, { "epoch": 5.4441799273274025, "grad_norm": 0.1105034127831459, "learning_rate": 9.12356741773413e-06, "loss": 0.4581, "num_input_tokens_seen": 52840864, "step": 43450 }, { "epoch": 5.444806415236186, "grad_norm": 0.09011892974376678, "learning_rate": 9.123258198654764e-06, "loss": 0.4616, "num_input_tokens_seen": 52847040, "step": 43455 }, { "epoch": 5.445432903144969, "grad_norm": 0.10199205577373505, "learning_rate": 9.12294893027838e-06, "loss": 0.4603, "num_input_tokens_seen": 52853248, "step": 43460 }, { "epoch": 5.446059391053753, "grad_norm": 0.10408154875040054, "learning_rate": 9.122639612608673e-06, "loss": 0.4651, "num_input_tokens_seen": 52859360, "step": 43465 }, { "epoch": 5.446685878962536, "grad_norm": 0.07366686314344406, "learning_rate": 9.122330245649344e-06, "loss": 0.4683, "num_input_tokens_seen": 52866048, "step": 43470 }, { "epoch": 5.447312366871319, "grad_norm": 0.04721350595355034, "learning_rate": 9.12202082940409e-06, "loss": 0.4607, "num_input_tokens_seen": 52872192, "step": 43475 }, { "epoch": 5.447938854780102, "grad_norm": 0.04619922861456871, "learning_rate": 9.121711363876612e-06, "loss": 0.4528, "num_input_tokens_seen": 52878368, "step": 43480 }, { "epoch": 5.4485653426888865, "grad_norm": 0.07104659080505371, "learning_rate": 9.121401849070612e-06, "loss": 0.4616, "num_input_tokens_seen": 52884256, "step": 43485 }, { "epoch": 5.44919183059767, "grad_norm": 0.07837116718292236, "learning_rate": 9.121092284989786e-06, "loss": 0.4598, "num_input_tokens_seen": 52890496, "step": 43490 }, { "epoch": 5.449818318506453, "grad_norm": 0.12131433188915253, "learning_rate": 9.120782671637835e-06, "loss": 0.4634, "num_input_tokens_seen": 52896544, "step": 43495 }, { "epoch": 5.450444806415236, "grad_norm": 0.08085975050926208, "learning_rate": 9.120473009018466e-06, "loss": 0.4612, "num_input_tokens_seen": 52902720, "step": 43500 }, { "epoch": 5.451071294324019, "grad_norm": 0.04701235890388489, "learning_rate": 9.120163297135377e-06, "loss": 0.4669, "num_input_tokens_seen": 52909056, "step": 43505 }, { "epoch": 5.451697782232803, "grad_norm": 0.08209898322820663, "learning_rate": 9.11985353599227e-06, "loss": 0.4725, "num_input_tokens_seen": 52915264, "step": 43510 }, { "epoch": 5.452324270141586, "grad_norm": 0.07670692354440689, "learning_rate": 9.11954372559285e-06, "loss": 0.4645, "num_input_tokens_seen": 52921344, "step": 43515 }, { "epoch": 5.4529507580503696, "grad_norm": 0.10640626400709152, "learning_rate": 9.119233865940823e-06, "loss": 0.4659, "num_input_tokens_seen": 52927616, "step": 43520 }, { "epoch": 5.453577245959153, "grad_norm": 0.08258085697889328, "learning_rate": 9.11892395703989e-06, "loss": 0.4629, "num_input_tokens_seen": 52933792, "step": 43525 }, { "epoch": 5.454203733867937, "grad_norm": 0.06728807836771011, "learning_rate": 9.11861399889376e-06, "loss": 0.4578, "num_input_tokens_seen": 52939648, "step": 43530 }, { "epoch": 5.45483022177672, "grad_norm": 0.07937045395374298, "learning_rate": 9.118303991506136e-06, "loss": 0.4624, "num_input_tokens_seen": 52945184, "step": 43535 }, { "epoch": 5.455456709685503, "grad_norm": 0.040994442999362946, "learning_rate": 9.117993934880727e-06, "loss": 0.4694, "num_input_tokens_seen": 52951008, "step": 43540 }, { "epoch": 5.456083197594286, "grad_norm": 0.11982596665620804, "learning_rate": 9.117683829021235e-06, "loss": 0.4658, "num_input_tokens_seen": 52956960, "step": 43545 }, { "epoch": 5.4567096855030695, "grad_norm": 0.07299249619245529, "learning_rate": 9.117373673931373e-06, "loss": 0.4618, "num_input_tokens_seen": 52963104, "step": 43550 }, { "epoch": 5.4573361734118535, "grad_norm": 0.04428211972117424, "learning_rate": 9.117063469614846e-06, "loss": 0.4609, "num_input_tokens_seen": 52969152, "step": 43555 }, { "epoch": 5.457962661320637, "grad_norm": 0.0765790268778801, "learning_rate": 9.116753216075366e-06, "loss": 0.4635, "num_input_tokens_seen": 52975296, "step": 43560 }, { "epoch": 5.45858914922942, "grad_norm": 0.07960768789052963, "learning_rate": 9.11644291331664e-06, "loss": 0.4597, "num_input_tokens_seen": 52981408, "step": 43565 }, { "epoch": 5.459215637138203, "grad_norm": 0.07042239606380463, "learning_rate": 9.116132561342375e-06, "loss": 0.4681, "num_input_tokens_seen": 52987552, "step": 43570 }, { "epoch": 5.459842125046986, "grad_norm": 0.06964164227247238, "learning_rate": 9.115822160156287e-06, "loss": 0.4629, "num_input_tokens_seen": 52993440, "step": 43575 }, { "epoch": 5.46046861295577, "grad_norm": 0.04041608050465584, "learning_rate": 9.115511709762083e-06, "loss": 0.4579, "num_input_tokens_seen": 52999456, "step": 43580 }, { "epoch": 5.461095100864553, "grad_norm": 0.1391463428735733, "learning_rate": 9.115201210163479e-06, "loss": 0.4649, "num_input_tokens_seen": 53005408, "step": 43585 }, { "epoch": 5.461721588773337, "grad_norm": 0.10271279513835907, "learning_rate": 9.114890661364181e-06, "loss": 0.4669, "num_input_tokens_seen": 53011328, "step": 43590 }, { "epoch": 5.46234807668212, "grad_norm": 0.16184474527835846, "learning_rate": 9.114580063367908e-06, "loss": 0.4621, "num_input_tokens_seen": 53017344, "step": 43595 }, { "epoch": 5.462974564590904, "grad_norm": 0.11198265850543976, "learning_rate": 9.11426941617837e-06, "loss": 0.4602, "num_input_tokens_seen": 53023456, "step": 43600 }, { "epoch": 5.463601052499687, "grad_norm": 0.0729377493262291, "learning_rate": 9.113958719799282e-06, "loss": 0.4612, "num_input_tokens_seen": 53029088, "step": 43605 }, { "epoch": 5.46422754040847, "grad_norm": 0.06987528502941132, "learning_rate": 9.11364797423436e-06, "loss": 0.464, "num_input_tokens_seen": 53034656, "step": 43610 }, { "epoch": 5.464854028317253, "grad_norm": 0.1060928925871849, "learning_rate": 9.113337179487316e-06, "loss": 0.4665, "num_input_tokens_seen": 53040512, "step": 43615 }, { "epoch": 5.4654805162260365, "grad_norm": 0.13684490323066711, "learning_rate": 9.11302633556187e-06, "loss": 0.4639, "num_input_tokens_seen": 53046272, "step": 43620 }, { "epoch": 5.4661070041348205, "grad_norm": 0.039547860622406006, "learning_rate": 9.112715442461735e-06, "loss": 0.4623, "num_input_tokens_seen": 53052416, "step": 43625 }, { "epoch": 5.466733492043604, "grad_norm": 0.0465206578373909, "learning_rate": 9.112404500190628e-06, "loss": 0.4635, "num_input_tokens_seen": 53058624, "step": 43630 }, { "epoch": 5.467359979952387, "grad_norm": 0.0988023653626442, "learning_rate": 9.112093508752269e-06, "loss": 0.4628, "num_input_tokens_seen": 53064576, "step": 43635 }, { "epoch": 5.46798646786117, "grad_norm": 0.08217503130435944, "learning_rate": 9.111782468150374e-06, "loss": 0.4626, "num_input_tokens_seen": 53070112, "step": 43640 }, { "epoch": 5.468612955769954, "grad_norm": 0.10820125043392181, "learning_rate": 9.111471378388664e-06, "loss": 0.4538, "num_input_tokens_seen": 53075968, "step": 43645 }, { "epoch": 5.469239443678737, "grad_norm": 0.11329350620508194, "learning_rate": 9.111160239470855e-06, "loss": 0.4621, "num_input_tokens_seen": 53082080, "step": 43650 }, { "epoch": 5.46986593158752, "grad_norm": 0.07502724975347519, "learning_rate": 9.110849051400671e-06, "loss": 0.4631, "num_input_tokens_seen": 53087840, "step": 43655 }, { "epoch": 5.470492419496304, "grad_norm": 0.0917937383055687, "learning_rate": 9.11053781418183e-06, "loss": 0.4653, "num_input_tokens_seen": 53094048, "step": 43660 }, { "epoch": 5.471118907405087, "grad_norm": 0.1134888157248497, "learning_rate": 9.110226527818052e-06, "loss": 0.4566, "num_input_tokens_seen": 53100352, "step": 43665 }, { "epoch": 5.471745395313871, "grad_norm": 0.1378830224275589, "learning_rate": 9.10991519231306e-06, "loss": 0.4666, "num_input_tokens_seen": 53106592, "step": 43670 }, { "epoch": 5.472371883222654, "grad_norm": 0.16204455494880676, "learning_rate": 9.109603807670579e-06, "loss": 0.4599, "num_input_tokens_seen": 53112672, "step": 43675 }, { "epoch": 5.472998371131437, "grad_norm": 0.06423653662204742, "learning_rate": 9.109292373894327e-06, "loss": 0.4481, "num_input_tokens_seen": 53118656, "step": 43680 }, { "epoch": 5.47362485904022, "grad_norm": 0.09649959951639175, "learning_rate": 9.10898089098803e-06, "loss": 0.4586, "num_input_tokens_seen": 53125184, "step": 43685 }, { "epoch": 5.4742513469490035, "grad_norm": 0.31194064021110535, "learning_rate": 9.108669358955414e-06, "loss": 0.4813, "num_input_tokens_seen": 53131328, "step": 43690 }, { "epoch": 5.474877834857788, "grad_norm": 0.1443438082933426, "learning_rate": 9.1083577778002e-06, "loss": 0.4678, "num_input_tokens_seen": 53137600, "step": 43695 }, { "epoch": 5.475504322766571, "grad_norm": 0.07130753248929977, "learning_rate": 9.108046147526113e-06, "loss": 0.465, "num_input_tokens_seen": 53143488, "step": 43700 }, { "epoch": 5.476130810675354, "grad_norm": 0.08290612697601318, "learning_rate": 9.107734468136882e-06, "loss": 0.4612, "num_input_tokens_seen": 53149728, "step": 43705 }, { "epoch": 5.476757298584137, "grad_norm": 0.06985963881015778, "learning_rate": 9.10742273963623e-06, "loss": 0.4574, "num_input_tokens_seen": 53156000, "step": 43710 }, { "epoch": 5.47738378649292, "grad_norm": 0.16920959949493408, "learning_rate": 9.10711096202789e-06, "loss": 0.4653, "num_input_tokens_seen": 53161664, "step": 43715 }, { "epoch": 5.478010274401704, "grad_norm": 0.09532693773508072, "learning_rate": 9.106799135315583e-06, "loss": 0.4589, "num_input_tokens_seen": 53167584, "step": 43720 }, { "epoch": 5.4786367623104875, "grad_norm": 0.07168315351009369, "learning_rate": 9.106487259503039e-06, "loss": 0.463, "num_input_tokens_seen": 53173824, "step": 43725 }, { "epoch": 5.479263250219271, "grad_norm": 0.10496332496404648, "learning_rate": 9.106175334593987e-06, "loss": 0.4626, "num_input_tokens_seen": 53179744, "step": 43730 }, { "epoch": 5.479889738128054, "grad_norm": 0.06889889389276505, "learning_rate": 9.105863360592157e-06, "loss": 0.4622, "num_input_tokens_seen": 53185920, "step": 43735 }, { "epoch": 5.480516226036838, "grad_norm": 0.04057912528514862, "learning_rate": 9.10555133750128e-06, "loss": 0.4663, "num_input_tokens_seen": 53192032, "step": 43740 }, { "epoch": 5.481142713945621, "grad_norm": 0.08763369172811508, "learning_rate": 9.105239265325084e-06, "loss": 0.4621, "num_input_tokens_seen": 53198080, "step": 43745 }, { "epoch": 5.481769201854404, "grad_norm": 0.10671471059322357, "learning_rate": 9.1049271440673e-06, "loss": 0.4522, "num_input_tokens_seen": 53204224, "step": 43750 }, { "epoch": 5.482395689763187, "grad_norm": 0.06375087797641754, "learning_rate": 9.104614973731662e-06, "loss": 0.4606, "num_input_tokens_seen": 53210400, "step": 43755 }, { "epoch": 5.4830221776719705, "grad_norm": 0.10268734395503998, "learning_rate": 9.1043027543219e-06, "loss": 0.4633, "num_input_tokens_seen": 53216672, "step": 43760 }, { "epoch": 5.483648665580755, "grad_norm": 0.09316971153020859, "learning_rate": 9.10399048584175e-06, "loss": 0.4563, "num_input_tokens_seen": 53223296, "step": 43765 }, { "epoch": 5.484275153489538, "grad_norm": 0.06631474196910858, "learning_rate": 9.10367816829494e-06, "loss": 0.4608, "num_input_tokens_seen": 53229760, "step": 43770 }, { "epoch": 5.484901641398321, "grad_norm": 0.14757198095321655, "learning_rate": 9.10336580168521e-06, "loss": 0.4638, "num_input_tokens_seen": 53235552, "step": 43775 }, { "epoch": 5.485528129307104, "grad_norm": 0.13248009979724884, "learning_rate": 9.103053386016291e-06, "loss": 0.4637, "num_input_tokens_seen": 53240832, "step": 43780 }, { "epoch": 5.486154617215888, "grad_norm": 0.08744946867227554, "learning_rate": 9.102740921291919e-06, "loss": 0.4631, "num_input_tokens_seen": 53246752, "step": 43785 }, { "epoch": 5.486781105124671, "grad_norm": 0.120752714574337, "learning_rate": 9.10242840751583e-06, "loss": 0.4554, "num_input_tokens_seen": 53252960, "step": 43790 }, { "epoch": 5.4874075930334545, "grad_norm": 0.12350025027990341, "learning_rate": 9.10211584469176e-06, "loss": 0.4638, "num_input_tokens_seen": 53259136, "step": 43795 }, { "epoch": 5.488034080942238, "grad_norm": 0.10528462380170822, "learning_rate": 9.101803232823448e-06, "loss": 0.4668, "num_input_tokens_seen": 53265376, "step": 43800 }, { "epoch": 5.488660568851021, "grad_norm": 0.06353308260440826, "learning_rate": 9.101490571914627e-06, "loss": 0.4587, "num_input_tokens_seen": 53271328, "step": 43805 }, { "epoch": 5.489287056759805, "grad_norm": 0.04137852042913437, "learning_rate": 9.101177861969039e-06, "loss": 0.4632, "num_input_tokens_seen": 53277216, "step": 43810 }, { "epoch": 5.489913544668588, "grad_norm": 0.09238120913505554, "learning_rate": 9.100865102990421e-06, "loss": 0.4621, "num_input_tokens_seen": 53283296, "step": 43815 }, { "epoch": 5.490540032577371, "grad_norm": 0.08272778987884521, "learning_rate": 9.100552294982515e-06, "loss": 0.4533, "num_input_tokens_seen": 53289568, "step": 43820 }, { "epoch": 5.491166520486154, "grad_norm": 0.04943421110510826, "learning_rate": 9.100239437949056e-06, "loss": 0.4663, "num_input_tokens_seen": 53295264, "step": 43825 }, { "epoch": 5.491793008394938, "grad_norm": 0.0414932444691658, "learning_rate": 9.09992653189379e-06, "loss": 0.462, "num_input_tokens_seen": 53301024, "step": 43830 }, { "epoch": 5.492419496303722, "grad_norm": 0.08897069841623306, "learning_rate": 9.099613576820452e-06, "loss": 0.4703, "num_input_tokens_seen": 53307296, "step": 43835 }, { "epoch": 5.493045984212505, "grad_norm": 0.07312601059675217, "learning_rate": 9.09930057273279e-06, "loss": 0.4643, "num_input_tokens_seen": 53313312, "step": 43840 }, { "epoch": 5.493672472121288, "grad_norm": 0.07730474323034286, "learning_rate": 9.098987519634542e-06, "loss": 0.4657, "num_input_tokens_seen": 53319456, "step": 43845 }, { "epoch": 5.494298960030071, "grad_norm": 0.07258737832307816, "learning_rate": 9.098674417529452e-06, "loss": 0.4635, "num_input_tokens_seen": 53326016, "step": 43850 }, { "epoch": 5.494925447938855, "grad_norm": 0.048537470400333405, "learning_rate": 9.098361266421263e-06, "loss": 0.4591, "num_input_tokens_seen": 53332096, "step": 43855 }, { "epoch": 5.495551935847638, "grad_norm": 0.09626661241054535, "learning_rate": 9.09804806631372e-06, "loss": 0.4619, "num_input_tokens_seen": 53338144, "step": 43860 }, { "epoch": 5.4961784237564215, "grad_norm": 0.10454890131950378, "learning_rate": 9.097734817210566e-06, "loss": 0.4666, "num_input_tokens_seen": 53344160, "step": 43865 }, { "epoch": 5.496804911665205, "grad_norm": 0.07409169524908066, "learning_rate": 9.097421519115547e-06, "loss": 0.4633, "num_input_tokens_seen": 53350240, "step": 43870 }, { "epoch": 5.497431399573988, "grad_norm": 0.10609120875597, "learning_rate": 9.09710817203241e-06, "loss": 0.4626, "num_input_tokens_seen": 53356416, "step": 43875 }, { "epoch": 5.498057887482772, "grad_norm": 0.07106999307870865, "learning_rate": 9.096794775964899e-06, "loss": 0.4627, "num_input_tokens_seen": 53362656, "step": 43880 }, { "epoch": 5.498684375391555, "grad_norm": 0.07571656256914139, "learning_rate": 9.096481330916763e-06, "loss": 0.4609, "num_input_tokens_seen": 53368576, "step": 43885 }, { "epoch": 5.499310863300338, "grad_norm": 0.0649731457233429, "learning_rate": 9.096167836891748e-06, "loss": 0.4619, "num_input_tokens_seen": 53374816, "step": 43890 }, { "epoch": 5.499937351209121, "grad_norm": 0.04257659614086151, "learning_rate": 9.095854293893603e-06, "loss": 0.4651, "num_input_tokens_seen": 53380736, "step": 43895 }, { "epoch": 5.5005638391179055, "grad_norm": 0.08821363002061844, "learning_rate": 9.095540701926077e-06, "loss": 0.4619, "num_input_tokens_seen": 53387040, "step": 43900 }, { "epoch": 5.501190327026689, "grad_norm": 0.12678512930870056, "learning_rate": 9.095227060992918e-06, "loss": 0.4687, "num_input_tokens_seen": 53392992, "step": 43905 }, { "epoch": 5.501816814935472, "grad_norm": 0.07528684288263321, "learning_rate": 9.094913371097876e-06, "loss": 0.4607, "num_input_tokens_seen": 53399168, "step": 43910 }, { "epoch": 5.502443302844255, "grad_norm": 0.09926978498697281, "learning_rate": 9.0945996322447e-06, "loss": 0.4588, "num_input_tokens_seen": 53404992, "step": 43915 }, { "epoch": 5.503069790753038, "grad_norm": 0.06473342329263687, "learning_rate": 9.094285844437145e-06, "loss": 0.4595, "num_input_tokens_seen": 53411232, "step": 43920 }, { "epoch": 5.503696278661822, "grad_norm": 0.051439616829156876, "learning_rate": 9.093972007678962e-06, "loss": 0.4677, "num_input_tokens_seen": 53417184, "step": 43925 }, { "epoch": 5.504322766570605, "grad_norm": 0.06533915549516678, "learning_rate": 9.093658121973899e-06, "loss": 0.463, "num_input_tokens_seen": 53423232, "step": 43930 }, { "epoch": 5.5049492544793885, "grad_norm": 0.06680744141340256, "learning_rate": 9.093344187325712e-06, "loss": 0.4634, "num_input_tokens_seen": 53429472, "step": 43935 }, { "epoch": 5.505575742388172, "grad_norm": 0.11014620214700699, "learning_rate": 9.093030203738154e-06, "loss": 0.47, "num_input_tokens_seen": 53435680, "step": 43940 }, { "epoch": 5.506202230296955, "grad_norm": 0.0801430493593216, "learning_rate": 9.09271617121498e-06, "loss": 0.4714, "num_input_tokens_seen": 53442016, "step": 43945 }, { "epoch": 5.506828718205739, "grad_norm": 0.05799496918916702, "learning_rate": 9.09240208975994e-06, "loss": 0.4631, "num_input_tokens_seen": 53448128, "step": 43950 }, { "epoch": 5.507455206114522, "grad_norm": 0.06819387525320053, "learning_rate": 9.092087959376796e-06, "loss": 0.4609, "num_input_tokens_seen": 53454016, "step": 43955 }, { "epoch": 5.508081694023305, "grad_norm": 0.06516470015048981, "learning_rate": 9.091773780069297e-06, "loss": 0.4605, "num_input_tokens_seen": 53460160, "step": 43960 }, { "epoch": 5.508708181932088, "grad_norm": 0.07314486056566238, "learning_rate": 9.091459551841203e-06, "loss": 0.4583, "num_input_tokens_seen": 53466400, "step": 43965 }, { "epoch": 5.509334669840872, "grad_norm": 0.10289228707551956, "learning_rate": 9.091145274696271e-06, "loss": 0.4595, "num_input_tokens_seen": 53472512, "step": 43970 }, { "epoch": 5.509961157749656, "grad_norm": 0.10777971893548965, "learning_rate": 9.090830948638257e-06, "loss": 0.4616, "num_input_tokens_seen": 53478752, "step": 43975 }, { "epoch": 5.510587645658439, "grad_norm": 0.08232714980840683, "learning_rate": 9.09051657367092e-06, "loss": 0.4683, "num_input_tokens_seen": 53484384, "step": 43980 }, { "epoch": 5.511214133567222, "grad_norm": 0.09736138582229614, "learning_rate": 9.09020214979802e-06, "loss": 0.4653, "num_input_tokens_seen": 53490496, "step": 43985 }, { "epoch": 5.511840621476005, "grad_norm": 0.12017685920000076, "learning_rate": 9.08988767702331e-06, "loss": 0.4673, "num_input_tokens_seen": 53496704, "step": 43990 }, { "epoch": 5.512467109384789, "grad_norm": 0.06682175397872925, "learning_rate": 9.089573155350557e-06, "loss": 0.4633, "num_input_tokens_seen": 53503104, "step": 43995 }, { "epoch": 5.513093597293572, "grad_norm": 0.10010530799627304, "learning_rate": 9.08925858478352e-06, "loss": 0.4619, "num_input_tokens_seen": 53509280, "step": 44000 }, { "epoch": 5.513720085202356, "grad_norm": 0.06844883412122726, "learning_rate": 9.088943965325957e-06, "loss": 0.4647, "num_input_tokens_seen": 53515616, "step": 44005 }, { "epoch": 5.514346573111139, "grad_norm": 0.07061431556940079, "learning_rate": 9.08862929698163e-06, "loss": 0.4597, "num_input_tokens_seen": 53521120, "step": 44010 }, { "epoch": 5.514973061019923, "grad_norm": 0.08881578594446182, "learning_rate": 9.088314579754302e-06, "loss": 0.4575, "num_input_tokens_seen": 53527072, "step": 44015 }, { "epoch": 5.515599548928706, "grad_norm": 0.07807426899671555, "learning_rate": 9.087999813647739e-06, "loss": 0.4624, "num_input_tokens_seen": 53533408, "step": 44020 }, { "epoch": 5.516226036837489, "grad_norm": 0.06194450333714485, "learning_rate": 9.0876849986657e-06, "loss": 0.457, "num_input_tokens_seen": 53539392, "step": 44025 }, { "epoch": 5.516852524746272, "grad_norm": 0.05907857045531273, "learning_rate": 9.087370134811948e-06, "loss": 0.4521, "num_input_tokens_seen": 53545440, "step": 44030 }, { "epoch": 5.5174790126550555, "grad_norm": 0.0739901214838028, "learning_rate": 9.087055222090252e-06, "loss": 0.4667, "num_input_tokens_seen": 53551456, "step": 44035 }, { "epoch": 5.5181055005638395, "grad_norm": 0.06479618698358536, "learning_rate": 9.086740260504375e-06, "loss": 0.462, "num_input_tokens_seen": 53557376, "step": 44040 }, { "epoch": 5.518731988472623, "grad_norm": 0.10977950692176819, "learning_rate": 9.08642525005808e-06, "loss": 0.4635, "num_input_tokens_seen": 53563616, "step": 44045 }, { "epoch": 5.519358476381406, "grad_norm": 0.07104482501745224, "learning_rate": 9.086110190755135e-06, "loss": 0.4652, "num_input_tokens_seen": 53569792, "step": 44050 }, { "epoch": 5.519984964290189, "grad_norm": 0.1250787079334259, "learning_rate": 9.085795082599309e-06, "loss": 0.4718, "num_input_tokens_seen": 53575904, "step": 44055 }, { "epoch": 5.520611452198972, "grad_norm": 0.07358089089393616, "learning_rate": 9.085479925594368e-06, "loss": 0.4559, "num_input_tokens_seen": 53582208, "step": 44060 }, { "epoch": 5.521237940107756, "grad_norm": 0.07034514099359512, "learning_rate": 9.085164719744079e-06, "loss": 0.4612, "num_input_tokens_seen": 53588352, "step": 44065 }, { "epoch": 5.521864428016539, "grad_norm": 0.08342844992876053, "learning_rate": 9.08484946505221e-06, "loss": 0.4654, "num_input_tokens_seen": 53594528, "step": 44070 }, { "epoch": 5.522490915925323, "grad_norm": 0.08941899240016937, "learning_rate": 9.084534161522534e-06, "loss": 0.4648, "num_input_tokens_seen": 53600448, "step": 44075 }, { "epoch": 5.523117403834106, "grad_norm": 0.1018049493432045, "learning_rate": 9.084218809158815e-06, "loss": 0.4522, "num_input_tokens_seen": 53605728, "step": 44080 }, { "epoch": 5.523743891742889, "grad_norm": 0.0744691789150238, "learning_rate": 9.08390340796483e-06, "loss": 0.4568, "num_input_tokens_seen": 53611840, "step": 44085 }, { "epoch": 5.524370379651673, "grad_norm": 0.039821963757276535, "learning_rate": 9.083587957944343e-06, "loss": 0.4667, "num_input_tokens_seen": 53617824, "step": 44090 }, { "epoch": 5.524996867560456, "grad_norm": 0.07554476708173752, "learning_rate": 9.083272459101129e-06, "loss": 0.4652, "num_input_tokens_seen": 53624224, "step": 44095 }, { "epoch": 5.525623355469239, "grad_norm": 0.07473542541265488, "learning_rate": 9.08295691143896e-06, "loss": 0.4605, "num_input_tokens_seen": 53630464, "step": 44100 }, { "epoch": 5.5262498433780225, "grad_norm": 0.07333460450172424, "learning_rate": 9.08264131496161e-06, "loss": 0.4665, "num_input_tokens_seen": 53636672, "step": 44105 }, { "epoch": 5.526876331286806, "grad_norm": 0.07383845001459122, "learning_rate": 9.082325669672848e-06, "loss": 0.4693, "num_input_tokens_seen": 53642912, "step": 44110 }, { "epoch": 5.52750281919559, "grad_norm": 0.06850463151931763, "learning_rate": 9.082009975576452e-06, "loss": 0.4491, "num_input_tokens_seen": 53649088, "step": 44115 }, { "epoch": 5.528129307104373, "grad_norm": 0.04519134387373924, "learning_rate": 9.081694232676195e-06, "loss": 0.463, "num_input_tokens_seen": 53654592, "step": 44120 }, { "epoch": 5.528755795013156, "grad_norm": 0.07210581749677658, "learning_rate": 9.08137844097585e-06, "loss": 0.4564, "num_input_tokens_seen": 53659904, "step": 44125 }, { "epoch": 5.529382282921939, "grad_norm": 0.06659160554409027, "learning_rate": 9.081062600479196e-06, "loss": 0.4604, "num_input_tokens_seen": 53666304, "step": 44130 }, { "epoch": 5.530008770830723, "grad_norm": 0.06109648570418358, "learning_rate": 9.080746711190008e-06, "loss": 0.4625, "num_input_tokens_seen": 53672224, "step": 44135 }, { "epoch": 5.5306352587395065, "grad_norm": 0.04897281900048256, "learning_rate": 9.08043077311206e-06, "loss": 0.4588, "num_input_tokens_seen": 53678208, "step": 44140 }, { "epoch": 5.53126174664829, "grad_norm": 0.09711424261331558, "learning_rate": 9.080114786249132e-06, "loss": 0.4658, "num_input_tokens_seen": 53684480, "step": 44145 }, { "epoch": 5.531888234557073, "grad_norm": 0.071980781853199, "learning_rate": 9.079798750605002e-06, "loss": 0.4602, "num_input_tokens_seen": 53690560, "step": 44150 }, { "epoch": 5.532514722465857, "grad_norm": 0.08764809370040894, "learning_rate": 9.079482666183448e-06, "loss": 0.4502, "num_input_tokens_seen": 53696256, "step": 44155 }, { "epoch": 5.53314121037464, "grad_norm": 0.17915114760398865, "learning_rate": 9.079166532988248e-06, "loss": 0.4542, "num_input_tokens_seen": 53702560, "step": 44160 }, { "epoch": 5.533767698283423, "grad_norm": 0.09207110852003098, "learning_rate": 9.078850351023183e-06, "loss": 0.459, "num_input_tokens_seen": 53708352, "step": 44165 }, { "epoch": 5.534394186192206, "grad_norm": 0.0889723151922226, "learning_rate": 9.078534120292034e-06, "loss": 0.4615, "num_input_tokens_seen": 53714464, "step": 44170 }, { "epoch": 5.5350206741009895, "grad_norm": 0.07316990196704865, "learning_rate": 9.078217840798579e-06, "loss": 0.4594, "num_input_tokens_seen": 53720832, "step": 44175 }, { "epoch": 5.535647162009774, "grad_norm": 0.04783491790294647, "learning_rate": 9.0779015125466e-06, "loss": 0.4682, "num_input_tokens_seen": 53727456, "step": 44180 }, { "epoch": 5.536273649918557, "grad_norm": 0.042633745819330215, "learning_rate": 9.077585135539883e-06, "loss": 0.4625, "num_input_tokens_seen": 53733024, "step": 44185 }, { "epoch": 5.53690013782734, "grad_norm": 0.07683853060007095, "learning_rate": 9.077268709782205e-06, "loss": 0.4527, "num_input_tokens_seen": 53739264, "step": 44190 }, { "epoch": 5.537526625736123, "grad_norm": 0.08088149130344391, "learning_rate": 9.076952235277352e-06, "loss": 0.4734, "num_input_tokens_seen": 53745280, "step": 44195 }, { "epoch": 5.538153113644906, "grad_norm": 0.07497799396514893, "learning_rate": 9.076635712029109e-06, "loss": 0.4641, "num_input_tokens_seen": 53751584, "step": 44200 }, { "epoch": 5.53877960155369, "grad_norm": 0.1050456091761589, "learning_rate": 9.076319140041257e-06, "loss": 0.4631, "num_input_tokens_seen": 53757568, "step": 44205 }, { "epoch": 5.5394060894624735, "grad_norm": 0.06799279153347015, "learning_rate": 9.076002519317582e-06, "loss": 0.4663, "num_input_tokens_seen": 53764352, "step": 44210 }, { "epoch": 5.540032577371257, "grad_norm": 0.07950951159000397, "learning_rate": 9.07568584986187e-06, "loss": 0.4619, "num_input_tokens_seen": 53770656, "step": 44215 }, { "epoch": 5.54065906528004, "grad_norm": 0.09990763664245605, "learning_rate": 9.075369131677909e-06, "loss": 0.4626, "num_input_tokens_seen": 53776896, "step": 44220 }, { "epoch": 5.541285553188823, "grad_norm": 0.11101999133825302, "learning_rate": 9.075052364769481e-06, "loss": 0.464, "num_input_tokens_seen": 53783008, "step": 44225 }, { "epoch": 5.541912041097607, "grad_norm": 0.07366622984409332, "learning_rate": 9.07473554914038e-06, "loss": 0.4547, "num_input_tokens_seen": 53789120, "step": 44230 }, { "epoch": 5.54253852900639, "grad_norm": 0.08408509194850922, "learning_rate": 9.074418684794386e-06, "loss": 0.4586, "num_input_tokens_seen": 53795104, "step": 44235 }, { "epoch": 5.543165016915173, "grad_norm": 0.06632377952337265, "learning_rate": 9.07410177173529e-06, "loss": 0.467, "num_input_tokens_seen": 53800384, "step": 44240 }, { "epoch": 5.5437915048239566, "grad_norm": 0.11062968522310257, "learning_rate": 9.073784809966886e-06, "loss": 0.4666, "num_input_tokens_seen": 53806464, "step": 44245 }, { "epoch": 5.544417992732741, "grad_norm": 0.07067056745290756, "learning_rate": 9.073467799492957e-06, "loss": 0.4546, "num_input_tokens_seen": 53812736, "step": 44250 }, { "epoch": 5.545044480641524, "grad_norm": 0.04530995339155197, "learning_rate": 9.073150740317295e-06, "loss": 0.4632, "num_input_tokens_seen": 53818784, "step": 44255 }, { "epoch": 5.545670968550307, "grad_norm": 0.08616002649068832, "learning_rate": 9.072833632443693e-06, "loss": 0.4562, "num_input_tokens_seen": 53824960, "step": 44260 }, { "epoch": 5.54629745645909, "grad_norm": 0.11302125453948975, "learning_rate": 9.072516475875941e-06, "loss": 0.4678, "num_input_tokens_seen": 53831072, "step": 44265 }, { "epoch": 5.546923944367874, "grad_norm": 0.07683111727237701, "learning_rate": 9.07219927061783e-06, "loss": 0.4586, "num_input_tokens_seen": 53837408, "step": 44270 }, { "epoch": 5.547550432276657, "grad_norm": 0.07077691704034805, "learning_rate": 9.071882016673152e-06, "loss": 0.4675, "num_input_tokens_seen": 53843488, "step": 44275 }, { "epoch": 5.5481769201854405, "grad_norm": 0.1401125192642212, "learning_rate": 9.071564714045703e-06, "loss": 0.4671, "num_input_tokens_seen": 53849536, "step": 44280 }, { "epoch": 5.548803408094224, "grad_norm": 0.073436439037323, "learning_rate": 9.071247362739274e-06, "loss": 0.4501, "num_input_tokens_seen": 53855328, "step": 44285 }, { "epoch": 5.549429896003007, "grad_norm": 0.06581076234579086, "learning_rate": 9.070929962757657e-06, "loss": 0.4637, "num_input_tokens_seen": 53860928, "step": 44290 }, { "epoch": 5.550056383911791, "grad_norm": 0.11965030431747437, "learning_rate": 9.070612514104654e-06, "loss": 0.4707, "num_input_tokens_seen": 53867040, "step": 44295 }, { "epoch": 5.550682871820574, "grad_norm": 0.07281523197889328, "learning_rate": 9.070295016784053e-06, "loss": 0.4584, "num_input_tokens_seen": 53873120, "step": 44300 }, { "epoch": 5.551309359729357, "grad_norm": 0.06554701179265976, "learning_rate": 9.069977470799654e-06, "loss": 0.4563, "num_input_tokens_seen": 53878496, "step": 44305 }, { "epoch": 5.55193584763814, "grad_norm": 0.10405106097459793, "learning_rate": 9.069659876155253e-06, "loss": 0.4547, "num_input_tokens_seen": 53884672, "step": 44310 }, { "epoch": 5.552562335546924, "grad_norm": 0.07848601043224335, "learning_rate": 9.069342232854646e-06, "loss": 0.4725, "num_input_tokens_seen": 53890944, "step": 44315 }, { "epoch": 5.553188823455708, "grad_norm": 0.08922488987445831, "learning_rate": 9.069024540901631e-06, "loss": 0.4727, "num_input_tokens_seen": 53896960, "step": 44320 }, { "epoch": 5.553815311364491, "grad_norm": 0.11770553886890411, "learning_rate": 9.068706800300007e-06, "loss": 0.4661, "num_input_tokens_seen": 53903296, "step": 44325 }, { "epoch": 5.554441799273274, "grad_norm": 0.09738685935735703, "learning_rate": 9.068389011053571e-06, "loss": 0.4646, "num_input_tokens_seen": 53909408, "step": 44330 }, { "epoch": 5.555068287182057, "grad_norm": 0.07144321501255035, "learning_rate": 9.068071173166126e-06, "loss": 0.4562, "num_input_tokens_seen": 53915424, "step": 44335 }, { "epoch": 5.55569477509084, "grad_norm": 0.09976184368133545, "learning_rate": 9.06775328664147e-06, "loss": 0.4618, "num_input_tokens_seen": 53921600, "step": 44340 }, { "epoch": 5.556321262999624, "grad_norm": 0.11129622161388397, "learning_rate": 9.067435351483402e-06, "loss": 0.4669, "num_input_tokens_seen": 53927936, "step": 44345 }, { "epoch": 5.5569477509084075, "grad_norm": 0.10050270706415176, "learning_rate": 9.067117367695727e-06, "loss": 0.4606, "num_input_tokens_seen": 53933984, "step": 44350 }, { "epoch": 5.557574238817191, "grad_norm": 0.0854216143488884, "learning_rate": 9.06679933528224e-06, "loss": 0.4669, "num_input_tokens_seen": 53940224, "step": 44355 }, { "epoch": 5.558200726725974, "grad_norm": 0.13202761113643646, "learning_rate": 9.066481254246751e-06, "loss": 0.4673, "num_input_tokens_seen": 53946464, "step": 44360 }, { "epoch": 5.558827214634758, "grad_norm": 0.09496865421533585, "learning_rate": 9.06616312459306e-06, "loss": 0.4596, "num_input_tokens_seen": 53952672, "step": 44365 }, { "epoch": 5.559453702543541, "grad_norm": 0.042632538825273514, "learning_rate": 9.065844946324969e-06, "loss": 0.4642, "num_input_tokens_seen": 53958368, "step": 44370 }, { "epoch": 5.560080190452324, "grad_norm": 0.07688406854867935, "learning_rate": 9.065526719446283e-06, "loss": 0.4646, "num_input_tokens_seen": 53964704, "step": 44375 }, { "epoch": 5.560706678361107, "grad_norm": 0.04782078042626381, "learning_rate": 9.065208443960808e-06, "loss": 0.4679, "num_input_tokens_seen": 53971200, "step": 44380 }, { "epoch": 5.561333166269891, "grad_norm": 0.08996353298425674, "learning_rate": 9.064890119872348e-06, "loss": 0.4612, "num_input_tokens_seen": 53977408, "step": 44385 }, { "epoch": 5.561959654178675, "grad_norm": 0.08326848596334457, "learning_rate": 9.064571747184709e-06, "loss": 0.4615, "num_input_tokens_seen": 53983456, "step": 44390 }, { "epoch": 5.562586142087458, "grad_norm": 0.07019137591123581, "learning_rate": 9.064253325901697e-06, "loss": 0.4558, "num_input_tokens_seen": 53989408, "step": 44395 }, { "epoch": 5.563212629996241, "grad_norm": 0.09182474762201309, "learning_rate": 9.063934856027119e-06, "loss": 0.4609, "num_input_tokens_seen": 53995392, "step": 44400 }, { "epoch": 5.563839117905024, "grad_norm": 0.09773498773574829, "learning_rate": 9.063616337564784e-06, "loss": 0.4629, "num_input_tokens_seen": 54001600, "step": 44405 }, { "epoch": 5.564465605813808, "grad_norm": 0.09948378056287766, "learning_rate": 9.063297770518498e-06, "loss": 0.4622, "num_input_tokens_seen": 54007904, "step": 44410 }, { "epoch": 5.565092093722591, "grad_norm": 0.052049506455659866, "learning_rate": 9.062979154892074e-06, "loss": 0.4637, "num_input_tokens_seen": 54013856, "step": 44415 }, { "epoch": 5.565718581631375, "grad_norm": 0.08806353807449341, "learning_rate": 9.062660490689315e-06, "loss": 0.4597, "num_input_tokens_seen": 54019808, "step": 44420 }, { "epoch": 5.566345069540158, "grad_norm": 0.07162439078092575, "learning_rate": 9.062341777914036e-06, "loss": 0.458, "num_input_tokens_seen": 54026176, "step": 44425 }, { "epoch": 5.566971557448941, "grad_norm": 0.043755125254392624, "learning_rate": 9.062023016570043e-06, "loss": 0.4645, "num_input_tokens_seen": 54032192, "step": 44430 }, { "epoch": 5.567598045357725, "grad_norm": 0.07216398417949677, "learning_rate": 9.061704206661153e-06, "loss": 0.4596, "num_input_tokens_seen": 54038432, "step": 44435 }, { "epoch": 5.568224533266508, "grad_norm": 0.080239437520504, "learning_rate": 9.061385348191172e-06, "loss": 0.464, "num_input_tokens_seen": 54044480, "step": 44440 }, { "epoch": 5.568851021175291, "grad_norm": 0.13024674355983734, "learning_rate": 9.061066441163916e-06, "loss": 0.4656, "num_input_tokens_seen": 54050368, "step": 44445 }, { "epoch": 5.5694775090840745, "grad_norm": 0.07791683077812195, "learning_rate": 9.060747485583195e-06, "loss": 0.4687, "num_input_tokens_seen": 54056448, "step": 44450 }, { "epoch": 5.570103996992858, "grad_norm": 0.0755624920129776, "learning_rate": 9.060428481452825e-06, "loss": 0.4645, "num_input_tokens_seen": 54062560, "step": 44455 }, { "epoch": 5.570730484901642, "grad_norm": 0.07956276834011078, "learning_rate": 9.060109428776617e-06, "loss": 0.4693, "num_input_tokens_seen": 54068864, "step": 44460 }, { "epoch": 5.571356972810425, "grad_norm": 0.07545997947454453, "learning_rate": 9.059790327558388e-06, "loss": 0.4621, "num_input_tokens_seen": 54074624, "step": 44465 }, { "epoch": 5.571983460719208, "grad_norm": 0.07655895501375198, "learning_rate": 9.059471177801954e-06, "loss": 0.4645, "num_input_tokens_seen": 54080896, "step": 44470 }, { "epoch": 5.572609948627991, "grad_norm": 0.048172540962696075, "learning_rate": 9.059151979511127e-06, "loss": 0.4687, "num_input_tokens_seen": 54086880, "step": 44475 }, { "epoch": 5.573236436536774, "grad_norm": 0.06276863068342209, "learning_rate": 9.058832732689726e-06, "loss": 0.4576, "num_input_tokens_seen": 54092736, "step": 44480 }, { "epoch": 5.573862924445558, "grad_norm": 0.09877438098192215, "learning_rate": 9.058513437341568e-06, "loss": 0.4637, "num_input_tokens_seen": 54098816, "step": 44485 }, { "epoch": 5.574489412354342, "grad_norm": 0.07193131744861603, "learning_rate": 9.05819409347047e-06, "loss": 0.4621, "num_input_tokens_seen": 54105120, "step": 44490 }, { "epoch": 5.575115900263125, "grad_norm": 0.07220043241977692, "learning_rate": 9.057874701080248e-06, "loss": 0.4628, "num_input_tokens_seen": 54111168, "step": 44495 }, { "epoch": 5.575742388171908, "grad_norm": 0.08483003824949265, "learning_rate": 9.057555260174721e-06, "loss": 0.46, "num_input_tokens_seen": 54116832, "step": 44500 }, { "epoch": 5.576368876080692, "grad_norm": 0.09347781538963318, "learning_rate": 9.057235770757713e-06, "loss": 0.4656, "num_input_tokens_seen": 54122912, "step": 44505 }, { "epoch": 5.576995363989475, "grad_norm": 0.06825671344995499, "learning_rate": 9.056916232833038e-06, "loss": 0.4628, "num_input_tokens_seen": 54128832, "step": 44510 }, { "epoch": 5.577621851898258, "grad_norm": 0.07412629574537277, "learning_rate": 9.05659664640452e-06, "loss": 0.4625, "num_input_tokens_seen": 54135200, "step": 44515 }, { "epoch": 5.5782483398070415, "grad_norm": 0.06465915590524673, "learning_rate": 9.056277011475977e-06, "loss": 0.4615, "num_input_tokens_seen": 54140672, "step": 44520 }, { "epoch": 5.5788748277158255, "grad_norm": 0.04322890564799309, "learning_rate": 9.055957328051232e-06, "loss": 0.4634, "num_input_tokens_seen": 54146464, "step": 44525 }, { "epoch": 5.579501315624609, "grad_norm": 0.06610564887523651, "learning_rate": 9.055637596134109e-06, "loss": 0.4586, "num_input_tokens_seen": 54152448, "step": 44530 }, { "epoch": 5.580127803533392, "grad_norm": 0.08602887392044067, "learning_rate": 9.055317815728427e-06, "loss": 0.4609, "num_input_tokens_seen": 54158464, "step": 44535 }, { "epoch": 5.580754291442175, "grad_norm": 0.06999138742685318, "learning_rate": 9.054997986838013e-06, "loss": 0.463, "num_input_tokens_seen": 54164512, "step": 44540 }, { "epoch": 5.581380779350958, "grad_norm": 0.07320916652679443, "learning_rate": 9.054678109466686e-06, "loss": 0.4674, "num_input_tokens_seen": 54170880, "step": 44545 }, { "epoch": 5.582007267259742, "grad_norm": 0.12982644140720367, "learning_rate": 9.054358183618275e-06, "loss": 0.4691, "num_input_tokens_seen": 54177088, "step": 44550 }, { "epoch": 5.5826337551685254, "grad_norm": 0.07247772067785263, "learning_rate": 9.054038209296602e-06, "loss": 0.4563, "num_input_tokens_seen": 54183136, "step": 44555 }, { "epoch": 5.583260243077309, "grad_norm": 0.07733193784952164, "learning_rate": 9.053718186505493e-06, "loss": 0.4656, "num_input_tokens_seen": 54189344, "step": 44560 }, { "epoch": 5.583886730986092, "grad_norm": 0.09036242216825485, "learning_rate": 9.053398115248777e-06, "loss": 0.4559, "num_input_tokens_seen": 54195200, "step": 44565 }, { "epoch": 5.584513218894875, "grad_norm": 0.08999121189117432, "learning_rate": 9.05307799553028e-06, "loss": 0.4599, "num_input_tokens_seen": 54201344, "step": 44570 }, { "epoch": 5.585139706803659, "grad_norm": 0.10515567660331726, "learning_rate": 9.052757827353824e-06, "loss": 0.4635, "num_input_tokens_seen": 54207616, "step": 44575 }, { "epoch": 5.585766194712442, "grad_norm": 0.0736246183514595, "learning_rate": 9.052437610723243e-06, "loss": 0.4578, "num_input_tokens_seen": 54213440, "step": 44580 }, { "epoch": 5.586392682621225, "grad_norm": 0.12948036193847656, "learning_rate": 9.052117345642364e-06, "loss": 0.4659, "num_input_tokens_seen": 54219616, "step": 44585 }, { "epoch": 5.5870191705300085, "grad_norm": 0.07179588079452515, "learning_rate": 9.051797032115014e-06, "loss": 0.4586, "num_input_tokens_seen": 54225632, "step": 44590 }, { "epoch": 5.587645658438792, "grad_norm": 0.11106899380683899, "learning_rate": 9.051476670145023e-06, "loss": 0.4682, "num_input_tokens_seen": 54232000, "step": 44595 }, { "epoch": 5.588272146347576, "grad_norm": 0.1363030970096588, "learning_rate": 9.051156259736225e-06, "loss": 0.4713, "num_input_tokens_seen": 54238144, "step": 44600 }, { "epoch": 5.588898634256359, "grad_norm": 0.06669657677412033, "learning_rate": 9.050835800892446e-06, "loss": 0.466, "num_input_tokens_seen": 54244544, "step": 44605 }, { "epoch": 5.589525122165142, "grad_norm": 0.07807397097349167, "learning_rate": 9.050515293617519e-06, "loss": 0.4566, "num_input_tokens_seen": 54250400, "step": 44610 }, { "epoch": 5.590151610073925, "grad_norm": 0.1097482293844223, "learning_rate": 9.050194737915277e-06, "loss": 0.4529, "num_input_tokens_seen": 54256704, "step": 44615 }, { "epoch": 5.590778097982709, "grad_norm": 0.06349683552980423, "learning_rate": 9.049874133789552e-06, "loss": 0.461, "num_input_tokens_seen": 54262688, "step": 44620 }, { "epoch": 5.5914045858914925, "grad_norm": 0.05010291188955307, "learning_rate": 9.049553481244175e-06, "loss": 0.4664, "num_input_tokens_seen": 54268640, "step": 44625 }, { "epoch": 5.592031073800276, "grad_norm": 0.037021394819021225, "learning_rate": 9.049232780282982e-06, "loss": 0.462, "num_input_tokens_seen": 54275008, "step": 44630 }, { "epoch": 5.592657561709059, "grad_norm": 0.08100295811891556, "learning_rate": 9.048912030909806e-06, "loss": 0.4628, "num_input_tokens_seen": 54280928, "step": 44635 }, { "epoch": 5.593284049617843, "grad_norm": 0.070297472178936, "learning_rate": 9.048591233128486e-06, "loss": 0.4627, "num_input_tokens_seen": 54287328, "step": 44640 }, { "epoch": 5.593910537526626, "grad_norm": 0.10673186182975769, "learning_rate": 9.048270386942852e-06, "loss": 0.4604, "num_input_tokens_seen": 54293376, "step": 44645 }, { "epoch": 5.594537025435409, "grad_norm": 0.07207780331373215, "learning_rate": 9.04794949235674e-06, "loss": 0.4535, "num_input_tokens_seen": 54299872, "step": 44650 }, { "epoch": 5.595163513344192, "grad_norm": 0.09743288159370422, "learning_rate": 9.04762854937399e-06, "loss": 0.4638, "num_input_tokens_seen": 54306144, "step": 44655 }, { "epoch": 5.5957900012529755, "grad_norm": 0.07183995097875595, "learning_rate": 9.04730755799844e-06, "loss": 0.4637, "num_input_tokens_seen": 54311648, "step": 44660 }, { "epoch": 5.59641648916176, "grad_norm": 0.06782015413045883, "learning_rate": 9.04698651823392e-06, "loss": 0.4687, "num_input_tokens_seen": 54317856, "step": 44665 }, { "epoch": 5.597042977070543, "grad_norm": 0.06703247129917145, "learning_rate": 9.046665430084278e-06, "loss": 0.4568, "num_input_tokens_seen": 54323776, "step": 44670 }, { "epoch": 5.597669464979326, "grad_norm": 0.07105699926614761, "learning_rate": 9.046344293553348e-06, "loss": 0.4647, "num_input_tokens_seen": 54329792, "step": 44675 }, { "epoch": 5.598295952888109, "grad_norm": 0.08477196842432022, "learning_rate": 9.04602310864497e-06, "loss": 0.4623, "num_input_tokens_seen": 54335968, "step": 44680 }, { "epoch": 5.598922440796892, "grad_norm": 0.07589550316333771, "learning_rate": 9.045701875362982e-06, "loss": 0.4633, "num_input_tokens_seen": 54341952, "step": 44685 }, { "epoch": 5.599548928705676, "grad_norm": 0.08319731801748276, "learning_rate": 9.045380593711229e-06, "loss": 0.4575, "num_input_tokens_seen": 54348000, "step": 44690 }, { "epoch": 5.6001754166144595, "grad_norm": 0.10799705982208252, "learning_rate": 9.045059263693547e-06, "loss": 0.4724, "num_input_tokens_seen": 54353600, "step": 44695 }, { "epoch": 5.600801904523243, "grad_norm": 0.09020671993494034, "learning_rate": 9.044737885313784e-06, "loss": 0.4649, "num_input_tokens_seen": 54360224, "step": 44700 }, { "epoch": 5.601428392432026, "grad_norm": 0.10973437130451202, "learning_rate": 9.044416458575777e-06, "loss": 0.4675, "num_input_tokens_seen": 54365984, "step": 44705 }, { "epoch": 5.602054880340809, "grad_norm": 0.07532884925603867, "learning_rate": 9.04409498348337e-06, "loss": 0.4627, "num_input_tokens_seen": 54371584, "step": 44710 }, { "epoch": 5.602681368249593, "grad_norm": 0.08572593331336975, "learning_rate": 9.04377346004041e-06, "loss": 0.4662, "num_input_tokens_seen": 54377504, "step": 44715 }, { "epoch": 5.603307856158376, "grad_norm": 0.07615339010953903, "learning_rate": 9.043451888250736e-06, "loss": 0.4605, "num_input_tokens_seen": 54383744, "step": 44720 }, { "epoch": 5.603934344067159, "grad_norm": 0.07662590593099594, "learning_rate": 9.043130268118197e-06, "loss": 0.4644, "num_input_tokens_seen": 54389568, "step": 44725 }, { "epoch": 5.604560831975943, "grad_norm": 0.0693298801779747, "learning_rate": 9.042808599646636e-06, "loss": 0.4596, "num_input_tokens_seen": 54395616, "step": 44730 }, { "epoch": 5.605187319884726, "grad_norm": 0.07557044178247452, "learning_rate": 9.042486882839898e-06, "loss": 0.4681, "num_input_tokens_seen": 54401152, "step": 44735 }, { "epoch": 5.60581380779351, "grad_norm": 0.10613225400447845, "learning_rate": 9.04216511770183e-06, "loss": 0.4628, "num_input_tokens_seen": 54407136, "step": 44740 }, { "epoch": 5.606440295702293, "grad_norm": 0.08130421489477158, "learning_rate": 9.041843304236282e-06, "loss": 0.468, "num_input_tokens_seen": 54413184, "step": 44745 }, { "epoch": 5.607066783611076, "grad_norm": 0.1212434470653534, "learning_rate": 9.041521442447098e-06, "loss": 0.4607, "num_input_tokens_seen": 54419552, "step": 44750 }, { "epoch": 5.607693271519859, "grad_norm": 0.09063073992729187, "learning_rate": 9.041199532338127e-06, "loss": 0.4612, "num_input_tokens_seen": 54425568, "step": 44755 }, { "epoch": 5.608319759428643, "grad_norm": 0.10293847322463989, "learning_rate": 9.040877573913218e-06, "loss": 0.4542, "num_input_tokens_seen": 54431648, "step": 44760 }, { "epoch": 5.6089462473374265, "grad_norm": 0.07076232880353928, "learning_rate": 9.04055556717622e-06, "loss": 0.4615, "num_input_tokens_seen": 54437984, "step": 44765 }, { "epoch": 5.60957273524621, "grad_norm": 0.061554860323667526, "learning_rate": 9.040233512130984e-06, "loss": 0.4637, "num_input_tokens_seen": 54444256, "step": 44770 }, { "epoch": 5.610199223154993, "grad_norm": 0.10824256390333176, "learning_rate": 9.03991140878136e-06, "loss": 0.4647, "num_input_tokens_seen": 54450400, "step": 44775 }, { "epoch": 5.610825711063777, "grad_norm": 0.1074034795165062, "learning_rate": 9.039589257131197e-06, "loss": 0.4648, "num_input_tokens_seen": 54456640, "step": 44780 }, { "epoch": 5.61145219897256, "grad_norm": 0.10501021891832352, "learning_rate": 9.03926705718435e-06, "loss": 0.464, "num_input_tokens_seen": 54462944, "step": 44785 }, { "epoch": 5.612078686881343, "grad_norm": 0.036218296736478806, "learning_rate": 9.038944808944667e-06, "loss": 0.4627, "num_input_tokens_seen": 54468896, "step": 44790 }, { "epoch": 5.612705174790126, "grad_norm": 0.10193252563476562, "learning_rate": 9.038622512416004e-06, "loss": 0.4663, "num_input_tokens_seen": 54474976, "step": 44795 }, { "epoch": 5.61333166269891, "grad_norm": 0.07025148719549179, "learning_rate": 9.038300167602212e-06, "loss": 0.4653, "num_input_tokens_seen": 54481056, "step": 44800 }, { "epoch": 5.613958150607694, "grad_norm": 0.11680347472429276, "learning_rate": 9.03797777450715e-06, "loss": 0.4615, "num_input_tokens_seen": 54487136, "step": 44805 }, { "epoch": 5.614584638516477, "grad_norm": 0.16253714263439178, "learning_rate": 9.037655333134665e-06, "loss": 0.4646, "num_input_tokens_seen": 54493472, "step": 44810 }, { "epoch": 5.61521112642526, "grad_norm": 0.07538898289203644, "learning_rate": 9.037332843488617e-06, "loss": 0.4602, "num_input_tokens_seen": 54499296, "step": 44815 }, { "epoch": 5.615837614334043, "grad_norm": 0.06901855766773224, "learning_rate": 9.03701030557286e-06, "loss": 0.4565, "num_input_tokens_seen": 54504768, "step": 44820 }, { "epoch": 5.616464102242826, "grad_norm": 0.04072694107890129, "learning_rate": 9.036687719391252e-06, "loss": 0.4604, "num_input_tokens_seen": 54511168, "step": 44825 }, { "epoch": 5.61709059015161, "grad_norm": 0.06939697265625, "learning_rate": 9.03636508494765e-06, "loss": 0.4573, "num_input_tokens_seen": 54516704, "step": 44830 }, { "epoch": 5.6177170780603936, "grad_norm": 0.07680224627256393, "learning_rate": 9.036042402245905e-06, "loss": 0.4676, "num_input_tokens_seen": 54522656, "step": 44835 }, { "epoch": 5.618343565969177, "grad_norm": 0.055985208600759506, "learning_rate": 9.035719671289885e-06, "loss": 0.46, "num_input_tokens_seen": 54528000, "step": 44840 }, { "epoch": 5.61897005387796, "grad_norm": 0.06526105850934982, "learning_rate": 9.035396892083439e-06, "loss": 0.4618, "num_input_tokens_seen": 54534240, "step": 44845 }, { "epoch": 5.619596541786743, "grad_norm": 0.06470030546188354, "learning_rate": 9.035074064630431e-06, "loss": 0.4595, "num_input_tokens_seen": 54540352, "step": 44850 }, { "epoch": 5.620223029695527, "grad_norm": 0.0360109768807888, "learning_rate": 9.034751188934724e-06, "loss": 0.4638, "num_input_tokens_seen": 54546560, "step": 44855 }, { "epoch": 5.62084951760431, "grad_norm": 0.11833292245864868, "learning_rate": 9.034428265000171e-06, "loss": 0.4586, "num_input_tokens_seen": 54552768, "step": 44860 }, { "epoch": 5.6214760055130935, "grad_norm": 0.06609053164720535, "learning_rate": 9.034105292830634e-06, "loss": 0.4639, "num_input_tokens_seen": 54558944, "step": 44865 }, { "epoch": 5.622102493421877, "grad_norm": 0.06656022369861603, "learning_rate": 9.033782272429981e-06, "loss": 0.4585, "num_input_tokens_seen": 54564768, "step": 44870 }, { "epoch": 5.622728981330661, "grad_norm": 0.0988997370004654, "learning_rate": 9.033459203802067e-06, "loss": 0.4625, "num_input_tokens_seen": 54570944, "step": 44875 }, { "epoch": 5.623355469239444, "grad_norm": 0.12005830556154251, "learning_rate": 9.033136086950758e-06, "loss": 0.4672, "num_input_tokens_seen": 54576512, "step": 44880 }, { "epoch": 5.623981957148227, "grad_norm": 0.06997747719287872, "learning_rate": 9.032812921879915e-06, "loss": 0.4608, "num_input_tokens_seen": 54582400, "step": 44885 }, { "epoch": 5.62460844505701, "grad_norm": 0.036794669926166534, "learning_rate": 9.032489708593403e-06, "loss": 0.4537, "num_input_tokens_seen": 54588896, "step": 44890 }, { "epoch": 5.625234932965794, "grad_norm": 0.08171210438013077, "learning_rate": 9.032166447095088e-06, "loss": 0.4639, "num_input_tokens_seen": 54594400, "step": 44895 }, { "epoch": 5.625861420874577, "grad_norm": 0.06995659321546555, "learning_rate": 9.031843137388832e-06, "loss": 0.4595, "num_input_tokens_seen": 54600416, "step": 44900 }, { "epoch": 5.626487908783361, "grad_norm": 0.07195555418729782, "learning_rate": 9.031519779478502e-06, "loss": 0.4618, "num_input_tokens_seen": 54606720, "step": 44905 }, { "epoch": 5.627114396692144, "grad_norm": 0.06279422342777252, "learning_rate": 9.031196373367962e-06, "loss": 0.4616, "num_input_tokens_seen": 54612896, "step": 44910 }, { "epoch": 5.627740884600927, "grad_norm": 0.06679940968751907, "learning_rate": 9.030872919061082e-06, "loss": 0.4642, "num_input_tokens_seen": 54619072, "step": 44915 }, { "epoch": 5.628367372509711, "grad_norm": 0.07338523864746094, "learning_rate": 9.030549416561727e-06, "loss": 0.4609, "num_input_tokens_seen": 54625184, "step": 44920 }, { "epoch": 5.628993860418494, "grad_norm": 0.0725475326180458, "learning_rate": 9.030225865873766e-06, "loss": 0.4741, "num_input_tokens_seen": 54631136, "step": 44925 }, { "epoch": 5.629620348327277, "grad_norm": 0.061365824192762375, "learning_rate": 9.029902267001063e-06, "loss": 0.4632, "num_input_tokens_seen": 54637184, "step": 44930 }, { "epoch": 5.6302468362360605, "grad_norm": 0.06897958368062973, "learning_rate": 9.029578619947492e-06, "loss": 0.4621, "num_input_tokens_seen": 54642720, "step": 44935 }, { "epoch": 5.630873324144844, "grad_norm": 0.10662288218736649, "learning_rate": 9.029254924716922e-06, "loss": 0.458, "num_input_tokens_seen": 54648896, "step": 44940 }, { "epoch": 5.631499812053628, "grad_norm": 0.06603660434484482, "learning_rate": 9.02893118131322e-06, "loss": 0.4585, "num_input_tokens_seen": 54655136, "step": 44945 }, { "epoch": 5.632126299962411, "grad_norm": 0.03709036111831665, "learning_rate": 9.028607389740258e-06, "loss": 0.4688, "num_input_tokens_seen": 54661600, "step": 44950 }, { "epoch": 5.632752787871194, "grad_norm": 0.10004082322120667, "learning_rate": 9.02828355000191e-06, "loss": 0.4611, "num_input_tokens_seen": 54667648, "step": 44955 }, { "epoch": 5.633379275779977, "grad_norm": 0.08162520080804825, "learning_rate": 9.027959662102043e-06, "loss": 0.4635, "num_input_tokens_seen": 54673856, "step": 44960 }, { "epoch": 5.63400576368876, "grad_norm": 0.06157820299267769, "learning_rate": 9.027635726044532e-06, "loss": 0.464, "num_input_tokens_seen": 54679776, "step": 44965 }, { "epoch": 5.634632251597544, "grad_norm": 0.07948625832796097, "learning_rate": 9.027311741833252e-06, "loss": 0.4668, "num_input_tokens_seen": 54685824, "step": 44970 }, { "epoch": 5.635258739506328, "grad_norm": 0.06311807036399841, "learning_rate": 9.026987709472072e-06, "loss": 0.4697, "num_input_tokens_seen": 54691936, "step": 44975 }, { "epoch": 5.635885227415111, "grad_norm": 0.06944578140974045, "learning_rate": 9.026663628964869e-06, "loss": 0.4615, "num_input_tokens_seen": 54697792, "step": 44980 }, { "epoch": 5.636511715323894, "grad_norm": 0.05760667473077774, "learning_rate": 9.026339500315515e-06, "loss": 0.4634, "num_input_tokens_seen": 54704064, "step": 44985 }, { "epoch": 5.637138203232677, "grad_norm": 0.04038053750991821, "learning_rate": 9.02601532352789e-06, "loss": 0.4619, "num_input_tokens_seen": 54710368, "step": 44990 }, { "epoch": 5.637764691141461, "grad_norm": 0.06898169964551926, "learning_rate": 9.025691098605864e-06, "loss": 0.456, "num_input_tokens_seen": 54716768, "step": 44995 }, { "epoch": 5.638391179050244, "grad_norm": 0.06478371471166611, "learning_rate": 9.025366825553319e-06, "loss": 0.4644, "num_input_tokens_seen": 54722784, "step": 45000 }, { "epoch": 5.6390176669590275, "grad_norm": 0.11186086386442184, "learning_rate": 9.02504250437413e-06, "loss": 0.4679, "num_input_tokens_seen": 54728672, "step": 45005 }, { "epoch": 5.639644154867811, "grad_norm": 0.07244838029146194, "learning_rate": 9.024718135072171e-06, "loss": 0.4653, "num_input_tokens_seen": 54734816, "step": 45010 }, { "epoch": 5.640270642776595, "grad_norm": 0.038860518485307693, "learning_rate": 9.024393717651323e-06, "loss": 0.46, "num_input_tokens_seen": 54741088, "step": 45015 }, { "epoch": 5.640897130685378, "grad_norm": 0.06357936561107635, "learning_rate": 9.024069252115466e-06, "loss": 0.4648, "num_input_tokens_seen": 54747136, "step": 45020 }, { "epoch": 5.641523618594161, "grad_norm": 0.10883808135986328, "learning_rate": 9.023744738468477e-06, "loss": 0.4651, "num_input_tokens_seen": 54752896, "step": 45025 }, { "epoch": 5.642150106502944, "grad_norm": 0.08801357448101044, "learning_rate": 9.023420176714238e-06, "loss": 0.4673, "num_input_tokens_seen": 54759072, "step": 45030 }, { "epoch": 5.642776594411728, "grad_norm": 0.06983187049627304, "learning_rate": 9.023095566856625e-06, "loss": 0.4626, "num_input_tokens_seen": 54765408, "step": 45035 }, { "epoch": 5.6434030823205115, "grad_norm": 0.07400371134281158, "learning_rate": 9.022770908899525e-06, "loss": 0.459, "num_input_tokens_seen": 54771680, "step": 45040 }, { "epoch": 5.644029570229295, "grad_norm": 0.10330437868833542, "learning_rate": 9.022446202846816e-06, "loss": 0.4632, "num_input_tokens_seen": 54777856, "step": 45045 }, { "epoch": 5.644656058138078, "grad_norm": 0.06343895941972733, "learning_rate": 9.02212144870238e-06, "loss": 0.4584, "num_input_tokens_seen": 54784096, "step": 45050 }, { "epoch": 5.645282546046861, "grad_norm": 0.06647907942533493, "learning_rate": 9.0217966464701e-06, "loss": 0.4626, "num_input_tokens_seen": 54790240, "step": 45055 }, { "epoch": 5.645909033955645, "grad_norm": 0.03777473792433739, "learning_rate": 9.02147179615386e-06, "loss": 0.4577, "num_input_tokens_seen": 54796320, "step": 45060 }, { "epoch": 5.646535521864428, "grad_norm": 0.09509728103876114, "learning_rate": 9.021146897757546e-06, "loss": 0.4686, "num_input_tokens_seen": 54802016, "step": 45065 }, { "epoch": 5.647162009773211, "grad_norm": 0.07712456583976746, "learning_rate": 9.020821951285038e-06, "loss": 0.4627, "num_input_tokens_seen": 54808096, "step": 45070 }, { "epoch": 5.6477884976819945, "grad_norm": 0.06828542798757553, "learning_rate": 9.020496956740223e-06, "loss": 0.458, "num_input_tokens_seen": 54814048, "step": 45075 }, { "epoch": 5.648414985590778, "grad_norm": 0.07337237894535065, "learning_rate": 9.020171914126987e-06, "loss": 0.4559, "num_input_tokens_seen": 54820288, "step": 45080 }, { "epoch": 5.649041473499562, "grad_norm": 0.08739499747753143, "learning_rate": 9.019846823449216e-06, "loss": 0.4644, "num_input_tokens_seen": 54826944, "step": 45085 }, { "epoch": 5.649667961408345, "grad_norm": 0.07494723796844482, "learning_rate": 9.019521684710797e-06, "loss": 0.4665, "num_input_tokens_seen": 54833120, "step": 45090 }, { "epoch": 5.650294449317128, "grad_norm": 0.06465570628643036, "learning_rate": 9.019196497915617e-06, "loss": 0.4648, "num_input_tokens_seen": 54839328, "step": 45095 }, { "epoch": 5.650920937225911, "grad_norm": 0.058648739010095596, "learning_rate": 9.01887126306756e-06, "loss": 0.4596, "num_input_tokens_seen": 54845408, "step": 45100 }, { "epoch": 5.651547425134694, "grad_norm": 0.11340361833572388, "learning_rate": 9.018545980170523e-06, "loss": 0.4648, "num_input_tokens_seen": 54851584, "step": 45105 }, { "epoch": 5.6521739130434785, "grad_norm": 0.096469447016716, "learning_rate": 9.018220649228388e-06, "loss": 0.467, "num_input_tokens_seen": 54856832, "step": 45110 }, { "epoch": 5.652800400952262, "grad_norm": 0.032460667192935944, "learning_rate": 9.017895270245047e-06, "loss": 0.4575, "num_input_tokens_seen": 54863040, "step": 45115 }, { "epoch": 5.653426888861045, "grad_norm": 0.07585721462965012, "learning_rate": 9.017569843224389e-06, "loss": 0.4655, "num_input_tokens_seen": 54869728, "step": 45120 }, { "epoch": 5.654053376769828, "grad_norm": 0.062367137521505356, "learning_rate": 9.017244368170309e-06, "loss": 0.46, "num_input_tokens_seen": 54874848, "step": 45125 }, { "epoch": 5.654679864678612, "grad_norm": 0.06152758002281189, "learning_rate": 9.01691884508669e-06, "loss": 0.4628, "num_input_tokens_seen": 54881088, "step": 45130 }, { "epoch": 5.655306352587395, "grad_norm": 0.10156139731407166, "learning_rate": 9.016593273977434e-06, "loss": 0.4662, "num_input_tokens_seen": 54887040, "step": 45135 }, { "epoch": 5.655932840496178, "grad_norm": 0.06424254179000854, "learning_rate": 9.016267654846425e-06, "loss": 0.4645, "num_input_tokens_seen": 54892960, "step": 45140 }, { "epoch": 5.656559328404962, "grad_norm": 0.08784531056880951, "learning_rate": 9.015941987697562e-06, "loss": 0.4593, "num_input_tokens_seen": 54899008, "step": 45145 }, { "epoch": 5.657185816313746, "grad_norm": 0.1386013776063919, "learning_rate": 9.015616272534734e-06, "loss": 0.4619, "num_input_tokens_seen": 54905088, "step": 45150 }, { "epoch": 5.657812304222529, "grad_norm": 0.07475396245718002, "learning_rate": 9.015290509361837e-06, "loss": 0.4652, "num_input_tokens_seen": 54911168, "step": 45155 }, { "epoch": 5.658438792131312, "grad_norm": 0.09767734259366989, "learning_rate": 9.014964698182769e-06, "loss": 0.4582, "num_input_tokens_seen": 54917536, "step": 45160 }, { "epoch": 5.659065280040095, "grad_norm": 0.031717974692583084, "learning_rate": 9.01463883900142e-06, "loss": 0.461, "num_input_tokens_seen": 54923456, "step": 45165 }, { "epoch": 5.659691767948878, "grad_norm": 0.0659039169549942, "learning_rate": 9.01431293182169e-06, "loss": 0.4667, "num_input_tokens_seen": 54929600, "step": 45170 }, { "epoch": 5.660318255857662, "grad_norm": 0.04245719313621521, "learning_rate": 9.013986976647473e-06, "loss": 0.4647, "num_input_tokens_seen": 54935264, "step": 45175 }, { "epoch": 5.6609447437664455, "grad_norm": 0.06365737318992615, "learning_rate": 9.013660973482666e-06, "loss": 0.4641, "num_input_tokens_seen": 54941408, "step": 45180 }, { "epoch": 5.661571231675229, "grad_norm": 0.10792107880115509, "learning_rate": 9.013334922331168e-06, "loss": 0.464, "num_input_tokens_seen": 54947520, "step": 45185 }, { "epoch": 5.662197719584012, "grad_norm": 0.036051828414201736, "learning_rate": 9.013008823196878e-06, "loss": 0.4629, "num_input_tokens_seen": 54953696, "step": 45190 }, { "epoch": 5.662824207492795, "grad_norm": 0.06645499914884567, "learning_rate": 9.012682676083692e-06, "loss": 0.458, "num_input_tokens_seen": 54959808, "step": 45195 }, { "epoch": 5.663450695401579, "grad_norm": 0.07809337973594666, "learning_rate": 9.012356480995513e-06, "loss": 0.4609, "num_input_tokens_seen": 54965824, "step": 45200 }, { "epoch": 5.664077183310362, "grad_norm": 0.04291051998734474, "learning_rate": 9.012030237936237e-06, "loss": 0.465, "num_input_tokens_seen": 54972064, "step": 45205 }, { "epoch": 5.664703671219145, "grad_norm": 0.10177602618932724, "learning_rate": 9.011703946909767e-06, "loss": 0.4595, "num_input_tokens_seen": 54978176, "step": 45210 }, { "epoch": 5.665330159127929, "grad_norm": 0.06645342707633972, "learning_rate": 9.011377607920002e-06, "loss": 0.4604, "num_input_tokens_seen": 54984096, "step": 45215 }, { "epoch": 5.665956647036712, "grad_norm": 0.06943151354789734, "learning_rate": 9.011051220970848e-06, "loss": 0.4604, "num_input_tokens_seen": 54990528, "step": 45220 }, { "epoch": 5.666583134945496, "grad_norm": 0.08505576848983765, "learning_rate": 9.010724786066203e-06, "loss": 0.4613, "num_input_tokens_seen": 54996896, "step": 45225 }, { "epoch": 5.667209622854279, "grad_norm": 0.06813901662826538, "learning_rate": 9.01039830320997e-06, "loss": 0.4633, "num_input_tokens_seen": 55002976, "step": 45230 }, { "epoch": 5.667836110763062, "grad_norm": 0.07437179237604141, "learning_rate": 9.010071772406054e-06, "loss": 0.4578, "num_input_tokens_seen": 55009152, "step": 45235 }, { "epoch": 5.668462598671845, "grad_norm": 0.06355500221252441, "learning_rate": 9.009745193658358e-06, "loss": 0.4644, "num_input_tokens_seen": 55015168, "step": 45240 }, { "epoch": 5.669089086580629, "grad_norm": 0.06742547452449799, "learning_rate": 9.00941856697079e-06, "loss": 0.462, "num_input_tokens_seen": 55021760, "step": 45245 }, { "epoch": 5.6697155744894125, "grad_norm": 0.09403747320175171, "learning_rate": 9.00909189234725e-06, "loss": 0.456, "num_input_tokens_seen": 55027808, "step": 45250 }, { "epoch": 5.670342062398196, "grad_norm": 0.07620257884263992, "learning_rate": 9.008765169791646e-06, "loss": 0.4649, "num_input_tokens_seen": 55033792, "step": 45255 }, { "epoch": 5.670968550306979, "grad_norm": 0.07749920338392258, "learning_rate": 9.008438399307885e-06, "loss": 0.46, "num_input_tokens_seen": 55040192, "step": 45260 }, { "epoch": 5.671595038215762, "grad_norm": 0.12106268107891083, "learning_rate": 9.00811158089987e-06, "loss": 0.4525, "num_input_tokens_seen": 55046272, "step": 45265 }, { "epoch": 5.672221526124546, "grad_norm": 0.07238517701625824, "learning_rate": 9.007784714571514e-06, "loss": 0.4605, "num_input_tokens_seen": 55051872, "step": 45270 }, { "epoch": 5.672848014033329, "grad_norm": 0.14567892253398895, "learning_rate": 9.00745780032672e-06, "loss": 0.4654, "num_input_tokens_seen": 55058144, "step": 45275 }, { "epoch": 5.6734745019421124, "grad_norm": 0.08583987504243851, "learning_rate": 9.0071308381694e-06, "loss": 0.4686, "num_input_tokens_seen": 55064512, "step": 45280 }, { "epoch": 5.674100989850896, "grad_norm": 0.06929207593202591, "learning_rate": 9.006803828103461e-06, "loss": 0.4583, "num_input_tokens_seen": 55070624, "step": 45285 }, { "epoch": 5.67472747775968, "grad_norm": 0.07666802406311035, "learning_rate": 9.006476770132815e-06, "loss": 0.4599, "num_input_tokens_seen": 55076704, "step": 45290 }, { "epoch": 5.675353965668463, "grad_norm": 0.06305214017629623, "learning_rate": 9.00614966426137e-06, "loss": 0.4527, "num_input_tokens_seen": 55082688, "step": 45295 }, { "epoch": 5.675980453577246, "grad_norm": 0.09317226707935333, "learning_rate": 9.005822510493037e-06, "loss": 0.4662, "num_input_tokens_seen": 55088832, "step": 45300 }, { "epoch": 5.676606941486029, "grad_norm": 0.03662484139204025, "learning_rate": 9.00549530883173e-06, "loss": 0.4689, "num_input_tokens_seen": 55094784, "step": 45305 }, { "epoch": 5.677233429394812, "grad_norm": 0.06531090289354324, "learning_rate": 9.005168059281356e-06, "loss": 0.4501, "num_input_tokens_seen": 55100704, "step": 45310 }, { "epoch": 5.677859917303596, "grad_norm": 0.08549810200929642, "learning_rate": 9.004840761845832e-06, "loss": 0.4608, "num_input_tokens_seen": 55106240, "step": 45315 }, { "epoch": 5.67848640521238, "grad_norm": 0.06932564079761505, "learning_rate": 9.004513416529069e-06, "loss": 0.4606, "num_input_tokens_seen": 55112416, "step": 45320 }, { "epoch": 5.679112893121163, "grad_norm": 0.06933940201997757, "learning_rate": 9.004186023334983e-06, "loss": 0.4569, "num_input_tokens_seen": 55118432, "step": 45325 }, { "epoch": 5.679739381029946, "grad_norm": 0.04330694302916527, "learning_rate": 9.003858582267484e-06, "loss": 0.4599, "num_input_tokens_seen": 55124256, "step": 45330 }, { "epoch": 5.680365868938729, "grad_norm": 0.07819831371307373, "learning_rate": 9.00353109333049e-06, "loss": 0.4599, "num_input_tokens_seen": 55130432, "step": 45335 }, { "epoch": 5.680992356847513, "grad_norm": 0.10571285337209702, "learning_rate": 9.003203556527918e-06, "loss": 0.4639, "num_input_tokens_seen": 55136544, "step": 45340 }, { "epoch": 5.681618844756296, "grad_norm": 0.06457897275686264, "learning_rate": 9.00287597186368e-06, "loss": 0.4688, "num_input_tokens_seen": 55142784, "step": 45345 }, { "epoch": 5.6822453326650795, "grad_norm": 0.10889022797346115, "learning_rate": 9.002548339341694e-06, "loss": 0.4633, "num_input_tokens_seen": 55148768, "step": 45350 }, { "epoch": 5.682871820573863, "grad_norm": 0.06988794356584549, "learning_rate": 9.00222065896588e-06, "loss": 0.4657, "num_input_tokens_seen": 55154560, "step": 45355 }, { "epoch": 5.683498308482646, "grad_norm": 0.07335236668586731, "learning_rate": 9.00189293074015e-06, "loss": 0.4604, "num_input_tokens_seen": 55160480, "step": 45360 }, { "epoch": 5.68412479639143, "grad_norm": 0.06522754579782486, "learning_rate": 9.001565154668426e-06, "loss": 0.4625, "num_input_tokens_seen": 55166208, "step": 45365 }, { "epoch": 5.684751284300213, "grad_norm": 0.09332598745822906, "learning_rate": 9.001237330754627e-06, "loss": 0.4743, "num_input_tokens_seen": 55172288, "step": 45370 }, { "epoch": 5.685377772208996, "grad_norm": 0.07264808565378189, "learning_rate": 9.000909459002672e-06, "loss": 0.46, "num_input_tokens_seen": 55177568, "step": 45375 }, { "epoch": 5.686004260117779, "grad_norm": 0.06848689168691635, "learning_rate": 9.00058153941648e-06, "loss": 0.4683, "num_input_tokens_seen": 55183232, "step": 45380 }, { "epoch": 5.686630748026563, "grad_norm": 0.06582546979188919, "learning_rate": 9.000253571999973e-06, "loss": 0.4636, "num_input_tokens_seen": 55189312, "step": 45385 }, { "epoch": 5.687257235935347, "grad_norm": 0.04055636003613472, "learning_rate": 8.99992555675707e-06, "loss": 0.4704, "num_input_tokens_seen": 55195360, "step": 45390 }, { "epoch": 5.68788372384413, "grad_norm": 0.10850049555301666, "learning_rate": 8.999597493691694e-06, "loss": 0.4682, "num_input_tokens_seen": 55201440, "step": 45395 }, { "epoch": 5.688510211752913, "grad_norm": 0.09970270842313766, "learning_rate": 8.99926938280777e-06, "loss": 0.4647, "num_input_tokens_seen": 55207808, "step": 45400 }, { "epoch": 5.689136699661697, "grad_norm": 0.0884452685713768, "learning_rate": 8.998941224109214e-06, "loss": 0.4677, "num_input_tokens_seen": 55214112, "step": 45405 }, { "epoch": 5.68976318757048, "grad_norm": 0.06765814870595932, "learning_rate": 8.998613017599957e-06, "loss": 0.4605, "num_input_tokens_seen": 55220288, "step": 45410 }, { "epoch": 5.690389675479263, "grad_norm": 0.06735096126794815, "learning_rate": 8.998284763283918e-06, "loss": 0.4556, "num_input_tokens_seen": 55226688, "step": 45415 }, { "epoch": 5.6910161633880465, "grad_norm": 0.07166359573602676, "learning_rate": 8.997956461165022e-06, "loss": 0.4643, "num_input_tokens_seen": 55232416, "step": 45420 }, { "epoch": 5.69164265129683, "grad_norm": 0.11965569853782654, "learning_rate": 8.997628111247198e-06, "loss": 0.4684, "num_input_tokens_seen": 55238496, "step": 45425 }, { "epoch": 5.692269139205614, "grad_norm": 0.06605938822031021, "learning_rate": 8.997299713534366e-06, "loss": 0.4622, "num_input_tokens_seen": 55244384, "step": 45430 }, { "epoch": 5.692895627114397, "grad_norm": 0.0753014087677002, "learning_rate": 8.996971268030457e-06, "loss": 0.4713, "num_input_tokens_seen": 55250496, "step": 45435 }, { "epoch": 5.69352211502318, "grad_norm": 0.06738628447055817, "learning_rate": 8.996642774739394e-06, "loss": 0.4661, "num_input_tokens_seen": 55256704, "step": 45440 }, { "epoch": 5.694148602931963, "grad_norm": 0.0648358091711998, "learning_rate": 8.99631423366511e-06, "loss": 0.4618, "num_input_tokens_seen": 55262944, "step": 45445 }, { "epoch": 5.694775090840746, "grad_norm": 0.06831637024879456, "learning_rate": 8.995985644811526e-06, "loss": 0.4595, "num_input_tokens_seen": 55268480, "step": 45450 }, { "epoch": 5.6954015787495305, "grad_norm": 0.06989903748035431, "learning_rate": 8.995657008182577e-06, "loss": 0.459, "num_input_tokens_seen": 55274752, "step": 45455 }, { "epoch": 5.696028066658314, "grad_norm": 0.0965464785695076, "learning_rate": 8.995328323782186e-06, "loss": 0.4606, "num_input_tokens_seen": 55280448, "step": 45460 }, { "epoch": 5.696654554567097, "grad_norm": 0.09516491740942001, "learning_rate": 8.994999591614289e-06, "loss": 0.4642, "num_input_tokens_seen": 55286560, "step": 45465 }, { "epoch": 5.69728104247588, "grad_norm": 0.1204390749335289, "learning_rate": 8.99467081168281e-06, "loss": 0.4568, "num_input_tokens_seen": 55293216, "step": 45470 }, { "epoch": 5.697907530384663, "grad_norm": 0.07003939896821976, "learning_rate": 8.994341983991685e-06, "loss": 0.464, "num_input_tokens_seen": 55299392, "step": 45475 }, { "epoch": 5.698534018293447, "grad_norm": 0.06821353733539581, "learning_rate": 8.994013108544844e-06, "loss": 0.4657, "num_input_tokens_seen": 55305600, "step": 45480 }, { "epoch": 5.69916050620223, "grad_norm": 0.04525424912571907, "learning_rate": 8.993684185346216e-06, "loss": 0.465, "num_input_tokens_seen": 55311968, "step": 45485 }, { "epoch": 5.6997869941110135, "grad_norm": 0.06574533879756927, "learning_rate": 8.993355214399736e-06, "loss": 0.4621, "num_input_tokens_seen": 55318208, "step": 45490 }, { "epoch": 5.700413482019797, "grad_norm": 0.07431433349847794, "learning_rate": 8.993026195709337e-06, "loss": 0.4604, "num_input_tokens_seen": 55324192, "step": 45495 }, { "epoch": 5.701039969928581, "grad_norm": 0.08458345383405685, "learning_rate": 8.992697129278954e-06, "loss": 0.4642, "num_input_tokens_seen": 55329952, "step": 45500 }, { "epoch": 5.701666457837364, "grad_norm": 0.11885909736156464, "learning_rate": 8.992368015112521e-06, "loss": 0.4613, "num_input_tokens_seen": 55335936, "step": 45505 }, { "epoch": 5.702292945746147, "grad_norm": 0.06947091221809387, "learning_rate": 8.992038853213969e-06, "loss": 0.4621, "num_input_tokens_seen": 55342016, "step": 45510 }, { "epoch": 5.70291943365493, "grad_norm": 0.06750548630952835, "learning_rate": 8.991709643587236e-06, "loss": 0.4631, "num_input_tokens_seen": 55347872, "step": 45515 }, { "epoch": 5.703545921563714, "grad_norm": 0.06831815093755722, "learning_rate": 8.99138038623626e-06, "loss": 0.4616, "num_input_tokens_seen": 55353952, "step": 45520 }, { "epoch": 5.7041724094724975, "grad_norm": 0.1282500922679901, "learning_rate": 8.991051081164976e-06, "loss": 0.4644, "num_input_tokens_seen": 55360192, "step": 45525 }, { "epoch": 5.704798897381281, "grad_norm": 0.06989701837301254, "learning_rate": 8.99072172837732e-06, "loss": 0.4638, "num_input_tokens_seen": 55366304, "step": 45530 }, { "epoch": 5.705425385290064, "grad_norm": 0.09754055738449097, "learning_rate": 8.99039232787723e-06, "loss": 0.4616, "num_input_tokens_seen": 55371936, "step": 45535 }, { "epoch": 5.706051873198847, "grad_norm": 0.09782335162162781, "learning_rate": 8.990062879668646e-06, "loss": 0.4635, "num_input_tokens_seen": 55378368, "step": 45540 }, { "epoch": 5.706678361107631, "grad_norm": 0.06859195977449417, "learning_rate": 8.989733383755505e-06, "loss": 0.4625, "num_input_tokens_seen": 55384512, "step": 45545 }, { "epoch": 5.707304849016414, "grad_norm": 0.07407759875059128, "learning_rate": 8.989403840141745e-06, "loss": 0.4641, "num_input_tokens_seen": 55390624, "step": 45550 }, { "epoch": 5.707931336925197, "grad_norm": 0.03534530848264694, "learning_rate": 8.989074248831313e-06, "loss": 0.4665, "num_input_tokens_seen": 55396672, "step": 45555 }, { "epoch": 5.7085578248339806, "grad_norm": 0.07291794568300247, "learning_rate": 8.988744609828141e-06, "loss": 0.4623, "num_input_tokens_seen": 55403104, "step": 45560 }, { "epoch": 5.709184312742764, "grad_norm": 0.06589838862419128, "learning_rate": 8.988414923136173e-06, "loss": 0.4653, "num_input_tokens_seen": 55409600, "step": 45565 }, { "epoch": 5.709810800651548, "grad_norm": 0.06826745718717575, "learning_rate": 8.988085188759353e-06, "loss": 0.4603, "num_input_tokens_seen": 55415872, "step": 45570 }, { "epoch": 5.710437288560331, "grad_norm": 0.036397021263837814, "learning_rate": 8.987755406701622e-06, "loss": 0.4658, "num_input_tokens_seen": 55422112, "step": 45575 }, { "epoch": 5.711063776469114, "grad_norm": 0.0663914605975151, "learning_rate": 8.98742557696692e-06, "loss": 0.4627, "num_input_tokens_seen": 55428256, "step": 45580 }, { "epoch": 5.711690264377897, "grad_norm": 0.1088002473115921, "learning_rate": 8.987095699559194e-06, "loss": 0.469, "num_input_tokens_seen": 55433920, "step": 45585 }, { "epoch": 5.7123167522866805, "grad_norm": 0.07362231612205505, "learning_rate": 8.986765774482388e-06, "loss": 0.4615, "num_input_tokens_seen": 55440256, "step": 45590 }, { "epoch": 5.7129432401954645, "grad_norm": 0.08069248497486115, "learning_rate": 8.986435801740442e-06, "loss": 0.4643, "num_input_tokens_seen": 55446368, "step": 45595 }, { "epoch": 5.713569728104248, "grad_norm": 0.0325976200401783, "learning_rate": 8.986105781337308e-06, "loss": 0.459, "num_input_tokens_seen": 55452160, "step": 45600 }, { "epoch": 5.714196216013031, "grad_norm": 0.06641180068254471, "learning_rate": 8.985775713276927e-06, "loss": 0.462, "num_input_tokens_seen": 55457472, "step": 45605 }, { "epoch": 5.714822703921814, "grad_norm": 0.08617808669805527, "learning_rate": 8.985445597563247e-06, "loss": 0.4587, "num_input_tokens_seen": 55463648, "step": 45610 }, { "epoch": 5.715449191830597, "grad_norm": 0.033714376389980316, "learning_rate": 8.985115434200213e-06, "loss": 0.4596, "num_input_tokens_seen": 55469824, "step": 45615 }, { "epoch": 5.716075679739381, "grad_norm": 0.10318483412265778, "learning_rate": 8.984785223191772e-06, "loss": 0.4688, "num_input_tokens_seen": 55475712, "step": 45620 }, { "epoch": 5.716702167648164, "grad_norm": 0.10023517906665802, "learning_rate": 8.984454964541875e-06, "loss": 0.4589, "num_input_tokens_seen": 55481792, "step": 45625 }, { "epoch": 5.717328655556948, "grad_norm": 0.06811931729316711, "learning_rate": 8.98412465825447e-06, "loss": 0.4608, "num_input_tokens_seen": 55488064, "step": 45630 }, { "epoch": 5.717955143465731, "grad_norm": 0.0707450732588768, "learning_rate": 8.983794304333504e-06, "loss": 0.4698, "num_input_tokens_seen": 55494400, "step": 45635 }, { "epoch": 5.718581631374515, "grad_norm": 0.05797736346721649, "learning_rate": 8.983463902782928e-06, "loss": 0.4612, "num_input_tokens_seen": 55500768, "step": 45640 }, { "epoch": 5.719208119283298, "grad_norm": 0.06818876415491104, "learning_rate": 8.983133453606691e-06, "loss": 0.4609, "num_input_tokens_seen": 55506912, "step": 45645 }, { "epoch": 5.719834607192081, "grad_norm": 0.10959256440401077, "learning_rate": 8.982802956808744e-06, "loss": 0.4659, "num_input_tokens_seen": 55512608, "step": 45650 }, { "epoch": 5.720461095100864, "grad_norm": 0.0706465095281601, "learning_rate": 8.982472412393041e-06, "loss": 0.4582, "num_input_tokens_seen": 55518784, "step": 45655 }, { "epoch": 5.721087583009648, "grad_norm": 0.0703839659690857, "learning_rate": 8.982141820363533e-06, "loss": 0.46, "num_input_tokens_seen": 55524736, "step": 45660 }, { "epoch": 5.7217140709184315, "grad_norm": 0.061060916632413864, "learning_rate": 8.981811180724169e-06, "loss": 0.4609, "num_input_tokens_seen": 55530656, "step": 45665 }, { "epoch": 5.722340558827215, "grad_norm": 0.06951078027486801, "learning_rate": 8.981480493478905e-06, "loss": 0.4567, "num_input_tokens_seen": 55537056, "step": 45670 }, { "epoch": 5.722967046735998, "grad_norm": 0.06931226700544357, "learning_rate": 8.981149758631695e-06, "loss": 0.4588, "num_input_tokens_seen": 55543552, "step": 45675 }, { "epoch": 5.723593534644781, "grad_norm": 0.08249454945325851, "learning_rate": 8.980818976186493e-06, "loss": 0.459, "num_input_tokens_seen": 55549376, "step": 45680 }, { "epoch": 5.724220022553565, "grad_norm": 0.03446946665644646, "learning_rate": 8.980488146147253e-06, "loss": 0.465, "num_input_tokens_seen": 55555520, "step": 45685 }, { "epoch": 5.724846510462348, "grad_norm": 0.07026518881320953, "learning_rate": 8.98015726851793e-06, "loss": 0.4567, "num_input_tokens_seen": 55561600, "step": 45690 }, { "epoch": 5.725472998371131, "grad_norm": 0.10031048208475113, "learning_rate": 8.97982634330248e-06, "loss": 0.4614, "num_input_tokens_seen": 55568064, "step": 45695 }, { "epoch": 5.726099486279915, "grad_norm": 0.06798641383647919, "learning_rate": 8.97949537050486e-06, "loss": 0.4703, "num_input_tokens_seen": 55574016, "step": 45700 }, { "epoch": 5.726725974188698, "grad_norm": 0.08393467217683792, "learning_rate": 8.979164350129028e-06, "loss": 0.4644, "num_input_tokens_seen": 55579552, "step": 45705 }, { "epoch": 5.727352462097482, "grad_norm": 0.07000688463449478, "learning_rate": 8.97883328217894e-06, "loss": 0.466, "num_input_tokens_seen": 55585056, "step": 45710 }, { "epoch": 5.727978950006265, "grad_norm": 0.0628928542137146, "learning_rate": 8.978502166658556e-06, "loss": 0.4619, "num_input_tokens_seen": 55590944, "step": 45715 }, { "epoch": 5.728605437915048, "grad_norm": 0.05732002109289169, "learning_rate": 8.978171003571833e-06, "loss": 0.4575, "num_input_tokens_seen": 55597344, "step": 45720 }, { "epoch": 5.729231925823831, "grad_norm": 0.03921790421009064, "learning_rate": 8.977839792922732e-06, "loss": 0.4629, "num_input_tokens_seen": 55603488, "step": 45725 }, { "epoch": 5.7298584137326145, "grad_norm": 0.06755328923463821, "learning_rate": 8.977508534715212e-06, "loss": 0.4676, "num_input_tokens_seen": 55609408, "step": 45730 }, { "epoch": 5.730484901641399, "grad_norm": 0.08681712299585342, "learning_rate": 8.97717722895323e-06, "loss": 0.4732, "num_input_tokens_seen": 55614592, "step": 45735 }, { "epoch": 5.731111389550182, "grad_norm": 0.06111215427517891, "learning_rate": 8.976845875640753e-06, "loss": 0.4637, "num_input_tokens_seen": 55620288, "step": 45740 }, { "epoch": 5.731737877458965, "grad_norm": 0.06489794701337814, "learning_rate": 8.97651447478174e-06, "loss": 0.4643, "num_input_tokens_seen": 55626464, "step": 45745 }, { "epoch": 5.732364365367748, "grad_norm": 0.08902950584888458, "learning_rate": 8.976183026380152e-06, "loss": 0.467, "num_input_tokens_seen": 55632928, "step": 45750 }, { "epoch": 5.732990853276532, "grad_norm": 0.10753551870584488, "learning_rate": 8.975851530439954e-06, "loss": 0.4544, "num_input_tokens_seen": 55639040, "step": 45755 }, { "epoch": 5.733617341185315, "grad_norm": 0.06337917596101761, "learning_rate": 8.975519986965109e-06, "loss": 0.4583, "num_input_tokens_seen": 55644928, "step": 45760 }, { "epoch": 5.7342438290940985, "grad_norm": 0.07366994023323059, "learning_rate": 8.97518839595958e-06, "loss": 0.4684, "num_input_tokens_seen": 55651136, "step": 45765 }, { "epoch": 5.734870317002882, "grad_norm": 0.05916006863117218, "learning_rate": 8.97485675742733e-06, "loss": 0.4591, "num_input_tokens_seen": 55657024, "step": 45770 }, { "epoch": 5.735496804911666, "grad_norm": 0.07710715383291245, "learning_rate": 8.974525071372326e-06, "loss": 0.4632, "num_input_tokens_seen": 55663168, "step": 45775 }, { "epoch": 5.736123292820449, "grad_norm": 0.0722658708691597, "learning_rate": 8.974193337798534e-06, "loss": 0.4618, "num_input_tokens_seen": 55669152, "step": 45780 }, { "epoch": 5.736749780729232, "grad_norm": 0.06437543779611588, "learning_rate": 8.973861556709918e-06, "loss": 0.4643, "num_input_tokens_seen": 55675424, "step": 45785 }, { "epoch": 5.737376268638015, "grad_norm": 0.10412080585956573, "learning_rate": 8.973529728110449e-06, "loss": 0.4623, "num_input_tokens_seen": 55681632, "step": 45790 }, { "epoch": 5.738002756546798, "grad_norm": 0.09611963480710983, "learning_rate": 8.973197852004088e-06, "loss": 0.4587, "num_input_tokens_seen": 55688000, "step": 45795 }, { "epoch": 5.738629244455582, "grad_norm": 0.07183940708637238, "learning_rate": 8.972865928394806e-06, "loss": 0.4643, "num_input_tokens_seen": 55693504, "step": 45800 }, { "epoch": 5.739255732364366, "grad_norm": 0.06236880645155907, "learning_rate": 8.972533957286574e-06, "loss": 0.466, "num_input_tokens_seen": 55700160, "step": 45805 }, { "epoch": 5.739882220273149, "grad_norm": 0.06105763465166092, "learning_rate": 8.972201938683359e-06, "loss": 0.4634, "num_input_tokens_seen": 55706400, "step": 45810 }, { "epoch": 5.740508708181932, "grad_norm": 0.07530267536640167, "learning_rate": 8.971869872589127e-06, "loss": 0.4731, "num_input_tokens_seen": 55712640, "step": 45815 }, { "epoch": 5.741135196090715, "grad_norm": 0.07290580868721008, "learning_rate": 8.971537759007854e-06, "loss": 0.4668, "num_input_tokens_seen": 55718496, "step": 45820 }, { "epoch": 5.741761683999499, "grad_norm": 0.06605688482522964, "learning_rate": 8.971205597943507e-06, "loss": 0.4601, "num_input_tokens_seen": 55724608, "step": 45825 }, { "epoch": 5.742388171908282, "grad_norm": 0.0665493905544281, "learning_rate": 8.970873389400059e-06, "loss": 0.4636, "num_input_tokens_seen": 55730880, "step": 45830 }, { "epoch": 5.7430146598170655, "grad_norm": 0.10072050243616104, "learning_rate": 8.970541133381479e-06, "loss": 0.4673, "num_input_tokens_seen": 55736608, "step": 45835 }, { "epoch": 5.743641147725849, "grad_norm": 0.04497662931680679, "learning_rate": 8.970208829891744e-06, "loss": 0.4647, "num_input_tokens_seen": 55742784, "step": 45840 }, { "epoch": 5.744267635634632, "grad_norm": 0.07089672982692719, "learning_rate": 8.969876478934824e-06, "loss": 0.4622, "num_input_tokens_seen": 55748864, "step": 45845 }, { "epoch": 5.744894123543416, "grad_norm": 0.07120374590158463, "learning_rate": 8.969544080514691e-06, "loss": 0.467, "num_input_tokens_seen": 55754464, "step": 45850 }, { "epoch": 5.745520611452199, "grad_norm": 0.06992007791996002, "learning_rate": 8.969211634635325e-06, "loss": 0.4603, "num_input_tokens_seen": 55760928, "step": 45855 }, { "epoch": 5.746147099360982, "grad_norm": 0.06377150118350983, "learning_rate": 8.968879141300692e-06, "loss": 0.4616, "num_input_tokens_seen": 55767008, "step": 45860 }, { "epoch": 5.746773587269765, "grad_norm": 0.06695101410150528, "learning_rate": 8.968546600514775e-06, "loss": 0.4644, "num_input_tokens_seen": 55773280, "step": 45865 }, { "epoch": 5.747400075178549, "grad_norm": 0.0644240528345108, "learning_rate": 8.968214012281546e-06, "loss": 0.4621, "num_input_tokens_seen": 55779264, "step": 45870 }, { "epoch": 5.748026563087333, "grad_norm": 0.06758458912372589, "learning_rate": 8.967881376604983e-06, "loss": 0.4617, "num_input_tokens_seen": 55785472, "step": 45875 }, { "epoch": 5.748653050996116, "grad_norm": 0.03584088757634163, "learning_rate": 8.96754869348906e-06, "loss": 0.4627, "num_input_tokens_seen": 55791744, "step": 45880 }, { "epoch": 5.749279538904899, "grad_norm": 0.07128768414258957, "learning_rate": 8.96721596293776e-06, "loss": 0.462, "num_input_tokens_seen": 55797888, "step": 45885 }, { "epoch": 5.749906026813682, "grad_norm": 0.09297335147857666, "learning_rate": 8.966883184955056e-06, "loss": 0.4651, "num_input_tokens_seen": 55804224, "step": 45890 }, { "epoch": 5.750532514722466, "grad_norm": 0.07787813246250153, "learning_rate": 8.966550359544928e-06, "loss": 0.4608, "num_input_tokens_seen": 55810464, "step": 45895 }, { "epoch": 5.751159002631249, "grad_norm": 0.07220654934644699, "learning_rate": 8.966217486711354e-06, "loss": 0.4638, "num_input_tokens_seen": 55817024, "step": 45900 }, { "epoch": 5.7517854905400325, "grad_norm": 0.034596074372529984, "learning_rate": 8.965884566458318e-06, "loss": 0.4676, "num_input_tokens_seen": 55823200, "step": 45905 }, { "epoch": 5.752411978448816, "grad_norm": 0.11478590220212936, "learning_rate": 8.965551598789798e-06, "loss": 0.4567, "num_input_tokens_seen": 55829536, "step": 45910 }, { "epoch": 5.7530384663576, "grad_norm": 0.06555792689323425, "learning_rate": 8.965218583709773e-06, "loss": 0.461, "num_input_tokens_seen": 55835392, "step": 45915 }, { "epoch": 5.753664954266383, "grad_norm": 0.03496500477194786, "learning_rate": 8.964885521222226e-06, "loss": 0.4643, "num_input_tokens_seen": 55841920, "step": 45920 }, { "epoch": 5.754291442175166, "grad_norm": 0.09893271327018738, "learning_rate": 8.96455241133114e-06, "loss": 0.4624, "num_input_tokens_seen": 55848032, "step": 45925 }, { "epoch": 5.754917930083949, "grad_norm": 0.09796732664108276, "learning_rate": 8.964219254040495e-06, "loss": 0.4585, "num_input_tokens_seen": 55853856, "step": 45930 }, { "epoch": 5.755544417992732, "grad_norm": 0.06762466579675674, "learning_rate": 8.963886049354278e-06, "loss": 0.4607, "num_input_tokens_seen": 55860288, "step": 45935 }, { "epoch": 5.7561709059015165, "grad_norm": 0.07861808687448502, "learning_rate": 8.96355279727647e-06, "loss": 0.4663, "num_input_tokens_seen": 55866560, "step": 45940 }, { "epoch": 5.7567973938103, "grad_norm": 0.10027609020471573, "learning_rate": 8.963219497811056e-06, "loss": 0.4662, "num_input_tokens_seen": 55872672, "step": 45945 }, { "epoch": 5.757423881719083, "grad_norm": 0.06359218060970306, "learning_rate": 8.96288615096202e-06, "loss": 0.4629, "num_input_tokens_seen": 55878656, "step": 45950 }, { "epoch": 5.758050369627866, "grad_norm": 0.10370555520057678, "learning_rate": 8.96255275673335e-06, "loss": 0.4612, "num_input_tokens_seen": 55884672, "step": 45955 }, { "epoch": 5.758676857536649, "grad_norm": 0.03513815253973007, "learning_rate": 8.962219315129029e-06, "loss": 0.4613, "num_input_tokens_seen": 55890752, "step": 45960 }, { "epoch": 5.759303345445433, "grad_norm": 0.062031228095293045, "learning_rate": 8.961885826153045e-06, "loss": 0.4596, "num_input_tokens_seen": 55896800, "step": 45965 }, { "epoch": 5.759929833354216, "grad_norm": 0.10178865492343903, "learning_rate": 8.961552289809385e-06, "loss": 0.4539, "num_input_tokens_seen": 55902880, "step": 45970 }, { "epoch": 5.7605563212629995, "grad_norm": 0.06575541943311691, "learning_rate": 8.961218706102036e-06, "loss": 0.4598, "num_input_tokens_seen": 55908672, "step": 45975 }, { "epoch": 5.761182809171783, "grad_norm": 0.06379683315753937, "learning_rate": 8.960885075034988e-06, "loss": 0.4639, "num_input_tokens_seen": 55914496, "step": 45980 }, { "epoch": 5.761809297080566, "grad_norm": 0.16958217322826385, "learning_rate": 8.960551396612227e-06, "loss": 0.4558, "num_input_tokens_seen": 55920544, "step": 45985 }, { "epoch": 5.76243578498935, "grad_norm": 0.0799148827791214, "learning_rate": 8.960217670837745e-06, "loss": 0.4678, "num_input_tokens_seen": 55926720, "step": 45990 }, { "epoch": 5.763062272898133, "grad_norm": 0.1087028905749321, "learning_rate": 8.959883897715534e-06, "loss": 0.4644, "num_input_tokens_seen": 55932992, "step": 45995 }, { "epoch": 5.763688760806916, "grad_norm": 0.119559645652771, "learning_rate": 8.959550077249579e-06, "loss": 0.4668, "num_input_tokens_seen": 55939264, "step": 46000 }, { "epoch": 5.7643152487156994, "grad_norm": 0.08563505858182907, "learning_rate": 8.959216209443873e-06, "loss": 0.4617, "num_input_tokens_seen": 55945600, "step": 46005 }, { "epoch": 5.7649417366244835, "grad_norm": 0.128085657954216, "learning_rate": 8.95888229430241e-06, "loss": 0.4555, "num_input_tokens_seen": 55951712, "step": 46010 }, { "epoch": 5.765568224533267, "grad_norm": 0.09968125075101852, "learning_rate": 8.958548331829179e-06, "loss": 0.4621, "num_input_tokens_seen": 55957632, "step": 46015 }, { "epoch": 5.76619471244205, "grad_norm": 0.09167879819869995, "learning_rate": 8.958214322028178e-06, "loss": 0.4616, "num_input_tokens_seen": 55963840, "step": 46020 }, { "epoch": 5.766821200350833, "grad_norm": 0.04598545655608177, "learning_rate": 8.957880264903394e-06, "loss": 0.4637, "num_input_tokens_seen": 55970080, "step": 46025 }, { "epoch": 5.767447688259617, "grad_norm": 0.09556876868009567, "learning_rate": 8.957546160458826e-06, "loss": 0.4611, "num_input_tokens_seen": 55976352, "step": 46030 }, { "epoch": 5.7680741761684, "grad_norm": 0.07693768292665482, "learning_rate": 8.957212008698463e-06, "loss": 0.4555, "num_input_tokens_seen": 55982784, "step": 46035 }, { "epoch": 5.768700664077183, "grad_norm": 0.1025099903345108, "learning_rate": 8.956877809626308e-06, "loss": 0.4602, "num_input_tokens_seen": 55988800, "step": 46040 }, { "epoch": 5.769327151985967, "grad_norm": 0.1176077350974083, "learning_rate": 8.956543563246348e-06, "loss": 0.467, "num_input_tokens_seen": 55994912, "step": 46045 }, { "epoch": 5.76995363989475, "grad_norm": 0.0785713717341423, "learning_rate": 8.956209269562586e-06, "loss": 0.4814, "num_input_tokens_seen": 56001184, "step": 46050 }, { "epoch": 5.770580127803534, "grad_norm": 0.08832349628210068, "learning_rate": 8.955874928579014e-06, "loss": 0.4683, "num_input_tokens_seen": 56007424, "step": 46055 }, { "epoch": 5.771206615712317, "grad_norm": 0.06964465230703354, "learning_rate": 8.955540540299632e-06, "loss": 0.4654, "num_input_tokens_seen": 56013280, "step": 46060 }, { "epoch": 5.7718331036211, "grad_norm": 0.034463878720998764, "learning_rate": 8.955206104728436e-06, "loss": 0.471, "num_input_tokens_seen": 56019808, "step": 46065 }, { "epoch": 5.772459591529883, "grad_norm": 0.03324637562036514, "learning_rate": 8.954871621869428e-06, "loss": 0.4589, "num_input_tokens_seen": 56025856, "step": 46070 }, { "epoch": 5.7730860794386665, "grad_norm": 0.06756674498319626, "learning_rate": 8.954537091726602e-06, "loss": 0.462, "num_input_tokens_seen": 56032064, "step": 46075 }, { "epoch": 5.7737125673474505, "grad_norm": 0.0666554644703865, "learning_rate": 8.954202514303962e-06, "loss": 0.4616, "num_input_tokens_seen": 56038016, "step": 46080 }, { "epoch": 5.774339055256234, "grad_norm": 0.09998791664838791, "learning_rate": 8.953867889605508e-06, "loss": 0.4668, "num_input_tokens_seen": 56044320, "step": 46085 }, { "epoch": 5.774965543165017, "grad_norm": 0.07003379613161087, "learning_rate": 8.953533217635238e-06, "loss": 0.4662, "num_input_tokens_seen": 56050080, "step": 46090 }, { "epoch": 5.7755920310738, "grad_norm": 0.039176519960165024, "learning_rate": 8.953198498397153e-06, "loss": 0.4662, "num_input_tokens_seen": 56056608, "step": 46095 }, { "epoch": 5.776218518982583, "grad_norm": 0.07679473608732224, "learning_rate": 8.952863731895256e-06, "loss": 0.4638, "num_input_tokens_seen": 56062752, "step": 46100 }, { "epoch": 5.776845006891367, "grad_norm": 0.062209922820329666, "learning_rate": 8.952528918133551e-06, "loss": 0.4647, "num_input_tokens_seen": 56068896, "step": 46105 }, { "epoch": 5.77747149480015, "grad_norm": 0.09905258566141129, "learning_rate": 8.952194057116039e-06, "loss": 0.4623, "num_input_tokens_seen": 56074912, "step": 46110 }, { "epoch": 5.778097982708934, "grad_norm": 0.06374768912792206, "learning_rate": 8.951859148846726e-06, "loss": 0.4613, "num_input_tokens_seen": 56081056, "step": 46115 }, { "epoch": 5.778724470617717, "grad_norm": 0.09934729337692261, "learning_rate": 8.951524193329613e-06, "loss": 0.4574, "num_input_tokens_seen": 56086976, "step": 46120 }, { "epoch": 5.779350958526501, "grad_norm": 0.03591505438089371, "learning_rate": 8.951189190568705e-06, "loss": 0.465, "num_input_tokens_seen": 56092832, "step": 46125 }, { "epoch": 5.779977446435284, "grad_norm": 0.06997295469045639, "learning_rate": 8.950854140568011e-06, "loss": 0.4614, "num_input_tokens_seen": 56098976, "step": 46130 }, { "epoch": 5.780603934344067, "grad_norm": 0.07182373106479645, "learning_rate": 8.950519043331531e-06, "loss": 0.4681, "num_input_tokens_seen": 56105312, "step": 46135 }, { "epoch": 5.78123042225285, "grad_norm": 0.06326232850551605, "learning_rate": 8.950183898863276e-06, "loss": 0.4611, "num_input_tokens_seen": 56111488, "step": 46140 }, { "epoch": 5.7818569101616335, "grad_norm": 0.0854605883359909, "learning_rate": 8.949848707167252e-06, "loss": 0.4624, "num_input_tokens_seen": 56116800, "step": 46145 }, { "epoch": 5.782483398070418, "grad_norm": 0.06548181176185608, "learning_rate": 8.949513468247464e-06, "loss": 0.4597, "num_input_tokens_seen": 56123136, "step": 46150 }, { "epoch": 5.783109885979201, "grad_norm": 0.06138930842280388, "learning_rate": 8.949178182107923e-06, "loss": 0.4601, "num_input_tokens_seen": 56129216, "step": 46155 }, { "epoch": 5.783736373887984, "grad_norm": 0.03599514812231064, "learning_rate": 8.948842848752637e-06, "loss": 0.4637, "num_input_tokens_seen": 56135168, "step": 46160 }, { "epoch": 5.784362861796767, "grad_norm": 0.10629643499851227, "learning_rate": 8.948507468185615e-06, "loss": 0.4671, "num_input_tokens_seen": 56141344, "step": 46165 }, { "epoch": 5.784989349705551, "grad_norm": 0.08502693474292755, "learning_rate": 8.948172040410865e-06, "loss": 0.4661, "num_input_tokens_seen": 56147712, "step": 46170 }, { "epoch": 5.785615837614334, "grad_norm": 0.08828169107437134, "learning_rate": 8.9478365654324e-06, "loss": 0.4602, "num_input_tokens_seen": 56153920, "step": 46175 }, { "epoch": 5.7862423255231175, "grad_norm": 0.07854770869016647, "learning_rate": 8.947501043254228e-06, "loss": 0.4583, "num_input_tokens_seen": 56159840, "step": 46180 }, { "epoch": 5.786868813431901, "grad_norm": 0.09628143906593323, "learning_rate": 8.947165473880364e-06, "loss": 0.4571, "num_input_tokens_seen": 56166112, "step": 46185 }, { "epoch": 5.787495301340684, "grad_norm": 0.06475285440683365, "learning_rate": 8.946829857314817e-06, "loss": 0.4678, "num_input_tokens_seen": 56172000, "step": 46190 }, { "epoch": 5.788121789249468, "grad_norm": 0.1054503470659256, "learning_rate": 8.946494193561601e-06, "loss": 0.4634, "num_input_tokens_seen": 56177856, "step": 46195 }, { "epoch": 5.788748277158251, "grad_norm": 0.07148183882236481, "learning_rate": 8.946158482624729e-06, "loss": 0.4625, "num_input_tokens_seen": 56184000, "step": 46200 }, { "epoch": 5.789374765067034, "grad_norm": 0.06494873017072678, "learning_rate": 8.945822724508215e-06, "loss": 0.4598, "num_input_tokens_seen": 56190016, "step": 46205 }, { "epoch": 5.790001252975817, "grad_norm": 0.10180334746837616, "learning_rate": 8.945486919216073e-06, "loss": 0.4614, "num_input_tokens_seen": 56196256, "step": 46210 }, { "epoch": 5.7906277408846005, "grad_norm": 0.09908685088157654, "learning_rate": 8.945151066752318e-06, "loss": 0.4677, "num_input_tokens_seen": 56202528, "step": 46215 }, { "epoch": 5.791254228793385, "grad_norm": 0.10658682882785797, "learning_rate": 8.944815167120963e-06, "loss": 0.4666, "num_input_tokens_seen": 56208960, "step": 46220 }, { "epoch": 5.791880716702168, "grad_norm": 0.07576445490121841, "learning_rate": 8.944479220326029e-06, "loss": 0.4614, "num_input_tokens_seen": 56215072, "step": 46225 }, { "epoch": 5.792507204610951, "grad_norm": 0.07039941102266312, "learning_rate": 8.944143226371528e-06, "loss": 0.4596, "num_input_tokens_seen": 56221312, "step": 46230 }, { "epoch": 5.793133692519734, "grad_norm": 0.10524898022413254, "learning_rate": 8.943807185261479e-06, "loss": 0.4616, "num_input_tokens_seen": 56227520, "step": 46235 }, { "epoch": 5.793760180428517, "grad_norm": 0.07982610166072845, "learning_rate": 8.943471096999898e-06, "loss": 0.4564, "num_input_tokens_seen": 56232992, "step": 46240 }, { "epoch": 5.794386668337301, "grad_norm": 0.07201507687568665, "learning_rate": 8.943134961590806e-06, "loss": 0.4646, "num_input_tokens_seen": 56239296, "step": 46245 }, { "epoch": 5.7950131562460845, "grad_norm": 0.10692499577999115, "learning_rate": 8.942798779038219e-06, "loss": 0.4606, "num_input_tokens_seen": 56245216, "step": 46250 }, { "epoch": 5.795639644154868, "grad_norm": 0.0864296704530716, "learning_rate": 8.942462549346161e-06, "loss": 0.4667, "num_input_tokens_seen": 56251616, "step": 46255 }, { "epoch": 5.796266132063651, "grad_norm": 0.08960466086864471, "learning_rate": 8.942126272518646e-06, "loss": 0.4534, "num_input_tokens_seen": 56257888, "step": 46260 }, { "epoch": 5.796892619972435, "grad_norm": 0.0637638196349144, "learning_rate": 8.941789948559697e-06, "loss": 0.4621, "num_input_tokens_seen": 56263328, "step": 46265 }, { "epoch": 5.797519107881218, "grad_norm": 0.07427973300218582, "learning_rate": 8.941453577473333e-06, "loss": 0.4702, "num_input_tokens_seen": 56269696, "step": 46270 }, { "epoch": 5.798145595790001, "grad_norm": 0.07677806913852692, "learning_rate": 8.941117159263581e-06, "loss": 0.4702, "num_input_tokens_seen": 56275744, "step": 46275 }, { "epoch": 5.798772083698784, "grad_norm": 0.030259672552347183, "learning_rate": 8.940780693934459e-06, "loss": 0.4612, "num_input_tokens_seen": 56281952, "step": 46280 }, { "epoch": 5.799398571607568, "grad_norm": 0.06665560603141785, "learning_rate": 8.94044418148999e-06, "loss": 0.4705, "num_input_tokens_seen": 56288288, "step": 46285 }, { "epoch": 5.800025059516352, "grad_norm": 0.06723009049892426, "learning_rate": 8.9401076219342e-06, "loss": 0.4556, "num_input_tokens_seen": 56294272, "step": 46290 }, { "epoch": 5.800651547425135, "grad_norm": 0.1028030663728714, "learning_rate": 8.939771015271109e-06, "loss": 0.4591, "num_input_tokens_seen": 56300416, "step": 46295 }, { "epoch": 5.801278035333918, "grad_norm": 0.037812240421772, "learning_rate": 8.939434361504742e-06, "loss": 0.4616, "num_input_tokens_seen": 56306016, "step": 46300 }, { "epoch": 5.801904523242701, "grad_norm": 0.07120714336633682, "learning_rate": 8.939097660639126e-06, "loss": 0.4666, "num_input_tokens_seen": 56311840, "step": 46305 }, { "epoch": 5.802531011151485, "grad_norm": 0.10801703482866287, "learning_rate": 8.938760912678287e-06, "loss": 0.4651, "num_input_tokens_seen": 56317760, "step": 46310 }, { "epoch": 5.803157499060268, "grad_norm": 0.037086714059114456, "learning_rate": 8.938424117626247e-06, "loss": 0.4593, "num_input_tokens_seen": 56324064, "step": 46315 }, { "epoch": 5.8037839869690515, "grad_norm": 0.07244841009378433, "learning_rate": 8.938087275487039e-06, "loss": 0.4672, "num_input_tokens_seen": 56329792, "step": 46320 }, { "epoch": 5.804410474877835, "grad_norm": 0.07075225561857224, "learning_rate": 8.937750386264684e-06, "loss": 0.4636, "num_input_tokens_seen": 56336128, "step": 46325 }, { "epoch": 5.805036962786618, "grad_norm": 0.06863117218017578, "learning_rate": 8.937413449963213e-06, "loss": 0.4644, "num_input_tokens_seen": 56342240, "step": 46330 }, { "epoch": 5.805663450695402, "grad_norm": 0.06881657242774963, "learning_rate": 8.937076466586655e-06, "loss": 0.4651, "num_input_tokens_seen": 56348160, "step": 46335 }, { "epoch": 5.806289938604185, "grad_norm": 0.0734512135386467, "learning_rate": 8.936739436139037e-06, "loss": 0.4603, "num_input_tokens_seen": 56354144, "step": 46340 }, { "epoch": 5.806916426512968, "grad_norm": 0.10092543810606003, "learning_rate": 8.936402358624389e-06, "loss": 0.4558, "num_input_tokens_seen": 56360480, "step": 46345 }, { "epoch": 5.807542914421751, "grad_norm": 0.06488744914531708, "learning_rate": 8.93606523404674e-06, "loss": 0.4644, "num_input_tokens_seen": 56366752, "step": 46350 }, { "epoch": 5.808169402330535, "grad_norm": 0.1393299251794815, "learning_rate": 8.935728062410124e-06, "loss": 0.4558, "num_input_tokens_seen": 56372224, "step": 46355 }, { "epoch": 5.808795890239319, "grad_norm": 0.06353747099637985, "learning_rate": 8.93539084371857e-06, "loss": 0.4639, "num_input_tokens_seen": 56378368, "step": 46360 }, { "epoch": 5.809422378148102, "grad_norm": 0.03827199339866638, "learning_rate": 8.935053577976108e-06, "loss": 0.4664, "num_input_tokens_seen": 56384576, "step": 46365 }, { "epoch": 5.810048866056885, "grad_norm": 0.06770751625299454, "learning_rate": 8.934716265186774e-06, "loss": 0.4592, "num_input_tokens_seen": 56390784, "step": 46370 }, { "epoch": 5.810675353965668, "grad_norm": 0.06621091067790985, "learning_rate": 8.934378905354596e-06, "loss": 0.4587, "num_input_tokens_seen": 56396896, "step": 46375 }, { "epoch": 5.811301841874452, "grad_norm": 0.08047088235616684, "learning_rate": 8.934041498483614e-06, "loss": 0.4583, "num_input_tokens_seen": 56403264, "step": 46380 }, { "epoch": 5.811928329783235, "grad_norm": 0.11349780112504959, "learning_rate": 8.933704044577855e-06, "loss": 0.4686, "num_input_tokens_seen": 56409440, "step": 46385 }, { "epoch": 5.8125548176920185, "grad_norm": 0.10771665722131729, "learning_rate": 8.93336654364136e-06, "loss": 0.4524, "num_input_tokens_seen": 56415488, "step": 46390 }, { "epoch": 5.813181305600802, "grad_norm": 0.06873592734336853, "learning_rate": 8.933028995678158e-06, "loss": 0.46, "num_input_tokens_seen": 56421696, "step": 46395 }, { "epoch": 5.813807793509586, "grad_norm": 0.07108829915523529, "learning_rate": 8.93269140069229e-06, "loss": 0.4586, "num_input_tokens_seen": 56427808, "step": 46400 }, { "epoch": 5.814434281418369, "grad_norm": 0.06808963418006897, "learning_rate": 8.932353758687788e-06, "loss": 0.4652, "num_input_tokens_seen": 56433728, "step": 46405 }, { "epoch": 5.815060769327152, "grad_norm": 0.06923115998506546, "learning_rate": 8.932016069668693e-06, "loss": 0.4673, "num_input_tokens_seen": 56440128, "step": 46410 }, { "epoch": 5.815687257235935, "grad_norm": 0.07150465250015259, "learning_rate": 8.931678333639039e-06, "loss": 0.4681, "num_input_tokens_seen": 56446048, "step": 46415 }, { "epoch": 5.816313745144718, "grad_norm": 0.09214959293603897, "learning_rate": 8.931340550602864e-06, "loss": 0.4635, "num_input_tokens_seen": 56452256, "step": 46420 }, { "epoch": 5.8169402330535025, "grad_norm": 0.07497017830610275, "learning_rate": 8.931002720564208e-06, "loss": 0.4663, "num_input_tokens_seen": 56458560, "step": 46425 }, { "epoch": 5.817566720962286, "grad_norm": 0.0675099715590477, "learning_rate": 8.93066484352711e-06, "loss": 0.4617, "num_input_tokens_seen": 56464032, "step": 46430 }, { "epoch": 5.818193208871069, "grad_norm": 0.06991607695817947, "learning_rate": 8.930326919495609e-06, "loss": 0.4631, "num_input_tokens_seen": 56470144, "step": 46435 }, { "epoch": 5.818819696779852, "grad_norm": 0.09402292966842651, "learning_rate": 8.929988948473745e-06, "loss": 0.4673, "num_input_tokens_seen": 56476512, "step": 46440 }, { "epoch": 5.819446184688635, "grad_norm": 0.10713303834199905, "learning_rate": 8.929650930465557e-06, "loss": 0.4693, "num_input_tokens_seen": 56482432, "step": 46445 }, { "epoch": 5.820072672597419, "grad_norm": 0.11795926839113235, "learning_rate": 8.929312865475091e-06, "loss": 0.4669, "num_input_tokens_seen": 56488576, "step": 46450 }, { "epoch": 5.820699160506202, "grad_norm": 0.08376163244247437, "learning_rate": 8.928974753506386e-06, "loss": 0.4649, "num_input_tokens_seen": 56494912, "step": 46455 }, { "epoch": 5.821325648414986, "grad_norm": 0.0788303017616272, "learning_rate": 8.928636594563483e-06, "loss": 0.4625, "num_input_tokens_seen": 56500768, "step": 46460 }, { "epoch": 5.821952136323769, "grad_norm": 0.06450681388378143, "learning_rate": 8.928298388650427e-06, "loss": 0.4637, "num_input_tokens_seen": 56506560, "step": 46465 }, { "epoch": 5.822578624232552, "grad_norm": 0.037350352853536606, "learning_rate": 8.92796013577126e-06, "loss": 0.4668, "num_input_tokens_seen": 56512480, "step": 46470 }, { "epoch": 5.823205112141336, "grad_norm": 0.06301217526197433, "learning_rate": 8.927621835930029e-06, "loss": 0.4604, "num_input_tokens_seen": 56519072, "step": 46475 }, { "epoch": 5.823831600050119, "grad_norm": 0.06871730089187622, "learning_rate": 8.927283489130777e-06, "loss": 0.4708, "num_input_tokens_seen": 56525408, "step": 46480 }, { "epoch": 5.824458087958902, "grad_norm": 0.11963151395320892, "learning_rate": 8.926945095377549e-06, "loss": 0.4591, "num_input_tokens_seen": 56531520, "step": 46485 }, { "epoch": 5.8250845758676855, "grad_norm": 0.06800802052021027, "learning_rate": 8.926606654674389e-06, "loss": 0.4691, "num_input_tokens_seen": 56537664, "step": 46490 }, { "epoch": 5.825711063776469, "grad_norm": 0.03536013141274452, "learning_rate": 8.926268167025346e-06, "loss": 0.4655, "num_input_tokens_seen": 56543808, "step": 46495 }, { "epoch": 5.826337551685253, "grad_norm": 0.0961715430021286, "learning_rate": 8.925929632434466e-06, "loss": 0.4625, "num_input_tokens_seen": 56549824, "step": 46500 }, { "epoch": 5.826964039594036, "grad_norm": 0.0765671357512474, "learning_rate": 8.925591050905798e-06, "loss": 0.466, "num_input_tokens_seen": 56555840, "step": 46505 }, { "epoch": 5.827590527502819, "grad_norm": 0.06539097428321838, "learning_rate": 8.925252422443386e-06, "loss": 0.4623, "num_input_tokens_seen": 56561760, "step": 46510 }, { "epoch": 5.828217015411602, "grad_norm": 0.08902984112501144, "learning_rate": 8.924913747051283e-06, "loss": 0.464, "num_input_tokens_seen": 56567776, "step": 46515 }, { "epoch": 5.828843503320386, "grad_norm": 0.0994240865111351, "learning_rate": 8.924575024733536e-06, "loss": 0.4654, "num_input_tokens_seen": 56573632, "step": 46520 }, { "epoch": 5.829469991229169, "grad_norm": 0.08657261729240417, "learning_rate": 8.924236255494194e-06, "loss": 0.4644, "num_input_tokens_seen": 56580096, "step": 46525 }, { "epoch": 5.830096479137953, "grad_norm": 0.09790217876434326, "learning_rate": 8.923897439337307e-06, "loss": 0.467, "num_input_tokens_seen": 56586112, "step": 46530 }, { "epoch": 5.830722967046736, "grad_norm": 0.05757307633757591, "learning_rate": 8.923558576266931e-06, "loss": 0.4636, "num_input_tokens_seen": 56592064, "step": 46535 }, { "epoch": 5.83134945495552, "grad_norm": 0.06419657170772552, "learning_rate": 8.92321966628711e-06, "loss": 0.4612, "num_input_tokens_seen": 56598080, "step": 46540 }, { "epoch": 5.831975942864303, "grad_norm": 0.035329487174749374, "learning_rate": 8.9228807094019e-06, "loss": 0.4592, "num_input_tokens_seen": 56604352, "step": 46545 }, { "epoch": 5.832602430773086, "grad_norm": 0.07040127366781235, "learning_rate": 8.922541705615356e-06, "loss": 0.458, "num_input_tokens_seen": 56609952, "step": 46550 }, { "epoch": 5.833228918681869, "grad_norm": 0.11391060799360275, "learning_rate": 8.922202654931524e-06, "loss": 0.4563, "num_input_tokens_seen": 56615936, "step": 46555 }, { "epoch": 5.8338554065906525, "grad_norm": 0.08719826489686966, "learning_rate": 8.921863557354465e-06, "loss": 0.4674, "num_input_tokens_seen": 56622304, "step": 46560 }, { "epoch": 5.8344818944994365, "grad_norm": 0.07197075337171555, "learning_rate": 8.921524412888228e-06, "loss": 0.4682, "num_input_tokens_seen": 56627680, "step": 46565 }, { "epoch": 5.83510838240822, "grad_norm": 0.07389863580465317, "learning_rate": 8.921185221536869e-06, "loss": 0.4667, "num_input_tokens_seen": 56633696, "step": 46570 }, { "epoch": 5.835734870317003, "grad_norm": 0.06040637567639351, "learning_rate": 8.920845983304445e-06, "loss": 0.466, "num_input_tokens_seen": 56639936, "step": 46575 }, { "epoch": 5.836361358225786, "grad_norm": 0.06875515729188919, "learning_rate": 8.92050669819501e-06, "loss": 0.4559, "num_input_tokens_seen": 56645920, "step": 46580 }, { "epoch": 5.836987846134569, "grad_norm": 0.1544020026922226, "learning_rate": 8.920167366212622e-06, "loss": 0.4717, "num_input_tokens_seen": 56651808, "step": 46585 }, { "epoch": 5.837614334043353, "grad_norm": 0.06101853400468826, "learning_rate": 8.919827987361337e-06, "loss": 0.4633, "num_input_tokens_seen": 56657696, "step": 46590 }, { "epoch": 5.8382408219521365, "grad_norm": 0.10314591228961945, "learning_rate": 8.919488561645213e-06, "loss": 0.4648, "num_input_tokens_seen": 56663872, "step": 46595 }, { "epoch": 5.83886730986092, "grad_norm": 0.08169955015182495, "learning_rate": 8.919149089068308e-06, "loss": 0.465, "num_input_tokens_seen": 56670016, "step": 46600 }, { "epoch": 5.839493797769703, "grad_norm": 0.114108145236969, "learning_rate": 8.91880956963468e-06, "loss": 0.4592, "num_input_tokens_seen": 56676096, "step": 46605 }, { "epoch": 5.840120285678486, "grad_norm": 0.06440072506666183, "learning_rate": 8.91847000334839e-06, "loss": 0.4647, "num_input_tokens_seen": 56682240, "step": 46610 }, { "epoch": 5.84074677358727, "grad_norm": 0.06777126342058182, "learning_rate": 8.918130390213495e-06, "loss": 0.4629, "num_input_tokens_seen": 56688256, "step": 46615 }, { "epoch": 5.841373261496053, "grad_norm": 0.06565885245800018, "learning_rate": 8.917790730234057e-06, "loss": 0.4666, "num_input_tokens_seen": 56694496, "step": 46620 }, { "epoch": 5.841999749404836, "grad_norm": 0.06294400244951248, "learning_rate": 8.917451023414139e-06, "loss": 0.465, "num_input_tokens_seen": 56700672, "step": 46625 }, { "epoch": 5.8426262373136195, "grad_norm": 0.060718197375535965, "learning_rate": 8.9171112697578e-06, "loss": 0.464, "num_input_tokens_seen": 56706848, "step": 46630 }, { "epoch": 5.843252725222404, "grad_norm": 0.09133438766002655, "learning_rate": 8.9167714692691e-06, "loss": 0.4608, "num_input_tokens_seen": 56712832, "step": 46635 }, { "epoch": 5.843879213131187, "grad_norm": 0.06964386254549026, "learning_rate": 8.916431621952107e-06, "loss": 0.4696, "num_input_tokens_seen": 56718912, "step": 46640 }, { "epoch": 5.84450570103997, "grad_norm": 0.0594882033765316, "learning_rate": 8.916091727810878e-06, "loss": 0.4599, "num_input_tokens_seen": 56725248, "step": 46645 }, { "epoch": 5.845132188948753, "grad_norm": 0.10124329477548599, "learning_rate": 8.915751786849482e-06, "loss": 0.4641, "num_input_tokens_seen": 56730976, "step": 46650 }, { "epoch": 5.845758676857537, "grad_norm": 0.09870216995477676, "learning_rate": 8.915411799071983e-06, "loss": 0.4641, "num_input_tokens_seen": 56736000, "step": 46655 }, { "epoch": 5.84638516476632, "grad_norm": 0.06995901465415955, "learning_rate": 8.915071764482442e-06, "loss": 0.4614, "num_input_tokens_seen": 56742240, "step": 46660 }, { "epoch": 5.8470116526751035, "grad_norm": 0.10664302855730057, "learning_rate": 8.914731683084925e-06, "loss": 0.4664, "num_input_tokens_seen": 56748192, "step": 46665 }, { "epoch": 5.847638140583887, "grad_norm": 0.0662500262260437, "learning_rate": 8.914391554883502e-06, "loss": 0.4651, "num_input_tokens_seen": 56754368, "step": 46670 }, { "epoch": 5.84826462849267, "grad_norm": 0.09373527765274048, "learning_rate": 8.914051379882235e-06, "loss": 0.4633, "num_input_tokens_seen": 56760864, "step": 46675 }, { "epoch": 5.848891116401454, "grad_norm": 0.05928393453359604, "learning_rate": 8.913711158085194e-06, "loss": 0.4583, "num_input_tokens_seen": 56766880, "step": 46680 }, { "epoch": 5.849517604310237, "grad_norm": 0.06749219447374344, "learning_rate": 8.913370889496445e-06, "loss": 0.4631, "num_input_tokens_seen": 56772800, "step": 46685 }, { "epoch": 5.85014409221902, "grad_norm": 0.0647130012512207, "learning_rate": 8.913030574120058e-06, "loss": 0.4631, "num_input_tokens_seen": 56779104, "step": 46690 }, { "epoch": 5.850770580127803, "grad_norm": 0.059866052120923996, "learning_rate": 8.9126902119601e-06, "loss": 0.462, "num_input_tokens_seen": 56785344, "step": 46695 }, { "epoch": 5.8513970680365865, "grad_norm": 0.0872306078672409, "learning_rate": 8.91234980302064e-06, "loss": 0.4623, "num_input_tokens_seen": 56791584, "step": 46700 }, { "epoch": 5.852023555945371, "grad_norm": 0.1003619134426117, "learning_rate": 8.91200934730575e-06, "loss": 0.4605, "num_input_tokens_seen": 56797632, "step": 46705 }, { "epoch": 5.852650043854154, "grad_norm": 0.059074923396110535, "learning_rate": 8.911668844819498e-06, "loss": 0.4615, "num_input_tokens_seen": 56803232, "step": 46710 }, { "epoch": 5.853276531762937, "grad_norm": 0.03720369562506676, "learning_rate": 8.911328295565957e-06, "loss": 0.4568, "num_input_tokens_seen": 56809600, "step": 46715 }, { "epoch": 5.85390301967172, "grad_norm": 0.10354464501142502, "learning_rate": 8.910987699549198e-06, "loss": 0.464, "num_input_tokens_seen": 56815936, "step": 46720 }, { "epoch": 5.854529507580503, "grad_norm": 0.07840327173471451, "learning_rate": 8.910647056773292e-06, "loss": 0.4595, "num_input_tokens_seen": 56822048, "step": 46725 }, { "epoch": 5.855155995489287, "grad_norm": 0.0668315589427948, "learning_rate": 8.910306367242313e-06, "loss": 0.4583, "num_input_tokens_seen": 56828320, "step": 46730 }, { "epoch": 5.8557824833980705, "grad_norm": 0.06905137747526169, "learning_rate": 8.909965630960335e-06, "loss": 0.4607, "num_input_tokens_seen": 56834304, "step": 46735 }, { "epoch": 5.856408971306854, "grad_norm": 0.05957329273223877, "learning_rate": 8.90962484793143e-06, "loss": 0.4683, "num_input_tokens_seen": 56840480, "step": 46740 }, { "epoch": 5.857035459215637, "grad_norm": 0.07153850793838501, "learning_rate": 8.909284018159672e-06, "loss": 0.4595, "num_input_tokens_seen": 56846592, "step": 46745 }, { "epoch": 5.85766194712442, "grad_norm": 0.06417655944824219, "learning_rate": 8.908943141649138e-06, "loss": 0.4622, "num_input_tokens_seen": 56852576, "step": 46750 }, { "epoch": 5.858288435033204, "grad_norm": 0.06867525726556778, "learning_rate": 8.908602218403902e-06, "loss": 0.4692, "num_input_tokens_seen": 56858880, "step": 46755 }, { "epoch": 5.858914922941987, "grad_norm": 0.029183438047766685, "learning_rate": 8.908261248428039e-06, "loss": 0.4552, "num_input_tokens_seen": 56864928, "step": 46760 }, { "epoch": 5.85954141085077, "grad_norm": 0.12889030575752258, "learning_rate": 8.907920231725627e-06, "loss": 0.4688, "num_input_tokens_seen": 56871232, "step": 46765 }, { "epoch": 5.860167898759554, "grad_norm": 0.09646735340356827, "learning_rate": 8.907579168300744e-06, "loss": 0.459, "num_input_tokens_seen": 56877184, "step": 46770 }, { "epoch": 5.860794386668338, "grad_norm": 0.06270449608564377, "learning_rate": 8.907238058157467e-06, "loss": 0.4632, "num_input_tokens_seen": 56883392, "step": 46775 }, { "epoch": 5.861420874577121, "grad_norm": 0.08391972631216049, "learning_rate": 8.906896901299873e-06, "loss": 0.4614, "num_input_tokens_seen": 56889536, "step": 46780 }, { "epoch": 5.862047362485904, "grad_norm": 0.05820953845977783, "learning_rate": 8.906555697732042e-06, "loss": 0.4601, "num_input_tokens_seen": 56895904, "step": 46785 }, { "epoch": 5.862673850394687, "grad_norm": 0.060523539781570435, "learning_rate": 8.906214447458054e-06, "loss": 0.4598, "num_input_tokens_seen": 56902240, "step": 46790 }, { "epoch": 5.863300338303471, "grad_norm": 0.07176004350185394, "learning_rate": 8.905873150481988e-06, "loss": 0.4621, "num_input_tokens_seen": 56907936, "step": 46795 }, { "epoch": 5.863926826212254, "grad_norm": 0.06853032857179642, "learning_rate": 8.905531806807923e-06, "loss": 0.4532, "num_input_tokens_seen": 56914080, "step": 46800 }, { "epoch": 5.8645533141210375, "grad_norm": 0.05919278785586357, "learning_rate": 8.905190416439944e-06, "loss": 0.4689, "num_input_tokens_seen": 56919648, "step": 46805 }, { "epoch": 5.865179802029821, "grad_norm": 0.0942317470908165, "learning_rate": 8.904848979382128e-06, "loss": 0.4666, "num_input_tokens_seen": 56925632, "step": 46810 }, { "epoch": 5.865806289938604, "grad_norm": 0.08135997503995895, "learning_rate": 8.904507495638561e-06, "loss": 0.4592, "num_input_tokens_seen": 56931552, "step": 46815 }, { "epoch": 5.866432777847388, "grad_norm": 0.06424827128648758, "learning_rate": 8.904165965213323e-06, "loss": 0.4572, "num_input_tokens_seen": 56937728, "step": 46820 }, { "epoch": 5.867059265756171, "grad_norm": 0.07163576036691666, "learning_rate": 8.9038243881105e-06, "loss": 0.4623, "num_input_tokens_seen": 56943904, "step": 46825 }, { "epoch": 5.867685753664954, "grad_norm": 0.09984885901212692, "learning_rate": 8.903482764334172e-06, "loss": 0.4641, "num_input_tokens_seen": 56950176, "step": 46830 }, { "epoch": 5.868312241573737, "grad_norm": 0.0603867806494236, "learning_rate": 8.903141093888429e-06, "loss": 0.4616, "num_input_tokens_seen": 56956256, "step": 46835 }, { "epoch": 5.868938729482521, "grad_norm": 0.06898272037506104, "learning_rate": 8.902799376777351e-06, "loss": 0.4607, "num_input_tokens_seen": 56962304, "step": 46840 }, { "epoch": 5.869565217391305, "grad_norm": 0.06387917697429657, "learning_rate": 8.902457613005024e-06, "loss": 0.4631, "num_input_tokens_seen": 56968256, "step": 46845 }, { "epoch": 5.870191705300088, "grad_norm": 0.09227103739976883, "learning_rate": 8.902115802575535e-06, "loss": 0.46, "num_input_tokens_seen": 56974528, "step": 46850 }, { "epoch": 5.870818193208871, "grad_norm": 0.06421466916799545, "learning_rate": 8.901773945492973e-06, "loss": 0.4567, "num_input_tokens_seen": 56980544, "step": 46855 }, { "epoch": 5.871444681117654, "grad_norm": 0.07469820976257324, "learning_rate": 8.901432041761421e-06, "loss": 0.4549, "num_input_tokens_seen": 56986976, "step": 46860 }, { "epoch": 5.872071169026437, "grad_norm": 0.06797001510858536, "learning_rate": 8.901090091384971e-06, "loss": 0.459, "num_input_tokens_seen": 56993120, "step": 46865 }, { "epoch": 5.872697656935221, "grad_norm": 0.06197543069720268, "learning_rate": 8.900748094367708e-06, "loss": 0.4583, "num_input_tokens_seen": 56999136, "step": 46870 }, { "epoch": 5.873324144844005, "grad_norm": 0.09133341163396835, "learning_rate": 8.900406050713723e-06, "loss": 0.4588, "num_input_tokens_seen": 57005312, "step": 46875 }, { "epoch": 5.873950632752788, "grad_norm": 0.1166997104883194, "learning_rate": 8.900063960427103e-06, "loss": 0.4703, "num_input_tokens_seen": 57011584, "step": 46880 }, { "epoch": 5.874577120661571, "grad_norm": 0.07903055101633072, "learning_rate": 8.89972182351194e-06, "loss": 0.4714, "num_input_tokens_seen": 57017984, "step": 46885 }, { "epoch": 5.875203608570355, "grad_norm": 0.06269285082817078, "learning_rate": 8.899379639972324e-06, "loss": 0.4636, "num_input_tokens_seen": 57024256, "step": 46890 }, { "epoch": 5.875830096479138, "grad_norm": 0.12827670574188232, "learning_rate": 8.899037409812345e-06, "loss": 0.4656, "num_input_tokens_seen": 57030336, "step": 46895 }, { "epoch": 5.876456584387921, "grad_norm": 0.0807652696967125, "learning_rate": 8.898695133036096e-06, "loss": 0.4584, "num_input_tokens_seen": 57036384, "step": 46900 }, { "epoch": 5.8770830722967045, "grad_norm": 0.1531800478696823, "learning_rate": 8.898352809647667e-06, "loss": 0.4567, "num_input_tokens_seen": 57042528, "step": 46905 }, { "epoch": 5.8777095602054885, "grad_norm": 0.05754811689257622, "learning_rate": 8.898010439651156e-06, "loss": 0.465, "num_input_tokens_seen": 57048736, "step": 46910 }, { "epoch": 5.878336048114272, "grad_norm": 0.07444611191749573, "learning_rate": 8.897668023050652e-06, "loss": 0.4635, "num_input_tokens_seen": 57054976, "step": 46915 }, { "epoch": 5.878962536023055, "grad_norm": 0.07951957732439041, "learning_rate": 8.897325559850249e-06, "loss": 0.4636, "num_input_tokens_seen": 57060896, "step": 46920 }, { "epoch": 5.879589023931838, "grad_norm": 0.033096615225076675, "learning_rate": 8.896983050054041e-06, "loss": 0.4479, "num_input_tokens_seen": 57066752, "step": 46925 }, { "epoch": 5.880215511840621, "grad_norm": 0.04112498462200165, "learning_rate": 8.896640493666128e-06, "loss": 0.4611, "num_input_tokens_seen": 57072992, "step": 46930 }, { "epoch": 5.880841999749405, "grad_norm": 0.09131143242120743, "learning_rate": 8.896297890690599e-06, "loss": 0.4611, "num_input_tokens_seen": 57079360, "step": 46935 }, { "epoch": 5.881468487658188, "grad_norm": 0.07941807806491852, "learning_rate": 8.895955241131553e-06, "loss": 0.4836, "num_input_tokens_seen": 57085504, "step": 46940 }, { "epoch": 5.882094975566972, "grad_norm": 0.03694953769445419, "learning_rate": 8.895612544993088e-06, "loss": 0.4646, "num_input_tokens_seen": 57091232, "step": 46945 }, { "epoch": 5.882721463475755, "grad_norm": 0.07363449037075043, "learning_rate": 8.895269802279299e-06, "loss": 0.4568, "num_input_tokens_seen": 57097184, "step": 46950 }, { "epoch": 5.883347951384538, "grad_norm": 0.0866314247250557, "learning_rate": 8.894927012994284e-06, "loss": 0.4606, "num_input_tokens_seen": 57103744, "step": 46955 }, { "epoch": 5.883974439293322, "grad_norm": 0.034158654510974884, "learning_rate": 8.894584177142142e-06, "loss": 0.4614, "num_input_tokens_seen": 57109696, "step": 46960 }, { "epoch": 5.884600927202105, "grad_norm": 0.08531907945871353, "learning_rate": 8.894241294726973e-06, "loss": 0.466, "num_input_tokens_seen": 57115264, "step": 46965 }, { "epoch": 5.885227415110888, "grad_norm": 0.06560274958610535, "learning_rate": 8.893898365752873e-06, "loss": 0.4607, "num_input_tokens_seen": 57121408, "step": 46970 }, { "epoch": 5.8858539030196715, "grad_norm": 0.06017215549945831, "learning_rate": 8.893555390223946e-06, "loss": 0.4545, "num_input_tokens_seen": 57127552, "step": 46975 }, { "epoch": 5.886480390928455, "grad_norm": 0.07234634459018707, "learning_rate": 8.893212368144291e-06, "loss": 0.4623, "num_input_tokens_seen": 57133728, "step": 46980 }, { "epoch": 5.887106878837239, "grad_norm": 0.05913514643907547, "learning_rate": 8.892869299518007e-06, "loss": 0.4598, "num_input_tokens_seen": 57139744, "step": 46985 }, { "epoch": 5.887733366746022, "grad_norm": 0.06254497170448303, "learning_rate": 8.892526184349199e-06, "loss": 0.4592, "num_input_tokens_seen": 57145984, "step": 46990 }, { "epoch": 5.888359854654805, "grad_norm": 0.17923857271671295, "learning_rate": 8.892183022641968e-06, "loss": 0.468, "num_input_tokens_seen": 57152000, "step": 46995 }, { "epoch": 5.888986342563588, "grad_norm": 0.058191247284412384, "learning_rate": 8.891839814400415e-06, "loss": 0.4607, "num_input_tokens_seen": 57158080, "step": 47000 }, { "epoch": 5.889612830472371, "grad_norm": 0.07559030503034592, "learning_rate": 8.891496559628648e-06, "loss": 0.4643, "num_input_tokens_seen": 57164192, "step": 47005 }, { "epoch": 5.890239318381155, "grad_norm": 0.05566263198852539, "learning_rate": 8.891153258330765e-06, "loss": 0.4666, "num_input_tokens_seen": 57170592, "step": 47010 }, { "epoch": 5.890865806289939, "grad_norm": 0.07131834328174591, "learning_rate": 8.890809910510875e-06, "loss": 0.4664, "num_input_tokens_seen": 57176896, "step": 47015 }, { "epoch": 5.891492294198722, "grad_norm": 0.06542465835809708, "learning_rate": 8.89046651617308e-06, "loss": 0.4591, "num_input_tokens_seen": 57183200, "step": 47020 }, { "epoch": 5.892118782107505, "grad_norm": 0.034752655774354935, "learning_rate": 8.890123075321488e-06, "loss": 0.4577, "num_input_tokens_seen": 57189152, "step": 47025 }, { "epoch": 5.892745270016289, "grad_norm": 0.10686186701059341, "learning_rate": 8.889779587960204e-06, "loss": 0.464, "num_input_tokens_seen": 57195296, "step": 47030 }, { "epoch": 5.893371757925072, "grad_norm": 0.12705327570438385, "learning_rate": 8.889436054093333e-06, "loss": 0.4686, "num_input_tokens_seen": 57201472, "step": 47035 }, { "epoch": 5.893998245833855, "grad_norm": 0.08613049983978271, "learning_rate": 8.889092473724984e-06, "loss": 0.4641, "num_input_tokens_seen": 57207648, "step": 47040 }, { "epoch": 5.8946247337426385, "grad_norm": 0.07867567986249924, "learning_rate": 8.888748846859268e-06, "loss": 0.4595, "num_input_tokens_seen": 57213440, "step": 47045 }, { "epoch": 5.895251221651423, "grad_norm": 0.08444046974182129, "learning_rate": 8.888405173500288e-06, "loss": 0.4649, "num_input_tokens_seen": 57219488, "step": 47050 }, { "epoch": 5.895877709560206, "grad_norm": 0.09860856086015701, "learning_rate": 8.888061453652153e-06, "loss": 0.464, "num_input_tokens_seen": 57225632, "step": 47055 }, { "epoch": 5.896504197468989, "grad_norm": 0.06842406094074249, "learning_rate": 8.887717687318977e-06, "loss": 0.466, "num_input_tokens_seen": 57231616, "step": 47060 }, { "epoch": 5.897130685377772, "grad_norm": 0.06316473335027695, "learning_rate": 8.887373874504867e-06, "loss": 0.4562, "num_input_tokens_seen": 57237920, "step": 47065 }, { "epoch": 5.897757173286555, "grad_norm": 0.06205778196454048, "learning_rate": 8.887030015213933e-06, "loss": 0.464, "num_input_tokens_seen": 57243872, "step": 47070 }, { "epoch": 5.898383661195339, "grad_norm": 0.06895249336957932, "learning_rate": 8.886686109450288e-06, "loss": 0.4653, "num_input_tokens_seen": 57249920, "step": 47075 }, { "epoch": 5.8990101491041225, "grad_norm": 0.06452882289886475, "learning_rate": 8.886342157218043e-06, "loss": 0.4581, "num_input_tokens_seen": 57256384, "step": 47080 }, { "epoch": 5.899636637012906, "grad_norm": 0.06122208014130592, "learning_rate": 8.88599815852131e-06, "loss": 0.455, "num_input_tokens_seen": 57262560, "step": 47085 }, { "epoch": 5.900263124921689, "grad_norm": 0.08227478712797165, "learning_rate": 8.8856541133642e-06, "loss": 0.4611, "num_input_tokens_seen": 57268576, "step": 47090 }, { "epoch": 5.900889612830472, "grad_norm": 0.06257396191358566, "learning_rate": 8.88531002175083e-06, "loss": 0.4598, "num_input_tokens_seen": 57274528, "step": 47095 }, { "epoch": 5.901516100739256, "grad_norm": 0.07191739976406097, "learning_rate": 8.884965883685313e-06, "loss": 0.4667, "num_input_tokens_seen": 57280480, "step": 47100 }, { "epoch": 5.902142588648039, "grad_norm": 0.06507153064012527, "learning_rate": 8.884621699171761e-06, "loss": 0.4571, "num_input_tokens_seen": 57286688, "step": 47105 }, { "epoch": 5.902769076556822, "grad_norm": 0.06266071647405624, "learning_rate": 8.884277468214292e-06, "loss": 0.4611, "num_input_tokens_seen": 57292608, "step": 47110 }, { "epoch": 5.9033955644656055, "grad_norm": 0.07642984390258789, "learning_rate": 8.883933190817018e-06, "loss": 0.4699, "num_input_tokens_seen": 57298528, "step": 47115 }, { "epoch": 5.904022052374389, "grad_norm": 0.06482933461666107, "learning_rate": 8.883588866984057e-06, "loss": 0.4575, "num_input_tokens_seen": 57304704, "step": 47120 }, { "epoch": 5.904648540283173, "grad_norm": 0.07969187200069427, "learning_rate": 8.883244496719528e-06, "loss": 0.4683, "num_input_tokens_seen": 57310656, "step": 47125 }, { "epoch": 5.905275028191956, "grad_norm": 0.03819867596030235, "learning_rate": 8.882900080027545e-06, "loss": 0.4673, "num_input_tokens_seen": 57316800, "step": 47130 }, { "epoch": 5.905901516100739, "grad_norm": 0.063100665807724, "learning_rate": 8.882555616912228e-06, "loss": 0.4591, "num_input_tokens_seen": 57322976, "step": 47135 }, { "epoch": 5.906528004009522, "grad_norm": 0.05911023169755936, "learning_rate": 8.882211107377694e-06, "loss": 0.462, "num_input_tokens_seen": 57329056, "step": 47140 }, { "epoch": 5.907154491918306, "grad_norm": 0.05981384962797165, "learning_rate": 8.881866551428062e-06, "loss": 0.4584, "num_input_tokens_seen": 57335296, "step": 47145 }, { "epoch": 5.9077809798270895, "grad_norm": 0.06563994288444519, "learning_rate": 8.881521949067451e-06, "loss": 0.461, "num_input_tokens_seen": 57341344, "step": 47150 }, { "epoch": 5.908407467735873, "grad_norm": 0.06835098564624786, "learning_rate": 8.881177300299983e-06, "loss": 0.4569, "num_input_tokens_seen": 57347424, "step": 47155 }, { "epoch": 5.909033955644656, "grad_norm": 0.07364562153816223, "learning_rate": 8.880832605129776e-06, "loss": 0.467, "num_input_tokens_seen": 57353216, "step": 47160 }, { "epoch": 5.90966044355344, "grad_norm": 0.06089867278933525, "learning_rate": 8.880487863560954e-06, "loss": 0.461, "num_input_tokens_seen": 57359488, "step": 47165 }, { "epoch": 5.910286931462223, "grad_norm": 0.13275282084941864, "learning_rate": 8.880143075597636e-06, "loss": 0.465, "num_input_tokens_seen": 57365664, "step": 47170 }, { "epoch": 5.910913419371006, "grad_norm": 0.08122096955776215, "learning_rate": 8.879798241243944e-06, "loss": 0.473, "num_input_tokens_seen": 57371936, "step": 47175 }, { "epoch": 5.911539907279789, "grad_norm": 0.0853215903043747, "learning_rate": 8.879453360504004e-06, "loss": 0.4593, "num_input_tokens_seen": 57377856, "step": 47180 }, { "epoch": 5.912166395188573, "grad_norm": 0.07945911586284637, "learning_rate": 8.879108433381937e-06, "loss": 0.4675, "num_input_tokens_seen": 57383808, "step": 47185 }, { "epoch": 5.912792883097357, "grad_norm": 0.07243315875530243, "learning_rate": 8.878763459881867e-06, "loss": 0.4659, "num_input_tokens_seen": 57390112, "step": 47190 }, { "epoch": 5.91341937100614, "grad_norm": 0.08395412564277649, "learning_rate": 8.87841844000792e-06, "loss": 0.4644, "num_input_tokens_seen": 57395936, "step": 47195 }, { "epoch": 5.914045858914923, "grad_norm": 0.032882850617170334, "learning_rate": 8.878073373764218e-06, "loss": 0.4664, "num_input_tokens_seen": 57402016, "step": 47200 }, { "epoch": 5.914672346823706, "grad_norm": 0.032973628491163254, "learning_rate": 8.877728261154889e-06, "loss": 0.4616, "num_input_tokens_seen": 57408320, "step": 47205 }, { "epoch": 5.915298834732489, "grad_norm": 0.07536318898200989, "learning_rate": 8.877383102184058e-06, "loss": 0.4653, "num_input_tokens_seen": 57414144, "step": 47210 }, { "epoch": 5.915925322641273, "grad_norm": 0.0890338197350502, "learning_rate": 8.877037896855853e-06, "loss": 0.4599, "num_input_tokens_seen": 57420096, "step": 47215 }, { "epoch": 5.9165518105500565, "grad_norm": 0.06054406985640526, "learning_rate": 8.8766926451744e-06, "loss": 0.4661, "num_input_tokens_seen": 57425888, "step": 47220 }, { "epoch": 5.91717829845884, "grad_norm": 0.07135029137134552, "learning_rate": 8.876347347143826e-06, "loss": 0.461, "num_input_tokens_seen": 57431488, "step": 47225 }, { "epoch": 5.917804786367623, "grad_norm": 0.11121353507041931, "learning_rate": 8.876002002768261e-06, "loss": 0.4651, "num_input_tokens_seen": 57437440, "step": 47230 }, { "epoch": 5.918431274276406, "grad_norm": 0.09716708958148956, "learning_rate": 8.875656612051835e-06, "loss": 0.462, "num_input_tokens_seen": 57443680, "step": 47235 }, { "epoch": 5.91905776218519, "grad_norm": 0.06769982725381851, "learning_rate": 8.875311174998673e-06, "loss": 0.4599, "num_input_tokens_seen": 57450048, "step": 47240 }, { "epoch": 5.919684250093973, "grad_norm": 0.10365073382854462, "learning_rate": 8.87496569161291e-06, "loss": 0.4606, "num_input_tokens_seen": 57456064, "step": 47245 }, { "epoch": 5.920310738002756, "grad_norm": 0.09438716620206833, "learning_rate": 8.874620161898673e-06, "loss": 0.4538, "num_input_tokens_seen": 57462432, "step": 47250 }, { "epoch": 5.92093722591154, "grad_norm": 0.07396793365478516, "learning_rate": 8.874274585860094e-06, "loss": 0.4668, "num_input_tokens_seen": 57468352, "step": 47255 }, { "epoch": 5.921563713820324, "grad_norm": 0.04171639308333397, "learning_rate": 8.873928963501305e-06, "loss": 0.4652, "num_input_tokens_seen": 57474048, "step": 47260 }, { "epoch": 5.922190201729107, "grad_norm": 0.03324909135699272, "learning_rate": 8.87358329482644e-06, "loss": 0.4621, "num_input_tokens_seen": 57480480, "step": 47265 }, { "epoch": 5.92281668963789, "grad_norm": 0.08218688517808914, "learning_rate": 8.873237579839629e-06, "loss": 0.471, "num_input_tokens_seen": 57486624, "step": 47270 }, { "epoch": 5.923443177546673, "grad_norm": 0.10953272134065628, "learning_rate": 8.872891818545006e-06, "loss": 0.4631, "num_input_tokens_seen": 57492544, "step": 47275 }, { "epoch": 5.924069665455456, "grad_norm": 0.09921009093523026, "learning_rate": 8.872546010946706e-06, "loss": 0.4594, "num_input_tokens_seen": 57498656, "step": 47280 }, { "epoch": 5.92469615336424, "grad_norm": 0.07209860533475876, "learning_rate": 8.872200157048861e-06, "loss": 0.4595, "num_input_tokens_seen": 57504928, "step": 47285 }, { "epoch": 5.9253226412730235, "grad_norm": 0.08325901627540588, "learning_rate": 8.87185425685561e-06, "loss": 0.462, "num_input_tokens_seen": 57510560, "step": 47290 }, { "epoch": 5.925949129181807, "grad_norm": 0.11914325505495071, "learning_rate": 8.871508310371085e-06, "loss": 0.4667, "num_input_tokens_seen": 57516608, "step": 47295 }, { "epoch": 5.92657561709059, "grad_norm": 0.17561382055282593, "learning_rate": 8.871162317599422e-06, "loss": 0.4684, "num_input_tokens_seen": 57522592, "step": 47300 }, { "epoch": 5.927202104999374, "grad_norm": 0.08711223304271698, "learning_rate": 8.870816278544759e-06, "loss": 0.4674, "num_input_tokens_seen": 57528768, "step": 47305 }, { "epoch": 5.927828592908157, "grad_norm": 0.11054377257823944, "learning_rate": 8.870470193211233e-06, "loss": 0.4634, "num_input_tokens_seen": 57534368, "step": 47310 }, { "epoch": 5.92845508081694, "grad_norm": 0.10157009959220886, "learning_rate": 8.870124061602982e-06, "loss": 0.4624, "num_input_tokens_seen": 57540416, "step": 47315 }, { "epoch": 5.9290815687257235, "grad_norm": 0.06636180728673935, "learning_rate": 8.869777883724144e-06, "loss": 0.4625, "num_input_tokens_seen": 57546624, "step": 47320 }, { "epoch": 5.929708056634507, "grad_norm": 0.07404346019029617, "learning_rate": 8.869431659578859e-06, "loss": 0.458, "num_input_tokens_seen": 57552704, "step": 47325 }, { "epoch": 5.930334544543291, "grad_norm": 0.08090231567621231, "learning_rate": 8.869085389171263e-06, "loss": 0.4641, "num_input_tokens_seen": 57558208, "step": 47330 }, { "epoch": 5.930961032452074, "grad_norm": 0.07836835831403732, "learning_rate": 8.8687390725055e-06, "loss": 0.4657, "num_input_tokens_seen": 57564544, "step": 47335 }, { "epoch": 5.931587520360857, "grad_norm": 0.08613072335720062, "learning_rate": 8.86839270958571e-06, "loss": 0.4587, "num_input_tokens_seen": 57570752, "step": 47340 }, { "epoch": 5.93221400826964, "grad_norm": 0.09379622340202332, "learning_rate": 8.868046300416032e-06, "loss": 0.468, "num_input_tokens_seen": 57576832, "step": 47345 }, { "epoch": 5.932840496178423, "grad_norm": 0.0991315022110939, "learning_rate": 8.867699845000606e-06, "loss": 0.4644, "num_input_tokens_seen": 57582208, "step": 47350 }, { "epoch": 5.933466984087207, "grad_norm": 0.06515125185251236, "learning_rate": 8.86735334334358e-06, "loss": 0.4633, "num_input_tokens_seen": 57588512, "step": 47355 }, { "epoch": 5.934093471995991, "grad_norm": 0.08939684927463531, "learning_rate": 8.86700679544909e-06, "loss": 0.4633, "num_input_tokens_seen": 57594304, "step": 47360 }, { "epoch": 5.934719959904774, "grad_norm": 0.1000419557094574, "learning_rate": 8.866660201321285e-06, "loss": 0.4632, "num_input_tokens_seen": 57600608, "step": 47365 }, { "epoch": 5.935346447813557, "grad_norm": 0.06670107692480087, "learning_rate": 8.866313560964304e-06, "loss": 0.4602, "num_input_tokens_seen": 57606912, "step": 47370 }, { "epoch": 5.93597293572234, "grad_norm": 0.0994686484336853, "learning_rate": 8.865966874382294e-06, "loss": 0.4678, "num_input_tokens_seen": 57612832, "step": 47375 }, { "epoch": 5.936599423631124, "grad_norm": 0.07023890316486359, "learning_rate": 8.865620141579402e-06, "loss": 0.461, "num_input_tokens_seen": 57618784, "step": 47380 }, { "epoch": 5.937225911539907, "grad_norm": 0.09769494086503983, "learning_rate": 8.86527336255977e-06, "loss": 0.4659, "num_input_tokens_seen": 57624800, "step": 47385 }, { "epoch": 5.9378523994486905, "grad_norm": 0.0638953149318695, "learning_rate": 8.864926537327546e-06, "loss": 0.4604, "num_input_tokens_seen": 57630080, "step": 47390 }, { "epoch": 5.938478887357474, "grad_norm": 0.13119250535964966, "learning_rate": 8.864579665886875e-06, "loss": 0.4595, "num_input_tokens_seen": 57636416, "step": 47395 }, { "epoch": 5.939105375266258, "grad_norm": 0.07776203751564026, "learning_rate": 8.864232748241904e-06, "loss": 0.4632, "num_input_tokens_seen": 57642272, "step": 47400 }, { "epoch": 5.939731863175041, "grad_norm": 0.08941872417926788, "learning_rate": 8.863885784396782e-06, "loss": 0.463, "num_input_tokens_seen": 57648384, "step": 47405 }, { "epoch": 5.940358351083824, "grad_norm": 0.10953208804130554, "learning_rate": 8.863538774355658e-06, "loss": 0.4691, "num_input_tokens_seen": 57654592, "step": 47410 }, { "epoch": 5.940984838992607, "grad_norm": 0.06598550081253052, "learning_rate": 8.863191718122678e-06, "loss": 0.4594, "num_input_tokens_seen": 57660992, "step": 47415 }, { "epoch": 5.941611326901391, "grad_norm": 0.06784136593341827, "learning_rate": 8.862844615701996e-06, "loss": 0.4607, "num_input_tokens_seen": 57667360, "step": 47420 }, { "epoch": 5.942237814810174, "grad_norm": 0.15788041055202484, "learning_rate": 8.862497467097756e-06, "loss": 0.4694, "num_input_tokens_seen": 57673568, "step": 47425 }, { "epoch": 5.942864302718958, "grad_norm": 0.040701668709516525, "learning_rate": 8.862150272314112e-06, "loss": 0.464, "num_input_tokens_seen": 57679520, "step": 47430 }, { "epoch": 5.943490790627741, "grad_norm": 0.033456284552812576, "learning_rate": 8.861803031355215e-06, "loss": 0.4654, "num_input_tokens_seen": 57685984, "step": 47435 }, { "epoch": 5.944117278536524, "grad_norm": 0.06899573653936386, "learning_rate": 8.861455744225216e-06, "loss": 0.4588, "num_input_tokens_seen": 57691520, "step": 47440 }, { "epoch": 5.944743766445308, "grad_norm": 0.06055184826254845, "learning_rate": 8.861108410928267e-06, "loss": 0.4614, "num_input_tokens_seen": 57697824, "step": 47445 }, { "epoch": 5.945370254354091, "grad_norm": 0.06461210548877716, "learning_rate": 8.86076103146852e-06, "loss": 0.4632, "num_input_tokens_seen": 57703360, "step": 47450 }, { "epoch": 5.945996742262874, "grad_norm": 0.06596332043409348, "learning_rate": 8.860413605850129e-06, "loss": 0.4607, "num_input_tokens_seen": 57709440, "step": 47455 }, { "epoch": 5.9466232301716575, "grad_norm": 0.038609255105257034, "learning_rate": 8.860066134077249e-06, "loss": 0.46, "num_input_tokens_seen": 57715552, "step": 47460 }, { "epoch": 5.947249718080441, "grad_norm": 0.06456496566534042, "learning_rate": 8.859718616154031e-06, "loss": 0.4618, "num_input_tokens_seen": 57721664, "step": 47465 }, { "epoch": 5.947876205989225, "grad_norm": 0.15522457659244537, "learning_rate": 8.859371052084633e-06, "loss": 0.4588, "num_input_tokens_seen": 57727872, "step": 47470 }, { "epoch": 5.948502693898008, "grad_norm": 0.08183881640434265, "learning_rate": 8.85902344187321e-06, "loss": 0.4618, "num_input_tokens_seen": 57734112, "step": 47475 }, { "epoch": 5.949129181806791, "grad_norm": 0.06558384001255035, "learning_rate": 8.858675785523915e-06, "loss": 0.4647, "num_input_tokens_seen": 57739968, "step": 47480 }, { "epoch": 5.949755669715574, "grad_norm": 0.036745585501194, "learning_rate": 8.85832808304091e-06, "loss": 0.4704, "num_input_tokens_seen": 57746144, "step": 47485 }, { "epoch": 5.950382157624357, "grad_norm": 0.0340953953564167, "learning_rate": 8.857980334428346e-06, "loss": 0.4574, "num_input_tokens_seen": 57752224, "step": 47490 }, { "epoch": 5.9510086455331415, "grad_norm": 0.058584146201610565, "learning_rate": 8.857632539690387e-06, "loss": 0.4673, "num_input_tokens_seen": 57757952, "step": 47495 }, { "epoch": 5.951635133441925, "grad_norm": 0.1042390838265419, "learning_rate": 8.857284698831184e-06, "loss": 0.464, "num_input_tokens_seen": 57764128, "step": 47500 }, { "epoch": 5.952261621350708, "grad_norm": 0.06865245848894119, "learning_rate": 8.8569368118549e-06, "loss": 0.4571, "num_input_tokens_seen": 57770368, "step": 47505 }, { "epoch": 5.952888109259491, "grad_norm": 0.03787824511528015, "learning_rate": 8.856588878765695e-06, "loss": 0.4665, "num_input_tokens_seen": 57776320, "step": 47510 }, { "epoch": 5.953514597168275, "grad_norm": 0.060460373759269714, "learning_rate": 8.856240899567726e-06, "loss": 0.4634, "num_input_tokens_seen": 57782560, "step": 47515 }, { "epoch": 5.954141085077058, "grad_norm": 0.07227773219347, "learning_rate": 8.855892874265156e-06, "loss": 0.4626, "num_input_tokens_seen": 57788704, "step": 47520 }, { "epoch": 5.954767572985841, "grad_norm": 0.0784849002957344, "learning_rate": 8.855544802862145e-06, "loss": 0.4603, "num_input_tokens_seen": 57794944, "step": 47525 }, { "epoch": 5.9553940608946245, "grad_norm": 0.0663098692893982, "learning_rate": 8.855196685362854e-06, "loss": 0.4709, "num_input_tokens_seen": 57801184, "step": 47530 }, { "epoch": 5.956020548803409, "grad_norm": 0.06788811832666397, "learning_rate": 8.854848521771443e-06, "loss": 0.4621, "num_input_tokens_seen": 57807360, "step": 47535 }, { "epoch": 5.956647036712192, "grad_norm": 0.10399134457111359, "learning_rate": 8.854500312092081e-06, "loss": 0.4635, "num_input_tokens_seen": 57813536, "step": 47540 }, { "epoch": 5.957273524620975, "grad_norm": 0.06287527829408646, "learning_rate": 8.854152056328927e-06, "loss": 0.4614, "num_input_tokens_seen": 57819744, "step": 47545 }, { "epoch": 5.957900012529758, "grad_norm": 0.08585432171821594, "learning_rate": 8.853803754486142e-06, "loss": 0.4598, "num_input_tokens_seen": 57825792, "step": 47550 }, { "epoch": 5.958526500438541, "grad_norm": 0.03486786037683487, "learning_rate": 8.853455406567894e-06, "loss": 0.4612, "num_input_tokens_seen": 57831776, "step": 47555 }, { "epoch": 5.959152988347325, "grad_norm": 0.07236239314079285, "learning_rate": 8.853107012578347e-06, "loss": 0.4585, "num_input_tokens_seen": 57837856, "step": 47560 }, { "epoch": 5.9597794762561085, "grad_norm": 0.12640564143657684, "learning_rate": 8.852758572521666e-06, "loss": 0.4673, "num_input_tokens_seen": 57843744, "step": 47565 }, { "epoch": 5.960405964164892, "grad_norm": 0.09106022864580154, "learning_rate": 8.852410086402017e-06, "loss": 0.4637, "num_input_tokens_seen": 57849792, "step": 47570 }, { "epoch": 5.961032452073675, "grad_norm": 0.07479950785636902, "learning_rate": 8.852061554223567e-06, "loss": 0.4582, "num_input_tokens_seen": 57856096, "step": 47575 }, { "epoch": 5.961658939982458, "grad_norm": 0.13624800741672516, "learning_rate": 8.851712975990481e-06, "loss": 0.4606, "num_input_tokens_seen": 57862048, "step": 47580 }, { "epoch": 5.962285427891242, "grad_norm": 0.06635899096727371, "learning_rate": 8.85136435170693e-06, "loss": 0.4669, "num_input_tokens_seen": 57868288, "step": 47585 }, { "epoch": 5.962911915800025, "grad_norm": 0.09966006129980087, "learning_rate": 8.85101568137708e-06, "loss": 0.4657, "num_input_tokens_seen": 57874496, "step": 47590 }, { "epoch": 5.963538403708808, "grad_norm": 0.031620174646377563, "learning_rate": 8.850666965005098e-06, "loss": 0.4658, "num_input_tokens_seen": 57880096, "step": 47595 }, { "epoch": 5.964164891617592, "grad_norm": 0.12657609581947327, "learning_rate": 8.850318202595156e-06, "loss": 0.4653, "num_input_tokens_seen": 57886080, "step": 47600 }, { "epoch": 5.964791379526375, "grad_norm": 0.058889225125312805, "learning_rate": 8.849969394151424e-06, "loss": 0.4645, "num_input_tokens_seen": 57892160, "step": 47605 }, { "epoch": 5.965417867435159, "grad_norm": 0.07084383815526962, "learning_rate": 8.849620539678068e-06, "loss": 0.4646, "num_input_tokens_seen": 57897760, "step": 47610 }, { "epoch": 5.966044355343942, "grad_norm": 0.0682009607553482, "learning_rate": 8.849271639179264e-06, "loss": 0.4653, "num_input_tokens_seen": 57903520, "step": 47615 }, { "epoch": 5.966670843252725, "grad_norm": 0.05492265895009041, "learning_rate": 8.848922692659181e-06, "loss": 0.4679, "num_input_tokens_seen": 57909600, "step": 47620 }, { "epoch": 5.967297331161508, "grad_norm": 0.09282828867435455, "learning_rate": 8.84857370012199e-06, "loss": 0.4528, "num_input_tokens_seen": 57915840, "step": 47625 }, { "epoch": 5.9679238190702915, "grad_norm": 0.12731005251407623, "learning_rate": 8.848224661571866e-06, "loss": 0.4624, "num_input_tokens_seen": 57921344, "step": 47630 }, { "epoch": 5.9685503069790755, "grad_norm": 0.10357087850570679, "learning_rate": 8.847875577012981e-06, "loss": 0.4607, "num_input_tokens_seen": 57927712, "step": 47635 }, { "epoch": 5.969176794887859, "grad_norm": 0.08539704233407974, "learning_rate": 8.847526446449509e-06, "loss": 0.4616, "num_input_tokens_seen": 57933856, "step": 47640 }, { "epoch": 5.969803282796642, "grad_norm": 0.09691361337900162, "learning_rate": 8.847177269885621e-06, "loss": 0.4657, "num_input_tokens_seen": 57939968, "step": 47645 }, { "epoch": 5.970429770705425, "grad_norm": 0.06771879643201828, "learning_rate": 8.846828047325496e-06, "loss": 0.4644, "num_input_tokens_seen": 57945792, "step": 47650 }, { "epoch": 5.971056258614209, "grad_norm": 0.10705562680959702, "learning_rate": 8.846478778773307e-06, "loss": 0.4637, "num_input_tokens_seen": 57952160, "step": 47655 }, { "epoch": 5.971682746522992, "grad_norm": 0.11607062071561813, "learning_rate": 8.846129464233231e-06, "loss": 0.46, "num_input_tokens_seen": 57958304, "step": 47660 }, { "epoch": 5.972309234431775, "grad_norm": 0.10898053646087646, "learning_rate": 8.845780103709443e-06, "loss": 0.459, "num_input_tokens_seen": 57964544, "step": 47665 }, { "epoch": 5.972935722340559, "grad_norm": 0.11572449654340744, "learning_rate": 8.84543069720612e-06, "loss": 0.4625, "num_input_tokens_seen": 57970720, "step": 47670 }, { "epoch": 5.973562210249343, "grad_norm": 0.06590736657381058, "learning_rate": 8.84508124472744e-06, "loss": 0.4689, "num_input_tokens_seen": 57976928, "step": 47675 }, { "epoch": 5.974188698158126, "grad_norm": 0.06620132923126221, "learning_rate": 8.844731746277582e-06, "loss": 0.4641, "num_input_tokens_seen": 57983232, "step": 47680 }, { "epoch": 5.974815186066909, "grad_norm": 0.11539358645677567, "learning_rate": 8.844382201860722e-06, "loss": 0.4654, "num_input_tokens_seen": 57989536, "step": 47685 }, { "epoch": 5.975441673975692, "grad_norm": 0.07593216001987457, "learning_rate": 8.844032611481041e-06, "loss": 0.4693, "num_input_tokens_seen": 57995648, "step": 47690 }, { "epoch": 5.976068161884475, "grad_norm": 0.0644305869936943, "learning_rate": 8.84368297514272e-06, "loss": 0.4568, "num_input_tokens_seen": 58001600, "step": 47695 }, { "epoch": 5.976694649793259, "grad_norm": 0.07142583280801773, "learning_rate": 8.843333292849936e-06, "loss": 0.4633, "num_input_tokens_seen": 58007648, "step": 47700 }, { "epoch": 5.9773211377020425, "grad_norm": 0.0968664363026619, "learning_rate": 8.84298356460687e-06, "loss": 0.4637, "num_input_tokens_seen": 58013984, "step": 47705 }, { "epoch": 5.977947625610826, "grad_norm": 0.09575245529413223, "learning_rate": 8.842633790417706e-06, "loss": 0.4589, "num_input_tokens_seen": 58020160, "step": 47710 }, { "epoch": 5.978574113519609, "grad_norm": 0.07220412790775299, "learning_rate": 8.842283970286624e-06, "loss": 0.4623, "num_input_tokens_seen": 58025952, "step": 47715 }, { "epoch": 5.979200601428392, "grad_norm": 0.03650998696684837, "learning_rate": 8.841934104217808e-06, "loss": 0.4649, "num_input_tokens_seen": 58032032, "step": 47720 }, { "epoch": 5.979827089337176, "grad_norm": 0.09709551930427551, "learning_rate": 8.841584192215438e-06, "loss": 0.4609, "num_input_tokens_seen": 58037856, "step": 47725 }, { "epoch": 5.980453577245959, "grad_norm": 0.05971008539199829, "learning_rate": 8.8412342342837e-06, "loss": 0.4649, "num_input_tokens_seen": 58044320, "step": 47730 }, { "epoch": 5.981080065154742, "grad_norm": 0.033056605607271194, "learning_rate": 8.840884230426777e-06, "loss": 0.4633, "num_input_tokens_seen": 58050144, "step": 47735 }, { "epoch": 5.981706553063526, "grad_norm": 0.05903568118810654, "learning_rate": 8.840534180648854e-06, "loss": 0.4661, "num_input_tokens_seen": 58056352, "step": 47740 }, { "epoch": 5.982333040972309, "grad_norm": 0.06330219656229019, "learning_rate": 8.840184084954115e-06, "loss": 0.4672, "num_input_tokens_seen": 58062496, "step": 47745 }, { "epoch": 5.982959528881093, "grad_norm": 0.06416647881269455, "learning_rate": 8.839833943346748e-06, "loss": 0.46, "num_input_tokens_seen": 58068384, "step": 47750 }, { "epoch": 5.983586016789876, "grad_norm": 0.05334393307566643, "learning_rate": 8.839483755830937e-06, "loss": 0.4631, "num_input_tokens_seen": 58074720, "step": 47755 }, { "epoch": 5.984212504698659, "grad_norm": 0.0997430607676506, "learning_rate": 8.839133522410869e-06, "loss": 0.462, "num_input_tokens_seen": 58080544, "step": 47760 }, { "epoch": 5.984838992607442, "grad_norm": 0.0716024786233902, "learning_rate": 8.838783243090734e-06, "loss": 0.4617, "num_input_tokens_seen": 58086592, "step": 47765 }, { "epoch": 5.985465480516226, "grad_norm": 0.06612814217805862, "learning_rate": 8.838432917874715e-06, "loss": 0.4641, "num_input_tokens_seen": 58092800, "step": 47770 }, { "epoch": 5.98609196842501, "grad_norm": 0.058965519070625305, "learning_rate": 8.838082546767006e-06, "loss": 0.4614, "num_input_tokens_seen": 58098400, "step": 47775 }, { "epoch": 5.986718456333793, "grad_norm": 0.06488654017448425, "learning_rate": 8.837732129771791e-06, "loss": 0.4593, "num_input_tokens_seen": 58104704, "step": 47780 }, { "epoch": 5.987344944242576, "grad_norm": 0.06918847560882568, "learning_rate": 8.837381666893264e-06, "loss": 0.4656, "num_input_tokens_seen": 58111040, "step": 47785 }, { "epoch": 5.98797143215136, "grad_norm": 0.05534807965159416, "learning_rate": 8.83703115813561e-06, "loss": 0.4627, "num_input_tokens_seen": 58116800, "step": 47790 }, { "epoch": 5.988597920060143, "grad_norm": 0.05764995515346527, "learning_rate": 8.836680603503025e-06, "loss": 0.4601, "num_input_tokens_seen": 58123072, "step": 47795 }, { "epoch": 5.989224407968926, "grad_norm": 0.06298662722110748, "learning_rate": 8.836330002999699e-06, "loss": 0.4609, "num_input_tokens_seen": 58128768, "step": 47800 }, { "epoch": 5.9898508958777095, "grad_norm": 0.06924694031476974, "learning_rate": 8.835979356629818e-06, "loss": 0.463, "num_input_tokens_seen": 58134912, "step": 47805 }, { "epoch": 5.990477383786493, "grad_norm": 0.06439686566591263, "learning_rate": 8.835628664397582e-06, "loss": 0.4616, "num_input_tokens_seen": 58140928, "step": 47810 }, { "epoch": 5.991103871695277, "grad_norm": 0.03874491527676582, "learning_rate": 8.835277926307178e-06, "loss": 0.4663, "num_input_tokens_seen": 58147072, "step": 47815 }, { "epoch": 5.99173035960406, "grad_norm": 0.06492523849010468, "learning_rate": 8.834927142362804e-06, "loss": 0.464, "num_input_tokens_seen": 58153088, "step": 47820 }, { "epoch": 5.992356847512843, "grad_norm": 0.09607674181461334, "learning_rate": 8.834576312568652e-06, "loss": 0.4596, "num_input_tokens_seen": 58159072, "step": 47825 }, { "epoch": 5.992983335421626, "grad_norm": 0.06453994661569595, "learning_rate": 8.834225436928916e-06, "loss": 0.4656, "num_input_tokens_seen": 58165248, "step": 47830 }, { "epoch": 5.993609823330409, "grad_norm": 0.05754554644227028, "learning_rate": 8.83387451544779e-06, "loss": 0.464, "num_input_tokens_seen": 58171456, "step": 47835 }, { "epoch": 5.994236311239193, "grad_norm": 0.10904520750045776, "learning_rate": 8.833523548129471e-06, "loss": 0.4629, "num_input_tokens_seen": 58177440, "step": 47840 }, { "epoch": 5.994862799147977, "grad_norm": 0.06439678370952606, "learning_rate": 8.833172534978156e-06, "loss": 0.4594, "num_input_tokens_seen": 58183520, "step": 47845 }, { "epoch": 5.99548928705676, "grad_norm": 0.07031133770942688, "learning_rate": 8.832821475998039e-06, "loss": 0.4608, "num_input_tokens_seen": 58189696, "step": 47850 }, { "epoch": 5.996115774965543, "grad_norm": 0.055770453065633774, "learning_rate": 8.83247037119332e-06, "loss": 0.4571, "num_input_tokens_seen": 58195904, "step": 47855 }, { "epoch": 5.996742262874326, "grad_norm": 0.030471062287688255, "learning_rate": 8.832119220568196e-06, "loss": 0.4556, "num_input_tokens_seen": 58201920, "step": 47860 }, { "epoch": 5.99736875078311, "grad_norm": 0.06739028543233871, "learning_rate": 8.831768024126864e-06, "loss": 0.4631, "num_input_tokens_seen": 58208096, "step": 47865 }, { "epoch": 5.997995238691893, "grad_norm": 0.12415659427642822, "learning_rate": 8.831416781873524e-06, "loss": 0.4587, "num_input_tokens_seen": 58214080, "step": 47870 }, { "epoch": 5.9986217266006765, "grad_norm": 0.08783183991909027, "learning_rate": 8.831065493812375e-06, "loss": 0.459, "num_input_tokens_seen": 58220064, "step": 47875 }, { "epoch": 5.99924821450946, "grad_norm": 0.09779641777276993, "learning_rate": 8.830714159947618e-06, "loss": 0.4678, "num_input_tokens_seen": 58225824, "step": 47880 }, { "epoch": 5.999874702418243, "grad_norm": 0.06458035856485367, "learning_rate": 8.830362780283451e-06, "loss": 0.453, "num_input_tokens_seen": 58231744, "step": 47885 }, { "epoch": 6.0, "eval_loss": 0.4630619287490845, "eval_runtime": 222.7978, "eval_samples_per_second": 35.822, "eval_steps_per_second": 8.959, "num_input_tokens_seen": 58233056, "step": 47886 }, { "epoch": 6.000501190327027, "grad_norm": 0.10835020244121552, "learning_rate": 8.830011354824077e-06, "loss": 0.4726, "num_input_tokens_seen": 58238272, "step": 47890 }, { "epoch": 6.00112767823581, "grad_norm": 0.05393127724528313, "learning_rate": 8.829659883573699e-06, "loss": 0.4621, "num_input_tokens_seen": 58244544, "step": 47895 }, { "epoch": 6.001754166144593, "grad_norm": 0.06650180369615555, "learning_rate": 8.829308366536514e-06, "loss": 0.4656, "num_input_tokens_seen": 58250912, "step": 47900 }, { "epoch": 6.002380654053376, "grad_norm": 0.061968788504600525, "learning_rate": 8.82895680371673e-06, "loss": 0.4566, "num_input_tokens_seen": 58256992, "step": 47905 }, { "epoch": 6.0030071419621605, "grad_norm": 0.06938350200653076, "learning_rate": 8.828605195118546e-06, "loss": 0.4614, "num_input_tokens_seen": 58263072, "step": 47910 }, { "epoch": 6.003633629870944, "grad_norm": 0.07351958006620407, "learning_rate": 8.828253540746172e-06, "loss": 0.4589, "num_input_tokens_seen": 58269376, "step": 47915 }, { "epoch": 6.004260117779727, "grad_norm": 0.1000603511929512, "learning_rate": 8.827901840603805e-06, "loss": 0.4693, "num_input_tokens_seen": 58275680, "step": 47920 }, { "epoch": 6.00488660568851, "grad_norm": 0.183871328830719, "learning_rate": 8.827550094695654e-06, "loss": 0.4693, "num_input_tokens_seen": 58282016, "step": 47925 }, { "epoch": 6.005513093597294, "grad_norm": 0.037299636751413345, "learning_rate": 8.827198303025924e-06, "loss": 0.4671, "num_input_tokens_seen": 58287808, "step": 47930 }, { "epoch": 6.006139581506077, "grad_norm": 0.07371057569980621, "learning_rate": 8.826846465598821e-06, "loss": 0.4651, "num_input_tokens_seen": 58293888, "step": 47935 }, { "epoch": 6.00676606941486, "grad_norm": 0.06637900322675705, "learning_rate": 8.82649458241855e-06, "loss": 0.4588, "num_input_tokens_seen": 58300160, "step": 47940 }, { "epoch": 6.0073925573236435, "grad_norm": 0.03506059572100639, "learning_rate": 8.82614265348932e-06, "loss": 0.465, "num_input_tokens_seen": 58306112, "step": 47945 }, { "epoch": 6.008019045232427, "grad_norm": 0.03437497839331627, "learning_rate": 8.825790678815337e-06, "loss": 0.4634, "num_input_tokens_seen": 58312384, "step": 47950 }, { "epoch": 6.008645533141211, "grad_norm": 0.03703070431947708, "learning_rate": 8.825438658400811e-06, "loss": 0.4664, "num_input_tokens_seen": 58318624, "step": 47955 }, { "epoch": 6.009272021049994, "grad_norm": 0.09073121100664139, "learning_rate": 8.825086592249948e-06, "loss": 0.4615, "num_input_tokens_seen": 58324672, "step": 47960 }, { "epoch": 6.009898508958777, "grad_norm": 0.07315697520971298, "learning_rate": 8.82473448036696e-06, "loss": 0.4693, "num_input_tokens_seen": 58330976, "step": 47965 }, { "epoch": 6.01052499686756, "grad_norm": 0.06821868568658829, "learning_rate": 8.824382322756055e-06, "loss": 0.4607, "num_input_tokens_seen": 58336960, "step": 47970 }, { "epoch": 6.011151484776343, "grad_norm": 0.05862938612699509, "learning_rate": 8.824030119421443e-06, "loss": 0.4635, "num_input_tokens_seen": 58343168, "step": 47975 }, { "epoch": 6.0117779726851275, "grad_norm": 0.09290329366922379, "learning_rate": 8.823677870367338e-06, "loss": 0.4604, "num_input_tokens_seen": 58349344, "step": 47980 }, { "epoch": 6.012404460593911, "grad_norm": 0.034287240356206894, "learning_rate": 8.823325575597949e-06, "loss": 0.458, "num_input_tokens_seen": 58355456, "step": 47985 }, { "epoch": 6.013030948502694, "grad_norm": 0.06248914822936058, "learning_rate": 8.822973235117487e-06, "loss": 0.4586, "num_input_tokens_seen": 58361568, "step": 47990 }, { "epoch": 6.013657436411477, "grad_norm": 0.057738739997148514, "learning_rate": 8.822620848930166e-06, "loss": 0.4662, "num_input_tokens_seen": 58366912, "step": 47995 }, { "epoch": 6.014283924320261, "grad_norm": 0.0633246898651123, "learning_rate": 8.822268417040201e-06, "loss": 0.4591, "num_input_tokens_seen": 58372832, "step": 48000 }, { "epoch": 6.014910412229044, "grad_norm": 0.0366935059428215, "learning_rate": 8.821915939451801e-06, "loss": 0.4634, "num_input_tokens_seen": 58378816, "step": 48005 }, { "epoch": 6.015536900137827, "grad_norm": 0.03549626097083092, "learning_rate": 8.821563416169184e-06, "loss": 0.4576, "num_input_tokens_seen": 58384480, "step": 48010 }, { "epoch": 6.0161633880466105, "grad_norm": 0.07751835882663727, "learning_rate": 8.821210847196562e-06, "loss": 0.4615, "num_input_tokens_seen": 58390880, "step": 48015 }, { "epoch": 6.016789875955394, "grad_norm": 0.03743797540664673, "learning_rate": 8.820858232538151e-06, "loss": 0.4636, "num_input_tokens_seen": 58396800, "step": 48020 }, { "epoch": 6.017416363864178, "grad_norm": 0.06050553172826767, "learning_rate": 8.820505572198171e-06, "loss": 0.4645, "num_input_tokens_seen": 58403040, "step": 48025 }, { "epoch": 6.018042851772961, "grad_norm": 0.0587163120508194, "learning_rate": 8.820152866180831e-06, "loss": 0.4639, "num_input_tokens_seen": 58409120, "step": 48030 }, { "epoch": 6.018669339681744, "grad_norm": 0.06163768097758293, "learning_rate": 8.819800114490355e-06, "loss": 0.4647, "num_input_tokens_seen": 58415104, "step": 48035 }, { "epoch": 6.019295827590527, "grad_norm": 0.09515716880559921, "learning_rate": 8.819447317130954e-06, "loss": 0.46, "num_input_tokens_seen": 58421152, "step": 48040 }, { "epoch": 6.0199223154993105, "grad_norm": 0.062497735023498535, "learning_rate": 8.81909447410685e-06, "loss": 0.4565, "num_input_tokens_seen": 58427168, "step": 48045 }, { "epoch": 6.0205488034080945, "grad_norm": 0.10865052789449692, "learning_rate": 8.818741585422262e-06, "loss": 0.4666, "num_input_tokens_seen": 58433408, "step": 48050 }, { "epoch": 6.021175291316878, "grad_norm": 0.09236513078212738, "learning_rate": 8.818388651081408e-06, "loss": 0.4618, "num_input_tokens_seen": 58439648, "step": 48055 }, { "epoch": 6.021801779225661, "grad_norm": 0.06647511571645737, "learning_rate": 8.818035671088505e-06, "loss": 0.4557, "num_input_tokens_seen": 58445600, "step": 48060 }, { "epoch": 6.022428267134444, "grad_norm": 0.032876260578632355, "learning_rate": 8.817682645447779e-06, "loss": 0.4628, "num_input_tokens_seen": 58451744, "step": 48065 }, { "epoch": 6.023054755043228, "grad_norm": 0.09854987263679504, "learning_rate": 8.817329574163444e-06, "loss": 0.4627, "num_input_tokens_seen": 58458048, "step": 48070 }, { "epoch": 6.023681242952011, "grad_norm": 0.06481342762708664, "learning_rate": 8.816976457239725e-06, "loss": 0.4598, "num_input_tokens_seen": 58463936, "step": 48075 }, { "epoch": 6.024307730860794, "grad_norm": 0.13363583385944366, "learning_rate": 8.816623294680844e-06, "loss": 0.467, "num_input_tokens_seen": 58470112, "step": 48080 }, { "epoch": 6.024934218769578, "grad_norm": 0.03375910222530365, "learning_rate": 8.816270086491024e-06, "loss": 0.4582, "num_input_tokens_seen": 58476064, "step": 48085 }, { "epoch": 6.025560706678361, "grad_norm": 0.06732591986656189, "learning_rate": 8.815916832674484e-06, "loss": 0.4657, "num_input_tokens_seen": 58482240, "step": 48090 }, { "epoch": 6.026187194587145, "grad_norm": 0.08079066127538681, "learning_rate": 8.815563533235451e-06, "loss": 0.4646, "num_input_tokens_seen": 58488128, "step": 48095 }, { "epoch": 6.026813682495928, "grad_norm": 0.1536151021718979, "learning_rate": 8.815210188178146e-06, "loss": 0.4584, "num_input_tokens_seen": 58494208, "step": 48100 }, { "epoch": 6.027440170404711, "grad_norm": 0.1306115835905075, "learning_rate": 8.8148567975068e-06, "loss": 0.4634, "num_input_tokens_seen": 58500896, "step": 48105 }, { "epoch": 6.028066658313494, "grad_norm": 0.08441682904958725, "learning_rate": 8.81450336122563e-06, "loss": 0.455, "num_input_tokens_seen": 58507200, "step": 48110 }, { "epoch": 6.0286931462222775, "grad_norm": 0.06865226477384567, "learning_rate": 8.814149879338867e-06, "loss": 0.4628, "num_input_tokens_seen": 58513440, "step": 48115 }, { "epoch": 6.0293196341310615, "grad_norm": 0.07637793570756912, "learning_rate": 8.813796351850735e-06, "loss": 0.4614, "num_input_tokens_seen": 58519552, "step": 48120 }, { "epoch": 6.029946122039845, "grad_norm": 0.06451189517974854, "learning_rate": 8.813442778765462e-06, "loss": 0.4615, "num_input_tokens_seen": 58525472, "step": 48125 }, { "epoch": 6.030572609948628, "grad_norm": 0.06802592426538467, "learning_rate": 8.813089160087274e-06, "loss": 0.4604, "num_input_tokens_seen": 58531808, "step": 48130 }, { "epoch": 6.031199097857411, "grad_norm": 0.04030654579401016, "learning_rate": 8.812735495820399e-06, "loss": 0.4632, "num_input_tokens_seen": 58538080, "step": 48135 }, { "epoch": 6.031825585766195, "grad_norm": 0.06846386939287186, "learning_rate": 8.812381785969063e-06, "loss": 0.4604, "num_input_tokens_seen": 58544608, "step": 48140 }, { "epoch": 6.032452073674978, "grad_norm": 0.07602433860301971, "learning_rate": 8.8120280305375e-06, "loss": 0.4677, "num_input_tokens_seen": 58550720, "step": 48145 }, { "epoch": 6.033078561583761, "grad_norm": 0.06545381993055344, "learning_rate": 8.811674229529938e-06, "loss": 0.4712, "num_input_tokens_seen": 58556448, "step": 48150 }, { "epoch": 6.033705049492545, "grad_norm": 0.07154776901006699, "learning_rate": 8.811320382950605e-06, "loss": 0.458, "num_input_tokens_seen": 58562688, "step": 48155 }, { "epoch": 6.034331537401328, "grad_norm": 0.06826142966747284, "learning_rate": 8.810966490803732e-06, "loss": 0.4668, "num_input_tokens_seen": 58568864, "step": 48160 }, { "epoch": 6.034958025310112, "grad_norm": 0.038550883531570435, "learning_rate": 8.81061255309355e-06, "loss": 0.4614, "num_input_tokens_seen": 58575200, "step": 48165 }, { "epoch": 6.035584513218895, "grad_norm": 0.07346131652593613, "learning_rate": 8.810258569824293e-06, "loss": 0.4633, "num_input_tokens_seen": 58581120, "step": 48170 }, { "epoch": 6.036211001127678, "grad_norm": 0.04057324677705765, "learning_rate": 8.80990454100019e-06, "loss": 0.4615, "num_input_tokens_seen": 58587712, "step": 48175 }, { "epoch": 6.036837489036461, "grad_norm": 0.09868934750556946, "learning_rate": 8.809550466625475e-06, "loss": 0.4607, "num_input_tokens_seen": 58593600, "step": 48180 }, { "epoch": 6.037463976945245, "grad_norm": 0.067369244992733, "learning_rate": 8.809196346704381e-06, "loss": 0.463, "num_input_tokens_seen": 58599776, "step": 48185 }, { "epoch": 6.038090464854029, "grad_norm": 0.0649292841553688, "learning_rate": 8.808842181241142e-06, "loss": 0.4634, "num_input_tokens_seen": 58606080, "step": 48190 }, { "epoch": 6.038716952762812, "grad_norm": 0.07117991149425507, "learning_rate": 8.808487970239993e-06, "loss": 0.4669, "num_input_tokens_seen": 58612128, "step": 48195 }, { "epoch": 6.039343440671595, "grad_norm": 0.07790203392505646, "learning_rate": 8.808133713705169e-06, "loss": 0.4614, "num_input_tokens_seen": 58618464, "step": 48200 }, { "epoch": 6.039969928580378, "grad_norm": 0.07698185741901398, "learning_rate": 8.807779411640904e-06, "loss": 0.4632, "num_input_tokens_seen": 58624480, "step": 48205 }, { "epoch": 6.040596416489162, "grad_norm": 0.06912505626678467, "learning_rate": 8.807425064051434e-06, "loss": 0.466, "num_input_tokens_seen": 58630624, "step": 48210 }, { "epoch": 6.041222904397945, "grad_norm": 0.12051042169332504, "learning_rate": 8.807070670940996e-06, "loss": 0.4565, "num_input_tokens_seen": 58636864, "step": 48215 }, { "epoch": 6.0418493923067285, "grad_norm": 0.065036840736866, "learning_rate": 8.80671623231383e-06, "loss": 0.46, "num_input_tokens_seen": 58642976, "step": 48220 }, { "epoch": 6.042475880215512, "grad_norm": 0.0783076286315918, "learning_rate": 8.806361748174168e-06, "loss": 0.4599, "num_input_tokens_seen": 58649152, "step": 48225 }, { "epoch": 6.043102368124295, "grad_norm": 0.038318708539009094, "learning_rate": 8.806007218526253e-06, "loss": 0.4628, "num_input_tokens_seen": 58654560, "step": 48230 }, { "epoch": 6.043728856033079, "grad_norm": 0.06298049539327621, "learning_rate": 8.80565264337432e-06, "loss": 0.4651, "num_input_tokens_seen": 58660672, "step": 48235 }, { "epoch": 6.044355343941862, "grad_norm": 0.10233011841773987, "learning_rate": 8.805298022722612e-06, "loss": 0.4616, "num_input_tokens_seen": 58666592, "step": 48240 }, { "epoch": 6.044981831850645, "grad_norm": 0.07516159117221832, "learning_rate": 8.804943356575365e-06, "loss": 0.4684, "num_input_tokens_seen": 58672672, "step": 48245 }, { "epoch": 6.045608319759428, "grad_norm": 0.07210489362478256, "learning_rate": 8.804588644936823e-06, "loss": 0.4705, "num_input_tokens_seen": 58678624, "step": 48250 }, { "epoch": 6.046234807668212, "grad_norm": 0.06912754476070404, "learning_rate": 8.804233887811224e-06, "loss": 0.4647, "num_input_tokens_seen": 58685344, "step": 48255 }, { "epoch": 6.046861295576996, "grad_norm": 0.06285953521728516, "learning_rate": 8.80387908520281e-06, "loss": 0.4569, "num_input_tokens_seen": 58691264, "step": 48260 }, { "epoch": 6.047487783485779, "grad_norm": 0.11266753822565079, "learning_rate": 8.803524237115824e-06, "loss": 0.4598, "num_input_tokens_seen": 58697568, "step": 48265 }, { "epoch": 6.048114271394562, "grad_norm": 0.06774544715881348, "learning_rate": 8.803169343554508e-06, "loss": 0.4654, "num_input_tokens_seen": 58702880, "step": 48270 }, { "epoch": 6.048740759303345, "grad_norm": 0.04075857624411583, "learning_rate": 8.802814404523105e-06, "loss": 0.4626, "num_input_tokens_seen": 58708800, "step": 48275 }, { "epoch": 6.049367247212129, "grad_norm": 0.06402923166751862, "learning_rate": 8.802459420025858e-06, "loss": 0.464, "num_input_tokens_seen": 58714688, "step": 48280 }, { "epoch": 6.049993735120912, "grad_norm": 0.06651376932859421, "learning_rate": 8.802104390067013e-06, "loss": 0.4613, "num_input_tokens_seen": 58720800, "step": 48285 }, { "epoch": 6.0506202230296955, "grad_norm": 0.06784377247095108, "learning_rate": 8.801749314650812e-06, "loss": 0.4552, "num_input_tokens_seen": 58726656, "step": 48290 }, { "epoch": 6.051246710938479, "grad_norm": 0.06272516399621964, "learning_rate": 8.801394193781501e-06, "loss": 0.4639, "num_input_tokens_seen": 58732896, "step": 48295 }, { "epoch": 6.051873198847262, "grad_norm": 0.10416305810213089, "learning_rate": 8.801039027463326e-06, "loss": 0.4652, "num_input_tokens_seen": 58739008, "step": 48300 }, { "epoch": 6.052499686756046, "grad_norm": 0.1148003414273262, "learning_rate": 8.800683815700537e-06, "loss": 0.4623, "num_input_tokens_seen": 58745184, "step": 48305 }, { "epoch": 6.053126174664829, "grad_norm": 0.0977618545293808, "learning_rate": 8.800328558497375e-06, "loss": 0.4635, "num_input_tokens_seen": 58751296, "step": 48310 }, { "epoch": 6.053752662573612, "grad_norm": 0.07809996604919434, "learning_rate": 8.799973255858092e-06, "loss": 0.4574, "num_input_tokens_seen": 58757472, "step": 48315 }, { "epoch": 6.054379150482395, "grad_norm": 0.1087581142783165, "learning_rate": 8.799617907786932e-06, "loss": 0.4631, "num_input_tokens_seen": 58763424, "step": 48320 }, { "epoch": 6.0550056383911794, "grad_norm": 0.07621324062347412, "learning_rate": 8.799262514288146e-06, "loss": 0.4612, "num_input_tokens_seen": 58769440, "step": 48325 }, { "epoch": 6.055632126299963, "grad_norm": 0.06907685846090317, "learning_rate": 8.798907075365982e-06, "loss": 0.466, "num_input_tokens_seen": 58775616, "step": 48330 }, { "epoch": 6.056258614208746, "grad_norm": 0.07168503105640411, "learning_rate": 8.798551591024691e-06, "loss": 0.4621, "num_input_tokens_seen": 58781728, "step": 48335 }, { "epoch": 6.056885102117529, "grad_norm": 0.06674783676862717, "learning_rate": 8.79819606126852e-06, "loss": 0.4661, "num_input_tokens_seen": 58787936, "step": 48340 }, { "epoch": 6.057511590026312, "grad_norm": 0.07451183348894119, "learning_rate": 8.797840486101724e-06, "loss": 0.4656, "num_input_tokens_seen": 58794272, "step": 48345 }, { "epoch": 6.058138077935096, "grad_norm": 0.07281529903411865, "learning_rate": 8.797484865528552e-06, "loss": 0.4666, "num_input_tokens_seen": 58800416, "step": 48350 }, { "epoch": 6.058764565843879, "grad_norm": 0.0810161754488945, "learning_rate": 8.797129199553255e-06, "loss": 0.4542, "num_input_tokens_seen": 58806144, "step": 48355 }, { "epoch": 6.0593910537526625, "grad_norm": 0.10249362885951996, "learning_rate": 8.796773488180083e-06, "loss": 0.4676, "num_input_tokens_seen": 58812096, "step": 48360 }, { "epoch": 6.060017541661446, "grad_norm": 0.062856525182724, "learning_rate": 8.796417731413296e-06, "loss": 0.4613, "num_input_tokens_seen": 58817568, "step": 48365 }, { "epoch": 6.060644029570229, "grad_norm": 0.07984943687915802, "learning_rate": 8.796061929257141e-06, "loss": 0.4654, "num_input_tokens_seen": 58824160, "step": 48370 }, { "epoch": 6.061270517479013, "grad_norm": 0.07121238112449646, "learning_rate": 8.795706081715877e-06, "loss": 0.4639, "num_input_tokens_seen": 58830240, "step": 48375 }, { "epoch": 6.061897005387796, "grad_norm": 0.037975065410137177, "learning_rate": 8.795350188793753e-06, "loss": 0.4674, "num_input_tokens_seen": 58836224, "step": 48380 }, { "epoch": 6.062523493296579, "grad_norm": 0.06454186886548996, "learning_rate": 8.794994250495025e-06, "loss": 0.4645, "num_input_tokens_seen": 58842080, "step": 48385 }, { "epoch": 6.063149981205362, "grad_norm": 0.04114632308483124, "learning_rate": 8.794638266823951e-06, "loss": 0.4637, "num_input_tokens_seen": 58848544, "step": 48390 }, { "epoch": 6.0637764691141465, "grad_norm": 0.09526488184928894, "learning_rate": 8.794282237784788e-06, "loss": 0.4605, "num_input_tokens_seen": 58855392, "step": 48395 }, { "epoch": 6.06440295702293, "grad_norm": 0.06916853040456772, "learning_rate": 8.79392616338179e-06, "loss": 0.4658, "num_input_tokens_seen": 58861792, "step": 48400 }, { "epoch": 6.065029444931713, "grad_norm": 0.06513063609600067, "learning_rate": 8.793570043619215e-06, "loss": 0.4645, "num_input_tokens_seen": 58868032, "step": 48405 }, { "epoch": 6.065655932840496, "grad_norm": 0.06138810142874718, "learning_rate": 8.79321387850132e-06, "loss": 0.4613, "num_input_tokens_seen": 58873984, "step": 48410 }, { "epoch": 6.066282420749279, "grad_norm": 0.06699882447719574, "learning_rate": 8.792857668032364e-06, "loss": 0.4684, "num_input_tokens_seen": 58880000, "step": 48415 }, { "epoch": 6.066908908658063, "grad_norm": 0.0743190348148346, "learning_rate": 8.792501412216607e-06, "loss": 0.4646, "num_input_tokens_seen": 58886496, "step": 48420 }, { "epoch": 6.067535396566846, "grad_norm": 0.10977887362241745, "learning_rate": 8.792145111058306e-06, "loss": 0.468, "num_input_tokens_seen": 58892640, "step": 48425 }, { "epoch": 6.0681618844756295, "grad_norm": 0.06780959665775299, "learning_rate": 8.791788764561721e-06, "loss": 0.4624, "num_input_tokens_seen": 58898816, "step": 48430 }, { "epoch": 6.068788372384413, "grad_norm": 0.07279156893491745, "learning_rate": 8.791432372731115e-06, "loss": 0.4657, "num_input_tokens_seen": 58904768, "step": 48435 }, { "epoch": 6.069414860293197, "grad_norm": 0.06771160662174225, "learning_rate": 8.791075935570748e-06, "loss": 0.4661, "num_input_tokens_seen": 58910912, "step": 48440 }, { "epoch": 6.07004134820198, "grad_norm": 0.11521119624376297, "learning_rate": 8.79071945308488e-06, "loss": 0.4583, "num_input_tokens_seen": 58916928, "step": 48445 }, { "epoch": 6.070667836110763, "grad_norm": 0.06561000645160675, "learning_rate": 8.790362925277772e-06, "loss": 0.4614, "num_input_tokens_seen": 58922976, "step": 48450 }, { "epoch": 6.071294324019546, "grad_norm": 0.05814356356859207, "learning_rate": 8.79000635215369e-06, "loss": 0.4592, "num_input_tokens_seen": 58928960, "step": 48455 }, { "epoch": 6.071920811928329, "grad_norm": 0.06351478397846222, "learning_rate": 8.789649733716897e-06, "loss": 0.4631, "num_input_tokens_seen": 58934560, "step": 48460 }, { "epoch": 6.0725472998371135, "grad_norm": 0.0870589017868042, "learning_rate": 8.789293069971652e-06, "loss": 0.4642, "num_input_tokens_seen": 58940640, "step": 48465 }, { "epoch": 6.073173787745897, "grad_norm": 0.03804251551628113, "learning_rate": 8.788936360922224e-06, "loss": 0.4642, "num_input_tokens_seen": 58946912, "step": 48470 }, { "epoch": 6.07380027565468, "grad_norm": 0.06226867809891701, "learning_rate": 8.788579606572877e-06, "loss": 0.4598, "num_input_tokens_seen": 58952960, "step": 48475 }, { "epoch": 6.074426763563463, "grad_norm": 0.0619695819914341, "learning_rate": 8.788222806927875e-06, "loss": 0.4624, "num_input_tokens_seen": 58959008, "step": 48480 }, { "epoch": 6.075053251472246, "grad_norm": 0.07774296402931213, "learning_rate": 8.787865961991485e-06, "loss": 0.4549, "num_input_tokens_seen": 58965248, "step": 48485 }, { "epoch": 6.07567973938103, "grad_norm": 0.07510510832071304, "learning_rate": 8.787509071767973e-06, "loss": 0.4645, "num_input_tokens_seen": 58971200, "step": 48490 }, { "epoch": 6.076306227289813, "grad_norm": 0.03072834387421608, "learning_rate": 8.787152136261604e-06, "loss": 0.4578, "num_input_tokens_seen": 58977568, "step": 48495 }, { "epoch": 6.076932715198597, "grad_norm": 0.06752032041549683, "learning_rate": 8.786795155476648e-06, "loss": 0.4689, "num_input_tokens_seen": 58983456, "step": 48500 }, { "epoch": 6.07755920310738, "grad_norm": 0.07126155495643616, "learning_rate": 8.786438129417373e-06, "loss": 0.4583, "num_input_tokens_seen": 58989440, "step": 48505 }, { "epoch": 6.078185691016164, "grad_norm": 0.07187200337648392, "learning_rate": 8.786081058088046e-06, "loss": 0.4645, "num_input_tokens_seen": 58995392, "step": 48510 }, { "epoch": 6.078812178924947, "grad_norm": 0.07549721747636795, "learning_rate": 8.785723941492936e-06, "loss": 0.4567, "num_input_tokens_seen": 59001600, "step": 48515 }, { "epoch": 6.07943866683373, "grad_norm": 0.03439033776521683, "learning_rate": 8.785366779636314e-06, "loss": 0.4546, "num_input_tokens_seen": 59007008, "step": 48520 }, { "epoch": 6.080065154742513, "grad_norm": 0.060038045048713684, "learning_rate": 8.78500957252245e-06, "loss": 0.4562, "num_input_tokens_seen": 59012832, "step": 48525 }, { "epoch": 6.0806916426512965, "grad_norm": 0.08097045868635178, "learning_rate": 8.784652320155614e-06, "loss": 0.4588, "num_input_tokens_seen": 59019136, "step": 48530 }, { "epoch": 6.0813181305600805, "grad_norm": 0.12649190425872803, "learning_rate": 8.784295022540077e-06, "loss": 0.4595, "num_input_tokens_seen": 59025216, "step": 48535 }, { "epoch": 6.081944618468864, "grad_norm": 0.08742062747478485, "learning_rate": 8.783937679680113e-06, "loss": 0.468, "num_input_tokens_seen": 59031552, "step": 48540 }, { "epoch": 6.082571106377647, "grad_norm": 0.10618289560079575, "learning_rate": 8.783580291579991e-06, "loss": 0.4688, "num_input_tokens_seen": 59037824, "step": 48545 }, { "epoch": 6.08319759428643, "grad_norm": 0.08453397452831268, "learning_rate": 8.783222858243987e-06, "loss": 0.4618, "num_input_tokens_seen": 59043840, "step": 48550 }, { "epoch": 6.083824082195213, "grad_norm": 0.08193667978048325, "learning_rate": 8.78286537967637e-06, "loss": 0.4494, "num_input_tokens_seen": 59050368, "step": 48555 }, { "epoch": 6.084450570103997, "grad_norm": 0.10902416706085205, "learning_rate": 8.78250785588142e-06, "loss": 0.4673, "num_input_tokens_seen": 59056576, "step": 48560 }, { "epoch": 6.08507705801278, "grad_norm": 0.12289627641439438, "learning_rate": 8.782150286863408e-06, "loss": 0.4573, "num_input_tokens_seen": 59062656, "step": 48565 }, { "epoch": 6.085703545921564, "grad_norm": 0.13415847718715668, "learning_rate": 8.78179267262661e-06, "loss": 0.462, "num_input_tokens_seen": 59068416, "step": 48570 }, { "epoch": 6.086330033830347, "grad_norm": 0.07325845956802368, "learning_rate": 8.781435013175299e-06, "loss": 0.4608, "num_input_tokens_seen": 59074912, "step": 48575 }, { "epoch": 6.086956521739131, "grad_norm": 0.0777602270245552, "learning_rate": 8.781077308513755e-06, "loss": 0.4717, "num_input_tokens_seen": 59080736, "step": 48580 }, { "epoch": 6.087583009647914, "grad_norm": 0.07384835928678513, "learning_rate": 8.780719558646251e-06, "loss": 0.457, "num_input_tokens_seen": 59086432, "step": 48585 }, { "epoch": 6.088209497556697, "grad_norm": 0.11066412925720215, "learning_rate": 8.780361763577066e-06, "loss": 0.4589, "num_input_tokens_seen": 59092448, "step": 48590 }, { "epoch": 6.08883598546548, "grad_norm": 0.07949858158826828, "learning_rate": 8.780003923310479e-06, "loss": 0.4622, "num_input_tokens_seen": 59098976, "step": 48595 }, { "epoch": 6.0894624733742635, "grad_norm": 0.04505613073706627, "learning_rate": 8.779646037850767e-06, "loss": 0.4584, "num_input_tokens_seen": 59104832, "step": 48600 }, { "epoch": 6.0900889612830476, "grad_norm": 0.06984805315732956, "learning_rate": 8.779288107202209e-06, "loss": 0.4582, "num_input_tokens_seen": 59110912, "step": 48605 }, { "epoch": 6.090715449191831, "grad_norm": 0.0765155628323555, "learning_rate": 8.778930131369083e-06, "loss": 0.4644, "num_input_tokens_seen": 59117024, "step": 48610 }, { "epoch": 6.091341937100614, "grad_norm": 0.12419752776622772, "learning_rate": 8.778572110355672e-06, "loss": 0.4588, "num_input_tokens_seen": 59123104, "step": 48615 }, { "epoch": 6.091968425009397, "grad_norm": 0.07318724691867828, "learning_rate": 8.778214044166253e-06, "loss": 0.4668, "num_input_tokens_seen": 59129184, "step": 48620 }, { "epoch": 6.092594912918181, "grad_norm": 0.07913923263549805, "learning_rate": 8.777855932805109e-06, "loss": 0.4625, "num_input_tokens_seen": 59135040, "step": 48625 }, { "epoch": 6.093221400826964, "grad_norm": 0.06988262385129929, "learning_rate": 8.777497776276523e-06, "loss": 0.4702, "num_input_tokens_seen": 59141280, "step": 48630 }, { "epoch": 6.0938478887357475, "grad_norm": 0.08194047957658768, "learning_rate": 8.777139574584772e-06, "loss": 0.4606, "num_input_tokens_seen": 59147648, "step": 48635 }, { "epoch": 6.094474376644531, "grad_norm": 0.10514114797115326, "learning_rate": 8.776781327734142e-06, "loss": 0.4634, "num_input_tokens_seen": 59153664, "step": 48640 }, { "epoch": 6.095100864553314, "grad_norm": 0.03899412974715233, "learning_rate": 8.776423035728918e-06, "loss": 0.4642, "num_input_tokens_seen": 59159424, "step": 48645 }, { "epoch": 6.095727352462098, "grad_norm": 0.07459362596273422, "learning_rate": 8.776064698573381e-06, "loss": 0.4597, "num_input_tokens_seen": 59165824, "step": 48650 }, { "epoch": 6.096353840370881, "grad_norm": 0.06637131422758102, "learning_rate": 8.775706316271816e-06, "loss": 0.4567, "num_input_tokens_seen": 59171968, "step": 48655 }, { "epoch": 6.096980328279664, "grad_norm": 0.07937519997358322, "learning_rate": 8.775347888828506e-06, "loss": 0.4676, "num_input_tokens_seen": 59177952, "step": 48660 }, { "epoch": 6.097606816188447, "grad_norm": 0.07200323790311813, "learning_rate": 8.77498941624774e-06, "loss": 0.4633, "num_input_tokens_seen": 59184032, "step": 48665 }, { "epoch": 6.0982333040972305, "grad_norm": 0.06704331189393997, "learning_rate": 8.774630898533798e-06, "loss": 0.4537, "num_input_tokens_seen": 59190400, "step": 48670 }, { "epoch": 6.098859792006015, "grad_norm": 0.06727533787488937, "learning_rate": 8.774272335690975e-06, "loss": 0.4538, "num_input_tokens_seen": 59196256, "step": 48675 }, { "epoch": 6.099486279914798, "grad_norm": 0.12271151691675186, "learning_rate": 8.773913727723549e-06, "loss": 0.4719, "num_input_tokens_seen": 59202336, "step": 48680 }, { "epoch": 6.100112767823581, "grad_norm": 0.10577718168497086, "learning_rate": 8.773555074635814e-06, "loss": 0.4584, "num_input_tokens_seen": 59208352, "step": 48685 }, { "epoch": 6.100739255732364, "grad_norm": 0.11686503887176514, "learning_rate": 8.773196376432053e-06, "loss": 0.4614, "num_input_tokens_seen": 59214592, "step": 48690 }, { "epoch": 6.101365743641148, "grad_norm": 0.06547881662845612, "learning_rate": 8.772837633116558e-06, "loss": 0.4634, "num_input_tokens_seen": 59220672, "step": 48695 }, { "epoch": 6.101992231549931, "grad_norm": 0.10541960597038269, "learning_rate": 8.77247884469362e-06, "loss": 0.4532, "num_input_tokens_seen": 59226720, "step": 48700 }, { "epoch": 6.1026187194587145, "grad_norm": 0.08346717059612274, "learning_rate": 8.772120011167521e-06, "loss": 0.4705, "num_input_tokens_seen": 59231840, "step": 48705 }, { "epoch": 6.103245207367498, "grad_norm": 0.07746438682079315, "learning_rate": 8.771761132542558e-06, "loss": 0.4553, "num_input_tokens_seen": 59237888, "step": 48710 }, { "epoch": 6.103871695276281, "grad_norm": 0.09809912741184235, "learning_rate": 8.771402208823021e-06, "loss": 0.4589, "num_input_tokens_seen": 59244192, "step": 48715 }, { "epoch": 6.104498183185065, "grad_norm": 0.0769784078001976, "learning_rate": 8.771043240013196e-06, "loss": 0.4546, "num_input_tokens_seen": 59250272, "step": 48720 }, { "epoch": 6.105124671093848, "grad_norm": 0.038963548839092255, "learning_rate": 8.770684226117383e-06, "loss": 0.4742, "num_input_tokens_seen": 59256256, "step": 48725 }, { "epoch": 6.105751159002631, "grad_norm": 0.1370076686143875, "learning_rate": 8.770325167139866e-06, "loss": 0.4691, "num_input_tokens_seen": 59262496, "step": 48730 }, { "epoch": 6.106377646911414, "grad_norm": 0.09767971932888031, "learning_rate": 8.769966063084945e-06, "loss": 0.4628, "num_input_tokens_seen": 59268416, "step": 48735 }, { "epoch": 6.1070041348201975, "grad_norm": 0.07036543637514114, "learning_rate": 8.769606913956907e-06, "loss": 0.4548, "num_input_tokens_seen": 59274336, "step": 48740 }, { "epoch": 6.107630622728982, "grad_norm": 0.0474458672106266, "learning_rate": 8.769247719760051e-06, "loss": 0.4647, "num_input_tokens_seen": 59280576, "step": 48745 }, { "epoch": 6.108257110637765, "grad_norm": 0.0748894214630127, "learning_rate": 8.76888848049867e-06, "loss": 0.4666, "num_input_tokens_seen": 59286592, "step": 48750 }, { "epoch": 6.108883598546548, "grad_norm": 0.07859187573194504, "learning_rate": 8.768529196177057e-06, "loss": 0.4503, "num_input_tokens_seen": 59292672, "step": 48755 }, { "epoch": 6.109510086455331, "grad_norm": 0.11247751116752625, "learning_rate": 8.76816986679951e-06, "loss": 0.4598, "num_input_tokens_seen": 59298400, "step": 48760 }, { "epoch": 6.110136574364115, "grad_norm": 0.07585085928440094, "learning_rate": 8.767810492370323e-06, "loss": 0.4633, "num_input_tokens_seen": 59303808, "step": 48765 }, { "epoch": 6.110763062272898, "grad_norm": 0.07903338968753815, "learning_rate": 8.767451072893795e-06, "loss": 0.466, "num_input_tokens_seen": 59309856, "step": 48770 }, { "epoch": 6.1113895501816815, "grad_norm": 0.07031498849391937, "learning_rate": 8.767091608374221e-06, "loss": 0.4623, "num_input_tokens_seen": 59316096, "step": 48775 }, { "epoch": 6.112016038090465, "grad_norm": 0.07689525932073593, "learning_rate": 8.7667320988159e-06, "loss": 0.4628, "num_input_tokens_seen": 59321984, "step": 48780 }, { "epoch": 6.112642525999248, "grad_norm": 0.09137182682752609, "learning_rate": 8.76637254422313e-06, "loss": 0.4619, "num_input_tokens_seen": 59328192, "step": 48785 }, { "epoch": 6.113269013908032, "grad_norm": 0.12934990227222443, "learning_rate": 8.766012944600211e-06, "loss": 0.4696, "num_input_tokens_seen": 59333568, "step": 48790 }, { "epoch": 6.113895501816815, "grad_norm": 0.07996806502342224, "learning_rate": 8.765653299951439e-06, "loss": 0.474, "num_input_tokens_seen": 59339712, "step": 48795 }, { "epoch": 6.114521989725598, "grad_norm": 0.06585714221000671, "learning_rate": 8.765293610281116e-06, "loss": 0.4632, "num_input_tokens_seen": 59345728, "step": 48800 }, { "epoch": 6.115148477634381, "grad_norm": 0.07512833178043365, "learning_rate": 8.764933875593541e-06, "loss": 0.4581, "num_input_tokens_seen": 59352064, "step": 48805 }, { "epoch": 6.1157749655431655, "grad_norm": 0.08478237688541412, "learning_rate": 8.76457409589302e-06, "loss": 0.464, "num_input_tokens_seen": 59357568, "step": 48810 }, { "epoch": 6.116401453451949, "grad_norm": 0.08073391020298004, "learning_rate": 8.764214271183849e-06, "loss": 0.4617, "num_input_tokens_seen": 59363776, "step": 48815 }, { "epoch": 6.117027941360732, "grad_norm": 0.10912668704986572, "learning_rate": 8.763854401470329e-06, "loss": 0.4545, "num_input_tokens_seen": 59370112, "step": 48820 }, { "epoch": 6.117654429269515, "grad_norm": 0.15160885453224182, "learning_rate": 8.763494486756767e-06, "loss": 0.473, "num_input_tokens_seen": 59376064, "step": 48825 }, { "epoch": 6.118280917178298, "grad_norm": 0.11556656658649445, "learning_rate": 8.763134527047465e-06, "loss": 0.4637, "num_input_tokens_seen": 59382496, "step": 48830 }, { "epoch": 6.118907405087082, "grad_norm": 0.07216867059469223, "learning_rate": 8.762774522346724e-06, "loss": 0.464, "num_input_tokens_seen": 59388544, "step": 48835 }, { "epoch": 6.119533892995865, "grad_norm": 0.08442209661006927, "learning_rate": 8.762414472658853e-06, "loss": 0.4597, "num_input_tokens_seen": 59394624, "step": 48840 }, { "epoch": 6.1201603809046485, "grad_norm": 0.09297407418489456, "learning_rate": 8.762054377988152e-06, "loss": 0.4664, "num_input_tokens_seen": 59400544, "step": 48845 }, { "epoch": 6.120786868813432, "grad_norm": 0.10865774750709534, "learning_rate": 8.761694238338927e-06, "loss": 0.4592, "num_input_tokens_seen": 59406912, "step": 48850 }, { "epoch": 6.121413356722215, "grad_norm": 0.044692911207675934, "learning_rate": 8.761334053715485e-06, "loss": 0.4607, "num_input_tokens_seen": 59413216, "step": 48855 }, { "epoch": 6.122039844630999, "grad_norm": 0.07972816377878189, "learning_rate": 8.760973824122133e-06, "loss": 0.4629, "num_input_tokens_seen": 59419296, "step": 48860 }, { "epoch": 6.122666332539782, "grad_norm": 0.07064691931009293, "learning_rate": 8.760613549563176e-06, "loss": 0.456, "num_input_tokens_seen": 59425280, "step": 48865 }, { "epoch": 6.123292820448565, "grad_norm": 0.10073813796043396, "learning_rate": 8.760253230042923e-06, "loss": 0.4676, "num_input_tokens_seen": 59431360, "step": 48870 }, { "epoch": 6.123919308357348, "grad_norm": 0.09124568104743958, "learning_rate": 8.759892865565681e-06, "loss": 0.4702, "num_input_tokens_seen": 59437632, "step": 48875 }, { "epoch": 6.1245457962661325, "grad_norm": 0.071813203394413, "learning_rate": 8.759532456135759e-06, "loss": 0.462, "num_input_tokens_seen": 59443744, "step": 48880 }, { "epoch": 6.125172284174916, "grad_norm": 0.04466193914413452, "learning_rate": 8.759172001757465e-06, "loss": 0.4625, "num_input_tokens_seen": 59450016, "step": 48885 }, { "epoch": 6.125798772083699, "grad_norm": 0.09968889504671097, "learning_rate": 8.75881150243511e-06, "loss": 0.4646, "num_input_tokens_seen": 59455968, "step": 48890 }, { "epoch": 6.126425259992482, "grad_norm": 0.08073059469461441, "learning_rate": 8.758450958173002e-06, "loss": 0.4615, "num_input_tokens_seen": 59461504, "step": 48895 }, { "epoch": 6.127051747901265, "grad_norm": 0.04387243464589119, "learning_rate": 8.758090368975454e-06, "loss": 0.4597, "num_input_tokens_seen": 59467136, "step": 48900 }, { "epoch": 6.127678235810049, "grad_norm": 0.08596636354923248, "learning_rate": 8.757729734846774e-06, "loss": 0.4619, "num_input_tokens_seen": 59473344, "step": 48905 }, { "epoch": 6.128304723718832, "grad_norm": 0.061782971024513245, "learning_rate": 8.757369055791279e-06, "loss": 0.4586, "num_input_tokens_seen": 59479360, "step": 48910 }, { "epoch": 6.128931211627616, "grad_norm": 0.06891444325447083, "learning_rate": 8.757008331813276e-06, "loss": 0.4628, "num_input_tokens_seen": 59485504, "step": 48915 }, { "epoch": 6.129557699536399, "grad_norm": 0.0757984146475792, "learning_rate": 8.75664756291708e-06, "loss": 0.4655, "num_input_tokens_seen": 59491392, "step": 48920 }, { "epoch": 6.130184187445182, "grad_norm": 0.08710361272096634, "learning_rate": 8.756286749107004e-06, "loss": 0.4638, "num_input_tokens_seen": 59497856, "step": 48925 }, { "epoch": 6.130810675353966, "grad_norm": 0.04219827055931091, "learning_rate": 8.75592589038736e-06, "loss": 0.4666, "num_input_tokens_seen": 59504000, "step": 48930 }, { "epoch": 6.131437163262749, "grad_norm": 0.06798085570335388, "learning_rate": 8.755564986762466e-06, "loss": 0.453, "num_input_tokens_seen": 59510112, "step": 48935 }, { "epoch": 6.132063651171532, "grad_norm": 0.06290851533412933, "learning_rate": 8.755204038236636e-06, "loss": 0.4635, "num_input_tokens_seen": 59516192, "step": 48940 }, { "epoch": 6.1326901390803155, "grad_norm": 0.0678231492638588, "learning_rate": 8.754843044814183e-06, "loss": 0.4608, "num_input_tokens_seen": 59522464, "step": 48945 }, { "epoch": 6.1333166269890995, "grad_norm": 0.035471655428409576, "learning_rate": 8.754482006499426e-06, "loss": 0.4686, "num_input_tokens_seen": 59528608, "step": 48950 }, { "epoch": 6.133943114897883, "grad_norm": 0.06836136430501938, "learning_rate": 8.754120923296678e-06, "loss": 0.459, "num_input_tokens_seen": 59534848, "step": 48955 }, { "epoch": 6.134569602806666, "grad_norm": 0.07273097336292267, "learning_rate": 8.753759795210259e-06, "loss": 0.4696, "num_input_tokens_seen": 59541152, "step": 48960 }, { "epoch": 6.135196090715449, "grad_norm": 0.07280130684375763, "learning_rate": 8.753398622244485e-06, "loss": 0.4639, "num_input_tokens_seen": 59546944, "step": 48965 }, { "epoch": 6.135822578624232, "grad_norm": 0.05834953486919403, "learning_rate": 8.753037404403675e-06, "loss": 0.4629, "num_input_tokens_seen": 59553120, "step": 48970 }, { "epoch": 6.136449066533016, "grad_norm": 0.0803542286157608, "learning_rate": 8.752676141692149e-06, "loss": 0.4635, "num_input_tokens_seen": 59559136, "step": 48975 }, { "epoch": 6.137075554441799, "grad_norm": 0.06908255070447922, "learning_rate": 8.752314834114224e-06, "loss": 0.4617, "num_input_tokens_seen": 59565376, "step": 48980 }, { "epoch": 6.137702042350583, "grad_norm": 0.060355812311172485, "learning_rate": 8.751953481674218e-06, "loss": 0.4692, "num_input_tokens_seen": 59571648, "step": 48985 }, { "epoch": 6.138328530259366, "grad_norm": 0.06324782967567444, "learning_rate": 8.751592084376456e-06, "loss": 0.4666, "num_input_tokens_seen": 59577728, "step": 48990 }, { "epoch": 6.138955018168149, "grad_norm": 0.070401631295681, "learning_rate": 8.751230642225257e-06, "loss": 0.4574, "num_input_tokens_seen": 59583104, "step": 48995 }, { "epoch": 6.139581506076933, "grad_norm": 0.0932990089058876, "learning_rate": 8.750869155224938e-06, "loss": 0.4579, "num_input_tokens_seen": 59589088, "step": 49000 }, { "epoch": 6.140207993985716, "grad_norm": 0.07455338537693024, "learning_rate": 8.75050762337983e-06, "loss": 0.4568, "num_input_tokens_seen": 59595392, "step": 49005 }, { "epoch": 6.140834481894499, "grad_norm": 0.06875770539045334, "learning_rate": 8.750146046694248e-06, "loss": 0.4614, "num_input_tokens_seen": 59601600, "step": 49010 }, { "epoch": 6.1414609698032825, "grad_norm": 0.03562849014997482, "learning_rate": 8.749784425172516e-06, "loss": 0.4706, "num_input_tokens_seen": 59608032, "step": 49015 }, { "epoch": 6.1420874577120665, "grad_norm": 0.05773882940411568, "learning_rate": 8.74942275881896e-06, "loss": 0.4699, "num_input_tokens_seen": 59613888, "step": 49020 }, { "epoch": 6.14271394562085, "grad_norm": 0.03692129626870155, "learning_rate": 8.749061047637901e-06, "loss": 0.4591, "num_input_tokens_seen": 59619904, "step": 49025 }, { "epoch": 6.143340433529633, "grad_norm": 0.09408347308635712, "learning_rate": 8.748699291633666e-06, "loss": 0.4686, "num_input_tokens_seen": 59625664, "step": 49030 }, { "epoch": 6.143966921438416, "grad_norm": 0.06457550823688507, "learning_rate": 8.748337490810578e-06, "loss": 0.458, "num_input_tokens_seen": 59632096, "step": 49035 }, { "epoch": 6.144593409347199, "grad_norm": 0.03492194786667824, "learning_rate": 8.747975645172966e-06, "loss": 0.4651, "num_input_tokens_seen": 59638272, "step": 49040 }, { "epoch": 6.145219897255983, "grad_norm": 0.06094770506024361, "learning_rate": 8.747613754725153e-06, "loss": 0.4612, "num_input_tokens_seen": 59644224, "step": 49045 }, { "epoch": 6.1458463851647664, "grad_norm": 0.06461239606142044, "learning_rate": 8.747251819471467e-06, "loss": 0.4607, "num_input_tokens_seen": 59650240, "step": 49050 }, { "epoch": 6.14647287307355, "grad_norm": 0.0637841522693634, "learning_rate": 8.746889839416235e-06, "loss": 0.4639, "num_input_tokens_seen": 59656576, "step": 49055 }, { "epoch": 6.147099360982333, "grad_norm": 0.10209976136684418, "learning_rate": 8.746527814563785e-06, "loss": 0.4634, "num_input_tokens_seen": 59662624, "step": 49060 }, { "epoch": 6.147725848891117, "grad_norm": 0.07072695344686508, "learning_rate": 8.746165744918445e-06, "loss": 0.4666, "num_input_tokens_seen": 59668704, "step": 49065 }, { "epoch": 6.1483523367999, "grad_norm": 0.09249704331159592, "learning_rate": 8.745803630484543e-06, "loss": 0.4688, "num_input_tokens_seen": 59674784, "step": 49070 }, { "epoch": 6.148978824708683, "grad_norm": 0.07082493603229523, "learning_rate": 8.74544147126641e-06, "loss": 0.4624, "num_input_tokens_seen": 59681184, "step": 49075 }, { "epoch": 6.149605312617466, "grad_norm": 0.06019405648112297, "learning_rate": 8.745079267268375e-06, "loss": 0.4562, "num_input_tokens_seen": 59687648, "step": 49080 }, { "epoch": 6.1502318005262495, "grad_norm": 0.07487014681100845, "learning_rate": 8.744717018494769e-06, "loss": 0.4596, "num_input_tokens_seen": 59693472, "step": 49085 }, { "epoch": 6.150858288435034, "grad_norm": 0.07551921904087067, "learning_rate": 8.744354724949922e-06, "loss": 0.4555, "num_input_tokens_seen": 59699776, "step": 49090 }, { "epoch": 6.151484776343817, "grad_norm": 0.07411651313304901, "learning_rate": 8.743992386638166e-06, "loss": 0.4626, "num_input_tokens_seen": 59705824, "step": 49095 }, { "epoch": 6.1521112642526, "grad_norm": 0.07415346056222916, "learning_rate": 8.743630003563834e-06, "loss": 0.4699, "num_input_tokens_seen": 59712000, "step": 49100 }, { "epoch": 6.152737752161383, "grad_norm": 0.05394013971090317, "learning_rate": 8.743267575731258e-06, "loss": 0.457, "num_input_tokens_seen": 59718208, "step": 49105 }, { "epoch": 6.153364240070166, "grad_norm": 0.06063098460435867, "learning_rate": 8.74290510314477e-06, "loss": 0.458, "num_input_tokens_seen": 59723776, "step": 49110 }, { "epoch": 6.15399072797895, "grad_norm": 0.08191291987895966, "learning_rate": 8.742542585808706e-06, "loss": 0.4693, "num_input_tokens_seen": 59729888, "step": 49115 }, { "epoch": 6.1546172158877335, "grad_norm": 0.09426039457321167, "learning_rate": 8.742180023727398e-06, "loss": 0.4593, "num_input_tokens_seen": 59736192, "step": 49120 }, { "epoch": 6.155243703796517, "grad_norm": 0.05655810236930847, "learning_rate": 8.741817416905181e-06, "loss": 0.4578, "num_input_tokens_seen": 59742528, "step": 49125 }, { "epoch": 6.1558701917053, "grad_norm": 0.0855836495757103, "learning_rate": 8.741454765346393e-06, "loss": 0.4677, "num_input_tokens_seen": 59748864, "step": 49130 }, { "epoch": 6.156496679614084, "grad_norm": 0.06166858598589897, "learning_rate": 8.741092069055365e-06, "loss": 0.4639, "num_input_tokens_seen": 59754752, "step": 49135 }, { "epoch": 6.157123167522867, "grad_norm": 0.10197267681360245, "learning_rate": 8.740729328036436e-06, "loss": 0.4619, "num_input_tokens_seen": 59760832, "step": 49140 }, { "epoch": 6.15774965543165, "grad_norm": 0.13151313364505768, "learning_rate": 8.740366542293946e-06, "loss": 0.4554, "num_input_tokens_seen": 59766720, "step": 49145 }, { "epoch": 6.158376143340433, "grad_norm": 0.06730270385742188, "learning_rate": 8.740003711832227e-06, "loss": 0.4564, "num_input_tokens_seen": 59772800, "step": 49150 }, { "epoch": 6.1590026312492165, "grad_norm": 0.09114844352006912, "learning_rate": 8.73964083665562e-06, "loss": 0.4502, "num_input_tokens_seen": 59778784, "step": 49155 }, { "epoch": 6.159629119158001, "grad_norm": 0.06390602886676788, "learning_rate": 8.73927791676846e-06, "loss": 0.4609, "num_input_tokens_seen": 59784896, "step": 49160 }, { "epoch": 6.160255607066784, "grad_norm": 0.11257723718881607, "learning_rate": 8.738914952175091e-06, "loss": 0.4692, "num_input_tokens_seen": 59791328, "step": 49165 }, { "epoch": 6.160882094975567, "grad_norm": 0.06662735342979431, "learning_rate": 8.73855194287985e-06, "loss": 0.4585, "num_input_tokens_seen": 59797280, "step": 49170 }, { "epoch": 6.16150858288435, "grad_norm": 0.0649275928735733, "learning_rate": 8.738188888887077e-06, "loss": 0.4641, "num_input_tokens_seen": 59803584, "step": 49175 }, { "epoch": 6.162135070793133, "grad_norm": 0.06819494068622589, "learning_rate": 8.737825790201114e-06, "loss": 0.46, "num_input_tokens_seen": 59809216, "step": 49180 }, { "epoch": 6.162761558701917, "grad_norm": 0.062261492013931274, "learning_rate": 8.7374626468263e-06, "loss": 0.4547, "num_input_tokens_seen": 59815200, "step": 49185 }, { "epoch": 6.1633880466107005, "grad_norm": 0.09020645916461945, "learning_rate": 8.737099458766978e-06, "loss": 0.4569, "num_input_tokens_seen": 59821632, "step": 49190 }, { "epoch": 6.164014534519484, "grad_norm": 0.08348352462053299, "learning_rate": 8.736736226027492e-06, "loss": 0.467, "num_input_tokens_seen": 59827712, "step": 49195 }, { "epoch": 6.164641022428267, "grad_norm": 0.06401233375072479, "learning_rate": 8.736372948612179e-06, "loss": 0.4649, "num_input_tokens_seen": 59833216, "step": 49200 }, { "epoch": 6.165267510337051, "grad_norm": 0.04280390217900276, "learning_rate": 8.736009626525389e-06, "loss": 0.4683, "num_input_tokens_seen": 59839136, "step": 49205 }, { "epoch": 6.165893998245834, "grad_norm": 0.13055714964866638, "learning_rate": 8.735646259771462e-06, "loss": 0.4622, "num_input_tokens_seen": 59845504, "step": 49210 }, { "epoch": 6.166520486154617, "grad_norm": 0.0778181403875351, "learning_rate": 8.735282848354742e-06, "loss": 0.4619, "num_input_tokens_seen": 59851584, "step": 49215 }, { "epoch": 6.1671469740634, "grad_norm": 0.06350205093622208, "learning_rate": 8.734919392279578e-06, "loss": 0.465, "num_input_tokens_seen": 59857984, "step": 49220 }, { "epoch": 6.167773461972184, "grad_norm": 0.0899813175201416, "learning_rate": 8.73455589155031e-06, "loss": 0.4596, "num_input_tokens_seen": 59864032, "step": 49225 }, { "epoch": 6.168399949880968, "grad_norm": 0.03793846443295479, "learning_rate": 8.734192346171286e-06, "loss": 0.4612, "num_input_tokens_seen": 59870240, "step": 49230 }, { "epoch": 6.169026437789751, "grad_norm": 0.06605513393878937, "learning_rate": 8.733828756146853e-06, "loss": 0.4593, "num_input_tokens_seen": 59876352, "step": 49235 }, { "epoch": 6.169652925698534, "grad_norm": 0.06832335889339447, "learning_rate": 8.73346512148136e-06, "loss": 0.469, "num_input_tokens_seen": 59882624, "step": 49240 }, { "epoch": 6.170279413607317, "grad_norm": 0.13827988505363464, "learning_rate": 8.733101442179153e-06, "loss": 0.4695, "num_input_tokens_seen": 59888768, "step": 49245 }, { "epoch": 6.1709059015161, "grad_norm": 0.07214175164699554, "learning_rate": 8.732737718244577e-06, "loss": 0.4581, "num_input_tokens_seen": 59894304, "step": 49250 }, { "epoch": 6.171532389424884, "grad_norm": 0.0716666504740715, "learning_rate": 8.732373949681984e-06, "loss": 0.4591, "num_input_tokens_seen": 59900512, "step": 49255 }, { "epoch": 6.1721588773336675, "grad_norm": 0.12328439205884933, "learning_rate": 8.732010136495723e-06, "loss": 0.465, "num_input_tokens_seen": 59906880, "step": 49260 }, { "epoch": 6.172785365242451, "grad_norm": 0.11303269118070602, "learning_rate": 8.731646278690144e-06, "loss": 0.4712, "num_input_tokens_seen": 59913024, "step": 49265 }, { "epoch": 6.173411853151234, "grad_norm": 0.07276066392660141, "learning_rate": 8.731282376269594e-06, "loss": 0.4667, "num_input_tokens_seen": 59918720, "step": 49270 }, { "epoch": 6.174038341060018, "grad_norm": 0.11265042424201965, "learning_rate": 8.730918429238429e-06, "loss": 0.4568, "num_input_tokens_seen": 59924672, "step": 49275 }, { "epoch": 6.174664828968801, "grad_norm": 0.09175149351358414, "learning_rate": 8.730554437600995e-06, "loss": 0.4646, "num_input_tokens_seen": 59930528, "step": 49280 }, { "epoch": 6.175291316877584, "grad_norm": 0.06818859279155731, "learning_rate": 8.730190401361647e-06, "loss": 0.4587, "num_input_tokens_seen": 59936704, "step": 49285 }, { "epoch": 6.175917804786367, "grad_norm": 0.11482129245996475, "learning_rate": 8.729826320524737e-06, "loss": 0.4653, "num_input_tokens_seen": 59942752, "step": 49290 }, { "epoch": 6.176544292695151, "grad_norm": 0.11299986392259598, "learning_rate": 8.729462195094616e-06, "loss": 0.4641, "num_input_tokens_seen": 59948992, "step": 49295 }, { "epoch": 6.177170780603935, "grad_norm": 0.09770261496305466, "learning_rate": 8.72909802507564e-06, "loss": 0.4591, "num_input_tokens_seen": 59954848, "step": 49300 }, { "epoch": 6.177797268512718, "grad_norm": 0.08547089993953705, "learning_rate": 8.728733810472163e-06, "loss": 0.471, "num_input_tokens_seen": 59960736, "step": 49305 }, { "epoch": 6.178423756421501, "grad_norm": 0.03963779658079147, "learning_rate": 8.728369551288537e-06, "loss": 0.4603, "num_input_tokens_seen": 59966912, "step": 49310 }, { "epoch": 6.179050244330284, "grad_norm": 0.07464947551488876, "learning_rate": 8.72800524752912e-06, "loss": 0.4622, "num_input_tokens_seen": 59972704, "step": 49315 }, { "epoch": 6.179676732239068, "grad_norm": 0.067036472260952, "learning_rate": 8.727640899198264e-06, "loss": 0.4703, "num_input_tokens_seen": 59979040, "step": 49320 }, { "epoch": 6.180303220147851, "grad_norm": 0.07184381783008575, "learning_rate": 8.727276506300327e-06, "loss": 0.4652, "num_input_tokens_seen": 59985184, "step": 49325 }, { "epoch": 6.1809297080566346, "grad_norm": 0.0945887640118599, "learning_rate": 8.726912068839666e-06, "loss": 0.4635, "num_input_tokens_seen": 59991296, "step": 49330 }, { "epoch": 6.181556195965418, "grad_norm": 0.0911983773112297, "learning_rate": 8.726547586820637e-06, "loss": 0.4716, "num_input_tokens_seen": 59997568, "step": 49335 }, { "epoch": 6.182182683874201, "grad_norm": 0.11976571381092072, "learning_rate": 8.7261830602476e-06, "loss": 0.4691, "num_input_tokens_seen": 60004160, "step": 49340 }, { "epoch": 6.182809171782985, "grad_norm": 0.08331679552793503, "learning_rate": 8.725818489124913e-06, "loss": 0.4572, "num_input_tokens_seen": 60010176, "step": 49345 }, { "epoch": 6.183435659691768, "grad_norm": 0.070868581533432, "learning_rate": 8.72545387345693e-06, "loss": 0.4637, "num_input_tokens_seen": 60016448, "step": 49350 }, { "epoch": 6.184062147600551, "grad_norm": 0.1253240704536438, "learning_rate": 8.725089213248017e-06, "loss": 0.4557, "num_input_tokens_seen": 60022336, "step": 49355 }, { "epoch": 6.1846886355093345, "grad_norm": 0.07120369374752045, "learning_rate": 8.724724508502529e-06, "loss": 0.458, "num_input_tokens_seen": 60028576, "step": 49360 }, { "epoch": 6.185315123418118, "grad_norm": 0.0752134844660759, "learning_rate": 8.724359759224826e-06, "loss": 0.4652, "num_input_tokens_seen": 60034592, "step": 49365 }, { "epoch": 6.185941611326902, "grad_norm": 0.06692813336849213, "learning_rate": 8.723994965419273e-06, "loss": 0.4637, "num_input_tokens_seen": 60041184, "step": 49370 }, { "epoch": 6.186568099235685, "grad_norm": 0.08781791478395462, "learning_rate": 8.72363012709023e-06, "loss": 0.4656, "num_input_tokens_seen": 60047168, "step": 49375 }, { "epoch": 6.187194587144468, "grad_norm": 0.07912282645702362, "learning_rate": 8.723265244242056e-06, "loss": 0.4619, "num_input_tokens_seen": 60053536, "step": 49380 }, { "epoch": 6.187821075053251, "grad_norm": 0.0669960230588913, "learning_rate": 8.722900316879118e-06, "loss": 0.4651, "num_input_tokens_seen": 60059712, "step": 49385 }, { "epoch": 6.188447562962035, "grad_norm": 0.10832463949918747, "learning_rate": 8.722535345005775e-06, "loss": 0.4644, "num_input_tokens_seen": 60065856, "step": 49390 }, { "epoch": 6.189074050870818, "grad_norm": 0.07755065709352493, "learning_rate": 8.722170328626392e-06, "loss": 0.4664, "num_input_tokens_seen": 60071488, "step": 49395 }, { "epoch": 6.189700538779602, "grad_norm": 0.1044955626130104, "learning_rate": 8.721805267745335e-06, "loss": 0.4692, "num_input_tokens_seen": 60077984, "step": 49400 }, { "epoch": 6.190327026688385, "grad_norm": 0.06654804199934006, "learning_rate": 8.721440162366965e-06, "loss": 0.4611, "num_input_tokens_seen": 60084096, "step": 49405 }, { "epoch": 6.190953514597168, "grad_norm": 0.11075975745916367, "learning_rate": 8.72107501249565e-06, "loss": 0.4618, "num_input_tokens_seen": 60090272, "step": 49410 }, { "epoch": 6.191580002505952, "grad_norm": 0.07675778865814209, "learning_rate": 8.720709818135755e-06, "loss": 0.4628, "num_input_tokens_seen": 60096768, "step": 49415 }, { "epoch": 6.192206490414735, "grad_norm": 0.09503379464149475, "learning_rate": 8.720344579291646e-06, "loss": 0.4644, "num_input_tokens_seen": 60102304, "step": 49420 }, { "epoch": 6.192832978323518, "grad_norm": 0.05849899724125862, "learning_rate": 8.719979295967688e-06, "loss": 0.4602, "num_input_tokens_seen": 60108224, "step": 49425 }, { "epoch": 6.1934594662323015, "grad_norm": 0.07101180404424667, "learning_rate": 8.719613968168253e-06, "loss": 0.4565, "num_input_tokens_seen": 60114336, "step": 49430 }, { "epoch": 6.194085954141085, "grad_norm": 0.0672953799366951, "learning_rate": 8.719248595897703e-06, "loss": 0.4599, "num_input_tokens_seen": 60120448, "step": 49435 }, { "epoch": 6.194712442049869, "grad_norm": 0.0745781660079956, "learning_rate": 8.718883179160408e-06, "loss": 0.4589, "num_input_tokens_seen": 60126464, "step": 49440 }, { "epoch": 6.195338929958652, "grad_norm": 0.0698736384510994, "learning_rate": 8.718517717960741e-06, "loss": 0.4574, "num_input_tokens_seen": 60132640, "step": 49445 }, { "epoch": 6.195965417867435, "grad_norm": 0.036983832716941833, "learning_rate": 8.718152212303068e-06, "loss": 0.4597, "num_input_tokens_seen": 60138784, "step": 49450 }, { "epoch": 6.196591905776218, "grad_norm": 0.09183084964752197, "learning_rate": 8.717786662191759e-06, "loss": 0.4587, "num_input_tokens_seen": 60144832, "step": 49455 }, { "epoch": 6.197218393685002, "grad_norm": 0.07280012220144272, "learning_rate": 8.717421067631183e-06, "loss": 0.4586, "num_input_tokens_seen": 60151040, "step": 49460 }, { "epoch": 6.197844881593785, "grad_norm": 0.08795876801013947, "learning_rate": 8.717055428625714e-06, "loss": 0.4614, "num_input_tokens_seen": 60157248, "step": 49465 }, { "epoch": 6.198471369502569, "grad_norm": 0.1611880511045456, "learning_rate": 8.716689745179723e-06, "loss": 0.4611, "num_input_tokens_seen": 60163616, "step": 49470 }, { "epoch": 6.199097857411352, "grad_norm": 0.08533323556184769, "learning_rate": 8.716324017297579e-06, "loss": 0.4608, "num_input_tokens_seen": 60169952, "step": 49475 }, { "epoch": 6.199724345320135, "grad_norm": 0.11218975484371185, "learning_rate": 8.71595824498366e-06, "loss": 0.4684, "num_input_tokens_seen": 60176000, "step": 49480 }, { "epoch": 6.200350833228919, "grad_norm": 0.10366453230381012, "learning_rate": 8.715592428242333e-06, "loss": 0.4573, "num_input_tokens_seen": 60182048, "step": 49485 }, { "epoch": 6.200977321137702, "grad_norm": 0.09141377359628677, "learning_rate": 8.715226567077976e-06, "loss": 0.4617, "num_input_tokens_seen": 60188256, "step": 49490 }, { "epoch": 6.201603809046485, "grad_norm": 0.0936279445886612, "learning_rate": 8.714860661494962e-06, "loss": 0.469, "num_input_tokens_seen": 60194080, "step": 49495 }, { "epoch": 6.2022302969552685, "grad_norm": 0.07070533186197281, "learning_rate": 8.714494711497665e-06, "loss": 0.463, "num_input_tokens_seen": 60200096, "step": 49500 }, { "epoch": 6.202856784864052, "grad_norm": 0.09179525077342987, "learning_rate": 8.71412871709046e-06, "loss": 0.4586, "num_input_tokens_seen": 60206176, "step": 49505 }, { "epoch": 6.203483272772836, "grad_norm": 0.09989675879478455, "learning_rate": 8.713762678277725e-06, "loss": 0.4583, "num_input_tokens_seen": 60212224, "step": 49510 }, { "epoch": 6.204109760681619, "grad_norm": 0.07981009036302567, "learning_rate": 8.713396595063835e-06, "loss": 0.4572, "num_input_tokens_seen": 60218304, "step": 49515 }, { "epoch": 6.204736248590402, "grad_norm": 0.07077671587467194, "learning_rate": 8.713030467453165e-06, "loss": 0.4676, "num_input_tokens_seen": 60224128, "step": 49520 }, { "epoch": 6.205362736499185, "grad_norm": 0.0800023227930069, "learning_rate": 8.712664295450095e-06, "loss": 0.4692, "num_input_tokens_seen": 60230240, "step": 49525 }, { "epoch": 6.205989224407969, "grad_norm": 0.08009675145149231, "learning_rate": 8.712298079059002e-06, "loss": 0.4519, "num_input_tokens_seen": 60236128, "step": 49530 }, { "epoch": 6.2066157123167525, "grad_norm": 0.1129087507724762, "learning_rate": 8.711931818284265e-06, "loss": 0.4678, "num_input_tokens_seen": 60242240, "step": 49535 }, { "epoch": 6.207242200225536, "grad_norm": 0.06601674854755402, "learning_rate": 8.71156551313026e-06, "loss": 0.4618, "num_input_tokens_seen": 60248288, "step": 49540 }, { "epoch": 6.207868688134319, "grad_norm": 0.07214353233575821, "learning_rate": 8.711199163601369e-06, "loss": 0.4656, "num_input_tokens_seen": 60254432, "step": 49545 }, { "epoch": 6.208495176043102, "grad_norm": 0.06839968264102936, "learning_rate": 8.710832769701974e-06, "loss": 0.4623, "num_input_tokens_seen": 60260352, "step": 49550 }, { "epoch": 6.209121663951886, "grad_norm": 0.07141134887933731, "learning_rate": 8.710466331436453e-06, "loss": 0.4588, "num_input_tokens_seen": 60266848, "step": 49555 }, { "epoch": 6.209748151860669, "grad_norm": 0.11880642175674438, "learning_rate": 8.710099848809186e-06, "loss": 0.4557, "num_input_tokens_seen": 60273024, "step": 49560 }, { "epoch": 6.210374639769452, "grad_norm": 0.08928796648979187, "learning_rate": 8.709733321824557e-06, "loss": 0.465, "num_input_tokens_seen": 60279264, "step": 49565 }, { "epoch": 6.2110011276782355, "grad_norm": 0.06542983651161194, "learning_rate": 8.709366750486946e-06, "loss": 0.4608, "num_input_tokens_seen": 60285248, "step": 49570 }, { "epoch": 6.21162761558702, "grad_norm": 0.0693608745932579, "learning_rate": 8.709000134800737e-06, "loss": 0.4639, "num_input_tokens_seen": 60291168, "step": 49575 }, { "epoch": 6.212254103495803, "grad_norm": 0.10276630520820618, "learning_rate": 8.708633474770316e-06, "loss": 0.4643, "num_input_tokens_seen": 60296960, "step": 49580 }, { "epoch": 6.212880591404586, "grad_norm": 0.07700680196285248, "learning_rate": 8.708266770400062e-06, "loss": 0.4646, "num_input_tokens_seen": 60302720, "step": 49585 }, { "epoch": 6.213507079313369, "grad_norm": 0.0814255103468895, "learning_rate": 8.70790002169436e-06, "loss": 0.4568, "num_input_tokens_seen": 60308992, "step": 49590 }, { "epoch": 6.214133567222152, "grad_norm": 0.07661096751689911, "learning_rate": 8.707533228657596e-06, "loss": 0.4549, "num_input_tokens_seen": 60314880, "step": 49595 }, { "epoch": 6.214760055130936, "grad_norm": 0.09904695302248001, "learning_rate": 8.707166391294157e-06, "loss": 0.4494, "num_input_tokens_seen": 60320544, "step": 49600 }, { "epoch": 6.2153865430397195, "grad_norm": 0.06990892440080643, "learning_rate": 8.706799509608423e-06, "loss": 0.4636, "num_input_tokens_seen": 60326624, "step": 49605 }, { "epoch": 6.216013030948503, "grad_norm": 0.07800464332103729, "learning_rate": 8.706432583604789e-06, "loss": 0.4585, "num_input_tokens_seen": 60332960, "step": 49610 }, { "epoch": 6.216639518857286, "grad_norm": 0.13191168010234833, "learning_rate": 8.706065613287635e-06, "loss": 0.4712, "num_input_tokens_seen": 60339168, "step": 49615 }, { "epoch": 6.217266006766069, "grad_norm": 0.08329812437295914, "learning_rate": 8.70569859866135e-06, "loss": 0.4658, "num_input_tokens_seen": 60345120, "step": 49620 }, { "epoch": 6.217892494674853, "grad_norm": 0.043074335902929306, "learning_rate": 8.705331539730323e-06, "loss": 0.4589, "num_input_tokens_seen": 60350976, "step": 49625 }, { "epoch": 6.218518982583636, "grad_norm": 0.07255302369594574, "learning_rate": 8.704964436498945e-06, "loss": 0.4694, "num_input_tokens_seen": 60357568, "step": 49630 }, { "epoch": 6.219145470492419, "grad_norm": 0.10255470126867294, "learning_rate": 8.704597288971598e-06, "loss": 0.4589, "num_input_tokens_seen": 60363456, "step": 49635 }, { "epoch": 6.219771958401203, "grad_norm": 0.08512653410434723, "learning_rate": 8.704230097152678e-06, "loss": 0.4646, "num_input_tokens_seen": 60369536, "step": 49640 }, { "epoch": 6.220398446309987, "grad_norm": 0.04494272544980049, "learning_rate": 8.703862861046573e-06, "loss": 0.4638, "num_input_tokens_seen": 60375776, "step": 49645 }, { "epoch": 6.22102493421877, "grad_norm": 0.09015814960002899, "learning_rate": 8.703495580657672e-06, "loss": 0.4605, "num_input_tokens_seen": 60381856, "step": 49650 }, { "epoch": 6.221651422127553, "grad_norm": 0.07089060544967651, "learning_rate": 8.70312825599037e-06, "loss": 0.4629, "num_input_tokens_seen": 60387776, "step": 49655 }, { "epoch": 6.222277910036336, "grad_norm": 0.07783720642328262, "learning_rate": 8.702760887049053e-06, "loss": 0.4623, "num_input_tokens_seen": 60394272, "step": 49660 }, { "epoch": 6.222904397945119, "grad_norm": 0.06590453535318375, "learning_rate": 8.702393473838119e-06, "loss": 0.4589, "num_input_tokens_seen": 60400320, "step": 49665 }, { "epoch": 6.223530885853903, "grad_norm": 0.09782271832227707, "learning_rate": 8.702026016361958e-06, "loss": 0.4603, "num_input_tokens_seen": 60406304, "step": 49670 }, { "epoch": 6.2241573737626865, "grad_norm": 0.0417911633849144, "learning_rate": 8.701658514624962e-06, "loss": 0.4682, "num_input_tokens_seen": 60412512, "step": 49675 }, { "epoch": 6.22478386167147, "grad_norm": 0.07766034454107285, "learning_rate": 8.701290968631529e-06, "loss": 0.4607, "num_input_tokens_seen": 60418656, "step": 49680 }, { "epoch": 6.225410349580253, "grad_norm": 0.09737341850996017, "learning_rate": 8.700923378386048e-06, "loss": 0.4542, "num_input_tokens_seen": 60424288, "step": 49685 }, { "epoch": 6.226036837489037, "grad_norm": 0.10134215652942657, "learning_rate": 8.700555743892918e-06, "loss": 0.4586, "num_input_tokens_seen": 60430816, "step": 49690 }, { "epoch": 6.22666332539782, "grad_norm": 0.07992994785308838, "learning_rate": 8.700188065156531e-06, "loss": 0.4686, "num_input_tokens_seen": 60436864, "step": 49695 }, { "epoch": 6.227289813306603, "grad_norm": 0.16912893950939178, "learning_rate": 8.699820342181286e-06, "loss": 0.4561, "num_input_tokens_seen": 60442880, "step": 49700 }, { "epoch": 6.227916301215386, "grad_norm": 0.07615385949611664, "learning_rate": 8.69945257497158e-06, "loss": 0.4682, "num_input_tokens_seen": 60448896, "step": 49705 }, { "epoch": 6.22854278912417, "grad_norm": 0.10914935171604156, "learning_rate": 8.699084763531805e-06, "loss": 0.4569, "num_input_tokens_seen": 60455168, "step": 49710 }, { "epoch": 6.229169277032954, "grad_norm": 0.06600417196750641, "learning_rate": 8.698716907866363e-06, "loss": 0.4694, "num_input_tokens_seen": 60461056, "step": 49715 }, { "epoch": 6.229795764941737, "grad_norm": 0.08041654527187347, "learning_rate": 8.69834900797965e-06, "loss": 0.4784, "num_input_tokens_seen": 60467488, "step": 49720 }, { "epoch": 6.23042225285052, "grad_norm": 0.1199495941400528, "learning_rate": 8.697981063876066e-06, "loss": 0.4625, "num_input_tokens_seen": 60473440, "step": 49725 }, { "epoch": 6.231048740759303, "grad_norm": 0.10925637185573578, "learning_rate": 8.697613075560008e-06, "loss": 0.4637, "num_input_tokens_seen": 60479552, "step": 49730 }, { "epoch": 6.231675228668086, "grad_norm": 0.08058665692806244, "learning_rate": 8.697245043035877e-06, "loss": 0.4642, "num_input_tokens_seen": 60485536, "step": 49735 }, { "epoch": 6.23230171657687, "grad_norm": 0.1211041808128357, "learning_rate": 8.696876966308075e-06, "loss": 0.4653, "num_input_tokens_seen": 60491744, "step": 49740 }, { "epoch": 6.2329282044856535, "grad_norm": 0.06836777180433273, "learning_rate": 8.696508845381e-06, "loss": 0.4602, "num_input_tokens_seen": 60498048, "step": 49745 }, { "epoch": 6.233554692394437, "grad_norm": 0.12480879575014114, "learning_rate": 8.696140680259054e-06, "loss": 0.4689, "num_input_tokens_seen": 60504480, "step": 49750 }, { "epoch": 6.23418118030322, "grad_norm": 0.0715644583106041, "learning_rate": 8.695772470946637e-06, "loss": 0.4616, "num_input_tokens_seen": 60510656, "step": 49755 }, { "epoch": 6.234807668212003, "grad_norm": 0.07380729168653488, "learning_rate": 8.695404217448154e-06, "loss": 0.4663, "num_input_tokens_seen": 60517024, "step": 49760 }, { "epoch": 6.235434156120787, "grad_norm": 0.0793013721704483, "learning_rate": 8.695035919768007e-06, "loss": 0.461, "num_input_tokens_seen": 60522976, "step": 49765 }, { "epoch": 6.23606064402957, "grad_norm": 0.07752911001443863, "learning_rate": 8.6946675779106e-06, "loss": 0.4562, "num_input_tokens_seen": 60529056, "step": 49770 }, { "epoch": 6.2366871319383534, "grad_norm": 0.06104342266917229, "learning_rate": 8.694299191880333e-06, "loss": 0.4602, "num_input_tokens_seen": 60535232, "step": 49775 }, { "epoch": 6.237313619847137, "grad_norm": 0.0880742147564888, "learning_rate": 8.693930761681615e-06, "loss": 0.4596, "num_input_tokens_seen": 60541280, "step": 49780 }, { "epoch": 6.237940107755921, "grad_norm": 0.046500932425260544, "learning_rate": 8.693562287318848e-06, "loss": 0.4611, "num_input_tokens_seen": 60547424, "step": 49785 }, { "epoch": 6.238566595664704, "grad_norm": 0.039446860551834106, "learning_rate": 8.69319376879644e-06, "loss": 0.4642, "num_input_tokens_seen": 60553504, "step": 49790 }, { "epoch": 6.239193083573487, "grad_norm": 0.0755307674407959, "learning_rate": 8.692825206118794e-06, "loss": 0.4616, "num_input_tokens_seen": 60559520, "step": 49795 }, { "epoch": 6.23981957148227, "grad_norm": 0.06169760227203369, "learning_rate": 8.692456599290318e-06, "loss": 0.4621, "num_input_tokens_seen": 60565344, "step": 49800 }, { "epoch": 6.240446059391053, "grad_norm": 0.09624048322439194, "learning_rate": 8.69208794831542e-06, "loss": 0.4648, "num_input_tokens_seen": 60571520, "step": 49805 }, { "epoch": 6.241072547299837, "grad_norm": 0.0667928159236908, "learning_rate": 8.691719253198506e-06, "loss": 0.4581, "num_input_tokens_seen": 60577760, "step": 49810 }, { "epoch": 6.241699035208621, "grad_norm": 0.06668337434530258, "learning_rate": 8.691350513943984e-06, "loss": 0.4592, "num_input_tokens_seen": 60584160, "step": 49815 }, { "epoch": 6.242325523117404, "grad_norm": 0.06726890802383423, "learning_rate": 8.690981730556264e-06, "loss": 0.4639, "num_input_tokens_seen": 60590528, "step": 49820 }, { "epoch": 6.242952011026187, "grad_norm": 0.04308982938528061, "learning_rate": 8.690612903039754e-06, "loss": 0.4661, "num_input_tokens_seen": 60596800, "step": 49825 }, { "epoch": 6.243578498934971, "grad_norm": 0.04680100083351135, "learning_rate": 8.690244031398863e-06, "loss": 0.4565, "num_input_tokens_seen": 60602976, "step": 49830 }, { "epoch": 6.244204986843754, "grad_norm": 0.11137524992227554, "learning_rate": 8.689875115638003e-06, "loss": 0.4641, "num_input_tokens_seen": 60609216, "step": 49835 }, { "epoch": 6.244831474752537, "grad_norm": 0.09025363624095917, "learning_rate": 8.689506155761582e-06, "loss": 0.4571, "num_input_tokens_seen": 60614880, "step": 49840 }, { "epoch": 6.2454579626613205, "grad_norm": 0.08787479996681213, "learning_rate": 8.689137151774015e-06, "loss": 0.4565, "num_input_tokens_seen": 60620992, "step": 49845 }, { "epoch": 6.246084450570104, "grad_norm": 0.07852756977081299, "learning_rate": 8.68876810367971e-06, "loss": 0.459, "num_input_tokens_seen": 60627104, "step": 49850 }, { "epoch": 6.246710938478888, "grad_norm": 0.0864739790558815, "learning_rate": 8.688399011483082e-06, "loss": 0.465, "num_input_tokens_seen": 60633504, "step": 49855 }, { "epoch": 6.247337426387671, "grad_norm": 0.10024676471948624, "learning_rate": 8.688029875188544e-06, "loss": 0.4612, "num_input_tokens_seen": 60639424, "step": 49860 }, { "epoch": 6.247963914296454, "grad_norm": 0.09097947925329208, "learning_rate": 8.687660694800505e-06, "loss": 0.462, "num_input_tokens_seen": 60645952, "step": 49865 }, { "epoch": 6.248590402205237, "grad_norm": 0.10487782210111618, "learning_rate": 8.687291470323385e-06, "loss": 0.4576, "num_input_tokens_seen": 60651904, "step": 49870 }, { "epoch": 6.24921689011402, "grad_norm": 0.07416505366563797, "learning_rate": 8.686922201761595e-06, "loss": 0.4607, "num_input_tokens_seen": 60657760, "step": 49875 }, { "epoch": 6.249843378022804, "grad_norm": 0.13875465095043182, "learning_rate": 8.686552889119549e-06, "loss": 0.4616, "num_input_tokens_seen": 60663840, "step": 49880 }, { "epoch": 6.250469865931588, "grad_norm": 0.1268576681613922, "learning_rate": 8.686183532401663e-06, "loss": 0.4611, "num_input_tokens_seen": 60669632, "step": 49885 }, { "epoch": 6.251096353840371, "grad_norm": 0.07996078580617905, "learning_rate": 8.685814131612356e-06, "loss": 0.469, "num_input_tokens_seen": 60675872, "step": 49890 }, { "epoch": 6.251722841749154, "grad_norm": 0.06923447549343109, "learning_rate": 8.68544468675604e-06, "loss": 0.4622, "num_input_tokens_seen": 60682176, "step": 49895 }, { "epoch": 6.252349329657938, "grad_norm": 0.09336742013692856, "learning_rate": 8.685075197837136e-06, "loss": 0.4694, "num_input_tokens_seen": 60688384, "step": 49900 }, { "epoch": 6.252975817566721, "grad_norm": 0.08147240430116653, "learning_rate": 8.684705664860059e-06, "loss": 0.4664, "num_input_tokens_seen": 60694432, "step": 49905 }, { "epoch": 6.253602305475504, "grad_norm": 0.05869712308049202, "learning_rate": 8.684336087829227e-06, "loss": 0.4617, "num_input_tokens_seen": 60701056, "step": 49910 }, { "epoch": 6.2542287933842875, "grad_norm": 0.07629559189081192, "learning_rate": 8.683966466749061e-06, "loss": 0.4597, "num_input_tokens_seen": 60707040, "step": 49915 }, { "epoch": 6.254855281293071, "grad_norm": 0.11308807879686356, "learning_rate": 8.683596801623977e-06, "loss": 0.4663, "num_input_tokens_seen": 60713088, "step": 49920 }, { "epoch": 6.255481769201855, "grad_norm": 0.07035793364048004, "learning_rate": 8.683227092458397e-06, "loss": 0.4657, "num_input_tokens_seen": 60719392, "step": 49925 }, { "epoch": 6.256108257110638, "grad_norm": 0.07505136728286743, "learning_rate": 8.68285733925674e-06, "loss": 0.4634, "num_input_tokens_seen": 60725600, "step": 49930 }, { "epoch": 6.256734745019421, "grad_norm": 0.08829424530267715, "learning_rate": 8.682487542023428e-06, "loss": 0.4601, "num_input_tokens_seen": 60731936, "step": 49935 }, { "epoch": 6.257361232928204, "grad_norm": 0.06590387970209122, "learning_rate": 8.682117700762882e-06, "loss": 0.4597, "num_input_tokens_seen": 60738272, "step": 49940 }, { "epoch": 6.257987720836988, "grad_norm": 0.08806971460580826, "learning_rate": 8.681747815479522e-06, "loss": 0.466, "num_input_tokens_seen": 60744416, "step": 49945 }, { "epoch": 6.2586142087457715, "grad_norm": 0.12420788407325745, "learning_rate": 8.68137788617777e-06, "loss": 0.4688, "num_input_tokens_seen": 60750400, "step": 49950 }, { "epoch": 6.259240696654555, "grad_norm": 0.07375935465097427, "learning_rate": 8.681007912862052e-06, "loss": 0.4541, "num_input_tokens_seen": 60756352, "step": 49955 }, { "epoch": 6.259867184563338, "grad_norm": 0.07666132599115372, "learning_rate": 8.68063789553679e-06, "loss": 0.455, "num_input_tokens_seen": 60762624, "step": 49960 }, { "epoch": 6.260493672472121, "grad_norm": 0.06816066056489944, "learning_rate": 8.680267834206407e-06, "loss": 0.4598, "num_input_tokens_seen": 60768512, "step": 49965 }, { "epoch": 6.261120160380905, "grad_norm": 0.07605019956827164, "learning_rate": 8.679897728875329e-06, "loss": 0.4568, "num_input_tokens_seen": 60774560, "step": 49970 }, { "epoch": 6.261746648289688, "grad_norm": 0.08106846362352371, "learning_rate": 8.679527579547977e-06, "loss": 0.4605, "num_input_tokens_seen": 60780480, "step": 49975 }, { "epoch": 6.262373136198471, "grad_norm": 0.1190686821937561, "learning_rate": 8.679157386228781e-06, "loss": 0.4659, "num_input_tokens_seen": 60786304, "step": 49980 }, { "epoch": 6.2629996241072545, "grad_norm": 0.08179300278425217, "learning_rate": 8.678787148922165e-06, "loss": 0.4531, "num_input_tokens_seen": 60792384, "step": 49985 }, { "epoch": 6.263626112016038, "grad_norm": 0.08582335710525513, "learning_rate": 8.678416867632555e-06, "loss": 0.4527, "num_input_tokens_seen": 60798400, "step": 49990 }, { "epoch": 6.264252599924822, "grad_norm": 0.04766565188765526, "learning_rate": 8.67804654236438e-06, "loss": 0.4581, "num_input_tokens_seen": 60804576, "step": 49995 }, { "epoch": 6.264879087833605, "grad_norm": 0.11919528245925903, "learning_rate": 8.677676173122065e-06, "loss": 0.4705, "num_input_tokens_seen": 60810560, "step": 50000 }, { "epoch": 6.265505575742388, "grad_norm": 0.09424613416194916, "learning_rate": 8.67730575991004e-06, "loss": 0.4642, "num_input_tokens_seen": 60816896, "step": 50005 }, { "epoch": 6.266132063651171, "grad_norm": 0.092987559735775, "learning_rate": 8.676935302732732e-06, "loss": 0.4543, "num_input_tokens_seen": 60823168, "step": 50010 }, { "epoch": 6.266758551559954, "grad_norm": 0.04736059531569481, "learning_rate": 8.676564801594572e-06, "loss": 0.4564, "num_input_tokens_seen": 60829696, "step": 50015 }, { "epoch": 6.2673850394687385, "grad_norm": 0.05337512120604515, "learning_rate": 8.67619425649999e-06, "loss": 0.4691, "num_input_tokens_seen": 60835712, "step": 50020 }, { "epoch": 6.268011527377522, "grad_norm": 0.09723731875419617, "learning_rate": 8.675823667453413e-06, "loss": 0.4582, "num_input_tokens_seen": 60842016, "step": 50025 }, { "epoch": 6.268638015286305, "grad_norm": 0.08761604875326157, "learning_rate": 8.675453034459274e-06, "loss": 0.4625, "num_input_tokens_seen": 60847648, "step": 50030 }, { "epoch": 6.269264503195088, "grad_norm": 0.09022822231054306, "learning_rate": 8.675082357522002e-06, "loss": 0.4564, "num_input_tokens_seen": 60853824, "step": 50035 }, { "epoch": 6.269890991103872, "grad_norm": 0.13418622314929962, "learning_rate": 8.674711636646033e-06, "loss": 0.4579, "num_input_tokens_seen": 60860096, "step": 50040 }, { "epoch": 6.270517479012655, "grad_norm": 0.12326966226100922, "learning_rate": 8.674340871835795e-06, "loss": 0.4524, "num_input_tokens_seen": 60865984, "step": 50045 }, { "epoch": 6.271143966921438, "grad_norm": 0.13032637536525726, "learning_rate": 8.673970063095726e-06, "loss": 0.4657, "num_input_tokens_seen": 60872192, "step": 50050 }, { "epoch": 6.2717704548302216, "grad_norm": 0.07776667177677155, "learning_rate": 8.673599210430254e-06, "loss": 0.456, "num_input_tokens_seen": 60877952, "step": 50055 }, { "epoch": 6.272396942739005, "grad_norm": 0.08165574073791504, "learning_rate": 8.673228313843813e-06, "loss": 0.4596, "num_input_tokens_seen": 60883904, "step": 50060 }, { "epoch": 6.273023430647789, "grad_norm": 0.10470138490200043, "learning_rate": 8.67285737334084e-06, "loss": 0.4521, "num_input_tokens_seen": 60890144, "step": 50065 }, { "epoch": 6.273649918556572, "grad_norm": 0.07970143854618073, "learning_rate": 8.672486388925771e-06, "loss": 0.4682, "num_input_tokens_seen": 60896256, "step": 50070 }, { "epoch": 6.274276406465355, "grad_norm": 0.052564993500709534, "learning_rate": 8.67211536060304e-06, "loss": 0.4607, "num_input_tokens_seen": 60902336, "step": 50075 }, { "epoch": 6.274902894374138, "grad_norm": 0.26277732849121094, "learning_rate": 8.671744288377082e-06, "loss": 0.4688, "num_input_tokens_seen": 60908160, "step": 50080 }, { "epoch": 6.275529382282922, "grad_norm": 0.11284534633159637, "learning_rate": 8.671373172252331e-06, "loss": 0.4611, "num_input_tokens_seen": 60914080, "step": 50085 }, { "epoch": 6.2761558701917055, "grad_norm": 0.10095083713531494, "learning_rate": 8.67100201223323e-06, "loss": 0.462, "num_input_tokens_seen": 60920416, "step": 50090 }, { "epoch": 6.276782358100489, "grad_norm": 0.10148576647043228, "learning_rate": 8.670630808324212e-06, "loss": 0.4586, "num_input_tokens_seen": 60926784, "step": 50095 }, { "epoch": 6.277408846009272, "grad_norm": 0.13533946871757507, "learning_rate": 8.67025956052972e-06, "loss": 0.4606, "num_input_tokens_seen": 60933184, "step": 50100 }, { "epoch": 6.278035333918055, "grad_norm": 0.10727345198392868, "learning_rate": 8.669888268854185e-06, "loss": 0.4534, "num_input_tokens_seen": 60939296, "step": 50105 }, { "epoch": 6.278661821826839, "grad_norm": 0.1385214775800705, "learning_rate": 8.669516933302051e-06, "loss": 0.4697, "num_input_tokens_seen": 60945472, "step": 50110 }, { "epoch": 6.279288309735622, "grad_norm": 0.06381331384181976, "learning_rate": 8.669145553877757e-06, "loss": 0.4574, "num_input_tokens_seen": 60952032, "step": 50115 }, { "epoch": 6.279914797644405, "grad_norm": 0.1604079306125641, "learning_rate": 8.668774130585746e-06, "loss": 0.4625, "num_input_tokens_seen": 60958080, "step": 50120 }, { "epoch": 6.280541285553189, "grad_norm": 0.09709440916776657, "learning_rate": 8.668402663430454e-06, "loss": 0.4741, "num_input_tokens_seen": 60964160, "step": 50125 }, { "epoch": 6.281167773461972, "grad_norm": 0.13590994477272034, "learning_rate": 8.668031152416322e-06, "loss": 0.4575, "num_input_tokens_seen": 60970272, "step": 50130 }, { "epoch": 6.281794261370756, "grad_norm": 0.15908853709697723, "learning_rate": 8.667659597547795e-06, "loss": 0.4457, "num_input_tokens_seen": 60976256, "step": 50135 }, { "epoch": 6.282420749279539, "grad_norm": 0.0873124971985817, "learning_rate": 8.667287998829314e-06, "loss": 0.4632, "num_input_tokens_seen": 60982624, "step": 50140 }, { "epoch": 6.283047237188322, "grad_norm": 0.1517217457294464, "learning_rate": 8.66691635626532e-06, "loss": 0.4624, "num_input_tokens_seen": 60988384, "step": 50145 }, { "epoch": 6.283673725097105, "grad_norm": 0.12260934710502625, "learning_rate": 8.666544669860261e-06, "loss": 0.4504, "num_input_tokens_seen": 60994368, "step": 50150 }, { "epoch": 6.284300213005889, "grad_norm": 0.12958462536334991, "learning_rate": 8.666172939618577e-06, "loss": 0.4683, "num_input_tokens_seen": 61000544, "step": 50155 }, { "epoch": 6.2849267009146725, "grad_norm": 0.16215883195400238, "learning_rate": 8.66580116554471e-06, "loss": 0.4694, "num_input_tokens_seen": 61006816, "step": 50160 }, { "epoch": 6.285553188823456, "grad_norm": 0.14670288562774658, "learning_rate": 8.665429347643112e-06, "loss": 0.4755, "num_input_tokens_seen": 61012768, "step": 50165 }, { "epoch": 6.286179676732239, "grad_norm": 0.09796668589115143, "learning_rate": 8.665057485918224e-06, "loss": 0.4587, "num_input_tokens_seen": 61019232, "step": 50170 }, { "epoch": 6.286806164641022, "grad_norm": 0.07420080900192261, "learning_rate": 8.664685580374491e-06, "loss": 0.4633, "num_input_tokens_seen": 61025120, "step": 50175 }, { "epoch": 6.287432652549806, "grad_norm": 0.10543832927942276, "learning_rate": 8.66431363101636e-06, "loss": 0.462, "num_input_tokens_seen": 61031104, "step": 50180 }, { "epoch": 6.288059140458589, "grad_norm": 0.054936692118644714, "learning_rate": 8.663941637848282e-06, "loss": 0.4561, "num_input_tokens_seen": 61037248, "step": 50185 }, { "epoch": 6.288685628367372, "grad_norm": 0.08873062580823898, "learning_rate": 8.6635696008747e-06, "loss": 0.4532, "num_input_tokens_seen": 61043360, "step": 50190 }, { "epoch": 6.289312116276156, "grad_norm": 0.09013058990240097, "learning_rate": 8.66319752010006e-06, "loss": 0.4587, "num_input_tokens_seen": 61049568, "step": 50195 }, { "epoch": 6.28993860418494, "grad_norm": 0.07922260463237762, "learning_rate": 8.662825395528817e-06, "loss": 0.4667, "num_input_tokens_seen": 61055776, "step": 50200 }, { "epoch": 6.290565092093723, "grad_norm": 0.05082108825445175, "learning_rate": 8.662453227165417e-06, "loss": 0.4723, "num_input_tokens_seen": 61061728, "step": 50205 }, { "epoch": 6.291191580002506, "grad_norm": 0.09686204046010971, "learning_rate": 8.662081015014307e-06, "loss": 0.469, "num_input_tokens_seen": 61067872, "step": 50210 }, { "epoch": 6.291818067911289, "grad_norm": 0.07914227992296219, "learning_rate": 8.661708759079942e-06, "loss": 0.4621, "num_input_tokens_seen": 61074016, "step": 50215 }, { "epoch": 6.292444555820072, "grad_norm": 0.08965998888015747, "learning_rate": 8.661336459366768e-06, "loss": 0.4673, "num_input_tokens_seen": 61079776, "step": 50220 }, { "epoch": 6.293071043728856, "grad_norm": 0.08243107795715332, "learning_rate": 8.66096411587924e-06, "loss": 0.4631, "num_input_tokens_seen": 61085888, "step": 50225 }, { "epoch": 6.29369753163764, "grad_norm": 0.06937096267938614, "learning_rate": 8.660591728621809e-06, "loss": 0.4621, "num_input_tokens_seen": 61091744, "step": 50230 }, { "epoch": 6.294324019546423, "grad_norm": 0.10282605141401291, "learning_rate": 8.660219297598925e-06, "loss": 0.4658, "num_input_tokens_seen": 61097728, "step": 50235 }, { "epoch": 6.294950507455206, "grad_norm": 0.04190719127655029, "learning_rate": 8.65984682281504e-06, "loss": 0.4712, "num_input_tokens_seen": 61103392, "step": 50240 }, { "epoch": 6.295576995363989, "grad_norm": 0.0985368862748146, "learning_rate": 8.659474304274614e-06, "loss": 0.4577, "num_input_tokens_seen": 61109504, "step": 50245 }, { "epoch": 6.296203483272773, "grad_norm": 0.10175099968910217, "learning_rate": 8.659101741982092e-06, "loss": 0.4619, "num_input_tokens_seen": 61114624, "step": 50250 }, { "epoch": 6.296829971181556, "grad_norm": 0.07593101263046265, "learning_rate": 8.658729135941932e-06, "loss": 0.4573, "num_input_tokens_seen": 61120960, "step": 50255 }, { "epoch": 6.2974564590903395, "grad_norm": 0.07790513336658478, "learning_rate": 8.658356486158592e-06, "loss": 0.4583, "num_input_tokens_seen": 61126848, "step": 50260 }, { "epoch": 6.298082946999123, "grad_norm": 0.04341626912355423, "learning_rate": 8.657983792636521e-06, "loss": 0.4594, "num_input_tokens_seen": 61133024, "step": 50265 }, { "epoch": 6.298709434907907, "grad_norm": 0.0778069719672203, "learning_rate": 8.657611055380181e-06, "loss": 0.4623, "num_input_tokens_seen": 61139456, "step": 50270 }, { "epoch": 6.29933592281669, "grad_norm": 0.08382277935743332, "learning_rate": 8.657238274394026e-06, "loss": 0.4693, "num_input_tokens_seen": 61145472, "step": 50275 }, { "epoch": 6.299962410725473, "grad_norm": 0.12342799454927444, "learning_rate": 8.656865449682512e-06, "loss": 0.4495, "num_input_tokens_seen": 61151552, "step": 50280 }, { "epoch": 6.300588898634256, "grad_norm": 0.0452323816716671, "learning_rate": 8.656492581250096e-06, "loss": 0.4602, "num_input_tokens_seen": 61157664, "step": 50285 }, { "epoch": 6.301215386543039, "grad_norm": 0.07632381469011307, "learning_rate": 8.656119669101237e-06, "loss": 0.4586, "num_input_tokens_seen": 61163904, "step": 50290 }, { "epoch": 6.301841874451823, "grad_norm": 0.09558361023664474, "learning_rate": 8.655746713240393e-06, "loss": 0.4618, "num_input_tokens_seen": 61169856, "step": 50295 }, { "epoch": 6.302468362360607, "grad_norm": 0.09987565875053406, "learning_rate": 8.655373713672024e-06, "loss": 0.4636, "num_input_tokens_seen": 61175360, "step": 50300 }, { "epoch": 6.30309485026939, "grad_norm": 0.07128085196018219, "learning_rate": 8.65500067040059e-06, "loss": 0.4633, "num_input_tokens_seen": 61181536, "step": 50305 }, { "epoch": 6.303721338178173, "grad_norm": 0.15129196643829346, "learning_rate": 8.65462758343055e-06, "loss": 0.4523, "num_input_tokens_seen": 61187520, "step": 50310 }, { "epoch": 6.304347826086957, "grad_norm": 0.07150506228208542, "learning_rate": 8.654254452766365e-06, "loss": 0.4524, "num_input_tokens_seen": 61193536, "step": 50315 }, { "epoch": 6.30497431399574, "grad_norm": 0.1201920285820961, "learning_rate": 8.653881278412493e-06, "loss": 0.4691, "num_input_tokens_seen": 61199392, "step": 50320 }, { "epoch": 6.305600801904523, "grad_norm": 0.07483606785535812, "learning_rate": 8.6535080603734e-06, "loss": 0.4723, "num_input_tokens_seen": 61205472, "step": 50325 }, { "epoch": 6.3062272898133065, "grad_norm": 0.1352735012769699, "learning_rate": 8.653134798653546e-06, "loss": 0.4619, "num_input_tokens_seen": 61211744, "step": 50330 }, { "epoch": 6.30685377772209, "grad_norm": 0.0845877006649971, "learning_rate": 8.652761493257395e-06, "loss": 0.4628, "num_input_tokens_seen": 61217984, "step": 50335 }, { "epoch": 6.307480265630874, "grad_norm": 0.07041962444782257, "learning_rate": 8.652388144189408e-06, "loss": 0.4667, "num_input_tokens_seen": 61223264, "step": 50340 }, { "epoch": 6.308106753539657, "grad_norm": 0.08496172726154327, "learning_rate": 8.652014751454052e-06, "loss": 0.4625, "num_input_tokens_seen": 61229632, "step": 50345 }, { "epoch": 6.30873324144844, "grad_norm": 0.09164562821388245, "learning_rate": 8.651641315055788e-06, "loss": 0.4659, "num_input_tokens_seen": 61235808, "step": 50350 }, { "epoch": 6.309359729357223, "grad_norm": 0.05248944088816643, "learning_rate": 8.65126783499908e-06, "loss": 0.467, "num_input_tokens_seen": 61242016, "step": 50355 }, { "epoch": 6.309986217266006, "grad_norm": 0.08880089223384857, "learning_rate": 8.650894311288398e-06, "loss": 0.4614, "num_input_tokens_seen": 61248032, "step": 50360 }, { "epoch": 6.3106127051747904, "grad_norm": 0.08044840395450592, "learning_rate": 8.650520743928205e-06, "loss": 0.471, "num_input_tokens_seen": 61253856, "step": 50365 }, { "epoch": 6.311239193083574, "grad_norm": 0.13966704905033112, "learning_rate": 8.650147132922965e-06, "loss": 0.4576, "num_input_tokens_seen": 61259840, "step": 50370 }, { "epoch": 6.311865680992357, "grad_norm": 0.0960862785577774, "learning_rate": 8.64977347827715e-06, "loss": 0.4644, "num_input_tokens_seen": 61266240, "step": 50375 }, { "epoch": 6.31249216890114, "grad_norm": 0.10090456902980804, "learning_rate": 8.649399779995222e-06, "loss": 0.4604, "num_input_tokens_seen": 61272256, "step": 50380 }, { "epoch": 6.313118656809923, "grad_norm": 0.08775104582309723, "learning_rate": 8.649026038081653e-06, "loss": 0.4627, "num_input_tokens_seen": 61278400, "step": 50385 }, { "epoch": 6.313745144718707, "grad_norm": 0.1138024777173996, "learning_rate": 8.64865225254091e-06, "loss": 0.4604, "num_input_tokens_seen": 61284288, "step": 50390 }, { "epoch": 6.31437163262749, "grad_norm": 0.10217512398958206, "learning_rate": 8.64827842337746e-06, "loss": 0.4581, "num_input_tokens_seen": 61290208, "step": 50395 }, { "epoch": 6.3149981205362735, "grad_norm": 0.11136439442634583, "learning_rate": 8.647904550595776e-06, "loss": 0.465, "num_input_tokens_seen": 61296192, "step": 50400 }, { "epoch": 6.315624608445057, "grad_norm": 0.14211329817771912, "learning_rate": 8.647530634200326e-06, "loss": 0.4609, "num_input_tokens_seen": 61302368, "step": 50405 }, { "epoch": 6.316251096353841, "grad_norm": 0.05659182369709015, "learning_rate": 8.647156674195578e-06, "loss": 0.4629, "num_input_tokens_seen": 61308768, "step": 50410 }, { "epoch": 6.316877584262624, "grad_norm": 0.08565492928028107, "learning_rate": 8.646782670586007e-06, "loss": 0.4661, "num_input_tokens_seen": 61315008, "step": 50415 }, { "epoch": 6.317504072171407, "grad_norm": 0.12856367230415344, "learning_rate": 8.646408623376084e-06, "loss": 0.459, "num_input_tokens_seen": 61320224, "step": 50420 }, { "epoch": 6.31813056008019, "grad_norm": 0.08056972920894623, "learning_rate": 8.64603453257028e-06, "loss": 0.4565, "num_input_tokens_seen": 61326304, "step": 50425 }, { "epoch": 6.318757047988973, "grad_norm": 0.11861201375722885, "learning_rate": 8.645660398173066e-06, "loss": 0.465, "num_input_tokens_seen": 61332096, "step": 50430 }, { "epoch": 6.3193835358977575, "grad_norm": 0.07956337183713913, "learning_rate": 8.64528622018892e-06, "loss": 0.4618, "num_input_tokens_seen": 61338048, "step": 50435 }, { "epoch": 6.320010023806541, "grad_norm": 0.0789542868733406, "learning_rate": 8.64491199862231e-06, "loss": 0.4564, "num_input_tokens_seen": 61344000, "step": 50440 }, { "epoch": 6.320636511715324, "grad_norm": 0.09620998799800873, "learning_rate": 8.644537733477714e-06, "loss": 0.4679, "num_input_tokens_seen": 61350016, "step": 50445 }, { "epoch": 6.321262999624107, "grad_norm": 0.08722773939371109, "learning_rate": 8.644163424759604e-06, "loss": 0.4601, "num_input_tokens_seen": 61355968, "step": 50450 }, { "epoch": 6.321889487532891, "grad_norm": 0.08313785493373871, "learning_rate": 8.643789072472458e-06, "loss": 0.4675, "num_input_tokens_seen": 61362144, "step": 50455 }, { "epoch": 6.322515975441674, "grad_norm": 0.0814044177532196, "learning_rate": 8.643414676620748e-06, "loss": 0.4573, "num_input_tokens_seen": 61368096, "step": 50460 }, { "epoch": 6.323142463350457, "grad_norm": 0.11577071994543076, "learning_rate": 8.643040237208954e-06, "loss": 0.4668, "num_input_tokens_seen": 61373920, "step": 50465 }, { "epoch": 6.3237689512592405, "grad_norm": 0.08508174121379852, "learning_rate": 8.64266575424155e-06, "loss": 0.4703, "num_input_tokens_seen": 61380384, "step": 50470 }, { "epoch": 6.324395439168024, "grad_norm": 0.11033168435096741, "learning_rate": 8.642291227723014e-06, "loss": 0.4638, "num_input_tokens_seen": 61386560, "step": 50475 }, { "epoch": 6.325021927076808, "grad_norm": 0.13120576739311218, "learning_rate": 8.641916657657825e-06, "loss": 0.4536, "num_input_tokens_seen": 61392608, "step": 50480 }, { "epoch": 6.325648414985591, "grad_norm": 0.1113426461815834, "learning_rate": 8.641542044050462e-06, "loss": 0.4656, "num_input_tokens_seen": 61398336, "step": 50485 }, { "epoch": 6.326274902894374, "grad_norm": 0.04979228973388672, "learning_rate": 8.6411673869054e-06, "loss": 0.4627, "num_input_tokens_seen": 61404096, "step": 50490 }, { "epoch": 6.326901390803157, "grad_norm": 0.07074657827615738, "learning_rate": 8.64079268622712e-06, "loss": 0.4616, "num_input_tokens_seen": 61410368, "step": 50495 }, { "epoch": 6.3275278787119404, "grad_norm": 0.08755061775445938, "learning_rate": 8.640417942020104e-06, "loss": 0.4679, "num_input_tokens_seen": 61415968, "step": 50500 }, { "epoch": 6.3281543666207245, "grad_norm": 0.0741317868232727, "learning_rate": 8.640043154288831e-06, "loss": 0.4607, "num_input_tokens_seen": 61422112, "step": 50505 }, { "epoch": 6.328780854529508, "grad_norm": 0.051738664507865906, "learning_rate": 8.63966832303778e-06, "loss": 0.4638, "num_input_tokens_seen": 61428352, "step": 50510 }, { "epoch": 6.329407342438291, "grad_norm": 0.07986555993556976, "learning_rate": 8.639293448271435e-06, "loss": 0.4596, "num_input_tokens_seen": 61434336, "step": 50515 }, { "epoch": 6.330033830347074, "grad_norm": 0.10044042766094208, "learning_rate": 8.638918529994277e-06, "loss": 0.4517, "num_input_tokens_seen": 61440224, "step": 50520 }, { "epoch": 6.330660318255858, "grad_norm": 0.1142328605055809, "learning_rate": 8.638543568210788e-06, "loss": 0.4612, "num_input_tokens_seen": 61446208, "step": 50525 }, { "epoch": 6.331286806164641, "grad_norm": 0.07405375689268112, "learning_rate": 8.638168562925453e-06, "loss": 0.4616, "num_input_tokens_seen": 61452256, "step": 50530 }, { "epoch": 6.331913294073424, "grad_norm": 0.08235227316617966, "learning_rate": 8.637793514142754e-06, "loss": 0.4623, "num_input_tokens_seen": 61458048, "step": 50535 }, { "epoch": 6.332539781982208, "grad_norm": 0.1047850102186203, "learning_rate": 8.637418421867174e-06, "loss": 0.4565, "num_input_tokens_seen": 61464032, "step": 50540 }, { "epoch": 6.333166269890991, "grad_norm": 0.056317996233701706, "learning_rate": 8.6370432861032e-06, "loss": 0.4529, "num_input_tokens_seen": 61469600, "step": 50545 }, { "epoch": 6.333792757799775, "grad_norm": 0.04995191842317581, "learning_rate": 8.636668106855314e-06, "loss": 0.455, "num_input_tokens_seen": 61475360, "step": 50550 }, { "epoch": 6.334419245708558, "grad_norm": 0.11489184945821762, "learning_rate": 8.636292884128003e-06, "loss": 0.4606, "num_input_tokens_seen": 61481408, "step": 50555 }, { "epoch": 6.335045733617341, "grad_norm": 0.08050057291984558, "learning_rate": 8.635917617925756e-06, "loss": 0.4657, "num_input_tokens_seen": 61487488, "step": 50560 }, { "epoch": 6.335672221526124, "grad_norm": 0.06580288708209991, "learning_rate": 8.635542308253054e-06, "loss": 0.4686, "num_input_tokens_seen": 61493856, "step": 50565 }, { "epoch": 6.336298709434908, "grad_norm": 0.06807214021682739, "learning_rate": 8.635166955114389e-06, "loss": 0.4562, "num_input_tokens_seen": 61500000, "step": 50570 }, { "epoch": 6.3369251973436915, "grad_norm": 0.14466698467731476, "learning_rate": 8.634791558514245e-06, "loss": 0.4684, "num_input_tokens_seen": 61506016, "step": 50575 }, { "epoch": 6.337551685252475, "grad_norm": 0.116946280002594, "learning_rate": 8.634416118457114e-06, "loss": 0.4634, "num_input_tokens_seen": 61512256, "step": 50580 }, { "epoch": 6.338178173161258, "grad_norm": 0.1299600601196289, "learning_rate": 8.634040634947481e-06, "loss": 0.4654, "num_input_tokens_seen": 61518464, "step": 50585 }, { "epoch": 6.338804661070041, "grad_norm": 0.058870311826467514, "learning_rate": 8.633665107989837e-06, "loss": 0.464, "num_input_tokens_seen": 61524544, "step": 50590 }, { "epoch": 6.339431148978825, "grad_norm": 0.08256405591964722, "learning_rate": 8.633289537588673e-06, "loss": 0.4609, "num_input_tokens_seen": 61530784, "step": 50595 }, { "epoch": 6.340057636887608, "grad_norm": 0.09075886011123657, "learning_rate": 8.632913923748478e-06, "loss": 0.4622, "num_input_tokens_seen": 61536672, "step": 50600 }, { "epoch": 6.340684124796391, "grad_norm": 0.1287245750427246, "learning_rate": 8.63253826647374e-06, "loss": 0.457, "num_input_tokens_seen": 61542528, "step": 50605 }, { "epoch": 6.341310612705175, "grad_norm": 0.10254063457250595, "learning_rate": 8.632162565768955e-06, "loss": 0.4697, "num_input_tokens_seen": 61548672, "step": 50610 }, { "epoch": 6.341937100613958, "grad_norm": 0.12404999136924744, "learning_rate": 8.631786821638612e-06, "loss": 0.4548, "num_input_tokens_seen": 61554656, "step": 50615 }, { "epoch": 6.342563588522742, "grad_norm": 0.11775868386030197, "learning_rate": 8.631411034087205e-06, "loss": 0.4611, "num_input_tokens_seen": 61560896, "step": 50620 }, { "epoch": 6.343190076431525, "grad_norm": 0.1008378192782402, "learning_rate": 8.631035203119226e-06, "loss": 0.4624, "num_input_tokens_seen": 61567040, "step": 50625 }, { "epoch": 6.343816564340308, "grad_norm": 0.05191310867667198, "learning_rate": 8.630659328739167e-06, "loss": 0.4662, "num_input_tokens_seen": 61573440, "step": 50630 }, { "epoch": 6.344443052249091, "grad_norm": 0.1216217651963234, "learning_rate": 8.630283410951524e-06, "loss": 0.4669, "num_input_tokens_seen": 61579680, "step": 50635 }, { "epoch": 6.3450695401578745, "grad_norm": 0.06390299648046494, "learning_rate": 8.629907449760791e-06, "loss": 0.4606, "num_input_tokens_seen": 61585792, "step": 50640 }, { "epoch": 6.345696028066659, "grad_norm": 0.09288672357797623, "learning_rate": 8.629531445171461e-06, "loss": 0.4708, "num_input_tokens_seen": 61591840, "step": 50645 }, { "epoch": 6.346322515975442, "grad_norm": 0.07746012508869171, "learning_rate": 8.629155397188034e-06, "loss": 0.4645, "num_input_tokens_seen": 61598144, "step": 50650 }, { "epoch": 6.346949003884225, "grad_norm": 0.0776626318693161, "learning_rate": 8.628779305815e-06, "loss": 0.466, "num_input_tokens_seen": 61604416, "step": 50655 }, { "epoch": 6.347575491793008, "grad_norm": 0.09453800320625305, "learning_rate": 8.62840317105686e-06, "loss": 0.4563, "num_input_tokens_seen": 61610464, "step": 50660 }, { "epoch": 6.348201979701792, "grad_norm": 0.1076393648982048, "learning_rate": 8.62802699291811e-06, "loss": 0.4642, "num_input_tokens_seen": 61616448, "step": 50665 }, { "epoch": 6.348828467610575, "grad_norm": 0.0928674191236496, "learning_rate": 8.627650771403246e-06, "loss": 0.4646, "num_input_tokens_seen": 61622688, "step": 50670 }, { "epoch": 6.3494549555193585, "grad_norm": 0.057530418038368225, "learning_rate": 8.627274506516768e-06, "loss": 0.4564, "num_input_tokens_seen": 61628672, "step": 50675 }, { "epoch": 6.350081443428142, "grad_norm": 0.09714601933956146, "learning_rate": 8.626898198263172e-06, "loss": 0.4641, "num_input_tokens_seen": 61634688, "step": 50680 }, { "epoch": 6.350707931336925, "grad_norm": 0.07663512229919434, "learning_rate": 8.62652184664696e-06, "loss": 0.4643, "num_input_tokens_seen": 61640640, "step": 50685 }, { "epoch": 6.351334419245709, "grad_norm": 0.08177527785301208, "learning_rate": 8.62614545167263e-06, "loss": 0.463, "num_input_tokens_seen": 61646528, "step": 50690 }, { "epoch": 6.351960907154492, "grad_norm": 0.09066968411207199, "learning_rate": 8.625769013344682e-06, "loss": 0.4565, "num_input_tokens_seen": 61652640, "step": 50695 }, { "epoch": 6.352587395063275, "grad_norm": 0.14222468435764313, "learning_rate": 8.62539253166762e-06, "loss": 0.4679, "num_input_tokens_seen": 61659168, "step": 50700 }, { "epoch": 6.353213882972058, "grad_norm": 0.05312718078494072, "learning_rate": 8.625016006645939e-06, "loss": 0.4616, "num_input_tokens_seen": 61664960, "step": 50705 }, { "epoch": 6.353840370880842, "grad_norm": 0.11021724343299866, "learning_rate": 8.624639438284143e-06, "loss": 0.4655, "num_input_tokens_seen": 61671040, "step": 50710 }, { "epoch": 6.354466858789626, "grad_norm": 0.07886439561843872, "learning_rate": 8.624262826586738e-06, "loss": 0.4718, "num_input_tokens_seen": 61677312, "step": 50715 }, { "epoch": 6.355093346698409, "grad_norm": 0.10418258607387543, "learning_rate": 8.623886171558221e-06, "loss": 0.4606, "num_input_tokens_seen": 61683040, "step": 50720 }, { "epoch": 6.355719834607192, "grad_norm": 0.10030827671289444, "learning_rate": 8.6235094732031e-06, "loss": 0.4682, "num_input_tokens_seen": 61689312, "step": 50725 }, { "epoch": 6.356346322515975, "grad_norm": 0.11176574230194092, "learning_rate": 8.623132731525878e-06, "loss": 0.4596, "num_input_tokens_seen": 61695712, "step": 50730 }, { "epoch": 6.356972810424759, "grad_norm": 0.09085032343864441, "learning_rate": 8.622755946531057e-06, "loss": 0.4562, "num_input_tokens_seen": 61702080, "step": 50735 }, { "epoch": 6.357599298333542, "grad_norm": 0.08326833695173264, "learning_rate": 8.622379118223141e-06, "loss": 0.4597, "num_input_tokens_seen": 61708608, "step": 50740 }, { "epoch": 6.3582257862423255, "grad_norm": 0.09563297778367996, "learning_rate": 8.622002246606638e-06, "loss": 0.4606, "num_input_tokens_seen": 61714528, "step": 50745 }, { "epoch": 6.358852274151109, "grad_norm": 0.11206221580505371, "learning_rate": 8.621625331686053e-06, "loss": 0.4624, "num_input_tokens_seen": 61720576, "step": 50750 }, { "epoch": 6.359478762059892, "grad_norm": 0.12320350110530853, "learning_rate": 8.621248373465894e-06, "loss": 0.4642, "num_input_tokens_seen": 61726976, "step": 50755 }, { "epoch": 6.360105249968676, "grad_norm": 0.08523378521203995, "learning_rate": 8.620871371950663e-06, "loss": 0.4677, "num_input_tokens_seen": 61732832, "step": 50760 }, { "epoch": 6.360731737877459, "grad_norm": 0.09141665697097778, "learning_rate": 8.620494327144873e-06, "loss": 0.4617, "num_input_tokens_seen": 61737856, "step": 50765 }, { "epoch": 6.361358225786242, "grad_norm": 0.07978524267673492, "learning_rate": 8.620117239053027e-06, "loss": 0.459, "num_input_tokens_seen": 61743584, "step": 50770 }, { "epoch": 6.361984713695025, "grad_norm": 0.07264121621847153, "learning_rate": 8.619740107679639e-06, "loss": 0.4616, "num_input_tokens_seen": 61749920, "step": 50775 }, { "epoch": 6.362611201603809, "grad_norm": 0.08469659835100174, "learning_rate": 8.619362933029211e-06, "loss": 0.4602, "num_input_tokens_seen": 61756352, "step": 50780 }, { "epoch": 6.363237689512593, "grad_norm": 0.09874260425567627, "learning_rate": 8.618985715106259e-06, "loss": 0.4562, "num_input_tokens_seen": 61762240, "step": 50785 }, { "epoch": 6.363864177421376, "grad_norm": 0.08250846713781357, "learning_rate": 8.61860845391529e-06, "loss": 0.454, "num_input_tokens_seen": 61768640, "step": 50790 }, { "epoch": 6.364490665330159, "grad_norm": 0.08882877975702286, "learning_rate": 8.618231149460813e-06, "loss": 0.4627, "num_input_tokens_seen": 61775072, "step": 50795 }, { "epoch": 6.365117153238942, "grad_norm": 0.1137956902384758, "learning_rate": 8.617853801747343e-06, "loss": 0.464, "num_input_tokens_seen": 61781440, "step": 50800 }, { "epoch": 6.365743641147726, "grad_norm": 0.09354698657989502, "learning_rate": 8.617476410779386e-06, "loss": 0.4619, "num_input_tokens_seen": 61787104, "step": 50805 }, { "epoch": 6.366370129056509, "grad_norm": 0.10748399049043655, "learning_rate": 8.617098976561458e-06, "loss": 0.4578, "num_input_tokens_seen": 61793088, "step": 50810 }, { "epoch": 6.3669966169652925, "grad_norm": 0.07938159257173538, "learning_rate": 8.616721499098071e-06, "loss": 0.4578, "num_input_tokens_seen": 61799328, "step": 50815 }, { "epoch": 6.367623104874076, "grad_norm": 0.06300658732652664, "learning_rate": 8.616343978393739e-06, "loss": 0.4615, "num_input_tokens_seen": 61805600, "step": 50820 }, { "epoch": 6.36824959278286, "grad_norm": 0.07820959389209747, "learning_rate": 8.615966414452973e-06, "loss": 0.4621, "num_input_tokens_seen": 61811808, "step": 50825 }, { "epoch": 6.368876080691643, "grad_norm": 0.09549926966428757, "learning_rate": 8.615588807280288e-06, "loss": 0.4612, "num_input_tokens_seen": 61817664, "step": 50830 }, { "epoch": 6.369502568600426, "grad_norm": 0.10507942736148834, "learning_rate": 8.615211156880199e-06, "loss": 0.4661, "num_input_tokens_seen": 61824000, "step": 50835 }, { "epoch": 6.370129056509209, "grad_norm": 0.10156875103712082, "learning_rate": 8.614833463257221e-06, "loss": 0.4682, "num_input_tokens_seen": 61830048, "step": 50840 }, { "epoch": 6.370755544417992, "grad_norm": 0.10395114868879318, "learning_rate": 8.614455726415871e-06, "loss": 0.4515, "num_input_tokens_seen": 61836416, "step": 50845 }, { "epoch": 6.3713820323267765, "grad_norm": 0.09068213403224945, "learning_rate": 8.614077946360664e-06, "loss": 0.4561, "num_input_tokens_seen": 61842560, "step": 50850 }, { "epoch": 6.37200852023556, "grad_norm": 0.0971481055021286, "learning_rate": 8.613700123096115e-06, "loss": 0.4654, "num_input_tokens_seen": 61848736, "step": 50855 }, { "epoch": 6.372635008144343, "grad_norm": 0.10216258466243744, "learning_rate": 8.613322256626744e-06, "loss": 0.4684, "num_input_tokens_seen": 61854976, "step": 50860 }, { "epoch": 6.373261496053126, "grad_norm": 0.1503208726644516, "learning_rate": 8.61294434695707e-06, "loss": 0.4571, "num_input_tokens_seen": 61861280, "step": 50865 }, { "epoch": 6.373887983961909, "grad_norm": 0.07896533608436584, "learning_rate": 8.612566394091604e-06, "loss": 0.4517, "num_input_tokens_seen": 61866848, "step": 50870 }, { "epoch": 6.374514471870693, "grad_norm": 0.14133583009243011, "learning_rate": 8.61218839803487e-06, "loss": 0.4624, "num_input_tokens_seen": 61873120, "step": 50875 }, { "epoch": 6.375140959779476, "grad_norm": 0.09651590883731842, "learning_rate": 8.61181035879139e-06, "loss": 0.4635, "num_input_tokens_seen": 61879136, "step": 50880 }, { "epoch": 6.3757674476882595, "grad_norm": 0.1498754769563675, "learning_rate": 8.611432276365678e-06, "loss": 0.4728, "num_input_tokens_seen": 61885056, "step": 50885 }, { "epoch": 6.376393935597043, "grad_norm": 0.09215816110372543, "learning_rate": 8.611054150762256e-06, "loss": 0.4518, "num_input_tokens_seen": 61891296, "step": 50890 }, { "epoch": 6.377020423505826, "grad_norm": 0.1031530573964119, "learning_rate": 8.610675981985649e-06, "loss": 0.4577, "num_input_tokens_seen": 61897632, "step": 50895 }, { "epoch": 6.37764691141461, "grad_norm": 0.14799442887306213, "learning_rate": 8.610297770040373e-06, "loss": 0.4693, "num_input_tokens_seen": 61903744, "step": 50900 }, { "epoch": 6.378273399323393, "grad_norm": 0.12362837046384811, "learning_rate": 8.609919514930952e-06, "loss": 0.4622, "num_input_tokens_seen": 61909536, "step": 50905 }, { "epoch": 6.378899887232176, "grad_norm": 0.10316289961338043, "learning_rate": 8.609541216661907e-06, "loss": 0.4693, "num_input_tokens_seen": 61915936, "step": 50910 }, { "epoch": 6.379526375140959, "grad_norm": 0.09253780543804169, "learning_rate": 8.609162875237764e-06, "loss": 0.4722, "num_input_tokens_seen": 61921984, "step": 50915 }, { "epoch": 6.3801528630497435, "grad_norm": 0.09554208815097809, "learning_rate": 8.608784490663043e-06, "loss": 0.4601, "num_input_tokens_seen": 61928192, "step": 50920 }, { "epoch": 6.380779350958527, "grad_norm": 0.09707707911729813, "learning_rate": 8.60840606294227e-06, "loss": 0.4624, "num_input_tokens_seen": 61934400, "step": 50925 }, { "epoch": 6.38140583886731, "grad_norm": 0.10842312127351761, "learning_rate": 8.608027592079969e-06, "loss": 0.464, "num_input_tokens_seen": 61940736, "step": 50930 }, { "epoch": 6.382032326776093, "grad_norm": 0.14147600531578064, "learning_rate": 8.607649078080664e-06, "loss": 0.4717, "num_input_tokens_seen": 61947072, "step": 50935 }, { "epoch": 6.382658814684876, "grad_norm": 0.11024972051382065, "learning_rate": 8.60727052094888e-06, "loss": 0.469, "num_input_tokens_seen": 61952832, "step": 50940 }, { "epoch": 6.38328530259366, "grad_norm": 0.09180767089128494, "learning_rate": 8.606891920689146e-06, "loss": 0.4586, "num_input_tokens_seen": 61959008, "step": 50945 }, { "epoch": 6.383911790502443, "grad_norm": 0.10099316388368607, "learning_rate": 8.606513277305985e-06, "loss": 0.4439, "num_input_tokens_seen": 61965152, "step": 50950 }, { "epoch": 6.384538278411227, "grad_norm": 0.08800457417964935, "learning_rate": 8.606134590803927e-06, "loss": 0.4646, "num_input_tokens_seen": 61971616, "step": 50955 }, { "epoch": 6.38516476632001, "grad_norm": 0.09169679880142212, "learning_rate": 8.605755861187498e-06, "loss": 0.4587, "num_input_tokens_seen": 61977664, "step": 50960 }, { "epoch": 6.385791254228794, "grad_norm": 0.08424187451601028, "learning_rate": 8.605377088461227e-06, "loss": 0.4604, "num_input_tokens_seen": 61983744, "step": 50965 }, { "epoch": 6.386417742137577, "grad_norm": 0.0630948469042778, "learning_rate": 8.604998272629639e-06, "loss": 0.4614, "num_input_tokens_seen": 61989760, "step": 50970 }, { "epoch": 6.38704423004636, "grad_norm": 0.08619244396686554, "learning_rate": 8.604619413697269e-06, "loss": 0.4619, "num_input_tokens_seen": 61995456, "step": 50975 }, { "epoch": 6.387670717955143, "grad_norm": 0.1355229914188385, "learning_rate": 8.60424051166864e-06, "loss": 0.4664, "num_input_tokens_seen": 62001888, "step": 50980 }, { "epoch": 6.3882972058639265, "grad_norm": 0.13264402747154236, "learning_rate": 8.603861566548287e-06, "loss": 0.4673, "num_input_tokens_seen": 62007840, "step": 50985 }, { "epoch": 6.3889236937727105, "grad_norm": 0.09697915613651276, "learning_rate": 8.603482578340738e-06, "loss": 0.4554, "num_input_tokens_seen": 62014144, "step": 50990 }, { "epoch": 6.389550181681494, "grad_norm": 0.11244209855794907, "learning_rate": 8.603103547050526e-06, "loss": 0.4617, "num_input_tokens_seen": 62020160, "step": 50995 }, { "epoch": 6.390176669590277, "grad_norm": 0.10592728853225708, "learning_rate": 8.602724472682183e-06, "loss": 0.4606, "num_input_tokens_seen": 62026080, "step": 51000 }, { "epoch": 6.39080315749906, "grad_norm": 0.12588828802108765, "learning_rate": 8.602345355240238e-06, "loss": 0.4662, "num_input_tokens_seen": 62031520, "step": 51005 }, { "epoch": 6.391429645407843, "grad_norm": 0.09593697637319565, "learning_rate": 8.601966194729228e-06, "loss": 0.458, "num_input_tokens_seen": 62037216, "step": 51010 }, { "epoch": 6.392056133316627, "grad_norm": 0.12353640794754028, "learning_rate": 8.601586991153681e-06, "loss": 0.4605, "num_input_tokens_seen": 62043232, "step": 51015 }, { "epoch": 6.39268262122541, "grad_norm": 0.1077326238155365, "learning_rate": 8.601207744518134e-06, "loss": 0.4613, "num_input_tokens_seen": 62049344, "step": 51020 }, { "epoch": 6.393309109134194, "grad_norm": 0.13900364935398102, "learning_rate": 8.600828454827122e-06, "loss": 0.4563, "num_input_tokens_seen": 62055552, "step": 51025 }, { "epoch": 6.393935597042977, "grad_norm": 0.14688917994499207, "learning_rate": 8.600449122085177e-06, "loss": 0.469, "num_input_tokens_seen": 62061664, "step": 51030 }, { "epoch": 6.394562084951761, "grad_norm": 0.07264693826436996, "learning_rate": 8.600069746296838e-06, "loss": 0.4589, "num_input_tokens_seen": 62067584, "step": 51035 }, { "epoch": 6.395188572860544, "grad_norm": 0.09814698249101639, "learning_rate": 8.599690327466635e-06, "loss": 0.4593, "num_input_tokens_seen": 62073856, "step": 51040 }, { "epoch": 6.395815060769327, "grad_norm": 0.17818279564380646, "learning_rate": 8.59931086559911e-06, "loss": 0.4558, "num_input_tokens_seen": 62080224, "step": 51045 }, { "epoch": 6.39644154867811, "grad_norm": 0.1300319880247116, "learning_rate": 8.598931360698797e-06, "loss": 0.4676, "num_input_tokens_seen": 62086464, "step": 51050 }, { "epoch": 6.3970680365868935, "grad_norm": 0.09252721816301346, "learning_rate": 8.598551812770234e-06, "loss": 0.4674, "num_input_tokens_seen": 62092640, "step": 51055 }, { "epoch": 6.3976945244956775, "grad_norm": 0.125698983669281, "learning_rate": 8.598172221817958e-06, "loss": 0.4631, "num_input_tokens_seen": 62098880, "step": 51060 }, { "epoch": 6.398321012404461, "grad_norm": 0.08468437939882278, "learning_rate": 8.597792587846507e-06, "loss": 0.4498, "num_input_tokens_seen": 62104928, "step": 51065 }, { "epoch": 6.398947500313244, "grad_norm": 0.1079014465212822, "learning_rate": 8.597412910860423e-06, "loss": 0.4561, "num_input_tokens_seen": 62111264, "step": 51070 }, { "epoch": 6.399573988222027, "grad_norm": 0.09774014353752136, "learning_rate": 8.59703319086424e-06, "loss": 0.4541, "num_input_tokens_seen": 62117440, "step": 51075 }, { "epoch": 6.400200476130811, "grad_norm": 0.12226971983909607, "learning_rate": 8.596653427862505e-06, "loss": 0.4691, "num_input_tokens_seen": 62123904, "step": 51080 }, { "epoch": 6.400826964039594, "grad_norm": 0.14696352183818817, "learning_rate": 8.596273621859752e-06, "loss": 0.4733, "num_input_tokens_seen": 62130208, "step": 51085 }, { "epoch": 6.4014534519483774, "grad_norm": 0.07241298258304596, "learning_rate": 8.595893772860524e-06, "loss": 0.4586, "num_input_tokens_seen": 62136064, "step": 51090 }, { "epoch": 6.402079939857161, "grad_norm": 0.1725797802209854, "learning_rate": 8.595513880869364e-06, "loss": 0.4617, "num_input_tokens_seen": 62142432, "step": 51095 }, { "epoch": 6.402706427765944, "grad_norm": 0.15597407519817352, "learning_rate": 8.595133945890812e-06, "loss": 0.4596, "num_input_tokens_seen": 62148032, "step": 51100 }, { "epoch": 6.403332915674728, "grad_norm": 0.10439594835042953, "learning_rate": 8.594753967929412e-06, "loss": 0.4493, "num_input_tokens_seen": 62154496, "step": 51105 }, { "epoch": 6.403959403583511, "grad_norm": 0.06843724846839905, "learning_rate": 8.594373946989704e-06, "loss": 0.458, "num_input_tokens_seen": 62160416, "step": 51110 }, { "epoch": 6.404585891492294, "grad_norm": 0.11680012196302414, "learning_rate": 8.593993883076236e-06, "loss": 0.4606, "num_input_tokens_seen": 62166496, "step": 51115 }, { "epoch": 6.405212379401077, "grad_norm": 0.11253480613231659, "learning_rate": 8.593613776193546e-06, "loss": 0.4606, "num_input_tokens_seen": 62172512, "step": 51120 }, { "epoch": 6.4058388673098605, "grad_norm": 0.07609183341264725, "learning_rate": 8.593233626346185e-06, "loss": 0.4554, "num_input_tokens_seen": 62178432, "step": 51125 }, { "epoch": 6.406465355218645, "grad_norm": 0.21959470212459564, "learning_rate": 8.592853433538693e-06, "loss": 0.4616, "num_input_tokens_seen": 62184704, "step": 51130 }, { "epoch": 6.407091843127428, "grad_norm": 0.14839714765548706, "learning_rate": 8.592473197775619e-06, "loss": 0.4616, "num_input_tokens_seen": 62191040, "step": 51135 }, { "epoch": 6.407718331036211, "grad_norm": 0.09021925926208496, "learning_rate": 8.592092919061507e-06, "loss": 0.4626, "num_input_tokens_seen": 62197504, "step": 51140 }, { "epoch": 6.408344818944994, "grad_norm": 0.16347773373126984, "learning_rate": 8.591712597400903e-06, "loss": 0.462, "num_input_tokens_seen": 62202944, "step": 51145 }, { "epoch": 6.408971306853778, "grad_norm": 0.094292551279068, "learning_rate": 8.591332232798357e-06, "loss": 0.4687, "num_input_tokens_seen": 62209120, "step": 51150 }, { "epoch": 6.409597794762561, "grad_norm": 0.07750999182462692, "learning_rate": 8.590951825258414e-06, "loss": 0.4641, "num_input_tokens_seen": 62215584, "step": 51155 }, { "epoch": 6.4102242826713445, "grad_norm": 0.11336378753185272, "learning_rate": 8.590571374785622e-06, "loss": 0.4696, "num_input_tokens_seen": 62222016, "step": 51160 }, { "epoch": 6.410850770580128, "grad_norm": 0.08577550947666168, "learning_rate": 8.590190881384533e-06, "loss": 0.4694, "num_input_tokens_seen": 62228192, "step": 51165 }, { "epoch": 6.411477258488911, "grad_norm": 0.11291014403104782, "learning_rate": 8.58981034505969e-06, "loss": 0.4575, "num_input_tokens_seen": 62234240, "step": 51170 }, { "epoch": 6.412103746397695, "grad_norm": 0.13312937319278717, "learning_rate": 8.58942976581565e-06, "loss": 0.4627, "num_input_tokens_seen": 62240384, "step": 51175 }, { "epoch": 6.412730234306478, "grad_norm": 0.09925346076488495, "learning_rate": 8.589049143656955e-06, "loss": 0.4721, "num_input_tokens_seen": 62246496, "step": 51180 }, { "epoch": 6.413356722215261, "grad_norm": 0.1402951180934906, "learning_rate": 8.588668478588163e-06, "loss": 0.4578, "num_input_tokens_seen": 62252768, "step": 51185 }, { "epoch": 6.413983210124044, "grad_norm": 0.08805159479379654, "learning_rate": 8.588287770613821e-06, "loss": 0.4579, "num_input_tokens_seen": 62258688, "step": 51190 }, { "epoch": 6.414609698032828, "grad_norm": 0.08330848067998886, "learning_rate": 8.587907019738482e-06, "loss": 0.4692, "num_input_tokens_seen": 62264960, "step": 51195 }, { "epoch": 6.415236185941612, "grad_norm": 0.0893356129527092, "learning_rate": 8.587526225966698e-06, "loss": 0.4659, "num_input_tokens_seen": 62271232, "step": 51200 }, { "epoch": 6.415862673850395, "grad_norm": 0.11245889216661453, "learning_rate": 8.587145389303022e-06, "loss": 0.468, "num_input_tokens_seen": 62277216, "step": 51205 }, { "epoch": 6.416489161759178, "grad_norm": 0.08898428082466125, "learning_rate": 8.586764509752007e-06, "loss": 0.4657, "num_input_tokens_seen": 62282944, "step": 51210 }, { "epoch": 6.417115649667961, "grad_norm": 0.08689301460981369, "learning_rate": 8.586383587318207e-06, "loss": 0.4537, "num_input_tokens_seen": 62289120, "step": 51215 }, { "epoch": 6.417742137576745, "grad_norm": 0.10027876496315002, "learning_rate": 8.586002622006176e-06, "loss": 0.4629, "num_input_tokens_seen": 62295264, "step": 51220 }, { "epoch": 6.418368625485528, "grad_norm": 0.1400536298751831, "learning_rate": 8.585621613820467e-06, "loss": 0.4603, "num_input_tokens_seen": 62301472, "step": 51225 }, { "epoch": 6.4189951133943115, "grad_norm": 0.08259829878807068, "learning_rate": 8.585240562765639e-06, "loss": 0.4659, "num_input_tokens_seen": 62307904, "step": 51230 }, { "epoch": 6.419621601303095, "grad_norm": 0.12284764647483826, "learning_rate": 8.584859468846246e-06, "loss": 0.4649, "num_input_tokens_seen": 62313952, "step": 51235 }, { "epoch": 6.420248089211878, "grad_norm": 0.0829407274723053, "learning_rate": 8.584478332066841e-06, "loss": 0.4645, "num_input_tokens_seen": 62320352, "step": 51240 }, { "epoch": 6.420874577120662, "grad_norm": 0.09132902324199677, "learning_rate": 8.584097152431986e-06, "loss": 0.4613, "num_input_tokens_seen": 62326112, "step": 51245 }, { "epoch": 6.421501065029445, "grad_norm": 0.1432029753923416, "learning_rate": 8.583715929946238e-06, "loss": 0.4612, "num_input_tokens_seen": 62332096, "step": 51250 }, { "epoch": 6.422127552938228, "grad_norm": 0.12861937284469604, "learning_rate": 8.583334664614149e-06, "loss": 0.4645, "num_input_tokens_seen": 62338208, "step": 51255 }, { "epoch": 6.422754040847011, "grad_norm": 0.1300356239080429, "learning_rate": 8.582953356440284e-06, "loss": 0.4654, "num_input_tokens_seen": 62344064, "step": 51260 }, { "epoch": 6.423380528755795, "grad_norm": 0.12386671453714371, "learning_rate": 8.5825720054292e-06, "loss": 0.4659, "num_input_tokens_seen": 62350112, "step": 51265 }, { "epoch": 6.424007016664579, "grad_norm": 0.0930100828409195, "learning_rate": 8.582190611585454e-06, "loss": 0.4571, "num_input_tokens_seen": 62356640, "step": 51270 }, { "epoch": 6.424633504573362, "grad_norm": 0.08837547153234482, "learning_rate": 8.581809174913607e-06, "loss": 0.451, "num_input_tokens_seen": 62362816, "step": 51275 }, { "epoch": 6.425259992482145, "grad_norm": 0.0826498419046402, "learning_rate": 8.581427695418222e-06, "loss": 0.4645, "num_input_tokens_seen": 62369024, "step": 51280 }, { "epoch": 6.425886480390928, "grad_norm": 0.08692841231822968, "learning_rate": 8.581046173103857e-06, "loss": 0.466, "num_input_tokens_seen": 62375104, "step": 51285 }, { "epoch": 6.426512968299712, "grad_norm": 0.08302723616361618, "learning_rate": 8.580664607975073e-06, "loss": 0.4651, "num_input_tokens_seen": 62381248, "step": 51290 }, { "epoch": 6.427139456208495, "grad_norm": 0.08030878752470016, "learning_rate": 8.580283000036434e-06, "loss": 0.4649, "num_input_tokens_seen": 62387264, "step": 51295 }, { "epoch": 6.4277659441172785, "grad_norm": 0.08407075703144073, "learning_rate": 8.579901349292501e-06, "loss": 0.4592, "num_input_tokens_seen": 62393248, "step": 51300 }, { "epoch": 6.428392432026062, "grad_norm": 0.08630117774009705, "learning_rate": 8.579519655747839e-06, "loss": 0.4597, "num_input_tokens_seen": 62399104, "step": 51305 }, { "epoch": 6.429018919934845, "grad_norm": 0.10320279002189636, "learning_rate": 8.579137919407008e-06, "loss": 0.4553, "num_input_tokens_seen": 62405184, "step": 51310 }, { "epoch": 6.429645407843629, "grad_norm": 0.055460501462221146, "learning_rate": 8.578756140274576e-06, "loss": 0.468, "num_input_tokens_seen": 62411456, "step": 51315 }, { "epoch": 6.430271895752412, "grad_norm": 0.09520527720451355, "learning_rate": 8.578374318355105e-06, "loss": 0.4607, "num_input_tokens_seen": 62417664, "step": 51320 }, { "epoch": 6.430898383661195, "grad_norm": 0.08090365678071976, "learning_rate": 8.577992453653158e-06, "loss": 0.4618, "num_input_tokens_seen": 62423936, "step": 51325 }, { "epoch": 6.431524871569978, "grad_norm": 0.07476375997066498, "learning_rate": 8.577610546173307e-06, "loss": 0.4571, "num_input_tokens_seen": 62430016, "step": 51330 }, { "epoch": 6.4321513594787625, "grad_norm": 0.07307463884353638, "learning_rate": 8.577228595920113e-06, "loss": 0.4598, "num_input_tokens_seen": 62436256, "step": 51335 }, { "epoch": 6.432777847387546, "grad_norm": 0.08167029917240143, "learning_rate": 8.576846602898141e-06, "loss": 0.4607, "num_input_tokens_seen": 62442208, "step": 51340 }, { "epoch": 6.433404335296329, "grad_norm": 0.08672700077295303, "learning_rate": 8.576464567111963e-06, "loss": 0.4624, "num_input_tokens_seen": 62448416, "step": 51345 }, { "epoch": 6.434030823205112, "grad_norm": 0.09590921550989151, "learning_rate": 8.576082488566144e-06, "loss": 0.4615, "num_input_tokens_seen": 62454688, "step": 51350 }, { "epoch": 6.434657311113895, "grad_norm": 0.07738802582025528, "learning_rate": 8.575700367265251e-06, "loss": 0.4584, "num_input_tokens_seen": 62460672, "step": 51355 }, { "epoch": 6.435283799022679, "grad_norm": 0.08249283581972122, "learning_rate": 8.575318203213856e-06, "loss": 0.4695, "num_input_tokens_seen": 62466976, "step": 51360 }, { "epoch": 6.435910286931462, "grad_norm": 0.09207659959793091, "learning_rate": 8.574935996416525e-06, "loss": 0.4643, "num_input_tokens_seen": 62473088, "step": 51365 }, { "epoch": 6.436536774840246, "grad_norm": 0.07751519978046417, "learning_rate": 8.574553746877827e-06, "loss": 0.4601, "num_input_tokens_seen": 62479488, "step": 51370 }, { "epoch": 6.437163262749029, "grad_norm": 0.130076065659523, "learning_rate": 8.574171454602334e-06, "loss": 0.466, "num_input_tokens_seen": 62485376, "step": 51375 }, { "epoch": 6.437789750657812, "grad_norm": 0.11299824714660645, "learning_rate": 8.573789119594617e-06, "loss": 0.4729, "num_input_tokens_seen": 62491616, "step": 51380 }, { "epoch": 6.438416238566596, "grad_norm": 0.1468009352684021, "learning_rate": 8.573406741859248e-06, "loss": 0.454, "num_input_tokens_seen": 62497760, "step": 51385 }, { "epoch": 6.439042726475379, "grad_norm": 0.1270584911108017, "learning_rate": 8.573024321400793e-06, "loss": 0.4614, "num_input_tokens_seen": 62503808, "step": 51390 }, { "epoch": 6.439669214384162, "grad_norm": 0.09677154570817947, "learning_rate": 8.572641858223832e-06, "loss": 0.473, "num_input_tokens_seen": 62509760, "step": 51395 }, { "epoch": 6.4402957022929455, "grad_norm": 0.10711584985256195, "learning_rate": 8.572259352332933e-06, "loss": 0.4671, "num_input_tokens_seen": 62515808, "step": 51400 }, { "epoch": 6.4409221902017295, "grad_norm": 0.09670238941907883, "learning_rate": 8.571876803732668e-06, "loss": 0.4791, "num_input_tokens_seen": 62521600, "step": 51405 }, { "epoch": 6.441548678110513, "grad_norm": 0.10191790759563446, "learning_rate": 8.571494212427616e-06, "loss": 0.4629, "num_input_tokens_seen": 62527456, "step": 51410 }, { "epoch": 6.442175166019296, "grad_norm": 0.0822274386882782, "learning_rate": 8.571111578422345e-06, "loss": 0.4653, "num_input_tokens_seen": 62532896, "step": 51415 }, { "epoch": 6.442801653928079, "grad_norm": 0.07956304401159286, "learning_rate": 8.570728901721435e-06, "loss": 0.4528, "num_input_tokens_seen": 62539200, "step": 51420 }, { "epoch": 6.443428141836862, "grad_norm": 0.04342900216579437, "learning_rate": 8.570346182329457e-06, "loss": 0.4673, "num_input_tokens_seen": 62545088, "step": 51425 }, { "epoch": 6.444054629745646, "grad_norm": 0.08102936297655106, "learning_rate": 8.569963420250989e-06, "loss": 0.4663, "num_input_tokens_seen": 62551456, "step": 51430 }, { "epoch": 6.444681117654429, "grad_norm": 0.07255406677722931, "learning_rate": 8.569580615490606e-06, "loss": 0.4686, "num_input_tokens_seen": 62557760, "step": 51435 }, { "epoch": 6.445307605563213, "grad_norm": 0.10873851180076599, "learning_rate": 8.569197768052887e-06, "loss": 0.4638, "num_input_tokens_seen": 62563968, "step": 51440 }, { "epoch": 6.445934093471996, "grad_norm": 0.04620605334639549, "learning_rate": 8.568814877942408e-06, "loss": 0.4703, "num_input_tokens_seen": 62570112, "step": 51445 }, { "epoch": 6.44656058138078, "grad_norm": 0.06685112416744232, "learning_rate": 8.568431945163748e-06, "loss": 0.4542, "num_input_tokens_seen": 62575680, "step": 51450 }, { "epoch": 6.447187069289563, "grad_norm": 0.12125898897647858, "learning_rate": 8.568048969721482e-06, "loss": 0.4587, "num_input_tokens_seen": 62581664, "step": 51455 }, { "epoch": 6.447813557198346, "grad_norm": 0.10500695556402206, "learning_rate": 8.56766595162019e-06, "loss": 0.4601, "num_input_tokens_seen": 62588000, "step": 51460 }, { "epoch": 6.448440045107129, "grad_norm": 0.07643473148345947, "learning_rate": 8.567282890864454e-06, "loss": 0.4633, "num_input_tokens_seen": 62594016, "step": 51465 }, { "epoch": 6.4490665330159125, "grad_norm": 0.07585497200489044, "learning_rate": 8.56689978745885e-06, "loss": 0.4598, "num_input_tokens_seen": 62600096, "step": 51470 }, { "epoch": 6.4496930209246965, "grad_norm": 0.0766277089715004, "learning_rate": 8.566516641407961e-06, "loss": 0.4513, "num_input_tokens_seen": 62606176, "step": 51475 }, { "epoch": 6.45031950883348, "grad_norm": 0.11293484270572662, "learning_rate": 8.566133452716367e-06, "loss": 0.4658, "num_input_tokens_seen": 62612736, "step": 51480 }, { "epoch": 6.450945996742263, "grad_norm": 0.0780433788895607, "learning_rate": 8.56575022138865e-06, "loss": 0.4528, "num_input_tokens_seen": 62618944, "step": 51485 }, { "epoch": 6.451572484651046, "grad_norm": 0.0714477002620697, "learning_rate": 8.56536694742939e-06, "loss": 0.4546, "num_input_tokens_seen": 62624832, "step": 51490 }, { "epoch": 6.452198972559829, "grad_norm": 0.08113034069538116, "learning_rate": 8.564983630843172e-06, "loss": 0.4527, "num_input_tokens_seen": 62631040, "step": 51495 }, { "epoch": 6.452825460468613, "grad_norm": 0.08638325333595276, "learning_rate": 8.564600271634575e-06, "loss": 0.4636, "num_input_tokens_seen": 62637024, "step": 51500 }, { "epoch": 6.453451948377396, "grad_norm": 0.07749945670366287, "learning_rate": 8.564216869808187e-06, "loss": 0.4608, "num_input_tokens_seen": 62643328, "step": 51505 }, { "epoch": 6.45407843628618, "grad_norm": 0.11260883510112762, "learning_rate": 8.563833425368589e-06, "loss": 0.4571, "num_input_tokens_seen": 62648768, "step": 51510 }, { "epoch": 6.454704924194963, "grad_norm": 0.09990763664245605, "learning_rate": 8.563449938320365e-06, "loss": 0.4615, "num_input_tokens_seen": 62655072, "step": 51515 }, { "epoch": 6.455331412103746, "grad_norm": 0.12198451906442642, "learning_rate": 8.563066408668103e-06, "loss": 0.4479, "num_input_tokens_seen": 62661312, "step": 51520 }, { "epoch": 6.45595790001253, "grad_norm": 0.15169775485992432, "learning_rate": 8.562682836416385e-06, "loss": 0.462, "num_input_tokens_seen": 62667200, "step": 51525 }, { "epoch": 6.456584387921313, "grad_norm": 0.04789325222373009, "learning_rate": 8.562299221569799e-06, "loss": 0.4665, "num_input_tokens_seen": 62673440, "step": 51530 }, { "epoch": 6.457210875830096, "grad_norm": 0.08021160215139389, "learning_rate": 8.56191556413293e-06, "loss": 0.4733, "num_input_tokens_seen": 62678976, "step": 51535 }, { "epoch": 6.4578373637388795, "grad_norm": 0.10598592460155487, "learning_rate": 8.561531864110366e-06, "loss": 0.4553, "num_input_tokens_seen": 62685280, "step": 51540 }, { "epoch": 6.458463851647664, "grad_norm": 0.11329247802495956, "learning_rate": 8.561148121506695e-06, "loss": 0.4676, "num_input_tokens_seen": 62691296, "step": 51545 }, { "epoch": 6.459090339556447, "grad_norm": 0.057190340012311935, "learning_rate": 8.560764336326503e-06, "loss": 0.4651, "num_input_tokens_seen": 62697408, "step": 51550 }, { "epoch": 6.45971682746523, "grad_norm": 0.06187969818711281, "learning_rate": 8.560380508574383e-06, "loss": 0.4695, "num_input_tokens_seen": 62703584, "step": 51555 }, { "epoch": 6.460343315374013, "grad_norm": 0.05061664059758186, "learning_rate": 8.559996638254916e-06, "loss": 0.4583, "num_input_tokens_seen": 62709728, "step": 51560 }, { "epoch": 6.460969803282796, "grad_norm": 0.08907735347747803, "learning_rate": 8.559612725372698e-06, "loss": 0.474, "num_input_tokens_seen": 62715744, "step": 51565 }, { "epoch": 6.46159629119158, "grad_norm": 0.07230525463819504, "learning_rate": 8.559228769932318e-06, "loss": 0.4527, "num_input_tokens_seen": 62721696, "step": 51570 }, { "epoch": 6.4622227791003635, "grad_norm": 0.08037735521793365, "learning_rate": 8.558844771938364e-06, "loss": 0.4598, "num_input_tokens_seen": 62727840, "step": 51575 }, { "epoch": 6.462849267009147, "grad_norm": 0.09262245893478394, "learning_rate": 8.55846073139543e-06, "loss": 0.4686, "num_input_tokens_seen": 62734016, "step": 51580 }, { "epoch": 6.46347575491793, "grad_norm": 0.08561743795871735, "learning_rate": 8.558076648308104e-06, "loss": 0.4607, "num_input_tokens_seen": 62740320, "step": 51585 }, { "epoch": 6.464102242826714, "grad_norm": 0.11161323636770248, "learning_rate": 8.557692522680983e-06, "loss": 0.4642, "num_input_tokens_seen": 62746368, "step": 51590 }, { "epoch": 6.464728730735497, "grad_norm": 0.09283977001905441, "learning_rate": 8.557308354518656e-06, "loss": 0.4542, "num_input_tokens_seen": 62752352, "step": 51595 }, { "epoch": 6.46535521864428, "grad_norm": 0.08384249359369278, "learning_rate": 8.556924143825715e-06, "loss": 0.46, "num_input_tokens_seen": 62758144, "step": 51600 }, { "epoch": 6.465981706553063, "grad_norm": 0.09895886480808258, "learning_rate": 8.556539890606758e-06, "loss": 0.4679, "num_input_tokens_seen": 62764096, "step": 51605 }, { "epoch": 6.4666081944618465, "grad_norm": 0.11920487135648727, "learning_rate": 8.556155594866375e-06, "loss": 0.4448, "num_input_tokens_seen": 62769920, "step": 51610 }, { "epoch": 6.467234682370631, "grad_norm": 0.08677069842815399, "learning_rate": 8.555771256609163e-06, "loss": 0.4576, "num_input_tokens_seen": 62776064, "step": 51615 }, { "epoch": 6.467861170279414, "grad_norm": 0.1364859789609909, "learning_rate": 8.555386875839714e-06, "loss": 0.4762, "num_input_tokens_seen": 62782016, "step": 51620 }, { "epoch": 6.468487658188197, "grad_norm": 0.09576091170310974, "learning_rate": 8.555002452562627e-06, "loss": 0.4575, "num_input_tokens_seen": 62788384, "step": 51625 }, { "epoch": 6.46911414609698, "grad_norm": 0.1261279135942459, "learning_rate": 8.554617986782497e-06, "loss": 0.4505, "num_input_tokens_seen": 62794816, "step": 51630 }, { "epoch": 6.469740634005763, "grad_norm": 0.12239516526460648, "learning_rate": 8.55423347850392e-06, "loss": 0.4531, "num_input_tokens_seen": 62800864, "step": 51635 }, { "epoch": 6.470367121914547, "grad_norm": 0.15494336187839508, "learning_rate": 8.553848927731495e-06, "loss": 0.4591, "num_input_tokens_seen": 62807008, "step": 51640 }, { "epoch": 6.4709936098233305, "grad_norm": 0.24021615087985992, "learning_rate": 8.553464334469816e-06, "loss": 0.4703, "num_input_tokens_seen": 62813024, "step": 51645 }, { "epoch": 6.471620097732114, "grad_norm": 0.09106550365686417, "learning_rate": 8.553079698723484e-06, "loss": 0.4554, "num_input_tokens_seen": 62818944, "step": 51650 }, { "epoch": 6.472246585640897, "grad_norm": 0.1359482705593109, "learning_rate": 8.552695020497098e-06, "loss": 0.4691, "num_input_tokens_seen": 62825088, "step": 51655 }, { "epoch": 6.472873073549681, "grad_norm": 0.23090730607509613, "learning_rate": 8.552310299795253e-06, "loss": 0.4516, "num_input_tokens_seen": 62831328, "step": 51660 }, { "epoch": 6.473499561458464, "grad_norm": 0.4318726062774658, "learning_rate": 8.551925536622554e-06, "loss": 0.4803, "num_input_tokens_seen": 62837568, "step": 51665 }, { "epoch": 6.474126049367247, "grad_norm": 0.1571810245513916, "learning_rate": 8.5515407309836e-06, "loss": 0.4494, "num_input_tokens_seen": 62843360, "step": 51670 }, { "epoch": 6.47475253727603, "grad_norm": 0.1222887635231018, "learning_rate": 8.551155882882989e-06, "loss": 0.4591, "num_input_tokens_seen": 62849568, "step": 51675 }, { "epoch": 6.475379025184814, "grad_norm": 0.09667627513408661, "learning_rate": 8.550770992325323e-06, "loss": 0.469, "num_input_tokens_seen": 62855744, "step": 51680 }, { "epoch": 6.476005513093598, "grad_norm": 0.06559419631958008, "learning_rate": 8.550386059315207e-06, "loss": 0.4681, "num_input_tokens_seen": 62861568, "step": 51685 }, { "epoch": 6.476632001002381, "grad_norm": 0.08395252376794815, "learning_rate": 8.550001083857238e-06, "loss": 0.4561, "num_input_tokens_seen": 62867904, "step": 51690 }, { "epoch": 6.477258488911164, "grad_norm": 0.08654038608074188, "learning_rate": 8.549616065956024e-06, "loss": 0.4567, "num_input_tokens_seen": 62873824, "step": 51695 }, { "epoch": 6.477884976819947, "grad_norm": 0.06295371055603027, "learning_rate": 8.549231005616163e-06, "loss": 0.4752, "num_input_tokens_seen": 62879872, "step": 51700 }, { "epoch": 6.478511464728731, "grad_norm": 0.12560121715068817, "learning_rate": 8.548845902842264e-06, "loss": 0.4561, "num_input_tokens_seen": 62886144, "step": 51705 }, { "epoch": 6.479137952637514, "grad_norm": 0.10214431583881378, "learning_rate": 8.548460757638927e-06, "loss": 0.466, "num_input_tokens_seen": 62892064, "step": 51710 }, { "epoch": 6.4797644405462975, "grad_norm": 0.09833028167486191, "learning_rate": 8.548075570010758e-06, "loss": 0.459, "num_input_tokens_seen": 62898080, "step": 51715 }, { "epoch": 6.480390928455081, "grad_norm": 0.14326435327529907, "learning_rate": 8.547690339962363e-06, "loss": 0.463, "num_input_tokens_seen": 62904256, "step": 51720 }, { "epoch": 6.481017416363864, "grad_norm": 0.139777272939682, "learning_rate": 8.547305067498345e-06, "loss": 0.4566, "num_input_tokens_seen": 62910432, "step": 51725 }, { "epoch": 6.481643904272648, "grad_norm": 0.19138845801353455, "learning_rate": 8.546919752623317e-06, "loss": 0.4617, "num_input_tokens_seen": 62916768, "step": 51730 }, { "epoch": 6.482270392181431, "grad_norm": 0.10751961171627045, "learning_rate": 8.546534395341878e-06, "loss": 0.4578, "num_input_tokens_seen": 62922816, "step": 51735 }, { "epoch": 6.482896880090214, "grad_norm": 0.11546262353658676, "learning_rate": 8.546148995658638e-06, "loss": 0.4754, "num_input_tokens_seen": 62928992, "step": 51740 }, { "epoch": 6.483523367998997, "grad_norm": 0.07592890411615372, "learning_rate": 8.545763553578207e-06, "loss": 0.4716, "num_input_tokens_seen": 62935296, "step": 51745 }, { "epoch": 6.484149855907781, "grad_norm": 0.12336912751197815, "learning_rate": 8.545378069105193e-06, "loss": 0.4552, "num_input_tokens_seen": 62941184, "step": 51750 }, { "epoch": 6.484776343816565, "grad_norm": 0.0788271427154541, "learning_rate": 8.544992542244201e-06, "loss": 0.4575, "num_input_tokens_seen": 62947456, "step": 51755 }, { "epoch": 6.485402831725348, "grad_norm": 0.09991060197353363, "learning_rate": 8.544606972999844e-06, "loss": 0.4627, "num_input_tokens_seen": 62953536, "step": 51760 }, { "epoch": 6.486029319634131, "grad_norm": 0.09517364948987961, "learning_rate": 8.54422136137673e-06, "loss": 0.4679, "num_input_tokens_seen": 62959744, "step": 51765 }, { "epoch": 6.486655807542914, "grad_norm": 0.10970140248537064, "learning_rate": 8.54383570737947e-06, "loss": 0.471, "num_input_tokens_seen": 62965728, "step": 51770 }, { "epoch": 6.487282295451697, "grad_norm": 0.10028491169214249, "learning_rate": 8.543450011012672e-06, "loss": 0.4571, "num_input_tokens_seen": 62971744, "step": 51775 }, { "epoch": 6.487908783360481, "grad_norm": 0.08111841231584549, "learning_rate": 8.543064272280954e-06, "loss": 0.4607, "num_input_tokens_seen": 62978016, "step": 51780 }, { "epoch": 6.4885352712692645, "grad_norm": 0.10609817504882812, "learning_rate": 8.542678491188923e-06, "loss": 0.4618, "num_input_tokens_seen": 62984000, "step": 51785 }, { "epoch": 6.489161759178048, "grad_norm": 0.1045038104057312, "learning_rate": 8.54229266774119e-06, "loss": 0.4708, "num_input_tokens_seen": 62990272, "step": 51790 }, { "epoch": 6.489788247086831, "grad_norm": 0.08273601531982422, "learning_rate": 8.541906801942371e-06, "loss": 0.463, "num_input_tokens_seen": 62995648, "step": 51795 }, { "epoch": 6.490414734995615, "grad_norm": 0.14990149438381195, "learning_rate": 8.54152089379708e-06, "loss": 0.4641, "num_input_tokens_seen": 63001632, "step": 51800 }, { "epoch": 6.491041222904398, "grad_norm": 0.09667166322469711, "learning_rate": 8.541134943309926e-06, "loss": 0.4652, "num_input_tokens_seen": 63007840, "step": 51805 }, { "epoch": 6.491667710813181, "grad_norm": 0.08760912716388702, "learning_rate": 8.540748950485529e-06, "loss": 0.4687, "num_input_tokens_seen": 63013984, "step": 51810 }, { "epoch": 6.4922941987219644, "grad_norm": 0.08035575598478317, "learning_rate": 8.540362915328499e-06, "loss": 0.4554, "num_input_tokens_seen": 63019424, "step": 51815 }, { "epoch": 6.492920686630748, "grad_norm": 0.08495286852121353, "learning_rate": 8.539976837843457e-06, "loss": 0.4527, "num_input_tokens_seen": 63025472, "step": 51820 }, { "epoch": 6.493547174539532, "grad_norm": 0.08971994370222092, "learning_rate": 8.539590718035013e-06, "loss": 0.4618, "num_input_tokens_seen": 63031776, "step": 51825 }, { "epoch": 6.494173662448315, "grad_norm": 0.08343923091888428, "learning_rate": 8.539204555907788e-06, "loss": 0.465, "num_input_tokens_seen": 63037472, "step": 51830 }, { "epoch": 6.494800150357098, "grad_norm": 0.09709049761295319, "learning_rate": 8.538818351466395e-06, "loss": 0.4598, "num_input_tokens_seen": 63043744, "step": 51835 }, { "epoch": 6.495426638265881, "grad_norm": 0.08293603360652924, "learning_rate": 8.538432104715455e-06, "loss": 0.4586, "num_input_tokens_seen": 63049600, "step": 51840 }, { "epoch": 6.496053126174665, "grad_norm": 0.08374322950839996, "learning_rate": 8.538045815659584e-06, "loss": 0.4711, "num_input_tokens_seen": 63055840, "step": 51845 }, { "epoch": 6.496679614083448, "grad_norm": 0.05647290498018265, "learning_rate": 8.5376594843034e-06, "loss": 0.4665, "num_input_tokens_seen": 63062112, "step": 51850 }, { "epoch": 6.497306101992232, "grad_norm": 0.08666741102933884, "learning_rate": 8.537273110651522e-06, "loss": 0.4639, "num_input_tokens_seen": 63068384, "step": 51855 }, { "epoch": 6.497932589901015, "grad_norm": 0.09003611654043198, "learning_rate": 8.53688669470857e-06, "loss": 0.4687, "num_input_tokens_seen": 63074144, "step": 51860 }, { "epoch": 6.498559077809798, "grad_norm": 0.07856070250272751, "learning_rate": 8.536500236479166e-06, "loss": 0.4607, "num_input_tokens_seen": 63080384, "step": 51865 }, { "epoch": 6.499185565718582, "grad_norm": 0.07653003185987473, "learning_rate": 8.536113735967926e-06, "loss": 0.4642, "num_input_tokens_seen": 63086656, "step": 51870 }, { "epoch": 6.499812053627365, "grad_norm": 0.08664899319410324, "learning_rate": 8.535727193179475e-06, "loss": 0.4561, "num_input_tokens_seen": 63092800, "step": 51875 }, { "epoch": 6.500438541536148, "grad_norm": 0.08986261487007141, "learning_rate": 8.535340608118431e-06, "loss": 0.462, "num_input_tokens_seen": 63099040, "step": 51880 }, { "epoch": 6.5010650294449315, "grad_norm": 0.1436954140663147, "learning_rate": 8.53495398078942e-06, "loss": 0.4652, "num_input_tokens_seen": 63105216, "step": 51885 }, { "epoch": 6.501691517353715, "grad_norm": 0.08593054860830307, "learning_rate": 8.53456731119706e-06, "loss": 0.4612, "num_input_tokens_seen": 63111168, "step": 51890 }, { "epoch": 6.502318005262499, "grad_norm": 0.0858464390039444, "learning_rate": 8.534180599345977e-06, "loss": 0.4561, "num_input_tokens_seen": 63116896, "step": 51895 }, { "epoch": 6.502944493171282, "grad_norm": 0.08076190948486328, "learning_rate": 8.533793845240794e-06, "loss": 0.4663, "num_input_tokens_seen": 63122912, "step": 51900 }, { "epoch": 6.503570981080065, "grad_norm": 0.12372931838035583, "learning_rate": 8.533407048886133e-06, "loss": 0.4628, "num_input_tokens_seen": 63129152, "step": 51905 }, { "epoch": 6.504197468988848, "grad_norm": 0.086695596575737, "learning_rate": 8.533020210286623e-06, "loss": 0.4615, "num_input_tokens_seen": 63135488, "step": 51910 }, { "epoch": 6.504823956897631, "grad_norm": 0.0953509509563446, "learning_rate": 8.532633329446884e-06, "loss": 0.4792, "num_input_tokens_seen": 63141504, "step": 51915 }, { "epoch": 6.505450444806415, "grad_norm": 0.08877842873334885, "learning_rate": 8.532246406371543e-06, "loss": 0.4657, "num_input_tokens_seen": 63147744, "step": 51920 }, { "epoch": 6.506076932715199, "grad_norm": 0.0545252226293087, "learning_rate": 8.531859441065226e-06, "loss": 0.4647, "num_input_tokens_seen": 63153760, "step": 51925 }, { "epoch": 6.506703420623982, "grad_norm": 0.07886210829019547, "learning_rate": 8.531472433532562e-06, "loss": 0.4598, "num_input_tokens_seen": 63160192, "step": 51930 }, { "epoch": 6.507329908532765, "grad_norm": 0.16647742688655853, "learning_rate": 8.531085383778175e-06, "loss": 0.466, "num_input_tokens_seen": 63166368, "step": 51935 }, { "epoch": 6.507956396441549, "grad_norm": 0.0966990739107132, "learning_rate": 8.530698291806691e-06, "loss": 0.4556, "num_input_tokens_seen": 63172448, "step": 51940 }, { "epoch": 6.508582884350332, "grad_norm": 0.12400250136852264, "learning_rate": 8.530311157622744e-06, "loss": 0.4583, "num_input_tokens_seen": 63178240, "step": 51945 }, { "epoch": 6.509209372259115, "grad_norm": 0.12525956332683563, "learning_rate": 8.529923981230958e-06, "loss": 0.4621, "num_input_tokens_seen": 63184352, "step": 51950 }, { "epoch": 6.5098358601678985, "grad_norm": 0.12409234791994095, "learning_rate": 8.529536762635962e-06, "loss": 0.4585, "num_input_tokens_seen": 63190400, "step": 51955 }, { "epoch": 6.510462348076683, "grad_norm": 0.08079700171947479, "learning_rate": 8.529149501842388e-06, "loss": 0.4578, "num_input_tokens_seen": 63196576, "step": 51960 }, { "epoch": 6.511088835985466, "grad_norm": 0.09947789460420609, "learning_rate": 8.528762198854864e-06, "loss": 0.4649, "num_input_tokens_seen": 63202752, "step": 51965 }, { "epoch": 6.511715323894249, "grad_norm": 0.08741255849599838, "learning_rate": 8.52837485367802e-06, "loss": 0.461, "num_input_tokens_seen": 63208768, "step": 51970 }, { "epoch": 6.512341811803032, "grad_norm": 0.11668449640274048, "learning_rate": 8.52798746631649e-06, "loss": 0.4553, "num_input_tokens_seen": 63214912, "step": 51975 }, { "epoch": 6.512968299711815, "grad_norm": 0.1256544440984726, "learning_rate": 8.527600036774904e-06, "loss": 0.4607, "num_input_tokens_seen": 63221248, "step": 51980 }, { "epoch": 6.513594787620599, "grad_norm": 0.10339363664388657, "learning_rate": 8.527212565057891e-06, "loss": 0.4628, "num_input_tokens_seen": 63227488, "step": 51985 }, { "epoch": 6.5142212755293825, "grad_norm": 0.09386152029037476, "learning_rate": 8.526825051170088e-06, "loss": 0.4655, "num_input_tokens_seen": 63233664, "step": 51990 }, { "epoch": 6.514847763438166, "grad_norm": 0.09105882793664932, "learning_rate": 8.526437495116126e-06, "loss": 0.4641, "num_input_tokens_seen": 63239968, "step": 51995 }, { "epoch": 6.515474251346949, "grad_norm": 0.08083511143922806, "learning_rate": 8.526049896900638e-06, "loss": 0.4656, "num_input_tokens_seen": 63246080, "step": 52000 }, { "epoch": 6.516100739255732, "grad_norm": 0.09138496965169907, "learning_rate": 8.52566225652826e-06, "loss": 0.458, "num_input_tokens_seen": 63251808, "step": 52005 }, { "epoch": 6.516727227164516, "grad_norm": 0.08786725252866745, "learning_rate": 8.525274574003625e-06, "loss": 0.4622, "num_input_tokens_seen": 63258144, "step": 52010 }, { "epoch": 6.517353715073299, "grad_norm": 0.13062343001365662, "learning_rate": 8.524886849331372e-06, "loss": 0.4565, "num_input_tokens_seen": 63264000, "step": 52015 }, { "epoch": 6.517980202982082, "grad_norm": 0.11013074964284897, "learning_rate": 8.524499082516129e-06, "loss": 0.4612, "num_input_tokens_seen": 63270304, "step": 52020 }, { "epoch": 6.5186066908908655, "grad_norm": 0.1304873824119568, "learning_rate": 8.524111273562537e-06, "loss": 0.4696, "num_input_tokens_seen": 63276416, "step": 52025 }, { "epoch": 6.519233178799649, "grad_norm": 0.09471820294857025, "learning_rate": 8.523723422475232e-06, "loss": 0.4711, "num_input_tokens_seen": 63282624, "step": 52030 }, { "epoch": 6.519859666708433, "grad_norm": 0.14394396543502808, "learning_rate": 8.523335529258851e-06, "loss": 0.4615, "num_input_tokens_seen": 63288608, "step": 52035 }, { "epoch": 6.520486154617216, "grad_norm": 0.09005586802959442, "learning_rate": 8.522947593918032e-06, "loss": 0.4604, "num_input_tokens_seen": 63294336, "step": 52040 }, { "epoch": 6.521112642525999, "grad_norm": 0.08435525000095367, "learning_rate": 8.522559616457411e-06, "loss": 0.4651, "num_input_tokens_seen": 63300224, "step": 52045 }, { "epoch": 6.521739130434782, "grad_norm": 0.08669842779636383, "learning_rate": 8.52217159688163e-06, "loss": 0.4629, "num_input_tokens_seen": 63306560, "step": 52050 }, { "epoch": 6.522365618343566, "grad_norm": 0.09831367433071136, "learning_rate": 8.521783535195325e-06, "loss": 0.4581, "num_input_tokens_seen": 63312640, "step": 52055 }, { "epoch": 6.5229921062523495, "grad_norm": 0.12826378643512726, "learning_rate": 8.521395431403139e-06, "loss": 0.4581, "num_input_tokens_seen": 63318400, "step": 52060 }, { "epoch": 6.523618594161133, "grad_norm": 0.07947118580341339, "learning_rate": 8.521007285509708e-06, "loss": 0.463, "num_input_tokens_seen": 63324480, "step": 52065 }, { "epoch": 6.524245082069916, "grad_norm": 0.16447612643241882, "learning_rate": 8.520619097519674e-06, "loss": 0.4572, "num_input_tokens_seen": 63329472, "step": 52070 }, { "epoch": 6.5248715699787, "grad_norm": 0.10321969538927078, "learning_rate": 8.52023086743768e-06, "loss": 0.4682, "num_input_tokens_seen": 63335712, "step": 52075 }, { "epoch": 6.525498057887483, "grad_norm": 0.13002240657806396, "learning_rate": 8.519842595268366e-06, "loss": 0.4575, "num_input_tokens_seen": 63342080, "step": 52080 }, { "epoch": 6.526124545796266, "grad_norm": 0.12745453417301178, "learning_rate": 8.519454281016375e-06, "loss": 0.4519, "num_input_tokens_seen": 63348320, "step": 52085 }, { "epoch": 6.526751033705049, "grad_norm": 0.14174875617027283, "learning_rate": 8.51906592468635e-06, "loss": 0.4576, "num_input_tokens_seen": 63354176, "step": 52090 }, { "epoch": 6.527377521613833, "grad_norm": 0.06626424938440323, "learning_rate": 8.518677526282931e-06, "loss": 0.4609, "num_input_tokens_seen": 63360416, "step": 52095 }, { "epoch": 6.528004009522617, "grad_norm": 0.10860758274793625, "learning_rate": 8.518289085810764e-06, "loss": 0.4626, "num_input_tokens_seen": 63365984, "step": 52100 }, { "epoch": 6.5286304974314, "grad_norm": 0.11878488957881927, "learning_rate": 8.517900603274494e-06, "loss": 0.4612, "num_input_tokens_seen": 63372224, "step": 52105 }, { "epoch": 6.529256985340183, "grad_norm": 0.16390842199325562, "learning_rate": 8.517512078678765e-06, "loss": 0.4656, "num_input_tokens_seen": 63378464, "step": 52110 }, { "epoch": 6.529883473248966, "grad_norm": 0.13730761408805847, "learning_rate": 8.51712351202822e-06, "loss": 0.4664, "num_input_tokens_seen": 63384608, "step": 52115 }, { "epoch": 6.530509961157749, "grad_norm": 0.11284611374139786, "learning_rate": 8.516734903327508e-06, "loss": 0.4637, "num_input_tokens_seen": 63390720, "step": 52120 }, { "epoch": 6.531136449066533, "grad_norm": 0.11249387264251709, "learning_rate": 8.516346252581274e-06, "loss": 0.4698, "num_input_tokens_seen": 63397504, "step": 52125 }, { "epoch": 6.5317629369753165, "grad_norm": 0.15582218766212463, "learning_rate": 8.515957559794161e-06, "loss": 0.4653, "num_input_tokens_seen": 63403776, "step": 52130 }, { "epoch": 6.5323894248841, "grad_norm": 0.19642981886863708, "learning_rate": 8.515568824970822e-06, "loss": 0.4652, "num_input_tokens_seen": 63410048, "step": 52135 }, { "epoch": 6.533015912792883, "grad_norm": 0.10354556888341904, "learning_rate": 8.515180048115903e-06, "loss": 0.463, "num_input_tokens_seen": 63416192, "step": 52140 }, { "epoch": 6.533642400701666, "grad_norm": 0.09516850113868713, "learning_rate": 8.514791229234048e-06, "loss": 0.4645, "num_input_tokens_seen": 63422592, "step": 52145 }, { "epoch": 6.53426888861045, "grad_norm": 0.08970236778259277, "learning_rate": 8.51440236832991e-06, "loss": 0.4675, "num_input_tokens_seen": 63428832, "step": 52150 }, { "epoch": 6.534895376519233, "grad_norm": 0.06035836040973663, "learning_rate": 8.514013465408137e-06, "loss": 0.4556, "num_input_tokens_seen": 63435104, "step": 52155 }, { "epoch": 6.535521864428016, "grad_norm": 0.18761274218559265, "learning_rate": 8.51362452047338e-06, "loss": 0.4666, "num_input_tokens_seen": 63441120, "step": 52160 }, { "epoch": 6.5361483523368, "grad_norm": 0.08565956354141235, "learning_rate": 8.513235533530286e-06, "loss": 0.459, "num_input_tokens_seen": 63447488, "step": 52165 }, { "epoch": 6.536774840245584, "grad_norm": 0.10001685470342636, "learning_rate": 8.512846504583508e-06, "loss": 0.4614, "num_input_tokens_seen": 63453536, "step": 52170 }, { "epoch": 6.537401328154367, "grad_norm": 0.08983372896909714, "learning_rate": 8.512457433637697e-06, "loss": 0.4619, "num_input_tokens_seen": 63459968, "step": 52175 }, { "epoch": 6.53802781606315, "grad_norm": 0.09495045989751816, "learning_rate": 8.512068320697504e-06, "loss": 0.4618, "num_input_tokens_seen": 63466080, "step": 52180 }, { "epoch": 6.538654303971933, "grad_norm": 0.10890953987836838, "learning_rate": 8.511679165767581e-06, "loss": 0.4542, "num_input_tokens_seen": 63472224, "step": 52185 }, { "epoch": 6.539280791880716, "grad_norm": 0.07267004251480103, "learning_rate": 8.511289968852581e-06, "loss": 0.462, "num_input_tokens_seen": 63478400, "step": 52190 }, { "epoch": 6.5399072797895, "grad_norm": 0.12143351137638092, "learning_rate": 8.51090072995716e-06, "loss": 0.4532, "num_input_tokens_seen": 63484192, "step": 52195 }, { "epoch": 6.5405337676982835, "grad_norm": 0.10229689627885818, "learning_rate": 8.510511449085965e-06, "loss": 0.4723, "num_input_tokens_seen": 63490240, "step": 52200 }, { "epoch": 6.541160255607067, "grad_norm": 0.09790165722370148, "learning_rate": 8.510122126243656e-06, "loss": 0.4642, "num_input_tokens_seen": 63496672, "step": 52205 }, { "epoch": 6.54178674351585, "grad_norm": 0.08813369274139404, "learning_rate": 8.509732761434886e-06, "loss": 0.4688, "num_input_tokens_seen": 63502560, "step": 52210 }, { "epoch": 6.542413231424634, "grad_norm": 0.10784965008497238, "learning_rate": 8.509343354664309e-06, "loss": 0.4587, "num_input_tokens_seen": 63508608, "step": 52215 }, { "epoch": 6.543039719333417, "grad_norm": 0.14051999151706696, "learning_rate": 8.508953905936583e-06, "loss": 0.4621, "num_input_tokens_seen": 63514624, "step": 52220 }, { "epoch": 6.5436662072422, "grad_norm": 0.1038382351398468, "learning_rate": 8.508564415256362e-06, "loss": 0.4641, "num_input_tokens_seen": 63521184, "step": 52225 }, { "epoch": 6.544292695150983, "grad_norm": 0.07701877504587173, "learning_rate": 8.508174882628304e-06, "loss": 0.4562, "num_input_tokens_seen": 63526784, "step": 52230 }, { "epoch": 6.544919183059767, "grad_norm": 0.09584734588861465, "learning_rate": 8.507785308057066e-06, "loss": 0.4603, "num_input_tokens_seen": 63532704, "step": 52235 }, { "epoch": 6.545545670968551, "grad_norm": 0.05779443681240082, "learning_rate": 8.507395691547304e-06, "loss": 0.4588, "num_input_tokens_seen": 63538336, "step": 52240 }, { "epoch": 6.546172158877334, "grad_norm": 0.15123683214187622, "learning_rate": 8.50700603310368e-06, "loss": 0.4659, "num_input_tokens_seen": 63544544, "step": 52245 }, { "epoch": 6.546798646786117, "grad_norm": 0.08364413678646088, "learning_rate": 8.506616332730848e-06, "loss": 0.4585, "num_input_tokens_seen": 63550656, "step": 52250 }, { "epoch": 6.5474251346949, "grad_norm": 0.12097673863172531, "learning_rate": 8.50622659043347e-06, "loss": 0.4541, "num_input_tokens_seen": 63556832, "step": 52255 }, { "epoch": 6.548051622603683, "grad_norm": 0.08516976237297058, "learning_rate": 8.505836806216206e-06, "loss": 0.4661, "num_input_tokens_seen": 63562816, "step": 52260 }, { "epoch": 6.548678110512467, "grad_norm": 0.13015957176685333, "learning_rate": 8.505446980083716e-06, "loss": 0.464, "num_input_tokens_seen": 63569024, "step": 52265 }, { "epoch": 6.549304598421251, "grad_norm": 0.06620968133211136, "learning_rate": 8.50505711204066e-06, "loss": 0.4598, "num_input_tokens_seen": 63575296, "step": 52270 }, { "epoch": 6.549931086330034, "grad_norm": 0.072628915309906, "learning_rate": 8.504667202091698e-06, "loss": 0.465, "num_input_tokens_seen": 63581024, "step": 52275 }, { "epoch": 6.550557574238817, "grad_norm": 0.1306743621826172, "learning_rate": 8.504277250241494e-06, "loss": 0.4619, "num_input_tokens_seen": 63587456, "step": 52280 }, { "epoch": 6.5511840621476, "grad_norm": 0.11544563621282578, "learning_rate": 8.50388725649471e-06, "loss": 0.4591, "num_input_tokens_seen": 63593664, "step": 52285 }, { "epoch": 6.551810550056384, "grad_norm": 0.08272913843393326, "learning_rate": 8.503497220856007e-06, "loss": 0.4628, "num_input_tokens_seen": 63599776, "step": 52290 }, { "epoch": 6.552437037965167, "grad_norm": 0.09477975964546204, "learning_rate": 8.503107143330049e-06, "loss": 0.4682, "num_input_tokens_seen": 63605888, "step": 52295 }, { "epoch": 6.5530635258739505, "grad_norm": 0.10226258635520935, "learning_rate": 8.5027170239215e-06, "loss": 0.4553, "num_input_tokens_seen": 63612032, "step": 52300 }, { "epoch": 6.553690013782734, "grad_norm": 0.08214277029037476, "learning_rate": 8.502326862635024e-06, "loss": 0.4689, "num_input_tokens_seen": 63617824, "step": 52305 }, { "epoch": 6.554316501691518, "grad_norm": 0.08325916528701782, "learning_rate": 8.501936659475285e-06, "loss": 0.4604, "num_input_tokens_seen": 63623808, "step": 52310 }, { "epoch": 6.554942989600301, "grad_norm": 0.0893506109714508, "learning_rate": 8.50154641444695e-06, "loss": 0.4691, "num_input_tokens_seen": 63630176, "step": 52315 }, { "epoch": 6.555569477509084, "grad_norm": 0.19892333447933197, "learning_rate": 8.501156127554684e-06, "loss": 0.4669, "num_input_tokens_seen": 63636416, "step": 52320 }, { "epoch": 6.556195965417867, "grad_norm": 0.11229470372200012, "learning_rate": 8.500765798803153e-06, "loss": 0.4572, "num_input_tokens_seen": 63642880, "step": 52325 }, { "epoch": 6.556822453326651, "grad_norm": 0.12247902154922485, "learning_rate": 8.500375428197023e-06, "loss": 0.4549, "num_input_tokens_seen": 63649088, "step": 52330 }, { "epoch": 6.557448941235434, "grad_norm": 0.08658944815397263, "learning_rate": 8.499985015740961e-06, "loss": 0.4656, "num_input_tokens_seen": 63655104, "step": 52335 }, { "epoch": 6.558075429144218, "grad_norm": 0.08874619007110596, "learning_rate": 8.499594561439635e-06, "loss": 0.4669, "num_input_tokens_seen": 63661504, "step": 52340 }, { "epoch": 6.558701917053001, "grad_norm": 0.056078214198350906, "learning_rate": 8.499204065297717e-06, "loss": 0.4582, "num_input_tokens_seen": 63667168, "step": 52345 }, { "epoch": 6.559328404961784, "grad_norm": 0.08444061875343323, "learning_rate": 8.49881352731987e-06, "loss": 0.4536, "num_input_tokens_seen": 63672992, "step": 52350 }, { "epoch": 6.559954892870568, "grad_norm": 0.11083584278821945, "learning_rate": 8.498422947510765e-06, "loss": 0.463, "num_input_tokens_seen": 63678912, "step": 52355 }, { "epoch": 6.560581380779351, "grad_norm": 0.13559699058532715, "learning_rate": 8.498032325875074e-06, "loss": 0.4611, "num_input_tokens_seen": 63684736, "step": 52360 }, { "epoch": 6.561207868688134, "grad_norm": 0.09949726611375809, "learning_rate": 8.497641662417465e-06, "loss": 0.465, "num_input_tokens_seen": 63690848, "step": 52365 }, { "epoch": 6.5618343565969175, "grad_norm": 0.11495956778526306, "learning_rate": 8.497250957142608e-06, "loss": 0.4508, "num_input_tokens_seen": 63697312, "step": 52370 }, { "epoch": 6.562460844505701, "grad_norm": 0.07992955297231674, "learning_rate": 8.496860210055178e-06, "loss": 0.4561, "num_input_tokens_seen": 63703808, "step": 52375 }, { "epoch": 6.563087332414485, "grad_norm": 0.08564852923154831, "learning_rate": 8.496469421159843e-06, "loss": 0.4642, "num_input_tokens_seen": 63709600, "step": 52380 }, { "epoch": 6.563713820323268, "grad_norm": 0.11347714811563492, "learning_rate": 8.496078590461275e-06, "loss": 0.4702, "num_input_tokens_seen": 63715648, "step": 52385 }, { "epoch": 6.564340308232051, "grad_norm": 0.10616245120763779, "learning_rate": 8.495687717964149e-06, "loss": 0.4601, "num_input_tokens_seen": 63722080, "step": 52390 }, { "epoch": 6.564966796140834, "grad_norm": 0.1363660842180252, "learning_rate": 8.495296803673138e-06, "loss": 0.4593, "num_input_tokens_seen": 63728384, "step": 52395 }, { "epoch": 6.565593284049617, "grad_norm": 0.11997807770967484, "learning_rate": 8.494905847592917e-06, "loss": 0.4524, "num_input_tokens_seen": 63734400, "step": 52400 }, { "epoch": 6.5662197719584015, "grad_norm": 0.10320321470499039, "learning_rate": 8.494514849728155e-06, "loss": 0.4588, "num_input_tokens_seen": 63740512, "step": 52405 }, { "epoch": 6.566846259867185, "grad_norm": 0.09931706637144089, "learning_rate": 8.49412381008353e-06, "loss": 0.457, "num_input_tokens_seen": 63746784, "step": 52410 }, { "epoch": 6.567472747775968, "grad_norm": 0.12508274614810944, "learning_rate": 8.493732728663719e-06, "loss": 0.4597, "num_input_tokens_seen": 63753280, "step": 52415 }, { "epoch": 6.568099235684751, "grad_norm": 0.1704578995704651, "learning_rate": 8.493341605473395e-06, "loss": 0.4725, "num_input_tokens_seen": 63759648, "step": 52420 }, { "epoch": 6.568725723593535, "grad_norm": 0.09480086714029312, "learning_rate": 8.492950440517234e-06, "loss": 0.46, "num_input_tokens_seen": 63765856, "step": 52425 }, { "epoch": 6.569352211502318, "grad_norm": 0.0727033019065857, "learning_rate": 8.492559233799915e-06, "loss": 0.4622, "num_input_tokens_seen": 63772000, "step": 52430 }, { "epoch": 6.569978699411101, "grad_norm": 0.07521440088748932, "learning_rate": 8.492167985326114e-06, "loss": 0.464, "num_input_tokens_seen": 63778304, "step": 52435 }, { "epoch": 6.5706051873198845, "grad_norm": 0.15458093583583832, "learning_rate": 8.491776695100509e-06, "loss": 0.4724, "num_input_tokens_seen": 63784512, "step": 52440 }, { "epoch": 6.571231675228669, "grad_norm": 0.1637854278087616, "learning_rate": 8.491385363127777e-06, "loss": 0.4734, "num_input_tokens_seen": 63790496, "step": 52445 }, { "epoch": 6.571858163137452, "grad_norm": 0.1014711782336235, "learning_rate": 8.490993989412597e-06, "loss": 0.4817, "num_input_tokens_seen": 63796864, "step": 52450 }, { "epoch": 6.572484651046235, "grad_norm": 0.10995981097221375, "learning_rate": 8.49060257395965e-06, "loss": 0.4637, "num_input_tokens_seen": 63803296, "step": 52455 }, { "epoch": 6.573111138955018, "grad_norm": 0.09209202975034714, "learning_rate": 8.490211116773614e-06, "loss": 0.4655, "num_input_tokens_seen": 63809504, "step": 52460 }, { "epoch": 6.573737626863801, "grad_norm": 0.05898074805736542, "learning_rate": 8.48981961785917e-06, "loss": 0.4602, "num_input_tokens_seen": 63815296, "step": 52465 }, { "epoch": 6.574364114772585, "grad_norm": 0.10062147676944733, "learning_rate": 8.489428077220999e-06, "loss": 0.4587, "num_input_tokens_seen": 63821280, "step": 52470 }, { "epoch": 6.5749906026813685, "grad_norm": 0.10607712715864182, "learning_rate": 8.48903649486378e-06, "loss": 0.456, "num_input_tokens_seen": 63827840, "step": 52475 }, { "epoch": 6.575617090590152, "grad_norm": 0.0857534185051918, "learning_rate": 8.488644870792197e-06, "loss": 0.4622, "num_input_tokens_seen": 63832896, "step": 52480 }, { "epoch": 6.576243578498935, "grad_norm": 0.08973193168640137, "learning_rate": 8.488253205010933e-06, "loss": 0.4653, "num_input_tokens_seen": 63839168, "step": 52485 }, { "epoch": 6.576870066407718, "grad_norm": 0.0796060562133789, "learning_rate": 8.487861497524668e-06, "loss": 0.4629, "num_input_tokens_seen": 63845216, "step": 52490 }, { "epoch": 6.577496554316502, "grad_norm": 0.13028624653816223, "learning_rate": 8.487469748338086e-06, "loss": 0.453, "num_input_tokens_seen": 63851232, "step": 52495 }, { "epoch": 6.578123042225285, "grad_norm": 0.11418084800243378, "learning_rate": 8.48707795745587e-06, "loss": 0.4565, "num_input_tokens_seen": 63857312, "step": 52500 }, { "epoch": 6.578749530134068, "grad_norm": 0.13114063441753387, "learning_rate": 8.486686124882707e-06, "loss": 0.4608, "num_input_tokens_seen": 63863424, "step": 52505 }, { "epoch": 6.5793760180428515, "grad_norm": 0.11275547742843628, "learning_rate": 8.486294250623279e-06, "loss": 0.4596, "num_input_tokens_seen": 63869344, "step": 52510 }, { "epoch": 6.580002505951635, "grad_norm": 0.08579616993665695, "learning_rate": 8.485902334682272e-06, "loss": 0.4548, "num_input_tokens_seen": 63875520, "step": 52515 }, { "epoch": 6.580628993860419, "grad_norm": 0.11965024471282959, "learning_rate": 8.485510377064375e-06, "loss": 0.4538, "num_input_tokens_seen": 63881216, "step": 52520 }, { "epoch": 6.581255481769202, "grad_norm": 0.10401831567287445, "learning_rate": 8.485118377774266e-06, "loss": 0.4638, "num_input_tokens_seen": 63887392, "step": 52525 }, { "epoch": 6.581881969677985, "grad_norm": 0.10733431577682495, "learning_rate": 8.48472633681664e-06, "loss": 0.4627, "num_input_tokens_seen": 63893792, "step": 52530 }, { "epoch": 6.582508457586768, "grad_norm": 0.11046512424945831, "learning_rate": 8.48433425419618e-06, "loss": 0.4637, "num_input_tokens_seen": 63900000, "step": 52535 }, { "epoch": 6.5831349454955514, "grad_norm": 0.0895056501030922, "learning_rate": 8.483942129917575e-06, "loss": 0.4485, "num_input_tokens_seen": 63906176, "step": 52540 }, { "epoch": 6.5837614334043355, "grad_norm": 0.08704382926225662, "learning_rate": 8.48354996398551e-06, "loss": 0.4655, "num_input_tokens_seen": 63912256, "step": 52545 }, { "epoch": 6.584387921313119, "grad_norm": 0.12815317511558533, "learning_rate": 8.48315775640468e-06, "loss": 0.4612, "num_input_tokens_seen": 63918176, "step": 52550 }, { "epoch": 6.585014409221902, "grad_norm": 0.17952454090118408, "learning_rate": 8.482765507179768e-06, "loss": 0.46, "num_input_tokens_seen": 63924608, "step": 52555 }, { "epoch": 6.585640897130685, "grad_norm": 0.11613395065069199, "learning_rate": 8.482373216315467e-06, "loss": 0.4658, "num_input_tokens_seen": 63930880, "step": 52560 }, { "epoch": 6.586267385039469, "grad_norm": 0.07753438502550125, "learning_rate": 8.481980883816467e-06, "loss": 0.4704, "num_input_tokens_seen": 63936480, "step": 52565 }, { "epoch": 6.586893872948252, "grad_norm": 0.11719497293233871, "learning_rate": 8.481588509687457e-06, "loss": 0.465, "num_input_tokens_seen": 63942624, "step": 52570 }, { "epoch": 6.587520360857035, "grad_norm": 0.08785109221935272, "learning_rate": 8.48119609393313e-06, "loss": 0.4578, "num_input_tokens_seen": 63949248, "step": 52575 }, { "epoch": 6.588146848765819, "grad_norm": 0.09929955005645752, "learning_rate": 8.480803636558177e-06, "loss": 0.4673, "num_input_tokens_seen": 63954752, "step": 52580 }, { "epoch": 6.588773336674603, "grad_norm": 0.09984509646892548, "learning_rate": 8.480411137567291e-06, "loss": 0.4647, "num_input_tokens_seen": 63960704, "step": 52585 }, { "epoch": 6.589399824583386, "grad_norm": 0.11177974939346313, "learning_rate": 8.480018596965161e-06, "loss": 0.4648, "num_input_tokens_seen": 63966656, "step": 52590 }, { "epoch": 6.590026312492169, "grad_norm": 0.11480896919965744, "learning_rate": 8.479626014756485e-06, "loss": 0.4593, "num_input_tokens_seen": 63972512, "step": 52595 }, { "epoch": 6.590652800400952, "grad_norm": 0.13434182107448578, "learning_rate": 8.479233390945953e-06, "loss": 0.4592, "num_input_tokens_seen": 63978592, "step": 52600 }, { "epoch": 6.591279288309735, "grad_norm": 0.059682734310626984, "learning_rate": 8.478840725538262e-06, "loss": 0.4502, "num_input_tokens_seen": 63984640, "step": 52605 }, { "epoch": 6.591905776218519, "grad_norm": 0.08945424854755402, "learning_rate": 8.478448018538105e-06, "loss": 0.4667, "num_input_tokens_seen": 63990752, "step": 52610 }, { "epoch": 6.5925322641273025, "grad_norm": 0.090012326836586, "learning_rate": 8.478055269950176e-06, "loss": 0.4645, "num_input_tokens_seen": 63996864, "step": 52615 }, { "epoch": 6.593158752036086, "grad_norm": 0.08987381309270859, "learning_rate": 8.477662479779175e-06, "loss": 0.4573, "num_input_tokens_seen": 64002400, "step": 52620 }, { "epoch": 6.593785239944869, "grad_norm": 0.11507688462734222, "learning_rate": 8.477269648029791e-06, "loss": 0.4546, "num_input_tokens_seen": 64008384, "step": 52625 }, { "epoch": 6.594411727853652, "grad_norm": 0.08595811575651169, "learning_rate": 8.476876774706728e-06, "loss": 0.4691, "num_input_tokens_seen": 64014432, "step": 52630 }, { "epoch": 6.595038215762436, "grad_norm": 0.09986947476863861, "learning_rate": 8.476483859814679e-06, "loss": 0.4724, "num_input_tokens_seen": 64020256, "step": 52635 }, { "epoch": 6.595664703671219, "grad_norm": 0.12384463101625443, "learning_rate": 8.476090903358342e-06, "loss": 0.4602, "num_input_tokens_seen": 64026272, "step": 52640 }, { "epoch": 6.596291191580002, "grad_norm": 0.09006467461585999, "learning_rate": 8.475697905342417e-06, "loss": 0.4563, "num_input_tokens_seen": 64032448, "step": 52645 }, { "epoch": 6.596917679488786, "grad_norm": 0.14449861645698547, "learning_rate": 8.4753048657716e-06, "loss": 0.4674, "num_input_tokens_seen": 64038528, "step": 52650 }, { "epoch": 6.597544167397569, "grad_norm": 0.1400057077407837, "learning_rate": 8.474911784650593e-06, "loss": 0.4642, "num_input_tokens_seen": 64044608, "step": 52655 }, { "epoch": 6.598170655306353, "grad_norm": 0.1132621318101883, "learning_rate": 8.474518661984093e-06, "loss": 0.456, "num_input_tokens_seen": 64050592, "step": 52660 }, { "epoch": 6.598797143215136, "grad_norm": 0.13049247860908508, "learning_rate": 8.474125497776802e-06, "loss": 0.4616, "num_input_tokens_seen": 64056960, "step": 52665 }, { "epoch": 6.599423631123919, "grad_norm": 0.13841034471988678, "learning_rate": 8.473732292033419e-06, "loss": 0.4472, "num_input_tokens_seen": 64063008, "step": 52670 }, { "epoch": 6.600050119032702, "grad_norm": 0.2413259744644165, "learning_rate": 8.473339044758646e-06, "loss": 0.4752, "num_input_tokens_seen": 64069440, "step": 52675 }, { "epoch": 6.600676606941486, "grad_norm": 0.09884019941091537, "learning_rate": 8.472945755957184e-06, "loss": 0.4585, "num_input_tokens_seen": 64075456, "step": 52680 }, { "epoch": 6.60130309485027, "grad_norm": 0.12651841342449188, "learning_rate": 8.472552425633737e-06, "loss": 0.4601, "num_input_tokens_seen": 64081376, "step": 52685 }, { "epoch": 6.601929582759053, "grad_norm": 0.18955829739570618, "learning_rate": 8.472159053793005e-06, "loss": 0.4646, "num_input_tokens_seen": 64087712, "step": 52690 }, { "epoch": 6.602556070667836, "grad_norm": 0.14735426008701324, "learning_rate": 8.471765640439694e-06, "loss": 0.461, "num_input_tokens_seen": 64093152, "step": 52695 }, { "epoch": 6.60318255857662, "grad_norm": 0.123079314827919, "learning_rate": 8.471372185578503e-06, "loss": 0.4532, "num_input_tokens_seen": 64098848, "step": 52700 }, { "epoch": 6.603809046485403, "grad_norm": 0.09574461728334427, "learning_rate": 8.470978689214142e-06, "loss": 0.4606, "num_input_tokens_seen": 64105280, "step": 52705 }, { "epoch": 6.604435534394186, "grad_norm": 0.10991743952035904, "learning_rate": 8.47058515135131e-06, "loss": 0.4587, "num_input_tokens_seen": 64111200, "step": 52710 }, { "epoch": 6.6050620223029695, "grad_norm": 0.15498770773410797, "learning_rate": 8.470191571994716e-06, "loss": 0.4706, "num_input_tokens_seen": 64116928, "step": 52715 }, { "epoch": 6.605688510211753, "grad_norm": 0.09752210974693298, "learning_rate": 8.469797951149063e-06, "loss": 0.4773, "num_input_tokens_seen": 64122912, "step": 52720 }, { "epoch": 6.606314998120537, "grad_norm": 0.06537415832281113, "learning_rate": 8.46940428881906e-06, "loss": 0.4596, "num_input_tokens_seen": 64128672, "step": 52725 }, { "epoch": 6.60694148602932, "grad_norm": 0.10407206416130066, "learning_rate": 8.469010585009411e-06, "loss": 0.4708, "num_input_tokens_seen": 64134656, "step": 52730 }, { "epoch": 6.607567973938103, "grad_norm": 0.08459598571062088, "learning_rate": 8.468616839724824e-06, "loss": 0.4726, "num_input_tokens_seen": 64140608, "step": 52735 }, { "epoch": 6.608194461846886, "grad_norm": 0.07694771885871887, "learning_rate": 8.468223052970006e-06, "loss": 0.4647, "num_input_tokens_seen": 64146624, "step": 52740 }, { "epoch": 6.608820949755669, "grad_norm": 0.055667486041784286, "learning_rate": 8.467829224749665e-06, "loss": 0.464, "num_input_tokens_seen": 64153024, "step": 52745 }, { "epoch": 6.609447437664453, "grad_norm": 0.10590948164463043, "learning_rate": 8.46743535506851e-06, "loss": 0.472, "num_input_tokens_seen": 64159008, "step": 52750 }, { "epoch": 6.610073925573237, "grad_norm": 0.06652352213859558, "learning_rate": 8.467041443931253e-06, "loss": 0.4569, "num_input_tokens_seen": 64165312, "step": 52755 }, { "epoch": 6.61070041348202, "grad_norm": 0.14278578758239746, "learning_rate": 8.466647491342596e-06, "loss": 0.4752, "num_input_tokens_seen": 64171584, "step": 52760 }, { "epoch": 6.611326901390803, "grad_norm": 0.12133175879716873, "learning_rate": 8.466253497307257e-06, "loss": 0.4574, "num_input_tokens_seen": 64177728, "step": 52765 }, { "epoch": 6.611953389299586, "grad_norm": 0.07864587008953094, "learning_rate": 8.465859461829941e-06, "loss": 0.4692, "num_input_tokens_seen": 64183936, "step": 52770 }, { "epoch": 6.61257987720837, "grad_norm": 0.10258319228887558, "learning_rate": 8.465465384915364e-06, "loss": 0.4588, "num_input_tokens_seen": 64189824, "step": 52775 }, { "epoch": 6.613206365117153, "grad_norm": 0.07711023837327957, "learning_rate": 8.465071266568231e-06, "loss": 0.4517, "num_input_tokens_seen": 64195808, "step": 52780 }, { "epoch": 6.6138328530259365, "grad_norm": 0.10489942878484726, "learning_rate": 8.464677106793263e-06, "loss": 0.465, "num_input_tokens_seen": 64201696, "step": 52785 }, { "epoch": 6.61445934093472, "grad_norm": 0.1265610158443451, "learning_rate": 8.464282905595162e-06, "loss": 0.4733, "num_input_tokens_seen": 64207936, "step": 52790 }, { "epoch": 6.615085828843503, "grad_norm": 0.0764041319489479, "learning_rate": 8.46388866297865e-06, "loss": 0.4577, "num_input_tokens_seen": 64214144, "step": 52795 }, { "epoch": 6.615712316752287, "grad_norm": 0.11050785332918167, "learning_rate": 8.463494378948434e-06, "loss": 0.4556, "num_input_tokens_seen": 64220224, "step": 52800 }, { "epoch": 6.61633880466107, "grad_norm": 0.11565743386745453, "learning_rate": 8.463100053509232e-06, "loss": 0.4665, "num_input_tokens_seen": 64226432, "step": 52805 }, { "epoch": 6.616965292569853, "grad_norm": 0.11643897742033005, "learning_rate": 8.462705686665758e-06, "loss": 0.4636, "num_input_tokens_seen": 64232480, "step": 52810 }, { "epoch": 6.617591780478636, "grad_norm": 0.13189265131950378, "learning_rate": 8.462311278422725e-06, "loss": 0.4579, "num_input_tokens_seen": 64238368, "step": 52815 }, { "epoch": 6.6182182683874204, "grad_norm": 0.0816224217414856, "learning_rate": 8.46191682878485e-06, "loss": 0.4651, "num_input_tokens_seen": 64244448, "step": 52820 }, { "epoch": 6.618844756296204, "grad_norm": 0.04938226938247681, "learning_rate": 8.461522337756848e-06, "loss": 0.4654, "num_input_tokens_seen": 64250656, "step": 52825 }, { "epoch": 6.619471244204987, "grad_norm": 0.1247100755572319, "learning_rate": 8.461127805343437e-06, "loss": 0.4557, "num_input_tokens_seen": 64256768, "step": 52830 }, { "epoch": 6.62009773211377, "grad_norm": 0.0770283043384552, "learning_rate": 8.460733231549335e-06, "loss": 0.4586, "num_input_tokens_seen": 64262784, "step": 52835 }, { "epoch": 6.620724220022554, "grad_norm": 0.07530438154935837, "learning_rate": 8.460338616379254e-06, "loss": 0.4615, "num_input_tokens_seen": 64269120, "step": 52840 }, { "epoch": 6.621350707931337, "grad_norm": 0.08618206530809402, "learning_rate": 8.459943959837918e-06, "loss": 0.4557, "num_input_tokens_seen": 64275136, "step": 52845 }, { "epoch": 6.62197719584012, "grad_norm": 0.09421107172966003, "learning_rate": 8.459549261930041e-06, "loss": 0.4686, "num_input_tokens_seen": 64281664, "step": 52850 }, { "epoch": 6.6226036837489035, "grad_norm": 0.08913173526525497, "learning_rate": 8.459154522660346e-06, "loss": 0.4625, "num_input_tokens_seen": 64287904, "step": 52855 }, { "epoch": 6.623230171657687, "grad_norm": 0.08639007061719894, "learning_rate": 8.45875974203355e-06, "loss": 0.4527, "num_input_tokens_seen": 64293632, "step": 52860 }, { "epoch": 6.623856659566471, "grad_norm": 0.0904991403222084, "learning_rate": 8.458364920054372e-06, "loss": 0.4589, "num_input_tokens_seen": 64298912, "step": 52865 }, { "epoch": 6.624483147475254, "grad_norm": 0.12733402848243713, "learning_rate": 8.457970056727534e-06, "loss": 0.4495, "num_input_tokens_seen": 64305120, "step": 52870 }, { "epoch": 6.625109635384037, "grad_norm": 0.07200322300195694, "learning_rate": 8.457575152057757e-06, "loss": 0.4711, "num_input_tokens_seen": 64311584, "step": 52875 }, { "epoch": 6.62573612329282, "grad_norm": 0.13773390650749207, "learning_rate": 8.457180206049763e-06, "loss": 0.4651, "num_input_tokens_seen": 64317760, "step": 52880 }, { "epoch": 6.626362611201603, "grad_norm": 0.11269111186265945, "learning_rate": 8.456785218708272e-06, "loss": 0.4558, "num_input_tokens_seen": 64323808, "step": 52885 }, { "epoch": 6.6269890991103875, "grad_norm": 0.17781619727611542, "learning_rate": 8.456390190038008e-06, "loss": 0.4605, "num_input_tokens_seen": 64330496, "step": 52890 }, { "epoch": 6.627615587019171, "grad_norm": 0.15272049605846405, "learning_rate": 8.455995120043694e-06, "loss": 0.4515, "num_input_tokens_seen": 64336288, "step": 52895 }, { "epoch": 6.628242074927954, "grad_norm": 0.13559062778949738, "learning_rate": 8.455600008730051e-06, "loss": 0.4801, "num_input_tokens_seen": 64342464, "step": 52900 }, { "epoch": 6.628868562836737, "grad_norm": 0.15473203361034393, "learning_rate": 8.455204856101805e-06, "loss": 0.4635, "num_input_tokens_seen": 64348320, "step": 52905 }, { "epoch": 6.62949505074552, "grad_norm": 0.10276956111192703, "learning_rate": 8.45480966216368e-06, "loss": 0.4585, "num_input_tokens_seen": 64354496, "step": 52910 }, { "epoch": 6.630121538654304, "grad_norm": 0.09979060292243958, "learning_rate": 8.454414426920402e-06, "loss": 0.4704, "num_input_tokens_seen": 64360256, "step": 52915 }, { "epoch": 6.630748026563087, "grad_norm": 0.09133802354335785, "learning_rate": 8.454019150376694e-06, "loss": 0.4582, "num_input_tokens_seen": 64366560, "step": 52920 }, { "epoch": 6.6313745144718705, "grad_norm": 0.12806811928749084, "learning_rate": 8.453623832537284e-06, "loss": 0.4624, "num_input_tokens_seen": 64372608, "step": 52925 }, { "epoch": 6.632001002380654, "grad_norm": 0.07700715959072113, "learning_rate": 8.453228473406896e-06, "loss": 0.4749, "num_input_tokens_seen": 64378624, "step": 52930 }, { "epoch": 6.632627490289438, "grad_norm": 0.06990472972393036, "learning_rate": 8.452833072990259e-06, "loss": 0.466, "num_input_tokens_seen": 64384640, "step": 52935 }, { "epoch": 6.633253978198221, "grad_norm": 0.14194363355636597, "learning_rate": 8.4524376312921e-06, "loss": 0.4665, "num_input_tokens_seen": 64390304, "step": 52940 }, { "epoch": 6.633880466107004, "grad_norm": 0.08877163380384445, "learning_rate": 8.452042148317147e-06, "loss": 0.465, "num_input_tokens_seen": 64396480, "step": 52945 }, { "epoch": 6.634506954015787, "grad_norm": 0.08133257925510406, "learning_rate": 8.451646624070126e-06, "loss": 0.4707, "num_input_tokens_seen": 64402688, "step": 52950 }, { "epoch": 6.635133441924571, "grad_norm": 0.08190556615591049, "learning_rate": 8.45125105855577e-06, "loss": 0.4618, "num_input_tokens_seen": 64409024, "step": 52955 }, { "epoch": 6.6357599298333545, "grad_norm": 0.06825336813926697, "learning_rate": 8.450855451778803e-06, "loss": 0.4621, "num_input_tokens_seen": 64415232, "step": 52960 }, { "epoch": 6.636386417742138, "grad_norm": 0.08093859255313873, "learning_rate": 8.45045980374396e-06, "loss": 0.4659, "num_input_tokens_seen": 64421408, "step": 52965 }, { "epoch": 6.637012905650921, "grad_norm": 0.0707138329744339, "learning_rate": 8.45006411445597e-06, "loss": 0.4662, "num_input_tokens_seen": 64426976, "step": 52970 }, { "epoch": 6.637639393559704, "grad_norm": 0.0769854336977005, "learning_rate": 8.449668383919561e-06, "loss": 0.4661, "num_input_tokens_seen": 64432864, "step": 52975 }, { "epoch": 6.638265881468488, "grad_norm": 0.11740858852863312, "learning_rate": 8.449272612139465e-06, "loss": 0.4607, "num_input_tokens_seen": 64438240, "step": 52980 }, { "epoch": 6.638892369377271, "grad_norm": 0.0653260350227356, "learning_rate": 8.448876799120418e-06, "loss": 0.465, "num_input_tokens_seen": 64444448, "step": 52985 }, { "epoch": 6.639518857286054, "grad_norm": 0.07236570119857788, "learning_rate": 8.448480944867147e-06, "loss": 0.4602, "num_input_tokens_seen": 64450464, "step": 52990 }, { "epoch": 6.640145345194838, "grad_norm": 0.11634515970945358, "learning_rate": 8.448085049384388e-06, "loss": 0.4533, "num_input_tokens_seen": 64456832, "step": 52995 }, { "epoch": 6.640771833103621, "grad_norm": 0.11304056644439697, "learning_rate": 8.447689112676872e-06, "loss": 0.4661, "num_input_tokens_seen": 64462624, "step": 53000 }, { "epoch": 6.641398321012405, "grad_norm": 0.06749536097049713, "learning_rate": 8.447293134749336e-06, "loss": 0.4596, "num_input_tokens_seen": 64468448, "step": 53005 }, { "epoch": 6.642024808921188, "grad_norm": 0.07507294416427612, "learning_rate": 8.44689711560651e-06, "loss": 0.4663, "num_input_tokens_seen": 64474880, "step": 53010 }, { "epoch": 6.642651296829971, "grad_norm": 0.07384250313043594, "learning_rate": 8.446501055253132e-06, "loss": 0.4628, "num_input_tokens_seen": 64481216, "step": 53015 }, { "epoch": 6.643277784738754, "grad_norm": 0.07518859952688217, "learning_rate": 8.446104953693936e-06, "loss": 0.4713, "num_input_tokens_seen": 64487552, "step": 53020 }, { "epoch": 6.6439042726475375, "grad_norm": 0.08786260336637497, "learning_rate": 8.44570881093366e-06, "loss": 0.4642, "num_input_tokens_seen": 64494144, "step": 53025 }, { "epoch": 6.6445307605563215, "grad_norm": 0.08093944936990738, "learning_rate": 8.445312626977035e-06, "loss": 0.4605, "num_input_tokens_seen": 64500480, "step": 53030 }, { "epoch": 6.645157248465105, "grad_norm": 0.04330949857831001, "learning_rate": 8.444916401828801e-06, "loss": 0.4519, "num_input_tokens_seen": 64506688, "step": 53035 }, { "epoch": 6.645783736373888, "grad_norm": 0.0848100334405899, "learning_rate": 8.444520135493697e-06, "loss": 0.4717, "num_input_tokens_seen": 64512608, "step": 53040 }, { "epoch": 6.646410224282671, "grad_norm": 0.11793973296880722, "learning_rate": 8.44412382797646e-06, "loss": 0.4693, "num_input_tokens_seen": 64518816, "step": 53045 }, { "epoch": 6.647036712191455, "grad_norm": 0.06636807322502136, "learning_rate": 8.443727479281825e-06, "loss": 0.4646, "num_input_tokens_seen": 64525248, "step": 53050 }, { "epoch": 6.647663200100238, "grad_norm": 0.06761617213487625, "learning_rate": 8.443331089414532e-06, "loss": 0.463, "num_input_tokens_seen": 64531104, "step": 53055 }, { "epoch": 6.648289688009021, "grad_norm": 0.09506084769964218, "learning_rate": 8.442934658379321e-06, "loss": 0.4669, "num_input_tokens_seen": 64537248, "step": 53060 }, { "epoch": 6.648916175917805, "grad_norm": 0.12176014482975006, "learning_rate": 8.442538186180933e-06, "loss": 0.4685, "num_input_tokens_seen": 64542912, "step": 53065 }, { "epoch": 6.649542663826588, "grad_norm": 0.0825740396976471, "learning_rate": 8.442141672824105e-06, "loss": 0.463, "num_input_tokens_seen": 64548896, "step": 53070 }, { "epoch": 6.650169151735372, "grad_norm": 0.080580934882164, "learning_rate": 8.44174511831358e-06, "loss": 0.4649, "num_input_tokens_seen": 64555040, "step": 53075 }, { "epoch": 6.650795639644155, "grad_norm": 0.08003024756908417, "learning_rate": 8.441348522654099e-06, "loss": 0.4602, "num_input_tokens_seen": 64561504, "step": 53080 }, { "epoch": 6.651422127552938, "grad_norm": 0.06888532638549805, "learning_rate": 8.440951885850402e-06, "loss": 0.4538, "num_input_tokens_seen": 64567232, "step": 53085 }, { "epoch": 6.652048615461721, "grad_norm": 0.08297675102949142, "learning_rate": 8.440555207907232e-06, "loss": 0.4597, "num_input_tokens_seen": 64573760, "step": 53090 }, { "epoch": 6.652675103370505, "grad_norm": 0.10364466905593872, "learning_rate": 8.440158488829334e-06, "loss": 0.4519, "num_input_tokens_seen": 64579680, "step": 53095 }, { "epoch": 6.6533015912792886, "grad_norm": 0.06588169187307358, "learning_rate": 8.439761728621448e-06, "loss": 0.4615, "num_input_tokens_seen": 64586080, "step": 53100 }, { "epoch": 6.653928079188072, "grad_norm": 0.12354671210050583, "learning_rate": 8.439364927288319e-06, "loss": 0.4669, "num_input_tokens_seen": 64592096, "step": 53105 }, { "epoch": 6.654554567096855, "grad_norm": 0.07643253356218338, "learning_rate": 8.438968084834689e-06, "loss": 0.458, "num_input_tokens_seen": 64598528, "step": 53110 }, { "epoch": 6.655181055005638, "grad_norm": 0.09600692987442017, "learning_rate": 8.438571201265306e-06, "loss": 0.4597, "num_input_tokens_seen": 64604448, "step": 53115 }, { "epoch": 6.655807542914422, "grad_norm": 0.07149474322795868, "learning_rate": 8.438174276584911e-06, "loss": 0.4594, "num_input_tokens_seen": 64610272, "step": 53120 }, { "epoch": 6.656434030823205, "grad_norm": 0.12915271520614624, "learning_rate": 8.437777310798253e-06, "loss": 0.4626, "num_input_tokens_seen": 64616128, "step": 53125 }, { "epoch": 6.6570605187319885, "grad_norm": 0.04949372261762619, "learning_rate": 8.437380303910077e-06, "loss": 0.4613, "num_input_tokens_seen": 64621856, "step": 53130 }, { "epoch": 6.657687006640772, "grad_norm": 0.09560797363519669, "learning_rate": 8.43698325592513e-06, "loss": 0.462, "num_input_tokens_seen": 64628160, "step": 53135 }, { "epoch": 6.658313494549555, "grad_norm": 0.07738184183835983, "learning_rate": 8.436586166848157e-06, "loss": 0.4516, "num_input_tokens_seen": 64634176, "step": 53140 }, { "epoch": 6.658939982458339, "grad_norm": 0.07714176177978516, "learning_rate": 8.43618903668391e-06, "loss": 0.4704, "num_input_tokens_seen": 64640448, "step": 53145 }, { "epoch": 6.659566470367122, "grad_norm": 0.10939207673072815, "learning_rate": 8.435791865437132e-06, "loss": 0.4581, "num_input_tokens_seen": 64646432, "step": 53150 }, { "epoch": 6.660192958275905, "grad_norm": 0.12981685996055603, "learning_rate": 8.435394653112573e-06, "loss": 0.4599, "num_input_tokens_seen": 64652640, "step": 53155 }, { "epoch": 6.660819446184688, "grad_norm": 0.1154005154967308, "learning_rate": 8.434997399714983e-06, "loss": 0.4675, "num_input_tokens_seen": 64658784, "step": 53160 }, { "epoch": 6.6614459340934715, "grad_norm": 0.061884451657533646, "learning_rate": 8.434600105249113e-06, "loss": 0.4681, "num_input_tokens_seen": 64665088, "step": 53165 }, { "epoch": 6.662072422002256, "grad_norm": 0.07226324081420898, "learning_rate": 8.434202769719708e-06, "loss": 0.4653, "num_input_tokens_seen": 64671200, "step": 53170 }, { "epoch": 6.662698909911039, "grad_norm": 0.0841408222913742, "learning_rate": 8.433805393131524e-06, "loss": 0.4675, "num_input_tokens_seen": 64677536, "step": 53175 }, { "epoch": 6.663325397819822, "grad_norm": 0.07812361419200897, "learning_rate": 8.43340797548931e-06, "loss": 0.4696, "num_input_tokens_seen": 64683872, "step": 53180 }, { "epoch": 6.663951885728605, "grad_norm": 0.08246865123510361, "learning_rate": 8.433010516797814e-06, "loss": 0.4649, "num_input_tokens_seen": 64689856, "step": 53185 }, { "epoch": 6.664578373637389, "grad_norm": 0.08603167533874512, "learning_rate": 8.432613017061793e-06, "loss": 0.457, "num_input_tokens_seen": 64696096, "step": 53190 }, { "epoch": 6.665204861546172, "grad_norm": 0.07228979468345642, "learning_rate": 8.432215476285996e-06, "loss": 0.4594, "num_input_tokens_seen": 64702144, "step": 53195 }, { "epoch": 6.6658313494549555, "grad_norm": 0.07062125951051712, "learning_rate": 8.431817894475179e-06, "loss": 0.4609, "num_input_tokens_seen": 64708320, "step": 53200 }, { "epoch": 6.666457837363739, "grad_norm": 0.08491624146699905, "learning_rate": 8.431420271634095e-06, "loss": 0.4604, "num_input_tokens_seen": 64714592, "step": 53205 }, { "epoch": 6.667084325272523, "grad_norm": 0.08357850462198257, "learning_rate": 8.431022607767495e-06, "loss": 0.459, "num_input_tokens_seen": 64720704, "step": 53210 }, { "epoch": 6.667710813181306, "grad_norm": 0.11746331304311752, "learning_rate": 8.430624902880134e-06, "loss": 0.4635, "num_input_tokens_seen": 64726624, "step": 53215 }, { "epoch": 6.668337301090089, "grad_norm": 0.05371079593896866, "learning_rate": 8.430227156976769e-06, "loss": 0.4655, "num_input_tokens_seen": 64732640, "step": 53220 }, { "epoch": 6.668963788998872, "grad_norm": 0.08980533480644226, "learning_rate": 8.429829370062155e-06, "loss": 0.4678, "num_input_tokens_seen": 64738976, "step": 53225 }, { "epoch": 6.669590276907655, "grad_norm": 0.13490566611289978, "learning_rate": 8.429431542141045e-06, "loss": 0.4683, "num_input_tokens_seen": 64744992, "step": 53230 }, { "epoch": 6.670216764816439, "grad_norm": 0.11039338260889053, "learning_rate": 8.4290336732182e-06, "loss": 0.4599, "num_input_tokens_seen": 64751040, "step": 53235 }, { "epoch": 6.670843252725223, "grad_norm": 0.08813893795013428, "learning_rate": 8.428635763298372e-06, "loss": 0.4603, "num_input_tokens_seen": 64757056, "step": 53240 }, { "epoch": 6.671469740634006, "grad_norm": 0.07845158874988556, "learning_rate": 8.428237812386323e-06, "loss": 0.4652, "num_input_tokens_seen": 64762944, "step": 53245 }, { "epoch": 6.672096228542789, "grad_norm": 0.12041053920984268, "learning_rate": 8.427839820486809e-06, "loss": 0.4538, "num_input_tokens_seen": 64769088, "step": 53250 }, { "epoch": 6.672722716451572, "grad_norm": 0.07206546515226364, "learning_rate": 8.427441787604586e-06, "loss": 0.4604, "num_input_tokens_seen": 64775072, "step": 53255 }, { "epoch": 6.673349204360356, "grad_norm": 0.0865805447101593, "learning_rate": 8.427043713744415e-06, "loss": 0.4644, "num_input_tokens_seen": 64781184, "step": 53260 }, { "epoch": 6.673975692269139, "grad_norm": 0.11868014931678772, "learning_rate": 8.426645598911055e-06, "loss": 0.4604, "num_input_tokens_seen": 64787520, "step": 53265 }, { "epoch": 6.6746021801779225, "grad_norm": 0.07151693105697632, "learning_rate": 8.426247443109268e-06, "loss": 0.457, "num_input_tokens_seen": 64792992, "step": 53270 }, { "epoch": 6.675228668086706, "grad_norm": 0.07873973250389099, "learning_rate": 8.42584924634381e-06, "loss": 0.4646, "num_input_tokens_seen": 64799360, "step": 53275 }, { "epoch": 6.675855155995489, "grad_norm": 0.04639824852347374, "learning_rate": 8.425451008619448e-06, "loss": 0.456, "num_input_tokens_seen": 64805504, "step": 53280 }, { "epoch": 6.676481643904273, "grad_norm": 0.09527544677257538, "learning_rate": 8.425052729940934e-06, "loss": 0.4658, "num_input_tokens_seen": 64811680, "step": 53285 }, { "epoch": 6.677108131813056, "grad_norm": 0.0760205015540123, "learning_rate": 8.424654410313038e-06, "loss": 0.4684, "num_input_tokens_seen": 64817536, "step": 53290 }, { "epoch": 6.677734619721839, "grad_norm": 0.11094030737876892, "learning_rate": 8.424256049740519e-06, "loss": 0.4614, "num_input_tokens_seen": 64823584, "step": 53295 }, { "epoch": 6.678361107630622, "grad_norm": 0.10741031169891357, "learning_rate": 8.42385764822814e-06, "loss": 0.4642, "num_input_tokens_seen": 64829376, "step": 53300 }, { "epoch": 6.6789875955394065, "grad_norm": 0.07405924052000046, "learning_rate": 8.423459205780662e-06, "loss": 0.467, "num_input_tokens_seen": 64835456, "step": 53305 }, { "epoch": 6.67961408344819, "grad_norm": 0.11225251853466034, "learning_rate": 8.423060722402853e-06, "loss": 0.4633, "num_input_tokens_seen": 64841408, "step": 53310 }, { "epoch": 6.680240571356973, "grad_norm": 0.09061051160097122, "learning_rate": 8.422662198099474e-06, "loss": 0.4577, "num_input_tokens_seen": 64847392, "step": 53315 }, { "epoch": 6.680867059265756, "grad_norm": 0.07293147593736649, "learning_rate": 8.42226363287529e-06, "loss": 0.4662, "num_input_tokens_seen": 64853728, "step": 53320 }, { "epoch": 6.68149354717454, "grad_norm": 0.0883379727602005, "learning_rate": 8.42186502673507e-06, "loss": 0.4631, "num_input_tokens_seen": 64859648, "step": 53325 }, { "epoch": 6.682120035083323, "grad_norm": 0.07300568372011185, "learning_rate": 8.421466379683577e-06, "loss": 0.4612, "num_input_tokens_seen": 64865792, "step": 53330 }, { "epoch": 6.682746522992106, "grad_norm": 0.08462483435869217, "learning_rate": 8.421067691725573e-06, "loss": 0.463, "num_input_tokens_seen": 64871968, "step": 53335 }, { "epoch": 6.6833730109008895, "grad_norm": 0.1145797148346901, "learning_rate": 8.42066896286583e-06, "loss": 0.4612, "num_input_tokens_seen": 64878144, "step": 53340 }, { "epoch": 6.683999498809673, "grad_norm": 0.075776107609272, "learning_rate": 8.420270193109115e-06, "loss": 0.4576, "num_input_tokens_seen": 64884192, "step": 53345 }, { "epoch": 6.684625986718457, "grad_norm": 0.09272407740354538, "learning_rate": 8.419871382460193e-06, "loss": 0.4599, "num_input_tokens_seen": 64890560, "step": 53350 }, { "epoch": 6.68525247462724, "grad_norm": 0.12392718344926834, "learning_rate": 8.419472530923834e-06, "loss": 0.4632, "num_input_tokens_seen": 64896704, "step": 53355 }, { "epoch": 6.685878962536023, "grad_norm": 0.09102489799261093, "learning_rate": 8.419073638504804e-06, "loss": 0.4661, "num_input_tokens_seen": 64903072, "step": 53360 }, { "epoch": 6.686505450444806, "grad_norm": 0.09615105390548706, "learning_rate": 8.418674705207877e-06, "loss": 0.4596, "num_input_tokens_seen": 64909056, "step": 53365 }, { "epoch": 6.687131938353589, "grad_norm": 0.12809155881404877, "learning_rate": 8.418275731037816e-06, "loss": 0.4653, "num_input_tokens_seen": 64915200, "step": 53370 }, { "epoch": 6.6877584262623735, "grad_norm": 0.08408630639314651, "learning_rate": 8.417876715999399e-06, "loss": 0.4513, "num_input_tokens_seen": 64921376, "step": 53375 }, { "epoch": 6.688384914171157, "grad_norm": 0.06967318803071976, "learning_rate": 8.41747766009739e-06, "loss": 0.4684, "num_input_tokens_seen": 64927232, "step": 53380 }, { "epoch": 6.68901140207994, "grad_norm": 0.09377559274435043, "learning_rate": 8.417078563336562e-06, "loss": 0.4515, "num_input_tokens_seen": 64933632, "step": 53385 }, { "epoch": 6.689637889988723, "grad_norm": 0.051552463322877884, "learning_rate": 8.416679425721688e-06, "loss": 0.4614, "num_input_tokens_seen": 64939936, "step": 53390 }, { "epoch": 6.690264377897506, "grad_norm": 0.08736257255077362, "learning_rate": 8.416280247257538e-06, "loss": 0.4666, "num_input_tokens_seen": 64946144, "step": 53395 }, { "epoch": 6.69089086580629, "grad_norm": 0.08141495287418365, "learning_rate": 8.415881027948885e-06, "loss": 0.4607, "num_input_tokens_seen": 64952160, "step": 53400 }, { "epoch": 6.691517353715073, "grad_norm": 0.1449463814496994, "learning_rate": 8.415481767800503e-06, "loss": 0.464, "num_input_tokens_seen": 64958304, "step": 53405 }, { "epoch": 6.692143841623857, "grad_norm": 0.15966923534870148, "learning_rate": 8.415082466817166e-06, "loss": 0.4585, "num_input_tokens_seen": 64964640, "step": 53410 }, { "epoch": 6.69277032953264, "grad_norm": 0.1255425661802292, "learning_rate": 8.414683125003645e-06, "loss": 0.4564, "num_input_tokens_seen": 64970976, "step": 53415 }, { "epoch": 6.693396817441423, "grad_norm": 0.11737783998250961, "learning_rate": 8.414283742364717e-06, "loss": 0.458, "num_input_tokens_seen": 64976832, "step": 53420 }, { "epoch": 6.694023305350207, "grad_norm": 0.07542673498392105, "learning_rate": 8.413884318905157e-06, "loss": 0.4529, "num_input_tokens_seen": 64983008, "step": 53425 }, { "epoch": 6.69464979325899, "grad_norm": 0.09656555205583572, "learning_rate": 8.413484854629738e-06, "loss": 0.457, "num_input_tokens_seen": 64988928, "step": 53430 }, { "epoch": 6.695276281167773, "grad_norm": 0.10977759212255478, "learning_rate": 8.41308534954324e-06, "loss": 0.4487, "num_input_tokens_seen": 64994304, "step": 53435 }, { "epoch": 6.6959027690765565, "grad_norm": 0.08308958262205124, "learning_rate": 8.412685803650436e-06, "loss": 0.4588, "num_input_tokens_seen": 65000704, "step": 53440 }, { "epoch": 6.6965292569853405, "grad_norm": 0.08182243257761002, "learning_rate": 8.412286216956103e-06, "loss": 0.4675, "num_input_tokens_seen": 65006784, "step": 53445 }, { "epoch": 6.697155744894124, "grad_norm": 0.09537636488676071, "learning_rate": 8.41188658946502e-06, "loss": 0.4642, "num_input_tokens_seen": 65012768, "step": 53450 }, { "epoch": 6.697782232802907, "grad_norm": 0.0926864743232727, "learning_rate": 8.411486921181965e-06, "loss": 0.4709, "num_input_tokens_seen": 65018880, "step": 53455 }, { "epoch": 6.69840872071169, "grad_norm": 0.10874634981155396, "learning_rate": 8.411087212111715e-06, "loss": 0.4665, "num_input_tokens_seen": 65024832, "step": 53460 }, { "epoch": 6.699035208620474, "grad_norm": 0.1252618283033371, "learning_rate": 8.41068746225905e-06, "loss": 0.4596, "num_input_tokens_seen": 65030976, "step": 53465 }, { "epoch": 6.699661696529257, "grad_norm": 0.08417630195617676, "learning_rate": 8.410287671628747e-06, "loss": 0.4704, "num_input_tokens_seen": 65036896, "step": 53470 }, { "epoch": 6.70028818443804, "grad_norm": 0.11346466094255447, "learning_rate": 8.40988784022559e-06, "loss": 0.4513, "num_input_tokens_seen": 65043072, "step": 53475 }, { "epoch": 6.700914672346824, "grad_norm": 0.0822562575340271, "learning_rate": 8.409487968054358e-06, "loss": 0.4602, "num_input_tokens_seen": 65048960, "step": 53480 }, { "epoch": 6.701541160255607, "grad_norm": 0.12404712289571762, "learning_rate": 8.409088055119828e-06, "loss": 0.4606, "num_input_tokens_seen": 65055392, "step": 53485 }, { "epoch": 6.702167648164391, "grad_norm": 0.0744311735033989, "learning_rate": 8.408688101426785e-06, "loss": 0.473, "num_input_tokens_seen": 65061504, "step": 53490 }, { "epoch": 6.702794136073174, "grad_norm": 0.08238361030817032, "learning_rate": 8.40828810698001e-06, "loss": 0.4737, "num_input_tokens_seen": 65068000, "step": 53495 }, { "epoch": 6.703420623981957, "grad_norm": 0.092194102704525, "learning_rate": 8.407888071784286e-06, "loss": 0.4673, "num_input_tokens_seen": 65074400, "step": 53500 }, { "epoch": 6.70404711189074, "grad_norm": 0.08835938572883606, "learning_rate": 8.407487995844393e-06, "loss": 0.4612, "num_input_tokens_seen": 65080480, "step": 53505 }, { "epoch": 6.7046735997995235, "grad_norm": 0.079141765832901, "learning_rate": 8.407087879165117e-06, "loss": 0.4587, "num_input_tokens_seen": 65086240, "step": 53510 }, { "epoch": 6.7053000877083075, "grad_norm": 0.11609270423650742, "learning_rate": 8.406687721751241e-06, "loss": 0.457, "num_input_tokens_seen": 65092416, "step": 53515 }, { "epoch": 6.705926575617091, "grad_norm": 0.09840703010559082, "learning_rate": 8.406287523607549e-06, "loss": 0.4667, "num_input_tokens_seen": 65098240, "step": 53520 }, { "epoch": 6.706553063525874, "grad_norm": 0.05593368038535118, "learning_rate": 8.405887284738826e-06, "loss": 0.4619, "num_input_tokens_seen": 65103936, "step": 53525 }, { "epoch": 6.707179551434657, "grad_norm": 0.09868739545345306, "learning_rate": 8.405487005149858e-06, "loss": 0.4709, "num_input_tokens_seen": 65110304, "step": 53530 }, { "epoch": 6.70780603934344, "grad_norm": 0.07940306514501572, "learning_rate": 8.405086684845427e-06, "loss": 0.466, "num_input_tokens_seen": 65116352, "step": 53535 }, { "epoch": 6.708432527252224, "grad_norm": 0.08346998691558838, "learning_rate": 8.404686323830324e-06, "loss": 0.4578, "num_input_tokens_seen": 65122560, "step": 53540 }, { "epoch": 6.7090590151610074, "grad_norm": 0.07127514481544495, "learning_rate": 8.404285922109332e-06, "loss": 0.4695, "num_input_tokens_seen": 65128544, "step": 53545 }, { "epoch": 6.709685503069791, "grad_norm": 0.08981412649154663, "learning_rate": 8.40388547968724e-06, "loss": 0.4625, "num_input_tokens_seen": 65133824, "step": 53550 }, { "epoch": 6.710311990978574, "grad_norm": 0.08310358226299286, "learning_rate": 8.403484996568836e-06, "loss": 0.4601, "num_input_tokens_seen": 65140256, "step": 53555 }, { "epoch": 6.710938478887358, "grad_norm": 0.10279820114374161, "learning_rate": 8.403084472758907e-06, "loss": 0.4654, "num_input_tokens_seen": 65146112, "step": 53560 }, { "epoch": 6.711564966796141, "grad_norm": 0.0789375826716423, "learning_rate": 8.402683908262244e-06, "loss": 0.4652, "num_input_tokens_seen": 65151904, "step": 53565 }, { "epoch": 6.712191454704924, "grad_norm": 0.10909078270196915, "learning_rate": 8.402283303083631e-06, "loss": 0.4609, "num_input_tokens_seen": 65158080, "step": 53570 }, { "epoch": 6.712817942613707, "grad_norm": 0.15724967420101166, "learning_rate": 8.401882657227862e-06, "loss": 0.4667, "num_input_tokens_seen": 65164352, "step": 53575 }, { "epoch": 6.713444430522491, "grad_norm": 0.08201103657484055, "learning_rate": 8.401481970699726e-06, "loss": 0.4546, "num_input_tokens_seen": 65170368, "step": 53580 }, { "epoch": 6.714070918431275, "grad_norm": 0.06330999732017517, "learning_rate": 8.401081243504013e-06, "loss": 0.463, "num_input_tokens_seen": 65176480, "step": 53585 }, { "epoch": 6.714697406340058, "grad_norm": 0.08078816533088684, "learning_rate": 8.400680475645513e-06, "loss": 0.4635, "num_input_tokens_seen": 65182496, "step": 53590 }, { "epoch": 6.715323894248841, "grad_norm": 0.17162442207336426, "learning_rate": 8.400279667129022e-06, "loss": 0.4628, "num_input_tokens_seen": 65188576, "step": 53595 }, { "epoch": 6.715950382157624, "grad_norm": 0.09187065064907074, "learning_rate": 8.399878817959324e-06, "loss": 0.466, "num_input_tokens_seen": 65194560, "step": 53600 }, { "epoch": 6.716576870066408, "grad_norm": 0.08739780634641647, "learning_rate": 8.39947792814122e-06, "loss": 0.4694, "num_input_tokens_seen": 65200448, "step": 53605 }, { "epoch": 6.717203357975191, "grad_norm": 0.08041884005069733, "learning_rate": 8.399076997679498e-06, "loss": 0.4631, "num_input_tokens_seen": 65206688, "step": 53610 }, { "epoch": 6.7178298458839745, "grad_norm": 0.08447013795375824, "learning_rate": 8.398676026578953e-06, "loss": 0.4634, "num_input_tokens_seen": 65213056, "step": 53615 }, { "epoch": 6.718456333792758, "grad_norm": 0.10701607167720795, "learning_rate": 8.398275014844378e-06, "loss": 0.4628, "num_input_tokens_seen": 65219296, "step": 53620 }, { "epoch": 6.719082821701541, "grad_norm": 0.07772545516490936, "learning_rate": 8.397873962480569e-06, "loss": 0.4657, "num_input_tokens_seen": 65225344, "step": 53625 }, { "epoch": 6.719709309610325, "grad_norm": 0.07211871445178986, "learning_rate": 8.39747286949232e-06, "loss": 0.4631, "num_input_tokens_seen": 65231328, "step": 53630 }, { "epoch": 6.720335797519108, "grad_norm": 0.1078653559088707, "learning_rate": 8.397071735884425e-06, "loss": 0.4592, "num_input_tokens_seen": 65237600, "step": 53635 }, { "epoch": 6.720962285427891, "grad_norm": 0.08897016942501068, "learning_rate": 8.396670561661682e-06, "loss": 0.4687, "num_input_tokens_seen": 65244032, "step": 53640 }, { "epoch": 6.721588773336674, "grad_norm": 0.04262625426054001, "learning_rate": 8.396269346828887e-06, "loss": 0.466, "num_input_tokens_seen": 65250208, "step": 53645 }, { "epoch": 6.7222152612454575, "grad_norm": 0.04661800339818001, "learning_rate": 8.395868091390838e-06, "loss": 0.4568, "num_input_tokens_seen": 65256256, "step": 53650 }, { "epoch": 6.722841749154242, "grad_norm": 0.08581515401601791, "learning_rate": 8.395466795352328e-06, "loss": 0.4629, "num_input_tokens_seen": 65262592, "step": 53655 }, { "epoch": 6.723468237063025, "grad_norm": 0.11753962934017181, "learning_rate": 8.39506545871816e-06, "loss": 0.4661, "num_input_tokens_seen": 65268384, "step": 53660 }, { "epoch": 6.724094724971808, "grad_norm": 0.08213931322097778, "learning_rate": 8.394664081493129e-06, "loss": 0.4663, "num_input_tokens_seen": 65273696, "step": 53665 }, { "epoch": 6.724721212880591, "grad_norm": 0.0777997374534607, "learning_rate": 8.394262663682035e-06, "loss": 0.4602, "num_input_tokens_seen": 65279520, "step": 53670 }, { "epoch": 6.725347700789374, "grad_norm": 0.08575639873743057, "learning_rate": 8.393861205289676e-06, "loss": 0.4607, "num_input_tokens_seen": 65285728, "step": 53675 }, { "epoch": 6.725974188698158, "grad_norm": 0.08443484455347061, "learning_rate": 8.393459706320854e-06, "loss": 0.4642, "num_input_tokens_seen": 65291744, "step": 53680 }, { "epoch": 6.7266006766069415, "grad_norm": 0.07178448140621185, "learning_rate": 8.393058166780368e-06, "loss": 0.4651, "num_input_tokens_seen": 65297152, "step": 53685 }, { "epoch": 6.727227164515725, "grad_norm": 0.10674392431974411, "learning_rate": 8.392656586673018e-06, "loss": 0.4644, "num_input_tokens_seen": 65303232, "step": 53690 }, { "epoch": 6.727853652424508, "grad_norm": 0.07459066063165665, "learning_rate": 8.392254966003607e-06, "loss": 0.4665, "num_input_tokens_seen": 65309536, "step": 53695 }, { "epoch": 6.728480140333292, "grad_norm": 0.0713856965303421, "learning_rate": 8.391853304776937e-06, "loss": 0.4658, "num_input_tokens_seen": 65315584, "step": 53700 }, { "epoch": 6.729106628242075, "grad_norm": 0.06565011292695999, "learning_rate": 8.391451602997809e-06, "loss": 0.4649, "num_input_tokens_seen": 65321888, "step": 53705 }, { "epoch": 6.729733116150858, "grad_norm": 0.0643211305141449, "learning_rate": 8.391049860671024e-06, "loss": 0.4614, "num_input_tokens_seen": 65327968, "step": 53710 }, { "epoch": 6.730359604059641, "grad_norm": 0.06129925698041916, "learning_rate": 8.390648077801388e-06, "loss": 0.4689, "num_input_tokens_seen": 65334176, "step": 53715 }, { "epoch": 6.7309860919684255, "grad_norm": 0.10303398966789246, "learning_rate": 8.390246254393703e-06, "loss": 0.4699, "num_input_tokens_seen": 65339776, "step": 53720 }, { "epoch": 6.731612579877209, "grad_norm": 0.0809386596083641, "learning_rate": 8.389844390452772e-06, "loss": 0.4612, "num_input_tokens_seen": 65345760, "step": 53725 }, { "epoch": 6.732239067785992, "grad_norm": 0.06482695788145065, "learning_rate": 8.389442485983405e-06, "loss": 0.4703, "num_input_tokens_seen": 65351808, "step": 53730 }, { "epoch": 6.732865555694775, "grad_norm": 0.120360367000103, "learning_rate": 8.3890405409904e-06, "loss": 0.4617, "num_input_tokens_seen": 65358144, "step": 53735 }, { "epoch": 6.733492043603558, "grad_norm": 0.042773958295583725, "learning_rate": 8.388638555478567e-06, "loss": 0.4646, "num_input_tokens_seen": 65363616, "step": 53740 }, { "epoch": 6.734118531512342, "grad_norm": 0.08583307266235352, "learning_rate": 8.388236529452713e-06, "loss": 0.4668, "num_input_tokens_seen": 65369888, "step": 53745 }, { "epoch": 6.734745019421125, "grad_norm": 0.062879778444767, "learning_rate": 8.387834462917642e-06, "loss": 0.4657, "num_input_tokens_seen": 65376160, "step": 53750 }, { "epoch": 6.7353715073299085, "grad_norm": 0.07380605489015579, "learning_rate": 8.387432355878158e-06, "loss": 0.4614, "num_input_tokens_seen": 65382176, "step": 53755 }, { "epoch": 6.735997995238692, "grad_norm": 0.1055334284901619, "learning_rate": 8.387030208339075e-06, "loss": 0.4634, "num_input_tokens_seen": 65388128, "step": 53760 }, { "epoch": 6.736624483147475, "grad_norm": 0.03918347880244255, "learning_rate": 8.3866280203052e-06, "loss": 0.4629, "num_input_tokens_seen": 65394208, "step": 53765 }, { "epoch": 6.737250971056259, "grad_norm": 0.060250889509916306, "learning_rate": 8.386225791781337e-06, "loss": 0.4645, "num_input_tokens_seen": 65400160, "step": 53770 }, { "epoch": 6.737877458965042, "grad_norm": 0.0717150941491127, "learning_rate": 8.385823522772299e-06, "loss": 0.4579, "num_input_tokens_seen": 65406528, "step": 53775 }, { "epoch": 6.738503946873825, "grad_norm": 0.07771694660186768, "learning_rate": 8.385421213282895e-06, "loss": 0.4615, "num_input_tokens_seen": 65412896, "step": 53780 }, { "epoch": 6.739130434782608, "grad_norm": 0.08190833032131195, "learning_rate": 8.385018863317931e-06, "loss": 0.4555, "num_input_tokens_seen": 65419104, "step": 53785 }, { "epoch": 6.739756922691392, "grad_norm": 0.061745885759592056, "learning_rate": 8.384616472882223e-06, "loss": 0.4673, "num_input_tokens_seen": 65425056, "step": 53790 }, { "epoch": 6.740383410600176, "grad_norm": 0.08278167247772217, "learning_rate": 8.38421404198058e-06, "loss": 0.4642, "num_input_tokens_seen": 65430400, "step": 53795 }, { "epoch": 6.741009898508959, "grad_norm": 0.06795419752597809, "learning_rate": 8.383811570617808e-06, "loss": 0.4642, "num_input_tokens_seen": 65436576, "step": 53800 }, { "epoch": 6.741636386417742, "grad_norm": 0.06382256001234055, "learning_rate": 8.383409058798728e-06, "loss": 0.4594, "num_input_tokens_seen": 65442816, "step": 53805 }, { "epoch": 6.742262874326525, "grad_norm": 0.08917959779500961, "learning_rate": 8.383006506528148e-06, "loss": 0.461, "num_input_tokens_seen": 65448832, "step": 53810 }, { "epoch": 6.742889362235309, "grad_norm": 0.04763177037239075, "learning_rate": 8.38260391381088e-06, "loss": 0.4632, "num_input_tokens_seen": 65454624, "step": 53815 }, { "epoch": 6.743515850144092, "grad_norm": 0.0437367707490921, "learning_rate": 8.382201280651738e-06, "loss": 0.4649, "num_input_tokens_seen": 65460768, "step": 53820 }, { "epoch": 6.7441423380528756, "grad_norm": 0.12043534964323044, "learning_rate": 8.381798607055538e-06, "loss": 0.4628, "num_input_tokens_seen": 65466816, "step": 53825 }, { "epoch": 6.744768825961659, "grad_norm": 0.07891707122325897, "learning_rate": 8.38139589302709e-06, "loss": 0.4628, "num_input_tokens_seen": 65472704, "step": 53830 }, { "epoch": 6.745395313870443, "grad_norm": 0.07602246105670929, "learning_rate": 8.380993138571211e-06, "loss": 0.4598, "num_input_tokens_seen": 65478880, "step": 53835 }, { "epoch": 6.746021801779226, "grad_norm": 0.08552300930023193, "learning_rate": 8.380590343692719e-06, "loss": 0.453, "num_input_tokens_seen": 65484928, "step": 53840 }, { "epoch": 6.746648289688009, "grad_norm": 0.11347512155771255, "learning_rate": 8.380187508396426e-06, "loss": 0.4662, "num_input_tokens_seen": 65490944, "step": 53845 }, { "epoch": 6.747274777596792, "grad_norm": 0.07060981541872025, "learning_rate": 8.379784632687149e-06, "loss": 0.4644, "num_input_tokens_seen": 65496928, "step": 53850 }, { "epoch": 6.7479012655055755, "grad_norm": 0.09832065552473068, "learning_rate": 8.379381716569705e-06, "loss": 0.4674, "num_input_tokens_seen": 65503104, "step": 53855 }, { "epoch": 6.7485277534143595, "grad_norm": 0.0642952248454094, "learning_rate": 8.378978760048912e-06, "loss": 0.4633, "num_input_tokens_seen": 65509312, "step": 53860 }, { "epoch": 6.749154241323143, "grad_norm": 0.07469555735588074, "learning_rate": 8.378575763129587e-06, "loss": 0.4644, "num_input_tokens_seen": 65515360, "step": 53865 }, { "epoch": 6.749780729231926, "grad_norm": 0.07272517681121826, "learning_rate": 8.378172725816548e-06, "loss": 0.4668, "num_input_tokens_seen": 65521472, "step": 53870 }, { "epoch": 6.750407217140709, "grad_norm": 0.07584621012210846, "learning_rate": 8.377769648114615e-06, "loss": 0.4632, "num_input_tokens_seen": 65527424, "step": 53875 }, { "epoch": 6.751033705049492, "grad_norm": 0.08604506403207779, "learning_rate": 8.377366530028603e-06, "loss": 0.463, "num_input_tokens_seen": 65533536, "step": 53880 }, { "epoch": 6.751660192958276, "grad_norm": 0.06807790696620941, "learning_rate": 8.376963371563337e-06, "loss": 0.4686, "num_input_tokens_seen": 65539648, "step": 53885 }, { "epoch": 6.752286680867059, "grad_norm": 0.08003559708595276, "learning_rate": 8.376560172723634e-06, "loss": 0.4594, "num_input_tokens_seen": 65545632, "step": 53890 }, { "epoch": 6.752913168775843, "grad_norm": 0.07900045812129974, "learning_rate": 8.376156933514317e-06, "loss": 0.4664, "num_input_tokens_seen": 65551808, "step": 53895 }, { "epoch": 6.753539656684626, "grad_norm": 0.08032344281673431, "learning_rate": 8.375753653940203e-06, "loss": 0.4534, "num_input_tokens_seen": 65557536, "step": 53900 }, { "epoch": 6.754166144593409, "grad_norm": 0.0434361957013607, "learning_rate": 8.375350334006116e-06, "loss": 0.4588, "num_input_tokens_seen": 65563744, "step": 53905 }, { "epoch": 6.754792632502193, "grad_norm": 0.06490562111139297, "learning_rate": 8.37494697371688e-06, "loss": 0.4661, "num_input_tokens_seen": 65570208, "step": 53910 }, { "epoch": 6.755419120410976, "grad_norm": 0.06986402720212936, "learning_rate": 8.374543573077313e-06, "loss": 0.4568, "num_input_tokens_seen": 65576096, "step": 53915 }, { "epoch": 6.756045608319759, "grad_norm": 0.09424644708633423, "learning_rate": 8.374140132092241e-06, "loss": 0.4636, "num_input_tokens_seen": 65582016, "step": 53920 }, { "epoch": 6.7566720962285425, "grad_norm": 0.08489781618118286, "learning_rate": 8.373736650766487e-06, "loss": 0.4499, "num_input_tokens_seen": 65587776, "step": 53925 }, { "epoch": 6.7572985841373265, "grad_norm": 0.08686652034521103, "learning_rate": 8.373333129104876e-06, "loss": 0.4609, "num_input_tokens_seen": 65593856, "step": 53930 }, { "epoch": 6.75792507204611, "grad_norm": 0.06555497646331787, "learning_rate": 8.37292956711223e-06, "loss": 0.4594, "num_input_tokens_seen": 65600192, "step": 53935 }, { "epoch": 6.758551559954893, "grad_norm": 0.09090356528759003, "learning_rate": 8.372525964793376e-06, "loss": 0.4697, "num_input_tokens_seen": 65606432, "step": 53940 }, { "epoch": 6.759178047863676, "grad_norm": 0.08514582365751266, "learning_rate": 8.372122322153138e-06, "loss": 0.4678, "num_input_tokens_seen": 65612448, "step": 53945 }, { "epoch": 6.759804535772459, "grad_norm": 0.06969136744737625, "learning_rate": 8.371718639196343e-06, "loss": 0.4693, "num_input_tokens_seen": 65618784, "step": 53950 }, { "epoch": 6.760431023681243, "grad_norm": 0.07631456106901169, "learning_rate": 8.371314915927818e-06, "loss": 0.4663, "num_input_tokens_seen": 65624736, "step": 53955 }, { "epoch": 6.761057511590026, "grad_norm": 0.07673045247793198, "learning_rate": 8.370911152352386e-06, "loss": 0.4569, "num_input_tokens_seen": 65630368, "step": 53960 }, { "epoch": 6.76168399949881, "grad_norm": 0.08494257181882858, "learning_rate": 8.370507348474879e-06, "loss": 0.4587, "num_input_tokens_seen": 65635904, "step": 53965 }, { "epoch": 6.762310487407593, "grad_norm": 0.08236192911863327, "learning_rate": 8.370103504300123e-06, "loss": 0.4615, "num_input_tokens_seen": 65642272, "step": 53970 }, { "epoch": 6.762936975316377, "grad_norm": 0.06124658137559891, "learning_rate": 8.369699619832945e-06, "loss": 0.4596, "num_input_tokens_seen": 65648608, "step": 53975 }, { "epoch": 6.76356346322516, "grad_norm": 0.07645253837108612, "learning_rate": 8.369295695078175e-06, "loss": 0.4608, "num_input_tokens_seen": 65654624, "step": 53980 }, { "epoch": 6.764189951133943, "grad_norm": 0.07690216600894928, "learning_rate": 8.368891730040642e-06, "loss": 0.4623, "num_input_tokens_seen": 65661024, "step": 53985 }, { "epoch": 6.764816439042726, "grad_norm": 0.06920775771141052, "learning_rate": 8.368487724725178e-06, "loss": 0.4598, "num_input_tokens_seen": 65667392, "step": 53990 }, { "epoch": 6.7654429269515095, "grad_norm": 0.07797862589359283, "learning_rate": 8.36808367913661e-06, "loss": 0.4656, "num_input_tokens_seen": 65673920, "step": 53995 }, { "epoch": 6.766069414860294, "grad_norm": 0.07261768728494644, "learning_rate": 8.36767959327977e-06, "loss": 0.4625, "num_input_tokens_seen": 65679744, "step": 54000 }, { "epoch": 6.766695902769077, "grad_norm": 0.10304845124483109, "learning_rate": 8.367275467159487e-06, "loss": 0.4596, "num_input_tokens_seen": 65685792, "step": 54005 }, { "epoch": 6.76732239067786, "grad_norm": 0.049105454236269, "learning_rate": 8.366871300780598e-06, "loss": 0.4578, "num_input_tokens_seen": 65691936, "step": 54010 }, { "epoch": 6.767948878586643, "grad_norm": 0.10664455592632294, "learning_rate": 8.36646709414793e-06, "loss": 0.4543, "num_input_tokens_seen": 65697728, "step": 54015 }, { "epoch": 6.768575366495426, "grad_norm": 0.08187933266162872, "learning_rate": 8.366062847266319e-06, "loss": 0.4516, "num_input_tokens_seen": 65703296, "step": 54020 }, { "epoch": 6.76920185440421, "grad_norm": 0.08088461309671402, "learning_rate": 8.365658560140595e-06, "loss": 0.4598, "num_input_tokens_seen": 65709344, "step": 54025 }, { "epoch": 6.7698283423129935, "grad_norm": 0.07526534050703049, "learning_rate": 8.365254232775594e-06, "loss": 0.4639, "num_input_tokens_seen": 65716064, "step": 54030 }, { "epoch": 6.770454830221777, "grad_norm": 0.11885906010866165, "learning_rate": 8.36484986517615e-06, "loss": 0.4615, "num_input_tokens_seen": 65722208, "step": 54035 }, { "epoch": 6.77108131813056, "grad_norm": 0.08980073034763336, "learning_rate": 8.364445457347095e-06, "loss": 0.4671, "num_input_tokens_seen": 65728128, "step": 54040 }, { "epoch": 6.771707806039343, "grad_norm": 0.07606443017721176, "learning_rate": 8.364041009293266e-06, "loss": 0.464, "num_input_tokens_seen": 65734272, "step": 54045 }, { "epoch": 6.772334293948127, "grad_norm": 0.11701326072216034, "learning_rate": 8.363636521019501e-06, "loss": 0.4632, "num_input_tokens_seen": 65740288, "step": 54050 }, { "epoch": 6.77296078185691, "grad_norm": 0.07834506034851074, "learning_rate": 8.363231992530632e-06, "loss": 0.4583, "num_input_tokens_seen": 65746464, "step": 54055 }, { "epoch": 6.773587269765693, "grad_norm": 0.06744850426912308, "learning_rate": 8.362827423831497e-06, "loss": 0.4718, "num_input_tokens_seen": 65752512, "step": 54060 }, { "epoch": 6.7742137576744765, "grad_norm": 0.05747047811746597, "learning_rate": 8.36242281492693e-06, "loss": 0.4558, "num_input_tokens_seen": 65758720, "step": 54065 }, { "epoch": 6.774840245583261, "grad_norm": 0.0801444947719574, "learning_rate": 8.362018165821775e-06, "loss": 0.4592, "num_input_tokens_seen": 65764864, "step": 54070 }, { "epoch": 6.775466733492044, "grad_norm": 0.05365399643778801, "learning_rate": 8.361613476520865e-06, "loss": 0.4694, "num_input_tokens_seen": 65770848, "step": 54075 }, { "epoch": 6.776093221400827, "grad_norm": 0.09028711915016174, "learning_rate": 8.36120874702904e-06, "loss": 0.468, "num_input_tokens_seen": 65776800, "step": 54080 }, { "epoch": 6.77671970930961, "grad_norm": 0.11485181003808975, "learning_rate": 8.360803977351137e-06, "loss": 0.4649, "num_input_tokens_seen": 65782976, "step": 54085 }, { "epoch": 6.777346197218394, "grad_norm": 0.07278922200202942, "learning_rate": 8.360399167491998e-06, "loss": 0.4636, "num_input_tokens_seen": 65789216, "step": 54090 }, { "epoch": 6.777972685127177, "grad_norm": 0.07844676077365875, "learning_rate": 8.359994317456463e-06, "loss": 0.4646, "num_input_tokens_seen": 65795552, "step": 54095 }, { "epoch": 6.7785991730359605, "grad_norm": 0.07097916305065155, "learning_rate": 8.359589427249368e-06, "loss": 0.4555, "num_input_tokens_seen": 65800992, "step": 54100 }, { "epoch": 6.779225660944744, "grad_norm": 0.07390005886554718, "learning_rate": 8.359184496875559e-06, "loss": 0.4586, "num_input_tokens_seen": 65807168, "step": 54105 }, { "epoch": 6.779852148853527, "grad_norm": 0.09936600923538208, "learning_rate": 8.358779526339874e-06, "loss": 0.4638, "num_input_tokens_seen": 65813024, "step": 54110 }, { "epoch": 6.780478636762311, "grad_norm": 0.12998463213443756, "learning_rate": 8.358374515647156e-06, "loss": 0.4667, "num_input_tokens_seen": 65818784, "step": 54115 }, { "epoch": 6.781105124671094, "grad_norm": 0.11600024253129959, "learning_rate": 8.357969464802247e-06, "loss": 0.4646, "num_input_tokens_seen": 65824800, "step": 54120 }, { "epoch": 6.781731612579877, "grad_norm": 0.104517862200737, "learning_rate": 8.357564373809991e-06, "loss": 0.4635, "num_input_tokens_seen": 65830656, "step": 54125 }, { "epoch": 6.78235810048866, "grad_norm": 0.07625894248485565, "learning_rate": 8.357159242675229e-06, "loss": 0.4718, "num_input_tokens_seen": 65836448, "step": 54130 }, { "epoch": 6.782984588397444, "grad_norm": 0.12146017700433731, "learning_rate": 8.356754071402806e-06, "loss": 0.4623, "num_input_tokens_seen": 65842720, "step": 54135 }, { "epoch": 6.783611076306228, "grad_norm": 0.1179700717329979, "learning_rate": 8.356348859997565e-06, "loss": 0.4679, "num_input_tokens_seen": 65848160, "step": 54140 }, { "epoch": 6.784237564215011, "grad_norm": 0.11270174384117126, "learning_rate": 8.355943608464354e-06, "loss": 0.4578, "num_input_tokens_seen": 65854336, "step": 54145 }, { "epoch": 6.784864052123794, "grad_norm": 0.0662769302725792, "learning_rate": 8.355538316808014e-06, "loss": 0.4694, "num_input_tokens_seen": 65860384, "step": 54150 }, { "epoch": 6.785490540032577, "grad_norm": 0.10620471835136414, "learning_rate": 8.355132985033391e-06, "loss": 0.464, "num_input_tokens_seen": 65866528, "step": 54155 }, { "epoch": 6.78611702794136, "grad_norm": 0.07062752544879913, "learning_rate": 8.354727613145335e-06, "loss": 0.4621, "num_input_tokens_seen": 65872768, "step": 54160 }, { "epoch": 6.786743515850144, "grad_norm": 0.09039140492677689, "learning_rate": 8.354322201148687e-06, "loss": 0.4723, "num_input_tokens_seen": 65878432, "step": 54165 }, { "epoch": 6.7873700037589275, "grad_norm": 0.035454802215099335, "learning_rate": 8.353916749048298e-06, "loss": 0.4557, "num_input_tokens_seen": 65884576, "step": 54170 }, { "epoch": 6.787996491667711, "grad_norm": 0.07030686736106873, "learning_rate": 8.353511256849015e-06, "loss": 0.4746, "num_input_tokens_seen": 65890880, "step": 54175 }, { "epoch": 6.788622979576494, "grad_norm": 0.06232111528515816, "learning_rate": 8.353105724555686e-06, "loss": 0.4608, "num_input_tokens_seen": 65897216, "step": 54180 }, { "epoch": 6.789249467485278, "grad_norm": 0.06428301334381104, "learning_rate": 8.35270015217316e-06, "loss": 0.4665, "num_input_tokens_seen": 65903328, "step": 54185 }, { "epoch": 6.789875955394061, "grad_norm": 0.07060984522104263, "learning_rate": 8.35229453970628e-06, "loss": 0.4611, "num_input_tokens_seen": 65909248, "step": 54190 }, { "epoch": 6.790502443302844, "grad_norm": 0.0991569384932518, "learning_rate": 8.351888887159904e-06, "loss": 0.4633, "num_input_tokens_seen": 65915232, "step": 54195 }, { "epoch": 6.791128931211627, "grad_norm": 0.06702951341867447, "learning_rate": 8.351483194538878e-06, "loss": 0.4634, "num_input_tokens_seen": 65921280, "step": 54200 }, { "epoch": 6.7917554191204115, "grad_norm": 0.07578254491090775, "learning_rate": 8.351077461848053e-06, "loss": 0.4595, "num_input_tokens_seen": 65926816, "step": 54205 }, { "epoch": 6.792381907029195, "grad_norm": 0.09774641692638397, "learning_rate": 8.350671689092277e-06, "loss": 0.464, "num_input_tokens_seen": 65933280, "step": 54210 }, { "epoch": 6.793008394937978, "grad_norm": 0.03787829354405403, "learning_rate": 8.350265876276407e-06, "loss": 0.465, "num_input_tokens_seen": 65939456, "step": 54215 }, { "epoch": 6.793634882846761, "grad_norm": 0.0725928395986557, "learning_rate": 8.349860023405289e-06, "loss": 0.4604, "num_input_tokens_seen": 65945632, "step": 54220 }, { "epoch": 6.794261370755544, "grad_norm": 0.0880427435040474, "learning_rate": 8.34945413048378e-06, "loss": 0.4665, "num_input_tokens_seen": 65952096, "step": 54225 }, { "epoch": 6.794887858664328, "grad_norm": 0.06038450077176094, "learning_rate": 8.34904819751673e-06, "loss": 0.4621, "num_input_tokens_seen": 65958400, "step": 54230 }, { "epoch": 6.795514346573111, "grad_norm": 0.08917678892612457, "learning_rate": 8.348642224508993e-06, "loss": 0.4591, "num_input_tokens_seen": 65964736, "step": 54235 }, { "epoch": 6.7961408344818945, "grad_norm": 0.07427777349948883, "learning_rate": 8.348236211465424e-06, "loss": 0.4612, "num_input_tokens_seen": 65970944, "step": 54240 }, { "epoch": 6.796767322390678, "grad_norm": 0.09046458452939987, "learning_rate": 8.347830158390875e-06, "loss": 0.471, "num_input_tokens_seen": 65977120, "step": 54245 }, { "epoch": 6.797393810299461, "grad_norm": 0.059998661279678345, "learning_rate": 8.347424065290202e-06, "loss": 0.4586, "num_input_tokens_seen": 65983104, "step": 54250 }, { "epoch": 6.798020298208245, "grad_norm": 0.0680016279220581, "learning_rate": 8.34701793216826e-06, "loss": 0.466, "num_input_tokens_seen": 65989472, "step": 54255 }, { "epoch": 6.798646786117028, "grad_norm": 0.09171712398529053, "learning_rate": 8.346611759029903e-06, "loss": 0.4642, "num_input_tokens_seen": 65995744, "step": 54260 }, { "epoch": 6.799273274025811, "grad_norm": 0.06543762981891632, "learning_rate": 8.34620554587999e-06, "loss": 0.4649, "num_input_tokens_seen": 66001728, "step": 54265 }, { "epoch": 6.7998997619345944, "grad_norm": 0.07113242894411087, "learning_rate": 8.345799292723378e-06, "loss": 0.4568, "num_input_tokens_seen": 66007904, "step": 54270 }, { "epoch": 6.800526249843378, "grad_norm": 0.08032714575529099, "learning_rate": 8.34539299956492e-06, "loss": 0.4695, "num_input_tokens_seen": 66014208, "step": 54275 }, { "epoch": 6.801152737752162, "grad_norm": 0.09797848761081696, "learning_rate": 8.344986666409479e-06, "loss": 0.4614, "num_input_tokens_seen": 66020256, "step": 54280 }, { "epoch": 6.801779225660945, "grad_norm": 0.1288476437330246, "learning_rate": 8.344580293261909e-06, "loss": 0.4643, "num_input_tokens_seen": 66026272, "step": 54285 }, { "epoch": 6.802405713569728, "grad_norm": 0.06426238268613815, "learning_rate": 8.344173880127067e-06, "loss": 0.4642, "num_input_tokens_seen": 66032416, "step": 54290 }, { "epoch": 6.803032201478511, "grad_norm": 0.07840824127197266, "learning_rate": 8.343767427009816e-06, "loss": 0.4645, "num_input_tokens_seen": 66037984, "step": 54295 }, { "epoch": 6.803658689387294, "grad_norm": 0.06166023761034012, "learning_rate": 8.343360933915015e-06, "loss": 0.4682, "num_input_tokens_seen": 66044064, "step": 54300 }, { "epoch": 6.804285177296078, "grad_norm": 0.07565551996231079, "learning_rate": 8.342954400847521e-06, "loss": 0.4669, "num_input_tokens_seen": 66049984, "step": 54305 }, { "epoch": 6.804911665204862, "grad_norm": 0.040338706225156784, "learning_rate": 8.3425478278122e-06, "loss": 0.4613, "num_input_tokens_seen": 66056064, "step": 54310 }, { "epoch": 6.805538153113645, "grad_norm": 0.03693915531039238, "learning_rate": 8.342141214813906e-06, "loss": 0.4624, "num_input_tokens_seen": 66062272, "step": 54315 }, { "epoch": 6.806164641022428, "grad_norm": 0.06411118805408478, "learning_rate": 8.341734561857506e-06, "loss": 0.459, "num_input_tokens_seen": 66068512, "step": 54320 }, { "epoch": 6.806791128931212, "grad_norm": 0.08944059908390045, "learning_rate": 8.34132786894786e-06, "loss": 0.4629, "num_input_tokens_seen": 66074496, "step": 54325 }, { "epoch": 6.807417616839995, "grad_norm": 0.05649349093437195, "learning_rate": 8.340921136089828e-06, "loss": 0.4621, "num_input_tokens_seen": 66080128, "step": 54330 }, { "epoch": 6.808044104748778, "grad_norm": 0.059092942625284195, "learning_rate": 8.340514363288276e-06, "loss": 0.4606, "num_input_tokens_seen": 66085888, "step": 54335 }, { "epoch": 6.8086705926575615, "grad_norm": 0.04409800469875336, "learning_rate": 8.340107550548067e-06, "loss": 0.4628, "num_input_tokens_seen": 66091552, "step": 54340 }, { "epoch": 6.8092970805663455, "grad_norm": 0.11970039457082748, "learning_rate": 8.339700697874063e-06, "loss": 0.4701, "num_input_tokens_seen": 66097728, "step": 54345 }, { "epoch": 6.809923568475129, "grad_norm": 0.09464116394519806, "learning_rate": 8.339293805271132e-06, "loss": 0.4624, "num_input_tokens_seen": 66104096, "step": 54350 }, { "epoch": 6.810550056383912, "grad_norm": 0.03414905443787575, "learning_rate": 8.338886872744132e-06, "loss": 0.4592, "num_input_tokens_seen": 66109792, "step": 54355 }, { "epoch": 6.811176544292695, "grad_norm": 0.0582590326666832, "learning_rate": 8.338479900297934e-06, "loss": 0.4613, "num_input_tokens_seen": 66115840, "step": 54360 }, { "epoch": 6.811803032201478, "grad_norm": 0.109134741127491, "learning_rate": 8.338072887937402e-06, "loss": 0.4647, "num_input_tokens_seen": 66121952, "step": 54365 }, { "epoch": 6.812429520110262, "grad_norm": 0.0994892492890358, "learning_rate": 8.337665835667402e-06, "loss": 0.4607, "num_input_tokens_seen": 66128160, "step": 54370 }, { "epoch": 6.813056008019045, "grad_norm": 0.07470527291297913, "learning_rate": 8.337258743492801e-06, "loss": 0.463, "num_input_tokens_seen": 66133920, "step": 54375 }, { "epoch": 6.813682495927829, "grad_norm": 0.06853818893432617, "learning_rate": 8.336851611418465e-06, "loss": 0.4661, "num_input_tokens_seen": 66140064, "step": 54380 }, { "epoch": 6.814308983836612, "grad_norm": 0.09154266864061356, "learning_rate": 8.336444439449266e-06, "loss": 0.4612, "num_input_tokens_seen": 66146240, "step": 54385 }, { "epoch": 6.814935471745395, "grad_norm": 0.056472547352313995, "learning_rate": 8.336037227590064e-06, "loss": 0.4648, "num_input_tokens_seen": 66152544, "step": 54390 }, { "epoch": 6.815561959654179, "grad_norm": 0.07840267568826675, "learning_rate": 8.335629975845733e-06, "loss": 0.4664, "num_input_tokens_seen": 66158752, "step": 54395 }, { "epoch": 6.816188447562962, "grad_norm": 0.06506236642599106, "learning_rate": 8.335222684221144e-06, "loss": 0.4618, "num_input_tokens_seen": 66164160, "step": 54400 }, { "epoch": 6.816814935471745, "grad_norm": 0.06788033992052078, "learning_rate": 8.334815352721161e-06, "loss": 0.462, "num_input_tokens_seen": 66170208, "step": 54405 }, { "epoch": 6.8174414233805285, "grad_norm": 0.07208099216222763, "learning_rate": 8.334407981350658e-06, "loss": 0.4611, "num_input_tokens_seen": 66176192, "step": 54410 }, { "epoch": 6.818067911289312, "grad_norm": 0.0613497830927372, "learning_rate": 8.334000570114504e-06, "loss": 0.459, "num_input_tokens_seen": 66182528, "step": 54415 }, { "epoch": 6.818694399198096, "grad_norm": 0.05990081652998924, "learning_rate": 8.33359311901757e-06, "loss": 0.4624, "num_input_tokens_seen": 66188480, "step": 54420 }, { "epoch": 6.819320887106879, "grad_norm": 0.05888631194829941, "learning_rate": 8.333185628064728e-06, "loss": 0.4636, "num_input_tokens_seen": 66194720, "step": 54425 }, { "epoch": 6.819947375015662, "grad_norm": 0.057009268552064896, "learning_rate": 8.332778097260847e-06, "loss": 0.4633, "num_input_tokens_seen": 66200608, "step": 54430 }, { "epoch": 6.820573862924445, "grad_norm": 0.07025443017482758, "learning_rate": 8.332370526610805e-06, "loss": 0.4623, "num_input_tokens_seen": 66207008, "step": 54435 }, { "epoch": 6.821200350833229, "grad_norm": 0.062036771327257156, "learning_rate": 8.33196291611947e-06, "loss": 0.4612, "num_input_tokens_seen": 66213408, "step": 54440 }, { "epoch": 6.8218268387420125, "grad_norm": 0.07020905613899231, "learning_rate": 8.331555265791717e-06, "loss": 0.4623, "num_input_tokens_seen": 66218976, "step": 54445 }, { "epoch": 6.822453326650796, "grad_norm": 0.06873417645692825, "learning_rate": 8.331147575632421e-06, "loss": 0.4664, "num_input_tokens_seen": 66225344, "step": 54450 }, { "epoch": 6.823079814559579, "grad_norm": 0.10684792697429657, "learning_rate": 8.330739845646454e-06, "loss": 0.4628, "num_input_tokens_seen": 66231232, "step": 54455 }, { "epoch": 6.823706302468363, "grad_norm": 0.06446705013513565, "learning_rate": 8.330332075838691e-06, "loss": 0.4673, "num_input_tokens_seen": 66237312, "step": 54460 }, { "epoch": 6.824332790377146, "grad_norm": 0.06525520235300064, "learning_rate": 8.32992426621401e-06, "loss": 0.4558, "num_input_tokens_seen": 66243296, "step": 54465 }, { "epoch": 6.824959278285929, "grad_norm": 0.09200218319892883, "learning_rate": 8.329516416777284e-06, "loss": 0.4577, "num_input_tokens_seen": 66249408, "step": 54470 }, { "epoch": 6.825585766194712, "grad_norm": 0.06641431897878647, "learning_rate": 8.32910852753339e-06, "loss": 0.4616, "num_input_tokens_seen": 66255424, "step": 54475 }, { "epoch": 6.8262122541034955, "grad_norm": 0.042129743844270706, "learning_rate": 8.328700598487203e-06, "loss": 0.4642, "num_input_tokens_seen": 66261312, "step": 54480 }, { "epoch": 6.82683874201228, "grad_norm": 0.07095666229724884, "learning_rate": 8.328292629643604e-06, "loss": 0.4582, "num_input_tokens_seen": 66267552, "step": 54485 }, { "epoch": 6.827465229921063, "grad_norm": 0.06501209735870361, "learning_rate": 8.327884621007467e-06, "loss": 0.4613, "num_input_tokens_seen": 66273504, "step": 54490 }, { "epoch": 6.828091717829846, "grad_norm": 0.06674061715602875, "learning_rate": 8.327476572583673e-06, "loss": 0.4595, "num_input_tokens_seen": 66279456, "step": 54495 }, { "epoch": 6.828718205738629, "grad_norm": 0.06265565007925034, "learning_rate": 8.327068484377096e-06, "loss": 0.4551, "num_input_tokens_seen": 66285600, "step": 54500 }, { "epoch": 6.829344693647412, "grad_norm": 0.08824716508388519, "learning_rate": 8.326660356392621e-06, "loss": 0.4571, "num_input_tokens_seen": 66291776, "step": 54505 }, { "epoch": 6.829971181556196, "grad_norm": 0.032591819763183594, "learning_rate": 8.326252188635124e-06, "loss": 0.4603, "num_input_tokens_seen": 66297952, "step": 54510 }, { "epoch": 6.8305976694649795, "grad_norm": 0.08391477912664413, "learning_rate": 8.325843981109484e-06, "loss": 0.4659, "num_input_tokens_seen": 66304256, "step": 54515 }, { "epoch": 6.831224157373763, "grad_norm": 0.037959445267915726, "learning_rate": 8.325435733820584e-06, "loss": 0.4625, "num_input_tokens_seen": 66309504, "step": 54520 }, { "epoch": 6.831850645282546, "grad_norm": 0.07744942605495453, "learning_rate": 8.325027446773304e-06, "loss": 0.4631, "num_input_tokens_seen": 66315776, "step": 54525 }, { "epoch": 6.832477133191329, "grad_norm": 0.03504635766148567, "learning_rate": 8.324619119972525e-06, "loss": 0.4609, "num_input_tokens_seen": 66321792, "step": 54530 }, { "epoch": 6.833103621100113, "grad_norm": 0.11721857637166977, "learning_rate": 8.32421075342313e-06, "loss": 0.4681, "num_input_tokens_seen": 66328000, "step": 54535 }, { "epoch": 6.833730109008896, "grad_norm": 0.10342460125684738, "learning_rate": 8.32380234713e-06, "loss": 0.4615, "num_input_tokens_seen": 66334240, "step": 54540 }, { "epoch": 6.834356596917679, "grad_norm": 0.10976826399564743, "learning_rate": 8.323393901098018e-06, "loss": 0.4604, "num_input_tokens_seen": 66340096, "step": 54545 }, { "epoch": 6.8349830848264626, "grad_norm": 0.12352544814348221, "learning_rate": 8.322985415332067e-06, "loss": 0.4628, "num_input_tokens_seen": 66346272, "step": 54550 }, { "epoch": 6.835609572735246, "grad_norm": 0.08246675878763199, "learning_rate": 8.322576889837032e-06, "loss": 0.4616, "num_input_tokens_seen": 66352064, "step": 54555 }, { "epoch": 6.83623606064403, "grad_norm": 0.11208061128854752, "learning_rate": 8.322168324617798e-06, "loss": 0.4619, "num_input_tokens_seen": 66358208, "step": 54560 }, { "epoch": 6.836862548552813, "grad_norm": 0.05135743319988251, "learning_rate": 8.32175971967925e-06, "loss": 0.466, "num_input_tokens_seen": 66364320, "step": 54565 }, { "epoch": 6.837489036461596, "grad_norm": 0.11143957078456879, "learning_rate": 8.32135107502627e-06, "loss": 0.4565, "num_input_tokens_seen": 66370464, "step": 54570 }, { "epoch": 6.838115524370379, "grad_norm": 0.07940308749675751, "learning_rate": 8.320942390663744e-06, "loss": 0.459, "num_input_tokens_seen": 66376544, "step": 54575 }, { "epoch": 6.838742012279163, "grad_norm": 0.07639069855213165, "learning_rate": 8.320533666596562e-06, "loss": 0.4651, "num_input_tokens_seen": 66382880, "step": 54580 }, { "epoch": 6.8393685001879465, "grad_norm": 0.07829748839139938, "learning_rate": 8.320124902829607e-06, "loss": 0.4651, "num_input_tokens_seen": 66389120, "step": 54585 }, { "epoch": 6.83999498809673, "grad_norm": 0.10861848294734955, "learning_rate": 8.319716099367768e-06, "loss": 0.4643, "num_input_tokens_seen": 66395040, "step": 54590 }, { "epoch": 6.840621476005513, "grad_norm": 0.11177780479192734, "learning_rate": 8.319307256215931e-06, "loss": 0.4635, "num_input_tokens_seen": 66401184, "step": 54595 }, { "epoch": 6.841247963914297, "grad_norm": 0.08047683537006378, "learning_rate": 8.318898373378985e-06, "loss": 0.4677, "num_input_tokens_seen": 66407296, "step": 54600 }, { "epoch": 6.84187445182308, "grad_norm": 0.08816741406917572, "learning_rate": 8.31848945086182e-06, "loss": 0.4671, "num_input_tokens_seen": 66413472, "step": 54605 }, { "epoch": 6.842500939731863, "grad_norm": 0.06565342843532562, "learning_rate": 8.318080488669322e-06, "loss": 0.4618, "num_input_tokens_seen": 66418912, "step": 54610 }, { "epoch": 6.843127427640646, "grad_norm": 0.14505092799663544, "learning_rate": 8.317671486806383e-06, "loss": 0.4549, "num_input_tokens_seen": 66425216, "step": 54615 }, { "epoch": 6.84375391554943, "grad_norm": 0.10311079770326614, "learning_rate": 8.317262445277892e-06, "loss": 0.4615, "num_input_tokens_seen": 66431328, "step": 54620 }, { "epoch": 6.844380403458214, "grad_norm": 0.11774961650371552, "learning_rate": 8.31685336408874e-06, "loss": 0.4631, "num_input_tokens_seen": 66437696, "step": 54625 }, { "epoch": 6.845006891366997, "grad_norm": 0.06504862755537033, "learning_rate": 8.316444243243816e-06, "loss": 0.4623, "num_input_tokens_seen": 66443968, "step": 54630 }, { "epoch": 6.84563337927578, "grad_norm": 0.06269776821136475, "learning_rate": 8.316035082748013e-06, "loss": 0.4646, "num_input_tokens_seen": 66449888, "step": 54635 }, { "epoch": 6.846259867184563, "grad_norm": 0.08731027692556381, "learning_rate": 8.315625882606224e-06, "loss": 0.4607, "num_input_tokens_seen": 66455808, "step": 54640 }, { "epoch": 6.846886355093346, "grad_norm": 0.11760853976011276, "learning_rate": 8.315216642823338e-06, "loss": 0.4587, "num_input_tokens_seen": 66462016, "step": 54645 }, { "epoch": 6.84751284300213, "grad_norm": 0.08542803674936295, "learning_rate": 8.31480736340425e-06, "loss": 0.4622, "num_input_tokens_seen": 66468000, "step": 54650 }, { "epoch": 6.8481393309109135, "grad_norm": 0.08135081082582474, "learning_rate": 8.314398044353854e-06, "loss": 0.4592, "num_input_tokens_seen": 66474048, "step": 54655 }, { "epoch": 6.848765818819697, "grad_norm": 0.051575854420661926, "learning_rate": 8.313988685677044e-06, "loss": 0.4583, "num_input_tokens_seen": 66480384, "step": 54660 }, { "epoch": 6.84939230672848, "grad_norm": 0.07013921439647675, "learning_rate": 8.31357928737871e-06, "loss": 0.4626, "num_input_tokens_seen": 66486688, "step": 54665 }, { "epoch": 6.850018794637263, "grad_norm": 0.0653577595949173, "learning_rate": 8.313169849463753e-06, "loss": 0.4656, "num_input_tokens_seen": 66492928, "step": 54670 }, { "epoch": 6.850645282546047, "grad_norm": 0.07000778615474701, "learning_rate": 8.312760371937063e-06, "loss": 0.4588, "num_input_tokens_seen": 66498848, "step": 54675 }, { "epoch": 6.85127177045483, "grad_norm": 0.07409632205963135, "learning_rate": 8.312350854803536e-06, "loss": 0.4615, "num_input_tokens_seen": 66504992, "step": 54680 }, { "epoch": 6.851898258363613, "grad_norm": 0.12248416244983673, "learning_rate": 8.311941298068073e-06, "loss": 0.4715, "num_input_tokens_seen": 66511328, "step": 54685 }, { "epoch": 6.852524746272397, "grad_norm": 0.0723905861377716, "learning_rate": 8.311531701735566e-06, "loss": 0.4616, "num_input_tokens_seen": 66517568, "step": 54690 }, { "epoch": 6.853151234181181, "grad_norm": 0.06783352792263031, "learning_rate": 8.311122065810912e-06, "loss": 0.4621, "num_input_tokens_seen": 66523616, "step": 54695 }, { "epoch": 6.853777722089964, "grad_norm": 0.06840874999761581, "learning_rate": 8.310712390299012e-06, "loss": 0.4671, "num_input_tokens_seen": 66529952, "step": 54700 }, { "epoch": 6.854404209998747, "grad_norm": 0.06584813445806503, "learning_rate": 8.31030267520476e-06, "loss": 0.4586, "num_input_tokens_seen": 66536032, "step": 54705 }, { "epoch": 6.85503069790753, "grad_norm": 0.06267052888870239, "learning_rate": 8.309892920533059e-06, "loss": 0.4565, "num_input_tokens_seen": 66542240, "step": 54710 }, { "epoch": 6.855657185816314, "grad_norm": 0.09330606460571289, "learning_rate": 8.309483126288802e-06, "loss": 0.458, "num_input_tokens_seen": 66548096, "step": 54715 }, { "epoch": 6.856283673725097, "grad_norm": 0.07507692277431488, "learning_rate": 8.309073292476893e-06, "loss": 0.4629, "num_input_tokens_seen": 66554336, "step": 54720 }, { "epoch": 6.856910161633881, "grad_norm": 0.10149773955345154, "learning_rate": 8.30866341910223e-06, "loss": 0.4648, "num_input_tokens_seen": 66560704, "step": 54725 }, { "epoch": 6.857536649542664, "grad_norm": 0.06892621517181396, "learning_rate": 8.308253506169716e-06, "loss": 0.4645, "num_input_tokens_seen": 66566368, "step": 54730 }, { "epoch": 6.858163137451447, "grad_norm": 0.06329582631587982, "learning_rate": 8.307843553684248e-06, "loss": 0.457, "num_input_tokens_seen": 66572448, "step": 54735 }, { "epoch": 6.858789625360231, "grad_norm": 0.1152680292725563, "learning_rate": 8.30743356165073e-06, "loss": 0.4603, "num_input_tokens_seen": 66578752, "step": 54740 }, { "epoch": 6.859416113269014, "grad_norm": 0.0990641713142395, "learning_rate": 8.307023530074064e-06, "loss": 0.4554, "num_input_tokens_seen": 66584736, "step": 54745 }, { "epoch": 6.860042601177797, "grad_norm": 0.06874176114797592, "learning_rate": 8.30661345895915e-06, "loss": 0.4563, "num_input_tokens_seen": 66590720, "step": 54750 }, { "epoch": 6.8606690890865805, "grad_norm": 0.11999805271625519, "learning_rate": 8.306203348310893e-06, "loss": 0.4603, "num_input_tokens_seen": 66596992, "step": 54755 }, { "epoch": 6.861295576995364, "grad_norm": 0.0657801702618599, "learning_rate": 8.305793198134193e-06, "loss": 0.4657, "num_input_tokens_seen": 66603232, "step": 54760 }, { "epoch": 6.861922064904148, "grad_norm": 0.07626914978027344, "learning_rate": 8.305383008433958e-06, "loss": 0.4566, "num_input_tokens_seen": 66609536, "step": 54765 }, { "epoch": 6.862548552812931, "grad_norm": 0.07429813593626022, "learning_rate": 8.304972779215091e-06, "loss": 0.462, "num_input_tokens_seen": 66615840, "step": 54770 }, { "epoch": 6.863175040721714, "grad_norm": 0.05418010056018829, "learning_rate": 8.304562510482494e-06, "loss": 0.4595, "num_input_tokens_seen": 66621376, "step": 54775 }, { "epoch": 6.863801528630497, "grad_norm": 0.07439695298671722, "learning_rate": 8.304152202241074e-06, "loss": 0.4633, "num_input_tokens_seen": 66627552, "step": 54780 }, { "epoch": 6.86442801653928, "grad_norm": 0.09284713119268417, "learning_rate": 8.303741854495736e-06, "loss": 0.4643, "num_input_tokens_seen": 66633568, "step": 54785 }, { "epoch": 6.865054504448064, "grad_norm": 0.08772987127304077, "learning_rate": 8.303331467251387e-06, "loss": 0.4605, "num_input_tokens_seen": 66639680, "step": 54790 }, { "epoch": 6.865680992356848, "grad_norm": 0.0868995189666748, "learning_rate": 8.302921040512934e-06, "loss": 0.459, "num_input_tokens_seen": 66645920, "step": 54795 }, { "epoch": 6.866307480265631, "grad_norm": 0.08134762942790985, "learning_rate": 8.302510574285283e-06, "loss": 0.4535, "num_input_tokens_seen": 66652128, "step": 54800 }, { "epoch": 6.866933968174414, "grad_norm": 0.11742853373289108, "learning_rate": 8.302100068573343e-06, "loss": 0.4543, "num_input_tokens_seen": 66658016, "step": 54805 }, { "epoch": 6.867560456083197, "grad_norm": 0.04861297085881233, "learning_rate": 8.301689523382018e-06, "loss": 0.4574, "num_input_tokens_seen": 66664064, "step": 54810 }, { "epoch": 6.868186943991981, "grad_norm": 0.11834727972745895, "learning_rate": 8.301278938716218e-06, "loss": 0.4763, "num_input_tokens_seen": 66669920, "step": 54815 }, { "epoch": 6.868813431900764, "grad_norm": 0.04894697293639183, "learning_rate": 8.300868314580855e-06, "loss": 0.4709, "num_input_tokens_seen": 66676096, "step": 54820 }, { "epoch": 6.8694399198095475, "grad_norm": 0.07373597472906113, "learning_rate": 8.300457650980833e-06, "loss": 0.4672, "num_input_tokens_seen": 66682112, "step": 54825 }, { "epoch": 6.870066407718331, "grad_norm": 0.0988100916147232, "learning_rate": 8.300046947921067e-06, "loss": 0.47, "num_input_tokens_seen": 66688320, "step": 54830 }, { "epoch": 6.870692895627115, "grad_norm": 0.07574203610420227, "learning_rate": 8.299636205406465e-06, "loss": 0.4609, "num_input_tokens_seen": 66694528, "step": 54835 }, { "epoch": 6.871319383535898, "grad_norm": 0.10320372879505157, "learning_rate": 8.299225423441938e-06, "loss": 0.4635, "num_input_tokens_seen": 66700352, "step": 54840 }, { "epoch": 6.871945871444681, "grad_norm": 0.06740271300077438, "learning_rate": 8.298814602032398e-06, "loss": 0.4623, "num_input_tokens_seen": 66706400, "step": 54845 }, { "epoch": 6.872572359353464, "grad_norm": 0.07283686101436615, "learning_rate": 8.298403741182755e-06, "loss": 0.4624, "num_input_tokens_seen": 66712480, "step": 54850 }, { "epoch": 6.873198847262248, "grad_norm": 0.08162528276443481, "learning_rate": 8.297992840897922e-06, "loss": 0.4618, "num_input_tokens_seen": 66718848, "step": 54855 }, { "epoch": 6.8738253351710314, "grad_norm": 0.10100103914737701, "learning_rate": 8.297581901182813e-06, "loss": 0.4682, "num_input_tokens_seen": 66724960, "step": 54860 }, { "epoch": 6.874451823079815, "grad_norm": 0.0598079077899456, "learning_rate": 8.29717092204234e-06, "loss": 0.4599, "num_input_tokens_seen": 66731168, "step": 54865 }, { "epoch": 6.875078310988598, "grad_norm": 0.07730326056480408, "learning_rate": 8.296759903481414e-06, "loss": 0.4681, "num_input_tokens_seen": 66737408, "step": 54870 }, { "epoch": 6.875704798897381, "grad_norm": 0.06745985895395279, "learning_rate": 8.296348845504954e-06, "loss": 0.4675, "num_input_tokens_seen": 66743360, "step": 54875 }, { "epoch": 6.876331286806165, "grad_norm": 0.0703464150428772, "learning_rate": 8.29593774811787e-06, "loss": 0.459, "num_input_tokens_seen": 66749376, "step": 54880 }, { "epoch": 6.876957774714948, "grad_norm": 0.09810742735862732, "learning_rate": 8.29552661132508e-06, "loss": 0.4649, "num_input_tokens_seen": 66755488, "step": 54885 }, { "epoch": 6.877584262623731, "grad_norm": 0.07264743745326996, "learning_rate": 8.2951154351315e-06, "loss": 0.4618, "num_input_tokens_seen": 66761792, "step": 54890 }, { "epoch": 6.8782107505325145, "grad_norm": 0.06575863063335419, "learning_rate": 8.294704219542044e-06, "loss": 0.4586, "num_input_tokens_seen": 66767456, "step": 54895 }, { "epoch": 6.878837238441298, "grad_norm": 0.06683853268623352, "learning_rate": 8.29429296456163e-06, "loss": 0.4707, "num_input_tokens_seen": 66773248, "step": 54900 }, { "epoch": 6.879463726350082, "grad_norm": 0.11275602877140045, "learning_rate": 8.293881670195172e-06, "loss": 0.4576, "num_input_tokens_seen": 66779680, "step": 54905 }, { "epoch": 6.880090214258865, "grad_norm": 0.03968796506524086, "learning_rate": 8.29347033644759e-06, "loss": 0.4554, "num_input_tokens_seen": 66785504, "step": 54910 }, { "epoch": 6.880716702167648, "grad_norm": 0.04259456694126129, "learning_rate": 8.2930589633238e-06, "loss": 0.4668, "num_input_tokens_seen": 66790912, "step": 54915 }, { "epoch": 6.881343190076431, "grad_norm": 0.07723440229892731, "learning_rate": 8.292647550828723e-06, "loss": 0.4632, "num_input_tokens_seen": 66797184, "step": 54920 }, { "epoch": 6.881969677985214, "grad_norm": 0.06439261138439178, "learning_rate": 8.292236098967274e-06, "loss": 0.4597, "num_input_tokens_seen": 66803296, "step": 54925 }, { "epoch": 6.8825961658939985, "grad_norm": 0.07809443771839142, "learning_rate": 8.291824607744378e-06, "loss": 0.4613, "num_input_tokens_seen": 66809440, "step": 54930 }, { "epoch": 6.883222653802782, "grad_norm": 0.10354841500520706, "learning_rate": 8.29141307716495e-06, "loss": 0.4708, "num_input_tokens_seen": 66815808, "step": 54935 }, { "epoch": 6.883849141711565, "grad_norm": 0.11813580244779587, "learning_rate": 8.291001507233909e-06, "loss": 0.4685, "num_input_tokens_seen": 66821792, "step": 54940 }, { "epoch": 6.884475629620348, "grad_norm": 0.06466635316610336, "learning_rate": 8.29058989795618e-06, "loss": 0.4624, "num_input_tokens_seen": 66827520, "step": 54945 }, { "epoch": 6.885102117529132, "grad_norm": 0.06592027842998505, "learning_rate": 8.290178249336683e-06, "loss": 0.4697, "num_input_tokens_seen": 66833792, "step": 54950 }, { "epoch": 6.885728605437915, "grad_norm": 0.08453086018562317, "learning_rate": 8.289766561380337e-06, "loss": 0.4639, "num_input_tokens_seen": 66839808, "step": 54955 }, { "epoch": 6.886355093346698, "grad_norm": 0.046785637736320496, "learning_rate": 8.289354834092067e-06, "loss": 0.4664, "num_input_tokens_seen": 66845824, "step": 54960 }, { "epoch": 6.8869815812554815, "grad_norm": 0.09515173733234406, "learning_rate": 8.288943067476795e-06, "loss": 0.4653, "num_input_tokens_seen": 66851872, "step": 54965 }, { "epoch": 6.887608069164266, "grad_norm": 0.1006489247083664, "learning_rate": 8.288531261539442e-06, "loss": 0.4655, "num_input_tokens_seen": 66858016, "step": 54970 }, { "epoch": 6.888234557073049, "grad_norm": 0.06150807440280914, "learning_rate": 8.288119416284933e-06, "loss": 0.4617, "num_input_tokens_seen": 66864032, "step": 54975 }, { "epoch": 6.888861044981832, "grad_norm": 0.06731210649013519, "learning_rate": 8.287707531718191e-06, "loss": 0.4586, "num_input_tokens_seen": 66870048, "step": 54980 }, { "epoch": 6.889487532890615, "grad_norm": 0.0719737783074379, "learning_rate": 8.287295607844144e-06, "loss": 0.4675, "num_input_tokens_seen": 66875872, "step": 54985 }, { "epoch": 6.890114020799398, "grad_norm": 0.11173848062753677, "learning_rate": 8.286883644667713e-06, "loss": 0.4632, "num_input_tokens_seen": 66881888, "step": 54990 }, { "epoch": 6.890740508708182, "grad_norm": 0.09521916508674622, "learning_rate": 8.286471642193824e-06, "loss": 0.4571, "num_input_tokens_seen": 66888096, "step": 54995 }, { "epoch": 6.8913669966169655, "grad_norm": 0.0720604807138443, "learning_rate": 8.286059600427405e-06, "loss": 0.4621, "num_input_tokens_seen": 66894144, "step": 55000 }, { "epoch": 6.891993484525749, "grad_norm": 0.04360562190413475, "learning_rate": 8.285647519373379e-06, "loss": 0.4647, "num_input_tokens_seen": 66900288, "step": 55005 }, { "epoch": 6.892619972434532, "grad_norm": 0.04751531034708023, "learning_rate": 8.285235399036675e-06, "loss": 0.4659, "num_input_tokens_seen": 66906144, "step": 55010 }, { "epoch": 6.893246460343315, "grad_norm": 0.06722033023834229, "learning_rate": 8.28482323942222e-06, "loss": 0.4627, "num_input_tokens_seen": 66912192, "step": 55015 }, { "epoch": 6.893872948252099, "grad_norm": 0.09595052897930145, "learning_rate": 8.284411040534941e-06, "loss": 0.457, "num_input_tokens_seen": 66918368, "step": 55020 }, { "epoch": 6.894499436160882, "grad_norm": 0.07177333533763885, "learning_rate": 8.283998802379768e-06, "loss": 0.4712, "num_input_tokens_seen": 66924640, "step": 55025 }, { "epoch": 6.895125924069665, "grad_norm": 0.11147107183933258, "learning_rate": 8.283586524961627e-06, "loss": 0.4628, "num_input_tokens_seen": 66930592, "step": 55030 }, { "epoch": 6.895752411978449, "grad_norm": 0.07944390177726746, "learning_rate": 8.283174208285447e-06, "loss": 0.4551, "num_input_tokens_seen": 66936896, "step": 55035 }, { "epoch": 6.896378899887232, "grad_norm": 0.0767074003815651, "learning_rate": 8.282761852356161e-06, "loss": 0.4551, "num_input_tokens_seen": 66942912, "step": 55040 }, { "epoch": 6.897005387796016, "grad_norm": 0.0706242173910141, "learning_rate": 8.282349457178697e-06, "loss": 0.4587, "num_input_tokens_seen": 66948512, "step": 55045 }, { "epoch": 6.897631875704799, "grad_norm": 0.07303910702466965, "learning_rate": 8.281937022757984e-06, "loss": 0.4577, "num_input_tokens_seen": 66954496, "step": 55050 }, { "epoch": 6.898258363613582, "grad_norm": 0.0690394937992096, "learning_rate": 8.281524549098957e-06, "loss": 0.46, "num_input_tokens_seen": 66960640, "step": 55055 }, { "epoch": 6.898884851522365, "grad_norm": 0.11691448837518692, "learning_rate": 8.281112036206544e-06, "loss": 0.4652, "num_input_tokens_seen": 66966432, "step": 55060 }, { "epoch": 6.899511339431149, "grad_norm": 0.04210522770881653, "learning_rate": 8.280699484085678e-06, "loss": 0.4642, "num_input_tokens_seen": 66972448, "step": 55065 }, { "epoch": 6.9001378273399325, "grad_norm": 0.07791142165660858, "learning_rate": 8.280286892741292e-06, "loss": 0.4599, "num_input_tokens_seen": 66978368, "step": 55070 }, { "epoch": 6.900764315248716, "grad_norm": 0.10209786146879196, "learning_rate": 8.279874262178316e-06, "loss": 0.4701, "num_input_tokens_seen": 66984608, "step": 55075 }, { "epoch": 6.901390803157499, "grad_norm": 0.1005144938826561, "learning_rate": 8.279461592401688e-06, "loss": 0.4685, "num_input_tokens_seen": 66990688, "step": 55080 }, { "epoch": 6.902017291066282, "grad_norm": 0.12617981433868408, "learning_rate": 8.279048883416339e-06, "loss": 0.4742, "num_input_tokens_seen": 66996768, "step": 55085 }, { "epoch": 6.902643778975066, "grad_norm": 0.06385951489210129, "learning_rate": 8.278636135227203e-06, "loss": 0.4528, "num_input_tokens_seen": 67002016, "step": 55090 }, { "epoch": 6.903270266883849, "grad_norm": 0.04212065041065216, "learning_rate": 8.278223347839216e-06, "loss": 0.4549, "num_input_tokens_seen": 67008224, "step": 55095 }, { "epoch": 6.903896754792632, "grad_norm": 0.07550490647554398, "learning_rate": 8.277810521257311e-06, "loss": 0.4682, "num_input_tokens_seen": 67014112, "step": 55100 }, { "epoch": 6.904523242701416, "grad_norm": 0.07810482382774353, "learning_rate": 8.277397655486428e-06, "loss": 0.4682, "num_input_tokens_seen": 67020384, "step": 55105 }, { "epoch": 6.9051497306102, "grad_norm": 0.09150247275829315, "learning_rate": 8.2769847505315e-06, "loss": 0.4602, "num_input_tokens_seen": 67026048, "step": 55110 }, { "epoch": 6.905776218518983, "grad_norm": 0.0661076158285141, "learning_rate": 8.276571806397462e-06, "loss": 0.4618, "num_input_tokens_seen": 67032192, "step": 55115 }, { "epoch": 6.906402706427766, "grad_norm": 0.0716528445482254, "learning_rate": 8.276158823089256e-06, "loss": 0.4624, "num_input_tokens_seen": 67038432, "step": 55120 }, { "epoch": 6.907029194336549, "grad_norm": 0.0715629830956459, "learning_rate": 8.275745800611816e-06, "loss": 0.4577, "num_input_tokens_seen": 67044576, "step": 55125 }, { "epoch": 6.907655682245332, "grad_norm": 0.05955120176076889, "learning_rate": 8.27533273897008e-06, "loss": 0.4566, "num_input_tokens_seen": 67050624, "step": 55130 }, { "epoch": 6.908282170154116, "grad_norm": 0.06968778371810913, "learning_rate": 8.274919638168988e-06, "loss": 0.4615, "num_input_tokens_seen": 67056480, "step": 55135 }, { "epoch": 6.9089086580628996, "grad_norm": 0.09101585298776627, "learning_rate": 8.27450649821348e-06, "loss": 0.4605, "num_input_tokens_seen": 67063008, "step": 55140 }, { "epoch": 6.909535145971683, "grad_norm": 0.1148325502872467, "learning_rate": 8.27409331910849e-06, "loss": 0.4583, "num_input_tokens_seen": 67068864, "step": 55145 }, { "epoch": 6.910161633880466, "grad_norm": 0.10439420491456985, "learning_rate": 8.273680100858965e-06, "loss": 0.4604, "num_input_tokens_seen": 67075136, "step": 55150 }, { "epoch": 6.910788121789249, "grad_norm": 0.040197309106588364, "learning_rate": 8.27326684346984e-06, "loss": 0.4584, "num_input_tokens_seen": 67080736, "step": 55155 }, { "epoch": 6.911414609698033, "grad_norm": 0.061460379511117935, "learning_rate": 8.272853546946059e-06, "loss": 0.4596, "num_input_tokens_seen": 67086912, "step": 55160 }, { "epoch": 6.912041097606816, "grad_norm": 0.10168047994375229, "learning_rate": 8.27244021129256e-06, "loss": 0.4686, "num_input_tokens_seen": 67093184, "step": 55165 }, { "epoch": 6.9126675855155995, "grad_norm": 0.07053277641534805, "learning_rate": 8.27202683651429e-06, "loss": 0.4559, "num_input_tokens_seen": 67099328, "step": 55170 }, { "epoch": 6.913294073424383, "grad_norm": 0.06595161557197571, "learning_rate": 8.271613422616185e-06, "loss": 0.4646, "num_input_tokens_seen": 67105280, "step": 55175 }, { "epoch": 6.913920561333166, "grad_norm": 0.08190332353115082, "learning_rate": 8.271199969603194e-06, "loss": 0.463, "num_input_tokens_seen": 67111264, "step": 55180 }, { "epoch": 6.91454704924195, "grad_norm": 0.0892886221408844, "learning_rate": 8.270786477480255e-06, "loss": 0.4604, "num_input_tokens_seen": 67117536, "step": 55185 }, { "epoch": 6.915173537150733, "grad_norm": 0.0738099217414856, "learning_rate": 8.270372946252316e-06, "loss": 0.4525, "num_input_tokens_seen": 67123040, "step": 55190 }, { "epoch": 6.915800025059516, "grad_norm": 0.0679800808429718, "learning_rate": 8.269959375924316e-06, "loss": 0.464, "num_input_tokens_seen": 67129120, "step": 55195 }, { "epoch": 6.916426512968299, "grad_norm": 0.07040635496377945, "learning_rate": 8.269545766501206e-06, "loss": 0.4579, "num_input_tokens_seen": 67135200, "step": 55200 }, { "epoch": 6.917053000877083, "grad_norm": 0.06893448531627655, "learning_rate": 8.269132117987925e-06, "loss": 0.465, "num_input_tokens_seen": 67141088, "step": 55205 }, { "epoch": 6.917679488785867, "grad_norm": 0.07017987966537476, "learning_rate": 8.268718430389421e-06, "loss": 0.4654, "num_input_tokens_seen": 67147168, "step": 55210 }, { "epoch": 6.91830597669465, "grad_norm": 0.06580565869808197, "learning_rate": 8.268304703710642e-06, "loss": 0.4639, "num_input_tokens_seen": 67153408, "step": 55215 }, { "epoch": 6.918932464603433, "grad_norm": 0.09496248513460159, "learning_rate": 8.267890937956529e-06, "loss": 0.4639, "num_input_tokens_seen": 67159392, "step": 55220 }, { "epoch": 6.919558952512217, "grad_norm": 0.07351680845022202, "learning_rate": 8.267477133132035e-06, "loss": 0.4711, "num_input_tokens_seen": 67165376, "step": 55225 }, { "epoch": 6.920185440421, "grad_norm": 0.0776943638920784, "learning_rate": 8.267063289242103e-06, "loss": 0.4577, "num_input_tokens_seen": 67171840, "step": 55230 }, { "epoch": 6.920811928329783, "grad_norm": 0.08114895224571228, "learning_rate": 8.266649406291683e-06, "loss": 0.4729, "num_input_tokens_seen": 67177952, "step": 55235 }, { "epoch": 6.9214384162385665, "grad_norm": 0.07769756764173508, "learning_rate": 8.266235484285724e-06, "loss": 0.4721, "num_input_tokens_seen": 67184320, "step": 55240 }, { "epoch": 6.92206490414735, "grad_norm": 0.07533638924360275, "learning_rate": 8.265821523229174e-06, "loss": 0.4661, "num_input_tokens_seen": 67190336, "step": 55245 }, { "epoch": 6.922691392056134, "grad_norm": 0.0645010694861412, "learning_rate": 8.26540752312698e-06, "loss": 0.4664, "num_input_tokens_seen": 67196480, "step": 55250 }, { "epoch": 6.923317879964917, "grad_norm": 0.07266628742218018, "learning_rate": 8.264993483984095e-06, "loss": 0.4668, "num_input_tokens_seen": 67202560, "step": 55255 }, { "epoch": 6.9239443678737, "grad_norm": 0.06565912812948227, "learning_rate": 8.264579405805467e-06, "loss": 0.4612, "num_input_tokens_seen": 67208192, "step": 55260 }, { "epoch": 6.924570855782483, "grad_norm": 0.16606388986110687, "learning_rate": 8.264165288596048e-06, "loss": 0.4587, "num_input_tokens_seen": 67214016, "step": 55265 }, { "epoch": 6.925197343691266, "grad_norm": 0.07039551436901093, "learning_rate": 8.263751132360788e-06, "loss": 0.4649, "num_input_tokens_seen": 67220096, "step": 55270 }, { "epoch": 6.92582383160005, "grad_norm": 0.09874191880226135, "learning_rate": 8.263336937104641e-06, "loss": 0.4587, "num_input_tokens_seen": 67226368, "step": 55275 }, { "epoch": 6.926450319508834, "grad_norm": 0.12232136726379395, "learning_rate": 8.262922702832556e-06, "loss": 0.4625, "num_input_tokens_seen": 67232224, "step": 55280 }, { "epoch": 6.927076807417617, "grad_norm": 0.11059076339006424, "learning_rate": 8.262508429549486e-06, "loss": 0.4635, "num_input_tokens_seen": 67238304, "step": 55285 }, { "epoch": 6.9277032953264, "grad_norm": 0.07044339179992676, "learning_rate": 8.262094117260387e-06, "loss": 0.4563, "num_input_tokens_seen": 67244608, "step": 55290 }, { "epoch": 6.928329783235183, "grad_norm": 0.06591027230024338, "learning_rate": 8.261679765970207e-06, "loss": 0.4632, "num_input_tokens_seen": 67250624, "step": 55295 }, { "epoch": 6.928956271143967, "grad_norm": 0.07651457190513611, "learning_rate": 8.261265375683907e-06, "loss": 0.4565, "num_input_tokens_seen": 67256864, "step": 55300 }, { "epoch": 6.92958275905275, "grad_norm": 0.09436628222465515, "learning_rate": 8.260850946406435e-06, "loss": 0.4627, "num_input_tokens_seen": 67263232, "step": 55305 }, { "epoch": 6.9302092469615335, "grad_norm": 0.08201252669095993, "learning_rate": 8.260436478142748e-06, "loss": 0.4619, "num_input_tokens_seen": 67269568, "step": 55310 }, { "epoch": 6.930835734870317, "grad_norm": 0.06608359515666962, "learning_rate": 8.260021970897802e-06, "loss": 0.4583, "num_input_tokens_seen": 67275520, "step": 55315 }, { "epoch": 6.931462222779101, "grad_norm": 0.09239988774061203, "learning_rate": 8.259607424676555e-06, "loss": 0.4621, "num_input_tokens_seen": 67281952, "step": 55320 }, { "epoch": 6.932088710687884, "grad_norm": 0.09318789094686508, "learning_rate": 8.259192839483958e-06, "loss": 0.4611, "num_input_tokens_seen": 67287840, "step": 55325 }, { "epoch": 6.932715198596667, "grad_norm": 0.09854176640510559, "learning_rate": 8.25877821532497e-06, "loss": 0.463, "num_input_tokens_seen": 67293792, "step": 55330 }, { "epoch": 6.93334168650545, "grad_norm": 0.09236601740121841, "learning_rate": 8.258363552204552e-06, "loss": 0.4697, "num_input_tokens_seen": 67299808, "step": 55335 }, { "epoch": 6.933968174414234, "grad_norm": 0.07600012421607971, "learning_rate": 8.257948850127656e-06, "loss": 0.4617, "num_input_tokens_seen": 67306016, "step": 55340 }, { "epoch": 6.9345946623230175, "grad_norm": 0.038823649287223816, "learning_rate": 8.25753410909924e-06, "loss": 0.4566, "num_input_tokens_seen": 67312384, "step": 55345 }, { "epoch": 6.935221150231801, "grad_norm": 0.03767827898263931, "learning_rate": 8.257119329124268e-06, "loss": 0.4584, "num_input_tokens_seen": 67318464, "step": 55350 }, { "epoch": 6.935847638140584, "grad_norm": 0.15761756896972656, "learning_rate": 8.256704510207695e-06, "loss": 0.4528, "num_input_tokens_seen": 67324480, "step": 55355 }, { "epoch": 6.936474126049367, "grad_norm": 0.06768243759870529, "learning_rate": 8.256289652354482e-06, "loss": 0.4674, "num_input_tokens_seen": 67330688, "step": 55360 }, { "epoch": 6.937100613958151, "grad_norm": 0.06790823489427567, "learning_rate": 8.255874755569588e-06, "loss": 0.4648, "num_input_tokens_seen": 67337024, "step": 55365 }, { "epoch": 6.937727101866934, "grad_norm": 0.05531129613518715, "learning_rate": 8.255459819857974e-06, "loss": 0.4621, "num_input_tokens_seen": 67343200, "step": 55370 }, { "epoch": 6.938353589775717, "grad_norm": 0.08256698399782181, "learning_rate": 8.255044845224601e-06, "loss": 0.4603, "num_input_tokens_seen": 67349280, "step": 55375 }, { "epoch": 6.9389800776845005, "grad_norm": 0.08331161737442017, "learning_rate": 8.25462983167443e-06, "loss": 0.4616, "num_input_tokens_seen": 67355808, "step": 55380 }, { "epoch": 6.939606565593284, "grad_norm": 0.07217884063720703, "learning_rate": 8.254214779212423e-06, "loss": 0.4633, "num_input_tokens_seen": 67361632, "step": 55385 }, { "epoch": 6.940233053502068, "grad_norm": 0.09333605319261551, "learning_rate": 8.253799687843543e-06, "loss": 0.4759, "num_input_tokens_seen": 67367584, "step": 55390 }, { "epoch": 6.940859541410851, "grad_norm": 0.06154666468501091, "learning_rate": 8.253384557572752e-06, "loss": 0.46, "num_input_tokens_seen": 67373664, "step": 55395 }, { "epoch": 6.941486029319634, "grad_norm": 0.08308559656143188, "learning_rate": 8.252969388405012e-06, "loss": 0.4599, "num_input_tokens_seen": 67379424, "step": 55400 }, { "epoch": 6.942112517228417, "grad_norm": 0.036759261041879654, "learning_rate": 8.252554180345289e-06, "loss": 0.4628, "num_input_tokens_seen": 67385568, "step": 55405 }, { "epoch": 6.9427390051372, "grad_norm": 0.08540964871644974, "learning_rate": 8.252138933398547e-06, "loss": 0.4653, "num_input_tokens_seen": 67391328, "step": 55410 }, { "epoch": 6.9433654930459845, "grad_norm": 0.059872355312108994, "learning_rate": 8.251723647569747e-06, "loss": 0.4676, "num_input_tokens_seen": 67397536, "step": 55415 }, { "epoch": 6.943991980954768, "grad_norm": 0.06736642867326736, "learning_rate": 8.25130832286386e-06, "loss": 0.468, "num_input_tokens_seen": 67403776, "step": 55420 }, { "epoch": 6.944618468863551, "grad_norm": 0.10944945365190506, "learning_rate": 8.250892959285847e-06, "loss": 0.4649, "num_input_tokens_seen": 67409728, "step": 55425 }, { "epoch": 6.945244956772334, "grad_norm": 0.061966631561517715, "learning_rate": 8.250477556840675e-06, "loss": 0.468, "num_input_tokens_seen": 67415840, "step": 55430 }, { "epoch": 6.945871444681117, "grad_norm": 0.06615504622459412, "learning_rate": 8.250062115533312e-06, "loss": 0.4606, "num_input_tokens_seen": 67421600, "step": 55435 }, { "epoch": 6.946497932589901, "grad_norm": 0.08035348355770111, "learning_rate": 8.249646635368725e-06, "loss": 0.4641, "num_input_tokens_seen": 67427712, "step": 55440 }, { "epoch": 6.947124420498684, "grad_norm": 0.14059869945049286, "learning_rate": 8.249231116351878e-06, "loss": 0.4631, "num_input_tokens_seen": 67434048, "step": 55445 }, { "epoch": 6.947750908407468, "grad_norm": 0.07392936199903488, "learning_rate": 8.248815558487742e-06, "loss": 0.4664, "num_input_tokens_seen": 67439968, "step": 55450 }, { "epoch": 6.948377396316251, "grad_norm": 0.055151376873254776, "learning_rate": 8.248399961781284e-06, "loss": 0.4608, "num_input_tokens_seen": 67446112, "step": 55455 }, { "epoch": 6.949003884225035, "grad_norm": 0.07058814913034439, "learning_rate": 8.247984326237475e-06, "loss": 0.4625, "num_input_tokens_seen": 67452416, "step": 55460 }, { "epoch": 6.949630372133818, "grad_norm": 0.06949633359909058, "learning_rate": 8.247568651861282e-06, "loss": 0.4597, "num_input_tokens_seen": 67458496, "step": 55465 }, { "epoch": 6.950256860042601, "grad_norm": 0.07313910871744156, "learning_rate": 8.247152938657673e-06, "loss": 0.4621, "num_input_tokens_seen": 67463648, "step": 55470 }, { "epoch": 6.950883347951384, "grad_norm": 0.09668853878974915, "learning_rate": 8.246737186631624e-06, "loss": 0.4594, "num_input_tokens_seen": 67469792, "step": 55475 }, { "epoch": 6.951509835860168, "grad_norm": 0.07251878827810287, "learning_rate": 8.2463213957881e-06, "loss": 0.4676, "num_input_tokens_seen": 67475968, "step": 55480 }, { "epoch": 6.9521363237689515, "grad_norm": 0.08833576738834381, "learning_rate": 8.245905566132075e-06, "loss": 0.4608, "num_input_tokens_seen": 67481824, "step": 55485 }, { "epoch": 6.952762811677735, "grad_norm": 0.06672493368387222, "learning_rate": 8.24548969766852e-06, "loss": 0.4588, "num_input_tokens_seen": 67487840, "step": 55490 }, { "epoch": 6.953389299586518, "grad_norm": 0.10308104008436203, "learning_rate": 8.245073790402408e-06, "loss": 0.463, "num_input_tokens_seen": 67494080, "step": 55495 }, { "epoch": 6.954015787495301, "grad_norm": 0.06379517167806625, "learning_rate": 8.244657844338708e-06, "loss": 0.4651, "num_input_tokens_seen": 67500160, "step": 55500 }, { "epoch": 6.954642275404085, "grad_norm": 0.06845302134752274, "learning_rate": 8.244241859482399e-06, "loss": 0.4624, "num_input_tokens_seen": 67505728, "step": 55505 }, { "epoch": 6.955268763312868, "grad_norm": 0.06846870481967926, "learning_rate": 8.243825835838449e-06, "loss": 0.4642, "num_input_tokens_seen": 67511936, "step": 55510 }, { "epoch": 6.955895251221651, "grad_norm": 0.0688718929886818, "learning_rate": 8.243409773411833e-06, "loss": 0.4642, "num_input_tokens_seen": 67518112, "step": 55515 }, { "epoch": 6.956521739130435, "grad_norm": 0.0626966804265976, "learning_rate": 8.242993672207527e-06, "loss": 0.4623, "num_input_tokens_seen": 67524288, "step": 55520 }, { "epoch": 6.957148227039218, "grad_norm": 0.07190654426813126, "learning_rate": 8.242577532230506e-06, "loss": 0.471, "num_input_tokens_seen": 67530464, "step": 55525 }, { "epoch": 6.957774714948002, "grad_norm": 0.062113549560308456, "learning_rate": 8.242161353485743e-06, "loss": 0.4596, "num_input_tokens_seen": 67536576, "step": 55530 }, { "epoch": 6.958401202856785, "grad_norm": 0.04331729933619499, "learning_rate": 8.241745135978215e-06, "loss": 0.4583, "num_input_tokens_seen": 67542848, "step": 55535 }, { "epoch": 6.959027690765568, "grad_norm": 0.06899888068437576, "learning_rate": 8.2413288797129e-06, "loss": 0.4624, "num_input_tokens_seen": 67549120, "step": 55540 }, { "epoch": 6.959654178674351, "grad_norm": 0.0625743642449379, "learning_rate": 8.240912584694772e-06, "loss": 0.4626, "num_input_tokens_seen": 67555328, "step": 55545 }, { "epoch": 6.9602806665831345, "grad_norm": 0.1044226735830307, "learning_rate": 8.240496250928809e-06, "loss": 0.4605, "num_input_tokens_seen": 67561344, "step": 55550 }, { "epoch": 6.9609071544919185, "grad_norm": 0.10402008891105652, "learning_rate": 8.24007987841999e-06, "loss": 0.4657, "num_input_tokens_seen": 67567648, "step": 55555 }, { "epoch": 6.961533642400702, "grad_norm": 0.0463726744055748, "learning_rate": 8.239663467173291e-06, "loss": 0.4666, "num_input_tokens_seen": 67573728, "step": 55560 }, { "epoch": 6.962160130309485, "grad_norm": 0.09657995402812958, "learning_rate": 8.239247017193692e-06, "loss": 0.4564, "num_input_tokens_seen": 67580000, "step": 55565 }, { "epoch": 6.962786618218268, "grad_norm": 0.06359339505434036, "learning_rate": 8.238830528486172e-06, "loss": 0.4647, "num_input_tokens_seen": 67586208, "step": 55570 }, { "epoch": 6.963413106127052, "grad_norm": 0.06250900775194168, "learning_rate": 8.238414001055709e-06, "loss": 0.46, "num_input_tokens_seen": 67592416, "step": 55575 }, { "epoch": 6.964039594035835, "grad_norm": 0.09159776568412781, "learning_rate": 8.237997434907283e-06, "loss": 0.4661, "num_input_tokens_seen": 67598304, "step": 55580 }, { "epoch": 6.9646660819446184, "grad_norm": 0.10678771883249283, "learning_rate": 8.237580830045877e-06, "loss": 0.4615, "num_input_tokens_seen": 67604384, "step": 55585 }, { "epoch": 6.965292569853402, "grad_norm": 0.06931345164775848, "learning_rate": 8.23716418647647e-06, "loss": 0.4562, "num_input_tokens_seen": 67610976, "step": 55590 }, { "epoch": 6.965919057762186, "grad_norm": 0.06527707725763321, "learning_rate": 8.236747504204043e-06, "loss": 0.4608, "num_input_tokens_seen": 67616704, "step": 55595 }, { "epoch": 6.966545545670969, "grad_norm": 0.05824100226163864, "learning_rate": 8.236330783233578e-06, "loss": 0.4596, "num_input_tokens_seen": 67622592, "step": 55600 }, { "epoch": 6.967172033579752, "grad_norm": 0.08728864043951035, "learning_rate": 8.235914023570059e-06, "loss": 0.4573, "num_input_tokens_seen": 67628992, "step": 55605 }, { "epoch": 6.967798521488535, "grad_norm": 0.0654735416173935, "learning_rate": 8.235497225218466e-06, "loss": 0.4606, "num_input_tokens_seen": 67634976, "step": 55610 }, { "epoch": 6.968425009397318, "grad_norm": 0.08628292381763458, "learning_rate": 8.235080388183782e-06, "loss": 0.4664, "num_input_tokens_seen": 67641216, "step": 55615 }, { "epoch": 6.969051497306102, "grad_norm": 0.07025434076786041, "learning_rate": 8.234663512470994e-06, "loss": 0.4603, "num_input_tokens_seen": 67647200, "step": 55620 }, { "epoch": 6.969677985214886, "grad_norm": 0.08619408309459686, "learning_rate": 8.234246598085084e-06, "loss": 0.4646, "num_input_tokens_seen": 67653408, "step": 55625 }, { "epoch": 6.970304473123669, "grad_norm": 0.06746019423007965, "learning_rate": 8.233829645031038e-06, "loss": 0.4527, "num_input_tokens_seen": 67658368, "step": 55630 }, { "epoch": 6.970930961032452, "grad_norm": 0.10579253733158112, "learning_rate": 8.233412653313838e-06, "loss": 0.4698, "num_input_tokens_seen": 67664448, "step": 55635 }, { "epoch": 6.971557448941235, "grad_norm": 0.10715947300195694, "learning_rate": 8.23299562293847e-06, "loss": 0.4635, "num_input_tokens_seen": 67670592, "step": 55640 }, { "epoch": 6.972183936850019, "grad_norm": 0.13456325232982635, "learning_rate": 8.232578553909924e-06, "loss": 0.4605, "num_input_tokens_seen": 67677120, "step": 55645 }, { "epoch": 6.972810424758802, "grad_norm": 0.05087099224328995, "learning_rate": 8.232161446233183e-06, "loss": 0.4522, "num_input_tokens_seen": 67683200, "step": 55650 }, { "epoch": 6.9734369126675855, "grad_norm": 0.06516028195619583, "learning_rate": 8.231744299913233e-06, "loss": 0.4606, "num_input_tokens_seen": 67689056, "step": 55655 }, { "epoch": 6.974063400576369, "grad_norm": 0.07061664015054703, "learning_rate": 8.231327114955064e-06, "loss": 0.458, "num_input_tokens_seen": 67695136, "step": 55660 }, { "epoch": 6.974689888485152, "grad_norm": 0.12216222286224365, "learning_rate": 8.23090989136366e-06, "loss": 0.4607, "num_input_tokens_seen": 67701408, "step": 55665 }, { "epoch": 6.975316376393936, "grad_norm": 0.12475865334272385, "learning_rate": 8.230492629144016e-06, "loss": 0.4668, "num_input_tokens_seen": 67707712, "step": 55670 }, { "epoch": 6.975942864302719, "grad_norm": 0.08911743015050888, "learning_rate": 8.230075328301115e-06, "loss": 0.4675, "num_input_tokens_seen": 67714048, "step": 55675 }, { "epoch": 6.976569352211502, "grad_norm": 0.11468490213155746, "learning_rate": 8.229657988839948e-06, "loss": 0.457, "num_input_tokens_seen": 67720384, "step": 55680 }, { "epoch": 6.977195840120285, "grad_norm": 0.08265471458435059, "learning_rate": 8.229240610765504e-06, "loss": 0.4621, "num_input_tokens_seen": 67726144, "step": 55685 }, { "epoch": 6.9778223280290685, "grad_norm": 0.0784497782588005, "learning_rate": 8.228823194082771e-06, "loss": 0.4616, "num_input_tokens_seen": 67732160, "step": 55690 }, { "epoch": 6.978448815937853, "grad_norm": 0.06921535730361938, "learning_rate": 8.228405738796746e-06, "loss": 0.4516, "num_input_tokens_seen": 67738240, "step": 55695 }, { "epoch": 6.979075303846636, "grad_norm": 0.09619735181331635, "learning_rate": 8.227988244912413e-06, "loss": 0.4675, "num_input_tokens_seen": 67744032, "step": 55700 }, { "epoch": 6.979701791755419, "grad_norm": 0.07701034843921661, "learning_rate": 8.227570712434769e-06, "loss": 0.4642, "num_input_tokens_seen": 67750336, "step": 55705 }, { "epoch": 6.980328279664202, "grad_norm": 0.08207455277442932, "learning_rate": 8.227153141368802e-06, "loss": 0.4506, "num_input_tokens_seen": 67755648, "step": 55710 }, { "epoch": 6.980954767572986, "grad_norm": 0.07623391598463058, "learning_rate": 8.226735531719506e-06, "loss": 0.4655, "num_input_tokens_seen": 67761152, "step": 55715 }, { "epoch": 6.981581255481769, "grad_norm": 0.1182507574558258, "learning_rate": 8.226317883491874e-06, "loss": 0.4585, "num_input_tokens_seen": 67767456, "step": 55720 }, { "epoch": 6.9822077433905525, "grad_norm": 0.07608425617218018, "learning_rate": 8.2259001966909e-06, "loss": 0.4593, "num_input_tokens_seen": 67773888, "step": 55725 }, { "epoch": 6.982834231299336, "grad_norm": 0.13172930479049683, "learning_rate": 8.225482471321575e-06, "loss": 0.4638, "num_input_tokens_seen": 67779872, "step": 55730 }, { "epoch": 6.98346071920812, "grad_norm": 0.07331668585538864, "learning_rate": 8.225064707388898e-06, "loss": 0.4645, "num_input_tokens_seen": 67785568, "step": 55735 }, { "epoch": 6.984087207116903, "grad_norm": 0.09611187875270844, "learning_rate": 8.224646904897859e-06, "loss": 0.4554, "num_input_tokens_seen": 67791776, "step": 55740 }, { "epoch": 6.984713695025686, "grad_norm": 0.12894751131534576, "learning_rate": 8.224229063853456e-06, "loss": 0.4578, "num_input_tokens_seen": 67798016, "step": 55745 }, { "epoch": 6.985340182934469, "grad_norm": 0.07579906284809113, "learning_rate": 8.223811184260684e-06, "loss": 0.4591, "num_input_tokens_seen": 67804096, "step": 55750 }, { "epoch": 6.985966670843252, "grad_norm": 0.046228233724832535, "learning_rate": 8.223393266124537e-06, "loss": 0.4565, "num_input_tokens_seen": 67810368, "step": 55755 }, { "epoch": 6.9865931587520365, "grad_norm": 0.054713211953639984, "learning_rate": 8.222975309450017e-06, "loss": 0.4493, "num_input_tokens_seen": 67816512, "step": 55760 }, { "epoch": 6.98721964666082, "grad_norm": 0.08231847733259201, "learning_rate": 8.222557314242114e-06, "loss": 0.4552, "num_input_tokens_seen": 67822752, "step": 55765 }, { "epoch": 6.987846134569603, "grad_norm": 0.10927849262952805, "learning_rate": 8.22213928050583e-06, "loss": 0.4713, "num_input_tokens_seen": 67828864, "step": 55770 }, { "epoch": 6.988472622478386, "grad_norm": 0.15592467784881592, "learning_rate": 8.221721208246162e-06, "loss": 0.4736, "num_input_tokens_seen": 67834784, "step": 55775 }, { "epoch": 6.989099110387169, "grad_norm": 0.08531132340431213, "learning_rate": 8.221303097468107e-06, "loss": 0.4635, "num_input_tokens_seen": 67840096, "step": 55780 }, { "epoch": 6.989725598295953, "grad_norm": 0.14034275710582733, "learning_rate": 8.220884948176668e-06, "loss": 0.4848, "num_input_tokens_seen": 67846144, "step": 55785 }, { "epoch": 6.990352086204736, "grad_norm": 0.0814322829246521, "learning_rate": 8.220466760376839e-06, "loss": 0.4594, "num_input_tokens_seen": 67851488, "step": 55790 }, { "epoch": 6.9909785741135195, "grad_norm": 0.09439517557621002, "learning_rate": 8.220048534073623e-06, "loss": 0.4755, "num_input_tokens_seen": 67857728, "step": 55795 }, { "epoch": 6.991605062022303, "grad_norm": 0.08438018709421158, "learning_rate": 8.21963026927202e-06, "loss": 0.4613, "num_input_tokens_seen": 67863648, "step": 55800 }, { "epoch": 6.992231549931086, "grad_norm": 0.07005096971988678, "learning_rate": 8.21921196597703e-06, "loss": 0.4634, "num_input_tokens_seen": 67869600, "step": 55805 }, { "epoch": 6.99285803783987, "grad_norm": 0.08523931354284286, "learning_rate": 8.218793624193655e-06, "loss": 0.469, "num_input_tokens_seen": 67875840, "step": 55810 }, { "epoch": 6.993484525748653, "grad_norm": 0.06841006129980087, "learning_rate": 8.218375243926895e-06, "loss": 0.4597, "num_input_tokens_seen": 67882048, "step": 55815 }, { "epoch": 6.994111013657436, "grad_norm": 0.06174671649932861, "learning_rate": 8.217956825181752e-06, "loss": 0.4639, "num_input_tokens_seen": 67888128, "step": 55820 }, { "epoch": 6.994737501566219, "grad_norm": 0.0838964581489563, "learning_rate": 8.217538367963231e-06, "loss": 0.4616, "num_input_tokens_seen": 67894272, "step": 55825 }, { "epoch": 6.9953639894750035, "grad_norm": 0.0640077143907547, "learning_rate": 8.217119872276335e-06, "loss": 0.4662, "num_input_tokens_seen": 67900288, "step": 55830 }, { "epoch": 6.995990477383787, "grad_norm": 0.06835369765758514, "learning_rate": 8.216701338126066e-06, "loss": 0.4582, "num_input_tokens_seen": 67906304, "step": 55835 }, { "epoch": 6.99661696529257, "grad_norm": 0.07026299089193344, "learning_rate": 8.216282765517424e-06, "loss": 0.4687, "num_input_tokens_seen": 67912480, "step": 55840 }, { "epoch": 6.997243453201353, "grad_norm": 0.062106262892484665, "learning_rate": 8.215864154455421e-06, "loss": 0.4658, "num_input_tokens_seen": 67918496, "step": 55845 }, { "epoch": 6.997869941110137, "grad_norm": 0.06543905287981033, "learning_rate": 8.215445504945057e-06, "loss": 0.4564, "num_input_tokens_seen": 67924480, "step": 55850 }, { "epoch": 6.99849642901892, "grad_norm": 0.08242172747850418, "learning_rate": 8.21502681699134e-06, "loss": 0.4675, "num_input_tokens_seen": 67930688, "step": 55855 }, { "epoch": 6.999122916927703, "grad_norm": 0.06195215880870819, "learning_rate": 8.214608090599274e-06, "loss": 0.4607, "num_input_tokens_seen": 67936768, "step": 55860 }, { "epoch": 6.9997494048364866, "grad_norm": 0.061472173780202866, "learning_rate": 8.214189325773866e-06, "loss": 0.468, "num_input_tokens_seen": 67943104, "step": 55865 }, { "epoch": 7.00037589274527, "grad_norm": 0.0975995734333992, "learning_rate": 8.21377052252012e-06, "loss": 0.4564, "num_input_tokens_seen": 67949120, "step": 55870 }, { "epoch": 7.001002380654054, "grad_norm": 0.059131406247615814, "learning_rate": 8.213351680843048e-06, "loss": 0.4674, "num_input_tokens_seen": 67955488, "step": 55875 }, { "epoch": 7.001628868562837, "grad_norm": 0.05865243822336197, "learning_rate": 8.212932800747653e-06, "loss": 0.469, "num_input_tokens_seen": 67961696, "step": 55880 }, { "epoch": 7.00225535647162, "grad_norm": 0.07047157734632492, "learning_rate": 8.212513882238945e-06, "loss": 0.4637, "num_input_tokens_seen": 67967776, "step": 55885 }, { "epoch": 7.002881844380403, "grad_norm": 0.05827803537249565, "learning_rate": 8.212094925321933e-06, "loss": 0.4599, "num_input_tokens_seen": 67974208, "step": 55890 }, { "epoch": 7.0035083322891865, "grad_norm": 0.15115460753440857, "learning_rate": 8.211675930001627e-06, "loss": 0.4589, "num_input_tokens_seen": 67980288, "step": 55895 }, { "epoch": 7.0041348201979705, "grad_norm": 0.05846986919641495, "learning_rate": 8.211256896283034e-06, "loss": 0.4678, "num_input_tokens_seen": 67986432, "step": 55900 }, { "epoch": 7.004761308106754, "grad_norm": 0.06044790521264076, "learning_rate": 8.210837824171164e-06, "loss": 0.4638, "num_input_tokens_seen": 67992672, "step": 55905 }, { "epoch": 7.005387796015537, "grad_norm": 0.07081387937068939, "learning_rate": 8.21041871367103e-06, "loss": 0.4605, "num_input_tokens_seen": 67998816, "step": 55910 }, { "epoch": 7.00601428392432, "grad_norm": 0.06629027426242828, "learning_rate": 8.20999956478764e-06, "loss": 0.4631, "num_input_tokens_seen": 68005120, "step": 55915 }, { "epoch": 7.006640771833103, "grad_norm": 0.05788884311914444, "learning_rate": 8.209580377526007e-06, "loss": 0.4579, "num_input_tokens_seen": 68011200, "step": 55920 }, { "epoch": 7.007267259741887, "grad_norm": 0.06581943482160568, "learning_rate": 8.209161151891142e-06, "loss": 0.4641, "num_input_tokens_seen": 68017312, "step": 55925 }, { "epoch": 7.00789374765067, "grad_norm": 0.0943935364484787, "learning_rate": 8.208741887888057e-06, "loss": 0.4565, "num_input_tokens_seen": 68023520, "step": 55930 }, { "epoch": 7.008520235559454, "grad_norm": 0.06991244107484818, "learning_rate": 8.208322585521766e-06, "loss": 0.4599, "num_input_tokens_seen": 68029568, "step": 55935 }, { "epoch": 7.009146723468237, "grad_norm": 0.0755968689918518, "learning_rate": 8.20790324479728e-06, "loss": 0.4587, "num_input_tokens_seen": 68035456, "step": 55940 }, { "epoch": 7.009773211377021, "grad_norm": 0.06421244144439697, "learning_rate": 8.207483865719615e-06, "loss": 0.4706, "num_input_tokens_seen": 68041760, "step": 55945 }, { "epoch": 7.010399699285804, "grad_norm": 0.0743982344865799, "learning_rate": 8.207064448293781e-06, "loss": 0.4614, "num_input_tokens_seen": 68048224, "step": 55950 }, { "epoch": 7.011026187194587, "grad_norm": 0.06683633476495743, "learning_rate": 8.206644992524797e-06, "loss": 0.4675, "num_input_tokens_seen": 68054368, "step": 55955 }, { "epoch": 7.01165267510337, "grad_norm": 0.07251283526420593, "learning_rate": 8.206225498417677e-06, "loss": 0.4668, "num_input_tokens_seen": 68060768, "step": 55960 }, { "epoch": 7.0122791630121535, "grad_norm": 0.06438930332660675, "learning_rate": 8.205805965977434e-06, "loss": 0.4588, "num_input_tokens_seen": 68066848, "step": 55965 }, { "epoch": 7.0129056509209375, "grad_norm": 0.07067041099071503, "learning_rate": 8.205386395209084e-06, "loss": 0.4544, "num_input_tokens_seen": 68072960, "step": 55970 }, { "epoch": 7.013532138829721, "grad_norm": 0.0731133371591568, "learning_rate": 8.204966786117647e-06, "loss": 0.4617, "num_input_tokens_seen": 68078752, "step": 55975 }, { "epoch": 7.014158626738504, "grad_norm": 0.06257858127355576, "learning_rate": 8.204547138708136e-06, "loss": 0.4642, "num_input_tokens_seen": 68084896, "step": 55980 }, { "epoch": 7.014785114647287, "grad_norm": 0.08157789707183838, "learning_rate": 8.204127452985572e-06, "loss": 0.4623, "num_input_tokens_seen": 68091328, "step": 55985 }, { "epoch": 7.015411602556071, "grad_norm": 0.09985513985157013, "learning_rate": 8.203707728954969e-06, "loss": 0.4649, "num_input_tokens_seen": 68097536, "step": 55990 }, { "epoch": 7.016038090464854, "grad_norm": 0.0653180181980133, "learning_rate": 8.203287966621347e-06, "loss": 0.4613, "num_input_tokens_seen": 68103776, "step": 55995 }, { "epoch": 7.016664578373637, "grad_norm": 0.10980980098247528, "learning_rate": 8.202868165989724e-06, "loss": 0.463, "num_input_tokens_seen": 68109952, "step": 56000 }, { "epoch": 7.017291066282421, "grad_norm": 0.055623479187488556, "learning_rate": 8.202448327065119e-06, "loss": 0.4594, "num_input_tokens_seen": 68115712, "step": 56005 }, { "epoch": 7.017917554191204, "grad_norm": 0.03661355748772621, "learning_rate": 8.202028449852553e-06, "loss": 0.4594, "num_input_tokens_seen": 68121760, "step": 56010 }, { "epoch": 7.018544042099988, "grad_norm": 0.11958172172307968, "learning_rate": 8.201608534357043e-06, "loss": 0.4621, "num_input_tokens_seen": 68127456, "step": 56015 }, { "epoch": 7.019170530008771, "grad_norm": 0.09829627722501755, "learning_rate": 8.201188580583612e-06, "loss": 0.468, "num_input_tokens_seen": 68133632, "step": 56020 }, { "epoch": 7.019797017917554, "grad_norm": 0.08823823183774948, "learning_rate": 8.20076858853728e-06, "loss": 0.4609, "num_input_tokens_seen": 68139808, "step": 56025 }, { "epoch": 7.020423505826337, "grad_norm": 0.060557227581739426, "learning_rate": 8.200348558223067e-06, "loss": 0.4658, "num_input_tokens_seen": 68145856, "step": 56030 }, { "epoch": 7.0210499937351205, "grad_norm": 0.06740615516901016, "learning_rate": 8.199928489645999e-06, "loss": 0.4669, "num_input_tokens_seen": 68152032, "step": 56035 }, { "epoch": 7.021676481643905, "grad_norm": 0.06666356325149536, "learning_rate": 8.199508382811094e-06, "loss": 0.4634, "num_input_tokens_seen": 68157984, "step": 56040 }, { "epoch": 7.022302969552688, "grad_norm": 0.09722218662500381, "learning_rate": 8.199088237723376e-06, "loss": 0.4557, "num_input_tokens_seen": 68164416, "step": 56045 }, { "epoch": 7.022929457461471, "grad_norm": 0.09446578472852707, "learning_rate": 8.19866805438787e-06, "loss": 0.4569, "num_input_tokens_seen": 68170048, "step": 56050 }, { "epoch": 7.023555945370254, "grad_norm": 0.11200512200593948, "learning_rate": 8.198247832809596e-06, "loss": 0.4664, "num_input_tokens_seen": 68176128, "step": 56055 }, { "epoch": 7.024182433279038, "grad_norm": 0.0733233168721199, "learning_rate": 8.197827572993583e-06, "loss": 0.4687, "num_input_tokens_seen": 68182112, "step": 56060 }, { "epoch": 7.024808921187821, "grad_norm": 0.06851015239953995, "learning_rate": 8.197407274944851e-06, "loss": 0.465, "num_input_tokens_seen": 68188384, "step": 56065 }, { "epoch": 7.0254354090966045, "grad_norm": 0.06977731734514236, "learning_rate": 8.196986938668427e-06, "loss": 0.4575, "num_input_tokens_seen": 68194016, "step": 56070 }, { "epoch": 7.026061897005388, "grad_norm": 0.06601143628358841, "learning_rate": 8.196566564169334e-06, "loss": 0.4671, "num_input_tokens_seen": 68200032, "step": 56075 }, { "epoch": 7.026688384914171, "grad_norm": 0.06921476870775223, "learning_rate": 8.196146151452605e-06, "loss": 0.4563, "num_input_tokens_seen": 68205888, "step": 56080 }, { "epoch": 7.027314872822955, "grad_norm": 0.06344060599803925, "learning_rate": 8.195725700523258e-06, "loss": 0.4641, "num_input_tokens_seen": 68212256, "step": 56085 }, { "epoch": 7.027941360731738, "grad_norm": 0.11043659597635269, "learning_rate": 8.195305211386325e-06, "loss": 0.4615, "num_input_tokens_seen": 68218688, "step": 56090 }, { "epoch": 7.028567848640521, "grad_norm": 0.06154178828001022, "learning_rate": 8.19488468404683e-06, "loss": 0.4667, "num_input_tokens_seen": 68224320, "step": 56095 }, { "epoch": 7.029194336549304, "grad_norm": 0.05983494222164154, "learning_rate": 8.194464118509803e-06, "loss": 0.4652, "num_input_tokens_seen": 68230464, "step": 56100 }, { "epoch": 7.0298208244580875, "grad_norm": 0.06977193802595139, "learning_rate": 8.194043514780273e-06, "loss": 0.4648, "num_input_tokens_seen": 68236800, "step": 56105 }, { "epoch": 7.030447312366872, "grad_norm": 0.0811069905757904, "learning_rate": 8.193622872863267e-06, "loss": 0.4579, "num_input_tokens_seen": 68242912, "step": 56110 }, { "epoch": 7.031073800275655, "grad_norm": 0.0978655144572258, "learning_rate": 8.193202192763815e-06, "loss": 0.4687, "num_input_tokens_seen": 68248960, "step": 56115 }, { "epoch": 7.031700288184438, "grad_norm": 0.05855752155184746, "learning_rate": 8.192781474486946e-06, "loss": 0.4621, "num_input_tokens_seen": 68255136, "step": 56120 }, { "epoch": 7.032326776093221, "grad_norm": 0.0639239028096199, "learning_rate": 8.192360718037688e-06, "loss": 0.4594, "num_input_tokens_seen": 68261280, "step": 56125 }, { "epoch": 7.032953264002005, "grad_norm": 0.06368457525968552, "learning_rate": 8.191939923421076e-06, "loss": 0.4635, "num_input_tokens_seen": 68267136, "step": 56130 }, { "epoch": 7.033579751910788, "grad_norm": 0.15279823541641235, "learning_rate": 8.191519090642137e-06, "loss": 0.4622, "num_input_tokens_seen": 68272416, "step": 56135 }, { "epoch": 7.0342062398195715, "grad_norm": 0.06586555391550064, "learning_rate": 8.191098219705906e-06, "loss": 0.4616, "num_input_tokens_seen": 68277856, "step": 56140 }, { "epoch": 7.034832727728355, "grad_norm": 0.06540022790431976, "learning_rate": 8.19067731061741e-06, "loss": 0.4641, "num_input_tokens_seen": 68284320, "step": 56145 }, { "epoch": 7.035459215637138, "grad_norm": 0.06424936652183533, "learning_rate": 8.190256363381686e-06, "loss": 0.4608, "num_input_tokens_seen": 68289792, "step": 56150 }, { "epoch": 7.036085703545922, "grad_norm": 0.06116881221532822, "learning_rate": 8.189835378003764e-06, "loss": 0.467, "num_input_tokens_seen": 68295744, "step": 56155 }, { "epoch": 7.036712191454705, "grad_norm": 0.09782280027866364, "learning_rate": 8.189414354488679e-06, "loss": 0.4615, "num_input_tokens_seen": 68302432, "step": 56160 }, { "epoch": 7.037338679363488, "grad_norm": 0.06219965219497681, "learning_rate": 8.188993292841463e-06, "loss": 0.4611, "num_input_tokens_seen": 68308832, "step": 56165 }, { "epoch": 7.037965167272271, "grad_norm": 0.0613027885556221, "learning_rate": 8.188572193067152e-06, "loss": 0.4609, "num_input_tokens_seen": 68314624, "step": 56170 }, { "epoch": 7.038591655181055, "grad_norm": 0.1292506605386734, "learning_rate": 8.188151055170781e-06, "loss": 0.4679, "num_input_tokens_seen": 68320896, "step": 56175 }, { "epoch": 7.039218143089839, "grad_norm": 0.09920259565114975, "learning_rate": 8.18772987915738e-06, "loss": 0.4613, "num_input_tokens_seen": 68327200, "step": 56180 }, { "epoch": 7.039844630998622, "grad_norm": 0.15155740082263947, "learning_rate": 8.187308665031992e-06, "loss": 0.4685, "num_input_tokens_seen": 68333024, "step": 56185 }, { "epoch": 7.040471118907405, "grad_norm": 0.058562640100717545, "learning_rate": 8.186887412799647e-06, "loss": 0.4603, "num_input_tokens_seen": 68339328, "step": 56190 }, { "epoch": 7.041097606816188, "grad_norm": 0.061975814402103424, "learning_rate": 8.186466122465384e-06, "loss": 0.4625, "num_input_tokens_seen": 68345024, "step": 56195 }, { "epoch": 7.041724094724972, "grad_norm": 0.03918513283133507, "learning_rate": 8.186044794034241e-06, "loss": 0.4564, "num_input_tokens_seen": 68351168, "step": 56200 }, { "epoch": 7.042350582633755, "grad_norm": 0.09735699743032455, "learning_rate": 8.185623427511252e-06, "loss": 0.4667, "num_input_tokens_seen": 68357152, "step": 56205 }, { "epoch": 7.0429770705425385, "grad_norm": 0.05681267008185387, "learning_rate": 8.185202022901455e-06, "loss": 0.4573, "num_input_tokens_seen": 68363232, "step": 56210 }, { "epoch": 7.043603558451322, "grad_norm": 0.07753187417984009, "learning_rate": 8.184780580209892e-06, "loss": 0.4601, "num_input_tokens_seen": 68369440, "step": 56215 }, { "epoch": 7.044230046360105, "grad_norm": 0.058937735855579376, "learning_rate": 8.184359099441599e-06, "loss": 0.4615, "num_input_tokens_seen": 68375616, "step": 56220 }, { "epoch": 7.044856534268889, "grad_norm": 0.09763453900814056, "learning_rate": 8.183937580601618e-06, "loss": 0.4649, "num_input_tokens_seen": 68381152, "step": 56225 }, { "epoch": 7.045483022177672, "grad_norm": 0.04669570550322533, "learning_rate": 8.183516023694984e-06, "loss": 0.4635, "num_input_tokens_seen": 68387360, "step": 56230 }, { "epoch": 7.046109510086455, "grad_norm": 0.06495622545480728, "learning_rate": 8.18309442872674e-06, "loss": 0.464, "num_input_tokens_seen": 68393600, "step": 56235 }, { "epoch": 7.046735997995238, "grad_norm": 0.10363076627254486, "learning_rate": 8.182672795701924e-06, "loss": 0.4612, "num_input_tokens_seen": 68399744, "step": 56240 }, { "epoch": 7.0473624859040225, "grad_norm": 0.06472937762737274, "learning_rate": 8.182251124625581e-06, "loss": 0.462, "num_input_tokens_seen": 68405920, "step": 56245 }, { "epoch": 7.047988973812806, "grad_norm": 0.1091466024518013, "learning_rate": 8.18182941550275e-06, "loss": 0.4648, "num_input_tokens_seen": 68411552, "step": 56250 }, { "epoch": 7.048615461721589, "grad_norm": 0.0610625334084034, "learning_rate": 8.181407668338472e-06, "loss": 0.462, "num_input_tokens_seen": 68417504, "step": 56255 }, { "epoch": 7.049241949630372, "grad_norm": 0.057533763349056244, "learning_rate": 8.180985883137793e-06, "loss": 0.4673, "num_input_tokens_seen": 68423616, "step": 56260 }, { "epoch": 7.049868437539155, "grad_norm": 0.0585801787674427, "learning_rate": 8.18056405990575e-06, "loss": 0.462, "num_input_tokens_seen": 68429664, "step": 56265 }, { "epoch": 7.050494925447939, "grad_norm": 0.03669837862253189, "learning_rate": 8.180142198647395e-06, "loss": 0.4624, "num_input_tokens_seen": 68435904, "step": 56270 }, { "epoch": 7.051121413356722, "grad_norm": 0.0643441304564476, "learning_rate": 8.17972029936776e-06, "loss": 0.4633, "num_input_tokens_seen": 68442272, "step": 56275 }, { "epoch": 7.0517479012655055, "grad_norm": 0.08989421278238297, "learning_rate": 8.1792983620719e-06, "loss": 0.4659, "num_input_tokens_seen": 68448416, "step": 56280 }, { "epoch": 7.052374389174289, "grad_norm": 0.0661698430776596, "learning_rate": 8.17887638676485e-06, "loss": 0.4656, "num_input_tokens_seen": 68454560, "step": 56285 }, { "epoch": 7.053000877083072, "grad_norm": 0.08054511994123459, "learning_rate": 8.178454373451665e-06, "loss": 0.4636, "num_input_tokens_seen": 68460704, "step": 56290 }, { "epoch": 7.053627364991856, "grad_norm": 0.09611351788043976, "learning_rate": 8.178032322137385e-06, "loss": 0.4619, "num_input_tokens_seen": 68466720, "step": 56295 }, { "epoch": 7.054253852900639, "grad_norm": 0.06481150537729263, "learning_rate": 8.177610232827055e-06, "loss": 0.4701, "num_input_tokens_seen": 68473184, "step": 56300 }, { "epoch": 7.054880340809422, "grad_norm": 0.03461882099509239, "learning_rate": 8.177188105525723e-06, "loss": 0.4573, "num_input_tokens_seen": 68479136, "step": 56305 }, { "epoch": 7.0555068287182054, "grad_norm": 0.10593289136886597, "learning_rate": 8.176765940238439e-06, "loss": 0.4634, "num_input_tokens_seen": 68484608, "step": 56310 }, { "epoch": 7.0561333166269895, "grad_norm": 0.06937837600708008, "learning_rate": 8.176343736970244e-06, "loss": 0.462, "num_input_tokens_seen": 68490656, "step": 56315 }, { "epoch": 7.056759804535773, "grad_norm": 0.105927012860775, "learning_rate": 8.175921495726192e-06, "loss": 0.4601, "num_input_tokens_seen": 68496832, "step": 56320 }, { "epoch": 7.057386292444556, "grad_norm": 0.03544261306524277, "learning_rate": 8.175499216511325e-06, "loss": 0.4586, "num_input_tokens_seen": 68502880, "step": 56325 }, { "epoch": 7.058012780353339, "grad_norm": 0.08132651448249817, "learning_rate": 8.175076899330696e-06, "loss": 0.4632, "num_input_tokens_seen": 68509024, "step": 56330 }, { "epoch": 7.058639268262122, "grad_norm": 0.06353525072336197, "learning_rate": 8.174654544189353e-06, "loss": 0.4604, "num_input_tokens_seen": 68515232, "step": 56335 }, { "epoch": 7.059265756170906, "grad_norm": 0.06477364152669907, "learning_rate": 8.174232151092348e-06, "loss": 0.4588, "num_input_tokens_seen": 68520608, "step": 56340 }, { "epoch": 7.059892244079689, "grad_norm": 0.04088320955634117, "learning_rate": 8.173809720044727e-06, "loss": 0.4658, "num_input_tokens_seen": 68527072, "step": 56345 }, { "epoch": 7.060518731988473, "grad_norm": 0.03098534420132637, "learning_rate": 8.173387251051543e-06, "loss": 0.4606, "num_input_tokens_seen": 68533280, "step": 56350 }, { "epoch": 7.061145219897256, "grad_norm": 0.09027035534381866, "learning_rate": 8.172964744117847e-06, "loss": 0.4604, "num_input_tokens_seen": 68539328, "step": 56355 }, { "epoch": 7.061771707806039, "grad_norm": 0.06597312539815903, "learning_rate": 8.172542199248688e-06, "loss": 0.4604, "num_input_tokens_seen": 68545472, "step": 56360 }, { "epoch": 7.062398195714823, "grad_norm": 0.04808127135038376, "learning_rate": 8.172119616449122e-06, "loss": 0.4643, "num_input_tokens_seen": 68551648, "step": 56365 }, { "epoch": 7.063024683623606, "grad_norm": 0.03986603766679764, "learning_rate": 8.171696995724198e-06, "loss": 0.4637, "num_input_tokens_seen": 68557856, "step": 56370 }, { "epoch": 7.063651171532389, "grad_norm": 0.07312554121017456, "learning_rate": 8.17127433707897e-06, "loss": 0.4601, "num_input_tokens_seen": 68563488, "step": 56375 }, { "epoch": 7.0642776594411725, "grad_norm": 0.11888191103935242, "learning_rate": 8.17085164051849e-06, "loss": 0.4601, "num_input_tokens_seen": 68569120, "step": 56380 }, { "epoch": 7.0649041473499565, "grad_norm": 0.060019683092832565, "learning_rate": 8.170428906047814e-06, "loss": 0.4647, "num_input_tokens_seen": 68575392, "step": 56385 }, { "epoch": 7.06553063525874, "grad_norm": 0.096324123442173, "learning_rate": 8.170006133671993e-06, "loss": 0.4641, "num_input_tokens_seen": 68581344, "step": 56390 }, { "epoch": 7.066157123167523, "grad_norm": 0.10420694947242737, "learning_rate": 8.169583323396086e-06, "loss": 0.468, "num_input_tokens_seen": 68586624, "step": 56395 }, { "epoch": 7.066783611076306, "grad_norm": 0.09342078864574432, "learning_rate": 8.169160475225143e-06, "loss": 0.4597, "num_input_tokens_seen": 68592544, "step": 56400 }, { "epoch": 7.067410098985089, "grad_norm": 0.08822450786828995, "learning_rate": 8.168737589164224e-06, "loss": 0.4656, "num_input_tokens_seen": 68598848, "step": 56405 }, { "epoch": 7.068036586893873, "grad_norm": 0.0652826800942421, "learning_rate": 8.168314665218382e-06, "loss": 0.4556, "num_input_tokens_seen": 68604928, "step": 56410 }, { "epoch": 7.068663074802656, "grad_norm": 0.07361152023077011, "learning_rate": 8.167891703392674e-06, "loss": 0.4639, "num_input_tokens_seen": 68611168, "step": 56415 }, { "epoch": 7.06928956271144, "grad_norm": 0.08770369738340378, "learning_rate": 8.167468703692159e-06, "loss": 0.4669, "num_input_tokens_seen": 68617536, "step": 56420 }, { "epoch": 7.069916050620223, "grad_norm": 0.04252736642956734, "learning_rate": 8.167045666121892e-06, "loss": 0.461, "num_input_tokens_seen": 68623520, "step": 56425 }, { "epoch": 7.070542538529007, "grad_norm": 0.06280459463596344, "learning_rate": 8.166622590686931e-06, "loss": 0.4591, "num_input_tokens_seen": 68629696, "step": 56430 }, { "epoch": 7.07116902643779, "grad_norm": 0.06976743042469025, "learning_rate": 8.166199477392333e-06, "loss": 0.4614, "num_input_tokens_seen": 68635840, "step": 56435 }, { "epoch": 7.071795514346573, "grad_norm": 0.10384315997362137, "learning_rate": 8.16577632624316e-06, "loss": 0.4605, "num_input_tokens_seen": 68641824, "step": 56440 }, { "epoch": 7.072422002255356, "grad_norm": 0.07306578755378723, "learning_rate": 8.16535313724447e-06, "loss": 0.4659, "num_input_tokens_seen": 68648160, "step": 56445 }, { "epoch": 7.0730484901641395, "grad_norm": 0.12348221242427826, "learning_rate": 8.164929910401323e-06, "loss": 0.4645, "num_input_tokens_seen": 68654272, "step": 56450 }, { "epoch": 7.073674978072924, "grad_norm": 0.09405521303415298, "learning_rate": 8.164506645718775e-06, "loss": 0.4656, "num_input_tokens_seen": 68660384, "step": 56455 }, { "epoch": 7.074301465981707, "grad_norm": 0.04418829828500748, "learning_rate": 8.16408334320189e-06, "loss": 0.463, "num_input_tokens_seen": 68665824, "step": 56460 }, { "epoch": 7.07492795389049, "grad_norm": 0.11297276616096497, "learning_rate": 8.16366000285573e-06, "loss": 0.4693, "num_input_tokens_seen": 68671776, "step": 56465 }, { "epoch": 7.075554441799273, "grad_norm": 0.061518244445323944, "learning_rate": 8.163236624685352e-06, "loss": 0.4654, "num_input_tokens_seen": 68677152, "step": 56470 }, { "epoch": 7.076180929708056, "grad_norm": 0.09644148498773575, "learning_rate": 8.162813208695823e-06, "loss": 0.4596, "num_input_tokens_seen": 68683232, "step": 56475 }, { "epoch": 7.07680741761684, "grad_norm": 0.06404447555541992, "learning_rate": 8.162389754892203e-06, "loss": 0.4651, "num_input_tokens_seen": 68689472, "step": 56480 }, { "epoch": 7.0774339055256235, "grad_norm": 0.09573861956596375, "learning_rate": 8.161966263279554e-06, "loss": 0.4672, "num_input_tokens_seen": 68695424, "step": 56485 }, { "epoch": 7.078060393434407, "grad_norm": 0.15748287737369537, "learning_rate": 8.16154273386294e-06, "loss": 0.4621, "num_input_tokens_seen": 68701536, "step": 56490 }, { "epoch": 7.07868688134319, "grad_norm": 0.06358393281698227, "learning_rate": 8.161119166647425e-06, "loss": 0.466, "num_input_tokens_seen": 68707456, "step": 56495 }, { "epoch": 7.079313369251974, "grad_norm": 0.07442507147789001, "learning_rate": 8.160695561638072e-06, "loss": 0.462, "num_input_tokens_seen": 68713696, "step": 56500 }, { "epoch": 7.079939857160757, "grad_norm": 0.07436492294073105, "learning_rate": 8.160271918839946e-06, "loss": 0.4578, "num_input_tokens_seen": 68719936, "step": 56505 }, { "epoch": 7.08056634506954, "grad_norm": 0.057789649814367294, "learning_rate": 8.159848238258112e-06, "loss": 0.4625, "num_input_tokens_seen": 68725024, "step": 56510 }, { "epoch": 7.081192832978323, "grad_norm": 0.08571602404117584, "learning_rate": 8.159424519897636e-06, "loss": 0.4597, "num_input_tokens_seen": 68730560, "step": 56515 }, { "epoch": 7.0818193208871065, "grad_norm": 0.05713112652301788, "learning_rate": 8.159000763763584e-06, "loss": 0.4578, "num_input_tokens_seen": 68736704, "step": 56520 }, { "epoch": 7.082445808795891, "grad_norm": 0.06606540083885193, "learning_rate": 8.158576969861022e-06, "loss": 0.4593, "num_input_tokens_seen": 68742464, "step": 56525 }, { "epoch": 7.083072296704674, "grad_norm": 0.0917825698852539, "learning_rate": 8.158153138195017e-06, "loss": 0.4637, "num_input_tokens_seen": 68748544, "step": 56530 }, { "epoch": 7.083698784613457, "grad_norm": 0.06454236805438995, "learning_rate": 8.157729268770636e-06, "loss": 0.4671, "num_input_tokens_seen": 68754464, "step": 56535 }, { "epoch": 7.08432527252224, "grad_norm": 0.08673051744699478, "learning_rate": 8.157305361592945e-06, "loss": 0.4674, "num_input_tokens_seen": 68760384, "step": 56540 }, { "epoch": 7.084951760431023, "grad_norm": 0.0376567542552948, "learning_rate": 8.156881416667016e-06, "loss": 0.4616, "num_input_tokens_seen": 68766592, "step": 56545 }, { "epoch": 7.085578248339807, "grad_norm": 0.11104703694581985, "learning_rate": 8.156457433997913e-06, "loss": 0.4578, "num_input_tokens_seen": 68772832, "step": 56550 }, { "epoch": 7.0862047362485905, "grad_norm": 0.03578367084264755, "learning_rate": 8.15603341359071e-06, "loss": 0.4565, "num_input_tokens_seen": 68779040, "step": 56555 }, { "epoch": 7.086831224157374, "grad_norm": 0.03708343207836151, "learning_rate": 8.155609355450473e-06, "loss": 0.4638, "num_input_tokens_seen": 68785216, "step": 56560 }, { "epoch": 7.087457712066157, "grad_norm": 0.08165812492370605, "learning_rate": 8.155185259582273e-06, "loss": 0.4606, "num_input_tokens_seen": 68791136, "step": 56565 }, { "epoch": 7.088084199974941, "grad_norm": 0.11714519560337067, "learning_rate": 8.154761125991182e-06, "loss": 0.4616, "num_input_tokens_seen": 68797376, "step": 56570 }, { "epoch": 7.088710687883724, "grad_norm": 0.06474309414625168, "learning_rate": 8.154336954682267e-06, "loss": 0.4625, "num_input_tokens_seen": 68803552, "step": 56575 }, { "epoch": 7.089337175792507, "grad_norm": 0.07040093839168549, "learning_rate": 8.153912745660604e-06, "loss": 0.4594, "num_input_tokens_seen": 68809696, "step": 56580 }, { "epoch": 7.08996366370129, "grad_norm": 0.06599263101816177, "learning_rate": 8.15348849893126e-06, "loss": 0.4581, "num_input_tokens_seen": 68816032, "step": 56585 }, { "epoch": 7.0905901516100736, "grad_norm": 0.11997894197702408, "learning_rate": 8.153064214499311e-06, "loss": 0.4641, "num_input_tokens_seen": 68822368, "step": 56590 }, { "epoch": 7.091216639518858, "grad_norm": 0.061910539865493774, "learning_rate": 8.152639892369827e-06, "loss": 0.4599, "num_input_tokens_seen": 68828608, "step": 56595 }, { "epoch": 7.091843127427641, "grad_norm": 0.06472064554691315, "learning_rate": 8.152215532547885e-06, "loss": 0.4585, "num_input_tokens_seen": 68834784, "step": 56600 }, { "epoch": 7.092469615336424, "grad_norm": 0.10119478404521942, "learning_rate": 8.151791135038555e-06, "loss": 0.4602, "num_input_tokens_seen": 68841216, "step": 56605 }, { "epoch": 7.093096103245207, "grad_norm": 0.06099596619606018, "learning_rate": 8.151366699846912e-06, "loss": 0.4656, "num_input_tokens_seen": 68847584, "step": 56610 }, { "epoch": 7.093722591153991, "grad_norm": 0.0859999880194664, "learning_rate": 8.150942226978031e-06, "loss": 0.4596, "num_input_tokens_seen": 68853888, "step": 56615 }, { "epoch": 7.094349079062774, "grad_norm": 0.05843048915266991, "learning_rate": 8.150517716436985e-06, "loss": 0.456, "num_input_tokens_seen": 68859456, "step": 56620 }, { "epoch": 7.0949755669715575, "grad_norm": 0.16418415307998657, "learning_rate": 8.150093168228854e-06, "loss": 0.4663, "num_input_tokens_seen": 68865824, "step": 56625 }, { "epoch": 7.095602054880341, "grad_norm": 0.10744401067495346, "learning_rate": 8.149668582358707e-06, "loss": 0.4574, "num_input_tokens_seen": 68871712, "step": 56630 }, { "epoch": 7.096228542789124, "grad_norm": 0.10998013615608215, "learning_rate": 8.149243958831627e-06, "loss": 0.4657, "num_input_tokens_seen": 68877792, "step": 56635 }, { "epoch": 7.096855030697908, "grad_norm": 0.09064358472824097, "learning_rate": 8.148819297652686e-06, "loss": 0.466, "num_input_tokens_seen": 68884256, "step": 56640 }, { "epoch": 7.097481518606691, "grad_norm": 0.08726929873228073, "learning_rate": 8.148394598826962e-06, "loss": 0.4673, "num_input_tokens_seen": 68890272, "step": 56645 }, { "epoch": 7.098108006515474, "grad_norm": 0.10826296359300613, "learning_rate": 8.147969862359535e-06, "loss": 0.4631, "num_input_tokens_seen": 68896544, "step": 56650 }, { "epoch": 7.098734494424257, "grad_norm": 0.09006581455469131, "learning_rate": 8.147545088255482e-06, "loss": 0.4609, "num_input_tokens_seen": 68902784, "step": 56655 }, { "epoch": 7.099360982333041, "grad_norm": 0.06573861092329025, "learning_rate": 8.147120276519882e-06, "loss": 0.4664, "num_input_tokens_seen": 68909056, "step": 56660 }, { "epoch": 7.099987470241825, "grad_norm": 0.06852521747350693, "learning_rate": 8.146695427157811e-06, "loss": 0.4686, "num_input_tokens_seen": 68914336, "step": 56665 }, { "epoch": 7.100613958150608, "grad_norm": 0.06307577341794968, "learning_rate": 8.146270540174352e-06, "loss": 0.4525, "num_input_tokens_seen": 68920288, "step": 56670 }, { "epoch": 7.101240446059391, "grad_norm": 0.0990736335515976, "learning_rate": 8.145845615574583e-06, "loss": 0.4555, "num_input_tokens_seen": 68926304, "step": 56675 }, { "epoch": 7.101866933968174, "grad_norm": 0.10121803730726242, "learning_rate": 8.145420653363585e-06, "loss": 0.4638, "num_input_tokens_seen": 68932384, "step": 56680 }, { "epoch": 7.102493421876958, "grad_norm": 0.06495155394077301, "learning_rate": 8.144995653546436e-06, "loss": 0.4603, "num_input_tokens_seen": 68937888, "step": 56685 }, { "epoch": 7.103119909785741, "grad_norm": 0.07929711043834686, "learning_rate": 8.144570616128225e-06, "loss": 0.4625, "num_input_tokens_seen": 68944032, "step": 56690 }, { "epoch": 7.1037463976945245, "grad_norm": 0.11907081305980682, "learning_rate": 8.144145541114024e-06, "loss": 0.4644, "num_input_tokens_seen": 68950080, "step": 56695 }, { "epoch": 7.104372885603308, "grad_norm": 0.09181461483240128, "learning_rate": 8.14372042850892e-06, "loss": 0.4675, "num_input_tokens_seen": 68956256, "step": 56700 }, { "epoch": 7.104999373512091, "grad_norm": 0.07287464290857315, "learning_rate": 8.143295278317998e-06, "loss": 0.4667, "num_input_tokens_seen": 68961888, "step": 56705 }, { "epoch": 7.105625861420875, "grad_norm": 0.09204261004924774, "learning_rate": 8.142870090546336e-06, "loss": 0.4677, "num_input_tokens_seen": 68967648, "step": 56710 }, { "epoch": 7.106252349329658, "grad_norm": 0.07408680766820908, "learning_rate": 8.14244486519902e-06, "loss": 0.4619, "num_input_tokens_seen": 68973472, "step": 56715 }, { "epoch": 7.106878837238441, "grad_norm": 0.07724088430404663, "learning_rate": 8.142019602281134e-06, "loss": 0.465, "num_input_tokens_seen": 68979680, "step": 56720 }, { "epoch": 7.107505325147224, "grad_norm": 0.08601334691047668, "learning_rate": 8.141594301797762e-06, "loss": 0.4584, "num_input_tokens_seen": 68985920, "step": 56725 }, { "epoch": 7.108131813056008, "grad_norm": 0.06101372838020325, "learning_rate": 8.141168963753987e-06, "loss": 0.4634, "num_input_tokens_seen": 68992000, "step": 56730 }, { "epoch": 7.108758300964792, "grad_norm": 0.040695920586586, "learning_rate": 8.140743588154899e-06, "loss": 0.4577, "num_input_tokens_seen": 68998080, "step": 56735 }, { "epoch": 7.109384788873575, "grad_norm": 0.07594273239374161, "learning_rate": 8.140318175005579e-06, "loss": 0.4642, "num_input_tokens_seen": 69004128, "step": 56740 }, { "epoch": 7.110011276782358, "grad_norm": 0.08085483312606812, "learning_rate": 8.139892724311115e-06, "loss": 0.4698, "num_input_tokens_seen": 69010432, "step": 56745 }, { "epoch": 7.110637764691141, "grad_norm": 0.06794197857379913, "learning_rate": 8.139467236076595e-06, "loss": 0.4612, "num_input_tokens_seen": 69016544, "step": 56750 }, { "epoch": 7.111264252599925, "grad_norm": 0.06483030319213867, "learning_rate": 8.139041710307103e-06, "loss": 0.462, "num_input_tokens_seen": 69022464, "step": 56755 }, { "epoch": 7.111890740508708, "grad_norm": 0.0965818539261818, "learning_rate": 8.138616147007729e-06, "loss": 0.4625, "num_input_tokens_seen": 69028672, "step": 56760 }, { "epoch": 7.112517228417492, "grad_norm": 0.06687668710947037, "learning_rate": 8.13819054618356e-06, "loss": 0.4599, "num_input_tokens_seen": 69035072, "step": 56765 }, { "epoch": 7.113143716326275, "grad_norm": 0.08733808994293213, "learning_rate": 8.137764907839684e-06, "loss": 0.4635, "num_input_tokens_seen": 69041504, "step": 56770 }, { "epoch": 7.113770204235058, "grad_norm": 0.05843956768512726, "learning_rate": 8.137339231981192e-06, "loss": 0.4581, "num_input_tokens_seen": 69047840, "step": 56775 }, { "epoch": 7.114396692143842, "grad_norm": 0.0403142087161541, "learning_rate": 8.136913518613171e-06, "loss": 0.4606, "num_input_tokens_seen": 69054112, "step": 56780 }, { "epoch": 7.115023180052625, "grad_norm": 0.07620545476675034, "learning_rate": 8.136487767740711e-06, "loss": 0.4646, "num_input_tokens_seen": 69059744, "step": 56785 }, { "epoch": 7.115649667961408, "grad_norm": 0.09679929167032242, "learning_rate": 8.136061979368902e-06, "loss": 0.4573, "num_input_tokens_seen": 69066144, "step": 56790 }, { "epoch": 7.1162761558701915, "grad_norm": 0.07197173684835434, "learning_rate": 8.135636153502838e-06, "loss": 0.4613, "num_input_tokens_seen": 69072032, "step": 56795 }, { "epoch": 7.116902643778975, "grad_norm": 0.06791813671588898, "learning_rate": 8.135210290147606e-06, "loss": 0.46, "num_input_tokens_seen": 69077792, "step": 56800 }, { "epoch": 7.117529131687759, "grad_norm": 0.11689068377017975, "learning_rate": 8.1347843893083e-06, "loss": 0.4656, "num_input_tokens_seen": 69083968, "step": 56805 }, { "epoch": 7.118155619596542, "grad_norm": 0.06806205213069916, "learning_rate": 8.134358450990009e-06, "loss": 0.4599, "num_input_tokens_seen": 69090016, "step": 56810 }, { "epoch": 7.118782107505325, "grad_norm": 0.08576460182666779, "learning_rate": 8.133932475197829e-06, "loss": 0.4608, "num_input_tokens_seen": 69096096, "step": 56815 }, { "epoch": 7.119408595414108, "grad_norm": 0.06593279540538788, "learning_rate": 8.13350646193685e-06, "loss": 0.461, "num_input_tokens_seen": 69102080, "step": 56820 }, { "epoch": 7.120035083322892, "grad_norm": 0.03901505842804909, "learning_rate": 8.133080411212169e-06, "loss": 0.4683, "num_input_tokens_seen": 69108160, "step": 56825 }, { "epoch": 7.120661571231675, "grad_norm": 0.07037603855133057, "learning_rate": 8.132654323028874e-06, "loss": 0.4664, "num_input_tokens_seen": 69114432, "step": 56830 }, { "epoch": 7.121288059140459, "grad_norm": 0.09514430910348892, "learning_rate": 8.132228197392067e-06, "loss": 0.4622, "num_input_tokens_seen": 69120352, "step": 56835 }, { "epoch": 7.121914547049242, "grad_norm": 0.11405961215496063, "learning_rate": 8.131802034306834e-06, "loss": 0.4624, "num_input_tokens_seen": 69126784, "step": 56840 }, { "epoch": 7.122541034958025, "grad_norm": 0.1621071696281433, "learning_rate": 8.131375833778279e-06, "loss": 0.4601, "num_input_tokens_seen": 69132768, "step": 56845 }, { "epoch": 7.123167522866809, "grad_norm": 0.06316084414720535, "learning_rate": 8.130949595811492e-06, "loss": 0.4674, "num_input_tokens_seen": 69138880, "step": 56850 }, { "epoch": 7.123794010775592, "grad_norm": 0.07397457212209702, "learning_rate": 8.130523320411568e-06, "loss": 0.4625, "num_input_tokens_seen": 69145120, "step": 56855 }, { "epoch": 7.124420498684375, "grad_norm": 0.07530388236045837, "learning_rate": 8.130097007583608e-06, "loss": 0.4638, "num_input_tokens_seen": 69151264, "step": 56860 }, { "epoch": 7.1250469865931585, "grad_norm": 0.05687630549073219, "learning_rate": 8.129670657332706e-06, "loss": 0.464, "num_input_tokens_seen": 69157280, "step": 56865 }, { "epoch": 7.1256734745019426, "grad_norm": 0.07041706144809723, "learning_rate": 8.129244269663959e-06, "loss": 0.4593, "num_input_tokens_seen": 69163264, "step": 56870 }, { "epoch": 7.126299962410726, "grad_norm": 0.06857264041900635, "learning_rate": 8.128817844582468e-06, "loss": 0.4625, "num_input_tokens_seen": 69169632, "step": 56875 }, { "epoch": 7.126926450319509, "grad_norm": 0.11722562462091446, "learning_rate": 8.128391382093327e-06, "loss": 0.4627, "num_input_tokens_seen": 69176064, "step": 56880 }, { "epoch": 7.127552938228292, "grad_norm": 0.07409884035587311, "learning_rate": 8.127964882201637e-06, "loss": 0.4606, "num_input_tokens_seen": 69182240, "step": 56885 }, { "epoch": 7.128179426137075, "grad_norm": 0.0646740198135376, "learning_rate": 8.127538344912497e-06, "loss": 0.4584, "num_input_tokens_seen": 69188384, "step": 56890 }, { "epoch": 7.128805914045859, "grad_norm": 0.0831611305475235, "learning_rate": 8.127111770231009e-06, "loss": 0.4568, "num_input_tokens_seen": 69194880, "step": 56895 }, { "epoch": 7.1294324019546425, "grad_norm": 0.08069785684347153, "learning_rate": 8.126685158162268e-06, "loss": 0.4614, "num_input_tokens_seen": 69200320, "step": 56900 }, { "epoch": 7.130058889863426, "grad_norm": 0.08437969535589218, "learning_rate": 8.126258508711378e-06, "loss": 0.4667, "num_input_tokens_seen": 69205568, "step": 56905 }, { "epoch": 7.130685377772209, "grad_norm": 0.11195997893810272, "learning_rate": 8.125831821883439e-06, "loss": 0.4582, "num_input_tokens_seen": 69211456, "step": 56910 }, { "epoch": 7.131311865680992, "grad_norm": 0.09827058017253876, "learning_rate": 8.125405097683552e-06, "loss": 0.4696, "num_input_tokens_seen": 69217504, "step": 56915 }, { "epoch": 7.131938353589776, "grad_norm": 0.07708866894245148, "learning_rate": 8.12497833611682e-06, "loss": 0.4626, "num_input_tokens_seen": 69223776, "step": 56920 }, { "epoch": 7.132564841498559, "grad_norm": 0.0871870219707489, "learning_rate": 8.124551537188345e-06, "loss": 0.4591, "num_input_tokens_seen": 69230048, "step": 56925 }, { "epoch": 7.133191329407342, "grad_norm": 0.0696401298046112, "learning_rate": 8.12412470090323e-06, "loss": 0.463, "num_input_tokens_seen": 69236128, "step": 56930 }, { "epoch": 7.1338178173161255, "grad_norm": 0.1317579746246338, "learning_rate": 8.123697827266576e-06, "loss": 0.4672, "num_input_tokens_seen": 69242368, "step": 56935 }, { "epoch": 7.13444430522491, "grad_norm": 0.06809122860431671, "learning_rate": 8.123270916283488e-06, "loss": 0.4617, "num_input_tokens_seen": 69248640, "step": 56940 }, { "epoch": 7.135070793133693, "grad_norm": 0.08691399544477463, "learning_rate": 8.122843967959073e-06, "loss": 0.4593, "num_input_tokens_seen": 69254848, "step": 56945 }, { "epoch": 7.135697281042476, "grad_norm": 0.08767993003129959, "learning_rate": 8.12241698229843e-06, "loss": 0.4546, "num_input_tokens_seen": 69260960, "step": 56950 }, { "epoch": 7.136323768951259, "grad_norm": 0.07016407698392868, "learning_rate": 8.121989959306667e-06, "loss": 0.4577, "num_input_tokens_seen": 69267072, "step": 56955 }, { "epoch": 7.136950256860042, "grad_norm": 0.08884210884571075, "learning_rate": 8.12156289898889e-06, "loss": 0.4673, "num_input_tokens_seen": 69273152, "step": 56960 }, { "epoch": 7.137576744768826, "grad_norm": 0.07924918830394745, "learning_rate": 8.121135801350204e-06, "loss": 0.4636, "num_input_tokens_seen": 69278816, "step": 56965 }, { "epoch": 7.1382032326776095, "grad_norm": 0.07709405571222305, "learning_rate": 8.120708666395716e-06, "loss": 0.4621, "num_input_tokens_seen": 69284960, "step": 56970 }, { "epoch": 7.138829720586393, "grad_norm": 0.06339509040117264, "learning_rate": 8.120281494130531e-06, "loss": 0.4621, "num_input_tokens_seen": 69290976, "step": 56975 }, { "epoch": 7.139456208495176, "grad_norm": 0.054536234587430954, "learning_rate": 8.119854284559759e-06, "loss": 0.4488, "num_input_tokens_seen": 69296864, "step": 56980 }, { "epoch": 7.140082696403959, "grad_norm": 0.10996484756469727, "learning_rate": 8.119427037688505e-06, "loss": 0.4641, "num_input_tokens_seen": 69302944, "step": 56985 }, { "epoch": 7.140709184312743, "grad_norm": 0.11678363382816315, "learning_rate": 8.118999753521877e-06, "loss": 0.4657, "num_input_tokens_seen": 69308224, "step": 56990 }, { "epoch": 7.141335672221526, "grad_norm": 0.10522713512182236, "learning_rate": 8.118572432064987e-06, "loss": 0.4717, "num_input_tokens_seen": 69314112, "step": 56995 }, { "epoch": 7.141962160130309, "grad_norm": 0.06455371528863907, "learning_rate": 8.118145073322939e-06, "loss": 0.4614, "num_input_tokens_seen": 69319808, "step": 57000 }, { "epoch": 7.1425886480390925, "grad_norm": 0.09428829699754715, "learning_rate": 8.117717677300846e-06, "loss": 0.4653, "num_input_tokens_seen": 69325984, "step": 57005 }, { "epoch": 7.143215135947877, "grad_norm": 0.07823483645915985, "learning_rate": 8.117290244003818e-06, "loss": 0.4612, "num_input_tokens_seen": 69332128, "step": 57010 }, { "epoch": 7.14384162385666, "grad_norm": 0.07090160995721817, "learning_rate": 8.116862773436963e-06, "loss": 0.4638, "num_input_tokens_seen": 69338240, "step": 57015 }, { "epoch": 7.144468111765443, "grad_norm": 0.06725963950157166, "learning_rate": 8.116435265605393e-06, "loss": 0.4604, "num_input_tokens_seen": 69344512, "step": 57020 }, { "epoch": 7.145094599674226, "grad_norm": 0.07388745248317719, "learning_rate": 8.11600772051422e-06, "loss": 0.4629, "num_input_tokens_seen": 69350656, "step": 57025 }, { "epoch": 7.145721087583009, "grad_norm": 0.16313986480236053, "learning_rate": 8.115580138168556e-06, "loss": 0.4599, "num_input_tokens_seen": 69356800, "step": 57030 }, { "epoch": 7.146347575491793, "grad_norm": 0.06946732103824615, "learning_rate": 8.11515251857351e-06, "loss": 0.4662, "num_input_tokens_seen": 69363040, "step": 57035 }, { "epoch": 7.1469740634005765, "grad_norm": 0.08937142789363861, "learning_rate": 8.114724861734196e-06, "loss": 0.4611, "num_input_tokens_seen": 69368800, "step": 57040 }, { "epoch": 7.14760055130936, "grad_norm": 0.06542287021875381, "learning_rate": 8.11429716765573e-06, "loss": 0.4634, "num_input_tokens_seen": 69374912, "step": 57045 }, { "epoch": 7.148227039218143, "grad_norm": 0.07696332037448883, "learning_rate": 8.113869436343223e-06, "loss": 0.4617, "num_input_tokens_seen": 69380896, "step": 57050 }, { "epoch": 7.148853527126926, "grad_norm": 0.07351583987474442, "learning_rate": 8.113441667801787e-06, "loss": 0.4659, "num_input_tokens_seen": 69387008, "step": 57055 }, { "epoch": 7.14948001503571, "grad_norm": 0.06503701210021973, "learning_rate": 8.113013862036539e-06, "loss": 0.461, "num_input_tokens_seen": 69393056, "step": 57060 }, { "epoch": 7.150106502944493, "grad_norm": 0.11580633372068405, "learning_rate": 8.112586019052593e-06, "loss": 0.4618, "num_input_tokens_seen": 69399296, "step": 57065 }, { "epoch": 7.150732990853276, "grad_norm": 0.07823502272367477, "learning_rate": 8.112158138855065e-06, "loss": 0.4584, "num_input_tokens_seen": 69405152, "step": 57070 }, { "epoch": 7.15135947876206, "grad_norm": 0.0701020136475563, "learning_rate": 8.111730221449067e-06, "loss": 0.4597, "num_input_tokens_seen": 69410912, "step": 57075 }, { "epoch": 7.151985966670844, "grad_norm": 0.06760106980800629, "learning_rate": 8.11130226683972e-06, "loss": 0.468, "num_input_tokens_seen": 69416928, "step": 57080 }, { "epoch": 7.152612454579627, "grad_norm": 0.07581421732902527, "learning_rate": 8.11087427503214e-06, "loss": 0.4603, "num_input_tokens_seen": 69423328, "step": 57085 }, { "epoch": 7.15323894248841, "grad_norm": 0.11870431900024414, "learning_rate": 8.11044624603144e-06, "loss": 0.459, "num_input_tokens_seen": 69429568, "step": 57090 }, { "epoch": 7.153865430397193, "grad_norm": 0.03934496268630028, "learning_rate": 8.110018179842741e-06, "loss": 0.4571, "num_input_tokens_seen": 69435424, "step": 57095 }, { "epoch": 7.154491918305976, "grad_norm": 0.09796923398971558, "learning_rate": 8.109590076471161e-06, "loss": 0.4662, "num_input_tokens_seen": 69441504, "step": 57100 }, { "epoch": 7.15511840621476, "grad_norm": 0.06533315032720566, "learning_rate": 8.109161935921817e-06, "loss": 0.4615, "num_input_tokens_seen": 69447680, "step": 57105 }, { "epoch": 7.1557448941235435, "grad_norm": 0.11817418783903122, "learning_rate": 8.108733758199828e-06, "loss": 0.4591, "num_input_tokens_seen": 69453760, "step": 57110 }, { "epoch": 7.156371382032327, "grad_norm": 0.08817606419324875, "learning_rate": 8.10830554331031e-06, "loss": 0.457, "num_input_tokens_seen": 69460000, "step": 57115 }, { "epoch": 7.15699786994111, "grad_norm": 0.12438789755105972, "learning_rate": 8.10787729125839e-06, "loss": 0.4711, "num_input_tokens_seen": 69466528, "step": 57120 }, { "epoch": 7.157624357849894, "grad_norm": 0.07553013414144516, "learning_rate": 8.107449002049183e-06, "loss": 0.4556, "num_input_tokens_seen": 69472736, "step": 57125 }, { "epoch": 7.158250845758677, "grad_norm": 0.09073299169540405, "learning_rate": 8.10702067568781e-06, "loss": 0.4577, "num_input_tokens_seen": 69478624, "step": 57130 }, { "epoch": 7.15887733366746, "grad_norm": 0.06754753738641739, "learning_rate": 8.106592312179393e-06, "loss": 0.4657, "num_input_tokens_seen": 69485056, "step": 57135 }, { "epoch": 7.159503821576243, "grad_norm": 0.07872515171766281, "learning_rate": 8.10616391152905e-06, "loss": 0.4635, "num_input_tokens_seen": 69491072, "step": 57140 }, { "epoch": 7.160130309485027, "grad_norm": 0.08717663586139679, "learning_rate": 8.105735473741909e-06, "loss": 0.4631, "num_input_tokens_seen": 69497216, "step": 57145 }, { "epoch": 7.160756797393811, "grad_norm": 0.10956955701112747, "learning_rate": 8.105306998823091e-06, "loss": 0.4638, "num_input_tokens_seen": 69503488, "step": 57150 }, { "epoch": 7.161383285302594, "grad_norm": 0.08807310461997986, "learning_rate": 8.104878486777715e-06, "loss": 0.4502, "num_input_tokens_seen": 69509664, "step": 57155 }, { "epoch": 7.162009773211377, "grad_norm": 0.10272731631994247, "learning_rate": 8.104449937610906e-06, "loss": 0.4711, "num_input_tokens_seen": 69515584, "step": 57160 }, { "epoch": 7.16263626112016, "grad_norm": 0.11541685461997986, "learning_rate": 8.104021351327787e-06, "loss": 0.4583, "num_input_tokens_seen": 69521440, "step": 57165 }, { "epoch": 7.163262749028943, "grad_norm": 0.06683450937271118, "learning_rate": 8.103592727933486e-06, "loss": 0.4539, "num_input_tokens_seen": 69527392, "step": 57170 }, { "epoch": 7.163889236937727, "grad_norm": 0.06820819526910782, "learning_rate": 8.103164067433122e-06, "loss": 0.4551, "num_input_tokens_seen": 69533760, "step": 57175 }, { "epoch": 7.164515724846511, "grad_norm": 0.07564491033554077, "learning_rate": 8.102735369831822e-06, "loss": 0.4631, "num_input_tokens_seen": 69539776, "step": 57180 }, { "epoch": 7.165142212755294, "grad_norm": 0.10451136529445648, "learning_rate": 8.102306635134713e-06, "loss": 0.4629, "num_input_tokens_seen": 69545984, "step": 57185 }, { "epoch": 7.165768700664077, "grad_norm": 0.08975713700056076, "learning_rate": 8.10187786334692e-06, "loss": 0.4535, "num_input_tokens_seen": 69552448, "step": 57190 }, { "epoch": 7.166395188572861, "grad_norm": 0.10712642967700958, "learning_rate": 8.10144905447357e-06, "loss": 0.4574, "num_input_tokens_seen": 69558048, "step": 57195 }, { "epoch": 7.167021676481644, "grad_norm": 0.07007886469364166, "learning_rate": 8.10102020851979e-06, "loss": 0.4683, "num_input_tokens_seen": 69564448, "step": 57200 }, { "epoch": 7.167648164390427, "grad_norm": 0.047615520656108856, "learning_rate": 8.100591325490703e-06, "loss": 0.4693, "num_input_tokens_seen": 69570720, "step": 57205 }, { "epoch": 7.1682746522992105, "grad_norm": 0.10838381201028824, "learning_rate": 8.100162405391443e-06, "loss": 0.4614, "num_input_tokens_seen": 69576672, "step": 57210 }, { "epoch": 7.168901140207994, "grad_norm": 0.04007718712091446, "learning_rate": 8.099733448227132e-06, "loss": 0.4651, "num_input_tokens_seen": 69582976, "step": 57215 }, { "epoch": 7.169527628116778, "grad_norm": 0.06809812784194946, "learning_rate": 8.099304454002904e-06, "loss": 0.4662, "num_input_tokens_seen": 69588960, "step": 57220 }, { "epoch": 7.170154116025561, "grad_norm": 0.08703078329563141, "learning_rate": 8.098875422723884e-06, "loss": 0.4692, "num_input_tokens_seen": 69595456, "step": 57225 }, { "epoch": 7.170780603934344, "grad_norm": 0.07141821831464767, "learning_rate": 8.098446354395203e-06, "loss": 0.4552, "num_input_tokens_seen": 69601856, "step": 57230 }, { "epoch": 7.171407091843127, "grad_norm": 0.10854943841695786, "learning_rate": 8.098017249021992e-06, "loss": 0.4699, "num_input_tokens_seen": 69607392, "step": 57235 }, { "epoch": 7.17203357975191, "grad_norm": 0.10284275561571121, "learning_rate": 8.097588106609378e-06, "loss": 0.4685, "num_input_tokens_seen": 69613376, "step": 57240 }, { "epoch": 7.172660067660694, "grad_norm": 0.0686984658241272, "learning_rate": 8.097158927162496e-06, "loss": 0.4697, "num_input_tokens_seen": 69619328, "step": 57245 }, { "epoch": 7.173286555569478, "grad_norm": 0.08151903003454208, "learning_rate": 8.096729710686475e-06, "loss": 0.4681, "num_input_tokens_seen": 69625184, "step": 57250 }, { "epoch": 7.173913043478261, "grad_norm": 0.08004827797412872, "learning_rate": 8.096300457186446e-06, "loss": 0.4658, "num_input_tokens_seen": 69631104, "step": 57255 }, { "epoch": 7.174539531387044, "grad_norm": 0.09951192140579224, "learning_rate": 8.095871166667541e-06, "loss": 0.4664, "num_input_tokens_seen": 69637248, "step": 57260 }, { "epoch": 7.175166019295828, "grad_norm": 0.06890472769737244, "learning_rate": 8.095441839134895e-06, "loss": 0.4585, "num_input_tokens_seen": 69643008, "step": 57265 }, { "epoch": 7.175792507204611, "grad_norm": 0.04026220366358757, "learning_rate": 8.095012474593637e-06, "loss": 0.4607, "num_input_tokens_seen": 69649472, "step": 57270 }, { "epoch": 7.176418995113394, "grad_norm": 0.06754341721534729, "learning_rate": 8.094583073048905e-06, "loss": 0.4564, "num_input_tokens_seen": 69655680, "step": 57275 }, { "epoch": 7.1770454830221775, "grad_norm": 0.06083774194121361, "learning_rate": 8.09415363450583e-06, "loss": 0.4611, "num_input_tokens_seen": 69661760, "step": 57280 }, { "epoch": 7.177671970930961, "grad_norm": 0.09504487365484238, "learning_rate": 8.093724158969547e-06, "loss": 0.4617, "num_input_tokens_seen": 69668192, "step": 57285 }, { "epoch": 7.178298458839745, "grad_norm": 0.1103467270731926, "learning_rate": 8.093294646445189e-06, "loss": 0.4685, "num_input_tokens_seen": 69674208, "step": 57290 }, { "epoch": 7.178924946748528, "grad_norm": 0.06428050249814987, "learning_rate": 8.092865096937894e-06, "loss": 0.468, "num_input_tokens_seen": 69680384, "step": 57295 }, { "epoch": 7.179551434657311, "grad_norm": 0.1098359152674675, "learning_rate": 8.092435510452798e-06, "loss": 0.4653, "num_input_tokens_seen": 69685920, "step": 57300 }, { "epoch": 7.180177922566094, "grad_norm": 0.06279724091291428, "learning_rate": 8.092005886995033e-06, "loss": 0.4556, "num_input_tokens_seen": 69692288, "step": 57305 }, { "epoch": 7.180804410474877, "grad_norm": 0.07238451391458511, "learning_rate": 8.09157622656974e-06, "loss": 0.4663, "num_input_tokens_seen": 69698560, "step": 57310 }, { "epoch": 7.181430898383661, "grad_norm": 0.06511512398719788, "learning_rate": 8.091146529182052e-06, "loss": 0.4655, "num_input_tokens_seen": 69704192, "step": 57315 }, { "epoch": 7.182057386292445, "grad_norm": 0.11357861757278442, "learning_rate": 8.09071679483711e-06, "loss": 0.47, "num_input_tokens_seen": 69710336, "step": 57320 }, { "epoch": 7.182683874201228, "grad_norm": 0.12603184580802917, "learning_rate": 8.090287023540049e-06, "loss": 0.4666, "num_input_tokens_seen": 69716416, "step": 57325 }, { "epoch": 7.183310362110011, "grad_norm": 0.09266234934329987, "learning_rate": 8.089857215296008e-06, "loss": 0.4612, "num_input_tokens_seen": 69722752, "step": 57330 }, { "epoch": 7.183936850018795, "grad_norm": 0.10286349803209305, "learning_rate": 8.089427370110128e-06, "loss": 0.4618, "num_input_tokens_seen": 69728640, "step": 57335 }, { "epoch": 7.184563337927578, "grad_norm": 0.06250830739736557, "learning_rate": 8.088997487987546e-06, "loss": 0.4591, "num_input_tokens_seen": 69734816, "step": 57340 }, { "epoch": 7.185189825836361, "grad_norm": 0.06777936220169067, "learning_rate": 8.088567568933402e-06, "loss": 0.4629, "num_input_tokens_seen": 69740928, "step": 57345 }, { "epoch": 7.1858163137451445, "grad_norm": 0.060596778988838196, "learning_rate": 8.088137612952835e-06, "loss": 0.4678, "num_input_tokens_seen": 69747072, "step": 57350 }, { "epoch": 7.186442801653928, "grad_norm": 0.09413155913352966, "learning_rate": 8.087707620050987e-06, "loss": 0.4613, "num_input_tokens_seen": 69753088, "step": 57355 }, { "epoch": 7.187069289562712, "grad_norm": 0.06508571654558182, "learning_rate": 8.087277590232998e-06, "loss": 0.4618, "num_input_tokens_seen": 69759232, "step": 57360 }, { "epoch": 7.187695777471495, "grad_norm": 0.11288796365261078, "learning_rate": 8.086847523504009e-06, "loss": 0.4625, "num_input_tokens_seen": 69765184, "step": 57365 }, { "epoch": 7.188322265380278, "grad_norm": 0.0768304169178009, "learning_rate": 8.086417419869164e-06, "loss": 0.4647, "num_input_tokens_seen": 69771200, "step": 57370 }, { "epoch": 7.188948753289061, "grad_norm": 0.10146953910589218, "learning_rate": 8.085987279333603e-06, "loss": 0.4566, "num_input_tokens_seen": 69777408, "step": 57375 }, { "epoch": 7.189575241197845, "grad_norm": 0.06906305998563766, "learning_rate": 8.08555710190247e-06, "loss": 0.4626, "num_input_tokens_seen": 69783616, "step": 57380 }, { "epoch": 7.1902017291066285, "grad_norm": 0.08865591138601303, "learning_rate": 8.085126887580908e-06, "loss": 0.4628, "num_input_tokens_seen": 69789536, "step": 57385 }, { "epoch": 7.190828217015412, "grad_norm": 0.06744580715894699, "learning_rate": 8.084696636374059e-06, "loss": 0.4588, "num_input_tokens_seen": 69795776, "step": 57390 }, { "epoch": 7.191454704924195, "grad_norm": 0.07872328907251358, "learning_rate": 8.08426634828707e-06, "loss": 0.4592, "num_input_tokens_seen": 69802304, "step": 57395 }, { "epoch": 7.192081192832978, "grad_norm": 0.06457899510860443, "learning_rate": 8.083836023325082e-06, "loss": 0.4656, "num_input_tokens_seen": 69808320, "step": 57400 }, { "epoch": 7.192707680741762, "grad_norm": 0.06375811994075775, "learning_rate": 8.083405661493243e-06, "loss": 0.4659, "num_input_tokens_seen": 69813984, "step": 57405 }, { "epoch": 7.193334168650545, "grad_norm": 0.07521749287843704, "learning_rate": 8.082975262796697e-06, "loss": 0.4624, "num_input_tokens_seen": 69820096, "step": 57410 }, { "epoch": 7.193960656559328, "grad_norm": 0.07296236604452133, "learning_rate": 8.082544827240589e-06, "loss": 0.4606, "num_input_tokens_seen": 69826208, "step": 57415 }, { "epoch": 7.1945871444681115, "grad_norm": 0.07061274349689484, "learning_rate": 8.082114354830065e-06, "loss": 0.462, "num_input_tokens_seen": 69832480, "step": 57420 }, { "epoch": 7.195213632376895, "grad_norm": 0.09278153628110886, "learning_rate": 8.081683845570273e-06, "loss": 0.4583, "num_input_tokens_seen": 69838720, "step": 57425 }, { "epoch": 7.195840120285679, "grad_norm": 0.07783670723438263, "learning_rate": 8.081253299466362e-06, "loss": 0.4646, "num_input_tokens_seen": 69844704, "step": 57430 }, { "epoch": 7.196466608194462, "grad_norm": 0.10130347311496735, "learning_rate": 8.080822716523475e-06, "loss": 0.4659, "num_input_tokens_seen": 69850848, "step": 57435 }, { "epoch": 7.197093096103245, "grad_norm": 0.09153502434492111, "learning_rate": 8.080392096746763e-06, "loss": 0.4627, "num_input_tokens_seen": 69857280, "step": 57440 }, { "epoch": 7.197719584012028, "grad_norm": 0.0838329941034317, "learning_rate": 8.079961440141373e-06, "loss": 0.47, "num_input_tokens_seen": 69863680, "step": 57445 }, { "epoch": 7.198346071920812, "grad_norm": 0.07957049459218979, "learning_rate": 8.079530746712454e-06, "loss": 0.4623, "num_input_tokens_seen": 69869728, "step": 57450 }, { "epoch": 7.1989725598295955, "grad_norm": 0.03998342901468277, "learning_rate": 8.079100016465159e-06, "loss": 0.4613, "num_input_tokens_seen": 69875648, "step": 57455 }, { "epoch": 7.199599047738379, "grad_norm": 0.06398388743400574, "learning_rate": 8.078669249404631e-06, "loss": 0.4618, "num_input_tokens_seen": 69881600, "step": 57460 }, { "epoch": 7.200225535647162, "grad_norm": 0.05921261012554169, "learning_rate": 8.078238445536024e-06, "loss": 0.457, "num_input_tokens_seen": 69887744, "step": 57465 }, { "epoch": 7.200852023555945, "grad_norm": 0.04218222200870514, "learning_rate": 8.07780760486449e-06, "loss": 0.4585, "num_input_tokens_seen": 69893728, "step": 57470 }, { "epoch": 7.201478511464729, "grad_norm": 0.09727220982313156, "learning_rate": 8.077376727395177e-06, "loss": 0.4627, "num_input_tokens_seen": 69899712, "step": 57475 }, { "epoch": 7.202104999373512, "grad_norm": 0.08299317210912704, "learning_rate": 8.076945813133239e-06, "loss": 0.4623, "num_input_tokens_seen": 69905856, "step": 57480 }, { "epoch": 7.202731487282295, "grad_norm": 0.10535752028226852, "learning_rate": 8.076514862083826e-06, "loss": 0.4591, "num_input_tokens_seen": 69912160, "step": 57485 }, { "epoch": 7.203357975191079, "grad_norm": 0.06908445805311203, "learning_rate": 8.076083874252092e-06, "loss": 0.4605, "num_input_tokens_seen": 69918560, "step": 57490 }, { "epoch": 7.203984463099863, "grad_norm": 0.06667327880859375, "learning_rate": 8.075652849643188e-06, "loss": 0.4586, "num_input_tokens_seen": 69924704, "step": 57495 }, { "epoch": 7.204610951008646, "grad_norm": 0.09971781075000763, "learning_rate": 8.07522178826227e-06, "loss": 0.4626, "num_input_tokens_seen": 69930912, "step": 57500 }, { "epoch": 7.205237438917429, "grad_norm": 0.08882435411214828, "learning_rate": 8.074790690114488e-06, "loss": 0.4654, "num_input_tokens_seen": 69937088, "step": 57505 }, { "epoch": 7.205863926826212, "grad_norm": 0.09274056553840637, "learning_rate": 8.074359555204999e-06, "loss": 0.4631, "num_input_tokens_seen": 69943200, "step": 57510 }, { "epoch": 7.206490414734995, "grad_norm": 0.0725630447268486, "learning_rate": 8.073928383538959e-06, "loss": 0.4616, "num_input_tokens_seen": 69948640, "step": 57515 }, { "epoch": 7.207116902643779, "grad_norm": 0.09289690107107162, "learning_rate": 8.073497175121517e-06, "loss": 0.4646, "num_input_tokens_seen": 69954944, "step": 57520 }, { "epoch": 7.2077433905525625, "grad_norm": 0.08444351702928543, "learning_rate": 8.073065929957834e-06, "loss": 0.473, "num_input_tokens_seen": 69960960, "step": 57525 }, { "epoch": 7.208369878461346, "grad_norm": 0.04400038346648216, "learning_rate": 8.072634648053064e-06, "loss": 0.461, "num_input_tokens_seen": 69967200, "step": 57530 }, { "epoch": 7.208996366370129, "grad_norm": 0.0745113417506218, "learning_rate": 8.072203329412364e-06, "loss": 0.4687, "num_input_tokens_seen": 69973472, "step": 57535 }, { "epoch": 7.209622854278912, "grad_norm": 0.0712759867310524, "learning_rate": 8.07177197404089e-06, "loss": 0.4613, "num_input_tokens_seen": 69979616, "step": 57540 }, { "epoch": 7.210249342187696, "grad_norm": 0.0669657438993454, "learning_rate": 8.071340581943798e-06, "loss": 0.4578, "num_input_tokens_seen": 69985856, "step": 57545 }, { "epoch": 7.210875830096479, "grad_norm": 0.0667605921626091, "learning_rate": 8.070909153126249e-06, "loss": 0.4577, "num_input_tokens_seen": 69991968, "step": 57550 }, { "epoch": 7.211502318005262, "grad_norm": 0.061229076236486435, "learning_rate": 8.070477687593398e-06, "loss": 0.4609, "num_input_tokens_seen": 69998144, "step": 57555 }, { "epoch": 7.212128805914046, "grad_norm": 0.0725879818201065, "learning_rate": 8.070046185350406e-06, "loss": 0.4599, "num_input_tokens_seen": 70004352, "step": 57560 }, { "epoch": 7.212755293822829, "grad_norm": 0.0687621608376503, "learning_rate": 8.06961464640243e-06, "loss": 0.4651, "num_input_tokens_seen": 70010496, "step": 57565 }, { "epoch": 7.213381781731613, "grad_norm": 0.09800431877374649, "learning_rate": 8.06918307075463e-06, "loss": 0.4696, "num_input_tokens_seen": 70016800, "step": 57570 }, { "epoch": 7.214008269640396, "grad_norm": 0.08999749273061752, "learning_rate": 8.068751458412165e-06, "loss": 0.4663, "num_input_tokens_seen": 70022624, "step": 57575 }, { "epoch": 7.214634757549179, "grad_norm": 0.07000403851270676, "learning_rate": 8.068319809380195e-06, "loss": 0.4635, "num_input_tokens_seen": 70028576, "step": 57580 }, { "epoch": 7.215261245457962, "grad_norm": 0.06723470240831375, "learning_rate": 8.067888123663886e-06, "loss": 0.4647, "num_input_tokens_seen": 70034912, "step": 57585 }, { "epoch": 7.215887733366746, "grad_norm": 0.0866965651512146, "learning_rate": 8.067456401268393e-06, "loss": 0.4571, "num_input_tokens_seen": 70041184, "step": 57590 }, { "epoch": 7.2165142212755296, "grad_norm": 0.0646439865231514, "learning_rate": 8.067024642198881e-06, "loss": 0.4645, "num_input_tokens_seen": 70047264, "step": 57595 }, { "epoch": 7.217140709184313, "grad_norm": 0.11017195880413055, "learning_rate": 8.066592846460509e-06, "loss": 0.4651, "num_input_tokens_seen": 70052896, "step": 57600 }, { "epoch": 7.217767197093096, "grad_norm": 0.059840891510248184, "learning_rate": 8.066161014058442e-06, "loss": 0.4565, "num_input_tokens_seen": 70059104, "step": 57605 }, { "epoch": 7.218393685001879, "grad_norm": 0.08123541623353958, "learning_rate": 8.065729144997843e-06, "loss": 0.4644, "num_input_tokens_seen": 70065504, "step": 57610 }, { "epoch": 7.219020172910663, "grad_norm": 0.07568429410457611, "learning_rate": 8.065297239283874e-06, "loss": 0.461, "num_input_tokens_seen": 70071808, "step": 57615 }, { "epoch": 7.219646660819446, "grad_norm": 0.07805605232715607, "learning_rate": 8.0648652969217e-06, "loss": 0.4652, "num_input_tokens_seen": 70078016, "step": 57620 }, { "epoch": 7.2202731487282295, "grad_norm": 0.0963326022028923, "learning_rate": 8.064433317916482e-06, "loss": 0.459, "num_input_tokens_seen": 70084096, "step": 57625 }, { "epoch": 7.220899636637013, "grad_norm": 0.06471146643161774, "learning_rate": 8.06400130227339e-06, "loss": 0.4624, "num_input_tokens_seen": 70090176, "step": 57630 }, { "epoch": 7.221526124545797, "grad_norm": 0.07125800102949142, "learning_rate": 8.063569249997585e-06, "loss": 0.4621, "num_input_tokens_seen": 70096416, "step": 57635 }, { "epoch": 7.22215261245458, "grad_norm": 0.07549590617418289, "learning_rate": 8.063137161094234e-06, "loss": 0.4602, "num_input_tokens_seen": 70102784, "step": 57640 }, { "epoch": 7.222779100363363, "grad_norm": 0.06293901801109314, "learning_rate": 8.062705035568503e-06, "loss": 0.4648, "num_input_tokens_seen": 70109216, "step": 57645 }, { "epoch": 7.223405588272146, "grad_norm": 0.06318367272615433, "learning_rate": 8.062272873425558e-06, "loss": 0.4585, "num_input_tokens_seen": 70115072, "step": 57650 }, { "epoch": 7.224032076180929, "grad_norm": 0.07303885370492935, "learning_rate": 8.061840674670568e-06, "loss": 0.4581, "num_input_tokens_seen": 70121312, "step": 57655 }, { "epoch": 7.224658564089713, "grad_norm": 0.07735487073659897, "learning_rate": 8.061408439308697e-06, "loss": 0.4638, "num_input_tokens_seen": 70127168, "step": 57660 }, { "epoch": 7.225285051998497, "grad_norm": 0.0589594803750515, "learning_rate": 8.060976167345113e-06, "loss": 0.4663, "num_input_tokens_seen": 70133504, "step": 57665 }, { "epoch": 7.22591153990728, "grad_norm": 0.086692214012146, "learning_rate": 8.060543858784988e-06, "loss": 0.4597, "num_input_tokens_seen": 70139520, "step": 57670 }, { "epoch": 7.226538027816063, "grad_norm": 0.07717496156692505, "learning_rate": 8.060111513633486e-06, "loss": 0.4663, "num_input_tokens_seen": 70145408, "step": 57675 }, { "epoch": 7.227164515724846, "grad_norm": 0.11435090750455856, "learning_rate": 8.059679131895779e-06, "loss": 0.4662, "num_input_tokens_seen": 70151616, "step": 57680 }, { "epoch": 7.22779100363363, "grad_norm": 0.03880660980939865, "learning_rate": 8.059246713577036e-06, "loss": 0.4628, "num_input_tokens_seen": 70157152, "step": 57685 }, { "epoch": 7.228417491542413, "grad_norm": 0.07961110770702362, "learning_rate": 8.058814258682425e-06, "loss": 0.4623, "num_input_tokens_seen": 70163200, "step": 57690 }, { "epoch": 7.2290439794511965, "grad_norm": 0.06508312374353409, "learning_rate": 8.058381767217121e-06, "loss": 0.4643, "num_input_tokens_seen": 70169248, "step": 57695 }, { "epoch": 7.22967046735998, "grad_norm": 0.069115549325943, "learning_rate": 8.057949239186287e-06, "loss": 0.4543, "num_input_tokens_seen": 70174848, "step": 57700 }, { "epoch": 7.230296955268764, "grad_norm": 0.05005668103694916, "learning_rate": 8.0575166745951e-06, "loss": 0.4542, "num_input_tokens_seen": 70180736, "step": 57705 }, { "epoch": 7.230923443177547, "grad_norm": 0.09213130921125412, "learning_rate": 8.057084073448733e-06, "loss": 0.4613, "num_input_tokens_seen": 70186944, "step": 57710 }, { "epoch": 7.23154993108633, "grad_norm": 0.16609187424182892, "learning_rate": 8.056651435752354e-06, "loss": 0.4517, "num_input_tokens_seen": 70192736, "step": 57715 }, { "epoch": 7.232176418995113, "grad_norm": 0.13023178279399872, "learning_rate": 8.05621876151114e-06, "loss": 0.4614, "num_input_tokens_seen": 70198752, "step": 57720 }, { "epoch": 7.232802906903896, "grad_norm": 0.07593528181314468, "learning_rate": 8.055786050730257e-06, "loss": 0.4555, "num_input_tokens_seen": 70204896, "step": 57725 }, { "epoch": 7.23342939481268, "grad_norm": 0.07399087399244308, "learning_rate": 8.055353303414883e-06, "loss": 0.4629, "num_input_tokens_seen": 70210912, "step": 57730 }, { "epoch": 7.234055882721464, "grad_norm": 0.08648708462715149, "learning_rate": 8.054920519570194e-06, "loss": 0.4627, "num_input_tokens_seen": 70217568, "step": 57735 }, { "epoch": 7.234682370630247, "grad_norm": 0.11151660233736038, "learning_rate": 8.054487699201361e-06, "loss": 0.4611, "num_input_tokens_seen": 70224000, "step": 57740 }, { "epoch": 7.23530885853903, "grad_norm": 0.07935480028390884, "learning_rate": 8.05405484231356e-06, "loss": 0.4601, "num_input_tokens_seen": 70230304, "step": 57745 }, { "epoch": 7.235935346447814, "grad_norm": 0.11119381338357925, "learning_rate": 8.053621948911962e-06, "loss": 0.461, "num_input_tokens_seen": 70236064, "step": 57750 }, { "epoch": 7.236561834356597, "grad_norm": 0.07616306096315384, "learning_rate": 8.05318901900175e-06, "loss": 0.4572, "num_input_tokens_seen": 70242048, "step": 57755 }, { "epoch": 7.23718832226538, "grad_norm": 0.14141008257865906, "learning_rate": 8.052756052588096e-06, "loss": 0.4747, "num_input_tokens_seen": 70247712, "step": 57760 }, { "epoch": 7.2378148101741635, "grad_norm": 0.09865744411945343, "learning_rate": 8.052323049676175e-06, "loss": 0.4633, "num_input_tokens_seen": 70254016, "step": 57765 }, { "epoch": 7.238441298082947, "grad_norm": 0.08915029466152191, "learning_rate": 8.051890010271166e-06, "loss": 0.473, "num_input_tokens_seen": 70259936, "step": 57770 }, { "epoch": 7.239067785991731, "grad_norm": 0.10608724504709244, "learning_rate": 8.051456934378248e-06, "loss": 0.4581, "num_input_tokens_seen": 70266080, "step": 57775 }, { "epoch": 7.239694273900514, "grad_norm": 0.0567995049059391, "learning_rate": 8.051023822002594e-06, "loss": 0.4609, "num_input_tokens_seen": 70272192, "step": 57780 }, { "epoch": 7.240320761809297, "grad_norm": 0.11856809258460999, "learning_rate": 8.050590673149387e-06, "loss": 0.4759, "num_input_tokens_seen": 70278176, "step": 57785 }, { "epoch": 7.24094724971808, "grad_norm": 0.07541894167661667, "learning_rate": 8.050157487823805e-06, "loss": 0.4653, "num_input_tokens_seen": 70284160, "step": 57790 }, { "epoch": 7.241573737626863, "grad_norm": 0.11647740006446838, "learning_rate": 8.049724266031023e-06, "loss": 0.4602, "num_input_tokens_seen": 70290432, "step": 57795 }, { "epoch": 7.2422002255356475, "grad_norm": 0.07639691233634949, "learning_rate": 8.049291007776225e-06, "loss": 0.457, "num_input_tokens_seen": 70296736, "step": 57800 }, { "epoch": 7.242826713444431, "grad_norm": 0.0718395859003067, "learning_rate": 8.04885771306459e-06, "loss": 0.4591, "num_input_tokens_seen": 70302432, "step": 57805 }, { "epoch": 7.243453201353214, "grad_norm": 0.08066734671592712, "learning_rate": 8.048424381901296e-06, "loss": 0.4611, "num_input_tokens_seen": 70308768, "step": 57810 }, { "epoch": 7.244079689261997, "grad_norm": 0.1547170728445053, "learning_rate": 8.047991014291525e-06, "loss": 0.4653, "num_input_tokens_seen": 70314976, "step": 57815 }, { "epoch": 7.244706177170781, "grad_norm": 0.09047029912471771, "learning_rate": 8.04755761024046e-06, "loss": 0.4589, "num_input_tokens_seen": 70321248, "step": 57820 }, { "epoch": 7.245332665079564, "grad_norm": 0.10661276429891586, "learning_rate": 8.04712416975328e-06, "loss": 0.4697, "num_input_tokens_seen": 70327104, "step": 57825 }, { "epoch": 7.245959152988347, "grad_norm": 0.08383960276842117, "learning_rate": 8.046690692835171e-06, "loss": 0.47, "num_input_tokens_seen": 70333152, "step": 57830 }, { "epoch": 7.2465856408971305, "grad_norm": 0.12260110676288605, "learning_rate": 8.046257179491313e-06, "loss": 0.4656, "num_input_tokens_seen": 70339360, "step": 57835 }, { "epoch": 7.247212128805914, "grad_norm": 0.06223633885383606, "learning_rate": 8.045823629726888e-06, "loss": 0.461, "num_input_tokens_seen": 70345440, "step": 57840 }, { "epoch": 7.247838616714698, "grad_norm": 0.07207250595092773, "learning_rate": 8.04539004354708e-06, "loss": 0.461, "num_input_tokens_seen": 70350848, "step": 57845 }, { "epoch": 7.248465104623481, "grad_norm": 0.06863311678171158, "learning_rate": 8.044956420957075e-06, "loss": 0.4633, "num_input_tokens_seen": 70356928, "step": 57850 }, { "epoch": 7.249091592532264, "grad_norm": 0.044632695615291595, "learning_rate": 8.044522761962056e-06, "loss": 0.4635, "num_input_tokens_seen": 70363424, "step": 57855 }, { "epoch": 7.249718080441047, "grad_norm": 0.09001980721950531, "learning_rate": 8.044089066567208e-06, "loss": 0.4651, "num_input_tokens_seen": 70369568, "step": 57860 }, { "epoch": 7.25034456834983, "grad_norm": 0.10764392465353012, "learning_rate": 8.043655334777717e-06, "loss": 0.4623, "num_input_tokens_seen": 70375008, "step": 57865 }, { "epoch": 7.2509710562586145, "grad_norm": 0.07586105167865753, "learning_rate": 8.043221566598765e-06, "loss": 0.4615, "num_input_tokens_seen": 70381472, "step": 57870 }, { "epoch": 7.251597544167398, "grad_norm": 0.07695423811674118, "learning_rate": 8.04278776203554e-06, "loss": 0.4635, "num_input_tokens_seen": 70387648, "step": 57875 }, { "epoch": 7.252224032076181, "grad_norm": 0.08002281934022903, "learning_rate": 8.042353921093232e-06, "loss": 0.4626, "num_input_tokens_seen": 70394048, "step": 57880 }, { "epoch": 7.252850519984964, "grad_norm": 0.10357855260372162, "learning_rate": 8.041920043777022e-06, "loss": 0.4603, "num_input_tokens_seen": 70400544, "step": 57885 }, { "epoch": 7.253477007893748, "grad_norm": 0.13145709037780762, "learning_rate": 8.041486130092102e-06, "loss": 0.4591, "num_input_tokens_seen": 70406816, "step": 57890 }, { "epoch": 7.254103495802531, "grad_norm": 0.05735713988542557, "learning_rate": 8.041052180043658e-06, "loss": 0.463, "num_input_tokens_seen": 70413088, "step": 57895 }, { "epoch": 7.254729983711314, "grad_norm": 0.062266603112220764, "learning_rate": 8.040618193636878e-06, "loss": 0.4606, "num_input_tokens_seen": 70418848, "step": 57900 }, { "epoch": 7.255356471620098, "grad_norm": 0.04148855060338974, "learning_rate": 8.040184170876953e-06, "loss": 0.4611, "num_input_tokens_seen": 70424928, "step": 57905 }, { "epoch": 7.255982959528881, "grad_norm": 0.06890974938869476, "learning_rate": 8.039750111769067e-06, "loss": 0.4624, "num_input_tokens_seen": 70431232, "step": 57910 }, { "epoch": 7.256609447437665, "grad_norm": 0.10637678951025009, "learning_rate": 8.039316016318415e-06, "loss": 0.4585, "num_input_tokens_seen": 70437504, "step": 57915 }, { "epoch": 7.257235935346448, "grad_norm": 0.10224026441574097, "learning_rate": 8.038881884530186e-06, "loss": 0.4612, "num_input_tokens_seen": 70443680, "step": 57920 }, { "epoch": 7.257862423255231, "grad_norm": 0.07147695124149323, "learning_rate": 8.038447716409565e-06, "loss": 0.4583, "num_input_tokens_seen": 70449824, "step": 57925 }, { "epoch": 7.258488911164014, "grad_norm": 0.08875722438097, "learning_rate": 8.03801351196175e-06, "loss": 0.4657, "num_input_tokens_seen": 70456416, "step": 57930 }, { "epoch": 7.2591153990727975, "grad_norm": 0.09492191672325134, "learning_rate": 8.037579271191928e-06, "loss": 0.4634, "num_input_tokens_seen": 70462112, "step": 57935 }, { "epoch": 7.2597418869815815, "grad_norm": 0.12299728393554688, "learning_rate": 8.037144994105292e-06, "loss": 0.4632, "num_input_tokens_seen": 70468192, "step": 57940 }, { "epoch": 7.260368374890365, "grad_norm": 0.11929549276828766, "learning_rate": 8.036710680707035e-06, "loss": 0.4561, "num_input_tokens_seen": 70474656, "step": 57945 }, { "epoch": 7.260994862799148, "grad_norm": 0.10084514319896698, "learning_rate": 8.036276331002348e-06, "loss": 0.4653, "num_input_tokens_seen": 70480064, "step": 57950 }, { "epoch": 7.261621350707931, "grad_norm": 0.10683584958314896, "learning_rate": 8.035841944996425e-06, "loss": 0.4647, "num_input_tokens_seen": 70486144, "step": 57955 }, { "epoch": 7.262247838616715, "grad_norm": 0.0716918557882309, "learning_rate": 8.035407522694459e-06, "loss": 0.4624, "num_input_tokens_seen": 70492384, "step": 57960 }, { "epoch": 7.262874326525498, "grad_norm": 0.07204744964838028, "learning_rate": 8.034973064101644e-06, "loss": 0.4653, "num_input_tokens_seen": 70498336, "step": 57965 }, { "epoch": 7.263500814434281, "grad_norm": 0.06931643187999725, "learning_rate": 8.034538569223176e-06, "loss": 0.4648, "num_input_tokens_seen": 70504384, "step": 57970 }, { "epoch": 7.264127302343065, "grad_norm": 0.11272088438272476, "learning_rate": 8.034104038064245e-06, "loss": 0.4628, "num_input_tokens_seen": 70510048, "step": 57975 }, { "epoch": 7.264753790251848, "grad_norm": 0.04335452988743782, "learning_rate": 8.033669470630053e-06, "loss": 0.4696, "num_input_tokens_seen": 70516512, "step": 57980 }, { "epoch": 7.265380278160632, "grad_norm": 0.06885870546102524, "learning_rate": 8.033234866925788e-06, "loss": 0.4657, "num_input_tokens_seen": 70522784, "step": 57985 }, { "epoch": 7.266006766069415, "grad_norm": 0.15707522630691528, "learning_rate": 8.032800226956651e-06, "loss": 0.4634, "num_input_tokens_seen": 70528928, "step": 57990 }, { "epoch": 7.266633253978198, "grad_norm": 0.16338111460208893, "learning_rate": 8.03236555072784e-06, "loss": 0.4557, "num_input_tokens_seen": 70535104, "step": 57995 }, { "epoch": 7.267259741886981, "grad_norm": 0.053770214319229126, "learning_rate": 8.031930838244547e-06, "loss": 0.4649, "num_input_tokens_seen": 70540352, "step": 58000 }, { "epoch": 7.267886229795765, "grad_norm": 0.07881961762905121, "learning_rate": 8.031496089511972e-06, "loss": 0.4641, "num_input_tokens_seen": 70546464, "step": 58005 }, { "epoch": 7.2685127177045485, "grad_norm": 0.07621477544307709, "learning_rate": 8.031061304535312e-06, "loss": 0.4603, "num_input_tokens_seen": 70552608, "step": 58010 }, { "epoch": 7.269139205613332, "grad_norm": 0.07954376190900803, "learning_rate": 8.030626483319766e-06, "loss": 0.461, "num_input_tokens_seen": 70559008, "step": 58015 }, { "epoch": 7.269765693522115, "grad_norm": 0.06782408058643341, "learning_rate": 8.030191625870534e-06, "loss": 0.4606, "num_input_tokens_seen": 70565152, "step": 58020 }, { "epoch": 7.270392181430898, "grad_norm": 0.07788093388080597, "learning_rate": 8.029756732192813e-06, "loss": 0.4607, "num_input_tokens_seen": 70571392, "step": 58025 }, { "epoch": 7.271018669339682, "grad_norm": 0.06685848534107208, "learning_rate": 8.029321802291802e-06, "loss": 0.4592, "num_input_tokens_seen": 70577472, "step": 58030 }, { "epoch": 7.271645157248465, "grad_norm": 0.060414623469114304, "learning_rate": 8.028886836172703e-06, "loss": 0.4672, "num_input_tokens_seen": 70583232, "step": 58035 }, { "epoch": 7.272271645157248, "grad_norm": 0.07452274858951569, "learning_rate": 8.028451833840713e-06, "loss": 0.4602, "num_input_tokens_seen": 70588992, "step": 58040 }, { "epoch": 7.272898133066032, "grad_norm": 0.06863833218812943, "learning_rate": 8.028016795301037e-06, "loss": 0.4644, "num_input_tokens_seen": 70595232, "step": 58045 }, { "epoch": 7.273524620974815, "grad_norm": 0.07755161076784134, "learning_rate": 8.027581720558874e-06, "loss": 0.4619, "num_input_tokens_seen": 70601312, "step": 58050 }, { "epoch": 7.274151108883599, "grad_norm": 0.11879955232143402, "learning_rate": 8.027146609619426e-06, "loss": 0.4635, "num_input_tokens_seen": 70607104, "step": 58055 }, { "epoch": 7.274777596792382, "grad_norm": 0.07178671658039093, "learning_rate": 8.026711462487898e-06, "loss": 0.4519, "num_input_tokens_seen": 70613408, "step": 58060 }, { "epoch": 7.275404084701165, "grad_norm": 0.11873112618923187, "learning_rate": 8.026276279169485e-06, "loss": 0.4598, "num_input_tokens_seen": 70619712, "step": 58065 }, { "epoch": 7.276030572609948, "grad_norm": 0.09119528532028198, "learning_rate": 8.025841059669398e-06, "loss": 0.4634, "num_input_tokens_seen": 70625824, "step": 58070 }, { "epoch": 7.276657060518732, "grad_norm": 0.11784452199935913, "learning_rate": 8.025405803992835e-06, "loss": 0.4611, "num_input_tokens_seen": 70632032, "step": 58075 }, { "epoch": 7.277283548427516, "grad_norm": 0.0830206498503685, "learning_rate": 8.024970512145004e-06, "loss": 0.4618, "num_input_tokens_seen": 70638208, "step": 58080 }, { "epoch": 7.277910036336299, "grad_norm": 0.076822429895401, "learning_rate": 8.024535184131105e-06, "loss": 0.4618, "num_input_tokens_seen": 70644544, "step": 58085 }, { "epoch": 7.278536524245082, "grad_norm": 0.06348606944084167, "learning_rate": 8.024099819956347e-06, "loss": 0.4637, "num_input_tokens_seen": 70650816, "step": 58090 }, { "epoch": 7.279163012153865, "grad_norm": 0.0969596579670906, "learning_rate": 8.023664419625932e-06, "loss": 0.4639, "num_input_tokens_seen": 70656960, "step": 58095 }, { "epoch": 7.279789500062649, "grad_norm": 0.11471404135227203, "learning_rate": 8.023228983145068e-06, "loss": 0.4601, "num_input_tokens_seen": 70663040, "step": 58100 }, { "epoch": 7.280415987971432, "grad_norm": 0.12907207012176514, "learning_rate": 8.022793510518958e-06, "loss": 0.4644, "num_input_tokens_seen": 70669408, "step": 58105 }, { "epoch": 7.2810424758802155, "grad_norm": 0.10034402459859848, "learning_rate": 8.022358001752812e-06, "loss": 0.4597, "num_input_tokens_seen": 70675680, "step": 58110 }, { "epoch": 7.281668963788999, "grad_norm": 0.06827199459075928, "learning_rate": 8.021922456851833e-06, "loss": 0.4545, "num_input_tokens_seen": 70681728, "step": 58115 }, { "epoch": 7.282295451697783, "grad_norm": 0.09267138689756393, "learning_rate": 8.021486875821232e-06, "loss": 0.4699, "num_input_tokens_seen": 70688192, "step": 58120 }, { "epoch": 7.282921939606566, "grad_norm": 0.09847375750541687, "learning_rate": 8.021051258666214e-06, "loss": 0.4683, "num_input_tokens_seen": 70694400, "step": 58125 }, { "epoch": 7.283548427515349, "grad_norm": 0.10618633031845093, "learning_rate": 8.020615605391988e-06, "loss": 0.4626, "num_input_tokens_seen": 70700320, "step": 58130 }, { "epoch": 7.284174915424132, "grad_norm": 0.11541427671909332, "learning_rate": 8.020179916003765e-06, "loss": 0.4611, "num_input_tokens_seen": 70706336, "step": 58135 }, { "epoch": 7.284801403332915, "grad_norm": 0.07256698608398438, "learning_rate": 8.01974419050675e-06, "loss": 0.4591, "num_input_tokens_seen": 70712480, "step": 58140 }, { "epoch": 7.285427891241699, "grad_norm": 0.06042151525616646, "learning_rate": 8.019308428906154e-06, "loss": 0.46, "num_input_tokens_seen": 70718656, "step": 58145 }, { "epoch": 7.286054379150483, "grad_norm": 0.07293123751878738, "learning_rate": 8.01887263120719e-06, "loss": 0.4612, "num_input_tokens_seen": 70724960, "step": 58150 }, { "epoch": 7.286680867059266, "grad_norm": 0.09562984108924866, "learning_rate": 8.018436797415063e-06, "loss": 0.4605, "num_input_tokens_seen": 70731232, "step": 58155 }, { "epoch": 7.287307354968049, "grad_norm": 0.07375287264585495, "learning_rate": 8.018000927534986e-06, "loss": 0.4697, "num_input_tokens_seen": 70737344, "step": 58160 }, { "epoch": 7.287933842876832, "grad_norm": 0.07303064316511154, "learning_rate": 8.017565021572171e-06, "loss": 0.4618, "num_input_tokens_seen": 70743104, "step": 58165 }, { "epoch": 7.288560330785616, "grad_norm": 0.04932815209031105, "learning_rate": 8.01712907953183e-06, "loss": 0.4599, "num_input_tokens_seen": 70749280, "step": 58170 }, { "epoch": 7.289186818694399, "grad_norm": 0.12154916673898697, "learning_rate": 8.016693101419175e-06, "loss": 0.4558, "num_input_tokens_seen": 70755584, "step": 58175 }, { "epoch": 7.2898133066031825, "grad_norm": 0.08337894082069397, "learning_rate": 8.016257087239415e-06, "loss": 0.4604, "num_input_tokens_seen": 70761600, "step": 58180 }, { "epoch": 7.290439794511966, "grad_norm": 0.12400522083044052, "learning_rate": 8.015821036997767e-06, "loss": 0.4659, "num_input_tokens_seen": 70767872, "step": 58185 }, { "epoch": 7.291066282420749, "grad_norm": 0.1241438090801239, "learning_rate": 8.015384950699444e-06, "loss": 0.4619, "num_input_tokens_seen": 70774048, "step": 58190 }, { "epoch": 7.291692770329533, "grad_norm": 0.07892030477523804, "learning_rate": 8.014948828349657e-06, "loss": 0.4576, "num_input_tokens_seen": 70780256, "step": 58195 }, { "epoch": 7.292319258238316, "grad_norm": 0.08543962985277176, "learning_rate": 8.014512669953622e-06, "loss": 0.4599, "num_input_tokens_seen": 70786528, "step": 58200 }, { "epoch": 7.292945746147099, "grad_norm": 0.07262216508388519, "learning_rate": 8.014076475516554e-06, "loss": 0.4639, "num_input_tokens_seen": 70792224, "step": 58205 }, { "epoch": 7.293572234055882, "grad_norm": 0.07848944514989853, "learning_rate": 8.013640245043667e-06, "loss": 0.4622, "num_input_tokens_seen": 70798368, "step": 58210 }, { "epoch": 7.2941987219646665, "grad_norm": 0.07277429848909378, "learning_rate": 8.013203978540179e-06, "loss": 0.4613, "num_input_tokens_seen": 70804192, "step": 58215 }, { "epoch": 7.29482520987345, "grad_norm": 0.09073927253484726, "learning_rate": 8.012767676011303e-06, "loss": 0.4587, "num_input_tokens_seen": 70809984, "step": 58220 }, { "epoch": 7.295451697782233, "grad_norm": 0.05703004449605942, "learning_rate": 8.012331337462256e-06, "loss": 0.4701, "num_input_tokens_seen": 70816160, "step": 58225 }, { "epoch": 7.296078185691016, "grad_norm": 0.06555727869272232, "learning_rate": 8.011894962898256e-06, "loss": 0.4692, "num_input_tokens_seen": 70822336, "step": 58230 }, { "epoch": 7.296704673599799, "grad_norm": 0.06776994466781616, "learning_rate": 8.011458552324521e-06, "loss": 0.4584, "num_input_tokens_seen": 70828448, "step": 58235 }, { "epoch": 7.297331161508583, "grad_norm": 0.07295913994312286, "learning_rate": 8.011022105746263e-06, "loss": 0.4668, "num_input_tokens_seen": 70834592, "step": 58240 }, { "epoch": 7.297957649417366, "grad_norm": 0.07969465106725693, "learning_rate": 8.010585623168707e-06, "loss": 0.4592, "num_input_tokens_seen": 70840416, "step": 58245 }, { "epoch": 7.2985841373261495, "grad_norm": 0.13632674515247345, "learning_rate": 8.010149104597068e-06, "loss": 0.4502, "num_input_tokens_seen": 70846624, "step": 58250 }, { "epoch": 7.299210625234933, "grad_norm": 0.0965137705206871, "learning_rate": 8.009712550036565e-06, "loss": 0.471, "num_input_tokens_seen": 70852320, "step": 58255 }, { "epoch": 7.299837113143717, "grad_norm": 0.06820090115070343, "learning_rate": 8.00927595949242e-06, "loss": 0.4593, "num_input_tokens_seen": 70858336, "step": 58260 }, { "epoch": 7.3004636010525, "grad_norm": 0.07353844493627548, "learning_rate": 8.00883933296985e-06, "loss": 0.4636, "num_input_tokens_seen": 70864768, "step": 58265 }, { "epoch": 7.301090088961283, "grad_norm": 0.08552815020084381, "learning_rate": 8.008402670474076e-06, "loss": 0.4582, "num_input_tokens_seen": 70870880, "step": 58270 }, { "epoch": 7.301716576870066, "grad_norm": 0.05730738863348961, "learning_rate": 8.007965972010317e-06, "loss": 0.4598, "num_input_tokens_seen": 70876416, "step": 58275 }, { "epoch": 7.302343064778849, "grad_norm": 0.10367240011692047, "learning_rate": 8.007529237583797e-06, "loss": 0.4605, "num_input_tokens_seen": 70882464, "step": 58280 }, { "epoch": 7.3029695526876335, "grad_norm": 0.117298424243927, "learning_rate": 8.007092467199738e-06, "loss": 0.4599, "num_input_tokens_seen": 70888512, "step": 58285 }, { "epoch": 7.303596040596417, "grad_norm": 0.13928085565567017, "learning_rate": 8.00665566086336e-06, "loss": 0.4667, "num_input_tokens_seen": 70894560, "step": 58290 }, { "epoch": 7.3042225285052, "grad_norm": 0.12074334919452667, "learning_rate": 8.006218818579885e-06, "loss": 0.457, "num_input_tokens_seen": 70900608, "step": 58295 }, { "epoch": 7.304849016413983, "grad_norm": 0.10245248675346375, "learning_rate": 8.005781940354536e-06, "loss": 0.4664, "num_input_tokens_seen": 70906080, "step": 58300 }, { "epoch": 7.305475504322766, "grad_norm": 0.08809783309698105, "learning_rate": 8.005345026192536e-06, "loss": 0.4652, "num_input_tokens_seen": 70912480, "step": 58305 }, { "epoch": 7.30610199223155, "grad_norm": 0.13996057212352753, "learning_rate": 8.00490807609911e-06, "loss": 0.4623, "num_input_tokens_seen": 70918272, "step": 58310 }, { "epoch": 7.306728480140333, "grad_norm": 0.0919245108962059, "learning_rate": 8.004471090079482e-06, "loss": 0.4649, "num_input_tokens_seen": 70924416, "step": 58315 }, { "epoch": 7.3073549680491166, "grad_norm": 0.08826874941587448, "learning_rate": 8.004034068138877e-06, "loss": 0.4628, "num_input_tokens_seen": 70930240, "step": 58320 }, { "epoch": 7.3079814559579, "grad_norm": 0.07979235798120499, "learning_rate": 8.003597010282518e-06, "loss": 0.4677, "num_input_tokens_seen": 70936448, "step": 58325 }, { "epoch": 7.308607943866684, "grad_norm": 0.0832340195775032, "learning_rate": 8.003159916515632e-06, "loss": 0.4602, "num_input_tokens_seen": 70942784, "step": 58330 }, { "epoch": 7.309234431775467, "grad_norm": 0.08872456848621368, "learning_rate": 8.002722786843444e-06, "loss": 0.4555, "num_input_tokens_seen": 70949216, "step": 58335 }, { "epoch": 7.30986091968425, "grad_norm": 0.1478632390499115, "learning_rate": 8.00228562127118e-06, "loss": 0.4596, "num_input_tokens_seen": 70955552, "step": 58340 }, { "epoch": 7.310487407593033, "grad_norm": 0.12609656155109406, "learning_rate": 8.001848419804069e-06, "loss": 0.4542, "num_input_tokens_seen": 70961856, "step": 58345 }, { "epoch": 7.3111138955018165, "grad_norm": 0.13727346062660217, "learning_rate": 8.001411182447334e-06, "loss": 0.4615, "num_input_tokens_seen": 70968096, "step": 58350 }, { "epoch": 7.3117403834106005, "grad_norm": 0.09906084835529327, "learning_rate": 8.000973909206207e-06, "loss": 0.4695, "num_input_tokens_seen": 70974208, "step": 58355 }, { "epoch": 7.312366871319384, "grad_norm": 0.09280385822057724, "learning_rate": 8.000536600085913e-06, "loss": 0.4608, "num_input_tokens_seen": 70980288, "step": 58360 }, { "epoch": 7.312993359228167, "grad_norm": 0.09641928970813751, "learning_rate": 8.00009925509168e-06, "loss": 0.4611, "num_input_tokens_seen": 70986304, "step": 58365 }, { "epoch": 7.31361984713695, "grad_norm": 0.12677448987960815, "learning_rate": 7.99966187422874e-06, "loss": 0.4541, "num_input_tokens_seen": 70992704, "step": 58370 }, { "epoch": 7.314246335045734, "grad_norm": 0.07107804715633392, "learning_rate": 7.999224457502319e-06, "loss": 0.4622, "num_input_tokens_seen": 70999008, "step": 58375 }, { "epoch": 7.314872822954517, "grad_norm": 0.12616229057312012, "learning_rate": 7.998787004917648e-06, "loss": 0.4716, "num_input_tokens_seen": 71004832, "step": 58380 }, { "epoch": 7.3154993108633, "grad_norm": 0.08958877623081207, "learning_rate": 7.998349516479957e-06, "loss": 0.4642, "num_input_tokens_seen": 71011168, "step": 58385 }, { "epoch": 7.316125798772084, "grad_norm": 0.11230365186929703, "learning_rate": 7.997911992194477e-06, "loss": 0.4602, "num_input_tokens_seen": 71017536, "step": 58390 }, { "epoch": 7.316752286680867, "grad_norm": 0.11694874614477158, "learning_rate": 7.99747443206644e-06, "loss": 0.4632, "num_input_tokens_seen": 71023904, "step": 58395 }, { "epoch": 7.317378774589651, "grad_norm": 0.08441584557294846, "learning_rate": 7.997036836101074e-06, "loss": 0.4712, "num_input_tokens_seen": 71029376, "step": 58400 }, { "epoch": 7.318005262498434, "grad_norm": 0.09658071398735046, "learning_rate": 7.996599204303615e-06, "loss": 0.4583, "num_input_tokens_seen": 71035424, "step": 58405 }, { "epoch": 7.318631750407217, "grad_norm": 0.0808667242527008, "learning_rate": 7.99616153667929e-06, "loss": 0.462, "num_input_tokens_seen": 71041152, "step": 58410 }, { "epoch": 7.319258238316, "grad_norm": 0.08528733253479004, "learning_rate": 7.995723833233337e-06, "loss": 0.4679, "num_input_tokens_seen": 71047296, "step": 58415 }, { "epoch": 7.3198847262247835, "grad_norm": 0.13691812753677368, "learning_rate": 7.995286093970988e-06, "loss": 0.4661, "num_input_tokens_seen": 71052864, "step": 58420 }, { "epoch": 7.3205112141335675, "grad_norm": 0.09461642056703568, "learning_rate": 7.994848318897472e-06, "loss": 0.4686, "num_input_tokens_seen": 71058944, "step": 58425 }, { "epoch": 7.321137702042351, "grad_norm": 0.09952212125062943, "learning_rate": 7.994410508018029e-06, "loss": 0.4695, "num_input_tokens_seen": 71064928, "step": 58430 }, { "epoch": 7.321764189951134, "grad_norm": 0.0508914552628994, "learning_rate": 7.99397266133789e-06, "loss": 0.4672, "num_input_tokens_seen": 71070944, "step": 58435 }, { "epoch": 7.322390677859917, "grad_norm": 0.09867659956216812, "learning_rate": 7.993534778862291e-06, "loss": 0.4584, "num_input_tokens_seen": 71076896, "step": 58440 }, { "epoch": 7.3230171657687, "grad_norm": 0.07109151035547256, "learning_rate": 7.993096860596468e-06, "loss": 0.4595, "num_input_tokens_seen": 71083072, "step": 58445 }, { "epoch": 7.323643653677484, "grad_norm": 0.10558480769395828, "learning_rate": 7.992658906545654e-06, "loss": 0.4621, "num_input_tokens_seen": 71089088, "step": 58450 }, { "epoch": 7.324270141586267, "grad_norm": 0.1411750614643097, "learning_rate": 7.992220916715086e-06, "loss": 0.4587, "num_input_tokens_seen": 71095296, "step": 58455 }, { "epoch": 7.324896629495051, "grad_norm": 0.04990830272436142, "learning_rate": 7.991782891110002e-06, "loss": 0.4609, "num_input_tokens_seen": 71101568, "step": 58460 }, { "epoch": 7.325523117403834, "grad_norm": 0.09766067564487457, "learning_rate": 7.99134482973564e-06, "loss": 0.4548, "num_input_tokens_seen": 71107648, "step": 58465 }, { "epoch": 7.326149605312618, "grad_norm": 0.11655028164386749, "learning_rate": 7.990906732597232e-06, "loss": 0.4557, "num_input_tokens_seen": 71113600, "step": 58470 }, { "epoch": 7.326776093221401, "grad_norm": 0.07809590548276901, "learning_rate": 7.990468599700021e-06, "loss": 0.4679, "num_input_tokens_seen": 71119328, "step": 58475 }, { "epoch": 7.327402581130184, "grad_norm": 0.08967330306768417, "learning_rate": 7.990030431049244e-06, "loss": 0.464, "num_input_tokens_seen": 71125760, "step": 58480 }, { "epoch": 7.328029069038967, "grad_norm": 0.07649899274110794, "learning_rate": 7.98959222665014e-06, "loss": 0.4601, "num_input_tokens_seen": 71131808, "step": 58485 }, { "epoch": 7.3286555569477505, "grad_norm": 0.1344376653432846, "learning_rate": 7.989153986507946e-06, "loss": 0.4622, "num_input_tokens_seen": 71137408, "step": 58490 }, { "epoch": 7.329282044856535, "grad_norm": 0.08474665135145187, "learning_rate": 7.988715710627906e-06, "loss": 0.4604, "num_input_tokens_seen": 71143488, "step": 58495 }, { "epoch": 7.329908532765318, "grad_norm": 0.11664801090955734, "learning_rate": 7.988277399015254e-06, "loss": 0.4648, "num_input_tokens_seen": 71149696, "step": 58500 }, { "epoch": 7.330535020674101, "grad_norm": 0.08409518003463745, "learning_rate": 7.987839051675232e-06, "loss": 0.463, "num_input_tokens_seen": 71155808, "step": 58505 }, { "epoch": 7.331161508582884, "grad_norm": 0.11770378053188324, "learning_rate": 7.987400668613086e-06, "loss": 0.4613, "num_input_tokens_seen": 71162080, "step": 58510 }, { "epoch": 7.331787996491668, "grad_norm": 0.1102302074432373, "learning_rate": 7.986962249834051e-06, "loss": 0.4628, "num_input_tokens_seen": 71168224, "step": 58515 }, { "epoch": 7.332414484400451, "grad_norm": 0.15672831237316132, "learning_rate": 7.98652379534337e-06, "loss": 0.4629, "num_input_tokens_seen": 71174560, "step": 58520 }, { "epoch": 7.3330409723092345, "grad_norm": 0.08377230167388916, "learning_rate": 7.986085305146288e-06, "loss": 0.4581, "num_input_tokens_seen": 71180736, "step": 58525 }, { "epoch": 7.333667460218018, "grad_norm": 0.08220723271369934, "learning_rate": 7.985646779248046e-06, "loss": 0.4603, "num_input_tokens_seen": 71187008, "step": 58530 }, { "epoch": 7.334293948126801, "grad_norm": 0.11415338516235352, "learning_rate": 7.985208217653886e-06, "loss": 0.4643, "num_input_tokens_seen": 71193216, "step": 58535 }, { "epoch": 7.334920436035585, "grad_norm": 0.09523621201515198, "learning_rate": 7.98476962036905e-06, "loss": 0.4697, "num_input_tokens_seen": 71199104, "step": 58540 }, { "epoch": 7.335546923944368, "grad_norm": 0.0952458456158638, "learning_rate": 7.984330987398786e-06, "loss": 0.4668, "num_input_tokens_seen": 71205216, "step": 58545 }, { "epoch": 7.336173411853151, "grad_norm": 0.10121632367372513, "learning_rate": 7.983892318748333e-06, "loss": 0.461, "num_input_tokens_seen": 71211168, "step": 58550 }, { "epoch": 7.336799899761934, "grad_norm": 0.11480271071195602, "learning_rate": 7.983453614422942e-06, "loss": 0.4637, "num_input_tokens_seen": 71217056, "step": 58555 }, { "epoch": 7.3374263876707175, "grad_norm": 0.13577601313591003, "learning_rate": 7.983014874427854e-06, "loss": 0.4715, "num_input_tokens_seen": 71223104, "step": 58560 }, { "epoch": 7.338052875579502, "grad_norm": 0.11623979359865189, "learning_rate": 7.982576098768314e-06, "loss": 0.4647, "num_input_tokens_seen": 71228448, "step": 58565 }, { "epoch": 7.338679363488285, "grad_norm": 0.08888644725084305, "learning_rate": 7.98213728744957e-06, "loss": 0.4638, "num_input_tokens_seen": 71234880, "step": 58570 }, { "epoch": 7.339305851397068, "grad_norm": 0.07767531275749207, "learning_rate": 7.981698440476868e-06, "loss": 0.4595, "num_input_tokens_seen": 71241088, "step": 58575 }, { "epoch": 7.339932339305851, "grad_norm": 0.084813691675663, "learning_rate": 7.981259557855454e-06, "loss": 0.4677, "num_input_tokens_seen": 71247200, "step": 58580 }, { "epoch": 7.340558827214635, "grad_norm": 0.11762472242116928, "learning_rate": 7.980820639590573e-06, "loss": 0.4658, "num_input_tokens_seen": 71253376, "step": 58585 }, { "epoch": 7.341185315123418, "grad_norm": 0.10699287801980972, "learning_rate": 7.98038168568748e-06, "loss": 0.4628, "num_input_tokens_seen": 71259552, "step": 58590 }, { "epoch": 7.3418118030322015, "grad_norm": 0.07514666020870209, "learning_rate": 7.979942696151415e-06, "loss": 0.4646, "num_input_tokens_seen": 71265664, "step": 58595 }, { "epoch": 7.342438290940985, "grad_norm": 0.07159766554832458, "learning_rate": 7.979503670987629e-06, "loss": 0.4563, "num_input_tokens_seen": 71271776, "step": 58600 }, { "epoch": 7.343064778849768, "grad_norm": 0.09403176605701447, "learning_rate": 7.979064610201372e-06, "loss": 0.463, "num_input_tokens_seen": 71277824, "step": 58605 }, { "epoch": 7.343691266758552, "grad_norm": 0.10261396318674088, "learning_rate": 7.978625513797896e-06, "loss": 0.4607, "num_input_tokens_seen": 71283744, "step": 58610 }, { "epoch": 7.344317754667335, "grad_norm": 0.12238314002752304, "learning_rate": 7.978186381782445e-06, "loss": 0.4669, "num_input_tokens_seen": 71289728, "step": 58615 }, { "epoch": 7.344944242576118, "grad_norm": 0.07416378706693649, "learning_rate": 7.977747214160272e-06, "loss": 0.4692, "num_input_tokens_seen": 71295840, "step": 58620 }, { "epoch": 7.345570730484901, "grad_norm": 0.07645253092050552, "learning_rate": 7.977308010936626e-06, "loss": 0.4628, "num_input_tokens_seen": 71301920, "step": 58625 }, { "epoch": 7.3461972183936854, "grad_norm": 0.07646707445383072, "learning_rate": 7.976868772116761e-06, "loss": 0.4611, "num_input_tokens_seen": 71307744, "step": 58630 }, { "epoch": 7.346823706302469, "grad_norm": 0.12546062469482422, "learning_rate": 7.976429497705928e-06, "loss": 0.4649, "num_input_tokens_seen": 71313856, "step": 58635 }, { "epoch": 7.347450194211252, "grad_norm": 0.07467186450958252, "learning_rate": 7.975990187709378e-06, "loss": 0.4621, "num_input_tokens_seen": 71319168, "step": 58640 }, { "epoch": 7.348076682120035, "grad_norm": 0.0735069289803505, "learning_rate": 7.975550842132362e-06, "loss": 0.4625, "num_input_tokens_seen": 71325120, "step": 58645 }, { "epoch": 7.348703170028818, "grad_norm": 0.08981823921203613, "learning_rate": 7.975111460980134e-06, "loss": 0.4584, "num_input_tokens_seen": 71331008, "step": 58650 }, { "epoch": 7.349329657937602, "grad_norm": 0.14977826178073883, "learning_rate": 7.974672044257948e-06, "loss": 0.4586, "num_input_tokens_seen": 71337440, "step": 58655 }, { "epoch": 7.349956145846385, "grad_norm": 0.07069908827543259, "learning_rate": 7.974232591971055e-06, "loss": 0.4682, "num_input_tokens_seen": 71343584, "step": 58660 }, { "epoch": 7.3505826337551685, "grad_norm": 0.0829814225435257, "learning_rate": 7.973793104124712e-06, "loss": 0.4597, "num_input_tokens_seen": 71349728, "step": 58665 }, { "epoch": 7.351209121663952, "grad_norm": 0.06998582184314728, "learning_rate": 7.973353580724173e-06, "loss": 0.4555, "num_input_tokens_seen": 71355936, "step": 58670 }, { "epoch": 7.351835609572735, "grad_norm": 0.06854380667209625, "learning_rate": 7.97291402177469e-06, "loss": 0.4579, "num_input_tokens_seen": 71362016, "step": 58675 }, { "epoch": 7.352462097481519, "grad_norm": 0.08128379285335541, "learning_rate": 7.972474427281522e-06, "loss": 0.4612, "num_input_tokens_seen": 71368096, "step": 58680 }, { "epoch": 7.353088585390302, "grad_norm": 0.11823348701000214, "learning_rate": 7.972034797249923e-06, "loss": 0.4589, "num_input_tokens_seen": 71374304, "step": 58685 }, { "epoch": 7.353715073299085, "grad_norm": 0.09108618646860123, "learning_rate": 7.971595131685147e-06, "loss": 0.4561, "num_input_tokens_seen": 71379968, "step": 58690 }, { "epoch": 7.354341561207868, "grad_norm": 0.08494429290294647, "learning_rate": 7.971155430592456e-06, "loss": 0.4565, "num_input_tokens_seen": 71385856, "step": 58695 }, { "epoch": 7.354968049116652, "grad_norm": 0.07131549715995789, "learning_rate": 7.970715693977103e-06, "loss": 0.4649, "num_input_tokens_seen": 71392256, "step": 58700 }, { "epoch": 7.355594537025436, "grad_norm": 0.10271953046321869, "learning_rate": 7.970275921844346e-06, "loss": 0.4574, "num_input_tokens_seen": 71398688, "step": 58705 }, { "epoch": 7.356221024934219, "grad_norm": 0.09409041702747345, "learning_rate": 7.969836114199443e-06, "loss": 0.4622, "num_input_tokens_seen": 71404800, "step": 58710 }, { "epoch": 7.356847512843002, "grad_norm": 0.13724491000175476, "learning_rate": 7.969396271047652e-06, "loss": 0.462, "num_input_tokens_seen": 71410176, "step": 58715 }, { "epoch": 7.357474000751785, "grad_norm": 0.08844048529863358, "learning_rate": 7.968956392394234e-06, "loss": 0.4655, "num_input_tokens_seen": 71416320, "step": 58720 }, { "epoch": 7.358100488660569, "grad_norm": 0.11460883170366287, "learning_rate": 7.968516478244443e-06, "loss": 0.4646, "num_input_tokens_seen": 71422528, "step": 58725 }, { "epoch": 7.358726976569352, "grad_norm": 0.1131359338760376, "learning_rate": 7.968076528603544e-06, "loss": 0.4629, "num_input_tokens_seen": 71428704, "step": 58730 }, { "epoch": 7.3593534644781355, "grad_norm": 0.09571501612663269, "learning_rate": 7.967636543476793e-06, "loss": 0.4619, "num_input_tokens_seen": 71435008, "step": 58735 }, { "epoch": 7.359979952386919, "grad_norm": 0.09821541607379913, "learning_rate": 7.967196522869454e-06, "loss": 0.4596, "num_input_tokens_seen": 71440800, "step": 58740 }, { "epoch": 7.360606440295702, "grad_norm": 0.07822897285223007, "learning_rate": 7.966756466786785e-06, "loss": 0.4546, "num_input_tokens_seen": 71446944, "step": 58745 }, { "epoch": 7.361232928204486, "grad_norm": 0.11993392556905746, "learning_rate": 7.966316375234046e-06, "loss": 0.4671, "num_input_tokens_seen": 71452672, "step": 58750 }, { "epoch": 7.361859416113269, "grad_norm": 0.08705105632543564, "learning_rate": 7.965876248216504e-06, "loss": 0.4703, "num_input_tokens_seen": 71459136, "step": 58755 }, { "epoch": 7.362485904022052, "grad_norm": 0.1344003826379776, "learning_rate": 7.965436085739418e-06, "loss": 0.4535, "num_input_tokens_seen": 71465536, "step": 58760 }, { "epoch": 7.363112391930835, "grad_norm": 0.09241480380296707, "learning_rate": 7.964995887808047e-06, "loss": 0.4656, "num_input_tokens_seen": 71471968, "step": 58765 }, { "epoch": 7.3637388798396195, "grad_norm": 0.09296653419733047, "learning_rate": 7.96455565442766e-06, "loss": 0.4608, "num_input_tokens_seen": 71477632, "step": 58770 }, { "epoch": 7.364365367748403, "grad_norm": 0.11470295488834381, "learning_rate": 7.964115385603518e-06, "loss": 0.4567, "num_input_tokens_seen": 71483872, "step": 58775 }, { "epoch": 7.364991855657186, "grad_norm": 0.07658154517412186, "learning_rate": 7.963675081340882e-06, "loss": 0.4658, "num_input_tokens_seen": 71489792, "step": 58780 }, { "epoch": 7.365618343565969, "grad_norm": 0.13764631748199463, "learning_rate": 7.96323474164502e-06, "loss": 0.4531, "num_input_tokens_seen": 71495296, "step": 58785 }, { "epoch": 7.366244831474752, "grad_norm": 0.13130220770835876, "learning_rate": 7.962794366521197e-06, "loss": 0.4565, "num_input_tokens_seen": 71501280, "step": 58790 }, { "epoch": 7.366871319383536, "grad_norm": 0.08369728922843933, "learning_rate": 7.962353955974676e-06, "loss": 0.4556, "num_input_tokens_seen": 71507616, "step": 58795 }, { "epoch": 7.367497807292319, "grad_norm": 0.05784805491566658, "learning_rate": 7.961913510010721e-06, "loss": 0.4657, "num_input_tokens_seen": 71513664, "step": 58800 }, { "epoch": 7.368124295201103, "grad_norm": 0.06346306949853897, "learning_rate": 7.9614730286346e-06, "loss": 0.4698, "num_input_tokens_seen": 71519680, "step": 58805 }, { "epoch": 7.368750783109886, "grad_norm": 0.09650328755378723, "learning_rate": 7.96103251185158e-06, "loss": 0.4686, "num_input_tokens_seen": 71525728, "step": 58810 }, { "epoch": 7.369377271018669, "grad_norm": 0.08283361792564392, "learning_rate": 7.960591959666926e-06, "loss": 0.455, "num_input_tokens_seen": 71532032, "step": 58815 }, { "epoch": 7.370003758927453, "grad_norm": 0.0773780569434166, "learning_rate": 7.960151372085905e-06, "loss": 0.4657, "num_input_tokens_seen": 71537856, "step": 58820 }, { "epoch": 7.370630246836236, "grad_norm": 0.12669897079467773, "learning_rate": 7.959710749113789e-06, "loss": 0.4587, "num_input_tokens_seen": 71543936, "step": 58825 }, { "epoch": 7.371256734745019, "grad_norm": 0.0816703587770462, "learning_rate": 7.95927009075584e-06, "loss": 0.4689, "num_input_tokens_seen": 71550208, "step": 58830 }, { "epoch": 7.3718832226538025, "grad_norm": 0.08718916028738022, "learning_rate": 7.95882939701733e-06, "loss": 0.4609, "num_input_tokens_seen": 71556608, "step": 58835 }, { "epoch": 7.3725097105625865, "grad_norm": 0.11659233272075653, "learning_rate": 7.958388667903526e-06, "loss": 0.4672, "num_input_tokens_seen": 71562624, "step": 58840 }, { "epoch": 7.37313619847137, "grad_norm": 0.12540069222450256, "learning_rate": 7.957947903419698e-06, "loss": 0.4652, "num_input_tokens_seen": 71568480, "step": 58845 }, { "epoch": 7.373762686380153, "grad_norm": 0.08961286395788193, "learning_rate": 7.957507103571118e-06, "loss": 0.4704, "num_input_tokens_seen": 71574592, "step": 58850 }, { "epoch": 7.374389174288936, "grad_norm": 0.09909354150295258, "learning_rate": 7.957066268363052e-06, "loss": 0.4635, "num_input_tokens_seen": 71580544, "step": 58855 }, { "epoch": 7.375015662197719, "grad_norm": 0.07293125987052917, "learning_rate": 7.956625397800773e-06, "loss": 0.4632, "num_input_tokens_seen": 71586592, "step": 58860 }, { "epoch": 7.375642150106503, "grad_norm": 0.11217016726732254, "learning_rate": 7.956184491889552e-06, "loss": 0.464, "num_input_tokens_seen": 71592576, "step": 58865 }, { "epoch": 7.376268638015286, "grad_norm": 0.06745006144046783, "learning_rate": 7.95574355063466e-06, "loss": 0.4674, "num_input_tokens_seen": 71598752, "step": 58870 }, { "epoch": 7.37689512592407, "grad_norm": 0.07845107465982437, "learning_rate": 7.955302574041368e-06, "loss": 0.4675, "num_input_tokens_seen": 71604928, "step": 58875 }, { "epoch": 7.377521613832853, "grad_norm": 0.09708350896835327, "learning_rate": 7.95486156211495e-06, "loss": 0.4639, "num_input_tokens_seen": 71611200, "step": 58880 }, { "epoch": 7.378148101741637, "grad_norm": 0.08386767655611038, "learning_rate": 7.954420514860677e-06, "loss": 0.4639, "num_input_tokens_seen": 71617184, "step": 58885 }, { "epoch": 7.37877458965042, "grad_norm": 0.08647479861974716, "learning_rate": 7.953979432283825e-06, "loss": 0.4666, "num_input_tokens_seen": 71623136, "step": 58890 }, { "epoch": 7.379401077559203, "grad_norm": 0.07972049713134766, "learning_rate": 7.953538314389663e-06, "loss": 0.463, "num_input_tokens_seen": 71629344, "step": 58895 }, { "epoch": 7.380027565467986, "grad_norm": 0.0698307678103447, "learning_rate": 7.953097161183467e-06, "loss": 0.4548, "num_input_tokens_seen": 71635392, "step": 58900 }, { "epoch": 7.3806540533767695, "grad_norm": 0.1161833330988884, "learning_rate": 7.952655972670513e-06, "loss": 0.4588, "num_input_tokens_seen": 71641440, "step": 58905 }, { "epoch": 7.3812805412855536, "grad_norm": 0.1167530044913292, "learning_rate": 7.952214748856074e-06, "loss": 0.4646, "num_input_tokens_seen": 71647808, "step": 58910 }, { "epoch": 7.381907029194337, "grad_norm": 0.11276524513959885, "learning_rate": 7.951773489745424e-06, "loss": 0.4584, "num_input_tokens_seen": 71653920, "step": 58915 }, { "epoch": 7.38253351710312, "grad_norm": 0.09583792835474014, "learning_rate": 7.951332195343843e-06, "loss": 0.4612, "num_input_tokens_seen": 71660160, "step": 58920 }, { "epoch": 7.383160005011903, "grad_norm": 0.07986285537481308, "learning_rate": 7.950890865656603e-06, "loss": 0.4602, "num_input_tokens_seen": 71665664, "step": 58925 }, { "epoch": 7.383786492920686, "grad_norm": 0.09110232442617416, "learning_rate": 7.950449500688981e-06, "loss": 0.4612, "num_input_tokens_seen": 71671808, "step": 58930 }, { "epoch": 7.38441298082947, "grad_norm": 0.0822017639875412, "learning_rate": 7.950008100446256e-06, "loss": 0.4574, "num_input_tokens_seen": 71678112, "step": 58935 }, { "epoch": 7.3850394687382535, "grad_norm": 0.11406750977039337, "learning_rate": 7.949566664933702e-06, "loss": 0.4691, "num_input_tokens_seen": 71684416, "step": 58940 }, { "epoch": 7.385665956647037, "grad_norm": 0.08665572106838226, "learning_rate": 7.949125194156599e-06, "loss": 0.4632, "num_input_tokens_seen": 71690816, "step": 58945 }, { "epoch": 7.38629244455582, "grad_norm": 0.07499808818101883, "learning_rate": 7.948683688120226e-06, "loss": 0.46, "num_input_tokens_seen": 71696832, "step": 58950 }, { "epoch": 7.386918932464604, "grad_norm": 0.10890509188175201, "learning_rate": 7.94824214682986e-06, "loss": 0.4586, "num_input_tokens_seen": 71703040, "step": 58955 }, { "epoch": 7.387545420373387, "grad_norm": 0.085430808365345, "learning_rate": 7.947800570290782e-06, "loss": 0.4589, "num_input_tokens_seen": 71709312, "step": 58960 }, { "epoch": 7.38817190828217, "grad_norm": 0.13822337985038757, "learning_rate": 7.947358958508268e-06, "loss": 0.4588, "num_input_tokens_seen": 71715360, "step": 58965 }, { "epoch": 7.388798396190953, "grad_norm": 0.08477067947387695, "learning_rate": 7.9469173114876e-06, "loss": 0.4661, "num_input_tokens_seen": 71721536, "step": 58970 }, { "epoch": 7.3894248840997365, "grad_norm": 0.09983426332473755, "learning_rate": 7.946475629234057e-06, "loss": 0.4643, "num_input_tokens_seen": 71727584, "step": 58975 }, { "epoch": 7.390051372008521, "grad_norm": 0.050023868680000305, "learning_rate": 7.946033911752922e-06, "loss": 0.4667, "num_input_tokens_seen": 71733408, "step": 58980 }, { "epoch": 7.390677859917304, "grad_norm": 0.10116712003946304, "learning_rate": 7.945592159049474e-06, "loss": 0.4618, "num_input_tokens_seen": 71739616, "step": 58985 }, { "epoch": 7.391304347826087, "grad_norm": 0.10761240124702454, "learning_rate": 7.945150371128995e-06, "loss": 0.4591, "num_input_tokens_seen": 71745664, "step": 58990 }, { "epoch": 7.39193083573487, "grad_norm": 0.09194362908601761, "learning_rate": 7.944708547996766e-06, "loss": 0.4645, "num_input_tokens_seen": 71752000, "step": 58995 }, { "epoch": 7.392557323643654, "grad_norm": 0.12464724481105804, "learning_rate": 7.944266689658073e-06, "loss": 0.4563, "num_input_tokens_seen": 71758112, "step": 59000 }, { "epoch": 7.393183811552437, "grad_norm": 0.1008196622133255, "learning_rate": 7.943824796118196e-06, "loss": 0.4588, "num_input_tokens_seen": 71763968, "step": 59005 }, { "epoch": 7.3938102994612205, "grad_norm": 0.07760751247406006, "learning_rate": 7.943382867382417e-06, "loss": 0.4651, "num_input_tokens_seen": 71769696, "step": 59010 }, { "epoch": 7.394436787370004, "grad_norm": 0.0807628333568573, "learning_rate": 7.942940903456022e-06, "loss": 0.4632, "num_input_tokens_seen": 71776032, "step": 59015 }, { "epoch": 7.395063275278787, "grad_norm": 0.07937242090702057, "learning_rate": 7.942498904344294e-06, "loss": 0.458, "num_input_tokens_seen": 71782144, "step": 59020 }, { "epoch": 7.395689763187571, "grad_norm": 0.08018216490745544, "learning_rate": 7.942056870052519e-06, "loss": 0.4706, "num_input_tokens_seen": 71788416, "step": 59025 }, { "epoch": 7.396316251096354, "grad_norm": 0.09088250249624252, "learning_rate": 7.941614800585979e-06, "loss": 0.4608, "num_input_tokens_seen": 71794784, "step": 59030 }, { "epoch": 7.396942739005137, "grad_norm": 0.08638224750757217, "learning_rate": 7.941172695949962e-06, "loss": 0.4655, "num_input_tokens_seen": 71800832, "step": 59035 }, { "epoch": 7.39756922691392, "grad_norm": 0.11633298546075821, "learning_rate": 7.940730556149752e-06, "loss": 0.4605, "num_input_tokens_seen": 71807232, "step": 59040 }, { "epoch": 7.3981957148227036, "grad_norm": 0.08326674997806549, "learning_rate": 7.940288381190635e-06, "loss": 0.4584, "num_input_tokens_seen": 71813088, "step": 59045 }, { "epoch": 7.398822202731488, "grad_norm": 0.06081508845090866, "learning_rate": 7.9398461710779e-06, "loss": 0.4575, "num_input_tokens_seen": 71818912, "step": 59050 }, { "epoch": 7.399448690640271, "grad_norm": 0.06720773875713348, "learning_rate": 7.93940392581683e-06, "loss": 0.4638, "num_input_tokens_seen": 71824960, "step": 59055 }, { "epoch": 7.400075178549054, "grad_norm": 0.08265390992164612, "learning_rate": 7.938961645412717e-06, "loss": 0.4597, "num_input_tokens_seen": 71831072, "step": 59060 }, { "epoch": 7.400701666457837, "grad_norm": 0.0733448788523674, "learning_rate": 7.938519329870844e-06, "loss": 0.4602, "num_input_tokens_seen": 71836800, "step": 59065 }, { "epoch": 7.40132815436662, "grad_norm": 0.12947246432304382, "learning_rate": 7.938076979196505e-06, "loss": 0.4626, "num_input_tokens_seen": 71843040, "step": 59070 }, { "epoch": 7.401954642275404, "grad_norm": 0.11914343386888504, "learning_rate": 7.937634593394984e-06, "loss": 0.4652, "num_input_tokens_seen": 71849216, "step": 59075 }, { "epoch": 7.4025811301841875, "grad_norm": 0.06027279794216156, "learning_rate": 7.937192172471572e-06, "loss": 0.4639, "num_input_tokens_seen": 71855104, "step": 59080 }, { "epoch": 7.403207618092971, "grad_norm": 0.06989043205976486, "learning_rate": 7.936749716431558e-06, "loss": 0.46, "num_input_tokens_seen": 71861024, "step": 59085 }, { "epoch": 7.403834106001754, "grad_norm": 0.07269005477428436, "learning_rate": 7.936307225280232e-06, "loss": 0.4563, "num_input_tokens_seen": 71867200, "step": 59090 }, { "epoch": 7.404460593910538, "grad_norm": 0.09262116998434067, "learning_rate": 7.935864699022884e-06, "loss": 0.4577, "num_input_tokens_seen": 71873408, "step": 59095 }, { "epoch": 7.405087081819321, "grad_norm": 0.12512628734111786, "learning_rate": 7.935422137664806e-06, "loss": 0.4703, "num_input_tokens_seen": 71879680, "step": 59100 }, { "epoch": 7.405713569728104, "grad_norm": 0.08145251870155334, "learning_rate": 7.934979541211287e-06, "loss": 0.4626, "num_input_tokens_seen": 71886016, "step": 59105 }, { "epoch": 7.406340057636887, "grad_norm": 0.08892755210399628, "learning_rate": 7.934536909667622e-06, "loss": 0.4622, "num_input_tokens_seen": 71891808, "step": 59110 }, { "epoch": 7.406966545545671, "grad_norm": 0.12655958533287048, "learning_rate": 7.934094243039099e-06, "loss": 0.4656, "num_input_tokens_seen": 71898304, "step": 59115 }, { "epoch": 7.407593033454455, "grad_norm": 0.08811645209789276, "learning_rate": 7.933651541331014e-06, "loss": 0.4639, "num_input_tokens_seen": 71904192, "step": 59120 }, { "epoch": 7.408219521363238, "grad_norm": 0.07584633678197861, "learning_rate": 7.933208804548657e-06, "loss": 0.4587, "num_input_tokens_seen": 71910272, "step": 59125 }, { "epoch": 7.408846009272021, "grad_norm": 0.12001832574605942, "learning_rate": 7.932766032697323e-06, "loss": 0.4619, "num_input_tokens_seen": 71916192, "step": 59130 }, { "epoch": 7.409472497180804, "grad_norm": 0.08412549644708633, "learning_rate": 7.932323225782305e-06, "loss": 0.4635, "num_input_tokens_seen": 71921920, "step": 59135 }, { "epoch": 7.410098985089588, "grad_norm": 0.12807969748973846, "learning_rate": 7.931880383808899e-06, "loss": 0.4637, "num_input_tokens_seen": 71927936, "step": 59140 }, { "epoch": 7.410725472998371, "grad_norm": 0.08483510464429855, "learning_rate": 7.931437506782399e-06, "loss": 0.4645, "num_input_tokens_seen": 71934240, "step": 59145 }, { "epoch": 7.4113519609071545, "grad_norm": 0.11585759371519089, "learning_rate": 7.930994594708095e-06, "loss": 0.4616, "num_input_tokens_seen": 71940032, "step": 59150 }, { "epoch": 7.411978448815938, "grad_norm": 0.07596629112958908, "learning_rate": 7.930551647591289e-06, "loss": 0.4654, "num_input_tokens_seen": 71946336, "step": 59155 }, { "epoch": 7.412604936724721, "grad_norm": 0.07747917622327805, "learning_rate": 7.930108665437273e-06, "loss": 0.4573, "num_input_tokens_seen": 71952704, "step": 59160 }, { "epoch": 7.413231424633505, "grad_norm": 0.08124236017465591, "learning_rate": 7.929665648251346e-06, "loss": 0.4617, "num_input_tokens_seen": 71958240, "step": 59165 }, { "epoch": 7.413857912542288, "grad_norm": 0.048617567867040634, "learning_rate": 7.929222596038801e-06, "loss": 0.4573, "num_input_tokens_seen": 71964352, "step": 59170 }, { "epoch": 7.414484400451071, "grad_norm": 0.07351307570934296, "learning_rate": 7.92877950880494e-06, "loss": 0.4579, "num_input_tokens_seen": 71970144, "step": 59175 }, { "epoch": 7.415110888359854, "grad_norm": 0.1051754280924797, "learning_rate": 7.928336386555056e-06, "loss": 0.4586, "num_input_tokens_seen": 71976416, "step": 59180 }, { "epoch": 7.415737376268638, "grad_norm": 0.07918129861354828, "learning_rate": 7.927893229294448e-06, "loss": 0.4632, "num_input_tokens_seen": 71982656, "step": 59185 }, { "epoch": 7.416363864177422, "grad_norm": 0.0795690044760704, "learning_rate": 7.927450037028414e-06, "loss": 0.4675, "num_input_tokens_seen": 71988704, "step": 59190 }, { "epoch": 7.416990352086205, "grad_norm": 0.08517467230558395, "learning_rate": 7.927006809762256e-06, "loss": 0.4594, "num_input_tokens_seen": 71994688, "step": 59195 }, { "epoch": 7.417616839994988, "grad_norm": 0.08795902132987976, "learning_rate": 7.926563547501269e-06, "loss": 0.4626, "num_input_tokens_seen": 72000832, "step": 59200 }, { "epoch": 7.418243327903771, "grad_norm": 0.08605781942605972, "learning_rate": 7.926120250250755e-06, "loss": 0.4646, "num_input_tokens_seen": 72007072, "step": 59205 }, { "epoch": 7.418869815812555, "grad_norm": 0.08499334007501602, "learning_rate": 7.925676918016013e-06, "loss": 0.4643, "num_input_tokens_seen": 72013408, "step": 59210 }, { "epoch": 7.419496303721338, "grad_norm": 0.05831519141793251, "learning_rate": 7.925233550802343e-06, "loss": 0.4632, "num_input_tokens_seen": 72019584, "step": 59215 }, { "epoch": 7.420122791630122, "grad_norm": 0.050954777747392654, "learning_rate": 7.924790148615048e-06, "loss": 0.4632, "num_input_tokens_seen": 72025504, "step": 59220 }, { "epoch": 7.420749279538905, "grad_norm": 0.07310429215431213, "learning_rate": 7.924346711459426e-06, "loss": 0.4653, "num_input_tokens_seen": 72031328, "step": 59225 }, { "epoch": 7.421375767447688, "grad_norm": 0.06104359030723572, "learning_rate": 7.923903239340781e-06, "loss": 0.4629, "num_input_tokens_seen": 72036064, "step": 59230 }, { "epoch": 7.422002255356472, "grad_norm": 0.08966955542564392, "learning_rate": 7.923459732264415e-06, "loss": 0.4643, "num_input_tokens_seen": 72042304, "step": 59235 }, { "epoch": 7.422628743265255, "grad_norm": 0.04847761243581772, "learning_rate": 7.92301619023563e-06, "loss": 0.4599, "num_input_tokens_seen": 72048736, "step": 59240 }, { "epoch": 7.423255231174038, "grad_norm": 0.05427948758006096, "learning_rate": 7.922572613259727e-06, "loss": 0.4612, "num_input_tokens_seen": 72054688, "step": 59245 }, { "epoch": 7.4238817190828215, "grad_norm": 0.08625175803899765, "learning_rate": 7.922129001342015e-06, "loss": 0.4624, "num_input_tokens_seen": 72061056, "step": 59250 }, { "epoch": 7.4245082069916055, "grad_norm": 0.08587110787630081, "learning_rate": 7.921685354487791e-06, "loss": 0.4606, "num_input_tokens_seen": 72067136, "step": 59255 }, { "epoch": 7.425134694900389, "grad_norm": 0.11395464837551117, "learning_rate": 7.921241672702362e-06, "loss": 0.4709, "num_input_tokens_seen": 72073280, "step": 59260 }, { "epoch": 7.425761182809172, "grad_norm": 0.1588282436132431, "learning_rate": 7.920797955991036e-06, "loss": 0.4695, "num_input_tokens_seen": 72079328, "step": 59265 }, { "epoch": 7.426387670717955, "grad_norm": 0.08587024360895157, "learning_rate": 7.920354204359111e-06, "loss": 0.4679, "num_input_tokens_seen": 72085344, "step": 59270 }, { "epoch": 7.427014158626738, "grad_norm": 0.08104255050420761, "learning_rate": 7.919910417811897e-06, "loss": 0.465, "num_input_tokens_seen": 72091808, "step": 59275 }, { "epoch": 7.427640646535522, "grad_norm": 0.12752550840377808, "learning_rate": 7.919466596354702e-06, "loss": 0.4553, "num_input_tokens_seen": 72098016, "step": 59280 }, { "epoch": 7.428267134444305, "grad_norm": 0.07452791184186935, "learning_rate": 7.919022739992826e-06, "loss": 0.4553, "num_input_tokens_seen": 72103872, "step": 59285 }, { "epoch": 7.428893622353089, "grad_norm": 0.08061152696609497, "learning_rate": 7.918578848731581e-06, "loss": 0.4598, "num_input_tokens_seen": 72109920, "step": 59290 }, { "epoch": 7.429520110261872, "grad_norm": 0.1113799437880516, "learning_rate": 7.918134922576271e-06, "loss": 0.4579, "num_input_tokens_seen": 72116128, "step": 59295 }, { "epoch": 7.430146598170655, "grad_norm": 0.10867153853178024, "learning_rate": 7.917690961532205e-06, "loss": 0.4673, "num_input_tokens_seen": 72121280, "step": 59300 }, { "epoch": 7.430773086079439, "grad_norm": 0.12528890371322632, "learning_rate": 7.917246965604689e-06, "loss": 0.4629, "num_input_tokens_seen": 72127616, "step": 59305 }, { "epoch": 7.431399573988222, "grad_norm": 0.10266351699829102, "learning_rate": 7.916802934799034e-06, "loss": 0.4625, "num_input_tokens_seen": 72133696, "step": 59310 }, { "epoch": 7.432026061897005, "grad_norm": 0.09340803325176239, "learning_rate": 7.916358869120549e-06, "loss": 0.4555, "num_input_tokens_seen": 72139584, "step": 59315 }, { "epoch": 7.4326525498057885, "grad_norm": 0.08341872692108154, "learning_rate": 7.91591476857454e-06, "loss": 0.463, "num_input_tokens_seen": 72145664, "step": 59320 }, { "epoch": 7.433279037714572, "grad_norm": 0.09198563545942307, "learning_rate": 7.915470633166317e-06, "loss": 0.4581, "num_input_tokens_seen": 72152064, "step": 59325 }, { "epoch": 7.433905525623356, "grad_norm": 0.10186927020549774, "learning_rate": 7.915026462901194e-06, "loss": 0.4583, "num_input_tokens_seen": 72158048, "step": 59330 }, { "epoch": 7.434532013532139, "grad_norm": 0.06091924011707306, "learning_rate": 7.914582257784477e-06, "loss": 0.4597, "num_input_tokens_seen": 72164032, "step": 59335 }, { "epoch": 7.435158501440922, "grad_norm": 0.130497545003891, "learning_rate": 7.914138017821482e-06, "loss": 0.4587, "num_input_tokens_seen": 72170304, "step": 59340 }, { "epoch": 7.435784989349705, "grad_norm": 0.13804876804351807, "learning_rate": 7.913693743017514e-06, "loss": 0.4586, "num_input_tokens_seen": 72176608, "step": 59345 }, { "epoch": 7.436411477258489, "grad_norm": 0.11002733558416367, "learning_rate": 7.913249433377887e-06, "loss": 0.456, "num_input_tokens_seen": 72182464, "step": 59350 }, { "epoch": 7.4370379651672724, "grad_norm": 0.1852593570947647, "learning_rate": 7.912805088907916e-06, "loss": 0.4606, "num_input_tokens_seen": 72188576, "step": 59355 }, { "epoch": 7.437664453076056, "grad_norm": 0.16734132170677185, "learning_rate": 7.91236070961291e-06, "loss": 0.4708, "num_input_tokens_seen": 72194720, "step": 59360 }, { "epoch": 7.438290940984839, "grad_norm": 0.12812303006649017, "learning_rate": 7.911916295498182e-06, "loss": 0.4644, "num_input_tokens_seen": 72200992, "step": 59365 }, { "epoch": 7.438917428893622, "grad_norm": 0.13825273513793945, "learning_rate": 7.911471846569047e-06, "loss": 0.4535, "num_input_tokens_seen": 72206912, "step": 59370 }, { "epoch": 7.439543916802406, "grad_norm": 0.09496309608221054, "learning_rate": 7.911027362830822e-06, "loss": 0.4646, "num_input_tokens_seen": 72213120, "step": 59375 }, { "epoch": 7.440170404711189, "grad_norm": 0.11082051694393158, "learning_rate": 7.910582844288812e-06, "loss": 0.4672, "num_input_tokens_seen": 72219264, "step": 59380 }, { "epoch": 7.440796892619972, "grad_norm": 0.13068066537380219, "learning_rate": 7.91013829094834e-06, "loss": 0.4594, "num_input_tokens_seen": 72225376, "step": 59385 }, { "epoch": 7.4414233805287555, "grad_norm": 0.15810392796993256, "learning_rate": 7.909693702814718e-06, "loss": 0.4674, "num_input_tokens_seen": 72231488, "step": 59390 }, { "epoch": 7.44204986843754, "grad_norm": 0.09130232781171799, "learning_rate": 7.90924907989326e-06, "loss": 0.4618, "num_input_tokens_seen": 72237504, "step": 59395 }, { "epoch": 7.442676356346323, "grad_norm": 0.08079911023378372, "learning_rate": 7.908804422189285e-06, "loss": 0.4678, "num_input_tokens_seen": 72243552, "step": 59400 }, { "epoch": 7.443302844255106, "grad_norm": 0.14023974537849426, "learning_rate": 7.908359729708106e-06, "loss": 0.4585, "num_input_tokens_seen": 72249664, "step": 59405 }, { "epoch": 7.443929332163889, "grad_norm": 0.09304127097129822, "learning_rate": 7.907915002455042e-06, "loss": 0.4576, "num_input_tokens_seen": 72255776, "step": 59410 }, { "epoch": 7.444555820072672, "grad_norm": 0.09365341067314148, "learning_rate": 7.90747024043541e-06, "loss": 0.4558, "num_input_tokens_seen": 72262336, "step": 59415 }, { "epoch": 7.445182307981456, "grad_norm": 0.10819488763809204, "learning_rate": 7.907025443654525e-06, "loss": 0.4658, "num_input_tokens_seen": 72268480, "step": 59420 }, { "epoch": 7.4458087958902395, "grad_norm": 0.06220310181379318, "learning_rate": 7.906580612117709e-06, "loss": 0.4657, "num_input_tokens_seen": 72274720, "step": 59425 }, { "epoch": 7.446435283799023, "grad_norm": 0.14307552576065063, "learning_rate": 7.906135745830275e-06, "loss": 0.4691, "num_input_tokens_seen": 72280672, "step": 59430 }, { "epoch": 7.447061771707806, "grad_norm": 0.14372581243515015, "learning_rate": 7.905690844797547e-06, "loss": 0.4635, "num_input_tokens_seen": 72286880, "step": 59435 }, { "epoch": 7.447688259616589, "grad_norm": 0.12759581208229065, "learning_rate": 7.90524590902484e-06, "loss": 0.4555, "num_input_tokens_seen": 72292928, "step": 59440 }, { "epoch": 7.448314747525373, "grad_norm": 0.11741361021995544, "learning_rate": 7.904800938517479e-06, "loss": 0.4658, "num_input_tokens_seen": 72298688, "step": 59445 }, { "epoch": 7.448941235434156, "grad_norm": 0.09020696580410004, "learning_rate": 7.904355933280778e-06, "loss": 0.4593, "num_input_tokens_seen": 72304896, "step": 59450 }, { "epoch": 7.449567723342939, "grad_norm": 0.09462816268205643, "learning_rate": 7.903910893320062e-06, "loss": 0.4633, "num_input_tokens_seen": 72310848, "step": 59455 }, { "epoch": 7.4501942112517225, "grad_norm": 0.08675266802310944, "learning_rate": 7.903465818640648e-06, "loss": 0.4622, "num_input_tokens_seen": 72316832, "step": 59460 }, { "epoch": 7.450820699160507, "grad_norm": 0.08857288211584091, "learning_rate": 7.90302070924786e-06, "loss": 0.4636, "num_input_tokens_seen": 72322720, "step": 59465 }, { "epoch": 7.45144718706929, "grad_norm": 0.06054649129509926, "learning_rate": 7.902575565147018e-06, "loss": 0.46, "num_input_tokens_seen": 72328544, "step": 59470 }, { "epoch": 7.452073674978073, "grad_norm": 0.09692943096160889, "learning_rate": 7.902130386343446e-06, "loss": 0.4666, "num_input_tokens_seen": 72334464, "step": 59475 }, { "epoch": 7.452700162886856, "grad_norm": 0.10816417634487152, "learning_rate": 7.901685172842463e-06, "loss": 0.4636, "num_input_tokens_seen": 72340672, "step": 59480 }, { "epoch": 7.453326650795639, "grad_norm": 0.09938685595989227, "learning_rate": 7.901239924649395e-06, "loss": 0.4639, "num_input_tokens_seen": 72346624, "step": 59485 }, { "epoch": 7.453953138704423, "grad_norm": 0.1614484190940857, "learning_rate": 7.900794641769565e-06, "loss": 0.4633, "num_input_tokens_seen": 72352832, "step": 59490 }, { "epoch": 7.4545796266132065, "grad_norm": 0.08523152768611908, "learning_rate": 7.900349324208297e-06, "loss": 0.4675, "num_input_tokens_seen": 72358880, "step": 59495 }, { "epoch": 7.45520611452199, "grad_norm": 0.08689234405755997, "learning_rate": 7.899903971970912e-06, "loss": 0.4629, "num_input_tokens_seen": 72365024, "step": 59500 }, { "epoch": 7.455832602430773, "grad_norm": 0.12757475674152374, "learning_rate": 7.89945858506274e-06, "loss": 0.4652, "num_input_tokens_seen": 72371136, "step": 59505 }, { "epoch": 7.456459090339557, "grad_norm": 0.15530049800872803, "learning_rate": 7.8990131634891e-06, "loss": 0.4597, "num_input_tokens_seen": 72377568, "step": 59510 }, { "epoch": 7.45708557824834, "grad_norm": 0.09535727649927139, "learning_rate": 7.898567707255322e-06, "loss": 0.4618, "num_input_tokens_seen": 72383584, "step": 59515 }, { "epoch": 7.457712066157123, "grad_norm": 0.10513736307621002, "learning_rate": 7.89812221636673e-06, "loss": 0.4589, "num_input_tokens_seen": 72389824, "step": 59520 }, { "epoch": 7.458338554065906, "grad_norm": 0.13696783781051636, "learning_rate": 7.89767669082865e-06, "loss": 0.4635, "num_input_tokens_seen": 72395904, "step": 59525 }, { "epoch": 7.45896504197469, "grad_norm": 0.09517379105091095, "learning_rate": 7.897231130646409e-06, "loss": 0.458, "num_input_tokens_seen": 72402016, "step": 59530 }, { "epoch": 7.459591529883474, "grad_norm": 0.10791989415884018, "learning_rate": 7.896785535825334e-06, "loss": 0.4634, "num_input_tokens_seen": 72408416, "step": 59535 }, { "epoch": 7.460218017792257, "grad_norm": 0.08954274654388428, "learning_rate": 7.896339906370752e-06, "loss": 0.4617, "num_input_tokens_seen": 72413984, "step": 59540 }, { "epoch": 7.46084450570104, "grad_norm": 0.08864208310842514, "learning_rate": 7.895894242287991e-06, "loss": 0.4653, "num_input_tokens_seen": 72420096, "step": 59545 }, { "epoch": 7.461470993609823, "grad_norm": 0.09583093225955963, "learning_rate": 7.89544854358238e-06, "loss": 0.4699, "num_input_tokens_seen": 72425536, "step": 59550 }, { "epoch": 7.462097481518606, "grad_norm": 0.0926467627286911, "learning_rate": 7.895002810259248e-06, "loss": 0.4567, "num_input_tokens_seen": 72431968, "step": 59555 }, { "epoch": 7.46272396942739, "grad_norm": 0.15242622792720795, "learning_rate": 7.894557042323922e-06, "loss": 0.4556, "num_input_tokens_seen": 72438208, "step": 59560 }, { "epoch": 7.4633504573361735, "grad_norm": 0.11367511004209518, "learning_rate": 7.894111239781734e-06, "loss": 0.4615, "num_input_tokens_seen": 72444416, "step": 59565 }, { "epoch": 7.463976945244957, "grad_norm": 0.08583023399114609, "learning_rate": 7.893665402638012e-06, "loss": 0.4589, "num_input_tokens_seen": 72450848, "step": 59570 }, { "epoch": 7.46460343315374, "grad_norm": 0.08962103724479675, "learning_rate": 7.893219530898089e-06, "loss": 0.4557, "num_input_tokens_seen": 72456928, "step": 59575 }, { "epoch": 7.465229921062523, "grad_norm": 0.10212323814630508, "learning_rate": 7.892773624567292e-06, "loss": 0.4651, "num_input_tokens_seen": 72463168, "step": 59580 }, { "epoch": 7.465856408971307, "grad_norm": 0.08646389842033386, "learning_rate": 7.892327683650955e-06, "loss": 0.4642, "num_input_tokens_seen": 72469632, "step": 59585 }, { "epoch": 7.46648289688009, "grad_norm": 0.09173449128866196, "learning_rate": 7.891881708154409e-06, "loss": 0.4664, "num_input_tokens_seen": 72475808, "step": 59590 }, { "epoch": 7.467109384788873, "grad_norm": 0.07426155358552933, "learning_rate": 7.891435698082986e-06, "loss": 0.46, "num_input_tokens_seen": 72481824, "step": 59595 }, { "epoch": 7.467735872697657, "grad_norm": 0.12640097737312317, "learning_rate": 7.890989653442018e-06, "loss": 0.4607, "num_input_tokens_seen": 72488448, "step": 59600 }, { "epoch": 7.468362360606441, "grad_norm": 0.11902634054422379, "learning_rate": 7.890543574236838e-06, "loss": 0.4581, "num_input_tokens_seen": 72494560, "step": 59605 }, { "epoch": 7.468988848515224, "grad_norm": 0.10993711650371552, "learning_rate": 7.89009746047278e-06, "loss": 0.4611, "num_input_tokens_seen": 72500160, "step": 59610 }, { "epoch": 7.469615336424007, "grad_norm": 0.09746800363063812, "learning_rate": 7.889651312155176e-06, "loss": 0.4605, "num_input_tokens_seen": 72506304, "step": 59615 }, { "epoch": 7.47024182433279, "grad_norm": 0.07227998971939087, "learning_rate": 7.889205129289362e-06, "loss": 0.4511, "num_input_tokens_seen": 72511936, "step": 59620 }, { "epoch": 7.470868312241573, "grad_norm": 0.08021270483732224, "learning_rate": 7.88875891188067e-06, "loss": 0.4638, "num_input_tokens_seen": 72518112, "step": 59625 }, { "epoch": 7.471494800150357, "grad_norm": 0.09170352667570114, "learning_rate": 7.888312659934438e-06, "loss": 0.4596, "num_input_tokens_seen": 72524288, "step": 59630 }, { "epoch": 7.4721212880591406, "grad_norm": 0.12372030317783356, "learning_rate": 7.887866373456e-06, "loss": 0.46, "num_input_tokens_seen": 72530368, "step": 59635 }, { "epoch": 7.472747775967924, "grad_norm": 0.12611937522888184, "learning_rate": 7.887420052450689e-06, "loss": 0.4572, "num_input_tokens_seen": 72536576, "step": 59640 }, { "epoch": 7.473374263876707, "grad_norm": 0.08457832038402557, "learning_rate": 7.886973696923847e-06, "loss": 0.455, "num_input_tokens_seen": 72542848, "step": 59645 }, { "epoch": 7.474000751785491, "grad_norm": 0.08050112426280975, "learning_rate": 7.886527306880806e-06, "loss": 0.4561, "num_input_tokens_seen": 72549248, "step": 59650 }, { "epoch": 7.474627239694274, "grad_norm": 0.11967295408248901, "learning_rate": 7.886080882326904e-06, "loss": 0.4651, "num_input_tokens_seen": 72555488, "step": 59655 }, { "epoch": 7.475253727603057, "grad_norm": 0.0913022980093956, "learning_rate": 7.885634423267479e-06, "loss": 0.4659, "num_input_tokens_seen": 72561440, "step": 59660 }, { "epoch": 7.4758802155118405, "grad_norm": 0.06843963265419006, "learning_rate": 7.885187929707867e-06, "loss": 0.4603, "num_input_tokens_seen": 72567072, "step": 59665 }, { "epoch": 7.476506703420624, "grad_norm": 0.11440245062112808, "learning_rate": 7.884741401653408e-06, "loss": 0.4512, "num_input_tokens_seen": 72573344, "step": 59670 }, { "epoch": 7.477133191329408, "grad_norm": 0.10858212411403656, "learning_rate": 7.88429483910944e-06, "loss": 0.4671, "num_input_tokens_seen": 72579776, "step": 59675 }, { "epoch": 7.477759679238191, "grad_norm": 0.11299371719360352, "learning_rate": 7.883848242081305e-06, "loss": 0.4662, "num_input_tokens_seen": 72585952, "step": 59680 }, { "epoch": 7.478386167146974, "grad_norm": 0.17517352104187012, "learning_rate": 7.883401610574338e-06, "loss": 0.4617, "num_input_tokens_seen": 72591968, "step": 59685 }, { "epoch": 7.479012655055757, "grad_norm": 0.1530492603778839, "learning_rate": 7.882954944593878e-06, "loss": 0.4644, "num_input_tokens_seen": 72597696, "step": 59690 }, { "epoch": 7.47963914296454, "grad_norm": 0.1884566992521286, "learning_rate": 7.88250824414527e-06, "loss": 0.4647, "num_input_tokens_seen": 72603840, "step": 59695 }, { "epoch": 7.480265630873324, "grad_norm": 0.11409559100866318, "learning_rate": 7.882061509233853e-06, "loss": 0.4568, "num_input_tokens_seen": 72610112, "step": 59700 }, { "epoch": 7.480892118782108, "grad_norm": 0.10358768701553345, "learning_rate": 7.881614739864967e-06, "loss": 0.4562, "num_input_tokens_seen": 72616352, "step": 59705 }, { "epoch": 7.481518606690891, "grad_norm": 0.20391495525836945, "learning_rate": 7.881167936043955e-06, "loss": 0.4463, "num_input_tokens_seen": 72622432, "step": 59710 }, { "epoch": 7.482145094599674, "grad_norm": 0.13621723651885986, "learning_rate": 7.880721097776155e-06, "loss": 0.4673, "num_input_tokens_seen": 72628480, "step": 59715 }, { "epoch": 7.482771582508458, "grad_norm": 0.12597022950649261, "learning_rate": 7.880274225066915e-06, "loss": 0.464, "num_input_tokens_seen": 72634560, "step": 59720 }, { "epoch": 7.483398070417241, "grad_norm": 0.17445966601371765, "learning_rate": 7.879827317921573e-06, "loss": 0.4582, "num_input_tokens_seen": 72640640, "step": 59725 }, { "epoch": 7.484024558326024, "grad_norm": 0.1575392186641693, "learning_rate": 7.879380376345476e-06, "loss": 0.4558, "num_input_tokens_seen": 72646816, "step": 59730 }, { "epoch": 7.4846510462348075, "grad_norm": 0.12241371721029282, "learning_rate": 7.878933400343965e-06, "loss": 0.4616, "num_input_tokens_seen": 72652864, "step": 59735 }, { "epoch": 7.485277534143591, "grad_norm": 0.17855022847652435, "learning_rate": 7.878486389922384e-06, "loss": 0.4594, "num_input_tokens_seen": 72659200, "step": 59740 }, { "epoch": 7.485904022052375, "grad_norm": 0.1226334497332573, "learning_rate": 7.87803934508608e-06, "loss": 0.4717, "num_input_tokens_seen": 72665408, "step": 59745 }, { "epoch": 7.486530509961158, "grad_norm": 0.10254469513893127, "learning_rate": 7.877592265840395e-06, "loss": 0.458, "num_input_tokens_seen": 72670688, "step": 59750 }, { "epoch": 7.487156997869941, "grad_norm": 0.17003288865089417, "learning_rate": 7.877145152190675e-06, "loss": 0.4601, "num_input_tokens_seen": 72676928, "step": 59755 }, { "epoch": 7.487783485778724, "grad_norm": 0.11110929399728775, "learning_rate": 7.876698004142265e-06, "loss": 0.4624, "num_input_tokens_seen": 72683296, "step": 59760 }, { "epoch": 7.488409973687508, "grad_norm": 0.09638527780771255, "learning_rate": 7.876250821700514e-06, "loss": 0.4501, "num_input_tokens_seen": 72689632, "step": 59765 }, { "epoch": 7.489036461596291, "grad_norm": 0.10393938422203064, "learning_rate": 7.875803604870763e-06, "loss": 0.4633, "num_input_tokens_seen": 72695296, "step": 59770 }, { "epoch": 7.489662949505075, "grad_norm": 0.10203871876001358, "learning_rate": 7.875356353658364e-06, "loss": 0.4592, "num_input_tokens_seen": 72701216, "step": 59775 }, { "epoch": 7.490289437413858, "grad_norm": 0.11976508051156998, "learning_rate": 7.874909068068663e-06, "loss": 0.4627, "num_input_tokens_seen": 72707360, "step": 59780 }, { "epoch": 7.490915925322641, "grad_norm": 0.10563911497592926, "learning_rate": 7.874461748107006e-06, "loss": 0.4696, "num_input_tokens_seen": 72713440, "step": 59785 }, { "epoch": 7.491542413231425, "grad_norm": 0.10560856014490128, "learning_rate": 7.874014393778742e-06, "loss": 0.4594, "num_input_tokens_seen": 72719712, "step": 59790 }, { "epoch": 7.492168901140208, "grad_norm": 0.12460537999868393, "learning_rate": 7.87356700508922e-06, "loss": 0.4711, "num_input_tokens_seen": 72725888, "step": 59795 }, { "epoch": 7.492795389048991, "grad_norm": 0.09315557777881622, "learning_rate": 7.873119582043787e-06, "loss": 0.4691, "num_input_tokens_seen": 72732000, "step": 59800 }, { "epoch": 7.4934218769577745, "grad_norm": 0.12175092846155167, "learning_rate": 7.872672124647795e-06, "loss": 0.455, "num_input_tokens_seen": 72738240, "step": 59805 }, { "epoch": 7.494048364866558, "grad_norm": 0.09472040086984634, "learning_rate": 7.872224632906594e-06, "loss": 0.4688, "num_input_tokens_seen": 72744032, "step": 59810 }, { "epoch": 7.494674852775342, "grad_norm": 0.10007534176111221, "learning_rate": 7.87177710682553e-06, "loss": 0.4651, "num_input_tokens_seen": 72749504, "step": 59815 }, { "epoch": 7.495301340684125, "grad_norm": 0.14100943505764008, "learning_rate": 7.871329546409958e-06, "loss": 0.4683, "num_input_tokens_seen": 72755616, "step": 59820 }, { "epoch": 7.495927828592908, "grad_norm": 0.07524634152650833, "learning_rate": 7.870881951665226e-06, "loss": 0.4622, "num_input_tokens_seen": 72761600, "step": 59825 }, { "epoch": 7.496554316501691, "grad_norm": 0.08446656912565231, "learning_rate": 7.870434322596687e-06, "loss": 0.4646, "num_input_tokens_seen": 72767232, "step": 59830 }, { "epoch": 7.497180804410474, "grad_norm": 0.12002073228359222, "learning_rate": 7.869986659209694e-06, "loss": 0.4642, "num_input_tokens_seen": 72773088, "step": 59835 }, { "epoch": 7.4978072923192585, "grad_norm": 0.09039546549320221, "learning_rate": 7.869538961509596e-06, "loss": 0.459, "num_input_tokens_seen": 72779328, "step": 59840 }, { "epoch": 7.498433780228042, "grad_norm": 0.08228766918182373, "learning_rate": 7.869091229501747e-06, "loss": 0.4638, "num_input_tokens_seen": 72785568, "step": 59845 }, { "epoch": 7.499060268136825, "grad_norm": 0.12399762123823166, "learning_rate": 7.8686434631915e-06, "loss": 0.4676, "num_input_tokens_seen": 72791648, "step": 59850 }, { "epoch": 7.499686756045608, "grad_norm": 0.1273512989282608, "learning_rate": 7.86819566258421e-06, "loss": 0.459, "num_input_tokens_seen": 72797472, "step": 59855 }, { "epoch": 7.500313243954392, "grad_norm": 0.10309914499521255, "learning_rate": 7.867747827685229e-06, "loss": 0.4542, "num_input_tokens_seen": 72803616, "step": 59860 }, { "epoch": 7.500939731863175, "grad_norm": 0.096041239798069, "learning_rate": 7.86729995849991e-06, "loss": 0.4552, "num_input_tokens_seen": 72809376, "step": 59865 }, { "epoch": 7.501566219771958, "grad_norm": 0.06555814296007156, "learning_rate": 7.86685205503361e-06, "loss": 0.4672, "num_input_tokens_seen": 72815744, "step": 59870 }, { "epoch": 7.5021927076807415, "grad_norm": 0.0783412978053093, "learning_rate": 7.866404117291684e-06, "loss": 0.4596, "num_input_tokens_seen": 72821920, "step": 59875 }, { "epoch": 7.502819195589526, "grad_norm": 0.1256285458803177, "learning_rate": 7.865956145279486e-06, "loss": 0.4594, "num_input_tokens_seen": 72828352, "step": 59880 }, { "epoch": 7.503445683498309, "grad_norm": 0.08675438910722733, "learning_rate": 7.865508139002374e-06, "loss": 0.4686, "num_input_tokens_seen": 72834176, "step": 59885 }, { "epoch": 7.504072171407092, "grad_norm": 0.0862150564789772, "learning_rate": 7.865060098465702e-06, "loss": 0.4708, "num_input_tokens_seen": 72839904, "step": 59890 }, { "epoch": 7.504698659315875, "grad_norm": 0.07906659692525864, "learning_rate": 7.864612023674829e-06, "loss": 0.4558, "num_input_tokens_seen": 72844704, "step": 59895 }, { "epoch": 7.505325147224658, "grad_norm": 0.053696442395448685, "learning_rate": 7.86416391463511e-06, "loss": 0.4614, "num_input_tokens_seen": 72851040, "step": 59900 }, { "epoch": 7.505951635133442, "grad_norm": 0.11188028752803802, "learning_rate": 7.863715771351904e-06, "loss": 0.4646, "num_input_tokens_seen": 72857536, "step": 59905 }, { "epoch": 7.5065781230422255, "grad_norm": 0.11557979136705399, "learning_rate": 7.863267593830565e-06, "loss": 0.4613, "num_input_tokens_seen": 72863808, "step": 59910 }, { "epoch": 7.507204610951009, "grad_norm": 0.14336535334587097, "learning_rate": 7.862819382076458e-06, "loss": 0.4488, "num_input_tokens_seen": 72870144, "step": 59915 }, { "epoch": 7.507831098859792, "grad_norm": 0.13696430623531342, "learning_rate": 7.862371136094937e-06, "loss": 0.4768, "num_input_tokens_seen": 72875744, "step": 59920 }, { "epoch": 7.508457586768575, "grad_norm": 0.08556203544139862, "learning_rate": 7.861922855891364e-06, "loss": 0.4683, "num_input_tokens_seen": 72882208, "step": 59925 }, { "epoch": 7.509084074677359, "grad_norm": 0.11450889706611633, "learning_rate": 7.861474541471095e-06, "loss": 0.4672, "num_input_tokens_seen": 72888512, "step": 59930 }, { "epoch": 7.509710562586142, "grad_norm": 0.12481933832168579, "learning_rate": 7.861026192839494e-06, "loss": 0.4597, "num_input_tokens_seen": 72894976, "step": 59935 }, { "epoch": 7.510337050494925, "grad_norm": 0.07372651994228363, "learning_rate": 7.860577810001916e-06, "loss": 0.4579, "num_input_tokens_seen": 72901024, "step": 59940 }, { "epoch": 7.510963538403709, "grad_norm": 0.09687589854001999, "learning_rate": 7.86012939296373e-06, "loss": 0.4593, "num_input_tokens_seen": 72907040, "step": 59945 }, { "epoch": 7.511590026312492, "grad_norm": 0.0748186782002449, "learning_rate": 7.85968094173029e-06, "loss": 0.4607, "num_input_tokens_seen": 72913120, "step": 59950 }, { "epoch": 7.512216514221276, "grad_norm": 0.06528206914663315, "learning_rate": 7.85923245630696e-06, "loss": 0.4633, "num_input_tokens_seen": 72919200, "step": 59955 }, { "epoch": 7.512843002130059, "grad_norm": 0.09728902578353882, "learning_rate": 7.858783936699103e-06, "loss": 0.4624, "num_input_tokens_seen": 72925056, "step": 59960 }, { "epoch": 7.513469490038842, "grad_norm": 0.06296791881322861, "learning_rate": 7.858335382912079e-06, "loss": 0.4666, "num_input_tokens_seen": 72931392, "step": 59965 }, { "epoch": 7.514095977947625, "grad_norm": 0.0507495179772377, "learning_rate": 7.857886794951254e-06, "loss": 0.4598, "num_input_tokens_seen": 72937216, "step": 59970 }, { "epoch": 7.514722465856409, "grad_norm": 0.11176472157239914, "learning_rate": 7.857438172821988e-06, "loss": 0.469, "num_input_tokens_seen": 72943456, "step": 59975 }, { "epoch": 7.5153489537651925, "grad_norm": 0.10029478371143341, "learning_rate": 7.856989516529647e-06, "loss": 0.4623, "num_input_tokens_seen": 72949344, "step": 59980 }, { "epoch": 7.515975441673976, "grad_norm": 0.07653405517339706, "learning_rate": 7.856540826079595e-06, "loss": 0.4645, "num_input_tokens_seen": 72955072, "step": 59985 }, { "epoch": 7.516601929582759, "grad_norm": 0.08599680662155151, "learning_rate": 7.856092101477196e-06, "loss": 0.4649, "num_input_tokens_seen": 72961024, "step": 59990 }, { "epoch": 7.517228417491542, "grad_norm": 0.08515501022338867, "learning_rate": 7.855643342727812e-06, "loss": 0.4573, "num_input_tokens_seen": 72967168, "step": 59995 }, { "epoch": 7.517854905400326, "grad_norm": 0.08552978187799454, "learning_rate": 7.855194549836814e-06, "loss": 0.4598, "num_input_tokens_seen": 72973440, "step": 60000 }, { "epoch": 7.518481393309109, "grad_norm": 0.047913193702697754, "learning_rate": 7.854745722809563e-06, "loss": 0.4677, "num_input_tokens_seen": 72979680, "step": 60005 }, { "epoch": 7.519107881217892, "grad_norm": 0.0994889959692955, "learning_rate": 7.854296861651427e-06, "loss": 0.4669, "num_input_tokens_seen": 72985824, "step": 60010 }, { "epoch": 7.519734369126676, "grad_norm": 0.04721603915095329, "learning_rate": 7.853847966367771e-06, "loss": 0.4596, "num_input_tokens_seen": 72991904, "step": 60015 }, { "epoch": 7.52036085703546, "grad_norm": 0.1028938814997673, "learning_rate": 7.853399036963966e-06, "loss": 0.4686, "num_input_tokens_seen": 72997824, "step": 60020 }, { "epoch": 7.520987344944243, "grad_norm": 0.08541373908519745, "learning_rate": 7.852950073445374e-06, "loss": 0.4614, "num_input_tokens_seen": 73003904, "step": 60025 }, { "epoch": 7.521613832853026, "grad_norm": 0.07197292149066925, "learning_rate": 7.852501075817368e-06, "loss": 0.4655, "num_input_tokens_seen": 73009952, "step": 60030 }, { "epoch": 7.522240320761809, "grad_norm": 0.0823705866932869, "learning_rate": 7.852052044085311e-06, "loss": 0.4637, "num_input_tokens_seen": 73016000, "step": 60035 }, { "epoch": 7.522866808670592, "grad_norm": 0.05035458505153656, "learning_rate": 7.851602978254575e-06, "loss": 0.4551, "num_input_tokens_seen": 73021824, "step": 60040 }, { "epoch": 7.523493296579376, "grad_norm": 0.08724740147590637, "learning_rate": 7.851153878330527e-06, "loss": 0.4708, "num_input_tokens_seen": 73027616, "step": 60045 }, { "epoch": 7.5241197844881595, "grad_norm": 0.07749151438474655, "learning_rate": 7.850704744318537e-06, "loss": 0.4629, "num_input_tokens_seen": 73033728, "step": 60050 }, { "epoch": 7.524746272396943, "grad_norm": 0.09453758597373962, "learning_rate": 7.850255576223975e-06, "loss": 0.4691, "num_input_tokens_seen": 73040032, "step": 60055 }, { "epoch": 7.525372760305726, "grad_norm": 0.07823532074689865, "learning_rate": 7.849806374052213e-06, "loss": 0.4652, "num_input_tokens_seen": 73046208, "step": 60060 }, { "epoch": 7.525999248214509, "grad_norm": 0.08225814998149872, "learning_rate": 7.84935713780862e-06, "loss": 0.4739, "num_input_tokens_seen": 73052288, "step": 60065 }, { "epoch": 7.526625736123293, "grad_norm": 0.0965476855635643, "learning_rate": 7.848907867498565e-06, "loss": 0.4588, "num_input_tokens_seen": 73057760, "step": 60070 }, { "epoch": 7.527252224032076, "grad_norm": 0.07719837874174118, "learning_rate": 7.848458563127422e-06, "loss": 0.4692, "num_input_tokens_seen": 73063392, "step": 60075 }, { "epoch": 7.5278787119408594, "grad_norm": 0.09852321445941925, "learning_rate": 7.848009224700561e-06, "loss": 0.4645, "num_input_tokens_seen": 73068960, "step": 60080 }, { "epoch": 7.528505199849643, "grad_norm": 0.07245352864265442, "learning_rate": 7.847559852223357e-06, "loss": 0.4628, "num_input_tokens_seen": 73075008, "step": 60085 }, { "epoch": 7.529131687758426, "grad_norm": 0.08745697885751724, "learning_rate": 7.84711044570118e-06, "loss": 0.4655, "num_input_tokens_seen": 73080448, "step": 60090 }, { "epoch": 7.52975817566721, "grad_norm": 0.10367011278867722, "learning_rate": 7.846661005139403e-06, "loss": 0.4632, "num_input_tokens_seen": 73086560, "step": 60095 }, { "epoch": 7.530384663575993, "grad_norm": 0.07941819727420807, "learning_rate": 7.846211530543401e-06, "loss": 0.4607, "num_input_tokens_seen": 73092544, "step": 60100 }, { "epoch": 7.531011151484776, "grad_norm": 0.08705094456672668, "learning_rate": 7.845762021918548e-06, "loss": 0.4606, "num_input_tokens_seen": 73098624, "step": 60105 }, { "epoch": 7.531637639393559, "grad_norm": 0.07647925615310669, "learning_rate": 7.845312479270217e-06, "loss": 0.4653, "num_input_tokens_seen": 73104256, "step": 60110 }, { "epoch": 7.532264127302343, "grad_norm": 0.1333175003528595, "learning_rate": 7.844862902603783e-06, "loss": 0.4588, "num_input_tokens_seen": 73110304, "step": 60115 }, { "epoch": 7.532890615211127, "grad_norm": 0.1338430941104889, "learning_rate": 7.844413291924623e-06, "loss": 0.4636, "num_input_tokens_seen": 73116288, "step": 60120 }, { "epoch": 7.53351710311991, "grad_norm": 0.08090867102146149, "learning_rate": 7.843963647238107e-06, "loss": 0.465, "num_input_tokens_seen": 73122368, "step": 60125 }, { "epoch": 7.534143591028693, "grad_norm": 0.08391711860895157, "learning_rate": 7.843513968549616e-06, "loss": 0.4622, "num_input_tokens_seen": 73128608, "step": 60130 }, { "epoch": 7.534770078937477, "grad_norm": 0.17067588865756989, "learning_rate": 7.843064255864526e-06, "loss": 0.4645, "num_input_tokens_seen": 73134880, "step": 60135 }, { "epoch": 7.53539656684626, "grad_norm": 0.12164328247308731, "learning_rate": 7.842614509188212e-06, "loss": 0.4618, "num_input_tokens_seen": 73141120, "step": 60140 }, { "epoch": 7.536023054755043, "grad_norm": 0.10755801945924759, "learning_rate": 7.842164728526052e-06, "loss": 0.4599, "num_input_tokens_seen": 73147360, "step": 60145 }, { "epoch": 7.5366495426638265, "grad_norm": 0.10187356919050217, "learning_rate": 7.841714913883422e-06, "loss": 0.4621, "num_input_tokens_seen": 73153440, "step": 60150 }, { "epoch": 7.53727603057261, "grad_norm": 0.08038346469402313, "learning_rate": 7.841265065265701e-06, "loss": 0.4579, "num_input_tokens_seen": 73159968, "step": 60155 }, { "epoch": 7.537902518481394, "grad_norm": 0.09486449509859085, "learning_rate": 7.840815182678268e-06, "loss": 0.467, "num_input_tokens_seen": 73166176, "step": 60160 }, { "epoch": 7.538529006390177, "grad_norm": 0.07634439319372177, "learning_rate": 7.840365266126501e-06, "loss": 0.4633, "num_input_tokens_seen": 73172256, "step": 60165 }, { "epoch": 7.53915549429896, "grad_norm": 0.08200334012508392, "learning_rate": 7.839915315615778e-06, "loss": 0.4586, "num_input_tokens_seen": 73178656, "step": 60170 }, { "epoch": 7.539781982207743, "grad_norm": 0.10564250499010086, "learning_rate": 7.83946533115148e-06, "loss": 0.4617, "num_input_tokens_seen": 73184928, "step": 60175 }, { "epoch": 7.540408470116526, "grad_norm": 0.05159701034426689, "learning_rate": 7.839015312738989e-06, "loss": 0.4629, "num_input_tokens_seen": 73191008, "step": 60180 }, { "epoch": 7.54103495802531, "grad_norm": 0.07799127697944641, "learning_rate": 7.83856526038368e-06, "loss": 0.464, "num_input_tokens_seen": 73196960, "step": 60185 }, { "epoch": 7.541661445934094, "grad_norm": 0.07537683844566345, "learning_rate": 7.838115174090939e-06, "loss": 0.4623, "num_input_tokens_seen": 73203008, "step": 60190 }, { "epoch": 7.542287933842877, "grad_norm": 0.09621062129735947, "learning_rate": 7.837665053866144e-06, "loss": 0.4566, "num_input_tokens_seen": 73209248, "step": 60195 }, { "epoch": 7.54291442175166, "grad_norm": 0.08003068715333939, "learning_rate": 7.837214899714678e-06, "loss": 0.4669, "num_input_tokens_seen": 73215104, "step": 60200 }, { "epoch": 7.543540909660443, "grad_norm": 0.09407524019479752, "learning_rate": 7.836764711641923e-06, "loss": 0.4621, "num_input_tokens_seen": 73221184, "step": 60205 }, { "epoch": 7.544167397569227, "grad_norm": 0.08533766120672226, "learning_rate": 7.836314489653258e-06, "loss": 0.467, "num_input_tokens_seen": 73227232, "step": 60210 }, { "epoch": 7.54479388547801, "grad_norm": 0.07536657154560089, "learning_rate": 7.835864233754072e-06, "loss": 0.4603, "num_input_tokens_seen": 73233408, "step": 60215 }, { "epoch": 7.5454203733867935, "grad_norm": 0.10992363095283508, "learning_rate": 7.835413943949744e-06, "loss": 0.4562, "num_input_tokens_seen": 73239552, "step": 60220 }, { "epoch": 7.546046861295577, "grad_norm": 0.12525272369384766, "learning_rate": 7.834963620245657e-06, "loss": 0.4632, "num_input_tokens_seen": 73245152, "step": 60225 }, { "epoch": 7.546673349204361, "grad_norm": 0.08246634900569916, "learning_rate": 7.834513262647197e-06, "loss": 0.4629, "num_input_tokens_seen": 73251392, "step": 60230 }, { "epoch": 7.547299837113144, "grad_norm": 0.09946245700120926, "learning_rate": 7.834062871159748e-06, "loss": 0.4586, "num_input_tokens_seen": 73257440, "step": 60235 }, { "epoch": 7.547926325021927, "grad_norm": 0.07920007407665253, "learning_rate": 7.833612445788694e-06, "loss": 0.4611, "num_input_tokens_seen": 73263680, "step": 60240 }, { "epoch": 7.54855281293071, "grad_norm": 0.10034874081611633, "learning_rate": 7.83316198653942e-06, "loss": 0.4698, "num_input_tokens_seen": 73270080, "step": 60245 }, { "epoch": 7.549179300839494, "grad_norm": 0.05900077149271965, "learning_rate": 7.832711493417316e-06, "loss": 0.4583, "num_input_tokens_seen": 73276128, "step": 60250 }, { "epoch": 7.5498057887482775, "grad_norm": 0.12207821011543274, "learning_rate": 7.832260966427762e-06, "loss": 0.4623, "num_input_tokens_seen": 73282304, "step": 60255 }, { "epoch": 7.550432276657061, "grad_norm": 0.07621470838785172, "learning_rate": 7.831810405576147e-06, "loss": 0.4664, "num_input_tokens_seen": 73288448, "step": 60260 }, { "epoch": 7.551058764565844, "grad_norm": 0.0982615053653717, "learning_rate": 7.83135981086786e-06, "loss": 0.4565, "num_input_tokens_seen": 73294720, "step": 60265 }, { "epoch": 7.551685252474627, "grad_norm": 0.09266793727874756, "learning_rate": 7.830909182308283e-06, "loss": 0.4669, "num_input_tokens_seen": 73300512, "step": 60270 }, { "epoch": 7.552311740383411, "grad_norm": 0.08940102159976959, "learning_rate": 7.830458519902808e-06, "loss": 0.459, "num_input_tokens_seen": 73306656, "step": 60275 }, { "epoch": 7.552938228292194, "grad_norm": 0.08168869465589523, "learning_rate": 7.830007823656822e-06, "loss": 0.4593, "num_input_tokens_seen": 73313024, "step": 60280 }, { "epoch": 7.553564716200977, "grad_norm": 0.07367975264787674, "learning_rate": 7.829557093575712e-06, "loss": 0.4601, "num_input_tokens_seen": 73319008, "step": 60285 }, { "epoch": 7.5541912041097605, "grad_norm": 0.07997618615627289, "learning_rate": 7.829106329664868e-06, "loss": 0.4641, "num_input_tokens_seen": 73325216, "step": 60290 }, { "epoch": 7.554817692018544, "grad_norm": 0.05734550580382347, "learning_rate": 7.82865553192968e-06, "loss": 0.4594, "num_input_tokens_seen": 73331424, "step": 60295 }, { "epoch": 7.555444179927328, "grad_norm": 0.0960317850112915, "learning_rate": 7.828204700375537e-06, "loss": 0.4637, "num_input_tokens_seen": 73337088, "step": 60300 }, { "epoch": 7.556070667836111, "grad_norm": 0.12229621410369873, "learning_rate": 7.827753835007828e-06, "loss": 0.4687, "num_input_tokens_seen": 73343200, "step": 60305 }, { "epoch": 7.556697155744894, "grad_norm": 0.0768970474600792, "learning_rate": 7.827302935831947e-06, "loss": 0.4663, "num_input_tokens_seen": 73349152, "step": 60310 }, { "epoch": 7.557323643653677, "grad_norm": 0.07085330039262772, "learning_rate": 7.82685200285328e-06, "loss": 0.4662, "num_input_tokens_seen": 73355168, "step": 60315 }, { "epoch": 7.55795013156246, "grad_norm": 0.045006655156612396, "learning_rate": 7.82640103607722e-06, "loss": 0.4611, "num_input_tokens_seen": 73360992, "step": 60320 }, { "epoch": 7.5585766194712445, "grad_norm": 0.07446147501468658, "learning_rate": 7.825950035509162e-06, "loss": 0.46, "num_input_tokens_seen": 73367360, "step": 60325 }, { "epoch": 7.559203107380028, "grad_norm": 0.0761432871222496, "learning_rate": 7.825499001154493e-06, "loss": 0.4632, "num_input_tokens_seen": 73373536, "step": 60330 }, { "epoch": 7.559829595288811, "grad_norm": 0.09616698324680328, "learning_rate": 7.825047933018608e-06, "loss": 0.4579, "num_input_tokens_seen": 73379264, "step": 60335 }, { "epoch": 7.560456083197594, "grad_norm": 0.06950421631336212, "learning_rate": 7.8245968311069e-06, "loss": 0.465, "num_input_tokens_seen": 73384992, "step": 60340 }, { "epoch": 7.561082571106377, "grad_norm": 0.12107399106025696, "learning_rate": 7.824145695424763e-06, "loss": 0.4651, "num_input_tokens_seen": 73391200, "step": 60345 }, { "epoch": 7.561709059015161, "grad_norm": 0.07167398929595947, "learning_rate": 7.82369452597759e-06, "loss": 0.4644, "num_input_tokens_seen": 73397216, "step": 60350 }, { "epoch": 7.562335546923944, "grad_norm": 0.13221049308776855, "learning_rate": 7.823243322770776e-06, "loss": 0.4637, "num_input_tokens_seen": 73403680, "step": 60355 }, { "epoch": 7.5629620348327276, "grad_norm": 0.11931444704532623, "learning_rate": 7.82279208580971e-06, "loss": 0.4646, "num_input_tokens_seen": 73409856, "step": 60360 }, { "epoch": 7.563588522741511, "grad_norm": 0.0773855671286583, "learning_rate": 7.822340815099794e-06, "loss": 0.4613, "num_input_tokens_seen": 73416192, "step": 60365 }, { "epoch": 7.564215010650295, "grad_norm": 0.11490286141633987, "learning_rate": 7.821889510646422e-06, "loss": 0.4627, "num_input_tokens_seen": 73422272, "step": 60370 }, { "epoch": 7.564841498559078, "grad_norm": 0.07688029855489731, "learning_rate": 7.821438172454986e-06, "loss": 0.4576, "num_input_tokens_seen": 73428320, "step": 60375 }, { "epoch": 7.565467986467861, "grad_norm": 0.07453805208206177, "learning_rate": 7.820986800530886e-06, "loss": 0.4621, "num_input_tokens_seen": 73434400, "step": 60380 }, { "epoch": 7.566094474376644, "grad_norm": 0.08971692621707916, "learning_rate": 7.820535394879518e-06, "loss": 0.4573, "num_input_tokens_seen": 73440448, "step": 60385 }, { "epoch": 7.566720962285428, "grad_norm": 0.13606944680213928, "learning_rate": 7.820083955506275e-06, "loss": 0.4544, "num_input_tokens_seen": 73446336, "step": 60390 }, { "epoch": 7.5673474501942115, "grad_norm": 0.09168094396591187, "learning_rate": 7.81963248241656e-06, "loss": 0.462, "num_input_tokens_seen": 73452608, "step": 60395 }, { "epoch": 7.567973938102995, "grad_norm": 0.09482761472463608, "learning_rate": 7.819180975615767e-06, "loss": 0.4627, "num_input_tokens_seen": 73458848, "step": 60400 }, { "epoch": 7.568600426011778, "grad_norm": 0.06634880602359772, "learning_rate": 7.818729435109296e-06, "loss": 0.4628, "num_input_tokens_seen": 73465088, "step": 60405 }, { "epoch": 7.569226913920561, "grad_norm": 0.11220288276672363, "learning_rate": 7.818277860902544e-06, "loss": 0.4536, "num_input_tokens_seen": 73471488, "step": 60410 }, { "epoch": 7.569853401829345, "grad_norm": 0.09789948910474777, "learning_rate": 7.817826253000909e-06, "loss": 0.4596, "num_input_tokens_seen": 73477504, "step": 60415 }, { "epoch": 7.570479889738128, "grad_norm": 0.07676392048597336, "learning_rate": 7.817374611409794e-06, "loss": 0.4646, "num_input_tokens_seen": 73484032, "step": 60420 }, { "epoch": 7.571106377646911, "grad_norm": 0.05411001294851303, "learning_rate": 7.816922936134596e-06, "loss": 0.4548, "num_input_tokens_seen": 73490304, "step": 60425 }, { "epoch": 7.571732865555695, "grad_norm": 0.05285145714879036, "learning_rate": 7.816471227180717e-06, "loss": 0.462, "num_input_tokens_seen": 73496512, "step": 60430 }, { "epoch": 7.572359353464478, "grad_norm": 0.10877077281475067, "learning_rate": 7.816019484553556e-06, "loss": 0.4601, "num_input_tokens_seen": 73502784, "step": 60435 }, { "epoch": 7.572985841373262, "grad_norm": 0.0781221091747284, "learning_rate": 7.815567708258513e-06, "loss": 0.4606, "num_input_tokens_seen": 73508704, "step": 60440 }, { "epoch": 7.573612329282045, "grad_norm": 0.17864583432674408, "learning_rate": 7.815115898300994e-06, "loss": 0.4634, "num_input_tokens_seen": 73514784, "step": 60445 }, { "epoch": 7.574238817190828, "grad_norm": 0.1138959676027298, "learning_rate": 7.814664054686397e-06, "loss": 0.465, "num_input_tokens_seen": 73520256, "step": 60450 }, { "epoch": 7.574865305099611, "grad_norm": 0.0862165242433548, "learning_rate": 7.814212177420123e-06, "loss": 0.4651, "num_input_tokens_seen": 73526560, "step": 60455 }, { "epoch": 7.5754917930083945, "grad_norm": 0.09329953789710999, "learning_rate": 7.813760266507577e-06, "loss": 0.4668, "num_input_tokens_seen": 73532768, "step": 60460 }, { "epoch": 7.5761182809171785, "grad_norm": 0.0700840950012207, "learning_rate": 7.81330832195416e-06, "loss": 0.4678, "num_input_tokens_seen": 73538528, "step": 60465 }, { "epoch": 7.576744768825962, "grad_norm": 0.10151297599077225, "learning_rate": 7.81285634376528e-06, "loss": 0.4674, "num_input_tokens_seen": 73543872, "step": 60470 }, { "epoch": 7.577371256734745, "grad_norm": 0.10016459226608276, "learning_rate": 7.812404331946335e-06, "loss": 0.4561, "num_input_tokens_seen": 73549952, "step": 60475 }, { "epoch": 7.577997744643528, "grad_norm": 0.07963193207979202, "learning_rate": 7.811952286502733e-06, "loss": 0.4668, "num_input_tokens_seen": 73556256, "step": 60480 }, { "epoch": 7.578624232552312, "grad_norm": 0.08926644176244736, "learning_rate": 7.811500207439874e-06, "loss": 0.4627, "num_input_tokens_seen": 73562496, "step": 60485 }, { "epoch": 7.579250720461095, "grad_norm": 0.07637513428926468, "learning_rate": 7.811048094763171e-06, "loss": 0.46, "num_input_tokens_seen": 73568480, "step": 60490 }, { "epoch": 7.579877208369878, "grad_norm": 0.06996327638626099, "learning_rate": 7.810595948478022e-06, "loss": 0.4637, "num_input_tokens_seen": 73574144, "step": 60495 }, { "epoch": 7.580503696278662, "grad_norm": 0.09781254827976227, "learning_rate": 7.810143768589835e-06, "loss": 0.4644, "num_input_tokens_seen": 73579968, "step": 60500 }, { "epoch": 7.581130184187446, "grad_norm": 0.1105501800775528, "learning_rate": 7.809691555104017e-06, "loss": 0.463, "num_input_tokens_seen": 73586656, "step": 60505 }, { "epoch": 7.581756672096229, "grad_norm": 0.047393471002578735, "learning_rate": 7.809239308025976e-06, "loss": 0.4609, "num_input_tokens_seen": 73592384, "step": 60510 }, { "epoch": 7.582383160005012, "grad_norm": 0.09354148060083389, "learning_rate": 7.808787027361116e-06, "loss": 0.459, "num_input_tokens_seen": 73598112, "step": 60515 }, { "epoch": 7.583009647913795, "grad_norm": 0.10761623829603195, "learning_rate": 7.808334713114845e-06, "loss": 0.4673, "num_input_tokens_seen": 73604352, "step": 60520 }, { "epoch": 7.583636135822578, "grad_norm": 0.10311827063560486, "learning_rate": 7.807882365292572e-06, "loss": 0.4651, "num_input_tokens_seen": 73609792, "step": 60525 }, { "epoch": 7.584262623731362, "grad_norm": 0.06959571689367294, "learning_rate": 7.807429983899704e-06, "loss": 0.4607, "num_input_tokens_seen": 73616064, "step": 60530 }, { "epoch": 7.584889111640146, "grad_norm": 0.08787348121404648, "learning_rate": 7.806977568941652e-06, "loss": 0.4636, "num_input_tokens_seen": 73622656, "step": 60535 }, { "epoch": 7.585515599548929, "grad_norm": 0.06677315384149551, "learning_rate": 7.80652512042382e-06, "loss": 0.4581, "num_input_tokens_seen": 73628768, "step": 60540 }, { "epoch": 7.586142087457712, "grad_norm": 0.10397671908140182, "learning_rate": 7.806072638351623e-06, "loss": 0.4641, "num_input_tokens_seen": 73634976, "step": 60545 }, { "epoch": 7.586768575366495, "grad_norm": 0.07987008988857269, "learning_rate": 7.805620122730468e-06, "loss": 0.4636, "num_input_tokens_seen": 73640672, "step": 60550 }, { "epoch": 7.587395063275279, "grad_norm": 0.10471725463867188, "learning_rate": 7.805167573565766e-06, "loss": 0.4608, "num_input_tokens_seen": 73646944, "step": 60555 }, { "epoch": 7.588021551184062, "grad_norm": 0.07113601267337799, "learning_rate": 7.804714990862925e-06, "loss": 0.4614, "num_input_tokens_seen": 73652480, "step": 60560 }, { "epoch": 7.5886480390928455, "grad_norm": 0.0917455330491066, "learning_rate": 7.80426237462736e-06, "loss": 0.4613, "num_input_tokens_seen": 73658912, "step": 60565 }, { "epoch": 7.589274527001629, "grad_norm": 0.08081971853971481, "learning_rate": 7.80380972486448e-06, "loss": 0.4684, "num_input_tokens_seen": 73664960, "step": 60570 }, { "epoch": 7.589901014910412, "grad_norm": 0.07526509463787079, "learning_rate": 7.803357041579698e-06, "loss": 0.46, "num_input_tokens_seen": 73671328, "step": 60575 }, { "epoch": 7.590527502819196, "grad_norm": 0.07372860610485077, "learning_rate": 7.802904324778426e-06, "loss": 0.458, "num_input_tokens_seen": 73677152, "step": 60580 }, { "epoch": 7.591153990727979, "grad_norm": 0.08739978820085526, "learning_rate": 7.802451574466077e-06, "loss": 0.4618, "num_input_tokens_seen": 73683232, "step": 60585 }, { "epoch": 7.591780478636762, "grad_norm": 0.08032315969467163, "learning_rate": 7.80199879064806e-06, "loss": 0.4637, "num_input_tokens_seen": 73689184, "step": 60590 }, { "epoch": 7.592406966545545, "grad_norm": 0.1074765995144844, "learning_rate": 7.801545973329794e-06, "loss": 0.4585, "num_input_tokens_seen": 73695584, "step": 60595 }, { "epoch": 7.5930334544543285, "grad_norm": 0.09754516929388046, "learning_rate": 7.80109312251669e-06, "loss": 0.4491, "num_input_tokens_seen": 73701632, "step": 60600 }, { "epoch": 7.593659942363113, "grad_norm": 0.07222886383533478, "learning_rate": 7.800640238214163e-06, "loss": 0.4591, "num_input_tokens_seen": 73707968, "step": 60605 }, { "epoch": 7.594286430271896, "grad_norm": 0.10734989494085312, "learning_rate": 7.800187320427627e-06, "loss": 0.4667, "num_input_tokens_seen": 73714304, "step": 60610 }, { "epoch": 7.594912918180679, "grad_norm": 0.10531765222549438, "learning_rate": 7.799734369162497e-06, "loss": 0.4667, "num_input_tokens_seen": 73720320, "step": 60615 }, { "epoch": 7.595539406089462, "grad_norm": 0.10839463770389557, "learning_rate": 7.79928138442419e-06, "loss": 0.462, "num_input_tokens_seen": 73726784, "step": 60620 }, { "epoch": 7.596165893998246, "grad_norm": 0.052062202244997025, "learning_rate": 7.798828366218119e-06, "loss": 0.4542, "num_input_tokens_seen": 73732384, "step": 60625 }, { "epoch": 7.596792381907029, "grad_norm": 0.08538007736206055, "learning_rate": 7.798375314549701e-06, "loss": 0.4654, "num_input_tokens_seen": 73738624, "step": 60630 }, { "epoch": 7.5974188698158125, "grad_norm": 0.13337112963199615, "learning_rate": 7.797922229424358e-06, "loss": 0.4649, "num_input_tokens_seen": 73744736, "step": 60635 }, { "epoch": 7.598045357724596, "grad_norm": 0.10226787626743317, "learning_rate": 7.797469110847498e-06, "loss": 0.4698, "num_input_tokens_seen": 73750816, "step": 60640 }, { "epoch": 7.59867184563338, "grad_norm": 0.08442544937133789, "learning_rate": 7.797015958824543e-06, "loss": 0.4695, "num_input_tokens_seen": 73757120, "step": 60645 }, { "epoch": 7.599298333542163, "grad_norm": 0.07381895184516907, "learning_rate": 7.796562773360912e-06, "loss": 0.4572, "num_input_tokens_seen": 73763232, "step": 60650 }, { "epoch": 7.599924821450946, "grad_norm": 0.10680864751338959, "learning_rate": 7.796109554462022e-06, "loss": 0.4622, "num_input_tokens_seen": 73769344, "step": 60655 }, { "epoch": 7.600551309359729, "grad_norm": 0.06917819380760193, "learning_rate": 7.795656302133291e-06, "loss": 0.46, "num_input_tokens_seen": 73775840, "step": 60660 }, { "epoch": 7.601177797268512, "grad_norm": 0.0628712847828865, "learning_rate": 7.795203016380138e-06, "loss": 0.4594, "num_input_tokens_seen": 73781984, "step": 60665 }, { "epoch": 7.6018042851772964, "grad_norm": 0.14351886510849, "learning_rate": 7.794749697207983e-06, "loss": 0.4666, "num_input_tokens_seen": 73787616, "step": 60670 }, { "epoch": 7.60243077308608, "grad_norm": 0.08399597555398941, "learning_rate": 7.794296344622246e-06, "loss": 0.459, "num_input_tokens_seen": 73793664, "step": 60675 }, { "epoch": 7.603057260994863, "grad_norm": 0.07656553387641907, "learning_rate": 7.793842958628345e-06, "loss": 0.4598, "num_input_tokens_seen": 73799456, "step": 60680 }, { "epoch": 7.603683748903646, "grad_norm": 0.07695082575082779, "learning_rate": 7.793389539231707e-06, "loss": 0.4687, "num_input_tokens_seen": 73805504, "step": 60685 }, { "epoch": 7.604310236812429, "grad_norm": 0.07835856080055237, "learning_rate": 7.792936086437745e-06, "loss": 0.4646, "num_input_tokens_seen": 73811264, "step": 60690 }, { "epoch": 7.604936724721213, "grad_norm": 0.058136045932769775, "learning_rate": 7.792482600251885e-06, "loss": 0.4547, "num_input_tokens_seen": 73817280, "step": 60695 }, { "epoch": 7.605563212629996, "grad_norm": 0.08035396039485931, "learning_rate": 7.792029080679546e-06, "loss": 0.4671, "num_input_tokens_seen": 73823328, "step": 60700 }, { "epoch": 7.6061897005387795, "grad_norm": 0.10494078695774078, "learning_rate": 7.791575527726154e-06, "loss": 0.4619, "num_input_tokens_seen": 73829056, "step": 60705 }, { "epoch": 7.606816188447563, "grad_norm": 0.11871811002492905, "learning_rate": 7.791121941397129e-06, "loss": 0.4704, "num_input_tokens_seen": 73835168, "step": 60710 }, { "epoch": 7.607442676356346, "grad_norm": 0.0853390172123909, "learning_rate": 7.790668321697895e-06, "loss": 0.4614, "num_input_tokens_seen": 73841344, "step": 60715 }, { "epoch": 7.60806916426513, "grad_norm": 0.08117377012968063, "learning_rate": 7.790214668633874e-06, "loss": 0.4516, "num_input_tokens_seen": 73847872, "step": 60720 }, { "epoch": 7.608695652173913, "grad_norm": 0.07477378100156784, "learning_rate": 7.78976098221049e-06, "loss": 0.4595, "num_input_tokens_seen": 73853824, "step": 60725 }, { "epoch": 7.609322140082696, "grad_norm": 0.07990282773971558, "learning_rate": 7.78930726243317e-06, "loss": 0.4585, "num_input_tokens_seen": 73859904, "step": 60730 }, { "epoch": 7.609948627991479, "grad_norm": 0.0912807434797287, "learning_rate": 7.788853509307334e-06, "loss": 0.4539, "num_input_tokens_seen": 73865984, "step": 60735 }, { "epoch": 7.6105751159002635, "grad_norm": 0.07770004868507385, "learning_rate": 7.78839972283841e-06, "loss": 0.4601, "num_input_tokens_seen": 73871872, "step": 60740 }, { "epoch": 7.611201603809047, "grad_norm": 0.08483370393514633, "learning_rate": 7.787945903031825e-06, "loss": 0.4689, "num_input_tokens_seen": 73876960, "step": 60745 }, { "epoch": 7.61182809171783, "grad_norm": 0.12528853118419647, "learning_rate": 7.787492049893003e-06, "loss": 0.4597, "num_input_tokens_seen": 73882752, "step": 60750 }, { "epoch": 7.612454579626613, "grad_norm": 0.08147662878036499, "learning_rate": 7.787038163427368e-06, "loss": 0.456, "num_input_tokens_seen": 73887872, "step": 60755 }, { "epoch": 7.613081067535397, "grad_norm": 0.10616028308868408, "learning_rate": 7.786584243640349e-06, "loss": 0.4671, "num_input_tokens_seen": 73894016, "step": 60760 }, { "epoch": 7.61370755544418, "grad_norm": 0.0753728598356247, "learning_rate": 7.786130290537375e-06, "loss": 0.4581, "num_input_tokens_seen": 73900192, "step": 60765 }, { "epoch": 7.614334043352963, "grad_norm": 0.1222650557756424, "learning_rate": 7.785676304123868e-06, "loss": 0.4645, "num_input_tokens_seen": 73906208, "step": 60770 }, { "epoch": 7.6149605312617465, "grad_norm": 0.0852258950471878, "learning_rate": 7.785222284405259e-06, "loss": 0.4686, "num_input_tokens_seen": 73912416, "step": 60775 }, { "epoch": 7.61558701917053, "grad_norm": 0.12445611506700516, "learning_rate": 7.784768231386976e-06, "loss": 0.456, "num_input_tokens_seen": 73918496, "step": 60780 }, { "epoch": 7.616213507079314, "grad_norm": 0.07395032048225403, "learning_rate": 7.78431414507445e-06, "loss": 0.4643, "num_input_tokens_seen": 73924576, "step": 60785 }, { "epoch": 7.616839994988097, "grad_norm": 0.07865255326032639, "learning_rate": 7.783860025473104e-06, "loss": 0.457, "num_input_tokens_seen": 73930240, "step": 60790 }, { "epoch": 7.61746648289688, "grad_norm": 0.08565191924571991, "learning_rate": 7.783405872588373e-06, "loss": 0.4634, "num_input_tokens_seen": 73936256, "step": 60795 }, { "epoch": 7.618092970805663, "grad_norm": 0.08690600842237473, "learning_rate": 7.782951686425684e-06, "loss": 0.4627, "num_input_tokens_seen": 73942304, "step": 60800 }, { "epoch": 7.6187194587144464, "grad_norm": 0.12120769917964935, "learning_rate": 7.782497466990466e-06, "loss": 0.4522, "num_input_tokens_seen": 73948192, "step": 60805 }, { "epoch": 7.6193459466232305, "grad_norm": 0.08107887953519821, "learning_rate": 7.782043214288153e-06, "loss": 0.4597, "num_input_tokens_seen": 73954336, "step": 60810 }, { "epoch": 7.619972434532014, "grad_norm": 0.12367192655801773, "learning_rate": 7.781588928324175e-06, "loss": 0.4589, "num_input_tokens_seen": 73960640, "step": 60815 }, { "epoch": 7.620598922440797, "grad_norm": 0.09955224394798279, "learning_rate": 7.781134609103963e-06, "loss": 0.4637, "num_input_tokens_seen": 73966880, "step": 60820 }, { "epoch": 7.62122541034958, "grad_norm": 0.0857187882065773, "learning_rate": 7.780680256632946e-06, "loss": 0.4573, "num_input_tokens_seen": 73973088, "step": 60825 }, { "epoch": 7.621851898258363, "grad_norm": 0.10914194583892822, "learning_rate": 7.780225870916563e-06, "loss": 0.4694, "num_input_tokens_seen": 73979264, "step": 60830 }, { "epoch": 7.622478386167147, "grad_norm": 0.05812232941389084, "learning_rate": 7.779771451960238e-06, "loss": 0.4657, "num_input_tokens_seen": 73984672, "step": 60835 }, { "epoch": 7.62310487407593, "grad_norm": 0.10164394974708557, "learning_rate": 7.77931699976941e-06, "loss": 0.4566, "num_input_tokens_seen": 73990784, "step": 60840 }, { "epoch": 7.623731361984714, "grad_norm": 0.0554310567677021, "learning_rate": 7.77886251434951e-06, "loss": 0.4586, "num_input_tokens_seen": 73997376, "step": 60845 }, { "epoch": 7.624357849893497, "grad_norm": 0.13177749514579773, "learning_rate": 7.778407995705972e-06, "loss": 0.4602, "num_input_tokens_seen": 74002912, "step": 60850 }, { "epoch": 7.624984337802281, "grad_norm": 0.11312901973724365, "learning_rate": 7.777953443844231e-06, "loss": 0.4567, "num_input_tokens_seen": 74008800, "step": 60855 }, { "epoch": 7.625610825711064, "grad_norm": 0.10185568779706955, "learning_rate": 7.777498858769721e-06, "loss": 0.4655, "num_input_tokens_seen": 74014752, "step": 60860 }, { "epoch": 7.626237313619847, "grad_norm": 0.10254610329866409, "learning_rate": 7.777044240487877e-06, "loss": 0.4667, "num_input_tokens_seen": 74020928, "step": 60865 }, { "epoch": 7.62686380152863, "grad_norm": 0.0922025591135025, "learning_rate": 7.776589589004136e-06, "loss": 0.4703, "num_input_tokens_seen": 74026912, "step": 60870 }, { "epoch": 7.6274902894374135, "grad_norm": 0.08848290890455246, "learning_rate": 7.77613490432393e-06, "loss": 0.467, "num_input_tokens_seen": 74033120, "step": 60875 }, { "epoch": 7.6281167773461975, "grad_norm": 0.26125141978263855, "learning_rate": 7.775680186452697e-06, "loss": 0.4633, "num_input_tokens_seen": 74039328, "step": 60880 }, { "epoch": 7.628743265254981, "grad_norm": 0.10809006541967392, "learning_rate": 7.775225435395875e-06, "loss": 0.4659, "num_input_tokens_seen": 74045280, "step": 60885 }, { "epoch": 7.629369753163764, "grad_norm": 0.15183906257152557, "learning_rate": 7.774770651158897e-06, "loss": 0.4597, "num_input_tokens_seen": 74050240, "step": 60890 }, { "epoch": 7.629996241072547, "grad_norm": 0.12703639268875122, "learning_rate": 7.774315833747205e-06, "loss": 0.4598, "num_input_tokens_seen": 74056160, "step": 60895 }, { "epoch": 7.630622728981331, "grad_norm": 0.12936221063137054, "learning_rate": 7.773860983166235e-06, "loss": 0.4719, "num_input_tokens_seen": 74062368, "step": 60900 }, { "epoch": 7.631249216890114, "grad_norm": 0.08560998737812042, "learning_rate": 7.773406099421426e-06, "loss": 0.4545, "num_input_tokens_seen": 74068544, "step": 60905 }, { "epoch": 7.631875704798897, "grad_norm": 0.17379219830036163, "learning_rate": 7.772951182518213e-06, "loss": 0.4638, "num_input_tokens_seen": 74074912, "step": 60910 }, { "epoch": 7.632502192707681, "grad_norm": 0.12073031067848206, "learning_rate": 7.772496232462037e-06, "loss": 0.4674, "num_input_tokens_seen": 74080832, "step": 60915 }, { "epoch": 7.633128680616464, "grad_norm": 0.07834766060113907, "learning_rate": 7.772041249258339e-06, "loss": 0.4595, "num_input_tokens_seen": 74086784, "step": 60920 }, { "epoch": 7.633755168525248, "grad_norm": 0.14557640254497528, "learning_rate": 7.771586232912556e-06, "loss": 0.4554, "num_input_tokens_seen": 74092672, "step": 60925 }, { "epoch": 7.634381656434031, "grad_norm": 0.07861853390932083, "learning_rate": 7.77113118343013e-06, "loss": 0.4559, "num_input_tokens_seen": 74098752, "step": 60930 }, { "epoch": 7.635008144342814, "grad_norm": 0.09217581897974014, "learning_rate": 7.770676100816502e-06, "loss": 0.4621, "num_input_tokens_seen": 74104832, "step": 60935 }, { "epoch": 7.635634632251597, "grad_norm": 0.09086674451828003, "learning_rate": 7.77022098507711e-06, "loss": 0.4649, "num_input_tokens_seen": 74110816, "step": 60940 }, { "epoch": 7.6362611201603805, "grad_norm": 0.055371444672346115, "learning_rate": 7.769765836217398e-06, "loss": 0.4596, "num_input_tokens_seen": 74116928, "step": 60945 }, { "epoch": 7.636887608069165, "grad_norm": 0.12876656651496887, "learning_rate": 7.769310654242806e-06, "loss": 0.4627, "num_input_tokens_seen": 74122912, "step": 60950 }, { "epoch": 7.637514095977948, "grad_norm": 0.1365419179201126, "learning_rate": 7.768855439158778e-06, "loss": 0.4562, "num_input_tokens_seen": 74128800, "step": 60955 }, { "epoch": 7.638140583886731, "grad_norm": 0.11382424086332321, "learning_rate": 7.768400190970756e-06, "loss": 0.4676, "num_input_tokens_seen": 74134560, "step": 60960 }, { "epoch": 7.638767071795514, "grad_norm": 0.048723313957452774, "learning_rate": 7.767944909684179e-06, "loss": 0.462, "num_input_tokens_seen": 74140640, "step": 60965 }, { "epoch": 7.639393559704297, "grad_norm": 0.0854593962430954, "learning_rate": 7.767489595304496e-06, "loss": 0.469, "num_input_tokens_seen": 74146688, "step": 60970 }, { "epoch": 7.640020047613081, "grad_norm": 0.07737968116998672, "learning_rate": 7.767034247837146e-06, "loss": 0.4661, "num_input_tokens_seen": 74152576, "step": 60975 }, { "epoch": 7.6406465355218645, "grad_norm": 0.12171794474124908, "learning_rate": 7.766578867287578e-06, "loss": 0.4561, "num_input_tokens_seen": 74158624, "step": 60980 }, { "epoch": 7.641273023430648, "grad_norm": 0.049431513994932175, "learning_rate": 7.766123453661232e-06, "loss": 0.4543, "num_input_tokens_seen": 74165152, "step": 60985 }, { "epoch": 7.641899511339431, "grad_norm": 0.08897589147090912, "learning_rate": 7.765668006963554e-06, "loss": 0.458, "num_input_tokens_seen": 74171328, "step": 60990 }, { "epoch": 7.642525999248215, "grad_norm": 0.05032472684979439, "learning_rate": 7.76521252719999e-06, "loss": 0.4671, "num_input_tokens_seen": 74177664, "step": 60995 }, { "epoch": 7.643152487156998, "grad_norm": 0.09580425173044205, "learning_rate": 7.764757014375985e-06, "loss": 0.4547, "num_input_tokens_seen": 74183744, "step": 61000 }, { "epoch": 7.643778975065781, "grad_norm": 0.09636810421943665, "learning_rate": 7.764301468496988e-06, "loss": 0.4681, "num_input_tokens_seen": 74189824, "step": 61005 }, { "epoch": 7.644405462974564, "grad_norm": 0.1158653274178505, "learning_rate": 7.76384588956844e-06, "loss": 0.4592, "num_input_tokens_seen": 74196000, "step": 61010 }, { "epoch": 7.645031950883348, "grad_norm": 0.07650458067655563, "learning_rate": 7.763390277595792e-06, "loss": 0.4653, "num_input_tokens_seen": 74202336, "step": 61015 }, { "epoch": 7.645658438792132, "grad_norm": 0.0807679072022438, "learning_rate": 7.76293463258449e-06, "loss": 0.4606, "num_input_tokens_seen": 74208288, "step": 61020 }, { "epoch": 7.646284926700915, "grad_norm": 0.0751291811466217, "learning_rate": 7.76247895453998e-06, "loss": 0.4602, "num_input_tokens_seen": 74214336, "step": 61025 }, { "epoch": 7.646911414609698, "grad_norm": 0.09582781791687012, "learning_rate": 7.762023243467714e-06, "loss": 0.4631, "num_input_tokens_seen": 74220832, "step": 61030 }, { "epoch": 7.647537902518481, "grad_norm": 0.09381301701068878, "learning_rate": 7.761567499373136e-06, "loss": 0.4611, "num_input_tokens_seen": 74227136, "step": 61035 }, { "epoch": 7.648164390427265, "grad_norm": 0.04807954281568527, "learning_rate": 7.761111722261697e-06, "loss": 0.4599, "num_input_tokens_seen": 74233312, "step": 61040 }, { "epoch": 7.648790878336048, "grad_norm": 0.0790175199508667, "learning_rate": 7.760655912138846e-06, "loss": 0.4732, "num_input_tokens_seen": 74239008, "step": 61045 }, { "epoch": 7.6494173662448315, "grad_norm": 0.12564651668071747, "learning_rate": 7.760200069010033e-06, "loss": 0.4591, "num_input_tokens_seen": 74245312, "step": 61050 }, { "epoch": 7.650043854153615, "grad_norm": 0.07379773259162903, "learning_rate": 7.759744192880707e-06, "loss": 0.462, "num_input_tokens_seen": 74251616, "step": 61055 }, { "epoch": 7.650670342062398, "grad_norm": 0.10119867324829102, "learning_rate": 7.75928828375632e-06, "loss": 0.4606, "num_input_tokens_seen": 74257856, "step": 61060 }, { "epoch": 7.651296829971182, "grad_norm": 0.11281666159629822, "learning_rate": 7.75883234164232e-06, "loss": 0.4556, "num_input_tokens_seen": 74264064, "step": 61065 }, { "epoch": 7.651923317879965, "grad_norm": 0.07616355270147324, "learning_rate": 7.758376366544161e-06, "loss": 0.4657, "num_input_tokens_seen": 74270112, "step": 61070 }, { "epoch": 7.652549805788748, "grad_norm": 0.09834376722574234, "learning_rate": 7.757920358467293e-06, "loss": 0.4676, "num_input_tokens_seen": 74276128, "step": 61075 }, { "epoch": 7.653176293697531, "grad_norm": 0.07444646209478378, "learning_rate": 7.757464317417169e-06, "loss": 0.4629, "num_input_tokens_seen": 74282208, "step": 61080 }, { "epoch": 7.6538027816063146, "grad_norm": 0.08311546593904495, "learning_rate": 7.75700824339924e-06, "loss": 0.458, "num_input_tokens_seen": 74288832, "step": 61085 }, { "epoch": 7.654429269515099, "grad_norm": 0.07251130044460297, "learning_rate": 7.75655213641896e-06, "loss": 0.4632, "num_input_tokens_seen": 74294880, "step": 61090 }, { "epoch": 7.655055757423882, "grad_norm": 0.12765489518642426, "learning_rate": 7.756095996481783e-06, "loss": 0.4701, "num_input_tokens_seen": 74301056, "step": 61095 }, { "epoch": 7.655682245332665, "grad_norm": 0.0805630311369896, "learning_rate": 7.755639823593158e-06, "loss": 0.4634, "num_input_tokens_seen": 74307552, "step": 61100 }, { "epoch": 7.656308733241448, "grad_norm": 0.07358722388744354, "learning_rate": 7.755183617758543e-06, "loss": 0.4643, "num_input_tokens_seen": 74313792, "step": 61105 }, { "epoch": 7.656935221150232, "grad_norm": 0.08087009191513062, "learning_rate": 7.754727378983394e-06, "loss": 0.459, "num_input_tokens_seen": 74319552, "step": 61110 }, { "epoch": 7.657561709059015, "grad_norm": 0.17397749423980713, "learning_rate": 7.754271107273159e-06, "loss": 0.465, "num_input_tokens_seen": 74325504, "step": 61115 }, { "epoch": 7.6581881969677985, "grad_norm": 0.08675384521484375, "learning_rate": 7.7538148026333e-06, "loss": 0.4543, "num_input_tokens_seen": 74331424, "step": 61120 }, { "epoch": 7.658814684876582, "grad_norm": 0.0848434567451477, "learning_rate": 7.75335846506927e-06, "loss": 0.464, "num_input_tokens_seen": 74337696, "step": 61125 }, { "epoch": 7.659441172785366, "grad_norm": 0.09467757493257523, "learning_rate": 7.752902094586525e-06, "loss": 0.4613, "num_input_tokens_seen": 74343776, "step": 61130 }, { "epoch": 7.660067660694149, "grad_norm": 0.06430403888225555, "learning_rate": 7.752445691190518e-06, "loss": 0.4621, "num_input_tokens_seen": 74350176, "step": 61135 }, { "epoch": 7.660694148602932, "grad_norm": 0.08381812274456024, "learning_rate": 7.751989254886711e-06, "loss": 0.46, "num_input_tokens_seen": 74356352, "step": 61140 }, { "epoch": 7.661320636511715, "grad_norm": 0.06591769307851791, "learning_rate": 7.751532785680558e-06, "loss": 0.4622, "num_input_tokens_seen": 74362624, "step": 61145 }, { "epoch": 7.661947124420498, "grad_norm": 0.09522495418787003, "learning_rate": 7.751076283577519e-06, "loss": 0.4605, "num_input_tokens_seen": 74368928, "step": 61150 }, { "epoch": 7.6625736123292825, "grad_norm": 0.08473213762044907, "learning_rate": 7.750619748583048e-06, "loss": 0.4608, "num_input_tokens_seen": 74375360, "step": 61155 }, { "epoch": 7.663200100238066, "grad_norm": 0.10240218788385391, "learning_rate": 7.750163180702605e-06, "loss": 0.4662, "num_input_tokens_seen": 74381024, "step": 61160 }, { "epoch": 7.663826588146849, "grad_norm": 0.04695165902376175, "learning_rate": 7.749706579941649e-06, "loss": 0.4717, "num_input_tokens_seen": 74387040, "step": 61165 }, { "epoch": 7.664453076055632, "grad_norm": 0.0938139334321022, "learning_rate": 7.74924994630564e-06, "loss": 0.4626, "num_input_tokens_seen": 74393248, "step": 61170 }, { "epoch": 7.665079563964415, "grad_norm": 0.07698319107294083, "learning_rate": 7.748793279800036e-06, "loss": 0.4612, "num_input_tokens_seen": 74399648, "step": 61175 }, { "epoch": 7.665706051873199, "grad_norm": 0.08613383024930954, "learning_rate": 7.748336580430298e-06, "loss": 0.462, "num_input_tokens_seen": 74405536, "step": 61180 }, { "epoch": 7.666332539781982, "grad_norm": 0.07215629518032074, "learning_rate": 7.747879848201885e-06, "loss": 0.4602, "num_input_tokens_seen": 74411296, "step": 61185 }, { "epoch": 7.6669590276907655, "grad_norm": 0.07782568782567978, "learning_rate": 7.747423083120257e-06, "loss": 0.4612, "num_input_tokens_seen": 74417152, "step": 61190 }, { "epoch": 7.667585515599549, "grad_norm": 0.07101422548294067, "learning_rate": 7.746966285190878e-06, "loss": 0.4571, "num_input_tokens_seen": 74423008, "step": 61195 }, { "epoch": 7.668212003508332, "grad_norm": 0.10969576984643936, "learning_rate": 7.746509454419206e-06, "loss": 0.4625, "num_input_tokens_seen": 74429632, "step": 61200 }, { "epoch": 7.668838491417116, "grad_norm": 0.11458556354045868, "learning_rate": 7.746052590810706e-06, "loss": 0.4606, "num_input_tokens_seen": 74435456, "step": 61205 }, { "epoch": 7.669464979325899, "grad_norm": 0.1268223226070404, "learning_rate": 7.745595694370838e-06, "loss": 0.4648, "num_input_tokens_seen": 74441344, "step": 61210 }, { "epoch": 7.670091467234682, "grad_norm": 0.04975628852844238, "learning_rate": 7.745138765105065e-06, "loss": 0.4683, "num_input_tokens_seen": 74447296, "step": 61215 }, { "epoch": 7.670717955143465, "grad_norm": 0.05409710854291916, "learning_rate": 7.744681803018849e-06, "loss": 0.4656, "num_input_tokens_seen": 74453472, "step": 61220 }, { "epoch": 7.671344443052249, "grad_norm": 0.1000913679599762, "learning_rate": 7.744224808117655e-06, "loss": 0.4623, "num_input_tokens_seen": 74459776, "step": 61225 }, { "epoch": 7.671970930961033, "grad_norm": 0.09801256656646729, "learning_rate": 7.743767780406947e-06, "loss": 0.4604, "num_input_tokens_seen": 74465856, "step": 61230 }, { "epoch": 7.672597418869816, "grad_norm": 0.10768594592809677, "learning_rate": 7.743310719892187e-06, "loss": 0.4634, "num_input_tokens_seen": 74471872, "step": 61235 }, { "epoch": 7.673223906778599, "grad_norm": 0.07166111469268799, "learning_rate": 7.742853626578843e-06, "loss": 0.4611, "num_input_tokens_seen": 74478016, "step": 61240 }, { "epoch": 7.673850394687382, "grad_norm": 0.04925156012177467, "learning_rate": 7.742396500472376e-06, "loss": 0.4611, "num_input_tokens_seen": 74484384, "step": 61245 }, { "epoch": 7.674476882596166, "grad_norm": 0.10794354975223541, "learning_rate": 7.741939341578253e-06, "loss": 0.4575, "num_input_tokens_seen": 74490432, "step": 61250 }, { "epoch": 7.675103370504949, "grad_norm": 0.08474055677652359, "learning_rate": 7.741482149901941e-06, "loss": 0.4588, "num_input_tokens_seen": 74496736, "step": 61255 }, { "epoch": 7.675729858413733, "grad_norm": 0.10417047888040543, "learning_rate": 7.741024925448906e-06, "loss": 0.465, "num_input_tokens_seen": 74502656, "step": 61260 }, { "epoch": 7.676356346322516, "grad_norm": 0.06995636224746704, "learning_rate": 7.74056766822461e-06, "loss": 0.4587, "num_input_tokens_seen": 74509024, "step": 61265 }, { "epoch": 7.6769828342313, "grad_norm": 0.07253824919462204, "learning_rate": 7.740110378234527e-06, "loss": 0.4653, "num_input_tokens_seen": 74515008, "step": 61270 }, { "epoch": 7.677609322140083, "grad_norm": 0.05179573968052864, "learning_rate": 7.739653055484117e-06, "loss": 0.4563, "num_input_tokens_seen": 74520864, "step": 61275 }, { "epoch": 7.678235810048866, "grad_norm": 0.06670372188091278, "learning_rate": 7.739195699978854e-06, "loss": 0.4629, "num_input_tokens_seen": 74526720, "step": 61280 }, { "epoch": 7.678862297957649, "grad_norm": 0.10480908304452896, "learning_rate": 7.738738311724203e-06, "loss": 0.4611, "num_input_tokens_seen": 74532992, "step": 61285 }, { "epoch": 7.6794887858664325, "grad_norm": 0.07318653166294098, "learning_rate": 7.738280890725633e-06, "loss": 0.4642, "num_input_tokens_seen": 74539104, "step": 61290 }, { "epoch": 7.6801152737752165, "grad_norm": 0.08083043247461319, "learning_rate": 7.737823436988613e-06, "loss": 0.4588, "num_input_tokens_seen": 74545280, "step": 61295 }, { "epoch": 7.680741761684, "grad_norm": 0.09069471806287766, "learning_rate": 7.73736595051861e-06, "loss": 0.4619, "num_input_tokens_seen": 74551680, "step": 61300 }, { "epoch": 7.681368249592783, "grad_norm": 0.09155512601137161, "learning_rate": 7.736908431321097e-06, "loss": 0.4654, "num_input_tokens_seen": 74558144, "step": 61305 }, { "epoch": 7.681994737501566, "grad_norm": 0.11708832532167435, "learning_rate": 7.736450879401543e-06, "loss": 0.4578, "num_input_tokens_seen": 74564288, "step": 61310 }, { "epoch": 7.682621225410349, "grad_norm": 0.1149994432926178, "learning_rate": 7.735993294765419e-06, "loss": 0.4639, "num_input_tokens_seen": 74570368, "step": 61315 }, { "epoch": 7.683247713319133, "grad_norm": 0.17471139132976532, "learning_rate": 7.735535677418194e-06, "loss": 0.4611, "num_input_tokens_seen": 74576032, "step": 61320 }, { "epoch": 7.683874201227916, "grad_norm": 0.11431613564491272, "learning_rate": 7.735078027365339e-06, "loss": 0.4639, "num_input_tokens_seen": 74582272, "step": 61325 }, { "epoch": 7.6845006891367, "grad_norm": 0.08358145505189896, "learning_rate": 7.734620344612327e-06, "loss": 0.4577, "num_input_tokens_seen": 74588160, "step": 61330 }, { "epoch": 7.685127177045483, "grad_norm": 0.10080249607563019, "learning_rate": 7.73416262916463e-06, "loss": 0.4636, "num_input_tokens_seen": 74594016, "step": 61335 }, { "epoch": 7.685753664954266, "grad_norm": 0.0930170789361, "learning_rate": 7.73370488102772e-06, "loss": 0.4589, "num_input_tokens_seen": 74600544, "step": 61340 }, { "epoch": 7.68638015286305, "grad_norm": 0.10457728058099747, "learning_rate": 7.73324710020707e-06, "loss": 0.4579, "num_input_tokens_seen": 74606752, "step": 61345 }, { "epoch": 7.687006640771833, "grad_norm": 0.09643979370594025, "learning_rate": 7.732789286708152e-06, "loss": 0.461, "num_input_tokens_seen": 74612928, "step": 61350 }, { "epoch": 7.687633128680616, "grad_norm": 0.08886595815420151, "learning_rate": 7.73233144053644e-06, "loss": 0.463, "num_input_tokens_seen": 74619008, "step": 61355 }, { "epoch": 7.6882596165893995, "grad_norm": 0.08859166502952576, "learning_rate": 7.731873561697411e-06, "loss": 0.4604, "num_input_tokens_seen": 74625248, "step": 61360 }, { "epoch": 7.6888861044981835, "grad_norm": 0.12146938592195511, "learning_rate": 7.731415650196535e-06, "loss": 0.4646, "num_input_tokens_seen": 74631008, "step": 61365 }, { "epoch": 7.689512592406967, "grad_norm": 0.14107325673103333, "learning_rate": 7.730957706039289e-06, "loss": 0.4624, "num_input_tokens_seen": 74636800, "step": 61370 }, { "epoch": 7.69013908031575, "grad_norm": 0.09759621322154999, "learning_rate": 7.730499729231146e-06, "loss": 0.4645, "num_input_tokens_seen": 74643168, "step": 61375 }, { "epoch": 7.690765568224533, "grad_norm": 0.08306682109832764, "learning_rate": 7.730041719777586e-06, "loss": 0.4623, "num_input_tokens_seen": 74649184, "step": 61380 }, { "epoch": 7.691392056133317, "grad_norm": 0.08644405007362366, "learning_rate": 7.729583677684081e-06, "loss": 0.4638, "num_input_tokens_seen": 74655264, "step": 61385 }, { "epoch": 7.6920185440421, "grad_norm": 0.09841097146272659, "learning_rate": 7.729125602956108e-06, "loss": 0.4559, "num_input_tokens_seen": 74661248, "step": 61390 }, { "epoch": 7.6926450319508834, "grad_norm": 0.11232171952724457, "learning_rate": 7.728667495599143e-06, "loss": 0.4572, "num_input_tokens_seen": 74667136, "step": 61395 }, { "epoch": 7.693271519859667, "grad_norm": 0.08287224918603897, "learning_rate": 7.728209355618663e-06, "loss": 0.46, "num_input_tokens_seen": 74673216, "step": 61400 }, { "epoch": 7.69389800776845, "grad_norm": 0.09603967517614365, "learning_rate": 7.727751183020149e-06, "loss": 0.4613, "num_input_tokens_seen": 74678752, "step": 61405 }, { "epoch": 7.694524495677234, "grad_norm": 0.08870217949151993, "learning_rate": 7.727292977809074e-06, "loss": 0.4696, "num_input_tokens_seen": 74684640, "step": 61410 }, { "epoch": 7.695150983586017, "grad_norm": 0.14725647866725922, "learning_rate": 7.726834739990919e-06, "loss": 0.4589, "num_input_tokens_seen": 74690752, "step": 61415 }, { "epoch": 7.6957774714948, "grad_norm": 0.15539135038852692, "learning_rate": 7.726376469571165e-06, "loss": 0.4687, "num_input_tokens_seen": 74696640, "step": 61420 }, { "epoch": 7.696403959403583, "grad_norm": 0.10406631231307983, "learning_rate": 7.725918166555284e-06, "loss": 0.4612, "num_input_tokens_seen": 74702688, "step": 61425 }, { "epoch": 7.6970304473123665, "grad_norm": 0.11817191541194916, "learning_rate": 7.725459830948761e-06, "loss": 0.46, "num_input_tokens_seen": 74708960, "step": 61430 }, { "epoch": 7.697656935221151, "grad_norm": 0.05486271530389786, "learning_rate": 7.725001462757075e-06, "loss": 0.4674, "num_input_tokens_seen": 74715072, "step": 61435 }, { "epoch": 7.698283423129934, "grad_norm": 0.07551276683807373, "learning_rate": 7.724543061985705e-06, "loss": 0.4652, "num_input_tokens_seen": 74721376, "step": 61440 }, { "epoch": 7.698909911038717, "grad_norm": 0.08817291259765625, "learning_rate": 7.724084628640132e-06, "loss": 0.4583, "num_input_tokens_seen": 74727584, "step": 61445 }, { "epoch": 7.6995363989475, "grad_norm": 0.11595455557107925, "learning_rate": 7.723626162725837e-06, "loss": 0.4668, "num_input_tokens_seen": 74733696, "step": 61450 }, { "epoch": 7.700162886856283, "grad_norm": 0.0696660578250885, "learning_rate": 7.723167664248301e-06, "loss": 0.4618, "num_input_tokens_seen": 74739904, "step": 61455 }, { "epoch": 7.700789374765067, "grad_norm": 0.05760889872908592, "learning_rate": 7.722709133213007e-06, "loss": 0.466, "num_input_tokens_seen": 74746048, "step": 61460 }, { "epoch": 7.7014158626738505, "grad_norm": 0.07946722209453583, "learning_rate": 7.722250569625436e-06, "loss": 0.4663, "num_input_tokens_seen": 74752288, "step": 61465 }, { "epoch": 7.702042350582634, "grad_norm": 0.05419439822435379, "learning_rate": 7.721791973491068e-06, "loss": 0.4564, "num_input_tokens_seen": 74758432, "step": 61470 }, { "epoch": 7.702668838491417, "grad_norm": 0.10014688968658447, "learning_rate": 7.721333344815392e-06, "loss": 0.4558, "num_input_tokens_seen": 74764576, "step": 61475 }, { "epoch": 7.7032953264002, "grad_norm": 0.1438315063714981, "learning_rate": 7.720874683603884e-06, "loss": 0.4627, "num_input_tokens_seen": 74770944, "step": 61480 }, { "epoch": 7.703921814308984, "grad_norm": 0.09413839131593704, "learning_rate": 7.720415989862036e-06, "loss": 0.4712, "num_input_tokens_seen": 74777280, "step": 61485 }, { "epoch": 7.704548302217767, "grad_norm": 0.07788171619176865, "learning_rate": 7.719957263595326e-06, "loss": 0.4613, "num_input_tokens_seen": 74783744, "step": 61490 }, { "epoch": 7.70517479012655, "grad_norm": 0.05567516013979912, "learning_rate": 7.71949850480924e-06, "loss": 0.4621, "num_input_tokens_seen": 74789888, "step": 61495 }, { "epoch": 7.7058012780353335, "grad_norm": 0.11007850617170334, "learning_rate": 7.719039713509264e-06, "loss": 0.4645, "num_input_tokens_seen": 74795744, "step": 61500 }, { "epoch": 7.706427765944118, "grad_norm": 0.08008820563554764, "learning_rate": 7.71858088970088e-06, "loss": 0.4591, "num_input_tokens_seen": 74801888, "step": 61505 }, { "epoch": 7.707054253852901, "grad_norm": 0.18202465772628784, "learning_rate": 7.718122033389576e-06, "loss": 0.464, "num_input_tokens_seen": 74808096, "step": 61510 }, { "epoch": 7.707680741761684, "grad_norm": 0.14012078940868378, "learning_rate": 7.717663144580841e-06, "loss": 0.4731, "num_input_tokens_seen": 74814400, "step": 61515 }, { "epoch": 7.708307229670467, "grad_norm": 0.08764170110225677, "learning_rate": 7.717204223280155e-06, "loss": 0.4623, "num_input_tokens_seen": 74819968, "step": 61520 }, { "epoch": 7.708933717579251, "grad_norm": 0.16106963157653809, "learning_rate": 7.716745269493009e-06, "loss": 0.461, "num_input_tokens_seen": 74825856, "step": 61525 }, { "epoch": 7.709560205488034, "grad_norm": 0.08769067376852036, "learning_rate": 7.71628628322489e-06, "loss": 0.4594, "num_input_tokens_seen": 74831936, "step": 61530 }, { "epoch": 7.7101866933968175, "grad_norm": 0.13711707293987274, "learning_rate": 7.715827264481283e-06, "loss": 0.4704, "num_input_tokens_seen": 74837376, "step": 61535 }, { "epoch": 7.710813181305601, "grad_norm": 0.10763704031705856, "learning_rate": 7.715368213267678e-06, "loss": 0.4577, "num_input_tokens_seen": 74843744, "step": 61540 }, { "epoch": 7.711439669214384, "grad_norm": 0.09082793444395065, "learning_rate": 7.714909129589564e-06, "loss": 0.4633, "num_input_tokens_seen": 74849792, "step": 61545 }, { "epoch": 7.712066157123168, "grad_norm": 0.07881739735603333, "learning_rate": 7.714450013452429e-06, "loss": 0.4698, "num_input_tokens_seen": 74855392, "step": 61550 }, { "epoch": 7.712692645031951, "grad_norm": 0.08634158223867416, "learning_rate": 7.71399086486176e-06, "loss": 0.4601, "num_input_tokens_seen": 74861696, "step": 61555 }, { "epoch": 7.713319132940734, "grad_norm": 0.11979126185178757, "learning_rate": 7.71353168382305e-06, "loss": 0.4579, "num_input_tokens_seen": 74867936, "step": 61560 }, { "epoch": 7.713945620849517, "grad_norm": 0.07694442570209503, "learning_rate": 7.713072470341787e-06, "loss": 0.4589, "num_input_tokens_seen": 74874240, "step": 61565 }, { "epoch": 7.714572108758301, "grad_norm": 0.08136596530675888, "learning_rate": 7.712613224423463e-06, "loss": 0.4566, "num_input_tokens_seen": 74880416, "step": 61570 }, { "epoch": 7.715198596667085, "grad_norm": 0.0922928899526596, "learning_rate": 7.712153946073566e-06, "loss": 0.4603, "num_input_tokens_seen": 74886592, "step": 61575 }, { "epoch": 7.715825084575868, "grad_norm": 0.11048109084367752, "learning_rate": 7.711694635297588e-06, "loss": 0.4617, "num_input_tokens_seen": 74892608, "step": 61580 }, { "epoch": 7.716451572484651, "grad_norm": 0.11662081629037857, "learning_rate": 7.711235292101022e-06, "loss": 0.4615, "num_input_tokens_seen": 74898432, "step": 61585 }, { "epoch": 7.717078060393434, "grad_norm": 0.10667368769645691, "learning_rate": 7.710775916489357e-06, "loss": 0.4616, "num_input_tokens_seen": 74904480, "step": 61590 }, { "epoch": 7.717704548302217, "grad_norm": 0.10342365503311157, "learning_rate": 7.710316508468088e-06, "loss": 0.4601, "num_input_tokens_seen": 74910656, "step": 61595 }, { "epoch": 7.718331036211001, "grad_norm": 0.08826174587011337, "learning_rate": 7.709857068042705e-06, "loss": 0.4616, "num_input_tokens_seen": 74916768, "step": 61600 }, { "epoch": 7.7189575241197845, "grad_norm": 0.12896473705768585, "learning_rate": 7.709397595218703e-06, "loss": 0.4587, "num_input_tokens_seen": 74922784, "step": 61605 }, { "epoch": 7.719584012028568, "grad_norm": 0.11335264146327972, "learning_rate": 7.708938090001575e-06, "loss": 0.4639, "num_input_tokens_seen": 74928768, "step": 61610 }, { "epoch": 7.720210499937351, "grad_norm": 0.09252775460481644, "learning_rate": 7.708478552396815e-06, "loss": 0.4624, "num_input_tokens_seen": 74934784, "step": 61615 }, { "epoch": 7.720836987846135, "grad_norm": 0.0876060202717781, "learning_rate": 7.708018982409918e-06, "loss": 0.4693, "num_input_tokens_seen": 74941088, "step": 61620 }, { "epoch": 7.721463475754918, "grad_norm": 0.0871662124991417, "learning_rate": 7.707559380046375e-06, "loss": 0.4662, "num_input_tokens_seen": 74947360, "step": 61625 }, { "epoch": 7.722089963663701, "grad_norm": 0.05266376957297325, "learning_rate": 7.707099745311683e-06, "loss": 0.4625, "num_input_tokens_seen": 74953376, "step": 61630 }, { "epoch": 7.722716451572484, "grad_norm": 0.0813756138086319, "learning_rate": 7.706640078211339e-06, "loss": 0.4626, "num_input_tokens_seen": 74960000, "step": 61635 }, { "epoch": 7.7233429394812685, "grad_norm": 0.08310958743095398, "learning_rate": 7.706180378750836e-06, "loss": 0.4635, "num_input_tokens_seen": 74966048, "step": 61640 }, { "epoch": 7.723969427390052, "grad_norm": 0.08243655413389206, "learning_rate": 7.705720646935672e-06, "loss": 0.4602, "num_input_tokens_seen": 74972064, "step": 61645 }, { "epoch": 7.724595915298835, "grad_norm": 0.08032997697591782, "learning_rate": 7.705260882771344e-06, "loss": 0.4645, "num_input_tokens_seen": 74978272, "step": 61650 }, { "epoch": 7.725222403207618, "grad_norm": 0.07364192605018616, "learning_rate": 7.704801086263347e-06, "loss": 0.466, "num_input_tokens_seen": 74984352, "step": 61655 }, { "epoch": 7.725848891116401, "grad_norm": 0.050483737140893936, "learning_rate": 7.704341257417177e-06, "loss": 0.4621, "num_input_tokens_seen": 74990656, "step": 61660 }, { "epoch": 7.726475379025185, "grad_norm": 0.08354182541370392, "learning_rate": 7.703881396238335e-06, "loss": 0.4607, "num_input_tokens_seen": 74996768, "step": 61665 }, { "epoch": 7.727101866933968, "grad_norm": 0.08350393176078796, "learning_rate": 7.703421502732319e-06, "loss": 0.4638, "num_input_tokens_seen": 75003136, "step": 61670 }, { "epoch": 7.727728354842752, "grad_norm": 0.0849156454205513, "learning_rate": 7.702961576904624e-06, "loss": 0.4622, "num_input_tokens_seen": 75008960, "step": 61675 }, { "epoch": 7.728354842751535, "grad_norm": 0.0867353156208992, "learning_rate": 7.702501618760752e-06, "loss": 0.4513, "num_input_tokens_seen": 75015040, "step": 61680 }, { "epoch": 7.728981330660318, "grad_norm": 0.06682348251342773, "learning_rate": 7.7020416283062e-06, "loss": 0.4586, "num_input_tokens_seen": 75021024, "step": 61685 }, { "epoch": 7.729607818569102, "grad_norm": 0.08424412459135056, "learning_rate": 7.70158160554647e-06, "loss": 0.4638, "num_input_tokens_seen": 75027136, "step": 61690 }, { "epoch": 7.730234306477885, "grad_norm": 0.1398444026708603, "learning_rate": 7.70112155048706e-06, "loss": 0.4627, "num_input_tokens_seen": 75033344, "step": 61695 }, { "epoch": 7.730860794386668, "grad_norm": 0.060254719108343124, "learning_rate": 7.700661463133469e-06, "loss": 0.4626, "num_input_tokens_seen": 75040000, "step": 61700 }, { "epoch": 7.7314872822954515, "grad_norm": 0.13738775253295898, "learning_rate": 7.700201343491202e-06, "loss": 0.46, "num_input_tokens_seen": 75046112, "step": 61705 }, { "epoch": 7.732113770204235, "grad_norm": 0.13643404841423035, "learning_rate": 7.699741191565758e-06, "loss": 0.4637, "num_input_tokens_seen": 75052448, "step": 61710 }, { "epoch": 7.732740258113019, "grad_norm": 0.1362696886062622, "learning_rate": 7.699281007362636e-06, "loss": 0.4604, "num_input_tokens_seen": 75058560, "step": 61715 }, { "epoch": 7.733366746021802, "grad_norm": 0.10903199762105942, "learning_rate": 7.698820790887343e-06, "loss": 0.4613, "num_input_tokens_seen": 75064288, "step": 61720 }, { "epoch": 7.733993233930585, "grad_norm": 0.10115044564008713, "learning_rate": 7.698360542145375e-06, "loss": 0.4581, "num_input_tokens_seen": 75070496, "step": 61725 }, { "epoch": 7.734619721839368, "grad_norm": 0.08742883056402206, "learning_rate": 7.69790026114224e-06, "loss": 0.4594, "num_input_tokens_seen": 75076800, "step": 61730 }, { "epoch": 7.735246209748152, "grad_norm": 0.09239383041858673, "learning_rate": 7.697439947883438e-06, "loss": 0.4625, "num_input_tokens_seen": 75083040, "step": 61735 }, { "epoch": 7.735872697656935, "grad_norm": 0.08604953438043594, "learning_rate": 7.696979602374475e-06, "loss": 0.4642, "num_input_tokens_seen": 75089120, "step": 61740 }, { "epoch": 7.736499185565719, "grad_norm": 0.14792639017105103, "learning_rate": 7.696519224620852e-06, "loss": 0.4606, "num_input_tokens_seen": 75095264, "step": 61745 }, { "epoch": 7.737125673474502, "grad_norm": 0.0894199013710022, "learning_rate": 7.696058814628076e-06, "loss": 0.4669, "num_input_tokens_seen": 75101632, "step": 61750 }, { "epoch": 7.737752161383285, "grad_norm": 0.0980800986289978, "learning_rate": 7.695598372401651e-06, "loss": 0.4692, "num_input_tokens_seen": 75107616, "step": 61755 }, { "epoch": 7.738378649292069, "grad_norm": 0.11792575567960739, "learning_rate": 7.695137897947081e-06, "loss": 0.4612, "num_input_tokens_seen": 75113568, "step": 61760 }, { "epoch": 7.739005137200852, "grad_norm": 0.09920196235179901, "learning_rate": 7.694677391269868e-06, "loss": 0.4606, "num_input_tokens_seen": 75119680, "step": 61765 }, { "epoch": 7.739631625109635, "grad_norm": 0.0969657301902771, "learning_rate": 7.694216852375523e-06, "loss": 0.465, "num_input_tokens_seen": 75126016, "step": 61770 }, { "epoch": 7.7402581130184185, "grad_norm": 0.08841142058372498, "learning_rate": 7.69375628126955e-06, "loss": 0.467, "num_input_tokens_seen": 75132192, "step": 61775 }, { "epoch": 7.7408846009272025, "grad_norm": 0.11123350262641907, "learning_rate": 7.693295677957457e-06, "loss": 0.4596, "num_input_tokens_seen": 75138304, "step": 61780 }, { "epoch": 7.741511088835986, "grad_norm": 0.09069523215293884, "learning_rate": 7.69283504244475e-06, "loss": 0.4619, "num_input_tokens_seen": 75144416, "step": 61785 }, { "epoch": 7.742137576744769, "grad_norm": 0.08430445194244385, "learning_rate": 7.692374374736935e-06, "loss": 0.4615, "num_input_tokens_seen": 75150880, "step": 61790 }, { "epoch": 7.742764064653552, "grad_norm": 0.09434030205011368, "learning_rate": 7.691913674839522e-06, "loss": 0.463, "num_input_tokens_seen": 75157248, "step": 61795 }, { "epoch": 7.743390552562335, "grad_norm": 0.08842481672763824, "learning_rate": 7.691452942758017e-06, "loss": 0.4696, "num_input_tokens_seen": 75163552, "step": 61800 }, { "epoch": 7.744017040471119, "grad_norm": 0.07968617230653763, "learning_rate": 7.690992178497929e-06, "loss": 0.4597, "num_input_tokens_seen": 75169952, "step": 61805 }, { "epoch": 7.744643528379902, "grad_norm": 0.08120813965797424, "learning_rate": 7.690531382064768e-06, "loss": 0.4652, "num_input_tokens_seen": 75176160, "step": 61810 }, { "epoch": 7.745270016288686, "grad_norm": 0.10663309693336487, "learning_rate": 7.69007055346404e-06, "loss": 0.4653, "num_input_tokens_seen": 75181984, "step": 61815 }, { "epoch": 7.745896504197469, "grad_norm": 0.07622227817773819, "learning_rate": 7.689609692701259e-06, "loss": 0.4609, "num_input_tokens_seen": 75188576, "step": 61820 }, { "epoch": 7.746522992106252, "grad_norm": 0.08064522594213486, "learning_rate": 7.689148799781931e-06, "loss": 0.4619, "num_input_tokens_seen": 75194816, "step": 61825 }, { "epoch": 7.747149480015036, "grad_norm": 0.07586001604795456, "learning_rate": 7.68868787471157e-06, "loss": 0.4688, "num_input_tokens_seen": 75200832, "step": 61830 }, { "epoch": 7.747775967923819, "grad_norm": 0.07875183969736099, "learning_rate": 7.688226917495685e-06, "loss": 0.4656, "num_input_tokens_seen": 75207168, "step": 61835 }, { "epoch": 7.748402455832602, "grad_norm": 0.07906950265169144, "learning_rate": 7.687765928139786e-06, "loss": 0.4582, "num_input_tokens_seen": 75213568, "step": 61840 }, { "epoch": 7.7490289437413855, "grad_norm": 0.10922008752822876, "learning_rate": 7.687304906649384e-06, "loss": 0.4683, "num_input_tokens_seen": 75219712, "step": 61845 }, { "epoch": 7.749655431650169, "grad_norm": 0.08151119202375412, "learning_rate": 7.686843853029995e-06, "loss": 0.4572, "num_input_tokens_seen": 75225888, "step": 61850 }, { "epoch": 7.750281919558953, "grad_norm": 0.11585906893014908, "learning_rate": 7.686382767287129e-06, "loss": 0.4591, "num_input_tokens_seen": 75231936, "step": 61855 }, { "epoch": 7.750908407467736, "grad_norm": 0.08939129114151001, "learning_rate": 7.685921649426298e-06, "loss": 0.4592, "num_input_tokens_seen": 75237632, "step": 61860 }, { "epoch": 7.751534895376519, "grad_norm": 0.1589682698249817, "learning_rate": 7.685460499453015e-06, "loss": 0.4557, "num_input_tokens_seen": 75243808, "step": 61865 }, { "epoch": 7.752161383285302, "grad_norm": 0.08053583651781082, "learning_rate": 7.684999317372794e-06, "loss": 0.4674, "num_input_tokens_seen": 75250016, "step": 61870 }, { "epoch": 7.752787871194086, "grad_norm": 0.08888790756464005, "learning_rate": 7.684538103191149e-06, "loss": 0.4578, "num_input_tokens_seen": 75256128, "step": 61875 }, { "epoch": 7.7534143591028695, "grad_norm": 0.1006375253200531, "learning_rate": 7.68407685691359e-06, "loss": 0.4661, "num_input_tokens_seen": 75262208, "step": 61880 }, { "epoch": 7.754040847011653, "grad_norm": 0.09358648955821991, "learning_rate": 7.683615578545643e-06, "loss": 0.4611, "num_input_tokens_seen": 75268256, "step": 61885 }, { "epoch": 7.754667334920436, "grad_norm": 0.08377844095230103, "learning_rate": 7.683154268092811e-06, "loss": 0.4671, "num_input_tokens_seen": 75273632, "step": 61890 }, { "epoch": 7.75529382282922, "grad_norm": 0.08272276073694229, "learning_rate": 7.682692925560614e-06, "loss": 0.4594, "num_input_tokens_seen": 75278784, "step": 61895 }, { "epoch": 7.755920310738003, "grad_norm": 0.08453378826379776, "learning_rate": 7.682231550954568e-06, "loss": 0.4553, "num_input_tokens_seen": 75285120, "step": 61900 }, { "epoch": 7.756546798646786, "grad_norm": 0.07389698177576065, "learning_rate": 7.681770144280188e-06, "loss": 0.4577, "num_input_tokens_seen": 75291040, "step": 61905 }, { "epoch": 7.757173286555569, "grad_norm": 0.08577489852905273, "learning_rate": 7.681308705542992e-06, "loss": 0.4682, "num_input_tokens_seen": 75297312, "step": 61910 }, { "epoch": 7.7577997744643525, "grad_norm": 0.0786338597536087, "learning_rate": 7.680847234748496e-06, "loss": 0.4702, "num_input_tokens_seen": 75303488, "step": 61915 }, { "epoch": 7.758426262373137, "grad_norm": 0.09069645404815674, "learning_rate": 7.680385731902217e-06, "loss": 0.4564, "num_input_tokens_seen": 75309600, "step": 61920 }, { "epoch": 7.75905275028192, "grad_norm": 0.0974627211689949, "learning_rate": 7.679924197009673e-06, "loss": 0.4675, "num_input_tokens_seen": 75315296, "step": 61925 }, { "epoch": 7.759679238190703, "grad_norm": 0.12476083636283875, "learning_rate": 7.679462630076384e-06, "loss": 0.4611, "num_input_tokens_seen": 75321792, "step": 61930 }, { "epoch": 7.760305726099486, "grad_norm": 0.0782557874917984, "learning_rate": 7.679001031107864e-06, "loss": 0.4641, "num_input_tokens_seen": 75328032, "step": 61935 }, { "epoch": 7.760932214008269, "grad_norm": 0.10836879909038544, "learning_rate": 7.678539400109637e-06, "loss": 0.469, "num_input_tokens_seen": 75333888, "step": 61940 }, { "epoch": 7.761558701917053, "grad_norm": 0.11184857040643692, "learning_rate": 7.678077737087218e-06, "loss": 0.462, "num_input_tokens_seen": 75340032, "step": 61945 }, { "epoch": 7.7621851898258365, "grad_norm": 0.058529939502477646, "learning_rate": 7.677616042046127e-06, "loss": 0.4608, "num_input_tokens_seen": 75346208, "step": 61950 }, { "epoch": 7.76281167773462, "grad_norm": 0.11862898617982864, "learning_rate": 7.677154314991887e-06, "loss": 0.4543, "num_input_tokens_seen": 75352192, "step": 61955 }, { "epoch": 7.763438165643403, "grad_norm": 0.07487553358078003, "learning_rate": 7.676692555930016e-06, "loss": 0.4573, "num_input_tokens_seen": 75357760, "step": 61960 }, { "epoch": 7.764064653552186, "grad_norm": 0.08546541631221771, "learning_rate": 7.676230764866033e-06, "loss": 0.4631, "num_input_tokens_seen": 75364256, "step": 61965 }, { "epoch": 7.76469114146097, "grad_norm": 0.1288127601146698, "learning_rate": 7.675768941805462e-06, "loss": 0.4562, "num_input_tokens_seen": 75370240, "step": 61970 }, { "epoch": 7.765317629369753, "grad_norm": 0.08120608329772949, "learning_rate": 7.675307086753824e-06, "loss": 0.4619, "num_input_tokens_seen": 75375840, "step": 61975 }, { "epoch": 7.765944117278536, "grad_norm": 0.08882828056812286, "learning_rate": 7.674845199716641e-06, "loss": 0.4659, "num_input_tokens_seen": 75381728, "step": 61980 }, { "epoch": 7.76657060518732, "grad_norm": 0.07775962352752686, "learning_rate": 7.674383280699433e-06, "loss": 0.4608, "num_input_tokens_seen": 75388192, "step": 61985 }, { "epoch": 7.767197093096104, "grad_norm": 0.06323007494211197, "learning_rate": 7.673921329707727e-06, "loss": 0.4572, "num_input_tokens_seen": 75394464, "step": 61990 }, { "epoch": 7.767823581004887, "grad_norm": 0.09066557139158249, "learning_rate": 7.673459346747042e-06, "loss": 0.4603, "num_input_tokens_seen": 75400640, "step": 61995 }, { "epoch": 7.76845006891367, "grad_norm": 0.05785312131047249, "learning_rate": 7.672997331822903e-06, "loss": 0.4642, "num_input_tokens_seen": 75407072, "step": 62000 }, { "epoch": 7.769076556822453, "grad_norm": 0.0753755271434784, "learning_rate": 7.672535284940834e-06, "loss": 0.4639, "num_input_tokens_seen": 75413408, "step": 62005 }, { "epoch": 7.769703044731237, "grad_norm": 0.09194274991750717, "learning_rate": 7.672073206106359e-06, "loss": 0.4635, "num_input_tokens_seen": 75419712, "step": 62010 }, { "epoch": 7.77032953264002, "grad_norm": 0.12124833464622498, "learning_rate": 7.671611095325001e-06, "loss": 0.4635, "num_input_tokens_seen": 75425888, "step": 62015 }, { "epoch": 7.7709560205488035, "grad_norm": 0.08431930840015411, "learning_rate": 7.671148952602287e-06, "loss": 0.4539, "num_input_tokens_seen": 75431808, "step": 62020 }, { "epoch": 7.771582508457587, "grad_norm": 0.10930507630109787, "learning_rate": 7.67068677794374e-06, "loss": 0.4626, "num_input_tokens_seen": 75437888, "step": 62025 }, { "epoch": 7.77220899636637, "grad_norm": 0.1775740683078766, "learning_rate": 7.67022457135489e-06, "loss": 0.4596, "num_input_tokens_seen": 75443968, "step": 62030 }, { "epoch": 7.772835484275154, "grad_norm": 0.083778016269207, "learning_rate": 7.669762332841256e-06, "loss": 0.4624, "num_input_tokens_seen": 75450304, "step": 62035 }, { "epoch": 7.773461972183937, "grad_norm": 0.13366474211215973, "learning_rate": 7.66930006240837e-06, "loss": 0.4561, "num_input_tokens_seen": 75456096, "step": 62040 }, { "epoch": 7.77408846009272, "grad_norm": 0.1351945549249649, "learning_rate": 7.66883776006176e-06, "loss": 0.4584, "num_input_tokens_seen": 75462528, "step": 62045 }, { "epoch": 7.774714948001503, "grad_norm": 0.10928795486688614, "learning_rate": 7.668375425806952e-06, "loss": 0.461, "num_input_tokens_seen": 75468544, "step": 62050 }, { "epoch": 7.775341435910287, "grad_norm": 0.10458623617887497, "learning_rate": 7.667913059649468e-06, "loss": 0.47, "num_input_tokens_seen": 75474752, "step": 62055 }, { "epoch": 7.775967923819071, "grad_norm": 0.1049538105726242, "learning_rate": 7.667450661594844e-06, "loss": 0.4687, "num_input_tokens_seen": 75480896, "step": 62060 }, { "epoch": 7.776594411727854, "grad_norm": 0.10434015095233917, "learning_rate": 7.666988231648602e-06, "loss": 0.4587, "num_input_tokens_seen": 75486656, "step": 62065 }, { "epoch": 7.777220899636637, "grad_norm": 0.1186402440071106, "learning_rate": 7.666525769816274e-06, "loss": 0.4699, "num_input_tokens_seen": 75492960, "step": 62070 }, { "epoch": 7.77784738754542, "grad_norm": 0.07958189398050308, "learning_rate": 7.666063276103388e-06, "loss": 0.4664, "num_input_tokens_seen": 75498784, "step": 62075 }, { "epoch": 7.778473875454203, "grad_norm": 0.08609301596879959, "learning_rate": 7.665600750515476e-06, "loss": 0.4633, "num_input_tokens_seen": 75504512, "step": 62080 }, { "epoch": 7.779100363362987, "grad_norm": 0.11359182000160217, "learning_rate": 7.665138193058063e-06, "loss": 0.4578, "num_input_tokens_seen": 75510400, "step": 62085 }, { "epoch": 7.7797268512717705, "grad_norm": 0.08317632228136063, "learning_rate": 7.664675603736685e-06, "loss": 0.4699, "num_input_tokens_seen": 75516128, "step": 62090 }, { "epoch": 7.780353339180554, "grad_norm": 0.08563479036092758, "learning_rate": 7.664212982556868e-06, "loss": 0.4642, "num_input_tokens_seen": 75521952, "step": 62095 }, { "epoch": 7.780979827089337, "grad_norm": 0.08257723599672318, "learning_rate": 7.663750329524147e-06, "loss": 0.4539, "num_input_tokens_seen": 75528096, "step": 62100 }, { "epoch": 7.78160631499812, "grad_norm": 0.09735218435525894, "learning_rate": 7.663287644644048e-06, "loss": 0.4629, "num_input_tokens_seen": 75534208, "step": 62105 }, { "epoch": 7.782232802906904, "grad_norm": 0.08080022782087326, "learning_rate": 7.66282492792211e-06, "loss": 0.4603, "num_input_tokens_seen": 75540032, "step": 62110 }, { "epoch": 7.782859290815687, "grad_norm": 0.09334446489810944, "learning_rate": 7.662362179363859e-06, "loss": 0.4637, "num_input_tokens_seen": 75546528, "step": 62115 }, { "epoch": 7.7834857787244704, "grad_norm": 0.11260217428207397, "learning_rate": 7.66189939897483e-06, "loss": 0.4718, "num_input_tokens_seen": 75552448, "step": 62120 }, { "epoch": 7.784112266633254, "grad_norm": 0.07451770454645157, "learning_rate": 7.661436586760556e-06, "loss": 0.465, "num_input_tokens_seen": 75558368, "step": 62125 }, { "epoch": 7.784738754542038, "grad_norm": 0.054829105734825134, "learning_rate": 7.660973742726567e-06, "loss": 0.4572, "num_input_tokens_seen": 75564384, "step": 62130 }, { "epoch": 7.785365242450821, "grad_norm": 0.06132035702466965, "learning_rate": 7.660510866878404e-06, "loss": 0.4593, "num_input_tokens_seen": 75570496, "step": 62135 }, { "epoch": 7.785991730359604, "grad_norm": 0.098520427942276, "learning_rate": 7.660047959221595e-06, "loss": 0.4643, "num_input_tokens_seen": 75576416, "step": 62140 }, { "epoch": 7.786618218268387, "grad_norm": 0.0841437503695488, "learning_rate": 7.659585019761676e-06, "loss": 0.4591, "num_input_tokens_seen": 75582752, "step": 62145 }, { "epoch": 7.787244706177171, "grad_norm": 0.07811429351568222, "learning_rate": 7.659122048504182e-06, "loss": 0.4702, "num_input_tokens_seen": 75588288, "step": 62150 }, { "epoch": 7.787871194085954, "grad_norm": 0.1246330514550209, "learning_rate": 7.658659045454649e-06, "loss": 0.456, "num_input_tokens_seen": 75594304, "step": 62155 }, { "epoch": 7.788497681994738, "grad_norm": 0.08510523289442062, "learning_rate": 7.65819601061861e-06, "loss": 0.466, "num_input_tokens_seen": 75600320, "step": 62160 }, { "epoch": 7.789124169903521, "grad_norm": 0.08852635324001312, "learning_rate": 7.657732944001605e-06, "loss": 0.4644, "num_input_tokens_seen": 75606464, "step": 62165 }, { "epoch": 7.789750657812304, "grad_norm": 0.09143420308828354, "learning_rate": 7.657269845609168e-06, "loss": 0.4571, "num_input_tokens_seen": 75612416, "step": 62170 }, { "epoch": 7.790377145721088, "grad_norm": 0.09757102280855179, "learning_rate": 7.656806715446834e-06, "loss": 0.4624, "num_input_tokens_seen": 75618592, "step": 62175 }, { "epoch": 7.791003633629871, "grad_norm": 0.12286257743835449, "learning_rate": 7.656343553520143e-06, "loss": 0.4643, "num_input_tokens_seen": 75624544, "step": 62180 }, { "epoch": 7.791630121538654, "grad_norm": 0.0892561674118042, "learning_rate": 7.655880359834632e-06, "loss": 0.4525, "num_input_tokens_seen": 75630336, "step": 62185 }, { "epoch": 7.7922566094474375, "grad_norm": 0.1211135983467102, "learning_rate": 7.655417134395837e-06, "loss": 0.4642, "num_input_tokens_seen": 75636672, "step": 62190 }, { "epoch": 7.792883097356221, "grad_norm": 0.0876869410276413, "learning_rate": 7.654953877209299e-06, "loss": 0.4628, "num_input_tokens_seen": 75642112, "step": 62195 }, { "epoch": 7.793509585265005, "grad_norm": 0.07394306361675262, "learning_rate": 7.654490588280555e-06, "loss": 0.4728, "num_input_tokens_seen": 75648448, "step": 62200 }, { "epoch": 7.794136073173788, "grad_norm": 0.08071684092283249, "learning_rate": 7.654027267615146e-06, "loss": 0.4584, "num_input_tokens_seen": 75654560, "step": 62205 }, { "epoch": 7.794762561082571, "grad_norm": 0.07192264497280121, "learning_rate": 7.653563915218608e-06, "loss": 0.464, "num_input_tokens_seen": 75660544, "step": 62210 }, { "epoch": 7.795389048991354, "grad_norm": 0.08477803319692612, "learning_rate": 7.65310053109648e-06, "loss": 0.4695, "num_input_tokens_seen": 75666848, "step": 62215 }, { "epoch": 7.796015536900137, "grad_norm": 0.08323387801647186, "learning_rate": 7.652637115254306e-06, "loss": 0.4681, "num_input_tokens_seen": 75672544, "step": 62220 }, { "epoch": 7.796642024808921, "grad_norm": 0.08220966905355453, "learning_rate": 7.652173667697626e-06, "loss": 0.4642, "num_input_tokens_seen": 75678624, "step": 62225 }, { "epoch": 7.797268512717705, "grad_norm": 0.056293901056051254, "learning_rate": 7.65171018843198e-06, "loss": 0.465, "num_input_tokens_seen": 75684704, "step": 62230 }, { "epoch": 7.797895000626488, "grad_norm": 0.07943403720855713, "learning_rate": 7.651246677462909e-06, "loss": 0.4561, "num_input_tokens_seen": 75691232, "step": 62235 }, { "epoch": 7.798521488535271, "grad_norm": 0.11765410751104355, "learning_rate": 7.650783134795954e-06, "loss": 0.4663, "num_input_tokens_seen": 75697280, "step": 62240 }, { "epoch": 7.799147976444055, "grad_norm": 0.08730675280094147, "learning_rate": 7.650319560436658e-06, "loss": 0.4632, "num_input_tokens_seen": 75703520, "step": 62245 }, { "epoch": 7.799774464352838, "grad_norm": 0.07736849039793015, "learning_rate": 7.649855954390566e-06, "loss": 0.461, "num_input_tokens_seen": 75709536, "step": 62250 }, { "epoch": 7.800400952261621, "grad_norm": 0.07361283898353577, "learning_rate": 7.649392316663216e-06, "loss": 0.4621, "num_input_tokens_seen": 75715424, "step": 62255 }, { "epoch": 7.8010274401704045, "grad_norm": 0.07671648263931274, "learning_rate": 7.648928647260154e-06, "loss": 0.4567, "num_input_tokens_seen": 75721504, "step": 62260 }, { "epoch": 7.801653928079189, "grad_norm": 0.055462129414081573, "learning_rate": 7.648464946186923e-06, "loss": 0.4603, "num_input_tokens_seen": 75727680, "step": 62265 }, { "epoch": 7.802280415987972, "grad_norm": 0.0920281633734703, "learning_rate": 7.648001213449067e-06, "loss": 0.4588, "num_input_tokens_seen": 75733792, "step": 62270 }, { "epoch": 7.802906903896755, "grad_norm": 0.08606934547424316, "learning_rate": 7.647537449052129e-06, "loss": 0.4632, "num_input_tokens_seen": 75739456, "step": 62275 }, { "epoch": 7.803533391805538, "grad_norm": 0.06978408247232437, "learning_rate": 7.647073653001656e-06, "loss": 0.4558, "num_input_tokens_seen": 75745568, "step": 62280 }, { "epoch": 7.804159879714321, "grad_norm": 0.05757622793316841, "learning_rate": 7.646609825303191e-06, "loss": 0.4561, "num_input_tokens_seen": 75751808, "step": 62285 }, { "epoch": 7.804786367623105, "grad_norm": 0.0783018246293068, "learning_rate": 7.646145965962281e-06, "loss": 0.462, "num_input_tokens_seen": 75757760, "step": 62290 }, { "epoch": 7.8054128555318885, "grad_norm": 0.10306353867053986, "learning_rate": 7.64568207498447e-06, "loss": 0.4648, "num_input_tokens_seen": 75763936, "step": 62295 }, { "epoch": 7.806039343440672, "grad_norm": 0.10376010835170746, "learning_rate": 7.645218152375308e-06, "loss": 0.4662, "num_input_tokens_seen": 75769536, "step": 62300 }, { "epoch": 7.806665831349455, "grad_norm": 0.052599869668483734, "learning_rate": 7.644754198140337e-06, "loss": 0.463, "num_input_tokens_seen": 75775712, "step": 62305 }, { "epoch": 7.807292319258238, "grad_norm": 0.07644316554069519, "learning_rate": 7.644290212285107e-06, "loss": 0.4547, "num_input_tokens_seen": 75781120, "step": 62310 }, { "epoch": 7.807918807167022, "grad_norm": 0.05208173394203186, "learning_rate": 7.643826194815165e-06, "loss": 0.4628, "num_input_tokens_seen": 75787296, "step": 62315 }, { "epoch": 7.808545295075805, "grad_norm": 0.0808468833565712, "learning_rate": 7.643362145736057e-06, "loss": 0.4655, "num_input_tokens_seen": 75793760, "step": 62320 }, { "epoch": 7.809171782984588, "grad_norm": 0.1021169126033783, "learning_rate": 7.642898065053332e-06, "loss": 0.4487, "num_input_tokens_seen": 75800032, "step": 62325 }, { "epoch": 7.8097982708933715, "grad_norm": 0.11067573726177216, "learning_rate": 7.64243395277254e-06, "loss": 0.4539, "num_input_tokens_seen": 75806080, "step": 62330 }, { "epoch": 7.810424758802155, "grad_norm": 0.12739983201026917, "learning_rate": 7.641969808899225e-06, "loss": 0.4656, "num_input_tokens_seen": 75812064, "step": 62335 }, { "epoch": 7.811051246710939, "grad_norm": 0.10063484311103821, "learning_rate": 7.641505633438941e-06, "loss": 0.459, "num_input_tokens_seen": 75818208, "step": 62340 }, { "epoch": 7.811677734619722, "grad_norm": 0.09352836012840271, "learning_rate": 7.641041426397238e-06, "loss": 0.4639, "num_input_tokens_seen": 75824480, "step": 62345 }, { "epoch": 7.812304222528505, "grad_norm": 0.0791592001914978, "learning_rate": 7.640577187779664e-06, "loss": 0.4662, "num_input_tokens_seen": 75830880, "step": 62350 }, { "epoch": 7.812930710437288, "grad_norm": 0.07239709794521332, "learning_rate": 7.640112917591767e-06, "loss": 0.4644, "num_input_tokens_seen": 75836896, "step": 62355 }, { "epoch": 7.813557198346071, "grad_norm": 0.1295095980167389, "learning_rate": 7.639648615839104e-06, "loss": 0.4603, "num_input_tokens_seen": 75843168, "step": 62360 }, { "epoch": 7.8141836862548555, "grad_norm": 0.0948278158903122, "learning_rate": 7.63918428252722e-06, "loss": 0.4594, "num_input_tokens_seen": 75849248, "step": 62365 }, { "epoch": 7.814810174163639, "grad_norm": 2.369875192642212, "learning_rate": 7.63871991766167e-06, "loss": 0.4563, "num_input_tokens_seen": 75855392, "step": 62370 }, { "epoch": 7.815436662072422, "grad_norm": 0.4198831617832184, "learning_rate": 7.638255521248005e-06, "loss": 0.6383, "num_input_tokens_seen": 75861568, "step": 62375 }, { "epoch": 7.816063149981205, "grad_norm": 1.9785351753234863, "learning_rate": 7.637791093291777e-06, "loss": 0.4692, "num_input_tokens_seen": 75867840, "step": 62380 }, { "epoch": 7.816689637889989, "grad_norm": 2.7382888793945312, "learning_rate": 7.637326633798539e-06, "loss": 0.5149, "num_input_tokens_seen": 75873920, "step": 62385 }, { "epoch": 7.817316125798772, "grad_norm": 4.442965030670166, "learning_rate": 7.636862142773841e-06, "loss": 0.461, "num_input_tokens_seen": 75879936, "step": 62390 }, { "epoch": 7.817942613707555, "grad_norm": 1.0054830312728882, "learning_rate": 7.636397620223244e-06, "loss": 0.4632, "num_input_tokens_seen": 75886336, "step": 62395 }, { "epoch": 7.818569101616339, "grad_norm": 0.24655672907829285, "learning_rate": 7.635933066152293e-06, "loss": 0.455, "num_input_tokens_seen": 75892448, "step": 62400 }, { "epoch": 7.819195589525123, "grad_norm": 0.9754849672317505, "learning_rate": 7.635468480566547e-06, "loss": 0.4639, "num_input_tokens_seen": 75898688, "step": 62405 }, { "epoch": 7.819822077433906, "grad_norm": 0.16115610301494598, "learning_rate": 7.635003863471562e-06, "loss": 0.4636, "num_input_tokens_seen": 75904864, "step": 62410 }, { "epoch": 7.820448565342689, "grad_norm": 0.15505436062812805, "learning_rate": 7.634539214872887e-06, "loss": 0.4672, "num_input_tokens_seen": 75911104, "step": 62415 }, { "epoch": 7.821075053251472, "grad_norm": 0.11557396501302719, "learning_rate": 7.634074534776083e-06, "loss": 0.4651, "num_input_tokens_seen": 75917344, "step": 62420 }, { "epoch": 7.821701541160255, "grad_norm": 0.13187886774539948, "learning_rate": 7.633609823186704e-06, "loss": 0.4557, "num_input_tokens_seen": 75923488, "step": 62425 }, { "epoch": 7.822328029069039, "grad_norm": 0.11928503215312958, "learning_rate": 7.633145080110302e-06, "loss": 0.4619, "num_input_tokens_seen": 75929600, "step": 62430 }, { "epoch": 7.8229545169778225, "grad_norm": 0.08125373721122742, "learning_rate": 7.632680305552439e-06, "loss": 0.4656, "num_input_tokens_seen": 75935776, "step": 62435 }, { "epoch": 7.823581004886606, "grad_norm": 0.09252431243658066, "learning_rate": 7.632215499518669e-06, "loss": 0.4568, "num_input_tokens_seen": 75941568, "step": 62440 }, { "epoch": 7.824207492795389, "grad_norm": 0.09167847037315369, "learning_rate": 7.63175066201455e-06, "loss": 0.4553, "num_input_tokens_seen": 75947392, "step": 62445 }, { "epoch": 7.824833980704172, "grad_norm": 0.08647563308477402, "learning_rate": 7.631285793045637e-06, "loss": 0.4575, "num_input_tokens_seen": 75953664, "step": 62450 }, { "epoch": 7.825460468612956, "grad_norm": 0.07841606438159943, "learning_rate": 7.630820892617494e-06, "loss": 0.4655, "num_input_tokens_seen": 75959968, "step": 62455 }, { "epoch": 7.826086956521739, "grad_norm": 0.11082249879837036, "learning_rate": 7.630355960735673e-06, "loss": 0.4624, "num_input_tokens_seen": 75966080, "step": 62460 }, { "epoch": 7.826713444430522, "grad_norm": 0.11851063370704651, "learning_rate": 7.629890997405735e-06, "loss": 0.4634, "num_input_tokens_seen": 75971872, "step": 62465 }, { "epoch": 7.827339932339306, "grad_norm": 0.11643077433109283, "learning_rate": 7.629426002633239e-06, "loss": 0.4624, "num_input_tokens_seen": 75977888, "step": 62470 }, { "epoch": 7.827966420248089, "grad_norm": 0.11778703331947327, "learning_rate": 7.628960976423745e-06, "loss": 0.4705, "num_input_tokens_seen": 75983968, "step": 62475 }, { "epoch": 7.828592908156873, "grad_norm": 0.1394507735967636, "learning_rate": 7.628495918782812e-06, "loss": 0.4553, "num_input_tokens_seen": 75990240, "step": 62480 }, { "epoch": 7.829219396065656, "grad_norm": 0.1295921802520752, "learning_rate": 7.628030829716e-06, "loss": 0.4653, "num_input_tokens_seen": 75996800, "step": 62485 }, { "epoch": 7.829845883974439, "grad_norm": 0.15508317947387695, "learning_rate": 7.6275657092288705e-06, "loss": 0.4674, "num_input_tokens_seen": 76002304, "step": 62490 }, { "epoch": 7.830472371883222, "grad_norm": 0.16058264672756195, "learning_rate": 7.627100557326983e-06, "loss": 0.4579, "num_input_tokens_seen": 76008416, "step": 62495 }, { "epoch": 7.831098859792006, "grad_norm": 0.13575716316699982, "learning_rate": 7.6266353740159e-06, "loss": 0.4576, "num_input_tokens_seen": 76014656, "step": 62500 }, { "epoch": 7.8317253477007895, "grad_norm": 0.13247421383857727, "learning_rate": 7.6261701593011835e-06, "loss": 0.4609, "num_input_tokens_seen": 76021088, "step": 62505 }, { "epoch": 7.832351835609573, "grad_norm": 0.22748450934886932, "learning_rate": 7.6257049131883946e-06, "loss": 0.4625, "num_input_tokens_seen": 76027392, "step": 62510 }, { "epoch": 7.832978323518356, "grad_norm": 0.1995498687028885, "learning_rate": 7.625239635683095e-06, "loss": 0.4692, "num_input_tokens_seen": 76033920, "step": 62515 }, { "epoch": 7.83360481142714, "grad_norm": 0.2329334318637848, "learning_rate": 7.624774326790849e-06, "loss": 0.4507, "num_input_tokens_seen": 76039840, "step": 62520 }, { "epoch": 7.834231299335923, "grad_norm": 0.13205118477344513, "learning_rate": 7.62430898651722e-06, "loss": 0.4716, "num_input_tokens_seen": 76045760, "step": 62525 }, { "epoch": 7.834857787244706, "grad_norm": 0.1837591528892517, "learning_rate": 7.623843614867769e-06, "loss": 0.4628, "num_input_tokens_seen": 76052032, "step": 62530 }, { "epoch": 7.835484275153489, "grad_norm": 0.11614161729812622, "learning_rate": 7.6233782118480625e-06, "loss": 0.475, "num_input_tokens_seen": 76058016, "step": 62535 }, { "epoch": 7.836110763062273, "grad_norm": 0.21166111528873444, "learning_rate": 7.622912777463664e-06, "loss": 0.4675, "num_input_tokens_seen": 76064160, "step": 62540 }, { "epoch": 7.836737250971057, "grad_norm": 0.13497479259967804, "learning_rate": 7.622447311720136e-06, "loss": 0.4627, "num_input_tokens_seen": 76070432, "step": 62545 }, { "epoch": 7.83736373887984, "grad_norm": 0.08973246812820435, "learning_rate": 7.621981814623047e-06, "loss": 0.458, "num_input_tokens_seen": 76075808, "step": 62550 }, { "epoch": 7.837990226788623, "grad_norm": 0.12540462613105774, "learning_rate": 7.621516286177962e-06, "loss": 0.4492, "num_input_tokens_seen": 76081984, "step": 62555 }, { "epoch": 7.838616714697406, "grad_norm": 0.10579812526702881, "learning_rate": 7.621050726390446e-06, "loss": 0.4594, "num_input_tokens_seen": 76088096, "step": 62560 }, { "epoch": 7.839243202606189, "grad_norm": 0.1218864843249321, "learning_rate": 7.6205851352660635e-06, "loss": 0.4627, "num_input_tokens_seen": 76094240, "step": 62565 }, { "epoch": 7.839869690514973, "grad_norm": 0.1307074874639511, "learning_rate": 7.620119512810382e-06, "loss": 0.4607, "num_input_tokens_seen": 76100480, "step": 62570 }, { "epoch": 7.840496178423757, "grad_norm": 0.09577556699514389, "learning_rate": 7.619653859028971e-06, "loss": 0.4643, "num_input_tokens_seen": 76105920, "step": 62575 }, { "epoch": 7.84112266633254, "grad_norm": 0.1054406687617302, "learning_rate": 7.619188173927394e-06, "loss": 0.4565, "num_input_tokens_seen": 76112192, "step": 62580 }, { "epoch": 7.841749154241323, "grad_norm": 0.10759574174880981, "learning_rate": 7.618722457511219e-06, "loss": 0.4652, "num_input_tokens_seen": 76118112, "step": 62585 }, { "epoch": 7.842375642150106, "grad_norm": 0.11496224254369736, "learning_rate": 7.6182567097860186e-06, "loss": 0.4523, "num_input_tokens_seen": 76124640, "step": 62590 }, { "epoch": 7.84300213005889, "grad_norm": 0.1365867257118225, "learning_rate": 7.617790930757356e-06, "loss": 0.4651, "num_input_tokens_seen": 76130912, "step": 62595 }, { "epoch": 7.843628617967673, "grad_norm": 0.12170465290546417, "learning_rate": 7.6173251204308005e-06, "loss": 0.465, "num_input_tokens_seen": 76137056, "step": 62600 }, { "epoch": 7.8442551058764565, "grad_norm": 0.11048150807619095, "learning_rate": 7.616859278811924e-06, "loss": 0.4595, "num_input_tokens_seen": 76142848, "step": 62605 }, { "epoch": 7.84488159378524, "grad_norm": 0.11867246776819229, "learning_rate": 7.616393405906295e-06, "loss": 0.4675, "num_input_tokens_seen": 76148832, "step": 62610 }, { "epoch": 7.845508081694023, "grad_norm": 0.09055320173501968, "learning_rate": 7.615927501719484e-06, "loss": 0.4635, "num_input_tokens_seen": 76154912, "step": 62615 }, { "epoch": 7.846134569602807, "grad_norm": 0.1511683613061905, "learning_rate": 7.61546156625706e-06, "loss": 0.4653, "num_input_tokens_seen": 76161376, "step": 62620 }, { "epoch": 7.84676105751159, "grad_norm": 0.09329769760370255, "learning_rate": 7.614995599524593e-06, "loss": 0.4648, "num_input_tokens_seen": 76167200, "step": 62625 }, { "epoch": 7.847387545420373, "grad_norm": 0.06053312495350838, "learning_rate": 7.614529601527656e-06, "loss": 0.4716, "num_input_tokens_seen": 76173600, "step": 62630 }, { "epoch": 7.848014033329156, "grad_norm": 0.06299307942390442, "learning_rate": 7.614063572271819e-06, "loss": 0.4591, "num_input_tokens_seen": 76179904, "step": 62635 }, { "epoch": 7.84864052123794, "grad_norm": 0.14396850764751434, "learning_rate": 7.6135975117626534e-06, "loss": 0.4686, "num_input_tokens_seen": 76185984, "step": 62640 }, { "epoch": 7.849267009146724, "grad_norm": 0.1271650195121765, "learning_rate": 7.613131420005733e-06, "loss": 0.4676, "num_input_tokens_seen": 76191808, "step": 62645 }, { "epoch": 7.849893497055507, "grad_norm": 0.11768459528684616, "learning_rate": 7.6126652970066295e-06, "loss": 0.4634, "num_input_tokens_seen": 76198080, "step": 62650 }, { "epoch": 7.85051998496429, "grad_norm": 0.07916747778654099, "learning_rate": 7.612199142770915e-06, "loss": 0.4619, "num_input_tokens_seen": 76204288, "step": 62655 }, { "epoch": 7.851146472873074, "grad_norm": 0.10771192610263824, "learning_rate": 7.611732957304163e-06, "loss": 0.4656, "num_input_tokens_seen": 76210432, "step": 62660 }, { "epoch": 7.851772960781857, "grad_norm": 0.08783330768346786, "learning_rate": 7.611266740611949e-06, "loss": 0.4646, "num_input_tokens_seen": 76216256, "step": 62665 }, { "epoch": 7.85239944869064, "grad_norm": 0.0942983403801918, "learning_rate": 7.6108004926998454e-06, "loss": 0.4589, "num_input_tokens_seen": 76222368, "step": 62670 }, { "epoch": 7.8530259365994235, "grad_norm": 0.1124664768576622, "learning_rate": 7.610334213573428e-06, "loss": 0.4641, "num_input_tokens_seen": 76228672, "step": 62675 }, { "epoch": 7.853652424508207, "grad_norm": 0.10896507650613785, "learning_rate": 7.6098679032382694e-06, "loss": 0.4589, "num_input_tokens_seen": 76234784, "step": 62680 }, { "epoch": 7.854278912416991, "grad_norm": 0.11659996211528778, "learning_rate": 7.609401561699946e-06, "loss": 0.4624, "num_input_tokens_seen": 76241088, "step": 62685 }, { "epoch": 7.854905400325774, "grad_norm": 0.08982180058956146, "learning_rate": 7.608935188964033e-06, "loss": 0.4682, "num_input_tokens_seen": 76247232, "step": 62690 }, { "epoch": 7.855531888234557, "grad_norm": 0.07734062522649765, "learning_rate": 7.608468785036105e-06, "loss": 0.4619, "num_input_tokens_seen": 76253344, "step": 62695 }, { "epoch": 7.85615837614334, "grad_norm": 0.10032709687948227, "learning_rate": 7.608002349921742e-06, "loss": 0.4836, "num_input_tokens_seen": 76259296, "step": 62700 }, { "epoch": 7.856784864052123, "grad_norm": 0.09127987176179886, "learning_rate": 7.607535883626516e-06, "loss": 0.4679, "num_input_tokens_seen": 76265376, "step": 62705 }, { "epoch": 7.8574113519609075, "grad_norm": 0.11903414875268936, "learning_rate": 7.6070693861560075e-06, "loss": 0.4563, "num_input_tokens_seen": 76271552, "step": 62710 }, { "epoch": 7.858037839869691, "grad_norm": 0.09095823764801025, "learning_rate": 7.606602857515792e-06, "loss": 0.4616, "num_input_tokens_seen": 76277760, "step": 62715 }, { "epoch": 7.858664327778474, "grad_norm": 0.1018362045288086, "learning_rate": 7.606136297711448e-06, "loss": 0.4607, "num_input_tokens_seen": 76284064, "step": 62720 }, { "epoch": 7.859290815687257, "grad_norm": 0.08551813662052155, "learning_rate": 7.605669706748553e-06, "loss": 0.4587, "num_input_tokens_seen": 76290336, "step": 62725 }, { "epoch": 7.85991730359604, "grad_norm": 0.14000168442726135, "learning_rate": 7.605203084632687e-06, "loss": 0.4677, "num_input_tokens_seen": 76295968, "step": 62730 }, { "epoch": 7.860543791504824, "grad_norm": 0.06945189088582993, "learning_rate": 7.604736431369428e-06, "loss": 0.4653, "num_input_tokens_seen": 76302112, "step": 62735 }, { "epoch": 7.861170279413607, "grad_norm": 0.10066381096839905, "learning_rate": 7.604269746964354e-06, "loss": 0.4601, "num_input_tokens_seen": 76308448, "step": 62740 }, { "epoch": 7.8617967673223905, "grad_norm": 0.0982002392411232, "learning_rate": 7.603803031423046e-06, "loss": 0.4589, "num_input_tokens_seen": 76314688, "step": 62745 }, { "epoch": 7.862423255231174, "grad_norm": 0.09336145222187042, "learning_rate": 7.603336284751082e-06, "loss": 0.4605, "num_input_tokens_seen": 76320992, "step": 62750 }, { "epoch": 7.863049743139958, "grad_norm": 0.15935127437114716, "learning_rate": 7.602869506954047e-06, "loss": 0.4632, "num_input_tokens_seen": 76326880, "step": 62755 }, { "epoch": 7.863676231048741, "grad_norm": 0.13025572896003723, "learning_rate": 7.6024026980375165e-06, "loss": 0.4614, "num_input_tokens_seen": 76332768, "step": 62760 }, { "epoch": 7.864302718957524, "grad_norm": 0.08320619910955429, "learning_rate": 7.601935858007075e-06, "loss": 0.4638, "num_input_tokens_seen": 76338464, "step": 62765 }, { "epoch": 7.864929206866307, "grad_norm": 0.10711296647787094, "learning_rate": 7.601468986868302e-06, "loss": 0.4615, "num_input_tokens_seen": 76344768, "step": 62770 }, { "epoch": 7.865555694775091, "grad_norm": 0.13919374346733093, "learning_rate": 7.60100208462678e-06, "loss": 0.4565, "num_input_tokens_seen": 76350912, "step": 62775 }, { "epoch": 7.8661821826838745, "grad_norm": 0.08850644528865814, "learning_rate": 7.600535151288091e-06, "loss": 0.4573, "num_input_tokens_seen": 76356928, "step": 62780 }, { "epoch": 7.866808670592658, "grad_norm": 0.10165134817361832, "learning_rate": 7.600068186857819e-06, "loss": 0.4629, "num_input_tokens_seen": 76363232, "step": 62785 }, { "epoch": 7.867435158501441, "grad_norm": 0.09329693764448166, "learning_rate": 7.5996011913415455e-06, "loss": 0.4605, "num_input_tokens_seen": 76369536, "step": 62790 }, { "epoch": 7.868061646410224, "grad_norm": 0.10219024866819382, "learning_rate": 7.599134164744853e-06, "loss": 0.46, "num_input_tokens_seen": 76375872, "step": 62795 }, { "epoch": 7.868688134319008, "grad_norm": 0.14683927595615387, "learning_rate": 7.598667107073328e-06, "loss": 0.4587, "num_input_tokens_seen": 76381984, "step": 62800 }, { "epoch": 7.869314622227791, "grad_norm": 0.09589944034814835, "learning_rate": 7.598200018332551e-06, "loss": 0.4616, "num_input_tokens_seen": 76387680, "step": 62805 }, { "epoch": 7.869941110136574, "grad_norm": 0.08612405508756638, "learning_rate": 7.59773289852811e-06, "loss": 0.457, "num_input_tokens_seen": 76393728, "step": 62810 }, { "epoch": 7.8705675980453575, "grad_norm": 0.10290252417325974, "learning_rate": 7.597265747665588e-06, "loss": 0.4626, "num_input_tokens_seen": 76399296, "step": 62815 }, { "epoch": 7.871194085954141, "grad_norm": 0.07840383052825928, "learning_rate": 7.596798565750569e-06, "loss": 0.4646, "num_input_tokens_seen": 76405248, "step": 62820 }, { "epoch": 7.871820573862925, "grad_norm": 0.12597893178462982, "learning_rate": 7.596331352788641e-06, "loss": 0.4666, "num_input_tokens_seen": 76411648, "step": 62825 }, { "epoch": 7.872447061771708, "grad_norm": 0.11134250462055206, "learning_rate": 7.595864108785389e-06, "loss": 0.463, "num_input_tokens_seen": 76417696, "step": 62830 }, { "epoch": 7.873073549680491, "grad_norm": 0.1043124571442604, "learning_rate": 7.595396833746398e-06, "loss": 0.4574, "num_input_tokens_seen": 76423744, "step": 62835 }, { "epoch": 7.873700037589274, "grad_norm": 0.14168229699134827, "learning_rate": 7.594929527677255e-06, "loss": 0.4584, "num_input_tokens_seen": 76429824, "step": 62840 }, { "epoch": 7.8743265254980574, "grad_norm": 0.10427196323871613, "learning_rate": 7.59446219058355e-06, "loss": 0.4647, "num_input_tokens_seen": 76435616, "step": 62845 }, { "epoch": 7.8749530134068415, "grad_norm": 0.09803080558776855, "learning_rate": 7.593994822470867e-06, "loss": 0.4599, "num_input_tokens_seen": 76441824, "step": 62850 }, { "epoch": 7.875579501315625, "grad_norm": 0.09578754752874374, "learning_rate": 7.593527423344795e-06, "loss": 0.4577, "num_input_tokens_seen": 76447872, "step": 62855 }, { "epoch": 7.876205989224408, "grad_norm": 0.09992840886116028, "learning_rate": 7.5930599932109224e-06, "loss": 0.4538, "num_input_tokens_seen": 76453984, "step": 62860 }, { "epoch": 7.876832477133191, "grad_norm": 0.10504050552845001, "learning_rate": 7.592592532074838e-06, "loss": 0.4653, "num_input_tokens_seen": 76459968, "step": 62865 }, { "epoch": 7.877458965041975, "grad_norm": 0.11702192574739456, "learning_rate": 7.59212503994213e-06, "loss": 0.4562, "num_input_tokens_seen": 76466144, "step": 62870 }, { "epoch": 7.878085452950758, "grad_norm": 0.0986773893237114, "learning_rate": 7.591657516818387e-06, "loss": 0.4556, "num_input_tokens_seen": 76472288, "step": 62875 }, { "epoch": 7.878711940859541, "grad_norm": 0.09838441014289856, "learning_rate": 7.5911899627092e-06, "loss": 0.4567, "num_input_tokens_seen": 76478464, "step": 62880 }, { "epoch": 7.879338428768325, "grad_norm": 0.09974240511655807, "learning_rate": 7.590722377620159e-06, "loss": 0.4575, "num_input_tokens_seen": 76484416, "step": 62885 }, { "epoch": 7.879964916677108, "grad_norm": 0.06403499096632004, "learning_rate": 7.5902547615568535e-06, "loss": 0.4546, "num_input_tokens_seen": 76490176, "step": 62890 }, { "epoch": 7.880591404585892, "grad_norm": 0.20829597115516663, "learning_rate": 7.5897871145248735e-06, "loss": 0.4728, "num_input_tokens_seen": 76496512, "step": 62895 }, { "epoch": 7.881217892494675, "grad_norm": 0.21905195713043213, "learning_rate": 7.589319436529812e-06, "loss": 0.4557, "num_input_tokens_seen": 76502880, "step": 62900 }, { "epoch": 7.881844380403458, "grad_norm": 0.12274157255887985, "learning_rate": 7.58885172757726e-06, "loss": 0.4795, "num_input_tokens_seen": 76508448, "step": 62905 }, { "epoch": 7.882470868312241, "grad_norm": 0.06851492077112198, "learning_rate": 7.588383987672807e-06, "loss": 0.4623, "num_input_tokens_seen": 76514624, "step": 62910 }, { "epoch": 7.883097356221025, "grad_norm": 0.13097596168518066, "learning_rate": 7.587916216822049e-06, "loss": 0.4528, "num_input_tokens_seen": 76520544, "step": 62915 }, { "epoch": 7.8837238441298085, "grad_norm": 0.09040746092796326, "learning_rate": 7.587448415030577e-06, "loss": 0.4794, "num_input_tokens_seen": 76526464, "step": 62920 }, { "epoch": 7.884350332038592, "grad_norm": 0.09560813009738922, "learning_rate": 7.5869805823039845e-06, "loss": 0.4618, "num_input_tokens_seen": 76532832, "step": 62925 }, { "epoch": 7.884976819947375, "grad_norm": 0.15390080213546753, "learning_rate": 7.586512718647862e-06, "loss": 0.4594, "num_input_tokens_seen": 76538592, "step": 62930 }, { "epoch": 7.885603307856158, "grad_norm": 0.09106273204088211, "learning_rate": 7.5860448240678065e-06, "loss": 0.4618, "num_input_tokens_seen": 76544736, "step": 62935 }, { "epoch": 7.886229795764942, "grad_norm": 0.1834610551595688, "learning_rate": 7.585576898569411e-06, "loss": 0.4641, "num_input_tokens_seen": 76550336, "step": 62940 }, { "epoch": 7.886856283673725, "grad_norm": 0.13652363419532776, "learning_rate": 7.58510894215827e-06, "loss": 0.4563, "num_input_tokens_seen": 76556320, "step": 62945 }, { "epoch": 7.887482771582508, "grad_norm": 0.139313206076622, "learning_rate": 7.584640954839978e-06, "loss": 0.4498, "num_input_tokens_seen": 76562688, "step": 62950 }, { "epoch": 7.888109259491292, "grad_norm": 0.1441897749900818, "learning_rate": 7.584172936620129e-06, "loss": 0.4681, "num_input_tokens_seen": 76569344, "step": 62955 }, { "epoch": 7.888735747400075, "grad_norm": 0.15576998889446259, "learning_rate": 7.583704887504322e-06, "loss": 0.4598, "num_input_tokens_seen": 76575200, "step": 62960 }, { "epoch": 7.889362235308859, "grad_norm": 0.13011832535266876, "learning_rate": 7.5832368074981484e-06, "loss": 0.4699, "num_input_tokens_seen": 76581248, "step": 62965 }, { "epoch": 7.889988723217642, "grad_norm": 0.15154200792312622, "learning_rate": 7.582768696607209e-06, "loss": 0.4532, "num_input_tokens_seen": 76586848, "step": 62970 }, { "epoch": 7.890615211126425, "grad_norm": 0.1139909029006958, "learning_rate": 7.582300554837098e-06, "loss": 0.4688, "num_input_tokens_seen": 76593024, "step": 62975 }, { "epoch": 7.891241699035208, "grad_norm": 0.09868203103542328, "learning_rate": 7.581832382193412e-06, "loss": 0.4607, "num_input_tokens_seen": 76599232, "step": 62980 }, { "epoch": 7.8918681869439915, "grad_norm": 0.1153579130768776, "learning_rate": 7.581364178681749e-06, "loss": 0.4546, "num_input_tokens_seen": 76605536, "step": 62985 }, { "epoch": 7.892494674852776, "grad_norm": 0.13569344580173492, "learning_rate": 7.580895944307709e-06, "loss": 0.4542, "num_input_tokens_seen": 76611712, "step": 62990 }, { "epoch": 7.893121162761559, "grad_norm": 0.08882425725460052, "learning_rate": 7.580427679076887e-06, "loss": 0.4625, "num_input_tokens_seen": 76617664, "step": 62995 }, { "epoch": 7.893747650670342, "grad_norm": 0.10568958520889282, "learning_rate": 7.579959382994881e-06, "loss": 0.4673, "num_input_tokens_seen": 76623680, "step": 63000 }, { "epoch": 7.894374138579125, "grad_norm": 0.11479262262582779, "learning_rate": 7.579491056067293e-06, "loss": 0.4624, "num_input_tokens_seen": 76629984, "step": 63005 }, { "epoch": 7.895000626487909, "grad_norm": 0.1388799399137497, "learning_rate": 7.579022698299721e-06, "loss": 0.4705, "num_input_tokens_seen": 76635872, "step": 63010 }, { "epoch": 7.895627114396692, "grad_norm": 0.08474055677652359, "learning_rate": 7.578554309697763e-06, "loss": 0.4677, "num_input_tokens_seen": 76642112, "step": 63015 }, { "epoch": 7.8962536023054755, "grad_norm": 0.1561378538608551, "learning_rate": 7.578085890267022e-06, "loss": 0.453, "num_input_tokens_seen": 76648192, "step": 63020 }, { "epoch": 7.896880090214259, "grad_norm": 0.14083294570446014, "learning_rate": 7.577617440013094e-06, "loss": 0.4591, "num_input_tokens_seen": 76654432, "step": 63025 }, { "epoch": 7.897506578123043, "grad_norm": 0.13716597855091095, "learning_rate": 7.5771489589415845e-06, "loss": 0.466, "num_input_tokens_seen": 76660800, "step": 63030 }, { "epoch": 7.898133066031826, "grad_norm": 0.08243893831968307, "learning_rate": 7.5766804470580935e-06, "loss": 0.4537, "num_input_tokens_seen": 76666848, "step": 63035 }, { "epoch": 7.898759553940609, "grad_norm": 0.13620419800281525, "learning_rate": 7.576211904368219e-06, "loss": 0.4503, "num_input_tokens_seen": 76672864, "step": 63040 }, { "epoch": 7.899386041849392, "grad_norm": 0.09767472743988037, "learning_rate": 7.575743330877567e-06, "loss": 0.4624, "num_input_tokens_seen": 76678912, "step": 63045 }, { "epoch": 7.900012529758175, "grad_norm": 0.14517945051193237, "learning_rate": 7.575274726591737e-06, "loss": 0.4696, "num_input_tokens_seen": 76684800, "step": 63050 }, { "epoch": 7.900639017666959, "grad_norm": 0.10039137303829193, "learning_rate": 7.5748060915163336e-06, "loss": 0.4622, "num_input_tokens_seen": 76690880, "step": 63055 }, { "epoch": 7.901265505575743, "grad_norm": 0.11389844864606857, "learning_rate": 7.5743374256569555e-06, "loss": 0.4619, "num_input_tokens_seen": 76697088, "step": 63060 }, { "epoch": 7.901891993484526, "grad_norm": 0.07676661014556885, "learning_rate": 7.5738687290192115e-06, "loss": 0.4595, "num_input_tokens_seen": 76703072, "step": 63065 }, { "epoch": 7.902518481393309, "grad_norm": 0.0899953693151474, "learning_rate": 7.573400001608703e-06, "loss": 0.459, "num_input_tokens_seen": 76709088, "step": 63070 }, { "epoch": 7.903144969302092, "grad_norm": 0.12905333936214447, "learning_rate": 7.572931243431033e-06, "loss": 0.464, "num_input_tokens_seen": 76715168, "step": 63075 }, { "epoch": 7.903771457210876, "grad_norm": 0.07079152017831802, "learning_rate": 7.572462454491807e-06, "loss": 0.4701, "num_input_tokens_seen": 76721536, "step": 63080 }, { "epoch": 7.904397945119659, "grad_norm": 0.12749816477298737, "learning_rate": 7.5719936347966295e-06, "loss": 0.4637, "num_input_tokens_seen": 76727520, "step": 63085 }, { "epoch": 7.9050244330284425, "grad_norm": 0.1170663982629776, "learning_rate": 7.5715247843511066e-06, "loss": 0.4671, "num_input_tokens_seen": 76733600, "step": 63090 }, { "epoch": 7.905650920937226, "grad_norm": 0.10075537860393524, "learning_rate": 7.571055903160842e-06, "loss": 0.4632, "num_input_tokens_seen": 76739872, "step": 63095 }, { "epoch": 7.906277408846009, "grad_norm": 0.08881273120641708, "learning_rate": 7.570586991231443e-06, "loss": 0.4714, "num_input_tokens_seen": 76746240, "step": 63100 }, { "epoch": 7.906903896754793, "grad_norm": 0.1631920337677002, "learning_rate": 7.570118048568516e-06, "loss": 0.4664, "num_input_tokens_seen": 76752480, "step": 63105 }, { "epoch": 7.907530384663576, "grad_norm": 0.07227270305156708, "learning_rate": 7.569649075177665e-06, "loss": 0.4605, "num_input_tokens_seen": 76758496, "step": 63110 }, { "epoch": 7.908156872572359, "grad_norm": 0.076279416680336, "learning_rate": 7.5691800710644996e-06, "loss": 0.4612, "num_input_tokens_seen": 76764416, "step": 63115 }, { "epoch": 7.908783360481142, "grad_norm": 0.13562971353530884, "learning_rate": 7.568711036234626e-06, "loss": 0.4591, "num_input_tokens_seen": 76770400, "step": 63120 }, { "epoch": 7.9094098483899264, "grad_norm": 0.07432311773300171, "learning_rate": 7.5682419706936524e-06, "loss": 0.4578, "num_input_tokens_seen": 76776608, "step": 63125 }, { "epoch": 7.91003633629871, "grad_norm": 0.13799820840358734, "learning_rate": 7.567772874447188e-06, "loss": 0.4572, "num_input_tokens_seen": 76782784, "step": 63130 }, { "epoch": 7.910662824207493, "grad_norm": 0.09721391648054123, "learning_rate": 7.567303747500838e-06, "loss": 0.4585, "num_input_tokens_seen": 76788992, "step": 63135 }, { "epoch": 7.911289312116276, "grad_norm": 0.11789091676473618, "learning_rate": 7.566834589860215e-06, "loss": 0.4658, "num_input_tokens_seen": 76795136, "step": 63140 }, { "epoch": 7.91191580002506, "grad_norm": 0.09862196445465088, "learning_rate": 7.5663654015309264e-06, "loss": 0.4637, "num_input_tokens_seen": 76801248, "step": 63145 }, { "epoch": 7.912542287933843, "grad_norm": 0.12149626761674881, "learning_rate": 7.5658961825185804e-06, "loss": 0.4689, "num_input_tokens_seen": 76807104, "step": 63150 }, { "epoch": 7.913168775842626, "grad_norm": 0.12904860079288483, "learning_rate": 7.56542693282879e-06, "loss": 0.474, "num_input_tokens_seen": 76813024, "step": 63155 }, { "epoch": 7.9137952637514095, "grad_norm": 0.09870535135269165, "learning_rate": 7.564957652467162e-06, "loss": 0.4623, "num_input_tokens_seen": 76819104, "step": 63160 }, { "epoch": 7.914421751660193, "grad_norm": 0.09453198313713074, "learning_rate": 7.564488341439309e-06, "loss": 0.4483, "num_input_tokens_seen": 76825536, "step": 63165 }, { "epoch": 7.915048239568977, "grad_norm": 0.0619112104177475, "learning_rate": 7.564018999750842e-06, "loss": 0.4548, "num_input_tokens_seen": 76831776, "step": 63170 }, { "epoch": 7.91567472747776, "grad_norm": 0.11632607132196426, "learning_rate": 7.563549627407373e-06, "loss": 0.4568, "num_input_tokens_seen": 76838240, "step": 63175 }, { "epoch": 7.916301215386543, "grad_norm": 0.14183342456817627, "learning_rate": 7.5630802244145116e-06, "loss": 0.4609, "num_input_tokens_seen": 76844256, "step": 63180 }, { "epoch": 7.916927703295326, "grad_norm": 0.09407863020896912, "learning_rate": 7.562610790777873e-06, "loss": 0.456, "num_input_tokens_seen": 76850208, "step": 63185 }, { "epoch": 7.917554191204109, "grad_norm": 0.12490013241767883, "learning_rate": 7.5621413265030674e-06, "loss": 0.4566, "num_input_tokens_seen": 76856224, "step": 63190 }, { "epoch": 7.9181806791128935, "grad_norm": 0.10407227277755737, "learning_rate": 7.561671831595707e-06, "loss": 0.4658, "num_input_tokens_seen": 76862688, "step": 63195 }, { "epoch": 7.918807167021677, "grad_norm": 0.1211896538734436, "learning_rate": 7.561202306061407e-06, "loss": 0.4599, "num_input_tokens_seen": 76868736, "step": 63200 }, { "epoch": 7.91943365493046, "grad_norm": 0.10239993780851364, "learning_rate": 7.560732749905779e-06, "loss": 0.4619, "num_input_tokens_seen": 76875040, "step": 63205 }, { "epoch": 7.920060142839243, "grad_norm": 0.1325109601020813, "learning_rate": 7.560263163134438e-06, "loss": 0.4607, "num_input_tokens_seen": 76881184, "step": 63210 }, { "epoch": 7.920686630748026, "grad_norm": 0.10772515088319778, "learning_rate": 7.559793545753e-06, "loss": 0.4596, "num_input_tokens_seen": 76887456, "step": 63215 }, { "epoch": 7.92131311865681, "grad_norm": 0.08486191928386688, "learning_rate": 7.559323897767077e-06, "loss": 0.4691, "num_input_tokens_seen": 76893664, "step": 63220 }, { "epoch": 7.921939606565593, "grad_norm": 0.10203082859516144, "learning_rate": 7.558854219182285e-06, "loss": 0.4626, "num_input_tokens_seen": 76899808, "step": 63225 }, { "epoch": 7.9225660944743765, "grad_norm": 0.10617335140705109, "learning_rate": 7.55838451000424e-06, "loss": 0.4574, "num_input_tokens_seen": 76906112, "step": 63230 }, { "epoch": 7.92319258238316, "grad_norm": 0.12129524350166321, "learning_rate": 7.557914770238557e-06, "loss": 0.4615, "num_input_tokens_seen": 76912192, "step": 63235 }, { "epoch": 7.923819070291943, "grad_norm": 0.056294117122888565, "learning_rate": 7.557444999890852e-06, "loss": 0.4639, "num_input_tokens_seen": 76918720, "step": 63240 }, { "epoch": 7.924445558200727, "grad_norm": 0.09992820769548416, "learning_rate": 7.556975198966741e-06, "loss": 0.4618, "num_input_tokens_seen": 76924800, "step": 63245 }, { "epoch": 7.92507204610951, "grad_norm": 0.08621703833341599, "learning_rate": 7.556505367471844e-06, "loss": 0.4564, "num_input_tokens_seen": 76930912, "step": 63250 }, { "epoch": 7.925698534018293, "grad_norm": 0.09029098600149155, "learning_rate": 7.556035505411774e-06, "loss": 0.4616, "num_input_tokens_seen": 76937184, "step": 63255 }, { "epoch": 7.926325021927076, "grad_norm": 0.07046601921319962, "learning_rate": 7.5555656127921515e-06, "loss": 0.4623, "num_input_tokens_seen": 76943072, "step": 63260 }, { "epoch": 7.9269515098358605, "grad_norm": 0.09589524567127228, "learning_rate": 7.555095689618592e-06, "loss": 0.4627, "num_input_tokens_seen": 76949312, "step": 63265 }, { "epoch": 7.927577997744644, "grad_norm": 0.09968608617782593, "learning_rate": 7.554625735896717e-06, "loss": 0.4626, "num_input_tokens_seen": 76955424, "step": 63270 }, { "epoch": 7.928204485653427, "grad_norm": 0.08389831334352493, "learning_rate": 7.554155751632142e-06, "loss": 0.4681, "num_input_tokens_seen": 76961760, "step": 63275 }, { "epoch": 7.92883097356221, "grad_norm": 0.1048712506890297, "learning_rate": 7.55368573683049e-06, "loss": 0.469, "num_input_tokens_seen": 76968000, "step": 63280 }, { "epoch": 7.929457461470994, "grad_norm": 0.14430058002471924, "learning_rate": 7.5532156914973754e-06, "loss": 0.4621, "num_input_tokens_seen": 76973728, "step": 63285 }, { "epoch": 7.930083949379777, "grad_norm": 0.16983909904956818, "learning_rate": 7.552745615638422e-06, "loss": 0.4559, "num_input_tokens_seen": 76979680, "step": 63290 }, { "epoch": 7.93071043728856, "grad_norm": 0.0732635036110878, "learning_rate": 7.5522755092592485e-06, "loss": 0.4642, "num_input_tokens_seen": 76985696, "step": 63295 }, { "epoch": 7.931336925197344, "grad_norm": 0.11267922073602676, "learning_rate": 7.551805372365475e-06, "loss": 0.4628, "num_input_tokens_seen": 76991936, "step": 63300 }, { "epoch": 7.931963413106127, "grad_norm": 0.09241660684347153, "learning_rate": 7.551335204962724e-06, "loss": 0.463, "num_input_tokens_seen": 76998112, "step": 63305 }, { "epoch": 7.932589901014911, "grad_norm": 0.10850095748901367, "learning_rate": 7.550865007056615e-06, "loss": 0.4539, "num_input_tokens_seen": 77004128, "step": 63310 }, { "epoch": 7.933216388923694, "grad_norm": 0.12981967628002167, "learning_rate": 7.5503947786527686e-06, "loss": 0.4638, "num_input_tokens_seen": 77010176, "step": 63315 }, { "epoch": 7.933842876832477, "grad_norm": 0.0999298095703125, "learning_rate": 7.54992451975681e-06, "loss": 0.4561, "num_input_tokens_seen": 77016064, "step": 63320 }, { "epoch": 7.93446936474126, "grad_norm": 0.17741355299949646, "learning_rate": 7.54945423037436e-06, "loss": 0.4696, "num_input_tokens_seen": 77022144, "step": 63325 }, { "epoch": 7.9350958526500435, "grad_norm": 0.11172103881835938, "learning_rate": 7.54898391051104e-06, "loss": 0.4653, "num_input_tokens_seen": 77028096, "step": 63330 }, { "epoch": 7.9357223405588275, "grad_norm": 0.11436779052019119, "learning_rate": 7.548513560172474e-06, "loss": 0.4618, "num_input_tokens_seen": 77034496, "step": 63335 }, { "epoch": 7.936348828467611, "grad_norm": 0.08257715404033661, "learning_rate": 7.548043179364287e-06, "loss": 0.4651, "num_input_tokens_seen": 77040448, "step": 63340 }, { "epoch": 7.936975316376394, "grad_norm": 0.0886424109339714, "learning_rate": 7.547572768092102e-06, "loss": 0.4667, "num_input_tokens_seen": 77046656, "step": 63345 }, { "epoch": 7.937601804285177, "grad_norm": 0.09249528497457504, "learning_rate": 7.547102326361542e-06, "loss": 0.4706, "num_input_tokens_seen": 77052704, "step": 63350 }, { "epoch": 7.93822829219396, "grad_norm": 0.10297603905200958, "learning_rate": 7.546631854178232e-06, "loss": 0.4576, "num_input_tokens_seen": 77058848, "step": 63355 }, { "epoch": 7.938854780102744, "grad_norm": 0.0997079536318779, "learning_rate": 7.546161351547798e-06, "loss": 0.4578, "num_input_tokens_seen": 77064832, "step": 63360 }, { "epoch": 7.939481268011527, "grad_norm": 0.060122281312942505, "learning_rate": 7.545690818475864e-06, "loss": 0.4601, "num_input_tokens_seen": 77071040, "step": 63365 }, { "epoch": 7.940107755920311, "grad_norm": 0.08399468660354614, "learning_rate": 7.545220254968054e-06, "loss": 0.4586, "num_input_tokens_seen": 77077280, "step": 63370 }, { "epoch": 7.940734243829094, "grad_norm": 0.10138965398073196, "learning_rate": 7.544749661029997e-06, "loss": 0.4645, "num_input_tokens_seen": 77083296, "step": 63375 }, { "epoch": 7.941360731737878, "grad_norm": 0.12672656774520874, "learning_rate": 7.544279036667319e-06, "loss": 0.4629, "num_input_tokens_seen": 77089472, "step": 63380 }, { "epoch": 7.941987219646661, "grad_norm": 0.13911403715610504, "learning_rate": 7.543808381885647e-06, "loss": 0.4601, "num_input_tokens_seen": 77095488, "step": 63385 }, { "epoch": 7.942613707555444, "grad_norm": 0.09508586674928665, "learning_rate": 7.543337696690604e-06, "loss": 0.4616, "num_input_tokens_seen": 77101568, "step": 63390 }, { "epoch": 7.943240195464227, "grad_norm": 0.0871652290225029, "learning_rate": 7.542866981087823e-06, "loss": 0.4585, "num_input_tokens_seen": 77107552, "step": 63395 }, { "epoch": 7.943866683373011, "grad_norm": 0.13360455632209778, "learning_rate": 7.54239623508293e-06, "loss": 0.4618, "num_input_tokens_seen": 77113664, "step": 63400 }, { "epoch": 7.9444931712817946, "grad_norm": 0.09007581323385239, "learning_rate": 7.54192545868155e-06, "loss": 0.4656, "num_input_tokens_seen": 77119520, "step": 63405 }, { "epoch": 7.945119659190578, "grad_norm": 0.10793722420930862, "learning_rate": 7.541454651889315e-06, "loss": 0.4601, "num_input_tokens_seen": 77125504, "step": 63410 }, { "epoch": 7.945746147099361, "grad_norm": 0.09848933666944504, "learning_rate": 7.540983814711853e-06, "loss": 0.4564, "num_input_tokens_seen": 77131392, "step": 63415 }, { "epoch": 7.946372635008144, "grad_norm": 0.0806150808930397, "learning_rate": 7.540512947154794e-06, "loss": 0.462, "num_input_tokens_seen": 77136992, "step": 63420 }, { "epoch": 7.946999122916928, "grad_norm": 0.10636311024427414, "learning_rate": 7.540042049223766e-06, "loss": 0.4608, "num_input_tokens_seen": 77143200, "step": 63425 }, { "epoch": 7.947625610825711, "grad_norm": 0.15899749100208282, "learning_rate": 7.539571120924399e-06, "loss": 0.4758, "num_input_tokens_seen": 77149440, "step": 63430 }, { "epoch": 7.9482520987344945, "grad_norm": 0.09526209533214569, "learning_rate": 7.539100162262325e-06, "loss": 0.4676, "num_input_tokens_seen": 77155520, "step": 63435 }, { "epoch": 7.948878586643278, "grad_norm": 0.10764390975236893, "learning_rate": 7.538629173243175e-06, "loss": 0.4599, "num_input_tokens_seen": 77161600, "step": 63440 }, { "epoch": 7.949505074552061, "grad_norm": 0.08665154874324799, "learning_rate": 7.538158153872577e-06, "loss": 0.461, "num_input_tokens_seen": 77167936, "step": 63445 }, { "epoch": 7.950131562460845, "grad_norm": 0.1185559406876564, "learning_rate": 7.537687104156165e-06, "loss": 0.4581, "num_input_tokens_seen": 77174208, "step": 63450 }, { "epoch": 7.950758050369628, "grad_norm": 0.05685480684041977, "learning_rate": 7.53721602409957e-06, "loss": 0.4637, "num_input_tokens_seen": 77180096, "step": 63455 }, { "epoch": 7.951384538278411, "grad_norm": 0.17315298318862915, "learning_rate": 7.536744913708423e-06, "loss": 0.4656, "num_input_tokens_seen": 77186560, "step": 63460 }, { "epoch": 7.952011026187194, "grad_norm": 0.08709219098091125, "learning_rate": 7.53627377298836e-06, "loss": 0.4569, "num_input_tokens_seen": 77192960, "step": 63465 }, { "epoch": 7.9526375140959775, "grad_norm": 0.1265794038772583, "learning_rate": 7.5358026019450104e-06, "loss": 0.464, "num_input_tokens_seen": 77199200, "step": 63470 }, { "epoch": 7.953264002004762, "grad_norm": 0.11697771400213242, "learning_rate": 7.535331400584007e-06, "loss": 0.4646, "num_input_tokens_seen": 77205056, "step": 63475 }, { "epoch": 7.953890489913545, "grad_norm": 0.1307447850704193, "learning_rate": 7.534860168910988e-06, "loss": 0.4684, "num_input_tokens_seen": 77211264, "step": 63480 }, { "epoch": 7.954516977822328, "grad_norm": 0.1147724837064743, "learning_rate": 7.534388906931584e-06, "loss": 0.4651, "num_input_tokens_seen": 77216928, "step": 63485 }, { "epoch": 7.955143465731111, "grad_norm": 0.10578035563230515, "learning_rate": 7.533917614651429e-06, "loss": 0.4678, "num_input_tokens_seen": 77223264, "step": 63490 }, { "epoch": 7.955769953639894, "grad_norm": 0.05393853411078453, "learning_rate": 7.533446292076159e-06, "loss": 0.4678, "num_input_tokens_seen": 77229440, "step": 63495 }, { "epoch": 7.956396441548678, "grad_norm": 0.1456054300069809, "learning_rate": 7.532974939211408e-06, "loss": 0.4632, "num_input_tokens_seen": 77235680, "step": 63500 }, { "epoch": 7.9570229294574615, "grad_norm": 0.08337954431772232, "learning_rate": 7.5325035560628135e-06, "loss": 0.4626, "num_input_tokens_seen": 77241952, "step": 63505 }, { "epoch": 7.957649417366245, "grad_norm": 0.0903739482164383, "learning_rate": 7.532032142636009e-06, "loss": 0.4602, "num_input_tokens_seen": 77248480, "step": 63510 }, { "epoch": 7.958275905275028, "grad_norm": 0.07700450718402863, "learning_rate": 7.531560698936631e-06, "loss": 0.4604, "num_input_tokens_seen": 77254432, "step": 63515 }, { "epoch": 7.958902393183812, "grad_norm": 0.07871771603822708, "learning_rate": 7.531089224970316e-06, "loss": 0.4626, "num_input_tokens_seen": 77260672, "step": 63520 }, { "epoch": 7.959528881092595, "grad_norm": 0.08539921045303345, "learning_rate": 7.530617720742701e-06, "loss": 0.4615, "num_input_tokens_seen": 77267360, "step": 63525 }, { "epoch": 7.960155369001378, "grad_norm": 0.1339213103055954, "learning_rate": 7.530146186259424e-06, "loss": 0.4621, "num_input_tokens_seen": 77273152, "step": 63530 }, { "epoch": 7.960781856910161, "grad_norm": 0.08569081872701645, "learning_rate": 7.529674621526123e-06, "loss": 0.4657, "num_input_tokens_seen": 77279040, "step": 63535 }, { "epoch": 7.961408344818945, "grad_norm": 0.12320411205291748, "learning_rate": 7.529203026548435e-06, "loss": 0.4611, "num_input_tokens_seen": 77285120, "step": 63540 }, { "epoch": 7.962034832727729, "grad_norm": 0.0897553339600563, "learning_rate": 7.528731401331998e-06, "loss": 0.4608, "num_input_tokens_seen": 77291552, "step": 63545 }, { "epoch": 7.962661320636512, "grad_norm": 0.11628085374832153, "learning_rate": 7.528259745882451e-06, "loss": 0.4604, "num_input_tokens_seen": 77297440, "step": 63550 }, { "epoch": 7.963287808545295, "grad_norm": 0.1283951848745346, "learning_rate": 7.527788060205433e-06, "loss": 0.4594, "num_input_tokens_seen": 77303488, "step": 63555 }, { "epoch": 7.963914296454078, "grad_norm": 0.10161392390727997, "learning_rate": 7.5273163443065835e-06, "loss": 0.4615, "num_input_tokens_seen": 77309600, "step": 63560 }, { "epoch": 7.964540784362862, "grad_norm": 0.07819794863462448, "learning_rate": 7.526844598191543e-06, "loss": 0.4631, "num_input_tokens_seen": 77315136, "step": 63565 }, { "epoch": 7.965167272271645, "grad_norm": 0.12354512512683868, "learning_rate": 7.526372821865951e-06, "loss": 0.4649, "num_input_tokens_seen": 77321248, "step": 63570 }, { "epoch": 7.9657937601804285, "grad_norm": 0.07792530208826065, "learning_rate": 7.5259010153354464e-06, "loss": 0.4669, "num_input_tokens_seen": 77327104, "step": 63575 }, { "epoch": 7.966420248089212, "grad_norm": 0.08459430187940598, "learning_rate": 7.52542917860567e-06, "loss": 0.4564, "num_input_tokens_seen": 77333120, "step": 63580 }, { "epoch": 7.967046735997995, "grad_norm": 0.08407606184482574, "learning_rate": 7.524957311682267e-06, "loss": 0.4588, "num_input_tokens_seen": 77339232, "step": 63585 }, { "epoch": 7.967673223906779, "grad_norm": 0.11023490130901337, "learning_rate": 7.524485414570877e-06, "loss": 0.4593, "num_input_tokens_seen": 77344672, "step": 63590 }, { "epoch": 7.968299711815562, "grad_norm": 0.09724760055541992, "learning_rate": 7.52401348727714e-06, "loss": 0.4613, "num_input_tokens_seen": 77350432, "step": 63595 }, { "epoch": 7.968926199724345, "grad_norm": 0.10498089343309402, "learning_rate": 7.523541529806702e-06, "loss": 0.4683, "num_input_tokens_seen": 77356352, "step": 63600 }, { "epoch": 7.969552687633128, "grad_norm": 0.11922762542963028, "learning_rate": 7.5230695421652e-06, "loss": 0.4689, "num_input_tokens_seen": 77362688, "step": 63605 }, { "epoch": 7.970179175541912, "grad_norm": 0.06537401676177979, "learning_rate": 7.522597524358283e-06, "loss": 0.4603, "num_input_tokens_seen": 77368544, "step": 63610 }, { "epoch": 7.970805663450696, "grad_norm": 0.07013458013534546, "learning_rate": 7.522125476391591e-06, "loss": 0.4685, "num_input_tokens_seen": 77373824, "step": 63615 }, { "epoch": 7.971432151359479, "grad_norm": 0.11007484048604965, "learning_rate": 7.521653398270768e-06, "loss": 0.4651, "num_input_tokens_seen": 77379968, "step": 63620 }, { "epoch": 7.972058639268262, "grad_norm": 0.09836341440677643, "learning_rate": 7.521181290001458e-06, "loss": 0.4528, "num_input_tokens_seen": 77386080, "step": 63625 }, { "epoch": 7.972685127177045, "grad_norm": 0.09604596346616745, "learning_rate": 7.520709151589306e-06, "loss": 0.4707, "num_input_tokens_seen": 77391968, "step": 63630 }, { "epoch": 7.973311615085829, "grad_norm": 0.08269397914409637, "learning_rate": 7.520236983039957e-06, "loss": 0.4569, "num_input_tokens_seen": 77398048, "step": 63635 }, { "epoch": 7.973938102994612, "grad_norm": 0.09554320573806763, "learning_rate": 7.519764784359056e-06, "loss": 0.4619, "num_input_tokens_seen": 77403968, "step": 63640 }, { "epoch": 7.9745645909033955, "grad_norm": 0.09585420787334442, "learning_rate": 7.519292555552249e-06, "loss": 0.4647, "num_input_tokens_seen": 77410240, "step": 63645 }, { "epoch": 7.975191078812179, "grad_norm": 0.0717359334230423, "learning_rate": 7.518820296625182e-06, "loss": 0.4577, "num_input_tokens_seen": 77416544, "step": 63650 }, { "epoch": 7.975817566720963, "grad_norm": 0.08144446462392807, "learning_rate": 7.5183480075835005e-06, "loss": 0.464, "num_input_tokens_seen": 77422496, "step": 63655 }, { "epoch": 7.976444054629746, "grad_norm": 0.09511195123195648, "learning_rate": 7.5178756884328495e-06, "loss": 0.4649, "num_input_tokens_seen": 77428352, "step": 63660 }, { "epoch": 7.977070542538529, "grad_norm": 0.05699765682220459, "learning_rate": 7.5174033391788795e-06, "loss": 0.4686, "num_input_tokens_seen": 77434208, "step": 63665 }, { "epoch": 7.977697030447312, "grad_norm": 0.08194377273321152, "learning_rate": 7.516930959827235e-06, "loss": 0.4612, "num_input_tokens_seen": 77440448, "step": 63670 }, { "epoch": 7.978323518356095, "grad_norm": 0.08658695220947266, "learning_rate": 7.516458550383565e-06, "loss": 0.4602, "num_input_tokens_seen": 77446656, "step": 63675 }, { "epoch": 7.9789500062648795, "grad_norm": 0.09325825423002243, "learning_rate": 7.515986110853517e-06, "loss": 0.4556, "num_input_tokens_seen": 77452640, "step": 63680 }, { "epoch": 7.979576494173663, "grad_norm": 0.14112907648086548, "learning_rate": 7.515513641242741e-06, "loss": 0.4586, "num_input_tokens_seen": 77458784, "step": 63685 }, { "epoch": 7.980202982082446, "grad_norm": 0.107105553150177, "learning_rate": 7.5150411415568845e-06, "loss": 0.4599, "num_input_tokens_seen": 77464960, "step": 63690 }, { "epoch": 7.980829469991229, "grad_norm": 0.09836199134588242, "learning_rate": 7.514568611801596e-06, "loss": 0.4685, "num_input_tokens_seen": 77470464, "step": 63695 }, { "epoch": 7.981455957900012, "grad_norm": 0.14033474028110504, "learning_rate": 7.514096051982524e-06, "loss": 0.4625, "num_input_tokens_seen": 77476896, "step": 63700 }, { "epoch": 7.982082445808796, "grad_norm": 0.09146271646022797, "learning_rate": 7.513623462105322e-06, "loss": 0.4584, "num_input_tokens_seen": 77483008, "step": 63705 }, { "epoch": 7.982708933717579, "grad_norm": 0.10523657500743866, "learning_rate": 7.513150842175637e-06, "loss": 0.4633, "num_input_tokens_seen": 77488896, "step": 63710 }, { "epoch": 7.983335421626363, "grad_norm": 0.11968490481376648, "learning_rate": 7.512678192199122e-06, "loss": 0.4622, "num_input_tokens_seen": 77495360, "step": 63715 }, { "epoch": 7.983961909535146, "grad_norm": 0.10967190563678741, "learning_rate": 7.5122055121814255e-06, "loss": 0.4635, "num_input_tokens_seen": 77501600, "step": 63720 }, { "epoch": 7.984588397443929, "grad_norm": 0.0896228775382042, "learning_rate": 7.511732802128201e-06, "loss": 0.4682, "num_input_tokens_seen": 77507680, "step": 63725 }, { "epoch": 7.985214885352713, "grad_norm": 0.07154729962348938, "learning_rate": 7.511260062045098e-06, "loss": 0.4596, "num_input_tokens_seen": 77513376, "step": 63730 }, { "epoch": 7.985841373261496, "grad_norm": 0.08741946518421173, "learning_rate": 7.51078729193777e-06, "loss": 0.4577, "num_input_tokens_seen": 77519840, "step": 63735 }, { "epoch": 7.986467861170279, "grad_norm": 0.10092175751924515, "learning_rate": 7.51031449181187e-06, "loss": 0.4582, "num_input_tokens_seen": 77525792, "step": 63740 }, { "epoch": 7.9870943490790625, "grad_norm": 0.10312414914369583, "learning_rate": 7.509841661673049e-06, "loss": 0.4661, "num_input_tokens_seen": 77531968, "step": 63745 }, { "epoch": 7.9877208369878465, "grad_norm": 0.08147472143173218, "learning_rate": 7.50936880152696e-06, "loss": 0.4632, "num_input_tokens_seen": 77538080, "step": 63750 }, { "epoch": 7.98834732489663, "grad_norm": 0.09579699486494064, "learning_rate": 7.5088959113792585e-06, "loss": 0.4678, "num_input_tokens_seen": 77543680, "step": 63755 }, { "epoch": 7.988973812805413, "grad_norm": 0.08261869847774506, "learning_rate": 7.5084229912355955e-06, "loss": 0.4643, "num_input_tokens_seen": 77549920, "step": 63760 }, { "epoch": 7.989600300714196, "grad_norm": 0.1212584525346756, "learning_rate": 7.507950041101628e-06, "loss": 0.4557, "num_input_tokens_seen": 77555808, "step": 63765 }, { "epoch": 7.990226788622979, "grad_norm": 0.06595972180366516, "learning_rate": 7.507477060983008e-06, "loss": 0.4633, "num_input_tokens_seen": 77561600, "step": 63770 }, { "epoch": 7.990853276531763, "grad_norm": 0.13980594277381897, "learning_rate": 7.507004050885393e-06, "loss": 0.4684, "num_input_tokens_seen": 77567584, "step": 63775 }, { "epoch": 7.991479764440546, "grad_norm": 0.12202242016792297, "learning_rate": 7.506531010814435e-06, "loss": 0.4644, "num_input_tokens_seen": 77573728, "step": 63780 }, { "epoch": 7.99210625234933, "grad_norm": 0.11430676281452179, "learning_rate": 7.506057940775793e-06, "loss": 0.4725, "num_input_tokens_seen": 77580064, "step": 63785 }, { "epoch": 7.992732740258113, "grad_norm": 0.08413047343492508, "learning_rate": 7.50558484077512e-06, "loss": 0.4609, "num_input_tokens_seen": 77585472, "step": 63790 }, { "epoch": 7.993359228166897, "grad_norm": 0.06749647855758667, "learning_rate": 7.505111710818075e-06, "loss": 0.4616, "num_input_tokens_seen": 77591136, "step": 63795 }, { "epoch": 7.99398571607568, "grad_norm": 0.18296127021312714, "learning_rate": 7.504638550910313e-06, "loss": 0.4558, "num_input_tokens_seen": 77597280, "step": 63800 }, { "epoch": 7.994612203984463, "grad_norm": 0.09253484010696411, "learning_rate": 7.504165361057492e-06, "loss": 0.4571, "num_input_tokens_seen": 77603488, "step": 63805 }, { "epoch": 7.995238691893246, "grad_norm": 0.05515831708908081, "learning_rate": 7.503692141265268e-06, "loss": 0.4558, "num_input_tokens_seen": 77609664, "step": 63810 }, { "epoch": 7.9958651798020295, "grad_norm": 0.09027861803770065, "learning_rate": 7.503218891539299e-06, "loss": 0.4663, "num_input_tokens_seen": 77615424, "step": 63815 }, { "epoch": 7.9964916677108135, "grad_norm": 0.10585930943489075, "learning_rate": 7.502745611885244e-06, "loss": 0.4673, "num_input_tokens_seen": 77621536, "step": 63820 }, { "epoch": 7.997118155619597, "grad_norm": 0.11736714094877243, "learning_rate": 7.5022723023087594e-06, "loss": 0.4664, "num_input_tokens_seen": 77627776, "step": 63825 }, { "epoch": 7.99774464352838, "grad_norm": 0.11349696666002274, "learning_rate": 7.501798962815507e-06, "loss": 0.4695, "num_input_tokens_seen": 77633760, "step": 63830 }, { "epoch": 7.998371131437163, "grad_norm": 0.08585537225008011, "learning_rate": 7.501325593411146e-06, "loss": 0.4554, "num_input_tokens_seen": 77640128, "step": 63835 }, { "epoch": 7.998997619345946, "grad_norm": 0.09363515675067902, "learning_rate": 7.500852194101331e-06, "loss": 0.4615, "num_input_tokens_seen": 77646624, "step": 63840 }, { "epoch": 7.99962410725473, "grad_norm": 0.12103047221899033, "learning_rate": 7.500378764891729e-06, "loss": 0.4619, "num_input_tokens_seen": 77652608, "step": 63845 }, { "epoch": 8.0, "eval_loss": 0.46283775568008423, "eval_runtime": 222.8496, "eval_samples_per_second": 35.813, "eval_steps_per_second": 8.957, "num_input_tokens_seen": 77656576, "step": 63848 }, { "epoch": 8.000250595163513, "grad_norm": 0.0879310816526413, "learning_rate": 7.499905305787996e-06, "loss": 0.4674, "num_input_tokens_seen": 77659136, "step": 63850 }, { "epoch": 8.000877083072297, "grad_norm": 0.08741746842861176, "learning_rate": 7.4994318167957925e-06, "loss": 0.4636, "num_input_tokens_seen": 77665088, "step": 63855 }, { "epoch": 8.00150357098108, "grad_norm": 0.12296342104673386, "learning_rate": 7.498958297920779e-06, "loss": 0.47, "num_input_tokens_seen": 77671584, "step": 63860 }, { "epoch": 8.002130058889863, "grad_norm": 0.1175645962357521, "learning_rate": 7.49848474916862e-06, "loss": 0.4628, "num_input_tokens_seen": 77677344, "step": 63865 }, { "epoch": 8.002756546798647, "grad_norm": 0.0973348319530487, "learning_rate": 7.498011170544974e-06, "loss": 0.4626, "num_input_tokens_seen": 77683264, "step": 63870 }, { "epoch": 8.00338303470743, "grad_norm": 0.08826335519552231, "learning_rate": 7.4975375620555055e-06, "loss": 0.4541, "num_input_tokens_seen": 77688480, "step": 63875 }, { "epoch": 8.004009522616213, "grad_norm": 0.1405288428068161, "learning_rate": 7.497063923705876e-06, "loss": 0.461, "num_input_tokens_seen": 77694528, "step": 63880 }, { "epoch": 8.004636010524997, "grad_norm": 0.08011285960674286, "learning_rate": 7.496590255501745e-06, "loss": 0.4653, "num_input_tokens_seen": 77700448, "step": 63885 }, { "epoch": 8.00526249843378, "grad_norm": 0.055900949984788895, "learning_rate": 7.49611655744878e-06, "loss": 0.4612, "num_input_tokens_seen": 77706080, "step": 63890 }, { "epoch": 8.005888986342564, "grad_norm": 0.11618086695671082, "learning_rate": 7.4956428295526425e-06, "loss": 0.4679, "num_input_tokens_seen": 77712320, "step": 63895 }, { "epoch": 8.006515474251348, "grad_norm": 0.12004445493221283, "learning_rate": 7.495169071818999e-06, "loss": 0.4673, "num_input_tokens_seen": 77718304, "step": 63900 }, { "epoch": 8.00714196216013, "grad_norm": 0.07853001356124878, "learning_rate": 7.494695284253509e-06, "loss": 0.4686, "num_input_tokens_seen": 77724192, "step": 63905 }, { "epoch": 8.007768450068914, "grad_norm": 0.07711216062307358, "learning_rate": 7.494221466861841e-06, "loss": 0.4586, "num_input_tokens_seen": 77729856, "step": 63910 }, { "epoch": 8.008394937977696, "grad_norm": 0.11098067462444305, "learning_rate": 7.493747619649658e-06, "loss": 0.4625, "num_input_tokens_seen": 77735904, "step": 63915 }, { "epoch": 8.00902142588648, "grad_norm": 0.10252240300178528, "learning_rate": 7.493273742622626e-06, "loss": 0.4622, "num_input_tokens_seen": 77742048, "step": 63920 }, { "epoch": 8.009647913795265, "grad_norm": 0.1007239893078804, "learning_rate": 7.492799835786408e-06, "loss": 0.4621, "num_input_tokens_seen": 77747968, "step": 63925 }, { "epoch": 8.010274401704047, "grad_norm": 0.10164223611354828, "learning_rate": 7.492325899146675e-06, "loss": 0.46, "num_input_tokens_seen": 77754048, "step": 63930 }, { "epoch": 8.01090088961283, "grad_norm": 0.07353407889604568, "learning_rate": 7.491851932709088e-06, "loss": 0.4692, "num_input_tokens_seen": 77759968, "step": 63935 }, { "epoch": 8.011527377521613, "grad_norm": 0.07955818623304367, "learning_rate": 7.491377936479317e-06, "loss": 0.4597, "num_input_tokens_seen": 77766048, "step": 63940 }, { "epoch": 8.012153865430397, "grad_norm": 0.0835254117846489, "learning_rate": 7.490903910463029e-06, "loss": 0.4602, "num_input_tokens_seen": 77772352, "step": 63945 }, { "epoch": 8.012780353339181, "grad_norm": 0.11342527717351913, "learning_rate": 7.490429854665889e-06, "loss": 0.4667, "num_input_tokens_seen": 77778560, "step": 63950 }, { "epoch": 8.013406841247964, "grad_norm": 0.1394287496805191, "learning_rate": 7.489955769093567e-06, "loss": 0.4636, "num_input_tokens_seen": 77784704, "step": 63955 }, { "epoch": 8.014033329156748, "grad_norm": 0.07035324722528458, "learning_rate": 7.48948165375173e-06, "loss": 0.4628, "num_input_tokens_seen": 77790624, "step": 63960 }, { "epoch": 8.01465981706553, "grad_norm": 0.09017923474311829, "learning_rate": 7.489007508646048e-06, "loss": 0.453, "num_input_tokens_seen": 77796416, "step": 63965 }, { "epoch": 8.015286304974314, "grad_norm": 0.1102895438671112, "learning_rate": 7.4885333337821876e-06, "loss": 0.4644, "num_input_tokens_seen": 77802592, "step": 63970 }, { "epoch": 8.015912792883098, "grad_norm": 0.07926689088344574, "learning_rate": 7.488059129165819e-06, "loss": 0.4678, "num_input_tokens_seen": 77808576, "step": 63975 }, { "epoch": 8.01653928079188, "grad_norm": 0.08763467520475388, "learning_rate": 7.487584894802612e-06, "loss": 0.4598, "num_input_tokens_seen": 77814496, "step": 63980 }, { "epoch": 8.017165768700664, "grad_norm": 0.09374406933784485, "learning_rate": 7.487110630698235e-06, "loss": 0.4681, "num_input_tokens_seen": 77820640, "step": 63985 }, { "epoch": 8.017792256609447, "grad_norm": 0.10027223825454712, "learning_rate": 7.4866363368583604e-06, "loss": 0.464, "num_input_tokens_seen": 77826368, "step": 63990 }, { "epoch": 8.01841874451823, "grad_norm": 0.08527102321386337, "learning_rate": 7.486162013288658e-06, "loss": 0.4686, "num_input_tokens_seen": 77832032, "step": 63995 }, { "epoch": 8.019045232427015, "grad_norm": 0.08894000947475433, "learning_rate": 7.485687659994798e-06, "loss": 0.4605, "num_input_tokens_seen": 77837952, "step": 64000 }, { "epoch": 8.019671720335797, "grad_norm": 0.09111086279153824, "learning_rate": 7.4852132769824526e-06, "loss": 0.4617, "num_input_tokens_seen": 77844448, "step": 64005 }, { "epoch": 8.020298208244581, "grad_norm": 0.13413777947425842, "learning_rate": 7.484738864257291e-06, "loss": 0.4644, "num_input_tokens_seen": 77850304, "step": 64010 }, { "epoch": 8.020924696153365, "grad_norm": 0.0864495113492012, "learning_rate": 7.484264421824989e-06, "loss": 0.464, "num_input_tokens_seen": 77855968, "step": 64015 }, { "epoch": 8.021551184062147, "grad_norm": 0.1226053461432457, "learning_rate": 7.483789949691218e-06, "loss": 0.4481, "num_input_tokens_seen": 77861504, "step": 64020 }, { "epoch": 8.022177671970931, "grad_norm": 0.08885057270526886, "learning_rate": 7.48331544786165e-06, "loss": 0.4721, "num_input_tokens_seen": 77867136, "step": 64025 }, { "epoch": 8.022804159879714, "grad_norm": 0.0810513123869896, "learning_rate": 7.482840916341956e-06, "loss": 0.4653, "num_input_tokens_seen": 77873408, "step": 64030 }, { "epoch": 8.023430647788498, "grad_norm": 0.09422767907381058, "learning_rate": 7.482366355137814e-06, "loss": 0.4655, "num_input_tokens_seen": 77879392, "step": 64035 }, { "epoch": 8.024057135697282, "grad_norm": 0.09216158092021942, "learning_rate": 7.481891764254891e-06, "loss": 0.4642, "num_input_tokens_seen": 77885376, "step": 64040 }, { "epoch": 8.024683623606064, "grad_norm": 0.07636173814535141, "learning_rate": 7.481417143698868e-06, "loss": 0.4631, "num_input_tokens_seen": 77891584, "step": 64045 }, { "epoch": 8.025310111514848, "grad_norm": 0.10862628370523453, "learning_rate": 7.480942493475417e-06, "loss": 0.4615, "num_input_tokens_seen": 77898016, "step": 64050 }, { "epoch": 8.02593659942363, "grad_norm": 0.08404584974050522, "learning_rate": 7.480467813590213e-06, "loss": 0.4644, "num_input_tokens_seen": 77903328, "step": 64055 }, { "epoch": 8.026563087332415, "grad_norm": 0.09960541874170303, "learning_rate": 7.4799931040489305e-06, "loss": 0.4674, "num_input_tokens_seen": 77909152, "step": 64060 }, { "epoch": 8.027189575241199, "grad_norm": 0.07221730798482895, "learning_rate": 7.479518364857244e-06, "loss": 0.4649, "num_input_tokens_seen": 77915200, "step": 64065 }, { "epoch": 8.02781606314998, "grad_norm": 0.10728490352630615, "learning_rate": 7.479043596020832e-06, "loss": 0.4587, "num_input_tokens_seen": 77921408, "step": 64070 }, { "epoch": 8.028442551058765, "grad_norm": 0.06975813210010529, "learning_rate": 7.478568797545369e-06, "loss": 0.459, "num_input_tokens_seen": 77927008, "step": 64075 }, { "epoch": 8.029069038967547, "grad_norm": 0.07757347822189331, "learning_rate": 7.478093969436533e-06, "loss": 0.4564, "num_input_tokens_seen": 77933376, "step": 64080 }, { "epoch": 8.029695526876331, "grad_norm": 0.07609392702579498, "learning_rate": 7.477619111699999e-06, "loss": 0.4571, "num_input_tokens_seen": 77939584, "step": 64085 }, { "epoch": 8.030322014785115, "grad_norm": 0.11617767065763474, "learning_rate": 7.477144224341446e-06, "loss": 0.4597, "num_input_tokens_seen": 77945664, "step": 64090 }, { "epoch": 8.030948502693898, "grad_norm": 0.08212417364120483, "learning_rate": 7.476669307366551e-06, "loss": 0.4509, "num_input_tokens_seen": 77951712, "step": 64095 }, { "epoch": 8.031574990602682, "grad_norm": 0.08506669849157333, "learning_rate": 7.476194360780992e-06, "loss": 0.4633, "num_input_tokens_seen": 77958560, "step": 64100 }, { "epoch": 8.032201478511464, "grad_norm": 0.12791642546653748, "learning_rate": 7.4757193845904475e-06, "loss": 0.4608, "num_input_tokens_seen": 77964256, "step": 64105 }, { "epoch": 8.032827966420248, "grad_norm": 0.08409236371517181, "learning_rate": 7.475244378800597e-06, "loss": 0.4644, "num_input_tokens_seen": 77969632, "step": 64110 }, { "epoch": 8.033454454329032, "grad_norm": 0.11222118139266968, "learning_rate": 7.474769343417119e-06, "loss": 0.4505, "num_input_tokens_seen": 77975712, "step": 64115 }, { "epoch": 8.034080942237814, "grad_norm": 0.076422318816185, "learning_rate": 7.474294278445692e-06, "loss": 0.4628, "num_input_tokens_seen": 77981600, "step": 64120 }, { "epoch": 8.034707430146598, "grad_norm": 0.0826525017619133, "learning_rate": 7.473819183891997e-06, "loss": 0.465, "num_input_tokens_seen": 77987520, "step": 64125 }, { "epoch": 8.03533391805538, "grad_norm": 0.08104977756738663, "learning_rate": 7.473344059761714e-06, "loss": 0.4715, "num_input_tokens_seen": 77993632, "step": 64130 }, { "epoch": 8.035960405964165, "grad_norm": 0.0893588438630104, "learning_rate": 7.472868906060522e-06, "loss": 0.4663, "num_input_tokens_seen": 77999584, "step": 64135 }, { "epoch": 8.036586893872949, "grad_norm": 0.12655647099018097, "learning_rate": 7.472393722794105e-06, "loss": 0.4606, "num_input_tokens_seen": 78005792, "step": 64140 }, { "epoch": 8.037213381781731, "grad_norm": 0.08862277865409851, "learning_rate": 7.471918509968141e-06, "loss": 0.4645, "num_input_tokens_seen": 78011936, "step": 64145 }, { "epoch": 8.037839869690515, "grad_norm": 0.0824824869632721, "learning_rate": 7.471443267588313e-06, "loss": 0.4564, "num_input_tokens_seen": 78018208, "step": 64150 }, { "epoch": 8.0384663575993, "grad_norm": 0.05532671883702278, "learning_rate": 7.470967995660303e-06, "loss": 0.4556, "num_input_tokens_seen": 78024160, "step": 64155 }, { "epoch": 8.039092845508081, "grad_norm": 0.061353832483291626, "learning_rate": 7.4704926941897945e-06, "loss": 0.4659, "num_input_tokens_seen": 78030208, "step": 64160 }, { "epoch": 8.039719333416866, "grad_norm": 0.08007873594760895, "learning_rate": 7.470017363182468e-06, "loss": 0.4667, "num_input_tokens_seen": 78036256, "step": 64165 }, { "epoch": 8.040345821325648, "grad_norm": 0.09330739080905914, "learning_rate": 7.469542002644007e-06, "loss": 0.4617, "num_input_tokens_seen": 78042368, "step": 64170 }, { "epoch": 8.040972309234432, "grad_norm": 0.07323654741048813, "learning_rate": 7.4690666125800945e-06, "loss": 0.4568, "num_input_tokens_seen": 78048576, "step": 64175 }, { "epoch": 8.041598797143216, "grad_norm": 0.13900133967399597, "learning_rate": 7.468591192996416e-06, "loss": 0.4533, "num_input_tokens_seen": 78054720, "step": 64180 }, { "epoch": 8.042225285051998, "grad_norm": 0.08498460799455643, "learning_rate": 7.468115743898654e-06, "loss": 0.4614, "num_input_tokens_seen": 78060384, "step": 64185 }, { "epoch": 8.042851772960782, "grad_norm": 0.13037706911563873, "learning_rate": 7.4676402652924925e-06, "loss": 0.4645, "num_input_tokens_seen": 78066368, "step": 64190 }, { "epoch": 8.043478260869565, "grad_norm": 0.09535431116819382, "learning_rate": 7.467164757183617e-06, "loss": 0.4666, "num_input_tokens_seen": 78072608, "step": 64195 }, { "epoch": 8.044104748778349, "grad_norm": 0.09815163910388947, "learning_rate": 7.466689219577712e-06, "loss": 0.4627, "num_input_tokens_seen": 78078752, "step": 64200 }, { "epoch": 8.044731236687133, "grad_norm": 0.08181115239858627, "learning_rate": 7.466213652480464e-06, "loss": 0.4614, "num_input_tokens_seen": 78084512, "step": 64205 }, { "epoch": 8.045357724595915, "grad_norm": 0.11799880862236023, "learning_rate": 7.465738055897559e-06, "loss": 0.4607, "num_input_tokens_seen": 78090880, "step": 64210 }, { "epoch": 8.045984212504699, "grad_norm": 0.07845933735370636, "learning_rate": 7.4652624298346835e-06, "loss": 0.4602, "num_input_tokens_seen": 78097088, "step": 64215 }, { "epoch": 8.046610700413481, "grad_norm": 0.0857359915971756, "learning_rate": 7.464786774297521e-06, "loss": 0.4603, "num_input_tokens_seen": 78103168, "step": 64220 }, { "epoch": 8.047237188322265, "grad_norm": 0.09153059870004654, "learning_rate": 7.46431108929176e-06, "loss": 0.4578, "num_input_tokens_seen": 78109312, "step": 64225 }, { "epoch": 8.04786367623105, "grad_norm": 0.09873766452074051, "learning_rate": 7.463835374823089e-06, "loss": 0.4534, "num_input_tokens_seen": 78115648, "step": 64230 }, { "epoch": 8.048490164139832, "grad_norm": 0.08419904857873917, "learning_rate": 7.463359630897196e-06, "loss": 0.4565, "num_input_tokens_seen": 78121728, "step": 64235 }, { "epoch": 8.049116652048616, "grad_norm": 0.1269003301858902, "learning_rate": 7.462883857519766e-06, "loss": 0.4638, "num_input_tokens_seen": 78128096, "step": 64240 }, { "epoch": 8.049743139957398, "grad_norm": 0.11558801680803299, "learning_rate": 7.462408054696489e-06, "loss": 0.4595, "num_input_tokens_seen": 78134048, "step": 64245 }, { "epoch": 8.050369627866182, "grad_norm": 0.08667409420013428, "learning_rate": 7.461932222433052e-06, "loss": 0.4601, "num_input_tokens_seen": 78139488, "step": 64250 }, { "epoch": 8.050996115774966, "grad_norm": 0.1154337227344513, "learning_rate": 7.461456360735147e-06, "loss": 0.4692, "num_input_tokens_seen": 78145536, "step": 64255 }, { "epoch": 8.051622603683748, "grad_norm": 0.14243333041667938, "learning_rate": 7.460980469608461e-06, "loss": 0.4589, "num_input_tokens_seen": 78151552, "step": 64260 }, { "epoch": 8.052249091592532, "grad_norm": 0.1315954029560089, "learning_rate": 7.460504549058686e-06, "loss": 0.469, "num_input_tokens_seen": 78157600, "step": 64265 }, { "epoch": 8.052875579501316, "grad_norm": 0.14623980224132538, "learning_rate": 7.460028599091509e-06, "loss": 0.4583, "num_input_tokens_seen": 78163744, "step": 64270 }, { "epoch": 8.053502067410099, "grad_norm": 0.09729571640491486, "learning_rate": 7.459552619712622e-06, "loss": 0.4598, "num_input_tokens_seen": 78169984, "step": 64275 }, { "epoch": 8.054128555318883, "grad_norm": 0.08139953762292862, "learning_rate": 7.459076610927715e-06, "loss": 0.4612, "num_input_tokens_seen": 78176160, "step": 64280 }, { "epoch": 8.054755043227665, "grad_norm": 0.13253054022789001, "learning_rate": 7.458600572742481e-06, "loss": 0.4644, "num_input_tokens_seen": 78182240, "step": 64285 }, { "epoch": 8.05538153113645, "grad_norm": 0.09188900887966156, "learning_rate": 7.458124505162611e-06, "loss": 0.458, "num_input_tokens_seen": 78188096, "step": 64290 }, { "epoch": 8.056008019045233, "grad_norm": 0.08349058777093887, "learning_rate": 7.457648408193794e-06, "loss": 0.4616, "num_input_tokens_seen": 78194144, "step": 64295 }, { "epoch": 8.056634506954016, "grad_norm": 0.08422190696001053, "learning_rate": 7.457172281841724e-06, "loss": 0.454, "num_input_tokens_seen": 78200000, "step": 64300 }, { "epoch": 8.0572609948628, "grad_norm": 0.08242208510637283, "learning_rate": 7.456696126112093e-06, "loss": 0.4613, "num_input_tokens_seen": 78206048, "step": 64305 }, { "epoch": 8.057887482771582, "grad_norm": 0.07580627501010895, "learning_rate": 7.456219941010595e-06, "loss": 0.4651, "num_input_tokens_seen": 78212096, "step": 64310 }, { "epoch": 8.058513970680366, "grad_norm": 0.08482713252305984, "learning_rate": 7.455743726542924e-06, "loss": 0.4592, "num_input_tokens_seen": 78218464, "step": 64315 }, { "epoch": 8.05914045858915, "grad_norm": 0.10369974374771118, "learning_rate": 7.45526748271477e-06, "loss": 0.4555, "num_input_tokens_seen": 78224736, "step": 64320 }, { "epoch": 8.059766946497932, "grad_norm": 0.07756403833627701, "learning_rate": 7.454791209531829e-06, "loss": 0.4671, "num_input_tokens_seen": 78231008, "step": 64325 }, { "epoch": 8.060393434406716, "grad_norm": 0.1254393309354782, "learning_rate": 7.454314906999796e-06, "loss": 0.4653, "num_input_tokens_seen": 78237280, "step": 64330 }, { "epoch": 8.061019922315499, "grad_norm": 0.08558658510446548, "learning_rate": 7.453838575124364e-06, "loss": 0.4606, "num_input_tokens_seen": 78243456, "step": 64335 }, { "epoch": 8.061646410224283, "grad_norm": 0.10752761363983154, "learning_rate": 7.45336221391123e-06, "loss": 0.4618, "num_input_tokens_seen": 78249376, "step": 64340 }, { "epoch": 8.062272898133067, "grad_norm": 0.12860126793384552, "learning_rate": 7.452885823366089e-06, "loss": 0.4611, "num_input_tokens_seen": 78254912, "step": 64345 }, { "epoch": 8.062899386041849, "grad_norm": 0.07857824862003326, "learning_rate": 7.452409403494633e-06, "loss": 0.4647, "num_input_tokens_seen": 78261184, "step": 64350 }, { "epoch": 8.063525873950633, "grad_norm": 0.08155357837677002, "learning_rate": 7.451932954302562e-06, "loss": 0.466, "num_input_tokens_seen": 78267040, "step": 64355 }, { "epoch": 8.064152361859415, "grad_norm": 0.09783362597227097, "learning_rate": 7.451456475795571e-06, "loss": 0.465, "num_input_tokens_seen": 78273440, "step": 64360 }, { "epoch": 8.0647788497682, "grad_norm": 0.08922015130519867, "learning_rate": 7.4509799679793575e-06, "loss": 0.4539, "num_input_tokens_seen": 78279616, "step": 64365 }, { "epoch": 8.065405337676983, "grad_norm": 0.08928649872541428, "learning_rate": 7.450503430859617e-06, "loss": 0.4629, "num_input_tokens_seen": 78285600, "step": 64370 }, { "epoch": 8.066031825585766, "grad_norm": 0.0998399406671524, "learning_rate": 7.450026864442048e-06, "loss": 0.4659, "num_input_tokens_seen": 78291712, "step": 64375 }, { "epoch": 8.06665831349455, "grad_norm": 0.07717976719141006, "learning_rate": 7.449550268732347e-06, "loss": 0.4584, "num_input_tokens_seen": 78297920, "step": 64380 }, { "epoch": 8.067284801403332, "grad_norm": 0.09474460780620575, "learning_rate": 7.449073643736213e-06, "loss": 0.4618, "num_input_tokens_seen": 78303968, "step": 64385 }, { "epoch": 8.067911289312116, "grad_norm": 0.11423077434301376, "learning_rate": 7.448596989459347e-06, "loss": 0.4574, "num_input_tokens_seen": 78310304, "step": 64390 }, { "epoch": 8.0685377772209, "grad_norm": 0.08062569797039032, "learning_rate": 7.448120305907443e-06, "loss": 0.4606, "num_input_tokens_seen": 78316320, "step": 64395 }, { "epoch": 8.069164265129682, "grad_norm": 0.09531638771295547, "learning_rate": 7.447643593086203e-06, "loss": 0.465, "num_input_tokens_seen": 78322400, "step": 64400 }, { "epoch": 8.069790753038466, "grad_norm": 0.12802645564079285, "learning_rate": 7.447166851001326e-06, "loss": 0.4634, "num_input_tokens_seen": 78328384, "step": 64405 }, { "epoch": 8.07041724094725, "grad_norm": 0.08967059850692749, "learning_rate": 7.446690079658512e-06, "loss": 0.4632, "num_input_tokens_seen": 78334496, "step": 64410 }, { "epoch": 8.071043728856033, "grad_norm": 0.11389435082674026, "learning_rate": 7.446213279063461e-06, "loss": 0.4636, "num_input_tokens_seen": 78340640, "step": 64415 }, { "epoch": 8.071670216764817, "grad_norm": 0.1372036635875702, "learning_rate": 7.4457364492218745e-06, "loss": 0.4742, "num_input_tokens_seen": 78347008, "step": 64420 }, { "epoch": 8.0722967046736, "grad_norm": 0.12176111340522766, "learning_rate": 7.445259590139451e-06, "loss": 0.4636, "num_input_tokens_seen": 78353184, "step": 64425 }, { "epoch": 8.072923192582383, "grad_norm": 0.08915998041629791, "learning_rate": 7.444782701821894e-06, "loss": 0.4669, "num_input_tokens_seen": 78359136, "step": 64430 }, { "epoch": 8.073549680491167, "grad_norm": 0.08591080456972122, "learning_rate": 7.444305784274904e-06, "loss": 0.4546, "num_input_tokens_seen": 78365088, "step": 64435 }, { "epoch": 8.07417616839995, "grad_norm": 0.08934887498617172, "learning_rate": 7.443828837504184e-06, "loss": 0.464, "num_input_tokens_seen": 78371488, "step": 64440 }, { "epoch": 8.074802656308734, "grad_norm": 0.07902789860963821, "learning_rate": 7.443351861515434e-06, "loss": 0.4583, "num_input_tokens_seen": 78377440, "step": 64445 }, { "epoch": 8.075429144217516, "grad_norm": 0.09885333478450775, "learning_rate": 7.442874856314359e-06, "loss": 0.4612, "num_input_tokens_seen": 78383872, "step": 64450 }, { "epoch": 8.0760556321263, "grad_norm": 0.07943933457136154, "learning_rate": 7.442397821906662e-06, "loss": 0.4627, "num_input_tokens_seen": 78389568, "step": 64455 }, { "epoch": 8.076682120035084, "grad_norm": 0.0801706463098526, "learning_rate": 7.441920758298045e-06, "loss": 0.4641, "num_input_tokens_seen": 78395136, "step": 64460 }, { "epoch": 8.077308607943866, "grad_norm": 0.0724547877907753, "learning_rate": 7.441443665494212e-06, "loss": 0.46, "num_input_tokens_seen": 78401216, "step": 64465 }, { "epoch": 8.07793509585265, "grad_norm": 0.1080319955945015, "learning_rate": 7.440966543500868e-06, "loss": 0.4578, "num_input_tokens_seen": 78407648, "step": 64470 }, { "epoch": 8.078561583761433, "grad_norm": 0.08714190870523453, "learning_rate": 7.440489392323716e-06, "loss": 0.4587, "num_input_tokens_seen": 78413792, "step": 64475 }, { "epoch": 8.079188071670217, "grad_norm": 0.08943726867437363, "learning_rate": 7.440012211968462e-06, "loss": 0.4579, "num_input_tokens_seen": 78419936, "step": 64480 }, { "epoch": 8.079814559579, "grad_norm": 0.08620155602693558, "learning_rate": 7.439535002440812e-06, "loss": 0.4616, "num_input_tokens_seen": 78425856, "step": 64485 }, { "epoch": 8.080441047487783, "grad_norm": 0.08701534569263458, "learning_rate": 7.439057763746468e-06, "loss": 0.4563, "num_input_tokens_seen": 78432384, "step": 64490 }, { "epoch": 8.081067535396567, "grad_norm": 0.08973594754934311, "learning_rate": 7.438580495891139e-06, "loss": 0.4589, "num_input_tokens_seen": 78438144, "step": 64495 }, { "epoch": 8.08169402330535, "grad_norm": 0.10375379025936127, "learning_rate": 7.438103198880527e-06, "loss": 0.4631, "num_input_tokens_seen": 78444256, "step": 64500 }, { "epoch": 8.082320511214133, "grad_norm": 0.11465766280889511, "learning_rate": 7.437625872720344e-06, "loss": 0.4564, "num_input_tokens_seen": 78450208, "step": 64505 }, { "epoch": 8.082946999122917, "grad_norm": 0.06832178682088852, "learning_rate": 7.437148517416294e-06, "loss": 0.4601, "num_input_tokens_seen": 78456576, "step": 64510 }, { "epoch": 8.0835734870317, "grad_norm": 0.07725483179092407, "learning_rate": 7.436671132974084e-06, "loss": 0.4559, "num_input_tokens_seen": 78462944, "step": 64515 }, { "epoch": 8.084199974940484, "grad_norm": 0.11529428511857986, "learning_rate": 7.4361937193994224e-06, "loss": 0.4598, "num_input_tokens_seen": 78468736, "step": 64520 }, { "epoch": 8.084826462849268, "grad_norm": 0.0990622416138649, "learning_rate": 7.435716276698016e-06, "loss": 0.4644, "num_input_tokens_seen": 78474816, "step": 64525 }, { "epoch": 8.08545295075805, "grad_norm": 0.09287438541650772, "learning_rate": 7.4352388048755756e-06, "loss": 0.4634, "num_input_tokens_seen": 78480800, "step": 64530 }, { "epoch": 8.086079438666834, "grad_norm": 0.09091073274612427, "learning_rate": 7.4347613039378055e-06, "loss": 0.4624, "num_input_tokens_seen": 78486752, "step": 64535 }, { "epoch": 8.086705926575616, "grad_norm": 0.09381679445505142, "learning_rate": 7.434283773890418e-06, "loss": 0.4634, "num_input_tokens_seen": 78492896, "step": 64540 }, { "epoch": 8.0873324144844, "grad_norm": 0.08971911668777466, "learning_rate": 7.433806214739121e-06, "loss": 0.4628, "num_input_tokens_seen": 78499232, "step": 64545 }, { "epoch": 8.087958902393185, "grad_norm": 0.09295875579118729, "learning_rate": 7.433328626489626e-06, "loss": 0.4526, "num_input_tokens_seen": 78505312, "step": 64550 }, { "epoch": 8.088585390301967, "grad_norm": 0.09045757353305817, "learning_rate": 7.432851009147639e-06, "loss": 0.4671, "num_input_tokens_seen": 78511552, "step": 64555 }, { "epoch": 8.089211878210751, "grad_norm": 0.094133161008358, "learning_rate": 7.432373362718874e-06, "loss": 0.4629, "num_input_tokens_seen": 78517760, "step": 64560 }, { "epoch": 8.089838366119533, "grad_norm": 0.11723887920379639, "learning_rate": 7.4318956872090385e-06, "loss": 0.4664, "num_input_tokens_seen": 78523936, "step": 64565 }, { "epoch": 8.090464854028317, "grad_norm": 0.10743443667888641, "learning_rate": 7.431417982623849e-06, "loss": 0.4664, "num_input_tokens_seen": 78529344, "step": 64570 }, { "epoch": 8.091091341937101, "grad_norm": 0.07038141787052155, "learning_rate": 7.4309402489690105e-06, "loss": 0.4587, "num_input_tokens_seen": 78534816, "step": 64575 }, { "epoch": 8.091717829845884, "grad_norm": 0.13616400957107544, "learning_rate": 7.43046248625024e-06, "loss": 0.4674, "num_input_tokens_seen": 78540992, "step": 64580 }, { "epoch": 8.092344317754668, "grad_norm": 0.08498214930295944, "learning_rate": 7.429984694473245e-06, "loss": 0.4636, "num_input_tokens_seen": 78546336, "step": 64585 }, { "epoch": 8.09297080566345, "grad_norm": 0.11064320802688599, "learning_rate": 7.42950687364374e-06, "loss": 0.4629, "num_input_tokens_seen": 78552704, "step": 64590 }, { "epoch": 8.093597293572234, "grad_norm": 0.0681803748011589, "learning_rate": 7.429029023767439e-06, "loss": 0.4691, "num_input_tokens_seen": 78558720, "step": 64595 }, { "epoch": 8.094223781481018, "grad_norm": 0.05936724320054054, "learning_rate": 7.428551144850054e-06, "loss": 0.4569, "num_input_tokens_seen": 78564960, "step": 64600 }, { "epoch": 8.0948502693898, "grad_norm": 0.07711012661457062, "learning_rate": 7.4280732368972976e-06, "loss": 0.4572, "num_input_tokens_seen": 78570304, "step": 64605 }, { "epoch": 8.095476757298584, "grad_norm": 0.12151577323675156, "learning_rate": 7.427595299914883e-06, "loss": 0.4604, "num_input_tokens_seen": 78576480, "step": 64610 }, { "epoch": 8.096103245207367, "grad_norm": 0.10728708654642105, "learning_rate": 7.4271173339085255e-06, "loss": 0.4634, "num_input_tokens_seen": 78582560, "step": 64615 }, { "epoch": 8.09672973311615, "grad_norm": 0.12716683745384216, "learning_rate": 7.426639338883941e-06, "loss": 0.4709, "num_input_tokens_seen": 78588672, "step": 64620 }, { "epoch": 8.097356221024935, "grad_norm": 0.107819564640522, "learning_rate": 7.426161314846843e-06, "loss": 0.4547, "num_input_tokens_seen": 78594272, "step": 64625 }, { "epoch": 8.097982708933717, "grad_norm": 0.09063772857189178, "learning_rate": 7.425683261802946e-06, "loss": 0.4585, "num_input_tokens_seen": 78600160, "step": 64630 }, { "epoch": 8.098609196842501, "grad_norm": 0.08832579851150513, "learning_rate": 7.425205179757966e-06, "loss": 0.4661, "num_input_tokens_seen": 78606144, "step": 64635 }, { "epoch": 8.099235684751285, "grad_norm": 0.13818131387233734, "learning_rate": 7.424727068717621e-06, "loss": 0.457, "num_input_tokens_seen": 78612064, "step": 64640 }, { "epoch": 8.099862172660067, "grad_norm": 0.08634031563997269, "learning_rate": 7.424248928687623e-06, "loss": 0.4638, "num_input_tokens_seen": 78617632, "step": 64645 }, { "epoch": 8.100488660568852, "grad_norm": 0.08771169185638428, "learning_rate": 7.423770759673692e-06, "loss": 0.4697, "num_input_tokens_seen": 78623808, "step": 64650 }, { "epoch": 8.101115148477634, "grad_norm": 0.08453328162431717, "learning_rate": 7.423292561681544e-06, "loss": 0.4592, "num_input_tokens_seen": 78629824, "step": 64655 }, { "epoch": 8.101741636386418, "grad_norm": 0.13097496330738068, "learning_rate": 7.422814334716895e-06, "loss": 0.4612, "num_input_tokens_seen": 78636032, "step": 64660 }, { "epoch": 8.102368124295202, "grad_norm": 0.08771113306283951, "learning_rate": 7.4223360787854645e-06, "loss": 0.4695, "num_input_tokens_seen": 78641600, "step": 64665 }, { "epoch": 8.102994612203984, "grad_norm": 0.06386251002550125, "learning_rate": 7.421857793892969e-06, "loss": 0.4605, "num_input_tokens_seen": 78647680, "step": 64670 }, { "epoch": 8.103621100112768, "grad_norm": 0.07969671487808228, "learning_rate": 7.421379480045128e-06, "loss": 0.4586, "num_input_tokens_seen": 78653696, "step": 64675 }, { "epoch": 8.10424758802155, "grad_norm": 0.1274162232875824, "learning_rate": 7.42090113724766e-06, "loss": 0.4682, "num_input_tokens_seen": 78659872, "step": 64680 }, { "epoch": 8.104874075930335, "grad_norm": 0.0805879533290863, "learning_rate": 7.420422765506283e-06, "loss": 0.4649, "num_input_tokens_seen": 78665856, "step": 64685 }, { "epoch": 8.105500563839119, "grad_norm": 0.12668749690055847, "learning_rate": 7.4199443648267155e-06, "loss": 0.4666, "num_input_tokens_seen": 78672192, "step": 64690 }, { "epoch": 8.106127051747901, "grad_norm": 0.11992595344781876, "learning_rate": 7.41946593521468e-06, "loss": 0.4643, "num_input_tokens_seen": 78678208, "step": 64695 }, { "epoch": 8.106753539656685, "grad_norm": 0.07340236008167267, "learning_rate": 7.418987476675895e-06, "loss": 0.4644, "num_input_tokens_seen": 78684288, "step": 64700 }, { "epoch": 8.107380027565467, "grad_norm": 0.05567232891917229, "learning_rate": 7.41850898921608e-06, "loss": 0.4649, "num_input_tokens_seen": 78690208, "step": 64705 }, { "epoch": 8.108006515474251, "grad_norm": 0.08184760063886642, "learning_rate": 7.418030472840958e-06, "loss": 0.4577, "num_input_tokens_seen": 78695872, "step": 64710 }, { "epoch": 8.108633003383035, "grad_norm": 0.08167307078838348, "learning_rate": 7.417551927556248e-06, "loss": 0.459, "num_input_tokens_seen": 78701856, "step": 64715 }, { "epoch": 8.109259491291818, "grad_norm": 0.110237717628479, "learning_rate": 7.417073353367672e-06, "loss": 0.4615, "num_input_tokens_seen": 78708096, "step": 64720 }, { "epoch": 8.109885979200602, "grad_norm": 0.0804649069905281, "learning_rate": 7.4165947502809524e-06, "loss": 0.4607, "num_input_tokens_seen": 78714144, "step": 64725 }, { "epoch": 8.110512467109384, "grad_norm": 0.0815289318561554, "learning_rate": 7.416116118301811e-06, "loss": 0.4616, "num_input_tokens_seen": 78720160, "step": 64730 }, { "epoch": 8.111138955018168, "grad_norm": 0.08519741892814636, "learning_rate": 7.415637457435969e-06, "loss": 0.4626, "num_input_tokens_seen": 78726368, "step": 64735 }, { "epoch": 8.111765442926952, "grad_norm": 0.0863586962223053, "learning_rate": 7.415158767689151e-06, "loss": 0.4651, "num_input_tokens_seen": 78732512, "step": 64740 }, { "epoch": 8.112391930835734, "grad_norm": 0.06678440421819687, "learning_rate": 7.41468004906708e-06, "loss": 0.4598, "num_input_tokens_seen": 78738048, "step": 64745 }, { "epoch": 8.113018418744518, "grad_norm": 0.15667876601219177, "learning_rate": 7.414201301575478e-06, "loss": 0.4648, "num_input_tokens_seen": 78744064, "step": 64750 }, { "epoch": 8.1136449066533, "grad_norm": 0.08621036261320114, "learning_rate": 7.413722525220069e-06, "loss": 0.4601, "num_input_tokens_seen": 78750080, "step": 64755 }, { "epoch": 8.114271394562085, "grad_norm": 0.08903170377016068, "learning_rate": 7.413243720006579e-06, "loss": 0.4606, "num_input_tokens_seen": 78755840, "step": 64760 }, { "epoch": 8.114897882470869, "grad_norm": 0.08600204437971115, "learning_rate": 7.412764885940731e-06, "loss": 0.4598, "num_input_tokens_seen": 78762336, "step": 64765 }, { "epoch": 8.115524370379651, "grad_norm": 0.07901985943317413, "learning_rate": 7.412286023028249e-06, "loss": 0.4583, "num_input_tokens_seen": 78768576, "step": 64770 }, { "epoch": 8.116150858288435, "grad_norm": 0.10770440101623535, "learning_rate": 7.411807131274861e-06, "loss": 0.4598, "num_input_tokens_seen": 78774816, "step": 64775 }, { "epoch": 8.11677734619722, "grad_norm": 0.10060302913188934, "learning_rate": 7.411328210686291e-06, "loss": 0.4578, "num_input_tokens_seen": 78781024, "step": 64780 }, { "epoch": 8.117403834106002, "grad_norm": 0.0660051554441452, "learning_rate": 7.410849261268265e-06, "loss": 0.466, "num_input_tokens_seen": 78787104, "step": 64785 }, { "epoch": 8.118030322014786, "grad_norm": 0.06628088653087616, "learning_rate": 7.410370283026509e-06, "loss": 0.457, "num_input_tokens_seen": 78793184, "step": 64790 }, { "epoch": 8.118656809923568, "grad_norm": 0.0760156512260437, "learning_rate": 7.40989127596675e-06, "loss": 0.4647, "num_input_tokens_seen": 78799040, "step": 64795 }, { "epoch": 8.119283297832352, "grad_norm": 0.09063728898763657, "learning_rate": 7.409412240094714e-06, "loss": 0.4662, "num_input_tokens_seen": 78805056, "step": 64800 }, { "epoch": 8.119909785741136, "grad_norm": 0.1282251924276352, "learning_rate": 7.408933175416129e-06, "loss": 0.4573, "num_input_tokens_seen": 78810720, "step": 64805 }, { "epoch": 8.120536273649918, "grad_norm": 0.08500903844833374, "learning_rate": 7.408454081936722e-06, "loss": 0.4638, "num_input_tokens_seen": 78816768, "step": 64810 }, { "epoch": 8.121162761558702, "grad_norm": 0.053312599658966064, "learning_rate": 7.407974959662222e-06, "loss": 0.4641, "num_input_tokens_seen": 78822208, "step": 64815 }, { "epoch": 8.121789249467485, "grad_norm": 0.09986107796430588, "learning_rate": 7.407495808598356e-06, "loss": 0.4609, "num_input_tokens_seen": 78828160, "step": 64820 }, { "epoch": 8.122415737376269, "grad_norm": 0.10179277509450912, "learning_rate": 7.407016628750855e-06, "loss": 0.4651, "num_input_tokens_seen": 78834336, "step": 64825 }, { "epoch": 8.123042225285053, "grad_norm": 0.139285147190094, "learning_rate": 7.4065374201254455e-06, "loss": 0.4672, "num_input_tokens_seen": 78840288, "step": 64830 }, { "epoch": 8.123668713193835, "grad_norm": 0.12134432047605515, "learning_rate": 7.40605818272786e-06, "loss": 0.4628, "num_input_tokens_seen": 78846432, "step": 64835 }, { "epoch": 8.124295201102619, "grad_norm": 0.11483294516801834, "learning_rate": 7.405578916563823e-06, "loss": 0.4566, "num_input_tokens_seen": 78852480, "step": 64840 }, { "epoch": 8.124921689011401, "grad_norm": 0.11901882290840149, "learning_rate": 7.40509962163907e-06, "loss": 0.4626, "num_input_tokens_seen": 78858176, "step": 64845 }, { "epoch": 8.125548176920185, "grad_norm": 0.0916600376367569, "learning_rate": 7.404620297959329e-06, "loss": 0.4609, "num_input_tokens_seen": 78864288, "step": 64850 }, { "epoch": 8.12617466482897, "grad_norm": 0.13019607961177826, "learning_rate": 7.40414094553033e-06, "loss": 0.4597, "num_input_tokens_seen": 78869952, "step": 64855 }, { "epoch": 8.126801152737752, "grad_norm": 0.12270189821720123, "learning_rate": 7.4036615643578035e-06, "loss": 0.456, "num_input_tokens_seen": 78875904, "step": 64860 }, { "epoch": 8.127427640646536, "grad_norm": 0.10774615406990051, "learning_rate": 7.4031821544474815e-06, "loss": 0.4608, "num_input_tokens_seen": 78882272, "step": 64865 }, { "epoch": 8.128054128555318, "grad_norm": 0.13367959856987, "learning_rate": 7.402702715805098e-06, "loss": 0.4681, "num_input_tokens_seen": 78888640, "step": 64870 }, { "epoch": 8.128680616464102, "grad_norm": 0.11009316146373749, "learning_rate": 7.402223248436382e-06, "loss": 0.46, "num_input_tokens_seen": 78894752, "step": 64875 }, { "epoch": 8.129307104372886, "grad_norm": 0.09975896030664444, "learning_rate": 7.401743752347069e-06, "loss": 0.4614, "num_input_tokens_seen": 78900416, "step": 64880 }, { "epoch": 8.129933592281668, "grad_norm": 0.13160577416419983, "learning_rate": 7.40126422754289e-06, "loss": 0.463, "num_input_tokens_seen": 78906560, "step": 64885 }, { "epoch": 8.130560080190453, "grad_norm": 0.1504833996295929, "learning_rate": 7.400784674029579e-06, "loss": 0.4531, "num_input_tokens_seen": 78912544, "step": 64890 }, { "epoch": 8.131186568099235, "grad_norm": 0.1487237811088562, "learning_rate": 7.400305091812867e-06, "loss": 0.4625, "num_input_tokens_seen": 78918784, "step": 64895 }, { "epoch": 8.131813056008019, "grad_norm": 0.14016777276992798, "learning_rate": 7.39982548089849e-06, "loss": 0.4671, "num_input_tokens_seen": 78925024, "step": 64900 }, { "epoch": 8.132439543916803, "grad_norm": 0.12402736395597458, "learning_rate": 7.3993458412921825e-06, "loss": 0.4634, "num_input_tokens_seen": 78931104, "step": 64905 }, { "epoch": 8.133066031825585, "grad_norm": 0.1413457691669464, "learning_rate": 7.398866172999679e-06, "loss": 0.4614, "num_input_tokens_seen": 78937440, "step": 64910 }, { "epoch": 8.13369251973437, "grad_norm": 0.12358015030622482, "learning_rate": 7.398386476026712e-06, "loss": 0.4619, "num_input_tokens_seen": 78943488, "step": 64915 }, { "epoch": 8.134319007643153, "grad_norm": 0.15455228090286255, "learning_rate": 7.397906750379018e-06, "loss": 0.4598, "num_input_tokens_seen": 78949408, "step": 64920 }, { "epoch": 8.134945495551936, "grad_norm": 0.1809842586517334, "learning_rate": 7.397426996062334e-06, "loss": 0.4675, "num_input_tokens_seen": 78955296, "step": 64925 }, { "epoch": 8.13557198346072, "grad_norm": 0.13068249821662903, "learning_rate": 7.396947213082393e-06, "loss": 0.4535, "num_input_tokens_seen": 78961440, "step": 64930 }, { "epoch": 8.136198471369502, "grad_norm": 0.13327868282794952, "learning_rate": 7.396467401444935e-06, "loss": 0.4633, "num_input_tokens_seen": 78967552, "step": 64935 }, { "epoch": 8.136824959278286, "grad_norm": 0.08668581396341324, "learning_rate": 7.395987561155691e-06, "loss": 0.463, "num_input_tokens_seen": 78973952, "step": 64940 }, { "epoch": 8.13745144718707, "grad_norm": 0.12356089055538177, "learning_rate": 7.395507692220404e-06, "loss": 0.4611, "num_input_tokens_seen": 78980064, "step": 64945 }, { "epoch": 8.138077935095852, "grad_norm": 0.09582047909498215, "learning_rate": 7.395027794644808e-06, "loss": 0.459, "num_input_tokens_seen": 78986368, "step": 64950 }, { "epoch": 8.138704423004636, "grad_norm": 0.1579059660434723, "learning_rate": 7.39454786843464e-06, "loss": 0.459, "num_input_tokens_seen": 78992480, "step": 64955 }, { "epoch": 8.139330910913419, "grad_norm": 0.1071741133928299, "learning_rate": 7.394067913595642e-06, "loss": 0.4507, "num_input_tokens_seen": 78998944, "step": 64960 }, { "epoch": 8.139957398822203, "grad_norm": 0.129124715924263, "learning_rate": 7.3935879301335465e-06, "loss": 0.4606, "num_input_tokens_seen": 79005024, "step": 64965 }, { "epoch": 8.140583886730987, "grad_norm": 0.16009952127933502, "learning_rate": 7.393107918054095e-06, "loss": 0.4667, "num_input_tokens_seen": 79011168, "step": 64970 }, { "epoch": 8.141210374639769, "grad_norm": 0.1930364966392517, "learning_rate": 7.392627877363027e-06, "loss": 0.4786, "num_input_tokens_seen": 79017184, "step": 64975 }, { "epoch": 8.141836862548553, "grad_norm": 0.16102482378482819, "learning_rate": 7.392147808066081e-06, "loss": 0.4669, "num_input_tokens_seen": 79023328, "step": 64980 }, { "epoch": 8.142463350457335, "grad_norm": 0.1896856427192688, "learning_rate": 7.391667710168997e-06, "loss": 0.4557, "num_input_tokens_seen": 79029280, "step": 64985 }, { "epoch": 8.14308983836612, "grad_norm": 0.15424706041812897, "learning_rate": 7.3911875836775146e-06, "loss": 0.4593, "num_input_tokens_seen": 79035520, "step": 64990 }, { "epoch": 8.143716326274903, "grad_norm": 0.12486754357814789, "learning_rate": 7.390707428597375e-06, "loss": 0.4706, "num_input_tokens_seen": 79041664, "step": 64995 }, { "epoch": 8.144342814183686, "grad_norm": 0.13543838262557983, "learning_rate": 7.390227244934316e-06, "loss": 0.4652, "num_input_tokens_seen": 79047296, "step": 65000 }, { "epoch": 8.14496930209247, "grad_norm": 0.14504922926425934, "learning_rate": 7.389747032694083e-06, "loss": 0.4665, "num_input_tokens_seen": 79053184, "step": 65005 }, { "epoch": 8.145595790001252, "grad_norm": 0.12729182839393616, "learning_rate": 7.3892667918824136e-06, "loss": 0.4643, "num_input_tokens_seen": 79059328, "step": 65010 }, { "epoch": 8.146222277910036, "grad_norm": 0.16309741139411926, "learning_rate": 7.388786522505051e-06, "loss": 0.4634, "num_input_tokens_seen": 79065408, "step": 65015 }, { "epoch": 8.14684876581882, "grad_norm": 0.20638351142406464, "learning_rate": 7.388306224567738e-06, "loss": 0.4566, "num_input_tokens_seen": 79070880, "step": 65020 }, { "epoch": 8.147475253727603, "grad_norm": 0.12219048291444778, "learning_rate": 7.3878258980762155e-06, "loss": 0.4561, "num_input_tokens_seen": 79076384, "step": 65025 }, { "epoch": 8.148101741636387, "grad_norm": 0.17798654735088348, "learning_rate": 7.387345543036226e-06, "loss": 0.4626, "num_input_tokens_seen": 79082528, "step": 65030 }, { "epoch": 8.14872822954517, "grad_norm": 0.13812041282653809, "learning_rate": 7.386865159453515e-06, "loss": 0.4714, "num_input_tokens_seen": 79088992, "step": 65035 }, { "epoch": 8.149354717453953, "grad_norm": 0.13720476627349854, "learning_rate": 7.386384747333823e-06, "loss": 0.469, "num_input_tokens_seen": 79095136, "step": 65040 }, { "epoch": 8.149981205362737, "grad_norm": 0.1328272819519043, "learning_rate": 7.385904306682896e-06, "loss": 0.4494, "num_input_tokens_seen": 79101344, "step": 65045 }, { "epoch": 8.15060769327152, "grad_norm": 0.1642511785030365, "learning_rate": 7.385423837506477e-06, "loss": 0.4661, "num_input_tokens_seen": 79107008, "step": 65050 }, { "epoch": 8.151234181180303, "grad_norm": 0.1770043671131134, "learning_rate": 7.38494333981031e-06, "loss": 0.4694, "num_input_tokens_seen": 79113152, "step": 65055 }, { "epoch": 8.151860669089087, "grad_norm": 0.14305078983306885, "learning_rate": 7.38446281360014e-06, "loss": 0.4641, "num_input_tokens_seen": 79119296, "step": 65060 }, { "epoch": 8.15248715699787, "grad_norm": 0.15725430846214294, "learning_rate": 7.383982258881712e-06, "loss": 0.4681, "num_input_tokens_seen": 79125376, "step": 65065 }, { "epoch": 8.153113644906654, "grad_norm": 0.1091976910829544, "learning_rate": 7.383501675660773e-06, "loss": 0.4533, "num_input_tokens_seen": 79131360, "step": 65070 }, { "epoch": 8.153740132815436, "grad_norm": 0.10228089243173599, "learning_rate": 7.3830210639430655e-06, "loss": 0.4609, "num_input_tokens_seen": 79138016, "step": 65075 }, { "epoch": 8.15436662072422, "grad_norm": 0.10780128836631775, "learning_rate": 7.382540423734339e-06, "loss": 0.4587, "num_input_tokens_seen": 79144224, "step": 65080 }, { "epoch": 8.154993108633004, "grad_norm": 0.12387372553348541, "learning_rate": 7.382059755040338e-06, "loss": 0.4592, "num_input_tokens_seen": 79150560, "step": 65085 }, { "epoch": 8.155619596541786, "grad_norm": 0.10865311324596405, "learning_rate": 7.3815790578668115e-06, "loss": 0.4709, "num_input_tokens_seen": 79156960, "step": 65090 }, { "epoch": 8.15624608445057, "grad_norm": 0.1272049993276596, "learning_rate": 7.381098332219504e-06, "loss": 0.4644, "num_input_tokens_seen": 79163040, "step": 65095 }, { "epoch": 8.156872572359353, "grad_norm": 0.08672573417425156, "learning_rate": 7.380617578104165e-06, "loss": 0.4639, "num_input_tokens_seen": 79169440, "step": 65100 }, { "epoch": 8.157499060268137, "grad_norm": 0.11047334969043732, "learning_rate": 7.38013679552654e-06, "loss": 0.4726, "num_input_tokens_seen": 79175392, "step": 65105 }, { "epoch": 8.15812554817692, "grad_norm": 0.09418344497680664, "learning_rate": 7.379655984492379e-06, "loss": 0.462, "num_input_tokens_seen": 79181600, "step": 65110 }, { "epoch": 8.158752036085703, "grad_norm": 0.09746053069829941, "learning_rate": 7.37917514500743e-06, "loss": 0.4588, "num_input_tokens_seen": 79187904, "step": 65115 }, { "epoch": 8.159378523994487, "grad_norm": 0.1468520313501358, "learning_rate": 7.3786942770774405e-06, "loss": 0.4605, "num_input_tokens_seen": 79194048, "step": 65120 }, { "epoch": 8.16000501190327, "grad_norm": 0.10701429843902588, "learning_rate": 7.3782133807081625e-06, "loss": 0.4699, "num_input_tokens_seen": 79199808, "step": 65125 }, { "epoch": 8.160631499812053, "grad_norm": 0.11402272433042526, "learning_rate": 7.377732455905343e-06, "loss": 0.4674, "num_input_tokens_seen": 79205888, "step": 65130 }, { "epoch": 8.161257987720838, "grad_norm": 0.09653947502374649, "learning_rate": 7.377251502674734e-06, "loss": 0.458, "num_input_tokens_seen": 79211968, "step": 65135 }, { "epoch": 8.16188447562962, "grad_norm": 0.09826899319887161, "learning_rate": 7.376770521022084e-06, "loss": 0.4621, "num_input_tokens_seen": 79218016, "step": 65140 }, { "epoch": 8.162510963538404, "grad_norm": 0.07192737609148026, "learning_rate": 7.376289510953145e-06, "loss": 0.4595, "num_input_tokens_seen": 79223328, "step": 65145 }, { "epoch": 8.163137451447188, "grad_norm": 0.12553870677947998, "learning_rate": 7.375808472473667e-06, "loss": 0.4618, "num_input_tokens_seen": 79229632, "step": 65150 }, { "epoch": 8.16376393935597, "grad_norm": 0.09792953729629517, "learning_rate": 7.375327405589401e-06, "loss": 0.4627, "num_input_tokens_seen": 79235552, "step": 65155 }, { "epoch": 8.164390427264754, "grad_norm": 0.09627428650856018, "learning_rate": 7.374846310306099e-06, "loss": 0.4662, "num_input_tokens_seen": 79241728, "step": 65160 }, { "epoch": 8.165016915173537, "grad_norm": 0.1316259801387787, "learning_rate": 7.3743651866295104e-06, "loss": 0.4586, "num_input_tokens_seen": 79247744, "step": 65165 }, { "epoch": 8.16564340308232, "grad_norm": 0.10524633526802063, "learning_rate": 7.373884034565393e-06, "loss": 0.4687, "num_input_tokens_seen": 79253792, "step": 65170 }, { "epoch": 8.166269890991105, "grad_norm": 0.13555072247982025, "learning_rate": 7.3734028541194935e-06, "loss": 0.4593, "num_input_tokens_seen": 79260064, "step": 65175 }, { "epoch": 8.166896378899887, "grad_norm": 0.11513309180736542, "learning_rate": 7.372921645297568e-06, "loss": 0.4572, "num_input_tokens_seen": 79265920, "step": 65180 }, { "epoch": 8.167522866808671, "grad_norm": 0.1162218227982521, "learning_rate": 7.37244040810537e-06, "loss": 0.4586, "num_input_tokens_seen": 79272320, "step": 65185 }, { "epoch": 8.168149354717453, "grad_norm": 0.14591798186302185, "learning_rate": 7.371959142548652e-06, "loss": 0.4602, "num_input_tokens_seen": 79278272, "step": 65190 }, { "epoch": 8.168775842626237, "grad_norm": 0.06395982205867767, "learning_rate": 7.371477848633169e-06, "loss": 0.4604, "num_input_tokens_seen": 79284160, "step": 65195 }, { "epoch": 8.169402330535021, "grad_norm": 0.19120845198631287, "learning_rate": 7.370996526364673e-06, "loss": 0.4597, "num_input_tokens_seen": 79290208, "step": 65200 }, { "epoch": 8.170028818443804, "grad_norm": 0.11403404176235199, "learning_rate": 7.370515175748922e-06, "loss": 0.4619, "num_input_tokens_seen": 79296416, "step": 65205 }, { "epoch": 8.170655306352588, "grad_norm": 0.11592160165309906, "learning_rate": 7.370033796791667e-06, "loss": 0.46, "num_input_tokens_seen": 79302304, "step": 65210 }, { "epoch": 8.17128179426137, "grad_norm": 0.06769277900457382, "learning_rate": 7.369552389498667e-06, "loss": 0.4643, "num_input_tokens_seen": 79308416, "step": 65215 }, { "epoch": 8.171908282170154, "grad_norm": 0.11843330413103104, "learning_rate": 7.369070953875676e-06, "loss": 0.4631, "num_input_tokens_seen": 79314560, "step": 65220 }, { "epoch": 8.172534770078938, "grad_norm": 0.14729800820350647, "learning_rate": 7.3685894899284484e-06, "loss": 0.4732, "num_input_tokens_seen": 79320992, "step": 65225 }, { "epoch": 8.17316125798772, "grad_norm": 0.1568617820739746, "learning_rate": 7.368107997662742e-06, "loss": 0.4672, "num_input_tokens_seen": 79326688, "step": 65230 }, { "epoch": 8.173787745896504, "grad_norm": 0.09130416065454483, "learning_rate": 7.367626477084314e-06, "loss": 0.462, "num_input_tokens_seen": 79332608, "step": 65235 }, { "epoch": 8.174414233805287, "grad_norm": 0.12176349014043808, "learning_rate": 7.367144928198922e-06, "loss": 0.4635, "num_input_tokens_seen": 79338624, "step": 65240 }, { "epoch": 8.17504072171407, "grad_norm": 0.09009627252817154, "learning_rate": 7.366663351012322e-06, "loss": 0.4651, "num_input_tokens_seen": 79344992, "step": 65245 }, { "epoch": 8.175667209622855, "grad_norm": 0.07816021144390106, "learning_rate": 7.36618174553027e-06, "loss": 0.4511, "num_input_tokens_seen": 79351232, "step": 65250 }, { "epoch": 8.176293697531637, "grad_norm": 0.11672460287809372, "learning_rate": 7.365700111758527e-06, "loss": 0.4593, "num_input_tokens_seen": 79357568, "step": 65255 }, { "epoch": 8.176920185440421, "grad_norm": 0.12486270070075989, "learning_rate": 7.3652184497028516e-06, "loss": 0.4575, "num_input_tokens_seen": 79363840, "step": 65260 }, { "epoch": 8.177546673349203, "grad_norm": 0.10632693022489548, "learning_rate": 7.364736759368999e-06, "loss": 0.4646, "num_input_tokens_seen": 79369984, "step": 65265 }, { "epoch": 8.178173161257988, "grad_norm": 0.14226636290550232, "learning_rate": 7.3642550407627315e-06, "loss": 0.4616, "num_input_tokens_seen": 79376352, "step": 65270 }, { "epoch": 8.178799649166772, "grad_norm": 0.08781202882528305, "learning_rate": 7.3637732938898065e-06, "loss": 0.4591, "num_input_tokens_seen": 79382272, "step": 65275 }, { "epoch": 8.179426137075554, "grad_norm": 0.09199680387973785, "learning_rate": 7.363291518755984e-06, "loss": 0.4573, "num_input_tokens_seen": 79388256, "step": 65280 }, { "epoch": 8.180052624984338, "grad_norm": 0.08479119092226028, "learning_rate": 7.362809715367025e-06, "loss": 0.4646, "num_input_tokens_seen": 79394016, "step": 65285 }, { "epoch": 8.180679112893122, "grad_norm": 0.0787719264626503, "learning_rate": 7.362327883728691e-06, "loss": 0.4513, "num_input_tokens_seen": 79400544, "step": 65290 }, { "epoch": 8.181305600801904, "grad_norm": 0.12013670802116394, "learning_rate": 7.361846023846738e-06, "loss": 0.465, "num_input_tokens_seen": 79406240, "step": 65295 }, { "epoch": 8.181932088710688, "grad_norm": 0.14480824768543243, "learning_rate": 7.361364135726932e-06, "loss": 0.4583, "num_input_tokens_seen": 79412320, "step": 65300 }, { "epoch": 8.18255857661947, "grad_norm": 0.15343615412712097, "learning_rate": 7.3608822193750316e-06, "loss": 0.4632, "num_input_tokens_seen": 79418560, "step": 65305 }, { "epoch": 8.183185064528255, "grad_norm": 0.09170856326818466, "learning_rate": 7.3604002747967995e-06, "loss": 0.4595, "num_input_tokens_seen": 79424192, "step": 65310 }, { "epoch": 8.183811552437039, "grad_norm": 0.13366366922855377, "learning_rate": 7.359918301997998e-06, "loss": 0.4615, "num_input_tokens_seen": 79429952, "step": 65315 }, { "epoch": 8.184438040345821, "grad_norm": 0.09897927194833755, "learning_rate": 7.35943630098439e-06, "loss": 0.4617, "num_input_tokens_seen": 79436064, "step": 65320 }, { "epoch": 8.185064528254605, "grad_norm": 0.1020847037434578, "learning_rate": 7.358954271761736e-06, "loss": 0.4614, "num_input_tokens_seen": 79442336, "step": 65325 }, { "epoch": 8.185691016163387, "grad_norm": 0.0924157202243805, "learning_rate": 7.3584722143358004e-06, "loss": 0.4528, "num_input_tokens_seen": 79448352, "step": 65330 }, { "epoch": 8.186317504072171, "grad_norm": 0.10236497968435287, "learning_rate": 7.357990128712346e-06, "loss": 0.466, "num_input_tokens_seen": 79454496, "step": 65335 }, { "epoch": 8.186943991980955, "grad_norm": 0.08507701754570007, "learning_rate": 7.357508014897138e-06, "loss": 0.4674, "num_input_tokens_seen": 79460704, "step": 65340 }, { "epoch": 8.187570479889738, "grad_norm": 0.17159564793109894, "learning_rate": 7.35702587289594e-06, "loss": 0.4618, "num_input_tokens_seen": 79467104, "step": 65345 }, { "epoch": 8.188196967798522, "grad_norm": 0.11151474714279175, "learning_rate": 7.356543702714515e-06, "loss": 0.4605, "num_input_tokens_seen": 79473312, "step": 65350 }, { "epoch": 8.188823455707304, "grad_norm": 0.08923210203647614, "learning_rate": 7.356061504358629e-06, "loss": 0.4532, "num_input_tokens_seen": 79478976, "step": 65355 }, { "epoch": 8.189449943616088, "grad_norm": 0.09236263483762741, "learning_rate": 7.355579277834048e-06, "loss": 0.4555, "num_input_tokens_seen": 79484992, "step": 65360 }, { "epoch": 8.190076431524872, "grad_norm": 0.07562343031167984, "learning_rate": 7.355097023146536e-06, "loss": 0.4592, "num_input_tokens_seen": 79490304, "step": 65365 }, { "epoch": 8.190702919433654, "grad_norm": 0.07495338469743729, "learning_rate": 7.3546147403018575e-06, "loss": 0.4597, "num_input_tokens_seen": 79496224, "step": 65370 }, { "epoch": 8.191329407342439, "grad_norm": 0.11479773372411728, "learning_rate": 7.35413242930578e-06, "loss": 0.4604, "num_input_tokens_seen": 79502464, "step": 65375 }, { "epoch": 8.19195589525122, "grad_norm": 0.1630651205778122, "learning_rate": 7.3536500901640716e-06, "loss": 0.4585, "num_input_tokens_seen": 79508512, "step": 65380 }, { "epoch": 8.192582383160005, "grad_norm": 0.12620362639427185, "learning_rate": 7.353167722882497e-06, "loss": 0.4669, "num_input_tokens_seen": 79514720, "step": 65385 }, { "epoch": 8.193208871068789, "grad_norm": 0.12143266946077347, "learning_rate": 7.352685327466824e-06, "loss": 0.463, "num_input_tokens_seen": 79520928, "step": 65390 }, { "epoch": 8.193835358977571, "grad_norm": 0.15300332009792328, "learning_rate": 7.352202903922819e-06, "loss": 0.4628, "num_input_tokens_seen": 79527232, "step": 65395 }, { "epoch": 8.194461846886355, "grad_norm": 0.07834798842668533, "learning_rate": 7.351720452256252e-06, "loss": 0.4556, "num_input_tokens_seen": 79533760, "step": 65400 }, { "epoch": 8.19508833479514, "grad_norm": 0.1312020868062973, "learning_rate": 7.3512379724728885e-06, "loss": 0.4615, "num_input_tokens_seen": 79540000, "step": 65405 }, { "epoch": 8.195714822703922, "grad_norm": 0.13114045560359955, "learning_rate": 7.350755464578501e-06, "loss": 0.4629, "num_input_tokens_seen": 79546496, "step": 65410 }, { "epoch": 8.196341310612706, "grad_norm": 0.10667770355939865, "learning_rate": 7.350272928578852e-06, "loss": 0.4652, "num_input_tokens_seen": 79552896, "step": 65415 }, { "epoch": 8.196967798521488, "grad_norm": 0.0849975198507309, "learning_rate": 7.349790364479717e-06, "loss": 0.4623, "num_input_tokens_seen": 79558976, "step": 65420 }, { "epoch": 8.197594286430272, "grad_norm": 0.13305358588695526, "learning_rate": 7.349307772286861e-06, "loss": 0.452, "num_input_tokens_seen": 79565024, "step": 65425 }, { "epoch": 8.198220774339056, "grad_norm": 0.12297750264406204, "learning_rate": 7.348825152006054e-06, "loss": 0.4672, "num_input_tokens_seen": 79571232, "step": 65430 }, { "epoch": 8.198847262247838, "grad_norm": 0.08374625444412231, "learning_rate": 7.34834250364307e-06, "loss": 0.4606, "num_input_tokens_seen": 79577088, "step": 65435 }, { "epoch": 8.199473750156622, "grad_norm": 0.12185750901699066, "learning_rate": 7.347859827203675e-06, "loss": 0.4651, "num_input_tokens_seen": 79583360, "step": 65440 }, { "epoch": 8.200100238065405, "grad_norm": 0.11823485046625137, "learning_rate": 7.347377122693643e-06, "loss": 0.4586, "num_input_tokens_seen": 79589536, "step": 65445 }, { "epoch": 8.200726725974189, "grad_norm": 0.16061802208423615, "learning_rate": 7.346894390118744e-06, "loss": 0.4612, "num_input_tokens_seen": 79595296, "step": 65450 }, { "epoch": 8.201353213882973, "grad_norm": 0.0826650857925415, "learning_rate": 7.346411629484749e-06, "loss": 0.4655, "num_input_tokens_seen": 79601280, "step": 65455 }, { "epoch": 8.201979701791755, "grad_norm": 0.12688060104846954, "learning_rate": 7.345928840797431e-06, "loss": 0.4613, "num_input_tokens_seen": 79606240, "step": 65460 }, { "epoch": 8.20260618970054, "grad_norm": 0.08406780660152435, "learning_rate": 7.345446024062561e-06, "loss": 0.4605, "num_input_tokens_seen": 79611904, "step": 65465 }, { "epoch": 8.203232677609321, "grad_norm": 0.17110244929790497, "learning_rate": 7.344963179285911e-06, "loss": 0.4622, "num_input_tokens_seen": 79618048, "step": 65470 }, { "epoch": 8.203859165518105, "grad_norm": 0.13765646517276764, "learning_rate": 7.344480306473256e-06, "loss": 0.4614, "num_input_tokens_seen": 79624416, "step": 65475 }, { "epoch": 8.20448565342689, "grad_norm": 0.0836518406867981, "learning_rate": 7.343997405630368e-06, "loss": 0.4725, "num_input_tokens_seen": 79630624, "step": 65480 }, { "epoch": 8.205112141335672, "grad_norm": 0.15803112089633942, "learning_rate": 7.343514476763019e-06, "loss": 0.4644, "num_input_tokens_seen": 79636416, "step": 65485 }, { "epoch": 8.205738629244456, "grad_norm": 0.11665552854537964, "learning_rate": 7.343031519876984e-06, "loss": 0.4585, "num_input_tokens_seen": 79642624, "step": 65490 }, { "epoch": 8.206365117153238, "grad_norm": 0.09184204787015915, "learning_rate": 7.342548534978036e-06, "loss": 0.4632, "num_input_tokens_seen": 79649088, "step": 65495 }, { "epoch": 8.206991605062022, "grad_norm": 0.1753852516412735, "learning_rate": 7.3420655220719525e-06, "loss": 0.4595, "num_input_tokens_seen": 79655072, "step": 65500 }, { "epoch": 8.207618092970806, "grad_norm": 0.09058718383312225, "learning_rate": 7.341582481164508e-06, "loss": 0.4654, "num_input_tokens_seen": 79661184, "step": 65505 }, { "epoch": 8.208244580879589, "grad_norm": 0.11675219982862473, "learning_rate": 7.341099412261474e-06, "loss": 0.4651, "num_input_tokens_seen": 79667424, "step": 65510 }, { "epoch": 8.208871068788373, "grad_norm": 0.10879255831241608, "learning_rate": 7.340616315368629e-06, "loss": 0.4665, "num_input_tokens_seen": 79673280, "step": 65515 }, { "epoch": 8.209497556697155, "grad_norm": 0.16416452825069427, "learning_rate": 7.34013319049175e-06, "loss": 0.4587, "num_input_tokens_seen": 79679424, "step": 65520 }, { "epoch": 8.210124044605939, "grad_norm": 0.09671349078416824, "learning_rate": 7.339650037636609e-06, "loss": 0.4602, "num_input_tokens_seen": 79685632, "step": 65525 }, { "epoch": 8.210750532514723, "grad_norm": 0.09404109418392181, "learning_rate": 7.339166856808984e-06, "loss": 0.4684, "num_input_tokens_seen": 79691200, "step": 65530 }, { "epoch": 8.211377020423505, "grad_norm": 0.09584734588861465, "learning_rate": 7.338683648014655e-06, "loss": 0.4646, "num_input_tokens_seen": 79697152, "step": 65535 }, { "epoch": 8.21200350833229, "grad_norm": 0.1245458647608757, "learning_rate": 7.338200411259394e-06, "loss": 0.4616, "num_input_tokens_seen": 79703616, "step": 65540 }, { "epoch": 8.212629996241073, "grad_norm": 0.09431397169828415, "learning_rate": 7.337717146548983e-06, "loss": 0.4604, "num_input_tokens_seen": 79709536, "step": 65545 }, { "epoch": 8.213256484149856, "grad_norm": 0.11250632256269455, "learning_rate": 7.337233853889199e-06, "loss": 0.4562, "num_input_tokens_seen": 79715840, "step": 65550 }, { "epoch": 8.21388297205864, "grad_norm": 0.07503321766853333, "learning_rate": 7.336750533285817e-06, "loss": 0.4626, "num_input_tokens_seen": 79722016, "step": 65555 }, { "epoch": 8.214509459967422, "grad_norm": 0.11603040993213654, "learning_rate": 7.336267184744617e-06, "loss": 0.4576, "num_input_tokens_seen": 79728032, "step": 65560 }, { "epoch": 8.215135947876206, "grad_norm": 0.12228246033191681, "learning_rate": 7.335783808271382e-06, "loss": 0.4501, "num_input_tokens_seen": 79734176, "step": 65565 }, { "epoch": 8.21576243578499, "grad_norm": 0.11238108575344086, "learning_rate": 7.3353004038718855e-06, "loss": 0.4638, "num_input_tokens_seen": 79739488, "step": 65570 }, { "epoch": 8.216388923693772, "grad_norm": 0.13663563132286072, "learning_rate": 7.33481697155191e-06, "loss": 0.466, "num_input_tokens_seen": 79745792, "step": 65575 }, { "epoch": 8.217015411602556, "grad_norm": 0.09824326634407043, "learning_rate": 7.3343335113172345e-06, "loss": 0.4634, "num_input_tokens_seen": 79751840, "step": 65580 }, { "epoch": 8.217641899511339, "grad_norm": 0.16905121505260468, "learning_rate": 7.333850023173639e-06, "loss": 0.4641, "num_input_tokens_seen": 79757824, "step": 65585 }, { "epoch": 8.218268387420123, "grad_norm": 0.13179995119571686, "learning_rate": 7.333366507126905e-06, "loss": 0.4647, "num_input_tokens_seen": 79763872, "step": 65590 }, { "epoch": 8.218894875328907, "grad_norm": 0.10992566496133804, "learning_rate": 7.332882963182812e-06, "loss": 0.4525, "num_input_tokens_seen": 79770048, "step": 65595 }, { "epoch": 8.21952136323769, "grad_norm": 0.12098957598209381, "learning_rate": 7.3323993913471415e-06, "loss": 0.4613, "num_input_tokens_seen": 79776352, "step": 65600 }, { "epoch": 8.220147851146473, "grad_norm": 0.11226644366979599, "learning_rate": 7.3319157916256756e-06, "loss": 0.4622, "num_input_tokens_seen": 79781856, "step": 65605 }, { "epoch": 8.220774339055255, "grad_norm": 0.12487322092056274, "learning_rate": 7.3314321640241955e-06, "loss": 0.4652, "num_input_tokens_seen": 79788128, "step": 65610 }, { "epoch": 8.22140082696404, "grad_norm": 0.11237268149852753, "learning_rate": 7.330948508548484e-06, "loss": 0.4607, "num_input_tokens_seen": 79794336, "step": 65615 }, { "epoch": 8.222027314872824, "grad_norm": 0.12840160727500916, "learning_rate": 7.330464825204323e-06, "loss": 0.4648, "num_input_tokens_seen": 79800416, "step": 65620 }, { "epoch": 8.222653802781606, "grad_norm": 0.18198750913143158, "learning_rate": 7.329981113997496e-06, "loss": 0.4594, "num_input_tokens_seen": 79806592, "step": 65625 }, { "epoch": 8.22328029069039, "grad_norm": 0.14871717989444733, "learning_rate": 7.329497374933788e-06, "loss": 0.4608, "num_input_tokens_seen": 79812480, "step": 65630 }, { "epoch": 8.223906778599172, "grad_norm": 0.12424596399068832, "learning_rate": 7.329013608018978e-06, "loss": 0.4625, "num_input_tokens_seen": 79818720, "step": 65635 }, { "epoch": 8.224533266507956, "grad_norm": 0.13042406737804413, "learning_rate": 7.328529813258851e-06, "loss": 0.4699, "num_input_tokens_seen": 79824864, "step": 65640 }, { "epoch": 8.22515975441674, "grad_norm": 0.19524559378623962, "learning_rate": 7.328045990659194e-06, "loss": 0.4662, "num_input_tokens_seen": 79830880, "step": 65645 }, { "epoch": 8.225786242325523, "grad_norm": 0.1566656529903412, "learning_rate": 7.3275621402257904e-06, "loss": 0.4557, "num_input_tokens_seen": 79837216, "step": 65650 }, { "epoch": 8.226412730234307, "grad_norm": 0.15536339581012726, "learning_rate": 7.327078261964424e-06, "loss": 0.4623, "num_input_tokens_seen": 79842592, "step": 65655 }, { "epoch": 8.22703921814309, "grad_norm": 0.2342880368232727, "learning_rate": 7.32659435588088e-06, "loss": 0.4668, "num_input_tokens_seen": 79849024, "step": 65660 }, { "epoch": 8.227665706051873, "grad_norm": 0.19212523102760315, "learning_rate": 7.326110421980944e-06, "loss": 0.4697, "num_input_tokens_seen": 79855296, "step": 65665 }, { "epoch": 8.228292193960657, "grad_norm": 0.10335595905780792, "learning_rate": 7.325626460270403e-06, "loss": 0.4656, "num_input_tokens_seen": 79860544, "step": 65670 }, { "epoch": 8.22891868186944, "grad_norm": 0.13087281584739685, "learning_rate": 7.325142470755043e-06, "loss": 0.462, "num_input_tokens_seen": 79866528, "step": 65675 }, { "epoch": 8.229545169778223, "grad_norm": 0.15643775463104248, "learning_rate": 7.324658453440648e-06, "loss": 0.4611, "num_input_tokens_seen": 79872768, "step": 65680 }, { "epoch": 8.230171657687007, "grad_norm": 0.1155184656381607, "learning_rate": 7.324174408333009e-06, "loss": 0.4577, "num_input_tokens_seen": 79878720, "step": 65685 }, { "epoch": 8.23079814559579, "grad_norm": 0.1198420524597168, "learning_rate": 7.323690335437909e-06, "loss": 0.4675, "num_input_tokens_seen": 79884960, "step": 65690 }, { "epoch": 8.231424633504574, "grad_norm": 0.13440164923667908, "learning_rate": 7.323206234761139e-06, "loss": 0.4656, "num_input_tokens_seen": 79891136, "step": 65695 }, { "epoch": 8.232051121413356, "grad_norm": 0.11890896409749985, "learning_rate": 7.322722106308484e-06, "loss": 0.4677, "num_input_tokens_seen": 79897408, "step": 65700 }, { "epoch": 8.23267760932214, "grad_norm": 0.13510411977767944, "learning_rate": 7.322237950085733e-06, "loss": 0.463, "num_input_tokens_seen": 79903456, "step": 65705 }, { "epoch": 8.233304097230924, "grad_norm": 0.2151809185743332, "learning_rate": 7.321753766098676e-06, "loss": 0.4527, "num_input_tokens_seen": 79909824, "step": 65710 }, { "epoch": 8.233930585139706, "grad_norm": 0.2277354747056961, "learning_rate": 7.321269554353101e-06, "loss": 0.4609, "num_input_tokens_seen": 79914816, "step": 65715 }, { "epoch": 8.23455707304849, "grad_norm": 0.2835238575935364, "learning_rate": 7.320785314854796e-06, "loss": 0.4709, "num_input_tokens_seen": 79920736, "step": 65720 }, { "epoch": 8.235183560957273, "grad_norm": 0.06838489323854446, "learning_rate": 7.320301047609553e-06, "loss": 0.4704, "num_input_tokens_seen": 79926656, "step": 65725 }, { "epoch": 8.235810048866057, "grad_norm": 0.15302987396717072, "learning_rate": 7.3198167526231575e-06, "loss": 0.4652, "num_input_tokens_seen": 79932608, "step": 65730 }, { "epoch": 8.236436536774841, "grad_norm": 0.10035328567028046, "learning_rate": 7.319332429901404e-06, "loss": 0.4649, "num_input_tokens_seen": 79938912, "step": 65735 }, { "epoch": 8.237063024683623, "grad_norm": 0.0997769758105278, "learning_rate": 7.318848079450081e-06, "loss": 0.4649, "num_input_tokens_seen": 79944864, "step": 65740 }, { "epoch": 8.237689512592407, "grad_norm": 0.1178036481142044, "learning_rate": 7.318363701274979e-06, "loss": 0.4562, "num_input_tokens_seen": 79950400, "step": 65745 }, { "epoch": 8.23831600050119, "grad_norm": 0.09436199069023132, "learning_rate": 7.317879295381889e-06, "loss": 0.4617, "num_input_tokens_seen": 79956800, "step": 65750 }, { "epoch": 8.238942488409974, "grad_norm": 0.11272580921649933, "learning_rate": 7.317394861776606e-06, "loss": 0.4645, "num_input_tokens_seen": 79963040, "step": 65755 }, { "epoch": 8.239568976318758, "grad_norm": 0.08681595325469971, "learning_rate": 7.3169104004649176e-06, "loss": 0.4612, "num_input_tokens_seen": 79968480, "step": 65760 }, { "epoch": 8.24019546422754, "grad_norm": 0.10137871652841568, "learning_rate": 7.316425911452617e-06, "loss": 0.4623, "num_input_tokens_seen": 79974560, "step": 65765 }, { "epoch": 8.240821952136324, "grad_norm": 0.16030625998973846, "learning_rate": 7.315941394745498e-06, "loss": 0.45, "num_input_tokens_seen": 79980800, "step": 65770 }, { "epoch": 8.241448440045108, "grad_norm": 0.1436067670583725, "learning_rate": 7.315456850349352e-06, "loss": 0.4656, "num_input_tokens_seen": 79986976, "step": 65775 }, { "epoch": 8.24207492795389, "grad_norm": 0.10022880882024765, "learning_rate": 7.314972278269973e-06, "loss": 0.4643, "num_input_tokens_seen": 79992992, "step": 65780 }, { "epoch": 8.242701415862674, "grad_norm": 0.11905371397733688, "learning_rate": 7.314487678513153e-06, "loss": 0.4638, "num_input_tokens_seen": 79999040, "step": 65785 }, { "epoch": 8.243327903771457, "grad_norm": 0.16702556610107422, "learning_rate": 7.314003051084688e-06, "loss": 0.4592, "num_input_tokens_seen": 80005408, "step": 65790 }, { "epoch": 8.24395439168024, "grad_norm": 0.11343964189291, "learning_rate": 7.31351839599037e-06, "loss": 0.4618, "num_input_tokens_seen": 80011488, "step": 65795 }, { "epoch": 8.244580879589025, "grad_norm": 0.12352541089057922, "learning_rate": 7.313033713235995e-06, "loss": 0.4622, "num_input_tokens_seen": 80017760, "step": 65800 }, { "epoch": 8.245207367497807, "grad_norm": 0.09393241256475449, "learning_rate": 7.312549002827357e-06, "loss": 0.4709, "num_input_tokens_seen": 80024192, "step": 65805 }, { "epoch": 8.245833855406591, "grad_norm": 0.09563061594963074, "learning_rate": 7.312064264770253e-06, "loss": 0.4633, "num_input_tokens_seen": 80030112, "step": 65810 }, { "epoch": 8.246460343315373, "grad_norm": 0.098301462829113, "learning_rate": 7.311579499070475e-06, "loss": 0.4639, "num_input_tokens_seen": 80036128, "step": 65815 }, { "epoch": 8.247086831224157, "grad_norm": 0.14019879698753357, "learning_rate": 7.311094705733821e-06, "loss": 0.4613, "num_input_tokens_seen": 80042176, "step": 65820 }, { "epoch": 8.247713319132941, "grad_norm": 0.10380097478628159, "learning_rate": 7.310609884766087e-06, "loss": 0.4635, "num_input_tokens_seen": 80048448, "step": 65825 }, { "epoch": 8.248339807041724, "grad_norm": 0.08284937590360641, "learning_rate": 7.3101250361730695e-06, "loss": 0.4598, "num_input_tokens_seen": 80053952, "step": 65830 }, { "epoch": 8.248966294950508, "grad_norm": 0.11139549314975739, "learning_rate": 7.309640159960565e-06, "loss": 0.4597, "num_input_tokens_seen": 80059872, "step": 65835 }, { "epoch": 8.24959278285929, "grad_norm": 0.22822305560112, "learning_rate": 7.3091552561343705e-06, "loss": 0.4623, "num_input_tokens_seen": 80066016, "step": 65840 }, { "epoch": 8.250219270768074, "grad_norm": 0.15185029804706573, "learning_rate": 7.308670324700282e-06, "loss": 0.4575, "num_input_tokens_seen": 80071424, "step": 65845 }, { "epoch": 8.250845758676858, "grad_norm": 0.2549463212490082, "learning_rate": 7.3081853656641e-06, "loss": 0.4712, "num_input_tokens_seen": 80077760, "step": 65850 }, { "epoch": 8.25147224658564, "grad_norm": 0.17607739567756653, "learning_rate": 7.3077003790316216e-06, "loss": 0.4609, "num_input_tokens_seen": 80084096, "step": 65855 }, { "epoch": 8.252098734494425, "grad_norm": 0.09650306403636932, "learning_rate": 7.307215364808644e-06, "loss": 0.4545, "num_input_tokens_seen": 80089536, "step": 65860 }, { "epoch": 8.252725222403207, "grad_norm": 0.19748353958129883, "learning_rate": 7.306730323000967e-06, "loss": 0.463, "num_input_tokens_seen": 80095424, "step": 65865 }, { "epoch": 8.25335171031199, "grad_norm": 0.10925956070423126, "learning_rate": 7.3062452536143904e-06, "loss": 0.4517, "num_input_tokens_seen": 80101120, "step": 65870 }, { "epoch": 8.253978198220775, "grad_norm": 0.10789301991462708, "learning_rate": 7.305760156654712e-06, "loss": 0.4665, "num_input_tokens_seen": 80107040, "step": 65875 }, { "epoch": 8.254604686129557, "grad_norm": 0.14219680428504944, "learning_rate": 7.305275032127734e-06, "loss": 0.4551, "num_input_tokens_seen": 80113216, "step": 65880 }, { "epoch": 8.255231174038341, "grad_norm": 0.1203266829252243, "learning_rate": 7.304789880039254e-06, "loss": 0.4613, "num_input_tokens_seen": 80119296, "step": 65885 }, { "epoch": 8.255857661947124, "grad_norm": 0.22559712827205658, "learning_rate": 7.304304700395072e-06, "loss": 0.472, "num_input_tokens_seen": 80125280, "step": 65890 }, { "epoch": 8.256484149855908, "grad_norm": 0.20363262295722961, "learning_rate": 7.303819493200991e-06, "loss": 0.4642, "num_input_tokens_seen": 80130656, "step": 65895 }, { "epoch": 8.257110637764692, "grad_norm": 0.09502331167459488, "learning_rate": 7.303334258462811e-06, "loss": 0.46, "num_input_tokens_seen": 80136960, "step": 65900 }, { "epoch": 8.257737125673474, "grad_norm": 0.1452968567609787, "learning_rate": 7.302848996186332e-06, "loss": 0.4636, "num_input_tokens_seen": 80142400, "step": 65905 }, { "epoch": 8.258363613582258, "grad_norm": 0.19293822348117828, "learning_rate": 7.302363706377359e-06, "loss": 0.4557, "num_input_tokens_seen": 80148512, "step": 65910 }, { "epoch": 8.258990101491042, "grad_norm": 0.1363128423690796, "learning_rate": 7.3018783890416924e-06, "loss": 0.458, "num_input_tokens_seen": 80154688, "step": 65915 }, { "epoch": 8.259616589399824, "grad_norm": 0.12354367971420288, "learning_rate": 7.301393044185133e-06, "loss": 0.4556, "num_input_tokens_seen": 80160640, "step": 65920 }, { "epoch": 8.260243077308608, "grad_norm": 0.178399920463562, "learning_rate": 7.300907671813486e-06, "loss": 0.4663, "num_input_tokens_seen": 80166080, "step": 65925 }, { "epoch": 8.26086956521739, "grad_norm": 0.21060466766357422, "learning_rate": 7.3004222719325515e-06, "loss": 0.4614, "num_input_tokens_seen": 80172256, "step": 65930 }, { "epoch": 8.261496053126175, "grad_norm": 0.12526045739650726, "learning_rate": 7.299936844548136e-06, "loss": 0.4675, "num_input_tokens_seen": 80178496, "step": 65935 }, { "epoch": 8.262122541034959, "grad_norm": 0.13658857345581055, "learning_rate": 7.299451389666041e-06, "loss": 0.4698, "num_input_tokens_seen": 80184576, "step": 65940 }, { "epoch": 8.262749028943741, "grad_norm": 0.09505967795848846, "learning_rate": 7.298965907292073e-06, "loss": 0.4708, "num_input_tokens_seen": 80190720, "step": 65945 }, { "epoch": 8.263375516852525, "grad_norm": 0.12986037135124207, "learning_rate": 7.298480397432034e-06, "loss": 0.4589, "num_input_tokens_seen": 80196736, "step": 65950 }, { "epoch": 8.264002004761307, "grad_norm": 0.0996844470500946, "learning_rate": 7.297994860091728e-06, "loss": 0.4623, "num_input_tokens_seen": 80202304, "step": 65955 }, { "epoch": 8.264628492670091, "grad_norm": 0.07572083175182343, "learning_rate": 7.297509295276962e-06, "loss": 0.4627, "num_input_tokens_seen": 80208704, "step": 65960 }, { "epoch": 8.265254980578876, "grad_norm": 0.12392506748437881, "learning_rate": 7.297023702993542e-06, "loss": 0.468, "num_input_tokens_seen": 80215104, "step": 65965 }, { "epoch": 8.265881468487658, "grad_norm": 0.09746580570936203, "learning_rate": 7.296538083247271e-06, "loss": 0.4613, "num_input_tokens_seen": 80221536, "step": 65970 }, { "epoch": 8.266507956396442, "grad_norm": 0.1405014991760254, "learning_rate": 7.296052436043958e-06, "loss": 0.4665, "num_input_tokens_seen": 80227520, "step": 65975 }, { "epoch": 8.267134444305224, "grad_norm": 0.09053794294595718, "learning_rate": 7.2955667613894066e-06, "loss": 0.4612, "num_input_tokens_seen": 80233280, "step": 65980 }, { "epoch": 8.267760932214008, "grad_norm": 0.0902172178030014, "learning_rate": 7.295081059289425e-06, "loss": 0.4722, "num_input_tokens_seen": 80238848, "step": 65985 }, { "epoch": 8.268387420122792, "grad_norm": 0.14079491794109344, "learning_rate": 7.294595329749818e-06, "loss": 0.4701, "num_input_tokens_seen": 80244640, "step": 65990 }, { "epoch": 8.269013908031575, "grad_norm": 0.09939733147621155, "learning_rate": 7.294109572776397e-06, "loss": 0.4596, "num_input_tokens_seen": 80250624, "step": 65995 }, { "epoch": 8.269640395940359, "grad_norm": 0.12248096615076065, "learning_rate": 7.2936237883749665e-06, "loss": 0.462, "num_input_tokens_seen": 80256928, "step": 66000 }, { "epoch": 8.27026688384914, "grad_norm": 0.11733011901378632, "learning_rate": 7.293137976551334e-06, "loss": 0.463, "num_input_tokens_seen": 80262816, "step": 66005 }, { "epoch": 8.270893371757925, "grad_norm": 0.12460137158632278, "learning_rate": 7.29265213731131e-06, "loss": 0.4694, "num_input_tokens_seen": 80268768, "step": 66010 }, { "epoch": 8.271519859666709, "grad_norm": 0.10951924324035645, "learning_rate": 7.2921662706607036e-06, "loss": 0.4637, "num_input_tokens_seen": 80275008, "step": 66015 }, { "epoch": 8.272146347575491, "grad_norm": 0.12934869527816772, "learning_rate": 7.29168037660532e-06, "loss": 0.4644, "num_input_tokens_seen": 80281120, "step": 66020 }, { "epoch": 8.272772835484275, "grad_norm": 0.09201790392398834, "learning_rate": 7.291194455150971e-06, "loss": 0.4564, "num_input_tokens_seen": 80286944, "step": 66025 }, { "epoch": 8.273399323393058, "grad_norm": 0.1504320353269577, "learning_rate": 7.290708506303468e-06, "loss": 0.4652, "num_input_tokens_seen": 80293152, "step": 66030 }, { "epoch": 8.274025811301842, "grad_norm": 0.11469309031963348, "learning_rate": 7.2902225300686184e-06, "loss": 0.4637, "num_input_tokens_seen": 80299168, "step": 66035 }, { "epoch": 8.274652299210626, "grad_norm": 0.10724052041769028, "learning_rate": 7.289736526452231e-06, "loss": 0.4602, "num_input_tokens_seen": 80304832, "step": 66040 }, { "epoch": 8.275278787119408, "grad_norm": 0.1168057769536972, "learning_rate": 7.28925049546012e-06, "loss": 0.4591, "num_input_tokens_seen": 80311232, "step": 66045 }, { "epoch": 8.275905275028192, "grad_norm": 0.12635859847068787, "learning_rate": 7.288764437098092e-06, "loss": 0.4555, "num_input_tokens_seen": 80317536, "step": 66050 }, { "epoch": 8.276531762936976, "grad_norm": 0.11270814388990402, "learning_rate": 7.2882783513719644e-06, "loss": 0.4685, "num_input_tokens_seen": 80323360, "step": 66055 }, { "epoch": 8.277158250845758, "grad_norm": 0.09535087645053864, "learning_rate": 7.287792238287543e-06, "loss": 0.4534, "num_input_tokens_seen": 80329792, "step": 66060 }, { "epoch": 8.277784738754542, "grad_norm": 0.14738747477531433, "learning_rate": 7.287306097850643e-06, "loss": 0.4642, "num_input_tokens_seen": 80335616, "step": 66065 }, { "epoch": 8.278411226663325, "grad_norm": 0.12287051975727081, "learning_rate": 7.286819930067075e-06, "loss": 0.4691, "num_input_tokens_seen": 80341472, "step": 66070 }, { "epoch": 8.279037714572109, "grad_norm": 0.10469868034124374, "learning_rate": 7.286333734942653e-06, "loss": 0.4633, "num_input_tokens_seen": 80347552, "step": 66075 }, { "epoch": 8.279664202480893, "grad_norm": 0.113911934196949, "learning_rate": 7.2858475124831876e-06, "loss": 0.4528, "num_input_tokens_seen": 80353792, "step": 66080 }, { "epoch": 8.280290690389675, "grad_norm": 0.06866144388914108, "learning_rate": 7.285361262694495e-06, "loss": 0.4725, "num_input_tokens_seen": 80360000, "step": 66085 }, { "epoch": 8.28091717829846, "grad_norm": 0.11300262063741684, "learning_rate": 7.284874985582386e-06, "loss": 0.4601, "num_input_tokens_seen": 80366368, "step": 66090 }, { "epoch": 8.281543666207241, "grad_norm": 0.0887453556060791, "learning_rate": 7.284388681152675e-06, "loss": 0.4614, "num_input_tokens_seen": 80372448, "step": 66095 }, { "epoch": 8.282170154116026, "grad_norm": 0.11956807225942612, "learning_rate": 7.283902349411178e-06, "loss": 0.462, "num_input_tokens_seen": 80378528, "step": 66100 }, { "epoch": 8.28279664202481, "grad_norm": 0.08328569680452347, "learning_rate": 7.283415990363707e-06, "loss": 0.4574, "num_input_tokens_seen": 80384320, "step": 66105 }, { "epoch": 8.283423129933592, "grad_norm": 0.1315942257642746, "learning_rate": 7.282929604016078e-06, "loss": 0.4616, "num_input_tokens_seen": 80390176, "step": 66110 }, { "epoch": 8.284049617842376, "grad_norm": 0.09841529279947281, "learning_rate": 7.282443190374107e-06, "loss": 0.4597, "num_input_tokens_seen": 80396064, "step": 66115 }, { "epoch": 8.284676105751158, "grad_norm": 0.14221662282943726, "learning_rate": 7.2819567494436086e-06, "loss": 0.4684, "num_input_tokens_seen": 80402272, "step": 66120 }, { "epoch": 8.285302593659942, "grad_norm": 0.10177233070135117, "learning_rate": 7.281470281230399e-06, "loss": 0.4601, "num_input_tokens_seen": 80408480, "step": 66125 }, { "epoch": 8.285929081568726, "grad_norm": 0.11887680739164352, "learning_rate": 7.280983785740292e-06, "loss": 0.4705, "num_input_tokens_seen": 80414496, "step": 66130 }, { "epoch": 8.286555569477509, "grad_norm": 0.11603101342916489, "learning_rate": 7.280497262979108e-06, "loss": 0.465, "num_input_tokens_seen": 80420704, "step": 66135 }, { "epoch": 8.287182057386293, "grad_norm": 0.11138701438903809, "learning_rate": 7.280010712952662e-06, "loss": 0.4585, "num_input_tokens_seen": 80427136, "step": 66140 }, { "epoch": 8.287808545295075, "grad_norm": 0.13696657121181488, "learning_rate": 7.279524135666771e-06, "loss": 0.4569, "num_input_tokens_seen": 80433120, "step": 66145 }, { "epoch": 8.288435033203859, "grad_norm": 0.09545964002609253, "learning_rate": 7.279037531127252e-06, "loss": 0.4596, "num_input_tokens_seen": 80439168, "step": 66150 }, { "epoch": 8.289061521112643, "grad_norm": 0.09773200750350952, "learning_rate": 7.278550899339924e-06, "loss": 0.4679, "num_input_tokens_seen": 80445600, "step": 66155 }, { "epoch": 8.289688009021425, "grad_norm": 0.11775216460227966, "learning_rate": 7.278064240310602e-06, "loss": 0.4624, "num_input_tokens_seen": 80451680, "step": 66160 }, { "epoch": 8.29031449693021, "grad_norm": 0.06867586076259613, "learning_rate": 7.277577554045108e-06, "loss": 0.458, "num_input_tokens_seen": 80457888, "step": 66165 }, { "epoch": 8.290940984838993, "grad_norm": 0.14226707816123962, "learning_rate": 7.27709084054926e-06, "loss": 0.4592, "num_input_tokens_seen": 80463904, "step": 66170 }, { "epoch": 8.291567472747776, "grad_norm": 0.10450301319360733, "learning_rate": 7.276604099828876e-06, "loss": 0.4569, "num_input_tokens_seen": 80470080, "step": 66175 }, { "epoch": 8.29219396065656, "grad_norm": 0.15713050961494446, "learning_rate": 7.276117331889776e-06, "loss": 0.4652, "num_input_tokens_seen": 80476128, "step": 66180 }, { "epoch": 8.292820448565342, "grad_norm": 0.12918280065059662, "learning_rate": 7.27563053673778e-06, "loss": 0.4608, "num_input_tokens_seen": 80482208, "step": 66185 }, { "epoch": 8.293446936474126, "grad_norm": 0.15149369835853577, "learning_rate": 7.2751437143787076e-06, "loss": 0.4659, "num_input_tokens_seen": 80488512, "step": 66190 }, { "epoch": 8.29407342438291, "grad_norm": 0.10276172310113907, "learning_rate": 7.274656864818379e-06, "loss": 0.458, "num_input_tokens_seen": 80494240, "step": 66195 }, { "epoch": 8.294699912291692, "grad_norm": 0.13286684453487396, "learning_rate": 7.274169988062616e-06, "loss": 0.458, "num_input_tokens_seen": 80500288, "step": 66200 }, { "epoch": 8.295326400200477, "grad_norm": 0.14871186017990112, "learning_rate": 7.273683084117238e-06, "loss": 0.4682, "num_input_tokens_seen": 80506880, "step": 66205 }, { "epoch": 8.295952888109259, "grad_norm": 0.120051808655262, "learning_rate": 7.273196152988065e-06, "loss": 0.4615, "num_input_tokens_seen": 80512736, "step": 66210 }, { "epoch": 8.296579376018043, "grad_norm": 0.1325867921113968, "learning_rate": 7.272709194680925e-06, "loss": 0.4647, "num_input_tokens_seen": 80518816, "step": 66215 }, { "epoch": 8.297205863926827, "grad_norm": 0.12287994474172592, "learning_rate": 7.272222209201634e-06, "loss": 0.4572, "num_input_tokens_seen": 80524896, "step": 66220 }, { "epoch": 8.29783235183561, "grad_norm": 0.09003235399723053, "learning_rate": 7.271735196556016e-06, "loss": 0.461, "num_input_tokens_seen": 80531232, "step": 66225 }, { "epoch": 8.298458839744393, "grad_norm": 0.10334853827953339, "learning_rate": 7.271248156749892e-06, "loss": 0.4658, "num_input_tokens_seen": 80537600, "step": 66230 }, { "epoch": 8.299085327653176, "grad_norm": 0.09062392264604568, "learning_rate": 7.270761089789089e-06, "loss": 0.4588, "num_input_tokens_seen": 80543360, "step": 66235 }, { "epoch": 8.29971181556196, "grad_norm": 0.12781840562820435, "learning_rate": 7.270273995679427e-06, "loss": 0.4532, "num_input_tokens_seen": 80549728, "step": 66240 }, { "epoch": 8.300338303470744, "grad_norm": 0.08641058951616287, "learning_rate": 7.269786874426731e-06, "loss": 0.4562, "num_input_tokens_seen": 80555872, "step": 66245 }, { "epoch": 8.300964791379526, "grad_norm": 0.15545275807380676, "learning_rate": 7.269299726036825e-06, "loss": 0.457, "num_input_tokens_seen": 80561856, "step": 66250 }, { "epoch": 8.30159127928831, "grad_norm": 0.07805317640304565, "learning_rate": 7.268812550515533e-06, "loss": 0.4591, "num_input_tokens_seen": 80567840, "step": 66255 }, { "epoch": 8.302217767197092, "grad_norm": 0.10200562328100204, "learning_rate": 7.268325347868679e-06, "loss": 0.4593, "num_input_tokens_seen": 80574048, "step": 66260 }, { "epoch": 8.302844255105876, "grad_norm": 0.11623021960258484, "learning_rate": 7.267838118102087e-06, "loss": 0.464, "num_input_tokens_seen": 80580064, "step": 66265 }, { "epoch": 8.30347074301466, "grad_norm": 0.1827860325574875, "learning_rate": 7.267350861221585e-06, "loss": 0.4611, "num_input_tokens_seen": 80585984, "step": 66270 }, { "epoch": 8.304097230923443, "grad_norm": 0.12760744988918304, "learning_rate": 7.266863577232997e-06, "loss": 0.4623, "num_input_tokens_seen": 80592224, "step": 66275 }, { "epoch": 8.304723718832227, "grad_norm": 0.14630761742591858, "learning_rate": 7.26637626614215e-06, "loss": 0.4634, "num_input_tokens_seen": 80598336, "step": 66280 }, { "epoch": 8.30535020674101, "grad_norm": 0.18548928201198578, "learning_rate": 7.265888927954867e-06, "loss": 0.4628, "num_input_tokens_seen": 80604480, "step": 66285 }, { "epoch": 8.305976694649793, "grad_norm": 0.18593323230743408, "learning_rate": 7.2654015626769795e-06, "loss": 0.4547, "num_input_tokens_seen": 80610528, "step": 66290 }, { "epoch": 8.306603182558577, "grad_norm": 0.15766538679599762, "learning_rate": 7.26491417031431e-06, "loss": 0.4569, "num_input_tokens_seen": 80616800, "step": 66295 }, { "epoch": 8.30722967046736, "grad_norm": 0.16004140675067902, "learning_rate": 7.264426750872687e-06, "loss": 0.4605, "num_input_tokens_seen": 80623200, "step": 66300 }, { "epoch": 8.307856158376143, "grad_norm": 0.1845574975013733, "learning_rate": 7.263939304357939e-06, "loss": 0.4547, "num_input_tokens_seen": 80629472, "step": 66305 }, { "epoch": 8.308482646284927, "grad_norm": 0.24978160858154297, "learning_rate": 7.263451830775894e-06, "loss": 0.46, "num_input_tokens_seen": 80635584, "step": 66310 }, { "epoch": 8.30910913419371, "grad_norm": 0.2187570184469223, "learning_rate": 7.2629643301323774e-06, "loss": 0.4672, "num_input_tokens_seen": 80641888, "step": 66315 }, { "epoch": 8.309735622102494, "grad_norm": 0.16880810260772705, "learning_rate": 7.262476802433222e-06, "loss": 0.465, "num_input_tokens_seen": 80648064, "step": 66320 }, { "epoch": 8.310362110011276, "grad_norm": 0.10968514531850815, "learning_rate": 7.2619892476842524e-06, "loss": 0.4614, "num_input_tokens_seen": 80654176, "step": 66325 }, { "epoch": 8.31098859792006, "grad_norm": 0.12851980328559875, "learning_rate": 7.261501665891301e-06, "loss": 0.4658, "num_input_tokens_seen": 80660192, "step": 66330 }, { "epoch": 8.311615085828844, "grad_norm": 0.12181082367897034, "learning_rate": 7.261014057060195e-06, "loss": 0.462, "num_input_tokens_seen": 80665536, "step": 66335 }, { "epoch": 8.312241573737627, "grad_norm": 0.1991294026374817, "learning_rate": 7.260526421196765e-06, "loss": 0.4665, "num_input_tokens_seen": 80671520, "step": 66340 }, { "epoch": 8.31286806164641, "grad_norm": 0.18155810236930847, "learning_rate": 7.260038758306841e-06, "loss": 0.46, "num_input_tokens_seen": 80677696, "step": 66345 }, { "epoch": 8.313494549555193, "grad_norm": 0.12073665112257004, "learning_rate": 7.259551068396255e-06, "loss": 0.4645, "num_input_tokens_seen": 80684064, "step": 66350 }, { "epoch": 8.314121037463977, "grad_norm": 0.08719650655984879, "learning_rate": 7.2590633514708345e-06, "loss": 0.4577, "num_input_tokens_seen": 80690432, "step": 66355 }, { "epoch": 8.314747525372761, "grad_norm": 0.194954052567482, "learning_rate": 7.258575607536411e-06, "loss": 0.4751, "num_input_tokens_seen": 80696672, "step": 66360 }, { "epoch": 8.315374013281543, "grad_norm": 0.11755681782960892, "learning_rate": 7.25808783659882e-06, "loss": 0.4603, "num_input_tokens_seen": 80702656, "step": 66365 }, { "epoch": 8.316000501190327, "grad_norm": 0.12346410006284714, "learning_rate": 7.257600038663888e-06, "loss": 0.4694, "num_input_tokens_seen": 80709088, "step": 66370 }, { "epoch": 8.31662698909911, "grad_norm": 0.11597078293561935, "learning_rate": 7.257112213737452e-06, "loss": 0.4568, "num_input_tokens_seen": 80715104, "step": 66375 }, { "epoch": 8.317253477007894, "grad_norm": 0.1447957456111908, "learning_rate": 7.2566243618253394e-06, "loss": 0.4591, "num_input_tokens_seen": 80721248, "step": 66380 }, { "epoch": 8.317879964916678, "grad_norm": 0.11726493388414383, "learning_rate": 7.256136482933387e-06, "loss": 0.469, "num_input_tokens_seen": 80727296, "step": 66385 }, { "epoch": 8.31850645282546, "grad_norm": 0.16411496698856354, "learning_rate": 7.255648577067427e-06, "loss": 0.4616, "num_input_tokens_seen": 80733216, "step": 66390 }, { "epoch": 8.319132940734244, "grad_norm": 0.11182048171758652, "learning_rate": 7.25516064423329e-06, "loss": 0.4693, "num_input_tokens_seen": 80739712, "step": 66395 }, { "epoch": 8.319759428643028, "grad_norm": 0.12145553529262543, "learning_rate": 7.254672684436812e-06, "loss": 0.4628, "num_input_tokens_seen": 80745952, "step": 66400 }, { "epoch": 8.32038591655181, "grad_norm": 0.12484815716743469, "learning_rate": 7.254184697683826e-06, "loss": 0.4681, "num_input_tokens_seen": 80752032, "step": 66405 }, { "epoch": 8.321012404460594, "grad_norm": 0.08968257158994675, "learning_rate": 7.253696683980167e-06, "loss": 0.4678, "num_input_tokens_seen": 80757760, "step": 66410 }, { "epoch": 8.321638892369377, "grad_norm": 0.14740090072155, "learning_rate": 7.25320864333167e-06, "loss": 0.4651, "num_input_tokens_seen": 80763872, "step": 66415 }, { "epoch": 8.32226538027816, "grad_norm": 0.0980769544839859, "learning_rate": 7.252720575744167e-06, "loss": 0.462, "num_input_tokens_seen": 80769376, "step": 66420 }, { "epoch": 8.322891868186945, "grad_norm": 0.09675628691911697, "learning_rate": 7.2522324812234975e-06, "loss": 0.4613, "num_input_tokens_seen": 80775744, "step": 66425 }, { "epoch": 8.323518356095727, "grad_norm": 0.13241149485111237, "learning_rate": 7.251744359775494e-06, "loss": 0.4622, "num_input_tokens_seen": 80781856, "step": 66430 }, { "epoch": 8.324144844004511, "grad_norm": 0.10225166380405426, "learning_rate": 7.251256211405995e-06, "loss": 0.4601, "num_input_tokens_seen": 80788032, "step": 66435 }, { "epoch": 8.324771331913293, "grad_norm": 0.18882079422473907, "learning_rate": 7.250768036120836e-06, "loss": 0.4601, "num_input_tokens_seen": 80794144, "step": 66440 }, { "epoch": 8.325397819822077, "grad_norm": 0.1273198425769806, "learning_rate": 7.250279833925851e-06, "loss": 0.46, "num_input_tokens_seen": 80800288, "step": 66445 }, { "epoch": 8.326024307730862, "grad_norm": 0.1132858395576477, "learning_rate": 7.249791604826879e-06, "loss": 0.4659, "num_input_tokens_seen": 80805984, "step": 66450 }, { "epoch": 8.326650795639644, "grad_norm": 0.10723146796226501, "learning_rate": 7.249303348829757e-06, "loss": 0.4602, "num_input_tokens_seen": 80812064, "step": 66455 }, { "epoch": 8.327277283548428, "grad_norm": 0.10203050076961517, "learning_rate": 7.2488150659403225e-06, "loss": 0.4621, "num_input_tokens_seen": 80817920, "step": 66460 }, { "epoch": 8.32790377145721, "grad_norm": 0.06520422548055649, "learning_rate": 7.248326756164412e-06, "loss": 0.4614, "num_input_tokens_seen": 80824000, "step": 66465 }, { "epoch": 8.328530259365994, "grad_norm": 0.16261956095695496, "learning_rate": 7.247838419507866e-06, "loss": 0.4659, "num_input_tokens_seen": 80830400, "step": 66470 }, { "epoch": 8.329156747274778, "grad_norm": 0.07159506529569626, "learning_rate": 7.247350055976521e-06, "loss": 0.4583, "num_input_tokens_seen": 80836544, "step": 66475 }, { "epoch": 8.32978323518356, "grad_norm": 0.10025988519191742, "learning_rate": 7.246861665576218e-06, "loss": 0.4632, "num_input_tokens_seen": 80842752, "step": 66480 }, { "epoch": 8.330409723092345, "grad_norm": 0.17002913355827332, "learning_rate": 7.2463732483127925e-06, "loss": 0.4479, "num_input_tokens_seen": 80848768, "step": 66485 }, { "epoch": 8.331036211001127, "grad_norm": 0.19040384888648987, "learning_rate": 7.245884804192088e-06, "loss": 0.462, "num_input_tokens_seen": 80854944, "step": 66490 }, { "epoch": 8.331662698909911, "grad_norm": 0.10305694490671158, "learning_rate": 7.245396333219942e-06, "loss": 0.468, "num_input_tokens_seen": 80860800, "step": 66495 }, { "epoch": 8.332289186818695, "grad_norm": 0.1159021407365799, "learning_rate": 7.244907835402195e-06, "loss": 0.4546, "num_input_tokens_seen": 80867168, "step": 66500 }, { "epoch": 8.332915674727477, "grad_norm": 0.13580870628356934, "learning_rate": 7.244419310744688e-06, "loss": 0.4746, "num_input_tokens_seen": 80873504, "step": 66505 }, { "epoch": 8.333542162636261, "grad_norm": 0.13357357680797577, "learning_rate": 7.24393075925326e-06, "loss": 0.4627, "num_input_tokens_seen": 80879936, "step": 66510 }, { "epoch": 8.334168650545044, "grad_norm": 0.10318956524133682, "learning_rate": 7.243442180933755e-06, "loss": 0.4695, "num_input_tokens_seen": 80886240, "step": 66515 }, { "epoch": 8.334795138453828, "grad_norm": 0.12635235488414764, "learning_rate": 7.24295357579201e-06, "loss": 0.4633, "num_input_tokens_seen": 80892192, "step": 66520 }, { "epoch": 8.335421626362612, "grad_norm": 0.10532166063785553, "learning_rate": 7.242464943833871e-06, "loss": 0.4672, "num_input_tokens_seen": 80898496, "step": 66525 }, { "epoch": 8.336048114271394, "grad_norm": 0.08786293119192123, "learning_rate": 7.241976285065177e-06, "loss": 0.4589, "num_input_tokens_seen": 80904800, "step": 66530 }, { "epoch": 8.336674602180178, "grad_norm": 0.11645717918872833, "learning_rate": 7.241487599491773e-06, "loss": 0.4665, "num_input_tokens_seen": 80910784, "step": 66535 }, { "epoch": 8.337301090088962, "grad_norm": 0.09856154769659042, "learning_rate": 7.240998887119499e-06, "loss": 0.4628, "num_input_tokens_seen": 80917440, "step": 66540 }, { "epoch": 8.337927577997744, "grad_norm": 0.14453671872615814, "learning_rate": 7.2405101479542004e-06, "loss": 0.4591, "num_input_tokens_seen": 80923456, "step": 66545 }, { "epoch": 8.338554065906528, "grad_norm": 0.09165696054697037, "learning_rate": 7.240021382001719e-06, "loss": 0.4586, "num_input_tokens_seen": 80929760, "step": 66550 }, { "epoch": 8.33918055381531, "grad_norm": 0.09314116090536118, "learning_rate": 7.239532589267897e-06, "loss": 0.4554, "num_input_tokens_seen": 80935808, "step": 66555 }, { "epoch": 8.339807041724095, "grad_norm": 0.10044338554143906, "learning_rate": 7.239043769758581e-06, "loss": 0.4604, "num_input_tokens_seen": 80942080, "step": 66560 }, { "epoch": 8.340433529632879, "grad_norm": 0.0822930559515953, "learning_rate": 7.238554923479615e-06, "loss": 0.4597, "num_input_tokens_seen": 80948544, "step": 66565 }, { "epoch": 8.341060017541661, "grad_norm": 0.09014680981636047, "learning_rate": 7.2380660504368425e-06, "loss": 0.4655, "num_input_tokens_seen": 80954656, "step": 66570 }, { "epoch": 8.341686505450445, "grad_norm": 0.1164514347910881, "learning_rate": 7.2375771506361085e-06, "loss": 0.4599, "num_input_tokens_seen": 80960544, "step": 66575 }, { "epoch": 8.342312993359227, "grad_norm": 0.10296443849802017, "learning_rate": 7.2370882240832584e-06, "loss": 0.4616, "num_input_tokens_seen": 80966432, "step": 66580 }, { "epoch": 8.342939481268012, "grad_norm": 0.07388822734355927, "learning_rate": 7.236599270784137e-06, "loss": 0.4665, "num_input_tokens_seen": 80971936, "step": 66585 }, { "epoch": 8.343565969176796, "grad_norm": 0.1641390472650528, "learning_rate": 7.236110290744593e-06, "loss": 0.4743, "num_input_tokens_seen": 80978112, "step": 66590 }, { "epoch": 8.344192457085578, "grad_norm": 0.10033384710550308, "learning_rate": 7.2356212839704686e-06, "loss": 0.4629, "num_input_tokens_seen": 80984032, "step": 66595 }, { "epoch": 8.344818944994362, "grad_norm": 0.10158014297485352, "learning_rate": 7.235132250467613e-06, "loss": 0.469, "num_input_tokens_seen": 80990048, "step": 66600 }, { "epoch": 8.345445432903144, "grad_norm": 0.08923883736133575, "learning_rate": 7.234643190241872e-06, "loss": 0.4635, "num_input_tokens_seen": 80996352, "step": 66605 }, { "epoch": 8.346071920811928, "grad_norm": 0.10415472090244293, "learning_rate": 7.234154103299092e-06, "loss": 0.4598, "num_input_tokens_seen": 81002688, "step": 66610 }, { "epoch": 8.346698408720712, "grad_norm": 0.09605659544467926, "learning_rate": 7.233664989645121e-06, "loss": 0.4589, "num_input_tokens_seen": 81008832, "step": 66615 }, { "epoch": 8.347324896629495, "grad_norm": 0.08315441012382507, "learning_rate": 7.23317584928581e-06, "loss": 0.4556, "num_input_tokens_seen": 81014880, "step": 66620 }, { "epoch": 8.347951384538279, "grad_norm": 0.09026381373405457, "learning_rate": 7.232686682227001e-06, "loss": 0.4605, "num_input_tokens_seen": 81021408, "step": 66625 }, { "epoch": 8.348577872447061, "grad_norm": 0.09431669861078262, "learning_rate": 7.232197488474546e-06, "loss": 0.4553, "num_input_tokens_seen": 81027584, "step": 66630 }, { "epoch": 8.349204360355845, "grad_norm": 0.10612308979034424, "learning_rate": 7.231708268034294e-06, "loss": 0.4675, "num_input_tokens_seen": 81033088, "step": 66635 }, { "epoch": 8.349830848264629, "grad_norm": 0.09533312171697617, "learning_rate": 7.231219020912095e-06, "loss": 0.4688, "num_input_tokens_seen": 81039040, "step": 66640 }, { "epoch": 8.350457336173411, "grad_norm": 0.10498245805501938, "learning_rate": 7.230729747113795e-06, "loss": 0.4543, "num_input_tokens_seen": 81045216, "step": 66645 }, { "epoch": 8.351083824082195, "grad_norm": 0.11524777114391327, "learning_rate": 7.230240446645246e-06, "loss": 0.4627, "num_input_tokens_seen": 81051616, "step": 66650 }, { "epoch": 8.351710311990978, "grad_norm": 0.14250530302524567, "learning_rate": 7.229751119512296e-06, "loss": 0.467, "num_input_tokens_seen": 81057792, "step": 66655 }, { "epoch": 8.352336799899762, "grad_norm": 0.12310905009508133, "learning_rate": 7.2292617657207984e-06, "loss": 0.4608, "num_input_tokens_seen": 81063872, "step": 66660 }, { "epoch": 8.352963287808546, "grad_norm": 0.15127289295196533, "learning_rate": 7.228772385276601e-06, "loss": 0.4618, "num_input_tokens_seen": 81070016, "step": 66665 }, { "epoch": 8.353589775717328, "grad_norm": 0.08791320770978928, "learning_rate": 7.228282978185556e-06, "loss": 0.4676, "num_input_tokens_seen": 81076096, "step": 66670 }, { "epoch": 8.354216263626112, "grad_norm": 0.09249144047498703, "learning_rate": 7.227793544453514e-06, "loss": 0.4664, "num_input_tokens_seen": 81082272, "step": 66675 }, { "epoch": 8.354842751534896, "grad_norm": 0.10267098248004913, "learning_rate": 7.227304084086328e-06, "loss": 0.4607, "num_input_tokens_seen": 81088320, "step": 66680 }, { "epoch": 8.355469239443678, "grad_norm": 0.08374538272619247, "learning_rate": 7.2268145970898485e-06, "loss": 0.4594, "num_input_tokens_seen": 81094272, "step": 66685 }, { "epoch": 8.356095727352463, "grad_norm": 0.10338341444730759, "learning_rate": 7.22632508346993e-06, "loss": 0.4647, "num_input_tokens_seen": 81100160, "step": 66690 }, { "epoch": 8.356722215261245, "grad_norm": 0.10876889526844025, "learning_rate": 7.225835543232423e-06, "loss": 0.468, "num_input_tokens_seen": 81106432, "step": 66695 }, { "epoch": 8.357348703170029, "grad_norm": 0.09287997335195541, "learning_rate": 7.225345976383179e-06, "loss": 0.4653, "num_input_tokens_seen": 81112960, "step": 66700 }, { "epoch": 8.357975191078813, "grad_norm": 0.06694474071264267, "learning_rate": 7.224856382928054e-06, "loss": 0.4614, "num_input_tokens_seen": 81119008, "step": 66705 }, { "epoch": 8.358601678987595, "grad_norm": 0.10502733290195465, "learning_rate": 7.2243667628729e-06, "loss": 0.4668, "num_input_tokens_seen": 81125216, "step": 66710 }, { "epoch": 8.35922816689638, "grad_norm": 0.15214718878269196, "learning_rate": 7.223877116223571e-06, "loss": 0.469, "num_input_tokens_seen": 81131168, "step": 66715 }, { "epoch": 8.359854654805162, "grad_norm": 0.0866679921746254, "learning_rate": 7.223387442985922e-06, "loss": 0.4626, "num_input_tokens_seen": 81137312, "step": 66720 }, { "epoch": 8.360481142713946, "grad_norm": 0.15826934576034546, "learning_rate": 7.222897743165806e-06, "loss": 0.4609, "num_input_tokens_seen": 81143104, "step": 66725 }, { "epoch": 8.36110763062273, "grad_norm": 0.08909670263528824, "learning_rate": 7.222408016769079e-06, "loss": 0.457, "num_input_tokens_seen": 81149280, "step": 66730 }, { "epoch": 8.361734118531512, "grad_norm": 0.12512697279453278, "learning_rate": 7.221918263801595e-06, "loss": 0.4662, "num_input_tokens_seen": 81155488, "step": 66735 }, { "epoch": 8.362360606440296, "grad_norm": 0.09183186292648315, "learning_rate": 7.22142848426921e-06, "loss": 0.4598, "num_input_tokens_seen": 81161696, "step": 66740 }, { "epoch": 8.362987094349078, "grad_norm": 0.11205796897411346, "learning_rate": 7.220938678177781e-06, "loss": 0.4606, "num_input_tokens_seen": 81167648, "step": 66745 }, { "epoch": 8.363613582257862, "grad_norm": 0.12243075668811798, "learning_rate": 7.220448845533164e-06, "loss": 0.4623, "num_input_tokens_seen": 81173600, "step": 66750 }, { "epoch": 8.364240070166646, "grad_norm": 0.10838959366083145, "learning_rate": 7.2199589863412115e-06, "loss": 0.4573, "num_input_tokens_seen": 81179968, "step": 66755 }, { "epoch": 8.364866558075429, "grad_norm": 0.18118610978126526, "learning_rate": 7.2194691006077845e-06, "loss": 0.4641, "num_input_tokens_seen": 81186080, "step": 66760 }, { "epoch": 8.365493045984213, "grad_norm": 0.05660219490528107, "learning_rate": 7.218979188338737e-06, "loss": 0.4594, "num_input_tokens_seen": 81192288, "step": 66765 }, { "epoch": 8.366119533892995, "grad_norm": 0.13584508001804352, "learning_rate": 7.218489249539929e-06, "loss": 0.4703, "num_input_tokens_seen": 81198272, "step": 66770 }, { "epoch": 8.366746021801779, "grad_norm": 0.12782564759254456, "learning_rate": 7.217999284217216e-06, "loss": 0.4628, "num_input_tokens_seen": 81204448, "step": 66775 }, { "epoch": 8.367372509710563, "grad_norm": 0.09776457399129868, "learning_rate": 7.217509292376456e-06, "loss": 0.4598, "num_input_tokens_seen": 81210656, "step": 66780 }, { "epoch": 8.367998997619345, "grad_norm": 0.0857824757695198, "learning_rate": 7.21701927402351e-06, "loss": 0.4553, "num_input_tokens_seen": 81216704, "step": 66785 }, { "epoch": 8.36862548552813, "grad_norm": 0.08172713220119476, "learning_rate": 7.216529229164234e-06, "loss": 0.4491, "num_input_tokens_seen": 81222784, "step": 66790 }, { "epoch": 8.369251973436914, "grad_norm": 0.0853796899318695, "learning_rate": 7.216039157804486e-06, "loss": 0.4599, "num_input_tokens_seen": 81229088, "step": 66795 }, { "epoch": 8.369878461345696, "grad_norm": 0.12620776891708374, "learning_rate": 7.215549059950129e-06, "loss": 0.4655, "num_input_tokens_seen": 81234400, "step": 66800 }, { "epoch": 8.37050494925448, "grad_norm": 0.11464985460042953, "learning_rate": 7.21505893560702e-06, "loss": 0.4608, "num_input_tokens_seen": 81240608, "step": 66805 }, { "epoch": 8.371131437163262, "grad_norm": 0.11445394158363342, "learning_rate": 7.2145687847810175e-06, "loss": 0.4692, "num_input_tokens_seen": 81246400, "step": 66810 }, { "epoch": 8.371757925072046, "grad_norm": 0.06671261042356491, "learning_rate": 7.214078607477984e-06, "loss": 0.465, "num_input_tokens_seen": 81252832, "step": 66815 }, { "epoch": 8.37238441298083, "grad_norm": 0.09059597551822662, "learning_rate": 7.21358840370378e-06, "loss": 0.4548, "num_input_tokens_seen": 81259136, "step": 66820 }, { "epoch": 8.373010900889613, "grad_norm": 0.12531951069831848, "learning_rate": 7.213098173464266e-06, "loss": 0.4645, "num_input_tokens_seen": 81264960, "step": 66825 }, { "epoch": 8.373637388798397, "grad_norm": 0.11264203488826752, "learning_rate": 7.2126079167653e-06, "loss": 0.4598, "num_input_tokens_seen": 81270464, "step": 66830 }, { "epoch": 8.374263876707179, "grad_norm": 0.09451479464769363, "learning_rate": 7.212117633612748e-06, "loss": 0.4688, "num_input_tokens_seen": 81276576, "step": 66835 }, { "epoch": 8.374890364615963, "grad_norm": 0.09908898174762726, "learning_rate": 7.211627324012469e-06, "loss": 0.4545, "num_input_tokens_seen": 81282752, "step": 66840 }, { "epoch": 8.375516852524747, "grad_norm": 0.06245328485965729, "learning_rate": 7.211136987970328e-06, "loss": 0.4654, "num_input_tokens_seen": 81288640, "step": 66845 }, { "epoch": 8.37614334043353, "grad_norm": 0.08714821189641953, "learning_rate": 7.210646625492184e-06, "loss": 0.463, "num_input_tokens_seen": 81294624, "step": 66850 }, { "epoch": 8.376769828342313, "grad_norm": 0.11038629710674286, "learning_rate": 7.210156236583898e-06, "loss": 0.4652, "num_input_tokens_seen": 81301024, "step": 66855 }, { "epoch": 8.377396316251096, "grad_norm": 0.0978998988866806, "learning_rate": 7.2096658212513394e-06, "loss": 0.4734, "num_input_tokens_seen": 81307424, "step": 66860 }, { "epoch": 8.37802280415988, "grad_norm": 0.08833152800798416, "learning_rate": 7.209175379500368e-06, "loss": 0.4614, "num_input_tokens_seen": 81313504, "step": 66865 }, { "epoch": 8.378649292068664, "grad_norm": 0.05920638516545296, "learning_rate": 7.208684911336847e-06, "loss": 0.4615, "num_input_tokens_seen": 81319840, "step": 66870 }, { "epoch": 8.379275779977446, "grad_norm": 0.07621528208255768, "learning_rate": 7.208194416766641e-06, "loss": 0.469, "num_input_tokens_seen": 81326144, "step": 66875 }, { "epoch": 8.37990226788623, "grad_norm": 0.08153180032968521, "learning_rate": 7.2077038957956145e-06, "loss": 0.4668, "num_input_tokens_seen": 81332608, "step": 66880 }, { "epoch": 8.380528755795012, "grad_norm": 0.13482293486595154, "learning_rate": 7.20721334842963e-06, "loss": 0.4715, "num_input_tokens_seen": 81338752, "step": 66885 }, { "epoch": 8.381155243703796, "grad_norm": 0.10810708999633789, "learning_rate": 7.206722774674556e-06, "loss": 0.4682, "num_input_tokens_seen": 81345120, "step": 66890 }, { "epoch": 8.38178173161258, "grad_norm": 0.10411644726991653, "learning_rate": 7.206232174536256e-06, "loss": 0.4599, "num_input_tokens_seen": 81350976, "step": 66895 }, { "epoch": 8.382408219521363, "grad_norm": 0.05422807112336159, "learning_rate": 7.205741548020595e-06, "loss": 0.4602, "num_input_tokens_seen": 81356832, "step": 66900 }, { "epoch": 8.383034707430147, "grad_norm": 0.08056032657623291, "learning_rate": 7.20525089513344e-06, "loss": 0.458, "num_input_tokens_seen": 81363136, "step": 66905 }, { "epoch": 8.38366119533893, "grad_norm": 0.08324424177408218, "learning_rate": 7.204760215880655e-06, "loss": 0.4618, "num_input_tokens_seen": 81369184, "step": 66910 }, { "epoch": 8.384287683247713, "grad_norm": 0.09618319571018219, "learning_rate": 7.204269510268109e-06, "loss": 0.4614, "num_input_tokens_seen": 81375424, "step": 66915 }, { "epoch": 8.384914171156497, "grad_norm": 0.06944570690393448, "learning_rate": 7.203778778301667e-06, "loss": 0.4573, "num_input_tokens_seen": 81381472, "step": 66920 }, { "epoch": 8.38554065906528, "grad_norm": 0.09368893504142761, "learning_rate": 7.203288019987198e-06, "loss": 0.4666, "num_input_tokens_seen": 81387776, "step": 66925 }, { "epoch": 8.386167146974064, "grad_norm": 0.07920508831739426, "learning_rate": 7.2027972353305674e-06, "loss": 0.4616, "num_input_tokens_seen": 81393952, "step": 66930 }, { "epoch": 8.386793634882848, "grad_norm": 0.10413473844528198, "learning_rate": 7.202306424337643e-06, "loss": 0.4644, "num_input_tokens_seen": 81400256, "step": 66935 }, { "epoch": 8.38742012279163, "grad_norm": 0.10949430614709854, "learning_rate": 7.201815587014295e-06, "loss": 0.4524, "num_input_tokens_seen": 81406144, "step": 66940 }, { "epoch": 8.388046610700414, "grad_norm": 0.12779074907302856, "learning_rate": 7.201324723366391e-06, "loss": 0.4755, "num_input_tokens_seen": 81412192, "step": 66945 }, { "epoch": 8.388673098609196, "grad_norm": 0.09411323070526123, "learning_rate": 7.200833833399799e-06, "loss": 0.4671, "num_input_tokens_seen": 81418240, "step": 66950 }, { "epoch": 8.38929958651798, "grad_norm": 0.14824406802654266, "learning_rate": 7.200342917120386e-06, "loss": 0.4608, "num_input_tokens_seen": 81424512, "step": 66955 }, { "epoch": 8.389926074426764, "grad_norm": 0.10056441277265549, "learning_rate": 7.199851974534027e-06, "loss": 0.4612, "num_input_tokens_seen": 81430944, "step": 66960 }, { "epoch": 8.390552562335547, "grad_norm": 0.1127234622836113, "learning_rate": 7.199361005646586e-06, "loss": 0.4642, "num_input_tokens_seen": 81437056, "step": 66965 }, { "epoch": 8.39117905024433, "grad_norm": 0.17552946507930756, "learning_rate": 7.198870010463934e-06, "loss": 0.454, "num_input_tokens_seen": 81443008, "step": 66970 }, { "epoch": 8.391805538153113, "grad_norm": 0.08341492712497711, "learning_rate": 7.198378988991944e-06, "loss": 0.4587, "num_input_tokens_seen": 81448992, "step": 66975 }, { "epoch": 8.392432026061897, "grad_norm": 0.12175153940916061, "learning_rate": 7.197887941236484e-06, "loss": 0.4675, "num_input_tokens_seen": 81455296, "step": 66980 }, { "epoch": 8.393058513970681, "grad_norm": 0.09420105814933777, "learning_rate": 7.1973968672034265e-06, "loss": 0.4658, "num_input_tokens_seen": 81461536, "step": 66985 }, { "epoch": 8.393685001879463, "grad_norm": 0.1148238480091095, "learning_rate": 7.1969057668986406e-06, "loss": 0.4597, "num_input_tokens_seen": 81467392, "step": 66990 }, { "epoch": 8.394311489788247, "grad_norm": 0.10075994580984116, "learning_rate": 7.196414640328001e-06, "loss": 0.4621, "num_input_tokens_seen": 81473280, "step": 66995 }, { "epoch": 8.39493797769703, "grad_norm": 0.10561099648475647, "learning_rate": 7.195923487497377e-06, "loss": 0.4634, "num_input_tokens_seen": 81479232, "step": 67000 }, { "epoch": 8.395564465605814, "grad_norm": 0.08679181337356567, "learning_rate": 7.195432308412642e-06, "loss": 0.4674, "num_input_tokens_seen": 81485280, "step": 67005 }, { "epoch": 8.396190953514598, "grad_norm": 0.17125049233436584, "learning_rate": 7.194941103079667e-06, "loss": 0.4476, "num_input_tokens_seen": 81490816, "step": 67010 }, { "epoch": 8.39681744142338, "grad_norm": 0.1234930232167244, "learning_rate": 7.194449871504326e-06, "loss": 0.4655, "num_input_tokens_seen": 81496544, "step": 67015 }, { "epoch": 8.397443929332164, "grad_norm": 0.1003398448228836, "learning_rate": 7.193958613692492e-06, "loss": 0.469, "num_input_tokens_seen": 81502784, "step": 67020 }, { "epoch": 8.398070417240948, "grad_norm": 0.15164339542388916, "learning_rate": 7.193467329650039e-06, "loss": 0.461, "num_input_tokens_seen": 81509056, "step": 67025 }, { "epoch": 8.39869690514973, "grad_norm": 0.08526555448770523, "learning_rate": 7.192976019382839e-06, "loss": 0.4636, "num_input_tokens_seen": 81515552, "step": 67030 }, { "epoch": 8.399323393058514, "grad_norm": 0.10397428274154663, "learning_rate": 7.192484682896767e-06, "loss": 0.4564, "num_input_tokens_seen": 81521696, "step": 67035 }, { "epoch": 8.399949880967297, "grad_norm": 0.1308206170797348, "learning_rate": 7.191993320197697e-06, "loss": 0.4606, "num_input_tokens_seen": 81527744, "step": 67040 }, { "epoch": 8.40057636887608, "grad_norm": 0.1009141132235527, "learning_rate": 7.191501931291502e-06, "loss": 0.4708, "num_input_tokens_seen": 81534176, "step": 67045 }, { "epoch": 8.401202856784865, "grad_norm": 0.08522260934114456, "learning_rate": 7.191010516184063e-06, "loss": 0.462, "num_input_tokens_seen": 81540192, "step": 67050 }, { "epoch": 8.401829344693647, "grad_norm": 0.08000843971967697, "learning_rate": 7.190519074881248e-06, "loss": 0.4695, "num_input_tokens_seen": 81546080, "step": 67055 }, { "epoch": 8.402455832602431, "grad_norm": 0.09242991358041763, "learning_rate": 7.190027607388938e-06, "loss": 0.4634, "num_input_tokens_seen": 81552192, "step": 67060 }, { "epoch": 8.403082320511214, "grad_norm": 0.13887393474578857, "learning_rate": 7.189536113713006e-06, "loss": 0.4644, "num_input_tokens_seen": 81558304, "step": 67065 }, { "epoch": 8.403708808419998, "grad_norm": 0.0851951539516449, "learning_rate": 7.189044593859329e-06, "loss": 0.4685, "num_input_tokens_seen": 81564800, "step": 67070 }, { "epoch": 8.404335296328782, "grad_norm": 0.08345925807952881, "learning_rate": 7.1885530478337825e-06, "loss": 0.4582, "num_input_tokens_seen": 81571072, "step": 67075 }, { "epoch": 8.404961784237564, "grad_norm": 0.08624386787414551, "learning_rate": 7.188061475642245e-06, "loss": 0.4622, "num_input_tokens_seen": 81576960, "step": 67080 }, { "epoch": 8.405588272146348, "grad_norm": 0.08458058536052704, "learning_rate": 7.187569877290592e-06, "loss": 0.4624, "num_input_tokens_seen": 81583200, "step": 67085 }, { "epoch": 8.40621476005513, "grad_norm": 0.12712077796459198, "learning_rate": 7.187078252784703e-06, "loss": 0.4634, "num_input_tokens_seen": 81589568, "step": 67090 }, { "epoch": 8.406841247963914, "grad_norm": 0.08660976588726044, "learning_rate": 7.186586602130454e-06, "loss": 0.4593, "num_input_tokens_seen": 81595488, "step": 67095 }, { "epoch": 8.407467735872698, "grad_norm": 0.10984072089195251, "learning_rate": 7.186094925333724e-06, "loss": 0.4646, "num_input_tokens_seen": 81601888, "step": 67100 }, { "epoch": 8.40809422378148, "grad_norm": 0.11834915727376938, "learning_rate": 7.185603222400391e-06, "loss": 0.4645, "num_input_tokens_seen": 81607968, "step": 67105 }, { "epoch": 8.408720711690265, "grad_norm": 0.06997376680374146, "learning_rate": 7.185111493336334e-06, "loss": 0.4689, "num_input_tokens_seen": 81614336, "step": 67110 }, { "epoch": 8.409347199599047, "grad_norm": 0.1214435026049614, "learning_rate": 7.184619738147434e-06, "loss": 0.4624, "num_input_tokens_seen": 81620256, "step": 67115 }, { "epoch": 8.409973687507831, "grad_norm": 0.11428538709878922, "learning_rate": 7.184127956839566e-06, "loss": 0.4615, "num_input_tokens_seen": 81626432, "step": 67120 }, { "epoch": 8.410600175416615, "grad_norm": 0.12794990837574005, "learning_rate": 7.183636149418612e-06, "loss": 0.4585, "num_input_tokens_seen": 81632384, "step": 67125 }, { "epoch": 8.411226663325397, "grad_norm": 0.12733681499958038, "learning_rate": 7.183144315890453e-06, "loss": 0.4671, "num_input_tokens_seen": 81638496, "step": 67130 }, { "epoch": 8.411853151234181, "grad_norm": 0.1011338084936142, "learning_rate": 7.182652456260968e-06, "loss": 0.4657, "num_input_tokens_seen": 81644736, "step": 67135 }, { "epoch": 8.412479639142964, "grad_norm": 0.09869686514139175, "learning_rate": 7.182160570536037e-06, "loss": 0.4687, "num_input_tokens_seen": 81650784, "step": 67140 }, { "epoch": 8.413106127051748, "grad_norm": 0.09622304886579514, "learning_rate": 7.181668658721542e-06, "loss": 0.4599, "num_input_tokens_seen": 81656640, "step": 67145 }, { "epoch": 8.413732614960532, "grad_norm": 0.08335942029953003, "learning_rate": 7.181176720823364e-06, "loss": 0.4632, "num_input_tokens_seen": 81662880, "step": 67150 }, { "epoch": 8.414359102869314, "grad_norm": 0.07466228306293488, "learning_rate": 7.180684756847384e-06, "loss": 0.4601, "num_input_tokens_seen": 81669248, "step": 67155 }, { "epoch": 8.414985590778098, "grad_norm": 0.07800901681184769, "learning_rate": 7.180192766799486e-06, "loss": 0.4607, "num_input_tokens_seen": 81674528, "step": 67160 }, { "epoch": 8.41561207868688, "grad_norm": 0.08482913672924042, "learning_rate": 7.179700750685549e-06, "loss": 0.462, "num_input_tokens_seen": 81680704, "step": 67165 }, { "epoch": 8.416238566595664, "grad_norm": 0.0690602958202362, "learning_rate": 7.179208708511457e-06, "loss": 0.4611, "num_input_tokens_seen": 81686752, "step": 67170 }, { "epoch": 8.416865054504449, "grad_norm": 0.10211365669965744, "learning_rate": 7.1787166402830934e-06, "loss": 0.4662, "num_input_tokens_seen": 81693152, "step": 67175 }, { "epoch": 8.41749154241323, "grad_norm": 0.08725348114967346, "learning_rate": 7.17822454600634e-06, "loss": 0.4693, "num_input_tokens_seen": 81699232, "step": 67180 }, { "epoch": 8.418118030322015, "grad_norm": 0.08307904750108719, "learning_rate": 7.177732425687082e-06, "loss": 0.4624, "num_input_tokens_seen": 81705504, "step": 67185 }, { "epoch": 8.418744518230799, "grad_norm": 0.09295720607042313, "learning_rate": 7.1772402793312e-06, "loss": 0.4664, "num_input_tokens_seen": 81711360, "step": 67190 }, { "epoch": 8.419371006139581, "grad_norm": 0.07102705538272858, "learning_rate": 7.17674810694458e-06, "loss": 0.4611, "num_input_tokens_seen": 81717504, "step": 67195 }, { "epoch": 8.419997494048365, "grad_norm": 0.07842658460140228, "learning_rate": 7.176255908533107e-06, "loss": 0.458, "num_input_tokens_seen": 81723744, "step": 67200 }, { "epoch": 8.420623981957148, "grad_norm": 0.057189036160707474, "learning_rate": 7.1757636841026654e-06, "loss": 0.4585, "num_input_tokens_seen": 81729856, "step": 67205 }, { "epoch": 8.421250469865932, "grad_norm": 0.07126648724079132, "learning_rate": 7.17527143365914e-06, "loss": 0.4605, "num_input_tokens_seen": 81735712, "step": 67210 }, { "epoch": 8.421876957774716, "grad_norm": 0.11341539025306702, "learning_rate": 7.174779157208416e-06, "loss": 0.4571, "num_input_tokens_seen": 81742208, "step": 67215 }, { "epoch": 8.422503445683498, "grad_norm": 0.08292154967784882, "learning_rate": 7.174286854756377e-06, "loss": 0.4639, "num_input_tokens_seen": 81748384, "step": 67220 }, { "epoch": 8.423129933592282, "grad_norm": 0.09310203045606613, "learning_rate": 7.1737945263089126e-06, "loss": 0.464, "num_input_tokens_seen": 81754720, "step": 67225 }, { "epoch": 8.423756421501064, "grad_norm": 0.09834392368793488, "learning_rate": 7.173302171871904e-06, "loss": 0.4581, "num_input_tokens_seen": 81760768, "step": 67230 }, { "epoch": 8.424382909409848, "grad_norm": 0.11424175649881363, "learning_rate": 7.172809791451243e-06, "loss": 0.4595, "num_input_tokens_seen": 81766912, "step": 67235 }, { "epoch": 8.425009397318632, "grad_norm": 0.07363095134496689, "learning_rate": 7.172317385052814e-06, "loss": 0.4612, "num_input_tokens_seen": 81773024, "step": 67240 }, { "epoch": 8.425635885227415, "grad_norm": 0.09858710318803787, "learning_rate": 7.171824952682503e-06, "loss": 0.4637, "num_input_tokens_seen": 81779264, "step": 67245 }, { "epoch": 8.426262373136199, "grad_norm": 0.08985787630081177, "learning_rate": 7.171332494346199e-06, "loss": 0.4638, "num_input_tokens_seen": 81784992, "step": 67250 }, { "epoch": 8.426888861044981, "grad_norm": 0.08616302907466888, "learning_rate": 7.170840010049791e-06, "loss": 0.4698, "num_input_tokens_seen": 81790848, "step": 67255 }, { "epoch": 8.427515348953765, "grad_norm": 0.1075507402420044, "learning_rate": 7.170347499799165e-06, "loss": 0.4668, "num_input_tokens_seen": 81796992, "step": 67260 }, { "epoch": 8.42814183686255, "grad_norm": 0.08593408018350601, "learning_rate": 7.16985496360021e-06, "loss": 0.4566, "num_input_tokens_seen": 81803200, "step": 67265 }, { "epoch": 8.428768324771331, "grad_norm": 0.08365883678197861, "learning_rate": 7.169362401458815e-06, "loss": 0.4611, "num_input_tokens_seen": 81809024, "step": 67270 }, { "epoch": 8.429394812680115, "grad_norm": 0.0501343309879303, "learning_rate": 7.168869813380867e-06, "loss": 0.4584, "num_input_tokens_seen": 81815296, "step": 67275 }, { "epoch": 8.430021300588898, "grad_norm": 0.07761228829622269, "learning_rate": 7.168377199372258e-06, "loss": 0.4622, "num_input_tokens_seen": 81820736, "step": 67280 }, { "epoch": 8.430647788497682, "grad_norm": 0.1015390157699585, "learning_rate": 7.167884559438877e-06, "loss": 0.46, "num_input_tokens_seen": 81826944, "step": 67285 }, { "epoch": 8.431274276406466, "grad_norm": 0.11585898697376251, "learning_rate": 7.167391893586611e-06, "loss": 0.459, "num_input_tokens_seen": 81832576, "step": 67290 }, { "epoch": 8.431900764315248, "grad_norm": 0.08247948437929153, "learning_rate": 7.166899201821354e-06, "loss": 0.4617, "num_input_tokens_seen": 81838432, "step": 67295 }, { "epoch": 8.432527252224032, "grad_norm": 0.11629997193813324, "learning_rate": 7.1664064841489956e-06, "loss": 0.4629, "num_input_tokens_seen": 81844576, "step": 67300 }, { "epoch": 8.433153740132816, "grad_norm": 0.08388680219650269, "learning_rate": 7.1659137405754265e-06, "loss": 0.4629, "num_input_tokens_seen": 81850336, "step": 67305 }, { "epoch": 8.433780228041599, "grad_norm": 0.07377711683511734, "learning_rate": 7.165420971106537e-06, "loss": 0.4577, "num_input_tokens_seen": 81856544, "step": 67310 }, { "epoch": 8.434406715950383, "grad_norm": 0.10456663370132446, "learning_rate": 7.164928175748221e-06, "loss": 0.4598, "num_input_tokens_seen": 81862848, "step": 67315 }, { "epoch": 8.435033203859165, "grad_norm": 0.0601414330303669, "learning_rate": 7.164435354506366e-06, "loss": 0.462, "num_input_tokens_seen": 81869088, "step": 67320 }, { "epoch": 8.435659691767949, "grad_norm": 0.07881034165620804, "learning_rate": 7.163942507386867e-06, "loss": 0.4579, "num_input_tokens_seen": 81875168, "step": 67325 }, { "epoch": 8.436286179676733, "grad_norm": 0.08841264992952347, "learning_rate": 7.163449634395619e-06, "loss": 0.4685, "num_input_tokens_seen": 81881216, "step": 67330 }, { "epoch": 8.436912667585515, "grad_norm": 0.12675641477108002, "learning_rate": 7.162956735538509e-06, "loss": 0.4642, "num_input_tokens_seen": 81887392, "step": 67335 }, { "epoch": 8.4375391554943, "grad_norm": 0.09268417209386826, "learning_rate": 7.162463810821432e-06, "loss": 0.4629, "num_input_tokens_seen": 81893344, "step": 67340 }, { "epoch": 8.438165643403082, "grad_norm": 0.09302692115306854, "learning_rate": 7.161970860250282e-06, "loss": 0.4614, "num_input_tokens_seen": 81899456, "step": 67345 }, { "epoch": 8.438792131311866, "grad_norm": 0.09064481407403946, "learning_rate": 7.161477883830954e-06, "loss": 0.466, "num_input_tokens_seen": 81905536, "step": 67350 }, { "epoch": 8.43941861922065, "grad_norm": 0.1326838582754135, "learning_rate": 7.160984881569339e-06, "loss": 0.4559, "num_input_tokens_seen": 81911552, "step": 67355 }, { "epoch": 8.440045107129432, "grad_norm": 0.08666130155324936, "learning_rate": 7.160491853471335e-06, "loss": 0.4673, "num_input_tokens_seen": 81917664, "step": 67360 }, { "epoch": 8.440671595038216, "grad_norm": 0.12154082208871841, "learning_rate": 7.159998799542834e-06, "loss": 0.4676, "num_input_tokens_seen": 81923840, "step": 67365 }, { "epoch": 8.441298082946998, "grad_norm": 0.08454637974500656, "learning_rate": 7.159505719789731e-06, "loss": 0.4676, "num_input_tokens_seen": 81929536, "step": 67370 }, { "epoch": 8.441924570855782, "grad_norm": 0.07644161581993103, "learning_rate": 7.159012614217922e-06, "loss": 0.4647, "num_input_tokens_seen": 81935200, "step": 67375 }, { "epoch": 8.442551058764566, "grad_norm": 0.0756971463561058, "learning_rate": 7.158519482833301e-06, "loss": 0.463, "num_input_tokens_seen": 81940480, "step": 67380 }, { "epoch": 8.443177546673349, "grad_norm": 0.09354748576879501, "learning_rate": 7.158026325641766e-06, "loss": 0.4607, "num_input_tokens_seen": 81946432, "step": 67385 }, { "epoch": 8.443804034582133, "grad_norm": 0.12520968914031982, "learning_rate": 7.157533142649211e-06, "loss": 0.466, "num_input_tokens_seen": 81952832, "step": 67390 }, { "epoch": 8.444430522490915, "grad_norm": 0.0927472859621048, "learning_rate": 7.1570399338615335e-06, "loss": 0.4579, "num_input_tokens_seen": 81958880, "step": 67395 }, { "epoch": 8.4450570103997, "grad_norm": 0.0679747685790062, "learning_rate": 7.15654669928463e-06, "loss": 0.4594, "num_input_tokens_seen": 81964928, "step": 67400 }, { "epoch": 8.445683498308483, "grad_norm": 0.11951641738414764, "learning_rate": 7.156053438924397e-06, "loss": 0.4655, "num_input_tokens_seen": 81971008, "step": 67405 }, { "epoch": 8.446309986217265, "grad_norm": 0.10080704092979431, "learning_rate": 7.1555601527867335e-06, "loss": 0.4585, "num_input_tokens_seen": 81977056, "step": 67410 }, { "epoch": 8.44693647412605, "grad_norm": 0.099580317735672, "learning_rate": 7.155066840877536e-06, "loss": 0.4614, "num_input_tokens_seen": 81982976, "step": 67415 }, { "epoch": 8.447562962034834, "grad_norm": 0.0843120589852333, "learning_rate": 7.154573503202702e-06, "loss": 0.4637, "num_input_tokens_seen": 81989152, "step": 67420 }, { "epoch": 8.448189449943616, "grad_norm": 0.07224760204553604, "learning_rate": 7.15408013976813e-06, "loss": 0.4667, "num_input_tokens_seen": 81995136, "step": 67425 }, { "epoch": 8.4488159378524, "grad_norm": 0.09132439643144608, "learning_rate": 7.1535867505797195e-06, "loss": 0.4572, "num_input_tokens_seen": 82001376, "step": 67430 }, { "epoch": 8.449442425761182, "grad_norm": 0.08948010206222534, "learning_rate": 7.153093335643369e-06, "loss": 0.4579, "num_input_tokens_seen": 82007264, "step": 67435 }, { "epoch": 8.450068913669966, "grad_norm": 0.08376729488372803, "learning_rate": 7.152599894964978e-06, "loss": 0.4621, "num_input_tokens_seen": 82013408, "step": 67440 }, { "epoch": 8.45069540157875, "grad_norm": 0.09291055053472519, "learning_rate": 7.152106428550445e-06, "loss": 0.4581, "num_input_tokens_seen": 82019680, "step": 67445 }, { "epoch": 8.451321889487533, "grad_norm": 0.10736536979675293, "learning_rate": 7.151612936405668e-06, "loss": 0.4592, "num_input_tokens_seen": 82025504, "step": 67450 }, { "epoch": 8.451948377396317, "grad_norm": 0.12201925367116928, "learning_rate": 7.151119418536552e-06, "loss": 0.4562, "num_input_tokens_seen": 82031520, "step": 67455 }, { "epoch": 8.452574865305099, "grad_norm": 0.1030678004026413, "learning_rate": 7.150625874948993e-06, "loss": 0.4629, "num_input_tokens_seen": 82038016, "step": 67460 }, { "epoch": 8.453201353213883, "grad_norm": 0.10382366925477982, "learning_rate": 7.150132305648896e-06, "loss": 0.4588, "num_input_tokens_seen": 82044000, "step": 67465 }, { "epoch": 8.453827841122667, "grad_norm": 0.08553896099328995, "learning_rate": 7.149638710642157e-06, "loss": 0.4651, "num_input_tokens_seen": 82050048, "step": 67470 }, { "epoch": 8.45445432903145, "grad_norm": 0.05644006282091141, "learning_rate": 7.14914508993468e-06, "loss": 0.4685, "num_input_tokens_seen": 82056064, "step": 67475 }, { "epoch": 8.455080816940233, "grad_norm": 0.07176610827445984, "learning_rate": 7.148651443532369e-06, "loss": 0.4599, "num_input_tokens_seen": 82062368, "step": 67480 }, { "epoch": 8.455707304849016, "grad_norm": 0.08686753362417221, "learning_rate": 7.148157771441122e-06, "loss": 0.463, "num_input_tokens_seen": 82068800, "step": 67485 }, { "epoch": 8.4563337927578, "grad_norm": 0.102528877556324, "learning_rate": 7.1476640736668426e-06, "loss": 0.459, "num_input_tokens_seen": 82075104, "step": 67490 }, { "epoch": 8.456960280666584, "grad_norm": 0.1176200658082962, "learning_rate": 7.147170350215434e-06, "loss": 0.4529, "num_input_tokens_seen": 82081248, "step": 67495 }, { "epoch": 8.457586768575366, "grad_norm": 0.0891813263297081, "learning_rate": 7.146676601092798e-06, "loss": 0.464, "num_input_tokens_seen": 82087392, "step": 67500 }, { "epoch": 8.45821325648415, "grad_norm": 0.08281541615724564, "learning_rate": 7.146182826304838e-06, "loss": 0.455, "num_input_tokens_seen": 82092992, "step": 67505 }, { "epoch": 8.458839744392932, "grad_norm": 0.11826461553573608, "learning_rate": 7.145689025857459e-06, "loss": 0.4642, "num_input_tokens_seen": 82099200, "step": 67510 }, { "epoch": 8.459466232301716, "grad_norm": 0.11937768012285233, "learning_rate": 7.145195199756564e-06, "loss": 0.4597, "num_input_tokens_seen": 82104832, "step": 67515 }, { "epoch": 8.4600927202105, "grad_norm": 0.14771896600723267, "learning_rate": 7.144701348008056e-06, "loss": 0.4595, "num_input_tokens_seen": 82111104, "step": 67520 }, { "epoch": 8.460719208119283, "grad_norm": 0.10855408757925034, "learning_rate": 7.144207470617841e-06, "loss": 0.4679, "num_input_tokens_seen": 82117248, "step": 67525 }, { "epoch": 8.461345696028067, "grad_norm": 0.08532354980707169, "learning_rate": 7.143713567591822e-06, "loss": 0.4564, "num_input_tokens_seen": 82123392, "step": 67530 }, { "epoch": 8.461972183936851, "grad_norm": 0.10407841950654984, "learning_rate": 7.143219638935905e-06, "loss": 0.4554, "num_input_tokens_seen": 82129376, "step": 67535 }, { "epoch": 8.462598671845633, "grad_norm": 0.0889134556055069, "learning_rate": 7.1427256846559955e-06, "loss": 0.4611, "num_input_tokens_seen": 82135776, "step": 67540 }, { "epoch": 8.463225159754417, "grad_norm": 0.1425667405128479, "learning_rate": 7.142231704757998e-06, "loss": 0.4641, "num_input_tokens_seen": 82142048, "step": 67545 }, { "epoch": 8.4638516476632, "grad_norm": 0.09513425081968307, "learning_rate": 7.141737699247822e-06, "loss": 0.4589, "num_input_tokens_seen": 82148128, "step": 67550 }, { "epoch": 8.464478135571984, "grad_norm": 0.0957103744149208, "learning_rate": 7.141243668131368e-06, "loss": 0.475, "num_input_tokens_seen": 82154208, "step": 67555 }, { "epoch": 8.465104623480768, "grad_norm": 0.0984351634979248, "learning_rate": 7.140749611414547e-06, "loss": 0.459, "num_input_tokens_seen": 82160448, "step": 67560 }, { "epoch": 8.46573111138955, "grad_norm": 0.08215805143117905, "learning_rate": 7.140255529103265e-06, "loss": 0.4607, "num_input_tokens_seen": 82166240, "step": 67565 }, { "epoch": 8.466357599298334, "grad_norm": 0.0978851243853569, "learning_rate": 7.139761421203428e-06, "loss": 0.4637, "num_input_tokens_seen": 82172224, "step": 67570 }, { "epoch": 8.466984087207116, "grad_norm": 0.08704928308725357, "learning_rate": 7.139267287720945e-06, "loss": 0.4583, "num_input_tokens_seen": 82177696, "step": 67575 }, { "epoch": 8.4676105751159, "grad_norm": 0.12435011565685272, "learning_rate": 7.138773128661723e-06, "loss": 0.4529, "num_input_tokens_seen": 82183744, "step": 67580 }, { "epoch": 8.468237063024684, "grad_norm": 0.11201804876327515, "learning_rate": 7.13827894403167e-06, "loss": 0.4597, "num_input_tokens_seen": 82188992, "step": 67585 }, { "epoch": 8.468863550933467, "grad_norm": 0.09058409184217453, "learning_rate": 7.137784733836695e-06, "loss": 0.4669, "num_input_tokens_seen": 82194944, "step": 67590 }, { "epoch": 8.46949003884225, "grad_norm": 0.07897953689098358, "learning_rate": 7.1372904980827054e-06, "loss": 0.4645, "num_input_tokens_seen": 82200608, "step": 67595 }, { "epoch": 8.470116526751033, "grad_norm": 0.0862770825624466, "learning_rate": 7.13679623677561e-06, "loss": 0.4546, "num_input_tokens_seen": 82207008, "step": 67600 }, { "epoch": 8.470743014659817, "grad_norm": 0.08191308379173279, "learning_rate": 7.13630194992132e-06, "loss": 0.4616, "num_input_tokens_seen": 82213312, "step": 67605 }, { "epoch": 8.471369502568601, "grad_norm": 0.10055086761713028, "learning_rate": 7.135807637525743e-06, "loss": 0.4575, "num_input_tokens_seen": 82219200, "step": 67610 }, { "epoch": 8.471995990477383, "grad_norm": 0.11022323369979858, "learning_rate": 7.135313299594792e-06, "loss": 0.4584, "num_input_tokens_seen": 82225472, "step": 67615 }, { "epoch": 8.472622478386167, "grad_norm": 0.12387626618146896, "learning_rate": 7.134818936134373e-06, "loss": 0.4745, "num_input_tokens_seen": 82231552, "step": 67620 }, { "epoch": 8.47324896629495, "grad_norm": 0.09867487102746964, "learning_rate": 7.134324547150401e-06, "loss": 0.4603, "num_input_tokens_seen": 82237760, "step": 67625 }, { "epoch": 8.473875454203734, "grad_norm": 0.1291283667087555, "learning_rate": 7.133830132648784e-06, "loss": 0.4671, "num_input_tokens_seen": 82243840, "step": 67630 }, { "epoch": 8.474501942112518, "grad_norm": 0.12310045957565308, "learning_rate": 7.133335692635433e-06, "loss": 0.4624, "num_input_tokens_seen": 82250336, "step": 67635 }, { "epoch": 8.4751284300213, "grad_norm": 0.08765054494142532, "learning_rate": 7.132841227116261e-06, "loss": 0.4594, "num_input_tokens_seen": 82256672, "step": 67640 }, { "epoch": 8.475754917930084, "grad_norm": 0.15689311921596527, "learning_rate": 7.132346736097179e-06, "loss": 0.462, "num_input_tokens_seen": 82262912, "step": 67645 }, { "epoch": 8.476381405838866, "grad_norm": 0.09665471315383911, "learning_rate": 7.1318522195840985e-06, "loss": 0.4662, "num_input_tokens_seen": 82269248, "step": 67650 }, { "epoch": 8.47700789374765, "grad_norm": 0.10449544340372086, "learning_rate": 7.131357677582932e-06, "loss": 0.4645, "num_input_tokens_seen": 82275712, "step": 67655 }, { "epoch": 8.477634381656435, "grad_norm": 0.09650086611509323, "learning_rate": 7.130863110099593e-06, "loss": 0.4598, "num_input_tokens_seen": 82281504, "step": 67660 }, { "epoch": 8.478260869565217, "grad_norm": 0.18795756995677948, "learning_rate": 7.130368517139993e-06, "loss": 0.4541, "num_input_tokens_seen": 82286848, "step": 67665 }, { "epoch": 8.478887357474001, "grad_norm": 0.1089334487915039, "learning_rate": 7.129873898710047e-06, "loss": 0.4602, "num_input_tokens_seen": 82293152, "step": 67670 }, { "epoch": 8.479513845382785, "grad_norm": 0.10480523854494095, "learning_rate": 7.129379254815668e-06, "loss": 0.4627, "num_input_tokens_seen": 82299360, "step": 67675 }, { "epoch": 8.480140333291567, "grad_norm": 0.1105160266160965, "learning_rate": 7.12888458546277e-06, "loss": 0.4565, "num_input_tokens_seen": 82305344, "step": 67680 }, { "epoch": 8.480766821200351, "grad_norm": 0.10638916492462158, "learning_rate": 7.128389890657267e-06, "loss": 0.4562, "num_input_tokens_seen": 82312096, "step": 67685 }, { "epoch": 8.481393309109134, "grad_norm": 0.10328016430139542, "learning_rate": 7.127895170405072e-06, "loss": 0.4642, "num_input_tokens_seen": 82318304, "step": 67690 }, { "epoch": 8.482019797017918, "grad_norm": 0.10185100138187408, "learning_rate": 7.1274004247121036e-06, "loss": 0.4662, "num_input_tokens_seen": 82324544, "step": 67695 }, { "epoch": 8.482646284926702, "grad_norm": 0.13681679964065552, "learning_rate": 7.126905653584273e-06, "loss": 0.4572, "num_input_tokens_seen": 82330400, "step": 67700 }, { "epoch": 8.483272772835484, "grad_norm": 0.09442707151174545, "learning_rate": 7.1264108570274955e-06, "loss": 0.4764, "num_input_tokens_seen": 82336416, "step": 67705 }, { "epoch": 8.483899260744268, "grad_norm": 0.11296319216489792, "learning_rate": 7.125916035047691e-06, "loss": 0.4705, "num_input_tokens_seen": 82342688, "step": 67710 }, { "epoch": 8.48452574865305, "grad_norm": 0.12466821074485779, "learning_rate": 7.1254211876507715e-06, "loss": 0.4658, "num_input_tokens_seen": 82348736, "step": 67715 }, { "epoch": 8.485152236561834, "grad_norm": 0.08400867134332657, "learning_rate": 7.124926314842656e-06, "loss": 0.4537, "num_input_tokens_seen": 82354496, "step": 67720 }, { "epoch": 8.485778724470618, "grad_norm": 0.11278540641069412, "learning_rate": 7.124431416629259e-06, "loss": 0.4622, "num_input_tokens_seen": 82360608, "step": 67725 }, { "epoch": 8.4864052123794, "grad_norm": 0.12307760119438171, "learning_rate": 7.1239364930164975e-06, "loss": 0.4665, "num_input_tokens_seen": 82366624, "step": 67730 }, { "epoch": 8.487031700288185, "grad_norm": 0.14679883420467377, "learning_rate": 7.123441544010291e-06, "loss": 0.4661, "num_input_tokens_seen": 82372224, "step": 67735 }, { "epoch": 8.487658188196967, "grad_norm": 0.09702246636152267, "learning_rate": 7.122946569616556e-06, "loss": 0.4573, "num_input_tokens_seen": 82377792, "step": 67740 }, { "epoch": 8.488284676105751, "grad_norm": 0.09813293814659119, "learning_rate": 7.122451569841209e-06, "loss": 0.4611, "num_input_tokens_seen": 82384032, "step": 67745 }, { "epoch": 8.488911164014535, "grad_norm": 0.08428015559911728, "learning_rate": 7.121956544690168e-06, "loss": 0.4623, "num_input_tokens_seen": 82390048, "step": 67750 }, { "epoch": 8.489537651923317, "grad_norm": 0.14520886540412903, "learning_rate": 7.121461494169354e-06, "loss": 0.4648, "num_input_tokens_seen": 82396288, "step": 67755 }, { "epoch": 8.490164139832101, "grad_norm": 0.08251431584358215, "learning_rate": 7.120966418284683e-06, "loss": 0.4677, "num_input_tokens_seen": 82402304, "step": 67760 }, { "epoch": 8.490790627740884, "grad_norm": 0.07553095370531082, "learning_rate": 7.120471317042076e-06, "loss": 0.4629, "num_input_tokens_seen": 82407968, "step": 67765 }, { "epoch": 8.491417115649668, "grad_norm": 0.13404302299022675, "learning_rate": 7.119976190447452e-06, "loss": 0.4552, "num_input_tokens_seen": 82414016, "step": 67770 }, { "epoch": 8.492043603558452, "grad_norm": 0.08666934818029404, "learning_rate": 7.119481038506729e-06, "loss": 0.4597, "num_input_tokens_seen": 82419840, "step": 67775 }, { "epoch": 8.492670091467234, "grad_norm": 0.0887928232550621, "learning_rate": 7.1189858612258285e-06, "loss": 0.4623, "num_input_tokens_seen": 82425952, "step": 67780 }, { "epoch": 8.493296579376018, "grad_norm": 0.09351565688848495, "learning_rate": 7.11849065861067e-06, "loss": 0.4553, "num_input_tokens_seen": 82432224, "step": 67785 }, { "epoch": 8.4939230672848, "grad_norm": 0.09124834090471268, "learning_rate": 7.117995430667177e-06, "loss": 0.4644, "num_input_tokens_seen": 82438112, "step": 67790 }, { "epoch": 8.494549555193585, "grad_norm": 0.09323663264513016, "learning_rate": 7.117500177401267e-06, "loss": 0.4629, "num_input_tokens_seen": 82443936, "step": 67795 }, { "epoch": 8.495176043102369, "grad_norm": 0.10433703660964966, "learning_rate": 7.117004898818861e-06, "loss": 0.4684, "num_input_tokens_seen": 82450016, "step": 67800 }, { "epoch": 8.495802531011151, "grad_norm": 0.1080782487988472, "learning_rate": 7.116509594925883e-06, "loss": 0.4543, "num_input_tokens_seen": 82456256, "step": 67805 }, { "epoch": 8.496429018919935, "grad_norm": 0.10077496618032455, "learning_rate": 7.1160142657282515e-06, "loss": 0.4705, "num_input_tokens_seen": 82462400, "step": 67810 }, { "epoch": 8.497055506828719, "grad_norm": 0.08068247139453888, "learning_rate": 7.115518911231891e-06, "loss": 0.4735, "num_input_tokens_seen": 82468768, "step": 67815 }, { "epoch": 8.497681994737501, "grad_norm": 0.12038075178861618, "learning_rate": 7.115023531442724e-06, "loss": 0.4616, "num_input_tokens_seen": 82474688, "step": 67820 }, { "epoch": 8.498308482646285, "grad_norm": 0.09679275006055832, "learning_rate": 7.114528126366672e-06, "loss": 0.4581, "num_input_tokens_seen": 82481408, "step": 67825 }, { "epoch": 8.498934970555068, "grad_norm": 0.1290799379348755, "learning_rate": 7.114032696009659e-06, "loss": 0.466, "num_input_tokens_seen": 82487584, "step": 67830 }, { "epoch": 8.499561458463852, "grad_norm": 0.09449374675750732, "learning_rate": 7.1135372403776074e-06, "loss": 0.4644, "num_input_tokens_seen": 82493824, "step": 67835 }, { "epoch": 8.500187946372636, "grad_norm": 0.09328732639551163, "learning_rate": 7.11304175947644e-06, "loss": 0.4638, "num_input_tokens_seen": 82500000, "step": 67840 }, { "epoch": 8.500814434281418, "grad_norm": 0.09949672967195511, "learning_rate": 7.112546253312083e-06, "loss": 0.4633, "num_input_tokens_seen": 82506208, "step": 67845 }, { "epoch": 8.501440922190202, "grad_norm": 0.10714348405599594, "learning_rate": 7.112050721890459e-06, "loss": 0.4631, "num_input_tokens_seen": 82512320, "step": 67850 }, { "epoch": 8.502067410098984, "grad_norm": 0.09062610566616058, "learning_rate": 7.111555165217493e-06, "loss": 0.4565, "num_input_tokens_seen": 82518496, "step": 67855 }, { "epoch": 8.502693898007768, "grad_norm": 0.09406747668981552, "learning_rate": 7.11105958329911e-06, "loss": 0.4583, "num_input_tokens_seen": 82524192, "step": 67860 }, { "epoch": 8.503320385916552, "grad_norm": 0.09703553467988968, "learning_rate": 7.110563976141233e-06, "loss": 0.4556, "num_input_tokens_seen": 82529952, "step": 67865 }, { "epoch": 8.503946873825335, "grad_norm": 0.10181444138288498, "learning_rate": 7.110068343749792e-06, "loss": 0.4663, "num_input_tokens_seen": 82535904, "step": 67870 }, { "epoch": 8.504573361734119, "grad_norm": 0.09060198813676834, "learning_rate": 7.109572686130708e-06, "loss": 0.4592, "num_input_tokens_seen": 82542176, "step": 67875 }, { "epoch": 8.505199849642901, "grad_norm": 0.09423014521598816, "learning_rate": 7.1090770032899105e-06, "loss": 0.4644, "num_input_tokens_seen": 82548096, "step": 67880 }, { "epoch": 8.505826337551685, "grad_norm": 0.12247493118047714, "learning_rate": 7.108581295233323e-06, "loss": 0.463, "num_input_tokens_seen": 82554336, "step": 67885 }, { "epoch": 8.50645282546047, "grad_norm": 0.13057149946689606, "learning_rate": 7.108085561966874e-06, "loss": 0.4569, "num_input_tokens_seen": 82560640, "step": 67890 }, { "epoch": 8.507079313369251, "grad_norm": 0.07290519773960114, "learning_rate": 7.10758980349649e-06, "loss": 0.4571, "num_input_tokens_seen": 82566720, "step": 67895 }, { "epoch": 8.507705801278036, "grad_norm": 0.14559288322925568, "learning_rate": 7.107094019828097e-06, "loss": 0.4548, "num_input_tokens_seen": 82572192, "step": 67900 }, { "epoch": 8.508332289186818, "grad_norm": 0.13336342573165894, "learning_rate": 7.106598210967623e-06, "loss": 0.4566, "num_input_tokens_seen": 82578240, "step": 67905 }, { "epoch": 8.508958777095602, "grad_norm": 0.13080061972141266, "learning_rate": 7.106102376920997e-06, "loss": 0.4713, "num_input_tokens_seen": 82584480, "step": 67910 }, { "epoch": 8.509585265004386, "grad_norm": 0.10174226760864258, "learning_rate": 7.105606517694147e-06, "loss": 0.4618, "num_input_tokens_seen": 82590272, "step": 67915 }, { "epoch": 8.510211752913168, "grad_norm": 0.12975849211215973, "learning_rate": 7.105110633292999e-06, "loss": 0.4666, "num_input_tokens_seen": 82596256, "step": 67920 }, { "epoch": 8.510838240821952, "grad_norm": 0.1216408759355545, "learning_rate": 7.104614723723484e-06, "loss": 0.4618, "num_input_tokens_seen": 82602496, "step": 67925 }, { "epoch": 8.511464728730736, "grad_norm": 0.0777965560555458, "learning_rate": 7.104118788991532e-06, "loss": 0.4608, "num_input_tokens_seen": 82608544, "step": 67930 }, { "epoch": 8.512091216639519, "grad_norm": 0.08715607970952988, "learning_rate": 7.103622829103071e-06, "loss": 0.4505, "num_input_tokens_seen": 82614752, "step": 67935 }, { "epoch": 8.512717704548303, "grad_norm": 0.07815738767385483, "learning_rate": 7.103126844064029e-06, "loss": 0.4573, "num_input_tokens_seen": 82620864, "step": 67940 }, { "epoch": 8.513344192457085, "grad_norm": 0.129709392786026, "learning_rate": 7.102630833880337e-06, "loss": 0.457, "num_input_tokens_seen": 82626848, "step": 67945 }, { "epoch": 8.513970680365869, "grad_norm": 0.11908740550279617, "learning_rate": 7.102134798557927e-06, "loss": 0.4757, "num_input_tokens_seen": 82632896, "step": 67950 }, { "epoch": 8.514597168274653, "grad_norm": 0.08265789598226547, "learning_rate": 7.101638738102729e-06, "loss": 0.4548, "num_input_tokens_seen": 82638560, "step": 67955 }, { "epoch": 8.515223656183435, "grad_norm": 0.10562067478895187, "learning_rate": 7.101142652520671e-06, "loss": 0.4684, "num_input_tokens_seen": 82644640, "step": 67960 }, { "epoch": 8.51585014409222, "grad_norm": 0.08697529882192612, "learning_rate": 7.100646541817685e-06, "loss": 0.4623, "num_input_tokens_seen": 82650496, "step": 67965 }, { "epoch": 8.516476632001002, "grad_norm": 0.10791510343551636, "learning_rate": 7.100150405999705e-06, "loss": 0.4715, "num_input_tokens_seen": 82656704, "step": 67970 }, { "epoch": 8.517103119909786, "grad_norm": 0.08248704671859741, "learning_rate": 7.09965424507266e-06, "loss": 0.463, "num_input_tokens_seen": 82662624, "step": 67975 }, { "epoch": 8.51772960781857, "grad_norm": 0.10514049232006073, "learning_rate": 7.099158059042486e-06, "loss": 0.4633, "num_input_tokens_seen": 82668512, "step": 67980 }, { "epoch": 8.518356095727352, "grad_norm": 0.10901424288749695, "learning_rate": 7.09866184791511e-06, "loss": 0.4613, "num_input_tokens_seen": 82674432, "step": 67985 }, { "epoch": 8.518982583636136, "grad_norm": 0.11324454098939896, "learning_rate": 7.098165611696468e-06, "loss": 0.4671, "num_input_tokens_seen": 82680672, "step": 67990 }, { "epoch": 8.519609071544918, "grad_norm": 0.10192157328128815, "learning_rate": 7.097669350392493e-06, "loss": 0.4566, "num_input_tokens_seen": 82686816, "step": 67995 }, { "epoch": 8.520235559453702, "grad_norm": 0.09646657109260559, "learning_rate": 7.097173064009117e-06, "loss": 0.4628, "num_input_tokens_seen": 82693024, "step": 68000 }, { "epoch": 8.520862047362487, "grad_norm": 0.09013745188713074, "learning_rate": 7.096676752552273e-06, "loss": 0.4581, "num_input_tokens_seen": 82698688, "step": 68005 }, { "epoch": 8.521488535271269, "grad_norm": 0.0865064263343811, "learning_rate": 7.096180416027895e-06, "loss": 0.461, "num_input_tokens_seen": 82705248, "step": 68010 }, { "epoch": 8.522115023180053, "grad_norm": 0.09709008038043976, "learning_rate": 7.095684054441918e-06, "loss": 0.4596, "num_input_tokens_seen": 82711104, "step": 68015 }, { "epoch": 8.522741511088835, "grad_norm": 0.11356401443481445, "learning_rate": 7.095187667800277e-06, "loss": 0.4581, "num_input_tokens_seen": 82717408, "step": 68020 }, { "epoch": 8.52336799899762, "grad_norm": 0.12078896164894104, "learning_rate": 7.094691256108907e-06, "loss": 0.4554, "num_input_tokens_seen": 82723168, "step": 68025 }, { "epoch": 8.523994486906403, "grad_norm": 0.08549508452415466, "learning_rate": 7.0941948193737395e-06, "loss": 0.4605, "num_input_tokens_seen": 82729344, "step": 68030 }, { "epoch": 8.524620974815186, "grad_norm": 0.10493677854537964, "learning_rate": 7.0936983576007135e-06, "loss": 0.4596, "num_input_tokens_seen": 82735456, "step": 68035 }, { "epoch": 8.52524746272397, "grad_norm": 0.12343288213014603, "learning_rate": 7.093201870795763e-06, "loss": 0.4639, "num_input_tokens_seen": 82741824, "step": 68040 }, { "epoch": 8.525873950632754, "grad_norm": 0.23067134618759155, "learning_rate": 7.092705358964824e-06, "loss": 0.4544, "num_input_tokens_seen": 82747584, "step": 68045 }, { "epoch": 8.526500438541536, "grad_norm": 0.10910210758447647, "learning_rate": 7.092208822113834e-06, "loss": 0.4663, "num_input_tokens_seen": 82753984, "step": 68050 }, { "epoch": 8.52712692645032, "grad_norm": 0.11835648864507675, "learning_rate": 7.091712260248727e-06, "loss": 0.4643, "num_input_tokens_seen": 82760160, "step": 68055 }, { "epoch": 8.527753414359102, "grad_norm": 0.1192030981183052, "learning_rate": 7.0912156733754435e-06, "loss": 0.465, "num_input_tokens_seen": 82766528, "step": 68060 }, { "epoch": 8.528379902267886, "grad_norm": 0.16309279203414917, "learning_rate": 7.090719061499918e-06, "loss": 0.452, "num_input_tokens_seen": 82772960, "step": 68065 }, { "epoch": 8.52900639017667, "grad_norm": 0.11138955503702164, "learning_rate": 7.090222424628086e-06, "loss": 0.4464, "num_input_tokens_seen": 82779200, "step": 68070 }, { "epoch": 8.529632878085453, "grad_norm": 0.14439047873020172, "learning_rate": 7.089725762765889e-06, "loss": 0.4621, "num_input_tokens_seen": 82785344, "step": 68075 }, { "epoch": 8.530259365994237, "grad_norm": 0.11944381147623062, "learning_rate": 7.089229075919264e-06, "loss": 0.4586, "num_input_tokens_seen": 82791520, "step": 68080 }, { "epoch": 8.530885853903019, "grad_norm": 0.13046985864639282, "learning_rate": 7.088732364094148e-06, "loss": 0.4703, "num_input_tokens_seen": 82797600, "step": 68085 }, { "epoch": 8.531512341811803, "grad_norm": 0.09194333106279373, "learning_rate": 7.088235627296482e-06, "loss": 0.4624, "num_input_tokens_seen": 82803520, "step": 68090 }, { "epoch": 8.532138829720587, "grad_norm": 0.09089218080043793, "learning_rate": 7.087738865532201e-06, "loss": 0.4565, "num_input_tokens_seen": 82809792, "step": 68095 }, { "epoch": 8.53276531762937, "grad_norm": 0.13280007243156433, "learning_rate": 7.087242078807248e-06, "loss": 0.4677, "num_input_tokens_seen": 82815808, "step": 68100 }, { "epoch": 8.533391805538153, "grad_norm": 0.11024395376443863, "learning_rate": 7.086745267127562e-06, "loss": 0.4531, "num_input_tokens_seen": 82821792, "step": 68105 }, { "epoch": 8.534018293446936, "grad_norm": 0.08649389445781708, "learning_rate": 7.086248430499081e-06, "loss": 0.4627, "num_input_tokens_seen": 82827904, "step": 68110 }, { "epoch": 8.53464478135572, "grad_norm": 0.1475590169429779, "learning_rate": 7.0857515689277455e-06, "loss": 0.4653, "num_input_tokens_seen": 82833792, "step": 68115 }, { "epoch": 8.535271269264504, "grad_norm": 0.11370078474283218, "learning_rate": 7.085254682419497e-06, "loss": 0.4611, "num_input_tokens_seen": 82839936, "step": 68120 }, { "epoch": 8.535897757173286, "grad_norm": 0.0765891745686531, "learning_rate": 7.084757770980274e-06, "loss": 0.4617, "num_input_tokens_seen": 82846016, "step": 68125 }, { "epoch": 8.53652424508207, "grad_norm": 0.08562810719013214, "learning_rate": 7.084260834616021e-06, "loss": 0.4638, "num_input_tokens_seen": 82851616, "step": 68130 }, { "epoch": 8.537150732990852, "grad_norm": 0.10558579862117767, "learning_rate": 7.083763873332676e-06, "loss": 0.4622, "num_input_tokens_seen": 82857632, "step": 68135 }, { "epoch": 8.537777220899637, "grad_norm": 0.10175464302301407, "learning_rate": 7.0832668871361845e-06, "loss": 0.4655, "num_input_tokens_seen": 82864256, "step": 68140 }, { "epoch": 8.53840370880842, "grad_norm": 0.12615247070789337, "learning_rate": 7.0827698760324835e-06, "loss": 0.4681, "num_input_tokens_seen": 82870624, "step": 68145 }, { "epoch": 8.539030196717203, "grad_norm": 0.09549432247877121, "learning_rate": 7.082272840027519e-06, "loss": 0.4532, "num_input_tokens_seen": 82876928, "step": 68150 }, { "epoch": 8.539656684625987, "grad_norm": 0.12748396396636963, "learning_rate": 7.08177577912723e-06, "loss": 0.4622, "num_input_tokens_seen": 82882880, "step": 68155 }, { "epoch": 8.540283172534771, "grad_norm": 0.10040630400180817, "learning_rate": 7.081278693337562e-06, "loss": 0.4688, "num_input_tokens_seen": 82889088, "step": 68160 }, { "epoch": 8.540909660443553, "grad_norm": 0.0997689887881279, "learning_rate": 7.080781582664459e-06, "loss": 0.4673, "num_input_tokens_seen": 82894752, "step": 68165 }, { "epoch": 8.541536148352337, "grad_norm": 0.06380311399698257, "learning_rate": 7.080284447113861e-06, "loss": 0.465, "num_input_tokens_seen": 82901024, "step": 68170 }, { "epoch": 8.54216263626112, "grad_norm": 0.1004178375005722, "learning_rate": 7.079787286691713e-06, "loss": 0.4595, "num_input_tokens_seen": 82907232, "step": 68175 }, { "epoch": 8.542789124169904, "grad_norm": 0.11259140074253082, "learning_rate": 7.0792901014039616e-06, "loss": 0.4681, "num_input_tokens_seen": 82913312, "step": 68180 }, { "epoch": 8.543415612078686, "grad_norm": 0.08783835917711258, "learning_rate": 7.078792891256547e-06, "loss": 0.4589, "num_input_tokens_seen": 82919296, "step": 68185 }, { "epoch": 8.54404209998747, "grad_norm": 0.09682957828044891, "learning_rate": 7.078295656255416e-06, "loss": 0.4602, "num_input_tokens_seen": 82925536, "step": 68190 }, { "epoch": 8.544668587896254, "grad_norm": 0.13539522886276245, "learning_rate": 7.077798396406515e-06, "loss": 0.4595, "num_input_tokens_seen": 82931584, "step": 68195 }, { "epoch": 8.545295075805036, "grad_norm": 0.09634372591972351, "learning_rate": 7.0773011117157855e-06, "loss": 0.4635, "num_input_tokens_seen": 82937440, "step": 68200 }, { "epoch": 8.54592156371382, "grad_norm": 0.13001340627670288, "learning_rate": 7.076803802189175e-06, "loss": 0.4605, "num_input_tokens_seen": 82943520, "step": 68205 }, { "epoch": 8.546548051622604, "grad_norm": 0.07805721461772919, "learning_rate": 7.0763064678326285e-06, "loss": 0.4581, "num_input_tokens_seen": 82949536, "step": 68210 }, { "epoch": 8.547174539531387, "grad_norm": 0.08769181370735168, "learning_rate": 7.075809108652094e-06, "loss": 0.4567, "num_input_tokens_seen": 82955936, "step": 68215 }, { "epoch": 8.54780102744017, "grad_norm": 0.09137525409460068, "learning_rate": 7.075311724653514e-06, "loss": 0.464, "num_input_tokens_seen": 82961952, "step": 68220 }, { "epoch": 8.548427515348953, "grad_norm": 0.09973381459712982, "learning_rate": 7.074814315842838e-06, "loss": 0.4657, "num_input_tokens_seen": 82968256, "step": 68225 }, { "epoch": 8.549054003257737, "grad_norm": 0.12710034847259521, "learning_rate": 7.074316882226013e-06, "loss": 0.4634, "num_input_tokens_seen": 82974560, "step": 68230 }, { "epoch": 8.549680491166521, "grad_norm": 0.08506827056407928, "learning_rate": 7.0738194238089865e-06, "loss": 0.4595, "num_input_tokens_seen": 82980768, "step": 68235 }, { "epoch": 8.550306979075303, "grad_norm": 0.11337422579526901, "learning_rate": 7.073321940597704e-06, "loss": 0.4651, "num_input_tokens_seen": 82986976, "step": 68240 }, { "epoch": 8.550933466984088, "grad_norm": 0.09639331698417664, "learning_rate": 7.072824432598115e-06, "loss": 0.4648, "num_input_tokens_seen": 82992832, "step": 68245 }, { "epoch": 8.55155995489287, "grad_norm": 0.09987987577915192, "learning_rate": 7.072326899816169e-06, "loss": 0.4585, "num_input_tokens_seen": 82999264, "step": 68250 }, { "epoch": 8.552186442801654, "grad_norm": 0.10673492401838303, "learning_rate": 7.07182934225781e-06, "loss": 0.4662, "num_input_tokens_seen": 83005472, "step": 68255 }, { "epoch": 8.552812930710438, "grad_norm": 0.100191630423069, "learning_rate": 7.071331759928991e-06, "loss": 0.4639, "num_input_tokens_seen": 83011616, "step": 68260 }, { "epoch": 8.55343941861922, "grad_norm": 0.12784452736377716, "learning_rate": 7.0708341528356585e-06, "loss": 0.4594, "num_input_tokens_seen": 83017824, "step": 68265 }, { "epoch": 8.554065906528004, "grad_norm": 0.11791849136352539, "learning_rate": 7.070336520983763e-06, "loss": 0.4639, "num_input_tokens_seen": 83024096, "step": 68270 }, { "epoch": 8.554692394436788, "grad_norm": 0.09412932395935059, "learning_rate": 7.069838864379252e-06, "loss": 0.4608, "num_input_tokens_seen": 83030112, "step": 68275 }, { "epoch": 8.55531888234557, "grad_norm": 0.09520484507083893, "learning_rate": 7.0693411830280785e-06, "loss": 0.4588, "num_input_tokens_seen": 83036576, "step": 68280 }, { "epoch": 8.555945370254355, "grad_norm": 0.1278294324874878, "learning_rate": 7.068843476936191e-06, "loss": 0.4633, "num_input_tokens_seen": 83042112, "step": 68285 }, { "epoch": 8.556571858163137, "grad_norm": 0.10589580237865448, "learning_rate": 7.068345746109542e-06, "loss": 0.464, "num_input_tokens_seen": 83048288, "step": 68290 }, { "epoch": 8.557198346071921, "grad_norm": 0.06353398412466049, "learning_rate": 7.067847990554078e-06, "loss": 0.4643, "num_input_tokens_seen": 83054784, "step": 68295 }, { "epoch": 8.557824833980703, "grad_norm": 0.13224898278713226, "learning_rate": 7.067350210275755e-06, "loss": 0.4618, "num_input_tokens_seen": 83061152, "step": 68300 }, { "epoch": 8.558451321889487, "grad_norm": 0.09743750095367432, "learning_rate": 7.066852405280522e-06, "loss": 0.4746, "num_input_tokens_seen": 83067200, "step": 68305 }, { "epoch": 8.559077809798271, "grad_norm": 0.11899314075708389, "learning_rate": 7.06635457557433e-06, "loss": 0.4572, "num_input_tokens_seen": 83073440, "step": 68310 }, { "epoch": 8.559704297707054, "grad_norm": 0.10585231333971024, "learning_rate": 7.065856721163131e-06, "loss": 0.469, "num_input_tokens_seen": 83079584, "step": 68315 }, { "epoch": 8.560330785615838, "grad_norm": 0.145233154296875, "learning_rate": 7.06535884205288e-06, "loss": 0.4628, "num_input_tokens_seen": 83085792, "step": 68320 }, { "epoch": 8.560957273524622, "grad_norm": 0.10734845697879791, "learning_rate": 7.064860938249525e-06, "loss": 0.4621, "num_input_tokens_seen": 83091808, "step": 68325 }, { "epoch": 8.561583761433404, "grad_norm": 0.14369608461856842, "learning_rate": 7.064363009759023e-06, "loss": 0.4696, "num_input_tokens_seen": 83097856, "step": 68330 }, { "epoch": 8.562210249342188, "grad_norm": 0.10940645635128021, "learning_rate": 7.0638650565873256e-06, "loss": 0.4662, "num_input_tokens_seen": 83104064, "step": 68335 }, { "epoch": 8.56283673725097, "grad_norm": 0.08455176651477814, "learning_rate": 7.063367078740387e-06, "loss": 0.463, "num_input_tokens_seen": 83110336, "step": 68340 }, { "epoch": 8.563463225159754, "grad_norm": 0.09684012085199356, "learning_rate": 7.062869076224158e-06, "loss": 0.4592, "num_input_tokens_seen": 83116576, "step": 68345 }, { "epoch": 8.564089713068539, "grad_norm": 0.08885461837053299, "learning_rate": 7.062371049044596e-06, "loss": 0.4706, "num_input_tokens_seen": 83122592, "step": 68350 }, { "epoch": 8.56471620097732, "grad_norm": 0.09571514278650284, "learning_rate": 7.061872997207655e-06, "loss": 0.4685, "num_input_tokens_seen": 83128672, "step": 68355 }, { "epoch": 8.565342688886105, "grad_norm": 0.09599418193101883, "learning_rate": 7.061374920719288e-06, "loss": 0.4602, "num_input_tokens_seen": 83135168, "step": 68360 }, { "epoch": 8.565969176794887, "grad_norm": 0.09156657010316849, "learning_rate": 7.060876819585451e-06, "loss": 0.4553, "num_input_tokens_seen": 83141120, "step": 68365 }, { "epoch": 8.566595664703671, "grad_norm": 0.12831181287765503, "learning_rate": 7.060378693812099e-06, "loss": 0.4636, "num_input_tokens_seen": 83147488, "step": 68370 }, { "epoch": 8.567222152612455, "grad_norm": 0.10931386798620224, "learning_rate": 7.059880543405187e-06, "loss": 0.4618, "num_input_tokens_seen": 83153344, "step": 68375 }, { "epoch": 8.567848640521238, "grad_norm": 0.09171923249959946, "learning_rate": 7.059382368370671e-06, "loss": 0.4628, "num_input_tokens_seen": 83159680, "step": 68380 }, { "epoch": 8.568475128430022, "grad_norm": 0.0868295207619667, "learning_rate": 7.0588841687145075e-06, "loss": 0.4676, "num_input_tokens_seen": 83165664, "step": 68385 }, { "epoch": 8.569101616338804, "grad_norm": 0.0845387801527977, "learning_rate": 7.0583859444426525e-06, "loss": 0.46, "num_input_tokens_seen": 83171776, "step": 68390 }, { "epoch": 8.569728104247588, "grad_norm": 0.12090817093849182, "learning_rate": 7.057887695561064e-06, "loss": 0.4612, "num_input_tokens_seen": 83177440, "step": 68395 }, { "epoch": 8.570354592156372, "grad_norm": 0.08209928125143051, "learning_rate": 7.057389422075698e-06, "loss": 0.4599, "num_input_tokens_seen": 83183776, "step": 68400 }, { "epoch": 8.570981080065154, "grad_norm": 0.059756387025117874, "learning_rate": 7.056891123992509e-06, "loss": 0.4632, "num_input_tokens_seen": 83190016, "step": 68405 }, { "epoch": 8.571607567973938, "grad_norm": 0.10495316237211227, "learning_rate": 7.056392801317461e-06, "loss": 0.4614, "num_input_tokens_seen": 83196192, "step": 68410 }, { "epoch": 8.57223405588272, "grad_norm": 0.1057671308517456, "learning_rate": 7.0558944540565065e-06, "loss": 0.4685, "num_input_tokens_seen": 83202336, "step": 68415 }, { "epoch": 8.572860543791505, "grad_norm": 0.08719080686569214, "learning_rate": 7.055396082215606e-06, "loss": 0.4573, "num_input_tokens_seen": 83208640, "step": 68420 }, { "epoch": 8.573487031700289, "grad_norm": 0.08587400615215302, "learning_rate": 7.0548976858007155e-06, "loss": 0.4587, "num_input_tokens_seen": 83214656, "step": 68425 }, { "epoch": 8.574113519609071, "grad_norm": 0.061245135962963104, "learning_rate": 7.054399264817797e-06, "loss": 0.4689, "num_input_tokens_seen": 83220864, "step": 68430 }, { "epoch": 8.574740007517855, "grad_norm": 0.12304725497961044, "learning_rate": 7.0539008192728065e-06, "loss": 0.46, "num_input_tokens_seen": 83227200, "step": 68435 }, { "epoch": 8.575366495426639, "grad_norm": 0.12445424497127533, "learning_rate": 7.053402349171706e-06, "loss": 0.4647, "num_input_tokens_seen": 83233536, "step": 68440 }, { "epoch": 8.575992983335421, "grad_norm": 0.12874546647071838, "learning_rate": 7.052903854520454e-06, "loss": 0.4572, "num_input_tokens_seen": 83239520, "step": 68445 }, { "epoch": 8.576619471244205, "grad_norm": 0.09384189546108246, "learning_rate": 7.052405335325011e-06, "loss": 0.4575, "num_input_tokens_seen": 83245888, "step": 68450 }, { "epoch": 8.577245959152988, "grad_norm": 0.07299205660820007, "learning_rate": 7.0519067915913365e-06, "loss": 0.4598, "num_input_tokens_seen": 83252032, "step": 68455 }, { "epoch": 8.577872447061772, "grad_norm": 0.08295928686857224, "learning_rate": 7.051408223325391e-06, "loss": 0.4613, "num_input_tokens_seen": 83258016, "step": 68460 }, { "epoch": 8.578498934970556, "grad_norm": 0.10801167041063309, "learning_rate": 7.050909630533134e-06, "loss": 0.4687, "num_input_tokens_seen": 83264160, "step": 68465 }, { "epoch": 8.579125422879338, "grad_norm": 0.08401957154273987, "learning_rate": 7.05041101322053e-06, "loss": 0.4613, "num_input_tokens_seen": 83269856, "step": 68470 }, { "epoch": 8.579751910788122, "grad_norm": 0.1491401195526123, "learning_rate": 7.049912371393537e-06, "loss": 0.4634, "num_input_tokens_seen": 83276224, "step": 68475 }, { "epoch": 8.580378398696904, "grad_norm": 0.0983632355928421, "learning_rate": 7.049413705058119e-06, "loss": 0.4562, "num_input_tokens_seen": 83282496, "step": 68480 }, { "epoch": 8.581004886605688, "grad_norm": 0.103065624833107, "learning_rate": 7.048915014220237e-06, "loss": 0.461, "num_input_tokens_seen": 83288736, "step": 68485 }, { "epoch": 8.581631374514473, "grad_norm": 0.10661338269710541, "learning_rate": 7.048416298885853e-06, "loss": 0.4643, "num_input_tokens_seen": 83295008, "step": 68490 }, { "epoch": 8.582257862423255, "grad_norm": 0.06133834272623062, "learning_rate": 7.04791755906093e-06, "loss": 0.4641, "num_input_tokens_seen": 83301024, "step": 68495 }, { "epoch": 8.582884350332039, "grad_norm": 0.06971777230501175, "learning_rate": 7.0474187947514316e-06, "loss": 0.4619, "num_input_tokens_seen": 83306944, "step": 68500 }, { "epoch": 8.583510838240821, "grad_norm": 0.09412652254104614, "learning_rate": 7.046920005963319e-06, "loss": 0.4649, "num_input_tokens_seen": 83313280, "step": 68505 }, { "epoch": 8.584137326149605, "grad_norm": 0.0915185809135437, "learning_rate": 7.0464211927025586e-06, "loss": 0.459, "num_input_tokens_seen": 83318944, "step": 68510 }, { "epoch": 8.58476381405839, "grad_norm": 0.09446341544389725, "learning_rate": 7.0459223549751096e-06, "loss": 0.4631, "num_input_tokens_seen": 83324256, "step": 68515 }, { "epoch": 8.585390301967172, "grad_norm": 0.07662824541330338, "learning_rate": 7.0454234927869415e-06, "loss": 0.4621, "num_input_tokens_seen": 83330272, "step": 68520 }, { "epoch": 8.586016789875956, "grad_norm": 0.10638109594583511, "learning_rate": 7.044924606144015e-06, "loss": 0.4612, "num_input_tokens_seen": 83336768, "step": 68525 }, { "epoch": 8.586643277784738, "grad_norm": 0.10329655557870865, "learning_rate": 7.044425695052294e-06, "loss": 0.4564, "num_input_tokens_seen": 83342720, "step": 68530 }, { "epoch": 8.587269765693522, "grad_norm": 0.11229825019836426, "learning_rate": 7.043926759517745e-06, "loss": 0.4553, "num_input_tokens_seen": 83349152, "step": 68535 }, { "epoch": 8.587896253602306, "grad_norm": 0.11478263884782791, "learning_rate": 7.043427799546334e-06, "loss": 0.456, "num_input_tokens_seen": 83355424, "step": 68540 }, { "epoch": 8.588522741511088, "grad_norm": 0.07450215518474579, "learning_rate": 7.042928815144025e-06, "loss": 0.4597, "num_input_tokens_seen": 83361600, "step": 68545 }, { "epoch": 8.589149229419872, "grad_norm": 0.09251946955919266, "learning_rate": 7.0424298063167865e-06, "loss": 0.4618, "num_input_tokens_seen": 83367552, "step": 68550 }, { "epoch": 8.589775717328656, "grad_norm": 0.0912136435508728, "learning_rate": 7.041930773070581e-06, "loss": 0.462, "num_input_tokens_seen": 83373696, "step": 68555 }, { "epoch": 8.590402205237439, "grad_norm": 0.07836180180311203, "learning_rate": 7.041431715411378e-06, "loss": 0.4628, "num_input_tokens_seen": 83379296, "step": 68560 }, { "epoch": 8.591028693146223, "grad_norm": 0.12698189914226532, "learning_rate": 7.0409326333451396e-06, "loss": 0.4584, "num_input_tokens_seen": 83385472, "step": 68565 }, { "epoch": 8.591655181055005, "grad_norm": 0.0674268901348114, "learning_rate": 7.040433526877838e-06, "loss": 0.4612, "num_input_tokens_seen": 83391840, "step": 68570 }, { "epoch": 8.592281668963789, "grad_norm": 0.1296742558479309, "learning_rate": 7.039934396015438e-06, "loss": 0.4611, "num_input_tokens_seen": 83397984, "step": 68575 }, { "epoch": 8.592908156872573, "grad_norm": 0.0926065668463707, "learning_rate": 7.039435240763906e-06, "loss": 0.4606, "num_input_tokens_seen": 83403872, "step": 68580 }, { "epoch": 8.593534644781355, "grad_norm": 0.09150847792625427, "learning_rate": 7.0389360611292105e-06, "loss": 0.4621, "num_input_tokens_seen": 83410464, "step": 68585 }, { "epoch": 8.59416113269014, "grad_norm": 0.09502216428518295, "learning_rate": 7.038436857117321e-06, "loss": 0.4616, "num_input_tokens_seen": 83416416, "step": 68590 }, { "epoch": 8.594787620598922, "grad_norm": 0.1633414328098297, "learning_rate": 7.037937628734204e-06, "loss": 0.4607, "num_input_tokens_seen": 83422496, "step": 68595 }, { "epoch": 8.595414108507706, "grad_norm": 0.0700165331363678, "learning_rate": 7.037438375985831e-06, "loss": 0.4559, "num_input_tokens_seen": 83428608, "step": 68600 }, { "epoch": 8.59604059641649, "grad_norm": 0.09181982278823853, "learning_rate": 7.036939098878169e-06, "loss": 0.4589, "num_input_tokens_seen": 83435008, "step": 68605 }, { "epoch": 8.596667084325272, "grad_norm": 0.1341315656900406, "learning_rate": 7.036439797417187e-06, "loss": 0.4587, "num_input_tokens_seen": 83441248, "step": 68610 }, { "epoch": 8.597293572234056, "grad_norm": 0.11377337574958801, "learning_rate": 7.0359404716088555e-06, "loss": 0.4698, "num_input_tokens_seen": 83447008, "step": 68615 }, { "epoch": 8.597920060142838, "grad_norm": 0.10320296883583069, "learning_rate": 7.0354411214591435e-06, "loss": 0.4565, "num_input_tokens_seen": 83453056, "step": 68620 }, { "epoch": 8.598546548051623, "grad_norm": 0.1739845722913742, "learning_rate": 7.0349417469740225e-06, "loss": 0.4631, "num_input_tokens_seen": 83459200, "step": 68625 }, { "epoch": 8.599173035960407, "grad_norm": 0.10268168896436691, "learning_rate": 7.0344423481594605e-06, "loss": 0.4635, "num_input_tokens_seen": 83465600, "step": 68630 }, { "epoch": 8.599799523869189, "grad_norm": 0.09274953603744507, "learning_rate": 7.033942925021429e-06, "loss": 0.4633, "num_input_tokens_seen": 83471840, "step": 68635 }, { "epoch": 8.600426011777973, "grad_norm": 0.14309565722942352, "learning_rate": 7.033443477565902e-06, "loss": 0.453, "num_input_tokens_seen": 83478112, "step": 68640 }, { "epoch": 8.601052499686755, "grad_norm": 0.11717254668474197, "learning_rate": 7.032944005798848e-06, "loss": 0.4667, "num_input_tokens_seen": 83484512, "step": 68645 }, { "epoch": 8.60167898759554, "grad_norm": 0.15183940529823303, "learning_rate": 7.032444509726239e-06, "loss": 0.468, "num_input_tokens_seen": 83490720, "step": 68650 }, { "epoch": 8.602305475504323, "grad_norm": 0.1136508360505104, "learning_rate": 7.031944989354047e-06, "loss": 0.4643, "num_input_tokens_seen": 83496384, "step": 68655 }, { "epoch": 8.602931963413106, "grad_norm": 0.11701642721891403, "learning_rate": 7.031445444688245e-06, "loss": 0.4686, "num_input_tokens_seen": 83502432, "step": 68660 }, { "epoch": 8.60355845132189, "grad_norm": 0.09149694442749023, "learning_rate": 7.030945875734804e-06, "loss": 0.4638, "num_input_tokens_seen": 83508352, "step": 68665 }, { "epoch": 8.604184939230674, "grad_norm": 0.11440744996070862, "learning_rate": 7.030446282499697e-06, "loss": 0.4688, "num_input_tokens_seen": 83514624, "step": 68670 }, { "epoch": 8.604811427139456, "grad_norm": 0.11288474500179291, "learning_rate": 7.029946664988899e-06, "loss": 0.4632, "num_input_tokens_seen": 83520864, "step": 68675 }, { "epoch": 8.60543791504824, "grad_norm": 0.11478952318429947, "learning_rate": 7.029447023208382e-06, "loss": 0.4594, "num_input_tokens_seen": 83526976, "step": 68680 }, { "epoch": 8.606064402957022, "grad_norm": 0.0876152440905571, "learning_rate": 7.028947357164119e-06, "loss": 0.4654, "num_input_tokens_seen": 83533312, "step": 68685 }, { "epoch": 8.606690890865806, "grad_norm": 0.08911741524934769, "learning_rate": 7.028447666862084e-06, "loss": 0.4645, "num_input_tokens_seen": 83538752, "step": 68690 }, { "epoch": 8.60731737877459, "grad_norm": 0.11572723835706711, "learning_rate": 7.027947952308252e-06, "loss": 0.4628, "num_input_tokens_seen": 83544832, "step": 68695 }, { "epoch": 8.607943866683373, "grad_norm": 0.09870748966932297, "learning_rate": 7.027448213508598e-06, "loss": 0.4552, "num_input_tokens_seen": 83550880, "step": 68700 }, { "epoch": 8.608570354592157, "grad_norm": 0.07629770785570145, "learning_rate": 7.026948450469094e-06, "loss": 0.4617, "num_input_tokens_seen": 83556992, "step": 68705 }, { "epoch": 8.609196842500939, "grad_norm": 0.12728635966777802, "learning_rate": 7.02644866319572e-06, "loss": 0.4647, "num_input_tokens_seen": 83563168, "step": 68710 }, { "epoch": 8.609823330409723, "grad_norm": 0.10635973513126373, "learning_rate": 7.025948851694445e-06, "loss": 0.4657, "num_input_tokens_seen": 83569664, "step": 68715 }, { "epoch": 8.610449818318507, "grad_norm": 0.1377166211605072, "learning_rate": 7.0254490159712505e-06, "loss": 0.4618, "num_input_tokens_seen": 83575680, "step": 68720 }, { "epoch": 8.61107630622729, "grad_norm": 0.09517551958560944, "learning_rate": 7.0249491560321105e-06, "loss": 0.4579, "num_input_tokens_seen": 83582080, "step": 68725 }, { "epoch": 8.611702794136074, "grad_norm": 0.10506675392389297, "learning_rate": 7.024449271882999e-06, "loss": 0.4526, "num_input_tokens_seen": 83588256, "step": 68730 }, { "epoch": 8.612329282044856, "grad_norm": 0.11730268597602844, "learning_rate": 7.023949363529896e-06, "loss": 0.4679, "num_input_tokens_seen": 83594400, "step": 68735 }, { "epoch": 8.61295576995364, "grad_norm": 0.0941019132733345, "learning_rate": 7.023449430978777e-06, "loss": 0.4676, "num_input_tokens_seen": 83600416, "step": 68740 }, { "epoch": 8.613582257862424, "grad_norm": 0.07862434536218643, "learning_rate": 7.0229494742356164e-06, "loss": 0.4641, "num_input_tokens_seen": 83606208, "step": 68745 }, { "epoch": 8.614208745771206, "grad_norm": 0.08469600230455399, "learning_rate": 7.022449493306396e-06, "loss": 0.4601, "num_input_tokens_seen": 83612672, "step": 68750 }, { "epoch": 8.61483523367999, "grad_norm": 0.08868596702814102, "learning_rate": 7.0219494881970914e-06, "loss": 0.4572, "num_input_tokens_seen": 83618208, "step": 68755 }, { "epoch": 8.615461721588773, "grad_norm": 0.11657373607158661, "learning_rate": 7.021449458913681e-06, "loss": 0.4611, "num_input_tokens_seen": 83624224, "step": 68760 }, { "epoch": 8.616088209497557, "grad_norm": 0.08622138947248459, "learning_rate": 7.0209494054621415e-06, "loss": 0.462, "num_input_tokens_seen": 83630240, "step": 68765 }, { "epoch": 8.61671469740634, "grad_norm": 0.10970266163349152, "learning_rate": 7.020449327848454e-06, "loss": 0.4595, "num_input_tokens_seen": 83636384, "step": 68770 }, { "epoch": 8.617341185315123, "grad_norm": 0.09202714264392853, "learning_rate": 7.019949226078595e-06, "loss": 0.4587, "num_input_tokens_seen": 83642624, "step": 68775 }, { "epoch": 8.617967673223907, "grad_norm": 0.09520651400089264, "learning_rate": 7.019449100158545e-06, "loss": 0.4573, "num_input_tokens_seen": 83649120, "step": 68780 }, { "epoch": 8.618594161132691, "grad_norm": 0.13302090764045715, "learning_rate": 7.018948950094284e-06, "loss": 0.464, "num_input_tokens_seen": 83655072, "step": 68785 }, { "epoch": 8.619220649041473, "grad_norm": 0.14072977006435394, "learning_rate": 7.018448775891791e-06, "loss": 0.4633, "num_input_tokens_seen": 83661248, "step": 68790 }, { "epoch": 8.619847136950257, "grad_norm": 0.13677878677845, "learning_rate": 7.017948577557043e-06, "loss": 0.471, "num_input_tokens_seen": 83667360, "step": 68795 }, { "epoch": 8.62047362485904, "grad_norm": 0.11009040474891663, "learning_rate": 7.017448355096026e-06, "loss": 0.4638, "num_input_tokens_seen": 83673760, "step": 68800 }, { "epoch": 8.621100112767824, "grad_norm": 0.0995410606265068, "learning_rate": 7.0169481085147175e-06, "loss": 0.4633, "num_input_tokens_seen": 83679872, "step": 68805 }, { "epoch": 8.621726600676606, "grad_norm": 0.10769778490066528, "learning_rate": 7.016447837819098e-06, "loss": 0.4631, "num_input_tokens_seen": 83685728, "step": 68810 }, { "epoch": 8.62235308858539, "grad_norm": 0.10796315968036652, "learning_rate": 7.015947543015149e-06, "loss": 0.4567, "num_input_tokens_seen": 83691744, "step": 68815 }, { "epoch": 8.622979576494174, "grad_norm": 0.0817575603723526, "learning_rate": 7.015447224108853e-06, "loss": 0.4589, "num_input_tokens_seen": 83697376, "step": 68820 }, { "epoch": 8.623606064402956, "grad_norm": 0.10968102514743805, "learning_rate": 7.0149468811061905e-06, "loss": 0.4646, "num_input_tokens_seen": 83703616, "step": 68825 }, { "epoch": 8.62423255231174, "grad_norm": 0.05410877242684364, "learning_rate": 7.014446514013144e-06, "loss": 0.4693, "num_input_tokens_seen": 83709728, "step": 68830 }, { "epoch": 8.624859040220525, "grad_norm": 0.14679838716983795, "learning_rate": 7.013946122835696e-06, "loss": 0.4647, "num_input_tokens_seen": 83716000, "step": 68835 }, { "epoch": 8.625485528129307, "grad_norm": 0.09353971481323242, "learning_rate": 7.013445707579829e-06, "loss": 0.4633, "num_input_tokens_seen": 83722208, "step": 68840 }, { "epoch": 8.62611201603809, "grad_norm": 0.1969117820262909, "learning_rate": 7.012945268251525e-06, "loss": 0.4718, "num_input_tokens_seen": 83728160, "step": 68845 }, { "epoch": 8.626738503946873, "grad_norm": 0.12993353605270386, "learning_rate": 7.012444804856768e-06, "loss": 0.4598, "num_input_tokens_seen": 83734560, "step": 68850 }, { "epoch": 8.627364991855657, "grad_norm": 0.085630401968956, "learning_rate": 7.011944317401542e-06, "loss": 0.476, "num_input_tokens_seen": 83740832, "step": 68855 }, { "epoch": 8.627991479764441, "grad_norm": 0.10626884549856186, "learning_rate": 7.01144380589183e-06, "loss": 0.4586, "num_input_tokens_seen": 83746976, "step": 68860 }, { "epoch": 8.628617967673224, "grad_norm": 0.12610630691051483, "learning_rate": 7.0109432703336155e-06, "loss": 0.4657, "num_input_tokens_seen": 83753248, "step": 68865 }, { "epoch": 8.629244455582008, "grad_norm": 0.09379134327173233, "learning_rate": 7.010442710732884e-06, "loss": 0.4628, "num_input_tokens_seen": 83759456, "step": 68870 }, { "epoch": 8.62987094349079, "grad_norm": 0.10217094421386719, "learning_rate": 7.00994212709562e-06, "loss": 0.4642, "num_input_tokens_seen": 83765344, "step": 68875 }, { "epoch": 8.630497431399574, "grad_norm": 0.09316710382699966, "learning_rate": 7.009441519427809e-06, "loss": 0.4652, "num_input_tokens_seen": 83771232, "step": 68880 }, { "epoch": 8.631123919308358, "grad_norm": 0.09778046607971191, "learning_rate": 7.0089408877354335e-06, "loss": 0.4685, "num_input_tokens_seen": 83777600, "step": 68885 }, { "epoch": 8.63175040721714, "grad_norm": 0.08805175125598907, "learning_rate": 7.00844023202448e-06, "loss": 0.46, "num_input_tokens_seen": 83784096, "step": 68890 }, { "epoch": 8.632376895125924, "grad_norm": 0.11775955557823181, "learning_rate": 7.007939552300935e-06, "loss": 0.4585, "num_input_tokens_seen": 83790432, "step": 68895 }, { "epoch": 8.633003383034707, "grad_norm": 0.10535719990730286, "learning_rate": 7.007438848570785e-06, "loss": 0.4626, "num_input_tokens_seen": 83796768, "step": 68900 }, { "epoch": 8.63362987094349, "grad_norm": 0.07933179289102554, "learning_rate": 7.006938120840017e-06, "loss": 0.4623, "num_input_tokens_seen": 83802976, "step": 68905 }, { "epoch": 8.634256358852275, "grad_norm": 0.11971943080425262, "learning_rate": 7.006437369114616e-06, "loss": 0.4653, "num_input_tokens_seen": 83808960, "step": 68910 }, { "epoch": 8.634882846761057, "grad_norm": 0.08327700942754745, "learning_rate": 7.005936593400569e-06, "loss": 0.462, "num_input_tokens_seen": 83815040, "step": 68915 }, { "epoch": 8.635509334669841, "grad_norm": 0.07929162681102753, "learning_rate": 7.005435793703863e-06, "loss": 0.4558, "num_input_tokens_seen": 83821152, "step": 68920 }, { "epoch": 8.636135822578623, "grad_norm": 0.11841212958097458, "learning_rate": 7.004934970030487e-06, "loss": 0.4627, "num_input_tokens_seen": 83827488, "step": 68925 }, { "epoch": 8.636762310487407, "grad_norm": 0.09748691320419312, "learning_rate": 7.0044341223864276e-06, "loss": 0.4627, "num_input_tokens_seen": 83833408, "step": 68930 }, { "epoch": 8.637388798396191, "grad_norm": 0.10273386538028717, "learning_rate": 7.003933250777673e-06, "loss": 0.4689, "num_input_tokens_seen": 83839680, "step": 68935 }, { "epoch": 8.638015286304974, "grad_norm": 0.09445860236883163, "learning_rate": 7.003432355210211e-06, "loss": 0.4642, "num_input_tokens_seen": 83845952, "step": 68940 }, { "epoch": 8.638641774213758, "grad_norm": 0.09244686365127563, "learning_rate": 7.00293143569003e-06, "loss": 0.4644, "num_input_tokens_seen": 83851872, "step": 68945 }, { "epoch": 8.639268262122542, "grad_norm": 0.09689483046531677, "learning_rate": 7.002430492223121e-06, "loss": 0.462, "num_input_tokens_seen": 83857984, "step": 68950 }, { "epoch": 8.639894750031324, "grad_norm": 0.08898599445819855, "learning_rate": 7.0019295248154714e-06, "loss": 0.4647, "num_input_tokens_seen": 83864352, "step": 68955 }, { "epoch": 8.640521237940108, "grad_norm": 0.08724169433116913, "learning_rate": 7.001428533473072e-06, "loss": 0.4643, "num_input_tokens_seen": 83870624, "step": 68960 }, { "epoch": 8.64114772584889, "grad_norm": 0.11731160432100296, "learning_rate": 7.000927518201911e-06, "loss": 0.4544, "num_input_tokens_seen": 83876736, "step": 68965 }, { "epoch": 8.641774213757675, "grad_norm": 0.09734658151865005, "learning_rate": 7.00042647900798e-06, "loss": 0.458, "num_input_tokens_seen": 83883008, "step": 68970 }, { "epoch": 8.642400701666459, "grad_norm": 0.09849608689546585, "learning_rate": 6.999925415897267e-06, "loss": 0.4631, "num_input_tokens_seen": 83889216, "step": 68975 }, { "epoch": 8.64302718957524, "grad_norm": 0.12020460516214371, "learning_rate": 6.999424328875766e-06, "loss": 0.4627, "num_input_tokens_seen": 83895584, "step": 68980 }, { "epoch": 8.643653677484025, "grad_norm": 0.07803154736757278, "learning_rate": 6.998923217949466e-06, "loss": 0.4616, "num_input_tokens_seen": 83901728, "step": 68985 }, { "epoch": 8.644280165392807, "grad_norm": 0.08236438781023026, "learning_rate": 6.9984220831243575e-06, "loss": 0.4642, "num_input_tokens_seen": 83908384, "step": 68990 }, { "epoch": 8.644906653301591, "grad_norm": 0.09428058564662933, "learning_rate": 6.997920924406434e-06, "loss": 0.4621, "num_input_tokens_seen": 83914592, "step": 68995 }, { "epoch": 8.645533141210375, "grad_norm": 0.061965666711330414, "learning_rate": 6.997419741801683e-06, "loss": 0.4656, "num_input_tokens_seen": 83920896, "step": 69000 }, { "epoch": 8.646159629119158, "grad_norm": 0.08961599320173264, "learning_rate": 6.996918535316103e-06, "loss": 0.4629, "num_input_tokens_seen": 83927168, "step": 69005 }, { "epoch": 8.646786117027942, "grad_norm": 0.10999777913093567, "learning_rate": 6.996417304955682e-06, "loss": 0.4584, "num_input_tokens_seen": 83932896, "step": 69010 }, { "epoch": 8.647412604936724, "grad_norm": 0.13532058894634247, "learning_rate": 6.9959160507264135e-06, "loss": 0.4649, "num_input_tokens_seen": 83938880, "step": 69015 }, { "epoch": 8.648039092845508, "grad_norm": 0.10214628279209137, "learning_rate": 6.99541477263429e-06, "loss": 0.4688, "num_input_tokens_seen": 83944864, "step": 69020 }, { "epoch": 8.648665580754292, "grad_norm": 0.06461870670318604, "learning_rate": 6.994913470685305e-06, "loss": 0.459, "num_input_tokens_seen": 83950720, "step": 69025 }, { "epoch": 8.649292068663074, "grad_norm": 0.07956094294786453, "learning_rate": 6.994412144885453e-06, "loss": 0.4602, "num_input_tokens_seen": 83956896, "step": 69030 }, { "epoch": 8.649918556571858, "grad_norm": 0.0718112364411354, "learning_rate": 6.993910795240727e-06, "loss": 0.4638, "num_input_tokens_seen": 83963104, "step": 69035 }, { "epoch": 8.65054504448064, "grad_norm": 0.13901932537555695, "learning_rate": 6.993409421757121e-06, "loss": 0.4633, "num_input_tokens_seen": 83969376, "step": 69040 }, { "epoch": 8.651171532389425, "grad_norm": 0.10729760676622391, "learning_rate": 6.99290802444063e-06, "loss": 0.464, "num_input_tokens_seen": 83975456, "step": 69045 }, { "epoch": 8.651798020298209, "grad_norm": 0.09423423558473587, "learning_rate": 6.992406603297248e-06, "loss": 0.4574, "num_input_tokens_seen": 83981376, "step": 69050 }, { "epoch": 8.652424508206991, "grad_norm": 0.07921461015939713, "learning_rate": 6.991905158332968e-06, "loss": 0.4709, "num_input_tokens_seen": 83987488, "step": 69055 }, { "epoch": 8.653050996115775, "grad_norm": 0.07122471928596497, "learning_rate": 6.991403689553788e-06, "loss": 0.457, "num_input_tokens_seen": 83993440, "step": 69060 }, { "epoch": 8.65367748402456, "grad_norm": 0.10644616931676865, "learning_rate": 6.990902196965704e-06, "loss": 0.4657, "num_input_tokens_seen": 83999904, "step": 69065 }, { "epoch": 8.654303971933341, "grad_norm": 0.05103728920221329, "learning_rate": 6.990400680574709e-06, "loss": 0.4607, "num_input_tokens_seen": 84005824, "step": 69070 }, { "epoch": 8.654930459842126, "grad_norm": 0.09935858845710754, "learning_rate": 6.989899140386802e-06, "loss": 0.4663, "num_input_tokens_seen": 84011840, "step": 69075 }, { "epoch": 8.655556947750908, "grad_norm": 0.11217965930700302, "learning_rate": 6.989397576407977e-06, "loss": 0.4615, "num_input_tokens_seen": 84018208, "step": 69080 }, { "epoch": 8.656183435659692, "grad_norm": 0.08801914006471634, "learning_rate": 6.98889598864423e-06, "loss": 0.4581, "num_input_tokens_seen": 84024768, "step": 69085 }, { "epoch": 8.656809923568476, "grad_norm": 0.10283523797988892, "learning_rate": 6.988394377101559e-06, "loss": 0.4625, "num_input_tokens_seen": 84030688, "step": 69090 }, { "epoch": 8.657436411477258, "grad_norm": 0.05839606374502182, "learning_rate": 6.987892741785965e-06, "loss": 0.4628, "num_input_tokens_seen": 84036544, "step": 69095 }, { "epoch": 8.658062899386042, "grad_norm": 0.10410382598638535, "learning_rate": 6.987391082703439e-06, "loss": 0.4535, "num_input_tokens_seen": 84042560, "step": 69100 }, { "epoch": 8.658689387294825, "grad_norm": 0.09636954218149185, "learning_rate": 6.986889399859982e-06, "loss": 0.4664, "num_input_tokens_seen": 84048608, "step": 69105 }, { "epoch": 8.659315875203609, "grad_norm": 0.10838872939348221, "learning_rate": 6.986387693261592e-06, "loss": 0.463, "num_input_tokens_seen": 84054624, "step": 69110 }, { "epoch": 8.659942363112393, "grad_norm": 0.09693122655153275, "learning_rate": 6.985885962914267e-06, "loss": 0.4648, "num_input_tokens_seen": 84060704, "step": 69115 }, { "epoch": 8.660568851021175, "grad_norm": 0.08778416365385056, "learning_rate": 6.985384208824007e-06, "loss": 0.4632, "num_input_tokens_seen": 84066816, "step": 69120 }, { "epoch": 8.661195338929959, "grad_norm": 0.07946887612342834, "learning_rate": 6.984882430996808e-06, "loss": 0.4533, "num_input_tokens_seen": 84072928, "step": 69125 }, { "epoch": 8.661821826838741, "grad_norm": 0.08449424058198929, "learning_rate": 6.984380629438672e-06, "loss": 0.4503, "num_input_tokens_seen": 84078240, "step": 69130 }, { "epoch": 8.662448314747525, "grad_norm": 0.18482278287410736, "learning_rate": 6.983878804155596e-06, "loss": 0.4659, "num_input_tokens_seen": 84084736, "step": 69135 }, { "epoch": 8.66307480265631, "grad_norm": 0.1332881599664688, "learning_rate": 6.983376955153581e-06, "loss": 0.4598, "num_input_tokens_seen": 84090144, "step": 69140 }, { "epoch": 8.663701290565092, "grad_norm": 0.1264243870973587, "learning_rate": 6.982875082438627e-06, "loss": 0.4632, "num_input_tokens_seen": 84096640, "step": 69145 }, { "epoch": 8.664327778473876, "grad_norm": 0.08575087040662766, "learning_rate": 6.982373186016734e-06, "loss": 0.4649, "num_input_tokens_seen": 84103040, "step": 69150 }, { "epoch": 8.664954266382658, "grad_norm": 0.0835595354437828, "learning_rate": 6.981871265893904e-06, "loss": 0.4694, "num_input_tokens_seen": 84109024, "step": 69155 }, { "epoch": 8.665580754291442, "grad_norm": 0.11563248932361603, "learning_rate": 6.981369322076135e-06, "loss": 0.4548, "num_input_tokens_seen": 84114912, "step": 69160 }, { "epoch": 8.666207242200226, "grad_norm": 0.17253519594669342, "learning_rate": 6.980867354569432e-06, "loss": 0.464, "num_input_tokens_seen": 84121280, "step": 69165 }, { "epoch": 8.666833730109008, "grad_norm": 0.11762310564517975, "learning_rate": 6.9803653633797926e-06, "loss": 0.4644, "num_input_tokens_seen": 84127424, "step": 69170 }, { "epoch": 8.667460218017792, "grad_norm": 0.16487295925617218, "learning_rate": 6.979863348513222e-06, "loss": 0.4637, "num_input_tokens_seen": 84133376, "step": 69175 }, { "epoch": 8.668086705926576, "grad_norm": 0.06247939169406891, "learning_rate": 6.97936130997572e-06, "loss": 0.4565, "num_input_tokens_seen": 84139520, "step": 69180 }, { "epoch": 8.668713193835359, "grad_norm": 0.07830478250980377, "learning_rate": 6.978859247773289e-06, "loss": 0.469, "num_input_tokens_seen": 84145760, "step": 69185 }, { "epoch": 8.669339681744143, "grad_norm": 0.09805712103843689, "learning_rate": 6.9783571619119325e-06, "loss": 0.4626, "num_input_tokens_seen": 84151808, "step": 69190 }, { "epoch": 8.669966169652925, "grad_norm": 0.08377265185117722, "learning_rate": 6.977855052397652e-06, "loss": 0.4594, "num_input_tokens_seen": 84157824, "step": 69195 }, { "epoch": 8.67059265756171, "grad_norm": 0.10175757855176926, "learning_rate": 6.977352919236452e-06, "loss": 0.4652, "num_input_tokens_seen": 84162848, "step": 69200 }, { "epoch": 8.671219145470493, "grad_norm": 0.11042813956737518, "learning_rate": 6.976850762434334e-06, "loss": 0.4661, "num_input_tokens_seen": 84168960, "step": 69205 }, { "epoch": 8.671845633379275, "grad_norm": 0.08703771978616714, "learning_rate": 6.976348581997303e-06, "loss": 0.4659, "num_input_tokens_seen": 84174784, "step": 69210 }, { "epoch": 8.67247212128806, "grad_norm": 0.12635084986686707, "learning_rate": 6.9758463779313654e-06, "loss": 0.4547, "num_input_tokens_seen": 84180896, "step": 69215 }, { "epoch": 8.673098609196842, "grad_norm": 0.1208522692322731, "learning_rate": 6.975344150242523e-06, "loss": 0.4631, "num_input_tokens_seen": 84186816, "step": 69220 }, { "epoch": 8.673725097105626, "grad_norm": 0.09239377826452255, "learning_rate": 6.974841898936779e-06, "loss": 0.4637, "num_input_tokens_seen": 84192704, "step": 69225 }, { "epoch": 8.67435158501441, "grad_norm": 0.16651761531829834, "learning_rate": 6.97433962402014e-06, "loss": 0.4688, "num_input_tokens_seen": 84198944, "step": 69230 }, { "epoch": 8.674978072923192, "grad_norm": 0.08967304229736328, "learning_rate": 6.973837325498612e-06, "loss": 0.4668, "num_input_tokens_seen": 84205056, "step": 69235 }, { "epoch": 8.675604560831976, "grad_norm": 0.07803486287593842, "learning_rate": 6.973335003378199e-06, "loss": 0.4601, "num_input_tokens_seen": 84210336, "step": 69240 }, { "epoch": 8.676231048740759, "grad_norm": 0.11221061646938324, "learning_rate": 6.972832657664905e-06, "loss": 0.4592, "num_input_tokens_seen": 84217056, "step": 69245 }, { "epoch": 8.676857536649543, "grad_norm": 0.08449530601501465, "learning_rate": 6.972330288364741e-06, "loss": 0.4617, "num_input_tokens_seen": 84223072, "step": 69250 }, { "epoch": 8.677484024558327, "grad_norm": 0.07191409915685654, "learning_rate": 6.9718278954837075e-06, "loss": 0.4621, "num_input_tokens_seen": 84228800, "step": 69255 }, { "epoch": 8.678110512467109, "grad_norm": 0.08765532076358795, "learning_rate": 6.971325479027814e-06, "loss": 0.4658, "num_input_tokens_seen": 84234848, "step": 69260 }, { "epoch": 8.678737000375893, "grad_norm": 0.11024913936853409, "learning_rate": 6.9708230390030684e-06, "loss": 0.459, "num_input_tokens_seen": 84241088, "step": 69265 }, { "epoch": 8.679363488284675, "grad_norm": 0.0768386647105217, "learning_rate": 6.9703205754154745e-06, "loss": 0.4571, "num_input_tokens_seen": 84247264, "step": 69270 }, { "epoch": 8.67998997619346, "grad_norm": 0.07858727872371674, "learning_rate": 6.969818088271043e-06, "loss": 0.4614, "num_input_tokens_seen": 84253440, "step": 69275 }, { "epoch": 8.680616464102243, "grad_norm": 0.09548601508140564, "learning_rate": 6.9693155775757795e-06, "loss": 0.4606, "num_input_tokens_seen": 84259552, "step": 69280 }, { "epoch": 8.681242952011026, "grad_norm": 0.08469878882169724, "learning_rate": 6.968813043335694e-06, "loss": 0.4623, "num_input_tokens_seen": 84265088, "step": 69285 }, { "epoch": 8.68186943991981, "grad_norm": 0.091120645403862, "learning_rate": 6.968310485556792e-06, "loss": 0.4665, "num_input_tokens_seen": 84271168, "step": 69290 }, { "epoch": 8.682495927828594, "grad_norm": 0.09164020419120789, "learning_rate": 6.967807904245083e-06, "loss": 0.464, "num_input_tokens_seen": 84277440, "step": 69295 }, { "epoch": 8.683122415737376, "grad_norm": 0.08077505975961685, "learning_rate": 6.967305299406577e-06, "loss": 0.4567, "num_input_tokens_seen": 84283616, "step": 69300 }, { "epoch": 8.68374890364616, "grad_norm": 0.06016950681805611, "learning_rate": 6.966802671047281e-06, "loss": 0.4593, "num_input_tokens_seen": 84289728, "step": 69305 }, { "epoch": 8.684375391554942, "grad_norm": 0.07479316741228104, "learning_rate": 6.966300019173206e-06, "loss": 0.4592, "num_input_tokens_seen": 84295936, "step": 69310 }, { "epoch": 8.685001879463726, "grad_norm": 0.09587345272302628, "learning_rate": 6.965797343790361e-06, "loss": 0.4668, "num_input_tokens_seen": 84302240, "step": 69315 }, { "epoch": 8.68562836737251, "grad_norm": 0.08863460272550583, "learning_rate": 6.965294644904756e-06, "loss": 0.4652, "num_input_tokens_seen": 84308096, "step": 69320 }, { "epoch": 8.686254855281293, "grad_norm": 0.12637433409690857, "learning_rate": 6.964791922522402e-06, "loss": 0.4619, "num_input_tokens_seen": 84313888, "step": 69325 }, { "epoch": 8.686881343190077, "grad_norm": 0.10314159840345383, "learning_rate": 6.9642891766493084e-06, "loss": 0.4592, "num_input_tokens_seen": 84319232, "step": 69330 }, { "epoch": 8.68750783109886, "grad_norm": 0.09180168062448502, "learning_rate": 6.963786407291485e-06, "loss": 0.4579, "num_input_tokens_seen": 84325280, "step": 69335 }, { "epoch": 8.688134319007643, "grad_norm": 0.08626789599657059, "learning_rate": 6.963283614454946e-06, "loss": 0.4605, "num_input_tokens_seen": 84331424, "step": 69340 }, { "epoch": 8.688760806916427, "grad_norm": 0.08014772832393646, "learning_rate": 6.9627807981456995e-06, "loss": 0.4595, "num_input_tokens_seen": 84337312, "step": 69345 }, { "epoch": 8.68938729482521, "grad_norm": 0.08208614587783813, "learning_rate": 6.962277958369759e-06, "loss": 0.4595, "num_input_tokens_seen": 84343584, "step": 69350 }, { "epoch": 8.690013782733994, "grad_norm": 0.09280376136302948, "learning_rate": 6.961775095133134e-06, "loss": 0.4675, "num_input_tokens_seen": 84349536, "step": 69355 }, { "epoch": 8.690640270642776, "grad_norm": 0.0670710876584053, "learning_rate": 6.961272208441839e-06, "loss": 0.46, "num_input_tokens_seen": 84355648, "step": 69360 }, { "epoch": 8.69126675855156, "grad_norm": 0.1013103649020195, "learning_rate": 6.960769298301887e-06, "loss": 0.4634, "num_input_tokens_seen": 84362080, "step": 69365 }, { "epoch": 8.691893246460344, "grad_norm": 0.15449854731559753, "learning_rate": 6.960266364719289e-06, "loss": 0.4569, "num_input_tokens_seen": 84368032, "step": 69370 }, { "epoch": 8.692519734369126, "grad_norm": 0.08931128680706024, "learning_rate": 6.959763407700059e-06, "loss": 0.4581, "num_input_tokens_seen": 84374400, "step": 69375 }, { "epoch": 8.69314622227791, "grad_norm": 0.12591636180877686, "learning_rate": 6.9592604272502094e-06, "loss": 0.4627, "num_input_tokens_seen": 84380352, "step": 69380 }, { "epoch": 8.693772710186693, "grad_norm": 0.12240059673786163, "learning_rate": 6.9587574233757536e-06, "loss": 0.4616, "num_input_tokens_seen": 84386080, "step": 69385 }, { "epoch": 8.694399198095477, "grad_norm": 0.07696003466844559, "learning_rate": 6.958254396082706e-06, "loss": 0.4623, "num_input_tokens_seen": 84392096, "step": 69390 }, { "epoch": 8.69502568600426, "grad_norm": 0.1269148588180542, "learning_rate": 6.957751345377081e-06, "loss": 0.4609, "num_input_tokens_seen": 84398368, "step": 69395 }, { "epoch": 8.695652173913043, "grad_norm": 0.1397138386964798, "learning_rate": 6.957248271264892e-06, "loss": 0.4626, "num_input_tokens_seen": 84404544, "step": 69400 }, { "epoch": 8.696278661821827, "grad_norm": 0.11176197975873947, "learning_rate": 6.956745173752156e-06, "loss": 0.4571, "num_input_tokens_seen": 84410528, "step": 69405 }, { "epoch": 8.696905149730611, "grad_norm": 0.09533301740884781, "learning_rate": 6.956242052844886e-06, "loss": 0.4601, "num_input_tokens_seen": 84416640, "step": 69410 }, { "epoch": 8.697531637639393, "grad_norm": 0.10597644001245499, "learning_rate": 6.955738908549095e-06, "loss": 0.4632, "num_input_tokens_seen": 84422336, "step": 69415 }, { "epoch": 8.698158125548177, "grad_norm": 0.15041211247444153, "learning_rate": 6.955235740870804e-06, "loss": 0.4625, "num_input_tokens_seen": 84428512, "step": 69420 }, { "epoch": 8.69878461345696, "grad_norm": 0.14958073198795319, "learning_rate": 6.954732549816025e-06, "loss": 0.4676, "num_input_tokens_seen": 84434784, "step": 69425 }, { "epoch": 8.699411101365744, "grad_norm": 0.14253085851669312, "learning_rate": 6.954229335390774e-06, "loss": 0.4585, "num_input_tokens_seen": 84440032, "step": 69430 }, { "epoch": 8.700037589274526, "grad_norm": 0.23165558278560638, "learning_rate": 6.95372609760107e-06, "loss": 0.4558, "num_input_tokens_seen": 84446144, "step": 69435 }, { "epoch": 8.70066407718331, "grad_norm": 0.1862800419330597, "learning_rate": 6.953222836452926e-06, "loss": 0.477, "num_input_tokens_seen": 84452032, "step": 69440 }, { "epoch": 8.701290565092094, "grad_norm": 0.16708774864673615, "learning_rate": 6.952719551952363e-06, "loss": 0.4624, "num_input_tokens_seen": 84458464, "step": 69445 }, { "epoch": 8.701917053000876, "grad_norm": 0.14370131492614746, "learning_rate": 6.952216244105394e-06, "loss": 0.4707, "num_input_tokens_seen": 84464160, "step": 69450 }, { "epoch": 8.70254354090966, "grad_norm": 0.1168389618396759, "learning_rate": 6.951712912918038e-06, "loss": 0.4707, "num_input_tokens_seen": 84470688, "step": 69455 }, { "epoch": 8.703170028818445, "grad_norm": 0.1127377301454544, "learning_rate": 6.951209558396315e-06, "loss": 0.4648, "num_input_tokens_seen": 84476928, "step": 69460 }, { "epoch": 8.703796516727227, "grad_norm": 0.09752292186021805, "learning_rate": 6.950706180546241e-06, "loss": 0.4638, "num_input_tokens_seen": 84483200, "step": 69465 }, { "epoch": 8.704423004636011, "grad_norm": 0.08583829551935196, "learning_rate": 6.950202779373834e-06, "loss": 0.451, "num_input_tokens_seen": 84489184, "step": 69470 }, { "epoch": 8.705049492544793, "grad_norm": 0.09020176529884338, "learning_rate": 6.949699354885112e-06, "loss": 0.4664, "num_input_tokens_seen": 84495360, "step": 69475 }, { "epoch": 8.705675980453577, "grad_norm": 0.09917409718036652, "learning_rate": 6.949195907086097e-06, "loss": 0.4642, "num_input_tokens_seen": 84501536, "step": 69480 }, { "epoch": 8.706302468362361, "grad_norm": 0.08309004455804825, "learning_rate": 6.948692435982805e-06, "loss": 0.4615, "num_input_tokens_seen": 84507552, "step": 69485 }, { "epoch": 8.706928956271144, "grad_norm": 0.1297840029001236, "learning_rate": 6.948188941581258e-06, "loss": 0.4573, "num_input_tokens_seen": 84513792, "step": 69490 }, { "epoch": 8.707555444179928, "grad_norm": 0.1035727933049202, "learning_rate": 6.947685423887473e-06, "loss": 0.4622, "num_input_tokens_seen": 84520032, "step": 69495 }, { "epoch": 8.70818193208871, "grad_norm": 0.07859154790639877, "learning_rate": 6.9471818829074724e-06, "loss": 0.4616, "num_input_tokens_seen": 84525888, "step": 69500 }, { "epoch": 8.708808419997494, "grad_norm": 0.052521318197250366, "learning_rate": 6.946678318647273e-06, "loss": 0.4637, "num_input_tokens_seen": 84531264, "step": 69505 }, { "epoch": 8.709434907906278, "grad_norm": 0.11537279188632965, "learning_rate": 6.946174731112901e-06, "loss": 0.4608, "num_input_tokens_seen": 84537376, "step": 69510 }, { "epoch": 8.71006139581506, "grad_norm": 0.13126926124095917, "learning_rate": 6.94567112031037e-06, "loss": 0.4594, "num_input_tokens_seen": 84543584, "step": 69515 }, { "epoch": 8.710687883723844, "grad_norm": 0.1850680708885193, "learning_rate": 6.945167486245706e-06, "loss": 0.4643, "num_input_tokens_seen": 84549856, "step": 69520 }, { "epoch": 8.711314371632627, "grad_norm": 0.0850638747215271, "learning_rate": 6.944663828924931e-06, "loss": 0.4588, "num_input_tokens_seen": 84555904, "step": 69525 }, { "epoch": 8.71194085954141, "grad_norm": 0.10699045658111572, "learning_rate": 6.944160148354063e-06, "loss": 0.4612, "num_input_tokens_seen": 84562304, "step": 69530 }, { "epoch": 8.712567347450195, "grad_norm": 0.10965850204229355, "learning_rate": 6.943656444539126e-06, "loss": 0.4597, "num_input_tokens_seen": 84568544, "step": 69535 }, { "epoch": 8.713193835358977, "grad_norm": 0.07557716965675354, "learning_rate": 6.943152717486144e-06, "loss": 0.4639, "num_input_tokens_seen": 84574720, "step": 69540 }, { "epoch": 8.713820323267761, "grad_norm": 0.08739432692527771, "learning_rate": 6.942648967201137e-06, "loss": 0.464, "num_input_tokens_seen": 84580608, "step": 69545 }, { "epoch": 8.714446811176543, "grad_norm": 0.056115277111530304, "learning_rate": 6.942145193690128e-06, "loss": 0.4604, "num_input_tokens_seen": 84586656, "step": 69550 }, { "epoch": 8.715073299085327, "grad_norm": 0.10146261751651764, "learning_rate": 6.94164139695914e-06, "loss": 0.4652, "num_input_tokens_seen": 84592928, "step": 69555 }, { "epoch": 8.715699786994112, "grad_norm": 0.10867904126644135, "learning_rate": 6.941137577014196e-06, "loss": 0.4632, "num_input_tokens_seen": 84598560, "step": 69560 }, { "epoch": 8.716326274902894, "grad_norm": 0.11473652720451355, "learning_rate": 6.940633733861322e-06, "loss": 0.4681, "num_input_tokens_seen": 84604640, "step": 69565 }, { "epoch": 8.716952762811678, "grad_norm": 0.13352714478969574, "learning_rate": 6.940129867506539e-06, "loss": 0.4671, "num_input_tokens_seen": 84611040, "step": 69570 }, { "epoch": 8.717579250720462, "grad_norm": 0.0864962711930275, "learning_rate": 6.939625977955872e-06, "loss": 0.4646, "num_input_tokens_seen": 84616320, "step": 69575 }, { "epoch": 8.718205738629244, "grad_norm": 0.13259561359882355, "learning_rate": 6.939122065215345e-06, "loss": 0.4684, "num_input_tokens_seen": 84622464, "step": 69580 }, { "epoch": 8.718832226538028, "grad_norm": 0.08305531740188599, "learning_rate": 6.9386181292909846e-06, "loss": 0.4611, "num_input_tokens_seen": 84628288, "step": 69585 }, { "epoch": 8.71945871444681, "grad_norm": 0.0981103926897049, "learning_rate": 6.938114170188815e-06, "loss": 0.4621, "num_input_tokens_seen": 84634784, "step": 69590 }, { "epoch": 8.720085202355595, "grad_norm": 0.07567533850669861, "learning_rate": 6.937610187914861e-06, "loss": 0.4581, "num_input_tokens_seen": 84641184, "step": 69595 }, { "epoch": 8.720711690264379, "grad_norm": 0.10170350968837738, "learning_rate": 6.937106182475147e-06, "loss": 0.4614, "num_input_tokens_seen": 84647616, "step": 69600 }, { "epoch": 8.721338178173161, "grad_norm": 0.12386839836835861, "learning_rate": 6.936602153875701e-06, "loss": 0.4597, "num_input_tokens_seen": 84653568, "step": 69605 }, { "epoch": 8.721964666081945, "grad_norm": 0.11331027746200562, "learning_rate": 6.936098102122548e-06, "loss": 0.4605, "num_input_tokens_seen": 84659680, "step": 69610 }, { "epoch": 8.722591153990727, "grad_norm": 0.08122559636831284, "learning_rate": 6.935594027221714e-06, "loss": 0.4646, "num_input_tokens_seen": 84665632, "step": 69615 }, { "epoch": 8.723217641899511, "grad_norm": 0.09594881534576416, "learning_rate": 6.9350899291792255e-06, "loss": 0.4585, "num_input_tokens_seen": 84671744, "step": 69620 }, { "epoch": 8.723844129808295, "grad_norm": 0.08448799699544907, "learning_rate": 6.934585808001111e-06, "loss": 0.4712, "num_input_tokens_seen": 84677952, "step": 69625 }, { "epoch": 8.724470617717078, "grad_norm": 0.10387761890888214, "learning_rate": 6.934081663693395e-06, "loss": 0.4583, "num_input_tokens_seen": 84683904, "step": 69630 }, { "epoch": 8.725097105625862, "grad_norm": 0.12488097697496414, "learning_rate": 6.933577496262108e-06, "loss": 0.458, "num_input_tokens_seen": 84689952, "step": 69635 }, { "epoch": 8.725723593534644, "grad_norm": 0.09002068638801575, "learning_rate": 6.933073305713276e-06, "loss": 0.4638, "num_input_tokens_seen": 84696160, "step": 69640 }, { "epoch": 8.726350081443428, "grad_norm": 0.08222928643226624, "learning_rate": 6.932569092052927e-06, "loss": 0.4622, "num_input_tokens_seen": 84702240, "step": 69645 }, { "epoch": 8.726976569352212, "grad_norm": 0.09917163103818893, "learning_rate": 6.932064855287091e-06, "loss": 0.4618, "num_input_tokens_seen": 84708064, "step": 69650 }, { "epoch": 8.727603057260994, "grad_norm": 0.09302721172571182, "learning_rate": 6.931560595421795e-06, "loss": 0.4629, "num_input_tokens_seen": 84714240, "step": 69655 }, { "epoch": 8.728229545169778, "grad_norm": 0.08925985544919968, "learning_rate": 6.931056312463067e-06, "loss": 0.4605, "num_input_tokens_seen": 84720064, "step": 69660 }, { "epoch": 8.72885603307856, "grad_norm": 0.12755170464515686, "learning_rate": 6.930552006416938e-06, "loss": 0.4632, "num_input_tokens_seen": 84726240, "step": 69665 }, { "epoch": 8.729482520987345, "grad_norm": 0.10454057902097702, "learning_rate": 6.930047677289435e-06, "loss": 0.4603, "num_input_tokens_seen": 84732512, "step": 69670 }, { "epoch": 8.730109008896129, "grad_norm": 0.10808637738227844, "learning_rate": 6.929543325086592e-06, "loss": 0.4625, "num_input_tokens_seen": 84738432, "step": 69675 }, { "epoch": 8.730735496804911, "grad_norm": 0.1271849423646927, "learning_rate": 6.929038949814435e-06, "loss": 0.4508, "num_input_tokens_seen": 84744704, "step": 69680 }, { "epoch": 8.731361984713695, "grad_norm": 0.11168753355741501, "learning_rate": 6.928534551478996e-06, "loss": 0.4629, "num_input_tokens_seen": 84750848, "step": 69685 }, { "epoch": 8.73198847262248, "grad_norm": 0.14713504910469055, "learning_rate": 6.928030130086305e-06, "loss": 0.4662, "num_input_tokens_seen": 84756960, "step": 69690 }, { "epoch": 8.732614960531262, "grad_norm": 0.07541913539171219, "learning_rate": 6.927525685642392e-06, "loss": 0.4621, "num_input_tokens_seen": 84762848, "step": 69695 }, { "epoch": 8.733241448440046, "grad_norm": 0.0994877889752388, "learning_rate": 6.9270212181532905e-06, "loss": 0.4599, "num_input_tokens_seen": 84768800, "step": 69700 }, { "epoch": 8.733867936348828, "grad_norm": 0.10856114327907562, "learning_rate": 6.926516727625028e-06, "loss": 0.4577, "num_input_tokens_seen": 84774944, "step": 69705 }, { "epoch": 8.734494424257612, "grad_norm": 0.11195894330739975, "learning_rate": 6.926012214063639e-06, "loss": 0.4605, "num_input_tokens_seen": 84780896, "step": 69710 }, { "epoch": 8.735120912166396, "grad_norm": 0.0945766270160675, "learning_rate": 6.925507677475155e-06, "loss": 0.4601, "num_input_tokens_seen": 84786752, "step": 69715 }, { "epoch": 8.735747400075178, "grad_norm": 0.09625112265348434, "learning_rate": 6.925003117865608e-06, "loss": 0.4693, "num_input_tokens_seen": 84792448, "step": 69720 }, { "epoch": 8.736373887983962, "grad_norm": 0.09811975061893463, "learning_rate": 6.924498535241029e-06, "loss": 0.4615, "num_input_tokens_seen": 84798752, "step": 69725 }, { "epoch": 8.737000375892745, "grad_norm": 0.10223565995693207, "learning_rate": 6.9239939296074535e-06, "loss": 0.4618, "num_input_tokens_seen": 84804896, "step": 69730 }, { "epoch": 8.737626863801529, "grad_norm": 0.08816344290971756, "learning_rate": 6.923489300970913e-06, "loss": 0.4595, "num_input_tokens_seen": 84810880, "step": 69735 }, { "epoch": 8.738253351710313, "grad_norm": 0.15308323502540588, "learning_rate": 6.9229846493374394e-06, "loss": 0.4693, "num_input_tokens_seen": 84817280, "step": 69740 }, { "epoch": 8.738879839619095, "grad_norm": 0.10023263096809387, "learning_rate": 6.9224799747130685e-06, "loss": 0.4611, "num_input_tokens_seen": 84823232, "step": 69745 }, { "epoch": 8.739506327527879, "grad_norm": 0.06941324472427368, "learning_rate": 6.921975277103834e-06, "loss": 0.4561, "num_input_tokens_seen": 84829312, "step": 69750 }, { "epoch": 8.740132815436661, "grad_norm": 0.10822904109954834, "learning_rate": 6.921470556515768e-06, "loss": 0.4591, "num_input_tokens_seen": 84835520, "step": 69755 }, { "epoch": 8.740759303345445, "grad_norm": 0.07854389399290085, "learning_rate": 6.920965812954907e-06, "loss": 0.4606, "num_input_tokens_seen": 84841472, "step": 69760 }, { "epoch": 8.74138579125423, "grad_norm": 0.12663142383098602, "learning_rate": 6.920461046427283e-06, "loss": 0.46, "num_input_tokens_seen": 84847712, "step": 69765 }, { "epoch": 8.742012279163012, "grad_norm": 0.14407692849636078, "learning_rate": 6.919956256938933e-06, "loss": 0.46, "num_input_tokens_seen": 84853792, "step": 69770 }, { "epoch": 8.742638767071796, "grad_norm": 0.12067041546106339, "learning_rate": 6.919451444495893e-06, "loss": 0.4632, "num_input_tokens_seen": 84859936, "step": 69775 }, { "epoch": 8.743265254980578, "grad_norm": 0.13308842480182648, "learning_rate": 6.918946609104196e-06, "loss": 0.4615, "num_input_tokens_seen": 84866080, "step": 69780 }, { "epoch": 8.743891742889362, "grad_norm": 0.08805762231349945, "learning_rate": 6.918441750769879e-06, "loss": 0.4597, "num_input_tokens_seen": 84872096, "step": 69785 }, { "epoch": 8.744518230798146, "grad_norm": 0.12494318932294846, "learning_rate": 6.917936869498978e-06, "loss": 0.4622, "num_input_tokens_seen": 84878208, "step": 69790 }, { "epoch": 8.745144718706928, "grad_norm": 0.12411140650510788, "learning_rate": 6.9174319652975295e-06, "loss": 0.4679, "num_input_tokens_seen": 84884320, "step": 69795 }, { "epoch": 8.745771206615713, "grad_norm": 0.10926299542188644, "learning_rate": 6.916927038171571e-06, "loss": 0.4543, "num_input_tokens_seen": 84890240, "step": 69800 }, { "epoch": 8.746397694524497, "grad_norm": 0.14160099625587463, "learning_rate": 6.916422088127136e-06, "loss": 0.459, "num_input_tokens_seen": 84895840, "step": 69805 }, { "epoch": 8.747024182433279, "grad_norm": 0.14932756125926971, "learning_rate": 6.915917115170265e-06, "loss": 0.4515, "num_input_tokens_seen": 84902144, "step": 69810 }, { "epoch": 8.747650670342063, "grad_norm": 0.14008693397045135, "learning_rate": 6.915412119306993e-06, "loss": 0.4528, "num_input_tokens_seen": 84908480, "step": 69815 }, { "epoch": 8.748277158250845, "grad_norm": 0.12452834099531174, "learning_rate": 6.9149071005433585e-06, "loss": 0.4604, "num_input_tokens_seen": 84914912, "step": 69820 }, { "epoch": 8.74890364615963, "grad_norm": 0.14038512110710144, "learning_rate": 6.914402058885401e-06, "loss": 0.4593, "num_input_tokens_seen": 84921248, "step": 69825 }, { "epoch": 8.749530134068413, "grad_norm": 0.10888408124446869, "learning_rate": 6.913896994339155e-06, "loss": 0.4601, "num_input_tokens_seen": 84927200, "step": 69830 }, { "epoch": 8.750156621977196, "grad_norm": 0.12618577480316162, "learning_rate": 6.913391906910664e-06, "loss": 0.4612, "num_input_tokens_seen": 84933248, "step": 69835 }, { "epoch": 8.75078310988598, "grad_norm": 0.17383365333080292, "learning_rate": 6.9128867966059634e-06, "loss": 0.455, "num_input_tokens_seen": 84939520, "step": 69840 }, { "epoch": 8.751409597794762, "grad_norm": 0.14207617938518524, "learning_rate": 6.912381663431093e-06, "loss": 0.467, "num_input_tokens_seen": 84945536, "step": 69845 }, { "epoch": 8.752036085703546, "grad_norm": 0.13710126280784607, "learning_rate": 6.911876507392093e-06, "loss": 0.4572, "num_input_tokens_seen": 84951584, "step": 69850 }, { "epoch": 8.75266257361233, "grad_norm": 0.29018253087997437, "learning_rate": 6.911371328495e-06, "loss": 0.4749, "num_input_tokens_seen": 84957248, "step": 69855 }, { "epoch": 8.753289061521112, "grad_norm": 0.11980496346950531, "learning_rate": 6.910866126745857e-06, "loss": 0.4624, "num_input_tokens_seen": 84963296, "step": 69860 }, { "epoch": 8.753915549429896, "grad_norm": 0.12512943148612976, "learning_rate": 6.910360902150701e-06, "loss": 0.4562, "num_input_tokens_seen": 84969280, "step": 69865 }, { "epoch": 8.754542037338679, "grad_norm": 0.12194135040044785, "learning_rate": 6.909855654715576e-06, "loss": 0.4636, "num_input_tokens_seen": 84975776, "step": 69870 }, { "epoch": 8.755168525247463, "grad_norm": 0.1252107322216034, "learning_rate": 6.909350384446519e-06, "loss": 0.4712, "num_input_tokens_seen": 84981888, "step": 69875 }, { "epoch": 8.755795013156247, "grad_norm": 0.13297320902347565, "learning_rate": 6.908845091349576e-06, "loss": 0.4662, "num_input_tokens_seen": 84987392, "step": 69880 }, { "epoch": 8.756421501065029, "grad_norm": 0.1322483867406845, "learning_rate": 6.908339775430783e-06, "loss": 0.4573, "num_input_tokens_seen": 84993312, "step": 69885 }, { "epoch": 8.757047988973813, "grad_norm": 0.10957902669906616, "learning_rate": 6.9078344366961825e-06, "loss": 0.4553, "num_input_tokens_seen": 84999520, "step": 69890 }, { "epoch": 8.757674476882595, "grad_norm": 0.09615720063447952, "learning_rate": 6.907329075151818e-06, "loss": 0.4561, "num_input_tokens_seen": 85005728, "step": 69895 }, { "epoch": 8.75830096479138, "grad_norm": 0.11684715002775192, "learning_rate": 6.9068236908037325e-06, "loss": 0.468, "num_input_tokens_seen": 85011808, "step": 69900 }, { "epoch": 8.758927452700163, "grad_norm": 0.08924762904644012, "learning_rate": 6.906318283657966e-06, "loss": 0.4561, "num_input_tokens_seen": 85017920, "step": 69905 }, { "epoch": 8.759553940608946, "grad_norm": 0.13900510966777802, "learning_rate": 6.9058128537205616e-06, "loss": 0.4677, "num_input_tokens_seen": 85023936, "step": 69910 }, { "epoch": 8.76018042851773, "grad_norm": 0.1257258653640747, "learning_rate": 6.905307400997562e-06, "loss": 0.4714, "num_input_tokens_seen": 85030208, "step": 69915 }, { "epoch": 8.760806916426514, "grad_norm": 0.08755118399858475, "learning_rate": 6.904801925495011e-06, "loss": 0.4699, "num_input_tokens_seen": 85036224, "step": 69920 }, { "epoch": 8.761433404335296, "grad_norm": 0.15590907633304596, "learning_rate": 6.90429642721895e-06, "loss": 0.4608, "num_input_tokens_seen": 85042304, "step": 69925 }, { "epoch": 8.76205989224408, "grad_norm": 0.06550874561071396, "learning_rate": 6.903790906175425e-06, "loss": 0.4625, "num_input_tokens_seen": 85048416, "step": 69930 }, { "epoch": 8.762686380152862, "grad_norm": 0.09278591722249985, "learning_rate": 6.903285362370478e-06, "loss": 0.4625, "num_input_tokens_seen": 85054400, "step": 69935 }, { "epoch": 8.763312868061647, "grad_norm": 0.13549889624118805, "learning_rate": 6.902779795810157e-06, "loss": 0.4637, "num_input_tokens_seen": 85060672, "step": 69940 }, { "epoch": 8.763939355970429, "grad_norm": 0.09327038377523422, "learning_rate": 6.902274206500502e-06, "loss": 0.4554, "num_input_tokens_seen": 85066784, "step": 69945 }, { "epoch": 8.764565843879213, "grad_norm": 0.11453652381896973, "learning_rate": 6.901768594447559e-06, "loss": 0.4562, "num_input_tokens_seen": 85072576, "step": 69950 }, { "epoch": 8.765192331787997, "grad_norm": 0.1407879739999771, "learning_rate": 6.901262959657374e-06, "loss": 0.4621, "num_input_tokens_seen": 85078752, "step": 69955 }, { "epoch": 8.76581881969678, "grad_norm": 0.15401306748390198, "learning_rate": 6.900757302135993e-06, "loss": 0.4566, "num_input_tokens_seen": 85084672, "step": 69960 }, { "epoch": 8.766445307605563, "grad_norm": 0.11772848665714264, "learning_rate": 6.90025162188946e-06, "loss": 0.4522, "num_input_tokens_seen": 85091104, "step": 69965 }, { "epoch": 8.767071795514347, "grad_norm": 0.12031541019678116, "learning_rate": 6.89974591892382e-06, "loss": 0.4578, "num_input_tokens_seen": 85097216, "step": 69970 }, { "epoch": 8.76769828342313, "grad_norm": 0.19212490320205688, "learning_rate": 6.8992401932451215e-06, "loss": 0.4607, "num_input_tokens_seen": 85103616, "step": 69975 }, { "epoch": 8.768324771331914, "grad_norm": 0.1288972795009613, "learning_rate": 6.898734444859409e-06, "loss": 0.4516, "num_input_tokens_seen": 85109856, "step": 69980 }, { "epoch": 8.768951259240696, "grad_norm": 0.20974044501781464, "learning_rate": 6.89822867377273e-06, "loss": 0.4698, "num_input_tokens_seen": 85116096, "step": 69985 }, { "epoch": 8.76957774714948, "grad_norm": 0.2061624974012375, "learning_rate": 6.897722879991131e-06, "loss": 0.4688, "num_input_tokens_seen": 85122368, "step": 69990 }, { "epoch": 8.770204235058264, "grad_norm": 0.1869901865720749, "learning_rate": 6.8972170635206605e-06, "loss": 0.4692, "num_input_tokens_seen": 85128448, "step": 69995 }, { "epoch": 8.770830722967046, "grad_norm": 0.13046951591968536, "learning_rate": 6.896711224367364e-06, "loss": 0.4628, "num_input_tokens_seen": 85134272, "step": 70000 }, { "epoch": 8.77145721087583, "grad_norm": 0.10088706761598587, "learning_rate": 6.896205362537291e-06, "loss": 0.4671, "num_input_tokens_seen": 85139712, "step": 70005 }, { "epoch": 8.772083698784613, "grad_norm": 0.08799326419830322, "learning_rate": 6.895699478036488e-06, "loss": 0.4519, "num_input_tokens_seen": 85145632, "step": 70010 }, { "epoch": 8.772710186693397, "grad_norm": 0.09285348653793335, "learning_rate": 6.895193570871003e-06, "loss": 0.4697, "num_input_tokens_seen": 85151360, "step": 70015 }, { "epoch": 8.77333667460218, "grad_norm": 0.13186681270599365, "learning_rate": 6.8946876410468864e-06, "loss": 0.4608, "num_input_tokens_seen": 85157440, "step": 70020 }, { "epoch": 8.773963162510963, "grad_norm": 0.08348696678876877, "learning_rate": 6.894181688570186e-06, "loss": 0.4639, "num_input_tokens_seen": 85163456, "step": 70025 }, { "epoch": 8.774589650419747, "grad_norm": 0.1292012631893158, "learning_rate": 6.893675713446951e-06, "loss": 0.4694, "num_input_tokens_seen": 85169792, "step": 70030 }, { "epoch": 8.775216138328531, "grad_norm": 0.10903578996658325, "learning_rate": 6.89316971568323e-06, "loss": 0.4664, "num_input_tokens_seen": 85176096, "step": 70035 }, { "epoch": 8.775842626237313, "grad_norm": 0.09875789284706116, "learning_rate": 6.892663695285074e-06, "loss": 0.4668, "num_input_tokens_seen": 85181664, "step": 70040 }, { "epoch": 8.776469114146098, "grad_norm": 0.12301328778266907, "learning_rate": 6.892157652258533e-06, "loss": 0.4603, "num_input_tokens_seen": 85187744, "step": 70045 }, { "epoch": 8.77709560205488, "grad_norm": 0.09209056198596954, "learning_rate": 6.891651586609657e-06, "loss": 0.4577, "num_input_tokens_seen": 85193632, "step": 70050 }, { "epoch": 8.777722089963664, "grad_norm": 0.09708847105503082, "learning_rate": 6.891145498344494e-06, "loss": 0.4602, "num_input_tokens_seen": 85199584, "step": 70055 }, { "epoch": 8.778348577872446, "grad_norm": 0.12850846350193024, "learning_rate": 6.890639387469098e-06, "loss": 0.4692, "num_input_tokens_seen": 85205568, "step": 70060 }, { "epoch": 8.77897506578123, "grad_norm": 0.11063377559185028, "learning_rate": 6.890133253989517e-06, "loss": 0.469, "num_input_tokens_seen": 85211904, "step": 70065 }, { "epoch": 8.779601553690014, "grad_norm": 0.1307229846715927, "learning_rate": 6.889627097911806e-06, "loss": 0.4647, "num_input_tokens_seen": 85217344, "step": 70070 }, { "epoch": 8.780228041598797, "grad_norm": 0.11031678318977356, "learning_rate": 6.889120919242012e-06, "loss": 0.4691, "num_input_tokens_seen": 85223360, "step": 70075 }, { "epoch": 8.78085452950758, "grad_norm": 0.11496187001466751, "learning_rate": 6.888614717986191e-06, "loss": 0.4607, "num_input_tokens_seen": 85229472, "step": 70080 }, { "epoch": 8.781481017416365, "grad_norm": 0.09265531599521637, "learning_rate": 6.888108494150391e-06, "loss": 0.4613, "num_input_tokens_seen": 85235296, "step": 70085 }, { "epoch": 8.782107505325147, "grad_norm": 0.12195273488759995, "learning_rate": 6.887602247740668e-06, "loss": 0.4662, "num_input_tokens_seen": 85241280, "step": 70090 }, { "epoch": 8.782733993233931, "grad_norm": 0.09046671539545059, "learning_rate": 6.887095978763072e-06, "loss": 0.4591, "num_input_tokens_seen": 85247392, "step": 70095 }, { "epoch": 8.783360481142713, "grad_norm": 0.10549948364496231, "learning_rate": 6.886589687223659e-06, "loss": 0.4656, "num_input_tokens_seen": 85253568, "step": 70100 }, { "epoch": 8.783986969051497, "grad_norm": 0.13679368793964386, "learning_rate": 6.886083373128479e-06, "loss": 0.4593, "num_input_tokens_seen": 85259520, "step": 70105 }, { "epoch": 8.784613456960281, "grad_norm": 0.08145950734615326, "learning_rate": 6.885577036483586e-06, "loss": 0.4614, "num_input_tokens_seen": 85265536, "step": 70110 }, { "epoch": 8.785239944869064, "grad_norm": 0.10624977201223373, "learning_rate": 6.885070677295036e-06, "loss": 0.4594, "num_input_tokens_seen": 85271168, "step": 70115 }, { "epoch": 8.785866432777848, "grad_norm": 0.09841884672641754, "learning_rate": 6.88456429556888e-06, "loss": 0.4561, "num_input_tokens_seen": 85277344, "step": 70120 }, { "epoch": 8.78649292068663, "grad_norm": 0.10712730139493942, "learning_rate": 6.884057891311173e-06, "loss": 0.4726, "num_input_tokens_seen": 85283552, "step": 70125 }, { "epoch": 8.787119408595414, "grad_norm": 0.09559522569179535, "learning_rate": 6.88355146452797e-06, "loss": 0.4649, "num_input_tokens_seen": 85289376, "step": 70130 }, { "epoch": 8.787745896504198, "grad_norm": 0.07708822190761566, "learning_rate": 6.8830450152253255e-06, "loss": 0.464, "num_input_tokens_seen": 85295808, "step": 70135 }, { "epoch": 8.78837238441298, "grad_norm": 0.07374311983585358, "learning_rate": 6.882538543409295e-06, "loss": 0.46, "num_input_tokens_seen": 85301984, "step": 70140 }, { "epoch": 8.788998872321764, "grad_norm": 0.09654752910137177, "learning_rate": 6.882032049085934e-06, "loss": 0.4676, "num_input_tokens_seen": 85308448, "step": 70145 }, { "epoch": 8.789625360230547, "grad_norm": 0.07776438444852829, "learning_rate": 6.881525532261297e-06, "loss": 0.4589, "num_input_tokens_seen": 85314656, "step": 70150 }, { "epoch": 8.79025184813933, "grad_norm": 0.09376320242881775, "learning_rate": 6.881018992941441e-06, "loss": 0.4666, "num_input_tokens_seen": 85320864, "step": 70155 }, { "epoch": 8.790878336048115, "grad_norm": 0.11292559653520584, "learning_rate": 6.88051243113242e-06, "loss": 0.4663, "num_input_tokens_seen": 85327328, "step": 70160 }, { "epoch": 8.791504823956897, "grad_norm": 0.08528344333171844, "learning_rate": 6.880005846840293e-06, "loss": 0.4645, "num_input_tokens_seen": 85333344, "step": 70165 }, { "epoch": 8.792131311865681, "grad_norm": 0.08144164085388184, "learning_rate": 6.879499240071116e-06, "loss": 0.4585, "num_input_tokens_seen": 85339520, "step": 70170 }, { "epoch": 8.792757799774463, "grad_norm": 0.1084010899066925, "learning_rate": 6.878992610830944e-06, "loss": 0.4606, "num_input_tokens_seen": 85345216, "step": 70175 }, { "epoch": 8.793384287683248, "grad_norm": 0.08809981495141983, "learning_rate": 6.878485959125837e-06, "loss": 0.461, "num_input_tokens_seen": 85351136, "step": 70180 }, { "epoch": 8.794010775592032, "grad_norm": 0.11891351640224457, "learning_rate": 6.877979284961849e-06, "loss": 0.4613, "num_input_tokens_seen": 85357056, "step": 70185 }, { "epoch": 8.794637263500814, "grad_norm": 0.09646905958652496, "learning_rate": 6.877472588345041e-06, "loss": 0.4587, "num_input_tokens_seen": 85363168, "step": 70190 }, { "epoch": 8.795263751409598, "grad_norm": 0.10155760496854782, "learning_rate": 6.87696586928147e-06, "loss": 0.4654, "num_input_tokens_seen": 85369344, "step": 70195 }, { "epoch": 8.795890239318382, "grad_norm": 0.1239691749215126, "learning_rate": 6.876459127777193e-06, "loss": 0.4636, "num_input_tokens_seen": 85375520, "step": 70200 }, { "epoch": 8.796516727227164, "grad_norm": 0.10221373289823532, "learning_rate": 6.8759523638382695e-06, "loss": 0.4572, "num_input_tokens_seen": 85381376, "step": 70205 }, { "epoch": 8.797143215135948, "grad_norm": 0.06043549254536629, "learning_rate": 6.87544557747076e-06, "loss": 0.4603, "num_input_tokens_seen": 85387584, "step": 70210 }, { "epoch": 8.79776970304473, "grad_norm": 0.08502858877182007, "learning_rate": 6.87493876868072e-06, "loss": 0.4633, "num_input_tokens_seen": 85393536, "step": 70215 }, { "epoch": 8.798396190953515, "grad_norm": 0.09138613194227219, "learning_rate": 6.874431937474212e-06, "loss": 0.456, "num_input_tokens_seen": 85399648, "step": 70220 }, { "epoch": 8.799022678862299, "grad_norm": 0.09011822938919067, "learning_rate": 6.873925083857294e-06, "loss": 0.4636, "num_input_tokens_seen": 85406016, "step": 70225 }, { "epoch": 8.799649166771081, "grad_norm": 0.08952650427818298, "learning_rate": 6.8734182078360255e-06, "loss": 0.461, "num_input_tokens_seen": 85412160, "step": 70230 }, { "epoch": 8.800275654679865, "grad_norm": 0.09028059244155884, "learning_rate": 6.872911309416468e-06, "loss": 0.4671, "num_input_tokens_seen": 85418240, "step": 70235 }, { "epoch": 8.800902142588647, "grad_norm": 0.09125775098800659, "learning_rate": 6.87240438860468e-06, "loss": 0.4616, "num_input_tokens_seen": 85424352, "step": 70240 }, { "epoch": 8.801528630497431, "grad_norm": 0.08108417689800262, "learning_rate": 6.871897445406723e-06, "loss": 0.4618, "num_input_tokens_seen": 85430144, "step": 70245 }, { "epoch": 8.802155118406215, "grad_norm": 0.08251858502626419, "learning_rate": 6.87139047982866e-06, "loss": 0.4623, "num_input_tokens_seen": 85436288, "step": 70250 }, { "epoch": 8.802781606314998, "grad_norm": 0.09230600297451019, "learning_rate": 6.87088349187655e-06, "loss": 0.4624, "num_input_tokens_seen": 85442464, "step": 70255 }, { "epoch": 8.803408094223782, "grad_norm": 0.08291113376617432, "learning_rate": 6.870376481556453e-06, "loss": 0.4662, "num_input_tokens_seen": 85448384, "step": 70260 }, { "epoch": 8.804034582132564, "grad_norm": 0.09501038491725922, "learning_rate": 6.869869448874435e-06, "loss": 0.4618, "num_input_tokens_seen": 85454432, "step": 70265 }, { "epoch": 8.804661070041348, "grad_norm": 0.06507579237222672, "learning_rate": 6.8693623938365536e-06, "loss": 0.4648, "num_input_tokens_seen": 85460672, "step": 70270 }, { "epoch": 8.805287557950132, "grad_norm": 0.10931374877691269, "learning_rate": 6.868855316448875e-06, "loss": 0.4635, "num_input_tokens_seen": 85466880, "step": 70275 }, { "epoch": 8.805914045858914, "grad_norm": 0.08281850069761276, "learning_rate": 6.86834821671746e-06, "loss": 0.4527, "num_input_tokens_seen": 85473120, "step": 70280 }, { "epoch": 8.806540533767699, "grad_norm": 0.08588436990976334, "learning_rate": 6.86784109464837e-06, "loss": 0.4594, "num_input_tokens_seen": 85479072, "step": 70285 }, { "epoch": 8.80716702167648, "grad_norm": 0.07523074746131897, "learning_rate": 6.867333950247669e-06, "loss": 0.4546, "num_input_tokens_seen": 85485248, "step": 70290 }, { "epoch": 8.807793509585265, "grad_norm": 0.08050119876861572, "learning_rate": 6.866826783521421e-06, "loss": 0.4703, "num_input_tokens_seen": 85491232, "step": 70295 }, { "epoch": 8.808419997494049, "grad_norm": 0.10233155637979507, "learning_rate": 6.86631959447569e-06, "loss": 0.4544, "num_input_tokens_seen": 85497472, "step": 70300 }, { "epoch": 8.809046485402831, "grad_norm": 0.11075262725353241, "learning_rate": 6.865812383116539e-06, "loss": 0.4602, "num_input_tokens_seen": 85503328, "step": 70305 }, { "epoch": 8.809672973311615, "grad_norm": 0.06881902366876602, "learning_rate": 6.865305149450032e-06, "loss": 0.4621, "num_input_tokens_seen": 85509504, "step": 70310 }, { "epoch": 8.8102994612204, "grad_norm": 0.0908050462603569, "learning_rate": 6.864797893482234e-06, "loss": 0.4584, "num_input_tokens_seen": 85515840, "step": 70315 }, { "epoch": 8.810925949129182, "grad_norm": 0.1462511420249939, "learning_rate": 6.864290615219209e-06, "loss": 0.4544, "num_input_tokens_seen": 85522176, "step": 70320 }, { "epoch": 8.811552437037966, "grad_norm": 0.1449844092130661, "learning_rate": 6.863783314667022e-06, "loss": 0.4593, "num_input_tokens_seen": 85528384, "step": 70325 }, { "epoch": 8.812178924946748, "grad_norm": 0.08740749210119247, "learning_rate": 6.86327599183174e-06, "loss": 0.4581, "num_input_tokens_seen": 85534368, "step": 70330 }, { "epoch": 8.812805412855532, "grad_norm": 0.10604545474052429, "learning_rate": 6.862768646719425e-06, "loss": 0.4596, "num_input_tokens_seen": 85540768, "step": 70335 }, { "epoch": 8.813431900764316, "grad_norm": 0.1406448930501938, "learning_rate": 6.862261279336145e-06, "loss": 0.4641, "num_input_tokens_seen": 85546752, "step": 70340 }, { "epoch": 8.814058388673098, "grad_norm": 0.13014240562915802, "learning_rate": 6.861753889687965e-06, "loss": 0.4637, "num_input_tokens_seen": 85552512, "step": 70345 }, { "epoch": 8.814684876581882, "grad_norm": 0.1316329538822174, "learning_rate": 6.861246477780954e-06, "loss": 0.4694, "num_input_tokens_seen": 85558720, "step": 70350 }, { "epoch": 8.815311364490665, "grad_norm": 0.14130446314811707, "learning_rate": 6.860739043621176e-06, "loss": 0.4639, "num_input_tokens_seen": 85565184, "step": 70355 }, { "epoch": 8.815937852399449, "grad_norm": 0.10625892132520676, "learning_rate": 6.860231587214699e-06, "loss": 0.4528, "num_input_tokens_seen": 85570944, "step": 70360 }, { "epoch": 8.816564340308233, "grad_norm": 0.09358203411102295, "learning_rate": 6.859724108567587e-06, "loss": 0.4534, "num_input_tokens_seen": 85577376, "step": 70365 }, { "epoch": 8.817190828217015, "grad_norm": 0.13345791399478912, "learning_rate": 6.859216607685911e-06, "loss": 0.467, "num_input_tokens_seen": 85583168, "step": 70370 }, { "epoch": 8.817817316125799, "grad_norm": 0.14728383719921112, "learning_rate": 6.858709084575739e-06, "loss": 0.4619, "num_input_tokens_seen": 85589856, "step": 70375 }, { "epoch": 8.818443804034581, "grad_norm": 0.12969036400318146, "learning_rate": 6.8582015392431345e-06, "loss": 0.4653, "num_input_tokens_seen": 85595936, "step": 70380 }, { "epoch": 8.819070291943365, "grad_norm": 0.10758908092975616, "learning_rate": 6.857693971694168e-06, "loss": 0.4593, "num_input_tokens_seen": 85601856, "step": 70385 }, { "epoch": 8.81969677985215, "grad_norm": 0.148418128490448, "learning_rate": 6.857186381934911e-06, "loss": 0.4643, "num_input_tokens_seen": 85608192, "step": 70390 }, { "epoch": 8.820323267760932, "grad_norm": 0.1285068541765213, "learning_rate": 6.856678769971428e-06, "loss": 0.4653, "num_input_tokens_seen": 85614240, "step": 70395 }, { "epoch": 8.820949755669716, "grad_norm": 0.1057213768362999, "learning_rate": 6.856171135809789e-06, "loss": 0.468, "num_input_tokens_seen": 85620480, "step": 70400 }, { "epoch": 8.821576243578498, "grad_norm": 0.1355152726173401, "learning_rate": 6.8556634794560625e-06, "loss": 0.4673, "num_input_tokens_seen": 85626304, "step": 70405 }, { "epoch": 8.822202731487282, "grad_norm": 0.13872873783111572, "learning_rate": 6.855155800916321e-06, "loss": 0.4626, "num_input_tokens_seen": 85632544, "step": 70410 }, { "epoch": 8.822829219396066, "grad_norm": 0.11812666058540344, "learning_rate": 6.854648100196631e-06, "loss": 0.4674, "num_input_tokens_seen": 85638624, "step": 70415 }, { "epoch": 8.823455707304849, "grad_norm": 0.1706671267747879, "learning_rate": 6.8541403773030646e-06, "loss": 0.4664, "num_input_tokens_seen": 85644288, "step": 70420 }, { "epoch": 8.824082195213633, "grad_norm": 0.09166860580444336, "learning_rate": 6.85363263224169e-06, "loss": 0.4625, "num_input_tokens_seen": 85649600, "step": 70425 }, { "epoch": 8.824708683122417, "grad_norm": 0.1322355568408966, "learning_rate": 6.85312486501858e-06, "loss": 0.4656, "num_input_tokens_seen": 85655968, "step": 70430 }, { "epoch": 8.825335171031199, "grad_norm": 0.12963801622390747, "learning_rate": 6.852617075639803e-06, "loss": 0.4624, "num_input_tokens_seen": 85661312, "step": 70435 }, { "epoch": 8.825961658939983, "grad_norm": 0.09364663809537888, "learning_rate": 6.852109264111432e-06, "loss": 0.4666, "num_input_tokens_seen": 85667520, "step": 70440 }, { "epoch": 8.826588146848765, "grad_norm": 0.11125070601701736, "learning_rate": 6.851601430439537e-06, "loss": 0.4638, "num_input_tokens_seen": 85673504, "step": 70445 }, { "epoch": 8.82721463475755, "grad_norm": 0.07456144690513611, "learning_rate": 6.85109357463019e-06, "loss": 0.4676, "num_input_tokens_seen": 85679808, "step": 70450 }, { "epoch": 8.827841122666333, "grad_norm": 0.13697370886802673, "learning_rate": 6.8505856966894635e-06, "loss": 0.4666, "num_input_tokens_seen": 85685952, "step": 70455 }, { "epoch": 8.828467610575116, "grad_norm": 0.10771960020065308, "learning_rate": 6.850077796623429e-06, "loss": 0.4618, "num_input_tokens_seen": 85692096, "step": 70460 }, { "epoch": 8.8290940984839, "grad_norm": 0.1121583878993988, "learning_rate": 6.84956987443816e-06, "loss": 0.4697, "num_input_tokens_seen": 85698112, "step": 70465 }, { "epoch": 8.829720586392682, "grad_norm": 0.08774754405021667, "learning_rate": 6.8490619301397265e-06, "loss": 0.4609, "num_input_tokens_seen": 85704384, "step": 70470 }, { "epoch": 8.830347074301466, "grad_norm": 0.12579481303691864, "learning_rate": 6.848553963734205e-06, "loss": 0.4653, "num_input_tokens_seen": 85710688, "step": 70475 }, { "epoch": 8.83097356221025, "grad_norm": 0.08455043286085129, "learning_rate": 6.8480459752276654e-06, "loss": 0.4643, "num_input_tokens_seen": 85716736, "step": 70480 }, { "epoch": 8.831600050119032, "grad_norm": 0.12379907071590424, "learning_rate": 6.847537964626183e-06, "loss": 0.4563, "num_input_tokens_seen": 85722208, "step": 70485 }, { "epoch": 8.832226538027816, "grad_norm": 0.12088723480701447, "learning_rate": 6.84702993193583e-06, "loss": 0.4603, "num_input_tokens_seen": 85728544, "step": 70490 }, { "epoch": 8.832853025936599, "grad_norm": 0.07949692755937576, "learning_rate": 6.84652187716268e-06, "loss": 0.453, "num_input_tokens_seen": 85734048, "step": 70495 }, { "epoch": 8.833479513845383, "grad_norm": 0.08417509496212006, "learning_rate": 6.84601380031281e-06, "loss": 0.4597, "num_input_tokens_seen": 85740096, "step": 70500 }, { "epoch": 8.834106001754167, "grad_norm": 0.0804702639579773, "learning_rate": 6.845505701392294e-06, "loss": 0.4667, "num_input_tokens_seen": 85746368, "step": 70505 }, { "epoch": 8.834732489662949, "grad_norm": 0.09870865941047668, "learning_rate": 6.844997580407204e-06, "loss": 0.4627, "num_input_tokens_seen": 85752416, "step": 70510 }, { "epoch": 8.835358977571733, "grad_norm": 0.10816452652215958, "learning_rate": 6.8444894373636185e-06, "loss": 0.4648, "num_input_tokens_seen": 85758848, "step": 70515 }, { "epoch": 8.835985465480515, "grad_norm": 0.0795571580529213, "learning_rate": 6.843981272267611e-06, "loss": 0.4591, "num_input_tokens_seen": 85764960, "step": 70520 }, { "epoch": 8.8366119533893, "grad_norm": 0.06936067342758179, "learning_rate": 6.843473085125255e-06, "loss": 0.4605, "num_input_tokens_seen": 85771136, "step": 70525 }, { "epoch": 8.837238441298084, "grad_norm": 0.1396380215883255, "learning_rate": 6.84296487594263e-06, "loss": 0.4612, "num_input_tokens_seen": 85777248, "step": 70530 }, { "epoch": 8.837864929206866, "grad_norm": 0.07626958191394806, "learning_rate": 6.84245664472581e-06, "loss": 0.4669, "num_input_tokens_seen": 85783392, "step": 70535 }, { "epoch": 8.83849141711565, "grad_norm": 0.09008783102035522, "learning_rate": 6.841948391480871e-06, "loss": 0.459, "num_input_tokens_seen": 85789824, "step": 70540 }, { "epoch": 8.839117905024434, "grad_norm": 0.10601010173559189, "learning_rate": 6.8414401162138924e-06, "loss": 0.4574, "num_input_tokens_seen": 85795808, "step": 70545 }, { "epoch": 8.839744392933216, "grad_norm": 0.09072547405958176, "learning_rate": 6.840931818930946e-06, "loss": 0.4621, "num_input_tokens_seen": 85801664, "step": 70550 }, { "epoch": 8.840370880842, "grad_norm": 0.12735962867736816, "learning_rate": 6.8404234996381135e-06, "loss": 0.4614, "num_input_tokens_seen": 85807968, "step": 70555 }, { "epoch": 8.840997368750783, "grad_norm": 0.09304225444793701, "learning_rate": 6.839915158341472e-06, "loss": 0.4561, "num_input_tokens_seen": 85814240, "step": 70560 }, { "epoch": 8.841623856659567, "grad_norm": 0.13840363919734955, "learning_rate": 6.839406795047096e-06, "loss": 0.4638, "num_input_tokens_seen": 85820640, "step": 70565 }, { "epoch": 8.842250344568349, "grad_norm": 0.1255519539117813, "learning_rate": 6.838898409761065e-06, "loss": 0.4578, "num_input_tokens_seen": 85827168, "step": 70570 }, { "epoch": 8.842876832477133, "grad_norm": 0.135658860206604, "learning_rate": 6.838390002489459e-06, "loss": 0.4594, "num_input_tokens_seen": 85833632, "step": 70575 }, { "epoch": 8.843503320385917, "grad_norm": 0.11826808005571365, "learning_rate": 6.837881573238354e-06, "loss": 0.4586, "num_input_tokens_seen": 85839680, "step": 70580 }, { "epoch": 8.8441298082947, "grad_norm": 0.04854334890842438, "learning_rate": 6.837373122013831e-06, "loss": 0.4642, "num_input_tokens_seen": 85845216, "step": 70585 }, { "epoch": 8.844756296203483, "grad_norm": 0.10845556855201721, "learning_rate": 6.836864648821966e-06, "loss": 0.4576, "num_input_tokens_seen": 85851584, "step": 70590 }, { "epoch": 8.845382784112267, "grad_norm": 0.17250999808311462, "learning_rate": 6.836356153668841e-06, "loss": 0.4599, "num_input_tokens_seen": 85857568, "step": 70595 }, { "epoch": 8.84600927202105, "grad_norm": 0.0666198655962944, "learning_rate": 6.835847636560532e-06, "loss": 0.4573, "num_input_tokens_seen": 85863776, "step": 70600 }, { "epoch": 8.846635759929834, "grad_norm": 0.1263759732246399, "learning_rate": 6.8353390975031234e-06, "loss": 0.4555, "num_input_tokens_seen": 85870144, "step": 70605 }, { "epoch": 8.847262247838616, "grad_norm": 0.15755809843540192, "learning_rate": 6.834830536502691e-06, "loss": 0.4592, "num_input_tokens_seen": 85875872, "step": 70610 }, { "epoch": 8.8478887357474, "grad_norm": 0.12180683016777039, "learning_rate": 6.8343219535653185e-06, "loss": 0.4593, "num_input_tokens_seen": 85881920, "step": 70615 }, { "epoch": 8.848515223656184, "grad_norm": 0.16116972267627716, "learning_rate": 6.833813348697085e-06, "loss": 0.4623, "num_input_tokens_seen": 85888032, "step": 70620 }, { "epoch": 8.849141711564966, "grad_norm": 0.08413427323102951, "learning_rate": 6.8333047219040695e-06, "loss": 0.458, "num_input_tokens_seen": 85893856, "step": 70625 }, { "epoch": 8.84976819947375, "grad_norm": 0.07949367165565491, "learning_rate": 6.832796073192356e-06, "loss": 0.4552, "num_input_tokens_seen": 85899264, "step": 70630 }, { "epoch": 8.850394687382533, "grad_norm": 0.0904112383723259, "learning_rate": 6.832287402568023e-06, "loss": 0.4619, "num_input_tokens_seen": 85904704, "step": 70635 }, { "epoch": 8.851021175291317, "grad_norm": 0.14233307540416718, "learning_rate": 6.831778710037155e-06, "loss": 0.4545, "num_input_tokens_seen": 85911200, "step": 70640 }, { "epoch": 8.8516476632001, "grad_norm": 0.20156973600387573, "learning_rate": 6.831269995605832e-06, "loss": 0.4652, "num_input_tokens_seen": 85917504, "step": 70645 }, { "epoch": 8.852274151108883, "grad_norm": 0.16982834041118622, "learning_rate": 6.830761259280137e-06, "loss": 0.4715, "num_input_tokens_seen": 85923712, "step": 70650 }, { "epoch": 8.852900639017667, "grad_norm": 0.1436680406332016, "learning_rate": 6.83025250106615e-06, "loss": 0.4563, "num_input_tokens_seen": 85929952, "step": 70655 }, { "epoch": 8.85352712692645, "grad_norm": 0.1875181347131729, "learning_rate": 6.829743720969957e-06, "loss": 0.4587, "num_input_tokens_seen": 85936192, "step": 70660 }, { "epoch": 8.854153614835234, "grad_norm": 0.14240749180316925, "learning_rate": 6.82923491899764e-06, "loss": 0.4685, "num_input_tokens_seen": 85942272, "step": 70665 }, { "epoch": 8.854780102744018, "grad_norm": 0.15048421919345856, "learning_rate": 6.828726095155281e-06, "loss": 0.4541, "num_input_tokens_seen": 85948352, "step": 70670 }, { "epoch": 8.8554065906528, "grad_norm": 0.1804630309343338, "learning_rate": 6.828217249448965e-06, "loss": 0.4582, "num_input_tokens_seen": 85954432, "step": 70675 }, { "epoch": 8.856033078561584, "grad_norm": 0.1563992202281952, "learning_rate": 6.827708381884773e-06, "loss": 0.4531, "num_input_tokens_seen": 85960736, "step": 70680 }, { "epoch": 8.856659566470366, "grad_norm": 0.12299825251102448, "learning_rate": 6.827199492468792e-06, "loss": 0.462, "num_input_tokens_seen": 85967200, "step": 70685 }, { "epoch": 8.85728605437915, "grad_norm": 0.19684170186519623, "learning_rate": 6.826690581207105e-06, "loss": 0.4644, "num_input_tokens_seen": 85973152, "step": 70690 }, { "epoch": 8.857912542287934, "grad_norm": 0.1392790973186493, "learning_rate": 6.826181648105794e-06, "loss": 0.4584, "num_input_tokens_seen": 85979264, "step": 70695 }, { "epoch": 8.858539030196717, "grad_norm": 0.18365825712680817, "learning_rate": 6.825672693170949e-06, "loss": 0.4645, "num_input_tokens_seen": 85985088, "step": 70700 }, { "epoch": 8.8591655181055, "grad_norm": 0.20668740570545197, "learning_rate": 6.825163716408651e-06, "loss": 0.4641, "num_input_tokens_seen": 85991456, "step": 70705 }, { "epoch": 8.859792006014285, "grad_norm": 0.14193177223205566, "learning_rate": 6.824654717824985e-06, "loss": 0.4642, "num_input_tokens_seen": 85997728, "step": 70710 }, { "epoch": 8.860418493923067, "grad_norm": 0.14952382445335388, "learning_rate": 6.824145697426038e-06, "loss": 0.4491, "num_input_tokens_seen": 86003808, "step": 70715 }, { "epoch": 8.861044981831851, "grad_norm": 0.1499892920255661, "learning_rate": 6.8236366552178965e-06, "loss": 0.4659, "num_input_tokens_seen": 86009888, "step": 70720 }, { "epoch": 8.861671469740633, "grad_norm": 0.1428748071193695, "learning_rate": 6.823127591206645e-06, "loss": 0.4659, "num_input_tokens_seen": 86015680, "step": 70725 }, { "epoch": 8.862297957649417, "grad_norm": 0.12208853662014008, "learning_rate": 6.822618505398371e-06, "loss": 0.4529, "num_input_tokens_seen": 86021984, "step": 70730 }, { "epoch": 8.862924445558201, "grad_norm": 0.16593097150325775, "learning_rate": 6.822109397799161e-06, "loss": 0.4577, "num_input_tokens_seen": 86028096, "step": 70735 }, { "epoch": 8.863550933466984, "grad_norm": 0.15398068726062775, "learning_rate": 6.821600268415099e-06, "loss": 0.4634, "num_input_tokens_seen": 86033792, "step": 70740 }, { "epoch": 8.864177421375768, "grad_norm": 0.2963331937789917, "learning_rate": 6.8210911172522766e-06, "loss": 0.4708, "num_input_tokens_seen": 86039680, "step": 70745 }, { "epoch": 8.86480390928455, "grad_norm": 0.17881804704666138, "learning_rate": 6.820581944316776e-06, "loss": 0.4688, "num_input_tokens_seen": 86045280, "step": 70750 }, { "epoch": 8.865430397193334, "grad_norm": 0.13810187578201294, "learning_rate": 6.82007274961469e-06, "loss": 0.4603, "num_input_tokens_seen": 86051488, "step": 70755 }, { "epoch": 8.866056885102118, "grad_norm": 0.18658915162086487, "learning_rate": 6.819563533152103e-06, "loss": 0.4583, "num_input_tokens_seen": 86057696, "step": 70760 }, { "epoch": 8.8666833730109, "grad_norm": 0.1176670715212822, "learning_rate": 6.819054294935103e-06, "loss": 0.4698, "num_input_tokens_seen": 86063072, "step": 70765 }, { "epoch": 8.867309860919685, "grad_norm": 0.21522551774978638, "learning_rate": 6.818545034969782e-06, "loss": 0.4633, "num_input_tokens_seen": 86068896, "step": 70770 }, { "epoch": 8.867936348828467, "grad_norm": 0.10008500516414642, "learning_rate": 6.8180357532622255e-06, "loss": 0.4578, "num_input_tokens_seen": 86074912, "step": 70775 }, { "epoch": 8.86856283673725, "grad_norm": 0.10240834206342697, "learning_rate": 6.817526449818524e-06, "loss": 0.4642, "num_input_tokens_seen": 86081024, "step": 70780 }, { "epoch": 8.869189324646035, "grad_norm": 0.11964254826307297, "learning_rate": 6.817017124644764e-06, "loss": 0.4642, "num_input_tokens_seen": 86086912, "step": 70785 }, { "epoch": 8.869815812554817, "grad_norm": 0.15605396032333374, "learning_rate": 6.816507777747037e-06, "loss": 0.4644, "num_input_tokens_seen": 86092992, "step": 70790 }, { "epoch": 8.870442300463601, "grad_norm": 0.1307527869939804, "learning_rate": 6.8159984091314325e-06, "loss": 0.4592, "num_input_tokens_seen": 86099232, "step": 70795 }, { "epoch": 8.871068788372384, "grad_norm": 0.10174164921045303, "learning_rate": 6.81548901880404e-06, "loss": 0.4682, "num_input_tokens_seen": 86105120, "step": 70800 }, { "epoch": 8.871695276281168, "grad_norm": 0.11673444509506226, "learning_rate": 6.814979606770948e-06, "loss": 0.4654, "num_input_tokens_seen": 86110720, "step": 70805 }, { "epoch": 8.872321764189952, "grad_norm": 0.10974790155887604, "learning_rate": 6.814470173038253e-06, "loss": 0.4591, "num_input_tokens_seen": 86117344, "step": 70810 }, { "epoch": 8.872948252098734, "grad_norm": 0.2003099024295807, "learning_rate": 6.813960717612038e-06, "loss": 0.4667, "num_input_tokens_seen": 86123520, "step": 70815 }, { "epoch": 8.873574740007518, "grad_norm": 0.15560130774974823, "learning_rate": 6.813451240498399e-06, "loss": 0.4712, "num_input_tokens_seen": 86128672, "step": 70820 }, { "epoch": 8.874201227916302, "grad_norm": 0.16535185277462006, "learning_rate": 6.812941741703425e-06, "loss": 0.4531, "num_input_tokens_seen": 86134944, "step": 70825 }, { "epoch": 8.874827715825084, "grad_norm": 0.12292005866765976, "learning_rate": 6.8124322212332094e-06, "loss": 0.4602, "num_input_tokens_seen": 86141120, "step": 70830 }, { "epoch": 8.875454203733868, "grad_norm": 0.12181063741445541, "learning_rate": 6.811922679093842e-06, "loss": 0.4599, "num_input_tokens_seen": 86147168, "step": 70835 }, { "epoch": 8.87608069164265, "grad_norm": 0.12811459600925446, "learning_rate": 6.8114131152914166e-06, "loss": 0.458, "num_input_tokens_seen": 86153088, "step": 70840 }, { "epoch": 8.876707179551435, "grad_norm": 0.12492942810058594, "learning_rate": 6.810903529832023e-06, "loss": 0.4598, "num_input_tokens_seen": 86159168, "step": 70845 }, { "epoch": 8.877333667460219, "grad_norm": 0.10433322936296463, "learning_rate": 6.810393922721757e-06, "loss": 0.4635, "num_input_tokens_seen": 86165664, "step": 70850 }, { "epoch": 8.877960155369001, "grad_norm": 0.18584689497947693, "learning_rate": 6.809884293966709e-06, "loss": 0.4591, "num_input_tokens_seen": 86171648, "step": 70855 }, { "epoch": 8.878586643277785, "grad_norm": 0.09999629855155945, "learning_rate": 6.80937464357297e-06, "loss": 0.4602, "num_input_tokens_seen": 86177760, "step": 70860 }, { "epoch": 8.879213131186567, "grad_norm": 0.08786294609308243, "learning_rate": 6.808864971546638e-06, "loss": 0.4635, "num_input_tokens_seen": 86184352, "step": 70865 }, { "epoch": 8.879839619095351, "grad_norm": 0.1619882434606552, "learning_rate": 6.8083552778938055e-06, "loss": 0.4653, "num_input_tokens_seen": 86190624, "step": 70870 }, { "epoch": 8.880466107004136, "grad_norm": 0.1448858380317688, "learning_rate": 6.8078455626205645e-06, "loss": 0.4518, "num_input_tokens_seen": 86196288, "step": 70875 }, { "epoch": 8.881092594912918, "grad_norm": 0.1488177478313446, "learning_rate": 6.807335825733008e-06, "loss": 0.4645, "num_input_tokens_seen": 86201984, "step": 70880 }, { "epoch": 8.881719082821702, "grad_norm": 0.12502911686897278, "learning_rate": 6.806826067237233e-06, "loss": 0.4635, "num_input_tokens_seen": 86208192, "step": 70885 }, { "epoch": 8.882345570730484, "grad_norm": 0.10555683821439743, "learning_rate": 6.806316287139335e-06, "loss": 0.4563, "num_input_tokens_seen": 86214336, "step": 70890 }, { "epoch": 8.882972058639268, "grad_norm": 0.11079402267932892, "learning_rate": 6.805806485445406e-06, "loss": 0.4571, "num_input_tokens_seen": 86220224, "step": 70895 }, { "epoch": 8.883598546548052, "grad_norm": 0.14126437902450562, "learning_rate": 6.8052966621615426e-06, "loss": 0.4658, "num_input_tokens_seen": 86226208, "step": 70900 }, { "epoch": 8.884225034456835, "grad_norm": 0.13245674967765808, "learning_rate": 6.80478681729384e-06, "loss": 0.4655, "num_input_tokens_seen": 86232352, "step": 70905 }, { "epoch": 8.884851522365619, "grad_norm": 0.11298276484012604, "learning_rate": 6.804276950848392e-06, "loss": 0.4592, "num_input_tokens_seen": 86238816, "step": 70910 }, { "epoch": 8.8854780102744, "grad_norm": 0.11872744560241699, "learning_rate": 6.803767062831297e-06, "loss": 0.4591, "num_input_tokens_seen": 86244992, "step": 70915 }, { "epoch": 8.886104498183185, "grad_norm": 0.1007489413022995, "learning_rate": 6.80325715324865e-06, "loss": 0.4665, "num_input_tokens_seen": 86251040, "step": 70920 }, { "epoch": 8.886730986091969, "grad_norm": 0.0884796530008316, "learning_rate": 6.80274722210655e-06, "loss": 0.4572, "num_input_tokens_seen": 86257440, "step": 70925 }, { "epoch": 8.887357474000751, "grad_norm": 0.10545520484447479, "learning_rate": 6.802237269411088e-06, "loss": 0.4638, "num_input_tokens_seen": 86263744, "step": 70930 }, { "epoch": 8.887983961909535, "grad_norm": 0.10727142542600632, "learning_rate": 6.8017272951683665e-06, "loss": 0.4652, "num_input_tokens_seen": 86269664, "step": 70935 }, { "epoch": 8.88861044981832, "grad_norm": 0.13159802556037903, "learning_rate": 6.801217299384478e-06, "loss": 0.4708, "num_input_tokens_seen": 86276064, "step": 70940 }, { "epoch": 8.889236937727102, "grad_norm": 0.10135208070278168, "learning_rate": 6.800707282065524e-06, "loss": 0.4543, "num_input_tokens_seen": 86282080, "step": 70945 }, { "epoch": 8.889863425635886, "grad_norm": 0.18371683359146118, "learning_rate": 6.800197243217601e-06, "loss": 0.4676, "num_input_tokens_seen": 86288512, "step": 70950 }, { "epoch": 8.890489913544668, "grad_norm": 0.09346282482147217, "learning_rate": 6.799687182846806e-06, "loss": 0.4619, "num_input_tokens_seen": 86294720, "step": 70955 }, { "epoch": 8.891116401453452, "grad_norm": 0.10639732331037521, "learning_rate": 6.799177100959238e-06, "loss": 0.4629, "num_input_tokens_seen": 86300544, "step": 70960 }, { "epoch": 8.891742889362236, "grad_norm": 0.1584264487028122, "learning_rate": 6.798666997560992e-06, "loss": 0.4662, "num_input_tokens_seen": 86305984, "step": 70965 }, { "epoch": 8.892369377271018, "grad_norm": 0.08819398283958435, "learning_rate": 6.798156872658172e-06, "loss": 0.4606, "num_input_tokens_seen": 86311968, "step": 70970 }, { "epoch": 8.892995865179802, "grad_norm": 0.1127573773264885, "learning_rate": 6.7976467262568765e-06, "loss": 0.4529, "num_input_tokens_seen": 86317856, "step": 70975 }, { "epoch": 8.893622353088585, "grad_norm": 0.1170835793018341, "learning_rate": 6.797136558363202e-06, "loss": 0.4565, "num_input_tokens_seen": 86323808, "step": 70980 }, { "epoch": 8.894248840997369, "grad_norm": 0.09915214031934738, "learning_rate": 6.796626368983248e-06, "loss": 0.4574, "num_input_tokens_seen": 86329600, "step": 70985 }, { "epoch": 8.894875328906153, "grad_norm": 0.14686453342437744, "learning_rate": 6.796116158123116e-06, "loss": 0.462, "num_input_tokens_seen": 86335648, "step": 70990 }, { "epoch": 8.895501816814935, "grad_norm": 0.1420416682958603, "learning_rate": 6.795605925788905e-06, "loss": 0.4595, "num_input_tokens_seen": 86341504, "step": 70995 }, { "epoch": 8.89612830472372, "grad_norm": 0.1799747347831726, "learning_rate": 6.795095671986716e-06, "loss": 0.4618, "num_input_tokens_seen": 86347328, "step": 71000 }, { "epoch": 8.896754792632501, "grad_norm": 0.17718343436717987, "learning_rate": 6.794585396722647e-06, "loss": 0.4496, "num_input_tokens_seen": 86353888, "step": 71005 }, { "epoch": 8.897381280541286, "grad_norm": 0.15257643163204193, "learning_rate": 6.794075100002803e-06, "loss": 0.4711, "num_input_tokens_seen": 86360064, "step": 71010 }, { "epoch": 8.89800776845007, "grad_norm": 0.13688956201076508, "learning_rate": 6.793564781833282e-06, "loss": 0.4646, "num_input_tokens_seen": 86366240, "step": 71015 }, { "epoch": 8.898634256358852, "grad_norm": 0.15382370352745056, "learning_rate": 6.793054442220186e-06, "loss": 0.4548, "num_input_tokens_seen": 86372160, "step": 71020 }, { "epoch": 8.899260744267636, "grad_norm": 0.1401742398738861, "learning_rate": 6.7925440811696165e-06, "loss": 0.4632, "num_input_tokens_seen": 86377920, "step": 71025 }, { "epoch": 8.899887232176418, "grad_norm": 0.10644500702619553, "learning_rate": 6.792033698687676e-06, "loss": 0.4583, "num_input_tokens_seen": 86384224, "step": 71030 }, { "epoch": 8.900513720085202, "grad_norm": 0.1400829255580902, "learning_rate": 6.791523294780466e-06, "loss": 0.4708, "num_input_tokens_seen": 86390336, "step": 71035 }, { "epoch": 8.901140207993986, "grad_norm": 0.0988200306892395, "learning_rate": 6.7910128694540874e-06, "loss": 0.4646, "num_input_tokens_seen": 86396384, "step": 71040 }, { "epoch": 8.901766695902769, "grad_norm": 0.11906372010707855, "learning_rate": 6.790502422714647e-06, "loss": 0.4582, "num_input_tokens_seen": 86402848, "step": 71045 }, { "epoch": 8.902393183811553, "grad_norm": 0.12570489943027496, "learning_rate": 6.789991954568243e-06, "loss": 0.464, "num_input_tokens_seen": 86409248, "step": 71050 }, { "epoch": 8.903019671720337, "grad_norm": 0.07331006973981857, "learning_rate": 6.789481465020979e-06, "loss": 0.4692, "num_input_tokens_seen": 86415040, "step": 71055 }, { "epoch": 8.903646159629119, "grad_norm": 0.09782136976718903, "learning_rate": 6.7889709540789604e-06, "loss": 0.4539, "num_input_tokens_seen": 86420896, "step": 71060 }, { "epoch": 8.904272647537903, "grad_norm": 0.15879903733730316, "learning_rate": 6.788460421748291e-06, "loss": 0.4612, "num_input_tokens_seen": 86426880, "step": 71065 }, { "epoch": 8.904899135446685, "grad_norm": 0.07913436740636826, "learning_rate": 6.787949868035072e-06, "loss": 0.4646, "num_input_tokens_seen": 86432928, "step": 71070 }, { "epoch": 8.90552562335547, "grad_norm": 0.09707120805978775, "learning_rate": 6.78743929294541e-06, "loss": 0.4575, "num_input_tokens_seen": 86439328, "step": 71075 }, { "epoch": 8.906152111264252, "grad_norm": 0.134961798787117, "learning_rate": 6.7869286964854084e-06, "loss": 0.459, "num_input_tokens_seen": 86445440, "step": 71080 }, { "epoch": 8.906778599173036, "grad_norm": 0.09091933071613312, "learning_rate": 6.786418078661172e-06, "loss": 0.459, "num_input_tokens_seen": 86451392, "step": 71085 }, { "epoch": 8.90740508708182, "grad_norm": 0.11171048879623413, "learning_rate": 6.785907439478806e-06, "loss": 0.4622, "num_input_tokens_seen": 86457408, "step": 71090 }, { "epoch": 8.908031574990602, "grad_norm": 0.18122656643390656, "learning_rate": 6.785396778944415e-06, "loss": 0.4608, "num_input_tokens_seen": 86462816, "step": 71095 }, { "epoch": 8.908658062899386, "grad_norm": 0.11314492672681808, "learning_rate": 6.784886097064104e-06, "loss": 0.4574, "num_input_tokens_seen": 86468864, "step": 71100 }, { "epoch": 8.90928455080817, "grad_norm": 0.13480237126350403, "learning_rate": 6.784375393843979e-06, "loss": 0.4625, "num_input_tokens_seen": 86475296, "step": 71105 }, { "epoch": 8.909911038716952, "grad_norm": 0.1031753346323967, "learning_rate": 6.783864669290147e-06, "loss": 0.4597, "num_input_tokens_seen": 86481600, "step": 71110 }, { "epoch": 8.910537526625737, "grad_norm": 0.12082882970571518, "learning_rate": 6.783353923408711e-06, "loss": 0.4535, "num_input_tokens_seen": 86487680, "step": 71115 }, { "epoch": 8.911164014534519, "grad_norm": 0.0981065109372139, "learning_rate": 6.782843156205781e-06, "loss": 0.4694, "num_input_tokens_seen": 86493600, "step": 71120 }, { "epoch": 8.911790502443303, "grad_norm": 0.09557885676622391, "learning_rate": 6.782332367687461e-06, "loss": 0.4616, "num_input_tokens_seen": 86499616, "step": 71125 }, { "epoch": 8.912416990352087, "grad_norm": 0.09364516288042068, "learning_rate": 6.781821557859859e-06, "loss": 0.459, "num_input_tokens_seen": 86505856, "step": 71130 }, { "epoch": 8.91304347826087, "grad_norm": 0.10597865283489227, "learning_rate": 6.7813107267290835e-06, "loss": 0.4694, "num_input_tokens_seen": 86512224, "step": 71135 }, { "epoch": 8.913669966169653, "grad_norm": 0.15113966166973114, "learning_rate": 6.78079987430124e-06, "loss": 0.4554, "num_input_tokens_seen": 86517504, "step": 71140 }, { "epoch": 8.914296454078436, "grad_norm": 0.12667281925678253, "learning_rate": 6.780289000582438e-06, "loss": 0.4605, "num_input_tokens_seen": 86523616, "step": 71145 }, { "epoch": 8.91492294198722, "grad_norm": 0.09870904684066772, "learning_rate": 6.779778105578782e-06, "loss": 0.4623, "num_input_tokens_seen": 86529632, "step": 71150 }, { "epoch": 8.915549429896004, "grad_norm": 0.09315640479326248, "learning_rate": 6.7792671892963835e-06, "loss": 0.4576, "num_input_tokens_seen": 86535648, "step": 71155 }, { "epoch": 8.916175917804786, "grad_norm": 0.1219557672739029, "learning_rate": 6.7787562517413495e-06, "loss": 0.4693, "num_input_tokens_seen": 86542016, "step": 71160 }, { "epoch": 8.91680240571357, "grad_norm": 0.12157603353261948, "learning_rate": 6.778245292919791e-06, "loss": 0.4667, "num_input_tokens_seen": 86548096, "step": 71165 }, { "epoch": 8.917428893622354, "grad_norm": 0.1261596828699112, "learning_rate": 6.777734312837812e-06, "loss": 0.4595, "num_input_tokens_seen": 86554272, "step": 71170 }, { "epoch": 8.918055381531136, "grad_norm": 0.055023740977048874, "learning_rate": 6.777223311501526e-06, "loss": 0.4608, "num_input_tokens_seen": 86560448, "step": 71175 }, { "epoch": 8.91868186943992, "grad_norm": 0.14054156839847565, "learning_rate": 6.7767122889170414e-06, "loss": 0.4605, "num_input_tokens_seen": 86566560, "step": 71180 }, { "epoch": 8.919308357348703, "grad_norm": 0.16624930500984192, "learning_rate": 6.776201245090467e-06, "loss": 0.4591, "num_input_tokens_seen": 86572640, "step": 71185 }, { "epoch": 8.919934845257487, "grad_norm": 0.1269347071647644, "learning_rate": 6.775690180027913e-06, "loss": 0.459, "num_input_tokens_seen": 86578304, "step": 71190 }, { "epoch": 8.920561333166269, "grad_norm": 0.11289210617542267, "learning_rate": 6.77517909373549e-06, "loss": 0.4557, "num_input_tokens_seen": 86584576, "step": 71195 }, { "epoch": 8.921187821075053, "grad_norm": 0.11465835571289062, "learning_rate": 6.77466798621931e-06, "loss": 0.4613, "num_input_tokens_seen": 86590176, "step": 71200 }, { "epoch": 8.921814308983837, "grad_norm": 0.10551922768354416, "learning_rate": 6.774156857485481e-06, "loss": 0.4581, "num_input_tokens_seen": 86596160, "step": 71205 }, { "epoch": 8.92244079689262, "grad_norm": 0.1420004665851593, "learning_rate": 6.773645707540115e-06, "loss": 0.4604, "num_input_tokens_seen": 86602432, "step": 71210 }, { "epoch": 8.923067284801403, "grad_norm": 0.11286692321300507, "learning_rate": 6.773134536389324e-06, "loss": 0.4613, "num_input_tokens_seen": 86608864, "step": 71215 }, { "epoch": 8.923693772710187, "grad_norm": 0.09899935126304626, "learning_rate": 6.772623344039218e-06, "loss": 0.4619, "num_input_tokens_seen": 86615360, "step": 71220 }, { "epoch": 8.92432026061897, "grad_norm": 0.12980623543262482, "learning_rate": 6.77211213049591e-06, "loss": 0.4568, "num_input_tokens_seen": 86621376, "step": 71225 }, { "epoch": 8.924946748527754, "grad_norm": 0.1554621011018753, "learning_rate": 6.771600895765512e-06, "loss": 0.4647, "num_input_tokens_seen": 86627744, "step": 71230 }, { "epoch": 8.925573236436536, "grad_norm": 0.11457186937332153, "learning_rate": 6.771089639854136e-06, "loss": 0.4608, "num_input_tokens_seen": 86633888, "step": 71235 }, { "epoch": 8.92619972434532, "grad_norm": 0.09037264436483383, "learning_rate": 6.770578362767893e-06, "loss": 0.4601, "num_input_tokens_seen": 86639584, "step": 71240 }, { "epoch": 8.926826212254104, "grad_norm": 0.09763582795858383, "learning_rate": 6.7700670645128985e-06, "loss": 0.4753, "num_input_tokens_seen": 86645440, "step": 71245 }, { "epoch": 8.927452700162887, "grad_norm": 0.09031160175800323, "learning_rate": 6.769555745095262e-06, "loss": 0.4588, "num_input_tokens_seen": 86651552, "step": 71250 }, { "epoch": 8.92807918807167, "grad_norm": 0.10399012267589569, "learning_rate": 6.769044404521101e-06, "loss": 0.4659, "num_input_tokens_seen": 86657120, "step": 71255 }, { "epoch": 8.928705675980453, "grad_norm": 0.1243022233247757, "learning_rate": 6.768533042796526e-06, "loss": 0.4647, "num_input_tokens_seen": 86663520, "step": 71260 }, { "epoch": 8.929332163889237, "grad_norm": 0.13377194106578827, "learning_rate": 6.768021659927653e-06, "loss": 0.4609, "num_input_tokens_seen": 86669376, "step": 71265 }, { "epoch": 8.929958651798021, "grad_norm": 0.1929544359445572, "learning_rate": 6.767510255920593e-06, "loss": 0.4579, "num_input_tokens_seen": 86675200, "step": 71270 }, { "epoch": 8.930585139706803, "grad_norm": 0.10686173290014267, "learning_rate": 6.766998830781461e-06, "loss": 0.4517, "num_input_tokens_seen": 86681408, "step": 71275 }, { "epoch": 8.931211627615587, "grad_norm": 0.10506884753704071, "learning_rate": 6.766487384516374e-06, "loss": 0.4615, "num_input_tokens_seen": 86687040, "step": 71280 }, { "epoch": 8.93183811552437, "grad_norm": 0.11397886276245117, "learning_rate": 6.765975917131444e-06, "loss": 0.46, "num_input_tokens_seen": 86692800, "step": 71285 }, { "epoch": 8.932464603433154, "grad_norm": 0.10557813942432404, "learning_rate": 6.765464428632788e-06, "loss": 0.4711, "num_input_tokens_seen": 86699264, "step": 71290 }, { "epoch": 8.933091091341938, "grad_norm": 0.09762449562549591, "learning_rate": 6.764952919026519e-06, "loss": 0.4638, "num_input_tokens_seen": 86704832, "step": 71295 }, { "epoch": 8.93371757925072, "grad_norm": 0.11573665589094162, "learning_rate": 6.764441388318757e-06, "loss": 0.4659, "num_input_tokens_seen": 86710400, "step": 71300 }, { "epoch": 8.934344067159504, "grad_norm": 0.11017303168773651, "learning_rate": 6.763929836515613e-06, "loss": 0.4539, "num_input_tokens_seen": 86716544, "step": 71305 }, { "epoch": 8.934970555068286, "grad_norm": 0.13318446278572083, "learning_rate": 6.763418263623204e-06, "loss": 0.4575, "num_input_tokens_seen": 86722816, "step": 71310 }, { "epoch": 8.93559704297707, "grad_norm": 0.12581872940063477, "learning_rate": 6.762906669647646e-06, "loss": 0.4608, "num_input_tokens_seen": 86728256, "step": 71315 }, { "epoch": 8.936223530885854, "grad_norm": 0.1103651374578476, "learning_rate": 6.762395054595059e-06, "loss": 0.456, "num_input_tokens_seen": 86734272, "step": 71320 }, { "epoch": 8.936850018794637, "grad_norm": 0.10995936393737793, "learning_rate": 6.761883418471558e-06, "loss": 0.4687, "num_input_tokens_seen": 86739840, "step": 71325 }, { "epoch": 8.93747650670342, "grad_norm": 0.09487303346395493, "learning_rate": 6.761371761283256e-06, "loss": 0.4681, "num_input_tokens_seen": 86746304, "step": 71330 }, { "epoch": 8.938102994612205, "grad_norm": 0.13086369633674622, "learning_rate": 6.760860083036276e-06, "loss": 0.4658, "num_input_tokens_seen": 86752288, "step": 71335 }, { "epoch": 8.938729482520987, "grad_norm": 0.13780146837234497, "learning_rate": 6.760348383736733e-06, "loss": 0.4642, "num_input_tokens_seen": 86758656, "step": 71340 }, { "epoch": 8.939355970429771, "grad_norm": 0.09636103361845016, "learning_rate": 6.759836663390746e-06, "loss": 0.4659, "num_input_tokens_seen": 86764800, "step": 71345 }, { "epoch": 8.939982458338553, "grad_norm": 0.0859491303563118, "learning_rate": 6.759324922004431e-06, "loss": 0.4585, "num_input_tokens_seen": 86770944, "step": 71350 }, { "epoch": 8.940608946247337, "grad_norm": 0.09780792146921158, "learning_rate": 6.758813159583908e-06, "loss": 0.4655, "num_input_tokens_seen": 86776928, "step": 71355 }, { "epoch": 8.941235434156122, "grad_norm": 0.11711262166500092, "learning_rate": 6.758301376135294e-06, "loss": 0.4568, "num_input_tokens_seen": 86783136, "step": 71360 }, { "epoch": 8.941861922064904, "grad_norm": 0.1296040117740631, "learning_rate": 6.75778957166471e-06, "loss": 0.4584, "num_input_tokens_seen": 86789344, "step": 71365 }, { "epoch": 8.942488409973688, "grad_norm": 0.11391406506299973, "learning_rate": 6.7572777461782725e-06, "loss": 0.4638, "num_input_tokens_seen": 86795584, "step": 71370 }, { "epoch": 8.94311489788247, "grad_norm": 0.09374947100877762, "learning_rate": 6.756765899682103e-06, "loss": 0.4629, "num_input_tokens_seen": 86801600, "step": 71375 }, { "epoch": 8.943741385791254, "grad_norm": 0.10745947808027267, "learning_rate": 6.756254032182319e-06, "loss": 0.4695, "num_input_tokens_seen": 86807296, "step": 71380 }, { "epoch": 8.944367873700038, "grad_norm": 0.11078046262264252, "learning_rate": 6.755742143685043e-06, "loss": 0.4642, "num_input_tokens_seen": 86813440, "step": 71385 }, { "epoch": 8.94499436160882, "grad_norm": 0.08545064926147461, "learning_rate": 6.7552302341963925e-06, "loss": 0.4643, "num_input_tokens_seen": 86819488, "step": 71390 }, { "epoch": 8.945620849517605, "grad_norm": 0.09781918674707413, "learning_rate": 6.754718303722489e-06, "loss": 0.4622, "num_input_tokens_seen": 86825248, "step": 71395 }, { "epoch": 8.946247337426387, "grad_norm": 0.14810577034950256, "learning_rate": 6.754206352269454e-06, "loss": 0.4611, "num_input_tokens_seen": 86831264, "step": 71400 }, { "epoch": 8.946873825335171, "grad_norm": 0.11173580586910248, "learning_rate": 6.753694379843405e-06, "loss": 0.459, "num_input_tokens_seen": 86837632, "step": 71405 }, { "epoch": 8.947500313243955, "grad_norm": 0.08640304952859879, "learning_rate": 6.753182386450468e-06, "loss": 0.4726, "num_input_tokens_seen": 86843552, "step": 71410 }, { "epoch": 8.948126801152737, "grad_norm": 0.11769616603851318, "learning_rate": 6.75267037209676e-06, "loss": 0.4647, "num_input_tokens_seen": 86849632, "step": 71415 }, { "epoch": 8.948753289061521, "grad_norm": 0.08971767127513885, "learning_rate": 6.752158336788404e-06, "loss": 0.4538, "num_input_tokens_seen": 86855488, "step": 71420 }, { "epoch": 8.949379776970304, "grad_norm": 0.1477992683649063, "learning_rate": 6.751646280531522e-06, "loss": 0.4633, "num_input_tokens_seen": 86861696, "step": 71425 }, { "epoch": 8.950006264879088, "grad_norm": 0.16162338852882385, "learning_rate": 6.751134203332236e-06, "loss": 0.4641, "num_input_tokens_seen": 86867904, "step": 71430 }, { "epoch": 8.950632752787872, "grad_norm": 0.07966306060552597, "learning_rate": 6.750622105196667e-06, "loss": 0.4536, "num_input_tokens_seen": 86873984, "step": 71435 }, { "epoch": 8.951259240696654, "grad_norm": 0.10992942005395889, "learning_rate": 6.7501099861309415e-06, "loss": 0.4556, "num_input_tokens_seen": 86880096, "step": 71440 }, { "epoch": 8.951885728605438, "grad_norm": 0.15690231323242188, "learning_rate": 6.749597846141178e-06, "loss": 0.4633, "num_input_tokens_seen": 86886048, "step": 71445 }, { "epoch": 8.952512216514222, "grad_norm": 0.11940543353557587, "learning_rate": 6.749085685233503e-06, "loss": 0.459, "num_input_tokens_seen": 86891936, "step": 71450 }, { "epoch": 8.953138704423004, "grad_norm": 0.12346315383911133, "learning_rate": 6.748573503414037e-06, "loss": 0.4701, "num_input_tokens_seen": 86898176, "step": 71455 }, { "epoch": 8.953765192331788, "grad_norm": 0.11389050632715225, "learning_rate": 6.748061300688904e-06, "loss": 0.4632, "num_input_tokens_seen": 86904704, "step": 71460 }, { "epoch": 8.95439168024057, "grad_norm": 0.15769162774085999, "learning_rate": 6.747549077064231e-06, "loss": 0.4638, "num_input_tokens_seen": 86910592, "step": 71465 }, { "epoch": 8.955018168149355, "grad_norm": 0.09995714575052261, "learning_rate": 6.747036832546137e-06, "loss": 0.4662, "num_input_tokens_seen": 86916704, "step": 71470 }, { "epoch": 8.955644656058139, "grad_norm": 0.13047799468040466, "learning_rate": 6.7465245671407485e-06, "loss": 0.4634, "num_input_tokens_seen": 86922880, "step": 71475 }, { "epoch": 8.956271143966921, "grad_norm": 0.09529488533735275, "learning_rate": 6.746012280854192e-06, "loss": 0.4596, "num_input_tokens_seen": 86928960, "step": 71480 }, { "epoch": 8.956897631875705, "grad_norm": 0.12353822588920593, "learning_rate": 6.74549997369259e-06, "loss": 0.4623, "num_input_tokens_seen": 86935424, "step": 71485 }, { "epoch": 8.957524119784487, "grad_norm": 0.1493425816297531, "learning_rate": 6.744987645662068e-06, "loss": 0.4661, "num_input_tokens_seen": 86942144, "step": 71490 }, { "epoch": 8.958150607693272, "grad_norm": 0.10344481468200684, "learning_rate": 6.744475296768752e-06, "loss": 0.462, "num_input_tokens_seen": 86948288, "step": 71495 }, { "epoch": 8.958777095602056, "grad_norm": 0.08472422510385513, "learning_rate": 6.743962927018767e-06, "loss": 0.4591, "num_input_tokens_seen": 86954496, "step": 71500 }, { "epoch": 8.959403583510838, "grad_norm": 0.10230543464422226, "learning_rate": 6.743450536418239e-06, "loss": 0.4667, "num_input_tokens_seen": 86960768, "step": 71505 }, { "epoch": 8.960030071419622, "grad_norm": 0.11024068295955658, "learning_rate": 6.742938124973294e-06, "loss": 0.4617, "num_input_tokens_seen": 86967360, "step": 71510 }, { "epoch": 8.960656559328404, "grad_norm": 0.08283374458551407, "learning_rate": 6.742425692690059e-06, "loss": 0.4669, "num_input_tokens_seen": 86973344, "step": 71515 }, { "epoch": 8.961283047237188, "grad_norm": 0.11700140684843063, "learning_rate": 6.7419132395746585e-06, "loss": 0.465, "num_input_tokens_seen": 86979328, "step": 71520 }, { "epoch": 8.961909535145972, "grad_norm": 0.09799414873123169, "learning_rate": 6.741400765633223e-06, "loss": 0.4626, "num_input_tokens_seen": 86984768, "step": 71525 }, { "epoch": 8.962536023054755, "grad_norm": 0.08738614618778229, "learning_rate": 6.740888270871875e-06, "loss": 0.4608, "num_input_tokens_seen": 86990784, "step": 71530 }, { "epoch": 8.963162510963539, "grad_norm": 0.12862074375152588, "learning_rate": 6.7403757552967446e-06, "loss": 0.4653, "num_input_tokens_seen": 86997280, "step": 71535 }, { "epoch": 8.963788998872321, "grad_norm": 0.09803657233715057, "learning_rate": 6.739863218913959e-06, "loss": 0.4653, "num_input_tokens_seen": 87003552, "step": 71540 }, { "epoch": 8.964415486781105, "grad_norm": 0.1097053736448288, "learning_rate": 6.739350661729646e-06, "loss": 0.4594, "num_input_tokens_seen": 87009632, "step": 71545 }, { "epoch": 8.965041974689889, "grad_norm": 0.11638859659433365, "learning_rate": 6.738838083749933e-06, "loss": 0.4657, "num_input_tokens_seen": 87015200, "step": 71550 }, { "epoch": 8.965668462598671, "grad_norm": 0.14172129333019257, "learning_rate": 6.73832548498095e-06, "loss": 0.4571, "num_input_tokens_seen": 87021376, "step": 71555 }, { "epoch": 8.966294950507455, "grad_norm": 0.09229031950235367, "learning_rate": 6.737812865428822e-06, "loss": 0.4555, "num_input_tokens_seen": 87027424, "step": 71560 }, { "epoch": 8.96692143841624, "grad_norm": 0.12838949263095856, "learning_rate": 6.737300225099682e-06, "loss": 0.4676, "num_input_tokens_seen": 87033376, "step": 71565 }, { "epoch": 8.967547926325022, "grad_norm": 0.10644742846488953, "learning_rate": 6.736787563999657e-06, "loss": 0.4549, "num_input_tokens_seen": 87039296, "step": 71570 }, { "epoch": 8.968174414233806, "grad_norm": 0.1676878184080124, "learning_rate": 6.736274882134877e-06, "loss": 0.4633, "num_input_tokens_seen": 87045216, "step": 71575 }, { "epoch": 8.968800902142588, "grad_norm": 0.11574521660804749, "learning_rate": 6.735762179511469e-06, "loss": 0.4666, "num_input_tokens_seen": 87051616, "step": 71580 }, { "epoch": 8.969427390051372, "grad_norm": 0.12335637211799622, "learning_rate": 6.735249456135566e-06, "loss": 0.4725, "num_input_tokens_seen": 87057728, "step": 71585 }, { "epoch": 8.970053877960156, "grad_norm": 0.10130433738231659, "learning_rate": 6.734736712013297e-06, "loss": 0.4568, "num_input_tokens_seen": 87063744, "step": 71590 }, { "epoch": 8.970680365868938, "grad_norm": 0.11742135137319565, "learning_rate": 6.734223947150792e-06, "loss": 0.4615, "num_input_tokens_seen": 87069152, "step": 71595 }, { "epoch": 8.971306853777723, "grad_norm": 0.11014814674854279, "learning_rate": 6.733711161554182e-06, "loss": 0.4561, "num_input_tokens_seen": 87075136, "step": 71600 }, { "epoch": 8.971933341686505, "grad_norm": 0.11140754073858261, "learning_rate": 6.733198355229598e-06, "loss": 0.4586, "num_input_tokens_seen": 87081088, "step": 71605 }, { "epoch": 8.972559829595289, "grad_norm": 0.11967046558856964, "learning_rate": 6.732685528183169e-06, "loss": 0.4559, "num_input_tokens_seen": 87087296, "step": 71610 }, { "epoch": 8.973186317504073, "grad_norm": 0.08782106637954712, "learning_rate": 6.732172680421028e-06, "loss": 0.4613, "num_input_tokens_seen": 87093408, "step": 71615 }, { "epoch": 8.973812805412855, "grad_norm": 0.1311323195695877, "learning_rate": 6.731659811949307e-06, "loss": 0.4597, "num_input_tokens_seen": 87099552, "step": 71620 }, { "epoch": 8.97443929332164, "grad_norm": 0.13563093543052673, "learning_rate": 6.731146922774135e-06, "loss": 0.4597, "num_input_tokens_seen": 87105600, "step": 71625 }, { "epoch": 8.975065781230422, "grad_norm": 0.11607962101697922, "learning_rate": 6.730634012901649e-06, "loss": 0.4659, "num_input_tokens_seen": 87111744, "step": 71630 }, { "epoch": 8.975692269139206, "grad_norm": 0.09946952760219574, "learning_rate": 6.730121082337976e-06, "loss": 0.4596, "num_input_tokens_seen": 87117728, "step": 71635 }, { "epoch": 8.97631875704799, "grad_norm": 0.1462399661540985, "learning_rate": 6.729608131089252e-06, "loss": 0.4676, "num_input_tokens_seen": 87123808, "step": 71640 }, { "epoch": 8.976945244956772, "grad_norm": 0.09914372861385345, "learning_rate": 6.729095159161608e-06, "loss": 0.455, "num_input_tokens_seen": 87130048, "step": 71645 }, { "epoch": 8.977571732865556, "grad_norm": 0.08061131834983826, "learning_rate": 6.728582166561178e-06, "loss": 0.4571, "num_input_tokens_seen": 87136256, "step": 71650 }, { "epoch": 8.978198220774338, "grad_norm": 0.10873884707689285, "learning_rate": 6.728069153294096e-06, "loss": 0.4594, "num_input_tokens_seen": 87142560, "step": 71655 }, { "epoch": 8.978824708683122, "grad_norm": 0.11399257928133011, "learning_rate": 6.727556119366494e-06, "loss": 0.4659, "num_input_tokens_seen": 87148448, "step": 71660 }, { "epoch": 8.979451196591906, "grad_norm": 0.10586164891719818, "learning_rate": 6.7270430647845055e-06, "loss": 0.4638, "num_input_tokens_seen": 87154688, "step": 71665 }, { "epoch": 8.980077684500689, "grad_norm": 0.15880876779556274, "learning_rate": 6.726529989554265e-06, "loss": 0.4612, "num_input_tokens_seen": 87160896, "step": 71670 }, { "epoch": 8.980704172409473, "grad_norm": 0.10422413796186447, "learning_rate": 6.726016893681907e-06, "loss": 0.4555, "num_input_tokens_seen": 87167008, "step": 71675 }, { "epoch": 8.981330660318257, "grad_norm": 0.10611279308795929, "learning_rate": 6.725503777173565e-06, "loss": 0.4675, "num_input_tokens_seen": 87173184, "step": 71680 }, { "epoch": 8.981957148227039, "grad_norm": 0.10154782980680466, "learning_rate": 6.724990640035376e-06, "loss": 0.4639, "num_input_tokens_seen": 87179072, "step": 71685 }, { "epoch": 8.982583636135823, "grad_norm": 0.14267440140247345, "learning_rate": 6.724477482273472e-06, "loss": 0.466, "num_input_tokens_seen": 87185504, "step": 71690 }, { "epoch": 8.983210124044605, "grad_norm": 0.1631719022989273, "learning_rate": 6.723964303893992e-06, "loss": 0.4624, "num_input_tokens_seen": 87191648, "step": 71695 }, { "epoch": 8.98383661195339, "grad_norm": 0.12586122751235962, "learning_rate": 6.723451104903068e-06, "loss": 0.4721, "num_input_tokens_seen": 87197632, "step": 71700 }, { "epoch": 8.984463099862172, "grad_norm": 0.2323845773935318, "learning_rate": 6.722937885306837e-06, "loss": 0.4601, "num_input_tokens_seen": 87204000, "step": 71705 }, { "epoch": 8.985089587770956, "grad_norm": 0.10013601183891296, "learning_rate": 6.722424645111436e-06, "loss": 0.4584, "num_input_tokens_seen": 87210144, "step": 71710 }, { "epoch": 8.98571607567974, "grad_norm": 0.12790191173553467, "learning_rate": 6.721911384323e-06, "loss": 0.4579, "num_input_tokens_seen": 87216320, "step": 71715 }, { "epoch": 8.986342563588522, "grad_norm": 0.10011559724807739, "learning_rate": 6.7213981029476645e-06, "loss": 0.4606, "num_input_tokens_seen": 87222368, "step": 71720 }, { "epoch": 8.986969051497306, "grad_norm": 0.08236733078956604, "learning_rate": 6.720884800991569e-06, "loss": 0.4678, "num_input_tokens_seen": 87227872, "step": 71725 }, { "epoch": 8.98759553940609, "grad_norm": 0.13063907623291016, "learning_rate": 6.720371478460849e-06, "loss": 0.4607, "num_input_tokens_seen": 87233856, "step": 71730 }, { "epoch": 8.988222027314873, "grad_norm": 0.12279298156499863, "learning_rate": 6.719858135361639e-06, "loss": 0.462, "num_input_tokens_seen": 87240032, "step": 71735 }, { "epoch": 8.988848515223657, "grad_norm": 0.1214587613940239, "learning_rate": 6.719344771700081e-06, "loss": 0.4544, "num_input_tokens_seen": 87246208, "step": 71740 }, { "epoch": 8.989475003132439, "grad_norm": 0.13747495412826538, "learning_rate": 6.7188313874823094e-06, "loss": 0.4548, "num_input_tokens_seen": 87251584, "step": 71745 }, { "epoch": 8.990101491041223, "grad_norm": 0.12048370391130447, "learning_rate": 6.7183179827144654e-06, "loss": 0.4624, "num_input_tokens_seen": 87258112, "step": 71750 }, { "epoch": 8.990727978950007, "grad_norm": 0.14649909734725952, "learning_rate": 6.717804557402684e-06, "loss": 0.4734, "num_input_tokens_seen": 87264192, "step": 71755 }, { "epoch": 8.99135446685879, "grad_norm": 0.14882376790046692, "learning_rate": 6.717291111553106e-06, "loss": 0.4649, "num_input_tokens_seen": 87270496, "step": 71760 }, { "epoch": 8.991980954767573, "grad_norm": 0.0651743933558464, "learning_rate": 6.716777645171868e-06, "loss": 0.4675, "num_input_tokens_seen": 87276576, "step": 71765 }, { "epoch": 8.992607442676356, "grad_norm": 0.10975222289562225, "learning_rate": 6.716264158265109e-06, "loss": 0.4713, "num_input_tokens_seen": 87282944, "step": 71770 }, { "epoch": 8.99323393058514, "grad_norm": 0.10831587016582489, "learning_rate": 6.7157506508389705e-06, "loss": 0.4547, "num_input_tokens_seen": 87288288, "step": 71775 }, { "epoch": 8.993860418493924, "grad_norm": 0.11195609718561172, "learning_rate": 6.715237122899591e-06, "loss": 0.4565, "num_input_tokens_seen": 87294464, "step": 71780 }, { "epoch": 8.994486906402706, "grad_norm": 0.057534802705049515, "learning_rate": 6.714723574453108e-06, "loss": 0.4618, "num_input_tokens_seen": 87299712, "step": 71785 }, { "epoch": 8.99511339431149, "grad_norm": 0.07878255844116211, "learning_rate": 6.714210005505662e-06, "loss": 0.4695, "num_input_tokens_seen": 87305728, "step": 71790 }, { "epoch": 8.995739882220272, "grad_norm": 0.1309676617383957, "learning_rate": 6.7136964160633965e-06, "loss": 0.4598, "num_input_tokens_seen": 87311776, "step": 71795 }, { "epoch": 8.996366370129056, "grad_norm": 0.07964231073856354, "learning_rate": 6.713182806132449e-06, "loss": 0.4575, "num_input_tokens_seen": 87318080, "step": 71800 }, { "epoch": 8.99699285803784, "grad_norm": 0.1133980005979538, "learning_rate": 6.71266917571896e-06, "loss": 0.4555, "num_input_tokens_seen": 87324160, "step": 71805 }, { "epoch": 8.997619345946623, "grad_norm": 0.09671353548765182, "learning_rate": 6.7121555248290715e-06, "loss": 0.4646, "num_input_tokens_seen": 87329984, "step": 71810 }, { "epoch": 8.998245833855407, "grad_norm": 0.1140766441822052, "learning_rate": 6.7116418534689246e-06, "loss": 0.469, "num_input_tokens_seen": 87335968, "step": 71815 }, { "epoch": 8.998872321764189, "grad_norm": 0.06991249322891235, "learning_rate": 6.71112816164466e-06, "loss": 0.463, "num_input_tokens_seen": 87341984, "step": 71820 }, { "epoch": 8.999498809672973, "grad_norm": 0.12658023834228516, "learning_rate": 6.710614449362419e-06, "loss": 0.4565, "num_input_tokens_seen": 87347968, "step": 71825 }, { "epoch": 9.000125297581757, "grad_norm": 0.12082229554653168, "learning_rate": 6.710100716628345e-06, "loss": 0.4619, "num_input_tokens_seen": 87354144, "step": 71830 }, { "epoch": 9.00075178549054, "grad_norm": 0.11854115128517151, "learning_rate": 6.709586963448579e-06, "loss": 0.4607, "num_input_tokens_seen": 87360352, "step": 71835 }, { "epoch": 9.001378273399324, "grad_norm": 0.08416992425918579, "learning_rate": 6.709073189829261e-06, "loss": 0.4594, "num_input_tokens_seen": 87365920, "step": 71840 }, { "epoch": 9.002004761308108, "grad_norm": 0.13295724987983704, "learning_rate": 6.708559395776539e-06, "loss": 0.4602, "num_input_tokens_seen": 87371968, "step": 71845 }, { "epoch": 9.00263124921689, "grad_norm": 0.13527719676494598, "learning_rate": 6.708045581296553e-06, "loss": 0.461, "num_input_tokens_seen": 87378240, "step": 71850 }, { "epoch": 9.003257737125674, "grad_norm": 0.10579388588666916, "learning_rate": 6.7075317463954444e-06, "loss": 0.4537, "num_input_tokens_seen": 87383872, "step": 71855 }, { "epoch": 9.003884225034456, "grad_norm": 0.10062714666128159, "learning_rate": 6.7070178910793595e-06, "loss": 0.4663, "num_input_tokens_seen": 87390144, "step": 71860 }, { "epoch": 9.00451071294324, "grad_norm": 0.11101669073104858, "learning_rate": 6.70650401535444e-06, "loss": 0.4641, "num_input_tokens_seen": 87396448, "step": 71865 }, { "epoch": 9.005137200852024, "grad_norm": 0.07542423158884048, "learning_rate": 6.7059901192268304e-06, "loss": 0.4597, "num_input_tokens_seen": 87402432, "step": 71870 }, { "epoch": 9.005763688760807, "grad_norm": 0.07810793071985245, "learning_rate": 6.705476202702674e-06, "loss": 0.4564, "num_input_tokens_seen": 87408576, "step": 71875 }, { "epoch": 9.00639017666959, "grad_norm": 0.09664331376552582, "learning_rate": 6.704962265788117e-06, "loss": 0.4577, "num_input_tokens_seen": 87413504, "step": 71880 }, { "epoch": 9.007016664578373, "grad_norm": 0.11377265304327011, "learning_rate": 6.704448308489302e-06, "loss": 0.457, "num_input_tokens_seen": 87419328, "step": 71885 }, { "epoch": 9.007643152487157, "grad_norm": 0.1178249716758728, "learning_rate": 6.703934330812374e-06, "loss": 0.4651, "num_input_tokens_seen": 87425184, "step": 71890 }, { "epoch": 9.008269640395941, "grad_norm": 0.1347884088754654, "learning_rate": 6.703420332763479e-06, "loss": 0.4646, "num_input_tokens_seen": 87431456, "step": 71895 }, { "epoch": 9.008896128304723, "grad_norm": 0.1467691957950592, "learning_rate": 6.70290631434876e-06, "loss": 0.4682, "num_input_tokens_seen": 87437248, "step": 71900 }, { "epoch": 9.009522616213507, "grad_norm": 0.08926711231470108, "learning_rate": 6.702392275574367e-06, "loss": 0.4571, "num_input_tokens_seen": 87443424, "step": 71905 }, { "epoch": 9.01014910412229, "grad_norm": 0.1434548944234848, "learning_rate": 6.701878216446442e-06, "loss": 0.4698, "num_input_tokens_seen": 87449600, "step": 71910 }, { "epoch": 9.010775592031074, "grad_norm": 0.1493905484676361, "learning_rate": 6.701364136971132e-06, "loss": 0.4551, "num_input_tokens_seen": 87455808, "step": 71915 }, { "epoch": 9.011402079939858, "grad_norm": 0.11047754436731339, "learning_rate": 6.700850037154584e-06, "loss": 0.4784, "num_input_tokens_seen": 87461376, "step": 71920 }, { "epoch": 9.01202856784864, "grad_norm": 0.10035906732082367, "learning_rate": 6.700335917002942e-06, "loss": 0.4484, "num_input_tokens_seen": 87467520, "step": 71925 }, { "epoch": 9.012655055757424, "grad_norm": 0.11107128113508224, "learning_rate": 6.699821776522355e-06, "loss": 0.4587, "num_input_tokens_seen": 87473280, "step": 71930 }, { "epoch": 9.013281543666206, "grad_norm": 0.14437349140644073, "learning_rate": 6.699307615718971e-06, "loss": 0.4634, "num_input_tokens_seen": 87479104, "step": 71935 }, { "epoch": 9.01390803157499, "grad_norm": 0.0940481573343277, "learning_rate": 6.698793434598935e-06, "loss": 0.4713, "num_input_tokens_seen": 87484608, "step": 71940 }, { "epoch": 9.014534519483774, "grad_norm": 0.0966256856918335, "learning_rate": 6.698279233168395e-06, "loss": 0.4604, "num_input_tokens_seen": 87490976, "step": 71945 }, { "epoch": 9.015161007392557, "grad_norm": 0.09303305298089981, "learning_rate": 6.6977650114334995e-06, "loss": 0.4543, "num_input_tokens_seen": 87497344, "step": 71950 }, { "epoch": 9.01578749530134, "grad_norm": 0.09288658201694489, "learning_rate": 6.697250769400394e-06, "loss": 0.457, "num_input_tokens_seen": 87503744, "step": 71955 }, { "epoch": 9.016413983210125, "grad_norm": 0.09742730855941772, "learning_rate": 6.696736507075231e-06, "loss": 0.4622, "num_input_tokens_seen": 87509664, "step": 71960 }, { "epoch": 9.017040471118907, "grad_norm": 0.1282503455877304, "learning_rate": 6.696222224464155e-06, "loss": 0.4598, "num_input_tokens_seen": 87516064, "step": 71965 }, { "epoch": 9.017666959027691, "grad_norm": 0.11627783626317978, "learning_rate": 6.695707921573318e-06, "loss": 0.4618, "num_input_tokens_seen": 87522144, "step": 71970 }, { "epoch": 9.018293446936474, "grad_norm": 0.13125567138195038, "learning_rate": 6.695193598408866e-06, "loss": 0.4499, "num_input_tokens_seen": 87528000, "step": 71975 }, { "epoch": 9.018919934845258, "grad_norm": 0.0859491229057312, "learning_rate": 6.694679254976949e-06, "loss": 0.4565, "num_input_tokens_seen": 87534112, "step": 71980 }, { "epoch": 9.019546422754042, "grad_norm": 0.10448987036943436, "learning_rate": 6.694164891283716e-06, "loss": 0.4571, "num_input_tokens_seen": 87540160, "step": 71985 }, { "epoch": 9.020172910662824, "grad_norm": 0.10903072357177734, "learning_rate": 6.6936505073353184e-06, "loss": 0.4552, "num_input_tokens_seen": 87546432, "step": 71990 }, { "epoch": 9.020799398571608, "grad_norm": 0.13265849649906158, "learning_rate": 6.693136103137905e-06, "loss": 0.4674, "num_input_tokens_seen": 87552640, "step": 71995 }, { "epoch": 9.02142588648039, "grad_norm": 0.12281765788793564, "learning_rate": 6.692621678697625e-06, "loss": 0.4632, "num_input_tokens_seen": 87558848, "step": 72000 }, { "epoch": 9.022052374389174, "grad_norm": 0.0940825566649437, "learning_rate": 6.692107234020631e-06, "loss": 0.4696, "num_input_tokens_seen": 87564992, "step": 72005 }, { "epoch": 9.022678862297958, "grad_norm": 0.09107878804206848, "learning_rate": 6.691592769113072e-06, "loss": 0.4689, "num_input_tokens_seen": 87571200, "step": 72010 }, { "epoch": 9.02330535020674, "grad_norm": 0.10227629542350769, "learning_rate": 6.691078283981099e-06, "loss": 0.4617, "num_input_tokens_seen": 87577440, "step": 72015 }, { "epoch": 9.023931838115525, "grad_norm": 0.10353866964578629, "learning_rate": 6.690563778630864e-06, "loss": 0.4664, "num_input_tokens_seen": 87583776, "step": 72020 }, { "epoch": 9.024558326024307, "grad_norm": 0.11580405384302139, "learning_rate": 6.690049253068518e-06, "loss": 0.4654, "num_input_tokens_seen": 87590176, "step": 72025 }, { "epoch": 9.025184813933091, "grad_norm": 0.10491534322500229, "learning_rate": 6.689534707300211e-06, "loss": 0.4619, "num_input_tokens_seen": 87596256, "step": 72030 }, { "epoch": 9.025811301841875, "grad_norm": 0.1145549863576889, "learning_rate": 6.689020141332096e-06, "loss": 0.4632, "num_input_tokens_seen": 87602624, "step": 72035 }, { "epoch": 9.026437789750657, "grad_norm": 0.0982443243265152, "learning_rate": 6.688505555170327e-06, "loss": 0.4623, "num_input_tokens_seen": 87608672, "step": 72040 }, { "epoch": 9.027064277659441, "grad_norm": 0.11345798522233963, "learning_rate": 6.687990948821053e-06, "loss": 0.455, "num_input_tokens_seen": 87614912, "step": 72045 }, { "epoch": 9.027690765568224, "grad_norm": 0.11105993390083313, "learning_rate": 6.6874763222904275e-06, "loss": 0.4595, "num_input_tokens_seen": 87621120, "step": 72050 }, { "epoch": 9.028317253477008, "grad_norm": 0.11665501445531845, "learning_rate": 6.686961675584603e-06, "loss": 0.4614, "num_input_tokens_seen": 87627328, "step": 72055 }, { "epoch": 9.028943741385792, "grad_norm": 0.1323385089635849, "learning_rate": 6.686447008709735e-06, "loss": 0.4674, "num_input_tokens_seen": 87633504, "step": 72060 }, { "epoch": 9.029570229294574, "grad_norm": 0.12387564778327942, "learning_rate": 6.685932321671976e-06, "loss": 0.464, "num_input_tokens_seen": 87639392, "step": 72065 }, { "epoch": 9.030196717203358, "grad_norm": 0.09479908645153046, "learning_rate": 6.6854176144774785e-06, "loss": 0.4672, "num_input_tokens_seen": 87645760, "step": 72070 }, { "epoch": 9.030823205112142, "grad_norm": 0.11435776203870773, "learning_rate": 6.684902887132395e-06, "loss": 0.4618, "num_input_tokens_seen": 87651840, "step": 72075 }, { "epoch": 9.031449693020924, "grad_norm": 0.1033816933631897, "learning_rate": 6.684388139642881e-06, "loss": 0.4671, "num_input_tokens_seen": 87658112, "step": 72080 }, { "epoch": 9.032076180929709, "grad_norm": 0.11072918772697449, "learning_rate": 6.683873372015092e-06, "loss": 0.4686, "num_input_tokens_seen": 87664128, "step": 72085 }, { "epoch": 9.03270266883849, "grad_norm": 0.08613315224647522, "learning_rate": 6.6833585842551805e-06, "loss": 0.4583, "num_input_tokens_seen": 87670048, "step": 72090 }, { "epoch": 9.033329156747275, "grad_norm": 0.12721063196659088, "learning_rate": 6.682843776369301e-06, "loss": 0.4599, "num_input_tokens_seen": 87676096, "step": 72095 }, { "epoch": 9.033955644656059, "grad_norm": 0.1456829011440277, "learning_rate": 6.68232894836361e-06, "loss": 0.467, "num_input_tokens_seen": 87682144, "step": 72100 }, { "epoch": 9.034582132564841, "grad_norm": 0.11497444659471512, "learning_rate": 6.6818141002442615e-06, "loss": 0.4624, "num_input_tokens_seen": 87688320, "step": 72105 }, { "epoch": 9.035208620473625, "grad_norm": 0.14613841474056244, "learning_rate": 6.681299232017412e-06, "loss": 0.4542, "num_input_tokens_seen": 87694400, "step": 72110 }, { "epoch": 9.035835108382408, "grad_norm": 0.08125270903110504, "learning_rate": 6.680784343689217e-06, "loss": 0.4554, "num_input_tokens_seen": 87700576, "step": 72115 }, { "epoch": 9.036461596291192, "grad_norm": 0.1193058118224144, "learning_rate": 6.680269435265831e-06, "loss": 0.46, "num_input_tokens_seen": 87706368, "step": 72120 }, { "epoch": 9.037088084199976, "grad_norm": 0.09442538022994995, "learning_rate": 6.679754506753411e-06, "loss": 0.4696, "num_input_tokens_seen": 87712672, "step": 72125 }, { "epoch": 9.037714572108758, "grad_norm": 0.172274649143219, "learning_rate": 6.679239558158115e-06, "loss": 0.4632, "num_input_tokens_seen": 87717952, "step": 72130 }, { "epoch": 9.038341060017542, "grad_norm": 0.10703660547733307, "learning_rate": 6.678724589486097e-06, "loss": 0.4612, "num_input_tokens_seen": 87723840, "step": 72135 }, { "epoch": 9.038967547926324, "grad_norm": 0.1029956117272377, "learning_rate": 6.678209600743516e-06, "loss": 0.4568, "num_input_tokens_seen": 87729984, "step": 72140 }, { "epoch": 9.039594035835108, "grad_norm": 0.1480369120836258, "learning_rate": 6.677694591936528e-06, "loss": 0.46, "num_input_tokens_seen": 87736128, "step": 72145 }, { "epoch": 9.040220523743892, "grad_norm": 0.13547979295253754, "learning_rate": 6.6771795630712896e-06, "loss": 0.4631, "num_input_tokens_seen": 87742240, "step": 72150 }, { "epoch": 9.040847011652675, "grad_norm": 0.15229976177215576, "learning_rate": 6.676664514153961e-06, "loss": 0.4618, "num_input_tokens_seen": 87748064, "step": 72155 }, { "epoch": 9.041473499561459, "grad_norm": 0.1690274477005005, "learning_rate": 6.6761494451906985e-06, "loss": 0.4679, "num_input_tokens_seen": 87754080, "step": 72160 }, { "epoch": 9.042099987470241, "grad_norm": 0.09950532019138336, "learning_rate": 6.675634356187659e-06, "loss": 0.468, "num_input_tokens_seen": 87760128, "step": 72165 }, { "epoch": 9.042726475379025, "grad_norm": 0.1218973770737648, "learning_rate": 6.675119247151003e-06, "loss": 0.4589, "num_input_tokens_seen": 87766432, "step": 72170 }, { "epoch": 9.04335296328781, "grad_norm": 0.10671170800924301, "learning_rate": 6.674604118086887e-06, "loss": 0.4667, "num_input_tokens_seen": 87772480, "step": 72175 }, { "epoch": 9.043979451196591, "grad_norm": 0.11408396810293198, "learning_rate": 6.674088969001471e-06, "loss": 0.4612, "num_input_tokens_seen": 87778240, "step": 72180 }, { "epoch": 9.044605939105375, "grad_norm": 0.0849260464310646, "learning_rate": 6.673573799900914e-06, "loss": 0.4598, "num_input_tokens_seen": 87784256, "step": 72185 }, { "epoch": 9.045232427014158, "grad_norm": 0.12042205780744553, "learning_rate": 6.673058610791375e-06, "loss": 0.4585, "num_input_tokens_seen": 87790144, "step": 72190 }, { "epoch": 9.045858914922942, "grad_norm": 0.12164772301912308, "learning_rate": 6.672543401679015e-06, "loss": 0.465, "num_input_tokens_seen": 87796192, "step": 72195 }, { "epoch": 9.046485402831726, "grad_norm": 0.13975301384925842, "learning_rate": 6.672028172569991e-06, "loss": 0.4553, "num_input_tokens_seen": 87802176, "step": 72200 }, { "epoch": 9.047111890740508, "grad_norm": 0.11103629320859909, "learning_rate": 6.671512923470465e-06, "loss": 0.462, "num_input_tokens_seen": 87808160, "step": 72205 }, { "epoch": 9.047738378649292, "grad_norm": 0.12559862434864044, "learning_rate": 6.670997654386596e-06, "loss": 0.4586, "num_input_tokens_seen": 87814560, "step": 72210 }, { "epoch": 9.048364866558076, "grad_norm": 0.12768077850341797, "learning_rate": 6.670482365324547e-06, "loss": 0.4635, "num_input_tokens_seen": 87820576, "step": 72215 }, { "epoch": 9.048991354466859, "grad_norm": 0.11065756529569626, "learning_rate": 6.669967056290475e-06, "loss": 0.4635, "num_input_tokens_seen": 87825920, "step": 72220 }, { "epoch": 9.049617842375643, "grad_norm": 0.15937258303165436, "learning_rate": 6.6694517272905435e-06, "loss": 0.4584, "num_input_tokens_seen": 87832288, "step": 72225 }, { "epoch": 9.050244330284425, "grad_norm": 0.14710573852062225, "learning_rate": 6.668936378330913e-06, "loss": 0.4667, "num_input_tokens_seen": 87838240, "step": 72230 }, { "epoch": 9.050870818193209, "grad_norm": 0.14441730082035065, "learning_rate": 6.668421009417745e-06, "loss": 0.456, "num_input_tokens_seen": 87844320, "step": 72235 }, { "epoch": 9.051497306101993, "grad_norm": 0.08866795897483826, "learning_rate": 6.6679056205572e-06, "loss": 0.4717, "num_input_tokens_seen": 87850208, "step": 72240 }, { "epoch": 9.052123794010775, "grad_norm": 0.13635669648647308, "learning_rate": 6.6673902117554425e-06, "loss": 0.4611, "num_input_tokens_seen": 87856320, "step": 72245 }, { "epoch": 9.05275028191956, "grad_norm": 0.09795129299163818, "learning_rate": 6.666874783018633e-06, "loss": 0.4529, "num_input_tokens_seen": 87862400, "step": 72250 }, { "epoch": 9.053376769828342, "grad_norm": 0.18475188314914703, "learning_rate": 6.666359334352932e-06, "loss": 0.4475, "num_input_tokens_seen": 87868640, "step": 72255 }, { "epoch": 9.054003257737126, "grad_norm": 0.10028044879436493, "learning_rate": 6.6658438657645065e-06, "loss": 0.4617, "num_input_tokens_seen": 87874432, "step": 72260 }, { "epoch": 9.05462974564591, "grad_norm": 0.11175645887851715, "learning_rate": 6.665328377259517e-06, "loss": 0.4496, "num_input_tokens_seen": 87880480, "step": 72265 }, { "epoch": 9.055256233554692, "grad_norm": 0.16612640023231506, "learning_rate": 6.664812868844126e-06, "loss": 0.4633, "num_input_tokens_seen": 87886656, "step": 72270 }, { "epoch": 9.055882721463476, "grad_norm": 0.11277797818183899, "learning_rate": 6.664297340524497e-06, "loss": 0.4607, "num_input_tokens_seen": 87892416, "step": 72275 }, { "epoch": 9.056509209372258, "grad_norm": 0.12163183838129044, "learning_rate": 6.663781792306795e-06, "loss": 0.4594, "num_input_tokens_seen": 87898496, "step": 72280 }, { "epoch": 9.057135697281042, "grad_norm": 0.0993841215968132, "learning_rate": 6.663266224197182e-06, "loss": 0.4687, "num_input_tokens_seen": 87904416, "step": 72285 }, { "epoch": 9.057762185189826, "grad_norm": 0.1188177838921547, "learning_rate": 6.6627506362018234e-06, "loss": 0.4728, "num_input_tokens_seen": 87910976, "step": 72290 }, { "epoch": 9.058388673098609, "grad_norm": 0.12238252907991409, "learning_rate": 6.6622350283268835e-06, "loss": 0.4562, "num_input_tokens_seen": 87917152, "step": 72295 }, { "epoch": 9.059015161007393, "grad_norm": 0.0787917971611023, "learning_rate": 6.6617194005785245e-06, "loss": 0.4633, "num_input_tokens_seen": 87923520, "step": 72300 }, { "epoch": 9.059641648916175, "grad_norm": 0.14059419929981232, "learning_rate": 6.661203752962915e-06, "loss": 0.4572, "num_input_tokens_seen": 87929888, "step": 72305 }, { "epoch": 9.06026813682496, "grad_norm": 0.07825201749801636, "learning_rate": 6.660688085486216e-06, "loss": 0.4678, "num_input_tokens_seen": 87936000, "step": 72310 }, { "epoch": 9.060894624733743, "grad_norm": 0.1055571585893631, "learning_rate": 6.660172398154595e-06, "loss": 0.4614, "num_input_tokens_seen": 87941696, "step": 72315 }, { "epoch": 9.061521112642525, "grad_norm": 0.12181741744279861, "learning_rate": 6.6596566909742185e-06, "loss": 0.462, "num_input_tokens_seen": 87947360, "step": 72320 }, { "epoch": 9.06214760055131, "grad_norm": 0.12102928012609482, "learning_rate": 6.65914096395125e-06, "loss": 0.4551, "num_input_tokens_seen": 87953504, "step": 72325 }, { "epoch": 9.062774088460094, "grad_norm": 0.14741972088813782, "learning_rate": 6.658625217091856e-06, "loss": 0.4631, "num_input_tokens_seen": 87959936, "step": 72330 }, { "epoch": 9.063400576368876, "grad_norm": 0.12950731813907623, "learning_rate": 6.658109450402205e-06, "loss": 0.4646, "num_input_tokens_seen": 87966112, "step": 72335 }, { "epoch": 9.06402706427766, "grad_norm": 0.15239141881465912, "learning_rate": 6.65759366388846e-06, "loss": 0.4568, "num_input_tokens_seen": 87972192, "step": 72340 }, { "epoch": 9.064653552186442, "grad_norm": 0.13897991180419922, "learning_rate": 6.657077857556789e-06, "loss": 0.4485, "num_input_tokens_seen": 87978496, "step": 72345 }, { "epoch": 9.065280040095226, "grad_norm": 0.13943421840667725, "learning_rate": 6.6565620314133605e-06, "loss": 0.4564, "num_input_tokens_seen": 87984640, "step": 72350 }, { "epoch": 9.06590652800401, "grad_norm": 0.236571803689003, "learning_rate": 6.656046185464338e-06, "loss": 0.462, "num_input_tokens_seen": 87990944, "step": 72355 }, { "epoch": 9.066533015912793, "grad_norm": 0.14525984227657318, "learning_rate": 6.6555303197158915e-06, "loss": 0.4468, "num_input_tokens_seen": 87997312, "step": 72360 }, { "epoch": 9.067159503821577, "grad_norm": 0.16644497215747833, "learning_rate": 6.6550144341741875e-06, "loss": 0.4512, "num_input_tokens_seen": 88002784, "step": 72365 }, { "epoch": 9.067785991730359, "grad_norm": 0.16420121490955353, "learning_rate": 6.654498528845397e-06, "loss": 0.4541, "num_input_tokens_seen": 88008096, "step": 72370 }, { "epoch": 9.068412479639143, "grad_norm": 0.12743833661079407, "learning_rate": 6.6539826037356835e-06, "loss": 0.4561, "num_input_tokens_seen": 88014656, "step": 72375 }, { "epoch": 9.069038967547927, "grad_norm": 0.10870695114135742, "learning_rate": 6.653466658851217e-06, "loss": 0.4458, "num_input_tokens_seen": 88020896, "step": 72380 }, { "epoch": 9.06966545545671, "grad_norm": 0.1552356630563736, "learning_rate": 6.652950694198168e-06, "loss": 0.4714, "num_input_tokens_seen": 88027104, "step": 72385 }, { "epoch": 9.070291943365493, "grad_norm": 0.17595826089382172, "learning_rate": 6.652434709782705e-06, "loss": 0.4568, "num_input_tokens_seen": 88033344, "step": 72390 }, { "epoch": 9.070918431274276, "grad_norm": 0.14381706714630127, "learning_rate": 6.651918705610994e-06, "loss": 0.4614, "num_input_tokens_seen": 88038912, "step": 72395 }, { "epoch": 9.07154491918306, "grad_norm": 0.1541607677936554, "learning_rate": 6.6514026816892056e-06, "loss": 0.4676, "num_input_tokens_seen": 88044416, "step": 72400 }, { "epoch": 9.072171407091844, "grad_norm": 0.19730441272258759, "learning_rate": 6.650886638023508e-06, "loss": 0.464, "num_input_tokens_seen": 88050496, "step": 72405 }, { "epoch": 9.072797895000626, "grad_norm": 0.1885400265455246, "learning_rate": 6.650370574620076e-06, "loss": 0.4636, "num_input_tokens_seen": 88056608, "step": 72410 }, { "epoch": 9.07342438290941, "grad_norm": 0.21667686104774475, "learning_rate": 6.649854491485075e-06, "loss": 0.4587, "num_input_tokens_seen": 88062752, "step": 72415 }, { "epoch": 9.074050870818192, "grad_norm": 0.19715403020381927, "learning_rate": 6.6493383886246766e-06, "loss": 0.4629, "num_input_tokens_seen": 88068800, "step": 72420 }, { "epoch": 9.074677358726976, "grad_norm": 0.19553051888942719, "learning_rate": 6.648822266045049e-06, "loss": 0.4651, "num_input_tokens_seen": 88075328, "step": 72425 }, { "epoch": 9.07530384663576, "grad_norm": 0.16089460253715515, "learning_rate": 6.648306123752368e-06, "loss": 0.4783, "num_input_tokens_seen": 88080864, "step": 72430 }, { "epoch": 9.075930334544543, "grad_norm": 0.17240864038467407, "learning_rate": 6.647789961752799e-06, "loss": 0.4598, "num_input_tokens_seen": 88087264, "step": 72435 }, { "epoch": 9.076556822453327, "grad_norm": 0.27179446816444397, "learning_rate": 6.647273780052518e-06, "loss": 0.4577, "num_input_tokens_seen": 88092832, "step": 72440 }, { "epoch": 9.07718331036211, "grad_norm": 0.12299450486898422, "learning_rate": 6.646757578657693e-06, "loss": 0.4619, "num_input_tokens_seen": 88098656, "step": 72445 }, { "epoch": 9.077809798270893, "grad_norm": 0.15071336925029755, "learning_rate": 6.646241357574497e-06, "loss": 0.4603, "num_input_tokens_seen": 88104768, "step": 72450 }, { "epoch": 9.078436286179677, "grad_norm": 0.14060302078723907, "learning_rate": 6.645725116809101e-06, "loss": 0.4529, "num_input_tokens_seen": 88111072, "step": 72455 }, { "epoch": 9.07906277408846, "grad_norm": 0.16290265321731567, "learning_rate": 6.645208856367676e-06, "loss": 0.4567, "num_input_tokens_seen": 88116576, "step": 72460 }, { "epoch": 9.079689261997244, "grad_norm": 0.14868174493312836, "learning_rate": 6.644692576256399e-06, "loss": 0.4681, "num_input_tokens_seen": 88122592, "step": 72465 }, { "epoch": 9.080315749906028, "grad_norm": 0.16737176477909088, "learning_rate": 6.644176276481438e-06, "loss": 0.4594, "num_input_tokens_seen": 88128608, "step": 72470 }, { "epoch": 9.08094223781481, "grad_norm": 0.1647234857082367, "learning_rate": 6.643659957048967e-06, "loss": 0.462, "num_input_tokens_seen": 88135104, "step": 72475 }, { "epoch": 9.081568725723594, "grad_norm": 0.1034071147441864, "learning_rate": 6.6431436179651594e-06, "loss": 0.4618, "num_input_tokens_seen": 88140864, "step": 72480 }, { "epoch": 9.082195213632376, "grad_norm": 0.20896539092063904, "learning_rate": 6.64262725923619e-06, "loss": 0.4498, "num_input_tokens_seen": 88147040, "step": 72485 }, { "epoch": 9.08282170154116, "grad_norm": 0.153370663523674, "learning_rate": 6.642110880868227e-06, "loss": 0.475, "num_input_tokens_seen": 88152992, "step": 72490 }, { "epoch": 9.083448189449944, "grad_norm": 0.1662742644548416, "learning_rate": 6.641594482867451e-06, "loss": 0.4662, "num_input_tokens_seen": 88159040, "step": 72495 }, { "epoch": 9.084074677358727, "grad_norm": 0.13336336612701416, "learning_rate": 6.6410780652400315e-06, "loss": 0.4667, "num_input_tokens_seen": 88165312, "step": 72500 }, { "epoch": 9.08470116526751, "grad_norm": 0.14879727363586426, "learning_rate": 6.640561627992145e-06, "loss": 0.459, "num_input_tokens_seen": 88171680, "step": 72505 }, { "epoch": 9.085327653176293, "grad_norm": 0.17006555199623108, "learning_rate": 6.640045171129964e-06, "loss": 0.4654, "num_input_tokens_seen": 88177760, "step": 72510 }, { "epoch": 9.085954141085077, "grad_norm": 0.1354319453239441, "learning_rate": 6.639528694659662e-06, "loss": 0.4502, "num_input_tokens_seen": 88183104, "step": 72515 }, { "epoch": 9.086580628993861, "grad_norm": 0.11226057261228561, "learning_rate": 6.63901219858742e-06, "loss": 0.4505, "num_input_tokens_seen": 88189472, "step": 72520 }, { "epoch": 9.087207116902643, "grad_norm": 0.19195468723773956, "learning_rate": 6.638495682919407e-06, "loss": 0.4499, "num_input_tokens_seen": 88195488, "step": 72525 }, { "epoch": 9.087833604811427, "grad_norm": 0.19634407758712769, "learning_rate": 6.637979147661802e-06, "loss": 0.4593, "num_input_tokens_seen": 88201824, "step": 72530 }, { "epoch": 9.08846009272021, "grad_norm": 0.15627963840961456, "learning_rate": 6.637462592820778e-06, "loss": 0.4566, "num_input_tokens_seen": 88207808, "step": 72535 }, { "epoch": 9.089086580628994, "grad_norm": 0.18410201370716095, "learning_rate": 6.636946018402512e-06, "loss": 0.4769, "num_input_tokens_seen": 88213344, "step": 72540 }, { "epoch": 9.089713068537778, "grad_norm": 0.1265689730644226, "learning_rate": 6.63642942441318e-06, "loss": 0.4643, "num_input_tokens_seen": 88219680, "step": 72545 }, { "epoch": 9.09033955644656, "grad_norm": 0.2346217781305313, "learning_rate": 6.6359128108589574e-06, "loss": 0.4526, "num_input_tokens_seen": 88225952, "step": 72550 }, { "epoch": 9.090966044355344, "grad_norm": 0.12448751926422119, "learning_rate": 6.635396177746024e-06, "loss": 0.4598, "num_input_tokens_seen": 88232160, "step": 72555 }, { "epoch": 9.091592532264126, "grad_norm": 0.15523436665534973, "learning_rate": 6.634879525080553e-06, "loss": 0.4733, "num_input_tokens_seen": 88238624, "step": 72560 }, { "epoch": 9.09221902017291, "grad_norm": 0.1532677412033081, "learning_rate": 6.634362852868723e-06, "loss": 0.461, "num_input_tokens_seen": 88244544, "step": 72565 }, { "epoch": 9.092845508081695, "grad_norm": 0.16787189245224, "learning_rate": 6.63384616111671e-06, "loss": 0.4517, "num_input_tokens_seen": 88250656, "step": 72570 }, { "epoch": 9.093471995990477, "grad_norm": 0.15868015587329865, "learning_rate": 6.633329449830695e-06, "loss": 0.4688, "num_input_tokens_seen": 88256896, "step": 72575 }, { "epoch": 9.094098483899261, "grad_norm": 0.10575586557388306, "learning_rate": 6.632812719016851e-06, "loss": 0.4657, "num_input_tokens_seen": 88262976, "step": 72580 }, { "epoch": 9.094724971808045, "grad_norm": 0.15331290662288666, "learning_rate": 6.632295968681359e-06, "loss": 0.4664, "num_input_tokens_seen": 88269344, "step": 72585 }, { "epoch": 9.095351459716827, "grad_norm": 0.18237298727035522, "learning_rate": 6.631779198830397e-06, "loss": 0.4536, "num_input_tokens_seen": 88275328, "step": 72590 }, { "epoch": 9.095977947625611, "grad_norm": 0.14332051575183868, "learning_rate": 6.631262409470143e-06, "loss": 0.4817, "num_input_tokens_seen": 88281504, "step": 72595 }, { "epoch": 9.096604435534394, "grad_norm": 0.17211268842220306, "learning_rate": 6.630745600606774e-06, "loss": 0.457, "num_input_tokens_seen": 88287552, "step": 72600 }, { "epoch": 9.097230923443178, "grad_norm": 0.1558367758989334, "learning_rate": 6.630228772246471e-06, "loss": 0.4558, "num_input_tokens_seen": 88293536, "step": 72605 }, { "epoch": 9.097857411351962, "grad_norm": 0.12273626029491425, "learning_rate": 6.629711924395411e-06, "loss": 0.4664, "num_input_tokens_seen": 88299648, "step": 72610 }, { "epoch": 9.098483899260744, "grad_norm": 0.14983515441417694, "learning_rate": 6.629195057059776e-06, "loss": 0.453, "num_input_tokens_seen": 88305728, "step": 72615 }, { "epoch": 9.099110387169528, "grad_norm": 0.11653611063957214, "learning_rate": 6.628678170245744e-06, "loss": 0.4627, "num_input_tokens_seen": 88311712, "step": 72620 }, { "epoch": 9.09973687507831, "grad_norm": 0.15680687129497528, "learning_rate": 6.628161263959494e-06, "loss": 0.4595, "num_input_tokens_seen": 88318208, "step": 72625 }, { "epoch": 9.100363362987094, "grad_norm": 0.10020428150892258, "learning_rate": 6.627644338207208e-06, "loss": 0.4568, "num_input_tokens_seen": 88324192, "step": 72630 }, { "epoch": 9.100989850895878, "grad_norm": 0.11264277994632721, "learning_rate": 6.627127392995067e-06, "loss": 0.4653, "num_input_tokens_seen": 88330464, "step": 72635 }, { "epoch": 9.10161633880466, "grad_norm": 0.133568674325943, "learning_rate": 6.62661042832925e-06, "loss": 0.4419, "num_input_tokens_seen": 88336416, "step": 72640 }, { "epoch": 9.102242826713445, "grad_norm": 0.15408268570899963, "learning_rate": 6.626093444215936e-06, "loss": 0.4733, "num_input_tokens_seen": 88342176, "step": 72645 }, { "epoch": 9.102869314622227, "grad_norm": 0.11409765481948853, "learning_rate": 6.62557644066131e-06, "loss": 0.46, "num_input_tokens_seen": 88348320, "step": 72650 }, { "epoch": 9.103495802531011, "grad_norm": 0.12307723611593246, "learning_rate": 6.625059417671549e-06, "loss": 0.4552, "num_input_tokens_seen": 88354592, "step": 72655 }, { "epoch": 9.104122290439795, "grad_norm": 0.12037275731563568, "learning_rate": 6.624542375252837e-06, "loss": 0.4686, "num_input_tokens_seen": 88360960, "step": 72660 }, { "epoch": 9.104748778348577, "grad_norm": 0.1231238842010498, "learning_rate": 6.624025313411354e-06, "loss": 0.4605, "num_input_tokens_seen": 88366944, "step": 72665 }, { "epoch": 9.105375266257361, "grad_norm": 0.09564223885536194, "learning_rate": 6.623508232153284e-06, "loss": 0.4556, "num_input_tokens_seen": 88373344, "step": 72670 }, { "epoch": 9.106001754166144, "grad_norm": 0.13182084262371063, "learning_rate": 6.622991131484807e-06, "loss": 0.4497, "num_input_tokens_seen": 88379584, "step": 72675 }, { "epoch": 9.106628242074928, "grad_norm": 0.2013421654701233, "learning_rate": 6.622474011412108e-06, "loss": 0.4578, "num_input_tokens_seen": 88385792, "step": 72680 }, { "epoch": 9.107254729983712, "grad_norm": 0.1502162665128708, "learning_rate": 6.621956871941367e-06, "loss": 0.479, "num_input_tokens_seen": 88391648, "step": 72685 }, { "epoch": 9.107881217892494, "grad_norm": 0.13862094283103943, "learning_rate": 6.621439713078769e-06, "loss": 0.4627, "num_input_tokens_seen": 88397408, "step": 72690 }, { "epoch": 9.108507705801278, "grad_norm": 0.10099341720342636, "learning_rate": 6.6209225348304965e-06, "loss": 0.456, "num_input_tokens_seen": 88402784, "step": 72695 }, { "epoch": 9.109134193710062, "grad_norm": 0.10858873277902603, "learning_rate": 6.620405337202731e-06, "loss": 0.468, "num_input_tokens_seen": 88408896, "step": 72700 }, { "epoch": 9.109760681618845, "grad_norm": 0.14047521352767944, "learning_rate": 6.619888120201657e-06, "loss": 0.46, "num_input_tokens_seen": 88415008, "step": 72705 }, { "epoch": 9.110387169527629, "grad_norm": 0.14801695942878723, "learning_rate": 6.61937088383346e-06, "loss": 0.4632, "num_input_tokens_seen": 88421120, "step": 72710 }, { "epoch": 9.111013657436411, "grad_norm": 0.10867030173540115, "learning_rate": 6.61885362810432e-06, "loss": 0.4668, "num_input_tokens_seen": 88427360, "step": 72715 }, { "epoch": 9.111640145345195, "grad_norm": 0.14631175994873047, "learning_rate": 6.618336353020427e-06, "loss": 0.4502, "num_input_tokens_seen": 88433312, "step": 72720 }, { "epoch": 9.112266633253979, "grad_norm": 0.15441341698169708, "learning_rate": 6.617819058587961e-06, "loss": 0.4578, "num_input_tokens_seen": 88439648, "step": 72725 }, { "epoch": 9.112893121162761, "grad_norm": 0.16308939456939697, "learning_rate": 6.617301744813108e-06, "loss": 0.4589, "num_input_tokens_seen": 88446048, "step": 72730 }, { "epoch": 9.113519609071545, "grad_norm": 0.18627694249153137, "learning_rate": 6.616784411702052e-06, "loss": 0.4528, "num_input_tokens_seen": 88452352, "step": 72735 }, { "epoch": 9.114146096980328, "grad_norm": 0.22678709030151367, "learning_rate": 6.61626705926098e-06, "loss": 0.4549, "num_input_tokens_seen": 88458368, "step": 72740 }, { "epoch": 9.114772584889112, "grad_norm": 0.1198824793100357, "learning_rate": 6.615749687496076e-06, "loss": 0.4711, "num_input_tokens_seen": 88464832, "step": 72745 }, { "epoch": 9.115399072797896, "grad_norm": 0.12764841318130493, "learning_rate": 6.615232296413527e-06, "loss": 0.4715, "num_input_tokens_seen": 88471040, "step": 72750 }, { "epoch": 9.116025560706678, "grad_norm": 0.10890120267868042, "learning_rate": 6.614714886019518e-06, "loss": 0.4644, "num_input_tokens_seen": 88476992, "step": 72755 }, { "epoch": 9.116652048615462, "grad_norm": 0.1225864440202713, "learning_rate": 6.614197456320233e-06, "loss": 0.4615, "num_input_tokens_seen": 88483328, "step": 72760 }, { "epoch": 9.117278536524244, "grad_norm": 0.12349426746368408, "learning_rate": 6.613680007321862e-06, "loss": 0.4769, "num_input_tokens_seen": 88489312, "step": 72765 }, { "epoch": 9.117905024433028, "grad_norm": 0.1427261084318161, "learning_rate": 6.613162539030588e-06, "loss": 0.464, "num_input_tokens_seen": 88495456, "step": 72770 }, { "epoch": 9.118531512341812, "grad_norm": 0.1447584629058838, "learning_rate": 6.612645051452601e-06, "loss": 0.4584, "num_input_tokens_seen": 88501408, "step": 72775 }, { "epoch": 9.119158000250595, "grad_norm": 0.11528880894184113, "learning_rate": 6.612127544594087e-06, "loss": 0.4648, "num_input_tokens_seen": 88507296, "step": 72780 }, { "epoch": 9.119784488159379, "grad_norm": 0.11932569742202759, "learning_rate": 6.611610018461232e-06, "loss": 0.4635, "num_input_tokens_seen": 88513408, "step": 72785 }, { "epoch": 9.120410976068161, "grad_norm": 0.13434240221977234, "learning_rate": 6.611092473060224e-06, "loss": 0.4478, "num_input_tokens_seen": 88519488, "step": 72790 }, { "epoch": 9.121037463976945, "grad_norm": 0.12513205409049988, "learning_rate": 6.6105749083972516e-06, "loss": 0.4617, "num_input_tokens_seen": 88525824, "step": 72795 }, { "epoch": 9.12166395188573, "grad_norm": 0.13583089411258698, "learning_rate": 6.610057324478502e-06, "loss": 0.4607, "num_input_tokens_seen": 88531744, "step": 72800 }, { "epoch": 9.122290439794511, "grad_norm": 0.13231773674488068, "learning_rate": 6.609539721310163e-06, "loss": 0.457, "num_input_tokens_seen": 88537216, "step": 72805 }, { "epoch": 9.122916927703296, "grad_norm": 0.15657390654087067, "learning_rate": 6.609022098898423e-06, "loss": 0.4598, "num_input_tokens_seen": 88542976, "step": 72810 }, { "epoch": 9.123543415612078, "grad_norm": 0.17121264338493347, "learning_rate": 6.608504457249473e-06, "loss": 0.4593, "num_input_tokens_seen": 88549152, "step": 72815 }, { "epoch": 9.124169903520862, "grad_norm": 0.13005851209163666, "learning_rate": 6.607986796369499e-06, "loss": 0.4611, "num_input_tokens_seen": 88555168, "step": 72820 }, { "epoch": 9.124796391429646, "grad_norm": 0.19613893330097198, "learning_rate": 6.6074691162646885e-06, "loss": 0.4496, "num_input_tokens_seen": 88561504, "step": 72825 }, { "epoch": 9.125422879338428, "grad_norm": 0.20737697184085846, "learning_rate": 6.606951416941235e-06, "loss": 0.4569, "num_input_tokens_seen": 88567904, "step": 72830 }, { "epoch": 9.126049367247212, "grad_norm": 0.13368964195251465, "learning_rate": 6.606433698405328e-06, "loss": 0.4609, "num_input_tokens_seen": 88573728, "step": 72835 }, { "epoch": 9.126675855155996, "grad_norm": 0.15035367012023926, "learning_rate": 6.605915960663154e-06, "loss": 0.4733, "num_input_tokens_seen": 88579744, "step": 72840 }, { "epoch": 9.127302343064779, "grad_norm": 0.12206024676561356, "learning_rate": 6.6053982037209045e-06, "loss": 0.4753, "num_input_tokens_seen": 88585952, "step": 72845 }, { "epoch": 9.127928830973563, "grad_norm": 0.12513679265975952, "learning_rate": 6.60488042758477e-06, "loss": 0.4781, "num_input_tokens_seen": 88592000, "step": 72850 }, { "epoch": 9.128555318882345, "grad_norm": 0.119179368019104, "learning_rate": 6.60436263226094e-06, "loss": 0.4586, "num_input_tokens_seen": 88598176, "step": 72855 }, { "epoch": 9.129181806791129, "grad_norm": 0.15190108120441437, "learning_rate": 6.603844817755605e-06, "loss": 0.4553, "num_input_tokens_seen": 88604224, "step": 72860 }, { "epoch": 9.129808294699913, "grad_norm": 0.1392003744840622, "learning_rate": 6.603326984074958e-06, "loss": 0.452, "num_input_tokens_seen": 88610368, "step": 72865 }, { "epoch": 9.130434782608695, "grad_norm": 0.19163332879543304, "learning_rate": 6.6028091312251895e-06, "loss": 0.4479, "num_input_tokens_seen": 88616288, "step": 72870 }, { "epoch": 9.13106127051748, "grad_norm": 0.136947900056839, "learning_rate": 6.602291259212488e-06, "loss": 0.4781, "num_input_tokens_seen": 88622368, "step": 72875 }, { "epoch": 9.131687758426262, "grad_norm": 0.11269044131040573, "learning_rate": 6.60177336804305e-06, "loss": 0.4675, "num_input_tokens_seen": 88628768, "step": 72880 }, { "epoch": 9.132314246335046, "grad_norm": 0.13727420568466187, "learning_rate": 6.601255457723063e-06, "loss": 0.4586, "num_input_tokens_seen": 88635264, "step": 72885 }, { "epoch": 9.13294073424383, "grad_norm": 0.20690366625785828, "learning_rate": 6.600737528258723e-06, "loss": 0.4552, "num_input_tokens_seen": 88641184, "step": 72890 }, { "epoch": 9.133567222152612, "grad_norm": 0.13078632950782776, "learning_rate": 6.600219579656218e-06, "loss": 0.4597, "num_input_tokens_seen": 88647392, "step": 72895 }, { "epoch": 9.134193710061396, "grad_norm": 0.1358623504638672, "learning_rate": 6.5997016119217435e-06, "loss": 0.46, "num_input_tokens_seen": 88653472, "step": 72900 }, { "epoch": 9.134820197970178, "grad_norm": 0.1278286874294281, "learning_rate": 6.599183625061492e-06, "loss": 0.4625, "num_input_tokens_seen": 88659328, "step": 72905 }, { "epoch": 9.135446685878962, "grad_norm": 0.14971552789211273, "learning_rate": 6.598665619081653e-06, "loss": 0.4666, "num_input_tokens_seen": 88665824, "step": 72910 }, { "epoch": 9.136073173787747, "grad_norm": 0.129007488489151, "learning_rate": 6.5981475939884245e-06, "loss": 0.4703, "num_input_tokens_seen": 88672000, "step": 72915 }, { "epoch": 9.136699661696529, "grad_norm": 0.13185755908489227, "learning_rate": 6.597629549787997e-06, "loss": 0.4588, "num_input_tokens_seen": 88678080, "step": 72920 }, { "epoch": 9.137326149605313, "grad_norm": 0.1344079077243805, "learning_rate": 6.597111486486565e-06, "loss": 0.4665, "num_input_tokens_seen": 88684352, "step": 72925 }, { "epoch": 9.137952637514095, "grad_norm": 0.12448035180568695, "learning_rate": 6.596593404090322e-06, "loss": 0.4631, "num_input_tokens_seen": 88690496, "step": 72930 }, { "epoch": 9.13857912542288, "grad_norm": 0.13395917415618896, "learning_rate": 6.596075302605463e-06, "loss": 0.4616, "num_input_tokens_seen": 88696896, "step": 72935 }, { "epoch": 9.139205613331663, "grad_norm": 0.1191198080778122, "learning_rate": 6.595557182038183e-06, "loss": 0.4633, "num_input_tokens_seen": 88703168, "step": 72940 }, { "epoch": 9.139832101240446, "grad_norm": 0.09522148221731186, "learning_rate": 6.595039042394675e-06, "loss": 0.4597, "num_input_tokens_seen": 88709088, "step": 72945 }, { "epoch": 9.14045858914923, "grad_norm": 0.09679538011550903, "learning_rate": 6.594520883681134e-06, "loss": 0.4597, "num_input_tokens_seen": 88714976, "step": 72950 }, { "epoch": 9.141085077058014, "grad_norm": 0.13328522443771362, "learning_rate": 6.594002705903755e-06, "loss": 0.4718, "num_input_tokens_seen": 88720832, "step": 72955 }, { "epoch": 9.141711564966796, "grad_norm": 0.1176505759358406, "learning_rate": 6.5934845090687334e-06, "loss": 0.4623, "num_input_tokens_seen": 88727264, "step": 72960 }, { "epoch": 9.14233805287558, "grad_norm": 0.14117203652858734, "learning_rate": 6.592966293182266e-06, "loss": 0.4637, "num_input_tokens_seen": 88733472, "step": 72965 }, { "epoch": 9.142964540784362, "grad_norm": 0.16875572502613068, "learning_rate": 6.592448058250546e-06, "loss": 0.46, "num_input_tokens_seen": 88739808, "step": 72970 }, { "epoch": 9.143591028693146, "grad_norm": 0.18417079746723175, "learning_rate": 6.5919298042797695e-06, "loss": 0.453, "num_input_tokens_seen": 88745728, "step": 72975 }, { "epoch": 9.14421751660193, "grad_norm": 0.10544902831315994, "learning_rate": 6.591411531276135e-06, "loss": 0.4628, "num_input_tokens_seen": 88751808, "step": 72980 }, { "epoch": 9.144844004510713, "grad_norm": 0.11924708634614944, "learning_rate": 6.5908932392458365e-06, "loss": 0.4623, "num_input_tokens_seen": 88757888, "step": 72985 }, { "epoch": 9.145470492419497, "grad_norm": 0.13151945173740387, "learning_rate": 6.5903749281950735e-06, "loss": 0.4613, "num_input_tokens_seen": 88764192, "step": 72990 }, { "epoch": 9.146096980328279, "grad_norm": 0.18070121109485626, "learning_rate": 6.589856598130041e-06, "loss": 0.4569, "num_input_tokens_seen": 88770208, "step": 72995 }, { "epoch": 9.146723468237063, "grad_norm": 0.10994943976402283, "learning_rate": 6.589338249056936e-06, "loss": 0.4632, "num_input_tokens_seen": 88776608, "step": 73000 }, { "epoch": 9.147349956145847, "grad_norm": 0.12565851211547852, "learning_rate": 6.588819880981956e-06, "loss": 0.4693, "num_input_tokens_seen": 88782656, "step": 73005 }, { "epoch": 9.14797644405463, "grad_norm": 0.10603634268045425, "learning_rate": 6.588301493911298e-06, "loss": 0.4547, "num_input_tokens_seen": 88788896, "step": 73010 }, { "epoch": 9.148602931963413, "grad_norm": 0.13463854789733887, "learning_rate": 6.5877830878511605e-06, "loss": 0.464, "num_input_tokens_seen": 88794880, "step": 73015 }, { "epoch": 9.149229419872196, "grad_norm": 0.11749294400215149, "learning_rate": 6.587264662807742e-06, "loss": 0.4611, "num_input_tokens_seen": 88801024, "step": 73020 }, { "epoch": 9.14985590778098, "grad_norm": 0.10058518499135971, "learning_rate": 6.586746218787238e-06, "loss": 0.4581, "num_input_tokens_seen": 88806784, "step": 73025 }, { "epoch": 9.150482395689764, "grad_norm": 0.1450326293706894, "learning_rate": 6.586227755795851e-06, "loss": 0.4544, "num_input_tokens_seen": 88812896, "step": 73030 }, { "epoch": 9.151108883598546, "grad_norm": 0.13618135452270508, "learning_rate": 6.585709273839777e-06, "loss": 0.4597, "num_input_tokens_seen": 88819168, "step": 73035 }, { "epoch": 9.15173537150733, "grad_norm": 0.13339945673942566, "learning_rate": 6.585190772925214e-06, "loss": 0.4639, "num_input_tokens_seen": 88825152, "step": 73040 }, { "epoch": 9.152361859416112, "grad_norm": 0.19264820218086243, "learning_rate": 6.584672253058363e-06, "loss": 0.4703, "num_input_tokens_seen": 88831712, "step": 73045 }, { "epoch": 9.152988347324897, "grad_norm": 0.12093093991279602, "learning_rate": 6.584153714245424e-06, "loss": 0.4525, "num_input_tokens_seen": 88838016, "step": 73050 }, { "epoch": 9.15361483523368, "grad_norm": 0.134917214512825, "learning_rate": 6.5836351564925946e-06, "loss": 0.4579, "num_input_tokens_seen": 88843968, "step": 73055 }, { "epoch": 9.154241323142463, "grad_norm": 0.13547758758068085, "learning_rate": 6.583116579806077e-06, "loss": 0.4506, "num_input_tokens_seen": 88850272, "step": 73060 }, { "epoch": 9.154867811051247, "grad_norm": 0.16511821746826172, "learning_rate": 6.582597984192069e-06, "loss": 0.4573, "num_input_tokens_seen": 88856480, "step": 73065 }, { "epoch": 9.15549429896003, "grad_norm": 0.10847919434309006, "learning_rate": 6.582079369656771e-06, "loss": 0.4582, "num_input_tokens_seen": 88862528, "step": 73070 }, { "epoch": 9.156120786868813, "grad_norm": 0.1561400145292282, "learning_rate": 6.5815607362063835e-06, "loss": 0.4621, "num_input_tokens_seen": 88868544, "step": 73075 }, { "epoch": 9.156747274777597, "grad_norm": 0.1618446558713913, "learning_rate": 6.581042083847108e-06, "loss": 0.4698, "num_input_tokens_seen": 88874624, "step": 73080 }, { "epoch": 9.15737376268638, "grad_norm": 0.1472526490688324, "learning_rate": 6.580523412585146e-06, "loss": 0.4641, "num_input_tokens_seen": 88881024, "step": 73085 }, { "epoch": 9.158000250595164, "grad_norm": 0.13283249735832214, "learning_rate": 6.580004722426698e-06, "loss": 0.458, "num_input_tokens_seen": 88887040, "step": 73090 }, { "epoch": 9.158626738503948, "grad_norm": 0.22419625520706177, "learning_rate": 6.579486013377963e-06, "loss": 0.4593, "num_input_tokens_seen": 88892928, "step": 73095 }, { "epoch": 9.15925322641273, "grad_norm": 0.10870848596096039, "learning_rate": 6.578967285445146e-06, "loss": 0.4662, "num_input_tokens_seen": 88899328, "step": 73100 }, { "epoch": 9.159879714321514, "grad_norm": 0.12405221164226532, "learning_rate": 6.578448538634449e-06, "loss": 0.4667, "num_input_tokens_seen": 88905536, "step": 73105 }, { "epoch": 9.160506202230296, "grad_norm": 0.12379877269268036, "learning_rate": 6.5779297729520694e-06, "loss": 0.4678, "num_input_tokens_seen": 88911680, "step": 73110 }, { "epoch": 9.16113269013908, "grad_norm": 0.09411871433258057, "learning_rate": 6.577410988404215e-06, "loss": 0.4647, "num_input_tokens_seen": 88917696, "step": 73115 }, { "epoch": 9.161759178047864, "grad_norm": 0.15359553694725037, "learning_rate": 6.576892184997086e-06, "loss": 0.4566, "num_input_tokens_seen": 88923776, "step": 73120 }, { "epoch": 9.162385665956647, "grad_norm": 0.1075417771935463, "learning_rate": 6.576373362736886e-06, "loss": 0.4619, "num_input_tokens_seen": 88929984, "step": 73125 }, { "epoch": 9.16301215386543, "grad_norm": 0.1810152232646942, "learning_rate": 6.575854521629815e-06, "loss": 0.4579, "num_input_tokens_seen": 88936128, "step": 73130 }, { "epoch": 9.163638641774213, "grad_norm": 0.13550792634487152, "learning_rate": 6.575335661682078e-06, "loss": 0.4625, "num_input_tokens_seen": 88942368, "step": 73135 }, { "epoch": 9.164265129682997, "grad_norm": 0.14066894352436066, "learning_rate": 6.5748167828998796e-06, "loss": 0.4601, "num_input_tokens_seen": 88948448, "step": 73140 }, { "epoch": 9.164891617591781, "grad_norm": 0.1414911299943924, "learning_rate": 6.574297885289423e-06, "loss": 0.4727, "num_input_tokens_seen": 88954560, "step": 73145 }, { "epoch": 9.165518105500563, "grad_norm": 0.12712019681930542, "learning_rate": 6.573778968856912e-06, "loss": 0.4634, "num_input_tokens_seen": 88960896, "step": 73150 }, { "epoch": 9.166144593409348, "grad_norm": 0.13214726746082306, "learning_rate": 6.57326003360855e-06, "loss": 0.4637, "num_input_tokens_seen": 88966912, "step": 73155 }, { "epoch": 9.16677108131813, "grad_norm": 0.14853835105895996, "learning_rate": 6.572741079550542e-06, "loss": 0.4609, "num_input_tokens_seen": 88973184, "step": 73160 }, { "epoch": 9.167397569226914, "grad_norm": 0.11476033926010132, "learning_rate": 6.572222106689091e-06, "loss": 0.4644, "num_input_tokens_seen": 88979360, "step": 73165 }, { "epoch": 9.168024057135698, "grad_norm": 0.1327241063117981, "learning_rate": 6.571703115030402e-06, "loss": 0.4684, "num_input_tokens_seen": 88985312, "step": 73170 }, { "epoch": 9.16865054504448, "grad_norm": 0.12122762203216553, "learning_rate": 6.571184104580682e-06, "loss": 0.4727, "num_input_tokens_seen": 88991456, "step": 73175 }, { "epoch": 9.169277032953264, "grad_norm": 0.125923290848732, "learning_rate": 6.570665075346135e-06, "loss": 0.4583, "num_input_tokens_seen": 88997760, "step": 73180 }, { "epoch": 9.169903520862047, "grad_norm": 0.10393459349870682, "learning_rate": 6.570146027332966e-06, "loss": 0.4669, "num_input_tokens_seen": 89003808, "step": 73185 }, { "epoch": 9.17053000877083, "grad_norm": 0.11041473597288132, "learning_rate": 6.569626960547382e-06, "loss": 0.4647, "num_input_tokens_seen": 89009984, "step": 73190 }, { "epoch": 9.171156496679615, "grad_norm": 0.181343212723732, "learning_rate": 6.569107874995588e-06, "loss": 0.4544, "num_input_tokens_seen": 89016032, "step": 73195 }, { "epoch": 9.171782984588397, "grad_norm": 0.14761757850646973, "learning_rate": 6.56858877068379e-06, "loss": 0.4495, "num_input_tokens_seen": 89022240, "step": 73200 }, { "epoch": 9.172409472497181, "grad_norm": 0.11260804533958435, "learning_rate": 6.568069647618195e-06, "loss": 0.4712, "num_input_tokens_seen": 89028512, "step": 73205 }, { "epoch": 9.173035960405965, "grad_norm": 0.1364280879497528, "learning_rate": 6.567550505805008e-06, "loss": 0.4664, "num_input_tokens_seen": 89034656, "step": 73210 }, { "epoch": 9.173662448314747, "grad_norm": 0.09947269409894943, "learning_rate": 6.567031345250438e-06, "loss": 0.4598, "num_input_tokens_seen": 89040864, "step": 73215 }, { "epoch": 9.174288936223531, "grad_norm": 0.14402395486831665, "learning_rate": 6.566512165960689e-06, "loss": 0.4744, "num_input_tokens_seen": 89046944, "step": 73220 }, { "epoch": 9.174915424132314, "grad_norm": 0.12773647904396057, "learning_rate": 6.565992967941971e-06, "loss": 0.4592, "num_input_tokens_seen": 89052832, "step": 73225 }, { "epoch": 9.175541912041098, "grad_norm": 0.13850392401218414, "learning_rate": 6.5654737512004896e-06, "loss": 0.4626, "num_input_tokens_seen": 89059296, "step": 73230 }, { "epoch": 9.176168399949882, "grad_norm": 0.1239916980266571, "learning_rate": 6.564954515742454e-06, "loss": 0.4567, "num_input_tokens_seen": 89065536, "step": 73235 }, { "epoch": 9.176794887858664, "grad_norm": 0.1667773574590683, "learning_rate": 6.564435261574072e-06, "loss": 0.4646, "num_input_tokens_seen": 89071296, "step": 73240 }, { "epoch": 9.177421375767448, "grad_norm": 0.09830163419246674, "learning_rate": 6.56391598870155e-06, "loss": 0.4636, "num_input_tokens_seen": 89077024, "step": 73245 }, { "epoch": 9.17804786367623, "grad_norm": 0.13663041591644287, "learning_rate": 6.563396697131098e-06, "loss": 0.4681, "num_input_tokens_seen": 89083040, "step": 73250 }, { "epoch": 9.178674351585014, "grad_norm": 0.09655279666185379, "learning_rate": 6.562877386868925e-06, "loss": 0.4579, "num_input_tokens_seen": 89088832, "step": 73255 }, { "epoch": 9.179300839493798, "grad_norm": 0.1342346966266632, "learning_rate": 6.562358057921238e-06, "loss": 0.4597, "num_input_tokens_seen": 89095264, "step": 73260 }, { "epoch": 9.17992732740258, "grad_norm": 0.15422366559505463, "learning_rate": 6.561838710294246e-06, "loss": 0.4675, "num_input_tokens_seen": 89100960, "step": 73265 }, { "epoch": 9.180553815311365, "grad_norm": 0.13050775229930878, "learning_rate": 6.561319343994161e-06, "loss": 0.4584, "num_input_tokens_seen": 89107136, "step": 73270 }, { "epoch": 9.181180303220147, "grad_norm": 0.1240098625421524, "learning_rate": 6.560799959027188e-06, "loss": 0.4683, "num_input_tokens_seen": 89113088, "step": 73275 }, { "epoch": 9.181806791128931, "grad_norm": 0.14294198155403137, "learning_rate": 6.560280555399542e-06, "loss": 0.4545, "num_input_tokens_seen": 89119136, "step": 73280 }, { "epoch": 9.182433279037715, "grad_norm": 0.14904341101646423, "learning_rate": 6.559761133117428e-06, "loss": 0.4614, "num_input_tokens_seen": 89125184, "step": 73285 }, { "epoch": 9.183059766946498, "grad_norm": 0.13195033371448517, "learning_rate": 6.5592416921870595e-06, "loss": 0.466, "num_input_tokens_seen": 89131584, "step": 73290 }, { "epoch": 9.183686254855282, "grad_norm": 0.171833336353302, "learning_rate": 6.558722232614645e-06, "loss": 0.4738, "num_input_tokens_seen": 89137888, "step": 73295 }, { "epoch": 9.184312742764064, "grad_norm": 0.1385050117969513, "learning_rate": 6.558202754406395e-06, "loss": 0.4608, "num_input_tokens_seen": 89143776, "step": 73300 }, { "epoch": 9.184939230672848, "grad_norm": 0.11371678113937378, "learning_rate": 6.557683257568523e-06, "loss": 0.4664, "num_input_tokens_seen": 89149696, "step": 73305 }, { "epoch": 9.185565718581632, "grad_norm": 0.11560273170471191, "learning_rate": 6.557163742107237e-06, "loss": 0.4707, "num_input_tokens_seen": 89155744, "step": 73310 }, { "epoch": 9.186192206490414, "grad_norm": 0.10780351608991623, "learning_rate": 6.5566442080287495e-06, "loss": 0.4542, "num_input_tokens_seen": 89162016, "step": 73315 }, { "epoch": 9.186818694399198, "grad_norm": 0.1256127506494522, "learning_rate": 6.556124655339272e-06, "loss": 0.4623, "num_input_tokens_seen": 89168192, "step": 73320 }, { "epoch": 9.187445182307982, "grad_norm": 0.18767240643501282, "learning_rate": 6.555605084045016e-06, "loss": 0.4561, "num_input_tokens_seen": 89174336, "step": 73325 }, { "epoch": 9.188071670216765, "grad_norm": 0.11400526762008667, "learning_rate": 6.555085494152193e-06, "loss": 0.4637, "num_input_tokens_seen": 89180640, "step": 73330 }, { "epoch": 9.188698158125549, "grad_norm": 0.09709137678146362, "learning_rate": 6.554565885667014e-06, "loss": 0.4603, "num_input_tokens_seen": 89186912, "step": 73335 }, { "epoch": 9.189324646034331, "grad_norm": 0.11338447779417038, "learning_rate": 6.554046258595695e-06, "loss": 0.4607, "num_input_tokens_seen": 89193184, "step": 73340 }, { "epoch": 9.189951133943115, "grad_norm": 0.16916385293006897, "learning_rate": 6.5535266129444455e-06, "loss": 0.4683, "num_input_tokens_seen": 89199424, "step": 73345 }, { "epoch": 9.190577621851899, "grad_norm": 0.13235275447368622, "learning_rate": 6.553006948719479e-06, "loss": 0.4517, "num_input_tokens_seen": 89205632, "step": 73350 }, { "epoch": 9.191204109760681, "grad_norm": 0.11999380588531494, "learning_rate": 6.5524872659270075e-06, "loss": 0.4579, "num_input_tokens_seen": 89211872, "step": 73355 }, { "epoch": 9.191830597669465, "grad_norm": 0.1756255030632019, "learning_rate": 6.551967564573247e-06, "loss": 0.4617, "num_input_tokens_seen": 89217952, "step": 73360 }, { "epoch": 9.192457085578248, "grad_norm": 0.12644518911838531, "learning_rate": 6.55144784466441e-06, "loss": 0.4724, "num_input_tokens_seen": 89224288, "step": 73365 }, { "epoch": 9.193083573487032, "grad_norm": 0.20862926542758942, "learning_rate": 6.5509281062067086e-06, "loss": 0.4589, "num_input_tokens_seen": 89230560, "step": 73370 }, { "epoch": 9.193710061395816, "grad_norm": 0.12237244099378586, "learning_rate": 6.550408349206359e-06, "loss": 0.4608, "num_input_tokens_seen": 89237024, "step": 73375 }, { "epoch": 9.194336549304598, "grad_norm": 0.11069847643375397, "learning_rate": 6.549888573669574e-06, "loss": 0.4532, "num_input_tokens_seen": 89243232, "step": 73380 }, { "epoch": 9.194963037213382, "grad_norm": 0.1653936803340912, "learning_rate": 6.549368779602567e-06, "loss": 0.4772, "num_input_tokens_seen": 89249280, "step": 73385 }, { "epoch": 9.195589525122164, "grad_norm": 0.1099310889840126, "learning_rate": 6.548848967011553e-06, "loss": 0.4569, "num_input_tokens_seen": 89255264, "step": 73390 }, { "epoch": 9.196216013030948, "grad_norm": 0.12046652287244797, "learning_rate": 6.548329135902748e-06, "loss": 0.4502, "num_input_tokens_seen": 89260768, "step": 73395 }, { "epoch": 9.196842500939733, "grad_norm": 0.12976522743701935, "learning_rate": 6.547809286282367e-06, "loss": 0.4649, "num_input_tokens_seen": 89267008, "step": 73400 }, { "epoch": 9.197468988848515, "grad_norm": 0.11475366353988647, "learning_rate": 6.547289418156625e-06, "loss": 0.4654, "num_input_tokens_seen": 89273280, "step": 73405 }, { "epoch": 9.198095476757299, "grad_norm": 0.16672037541866302, "learning_rate": 6.546769531531736e-06, "loss": 0.4601, "num_input_tokens_seen": 89279488, "step": 73410 }, { "epoch": 9.198721964666081, "grad_norm": 0.15770262479782104, "learning_rate": 6.546249626413917e-06, "loss": 0.4623, "num_input_tokens_seen": 89285344, "step": 73415 }, { "epoch": 9.199348452574865, "grad_norm": 0.15904350578784943, "learning_rate": 6.545729702809383e-06, "loss": 0.4655, "num_input_tokens_seen": 89291328, "step": 73420 }, { "epoch": 9.19997494048365, "grad_norm": 0.13412252068519592, "learning_rate": 6.545209760724352e-06, "loss": 0.4615, "num_input_tokens_seen": 89297024, "step": 73425 }, { "epoch": 9.200601428392432, "grad_norm": 0.1127658560872078, "learning_rate": 6.544689800165041e-06, "loss": 0.458, "num_input_tokens_seen": 89302528, "step": 73430 }, { "epoch": 9.201227916301216, "grad_norm": 0.12561273574829102, "learning_rate": 6.544169821137661e-06, "loss": 0.4706, "num_input_tokens_seen": 89308704, "step": 73435 }, { "epoch": 9.201854404209998, "grad_norm": 0.14012031257152557, "learning_rate": 6.5436498236484345e-06, "loss": 0.4658, "num_input_tokens_seen": 89315104, "step": 73440 }, { "epoch": 9.202480892118782, "grad_norm": 0.15870100259780884, "learning_rate": 6.543129807703576e-06, "loss": 0.4613, "num_input_tokens_seen": 89321312, "step": 73445 }, { "epoch": 9.203107380027566, "grad_norm": 0.12018439173698425, "learning_rate": 6.542609773309303e-06, "loss": 0.4562, "num_input_tokens_seen": 89327392, "step": 73450 }, { "epoch": 9.203733867936348, "grad_norm": 0.12943771481513977, "learning_rate": 6.542089720471835e-06, "loss": 0.4754, "num_input_tokens_seen": 89333568, "step": 73455 }, { "epoch": 9.204360355845132, "grad_norm": 0.13110598921775818, "learning_rate": 6.5415696491973855e-06, "loss": 0.452, "num_input_tokens_seen": 89339776, "step": 73460 }, { "epoch": 9.204986843753916, "grad_norm": 0.14470721781253815, "learning_rate": 6.5410495594921766e-06, "loss": 0.4672, "num_input_tokens_seen": 89345952, "step": 73465 }, { "epoch": 9.205613331662699, "grad_norm": 0.21749809384346008, "learning_rate": 6.540529451362424e-06, "loss": 0.4606, "num_input_tokens_seen": 89352128, "step": 73470 }, { "epoch": 9.206239819571483, "grad_norm": 0.15827912092208862, "learning_rate": 6.540009324814347e-06, "loss": 0.4547, "num_input_tokens_seen": 89358400, "step": 73475 }, { "epoch": 9.206866307480265, "grad_norm": 0.26852455735206604, "learning_rate": 6.539489179854161e-06, "loss": 0.4576, "num_input_tokens_seen": 89364576, "step": 73480 }, { "epoch": 9.207492795389049, "grad_norm": 0.13758742809295654, "learning_rate": 6.538969016488091e-06, "loss": 0.4609, "num_input_tokens_seen": 89370688, "step": 73485 }, { "epoch": 9.208119283297833, "grad_norm": 0.15811926126480103, "learning_rate": 6.53844883472235e-06, "loss": 0.4584, "num_input_tokens_seen": 89376992, "step": 73490 }, { "epoch": 9.208745771206615, "grad_norm": 0.12487778812646866, "learning_rate": 6.537928634563161e-06, "loss": 0.4587, "num_input_tokens_seen": 89383104, "step": 73495 }, { "epoch": 9.2093722591154, "grad_norm": 0.1040496751666069, "learning_rate": 6.537408416016743e-06, "loss": 0.4612, "num_input_tokens_seen": 89389152, "step": 73500 }, { "epoch": 9.209998747024182, "grad_norm": 0.13787025213241577, "learning_rate": 6.536888179089313e-06, "loss": 0.4718, "num_input_tokens_seen": 89395552, "step": 73505 }, { "epoch": 9.210625234932966, "grad_norm": 0.10989592969417572, "learning_rate": 6.536367923787095e-06, "loss": 0.455, "num_input_tokens_seen": 89401760, "step": 73510 }, { "epoch": 9.21125172284175, "grad_norm": 0.12650057673454285, "learning_rate": 6.535847650116305e-06, "loss": 0.458, "num_input_tokens_seen": 89408160, "step": 73515 }, { "epoch": 9.211878210750532, "grad_norm": 0.09547560662031174, "learning_rate": 6.535327358083165e-06, "loss": 0.4657, "num_input_tokens_seen": 89414304, "step": 73520 }, { "epoch": 9.212504698659316, "grad_norm": 0.12375429272651672, "learning_rate": 6.5348070476938965e-06, "loss": 0.4586, "num_input_tokens_seen": 89420640, "step": 73525 }, { "epoch": 9.213131186568098, "grad_norm": 0.13734489679336548, "learning_rate": 6.5342867189547185e-06, "loss": 0.4728, "num_input_tokens_seen": 89426944, "step": 73530 }, { "epoch": 9.213757674476883, "grad_norm": 0.1226755753159523, "learning_rate": 6.533766371871854e-06, "loss": 0.4604, "num_input_tokens_seen": 89432992, "step": 73535 }, { "epoch": 9.214384162385667, "grad_norm": 0.14289230108261108, "learning_rate": 6.533246006451522e-06, "loss": 0.4602, "num_input_tokens_seen": 89439040, "step": 73540 }, { "epoch": 9.215010650294449, "grad_norm": 0.14214728772640228, "learning_rate": 6.532725622699945e-06, "loss": 0.4553, "num_input_tokens_seen": 89445088, "step": 73545 }, { "epoch": 9.215637138203233, "grad_norm": 0.1498413383960724, "learning_rate": 6.532205220623343e-06, "loss": 0.4632, "num_input_tokens_seen": 89451040, "step": 73550 }, { "epoch": 9.216263626112015, "grad_norm": 0.15958306193351746, "learning_rate": 6.531684800227942e-06, "loss": 0.4634, "num_input_tokens_seen": 89457024, "step": 73555 }, { "epoch": 9.2168901140208, "grad_norm": 0.13424314558506012, "learning_rate": 6.53116436151996e-06, "loss": 0.4608, "num_input_tokens_seen": 89463296, "step": 73560 }, { "epoch": 9.217516601929583, "grad_norm": 0.15129567682743073, "learning_rate": 6.530643904505622e-06, "loss": 0.464, "num_input_tokens_seen": 89469664, "step": 73565 }, { "epoch": 9.218143089838366, "grad_norm": 0.155887633562088, "learning_rate": 6.530123429191149e-06, "loss": 0.465, "num_input_tokens_seen": 89475296, "step": 73570 }, { "epoch": 9.21876957774715, "grad_norm": 0.10712483525276184, "learning_rate": 6.529602935582763e-06, "loss": 0.4619, "num_input_tokens_seen": 89481024, "step": 73575 }, { "epoch": 9.219396065655932, "grad_norm": 0.13437771797180176, "learning_rate": 6.529082423686688e-06, "loss": 0.4542, "num_input_tokens_seen": 89487552, "step": 73580 }, { "epoch": 9.220022553564716, "grad_norm": 0.10215717554092407, "learning_rate": 6.5285618935091475e-06, "loss": 0.4592, "num_input_tokens_seen": 89493504, "step": 73585 }, { "epoch": 9.2206490414735, "grad_norm": 0.11385418474674225, "learning_rate": 6.528041345056364e-06, "loss": 0.4649, "num_input_tokens_seen": 89499520, "step": 73590 }, { "epoch": 9.221275529382282, "grad_norm": 0.1245078593492508, "learning_rate": 6.527520778334561e-06, "loss": 0.4766, "num_input_tokens_seen": 89505696, "step": 73595 }, { "epoch": 9.221902017291066, "grad_norm": 0.11829126626253128, "learning_rate": 6.527000193349964e-06, "loss": 0.4543, "num_input_tokens_seen": 89511712, "step": 73600 }, { "epoch": 9.22252850519985, "grad_norm": 0.11086118221282959, "learning_rate": 6.526479590108794e-06, "loss": 0.4546, "num_input_tokens_seen": 89517824, "step": 73605 }, { "epoch": 9.223154993108633, "grad_norm": 0.17896825075149536, "learning_rate": 6.525958968617279e-06, "loss": 0.464, "num_input_tokens_seen": 89524128, "step": 73610 }, { "epoch": 9.223781481017417, "grad_norm": 0.1224931925535202, "learning_rate": 6.525438328881641e-06, "loss": 0.4681, "num_input_tokens_seen": 89529568, "step": 73615 }, { "epoch": 9.224407968926199, "grad_norm": 0.11343520879745483, "learning_rate": 6.524917670908106e-06, "loss": 0.4601, "num_input_tokens_seen": 89535328, "step": 73620 }, { "epoch": 9.225034456834983, "grad_norm": 0.08794186264276505, "learning_rate": 6.524396994702897e-06, "loss": 0.455, "num_input_tokens_seen": 89541760, "step": 73625 }, { "epoch": 9.225660944743767, "grad_norm": 0.13141323626041412, "learning_rate": 6.523876300272242e-06, "loss": 0.4681, "num_input_tokens_seen": 89547808, "step": 73630 }, { "epoch": 9.22628743265255, "grad_norm": 0.10460850596427917, "learning_rate": 6.523355587622363e-06, "loss": 0.4559, "num_input_tokens_seen": 89553760, "step": 73635 }, { "epoch": 9.226913920561334, "grad_norm": 0.1419346034526825, "learning_rate": 6.522834856759488e-06, "loss": 0.4539, "num_input_tokens_seen": 89560256, "step": 73640 }, { "epoch": 9.227540408470116, "grad_norm": 0.1616881936788559, "learning_rate": 6.5223141076898396e-06, "loss": 0.4655, "num_input_tokens_seen": 89566432, "step": 73645 }, { "epoch": 9.2281668963789, "grad_norm": 0.1523483246564865, "learning_rate": 6.521793340419649e-06, "loss": 0.4685, "num_input_tokens_seen": 89572608, "step": 73650 }, { "epoch": 9.228793384287684, "grad_norm": 0.16981469094753265, "learning_rate": 6.521272554955138e-06, "loss": 0.4621, "num_input_tokens_seen": 89578592, "step": 73655 }, { "epoch": 9.229419872196466, "grad_norm": 0.15270811319351196, "learning_rate": 6.520751751302537e-06, "loss": 0.4549, "num_input_tokens_seen": 89584736, "step": 73660 }, { "epoch": 9.23004636010525, "grad_norm": 0.1177004873752594, "learning_rate": 6.520230929468067e-06, "loss": 0.4686, "num_input_tokens_seen": 89591040, "step": 73665 }, { "epoch": 9.230672848014033, "grad_norm": 0.111255943775177, "learning_rate": 6.51971008945796e-06, "loss": 0.4621, "num_input_tokens_seen": 89597024, "step": 73670 }, { "epoch": 9.231299335922817, "grad_norm": 0.12957896292209625, "learning_rate": 6.51918923127844e-06, "loss": 0.4508, "num_input_tokens_seen": 89603168, "step": 73675 }, { "epoch": 9.2319258238316, "grad_norm": 0.13919247686862946, "learning_rate": 6.518668354935737e-06, "loss": 0.4609, "num_input_tokens_seen": 89609472, "step": 73680 }, { "epoch": 9.232552311740383, "grad_norm": 0.13649505376815796, "learning_rate": 6.518147460436076e-06, "loss": 0.4625, "num_input_tokens_seen": 89615744, "step": 73685 }, { "epoch": 9.233178799649167, "grad_norm": 0.1416570246219635, "learning_rate": 6.517626547785687e-06, "loss": 0.4497, "num_input_tokens_seen": 89621568, "step": 73690 }, { "epoch": 9.23380528755795, "grad_norm": 0.13278773427009583, "learning_rate": 6.5171056169907955e-06, "loss": 0.4643, "num_input_tokens_seen": 89627424, "step": 73695 }, { "epoch": 9.234431775466733, "grad_norm": 0.15720675885677338, "learning_rate": 6.51658466805763e-06, "loss": 0.4645, "num_input_tokens_seen": 89633376, "step": 73700 }, { "epoch": 9.235058263375517, "grad_norm": 0.1522362381219864, "learning_rate": 6.516063700992421e-06, "loss": 0.4545, "num_input_tokens_seen": 89639424, "step": 73705 }, { "epoch": 9.2356847512843, "grad_norm": 0.12214972078800201, "learning_rate": 6.515542715801395e-06, "loss": 0.4648, "num_input_tokens_seen": 89645600, "step": 73710 }, { "epoch": 9.236311239193084, "grad_norm": 0.19031140208244324, "learning_rate": 6.515021712490782e-06, "loss": 0.4646, "num_input_tokens_seen": 89651648, "step": 73715 }, { "epoch": 9.236937727101868, "grad_norm": 0.13190950453281403, "learning_rate": 6.5145006910668115e-06, "loss": 0.4558, "num_input_tokens_seen": 89657760, "step": 73720 }, { "epoch": 9.23756421501065, "grad_norm": 0.12150859832763672, "learning_rate": 6.51397965153571e-06, "loss": 0.4691, "num_input_tokens_seen": 89664096, "step": 73725 }, { "epoch": 9.238190702919434, "grad_norm": 0.19555369019508362, "learning_rate": 6.513458593903711e-06, "loss": 0.4737, "num_input_tokens_seen": 89670560, "step": 73730 }, { "epoch": 9.238817190828216, "grad_norm": 0.12291170656681061, "learning_rate": 6.512937518177041e-06, "loss": 0.4627, "num_input_tokens_seen": 89676992, "step": 73735 }, { "epoch": 9.239443678737, "grad_norm": 0.13325855135917664, "learning_rate": 6.512416424361931e-06, "loss": 0.4631, "num_input_tokens_seen": 89683328, "step": 73740 }, { "epoch": 9.240070166645785, "grad_norm": 0.11190655827522278, "learning_rate": 6.511895312464611e-06, "loss": 0.4656, "num_input_tokens_seen": 89689056, "step": 73745 }, { "epoch": 9.240696654554567, "grad_norm": 0.1283949464559555, "learning_rate": 6.511374182491313e-06, "loss": 0.4659, "num_input_tokens_seen": 89694848, "step": 73750 }, { "epoch": 9.24132314246335, "grad_norm": 0.12101641297340393, "learning_rate": 6.510853034448264e-06, "loss": 0.4622, "num_input_tokens_seen": 89701312, "step": 73755 }, { "epoch": 9.241949630372133, "grad_norm": 0.16054479777812958, "learning_rate": 6.5103318683416975e-06, "loss": 0.4614, "num_input_tokens_seen": 89707552, "step": 73760 }, { "epoch": 9.242576118280917, "grad_norm": 0.1354646384716034, "learning_rate": 6.509810684177843e-06, "loss": 0.4603, "num_input_tokens_seen": 89713440, "step": 73765 }, { "epoch": 9.243202606189701, "grad_norm": 0.11269349604845047, "learning_rate": 6.509289481962932e-06, "loss": 0.4515, "num_input_tokens_seen": 89719808, "step": 73770 }, { "epoch": 9.243829094098484, "grad_norm": 0.14579267799854279, "learning_rate": 6.508768261703198e-06, "loss": 0.4536, "num_input_tokens_seen": 89726048, "step": 73775 }, { "epoch": 9.244455582007268, "grad_norm": 0.09722641110420227, "learning_rate": 6.50824702340487e-06, "loss": 0.4634, "num_input_tokens_seen": 89732128, "step": 73780 }, { "epoch": 9.24508206991605, "grad_norm": 0.13768278062343597, "learning_rate": 6.507725767074181e-06, "loss": 0.4586, "num_input_tokens_seen": 89738016, "step": 73785 }, { "epoch": 9.245708557824834, "grad_norm": 0.16667425632476807, "learning_rate": 6.507204492717363e-06, "loss": 0.4669, "num_input_tokens_seen": 89744032, "step": 73790 }, { "epoch": 9.246335045733618, "grad_norm": 0.14333723485469818, "learning_rate": 6.506683200340647e-06, "loss": 0.4604, "num_input_tokens_seen": 89750016, "step": 73795 }, { "epoch": 9.2469615336424, "grad_norm": 0.12207571417093277, "learning_rate": 6.506161889950267e-06, "loss": 0.4574, "num_input_tokens_seen": 89755968, "step": 73800 }, { "epoch": 9.247588021551184, "grad_norm": 0.1308683454990387, "learning_rate": 6.505640561552455e-06, "loss": 0.4661, "num_input_tokens_seen": 89761504, "step": 73805 }, { "epoch": 9.248214509459967, "grad_norm": 0.17450661957263947, "learning_rate": 6.505119215153446e-06, "loss": 0.4708, "num_input_tokens_seen": 89767904, "step": 73810 }, { "epoch": 9.24884099736875, "grad_norm": 0.15261507034301758, "learning_rate": 6.50459785075947e-06, "loss": 0.4718, "num_input_tokens_seen": 89774368, "step": 73815 }, { "epoch": 9.249467485277535, "grad_norm": 0.11524885147809982, "learning_rate": 6.504076468376762e-06, "loss": 0.4528, "num_input_tokens_seen": 89780384, "step": 73820 }, { "epoch": 9.250093973186317, "grad_norm": 0.08509965240955353, "learning_rate": 6.503555068011555e-06, "loss": 0.4642, "num_input_tokens_seen": 89786208, "step": 73825 }, { "epoch": 9.250720461095101, "grad_norm": 0.1632629930973053, "learning_rate": 6.503033649670084e-06, "loss": 0.4562, "num_input_tokens_seen": 89792256, "step": 73830 }, { "epoch": 9.251346949003885, "grad_norm": 0.13450559973716736, "learning_rate": 6.502512213358581e-06, "loss": 0.4745, "num_input_tokens_seen": 89798272, "step": 73835 }, { "epoch": 9.251973436912667, "grad_norm": 0.10926017165184021, "learning_rate": 6.501990759083282e-06, "loss": 0.4568, "num_input_tokens_seen": 89804512, "step": 73840 }, { "epoch": 9.252599924821451, "grad_norm": 0.13025252521038055, "learning_rate": 6.50146928685042e-06, "loss": 0.4589, "num_input_tokens_seen": 89810880, "step": 73845 }, { "epoch": 9.253226412730234, "grad_norm": 0.16564327478408813, "learning_rate": 6.50094779666623e-06, "loss": 0.468, "num_input_tokens_seen": 89817024, "step": 73850 }, { "epoch": 9.253852900639018, "grad_norm": 0.1238735094666481, "learning_rate": 6.500426288536947e-06, "loss": 0.4662, "num_input_tokens_seen": 89823264, "step": 73855 }, { "epoch": 9.254479388547802, "grad_norm": 0.16676752269268036, "learning_rate": 6.499904762468807e-06, "loss": 0.4628, "num_input_tokens_seen": 89829344, "step": 73860 }, { "epoch": 9.255105876456584, "grad_norm": 0.11126178503036499, "learning_rate": 6.499383218468045e-06, "loss": 0.4518, "num_input_tokens_seen": 89835680, "step": 73865 }, { "epoch": 9.255732364365368, "grad_norm": 0.12918223440647125, "learning_rate": 6.498861656540896e-06, "loss": 0.4586, "num_input_tokens_seen": 89841888, "step": 73870 }, { "epoch": 9.25635885227415, "grad_norm": 0.14707247912883759, "learning_rate": 6.498340076693595e-06, "loss": 0.4588, "num_input_tokens_seen": 89847744, "step": 73875 }, { "epoch": 9.256985340182935, "grad_norm": 0.12345331907272339, "learning_rate": 6.4978184789323785e-06, "loss": 0.4708, "num_input_tokens_seen": 89853824, "step": 73880 }, { "epoch": 9.257611828091719, "grad_norm": 0.12904073297977448, "learning_rate": 6.497296863263484e-06, "loss": 0.4576, "num_input_tokens_seen": 89859968, "step": 73885 }, { "epoch": 9.2582383160005, "grad_norm": 0.17910973727703094, "learning_rate": 6.496775229693145e-06, "loss": 0.4651, "num_input_tokens_seen": 89865984, "step": 73890 }, { "epoch": 9.258864803909285, "grad_norm": 0.11288612335920334, "learning_rate": 6.4962535782276e-06, "loss": 0.4633, "num_input_tokens_seen": 89871648, "step": 73895 }, { "epoch": 9.259491291818067, "grad_norm": 0.1365688592195511, "learning_rate": 6.4957319088730864e-06, "loss": 0.4585, "num_input_tokens_seen": 89877568, "step": 73900 }, { "epoch": 9.260117779726851, "grad_norm": 0.10327693819999695, "learning_rate": 6.4952102216358395e-06, "loss": 0.4664, "num_input_tokens_seen": 89884096, "step": 73905 }, { "epoch": 9.260744267635635, "grad_norm": 0.13793504238128662, "learning_rate": 6.494688516522098e-06, "loss": 0.4571, "num_input_tokens_seen": 89890208, "step": 73910 }, { "epoch": 9.261370755544418, "grad_norm": 0.14308476448059082, "learning_rate": 6.494166793538097e-06, "loss": 0.4655, "num_input_tokens_seen": 89896320, "step": 73915 }, { "epoch": 9.261997243453202, "grad_norm": 0.13142129778862, "learning_rate": 6.493645052690077e-06, "loss": 0.4533, "num_input_tokens_seen": 89902176, "step": 73920 }, { "epoch": 9.262623731361984, "grad_norm": 0.15947657823562622, "learning_rate": 6.4931232939842746e-06, "loss": 0.467, "num_input_tokens_seen": 89908672, "step": 73925 }, { "epoch": 9.263250219270768, "grad_norm": 0.1684110015630722, "learning_rate": 6.492601517426929e-06, "loss": 0.4645, "num_input_tokens_seen": 89914880, "step": 73930 }, { "epoch": 9.263876707179552, "grad_norm": 0.12269199639558792, "learning_rate": 6.492079723024276e-06, "loss": 0.4615, "num_input_tokens_seen": 89921184, "step": 73935 }, { "epoch": 9.264503195088334, "grad_norm": 0.1317284256219864, "learning_rate": 6.491557910782556e-06, "loss": 0.4661, "num_input_tokens_seen": 89926656, "step": 73940 }, { "epoch": 9.265129682997118, "grad_norm": 0.13386808335781097, "learning_rate": 6.491036080708007e-06, "loss": 0.4608, "num_input_tokens_seen": 89932096, "step": 73945 }, { "epoch": 9.265756170905902, "grad_norm": 0.17662878334522247, "learning_rate": 6.490514232806868e-06, "loss": 0.459, "num_input_tokens_seen": 89938176, "step": 73950 }, { "epoch": 9.266382658814685, "grad_norm": 0.13189789652824402, "learning_rate": 6.489992367085377e-06, "loss": 0.4573, "num_input_tokens_seen": 89944288, "step": 73955 }, { "epoch": 9.267009146723469, "grad_norm": 0.1635594666004181, "learning_rate": 6.489470483549776e-06, "loss": 0.4602, "num_input_tokens_seen": 89950400, "step": 73960 }, { "epoch": 9.267635634632251, "grad_norm": 0.11572238057851791, "learning_rate": 6.488948582206303e-06, "loss": 0.461, "num_input_tokens_seen": 89956352, "step": 73965 }, { "epoch": 9.268262122541035, "grad_norm": 0.14365974068641663, "learning_rate": 6.488426663061197e-06, "loss": 0.472, "num_input_tokens_seen": 89962272, "step": 73970 }, { "epoch": 9.26888861044982, "grad_norm": 0.12116272747516632, "learning_rate": 6.487904726120699e-06, "loss": 0.4585, "num_input_tokens_seen": 89968064, "step": 73975 }, { "epoch": 9.269515098358601, "grad_norm": 0.1633618175983429, "learning_rate": 6.48738277139105e-06, "loss": 0.4688, "num_input_tokens_seen": 89974208, "step": 73980 }, { "epoch": 9.270141586267385, "grad_norm": 0.1331067830324173, "learning_rate": 6.486860798878488e-06, "loss": 0.464, "num_input_tokens_seen": 89980288, "step": 73985 }, { "epoch": 9.270768074176168, "grad_norm": 0.1235165148973465, "learning_rate": 6.4863388085892565e-06, "loss": 0.4586, "num_input_tokens_seen": 89986496, "step": 73990 }, { "epoch": 9.271394562084952, "grad_norm": 0.12049207836389542, "learning_rate": 6.485816800529594e-06, "loss": 0.4723, "num_input_tokens_seen": 89992448, "step": 73995 }, { "epoch": 9.272021049993736, "grad_norm": 0.17255030572414398, "learning_rate": 6.485294774705744e-06, "loss": 0.4596, "num_input_tokens_seen": 89998688, "step": 74000 }, { "epoch": 9.272647537902518, "grad_norm": 0.15490476787090302, "learning_rate": 6.484772731123945e-06, "loss": 0.4555, "num_input_tokens_seen": 90004576, "step": 74005 }, { "epoch": 9.273274025811302, "grad_norm": 0.10402064770460129, "learning_rate": 6.484250669790438e-06, "loss": 0.4616, "num_input_tokens_seen": 90010656, "step": 74010 }, { "epoch": 9.273900513720085, "grad_norm": 0.14879384636878967, "learning_rate": 6.483728590711469e-06, "loss": 0.4741, "num_input_tokens_seen": 90016544, "step": 74015 }, { "epoch": 9.274527001628869, "grad_norm": 0.13421668112277985, "learning_rate": 6.483206493893276e-06, "loss": 0.4675, "num_input_tokens_seen": 90022880, "step": 74020 }, { "epoch": 9.275153489537653, "grad_norm": 0.22518697381019592, "learning_rate": 6.482684379342104e-06, "loss": 0.473, "num_input_tokens_seen": 90029024, "step": 74025 }, { "epoch": 9.275779977446435, "grad_norm": 0.103116475045681, "learning_rate": 6.482162247064192e-06, "loss": 0.4637, "num_input_tokens_seen": 90035232, "step": 74030 }, { "epoch": 9.276406465355219, "grad_norm": 0.11543375253677368, "learning_rate": 6.481640097065783e-06, "loss": 0.4585, "num_input_tokens_seen": 90041280, "step": 74035 }, { "epoch": 9.277032953264001, "grad_norm": 0.10924939066171646, "learning_rate": 6.481117929353123e-06, "loss": 0.4588, "num_input_tokens_seen": 90047488, "step": 74040 }, { "epoch": 9.277659441172785, "grad_norm": 0.13164237141609192, "learning_rate": 6.480595743932452e-06, "loss": 0.4598, "num_input_tokens_seen": 90053568, "step": 74045 }, { "epoch": 9.27828592908157, "grad_norm": 0.10174588114023209, "learning_rate": 6.480073540810014e-06, "loss": 0.4755, "num_input_tokens_seen": 90059680, "step": 74050 }, { "epoch": 9.278912416990352, "grad_norm": 0.09404220432043076, "learning_rate": 6.479551319992053e-06, "loss": 0.449, "num_input_tokens_seen": 90065856, "step": 74055 }, { "epoch": 9.279538904899136, "grad_norm": 0.1304474174976349, "learning_rate": 6.479029081484814e-06, "loss": 0.466, "num_input_tokens_seen": 90071648, "step": 74060 }, { "epoch": 9.280165392807918, "grad_norm": 0.11886343359947205, "learning_rate": 6.478506825294535e-06, "loss": 0.4547, "num_input_tokens_seen": 90077632, "step": 74065 }, { "epoch": 9.280791880716702, "grad_norm": 0.18889817595481873, "learning_rate": 6.477984551427465e-06, "loss": 0.4656, "num_input_tokens_seen": 90083840, "step": 74070 }, { "epoch": 9.281418368625486, "grad_norm": 0.11685186624526978, "learning_rate": 6.477462259889849e-06, "loss": 0.4626, "num_input_tokens_seen": 90089120, "step": 74075 }, { "epoch": 9.282044856534268, "grad_norm": 0.11676011234521866, "learning_rate": 6.476939950687927e-06, "loss": 0.4643, "num_input_tokens_seen": 90095456, "step": 74080 }, { "epoch": 9.282671344443052, "grad_norm": 0.13414578139781952, "learning_rate": 6.476417623827947e-06, "loss": 0.4646, "num_input_tokens_seen": 90101024, "step": 74085 }, { "epoch": 9.283297832351835, "grad_norm": 0.11397437751293182, "learning_rate": 6.475895279316155e-06, "loss": 0.4669, "num_input_tokens_seen": 90107072, "step": 74090 }, { "epoch": 9.283924320260619, "grad_norm": 0.09567005932331085, "learning_rate": 6.475372917158792e-06, "loss": 0.4553, "num_input_tokens_seen": 90113184, "step": 74095 }, { "epoch": 9.284550808169403, "grad_norm": 0.0960177555680275, "learning_rate": 6.474850537362105e-06, "loss": 0.4582, "num_input_tokens_seen": 90119392, "step": 74100 }, { "epoch": 9.285177296078185, "grad_norm": 0.14286431670188904, "learning_rate": 6.47432813993234e-06, "loss": 0.4555, "num_input_tokens_seen": 90125728, "step": 74105 }, { "epoch": 9.28580378398697, "grad_norm": 0.13719932734966278, "learning_rate": 6.473805724875744e-06, "loss": 0.4531, "num_input_tokens_seen": 90131840, "step": 74110 }, { "epoch": 9.286430271895753, "grad_norm": 0.1400974839925766, "learning_rate": 6.473283292198559e-06, "loss": 0.4655, "num_input_tokens_seen": 90137984, "step": 74115 }, { "epoch": 9.287056759804535, "grad_norm": 0.14315542578697205, "learning_rate": 6.472760841907036e-06, "loss": 0.4507, "num_input_tokens_seen": 90144448, "step": 74120 }, { "epoch": 9.28768324771332, "grad_norm": 0.13900654017925262, "learning_rate": 6.472238374007418e-06, "loss": 0.4695, "num_input_tokens_seen": 90150048, "step": 74125 }, { "epoch": 9.288309735622102, "grad_norm": 0.10159309208393097, "learning_rate": 6.471715888505952e-06, "loss": 0.4616, "num_input_tokens_seen": 90156000, "step": 74130 }, { "epoch": 9.288936223530886, "grad_norm": 0.11211684346199036, "learning_rate": 6.471193385408886e-06, "loss": 0.4717, "num_input_tokens_seen": 90162176, "step": 74135 }, { "epoch": 9.28956271143967, "grad_norm": 0.09719318896532059, "learning_rate": 6.470670864722466e-06, "loss": 0.4572, "num_input_tokens_seen": 90167872, "step": 74140 }, { "epoch": 9.290189199348452, "grad_norm": 0.13743163645267487, "learning_rate": 6.4701483264529384e-06, "loss": 0.4594, "num_input_tokens_seen": 90173824, "step": 74145 }, { "epoch": 9.290815687257236, "grad_norm": 0.12669497728347778, "learning_rate": 6.4696257706065535e-06, "loss": 0.4654, "num_input_tokens_seen": 90179968, "step": 74150 }, { "epoch": 9.291442175166019, "grad_norm": 0.10095870494842529, "learning_rate": 6.469103197189556e-06, "loss": 0.4673, "num_input_tokens_seen": 90186080, "step": 74155 }, { "epoch": 9.292068663074803, "grad_norm": 0.11200189590454102, "learning_rate": 6.468580606208193e-06, "loss": 0.4606, "num_input_tokens_seen": 90192640, "step": 74160 }, { "epoch": 9.292695150983587, "grad_norm": 0.109260693192482, "learning_rate": 6.4680579976687155e-06, "loss": 0.4645, "num_input_tokens_seen": 90198432, "step": 74165 }, { "epoch": 9.293321638892369, "grad_norm": 0.09684647619724274, "learning_rate": 6.467535371577369e-06, "loss": 0.4675, "num_input_tokens_seen": 90204480, "step": 74170 }, { "epoch": 9.293948126801153, "grad_norm": 0.14773382246494293, "learning_rate": 6.467012727940405e-06, "loss": 0.4623, "num_input_tokens_seen": 90210688, "step": 74175 }, { "epoch": 9.294574614709935, "grad_norm": 0.12175596505403519, "learning_rate": 6.46649006676407e-06, "loss": 0.4639, "num_input_tokens_seen": 90216640, "step": 74180 }, { "epoch": 9.29520110261872, "grad_norm": 0.15879924595355988, "learning_rate": 6.4659673880546144e-06, "loss": 0.4584, "num_input_tokens_seen": 90222880, "step": 74185 }, { "epoch": 9.295827590527503, "grad_norm": 0.13039642572402954, "learning_rate": 6.465444691818286e-06, "loss": 0.4574, "num_input_tokens_seen": 90229120, "step": 74190 }, { "epoch": 9.296454078436286, "grad_norm": 0.11977865546941757, "learning_rate": 6.464921978061332e-06, "loss": 0.4669, "num_input_tokens_seen": 90235296, "step": 74195 }, { "epoch": 9.29708056634507, "grad_norm": 0.14101427793502808, "learning_rate": 6.464399246790006e-06, "loss": 0.4644, "num_input_tokens_seen": 90241632, "step": 74200 }, { "epoch": 9.297707054253852, "grad_norm": 0.12568549811840057, "learning_rate": 6.463876498010555e-06, "loss": 0.4645, "num_input_tokens_seen": 90248032, "step": 74205 }, { "epoch": 9.298333542162636, "grad_norm": 0.14045299589633942, "learning_rate": 6.463353731729231e-06, "loss": 0.465, "num_input_tokens_seen": 90254176, "step": 74210 }, { "epoch": 9.29896003007142, "grad_norm": 0.09833822399377823, "learning_rate": 6.462830947952281e-06, "loss": 0.454, "num_input_tokens_seen": 90260256, "step": 74215 }, { "epoch": 9.299586517980202, "grad_norm": 0.11014393717050552, "learning_rate": 6.462308146685956e-06, "loss": 0.4642, "num_input_tokens_seen": 90266368, "step": 74220 }, { "epoch": 9.300213005888986, "grad_norm": 0.1281408816576004, "learning_rate": 6.4617853279365105e-06, "loss": 0.4699, "num_input_tokens_seen": 90271680, "step": 74225 }, { "epoch": 9.30083949379777, "grad_norm": 0.07564708590507507, "learning_rate": 6.461262491710192e-06, "loss": 0.4551, "num_input_tokens_seen": 90277792, "step": 74230 }, { "epoch": 9.301465981706553, "grad_norm": 0.13065291941165924, "learning_rate": 6.460739638013252e-06, "loss": 0.4642, "num_input_tokens_seen": 90283616, "step": 74235 }, { "epoch": 9.302092469615337, "grad_norm": 0.0805630087852478, "learning_rate": 6.46021676685194e-06, "loss": 0.4698, "num_input_tokens_seen": 90289344, "step": 74240 }, { "epoch": 9.30271895752412, "grad_norm": 0.11412138491868973, "learning_rate": 6.45969387823251e-06, "loss": 0.4569, "num_input_tokens_seen": 90295904, "step": 74245 }, { "epoch": 9.303345445432903, "grad_norm": 0.08944813162088394, "learning_rate": 6.4591709721612125e-06, "loss": 0.459, "num_input_tokens_seen": 90302048, "step": 74250 }, { "epoch": 9.303971933341687, "grad_norm": 0.08382376283407211, "learning_rate": 6.458648048644299e-06, "loss": 0.468, "num_input_tokens_seen": 90308288, "step": 74255 }, { "epoch": 9.30459842125047, "grad_norm": 0.08770307898521423, "learning_rate": 6.458125107688022e-06, "loss": 0.4651, "num_input_tokens_seen": 90314688, "step": 74260 }, { "epoch": 9.305224909159254, "grad_norm": 0.10064638406038284, "learning_rate": 6.4576021492986315e-06, "loss": 0.4627, "num_input_tokens_seen": 90321024, "step": 74265 }, { "epoch": 9.305851397068036, "grad_norm": 0.0813683345913887, "learning_rate": 6.457079173482384e-06, "loss": 0.4681, "num_input_tokens_seen": 90327168, "step": 74270 }, { "epoch": 9.30647788497682, "grad_norm": 0.11086242645978928, "learning_rate": 6.456556180245529e-06, "loss": 0.4607, "num_input_tokens_seen": 90333248, "step": 74275 }, { "epoch": 9.307104372885604, "grad_norm": 0.10360557585954666, "learning_rate": 6.4560331695943216e-06, "loss": 0.4643, "num_input_tokens_seen": 90339648, "step": 74280 }, { "epoch": 9.307730860794386, "grad_norm": 0.13958798348903656, "learning_rate": 6.455510141535011e-06, "loss": 0.4727, "num_input_tokens_seen": 90345504, "step": 74285 }, { "epoch": 9.30835734870317, "grad_norm": 0.09380444139242172, "learning_rate": 6.454987096073855e-06, "loss": 0.4557, "num_input_tokens_seen": 90351648, "step": 74290 }, { "epoch": 9.308983836611953, "grad_norm": 0.11332995444536209, "learning_rate": 6.454464033217104e-06, "loss": 0.4633, "num_input_tokens_seen": 90358080, "step": 74295 }, { "epoch": 9.309610324520737, "grad_norm": 0.12142668664455414, "learning_rate": 6.453940952971013e-06, "loss": 0.4619, "num_input_tokens_seen": 90364000, "step": 74300 }, { "epoch": 9.31023681242952, "grad_norm": 0.10977942496538162, "learning_rate": 6.453417855341836e-06, "loss": 0.4647, "num_input_tokens_seen": 90369664, "step": 74305 }, { "epoch": 9.310863300338303, "grad_norm": 0.13353043794631958, "learning_rate": 6.452894740335825e-06, "loss": 0.4682, "num_input_tokens_seen": 90375424, "step": 74310 }, { "epoch": 9.311489788247087, "grad_norm": 0.1290162056684494, "learning_rate": 6.452371607959238e-06, "loss": 0.4669, "num_input_tokens_seen": 90381504, "step": 74315 }, { "epoch": 9.31211627615587, "grad_norm": 0.10040684044361115, "learning_rate": 6.451848458218325e-06, "loss": 0.4624, "num_input_tokens_seen": 90387680, "step": 74320 }, { "epoch": 9.312742764064653, "grad_norm": 0.11569095402956009, "learning_rate": 6.451325291119344e-06, "loss": 0.4637, "num_input_tokens_seen": 90393600, "step": 74325 }, { "epoch": 9.313369251973437, "grad_norm": 0.08276432752609253, "learning_rate": 6.45080210666855e-06, "loss": 0.463, "num_input_tokens_seen": 90399584, "step": 74330 }, { "epoch": 9.31399573988222, "grad_norm": 0.1035267636179924, "learning_rate": 6.450278904872196e-06, "loss": 0.4632, "num_input_tokens_seen": 90405568, "step": 74335 }, { "epoch": 9.314622227791004, "grad_norm": 0.11609429121017456, "learning_rate": 6.44975568573654e-06, "loss": 0.4646, "num_input_tokens_seen": 90411232, "step": 74340 }, { "epoch": 9.315248715699788, "grad_norm": 0.1295459121465683, "learning_rate": 6.449232449267833e-06, "loss": 0.453, "num_input_tokens_seen": 90417216, "step": 74345 }, { "epoch": 9.31587520360857, "grad_norm": 0.10718721151351929, "learning_rate": 6.448709195472335e-06, "loss": 0.4585, "num_input_tokens_seen": 90423296, "step": 74350 }, { "epoch": 9.316501691517354, "grad_norm": 0.07333242893218994, "learning_rate": 6.448185924356301e-06, "loss": 0.466, "num_input_tokens_seen": 90429664, "step": 74355 }, { "epoch": 9.317128179426136, "grad_norm": 0.11005263030529022, "learning_rate": 6.447662635925987e-06, "loss": 0.4604, "num_input_tokens_seen": 90435776, "step": 74360 }, { "epoch": 9.31775466733492, "grad_norm": 0.08789825439453125, "learning_rate": 6.447139330187648e-06, "loss": 0.4689, "num_input_tokens_seen": 90441888, "step": 74365 }, { "epoch": 9.318381155243705, "grad_norm": 0.12448021024465561, "learning_rate": 6.4466160071475425e-06, "loss": 0.4562, "num_input_tokens_seen": 90447936, "step": 74370 }, { "epoch": 9.319007643152487, "grad_norm": 0.11011305451393127, "learning_rate": 6.446092666811925e-06, "loss": 0.4674, "num_input_tokens_seen": 90453984, "step": 74375 }, { "epoch": 9.319634131061271, "grad_norm": 0.12481747567653656, "learning_rate": 6.4455693091870555e-06, "loss": 0.4699, "num_input_tokens_seen": 90459776, "step": 74380 }, { "epoch": 9.320260618970053, "grad_norm": 0.09129482507705688, "learning_rate": 6.445045934279188e-06, "loss": 0.4582, "num_input_tokens_seen": 90466048, "step": 74385 }, { "epoch": 9.320887106878837, "grad_norm": 0.09838131815195084, "learning_rate": 6.444522542094584e-06, "loss": 0.4608, "num_input_tokens_seen": 90472000, "step": 74390 }, { "epoch": 9.321513594787621, "grad_norm": 0.10954535007476807, "learning_rate": 6.443999132639496e-06, "loss": 0.4608, "num_input_tokens_seen": 90478304, "step": 74395 }, { "epoch": 9.322140082696404, "grad_norm": 0.09731117635965347, "learning_rate": 6.443475705920186e-06, "loss": 0.4606, "num_input_tokens_seen": 90484320, "step": 74400 }, { "epoch": 9.322766570605188, "grad_norm": 0.09912286698818207, "learning_rate": 6.44295226194291e-06, "loss": 0.4674, "num_input_tokens_seen": 90490368, "step": 74405 }, { "epoch": 9.32339305851397, "grad_norm": 0.09215489029884338, "learning_rate": 6.442428800713925e-06, "loss": 0.4621, "num_input_tokens_seen": 90496704, "step": 74410 }, { "epoch": 9.324019546422754, "grad_norm": 0.1694297343492508, "learning_rate": 6.4419053222394915e-06, "loss": 0.4675, "num_input_tokens_seen": 90502752, "step": 74415 }, { "epoch": 9.324646034331538, "grad_norm": 0.11957888305187225, "learning_rate": 6.4413818265258675e-06, "loss": 0.4729, "num_input_tokens_seen": 90509120, "step": 74420 }, { "epoch": 9.32527252224032, "grad_norm": 0.11810345947742462, "learning_rate": 6.440858313579312e-06, "loss": 0.4657, "num_input_tokens_seen": 90514944, "step": 74425 }, { "epoch": 9.325899010149104, "grad_norm": 0.08740519732236862, "learning_rate": 6.440334783406085e-06, "loss": 0.4663, "num_input_tokens_seen": 90520992, "step": 74430 }, { "epoch": 9.326525498057887, "grad_norm": 0.09987130016088486, "learning_rate": 6.439811236012444e-06, "loss": 0.4625, "num_input_tokens_seen": 90526528, "step": 74435 }, { "epoch": 9.32715198596667, "grad_norm": 0.10275008529424667, "learning_rate": 6.439287671404649e-06, "loss": 0.4625, "num_input_tokens_seen": 90532544, "step": 74440 }, { "epoch": 9.327778473875455, "grad_norm": 0.10287674516439438, "learning_rate": 6.43876408958896e-06, "loss": 0.4638, "num_input_tokens_seen": 90538656, "step": 74445 }, { "epoch": 9.328404961784237, "grad_norm": 0.10922963917255402, "learning_rate": 6.438240490571638e-06, "loss": 0.4622, "num_input_tokens_seen": 90544224, "step": 74450 }, { "epoch": 9.329031449693021, "grad_norm": 0.10346392542123795, "learning_rate": 6.437716874358941e-06, "loss": 0.4576, "num_input_tokens_seen": 90550464, "step": 74455 }, { "epoch": 9.329657937601805, "grad_norm": 0.11339592188596725, "learning_rate": 6.4371932409571294e-06, "loss": 0.4564, "num_input_tokens_seen": 90556416, "step": 74460 }, { "epoch": 9.330284425510587, "grad_norm": 0.08082933723926544, "learning_rate": 6.436669590372464e-06, "loss": 0.467, "num_input_tokens_seen": 90562560, "step": 74465 }, { "epoch": 9.330910913419372, "grad_norm": 0.09735685586929321, "learning_rate": 6.436145922611206e-06, "loss": 0.4565, "num_input_tokens_seen": 90568608, "step": 74470 }, { "epoch": 9.331537401328154, "grad_norm": 0.10170052200555801, "learning_rate": 6.435622237679615e-06, "loss": 0.4635, "num_input_tokens_seen": 90574304, "step": 74475 }, { "epoch": 9.332163889236938, "grad_norm": 0.08900697529315948, "learning_rate": 6.4350985355839546e-06, "loss": 0.4581, "num_input_tokens_seen": 90580352, "step": 74480 }, { "epoch": 9.332790377145722, "grad_norm": 0.10650376975536346, "learning_rate": 6.434574816330483e-06, "loss": 0.4642, "num_input_tokens_seen": 90586496, "step": 74485 }, { "epoch": 9.333416865054504, "grad_norm": 0.09485562145709991, "learning_rate": 6.434051079925465e-06, "loss": 0.4606, "num_input_tokens_seen": 90592416, "step": 74490 }, { "epoch": 9.334043352963288, "grad_norm": 0.10506348311901093, "learning_rate": 6.433527326375162e-06, "loss": 0.4649, "num_input_tokens_seen": 90598560, "step": 74495 }, { "epoch": 9.33466984087207, "grad_norm": 0.15292944014072418, "learning_rate": 6.433003555685833e-06, "loss": 0.4573, "num_input_tokens_seen": 90604736, "step": 74500 }, { "epoch": 9.335296328780855, "grad_norm": 0.1808711588382721, "learning_rate": 6.4324797678637405e-06, "loss": 0.4599, "num_input_tokens_seen": 90610720, "step": 74505 }, { "epoch": 9.335922816689639, "grad_norm": 0.09385072439908981, "learning_rate": 6.431955962915149e-06, "loss": 0.4597, "num_input_tokens_seen": 90616448, "step": 74510 }, { "epoch": 9.336549304598421, "grad_norm": 0.06293794512748718, "learning_rate": 6.431432140846321e-06, "loss": 0.4605, "num_input_tokens_seen": 90622240, "step": 74515 }, { "epoch": 9.337175792507205, "grad_norm": 0.09880489110946655, "learning_rate": 6.430908301663517e-06, "loss": 0.4652, "num_input_tokens_seen": 90628256, "step": 74520 }, { "epoch": 9.337802280415987, "grad_norm": 0.12924028933048248, "learning_rate": 6.430384445373002e-06, "loss": 0.4685, "num_input_tokens_seen": 90634176, "step": 74525 }, { "epoch": 9.338428768324771, "grad_norm": 0.13472230732440948, "learning_rate": 6.429860571981037e-06, "loss": 0.4611, "num_input_tokens_seen": 90640320, "step": 74530 }, { "epoch": 9.339055256233555, "grad_norm": 0.10219164192676544, "learning_rate": 6.429336681493887e-06, "loss": 0.4598, "num_input_tokens_seen": 90646016, "step": 74535 }, { "epoch": 9.339681744142338, "grad_norm": 0.1338370144367218, "learning_rate": 6.428812773917816e-06, "loss": 0.4582, "num_input_tokens_seen": 90652416, "step": 74540 }, { "epoch": 9.340308232051122, "grad_norm": 0.16141265630722046, "learning_rate": 6.4282888492590875e-06, "loss": 0.4701, "num_input_tokens_seen": 90658592, "step": 74545 }, { "epoch": 9.340934719959904, "grad_norm": 0.11153819411993027, "learning_rate": 6.427764907523965e-06, "loss": 0.4632, "num_input_tokens_seen": 90664512, "step": 74550 }, { "epoch": 9.341561207868688, "grad_norm": 0.11216104030609131, "learning_rate": 6.427240948718712e-06, "loss": 0.4503, "num_input_tokens_seen": 90670656, "step": 74555 }, { "epoch": 9.342187695777472, "grad_norm": 0.11142222583293915, "learning_rate": 6.4267169728495935e-06, "loss": 0.4611, "num_input_tokens_seen": 90676512, "step": 74560 }, { "epoch": 9.342814183686254, "grad_norm": 0.10608082264661789, "learning_rate": 6.426192979922874e-06, "loss": 0.4693, "num_input_tokens_seen": 90682720, "step": 74565 }, { "epoch": 9.343440671595038, "grad_norm": 0.14878860116004944, "learning_rate": 6.425668969944819e-06, "loss": 0.466, "num_input_tokens_seen": 90688576, "step": 74570 }, { "epoch": 9.34406715950382, "grad_norm": 0.08957789093255997, "learning_rate": 6.425144942921692e-06, "loss": 0.4633, "num_input_tokens_seen": 90694912, "step": 74575 }, { "epoch": 9.344693647412605, "grad_norm": 0.07650629431009293, "learning_rate": 6.424620898859759e-06, "loss": 0.4629, "num_input_tokens_seen": 90700704, "step": 74580 }, { "epoch": 9.345320135321389, "grad_norm": 0.08897958695888519, "learning_rate": 6.424096837765286e-06, "loss": 0.4625, "num_input_tokens_seen": 90706624, "step": 74585 }, { "epoch": 9.345946623230171, "grad_norm": 0.1265297383069992, "learning_rate": 6.4235727596445385e-06, "loss": 0.4565, "num_input_tokens_seen": 90712960, "step": 74590 }, { "epoch": 9.346573111138955, "grad_norm": 0.1363282948732376, "learning_rate": 6.423048664503779e-06, "loss": 0.4624, "num_input_tokens_seen": 90719136, "step": 74595 }, { "epoch": 9.34719959904774, "grad_norm": 0.09056714177131653, "learning_rate": 6.422524552349278e-06, "loss": 0.4644, "num_input_tokens_seen": 90725312, "step": 74600 }, { "epoch": 9.347826086956522, "grad_norm": 0.09858603030443192, "learning_rate": 6.422000423187301e-06, "loss": 0.4601, "num_input_tokens_seen": 90731456, "step": 74605 }, { "epoch": 9.348452574865306, "grad_norm": 0.1190817579627037, "learning_rate": 6.421476277024114e-06, "loss": 0.4638, "num_input_tokens_seen": 90737408, "step": 74610 }, { "epoch": 9.349079062774088, "grad_norm": 0.09999458491802216, "learning_rate": 6.420952113865981e-06, "loss": 0.4566, "num_input_tokens_seen": 90743456, "step": 74615 }, { "epoch": 9.349705550682872, "grad_norm": 0.10470858216285706, "learning_rate": 6.420427933719171e-06, "loss": 0.467, "num_input_tokens_seen": 90750176, "step": 74620 }, { "epoch": 9.350332038591656, "grad_norm": 0.09037944674491882, "learning_rate": 6.419903736589952e-06, "loss": 0.4595, "num_input_tokens_seen": 90756320, "step": 74625 }, { "epoch": 9.350958526500438, "grad_norm": 0.1049598827958107, "learning_rate": 6.4193795224845875e-06, "loss": 0.4668, "num_input_tokens_seen": 90762624, "step": 74630 }, { "epoch": 9.351585014409222, "grad_norm": 0.0994177982211113, "learning_rate": 6.418855291409351e-06, "loss": 0.4514, "num_input_tokens_seen": 90768640, "step": 74635 }, { "epoch": 9.352211502318005, "grad_norm": 0.11660963296890259, "learning_rate": 6.418331043370505e-06, "loss": 0.4556, "num_input_tokens_seen": 90774912, "step": 74640 }, { "epoch": 9.352837990226789, "grad_norm": 0.09894124418497086, "learning_rate": 6.417806778374318e-06, "loss": 0.4635, "num_input_tokens_seen": 90781120, "step": 74645 }, { "epoch": 9.353464478135573, "grad_norm": 0.15087342262268066, "learning_rate": 6.41728249642706e-06, "loss": 0.4541, "num_input_tokens_seen": 90786624, "step": 74650 }, { "epoch": 9.354090966044355, "grad_norm": 0.1407884806394577, "learning_rate": 6.416758197534998e-06, "loss": 0.4658, "num_input_tokens_seen": 90792704, "step": 74655 }, { "epoch": 9.354717453953139, "grad_norm": 0.11978031694889069, "learning_rate": 6.4162338817044e-06, "loss": 0.467, "num_input_tokens_seen": 90798816, "step": 74660 }, { "epoch": 9.355343941861921, "grad_norm": 0.10780199617147446, "learning_rate": 6.415709548941535e-06, "loss": 0.4617, "num_input_tokens_seen": 90804928, "step": 74665 }, { "epoch": 9.355970429770705, "grad_norm": 0.08913058042526245, "learning_rate": 6.4151851992526735e-06, "loss": 0.4581, "num_input_tokens_seen": 90811136, "step": 74670 }, { "epoch": 9.35659691767949, "grad_norm": 0.1294042021036148, "learning_rate": 6.414660832644083e-06, "loss": 0.4601, "num_input_tokens_seen": 90817248, "step": 74675 }, { "epoch": 9.357223405588272, "grad_norm": 0.07914262264966965, "learning_rate": 6.414136449122032e-06, "loss": 0.4688, "num_input_tokens_seen": 90823808, "step": 74680 }, { "epoch": 9.357849893497056, "grad_norm": 0.1271655559539795, "learning_rate": 6.41361204869279e-06, "loss": 0.4661, "num_input_tokens_seen": 90830144, "step": 74685 }, { "epoch": 9.358476381405838, "grad_norm": 0.08716955035924911, "learning_rate": 6.4130876313626285e-06, "loss": 0.4603, "num_input_tokens_seen": 90836352, "step": 74690 }, { "epoch": 9.359102869314622, "grad_norm": 0.120695099234581, "learning_rate": 6.412563197137818e-06, "loss": 0.4601, "num_input_tokens_seen": 90842528, "step": 74695 }, { "epoch": 9.359729357223406, "grad_norm": 0.12871485948562622, "learning_rate": 6.4120387460246244e-06, "loss": 0.463, "num_input_tokens_seen": 90848768, "step": 74700 }, { "epoch": 9.360355845132188, "grad_norm": 0.1289680302143097, "learning_rate": 6.411514278029322e-06, "loss": 0.4622, "num_input_tokens_seen": 90854848, "step": 74705 }, { "epoch": 9.360982333040972, "grad_norm": 0.09803780168294907, "learning_rate": 6.410989793158179e-06, "loss": 0.4576, "num_input_tokens_seen": 90861376, "step": 74710 }, { "epoch": 9.361608820949755, "grad_norm": 0.11751869320869446, "learning_rate": 6.4104652914174655e-06, "loss": 0.4585, "num_input_tokens_seen": 90867616, "step": 74715 }, { "epoch": 9.362235308858539, "grad_norm": 0.13966159522533417, "learning_rate": 6.409940772813454e-06, "loss": 0.4639, "num_input_tokens_seen": 90873952, "step": 74720 }, { "epoch": 9.362861796767323, "grad_norm": 0.12622515857219696, "learning_rate": 6.409416237352417e-06, "loss": 0.4537, "num_input_tokens_seen": 90880032, "step": 74725 }, { "epoch": 9.363488284676105, "grad_norm": 0.12223084270954132, "learning_rate": 6.408891685040624e-06, "loss": 0.4648, "num_input_tokens_seen": 90886016, "step": 74730 }, { "epoch": 9.36411477258489, "grad_norm": 0.1252298504114151, "learning_rate": 6.408367115884343e-06, "loss": 0.4568, "num_input_tokens_seen": 90892192, "step": 74735 }, { "epoch": 9.364741260493673, "grad_norm": 0.12542563676834106, "learning_rate": 6.407842529889852e-06, "loss": 0.4564, "num_input_tokens_seen": 90898272, "step": 74740 }, { "epoch": 9.365367748402456, "grad_norm": 0.11993014812469482, "learning_rate": 6.407317927063419e-06, "loss": 0.4648, "num_input_tokens_seen": 90904576, "step": 74745 }, { "epoch": 9.36599423631124, "grad_norm": 0.10746178030967712, "learning_rate": 6.406793307411316e-06, "loss": 0.4649, "num_input_tokens_seen": 90910784, "step": 74750 }, { "epoch": 9.366620724220022, "grad_norm": 0.12949927151203156, "learning_rate": 6.406268670939818e-06, "loss": 0.4625, "num_input_tokens_seen": 90916960, "step": 74755 }, { "epoch": 9.367247212128806, "grad_norm": 0.09327221661806107, "learning_rate": 6.4057440176551955e-06, "loss": 0.467, "num_input_tokens_seen": 90922752, "step": 74760 }, { "epoch": 9.36787370003759, "grad_norm": 0.1319969743490219, "learning_rate": 6.405219347563722e-06, "loss": 0.4609, "num_input_tokens_seen": 90929056, "step": 74765 }, { "epoch": 9.368500187946372, "grad_norm": 0.1280386596918106, "learning_rate": 6.404694660671669e-06, "loss": 0.468, "num_input_tokens_seen": 90935200, "step": 74770 }, { "epoch": 9.369126675855156, "grad_norm": 0.18919157981872559, "learning_rate": 6.404169956985309e-06, "loss": 0.4559, "num_input_tokens_seen": 90941312, "step": 74775 }, { "epoch": 9.369753163763939, "grad_norm": 0.12473367154598236, "learning_rate": 6.403645236510917e-06, "loss": 0.4615, "num_input_tokens_seen": 90947360, "step": 74780 }, { "epoch": 9.370379651672723, "grad_norm": 0.11412743479013443, "learning_rate": 6.403120499254767e-06, "loss": 0.4572, "num_input_tokens_seen": 90953440, "step": 74785 }, { "epoch": 9.371006139581507, "grad_norm": 0.11529609560966492, "learning_rate": 6.40259574522313e-06, "loss": 0.4638, "num_input_tokens_seen": 90959648, "step": 74790 }, { "epoch": 9.371632627490289, "grad_norm": 0.12715891003608704, "learning_rate": 6.402070974422283e-06, "loss": 0.4602, "num_input_tokens_seen": 90965536, "step": 74795 }, { "epoch": 9.372259115399073, "grad_norm": 0.15029631555080414, "learning_rate": 6.401546186858499e-06, "loss": 0.4541, "num_input_tokens_seen": 90971872, "step": 74800 }, { "epoch": 9.372885603307855, "grad_norm": 0.14804595708847046, "learning_rate": 6.401021382538053e-06, "loss": 0.4521, "num_input_tokens_seen": 90978048, "step": 74805 }, { "epoch": 9.37351209121664, "grad_norm": 0.10215804725885391, "learning_rate": 6.400496561467216e-06, "loss": 0.4639, "num_input_tokens_seen": 90984000, "step": 74810 }, { "epoch": 9.374138579125423, "grad_norm": 0.08504418283700943, "learning_rate": 6.399971723652268e-06, "loss": 0.4661, "num_input_tokens_seen": 90989920, "step": 74815 }, { "epoch": 9.374765067034206, "grad_norm": 0.12184590101242065, "learning_rate": 6.3994468690994795e-06, "loss": 0.4677, "num_input_tokens_seen": 90996256, "step": 74820 }, { "epoch": 9.37539155494299, "grad_norm": 0.09849928319454193, "learning_rate": 6.398921997815126e-06, "loss": 0.4687, "num_input_tokens_seen": 91002464, "step": 74825 }, { "epoch": 9.376018042851772, "grad_norm": 0.1551629602909088, "learning_rate": 6.398397109805485e-06, "loss": 0.4572, "num_input_tokens_seen": 91008352, "step": 74830 }, { "epoch": 9.376644530760556, "grad_norm": 0.09740190953016281, "learning_rate": 6.39787220507683e-06, "loss": 0.4553, "num_input_tokens_seen": 91014272, "step": 74835 }, { "epoch": 9.37727101866934, "grad_norm": 0.11926107108592987, "learning_rate": 6.397347283635438e-06, "loss": 0.4606, "num_input_tokens_seen": 91020320, "step": 74840 }, { "epoch": 9.377897506578122, "grad_norm": 0.11359620839357376, "learning_rate": 6.396822345487585e-06, "loss": 0.4677, "num_input_tokens_seen": 91025920, "step": 74845 }, { "epoch": 9.378523994486907, "grad_norm": 0.12312167137861252, "learning_rate": 6.396297390639546e-06, "loss": 0.4608, "num_input_tokens_seen": 91032160, "step": 74850 }, { "epoch": 9.37915048239569, "grad_norm": 0.12188760191202164, "learning_rate": 6.395772419097597e-06, "loss": 0.458, "num_input_tokens_seen": 91038368, "step": 74855 }, { "epoch": 9.379776970304473, "grad_norm": 0.12495088577270508, "learning_rate": 6.395247430868017e-06, "loss": 0.4687, "num_input_tokens_seen": 91044448, "step": 74860 }, { "epoch": 9.380403458213257, "grad_norm": 0.09061973541975021, "learning_rate": 6.39472242595708e-06, "loss": 0.4628, "num_input_tokens_seen": 91050368, "step": 74865 }, { "epoch": 9.38102994612204, "grad_norm": 0.11089638620615005, "learning_rate": 6.394197404371063e-06, "loss": 0.4539, "num_input_tokens_seen": 91056416, "step": 74870 }, { "epoch": 9.381656434030823, "grad_norm": 0.0957353413105011, "learning_rate": 6.393672366116246e-06, "loss": 0.4588, "num_input_tokens_seen": 91062496, "step": 74875 }, { "epoch": 9.382282921939607, "grad_norm": 0.09938127547502518, "learning_rate": 6.393147311198902e-06, "loss": 0.4577, "num_input_tokens_seen": 91068800, "step": 74880 }, { "epoch": 9.38290940984839, "grad_norm": 0.12351462990045547, "learning_rate": 6.3926222396253105e-06, "loss": 0.4673, "num_input_tokens_seen": 91075008, "step": 74885 }, { "epoch": 9.383535897757174, "grad_norm": 0.1341271698474884, "learning_rate": 6.392097151401751e-06, "loss": 0.4583, "num_input_tokens_seen": 91081216, "step": 74890 }, { "epoch": 9.384162385665956, "grad_norm": 0.09000100195407867, "learning_rate": 6.391572046534498e-06, "loss": 0.465, "num_input_tokens_seen": 91087616, "step": 74895 }, { "epoch": 9.38478887357474, "grad_norm": 0.16632242500782013, "learning_rate": 6.391046925029833e-06, "loss": 0.4704, "num_input_tokens_seen": 91093824, "step": 74900 }, { "epoch": 9.385415361483524, "grad_norm": 0.09824512153863907, "learning_rate": 6.39052178689403e-06, "loss": 0.4593, "num_input_tokens_seen": 91100192, "step": 74905 }, { "epoch": 9.386041849392306, "grad_norm": 0.11552441865205765, "learning_rate": 6.3899966321333716e-06, "loss": 0.4656, "num_input_tokens_seen": 91106400, "step": 74910 }, { "epoch": 9.38666833730109, "grad_norm": 0.11507267504930496, "learning_rate": 6.389471460754134e-06, "loss": 0.4524, "num_input_tokens_seen": 91112896, "step": 74915 }, { "epoch": 9.387294825209873, "grad_norm": 0.12964722514152527, "learning_rate": 6.388946272762598e-06, "loss": 0.4635, "num_input_tokens_seen": 91119072, "step": 74920 }, { "epoch": 9.387921313118657, "grad_norm": 0.12309020012617111, "learning_rate": 6.388421068165041e-06, "loss": 0.4664, "num_input_tokens_seen": 91125216, "step": 74925 }, { "epoch": 9.38854780102744, "grad_norm": 0.11122333258390427, "learning_rate": 6.3878958469677434e-06, "loss": 0.4622, "num_input_tokens_seen": 91131392, "step": 74930 }, { "epoch": 9.389174288936223, "grad_norm": 0.1102067083120346, "learning_rate": 6.387370609176984e-06, "loss": 0.456, "num_input_tokens_seen": 91137440, "step": 74935 }, { "epoch": 9.389800776845007, "grad_norm": 0.12957783043384552, "learning_rate": 6.3868453547990405e-06, "loss": 0.4557, "num_input_tokens_seen": 91143008, "step": 74940 }, { "epoch": 9.39042726475379, "grad_norm": 0.16312609612941742, "learning_rate": 6.386320083840196e-06, "loss": 0.4507, "num_input_tokens_seen": 91149248, "step": 74945 }, { "epoch": 9.391053752662573, "grad_norm": 0.08903006464242935, "learning_rate": 6.38579479630673e-06, "loss": 0.462, "num_input_tokens_seen": 91155488, "step": 74950 }, { "epoch": 9.391680240571358, "grad_norm": 0.07869497686624527, "learning_rate": 6.385269492204922e-06, "loss": 0.4591, "num_input_tokens_seen": 91161792, "step": 74955 }, { "epoch": 9.39230672848014, "grad_norm": 0.15624384582042694, "learning_rate": 6.384744171541053e-06, "loss": 0.4628, "num_input_tokens_seen": 91167840, "step": 74960 }, { "epoch": 9.392933216388924, "grad_norm": 0.11788307875394821, "learning_rate": 6.384218834321402e-06, "loss": 0.4679, "num_input_tokens_seen": 91173920, "step": 74965 }, { "epoch": 9.393559704297708, "grad_norm": 0.1303378790616989, "learning_rate": 6.383693480552251e-06, "loss": 0.4651, "num_input_tokens_seen": 91179616, "step": 74970 }, { "epoch": 9.39418619220649, "grad_norm": 0.16361917555332184, "learning_rate": 6.383168110239882e-06, "loss": 0.468, "num_input_tokens_seen": 91185632, "step": 74975 }, { "epoch": 9.394812680115274, "grad_norm": 0.1349128931760788, "learning_rate": 6.382642723390575e-06, "loss": 0.4477, "num_input_tokens_seen": 91191552, "step": 74980 }, { "epoch": 9.395439168024057, "grad_norm": 0.121790811419487, "learning_rate": 6.382117320010612e-06, "loss": 0.4574, "num_input_tokens_seen": 91197632, "step": 74985 }, { "epoch": 9.39606565593284, "grad_norm": 0.1802046000957489, "learning_rate": 6.381591900106273e-06, "loss": 0.468, "num_input_tokens_seen": 91203872, "step": 74990 }, { "epoch": 9.396692143841625, "grad_norm": 0.1758454293012619, "learning_rate": 6.3810664636838425e-06, "loss": 0.4545, "num_input_tokens_seen": 91210208, "step": 74995 }, { "epoch": 9.397318631750407, "grad_norm": 0.09433939307928085, "learning_rate": 6.380541010749601e-06, "loss": 0.4659, "num_input_tokens_seen": 91216128, "step": 75000 }, { "epoch": 9.397945119659191, "grad_norm": 0.15536914765834808, "learning_rate": 6.380015541309832e-06, "loss": 0.4688, "num_input_tokens_seen": 91222112, "step": 75005 }, { "epoch": 9.398571607567973, "grad_norm": 0.18470622599124908, "learning_rate": 6.379490055370816e-06, "loss": 0.4707, "num_input_tokens_seen": 91228192, "step": 75010 }, { "epoch": 9.399198095476757, "grad_norm": 0.0792800560593605, "learning_rate": 6.378964552938838e-06, "loss": 0.4611, "num_input_tokens_seen": 91234528, "step": 75015 }, { "epoch": 9.399824583385541, "grad_norm": 0.09856153279542923, "learning_rate": 6.378439034020179e-06, "loss": 0.4684, "num_input_tokens_seen": 91240640, "step": 75020 }, { "epoch": 9.400451071294324, "grad_norm": 0.1071290597319603, "learning_rate": 6.37791349862112e-06, "loss": 0.4635, "num_input_tokens_seen": 91246528, "step": 75025 }, { "epoch": 9.401077559203108, "grad_norm": 0.100252665579319, "learning_rate": 6.377387946747948e-06, "loss": 0.4513, "num_input_tokens_seen": 91252768, "step": 75030 }, { "epoch": 9.40170404711189, "grad_norm": 0.10683108121156693, "learning_rate": 6.376862378406945e-06, "loss": 0.4586, "num_input_tokens_seen": 91259008, "step": 75035 }, { "epoch": 9.402330535020674, "grad_norm": 0.11364195495843887, "learning_rate": 6.376336793604396e-06, "loss": 0.4627, "num_input_tokens_seen": 91264896, "step": 75040 }, { "epoch": 9.402957022929458, "grad_norm": 0.15425018966197968, "learning_rate": 6.375811192346581e-06, "loss": 0.4664, "num_input_tokens_seen": 91271008, "step": 75045 }, { "epoch": 9.40358351083824, "grad_norm": 0.13826629519462585, "learning_rate": 6.375285574639789e-06, "loss": 0.4669, "num_input_tokens_seen": 91276928, "step": 75050 }, { "epoch": 9.404209998747024, "grad_norm": 0.0966247022151947, "learning_rate": 6.3747599404903e-06, "loss": 0.4651, "num_input_tokens_seen": 91283424, "step": 75055 }, { "epoch": 9.404836486655807, "grad_norm": 0.11698416620492935, "learning_rate": 6.374234289904401e-06, "loss": 0.4731, "num_input_tokens_seen": 91289472, "step": 75060 }, { "epoch": 9.40546297456459, "grad_norm": 0.10874225199222565, "learning_rate": 6.373708622888375e-06, "loss": 0.4616, "num_input_tokens_seen": 91295648, "step": 75065 }, { "epoch": 9.406089462473375, "grad_norm": 0.1752406507730484, "learning_rate": 6.373182939448508e-06, "loss": 0.4662, "num_input_tokens_seen": 91301536, "step": 75070 }, { "epoch": 9.406715950382157, "grad_norm": 0.11477979272603989, "learning_rate": 6.3726572395910846e-06, "loss": 0.4622, "num_input_tokens_seen": 91307808, "step": 75075 }, { "epoch": 9.407342438290941, "grad_norm": 0.10329923778772354, "learning_rate": 6.3721315233223905e-06, "loss": 0.4553, "num_input_tokens_seen": 91314016, "step": 75080 }, { "epoch": 9.407968926199725, "grad_norm": 0.12196461111307144, "learning_rate": 6.37160579064871e-06, "loss": 0.4606, "num_input_tokens_seen": 91320096, "step": 75085 }, { "epoch": 9.408595414108508, "grad_norm": 0.08574876189231873, "learning_rate": 6.371080041576326e-06, "loss": 0.4621, "num_input_tokens_seen": 91326240, "step": 75090 }, { "epoch": 9.409221902017292, "grad_norm": 0.10870829224586487, "learning_rate": 6.37055427611153e-06, "loss": 0.4722, "num_input_tokens_seen": 91332416, "step": 75095 }, { "epoch": 9.409848389926074, "grad_norm": 0.11121754348278046, "learning_rate": 6.370028494260605e-06, "loss": 0.4697, "num_input_tokens_seen": 91338560, "step": 75100 }, { "epoch": 9.410474877834858, "grad_norm": 0.07027290761470795, "learning_rate": 6.369502696029838e-06, "loss": 0.47, "num_input_tokens_seen": 91344224, "step": 75105 }, { "epoch": 9.411101365743642, "grad_norm": 0.06773881614208221, "learning_rate": 6.3689768814255144e-06, "loss": 0.4633, "num_input_tokens_seen": 91350144, "step": 75110 }, { "epoch": 9.411727853652424, "grad_norm": 0.10272041708230972, "learning_rate": 6.368451050453921e-06, "loss": 0.4557, "num_input_tokens_seen": 91356320, "step": 75115 }, { "epoch": 9.412354341561208, "grad_norm": 0.1209699735045433, "learning_rate": 6.367925203121346e-06, "loss": 0.4658, "num_input_tokens_seen": 91362784, "step": 75120 }, { "epoch": 9.41298082946999, "grad_norm": 0.11985579878091812, "learning_rate": 6.3673993394340735e-06, "loss": 0.4533, "num_input_tokens_seen": 91368640, "step": 75125 }, { "epoch": 9.413607317378775, "grad_norm": 0.09145835787057877, "learning_rate": 6.366873459398394e-06, "loss": 0.4556, "num_input_tokens_seen": 91375104, "step": 75130 }, { "epoch": 9.414233805287559, "grad_norm": 0.07552953064441681, "learning_rate": 6.3663475630205916e-06, "loss": 0.4712, "num_input_tokens_seen": 91381536, "step": 75135 }, { "epoch": 9.414860293196341, "grad_norm": 0.07501155138015747, "learning_rate": 6.365821650306955e-06, "loss": 0.4568, "num_input_tokens_seen": 91387680, "step": 75140 }, { "epoch": 9.415486781105125, "grad_norm": 0.10409694910049438, "learning_rate": 6.365295721263774e-06, "loss": 0.4606, "num_input_tokens_seen": 91393728, "step": 75145 }, { "epoch": 9.416113269013907, "grad_norm": 0.09554636478424072, "learning_rate": 6.364769775897333e-06, "loss": 0.4591, "num_input_tokens_seen": 91399680, "step": 75150 }, { "epoch": 9.416739756922691, "grad_norm": 0.09150873124599457, "learning_rate": 6.3642438142139214e-06, "loss": 0.4636, "num_input_tokens_seen": 91405632, "step": 75155 }, { "epoch": 9.417366244831475, "grad_norm": 0.11674558371305466, "learning_rate": 6.36371783621983e-06, "loss": 0.4778, "num_input_tokens_seen": 91411680, "step": 75160 }, { "epoch": 9.417992732740258, "grad_norm": 0.14560222625732422, "learning_rate": 6.363191841921345e-06, "loss": 0.4686, "num_input_tokens_seen": 91417760, "step": 75165 }, { "epoch": 9.418619220649042, "grad_norm": 0.11372841894626617, "learning_rate": 6.3626658313247545e-06, "loss": 0.4598, "num_input_tokens_seen": 91424192, "step": 75170 }, { "epoch": 9.419245708557824, "grad_norm": 0.0881165936589241, "learning_rate": 6.3621398044363495e-06, "loss": 0.4686, "num_input_tokens_seen": 91430656, "step": 75175 }, { "epoch": 9.419872196466608, "grad_norm": 0.11186309903860092, "learning_rate": 6.361613761262417e-06, "loss": 0.4597, "num_input_tokens_seen": 91436832, "step": 75180 }, { "epoch": 9.420498684375392, "grad_norm": 0.1327688843011856, "learning_rate": 6.361087701809248e-06, "loss": 0.4615, "num_input_tokens_seen": 91442944, "step": 75185 }, { "epoch": 9.421125172284174, "grad_norm": 0.0939004123210907, "learning_rate": 6.36056162608313e-06, "loss": 0.4632, "num_input_tokens_seen": 91448768, "step": 75190 }, { "epoch": 9.421751660192959, "grad_norm": 0.08364865183830261, "learning_rate": 6.360035534090354e-06, "loss": 0.4677, "num_input_tokens_seen": 91454240, "step": 75195 }, { "epoch": 9.42237814810174, "grad_norm": 0.10848481208086014, "learning_rate": 6.359509425837211e-06, "loss": 0.4574, "num_input_tokens_seen": 91460672, "step": 75200 }, { "epoch": 9.423004636010525, "grad_norm": 0.1455342173576355, "learning_rate": 6.3589833013299895e-06, "loss": 0.4617, "num_input_tokens_seen": 91466080, "step": 75205 }, { "epoch": 9.423631123919309, "grad_norm": 0.11570226401090622, "learning_rate": 6.358457160574978e-06, "loss": 0.4584, "num_input_tokens_seen": 91472160, "step": 75210 }, { "epoch": 9.424257611828091, "grad_norm": 0.12644179165363312, "learning_rate": 6.3579310035784715e-06, "loss": 0.451, "num_input_tokens_seen": 91478336, "step": 75215 }, { "epoch": 9.424884099736875, "grad_norm": 0.09426719695329666, "learning_rate": 6.357404830346757e-06, "loss": 0.4588, "num_input_tokens_seen": 91484832, "step": 75220 }, { "epoch": 9.425510587645658, "grad_norm": 0.13579456508159637, "learning_rate": 6.356878640886127e-06, "loss": 0.4621, "num_input_tokens_seen": 91490976, "step": 75225 }, { "epoch": 9.426137075554442, "grad_norm": 0.1473066657781601, "learning_rate": 6.356352435202873e-06, "loss": 0.4651, "num_input_tokens_seen": 91496928, "step": 75230 }, { "epoch": 9.426763563463226, "grad_norm": 0.11227496713399887, "learning_rate": 6.355826213303283e-06, "loss": 0.4682, "num_input_tokens_seen": 91503392, "step": 75235 }, { "epoch": 9.427390051372008, "grad_norm": 0.10386070609092712, "learning_rate": 6.3552999751936525e-06, "loss": 0.4605, "num_input_tokens_seen": 91509696, "step": 75240 }, { "epoch": 9.428016539280792, "grad_norm": 0.1393336057662964, "learning_rate": 6.354773720880271e-06, "loss": 0.4576, "num_input_tokens_seen": 91516032, "step": 75245 }, { "epoch": 9.428643027189576, "grad_norm": 0.1366802453994751, "learning_rate": 6.354247450369429e-06, "loss": 0.4651, "num_input_tokens_seen": 91521952, "step": 75250 }, { "epoch": 9.429269515098358, "grad_norm": 0.16575278341770172, "learning_rate": 6.353721163667423e-06, "loss": 0.4637, "num_input_tokens_seen": 91528320, "step": 75255 }, { "epoch": 9.429896003007142, "grad_norm": 0.12809258699417114, "learning_rate": 6.353194860780541e-06, "loss": 0.4653, "num_input_tokens_seen": 91534144, "step": 75260 }, { "epoch": 9.430522490915925, "grad_norm": 0.09805003553628922, "learning_rate": 6.352668541715076e-06, "loss": 0.4594, "num_input_tokens_seen": 91540288, "step": 75265 }, { "epoch": 9.431148978824709, "grad_norm": 0.09677830338478088, "learning_rate": 6.352142206477322e-06, "loss": 0.4548, "num_input_tokens_seen": 91545984, "step": 75270 }, { "epoch": 9.431775466733493, "grad_norm": 0.10264354199171066, "learning_rate": 6.35161585507357e-06, "loss": 0.464, "num_input_tokens_seen": 91552320, "step": 75275 }, { "epoch": 9.432401954642275, "grad_norm": 0.11340142786502838, "learning_rate": 6.351089487510115e-06, "loss": 0.4582, "num_input_tokens_seen": 91558272, "step": 75280 }, { "epoch": 9.433028442551059, "grad_norm": 0.09315952658653259, "learning_rate": 6.350563103793249e-06, "loss": 0.4606, "num_input_tokens_seen": 91564576, "step": 75285 }, { "epoch": 9.433654930459841, "grad_norm": 0.07820050418376923, "learning_rate": 6.350036703929266e-06, "loss": 0.4636, "num_input_tokens_seen": 91570720, "step": 75290 }, { "epoch": 9.434281418368625, "grad_norm": 0.15261417627334595, "learning_rate": 6.34951028792446e-06, "loss": 0.4627, "num_input_tokens_seen": 91576768, "step": 75295 }, { "epoch": 9.43490790627741, "grad_norm": 0.12960156798362732, "learning_rate": 6.348983855785122e-06, "loss": 0.457, "num_input_tokens_seen": 91582912, "step": 75300 }, { "epoch": 9.435534394186192, "grad_norm": 0.11873956024646759, "learning_rate": 6.348457407517549e-06, "loss": 0.4593, "num_input_tokens_seen": 91588704, "step": 75305 }, { "epoch": 9.436160882094976, "grad_norm": 0.11479832231998444, "learning_rate": 6.347930943128033e-06, "loss": 0.4611, "num_input_tokens_seen": 91595008, "step": 75310 }, { "epoch": 9.436787370003758, "grad_norm": 0.1710909754037857, "learning_rate": 6.347404462622871e-06, "loss": 0.4594, "num_input_tokens_seen": 91601056, "step": 75315 }, { "epoch": 9.437413857912542, "grad_norm": 0.10260666161775589, "learning_rate": 6.346877966008354e-06, "loss": 0.4604, "num_input_tokens_seen": 91607200, "step": 75320 }, { "epoch": 9.438040345821326, "grad_norm": 0.11128681898117065, "learning_rate": 6.34635145329078e-06, "loss": 0.4673, "num_input_tokens_seen": 91613408, "step": 75325 }, { "epoch": 9.438666833730109, "grad_norm": 0.11039716750383377, "learning_rate": 6.345824924476442e-06, "loss": 0.4636, "num_input_tokens_seen": 91619520, "step": 75330 }, { "epoch": 9.439293321638893, "grad_norm": 0.10889864712953568, "learning_rate": 6.345298379571635e-06, "loss": 0.4593, "num_input_tokens_seen": 91624864, "step": 75335 }, { "epoch": 9.439919809547675, "grad_norm": 0.11075257509946823, "learning_rate": 6.344771818582654e-06, "loss": 0.463, "num_input_tokens_seen": 91630976, "step": 75340 }, { "epoch": 9.440546297456459, "grad_norm": 0.10966509580612183, "learning_rate": 6.344245241515798e-06, "loss": 0.4677, "num_input_tokens_seen": 91637344, "step": 75345 }, { "epoch": 9.441172785365243, "grad_norm": 0.12899982929229736, "learning_rate": 6.343718648377358e-06, "loss": 0.4549, "num_input_tokens_seen": 91642848, "step": 75350 }, { "epoch": 9.441799273274025, "grad_norm": 0.09991668164730072, "learning_rate": 6.343192039173631e-06, "loss": 0.4576, "num_input_tokens_seen": 91648416, "step": 75355 }, { "epoch": 9.44242576118281, "grad_norm": 0.14755724370479584, "learning_rate": 6.342665413910915e-06, "loss": 0.4703, "num_input_tokens_seen": 91653632, "step": 75360 }, { "epoch": 9.443052249091593, "grad_norm": 0.1328420341014862, "learning_rate": 6.342138772595505e-06, "loss": 0.4679, "num_input_tokens_seen": 91659712, "step": 75365 }, { "epoch": 9.443678737000376, "grad_norm": 0.09362374246120453, "learning_rate": 6.3416121152336985e-06, "loss": 0.4603, "num_input_tokens_seen": 91665696, "step": 75370 }, { "epoch": 9.44430522490916, "grad_norm": 0.12056561559438705, "learning_rate": 6.341085441831792e-06, "loss": 0.4623, "num_input_tokens_seen": 91671968, "step": 75375 }, { "epoch": 9.444931712817942, "grad_norm": 0.10305933654308319, "learning_rate": 6.340558752396079e-06, "loss": 0.4642, "num_input_tokens_seen": 91678368, "step": 75380 }, { "epoch": 9.445558200726726, "grad_norm": 0.17658504843711853, "learning_rate": 6.34003204693286e-06, "loss": 0.4594, "num_input_tokens_seen": 91684384, "step": 75385 }, { "epoch": 9.44618468863551, "grad_norm": 0.15314821898937225, "learning_rate": 6.339505325448432e-06, "loss": 0.4594, "num_input_tokens_seen": 91690080, "step": 75390 }, { "epoch": 9.446811176544292, "grad_norm": 0.13851211965084076, "learning_rate": 6.33897858794909e-06, "loss": 0.4701, "num_input_tokens_seen": 91696192, "step": 75395 }, { "epoch": 9.447437664453076, "grad_norm": 0.14682786166667938, "learning_rate": 6.338451834441134e-06, "loss": 0.4582, "num_input_tokens_seen": 91702368, "step": 75400 }, { "epoch": 9.448064152361859, "grad_norm": 0.1214507594704628, "learning_rate": 6.337925064930861e-06, "loss": 0.4614, "num_input_tokens_seen": 91708032, "step": 75405 }, { "epoch": 9.448690640270643, "grad_norm": 0.1438727080821991, "learning_rate": 6.337398279424569e-06, "loss": 0.4615, "num_input_tokens_seen": 91713984, "step": 75410 }, { "epoch": 9.449317128179427, "grad_norm": 0.12073704600334167, "learning_rate": 6.336871477928556e-06, "loss": 0.4639, "num_input_tokens_seen": 91720288, "step": 75415 }, { "epoch": 9.449943616088209, "grad_norm": 0.11543520539999008, "learning_rate": 6.336344660449121e-06, "loss": 0.4612, "num_input_tokens_seen": 91726432, "step": 75420 }, { "epoch": 9.450570103996993, "grad_norm": 0.1442807912826538, "learning_rate": 6.335817826992562e-06, "loss": 0.4681, "num_input_tokens_seen": 91732352, "step": 75425 }, { "epoch": 9.451196591905775, "grad_norm": 0.1119665876030922, "learning_rate": 6.335290977565178e-06, "loss": 0.4622, "num_input_tokens_seen": 91738400, "step": 75430 }, { "epoch": 9.45182307981456, "grad_norm": 0.11831645667552948, "learning_rate": 6.3347641121732675e-06, "loss": 0.4715, "num_input_tokens_seen": 91744992, "step": 75435 }, { "epoch": 9.452449567723344, "grad_norm": 0.11698595434427261, "learning_rate": 6.334237230823129e-06, "loss": 0.4565, "num_input_tokens_seen": 91751168, "step": 75440 }, { "epoch": 9.453076055632126, "grad_norm": 0.10847773402929306, "learning_rate": 6.3337103335210635e-06, "loss": 0.4578, "num_input_tokens_seen": 91757120, "step": 75445 }, { "epoch": 9.45370254354091, "grad_norm": 0.10695519298315048, "learning_rate": 6.333183420273369e-06, "loss": 0.4513, "num_input_tokens_seen": 91763392, "step": 75450 }, { "epoch": 9.454329031449692, "grad_norm": 0.11656244844198227, "learning_rate": 6.332656491086347e-06, "loss": 0.4596, "num_input_tokens_seen": 91768736, "step": 75455 }, { "epoch": 9.454955519358476, "grad_norm": 0.12684562802314758, "learning_rate": 6.332129545966295e-06, "loss": 0.4588, "num_input_tokens_seen": 91774656, "step": 75460 }, { "epoch": 9.45558200726726, "grad_norm": 0.1361820101737976, "learning_rate": 6.331602584919514e-06, "loss": 0.4605, "num_input_tokens_seen": 91780768, "step": 75465 }, { "epoch": 9.456208495176043, "grad_norm": 0.09443984180688858, "learning_rate": 6.331075607952305e-06, "loss": 0.4656, "num_input_tokens_seen": 91786880, "step": 75470 }, { "epoch": 9.456834983084827, "grad_norm": 0.09604840725660324, "learning_rate": 6.330548615070968e-06, "loss": 0.4665, "num_input_tokens_seen": 91792992, "step": 75475 }, { "epoch": 9.45746147099361, "grad_norm": 0.11730827391147614, "learning_rate": 6.3300216062818055e-06, "loss": 0.4665, "num_input_tokens_seen": 91798976, "step": 75480 }, { "epoch": 9.458087958902393, "grad_norm": 0.13181614875793457, "learning_rate": 6.3294945815911145e-06, "loss": 0.4697, "num_input_tokens_seen": 91805280, "step": 75485 }, { "epoch": 9.458714446811177, "grad_norm": 0.0706377848982811, "learning_rate": 6.328967541005199e-06, "loss": 0.4642, "num_input_tokens_seen": 91811488, "step": 75490 }, { "epoch": 9.45934093471996, "grad_norm": 0.08883573859930038, "learning_rate": 6.328440484530358e-06, "loss": 0.4729, "num_input_tokens_seen": 91817248, "step": 75495 }, { "epoch": 9.459967422628743, "grad_norm": 0.12721890211105347, "learning_rate": 6.3279134121728955e-06, "loss": 0.4728, "num_input_tokens_seen": 91822560, "step": 75500 }, { "epoch": 9.460593910537527, "grad_norm": 0.09662308543920517, "learning_rate": 6.32738632393911e-06, "loss": 0.4631, "num_input_tokens_seen": 91828768, "step": 75505 }, { "epoch": 9.46122039844631, "grad_norm": 0.14131107926368713, "learning_rate": 6.3268592198353065e-06, "loss": 0.4576, "num_input_tokens_seen": 91834560, "step": 75510 }, { "epoch": 9.461846886355094, "grad_norm": 0.10806895047426224, "learning_rate": 6.326332099867785e-06, "loss": 0.4632, "num_input_tokens_seen": 91840256, "step": 75515 }, { "epoch": 9.462473374263876, "grad_norm": 0.09331411123275757, "learning_rate": 6.325804964042847e-06, "loss": 0.4546, "num_input_tokens_seen": 91846400, "step": 75520 }, { "epoch": 9.46309986217266, "grad_norm": 0.09601806104183197, "learning_rate": 6.325277812366797e-06, "loss": 0.4656, "num_input_tokens_seen": 91852384, "step": 75525 }, { "epoch": 9.463726350081444, "grad_norm": 0.11372210830450058, "learning_rate": 6.324750644845936e-06, "loss": 0.4523, "num_input_tokens_seen": 91858176, "step": 75530 }, { "epoch": 9.464352837990226, "grad_norm": 0.08854923397302628, "learning_rate": 6.324223461486569e-06, "loss": 0.4569, "num_input_tokens_seen": 91864288, "step": 75535 }, { "epoch": 9.46497932589901, "grad_norm": 0.10583797842264175, "learning_rate": 6.323696262294996e-06, "loss": 0.4714, "num_input_tokens_seen": 91870592, "step": 75540 }, { "epoch": 9.465605813807793, "grad_norm": 0.1342461258172989, "learning_rate": 6.323169047277521e-06, "loss": 0.4636, "num_input_tokens_seen": 91876512, "step": 75545 }, { "epoch": 9.466232301716577, "grad_norm": 0.10716597735881805, "learning_rate": 6.322641816440448e-06, "loss": 0.4584, "num_input_tokens_seen": 91882784, "step": 75550 }, { "epoch": 9.46685878962536, "grad_norm": 0.10060059279203415, "learning_rate": 6.322114569790081e-06, "loss": 0.4611, "num_input_tokens_seen": 91888736, "step": 75555 }, { "epoch": 9.467485277534143, "grad_norm": 0.1121855229139328, "learning_rate": 6.321587307332721e-06, "loss": 0.4589, "num_input_tokens_seen": 91895072, "step": 75560 }, { "epoch": 9.468111765442927, "grad_norm": 0.08890000730752945, "learning_rate": 6.321060029074674e-06, "loss": 0.461, "num_input_tokens_seen": 91900960, "step": 75565 }, { "epoch": 9.46873825335171, "grad_norm": 0.07526426762342453, "learning_rate": 6.320532735022244e-06, "loss": 0.4631, "num_input_tokens_seen": 91907296, "step": 75570 }, { "epoch": 9.469364741260494, "grad_norm": 0.09984535723924637, "learning_rate": 6.3200054251817345e-06, "loss": 0.4569, "num_input_tokens_seen": 91913568, "step": 75575 }, { "epoch": 9.469991229169278, "grad_norm": 0.10808990895748138, "learning_rate": 6.3194780995594505e-06, "loss": 0.4589, "num_input_tokens_seen": 91919296, "step": 75580 }, { "epoch": 9.47061771707806, "grad_norm": 0.10672669112682343, "learning_rate": 6.318950758161696e-06, "loss": 0.4631, "num_input_tokens_seen": 91925376, "step": 75585 }, { "epoch": 9.471244204986844, "grad_norm": 0.09960845112800598, "learning_rate": 6.318423400994777e-06, "loss": 0.464, "num_input_tokens_seen": 91931200, "step": 75590 }, { "epoch": 9.471870692895628, "grad_norm": 0.12992125749588013, "learning_rate": 6.3178960280649984e-06, "loss": 0.4624, "num_input_tokens_seen": 91937280, "step": 75595 }, { "epoch": 9.47249718080441, "grad_norm": 0.10546939074993134, "learning_rate": 6.317368639378664e-06, "loss": 0.4708, "num_input_tokens_seen": 91943168, "step": 75600 }, { "epoch": 9.473123668713194, "grad_norm": 0.10613545775413513, "learning_rate": 6.316841234942081e-06, "loss": 0.4613, "num_input_tokens_seen": 91949504, "step": 75605 }, { "epoch": 9.473750156621977, "grad_norm": 0.07024576514959335, "learning_rate": 6.316313814761551e-06, "loss": 0.4594, "num_input_tokens_seen": 91956000, "step": 75610 }, { "epoch": 9.47437664453076, "grad_norm": 0.11686206609010696, "learning_rate": 6.315786378843385e-06, "loss": 0.464, "num_input_tokens_seen": 91961952, "step": 75615 }, { "epoch": 9.475003132439545, "grad_norm": 0.10413695871829987, "learning_rate": 6.315258927193886e-06, "loss": 0.4606, "num_input_tokens_seen": 91968160, "step": 75620 }, { "epoch": 9.475629620348327, "grad_norm": 0.07056587934494019, "learning_rate": 6.314731459819361e-06, "loss": 0.4652, "num_input_tokens_seen": 91974400, "step": 75625 }, { "epoch": 9.476256108257111, "grad_norm": 0.09170665591955185, "learning_rate": 6.314203976726116e-06, "loss": 0.4527, "num_input_tokens_seen": 91980736, "step": 75630 }, { "epoch": 9.476882596165893, "grad_norm": 0.11383622884750366, "learning_rate": 6.3136764779204575e-06, "loss": 0.4635, "num_input_tokens_seen": 91986688, "step": 75635 }, { "epoch": 9.477509084074677, "grad_norm": 0.10979831963777542, "learning_rate": 6.313148963408693e-06, "loss": 0.4535, "num_input_tokens_seen": 91992960, "step": 75640 }, { "epoch": 9.478135571983461, "grad_norm": 0.1230635941028595, "learning_rate": 6.312621433197126e-06, "loss": 0.4684, "num_input_tokens_seen": 91999136, "step": 75645 }, { "epoch": 9.478762059892244, "grad_norm": 0.08125718683004379, "learning_rate": 6.312093887292068e-06, "loss": 0.4646, "num_input_tokens_seen": 92005440, "step": 75650 }, { "epoch": 9.479388547801028, "grad_norm": 0.12240158766508102, "learning_rate": 6.311566325699825e-06, "loss": 0.465, "num_input_tokens_seen": 92011584, "step": 75655 }, { "epoch": 9.48001503570981, "grad_norm": 0.14415118098258972, "learning_rate": 6.311038748426704e-06, "loss": 0.4648, "num_input_tokens_seen": 92017920, "step": 75660 }, { "epoch": 9.480641523618594, "grad_norm": 0.128925621509552, "learning_rate": 6.31051115547901e-06, "loss": 0.4577, "num_input_tokens_seen": 92024064, "step": 75665 }, { "epoch": 9.481268011527378, "grad_norm": 0.08372233808040619, "learning_rate": 6.309983546863055e-06, "loss": 0.4584, "num_input_tokens_seen": 92030496, "step": 75670 }, { "epoch": 9.48189449943616, "grad_norm": 0.1448739767074585, "learning_rate": 6.309455922585146e-06, "loss": 0.4555, "num_input_tokens_seen": 92036768, "step": 75675 }, { "epoch": 9.482520987344945, "grad_norm": 0.09328003972768784, "learning_rate": 6.30892828265159e-06, "loss": 0.4578, "num_input_tokens_seen": 92043040, "step": 75680 }, { "epoch": 9.483147475253727, "grad_norm": 0.0965939611196518, "learning_rate": 6.308400627068696e-06, "loss": 0.47, "num_input_tokens_seen": 92049120, "step": 75685 }, { "epoch": 9.48377396316251, "grad_norm": 0.12660233676433563, "learning_rate": 6.307872955842772e-06, "loss": 0.4531, "num_input_tokens_seen": 92055200, "step": 75690 }, { "epoch": 9.484400451071295, "grad_norm": 0.11379606276750565, "learning_rate": 6.307345268980127e-06, "loss": 0.4608, "num_input_tokens_seen": 92061408, "step": 75695 }, { "epoch": 9.485026938980077, "grad_norm": 0.11027565598487854, "learning_rate": 6.306817566487071e-06, "loss": 0.4716, "num_input_tokens_seen": 92067424, "step": 75700 }, { "epoch": 9.485653426888861, "grad_norm": 0.09867341816425323, "learning_rate": 6.306289848369912e-06, "loss": 0.4682, "num_input_tokens_seen": 92072992, "step": 75705 }, { "epoch": 9.486279914797645, "grad_norm": 0.06338492780923843, "learning_rate": 6.305762114634959e-06, "loss": 0.4559, "num_input_tokens_seen": 92078944, "step": 75710 }, { "epoch": 9.486906402706428, "grad_norm": 0.0701800137758255, "learning_rate": 6.3052343652885225e-06, "loss": 0.4642, "num_input_tokens_seen": 92085056, "step": 75715 }, { "epoch": 9.487532890615212, "grad_norm": 0.1166568174958229, "learning_rate": 6.304706600336912e-06, "loss": 0.4632, "num_input_tokens_seen": 92091168, "step": 75720 }, { "epoch": 9.488159378523994, "grad_norm": 0.10203201323747635, "learning_rate": 6.304178819786436e-06, "loss": 0.4564, "num_input_tokens_seen": 92097216, "step": 75725 }, { "epoch": 9.488785866432778, "grad_norm": 0.13976624608039856, "learning_rate": 6.303651023643408e-06, "loss": 0.4604, "num_input_tokens_seen": 92103168, "step": 75730 }, { "epoch": 9.489412354341562, "grad_norm": 0.08649483323097229, "learning_rate": 6.303123211914135e-06, "loss": 0.4638, "num_input_tokens_seen": 92109120, "step": 75735 }, { "epoch": 9.490038842250344, "grad_norm": 0.0996931865811348, "learning_rate": 6.302595384604928e-06, "loss": 0.4604, "num_input_tokens_seen": 92115264, "step": 75740 }, { "epoch": 9.490665330159128, "grad_norm": 0.08823944628238678, "learning_rate": 6.302067541722097e-06, "loss": 0.4552, "num_input_tokens_seen": 92121280, "step": 75745 }, { "epoch": 9.49129181806791, "grad_norm": 0.11574675142765045, "learning_rate": 6.301539683271956e-06, "loss": 0.4496, "num_input_tokens_seen": 92127008, "step": 75750 }, { "epoch": 9.491918305976695, "grad_norm": 0.1128208339214325, "learning_rate": 6.301011809260813e-06, "loss": 0.4512, "num_input_tokens_seen": 92133376, "step": 75755 }, { "epoch": 9.492544793885479, "grad_norm": 0.08239703625440598, "learning_rate": 6.3004839196949785e-06, "loss": 0.4681, "num_input_tokens_seen": 92138880, "step": 75760 }, { "epoch": 9.493171281794261, "grad_norm": 0.07573986053466797, "learning_rate": 6.299956014580767e-06, "loss": 0.4625, "num_input_tokens_seen": 92144896, "step": 75765 }, { "epoch": 9.493797769703045, "grad_norm": 0.1395128220319748, "learning_rate": 6.299428093924487e-06, "loss": 0.4567, "num_input_tokens_seen": 92151328, "step": 75770 }, { "epoch": 9.494424257611827, "grad_norm": 0.15113577246665955, "learning_rate": 6.298900157732453e-06, "loss": 0.4559, "num_input_tokens_seen": 92157440, "step": 75775 }, { "epoch": 9.495050745520611, "grad_norm": 0.12225599586963654, "learning_rate": 6.298372206010976e-06, "loss": 0.459, "num_input_tokens_seen": 92163296, "step": 75780 }, { "epoch": 9.495677233429396, "grad_norm": 0.11724252253770828, "learning_rate": 6.2978442387663664e-06, "loss": 0.4481, "num_input_tokens_seen": 92169184, "step": 75785 }, { "epoch": 9.496303721338178, "grad_norm": 0.13393893837928772, "learning_rate": 6.2973162560049386e-06, "loss": 0.4564, "num_input_tokens_seen": 92175520, "step": 75790 }, { "epoch": 9.496930209246962, "grad_norm": 0.12289973348379135, "learning_rate": 6.296788257733003e-06, "loss": 0.4603, "num_input_tokens_seen": 92181216, "step": 75795 }, { "epoch": 9.497556697155744, "grad_norm": 0.1342456191778183, "learning_rate": 6.296260243956875e-06, "loss": 0.4673, "num_input_tokens_seen": 92187040, "step": 75800 }, { "epoch": 9.498183185064528, "grad_norm": 0.15271718800067902, "learning_rate": 6.295732214682866e-06, "loss": 0.4574, "num_input_tokens_seen": 92192928, "step": 75805 }, { "epoch": 9.498809672973312, "grad_norm": 0.11847501248121262, "learning_rate": 6.295204169917288e-06, "loss": 0.4572, "num_input_tokens_seen": 92199200, "step": 75810 }, { "epoch": 9.499436160882095, "grad_norm": 0.089853435754776, "learning_rate": 6.2946761096664545e-06, "loss": 0.4662, "num_input_tokens_seen": 92205312, "step": 75815 }, { "epoch": 9.500062648790879, "grad_norm": 0.12410284578800201, "learning_rate": 6.29414803393668e-06, "loss": 0.4617, "num_input_tokens_seen": 92211712, "step": 75820 }, { "epoch": 9.50068913669966, "grad_norm": 0.1158229410648346, "learning_rate": 6.293619942734279e-06, "loss": 0.4668, "num_input_tokens_seen": 92217760, "step": 75825 }, { "epoch": 9.501315624608445, "grad_norm": 0.12744992971420288, "learning_rate": 6.2930918360655624e-06, "loss": 0.4503, "num_input_tokens_seen": 92223648, "step": 75830 }, { "epoch": 9.501942112517229, "grad_norm": 0.16223309934139252, "learning_rate": 6.292563713936847e-06, "loss": 0.4676, "num_input_tokens_seen": 92229728, "step": 75835 }, { "epoch": 9.502568600426011, "grad_norm": 0.12040994316339493, "learning_rate": 6.292035576354445e-06, "loss": 0.4592, "num_input_tokens_seen": 92236384, "step": 75840 }, { "epoch": 9.503195088334795, "grad_norm": 0.1401626020669937, "learning_rate": 6.291507423324672e-06, "loss": 0.4676, "num_input_tokens_seen": 92242592, "step": 75845 }, { "epoch": 9.503821576243578, "grad_norm": 0.16277150809764862, "learning_rate": 6.290979254853842e-06, "loss": 0.4694, "num_input_tokens_seen": 92248800, "step": 75850 }, { "epoch": 9.504448064152362, "grad_norm": 0.14668895304203033, "learning_rate": 6.290451070948269e-06, "loss": 0.4722, "num_input_tokens_seen": 92255008, "step": 75855 }, { "epoch": 9.505074552061146, "grad_norm": 0.14688466489315033, "learning_rate": 6.289922871614269e-06, "loss": 0.4696, "num_input_tokens_seen": 92260896, "step": 75860 }, { "epoch": 9.505701039969928, "grad_norm": 0.08781760931015015, "learning_rate": 6.2893946568581566e-06, "loss": 0.4587, "num_input_tokens_seen": 92267008, "step": 75865 }, { "epoch": 9.506327527878712, "grad_norm": 0.11022518575191498, "learning_rate": 6.288866426686246e-06, "loss": 0.4601, "num_input_tokens_seen": 92272992, "step": 75870 }, { "epoch": 9.506954015787496, "grad_norm": 0.1268978714942932, "learning_rate": 6.288338181104854e-06, "loss": 0.4648, "num_input_tokens_seen": 92279168, "step": 75875 }, { "epoch": 9.507580503696278, "grad_norm": 0.07454905658960342, "learning_rate": 6.287809920120296e-06, "loss": 0.4668, "num_input_tokens_seen": 92285376, "step": 75880 }, { "epoch": 9.508206991605062, "grad_norm": 0.12549187242984772, "learning_rate": 6.287281643738888e-06, "loss": 0.4651, "num_input_tokens_seen": 92291104, "step": 75885 }, { "epoch": 9.508833479513845, "grad_norm": 0.12006890028715134, "learning_rate": 6.286753351966945e-06, "loss": 0.4513, "num_input_tokens_seen": 92297056, "step": 75890 }, { "epoch": 9.509459967422629, "grad_norm": 0.10186372697353363, "learning_rate": 6.286225044810784e-06, "loss": 0.4647, "num_input_tokens_seen": 92303264, "step": 75895 }, { "epoch": 9.510086455331413, "grad_norm": 0.11623332649469376, "learning_rate": 6.285696722276722e-06, "loss": 0.4602, "num_input_tokens_seen": 92308704, "step": 75900 }, { "epoch": 9.510712943240195, "grad_norm": 0.1520615667104721, "learning_rate": 6.285168384371074e-06, "loss": 0.4636, "num_input_tokens_seen": 92314688, "step": 75905 }, { "epoch": 9.51133943114898, "grad_norm": 0.12781712412834167, "learning_rate": 6.284640031100157e-06, "loss": 0.4532, "num_input_tokens_seen": 92320544, "step": 75910 }, { "epoch": 9.511965919057761, "grad_norm": 0.12954476475715637, "learning_rate": 6.28411166247029e-06, "loss": 0.4562, "num_input_tokens_seen": 92326048, "step": 75915 }, { "epoch": 9.512592406966546, "grad_norm": 0.12620104849338531, "learning_rate": 6.283583278487787e-06, "loss": 0.4646, "num_input_tokens_seen": 92332192, "step": 75920 }, { "epoch": 9.51321889487533, "grad_norm": 0.1288682371377945, "learning_rate": 6.283054879158967e-06, "loss": 0.4646, "num_input_tokens_seen": 92338144, "step": 75925 }, { "epoch": 9.513845382784112, "grad_norm": 0.07953226566314697, "learning_rate": 6.282526464490148e-06, "loss": 0.4685, "num_input_tokens_seen": 92344128, "step": 75930 }, { "epoch": 9.514471870692896, "grad_norm": 0.15597496926784515, "learning_rate": 6.281998034487647e-06, "loss": 0.4626, "num_input_tokens_seen": 92350432, "step": 75935 }, { "epoch": 9.515098358601678, "grad_norm": 0.13210120797157288, "learning_rate": 6.281469589157781e-06, "loss": 0.4632, "num_input_tokens_seen": 92356608, "step": 75940 }, { "epoch": 9.515724846510462, "grad_norm": 0.12386088073253632, "learning_rate": 6.2809411285068675e-06, "loss": 0.4635, "num_input_tokens_seen": 92362784, "step": 75945 }, { "epoch": 9.516351334419246, "grad_norm": 0.1661202609539032, "learning_rate": 6.2804126525412276e-06, "loss": 0.4547, "num_input_tokens_seen": 92368832, "step": 75950 }, { "epoch": 9.516977822328029, "grad_norm": 0.11569090187549591, "learning_rate": 6.2798841612671765e-06, "loss": 0.4604, "num_input_tokens_seen": 92374912, "step": 75955 }, { "epoch": 9.517604310236813, "grad_norm": 0.1159089058637619, "learning_rate": 6.279355654691035e-06, "loss": 0.4589, "num_input_tokens_seen": 92380960, "step": 75960 }, { "epoch": 9.518230798145595, "grad_norm": 0.16592101752758026, "learning_rate": 6.278827132819121e-06, "loss": 0.4526, "num_input_tokens_seen": 92387200, "step": 75965 }, { "epoch": 9.518857286054379, "grad_norm": 0.12926441431045532, "learning_rate": 6.278298595657754e-06, "loss": 0.4613, "num_input_tokens_seen": 92393728, "step": 75970 }, { "epoch": 9.519483773963163, "grad_norm": 0.12362872064113617, "learning_rate": 6.277770043213251e-06, "loss": 0.4616, "num_input_tokens_seen": 92400032, "step": 75975 }, { "epoch": 9.520110261871945, "grad_norm": 0.1387583166360855, "learning_rate": 6.277241475491934e-06, "loss": 0.4573, "num_input_tokens_seen": 92406080, "step": 75980 }, { "epoch": 9.52073674978073, "grad_norm": 0.12656410038471222, "learning_rate": 6.276712892500121e-06, "loss": 0.4492, "num_input_tokens_seen": 92411808, "step": 75985 }, { "epoch": 9.521363237689513, "grad_norm": 0.14880448579788208, "learning_rate": 6.276184294244132e-06, "loss": 0.4577, "num_input_tokens_seen": 92418080, "step": 75990 }, { "epoch": 9.521989725598296, "grad_norm": 0.14830459654331207, "learning_rate": 6.275655680730287e-06, "loss": 0.451, "num_input_tokens_seen": 92423968, "step": 75995 }, { "epoch": 9.52261621350708, "grad_norm": 0.1392136961221695, "learning_rate": 6.275127051964906e-06, "loss": 0.465, "num_input_tokens_seen": 92430240, "step": 76000 }, { "epoch": 9.523242701415862, "grad_norm": 0.1236080750823021, "learning_rate": 6.274598407954309e-06, "loss": 0.467, "num_input_tokens_seen": 92435776, "step": 76005 }, { "epoch": 9.523869189324646, "grad_norm": 0.14041467010974884, "learning_rate": 6.274069748704817e-06, "loss": 0.4626, "num_input_tokens_seen": 92441888, "step": 76010 }, { "epoch": 9.52449567723343, "grad_norm": 0.1429181545972824, "learning_rate": 6.2735410742227475e-06, "loss": 0.4666, "num_input_tokens_seen": 92447904, "step": 76015 }, { "epoch": 9.525122165142212, "grad_norm": 0.15331058204174042, "learning_rate": 6.273012384514425e-06, "loss": 0.4717, "num_input_tokens_seen": 92454048, "step": 76020 }, { "epoch": 9.525748653050996, "grad_norm": 0.17974326014518738, "learning_rate": 6.27248367958617e-06, "loss": 0.4659, "num_input_tokens_seen": 92460224, "step": 76025 }, { "epoch": 9.526375140959779, "grad_norm": 0.11831285804510117, "learning_rate": 6.271954959444302e-06, "loss": 0.4701, "num_input_tokens_seen": 92466336, "step": 76030 }, { "epoch": 9.527001628868563, "grad_norm": 0.11975913494825363, "learning_rate": 6.271426224095143e-06, "loss": 0.4624, "num_input_tokens_seen": 92472032, "step": 76035 }, { "epoch": 9.527628116777347, "grad_norm": 0.16561783850193024, "learning_rate": 6.270897473545016e-06, "loss": 0.4594, "num_input_tokens_seen": 92478272, "step": 76040 }, { "epoch": 9.52825460468613, "grad_norm": 0.13990285992622375, "learning_rate": 6.27036870780024e-06, "loss": 0.4638, "num_input_tokens_seen": 92484416, "step": 76045 }, { "epoch": 9.528881092594913, "grad_norm": 0.1113412156701088, "learning_rate": 6.269839926867138e-06, "loss": 0.4658, "num_input_tokens_seen": 92490464, "step": 76050 }, { "epoch": 9.529507580503696, "grad_norm": 0.1291031688451767, "learning_rate": 6.269311130752031e-06, "loss": 0.4563, "num_input_tokens_seen": 92496480, "step": 76055 }, { "epoch": 9.53013406841248, "grad_norm": 0.08376263827085495, "learning_rate": 6.268782319461244e-06, "loss": 0.4587, "num_input_tokens_seen": 92502752, "step": 76060 }, { "epoch": 9.530760556321264, "grad_norm": 0.10836905241012573, "learning_rate": 6.268253493001097e-06, "loss": 0.4668, "num_input_tokens_seen": 92508960, "step": 76065 }, { "epoch": 9.531387044230046, "grad_norm": 0.14606380462646484, "learning_rate": 6.267724651377913e-06, "loss": 0.4537, "num_input_tokens_seen": 92515072, "step": 76070 }, { "epoch": 9.53201353213883, "grad_norm": 0.13862326741218567, "learning_rate": 6.267195794598016e-06, "loss": 0.4683, "num_input_tokens_seen": 92520864, "step": 76075 }, { "epoch": 9.532640020047612, "grad_norm": 0.09777311235666275, "learning_rate": 6.266666922667726e-06, "loss": 0.4705, "num_input_tokens_seen": 92527104, "step": 76080 }, { "epoch": 9.533266507956396, "grad_norm": 0.13123729825019836, "learning_rate": 6.266138035593369e-06, "loss": 0.4684, "num_input_tokens_seen": 92533312, "step": 76085 }, { "epoch": 9.53389299586518, "grad_norm": 0.10948538035154343, "learning_rate": 6.265609133381267e-06, "loss": 0.4736, "num_input_tokens_seen": 92539360, "step": 76090 }, { "epoch": 9.534519483773963, "grad_norm": 0.0996752679347992, "learning_rate": 6.265080216037744e-06, "loss": 0.4655, "num_input_tokens_seen": 92545376, "step": 76095 }, { "epoch": 9.535145971682747, "grad_norm": 0.10314425826072693, "learning_rate": 6.2645512835691245e-06, "loss": 0.4665, "num_input_tokens_seen": 92551424, "step": 76100 }, { "epoch": 9.53577245959153, "grad_norm": 0.09352879226207733, "learning_rate": 6.26402233598173e-06, "loss": 0.4614, "num_input_tokens_seen": 92557344, "step": 76105 }, { "epoch": 9.536398947500313, "grad_norm": 0.11489209532737732, "learning_rate": 6.263493373281886e-06, "loss": 0.4614, "num_input_tokens_seen": 92563616, "step": 76110 }, { "epoch": 9.537025435409097, "grad_norm": 0.0950671136379242, "learning_rate": 6.262964395475917e-06, "loss": 0.4688, "num_input_tokens_seen": 92570208, "step": 76115 }, { "epoch": 9.53765192331788, "grad_norm": 0.12468423694372177, "learning_rate": 6.262435402570146e-06, "loss": 0.4609, "num_input_tokens_seen": 92576640, "step": 76120 }, { "epoch": 9.538278411226663, "grad_norm": 0.15052258968353271, "learning_rate": 6.261906394570898e-06, "loss": 0.4591, "num_input_tokens_seen": 92582368, "step": 76125 }, { "epoch": 9.538904899135447, "grad_norm": 0.16564564406871796, "learning_rate": 6.261377371484499e-06, "loss": 0.4616, "num_input_tokens_seen": 92587872, "step": 76130 }, { "epoch": 9.53953138704423, "grad_norm": 0.11792320013046265, "learning_rate": 6.2608483333172724e-06, "loss": 0.4597, "num_input_tokens_seen": 92593856, "step": 76135 }, { "epoch": 9.540157874953014, "grad_norm": 0.09106933325529099, "learning_rate": 6.260319280075543e-06, "loss": 0.466, "num_input_tokens_seen": 92600064, "step": 76140 }, { "epoch": 9.540784362861796, "grad_norm": 0.11234911531209946, "learning_rate": 6.259790211765638e-06, "loss": 0.4558, "num_input_tokens_seen": 92605920, "step": 76145 }, { "epoch": 9.54141085077058, "grad_norm": 0.09745913743972778, "learning_rate": 6.259261128393883e-06, "loss": 0.4618, "num_input_tokens_seen": 92612096, "step": 76150 }, { "epoch": 9.542037338679364, "grad_norm": 0.10430040955543518, "learning_rate": 6.258732029966601e-06, "loss": 0.4607, "num_input_tokens_seen": 92618336, "step": 76155 }, { "epoch": 9.542663826588146, "grad_norm": 0.10866132378578186, "learning_rate": 6.25820291649012e-06, "loss": 0.4476, "num_input_tokens_seen": 92624480, "step": 76160 }, { "epoch": 9.54329031449693, "grad_norm": 0.10270295292139053, "learning_rate": 6.257673787970766e-06, "loss": 0.4684, "num_input_tokens_seen": 92630560, "step": 76165 }, { "epoch": 9.543916802405713, "grad_norm": 0.09808550029993057, "learning_rate": 6.2571446444148635e-06, "loss": 0.4606, "num_input_tokens_seen": 92636544, "step": 76170 }, { "epoch": 9.544543290314497, "grad_norm": 0.15199841558933258, "learning_rate": 6.25661548582874e-06, "loss": 0.4603, "num_input_tokens_seen": 92642752, "step": 76175 }, { "epoch": 9.545169778223281, "grad_norm": 0.09874744713306427, "learning_rate": 6.256086312218721e-06, "loss": 0.4602, "num_input_tokens_seen": 92648832, "step": 76180 }, { "epoch": 9.545796266132063, "grad_norm": 0.12934210896492004, "learning_rate": 6.255557123591135e-06, "loss": 0.4639, "num_input_tokens_seen": 92654880, "step": 76185 }, { "epoch": 9.546422754040847, "grad_norm": 0.14835020899772644, "learning_rate": 6.255027919952308e-06, "loss": 0.449, "num_input_tokens_seen": 92661056, "step": 76190 }, { "epoch": 9.54704924194963, "grad_norm": 0.11319519579410553, "learning_rate": 6.254498701308567e-06, "loss": 0.452, "num_input_tokens_seen": 92666944, "step": 76195 }, { "epoch": 9.547675729858414, "grad_norm": 0.08564583957195282, "learning_rate": 6.253969467666239e-06, "loss": 0.4583, "num_input_tokens_seen": 92672384, "step": 76200 }, { "epoch": 9.548302217767198, "grad_norm": 0.10353074967861176, "learning_rate": 6.2534402190316515e-06, "loss": 0.4584, "num_input_tokens_seen": 92678272, "step": 76205 }, { "epoch": 9.54892870567598, "grad_norm": 0.1503334939479828, "learning_rate": 6.252910955411132e-06, "loss": 0.4667, "num_input_tokens_seen": 92684480, "step": 76210 }, { "epoch": 9.549555193584764, "grad_norm": 0.14492946863174438, "learning_rate": 6.25238167681101e-06, "loss": 0.4563, "num_input_tokens_seen": 92690688, "step": 76215 }, { "epoch": 9.550181681493548, "grad_norm": 0.1360534429550171, "learning_rate": 6.251852383237611e-06, "loss": 0.471, "num_input_tokens_seen": 92696576, "step": 76220 }, { "epoch": 9.55080816940233, "grad_norm": 0.12630821764469147, "learning_rate": 6.251323074697265e-06, "loss": 0.4584, "num_input_tokens_seen": 92702784, "step": 76225 }, { "epoch": 9.551434657311114, "grad_norm": 0.12509676814079285, "learning_rate": 6.250793751196299e-06, "loss": 0.4587, "num_input_tokens_seen": 92708768, "step": 76230 }, { "epoch": 9.552061145219897, "grad_norm": 0.10128424316644669, "learning_rate": 6.250264412741043e-06, "loss": 0.4618, "num_input_tokens_seen": 92715072, "step": 76235 }, { "epoch": 9.55268763312868, "grad_norm": 0.11189640313386917, "learning_rate": 6.249735059337824e-06, "loss": 0.4548, "num_input_tokens_seen": 92721664, "step": 76240 }, { "epoch": 9.553314121037465, "grad_norm": 0.11795040220022202, "learning_rate": 6.249205690992972e-06, "loss": 0.4602, "num_input_tokens_seen": 92727264, "step": 76245 }, { "epoch": 9.553940608946247, "grad_norm": 0.11343535780906677, "learning_rate": 6.248676307712815e-06, "loss": 0.4589, "num_input_tokens_seen": 92733312, "step": 76250 }, { "epoch": 9.554567096855031, "grad_norm": 0.1413431316614151, "learning_rate": 6.248146909503684e-06, "loss": 0.4503, "num_input_tokens_seen": 92739744, "step": 76255 }, { "epoch": 9.555193584763813, "grad_norm": 0.12199513614177704, "learning_rate": 6.247617496371907e-06, "loss": 0.4733, "num_input_tokens_seen": 92746048, "step": 76260 }, { "epoch": 9.555820072672597, "grad_norm": 0.15610992908477783, "learning_rate": 6.2470880683238124e-06, "loss": 0.4638, "num_input_tokens_seen": 92752032, "step": 76265 }, { "epoch": 9.556446560581382, "grad_norm": 0.10653240978717804, "learning_rate": 6.2465586253657325e-06, "loss": 0.4741, "num_input_tokens_seen": 92758208, "step": 76270 }, { "epoch": 9.557073048490164, "grad_norm": 0.16817685961723328, "learning_rate": 6.246029167503998e-06, "loss": 0.4576, "num_input_tokens_seen": 92763872, "step": 76275 }, { "epoch": 9.557699536398948, "grad_norm": 0.111089788377285, "learning_rate": 6.2454996947449355e-06, "loss": 0.4548, "num_input_tokens_seen": 92769952, "step": 76280 }, { "epoch": 9.55832602430773, "grad_norm": 0.1268380880355835, "learning_rate": 6.244970207094876e-06, "loss": 0.4686, "num_input_tokens_seen": 92776000, "step": 76285 }, { "epoch": 9.558952512216514, "grad_norm": 0.11666576564311981, "learning_rate": 6.2444407045601515e-06, "loss": 0.4661, "num_input_tokens_seen": 92782176, "step": 76290 }, { "epoch": 9.559579000125298, "grad_norm": 0.1533079594373703, "learning_rate": 6.243911187147094e-06, "loss": 0.4596, "num_input_tokens_seen": 92788448, "step": 76295 }, { "epoch": 9.56020548803408, "grad_norm": 0.11185850203037262, "learning_rate": 6.2433816548620305e-06, "loss": 0.4668, "num_input_tokens_seen": 92794528, "step": 76300 }, { "epoch": 9.560831975942865, "grad_norm": 0.09558592736721039, "learning_rate": 6.242852107711295e-06, "loss": 0.4584, "num_input_tokens_seen": 92800416, "step": 76305 }, { "epoch": 9.561458463851647, "grad_norm": 0.1382606029510498, "learning_rate": 6.242322545701217e-06, "loss": 0.4666, "num_input_tokens_seen": 92806624, "step": 76310 }, { "epoch": 9.562084951760431, "grad_norm": 0.1210220530629158, "learning_rate": 6.241792968838128e-06, "loss": 0.4544, "num_input_tokens_seen": 92812736, "step": 76315 }, { "epoch": 9.562711439669215, "grad_norm": 0.104820117354393, "learning_rate": 6.241263377128361e-06, "loss": 0.4619, "num_input_tokens_seen": 92818528, "step": 76320 }, { "epoch": 9.563337927577997, "grad_norm": 0.08235560357570648, "learning_rate": 6.240733770578245e-06, "loss": 0.4637, "num_input_tokens_seen": 92824032, "step": 76325 }, { "epoch": 9.563964415486781, "grad_norm": 0.10794967412948608, "learning_rate": 6.240204149194116e-06, "loss": 0.4585, "num_input_tokens_seen": 92830240, "step": 76330 }, { "epoch": 9.564590903395565, "grad_norm": 0.12688462436199188, "learning_rate": 6.239674512982304e-06, "loss": 0.4569, "num_input_tokens_seen": 92836864, "step": 76335 }, { "epoch": 9.565217391304348, "grad_norm": 0.1158306747674942, "learning_rate": 6.239144861949138e-06, "loss": 0.4613, "num_input_tokens_seen": 92842880, "step": 76340 }, { "epoch": 9.565843879213132, "grad_norm": 0.1368001401424408, "learning_rate": 6.238615196100955e-06, "loss": 0.4638, "num_input_tokens_seen": 92849184, "step": 76345 }, { "epoch": 9.566470367121914, "grad_norm": 0.15393678843975067, "learning_rate": 6.2380855154440855e-06, "loss": 0.4666, "num_input_tokens_seen": 92855296, "step": 76350 }, { "epoch": 9.567096855030698, "grad_norm": 0.12591566145420074, "learning_rate": 6.237555819984864e-06, "loss": 0.4564, "num_input_tokens_seen": 92861280, "step": 76355 }, { "epoch": 9.56772334293948, "grad_norm": 0.1504879742860794, "learning_rate": 6.237026109729622e-06, "loss": 0.4722, "num_input_tokens_seen": 92867328, "step": 76360 }, { "epoch": 9.568349830848264, "grad_norm": 0.11170925945043564, "learning_rate": 6.236496384684691e-06, "loss": 0.4534, "num_input_tokens_seen": 92873888, "step": 76365 }, { "epoch": 9.568976318757048, "grad_norm": 0.1456146240234375, "learning_rate": 6.2359666448564086e-06, "loss": 0.463, "num_input_tokens_seen": 92880032, "step": 76370 }, { "epoch": 9.56960280666583, "grad_norm": 0.1812434196472168, "learning_rate": 6.235436890251104e-06, "loss": 0.468, "num_input_tokens_seen": 92886432, "step": 76375 }, { "epoch": 9.570229294574615, "grad_norm": 0.15308018028736115, "learning_rate": 6.234907120875112e-06, "loss": 0.4584, "num_input_tokens_seen": 92892704, "step": 76380 }, { "epoch": 9.570855782483399, "grad_norm": 0.1633504033088684, "learning_rate": 6.23437733673477e-06, "loss": 0.4625, "num_input_tokens_seen": 92898912, "step": 76385 }, { "epoch": 9.571482270392181, "grad_norm": 0.1617157906293869, "learning_rate": 6.233847537836405e-06, "loss": 0.4577, "num_input_tokens_seen": 92905216, "step": 76390 }, { "epoch": 9.572108758300965, "grad_norm": 0.1327587068080902, "learning_rate": 6.233317724186359e-06, "loss": 0.4659, "num_input_tokens_seen": 92911744, "step": 76395 }, { "epoch": 9.572735246209747, "grad_norm": 0.19740208983421326, "learning_rate": 6.232787895790961e-06, "loss": 0.4563, "num_input_tokens_seen": 92917856, "step": 76400 }, { "epoch": 9.573361734118532, "grad_norm": 0.13058452308177948, "learning_rate": 6.232258052656548e-06, "loss": 0.4592, "num_input_tokens_seen": 92924032, "step": 76405 }, { "epoch": 9.573988222027316, "grad_norm": 0.15712693333625793, "learning_rate": 6.231728194789454e-06, "loss": 0.4673, "num_input_tokens_seen": 92929632, "step": 76410 }, { "epoch": 9.574614709936098, "grad_norm": 0.21107806265354156, "learning_rate": 6.231198322196013e-06, "loss": 0.4564, "num_input_tokens_seen": 92935936, "step": 76415 }, { "epoch": 9.575241197844882, "grad_norm": 0.18884557485580444, "learning_rate": 6.230668434882562e-06, "loss": 0.4723, "num_input_tokens_seen": 92942080, "step": 76420 }, { "epoch": 9.575867685753664, "grad_norm": 0.1565759927034378, "learning_rate": 6.230138532855434e-06, "loss": 0.4637, "num_input_tokens_seen": 92948512, "step": 76425 }, { "epoch": 9.576494173662448, "grad_norm": 0.15544529259204865, "learning_rate": 6.229608616120967e-06, "loss": 0.46, "num_input_tokens_seen": 92954656, "step": 76430 }, { "epoch": 9.577120661571232, "grad_norm": 0.1360176056623459, "learning_rate": 6.229078684685493e-06, "loss": 0.4648, "num_input_tokens_seen": 92960608, "step": 76435 }, { "epoch": 9.577747149480015, "grad_norm": 0.17613768577575684, "learning_rate": 6.228548738555352e-06, "loss": 0.4664, "num_input_tokens_seen": 92966304, "step": 76440 }, { "epoch": 9.578373637388799, "grad_norm": 0.11623570322990417, "learning_rate": 6.228018777736877e-06, "loss": 0.4635, "num_input_tokens_seen": 92971904, "step": 76445 }, { "epoch": 9.579000125297581, "grad_norm": 0.12821967899799347, "learning_rate": 6.227488802236404e-06, "loss": 0.4619, "num_input_tokens_seen": 92977792, "step": 76450 }, { "epoch": 9.579626613206365, "grad_norm": 0.1496504545211792, "learning_rate": 6.226958812060273e-06, "loss": 0.4593, "num_input_tokens_seen": 92984000, "step": 76455 }, { "epoch": 9.580253101115149, "grad_norm": 0.15063859522342682, "learning_rate": 6.226428807214816e-06, "loss": 0.4612, "num_input_tokens_seen": 92990080, "step": 76460 }, { "epoch": 9.580879589023931, "grad_norm": 0.19422781467437744, "learning_rate": 6.225898787706373e-06, "loss": 0.4712, "num_input_tokens_seen": 92996352, "step": 76465 }, { "epoch": 9.581506076932715, "grad_norm": 0.14398019015789032, "learning_rate": 6.225368753541278e-06, "loss": 0.4599, "num_input_tokens_seen": 93002944, "step": 76470 }, { "epoch": 9.582132564841498, "grad_norm": 0.12777310609817505, "learning_rate": 6.22483870472587e-06, "loss": 0.4731, "num_input_tokens_seen": 93009152, "step": 76475 }, { "epoch": 9.582759052750282, "grad_norm": 0.1758151799440384, "learning_rate": 6.224308641266485e-06, "loss": 0.4572, "num_input_tokens_seen": 93015168, "step": 76480 }, { "epoch": 9.583385540659066, "grad_norm": 0.20505282282829285, "learning_rate": 6.2237785631694615e-06, "loss": 0.4661, "num_input_tokens_seen": 93021408, "step": 76485 }, { "epoch": 9.584012028567848, "grad_norm": 0.13482201099395752, "learning_rate": 6.223248470441135e-06, "loss": 0.4766, "num_input_tokens_seen": 93027840, "step": 76490 }, { "epoch": 9.584638516476632, "grad_norm": 0.12286823242902756, "learning_rate": 6.222718363087845e-06, "loss": 0.456, "num_input_tokens_seen": 93033920, "step": 76495 }, { "epoch": 9.585265004385416, "grad_norm": 0.11148283630609512, "learning_rate": 6.2221882411159295e-06, "loss": 0.4581, "num_input_tokens_seen": 93039712, "step": 76500 }, { "epoch": 9.585891492294198, "grad_norm": 0.1382014900445938, "learning_rate": 6.221658104531727e-06, "loss": 0.4519, "num_input_tokens_seen": 93045792, "step": 76505 }, { "epoch": 9.586517980202983, "grad_norm": 0.10136283189058304, "learning_rate": 6.2211279533415725e-06, "loss": 0.453, "num_input_tokens_seen": 93052064, "step": 76510 }, { "epoch": 9.587144468111765, "grad_norm": 0.10108080506324768, "learning_rate": 6.220597787551807e-06, "loss": 0.4599, "num_input_tokens_seen": 93058048, "step": 76515 }, { "epoch": 9.587770956020549, "grad_norm": 0.11375660449266434, "learning_rate": 6.22006760716877e-06, "loss": 0.4661, "num_input_tokens_seen": 93064256, "step": 76520 }, { "epoch": 9.588397443929333, "grad_norm": 0.11993240565061569, "learning_rate": 6.2195374121988e-06, "loss": 0.4711, "num_input_tokens_seen": 93070496, "step": 76525 }, { "epoch": 9.589023931838115, "grad_norm": 0.15855993330478668, "learning_rate": 6.219007202648233e-06, "loss": 0.4569, "num_input_tokens_seen": 93076672, "step": 76530 }, { "epoch": 9.5896504197469, "grad_norm": 0.14140500128269196, "learning_rate": 6.218476978523411e-06, "loss": 0.458, "num_input_tokens_seen": 93082496, "step": 76535 }, { "epoch": 9.590276907655682, "grad_norm": 0.15236595273017883, "learning_rate": 6.2179467398306704e-06, "loss": 0.4656, "num_input_tokens_seen": 93088480, "step": 76540 }, { "epoch": 9.590903395564466, "grad_norm": 0.12722308933734894, "learning_rate": 6.217416486576354e-06, "loss": 0.4672, "num_input_tokens_seen": 93094176, "step": 76545 }, { "epoch": 9.59152988347325, "grad_norm": 0.14294175803661346, "learning_rate": 6.2168862187668e-06, "loss": 0.4718, "num_input_tokens_seen": 93099744, "step": 76550 }, { "epoch": 9.592156371382032, "grad_norm": 0.15331876277923584, "learning_rate": 6.216355936408348e-06, "loss": 0.4583, "num_input_tokens_seen": 93105952, "step": 76555 }, { "epoch": 9.592782859290816, "grad_norm": 0.13113421201705933, "learning_rate": 6.215825639507338e-06, "loss": 0.4619, "num_input_tokens_seen": 93112320, "step": 76560 }, { "epoch": 9.593409347199598, "grad_norm": 0.1238555833697319, "learning_rate": 6.215295328070111e-06, "loss": 0.4657, "num_input_tokens_seen": 93118656, "step": 76565 }, { "epoch": 9.594035835108382, "grad_norm": 0.10984202474355698, "learning_rate": 6.214765002103006e-06, "loss": 0.4687, "num_input_tokens_seen": 93124640, "step": 76570 }, { "epoch": 9.594662323017166, "grad_norm": 0.12224358320236206, "learning_rate": 6.214234661612364e-06, "loss": 0.4595, "num_input_tokens_seen": 93130880, "step": 76575 }, { "epoch": 9.595288810925949, "grad_norm": 0.10156775265932083, "learning_rate": 6.213704306604525e-06, "loss": 0.4537, "num_input_tokens_seen": 93136480, "step": 76580 }, { "epoch": 9.595915298834733, "grad_norm": 0.12119533866643906, "learning_rate": 6.213173937085832e-06, "loss": 0.4601, "num_input_tokens_seen": 93142848, "step": 76585 }, { "epoch": 9.596541786743515, "grad_norm": 0.12977826595306396, "learning_rate": 6.212643553062624e-06, "loss": 0.4531, "num_input_tokens_seen": 93149248, "step": 76590 }, { "epoch": 9.597168274652299, "grad_norm": 0.11156997829675674, "learning_rate": 6.212113154541242e-06, "loss": 0.4627, "num_input_tokens_seen": 93155616, "step": 76595 }, { "epoch": 9.597794762561083, "grad_norm": 0.1406545639038086, "learning_rate": 6.211582741528029e-06, "loss": 0.4565, "num_input_tokens_seen": 93161984, "step": 76600 }, { "epoch": 9.598421250469865, "grad_norm": 0.14488494396209717, "learning_rate": 6.211052314029325e-06, "loss": 0.4645, "num_input_tokens_seen": 93168096, "step": 76605 }, { "epoch": 9.59904773837865, "grad_norm": 0.1239573061466217, "learning_rate": 6.210521872051474e-06, "loss": 0.4679, "num_input_tokens_seen": 93174016, "step": 76610 }, { "epoch": 9.599674226287433, "grad_norm": 0.17398175597190857, "learning_rate": 6.2099914156008154e-06, "loss": 0.4599, "num_input_tokens_seen": 93179776, "step": 76615 }, { "epoch": 9.600300714196216, "grad_norm": 0.1030040979385376, "learning_rate": 6.209460944683692e-06, "loss": 0.4618, "num_input_tokens_seen": 93185792, "step": 76620 }, { "epoch": 9.600927202105, "grad_norm": 0.11002933233976364, "learning_rate": 6.208930459306446e-06, "loss": 0.4624, "num_input_tokens_seen": 93192192, "step": 76625 }, { "epoch": 9.601553690013782, "grad_norm": 0.1495310515165329, "learning_rate": 6.208399959475421e-06, "loss": 0.4626, "num_input_tokens_seen": 93198368, "step": 76630 }, { "epoch": 9.602180177922566, "grad_norm": 0.17447219789028168, "learning_rate": 6.207869445196957e-06, "loss": 0.4609, "num_input_tokens_seen": 93204224, "step": 76635 }, { "epoch": 9.60280666583135, "grad_norm": 0.17329499125480652, "learning_rate": 6.207338916477399e-06, "loss": 0.4623, "num_input_tokens_seen": 93210560, "step": 76640 }, { "epoch": 9.603433153740133, "grad_norm": 0.14614544808864594, "learning_rate": 6.206808373323089e-06, "loss": 0.4667, "num_input_tokens_seen": 93217024, "step": 76645 }, { "epoch": 9.604059641648917, "grad_norm": 0.22410187125205994, "learning_rate": 6.206277815740369e-06, "loss": 0.4549, "num_input_tokens_seen": 93222880, "step": 76650 }, { "epoch": 9.604686129557699, "grad_norm": 0.1929868757724762, "learning_rate": 6.205747243735585e-06, "loss": 0.4627, "num_input_tokens_seen": 93229088, "step": 76655 }, { "epoch": 9.605312617466483, "grad_norm": 0.16603393852710724, "learning_rate": 6.2052166573150785e-06, "loss": 0.46, "num_input_tokens_seen": 93235488, "step": 76660 }, { "epoch": 9.605939105375267, "grad_norm": 0.19318126142024994, "learning_rate": 6.204686056485194e-06, "loss": 0.4653, "num_input_tokens_seen": 93241568, "step": 76665 }, { "epoch": 9.60656559328405, "grad_norm": 0.1596902757883072, "learning_rate": 6.2041554412522755e-06, "loss": 0.4597, "num_input_tokens_seen": 93247776, "step": 76670 }, { "epoch": 9.607192081192833, "grad_norm": 0.1837599128484726, "learning_rate": 6.203624811622666e-06, "loss": 0.4649, "num_input_tokens_seen": 93254048, "step": 76675 }, { "epoch": 9.607818569101616, "grad_norm": 0.16274023056030273, "learning_rate": 6.203094167602709e-06, "loss": 0.4623, "num_input_tokens_seen": 93259968, "step": 76680 }, { "epoch": 9.6084450570104, "grad_norm": 0.15729917585849762, "learning_rate": 6.2025635091987506e-06, "loss": 0.4645, "num_input_tokens_seen": 93266272, "step": 76685 }, { "epoch": 9.609071544919184, "grad_norm": 0.11711986362934113, "learning_rate": 6.202032836417133e-06, "loss": 0.4601, "num_input_tokens_seen": 93272512, "step": 76690 }, { "epoch": 9.609698032827966, "grad_norm": 0.14261916279792786, "learning_rate": 6.201502149264204e-06, "loss": 0.4716, "num_input_tokens_seen": 93278784, "step": 76695 }, { "epoch": 9.61032452073675, "grad_norm": 0.1100659966468811, "learning_rate": 6.2009714477463045e-06, "loss": 0.4596, "num_input_tokens_seen": 93284992, "step": 76700 }, { "epoch": 9.610951008645532, "grad_norm": 0.1062643975019455, "learning_rate": 6.200440731869782e-06, "loss": 0.4657, "num_input_tokens_seen": 93291200, "step": 76705 }, { "epoch": 9.611577496554316, "grad_norm": 0.14493033289909363, "learning_rate": 6.199910001640983e-06, "loss": 0.4526, "num_input_tokens_seen": 93297536, "step": 76710 }, { "epoch": 9.6122039844631, "grad_norm": 0.13531962037086487, "learning_rate": 6.199379257066251e-06, "loss": 0.4588, "num_input_tokens_seen": 93303584, "step": 76715 }, { "epoch": 9.612830472371883, "grad_norm": 0.14893874526023865, "learning_rate": 6.19884849815193e-06, "loss": 0.4588, "num_input_tokens_seen": 93309536, "step": 76720 }, { "epoch": 9.613456960280667, "grad_norm": 0.10001471638679504, "learning_rate": 6.198317724904369e-06, "loss": 0.4645, "num_input_tokens_seen": 93315488, "step": 76725 }, { "epoch": 9.61408344818945, "grad_norm": 0.14389744400978088, "learning_rate": 6.197786937329911e-06, "loss": 0.4558, "num_input_tokens_seen": 93321600, "step": 76730 }, { "epoch": 9.614709936098233, "grad_norm": 0.13843092322349548, "learning_rate": 6.197256135434904e-06, "loss": 0.4606, "num_input_tokens_seen": 93327584, "step": 76735 }, { "epoch": 9.615336424007017, "grad_norm": 0.14292731881141663, "learning_rate": 6.196725319225692e-06, "loss": 0.4511, "num_input_tokens_seen": 93333696, "step": 76740 }, { "epoch": 9.6159629119158, "grad_norm": 0.14457455277442932, "learning_rate": 6.196194488708623e-06, "loss": 0.4661, "num_input_tokens_seen": 93339744, "step": 76745 }, { "epoch": 9.616589399824583, "grad_norm": 0.2059345692396164, "learning_rate": 6.195663643890044e-06, "loss": 0.4679, "num_input_tokens_seen": 93345824, "step": 76750 }, { "epoch": 9.617215887733368, "grad_norm": 0.13076144456863403, "learning_rate": 6.195132784776301e-06, "loss": 0.4638, "num_input_tokens_seen": 93352288, "step": 76755 }, { "epoch": 9.61784237564215, "grad_norm": 0.1211867481470108, "learning_rate": 6.194601911373739e-06, "loss": 0.4661, "num_input_tokens_seen": 93358528, "step": 76760 }, { "epoch": 9.618468863550934, "grad_norm": 0.11580207943916321, "learning_rate": 6.194071023688708e-06, "loss": 0.4664, "num_input_tokens_seen": 93364640, "step": 76765 }, { "epoch": 9.619095351459716, "grad_norm": 0.11563335359096527, "learning_rate": 6.193540121727555e-06, "loss": 0.4492, "num_input_tokens_seen": 93370464, "step": 76770 }, { "epoch": 9.6197218393685, "grad_norm": 0.11187945306301117, "learning_rate": 6.193009205496625e-06, "loss": 0.4553, "num_input_tokens_seen": 93376512, "step": 76775 }, { "epoch": 9.620348327277284, "grad_norm": 0.11072564870119095, "learning_rate": 6.192478275002267e-06, "loss": 0.4646, "num_input_tokens_seen": 93382592, "step": 76780 }, { "epoch": 9.620974815186067, "grad_norm": 0.21314749121665955, "learning_rate": 6.19194733025083e-06, "loss": 0.4534, "num_input_tokens_seen": 93388928, "step": 76785 }, { "epoch": 9.62160130309485, "grad_norm": 0.1164032369852066, "learning_rate": 6.191416371248658e-06, "loss": 0.4588, "num_input_tokens_seen": 93394976, "step": 76790 }, { "epoch": 9.622227791003633, "grad_norm": 0.11811785399913788, "learning_rate": 6.190885398002104e-06, "loss": 0.4607, "num_input_tokens_seen": 93400128, "step": 76795 }, { "epoch": 9.622854278912417, "grad_norm": 0.10519005358219147, "learning_rate": 6.190354410517511e-06, "loss": 0.4546, "num_input_tokens_seen": 93405952, "step": 76800 }, { "epoch": 9.623480766821201, "grad_norm": 0.12975695729255676, "learning_rate": 6.189823408801233e-06, "loss": 0.4588, "num_input_tokens_seen": 93411840, "step": 76805 }, { "epoch": 9.624107254729983, "grad_norm": 0.14167667925357819, "learning_rate": 6.189292392859615e-06, "loss": 0.4594, "num_input_tokens_seen": 93417312, "step": 76810 }, { "epoch": 9.624733742638767, "grad_norm": 0.1354750096797943, "learning_rate": 6.188761362699007e-06, "loss": 0.4623, "num_input_tokens_seen": 93423360, "step": 76815 }, { "epoch": 9.62536023054755, "grad_norm": 0.1342894434928894, "learning_rate": 6.1882303183257566e-06, "loss": 0.4545, "num_input_tokens_seen": 93428768, "step": 76820 }, { "epoch": 9.625986718456334, "grad_norm": 0.13298653066158295, "learning_rate": 6.187699259746214e-06, "loss": 0.4719, "num_input_tokens_seen": 93435008, "step": 76825 }, { "epoch": 9.626613206365118, "grad_norm": 0.20968760550022125, "learning_rate": 6.187168186966727e-06, "loss": 0.4714, "num_input_tokens_seen": 93441312, "step": 76830 }, { "epoch": 9.6272396942739, "grad_norm": 0.10743410140275955, "learning_rate": 6.186637099993649e-06, "loss": 0.4673, "num_input_tokens_seen": 93447392, "step": 76835 }, { "epoch": 9.627866182182684, "grad_norm": 0.11163985729217529, "learning_rate": 6.186105998833325e-06, "loss": 0.4659, "num_input_tokens_seen": 93453248, "step": 76840 }, { "epoch": 9.628492670091468, "grad_norm": 0.17843040823936462, "learning_rate": 6.185574883492107e-06, "loss": 0.461, "num_input_tokens_seen": 93459264, "step": 76845 }, { "epoch": 9.62911915800025, "grad_norm": 0.11230183392763138, "learning_rate": 6.185043753976343e-06, "loss": 0.4612, "num_input_tokens_seen": 93465664, "step": 76850 }, { "epoch": 9.629745645909034, "grad_norm": 0.12471732497215271, "learning_rate": 6.184512610292386e-06, "loss": 0.4532, "num_input_tokens_seen": 93471648, "step": 76855 }, { "epoch": 9.630372133817817, "grad_norm": 0.11942560225725174, "learning_rate": 6.183981452446585e-06, "loss": 0.455, "num_input_tokens_seen": 93477792, "step": 76860 }, { "epoch": 9.6309986217266, "grad_norm": 0.10647744685411453, "learning_rate": 6.18345028044529e-06, "loss": 0.4628, "num_input_tokens_seen": 93483808, "step": 76865 }, { "epoch": 9.631625109635383, "grad_norm": 0.18193773925304413, "learning_rate": 6.182919094294853e-06, "loss": 0.4621, "num_input_tokens_seen": 93490080, "step": 76870 }, { "epoch": 9.632251597544167, "grad_norm": 0.11591176688671112, "learning_rate": 6.182387894001623e-06, "loss": 0.4692, "num_input_tokens_seen": 93496160, "step": 76875 }, { "epoch": 9.632878085452951, "grad_norm": 0.15158383548259735, "learning_rate": 6.181856679571951e-06, "loss": 0.4582, "num_input_tokens_seen": 93501920, "step": 76880 }, { "epoch": 9.633504573361733, "grad_norm": 0.1373843550682068, "learning_rate": 6.181325451012188e-06, "loss": 0.4668, "num_input_tokens_seen": 93508192, "step": 76885 }, { "epoch": 9.634131061270518, "grad_norm": 0.12115759402513504, "learning_rate": 6.180794208328686e-06, "loss": 0.4541, "num_input_tokens_seen": 93514368, "step": 76890 }, { "epoch": 9.634757549179302, "grad_norm": 0.17515040934085846, "learning_rate": 6.180262951527798e-06, "loss": 0.4512, "num_input_tokens_seen": 93520608, "step": 76895 }, { "epoch": 9.635384037088084, "grad_norm": 0.1121867373585701, "learning_rate": 6.1797316806158735e-06, "loss": 0.4574, "num_input_tokens_seen": 93526464, "step": 76900 }, { "epoch": 9.636010524996868, "grad_norm": 0.0904197245836258, "learning_rate": 6.179200395599264e-06, "loss": 0.4604, "num_input_tokens_seen": 93532064, "step": 76905 }, { "epoch": 9.63663701290565, "grad_norm": 0.1618192344903946, "learning_rate": 6.1786690964843245e-06, "loss": 0.4525, "num_input_tokens_seen": 93538144, "step": 76910 }, { "epoch": 9.637263500814434, "grad_norm": 0.1338508278131485, "learning_rate": 6.178137783277404e-06, "loss": 0.4603, "num_input_tokens_seen": 93544608, "step": 76915 }, { "epoch": 9.637889988723218, "grad_norm": 0.20820897817611694, "learning_rate": 6.177606455984855e-06, "loss": 0.4653, "num_input_tokens_seen": 93550688, "step": 76920 }, { "epoch": 9.638516476632, "grad_norm": 0.12582442164421082, "learning_rate": 6.1770751146130315e-06, "loss": 0.464, "num_input_tokens_seen": 93556928, "step": 76925 }, { "epoch": 9.639142964540785, "grad_norm": 0.12600132822990417, "learning_rate": 6.176543759168285e-06, "loss": 0.4629, "num_input_tokens_seen": 93563296, "step": 76930 }, { "epoch": 9.639769452449567, "grad_norm": 0.1507086455821991, "learning_rate": 6.17601238965697e-06, "loss": 0.4561, "num_input_tokens_seen": 93569408, "step": 76935 }, { "epoch": 9.640395940358351, "grad_norm": 0.10125894099473953, "learning_rate": 6.175481006085436e-06, "loss": 0.4737, "num_input_tokens_seen": 93575488, "step": 76940 }, { "epoch": 9.641022428267135, "grad_norm": 0.15402528643608093, "learning_rate": 6.174949608460039e-06, "loss": 0.4608, "num_input_tokens_seen": 93581504, "step": 76945 }, { "epoch": 9.641648916175917, "grad_norm": 0.18408195674419403, "learning_rate": 6.174418196787132e-06, "loss": 0.4549, "num_input_tokens_seen": 93587488, "step": 76950 }, { "epoch": 9.642275404084701, "grad_norm": 0.1274735927581787, "learning_rate": 6.1738867710730675e-06, "loss": 0.4615, "num_input_tokens_seen": 93593312, "step": 76955 }, { "epoch": 9.642901891993485, "grad_norm": 0.20208650827407837, "learning_rate": 6.1733553313242e-06, "loss": 0.4588, "num_input_tokens_seen": 93599200, "step": 76960 }, { "epoch": 9.643528379902268, "grad_norm": 0.11612255871295929, "learning_rate": 6.172823877546884e-06, "loss": 0.4511, "num_input_tokens_seen": 93605376, "step": 76965 }, { "epoch": 9.644154867811052, "grad_norm": 0.12092047929763794, "learning_rate": 6.172292409747471e-06, "loss": 0.4706, "num_input_tokens_seen": 93611456, "step": 76970 }, { "epoch": 9.644781355719834, "grad_norm": 0.15154144167900085, "learning_rate": 6.1717609279323175e-06, "loss": 0.4496, "num_input_tokens_seen": 93617568, "step": 76975 }, { "epoch": 9.645407843628618, "grad_norm": 0.146216481924057, "learning_rate": 6.171229432107777e-06, "loss": 0.455, "num_input_tokens_seen": 93623872, "step": 76980 }, { "epoch": 9.6460343315374, "grad_norm": 0.13516643643379211, "learning_rate": 6.170697922280204e-06, "loss": 0.4638, "num_input_tokens_seen": 93629568, "step": 76985 }, { "epoch": 9.646660819446184, "grad_norm": 0.13709738850593567, "learning_rate": 6.170166398455954e-06, "loss": 0.4617, "num_input_tokens_seen": 93635904, "step": 76990 }, { "epoch": 9.647287307354969, "grad_norm": 0.20875583589076996, "learning_rate": 6.169634860641379e-06, "loss": 0.4703, "num_input_tokens_seen": 93642528, "step": 76995 }, { "epoch": 9.64791379526375, "grad_norm": 0.12570978701114655, "learning_rate": 6.1691033088428356e-06, "loss": 0.4648, "num_input_tokens_seen": 93648416, "step": 77000 }, { "epoch": 9.648540283172535, "grad_norm": 0.15696123242378235, "learning_rate": 6.16857174306668e-06, "loss": 0.4662, "num_input_tokens_seen": 93654336, "step": 77005 }, { "epoch": 9.649166771081319, "grad_norm": 0.1519353985786438, "learning_rate": 6.1680401633192655e-06, "loss": 0.462, "num_input_tokens_seen": 93660288, "step": 77010 }, { "epoch": 9.649793258990101, "grad_norm": 0.12001623958349228, "learning_rate": 6.167508569606951e-06, "loss": 0.47, "num_input_tokens_seen": 93665824, "step": 77015 }, { "epoch": 9.650419746898885, "grad_norm": 0.1299581080675125, "learning_rate": 6.1669769619360885e-06, "loss": 0.4605, "num_input_tokens_seen": 93672064, "step": 77020 }, { "epoch": 9.651046234807668, "grad_norm": 0.1444762945175171, "learning_rate": 6.166445340313037e-06, "loss": 0.4704, "num_input_tokens_seen": 93677760, "step": 77025 }, { "epoch": 9.651672722716452, "grad_norm": 0.11708611249923706, "learning_rate": 6.16591370474415e-06, "loss": 0.4603, "num_input_tokens_seen": 93683936, "step": 77030 }, { "epoch": 9.652299210625236, "grad_norm": 0.1137516051530838, "learning_rate": 6.165382055235784e-06, "loss": 0.4616, "num_input_tokens_seen": 93689536, "step": 77035 }, { "epoch": 9.652925698534018, "grad_norm": 0.13813547790050507, "learning_rate": 6.164850391794296e-06, "loss": 0.4658, "num_input_tokens_seen": 93695840, "step": 77040 }, { "epoch": 9.653552186442802, "grad_norm": 0.1287231743335724, "learning_rate": 6.164318714426042e-06, "loss": 0.4706, "num_input_tokens_seen": 93701472, "step": 77045 }, { "epoch": 9.654178674351584, "grad_norm": 0.1288376748561859, "learning_rate": 6.163787023137379e-06, "loss": 0.4582, "num_input_tokens_seen": 93707264, "step": 77050 }, { "epoch": 9.654805162260368, "grad_norm": 0.16122882068157196, "learning_rate": 6.163255317934662e-06, "loss": 0.4623, "num_input_tokens_seen": 93713760, "step": 77055 }, { "epoch": 9.655431650169152, "grad_norm": 0.12362401187419891, "learning_rate": 6.162723598824252e-06, "loss": 0.4595, "num_input_tokens_seen": 93719520, "step": 77060 }, { "epoch": 9.656058138077935, "grad_norm": 0.13077843189239502, "learning_rate": 6.162191865812503e-06, "loss": 0.4654, "num_input_tokens_seen": 93725536, "step": 77065 }, { "epoch": 9.656684625986719, "grad_norm": 0.15953871607780457, "learning_rate": 6.161660118905772e-06, "loss": 0.4553, "num_input_tokens_seen": 93731936, "step": 77070 }, { "epoch": 9.657311113895501, "grad_norm": 0.21552692353725433, "learning_rate": 6.161128358110418e-06, "loss": 0.4622, "num_input_tokens_seen": 93737920, "step": 77075 }, { "epoch": 9.657937601804285, "grad_norm": 0.11333175748586655, "learning_rate": 6.160596583432801e-06, "loss": 0.4629, "num_input_tokens_seen": 93743648, "step": 77080 }, { "epoch": 9.65856408971307, "grad_norm": 0.12465269863605499, "learning_rate": 6.160064794879273e-06, "loss": 0.46, "num_input_tokens_seen": 93749568, "step": 77085 }, { "epoch": 9.659190577621851, "grad_norm": 0.10301917791366577, "learning_rate": 6.159532992456196e-06, "loss": 0.4751, "num_input_tokens_seen": 93755840, "step": 77090 }, { "epoch": 9.659817065530635, "grad_norm": 0.16739705204963684, "learning_rate": 6.159001176169929e-06, "loss": 0.461, "num_input_tokens_seen": 93761792, "step": 77095 }, { "epoch": 9.660443553439418, "grad_norm": 0.13471637666225433, "learning_rate": 6.158469346026826e-06, "loss": 0.4684, "num_input_tokens_seen": 93767840, "step": 77100 }, { "epoch": 9.661070041348202, "grad_norm": 0.12264735251665115, "learning_rate": 6.157937502033248e-06, "loss": 0.4585, "num_input_tokens_seen": 93774144, "step": 77105 }, { "epoch": 9.661696529256986, "grad_norm": 0.1367233395576477, "learning_rate": 6.157405644195556e-06, "loss": 0.4601, "num_input_tokens_seen": 93779456, "step": 77110 }, { "epoch": 9.662323017165768, "grad_norm": 0.11070896685123444, "learning_rate": 6.156873772520104e-06, "loss": 0.4632, "num_input_tokens_seen": 93785568, "step": 77115 }, { "epoch": 9.662949505074552, "grad_norm": 0.15278677642345428, "learning_rate": 6.156341887013254e-06, "loss": 0.4576, "num_input_tokens_seen": 93791776, "step": 77120 }, { "epoch": 9.663575992983336, "grad_norm": 0.1930852234363556, "learning_rate": 6.155809987681365e-06, "loss": 0.4639, "num_input_tokens_seen": 93797568, "step": 77125 }, { "epoch": 9.664202480892119, "grad_norm": 0.13848643004894257, "learning_rate": 6.155278074530795e-06, "loss": 0.4563, "num_input_tokens_seen": 93803680, "step": 77130 }, { "epoch": 9.664828968800903, "grad_norm": 0.13789963722229004, "learning_rate": 6.154746147567905e-06, "loss": 0.4684, "num_input_tokens_seen": 93809856, "step": 77135 }, { "epoch": 9.665455456709685, "grad_norm": 0.17570197582244873, "learning_rate": 6.154214206799054e-06, "loss": 0.4658, "num_input_tokens_seen": 93815712, "step": 77140 }, { "epoch": 9.666081944618469, "grad_norm": 0.18436014652252197, "learning_rate": 6.153682252230602e-06, "loss": 0.4591, "num_input_tokens_seen": 93822016, "step": 77145 }, { "epoch": 9.666708432527253, "grad_norm": 0.125051349401474, "learning_rate": 6.153150283868908e-06, "loss": 0.4632, "num_input_tokens_seen": 93828256, "step": 77150 }, { "epoch": 9.667334920436035, "grad_norm": 0.15081053972244263, "learning_rate": 6.152618301720334e-06, "loss": 0.4549, "num_input_tokens_seen": 93834304, "step": 77155 }, { "epoch": 9.66796140834482, "grad_norm": 0.1516576111316681, "learning_rate": 6.152086305791237e-06, "loss": 0.4632, "num_input_tokens_seen": 93839520, "step": 77160 }, { "epoch": 9.668587896253602, "grad_norm": 0.12925510108470917, "learning_rate": 6.151554296087981e-06, "loss": 0.4712, "num_input_tokens_seen": 93844736, "step": 77165 }, { "epoch": 9.669214384162386, "grad_norm": 0.12688513100147247, "learning_rate": 6.151022272616926e-06, "loss": 0.4585, "num_input_tokens_seen": 93851040, "step": 77170 }, { "epoch": 9.66984087207117, "grad_norm": 0.0964159145951271, "learning_rate": 6.150490235384431e-06, "loss": 0.4673, "num_input_tokens_seen": 93856864, "step": 77175 }, { "epoch": 9.670467359979952, "grad_norm": 0.15075106918811798, "learning_rate": 6.149958184396858e-06, "loss": 0.4642, "num_input_tokens_seen": 93863328, "step": 77180 }, { "epoch": 9.671093847888736, "grad_norm": 0.13365884125232697, "learning_rate": 6.149426119660568e-06, "loss": 0.466, "num_input_tokens_seen": 93869536, "step": 77185 }, { "epoch": 9.671720335797518, "grad_norm": 0.16949212551116943, "learning_rate": 6.148894041181923e-06, "loss": 0.4632, "num_input_tokens_seen": 93875328, "step": 77190 }, { "epoch": 9.672346823706302, "grad_norm": 0.15689009428024292, "learning_rate": 6.148361948967283e-06, "loss": 0.4616, "num_input_tokens_seen": 93881376, "step": 77195 }, { "epoch": 9.672973311615086, "grad_norm": 0.14797286689281464, "learning_rate": 6.147829843023011e-06, "loss": 0.4635, "num_input_tokens_seen": 93887392, "step": 77200 }, { "epoch": 9.673599799523869, "grad_norm": 0.15441669523715973, "learning_rate": 6.1472977233554674e-06, "loss": 0.4577, "num_input_tokens_seen": 93894048, "step": 77205 }, { "epoch": 9.674226287432653, "grad_norm": 0.15647833049297333, "learning_rate": 6.146765589971016e-06, "loss": 0.4621, "num_input_tokens_seen": 93900160, "step": 77210 }, { "epoch": 9.674852775341435, "grad_norm": 0.1354409158229828, "learning_rate": 6.146233442876017e-06, "loss": 0.4628, "num_input_tokens_seen": 93905760, "step": 77215 }, { "epoch": 9.67547926325022, "grad_norm": 0.16478420794010162, "learning_rate": 6.145701282076833e-06, "loss": 0.4541, "num_input_tokens_seen": 93912160, "step": 77220 }, { "epoch": 9.676105751159003, "grad_norm": 0.12002470344305038, "learning_rate": 6.145169107579828e-06, "loss": 0.4499, "num_input_tokens_seen": 93918304, "step": 77225 }, { "epoch": 9.676732239067785, "grad_norm": 0.218765527009964, "learning_rate": 6.144636919391363e-06, "loss": 0.4622, "num_input_tokens_seen": 93924416, "step": 77230 }, { "epoch": 9.67735872697657, "grad_norm": 0.1446213573217392, "learning_rate": 6.1441047175178025e-06, "loss": 0.4655, "num_input_tokens_seen": 93930016, "step": 77235 }, { "epoch": 9.677985214885354, "grad_norm": 0.19980809092521667, "learning_rate": 6.143572501965508e-06, "loss": 0.462, "num_input_tokens_seen": 93936320, "step": 77240 }, { "epoch": 9.678611702794136, "grad_norm": 0.14189477264881134, "learning_rate": 6.143040272740841e-06, "loss": 0.4488, "num_input_tokens_seen": 93942528, "step": 77245 }, { "epoch": 9.67923819070292, "grad_norm": 0.12842150032520294, "learning_rate": 6.142508029850168e-06, "loss": 0.4556, "num_input_tokens_seen": 93948640, "step": 77250 }, { "epoch": 9.679864678611702, "grad_norm": 0.1997302621603012, "learning_rate": 6.1419757732998496e-06, "loss": 0.4718, "num_input_tokens_seen": 93954944, "step": 77255 }, { "epoch": 9.680491166520486, "grad_norm": 0.16035182774066925, "learning_rate": 6.141443503096252e-06, "loss": 0.4577, "num_input_tokens_seen": 93961088, "step": 77260 }, { "epoch": 9.68111765442927, "grad_norm": 0.24616599082946777, "learning_rate": 6.140911219245736e-06, "loss": 0.4615, "num_input_tokens_seen": 93967008, "step": 77265 }, { "epoch": 9.681744142338053, "grad_norm": 0.1321120709180832, "learning_rate": 6.140378921754668e-06, "loss": 0.4607, "num_input_tokens_seen": 93973408, "step": 77270 }, { "epoch": 9.682370630246837, "grad_norm": 0.2462342083454132, "learning_rate": 6.139846610629412e-06, "loss": 0.4632, "num_input_tokens_seen": 93979616, "step": 77275 }, { "epoch": 9.682997118155619, "grad_norm": 0.19832000136375427, "learning_rate": 6.139314285876331e-06, "loss": 0.4727, "num_input_tokens_seen": 93985440, "step": 77280 }, { "epoch": 9.683623606064403, "grad_norm": 0.13162459433078766, "learning_rate": 6.138781947501791e-06, "loss": 0.4683, "num_input_tokens_seen": 93991616, "step": 77285 }, { "epoch": 9.684250093973187, "grad_norm": 0.1238030195236206, "learning_rate": 6.138249595512154e-06, "loss": 0.4652, "num_input_tokens_seen": 93998048, "step": 77290 }, { "epoch": 9.68487658188197, "grad_norm": 0.11723524332046509, "learning_rate": 6.137717229913787e-06, "loss": 0.4644, "num_input_tokens_seen": 94004352, "step": 77295 }, { "epoch": 9.685503069790753, "grad_norm": 0.1250762939453125, "learning_rate": 6.137184850713053e-06, "loss": 0.4546, "num_input_tokens_seen": 94010528, "step": 77300 }, { "epoch": 9.686129557699536, "grad_norm": 0.11082473397254944, "learning_rate": 6.1366524579163176e-06, "loss": 0.4646, "num_input_tokens_seen": 94016608, "step": 77305 }, { "epoch": 9.68675604560832, "grad_norm": 0.1147356927394867, "learning_rate": 6.136120051529947e-06, "loss": 0.4617, "num_input_tokens_seen": 94022848, "step": 77310 }, { "epoch": 9.687382533517104, "grad_norm": 0.11774267256259918, "learning_rate": 6.135587631560305e-06, "loss": 0.472, "num_input_tokens_seen": 94028896, "step": 77315 }, { "epoch": 9.688009021425886, "grad_norm": 0.1178210973739624, "learning_rate": 6.1350551980137595e-06, "loss": 0.4626, "num_input_tokens_seen": 94034912, "step": 77320 }, { "epoch": 9.68863550933467, "grad_norm": 0.08518047630786896, "learning_rate": 6.134522750896674e-06, "loss": 0.4666, "num_input_tokens_seen": 94041056, "step": 77325 }, { "epoch": 9.689261997243452, "grad_norm": 0.11949627101421356, "learning_rate": 6.133990290215416e-06, "loss": 0.4599, "num_input_tokens_seen": 94046752, "step": 77330 }, { "epoch": 9.689888485152236, "grad_norm": 0.1301051676273346, "learning_rate": 6.133457815976352e-06, "loss": 0.4636, "num_input_tokens_seen": 94052992, "step": 77335 }, { "epoch": 9.69051497306102, "grad_norm": 0.14262571930885315, "learning_rate": 6.132925328185844e-06, "loss": 0.4641, "num_input_tokens_seen": 94059200, "step": 77340 }, { "epoch": 9.691141460969803, "grad_norm": 0.15255679190158844, "learning_rate": 6.132392826850263e-06, "loss": 0.4634, "num_input_tokens_seen": 94065408, "step": 77345 }, { "epoch": 9.691767948878587, "grad_norm": 0.10683421045541763, "learning_rate": 6.131860311975974e-06, "loss": 0.4564, "num_input_tokens_seen": 94071584, "step": 77350 }, { "epoch": 9.692394436787371, "grad_norm": 0.11509574949741364, "learning_rate": 6.131327783569342e-06, "loss": 0.4584, "num_input_tokens_seen": 94077856, "step": 77355 }, { "epoch": 9.693020924696153, "grad_norm": 0.14944933354854584, "learning_rate": 6.130795241636736e-06, "loss": 0.4565, "num_input_tokens_seen": 94083744, "step": 77360 }, { "epoch": 9.693647412604937, "grad_norm": 0.15303923189640045, "learning_rate": 6.13026268618452e-06, "loss": 0.4824, "num_input_tokens_seen": 94090080, "step": 77365 }, { "epoch": 9.69427390051372, "grad_norm": 0.11343825608491898, "learning_rate": 6.129730117219066e-06, "loss": 0.4679, "num_input_tokens_seen": 94096192, "step": 77370 }, { "epoch": 9.694900388422504, "grad_norm": 0.13634875416755676, "learning_rate": 6.129197534746737e-06, "loss": 0.4631, "num_input_tokens_seen": 94102336, "step": 77375 }, { "epoch": 9.695526876331288, "grad_norm": 0.10311149805784225, "learning_rate": 6.128664938773903e-06, "loss": 0.4662, "num_input_tokens_seen": 94108448, "step": 77380 }, { "epoch": 9.69615336424007, "grad_norm": 0.10385234653949738, "learning_rate": 6.12813232930693e-06, "loss": 0.4627, "num_input_tokens_seen": 94114176, "step": 77385 }, { "epoch": 9.696779852148854, "grad_norm": 0.12570956349372864, "learning_rate": 6.127599706352188e-06, "loss": 0.468, "num_input_tokens_seen": 94119456, "step": 77390 }, { "epoch": 9.697406340057636, "grad_norm": 0.09841717779636383, "learning_rate": 6.127067069916042e-06, "loss": 0.458, "num_input_tokens_seen": 94125728, "step": 77395 }, { "epoch": 9.69803282796642, "grad_norm": 0.10405072569847107, "learning_rate": 6.126534420004862e-06, "loss": 0.459, "num_input_tokens_seen": 94132224, "step": 77400 }, { "epoch": 9.698659315875204, "grad_norm": 0.07513735443353653, "learning_rate": 6.126001756625017e-06, "loss": 0.465, "num_input_tokens_seen": 94137696, "step": 77405 }, { "epoch": 9.699285803783987, "grad_norm": 0.11984340101480484, "learning_rate": 6.125469079782873e-06, "loss": 0.4679, "num_input_tokens_seen": 94143744, "step": 77410 }, { "epoch": 9.69991229169277, "grad_norm": 0.10548479110002518, "learning_rate": 6.1249363894847995e-06, "loss": 0.456, "num_input_tokens_seen": 94149888, "step": 77415 }, { "epoch": 9.700538779601553, "grad_norm": 0.09283994883298874, "learning_rate": 6.124403685737166e-06, "loss": 0.468, "num_input_tokens_seen": 94156000, "step": 77420 }, { "epoch": 9.701165267510337, "grad_norm": 0.1144888699054718, "learning_rate": 6.123870968546342e-06, "loss": 0.4617, "num_input_tokens_seen": 94161984, "step": 77425 }, { "epoch": 9.701791755419121, "grad_norm": 0.10148265212774277, "learning_rate": 6.1233382379186945e-06, "loss": 0.4628, "num_input_tokens_seen": 94167360, "step": 77430 }, { "epoch": 9.702418243327903, "grad_norm": 0.1042366623878479, "learning_rate": 6.122805493860595e-06, "loss": 0.4609, "num_input_tokens_seen": 94173408, "step": 77435 }, { "epoch": 9.703044731236687, "grad_norm": 0.10371967405080795, "learning_rate": 6.12227273637841e-06, "loss": 0.4644, "num_input_tokens_seen": 94179520, "step": 77440 }, { "epoch": 9.70367121914547, "grad_norm": 0.12547515332698822, "learning_rate": 6.121739965478514e-06, "loss": 0.4675, "num_input_tokens_seen": 94185504, "step": 77445 }, { "epoch": 9.704297707054254, "grad_norm": 0.12149599194526672, "learning_rate": 6.121207181167272e-06, "loss": 0.4649, "num_input_tokens_seen": 94191680, "step": 77450 }, { "epoch": 9.704924194963038, "grad_norm": 0.07147157192230225, "learning_rate": 6.120674383451055e-06, "loss": 0.4665, "num_input_tokens_seen": 94197472, "step": 77455 }, { "epoch": 9.70555068287182, "grad_norm": 0.09685172885656357, "learning_rate": 6.120141572336234e-06, "loss": 0.4658, "num_input_tokens_seen": 94203680, "step": 77460 }, { "epoch": 9.706177170780604, "grad_norm": 0.10953362286090851, "learning_rate": 6.119608747829179e-06, "loss": 0.4606, "num_input_tokens_seen": 94209856, "step": 77465 }, { "epoch": 9.706803658689388, "grad_norm": 0.09478583186864853, "learning_rate": 6.119075909936259e-06, "loss": 0.4668, "num_input_tokens_seen": 94216192, "step": 77470 }, { "epoch": 9.70743014659817, "grad_norm": 0.10918334871530533, "learning_rate": 6.118543058663846e-06, "loss": 0.4654, "num_input_tokens_seen": 94222176, "step": 77475 }, { "epoch": 9.708056634506955, "grad_norm": 0.11954096704721451, "learning_rate": 6.118010194018311e-06, "loss": 0.4658, "num_input_tokens_seen": 94228032, "step": 77480 }, { "epoch": 9.708683122415737, "grad_norm": 0.115956611931324, "learning_rate": 6.117477316006024e-06, "loss": 0.4579, "num_input_tokens_seen": 94234272, "step": 77485 }, { "epoch": 9.709309610324521, "grad_norm": 0.13245730102062225, "learning_rate": 6.116944424633356e-06, "loss": 0.4556, "num_input_tokens_seen": 94240736, "step": 77490 }, { "epoch": 9.709936098233303, "grad_norm": 0.11084242165088654, "learning_rate": 6.116411519906679e-06, "loss": 0.4619, "num_input_tokens_seen": 94246752, "step": 77495 }, { "epoch": 9.710562586142087, "grad_norm": 0.11883492767810822, "learning_rate": 6.1158786018323614e-06, "loss": 0.4593, "num_input_tokens_seen": 94253120, "step": 77500 }, { "epoch": 9.711189074050871, "grad_norm": 0.12448209524154663, "learning_rate": 6.115345670416779e-06, "loss": 0.463, "num_input_tokens_seen": 94259360, "step": 77505 }, { "epoch": 9.711815561959654, "grad_norm": 0.1494731605052948, "learning_rate": 6.114812725666301e-06, "loss": 0.4609, "num_input_tokens_seen": 94265472, "step": 77510 }, { "epoch": 9.712442049868438, "grad_norm": 0.06976134330034256, "learning_rate": 6.1142797675873e-06, "loss": 0.4643, "num_input_tokens_seen": 94271552, "step": 77515 }, { "epoch": 9.713068537777222, "grad_norm": 0.11085943132638931, "learning_rate": 6.113746796186146e-06, "loss": 0.4668, "num_input_tokens_seen": 94277728, "step": 77520 }, { "epoch": 9.713695025686004, "grad_norm": 0.08763302117586136, "learning_rate": 6.113213811469214e-06, "loss": 0.4582, "num_input_tokens_seen": 94284192, "step": 77525 }, { "epoch": 9.714321513594788, "grad_norm": 0.1387486457824707, "learning_rate": 6.112680813442875e-06, "loss": 0.4539, "num_input_tokens_seen": 94290016, "step": 77530 }, { "epoch": 9.71494800150357, "grad_norm": 0.09690427035093307, "learning_rate": 6.112147802113501e-06, "loss": 0.4654, "num_input_tokens_seen": 94296192, "step": 77535 }, { "epoch": 9.715574489412354, "grad_norm": 0.09490364044904709, "learning_rate": 6.111614777487465e-06, "loss": 0.4671, "num_input_tokens_seen": 94302176, "step": 77540 }, { "epoch": 9.716200977321138, "grad_norm": 0.12656168639659882, "learning_rate": 6.11108173957114e-06, "loss": 0.4601, "num_input_tokens_seen": 94308224, "step": 77545 }, { "epoch": 9.71682746522992, "grad_norm": 0.14485082030296326, "learning_rate": 6.1105486883708986e-06, "loss": 0.4677, "num_input_tokens_seen": 94314368, "step": 77550 }, { "epoch": 9.717453953138705, "grad_norm": 0.09284955263137817, "learning_rate": 6.110015623893113e-06, "loss": 0.4632, "num_input_tokens_seen": 94320288, "step": 77555 }, { "epoch": 9.718080441047487, "grad_norm": 0.11921948939561844, "learning_rate": 6.109482546144159e-06, "loss": 0.46, "num_input_tokens_seen": 94326304, "step": 77560 }, { "epoch": 9.718706928956271, "grad_norm": 0.10284492373466492, "learning_rate": 6.108949455130406e-06, "loss": 0.4673, "num_input_tokens_seen": 94332928, "step": 77565 }, { "epoch": 9.719333416865055, "grad_norm": 0.09889326989650726, "learning_rate": 6.108416350858232e-06, "loss": 0.4655, "num_input_tokens_seen": 94338496, "step": 77570 }, { "epoch": 9.719959904773837, "grad_norm": 0.10207612812519073, "learning_rate": 6.107883233334006e-06, "loss": 0.4597, "num_input_tokens_seen": 94344512, "step": 77575 }, { "epoch": 9.720586392682621, "grad_norm": 0.15419909358024597, "learning_rate": 6.107350102564108e-06, "loss": 0.4582, "num_input_tokens_seen": 94350688, "step": 77580 }, { "epoch": 9.721212880591404, "grad_norm": 0.136562317609787, "learning_rate": 6.106816958554906e-06, "loss": 0.4614, "num_input_tokens_seen": 94356608, "step": 77585 }, { "epoch": 9.721839368500188, "grad_norm": 0.16228726506233215, "learning_rate": 6.106283801312779e-06, "loss": 0.4658, "num_input_tokens_seen": 94362720, "step": 77590 }, { "epoch": 9.722465856408972, "grad_norm": 0.10928992927074432, "learning_rate": 6.105750630844097e-06, "loss": 0.4682, "num_input_tokens_seen": 94368320, "step": 77595 }, { "epoch": 9.723092344317754, "grad_norm": 0.11029927432537079, "learning_rate": 6.105217447155238e-06, "loss": 0.4605, "num_input_tokens_seen": 94374560, "step": 77600 }, { "epoch": 9.723718832226538, "grad_norm": 0.14541128277778625, "learning_rate": 6.104684250252575e-06, "loss": 0.4613, "num_input_tokens_seen": 94380800, "step": 77605 }, { "epoch": 9.72434532013532, "grad_norm": 0.10150211304426193, "learning_rate": 6.104151040142482e-06, "loss": 0.4642, "num_input_tokens_seen": 94386112, "step": 77610 }, { "epoch": 9.724971808044105, "grad_norm": 0.11270258575677872, "learning_rate": 6.103617816831336e-06, "loss": 0.464, "num_input_tokens_seen": 94392320, "step": 77615 }, { "epoch": 9.725598295952889, "grad_norm": 0.11289580911397934, "learning_rate": 6.10308458032551e-06, "loss": 0.4626, "num_input_tokens_seen": 94398080, "step": 77620 }, { "epoch": 9.72622478386167, "grad_norm": 0.14130918681621552, "learning_rate": 6.102551330631381e-06, "loss": 0.4609, "num_input_tokens_seen": 94404416, "step": 77625 }, { "epoch": 9.726851271770455, "grad_norm": 0.10891750454902649, "learning_rate": 6.102018067755323e-06, "loss": 0.4616, "num_input_tokens_seen": 94410656, "step": 77630 }, { "epoch": 9.727477759679239, "grad_norm": 0.09475494921207428, "learning_rate": 6.101484791703714e-06, "loss": 0.461, "num_input_tokens_seen": 94416640, "step": 77635 }, { "epoch": 9.728104247588021, "grad_norm": 0.09936165064573288, "learning_rate": 6.100951502482927e-06, "loss": 0.4578, "num_input_tokens_seen": 94422944, "step": 77640 }, { "epoch": 9.728730735496805, "grad_norm": 0.09778948873281479, "learning_rate": 6.100418200099341e-06, "loss": 0.4589, "num_input_tokens_seen": 94429312, "step": 77645 }, { "epoch": 9.729357223405588, "grad_norm": 0.10460133850574493, "learning_rate": 6.0998848845593285e-06, "loss": 0.4626, "num_input_tokens_seen": 94435360, "step": 77650 }, { "epoch": 9.729983711314372, "grad_norm": 0.14611276984214783, "learning_rate": 6.0993515558692685e-06, "loss": 0.4643, "num_input_tokens_seen": 94441568, "step": 77655 }, { "epoch": 9.730610199223156, "grad_norm": 0.09866942465305328, "learning_rate": 6.098818214035536e-06, "loss": 0.4586, "num_input_tokens_seen": 94447776, "step": 77660 }, { "epoch": 9.731236687131938, "grad_norm": 0.13704797625541687, "learning_rate": 6.0982848590645074e-06, "loss": 0.4574, "num_input_tokens_seen": 94453760, "step": 77665 }, { "epoch": 9.731863175040722, "grad_norm": 0.12544187903404236, "learning_rate": 6.09775149096256e-06, "loss": 0.4611, "num_input_tokens_seen": 94459872, "step": 77670 }, { "epoch": 9.732489662949504, "grad_norm": 0.09749718010425568, "learning_rate": 6.09721810973607e-06, "loss": 0.4622, "num_input_tokens_seen": 94465824, "step": 77675 }, { "epoch": 9.733116150858288, "grad_norm": 0.11501122266054153, "learning_rate": 6.096684715391417e-06, "loss": 0.4558, "num_input_tokens_seen": 94471936, "step": 77680 }, { "epoch": 9.733742638767072, "grad_norm": 0.12740245461463928, "learning_rate": 6.096151307934975e-06, "loss": 0.456, "num_input_tokens_seen": 94477952, "step": 77685 }, { "epoch": 9.734369126675855, "grad_norm": 0.13302868604660034, "learning_rate": 6.095617887373122e-06, "loss": 0.4612, "num_input_tokens_seen": 94483968, "step": 77690 }, { "epoch": 9.734995614584639, "grad_norm": 0.10720886290073395, "learning_rate": 6.095084453712237e-06, "loss": 0.4573, "num_input_tokens_seen": 94490112, "step": 77695 }, { "epoch": 9.735622102493421, "grad_norm": 0.10768236219882965, "learning_rate": 6.094551006958695e-06, "loss": 0.4568, "num_input_tokens_seen": 94495744, "step": 77700 }, { "epoch": 9.736248590402205, "grad_norm": 0.15270227193832397, "learning_rate": 6.0940175471188775e-06, "loss": 0.4641, "num_input_tokens_seen": 94501568, "step": 77705 }, { "epoch": 9.73687507831099, "grad_norm": 0.12295186519622803, "learning_rate": 6.093484074199161e-06, "loss": 0.4597, "num_input_tokens_seen": 94507392, "step": 77710 }, { "epoch": 9.737501566219771, "grad_norm": 0.11273051798343658, "learning_rate": 6.092950588205921e-06, "loss": 0.4576, "num_input_tokens_seen": 94513408, "step": 77715 }, { "epoch": 9.738128054128556, "grad_norm": 0.12955981492996216, "learning_rate": 6.092417089145539e-06, "loss": 0.4613, "num_input_tokens_seen": 94519744, "step": 77720 }, { "epoch": 9.738754542037338, "grad_norm": 0.11740963906049728, "learning_rate": 6.091883577024391e-06, "loss": 0.4646, "num_input_tokens_seen": 94525536, "step": 77725 }, { "epoch": 9.739381029946122, "grad_norm": 0.1964017003774643, "learning_rate": 6.0913500518488575e-06, "loss": 0.4565, "num_input_tokens_seen": 94531008, "step": 77730 }, { "epoch": 9.740007517854906, "grad_norm": 0.12931010127067566, "learning_rate": 6.090816513625317e-06, "loss": 0.4585, "num_input_tokens_seen": 94537152, "step": 77735 }, { "epoch": 9.740634005763688, "grad_norm": 0.17872858047485352, "learning_rate": 6.090282962360148e-06, "loss": 0.4583, "num_input_tokens_seen": 94543456, "step": 77740 }, { "epoch": 9.741260493672472, "grad_norm": 0.12849390506744385, "learning_rate": 6.089749398059729e-06, "loss": 0.4673, "num_input_tokens_seen": 94549664, "step": 77745 }, { "epoch": 9.741886981581256, "grad_norm": 0.1601228415966034, "learning_rate": 6.089215820730438e-06, "loss": 0.4622, "num_input_tokens_seen": 94555712, "step": 77750 }, { "epoch": 9.742513469490039, "grad_norm": 0.14990194141864777, "learning_rate": 6.088682230378658e-06, "loss": 0.4582, "num_input_tokens_seen": 94561792, "step": 77755 }, { "epoch": 9.743139957398823, "grad_norm": 0.17214573919773102, "learning_rate": 6.088148627010768e-06, "loss": 0.4617, "num_input_tokens_seen": 94567744, "step": 77760 }, { "epoch": 9.743766445307605, "grad_norm": 0.18257887661457062, "learning_rate": 6.087615010633145e-06, "loss": 0.4619, "num_input_tokens_seen": 94573728, "step": 77765 }, { "epoch": 9.744392933216389, "grad_norm": 0.11518286168575287, "learning_rate": 6.08708138125217e-06, "loss": 0.4687, "num_input_tokens_seen": 94579424, "step": 77770 }, { "epoch": 9.745019421125173, "grad_norm": 0.15448875725269318, "learning_rate": 6.086547738874223e-06, "loss": 0.4627, "num_input_tokens_seen": 94585408, "step": 77775 }, { "epoch": 9.745645909033955, "grad_norm": 0.12778830528259277, "learning_rate": 6.086014083505684e-06, "loss": 0.4565, "num_input_tokens_seen": 94591392, "step": 77780 }, { "epoch": 9.74627239694274, "grad_norm": 0.15558655560016632, "learning_rate": 6.0854804151529346e-06, "loss": 0.4599, "num_input_tokens_seen": 94597312, "step": 77785 }, { "epoch": 9.746898884851522, "grad_norm": 0.15773649513721466, "learning_rate": 6.084946733822353e-06, "loss": 0.4552, "num_input_tokens_seen": 94603360, "step": 77790 }, { "epoch": 9.747525372760306, "grad_norm": 0.10035018622875214, "learning_rate": 6.084413039520322e-06, "loss": 0.4547, "num_input_tokens_seen": 94609696, "step": 77795 }, { "epoch": 9.74815186066909, "grad_norm": 0.11340994387865067, "learning_rate": 6.083879332253222e-06, "loss": 0.4662, "num_input_tokens_seen": 94615808, "step": 77800 }, { "epoch": 9.748778348577872, "grad_norm": 0.12050928175449371, "learning_rate": 6.083345612027432e-06, "loss": 0.464, "num_input_tokens_seen": 94621920, "step": 77805 }, { "epoch": 9.749404836486656, "grad_norm": 0.17534135282039642, "learning_rate": 6.082811878849334e-06, "loss": 0.4728, "num_input_tokens_seen": 94627296, "step": 77810 }, { "epoch": 9.750031324395438, "grad_norm": 0.14966490864753723, "learning_rate": 6.08227813272531e-06, "loss": 0.4746, "num_input_tokens_seen": 94633440, "step": 77815 }, { "epoch": 9.750657812304222, "grad_norm": 0.1338387280702591, "learning_rate": 6.0817443736617424e-06, "loss": 0.4606, "num_input_tokens_seen": 94639584, "step": 77820 }, { "epoch": 9.751284300213007, "grad_norm": 0.15861766040325165, "learning_rate": 6.08121060166501e-06, "loss": 0.4634, "num_input_tokens_seen": 94645664, "step": 77825 }, { "epoch": 9.751910788121789, "grad_norm": 0.11918535828590393, "learning_rate": 6.080676816741497e-06, "loss": 0.4633, "num_input_tokens_seen": 94651488, "step": 77830 }, { "epoch": 9.752537276030573, "grad_norm": 0.11586908996105194, "learning_rate": 6.080143018897581e-06, "loss": 0.4587, "num_input_tokens_seen": 94657376, "step": 77835 }, { "epoch": 9.753163763939355, "grad_norm": 0.10282772779464722, "learning_rate": 6.079609208139649e-06, "loss": 0.4668, "num_input_tokens_seen": 94663328, "step": 77840 }, { "epoch": 9.75379025184814, "grad_norm": 0.10617243498563766, "learning_rate": 6.079075384474082e-06, "loss": 0.4694, "num_input_tokens_seen": 94669536, "step": 77845 }, { "epoch": 9.754416739756923, "grad_norm": 0.12213342636823654, "learning_rate": 6.078541547907262e-06, "loss": 0.4619, "num_input_tokens_seen": 94675776, "step": 77850 }, { "epoch": 9.755043227665706, "grad_norm": 0.1301565319299698, "learning_rate": 6.07800769844557e-06, "loss": 0.4503, "num_input_tokens_seen": 94681248, "step": 77855 }, { "epoch": 9.75566971557449, "grad_norm": 0.15231285989284515, "learning_rate": 6.077473836095389e-06, "loss": 0.4618, "num_input_tokens_seen": 94687456, "step": 77860 }, { "epoch": 9.756296203483274, "grad_norm": 0.1614929735660553, "learning_rate": 6.076939960863104e-06, "loss": 0.4597, "num_input_tokens_seen": 94693728, "step": 77865 }, { "epoch": 9.756922691392056, "grad_norm": 0.12781161069869995, "learning_rate": 6.076406072755095e-06, "loss": 0.4666, "num_input_tokens_seen": 94699968, "step": 77870 }, { "epoch": 9.75754917930084, "grad_norm": 0.10657649487257004, "learning_rate": 6.075872171777746e-06, "loss": 0.4637, "num_input_tokens_seen": 94706144, "step": 77875 }, { "epoch": 9.758175667209622, "grad_norm": 0.11889125406742096, "learning_rate": 6.075338257937442e-06, "loss": 0.4708, "num_input_tokens_seen": 94711712, "step": 77880 }, { "epoch": 9.758802155118406, "grad_norm": 0.14894679188728333, "learning_rate": 6.074804331240563e-06, "loss": 0.4643, "num_input_tokens_seen": 94718080, "step": 77885 }, { "epoch": 9.75942864302719, "grad_norm": 0.132696270942688, "learning_rate": 6.074270391693497e-06, "loss": 0.4615, "num_input_tokens_seen": 94724288, "step": 77890 }, { "epoch": 9.760055130935973, "grad_norm": 0.13447435200214386, "learning_rate": 6.073736439302623e-06, "loss": 0.4525, "num_input_tokens_seen": 94730592, "step": 77895 }, { "epoch": 9.760681618844757, "grad_norm": 0.11454454809427261, "learning_rate": 6.073202474074329e-06, "loss": 0.4553, "num_input_tokens_seen": 94735840, "step": 77900 }, { "epoch": 9.761308106753539, "grad_norm": 0.10892031341791153, "learning_rate": 6.072668496014997e-06, "loss": 0.4568, "num_input_tokens_seen": 94742016, "step": 77905 }, { "epoch": 9.761934594662323, "grad_norm": 0.11857972294092178, "learning_rate": 6.072134505131011e-06, "loss": 0.4716, "num_input_tokens_seen": 94748576, "step": 77910 }, { "epoch": 9.762561082571107, "grad_norm": 0.1306401491165161, "learning_rate": 6.071600501428756e-06, "loss": 0.4541, "num_input_tokens_seen": 94754976, "step": 77915 }, { "epoch": 9.76318757047989, "grad_norm": 0.11200946569442749, "learning_rate": 6.071066484914615e-06, "loss": 0.4782, "num_input_tokens_seen": 94761088, "step": 77920 }, { "epoch": 9.763814058388673, "grad_norm": 0.109694704413414, "learning_rate": 6.070532455594974e-06, "loss": 0.4709, "num_input_tokens_seen": 94767232, "step": 77925 }, { "epoch": 9.764440546297456, "grad_norm": 0.11931969970464706, "learning_rate": 6.069998413476217e-06, "loss": 0.4607, "num_input_tokens_seen": 94773184, "step": 77930 }, { "epoch": 9.76506703420624, "grad_norm": 0.12827715277671814, "learning_rate": 6.069464358564731e-06, "loss": 0.4514, "num_input_tokens_seen": 94778400, "step": 77935 }, { "epoch": 9.765693522115024, "grad_norm": 0.14992819726467133, "learning_rate": 6.068930290866897e-06, "loss": 0.4682, "num_input_tokens_seen": 94784032, "step": 77940 }, { "epoch": 9.766320010023806, "grad_norm": 0.08776956051588058, "learning_rate": 6.068396210389104e-06, "loss": 0.4568, "num_input_tokens_seen": 94789984, "step": 77945 }, { "epoch": 9.76694649793259, "grad_norm": 0.14320358633995056, "learning_rate": 6.067862117137736e-06, "loss": 0.4625, "num_input_tokens_seen": 94796416, "step": 77950 }, { "epoch": 9.767572985841372, "grad_norm": 0.1101551428437233, "learning_rate": 6.067328011119179e-06, "loss": 0.456, "num_input_tokens_seen": 94802528, "step": 77955 }, { "epoch": 9.768199473750157, "grad_norm": 0.15995913743972778, "learning_rate": 6.066793892339818e-06, "loss": 0.4559, "num_input_tokens_seen": 94808384, "step": 77960 }, { "epoch": 9.76882596165894, "grad_norm": 0.16036568582057953, "learning_rate": 6.0662597608060395e-06, "loss": 0.4664, "num_input_tokens_seen": 94814496, "step": 77965 }, { "epoch": 9.769452449567723, "grad_norm": 0.16402743756771088, "learning_rate": 6.065725616524229e-06, "loss": 0.4566, "num_input_tokens_seen": 94820256, "step": 77970 }, { "epoch": 9.770078937476507, "grad_norm": 0.12474657595157623, "learning_rate": 6.065191459500772e-06, "loss": 0.4628, "num_input_tokens_seen": 94826464, "step": 77975 }, { "epoch": 9.770705425385291, "grad_norm": 0.08395073562860489, "learning_rate": 6.064657289742056e-06, "loss": 0.4665, "num_input_tokens_seen": 94832000, "step": 77980 }, { "epoch": 9.771331913294073, "grad_norm": 0.09147702157497406, "learning_rate": 6.064123107254466e-06, "loss": 0.4576, "num_input_tokens_seen": 94838272, "step": 77985 }, { "epoch": 9.771958401202857, "grad_norm": 0.1177351325750351, "learning_rate": 6.06358891204439e-06, "loss": 0.4575, "num_input_tokens_seen": 94844704, "step": 77990 }, { "epoch": 9.77258488911164, "grad_norm": 0.14511381089687347, "learning_rate": 6.063054704118214e-06, "loss": 0.4704, "num_input_tokens_seen": 94850592, "step": 77995 }, { "epoch": 9.773211377020424, "grad_norm": 0.1132931113243103, "learning_rate": 6.062520483482327e-06, "loss": 0.4539, "num_input_tokens_seen": 94856960, "step": 78000 }, { "epoch": 9.773837864929208, "grad_norm": 0.10818622261285782, "learning_rate": 6.0619862501431135e-06, "loss": 0.4739, "num_input_tokens_seen": 94862880, "step": 78005 }, { "epoch": 9.77446435283799, "grad_norm": 0.09732167422771454, "learning_rate": 6.061452004106961e-06, "loss": 0.4601, "num_input_tokens_seen": 94869024, "step": 78010 }, { "epoch": 9.775090840746774, "grad_norm": 0.1137077584862709, "learning_rate": 6.060917745380258e-06, "loss": 0.4533, "num_input_tokens_seen": 94875392, "step": 78015 }, { "epoch": 9.775717328655556, "grad_norm": 0.18468506634235382, "learning_rate": 6.060383473969391e-06, "loss": 0.458, "num_input_tokens_seen": 94881472, "step": 78020 }, { "epoch": 9.77634381656434, "grad_norm": 0.11666541546583176, "learning_rate": 6.059849189880749e-06, "loss": 0.4663, "num_input_tokens_seen": 94887456, "step": 78025 }, { "epoch": 9.776970304473124, "grad_norm": 0.10839574038982391, "learning_rate": 6.059314893120718e-06, "loss": 0.4684, "num_input_tokens_seen": 94893664, "step": 78030 }, { "epoch": 9.777596792381907, "grad_norm": 0.15546457469463348, "learning_rate": 6.058780583695686e-06, "loss": 0.4682, "num_input_tokens_seen": 94899456, "step": 78035 }, { "epoch": 9.77822328029069, "grad_norm": 0.10275797545909882, "learning_rate": 6.058246261612043e-06, "loss": 0.4631, "num_input_tokens_seen": 94905824, "step": 78040 }, { "epoch": 9.778849768199473, "grad_norm": 0.1522226631641388, "learning_rate": 6.057711926876177e-06, "loss": 0.4575, "num_input_tokens_seen": 94911904, "step": 78045 }, { "epoch": 9.779476256108257, "grad_norm": 0.11222923547029495, "learning_rate": 6.057177579494474e-06, "loss": 0.4585, "num_input_tokens_seen": 94918240, "step": 78050 }, { "epoch": 9.780102744017041, "grad_norm": 0.11682192981243134, "learning_rate": 6.0566432194733245e-06, "loss": 0.4611, "num_input_tokens_seen": 94924576, "step": 78055 }, { "epoch": 9.780729231925823, "grad_norm": 0.16172076761722565, "learning_rate": 6.056108846819118e-06, "loss": 0.4658, "num_input_tokens_seen": 94930496, "step": 78060 }, { "epoch": 9.781355719834607, "grad_norm": 0.16863366961479187, "learning_rate": 6.055574461538241e-06, "loss": 0.4549, "num_input_tokens_seen": 94936352, "step": 78065 }, { "epoch": 9.78198220774339, "grad_norm": 0.09790752083063126, "learning_rate": 6.055040063637085e-06, "loss": 0.4629, "num_input_tokens_seen": 94942688, "step": 78070 }, { "epoch": 9.782608695652174, "grad_norm": 0.08846044540405273, "learning_rate": 6.0545056531220366e-06, "loss": 0.4635, "num_input_tokens_seen": 94949088, "step": 78075 }, { "epoch": 9.783235183560958, "grad_norm": 0.10973640531301498, "learning_rate": 6.0539712299994876e-06, "loss": 0.4604, "num_input_tokens_seen": 94954976, "step": 78080 }, { "epoch": 9.78386167146974, "grad_norm": 0.12962324917316437, "learning_rate": 6.053436794275827e-06, "loss": 0.4635, "num_input_tokens_seen": 94961152, "step": 78085 }, { "epoch": 9.784488159378524, "grad_norm": 0.125, "learning_rate": 6.0529023459574425e-06, "loss": 0.4563, "num_input_tokens_seen": 94967616, "step": 78090 }, { "epoch": 9.785114647287308, "grad_norm": 0.11577612161636353, "learning_rate": 6.052367885050726e-06, "loss": 0.4704, "num_input_tokens_seen": 94973920, "step": 78095 }, { "epoch": 9.78574113519609, "grad_norm": 0.13261553645133972, "learning_rate": 6.051833411562066e-06, "loss": 0.4617, "num_input_tokens_seen": 94979904, "step": 78100 }, { "epoch": 9.786367623104875, "grad_norm": 0.14895927906036377, "learning_rate": 6.051298925497854e-06, "loss": 0.4626, "num_input_tokens_seen": 94986016, "step": 78105 }, { "epoch": 9.786994111013657, "grad_norm": 0.13868942856788635, "learning_rate": 6.050764426864479e-06, "loss": 0.4565, "num_input_tokens_seen": 94992032, "step": 78110 }, { "epoch": 9.787620598922441, "grad_norm": 0.14390829205513, "learning_rate": 6.050229915668333e-06, "loss": 0.4552, "num_input_tokens_seen": 94998208, "step": 78115 }, { "epoch": 9.788247086831223, "grad_norm": 0.11361600458621979, "learning_rate": 6.049695391915803e-06, "loss": 0.4711, "num_input_tokens_seen": 95004704, "step": 78120 }, { "epoch": 9.788873574740007, "grad_norm": 0.13398025929927826, "learning_rate": 6.049160855613283e-06, "loss": 0.4588, "num_input_tokens_seen": 95010720, "step": 78125 }, { "epoch": 9.789500062648791, "grad_norm": 0.12232384085655212, "learning_rate": 6.048626306767165e-06, "loss": 0.4566, "num_input_tokens_seen": 95017440, "step": 78130 }, { "epoch": 9.790126550557574, "grad_norm": 0.10244008898735046, "learning_rate": 6.048091745383835e-06, "loss": 0.4651, "num_input_tokens_seen": 95023712, "step": 78135 }, { "epoch": 9.790753038466358, "grad_norm": 0.1110333576798439, "learning_rate": 6.047557171469689e-06, "loss": 0.4596, "num_input_tokens_seen": 95029792, "step": 78140 }, { "epoch": 9.791379526375142, "grad_norm": 0.14254282414913177, "learning_rate": 6.047022585031115e-06, "loss": 0.4591, "num_input_tokens_seen": 95036128, "step": 78145 }, { "epoch": 9.792006014283924, "grad_norm": 0.12415491044521332, "learning_rate": 6.046487986074505e-06, "loss": 0.465, "num_input_tokens_seen": 95042272, "step": 78150 }, { "epoch": 9.792632502192708, "grad_norm": 0.12136052548885345, "learning_rate": 6.045953374606253e-06, "loss": 0.4621, "num_input_tokens_seen": 95047776, "step": 78155 }, { "epoch": 9.79325899010149, "grad_norm": 0.11284129321575165, "learning_rate": 6.045418750632749e-06, "loss": 0.4617, "num_input_tokens_seen": 95053824, "step": 78160 }, { "epoch": 9.793885478010274, "grad_norm": 0.12154329568147659, "learning_rate": 6.044884114160384e-06, "loss": 0.4578, "num_input_tokens_seen": 95060000, "step": 78165 }, { "epoch": 9.794511965919058, "grad_norm": 0.12598305940628052, "learning_rate": 6.044349465195551e-06, "loss": 0.4601, "num_input_tokens_seen": 95066368, "step": 78170 }, { "epoch": 9.79513845382784, "grad_norm": 0.0937858372926712, "learning_rate": 6.043814803744643e-06, "loss": 0.4571, "num_input_tokens_seen": 95072288, "step": 78175 }, { "epoch": 9.795764941736625, "grad_norm": 0.10848970711231232, "learning_rate": 6.043280129814051e-06, "loss": 0.4644, "num_input_tokens_seen": 95078528, "step": 78180 }, { "epoch": 9.796391429645407, "grad_norm": 0.1533002257347107, "learning_rate": 6.042745443410168e-06, "loss": 0.4698, "num_input_tokens_seen": 95084512, "step": 78185 }, { "epoch": 9.797017917554191, "grad_norm": 0.16249258816242218, "learning_rate": 6.042210744539385e-06, "loss": 0.4653, "num_input_tokens_seen": 95090592, "step": 78190 }, { "epoch": 9.797644405462975, "grad_norm": 0.10347288846969604, "learning_rate": 6.041676033208097e-06, "loss": 0.4569, "num_input_tokens_seen": 95096768, "step": 78195 }, { "epoch": 9.798270893371757, "grad_norm": 0.12155160307884216, "learning_rate": 6.041141309422697e-06, "loss": 0.4634, "num_input_tokens_seen": 95102560, "step": 78200 }, { "epoch": 9.798897381280542, "grad_norm": 0.10417173057794571, "learning_rate": 6.040606573189577e-06, "loss": 0.4606, "num_input_tokens_seen": 95108544, "step": 78205 }, { "epoch": 9.799523869189324, "grad_norm": 0.20305849611759186, "learning_rate": 6.040071824515131e-06, "loss": 0.462, "num_input_tokens_seen": 95114464, "step": 78210 }, { "epoch": 9.800150357098108, "grad_norm": 0.1115805134177208, "learning_rate": 6.0395370634057515e-06, "loss": 0.4578, "num_input_tokens_seen": 95120288, "step": 78215 }, { "epoch": 9.800776845006892, "grad_norm": 0.11254896223545074, "learning_rate": 6.039002289867832e-06, "loss": 0.4676, "num_input_tokens_seen": 95126688, "step": 78220 }, { "epoch": 9.801403332915674, "grad_norm": 0.12945358455181122, "learning_rate": 6.038467503907768e-06, "loss": 0.4685, "num_input_tokens_seen": 95132864, "step": 78225 }, { "epoch": 9.802029820824458, "grad_norm": 0.10810382664203644, "learning_rate": 6.037932705531952e-06, "loss": 0.4524, "num_input_tokens_seen": 95138816, "step": 78230 }, { "epoch": 9.80265630873324, "grad_norm": 0.10885820537805557, "learning_rate": 6.037397894746776e-06, "loss": 0.4537, "num_input_tokens_seen": 95145184, "step": 78235 }, { "epoch": 9.803282796642025, "grad_norm": 0.11657081544399261, "learning_rate": 6.036863071558636e-06, "loss": 0.4656, "num_input_tokens_seen": 95151136, "step": 78240 }, { "epoch": 9.803909284550809, "grad_norm": 0.11211283504962921, "learning_rate": 6.036328235973927e-06, "loss": 0.4497, "num_input_tokens_seen": 95156960, "step": 78245 }, { "epoch": 9.804535772459591, "grad_norm": 0.1398601233959198, "learning_rate": 6.035793387999041e-06, "loss": 0.462, "num_input_tokens_seen": 95163200, "step": 78250 }, { "epoch": 9.805162260368375, "grad_norm": 0.10980957746505737, "learning_rate": 6.035258527640376e-06, "loss": 0.4698, "num_input_tokens_seen": 95169664, "step": 78255 }, { "epoch": 9.805788748277159, "grad_norm": 0.1578395813703537, "learning_rate": 6.034723654904325e-06, "loss": 0.4596, "num_input_tokens_seen": 95175744, "step": 78260 }, { "epoch": 9.806415236185941, "grad_norm": 0.0910366028547287, "learning_rate": 6.034188769797283e-06, "loss": 0.4641, "num_input_tokens_seen": 95182272, "step": 78265 }, { "epoch": 9.807041724094725, "grad_norm": 0.1515323966741562, "learning_rate": 6.033653872325644e-06, "loss": 0.4691, "num_input_tokens_seen": 95188448, "step": 78270 }, { "epoch": 9.807668212003508, "grad_norm": 0.1255938559770584, "learning_rate": 6.0331189624958055e-06, "loss": 0.465, "num_input_tokens_seen": 95194624, "step": 78275 }, { "epoch": 9.808294699912292, "grad_norm": 0.1707608699798584, "learning_rate": 6.032584040314159e-06, "loss": 0.4591, "num_input_tokens_seen": 95201120, "step": 78280 }, { "epoch": 9.808921187821076, "grad_norm": 0.11752539128065109, "learning_rate": 6.032049105787103e-06, "loss": 0.4666, "num_input_tokens_seen": 95207392, "step": 78285 }, { "epoch": 9.809547675729858, "grad_norm": 0.10863590985536575, "learning_rate": 6.031514158921033e-06, "loss": 0.4642, "num_input_tokens_seen": 95213280, "step": 78290 }, { "epoch": 9.810174163638642, "grad_norm": 0.1287805140018463, "learning_rate": 6.030979199722342e-06, "loss": 0.4592, "num_input_tokens_seen": 95219552, "step": 78295 }, { "epoch": 9.810800651547424, "grad_norm": 0.13189992308616638, "learning_rate": 6.03044422819743e-06, "loss": 0.4604, "num_input_tokens_seen": 95225504, "step": 78300 }, { "epoch": 9.811427139456208, "grad_norm": 0.16977469623088837, "learning_rate": 6.029909244352688e-06, "loss": 0.4616, "num_input_tokens_seen": 95231808, "step": 78305 }, { "epoch": 9.812053627364993, "grad_norm": 0.19588793814182281, "learning_rate": 6.029374248194517e-06, "loss": 0.4625, "num_input_tokens_seen": 95237920, "step": 78310 }, { "epoch": 9.812680115273775, "grad_norm": 0.12427923083305359, "learning_rate": 6.028839239729312e-06, "loss": 0.4643, "num_input_tokens_seen": 95244000, "step": 78315 }, { "epoch": 9.813306603182559, "grad_norm": 0.12047634273767471, "learning_rate": 6.028304218963469e-06, "loss": 0.4647, "num_input_tokens_seen": 95250080, "step": 78320 }, { "epoch": 9.813933091091341, "grad_norm": 0.11092273890972137, "learning_rate": 6.027769185903384e-06, "loss": 0.469, "num_input_tokens_seen": 95256160, "step": 78325 }, { "epoch": 9.814559579000125, "grad_norm": 0.14272400736808777, "learning_rate": 6.027234140555453e-06, "loss": 0.4605, "num_input_tokens_seen": 95262080, "step": 78330 }, { "epoch": 9.81518606690891, "grad_norm": 0.09641335904598236, "learning_rate": 6.0266990829260765e-06, "loss": 0.4543, "num_input_tokens_seen": 95268000, "step": 78335 }, { "epoch": 9.815812554817692, "grad_norm": 0.09994039684534073, "learning_rate": 6.026164013021648e-06, "loss": 0.4663, "num_input_tokens_seen": 95273600, "step": 78340 }, { "epoch": 9.816439042726476, "grad_norm": 0.1722109466791153, "learning_rate": 6.025628930848566e-06, "loss": 0.4566, "num_input_tokens_seen": 95279552, "step": 78345 }, { "epoch": 9.817065530635258, "grad_norm": 0.09005500376224518, "learning_rate": 6.025093836413227e-06, "loss": 0.4661, "num_input_tokens_seen": 95285216, "step": 78350 }, { "epoch": 9.817692018544042, "grad_norm": 0.12997570633888245, "learning_rate": 6.024558729722031e-06, "loss": 0.4628, "num_input_tokens_seen": 95291392, "step": 78355 }, { "epoch": 9.818318506452826, "grad_norm": 0.10885576903820038, "learning_rate": 6.024023610781373e-06, "loss": 0.4653, "num_input_tokens_seen": 95297664, "step": 78360 }, { "epoch": 9.818944994361608, "grad_norm": 0.10442407429218292, "learning_rate": 6.023488479597652e-06, "loss": 0.4584, "num_input_tokens_seen": 95303776, "step": 78365 }, { "epoch": 9.819571482270392, "grad_norm": 0.11653809994459152, "learning_rate": 6.022953336177265e-06, "loss": 0.4709, "num_input_tokens_seen": 95309408, "step": 78370 }, { "epoch": 9.820197970179176, "grad_norm": 0.17056462168693542, "learning_rate": 6.022418180526613e-06, "loss": 0.4581, "num_input_tokens_seen": 95315584, "step": 78375 }, { "epoch": 9.820824458087959, "grad_norm": 0.10597097873687744, "learning_rate": 6.02188301265209e-06, "loss": 0.4549, "num_input_tokens_seen": 95321632, "step": 78380 }, { "epoch": 9.821450945996743, "grad_norm": 0.15818354487419128, "learning_rate": 6.021347832560097e-06, "loss": 0.4556, "num_input_tokens_seen": 95327680, "step": 78385 }, { "epoch": 9.822077433905525, "grad_norm": 0.24596765637397766, "learning_rate": 6.020812640257032e-06, "loss": 0.4626, "num_input_tokens_seen": 95334112, "step": 78390 }, { "epoch": 9.822703921814309, "grad_norm": 0.17001652717590332, "learning_rate": 6.020277435749294e-06, "loss": 0.4634, "num_input_tokens_seen": 95340160, "step": 78395 }, { "epoch": 9.823330409723093, "grad_norm": 0.14058899879455566, "learning_rate": 6.01974221904328e-06, "loss": 0.4643, "num_input_tokens_seen": 95346368, "step": 78400 }, { "epoch": 9.823956897631875, "grad_norm": 0.11642616987228394, "learning_rate": 6.019206990145392e-06, "loss": 0.4615, "num_input_tokens_seen": 95352352, "step": 78405 }, { "epoch": 9.82458338554066, "grad_norm": 0.1234380453824997, "learning_rate": 6.018671749062027e-06, "loss": 0.4686, "num_input_tokens_seen": 95358752, "step": 78410 }, { "epoch": 9.825209873449442, "grad_norm": 0.16378772258758545, "learning_rate": 6.018136495799585e-06, "loss": 0.4666, "num_input_tokens_seen": 95364640, "step": 78415 }, { "epoch": 9.825836361358226, "grad_norm": 0.12062600255012512, "learning_rate": 6.017601230364465e-06, "loss": 0.4632, "num_input_tokens_seen": 95370944, "step": 78420 }, { "epoch": 9.82646284926701, "grad_norm": 0.16288405656814575, "learning_rate": 6.017065952763067e-06, "loss": 0.4636, "num_input_tokens_seen": 95377504, "step": 78425 }, { "epoch": 9.827089337175792, "grad_norm": 0.1760682463645935, "learning_rate": 6.01653066300179e-06, "loss": 0.4631, "num_input_tokens_seen": 95383616, "step": 78430 }, { "epoch": 9.827715825084576, "grad_norm": 0.11981095373630524, "learning_rate": 6.0159953610870336e-06, "loss": 0.4646, "num_input_tokens_seen": 95389824, "step": 78435 }, { "epoch": 9.828342312993358, "grad_norm": 0.12822329998016357, "learning_rate": 6.0154600470252e-06, "loss": 0.455, "num_input_tokens_seen": 95396192, "step": 78440 }, { "epoch": 9.828968800902143, "grad_norm": 0.12309889495372772, "learning_rate": 6.014924720822687e-06, "loss": 0.4547, "num_input_tokens_seen": 95402272, "step": 78445 }, { "epoch": 9.829595288810927, "grad_norm": 0.11159438639879227, "learning_rate": 6.0143893824858956e-06, "loss": 0.4566, "num_input_tokens_seen": 95408480, "step": 78450 }, { "epoch": 9.830221776719709, "grad_norm": 0.1152728796005249, "learning_rate": 6.013854032021226e-06, "loss": 0.4593, "num_input_tokens_seen": 95415008, "step": 78455 }, { "epoch": 9.830848264628493, "grad_norm": 0.12747736275196075, "learning_rate": 6.01331866943508e-06, "loss": 0.4591, "num_input_tokens_seen": 95420896, "step": 78460 }, { "epoch": 9.831474752537275, "grad_norm": 0.14197584986686707, "learning_rate": 6.0127832947338575e-06, "loss": 0.4558, "num_input_tokens_seen": 95427104, "step": 78465 }, { "epoch": 9.83210124044606, "grad_norm": 0.13009700179100037, "learning_rate": 6.012247907923958e-06, "loss": 0.4642, "num_input_tokens_seen": 95433792, "step": 78470 }, { "epoch": 9.832727728354843, "grad_norm": 0.12097133696079254, "learning_rate": 6.011712509011785e-06, "loss": 0.4659, "num_input_tokens_seen": 95439712, "step": 78475 }, { "epoch": 9.833354216263626, "grad_norm": 0.19563741981983185, "learning_rate": 6.011177098003738e-06, "loss": 0.4705, "num_input_tokens_seen": 95445632, "step": 78480 }, { "epoch": 9.83398070417241, "grad_norm": 0.12977047264575958, "learning_rate": 6.010641674906217e-06, "loss": 0.458, "num_input_tokens_seen": 95451808, "step": 78485 }, { "epoch": 9.834607192081194, "grad_norm": 0.10327356308698654, "learning_rate": 6.0101062397256274e-06, "loss": 0.4525, "num_input_tokens_seen": 95457888, "step": 78490 }, { "epoch": 9.835233679989976, "grad_norm": 0.1558334231376648, "learning_rate": 6.009570792468367e-06, "loss": 0.4681, "num_input_tokens_seen": 95463904, "step": 78495 }, { "epoch": 9.83586016789876, "grad_norm": 0.10966424643993378, "learning_rate": 6.009035333140838e-06, "loss": 0.4584, "num_input_tokens_seen": 95470112, "step": 78500 }, { "epoch": 9.836486655807542, "grad_norm": 0.12633316218852997, "learning_rate": 6.008499861749444e-06, "loss": 0.4606, "num_input_tokens_seen": 95476352, "step": 78505 }, { "epoch": 9.837113143716326, "grad_norm": 0.14554515480995178, "learning_rate": 6.007964378300587e-06, "loss": 0.4563, "num_input_tokens_seen": 95482784, "step": 78510 }, { "epoch": 9.83773963162511, "grad_norm": 0.13741040229797363, "learning_rate": 6.007428882800669e-06, "loss": 0.4665, "num_input_tokens_seen": 95489120, "step": 78515 }, { "epoch": 9.838366119533893, "grad_norm": 0.1466815024614334, "learning_rate": 6.006893375256091e-06, "loss": 0.4593, "num_input_tokens_seen": 95495104, "step": 78520 }, { "epoch": 9.838992607442677, "grad_norm": 0.1315167099237442, "learning_rate": 6.006357855673256e-06, "loss": 0.465, "num_input_tokens_seen": 95500928, "step": 78525 }, { "epoch": 9.839619095351459, "grad_norm": 0.11714812368154526, "learning_rate": 6.005822324058567e-06, "loss": 0.4551, "num_input_tokens_seen": 95506848, "step": 78530 }, { "epoch": 9.840245583260243, "grad_norm": 0.134802907705307, "learning_rate": 6.0052867804184265e-06, "loss": 0.4531, "num_input_tokens_seen": 95512512, "step": 78535 }, { "epoch": 9.840872071169027, "grad_norm": 0.1358528882265091, "learning_rate": 6.004751224759238e-06, "loss": 0.4661, "num_input_tokens_seen": 95518656, "step": 78540 }, { "epoch": 9.84149855907781, "grad_norm": 0.13816465437412262, "learning_rate": 6.004215657087402e-06, "loss": 0.4536, "num_input_tokens_seen": 95524640, "step": 78545 }, { "epoch": 9.842125046986594, "grad_norm": 0.10202021896839142, "learning_rate": 6.003680077409324e-06, "loss": 0.4616, "num_input_tokens_seen": 95530304, "step": 78550 }, { "epoch": 9.842751534895376, "grad_norm": 0.16163912415504456, "learning_rate": 6.003144485731408e-06, "loss": 0.4799, "num_input_tokens_seen": 95536480, "step": 78555 }, { "epoch": 9.84337802280416, "grad_norm": 0.11420726776123047, "learning_rate": 6.002608882060054e-06, "loss": 0.467, "num_input_tokens_seen": 95542560, "step": 78560 }, { "epoch": 9.844004510712944, "grad_norm": 0.15598070621490479, "learning_rate": 6.00207326640167e-06, "loss": 0.455, "num_input_tokens_seen": 95548800, "step": 78565 }, { "epoch": 9.844630998621726, "grad_norm": 0.1404094398021698, "learning_rate": 6.001537638762658e-06, "loss": 0.4566, "num_input_tokens_seen": 95554912, "step": 78570 }, { "epoch": 9.84525748653051, "grad_norm": 0.1104041337966919, "learning_rate": 6.001001999149421e-06, "loss": 0.463, "num_input_tokens_seen": 95560544, "step": 78575 }, { "epoch": 9.845883974439293, "grad_norm": 0.1273433417081833, "learning_rate": 6.000466347568362e-06, "loss": 0.4733, "num_input_tokens_seen": 95566304, "step": 78580 }, { "epoch": 9.846510462348077, "grad_norm": 0.08751913905143738, "learning_rate": 5.9999306840258886e-06, "loss": 0.4528, "num_input_tokens_seen": 95572032, "step": 78585 }, { "epoch": 9.84713695025686, "grad_norm": 0.11366423964500427, "learning_rate": 5.999395008528402e-06, "loss": 0.4687, "num_input_tokens_seen": 95578112, "step": 78590 }, { "epoch": 9.847763438165643, "grad_norm": 0.13396601378917694, "learning_rate": 5.998859321082308e-06, "loss": 0.4627, "num_input_tokens_seen": 95584256, "step": 78595 }, { "epoch": 9.848389926074427, "grad_norm": 0.09506220370531082, "learning_rate": 5.998323621694012e-06, "loss": 0.4648, "num_input_tokens_seen": 95590176, "step": 78600 }, { "epoch": 9.849016413983211, "grad_norm": 0.12972450256347656, "learning_rate": 5.997787910369916e-06, "loss": 0.4644, "num_input_tokens_seen": 95596480, "step": 78605 }, { "epoch": 9.849642901891993, "grad_norm": 0.19023920595645905, "learning_rate": 5.997252187116428e-06, "loss": 0.4547, "num_input_tokens_seen": 95602240, "step": 78610 }, { "epoch": 9.850269389800777, "grad_norm": 0.11557778716087341, "learning_rate": 5.99671645193995e-06, "loss": 0.4527, "num_input_tokens_seen": 95608480, "step": 78615 }, { "epoch": 9.85089587770956, "grad_norm": 0.11907890439033508, "learning_rate": 5.99618070484689e-06, "loss": 0.4544, "num_input_tokens_seen": 95614720, "step": 78620 }, { "epoch": 9.851522365618344, "grad_norm": 0.12272278964519501, "learning_rate": 5.995644945843653e-06, "loss": 0.4592, "num_input_tokens_seen": 95620960, "step": 78625 }, { "epoch": 9.852148853527126, "grad_norm": 0.2592563331127167, "learning_rate": 5.995109174936643e-06, "loss": 0.464, "num_input_tokens_seen": 95627104, "step": 78630 }, { "epoch": 9.85277534143591, "grad_norm": 0.14675141870975494, "learning_rate": 5.994573392132266e-06, "loss": 0.4551, "num_input_tokens_seen": 95633376, "step": 78635 }, { "epoch": 9.853401829344694, "grad_norm": 0.10733433067798615, "learning_rate": 5.994037597436928e-06, "loss": 0.4598, "num_input_tokens_seen": 95639456, "step": 78640 }, { "epoch": 9.854028317253476, "grad_norm": 0.1665259301662445, "learning_rate": 5.993501790857035e-06, "loss": 0.4608, "num_input_tokens_seen": 95645152, "step": 78645 }, { "epoch": 9.85465480516226, "grad_norm": 0.1245250254869461, "learning_rate": 5.992965972398992e-06, "loss": 0.4684, "num_input_tokens_seen": 95651328, "step": 78650 }, { "epoch": 9.855281293071045, "grad_norm": 0.0874408632516861, "learning_rate": 5.992430142069206e-06, "loss": 0.4593, "num_input_tokens_seen": 95657760, "step": 78655 }, { "epoch": 9.855907780979827, "grad_norm": 0.14424335956573486, "learning_rate": 5.991894299874084e-06, "loss": 0.4623, "num_input_tokens_seen": 95663488, "step": 78660 }, { "epoch": 9.85653426888861, "grad_norm": 0.16877563297748566, "learning_rate": 5.991358445820031e-06, "loss": 0.4598, "num_input_tokens_seen": 95669536, "step": 78665 }, { "epoch": 9.857160756797393, "grad_norm": 0.15088032186031342, "learning_rate": 5.990822579913456e-06, "loss": 0.4554, "num_input_tokens_seen": 95675744, "step": 78670 }, { "epoch": 9.857787244706177, "grad_norm": 0.11744518578052521, "learning_rate": 5.990286702160761e-06, "loss": 0.4671, "num_input_tokens_seen": 95681792, "step": 78675 }, { "epoch": 9.858413732614961, "grad_norm": 0.1513601690530777, "learning_rate": 5.989750812568358e-06, "loss": 0.4683, "num_input_tokens_seen": 95687680, "step": 78680 }, { "epoch": 9.859040220523744, "grad_norm": 0.1894465535879135, "learning_rate": 5.989214911142651e-06, "loss": 0.4631, "num_input_tokens_seen": 95693824, "step": 78685 }, { "epoch": 9.859666708432528, "grad_norm": 0.14555047452449799, "learning_rate": 5.988678997890049e-06, "loss": 0.4559, "num_input_tokens_seen": 95699872, "step": 78690 }, { "epoch": 9.86029319634131, "grad_norm": 0.14951638877391815, "learning_rate": 5.988143072816957e-06, "loss": 0.4649, "num_input_tokens_seen": 95705984, "step": 78695 }, { "epoch": 9.860919684250094, "grad_norm": 0.13586929440498352, "learning_rate": 5.987607135929786e-06, "loss": 0.4591, "num_input_tokens_seen": 95711872, "step": 78700 }, { "epoch": 9.861546172158878, "grad_norm": 0.1578921526670456, "learning_rate": 5.9870711872349394e-06, "loss": 0.465, "num_input_tokens_seen": 95718144, "step": 78705 }, { "epoch": 9.86217266006766, "grad_norm": 0.29864731431007385, "learning_rate": 5.986535226738828e-06, "loss": 0.4581, "num_input_tokens_seen": 95724096, "step": 78710 }, { "epoch": 9.862799147976444, "grad_norm": 0.15462814271450043, "learning_rate": 5.985999254447858e-06, "loss": 0.4731, "num_input_tokens_seen": 95729856, "step": 78715 }, { "epoch": 9.863425635885227, "grad_norm": 0.2081943154335022, "learning_rate": 5.985463270368437e-06, "loss": 0.4605, "num_input_tokens_seen": 95735968, "step": 78720 }, { "epoch": 9.86405212379401, "grad_norm": 0.19238537549972534, "learning_rate": 5.984927274506975e-06, "loss": 0.4623, "num_input_tokens_seen": 95742144, "step": 78725 }, { "epoch": 9.864678611702795, "grad_norm": 0.22491933405399323, "learning_rate": 5.98439126686988e-06, "loss": 0.4608, "num_input_tokens_seen": 95748128, "step": 78730 }, { "epoch": 9.865305099611577, "grad_norm": 0.13653990626335144, "learning_rate": 5.9838552474635595e-06, "loss": 0.4603, "num_input_tokens_seen": 95754176, "step": 78735 }, { "epoch": 9.865931587520361, "grad_norm": 0.09214384108781815, "learning_rate": 5.983319216294421e-06, "loss": 0.4711, "num_input_tokens_seen": 95760000, "step": 78740 }, { "epoch": 9.866558075429143, "grad_norm": 0.1694178432226181, "learning_rate": 5.982783173368876e-06, "loss": 0.4664, "num_input_tokens_seen": 95765984, "step": 78745 }, { "epoch": 9.867184563337927, "grad_norm": 0.2016129195690155, "learning_rate": 5.982247118693332e-06, "loss": 0.4613, "num_input_tokens_seen": 95771872, "step": 78750 }, { "epoch": 9.867811051246711, "grad_norm": 0.13989193737506866, "learning_rate": 5.981711052274197e-06, "loss": 0.458, "num_input_tokens_seen": 95778240, "step": 78755 }, { "epoch": 9.868437539155494, "grad_norm": 0.16312362253665924, "learning_rate": 5.981174974117881e-06, "loss": 0.453, "num_input_tokens_seen": 95784352, "step": 78760 }, { "epoch": 9.869064027064278, "grad_norm": 0.14715105295181274, "learning_rate": 5.980638884230793e-06, "loss": 0.4626, "num_input_tokens_seen": 95790592, "step": 78765 }, { "epoch": 9.869690514973062, "grad_norm": 0.16466611623764038, "learning_rate": 5.980102782619343e-06, "loss": 0.4644, "num_input_tokens_seen": 95796608, "step": 78770 }, { "epoch": 9.870317002881844, "grad_norm": 0.31465649604797363, "learning_rate": 5.97956666928994e-06, "loss": 0.4642, "num_input_tokens_seen": 95802336, "step": 78775 }, { "epoch": 9.870943490790628, "grad_norm": 0.16480235755443573, "learning_rate": 5.979030544248995e-06, "loss": 0.465, "num_input_tokens_seen": 95808256, "step": 78780 }, { "epoch": 9.87156997869941, "grad_norm": 0.15197338163852692, "learning_rate": 5.978494407502916e-06, "loss": 0.464, "num_input_tokens_seen": 95814592, "step": 78785 }, { "epoch": 9.872196466608194, "grad_norm": 0.1265251636505127, "learning_rate": 5.977958259058115e-06, "loss": 0.4491, "num_input_tokens_seen": 95820864, "step": 78790 }, { "epoch": 9.872822954516979, "grad_norm": 0.16751953959465027, "learning_rate": 5.9774220989209985e-06, "loss": 0.4625, "num_input_tokens_seen": 95827264, "step": 78795 }, { "epoch": 9.87344944242576, "grad_norm": 0.1536800116300583, "learning_rate": 5.9768859270979795e-06, "loss": 0.4568, "num_input_tokens_seen": 95833312, "step": 78800 }, { "epoch": 9.874075930334545, "grad_norm": 0.13969455659389496, "learning_rate": 5.976349743595469e-06, "loss": 0.4627, "num_input_tokens_seen": 95839616, "step": 78805 }, { "epoch": 9.874702418243327, "grad_norm": 0.10796286165714264, "learning_rate": 5.975813548419875e-06, "loss": 0.456, "num_input_tokens_seen": 95844960, "step": 78810 }, { "epoch": 9.875328906152111, "grad_norm": 0.13536436855793, "learning_rate": 5.97527734157761e-06, "loss": 0.4574, "num_input_tokens_seen": 95851168, "step": 78815 }, { "epoch": 9.875955394060895, "grad_norm": 0.11899147182703018, "learning_rate": 5.974741123075085e-06, "loss": 0.4631, "num_input_tokens_seen": 95856672, "step": 78820 }, { "epoch": 9.876581881969678, "grad_norm": 0.15557700395584106, "learning_rate": 5.9742048929187104e-06, "loss": 0.4653, "num_input_tokens_seen": 95862176, "step": 78825 }, { "epoch": 9.877208369878462, "grad_norm": 0.15324433147907257, "learning_rate": 5.9736686511148975e-06, "loss": 0.4614, "num_input_tokens_seen": 95868192, "step": 78830 }, { "epoch": 9.877834857787244, "grad_norm": 0.14128829538822174, "learning_rate": 5.973132397670057e-06, "loss": 0.4633, "num_input_tokens_seen": 95874464, "step": 78835 }, { "epoch": 9.878461345696028, "grad_norm": 0.10381066054105759, "learning_rate": 5.9725961325906e-06, "loss": 0.4662, "num_input_tokens_seen": 95880544, "step": 78840 }, { "epoch": 9.879087833604812, "grad_norm": 0.13630715012550354, "learning_rate": 5.972059855882939e-06, "loss": 0.4623, "num_input_tokens_seen": 95886336, "step": 78845 }, { "epoch": 9.879714321513594, "grad_norm": 0.1223897710442543, "learning_rate": 5.971523567553485e-06, "loss": 0.4557, "num_input_tokens_seen": 95892512, "step": 78850 }, { "epoch": 9.880340809422378, "grad_norm": 0.11615925282239914, "learning_rate": 5.970987267608651e-06, "loss": 0.4637, "num_input_tokens_seen": 95898976, "step": 78855 }, { "epoch": 9.88096729733116, "grad_norm": 0.1326988935470581, "learning_rate": 5.9704509560548465e-06, "loss": 0.4628, "num_input_tokens_seen": 95904960, "step": 78860 }, { "epoch": 9.881593785239945, "grad_norm": 0.12887555360794067, "learning_rate": 5.969914632898485e-06, "loss": 0.4676, "num_input_tokens_seen": 95911008, "step": 78865 }, { "epoch": 9.882220273148729, "grad_norm": 0.14274053275585175, "learning_rate": 5.969378298145978e-06, "loss": 0.4612, "num_input_tokens_seen": 95917376, "step": 78870 }, { "epoch": 9.882846761057511, "grad_norm": 0.1483771800994873, "learning_rate": 5.968841951803739e-06, "loss": 0.4538, "num_input_tokens_seen": 95923232, "step": 78875 }, { "epoch": 9.883473248966295, "grad_norm": 0.1571407914161682, "learning_rate": 5.968305593878181e-06, "loss": 0.4635, "num_input_tokens_seen": 95929472, "step": 78880 }, { "epoch": 9.88409973687508, "grad_norm": 0.17521408200263977, "learning_rate": 5.967769224375714e-06, "loss": 0.4598, "num_input_tokens_seen": 95935584, "step": 78885 }, { "epoch": 9.884726224783861, "grad_norm": 0.10569090396165848, "learning_rate": 5.967232843302754e-06, "loss": 0.4644, "num_input_tokens_seen": 95941504, "step": 78890 }, { "epoch": 9.885352712692645, "grad_norm": 0.15153469145298004, "learning_rate": 5.966696450665711e-06, "loss": 0.4625, "num_input_tokens_seen": 95947744, "step": 78895 }, { "epoch": 9.885979200601428, "grad_norm": 0.12219391018152237, "learning_rate": 5.966160046470999e-06, "loss": 0.4662, "num_input_tokens_seen": 95953280, "step": 78900 }, { "epoch": 9.886605688510212, "grad_norm": 0.11402829736471176, "learning_rate": 5.965623630725033e-06, "loss": 0.4671, "num_input_tokens_seen": 95959392, "step": 78905 }, { "epoch": 9.887232176418996, "grad_norm": 0.13679248094558716, "learning_rate": 5.965087203434223e-06, "loss": 0.4636, "num_input_tokens_seen": 95965344, "step": 78910 }, { "epoch": 9.887858664327778, "grad_norm": 0.11781629174947739, "learning_rate": 5.964550764604984e-06, "loss": 0.4575, "num_input_tokens_seen": 95971072, "step": 78915 }, { "epoch": 9.888485152236562, "grad_norm": 0.1377866417169571, "learning_rate": 5.964014314243729e-06, "loss": 0.4602, "num_input_tokens_seen": 95976832, "step": 78920 }, { "epoch": 9.889111640145344, "grad_norm": 0.10803468525409698, "learning_rate": 5.963477852356872e-06, "loss": 0.4648, "num_input_tokens_seen": 95982944, "step": 78925 }, { "epoch": 9.889738128054129, "grad_norm": 0.12286452949047089, "learning_rate": 5.962941378950829e-06, "loss": 0.4567, "num_input_tokens_seen": 95988928, "step": 78930 }, { "epoch": 9.890364615962913, "grad_norm": 0.12316926568746567, "learning_rate": 5.962404894032012e-06, "loss": 0.4651, "num_input_tokens_seen": 95995136, "step": 78935 }, { "epoch": 9.890991103871695, "grad_norm": 0.23825368285179138, "learning_rate": 5.961868397606836e-06, "loss": 0.4503, "num_input_tokens_seen": 96001184, "step": 78940 }, { "epoch": 9.891617591780479, "grad_norm": 0.12801459431648254, "learning_rate": 5.961331889681714e-06, "loss": 0.4628, "num_input_tokens_seen": 96007264, "step": 78945 }, { "epoch": 9.892244079689261, "grad_norm": 0.150816410779953, "learning_rate": 5.960795370263062e-06, "loss": 0.4633, "num_input_tokens_seen": 96013088, "step": 78950 }, { "epoch": 9.892870567598045, "grad_norm": 0.1673995554447174, "learning_rate": 5.960258839357291e-06, "loss": 0.4587, "num_input_tokens_seen": 96019232, "step": 78955 }, { "epoch": 9.89349705550683, "grad_norm": 0.15778732299804688, "learning_rate": 5.95972229697082e-06, "loss": 0.4666, "num_input_tokens_seen": 96025248, "step": 78960 }, { "epoch": 9.894123543415612, "grad_norm": 0.11349817365407944, "learning_rate": 5.959185743110062e-06, "loss": 0.4623, "num_input_tokens_seen": 96031360, "step": 78965 }, { "epoch": 9.894750031324396, "grad_norm": 0.12152372300624847, "learning_rate": 5.958649177781431e-06, "loss": 0.4634, "num_input_tokens_seen": 96037952, "step": 78970 }, { "epoch": 9.895376519233178, "grad_norm": 0.11162097007036209, "learning_rate": 5.958112600991343e-06, "loss": 0.4645, "num_input_tokens_seen": 96043968, "step": 78975 }, { "epoch": 9.896003007141962, "grad_norm": 0.13476736843585968, "learning_rate": 5.957576012746216e-06, "loss": 0.4614, "num_input_tokens_seen": 96049504, "step": 78980 }, { "epoch": 9.896629495050746, "grad_norm": 0.15437312424182892, "learning_rate": 5.9570394130524614e-06, "loss": 0.4675, "num_input_tokens_seen": 96055712, "step": 78985 }, { "epoch": 9.897255982959528, "grad_norm": 0.14319679141044617, "learning_rate": 5.956502801916494e-06, "loss": 0.4579, "num_input_tokens_seen": 96061728, "step": 78990 }, { "epoch": 9.897882470868312, "grad_norm": 0.10696937143802643, "learning_rate": 5.955966179344732e-06, "loss": 0.4598, "num_input_tokens_seen": 96068064, "step": 78995 }, { "epoch": 9.898508958777096, "grad_norm": 0.25072067975997925, "learning_rate": 5.955429545343592e-06, "loss": 0.4726, "num_input_tokens_seen": 96074272, "step": 79000 }, { "epoch": 9.899135446685879, "grad_norm": 0.9739434719085693, "learning_rate": 5.954892899919487e-06, "loss": 0.4659, "num_input_tokens_seen": 96080416, "step": 79005 }, { "epoch": 9.899761934594663, "grad_norm": 31.644651412963867, "learning_rate": 5.9543562430788375e-06, "loss": 0.6567, "num_input_tokens_seen": 96086592, "step": 79010 }, { "epoch": 9.900388422503445, "grad_norm": 0.6943290829658508, "learning_rate": 5.953819574828054e-06, "loss": 0.4782, "num_input_tokens_seen": 96092704, "step": 79015 }, { "epoch": 9.90101491041223, "grad_norm": 0.2022555023431778, "learning_rate": 5.953282895173556e-06, "loss": 0.4761, "num_input_tokens_seen": 96098944, "step": 79020 }, { "epoch": 9.901641398321013, "grad_norm": 0.1830633133649826, "learning_rate": 5.95274620412176e-06, "loss": 0.4551, "num_input_tokens_seen": 96104608, "step": 79025 }, { "epoch": 9.902267886229795, "grad_norm": 0.1582789272069931, "learning_rate": 5.9522095016790816e-06, "loss": 0.4689, "num_input_tokens_seen": 96110720, "step": 79030 }, { "epoch": 9.90289437413858, "grad_norm": 0.16262724995613098, "learning_rate": 5.951672787851939e-06, "loss": 0.4664, "num_input_tokens_seen": 96116832, "step": 79035 }, { "epoch": 9.903520862047362, "grad_norm": 0.10123959183692932, "learning_rate": 5.951136062646746e-06, "loss": 0.4614, "num_input_tokens_seen": 96122880, "step": 79040 }, { "epoch": 9.904147349956146, "grad_norm": 0.10115604847669601, "learning_rate": 5.950599326069923e-06, "loss": 0.4697, "num_input_tokens_seen": 96128832, "step": 79045 }, { "epoch": 9.90477383786493, "grad_norm": 0.12222656607627869, "learning_rate": 5.950062578127886e-06, "loss": 0.4571, "num_input_tokens_seen": 96135296, "step": 79050 }, { "epoch": 9.905400325773712, "grad_norm": 0.29318761825561523, "learning_rate": 5.949525818827052e-06, "loss": 0.4604, "num_input_tokens_seen": 96140704, "step": 79055 }, { "epoch": 9.906026813682496, "grad_norm": 0.11867483705282211, "learning_rate": 5.9489890481738385e-06, "loss": 0.4612, "num_input_tokens_seen": 96146240, "step": 79060 }, { "epoch": 9.906653301591279, "grad_norm": 0.14107248187065125, "learning_rate": 5.9484522661746634e-06, "loss": 0.4644, "num_input_tokens_seen": 96151776, "step": 79065 }, { "epoch": 9.907279789500063, "grad_norm": 0.11495769768953323, "learning_rate": 5.947915472835945e-06, "loss": 0.4593, "num_input_tokens_seen": 96157760, "step": 79070 }, { "epoch": 9.907906277408847, "grad_norm": 0.1280604749917984, "learning_rate": 5.947378668164097e-06, "loss": 0.4571, "num_input_tokens_seen": 96163744, "step": 79075 }, { "epoch": 9.908532765317629, "grad_norm": 0.14149068295955658, "learning_rate": 5.946841852165543e-06, "loss": 0.463, "num_input_tokens_seen": 96169568, "step": 79080 }, { "epoch": 9.909159253226413, "grad_norm": 0.18088147044181824, "learning_rate": 5.946305024846698e-06, "loss": 0.4661, "num_input_tokens_seen": 96175904, "step": 79085 }, { "epoch": 9.909785741135195, "grad_norm": 0.10694807767868042, "learning_rate": 5.94576818621398e-06, "loss": 0.4556, "num_input_tokens_seen": 96181408, "step": 79090 }, { "epoch": 9.91041222904398, "grad_norm": 0.11450271308422089, "learning_rate": 5.94523133627381e-06, "loss": 0.4593, "num_input_tokens_seen": 96187744, "step": 79095 }, { "epoch": 9.911038716952763, "grad_norm": 0.17157228291034698, "learning_rate": 5.944694475032604e-06, "loss": 0.4691, "num_input_tokens_seen": 96193792, "step": 79100 }, { "epoch": 9.911665204861546, "grad_norm": 0.12959803640842438, "learning_rate": 5.9441576024967805e-06, "loss": 0.4617, "num_input_tokens_seen": 96199936, "step": 79105 }, { "epoch": 9.91229169277033, "grad_norm": 0.16450680792331696, "learning_rate": 5.9436207186727594e-06, "loss": 0.4559, "num_input_tokens_seen": 96205856, "step": 79110 }, { "epoch": 9.912918180679114, "grad_norm": 0.13059058785438538, "learning_rate": 5.94308382356696e-06, "loss": 0.4496, "num_input_tokens_seen": 96211936, "step": 79115 }, { "epoch": 9.913544668587896, "grad_norm": 0.10488156974315643, "learning_rate": 5.942546917185799e-06, "loss": 0.4658, "num_input_tokens_seen": 96218016, "step": 79120 }, { "epoch": 9.91417115649668, "grad_norm": 0.12118891626596451, "learning_rate": 5.942009999535698e-06, "loss": 0.4607, "num_input_tokens_seen": 96224256, "step": 79125 }, { "epoch": 9.914797644405462, "grad_norm": 0.14671757817268372, "learning_rate": 5.941473070623076e-06, "loss": 0.4619, "num_input_tokens_seen": 96230496, "step": 79130 }, { "epoch": 9.915424132314246, "grad_norm": 0.19599470496177673, "learning_rate": 5.940936130454351e-06, "loss": 0.4664, "num_input_tokens_seen": 96236640, "step": 79135 }, { "epoch": 9.91605062022303, "grad_norm": 0.13043102622032166, "learning_rate": 5.940399179035944e-06, "loss": 0.464, "num_input_tokens_seen": 96242400, "step": 79140 }, { "epoch": 9.916677108131813, "grad_norm": 0.10970738530158997, "learning_rate": 5.939862216374275e-06, "loss": 0.4619, "num_input_tokens_seen": 96248736, "step": 79145 }, { "epoch": 9.917303596040597, "grad_norm": 0.09432175010442734, "learning_rate": 5.939325242475762e-06, "loss": 0.4588, "num_input_tokens_seen": 96254944, "step": 79150 }, { "epoch": 9.91793008394938, "grad_norm": 0.11701953411102295, "learning_rate": 5.938788257346826e-06, "loss": 0.4632, "num_input_tokens_seen": 96261152, "step": 79155 }, { "epoch": 9.918556571858163, "grad_norm": 0.14489474892616272, "learning_rate": 5.938251260993888e-06, "loss": 0.4593, "num_input_tokens_seen": 96266816, "step": 79160 }, { "epoch": 9.919183059766947, "grad_norm": 0.15039339661598206, "learning_rate": 5.9377142534233655e-06, "loss": 0.456, "num_input_tokens_seen": 96272672, "step": 79165 }, { "epoch": 9.91980954767573, "grad_norm": 0.14406032860279083, "learning_rate": 5.9371772346416816e-06, "loss": 0.4539, "num_input_tokens_seen": 96278592, "step": 79170 }, { "epoch": 9.920436035584514, "grad_norm": 0.1572146862745285, "learning_rate": 5.936640204655256e-06, "loss": 0.4561, "num_input_tokens_seen": 96284736, "step": 79175 }, { "epoch": 9.921062523493296, "grad_norm": 0.13834913074970245, "learning_rate": 5.936103163470509e-06, "loss": 0.4568, "num_input_tokens_seen": 96291264, "step": 79180 }, { "epoch": 9.92168901140208, "grad_norm": 0.11976205557584763, "learning_rate": 5.935566111093862e-06, "loss": 0.472, "num_input_tokens_seen": 96297696, "step": 79185 }, { "epoch": 9.922315499310864, "grad_norm": 0.14370250701904297, "learning_rate": 5.935029047531735e-06, "loss": 0.4593, "num_input_tokens_seen": 96303328, "step": 79190 }, { "epoch": 9.922941987219646, "grad_norm": 0.13506795465946198, "learning_rate": 5.93449197279055e-06, "loss": 0.4661, "num_input_tokens_seen": 96309504, "step": 79195 }, { "epoch": 9.92356847512843, "grad_norm": 0.13787400722503662, "learning_rate": 5.933954886876729e-06, "loss": 0.4477, "num_input_tokens_seen": 96315456, "step": 79200 }, { "epoch": 9.924194963037213, "grad_norm": 0.13395395874977112, "learning_rate": 5.933417789796692e-06, "loss": 0.4655, "num_input_tokens_seen": 96321888, "step": 79205 }, { "epoch": 9.924821450945997, "grad_norm": 0.19836851954460144, "learning_rate": 5.932880681556858e-06, "loss": 0.4621, "num_input_tokens_seen": 96328032, "step": 79210 }, { "epoch": 9.92544793885478, "grad_norm": 0.23468832671642303, "learning_rate": 5.932343562163654e-06, "loss": 0.4646, "num_input_tokens_seen": 96334240, "step": 79215 }, { "epoch": 9.926074426763563, "grad_norm": 0.15335774421691895, "learning_rate": 5.931806431623498e-06, "loss": 0.462, "num_input_tokens_seen": 96340672, "step": 79220 }, { "epoch": 9.926700914672347, "grad_norm": 0.11789946258068085, "learning_rate": 5.931269289942811e-06, "loss": 0.4608, "num_input_tokens_seen": 96347296, "step": 79225 }, { "epoch": 9.927327402581131, "grad_norm": 0.13996034860610962, "learning_rate": 5.930732137128018e-06, "loss": 0.4625, "num_input_tokens_seen": 96353952, "step": 79230 }, { "epoch": 9.927953890489913, "grad_norm": 0.127371683716774, "learning_rate": 5.930194973185539e-06, "loss": 0.4667, "num_input_tokens_seen": 96360224, "step": 79235 }, { "epoch": 9.928580378398697, "grad_norm": 0.13194814324378967, "learning_rate": 5.929657798121797e-06, "loss": 0.4577, "num_input_tokens_seen": 96365952, "step": 79240 }, { "epoch": 9.92920686630748, "grad_norm": 0.10815182328224182, "learning_rate": 5.929120611943216e-06, "loss": 0.4667, "num_input_tokens_seen": 96372096, "step": 79245 }, { "epoch": 9.929833354216264, "grad_norm": 0.15765349566936493, "learning_rate": 5.928583414656216e-06, "loss": 0.4657, "num_input_tokens_seen": 96378176, "step": 79250 }, { "epoch": 9.930459842125046, "grad_norm": 0.11108497530221939, "learning_rate": 5.928046206267222e-06, "loss": 0.4613, "num_input_tokens_seen": 96384672, "step": 79255 }, { "epoch": 9.93108633003383, "grad_norm": 0.0948370173573494, "learning_rate": 5.927508986782653e-06, "loss": 0.458, "num_input_tokens_seen": 96390688, "step": 79260 }, { "epoch": 9.931712817942614, "grad_norm": 0.19573336839675903, "learning_rate": 5.926971756208936e-06, "loss": 0.4621, "num_input_tokens_seen": 96396992, "step": 79265 }, { "epoch": 9.932339305851396, "grad_norm": 0.12317980825901031, "learning_rate": 5.926434514552492e-06, "loss": 0.4626, "num_input_tokens_seen": 96403360, "step": 79270 }, { "epoch": 9.93296579376018, "grad_norm": 0.12016156315803528, "learning_rate": 5.925897261819743e-06, "loss": 0.457, "num_input_tokens_seen": 96409600, "step": 79275 }, { "epoch": 9.933592281668965, "grad_norm": 0.1477712243795395, "learning_rate": 5.925359998017115e-06, "loss": 0.4769, "num_input_tokens_seen": 96415360, "step": 79280 }, { "epoch": 9.934218769577747, "grad_norm": 0.11920345574617386, "learning_rate": 5.92482272315103e-06, "loss": 0.4603, "num_input_tokens_seen": 96421344, "step": 79285 }, { "epoch": 9.934845257486531, "grad_norm": 0.1521262377500534, "learning_rate": 5.924285437227913e-06, "loss": 0.4668, "num_input_tokens_seen": 96427552, "step": 79290 }, { "epoch": 9.935471745395313, "grad_norm": 0.17683161795139313, "learning_rate": 5.923748140254183e-06, "loss": 0.4667, "num_input_tokens_seen": 96433728, "step": 79295 }, { "epoch": 9.936098233304097, "grad_norm": 0.12330806255340576, "learning_rate": 5.92321083223627e-06, "loss": 0.4635, "num_input_tokens_seen": 96439680, "step": 79300 }, { "epoch": 9.936724721212881, "grad_norm": 0.11158154159784317, "learning_rate": 5.922673513180596e-06, "loss": 0.4537, "num_input_tokens_seen": 96445600, "step": 79305 }, { "epoch": 9.937351209121664, "grad_norm": 0.11200767010450363, "learning_rate": 5.922136183093583e-06, "loss": 0.4538, "num_input_tokens_seen": 96451936, "step": 79310 }, { "epoch": 9.937977697030448, "grad_norm": 0.16879621148109436, "learning_rate": 5.9215988419816575e-06, "loss": 0.4707, "num_input_tokens_seen": 96458304, "step": 79315 }, { "epoch": 9.93860418493923, "grad_norm": 0.12026319652795792, "learning_rate": 5.921061489851242e-06, "loss": 0.453, "num_input_tokens_seen": 96464416, "step": 79320 }, { "epoch": 9.939230672848014, "grad_norm": 0.12433977425098419, "learning_rate": 5.920524126708763e-06, "loss": 0.4642, "num_input_tokens_seen": 96469408, "step": 79325 }, { "epoch": 9.939857160756798, "grad_norm": 0.18259821832180023, "learning_rate": 5.919986752560642e-06, "loss": 0.4672, "num_input_tokens_seen": 96475456, "step": 79330 }, { "epoch": 9.94048364866558, "grad_norm": 0.10847566276788712, "learning_rate": 5.919449367413308e-06, "loss": 0.4613, "num_input_tokens_seen": 96481376, "step": 79335 }, { "epoch": 9.941110136574364, "grad_norm": 0.14803063869476318, "learning_rate": 5.918911971273184e-06, "loss": 0.4707, "num_input_tokens_seen": 96487104, "step": 79340 }, { "epoch": 9.941736624483147, "grad_norm": 0.12444037944078445, "learning_rate": 5.918374564146694e-06, "loss": 0.4586, "num_input_tokens_seen": 96493120, "step": 79345 }, { "epoch": 9.94236311239193, "grad_norm": 0.14899423718452454, "learning_rate": 5.917837146040265e-06, "loss": 0.464, "num_input_tokens_seen": 96499424, "step": 79350 }, { "epoch": 9.942989600300715, "grad_norm": 0.20492389798164368, "learning_rate": 5.91729971696032e-06, "loss": 0.4678, "num_input_tokens_seen": 96505472, "step": 79355 }, { "epoch": 9.943616088209497, "grad_norm": 0.13735423982143402, "learning_rate": 5.916762276913285e-06, "loss": 0.4533, "num_input_tokens_seen": 96511616, "step": 79360 }, { "epoch": 9.944242576118281, "grad_norm": 0.20223961770534515, "learning_rate": 5.9162248259055874e-06, "loss": 0.4657, "num_input_tokens_seen": 96517760, "step": 79365 }, { "epoch": 9.944869064027063, "grad_norm": 0.12732639908790588, "learning_rate": 5.915687363943651e-06, "loss": 0.4584, "num_input_tokens_seen": 96523744, "step": 79370 }, { "epoch": 9.945495551935847, "grad_norm": 0.11075042188167572, "learning_rate": 5.9151498910339035e-06, "loss": 0.4663, "num_input_tokens_seen": 96529920, "step": 79375 }, { "epoch": 9.946122039844632, "grad_norm": 0.15356485545635223, "learning_rate": 5.914612407182769e-06, "loss": 0.4682, "num_input_tokens_seen": 96536032, "step": 79380 }, { "epoch": 9.946748527753414, "grad_norm": 0.21482540667057037, "learning_rate": 5.9140749123966735e-06, "loss": 0.4552, "num_input_tokens_seen": 96542368, "step": 79385 }, { "epoch": 9.947375015662198, "grad_norm": 0.1697356402873993, "learning_rate": 5.913537406682045e-06, "loss": 0.4604, "num_input_tokens_seen": 96548352, "step": 79390 }, { "epoch": 9.948001503570982, "grad_norm": 0.16956982016563416, "learning_rate": 5.912999890045308e-06, "loss": 0.4677, "num_input_tokens_seen": 96553696, "step": 79395 }, { "epoch": 9.948627991479764, "grad_norm": 0.16058658063411713, "learning_rate": 5.91246236249289e-06, "loss": 0.4621, "num_input_tokens_seen": 96559968, "step": 79400 }, { "epoch": 9.949254479388548, "grad_norm": 0.2027519792318344, "learning_rate": 5.911924824031218e-06, "loss": 0.4653, "num_input_tokens_seen": 96566144, "step": 79405 }, { "epoch": 9.94988096729733, "grad_norm": 0.12861758470535278, "learning_rate": 5.9113872746667175e-06, "loss": 0.472, "num_input_tokens_seen": 96571776, "step": 79410 }, { "epoch": 9.950507455206115, "grad_norm": 0.12834258377552032, "learning_rate": 5.910849714405816e-06, "loss": 0.4581, "num_input_tokens_seen": 96577824, "step": 79415 }, { "epoch": 9.951133943114899, "grad_norm": 0.13480520248413086, "learning_rate": 5.91031214325494e-06, "loss": 0.4676, "num_input_tokens_seen": 96583936, "step": 79420 }, { "epoch": 9.951760431023681, "grad_norm": 0.113210029900074, "learning_rate": 5.909774561220516e-06, "loss": 0.462, "num_input_tokens_seen": 96590048, "step": 79425 }, { "epoch": 9.952386918932465, "grad_norm": 0.1529524326324463, "learning_rate": 5.9092369683089734e-06, "loss": 0.464, "num_input_tokens_seen": 96596512, "step": 79430 }, { "epoch": 9.953013406841247, "grad_norm": 0.1524081975221634, "learning_rate": 5.908699364526737e-06, "loss": 0.4511, "num_input_tokens_seen": 96602432, "step": 79435 }, { "epoch": 9.953639894750031, "grad_norm": 0.16956692934036255, "learning_rate": 5.908161749880238e-06, "loss": 0.4705, "num_input_tokens_seen": 96608640, "step": 79440 }, { "epoch": 9.954266382658815, "grad_norm": 0.12232693284749985, "learning_rate": 5.907624124375901e-06, "loss": 0.4564, "num_input_tokens_seen": 96614816, "step": 79445 }, { "epoch": 9.954892870567598, "grad_norm": 0.11624925583600998, "learning_rate": 5.907086488020154e-06, "loss": 0.4625, "num_input_tokens_seen": 96620864, "step": 79450 }, { "epoch": 9.955519358476382, "grad_norm": 0.11615663021802902, "learning_rate": 5.906548840819426e-06, "loss": 0.4629, "num_input_tokens_seen": 96626976, "step": 79455 }, { "epoch": 9.956145846385164, "grad_norm": 0.11138636618852615, "learning_rate": 5.9060111827801444e-06, "loss": 0.4619, "num_input_tokens_seen": 96633216, "step": 79460 }, { "epoch": 9.956772334293948, "grad_norm": 0.11466784030199051, "learning_rate": 5.905473513908737e-06, "loss": 0.4646, "num_input_tokens_seen": 96639744, "step": 79465 }, { "epoch": 9.957398822202732, "grad_norm": 0.16932453215122223, "learning_rate": 5.904935834211632e-06, "loss": 0.4628, "num_input_tokens_seen": 96646112, "step": 79470 }, { "epoch": 9.958025310111514, "grad_norm": 0.1208273395895958, "learning_rate": 5.90439814369526e-06, "loss": 0.4645, "num_input_tokens_seen": 96651840, "step": 79475 }, { "epoch": 9.958651798020298, "grad_norm": 0.17328593134880066, "learning_rate": 5.903860442366045e-06, "loss": 0.4604, "num_input_tokens_seen": 96658176, "step": 79480 }, { "epoch": 9.95927828592908, "grad_norm": 0.10392020642757416, "learning_rate": 5.903322730230421e-06, "loss": 0.4652, "num_input_tokens_seen": 96664512, "step": 79485 }, { "epoch": 9.959904773837865, "grad_norm": 0.13448424637317657, "learning_rate": 5.902785007294812e-06, "loss": 0.4539, "num_input_tokens_seen": 96670592, "step": 79490 }, { "epoch": 9.960531261746649, "grad_norm": 0.0995151698589325, "learning_rate": 5.902247273565651e-06, "loss": 0.469, "num_input_tokens_seen": 96676288, "step": 79495 }, { "epoch": 9.961157749655431, "grad_norm": 0.1233765259385109, "learning_rate": 5.901709529049364e-06, "loss": 0.4587, "num_input_tokens_seen": 96682752, "step": 79500 }, { "epoch": 9.961784237564215, "grad_norm": 0.10339310020208359, "learning_rate": 5.901171773752383e-06, "loss": 0.4708, "num_input_tokens_seen": 96689088, "step": 79505 }, { "epoch": 9.962410725473, "grad_norm": 0.1927872896194458, "learning_rate": 5.900634007681135e-06, "loss": 0.4646, "num_input_tokens_seen": 96695040, "step": 79510 }, { "epoch": 9.963037213381781, "grad_norm": 0.15497493743896484, "learning_rate": 5.90009623084205e-06, "loss": 0.4584, "num_input_tokens_seen": 96701088, "step": 79515 }, { "epoch": 9.963663701290566, "grad_norm": 0.11099689453840256, "learning_rate": 5.89955844324156e-06, "loss": 0.4639, "num_input_tokens_seen": 96707168, "step": 79520 }, { "epoch": 9.964290189199348, "grad_norm": 0.14960943162441254, "learning_rate": 5.89902064488609e-06, "loss": 0.4627, "num_input_tokens_seen": 96713184, "step": 79525 }, { "epoch": 9.964916677108132, "grad_norm": 0.12478921562433243, "learning_rate": 5.898482835782073e-06, "loss": 0.4652, "num_input_tokens_seen": 96719552, "step": 79530 }, { "epoch": 9.965543165016916, "grad_norm": 0.15415924787521362, "learning_rate": 5.897945015935938e-06, "loss": 0.4656, "num_input_tokens_seen": 96725952, "step": 79535 }, { "epoch": 9.966169652925698, "grad_norm": 0.172759547829628, "learning_rate": 5.897407185354116e-06, "loss": 0.4637, "num_input_tokens_seen": 96732320, "step": 79540 }, { "epoch": 9.966796140834482, "grad_norm": 0.0955868735909462, "learning_rate": 5.896869344043034e-06, "loss": 0.4641, "num_input_tokens_seen": 96737888, "step": 79545 }, { "epoch": 9.967422628743265, "grad_norm": 0.16459377110004425, "learning_rate": 5.8963314920091286e-06, "loss": 0.4653, "num_input_tokens_seen": 96743776, "step": 79550 }, { "epoch": 9.968049116652049, "grad_norm": 0.12239333987236023, "learning_rate": 5.8957936292588245e-06, "loss": 0.4623, "num_input_tokens_seen": 96750048, "step": 79555 }, { "epoch": 9.968675604560833, "grad_norm": 0.11091567575931549, "learning_rate": 5.895255755798555e-06, "loss": 0.4643, "num_input_tokens_seen": 96756256, "step": 79560 }, { "epoch": 9.969302092469615, "grad_norm": 0.1691628396511078, "learning_rate": 5.894717871634751e-06, "loss": 0.4622, "num_input_tokens_seen": 96762336, "step": 79565 }, { "epoch": 9.969928580378399, "grad_norm": 0.09274213016033173, "learning_rate": 5.894179976773843e-06, "loss": 0.4718, "num_input_tokens_seen": 96768448, "step": 79570 }, { "epoch": 9.970555068287181, "grad_norm": 0.18475979566574097, "learning_rate": 5.893642071222259e-06, "loss": 0.4543, "num_input_tokens_seen": 96774592, "step": 79575 }, { "epoch": 9.971181556195965, "grad_norm": 0.13382837176322937, "learning_rate": 5.893104154986435e-06, "loss": 0.4576, "num_input_tokens_seen": 96780704, "step": 79580 }, { "epoch": 9.97180804410475, "grad_norm": 0.1294451355934143, "learning_rate": 5.892566228072799e-06, "loss": 0.4526, "num_input_tokens_seen": 96787200, "step": 79585 }, { "epoch": 9.972434532013532, "grad_norm": 0.12057573348283768, "learning_rate": 5.892028290487784e-06, "loss": 0.4648, "num_input_tokens_seen": 96793184, "step": 79590 }, { "epoch": 9.973061019922316, "grad_norm": 0.11399037390947342, "learning_rate": 5.89149034223782e-06, "loss": 0.4607, "num_input_tokens_seen": 96799424, "step": 79595 }, { "epoch": 9.973687507831098, "grad_norm": 0.11772521585226059, "learning_rate": 5.8909523833293406e-06, "loss": 0.4568, "num_input_tokens_seen": 96805664, "step": 79600 }, { "epoch": 9.974313995739882, "grad_norm": 0.12335983663797379, "learning_rate": 5.890414413768775e-06, "loss": 0.4615, "num_input_tokens_seen": 96811744, "step": 79605 }, { "epoch": 9.974940483648666, "grad_norm": 0.10976319015026093, "learning_rate": 5.889876433562556e-06, "loss": 0.4551, "num_input_tokens_seen": 96817952, "step": 79610 }, { "epoch": 9.975566971557448, "grad_norm": 0.11658614873886108, "learning_rate": 5.8893384427171176e-06, "loss": 0.4582, "num_input_tokens_seen": 96824096, "step": 79615 }, { "epoch": 9.976193459466232, "grad_norm": 0.13379186391830444, "learning_rate": 5.88880044123889e-06, "loss": 0.4663, "num_input_tokens_seen": 96830144, "step": 79620 }, { "epoch": 9.976819947375017, "grad_norm": 0.12645068764686584, "learning_rate": 5.888262429134306e-06, "loss": 0.4703, "num_input_tokens_seen": 96835840, "step": 79625 }, { "epoch": 9.977446435283799, "grad_norm": 0.10143423825502396, "learning_rate": 5.887724406409798e-06, "loss": 0.4654, "num_input_tokens_seen": 96841664, "step": 79630 }, { "epoch": 9.978072923192583, "grad_norm": 0.13152450323104858, "learning_rate": 5.8871863730717985e-06, "loss": 0.4655, "num_input_tokens_seen": 96847552, "step": 79635 }, { "epoch": 9.978699411101365, "grad_norm": 0.10243403911590576, "learning_rate": 5.886648329126738e-06, "loss": 0.4649, "num_input_tokens_seen": 96853600, "step": 79640 }, { "epoch": 9.97932589901015, "grad_norm": 0.08397334069013596, "learning_rate": 5.886110274581053e-06, "loss": 0.4567, "num_input_tokens_seen": 96859936, "step": 79645 }, { "epoch": 9.979952386918933, "grad_norm": 0.10787897557020187, "learning_rate": 5.885572209441175e-06, "loss": 0.4604, "num_input_tokens_seen": 96866016, "step": 79650 }, { "epoch": 9.980578874827716, "grad_norm": 0.09867750108242035, "learning_rate": 5.885034133713537e-06, "loss": 0.4566, "num_input_tokens_seen": 96872352, "step": 79655 }, { "epoch": 9.9812053627365, "grad_norm": 0.1289362907409668, "learning_rate": 5.88449604740457e-06, "loss": 0.46, "num_input_tokens_seen": 96878528, "step": 79660 }, { "epoch": 9.981831850645282, "grad_norm": 0.22892606258392334, "learning_rate": 5.88395795052071e-06, "loss": 0.4626, "num_input_tokens_seen": 96884480, "step": 79665 }, { "epoch": 9.982458338554066, "grad_norm": 0.14025817811489105, "learning_rate": 5.883419843068389e-06, "loss": 0.4697, "num_input_tokens_seen": 96890368, "step": 79670 }, { "epoch": 9.98308482646285, "grad_norm": 0.11780724674463272, "learning_rate": 5.882881725054041e-06, "loss": 0.4592, "num_input_tokens_seen": 96896448, "step": 79675 }, { "epoch": 9.983711314371632, "grad_norm": 0.10125869512557983, "learning_rate": 5.882343596484101e-06, "loss": 0.4643, "num_input_tokens_seen": 96902720, "step": 79680 }, { "epoch": 9.984337802280416, "grad_norm": 0.1144460141658783, "learning_rate": 5.881805457365001e-06, "loss": 0.4606, "num_input_tokens_seen": 96908992, "step": 79685 }, { "epoch": 9.984964290189199, "grad_norm": 0.14632298052310944, "learning_rate": 5.881267307703175e-06, "loss": 0.4512, "num_input_tokens_seen": 96914656, "step": 79690 }, { "epoch": 9.985590778097983, "grad_norm": 0.0973903015255928, "learning_rate": 5.880729147505058e-06, "loss": 0.4582, "num_input_tokens_seen": 96919776, "step": 79695 }, { "epoch": 9.986217266006767, "grad_norm": 0.12314402312040329, "learning_rate": 5.880190976777083e-06, "loss": 0.4568, "num_input_tokens_seen": 96926144, "step": 79700 }, { "epoch": 9.986843753915549, "grad_norm": 0.08337530493736267, "learning_rate": 5.8796527955256845e-06, "loss": 0.4678, "num_input_tokens_seen": 96932128, "step": 79705 }, { "epoch": 9.987470241824333, "grad_norm": 0.1330176144838333, "learning_rate": 5.879114603757297e-06, "loss": 0.4685, "num_input_tokens_seen": 96937728, "step": 79710 }, { "epoch": 9.988096729733115, "grad_norm": 0.11629978567361832, "learning_rate": 5.878576401478358e-06, "loss": 0.4605, "num_input_tokens_seen": 96943936, "step": 79715 }, { "epoch": 9.9887232176419, "grad_norm": 0.14674244821071625, "learning_rate": 5.878038188695297e-06, "loss": 0.4624, "num_input_tokens_seen": 96949920, "step": 79720 }, { "epoch": 9.989349705550683, "grad_norm": 0.09560415148735046, "learning_rate": 5.877499965414551e-06, "loss": 0.4622, "num_input_tokens_seen": 96956000, "step": 79725 }, { "epoch": 9.989976193459466, "grad_norm": 0.09435614943504333, "learning_rate": 5.876961731642555e-06, "loss": 0.459, "num_input_tokens_seen": 96962240, "step": 79730 }, { "epoch": 9.99060268136825, "grad_norm": 0.14814354479312897, "learning_rate": 5.876423487385746e-06, "loss": 0.4767, "num_input_tokens_seen": 96968032, "step": 79735 }, { "epoch": 9.991229169277034, "grad_norm": 0.11075053364038467, "learning_rate": 5.875885232650557e-06, "loss": 0.4683, "num_input_tokens_seen": 96973920, "step": 79740 }, { "epoch": 9.991855657185816, "grad_norm": 0.0843844786286354, "learning_rate": 5.875346967443422e-06, "loss": 0.4625, "num_input_tokens_seen": 96979936, "step": 79745 }, { "epoch": 9.9924821450946, "grad_norm": 0.13579100370407104, "learning_rate": 5.874808691770779e-06, "loss": 0.4631, "num_input_tokens_seen": 96985888, "step": 79750 }, { "epoch": 9.993108633003382, "grad_norm": 0.10078809410333633, "learning_rate": 5.874270405639063e-06, "loss": 0.4656, "num_input_tokens_seen": 96991808, "step": 79755 }, { "epoch": 9.993735120912167, "grad_norm": 0.09240742772817612, "learning_rate": 5.873732109054709e-06, "loss": 0.4677, "num_input_tokens_seen": 96998048, "step": 79760 }, { "epoch": 9.994361608820949, "grad_norm": 0.10818523168563843, "learning_rate": 5.873193802024153e-06, "loss": 0.4615, "num_input_tokens_seen": 97004320, "step": 79765 }, { "epoch": 9.994988096729733, "grad_norm": 0.10866111516952515, "learning_rate": 5.872655484553831e-06, "loss": 0.4609, "num_input_tokens_seen": 97010112, "step": 79770 }, { "epoch": 9.995614584638517, "grad_norm": 0.14439743757247925, "learning_rate": 5.872117156650179e-06, "loss": 0.4653, "num_input_tokens_seen": 97015936, "step": 79775 }, { "epoch": 9.9962410725473, "grad_norm": 0.09268664568662643, "learning_rate": 5.871578818319633e-06, "loss": 0.4655, "num_input_tokens_seen": 97021152, "step": 79780 }, { "epoch": 9.996867560456083, "grad_norm": 0.11247362941503525, "learning_rate": 5.8710404695686305e-06, "loss": 0.4575, "num_input_tokens_seen": 97027264, "step": 79785 }, { "epoch": 9.997494048364867, "grad_norm": 0.150517076253891, "learning_rate": 5.870502110403604e-06, "loss": 0.4641, "num_input_tokens_seen": 97033440, "step": 79790 }, { "epoch": 9.99812053627365, "grad_norm": 0.15100066363811493, "learning_rate": 5.869963740830996e-06, "loss": 0.4588, "num_input_tokens_seen": 97039328, "step": 79795 }, { "epoch": 9.998747024182434, "grad_norm": 0.12485470622777939, "learning_rate": 5.869425360857237e-06, "loss": 0.4677, "num_input_tokens_seen": 97045408, "step": 79800 }, { "epoch": 9.999373512091216, "grad_norm": 0.08320264518260956, "learning_rate": 5.86888697048877e-06, "loss": 0.4589, "num_input_tokens_seen": 97051648, "step": 79805 }, { "epoch": 10.0, "grad_norm": 0.15347307920455933, "learning_rate": 5.868348569732028e-06, "loss": 0.4624, "num_input_tokens_seen": 97057728, "step": 79810 }, { "epoch": 10.0, "eval_loss": 0.4628205895423889, "eval_runtime": 222.1576, "eval_samples_per_second": 35.925, "eval_steps_per_second": 8.985, "num_input_tokens_seen": 97057728, "step": 79810 }, { "epoch": 10.000626487908784, "grad_norm": 0.09257537871599197, "learning_rate": 5.867810158593448e-06, "loss": 0.4635, "num_input_tokens_seen": 97064160, "step": 79815 }, { "epoch": 10.001252975817566, "grad_norm": 0.1335054337978363, "learning_rate": 5.867271737079468e-06, "loss": 0.4607, "num_input_tokens_seen": 97070432, "step": 79820 }, { "epoch": 10.00187946372635, "grad_norm": 0.09202348440885544, "learning_rate": 5.866733305196526e-06, "loss": 0.4626, "num_input_tokens_seen": 97076320, "step": 79825 }, { "epoch": 10.002505951635133, "grad_norm": 0.09534911811351776, "learning_rate": 5.866194862951058e-06, "loss": 0.4646, "num_input_tokens_seen": 97082720, "step": 79830 }, { "epoch": 10.003132439543917, "grad_norm": 0.10670604556798935, "learning_rate": 5.8656564103495015e-06, "loss": 0.4653, "num_input_tokens_seen": 97088832, "step": 79835 }, { "epoch": 10.0037589274527, "grad_norm": 0.12998653948307037, "learning_rate": 5.865117947398295e-06, "loss": 0.4548, "num_input_tokens_seen": 97094592, "step": 79840 }, { "epoch": 10.004385415361483, "grad_norm": 0.1393989771604538, "learning_rate": 5.864579474103876e-06, "loss": 0.4611, "num_input_tokens_seen": 97100768, "step": 79845 }, { "epoch": 10.005011903270267, "grad_norm": 0.10129265487194061, "learning_rate": 5.864040990472683e-06, "loss": 0.4693, "num_input_tokens_seen": 97107040, "step": 79850 }, { "epoch": 10.00563839117905, "grad_norm": 0.11752893030643463, "learning_rate": 5.8635024965111535e-06, "loss": 0.4645, "num_input_tokens_seen": 97113152, "step": 79855 }, { "epoch": 10.006264879087833, "grad_norm": 0.06415647268295288, "learning_rate": 5.862963992225725e-06, "loss": 0.4606, "num_input_tokens_seen": 97119296, "step": 79860 }, { "epoch": 10.006891366996618, "grad_norm": 0.12958002090454102, "learning_rate": 5.8624254776228385e-06, "loss": 0.4578, "num_input_tokens_seen": 97125408, "step": 79865 }, { "epoch": 10.0075178549054, "grad_norm": 0.17739054560661316, "learning_rate": 5.861886952708928e-06, "loss": 0.4611, "num_input_tokens_seen": 97131808, "step": 79870 }, { "epoch": 10.008144342814184, "grad_norm": 0.10450982302427292, "learning_rate": 5.861348417490437e-06, "loss": 0.4627, "num_input_tokens_seen": 97138144, "step": 79875 }, { "epoch": 10.008770830722968, "grad_norm": 0.1278863251209259, "learning_rate": 5.8608098719738e-06, "loss": 0.4553, "num_input_tokens_seen": 97144096, "step": 79880 }, { "epoch": 10.00939731863175, "grad_norm": 0.10658751428127289, "learning_rate": 5.860271316165457e-06, "loss": 0.458, "num_input_tokens_seen": 97150368, "step": 79885 }, { "epoch": 10.010023806540534, "grad_norm": 0.09777840971946716, "learning_rate": 5.8597327500718474e-06, "loss": 0.4526, "num_input_tokens_seen": 97156480, "step": 79890 }, { "epoch": 10.010650294449317, "grad_norm": 0.1145118847489357, "learning_rate": 5.859194173699408e-06, "loss": 0.4621, "num_input_tokens_seen": 97162496, "step": 79895 }, { "epoch": 10.0112767823581, "grad_norm": 0.1545085906982422, "learning_rate": 5.858655587054582e-06, "loss": 0.4618, "num_input_tokens_seen": 97168832, "step": 79900 }, { "epoch": 10.011903270266885, "grad_norm": 0.18137003481388092, "learning_rate": 5.858116990143807e-06, "loss": 0.4674, "num_input_tokens_seen": 97174976, "step": 79905 }, { "epoch": 10.012529758175667, "grad_norm": 0.09734059125185013, "learning_rate": 5.857578382973521e-06, "loss": 0.4622, "num_input_tokens_seen": 97180768, "step": 79910 }, { "epoch": 10.013156246084451, "grad_norm": 0.12066088616847992, "learning_rate": 5.857039765550163e-06, "loss": 0.462, "num_input_tokens_seen": 97187040, "step": 79915 }, { "epoch": 10.013782733993233, "grad_norm": 0.10797742754220963, "learning_rate": 5.8565011378801765e-06, "loss": 0.4672, "num_input_tokens_seen": 97193440, "step": 79920 }, { "epoch": 10.014409221902017, "grad_norm": 0.10263258963823318, "learning_rate": 5.855962499969998e-06, "loss": 0.462, "num_input_tokens_seen": 97199424, "step": 79925 }, { "epoch": 10.015035709810801, "grad_norm": 0.12340857833623886, "learning_rate": 5.855423851826067e-06, "loss": 0.4584, "num_input_tokens_seen": 97205792, "step": 79930 }, { "epoch": 10.015662197719584, "grad_norm": 0.1647270917892456, "learning_rate": 5.854885193454827e-06, "loss": 0.4655, "num_input_tokens_seen": 97212064, "step": 79935 }, { "epoch": 10.016288685628368, "grad_norm": 0.13371533155441284, "learning_rate": 5.854346524862714e-06, "loss": 0.4602, "num_input_tokens_seen": 97218144, "step": 79940 }, { "epoch": 10.01691517353715, "grad_norm": 0.10116466879844666, "learning_rate": 5.853807846056171e-06, "loss": 0.4614, "num_input_tokens_seen": 97223968, "step": 79945 }, { "epoch": 10.017541661445934, "grad_norm": 0.1179402694106102, "learning_rate": 5.853269157041635e-06, "loss": 0.4595, "num_input_tokens_seen": 97230144, "step": 79950 }, { "epoch": 10.018168149354718, "grad_norm": 0.15258535742759705, "learning_rate": 5.852730457825552e-06, "loss": 0.4645, "num_input_tokens_seen": 97236256, "step": 79955 }, { "epoch": 10.0187946372635, "grad_norm": 0.13054659962654114, "learning_rate": 5.852191748414357e-06, "loss": 0.4604, "num_input_tokens_seen": 97242464, "step": 79960 }, { "epoch": 10.019421125172284, "grad_norm": 0.16664670407772064, "learning_rate": 5.851653028814495e-06, "loss": 0.4696, "num_input_tokens_seen": 97248512, "step": 79965 }, { "epoch": 10.020047613081067, "grad_norm": 0.12405940145254135, "learning_rate": 5.851114299032404e-06, "loss": 0.4569, "num_input_tokens_seen": 97254816, "step": 79970 }, { "epoch": 10.02067410098985, "grad_norm": 0.1130765900015831, "learning_rate": 5.850575559074527e-06, "loss": 0.4643, "num_input_tokens_seen": 97260832, "step": 79975 }, { "epoch": 10.021300588898635, "grad_norm": 0.13968254625797272, "learning_rate": 5.850036808947303e-06, "loss": 0.4656, "num_input_tokens_seen": 97267296, "step": 79980 }, { "epoch": 10.021927076807417, "grad_norm": 0.2800651788711548, "learning_rate": 5.849498048657174e-06, "loss": 0.464, "num_input_tokens_seen": 97273536, "step": 79985 }, { "epoch": 10.022553564716201, "grad_norm": 0.10008740425109863, "learning_rate": 5.848959278210582e-06, "loss": 0.4576, "num_input_tokens_seen": 97279008, "step": 79990 }, { "epoch": 10.023180052624983, "grad_norm": 0.10948362201452255, "learning_rate": 5.848420497613969e-06, "loss": 0.4682, "num_input_tokens_seen": 97285216, "step": 79995 }, { "epoch": 10.023806540533768, "grad_norm": 0.10298695415258408, "learning_rate": 5.847881706873774e-06, "loss": 0.4589, "num_input_tokens_seen": 97291264, "step": 80000 }, { "epoch": 10.024433028442552, "grad_norm": 0.0852089673280716, "learning_rate": 5.84734290599644e-06, "loss": 0.4605, "num_input_tokens_seen": 97297408, "step": 80005 }, { "epoch": 10.025059516351334, "grad_norm": 0.12411762028932571, "learning_rate": 5.8468040949884115e-06, "loss": 0.46, "num_input_tokens_seen": 97303584, "step": 80010 }, { "epoch": 10.025686004260118, "grad_norm": 0.10773494094610214, "learning_rate": 5.846265273856127e-06, "loss": 0.4544, "num_input_tokens_seen": 97309632, "step": 80015 }, { "epoch": 10.026312492168902, "grad_norm": 0.20048370957374573, "learning_rate": 5.84572644260603e-06, "loss": 0.4667, "num_input_tokens_seen": 97316000, "step": 80020 }, { "epoch": 10.026938980077684, "grad_norm": 0.10065125674009323, "learning_rate": 5.845187601244562e-06, "loss": 0.4684, "num_input_tokens_seen": 97321792, "step": 80025 }, { "epoch": 10.027565467986468, "grad_norm": 0.13702163100242615, "learning_rate": 5.844648749778165e-06, "loss": 0.4561, "num_input_tokens_seen": 97328000, "step": 80030 }, { "epoch": 10.02819195589525, "grad_norm": 0.0980895534157753, "learning_rate": 5.844109888213283e-06, "loss": 0.4591, "num_input_tokens_seen": 97334464, "step": 80035 }, { "epoch": 10.028818443804035, "grad_norm": 0.13105879724025726, "learning_rate": 5.843571016556357e-06, "loss": 0.4534, "num_input_tokens_seen": 97340640, "step": 80040 }, { "epoch": 10.029444931712819, "grad_norm": 0.12448975443840027, "learning_rate": 5.843032134813831e-06, "loss": 0.4645, "num_input_tokens_seen": 97346880, "step": 80045 }, { "epoch": 10.030071419621601, "grad_norm": 0.08525124192237854, "learning_rate": 5.842493242992147e-06, "loss": 0.4572, "num_input_tokens_seen": 97352800, "step": 80050 }, { "epoch": 10.030697907530385, "grad_norm": 0.12743090093135834, "learning_rate": 5.8419543410977455e-06, "loss": 0.4579, "num_input_tokens_seen": 97358784, "step": 80055 }, { "epoch": 10.031324395439167, "grad_norm": 0.12918995320796967, "learning_rate": 5.841415429137074e-06, "loss": 0.4618, "num_input_tokens_seen": 97364352, "step": 80060 }, { "epoch": 10.031950883347951, "grad_norm": 0.13055139780044556, "learning_rate": 5.8408765071165745e-06, "loss": 0.4623, "num_input_tokens_seen": 97370528, "step": 80065 }, { "epoch": 10.032577371256735, "grad_norm": 0.13544650375843048, "learning_rate": 5.840337575042688e-06, "loss": 0.4569, "num_input_tokens_seen": 97376576, "step": 80070 }, { "epoch": 10.033203859165518, "grad_norm": 0.1031821221113205, "learning_rate": 5.83979863292186e-06, "loss": 0.4579, "num_input_tokens_seen": 97382496, "step": 80075 }, { "epoch": 10.033830347074302, "grad_norm": 0.14034976065158844, "learning_rate": 5.839259680760533e-06, "loss": 0.4667, "num_input_tokens_seen": 97388640, "step": 80080 }, { "epoch": 10.034456834983084, "grad_norm": 0.1358722448348999, "learning_rate": 5.838720718565152e-06, "loss": 0.4629, "num_input_tokens_seen": 97394656, "step": 80085 }, { "epoch": 10.035083322891868, "grad_norm": 0.12532123923301697, "learning_rate": 5.838181746342157e-06, "loss": 0.4585, "num_input_tokens_seen": 97400480, "step": 80090 }, { "epoch": 10.035709810800652, "grad_norm": 0.11192255467176437, "learning_rate": 5.837642764097997e-06, "loss": 0.4611, "num_input_tokens_seen": 97406656, "step": 80095 }, { "epoch": 10.036336298709434, "grad_norm": 0.12342799454927444, "learning_rate": 5.837103771839112e-06, "loss": 0.4585, "num_input_tokens_seen": 97412928, "step": 80100 }, { "epoch": 10.036962786618219, "grad_norm": 0.13740375638008118, "learning_rate": 5.8365647695719486e-06, "loss": 0.4635, "num_input_tokens_seen": 97418624, "step": 80105 }, { "epoch": 10.037589274527, "grad_norm": 0.18124903738498688, "learning_rate": 5.836025757302949e-06, "loss": 0.472, "num_input_tokens_seen": 97425216, "step": 80110 }, { "epoch": 10.038215762435785, "grad_norm": 0.11872319877147675, "learning_rate": 5.835486735038559e-06, "loss": 0.4628, "num_input_tokens_seen": 97431392, "step": 80115 }, { "epoch": 10.038842250344569, "grad_norm": 0.09091682732105255, "learning_rate": 5.834947702785223e-06, "loss": 0.4588, "num_input_tokens_seen": 97437824, "step": 80120 }, { "epoch": 10.039468738253351, "grad_norm": 0.14564676582813263, "learning_rate": 5.834408660549386e-06, "loss": 0.4688, "num_input_tokens_seen": 97443808, "step": 80125 }, { "epoch": 10.040095226162135, "grad_norm": 0.11456676572561264, "learning_rate": 5.83386960833749e-06, "loss": 0.4642, "num_input_tokens_seen": 97449824, "step": 80130 }, { "epoch": 10.04072171407092, "grad_norm": 0.12499946355819702, "learning_rate": 5.833330546155983e-06, "loss": 0.454, "num_input_tokens_seen": 97456064, "step": 80135 }, { "epoch": 10.041348201979702, "grad_norm": 0.14614057540893555, "learning_rate": 5.832791474011309e-06, "loss": 0.4725, "num_input_tokens_seen": 97462080, "step": 80140 }, { "epoch": 10.041974689888486, "grad_norm": 0.13375931978225708, "learning_rate": 5.832252391909911e-06, "loss": 0.4638, "num_input_tokens_seen": 97468256, "step": 80145 }, { "epoch": 10.042601177797268, "grad_norm": 0.1283564567565918, "learning_rate": 5.831713299858238e-06, "loss": 0.4577, "num_input_tokens_seen": 97474272, "step": 80150 }, { "epoch": 10.043227665706052, "grad_norm": 0.1433277279138565, "learning_rate": 5.831174197862732e-06, "loss": 0.458, "num_input_tokens_seen": 97480288, "step": 80155 }, { "epoch": 10.043854153614836, "grad_norm": 0.09479624032974243, "learning_rate": 5.8306350859298396e-06, "loss": 0.4616, "num_input_tokens_seen": 97485664, "step": 80160 }, { "epoch": 10.044480641523618, "grad_norm": 0.16596384346485138, "learning_rate": 5.830095964066007e-06, "loss": 0.4669, "num_input_tokens_seen": 97491680, "step": 80165 }, { "epoch": 10.045107129432402, "grad_norm": 0.15280435979366302, "learning_rate": 5.829556832277679e-06, "loss": 0.459, "num_input_tokens_seen": 97497568, "step": 80170 }, { "epoch": 10.045733617341185, "grad_norm": 0.18902352452278137, "learning_rate": 5.829017690571302e-06, "loss": 0.4647, "num_input_tokens_seen": 97503520, "step": 80175 }, { "epoch": 10.046360105249969, "grad_norm": 0.11824004352092743, "learning_rate": 5.82847853895332e-06, "loss": 0.4561, "num_input_tokens_seen": 97509632, "step": 80180 }, { "epoch": 10.046986593158753, "grad_norm": 0.09589160233736038, "learning_rate": 5.827939377430183e-06, "loss": 0.4701, "num_input_tokens_seen": 97515808, "step": 80185 }, { "epoch": 10.047613081067535, "grad_norm": 0.12760135531425476, "learning_rate": 5.8274002060083335e-06, "loss": 0.4687, "num_input_tokens_seen": 97521536, "step": 80190 }, { "epoch": 10.048239568976319, "grad_norm": 0.09178534895181656, "learning_rate": 5.826861024694219e-06, "loss": 0.4632, "num_input_tokens_seen": 97527456, "step": 80195 }, { "epoch": 10.048866056885101, "grad_norm": 0.08601155877113342, "learning_rate": 5.826321833494285e-06, "loss": 0.4696, "num_input_tokens_seen": 97533408, "step": 80200 }, { "epoch": 10.049492544793885, "grad_norm": 0.11551567912101746, "learning_rate": 5.825782632414979e-06, "loss": 0.4615, "num_input_tokens_seen": 97539200, "step": 80205 }, { "epoch": 10.05011903270267, "grad_norm": 0.10625243186950684, "learning_rate": 5.825243421462747e-06, "loss": 0.457, "num_input_tokens_seen": 97545504, "step": 80210 }, { "epoch": 10.050745520611452, "grad_norm": 0.083770252764225, "learning_rate": 5.824704200644038e-06, "loss": 0.4697, "num_input_tokens_seen": 97551648, "step": 80215 }, { "epoch": 10.051372008520236, "grad_norm": 0.14789526164531708, "learning_rate": 5.824164969965296e-06, "loss": 0.4579, "num_input_tokens_seen": 97558016, "step": 80220 }, { "epoch": 10.051998496429018, "grad_norm": 0.11538638919591904, "learning_rate": 5.823625729432967e-06, "loss": 0.4613, "num_input_tokens_seen": 97563840, "step": 80225 }, { "epoch": 10.052624984337802, "grad_norm": 0.10118146985769272, "learning_rate": 5.823086479053501e-06, "loss": 0.4591, "num_input_tokens_seen": 97569600, "step": 80230 }, { "epoch": 10.053251472246586, "grad_norm": 0.10004392266273499, "learning_rate": 5.8225472188333454e-06, "loss": 0.4667, "num_input_tokens_seen": 97575360, "step": 80235 }, { "epoch": 10.053877960155368, "grad_norm": 0.14120742678642273, "learning_rate": 5.822007948778945e-06, "loss": 0.4633, "num_input_tokens_seen": 97581792, "step": 80240 }, { "epoch": 10.054504448064153, "grad_norm": 0.14149895310401917, "learning_rate": 5.821468668896749e-06, "loss": 0.4583, "num_input_tokens_seen": 97587776, "step": 80245 }, { "epoch": 10.055130935972937, "grad_norm": 0.1474224478006363, "learning_rate": 5.820929379193204e-06, "loss": 0.4663, "num_input_tokens_seen": 97593664, "step": 80250 }, { "epoch": 10.055757423881719, "grad_norm": 0.10984595865011215, "learning_rate": 5.8203900796747595e-06, "loss": 0.4612, "num_input_tokens_seen": 97599712, "step": 80255 }, { "epoch": 10.056383911790503, "grad_norm": 0.14514265954494476, "learning_rate": 5.819850770347859e-06, "loss": 0.4551, "num_input_tokens_seen": 97604928, "step": 80260 }, { "epoch": 10.057010399699285, "grad_norm": 0.11791468411684036, "learning_rate": 5.819311451218955e-06, "loss": 0.4622, "num_input_tokens_seen": 97611168, "step": 80265 }, { "epoch": 10.05763688760807, "grad_norm": 0.09997060894966125, "learning_rate": 5.818772122294493e-06, "loss": 0.4652, "num_input_tokens_seen": 97617376, "step": 80270 }, { "epoch": 10.058263375516853, "grad_norm": 0.10363674908876419, "learning_rate": 5.818232783580924e-06, "loss": 0.4632, "num_input_tokens_seen": 97623136, "step": 80275 }, { "epoch": 10.058889863425636, "grad_norm": 0.19507475197315216, "learning_rate": 5.8176934350846925e-06, "loss": 0.461, "num_input_tokens_seen": 97629184, "step": 80280 }, { "epoch": 10.05951635133442, "grad_norm": 0.1095363050699234, "learning_rate": 5.817154076812248e-06, "loss": 0.459, "num_input_tokens_seen": 97635104, "step": 80285 }, { "epoch": 10.060142839243202, "grad_norm": 0.12002885341644287, "learning_rate": 5.816614708770041e-06, "loss": 0.478, "num_input_tokens_seen": 97641280, "step": 80290 }, { "epoch": 10.060769327151986, "grad_norm": 0.11341235041618347, "learning_rate": 5.816075330964518e-06, "loss": 0.4691, "num_input_tokens_seen": 97647456, "step": 80295 }, { "epoch": 10.06139581506077, "grad_norm": 0.16879288852214813, "learning_rate": 5.815535943402127e-06, "loss": 0.4564, "num_input_tokens_seen": 97653312, "step": 80300 }, { "epoch": 10.062022302969552, "grad_norm": 0.281405508518219, "learning_rate": 5.81499654608932e-06, "loss": 0.469, "num_input_tokens_seen": 97659776, "step": 80305 }, { "epoch": 10.062648790878336, "grad_norm": 0.0993417277932167, "learning_rate": 5.814457139032542e-06, "loss": 0.457, "num_input_tokens_seen": 97665696, "step": 80310 }, { "epoch": 10.063275278787119, "grad_norm": 0.1542287915945053, "learning_rate": 5.813917722238246e-06, "loss": 0.4645, "num_input_tokens_seen": 97671200, "step": 80315 }, { "epoch": 10.063901766695903, "grad_norm": 0.10831546038389206, "learning_rate": 5.813378295712879e-06, "loss": 0.4614, "num_input_tokens_seen": 97677760, "step": 80320 }, { "epoch": 10.064528254604687, "grad_norm": 0.15603548288345337, "learning_rate": 5.81283885946289e-06, "loss": 0.462, "num_input_tokens_seen": 97683776, "step": 80325 }, { "epoch": 10.065154742513469, "grad_norm": 0.0704789012670517, "learning_rate": 5.8122994134947305e-06, "loss": 0.4572, "num_input_tokens_seen": 97690080, "step": 80330 }, { "epoch": 10.065781230422253, "grad_norm": 0.10082020610570908, "learning_rate": 5.811759957814848e-06, "loss": 0.4592, "num_input_tokens_seen": 97695968, "step": 80335 }, { "epoch": 10.066407718331035, "grad_norm": 0.12779688835144043, "learning_rate": 5.811220492429692e-06, "loss": 0.4573, "num_input_tokens_seen": 97702048, "step": 80340 }, { "epoch": 10.06703420623982, "grad_norm": 0.13884487748146057, "learning_rate": 5.810681017345713e-06, "loss": 0.4635, "num_input_tokens_seen": 97708160, "step": 80345 }, { "epoch": 10.067660694148604, "grad_norm": 0.09886573255062103, "learning_rate": 5.81014153256936e-06, "loss": 0.4703, "num_input_tokens_seen": 97714336, "step": 80350 }, { "epoch": 10.068287182057386, "grad_norm": 0.10802850127220154, "learning_rate": 5.809602038107085e-06, "loss": 0.4678, "num_input_tokens_seen": 97720064, "step": 80355 }, { "epoch": 10.06891366996617, "grad_norm": 0.08415468037128448, "learning_rate": 5.8090625339653375e-06, "loss": 0.4623, "num_input_tokens_seen": 97725056, "step": 80360 }, { "epoch": 10.069540157874952, "grad_norm": 0.156119704246521, "learning_rate": 5.808523020150566e-06, "loss": 0.4592, "num_input_tokens_seen": 97731200, "step": 80365 }, { "epoch": 10.070166645783736, "grad_norm": 0.12283501774072647, "learning_rate": 5.8079834966692225e-06, "loss": 0.4607, "num_input_tokens_seen": 97737440, "step": 80370 }, { "epoch": 10.07079313369252, "grad_norm": 0.10145720094442368, "learning_rate": 5.807443963527757e-06, "loss": 0.4681, "num_input_tokens_seen": 97743328, "step": 80375 }, { "epoch": 10.071419621601303, "grad_norm": 0.11058741062879562, "learning_rate": 5.806904420732621e-06, "loss": 0.4724, "num_input_tokens_seen": 97748960, "step": 80380 }, { "epoch": 10.072046109510087, "grad_norm": 0.1619212031364441, "learning_rate": 5.806364868290263e-06, "loss": 0.4443, "num_input_tokens_seen": 97754816, "step": 80385 }, { "epoch": 10.07267259741887, "grad_norm": 0.15535248816013336, "learning_rate": 5.805825306207136e-06, "loss": 0.4644, "num_input_tokens_seen": 97761248, "step": 80390 }, { "epoch": 10.073299085327653, "grad_norm": 0.1404953896999359, "learning_rate": 5.8052857344896895e-06, "loss": 0.4557, "num_input_tokens_seen": 97767328, "step": 80395 }, { "epoch": 10.073925573236437, "grad_norm": 0.09664447605609894, "learning_rate": 5.804746153144375e-06, "loss": 0.4706, "num_input_tokens_seen": 97773472, "step": 80400 }, { "epoch": 10.07455206114522, "grad_norm": 0.11188538372516632, "learning_rate": 5.804206562177643e-06, "loss": 0.4596, "num_input_tokens_seen": 97779808, "step": 80405 }, { "epoch": 10.075178549054003, "grad_norm": 0.1338716596364975, "learning_rate": 5.803666961595945e-06, "loss": 0.4596, "num_input_tokens_seen": 97785856, "step": 80410 }, { "epoch": 10.075805036962787, "grad_norm": 0.11748121678829193, "learning_rate": 5.803127351405734e-06, "loss": 0.4671, "num_input_tokens_seen": 97791968, "step": 80415 }, { "epoch": 10.07643152487157, "grad_norm": 0.15388639271259308, "learning_rate": 5.802587731613459e-06, "loss": 0.4623, "num_input_tokens_seen": 97797888, "step": 80420 }, { "epoch": 10.077058012780354, "grad_norm": 0.15965303778648376, "learning_rate": 5.802048102225574e-06, "loss": 0.4594, "num_input_tokens_seen": 97803872, "step": 80425 }, { "epoch": 10.077684500689136, "grad_norm": 0.11156812310218811, "learning_rate": 5.80150846324853e-06, "loss": 0.464, "num_input_tokens_seen": 97810016, "step": 80430 }, { "epoch": 10.07831098859792, "grad_norm": 0.12831242382526398, "learning_rate": 5.800968814688777e-06, "loss": 0.4615, "num_input_tokens_seen": 97815680, "step": 80435 }, { "epoch": 10.078937476506704, "grad_norm": 0.1006847620010376, "learning_rate": 5.80042915655277e-06, "loss": 0.4611, "num_input_tokens_seen": 97821248, "step": 80440 }, { "epoch": 10.079563964415486, "grad_norm": 0.10401667654514313, "learning_rate": 5.799889488846959e-06, "loss": 0.4558, "num_input_tokens_seen": 97827520, "step": 80445 }, { "epoch": 10.08019045232427, "grad_norm": 0.08842587471008301, "learning_rate": 5.799349811577797e-06, "loss": 0.4609, "num_input_tokens_seen": 97833216, "step": 80450 }, { "epoch": 10.080816940233053, "grad_norm": 0.1197509691119194, "learning_rate": 5.798810124751736e-06, "loss": 0.4571, "num_input_tokens_seen": 97839296, "step": 80455 }, { "epoch": 10.081443428141837, "grad_norm": 0.1097845807671547, "learning_rate": 5.798270428375227e-06, "loss": 0.47, "num_input_tokens_seen": 97845792, "step": 80460 }, { "epoch": 10.08206991605062, "grad_norm": 0.13876354694366455, "learning_rate": 5.797730722454724e-06, "loss": 0.4591, "num_input_tokens_seen": 97851744, "step": 80465 }, { "epoch": 10.082696403959403, "grad_norm": 0.13595685362815857, "learning_rate": 5.797191006996681e-06, "loss": 0.4624, "num_input_tokens_seen": 97857472, "step": 80470 }, { "epoch": 10.083322891868187, "grad_norm": 0.14826750755310059, "learning_rate": 5.796651282007547e-06, "loss": 0.4641, "num_input_tokens_seen": 97864064, "step": 80475 }, { "epoch": 10.08394937977697, "grad_norm": 0.12793958187103271, "learning_rate": 5.796111547493779e-06, "loss": 0.4539, "num_input_tokens_seen": 97869696, "step": 80480 }, { "epoch": 10.084575867685754, "grad_norm": 0.15576927363872528, "learning_rate": 5.795571803461828e-06, "loss": 0.4563, "num_input_tokens_seen": 97875936, "step": 80485 }, { "epoch": 10.085202355594538, "grad_norm": 0.08163771778345108, "learning_rate": 5.7950320499181466e-06, "loss": 0.4595, "num_input_tokens_seen": 97882464, "step": 80490 }, { "epoch": 10.08582884350332, "grad_norm": 0.1121562123298645, "learning_rate": 5.794492286869189e-06, "loss": 0.4607, "num_input_tokens_seen": 97888288, "step": 80495 }, { "epoch": 10.086455331412104, "grad_norm": 0.12492264807224274, "learning_rate": 5.793952514321408e-06, "loss": 0.4635, "num_input_tokens_seen": 97894624, "step": 80500 }, { "epoch": 10.087081819320888, "grad_norm": 0.12923264503479004, "learning_rate": 5.793412732281258e-06, "loss": 0.4648, "num_input_tokens_seen": 97900640, "step": 80505 }, { "epoch": 10.08770830722967, "grad_norm": 0.09592145681381226, "learning_rate": 5.792872940755191e-06, "loss": 0.4599, "num_input_tokens_seen": 97906848, "step": 80510 }, { "epoch": 10.088334795138454, "grad_norm": 0.12974371016025543, "learning_rate": 5.792333139749659e-06, "loss": 0.4591, "num_input_tokens_seen": 97913056, "step": 80515 }, { "epoch": 10.088961283047237, "grad_norm": 0.12949016690254211, "learning_rate": 5.791793329271122e-06, "loss": 0.4587, "num_input_tokens_seen": 97919104, "step": 80520 }, { "epoch": 10.08958777095602, "grad_norm": 0.16973459720611572, "learning_rate": 5.791253509326028e-06, "loss": 0.4566, "num_input_tokens_seen": 97925280, "step": 80525 }, { "epoch": 10.090214258864805, "grad_norm": 0.18319955468177795, "learning_rate": 5.790713679920834e-06, "loss": 0.46, "num_input_tokens_seen": 97931456, "step": 80530 }, { "epoch": 10.090840746773587, "grad_norm": 0.1455012708902359, "learning_rate": 5.790173841061992e-06, "loss": 0.4657, "num_input_tokens_seen": 97937536, "step": 80535 }, { "epoch": 10.091467234682371, "grad_norm": 0.12121202796697617, "learning_rate": 5.789633992755958e-06, "loss": 0.4597, "num_input_tokens_seen": 97943680, "step": 80540 }, { "epoch": 10.092093722591153, "grad_norm": 0.10871603339910507, "learning_rate": 5.789094135009187e-06, "loss": 0.4667, "num_input_tokens_seen": 97949728, "step": 80545 }, { "epoch": 10.092720210499937, "grad_norm": 0.1301427036523819, "learning_rate": 5.788554267828131e-06, "loss": 0.4595, "num_input_tokens_seen": 97955936, "step": 80550 }, { "epoch": 10.093346698408721, "grad_norm": 0.10671341419219971, "learning_rate": 5.788014391219246e-06, "loss": 0.4592, "num_input_tokens_seen": 97962304, "step": 80555 }, { "epoch": 10.093973186317504, "grad_norm": 0.235567107796669, "learning_rate": 5.787474505188986e-06, "loss": 0.4611, "num_input_tokens_seen": 97968416, "step": 80560 }, { "epoch": 10.094599674226288, "grad_norm": 0.12919986248016357, "learning_rate": 5.786934609743806e-06, "loss": 0.4673, "num_input_tokens_seen": 97974688, "step": 80565 }, { "epoch": 10.09522616213507, "grad_norm": 0.10317450016736984, "learning_rate": 5.786394704890162e-06, "loss": 0.4658, "num_input_tokens_seen": 97980544, "step": 80570 }, { "epoch": 10.095852650043854, "grad_norm": 0.14388473331928253, "learning_rate": 5.785854790634506e-06, "loss": 0.469, "num_input_tokens_seen": 97986656, "step": 80575 }, { "epoch": 10.096479137952638, "grad_norm": 0.1626594364643097, "learning_rate": 5.785314866983298e-06, "loss": 0.459, "num_input_tokens_seen": 97993088, "step": 80580 }, { "epoch": 10.09710562586142, "grad_norm": 0.209621861577034, "learning_rate": 5.78477493394299e-06, "loss": 0.4637, "num_input_tokens_seen": 97999136, "step": 80585 }, { "epoch": 10.097732113770205, "grad_norm": 0.1470220685005188, "learning_rate": 5.784234991520037e-06, "loss": 0.4413, "num_input_tokens_seen": 98004896, "step": 80590 }, { "epoch": 10.098358601678987, "grad_norm": 0.12088724970817566, "learning_rate": 5.783695039720895e-06, "loss": 0.4631, "num_input_tokens_seen": 98010880, "step": 80595 }, { "epoch": 10.09898508958777, "grad_norm": 0.12968303263187408, "learning_rate": 5.7831550785520205e-06, "loss": 0.464, "num_input_tokens_seen": 98017056, "step": 80600 }, { "epoch": 10.099611577496555, "grad_norm": 0.1402144432067871, "learning_rate": 5.7826151080198685e-06, "loss": 0.4676, "num_input_tokens_seen": 98023392, "step": 80605 }, { "epoch": 10.100238065405337, "grad_norm": 0.1526239514350891, "learning_rate": 5.782075128130895e-06, "loss": 0.4517, "num_input_tokens_seen": 98029760, "step": 80610 }, { "epoch": 10.100864553314121, "grad_norm": 0.1894783079624176, "learning_rate": 5.781535138891556e-06, "loss": 0.4722, "num_input_tokens_seen": 98035872, "step": 80615 }, { "epoch": 10.101491041222904, "grad_norm": 0.13943901658058167, "learning_rate": 5.780995140308306e-06, "loss": 0.4626, "num_input_tokens_seen": 98042240, "step": 80620 }, { "epoch": 10.102117529131688, "grad_norm": 0.15204834938049316, "learning_rate": 5.780455132387604e-06, "loss": 0.4654, "num_input_tokens_seen": 98048512, "step": 80625 }, { "epoch": 10.102744017040472, "grad_norm": 0.15505801141262054, "learning_rate": 5.779915115135906e-06, "loss": 0.4567, "num_input_tokens_seen": 98054816, "step": 80630 }, { "epoch": 10.103370504949254, "grad_norm": 0.13381528854370117, "learning_rate": 5.779375088559665e-06, "loss": 0.4701, "num_input_tokens_seen": 98060448, "step": 80635 }, { "epoch": 10.103996992858038, "grad_norm": 0.2257005125284195, "learning_rate": 5.77883505266534e-06, "loss": 0.4595, "num_input_tokens_seen": 98066592, "step": 80640 }, { "epoch": 10.104623480766822, "grad_norm": 0.22820502519607544, "learning_rate": 5.778295007459387e-06, "loss": 0.464, "num_input_tokens_seen": 98072704, "step": 80645 }, { "epoch": 10.105249968675604, "grad_norm": 0.21838204562664032, "learning_rate": 5.777754952948264e-06, "loss": 0.4653, "num_input_tokens_seen": 98078624, "step": 80650 }, { "epoch": 10.105876456584388, "grad_norm": 0.15569467842578888, "learning_rate": 5.777214889138425e-06, "loss": 0.4591, "num_input_tokens_seen": 98084672, "step": 80655 }, { "epoch": 10.10650294449317, "grad_norm": 0.14560572803020477, "learning_rate": 5.776674816036329e-06, "loss": 0.4653, "num_input_tokens_seen": 98090688, "step": 80660 }, { "epoch": 10.107129432401955, "grad_norm": 0.13769562542438507, "learning_rate": 5.776134733648433e-06, "loss": 0.4574, "num_input_tokens_seen": 98096736, "step": 80665 }, { "epoch": 10.107755920310739, "grad_norm": 0.11991511285305023, "learning_rate": 5.775594641981193e-06, "loss": 0.4525, "num_input_tokens_seen": 98102816, "step": 80670 }, { "epoch": 10.108382408219521, "grad_norm": 0.1260373592376709, "learning_rate": 5.775054541041066e-06, "loss": 0.4583, "num_input_tokens_seen": 98108768, "step": 80675 }, { "epoch": 10.109008896128305, "grad_norm": 0.15524357557296753, "learning_rate": 5.7745144308345125e-06, "loss": 0.4703, "num_input_tokens_seen": 98115040, "step": 80680 }, { "epoch": 10.109635384037087, "grad_norm": 0.19130603969097137, "learning_rate": 5.773974311367987e-06, "loss": 0.4579, "num_input_tokens_seen": 98121280, "step": 80685 }, { "epoch": 10.110261871945871, "grad_norm": 0.15088659524917603, "learning_rate": 5.773434182647949e-06, "loss": 0.4556, "num_input_tokens_seen": 98127104, "step": 80690 }, { "epoch": 10.110888359854656, "grad_norm": 0.11805308610200882, "learning_rate": 5.772894044680854e-06, "loss": 0.4612, "num_input_tokens_seen": 98132992, "step": 80695 }, { "epoch": 10.111514847763438, "grad_norm": 0.16067218780517578, "learning_rate": 5.772353897473161e-06, "loss": 0.4609, "num_input_tokens_seen": 98138976, "step": 80700 }, { "epoch": 10.112141335672222, "grad_norm": 0.10176552832126617, "learning_rate": 5.771813741031328e-06, "loss": 0.4695, "num_input_tokens_seen": 98145344, "step": 80705 }, { "epoch": 10.112767823581004, "grad_norm": 0.16363050043582916, "learning_rate": 5.771273575361812e-06, "loss": 0.4671, "num_input_tokens_seen": 98151584, "step": 80710 }, { "epoch": 10.113394311489788, "grad_norm": 0.14729070663452148, "learning_rate": 5.770733400471074e-06, "loss": 0.4678, "num_input_tokens_seen": 98157664, "step": 80715 }, { "epoch": 10.114020799398572, "grad_norm": 0.1368846893310547, "learning_rate": 5.770193216365567e-06, "loss": 0.4568, "num_input_tokens_seen": 98164032, "step": 80720 }, { "epoch": 10.114647287307355, "grad_norm": 0.2001291662454605, "learning_rate": 5.769653023051754e-06, "loss": 0.4658, "num_input_tokens_seen": 98170272, "step": 80725 }, { "epoch": 10.115273775216139, "grad_norm": 0.11001039296388626, "learning_rate": 5.769112820536091e-06, "loss": 0.4617, "num_input_tokens_seen": 98176096, "step": 80730 }, { "epoch": 10.11590026312492, "grad_norm": 0.27197301387786865, "learning_rate": 5.7685726088250405e-06, "loss": 0.4691, "num_input_tokens_seen": 98182240, "step": 80735 }, { "epoch": 10.116526751033705, "grad_norm": 0.1688300371170044, "learning_rate": 5.768032387925057e-06, "loss": 0.463, "num_input_tokens_seen": 98187904, "step": 80740 }, { "epoch": 10.117153238942489, "grad_norm": 0.12123312801122665, "learning_rate": 5.7674921578426e-06, "loss": 0.4643, "num_input_tokens_seen": 98193824, "step": 80745 }, { "epoch": 10.117779726851271, "grad_norm": 0.10945980995893478, "learning_rate": 5.7669519185841295e-06, "loss": 0.4554, "num_input_tokens_seen": 98199968, "step": 80750 }, { "epoch": 10.118406214760055, "grad_norm": 0.11376136541366577, "learning_rate": 5.766411670156104e-06, "loss": 0.4577, "num_input_tokens_seen": 98206080, "step": 80755 }, { "epoch": 10.11903270266884, "grad_norm": 0.11225099861621857, "learning_rate": 5.765871412564981e-06, "loss": 0.4601, "num_input_tokens_seen": 98212672, "step": 80760 }, { "epoch": 10.119659190577622, "grad_norm": 0.12949052453041077, "learning_rate": 5.765331145817224e-06, "loss": 0.4622, "num_input_tokens_seen": 98218944, "step": 80765 }, { "epoch": 10.120285678486406, "grad_norm": 0.13267527520656586, "learning_rate": 5.7647908699192875e-06, "loss": 0.4608, "num_input_tokens_seen": 98225152, "step": 80770 }, { "epoch": 10.120912166395188, "grad_norm": 0.14812834560871124, "learning_rate": 5.764250584877634e-06, "loss": 0.4617, "num_input_tokens_seen": 98231520, "step": 80775 }, { "epoch": 10.121538654303972, "grad_norm": 0.1583559811115265, "learning_rate": 5.763710290698721e-06, "loss": 0.4628, "num_input_tokens_seen": 98237792, "step": 80780 }, { "epoch": 10.122165142212756, "grad_norm": 0.13013775646686554, "learning_rate": 5.7631699873890115e-06, "loss": 0.4638, "num_input_tokens_seen": 98243840, "step": 80785 }, { "epoch": 10.122791630121538, "grad_norm": 0.0934973731637001, "learning_rate": 5.762629674954962e-06, "loss": 0.4627, "num_input_tokens_seen": 98250080, "step": 80790 }, { "epoch": 10.123418118030322, "grad_norm": 0.1339753419160843, "learning_rate": 5.7620893534030345e-06, "loss": 0.465, "num_input_tokens_seen": 98255840, "step": 80795 }, { "epoch": 10.124044605939105, "grad_norm": 0.09197653084993362, "learning_rate": 5.761549022739688e-06, "loss": 0.459, "num_input_tokens_seen": 98261952, "step": 80800 }, { "epoch": 10.124671093847889, "grad_norm": 0.15204575657844543, "learning_rate": 5.761008682971381e-06, "loss": 0.4631, "num_input_tokens_seen": 98267776, "step": 80805 }, { "epoch": 10.125297581756673, "grad_norm": 0.12450450658798218, "learning_rate": 5.760468334104576e-06, "loss": 0.4559, "num_input_tokens_seen": 98274112, "step": 80810 }, { "epoch": 10.125924069665455, "grad_norm": 0.16092485189437866, "learning_rate": 5.759927976145734e-06, "loss": 0.4569, "num_input_tokens_seen": 98280544, "step": 80815 }, { "epoch": 10.12655055757424, "grad_norm": 0.15095371007919312, "learning_rate": 5.759387609101313e-06, "loss": 0.4571, "num_input_tokens_seen": 98286816, "step": 80820 }, { "epoch": 10.127177045483021, "grad_norm": 0.13911297917366028, "learning_rate": 5.758847232977774e-06, "loss": 0.4598, "num_input_tokens_seen": 98293248, "step": 80825 }, { "epoch": 10.127803533391806, "grad_norm": 0.12061212211847305, "learning_rate": 5.7583068477815805e-06, "loss": 0.4687, "num_input_tokens_seen": 98299424, "step": 80830 }, { "epoch": 10.12843002130059, "grad_norm": 0.16716507077217102, "learning_rate": 5.757766453519189e-06, "loss": 0.4593, "num_input_tokens_seen": 98305568, "step": 80835 }, { "epoch": 10.129056509209372, "grad_norm": 0.09171075373888016, "learning_rate": 5.757226050197065e-06, "loss": 0.4585, "num_input_tokens_seen": 98311360, "step": 80840 }, { "epoch": 10.129682997118156, "grad_norm": 0.12143152207136154, "learning_rate": 5.756685637821663e-06, "loss": 0.4701, "num_input_tokens_seen": 98317472, "step": 80845 }, { "epoch": 10.130309485026938, "grad_norm": 0.10933792591094971, "learning_rate": 5.756145216399451e-06, "loss": 0.4536, "num_input_tokens_seen": 98323680, "step": 80850 }, { "epoch": 10.130935972935722, "grad_norm": 0.12769408524036407, "learning_rate": 5.7556047859368865e-06, "loss": 0.4563, "num_input_tokens_seen": 98329600, "step": 80855 }, { "epoch": 10.131562460844506, "grad_norm": 0.1564548760652542, "learning_rate": 5.755064346440431e-06, "loss": 0.4622, "num_input_tokens_seen": 98335904, "step": 80860 }, { "epoch": 10.132188948753289, "grad_norm": 0.1401178240776062, "learning_rate": 5.7545238979165476e-06, "loss": 0.461, "num_input_tokens_seen": 98342400, "step": 80865 }, { "epoch": 10.132815436662073, "grad_norm": 0.1636965423822403, "learning_rate": 5.753983440371695e-06, "loss": 0.4673, "num_input_tokens_seen": 98348544, "step": 80870 }, { "epoch": 10.133441924570855, "grad_norm": 0.1538587361574173, "learning_rate": 5.753442973812339e-06, "loss": 0.4589, "num_input_tokens_seen": 98354400, "step": 80875 }, { "epoch": 10.134068412479639, "grad_norm": 0.15676051378250122, "learning_rate": 5.752902498244936e-06, "loss": 0.4535, "num_input_tokens_seen": 98360352, "step": 80880 }, { "epoch": 10.134694900388423, "grad_norm": 0.13698093593120575, "learning_rate": 5.752362013675952e-06, "loss": 0.4623, "num_input_tokens_seen": 98366208, "step": 80885 }, { "epoch": 10.135321388297205, "grad_norm": 0.18816010653972626, "learning_rate": 5.751821520111848e-06, "loss": 0.4616, "num_input_tokens_seen": 98372416, "step": 80890 }, { "epoch": 10.13594787620599, "grad_norm": 0.14789080619812012, "learning_rate": 5.751281017559085e-06, "loss": 0.4617, "num_input_tokens_seen": 98378624, "step": 80895 }, { "epoch": 10.136574364114773, "grad_norm": 0.14947855472564697, "learning_rate": 5.750740506024125e-06, "loss": 0.4645, "num_input_tokens_seen": 98384832, "step": 80900 }, { "epoch": 10.137200852023556, "grad_norm": 0.132491797208786, "learning_rate": 5.750199985513431e-06, "loss": 0.4624, "num_input_tokens_seen": 98391200, "step": 80905 }, { "epoch": 10.13782733993234, "grad_norm": 0.13431473076343536, "learning_rate": 5.749659456033468e-06, "loss": 0.4594, "num_input_tokens_seen": 98397344, "step": 80910 }, { "epoch": 10.138453827841122, "grad_norm": 0.1418309062719345, "learning_rate": 5.7491189175906925e-06, "loss": 0.4591, "num_input_tokens_seen": 98403584, "step": 80915 }, { "epoch": 10.139080315749906, "grad_norm": 0.1538437157869339, "learning_rate": 5.748578370191573e-06, "loss": 0.4629, "num_input_tokens_seen": 98409600, "step": 80920 }, { "epoch": 10.13970680365869, "grad_norm": 0.15075084567070007, "learning_rate": 5.748037813842569e-06, "loss": 0.454, "num_input_tokens_seen": 98415744, "step": 80925 }, { "epoch": 10.140333291567472, "grad_norm": 0.1288241446018219, "learning_rate": 5.7474972485501435e-06, "loss": 0.4538, "num_input_tokens_seen": 98421632, "step": 80930 }, { "epoch": 10.140959779476256, "grad_norm": 0.13839662075042725, "learning_rate": 5.746956674320761e-06, "loss": 0.4604, "num_input_tokens_seen": 98427776, "step": 80935 }, { "epoch": 10.141586267385039, "grad_norm": 0.1799909472465515, "learning_rate": 5.746416091160883e-06, "loss": 0.4559, "num_input_tokens_seen": 98433888, "step": 80940 }, { "epoch": 10.142212755293823, "grad_norm": 0.13110767304897308, "learning_rate": 5.745875499076971e-06, "loss": 0.4584, "num_input_tokens_seen": 98440096, "step": 80945 }, { "epoch": 10.142839243202607, "grad_norm": 0.1318800151348114, "learning_rate": 5.745334898075494e-06, "loss": 0.4591, "num_input_tokens_seen": 98445600, "step": 80950 }, { "epoch": 10.14346573111139, "grad_norm": 0.1360437422990799, "learning_rate": 5.744794288162909e-06, "loss": 0.459, "num_input_tokens_seen": 98451680, "step": 80955 }, { "epoch": 10.144092219020173, "grad_norm": 0.2303714156150818, "learning_rate": 5.744253669345684e-06, "loss": 0.4459, "num_input_tokens_seen": 98457376, "step": 80960 }, { "epoch": 10.144718706928955, "grad_norm": 0.11598051339387894, "learning_rate": 5.74371304163028e-06, "loss": 0.461, "num_input_tokens_seen": 98462848, "step": 80965 }, { "epoch": 10.14534519483774, "grad_norm": 0.12584014236927032, "learning_rate": 5.74317240502316e-06, "loss": 0.4647, "num_input_tokens_seen": 98468704, "step": 80970 }, { "epoch": 10.145971682746524, "grad_norm": 0.18354351818561554, "learning_rate": 5.74263175953079e-06, "loss": 0.4584, "num_input_tokens_seen": 98474688, "step": 80975 }, { "epoch": 10.146598170655306, "grad_norm": 0.1902802437543869, "learning_rate": 5.7420911051596325e-06, "loss": 0.4518, "num_input_tokens_seen": 98480864, "step": 80980 }, { "epoch": 10.14722465856409, "grad_norm": 0.24737435579299927, "learning_rate": 5.741550441916152e-06, "loss": 0.4585, "num_input_tokens_seen": 98487392, "step": 80985 }, { "epoch": 10.147851146472872, "grad_norm": 0.22908802330493927, "learning_rate": 5.741009769806813e-06, "loss": 0.458, "num_input_tokens_seen": 98493568, "step": 80990 }, { "epoch": 10.148477634381656, "grad_norm": 0.16628731787204742, "learning_rate": 5.740469088838079e-06, "loss": 0.4674, "num_input_tokens_seen": 98499680, "step": 80995 }, { "epoch": 10.14910412229044, "grad_norm": 0.1610596925020218, "learning_rate": 5.739928399016415e-06, "loss": 0.4753, "num_input_tokens_seen": 98505696, "step": 81000 }, { "epoch": 10.149730610199223, "grad_norm": 0.19031742215156555, "learning_rate": 5.739387700348284e-06, "loss": 0.4461, "num_input_tokens_seen": 98511936, "step": 81005 }, { "epoch": 10.150357098108007, "grad_norm": 0.14967834949493408, "learning_rate": 5.738846992840153e-06, "loss": 0.4646, "num_input_tokens_seen": 98517792, "step": 81010 }, { "epoch": 10.15098358601679, "grad_norm": 0.22930504381656647, "learning_rate": 5.738306276498483e-06, "loss": 0.4626, "num_input_tokens_seen": 98523296, "step": 81015 }, { "epoch": 10.151610073925573, "grad_norm": 0.24969470500946045, "learning_rate": 5.737765551329743e-06, "loss": 0.4686, "num_input_tokens_seen": 98529120, "step": 81020 }, { "epoch": 10.152236561834357, "grad_norm": 0.15704990923404694, "learning_rate": 5.737224817340393e-06, "loss": 0.4646, "num_input_tokens_seen": 98534912, "step": 81025 }, { "epoch": 10.15286304974314, "grad_norm": 0.15559862554073334, "learning_rate": 5.736684074536901e-06, "loss": 0.4479, "num_input_tokens_seen": 98541088, "step": 81030 }, { "epoch": 10.153489537651923, "grad_norm": 0.20347696542739868, "learning_rate": 5.736143322925733e-06, "loss": 0.4792, "num_input_tokens_seen": 98547520, "step": 81035 }, { "epoch": 10.154116025560707, "grad_norm": 0.1843540221452713, "learning_rate": 5.735602562513351e-06, "loss": 0.4643, "num_input_tokens_seen": 98553664, "step": 81040 }, { "epoch": 10.15474251346949, "grad_norm": 0.1984361857175827, "learning_rate": 5.735061793306223e-06, "loss": 0.4584, "num_input_tokens_seen": 98560032, "step": 81045 }, { "epoch": 10.155369001378274, "grad_norm": 0.1606234312057495, "learning_rate": 5.734521015310812e-06, "loss": 0.4585, "num_input_tokens_seen": 98566112, "step": 81050 }, { "epoch": 10.155995489287056, "grad_norm": 0.18916815519332886, "learning_rate": 5.733980228533586e-06, "loss": 0.4645, "num_input_tokens_seen": 98571776, "step": 81055 }, { "epoch": 10.15662197719584, "grad_norm": 0.15866850316524506, "learning_rate": 5.73343943298101e-06, "loss": 0.4585, "num_input_tokens_seen": 98577792, "step": 81060 }, { "epoch": 10.157248465104624, "grad_norm": 0.17581118643283844, "learning_rate": 5.732898628659548e-06, "loss": 0.4606, "num_input_tokens_seen": 98583680, "step": 81065 }, { "epoch": 10.157874953013406, "grad_norm": 0.12093412131071091, "learning_rate": 5.732357815575667e-06, "loss": 0.4594, "num_input_tokens_seen": 98589792, "step": 81070 }, { "epoch": 10.15850144092219, "grad_norm": 0.24604584276676178, "learning_rate": 5.731816993735831e-06, "loss": 0.467, "num_input_tokens_seen": 98595680, "step": 81075 }, { "epoch": 10.159127928830973, "grad_norm": 0.21001797914505005, "learning_rate": 5.7312761631465084e-06, "loss": 0.4649, "num_input_tokens_seen": 98601888, "step": 81080 }, { "epoch": 10.159754416739757, "grad_norm": 0.19518721103668213, "learning_rate": 5.730735323814165e-06, "loss": 0.4614, "num_input_tokens_seen": 98607936, "step": 81085 }, { "epoch": 10.160380904648541, "grad_norm": 0.13087816536426544, "learning_rate": 5.7301944757452655e-06, "loss": 0.456, "num_input_tokens_seen": 98613440, "step": 81090 }, { "epoch": 10.161007392557323, "grad_norm": 0.17777253687381744, "learning_rate": 5.729653618946278e-06, "loss": 0.4573, "num_input_tokens_seen": 98619776, "step": 81095 }, { "epoch": 10.161633880466107, "grad_norm": 0.2850376069545746, "learning_rate": 5.729112753423668e-06, "loss": 0.4728, "num_input_tokens_seen": 98625792, "step": 81100 }, { "epoch": 10.16226036837489, "grad_norm": 0.17857255041599274, "learning_rate": 5.728571879183902e-06, "loss": 0.4596, "num_input_tokens_seen": 98631744, "step": 81105 }, { "epoch": 10.162886856283674, "grad_norm": 0.11501684039831161, "learning_rate": 5.728030996233447e-06, "loss": 0.4709, "num_input_tokens_seen": 98638016, "step": 81110 }, { "epoch": 10.163513344192458, "grad_norm": 0.15863288938999176, "learning_rate": 5.727490104578769e-06, "loss": 0.4599, "num_input_tokens_seen": 98644320, "step": 81115 }, { "epoch": 10.16413983210124, "grad_norm": 0.15783709287643433, "learning_rate": 5.726949204226335e-06, "loss": 0.4584, "num_input_tokens_seen": 98649504, "step": 81120 }, { "epoch": 10.164766320010024, "grad_norm": 0.12956203520298004, "learning_rate": 5.726408295182613e-06, "loss": 0.4607, "num_input_tokens_seen": 98656032, "step": 81125 }, { "epoch": 10.165392807918806, "grad_norm": 0.15289127826690674, "learning_rate": 5.7258673774540675e-06, "loss": 0.4682, "num_input_tokens_seen": 98662144, "step": 81130 }, { "epoch": 10.16601929582759, "grad_norm": 0.172115758061409, "learning_rate": 5.7253264510471674e-06, "loss": 0.4657, "num_input_tokens_seen": 98668448, "step": 81135 }, { "epoch": 10.166645783736374, "grad_norm": 0.12944866716861725, "learning_rate": 5.7247855159683805e-06, "loss": 0.4595, "num_input_tokens_seen": 98674848, "step": 81140 }, { "epoch": 10.167272271645157, "grad_norm": 0.19165970385074615, "learning_rate": 5.724244572224174e-06, "loss": 0.4525, "num_input_tokens_seen": 98681024, "step": 81145 }, { "epoch": 10.16789875955394, "grad_norm": 0.2034262716770172, "learning_rate": 5.7237036198210126e-06, "loss": 0.462, "num_input_tokens_seen": 98687072, "step": 81150 }, { "epoch": 10.168525247462725, "grad_norm": 0.1253749579191208, "learning_rate": 5.723162658765367e-06, "loss": 0.4576, "num_input_tokens_seen": 98693280, "step": 81155 }, { "epoch": 10.169151735371507, "grad_norm": 0.36028915643692017, "learning_rate": 5.722621689063704e-06, "loss": 0.4696, "num_input_tokens_seen": 98699488, "step": 81160 }, { "epoch": 10.169778223280291, "grad_norm": 0.18083304166793823, "learning_rate": 5.7220807107224915e-06, "loss": 0.4615, "num_input_tokens_seen": 98705792, "step": 81165 }, { "epoch": 10.170404711189073, "grad_norm": 0.14570768177509308, "learning_rate": 5.721539723748197e-06, "loss": 0.4616, "num_input_tokens_seen": 98711936, "step": 81170 }, { "epoch": 10.171031199097857, "grad_norm": 0.14520113170146942, "learning_rate": 5.720998728147289e-06, "loss": 0.4694, "num_input_tokens_seen": 98717920, "step": 81175 }, { "epoch": 10.171657687006642, "grad_norm": 0.2901894450187683, "learning_rate": 5.720457723926234e-06, "loss": 0.4704, "num_input_tokens_seen": 98723648, "step": 81180 }, { "epoch": 10.172284174915424, "grad_norm": 0.19532382488250732, "learning_rate": 5.719916711091501e-06, "loss": 0.4662, "num_input_tokens_seen": 98729920, "step": 81185 }, { "epoch": 10.172910662824208, "grad_norm": 0.1434139907360077, "learning_rate": 5.719375689649559e-06, "loss": 0.4632, "num_input_tokens_seen": 98736320, "step": 81190 }, { "epoch": 10.17353715073299, "grad_norm": 0.20636513829231262, "learning_rate": 5.718834659606875e-06, "loss": 0.4548, "num_input_tokens_seen": 98742432, "step": 81195 }, { "epoch": 10.174163638641774, "grad_norm": 0.12201783061027527, "learning_rate": 5.718293620969919e-06, "loss": 0.4579, "num_input_tokens_seen": 98748768, "step": 81200 }, { "epoch": 10.174790126550558, "grad_norm": 0.1788058578968048, "learning_rate": 5.71775257374516e-06, "loss": 0.471, "num_input_tokens_seen": 98755168, "step": 81205 }, { "epoch": 10.17541661445934, "grad_norm": 0.17252139747142792, "learning_rate": 5.717211517939064e-06, "loss": 0.4694, "num_input_tokens_seen": 98761280, "step": 81210 }, { "epoch": 10.176043102368125, "grad_norm": 0.17087674140930176, "learning_rate": 5.716670453558101e-06, "loss": 0.4609, "num_input_tokens_seen": 98767328, "step": 81215 }, { "epoch": 10.176669590276907, "grad_norm": 0.1643868237733841, "learning_rate": 5.716129380608741e-06, "loss": 0.452, "num_input_tokens_seen": 98773632, "step": 81220 }, { "epoch": 10.177296078185691, "grad_norm": 0.2048056274652481, "learning_rate": 5.715588299097452e-06, "loss": 0.4648, "num_input_tokens_seen": 98779776, "step": 81225 }, { "epoch": 10.177922566094475, "grad_norm": 0.1607922464609146, "learning_rate": 5.715047209030704e-06, "loss": 0.4712, "num_input_tokens_seen": 98785888, "step": 81230 }, { "epoch": 10.178549054003257, "grad_norm": 0.1485329568386078, "learning_rate": 5.7145061104149644e-06, "loss": 0.4691, "num_input_tokens_seen": 98791936, "step": 81235 }, { "epoch": 10.179175541912041, "grad_norm": 0.16103671491146088, "learning_rate": 5.713965003256703e-06, "loss": 0.4663, "num_input_tokens_seen": 98798048, "step": 81240 }, { "epoch": 10.179802029820824, "grad_norm": 0.17160284519195557, "learning_rate": 5.71342388756239e-06, "loss": 0.468, "num_input_tokens_seen": 98804512, "step": 81245 }, { "epoch": 10.180428517729608, "grad_norm": 0.1648344248533249, "learning_rate": 5.712882763338497e-06, "loss": 0.4647, "num_input_tokens_seen": 98810464, "step": 81250 }, { "epoch": 10.181055005638392, "grad_norm": 0.17288735508918762, "learning_rate": 5.712341630591488e-06, "loss": 0.4564, "num_input_tokens_seen": 98815904, "step": 81255 }, { "epoch": 10.181681493547174, "grad_norm": 0.20558534562587738, "learning_rate": 5.711800489327838e-06, "loss": 0.4572, "num_input_tokens_seen": 98821920, "step": 81260 }, { "epoch": 10.182307981455958, "grad_norm": 0.13283774256706238, "learning_rate": 5.711259339554014e-06, "loss": 0.4622, "num_input_tokens_seen": 98827808, "step": 81265 }, { "epoch": 10.182934469364742, "grad_norm": 0.14592887461185455, "learning_rate": 5.710718181276487e-06, "loss": 0.4641, "num_input_tokens_seen": 98833120, "step": 81270 }, { "epoch": 10.183560957273524, "grad_norm": 0.1586640328168869, "learning_rate": 5.710177014501726e-06, "loss": 0.4699, "num_input_tokens_seen": 98839168, "step": 81275 }, { "epoch": 10.184187445182308, "grad_norm": 0.15698304772377014, "learning_rate": 5.709635839236201e-06, "loss": 0.4555, "num_input_tokens_seen": 98844864, "step": 81280 }, { "epoch": 10.18481393309109, "grad_norm": 0.15056554973125458, "learning_rate": 5.709094655486385e-06, "loss": 0.4589, "num_input_tokens_seen": 98850816, "step": 81285 }, { "epoch": 10.185440420999875, "grad_norm": 0.11584997922182083, "learning_rate": 5.708553463258745e-06, "loss": 0.4612, "num_input_tokens_seen": 98857056, "step": 81290 }, { "epoch": 10.186066908908659, "grad_norm": 0.18429777026176453, "learning_rate": 5.708012262559751e-06, "loss": 0.4691, "num_input_tokens_seen": 98863296, "step": 81295 }, { "epoch": 10.186693396817441, "grad_norm": 0.1579897403717041, "learning_rate": 5.707471053395877e-06, "loss": 0.4574, "num_input_tokens_seen": 98869760, "step": 81300 }, { "epoch": 10.187319884726225, "grad_norm": 0.12408275902271271, "learning_rate": 5.706929835773591e-06, "loss": 0.4591, "num_input_tokens_seen": 98876160, "step": 81305 }, { "epoch": 10.187946372635007, "grad_norm": 0.12900486588478088, "learning_rate": 5.706388609699366e-06, "loss": 0.4588, "num_input_tokens_seen": 98882272, "step": 81310 }, { "epoch": 10.188572860543792, "grad_norm": 0.1645122468471527, "learning_rate": 5.70584737517967e-06, "loss": 0.4646, "num_input_tokens_seen": 98888512, "step": 81315 }, { "epoch": 10.189199348452576, "grad_norm": 0.14775386452674866, "learning_rate": 5.705306132220976e-06, "loss": 0.4522, "num_input_tokens_seen": 98894816, "step": 81320 }, { "epoch": 10.189825836361358, "grad_norm": 0.13340330123901367, "learning_rate": 5.704764880829754e-06, "loss": 0.4657, "num_input_tokens_seen": 98900544, "step": 81325 }, { "epoch": 10.190452324270142, "grad_norm": 0.15772505104541779, "learning_rate": 5.704223621012475e-06, "loss": 0.4602, "num_input_tokens_seen": 98906560, "step": 81330 }, { "epoch": 10.191078812178924, "grad_norm": 0.17002427577972412, "learning_rate": 5.70368235277561e-06, "loss": 0.4622, "num_input_tokens_seen": 98912736, "step": 81335 }, { "epoch": 10.191705300087708, "grad_norm": 0.12202131748199463, "learning_rate": 5.703141076125631e-06, "loss": 0.4599, "num_input_tokens_seen": 98918688, "step": 81340 }, { "epoch": 10.192331787996492, "grad_norm": 0.15119680762290955, "learning_rate": 5.70259979106901e-06, "loss": 0.4642, "num_input_tokens_seen": 98924864, "step": 81345 }, { "epoch": 10.192958275905275, "grad_norm": 0.18018047511577606, "learning_rate": 5.7020584976122165e-06, "loss": 0.4693, "num_input_tokens_seen": 98930848, "step": 81350 }, { "epoch": 10.193584763814059, "grad_norm": 0.15022745728492737, "learning_rate": 5.701517195761724e-06, "loss": 0.4556, "num_input_tokens_seen": 98936960, "step": 81355 }, { "epoch": 10.194211251722841, "grad_norm": 0.18112637102603912, "learning_rate": 5.700975885524004e-06, "loss": 0.464, "num_input_tokens_seen": 98943392, "step": 81360 }, { "epoch": 10.194837739631625, "grad_norm": 0.13308016955852509, "learning_rate": 5.700434566905528e-06, "loss": 0.462, "num_input_tokens_seen": 98949184, "step": 81365 }, { "epoch": 10.195464227540409, "grad_norm": 0.14930148422718048, "learning_rate": 5.699893239912769e-06, "loss": 0.4576, "num_input_tokens_seen": 98955360, "step": 81370 }, { "epoch": 10.196090715449191, "grad_norm": 0.24482883512973785, "learning_rate": 5.699351904552196e-06, "loss": 0.4599, "num_input_tokens_seen": 98961344, "step": 81375 }, { "epoch": 10.196717203357975, "grad_norm": 0.25079798698425293, "learning_rate": 5.6988105608302835e-06, "loss": 0.4536, "num_input_tokens_seen": 98967072, "step": 81380 }, { "epoch": 10.19734369126676, "grad_norm": 0.23324942588806152, "learning_rate": 5.698269208753504e-06, "loss": 0.4483, "num_input_tokens_seen": 98972672, "step": 81385 }, { "epoch": 10.197970179175542, "grad_norm": 0.17896735668182373, "learning_rate": 5.697727848328327e-06, "loss": 0.4582, "num_input_tokens_seen": 98979104, "step": 81390 }, { "epoch": 10.198596667084326, "grad_norm": 0.6883277893066406, "learning_rate": 5.697186479561229e-06, "loss": 0.4672, "num_input_tokens_seen": 98984960, "step": 81395 }, { "epoch": 10.199223154993108, "grad_norm": 0.16046656668186188, "learning_rate": 5.696645102458679e-06, "loss": 0.4526, "num_input_tokens_seen": 98991136, "step": 81400 }, { "epoch": 10.199849642901892, "grad_norm": 0.15340514481067657, "learning_rate": 5.696103717027152e-06, "loss": 0.4595, "num_input_tokens_seen": 98997120, "step": 81405 }, { "epoch": 10.200476130810676, "grad_norm": 0.23188596963882446, "learning_rate": 5.69556232327312e-06, "loss": 0.461, "num_input_tokens_seen": 99002912, "step": 81410 }, { "epoch": 10.201102618719458, "grad_norm": 0.17394094169139862, "learning_rate": 5.695020921203055e-06, "loss": 0.4613, "num_input_tokens_seen": 99008896, "step": 81415 }, { "epoch": 10.201729106628243, "grad_norm": 0.2200217992067337, "learning_rate": 5.694479510823431e-06, "loss": 0.4714, "num_input_tokens_seen": 99014816, "step": 81420 }, { "epoch": 10.202355594537025, "grad_norm": 0.18571622669696808, "learning_rate": 5.6939380921407205e-06, "loss": 0.4664, "num_input_tokens_seen": 99021024, "step": 81425 }, { "epoch": 10.202982082445809, "grad_norm": 0.15146887302398682, "learning_rate": 5.693396665161397e-06, "loss": 0.4764, "num_input_tokens_seen": 99027392, "step": 81430 }, { "epoch": 10.203608570354593, "grad_norm": 0.17958366870880127, "learning_rate": 5.692855229891933e-06, "loss": 0.4571, "num_input_tokens_seen": 99033376, "step": 81435 }, { "epoch": 10.204235058263375, "grad_norm": 0.14712785184383392, "learning_rate": 5.6923137863388025e-06, "loss": 0.466, "num_input_tokens_seen": 99039264, "step": 81440 }, { "epoch": 10.20486154617216, "grad_norm": 0.16778646409511566, "learning_rate": 5.691772334508477e-06, "loss": 0.4592, "num_input_tokens_seen": 99045280, "step": 81445 }, { "epoch": 10.205488034080942, "grad_norm": 0.19348295032978058, "learning_rate": 5.691230874407433e-06, "loss": 0.459, "num_input_tokens_seen": 99051072, "step": 81450 }, { "epoch": 10.206114521989726, "grad_norm": 0.16589781641960144, "learning_rate": 5.690689406042144e-06, "loss": 0.4666, "num_input_tokens_seen": 99057280, "step": 81455 }, { "epoch": 10.20674100989851, "grad_norm": 0.27878233790397644, "learning_rate": 5.690147929419081e-06, "loss": 0.4695, "num_input_tokens_seen": 99063328, "step": 81460 }, { "epoch": 10.207367497807292, "grad_norm": 0.17553596198558807, "learning_rate": 5.689606444544717e-06, "loss": 0.4689, "num_input_tokens_seen": 99069408, "step": 81465 }, { "epoch": 10.207993985716076, "grad_norm": 0.20603196322917938, "learning_rate": 5.6890649514255315e-06, "loss": 0.4654, "num_input_tokens_seen": 99075040, "step": 81470 }, { "epoch": 10.208620473624858, "grad_norm": 0.16639713943004608, "learning_rate": 5.6885234500679956e-06, "loss": 0.4728, "num_input_tokens_seen": 99081376, "step": 81475 }, { "epoch": 10.209246961533642, "grad_norm": 0.1354067623615265, "learning_rate": 5.687981940478581e-06, "loss": 0.4678, "num_input_tokens_seen": 99087392, "step": 81480 }, { "epoch": 10.209873449442426, "grad_norm": 0.14047178626060486, "learning_rate": 5.687440422663765e-06, "loss": 0.4553, "num_input_tokens_seen": 99093408, "step": 81485 }, { "epoch": 10.210499937351209, "grad_norm": 0.1575699746608734, "learning_rate": 5.68689889663002e-06, "loss": 0.4608, "num_input_tokens_seen": 99099072, "step": 81490 }, { "epoch": 10.211126425259993, "grad_norm": 0.1696086972951889, "learning_rate": 5.6863573623838216e-06, "loss": 0.4604, "num_input_tokens_seen": 99105152, "step": 81495 }, { "epoch": 10.211752913168775, "grad_norm": 0.17556102573871613, "learning_rate": 5.685815819931643e-06, "loss": 0.4628, "num_input_tokens_seen": 99111360, "step": 81500 }, { "epoch": 10.212379401077559, "grad_norm": 0.18352197110652924, "learning_rate": 5.68527426927996e-06, "loss": 0.463, "num_input_tokens_seen": 99117504, "step": 81505 }, { "epoch": 10.213005888986343, "grad_norm": 0.1838856190443039, "learning_rate": 5.684732710435248e-06, "loss": 0.4601, "num_input_tokens_seen": 99123488, "step": 81510 }, { "epoch": 10.213632376895125, "grad_norm": 0.18586492538452148, "learning_rate": 5.68419114340398e-06, "loss": 0.4681, "num_input_tokens_seen": 99129600, "step": 81515 }, { "epoch": 10.21425886480391, "grad_norm": 0.1679207682609558, "learning_rate": 5.683649568192632e-06, "loss": 0.4617, "num_input_tokens_seen": 99135872, "step": 81520 }, { "epoch": 10.214885352712693, "grad_norm": 0.15973001718521118, "learning_rate": 5.683107984807677e-06, "loss": 0.4658, "num_input_tokens_seen": 99142272, "step": 81525 }, { "epoch": 10.215511840621476, "grad_norm": 0.2123514711856842, "learning_rate": 5.682566393255594e-06, "loss": 0.4604, "num_input_tokens_seen": 99147776, "step": 81530 }, { "epoch": 10.21613832853026, "grad_norm": 0.13280843198299408, "learning_rate": 5.682024793542855e-06, "loss": 0.4644, "num_input_tokens_seen": 99153952, "step": 81535 }, { "epoch": 10.216764816439042, "grad_norm": 0.23809464275836945, "learning_rate": 5.6814831856759366e-06, "loss": 0.4711, "num_input_tokens_seen": 99160160, "step": 81540 }, { "epoch": 10.217391304347826, "grad_norm": 0.2950677275657654, "learning_rate": 5.680941569661314e-06, "loss": 0.4653, "num_input_tokens_seen": 99166144, "step": 81545 }, { "epoch": 10.21801779225661, "grad_norm": 0.21005947887897491, "learning_rate": 5.68039994550546e-06, "loss": 0.463, "num_input_tokens_seen": 99172448, "step": 81550 }, { "epoch": 10.218644280165393, "grad_norm": 0.20225274562835693, "learning_rate": 5.6798583132148545e-06, "loss": 0.457, "num_input_tokens_seen": 99177856, "step": 81555 }, { "epoch": 10.219270768074177, "grad_norm": 0.15439890325069427, "learning_rate": 5.679316672795972e-06, "loss": 0.4596, "num_input_tokens_seen": 99184224, "step": 81560 }, { "epoch": 10.219897255982959, "grad_norm": 0.14871224761009216, "learning_rate": 5.678775024255286e-06, "loss": 0.4604, "num_input_tokens_seen": 99190368, "step": 81565 }, { "epoch": 10.220523743891743, "grad_norm": 0.19479018449783325, "learning_rate": 5.6782333675992755e-06, "loss": 0.4518, "num_input_tokens_seen": 99196608, "step": 81570 }, { "epoch": 10.221150231800527, "grad_norm": 0.34347376227378845, "learning_rate": 5.6776917028344135e-06, "loss": 0.4678, "num_input_tokens_seen": 99202848, "step": 81575 }, { "epoch": 10.22177671970931, "grad_norm": 0.21156075596809387, "learning_rate": 5.677150029967178e-06, "loss": 0.4669, "num_input_tokens_seen": 99208832, "step": 81580 }, { "epoch": 10.222403207618093, "grad_norm": 0.20884232223033905, "learning_rate": 5.676608349004043e-06, "loss": 0.4627, "num_input_tokens_seen": 99215072, "step": 81585 }, { "epoch": 10.223029695526876, "grad_norm": 0.14806868135929108, "learning_rate": 5.676066659951489e-06, "loss": 0.4577, "num_input_tokens_seen": 99221472, "step": 81590 }, { "epoch": 10.22365618343566, "grad_norm": 0.15855105221271515, "learning_rate": 5.675524962815988e-06, "loss": 0.4574, "num_input_tokens_seen": 99228256, "step": 81595 }, { "epoch": 10.224282671344444, "grad_norm": 0.17215260863304138, "learning_rate": 5.674983257604019e-06, "loss": 0.456, "num_input_tokens_seen": 99234432, "step": 81600 }, { "epoch": 10.224909159253226, "grad_norm": 0.15427006781101227, "learning_rate": 5.6744415443220565e-06, "loss": 0.4662, "num_input_tokens_seen": 99240832, "step": 81605 }, { "epoch": 10.22553564716201, "grad_norm": 0.1842557042837143, "learning_rate": 5.6738998229765794e-06, "loss": 0.4654, "num_input_tokens_seen": 99246976, "step": 81610 }, { "epoch": 10.226162135070792, "grad_norm": 0.230782151222229, "learning_rate": 5.673358093574063e-06, "loss": 0.4596, "num_input_tokens_seen": 99253024, "step": 81615 }, { "epoch": 10.226788622979576, "grad_norm": 0.13853682577610016, "learning_rate": 5.672816356120985e-06, "loss": 0.4672, "num_input_tokens_seen": 99258720, "step": 81620 }, { "epoch": 10.22741511088836, "grad_norm": 0.1379188895225525, "learning_rate": 5.672274610623821e-06, "loss": 0.4651, "num_input_tokens_seen": 99264960, "step": 81625 }, { "epoch": 10.228041598797143, "grad_norm": 0.3000575006008148, "learning_rate": 5.67173285708905e-06, "loss": 0.4756, "num_input_tokens_seen": 99270944, "step": 81630 }, { "epoch": 10.228668086705927, "grad_norm": 0.1607484668493271, "learning_rate": 5.671191095523147e-06, "loss": 0.4687, "num_input_tokens_seen": 99276928, "step": 81635 }, { "epoch": 10.229294574614709, "grad_norm": 0.3326551616191864, "learning_rate": 5.670649325932589e-06, "loss": 0.4549, "num_input_tokens_seen": 99283520, "step": 81640 }, { "epoch": 10.229921062523493, "grad_norm": 0.1803935319185257, "learning_rate": 5.670107548323856e-06, "loss": 0.4597, "num_input_tokens_seen": 99289664, "step": 81645 }, { "epoch": 10.230547550432277, "grad_norm": 0.13537997007369995, "learning_rate": 5.669565762703423e-06, "loss": 0.4601, "num_input_tokens_seen": 99295584, "step": 81650 }, { "epoch": 10.23117403834106, "grad_norm": 0.2197531908750534, "learning_rate": 5.669023969077769e-06, "loss": 0.467, "num_input_tokens_seen": 99301856, "step": 81655 }, { "epoch": 10.231800526249843, "grad_norm": 0.143987774848938, "learning_rate": 5.6684821674533695e-06, "loss": 0.4581, "num_input_tokens_seen": 99307968, "step": 81660 }, { "epoch": 10.232427014158628, "grad_norm": 0.1591615229845047, "learning_rate": 5.667940357836705e-06, "loss": 0.4638, "num_input_tokens_seen": 99313696, "step": 81665 }, { "epoch": 10.23305350206741, "grad_norm": 0.15612150728702545, "learning_rate": 5.667398540234252e-06, "loss": 0.4562, "num_input_tokens_seen": 99319328, "step": 81670 }, { "epoch": 10.233679989976194, "grad_norm": 0.16762685775756836, "learning_rate": 5.666856714652489e-06, "loss": 0.4569, "num_input_tokens_seen": 99325600, "step": 81675 }, { "epoch": 10.234306477884976, "grad_norm": 0.12356218695640564, "learning_rate": 5.666314881097892e-06, "loss": 0.4516, "num_input_tokens_seen": 99331904, "step": 81680 }, { "epoch": 10.23493296579376, "grad_norm": 0.18360736966133118, "learning_rate": 5.665773039576941e-06, "loss": 0.4551, "num_input_tokens_seen": 99338144, "step": 81685 }, { "epoch": 10.235559453702544, "grad_norm": 0.22127079963684082, "learning_rate": 5.665231190096114e-06, "loss": 0.4559, "num_input_tokens_seen": 99343840, "step": 81690 }, { "epoch": 10.236185941611327, "grad_norm": 0.29552650451660156, "learning_rate": 5.664689332661888e-06, "loss": 0.4681, "num_input_tokens_seen": 99349728, "step": 81695 }, { "epoch": 10.23681242952011, "grad_norm": 0.1399233639240265, "learning_rate": 5.6641474672807415e-06, "loss": 0.4534, "num_input_tokens_seen": 99355680, "step": 81700 }, { "epoch": 10.237438917428893, "grad_norm": 0.16852210462093353, "learning_rate": 5.663605593959155e-06, "loss": 0.4664, "num_input_tokens_seen": 99361728, "step": 81705 }, { "epoch": 10.238065405337677, "grad_norm": 0.17814964056015015, "learning_rate": 5.6630637127036046e-06, "loss": 0.4613, "num_input_tokens_seen": 99367872, "step": 81710 }, { "epoch": 10.238691893246461, "grad_norm": 0.15156961977481842, "learning_rate": 5.66252182352057e-06, "loss": 0.4578, "num_input_tokens_seen": 99373824, "step": 81715 }, { "epoch": 10.239318381155243, "grad_norm": 0.18349862098693848, "learning_rate": 5.661979926416532e-06, "loss": 0.4633, "num_input_tokens_seen": 99380032, "step": 81720 }, { "epoch": 10.239944869064027, "grad_norm": 0.16296212375164032, "learning_rate": 5.661438021397966e-06, "loss": 0.4718, "num_input_tokens_seen": 99386112, "step": 81725 }, { "epoch": 10.24057135697281, "grad_norm": 0.20472313463687897, "learning_rate": 5.660896108471353e-06, "loss": 0.4695, "num_input_tokens_seen": 99391904, "step": 81730 }, { "epoch": 10.241197844881594, "grad_norm": 0.15145279467105865, "learning_rate": 5.660354187643171e-06, "loss": 0.4604, "num_input_tokens_seen": 99398336, "step": 81735 }, { "epoch": 10.241824332790378, "grad_norm": 0.343216210603714, "learning_rate": 5.659812258919899e-06, "loss": 0.4584, "num_input_tokens_seen": 99404544, "step": 81740 }, { "epoch": 10.24245082069916, "grad_norm": 0.2430199682712555, "learning_rate": 5.659270322308018e-06, "loss": 0.4523, "num_input_tokens_seen": 99411008, "step": 81745 }, { "epoch": 10.243077308607944, "grad_norm": 0.1774362474679947, "learning_rate": 5.658728377814004e-06, "loss": 0.4613, "num_input_tokens_seen": 99416960, "step": 81750 }, { "epoch": 10.243703796516726, "grad_norm": 0.19930702447891235, "learning_rate": 5.65818642544434e-06, "loss": 0.4613, "num_input_tokens_seen": 99423232, "step": 81755 }, { "epoch": 10.24433028442551, "grad_norm": 0.1693626046180725, "learning_rate": 5.6576444652055026e-06, "loss": 0.4668, "num_input_tokens_seen": 99429632, "step": 81760 }, { "epoch": 10.244956772334294, "grad_norm": 0.14948832988739014, "learning_rate": 5.657102497103975e-06, "loss": 0.4537, "num_input_tokens_seen": 99435712, "step": 81765 }, { "epoch": 10.245583260243077, "grad_norm": 0.21339820325374603, "learning_rate": 5.656560521146233e-06, "loss": 0.4545, "num_input_tokens_seen": 99441344, "step": 81770 }, { "epoch": 10.24620974815186, "grad_norm": 0.15586349368095398, "learning_rate": 5.656018537338757e-06, "loss": 0.4613, "num_input_tokens_seen": 99447584, "step": 81775 }, { "epoch": 10.246836236060645, "grad_norm": 0.14001527428627014, "learning_rate": 5.655476545688029e-06, "loss": 0.4484, "num_input_tokens_seen": 99453568, "step": 81780 }, { "epoch": 10.247462723969427, "grad_norm": 0.13308891654014587, "learning_rate": 5.654934546200528e-06, "loss": 0.463, "num_input_tokens_seen": 99459328, "step": 81785 }, { "epoch": 10.248089211878211, "grad_norm": 0.18008741736412048, "learning_rate": 5.654392538882734e-06, "loss": 0.4692, "num_input_tokens_seen": 99465120, "step": 81790 }, { "epoch": 10.248715699786993, "grad_norm": 0.21305297315120697, "learning_rate": 5.6538505237411254e-06, "loss": 0.461, "num_input_tokens_seen": 99471136, "step": 81795 }, { "epoch": 10.249342187695778, "grad_norm": 0.218033105134964, "learning_rate": 5.653308500782186e-06, "loss": 0.4596, "num_input_tokens_seen": 99477280, "step": 81800 }, { "epoch": 10.249968675604562, "grad_norm": 0.15727801620960236, "learning_rate": 5.652766470012394e-06, "loss": 0.4616, "num_input_tokens_seen": 99483424, "step": 81805 }, { "epoch": 10.250595163513344, "grad_norm": 0.15055882930755615, "learning_rate": 5.652224431438228e-06, "loss": 0.4662, "num_input_tokens_seen": 99489792, "step": 81810 }, { "epoch": 10.251221651422128, "grad_norm": 0.15405990183353424, "learning_rate": 5.651682385066173e-06, "loss": 0.4686, "num_input_tokens_seen": 99496000, "step": 81815 }, { "epoch": 10.25184813933091, "grad_norm": 0.1753629893064499, "learning_rate": 5.651140330902705e-06, "loss": 0.4541, "num_input_tokens_seen": 99502080, "step": 81820 }, { "epoch": 10.252474627239694, "grad_norm": 0.26400256156921387, "learning_rate": 5.6505982689543095e-06, "loss": 0.4704, "num_input_tokens_seen": 99508160, "step": 81825 }, { "epoch": 10.253101115148478, "grad_norm": 0.13893954455852509, "learning_rate": 5.650056199227463e-06, "loss": 0.4582, "num_input_tokens_seen": 99514016, "step": 81830 }, { "epoch": 10.25372760305726, "grad_norm": 0.14367036521434784, "learning_rate": 5.649514121728648e-06, "loss": 0.4594, "num_input_tokens_seen": 99520320, "step": 81835 }, { "epoch": 10.254354090966045, "grad_norm": 0.1978234201669693, "learning_rate": 5.648972036464346e-06, "loss": 0.4532, "num_input_tokens_seen": 99526432, "step": 81840 }, { "epoch": 10.254980578874827, "grad_norm": 0.1693984568119049, "learning_rate": 5.648429943441037e-06, "loss": 0.456, "num_input_tokens_seen": 99532544, "step": 81845 }, { "epoch": 10.255607066783611, "grad_norm": 0.12392254173755646, "learning_rate": 5.647887842665204e-06, "loss": 0.4687, "num_input_tokens_seen": 99538816, "step": 81850 }, { "epoch": 10.256233554692395, "grad_norm": 0.19832947850227356, "learning_rate": 5.647345734143327e-06, "loss": 0.4659, "num_input_tokens_seen": 99544928, "step": 81855 }, { "epoch": 10.256860042601177, "grad_norm": 0.11098865419626236, "learning_rate": 5.646803617881886e-06, "loss": 0.4601, "num_input_tokens_seen": 99551072, "step": 81860 }, { "epoch": 10.257486530509961, "grad_norm": 0.21100085973739624, "learning_rate": 5.646261493887366e-06, "loss": 0.4646, "num_input_tokens_seen": 99556992, "step": 81865 }, { "epoch": 10.258113018418744, "grad_norm": 0.14884249866008759, "learning_rate": 5.645719362166245e-06, "loss": 0.4546, "num_input_tokens_seen": 99563296, "step": 81870 }, { "epoch": 10.258739506327528, "grad_norm": 0.13916678726673126, "learning_rate": 5.645177222725008e-06, "loss": 0.4496, "num_input_tokens_seen": 99569600, "step": 81875 }, { "epoch": 10.259365994236312, "grad_norm": 0.1336381882429123, "learning_rate": 5.644635075570133e-06, "loss": 0.4671, "num_input_tokens_seen": 99575584, "step": 81880 }, { "epoch": 10.259992482145094, "grad_norm": 0.18047718703746796, "learning_rate": 5.644092920708104e-06, "loss": 0.4604, "num_input_tokens_seen": 99582080, "step": 81885 }, { "epoch": 10.260618970053878, "grad_norm": 0.10824429243803024, "learning_rate": 5.643550758145404e-06, "loss": 0.4613, "num_input_tokens_seen": 99588160, "step": 81890 }, { "epoch": 10.261245457962662, "grad_norm": 0.13230739533901215, "learning_rate": 5.643008587888511e-06, "loss": 0.4591, "num_input_tokens_seen": 99594272, "step": 81895 }, { "epoch": 10.261871945871444, "grad_norm": 0.17316783964633942, "learning_rate": 5.642466409943912e-06, "loss": 0.452, "num_input_tokens_seen": 99600576, "step": 81900 }, { "epoch": 10.262498433780229, "grad_norm": 0.13611608743667603, "learning_rate": 5.641924224318086e-06, "loss": 0.4596, "num_input_tokens_seen": 99606752, "step": 81905 }, { "epoch": 10.26312492168901, "grad_norm": 0.20762966573238373, "learning_rate": 5.641382031017517e-06, "loss": 0.4596, "num_input_tokens_seen": 99612928, "step": 81910 }, { "epoch": 10.263751409597795, "grad_norm": 0.16511909663677216, "learning_rate": 5.640839830048684e-06, "loss": 0.465, "num_input_tokens_seen": 99619200, "step": 81915 }, { "epoch": 10.264377897506579, "grad_norm": 0.1394723355770111, "learning_rate": 5.640297621418075e-06, "loss": 0.4602, "num_input_tokens_seen": 99625440, "step": 81920 }, { "epoch": 10.265004385415361, "grad_norm": 0.14134332537651062, "learning_rate": 5.639755405132169e-06, "loss": 0.4543, "num_input_tokens_seen": 99631072, "step": 81925 }, { "epoch": 10.265630873324145, "grad_norm": 0.1196536123752594, "learning_rate": 5.639213181197448e-06, "loss": 0.4514, "num_input_tokens_seen": 99637376, "step": 81930 }, { "epoch": 10.266257361232928, "grad_norm": 0.1322251260280609, "learning_rate": 5.638670949620398e-06, "loss": 0.4598, "num_input_tokens_seen": 99643552, "step": 81935 }, { "epoch": 10.266883849141712, "grad_norm": 0.1856100708246231, "learning_rate": 5.638128710407499e-06, "loss": 0.4633, "num_input_tokens_seen": 99649728, "step": 81940 }, { "epoch": 10.267510337050496, "grad_norm": 0.1529194563627243, "learning_rate": 5.637586463565234e-06, "loss": 0.4661, "num_input_tokens_seen": 99655808, "step": 81945 }, { "epoch": 10.268136824959278, "grad_norm": 0.18507790565490723, "learning_rate": 5.637044209100087e-06, "loss": 0.4615, "num_input_tokens_seen": 99661792, "step": 81950 }, { "epoch": 10.268763312868062, "grad_norm": 0.1593218445777893, "learning_rate": 5.636501947018541e-06, "loss": 0.4635, "num_input_tokens_seen": 99667680, "step": 81955 }, { "epoch": 10.269389800776844, "grad_norm": 0.17236727476119995, "learning_rate": 5.6359596773270785e-06, "loss": 0.4625, "num_input_tokens_seen": 99674368, "step": 81960 }, { "epoch": 10.270016288685628, "grad_norm": 0.11849556863307953, "learning_rate": 5.635417400032185e-06, "loss": 0.4575, "num_input_tokens_seen": 99680480, "step": 81965 }, { "epoch": 10.270642776594412, "grad_norm": 0.1560692936182022, "learning_rate": 5.63487511514034e-06, "loss": 0.4704, "num_input_tokens_seen": 99686400, "step": 81970 }, { "epoch": 10.271269264503195, "grad_norm": 0.17191897332668304, "learning_rate": 5.6343328226580305e-06, "loss": 0.454, "num_input_tokens_seen": 99692544, "step": 81975 }, { "epoch": 10.271895752411979, "grad_norm": 0.17712467908859253, "learning_rate": 5.633790522591739e-06, "loss": 0.4505, "num_input_tokens_seen": 99699072, "step": 81980 }, { "epoch": 10.272522240320761, "grad_norm": 0.1516207456588745, "learning_rate": 5.633248214947949e-06, "loss": 0.4629, "num_input_tokens_seen": 99705408, "step": 81985 }, { "epoch": 10.273148728229545, "grad_norm": 0.11970231682062149, "learning_rate": 5.632705899733144e-06, "loss": 0.4612, "num_input_tokens_seen": 99711136, "step": 81990 }, { "epoch": 10.27377521613833, "grad_norm": 0.19095627963542938, "learning_rate": 5.632163576953808e-06, "loss": 0.4598, "num_input_tokens_seen": 99717280, "step": 81995 }, { "epoch": 10.274401704047111, "grad_norm": 0.18447908759117126, "learning_rate": 5.6316212466164255e-06, "loss": 0.4721, "num_input_tokens_seen": 99723424, "step": 82000 }, { "epoch": 10.275028191955895, "grad_norm": 0.11858983337879181, "learning_rate": 5.63107890872748e-06, "loss": 0.4619, "num_input_tokens_seen": 99729856, "step": 82005 }, { "epoch": 10.27565467986468, "grad_norm": 0.1524181365966797, "learning_rate": 5.630536563293453e-06, "loss": 0.4553, "num_input_tokens_seen": 99735776, "step": 82010 }, { "epoch": 10.276281167773462, "grad_norm": 0.1258469521999359, "learning_rate": 5.6299942103208335e-06, "loss": 0.4647, "num_input_tokens_seen": 99741792, "step": 82015 }, { "epoch": 10.276907655682246, "grad_norm": 0.23235242068767548, "learning_rate": 5.629451849816102e-06, "loss": 0.4633, "num_input_tokens_seen": 99747744, "step": 82020 }, { "epoch": 10.277534143591028, "grad_norm": 0.14681419730186462, "learning_rate": 5.628909481785746e-06, "loss": 0.461, "num_input_tokens_seen": 99753920, "step": 82025 }, { "epoch": 10.278160631499812, "grad_norm": 0.14088471233844757, "learning_rate": 5.628367106236249e-06, "loss": 0.4782, "num_input_tokens_seen": 99760064, "step": 82030 }, { "epoch": 10.278787119408596, "grad_norm": 0.17247389256954193, "learning_rate": 5.627824723174094e-06, "loss": 0.4578, "num_input_tokens_seen": 99765920, "step": 82035 }, { "epoch": 10.279413607317379, "grad_norm": 0.13199107348918915, "learning_rate": 5.627282332605766e-06, "loss": 0.4683, "num_input_tokens_seen": 99771840, "step": 82040 }, { "epoch": 10.280040095226163, "grad_norm": 0.1434522271156311, "learning_rate": 5.626739934537752e-06, "loss": 0.4623, "num_input_tokens_seen": 99778176, "step": 82045 }, { "epoch": 10.280666583134945, "grad_norm": 0.16128095984458923, "learning_rate": 5.626197528976533e-06, "loss": 0.4592, "num_input_tokens_seen": 99783904, "step": 82050 }, { "epoch": 10.281293071043729, "grad_norm": 0.1654532253742218, "learning_rate": 5.625655115928598e-06, "loss": 0.459, "num_input_tokens_seen": 99789920, "step": 82055 }, { "epoch": 10.281919558952513, "grad_norm": 0.13410726189613342, "learning_rate": 5.625112695400429e-06, "loss": 0.4562, "num_input_tokens_seen": 99796096, "step": 82060 }, { "epoch": 10.282546046861295, "grad_norm": 0.10652923583984375, "learning_rate": 5.624570267398511e-06, "loss": 0.4553, "num_input_tokens_seen": 99802528, "step": 82065 }, { "epoch": 10.28317253477008, "grad_norm": 0.14350105822086334, "learning_rate": 5.624027831929333e-06, "loss": 0.4633, "num_input_tokens_seen": 99808576, "step": 82070 }, { "epoch": 10.283799022678862, "grad_norm": 0.17109334468841553, "learning_rate": 5.623485388999376e-06, "loss": 0.4574, "num_input_tokens_seen": 99814624, "step": 82075 }, { "epoch": 10.284425510587646, "grad_norm": 0.16222591698169708, "learning_rate": 5.622942938615126e-06, "loss": 0.4572, "num_input_tokens_seen": 99820864, "step": 82080 }, { "epoch": 10.28505199849643, "grad_norm": 0.14464648067951202, "learning_rate": 5.622400480783072e-06, "loss": 0.4455, "num_input_tokens_seen": 99826688, "step": 82085 }, { "epoch": 10.285678486405212, "grad_norm": 0.17151695489883423, "learning_rate": 5.621858015509695e-06, "loss": 0.4647, "num_input_tokens_seen": 99833056, "step": 82090 }, { "epoch": 10.286304974313996, "grad_norm": 0.18811897933483124, "learning_rate": 5.6213155428014835e-06, "loss": 0.4587, "num_input_tokens_seen": 99839136, "step": 82095 }, { "epoch": 10.286931462222778, "grad_norm": 0.12208069115877151, "learning_rate": 5.620773062664921e-06, "loss": 0.463, "num_input_tokens_seen": 99845280, "step": 82100 }, { "epoch": 10.287557950131562, "grad_norm": 0.23182030022144318, "learning_rate": 5.620230575106497e-06, "loss": 0.4451, "num_input_tokens_seen": 99851328, "step": 82105 }, { "epoch": 10.288184438040346, "grad_norm": 0.164457768201828, "learning_rate": 5.619688080132693e-06, "loss": 0.4464, "num_input_tokens_seen": 99857600, "step": 82110 }, { "epoch": 10.288810925949129, "grad_norm": 0.1669975072145462, "learning_rate": 5.619145577749998e-06, "loss": 0.4582, "num_input_tokens_seen": 99863520, "step": 82115 }, { "epoch": 10.289437413857913, "grad_norm": 0.14717982709407806, "learning_rate": 5.618603067964896e-06, "loss": 0.4538, "num_input_tokens_seen": 99869504, "step": 82120 }, { "epoch": 10.290063901766695, "grad_norm": 0.15421639382839203, "learning_rate": 5.6180605507838755e-06, "loss": 0.4616, "num_input_tokens_seen": 99875616, "step": 82125 }, { "epoch": 10.290690389675479, "grad_norm": 0.16979703307151794, "learning_rate": 5.617518026213421e-06, "loss": 0.4622, "num_input_tokens_seen": 99881824, "step": 82130 }, { "epoch": 10.291316877584263, "grad_norm": 0.13865503668785095, "learning_rate": 5.61697549426002e-06, "loss": 0.4882, "num_input_tokens_seen": 99887424, "step": 82135 }, { "epoch": 10.291943365493045, "grad_norm": 0.1445314884185791, "learning_rate": 5.616432954930157e-06, "loss": 0.4665, "num_input_tokens_seen": 99893728, "step": 82140 }, { "epoch": 10.29256985340183, "grad_norm": 0.16426709294319153, "learning_rate": 5.61589040823032e-06, "loss": 0.4746, "num_input_tokens_seen": 99899616, "step": 82145 }, { "epoch": 10.293196341310614, "grad_norm": 0.13434824347496033, "learning_rate": 5.6153478541669955e-06, "loss": 0.4618, "num_input_tokens_seen": 99905952, "step": 82150 }, { "epoch": 10.293822829219396, "grad_norm": 0.1359933316707611, "learning_rate": 5.61480529274667e-06, "loss": 0.4645, "num_input_tokens_seen": 99912128, "step": 82155 }, { "epoch": 10.29444931712818, "grad_norm": 0.1136004775762558, "learning_rate": 5.61426272397583e-06, "loss": 0.4547, "num_input_tokens_seen": 99918784, "step": 82160 }, { "epoch": 10.295075805036962, "grad_norm": 0.18065601587295532, "learning_rate": 5.613720147860964e-06, "loss": 0.4712, "num_input_tokens_seen": 99924672, "step": 82165 }, { "epoch": 10.295702292945746, "grad_norm": 0.13614791631698608, "learning_rate": 5.613177564408556e-06, "loss": 0.4648, "num_input_tokens_seen": 99930528, "step": 82170 }, { "epoch": 10.29632878085453, "grad_norm": 0.14973048865795135, "learning_rate": 5.612634973625096e-06, "loss": 0.4523, "num_input_tokens_seen": 99936576, "step": 82175 }, { "epoch": 10.296955268763313, "grad_norm": 0.168919637799263, "learning_rate": 5.6120923755170696e-06, "loss": 0.4663, "num_input_tokens_seen": 99942560, "step": 82180 }, { "epoch": 10.297581756672097, "grad_norm": 0.1478019505739212, "learning_rate": 5.611549770090964e-06, "loss": 0.4613, "num_input_tokens_seen": 99948224, "step": 82185 }, { "epoch": 10.298208244580879, "grad_norm": 0.15594668686389923, "learning_rate": 5.611007157353267e-06, "loss": 0.4539, "num_input_tokens_seen": 99954048, "step": 82190 }, { "epoch": 10.298834732489663, "grad_norm": 0.12200428545475006, "learning_rate": 5.610464537310465e-06, "loss": 0.4765, "num_input_tokens_seen": 99960192, "step": 82195 }, { "epoch": 10.299461220398447, "grad_norm": 0.12174367904663086, "learning_rate": 5.609921909969047e-06, "loss": 0.4666, "num_input_tokens_seen": 99966240, "step": 82200 }, { "epoch": 10.30008770830723, "grad_norm": 0.12775403261184692, "learning_rate": 5.609379275335499e-06, "loss": 0.4577, "num_input_tokens_seen": 99972384, "step": 82205 }, { "epoch": 10.300714196216013, "grad_norm": 0.1457998901605606, "learning_rate": 5.6088366334163105e-06, "loss": 0.4668, "num_input_tokens_seen": 99978688, "step": 82210 }, { "epoch": 10.301340684124796, "grad_norm": 0.15003608167171478, "learning_rate": 5.608293984217967e-06, "loss": 0.4626, "num_input_tokens_seen": 99984672, "step": 82215 }, { "epoch": 10.30196717203358, "grad_norm": 0.13383179903030396, "learning_rate": 5.607751327746959e-06, "loss": 0.4597, "num_input_tokens_seen": 99990752, "step": 82220 }, { "epoch": 10.302593659942364, "grad_norm": 0.1925022304058075, "learning_rate": 5.60720866400977e-06, "loss": 0.4579, "num_input_tokens_seen": 99996896, "step": 82225 }, { "epoch": 10.303220147851146, "grad_norm": 0.11659177392721176, "learning_rate": 5.606665993012893e-06, "loss": 0.4661, "num_input_tokens_seen": 100003040, "step": 82230 }, { "epoch": 10.30384663575993, "grad_norm": 0.12078902125358582, "learning_rate": 5.606123314762813e-06, "loss": 0.4708, "num_input_tokens_seen": 100009216, "step": 82235 }, { "epoch": 10.304473123668712, "grad_norm": 0.12092293798923492, "learning_rate": 5.605580629266021e-06, "loss": 0.458, "num_input_tokens_seen": 100015360, "step": 82240 }, { "epoch": 10.305099611577496, "grad_norm": 0.17937085032463074, "learning_rate": 5.605037936529003e-06, "loss": 0.4662, "num_input_tokens_seen": 100021248, "step": 82245 }, { "epoch": 10.30572609948628, "grad_norm": 0.10486455261707306, "learning_rate": 5.604495236558248e-06, "loss": 0.4651, "num_input_tokens_seen": 100027296, "step": 82250 }, { "epoch": 10.306352587395063, "grad_norm": 0.12251241505146027, "learning_rate": 5.603952529360242e-06, "loss": 0.4577, "num_input_tokens_seen": 100033568, "step": 82255 }, { "epoch": 10.306979075303847, "grad_norm": 0.17705556750297546, "learning_rate": 5.603409814941479e-06, "loss": 0.4537, "num_input_tokens_seen": 100039808, "step": 82260 }, { "epoch": 10.307605563212629, "grad_norm": 0.15479342639446259, "learning_rate": 5.602867093308442e-06, "loss": 0.4558, "num_input_tokens_seen": 100045856, "step": 82265 }, { "epoch": 10.308232051121413, "grad_norm": 0.16903401911258698, "learning_rate": 5.602324364467622e-06, "loss": 0.466, "num_input_tokens_seen": 100052224, "step": 82270 }, { "epoch": 10.308858539030197, "grad_norm": 0.191410630941391, "learning_rate": 5.601781628425506e-06, "loss": 0.452, "num_input_tokens_seen": 100057824, "step": 82275 }, { "epoch": 10.30948502693898, "grad_norm": 0.1410636305809021, "learning_rate": 5.601238885188588e-06, "loss": 0.47, "num_input_tokens_seen": 100063904, "step": 82280 }, { "epoch": 10.310111514847764, "grad_norm": 0.1407448649406433, "learning_rate": 5.600696134763352e-06, "loss": 0.4528, "num_input_tokens_seen": 100069440, "step": 82285 }, { "epoch": 10.310738002756548, "grad_norm": 0.11288037896156311, "learning_rate": 5.60015337715629e-06, "loss": 0.4623, "num_input_tokens_seen": 100075584, "step": 82290 }, { "epoch": 10.31136449066533, "grad_norm": 0.1464817076921463, "learning_rate": 5.599610612373889e-06, "loss": 0.4698, "num_input_tokens_seen": 100081440, "step": 82295 }, { "epoch": 10.311990978574114, "grad_norm": 0.13558292388916016, "learning_rate": 5.5990678404226395e-06, "loss": 0.461, "num_input_tokens_seen": 100087648, "step": 82300 }, { "epoch": 10.312617466482896, "grad_norm": 0.19024284183979034, "learning_rate": 5.598525061309031e-06, "loss": 0.4549, "num_input_tokens_seen": 100094112, "step": 82305 }, { "epoch": 10.31324395439168, "grad_norm": 0.12940314412117004, "learning_rate": 5.597982275039551e-06, "loss": 0.462, "num_input_tokens_seen": 100100192, "step": 82310 }, { "epoch": 10.313870442300464, "grad_norm": 0.13322433829307556, "learning_rate": 5.59743948162069e-06, "loss": 0.4531, "num_input_tokens_seen": 100106144, "step": 82315 }, { "epoch": 10.314496930209247, "grad_norm": 0.1533779501914978, "learning_rate": 5.596896681058937e-06, "loss": 0.4588, "num_input_tokens_seen": 100112224, "step": 82320 }, { "epoch": 10.31512341811803, "grad_norm": 0.15046852827072144, "learning_rate": 5.596353873360784e-06, "loss": 0.4556, "num_input_tokens_seen": 100118496, "step": 82325 }, { "epoch": 10.315749906026813, "grad_norm": 0.15214939415454865, "learning_rate": 5.595811058532718e-06, "loss": 0.4592, "num_input_tokens_seen": 100124608, "step": 82330 }, { "epoch": 10.316376393935597, "grad_norm": 0.17038261890411377, "learning_rate": 5.59526823658123e-06, "loss": 0.4659, "num_input_tokens_seen": 100130688, "step": 82335 }, { "epoch": 10.317002881844381, "grad_norm": 0.14980007708072662, "learning_rate": 5.59472540751281e-06, "loss": 0.4551, "num_input_tokens_seen": 100137120, "step": 82340 }, { "epoch": 10.317629369753163, "grad_norm": 0.16071461141109467, "learning_rate": 5.594182571333948e-06, "loss": 0.462, "num_input_tokens_seen": 100143456, "step": 82345 }, { "epoch": 10.318255857661947, "grad_norm": 0.307439923286438, "learning_rate": 5.593639728051133e-06, "loss": 0.4646, "num_input_tokens_seen": 100149152, "step": 82350 }, { "epoch": 10.31888234557073, "grad_norm": 0.13955718278884888, "learning_rate": 5.593096877670857e-06, "loss": 0.4667, "num_input_tokens_seen": 100155392, "step": 82355 }, { "epoch": 10.319508833479514, "grad_norm": 0.18026112020015717, "learning_rate": 5.5925540201996085e-06, "loss": 0.46, "num_input_tokens_seen": 100161728, "step": 82360 }, { "epoch": 10.320135321388298, "grad_norm": 0.14146055281162262, "learning_rate": 5.59201115564388e-06, "loss": 0.4613, "num_input_tokens_seen": 100168000, "step": 82365 }, { "epoch": 10.32076180929708, "grad_norm": 0.2003125548362732, "learning_rate": 5.591468284010159e-06, "loss": 0.4566, "num_input_tokens_seen": 100174368, "step": 82370 }, { "epoch": 10.321388297205864, "grad_norm": 0.13970358669757843, "learning_rate": 5.590925405304936e-06, "loss": 0.4683, "num_input_tokens_seen": 100180480, "step": 82375 }, { "epoch": 10.322014785114646, "grad_norm": 0.1120176613330841, "learning_rate": 5.590382519534704e-06, "loss": 0.4628, "num_input_tokens_seen": 100185760, "step": 82380 }, { "epoch": 10.32264127302343, "grad_norm": 0.1629672348499298, "learning_rate": 5.589839626705953e-06, "loss": 0.4601, "num_input_tokens_seen": 100191456, "step": 82385 }, { "epoch": 10.323267760932215, "grad_norm": 0.18988779187202454, "learning_rate": 5.589296726825173e-06, "loss": 0.4648, "num_input_tokens_seen": 100197696, "step": 82390 }, { "epoch": 10.323894248840997, "grad_norm": 0.14119966328144073, "learning_rate": 5.588753819898855e-06, "loss": 0.4609, "num_input_tokens_seen": 100204160, "step": 82395 }, { "epoch": 10.32452073674978, "grad_norm": 0.12065484374761581, "learning_rate": 5.588210905933491e-06, "loss": 0.4589, "num_input_tokens_seen": 100210272, "step": 82400 }, { "epoch": 10.325147224658565, "grad_norm": 0.12969833612442017, "learning_rate": 5.5876679849355705e-06, "loss": 0.461, "num_input_tokens_seen": 100216448, "step": 82405 }, { "epoch": 10.325773712567347, "grad_norm": 0.14528237283229828, "learning_rate": 5.587125056911587e-06, "loss": 0.4578, "num_input_tokens_seen": 100222496, "step": 82410 }, { "epoch": 10.326400200476131, "grad_norm": 0.1595243662595749, "learning_rate": 5.586582121868027e-06, "loss": 0.4605, "num_input_tokens_seen": 100228352, "step": 82415 }, { "epoch": 10.327026688384914, "grad_norm": 0.1422065645456314, "learning_rate": 5.586039179811387e-06, "loss": 0.4587, "num_input_tokens_seen": 100234560, "step": 82420 }, { "epoch": 10.327653176293698, "grad_norm": 0.19421127438545227, "learning_rate": 5.585496230748154e-06, "loss": 0.4627, "num_input_tokens_seen": 100241024, "step": 82425 }, { "epoch": 10.328279664202482, "grad_norm": 0.1465955376625061, "learning_rate": 5.5849532746848215e-06, "loss": 0.4626, "num_input_tokens_seen": 100247584, "step": 82430 }, { "epoch": 10.328906152111264, "grad_norm": 0.1314803808927536, "learning_rate": 5.584410311627881e-06, "loss": 0.4622, "num_input_tokens_seen": 100253696, "step": 82435 }, { "epoch": 10.329532640020048, "grad_norm": 0.119513601064682, "learning_rate": 5.583867341583824e-06, "loss": 0.4577, "num_input_tokens_seen": 100259936, "step": 82440 }, { "epoch": 10.33015912792883, "grad_norm": 0.13971047103405, "learning_rate": 5.583324364559143e-06, "loss": 0.4723, "num_input_tokens_seen": 100266112, "step": 82445 }, { "epoch": 10.330785615837614, "grad_norm": 0.1360122263431549, "learning_rate": 5.582781380560327e-06, "loss": 0.459, "num_input_tokens_seen": 100272128, "step": 82450 }, { "epoch": 10.331412103746398, "grad_norm": 0.1625010371208191, "learning_rate": 5.58223838959387e-06, "loss": 0.4642, "num_input_tokens_seen": 100278304, "step": 82455 }, { "epoch": 10.33203859165518, "grad_norm": 0.16073133051395416, "learning_rate": 5.5816953916662644e-06, "loss": 0.4538, "num_input_tokens_seen": 100284352, "step": 82460 }, { "epoch": 10.332665079563965, "grad_norm": 0.13477353751659393, "learning_rate": 5.581152386784001e-06, "loss": 0.4558, "num_input_tokens_seen": 100290560, "step": 82465 }, { "epoch": 10.333291567472747, "grad_norm": 0.1545725166797638, "learning_rate": 5.580609374953573e-06, "loss": 0.4627, "num_input_tokens_seen": 100296480, "step": 82470 }, { "epoch": 10.333918055381531, "grad_norm": 0.1805703341960907, "learning_rate": 5.5800663561814705e-06, "loss": 0.4594, "num_input_tokens_seen": 100302048, "step": 82475 }, { "epoch": 10.334544543290315, "grad_norm": 0.158558651804924, "learning_rate": 5.579523330474187e-06, "loss": 0.4676, "num_input_tokens_seen": 100308224, "step": 82480 }, { "epoch": 10.335171031199097, "grad_norm": 0.1632576584815979, "learning_rate": 5.578980297838216e-06, "loss": 0.4575, "num_input_tokens_seen": 100314208, "step": 82485 }, { "epoch": 10.335797519107881, "grad_norm": 0.1464274525642395, "learning_rate": 5.578437258280047e-06, "loss": 0.4637, "num_input_tokens_seen": 100319744, "step": 82490 }, { "epoch": 10.336424007016664, "grad_norm": 0.17040489614009857, "learning_rate": 5.577894211806176e-06, "loss": 0.4681, "num_input_tokens_seen": 100325792, "step": 82495 }, { "epoch": 10.337050494925448, "grad_norm": 0.13367678225040436, "learning_rate": 5.577351158423093e-06, "loss": 0.4711, "num_input_tokens_seen": 100332064, "step": 82500 }, { "epoch": 10.337676982834232, "grad_norm": 0.1981050968170166, "learning_rate": 5.576808098137292e-06, "loss": 0.456, "num_input_tokens_seen": 100338176, "step": 82505 }, { "epoch": 10.338303470743014, "grad_norm": 0.15687674283981323, "learning_rate": 5.576265030955266e-06, "loss": 0.4653, "num_input_tokens_seen": 100344480, "step": 82510 }, { "epoch": 10.338929958651798, "grad_norm": 0.19106024503707886, "learning_rate": 5.575721956883504e-06, "loss": 0.4516, "num_input_tokens_seen": 100350816, "step": 82515 }, { "epoch": 10.339556446560582, "grad_norm": 0.1578606218099594, "learning_rate": 5.575178875928504e-06, "loss": 0.4677, "num_input_tokens_seen": 100357088, "step": 82520 }, { "epoch": 10.340182934469365, "grad_norm": 0.16958579421043396, "learning_rate": 5.574635788096757e-06, "loss": 0.4614, "num_input_tokens_seen": 100363232, "step": 82525 }, { "epoch": 10.340809422378149, "grad_norm": 0.16004963219165802, "learning_rate": 5.574092693394757e-06, "loss": 0.4601, "num_input_tokens_seen": 100368992, "step": 82530 }, { "epoch": 10.34143591028693, "grad_norm": 0.13256439566612244, "learning_rate": 5.573549591828994e-06, "loss": 0.4684, "num_input_tokens_seen": 100375328, "step": 82535 }, { "epoch": 10.342062398195715, "grad_norm": 0.1172846332192421, "learning_rate": 5.573006483405965e-06, "loss": 0.4629, "num_input_tokens_seen": 100381472, "step": 82540 }, { "epoch": 10.342688886104499, "grad_norm": 0.14070190489292145, "learning_rate": 5.572463368132161e-06, "loss": 0.4521, "num_input_tokens_seen": 100387712, "step": 82545 }, { "epoch": 10.343315374013281, "grad_norm": 0.16582201421260834, "learning_rate": 5.571920246014077e-06, "loss": 0.4638, "num_input_tokens_seen": 100394240, "step": 82550 }, { "epoch": 10.343941861922065, "grad_norm": 0.13309840857982635, "learning_rate": 5.571377117058205e-06, "loss": 0.4654, "num_input_tokens_seen": 100400384, "step": 82555 }, { "epoch": 10.344568349830848, "grad_norm": 0.1690746247768402, "learning_rate": 5.570833981271039e-06, "loss": 0.454, "num_input_tokens_seen": 100406528, "step": 82560 }, { "epoch": 10.345194837739632, "grad_norm": 0.13045047223567963, "learning_rate": 5.570290838659074e-06, "loss": 0.467, "num_input_tokens_seen": 100412032, "step": 82565 }, { "epoch": 10.345821325648416, "grad_norm": 0.10817842930555344, "learning_rate": 5.569747689228801e-06, "loss": 0.4641, "num_input_tokens_seen": 100418048, "step": 82570 }, { "epoch": 10.346447813557198, "grad_norm": 0.15494006872177124, "learning_rate": 5.569204532986715e-06, "loss": 0.4603, "num_input_tokens_seen": 100424320, "step": 82575 }, { "epoch": 10.347074301465982, "grad_norm": 0.13677778840065002, "learning_rate": 5.568661369939311e-06, "loss": 0.459, "num_input_tokens_seen": 100430592, "step": 82580 }, { "epoch": 10.347700789374764, "grad_norm": 0.185461163520813, "learning_rate": 5.568118200093082e-06, "loss": 0.4534, "num_input_tokens_seen": 100436704, "step": 82585 }, { "epoch": 10.348327277283548, "grad_norm": 0.14673130214214325, "learning_rate": 5.5675750234545225e-06, "loss": 0.466, "num_input_tokens_seen": 100442464, "step": 82590 }, { "epoch": 10.348953765192332, "grad_norm": 0.18231306970119476, "learning_rate": 5.567031840030127e-06, "loss": 0.46, "num_input_tokens_seen": 100448512, "step": 82595 }, { "epoch": 10.349580253101115, "grad_norm": 0.18008561432361603, "learning_rate": 5.566488649826388e-06, "loss": 0.4573, "num_input_tokens_seen": 100454688, "step": 82600 }, { "epoch": 10.350206741009899, "grad_norm": 0.11807183176279068, "learning_rate": 5.565945452849802e-06, "loss": 0.4518, "num_input_tokens_seen": 100460928, "step": 82605 }, { "epoch": 10.350833228918681, "grad_norm": 0.2347324937582016, "learning_rate": 5.565402249106863e-06, "loss": 0.4564, "num_input_tokens_seen": 100466976, "step": 82610 }, { "epoch": 10.351459716827465, "grad_norm": 0.14178289473056793, "learning_rate": 5.564859038604064e-06, "loss": 0.47, "num_input_tokens_seen": 100473344, "step": 82615 }, { "epoch": 10.35208620473625, "grad_norm": 0.1777736246585846, "learning_rate": 5.5643158213479e-06, "loss": 0.4661, "num_input_tokens_seen": 100479648, "step": 82620 }, { "epoch": 10.352712692645031, "grad_norm": 0.20270869135856628, "learning_rate": 5.563772597344866e-06, "loss": 0.4616, "num_input_tokens_seen": 100485888, "step": 82625 }, { "epoch": 10.353339180553816, "grad_norm": 0.1534152328968048, "learning_rate": 5.563229366601455e-06, "loss": 0.454, "num_input_tokens_seen": 100492128, "step": 82630 }, { "epoch": 10.3539656684626, "grad_norm": 0.16814327239990234, "learning_rate": 5.562686129124165e-06, "loss": 0.4599, "num_input_tokens_seen": 100498464, "step": 82635 }, { "epoch": 10.354592156371382, "grad_norm": 0.16999512910842896, "learning_rate": 5.562142884919488e-06, "loss": 0.4568, "num_input_tokens_seen": 100504448, "step": 82640 }, { "epoch": 10.355218644280166, "grad_norm": 0.24684424698352814, "learning_rate": 5.561599633993921e-06, "loss": 0.4477, "num_input_tokens_seen": 100510528, "step": 82645 }, { "epoch": 10.355845132188948, "grad_norm": 0.16383041441440582, "learning_rate": 5.561056376353959e-06, "loss": 0.4654, "num_input_tokens_seen": 100516672, "step": 82650 }, { "epoch": 10.356471620097732, "grad_norm": 0.17161183059215546, "learning_rate": 5.560513112006097e-06, "loss": 0.4652, "num_input_tokens_seen": 100522720, "step": 82655 }, { "epoch": 10.357098108006516, "grad_norm": 0.170204758644104, "learning_rate": 5.559969840956827e-06, "loss": 0.4604, "num_input_tokens_seen": 100529280, "step": 82660 }, { "epoch": 10.357724595915299, "grad_norm": 0.1301054209470749, "learning_rate": 5.559426563212649e-06, "loss": 0.4631, "num_input_tokens_seen": 100535168, "step": 82665 }, { "epoch": 10.358351083824083, "grad_norm": 0.18024596571922302, "learning_rate": 5.5588832787800555e-06, "loss": 0.4622, "num_input_tokens_seen": 100540864, "step": 82670 }, { "epoch": 10.358977571732865, "grad_norm": 0.13556122779846191, "learning_rate": 5.558339987665542e-06, "loss": 0.4644, "num_input_tokens_seen": 100547264, "step": 82675 }, { "epoch": 10.359604059641649, "grad_norm": 0.12798289954662323, "learning_rate": 5.557796689875604e-06, "loss": 0.4619, "num_input_tokens_seen": 100553472, "step": 82680 }, { "epoch": 10.360230547550433, "grad_norm": 0.14141006767749786, "learning_rate": 5.557253385416738e-06, "loss": 0.4709, "num_input_tokens_seen": 100559680, "step": 82685 }, { "epoch": 10.360857035459215, "grad_norm": 0.17214979231357574, "learning_rate": 5.55671007429544e-06, "loss": 0.4619, "num_input_tokens_seen": 100565920, "step": 82690 }, { "epoch": 10.361483523368, "grad_norm": 0.1793729066848755, "learning_rate": 5.556166756518204e-06, "loss": 0.4482, "num_input_tokens_seen": 100572032, "step": 82695 }, { "epoch": 10.362110011276782, "grad_norm": 0.17906402051448822, "learning_rate": 5.555623432091526e-06, "loss": 0.457, "num_input_tokens_seen": 100578304, "step": 82700 }, { "epoch": 10.362736499185566, "grad_norm": 0.15575118362903595, "learning_rate": 5.555080101021904e-06, "loss": 0.4675, "num_input_tokens_seen": 100584832, "step": 82705 }, { "epoch": 10.36336298709435, "grad_norm": 0.21420148015022278, "learning_rate": 5.554536763315833e-06, "loss": 0.4611, "num_input_tokens_seen": 100591392, "step": 82710 }, { "epoch": 10.363989475003132, "grad_norm": 0.13869553804397583, "learning_rate": 5.55399341897981e-06, "loss": 0.454, "num_input_tokens_seen": 100597664, "step": 82715 }, { "epoch": 10.364615962911916, "grad_norm": 0.1463676244020462, "learning_rate": 5.553450068020327e-06, "loss": 0.4597, "num_input_tokens_seen": 100604064, "step": 82720 }, { "epoch": 10.365242450820698, "grad_norm": 0.14384903013706207, "learning_rate": 5.552906710443886e-06, "loss": 0.4667, "num_input_tokens_seen": 100610080, "step": 82725 }, { "epoch": 10.365868938729482, "grad_norm": 0.1398300975561142, "learning_rate": 5.552363346256979e-06, "loss": 0.4499, "num_input_tokens_seen": 100616224, "step": 82730 }, { "epoch": 10.366495426638267, "grad_norm": 0.12959378957748413, "learning_rate": 5.551819975466103e-06, "loss": 0.464, "num_input_tokens_seen": 100622304, "step": 82735 }, { "epoch": 10.367121914547049, "grad_norm": 0.16330188512802124, "learning_rate": 5.5512765980777565e-06, "loss": 0.461, "num_input_tokens_seen": 100628352, "step": 82740 }, { "epoch": 10.367748402455833, "grad_norm": 0.17878364026546478, "learning_rate": 5.550733214098434e-06, "loss": 0.4591, "num_input_tokens_seen": 100634464, "step": 82745 }, { "epoch": 10.368374890364615, "grad_norm": 0.21046341955661774, "learning_rate": 5.550189823534633e-06, "loss": 0.467, "num_input_tokens_seen": 100640608, "step": 82750 }, { "epoch": 10.3690013782734, "grad_norm": 0.18795667588710785, "learning_rate": 5.54964642639285e-06, "loss": 0.4652, "num_input_tokens_seen": 100646656, "step": 82755 }, { "epoch": 10.369627866182183, "grad_norm": 0.15937095880508423, "learning_rate": 5.549103022679583e-06, "loss": 0.4624, "num_input_tokens_seen": 100652832, "step": 82760 }, { "epoch": 10.370254354090966, "grad_norm": 0.1855960190296173, "learning_rate": 5.548559612401325e-06, "loss": 0.4626, "num_input_tokens_seen": 100658848, "step": 82765 }, { "epoch": 10.37088084199975, "grad_norm": 0.18609806895256042, "learning_rate": 5.5480161955645784e-06, "loss": 0.4554, "num_input_tokens_seen": 100665248, "step": 82770 }, { "epoch": 10.371507329908532, "grad_norm": 0.16717204451560974, "learning_rate": 5.547472772175837e-06, "loss": 0.4692, "num_input_tokens_seen": 100671456, "step": 82775 }, { "epoch": 10.372133817817316, "grad_norm": 0.15509232878684998, "learning_rate": 5.5469293422415985e-06, "loss": 0.4784, "num_input_tokens_seen": 100677632, "step": 82780 }, { "epoch": 10.3727603057261, "grad_norm": 0.1789141744375229, "learning_rate": 5.546385905768359e-06, "loss": 0.4455, "num_input_tokens_seen": 100683168, "step": 82785 }, { "epoch": 10.373386793634882, "grad_norm": 0.14523640275001526, "learning_rate": 5.545842462762615e-06, "loss": 0.4543, "num_input_tokens_seen": 100689120, "step": 82790 }, { "epoch": 10.374013281543666, "grad_norm": 0.1514626443386078, "learning_rate": 5.545299013230868e-06, "loss": 0.458, "num_input_tokens_seen": 100695584, "step": 82795 }, { "epoch": 10.37463976945245, "grad_norm": 0.1507040113210678, "learning_rate": 5.544755557179612e-06, "loss": 0.4615, "num_input_tokens_seen": 100702080, "step": 82800 }, { "epoch": 10.375266257361233, "grad_norm": 0.21935734152793884, "learning_rate": 5.544212094615345e-06, "loss": 0.4716, "num_input_tokens_seen": 100707744, "step": 82805 }, { "epoch": 10.375892745270017, "grad_norm": 0.2227480113506317, "learning_rate": 5.543668625544566e-06, "loss": 0.464, "num_input_tokens_seen": 100713600, "step": 82810 }, { "epoch": 10.376519233178799, "grad_norm": 0.15765252709388733, "learning_rate": 5.543125149973771e-06, "loss": 0.4641, "num_input_tokens_seen": 100719680, "step": 82815 }, { "epoch": 10.377145721087583, "grad_norm": 0.20257742702960968, "learning_rate": 5.542581667909458e-06, "loss": 0.4512, "num_input_tokens_seen": 100725856, "step": 82820 }, { "epoch": 10.377772208996367, "grad_norm": 0.16376367211341858, "learning_rate": 5.542038179358125e-06, "loss": 0.4539, "num_input_tokens_seen": 100731808, "step": 82825 }, { "epoch": 10.37839869690515, "grad_norm": 0.16046403348445892, "learning_rate": 5.5414946843262695e-06, "loss": 0.4652, "num_input_tokens_seen": 100737728, "step": 82830 }, { "epoch": 10.379025184813933, "grad_norm": 0.17530135810375214, "learning_rate": 5.54095118282039e-06, "loss": 0.4665, "num_input_tokens_seen": 100744064, "step": 82835 }, { "epoch": 10.379651672722716, "grad_norm": 0.17249229550361633, "learning_rate": 5.540407674846985e-06, "loss": 0.4639, "num_input_tokens_seen": 100750304, "step": 82840 }, { "epoch": 10.3802781606315, "grad_norm": 0.2095603197813034, "learning_rate": 5.5398641604125505e-06, "loss": 0.464, "num_input_tokens_seen": 100756416, "step": 82845 }, { "epoch": 10.380904648540284, "grad_norm": 0.1397058516740799, "learning_rate": 5.539320639523586e-06, "loss": 0.4701, "num_input_tokens_seen": 100762592, "step": 82850 }, { "epoch": 10.381531136449066, "grad_norm": 0.17950215935707092, "learning_rate": 5.5387771121865906e-06, "loss": 0.4654, "num_input_tokens_seen": 100768832, "step": 82855 }, { "epoch": 10.38215762435785, "grad_norm": 0.2840893268585205, "learning_rate": 5.5382335784080634e-06, "loss": 0.4702, "num_input_tokens_seen": 100774304, "step": 82860 }, { "epoch": 10.382784112266632, "grad_norm": 0.17613588273525238, "learning_rate": 5.537690038194499e-06, "loss": 0.4611, "num_input_tokens_seen": 100780160, "step": 82865 }, { "epoch": 10.383410600175417, "grad_norm": 0.1948479264974594, "learning_rate": 5.5371464915523985e-06, "loss": 0.4634, "num_input_tokens_seen": 100786176, "step": 82870 }, { "epoch": 10.3840370880842, "grad_norm": 0.1506638377904892, "learning_rate": 5.536602938488261e-06, "loss": 0.456, "num_input_tokens_seen": 100792224, "step": 82875 }, { "epoch": 10.384663575992983, "grad_norm": 0.19549153745174408, "learning_rate": 5.536059379008585e-06, "loss": 0.4669, "num_input_tokens_seen": 100798176, "step": 82880 }, { "epoch": 10.385290063901767, "grad_norm": 0.1638251543045044, "learning_rate": 5.535515813119865e-06, "loss": 0.4593, "num_input_tokens_seen": 100804288, "step": 82885 }, { "epoch": 10.38591655181055, "grad_norm": 0.12790709733963013, "learning_rate": 5.534972240828605e-06, "loss": 0.4688, "num_input_tokens_seen": 100810528, "step": 82890 }, { "epoch": 10.386543039719333, "grad_norm": 0.12672634422779083, "learning_rate": 5.534428662141303e-06, "loss": 0.4612, "num_input_tokens_seen": 100816128, "step": 82895 }, { "epoch": 10.387169527628117, "grad_norm": 0.17356330156326294, "learning_rate": 5.533885077064456e-06, "loss": 0.4669, "num_input_tokens_seen": 100822496, "step": 82900 }, { "epoch": 10.3877960155369, "grad_norm": 0.18288317322731018, "learning_rate": 5.533341485604564e-06, "loss": 0.4548, "num_input_tokens_seen": 100828576, "step": 82905 }, { "epoch": 10.388422503445684, "grad_norm": 0.19536323845386505, "learning_rate": 5.532797887768128e-06, "loss": 0.4577, "num_input_tokens_seen": 100834752, "step": 82910 }, { "epoch": 10.389048991354468, "grad_norm": 0.15623627603054047, "learning_rate": 5.532254283561644e-06, "loss": 0.4647, "num_input_tokens_seen": 100841152, "step": 82915 }, { "epoch": 10.38967547926325, "grad_norm": 0.14717282354831696, "learning_rate": 5.531710672991613e-06, "loss": 0.4559, "num_input_tokens_seen": 100847040, "step": 82920 }, { "epoch": 10.390301967172034, "grad_norm": 0.183217391371727, "learning_rate": 5.5311670560645325e-06, "loss": 0.4594, "num_input_tokens_seen": 100852960, "step": 82925 }, { "epoch": 10.390928455080816, "grad_norm": 0.1395541876554489, "learning_rate": 5.530623432786905e-06, "loss": 0.4703, "num_input_tokens_seen": 100858816, "step": 82930 }, { "epoch": 10.3915549429896, "grad_norm": 0.13564345240592957, "learning_rate": 5.530079803165227e-06, "loss": 0.461, "num_input_tokens_seen": 100865152, "step": 82935 }, { "epoch": 10.392181430898384, "grad_norm": 0.1489970088005066, "learning_rate": 5.529536167205999e-06, "loss": 0.4634, "num_input_tokens_seen": 100871296, "step": 82940 }, { "epoch": 10.392807918807167, "grad_norm": 0.12011148035526276, "learning_rate": 5.528992524915722e-06, "loss": 0.4597, "num_input_tokens_seen": 100877472, "step": 82945 }, { "epoch": 10.39343440671595, "grad_norm": 0.12790559232234955, "learning_rate": 5.5284488763008925e-06, "loss": 0.4609, "num_input_tokens_seen": 100883680, "step": 82950 }, { "epoch": 10.394060894624733, "grad_norm": 0.19556428492069244, "learning_rate": 5.527905221368014e-06, "loss": 0.4609, "num_input_tokens_seen": 100889696, "step": 82955 }, { "epoch": 10.394687382533517, "grad_norm": 0.160480797290802, "learning_rate": 5.527361560123586e-06, "loss": 0.4614, "num_input_tokens_seen": 100895744, "step": 82960 }, { "epoch": 10.395313870442301, "grad_norm": 0.20404671132564545, "learning_rate": 5.526817892574106e-06, "loss": 0.4579, "num_input_tokens_seen": 100901824, "step": 82965 }, { "epoch": 10.395940358351083, "grad_norm": 0.16123183071613312, "learning_rate": 5.526274218726074e-06, "loss": 0.4556, "num_input_tokens_seen": 100907392, "step": 82970 }, { "epoch": 10.396566846259867, "grad_norm": 0.15630730986595154, "learning_rate": 5.5257305385859925e-06, "loss": 0.4626, "num_input_tokens_seen": 100913472, "step": 82975 }, { "epoch": 10.39719333416865, "grad_norm": 0.15030820667743683, "learning_rate": 5.525186852160361e-06, "loss": 0.4656, "num_input_tokens_seen": 100920128, "step": 82980 }, { "epoch": 10.397819822077434, "grad_norm": 0.2336031198501587, "learning_rate": 5.524643159455678e-06, "loss": 0.4617, "num_input_tokens_seen": 100926048, "step": 82985 }, { "epoch": 10.398446309986218, "grad_norm": 0.11927621811628342, "learning_rate": 5.524099460478446e-06, "loss": 0.4623, "num_input_tokens_seen": 100931776, "step": 82990 }, { "epoch": 10.399072797895, "grad_norm": 0.18429772555828094, "learning_rate": 5.523555755235162e-06, "loss": 0.4706, "num_input_tokens_seen": 100938016, "step": 82995 }, { "epoch": 10.399699285803784, "grad_norm": 0.23345132172107697, "learning_rate": 5.523012043732331e-06, "loss": 0.465, "num_input_tokens_seen": 100944416, "step": 83000 }, { "epoch": 10.400325773712567, "grad_norm": 0.14911039173603058, "learning_rate": 5.522468325976451e-06, "loss": 0.4598, "num_input_tokens_seen": 100950688, "step": 83005 }, { "epoch": 10.40095226162135, "grad_norm": 0.13702978193759918, "learning_rate": 5.521924601974021e-06, "loss": 0.4522, "num_input_tokens_seen": 100957024, "step": 83010 }, { "epoch": 10.401578749530135, "grad_norm": 0.1663958579301834, "learning_rate": 5.521380871731545e-06, "loss": 0.4552, "num_input_tokens_seen": 100963040, "step": 83015 }, { "epoch": 10.402205237438917, "grad_norm": 0.18333519995212555, "learning_rate": 5.520837135255524e-06, "loss": 0.4645, "num_input_tokens_seen": 100969280, "step": 83020 }, { "epoch": 10.402831725347701, "grad_norm": 0.1833428293466568, "learning_rate": 5.520293392552455e-06, "loss": 0.4595, "num_input_tokens_seen": 100975424, "step": 83025 }, { "epoch": 10.403458213256485, "grad_norm": 0.21474823355674744, "learning_rate": 5.519749643628842e-06, "loss": 0.4524, "num_input_tokens_seen": 100981600, "step": 83030 }, { "epoch": 10.404084701165267, "grad_norm": 0.1723671555519104, "learning_rate": 5.519205888491185e-06, "loss": 0.4657, "num_input_tokens_seen": 100987808, "step": 83035 }, { "epoch": 10.404711189074051, "grad_norm": 0.2560073137283325, "learning_rate": 5.518662127145985e-06, "loss": 0.4564, "num_input_tokens_seen": 100994048, "step": 83040 }, { "epoch": 10.405337676982834, "grad_norm": 0.24160274863243103, "learning_rate": 5.5181183595997425e-06, "loss": 0.4706, "num_input_tokens_seen": 101000224, "step": 83045 }, { "epoch": 10.405964164891618, "grad_norm": 0.26144808530807495, "learning_rate": 5.517574585858959e-06, "loss": 0.4745, "num_input_tokens_seen": 101006208, "step": 83050 }, { "epoch": 10.406590652800402, "grad_norm": 0.2159419059753418, "learning_rate": 5.5170308059301384e-06, "loss": 0.4653, "num_input_tokens_seen": 101012192, "step": 83055 }, { "epoch": 10.407217140709184, "grad_norm": 0.15455438196659088, "learning_rate": 5.516487019819778e-06, "loss": 0.4588, "num_input_tokens_seen": 101018112, "step": 83060 }, { "epoch": 10.407843628617968, "grad_norm": 0.15121319890022278, "learning_rate": 5.5159432275343814e-06, "loss": 0.4599, "num_input_tokens_seen": 101023680, "step": 83065 }, { "epoch": 10.40847011652675, "grad_norm": 0.16819635033607483, "learning_rate": 5.515399429080449e-06, "loss": 0.4581, "num_input_tokens_seen": 101030240, "step": 83070 }, { "epoch": 10.409096604435534, "grad_norm": 0.1456354260444641, "learning_rate": 5.514855624464484e-06, "loss": 0.4623, "num_input_tokens_seen": 101036416, "step": 83075 }, { "epoch": 10.409723092344318, "grad_norm": 0.19008870422840118, "learning_rate": 5.5143118136929876e-06, "loss": 0.4691, "num_input_tokens_seen": 101042464, "step": 83080 }, { "epoch": 10.4103495802531, "grad_norm": 0.17894001305103302, "learning_rate": 5.513767996772461e-06, "loss": 0.4584, "num_input_tokens_seen": 101048512, "step": 83085 }, { "epoch": 10.410976068161885, "grad_norm": 0.1651889681816101, "learning_rate": 5.513224173709406e-06, "loss": 0.4665, "num_input_tokens_seen": 101054432, "step": 83090 }, { "epoch": 10.411602556070667, "grad_norm": 0.1330653727054596, "learning_rate": 5.512680344510324e-06, "loss": 0.4758, "num_input_tokens_seen": 101060544, "step": 83095 }, { "epoch": 10.412229043979451, "grad_norm": 0.16115844249725342, "learning_rate": 5.512136509181717e-06, "loss": 0.4548, "num_input_tokens_seen": 101066752, "step": 83100 }, { "epoch": 10.412855531888235, "grad_norm": 0.15588954091072083, "learning_rate": 5.511592667730088e-06, "loss": 0.4636, "num_input_tokens_seen": 101072928, "step": 83105 }, { "epoch": 10.413482019797017, "grad_norm": 0.14049416780471802, "learning_rate": 5.511048820161938e-06, "loss": 0.4545, "num_input_tokens_seen": 101078976, "step": 83110 }, { "epoch": 10.414108507705802, "grad_norm": 0.19198967516422272, "learning_rate": 5.510504966483771e-06, "loss": 0.459, "num_input_tokens_seen": 101085088, "step": 83115 }, { "epoch": 10.414734995614584, "grad_norm": 0.17761798202991486, "learning_rate": 5.509961106702088e-06, "loss": 0.4403, "num_input_tokens_seen": 101091296, "step": 83120 }, { "epoch": 10.415361483523368, "grad_norm": 0.17186111211776733, "learning_rate": 5.509417240823389e-06, "loss": 0.4597, "num_input_tokens_seen": 101097440, "step": 83125 }, { "epoch": 10.415987971432152, "grad_norm": 0.1430104821920395, "learning_rate": 5.508873368854181e-06, "loss": 0.4637, "num_input_tokens_seen": 101103584, "step": 83130 }, { "epoch": 10.416614459340934, "grad_norm": 0.16716542840003967, "learning_rate": 5.508329490800962e-06, "loss": 0.4631, "num_input_tokens_seen": 101109568, "step": 83135 }, { "epoch": 10.417240947249718, "grad_norm": 0.25671663880348206, "learning_rate": 5.507785606670237e-06, "loss": 0.4699, "num_input_tokens_seen": 101115968, "step": 83140 }, { "epoch": 10.417867435158502, "grad_norm": 0.21421389281749725, "learning_rate": 5.50724171646851e-06, "loss": 0.4597, "num_input_tokens_seen": 101122368, "step": 83145 }, { "epoch": 10.418493923067285, "grad_norm": 0.19255666434764862, "learning_rate": 5.50669782020228e-06, "loss": 0.4649, "num_input_tokens_seen": 101128224, "step": 83150 }, { "epoch": 10.419120410976069, "grad_norm": 0.20816150307655334, "learning_rate": 5.506153917878051e-06, "loss": 0.4682, "num_input_tokens_seen": 101134304, "step": 83155 }, { "epoch": 10.419746898884851, "grad_norm": 0.192123144865036, "learning_rate": 5.505610009502328e-06, "loss": 0.4616, "num_input_tokens_seen": 101140064, "step": 83160 }, { "epoch": 10.420373386793635, "grad_norm": 0.1850333958864212, "learning_rate": 5.505066095081611e-06, "loss": 0.4611, "num_input_tokens_seen": 101146048, "step": 83165 }, { "epoch": 10.420999874702419, "grad_norm": 0.19919735193252563, "learning_rate": 5.5045221746224055e-06, "loss": 0.4491, "num_input_tokens_seen": 101152128, "step": 83170 }, { "epoch": 10.421626362611201, "grad_norm": 0.15693967044353485, "learning_rate": 5.503978248131213e-06, "loss": 0.4674, "num_input_tokens_seen": 101158784, "step": 83175 }, { "epoch": 10.422252850519985, "grad_norm": 0.11563510447740555, "learning_rate": 5.5034343156145366e-06, "loss": 0.4613, "num_input_tokens_seen": 101165024, "step": 83180 }, { "epoch": 10.422879338428768, "grad_norm": 0.17311468720436096, "learning_rate": 5.50289037707888e-06, "loss": 0.4616, "num_input_tokens_seen": 101171200, "step": 83185 }, { "epoch": 10.423505826337552, "grad_norm": 0.17337752878665924, "learning_rate": 5.5023464325307455e-06, "loss": 0.4709, "num_input_tokens_seen": 101177344, "step": 83190 }, { "epoch": 10.424132314246336, "grad_norm": 0.13008758425712585, "learning_rate": 5.501802481976636e-06, "loss": 0.464, "num_input_tokens_seen": 101183456, "step": 83195 }, { "epoch": 10.424758802155118, "grad_norm": 0.15233217179775238, "learning_rate": 5.501258525423058e-06, "loss": 0.4596, "num_input_tokens_seen": 101189568, "step": 83200 }, { "epoch": 10.425385290063902, "grad_norm": 0.1264016479253769, "learning_rate": 5.50071456287651e-06, "loss": 0.4583, "num_input_tokens_seen": 101195456, "step": 83205 }, { "epoch": 10.426011777972684, "grad_norm": 0.1852763146162033, "learning_rate": 5.500170594343502e-06, "loss": 0.4668, "num_input_tokens_seen": 101201472, "step": 83210 }, { "epoch": 10.426638265881468, "grad_norm": 0.14719927310943604, "learning_rate": 5.499626619830532e-06, "loss": 0.462, "num_input_tokens_seen": 101207648, "step": 83215 }, { "epoch": 10.427264753790253, "grad_norm": 0.15510576963424683, "learning_rate": 5.499082639344107e-06, "loss": 0.4649, "num_input_tokens_seen": 101213888, "step": 83220 }, { "epoch": 10.427891241699035, "grad_norm": 0.11696560680866241, "learning_rate": 5.498538652890728e-06, "loss": 0.465, "num_input_tokens_seen": 101220160, "step": 83225 }, { "epoch": 10.428517729607819, "grad_norm": 0.17841535806655884, "learning_rate": 5.497994660476901e-06, "loss": 0.4525, "num_input_tokens_seen": 101226176, "step": 83230 }, { "epoch": 10.429144217516601, "grad_norm": 0.13206374645233154, "learning_rate": 5.4974506621091295e-06, "loss": 0.4535, "num_input_tokens_seen": 101232224, "step": 83235 }, { "epoch": 10.429770705425385, "grad_norm": 0.1726861298084259, "learning_rate": 5.496906657793917e-06, "loss": 0.4657, "num_input_tokens_seen": 101238272, "step": 83240 }, { "epoch": 10.43039719333417, "grad_norm": 0.09610601514577866, "learning_rate": 5.496362647537767e-06, "loss": 0.4729, "num_input_tokens_seen": 101244448, "step": 83245 }, { "epoch": 10.431023681242952, "grad_norm": 0.12842674553394318, "learning_rate": 5.495818631347183e-06, "loss": 0.4601, "num_input_tokens_seen": 101250688, "step": 83250 }, { "epoch": 10.431650169151736, "grad_norm": 0.1207059919834137, "learning_rate": 5.495274609228672e-06, "loss": 0.4609, "num_input_tokens_seen": 101256128, "step": 83255 }, { "epoch": 10.432276657060518, "grad_norm": 0.13540318608283997, "learning_rate": 5.494730581188737e-06, "loss": 0.4606, "num_input_tokens_seen": 101262016, "step": 83260 }, { "epoch": 10.432903144969302, "grad_norm": 0.09732437133789062, "learning_rate": 5.4941865472338805e-06, "loss": 0.4677, "num_input_tokens_seen": 101268416, "step": 83265 }, { "epoch": 10.433529632878086, "grad_norm": 0.178492471575737, "learning_rate": 5.493642507370609e-06, "loss": 0.457, "num_input_tokens_seen": 101274624, "step": 83270 }, { "epoch": 10.434156120786868, "grad_norm": 0.11251504719257355, "learning_rate": 5.493098461605427e-06, "loss": 0.4693, "num_input_tokens_seen": 101280704, "step": 83275 }, { "epoch": 10.434782608695652, "grad_norm": 0.11091911792755127, "learning_rate": 5.492554409944838e-06, "loss": 0.4682, "num_input_tokens_seen": 101287328, "step": 83280 }, { "epoch": 10.435409096604436, "grad_norm": 0.12634901702404022, "learning_rate": 5.492010352395347e-06, "loss": 0.4583, "num_input_tokens_seen": 101293408, "step": 83285 }, { "epoch": 10.436035584513219, "grad_norm": 0.1566714495420456, "learning_rate": 5.491466288963459e-06, "loss": 0.456, "num_input_tokens_seen": 101299936, "step": 83290 }, { "epoch": 10.436662072422003, "grad_norm": 0.1724701076745987, "learning_rate": 5.490922219655678e-06, "loss": 0.4548, "num_input_tokens_seen": 101306112, "step": 83295 }, { "epoch": 10.437288560330785, "grad_norm": 0.20223703980445862, "learning_rate": 5.490378144478509e-06, "loss": 0.4561, "num_input_tokens_seen": 101312544, "step": 83300 }, { "epoch": 10.437915048239569, "grad_norm": 0.09980084747076035, "learning_rate": 5.489834063438456e-06, "loss": 0.4597, "num_input_tokens_seen": 101318912, "step": 83305 }, { "epoch": 10.438541536148353, "grad_norm": 0.13754788041114807, "learning_rate": 5.489289976542026e-06, "loss": 0.4668, "num_input_tokens_seen": 101324448, "step": 83310 }, { "epoch": 10.439168024057135, "grad_norm": 0.17140409350395203, "learning_rate": 5.488745883795723e-06, "loss": 0.4655, "num_input_tokens_seen": 101330560, "step": 83315 }, { "epoch": 10.43979451196592, "grad_norm": 0.13575704395771027, "learning_rate": 5.488201785206051e-06, "loss": 0.4573, "num_input_tokens_seen": 101336096, "step": 83320 }, { "epoch": 10.440420999874702, "grad_norm": 0.1457282304763794, "learning_rate": 5.487657680779517e-06, "loss": 0.4684, "num_input_tokens_seen": 101342368, "step": 83325 }, { "epoch": 10.441047487783486, "grad_norm": 0.11740720272064209, "learning_rate": 5.4871135705226255e-06, "loss": 0.4656, "num_input_tokens_seen": 101348768, "step": 83330 }, { "epoch": 10.44167397569227, "grad_norm": 0.1182670071721077, "learning_rate": 5.4865694544418815e-06, "loss": 0.4629, "num_input_tokens_seen": 101354848, "step": 83335 }, { "epoch": 10.442300463601052, "grad_norm": 0.15899908542633057, "learning_rate": 5.4860253325437906e-06, "loss": 0.456, "num_input_tokens_seen": 101360832, "step": 83340 }, { "epoch": 10.442926951509836, "grad_norm": 0.14835749566555023, "learning_rate": 5.485481204834858e-06, "loss": 0.4578, "num_input_tokens_seen": 101366528, "step": 83345 }, { "epoch": 10.443553439418618, "grad_norm": 0.11289962381124496, "learning_rate": 5.484937071321591e-06, "loss": 0.4535, "num_input_tokens_seen": 101372672, "step": 83350 }, { "epoch": 10.444179927327403, "grad_norm": 0.1453164964914322, "learning_rate": 5.484392932010492e-06, "loss": 0.4574, "num_input_tokens_seen": 101378592, "step": 83355 }, { "epoch": 10.444806415236187, "grad_norm": 0.07947847992181778, "learning_rate": 5.483848786908068e-06, "loss": 0.4619, "num_input_tokens_seen": 101385024, "step": 83360 }, { "epoch": 10.445432903144969, "grad_norm": 0.1723656803369522, "learning_rate": 5.483304636020825e-06, "loss": 0.4574, "num_input_tokens_seen": 101391264, "step": 83365 }, { "epoch": 10.446059391053753, "grad_norm": 0.12332040071487427, "learning_rate": 5.48276047935527e-06, "loss": 0.4583, "num_input_tokens_seen": 101397376, "step": 83370 }, { "epoch": 10.446685878962535, "grad_norm": 0.14288996160030365, "learning_rate": 5.482216316917907e-06, "loss": 0.4632, "num_input_tokens_seen": 101403456, "step": 83375 }, { "epoch": 10.44731236687132, "grad_norm": 0.1432112604379654, "learning_rate": 5.481672148715241e-06, "loss": 0.456, "num_input_tokens_seen": 101409728, "step": 83380 }, { "epoch": 10.447938854780103, "grad_norm": 0.14823560416698456, "learning_rate": 5.481127974753781e-06, "loss": 0.4646, "num_input_tokens_seen": 101415584, "step": 83385 }, { "epoch": 10.448565342688886, "grad_norm": 0.12347740679979324, "learning_rate": 5.480583795040032e-06, "loss": 0.4571, "num_input_tokens_seen": 101421056, "step": 83390 }, { "epoch": 10.44919183059767, "grad_norm": 0.13471157848834991, "learning_rate": 5.4800396095805e-06, "loss": 0.4612, "num_input_tokens_seen": 101426848, "step": 83395 }, { "epoch": 10.449818318506452, "grad_norm": 0.1082620769739151, "learning_rate": 5.47949541838169e-06, "loss": 0.4557, "num_input_tokens_seen": 101432864, "step": 83400 }, { "epoch": 10.450444806415236, "grad_norm": 0.1673002988100052, "learning_rate": 5.478951221450109e-06, "loss": 0.4549, "num_input_tokens_seen": 101439008, "step": 83405 }, { "epoch": 10.45107129432402, "grad_norm": 0.1612945795059204, "learning_rate": 5.478407018792262e-06, "loss": 0.462, "num_input_tokens_seen": 101445056, "step": 83410 }, { "epoch": 10.451697782232802, "grad_norm": 0.1844521164894104, "learning_rate": 5.477862810414659e-06, "loss": 0.4705, "num_input_tokens_seen": 101451264, "step": 83415 }, { "epoch": 10.452324270141586, "grad_norm": 0.1453302949666977, "learning_rate": 5.477318596323804e-06, "loss": 0.4608, "num_input_tokens_seen": 101457184, "step": 83420 }, { "epoch": 10.45295075805037, "grad_norm": 0.1546081155538559, "learning_rate": 5.476774376526204e-06, "loss": 0.4568, "num_input_tokens_seen": 101463360, "step": 83425 }, { "epoch": 10.453577245959153, "grad_norm": 0.13379880785942078, "learning_rate": 5.476230151028365e-06, "loss": 0.4586, "num_input_tokens_seen": 101469664, "step": 83430 }, { "epoch": 10.454203733867937, "grad_norm": 0.1485074907541275, "learning_rate": 5.475685919836795e-06, "loss": 0.4651, "num_input_tokens_seen": 101475648, "step": 83435 }, { "epoch": 10.454830221776719, "grad_norm": 0.1448998600244522, "learning_rate": 5.475141682958e-06, "loss": 0.4585, "num_input_tokens_seen": 101481248, "step": 83440 }, { "epoch": 10.455456709685503, "grad_norm": 0.1543002426624298, "learning_rate": 5.474597440398483e-06, "loss": 0.4671, "num_input_tokens_seen": 101487552, "step": 83445 }, { "epoch": 10.456083197594287, "grad_norm": 0.13444039225578308, "learning_rate": 5.474053192164759e-06, "loss": 0.46, "num_input_tokens_seen": 101493248, "step": 83450 }, { "epoch": 10.45670968550307, "grad_norm": 0.13005922734737396, "learning_rate": 5.473508938263329e-06, "loss": 0.4635, "num_input_tokens_seen": 101499232, "step": 83455 }, { "epoch": 10.457336173411854, "grad_norm": 0.13013312220573425, "learning_rate": 5.472964678700702e-06, "loss": 0.4667, "num_input_tokens_seen": 101505408, "step": 83460 }, { "epoch": 10.457962661320636, "grad_norm": 0.14372585713863373, "learning_rate": 5.472420413483383e-06, "loss": 0.4528, "num_input_tokens_seen": 101511744, "step": 83465 }, { "epoch": 10.45858914922942, "grad_norm": 0.1409500539302826, "learning_rate": 5.471876142617883e-06, "loss": 0.4557, "num_input_tokens_seen": 101517728, "step": 83470 }, { "epoch": 10.459215637138204, "grad_norm": 0.13873833417892456, "learning_rate": 5.471331866110705e-06, "loss": 0.4565, "num_input_tokens_seen": 101524064, "step": 83475 }, { "epoch": 10.459842125046986, "grad_norm": 0.12911413609981537, "learning_rate": 5.470787583968359e-06, "loss": 0.4598, "num_input_tokens_seen": 101530464, "step": 83480 }, { "epoch": 10.46046861295577, "grad_norm": 0.13023537397384644, "learning_rate": 5.470243296197352e-06, "loss": 0.4691, "num_input_tokens_seen": 101536512, "step": 83485 }, { "epoch": 10.461095100864553, "grad_norm": 0.17218023538589478, "learning_rate": 5.469699002804191e-06, "loss": 0.4616, "num_input_tokens_seen": 101542816, "step": 83490 }, { "epoch": 10.461721588773337, "grad_norm": 0.10970811545848846, "learning_rate": 5.469154703795384e-06, "loss": 0.4646, "num_input_tokens_seen": 101548864, "step": 83495 }, { "epoch": 10.46234807668212, "grad_norm": 0.1164526641368866, "learning_rate": 5.468610399177438e-06, "loss": 0.4589, "num_input_tokens_seen": 101555040, "step": 83500 }, { "epoch": 10.462974564590903, "grad_norm": 0.1293986588716507, "learning_rate": 5.468066088956858e-06, "loss": 0.4682, "num_input_tokens_seen": 101561280, "step": 83505 }, { "epoch": 10.463601052499687, "grad_norm": 0.1258709877729416, "learning_rate": 5.4675217731401575e-06, "loss": 0.4613, "num_input_tokens_seen": 101567680, "step": 83510 }, { "epoch": 10.46422754040847, "grad_norm": 0.1491820514202118, "learning_rate": 5.466977451733839e-06, "loss": 0.4701, "num_input_tokens_seen": 101573792, "step": 83515 }, { "epoch": 10.464854028317253, "grad_norm": 0.14949969947338104, "learning_rate": 5.466433124744413e-06, "loss": 0.4624, "num_input_tokens_seen": 101579936, "step": 83520 }, { "epoch": 10.465480516226037, "grad_norm": 0.14474180340766907, "learning_rate": 5.465888792178387e-06, "loss": 0.46, "num_input_tokens_seen": 101586112, "step": 83525 }, { "epoch": 10.46610700413482, "grad_norm": 0.12885145843029022, "learning_rate": 5.465344454042271e-06, "loss": 0.4615, "num_input_tokens_seen": 101592160, "step": 83530 }, { "epoch": 10.466733492043604, "grad_norm": 0.13603346049785614, "learning_rate": 5.4648001103425684e-06, "loss": 0.4642, "num_input_tokens_seen": 101598144, "step": 83535 }, { "epoch": 10.467359979952388, "grad_norm": 0.14451195299625397, "learning_rate": 5.46425576108579e-06, "loss": 0.4574, "num_input_tokens_seen": 101604384, "step": 83540 }, { "epoch": 10.46798646786117, "grad_norm": 0.1675807237625122, "learning_rate": 5.463711406278444e-06, "loss": 0.4674, "num_input_tokens_seen": 101610720, "step": 83545 }, { "epoch": 10.468612955769954, "grad_norm": 0.15149419009685516, "learning_rate": 5.463167045927039e-06, "loss": 0.4614, "num_input_tokens_seen": 101616448, "step": 83550 }, { "epoch": 10.469239443678736, "grad_norm": 0.13842909038066864, "learning_rate": 5.462622680038082e-06, "loss": 0.4636, "num_input_tokens_seen": 101622400, "step": 83555 }, { "epoch": 10.46986593158752, "grad_norm": 0.17086118459701538, "learning_rate": 5.462078308618081e-06, "loss": 0.4538, "num_input_tokens_seen": 101628864, "step": 83560 }, { "epoch": 10.470492419496304, "grad_norm": 0.09634029865264893, "learning_rate": 5.461533931673546e-06, "loss": 0.4585, "num_input_tokens_seen": 101635008, "step": 83565 }, { "epoch": 10.471118907405087, "grad_norm": 0.1530163735151291, "learning_rate": 5.460989549210984e-06, "loss": 0.4634, "num_input_tokens_seen": 101640672, "step": 83570 }, { "epoch": 10.47174539531387, "grad_norm": 0.14833632111549377, "learning_rate": 5.460445161236906e-06, "loss": 0.4597, "num_input_tokens_seen": 101646944, "step": 83575 }, { "epoch": 10.472371883222653, "grad_norm": 0.17222537100315094, "learning_rate": 5.4599007677578175e-06, "loss": 0.4571, "num_input_tokens_seen": 101652832, "step": 83580 }, { "epoch": 10.472998371131437, "grad_norm": 0.16404634714126587, "learning_rate": 5.45935636878023e-06, "loss": 0.4614, "num_input_tokens_seen": 101659104, "step": 83585 }, { "epoch": 10.473624859040221, "grad_norm": 0.13647820055484772, "learning_rate": 5.458811964310651e-06, "loss": 0.4639, "num_input_tokens_seen": 101665344, "step": 83590 }, { "epoch": 10.474251346949004, "grad_norm": 0.13462065160274506, "learning_rate": 5.4582675543555875e-06, "loss": 0.4653, "num_input_tokens_seen": 101671296, "step": 83595 }, { "epoch": 10.474877834857788, "grad_norm": 0.17063339054584503, "learning_rate": 5.457723138921552e-06, "loss": 0.4598, "num_input_tokens_seen": 101677120, "step": 83600 }, { "epoch": 10.47550432276657, "grad_norm": 0.14190340042114258, "learning_rate": 5.45717871801505e-06, "loss": 0.4675, "num_input_tokens_seen": 101683456, "step": 83605 }, { "epoch": 10.476130810675354, "grad_norm": 0.2108055055141449, "learning_rate": 5.456634291642593e-06, "loss": 0.4725, "num_input_tokens_seen": 101689504, "step": 83610 }, { "epoch": 10.476757298584138, "grad_norm": 0.15271025896072388, "learning_rate": 5.456089859810688e-06, "loss": 0.4576, "num_input_tokens_seen": 101695008, "step": 83615 }, { "epoch": 10.47738378649292, "grad_norm": 0.11455852538347244, "learning_rate": 5.455545422525846e-06, "loss": 0.4612, "num_input_tokens_seen": 101701184, "step": 83620 }, { "epoch": 10.478010274401704, "grad_norm": 0.16425476968288422, "learning_rate": 5.455000979794576e-06, "loss": 0.4673, "num_input_tokens_seen": 101707168, "step": 83625 }, { "epoch": 10.478636762310487, "grad_norm": 0.13195987045764923, "learning_rate": 5.454456531623386e-06, "loss": 0.4547, "num_input_tokens_seen": 101713472, "step": 83630 }, { "epoch": 10.47926325021927, "grad_norm": 0.1449800580739975, "learning_rate": 5.453912078018786e-06, "loss": 0.4576, "num_input_tokens_seen": 101719712, "step": 83635 }, { "epoch": 10.479889738128055, "grad_norm": 0.17293454706668854, "learning_rate": 5.453367618987285e-06, "loss": 0.4653, "num_input_tokens_seen": 101725664, "step": 83640 }, { "epoch": 10.480516226036837, "grad_norm": 0.1708902269601822, "learning_rate": 5.452823154535395e-06, "loss": 0.4712, "num_input_tokens_seen": 101731584, "step": 83645 }, { "epoch": 10.481142713945621, "grad_norm": 0.1289147287607193, "learning_rate": 5.4522786846696215e-06, "loss": 0.4651, "num_input_tokens_seen": 101737984, "step": 83650 }, { "epoch": 10.481769201854405, "grad_norm": 0.12706196308135986, "learning_rate": 5.451734209396477e-06, "loss": 0.4631, "num_input_tokens_seen": 101744384, "step": 83655 }, { "epoch": 10.482395689763187, "grad_norm": 0.1268005669116974, "learning_rate": 5.45118972872247e-06, "loss": 0.4597, "num_input_tokens_seen": 101750208, "step": 83660 }, { "epoch": 10.483022177671971, "grad_norm": 0.12424968183040619, "learning_rate": 5.45064524265411e-06, "loss": 0.4664, "num_input_tokens_seen": 101755968, "step": 83665 }, { "epoch": 10.483648665580754, "grad_norm": 0.1901477426290512, "learning_rate": 5.450100751197908e-06, "loss": 0.4572, "num_input_tokens_seen": 101762304, "step": 83670 }, { "epoch": 10.484275153489538, "grad_norm": 0.1149144321680069, "learning_rate": 5.449556254360373e-06, "loss": 0.4615, "num_input_tokens_seen": 101768512, "step": 83675 }, { "epoch": 10.484901641398322, "grad_norm": 0.16869133710861206, "learning_rate": 5.449011752148015e-06, "loss": 0.4528, "num_input_tokens_seen": 101774368, "step": 83680 }, { "epoch": 10.485528129307104, "grad_norm": 0.15540334582328796, "learning_rate": 5.448467244567343e-06, "loss": 0.4649, "num_input_tokens_seen": 101780576, "step": 83685 }, { "epoch": 10.486154617215888, "grad_norm": 0.15101151168346405, "learning_rate": 5.447922731624869e-06, "loss": 0.4577, "num_input_tokens_seen": 101786720, "step": 83690 }, { "epoch": 10.48678110512467, "grad_norm": 0.14731474220752716, "learning_rate": 5.447378213327102e-06, "loss": 0.4598, "num_input_tokens_seen": 101792864, "step": 83695 }, { "epoch": 10.487407593033454, "grad_norm": 0.11506253480911255, "learning_rate": 5.446833689680553e-06, "loss": 0.4614, "num_input_tokens_seen": 101799136, "step": 83700 }, { "epoch": 10.488034080942239, "grad_norm": 0.09623202681541443, "learning_rate": 5.4462891606917315e-06, "loss": 0.4646, "num_input_tokens_seen": 101805280, "step": 83705 }, { "epoch": 10.48866056885102, "grad_norm": 0.1324123591184616, "learning_rate": 5.4457446263671465e-06, "loss": 0.4558, "num_input_tokens_seen": 101810752, "step": 83710 }, { "epoch": 10.489287056759805, "grad_norm": 0.15489397943019867, "learning_rate": 5.4452000867133115e-06, "loss": 0.4609, "num_input_tokens_seen": 101817088, "step": 83715 }, { "epoch": 10.489913544668587, "grad_norm": 0.15040838718414307, "learning_rate": 5.444655541736733e-06, "loss": 0.4637, "num_input_tokens_seen": 101823296, "step": 83720 }, { "epoch": 10.490540032577371, "grad_norm": 0.11387964338064194, "learning_rate": 5.444110991443925e-06, "loss": 0.4495, "num_input_tokens_seen": 101829408, "step": 83725 }, { "epoch": 10.491166520486155, "grad_norm": 0.1574328988790512, "learning_rate": 5.443566435841396e-06, "loss": 0.4605, "num_input_tokens_seen": 101835616, "step": 83730 }, { "epoch": 10.491793008394938, "grad_norm": 0.19801339507102966, "learning_rate": 5.443021874935658e-06, "loss": 0.4587, "num_input_tokens_seen": 101841984, "step": 83735 }, { "epoch": 10.492419496303722, "grad_norm": 0.10602577030658722, "learning_rate": 5.442477308733222e-06, "loss": 0.4578, "num_input_tokens_seen": 101848224, "step": 83740 }, { "epoch": 10.493045984212504, "grad_norm": 0.1529463529586792, "learning_rate": 5.441932737240596e-06, "loss": 0.4569, "num_input_tokens_seen": 101854336, "step": 83745 }, { "epoch": 10.493672472121288, "grad_norm": 0.13936316967010498, "learning_rate": 5.441388160464294e-06, "loss": 0.4546, "num_input_tokens_seen": 101860160, "step": 83750 }, { "epoch": 10.494298960030072, "grad_norm": 0.11302783340215683, "learning_rate": 5.440843578410824e-06, "loss": 0.4619, "num_input_tokens_seen": 101866560, "step": 83755 }, { "epoch": 10.494925447938854, "grad_norm": 0.1458815187215805, "learning_rate": 5.4402989910866986e-06, "loss": 0.4582, "num_input_tokens_seen": 101872704, "step": 83760 }, { "epoch": 10.495551935847638, "grad_norm": 0.23207060992717743, "learning_rate": 5.439754398498429e-06, "loss": 0.4561, "num_input_tokens_seen": 101878848, "step": 83765 }, { "epoch": 10.496178423756422, "grad_norm": 0.14028793573379517, "learning_rate": 5.4392098006525266e-06, "loss": 0.4641, "num_input_tokens_seen": 101885152, "step": 83770 }, { "epoch": 10.496804911665205, "grad_norm": 0.17817236483097076, "learning_rate": 5.4386651975554994e-06, "loss": 0.4706, "num_input_tokens_seen": 101891136, "step": 83775 }, { "epoch": 10.497431399573989, "grad_norm": 0.1326938420534134, "learning_rate": 5.438120589213862e-06, "loss": 0.4466, "num_input_tokens_seen": 101897536, "step": 83780 }, { "epoch": 10.498057887482771, "grad_norm": 0.1862420290708542, "learning_rate": 5.437575975634126e-06, "loss": 0.4654, "num_input_tokens_seen": 101903936, "step": 83785 }, { "epoch": 10.498684375391555, "grad_norm": 0.1542770266532898, "learning_rate": 5.437031356822801e-06, "loss": 0.4742, "num_input_tokens_seen": 101910240, "step": 83790 }, { "epoch": 10.49931086330034, "grad_norm": 0.15027020871639252, "learning_rate": 5.436486732786398e-06, "loss": 0.4607, "num_input_tokens_seen": 101916288, "step": 83795 }, { "epoch": 10.499937351209121, "grad_norm": 0.21946504712104797, "learning_rate": 5.43594210353143e-06, "loss": 0.4525, "num_input_tokens_seen": 101922400, "step": 83800 }, { "epoch": 10.500563839117905, "grad_norm": 0.1495245099067688, "learning_rate": 5.435397469064406e-06, "loss": 0.4659, "num_input_tokens_seen": 101928448, "step": 83805 }, { "epoch": 10.501190327026688, "grad_norm": 0.12525220215320587, "learning_rate": 5.434852829391841e-06, "loss": 0.454, "num_input_tokens_seen": 101934720, "step": 83810 }, { "epoch": 10.501816814935472, "grad_norm": 0.15790008008480072, "learning_rate": 5.434308184520242e-06, "loss": 0.4517, "num_input_tokens_seen": 101940032, "step": 83815 }, { "epoch": 10.502443302844256, "grad_norm": 0.15296831727027893, "learning_rate": 5.433763534456127e-06, "loss": 0.4629, "num_input_tokens_seen": 101946272, "step": 83820 }, { "epoch": 10.503069790753038, "grad_norm": 0.15509700775146484, "learning_rate": 5.433218879206001e-06, "loss": 0.4619, "num_input_tokens_seen": 101952352, "step": 83825 }, { "epoch": 10.503696278661822, "grad_norm": 0.19571878015995026, "learning_rate": 5.432674218776381e-06, "loss": 0.46, "num_input_tokens_seen": 101958176, "step": 83830 }, { "epoch": 10.504322766570604, "grad_norm": 0.1460721790790558, "learning_rate": 5.432129553173777e-06, "loss": 0.4661, "num_input_tokens_seen": 101964064, "step": 83835 }, { "epoch": 10.504949254479389, "grad_norm": 0.252927303314209, "learning_rate": 5.431584882404701e-06, "loss": 0.4651, "num_input_tokens_seen": 101970048, "step": 83840 }, { "epoch": 10.505575742388173, "grad_norm": 0.16891725361347198, "learning_rate": 5.4310402064756655e-06, "loss": 0.4636, "num_input_tokens_seen": 101976224, "step": 83845 }, { "epoch": 10.506202230296955, "grad_norm": 0.13982965052127838, "learning_rate": 5.4304955253931805e-06, "loss": 0.4553, "num_input_tokens_seen": 101981888, "step": 83850 }, { "epoch": 10.506828718205739, "grad_norm": 0.16931842267513275, "learning_rate": 5.42995083916376e-06, "loss": 0.4614, "num_input_tokens_seen": 101987648, "step": 83855 }, { "epoch": 10.507455206114521, "grad_norm": 0.23408329486846924, "learning_rate": 5.429406147793915e-06, "loss": 0.4594, "num_input_tokens_seen": 101993536, "step": 83860 }, { "epoch": 10.508081694023305, "grad_norm": 0.1314748078584671, "learning_rate": 5.42886145129016e-06, "loss": 0.4685, "num_input_tokens_seen": 101999616, "step": 83865 }, { "epoch": 10.50870818193209, "grad_norm": 0.13370418548583984, "learning_rate": 5.428316749659004e-06, "loss": 0.4577, "num_input_tokens_seen": 102005696, "step": 83870 }, { "epoch": 10.509334669840872, "grad_norm": 0.1265944540500641, "learning_rate": 5.427772042906962e-06, "loss": 0.4604, "num_input_tokens_seen": 102012320, "step": 83875 }, { "epoch": 10.509961157749656, "grad_norm": 0.13443371653556824, "learning_rate": 5.427227331040545e-06, "loss": 0.4539, "num_input_tokens_seen": 102018048, "step": 83880 }, { "epoch": 10.51058764565844, "grad_norm": 0.15397216379642487, "learning_rate": 5.4266826140662675e-06, "loss": 0.4603, "num_input_tokens_seen": 102023712, "step": 83885 }, { "epoch": 10.511214133567222, "grad_norm": 0.15013915300369263, "learning_rate": 5.42613789199064e-06, "loss": 0.4694, "num_input_tokens_seen": 102029920, "step": 83890 }, { "epoch": 10.511840621476006, "grad_norm": 0.13915641605854034, "learning_rate": 5.425593164820176e-06, "loss": 0.4583, "num_input_tokens_seen": 102036320, "step": 83895 }, { "epoch": 10.512467109384788, "grad_norm": 0.2309725433588028, "learning_rate": 5.425048432561387e-06, "loss": 0.4614, "num_input_tokens_seen": 102042368, "step": 83900 }, { "epoch": 10.513093597293572, "grad_norm": 0.15370962023735046, "learning_rate": 5.4245036952207875e-06, "loss": 0.4601, "num_input_tokens_seen": 102048640, "step": 83905 }, { "epoch": 10.513720085202355, "grad_norm": 0.17949460446834564, "learning_rate": 5.423958952804888e-06, "loss": 0.4588, "num_input_tokens_seen": 102055040, "step": 83910 }, { "epoch": 10.514346573111139, "grad_norm": 0.1642417311668396, "learning_rate": 5.423414205320206e-06, "loss": 0.4637, "num_input_tokens_seen": 102061312, "step": 83915 }, { "epoch": 10.514973061019923, "grad_norm": 0.19593434035778046, "learning_rate": 5.422869452773249e-06, "loss": 0.4548, "num_input_tokens_seen": 102067456, "step": 83920 }, { "epoch": 10.515599548928705, "grad_norm": 0.17464224994182587, "learning_rate": 5.42232469517053e-06, "loss": 0.4697, "num_input_tokens_seen": 102073568, "step": 83925 }, { "epoch": 10.51622603683749, "grad_norm": 0.17006205022335052, "learning_rate": 5.421779932518567e-06, "loss": 0.4647, "num_input_tokens_seen": 102079616, "step": 83930 }, { "epoch": 10.516852524746273, "grad_norm": 0.12664064764976501, "learning_rate": 5.421235164823869e-06, "loss": 0.4641, "num_input_tokens_seen": 102085792, "step": 83935 }, { "epoch": 10.517479012655055, "grad_norm": 0.15209273993968964, "learning_rate": 5.420690392092951e-06, "loss": 0.4547, "num_input_tokens_seen": 102091808, "step": 83940 }, { "epoch": 10.51810550056384, "grad_norm": 0.1545691043138504, "learning_rate": 5.420145614332326e-06, "loss": 0.4697, "num_input_tokens_seen": 102097920, "step": 83945 }, { "epoch": 10.518731988472622, "grad_norm": 0.13930262625217438, "learning_rate": 5.419600831548507e-06, "loss": 0.4552, "num_input_tokens_seen": 102103872, "step": 83950 }, { "epoch": 10.519358476381406, "grad_norm": 0.19174273312091827, "learning_rate": 5.4190560437480066e-06, "loss": 0.4553, "num_input_tokens_seen": 102109984, "step": 83955 }, { "epoch": 10.51998496429019, "grad_norm": 0.1570422202348709, "learning_rate": 5.418511250937339e-06, "loss": 0.4637, "num_input_tokens_seen": 102116128, "step": 83960 }, { "epoch": 10.520611452198972, "grad_norm": 0.1574554592370987, "learning_rate": 5.417966453123018e-06, "loss": 0.4603, "num_input_tokens_seen": 102122080, "step": 83965 }, { "epoch": 10.521237940107756, "grad_norm": 0.18527567386627197, "learning_rate": 5.417421650311557e-06, "loss": 0.4588, "num_input_tokens_seen": 102128000, "step": 83970 }, { "epoch": 10.521864428016539, "grad_norm": 0.15839199721813202, "learning_rate": 5.416876842509468e-06, "loss": 0.4702, "num_input_tokens_seen": 102134176, "step": 83975 }, { "epoch": 10.522490915925323, "grad_norm": 0.16140496730804443, "learning_rate": 5.416332029723267e-06, "loss": 0.4597, "num_input_tokens_seen": 102140384, "step": 83980 }, { "epoch": 10.523117403834107, "grad_norm": 0.18518435955047607, "learning_rate": 5.415787211959466e-06, "loss": 0.4653, "num_input_tokens_seen": 102146592, "step": 83985 }, { "epoch": 10.523743891742889, "grad_norm": 0.15678797662258148, "learning_rate": 5.4152423892245795e-06, "loss": 0.4585, "num_input_tokens_seen": 102152640, "step": 83990 }, { "epoch": 10.524370379651673, "grad_norm": 0.18542487919330597, "learning_rate": 5.4146975615251206e-06, "loss": 0.4577, "num_input_tokens_seen": 102158112, "step": 83995 }, { "epoch": 10.524996867560455, "grad_norm": 0.21126633882522583, "learning_rate": 5.414152728867603e-06, "loss": 0.4745, "num_input_tokens_seen": 102164128, "step": 84000 }, { "epoch": 10.52562335546924, "grad_norm": 0.17115871608257294, "learning_rate": 5.413607891258543e-06, "loss": 0.4627, "num_input_tokens_seen": 102170336, "step": 84005 }, { "epoch": 10.526249843378023, "grad_norm": 0.18935438990592957, "learning_rate": 5.413063048704452e-06, "loss": 0.4614, "num_input_tokens_seen": 102176672, "step": 84010 }, { "epoch": 10.526876331286806, "grad_norm": 0.25109031796455383, "learning_rate": 5.412518201211846e-06, "loss": 0.4585, "num_input_tokens_seen": 102182880, "step": 84015 }, { "epoch": 10.52750281919559, "grad_norm": 0.17557060718536377, "learning_rate": 5.411973348787237e-06, "loss": 0.4513, "num_input_tokens_seen": 102188832, "step": 84020 }, { "epoch": 10.528129307104372, "grad_norm": 0.14461390674114227, "learning_rate": 5.4114284914371415e-06, "loss": 0.4549, "num_input_tokens_seen": 102194624, "step": 84025 }, { "epoch": 10.528755795013156, "grad_norm": 0.14232994616031647, "learning_rate": 5.410883629168071e-06, "loss": 0.4615, "num_input_tokens_seen": 102200640, "step": 84030 }, { "epoch": 10.52938228292194, "grad_norm": 0.15178264677524567, "learning_rate": 5.410338761986542e-06, "loss": 0.4637, "num_input_tokens_seen": 102206624, "step": 84035 }, { "epoch": 10.530008770830722, "grad_norm": 0.1908138394355774, "learning_rate": 5.4097938898990675e-06, "loss": 0.4559, "num_input_tokens_seen": 102212480, "step": 84040 }, { "epoch": 10.530635258739506, "grad_norm": 0.1702902466058731, "learning_rate": 5.409249012912163e-06, "loss": 0.4634, "num_input_tokens_seen": 102218496, "step": 84045 }, { "epoch": 10.53126174664829, "grad_norm": 0.19153626263141632, "learning_rate": 5.4087041310323425e-06, "loss": 0.4522, "num_input_tokens_seen": 102224704, "step": 84050 }, { "epoch": 10.531888234557073, "grad_norm": 0.1702919751405716, "learning_rate": 5.408159244266121e-06, "loss": 0.4643, "num_input_tokens_seen": 102230688, "step": 84055 }, { "epoch": 10.532514722465857, "grad_norm": 0.3808232545852661, "learning_rate": 5.407614352620012e-06, "loss": 0.4557, "num_input_tokens_seen": 102236960, "step": 84060 }, { "epoch": 10.53314121037464, "grad_norm": 0.16054818034172058, "learning_rate": 5.40706945610053e-06, "loss": 0.4577, "num_input_tokens_seen": 102243072, "step": 84065 }, { "epoch": 10.533767698283423, "grad_norm": 0.19954310357570648, "learning_rate": 5.406524554714192e-06, "loss": 0.4653, "num_input_tokens_seen": 102249184, "step": 84070 }, { "epoch": 10.534394186192207, "grad_norm": 0.1952463835477829, "learning_rate": 5.40597964846751e-06, "loss": 0.4544, "num_input_tokens_seen": 102254464, "step": 84075 }, { "epoch": 10.53502067410099, "grad_norm": 0.23274925351142883, "learning_rate": 5.405434737367e-06, "loss": 0.462, "num_input_tokens_seen": 102260736, "step": 84080 }, { "epoch": 10.535647162009774, "grad_norm": 0.20937307178974152, "learning_rate": 5.404889821419177e-06, "loss": 0.4637, "num_input_tokens_seen": 102266816, "step": 84085 }, { "epoch": 10.536273649918556, "grad_norm": 0.19288326799869537, "learning_rate": 5.404344900630556e-06, "loss": 0.4554, "num_input_tokens_seen": 102272832, "step": 84090 }, { "epoch": 10.53690013782734, "grad_norm": 0.21084780991077423, "learning_rate": 5.403799975007651e-06, "loss": 0.4671, "num_input_tokens_seen": 102279168, "step": 84095 }, { "epoch": 10.537526625736124, "grad_norm": 0.17591162025928497, "learning_rate": 5.403255044556979e-06, "loss": 0.453, "num_input_tokens_seen": 102285184, "step": 84100 }, { "epoch": 10.538153113644906, "grad_norm": 0.21070750057697296, "learning_rate": 5.402710109285053e-06, "loss": 0.4632, "num_input_tokens_seen": 102290976, "step": 84105 }, { "epoch": 10.53877960155369, "grad_norm": 0.17117324471473694, "learning_rate": 5.40216516919839e-06, "loss": 0.4726, "num_input_tokens_seen": 102297056, "step": 84110 }, { "epoch": 10.539406089462473, "grad_norm": 0.17870691418647766, "learning_rate": 5.401620224303504e-06, "loss": 0.4621, "num_input_tokens_seen": 102303008, "step": 84115 }, { "epoch": 10.540032577371257, "grad_norm": 0.18735551834106445, "learning_rate": 5.401075274606911e-06, "loss": 0.4557, "num_input_tokens_seen": 102308800, "step": 84120 }, { "epoch": 10.54065906528004, "grad_norm": 0.19297558069229126, "learning_rate": 5.400530320115124e-06, "loss": 0.4663, "num_input_tokens_seen": 102314400, "step": 84125 }, { "epoch": 10.541285553188823, "grad_norm": 0.1715046763420105, "learning_rate": 5.3999853608346606e-06, "loss": 0.4632, "num_input_tokens_seen": 102320640, "step": 84130 }, { "epoch": 10.541912041097607, "grad_norm": 0.19918838143348694, "learning_rate": 5.3994403967720366e-06, "loss": 0.4659, "num_input_tokens_seen": 102326432, "step": 84135 }, { "epoch": 10.54253852900639, "grad_norm": 0.1772555112838745, "learning_rate": 5.398895427933767e-06, "loss": 0.4617, "num_input_tokens_seen": 102332512, "step": 84140 }, { "epoch": 10.543165016915173, "grad_norm": 0.17543527483940125, "learning_rate": 5.398350454326368e-06, "loss": 0.4668, "num_input_tokens_seen": 102338784, "step": 84145 }, { "epoch": 10.543791504823957, "grad_norm": 0.1875259429216385, "learning_rate": 5.397805475956353e-06, "loss": 0.4632, "num_input_tokens_seen": 102344928, "step": 84150 }, { "epoch": 10.54441799273274, "grad_norm": 0.2016918659210205, "learning_rate": 5.39726049283024e-06, "loss": 0.4581, "num_input_tokens_seen": 102350496, "step": 84155 }, { "epoch": 10.545044480641524, "grad_norm": 0.15745212137699127, "learning_rate": 5.396715504954544e-06, "loss": 0.4615, "num_input_tokens_seen": 102356768, "step": 84160 }, { "epoch": 10.545670968550308, "grad_norm": 0.24517391622066498, "learning_rate": 5.39617051233578e-06, "loss": 0.4622, "num_input_tokens_seen": 102362784, "step": 84165 }, { "epoch": 10.54629745645909, "grad_norm": 0.24805013835430145, "learning_rate": 5.395625514980464e-06, "loss": 0.464, "num_input_tokens_seen": 102368704, "step": 84170 }, { "epoch": 10.546923944367874, "grad_norm": 0.1937035173177719, "learning_rate": 5.395080512895114e-06, "loss": 0.4564, "num_input_tokens_seen": 102374912, "step": 84175 }, { "epoch": 10.547550432276656, "grad_norm": 0.17055481672286987, "learning_rate": 5.394535506086241e-06, "loss": 0.4561, "num_input_tokens_seen": 102380864, "step": 84180 }, { "epoch": 10.54817692018544, "grad_norm": 0.1526859998703003, "learning_rate": 5.393990494560367e-06, "loss": 0.4641, "num_input_tokens_seen": 102386944, "step": 84185 }, { "epoch": 10.548803408094225, "grad_norm": 0.15744473040103912, "learning_rate": 5.393445478324004e-06, "loss": 0.4588, "num_input_tokens_seen": 102392768, "step": 84190 }, { "epoch": 10.549429896003007, "grad_norm": 0.1395508050918579, "learning_rate": 5.392900457383671e-06, "loss": 0.4654, "num_input_tokens_seen": 102399008, "step": 84195 }, { "epoch": 10.550056383911791, "grad_norm": 0.17089395225048065, "learning_rate": 5.392355431745881e-06, "loss": 0.4677, "num_input_tokens_seen": 102405312, "step": 84200 }, { "epoch": 10.550682871820573, "grad_norm": 0.16308486461639404, "learning_rate": 5.391810401417153e-06, "loss": 0.4599, "num_input_tokens_seen": 102411104, "step": 84205 }, { "epoch": 10.551309359729357, "grad_norm": 0.16885872185230255, "learning_rate": 5.391265366404004e-06, "loss": 0.4531, "num_input_tokens_seen": 102417312, "step": 84210 }, { "epoch": 10.551935847638141, "grad_norm": 0.1624443531036377, "learning_rate": 5.390720326712946e-06, "loss": 0.4622, "num_input_tokens_seen": 102423328, "step": 84215 }, { "epoch": 10.552562335546924, "grad_norm": 0.19349583983421326, "learning_rate": 5.390175282350499e-06, "loss": 0.4735, "num_input_tokens_seen": 102428864, "step": 84220 }, { "epoch": 10.553188823455708, "grad_norm": 0.1487278789281845, "learning_rate": 5.3896302333231786e-06, "loss": 0.4647, "num_input_tokens_seen": 102435392, "step": 84225 }, { "epoch": 10.55381531136449, "grad_norm": 0.13250312209129333, "learning_rate": 5.389085179637501e-06, "loss": 0.462, "num_input_tokens_seen": 102441568, "step": 84230 }, { "epoch": 10.554441799273274, "grad_norm": 0.18109074234962463, "learning_rate": 5.388540121299982e-06, "loss": 0.4578, "num_input_tokens_seen": 102447712, "step": 84235 }, { "epoch": 10.555068287182058, "grad_norm": 0.12368050962686539, "learning_rate": 5.3879950583171406e-06, "loss": 0.4641, "num_input_tokens_seen": 102453792, "step": 84240 }, { "epoch": 10.55569477509084, "grad_norm": 0.14907369017601013, "learning_rate": 5.387449990695492e-06, "loss": 0.4682, "num_input_tokens_seen": 102460096, "step": 84245 }, { "epoch": 10.556321262999624, "grad_norm": 0.22825069725513458, "learning_rate": 5.38690491844155e-06, "loss": 0.4616, "num_input_tokens_seen": 102466208, "step": 84250 }, { "epoch": 10.556947750908407, "grad_norm": 0.17009766399860382, "learning_rate": 5.386359841561838e-06, "loss": 0.4605, "num_input_tokens_seen": 102472704, "step": 84255 }, { "epoch": 10.55757423881719, "grad_norm": 0.12903226912021637, "learning_rate": 5.385814760062867e-06, "loss": 0.4624, "num_input_tokens_seen": 102478592, "step": 84260 }, { "epoch": 10.558200726725975, "grad_norm": 0.13629688322544098, "learning_rate": 5.385269673951158e-06, "loss": 0.4729, "num_input_tokens_seen": 102484832, "step": 84265 }, { "epoch": 10.558827214634757, "grad_norm": 0.12696337699890137, "learning_rate": 5.3847245832332265e-06, "loss": 0.461, "num_input_tokens_seen": 102490944, "step": 84270 }, { "epoch": 10.559453702543541, "grad_norm": 0.15436172485351562, "learning_rate": 5.384179487915587e-06, "loss": 0.4599, "num_input_tokens_seen": 102497248, "step": 84275 }, { "epoch": 10.560080190452325, "grad_norm": 0.13307635486125946, "learning_rate": 5.3836343880047606e-06, "loss": 0.4597, "num_input_tokens_seen": 102503104, "step": 84280 }, { "epoch": 10.560706678361107, "grad_norm": 0.17833805084228516, "learning_rate": 5.383089283507261e-06, "loss": 0.4716, "num_input_tokens_seen": 102509120, "step": 84285 }, { "epoch": 10.561333166269891, "grad_norm": 0.1708897352218628, "learning_rate": 5.382544174429608e-06, "loss": 0.4531, "num_input_tokens_seen": 102515104, "step": 84290 }, { "epoch": 10.561959654178674, "grad_norm": 0.11214999854564667, "learning_rate": 5.381999060778316e-06, "loss": 0.4553, "num_input_tokens_seen": 102521216, "step": 84295 }, { "epoch": 10.562586142087458, "grad_norm": 0.14830520749092102, "learning_rate": 5.381453942559907e-06, "loss": 0.4642, "num_input_tokens_seen": 102527168, "step": 84300 }, { "epoch": 10.563212629996242, "grad_norm": 0.14241109788417816, "learning_rate": 5.380908819780894e-06, "loss": 0.4632, "num_input_tokens_seen": 102533184, "step": 84305 }, { "epoch": 10.563839117905024, "grad_norm": 0.17124900221824646, "learning_rate": 5.380363692447795e-06, "loss": 0.4591, "num_input_tokens_seen": 102539008, "step": 84310 }, { "epoch": 10.564465605813808, "grad_norm": 0.1678241789340973, "learning_rate": 5.379818560567129e-06, "loss": 0.4654, "num_input_tokens_seen": 102545184, "step": 84315 }, { "epoch": 10.56509209372259, "grad_norm": 0.2012876272201538, "learning_rate": 5.379273424145412e-06, "loss": 0.459, "num_input_tokens_seen": 102551168, "step": 84320 }, { "epoch": 10.565718581631375, "grad_norm": 0.1647462397813797, "learning_rate": 5.378728283189164e-06, "loss": 0.4625, "num_input_tokens_seen": 102557440, "step": 84325 }, { "epoch": 10.566345069540159, "grad_norm": 0.1647489070892334, "learning_rate": 5.3781831377049006e-06, "loss": 0.4681, "num_input_tokens_seen": 102563648, "step": 84330 }, { "epoch": 10.566971557448941, "grad_norm": 0.1788002997636795, "learning_rate": 5.377637987699139e-06, "loss": 0.4579, "num_input_tokens_seen": 102569856, "step": 84335 }, { "epoch": 10.567598045357725, "grad_norm": 0.1710936278104782, "learning_rate": 5.377092833178398e-06, "loss": 0.4632, "num_input_tokens_seen": 102575904, "step": 84340 }, { "epoch": 10.568224533266507, "grad_norm": 0.13067297637462616, "learning_rate": 5.376547674149195e-06, "loss": 0.4657, "num_input_tokens_seen": 102581696, "step": 84345 }, { "epoch": 10.568851021175291, "grad_norm": 0.16537117958068848, "learning_rate": 5.376002510618048e-06, "loss": 0.4788, "num_input_tokens_seen": 102587744, "step": 84350 }, { "epoch": 10.569477509084075, "grad_norm": 0.11869177222251892, "learning_rate": 5.3754573425914756e-06, "loss": 0.4637, "num_input_tokens_seen": 102593792, "step": 84355 }, { "epoch": 10.570103996992858, "grad_norm": 0.2282242625951767, "learning_rate": 5.374912170075994e-06, "loss": 0.464, "num_input_tokens_seen": 102600096, "step": 84360 }, { "epoch": 10.570730484901642, "grad_norm": 0.1988261640071869, "learning_rate": 5.3743669930781225e-06, "loss": 0.4504, "num_input_tokens_seen": 102606304, "step": 84365 }, { "epoch": 10.571356972810424, "grad_norm": 0.2877165675163269, "learning_rate": 5.373821811604378e-06, "loss": 0.472, "num_input_tokens_seen": 102612288, "step": 84370 }, { "epoch": 10.571983460719208, "grad_norm": 0.2277013063430786, "learning_rate": 5.373276625661281e-06, "loss": 0.4638, "num_input_tokens_seen": 102618464, "step": 84375 }, { "epoch": 10.572609948627992, "grad_norm": 0.17903780937194824, "learning_rate": 5.372731435255348e-06, "loss": 0.4612, "num_input_tokens_seen": 102624768, "step": 84380 }, { "epoch": 10.573236436536774, "grad_norm": 0.1745041012763977, "learning_rate": 5.372186240393097e-06, "loss": 0.4681, "num_input_tokens_seen": 102630944, "step": 84385 }, { "epoch": 10.573862924445558, "grad_norm": 0.1341668963432312, "learning_rate": 5.371641041081046e-06, "loss": 0.4534, "num_input_tokens_seen": 102637024, "step": 84390 }, { "epoch": 10.574489412354342, "grad_norm": 0.16047191619873047, "learning_rate": 5.371095837325714e-06, "loss": 0.4692, "num_input_tokens_seen": 102643040, "step": 84395 }, { "epoch": 10.575115900263125, "grad_norm": 0.15142948925495148, "learning_rate": 5.37055062913362e-06, "loss": 0.4644, "num_input_tokens_seen": 102649120, "step": 84400 }, { "epoch": 10.575742388171909, "grad_norm": 0.15085139870643616, "learning_rate": 5.370005416511281e-06, "loss": 0.4571, "num_input_tokens_seen": 102655328, "step": 84405 }, { "epoch": 10.576368876080691, "grad_norm": 0.1611476093530655, "learning_rate": 5.369460199465216e-06, "loss": 0.4649, "num_input_tokens_seen": 102661408, "step": 84410 }, { "epoch": 10.576995363989475, "grad_norm": 0.1613769829273224, "learning_rate": 5.368914978001944e-06, "loss": 0.4768, "num_input_tokens_seen": 102667328, "step": 84415 }, { "epoch": 10.577621851898257, "grad_norm": 0.13364319503307343, "learning_rate": 5.368369752127985e-06, "loss": 0.4634, "num_input_tokens_seen": 102673440, "step": 84420 }, { "epoch": 10.578248339807041, "grad_norm": 0.1512342095375061, "learning_rate": 5.3678245218498536e-06, "loss": 0.4543, "num_input_tokens_seen": 102679456, "step": 84425 }, { "epoch": 10.578874827715826, "grad_norm": 0.13672076165676117, "learning_rate": 5.367279287174072e-06, "loss": 0.4683, "num_input_tokens_seen": 102685536, "step": 84430 }, { "epoch": 10.579501315624608, "grad_norm": 0.1566004604101181, "learning_rate": 5.366734048107156e-06, "loss": 0.4604, "num_input_tokens_seen": 102691776, "step": 84435 }, { "epoch": 10.580127803533392, "grad_norm": 0.13612717390060425, "learning_rate": 5.366188804655627e-06, "loss": 0.4585, "num_input_tokens_seen": 102697696, "step": 84440 }, { "epoch": 10.580754291442176, "grad_norm": 0.1609877496957779, "learning_rate": 5.365643556826004e-06, "loss": 0.4577, "num_input_tokens_seen": 102703904, "step": 84445 }, { "epoch": 10.581380779350958, "grad_norm": 0.13526752591133118, "learning_rate": 5.365098304624803e-06, "loss": 0.4604, "num_input_tokens_seen": 102710464, "step": 84450 }, { "epoch": 10.582007267259742, "grad_norm": 0.1948046237230301, "learning_rate": 5.364553048058546e-06, "loss": 0.4538, "num_input_tokens_seen": 102716448, "step": 84455 }, { "epoch": 10.582633755168525, "grad_norm": 0.24711672961711884, "learning_rate": 5.364007787133752e-06, "loss": 0.4561, "num_input_tokens_seen": 102722368, "step": 84460 }, { "epoch": 10.583260243077309, "grad_norm": 0.17412106692790985, "learning_rate": 5.363462521856937e-06, "loss": 0.4558, "num_input_tokens_seen": 102728640, "step": 84465 }, { "epoch": 10.583886730986093, "grad_norm": 0.13007931411266327, "learning_rate": 5.362917252234623e-06, "loss": 0.4628, "num_input_tokens_seen": 102734304, "step": 84470 }, { "epoch": 10.584513218894875, "grad_norm": 0.12357629090547562, "learning_rate": 5.3623719782733285e-06, "loss": 0.4543, "num_input_tokens_seen": 102740352, "step": 84475 }, { "epoch": 10.585139706803659, "grad_norm": 0.18525269627571106, "learning_rate": 5.361826699979571e-06, "loss": 0.4583, "num_input_tokens_seen": 102746464, "step": 84480 }, { "epoch": 10.585766194712441, "grad_norm": 0.1874299943447113, "learning_rate": 5.361281417359872e-06, "loss": 0.4666, "num_input_tokens_seen": 102752480, "step": 84485 }, { "epoch": 10.586392682621225, "grad_norm": 0.15153157711029053, "learning_rate": 5.360736130420748e-06, "loss": 0.466, "num_input_tokens_seen": 102758624, "step": 84490 }, { "epoch": 10.58701917053001, "grad_norm": 0.18424749374389648, "learning_rate": 5.360190839168723e-06, "loss": 0.4508, "num_input_tokens_seen": 102764960, "step": 84495 }, { "epoch": 10.587645658438792, "grad_norm": 0.2239387482404709, "learning_rate": 5.359645543610311e-06, "loss": 0.463, "num_input_tokens_seen": 102771008, "step": 84500 }, { "epoch": 10.588272146347576, "grad_norm": 0.18876174092292786, "learning_rate": 5.359100243752036e-06, "loss": 0.4661, "num_input_tokens_seen": 102777344, "step": 84505 }, { "epoch": 10.588898634256358, "grad_norm": 0.13856108486652374, "learning_rate": 5.358554939600416e-06, "loss": 0.4581, "num_input_tokens_seen": 102783328, "step": 84510 }, { "epoch": 10.589525122165142, "grad_norm": 0.17067117989063263, "learning_rate": 5.358009631161969e-06, "loss": 0.4526, "num_input_tokens_seen": 102788832, "step": 84515 }, { "epoch": 10.590151610073926, "grad_norm": 0.2408997267484665, "learning_rate": 5.357464318443217e-06, "loss": 0.4655, "num_input_tokens_seen": 102794944, "step": 84520 }, { "epoch": 10.590778097982708, "grad_norm": 0.18573690950870514, "learning_rate": 5.3569190014506775e-06, "loss": 0.4531, "num_input_tokens_seen": 102801216, "step": 84525 }, { "epoch": 10.591404585891492, "grad_norm": 0.22892756760120392, "learning_rate": 5.356373680190872e-06, "loss": 0.4661, "num_input_tokens_seen": 102807264, "step": 84530 }, { "epoch": 10.592031073800275, "grad_norm": 0.25074711441993713, "learning_rate": 5.3558283546703205e-06, "loss": 0.4636, "num_input_tokens_seen": 102813408, "step": 84535 }, { "epoch": 10.592657561709059, "grad_norm": 0.19813215732574463, "learning_rate": 5.3552830248955404e-06, "loss": 0.4587, "num_input_tokens_seen": 102819808, "step": 84540 }, { "epoch": 10.593284049617843, "grad_norm": 0.2034006118774414, "learning_rate": 5.354737690873052e-06, "loss": 0.4576, "num_input_tokens_seen": 102826240, "step": 84545 }, { "epoch": 10.593910537526625, "grad_norm": 0.1711340695619583, "learning_rate": 5.354192352609377e-06, "loss": 0.4606, "num_input_tokens_seen": 102832608, "step": 84550 }, { "epoch": 10.59453702543541, "grad_norm": 0.2013096809387207, "learning_rate": 5.353647010111036e-06, "loss": 0.463, "num_input_tokens_seen": 102838880, "step": 84555 }, { "epoch": 10.595163513344193, "grad_norm": 0.15002469718456268, "learning_rate": 5.3531016633845464e-06, "loss": 0.4673, "num_input_tokens_seen": 102845280, "step": 84560 }, { "epoch": 10.595790001252976, "grad_norm": 0.2129141390323639, "learning_rate": 5.35255631243643e-06, "loss": 0.4582, "num_input_tokens_seen": 102851680, "step": 84565 }, { "epoch": 10.59641648916176, "grad_norm": 0.19586507976055145, "learning_rate": 5.352010957273207e-06, "loss": 0.4696, "num_input_tokens_seen": 102857952, "step": 84570 }, { "epoch": 10.597042977070542, "grad_norm": 0.1928407847881317, "learning_rate": 5.351465597901396e-06, "loss": 0.4627, "num_input_tokens_seen": 102864096, "step": 84575 }, { "epoch": 10.597669464979326, "grad_norm": 0.15592531859874725, "learning_rate": 5.350920234327519e-06, "loss": 0.4502, "num_input_tokens_seen": 102870688, "step": 84580 }, { "epoch": 10.59829595288811, "grad_norm": 0.17676813900470734, "learning_rate": 5.350374866558095e-06, "loss": 0.4517, "num_input_tokens_seen": 102876512, "step": 84585 }, { "epoch": 10.598922440796892, "grad_norm": 0.1678839474916458, "learning_rate": 5.3498294945996464e-06, "loss": 0.4603, "num_input_tokens_seen": 102882816, "step": 84590 }, { "epoch": 10.599548928705676, "grad_norm": 0.2261732518672943, "learning_rate": 5.3492841184586895e-06, "loss": 0.4699, "num_input_tokens_seen": 102888672, "step": 84595 }, { "epoch": 10.600175416614459, "grad_norm": 0.20558784902095795, "learning_rate": 5.348738738141749e-06, "loss": 0.4521, "num_input_tokens_seen": 102894912, "step": 84600 }, { "epoch": 10.600801904523243, "grad_norm": 0.19817079603672028, "learning_rate": 5.348193353655343e-06, "loss": 0.4611, "num_input_tokens_seen": 102900992, "step": 84605 }, { "epoch": 10.601428392432027, "grad_norm": 0.18203230202198029, "learning_rate": 5.347647965005994e-06, "loss": 0.4552, "num_input_tokens_seen": 102907488, "step": 84610 }, { "epoch": 10.602054880340809, "grad_norm": 0.31694090366363525, "learning_rate": 5.34710257220022e-06, "loss": 0.4671, "num_input_tokens_seen": 102913664, "step": 84615 }, { "epoch": 10.602681368249593, "grad_norm": 0.2433546781539917, "learning_rate": 5.3465571752445435e-06, "loss": 0.458, "num_input_tokens_seen": 102919776, "step": 84620 }, { "epoch": 10.603307856158375, "grad_norm": 0.21638983488082886, "learning_rate": 5.3460117741454846e-06, "loss": 0.4619, "num_input_tokens_seen": 102926048, "step": 84625 }, { "epoch": 10.60393434406716, "grad_norm": 0.16992925107479095, "learning_rate": 5.345466368909563e-06, "loss": 0.4591, "num_input_tokens_seen": 102932000, "step": 84630 }, { "epoch": 10.604560831975943, "grad_norm": 0.15723419189453125, "learning_rate": 5.344920959543302e-06, "loss": 0.4714, "num_input_tokens_seen": 102938048, "step": 84635 }, { "epoch": 10.605187319884726, "grad_norm": 0.241442009806633, "learning_rate": 5.34437554605322e-06, "loss": 0.4572, "num_input_tokens_seen": 102944096, "step": 84640 }, { "epoch": 10.60581380779351, "grad_norm": 0.1943625509738922, "learning_rate": 5.34383012844584e-06, "loss": 0.4725, "num_input_tokens_seen": 102950304, "step": 84645 }, { "epoch": 10.606440295702292, "grad_norm": 0.1464022397994995, "learning_rate": 5.34328470672768e-06, "loss": 0.462, "num_input_tokens_seen": 102956352, "step": 84650 }, { "epoch": 10.607066783611076, "grad_norm": 0.19081810116767883, "learning_rate": 5.342739280905264e-06, "loss": 0.462, "num_input_tokens_seen": 102962368, "step": 84655 }, { "epoch": 10.60769327151986, "grad_norm": 0.21265168488025665, "learning_rate": 5.34219385098511e-06, "loss": 0.4723, "num_input_tokens_seen": 102968544, "step": 84660 }, { "epoch": 10.608319759428642, "grad_norm": 0.19853362441062927, "learning_rate": 5.341648416973742e-06, "loss": 0.4657, "num_input_tokens_seen": 102974432, "step": 84665 }, { "epoch": 10.608946247337427, "grad_norm": 0.21233059465885162, "learning_rate": 5.341102978877681e-06, "loss": 0.454, "num_input_tokens_seen": 102980128, "step": 84670 }, { "epoch": 10.60957273524621, "grad_norm": 0.23332321643829346, "learning_rate": 5.340557536703447e-06, "loss": 0.4554, "num_input_tokens_seen": 102986208, "step": 84675 }, { "epoch": 10.610199223154993, "grad_norm": 0.18063358962535858, "learning_rate": 5.340012090457559e-06, "loss": 0.4667, "num_input_tokens_seen": 102992416, "step": 84680 }, { "epoch": 10.610825711063777, "grad_norm": 0.18671658635139465, "learning_rate": 5.339466640146542e-06, "loss": 0.4501, "num_input_tokens_seen": 102998592, "step": 84685 }, { "epoch": 10.61145219897256, "grad_norm": 0.18365252017974854, "learning_rate": 5.338921185776916e-06, "loss": 0.4596, "num_input_tokens_seen": 103004480, "step": 84690 }, { "epoch": 10.612078686881343, "grad_norm": 0.15986567735671997, "learning_rate": 5.3383757273552026e-06, "loss": 0.4531, "num_input_tokens_seen": 103010688, "step": 84695 }, { "epoch": 10.612705174790127, "grad_norm": 0.15914984047412872, "learning_rate": 5.337830264887922e-06, "loss": 0.4668, "num_input_tokens_seen": 103016672, "step": 84700 }, { "epoch": 10.61333166269891, "grad_norm": 0.2130214124917984, "learning_rate": 5.337284798381597e-06, "loss": 0.4576, "num_input_tokens_seen": 103022816, "step": 84705 }, { "epoch": 10.613958150607694, "grad_norm": 0.13924729824066162, "learning_rate": 5.336739327842748e-06, "loss": 0.4514, "num_input_tokens_seen": 103028960, "step": 84710 }, { "epoch": 10.614584638516476, "grad_norm": 0.21154794096946716, "learning_rate": 5.336193853277899e-06, "loss": 0.4536, "num_input_tokens_seen": 103035392, "step": 84715 }, { "epoch": 10.61521112642526, "grad_norm": 0.21231819689273834, "learning_rate": 5.335648374693568e-06, "loss": 0.4709, "num_input_tokens_seen": 103041760, "step": 84720 }, { "epoch": 10.615837614334044, "grad_norm": 0.19855302572250366, "learning_rate": 5.3351028920962796e-06, "loss": 0.4607, "num_input_tokens_seen": 103047968, "step": 84725 }, { "epoch": 10.616464102242826, "grad_norm": 0.1782066822052002, "learning_rate": 5.334557405492556e-06, "loss": 0.4604, "num_input_tokens_seen": 103053792, "step": 84730 }, { "epoch": 10.61709059015161, "grad_norm": 0.25197896361351013, "learning_rate": 5.334011914888914e-06, "loss": 0.4551, "num_input_tokens_seen": 103059712, "step": 84735 }, { "epoch": 10.617717078060393, "grad_norm": 0.18499250710010529, "learning_rate": 5.33346642029188e-06, "loss": 0.4648, "num_input_tokens_seen": 103066080, "step": 84740 }, { "epoch": 10.618343565969177, "grad_norm": 0.1896783411502838, "learning_rate": 5.332920921707975e-06, "loss": 0.4658, "num_input_tokens_seen": 103071936, "step": 84745 }, { "epoch": 10.61897005387796, "grad_norm": 0.17449991405010223, "learning_rate": 5.332375419143719e-06, "loss": 0.4636, "num_input_tokens_seen": 103078208, "step": 84750 }, { "epoch": 10.619596541786743, "grad_norm": 0.17483673989772797, "learning_rate": 5.331829912605636e-06, "loss": 0.4555, "num_input_tokens_seen": 103084448, "step": 84755 }, { "epoch": 10.620223029695527, "grad_norm": 0.21539007127285004, "learning_rate": 5.331284402100249e-06, "loss": 0.4506, "num_input_tokens_seen": 103090624, "step": 84760 }, { "epoch": 10.62084951760431, "grad_norm": 0.24671237170696259, "learning_rate": 5.330738887634077e-06, "loss": 0.4726, "num_input_tokens_seen": 103096384, "step": 84765 }, { "epoch": 10.621476005513093, "grad_norm": 0.19376692175865173, "learning_rate": 5.330193369213644e-06, "loss": 0.4612, "num_input_tokens_seen": 103102720, "step": 84770 }, { "epoch": 10.622102493421878, "grad_norm": 0.23397134244441986, "learning_rate": 5.329647846845473e-06, "loss": 0.4604, "num_input_tokens_seen": 103108960, "step": 84775 }, { "epoch": 10.62272898133066, "grad_norm": 0.1955483853816986, "learning_rate": 5.329102320536082e-06, "loss": 0.4587, "num_input_tokens_seen": 103115328, "step": 84780 }, { "epoch": 10.623355469239444, "grad_norm": 0.2137925773859024, "learning_rate": 5.328556790291999e-06, "loss": 0.4667, "num_input_tokens_seen": 103121408, "step": 84785 }, { "epoch": 10.623981957148228, "grad_norm": 0.19686558842658997, "learning_rate": 5.3280112561197415e-06, "loss": 0.4559, "num_input_tokens_seen": 103127520, "step": 84790 }, { "epoch": 10.62460844505701, "grad_norm": 0.24951176345348358, "learning_rate": 5.327465718025833e-06, "loss": 0.4588, "num_input_tokens_seen": 103133920, "step": 84795 }, { "epoch": 10.625234932965794, "grad_norm": 0.17799577116966248, "learning_rate": 5.326920176016798e-06, "loss": 0.4566, "num_input_tokens_seen": 103140224, "step": 84800 }, { "epoch": 10.625861420874577, "grad_norm": 0.22129273414611816, "learning_rate": 5.326374630099157e-06, "loss": 0.4586, "num_input_tokens_seen": 103146528, "step": 84805 }, { "epoch": 10.62648790878336, "grad_norm": 0.23918433487415314, "learning_rate": 5.325829080279432e-06, "loss": 0.4672, "num_input_tokens_seen": 103151840, "step": 84810 }, { "epoch": 10.627114396692145, "grad_norm": 0.15106523036956787, "learning_rate": 5.325283526564148e-06, "loss": 0.4754, "num_input_tokens_seen": 103157952, "step": 84815 }, { "epoch": 10.627740884600927, "grad_norm": 0.19502441585063934, "learning_rate": 5.324737968959824e-06, "loss": 0.4467, "num_input_tokens_seen": 103164096, "step": 84820 }, { "epoch": 10.628367372509711, "grad_norm": 0.15108856558799744, "learning_rate": 5.3241924074729865e-06, "loss": 0.4614, "num_input_tokens_seen": 103170560, "step": 84825 }, { "epoch": 10.628993860418493, "grad_norm": 0.1709744781255722, "learning_rate": 5.323646842110156e-06, "loss": 0.4583, "num_input_tokens_seen": 103177088, "step": 84830 }, { "epoch": 10.629620348327277, "grad_norm": 0.22186684608459473, "learning_rate": 5.323101272877855e-06, "loss": 0.4626, "num_input_tokens_seen": 103183104, "step": 84835 }, { "epoch": 10.630246836236061, "grad_norm": 0.22566550970077515, "learning_rate": 5.322555699782607e-06, "loss": 0.4524, "num_input_tokens_seen": 103189088, "step": 84840 }, { "epoch": 10.630873324144844, "grad_norm": 0.1543349176645279, "learning_rate": 5.322010122830934e-06, "loss": 0.4759, "num_input_tokens_seen": 103194848, "step": 84845 }, { "epoch": 10.631499812053628, "grad_norm": 0.3486197590827942, "learning_rate": 5.32146454202936e-06, "loss": 0.46, "num_input_tokens_seen": 103200768, "step": 84850 }, { "epoch": 10.63212629996241, "grad_norm": 0.15172423422336578, "learning_rate": 5.320918957384404e-06, "loss": 0.4708, "num_input_tokens_seen": 103207008, "step": 84855 }, { "epoch": 10.632752787871194, "grad_norm": 0.22268018126487732, "learning_rate": 5.320373368902595e-06, "loss": 0.4573, "num_input_tokens_seen": 103213120, "step": 84860 }, { "epoch": 10.633379275779978, "grad_norm": 0.18173745274543762, "learning_rate": 5.319827776590452e-06, "loss": 0.452, "num_input_tokens_seen": 103219168, "step": 84865 }, { "epoch": 10.63400576368876, "grad_norm": 0.16871795058250427, "learning_rate": 5.3192821804544994e-06, "loss": 0.4586, "num_input_tokens_seen": 103225152, "step": 84870 }, { "epoch": 10.634632251597544, "grad_norm": 0.1516922116279602, "learning_rate": 5.31873658050126e-06, "loss": 0.4529, "num_input_tokens_seen": 103231168, "step": 84875 }, { "epoch": 10.635258739506327, "grad_norm": 0.26831957697868347, "learning_rate": 5.318190976737256e-06, "loss": 0.456, "num_input_tokens_seen": 103237120, "step": 84880 }, { "epoch": 10.63588522741511, "grad_norm": 0.19528266787528992, "learning_rate": 5.317645369169013e-06, "loss": 0.461, "num_input_tokens_seen": 103243392, "step": 84885 }, { "epoch": 10.636511715323895, "grad_norm": 0.20595324039459229, "learning_rate": 5.317099757803053e-06, "loss": 0.4483, "num_input_tokens_seen": 103249632, "step": 84890 }, { "epoch": 10.637138203232677, "grad_norm": 0.32571566104888916, "learning_rate": 5.3165541426458964e-06, "loss": 0.4652, "num_input_tokens_seen": 103255712, "step": 84895 }, { "epoch": 10.637764691141461, "grad_norm": 0.19844059646129608, "learning_rate": 5.31600852370407e-06, "loss": 0.461, "num_input_tokens_seen": 103261856, "step": 84900 }, { "epoch": 10.638391179050245, "grad_norm": 0.28317344188690186, "learning_rate": 5.3154629009840954e-06, "loss": 0.4537, "num_input_tokens_seen": 103267456, "step": 84905 }, { "epoch": 10.639017666959028, "grad_norm": 0.23573742806911469, "learning_rate": 5.3149172744924975e-06, "loss": 0.4641, "num_input_tokens_seen": 103273408, "step": 84910 }, { "epoch": 10.639644154867812, "grad_norm": 0.3336278796195984, "learning_rate": 5.314371644235798e-06, "loss": 0.4691, "num_input_tokens_seen": 103278752, "step": 84915 }, { "epoch": 10.640270642776594, "grad_norm": 0.1782020926475525, "learning_rate": 5.313826010220522e-06, "loss": 0.451, "num_input_tokens_seen": 103285024, "step": 84920 }, { "epoch": 10.640897130685378, "grad_norm": 0.25690725445747375, "learning_rate": 5.313280372453192e-06, "loss": 0.4638, "num_input_tokens_seen": 103291104, "step": 84925 }, { "epoch": 10.641523618594162, "grad_norm": 0.15470123291015625, "learning_rate": 5.312734730940331e-06, "loss": 0.4555, "num_input_tokens_seen": 103297376, "step": 84930 }, { "epoch": 10.642150106502944, "grad_norm": 0.21174664795398712, "learning_rate": 5.312189085688464e-06, "loss": 0.4569, "num_input_tokens_seen": 103303712, "step": 84935 }, { "epoch": 10.642776594411728, "grad_norm": 0.18786358833312988, "learning_rate": 5.311643436704114e-06, "loss": 0.4598, "num_input_tokens_seen": 103309152, "step": 84940 }, { "epoch": 10.64340308232051, "grad_norm": 0.18016771972179413, "learning_rate": 5.311097783993805e-06, "loss": 0.4504, "num_input_tokens_seen": 103315040, "step": 84945 }, { "epoch": 10.644029570229295, "grad_norm": 0.21288685500621796, "learning_rate": 5.3105521275640605e-06, "loss": 0.4568, "num_input_tokens_seen": 103320800, "step": 84950 }, { "epoch": 10.644656058138079, "grad_norm": 0.281627893447876, "learning_rate": 5.310006467421404e-06, "loss": 0.4575, "num_input_tokens_seen": 103327008, "step": 84955 }, { "epoch": 10.645282546046861, "grad_norm": 0.3710039556026459, "learning_rate": 5.309460803572358e-06, "loss": 0.4631, "num_input_tokens_seen": 103333376, "step": 84960 }, { "epoch": 10.645909033955645, "grad_norm": 0.31111663579940796, "learning_rate": 5.308915136023449e-06, "loss": 0.4661, "num_input_tokens_seen": 103338752, "step": 84965 }, { "epoch": 10.646535521864427, "grad_norm": 0.2449880838394165, "learning_rate": 5.3083694647812e-06, "loss": 0.4625, "num_input_tokens_seen": 103344448, "step": 84970 }, { "epoch": 10.647162009773211, "grad_norm": 0.3470941483974457, "learning_rate": 5.3078237898521355e-06, "loss": 0.4665, "num_input_tokens_seen": 103350688, "step": 84975 }, { "epoch": 10.647788497681995, "grad_norm": 0.295476496219635, "learning_rate": 5.307278111242778e-06, "loss": 0.4461, "num_input_tokens_seen": 103356896, "step": 84980 }, { "epoch": 10.648414985590778, "grad_norm": 0.2856685519218445, "learning_rate": 5.3067324289596534e-06, "loss": 0.4657, "num_input_tokens_seen": 103363104, "step": 84985 }, { "epoch": 10.649041473499562, "grad_norm": 0.2908192276954651, "learning_rate": 5.306186743009283e-06, "loss": 0.464, "num_input_tokens_seen": 103369088, "step": 84990 }, { "epoch": 10.649667961408344, "grad_norm": 0.3290329873561859, "learning_rate": 5.305641053398193e-06, "loss": 0.4664, "num_input_tokens_seen": 103375104, "step": 84995 }, { "epoch": 10.650294449317128, "grad_norm": 0.28867578506469727, "learning_rate": 5.305095360132908e-06, "loss": 0.4504, "num_input_tokens_seen": 103381152, "step": 85000 }, { "epoch": 10.650920937225912, "grad_norm": 0.39732825756073, "learning_rate": 5.304549663219951e-06, "loss": 0.4624, "num_input_tokens_seen": 103387104, "step": 85005 }, { "epoch": 10.651547425134694, "grad_norm": 0.29920098185539246, "learning_rate": 5.304003962665846e-06, "loss": 0.4605, "num_input_tokens_seen": 103393088, "step": 85010 }, { "epoch": 10.652173913043478, "grad_norm": 0.2767030894756317, "learning_rate": 5.3034582584771185e-06, "loss": 0.4608, "num_input_tokens_seen": 103399008, "step": 85015 }, { "epoch": 10.652800400952263, "grad_norm": 0.2578553855419159, "learning_rate": 5.302912550660294e-06, "loss": 0.4773, "num_input_tokens_seen": 103405312, "step": 85020 }, { "epoch": 10.653426888861045, "grad_norm": 0.21065160632133484, "learning_rate": 5.302366839221894e-06, "loss": 0.4504, "num_input_tokens_seen": 103411744, "step": 85025 }, { "epoch": 10.654053376769829, "grad_norm": 0.20405271649360657, "learning_rate": 5.301821124168445e-06, "loss": 0.4675, "num_input_tokens_seen": 103417856, "step": 85030 }, { "epoch": 10.654679864678611, "grad_norm": 0.19623661041259766, "learning_rate": 5.30127540550647e-06, "loss": 0.4714, "num_input_tokens_seen": 103423968, "step": 85035 }, { "epoch": 10.655306352587395, "grad_norm": 0.16471657156944275, "learning_rate": 5.300729683242494e-06, "loss": 0.4609, "num_input_tokens_seen": 103430144, "step": 85040 }, { "epoch": 10.655932840496178, "grad_norm": 0.16910827159881592, "learning_rate": 5.3001839573830425e-06, "loss": 0.4703, "num_input_tokens_seen": 103435776, "step": 85045 }, { "epoch": 10.656559328404962, "grad_norm": 0.20975549519062042, "learning_rate": 5.299638227934639e-06, "loss": 0.4483, "num_input_tokens_seen": 103441536, "step": 85050 }, { "epoch": 10.657185816313746, "grad_norm": 0.29843229055404663, "learning_rate": 5.299092494903807e-06, "loss": 0.4537, "num_input_tokens_seen": 103447744, "step": 85055 }, { "epoch": 10.657812304222528, "grad_norm": 0.19700565934181213, "learning_rate": 5.298546758297075e-06, "loss": 0.4512, "num_input_tokens_seen": 103454112, "step": 85060 }, { "epoch": 10.658438792131312, "grad_norm": 0.1919250339269638, "learning_rate": 5.298001018120964e-06, "loss": 0.4525, "num_input_tokens_seen": 103460256, "step": 85065 }, { "epoch": 10.659065280040096, "grad_norm": 0.25917863845825195, "learning_rate": 5.297455274382001e-06, "loss": 0.4589, "num_input_tokens_seen": 103466656, "step": 85070 }, { "epoch": 10.659691767948878, "grad_norm": 0.19823026657104492, "learning_rate": 5.29690952708671e-06, "loss": 0.4709, "num_input_tokens_seen": 103472960, "step": 85075 }, { "epoch": 10.660318255857662, "grad_norm": 0.1796870082616806, "learning_rate": 5.296363776241617e-06, "loss": 0.4566, "num_input_tokens_seen": 103479168, "step": 85080 }, { "epoch": 10.660944743766445, "grad_norm": 0.24330076575279236, "learning_rate": 5.295818021853245e-06, "loss": 0.4694, "num_input_tokens_seen": 103485184, "step": 85085 }, { "epoch": 10.661571231675229, "grad_norm": 0.20519590377807617, "learning_rate": 5.295272263928119e-06, "loss": 0.4721, "num_input_tokens_seen": 103491296, "step": 85090 }, { "epoch": 10.662197719584013, "grad_norm": 0.25967493653297424, "learning_rate": 5.2947265024727665e-06, "loss": 0.4729, "num_input_tokens_seen": 103497312, "step": 85095 }, { "epoch": 10.662824207492795, "grad_norm": 0.19620154798030853, "learning_rate": 5.29418073749371e-06, "loss": 0.4585, "num_input_tokens_seen": 103503232, "step": 85100 }, { "epoch": 10.663450695401579, "grad_norm": 0.18179002404212952, "learning_rate": 5.293634968997476e-06, "loss": 0.4545, "num_input_tokens_seen": 103509216, "step": 85105 }, { "epoch": 10.664077183310361, "grad_norm": 0.1780112087726593, "learning_rate": 5.293089196990588e-06, "loss": 0.4659, "num_input_tokens_seen": 103515520, "step": 85110 }, { "epoch": 10.664703671219145, "grad_norm": 0.20795883238315582, "learning_rate": 5.292543421479573e-06, "loss": 0.4615, "num_input_tokens_seen": 103521824, "step": 85115 }, { "epoch": 10.66533015912793, "grad_norm": 0.23709312081336975, "learning_rate": 5.291997642470954e-06, "loss": 0.4723, "num_input_tokens_seen": 103528032, "step": 85120 }, { "epoch": 10.665956647036712, "grad_norm": 0.19740030169487, "learning_rate": 5.29145185997126e-06, "loss": 0.4634, "num_input_tokens_seen": 103534528, "step": 85125 }, { "epoch": 10.666583134945496, "grad_norm": 0.20621688663959503, "learning_rate": 5.290906073987012e-06, "loss": 0.4728, "num_input_tokens_seen": 103540928, "step": 85130 }, { "epoch": 10.667209622854278, "grad_norm": 0.21578261256217957, "learning_rate": 5.290360284524738e-06, "loss": 0.4783, "num_input_tokens_seen": 103546688, "step": 85135 }, { "epoch": 10.667836110763062, "grad_norm": 0.19515857100486755, "learning_rate": 5.289814491590963e-06, "loss": 0.4711, "num_input_tokens_seen": 103552736, "step": 85140 }, { "epoch": 10.668462598671846, "grad_norm": 0.18842719495296478, "learning_rate": 5.289268695192214e-06, "loss": 0.4677, "num_input_tokens_seen": 103559008, "step": 85145 }, { "epoch": 10.669089086580628, "grad_norm": 0.26076918840408325, "learning_rate": 5.288722895335011e-06, "loss": 0.4587, "num_input_tokens_seen": 103565088, "step": 85150 }, { "epoch": 10.669715574489413, "grad_norm": 0.2480604350566864, "learning_rate": 5.288177092025885e-06, "loss": 0.4627, "num_input_tokens_seen": 103570720, "step": 85155 }, { "epoch": 10.670342062398195, "grad_norm": 0.21287193894386292, "learning_rate": 5.28763128527136e-06, "loss": 0.4732, "num_input_tokens_seen": 103577216, "step": 85160 }, { "epoch": 10.670968550306979, "grad_norm": 0.24492312967777252, "learning_rate": 5.287085475077959e-06, "loss": 0.4544, "num_input_tokens_seen": 103583584, "step": 85165 }, { "epoch": 10.671595038215763, "grad_norm": 0.17383210361003876, "learning_rate": 5.286539661452211e-06, "loss": 0.4626, "num_input_tokens_seen": 103589920, "step": 85170 }, { "epoch": 10.672221526124545, "grad_norm": 0.2119065225124359, "learning_rate": 5.28599384440064e-06, "loss": 0.4525, "num_input_tokens_seen": 103596032, "step": 85175 }, { "epoch": 10.67284801403333, "grad_norm": 0.2246084362268448, "learning_rate": 5.2854480239297725e-06, "loss": 0.462, "num_input_tokens_seen": 103601920, "step": 85180 }, { "epoch": 10.673474501942113, "grad_norm": 0.2091524451971054, "learning_rate": 5.284902200046135e-06, "loss": 0.461, "num_input_tokens_seen": 103607904, "step": 85185 }, { "epoch": 10.674100989850896, "grad_norm": 0.1702851951122284, "learning_rate": 5.284356372756251e-06, "loss": 0.4563, "num_input_tokens_seen": 103613984, "step": 85190 }, { "epoch": 10.67472747775968, "grad_norm": 0.19943854212760925, "learning_rate": 5.2838105420666476e-06, "loss": 0.4671, "num_input_tokens_seen": 103620096, "step": 85195 }, { "epoch": 10.675353965668462, "grad_norm": 0.15870928764343262, "learning_rate": 5.283264707983851e-06, "loss": 0.4531, "num_input_tokens_seen": 103626304, "step": 85200 }, { "epoch": 10.675980453577246, "grad_norm": 0.19703936576843262, "learning_rate": 5.2827188705143865e-06, "loss": 0.4538, "num_input_tokens_seen": 103632096, "step": 85205 }, { "epoch": 10.67660694148603, "grad_norm": 0.1647169440984726, "learning_rate": 5.28217302966478e-06, "loss": 0.47, "num_input_tokens_seen": 103638208, "step": 85210 }, { "epoch": 10.677233429394812, "grad_norm": 0.3229014575481415, "learning_rate": 5.281627185441557e-06, "loss": 0.4533, "num_input_tokens_seen": 103644416, "step": 85215 }, { "epoch": 10.677859917303596, "grad_norm": 0.23469296097755432, "learning_rate": 5.281081337851245e-06, "loss": 0.4644, "num_input_tokens_seen": 103650624, "step": 85220 }, { "epoch": 10.678486405212379, "grad_norm": 0.1435651183128357, "learning_rate": 5.28053548690037e-06, "loss": 0.4589, "num_input_tokens_seen": 103656544, "step": 85225 }, { "epoch": 10.679112893121163, "grad_norm": 0.19604195654392242, "learning_rate": 5.279989632595456e-06, "loss": 0.471, "num_input_tokens_seen": 103662560, "step": 85230 }, { "epoch": 10.679739381029947, "grad_norm": 0.163587287068367, "learning_rate": 5.279443774943032e-06, "loss": 0.4557, "num_input_tokens_seen": 103668768, "step": 85235 }, { "epoch": 10.680365868938729, "grad_norm": 0.2289879322052002, "learning_rate": 5.278897913949621e-06, "loss": 0.459, "num_input_tokens_seen": 103674880, "step": 85240 }, { "epoch": 10.680992356847513, "grad_norm": 0.2115790694952011, "learning_rate": 5.278352049621751e-06, "loss": 0.4575, "num_input_tokens_seen": 103681120, "step": 85245 }, { "epoch": 10.681618844756295, "grad_norm": 0.23933005332946777, "learning_rate": 5.2778061819659496e-06, "loss": 0.4531, "num_input_tokens_seen": 103687264, "step": 85250 }, { "epoch": 10.68224533266508, "grad_norm": 0.1850014477968216, "learning_rate": 5.277260310988741e-06, "loss": 0.4698, "num_input_tokens_seen": 103693600, "step": 85255 }, { "epoch": 10.682871820573864, "grad_norm": 0.290355384349823, "learning_rate": 5.2767144366966525e-06, "loss": 0.464, "num_input_tokens_seen": 103699968, "step": 85260 }, { "epoch": 10.683498308482646, "grad_norm": 0.2131446897983551, "learning_rate": 5.276168559096211e-06, "loss": 0.4475, "num_input_tokens_seen": 103705920, "step": 85265 }, { "epoch": 10.68412479639143, "grad_norm": 0.3026600778102875, "learning_rate": 5.27562267819394e-06, "loss": 0.4609, "num_input_tokens_seen": 103712160, "step": 85270 }, { "epoch": 10.684751284300212, "grad_norm": 0.1559247523546219, "learning_rate": 5.27507679399637e-06, "loss": 0.4567, "num_input_tokens_seen": 103718432, "step": 85275 }, { "epoch": 10.685377772208996, "grad_norm": 0.19914838671684265, "learning_rate": 5.274530906510026e-06, "loss": 0.4596, "num_input_tokens_seen": 103724576, "step": 85280 }, { "epoch": 10.68600426011778, "grad_norm": 0.18873801827430725, "learning_rate": 5.273985015741433e-06, "loss": 0.4744, "num_input_tokens_seen": 103730720, "step": 85285 }, { "epoch": 10.686630748026563, "grad_norm": 0.1747121512889862, "learning_rate": 5.273439121697119e-06, "loss": 0.4633, "num_input_tokens_seen": 103737120, "step": 85290 }, { "epoch": 10.687257235935347, "grad_norm": 0.23492255806922913, "learning_rate": 5.272893224383612e-06, "loss": 0.4545, "num_input_tokens_seen": 103742848, "step": 85295 }, { "epoch": 10.68788372384413, "grad_norm": 0.17812518775463104, "learning_rate": 5.272347323807434e-06, "loss": 0.4628, "num_input_tokens_seen": 103749216, "step": 85300 }, { "epoch": 10.688510211752913, "grad_norm": 0.24269428849220276, "learning_rate": 5.2718014199751164e-06, "loss": 0.4759, "num_input_tokens_seen": 103754880, "step": 85305 }, { "epoch": 10.689136699661697, "grad_norm": 0.22469623386859894, "learning_rate": 5.2712555128931855e-06, "loss": 0.4726, "num_input_tokens_seen": 103761056, "step": 85310 }, { "epoch": 10.68976318757048, "grad_norm": 0.1561453491449356, "learning_rate": 5.270709602568164e-06, "loss": 0.4653, "num_input_tokens_seen": 103767072, "step": 85315 }, { "epoch": 10.690389675479263, "grad_norm": 0.1255338042974472, "learning_rate": 5.270163689006583e-06, "loss": 0.4666, "num_input_tokens_seen": 103773536, "step": 85320 }, { "epoch": 10.691016163388047, "grad_norm": 0.17532041668891907, "learning_rate": 5.2696177722149685e-06, "loss": 0.4581, "num_input_tokens_seen": 103779488, "step": 85325 }, { "epoch": 10.69164265129683, "grad_norm": 0.12730908393859863, "learning_rate": 5.269071852199847e-06, "loss": 0.4567, "num_input_tokens_seen": 103785824, "step": 85330 }, { "epoch": 10.692269139205614, "grad_norm": 0.17317914962768555, "learning_rate": 5.268525928967746e-06, "loss": 0.4595, "num_input_tokens_seen": 103791840, "step": 85335 }, { "epoch": 10.692895627114396, "grad_norm": 0.2141038477420807, "learning_rate": 5.26798000252519e-06, "loss": 0.4638, "num_input_tokens_seen": 103797888, "step": 85340 }, { "epoch": 10.69352211502318, "grad_norm": 0.21082690358161926, "learning_rate": 5.267434072878709e-06, "loss": 0.4661, "num_input_tokens_seen": 103804064, "step": 85345 }, { "epoch": 10.694148602931964, "grad_norm": 0.1477644443511963, "learning_rate": 5.266888140034828e-06, "loss": 0.4612, "num_input_tokens_seen": 103810336, "step": 85350 }, { "epoch": 10.694775090840746, "grad_norm": 0.21826522052288055, "learning_rate": 5.2663422040000755e-06, "loss": 0.4616, "num_input_tokens_seen": 103816512, "step": 85355 }, { "epoch": 10.69540157874953, "grad_norm": 0.2091919630765915, "learning_rate": 5.265796264780977e-06, "loss": 0.4674, "num_input_tokens_seen": 103822688, "step": 85360 }, { "epoch": 10.696028066658313, "grad_norm": 0.15079502761363983, "learning_rate": 5.265250322384061e-06, "loss": 0.4631, "num_input_tokens_seen": 103828640, "step": 85365 }, { "epoch": 10.696654554567097, "grad_norm": 0.2367531657218933, "learning_rate": 5.264704376815856e-06, "loss": 0.4618, "num_input_tokens_seen": 103834912, "step": 85370 }, { "epoch": 10.69728104247588, "grad_norm": 0.1361660361289978, "learning_rate": 5.264158428082885e-06, "loss": 0.463, "num_input_tokens_seen": 103841280, "step": 85375 }, { "epoch": 10.697907530384663, "grad_norm": 0.18032123148441315, "learning_rate": 5.26361247619168e-06, "loss": 0.457, "num_input_tokens_seen": 103847488, "step": 85380 }, { "epoch": 10.698534018293447, "grad_norm": 0.18048642575740814, "learning_rate": 5.263066521148765e-06, "loss": 0.4555, "num_input_tokens_seen": 103854048, "step": 85385 }, { "epoch": 10.69916050620223, "grad_norm": 0.15387162566184998, "learning_rate": 5.262520562960669e-06, "loss": 0.4644, "num_input_tokens_seen": 103860416, "step": 85390 }, { "epoch": 10.699786994111014, "grad_norm": 0.14046387374401093, "learning_rate": 5.261974601633919e-06, "loss": 0.4714, "num_input_tokens_seen": 103866848, "step": 85395 }, { "epoch": 10.700413482019798, "grad_norm": 0.38538822531700134, "learning_rate": 5.261428637175043e-06, "loss": 0.4662, "num_input_tokens_seen": 103872768, "step": 85400 }, { "epoch": 10.70103996992858, "grad_norm": 0.17714351415634155, "learning_rate": 5.260882669590567e-06, "loss": 0.4666, "num_input_tokens_seen": 103878272, "step": 85405 }, { "epoch": 10.701666457837364, "grad_norm": 0.32805684208869934, "learning_rate": 5.26033669888702e-06, "loss": 0.4737, "num_input_tokens_seen": 103884320, "step": 85410 }, { "epoch": 10.702292945746148, "grad_norm": 0.14117421209812164, "learning_rate": 5.259790725070929e-06, "loss": 0.459, "num_input_tokens_seen": 103890208, "step": 85415 }, { "epoch": 10.70291943365493, "grad_norm": 0.14862415194511414, "learning_rate": 5.25924474814882e-06, "loss": 0.4559, "num_input_tokens_seen": 103896544, "step": 85420 }, { "epoch": 10.703545921563714, "grad_norm": 0.17538879811763763, "learning_rate": 5.2586987681272226e-06, "loss": 0.4577, "num_input_tokens_seen": 103902848, "step": 85425 }, { "epoch": 10.704172409472497, "grad_norm": 0.2168884128332138, "learning_rate": 5.258152785012663e-06, "loss": 0.4579, "num_input_tokens_seen": 103909280, "step": 85430 }, { "epoch": 10.70479889738128, "grad_norm": 0.22150476276874542, "learning_rate": 5.257606798811672e-06, "loss": 0.4548, "num_input_tokens_seen": 103915648, "step": 85435 }, { "epoch": 10.705425385290065, "grad_norm": 0.3270590305328369, "learning_rate": 5.257060809530774e-06, "loss": 0.4654, "num_input_tokens_seen": 103922048, "step": 85440 }, { "epoch": 10.706051873198847, "grad_norm": 0.23543404042720795, "learning_rate": 5.256514817176499e-06, "loss": 0.4566, "num_input_tokens_seen": 103927680, "step": 85445 }, { "epoch": 10.706678361107631, "grad_norm": 0.15925058722496033, "learning_rate": 5.255968821755373e-06, "loss": 0.466, "num_input_tokens_seen": 103933600, "step": 85450 }, { "epoch": 10.707304849016413, "grad_norm": 0.24355186522006989, "learning_rate": 5.2554228232739245e-06, "loss": 0.46, "num_input_tokens_seen": 103940000, "step": 85455 }, { "epoch": 10.707931336925197, "grad_norm": 0.11983557790517807, "learning_rate": 5.254876821738681e-06, "loss": 0.467, "num_input_tokens_seen": 103946240, "step": 85460 }, { "epoch": 10.708557824833981, "grad_norm": 0.13233815133571625, "learning_rate": 5.254330817156171e-06, "loss": 0.4603, "num_input_tokens_seen": 103952224, "step": 85465 }, { "epoch": 10.709184312742764, "grad_norm": 0.18235701322555542, "learning_rate": 5.2537848095329225e-06, "loss": 0.4615, "num_input_tokens_seen": 103958560, "step": 85470 }, { "epoch": 10.709810800651548, "grad_norm": 0.16758738458156586, "learning_rate": 5.253238798875463e-06, "loss": 0.4551, "num_input_tokens_seen": 103964704, "step": 85475 }, { "epoch": 10.71043728856033, "grad_norm": 0.1781541258096695, "learning_rate": 5.252692785190321e-06, "loss": 0.4658, "num_input_tokens_seen": 103970784, "step": 85480 }, { "epoch": 10.711063776469114, "grad_norm": 0.16248437762260437, "learning_rate": 5.252146768484023e-06, "loss": 0.4578, "num_input_tokens_seen": 103976768, "step": 85485 }, { "epoch": 10.711690264377898, "grad_norm": 0.20257754623889923, "learning_rate": 5.251600748763099e-06, "loss": 0.4651, "num_input_tokens_seen": 103982400, "step": 85490 }, { "epoch": 10.71231675228668, "grad_norm": 0.16021181643009186, "learning_rate": 5.2510547260340786e-06, "loss": 0.456, "num_input_tokens_seen": 103988032, "step": 85495 }, { "epoch": 10.712943240195465, "grad_norm": 0.20577438175678253, "learning_rate": 5.250508700303486e-06, "loss": 0.4621, "num_input_tokens_seen": 103994208, "step": 85500 }, { "epoch": 10.713569728104247, "grad_norm": 0.19208814203739166, "learning_rate": 5.249962671577852e-06, "loss": 0.4486, "num_input_tokens_seen": 103999808, "step": 85505 }, { "epoch": 10.71419621601303, "grad_norm": 0.2218731790781021, "learning_rate": 5.249416639863703e-06, "loss": 0.4659, "num_input_tokens_seen": 104005920, "step": 85510 }, { "epoch": 10.714822703921815, "grad_norm": 0.17120683193206787, "learning_rate": 5.24887060516757e-06, "loss": 0.4599, "num_input_tokens_seen": 104011872, "step": 85515 }, { "epoch": 10.715449191830597, "grad_norm": 0.16786350309848785, "learning_rate": 5.2483245674959785e-06, "loss": 0.4552, "num_input_tokens_seen": 104018304, "step": 85520 }, { "epoch": 10.716075679739381, "grad_norm": 0.13852953910827637, "learning_rate": 5.247778526855457e-06, "loss": 0.473, "num_input_tokens_seen": 104024512, "step": 85525 }, { "epoch": 10.716702167648165, "grad_norm": 0.1385393738746643, "learning_rate": 5.247232483252537e-06, "loss": 0.4716, "num_input_tokens_seen": 104030528, "step": 85530 }, { "epoch": 10.717328655556948, "grad_norm": 0.127506285905838, "learning_rate": 5.246686436693744e-06, "loss": 0.4576, "num_input_tokens_seen": 104036640, "step": 85535 }, { "epoch": 10.717955143465732, "grad_norm": 0.16610826551914215, "learning_rate": 5.2461403871856075e-06, "loss": 0.4591, "num_input_tokens_seen": 104042816, "step": 85540 }, { "epoch": 10.718581631374514, "grad_norm": 0.1992230862379074, "learning_rate": 5.245594334734655e-06, "loss": 0.4696, "num_input_tokens_seen": 104048736, "step": 85545 }, { "epoch": 10.719208119283298, "grad_norm": 0.34166282415390015, "learning_rate": 5.245048279347416e-06, "loss": 0.4726, "num_input_tokens_seen": 104054880, "step": 85550 }, { "epoch": 10.71983460719208, "grad_norm": 0.1697053462266922, "learning_rate": 5.2445022210304185e-06, "loss": 0.4639, "num_input_tokens_seen": 104061152, "step": 85555 }, { "epoch": 10.720461095100864, "grad_norm": 0.16147202253341675, "learning_rate": 5.243956159790192e-06, "loss": 0.4692, "num_input_tokens_seen": 104067200, "step": 85560 }, { "epoch": 10.721087583009648, "grad_norm": 0.1791287064552307, "learning_rate": 5.243410095633264e-06, "loss": 0.453, "num_input_tokens_seen": 104073248, "step": 85565 }, { "epoch": 10.72171407091843, "grad_norm": 0.2050285041332245, "learning_rate": 5.242864028566165e-06, "loss": 0.4477, "num_input_tokens_seen": 104079168, "step": 85570 }, { "epoch": 10.722340558827215, "grad_norm": 0.16674038767814636, "learning_rate": 5.242317958595421e-06, "loss": 0.4596, "num_input_tokens_seen": 104085696, "step": 85575 }, { "epoch": 10.722967046735999, "grad_norm": 0.17386208474636078, "learning_rate": 5.241771885727562e-06, "loss": 0.4568, "num_input_tokens_seen": 104091776, "step": 85580 }, { "epoch": 10.723593534644781, "grad_norm": 0.21122264862060547, "learning_rate": 5.241225809969117e-06, "loss": 0.4576, "num_input_tokens_seen": 104097984, "step": 85585 }, { "epoch": 10.724220022553565, "grad_norm": 0.16687321662902832, "learning_rate": 5.240679731326616e-06, "loss": 0.4588, "num_input_tokens_seen": 104104128, "step": 85590 }, { "epoch": 10.724846510462347, "grad_norm": 0.17351609468460083, "learning_rate": 5.240133649806585e-06, "loss": 0.4631, "num_input_tokens_seen": 104109632, "step": 85595 }, { "epoch": 10.725472998371131, "grad_norm": 0.20863398909568787, "learning_rate": 5.239587565415555e-06, "loss": 0.4582, "num_input_tokens_seen": 104115712, "step": 85600 }, { "epoch": 10.726099486279915, "grad_norm": 0.16347338259220123, "learning_rate": 5.239041478160054e-06, "loss": 0.4577, "num_input_tokens_seen": 104121600, "step": 85605 }, { "epoch": 10.726725974188698, "grad_norm": 0.12056471407413483, "learning_rate": 5.238495388046609e-06, "loss": 0.4624, "num_input_tokens_seen": 104127488, "step": 85610 }, { "epoch": 10.727352462097482, "grad_norm": 0.12612776458263397, "learning_rate": 5.237949295081755e-06, "loss": 0.4627, "num_input_tokens_seen": 104133344, "step": 85615 }, { "epoch": 10.727978950006264, "grad_norm": 0.1394062489271164, "learning_rate": 5.237403199272014e-06, "loss": 0.4543, "num_input_tokens_seen": 104139520, "step": 85620 }, { "epoch": 10.728605437915048, "grad_norm": 0.1447346806526184, "learning_rate": 5.23685710062392e-06, "loss": 0.4536, "num_input_tokens_seen": 104145312, "step": 85625 }, { "epoch": 10.729231925823832, "grad_norm": 0.11599637567996979, "learning_rate": 5.236310999143999e-06, "loss": 0.4636, "num_input_tokens_seen": 104151584, "step": 85630 }, { "epoch": 10.729858413732615, "grad_norm": 0.2122793197631836, "learning_rate": 5.235764894838782e-06, "loss": 0.467, "num_input_tokens_seen": 104157888, "step": 85635 }, { "epoch": 10.730484901641399, "grad_norm": 0.16681502759456635, "learning_rate": 5.2352187877147975e-06, "loss": 0.462, "num_input_tokens_seen": 104164160, "step": 85640 }, { "epoch": 10.731111389550183, "grad_norm": 0.1488182097673416, "learning_rate": 5.234672677778575e-06, "loss": 0.4685, "num_input_tokens_seen": 104170624, "step": 85645 }, { "epoch": 10.731737877458965, "grad_norm": 0.14660565555095673, "learning_rate": 5.234126565036644e-06, "loss": 0.4591, "num_input_tokens_seen": 104176992, "step": 85650 }, { "epoch": 10.732364365367749, "grad_norm": 0.202377051115036, "learning_rate": 5.233580449495531e-06, "loss": 0.4612, "num_input_tokens_seen": 104183232, "step": 85655 }, { "epoch": 10.732990853276531, "grad_norm": 0.18161392211914062, "learning_rate": 5.2330343311617696e-06, "loss": 0.4684, "num_input_tokens_seen": 104188992, "step": 85660 }, { "epoch": 10.733617341185315, "grad_norm": 0.1541350930929184, "learning_rate": 5.232488210041885e-06, "loss": 0.4583, "num_input_tokens_seen": 104194784, "step": 85665 }, { "epoch": 10.734243829094098, "grad_norm": 0.13944709300994873, "learning_rate": 5.2319420861424085e-06, "loss": 0.4688, "num_input_tokens_seen": 104200864, "step": 85670 }, { "epoch": 10.734870317002882, "grad_norm": 0.15270468592643738, "learning_rate": 5.231395959469869e-06, "loss": 0.4624, "num_input_tokens_seen": 104206816, "step": 85675 }, { "epoch": 10.735496804911666, "grad_norm": 0.15226277709007263, "learning_rate": 5.230849830030798e-06, "loss": 0.4623, "num_input_tokens_seen": 104213088, "step": 85680 }, { "epoch": 10.736123292820448, "grad_norm": 0.17259953916072845, "learning_rate": 5.230303697831721e-06, "loss": 0.4643, "num_input_tokens_seen": 104219392, "step": 85685 }, { "epoch": 10.736749780729232, "grad_norm": 0.15591493248939514, "learning_rate": 5.229757562879172e-06, "loss": 0.4644, "num_input_tokens_seen": 104225472, "step": 85690 }, { "epoch": 10.737376268638016, "grad_norm": 0.20289285480976105, "learning_rate": 5.229211425179677e-06, "loss": 0.4561, "num_input_tokens_seen": 104231680, "step": 85695 }, { "epoch": 10.738002756546798, "grad_norm": 0.1670878678560257, "learning_rate": 5.228665284739767e-06, "loss": 0.462, "num_input_tokens_seen": 104237248, "step": 85700 }, { "epoch": 10.738629244455582, "grad_norm": 0.14641104638576508, "learning_rate": 5.22811914156597e-06, "loss": 0.4568, "num_input_tokens_seen": 104242944, "step": 85705 }, { "epoch": 10.739255732364365, "grad_norm": 0.13771972060203552, "learning_rate": 5.227572995664819e-06, "loss": 0.4697, "num_input_tokens_seen": 104249184, "step": 85710 }, { "epoch": 10.739882220273149, "grad_norm": 0.14725428819656372, "learning_rate": 5.227026847042841e-06, "loss": 0.4628, "num_input_tokens_seen": 104255328, "step": 85715 }, { "epoch": 10.740508708181933, "grad_norm": 0.13878795504570007, "learning_rate": 5.226480695706565e-06, "loss": 0.4637, "num_input_tokens_seen": 104260704, "step": 85720 }, { "epoch": 10.741135196090715, "grad_norm": 0.17938901484012604, "learning_rate": 5.225934541662523e-06, "loss": 0.464, "num_input_tokens_seen": 104266688, "step": 85725 }, { "epoch": 10.7417616839995, "grad_norm": 0.20667515695095062, "learning_rate": 5.225388384917243e-06, "loss": 0.4597, "num_input_tokens_seen": 104272832, "step": 85730 }, { "epoch": 10.742388171908281, "grad_norm": 0.2053113430738449, "learning_rate": 5.2248422254772545e-06, "loss": 0.4736, "num_input_tokens_seen": 104278976, "step": 85735 }, { "epoch": 10.743014659817065, "grad_norm": 0.1468617469072342, "learning_rate": 5.224296063349088e-06, "loss": 0.4729, "num_input_tokens_seen": 104285280, "step": 85740 }, { "epoch": 10.74364114772585, "grad_norm": 0.15198026597499847, "learning_rate": 5.223749898539275e-06, "loss": 0.4595, "num_input_tokens_seen": 104290720, "step": 85745 }, { "epoch": 10.744267635634632, "grad_norm": 0.16308996081352234, "learning_rate": 5.223203731054343e-06, "loss": 0.4546, "num_input_tokens_seen": 104296064, "step": 85750 }, { "epoch": 10.744894123543416, "grad_norm": 0.16403856873512268, "learning_rate": 5.222657560900825e-06, "loss": 0.4565, "num_input_tokens_seen": 104301920, "step": 85755 }, { "epoch": 10.745520611452198, "grad_norm": 0.20303335785865784, "learning_rate": 5.222111388085246e-06, "loss": 0.4682, "num_input_tokens_seen": 104308320, "step": 85760 }, { "epoch": 10.746147099360982, "grad_norm": 0.15671740472316742, "learning_rate": 5.22156521261414e-06, "loss": 0.4613, "num_input_tokens_seen": 104314368, "step": 85765 }, { "epoch": 10.746773587269766, "grad_norm": 0.21627679467201233, "learning_rate": 5.221019034494034e-06, "loss": 0.4692, "num_input_tokens_seen": 104320640, "step": 85770 }, { "epoch": 10.747400075178549, "grad_norm": 0.14245246350765228, "learning_rate": 5.220472853731462e-06, "loss": 0.4621, "num_input_tokens_seen": 104326944, "step": 85775 }, { "epoch": 10.748026563087333, "grad_norm": 0.14814133942127228, "learning_rate": 5.21992667033295e-06, "loss": 0.463, "num_input_tokens_seen": 104332992, "step": 85780 }, { "epoch": 10.748653050996115, "grad_norm": 0.2361305058002472, "learning_rate": 5.21938048430503e-06, "loss": 0.4716, "num_input_tokens_seen": 104339104, "step": 85785 }, { "epoch": 10.749279538904899, "grad_norm": 0.15481455624103546, "learning_rate": 5.218834295654232e-06, "loss": 0.4646, "num_input_tokens_seen": 104345376, "step": 85790 }, { "epoch": 10.749906026813683, "grad_norm": 0.12469926476478577, "learning_rate": 5.2182881043870844e-06, "loss": 0.4603, "num_input_tokens_seen": 104351552, "step": 85795 }, { "epoch": 10.750532514722465, "grad_norm": 0.14154067635536194, "learning_rate": 5.217741910510121e-06, "loss": 0.4701, "num_input_tokens_seen": 104357184, "step": 85800 }, { "epoch": 10.75115900263125, "grad_norm": 0.1598447859287262, "learning_rate": 5.21719571402987e-06, "loss": 0.4705, "num_input_tokens_seen": 104363328, "step": 85805 }, { "epoch": 10.751785490540033, "grad_norm": 0.14229251444339752, "learning_rate": 5.21664951495286e-06, "loss": 0.4585, "num_input_tokens_seen": 104369312, "step": 85810 }, { "epoch": 10.752411978448816, "grad_norm": 0.14917409420013428, "learning_rate": 5.216103313285625e-06, "loss": 0.4576, "num_input_tokens_seen": 104375488, "step": 85815 }, { "epoch": 10.7530384663576, "grad_norm": 0.1315203458070755, "learning_rate": 5.215557109034693e-06, "loss": 0.4623, "num_input_tokens_seen": 104381600, "step": 85820 }, { "epoch": 10.753664954266382, "grad_norm": 0.13594695925712585, "learning_rate": 5.215010902206594e-06, "loss": 0.4706, "num_input_tokens_seen": 104387936, "step": 85825 }, { "epoch": 10.754291442175166, "grad_norm": 0.17642371356487274, "learning_rate": 5.214464692807859e-06, "loss": 0.4678, "num_input_tokens_seen": 104394208, "step": 85830 }, { "epoch": 10.75491793008395, "grad_norm": 0.12065010517835617, "learning_rate": 5.213918480845017e-06, "loss": 0.4559, "num_input_tokens_seen": 104400192, "step": 85835 }, { "epoch": 10.755544417992732, "grad_norm": 0.14655090868473053, "learning_rate": 5.213372266324601e-06, "loss": 0.4699, "num_input_tokens_seen": 104406624, "step": 85840 }, { "epoch": 10.756170905901516, "grad_norm": 0.1672762781381607, "learning_rate": 5.2128260492531394e-06, "loss": 0.4646, "num_input_tokens_seen": 104412832, "step": 85845 }, { "epoch": 10.756797393810299, "grad_norm": 0.2081456333398819, "learning_rate": 5.212279829637165e-06, "loss": 0.4609, "num_input_tokens_seen": 104419008, "step": 85850 }, { "epoch": 10.757423881719083, "grad_norm": 0.1606132835149765, "learning_rate": 5.2117336074832035e-06, "loss": 0.4623, "num_input_tokens_seen": 104424928, "step": 85855 }, { "epoch": 10.758050369627867, "grad_norm": 0.14371569454669952, "learning_rate": 5.211187382797791e-06, "loss": 0.4554, "num_input_tokens_seen": 104431168, "step": 85860 }, { "epoch": 10.75867685753665, "grad_norm": 0.16083455085754395, "learning_rate": 5.2106411555874555e-06, "loss": 0.4656, "num_input_tokens_seen": 104437312, "step": 85865 }, { "epoch": 10.759303345445433, "grad_norm": 0.12905162572860718, "learning_rate": 5.210094925858728e-06, "loss": 0.4574, "num_input_tokens_seen": 104443392, "step": 85870 }, { "epoch": 10.759929833354215, "grad_norm": 0.14057902991771698, "learning_rate": 5.20954869361814e-06, "loss": 0.4556, "num_input_tokens_seen": 104449792, "step": 85875 }, { "epoch": 10.760556321263, "grad_norm": 0.1413668394088745, "learning_rate": 5.209002458872221e-06, "loss": 0.4608, "num_input_tokens_seen": 104456288, "step": 85880 }, { "epoch": 10.761182809171784, "grad_norm": 0.15004082024097443, "learning_rate": 5.208456221627501e-06, "loss": 0.4556, "num_input_tokens_seen": 104462112, "step": 85885 }, { "epoch": 10.761809297080566, "grad_norm": 0.1373155266046524, "learning_rate": 5.207909981890511e-06, "loss": 0.4615, "num_input_tokens_seen": 104468448, "step": 85890 }, { "epoch": 10.76243578498935, "grad_norm": 0.1692562997341156, "learning_rate": 5.207363739667783e-06, "loss": 0.459, "num_input_tokens_seen": 104474656, "step": 85895 }, { "epoch": 10.763062272898132, "grad_norm": 0.11652733385562897, "learning_rate": 5.206817494965848e-06, "loss": 0.4631, "num_input_tokens_seen": 104480576, "step": 85900 }, { "epoch": 10.763688760806916, "grad_norm": 0.18334268033504486, "learning_rate": 5.206271247791235e-06, "loss": 0.4643, "num_input_tokens_seen": 104486912, "step": 85905 }, { "epoch": 10.7643152487157, "grad_norm": 0.17173685133457184, "learning_rate": 5.2057249981504765e-06, "loss": 0.4506, "num_input_tokens_seen": 104493056, "step": 85910 }, { "epoch": 10.764941736624483, "grad_norm": 0.1438039243221283, "learning_rate": 5.205178746050102e-06, "loss": 0.4603, "num_input_tokens_seen": 104498976, "step": 85915 }, { "epoch": 10.765568224533267, "grad_norm": 0.15483716130256653, "learning_rate": 5.204632491496643e-06, "loss": 0.4625, "num_input_tokens_seen": 104504704, "step": 85920 }, { "epoch": 10.76619471244205, "grad_norm": 0.12358329445123672, "learning_rate": 5.20408623449663e-06, "loss": 0.4637, "num_input_tokens_seen": 104510848, "step": 85925 }, { "epoch": 10.766821200350833, "grad_norm": 0.1596851795911789, "learning_rate": 5.203539975056596e-06, "loss": 0.4633, "num_input_tokens_seen": 104516960, "step": 85930 }, { "epoch": 10.767447688259617, "grad_norm": 0.15912368893623352, "learning_rate": 5.20299371318307e-06, "loss": 0.4629, "num_input_tokens_seen": 104522592, "step": 85935 }, { "epoch": 10.7680741761684, "grad_norm": 0.11398489773273468, "learning_rate": 5.202447448882583e-06, "loss": 0.4695, "num_input_tokens_seen": 104528704, "step": 85940 }, { "epoch": 10.768700664077183, "grad_norm": 0.19593782722949982, "learning_rate": 5.201901182161666e-06, "loss": 0.4559, "num_input_tokens_seen": 104534944, "step": 85945 }, { "epoch": 10.769327151985967, "grad_norm": 0.11503256857395172, "learning_rate": 5.201354913026851e-06, "loss": 0.4548, "num_input_tokens_seen": 104541056, "step": 85950 }, { "epoch": 10.76995363989475, "grad_norm": 0.10692659765481949, "learning_rate": 5.20080864148467e-06, "loss": 0.465, "num_input_tokens_seen": 104546976, "step": 85955 }, { "epoch": 10.770580127803534, "grad_norm": 0.14182351529598236, "learning_rate": 5.200262367541652e-06, "loss": 0.4576, "num_input_tokens_seen": 104553376, "step": 85960 }, { "epoch": 10.771206615712316, "grad_norm": 0.16282588243484497, "learning_rate": 5.19971609120433e-06, "loss": 0.46, "num_input_tokens_seen": 104559648, "step": 85965 }, { "epoch": 10.7718331036211, "grad_norm": 0.20590251684188843, "learning_rate": 5.199169812479234e-06, "loss": 0.4591, "num_input_tokens_seen": 104565600, "step": 85970 }, { "epoch": 10.772459591529884, "grad_norm": 0.14444951713085175, "learning_rate": 5.198623531372894e-06, "loss": 0.4658, "num_input_tokens_seen": 104571648, "step": 85975 }, { "epoch": 10.773086079438666, "grad_norm": 0.11917243152856827, "learning_rate": 5.198077247891841e-06, "loss": 0.4605, "num_input_tokens_seen": 104578144, "step": 85980 }, { "epoch": 10.77371256734745, "grad_norm": 0.11751172691583633, "learning_rate": 5.197530962042612e-06, "loss": 0.4561, "num_input_tokens_seen": 104584288, "step": 85985 }, { "epoch": 10.774339055256233, "grad_norm": 0.1640748530626297, "learning_rate": 5.196984673831733e-06, "loss": 0.4695, "num_input_tokens_seen": 104590656, "step": 85990 }, { "epoch": 10.774965543165017, "grad_norm": 0.17078028619289398, "learning_rate": 5.196438383265735e-06, "loss": 0.4647, "num_input_tokens_seen": 104596768, "step": 85995 }, { "epoch": 10.775592031073801, "grad_norm": 0.16513021290302277, "learning_rate": 5.195892090351152e-06, "loss": 0.4618, "num_input_tokens_seen": 104603168, "step": 86000 }, { "epoch": 10.776218518982583, "grad_norm": 0.11359502375125885, "learning_rate": 5.195345795094516e-06, "loss": 0.4633, "num_input_tokens_seen": 104608896, "step": 86005 }, { "epoch": 10.776845006891367, "grad_norm": 0.2067524492740631, "learning_rate": 5.194799497502354e-06, "loss": 0.4665, "num_input_tokens_seen": 104614816, "step": 86010 }, { "epoch": 10.77747149480015, "grad_norm": 0.16953174769878387, "learning_rate": 5.194253197581203e-06, "loss": 0.4742, "num_input_tokens_seen": 104620800, "step": 86015 }, { "epoch": 10.778097982708934, "grad_norm": 0.12608428299427032, "learning_rate": 5.19370689533759e-06, "loss": 0.4634, "num_input_tokens_seen": 104626848, "step": 86020 }, { "epoch": 10.778724470617718, "grad_norm": 0.13567794859409332, "learning_rate": 5.193160590778048e-06, "loss": 0.4655, "num_input_tokens_seen": 104632832, "step": 86025 }, { "epoch": 10.7793509585265, "grad_norm": 0.1333349645137787, "learning_rate": 5.19261428390911e-06, "loss": 0.4631, "num_input_tokens_seen": 104638944, "step": 86030 }, { "epoch": 10.779977446435284, "grad_norm": 0.20249059796333313, "learning_rate": 5.192067974737306e-06, "loss": 0.4607, "num_input_tokens_seen": 104645472, "step": 86035 }, { "epoch": 10.780603934344068, "grad_norm": 0.18015000224113464, "learning_rate": 5.1915216632691656e-06, "loss": 0.462, "num_input_tokens_seen": 104651584, "step": 86040 }, { "epoch": 10.78123042225285, "grad_norm": 0.1894577443599701, "learning_rate": 5.190975349511224e-06, "loss": 0.4599, "num_input_tokens_seen": 104657888, "step": 86045 }, { "epoch": 10.781856910161634, "grad_norm": 0.16455887258052826, "learning_rate": 5.190429033470013e-06, "loss": 0.4665, "num_input_tokens_seen": 104663936, "step": 86050 }, { "epoch": 10.782483398070417, "grad_norm": 0.13666146993637085, "learning_rate": 5.189882715152061e-06, "loss": 0.4618, "num_input_tokens_seen": 104670272, "step": 86055 }, { "epoch": 10.7831098859792, "grad_norm": 0.1530078947544098, "learning_rate": 5.189336394563903e-06, "loss": 0.4636, "num_input_tokens_seen": 104675744, "step": 86060 }, { "epoch": 10.783736373887985, "grad_norm": 0.14315977692604065, "learning_rate": 5.188790071712069e-06, "loss": 0.4653, "num_input_tokens_seen": 104682112, "step": 86065 }, { "epoch": 10.784362861796767, "grad_norm": 0.16704164445400238, "learning_rate": 5.18824374660309e-06, "loss": 0.4544, "num_input_tokens_seen": 104687808, "step": 86070 }, { "epoch": 10.784989349705551, "grad_norm": 0.15908615291118622, "learning_rate": 5.1876974192434995e-06, "loss": 0.4625, "num_input_tokens_seen": 104693664, "step": 86075 }, { "epoch": 10.785615837614333, "grad_norm": 0.14891670644283295, "learning_rate": 5.187151089639828e-06, "loss": 0.4616, "num_input_tokens_seen": 104699808, "step": 86080 }, { "epoch": 10.786242325523117, "grad_norm": 0.1472751647233963, "learning_rate": 5.186604757798609e-06, "loss": 0.4612, "num_input_tokens_seen": 104706144, "step": 86085 }, { "epoch": 10.786868813431902, "grad_norm": 0.14991235733032227, "learning_rate": 5.186058423726371e-06, "loss": 0.4667, "num_input_tokens_seen": 104712128, "step": 86090 }, { "epoch": 10.787495301340684, "grad_norm": 0.16719169914722443, "learning_rate": 5.185512087429649e-06, "loss": 0.4664, "num_input_tokens_seen": 104718208, "step": 86095 }, { "epoch": 10.788121789249468, "grad_norm": 0.12758196890354156, "learning_rate": 5.184965748914975e-06, "loss": 0.4653, "num_input_tokens_seen": 104724288, "step": 86100 }, { "epoch": 10.78874827715825, "grad_norm": 0.20803402364253998, "learning_rate": 5.184419408188879e-06, "loss": 0.465, "num_input_tokens_seen": 104730592, "step": 86105 }, { "epoch": 10.789374765067034, "grad_norm": 0.18665006756782532, "learning_rate": 5.183873065257895e-06, "loss": 0.4538, "num_input_tokens_seen": 104736768, "step": 86110 }, { "epoch": 10.790001252975818, "grad_norm": 0.16069170832633972, "learning_rate": 5.183326720128553e-06, "loss": 0.4572, "num_input_tokens_seen": 104742688, "step": 86115 }, { "epoch": 10.7906277408846, "grad_norm": 0.17392541468143463, "learning_rate": 5.182780372807387e-06, "loss": 0.461, "num_input_tokens_seen": 104748896, "step": 86120 }, { "epoch": 10.791254228793385, "grad_norm": 0.17723716795444489, "learning_rate": 5.1822340233009275e-06, "loss": 0.4573, "num_input_tokens_seen": 104755200, "step": 86125 }, { "epoch": 10.791880716702167, "grad_norm": 0.15827398002147675, "learning_rate": 5.181687671615706e-06, "loss": 0.4705, "num_input_tokens_seen": 104761280, "step": 86130 }, { "epoch": 10.792507204610951, "grad_norm": 0.16747115552425385, "learning_rate": 5.181141317758257e-06, "loss": 0.4583, "num_input_tokens_seen": 104767488, "step": 86135 }, { "epoch": 10.793133692519735, "grad_norm": 0.19093623757362366, "learning_rate": 5.180594961735112e-06, "loss": 0.4605, "num_input_tokens_seen": 104773696, "step": 86140 }, { "epoch": 10.793760180428517, "grad_norm": 0.1754942387342453, "learning_rate": 5.180048603552799e-06, "loss": 0.4536, "num_input_tokens_seen": 104779808, "step": 86145 }, { "epoch": 10.794386668337301, "grad_norm": 0.2540835738182068, "learning_rate": 5.179502243217855e-06, "loss": 0.4841, "num_input_tokens_seen": 104786112, "step": 86150 }, { "epoch": 10.795013156246085, "grad_norm": 0.1471349447965622, "learning_rate": 5.178955880736813e-06, "loss": 0.459, "num_input_tokens_seen": 104792032, "step": 86155 }, { "epoch": 10.795639644154868, "grad_norm": 0.2290782332420349, "learning_rate": 5.178409516116201e-06, "loss": 0.4622, "num_input_tokens_seen": 104798080, "step": 86160 }, { "epoch": 10.796266132063652, "grad_norm": 0.2235187590122223, "learning_rate": 5.177863149362553e-06, "loss": 0.47, "num_input_tokens_seen": 104803712, "step": 86165 }, { "epoch": 10.796892619972434, "grad_norm": 0.15846040844917297, "learning_rate": 5.177316780482402e-06, "loss": 0.4609, "num_input_tokens_seen": 104809664, "step": 86170 }, { "epoch": 10.797519107881218, "grad_norm": 0.15521202981472015, "learning_rate": 5.17677040948228e-06, "loss": 0.4538, "num_input_tokens_seen": 104815648, "step": 86175 }, { "epoch": 10.79814559579, "grad_norm": 0.20513854920864105, "learning_rate": 5.1762240363687186e-06, "loss": 0.4655, "num_input_tokens_seen": 104821824, "step": 86180 }, { "epoch": 10.798772083698784, "grad_norm": 0.17341311275959015, "learning_rate": 5.1756776611482515e-06, "loss": 0.4542, "num_input_tokens_seen": 104828128, "step": 86185 }, { "epoch": 10.799398571607568, "grad_norm": 0.15505822002887726, "learning_rate": 5.1751312838274105e-06, "loss": 0.4715, "num_input_tokens_seen": 104834368, "step": 86190 }, { "epoch": 10.80002505951635, "grad_norm": 0.18194255232810974, "learning_rate": 5.174584904412727e-06, "loss": 0.461, "num_input_tokens_seen": 104840128, "step": 86195 }, { "epoch": 10.800651547425135, "grad_norm": 0.19307486712932587, "learning_rate": 5.174038522910734e-06, "loss": 0.453, "num_input_tokens_seen": 104846432, "step": 86200 }, { "epoch": 10.801278035333919, "grad_norm": 0.17446011304855347, "learning_rate": 5.1734921393279644e-06, "loss": 0.4609, "num_input_tokens_seen": 104852288, "step": 86205 }, { "epoch": 10.801904523242701, "grad_norm": 0.17223092913627625, "learning_rate": 5.17294575367095e-06, "loss": 0.4668, "num_input_tokens_seen": 104858240, "step": 86210 }, { "epoch": 10.802531011151485, "grad_norm": 0.19036971032619476, "learning_rate": 5.1723993659462255e-06, "loss": 0.4678, "num_input_tokens_seen": 104864128, "step": 86215 }, { "epoch": 10.803157499060267, "grad_norm": 0.12457158416509628, "learning_rate": 5.17185297616032e-06, "loss": 0.4652, "num_input_tokens_seen": 104870176, "step": 86220 }, { "epoch": 10.803783986969052, "grad_norm": 0.14250293374061584, "learning_rate": 5.171306584319769e-06, "loss": 0.4651, "num_input_tokens_seen": 104876544, "step": 86225 }, { "epoch": 10.804410474877836, "grad_norm": 0.13391947746276855, "learning_rate": 5.170760190431101e-06, "loss": 0.4591, "num_input_tokens_seen": 104882976, "step": 86230 }, { "epoch": 10.805036962786618, "grad_norm": 0.18094028532505035, "learning_rate": 5.170213794500854e-06, "loss": 0.4564, "num_input_tokens_seen": 104889024, "step": 86235 }, { "epoch": 10.805663450695402, "grad_norm": 0.13795353472232819, "learning_rate": 5.1696673965355575e-06, "loss": 0.462, "num_input_tokens_seen": 104895296, "step": 86240 }, { "epoch": 10.806289938604184, "grad_norm": 0.1884884089231491, "learning_rate": 5.169120996541743e-06, "loss": 0.4643, "num_input_tokens_seen": 104901280, "step": 86245 }, { "epoch": 10.806916426512968, "grad_norm": 0.1324298083782196, "learning_rate": 5.1685745945259455e-06, "loss": 0.4639, "num_input_tokens_seen": 104906560, "step": 86250 }, { "epoch": 10.807542914421752, "grad_norm": 0.1270284205675125, "learning_rate": 5.168028190494698e-06, "loss": 0.4631, "num_input_tokens_seen": 104912704, "step": 86255 }, { "epoch": 10.808169402330535, "grad_norm": 0.13260123133659363, "learning_rate": 5.167481784454532e-06, "loss": 0.4691, "num_input_tokens_seen": 104918752, "step": 86260 }, { "epoch": 10.808795890239319, "grad_norm": 0.15254966914653778, "learning_rate": 5.16693537641198e-06, "loss": 0.4608, "num_input_tokens_seen": 104924288, "step": 86265 }, { "epoch": 10.809422378148101, "grad_norm": 0.1313808560371399, "learning_rate": 5.166388966373576e-06, "loss": 0.459, "num_input_tokens_seen": 104929696, "step": 86270 }, { "epoch": 10.810048866056885, "grad_norm": 0.17307427525520325, "learning_rate": 5.165842554345851e-06, "loss": 0.4652, "num_input_tokens_seen": 104936096, "step": 86275 }, { "epoch": 10.810675353965669, "grad_norm": 0.14938931167125702, "learning_rate": 5.16529614033534e-06, "loss": 0.4577, "num_input_tokens_seen": 104941952, "step": 86280 }, { "epoch": 10.811301841874451, "grad_norm": 0.1343204379081726, "learning_rate": 5.164749724348575e-06, "loss": 0.4557, "num_input_tokens_seen": 104948288, "step": 86285 }, { "epoch": 10.811928329783235, "grad_norm": 0.14478306472301483, "learning_rate": 5.164203306392086e-06, "loss": 0.4592, "num_input_tokens_seen": 104954304, "step": 86290 }, { "epoch": 10.812554817692018, "grad_norm": 0.13537926971912384, "learning_rate": 5.1636568864724105e-06, "loss": 0.4607, "num_input_tokens_seen": 104960192, "step": 86295 }, { "epoch": 10.813181305600802, "grad_norm": 0.19513796269893646, "learning_rate": 5.163110464596079e-06, "loss": 0.4582, "num_input_tokens_seen": 104966304, "step": 86300 }, { "epoch": 10.813807793509586, "grad_norm": 0.17543762922286987, "learning_rate": 5.162564040769623e-06, "loss": 0.461, "num_input_tokens_seen": 104972416, "step": 86305 }, { "epoch": 10.814434281418368, "grad_norm": 0.17914769053459167, "learning_rate": 5.162017614999578e-06, "loss": 0.4545, "num_input_tokens_seen": 104978496, "step": 86310 }, { "epoch": 10.815060769327152, "grad_norm": 0.14177829027175903, "learning_rate": 5.161471187292477e-06, "loss": 0.4568, "num_input_tokens_seen": 104984576, "step": 86315 }, { "epoch": 10.815687257235936, "grad_norm": 0.18363168835639954, "learning_rate": 5.160924757654853e-06, "loss": 0.469, "num_input_tokens_seen": 104990784, "step": 86320 }, { "epoch": 10.816313745144718, "grad_norm": 0.16920258104801178, "learning_rate": 5.160378326093237e-06, "loss": 0.4617, "num_input_tokens_seen": 104997280, "step": 86325 }, { "epoch": 10.816940233053502, "grad_norm": 0.2402772605419159, "learning_rate": 5.159831892614164e-06, "loss": 0.4586, "num_input_tokens_seen": 105003008, "step": 86330 }, { "epoch": 10.817566720962285, "grad_norm": 0.15469512343406677, "learning_rate": 5.1592854572241655e-06, "loss": 0.4598, "num_input_tokens_seen": 105009024, "step": 86335 }, { "epoch": 10.818193208871069, "grad_norm": 0.14990253746509552, "learning_rate": 5.1587390199297775e-06, "loss": 0.4696, "num_input_tokens_seen": 105015264, "step": 86340 }, { "epoch": 10.818819696779853, "grad_norm": 0.32378464937210083, "learning_rate": 5.158192580737529e-06, "loss": 0.4606, "num_input_tokens_seen": 105021440, "step": 86345 }, { "epoch": 10.819446184688635, "grad_norm": 0.44803357124328613, "learning_rate": 5.1576461396539545e-06, "loss": 0.4653, "num_input_tokens_seen": 105026784, "step": 86350 }, { "epoch": 10.82007267259742, "grad_norm": 0.22009101510047913, "learning_rate": 5.157099696685589e-06, "loss": 0.4575, "num_input_tokens_seen": 105032992, "step": 86355 }, { "epoch": 10.820699160506202, "grad_norm": 0.13693881034851074, "learning_rate": 5.156553251838964e-06, "loss": 0.4527, "num_input_tokens_seen": 105039232, "step": 86360 }, { "epoch": 10.821325648414986, "grad_norm": 0.183669775724411, "learning_rate": 5.1560068051206135e-06, "loss": 0.451, "num_input_tokens_seen": 105045120, "step": 86365 }, { "epoch": 10.82195213632377, "grad_norm": 0.23058679699897766, "learning_rate": 5.155460356537071e-06, "loss": 0.4542, "num_input_tokens_seen": 105051168, "step": 86370 }, { "epoch": 10.822578624232552, "grad_norm": 0.2819094657897949, "learning_rate": 5.154913906094869e-06, "loss": 0.4551, "num_input_tokens_seen": 105057408, "step": 86375 }, { "epoch": 10.823205112141336, "grad_norm": 0.12918205559253693, "learning_rate": 5.1543674538005416e-06, "loss": 0.4569, "num_input_tokens_seen": 105063776, "step": 86380 }, { "epoch": 10.823831600050118, "grad_norm": 0.27809247374534607, "learning_rate": 5.1538209996606215e-06, "loss": 0.468, "num_input_tokens_seen": 105069600, "step": 86385 }, { "epoch": 10.824458087958902, "grad_norm": 0.26746559143066406, "learning_rate": 5.153274543681641e-06, "loss": 0.465, "num_input_tokens_seen": 105075872, "step": 86390 }, { "epoch": 10.825084575867686, "grad_norm": 0.15621210634708405, "learning_rate": 5.152728085870136e-06, "loss": 0.4613, "num_input_tokens_seen": 105081312, "step": 86395 }, { "epoch": 10.825711063776469, "grad_norm": 0.1435650885105133, "learning_rate": 5.1521816262326355e-06, "loss": 0.4556, "num_input_tokens_seen": 105087424, "step": 86400 }, { "epoch": 10.826337551685253, "grad_norm": 0.16710315644741058, "learning_rate": 5.151635164775678e-06, "loss": 0.4592, "num_input_tokens_seen": 105093472, "step": 86405 }, { "epoch": 10.826964039594035, "grad_norm": 0.17509756982326508, "learning_rate": 5.151088701505794e-06, "loss": 0.4627, "num_input_tokens_seen": 105099136, "step": 86410 }, { "epoch": 10.827590527502819, "grad_norm": 0.14297239482402802, "learning_rate": 5.150542236429516e-06, "loss": 0.4658, "num_input_tokens_seen": 105104800, "step": 86415 }, { "epoch": 10.828217015411603, "grad_norm": 0.169673889875412, "learning_rate": 5.149995769553381e-06, "loss": 0.4707, "num_input_tokens_seen": 105110880, "step": 86420 }, { "epoch": 10.828843503320385, "grad_norm": 0.20109163224697113, "learning_rate": 5.149449300883921e-06, "loss": 0.4555, "num_input_tokens_seen": 105116736, "step": 86425 }, { "epoch": 10.82946999122917, "grad_norm": 0.14381884038448334, "learning_rate": 5.148902830427668e-06, "loss": 0.4575, "num_input_tokens_seen": 105122656, "step": 86430 }, { "epoch": 10.830096479137953, "grad_norm": 0.23186707496643066, "learning_rate": 5.148356358191157e-06, "loss": 0.4647, "num_input_tokens_seen": 105128960, "step": 86435 }, { "epoch": 10.830722967046736, "grad_norm": 0.16612252593040466, "learning_rate": 5.14780988418092e-06, "loss": 0.4623, "num_input_tokens_seen": 105135360, "step": 86440 }, { "epoch": 10.83134945495552, "grad_norm": 0.11862090975046158, "learning_rate": 5.147263408403492e-06, "loss": 0.4643, "num_input_tokens_seen": 105141344, "step": 86445 }, { "epoch": 10.831975942864302, "grad_norm": 0.19626612961292267, "learning_rate": 5.146716930865406e-06, "loss": 0.4648, "num_input_tokens_seen": 105147392, "step": 86450 }, { "epoch": 10.832602430773086, "grad_norm": 0.15356378257274628, "learning_rate": 5.146170451573196e-06, "loss": 0.4622, "num_input_tokens_seen": 105153632, "step": 86455 }, { "epoch": 10.83322891868187, "grad_norm": 0.20789997279644012, "learning_rate": 5.145623970533395e-06, "loss": 0.4698, "num_input_tokens_seen": 105159840, "step": 86460 }, { "epoch": 10.833855406590652, "grad_norm": 0.18043294548988342, "learning_rate": 5.145077487752537e-06, "loss": 0.4643, "num_input_tokens_seen": 105166016, "step": 86465 }, { "epoch": 10.834481894499437, "grad_norm": 0.17701031267642975, "learning_rate": 5.144531003237157e-06, "loss": 0.4641, "num_input_tokens_seen": 105172000, "step": 86470 }, { "epoch": 10.835108382408219, "grad_norm": 0.16959327459335327, "learning_rate": 5.143984516993785e-06, "loss": 0.4669, "num_input_tokens_seen": 105178112, "step": 86475 }, { "epoch": 10.835734870317003, "grad_norm": 0.11496014893054962, "learning_rate": 5.143438029028959e-06, "loss": 0.4623, "num_input_tokens_seen": 105184352, "step": 86480 }, { "epoch": 10.836361358225787, "grad_norm": 0.20187437534332275, "learning_rate": 5.14289153934921e-06, "loss": 0.4655, "num_input_tokens_seen": 105190464, "step": 86485 }, { "epoch": 10.83698784613457, "grad_norm": 0.15893332660198212, "learning_rate": 5.142345047961073e-06, "loss": 0.463, "num_input_tokens_seen": 105196256, "step": 86490 }, { "epoch": 10.837614334043353, "grad_norm": 0.1198233887553215, "learning_rate": 5.14179855487108e-06, "loss": 0.4655, "num_input_tokens_seen": 105202304, "step": 86495 }, { "epoch": 10.838240821952136, "grad_norm": 0.14288529753684998, "learning_rate": 5.1412520600857676e-06, "loss": 0.4607, "num_input_tokens_seen": 105208192, "step": 86500 }, { "epoch": 10.83886730986092, "grad_norm": 0.16544319689273834, "learning_rate": 5.140705563611668e-06, "loss": 0.4628, "num_input_tokens_seen": 105213632, "step": 86505 }, { "epoch": 10.839493797769704, "grad_norm": 0.2021680325269699, "learning_rate": 5.140159065455314e-06, "loss": 0.4664, "num_input_tokens_seen": 105219168, "step": 86510 }, { "epoch": 10.840120285678486, "grad_norm": 0.14450722932815552, "learning_rate": 5.13961256562324e-06, "loss": 0.4573, "num_input_tokens_seen": 105225408, "step": 86515 }, { "epoch": 10.84074677358727, "grad_norm": 0.12911424040794373, "learning_rate": 5.139066064121982e-06, "loss": 0.4597, "num_input_tokens_seen": 105231424, "step": 86520 }, { "epoch": 10.841373261496052, "grad_norm": 0.15666674077510834, "learning_rate": 5.138519560958072e-06, "loss": 0.4658, "num_input_tokens_seen": 105237344, "step": 86525 }, { "epoch": 10.841999749404836, "grad_norm": 0.1525464504957199, "learning_rate": 5.137973056138044e-06, "loss": 0.4615, "num_input_tokens_seen": 105243392, "step": 86530 }, { "epoch": 10.84262623731362, "grad_norm": 0.16473929584026337, "learning_rate": 5.137426549668432e-06, "loss": 0.4712, "num_input_tokens_seen": 105249376, "step": 86535 }, { "epoch": 10.843252725222403, "grad_norm": 0.17799954116344452, "learning_rate": 5.1368800415557695e-06, "loss": 0.4581, "num_input_tokens_seen": 105255232, "step": 86540 }, { "epoch": 10.843879213131187, "grad_norm": 0.12810823321342468, "learning_rate": 5.136333531806591e-06, "loss": 0.4562, "num_input_tokens_seen": 105261472, "step": 86545 }, { "epoch": 10.84450570103997, "grad_norm": 0.12954120337963104, "learning_rate": 5.135787020427431e-06, "loss": 0.4572, "num_input_tokens_seen": 105267808, "step": 86550 }, { "epoch": 10.845132188948753, "grad_norm": 0.15381909906864166, "learning_rate": 5.1352405074248235e-06, "loss": 0.4708, "num_input_tokens_seen": 105273952, "step": 86555 }, { "epoch": 10.845758676857537, "grad_norm": 0.16368255019187927, "learning_rate": 5.134693992805301e-06, "loss": 0.4765, "num_input_tokens_seen": 105279776, "step": 86560 }, { "epoch": 10.84638516476632, "grad_norm": 0.12491592019796371, "learning_rate": 5.134147476575399e-06, "loss": 0.4664, "num_input_tokens_seen": 105286048, "step": 86565 }, { "epoch": 10.847011652675103, "grad_norm": 0.10775807499885559, "learning_rate": 5.1336009587416525e-06, "loss": 0.4558, "num_input_tokens_seen": 105292096, "step": 86570 }, { "epoch": 10.847638140583888, "grad_norm": 0.12689678370952606, "learning_rate": 5.133054439310593e-06, "loss": 0.4633, "num_input_tokens_seen": 105297824, "step": 86575 }, { "epoch": 10.84826462849267, "grad_norm": 0.13906322419643402, "learning_rate": 5.132507918288756e-06, "loss": 0.4518, "num_input_tokens_seen": 105303552, "step": 86580 }, { "epoch": 10.848891116401454, "grad_norm": 0.1659373641014099, "learning_rate": 5.1319613956826755e-06, "loss": 0.472, "num_input_tokens_seen": 105309536, "step": 86585 }, { "epoch": 10.849517604310236, "grad_norm": 0.14596109092235565, "learning_rate": 5.131414871498885e-06, "loss": 0.4647, "num_input_tokens_seen": 105315680, "step": 86590 }, { "epoch": 10.85014409221902, "grad_norm": 0.19970926642417908, "learning_rate": 5.13086834574392e-06, "loss": 0.4604, "num_input_tokens_seen": 105321888, "step": 86595 }, { "epoch": 10.850770580127804, "grad_norm": 0.13194292783737183, "learning_rate": 5.1303218184243135e-06, "loss": 0.4613, "num_input_tokens_seen": 105328096, "step": 86600 }, { "epoch": 10.851397068036587, "grad_norm": 0.1363655924797058, "learning_rate": 5.129775289546599e-06, "loss": 0.4585, "num_input_tokens_seen": 105334048, "step": 86605 }, { "epoch": 10.85202355594537, "grad_norm": 0.11550646275281906, "learning_rate": 5.129228759117313e-06, "loss": 0.4674, "num_input_tokens_seen": 105339648, "step": 86610 }, { "epoch": 10.852650043854153, "grad_norm": 0.1791713833808899, "learning_rate": 5.128682227142988e-06, "loss": 0.4583, "num_input_tokens_seen": 105345824, "step": 86615 }, { "epoch": 10.853276531762937, "grad_norm": 0.14164268970489502, "learning_rate": 5.128135693630159e-06, "loss": 0.4583, "num_input_tokens_seen": 105351584, "step": 86620 }, { "epoch": 10.853903019671721, "grad_norm": 0.1277102828025818, "learning_rate": 5.127589158585361e-06, "loss": 0.4656, "num_input_tokens_seen": 105357792, "step": 86625 }, { "epoch": 10.854529507580503, "grad_norm": 0.13080577552318573, "learning_rate": 5.127042622015126e-06, "loss": 0.4623, "num_input_tokens_seen": 105364160, "step": 86630 }, { "epoch": 10.855155995489287, "grad_norm": 0.18671759963035583, "learning_rate": 5.126496083925991e-06, "loss": 0.4653, "num_input_tokens_seen": 105370400, "step": 86635 }, { "epoch": 10.85578248339807, "grad_norm": 0.2540293037891388, "learning_rate": 5.125949544324488e-06, "loss": 0.4626, "num_input_tokens_seen": 105376512, "step": 86640 }, { "epoch": 10.856408971306854, "grad_norm": 0.13510623574256897, "learning_rate": 5.125403003217153e-06, "loss": 0.4718, "num_input_tokens_seen": 105381920, "step": 86645 }, { "epoch": 10.857035459215638, "grad_norm": 0.18185386061668396, "learning_rate": 5.124856460610519e-06, "loss": 0.4612, "num_input_tokens_seen": 105387968, "step": 86650 }, { "epoch": 10.85766194712442, "grad_norm": 0.14916209876537323, "learning_rate": 5.124309916511122e-06, "loss": 0.4623, "num_input_tokens_seen": 105393888, "step": 86655 }, { "epoch": 10.858288435033204, "grad_norm": 0.19221238791942596, "learning_rate": 5.123763370925493e-06, "loss": 0.4595, "num_input_tokens_seen": 105399840, "step": 86660 }, { "epoch": 10.858914922941988, "grad_norm": 0.12739895284175873, "learning_rate": 5.123216823860171e-06, "loss": 0.46, "num_input_tokens_seen": 105406112, "step": 86665 }, { "epoch": 10.85954141085077, "grad_norm": 0.1613663285970688, "learning_rate": 5.122670275321687e-06, "loss": 0.463, "num_input_tokens_seen": 105412256, "step": 86670 }, { "epoch": 10.860167898759554, "grad_norm": 0.15203694999217987, "learning_rate": 5.122123725316578e-06, "loss": 0.4642, "num_input_tokens_seen": 105418240, "step": 86675 }, { "epoch": 10.860794386668337, "grad_norm": 0.18568167090415955, "learning_rate": 5.121577173851376e-06, "loss": 0.4592, "num_input_tokens_seen": 105424480, "step": 86680 }, { "epoch": 10.86142087457712, "grad_norm": 0.12437281757593155, "learning_rate": 5.1210306209326175e-06, "loss": 0.4623, "num_input_tokens_seen": 105429952, "step": 86685 }, { "epoch": 10.862047362485903, "grad_norm": 0.14649972319602966, "learning_rate": 5.120484066566836e-06, "loss": 0.4708, "num_input_tokens_seen": 105436000, "step": 86690 }, { "epoch": 10.862673850394687, "grad_norm": 0.12214414775371552, "learning_rate": 5.119937510760567e-06, "loss": 0.4586, "num_input_tokens_seen": 105442272, "step": 86695 }, { "epoch": 10.863300338303471, "grad_norm": 0.13420012593269348, "learning_rate": 5.1193909535203435e-06, "loss": 0.4565, "num_input_tokens_seen": 105448416, "step": 86700 }, { "epoch": 10.863926826212253, "grad_norm": 0.11900819092988968, "learning_rate": 5.118844394852701e-06, "loss": 0.4626, "num_input_tokens_seen": 105453952, "step": 86705 }, { "epoch": 10.864553314121038, "grad_norm": 0.13669048249721527, "learning_rate": 5.118297834764172e-06, "loss": 0.4694, "num_input_tokens_seen": 105460000, "step": 86710 }, { "epoch": 10.865179802029822, "grad_norm": 0.16971193253993988, "learning_rate": 5.117751273261295e-06, "loss": 0.461, "num_input_tokens_seen": 105466336, "step": 86715 }, { "epoch": 10.865806289938604, "grad_norm": 0.15237022936344147, "learning_rate": 5.117204710350602e-06, "loss": 0.4597, "num_input_tokens_seen": 105472448, "step": 86720 }, { "epoch": 10.866432777847388, "grad_norm": 0.1523139774799347, "learning_rate": 5.116658146038628e-06, "loss": 0.4636, "num_input_tokens_seen": 105478720, "step": 86725 }, { "epoch": 10.86705926575617, "grad_norm": 0.13992170989513397, "learning_rate": 5.116111580331907e-06, "loss": 0.4638, "num_input_tokens_seen": 105484768, "step": 86730 }, { "epoch": 10.867685753664954, "grad_norm": 0.14496462047100067, "learning_rate": 5.115565013236976e-06, "loss": 0.4584, "num_input_tokens_seen": 105489952, "step": 86735 }, { "epoch": 10.868312241573738, "grad_norm": 0.16379189491271973, "learning_rate": 5.115018444760368e-06, "loss": 0.4618, "num_input_tokens_seen": 105496032, "step": 86740 }, { "epoch": 10.86893872948252, "grad_norm": 0.2754567861557007, "learning_rate": 5.114471874908617e-06, "loss": 0.4544, "num_input_tokens_seen": 105502336, "step": 86745 }, { "epoch": 10.869565217391305, "grad_norm": 0.15285299718379974, "learning_rate": 5.113925303688259e-06, "loss": 0.4679, "num_input_tokens_seen": 105508544, "step": 86750 }, { "epoch": 10.870191705300087, "grad_norm": 0.22516532242298126, "learning_rate": 5.113378731105828e-06, "loss": 0.4571, "num_input_tokens_seen": 105514656, "step": 86755 }, { "epoch": 10.870818193208871, "grad_norm": 0.1531076580286026, "learning_rate": 5.112832157167858e-06, "loss": 0.4581, "num_input_tokens_seen": 105520608, "step": 86760 }, { "epoch": 10.871444681117655, "grad_norm": 0.1697612702846527, "learning_rate": 5.112285581880886e-06, "loss": 0.4643, "num_input_tokens_seen": 105526880, "step": 86765 }, { "epoch": 10.872071169026437, "grad_norm": 0.12317021191120148, "learning_rate": 5.111739005251445e-06, "loss": 0.461, "num_input_tokens_seen": 105532992, "step": 86770 }, { "epoch": 10.872697656935221, "grad_norm": 0.14642609655857086, "learning_rate": 5.11119242728607e-06, "loss": 0.4638, "num_input_tokens_seen": 105539200, "step": 86775 }, { "epoch": 10.873324144844005, "grad_norm": 0.2021290510892868, "learning_rate": 5.110645847991297e-06, "loss": 0.4488, "num_input_tokens_seen": 105545248, "step": 86780 }, { "epoch": 10.873950632752788, "grad_norm": 0.12438200414180756, "learning_rate": 5.110099267373658e-06, "loss": 0.461, "num_input_tokens_seen": 105551264, "step": 86785 }, { "epoch": 10.874577120661572, "grad_norm": 0.2352382242679596, "learning_rate": 5.109552685439692e-06, "loss": 0.4611, "num_input_tokens_seen": 105557408, "step": 86790 }, { "epoch": 10.875203608570354, "grad_norm": 0.18848945200443268, "learning_rate": 5.109006102195931e-06, "loss": 0.4668, "num_input_tokens_seen": 105563776, "step": 86795 }, { "epoch": 10.875830096479138, "grad_norm": 0.17644467949867249, "learning_rate": 5.108459517648909e-06, "loss": 0.4532, "num_input_tokens_seen": 105569568, "step": 86800 }, { "epoch": 10.87645658438792, "grad_norm": 0.16191832721233368, "learning_rate": 5.1079129318051635e-06, "loss": 0.4705, "num_input_tokens_seen": 105575648, "step": 86805 }, { "epoch": 10.877083072296704, "grad_norm": 0.42736580967903137, "learning_rate": 5.107366344671227e-06, "loss": 0.4677, "num_input_tokens_seen": 105581792, "step": 86810 }, { "epoch": 10.877709560205489, "grad_norm": 0.15512941777706146, "learning_rate": 5.106819756253637e-06, "loss": 0.4587, "num_input_tokens_seen": 105587936, "step": 86815 }, { "epoch": 10.87833604811427, "grad_norm": 0.3803632855415344, "learning_rate": 5.106273166558926e-06, "loss": 0.4621, "num_input_tokens_seen": 105594336, "step": 86820 }, { "epoch": 10.878962536023055, "grad_norm": 0.13699732720851898, "learning_rate": 5.10572657559363e-06, "loss": 0.4655, "num_input_tokens_seen": 105600416, "step": 86825 }, { "epoch": 10.879589023931839, "grad_norm": 0.1981828361749649, "learning_rate": 5.105179983364285e-06, "loss": 0.4603, "num_input_tokens_seen": 105606880, "step": 86830 }, { "epoch": 10.880215511840621, "grad_norm": 0.17037656903266907, "learning_rate": 5.104633389877424e-06, "loss": 0.4537, "num_input_tokens_seen": 105613536, "step": 86835 }, { "epoch": 10.880841999749405, "grad_norm": 0.16630828380584717, "learning_rate": 5.104086795139583e-06, "loss": 0.4654, "num_input_tokens_seen": 105619456, "step": 86840 }, { "epoch": 10.881468487658188, "grad_norm": 0.17597919702529907, "learning_rate": 5.103540199157297e-06, "loss": 0.4591, "num_input_tokens_seen": 105625600, "step": 86845 }, { "epoch": 10.882094975566972, "grad_norm": 0.18279774487018585, "learning_rate": 5.1029936019371e-06, "loss": 0.4537, "num_input_tokens_seen": 105631616, "step": 86850 }, { "epoch": 10.882721463475756, "grad_norm": 0.18734312057495117, "learning_rate": 5.102447003485528e-06, "loss": 0.4591, "num_input_tokens_seen": 105637856, "step": 86855 }, { "epoch": 10.883347951384538, "grad_norm": 0.12262242287397385, "learning_rate": 5.1019004038091165e-06, "loss": 0.4613, "num_input_tokens_seen": 105643968, "step": 86860 }, { "epoch": 10.883974439293322, "grad_norm": 0.1657087802886963, "learning_rate": 5.101353802914401e-06, "loss": 0.459, "num_input_tokens_seen": 105650048, "step": 86865 }, { "epoch": 10.884600927202104, "grad_norm": 0.15855488181114197, "learning_rate": 5.100807200807914e-06, "loss": 0.462, "num_input_tokens_seen": 105656032, "step": 86870 }, { "epoch": 10.885227415110888, "grad_norm": 0.2538825273513794, "learning_rate": 5.100260597496193e-06, "loss": 0.4603, "num_input_tokens_seen": 105662368, "step": 86875 }, { "epoch": 10.885853903019672, "grad_norm": 0.19585289061069489, "learning_rate": 5.099713992985771e-06, "loss": 0.4623, "num_input_tokens_seen": 105668640, "step": 86880 }, { "epoch": 10.886480390928455, "grad_norm": 0.1720258593559265, "learning_rate": 5.099167387283186e-06, "loss": 0.4612, "num_input_tokens_seen": 105674848, "step": 86885 }, { "epoch": 10.887106878837239, "grad_norm": 0.14387868344783783, "learning_rate": 5.098620780394971e-06, "loss": 0.4674, "num_input_tokens_seen": 105681120, "step": 86890 }, { "epoch": 10.887733366746021, "grad_norm": 0.19230185449123383, "learning_rate": 5.098074172327661e-06, "loss": 0.4686, "num_input_tokens_seen": 105687328, "step": 86895 }, { "epoch": 10.888359854654805, "grad_norm": 0.17763657867908478, "learning_rate": 5.097527563087793e-06, "loss": 0.4672, "num_input_tokens_seen": 105693568, "step": 86900 }, { "epoch": 10.888986342563589, "grad_norm": 0.18634672462940216, "learning_rate": 5.0969809526819005e-06, "loss": 0.4627, "num_input_tokens_seen": 105699648, "step": 86905 }, { "epoch": 10.889612830472371, "grad_norm": 0.20512016117572784, "learning_rate": 5.096434341116519e-06, "loss": 0.4617, "num_input_tokens_seen": 105705792, "step": 86910 }, { "epoch": 10.890239318381155, "grad_norm": 0.19668707251548767, "learning_rate": 5.0958877283981836e-06, "loss": 0.466, "num_input_tokens_seen": 105712128, "step": 86915 }, { "epoch": 10.890865806289938, "grad_norm": 0.2802952229976654, "learning_rate": 5.09534111453343e-06, "loss": 0.4646, "num_input_tokens_seen": 105718368, "step": 86920 }, { "epoch": 10.891492294198722, "grad_norm": 0.14525943994522095, "learning_rate": 5.094794499528792e-06, "loss": 0.4517, "num_input_tokens_seen": 105724576, "step": 86925 }, { "epoch": 10.892118782107506, "grad_norm": 0.1310381293296814, "learning_rate": 5.094247883390808e-06, "loss": 0.4734, "num_input_tokens_seen": 105730048, "step": 86930 }, { "epoch": 10.892745270016288, "grad_norm": 0.17476166784763336, "learning_rate": 5.093701266126011e-06, "loss": 0.4571, "num_input_tokens_seen": 105736192, "step": 86935 }, { "epoch": 10.893371757925072, "grad_norm": 0.1823233664035797, "learning_rate": 5.093154647740936e-06, "loss": 0.4574, "num_input_tokens_seen": 105742400, "step": 86940 }, { "epoch": 10.893998245833856, "grad_norm": 0.2532092034816742, "learning_rate": 5.092608028242119e-06, "loss": 0.4597, "num_input_tokens_seen": 105748928, "step": 86945 }, { "epoch": 10.894624733742639, "grad_norm": 0.21200159192085266, "learning_rate": 5.0920614076360965e-06, "loss": 0.463, "num_input_tokens_seen": 105754784, "step": 86950 }, { "epoch": 10.895251221651423, "grad_norm": 0.15573376417160034, "learning_rate": 5.091514785929401e-06, "loss": 0.4621, "num_input_tokens_seen": 105760768, "step": 86955 }, { "epoch": 10.895877709560205, "grad_norm": 0.1885761171579361, "learning_rate": 5.090968163128569e-06, "loss": 0.4649, "num_input_tokens_seen": 105767168, "step": 86960 }, { "epoch": 10.896504197468989, "grad_norm": 0.14158383011817932, "learning_rate": 5.090421539240137e-06, "loss": 0.4651, "num_input_tokens_seen": 105773120, "step": 86965 }, { "epoch": 10.897130685377773, "grad_norm": 0.12776847183704376, "learning_rate": 5.0898749142706385e-06, "loss": 0.4691, "num_input_tokens_seen": 105779424, "step": 86970 }, { "epoch": 10.897757173286555, "grad_norm": 0.15078020095825195, "learning_rate": 5.089328288226611e-06, "loss": 0.4675, "num_input_tokens_seen": 105785184, "step": 86975 }, { "epoch": 10.89838366119534, "grad_norm": 0.14702872931957245, "learning_rate": 5.088781661114587e-06, "loss": 0.4572, "num_input_tokens_seen": 105791488, "step": 86980 }, { "epoch": 10.899010149104122, "grad_norm": 0.17679463326931, "learning_rate": 5.088235032941104e-06, "loss": 0.4632, "num_input_tokens_seen": 105797632, "step": 86985 }, { "epoch": 10.899636637012906, "grad_norm": 0.15064534544944763, "learning_rate": 5.087688403712699e-06, "loss": 0.4618, "num_input_tokens_seen": 105803712, "step": 86990 }, { "epoch": 10.90026312492169, "grad_norm": 0.13626869022846222, "learning_rate": 5.087141773435904e-06, "loss": 0.4617, "num_input_tokens_seen": 105809472, "step": 86995 }, { "epoch": 10.900889612830472, "grad_norm": 0.1361592561006546, "learning_rate": 5.0865951421172566e-06, "loss": 0.4559, "num_input_tokens_seen": 105815840, "step": 87000 }, { "epoch": 10.901516100739256, "grad_norm": 0.15087758004665375, "learning_rate": 5.086048509763291e-06, "loss": 0.4618, "num_input_tokens_seen": 105822016, "step": 87005 }, { "epoch": 10.902142588648038, "grad_norm": 0.16072183847427368, "learning_rate": 5.085501876380543e-06, "loss": 0.4476, "num_input_tokens_seen": 105827136, "step": 87010 }, { "epoch": 10.902769076556822, "grad_norm": 0.156734436750412, "learning_rate": 5.084955241975549e-06, "loss": 0.4551, "num_input_tokens_seen": 105833344, "step": 87015 }, { "epoch": 10.903395564465606, "grad_norm": 0.20880310237407684, "learning_rate": 5.084408606554841e-06, "loss": 0.4709, "num_input_tokens_seen": 105839456, "step": 87020 }, { "epoch": 10.904022052374389, "grad_norm": 0.15045525133609772, "learning_rate": 5.083861970124959e-06, "loss": 0.4618, "num_input_tokens_seen": 105845376, "step": 87025 }, { "epoch": 10.904648540283173, "grad_norm": 0.16515548527240753, "learning_rate": 5.083315332692438e-06, "loss": 0.4679, "num_input_tokens_seen": 105851040, "step": 87030 }, { "epoch": 10.905275028191955, "grad_norm": 0.13258971273899078, "learning_rate": 5.082768694263808e-06, "loss": 0.4528, "num_input_tokens_seen": 105856992, "step": 87035 }, { "epoch": 10.905901516100739, "grad_norm": 0.1301361322402954, "learning_rate": 5.0822220548456124e-06, "loss": 0.4599, "num_input_tokens_seen": 105862816, "step": 87040 }, { "epoch": 10.906528004009523, "grad_norm": 0.17622478306293488, "learning_rate": 5.081675414444382e-06, "loss": 0.4665, "num_input_tokens_seen": 105869216, "step": 87045 }, { "epoch": 10.907154491918305, "grad_norm": 0.1308491975069046, "learning_rate": 5.081128773066655e-06, "loss": 0.4565, "num_input_tokens_seen": 105875392, "step": 87050 }, { "epoch": 10.90778097982709, "grad_norm": 0.17287412285804749, "learning_rate": 5.080582130718963e-06, "loss": 0.4651, "num_input_tokens_seen": 105881696, "step": 87055 }, { "epoch": 10.908407467735874, "grad_norm": 0.1705046445131302, "learning_rate": 5.080035487407845e-06, "loss": 0.4617, "num_input_tokens_seen": 105887680, "step": 87060 }, { "epoch": 10.909033955644656, "grad_norm": 0.15734119713306427, "learning_rate": 5.079488843139835e-06, "loss": 0.4619, "num_input_tokens_seen": 105894048, "step": 87065 }, { "epoch": 10.90966044355344, "grad_norm": 0.2115083634853363, "learning_rate": 5.07894219792147e-06, "loss": 0.4578, "num_input_tokens_seen": 105898688, "step": 87070 }, { "epoch": 10.910286931462222, "grad_norm": 0.13388200104236603, "learning_rate": 5.078395551759282e-06, "loss": 0.4538, "num_input_tokens_seen": 105904704, "step": 87075 }, { "epoch": 10.910913419371006, "grad_norm": 0.13815905153751373, "learning_rate": 5.077848904659811e-06, "loss": 0.4662, "num_input_tokens_seen": 105910304, "step": 87080 }, { "epoch": 10.91153990727979, "grad_norm": 0.15735745429992676, "learning_rate": 5.077302256629591e-06, "loss": 0.451, "num_input_tokens_seen": 105916128, "step": 87085 }, { "epoch": 10.912166395188573, "grad_norm": 0.1484309434890747, "learning_rate": 5.076755607675158e-06, "loss": 0.4545, "num_input_tokens_seen": 105922336, "step": 87090 }, { "epoch": 10.912792883097357, "grad_norm": 0.12946683168411255, "learning_rate": 5.076208957803046e-06, "loss": 0.4649, "num_input_tokens_seen": 105928384, "step": 87095 }, { "epoch": 10.913419371006139, "grad_norm": 0.1430824100971222, "learning_rate": 5.075662307019793e-06, "loss": 0.4692, "num_input_tokens_seen": 105933984, "step": 87100 }, { "epoch": 10.914045858914923, "grad_norm": 0.17341899871826172, "learning_rate": 5.075115655331932e-06, "loss": 0.4669, "num_input_tokens_seen": 105940320, "step": 87105 }, { "epoch": 10.914672346823707, "grad_norm": 0.16204820573329926, "learning_rate": 5.074569002746003e-06, "loss": 0.4681, "num_input_tokens_seen": 105946208, "step": 87110 }, { "epoch": 10.91529883473249, "grad_norm": 0.13931220769882202, "learning_rate": 5.074022349268536e-06, "loss": 0.4626, "num_input_tokens_seen": 105951648, "step": 87115 }, { "epoch": 10.915925322641273, "grad_norm": 0.17469832301139832, "learning_rate": 5.073475694906072e-06, "loss": 0.4636, "num_input_tokens_seen": 105957664, "step": 87120 }, { "epoch": 10.916551810550056, "grad_norm": 0.14326979219913483, "learning_rate": 5.072929039665142e-06, "loss": 0.4654, "num_input_tokens_seen": 105963936, "step": 87125 }, { "epoch": 10.91717829845884, "grad_norm": 0.1864113062620163, "learning_rate": 5.072382383552284e-06, "loss": 0.4649, "num_input_tokens_seen": 105969920, "step": 87130 }, { "epoch": 10.917804786367624, "grad_norm": 0.13004718720912933, "learning_rate": 5.071835726574035e-06, "loss": 0.4534, "num_input_tokens_seen": 105976064, "step": 87135 }, { "epoch": 10.918431274276406, "grad_norm": 0.1112867221236229, "learning_rate": 5.0712890687369295e-06, "loss": 0.4565, "num_input_tokens_seen": 105982336, "step": 87140 }, { "epoch": 10.91905776218519, "grad_norm": 0.15579472482204437, "learning_rate": 5.070742410047503e-06, "loss": 0.4613, "num_input_tokens_seen": 105988896, "step": 87145 }, { "epoch": 10.919684250093972, "grad_norm": 0.1636120229959488, "learning_rate": 5.070195750512291e-06, "loss": 0.4599, "num_input_tokens_seen": 105995008, "step": 87150 }, { "epoch": 10.920310738002756, "grad_norm": 0.1865576207637787, "learning_rate": 5.069649090137831e-06, "loss": 0.4546, "num_input_tokens_seen": 106001120, "step": 87155 }, { "epoch": 10.92093722591154, "grad_norm": 0.21295876801013947, "learning_rate": 5.069102428930656e-06, "loss": 0.4622, "num_input_tokens_seen": 106007424, "step": 87160 }, { "epoch": 10.921563713820323, "grad_norm": 0.1561630666255951, "learning_rate": 5.068555766897304e-06, "loss": 0.4601, "num_input_tokens_seen": 106013632, "step": 87165 }, { "epoch": 10.922190201729107, "grad_norm": 0.22290728986263275, "learning_rate": 5.06800910404431e-06, "loss": 0.4545, "num_input_tokens_seen": 106019200, "step": 87170 }, { "epoch": 10.92281668963789, "grad_norm": 0.21203719079494476, "learning_rate": 5.0674624403782115e-06, "loss": 0.4607, "num_input_tokens_seen": 106025440, "step": 87175 }, { "epoch": 10.923443177546673, "grad_norm": 0.25557342171669006, "learning_rate": 5.06691577590554e-06, "loss": 0.4615, "num_input_tokens_seen": 106031136, "step": 87180 }, { "epoch": 10.924069665455457, "grad_norm": 0.3457026183605194, "learning_rate": 5.066369110632836e-06, "loss": 0.4519, "num_input_tokens_seen": 106037216, "step": 87185 }, { "epoch": 10.92469615336424, "grad_norm": 0.2160278707742691, "learning_rate": 5.065822444566634e-06, "loss": 0.4659, "num_input_tokens_seen": 106042720, "step": 87190 }, { "epoch": 10.925322641273024, "grad_norm": 0.32340577244758606, "learning_rate": 5.065275777713469e-06, "loss": 0.4653, "num_input_tokens_seen": 106048448, "step": 87195 }, { "epoch": 10.925949129181808, "grad_norm": 0.24400271475315094, "learning_rate": 5.064729110079877e-06, "loss": 0.4535, "num_input_tokens_seen": 106054336, "step": 87200 }, { "epoch": 10.92657561709059, "grad_norm": 0.22789114713668823, "learning_rate": 5.064182441672392e-06, "loss": 0.458, "num_input_tokens_seen": 106059584, "step": 87205 }, { "epoch": 10.927202104999374, "grad_norm": 0.35374465584754944, "learning_rate": 5.063635772497555e-06, "loss": 0.4572, "num_input_tokens_seen": 106065376, "step": 87210 }, { "epoch": 10.927828592908156, "grad_norm": 0.2093348354101181, "learning_rate": 5.0630891025618974e-06, "loss": 0.454, "num_input_tokens_seen": 106071520, "step": 87215 }, { "epoch": 10.92845508081694, "grad_norm": 1.2337589263916016, "learning_rate": 5.0625424318719566e-06, "loss": 0.4576, "num_input_tokens_seen": 106077824, "step": 87220 }, { "epoch": 10.929081568725724, "grad_norm": 0.8243001103401184, "learning_rate": 5.0619957604342686e-06, "loss": 0.4601, "num_input_tokens_seen": 106084064, "step": 87225 }, { "epoch": 10.929708056634507, "grad_norm": 0.8492448329925537, "learning_rate": 5.061449088255369e-06, "loss": 0.4679, "num_input_tokens_seen": 106090144, "step": 87230 }, { "epoch": 10.93033454454329, "grad_norm": 0.45813509821891785, "learning_rate": 5.060902415341793e-06, "loss": 0.4754, "num_input_tokens_seen": 106096000, "step": 87235 }, { "epoch": 10.930961032452073, "grad_norm": 0.3848157525062561, "learning_rate": 5.06035574170008e-06, "loss": 0.4545, "num_input_tokens_seen": 106102048, "step": 87240 }, { "epoch": 10.931587520360857, "grad_norm": 0.9995248317718506, "learning_rate": 5.05980906733676e-06, "loss": 0.4788, "num_input_tokens_seen": 106108160, "step": 87245 }, { "epoch": 10.932214008269641, "grad_norm": 0.43608716130256653, "learning_rate": 5.059262392258375e-06, "loss": 0.4565, "num_input_tokens_seen": 106114400, "step": 87250 }, { "epoch": 10.932840496178423, "grad_norm": 0.1542089581489563, "learning_rate": 5.058715716471459e-06, "loss": 0.4684, "num_input_tokens_seen": 106120512, "step": 87255 }, { "epoch": 10.933466984087207, "grad_norm": 0.2007671445608139, "learning_rate": 5.058169039982545e-06, "loss": 0.4551, "num_input_tokens_seen": 106126272, "step": 87260 }, { "epoch": 10.93409347199599, "grad_norm": 4.439993858337402, "learning_rate": 5.057622362798173e-06, "loss": 0.4807, "num_input_tokens_seen": 106132480, "step": 87265 }, { "epoch": 10.934719959904774, "grad_norm": 1.1820793151855469, "learning_rate": 5.057075684924875e-06, "loss": 0.4624, "num_input_tokens_seen": 106138176, "step": 87270 }, { "epoch": 10.935346447813558, "grad_norm": 0.3082081973552704, "learning_rate": 5.0565290063691905e-06, "loss": 0.453, "num_input_tokens_seen": 106143904, "step": 87275 }, { "epoch": 10.93597293572234, "grad_norm": 0.9621371626853943, "learning_rate": 5.0559823271376525e-06, "loss": 0.4803, "num_input_tokens_seen": 106150240, "step": 87280 }, { "epoch": 10.936599423631124, "grad_norm": 0.6733846068382263, "learning_rate": 5.055435647236802e-06, "loss": 0.4695, "num_input_tokens_seen": 106156448, "step": 87285 }, { "epoch": 10.937225911539908, "grad_norm": 0.3103829622268677, "learning_rate": 5.054888966673168e-06, "loss": 0.4591, "num_input_tokens_seen": 106162432, "step": 87290 }, { "epoch": 10.93785239944869, "grad_norm": 0.20717176795005798, "learning_rate": 5.054342285453292e-06, "loss": 0.4683, "num_input_tokens_seen": 106168352, "step": 87295 }, { "epoch": 10.938478887357475, "grad_norm": 0.2770381569862366, "learning_rate": 5.053795603583709e-06, "loss": 0.468, "num_input_tokens_seen": 106174272, "step": 87300 }, { "epoch": 10.939105375266257, "grad_norm": 0.21858569979667664, "learning_rate": 5.053248921070954e-06, "loss": 0.4667, "num_input_tokens_seen": 106180096, "step": 87305 }, { "epoch": 10.93973186317504, "grad_norm": 0.20782463252544403, "learning_rate": 5.052702237921563e-06, "loss": 0.4643, "num_input_tokens_seen": 106185760, "step": 87310 }, { "epoch": 10.940358351083823, "grad_norm": 0.18846143782138824, "learning_rate": 5.052155554142073e-06, "loss": 0.4605, "num_input_tokens_seen": 106192256, "step": 87315 }, { "epoch": 10.940984838992607, "grad_norm": 0.503854513168335, "learning_rate": 5.05160886973902e-06, "loss": 0.4504, "num_input_tokens_seen": 106198656, "step": 87320 }, { "epoch": 10.941611326901391, "grad_norm": 0.17259712517261505, "learning_rate": 5.051062184718938e-06, "loss": 0.4647, "num_input_tokens_seen": 106204640, "step": 87325 }, { "epoch": 10.942237814810174, "grad_norm": 0.1686154007911682, "learning_rate": 5.050515499088364e-06, "loss": 0.4611, "num_input_tokens_seen": 106211040, "step": 87330 }, { "epoch": 10.942864302718958, "grad_norm": 0.26796165108680725, "learning_rate": 5.049968812853837e-06, "loss": 0.4587, "num_input_tokens_seen": 106217312, "step": 87335 }, { "epoch": 10.943490790627742, "grad_norm": 0.11190160363912582, "learning_rate": 5.04942212602189e-06, "loss": 0.4517, "num_input_tokens_seen": 106223296, "step": 87340 }, { "epoch": 10.944117278536524, "grad_norm": 0.31479859352111816, "learning_rate": 5.048875438599059e-06, "loss": 0.4604, "num_input_tokens_seen": 106229376, "step": 87345 }, { "epoch": 10.944743766445308, "grad_norm": 0.17910614609718323, "learning_rate": 5.048328750591882e-06, "loss": 0.4717, "num_input_tokens_seen": 106235680, "step": 87350 }, { "epoch": 10.94537025435409, "grad_norm": 0.20244789123535156, "learning_rate": 5.047782062006894e-06, "loss": 0.4712, "num_input_tokens_seen": 106241504, "step": 87355 }, { "epoch": 10.945996742262874, "grad_norm": 0.1443825364112854, "learning_rate": 5.047235372850632e-06, "loss": 0.4596, "num_input_tokens_seen": 106247744, "step": 87360 }, { "epoch": 10.946623230171658, "grad_norm": 0.17139971256256104, "learning_rate": 5.04668868312963e-06, "loss": 0.4643, "num_input_tokens_seen": 106253984, "step": 87365 }, { "epoch": 10.94724971808044, "grad_norm": 0.13855621218681335, "learning_rate": 5.0461419928504276e-06, "loss": 0.4612, "num_input_tokens_seen": 106260672, "step": 87370 }, { "epoch": 10.947876205989225, "grad_norm": 0.19986900687217712, "learning_rate": 5.045595302019557e-06, "loss": 0.4645, "num_input_tokens_seen": 106266656, "step": 87375 }, { "epoch": 10.948502693898007, "grad_norm": 0.15309391915798187, "learning_rate": 5.045048610643557e-06, "loss": 0.4702, "num_input_tokens_seen": 106272544, "step": 87380 }, { "epoch": 10.949129181806791, "grad_norm": 0.15667971968650818, "learning_rate": 5.044501918728963e-06, "loss": 0.4745, "num_input_tokens_seen": 106278848, "step": 87385 }, { "epoch": 10.949755669715575, "grad_norm": 0.2545292377471924, "learning_rate": 5.04395522628231e-06, "loss": 0.4684, "num_input_tokens_seen": 106283712, "step": 87390 }, { "epoch": 10.950382157624357, "grad_norm": 0.4017650783061981, "learning_rate": 5.043408533310137e-06, "loss": 0.4618, "num_input_tokens_seen": 106289440, "step": 87395 }, { "epoch": 10.951008645533141, "grad_norm": 0.12209098786115646, "learning_rate": 5.042861839818977e-06, "loss": 0.4699, "num_input_tokens_seen": 106295488, "step": 87400 }, { "epoch": 10.951635133441924, "grad_norm": 0.13674351572990417, "learning_rate": 5.042315145815368e-06, "loss": 0.4644, "num_input_tokens_seen": 106301600, "step": 87405 }, { "epoch": 10.952261621350708, "grad_norm": 0.13283517956733704, "learning_rate": 5.041768451305846e-06, "loss": 0.463, "num_input_tokens_seen": 106307904, "step": 87410 }, { "epoch": 10.952888109259492, "grad_norm": 0.1436792016029358, "learning_rate": 5.0412217562969476e-06, "loss": 0.4551, "num_input_tokens_seen": 106314016, "step": 87415 }, { "epoch": 10.953514597168274, "grad_norm": 0.22021141648292542, "learning_rate": 5.0406750607952085e-06, "loss": 0.461, "num_input_tokens_seen": 106320000, "step": 87420 }, { "epoch": 10.954141085077058, "grad_norm": 0.11365063488483429, "learning_rate": 5.040128364807164e-06, "loss": 0.4702, "num_input_tokens_seen": 106325408, "step": 87425 }, { "epoch": 10.95476757298584, "grad_norm": 0.1446288824081421, "learning_rate": 5.039581668339352e-06, "loss": 0.4583, "num_input_tokens_seen": 106331744, "step": 87430 }, { "epoch": 10.955394060894625, "grad_norm": 0.1601998656988144, "learning_rate": 5.039034971398307e-06, "loss": 0.4651, "num_input_tokens_seen": 106337856, "step": 87435 }, { "epoch": 10.956020548803409, "grad_norm": 0.12543202936649323, "learning_rate": 5.038488273990566e-06, "loss": 0.4593, "num_input_tokens_seen": 106344064, "step": 87440 }, { "epoch": 10.95664703671219, "grad_norm": 0.15272650122642517, "learning_rate": 5.037941576122667e-06, "loss": 0.4661, "num_input_tokens_seen": 106350400, "step": 87445 }, { "epoch": 10.957273524620975, "grad_norm": 0.1749139130115509, "learning_rate": 5.037394877801143e-06, "loss": 0.4576, "num_input_tokens_seen": 106356544, "step": 87450 }, { "epoch": 10.957900012529759, "grad_norm": 0.15249648690223694, "learning_rate": 5.036848179032533e-06, "loss": 0.46, "num_input_tokens_seen": 106362688, "step": 87455 }, { "epoch": 10.958526500438541, "grad_norm": 0.21853460371494293, "learning_rate": 5.0363014798233714e-06, "loss": 0.461, "num_input_tokens_seen": 106368992, "step": 87460 }, { "epoch": 10.959152988347325, "grad_norm": 0.14393901824951172, "learning_rate": 5.035754780180194e-06, "loss": 0.4575, "num_input_tokens_seen": 106375136, "step": 87465 }, { "epoch": 10.959779476256108, "grad_norm": 0.14862856268882751, "learning_rate": 5.035208080109539e-06, "loss": 0.4629, "num_input_tokens_seen": 106381376, "step": 87470 }, { "epoch": 10.960405964164892, "grad_norm": 0.20291432738304138, "learning_rate": 5.034661379617942e-06, "loss": 0.4677, "num_input_tokens_seen": 106388032, "step": 87475 }, { "epoch": 10.961032452073676, "grad_norm": 0.15763968229293823, "learning_rate": 5.0341146787119396e-06, "loss": 0.4615, "num_input_tokens_seen": 106394016, "step": 87480 }, { "epoch": 10.961658939982458, "grad_norm": 0.2950218915939331, "learning_rate": 5.033567977398067e-06, "loss": 0.4629, "num_input_tokens_seen": 106399360, "step": 87485 }, { "epoch": 10.962285427891242, "grad_norm": 0.1737259030342102, "learning_rate": 5.03302127568286e-06, "loss": 0.4617, "num_input_tokens_seen": 106405728, "step": 87490 }, { "epoch": 10.962911915800024, "grad_norm": 0.16477127373218536, "learning_rate": 5.032474573572857e-06, "loss": 0.4678, "num_input_tokens_seen": 106411520, "step": 87495 }, { "epoch": 10.963538403708808, "grad_norm": 0.16047008335590363, "learning_rate": 5.031927871074593e-06, "loss": 0.4576, "num_input_tokens_seen": 106417600, "step": 87500 }, { "epoch": 10.964164891617592, "grad_norm": 0.19794854521751404, "learning_rate": 5.031381168194604e-06, "loss": 0.4587, "num_input_tokens_seen": 106423584, "step": 87505 }, { "epoch": 10.964791379526375, "grad_norm": 0.12211957573890686, "learning_rate": 5.030834464939428e-06, "loss": 0.4579, "num_input_tokens_seen": 106429536, "step": 87510 }, { "epoch": 10.965417867435159, "grad_norm": 0.16161011159420013, "learning_rate": 5.0302877613156e-06, "loss": 0.4624, "num_input_tokens_seen": 106435392, "step": 87515 }, { "epoch": 10.966044355343941, "grad_norm": 0.14015363156795502, "learning_rate": 5.0297410573296555e-06, "loss": 0.4708, "num_input_tokens_seen": 106440928, "step": 87520 }, { "epoch": 10.966670843252725, "grad_norm": 0.2563917934894562, "learning_rate": 5.029194352988132e-06, "loss": 0.4647, "num_input_tokens_seen": 106447232, "step": 87525 }, { "epoch": 10.96729733116151, "grad_norm": 0.1543481945991516, "learning_rate": 5.028647648297564e-06, "loss": 0.4588, "num_input_tokens_seen": 106453600, "step": 87530 }, { "epoch": 10.967923819070291, "grad_norm": 0.15298418700695038, "learning_rate": 5.028100943264491e-06, "loss": 0.4505, "num_input_tokens_seen": 106459552, "step": 87535 }, { "epoch": 10.968550306979076, "grad_norm": 0.14354951679706573, "learning_rate": 5.0275542378954475e-06, "loss": 0.4644, "num_input_tokens_seen": 106465824, "step": 87540 }, { "epoch": 10.969176794887858, "grad_norm": 0.13821132481098175, "learning_rate": 5.027007532196968e-06, "loss": 0.4679, "num_input_tokens_seen": 106472160, "step": 87545 }, { "epoch": 10.969803282796642, "grad_norm": 0.11317924410104752, "learning_rate": 5.026460826175593e-06, "loss": 0.4593, "num_input_tokens_seen": 106478176, "step": 87550 }, { "epoch": 10.970429770705426, "grad_norm": 0.13737192749977112, "learning_rate": 5.025914119837856e-06, "loss": 0.4662, "num_input_tokens_seen": 106484480, "step": 87555 }, { "epoch": 10.971056258614208, "grad_norm": 0.19180691242218018, "learning_rate": 5.025367413190294e-06, "loss": 0.4611, "num_input_tokens_seen": 106490304, "step": 87560 }, { "epoch": 10.971682746522992, "grad_norm": 0.191737100481987, "learning_rate": 5.024820706239443e-06, "loss": 0.4528, "num_input_tokens_seen": 106496288, "step": 87565 }, { "epoch": 10.972309234431776, "grad_norm": 0.11649281531572342, "learning_rate": 5.02427399899184e-06, "loss": 0.4633, "num_input_tokens_seen": 106502240, "step": 87570 }, { "epoch": 10.972935722340559, "grad_norm": 0.1939552277326584, "learning_rate": 5.0237272914540205e-06, "loss": 0.4537, "num_input_tokens_seen": 106508352, "step": 87575 }, { "epoch": 10.973562210249343, "grad_norm": 0.1637965887784958, "learning_rate": 5.023180583632522e-06, "loss": 0.4745, "num_input_tokens_seen": 106514208, "step": 87580 }, { "epoch": 10.974188698158125, "grad_norm": 0.11808837205171585, "learning_rate": 5.022633875533879e-06, "loss": 0.4659, "num_input_tokens_seen": 106520320, "step": 87585 }, { "epoch": 10.974815186066909, "grad_norm": 0.19221429526805878, "learning_rate": 5.022087167164629e-06, "loss": 0.4686, "num_input_tokens_seen": 106526432, "step": 87590 }, { "epoch": 10.975441673975693, "grad_norm": 0.18193639814853668, "learning_rate": 5.0215404585313075e-06, "loss": 0.4539, "num_input_tokens_seen": 106532480, "step": 87595 }, { "epoch": 10.976068161884475, "grad_norm": 0.1391041874885559, "learning_rate": 5.0209937496404534e-06, "loss": 0.4622, "num_input_tokens_seen": 106538240, "step": 87600 }, { "epoch": 10.97669464979326, "grad_norm": 0.2574011981487274, "learning_rate": 5.020447040498601e-06, "loss": 0.4602, "num_input_tokens_seen": 106544576, "step": 87605 }, { "epoch": 10.977321137702042, "grad_norm": 0.11219850182533264, "learning_rate": 5.019900331112287e-06, "loss": 0.459, "num_input_tokens_seen": 106550688, "step": 87610 }, { "epoch": 10.977947625610826, "grad_norm": 0.18241369724273682, "learning_rate": 5.019353621488048e-06, "loss": 0.4586, "num_input_tokens_seen": 106556320, "step": 87615 }, { "epoch": 10.97857411351961, "grad_norm": 0.2608264982700348, "learning_rate": 5.01880691163242e-06, "loss": 0.4633, "num_input_tokens_seen": 106562432, "step": 87620 }, { "epoch": 10.979200601428392, "grad_norm": 0.12936070561408997, "learning_rate": 5.01826020155194e-06, "loss": 0.4646, "num_input_tokens_seen": 106568288, "step": 87625 }, { "epoch": 10.979827089337176, "grad_norm": 0.1809614598751068, "learning_rate": 5.017713491253144e-06, "loss": 0.4645, "num_input_tokens_seen": 106574400, "step": 87630 }, { "epoch": 10.980453577245958, "grad_norm": 0.28265100717544556, "learning_rate": 5.017166780742567e-06, "loss": 0.4633, "num_input_tokens_seen": 106580480, "step": 87635 }, { "epoch": 10.981080065154742, "grad_norm": 0.14644992351531982, "learning_rate": 5.016620070026747e-06, "loss": 0.4645, "num_input_tokens_seen": 106586528, "step": 87640 }, { "epoch": 10.981706553063526, "grad_norm": 0.1608039140701294, "learning_rate": 5.0160733591122214e-06, "loss": 0.4559, "num_input_tokens_seen": 106592576, "step": 87645 }, { "epoch": 10.982333040972309, "grad_norm": 0.20268182456493378, "learning_rate": 5.015526648005523e-06, "loss": 0.4677, "num_input_tokens_seen": 106598880, "step": 87650 }, { "epoch": 10.982959528881093, "grad_norm": 0.12121966481208801, "learning_rate": 5.014979936713193e-06, "loss": 0.45, "num_input_tokens_seen": 106605152, "step": 87655 }, { "epoch": 10.983586016789875, "grad_norm": 0.17327980697155, "learning_rate": 5.014433225241764e-06, "loss": 0.4606, "num_input_tokens_seen": 106611616, "step": 87660 }, { "epoch": 10.98421250469866, "grad_norm": 0.20557370781898499, "learning_rate": 5.013886513597774e-06, "loss": 0.457, "num_input_tokens_seen": 106617824, "step": 87665 }, { "epoch": 10.984838992607443, "grad_norm": 0.21101082861423492, "learning_rate": 5.013339801787759e-06, "loss": 0.4616, "num_input_tokens_seen": 106623840, "step": 87670 }, { "epoch": 10.985465480516226, "grad_norm": 0.13318239152431488, "learning_rate": 5.012793089818255e-06, "loss": 0.4593, "num_input_tokens_seen": 106629472, "step": 87675 }, { "epoch": 10.98609196842501, "grad_norm": 0.1257934421300888, "learning_rate": 5.0122463776957995e-06, "loss": 0.4565, "num_input_tokens_seen": 106635584, "step": 87680 }, { "epoch": 10.986718456333794, "grad_norm": 0.16691334545612335, "learning_rate": 5.011699665426929e-06, "loss": 0.4643, "num_input_tokens_seen": 106641760, "step": 87685 }, { "epoch": 10.987344944242576, "grad_norm": 0.20070095360279083, "learning_rate": 5.011152953018178e-06, "loss": 0.4488, "num_input_tokens_seen": 106647840, "step": 87690 }, { "epoch": 10.98797143215136, "grad_norm": 0.15859095752239227, "learning_rate": 5.010606240476084e-06, "loss": 0.4583, "num_input_tokens_seen": 106653280, "step": 87695 }, { "epoch": 10.988597920060142, "grad_norm": 0.22073043882846832, "learning_rate": 5.0100595278071825e-06, "loss": 0.4623, "num_input_tokens_seen": 106659264, "step": 87700 }, { "epoch": 10.989224407968926, "grad_norm": 0.12760628759860992, "learning_rate": 5.009512815018013e-06, "loss": 0.4685, "num_input_tokens_seen": 106665248, "step": 87705 }, { "epoch": 10.98985089587771, "grad_norm": 0.22382842004299164, "learning_rate": 5.008966102115109e-06, "loss": 0.4583, "num_input_tokens_seen": 106671488, "step": 87710 }, { "epoch": 10.990477383786493, "grad_norm": 0.18283556401729584, "learning_rate": 5.008419389105007e-06, "loss": 0.466, "num_input_tokens_seen": 106677888, "step": 87715 }, { "epoch": 10.991103871695277, "grad_norm": 0.1322084367275238, "learning_rate": 5.007872675994244e-06, "loss": 0.4625, "num_input_tokens_seen": 106683840, "step": 87720 }, { "epoch": 10.991730359604059, "grad_norm": 0.14262175559997559, "learning_rate": 5.007325962789358e-06, "loss": 0.4625, "num_input_tokens_seen": 106689952, "step": 87725 }, { "epoch": 10.992356847512843, "grad_norm": 0.19254660606384277, "learning_rate": 5.006779249496884e-06, "loss": 0.4575, "num_input_tokens_seen": 106696256, "step": 87730 }, { "epoch": 10.992983335421627, "grad_norm": 0.18902719020843506, "learning_rate": 5.0062325361233575e-06, "loss": 0.458, "num_input_tokens_seen": 106702176, "step": 87735 }, { "epoch": 10.99360982333041, "grad_norm": 0.1592310220003128, "learning_rate": 5.0056858226753155e-06, "loss": 0.4574, "num_input_tokens_seen": 106708064, "step": 87740 }, { "epoch": 10.994236311239193, "grad_norm": 0.13770349323749542, "learning_rate": 5.005139109159296e-06, "loss": 0.4619, "num_input_tokens_seen": 106714368, "step": 87745 }, { "epoch": 10.994862799147976, "grad_norm": 0.14078132808208466, "learning_rate": 5.004592395581832e-06, "loss": 0.4666, "num_input_tokens_seen": 106720320, "step": 87750 }, { "epoch": 10.99548928705676, "grad_norm": 0.11960344016551971, "learning_rate": 5.004045681949463e-06, "loss": 0.4644, "num_input_tokens_seen": 106726688, "step": 87755 }, { "epoch": 10.996115774965544, "grad_norm": 0.13964737951755524, "learning_rate": 5.0034989682687255e-06, "loss": 0.4602, "num_input_tokens_seen": 106732128, "step": 87760 }, { "epoch": 10.996742262874326, "grad_norm": 0.13765308260917664, "learning_rate": 5.002952254546154e-06, "loss": 0.4591, "num_input_tokens_seen": 106738208, "step": 87765 }, { "epoch": 10.99736875078311, "grad_norm": 0.11971006542444229, "learning_rate": 5.002405540788287e-06, "loss": 0.4637, "num_input_tokens_seen": 106744416, "step": 87770 }, { "epoch": 10.997995238691892, "grad_norm": 0.14348822832107544, "learning_rate": 5.001858827001657e-06, "loss": 0.4661, "num_input_tokens_seen": 106750336, "step": 87775 }, { "epoch": 10.998621726600676, "grad_norm": 0.20558315515518188, "learning_rate": 5.001312113192805e-06, "loss": 0.4661, "num_input_tokens_seen": 106756640, "step": 87780 }, { "epoch": 10.99924821450946, "grad_norm": 0.13793408870697021, "learning_rate": 5.000765399368266e-06, "loss": 0.4563, "num_input_tokens_seen": 106762816, "step": 87785 }, { "epoch": 10.999874702418243, "grad_norm": 0.14937129616737366, "learning_rate": 5.000218685534574e-06, "loss": 0.474, "num_input_tokens_seen": 106769344, "step": 87790 }, { "epoch": 11.000501190327027, "grad_norm": 0.15370161831378937, "learning_rate": 4.99967197169827e-06, "loss": 0.4543, "num_input_tokens_seen": 106775296, "step": 87795 }, { "epoch": 11.00112767823581, "grad_norm": 0.15265731513500214, "learning_rate": 4.999125257865887e-06, "loss": 0.4686, "num_input_tokens_seen": 106781120, "step": 87800 }, { "epoch": 11.001754166144593, "grad_norm": 0.20544759929180145, "learning_rate": 4.998578544043961e-06, "loss": 0.4696, "num_input_tokens_seen": 106787040, "step": 87805 }, { "epoch": 11.002380654053377, "grad_norm": 0.1439930647611618, "learning_rate": 4.99803183023903e-06, "loss": 0.4628, "num_input_tokens_seen": 106793280, "step": 87810 }, { "epoch": 11.00300714196216, "grad_norm": 0.13396400213241577, "learning_rate": 4.997485116457632e-06, "loss": 0.4648, "num_input_tokens_seen": 106798944, "step": 87815 }, { "epoch": 11.003633629870944, "grad_norm": 0.14316944777965546, "learning_rate": 4.9969384027063e-06, "loss": 0.4622, "num_input_tokens_seen": 106805120, "step": 87820 }, { "epoch": 11.004260117779728, "grad_norm": 0.22244103252887726, "learning_rate": 4.996391688991573e-06, "loss": 0.4519, "num_input_tokens_seen": 106811360, "step": 87825 }, { "epoch": 11.00488660568851, "grad_norm": 0.14004459977149963, "learning_rate": 4.995844975319985e-06, "loss": 0.4564, "num_input_tokens_seen": 106817504, "step": 87830 }, { "epoch": 11.005513093597294, "grad_norm": 0.1615789532661438, "learning_rate": 4.995298261698076e-06, "loss": 0.4612, "num_input_tokens_seen": 106823424, "step": 87835 }, { "epoch": 11.006139581506076, "grad_norm": 0.16123047471046448, "learning_rate": 4.994751548132378e-06, "loss": 0.4583, "num_input_tokens_seen": 106829376, "step": 87840 }, { "epoch": 11.00676606941486, "grad_norm": 0.1425022929906845, "learning_rate": 4.99420483462943e-06, "loss": 0.4687, "num_input_tokens_seen": 106835328, "step": 87845 }, { "epoch": 11.007392557323644, "grad_norm": 0.2408151626586914, "learning_rate": 4.993658121195769e-06, "loss": 0.4671, "num_input_tokens_seen": 106841600, "step": 87850 }, { "epoch": 11.008019045232427, "grad_norm": 0.13114772737026215, "learning_rate": 4.99311140783793e-06, "loss": 0.4574, "num_input_tokens_seen": 106847872, "step": 87855 }, { "epoch": 11.00864553314121, "grad_norm": 0.1250971406698227, "learning_rate": 4.992564694562449e-06, "loss": 0.4522, "num_input_tokens_seen": 106853920, "step": 87860 }, { "epoch": 11.009272021049993, "grad_norm": 0.15371868014335632, "learning_rate": 4.9920179813758655e-06, "loss": 0.4747, "num_input_tokens_seen": 106860224, "step": 87865 }, { "epoch": 11.009898508958777, "grad_norm": 0.11721276491880417, "learning_rate": 4.9914712682847114e-06, "loss": 0.4797, "num_input_tokens_seen": 106866336, "step": 87870 }, { "epoch": 11.010524996867561, "grad_norm": 0.10910683125257492, "learning_rate": 4.990924555295526e-06, "loss": 0.4658, "num_input_tokens_seen": 106872512, "step": 87875 }, { "epoch": 11.011151484776343, "grad_norm": 0.1802665889263153, "learning_rate": 4.990377842414847e-06, "loss": 0.4605, "num_input_tokens_seen": 106878400, "step": 87880 }, { "epoch": 11.011777972685127, "grad_norm": 0.17021778225898743, "learning_rate": 4.9898311296492065e-06, "loss": 0.4642, "num_input_tokens_seen": 106884640, "step": 87885 }, { "epoch": 11.01240446059391, "grad_norm": 0.11914506554603577, "learning_rate": 4.989284417005146e-06, "loss": 0.4564, "num_input_tokens_seen": 106890400, "step": 87890 }, { "epoch": 11.013030948502694, "grad_norm": 0.1276027262210846, "learning_rate": 4.988737704489197e-06, "loss": 0.4666, "num_input_tokens_seen": 106896640, "step": 87895 }, { "epoch": 11.013657436411478, "grad_norm": 0.11704915761947632, "learning_rate": 4.9881909921079e-06, "loss": 0.4597, "num_input_tokens_seen": 106902720, "step": 87900 }, { "epoch": 11.01428392432026, "grad_norm": 0.16445215046405792, "learning_rate": 4.9876442798677875e-06, "loss": 0.4655, "num_input_tokens_seen": 106908832, "step": 87905 }, { "epoch": 11.014910412229044, "grad_norm": 0.11970984190702438, "learning_rate": 4.9870975677754005e-06, "loss": 0.4593, "num_input_tokens_seen": 106914624, "step": 87910 }, { "epoch": 11.015536900137826, "grad_norm": 0.13815715909004211, "learning_rate": 4.986550855837271e-06, "loss": 0.4635, "num_input_tokens_seen": 106920672, "step": 87915 }, { "epoch": 11.01616338804661, "grad_norm": 0.24067296087741852, "learning_rate": 4.986004144059939e-06, "loss": 0.4668, "num_input_tokens_seen": 106927232, "step": 87920 }, { "epoch": 11.016789875955395, "grad_norm": 0.10751663893461227, "learning_rate": 4.985457432449937e-06, "loss": 0.4603, "num_input_tokens_seen": 106933536, "step": 87925 }, { "epoch": 11.017416363864177, "grad_norm": 0.15176977217197418, "learning_rate": 4.984910721013804e-06, "loss": 0.4642, "num_input_tokens_seen": 106939520, "step": 87930 }, { "epoch": 11.018042851772961, "grad_norm": 0.11846164613962173, "learning_rate": 4.984364009758078e-06, "loss": 0.4584, "num_input_tokens_seen": 106945088, "step": 87935 }, { "epoch": 11.018669339681745, "grad_norm": 0.130848228931427, "learning_rate": 4.983817298689292e-06, "loss": 0.4613, "num_input_tokens_seen": 106951488, "step": 87940 }, { "epoch": 11.019295827590527, "grad_norm": 0.11107141524553299, "learning_rate": 4.983270587813985e-06, "loss": 0.4728, "num_input_tokens_seen": 106957600, "step": 87945 }, { "epoch": 11.019922315499311, "grad_norm": 0.0980520248413086, "learning_rate": 4.9827238771386905e-06, "loss": 0.4571, "num_input_tokens_seen": 106963872, "step": 87950 }, { "epoch": 11.020548803408094, "grad_norm": 0.14363108575344086, "learning_rate": 4.982177166669948e-06, "loss": 0.4653, "num_input_tokens_seen": 106970048, "step": 87955 }, { "epoch": 11.021175291316878, "grad_norm": 0.20634214580059052, "learning_rate": 4.98163045641429e-06, "loss": 0.4647, "num_input_tokens_seen": 106976384, "step": 87960 }, { "epoch": 11.021801779225662, "grad_norm": 0.14608944952487946, "learning_rate": 4.981083746378258e-06, "loss": 0.4546, "num_input_tokens_seen": 106982752, "step": 87965 }, { "epoch": 11.022428267134444, "grad_norm": 0.11933421343564987, "learning_rate": 4.980537036568385e-06, "loss": 0.4673, "num_input_tokens_seen": 106988800, "step": 87970 }, { "epoch": 11.023054755043228, "grad_norm": 0.19050715863704681, "learning_rate": 4.9799903269912086e-06, "loss": 0.4713, "num_input_tokens_seen": 106994784, "step": 87975 }, { "epoch": 11.02368124295201, "grad_norm": 0.1348005086183548, "learning_rate": 4.9794436176532634e-06, "loss": 0.4493, "num_input_tokens_seen": 107000320, "step": 87980 }, { "epoch": 11.024307730860794, "grad_norm": 0.14232312142848969, "learning_rate": 4.978896908561089e-06, "loss": 0.4637, "num_input_tokens_seen": 107006016, "step": 87985 }, { "epoch": 11.024934218769578, "grad_norm": 0.14915570616722107, "learning_rate": 4.978350199721217e-06, "loss": 0.4602, "num_input_tokens_seen": 107012224, "step": 87990 }, { "epoch": 11.02556070667836, "grad_norm": 0.1527331918478012, "learning_rate": 4.977803491140187e-06, "loss": 0.4581, "num_input_tokens_seen": 107018176, "step": 87995 }, { "epoch": 11.026187194587145, "grad_norm": 0.1688939779996872, "learning_rate": 4.977256782824537e-06, "loss": 0.4528, "num_input_tokens_seen": 107024448, "step": 88000 }, { "epoch": 11.026813682495927, "grad_norm": 0.14130227267742157, "learning_rate": 4.9767100747808e-06, "loss": 0.4573, "num_input_tokens_seen": 107030240, "step": 88005 }, { "epoch": 11.027440170404711, "grad_norm": 0.24696004390716553, "learning_rate": 4.976163367015514e-06, "loss": 0.4641, "num_input_tokens_seen": 107036320, "step": 88010 }, { "epoch": 11.028066658313495, "grad_norm": 0.14935268461704254, "learning_rate": 4.975616659535214e-06, "loss": 0.4534, "num_input_tokens_seen": 107042720, "step": 88015 }, { "epoch": 11.028693146222277, "grad_norm": 0.174608513712883, "learning_rate": 4.975069952346439e-06, "loss": 0.4673, "num_input_tokens_seen": 107048352, "step": 88020 }, { "epoch": 11.029319634131062, "grad_norm": 0.1723729968070984, "learning_rate": 4.974523245455722e-06, "loss": 0.4682, "num_input_tokens_seen": 107054464, "step": 88025 }, { "epoch": 11.029946122039844, "grad_norm": 0.12391901016235352, "learning_rate": 4.973976538869603e-06, "loss": 0.4653, "num_input_tokens_seen": 107060768, "step": 88030 }, { "epoch": 11.030572609948628, "grad_norm": 0.17133840918540955, "learning_rate": 4.973429832594615e-06, "loss": 0.4613, "num_input_tokens_seen": 107066912, "step": 88035 }, { "epoch": 11.031199097857412, "grad_norm": 0.1311502754688263, "learning_rate": 4.972883126637297e-06, "loss": 0.4695, "num_input_tokens_seen": 107073184, "step": 88040 }, { "epoch": 11.031825585766194, "grad_norm": 0.13548579812049866, "learning_rate": 4.9723364210041815e-06, "loss": 0.4711, "num_input_tokens_seen": 107079296, "step": 88045 }, { "epoch": 11.032452073674978, "grad_norm": 0.19807316362857819, "learning_rate": 4.971789715701809e-06, "loss": 0.474, "num_input_tokens_seen": 107084896, "step": 88050 }, { "epoch": 11.03307856158376, "grad_norm": 0.16306115686893463, "learning_rate": 4.9712430107367136e-06, "loss": 0.4603, "num_input_tokens_seen": 107091136, "step": 88055 }, { "epoch": 11.033705049492545, "grad_norm": 0.2232028990983963, "learning_rate": 4.9706963061154324e-06, "loss": 0.4596, "num_input_tokens_seen": 107097216, "step": 88060 }, { "epoch": 11.034331537401329, "grad_norm": 0.1470567137002945, "learning_rate": 4.970149601844502e-06, "loss": 0.4583, "num_input_tokens_seen": 107103552, "step": 88065 }, { "epoch": 11.034958025310111, "grad_norm": 0.1174507662653923, "learning_rate": 4.969602897930457e-06, "loss": 0.4703, "num_input_tokens_seen": 107109888, "step": 88070 }, { "epoch": 11.035584513218895, "grad_norm": 0.16288070380687714, "learning_rate": 4.969056194379837e-06, "loss": 0.4555, "num_input_tokens_seen": 107115776, "step": 88075 }, { "epoch": 11.036211001127679, "grad_norm": 0.14451220631599426, "learning_rate": 4.968509491199174e-06, "loss": 0.4596, "num_input_tokens_seen": 107121856, "step": 88080 }, { "epoch": 11.036837489036461, "grad_norm": 0.15305061638355255, "learning_rate": 4.9679627883950085e-06, "loss": 0.4634, "num_input_tokens_seen": 107128320, "step": 88085 }, { "epoch": 11.037463976945245, "grad_norm": 0.13554732501506805, "learning_rate": 4.967416085973873e-06, "loss": 0.4698, "num_input_tokens_seen": 107134464, "step": 88090 }, { "epoch": 11.038090464854028, "grad_norm": 0.18829604983329773, "learning_rate": 4.9668693839423075e-06, "loss": 0.4599, "num_input_tokens_seen": 107140576, "step": 88095 }, { "epoch": 11.038716952762812, "grad_norm": 0.14069563150405884, "learning_rate": 4.9663226823068446e-06, "loss": 0.4628, "num_input_tokens_seen": 107145376, "step": 88100 }, { "epoch": 11.039343440671596, "grad_norm": 0.12731315195560455, "learning_rate": 4.965775981074023e-06, "loss": 0.461, "num_input_tokens_seen": 107151456, "step": 88105 }, { "epoch": 11.039969928580378, "grad_norm": 0.18680359423160553, "learning_rate": 4.965229280250378e-06, "loss": 0.4586, "num_input_tokens_seen": 107157440, "step": 88110 }, { "epoch": 11.040596416489162, "grad_norm": 0.1749836504459381, "learning_rate": 4.964682579842446e-06, "loss": 0.4616, "num_input_tokens_seen": 107163520, "step": 88115 }, { "epoch": 11.041222904397944, "grad_norm": 0.13296277821063995, "learning_rate": 4.964135879856764e-06, "loss": 0.46, "num_input_tokens_seen": 107169632, "step": 88120 }, { "epoch": 11.041849392306728, "grad_norm": 0.12034427374601364, "learning_rate": 4.963589180299867e-06, "loss": 0.4601, "num_input_tokens_seen": 107175808, "step": 88125 }, { "epoch": 11.042475880215513, "grad_norm": 0.11950893700122833, "learning_rate": 4.963042481178294e-06, "loss": 0.4644, "num_input_tokens_seen": 107182272, "step": 88130 }, { "epoch": 11.043102368124295, "grad_norm": 0.14012956619262695, "learning_rate": 4.962495782498577e-06, "loss": 0.4542, "num_input_tokens_seen": 107188096, "step": 88135 }, { "epoch": 11.043728856033079, "grad_norm": 0.14412762224674225, "learning_rate": 4.961949084267256e-06, "loss": 0.4577, "num_input_tokens_seen": 107194144, "step": 88140 }, { "epoch": 11.044355343941861, "grad_norm": 0.1302870512008667, "learning_rate": 4.9614023864908645e-06, "loss": 0.4624, "num_input_tokens_seen": 107200288, "step": 88145 }, { "epoch": 11.044981831850645, "grad_norm": 0.19072647392749786, "learning_rate": 4.960855689175941e-06, "loss": 0.4641, "num_input_tokens_seen": 107206176, "step": 88150 }, { "epoch": 11.04560831975943, "grad_norm": 0.18835735321044922, "learning_rate": 4.960308992329019e-06, "loss": 0.4569, "num_input_tokens_seen": 107212224, "step": 88155 }, { "epoch": 11.046234807668212, "grad_norm": 0.1330517828464508, "learning_rate": 4.959762295956639e-06, "loss": 0.4556, "num_input_tokens_seen": 107218272, "step": 88160 }, { "epoch": 11.046861295576996, "grad_norm": 0.13793517649173737, "learning_rate": 4.959215600065331e-06, "loss": 0.4675, "num_input_tokens_seen": 107224288, "step": 88165 }, { "epoch": 11.047487783485778, "grad_norm": 0.16817882657051086, "learning_rate": 4.958668904661638e-06, "loss": 0.4585, "num_input_tokens_seen": 107230624, "step": 88170 }, { "epoch": 11.048114271394562, "grad_norm": 0.14299553632736206, "learning_rate": 4.958122209752091e-06, "loss": 0.4629, "num_input_tokens_seen": 107236448, "step": 88175 }, { "epoch": 11.048740759303346, "grad_norm": 0.21338298916816711, "learning_rate": 4.957575515343228e-06, "loss": 0.4609, "num_input_tokens_seen": 107242752, "step": 88180 }, { "epoch": 11.049367247212128, "grad_norm": 0.1188521757721901, "learning_rate": 4.957028821441588e-06, "loss": 0.4541, "num_input_tokens_seen": 107249184, "step": 88185 }, { "epoch": 11.049993735120912, "grad_norm": 0.16544164717197418, "learning_rate": 4.956482128053702e-06, "loss": 0.4668, "num_input_tokens_seen": 107255456, "step": 88190 }, { "epoch": 11.050620223029696, "grad_norm": 0.1614585667848587, "learning_rate": 4.95593543518611e-06, "loss": 0.4607, "num_input_tokens_seen": 107261088, "step": 88195 }, { "epoch": 11.051246710938479, "grad_norm": 0.12539735436439514, "learning_rate": 4.9553887428453466e-06, "loss": 0.4583, "num_input_tokens_seen": 107267264, "step": 88200 }, { "epoch": 11.051873198847263, "grad_norm": 0.16206598281860352, "learning_rate": 4.954842051037949e-06, "loss": 0.457, "num_input_tokens_seen": 107273696, "step": 88205 }, { "epoch": 11.052499686756045, "grad_norm": 0.1478525996208191, "learning_rate": 4.954295359770451e-06, "loss": 0.454, "num_input_tokens_seen": 107279808, "step": 88210 }, { "epoch": 11.053126174664829, "grad_norm": 0.1455412358045578, "learning_rate": 4.953748669049392e-06, "loss": 0.4565, "num_input_tokens_seen": 107285984, "step": 88215 }, { "epoch": 11.053752662573613, "grad_norm": 0.14307214319705963, "learning_rate": 4.953201978881304e-06, "loss": 0.461, "num_input_tokens_seen": 107291680, "step": 88220 }, { "epoch": 11.054379150482395, "grad_norm": 0.15318283438682556, "learning_rate": 4.952655289272729e-06, "loss": 0.4554, "num_input_tokens_seen": 107298080, "step": 88225 }, { "epoch": 11.05500563839118, "grad_norm": 0.15661300718784332, "learning_rate": 4.9521086002301976e-06, "loss": 0.4631, "num_input_tokens_seen": 107303872, "step": 88230 }, { "epoch": 11.055632126299962, "grad_norm": 0.18270978331565857, "learning_rate": 4.951561911760251e-06, "loss": 0.4563, "num_input_tokens_seen": 107309856, "step": 88235 }, { "epoch": 11.056258614208746, "grad_norm": 0.17289157211780548, "learning_rate": 4.951015223869419e-06, "loss": 0.4629, "num_input_tokens_seen": 107315936, "step": 88240 }, { "epoch": 11.05688510211753, "grad_norm": 0.16853056848049164, "learning_rate": 4.950468536564241e-06, "loss": 0.4607, "num_input_tokens_seen": 107322272, "step": 88245 }, { "epoch": 11.057511590026312, "grad_norm": 0.15500999987125397, "learning_rate": 4.949921849851257e-06, "loss": 0.4616, "num_input_tokens_seen": 107328416, "step": 88250 }, { "epoch": 11.058138077935096, "grad_norm": 0.15904006361961365, "learning_rate": 4.949375163736996e-06, "loss": 0.462, "num_input_tokens_seen": 107334464, "step": 88255 }, { "epoch": 11.058764565843878, "grad_norm": 0.11809322237968445, "learning_rate": 4.9488284782279985e-06, "loss": 0.456, "num_input_tokens_seen": 107340512, "step": 88260 }, { "epoch": 11.059391053752663, "grad_norm": 0.16470685601234436, "learning_rate": 4.9482817933308e-06, "loss": 0.4495, "num_input_tokens_seen": 107346816, "step": 88265 }, { "epoch": 11.060017541661447, "grad_norm": 0.18656811118125916, "learning_rate": 4.947735109051937e-06, "loss": 0.4622, "num_input_tokens_seen": 107352960, "step": 88270 }, { "epoch": 11.060644029570229, "grad_norm": 0.1705910861492157, "learning_rate": 4.947188425397942e-06, "loss": 0.4656, "num_input_tokens_seen": 107359040, "step": 88275 }, { "epoch": 11.061270517479013, "grad_norm": 0.13863155245780945, "learning_rate": 4.946641742375357e-06, "loss": 0.4583, "num_input_tokens_seen": 107365632, "step": 88280 }, { "epoch": 11.061897005387795, "grad_norm": 0.14503103494644165, "learning_rate": 4.946095059990712e-06, "loss": 0.4609, "num_input_tokens_seen": 107371872, "step": 88285 }, { "epoch": 11.06252349329658, "grad_norm": 0.2027560919523239, "learning_rate": 4.945548378250548e-06, "loss": 0.4632, "num_input_tokens_seen": 107378304, "step": 88290 }, { "epoch": 11.063149981205363, "grad_norm": 0.134700208902359, "learning_rate": 4.945001697161398e-06, "loss": 0.4622, "num_input_tokens_seen": 107384320, "step": 88295 }, { "epoch": 11.063776469114146, "grad_norm": 0.17418719828128815, "learning_rate": 4.944455016729798e-06, "loss": 0.4557, "num_input_tokens_seen": 107390016, "step": 88300 }, { "epoch": 11.06440295702293, "grad_norm": 0.20903626084327698, "learning_rate": 4.943908336962287e-06, "loss": 0.4522, "num_input_tokens_seen": 107395904, "step": 88305 }, { "epoch": 11.065029444931714, "grad_norm": 0.18068863451480865, "learning_rate": 4.943361657865397e-06, "loss": 0.4608, "num_input_tokens_seen": 107402240, "step": 88310 }, { "epoch": 11.065655932840496, "grad_norm": 0.24759043753147125, "learning_rate": 4.942814979445667e-06, "loss": 0.4674, "num_input_tokens_seen": 107408352, "step": 88315 }, { "epoch": 11.06628242074928, "grad_norm": 0.15104563534259796, "learning_rate": 4.9422683017096325e-06, "loss": 0.4555, "num_input_tokens_seen": 107414336, "step": 88320 }, { "epoch": 11.066908908658062, "grad_norm": 0.24305148422718048, "learning_rate": 4.941721624663828e-06, "loss": 0.4682, "num_input_tokens_seen": 107420384, "step": 88325 }, { "epoch": 11.067535396566846, "grad_norm": 0.16959208250045776, "learning_rate": 4.94117494831479e-06, "loss": 0.4584, "num_input_tokens_seen": 107426720, "step": 88330 }, { "epoch": 11.06816188447563, "grad_norm": 0.20510779321193695, "learning_rate": 4.940628272669058e-06, "loss": 0.4756, "num_input_tokens_seen": 107432832, "step": 88335 }, { "epoch": 11.068788372384413, "grad_norm": 0.21718651056289673, "learning_rate": 4.940081597733162e-06, "loss": 0.4648, "num_input_tokens_seen": 107438848, "step": 88340 }, { "epoch": 11.069414860293197, "grad_norm": 0.1883063167333603, "learning_rate": 4.939534923513642e-06, "loss": 0.4617, "num_input_tokens_seen": 107445280, "step": 88345 }, { "epoch": 11.070041348201979, "grad_norm": 0.22463558614253998, "learning_rate": 4.938988250017032e-06, "loss": 0.4624, "num_input_tokens_seen": 107451424, "step": 88350 }, { "epoch": 11.070667836110763, "grad_norm": 0.15030741691589355, "learning_rate": 4.938441577249871e-06, "loss": 0.4598, "num_input_tokens_seen": 107457536, "step": 88355 }, { "epoch": 11.071294324019547, "grad_norm": 0.19113844633102417, "learning_rate": 4.93789490521869e-06, "loss": 0.451, "num_input_tokens_seen": 107463200, "step": 88360 }, { "epoch": 11.07192081192833, "grad_norm": 0.1528317630290985, "learning_rate": 4.937348233930027e-06, "loss": 0.457, "num_input_tokens_seen": 107469184, "step": 88365 }, { "epoch": 11.072547299837113, "grad_norm": 0.178641214966774, "learning_rate": 4.93680156339042e-06, "loss": 0.4674, "num_input_tokens_seen": 107475488, "step": 88370 }, { "epoch": 11.073173787745896, "grad_norm": 0.17571747303009033, "learning_rate": 4.936254893606403e-06, "loss": 0.4575, "num_input_tokens_seen": 107481792, "step": 88375 }, { "epoch": 11.07380027565468, "grad_norm": 0.19378848373889923, "learning_rate": 4.935708224584515e-06, "loss": 0.4675, "num_input_tokens_seen": 107487904, "step": 88380 }, { "epoch": 11.074426763563464, "grad_norm": 0.19181807339191437, "learning_rate": 4.935161556331285e-06, "loss": 0.46, "num_input_tokens_seen": 107493920, "step": 88385 }, { "epoch": 11.075053251472246, "grad_norm": 0.20011237263679504, "learning_rate": 4.934614888853257e-06, "loss": 0.4621, "num_input_tokens_seen": 107500128, "step": 88390 }, { "epoch": 11.07567973938103, "grad_norm": 0.30680108070373535, "learning_rate": 4.93406822215696e-06, "loss": 0.466, "num_input_tokens_seen": 107506368, "step": 88395 }, { "epoch": 11.076306227289813, "grad_norm": 0.19389094412326813, "learning_rate": 4.9335215562489355e-06, "loss": 0.4718, "num_input_tokens_seen": 107512480, "step": 88400 }, { "epoch": 11.076932715198597, "grad_norm": 0.14511483907699585, "learning_rate": 4.932974891135715e-06, "loss": 0.4513, "num_input_tokens_seen": 107518496, "step": 88405 }, { "epoch": 11.07755920310738, "grad_norm": 0.20724046230316162, "learning_rate": 4.932428226823836e-06, "loss": 0.4597, "num_input_tokens_seen": 107524864, "step": 88410 }, { "epoch": 11.078185691016163, "grad_norm": 0.30871278047561646, "learning_rate": 4.931881563319835e-06, "loss": 0.4662, "num_input_tokens_seen": 107531136, "step": 88415 }, { "epoch": 11.078812178924947, "grad_norm": 0.20903030037879944, "learning_rate": 4.931334900630247e-06, "loss": 0.4737, "num_input_tokens_seen": 107536992, "step": 88420 }, { "epoch": 11.07943866683373, "grad_norm": 0.18361619114875793, "learning_rate": 4.9307882387616065e-06, "loss": 0.4513, "num_input_tokens_seen": 107543232, "step": 88425 }, { "epoch": 11.080065154742513, "grad_norm": 0.30935704708099365, "learning_rate": 4.930241577720452e-06, "loss": 0.463, "num_input_tokens_seen": 107549408, "step": 88430 }, { "epoch": 11.080691642651297, "grad_norm": 0.19964534044265747, "learning_rate": 4.929694917513319e-06, "loss": 0.4706, "num_input_tokens_seen": 107555776, "step": 88435 }, { "epoch": 11.08131813056008, "grad_norm": 0.21687985956668854, "learning_rate": 4.92914825814674e-06, "loss": 0.4548, "num_input_tokens_seen": 107561952, "step": 88440 }, { "epoch": 11.081944618468864, "grad_norm": 0.19481045007705688, "learning_rate": 4.928601599627256e-06, "loss": 0.4569, "num_input_tokens_seen": 107568256, "step": 88445 }, { "epoch": 11.082571106377648, "grad_norm": 0.18597455322742462, "learning_rate": 4.928054941961398e-06, "loss": 0.4622, "num_input_tokens_seen": 107574432, "step": 88450 }, { "epoch": 11.08319759428643, "grad_norm": 0.21714183688163757, "learning_rate": 4.927508285155705e-06, "loss": 0.4535, "num_input_tokens_seen": 107580224, "step": 88455 }, { "epoch": 11.083824082195214, "grad_norm": 0.18246139585971832, "learning_rate": 4.926961629216711e-06, "loss": 0.462, "num_input_tokens_seen": 107586400, "step": 88460 }, { "epoch": 11.084450570103996, "grad_norm": 0.18880178034305573, "learning_rate": 4.926414974150952e-06, "loss": 0.4675, "num_input_tokens_seen": 107592384, "step": 88465 }, { "epoch": 11.08507705801278, "grad_norm": 0.19683890044689178, "learning_rate": 4.925868319964963e-06, "loss": 0.455, "num_input_tokens_seen": 107598432, "step": 88470 }, { "epoch": 11.085703545921564, "grad_norm": 0.17111006379127502, "learning_rate": 4.925321666665281e-06, "loss": 0.4599, "num_input_tokens_seen": 107604672, "step": 88475 }, { "epoch": 11.086330033830347, "grad_norm": 0.1864851713180542, "learning_rate": 4.924775014258441e-06, "loss": 0.4539, "num_input_tokens_seen": 107611040, "step": 88480 }, { "epoch": 11.08695652173913, "grad_norm": 0.2639532685279846, "learning_rate": 4.924228362750981e-06, "loss": 0.4614, "num_input_tokens_seen": 107617088, "step": 88485 }, { "epoch": 11.087583009647913, "grad_norm": 0.22080673277378082, "learning_rate": 4.9236817121494316e-06, "loss": 0.4647, "num_input_tokens_seen": 107623200, "step": 88490 }, { "epoch": 11.088209497556697, "grad_norm": 0.22917240858078003, "learning_rate": 4.923135062460333e-06, "loss": 0.4631, "num_input_tokens_seen": 107629408, "step": 88495 }, { "epoch": 11.088835985465481, "grad_norm": 0.2704298496246338, "learning_rate": 4.92258841369022e-06, "loss": 0.4735, "num_input_tokens_seen": 107635360, "step": 88500 }, { "epoch": 11.089462473374263, "grad_norm": 0.2752035856246948, "learning_rate": 4.922041765845626e-06, "loss": 0.4684, "num_input_tokens_seen": 107641536, "step": 88505 }, { "epoch": 11.090088961283048, "grad_norm": 0.1838197261095047, "learning_rate": 4.92149511893309e-06, "loss": 0.4646, "num_input_tokens_seen": 107647424, "step": 88510 }, { "epoch": 11.09071544919183, "grad_norm": 0.18101690709590912, "learning_rate": 4.920948472959145e-06, "loss": 0.4667, "num_input_tokens_seen": 107653760, "step": 88515 }, { "epoch": 11.091341937100614, "grad_norm": 0.1501159369945526, "learning_rate": 4.920401827930327e-06, "loss": 0.4556, "num_input_tokens_seen": 107660000, "step": 88520 }, { "epoch": 11.091968425009398, "grad_norm": 0.22259289026260376, "learning_rate": 4.919855183853171e-06, "loss": 0.4584, "num_input_tokens_seen": 107666272, "step": 88525 }, { "epoch": 11.09259491291818, "grad_norm": 0.16554437577724457, "learning_rate": 4.919308540734217e-06, "loss": 0.4596, "num_input_tokens_seen": 107672448, "step": 88530 }, { "epoch": 11.093221400826964, "grad_norm": 0.2201850861310959, "learning_rate": 4.918761898579995e-06, "loss": 0.4565, "num_input_tokens_seen": 107678560, "step": 88535 }, { "epoch": 11.093847888735747, "grad_norm": 0.2035159021615982, "learning_rate": 4.9182152573970445e-06, "loss": 0.4503, "num_input_tokens_seen": 107684480, "step": 88540 }, { "epoch": 11.09447437664453, "grad_norm": 0.2542789578437805, "learning_rate": 4.9176686171918965e-06, "loss": 0.473, "num_input_tokens_seen": 107690432, "step": 88545 }, { "epoch": 11.095100864553315, "grad_norm": 0.23652566969394684, "learning_rate": 4.9171219779710906e-06, "loss": 0.4604, "num_input_tokens_seen": 107696704, "step": 88550 }, { "epoch": 11.095727352462097, "grad_norm": 0.302600234746933, "learning_rate": 4.916575339741162e-06, "loss": 0.4542, "num_input_tokens_seen": 107702848, "step": 88555 }, { "epoch": 11.096353840370881, "grad_norm": 0.335491418838501, "learning_rate": 4.916028702508644e-06, "loss": 0.4605, "num_input_tokens_seen": 107708928, "step": 88560 }, { "epoch": 11.096980328279665, "grad_norm": 0.2394682914018631, "learning_rate": 4.915482066280076e-06, "loss": 0.4517, "num_input_tokens_seen": 107714784, "step": 88565 }, { "epoch": 11.097606816188447, "grad_norm": 0.23507238924503326, "learning_rate": 4.914935431061988e-06, "loss": 0.4531, "num_input_tokens_seen": 107720896, "step": 88570 }, { "epoch": 11.098233304097231, "grad_norm": 0.3660341799259186, "learning_rate": 4.91438879686092e-06, "loss": 0.4619, "num_input_tokens_seen": 107727296, "step": 88575 }, { "epoch": 11.098859792006014, "grad_norm": 0.2174326777458191, "learning_rate": 4.913842163683405e-06, "loss": 0.4671, "num_input_tokens_seen": 107733152, "step": 88580 }, { "epoch": 11.099486279914798, "grad_norm": 0.23297150433063507, "learning_rate": 4.913295531535981e-06, "loss": 0.467, "num_input_tokens_seen": 107739584, "step": 88585 }, { "epoch": 11.100112767823582, "grad_norm": 0.24085018038749695, "learning_rate": 4.9127489004251795e-06, "loss": 0.4729, "num_input_tokens_seen": 107745984, "step": 88590 }, { "epoch": 11.100739255732364, "grad_norm": 0.3861185610294342, "learning_rate": 4.912202270357542e-06, "loss": 0.4681, "num_input_tokens_seen": 107752480, "step": 88595 }, { "epoch": 11.101365743641148, "grad_norm": 0.22878335416316986, "learning_rate": 4.911655641339597e-06, "loss": 0.4605, "num_input_tokens_seen": 107758688, "step": 88600 }, { "epoch": 11.10199223154993, "grad_norm": 0.25666937232017517, "learning_rate": 4.911109013377884e-06, "loss": 0.4567, "num_input_tokens_seen": 107764832, "step": 88605 }, { "epoch": 11.102618719458714, "grad_norm": 0.15055547654628754, "learning_rate": 4.910562386478937e-06, "loss": 0.4559, "num_input_tokens_seen": 107770976, "step": 88610 }, { "epoch": 11.103245207367499, "grad_norm": 0.15834757685661316, "learning_rate": 4.9100157606492906e-06, "loss": 0.4714, "num_input_tokens_seen": 107777184, "step": 88615 }, { "epoch": 11.10387169527628, "grad_norm": 0.16936133801937103, "learning_rate": 4.909469135895485e-06, "loss": 0.4645, "num_input_tokens_seen": 107783552, "step": 88620 }, { "epoch": 11.104498183185065, "grad_norm": 0.15489649772644043, "learning_rate": 4.908922512224049e-06, "loss": 0.4672, "num_input_tokens_seen": 107789824, "step": 88625 }, { "epoch": 11.105124671093847, "grad_norm": 0.24264872074127197, "learning_rate": 4.908375889641522e-06, "loss": 0.4646, "num_input_tokens_seen": 107795264, "step": 88630 }, { "epoch": 11.105751159002631, "grad_norm": 0.14610189199447632, "learning_rate": 4.907829268154437e-06, "loss": 0.4633, "num_input_tokens_seen": 107801696, "step": 88635 }, { "epoch": 11.106377646911415, "grad_norm": 0.17711588740348816, "learning_rate": 4.907282647769334e-06, "loss": 0.4593, "num_input_tokens_seen": 107807200, "step": 88640 }, { "epoch": 11.107004134820198, "grad_norm": 0.17937661707401276, "learning_rate": 4.906736028492741e-06, "loss": 0.4758, "num_input_tokens_seen": 107813216, "step": 88645 }, { "epoch": 11.107630622728982, "grad_norm": 0.19612354040145874, "learning_rate": 4.9061894103312e-06, "loss": 0.455, "num_input_tokens_seen": 107819200, "step": 88650 }, { "epoch": 11.108257110637764, "grad_norm": 0.18232659995555878, "learning_rate": 4.905642793291242e-06, "loss": 0.455, "num_input_tokens_seen": 107825216, "step": 88655 }, { "epoch": 11.108883598546548, "grad_norm": 0.16324067115783691, "learning_rate": 4.905096177379405e-06, "loss": 0.4631, "num_input_tokens_seen": 107831296, "step": 88660 }, { "epoch": 11.109510086455332, "grad_norm": 0.16125619411468506, "learning_rate": 4.904549562602221e-06, "loss": 0.4554, "num_input_tokens_seen": 107837024, "step": 88665 }, { "epoch": 11.110136574364114, "grad_norm": 0.2243296205997467, "learning_rate": 4.904002948966229e-06, "loss": 0.4628, "num_input_tokens_seen": 107843200, "step": 88670 }, { "epoch": 11.110763062272898, "grad_norm": 0.15825937688350677, "learning_rate": 4.903456336477961e-06, "loss": 0.4588, "num_input_tokens_seen": 107849696, "step": 88675 }, { "epoch": 11.11138955018168, "grad_norm": 0.1660904586315155, "learning_rate": 4.902909725143955e-06, "loss": 0.4531, "num_input_tokens_seen": 107855008, "step": 88680 }, { "epoch": 11.112016038090465, "grad_norm": 0.14651720225811005, "learning_rate": 4.902363114970744e-06, "loss": 0.4525, "num_input_tokens_seen": 107860928, "step": 88685 }, { "epoch": 11.112642525999249, "grad_norm": 0.31389158964157104, "learning_rate": 4.901816505964863e-06, "loss": 0.4578, "num_input_tokens_seen": 107866912, "step": 88690 }, { "epoch": 11.113269013908031, "grad_norm": 0.2726704180240631, "learning_rate": 4.901269898132851e-06, "loss": 0.4624, "num_input_tokens_seen": 107873280, "step": 88695 }, { "epoch": 11.113895501816815, "grad_norm": 0.19770249724388123, "learning_rate": 4.900723291481238e-06, "loss": 0.465, "num_input_tokens_seen": 107879328, "step": 88700 }, { "epoch": 11.1145219897256, "grad_norm": 0.1818673312664032, "learning_rate": 4.900176686016564e-06, "loss": 0.4595, "num_input_tokens_seen": 107885344, "step": 88705 }, { "epoch": 11.115148477634381, "grad_norm": 0.20540237426757812, "learning_rate": 4.899630081745359e-06, "loss": 0.4534, "num_input_tokens_seen": 107891552, "step": 88710 }, { "epoch": 11.115774965543165, "grad_norm": 0.2308991551399231, "learning_rate": 4.8990834786741635e-06, "loss": 0.4596, "num_input_tokens_seen": 107897856, "step": 88715 }, { "epoch": 11.116401453451948, "grad_norm": 0.19731509685516357, "learning_rate": 4.898536876809507e-06, "loss": 0.4603, "num_input_tokens_seen": 107903680, "step": 88720 }, { "epoch": 11.117027941360732, "grad_norm": 0.2106534093618393, "learning_rate": 4.897990276157929e-06, "loss": 0.4671, "num_input_tokens_seen": 107909344, "step": 88725 }, { "epoch": 11.117654429269516, "grad_norm": 0.22669140994548798, "learning_rate": 4.897443676725962e-06, "loss": 0.4693, "num_input_tokens_seen": 107915712, "step": 88730 }, { "epoch": 11.118280917178298, "grad_norm": 0.15279650688171387, "learning_rate": 4.896897078520143e-06, "loss": 0.4694, "num_input_tokens_seen": 107921472, "step": 88735 }, { "epoch": 11.118907405087082, "grad_norm": 0.16017594933509827, "learning_rate": 4.896350481547006e-06, "loss": 0.4596, "num_input_tokens_seen": 107927840, "step": 88740 }, { "epoch": 11.119533892995864, "grad_norm": 0.1909971386194229, "learning_rate": 4.895803885813085e-06, "loss": 0.4627, "num_input_tokens_seen": 107933984, "step": 88745 }, { "epoch": 11.120160380904649, "grad_norm": 0.2255268543958664, "learning_rate": 4.895257291324918e-06, "loss": 0.467, "num_input_tokens_seen": 107940288, "step": 88750 }, { "epoch": 11.120786868813433, "grad_norm": 0.21684589982032776, "learning_rate": 4.894710698089037e-06, "loss": 0.4617, "num_input_tokens_seen": 107946496, "step": 88755 }, { "epoch": 11.121413356722215, "grad_norm": 0.17878074944019318, "learning_rate": 4.894164106111979e-06, "loss": 0.4604, "num_input_tokens_seen": 107952832, "step": 88760 }, { "epoch": 11.122039844630999, "grad_norm": 0.2257278561592102, "learning_rate": 4.8936175154002774e-06, "loss": 0.4663, "num_input_tokens_seen": 107959456, "step": 88765 }, { "epoch": 11.122666332539781, "grad_norm": 0.19620800018310547, "learning_rate": 4.893070925960469e-06, "loss": 0.4616, "num_input_tokens_seen": 107965536, "step": 88770 }, { "epoch": 11.123292820448565, "grad_norm": 0.20989620685577393, "learning_rate": 4.892524337799086e-06, "loss": 0.4536, "num_input_tokens_seen": 107971776, "step": 88775 }, { "epoch": 11.12391930835735, "grad_norm": 0.22128136456012726, "learning_rate": 4.891977750922665e-06, "loss": 0.4609, "num_input_tokens_seen": 107977664, "step": 88780 }, { "epoch": 11.124545796266132, "grad_norm": 0.14299950003623962, "learning_rate": 4.8914311653377406e-06, "loss": 0.4607, "num_input_tokens_seen": 107983552, "step": 88785 }, { "epoch": 11.125172284174916, "grad_norm": 0.17357684671878815, "learning_rate": 4.89088458105085e-06, "loss": 0.4686, "num_input_tokens_seen": 107989824, "step": 88790 }, { "epoch": 11.125798772083698, "grad_norm": 0.1551797091960907, "learning_rate": 4.890337998068524e-06, "loss": 0.4563, "num_input_tokens_seen": 107996160, "step": 88795 }, { "epoch": 11.126425259992482, "grad_norm": 0.1707514226436615, "learning_rate": 4.889791416397299e-06, "loss": 0.4582, "num_input_tokens_seen": 108001984, "step": 88800 }, { "epoch": 11.127051747901266, "grad_norm": 0.1492532193660736, "learning_rate": 4.889244836043712e-06, "loss": 0.4662, "num_input_tokens_seen": 108008480, "step": 88805 }, { "epoch": 11.127678235810048, "grad_norm": 0.14432744681835175, "learning_rate": 4.888698257014295e-06, "loss": 0.4731, "num_input_tokens_seen": 108014816, "step": 88810 }, { "epoch": 11.128304723718832, "grad_norm": 0.21370582282543182, "learning_rate": 4.888151679315585e-06, "loss": 0.4553, "num_input_tokens_seen": 108020384, "step": 88815 }, { "epoch": 11.128931211627616, "grad_norm": 0.25968286395072937, "learning_rate": 4.887605102954114e-06, "loss": 0.4756, "num_input_tokens_seen": 108026816, "step": 88820 }, { "epoch": 11.129557699536399, "grad_norm": 0.19050942361354828, "learning_rate": 4.88705852793642e-06, "loss": 0.4595, "num_input_tokens_seen": 108033344, "step": 88825 }, { "epoch": 11.130184187445183, "grad_norm": 0.19391289353370667, "learning_rate": 4.886511954269035e-06, "loss": 0.4515, "num_input_tokens_seen": 108039648, "step": 88830 }, { "epoch": 11.130810675353965, "grad_norm": 0.14192204177379608, "learning_rate": 4.8859653819584965e-06, "loss": 0.45, "num_input_tokens_seen": 108045856, "step": 88835 }, { "epoch": 11.13143716326275, "grad_norm": 0.1604330986738205, "learning_rate": 4.885418811011336e-06, "loss": 0.4588, "num_input_tokens_seen": 108052192, "step": 88840 }, { "epoch": 11.132063651171533, "grad_norm": 0.20414237678050995, "learning_rate": 4.884872241434092e-06, "loss": 0.4629, "num_input_tokens_seen": 108058272, "step": 88845 }, { "epoch": 11.132690139080315, "grad_norm": 0.14256417751312256, "learning_rate": 4.884325673233295e-06, "loss": 0.4677, "num_input_tokens_seen": 108064512, "step": 88850 }, { "epoch": 11.1333166269891, "grad_norm": 0.2020803540945053, "learning_rate": 4.883779106415484e-06, "loss": 0.4592, "num_input_tokens_seen": 108070816, "step": 88855 }, { "epoch": 11.133943114897882, "grad_norm": 0.14522382616996765, "learning_rate": 4.88323254098719e-06, "loss": 0.4658, "num_input_tokens_seen": 108077120, "step": 88860 }, { "epoch": 11.134569602806666, "grad_norm": 0.15699847042560577, "learning_rate": 4.882685976954948e-06, "loss": 0.4628, "num_input_tokens_seen": 108083136, "step": 88865 }, { "epoch": 11.13519609071545, "grad_norm": 0.15909618139266968, "learning_rate": 4.882139414325297e-06, "loss": 0.4554, "num_input_tokens_seen": 108089408, "step": 88870 }, { "epoch": 11.135822578624232, "grad_norm": 0.15598155558109283, "learning_rate": 4.8815928531047655e-06, "loss": 0.4624, "num_input_tokens_seen": 108095424, "step": 88875 }, { "epoch": 11.136449066533016, "grad_norm": 0.15467782318592072, "learning_rate": 4.8810462932998925e-06, "loss": 0.4681, "num_input_tokens_seen": 108101984, "step": 88880 }, { "epoch": 11.137075554441799, "grad_norm": 0.14744403958320618, "learning_rate": 4.8804997349172115e-06, "loss": 0.4621, "num_input_tokens_seen": 108108224, "step": 88885 }, { "epoch": 11.137702042350583, "grad_norm": 0.18549391627311707, "learning_rate": 4.879953177963256e-06, "loss": 0.469, "num_input_tokens_seen": 108113600, "step": 88890 }, { "epoch": 11.138328530259367, "grad_norm": 0.20631839334964752, "learning_rate": 4.87940662244456e-06, "loss": 0.4633, "num_input_tokens_seen": 108119776, "step": 88895 }, { "epoch": 11.138955018168149, "grad_norm": 0.13635210692882538, "learning_rate": 4.878860068367661e-06, "loss": 0.464, "num_input_tokens_seen": 108126080, "step": 88900 }, { "epoch": 11.139581506076933, "grad_norm": 0.16014912724494934, "learning_rate": 4.87831351573909e-06, "loss": 0.4626, "num_input_tokens_seen": 108132096, "step": 88905 }, { "epoch": 11.140207993985715, "grad_norm": 0.23989368975162506, "learning_rate": 4.877766964565386e-06, "loss": 0.4652, "num_input_tokens_seen": 108138208, "step": 88910 }, { "epoch": 11.1408344818945, "grad_norm": 0.2019432634115219, "learning_rate": 4.8772204148530784e-06, "loss": 0.4588, "num_input_tokens_seen": 108144192, "step": 88915 }, { "epoch": 11.141460969803283, "grad_norm": 0.15332503616809845, "learning_rate": 4.876673866608703e-06, "loss": 0.4693, "num_input_tokens_seen": 108150336, "step": 88920 }, { "epoch": 11.142087457712066, "grad_norm": 0.14107120037078857, "learning_rate": 4.876127319838798e-06, "loss": 0.4633, "num_input_tokens_seen": 108156672, "step": 88925 }, { "epoch": 11.14271394562085, "grad_norm": 0.2183936983346939, "learning_rate": 4.875580774549893e-06, "loss": 0.4558, "num_input_tokens_seen": 108162592, "step": 88930 }, { "epoch": 11.143340433529634, "grad_norm": 0.16839098930358887, "learning_rate": 4.875034230748525e-06, "loss": 0.4551, "num_input_tokens_seen": 108168384, "step": 88935 }, { "epoch": 11.143966921438416, "grad_norm": 0.2800016701221466, "learning_rate": 4.874487688441229e-06, "loss": 0.4628, "num_input_tokens_seen": 108174592, "step": 88940 }, { "epoch": 11.1445934093472, "grad_norm": 0.20208775997161865, "learning_rate": 4.873941147634537e-06, "loss": 0.4621, "num_input_tokens_seen": 108180448, "step": 88945 }, { "epoch": 11.145219897255982, "grad_norm": 0.24544301629066467, "learning_rate": 4.873394608334984e-06, "loss": 0.4647, "num_input_tokens_seen": 108186656, "step": 88950 }, { "epoch": 11.145846385164766, "grad_norm": 0.12964850664138794, "learning_rate": 4.872848070549107e-06, "loss": 0.465, "num_input_tokens_seen": 108192896, "step": 88955 }, { "epoch": 11.14647287307355, "grad_norm": 0.17636145651340485, "learning_rate": 4.8723015342834365e-06, "loss": 0.4682, "num_input_tokens_seen": 108198880, "step": 88960 }, { "epoch": 11.147099360982333, "grad_norm": 0.26122915744781494, "learning_rate": 4.8717549995445105e-06, "loss": 0.4682, "num_input_tokens_seen": 108205184, "step": 88965 }, { "epoch": 11.147725848891117, "grad_norm": 0.16230140626430511, "learning_rate": 4.871208466338859e-06, "loss": 0.4488, "num_input_tokens_seen": 108211360, "step": 88970 }, { "epoch": 11.1483523367999, "grad_norm": 0.22230415046215057, "learning_rate": 4.8706619346730185e-06, "loss": 0.4647, "num_input_tokens_seen": 108216960, "step": 88975 }, { "epoch": 11.148978824708683, "grad_norm": 0.22359171509742737, "learning_rate": 4.870115404553525e-06, "loss": 0.4569, "num_input_tokens_seen": 108223552, "step": 88980 }, { "epoch": 11.149605312617467, "grad_norm": 0.15973691642284393, "learning_rate": 4.869568875986909e-06, "loss": 0.4519, "num_input_tokens_seen": 108229728, "step": 88985 }, { "epoch": 11.15023180052625, "grad_norm": 0.2795135974884033, "learning_rate": 4.8690223489797084e-06, "loss": 0.464, "num_input_tokens_seen": 108236096, "step": 88990 }, { "epoch": 11.150858288435034, "grad_norm": 0.17070817947387695, "learning_rate": 4.868475823538455e-06, "loss": 0.4634, "num_input_tokens_seen": 108242112, "step": 88995 }, { "epoch": 11.151484776343816, "grad_norm": 0.22053779661655426, "learning_rate": 4.867929299669685e-06, "loss": 0.4614, "num_input_tokens_seen": 108247968, "step": 89000 }, { "epoch": 11.1521112642526, "grad_norm": 0.24743176996707916, "learning_rate": 4.867382777379929e-06, "loss": 0.4563, "num_input_tokens_seen": 108253760, "step": 89005 }, { "epoch": 11.152737752161384, "grad_norm": 0.3359426259994507, "learning_rate": 4.866836256675726e-06, "loss": 0.4591, "num_input_tokens_seen": 108259968, "step": 89010 }, { "epoch": 11.153364240070166, "grad_norm": 0.1507871150970459, "learning_rate": 4.866289737563605e-06, "loss": 0.4664, "num_input_tokens_seen": 108266272, "step": 89015 }, { "epoch": 11.15399072797895, "grad_norm": 0.21883618831634521, "learning_rate": 4.865743220050106e-06, "loss": 0.4617, "num_input_tokens_seen": 108271776, "step": 89020 }, { "epoch": 11.154617215887733, "grad_norm": 0.17974713444709778, "learning_rate": 4.8651967041417555e-06, "loss": 0.462, "num_input_tokens_seen": 108277760, "step": 89025 }, { "epoch": 11.155243703796517, "grad_norm": 0.22278086841106415, "learning_rate": 4.864650189845094e-06, "loss": 0.4633, "num_input_tokens_seen": 108283712, "step": 89030 }, { "epoch": 11.1558701917053, "grad_norm": 0.2601187527179718, "learning_rate": 4.864103677166653e-06, "loss": 0.471, "num_input_tokens_seen": 108290016, "step": 89035 }, { "epoch": 11.156496679614083, "grad_norm": 0.2998979687690735, "learning_rate": 4.863557166112967e-06, "loss": 0.4636, "num_input_tokens_seen": 108296224, "step": 89040 }, { "epoch": 11.157123167522867, "grad_norm": 0.28629258275032043, "learning_rate": 4.8630106566905686e-06, "loss": 0.4601, "num_input_tokens_seen": 108302464, "step": 89045 }, { "epoch": 11.15774965543165, "grad_norm": 0.27252018451690674, "learning_rate": 4.862464148905993e-06, "loss": 0.4663, "num_input_tokens_seen": 108308032, "step": 89050 }, { "epoch": 11.158376143340433, "grad_norm": 0.22470563650131226, "learning_rate": 4.8619176427657765e-06, "loss": 0.4522, "num_input_tokens_seen": 108314176, "step": 89055 }, { "epoch": 11.159002631249217, "grad_norm": 0.2683351933956146, "learning_rate": 4.8613711382764485e-06, "loss": 0.4599, "num_input_tokens_seen": 108320160, "step": 89060 }, { "epoch": 11.159629119158, "grad_norm": 0.1701408326625824, "learning_rate": 4.860824635444547e-06, "loss": 0.4658, "num_input_tokens_seen": 108326400, "step": 89065 }, { "epoch": 11.160255607066784, "grad_norm": 0.19608722627162933, "learning_rate": 4.860278134276602e-06, "loss": 0.4623, "num_input_tokens_seen": 108332480, "step": 89070 }, { "epoch": 11.160882094975568, "grad_norm": 0.20430058240890503, "learning_rate": 4.859731634779151e-06, "loss": 0.4605, "num_input_tokens_seen": 108338848, "step": 89075 }, { "epoch": 11.16150858288435, "grad_norm": 0.1945401132106781, "learning_rate": 4.859185136958725e-06, "loss": 0.4661, "num_input_tokens_seen": 108345024, "step": 89080 }, { "epoch": 11.162135070793134, "grad_norm": 0.24073655903339386, "learning_rate": 4.858638640821859e-06, "loss": 0.4518, "num_input_tokens_seen": 108351072, "step": 89085 }, { "epoch": 11.162761558701916, "grad_norm": 0.1524612009525299, "learning_rate": 4.858092146375088e-06, "loss": 0.4701, "num_input_tokens_seen": 108357088, "step": 89090 }, { "epoch": 11.1633880466107, "grad_norm": 0.21906937658786774, "learning_rate": 4.857545653624944e-06, "loss": 0.4617, "num_input_tokens_seen": 108363264, "step": 89095 }, { "epoch": 11.164014534519485, "grad_norm": 0.2417116016149521, "learning_rate": 4.8569991625779615e-06, "loss": 0.4586, "num_input_tokens_seen": 108369536, "step": 89100 }, { "epoch": 11.164641022428267, "grad_norm": 0.31632885336875916, "learning_rate": 4.856452673240673e-06, "loss": 0.4637, "num_input_tokens_seen": 108375744, "step": 89105 }, { "epoch": 11.165267510337051, "grad_norm": 0.17601452767848969, "learning_rate": 4.855906185619616e-06, "loss": 0.4685, "num_input_tokens_seen": 108381760, "step": 89110 }, { "epoch": 11.165893998245833, "grad_norm": 0.2288486808538437, "learning_rate": 4.8553596997213194e-06, "loss": 0.4616, "num_input_tokens_seen": 108387744, "step": 89115 }, { "epoch": 11.166520486154617, "grad_norm": 0.2372293919324875, "learning_rate": 4.854813215552322e-06, "loss": 0.4636, "num_input_tokens_seen": 108393344, "step": 89120 }, { "epoch": 11.167146974063401, "grad_norm": 0.18091937899589539, "learning_rate": 4.854266733119152e-06, "loss": 0.4587, "num_input_tokens_seen": 108399488, "step": 89125 }, { "epoch": 11.167773461972184, "grad_norm": 0.21553201973438263, "learning_rate": 4.853720252428348e-06, "loss": 0.4646, "num_input_tokens_seen": 108405568, "step": 89130 }, { "epoch": 11.168399949880968, "grad_norm": 0.22990787029266357, "learning_rate": 4.853173773486439e-06, "loss": 0.4687, "num_input_tokens_seen": 108412128, "step": 89135 }, { "epoch": 11.16902643778975, "grad_norm": 0.19494789838790894, "learning_rate": 4.852627296299963e-06, "loss": 0.458, "num_input_tokens_seen": 108418368, "step": 89140 }, { "epoch": 11.169652925698534, "grad_norm": 0.16444070637226105, "learning_rate": 4.852080820875449e-06, "loss": 0.4708, "num_input_tokens_seen": 108424032, "step": 89145 }, { "epoch": 11.170279413607318, "grad_norm": 0.21294942498207092, "learning_rate": 4.851534347219437e-06, "loss": 0.4626, "num_input_tokens_seen": 108429824, "step": 89150 }, { "epoch": 11.1709059015161, "grad_norm": 0.15670013427734375, "learning_rate": 4.850987875338452e-06, "loss": 0.4658, "num_input_tokens_seen": 108436064, "step": 89155 }, { "epoch": 11.171532389424884, "grad_norm": 0.24250847101211548, "learning_rate": 4.850441405239037e-06, "loss": 0.4555, "num_input_tokens_seen": 108442080, "step": 89160 }, { "epoch": 11.172158877333667, "grad_norm": 0.2150191366672516, "learning_rate": 4.849894936927717e-06, "loss": 0.4627, "num_input_tokens_seen": 108448224, "step": 89165 }, { "epoch": 11.17278536524245, "grad_norm": 0.23382709920406342, "learning_rate": 4.84934847041103e-06, "loss": 0.4657, "num_input_tokens_seen": 108454688, "step": 89170 }, { "epoch": 11.173411853151235, "grad_norm": 0.15655583143234253, "learning_rate": 4.848802005695511e-06, "loss": 0.4587, "num_input_tokens_seen": 108460704, "step": 89175 }, { "epoch": 11.174038341060017, "grad_norm": 0.212485209107399, "learning_rate": 4.848255542787689e-06, "loss": 0.464, "num_input_tokens_seen": 108466976, "step": 89180 }, { "epoch": 11.174664828968801, "grad_norm": 0.20918165147304535, "learning_rate": 4.847709081694101e-06, "loss": 0.4663, "num_input_tokens_seen": 108473024, "step": 89185 }, { "epoch": 11.175291316877583, "grad_norm": 0.17635191977024078, "learning_rate": 4.847162622421277e-06, "loss": 0.4592, "num_input_tokens_seen": 108478688, "step": 89190 }, { "epoch": 11.175917804786367, "grad_norm": 0.2155439555644989, "learning_rate": 4.846616164975754e-06, "loss": 0.4472, "num_input_tokens_seen": 108484864, "step": 89195 }, { "epoch": 11.176544292695151, "grad_norm": 0.19721180200576782, "learning_rate": 4.846069709364062e-06, "loss": 0.4626, "num_input_tokens_seen": 108491136, "step": 89200 }, { "epoch": 11.177170780603934, "grad_norm": 0.1813219040632248, "learning_rate": 4.845523255592738e-06, "loss": 0.4574, "num_input_tokens_seen": 108497184, "step": 89205 }, { "epoch": 11.177797268512718, "grad_norm": 0.23191948235034943, "learning_rate": 4.844976803668312e-06, "loss": 0.4694, "num_input_tokens_seen": 108502912, "step": 89210 }, { "epoch": 11.178423756421502, "grad_norm": 0.19093482196331024, "learning_rate": 4.84443035359732e-06, "loss": 0.4559, "num_input_tokens_seen": 108509120, "step": 89215 }, { "epoch": 11.179050244330284, "grad_norm": 0.19630666077136993, "learning_rate": 4.843883905386293e-06, "loss": 0.4659, "num_input_tokens_seen": 108515552, "step": 89220 }, { "epoch": 11.179676732239068, "grad_norm": 0.18847091495990753, "learning_rate": 4.843337459041766e-06, "loss": 0.4663, "num_input_tokens_seen": 108521472, "step": 89225 }, { "epoch": 11.18030322014785, "grad_norm": 0.16796037554740906, "learning_rate": 4.84279101457027e-06, "loss": 0.4433, "num_input_tokens_seen": 108527296, "step": 89230 }, { "epoch": 11.180929708056635, "grad_norm": 0.203139066696167, "learning_rate": 4.842244571978339e-06, "loss": 0.4651, "num_input_tokens_seen": 108533280, "step": 89235 }, { "epoch": 11.181556195965419, "grad_norm": 0.1613260954618454, "learning_rate": 4.841698131272509e-06, "loss": 0.4503, "num_input_tokens_seen": 108539232, "step": 89240 }, { "epoch": 11.182182683874201, "grad_norm": 0.20485521852970123, "learning_rate": 4.8411516924593096e-06, "loss": 0.4618, "num_input_tokens_seen": 108545216, "step": 89245 }, { "epoch": 11.182809171782985, "grad_norm": 0.16437716782093048, "learning_rate": 4.840605255545276e-06, "loss": 0.4584, "num_input_tokens_seen": 108551392, "step": 89250 }, { "epoch": 11.183435659691767, "grad_norm": 0.2596847712993622, "learning_rate": 4.840058820536939e-06, "loss": 0.4618, "num_input_tokens_seen": 108557632, "step": 89255 }, { "epoch": 11.184062147600551, "grad_norm": 0.1798473596572876, "learning_rate": 4.839512387440836e-06, "loss": 0.456, "num_input_tokens_seen": 108562976, "step": 89260 }, { "epoch": 11.184688635509335, "grad_norm": 0.23267319798469543, "learning_rate": 4.8389659562634945e-06, "loss": 0.4637, "num_input_tokens_seen": 108569504, "step": 89265 }, { "epoch": 11.185315123418118, "grad_norm": 0.20783859491348267, "learning_rate": 4.838419527011453e-06, "loss": 0.4696, "num_input_tokens_seen": 108575552, "step": 89270 }, { "epoch": 11.185941611326902, "grad_norm": 0.19818265736103058, "learning_rate": 4.83787309969124e-06, "loss": 0.4588, "num_input_tokens_seen": 108581632, "step": 89275 }, { "epoch": 11.186568099235684, "grad_norm": 0.21430854499340057, "learning_rate": 4.837326674309393e-06, "loss": 0.4584, "num_input_tokens_seen": 108587904, "step": 89280 }, { "epoch": 11.187194587144468, "grad_norm": 0.2687492370605469, "learning_rate": 4.836780250872438e-06, "loss": 0.4649, "num_input_tokens_seen": 108593952, "step": 89285 }, { "epoch": 11.187821075053252, "grad_norm": 0.277275413274765, "learning_rate": 4.836233829386915e-06, "loss": 0.4695, "num_input_tokens_seen": 108600128, "step": 89290 }, { "epoch": 11.188447562962034, "grad_norm": 0.18584860861301422, "learning_rate": 4.8356874098593536e-06, "loss": 0.4545, "num_input_tokens_seen": 108606240, "step": 89295 }, { "epoch": 11.189074050870818, "grad_norm": 0.17480291426181793, "learning_rate": 4.835140992296288e-06, "loss": 0.4665, "num_input_tokens_seen": 108612256, "step": 89300 }, { "epoch": 11.1897005387796, "grad_norm": 0.1638294905424118, "learning_rate": 4.83459457670425e-06, "loss": 0.4655, "num_input_tokens_seen": 108618464, "step": 89305 }, { "epoch": 11.190327026688385, "grad_norm": 0.2245546281337738, "learning_rate": 4.834048163089772e-06, "loss": 0.4616, "num_input_tokens_seen": 108624736, "step": 89310 }, { "epoch": 11.190953514597169, "grad_norm": 0.17557384073734283, "learning_rate": 4.8335017514593905e-06, "loss": 0.4553, "num_input_tokens_seen": 108630976, "step": 89315 }, { "epoch": 11.191580002505951, "grad_norm": 0.16139645874500275, "learning_rate": 4.832955341819633e-06, "loss": 0.4527, "num_input_tokens_seen": 108637408, "step": 89320 }, { "epoch": 11.192206490414735, "grad_norm": 0.1592835634946823, "learning_rate": 4.832408934177037e-06, "loss": 0.4659, "num_input_tokens_seen": 108643712, "step": 89325 }, { "epoch": 11.19283297832352, "grad_norm": 0.1889115273952484, "learning_rate": 4.83186252853813e-06, "loss": 0.46, "num_input_tokens_seen": 108649632, "step": 89330 }, { "epoch": 11.193459466232301, "grad_norm": 0.13160887360572815, "learning_rate": 4.831316124909451e-06, "loss": 0.4751, "num_input_tokens_seen": 108655968, "step": 89335 }, { "epoch": 11.194085954141086, "grad_norm": 0.214432492852211, "learning_rate": 4.830769723297528e-06, "loss": 0.4638, "num_input_tokens_seen": 108662016, "step": 89340 }, { "epoch": 11.194712442049868, "grad_norm": 0.21881525218486786, "learning_rate": 4.830223323708895e-06, "loss": 0.4671, "num_input_tokens_seen": 108667968, "step": 89345 }, { "epoch": 11.195338929958652, "grad_norm": 0.1855309158563614, "learning_rate": 4.829676926150085e-06, "loss": 0.4658, "num_input_tokens_seen": 108674048, "step": 89350 }, { "epoch": 11.195965417867436, "grad_norm": 0.1538165658712387, "learning_rate": 4.829130530627631e-06, "loss": 0.4545, "num_input_tokens_seen": 108680224, "step": 89355 }, { "epoch": 11.196591905776218, "grad_norm": 0.15946009755134583, "learning_rate": 4.828584137148065e-06, "loss": 0.4604, "num_input_tokens_seen": 108686304, "step": 89360 }, { "epoch": 11.197218393685002, "grad_norm": 0.24485069513320923, "learning_rate": 4.828037745717919e-06, "loss": 0.4523, "num_input_tokens_seen": 108692032, "step": 89365 }, { "epoch": 11.197844881593785, "grad_norm": 0.2091725617647171, "learning_rate": 4.827491356343728e-06, "loss": 0.4596, "num_input_tokens_seen": 108698336, "step": 89370 }, { "epoch": 11.198471369502569, "grad_norm": 0.183692067861557, "learning_rate": 4.8269449690320215e-06, "loss": 0.4702, "num_input_tokens_seen": 108703904, "step": 89375 }, { "epoch": 11.199097857411353, "grad_norm": 0.18851988017559052, "learning_rate": 4.8263985837893345e-06, "loss": 0.4682, "num_input_tokens_seen": 108710080, "step": 89380 }, { "epoch": 11.199724345320135, "grad_norm": 0.18080413341522217, "learning_rate": 4.825852200622197e-06, "loss": 0.4631, "num_input_tokens_seen": 108716608, "step": 89385 }, { "epoch": 11.200350833228919, "grad_norm": 0.20279987156391144, "learning_rate": 4.825305819537145e-06, "loss": 0.4593, "num_input_tokens_seen": 108723008, "step": 89390 }, { "epoch": 11.200977321137701, "grad_norm": 0.15038977563381195, "learning_rate": 4.824759440540707e-06, "loss": 0.4657, "num_input_tokens_seen": 108728960, "step": 89395 }, { "epoch": 11.201603809046485, "grad_norm": 0.1892426759004593, "learning_rate": 4.8242130636394175e-06, "loss": 0.4598, "num_input_tokens_seen": 108735264, "step": 89400 }, { "epoch": 11.20223029695527, "grad_norm": 0.14704522490501404, "learning_rate": 4.823666688839807e-06, "loss": 0.4603, "num_input_tokens_seen": 108741664, "step": 89405 }, { "epoch": 11.202856784864052, "grad_norm": 0.16704505681991577, "learning_rate": 4.823120316148413e-06, "loss": 0.4585, "num_input_tokens_seen": 108747456, "step": 89410 }, { "epoch": 11.203483272772836, "grad_norm": 0.2107091099023819, "learning_rate": 4.822573945571762e-06, "loss": 0.4674, "num_input_tokens_seen": 108753376, "step": 89415 }, { "epoch": 11.204109760681618, "grad_norm": 0.1385660469532013, "learning_rate": 4.822027577116388e-06, "loss": 0.4527, "num_input_tokens_seen": 108759488, "step": 89420 }, { "epoch": 11.204736248590402, "grad_norm": 0.18397743999958038, "learning_rate": 4.821481210788826e-06, "loss": 0.471, "num_input_tokens_seen": 108765824, "step": 89425 }, { "epoch": 11.205362736499186, "grad_norm": 0.2611525058746338, "learning_rate": 4.820934846595605e-06, "loss": 0.4598, "num_input_tokens_seen": 108771808, "step": 89430 }, { "epoch": 11.205989224407968, "grad_norm": 0.12095817178487778, "learning_rate": 4.8203884845432604e-06, "loss": 0.4673, "num_input_tokens_seen": 108777984, "step": 89435 }, { "epoch": 11.206615712316752, "grad_norm": 0.20495325326919556, "learning_rate": 4.81984212463832e-06, "loss": 0.4615, "num_input_tokens_seen": 108784064, "step": 89440 }, { "epoch": 11.207242200225537, "grad_norm": 0.16242912411689758, "learning_rate": 4.819295766887319e-06, "loss": 0.4534, "num_input_tokens_seen": 108790176, "step": 89445 }, { "epoch": 11.207868688134319, "grad_norm": 0.26295873522758484, "learning_rate": 4.81874941129679e-06, "loss": 0.4584, "num_input_tokens_seen": 108796352, "step": 89450 }, { "epoch": 11.208495176043103, "grad_norm": 0.18120422959327698, "learning_rate": 4.818203057873263e-06, "loss": 0.463, "num_input_tokens_seen": 108802496, "step": 89455 }, { "epoch": 11.209121663951885, "grad_norm": 0.2108292430639267, "learning_rate": 4.817656706623271e-06, "loss": 0.4591, "num_input_tokens_seen": 108808736, "step": 89460 }, { "epoch": 11.20974815186067, "grad_norm": 0.19739654660224915, "learning_rate": 4.81711035755335e-06, "loss": 0.47, "num_input_tokens_seen": 108814688, "step": 89465 }, { "epoch": 11.210374639769453, "grad_norm": 0.20511554181575775, "learning_rate": 4.816564010670024e-06, "loss": 0.4634, "num_input_tokens_seen": 108820608, "step": 89470 }, { "epoch": 11.211001127678236, "grad_norm": 0.18595704436302185, "learning_rate": 4.816017665979833e-06, "loss": 0.4502, "num_input_tokens_seen": 108826624, "step": 89475 }, { "epoch": 11.21162761558702, "grad_norm": 0.20310619473457336, "learning_rate": 4.815471323489302e-06, "loss": 0.4542, "num_input_tokens_seen": 108832800, "step": 89480 }, { "epoch": 11.212254103495802, "grad_norm": 0.25146791338920593, "learning_rate": 4.8149249832049675e-06, "loss": 0.4665, "num_input_tokens_seen": 108839072, "step": 89485 }, { "epoch": 11.212880591404586, "grad_norm": 0.22229236364364624, "learning_rate": 4.814378645133362e-06, "loss": 0.4671, "num_input_tokens_seen": 108845312, "step": 89490 }, { "epoch": 11.21350707931337, "grad_norm": 0.3694763779640198, "learning_rate": 4.813832309281015e-06, "loss": 0.4649, "num_input_tokens_seen": 108851680, "step": 89495 }, { "epoch": 11.214133567222152, "grad_norm": 0.1777680367231369, "learning_rate": 4.813285975654459e-06, "loss": 0.4625, "num_input_tokens_seen": 108858016, "step": 89500 }, { "epoch": 11.214760055130936, "grad_norm": 0.20106413960456848, "learning_rate": 4.812739644260226e-06, "loss": 0.4554, "num_input_tokens_seen": 108863840, "step": 89505 }, { "epoch": 11.215386543039719, "grad_norm": 0.1634896695613861, "learning_rate": 4.812193315104849e-06, "loss": 0.4699, "num_input_tokens_seen": 108870304, "step": 89510 }, { "epoch": 11.216013030948503, "grad_norm": 0.19032764434814453, "learning_rate": 4.811646988194857e-06, "loss": 0.4518, "num_input_tokens_seen": 108876128, "step": 89515 }, { "epoch": 11.216639518857287, "grad_norm": 0.22312672436237335, "learning_rate": 4.811100663536786e-06, "loss": 0.4594, "num_input_tokens_seen": 108882208, "step": 89520 }, { "epoch": 11.217266006766069, "grad_norm": 0.2000269591808319, "learning_rate": 4.810554341137164e-06, "loss": 0.4597, "num_input_tokens_seen": 108888448, "step": 89525 }, { "epoch": 11.217892494674853, "grad_norm": 0.19989915192127228, "learning_rate": 4.810008021002525e-06, "loss": 0.4631, "num_input_tokens_seen": 108894144, "step": 89530 }, { "epoch": 11.218518982583635, "grad_norm": 0.19219258427619934, "learning_rate": 4.809461703139398e-06, "loss": 0.4575, "num_input_tokens_seen": 108900128, "step": 89535 }, { "epoch": 11.21914547049242, "grad_norm": 0.18713214993476868, "learning_rate": 4.808915387554317e-06, "loss": 0.4528, "num_input_tokens_seen": 108906272, "step": 89540 }, { "epoch": 11.219771958401203, "grad_norm": 0.18559131026268005, "learning_rate": 4.808369074253815e-06, "loss": 0.4669, "num_input_tokens_seen": 108912384, "step": 89545 }, { "epoch": 11.220398446309986, "grad_norm": 0.2721847891807556, "learning_rate": 4.8078227632444194e-06, "loss": 0.4535, "num_input_tokens_seen": 108918400, "step": 89550 }, { "epoch": 11.22102493421877, "grad_norm": 0.2757648229598999, "learning_rate": 4.807276454532666e-06, "loss": 0.4588, "num_input_tokens_seen": 108924512, "step": 89555 }, { "epoch": 11.221651422127552, "grad_norm": 0.24179619550704956, "learning_rate": 4.8067301481250845e-06, "loss": 0.4505, "num_input_tokens_seen": 108930816, "step": 89560 }, { "epoch": 11.222277910036336, "grad_norm": 0.24531792104244232, "learning_rate": 4.806183844028206e-06, "loss": 0.4517, "num_input_tokens_seen": 108937056, "step": 89565 }, { "epoch": 11.22290439794512, "grad_norm": 0.2965935170650482, "learning_rate": 4.805637542248562e-06, "loss": 0.4622, "num_input_tokens_seen": 108943424, "step": 89570 }, { "epoch": 11.223530885853902, "grad_norm": 0.8243038058280945, "learning_rate": 4.8050912427926875e-06, "loss": 0.4701, "num_input_tokens_seen": 108949760, "step": 89575 }, { "epoch": 11.224157373762687, "grad_norm": 0.21661913394927979, "learning_rate": 4.804544945667107e-06, "loss": 0.458, "num_input_tokens_seen": 108955808, "step": 89580 }, { "epoch": 11.22478386167147, "grad_norm": 0.23337745666503906, "learning_rate": 4.80399865087836e-06, "loss": 0.4632, "num_input_tokens_seen": 108962048, "step": 89585 }, { "epoch": 11.225410349580253, "grad_norm": 0.1968981772661209, "learning_rate": 4.80345235843297e-06, "loss": 0.473, "num_input_tokens_seen": 108968416, "step": 89590 }, { "epoch": 11.226036837489037, "grad_norm": 0.3133009970188141, "learning_rate": 4.8029060683374755e-06, "loss": 0.4766, "num_input_tokens_seen": 108974624, "step": 89595 }, { "epoch": 11.22666332539782, "grad_norm": 0.24354015290737152, "learning_rate": 4.802359780598403e-06, "loss": 0.4676, "num_input_tokens_seen": 108980896, "step": 89600 }, { "epoch": 11.227289813306603, "grad_norm": 0.3795890212059021, "learning_rate": 4.801813495222285e-06, "loss": 0.4656, "num_input_tokens_seen": 108986976, "step": 89605 }, { "epoch": 11.227916301215387, "grad_norm": 0.20283536612987518, "learning_rate": 4.801267212215655e-06, "loss": 0.4606, "num_input_tokens_seen": 108992992, "step": 89610 }, { "epoch": 11.22854278912417, "grad_norm": 0.2099754363298416, "learning_rate": 4.80072093158504e-06, "loss": 0.4609, "num_input_tokens_seen": 108999008, "step": 89615 }, { "epoch": 11.229169277032954, "grad_norm": 0.15157859027385712, "learning_rate": 4.800174653336977e-06, "loss": 0.4577, "num_input_tokens_seen": 109005408, "step": 89620 }, { "epoch": 11.229795764941736, "grad_norm": 0.1954386681318283, "learning_rate": 4.799628377477991e-06, "loss": 0.4497, "num_input_tokens_seen": 109011328, "step": 89625 }, { "epoch": 11.23042225285052, "grad_norm": 0.24458985030651093, "learning_rate": 4.79908210401462e-06, "loss": 0.4611, "num_input_tokens_seen": 109017344, "step": 89630 }, { "epoch": 11.231048740759304, "grad_norm": 0.22692802548408508, "learning_rate": 4.7985358329533875e-06, "loss": 0.4686, "num_input_tokens_seen": 109023840, "step": 89635 }, { "epoch": 11.231675228668086, "grad_norm": 0.22861531376838684, "learning_rate": 4.797989564300832e-06, "loss": 0.4493, "num_input_tokens_seen": 109029792, "step": 89640 }, { "epoch": 11.23230171657687, "grad_norm": 0.2120557725429535, "learning_rate": 4.797443298063478e-06, "loss": 0.4593, "num_input_tokens_seen": 109035840, "step": 89645 }, { "epoch": 11.232928204485653, "grad_norm": 0.20107236504554749, "learning_rate": 4.796897034247861e-06, "loss": 0.4651, "num_input_tokens_seen": 109041856, "step": 89650 }, { "epoch": 11.233554692394437, "grad_norm": 0.24229146540164948, "learning_rate": 4.796350772860511e-06, "loss": 0.4611, "num_input_tokens_seen": 109048288, "step": 89655 }, { "epoch": 11.23418118030322, "grad_norm": 0.21934795379638672, "learning_rate": 4.795804513907959e-06, "loss": 0.453, "num_input_tokens_seen": 109054528, "step": 89660 }, { "epoch": 11.234807668212003, "grad_norm": 0.2830384373664856, "learning_rate": 4.7952582573967334e-06, "loss": 0.4694, "num_input_tokens_seen": 109060512, "step": 89665 }, { "epoch": 11.235434156120787, "grad_norm": 0.1881248950958252, "learning_rate": 4.794712003333368e-06, "loss": 0.4595, "num_input_tokens_seen": 109066560, "step": 89670 }, { "epoch": 11.23606064402957, "grad_norm": 0.20208661258220673, "learning_rate": 4.794165751724396e-06, "loss": 0.4651, "num_input_tokens_seen": 109073056, "step": 89675 }, { "epoch": 11.236687131938353, "grad_norm": 0.2627469301223755, "learning_rate": 4.793619502576342e-06, "loss": 0.4616, "num_input_tokens_seen": 109079232, "step": 89680 }, { "epoch": 11.237313619847138, "grad_norm": 0.1830403357744217, "learning_rate": 4.793073255895744e-06, "loss": 0.4764, "num_input_tokens_seen": 109085504, "step": 89685 }, { "epoch": 11.23794010775592, "grad_norm": 0.26693299412727356, "learning_rate": 4.792527011689127e-06, "loss": 0.4673, "num_input_tokens_seen": 109091520, "step": 89690 }, { "epoch": 11.238566595664704, "grad_norm": 0.18148411810398102, "learning_rate": 4.791980769963025e-06, "loss": 0.4583, "num_input_tokens_seen": 109096832, "step": 89695 }, { "epoch": 11.239193083573488, "grad_norm": 0.2170320600271225, "learning_rate": 4.791434530723966e-06, "loss": 0.4605, "num_input_tokens_seen": 109102752, "step": 89700 }, { "epoch": 11.23981957148227, "grad_norm": 0.13505962491035461, "learning_rate": 4.790888293978484e-06, "loss": 0.463, "num_input_tokens_seen": 109108160, "step": 89705 }, { "epoch": 11.240446059391054, "grad_norm": 0.16818644106388092, "learning_rate": 4.790342059733109e-06, "loss": 0.4646, "num_input_tokens_seen": 109113856, "step": 89710 }, { "epoch": 11.241072547299837, "grad_norm": 0.14769737422466278, "learning_rate": 4.78979582799437e-06, "loss": 0.4585, "num_input_tokens_seen": 109120064, "step": 89715 }, { "epoch": 11.24169903520862, "grad_norm": 0.15736597776412964, "learning_rate": 4.789249598768797e-06, "loss": 0.46, "num_input_tokens_seen": 109125632, "step": 89720 }, { "epoch": 11.242325523117405, "grad_norm": 0.16678273677825928, "learning_rate": 4.788703372062923e-06, "loss": 0.4654, "num_input_tokens_seen": 109131680, "step": 89725 }, { "epoch": 11.242952011026187, "grad_norm": 0.19766220450401306, "learning_rate": 4.78815714788328e-06, "loss": 0.4675, "num_input_tokens_seen": 109137856, "step": 89730 }, { "epoch": 11.243578498934971, "grad_norm": 0.1647627055644989, "learning_rate": 4.787610926236393e-06, "loss": 0.4501, "num_input_tokens_seen": 109144000, "step": 89735 }, { "epoch": 11.244204986843753, "grad_norm": 0.24647463858127594, "learning_rate": 4.7870647071288e-06, "loss": 0.4656, "num_input_tokens_seen": 109150336, "step": 89740 }, { "epoch": 11.244831474752537, "grad_norm": 0.19709795713424683, "learning_rate": 4.786518490567024e-06, "loss": 0.4521, "num_input_tokens_seen": 109156384, "step": 89745 }, { "epoch": 11.245457962661321, "grad_norm": 0.18701888620853424, "learning_rate": 4.785972276557602e-06, "loss": 0.4667, "num_input_tokens_seen": 109162464, "step": 89750 }, { "epoch": 11.246084450570104, "grad_norm": 0.28695330023765564, "learning_rate": 4.785426065107059e-06, "loss": 0.4601, "num_input_tokens_seen": 109168736, "step": 89755 }, { "epoch": 11.246710938478888, "grad_norm": 0.22331659495830536, "learning_rate": 4.784879856221928e-06, "loss": 0.4646, "num_input_tokens_seen": 109174240, "step": 89760 }, { "epoch": 11.24733742638767, "grad_norm": 0.17860250174999237, "learning_rate": 4.784333649908739e-06, "loss": 0.46, "num_input_tokens_seen": 109180288, "step": 89765 }, { "epoch": 11.247963914296454, "grad_norm": 0.3139117658138275, "learning_rate": 4.783787446174025e-06, "loss": 0.4509, "num_input_tokens_seen": 109185984, "step": 89770 }, { "epoch": 11.248590402205238, "grad_norm": 0.20563654601573944, "learning_rate": 4.783241245024311e-06, "loss": 0.4555, "num_input_tokens_seen": 109191808, "step": 89775 }, { "epoch": 11.24921689011402, "grad_norm": 0.22080524265766144, "learning_rate": 4.782695046466132e-06, "loss": 0.4657, "num_input_tokens_seen": 109197824, "step": 89780 }, { "epoch": 11.249843378022804, "grad_norm": 0.22115854918956757, "learning_rate": 4.782148850506015e-06, "loss": 0.4479, "num_input_tokens_seen": 109203904, "step": 89785 }, { "epoch": 11.250469865931587, "grad_norm": 0.23454701900482178, "learning_rate": 4.781602657150491e-06, "loss": 0.4577, "num_input_tokens_seen": 109210048, "step": 89790 }, { "epoch": 11.25109635384037, "grad_norm": 0.22477419674396515, "learning_rate": 4.781056466406094e-06, "loss": 0.4528, "num_input_tokens_seen": 109215584, "step": 89795 }, { "epoch": 11.251722841749155, "grad_norm": 0.21175171434879303, "learning_rate": 4.780510278279348e-06, "loss": 0.4629, "num_input_tokens_seen": 109221376, "step": 89800 }, { "epoch": 11.252349329657937, "grad_norm": 0.2186558097600937, "learning_rate": 4.779964092776789e-06, "loss": 0.4518, "num_input_tokens_seen": 109227584, "step": 89805 }, { "epoch": 11.252975817566721, "grad_norm": 0.22631940245628357, "learning_rate": 4.779417909904941e-06, "loss": 0.4787, "num_input_tokens_seen": 109233824, "step": 89810 }, { "epoch": 11.253602305475503, "grad_norm": 0.357076495885849, "learning_rate": 4.77887172967034e-06, "loss": 0.4627, "num_input_tokens_seen": 109239968, "step": 89815 }, { "epoch": 11.254228793384287, "grad_norm": 0.30864351987838745, "learning_rate": 4.778325552079511e-06, "loss": 0.4593, "num_input_tokens_seen": 109246336, "step": 89820 }, { "epoch": 11.254855281293072, "grad_norm": 0.2565629184246063, "learning_rate": 4.777779377138989e-06, "loss": 0.4599, "num_input_tokens_seen": 109252544, "step": 89825 }, { "epoch": 11.255481769201854, "grad_norm": 0.18867707252502441, "learning_rate": 4.777233204855299e-06, "loss": 0.4624, "num_input_tokens_seen": 109258432, "step": 89830 }, { "epoch": 11.256108257110638, "grad_norm": 0.2516864538192749, "learning_rate": 4.776687035234976e-06, "loss": 0.4496, "num_input_tokens_seen": 109264320, "step": 89835 }, { "epoch": 11.256734745019422, "grad_norm": 0.25251802802085876, "learning_rate": 4.776140868284544e-06, "loss": 0.4584, "num_input_tokens_seen": 109270176, "step": 89840 }, { "epoch": 11.257361232928204, "grad_norm": 0.2777327597141266, "learning_rate": 4.7755947040105395e-06, "loss": 0.463, "num_input_tokens_seen": 109276192, "step": 89845 }, { "epoch": 11.257987720836988, "grad_norm": 0.21710087358951569, "learning_rate": 4.7750485424194864e-06, "loss": 0.4613, "num_input_tokens_seen": 109282144, "step": 89850 }, { "epoch": 11.25861420874577, "grad_norm": 0.165689155459404, "learning_rate": 4.774502383517917e-06, "loss": 0.4708, "num_input_tokens_seen": 109287968, "step": 89855 }, { "epoch": 11.259240696654555, "grad_norm": 0.16998861730098724, "learning_rate": 4.773956227312364e-06, "loss": 0.4601, "num_input_tokens_seen": 109293696, "step": 89860 }, { "epoch": 11.259867184563339, "grad_norm": 0.2514317035675049, "learning_rate": 4.773410073809351e-06, "loss": 0.464, "num_input_tokens_seen": 109299232, "step": 89865 }, { "epoch": 11.260493672472121, "grad_norm": 0.22294023633003235, "learning_rate": 4.772863923015413e-06, "loss": 0.459, "num_input_tokens_seen": 109305312, "step": 89870 }, { "epoch": 11.261120160380905, "grad_norm": 0.19658216834068298, "learning_rate": 4.772317774937076e-06, "loss": 0.4564, "num_input_tokens_seen": 109311264, "step": 89875 }, { "epoch": 11.261746648289687, "grad_norm": 0.21678894758224487, "learning_rate": 4.771771629580875e-06, "loss": 0.4575, "num_input_tokens_seen": 109317632, "step": 89880 }, { "epoch": 11.262373136198471, "grad_norm": 0.28043290972709656, "learning_rate": 4.771225486953333e-06, "loss": 0.4663, "num_input_tokens_seen": 109323776, "step": 89885 }, { "epoch": 11.262999624107255, "grad_norm": 0.24549096822738647, "learning_rate": 4.770679347060984e-06, "loss": 0.4627, "num_input_tokens_seen": 109329920, "step": 89890 }, { "epoch": 11.263626112016038, "grad_norm": 0.24992530047893524, "learning_rate": 4.7701332099103545e-06, "loss": 0.4559, "num_input_tokens_seen": 109335936, "step": 89895 }, { "epoch": 11.264252599924822, "grad_norm": 0.3056487441062927, "learning_rate": 4.769587075507979e-06, "loss": 0.4605, "num_input_tokens_seen": 109342016, "step": 89900 }, { "epoch": 11.264879087833604, "grad_norm": 0.32038694620132446, "learning_rate": 4.76904094386038e-06, "loss": 0.4683, "num_input_tokens_seen": 109348064, "step": 89905 }, { "epoch": 11.265505575742388, "grad_norm": 0.19517208635807037, "learning_rate": 4.7684948149740915e-06, "loss": 0.4704, "num_input_tokens_seen": 109353920, "step": 89910 }, { "epoch": 11.266132063651172, "grad_norm": 0.22601072490215302, "learning_rate": 4.767948688855644e-06, "loss": 0.4515, "num_input_tokens_seen": 109359840, "step": 89915 }, { "epoch": 11.266758551559954, "grad_norm": 0.3620278537273407, "learning_rate": 4.767402565511563e-06, "loss": 0.4611, "num_input_tokens_seen": 109366176, "step": 89920 }, { "epoch": 11.267385039468738, "grad_norm": 0.20462770760059357, "learning_rate": 4.766856444948382e-06, "loss": 0.4612, "num_input_tokens_seen": 109372320, "step": 89925 }, { "epoch": 11.26801152737752, "grad_norm": 0.22968091070652008, "learning_rate": 4.766310327172626e-06, "loss": 0.4666, "num_input_tokens_seen": 109378560, "step": 89930 }, { "epoch": 11.268638015286305, "grad_norm": 0.20419685542583466, "learning_rate": 4.7657642121908284e-06, "loss": 0.4595, "num_input_tokens_seen": 109384736, "step": 89935 }, { "epoch": 11.269264503195089, "grad_norm": 0.23983870446681976, "learning_rate": 4.765218100009515e-06, "loss": 0.4472, "num_input_tokens_seen": 109390496, "step": 89940 }, { "epoch": 11.269890991103871, "grad_norm": 0.30319514870643616, "learning_rate": 4.764671990635219e-06, "loss": 0.4577, "num_input_tokens_seen": 109396160, "step": 89945 }, { "epoch": 11.270517479012655, "grad_norm": 0.34999966621398926, "learning_rate": 4.7641258840744645e-06, "loss": 0.4715, "num_input_tokens_seen": 109401824, "step": 89950 }, { "epoch": 11.27114396692144, "grad_norm": 0.2798127830028534, "learning_rate": 4.763579780333786e-06, "loss": 0.4526, "num_input_tokens_seen": 109407808, "step": 89955 }, { "epoch": 11.271770454830222, "grad_norm": 0.25478625297546387, "learning_rate": 4.763033679419707e-06, "loss": 0.4545, "num_input_tokens_seen": 109413920, "step": 89960 }, { "epoch": 11.272396942739006, "grad_norm": 0.3087370693683624, "learning_rate": 4.762487581338762e-06, "loss": 0.4624, "num_input_tokens_seen": 109420512, "step": 89965 }, { "epoch": 11.273023430647788, "grad_norm": 0.31116795539855957, "learning_rate": 4.761941486097476e-06, "loss": 0.4646, "num_input_tokens_seen": 109426880, "step": 89970 }, { "epoch": 11.273649918556572, "grad_norm": 0.33382198214530945, "learning_rate": 4.7613953937023804e-06, "loss": 0.4512, "num_input_tokens_seen": 109432320, "step": 89975 }, { "epoch": 11.274276406465356, "grad_norm": 0.33700796961784363, "learning_rate": 4.760849304160003e-06, "loss": 0.4606, "num_input_tokens_seen": 109437760, "step": 89980 }, { "epoch": 11.274902894374138, "grad_norm": 0.20202258229255676, "learning_rate": 4.760303217476873e-06, "loss": 0.4649, "num_input_tokens_seen": 109443904, "step": 89985 }, { "epoch": 11.275529382282922, "grad_norm": 0.20184943079948425, "learning_rate": 4.759757133659522e-06, "loss": 0.459, "num_input_tokens_seen": 109449920, "step": 89990 }, { "epoch": 11.276155870191705, "grad_norm": 0.20565126836299896, "learning_rate": 4.759211052714474e-06, "loss": 0.4676, "num_input_tokens_seen": 109456288, "step": 89995 }, { "epoch": 11.276782358100489, "grad_norm": 0.25823235511779785, "learning_rate": 4.758664974648262e-06, "loss": 0.4544, "num_input_tokens_seen": 109462016, "step": 90000 }, { "epoch": 11.277408846009273, "grad_norm": 0.2377539426088333, "learning_rate": 4.758118899467412e-06, "loss": 0.4602, "num_input_tokens_seen": 109468256, "step": 90005 }, { "epoch": 11.278035333918055, "grad_norm": 0.217448890209198, "learning_rate": 4.757572827178455e-06, "loss": 0.4598, "num_input_tokens_seen": 109474464, "step": 90010 }, { "epoch": 11.278661821826839, "grad_norm": 0.3076981008052826, "learning_rate": 4.757026757787917e-06, "loss": 0.457, "num_input_tokens_seen": 109480544, "step": 90015 }, { "epoch": 11.279288309735621, "grad_norm": 0.22792194783687592, "learning_rate": 4.7564806913023286e-06, "loss": 0.4519, "num_input_tokens_seen": 109486144, "step": 90020 }, { "epoch": 11.279914797644405, "grad_norm": 0.23495276272296906, "learning_rate": 4.755934627728218e-06, "loss": 0.4515, "num_input_tokens_seen": 109492256, "step": 90025 }, { "epoch": 11.28054128555319, "grad_norm": 1.2585840225219727, "learning_rate": 4.755388567072116e-06, "loss": 0.4663, "num_input_tokens_seen": 109498528, "step": 90030 }, { "epoch": 11.281167773461972, "grad_norm": 0.24439749121665955, "learning_rate": 4.7548425093405485e-06, "loss": 0.4608, "num_input_tokens_seen": 109504384, "step": 90035 }, { "epoch": 11.281794261370756, "grad_norm": 0.44990241527557373, "learning_rate": 4.754296454540043e-06, "loss": 0.4715, "num_input_tokens_seen": 109510048, "step": 90040 }, { "epoch": 11.282420749279538, "grad_norm": 0.267238974571228, "learning_rate": 4.753750402677133e-06, "loss": 0.4563, "num_input_tokens_seen": 109516224, "step": 90045 }, { "epoch": 11.283047237188322, "grad_norm": 0.41076165437698364, "learning_rate": 4.7532043537583415e-06, "loss": 0.4575, "num_input_tokens_seen": 109521984, "step": 90050 }, { "epoch": 11.283673725097106, "grad_norm": 0.8637782335281372, "learning_rate": 4.752658307790202e-06, "loss": 0.4762, "num_input_tokens_seen": 109528288, "step": 90055 }, { "epoch": 11.284300213005888, "grad_norm": 0.32926997542381287, "learning_rate": 4.752112264779238e-06, "loss": 0.4636, "num_input_tokens_seen": 109533728, "step": 90060 }, { "epoch": 11.284926700914673, "grad_norm": 0.21998518705368042, "learning_rate": 4.751566224731982e-06, "loss": 0.4629, "num_input_tokens_seen": 109539552, "step": 90065 }, { "epoch": 11.285553188823457, "grad_norm": 0.2410230189561844, "learning_rate": 4.75102018765496e-06, "loss": 0.454, "num_input_tokens_seen": 109545472, "step": 90070 }, { "epoch": 11.286179676732239, "grad_norm": 0.3109819293022156, "learning_rate": 4.750474153554701e-06, "loss": 0.4634, "num_input_tokens_seen": 109551840, "step": 90075 }, { "epoch": 11.286806164641023, "grad_norm": 0.24724924564361572, "learning_rate": 4.7499281224377324e-06, "loss": 0.4668, "num_input_tokens_seen": 109558048, "step": 90080 }, { "epoch": 11.287432652549805, "grad_norm": 0.29128190875053406, "learning_rate": 4.749382094310585e-06, "loss": 0.4519, "num_input_tokens_seen": 109563552, "step": 90085 }, { "epoch": 11.28805914045859, "grad_norm": 0.40921059250831604, "learning_rate": 4.748836069179784e-06, "loss": 0.4742, "num_input_tokens_seen": 109569792, "step": 90090 }, { "epoch": 11.288685628367373, "grad_norm": 0.31378665566444397, "learning_rate": 4.748290047051859e-06, "loss": 0.4579, "num_input_tokens_seen": 109575808, "step": 90095 }, { "epoch": 11.289312116276156, "grad_norm": 0.239209845662117, "learning_rate": 4.747744027933339e-06, "loss": 0.4613, "num_input_tokens_seen": 109581984, "step": 90100 }, { "epoch": 11.28993860418494, "grad_norm": 0.21627594530582428, "learning_rate": 4.74719801183075e-06, "loss": 0.4516, "num_input_tokens_seen": 109588256, "step": 90105 }, { "epoch": 11.290565092093722, "grad_norm": 0.13997326791286469, "learning_rate": 4.746651998750624e-06, "loss": 0.4671, "num_input_tokens_seen": 109594240, "step": 90110 }, { "epoch": 11.291191580002506, "grad_norm": 0.256585031747818, "learning_rate": 4.7461059886994835e-06, "loss": 0.4536, "num_input_tokens_seen": 109600224, "step": 90115 }, { "epoch": 11.29181806791129, "grad_norm": 0.3109560012817383, "learning_rate": 4.745559981683861e-06, "loss": 0.4497, "num_input_tokens_seen": 109606144, "step": 90120 }, { "epoch": 11.292444555820072, "grad_norm": 0.3418456017971039, "learning_rate": 4.745013977710282e-06, "loss": 0.4512, "num_input_tokens_seen": 109612640, "step": 90125 }, { "epoch": 11.293071043728856, "grad_norm": 0.29546889662742615, "learning_rate": 4.744467976785275e-06, "loss": 0.4467, "num_input_tokens_seen": 109618848, "step": 90130 }, { "epoch": 11.293697531637639, "grad_norm": 0.438502699136734, "learning_rate": 4.743921978915368e-06, "loss": 0.4641, "num_input_tokens_seen": 109624928, "step": 90135 }, { "epoch": 11.294324019546423, "grad_norm": 0.30788785219192505, "learning_rate": 4.743375984107091e-06, "loss": 0.4654, "num_input_tokens_seen": 109630880, "step": 90140 }, { "epoch": 11.294950507455207, "grad_norm": 0.28880539536476135, "learning_rate": 4.742829992366968e-06, "loss": 0.4594, "num_input_tokens_seen": 109636992, "step": 90145 }, { "epoch": 11.295576995363989, "grad_norm": 0.30816999077796936, "learning_rate": 4.74228400370153e-06, "loss": 0.4609, "num_input_tokens_seen": 109642528, "step": 90150 }, { "epoch": 11.296203483272773, "grad_norm": 0.24271084368228912, "learning_rate": 4.741738018117302e-06, "loss": 0.4824, "num_input_tokens_seen": 109648640, "step": 90155 }, { "epoch": 11.296829971181555, "grad_norm": 0.18994183838367462, "learning_rate": 4.741192035620813e-06, "loss": 0.4681, "num_input_tokens_seen": 109654528, "step": 90160 }, { "epoch": 11.29745645909034, "grad_norm": 0.22855108976364136, "learning_rate": 4.740646056218592e-06, "loss": 0.4425, "num_input_tokens_seen": 109660512, "step": 90165 }, { "epoch": 11.298082946999124, "grad_norm": 0.23544105887413025, "learning_rate": 4.7401000799171644e-06, "loss": 0.4679, "num_input_tokens_seen": 109666848, "step": 90170 }, { "epoch": 11.298709434907906, "grad_norm": 0.217116117477417, "learning_rate": 4.73955410672306e-06, "loss": 0.459, "num_input_tokens_seen": 109672864, "step": 90175 }, { "epoch": 11.29933592281669, "grad_norm": 0.5909437537193298, "learning_rate": 4.739008136642804e-06, "loss": 0.4469, "num_input_tokens_seen": 109678560, "step": 90180 }, { "epoch": 11.299962410725472, "grad_norm": 0.24580368399620056, "learning_rate": 4.738462169682927e-06, "loss": 0.4557, "num_input_tokens_seen": 109684576, "step": 90185 }, { "epoch": 11.300588898634256, "grad_norm": 0.23340095579624176, "learning_rate": 4.737916205849952e-06, "loss": 0.4656, "num_input_tokens_seen": 109690528, "step": 90190 }, { "epoch": 11.30121538654304, "grad_norm": 0.25587815046310425, "learning_rate": 4.737370245150412e-06, "loss": 0.47, "num_input_tokens_seen": 109696736, "step": 90195 }, { "epoch": 11.301841874451823, "grad_norm": 0.24643747508525848, "learning_rate": 4.736824287590829e-06, "loss": 0.4742, "num_input_tokens_seen": 109702624, "step": 90200 }, { "epoch": 11.302468362360607, "grad_norm": 0.29883041977882385, "learning_rate": 4.736278333177735e-06, "loss": 0.4616, "num_input_tokens_seen": 109708832, "step": 90205 }, { "epoch": 11.30309485026939, "grad_norm": 0.21979528665542603, "learning_rate": 4.735732381917654e-06, "loss": 0.4726, "num_input_tokens_seen": 109714976, "step": 90210 }, { "epoch": 11.303721338178173, "grad_norm": 0.2841602563858032, "learning_rate": 4.735186433817115e-06, "loss": 0.4518, "num_input_tokens_seen": 109721408, "step": 90215 }, { "epoch": 11.304347826086957, "grad_norm": 0.21635103225708008, "learning_rate": 4.7346404888826455e-06, "loss": 0.4653, "num_input_tokens_seen": 109727328, "step": 90220 }, { "epoch": 11.30497431399574, "grad_norm": 0.19113664329051971, "learning_rate": 4.73409454712077e-06, "loss": 0.4554, "num_input_tokens_seen": 109733504, "step": 90225 }, { "epoch": 11.305600801904523, "grad_norm": 0.370755672454834, "learning_rate": 4.733548608538019e-06, "loss": 0.4576, "num_input_tokens_seen": 109739552, "step": 90230 }, { "epoch": 11.306227289813307, "grad_norm": 0.4296579658985138, "learning_rate": 4.733002673140918e-06, "loss": 0.4607, "num_input_tokens_seen": 109745536, "step": 90235 }, { "epoch": 11.30685377772209, "grad_norm": 0.33550819754600525, "learning_rate": 4.732456740935995e-06, "loss": 0.4717, "num_input_tokens_seen": 109751232, "step": 90240 }, { "epoch": 11.307480265630874, "grad_norm": 0.30394411087036133, "learning_rate": 4.731910811929776e-06, "loss": 0.4581, "num_input_tokens_seen": 109757184, "step": 90245 }, { "epoch": 11.308106753539656, "grad_norm": 0.22792109847068787, "learning_rate": 4.7313648861287905e-06, "loss": 0.4607, "num_input_tokens_seen": 109762944, "step": 90250 }, { "epoch": 11.30873324144844, "grad_norm": 0.35870394110679626, "learning_rate": 4.730818963539562e-06, "loss": 0.472, "num_input_tokens_seen": 109769120, "step": 90255 }, { "epoch": 11.309359729357224, "grad_norm": 1.7859392166137695, "learning_rate": 4.730273044168621e-06, "loss": 0.4839, "num_input_tokens_seen": 109775264, "step": 90260 }, { "epoch": 11.309986217266006, "grad_norm": 0.22356830537319183, "learning_rate": 4.7297271280224895e-06, "loss": 0.4629, "num_input_tokens_seen": 109781280, "step": 90265 }, { "epoch": 11.31061270517479, "grad_norm": 0.2900944650173187, "learning_rate": 4.729181215107699e-06, "loss": 0.457, "num_input_tokens_seen": 109787776, "step": 90270 }, { "epoch": 11.311239193083573, "grad_norm": 0.22338980436325073, "learning_rate": 4.728635305430775e-06, "loss": 0.4574, "num_input_tokens_seen": 109793664, "step": 90275 }, { "epoch": 11.311865680992357, "grad_norm": 0.3064157962799072, "learning_rate": 4.728089398998242e-06, "loss": 0.4592, "num_input_tokens_seen": 109799712, "step": 90280 }, { "epoch": 11.31249216890114, "grad_norm": 0.41109180450439453, "learning_rate": 4.727543495816631e-06, "loss": 0.4451, "num_input_tokens_seen": 109806016, "step": 90285 }, { "epoch": 11.313118656809923, "grad_norm": 0.17981141805648804, "learning_rate": 4.726997595892465e-06, "loss": 0.4687, "num_input_tokens_seen": 109811904, "step": 90290 }, { "epoch": 11.313745144718707, "grad_norm": 0.33345744013786316, "learning_rate": 4.726451699232275e-06, "loss": 0.4753, "num_input_tokens_seen": 109817888, "step": 90295 }, { "epoch": 11.31437163262749, "grad_norm": 0.6747409105300903, "learning_rate": 4.725905805842582e-06, "loss": 0.4629, "num_input_tokens_seen": 109823552, "step": 90300 }, { "epoch": 11.314998120536274, "grad_norm": 0.24952998757362366, "learning_rate": 4.725359915729919e-06, "loss": 0.4629, "num_input_tokens_seen": 109830048, "step": 90305 }, { "epoch": 11.315624608445058, "grad_norm": 0.26553875207901, "learning_rate": 4.724814028900806e-06, "loss": 0.4634, "num_input_tokens_seen": 109836640, "step": 90310 }, { "epoch": 11.31625109635384, "grad_norm": 0.18052789568901062, "learning_rate": 4.724268145361774e-06, "loss": 0.4593, "num_input_tokens_seen": 109842368, "step": 90315 }, { "epoch": 11.316877584262624, "grad_norm": 0.2774885594844818, "learning_rate": 4.723722265119348e-06, "loss": 0.4569, "num_input_tokens_seen": 109848736, "step": 90320 }, { "epoch": 11.317504072171406, "grad_norm": 0.29050084948539734, "learning_rate": 4.723176388180054e-06, "loss": 0.4665, "num_input_tokens_seen": 109854176, "step": 90325 }, { "epoch": 11.31813056008019, "grad_norm": 0.23761844635009766, "learning_rate": 4.722630514550418e-06, "loss": 0.453, "num_input_tokens_seen": 109859584, "step": 90330 }, { "epoch": 11.318757047988974, "grad_norm": 0.3772510588169098, "learning_rate": 4.722084644236971e-06, "loss": 0.4471, "num_input_tokens_seen": 109865760, "step": 90335 }, { "epoch": 11.319383535897757, "grad_norm": 0.24041275680065155, "learning_rate": 4.721538777246232e-06, "loss": 0.4571, "num_input_tokens_seen": 109871072, "step": 90340 }, { "epoch": 11.32001002380654, "grad_norm": 0.24563926458358765, "learning_rate": 4.720992913584732e-06, "loss": 0.4659, "num_input_tokens_seen": 109877440, "step": 90345 }, { "epoch": 11.320636511715325, "grad_norm": 0.2835344970226288, "learning_rate": 4.720447053258997e-06, "loss": 0.4604, "num_input_tokens_seen": 109883712, "step": 90350 }, { "epoch": 11.321262999624107, "grad_norm": 0.43984436988830566, "learning_rate": 4.719901196275552e-06, "loss": 0.4657, "num_input_tokens_seen": 109889952, "step": 90355 }, { "epoch": 11.321889487532891, "grad_norm": 0.5922603607177734, "learning_rate": 4.719355342640925e-06, "loss": 0.4908, "num_input_tokens_seen": 109896096, "step": 90360 }, { "epoch": 11.322515975441673, "grad_norm": 0.4680440425872803, "learning_rate": 4.71880949236164e-06, "loss": 0.4652, "num_input_tokens_seen": 109902432, "step": 90365 }, { "epoch": 11.323142463350457, "grad_norm": 0.1836836040019989, "learning_rate": 4.718263645444224e-06, "loss": 0.4594, "num_input_tokens_seen": 109909024, "step": 90370 }, { "epoch": 11.323768951259241, "grad_norm": 0.41257724165916443, "learning_rate": 4.717717801895202e-06, "loss": 0.4686, "num_input_tokens_seen": 109914784, "step": 90375 }, { "epoch": 11.324395439168024, "grad_norm": 0.7414113283157349, "learning_rate": 4.717171961721103e-06, "loss": 0.4602, "num_input_tokens_seen": 109920896, "step": 90380 }, { "epoch": 11.325021927076808, "grad_norm": 0.3072228729724884, "learning_rate": 4.716626124928448e-06, "loss": 0.464, "num_input_tokens_seen": 109927008, "step": 90385 }, { "epoch": 11.32564841498559, "grad_norm": 0.33137789368629456, "learning_rate": 4.716080291523769e-06, "loss": 0.4683, "num_input_tokens_seen": 109933216, "step": 90390 }, { "epoch": 11.326274902894374, "grad_norm": 0.38833919167518616, "learning_rate": 4.715534461513587e-06, "loss": 0.4684, "num_input_tokens_seen": 109939360, "step": 90395 }, { "epoch": 11.326901390803158, "grad_norm": 0.4983220100402832, "learning_rate": 4.714988634904432e-06, "loss": 0.4757, "num_input_tokens_seen": 109945408, "step": 90400 }, { "epoch": 11.32752787871194, "grad_norm": 0.3517995774745941, "learning_rate": 4.714442811702824e-06, "loss": 0.4585, "num_input_tokens_seen": 109951648, "step": 90405 }, { "epoch": 11.328154366620725, "grad_norm": 0.24872693419456482, "learning_rate": 4.713896991915293e-06, "loss": 0.4608, "num_input_tokens_seen": 109957600, "step": 90410 }, { "epoch": 11.328780854529507, "grad_norm": 0.29212266206741333, "learning_rate": 4.713351175548367e-06, "loss": 0.4558, "num_input_tokens_seen": 109963968, "step": 90415 }, { "epoch": 11.32940734243829, "grad_norm": 0.33640486001968384, "learning_rate": 4.712805362608565e-06, "loss": 0.4583, "num_input_tokens_seen": 109970368, "step": 90420 }, { "epoch": 11.330033830347075, "grad_norm": 0.3332870900630951, "learning_rate": 4.71225955310242e-06, "loss": 0.4578, "num_input_tokens_seen": 109976480, "step": 90425 }, { "epoch": 11.330660318255857, "grad_norm": 0.35187631845474243, "learning_rate": 4.7117137470364505e-06, "loss": 0.4464, "num_input_tokens_seen": 109982656, "step": 90430 }, { "epoch": 11.331286806164641, "grad_norm": 0.3390429615974426, "learning_rate": 4.711167944417187e-06, "loss": 0.4571, "num_input_tokens_seen": 109988672, "step": 90435 }, { "epoch": 11.331913294073424, "grad_norm": 0.46228328347206116, "learning_rate": 4.710622145251153e-06, "loss": 0.4515, "num_input_tokens_seen": 109994784, "step": 90440 }, { "epoch": 11.332539781982208, "grad_norm": 0.2735721170902252, "learning_rate": 4.7100763495448765e-06, "loss": 0.4716, "num_input_tokens_seen": 110000768, "step": 90445 }, { "epoch": 11.333166269890992, "grad_norm": 0.4193800985813141, "learning_rate": 4.709530557304879e-06, "loss": 0.4635, "num_input_tokens_seen": 110007040, "step": 90450 }, { "epoch": 11.333792757799774, "grad_norm": 0.24514679610729218, "learning_rate": 4.70898476853769e-06, "loss": 0.4585, "num_input_tokens_seen": 110012960, "step": 90455 }, { "epoch": 11.334419245708558, "grad_norm": 0.4772951900959015, "learning_rate": 4.708438983249829e-06, "loss": 0.4541, "num_input_tokens_seen": 110018048, "step": 90460 }, { "epoch": 11.335045733617342, "grad_norm": 0.4333924651145935, "learning_rate": 4.707893201447829e-06, "loss": 0.4604, "num_input_tokens_seen": 110023776, "step": 90465 }, { "epoch": 11.335672221526124, "grad_norm": 0.57169508934021, "learning_rate": 4.707347423138209e-06, "loss": 0.4462, "num_input_tokens_seen": 110030016, "step": 90470 }, { "epoch": 11.336298709434908, "grad_norm": 0.41326454281806946, "learning_rate": 4.7068016483274964e-06, "loss": 0.4552, "num_input_tokens_seen": 110035904, "step": 90475 }, { "epoch": 11.33692519734369, "grad_norm": 0.6497308611869812, "learning_rate": 4.706255877022219e-06, "loss": 0.4645, "num_input_tokens_seen": 110042144, "step": 90480 }, { "epoch": 11.337551685252475, "grad_norm": 7.495809078216553, "learning_rate": 4.705710109228897e-06, "loss": 0.4971, "num_input_tokens_seen": 110048256, "step": 90485 }, { "epoch": 11.338178173161259, "grad_norm": 0.5112162828445435, "learning_rate": 4.7051643449540595e-06, "loss": 0.4571, "num_input_tokens_seen": 110054176, "step": 90490 }, { "epoch": 11.338804661070041, "grad_norm": 0.4078983962535858, "learning_rate": 4.704618584204229e-06, "loss": 0.4607, "num_input_tokens_seen": 110060480, "step": 90495 }, { "epoch": 11.339431148978825, "grad_norm": 0.4398512542247772, "learning_rate": 4.7040728269859335e-06, "loss": 0.4575, "num_input_tokens_seen": 110066176, "step": 90500 }, { "epoch": 11.340057636887607, "grad_norm": 0.3743993937969208, "learning_rate": 4.703527073305694e-06, "loss": 0.4727, "num_input_tokens_seen": 110072480, "step": 90505 }, { "epoch": 11.340684124796391, "grad_norm": 0.5840917825698853, "learning_rate": 4.702981323170041e-06, "loss": 0.4628, "num_input_tokens_seen": 110078656, "step": 90510 }, { "epoch": 11.341310612705175, "grad_norm": 0.4083689749240875, "learning_rate": 4.702435576585492e-06, "loss": 0.4574, "num_input_tokens_seen": 110084448, "step": 90515 }, { "epoch": 11.341937100613958, "grad_norm": 0.501633882522583, "learning_rate": 4.70188983355858e-06, "loss": 0.4649, "num_input_tokens_seen": 110090464, "step": 90520 }, { "epoch": 11.342563588522742, "grad_norm": 0.3030056357383728, "learning_rate": 4.701344094095822e-06, "loss": 0.4601, "num_input_tokens_seen": 110096608, "step": 90525 }, { "epoch": 11.343190076431524, "grad_norm": 0.33045268058776855, "learning_rate": 4.700798358203746e-06, "loss": 0.458, "num_input_tokens_seen": 110102752, "step": 90530 }, { "epoch": 11.343816564340308, "grad_norm": 0.3166755735874176, "learning_rate": 4.700252625888879e-06, "loss": 0.4486, "num_input_tokens_seen": 110108768, "step": 90535 }, { "epoch": 11.344443052249092, "grad_norm": 0.4779312312602997, "learning_rate": 4.699706897157744e-06, "loss": 0.4599, "num_input_tokens_seen": 110115136, "step": 90540 }, { "epoch": 11.345069540157874, "grad_norm": 0.34150418639183044, "learning_rate": 4.699161172016865e-06, "loss": 0.4557, "num_input_tokens_seen": 110121312, "step": 90545 }, { "epoch": 11.345696028066659, "grad_norm": 0.2981266975402832, "learning_rate": 4.698615450472767e-06, "loss": 0.4684, "num_input_tokens_seen": 110127776, "step": 90550 }, { "epoch": 11.34632251597544, "grad_norm": 0.25202786922454834, "learning_rate": 4.698069732531977e-06, "loss": 0.4645, "num_input_tokens_seen": 110134080, "step": 90555 }, { "epoch": 11.346949003884225, "grad_norm": 0.4059881567955017, "learning_rate": 4.697524018201014e-06, "loss": 0.4655, "num_input_tokens_seen": 110139040, "step": 90560 }, { "epoch": 11.347575491793009, "grad_norm": 0.4322952926158905, "learning_rate": 4.696978307486408e-06, "loss": 0.4763, "num_input_tokens_seen": 110144960, "step": 90565 }, { "epoch": 11.348201979701791, "grad_norm": 0.2982940673828125, "learning_rate": 4.696432600394679e-06, "loss": 0.4581, "num_input_tokens_seen": 110151296, "step": 90570 }, { "epoch": 11.348828467610575, "grad_norm": 0.2837677597999573, "learning_rate": 4.695886896932356e-06, "loss": 0.4576, "num_input_tokens_seen": 110157280, "step": 90575 }, { "epoch": 11.34945495551936, "grad_norm": 0.4251044690608978, "learning_rate": 4.695341197105959e-06, "loss": 0.4669, "num_input_tokens_seen": 110163328, "step": 90580 }, { "epoch": 11.350081443428142, "grad_norm": 0.25113219022750854, "learning_rate": 4.6947955009220145e-06, "loss": 0.4603, "num_input_tokens_seen": 110169472, "step": 90585 }, { "epoch": 11.350707931336926, "grad_norm": 0.23316116631031036, "learning_rate": 4.694249808387045e-06, "loss": 0.463, "num_input_tokens_seen": 110174944, "step": 90590 }, { "epoch": 11.351334419245708, "grad_norm": 0.21865735948085785, "learning_rate": 4.693704119507578e-06, "loss": 0.4618, "num_input_tokens_seen": 110180352, "step": 90595 }, { "epoch": 11.351960907154492, "grad_norm": 0.31727465987205505, "learning_rate": 4.693158434290135e-06, "loss": 0.4609, "num_input_tokens_seen": 110186592, "step": 90600 }, { "epoch": 11.352587395063276, "grad_norm": 0.31848645210266113, "learning_rate": 4.6926127527412405e-06, "loss": 0.4491, "num_input_tokens_seen": 110191904, "step": 90605 }, { "epoch": 11.353213882972058, "grad_norm": 0.35467827320098877, "learning_rate": 4.692067074867421e-06, "loss": 0.4601, "num_input_tokens_seen": 110198272, "step": 90610 }, { "epoch": 11.353840370880842, "grad_norm": 1.0275901556015015, "learning_rate": 4.691521400675197e-06, "loss": 0.4771, "num_input_tokens_seen": 110204480, "step": 90615 }, { "epoch": 11.354466858789625, "grad_norm": 0.24112525582313538, "learning_rate": 4.690975730171095e-06, "loss": 0.4647, "num_input_tokens_seen": 110210656, "step": 90620 }, { "epoch": 11.355093346698409, "grad_norm": 0.4543324112892151, "learning_rate": 4.690430063361636e-06, "loss": 0.4427, "num_input_tokens_seen": 110216256, "step": 90625 }, { "epoch": 11.355719834607193, "grad_norm": 0.2177329808473587, "learning_rate": 4.689884400253348e-06, "loss": 0.4607, "num_input_tokens_seen": 110222080, "step": 90630 }, { "epoch": 11.356346322515975, "grad_norm": 0.29838913679122925, "learning_rate": 4.689338740852751e-06, "loss": 0.4688, "num_input_tokens_seen": 110227104, "step": 90635 }, { "epoch": 11.35697281042476, "grad_norm": 0.2995629608631134, "learning_rate": 4.688793085166372e-06, "loss": 0.4645, "num_input_tokens_seen": 110233440, "step": 90640 }, { "epoch": 11.357599298333541, "grad_norm": 0.5327396988868713, "learning_rate": 4.6882474332007314e-06, "loss": 0.4568, "num_input_tokens_seen": 110239424, "step": 90645 }, { "epoch": 11.358225786242325, "grad_norm": 0.29099223017692566, "learning_rate": 4.687701784962357e-06, "loss": 0.4434, "num_input_tokens_seen": 110245728, "step": 90650 }, { "epoch": 11.35885227415111, "grad_norm": 0.2094326615333557, "learning_rate": 4.687156140457768e-06, "loss": 0.4555, "num_input_tokens_seen": 110251872, "step": 90655 }, { "epoch": 11.359478762059892, "grad_norm": 0.409389466047287, "learning_rate": 4.6866104996934915e-06, "loss": 0.4631, "num_input_tokens_seen": 110258144, "step": 90660 }, { "epoch": 11.360105249968676, "grad_norm": 0.3355051279067993, "learning_rate": 4.6860648626760514e-06, "loss": 0.4627, "num_input_tokens_seen": 110264320, "step": 90665 }, { "epoch": 11.360731737877458, "grad_norm": 0.6067483425140381, "learning_rate": 4.685519229411968e-06, "loss": 0.4585, "num_input_tokens_seen": 110270560, "step": 90670 }, { "epoch": 11.361358225786242, "grad_norm": 0.36453744769096375, "learning_rate": 4.684973599907768e-06, "loss": 0.4477, "num_input_tokens_seen": 110276864, "step": 90675 }, { "epoch": 11.361984713695026, "grad_norm": 0.32768404483795166, "learning_rate": 4.684427974169972e-06, "loss": 0.4545, "num_input_tokens_seen": 110282784, "step": 90680 }, { "epoch": 11.362611201603809, "grad_norm": 0.2957227826118469, "learning_rate": 4.683882352205106e-06, "loss": 0.475, "num_input_tokens_seen": 110289024, "step": 90685 }, { "epoch": 11.363237689512593, "grad_norm": 0.3102510869503021, "learning_rate": 4.683336734019692e-06, "loss": 0.4632, "num_input_tokens_seen": 110295072, "step": 90690 }, { "epoch": 11.363864177421377, "grad_norm": 0.30814269185066223, "learning_rate": 4.682791119620253e-06, "loss": 0.4967, "num_input_tokens_seen": 110301344, "step": 90695 }, { "epoch": 11.364490665330159, "grad_norm": 0.21706408262252808, "learning_rate": 4.6822455090133124e-06, "loss": 0.4542, "num_input_tokens_seen": 110307456, "step": 90700 }, { "epoch": 11.365117153238943, "grad_norm": 0.3919251263141632, "learning_rate": 4.6816999022053965e-06, "loss": 0.4696, "num_input_tokens_seen": 110313536, "step": 90705 }, { "epoch": 11.365743641147725, "grad_norm": 0.29191523790359497, "learning_rate": 4.681154299203023e-06, "loss": 0.4667, "num_input_tokens_seen": 110319552, "step": 90710 }, { "epoch": 11.36637012905651, "grad_norm": 0.2829732596874237, "learning_rate": 4.6806087000127185e-06, "loss": 0.4548, "num_input_tokens_seen": 110325472, "step": 90715 }, { "epoch": 11.366996616965293, "grad_norm": 0.2750677466392517, "learning_rate": 4.680063104641007e-06, "loss": 0.4636, "num_input_tokens_seen": 110330176, "step": 90720 }, { "epoch": 11.367623104874076, "grad_norm": 0.34198734164237976, "learning_rate": 4.679517513094408e-06, "loss": 0.4715, "num_input_tokens_seen": 110336096, "step": 90725 }, { "epoch": 11.36824959278286, "grad_norm": 0.19320048391819, "learning_rate": 4.678971925379449e-06, "loss": 0.4642, "num_input_tokens_seen": 110342240, "step": 90730 }, { "epoch": 11.368876080691642, "grad_norm": 0.17864835262298584, "learning_rate": 4.6784263415026485e-06, "loss": 0.4603, "num_input_tokens_seen": 110348192, "step": 90735 }, { "epoch": 11.369502568600426, "grad_norm": 0.22605256736278534, "learning_rate": 4.6778807614705324e-06, "loss": 0.4692, "num_input_tokens_seen": 110354144, "step": 90740 }, { "epoch": 11.37012905650921, "grad_norm": 0.24573299288749695, "learning_rate": 4.677335185289623e-06, "loss": 0.465, "num_input_tokens_seen": 110360608, "step": 90745 }, { "epoch": 11.370755544417992, "grad_norm": 0.26398152112960815, "learning_rate": 4.676789612966442e-06, "loss": 0.4556, "num_input_tokens_seen": 110366592, "step": 90750 }, { "epoch": 11.371382032326776, "grad_norm": 0.6432350277900696, "learning_rate": 4.676244044507513e-06, "loss": 0.4718, "num_input_tokens_seen": 110372576, "step": 90755 }, { "epoch": 11.372008520235559, "grad_norm": 0.22188416123390198, "learning_rate": 4.6756984799193595e-06, "loss": 0.473, "num_input_tokens_seen": 110378560, "step": 90760 }, { "epoch": 11.372635008144343, "grad_norm": 0.2678091824054718, "learning_rate": 4.675152919208502e-06, "loss": 0.4654, "num_input_tokens_seen": 110384608, "step": 90765 }, { "epoch": 11.373261496053127, "grad_norm": 0.1860790103673935, "learning_rate": 4.674607362381466e-06, "loss": 0.451, "num_input_tokens_seen": 110390720, "step": 90770 }, { "epoch": 11.37388798396191, "grad_norm": 0.23362700641155243, "learning_rate": 4.674061809444771e-06, "loss": 0.4593, "num_input_tokens_seen": 110397152, "step": 90775 }, { "epoch": 11.374514471870693, "grad_norm": 0.23166131973266602, "learning_rate": 4.67351626040494e-06, "loss": 0.4646, "num_input_tokens_seen": 110403136, "step": 90780 }, { "epoch": 11.375140959779475, "grad_norm": 0.22594743967056274, "learning_rate": 4.6729707152685e-06, "loss": 0.4516, "num_input_tokens_seen": 110409440, "step": 90785 }, { "epoch": 11.37576744768826, "grad_norm": 0.20693539083003998, "learning_rate": 4.6724251740419675e-06, "loss": 0.4618, "num_input_tokens_seen": 110414688, "step": 90790 }, { "epoch": 11.376393935597044, "grad_norm": 0.18065254390239716, "learning_rate": 4.671879636731868e-06, "loss": 0.469, "num_input_tokens_seen": 110420960, "step": 90795 }, { "epoch": 11.377020423505826, "grad_norm": 0.17554214596748352, "learning_rate": 4.671334103344723e-06, "loss": 0.454, "num_input_tokens_seen": 110426752, "step": 90800 }, { "epoch": 11.37764691141461, "grad_norm": 0.2979223132133484, "learning_rate": 4.670788573887057e-06, "loss": 0.4637, "num_input_tokens_seen": 110432896, "step": 90805 }, { "epoch": 11.378273399323392, "grad_norm": 0.2531726658344269, "learning_rate": 4.670243048365388e-06, "loss": 0.4598, "num_input_tokens_seen": 110439072, "step": 90810 }, { "epoch": 11.378899887232176, "grad_norm": 0.16296206414699554, "learning_rate": 4.669697526786243e-06, "loss": 0.4768, "num_input_tokens_seen": 110444992, "step": 90815 }, { "epoch": 11.37952637514096, "grad_norm": 0.25497904419898987, "learning_rate": 4.6691520091561395e-06, "loss": 0.4539, "num_input_tokens_seen": 110451360, "step": 90820 }, { "epoch": 11.380152863049743, "grad_norm": 0.25871312618255615, "learning_rate": 4.668606495481604e-06, "loss": 0.4659, "num_input_tokens_seen": 110457664, "step": 90825 }, { "epoch": 11.380779350958527, "grad_norm": 0.17699483036994934, "learning_rate": 4.668060985769155e-06, "loss": 0.4588, "num_input_tokens_seen": 110463584, "step": 90830 }, { "epoch": 11.38140583886731, "grad_norm": 0.30384358763694763, "learning_rate": 4.667515480025316e-06, "loss": 0.4616, "num_input_tokens_seen": 110469728, "step": 90835 }, { "epoch": 11.382032326776093, "grad_norm": 0.2600315809249878, "learning_rate": 4.66696997825661e-06, "loss": 0.4679, "num_input_tokens_seen": 110476032, "step": 90840 }, { "epoch": 11.382658814684877, "grad_norm": 0.3159422278404236, "learning_rate": 4.666424480469556e-06, "loss": 0.4649, "num_input_tokens_seen": 110482080, "step": 90845 }, { "epoch": 11.38328530259366, "grad_norm": 0.18021275103092194, "learning_rate": 4.665878986670679e-06, "loss": 0.4655, "num_input_tokens_seen": 110488032, "step": 90850 }, { "epoch": 11.383911790502443, "grad_norm": 0.15099026262760162, "learning_rate": 4.665333496866499e-06, "loss": 0.4619, "num_input_tokens_seen": 110494176, "step": 90855 }, { "epoch": 11.384538278411227, "grad_norm": 0.19161702692508698, "learning_rate": 4.66478801106354e-06, "loss": 0.4559, "num_input_tokens_seen": 110500288, "step": 90860 }, { "epoch": 11.38516476632001, "grad_norm": 0.3242930471897125, "learning_rate": 4.664242529268321e-06, "loss": 0.4515, "num_input_tokens_seen": 110506368, "step": 90865 }, { "epoch": 11.385791254228794, "grad_norm": 0.189414843916893, "learning_rate": 4.663697051487366e-06, "loss": 0.4662, "num_input_tokens_seen": 110512512, "step": 90870 }, { "epoch": 11.386417742137576, "grad_norm": 0.1971931755542755, "learning_rate": 4.6631515777271935e-06, "loss": 0.4689, "num_input_tokens_seen": 110518720, "step": 90875 }, { "epoch": 11.38704423004636, "grad_norm": 0.18163320422172546, "learning_rate": 4.66260610799433e-06, "loss": 0.4668, "num_input_tokens_seen": 110525280, "step": 90880 }, { "epoch": 11.387670717955144, "grad_norm": 0.16809020936489105, "learning_rate": 4.662060642295291e-06, "loss": 0.4641, "num_input_tokens_seen": 110531232, "step": 90885 }, { "epoch": 11.388297205863926, "grad_norm": 0.34289008378982544, "learning_rate": 4.661515180636603e-06, "loss": 0.4631, "num_input_tokens_seen": 110537472, "step": 90890 }, { "epoch": 11.38892369377271, "grad_norm": 0.2437431365251541, "learning_rate": 4.6609697230247845e-06, "loss": 0.4574, "num_input_tokens_seen": 110543232, "step": 90895 }, { "epoch": 11.389550181681493, "grad_norm": 0.314754843711853, "learning_rate": 4.660424269466358e-06, "loss": 0.4652, "num_input_tokens_seen": 110549472, "step": 90900 }, { "epoch": 11.390176669590277, "grad_norm": 0.20212383568286896, "learning_rate": 4.659878819967846e-06, "loss": 0.466, "num_input_tokens_seen": 110555712, "step": 90905 }, { "epoch": 11.390803157499061, "grad_norm": 0.21690122783184052, "learning_rate": 4.659333374535767e-06, "loss": 0.4618, "num_input_tokens_seen": 110561376, "step": 90910 }, { "epoch": 11.391429645407843, "grad_norm": 0.1864694058895111, "learning_rate": 4.6587879331766465e-06, "loss": 0.4576, "num_input_tokens_seen": 110567424, "step": 90915 }, { "epoch": 11.392056133316627, "grad_norm": 0.2580128312110901, "learning_rate": 4.6582424958970005e-06, "loss": 0.4611, "num_input_tokens_seen": 110573600, "step": 90920 }, { "epoch": 11.39268262122541, "grad_norm": 0.18056577444076538, "learning_rate": 4.657697062703355e-06, "loss": 0.4667, "num_input_tokens_seen": 110579680, "step": 90925 }, { "epoch": 11.393309109134194, "grad_norm": 0.200079083442688, "learning_rate": 4.657151633602227e-06, "loss": 0.4581, "num_input_tokens_seen": 110585728, "step": 90930 }, { "epoch": 11.393935597042978, "grad_norm": 0.22946138679981232, "learning_rate": 4.6566062086001416e-06, "loss": 0.4694, "num_input_tokens_seen": 110591936, "step": 90935 }, { "epoch": 11.39456208495176, "grad_norm": 0.21038603782653809, "learning_rate": 4.656060787703615e-06, "loss": 0.4671, "num_input_tokens_seen": 110597888, "step": 90940 }, { "epoch": 11.395188572860544, "grad_norm": 0.23740988969802856, "learning_rate": 4.655515370919172e-06, "loss": 0.4637, "num_input_tokens_seen": 110604128, "step": 90945 }, { "epoch": 11.395815060769326, "grad_norm": 0.18524783849716187, "learning_rate": 4.654969958253332e-06, "loss": 0.4656, "num_input_tokens_seen": 110610080, "step": 90950 }, { "epoch": 11.39644154867811, "grad_norm": 0.20224419236183167, "learning_rate": 4.654424549712618e-06, "loss": 0.4566, "num_input_tokens_seen": 110616608, "step": 90955 }, { "epoch": 11.397068036586894, "grad_norm": 0.19017083942890167, "learning_rate": 4.6538791453035475e-06, "loss": 0.4619, "num_input_tokens_seen": 110622848, "step": 90960 }, { "epoch": 11.397694524495677, "grad_norm": 0.2233135849237442, "learning_rate": 4.653333745032642e-06, "loss": 0.4651, "num_input_tokens_seen": 110628928, "step": 90965 }, { "epoch": 11.39832101240446, "grad_norm": 0.22754406929016113, "learning_rate": 4.6527883489064254e-06, "loss": 0.4749, "num_input_tokens_seen": 110634944, "step": 90970 }, { "epoch": 11.398947500313245, "grad_norm": 0.29152676463127136, "learning_rate": 4.652242956931414e-06, "loss": 0.4577, "num_input_tokens_seen": 110641472, "step": 90975 }, { "epoch": 11.399573988222027, "grad_norm": 0.168865367770195, "learning_rate": 4.651697569114133e-06, "loss": 0.4462, "num_input_tokens_seen": 110647776, "step": 90980 }, { "epoch": 11.400200476130811, "grad_norm": 0.20089949667453766, "learning_rate": 4.651152185461098e-06, "loss": 0.4593, "num_input_tokens_seen": 110653824, "step": 90985 }, { "epoch": 11.400826964039593, "grad_norm": 0.227104052901268, "learning_rate": 4.650606805978834e-06, "loss": 0.4612, "num_input_tokens_seen": 110660064, "step": 90990 }, { "epoch": 11.401453451948377, "grad_norm": 0.3091218173503876, "learning_rate": 4.650061430673857e-06, "loss": 0.454, "num_input_tokens_seen": 110666304, "step": 90995 }, { "epoch": 11.402079939857162, "grad_norm": 0.15566366910934448, "learning_rate": 4.649516059552691e-06, "loss": 0.4608, "num_input_tokens_seen": 110672448, "step": 91000 }, { "epoch": 11.402706427765944, "grad_norm": 0.2478713095188141, "learning_rate": 4.648970692621853e-06, "loss": 0.4659, "num_input_tokens_seen": 110678208, "step": 91005 }, { "epoch": 11.403332915674728, "grad_norm": 0.2924652695655823, "learning_rate": 4.648425329887869e-06, "loss": 0.4663, "num_input_tokens_seen": 110684576, "step": 91010 }, { "epoch": 11.40395940358351, "grad_norm": 0.24706487357616425, "learning_rate": 4.647879971357253e-06, "loss": 0.4591, "num_input_tokens_seen": 110690752, "step": 91015 }, { "epoch": 11.404585891492294, "grad_norm": 0.19624552130699158, "learning_rate": 4.647334617036532e-06, "loss": 0.4478, "num_input_tokens_seen": 110696896, "step": 91020 }, { "epoch": 11.405212379401078, "grad_norm": 0.23382550477981567, "learning_rate": 4.646789266932218e-06, "loss": 0.4568, "num_input_tokens_seen": 110702976, "step": 91025 }, { "epoch": 11.40583886730986, "grad_norm": 0.20942863821983337, "learning_rate": 4.646243921050836e-06, "loss": 0.4626, "num_input_tokens_seen": 110709120, "step": 91030 }, { "epoch": 11.406465355218645, "grad_norm": 0.21984249353408813, "learning_rate": 4.645698579398907e-06, "loss": 0.4534, "num_input_tokens_seen": 110715456, "step": 91035 }, { "epoch": 11.407091843127427, "grad_norm": 0.21285809576511383, "learning_rate": 4.645153241982948e-06, "loss": 0.4505, "num_input_tokens_seen": 110721504, "step": 91040 }, { "epoch": 11.407718331036211, "grad_norm": 0.14840978384017944, "learning_rate": 4.644607908809481e-06, "loss": 0.4626, "num_input_tokens_seen": 110727616, "step": 91045 }, { "epoch": 11.408344818944995, "grad_norm": 0.4608471989631653, "learning_rate": 4.644062579885024e-06, "loss": 0.4617, "num_input_tokens_seen": 110734304, "step": 91050 }, { "epoch": 11.408971306853777, "grad_norm": 0.24639861285686493, "learning_rate": 4.643517255216099e-06, "loss": 0.4662, "num_input_tokens_seen": 110740320, "step": 91055 }, { "epoch": 11.409597794762561, "grad_norm": 0.20754827558994293, "learning_rate": 4.642971934809225e-06, "loss": 0.4665, "num_input_tokens_seen": 110746080, "step": 91060 }, { "epoch": 11.410224282671344, "grad_norm": 0.2856631577014923, "learning_rate": 4.6424266186709225e-06, "loss": 0.4573, "num_input_tokens_seen": 110752288, "step": 91065 }, { "epoch": 11.410850770580128, "grad_norm": 0.24701035022735596, "learning_rate": 4.641881306807709e-06, "loss": 0.4656, "num_input_tokens_seen": 110758752, "step": 91070 }, { "epoch": 11.411477258488912, "grad_norm": 0.2745143473148346, "learning_rate": 4.641335999226108e-06, "loss": 0.4598, "num_input_tokens_seen": 110764736, "step": 91075 }, { "epoch": 11.412103746397694, "grad_norm": 0.25453728437423706, "learning_rate": 4.640790695932634e-06, "loss": 0.4631, "num_input_tokens_seen": 110770784, "step": 91080 }, { "epoch": 11.412730234306478, "grad_norm": 0.3405202627182007, "learning_rate": 4.640245396933808e-06, "loss": 0.4694, "num_input_tokens_seen": 110776992, "step": 91085 }, { "epoch": 11.413356722215262, "grad_norm": 0.18357303738594055, "learning_rate": 4.639700102236154e-06, "loss": 0.4605, "num_input_tokens_seen": 110783264, "step": 91090 }, { "epoch": 11.413983210124044, "grad_norm": 0.20486785471439362, "learning_rate": 4.639154811846187e-06, "loss": 0.459, "num_input_tokens_seen": 110789376, "step": 91095 }, { "epoch": 11.414609698032828, "grad_norm": 0.1695503443479538, "learning_rate": 4.638609525770428e-06, "loss": 0.4628, "num_input_tokens_seen": 110795424, "step": 91100 }, { "epoch": 11.41523618594161, "grad_norm": 0.2193308025598526, "learning_rate": 4.638064244015396e-06, "loss": 0.4561, "num_input_tokens_seen": 110801376, "step": 91105 }, { "epoch": 11.415862673850395, "grad_norm": 0.24079620838165283, "learning_rate": 4.637518966587609e-06, "loss": 0.4682, "num_input_tokens_seen": 110807616, "step": 91110 }, { "epoch": 11.416489161759179, "grad_norm": 0.18962082266807556, "learning_rate": 4.636973693493588e-06, "loss": 0.4629, "num_input_tokens_seen": 110813440, "step": 91115 }, { "epoch": 11.417115649667961, "grad_norm": 0.28416186571121216, "learning_rate": 4.636428424739854e-06, "loss": 0.4634, "num_input_tokens_seen": 110819584, "step": 91120 }, { "epoch": 11.417742137576745, "grad_norm": 0.21117939054965973, "learning_rate": 4.63588316033292e-06, "loss": 0.4527, "num_input_tokens_seen": 110825984, "step": 91125 }, { "epoch": 11.418368625485527, "grad_norm": 0.22090333700180054, "learning_rate": 4.635337900279313e-06, "loss": 0.4624, "num_input_tokens_seen": 110831744, "step": 91130 }, { "epoch": 11.418995113394312, "grad_norm": 0.19580592215061188, "learning_rate": 4.634792644585544e-06, "loss": 0.4618, "num_input_tokens_seen": 110838144, "step": 91135 }, { "epoch": 11.419621601303096, "grad_norm": 0.18212561309337616, "learning_rate": 4.634247393258139e-06, "loss": 0.458, "num_input_tokens_seen": 110843744, "step": 91140 }, { "epoch": 11.420248089211878, "grad_norm": 0.23851102590560913, "learning_rate": 4.633702146303612e-06, "loss": 0.4525, "num_input_tokens_seen": 110850048, "step": 91145 }, { "epoch": 11.420874577120662, "grad_norm": 0.24518625438213348, "learning_rate": 4.633156903728484e-06, "loss": 0.4622, "num_input_tokens_seen": 110856192, "step": 91150 }, { "epoch": 11.421501065029444, "grad_norm": 0.19339025020599365, "learning_rate": 4.632611665539274e-06, "loss": 0.4644, "num_input_tokens_seen": 110861760, "step": 91155 }, { "epoch": 11.422127552938228, "grad_norm": 0.25580233335494995, "learning_rate": 4.6320664317425014e-06, "loss": 0.4558, "num_input_tokens_seen": 110867552, "step": 91160 }, { "epoch": 11.422754040847012, "grad_norm": 0.2138969600200653, "learning_rate": 4.631521202344683e-06, "loss": 0.4533, "num_input_tokens_seen": 110873600, "step": 91165 }, { "epoch": 11.423380528755795, "grad_norm": 0.2168213427066803, "learning_rate": 4.6309759773523375e-06, "loss": 0.4634, "num_input_tokens_seen": 110879424, "step": 91170 }, { "epoch": 11.424007016664579, "grad_norm": 0.2581319808959961, "learning_rate": 4.6304307567719875e-06, "loss": 0.4614, "num_input_tokens_seen": 110885312, "step": 91175 }, { "epoch": 11.424633504573361, "grad_norm": 0.24715839326381683, "learning_rate": 4.629885540610146e-06, "loss": 0.4558, "num_input_tokens_seen": 110891712, "step": 91180 }, { "epoch": 11.425259992482145, "grad_norm": 0.19428174197673798, "learning_rate": 4.6293403288733355e-06, "loss": 0.4639, "num_input_tokens_seen": 110896512, "step": 91185 }, { "epoch": 11.425886480390929, "grad_norm": 0.18136583268642426, "learning_rate": 4.628795121568071e-06, "loss": 0.466, "num_input_tokens_seen": 110902528, "step": 91190 }, { "epoch": 11.426512968299711, "grad_norm": 0.18921969830989838, "learning_rate": 4.628249918700876e-06, "loss": 0.47, "num_input_tokens_seen": 110908832, "step": 91195 }, { "epoch": 11.427139456208495, "grad_norm": 0.21670688688755035, "learning_rate": 4.627704720278262e-06, "loss": 0.4595, "num_input_tokens_seen": 110915392, "step": 91200 }, { "epoch": 11.42776594411728, "grad_norm": 0.19465970993041992, "learning_rate": 4.627159526306753e-06, "loss": 0.4611, "num_input_tokens_seen": 110921152, "step": 91205 }, { "epoch": 11.428392432026062, "grad_norm": 0.17379803955554962, "learning_rate": 4.6266143367928646e-06, "loss": 0.4616, "num_input_tokens_seen": 110927456, "step": 91210 }, { "epoch": 11.429018919934846, "grad_norm": 0.18396207690238953, "learning_rate": 4.6260691517431165e-06, "loss": 0.4667, "num_input_tokens_seen": 110933600, "step": 91215 }, { "epoch": 11.429645407843628, "grad_norm": 0.3811120092868805, "learning_rate": 4.625523971164027e-06, "loss": 0.4567, "num_input_tokens_seen": 110939936, "step": 91220 }, { "epoch": 11.430271895752412, "grad_norm": 0.18903428316116333, "learning_rate": 4.62497879506211e-06, "loss": 0.4502, "num_input_tokens_seen": 110946016, "step": 91225 }, { "epoch": 11.430898383661196, "grad_norm": 0.13677972555160522, "learning_rate": 4.624433623443891e-06, "loss": 0.4616, "num_input_tokens_seen": 110951424, "step": 91230 }, { "epoch": 11.431524871569978, "grad_norm": 0.20084287226200104, "learning_rate": 4.623888456315881e-06, "loss": 0.4581, "num_input_tokens_seen": 110957344, "step": 91235 }, { "epoch": 11.432151359478762, "grad_norm": 0.2584865093231201, "learning_rate": 4.623343293684602e-06, "loss": 0.4628, "num_input_tokens_seen": 110963616, "step": 91240 }, { "epoch": 11.432777847387545, "grad_norm": 0.20840279757976532, "learning_rate": 4.62279813555657e-06, "loss": 0.4536, "num_input_tokens_seen": 110969472, "step": 91245 }, { "epoch": 11.433404335296329, "grad_norm": 0.2755768299102783, "learning_rate": 4.6222529819383035e-06, "loss": 0.4532, "num_input_tokens_seen": 110975296, "step": 91250 }, { "epoch": 11.434030823205113, "grad_norm": 0.20851165056228638, "learning_rate": 4.62170783283632e-06, "loss": 0.463, "num_input_tokens_seen": 110981568, "step": 91255 }, { "epoch": 11.434657311113895, "grad_norm": 0.1944788098335266, "learning_rate": 4.621162688257138e-06, "loss": 0.4565, "num_input_tokens_seen": 110987744, "step": 91260 }, { "epoch": 11.43528379902268, "grad_norm": 0.207012340426445, "learning_rate": 4.620617548207273e-06, "loss": 0.461, "num_input_tokens_seen": 110994048, "step": 91265 }, { "epoch": 11.435910286931461, "grad_norm": 0.2103670984506607, "learning_rate": 4.620072412693247e-06, "loss": 0.4757, "num_input_tokens_seen": 111000224, "step": 91270 }, { "epoch": 11.436536774840246, "grad_norm": 0.2042204886674881, "learning_rate": 4.619527281721573e-06, "loss": 0.466, "num_input_tokens_seen": 111005920, "step": 91275 }, { "epoch": 11.43716326274903, "grad_norm": 0.22637933492660522, "learning_rate": 4.618982155298769e-06, "loss": 0.4663, "num_input_tokens_seen": 111012000, "step": 91280 }, { "epoch": 11.437789750657812, "grad_norm": 0.15996228158473969, "learning_rate": 4.618437033431356e-06, "loss": 0.4622, "num_input_tokens_seen": 111018432, "step": 91285 }, { "epoch": 11.438416238566596, "grad_norm": 0.25914835929870605, "learning_rate": 4.617891916125847e-06, "loss": 0.4685, "num_input_tokens_seen": 111024352, "step": 91290 }, { "epoch": 11.439042726475378, "grad_norm": 0.20515434443950653, "learning_rate": 4.617346803388764e-06, "loss": 0.4535, "num_input_tokens_seen": 111030400, "step": 91295 }, { "epoch": 11.439669214384162, "grad_norm": 0.2501837909221649, "learning_rate": 4.616801695226619e-06, "loss": 0.4608, "num_input_tokens_seen": 111036640, "step": 91300 }, { "epoch": 11.440295702292946, "grad_norm": 0.21991002559661865, "learning_rate": 4.616256591645934e-06, "loss": 0.4488, "num_input_tokens_seen": 111042880, "step": 91305 }, { "epoch": 11.440922190201729, "grad_norm": 0.22750982642173767, "learning_rate": 4.615711492653223e-06, "loss": 0.4619, "num_input_tokens_seen": 111048960, "step": 91310 }, { "epoch": 11.441548678110513, "grad_norm": 0.1852278709411621, "learning_rate": 4.615166398255004e-06, "loss": 0.4643, "num_input_tokens_seen": 111055392, "step": 91315 }, { "epoch": 11.442175166019297, "grad_norm": 0.23489485681056976, "learning_rate": 4.614621308457794e-06, "loss": 0.4581, "num_input_tokens_seen": 111061280, "step": 91320 }, { "epoch": 11.442801653928079, "grad_norm": 0.22269830107688904, "learning_rate": 4.6140762232681124e-06, "loss": 0.4701, "num_input_tokens_seen": 111067456, "step": 91325 }, { "epoch": 11.443428141836863, "grad_norm": 0.21152324974536896, "learning_rate": 4.613531142692472e-06, "loss": 0.4716, "num_input_tokens_seen": 111073568, "step": 91330 }, { "epoch": 11.444054629745645, "grad_norm": 0.28755268454551697, "learning_rate": 4.612986066737391e-06, "loss": 0.4627, "num_input_tokens_seen": 111079648, "step": 91335 }, { "epoch": 11.44468111765443, "grad_norm": 0.1710943728685379, "learning_rate": 4.612440995409389e-06, "loss": 0.4605, "num_input_tokens_seen": 111085888, "step": 91340 }, { "epoch": 11.445307605563213, "grad_norm": 0.1796252727508545, "learning_rate": 4.611895928714979e-06, "loss": 0.4581, "num_input_tokens_seen": 111091936, "step": 91345 }, { "epoch": 11.445934093471996, "grad_norm": 0.2954929769039154, "learning_rate": 4.611350866660682e-06, "loss": 0.4606, "num_input_tokens_seen": 111098016, "step": 91350 }, { "epoch": 11.44656058138078, "grad_norm": 0.19648808240890503, "learning_rate": 4.61080580925301e-06, "loss": 0.464, "num_input_tokens_seen": 111104192, "step": 91355 }, { "epoch": 11.447187069289562, "grad_norm": 0.27543342113494873, "learning_rate": 4.6102607564984815e-06, "loss": 0.4695, "num_input_tokens_seen": 111110368, "step": 91360 }, { "epoch": 11.447813557198346, "grad_norm": 0.1991272121667862, "learning_rate": 4.6097157084036145e-06, "loss": 0.4599, "num_input_tokens_seen": 111116576, "step": 91365 }, { "epoch": 11.44844004510713, "grad_norm": 0.20692415535449982, "learning_rate": 4.609170664974925e-06, "loss": 0.4496, "num_input_tokens_seen": 111122528, "step": 91370 }, { "epoch": 11.449066533015912, "grad_norm": 0.16649408638477325, "learning_rate": 4.6086256262189265e-06, "loss": 0.4571, "num_input_tokens_seen": 111127872, "step": 91375 }, { "epoch": 11.449693020924697, "grad_norm": 0.23907843232154846, "learning_rate": 4.6080805921421405e-06, "loss": 0.4531, "num_input_tokens_seen": 111133920, "step": 91380 }, { "epoch": 11.450319508833479, "grad_norm": 0.20162640511989594, "learning_rate": 4.607535562751077e-06, "loss": 0.4601, "num_input_tokens_seen": 111140128, "step": 91385 }, { "epoch": 11.450945996742263, "grad_norm": 0.18171486258506775, "learning_rate": 4.60699053805226e-06, "loss": 0.4601, "num_input_tokens_seen": 111145824, "step": 91390 }, { "epoch": 11.451572484651047, "grad_norm": 0.24744756519794464, "learning_rate": 4.606445518052198e-06, "loss": 0.4589, "num_input_tokens_seen": 111152096, "step": 91395 }, { "epoch": 11.45219897255983, "grad_norm": 0.19225803017616272, "learning_rate": 4.605900502757411e-06, "loss": 0.4718, "num_input_tokens_seen": 111158336, "step": 91400 }, { "epoch": 11.452825460468613, "grad_norm": 0.2682468295097351, "learning_rate": 4.6053554921744175e-06, "loss": 0.4701, "num_input_tokens_seen": 111164032, "step": 91405 }, { "epoch": 11.453451948377396, "grad_norm": 0.1908227950334549, "learning_rate": 4.604810486309728e-06, "loss": 0.4578, "num_input_tokens_seen": 111170304, "step": 91410 }, { "epoch": 11.45407843628618, "grad_norm": 0.24138356745243073, "learning_rate": 4.604265485169863e-06, "loss": 0.4565, "num_input_tokens_seen": 111176640, "step": 91415 }, { "epoch": 11.454704924194964, "grad_norm": 0.2440996617078781, "learning_rate": 4.6037204887613355e-06, "loss": 0.4647, "num_input_tokens_seen": 111182720, "step": 91420 }, { "epoch": 11.455331412103746, "grad_norm": 0.26192793250083923, "learning_rate": 4.603175497090665e-06, "loss": 0.4595, "num_input_tokens_seen": 111188224, "step": 91425 }, { "epoch": 11.45595790001253, "grad_norm": 0.209018737077713, "learning_rate": 4.602630510164363e-06, "loss": 0.4634, "num_input_tokens_seen": 111194304, "step": 91430 }, { "epoch": 11.456584387921312, "grad_norm": 0.20754970610141754, "learning_rate": 4.60208552798895e-06, "loss": 0.4633, "num_input_tokens_seen": 111200288, "step": 91435 }, { "epoch": 11.457210875830096, "grad_norm": 0.21481947600841522, "learning_rate": 4.601540550570936e-06, "loss": 0.4629, "num_input_tokens_seen": 111206400, "step": 91440 }, { "epoch": 11.45783736373888, "grad_norm": 0.3427737355232239, "learning_rate": 4.600995577916842e-06, "loss": 0.461, "num_input_tokens_seen": 111212800, "step": 91445 }, { "epoch": 11.458463851647663, "grad_norm": 0.26216819882392883, "learning_rate": 4.60045061003318e-06, "loss": 0.4566, "num_input_tokens_seen": 111218720, "step": 91450 }, { "epoch": 11.459090339556447, "grad_norm": 0.24248579144477844, "learning_rate": 4.599905646926467e-06, "loss": 0.4571, "num_input_tokens_seen": 111225184, "step": 91455 }, { "epoch": 11.459716827465229, "grad_norm": 0.24151961505413055, "learning_rate": 4.599360688603218e-06, "loss": 0.4664, "num_input_tokens_seen": 111231424, "step": 91460 }, { "epoch": 11.460343315374013, "grad_norm": 0.26564842462539673, "learning_rate": 4.598815735069949e-06, "loss": 0.4767, "num_input_tokens_seen": 111237600, "step": 91465 }, { "epoch": 11.460969803282797, "grad_norm": 0.2366991937160492, "learning_rate": 4.5982707863331755e-06, "loss": 0.456, "num_input_tokens_seen": 111243680, "step": 91470 }, { "epoch": 11.46159629119158, "grad_norm": 0.16583307087421417, "learning_rate": 4.597725842399411e-06, "loss": 0.474, "num_input_tokens_seen": 111250144, "step": 91475 }, { "epoch": 11.462222779100363, "grad_norm": 0.19559742510318756, "learning_rate": 4.597180903275176e-06, "loss": 0.4613, "num_input_tokens_seen": 111256224, "step": 91480 }, { "epoch": 11.462849267009148, "grad_norm": 0.2481565922498703, "learning_rate": 4.596635968966979e-06, "loss": 0.4593, "num_input_tokens_seen": 111262656, "step": 91485 }, { "epoch": 11.46347575491793, "grad_norm": 0.192647323012352, "learning_rate": 4.596091039481339e-06, "loss": 0.4612, "num_input_tokens_seen": 111268832, "step": 91490 }, { "epoch": 11.464102242826714, "grad_norm": 0.19451996684074402, "learning_rate": 4.59554611482477e-06, "loss": 0.4623, "num_input_tokens_seen": 111275232, "step": 91495 }, { "epoch": 11.464728730735496, "grad_norm": 0.17388655245304108, "learning_rate": 4.595001195003789e-06, "loss": 0.4514, "num_input_tokens_seen": 111281312, "step": 91500 }, { "epoch": 11.46535521864428, "grad_norm": 0.20349042117595673, "learning_rate": 4.594456280024906e-06, "loss": 0.4511, "num_input_tokens_seen": 111287552, "step": 91505 }, { "epoch": 11.465981706553064, "grad_norm": 0.1942758709192276, "learning_rate": 4.593911369894641e-06, "loss": 0.4589, "num_input_tokens_seen": 111293376, "step": 91510 }, { "epoch": 11.466608194461847, "grad_norm": 0.20723667740821838, "learning_rate": 4.593366464619507e-06, "loss": 0.4552, "num_input_tokens_seen": 111299264, "step": 91515 }, { "epoch": 11.46723468237063, "grad_norm": 0.2498805969953537, "learning_rate": 4.592821564206017e-06, "loss": 0.4678, "num_input_tokens_seen": 111305280, "step": 91520 }, { "epoch": 11.467861170279413, "grad_norm": 0.2753176689147949, "learning_rate": 4.59227666866069e-06, "loss": 0.4614, "num_input_tokens_seen": 111311424, "step": 91525 }, { "epoch": 11.468487658188197, "grad_norm": 0.24125032126903534, "learning_rate": 4.591731777990036e-06, "loss": 0.4583, "num_input_tokens_seen": 111317536, "step": 91530 }, { "epoch": 11.469114146096981, "grad_norm": 0.2393423318862915, "learning_rate": 4.5911868922005745e-06, "loss": 0.4638, "num_input_tokens_seen": 111323616, "step": 91535 }, { "epoch": 11.469740634005763, "grad_norm": 0.23632074892520905, "learning_rate": 4.590642011298815e-06, "loss": 0.461, "num_input_tokens_seen": 111329440, "step": 91540 }, { "epoch": 11.470367121914547, "grad_norm": 0.2466384470462799, "learning_rate": 4.590097135291277e-06, "loss": 0.4604, "num_input_tokens_seen": 111335296, "step": 91545 }, { "epoch": 11.47099360982333, "grad_norm": 0.216377854347229, "learning_rate": 4.589552264184469e-06, "loss": 0.4684, "num_input_tokens_seen": 111341120, "step": 91550 }, { "epoch": 11.471620097732114, "grad_norm": 0.22335310280323029, "learning_rate": 4.589007397984912e-06, "loss": 0.4726, "num_input_tokens_seen": 111347296, "step": 91555 }, { "epoch": 11.472246585640898, "grad_norm": 0.1915753185749054, "learning_rate": 4.5884625366991146e-06, "loss": 0.4612, "num_input_tokens_seen": 111353472, "step": 91560 }, { "epoch": 11.47287307354968, "grad_norm": 0.20344246923923492, "learning_rate": 4.587917680333595e-06, "loss": 0.4659, "num_input_tokens_seen": 111359648, "step": 91565 }, { "epoch": 11.473499561458464, "grad_norm": 0.251118004322052, "learning_rate": 4.5873728288948645e-06, "loss": 0.4663, "num_input_tokens_seen": 111365760, "step": 91570 }, { "epoch": 11.474126049367246, "grad_norm": 0.28421345353126526, "learning_rate": 4.586827982389441e-06, "loss": 0.4673, "num_input_tokens_seen": 111371808, "step": 91575 }, { "epoch": 11.47475253727603, "grad_norm": 0.23113280534744263, "learning_rate": 4.586283140823834e-06, "loss": 0.4681, "num_input_tokens_seen": 111377920, "step": 91580 }, { "epoch": 11.475379025184814, "grad_norm": 0.20487600564956665, "learning_rate": 4.585738304204561e-06, "loss": 0.4664, "num_input_tokens_seen": 111383968, "step": 91585 }, { "epoch": 11.476005513093597, "grad_norm": 0.2922714948654175, "learning_rate": 4.585193472538136e-06, "loss": 0.4549, "num_input_tokens_seen": 111390464, "step": 91590 }, { "epoch": 11.47663200100238, "grad_norm": 0.24115800857543945, "learning_rate": 4.58464864583107e-06, "loss": 0.4588, "num_input_tokens_seen": 111396480, "step": 91595 }, { "epoch": 11.477258488911165, "grad_norm": 0.22583530843257904, "learning_rate": 4.5841038240898805e-06, "loss": 0.4677, "num_input_tokens_seen": 111402432, "step": 91600 }, { "epoch": 11.477884976819947, "grad_norm": 0.28702613711357117, "learning_rate": 4.583559007321078e-06, "loss": 0.4674, "num_input_tokens_seen": 111408480, "step": 91605 }, { "epoch": 11.478511464728731, "grad_norm": 0.20548835396766663, "learning_rate": 4.583014195531179e-06, "loss": 0.4516, "num_input_tokens_seen": 111414368, "step": 91610 }, { "epoch": 11.479137952637513, "grad_norm": 0.2074936479330063, "learning_rate": 4.582469388726693e-06, "loss": 0.453, "num_input_tokens_seen": 111420576, "step": 91615 }, { "epoch": 11.479764440546298, "grad_norm": 0.20775429904460907, "learning_rate": 4.581924586914139e-06, "loss": 0.4619, "num_input_tokens_seen": 111427296, "step": 91620 }, { "epoch": 11.480390928455082, "grad_norm": 0.20834918320178986, "learning_rate": 4.581379790100027e-06, "loss": 0.4556, "num_input_tokens_seen": 111433312, "step": 91625 }, { "epoch": 11.481017416363864, "grad_norm": 0.29409360885620117, "learning_rate": 4.580834998290873e-06, "loss": 0.4641, "num_input_tokens_seen": 111439584, "step": 91630 }, { "epoch": 11.481643904272648, "grad_norm": 0.2779942750930786, "learning_rate": 4.580290211493188e-06, "loss": 0.4671, "num_input_tokens_seen": 111445600, "step": 91635 }, { "epoch": 11.48227039218143, "grad_norm": 0.23661202192306519, "learning_rate": 4.579745429713487e-06, "loss": 0.4737, "num_input_tokens_seen": 111451648, "step": 91640 }, { "epoch": 11.482896880090214, "grad_norm": 0.2288106083869934, "learning_rate": 4.579200652958281e-06, "loss": 0.4676, "num_input_tokens_seen": 111457728, "step": 91645 }, { "epoch": 11.483523367998998, "grad_norm": 0.22255733609199524, "learning_rate": 4.5786558812340854e-06, "loss": 0.4679, "num_input_tokens_seen": 111463488, "step": 91650 }, { "epoch": 11.48414985590778, "grad_norm": 0.19323097169399261, "learning_rate": 4.578111114547415e-06, "loss": 0.4654, "num_input_tokens_seen": 111469728, "step": 91655 }, { "epoch": 11.484776343816565, "grad_norm": 0.2526392638683319, "learning_rate": 4.5775663529047785e-06, "loss": 0.4597, "num_input_tokens_seen": 111475584, "step": 91660 }, { "epoch": 11.485402831725347, "grad_norm": 0.24042801558971405, "learning_rate": 4.5770215963126945e-06, "loss": 0.458, "num_input_tokens_seen": 111482016, "step": 91665 }, { "epoch": 11.486029319634131, "grad_norm": 0.2543199360370636, "learning_rate": 4.57647684477767e-06, "loss": 0.4575, "num_input_tokens_seen": 111488480, "step": 91670 }, { "epoch": 11.486655807542915, "grad_norm": 0.1850021779537201, "learning_rate": 4.575932098306222e-06, "loss": 0.4629, "num_input_tokens_seen": 111494752, "step": 91675 }, { "epoch": 11.487282295451697, "grad_norm": 0.22694960236549377, "learning_rate": 4.575387356904861e-06, "loss": 0.4673, "num_input_tokens_seen": 111500992, "step": 91680 }, { "epoch": 11.487908783360481, "grad_norm": 0.241132915019989, "learning_rate": 4.574842620580104e-06, "loss": 0.4671, "num_input_tokens_seen": 111507584, "step": 91685 }, { "epoch": 11.488535271269264, "grad_norm": 0.21241724491119385, "learning_rate": 4.574297889338458e-06, "loss": 0.4787, "num_input_tokens_seen": 111513600, "step": 91690 }, { "epoch": 11.489161759178048, "grad_norm": 0.2533387839794159, "learning_rate": 4.573753163186441e-06, "loss": 0.4563, "num_input_tokens_seen": 111520096, "step": 91695 }, { "epoch": 11.489788247086832, "grad_norm": 0.22490115463733673, "learning_rate": 4.573208442130562e-06, "loss": 0.4628, "num_input_tokens_seen": 111526112, "step": 91700 }, { "epoch": 11.490414734995614, "grad_norm": 0.1999538242816925, "learning_rate": 4.572663726177334e-06, "loss": 0.4589, "num_input_tokens_seen": 111531200, "step": 91705 }, { "epoch": 11.491041222904398, "grad_norm": 0.19416992366313934, "learning_rate": 4.572119015333273e-06, "loss": 0.4581, "num_input_tokens_seen": 111537088, "step": 91710 }, { "epoch": 11.491667710813182, "grad_norm": 0.2264879047870636, "learning_rate": 4.571574309604887e-06, "loss": 0.4645, "num_input_tokens_seen": 111543040, "step": 91715 }, { "epoch": 11.492294198721964, "grad_norm": 0.15151244401931763, "learning_rate": 4.571029608998691e-06, "loss": 0.4596, "num_input_tokens_seen": 111549472, "step": 91720 }, { "epoch": 11.492920686630749, "grad_norm": 0.20189817249774933, "learning_rate": 4.570484913521196e-06, "loss": 0.4711, "num_input_tokens_seen": 111555776, "step": 91725 }, { "epoch": 11.49354717453953, "grad_norm": 0.226780965924263, "learning_rate": 4.569940223178916e-06, "loss": 0.4612, "num_input_tokens_seen": 111561344, "step": 91730 }, { "epoch": 11.494173662448315, "grad_norm": 0.19219359755516052, "learning_rate": 4.56939553797836e-06, "loss": 0.4684, "num_input_tokens_seen": 111567136, "step": 91735 }, { "epoch": 11.494800150357099, "grad_norm": 0.17246811091899872, "learning_rate": 4.568850857926046e-06, "loss": 0.46, "num_input_tokens_seen": 111573376, "step": 91740 }, { "epoch": 11.495426638265881, "grad_norm": 0.2033555507659912, "learning_rate": 4.568306183028479e-06, "loss": 0.4686, "num_input_tokens_seen": 111579360, "step": 91745 }, { "epoch": 11.496053126174665, "grad_norm": 0.1795104295015335, "learning_rate": 4.567761513292177e-06, "loss": 0.4469, "num_input_tokens_seen": 111585536, "step": 91750 }, { "epoch": 11.496679614083448, "grad_norm": 0.20058199763298035, "learning_rate": 4.567216848723648e-06, "loss": 0.4589, "num_input_tokens_seen": 111591648, "step": 91755 }, { "epoch": 11.497306101992232, "grad_norm": 0.20566999912261963, "learning_rate": 4.566672189329407e-06, "loss": 0.4646, "num_input_tokens_seen": 111597600, "step": 91760 }, { "epoch": 11.497932589901016, "grad_norm": 0.18796689808368683, "learning_rate": 4.566127535115963e-06, "loss": 0.4616, "num_input_tokens_seen": 111603296, "step": 91765 }, { "epoch": 11.498559077809798, "grad_norm": 0.2483413964509964, "learning_rate": 4.565582886089829e-06, "loss": 0.4661, "num_input_tokens_seen": 111609664, "step": 91770 }, { "epoch": 11.499185565718582, "grad_norm": 0.1889295130968094, "learning_rate": 4.565038242257517e-06, "loss": 0.4513, "num_input_tokens_seen": 111615840, "step": 91775 }, { "epoch": 11.499812053627364, "grad_norm": 0.1913960874080658, "learning_rate": 4.564493603625539e-06, "loss": 0.4575, "num_input_tokens_seen": 111622112, "step": 91780 }, { "epoch": 11.500438541536148, "grad_norm": 0.2282177358865738, "learning_rate": 4.5639489702004055e-06, "loss": 0.4583, "num_input_tokens_seen": 111628256, "step": 91785 }, { "epoch": 11.501065029444932, "grad_norm": 0.37462395429611206, "learning_rate": 4.563404341988629e-06, "loss": 0.4585, "num_input_tokens_seen": 111634496, "step": 91790 }, { "epoch": 11.501691517353715, "grad_norm": 0.17987577617168427, "learning_rate": 4.562859718996722e-06, "loss": 0.4601, "num_input_tokens_seen": 111640608, "step": 91795 }, { "epoch": 11.502318005262499, "grad_norm": 0.21292094886302948, "learning_rate": 4.562315101231192e-06, "loss": 0.4645, "num_input_tokens_seen": 111646656, "step": 91800 }, { "epoch": 11.502944493171281, "grad_norm": 0.3068181872367859, "learning_rate": 4.5617704886985555e-06, "loss": 0.4686, "num_input_tokens_seen": 111652800, "step": 91805 }, { "epoch": 11.503570981080065, "grad_norm": 0.2041298747062683, "learning_rate": 4.561225881405319e-06, "loss": 0.4538, "num_input_tokens_seen": 111658688, "step": 91810 }, { "epoch": 11.504197468988849, "grad_norm": 0.2303735613822937, "learning_rate": 4.560681279357998e-06, "loss": 0.4629, "num_input_tokens_seen": 111665184, "step": 91815 }, { "epoch": 11.504823956897631, "grad_norm": 0.23722591996192932, "learning_rate": 4.5601366825631e-06, "loss": 0.4654, "num_input_tokens_seen": 111671520, "step": 91820 }, { "epoch": 11.505450444806415, "grad_norm": 0.2456623613834381, "learning_rate": 4.559592091027139e-06, "loss": 0.4631, "num_input_tokens_seen": 111677408, "step": 91825 }, { "epoch": 11.5060769327152, "grad_norm": 0.33269447088241577, "learning_rate": 4.559047504756623e-06, "loss": 0.4629, "num_input_tokens_seen": 111683424, "step": 91830 }, { "epoch": 11.506703420623982, "grad_norm": 0.2271120399236679, "learning_rate": 4.558502923758067e-06, "loss": 0.4577, "num_input_tokens_seen": 111688672, "step": 91835 }, { "epoch": 11.507329908532766, "grad_norm": 0.2761833071708679, "learning_rate": 4.557958348037978e-06, "loss": 0.4611, "num_input_tokens_seen": 111694720, "step": 91840 }, { "epoch": 11.507956396441548, "grad_norm": 0.20468340814113617, "learning_rate": 4.557413777602869e-06, "loss": 0.4695, "num_input_tokens_seen": 111700832, "step": 91845 }, { "epoch": 11.508582884350332, "grad_norm": 0.2329816222190857, "learning_rate": 4.556869212459252e-06, "loss": 0.4708, "num_input_tokens_seen": 111707200, "step": 91850 }, { "epoch": 11.509209372259116, "grad_norm": 0.2423081398010254, "learning_rate": 4.556324652613634e-06, "loss": 0.4561, "num_input_tokens_seen": 111713376, "step": 91855 }, { "epoch": 11.509835860167899, "grad_norm": 0.22651313245296478, "learning_rate": 4.55578009807253e-06, "loss": 0.4537, "num_input_tokens_seen": 111719840, "step": 91860 }, { "epoch": 11.510462348076683, "grad_norm": 0.15647494792938232, "learning_rate": 4.555235548842446e-06, "loss": 0.465, "num_input_tokens_seen": 111726144, "step": 91865 }, { "epoch": 11.511088835985465, "grad_norm": 0.1900678128004074, "learning_rate": 4.554691004929896e-06, "loss": 0.4686, "num_input_tokens_seen": 111732288, "step": 91870 }, { "epoch": 11.511715323894249, "grad_norm": 0.1990150511264801, "learning_rate": 4.55414646634139e-06, "loss": 0.4593, "num_input_tokens_seen": 111738528, "step": 91875 }, { "epoch": 11.512341811803033, "grad_norm": 0.2004641890525818, "learning_rate": 4.553601933083438e-06, "loss": 0.4669, "num_input_tokens_seen": 111744896, "step": 91880 }, { "epoch": 11.512968299711815, "grad_norm": 0.16992247104644775, "learning_rate": 4.553057405162548e-06, "loss": 0.4636, "num_input_tokens_seen": 111750624, "step": 91885 }, { "epoch": 11.5135947876206, "grad_norm": 0.19520938396453857, "learning_rate": 4.552512882585234e-06, "loss": 0.4526, "num_input_tokens_seen": 111756672, "step": 91890 }, { "epoch": 11.514221275529382, "grad_norm": 0.1861262023448944, "learning_rate": 4.551968365358005e-06, "loss": 0.4553, "num_input_tokens_seen": 111762976, "step": 91895 }, { "epoch": 11.514847763438166, "grad_norm": 0.20537470281124115, "learning_rate": 4.55142385348737e-06, "loss": 0.4611, "num_input_tokens_seen": 111769312, "step": 91900 }, { "epoch": 11.51547425134695, "grad_norm": 0.19919820129871368, "learning_rate": 4.550879346979842e-06, "loss": 0.4634, "num_input_tokens_seen": 111775360, "step": 91905 }, { "epoch": 11.516100739255732, "grad_norm": 0.17937394976615906, "learning_rate": 4.5503348458419266e-06, "loss": 0.4468, "num_input_tokens_seen": 111781408, "step": 91910 }, { "epoch": 11.516727227164516, "grad_norm": 0.18979595601558685, "learning_rate": 4.549790350080139e-06, "loss": 0.4618, "num_input_tokens_seen": 111787552, "step": 91915 }, { "epoch": 11.517353715073298, "grad_norm": 0.19833919405937195, "learning_rate": 4.549245859700983e-06, "loss": 0.4607, "num_input_tokens_seen": 111793824, "step": 91920 }, { "epoch": 11.517980202982082, "grad_norm": 0.1907138079404831, "learning_rate": 4.5487013747109735e-06, "loss": 0.4577, "num_input_tokens_seen": 111800288, "step": 91925 }, { "epoch": 11.518606690890866, "grad_norm": 0.21442914009094238, "learning_rate": 4.548156895116619e-06, "loss": 0.463, "num_input_tokens_seen": 111806560, "step": 91930 }, { "epoch": 11.519233178799649, "grad_norm": 0.26266273856163025, "learning_rate": 4.547612420924427e-06, "loss": 0.4725, "num_input_tokens_seen": 111812640, "step": 91935 }, { "epoch": 11.519859666708433, "grad_norm": 0.1547035574913025, "learning_rate": 4.547067952140909e-06, "loss": 0.4556, "num_input_tokens_seen": 111818816, "step": 91940 }, { "epoch": 11.520486154617217, "grad_norm": 0.2264978438615799, "learning_rate": 4.546523488772577e-06, "loss": 0.4594, "num_input_tokens_seen": 111824896, "step": 91945 }, { "epoch": 11.521112642525999, "grad_norm": 0.20379258692264557, "learning_rate": 4.545979030825935e-06, "loss": 0.4505, "num_input_tokens_seen": 111830848, "step": 91950 }, { "epoch": 11.521739130434783, "grad_norm": 0.25016477704048157, "learning_rate": 4.545434578307495e-06, "loss": 0.4574, "num_input_tokens_seen": 111836832, "step": 91955 }, { "epoch": 11.522365618343565, "grad_norm": 0.2014504373073578, "learning_rate": 4.54489013122377e-06, "loss": 0.4638, "num_input_tokens_seen": 111842528, "step": 91960 }, { "epoch": 11.52299210625235, "grad_norm": 0.15440760552883148, "learning_rate": 4.5443456895812635e-06, "loss": 0.46, "num_input_tokens_seen": 111849024, "step": 91965 }, { "epoch": 11.523618594161132, "grad_norm": 0.24314916133880615, "learning_rate": 4.543801253386489e-06, "loss": 0.4546, "num_input_tokens_seen": 111855552, "step": 91970 }, { "epoch": 11.524245082069916, "grad_norm": 0.2063908576965332, "learning_rate": 4.543256822645954e-06, "loss": 0.4531, "num_input_tokens_seen": 111861664, "step": 91975 }, { "epoch": 11.5248715699787, "grad_norm": 0.2275768667459488, "learning_rate": 4.542712397366166e-06, "loss": 0.4639, "num_input_tokens_seen": 111867616, "step": 91980 }, { "epoch": 11.525498057887482, "grad_norm": 0.18986551463603973, "learning_rate": 4.542167977553637e-06, "loss": 0.4674, "num_input_tokens_seen": 111873536, "step": 91985 }, { "epoch": 11.526124545796266, "grad_norm": 0.30802854895591736, "learning_rate": 4.541623563214875e-06, "loss": 0.4665, "num_input_tokens_seen": 111879904, "step": 91990 }, { "epoch": 11.52675103370505, "grad_norm": 0.18944475054740906, "learning_rate": 4.541079154356387e-06, "loss": 0.4585, "num_input_tokens_seen": 111885504, "step": 91995 }, { "epoch": 11.527377521613833, "grad_norm": 0.2636672854423523, "learning_rate": 4.540534750984687e-06, "loss": 0.4663, "num_input_tokens_seen": 111891648, "step": 92000 }, { "epoch": 11.528004009522617, "grad_norm": 0.2277117222547531, "learning_rate": 4.539990353106277e-06, "loss": 0.4653, "num_input_tokens_seen": 111897408, "step": 92005 }, { "epoch": 11.528630497431399, "grad_norm": 0.1525687426328659, "learning_rate": 4.5394459607276716e-06, "loss": 0.4617, "num_input_tokens_seen": 111903200, "step": 92010 }, { "epoch": 11.529256985340183, "grad_norm": 0.19117528200149536, "learning_rate": 4.538901573855375e-06, "loss": 0.4635, "num_input_tokens_seen": 111909632, "step": 92015 }, { "epoch": 11.529883473248967, "grad_norm": 0.2504618763923645, "learning_rate": 4.538357192495897e-06, "loss": 0.46, "num_input_tokens_seen": 111915648, "step": 92020 }, { "epoch": 11.53050996115775, "grad_norm": 0.20969608426094055, "learning_rate": 4.53781281665575e-06, "loss": 0.4587, "num_input_tokens_seen": 111921888, "step": 92025 }, { "epoch": 11.531136449066533, "grad_norm": 0.18385900557041168, "learning_rate": 4.537268446341436e-06, "loss": 0.4666, "num_input_tokens_seen": 111927904, "step": 92030 }, { "epoch": 11.531762936975316, "grad_norm": 0.30031126737594604, "learning_rate": 4.536724081559469e-06, "loss": 0.4529, "num_input_tokens_seen": 111933216, "step": 92035 }, { "epoch": 11.5323894248841, "grad_norm": 0.20645944774150848, "learning_rate": 4.536179722316354e-06, "loss": 0.4612, "num_input_tokens_seen": 111939360, "step": 92040 }, { "epoch": 11.533015912792884, "grad_norm": 0.21063284575939178, "learning_rate": 4.5356353686186015e-06, "loss": 0.4634, "num_input_tokens_seen": 111945504, "step": 92045 }, { "epoch": 11.533642400701666, "grad_norm": 0.20097288489341736, "learning_rate": 4.535091020472717e-06, "loss": 0.4604, "num_input_tokens_seen": 111951904, "step": 92050 }, { "epoch": 11.53426888861045, "grad_norm": 0.3023442327976227, "learning_rate": 4.534546677885212e-06, "loss": 0.4453, "num_input_tokens_seen": 111958080, "step": 92055 }, { "epoch": 11.534895376519232, "grad_norm": 0.19549943506717682, "learning_rate": 4.534002340862592e-06, "loss": 0.4578, "num_input_tokens_seen": 111964256, "step": 92060 }, { "epoch": 11.535521864428016, "grad_norm": 0.21638457477092743, "learning_rate": 4.533458009411367e-06, "loss": 0.4664, "num_input_tokens_seen": 111970336, "step": 92065 }, { "epoch": 11.5361483523368, "grad_norm": 0.19883666932582855, "learning_rate": 4.532913683538041e-06, "loss": 0.4625, "num_input_tokens_seen": 111976192, "step": 92070 }, { "epoch": 11.536774840245583, "grad_norm": 0.1606598049402237, "learning_rate": 4.532369363249126e-06, "loss": 0.4644, "num_input_tokens_seen": 111982336, "step": 92075 }, { "epoch": 11.537401328154367, "grad_norm": 0.25832659006118774, "learning_rate": 4.531825048551129e-06, "loss": 0.459, "num_input_tokens_seen": 111988544, "step": 92080 }, { "epoch": 11.538027816063149, "grad_norm": 0.2527572214603424, "learning_rate": 4.531280739450556e-06, "loss": 0.4611, "num_input_tokens_seen": 111993824, "step": 92085 }, { "epoch": 11.538654303971933, "grad_norm": 0.24652427434921265, "learning_rate": 4.530736435953917e-06, "loss": 0.46, "num_input_tokens_seen": 112000000, "step": 92090 }, { "epoch": 11.539280791880717, "grad_norm": 0.2325006127357483, "learning_rate": 4.530192138067717e-06, "loss": 0.4628, "num_input_tokens_seen": 112005952, "step": 92095 }, { "epoch": 11.5399072797895, "grad_norm": 0.19071711599826813, "learning_rate": 4.529647845798468e-06, "loss": 0.4437, "num_input_tokens_seen": 112012096, "step": 92100 }, { "epoch": 11.540533767698284, "grad_norm": 0.18951819837093353, "learning_rate": 4.529103559152671e-06, "loss": 0.4595, "num_input_tokens_seen": 112018272, "step": 92105 }, { "epoch": 11.541160255607068, "grad_norm": 0.23285192251205444, "learning_rate": 4.52855927813684e-06, "loss": 0.4632, "num_input_tokens_seen": 112024320, "step": 92110 }, { "epoch": 11.54178674351585, "grad_norm": 0.18575428426265717, "learning_rate": 4.528015002757477e-06, "loss": 0.4507, "num_input_tokens_seen": 112030400, "step": 92115 }, { "epoch": 11.542413231424634, "grad_norm": 0.2011166512966156, "learning_rate": 4.527470733021093e-06, "loss": 0.4619, "num_input_tokens_seen": 112036448, "step": 92120 }, { "epoch": 11.543039719333416, "grad_norm": 0.22168757021427155, "learning_rate": 4.5269264689341915e-06, "loss": 0.4558, "num_input_tokens_seen": 112042560, "step": 92125 }, { "epoch": 11.5436662072422, "grad_norm": 0.30407610535621643, "learning_rate": 4.5263822105032824e-06, "loss": 0.4647, "num_input_tokens_seen": 112048000, "step": 92130 }, { "epoch": 11.544292695150984, "grad_norm": 0.2752876579761505, "learning_rate": 4.525837957734873e-06, "loss": 0.4666, "num_input_tokens_seen": 112053696, "step": 92135 }, { "epoch": 11.544919183059767, "grad_norm": 0.27923575043678284, "learning_rate": 4.525293710635469e-06, "loss": 0.4668, "num_input_tokens_seen": 112059936, "step": 92140 }, { "epoch": 11.54554567096855, "grad_norm": 0.19839535653591156, "learning_rate": 4.524749469211578e-06, "loss": 0.4533, "num_input_tokens_seen": 112065856, "step": 92145 }, { "epoch": 11.546172158877333, "grad_norm": 0.31800708174705505, "learning_rate": 4.524205233469706e-06, "loss": 0.4531, "num_input_tokens_seen": 112071808, "step": 92150 }, { "epoch": 11.546798646786117, "grad_norm": 0.2688107490539551, "learning_rate": 4.523661003416362e-06, "loss": 0.4553, "num_input_tokens_seen": 112077856, "step": 92155 }, { "epoch": 11.547425134694901, "grad_norm": 0.4038258194923401, "learning_rate": 4.52311677905805e-06, "loss": 0.466, "num_input_tokens_seen": 112083744, "step": 92160 }, { "epoch": 11.548051622603683, "grad_norm": 0.26358669996261597, "learning_rate": 4.5225725604012786e-06, "loss": 0.455, "num_input_tokens_seen": 112089632, "step": 92165 }, { "epoch": 11.548678110512467, "grad_norm": 0.2986423075199127, "learning_rate": 4.5220283474525524e-06, "loss": 0.4705, "num_input_tokens_seen": 112095840, "step": 92170 }, { "epoch": 11.54930459842125, "grad_norm": 0.2336115539073944, "learning_rate": 4.5214841402183815e-06, "loss": 0.4596, "num_input_tokens_seen": 112101760, "step": 92175 }, { "epoch": 11.549931086330034, "grad_norm": 0.30688247084617615, "learning_rate": 4.520939938705268e-06, "loss": 0.4722, "num_input_tokens_seen": 112107968, "step": 92180 }, { "epoch": 11.550557574238818, "grad_norm": 0.23132291436195374, "learning_rate": 4.52039574291972e-06, "loss": 0.4619, "num_input_tokens_seen": 112114144, "step": 92185 }, { "epoch": 11.5511840621476, "grad_norm": 0.20576293766498566, "learning_rate": 4.5198515528682445e-06, "loss": 0.4554, "num_input_tokens_seen": 112120096, "step": 92190 }, { "epoch": 11.551810550056384, "grad_norm": 0.2504231333732605, "learning_rate": 4.519307368557349e-06, "loss": 0.4612, "num_input_tokens_seen": 112125984, "step": 92195 }, { "epoch": 11.552437037965166, "grad_norm": 0.24440959095954895, "learning_rate": 4.518763189993536e-06, "loss": 0.4672, "num_input_tokens_seen": 112132352, "step": 92200 }, { "epoch": 11.55306352587395, "grad_norm": 0.23378098011016846, "learning_rate": 4.518219017183313e-06, "loss": 0.4672, "num_input_tokens_seen": 112138304, "step": 92205 }, { "epoch": 11.553690013782735, "grad_norm": 0.21713198721408844, "learning_rate": 4.51767485013319e-06, "loss": 0.4554, "num_input_tokens_seen": 112144864, "step": 92210 }, { "epoch": 11.554316501691517, "grad_norm": 0.18101908266544342, "learning_rate": 4.517130688849667e-06, "loss": 0.4626, "num_input_tokens_seen": 112151072, "step": 92215 }, { "epoch": 11.5549429896003, "grad_norm": 0.20477890968322754, "learning_rate": 4.516586533339255e-06, "loss": 0.4499, "num_input_tokens_seen": 112156960, "step": 92220 }, { "epoch": 11.555569477509085, "grad_norm": 0.19562537968158722, "learning_rate": 4.516042383608454e-06, "loss": 0.4615, "num_input_tokens_seen": 112163040, "step": 92225 }, { "epoch": 11.556195965417867, "grad_norm": 0.20892445743083954, "learning_rate": 4.515498239663777e-06, "loss": 0.4609, "num_input_tokens_seen": 112168960, "step": 92230 }, { "epoch": 11.556822453326651, "grad_norm": 0.20909465849399567, "learning_rate": 4.514954101511723e-06, "loss": 0.4641, "num_input_tokens_seen": 112175264, "step": 92235 }, { "epoch": 11.557448941235434, "grad_norm": 0.2015009969472885, "learning_rate": 4.514409969158801e-06, "loss": 0.4675, "num_input_tokens_seen": 112181536, "step": 92240 }, { "epoch": 11.558075429144218, "grad_norm": 0.3518124520778656, "learning_rate": 4.513865842611516e-06, "loss": 0.4615, "num_input_tokens_seen": 112187136, "step": 92245 }, { "epoch": 11.558701917053002, "grad_norm": 0.1742650419473648, "learning_rate": 4.513321721876376e-06, "loss": 0.4583, "num_input_tokens_seen": 112193088, "step": 92250 }, { "epoch": 11.559328404961784, "grad_norm": 0.18227455019950867, "learning_rate": 4.512777606959881e-06, "loss": 0.4575, "num_input_tokens_seen": 112199168, "step": 92255 }, { "epoch": 11.559954892870568, "grad_norm": 0.3302321135997772, "learning_rate": 4.512233497868542e-06, "loss": 0.45, "num_input_tokens_seen": 112205408, "step": 92260 }, { "epoch": 11.56058138077935, "grad_norm": 0.1948750913143158, "learning_rate": 4.511689394608859e-06, "loss": 0.4596, "num_input_tokens_seen": 112211328, "step": 92265 }, { "epoch": 11.561207868688134, "grad_norm": 0.33119454979896545, "learning_rate": 4.51114529718734e-06, "loss": 0.4553, "num_input_tokens_seen": 112217696, "step": 92270 }, { "epoch": 11.561834356596918, "grad_norm": 0.26765018701553345, "learning_rate": 4.510601205610492e-06, "loss": 0.4506, "num_input_tokens_seen": 112223808, "step": 92275 }, { "epoch": 11.5624608445057, "grad_norm": 0.2695309519767761, "learning_rate": 4.510057119884817e-06, "loss": 0.4692, "num_input_tokens_seen": 112230176, "step": 92280 }, { "epoch": 11.563087332414485, "grad_norm": 0.21499867737293243, "learning_rate": 4.509513040016822e-06, "loss": 0.4653, "num_input_tokens_seen": 112236160, "step": 92285 }, { "epoch": 11.563713820323267, "grad_norm": 0.21891766786575317, "learning_rate": 4.5089689660130085e-06, "loss": 0.4642, "num_input_tokens_seen": 112242656, "step": 92290 }, { "epoch": 11.564340308232051, "grad_norm": 0.4126850366592407, "learning_rate": 4.508424897879886e-06, "loss": 0.4664, "num_input_tokens_seen": 112248832, "step": 92295 }, { "epoch": 11.564966796140835, "grad_norm": 0.23180708289146423, "learning_rate": 4.507880835623955e-06, "loss": 0.4663, "num_input_tokens_seen": 112254752, "step": 92300 }, { "epoch": 11.565593284049617, "grad_norm": 0.2507920265197754, "learning_rate": 4.507336779251725e-06, "loss": 0.4521, "num_input_tokens_seen": 112260800, "step": 92305 }, { "epoch": 11.566219771958401, "grad_norm": 0.21300199627876282, "learning_rate": 4.506792728769696e-06, "loss": 0.4693, "num_input_tokens_seen": 112266368, "step": 92310 }, { "epoch": 11.566846259867184, "grad_norm": 0.23477540910243988, "learning_rate": 4.506248684184377e-06, "loss": 0.4627, "num_input_tokens_seen": 112272096, "step": 92315 }, { "epoch": 11.567472747775968, "grad_norm": 0.1886071264743805, "learning_rate": 4.505704645502268e-06, "loss": 0.4645, "num_input_tokens_seen": 112277856, "step": 92320 }, { "epoch": 11.568099235684752, "grad_norm": 0.26467740535736084, "learning_rate": 4.505160612729875e-06, "loss": 0.4565, "num_input_tokens_seen": 112283264, "step": 92325 }, { "epoch": 11.568725723593534, "grad_norm": 0.3120032250881195, "learning_rate": 4.504616585873705e-06, "loss": 0.4607, "num_input_tokens_seen": 112289312, "step": 92330 }, { "epoch": 11.569352211502318, "grad_norm": 0.20320598781108856, "learning_rate": 4.504072564940258e-06, "loss": 0.4584, "num_input_tokens_seen": 112295168, "step": 92335 }, { "epoch": 11.569978699411102, "grad_norm": 0.21981927752494812, "learning_rate": 4.503528549936043e-06, "loss": 0.4641, "num_input_tokens_seen": 112300736, "step": 92340 }, { "epoch": 11.570605187319885, "grad_norm": 0.22872455418109894, "learning_rate": 4.50298454086756e-06, "loss": 0.4639, "num_input_tokens_seen": 112306816, "step": 92345 }, { "epoch": 11.571231675228669, "grad_norm": 0.2165030688047409, "learning_rate": 4.502440537741315e-06, "loss": 0.4688, "num_input_tokens_seen": 112312992, "step": 92350 }, { "epoch": 11.57185816313745, "grad_norm": 0.2395537793636322, "learning_rate": 4.50189654056381e-06, "loss": 0.4555, "num_input_tokens_seen": 112318784, "step": 92355 }, { "epoch": 11.572484651046235, "grad_norm": 0.2589901387691498, "learning_rate": 4.5013525493415545e-06, "loss": 0.4643, "num_input_tokens_seen": 112325152, "step": 92360 }, { "epoch": 11.573111138955019, "grad_norm": 0.2554994821548462, "learning_rate": 4.500808564081045e-06, "loss": 0.4565, "num_input_tokens_seen": 112331328, "step": 92365 }, { "epoch": 11.573737626863801, "grad_norm": 0.3285462558269501, "learning_rate": 4.500264584788791e-06, "loss": 0.4674, "num_input_tokens_seen": 112337792, "step": 92370 }, { "epoch": 11.574364114772585, "grad_norm": 0.26226329803466797, "learning_rate": 4.499720611471292e-06, "loss": 0.4603, "num_input_tokens_seen": 112343936, "step": 92375 }, { "epoch": 11.574990602681368, "grad_norm": 0.26162004470825195, "learning_rate": 4.499176644135057e-06, "loss": 0.465, "num_input_tokens_seen": 112350240, "step": 92380 }, { "epoch": 11.575617090590152, "grad_norm": 0.17981553077697754, "learning_rate": 4.498632682786584e-06, "loss": 0.4536, "num_input_tokens_seen": 112356480, "step": 92385 }, { "epoch": 11.576243578498936, "grad_norm": 0.18499884009361267, "learning_rate": 4.498088727432377e-06, "loss": 0.4568, "num_input_tokens_seen": 112362464, "step": 92390 }, { "epoch": 11.576870066407718, "grad_norm": 0.32550564408302307, "learning_rate": 4.497544778078944e-06, "loss": 0.4605, "num_input_tokens_seen": 112368704, "step": 92395 }, { "epoch": 11.577496554316502, "grad_norm": 0.28020384907722473, "learning_rate": 4.497000834732785e-06, "loss": 0.4591, "num_input_tokens_seen": 112374688, "step": 92400 }, { "epoch": 11.578123042225284, "grad_norm": 0.1793111264705658, "learning_rate": 4.496456897400403e-06, "loss": 0.4642, "num_input_tokens_seen": 112380416, "step": 92405 }, { "epoch": 11.578749530134068, "grad_norm": 0.2620358467102051, "learning_rate": 4.495912966088303e-06, "loss": 0.4641, "num_input_tokens_seen": 112386912, "step": 92410 }, { "epoch": 11.579376018042852, "grad_norm": 0.2735547721385956, "learning_rate": 4.495369040802988e-06, "loss": 0.4526, "num_input_tokens_seen": 112392864, "step": 92415 }, { "epoch": 11.580002505951635, "grad_norm": 0.2147507220506668, "learning_rate": 4.494825121550959e-06, "loss": 0.4652, "num_input_tokens_seen": 112398944, "step": 92420 }, { "epoch": 11.580628993860419, "grad_norm": 0.2512911260128021, "learning_rate": 4.494281208338722e-06, "loss": 0.4631, "num_input_tokens_seen": 112404928, "step": 92425 }, { "epoch": 11.581255481769201, "grad_norm": 0.21122682094573975, "learning_rate": 4.4937373011727765e-06, "loss": 0.4706, "num_input_tokens_seen": 112411328, "step": 92430 }, { "epoch": 11.581881969677985, "grad_norm": 0.2394314706325531, "learning_rate": 4.493193400059629e-06, "loss": 0.4644, "num_input_tokens_seen": 112417376, "step": 92435 }, { "epoch": 11.58250845758677, "grad_norm": 0.3408477008342743, "learning_rate": 4.492649505005779e-06, "loss": 0.4733, "num_input_tokens_seen": 112423168, "step": 92440 }, { "epoch": 11.583134945495551, "grad_norm": 0.1965792179107666, "learning_rate": 4.4921056160177316e-06, "loss": 0.4627, "num_input_tokens_seen": 112429216, "step": 92445 }, { "epoch": 11.583761433404336, "grad_norm": 0.20205961167812347, "learning_rate": 4.4915617331019865e-06, "loss": 0.4664, "num_input_tokens_seen": 112435328, "step": 92450 }, { "epoch": 11.58438792131312, "grad_norm": 0.2574697732925415, "learning_rate": 4.4910178562650496e-06, "loss": 0.4651, "num_input_tokens_seen": 112441344, "step": 92455 }, { "epoch": 11.585014409221902, "grad_norm": 0.38176265358924866, "learning_rate": 4.4904739855134235e-06, "loss": 0.4738, "num_input_tokens_seen": 112447360, "step": 92460 }, { "epoch": 11.585640897130686, "grad_norm": 0.19905541837215424, "learning_rate": 4.489930120853608e-06, "loss": 0.4599, "num_input_tokens_seen": 112453632, "step": 92465 }, { "epoch": 11.586267385039468, "grad_norm": 0.20954541862010956, "learning_rate": 4.489386262292108e-06, "loss": 0.4699, "num_input_tokens_seen": 112459776, "step": 92470 }, { "epoch": 11.586893872948252, "grad_norm": 0.1927986741065979, "learning_rate": 4.488842409835424e-06, "loss": 0.4611, "num_input_tokens_seen": 112465888, "step": 92475 }, { "epoch": 11.587520360857035, "grad_norm": 0.21671459078788757, "learning_rate": 4.488298563490059e-06, "loss": 0.4496, "num_input_tokens_seen": 112472032, "step": 92480 }, { "epoch": 11.588146848765819, "grad_norm": 0.21527154743671417, "learning_rate": 4.487754723262513e-06, "loss": 0.459, "num_input_tokens_seen": 112477920, "step": 92485 }, { "epoch": 11.588773336674603, "grad_norm": 0.3448505401611328, "learning_rate": 4.487210889159291e-06, "loss": 0.4641, "num_input_tokens_seen": 112483776, "step": 92490 }, { "epoch": 11.589399824583385, "grad_norm": 0.3030643165111542, "learning_rate": 4.486667061186895e-06, "loss": 0.4617, "num_input_tokens_seen": 112489696, "step": 92495 }, { "epoch": 11.590026312492169, "grad_norm": 0.18650610744953156, "learning_rate": 4.486123239351825e-06, "loss": 0.4584, "num_input_tokens_seen": 112495840, "step": 92500 }, { "epoch": 11.590652800400953, "grad_norm": 0.1972934603691101, "learning_rate": 4.4855794236605835e-06, "loss": 0.4541, "num_input_tokens_seen": 112502176, "step": 92505 }, { "epoch": 11.591279288309735, "grad_norm": 0.3096241056919098, "learning_rate": 4.485035614119671e-06, "loss": 0.4646, "num_input_tokens_seen": 112508544, "step": 92510 }, { "epoch": 11.59190577621852, "grad_norm": 0.21364152431488037, "learning_rate": 4.4844918107355945e-06, "loss": 0.4533, "num_input_tokens_seen": 112514848, "step": 92515 }, { "epoch": 11.592532264127302, "grad_norm": 0.24264678359031677, "learning_rate": 4.483948013514849e-06, "loss": 0.4625, "num_input_tokens_seen": 112520960, "step": 92520 }, { "epoch": 11.593158752036086, "grad_norm": 0.32192525267601013, "learning_rate": 4.483404222463941e-06, "loss": 0.4674, "num_input_tokens_seen": 112526912, "step": 92525 }, { "epoch": 11.59378523994487, "grad_norm": 0.20861588418483734, "learning_rate": 4.4828604375893675e-06, "loss": 0.4613, "num_input_tokens_seen": 112532960, "step": 92530 }, { "epoch": 11.594411727853652, "grad_norm": 0.18172232806682587, "learning_rate": 4.482316658897635e-06, "loss": 0.4757, "num_input_tokens_seen": 112539168, "step": 92535 }, { "epoch": 11.595038215762436, "grad_norm": 0.26740339398384094, "learning_rate": 4.481772886395239e-06, "loss": 0.4643, "num_input_tokens_seen": 112544960, "step": 92540 }, { "epoch": 11.595664703671218, "grad_norm": 0.2048805207014084, "learning_rate": 4.481229120088685e-06, "loss": 0.465, "num_input_tokens_seen": 112551168, "step": 92545 }, { "epoch": 11.596291191580002, "grad_norm": 0.17072831094264984, "learning_rate": 4.480685359984474e-06, "loss": 0.4634, "num_input_tokens_seen": 112556064, "step": 92550 }, { "epoch": 11.596917679488786, "grad_norm": 0.23501524329185486, "learning_rate": 4.480141606089105e-06, "loss": 0.4649, "num_input_tokens_seen": 112562368, "step": 92555 }, { "epoch": 11.597544167397569, "grad_norm": 0.21466422080993652, "learning_rate": 4.47959785840908e-06, "loss": 0.4607, "num_input_tokens_seen": 112568288, "step": 92560 }, { "epoch": 11.598170655306353, "grad_norm": 0.3257049322128296, "learning_rate": 4.479054116950902e-06, "loss": 0.4688, "num_input_tokens_seen": 112573824, "step": 92565 }, { "epoch": 11.598797143215137, "grad_norm": 0.3508610725402832, "learning_rate": 4.478510381721068e-06, "loss": 0.4553, "num_input_tokens_seen": 112580192, "step": 92570 }, { "epoch": 11.59942363112392, "grad_norm": 0.34530678391456604, "learning_rate": 4.477966652726081e-06, "loss": 0.4555, "num_input_tokens_seen": 112585984, "step": 92575 }, { "epoch": 11.600050119032703, "grad_norm": 0.17088404297828674, "learning_rate": 4.477422929972443e-06, "loss": 0.4623, "num_input_tokens_seen": 112591616, "step": 92580 }, { "epoch": 11.600676606941486, "grad_norm": 0.2170983999967575, "learning_rate": 4.476879213466652e-06, "loss": 0.4591, "num_input_tokens_seen": 112597600, "step": 92585 }, { "epoch": 11.60130309485027, "grad_norm": 0.23935003578662872, "learning_rate": 4.476335503215211e-06, "loss": 0.4624, "num_input_tokens_seen": 112603392, "step": 92590 }, { "epoch": 11.601929582759052, "grad_norm": 0.21824069321155548, "learning_rate": 4.475791799224618e-06, "loss": 0.4605, "num_input_tokens_seen": 112609536, "step": 92595 }, { "epoch": 11.602556070667836, "grad_norm": 0.1979132741689682, "learning_rate": 4.475248101501375e-06, "loss": 0.4615, "num_input_tokens_seen": 112615776, "step": 92600 }, { "epoch": 11.60318255857662, "grad_norm": 0.23290196061134338, "learning_rate": 4.474704410051983e-06, "loss": 0.4584, "num_input_tokens_seen": 112621856, "step": 92605 }, { "epoch": 11.603809046485402, "grad_norm": 0.2392050325870514, "learning_rate": 4.474160724882941e-06, "loss": 0.4614, "num_input_tokens_seen": 112628032, "step": 92610 }, { "epoch": 11.604435534394186, "grad_norm": 0.2064451277256012, "learning_rate": 4.473617046000748e-06, "loss": 0.4619, "num_input_tokens_seen": 112634080, "step": 92615 }, { "epoch": 11.60506202230297, "grad_norm": 0.23165208101272583, "learning_rate": 4.473073373411909e-06, "loss": 0.4564, "num_input_tokens_seen": 112640128, "step": 92620 }, { "epoch": 11.605688510211753, "grad_norm": 0.25304239988327026, "learning_rate": 4.472529707122918e-06, "loss": 0.4538, "num_input_tokens_seen": 112646144, "step": 92625 }, { "epoch": 11.606314998120537, "grad_norm": 0.20459489524364471, "learning_rate": 4.47198604714028e-06, "loss": 0.4563, "num_input_tokens_seen": 112651968, "step": 92630 }, { "epoch": 11.606941486029319, "grad_norm": 0.19458448886871338, "learning_rate": 4.471442393470491e-06, "loss": 0.4561, "num_input_tokens_seen": 112657952, "step": 92635 }, { "epoch": 11.607567973938103, "grad_norm": 0.23853878676891327, "learning_rate": 4.470898746120052e-06, "loss": 0.4659, "num_input_tokens_seen": 112663904, "step": 92640 }, { "epoch": 11.608194461846887, "grad_norm": 0.23433397710323334, "learning_rate": 4.470355105095465e-06, "loss": 0.4582, "num_input_tokens_seen": 112669984, "step": 92645 }, { "epoch": 11.60882094975567, "grad_norm": 0.24878866970539093, "learning_rate": 4.469811470403228e-06, "loss": 0.4617, "num_input_tokens_seen": 112676288, "step": 92650 }, { "epoch": 11.609447437664453, "grad_norm": 0.21531663835048676, "learning_rate": 4.46926784204984e-06, "loss": 0.4597, "num_input_tokens_seen": 112682560, "step": 92655 }, { "epoch": 11.610073925573236, "grad_norm": 0.23674599826335907, "learning_rate": 4.4687242200418005e-06, "loss": 0.4591, "num_input_tokens_seen": 112688544, "step": 92660 }, { "epoch": 11.61070041348202, "grad_norm": 0.21492625772953033, "learning_rate": 4.468180604385612e-06, "loss": 0.4533, "num_input_tokens_seen": 112694528, "step": 92665 }, { "epoch": 11.611326901390804, "grad_norm": 0.2398594170808792, "learning_rate": 4.4676369950877686e-06, "loss": 0.4552, "num_input_tokens_seen": 112700864, "step": 92670 }, { "epoch": 11.611953389299586, "grad_norm": 0.48461422324180603, "learning_rate": 4.4670933921547755e-06, "loss": 0.46, "num_input_tokens_seen": 112706912, "step": 92675 }, { "epoch": 11.61257987720837, "grad_norm": 0.27215316891670227, "learning_rate": 4.466549795593126e-06, "loss": 0.467, "num_input_tokens_seen": 112712864, "step": 92680 }, { "epoch": 11.613206365117152, "grad_norm": 0.20308701694011688, "learning_rate": 4.466006205409325e-06, "loss": 0.4632, "num_input_tokens_seen": 112719200, "step": 92685 }, { "epoch": 11.613832853025936, "grad_norm": 0.25264066457748413, "learning_rate": 4.465462621609866e-06, "loss": 0.4746, "num_input_tokens_seen": 112725312, "step": 92690 }, { "epoch": 11.61445934093472, "grad_norm": 0.22511230409145355, "learning_rate": 4.464919044201251e-06, "loss": 0.4498, "num_input_tokens_seen": 112731616, "step": 92695 }, { "epoch": 11.615085828843503, "grad_norm": 0.265109121799469, "learning_rate": 4.464375473189981e-06, "loss": 0.4597, "num_input_tokens_seen": 112737792, "step": 92700 }, { "epoch": 11.615712316752287, "grad_norm": 0.26785895228385925, "learning_rate": 4.463831908582551e-06, "loss": 0.4596, "num_input_tokens_seen": 112743296, "step": 92705 }, { "epoch": 11.61633880466107, "grad_norm": 0.2803102135658264, "learning_rate": 4.463288350385462e-06, "loss": 0.4583, "num_input_tokens_seen": 112749248, "step": 92710 }, { "epoch": 11.616965292569853, "grad_norm": 0.23662608861923218, "learning_rate": 4.462744798605211e-06, "loss": 0.4495, "num_input_tokens_seen": 112755168, "step": 92715 }, { "epoch": 11.617591780478637, "grad_norm": 0.2256825566291809, "learning_rate": 4.462201253248299e-06, "loss": 0.4533, "num_input_tokens_seen": 112761312, "step": 92720 }, { "epoch": 11.61821826838742, "grad_norm": 0.21188874542713165, "learning_rate": 4.461657714321222e-06, "loss": 0.4586, "num_input_tokens_seen": 112767744, "step": 92725 }, { "epoch": 11.618844756296204, "grad_norm": 0.3227810263633728, "learning_rate": 4.461114181830481e-06, "loss": 0.4613, "num_input_tokens_seen": 112773856, "step": 92730 }, { "epoch": 11.619471244204988, "grad_norm": 0.8663597106933594, "learning_rate": 4.460570655782571e-06, "loss": 0.4614, "num_input_tokens_seen": 112780000, "step": 92735 }, { "epoch": 11.62009773211377, "grad_norm": 0.268957257270813, "learning_rate": 4.460027136183995e-06, "loss": 0.452, "num_input_tokens_seen": 112785952, "step": 92740 }, { "epoch": 11.620724220022554, "grad_norm": 0.24364520609378815, "learning_rate": 4.459483623041246e-06, "loss": 0.475, "num_input_tokens_seen": 112792192, "step": 92745 }, { "epoch": 11.621350707931336, "grad_norm": 0.3980300724506378, "learning_rate": 4.458940116360826e-06, "loss": 0.4552, "num_input_tokens_seen": 112798240, "step": 92750 }, { "epoch": 11.62197719584012, "grad_norm": 0.4602022171020508, "learning_rate": 4.458396616149232e-06, "loss": 0.4733, "num_input_tokens_seen": 112804416, "step": 92755 }, { "epoch": 11.622603683748904, "grad_norm": 0.30352911353111267, "learning_rate": 4.45785312241296e-06, "loss": 0.4521, "num_input_tokens_seen": 112810592, "step": 92760 }, { "epoch": 11.623230171657687, "grad_norm": 0.3848240077495575, "learning_rate": 4.457309635158511e-06, "loss": 0.462, "num_input_tokens_seen": 112816736, "step": 92765 }, { "epoch": 11.62385665956647, "grad_norm": 0.34220021963119507, "learning_rate": 4.456766154392381e-06, "loss": 0.4574, "num_input_tokens_seen": 112822912, "step": 92770 }, { "epoch": 11.624483147475253, "grad_norm": 0.3895263373851776, "learning_rate": 4.4562226801210706e-06, "loss": 0.4571, "num_input_tokens_seen": 112829184, "step": 92775 }, { "epoch": 11.625109635384037, "grad_norm": 0.8874725699424744, "learning_rate": 4.455679212351073e-06, "loss": 0.4797, "num_input_tokens_seen": 112835744, "step": 92780 }, { "epoch": 11.625736123292821, "grad_norm": 0.2938346266746521, "learning_rate": 4.45513575108889e-06, "loss": 0.4592, "num_input_tokens_seen": 112841408, "step": 92785 }, { "epoch": 11.626362611201603, "grad_norm": 0.219114750623703, "learning_rate": 4.454592296341015e-06, "loss": 0.4684, "num_input_tokens_seen": 112847712, "step": 92790 }, { "epoch": 11.626989099110387, "grad_norm": 0.3100578486919403, "learning_rate": 4.45404884811395e-06, "loss": 0.4614, "num_input_tokens_seen": 112853696, "step": 92795 }, { "epoch": 11.62761558701917, "grad_norm": 0.3211160898208618, "learning_rate": 4.453505406414187e-06, "loss": 0.4666, "num_input_tokens_seen": 112859808, "step": 92800 }, { "epoch": 11.628242074927954, "grad_norm": 0.22249232232570648, "learning_rate": 4.452961971248229e-06, "loss": 0.4511, "num_input_tokens_seen": 112865792, "step": 92805 }, { "epoch": 11.628868562836738, "grad_norm": 0.23103682696819305, "learning_rate": 4.452418542622568e-06, "loss": 0.4552, "num_input_tokens_seen": 112871904, "step": 92810 }, { "epoch": 11.62949505074552, "grad_norm": 0.2624444365501404, "learning_rate": 4.451875120543706e-06, "loss": 0.4577, "num_input_tokens_seen": 112878048, "step": 92815 }, { "epoch": 11.630121538654304, "grad_norm": 0.23896440863609314, "learning_rate": 4.451331705018137e-06, "loss": 0.4614, "num_input_tokens_seen": 112883552, "step": 92820 }, { "epoch": 11.630748026563086, "grad_norm": 0.4140542149543762, "learning_rate": 4.450788296052357e-06, "loss": 0.456, "num_input_tokens_seen": 112889632, "step": 92825 }, { "epoch": 11.63137451447187, "grad_norm": 0.32953497767448425, "learning_rate": 4.450244893652868e-06, "loss": 0.4455, "num_input_tokens_seen": 112895744, "step": 92830 }, { "epoch": 11.632001002380655, "grad_norm": 0.47984758019447327, "learning_rate": 4.449701497826161e-06, "loss": 0.457, "num_input_tokens_seen": 112901088, "step": 92835 }, { "epoch": 11.632627490289437, "grad_norm": 0.5189988017082214, "learning_rate": 4.4491581085787376e-06, "loss": 0.4562, "num_input_tokens_seen": 112907488, "step": 92840 }, { "epoch": 11.633253978198221, "grad_norm": 0.37361210584640503, "learning_rate": 4.448614725917089e-06, "loss": 0.4594, "num_input_tokens_seen": 112913760, "step": 92845 }, { "epoch": 11.633880466107005, "grad_norm": 0.2712668180465698, "learning_rate": 4.448071349847718e-06, "loss": 0.4473, "num_input_tokens_seen": 112919648, "step": 92850 }, { "epoch": 11.634506954015787, "grad_norm": 0.2684840261936188, "learning_rate": 4.4475279803771165e-06, "loss": 0.4648, "num_input_tokens_seen": 112926016, "step": 92855 }, { "epoch": 11.635133441924571, "grad_norm": 0.2331411987543106, "learning_rate": 4.446984617511783e-06, "loss": 0.4557, "num_input_tokens_seen": 112932160, "step": 92860 }, { "epoch": 11.635759929833354, "grad_norm": 0.2501971423625946, "learning_rate": 4.446441261258212e-06, "loss": 0.4586, "num_input_tokens_seen": 112937600, "step": 92865 }, { "epoch": 11.636386417742138, "grad_norm": 1.2254266738891602, "learning_rate": 4.445897911622904e-06, "loss": 0.4706, "num_input_tokens_seen": 112943616, "step": 92870 }, { "epoch": 11.637012905650922, "grad_norm": 0.2489882856607437, "learning_rate": 4.445354568612349e-06, "loss": 0.4636, "num_input_tokens_seen": 112949888, "step": 92875 }, { "epoch": 11.637639393559704, "grad_norm": 0.28652602434158325, "learning_rate": 4.444811232233047e-06, "loss": 0.4644, "num_input_tokens_seen": 112954816, "step": 92880 }, { "epoch": 11.638265881468488, "grad_norm": 0.41471001505851746, "learning_rate": 4.444267902491496e-06, "loss": 0.4563, "num_input_tokens_seen": 112960192, "step": 92885 }, { "epoch": 11.63889236937727, "grad_norm": 0.23351021111011505, "learning_rate": 4.443724579394186e-06, "loss": 0.4581, "num_input_tokens_seen": 112966240, "step": 92890 }, { "epoch": 11.639518857286054, "grad_norm": 0.24847406148910522, "learning_rate": 4.44318126294762e-06, "loss": 0.4603, "num_input_tokens_seen": 112972064, "step": 92895 }, { "epoch": 11.640145345194838, "grad_norm": 0.3599497377872467, "learning_rate": 4.442637953158287e-06, "loss": 0.4589, "num_input_tokens_seen": 112978144, "step": 92900 }, { "epoch": 11.64077183310362, "grad_norm": 0.3584960997104645, "learning_rate": 4.442094650032689e-06, "loss": 0.4893, "num_input_tokens_seen": 112984256, "step": 92905 }, { "epoch": 11.641398321012405, "grad_norm": 0.2930554151535034, "learning_rate": 4.441551353577316e-06, "loss": 0.4578, "num_input_tokens_seen": 112990528, "step": 92910 }, { "epoch": 11.642024808921187, "grad_norm": 0.3610612452030182, "learning_rate": 4.441008063798667e-06, "loss": 0.4612, "num_input_tokens_seen": 112996608, "step": 92915 }, { "epoch": 11.642651296829971, "grad_norm": 0.3736584186553955, "learning_rate": 4.440464780703235e-06, "loss": 0.448, "num_input_tokens_seen": 113002944, "step": 92920 }, { "epoch": 11.643277784738755, "grad_norm": 0.2919042408466339, "learning_rate": 4.43992150429752e-06, "loss": 0.4795, "num_input_tokens_seen": 113008672, "step": 92925 }, { "epoch": 11.643904272647537, "grad_norm": 0.4590076506137848, "learning_rate": 4.439378234588012e-06, "loss": 0.4614, "num_input_tokens_seen": 113015200, "step": 92930 }, { "epoch": 11.644530760556322, "grad_norm": 0.26485729217529297, "learning_rate": 4.438834971581211e-06, "loss": 0.4514, "num_input_tokens_seen": 113021216, "step": 92935 }, { "epoch": 11.645157248465104, "grad_norm": 0.4004727602005005, "learning_rate": 4.438291715283607e-06, "loss": 0.4554, "num_input_tokens_seen": 113027296, "step": 92940 }, { "epoch": 11.645783736373888, "grad_norm": 0.3184705674648285, "learning_rate": 4.437748465701698e-06, "loss": 0.4532, "num_input_tokens_seen": 113033152, "step": 92945 }, { "epoch": 11.646410224282672, "grad_norm": 0.356901615858078, "learning_rate": 4.4372052228419805e-06, "loss": 0.4642, "num_input_tokens_seen": 113039552, "step": 92950 }, { "epoch": 11.647036712191454, "grad_norm": 0.33029860258102417, "learning_rate": 4.4366619867109454e-06, "loss": 0.4681, "num_input_tokens_seen": 113045312, "step": 92955 }, { "epoch": 11.647663200100238, "grad_norm": 0.6741219758987427, "learning_rate": 4.436118757315092e-06, "loss": 0.4701, "num_input_tokens_seen": 113051616, "step": 92960 }, { "epoch": 11.648289688009022, "grad_norm": 0.2590704560279846, "learning_rate": 4.435575534660913e-06, "loss": 0.4647, "num_input_tokens_seen": 113057952, "step": 92965 }, { "epoch": 11.648916175917805, "grad_norm": 0.2843121886253357, "learning_rate": 4.435032318754902e-06, "loss": 0.4769, "num_input_tokens_seen": 113063968, "step": 92970 }, { "epoch": 11.649542663826589, "grad_norm": 0.3964345455169678, "learning_rate": 4.434489109603554e-06, "loss": 0.4641, "num_input_tokens_seen": 113070208, "step": 92975 }, { "epoch": 11.650169151735371, "grad_norm": 0.29207518696784973, "learning_rate": 4.4339459072133665e-06, "loss": 0.4648, "num_input_tokens_seen": 113076448, "step": 92980 }, { "epoch": 11.650795639644155, "grad_norm": 0.41245952248573303, "learning_rate": 4.433402711590829e-06, "loss": 0.4526, "num_input_tokens_seen": 113082496, "step": 92985 }, { "epoch": 11.651422127552939, "grad_norm": 0.3021185100078583, "learning_rate": 4.432859522742441e-06, "loss": 0.4573, "num_input_tokens_seen": 113088672, "step": 92990 }, { "epoch": 11.652048615461721, "grad_norm": 0.3056187033653259, "learning_rate": 4.432316340674692e-06, "loss": 0.4557, "num_input_tokens_seen": 113094720, "step": 92995 }, { "epoch": 11.652675103370505, "grad_norm": 6.225363254547119, "learning_rate": 4.43177316539408e-06, "loss": 0.4892, "num_input_tokens_seen": 113100640, "step": 93000 }, { "epoch": 11.653301591279288, "grad_norm": 0.23675507307052612, "learning_rate": 4.431229996907096e-06, "loss": 0.4615, "num_input_tokens_seen": 113106560, "step": 93005 }, { "epoch": 11.653928079188072, "grad_norm": 0.5346761345863342, "learning_rate": 4.430686835220236e-06, "loss": 0.4623, "num_input_tokens_seen": 113112640, "step": 93010 }, { "epoch": 11.654554567096856, "grad_norm": 0.30105823278427124, "learning_rate": 4.430143680339993e-06, "loss": 0.4573, "num_input_tokens_seen": 113118976, "step": 93015 }, { "epoch": 11.655181055005638, "grad_norm": 0.45023313164711, "learning_rate": 4.429600532272863e-06, "loss": 0.4676, "num_input_tokens_seen": 113124864, "step": 93020 }, { "epoch": 11.655807542914422, "grad_norm": 0.25507572293281555, "learning_rate": 4.429057391025338e-06, "loss": 0.4654, "num_input_tokens_seen": 113130752, "step": 93025 }, { "epoch": 11.656434030823204, "grad_norm": 0.26479360461235046, "learning_rate": 4.42851425660391e-06, "loss": 0.4594, "num_input_tokens_seen": 113136288, "step": 93030 }, { "epoch": 11.657060518731988, "grad_norm": 0.24757710099220276, "learning_rate": 4.427971129015078e-06, "loss": 0.4599, "num_input_tokens_seen": 113142272, "step": 93035 }, { "epoch": 11.657687006640773, "grad_norm": 0.2884715795516968, "learning_rate": 4.427428008265329e-06, "loss": 0.4652, "num_input_tokens_seen": 113148864, "step": 93040 }, { "epoch": 11.658313494549555, "grad_norm": 0.26756423711776733, "learning_rate": 4.426884894361162e-06, "loss": 0.4719, "num_input_tokens_seen": 113154816, "step": 93045 }, { "epoch": 11.658939982458339, "grad_norm": 0.29481443762779236, "learning_rate": 4.426341787309065e-06, "loss": 0.4624, "num_input_tokens_seen": 113161024, "step": 93050 }, { "epoch": 11.659566470367121, "grad_norm": 0.28191235661506653, "learning_rate": 4.425798687115537e-06, "loss": 0.466, "num_input_tokens_seen": 113166880, "step": 93055 }, { "epoch": 11.660192958275905, "grad_norm": 0.21806266903877258, "learning_rate": 4.425255593787066e-06, "loss": 0.4483, "num_input_tokens_seen": 113173184, "step": 93060 }, { "epoch": 11.66081944618469, "grad_norm": 0.3175070881843567, "learning_rate": 4.424712507330149e-06, "loss": 0.4763, "num_input_tokens_seen": 113179936, "step": 93065 }, { "epoch": 11.661445934093472, "grad_norm": 0.21496614813804626, "learning_rate": 4.424169427751277e-06, "loss": 0.4603, "num_input_tokens_seen": 113185952, "step": 93070 }, { "epoch": 11.662072422002256, "grad_norm": 0.19189657270908356, "learning_rate": 4.423626355056944e-06, "loss": 0.4661, "num_input_tokens_seen": 113192064, "step": 93075 }, { "epoch": 11.66269890991104, "grad_norm": 0.2480219453573227, "learning_rate": 4.423083289253644e-06, "loss": 0.4694, "num_input_tokens_seen": 113197536, "step": 93080 }, { "epoch": 11.663325397819822, "grad_norm": 0.21222902834415436, "learning_rate": 4.4225402303478655e-06, "loss": 0.4615, "num_input_tokens_seen": 113203904, "step": 93085 }, { "epoch": 11.663951885728606, "grad_norm": 0.26332080364227295, "learning_rate": 4.421997178346107e-06, "loss": 0.454, "num_input_tokens_seen": 113210336, "step": 93090 }, { "epoch": 11.664578373637388, "grad_norm": 0.22306577861309052, "learning_rate": 4.421454133254856e-06, "loss": 0.4601, "num_input_tokens_seen": 113216416, "step": 93095 }, { "epoch": 11.665204861546172, "grad_norm": 0.22830018401145935, "learning_rate": 4.420911095080609e-06, "loss": 0.4811, "num_input_tokens_seen": 113222624, "step": 93100 }, { "epoch": 11.665831349454955, "grad_norm": 0.27831074595451355, "learning_rate": 4.420368063829854e-06, "loss": 0.4573, "num_input_tokens_seen": 113228768, "step": 93105 }, { "epoch": 11.666457837363739, "grad_norm": 0.3015228509902954, "learning_rate": 4.419825039509088e-06, "loss": 0.4641, "num_input_tokens_seen": 113234944, "step": 93110 }, { "epoch": 11.667084325272523, "grad_norm": 0.3156910836696625, "learning_rate": 4.419282022124801e-06, "loss": 0.4629, "num_input_tokens_seen": 113241760, "step": 93115 }, { "epoch": 11.667710813181305, "grad_norm": 0.2183876633644104, "learning_rate": 4.418739011683487e-06, "loss": 0.4561, "num_input_tokens_seen": 113247776, "step": 93120 }, { "epoch": 11.668337301090089, "grad_norm": 0.226777583360672, "learning_rate": 4.418196008191635e-06, "loss": 0.456, "num_input_tokens_seen": 113253888, "step": 93125 }, { "epoch": 11.668963788998873, "grad_norm": 0.2077474743127823, "learning_rate": 4.417653011655739e-06, "loss": 0.4563, "num_input_tokens_seen": 113259840, "step": 93130 }, { "epoch": 11.669590276907655, "grad_norm": 0.359809547662735, "learning_rate": 4.417110022082292e-06, "loss": 0.4697, "num_input_tokens_seen": 113266272, "step": 93135 }, { "epoch": 11.67021676481644, "grad_norm": 0.30652761459350586, "learning_rate": 4.416567039477784e-06, "loss": 0.447, "num_input_tokens_seen": 113272544, "step": 93140 }, { "epoch": 11.670843252725222, "grad_norm": 0.21198631823062897, "learning_rate": 4.416024063848709e-06, "loss": 0.4515, "num_input_tokens_seen": 113278656, "step": 93145 }, { "epoch": 11.671469740634006, "grad_norm": 0.20894484221935272, "learning_rate": 4.415481095201555e-06, "loss": 0.4577, "num_input_tokens_seen": 113285056, "step": 93150 }, { "epoch": 11.67209622854279, "grad_norm": 0.21864618360996246, "learning_rate": 4.4149381335428185e-06, "loss": 0.4698, "num_input_tokens_seen": 113290336, "step": 93155 }, { "epoch": 11.672722716451572, "grad_norm": 0.25639501214027405, "learning_rate": 4.414395178878986e-06, "loss": 0.4476, "num_input_tokens_seen": 113296320, "step": 93160 }, { "epoch": 11.673349204360356, "grad_norm": 0.2447555810213089, "learning_rate": 4.413852231216552e-06, "loss": 0.4577, "num_input_tokens_seen": 113302176, "step": 93165 }, { "epoch": 11.673975692269138, "grad_norm": 0.3081514239311218, "learning_rate": 4.413309290562008e-06, "loss": 0.4641, "num_input_tokens_seen": 113308320, "step": 93170 }, { "epoch": 11.674602180177923, "grad_norm": 0.24024474620819092, "learning_rate": 4.4127663569218444e-06, "loss": 0.4581, "num_input_tokens_seen": 113314592, "step": 93175 }, { "epoch": 11.675228668086707, "grad_norm": 0.2401316612958908, "learning_rate": 4.412223430302553e-06, "loss": 0.4572, "num_input_tokens_seen": 113320512, "step": 93180 }, { "epoch": 11.675855155995489, "grad_norm": 0.19837552309036255, "learning_rate": 4.411680510710625e-06, "loss": 0.4603, "num_input_tokens_seen": 113326816, "step": 93185 }, { "epoch": 11.676481643904273, "grad_norm": 0.23580826818943024, "learning_rate": 4.411137598152551e-06, "loss": 0.4579, "num_input_tokens_seen": 113333056, "step": 93190 }, { "epoch": 11.677108131813055, "grad_norm": 0.2456846982240677, "learning_rate": 4.41059469263482e-06, "loss": 0.4677, "num_input_tokens_seen": 113339328, "step": 93195 }, { "epoch": 11.67773461972184, "grad_norm": 0.3177602291107178, "learning_rate": 4.410051794163928e-06, "loss": 0.4615, "num_input_tokens_seen": 113345440, "step": 93200 }, { "epoch": 11.678361107630623, "grad_norm": 0.966550350189209, "learning_rate": 4.409508902746361e-06, "loss": 0.4667, "num_input_tokens_seen": 113351424, "step": 93205 }, { "epoch": 11.678987595539406, "grad_norm": 0.25237974524497986, "learning_rate": 4.4089660183886134e-06, "loss": 0.4672, "num_input_tokens_seen": 113357632, "step": 93210 }, { "epoch": 11.67961408344819, "grad_norm": 0.21660269796848297, "learning_rate": 4.408423141097172e-06, "loss": 0.4643, "num_input_tokens_seen": 113363680, "step": 93215 }, { "epoch": 11.680240571356972, "grad_norm": 0.3155246078968048, "learning_rate": 4.40788027087853e-06, "loss": 0.4623, "num_input_tokens_seen": 113370144, "step": 93220 }, { "epoch": 11.680867059265756, "grad_norm": 0.18021851778030396, "learning_rate": 4.407337407739176e-06, "loss": 0.4589, "num_input_tokens_seen": 113375968, "step": 93225 }, { "epoch": 11.68149354717454, "grad_norm": 0.2335919737815857, "learning_rate": 4.406794551685604e-06, "loss": 0.4537, "num_input_tokens_seen": 113381824, "step": 93230 }, { "epoch": 11.682120035083322, "grad_norm": 0.22956137359142303, "learning_rate": 4.4062517027243e-06, "loss": 0.4648, "num_input_tokens_seen": 113387392, "step": 93235 }, { "epoch": 11.682746522992106, "grad_norm": 0.1576995551586151, "learning_rate": 4.405708860861758e-06, "loss": 0.4545, "num_input_tokens_seen": 113393376, "step": 93240 }, { "epoch": 11.68337301090089, "grad_norm": 0.29979515075683594, "learning_rate": 4.405166026104464e-06, "loss": 0.4569, "num_input_tokens_seen": 113399360, "step": 93245 }, { "epoch": 11.683999498809673, "grad_norm": 0.28023290634155273, "learning_rate": 4.404623198458913e-06, "loss": 0.4632, "num_input_tokens_seen": 113405696, "step": 93250 }, { "epoch": 11.684625986718457, "grad_norm": 0.2090480625629425, "learning_rate": 4.404080377931589e-06, "loss": 0.4606, "num_input_tokens_seen": 113411392, "step": 93255 }, { "epoch": 11.685252474627239, "grad_norm": 0.2829347848892212, "learning_rate": 4.403537564528985e-06, "loss": 0.4637, "num_input_tokens_seen": 113417504, "step": 93260 }, { "epoch": 11.685878962536023, "grad_norm": 0.46195128560066223, "learning_rate": 4.4029947582575944e-06, "loss": 0.4564, "num_input_tokens_seen": 113423040, "step": 93265 }, { "epoch": 11.686505450444807, "grad_norm": 0.24571684002876282, "learning_rate": 4.4024519591239e-06, "loss": 0.4532, "num_input_tokens_seen": 113428512, "step": 93270 }, { "epoch": 11.68713193835359, "grad_norm": 0.28778988122940063, "learning_rate": 4.401909167134397e-06, "loss": 0.4654, "num_input_tokens_seen": 113434688, "step": 93275 }, { "epoch": 11.687758426262373, "grad_norm": 0.24469725787639618, "learning_rate": 4.401366382295571e-06, "loss": 0.471, "num_input_tokens_seen": 113440608, "step": 93280 }, { "epoch": 11.688384914171156, "grad_norm": 0.22649763524532318, "learning_rate": 4.400823604613915e-06, "loss": 0.453, "num_input_tokens_seen": 113446944, "step": 93285 }, { "epoch": 11.68901140207994, "grad_norm": 0.2796524167060852, "learning_rate": 4.400280834095915e-06, "loss": 0.4686, "num_input_tokens_seen": 113452992, "step": 93290 }, { "epoch": 11.689637889988724, "grad_norm": 0.1903090476989746, "learning_rate": 4.399738070748064e-06, "loss": 0.4665, "num_input_tokens_seen": 113458240, "step": 93295 }, { "epoch": 11.690264377897506, "grad_norm": 0.27424371242523193, "learning_rate": 4.399195314576847e-06, "loss": 0.4608, "num_input_tokens_seen": 113463904, "step": 93300 }, { "epoch": 11.69089086580629, "grad_norm": 0.28780171275138855, "learning_rate": 4.398652565588757e-06, "loss": 0.4623, "num_input_tokens_seen": 113469920, "step": 93305 }, { "epoch": 11.691517353715073, "grad_norm": 0.30272477865219116, "learning_rate": 4.398109823790279e-06, "loss": 0.4635, "num_input_tokens_seen": 113476192, "step": 93310 }, { "epoch": 11.692143841623857, "grad_norm": 0.21026787161827087, "learning_rate": 4.3975670891879045e-06, "loss": 0.4624, "num_input_tokens_seen": 113482048, "step": 93315 }, { "epoch": 11.69277032953264, "grad_norm": 0.261074960231781, "learning_rate": 4.397024361788124e-06, "loss": 0.4528, "num_input_tokens_seen": 113488448, "step": 93320 }, { "epoch": 11.693396817441423, "grad_norm": 0.2743928134441376, "learning_rate": 4.396481641597422e-06, "loss": 0.4561, "num_input_tokens_seen": 113494080, "step": 93325 }, { "epoch": 11.694023305350207, "grad_norm": 0.2885518968105316, "learning_rate": 4.39593892862229e-06, "loss": 0.4646, "num_input_tokens_seen": 113500064, "step": 93330 }, { "epoch": 11.69464979325899, "grad_norm": 0.2329229712486267, "learning_rate": 4.3953962228692146e-06, "loss": 0.4649, "num_input_tokens_seen": 113506144, "step": 93335 }, { "epoch": 11.695276281167773, "grad_norm": 0.2321138232946396, "learning_rate": 4.394853524344689e-06, "loss": 0.4671, "num_input_tokens_seen": 113512384, "step": 93340 }, { "epoch": 11.695902769076557, "grad_norm": 0.18937985599040985, "learning_rate": 4.3943108330551954e-06, "loss": 0.4664, "num_input_tokens_seen": 113518752, "step": 93345 }, { "epoch": 11.69652925698534, "grad_norm": 0.2877455949783325, "learning_rate": 4.393768149007227e-06, "loss": 0.4604, "num_input_tokens_seen": 113525024, "step": 93350 }, { "epoch": 11.697155744894124, "grad_norm": 0.25275135040283203, "learning_rate": 4.393225472207269e-06, "loss": 0.4614, "num_input_tokens_seen": 113530592, "step": 93355 }, { "epoch": 11.697782232802908, "grad_norm": 0.3202912509441376, "learning_rate": 4.392682802661812e-06, "loss": 0.4527, "num_input_tokens_seen": 113536640, "step": 93360 }, { "epoch": 11.69840872071169, "grad_norm": 0.30826255679130554, "learning_rate": 4.39214014037734e-06, "loss": 0.4709, "num_input_tokens_seen": 113542720, "step": 93365 }, { "epoch": 11.699035208620474, "grad_norm": 0.2923796474933624, "learning_rate": 4.391597485360345e-06, "loss": 0.4667, "num_input_tokens_seen": 113548832, "step": 93370 }, { "epoch": 11.699661696529256, "grad_norm": 0.24978838860988617, "learning_rate": 4.391054837617313e-06, "loss": 0.4573, "num_input_tokens_seen": 113554560, "step": 93375 }, { "epoch": 11.70028818443804, "grad_norm": 0.2695983052253723, "learning_rate": 4.390512197154734e-06, "loss": 0.4662, "num_input_tokens_seen": 113560064, "step": 93380 }, { "epoch": 11.700914672346824, "grad_norm": 0.6348733901977539, "learning_rate": 4.389969563979093e-06, "loss": 0.4581, "num_input_tokens_seen": 113566016, "step": 93385 }, { "epoch": 11.701541160255607, "grad_norm": 0.19380851089954376, "learning_rate": 4.389426938096878e-06, "loss": 0.4744, "num_input_tokens_seen": 113572192, "step": 93390 }, { "epoch": 11.70216764816439, "grad_norm": 0.2431991845369339, "learning_rate": 4.38888431951458e-06, "loss": 0.4554, "num_input_tokens_seen": 113578016, "step": 93395 }, { "epoch": 11.702794136073173, "grad_norm": 0.36609163880348206, "learning_rate": 4.388341708238681e-06, "loss": 0.4565, "num_input_tokens_seen": 113584160, "step": 93400 }, { "epoch": 11.703420623981957, "grad_norm": 0.33423495292663574, "learning_rate": 4.3877991042756735e-06, "loss": 0.4519, "num_input_tokens_seen": 113590400, "step": 93405 }, { "epoch": 11.704047111890741, "grad_norm": 0.2032805234193802, "learning_rate": 4.38725650763204e-06, "loss": 0.456, "num_input_tokens_seen": 113596576, "step": 93410 }, { "epoch": 11.704673599799523, "grad_norm": 0.2419489622116089, "learning_rate": 4.386713918314273e-06, "loss": 0.4582, "num_input_tokens_seen": 113602848, "step": 93415 }, { "epoch": 11.705300087708308, "grad_norm": 0.2337743639945984, "learning_rate": 4.386171336328854e-06, "loss": 0.4603, "num_input_tokens_seen": 113609056, "step": 93420 }, { "epoch": 11.70592657561709, "grad_norm": 0.25564292073249817, "learning_rate": 4.385628761682274e-06, "loss": 0.461, "num_input_tokens_seen": 113614944, "step": 93425 }, { "epoch": 11.706553063525874, "grad_norm": 0.23050038516521454, "learning_rate": 4.385086194381017e-06, "loss": 0.4651, "num_input_tokens_seen": 113621120, "step": 93430 }, { "epoch": 11.707179551434658, "grad_norm": 0.26121923327445984, "learning_rate": 4.384543634431574e-06, "loss": 0.4622, "num_input_tokens_seen": 113627328, "step": 93435 }, { "epoch": 11.70780603934344, "grad_norm": 0.2313600331544876, "learning_rate": 4.384001081840427e-06, "loss": 0.4647, "num_input_tokens_seen": 113633408, "step": 93440 }, { "epoch": 11.708432527252224, "grad_norm": 0.21223627030849457, "learning_rate": 4.383458536614066e-06, "loss": 0.4621, "num_input_tokens_seen": 113639104, "step": 93445 }, { "epoch": 11.709059015161007, "grad_norm": 0.20976580679416656, "learning_rate": 4.382915998758978e-06, "loss": 0.4452, "num_input_tokens_seen": 113645408, "step": 93450 }, { "epoch": 11.70968550306979, "grad_norm": 0.216551274061203, "learning_rate": 4.382373468281646e-06, "loss": 0.4572, "num_input_tokens_seen": 113651584, "step": 93455 }, { "epoch": 11.710311990978575, "grad_norm": 0.4433721899986267, "learning_rate": 4.38183094518856e-06, "loss": 0.4448, "num_input_tokens_seen": 113657600, "step": 93460 }, { "epoch": 11.710938478887357, "grad_norm": 0.30044126510620117, "learning_rate": 4.381288429486203e-06, "loss": 0.4481, "num_input_tokens_seen": 113663648, "step": 93465 }, { "epoch": 11.711564966796141, "grad_norm": 0.2779257297515869, "learning_rate": 4.3807459211810654e-06, "loss": 0.4573, "num_input_tokens_seen": 113669984, "step": 93470 }, { "epoch": 11.712191454704925, "grad_norm": 0.336392879486084, "learning_rate": 4.380203420279629e-06, "loss": 0.4611, "num_input_tokens_seen": 113675680, "step": 93475 }, { "epoch": 11.712817942613707, "grad_norm": 0.2979496419429779, "learning_rate": 4.379660926788382e-06, "loss": 0.4623, "num_input_tokens_seen": 113681824, "step": 93480 }, { "epoch": 11.713444430522491, "grad_norm": 0.262253999710083, "learning_rate": 4.37911844071381e-06, "loss": 0.4566, "num_input_tokens_seen": 113688096, "step": 93485 }, { "epoch": 11.714070918431274, "grad_norm": 0.9609917998313904, "learning_rate": 4.378575962062401e-06, "loss": 0.4783, "num_input_tokens_seen": 113694528, "step": 93490 }, { "epoch": 11.714697406340058, "grad_norm": 0.2621522545814514, "learning_rate": 4.3780334908406366e-06, "loss": 0.4587, "num_input_tokens_seen": 113700480, "step": 93495 }, { "epoch": 11.715323894248842, "grad_norm": 0.2885078489780426, "learning_rate": 4.377491027055004e-06, "loss": 0.4463, "num_input_tokens_seen": 113706496, "step": 93500 }, { "epoch": 11.715950382157624, "grad_norm": 0.3887316584587097, "learning_rate": 4.376948570711993e-06, "loss": 0.4632, "num_input_tokens_seen": 113712384, "step": 93505 }, { "epoch": 11.716576870066408, "grad_norm": 0.6596147418022156, "learning_rate": 4.376406121818083e-06, "loss": 0.4617, "num_input_tokens_seen": 113718496, "step": 93510 }, { "epoch": 11.71720335797519, "grad_norm": 0.2585410475730896, "learning_rate": 4.375863680379764e-06, "loss": 0.4492, "num_input_tokens_seen": 113724480, "step": 93515 }, { "epoch": 11.717829845883974, "grad_norm": 0.3455313742160797, "learning_rate": 4.375321246403518e-06, "loss": 0.4609, "num_input_tokens_seen": 113730752, "step": 93520 }, { "epoch": 11.718456333792759, "grad_norm": 0.3563089966773987, "learning_rate": 4.374778819895833e-06, "loss": 0.4583, "num_input_tokens_seen": 113736768, "step": 93525 }, { "epoch": 11.71908282170154, "grad_norm": 0.34537413716316223, "learning_rate": 4.374236400863191e-06, "loss": 0.4472, "num_input_tokens_seen": 113742944, "step": 93530 }, { "epoch": 11.719709309610325, "grad_norm": 0.3181488811969757, "learning_rate": 4.37369398931208e-06, "loss": 0.4678, "num_input_tokens_seen": 113748768, "step": 93535 }, { "epoch": 11.720335797519107, "grad_norm": 0.2531135678291321, "learning_rate": 4.373151585248983e-06, "loss": 0.4611, "num_input_tokens_seen": 113754976, "step": 93540 }, { "epoch": 11.720962285427891, "grad_norm": 0.30116549134254456, "learning_rate": 4.372609188680388e-06, "loss": 0.4664, "num_input_tokens_seen": 113760832, "step": 93545 }, { "epoch": 11.721588773336675, "grad_norm": 0.2761518657207489, "learning_rate": 4.372066799612775e-06, "loss": 0.4569, "num_input_tokens_seen": 113767072, "step": 93550 }, { "epoch": 11.722215261245458, "grad_norm": 0.3257436752319336, "learning_rate": 4.371524418052634e-06, "loss": 0.4737, "num_input_tokens_seen": 113773024, "step": 93555 }, { "epoch": 11.722841749154242, "grad_norm": 0.2787732779979706, "learning_rate": 4.370982044006444e-06, "loss": 0.4742, "num_input_tokens_seen": 113779072, "step": 93560 }, { "epoch": 11.723468237063024, "grad_norm": 0.3083861470222473, "learning_rate": 4.370439677480693e-06, "loss": 0.4638, "num_input_tokens_seen": 113785152, "step": 93565 }, { "epoch": 11.724094724971808, "grad_norm": 0.35303759574890137, "learning_rate": 4.369897318481869e-06, "loss": 0.4566, "num_input_tokens_seen": 113791392, "step": 93570 }, { "epoch": 11.724721212880592, "grad_norm": 0.26329436898231506, "learning_rate": 4.3693549670164475e-06, "loss": 0.4649, "num_input_tokens_seen": 113797216, "step": 93575 }, { "epoch": 11.725347700789374, "grad_norm": 0.34343910217285156, "learning_rate": 4.36881262309092e-06, "loss": 0.4649, "num_input_tokens_seen": 113803168, "step": 93580 }, { "epoch": 11.725974188698158, "grad_norm": 0.3332628905773163, "learning_rate": 4.368270286711768e-06, "loss": 0.4577, "num_input_tokens_seen": 113809504, "step": 93585 }, { "epoch": 11.726600676606942, "grad_norm": 0.3020536005496979, "learning_rate": 4.367727957885475e-06, "loss": 0.4701, "num_input_tokens_seen": 113815456, "step": 93590 }, { "epoch": 11.727227164515725, "grad_norm": 0.25918272137641907, "learning_rate": 4.367185636618526e-06, "loss": 0.4686, "num_input_tokens_seen": 113821888, "step": 93595 }, { "epoch": 11.727853652424509, "grad_norm": 0.24111869931221008, "learning_rate": 4.366643322917406e-06, "loss": 0.4578, "num_input_tokens_seen": 113828000, "step": 93600 }, { "epoch": 11.728480140333291, "grad_norm": 0.33648431301116943, "learning_rate": 4.366101016788596e-06, "loss": 0.4645, "num_input_tokens_seen": 113833472, "step": 93605 }, { "epoch": 11.729106628242075, "grad_norm": 0.5613848567008972, "learning_rate": 4.3655587182385835e-06, "loss": 0.4672, "num_input_tokens_seen": 113839552, "step": 93610 }, { "epoch": 11.72973311615086, "grad_norm": 0.2549964189529419, "learning_rate": 4.365016427273847e-06, "loss": 0.4594, "num_input_tokens_seen": 113845696, "step": 93615 }, { "epoch": 11.730359604059641, "grad_norm": 0.27257657051086426, "learning_rate": 4.3644741439008765e-06, "loss": 0.4604, "num_input_tokens_seen": 113851904, "step": 93620 }, { "epoch": 11.730986091968425, "grad_norm": 0.25401559472084045, "learning_rate": 4.363931868126149e-06, "loss": 0.4636, "num_input_tokens_seen": 113858048, "step": 93625 }, { "epoch": 11.731612579877208, "grad_norm": 0.22882650792598724, "learning_rate": 4.363389599956151e-06, "loss": 0.4576, "num_input_tokens_seen": 113863936, "step": 93630 }, { "epoch": 11.732239067785992, "grad_norm": 0.31448641419410706, "learning_rate": 4.362847339397366e-06, "loss": 0.4584, "num_input_tokens_seen": 113870368, "step": 93635 }, { "epoch": 11.732865555694776, "grad_norm": 0.2956324517726898, "learning_rate": 4.362305086456277e-06, "loss": 0.4516, "num_input_tokens_seen": 113876736, "step": 93640 }, { "epoch": 11.733492043603558, "grad_norm": 0.2725260257720947, "learning_rate": 4.361762841139367e-06, "loss": 0.4616, "num_input_tokens_seen": 113883072, "step": 93645 }, { "epoch": 11.734118531512342, "grad_norm": 0.26593831181526184, "learning_rate": 4.361220603453118e-06, "loss": 0.4641, "num_input_tokens_seen": 113889664, "step": 93650 }, { "epoch": 11.734745019421124, "grad_norm": 0.30424559116363525, "learning_rate": 4.360678373404015e-06, "loss": 0.455, "num_input_tokens_seen": 113895616, "step": 93655 }, { "epoch": 11.735371507329909, "grad_norm": 0.2439182698726654, "learning_rate": 4.360136150998538e-06, "loss": 0.4559, "num_input_tokens_seen": 113901536, "step": 93660 }, { "epoch": 11.735997995238693, "grad_norm": 0.28897523880004883, "learning_rate": 4.359593936243174e-06, "loss": 0.4535, "num_input_tokens_seen": 113907520, "step": 93665 }, { "epoch": 11.736624483147475, "grad_norm": 0.25996482372283936, "learning_rate": 4.3590517291444005e-06, "loss": 0.4637, "num_input_tokens_seen": 113913632, "step": 93670 }, { "epoch": 11.737250971056259, "grad_norm": 0.2742624878883362, "learning_rate": 4.358509529708704e-06, "loss": 0.4518, "num_input_tokens_seen": 113919584, "step": 93675 }, { "epoch": 11.737877458965041, "grad_norm": 0.3296603560447693, "learning_rate": 4.357967337942564e-06, "loss": 0.4512, "num_input_tokens_seen": 113925536, "step": 93680 }, { "epoch": 11.738503946873825, "grad_norm": 0.35198354721069336, "learning_rate": 4.357425153852463e-06, "loss": 0.4538, "num_input_tokens_seen": 113931296, "step": 93685 }, { "epoch": 11.73913043478261, "grad_norm": 2.972299098968506, "learning_rate": 4.356882977444887e-06, "loss": 0.4702, "num_input_tokens_seen": 113937440, "step": 93690 }, { "epoch": 11.739756922691392, "grad_norm": 0.268017441034317, "learning_rate": 4.3563408087263146e-06, "loss": 0.4614, "num_input_tokens_seen": 113943968, "step": 93695 }, { "epoch": 11.740383410600176, "grad_norm": 0.27730387449264526, "learning_rate": 4.35579864770323e-06, "loss": 0.458, "num_input_tokens_seen": 113949856, "step": 93700 }, { "epoch": 11.74100989850896, "grad_norm": 0.32844898104667664, "learning_rate": 4.355256494382113e-06, "loss": 0.4716, "num_input_tokens_seen": 113955648, "step": 93705 }, { "epoch": 11.741636386417742, "grad_norm": 0.3835561275482178, "learning_rate": 4.354714348769449e-06, "loss": 0.4894, "num_input_tokens_seen": 113961920, "step": 93710 }, { "epoch": 11.742262874326526, "grad_norm": 0.32523319125175476, "learning_rate": 4.354172210871714e-06, "loss": 0.4616, "num_input_tokens_seen": 113967840, "step": 93715 }, { "epoch": 11.742889362235308, "grad_norm": 0.2771293818950653, "learning_rate": 4.353630080695397e-06, "loss": 0.4601, "num_input_tokens_seen": 113973856, "step": 93720 }, { "epoch": 11.743515850144092, "grad_norm": 0.22773012518882751, "learning_rate": 4.353087958246972e-06, "loss": 0.4601, "num_input_tokens_seen": 113979648, "step": 93725 }, { "epoch": 11.744142338052875, "grad_norm": 0.39529940485954285, "learning_rate": 4.352545843532927e-06, "loss": 0.4539, "num_input_tokens_seen": 113985664, "step": 93730 }, { "epoch": 11.744768825961659, "grad_norm": 0.415377140045166, "learning_rate": 4.35200373655974e-06, "loss": 0.4595, "num_input_tokens_seen": 113992096, "step": 93735 }, { "epoch": 11.745395313870443, "grad_norm": 0.23515425622463226, "learning_rate": 4.351461637333893e-06, "loss": 0.4582, "num_input_tokens_seen": 113998496, "step": 93740 }, { "epoch": 11.746021801779225, "grad_norm": 0.45923057198524475, "learning_rate": 4.350919545861867e-06, "loss": 0.4544, "num_input_tokens_seen": 114004256, "step": 93745 }, { "epoch": 11.74664828968801, "grad_norm": 0.28751426935195923, "learning_rate": 4.350377462150143e-06, "loss": 0.4655, "num_input_tokens_seen": 114009984, "step": 93750 }, { "epoch": 11.747274777596793, "grad_norm": 0.523731529712677, "learning_rate": 4.349835386205204e-06, "loss": 0.4583, "num_input_tokens_seen": 114016000, "step": 93755 }, { "epoch": 11.747901265505575, "grad_norm": 0.2631993889808655, "learning_rate": 4.349293318033529e-06, "loss": 0.4596, "num_input_tokens_seen": 114021920, "step": 93760 }, { "epoch": 11.74852775341436, "grad_norm": 0.4615146815776825, "learning_rate": 4.348751257641601e-06, "loss": 0.4631, "num_input_tokens_seen": 114028192, "step": 93765 }, { "epoch": 11.749154241323142, "grad_norm": 0.3132155239582062, "learning_rate": 4.348209205035897e-06, "loss": 0.4709, "num_input_tokens_seen": 114034400, "step": 93770 }, { "epoch": 11.749780729231926, "grad_norm": 0.3555801808834076, "learning_rate": 4.347667160222902e-06, "loss": 0.4589, "num_input_tokens_seen": 114039872, "step": 93775 }, { "epoch": 11.75040721714071, "grad_norm": 0.289884477853775, "learning_rate": 4.347125123209093e-06, "loss": 0.4565, "num_input_tokens_seen": 114045856, "step": 93780 }, { "epoch": 11.751033705049492, "grad_norm": 0.535260021686554, "learning_rate": 4.346583094000952e-06, "loss": 0.4707, "num_input_tokens_seen": 114052224, "step": 93785 }, { "epoch": 11.751660192958276, "grad_norm": 0.32079750299453735, "learning_rate": 4.3460410726049606e-06, "loss": 0.4672, "num_input_tokens_seen": 114058368, "step": 93790 }, { "epoch": 11.752286680867059, "grad_norm": 0.3204090893268585, "learning_rate": 4.3454990590275966e-06, "loss": 0.4606, "num_input_tokens_seen": 114064768, "step": 93795 }, { "epoch": 11.752913168775843, "grad_norm": 0.2641865909099579, "learning_rate": 4.344957053275342e-06, "loss": 0.4516, "num_input_tokens_seen": 114070944, "step": 93800 }, { "epoch": 11.753539656684627, "grad_norm": 0.21535617113113403, "learning_rate": 4.3444150553546785e-06, "loss": 0.4637, "num_input_tokens_seen": 114076832, "step": 93805 }, { "epoch": 11.754166144593409, "grad_norm": 0.24765321612358093, "learning_rate": 4.3438730652720814e-06, "loss": 0.4656, "num_input_tokens_seen": 114082944, "step": 93810 }, { "epoch": 11.754792632502193, "grad_norm": 0.19535285234451294, "learning_rate": 4.3433310830340336e-06, "loss": 0.4714, "num_input_tokens_seen": 114088512, "step": 93815 }, { "epoch": 11.755419120410975, "grad_norm": 0.3257116675376892, "learning_rate": 4.342789108647017e-06, "loss": 0.4542, "num_input_tokens_seen": 114094752, "step": 93820 }, { "epoch": 11.75604560831976, "grad_norm": 0.3462120592594147, "learning_rate": 4.342247142117508e-06, "loss": 0.4574, "num_input_tokens_seen": 114101088, "step": 93825 }, { "epoch": 11.756672096228543, "grad_norm": 0.3167685568332672, "learning_rate": 4.341705183451989e-06, "loss": 0.4431, "num_input_tokens_seen": 114107328, "step": 93830 }, { "epoch": 11.757298584137326, "grad_norm": 0.291246235370636, "learning_rate": 4.341163232656935e-06, "loss": 0.4562, "num_input_tokens_seen": 114113056, "step": 93835 }, { "epoch": 11.75792507204611, "grad_norm": 0.3548576533794403, "learning_rate": 4.34062128973883e-06, "loss": 0.4647, "num_input_tokens_seen": 114119328, "step": 93840 }, { "epoch": 11.758551559954892, "grad_norm": 0.3213993012905121, "learning_rate": 4.340079354704151e-06, "loss": 0.4614, "num_input_tokens_seen": 114125088, "step": 93845 }, { "epoch": 11.759178047863676, "grad_norm": 0.4181818962097168, "learning_rate": 4.33953742755938e-06, "loss": 0.4578, "num_input_tokens_seen": 114131136, "step": 93850 }, { "epoch": 11.75980453577246, "grad_norm": 0.39601072669029236, "learning_rate": 4.338995508310992e-06, "loss": 0.4669, "num_input_tokens_seen": 114137344, "step": 93855 }, { "epoch": 11.760431023681242, "grad_norm": 0.2239382266998291, "learning_rate": 4.338453596965471e-06, "loss": 0.4546, "num_input_tokens_seen": 114143328, "step": 93860 }, { "epoch": 11.761057511590026, "grad_norm": 0.3165616989135742, "learning_rate": 4.337911693529292e-06, "loss": 0.4564, "num_input_tokens_seen": 114148704, "step": 93865 }, { "epoch": 11.76168399949881, "grad_norm": 0.46147072315216064, "learning_rate": 4.337369798008934e-06, "loss": 0.4648, "num_input_tokens_seen": 114154720, "step": 93870 }, { "epoch": 11.762310487407593, "grad_norm": 0.267456591129303, "learning_rate": 4.33682791041088e-06, "loss": 0.4547, "num_input_tokens_seen": 114160704, "step": 93875 }, { "epoch": 11.762936975316377, "grad_norm": 0.3683704733848572, "learning_rate": 4.336286030741603e-06, "loss": 0.468, "num_input_tokens_seen": 114166784, "step": 93880 }, { "epoch": 11.763563463225159, "grad_norm": 0.37899500131607056, "learning_rate": 4.335744159007588e-06, "loss": 0.4651, "num_input_tokens_seen": 114172576, "step": 93885 }, { "epoch": 11.764189951133943, "grad_norm": 0.3619453012943268, "learning_rate": 4.335202295215307e-06, "loss": 0.4613, "num_input_tokens_seen": 114178528, "step": 93890 }, { "epoch": 11.764816439042727, "grad_norm": 0.413094699382782, "learning_rate": 4.334660439371243e-06, "loss": 0.4434, "num_input_tokens_seen": 114184608, "step": 93895 }, { "epoch": 11.76544292695151, "grad_norm": 0.3111984431743622, "learning_rate": 4.334118591481872e-06, "loss": 0.4597, "num_input_tokens_seen": 114190624, "step": 93900 }, { "epoch": 11.766069414860294, "grad_norm": 0.19553549587726593, "learning_rate": 4.3335767515536745e-06, "loss": 0.4655, "num_input_tokens_seen": 114196416, "step": 93905 }, { "epoch": 11.766695902769076, "grad_norm": 0.6396912932395935, "learning_rate": 4.3330349195931255e-06, "loss": 0.4616, "num_input_tokens_seen": 114202624, "step": 93910 }, { "epoch": 11.76732239067786, "grad_norm": 0.22928105294704437, "learning_rate": 4.3324930956067065e-06, "loss": 0.4634, "num_input_tokens_seen": 114208896, "step": 93915 }, { "epoch": 11.767948878586644, "grad_norm": 0.31423819065093994, "learning_rate": 4.331951279600892e-06, "loss": 0.4653, "num_input_tokens_seen": 114214752, "step": 93920 }, { "epoch": 11.768575366495426, "grad_norm": 0.24874244630336761, "learning_rate": 4.331409471582164e-06, "loss": 0.4519, "num_input_tokens_seen": 114220672, "step": 93925 }, { "epoch": 11.76920185440421, "grad_norm": 0.21686753630638123, "learning_rate": 4.330867671556995e-06, "loss": 0.4528, "num_input_tokens_seen": 114226784, "step": 93930 }, { "epoch": 11.769828342312993, "grad_norm": 0.45937973260879517, "learning_rate": 4.330325879531867e-06, "loss": 0.4644, "num_input_tokens_seen": 114232736, "step": 93935 }, { "epoch": 11.770454830221777, "grad_norm": 0.2803788483142853, "learning_rate": 4.329784095513257e-06, "loss": 0.4597, "num_input_tokens_seen": 114239168, "step": 93940 }, { "epoch": 11.77108131813056, "grad_norm": 0.2808522880077362, "learning_rate": 4.329242319507639e-06, "loss": 0.4606, "num_input_tokens_seen": 114245056, "step": 93945 }, { "epoch": 11.771707806039343, "grad_norm": 0.5196699500083923, "learning_rate": 4.328700551521495e-06, "loss": 0.4616, "num_input_tokens_seen": 114251392, "step": 93950 }, { "epoch": 11.772334293948127, "grad_norm": 0.3833196461200714, "learning_rate": 4.3281587915612995e-06, "loss": 0.4486, "num_input_tokens_seen": 114257216, "step": 93955 }, { "epoch": 11.77296078185691, "grad_norm": 0.47832801938056946, "learning_rate": 4.327617039633533e-06, "loss": 0.4584, "num_input_tokens_seen": 114263488, "step": 93960 }, { "epoch": 11.773587269765693, "grad_norm": 0.2597869336605072, "learning_rate": 4.327075295744667e-06, "loss": 0.4597, "num_input_tokens_seen": 114269568, "step": 93965 }, { "epoch": 11.774213757674477, "grad_norm": 0.28155049681663513, "learning_rate": 4.326533559901184e-06, "loss": 0.4539, "num_input_tokens_seen": 114275648, "step": 93970 }, { "epoch": 11.77484024558326, "grad_norm": 0.327328085899353, "learning_rate": 4.325991832109556e-06, "loss": 0.4422, "num_input_tokens_seen": 114281760, "step": 93975 }, { "epoch": 11.775466733492044, "grad_norm": 0.40210509300231934, "learning_rate": 4.3254501123762645e-06, "loss": 0.4573, "num_input_tokens_seen": 114287360, "step": 93980 }, { "epoch": 11.776093221400828, "grad_norm": 0.6084209680557251, "learning_rate": 4.324908400707782e-06, "loss": 0.458, "num_input_tokens_seen": 114293024, "step": 93985 }, { "epoch": 11.77671970930961, "grad_norm": 0.32416847348213196, "learning_rate": 4.324366697110589e-06, "loss": 0.4548, "num_input_tokens_seen": 114299104, "step": 93990 }, { "epoch": 11.777346197218394, "grad_norm": 0.4823421239852905, "learning_rate": 4.323825001591158e-06, "loss": 0.4656, "num_input_tokens_seen": 114305280, "step": 93995 }, { "epoch": 11.777972685127176, "grad_norm": 0.4383426904678345, "learning_rate": 4.323283314155969e-06, "loss": 0.4627, "num_input_tokens_seen": 114311392, "step": 94000 }, { "epoch": 11.77859917303596, "grad_norm": 0.3754403591156006, "learning_rate": 4.322741634811497e-06, "loss": 0.4754, "num_input_tokens_seen": 114317472, "step": 94005 }, { "epoch": 11.779225660944745, "grad_norm": 0.3219054341316223, "learning_rate": 4.322199963564218e-06, "loss": 0.4748, "num_input_tokens_seen": 114323776, "step": 94010 }, { "epoch": 11.779852148853527, "grad_norm": 0.3525180220603943, "learning_rate": 4.321658300420609e-06, "loss": 0.4689, "num_input_tokens_seen": 114329408, "step": 94015 }, { "epoch": 11.78047863676231, "grad_norm": 0.3049851357936859, "learning_rate": 4.321116645387144e-06, "loss": 0.4633, "num_input_tokens_seen": 114335488, "step": 94020 }, { "epoch": 11.781105124671093, "grad_norm": 0.3753156065940857, "learning_rate": 4.320574998470302e-06, "loss": 0.4419, "num_input_tokens_seen": 114341728, "step": 94025 }, { "epoch": 11.781731612579877, "grad_norm": 0.31600221991539, "learning_rate": 4.320033359676555e-06, "loss": 0.4654, "num_input_tokens_seen": 114347840, "step": 94030 }, { "epoch": 11.782358100488661, "grad_norm": 0.2523290812969208, "learning_rate": 4.3194917290123835e-06, "loss": 0.4688, "num_input_tokens_seen": 114353856, "step": 94035 }, { "epoch": 11.782984588397444, "grad_norm": 0.30360913276672363, "learning_rate": 4.318950106484258e-06, "loss": 0.4561, "num_input_tokens_seen": 114359616, "step": 94040 }, { "epoch": 11.783611076306228, "grad_norm": 0.22932106256484985, "learning_rate": 4.318408492098658e-06, "loss": 0.4678, "num_input_tokens_seen": 114365856, "step": 94045 }, { "epoch": 11.78423756421501, "grad_norm": 0.26067617535591125, "learning_rate": 4.317866885862057e-06, "loss": 0.451, "num_input_tokens_seen": 114372160, "step": 94050 }, { "epoch": 11.784864052123794, "grad_norm": 0.28132811188697815, "learning_rate": 4.3173252877809314e-06, "loss": 0.4604, "num_input_tokens_seen": 114378464, "step": 94055 }, { "epoch": 11.785490540032578, "grad_norm": 0.28406888246536255, "learning_rate": 4.316783697861755e-06, "loss": 0.4521, "num_input_tokens_seen": 114384512, "step": 94060 }, { "epoch": 11.78611702794136, "grad_norm": 0.3466377556324005, "learning_rate": 4.316242116111004e-06, "loss": 0.4586, "num_input_tokens_seen": 114390592, "step": 94065 }, { "epoch": 11.786743515850144, "grad_norm": 0.33953505754470825, "learning_rate": 4.315700542535154e-06, "loss": 0.4561, "num_input_tokens_seen": 114397024, "step": 94070 }, { "epoch": 11.787370003758927, "grad_norm": 0.2672216594219208, "learning_rate": 4.315158977140679e-06, "loss": 0.4588, "num_input_tokens_seen": 114403232, "step": 94075 }, { "epoch": 11.78799649166771, "grad_norm": 0.23840667307376862, "learning_rate": 4.314617419934055e-06, "loss": 0.4616, "num_input_tokens_seen": 114409312, "step": 94080 }, { "epoch": 11.788622979576495, "grad_norm": 0.3836444914340973, "learning_rate": 4.314075870921755e-06, "loss": 0.4565, "num_input_tokens_seen": 114415328, "step": 94085 }, { "epoch": 11.789249467485277, "grad_norm": 0.32625463604927063, "learning_rate": 4.313534330110256e-06, "loss": 0.4579, "num_input_tokens_seen": 114421184, "step": 94090 }, { "epoch": 11.789875955394061, "grad_norm": 0.34133875370025635, "learning_rate": 4.312992797506029e-06, "loss": 0.4508, "num_input_tokens_seen": 114427328, "step": 94095 }, { "epoch": 11.790502443302845, "grad_norm": 0.3284023404121399, "learning_rate": 4.312451273115553e-06, "loss": 0.4481, "num_input_tokens_seen": 114433632, "step": 94100 }, { "epoch": 11.791128931211627, "grad_norm": 0.31963345408439636, "learning_rate": 4.311909756945298e-06, "loss": 0.4655, "num_input_tokens_seen": 114439264, "step": 94105 }, { "epoch": 11.791755419120411, "grad_norm": 0.43739813566207886, "learning_rate": 4.311368249001743e-06, "loss": 0.4646, "num_input_tokens_seen": 114445440, "step": 94110 }, { "epoch": 11.792381907029194, "grad_norm": 0.322006493806839, "learning_rate": 4.310826749291356e-06, "loss": 0.4501, "num_input_tokens_seen": 114451616, "step": 94115 }, { "epoch": 11.793008394937978, "grad_norm": 0.22795508801937103, "learning_rate": 4.310285257820616e-06, "loss": 0.4602, "num_input_tokens_seen": 114457856, "step": 94120 }, { "epoch": 11.793634882846762, "grad_norm": 0.3177955746650696, "learning_rate": 4.309743774595998e-06, "loss": 0.4621, "num_input_tokens_seen": 114463680, "step": 94125 }, { "epoch": 11.794261370755544, "grad_norm": 0.3056338429450989, "learning_rate": 4.30920229962397e-06, "loss": 0.4781, "num_input_tokens_seen": 114470112, "step": 94130 }, { "epoch": 11.794887858664328, "grad_norm": 0.28002291917800903, "learning_rate": 4.308660832911013e-06, "loss": 0.4666, "num_input_tokens_seen": 114476224, "step": 94135 }, { "epoch": 11.79551434657311, "grad_norm": 0.24581320583820343, "learning_rate": 4.308119374463594e-06, "loss": 0.4562, "num_input_tokens_seen": 114482464, "step": 94140 }, { "epoch": 11.796140834481895, "grad_norm": 0.20200911164283752, "learning_rate": 4.307577924288191e-06, "loss": 0.4763, "num_input_tokens_seen": 114488512, "step": 94145 }, { "epoch": 11.796767322390679, "grad_norm": 0.2503378093242645, "learning_rate": 4.3070364823912754e-06, "loss": 0.4587, "num_input_tokens_seen": 114494496, "step": 94150 }, { "epoch": 11.79739381029946, "grad_norm": 0.3337148427963257, "learning_rate": 4.306495048779322e-06, "loss": 0.4614, "num_input_tokens_seen": 114500480, "step": 94155 }, { "epoch": 11.798020298208245, "grad_norm": 0.3282824754714966, "learning_rate": 4.305953623458802e-06, "loss": 0.4531, "num_input_tokens_seen": 114506496, "step": 94160 }, { "epoch": 11.798646786117027, "grad_norm": 0.2625334560871124, "learning_rate": 4.305412206436192e-06, "loss": 0.4562, "num_input_tokens_seen": 114512384, "step": 94165 }, { "epoch": 11.799273274025811, "grad_norm": 0.37643858790397644, "learning_rate": 4.304870797717961e-06, "loss": 0.4606, "num_input_tokens_seen": 114518528, "step": 94170 }, { "epoch": 11.799899761934595, "grad_norm": 0.28444167971611023, "learning_rate": 4.304329397310586e-06, "loss": 0.4708, "num_input_tokens_seen": 114524320, "step": 94175 }, { "epoch": 11.800526249843378, "grad_norm": 0.4284675121307373, "learning_rate": 4.303788005220536e-06, "loss": 0.4539, "num_input_tokens_seen": 114530240, "step": 94180 }, { "epoch": 11.801152737752162, "grad_norm": 0.2716473639011383, "learning_rate": 4.303246621454286e-06, "loss": 0.4668, "num_input_tokens_seen": 114536512, "step": 94185 }, { "epoch": 11.801779225660944, "grad_norm": 0.2851695716381073, "learning_rate": 4.30270524601831e-06, "loss": 0.4651, "num_input_tokens_seen": 114542432, "step": 94190 }, { "epoch": 11.802405713569728, "grad_norm": 0.25324317812919617, "learning_rate": 4.302163878919078e-06, "loss": 0.452, "num_input_tokens_seen": 114548480, "step": 94195 }, { "epoch": 11.803032201478512, "grad_norm": 0.3110796809196472, "learning_rate": 4.301622520163063e-06, "loss": 0.4496, "num_input_tokens_seen": 114554752, "step": 94200 }, { "epoch": 11.803658689387294, "grad_norm": 0.3715748190879822, "learning_rate": 4.301081169756739e-06, "loss": 0.4629, "num_input_tokens_seen": 114561216, "step": 94205 }, { "epoch": 11.804285177296078, "grad_norm": 0.25031113624572754, "learning_rate": 4.3005398277065766e-06, "loss": 0.4589, "num_input_tokens_seen": 114567136, "step": 94210 }, { "epoch": 11.804911665204862, "grad_norm": 0.37041500210762024, "learning_rate": 4.299998494019049e-06, "loss": 0.4515, "num_input_tokens_seen": 114573664, "step": 94215 }, { "epoch": 11.805538153113645, "grad_norm": 0.3455770015716553, "learning_rate": 4.299457168700629e-06, "loss": 0.4647, "num_input_tokens_seen": 114579776, "step": 94220 }, { "epoch": 11.806164641022429, "grad_norm": 0.24732369184494019, "learning_rate": 4.298915851757785e-06, "loss": 0.4705, "num_input_tokens_seen": 114585344, "step": 94225 }, { "epoch": 11.806791128931211, "grad_norm": 0.2564593553543091, "learning_rate": 4.2983745431969935e-06, "loss": 0.4586, "num_input_tokens_seen": 114591232, "step": 94230 }, { "epoch": 11.807417616839995, "grad_norm": 0.27217385172843933, "learning_rate": 4.2978332430247225e-06, "loss": 0.4618, "num_input_tokens_seen": 114597408, "step": 94235 }, { "epoch": 11.808044104748777, "grad_norm": 0.22120791673660278, "learning_rate": 4.2972919512474484e-06, "loss": 0.4611, "num_input_tokens_seen": 114603584, "step": 94240 }, { "epoch": 11.808670592657561, "grad_norm": 0.2931634187698364, "learning_rate": 4.296750667871637e-06, "loss": 0.4695, "num_input_tokens_seen": 114609664, "step": 94245 }, { "epoch": 11.809297080566346, "grad_norm": 0.35707876086235046, "learning_rate": 4.296209392903763e-06, "loss": 0.4597, "num_input_tokens_seen": 114615968, "step": 94250 }, { "epoch": 11.809923568475128, "grad_norm": 0.28693488240242004, "learning_rate": 4.295668126350297e-06, "loss": 0.4634, "num_input_tokens_seen": 114622272, "step": 94255 }, { "epoch": 11.810550056383912, "grad_norm": 0.5425922274589539, "learning_rate": 4.295126868217712e-06, "loss": 0.4694, "num_input_tokens_seen": 114628032, "step": 94260 }, { "epoch": 11.811176544292696, "grad_norm": 0.3028145432472229, "learning_rate": 4.294585618512478e-06, "loss": 0.4616, "num_input_tokens_seen": 114634048, "step": 94265 }, { "epoch": 11.811803032201478, "grad_norm": 0.218949094414711, "learning_rate": 4.294044377241064e-06, "loss": 0.4547, "num_input_tokens_seen": 114640096, "step": 94270 }, { "epoch": 11.812429520110262, "grad_norm": 0.2904618978500366, "learning_rate": 4.293503144409946e-06, "loss": 0.4507, "num_input_tokens_seen": 114645984, "step": 94275 }, { "epoch": 11.813056008019045, "grad_norm": 0.3058698773384094, "learning_rate": 4.29296192002559e-06, "loss": 0.4564, "num_input_tokens_seen": 114652032, "step": 94280 }, { "epoch": 11.813682495927829, "grad_norm": 0.2704070210456848, "learning_rate": 4.29242070409447e-06, "loss": 0.4619, "num_input_tokens_seen": 114658400, "step": 94285 }, { "epoch": 11.814308983836613, "grad_norm": 0.39839088916778564, "learning_rate": 4.291879496623053e-06, "loss": 0.4701, "num_input_tokens_seen": 114664832, "step": 94290 }, { "epoch": 11.814935471745395, "grad_norm": 0.27864792943000793, "learning_rate": 4.291338297617814e-06, "loss": 0.4617, "num_input_tokens_seen": 114671104, "step": 94295 }, { "epoch": 11.815561959654179, "grad_norm": 0.3356420397758484, "learning_rate": 4.290797107085222e-06, "loss": 0.4496, "num_input_tokens_seen": 114677248, "step": 94300 }, { "epoch": 11.816188447562961, "grad_norm": 0.23801037669181824, "learning_rate": 4.290255925031745e-06, "loss": 0.4665, "num_input_tokens_seen": 114683104, "step": 94305 }, { "epoch": 11.816814935471745, "grad_norm": 0.2950422465801239, "learning_rate": 4.289714751463855e-06, "loss": 0.4595, "num_input_tokens_seen": 114689056, "step": 94310 }, { "epoch": 11.81744142338053, "grad_norm": 0.3099938929080963, "learning_rate": 4.289173586388023e-06, "loss": 0.4621, "num_input_tokens_seen": 114695424, "step": 94315 }, { "epoch": 11.818067911289312, "grad_norm": 0.3876010477542877, "learning_rate": 4.2886324298107195e-06, "loss": 0.4678, "num_input_tokens_seen": 114701440, "step": 94320 }, { "epoch": 11.818694399198096, "grad_norm": 0.2596507966518402, "learning_rate": 4.288091281738412e-06, "loss": 0.467, "num_input_tokens_seen": 114707616, "step": 94325 }, { "epoch": 11.81932088710688, "grad_norm": 0.2917124927043915, "learning_rate": 4.287550142177574e-06, "loss": 0.4631, "num_input_tokens_seen": 114713536, "step": 94330 }, { "epoch": 11.819947375015662, "grad_norm": 0.3043787181377411, "learning_rate": 4.287009011134669e-06, "loss": 0.4572, "num_input_tokens_seen": 114719808, "step": 94335 }, { "epoch": 11.820573862924446, "grad_norm": 0.2850365936756134, "learning_rate": 4.286467888616176e-06, "loss": 0.4644, "num_input_tokens_seen": 114725920, "step": 94340 }, { "epoch": 11.821200350833228, "grad_norm": 0.3127971887588501, "learning_rate": 4.285926774628555e-06, "loss": 0.4658, "num_input_tokens_seen": 114731904, "step": 94345 }, { "epoch": 11.821826838742012, "grad_norm": 0.25723713636398315, "learning_rate": 4.285385669178281e-06, "loss": 0.4773, "num_input_tokens_seen": 114737888, "step": 94350 }, { "epoch": 11.822453326650795, "grad_norm": 0.21859052777290344, "learning_rate": 4.2848445722718226e-06, "loss": 0.4666, "num_input_tokens_seen": 114744320, "step": 94355 }, { "epoch": 11.823079814559579, "grad_norm": 0.1773664355278015, "learning_rate": 4.284303483915648e-06, "loss": 0.4598, "num_input_tokens_seen": 114750528, "step": 94360 }, { "epoch": 11.823706302468363, "grad_norm": 0.33999109268188477, "learning_rate": 4.2837624041162264e-06, "loss": 0.4687, "num_input_tokens_seen": 114756416, "step": 94365 }, { "epoch": 11.824332790377145, "grad_norm": 0.3438625931739807, "learning_rate": 4.283221332880027e-06, "loss": 0.4588, "num_input_tokens_seen": 114762816, "step": 94370 }, { "epoch": 11.82495927828593, "grad_norm": 0.3001483380794525, "learning_rate": 4.282680270213522e-06, "loss": 0.4622, "num_input_tokens_seen": 114768544, "step": 94375 }, { "epoch": 11.825585766194713, "grad_norm": 0.3145216703414917, "learning_rate": 4.2821392161231755e-06, "loss": 0.4567, "num_input_tokens_seen": 114774944, "step": 94380 }, { "epoch": 11.826212254103496, "grad_norm": 0.2545947730541229, "learning_rate": 4.28159817061546e-06, "loss": 0.4624, "num_input_tokens_seen": 114781056, "step": 94385 }, { "epoch": 11.82683874201228, "grad_norm": 0.30732131004333496, "learning_rate": 4.28105713369684e-06, "loss": 0.4523, "num_input_tokens_seen": 114787232, "step": 94390 }, { "epoch": 11.827465229921062, "grad_norm": 0.31101754307746887, "learning_rate": 4.280516105373789e-06, "loss": 0.4635, "num_input_tokens_seen": 114792992, "step": 94395 }, { "epoch": 11.828091717829846, "grad_norm": 0.32541748881340027, "learning_rate": 4.27997508565277e-06, "loss": 0.4692, "num_input_tokens_seen": 114798880, "step": 94400 }, { "epoch": 11.82871820573863, "grad_norm": 0.2946433424949646, "learning_rate": 4.279434074540256e-06, "loss": 0.4543, "num_input_tokens_seen": 114804928, "step": 94405 }, { "epoch": 11.829344693647412, "grad_norm": 0.32859113812446594, "learning_rate": 4.2788930720427126e-06, "loss": 0.4553, "num_input_tokens_seen": 114811072, "step": 94410 }, { "epoch": 11.829971181556196, "grad_norm": 0.23726008832454681, "learning_rate": 4.27835207816661e-06, "loss": 0.4638, "num_input_tokens_seen": 114816864, "step": 94415 }, { "epoch": 11.830597669464979, "grad_norm": 0.2748391330242157, "learning_rate": 4.277811092918413e-06, "loss": 0.4599, "num_input_tokens_seen": 114823072, "step": 94420 }, { "epoch": 11.831224157373763, "grad_norm": 0.3652780055999756, "learning_rate": 4.277270116304594e-06, "loss": 0.4619, "num_input_tokens_seen": 114829312, "step": 94425 }, { "epoch": 11.831850645282547, "grad_norm": 0.1845083087682724, "learning_rate": 4.276729148331616e-06, "loss": 0.4658, "num_input_tokens_seen": 114835232, "step": 94430 }, { "epoch": 11.832477133191329, "grad_norm": 0.26614031195640564, "learning_rate": 4.27618818900595e-06, "loss": 0.4636, "num_input_tokens_seen": 114841280, "step": 94435 }, { "epoch": 11.833103621100113, "grad_norm": 0.2500201463699341, "learning_rate": 4.2756472383340645e-06, "loss": 0.4588, "num_input_tokens_seen": 114847328, "step": 94440 }, { "epoch": 11.833730109008895, "grad_norm": 0.32506340742111206, "learning_rate": 4.275106296322423e-06, "loss": 0.4699, "num_input_tokens_seen": 114853696, "step": 94445 }, { "epoch": 11.83435659691768, "grad_norm": 0.29303890466690063, "learning_rate": 4.274565362977497e-06, "loss": 0.456, "num_input_tokens_seen": 114859296, "step": 94450 }, { "epoch": 11.834983084826463, "grad_norm": 0.21542097628116608, "learning_rate": 4.27402443830575e-06, "loss": 0.4661, "num_input_tokens_seen": 114865152, "step": 94455 }, { "epoch": 11.835609572735246, "grad_norm": 0.34889164566993713, "learning_rate": 4.273483522313652e-06, "loss": 0.4618, "num_input_tokens_seen": 114871520, "step": 94460 }, { "epoch": 11.83623606064403, "grad_norm": 0.2745942771434784, "learning_rate": 4.272942615007669e-06, "loss": 0.4632, "num_input_tokens_seen": 114877920, "step": 94465 }, { "epoch": 11.836862548552812, "grad_norm": 0.29452502727508545, "learning_rate": 4.27240171639427e-06, "loss": 0.4614, "num_input_tokens_seen": 114884096, "step": 94470 }, { "epoch": 11.837489036461596, "grad_norm": 0.2646000385284424, "learning_rate": 4.271860826479918e-06, "loss": 0.4571, "num_input_tokens_seen": 114890272, "step": 94475 }, { "epoch": 11.83811552437038, "grad_norm": 0.27953749895095825, "learning_rate": 4.271319945271084e-06, "loss": 0.4647, "num_input_tokens_seen": 114896416, "step": 94480 }, { "epoch": 11.838742012279162, "grad_norm": 0.3130185008049011, "learning_rate": 4.2707790727742315e-06, "loss": 0.4574, "num_input_tokens_seen": 114902720, "step": 94485 }, { "epoch": 11.839368500187947, "grad_norm": 0.20359575748443604, "learning_rate": 4.270238208995828e-06, "loss": 0.4706, "num_input_tokens_seen": 114908608, "step": 94490 }, { "epoch": 11.83999498809673, "grad_norm": 0.22477690875530243, "learning_rate": 4.269697353942341e-06, "loss": 0.4621, "num_input_tokens_seen": 114914656, "step": 94495 }, { "epoch": 11.840621476005513, "grad_norm": 0.2750614583492279, "learning_rate": 4.269156507620236e-06, "loss": 0.47, "num_input_tokens_seen": 114920736, "step": 94500 }, { "epoch": 11.841247963914297, "grad_norm": 0.45283031463623047, "learning_rate": 4.268615670035981e-06, "loss": 0.4561, "num_input_tokens_seen": 114926848, "step": 94505 }, { "epoch": 11.84187445182308, "grad_norm": 0.3334665298461914, "learning_rate": 4.268074841196037e-06, "loss": 0.4523, "num_input_tokens_seen": 114932672, "step": 94510 }, { "epoch": 11.842500939731863, "grad_norm": 0.24749621748924255, "learning_rate": 4.267534021106876e-06, "loss": 0.463, "num_input_tokens_seen": 114938784, "step": 94515 }, { "epoch": 11.843127427640647, "grad_norm": 0.29467836022377014, "learning_rate": 4.266993209774961e-06, "loss": 0.4601, "num_input_tokens_seen": 114944704, "step": 94520 }, { "epoch": 11.84375391554943, "grad_norm": 0.19116686284542084, "learning_rate": 4.2664524072067595e-06, "loss": 0.4657, "num_input_tokens_seen": 114950560, "step": 94525 }, { "epoch": 11.844380403458214, "grad_norm": 0.3612513244152069, "learning_rate": 4.265911613408734e-06, "loss": 0.4622, "num_input_tokens_seen": 114956000, "step": 94530 }, { "epoch": 11.845006891366996, "grad_norm": 0.19712448120117188, "learning_rate": 4.265370828387355e-06, "loss": 0.4504, "num_input_tokens_seen": 114961632, "step": 94535 }, { "epoch": 11.84563337927578, "grad_norm": 0.18992067873477936, "learning_rate": 4.264830052149083e-06, "loss": 0.4697, "num_input_tokens_seen": 114967744, "step": 94540 }, { "epoch": 11.846259867184564, "grad_norm": 0.2905827462673187, "learning_rate": 4.264289284700388e-06, "loss": 0.4589, "num_input_tokens_seen": 114974400, "step": 94545 }, { "epoch": 11.846886355093346, "grad_norm": 0.26981475949287415, "learning_rate": 4.2637485260477316e-06, "loss": 0.4646, "num_input_tokens_seen": 114980544, "step": 94550 }, { "epoch": 11.84751284300213, "grad_norm": 0.24640759825706482, "learning_rate": 4.26320777619758e-06, "loss": 0.464, "num_input_tokens_seen": 114986752, "step": 94555 }, { "epoch": 11.848139330910913, "grad_norm": 0.25251829624176025, "learning_rate": 4.262667035156401e-06, "loss": 0.4761, "num_input_tokens_seen": 114992928, "step": 94560 }, { "epoch": 11.848765818819697, "grad_norm": 0.3341410458087921, "learning_rate": 4.2621263029306556e-06, "loss": 0.4628, "num_input_tokens_seen": 114999328, "step": 94565 }, { "epoch": 11.84939230672848, "grad_norm": 0.3036686182022095, "learning_rate": 4.261585579526812e-06, "loss": 0.4639, "num_input_tokens_seen": 115005248, "step": 94570 }, { "epoch": 11.850018794637263, "grad_norm": 0.2789175808429718, "learning_rate": 4.261044864951332e-06, "loss": 0.4627, "num_input_tokens_seen": 115011136, "step": 94575 }, { "epoch": 11.850645282546047, "grad_norm": 0.30738285183906555, "learning_rate": 4.260504159210685e-06, "loss": 0.4486, "num_input_tokens_seen": 115016768, "step": 94580 }, { "epoch": 11.85127177045483, "grad_norm": 0.2999440133571625, "learning_rate": 4.259963462311329e-06, "loss": 0.4493, "num_input_tokens_seen": 115022880, "step": 94585 }, { "epoch": 11.851898258363613, "grad_norm": 0.24131546914577484, "learning_rate": 4.259422774259735e-06, "loss": 0.4538, "num_input_tokens_seen": 115028224, "step": 94590 }, { "epoch": 11.852524746272397, "grad_norm": 0.2942459285259247, "learning_rate": 4.258882095062362e-06, "loss": 0.4648, "num_input_tokens_seen": 115033984, "step": 94595 }, { "epoch": 11.85315123418118, "grad_norm": 0.26793795824050903, "learning_rate": 4.258341424725678e-06, "loss": 0.4515, "num_input_tokens_seen": 115040064, "step": 94600 }, { "epoch": 11.853777722089964, "grad_norm": 0.30859482288360596, "learning_rate": 4.257800763256145e-06, "loss": 0.4619, "num_input_tokens_seen": 115046208, "step": 94605 }, { "epoch": 11.854404209998748, "grad_norm": 0.3190186023712158, "learning_rate": 4.2572601106602285e-06, "loss": 0.4683, "num_input_tokens_seen": 115051616, "step": 94610 }, { "epoch": 11.85503069790753, "grad_norm": 0.4657074511051178, "learning_rate": 4.25671946694439e-06, "loss": 0.4644, "num_input_tokens_seen": 115057536, "step": 94615 }, { "epoch": 11.855657185816314, "grad_norm": 0.46620938181877136, "learning_rate": 4.256178832115098e-06, "loss": 0.46, "num_input_tokens_seen": 115063968, "step": 94620 }, { "epoch": 11.856283673725097, "grad_norm": 0.2906433939933777, "learning_rate": 4.2556382061788115e-06, "loss": 0.4672, "num_input_tokens_seen": 115070240, "step": 94625 }, { "epoch": 11.85691016163388, "grad_norm": 0.251131147146225, "learning_rate": 4.255097589141996e-06, "loss": 0.453, "num_input_tokens_seen": 115076128, "step": 94630 }, { "epoch": 11.857536649542665, "grad_norm": 0.2668878734111786, "learning_rate": 4.254556981011118e-06, "loss": 0.46, "num_input_tokens_seen": 115082048, "step": 94635 }, { "epoch": 11.858163137451447, "grad_norm": 0.2785907983779907, "learning_rate": 4.254016381792634e-06, "loss": 0.4643, "num_input_tokens_seen": 115088096, "step": 94640 }, { "epoch": 11.858789625360231, "grad_norm": 0.2632442116737366, "learning_rate": 4.253475791493015e-06, "loss": 0.4702, "num_input_tokens_seen": 115094176, "step": 94645 }, { "epoch": 11.859416113269013, "grad_norm": 0.2372845560312271, "learning_rate": 4.252935210118718e-06, "loss": 0.4584, "num_input_tokens_seen": 115100576, "step": 94650 }, { "epoch": 11.860042601177797, "grad_norm": 0.3544003367424011, "learning_rate": 4.25239463767621e-06, "loss": 0.461, "num_input_tokens_seen": 115106944, "step": 94655 }, { "epoch": 11.860669089086581, "grad_norm": 0.315810889005661, "learning_rate": 4.251854074171952e-06, "loss": 0.4535, "num_input_tokens_seen": 115112896, "step": 94660 }, { "epoch": 11.861295576995364, "grad_norm": 0.28327488899230957, "learning_rate": 4.251313519612407e-06, "loss": 0.4496, "num_input_tokens_seen": 115118944, "step": 94665 }, { "epoch": 11.861922064904148, "grad_norm": 0.3906421363353729, "learning_rate": 4.250772974004038e-06, "loss": 0.4619, "num_input_tokens_seen": 115124896, "step": 94670 }, { "epoch": 11.86254855281293, "grad_norm": 0.2954312860965729, "learning_rate": 4.2502324373533095e-06, "loss": 0.4674, "num_input_tokens_seen": 115130720, "step": 94675 }, { "epoch": 11.863175040721714, "grad_norm": 0.292146772146225, "learning_rate": 4.249691909666682e-06, "loss": 0.4676, "num_input_tokens_seen": 115136864, "step": 94680 }, { "epoch": 11.863801528630498, "grad_norm": 0.2117515653371811, "learning_rate": 4.249151390950618e-06, "loss": 0.4596, "num_input_tokens_seen": 115142624, "step": 94685 }, { "epoch": 11.86442801653928, "grad_norm": 0.31851664185523987, "learning_rate": 4.248610881211582e-06, "loss": 0.4548, "num_input_tokens_seen": 115147648, "step": 94690 }, { "epoch": 11.865054504448064, "grad_norm": 0.26729217171669006, "learning_rate": 4.248070380456033e-06, "loss": 0.462, "num_input_tokens_seen": 115153984, "step": 94695 }, { "epoch": 11.865680992356847, "grad_norm": 0.30274683237075806, "learning_rate": 4.247529888690437e-06, "loss": 0.4571, "num_input_tokens_seen": 115160288, "step": 94700 }, { "epoch": 11.86630748026563, "grad_norm": 0.25845301151275635, "learning_rate": 4.246989405921251e-06, "loss": 0.4702, "num_input_tokens_seen": 115166592, "step": 94705 }, { "epoch": 11.866933968174415, "grad_norm": 0.32704514265060425, "learning_rate": 4.246448932154943e-06, "loss": 0.461, "num_input_tokens_seen": 115173088, "step": 94710 }, { "epoch": 11.867560456083197, "grad_norm": 0.30384621024131775, "learning_rate": 4.245908467397968e-06, "loss": 0.4626, "num_input_tokens_seen": 115179264, "step": 94715 }, { "epoch": 11.868186943991981, "grad_norm": 0.31771692633628845, "learning_rate": 4.245368011656793e-06, "loss": 0.4691, "num_input_tokens_seen": 115185600, "step": 94720 }, { "epoch": 11.868813431900765, "grad_norm": 0.1962966024875641, "learning_rate": 4.244827564937877e-06, "loss": 0.4596, "num_input_tokens_seen": 115191744, "step": 94725 }, { "epoch": 11.869439919809547, "grad_norm": 0.2447877675294876, "learning_rate": 4.244287127247685e-06, "loss": 0.4593, "num_input_tokens_seen": 115197824, "step": 94730 }, { "epoch": 11.870066407718332, "grad_norm": 0.22854088246822357, "learning_rate": 4.243746698592673e-06, "loss": 0.4539, "num_input_tokens_seen": 115203968, "step": 94735 }, { "epoch": 11.870692895627114, "grad_norm": 0.24768370389938354, "learning_rate": 4.243206278979304e-06, "loss": 0.4613, "num_input_tokens_seen": 115210176, "step": 94740 }, { "epoch": 11.871319383535898, "grad_norm": 0.3642549514770508, "learning_rate": 4.2426658684140444e-06, "loss": 0.4587, "num_input_tokens_seen": 115215776, "step": 94745 }, { "epoch": 11.871945871444682, "grad_norm": 0.5641552805900574, "learning_rate": 4.242125466903347e-06, "loss": 0.4646, "num_input_tokens_seen": 115221632, "step": 94750 }, { "epoch": 11.872572359353464, "grad_norm": 0.20465345680713654, "learning_rate": 4.24158507445368e-06, "loss": 0.4564, "num_input_tokens_seen": 115227424, "step": 94755 }, { "epoch": 11.873198847262248, "grad_norm": 0.4456368386745453, "learning_rate": 4.241044691071499e-06, "loss": 0.464, "num_input_tokens_seen": 115233696, "step": 94760 }, { "epoch": 11.87382533517103, "grad_norm": 0.24491389095783234, "learning_rate": 4.2405043167632685e-06, "loss": 0.4693, "num_input_tokens_seen": 115239904, "step": 94765 }, { "epoch": 11.874451823079815, "grad_norm": 0.27917030453681946, "learning_rate": 4.239963951535446e-06, "loss": 0.4512, "num_input_tokens_seen": 115245312, "step": 94770 }, { "epoch": 11.875078310988599, "grad_norm": 0.19471445679664612, "learning_rate": 4.239423595394494e-06, "loss": 0.458, "num_input_tokens_seen": 115250784, "step": 94775 }, { "epoch": 11.875704798897381, "grad_norm": 0.2622618079185486, "learning_rate": 4.238883248346871e-06, "loss": 0.4624, "num_input_tokens_seen": 115256960, "step": 94780 }, { "epoch": 11.876331286806165, "grad_norm": 0.30127623677253723, "learning_rate": 4.238342910399042e-06, "loss": 0.4599, "num_input_tokens_seen": 115263232, "step": 94785 }, { "epoch": 11.876957774714947, "grad_norm": 0.22899679839611053, "learning_rate": 4.237802581557461e-06, "loss": 0.4629, "num_input_tokens_seen": 115269504, "step": 94790 }, { "epoch": 11.877584262623731, "grad_norm": 0.2605672776699066, "learning_rate": 4.237262261828593e-06, "loss": 0.4526, "num_input_tokens_seen": 115275776, "step": 94795 }, { "epoch": 11.878210750532515, "grad_norm": 0.2665238380432129, "learning_rate": 4.236721951218894e-06, "loss": 0.461, "num_input_tokens_seen": 115281696, "step": 94800 }, { "epoch": 11.878837238441298, "grad_norm": 0.28458890318870544, "learning_rate": 4.236181649734825e-06, "loss": 0.4601, "num_input_tokens_seen": 115287584, "step": 94805 }, { "epoch": 11.879463726350082, "grad_norm": 0.32908180356025696, "learning_rate": 4.23564135738285e-06, "loss": 0.4629, "num_input_tokens_seen": 115293792, "step": 94810 }, { "epoch": 11.880090214258864, "grad_norm": 0.2855741083621979, "learning_rate": 4.2351010741694225e-06, "loss": 0.4602, "num_input_tokens_seen": 115300096, "step": 94815 }, { "epoch": 11.880716702167648, "grad_norm": 0.23796813189983368, "learning_rate": 4.2345608001010055e-06, "loss": 0.4603, "num_input_tokens_seen": 115305440, "step": 94820 }, { "epoch": 11.881343190076432, "grad_norm": 0.34752050042152405, "learning_rate": 4.234020535184058e-06, "loss": 0.4613, "num_input_tokens_seen": 115311296, "step": 94825 }, { "epoch": 11.881969677985214, "grad_norm": 0.23910342156887054, "learning_rate": 4.233480279425039e-06, "loss": 0.4732, "num_input_tokens_seen": 115316992, "step": 94830 }, { "epoch": 11.882596165893998, "grad_norm": 0.27294301986694336, "learning_rate": 4.232940032830407e-06, "loss": 0.4618, "num_input_tokens_seen": 115323008, "step": 94835 }, { "epoch": 11.883222653802783, "grad_norm": 0.30092352628707886, "learning_rate": 4.232399795406623e-06, "loss": 0.4679, "num_input_tokens_seen": 115329184, "step": 94840 }, { "epoch": 11.883849141711565, "grad_norm": 0.204924076795578, "learning_rate": 4.231859567160143e-06, "loss": 0.4664, "num_input_tokens_seen": 115335392, "step": 94845 }, { "epoch": 11.884475629620349, "grad_norm": 0.26432880759239197, "learning_rate": 4.23131934809743e-06, "loss": 0.464, "num_input_tokens_seen": 115341632, "step": 94850 }, { "epoch": 11.885102117529131, "grad_norm": 0.24890783429145813, "learning_rate": 4.230779138224939e-06, "loss": 0.4596, "num_input_tokens_seen": 115347904, "step": 94855 }, { "epoch": 11.885728605437915, "grad_norm": 0.23965446650981903, "learning_rate": 4.23023893754913e-06, "loss": 0.464, "num_input_tokens_seen": 115354240, "step": 94860 }, { "epoch": 11.886355093346697, "grad_norm": 0.26744407415390015, "learning_rate": 4.229698746076462e-06, "loss": 0.4656, "num_input_tokens_seen": 115360512, "step": 94865 }, { "epoch": 11.886981581255482, "grad_norm": 0.26654380559921265, "learning_rate": 4.229158563813393e-06, "loss": 0.4602, "num_input_tokens_seen": 115366720, "step": 94870 }, { "epoch": 11.887608069164266, "grad_norm": 0.2391277700662613, "learning_rate": 4.2286183907663815e-06, "loss": 0.4619, "num_input_tokens_seen": 115372960, "step": 94875 }, { "epoch": 11.888234557073048, "grad_norm": 0.3406634032726288, "learning_rate": 4.228078226941886e-06, "loss": 0.4735, "num_input_tokens_seen": 115378816, "step": 94880 }, { "epoch": 11.888861044981832, "grad_norm": 0.2759826183319092, "learning_rate": 4.227538072346364e-06, "loss": 0.4667, "num_input_tokens_seen": 115384800, "step": 94885 }, { "epoch": 11.889487532890616, "grad_norm": 0.19287708401679993, "learning_rate": 4.226997926986274e-06, "loss": 0.4609, "num_input_tokens_seen": 115391008, "step": 94890 }, { "epoch": 11.890114020799398, "grad_norm": 0.2566542625427246, "learning_rate": 4.226457790868075e-06, "loss": 0.4602, "num_input_tokens_seen": 115397600, "step": 94895 }, { "epoch": 11.890740508708182, "grad_norm": 0.20378099381923676, "learning_rate": 4.2259176639982206e-06, "loss": 0.463, "num_input_tokens_seen": 115403648, "step": 94900 }, { "epoch": 11.891366996616965, "grad_norm": 0.5171657204627991, "learning_rate": 4.225377546383174e-06, "loss": 0.4678, "num_input_tokens_seen": 115409632, "step": 94905 }, { "epoch": 11.891993484525749, "grad_norm": 0.2905025780200958, "learning_rate": 4.224837438029388e-06, "loss": 0.4679, "num_input_tokens_seen": 115415936, "step": 94910 }, { "epoch": 11.892619972434533, "grad_norm": 0.18982857465744019, "learning_rate": 4.2242973389433235e-06, "loss": 0.4642, "num_input_tokens_seen": 115421952, "step": 94915 }, { "epoch": 11.893246460343315, "grad_norm": 0.2037656009197235, "learning_rate": 4.223757249131436e-06, "loss": 0.4558, "num_input_tokens_seen": 115427968, "step": 94920 }, { "epoch": 11.893872948252099, "grad_norm": 0.20507007837295532, "learning_rate": 4.223217168600182e-06, "loss": 0.4587, "num_input_tokens_seen": 115434400, "step": 94925 }, { "epoch": 11.894499436160881, "grad_norm": 0.2753771245479584, "learning_rate": 4.222677097356022e-06, "loss": 0.4562, "num_input_tokens_seen": 115440384, "step": 94930 }, { "epoch": 11.895125924069665, "grad_norm": 0.3111203610897064, "learning_rate": 4.222137035405409e-06, "loss": 0.4725, "num_input_tokens_seen": 115446336, "step": 94935 }, { "epoch": 11.89575241197845, "grad_norm": 0.22665293514728546, "learning_rate": 4.2215969827548036e-06, "loss": 0.4562, "num_input_tokens_seen": 115452608, "step": 94940 }, { "epoch": 11.896378899887232, "grad_norm": 0.2297065556049347, "learning_rate": 4.221056939410658e-06, "loss": 0.4638, "num_input_tokens_seen": 115458560, "step": 94945 }, { "epoch": 11.897005387796016, "grad_norm": 0.22610902786254883, "learning_rate": 4.2205169053794355e-06, "loss": 0.4499, "num_input_tokens_seen": 115464928, "step": 94950 }, { "epoch": 11.897631875704798, "grad_norm": 0.24173417687416077, "learning_rate": 4.219976880667586e-06, "loss": 0.4526, "num_input_tokens_seen": 115471040, "step": 94955 }, { "epoch": 11.898258363613582, "grad_norm": 0.27231091260910034, "learning_rate": 4.219436865281571e-06, "loss": 0.4534, "num_input_tokens_seen": 115477344, "step": 94960 }, { "epoch": 11.898884851522366, "grad_norm": 0.2181606888771057, "learning_rate": 4.218896859227841e-06, "loss": 0.4598, "num_input_tokens_seen": 115483552, "step": 94965 }, { "epoch": 11.899511339431148, "grad_norm": 0.27688539028167725, "learning_rate": 4.218356862512858e-06, "loss": 0.4642, "num_input_tokens_seen": 115489600, "step": 94970 }, { "epoch": 11.900137827339933, "grad_norm": 0.27006086707115173, "learning_rate": 4.217816875143076e-06, "loss": 0.462, "num_input_tokens_seen": 115494848, "step": 94975 }, { "epoch": 11.900764315248715, "grad_norm": 0.26024329662323, "learning_rate": 4.21727689712495e-06, "loss": 0.4557, "num_input_tokens_seen": 115501056, "step": 94980 }, { "epoch": 11.901390803157499, "grad_norm": 0.3699175715446472, "learning_rate": 4.216736928464937e-06, "loss": 0.4643, "num_input_tokens_seen": 115507392, "step": 94985 }, { "epoch": 11.902017291066283, "grad_norm": 0.1923898309469223, "learning_rate": 4.216196969169491e-06, "loss": 0.4647, "num_input_tokens_seen": 115513472, "step": 94990 }, { "epoch": 11.902643778975065, "grad_norm": 0.29576489329338074, "learning_rate": 4.215657019245072e-06, "loss": 0.4558, "num_input_tokens_seen": 115519616, "step": 94995 }, { "epoch": 11.90327026688385, "grad_norm": 0.33314260840415955, "learning_rate": 4.215117078698131e-06, "loss": 0.457, "num_input_tokens_seen": 115525632, "step": 95000 }, { "epoch": 11.903896754792633, "grad_norm": 0.34250232577323914, "learning_rate": 4.214577147535128e-06, "loss": 0.4632, "num_input_tokens_seen": 115531872, "step": 95005 }, { "epoch": 11.904523242701416, "grad_norm": 0.2556701600551605, "learning_rate": 4.214037225762512e-06, "loss": 0.4631, "num_input_tokens_seen": 115538016, "step": 95010 }, { "epoch": 11.9051497306102, "grad_norm": 1.2006586790084839, "learning_rate": 4.2134973133867445e-06, "loss": 0.4737, "num_input_tokens_seen": 115544192, "step": 95015 }, { "epoch": 11.905776218518982, "grad_norm": 0.28418561816215515, "learning_rate": 4.212957410414276e-06, "loss": 0.4478, "num_input_tokens_seen": 115550432, "step": 95020 }, { "epoch": 11.906402706427766, "grad_norm": 0.2915879487991333, "learning_rate": 4.212417516851564e-06, "loss": 0.4558, "num_input_tokens_seen": 115556320, "step": 95025 }, { "epoch": 11.90702919433655, "grad_norm": 0.19765135645866394, "learning_rate": 4.211877632705062e-06, "loss": 0.4567, "num_input_tokens_seen": 115562400, "step": 95030 }, { "epoch": 11.907655682245332, "grad_norm": 0.2172670215368271, "learning_rate": 4.211337757981228e-06, "loss": 0.4514, "num_input_tokens_seen": 115569024, "step": 95035 }, { "epoch": 11.908282170154116, "grad_norm": 0.2679894268512726, "learning_rate": 4.210797892686511e-06, "loss": 0.4607, "num_input_tokens_seen": 115575168, "step": 95040 }, { "epoch": 11.908908658062899, "grad_norm": 0.20264218747615814, "learning_rate": 4.210258036827372e-06, "loss": 0.4537, "num_input_tokens_seen": 115581504, "step": 95045 }, { "epoch": 11.909535145971683, "grad_norm": 0.2913796305656433, "learning_rate": 4.2097181904102585e-06, "loss": 0.4595, "num_input_tokens_seen": 115587840, "step": 95050 }, { "epoch": 11.910161633880467, "grad_norm": 0.21175383031368256, "learning_rate": 4.2091783534416294e-06, "loss": 0.4585, "num_input_tokens_seen": 115593920, "step": 95055 }, { "epoch": 11.910788121789249, "grad_norm": 0.28682827949523926, "learning_rate": 4.20863852592794e-06, "loss": 0.4473, "num_input_tokens_seen": 115599744, "step": 95060 }, { "epoch": 11.911414609698033, "grad_norm": 0.3256576359272003, "learning_rate": 4.20809870787564e-06, "loss": 0.4539, "num_input_tokens_seen": 115605856, "step": 95065 }, { "epoch": 11.912041097606815, "grad_norm": 0.36370643973350525, "learning_rate": 4.207558899291189e-06, "loss": 0.4587, "num_input_tokens_seen": 115612096, "step": 95070 }, { "epoch": 11.9126675855156, "grad_norm": 0.9353358745574951, "learning_rate": 4.207019100181035e-06, "loss": 0.4602, "num_input_tokens_seen": 115618240, "step": 95075 }, { "epoch": 11.913294073424384, "grad_norm": 0.32866254448890686, "learning_rate": 4.2064793105516344e-06, "loss": 0.4649, "num_input_tokens_seen": 115624032, "step": 95080 }, { "epoch": 11.913920561333166, "grad_norm": 0.309912770986557, "learning_rate": 4.20593953040944e-06, "loss": 0.4597, "num_input_tokens_seen": 115629920, "step": 95085 }, { "epoch": 11.91454704924195, "grad_norm": 0.24813905358314514, "learning_rate": 4.205399759760908e-06, "loss": 0.4642, "num_input_tokens_seen": 115636000, "step": 95090 }, { "epoch": 11.915173537150732, "grad_norm": 0.29072752594947815, "learning_rate": 4.20485999861249e-06, "loss": 0.4652, "num_input_tokens_seen": 115642208, "step": 95095 }, { "epoch": 11.915800025059516, "grad_norm": 0.24852760136127472, "learning_rate": 4.204320246970639e-06, "loss": 0.4688, "num_input_tokens_seen": 115648128, "step": 95100 }, { "epoch": 11.9164265129683, "grad_norm": 0.27401307225227356, "learning_rate": 4.2037805048418075e-06, "loss": 0.4586, "num_input_tokens_seen": 115654080, "step": 95105 }, { "epoch": 11.917053000877083, "grad_norm": 0.2892698645591736, "learning_rate": 4.203240772232449e-06, "loss": 0.4592, "num_input_tokens_seen": 115660096, "step": 95110 }, { "epoch": 11.917679488785867, "grad_norm": 0.22383709251880646, "learning_rate": 4.202701049149019e-06, "loss": 0.4583, "num_input_tokens_seen": 115666144, "step": 95115 }, { "epoch": 11.91830597669465, "grad_norm": 0.31631115078926086, "learning_rate": 4.2021613355979665e-06, "loss": 0.4615, "num_input_tokens_seen": 115672544, "step": 95120 }, { "epoch": 11.918932464603433, "grad_norm": 0.2845393121242523, "learning_rate": 4.201621631585748e-06, "loss": 0.4603, "num_input_tokens_seen": 115678656, "step": 95125 }, { "epoch": 11.919558952512217, "grad_norm": 0.31528666615486145, "learning_rate": 4.201081937118812e-06, "loss": 0.4656, "num_input_tokens_seen": 115684768, "step": 95130 }, { "epoch": 11.920185440421, "grad_norm": 0.2722296714782715, "learning_rate": 4.200542252203614e-06, "loss": 0.4557, "num_input_tokens_seen": 115690944, "step": 95135 }, { "epoch": 11.920811928329783, "grad_norm": 0.214277982711792, "learning_rate": 4.200002576846605e-06, "loss": 0.4545, "num_input_tokens_seen": 115696928, "step": 95140 }, { "epoch": 11.921438416238567, "grad_norm": 0.2938624918460846, "learning_rate": 4.19946291105424e-06, "loss": 0.4594, "num_input_tokens_seen": 115703392, "step": 95145 }, { "epoch": 11.92206490414735, "grad_norm": 0.310794472694397, "learning_rate": 4.198923254832966e-06, "loss": 0.4554, "num_input_tokens_seen": 115709952, "step": 95150 }, { "epoch": 11.922691392056134, "grad_norm": 0.2611074447631836, "learning_rate": 4.19838360818924e-06, "loss": 0.452, "num_input_tokens_seen": 115716128, "step": 95155 }, { "epoch": 11.923317879964916, "grad_norm": 0.2942233383655548, "learning_rate": 4.197843971129511e-06, "loss": 0.4622, "num_input_tokens_seen": 115722400, "step": 95160 }, { "epoch": 11.9239443678737, "grad_norm": 0.2922884225845337, "learning_rate": 4.197304343660233e-06, "loss": 0.4618, "num_input_tokens_seen": 115728544, "step": 95165 }, { "epoch": 11.924570855782484, "grad_norm": 0.2808820605278015, "learning_rate": 4.196764725787854e-06, "loss": 0.4624, "num_input_tokens_seen": 115734592, "step": 95170 }, { "epoch": 11.925197343691266, "grad_norm": 0.2693483233451843, "learning_rate": 4.196225117518828e-06, "loss": 0.4603, "num_input_tokens_seen": 115740928, "step": 95175 }, { "epoch": 11.92582383160005, "grad_norm": 0.30846279859542847, "learning_rate": 4.19568551885961e-06, "loss": 0.463, "num_input_tokens_seen": 115746784, "step": 95180 }, { "epoch": 11.926450319508833, "grad_norm": 0.2581031024456024, "learning_rate": 4.195145929816645e-06, "loss": 0.4579, "num_input_tokens_seen": 115752928, "step": 95185 }, { "epoch": 11.927076807417617, "grad_norm": 0.2574169337749481, "learning_rate": 4.194606350396388e-06, "loss": 0.4768, "num_input_tokens_seen": 115759040, "step": 95190 }, { "epoch": 11.9277032953264, "grad_norm": 0.2797507345676422, "learning_rate": 4.1940667806052885e-06, "loss": 0.4572, "num_input_tokens_seen": 115765024, "step": 95195 }, { "epoch": 11.928329783235183, "grad_norm": 0.29589682817459106, "learning_rate": 4.1935272204498e-06, "loss": 0.4578, "num_input_tokens_seen": 115771040, "step": 95200 }, { "epoch": 11.928956271143967, "grad_norm": 0.28024670481681824, "learning_rate": 4.192987669936369e-06, "loss": 0.4577, "num_input_tokens_seen": 115777088, "step": 95205 }, { "epoch": 11.92958275905275, "grad_norm": 0.30833372473716736, "learning_rate": 4.192448129071452e-06, "loss": 0.4481, "num_input_tokens_seen": 115783360, "step": 95210 }, { "epoch": 11.930209246961534, "grad_norm": 0.24658164381980896, "learning_rate": 4.191908597861494e-06, "loss": 0.4585, "num_input_tokens_seen": 115789632, "step": 95215 }, { "epoch": 11.930835734870318, "grad_norm": 0.27200138568878174, "learning_rate": 4.19136907631295e-06, "loss": 0.4632, "num_input_tokens_seen": 115796032, "step": 95220 }, { "epoch": 11.9314622227791, "grad_norm": 0.24167706072330475, "learning_rate": 4.190829564432267e-06, "loss": 0.46, "num_input_tokens_seen": 115802432, "step": 95225 }, { "epoch": 11.932088710687884, "grad_norm": 0.23131349682807922, "learning_rate": 4.190290062225897e-06, "loss": 0.4665, "num_input_tokens_seen": 115808320, "step": 95230 }, { "epoch": 11.932715198596668, "grad_norm": 0.23226748406887054, "learning_rate": 4.18975056970029e-06, "loss": 0.4613, "num_input_tokens_seen": 115814752, "step": 95235 }, { "epoch": 11.93334168650545, "grad_norm": 0.25995728373527527, "learning_rate": 4.189211086861896e-06, "loss": 0.4588, "num_input_tokens_seen": 115820896, "step": 95240 }, { "epoch": 11.933968174414234, "grad_norm": 0.2935430109500885, "learning_rate": 4.188671613717167e-06, "loss": 0.4566, "num_input_tokens_seen": 115826688, "step": 95245 }, { "epoch": 11.934594662323017, "grad_norm": 0.5281779170036316, "learning_rate": 4.188132150272549e-06, "loss": 0.4586, "num_input_tokens_seen": 115832256, "step": 95250 }, { "epoch": 11.9352211502318, "grad_norm": 0.23683689534664154, "learning_rate": 4.187592696534496e-06, "loss": 0.4718, "num_input_tokens_seen": 115838496, "step": 95255 }, { "epoch": 11.935847638140585, "grad_norm": 0.32546764612197876, "learning_rate": 4.187053252509452e-06, "loss": 0.4437, "num_input_tokens_seen": 115844768, "step": 95260 }, { "epoch": 11.936474126049367, "grad_norm": 0.23895537853240967, "learning_rate": 4.186513818203874e-06, "loss": 0.461, "num_input_tokens_seen": 115850944, "step": 95265 }, { "epoch": 11.937100613958151, "grad_norm": 0.3306238651275635, "learning_rate": 4.185974393624203e-06, "loss": 0.4688, "num_input_tokens_seen": 115857088, "step": 95270 }, { "epoch": 11.937727101866933, "grad_norm": 0.3158775866031647, "learning_rate": 4.185434978776896e-06, "loss": 0.457, "num_input_tokens_seen": 115863264, "step": 95275 }, { "epoch": 11.938353589775717, "grad_norm": 0.36769169569015503, "learning_rate": 4.184895573668396e-06, "loss": 0.4517, "num_input_tokens_seen": 115869472, "step": 95280 }, { "epoch": 11.938980077684501, "grad_norm": 0.4568113088607788, "learning_rate": 4.1843561783051565e-06, "loss": 0.4757, "num_input_tokens_seen": 115874944, "step": 95285 }, { "epoch": 11.939606565593284, "grad_norm": 0.3918329179286957, "learning_rate": 4.183816792693623e-06, "loss": 0.4595, "num_input_tokens_seen": 115880768, "step": 95290 }, { "epoch": 11.940233053502068, "grad_norm": 0.27189260721206665, "learning_rate": 4.183277416840247e-06, "loss": 0.4685, "num_input_tokens_seen": 115886528, "step": 95295 }, { "epoch": 11.94085954141085, "grad_norm": 0.2553558051586151, "learning_rate": 4.182738050751476e-06, "loss": 0.4586, "num_input_tokens_seen": 115892768, "step": 95300 }, { "epoch": 11.941486029319634, "grad_norm": 0.23355413973331451, "learning_rate": 4.182198694433758e-06, "loss": 0.4608, "num_input_tokens_seen": 115898592, "step": 95305 }, { "epoch": 11.942112517228418, "grad_norm": 0.31370189785957336, "learning_rate": 4.1816593478935445e-06, "loss": 0.4686, "num_input_tokens_seen": 115904928, "step": 95310 }, { "epoch": 11.9427390051372, "grad_norm": 0.3580838143825531, "learning_rate": 4.181120011137279e-06, "loss": 0.4629, "num_input_tokens_seen": 115911232, "step": 95315 }, { "epoch": 11.943365493045984, "grad_norm": 0.7666618227958679, "learning_rate": 4.180580684171415e-06, "loss": 0.4646, "num_input_tokens_seen": 115917280, "step": 95320 }, { "epoch": 11.943991980954767, "grad_norm": 0.2636275291442871, "learning_rate": 4.180041367002395e-06, "loss": 0.4605, "num_input_tokens_seen": 115923968, "step": 95325 }, { "epoch": 11.94461846886355, "grad_norm": 0.26269102096557617, "learning_rate": 4.179502059636672e-06, "loss": 0.4734, "num_input_tokens_seen": 115930176, "step": 95330 }, { "epoch": 11.945244956772335, "grad_norm": 0.3195546865463257, "learning_rate": 4.17896276208069e-06, "loss": 0.4636, "num_input_tokens_seen": 115936224, "step": 95335 }, { "epoch": 11.945871444681117, "grad_norm": 0.21917638182640076, "learning_rate": 4.178423474340899e-06, "loss": 0.4738, "num_input_tokens_seen": 115942336, "step": 95340 }, { "epoch": 11.946497932589901, "grad_norm": 0.20499873161315918, "learning_rate": 4.177884196423745e-06, "loss": 0.4619, "num_input_tokens_seen": 115948320, "step": 95345 }, { "epoch": 11.947124420498685, "grad_norm": 0.22953741252422333, "learning_rate": 4.17734492833568e-06, "loss": 0.457, "num_input_tokens_seen": 115953760, "step": 95350 }, { "epoch": 11.947750908407468, "grad_norm": 0.2828901410102844, "learning_rate": 4.176805670083145e-06, "loss": 0.4642, "num_input_tokens_seen": 115960032, "step": 95355 }, { "epoch": 11.948377396316252, "grad_norm": 0.3220328986644745, "learning_rate": 4.17626642167259e-06, "loss": 0.461, "num_input_tokens_seen": 115966112, "step": 95360 }, { "epoch": 11.949003884225034, "grad_norm": 0.22844593226909637, "learning_rate": 4.1757271831104655e-06, "loss": 0.468, "num_input_tokens_seen": 115972384, "step": 95365 }, { "epoch": 11.949630372133818, "grad_norm": 0.32403531670570374, "learning_rate": 4.175187954403213e-06, "loss": 0.4553, "num_input_tokens_seen": 115978336, "step": 95370 }, { "epoch": 11.9502568600426, "grad_norm": 0.2275787740945816, "learning_rate": 4.174648735557283e-06, "loss": 0.4702, "num_input_tokens_seen": 115984256, "step": 95375 }, { "epoch": 11.950883347951384, "grad_norm": 0.3018454611301422, "learning_rate": 4.17410952657912e-06, "loss": 0.4581, "num_input_tokens_seen": 115990304, "step": 95380 }, { "epoch": 11.951509835860168, "grad_norm": 0.48489004373550415, "learning_rate": 4.173570327475173e-06, "loss": 0.4638, "num_input_tokens_seen": 115996352, "step": 95385 }, { "epoch": 11.95213632376895, "grad_norm": 0.327951580286026, "learning_rate": 4.173031138251889e-06, "loss": 0.4569, "num_input_tokens_seen": 116002176, "step": 95390 }, { "epoch": 11.952762811677735, "grad_norm": 0.30741405487060547, "learning_rate": 4.1724919589157105e-06, "loss": 0.4665, "num_input_tokens_seen": 116008256, "step": 95395 }, { "epoch": 11.953389299586519, "grad_norm": 0.27531442046165466, "learning_rate": 4.171952789473087e-06, "loss": 0.4574, "num_input_tokens_seen": 116014400, "step": 95400 }, { "epoch": 11.954015787495301, "grad_norm": 0.18850886821746826, "learning_rate": 4.1714136299304655e-06, "loss": 0.4631, "num_input_tokens_seen": 116020544, "step": 95405 }, { "epoch": 11.954642275404085, "grad_norm": 0.234335258603096, "learning_rate": 4.170874480294289e-06, "loss": 0.4625, "num_input_tokens_seen": 116026816, "step": 95410 }, { "epoch": 11.955268763312867, "grad_norm": 0.2603227496147156, "learning_rate": 4.170335340571007e-06, "loss": 0.4535, "num_input_tokens_seen": 116032960, "step": 95415 }, { "epoch": 11.955895251221651, "grad_norm": 0.24877792596817017, "learning_rate": 4.169796210767062e-06, "loss": 0.4495, "num_input_tokens_seen": 116039200, "step": 95420 }, { "epoch": 11.956521739130435, "grad_norm": 0.39060860872268677, "learning_rate": 4.169257090888901e-06, "loss": 0.4698, "num_input_tokens_seen": 116045152, "step": 95425 }, { "epoch": 11.957148227039218, "grad_norm": 0.33871909976005554, "learning_rate": 4.168717980942972e-06, "loss": 0.4716, "num_input_tokens_seen": 116051360, "step": 95430 }, { "epoch": 11.957774714948002, "grad_norm": 0.24002212285995483, "learning_rate": 4.168178880935716e-06, "loss": 0.4583, "num_input_tokens_seen": 116057632, "step": 95435 }, { "epoch": 11.958401202856784, "grad_norm": 0.22607889771461487, "learning_rate": 4.167639790873582e-06, "loss": 0.4697, "num_input_tokens_seen": 116064064, "step": 95440 }, { "epoch": 11.959027690765568, "grad_norm": 0.2757765054702759, "learning_rate": 4.167100710763014e-06, "loss": 0.4519, "num_input_tokens_seen": 116070432, "step": 95445 }, { "epoch": 11.959654178674352, "grad_norm": 0.28543832898139954, "learning_rate": 4.166561640610457e-06, "loss": 0.4669, "num_input_tokens_seen": 116076352, "step": 95450 }, { "epoch": 11.960280666583134, "grad_norm": 0.36497563123703003, "learning_rate": 4.166022580422357e-06, "loss": 0.4503, "num_input_tokens_seen": 116082496, "step": 95455 }, { "epoch": 11.960907154491919, "grad_norm": 0.2208460420370102, "learning_rate": 4.1654835302051585e-06, "loss": 0.4604, "num_input_tokens_seen": 116088768, "step": 95460 }, { "epoch": 11.961533642400703, "grad_norm": 0.2438630908727646, "learning_rate": 4.164944489965305e-06, "loss": 0.4529, "num_input_tokens_seen": 116094560, "step": 95465 }, { "epoch": 11.962160130309485, "grad_norm": 2.203277587890625, "learning_rate": 4.1644054597092434e-06, "loss": 0.452, "num_input_tokens_seen": 116100448, "step": 95470 }, { "epoch": 11.962786618218269, "grad_norm": 0.33801913261413574, "learning_rate": 4.163866439443415e-06, "loss": 0.4553, "num_input_tokens_seen": 116106528, "step": 95475 }, { "epoch": 11.963413106127051, "grad_norm": 0.3777196705341339, "learning_rate": 4.163327429174267e-06, "loss": 0.4592, "num_input_tokens_seen": 116112960, "step": 95480 }, { "epoch": 11.964039594035835, "grad_norm": 0.2950347363948822, "learning_rate": 4.162788428908244e-06, "loss": 0.4564, "num_input_tokens_seen": 116118912, "step": 95485 }, { "epoch": 11.964666081944618, "grad_norm": 0.27835848927497864, "learning_rate": 4.162249438651789e-06, "loss": 0.4727, "num_input_tokens_seen": 116124896, "step": 95490 }, { "epoch": 11.965292569853402, "grad_norm": 0.48761793971061707, "learning_rate": 4.161710458411345e-06, "loss": 0.466, "num_input_tokens_seen": 116131328, "step": 95495 }, { "epoch": 11.965919057762186, "grad_norm": 0.18790455162525177, "learning_rate": 4.161171488193358e-06, "loss": 0.4528, "num_input_tokens_seen": 116137472, "step": 95500 }, { "epoch": 11.966545545670968, "grad_norm": 0.28370431065559387, "learning_rate": 4.160632528004272e-06, "loss": 0.4488, "num_input_tokens_seen": 116143552, "step": 95505 }, { "epoch": 11.967172033579752, "grad_norm": 0.545489490032196, "learning_rate": 4.160093577850529e-06, "loss": 0.4525, "num_input_tokens_seen": 116149856, "step": 95510 }, { "epoch": 11.967798521488536, "grad_norm": 0.5025162100791931, "learning_rate": 4.159554637738575e-06, "loss": 0.4711, "num_input_tokens_seen": 116156064, "step": 95515 }, { "epoch": 11.968425009397318, "grad_norm": 0.4812692701816559, "learning_rate": 4.15901570767485e-06, "loss": 0.4832, "num_input_tokens_seen": 116161952, "step": 95520 }, { "epoch": 11.969051497306102, "grad_norm": 0.6230254173278809, "learning_rate": 4.158476787665801e-06, "loss": 0.484, "num_input_tokens_seen": 116168064, "step": 95525 }, { "epoch": 11.969677985214885, "grad_norm": 0.46298518776893616, "learning_rate": 4.157937877717868e-06, "loss": 0.4851, "num_input_tokens_seen": 116174304, "step": 95530 }, { "epoch": 11.970304473123669, "grad_norm": 0.3009323179721832, "learning_rate": 4.1573989778374956e-06, "loss": 0.4647, "num_input_tokens_seen": 116180288, "step": 95535 }, { "epoch": 11.970930961032453, "grad_norm": 0.4865647852420807, "learning_rate": 4.156860088031127e-06, "loss": 0.4709, "num_input_tokens_seen": 116186528, "step": 95540 }, { "epoch": 11.971557448941235, "grad_norm": 0.5420486927032471, "learning_rate": 4.156321208305205e-06, "loss": 0.4554, "num_input_tokens_seen": 116192864, "step": 95545 }, { "epoch": 11.97218393685002, "grad_norm": 0.5393304824829102, "learning_rate": 4.155782338666172e-06, "loss": 0.4649, "num_input_tokens_seen": 116198880, "step": 95550 }, { "epoch": 11.972810424758801, "grad_norm": 0.6084392666816711, "learning_rate": 4.155243479120471e-06, "loss": 0.4883, "num_input_tokens_seen": 116204960, "step": 95555 }, { "epoch": 11.973436912667585, "grad_norm": 0.33622705936431885, "learning_rate": 4.154704629674546e-06, "loss": 0.4502, "num_input_tokens_seen": 116211264, "step": 95560 }, { "epoch": 11.97406340057637, "grad_norm": 0.37893038988113403, "learning_rate": 4.154165790334835e-06, "loss": 0.456, "num_input_tokens_seen": 116217280, "step": 95565 }, { "epoch": 11.974689888485152, "grad_norm": 0.6668315529823303, "learning_rate": 4.153626961107787e-06, "loss": 0.4598, "num_input_tokens_seen": 116223520, "step": 95570 }, { "epoch": 11.975316376393936, "grad_norm": 0.3542648255825043, "learning_rate": 4.153088141999837e-06, "loss": 0.454, "num_input_tokens_seen": 116229728, "step": 95575 }, { "epoch": 11.975942864302718, "grad_norm": 0.27906152606010437, "learning_rate": 4.1525493330174325e-06, "loss": 0.4621, "num_input_tokens_seen": 116235584, "step": 95580 }, { "epoch": 11.976569352211502, "grad_norm": 0.5048442482948303, "learning_rate": 4.152010534167012e-06, "loss": 0.4828, "num_input_tokens_seen": 116241632, "step": 95585 }, { "epoch": 11.977195840120286, "grad_norm": 0.40815895795822144, "learning_rate": 4.151471745455018e-06, "loss": 0.4643, "num_input_tokens_seen": 116247712, "step": 95590 }, { "epoch": 11.977822328029069, "grad_norm": 0.3028051257133484, "learning_rate": 4.150932966887894e-06, "loss": 0.4569, "num_input_tokens_seen": 116253792, "step": 95595 }, { "epoch": 11.978448815937853, "grad_norm": 0.20696984231472015, "learning_rate": 4.150394198472079e-06, "loss": 0.462, "num_input_tokens_seen": 116259584, "step": 95600 }, { "epoch": 11.979075303846635, "grad_norm": 0.2850746810436249, "learning_rate": 4.149855440214016e-06, "loss": 0.4632, "num_input_tokens_seen": 116265728, "step": 95605 }, { "epoch": 11.979701791755419, "grad_norm": 0.24456197023391724, "learning_rate": 4.149316692120144e-06, "loss": 0.4515, "num_input_tokens_seen": 116271840, "step": 95610 }, { "epoch": 11.980328279664203, "grad_norm": 0.2425204962491989, "learning_rate": 4.14877795419691e-06, "loss": 0.4633, "num_input_tokens_seen": 116277728, "step": 95615 }, { "epoch": 11.980954767572985, "grad_norm": 0.5105045437812805, "learning_rate": 4.148239226450749e-06, "loss": 0.4655, "num_input_tokens_seen": 116283904, "step": 95620 }, { "epoch": 11.98158125548177, "grad_norm": 0.2889772653579712, "learning_rate": 4.147700508888105e-06, "loss": 0.4706, "num_input_tokens_seen": 116290368, "step": 95625 }, { "epoch": 11.982207743390553, "grad_norm": 0.1939757615327835, "learning_rate": 4.1471618015154175e-06, "loss": 0.4559, "num_input_tokens_seen": 116296192, "step": 95630 }, { "epoch": 11.982834231299336, "grad_norm": 0.22083525359630585, "learning_rate": 4.146623104339129e-06, "loss": 0.4668, "num_input_tokens_seen": 116302464, "step": 95635 }, { "epoch": 11.98346071920812, "grad_norm": 0.44652891159057617, "learning_rate": 4.1460844173656765e-06, "loss": 0.4692, "num_input_tokens_seen": 116308480, "step": 95640 }, { "epoch": 11.984087207116902, "grad_norm": 0.23774069547653198, "learning_rate": 4.145545740601504e-06, "loss": 0.4579, "num_input_tokens_seen": 116314688, "step": 95645 }, { "epoch": 11.984713695025686, "grad_norm": 0.4560987651348114, "learning_rate": 4.14500707405305e-06, "loss": 0.4658, "num_input_tokens_seen": 116320768, "step": 95650 }, { "epoch": 11.98534018293447, "grad_norm": 0.29422515630722046, "learning_rate": 4.144468417726757e-06, "loss": 0.4597, "num_input_tokens_seen": 116326880, "step": 95655 }, { "epoch": 11.985966670843252, "grad_norm": 0.305095374584198, "learning_rate": 4.143929771629061e-06, "loss": 0.46, "num_input_tokens_seen": 116332672, "step": 95660 }, { "epoch": 11.986593158752036, "grad_norm": 0.43563035130500793, "learning_rate": 4.1433911357664044e-06, "loss": 0.4575, "num_input_tokens_seen": 116338400, "step": 95665 }, { "epoch": 11.987219646660819, "grad_norm": 0.23135241866111755, "learning_rate": 4.142852510145229e-06, "loss": 0.4467, "num_input_tokens_seen": 116344352, "step": 95670 }, { "epoch": 11.987846134569603, "grad_norm": 0.3159095048904419, "learning_rate": 4.142313894771971e-06, "loss": 0.4643, "num_input_tokens_seen": 116350304, "step": 95675 }, { "epoch": 11.988472622478387, "grad_norm": 0.2792430818080902, "learning_rate": 4.141775289653073e-06, "loss": 0.4646, "num_input_tokens_seen": 116356160, "step": 95680 }, { "epoch": 11.98909911038717, "grad_norm": 0.33468008041381836, "learning_rate": 4.1412366947949715e-06, "loss": 0.4626, "num_input_tokens_seen": 116362208, "step": 95685 }, { "epoch": 11.989725598295953, "grad_norm": 0.2704477310180664, "learning_rate": 4.140698110204109e-06, "loss": 0.4618, "num_input_tokens_seen": 116368416, "step": 95690 }, { "epoch": 11.990352086204735, "grad_norm": 0.22627849876880646, "learning_rate": 4.140159535886921e-06, "loss": 0.4499, "num_input_tokens_seen": 116374560, "step": 95695 }, { "epoch": 11.99097857411352, "grad_norm": 0.22850339114665985, "learning_rate": 4.139620971849849e-06, "loss": 0.4612, "num_input_tokens_seen": 116380032, "step": 95700 }, { "epoch": 11.991605062022304, "grad_norm": 0.31033772230148315, "learning_rate": 4.139082418099332e-06, "loss": 0.4497, "num_input_tokens_seen": 116385984, "step": 95705 }, { "epoch": 11.992231549931086, "grad_norm": 0.2110060155391693, "learning_rate": 4.1385438746418095e-06, "loss": 0.4603, "num_input_tokens_seen": 116391904, "step": 95710 }, { "epoch": 11.99285803783987, "grad_norm": 0.24418820440769196, "learning_rate": 4.1380053414837175e-06, "loss": 0.4674, "num_input_tokens_seen": 116398304, "step": 95715 }, { "epoch": 11.993484525748652, "grad_norm": 0.42238306999206543, "learning_rate": 4.137466818631498e-06, "loss": 0.4644, "num_input_tokens_seen": 116404544, "step": 95720 }, { "epoch": 11.994111013657436, "grad_norm": 0.3380977213382721, "learning_rate": 4.136928306091586e-06, "loss": 0.4668, "num_input_tokens_seen": 116410656, "step": 95725 }, { "epoch": 11.99473750156622, "grad_norm": 0.20457035303115845, "learning_rate": 4.136389803870422e-06, "loss": 0.4523, "num_input_tokens_seen": 116416448, "step": 95730 }, { "epoch": 11.995363989475003, "grad_norm": 0.2503143846988678, "learning_rate": 4.135851311974447e-06, "loss": 0.4635, "num_input_tokens_seen": 116422432, "step": 95735 }, { "epoch": 11.995990477383787, "grad_norm": 0.29404690861701965, "learning_rate": 4.135312830410093e-06, "loss": 0.4597, "num_input_tokens_seen": 116428448, "step": 95740 }, { "epoch": 11.99661696529257, "grad_norm": 0.2823062241077423, "learning_rate": 4.134774359183803e-06, "loss": 0.4572, "num_input_tokens_seen": 116434688, "step": 95745 }, { "epoch": 11.997243453201353, "grad_norm": 0.5534326434135437, "learning_rate": 4.134235898302011e-06, "loss": 0.4488, "num_input_tokens_seen": 116441024, "step": 95750 }, { "epoch": 11.997869941110137, "grad_norm": 0.2760591208934784, "learning_rate": 4.133697447771158e-06, "loss": 0.4806, "num_input_tokens_seen": 116447328, "step": 95755 }, { "epoch": 11.99849642901892, "grad_norm": 0.27540022134780884, "learning_rate": 4.133159007597679e-06, "loss": 0.4711, "num_input_tokens_seen": 116452704, "step": 95760 }, { "epoch": 11.999122916927703, "grad_norm": 0.4161463677883148, "learning_rate": 4.1326205777880155e-06, "loss": 0.4622, "num_input_tokens_seen": 116458912, "step": 95765 }, { "epoch": 11.999749404836487, "grad_norm": 0.3803299367427826, "learning_rate": 4.1320821583485995e-06, "loss": 0.4572, "num_input_tokens_seen": 116465440, "step": 95770 }, { "epoch": 12.0, "eval_loss": 0.4653880000114441, "eval_runtime": 223.2813, "eval_samples_per_second": 35.744, "eval_steps_per_second": 8.939, "num_input_tokens_seen": 116467904, "step": 95772 }, { "epoch": 12.00037589274527, "grad_norm": 0.27059856057167053, "learning_rate": 4.131543749285873e-06, "loss": 0.4622, "num_input_tokens_seen": 116471552, "step": 95775 }, { "epoch": 12.001002380654054, "grad_norm": 0.45095452666282654, "learning_rate": 4.13100535060627e-06, "loss": 0.4686, "num_input_tokens_seen": 116477280, "step": 95780 }, { "epoch": 12.001628868562836, "grad_norm": 0.21872010827064514, "learning_rate": 4.1304669623162295e-06, "loss": 0.4632, "num_input_tokens_seen": 116483424, "step": 95785 }, { "epoch": 12.00225535647162, "grad_norm": 0.20703235268592834, "learning_rate": 4.129928584422186e-06, "loss": 0.4717, "num_input_tokens_seen": 116489376, "step": 95790 }, { "epoch": 12.002881844380404, "grad_norm": 0.30624064803123474, "learning_rate": 4.129390216930578e-06, "loss": 0.4507, "num_input_tokens_seen": 116495552, "step": 95795 }, { "epoch": 12.003508332289186, "grad_norm": 0.34296759963035583, "learning_rate": 4.128851859847845e-06, "loss": 0.4624, "num_input_tokens_seen": 116501536, "step": 95800 }, { "epoch": 12.00413482019797, "grad_norm": 0.2776259183883667, "learning_rate": 4.1283135131804165e-06, "loss": 0.4677, "num_input_tokens_seen": 116507520, "step": 95805 }, { "epoch": 12.004761308106753, "grad_norm": 0.2657680809497833, "learning_rate": 4.127775176934735e-06, "loss": 0.4709, "num_input_tokens_seen": 116513504, "step": 95810 }, { "epoch": 12.005387796015537, "grad_norm": 0.2756882905960083, "learning_rate": 4.127236851117233e-06, "loss": 0.4594, "num_input_tokens_seen": 116519616, "step": 95815 }, { "epoch": 12.006014283924321, "grad_norm": 0.23679301142692566, "learning_rate": 4.1266985357343506e-06, "loss": 0.4662, "num_input_tokens_seen": 116525824, "step": 95820 }, { "epoch": 12.006640771833103, "grad_norm": 0.1758703589439392, "learning_rate": 4.126160230792519e-06, "loss": 0.4697, "num_input_tokens_seen": 116532224, "step": 95825 }, { "epoch": 12.007267259741887, "grad_norm": 0.23790472745895386, "learning_rate": 4.125621936298178e-06, "loss": 0.4596, "num_input_tokens_seen": 116538592, "step": 95830 }, { "epoch": 12.00789374765067, "grad_norm": 0.22551822662353516, "learning_rate": 4.1250836522577615e-06, "loss": 0.4558, "num_input_tokens_seen": 116545024, "step": 95835 }, { "epoch": 12.008520235559454, "grad_norm": 0.20454424619674683, "learning_rate": 4.124545378677707e-06, "loss": 0.461, "num_input_tokens_seen": 116551232, "step": 95840 }, { "epoch": 12.009146723468238, "grad_norm": 0.4695293605327606, "learning_rate": 4.124007115564446e-06, "loss": 0.4672, "num_input_tokens_seen": 116557248, "step": 95845 }, { "epoch": 12.00977321137702, "grad_norm": 0.31258493661880493, "learning_rate": 4.123468862924418e-06, "loss": 0.4643, "num_input_tokens_seen": 116563424, "step": 95850 }, { "epoch": 12.010399699285804, "grad_norm": 0.29045984148979187, "learning_rate": 4.1229306207640555e-06, "loss": 0.4664, "num_input_tokens_seen": 116569056, "step": 95855 }, { "epoch": 12.011026187194588, "grad_norm": 0.22747893631458282, "learning_rate": 4.122392389089796e-06, "loss": 0.4598, "num_input_tokens_seen": 116574784, "step": 95860 }, { "epoch": 12.01165267510337, "grad_norm": 0.22787711024284363, "learning_rate": 4.121854167908072e-06, "loss": 0.4604, "num_input_tokens_seen": 116581120, "step": 95865 }, { "epoch": 12.012279163012154, "grad_norm": 0.5100902915000916, "learning_rate": 4.12131595722532e-06, "loss": 0.462, "num_input_tokens_seen": 116587360, "step": 95870 }, { "epoch": 12.012905650920937, "grad_norm": 0.283631294965744, "learning_rate": 4.120777757047976e-06, "loss": 0.4627, "num_input_tokens_seen": 116592768, "step": 95875 }, { "epoch": 12.01353213882972, "grad_norm": 0.2291126847267151, "learning_rate": 4.120239567382471e-06, "loss": 0.4578, "num_input_tokens_seen": 116598976, "step": 95880 }, { "epoch": 12.014158626738505, "grad_norm": 0.21227121353149414, "learning_rate": 4.119701388235245e-06, "loss": 0.4602, "num_input_tokens_seen": 116605280, "step": 95885 }, { "epoch": 12.014785114647287, "grad_norm": 0.3049676716327667, "learning_rate": 4.119163219612726e-06, "loss": 0.4443, "num_input_tokens_seen": 116611072, "step": 95890 }, { "epoch": 12.015411602556071, "grad_norm": 0.2815326750278473, "learning_rate": 4.118625061521353e-06, "loss": 0.4526, "num_input_tokens_seen": 116617280, "step": 95895 }, { "epoch": 12.016038090464853, "grad_norm": 0.2669164836406708, "learning_rate": 4.118086913967556e-06, "loss": 0.4608, "num_input_tokens_seen": 116623200, "step": 95900 }, { "epoch": 12.016664578373637, "grad_norm": 0.3110339939594269, "learning_rate": 4.117548776957774e-06, "loss": 0.4567, "num_input_tokens_seen": 116629216, "step": 95905 }, { "epoch": 12.017291066282421, "grad_norm": 3.9222686290740967, "learning_rate": 4.1170106504984366e-06, "loss": 0.4745, "num_input_tokens_seen": 116635168, "step": 95910 }, { "epoch": 12.017917554191204, "grad_norm": 0.28235694766044617, "learning_rate": 4.11647253459598e-06, "loss": 0.4521, "num_input_tokens_seen": 116641248, "step": 95915 }, { "epoch": 12.018544042099988, "grad_norm": 0.2396206110715866, "learning_rate": 4.1159344292568374e-06, "loss": 0.4858, "num_input_tokens_seen": 116647296, "step": 95920 }, { "epoch": 12.01917053000877, "grad_norm": 0.3864910900592804, "learning_rate": 4.11539633448744e-06, "loss": 0.4514, "num_input_tokens_seen": 116653248, "step": 95925 }, { "epoch": 12.019797017917554, "grad_norm": 0.4136694073677063, "learning_rate": 4.114858250294226e-06, "loss": 0.4619, "num_input_tokens_seen": 116659328, "step": 95930 }, { "epoch": 12.020423505826338, "grad_norm": 0.17170916497707367, "learning_rate": 4.114320176683623e-06, "loss": 0.4621, "num_input_tokens_seen": 116665504, "step": 95935 }, { "epoch": 12.02104999373512, "grad_norm": 0.22137993574142456, "learning_rate": 4.113782113662071e-06, "loss": 0.4638, "num_input_tokens_seen": 116671520, "step": 95940 }, { "epoch": 12.021676481643905, "grad_norm": 0.24361872673034668, "learning_rate": 4.113244061235995e-06, "loss": 0.4514, "num_input_tokens_seen": 116677248, "step": 95945 }, { "epoch": 12.022302969552687, "grad_norm": 0.39691030979156494, "learning_rate": 4.1127060194118346e-06, "loss": 0.4727, "num_input_tokens_seen": 116683488, "step": 95950 }, { "epoch": 12.022929457461471, "grad_norm": 0.24456073343753815, "learning_rate": 4.1121679881960175e-06, "loss": 0.4623, "num_input_tokens_seen": 116689824, "step": 95955 }, { "epoch": 12.023555945370255, "grad_norm": 0.2446419894695282, "learning_rate": 4.1116299675949785e-06, "loss": 0.4577, "num_input_tokens_seen": 116695872, "step": 95960 }, { "epoch": 12.024182433279037, "grad_norm": 0.3221752941608429, "learning_rate": 4.111091957615151e-06, "loss": 0.4528, "num_input_tokens_seen": 116702208, "step": 95965 }, { "epoch": 12.024808921187821, "grad_norm": 0.22229434549808502, "learning_rate": 4.1105539582629674e-06, "loss": 0.4652, "num_input_tokens_seen": 116708192, "step": 95970 }, { "epoch": 12.025435409096604, "grad_norm": 0.21575435996055603, "learning_rate": 4.110015969544857e-06, "loss": 0.4597, "num_input_tokens_seen": 116714176, "step": 95975 }, { "epoch": 12.026061897005388, "grad_norm": 0.27173709869384766, "learning_rate": 4.109477991467254e-06, "loss": 0.4545, "num_input_tokens_seen": 116719872, "step": 95980 }, { "epoch": 12.026688384914172, "grad_norm": 0.33102452754974365, "learning_rate": 4.108940024036593e-06, "loss": 0.452, "num_input_tokens_seen": 116726208, "step": 95985 }, { "epoch": 12.027314872822954, "grad_norm": 0.20555245876312256, "learning_rate": 4.108402067259301e-06, "loss": 0.4701, "num_input_tokens_seen": 116732096, "step": 95990 }, { "epoch": 12.027941360731738, "grad_norm": 0.19720490276813507, "learning_rate": 4.1078641211418135e-06, "loss": 0.4646, "num_input_tokens_seen": 116738432, "step": 95995 }, { "epoch": 12.028567848640522, "grad_norm": 0.25962090492248535, "learning_rate": 4.107326185690559e-06, "loss": 0.4569, "num_input_tokens_seen": 116744576, "step": 96000 }, { "epoch": 12.029194336549304, "grad_norm": 0.23462538421154022, "learning_rate": 4.106788260911971e-06, "loss": 0.4595, "num_input_tokens_seen": 116750784, "step": 96005 }, { "epoch": 12.029820824458088, "grad_norm": 0.24655179679393768, "learning_rate": 4.10625034681248e-06, "loss": 0.4551, "num_input_tokens_seen": 116756704, "step": 96010 }, { "epoch": 12.03044731236687, "grad_norm": 0.2296680510044098, "learning_rate": 4.105712443398518e-06, "loss": 0.4589, "num_input_tokens_seen": 116762592, "step": 96015 }, { "epoch": 12.031073800275655, "grad_norm": 0.3028974235057831, "learning_rate": 4.1051745506765145e-06, "loss": 0.461, "num_input_tokens_seen": 116768832, "step": 96020 }, { "epoch": 12.031700288184439, "grad_norm": 0.22362489998340607, "learning_rate": 4.104636668652904e-06, "loss": 0.4575, "num_input_tokens_seen": 116775104, "step": 96025 }, { "epoch": 12.032326776093221, "grad_norm": 0.21448400616645813, "learning_rate": 4.104098797334112e-06, "loss": 0.4527, "num_input_tokens_seen": 116780928, "step": 96030 }, { "epoch": 12.032953264002005, "grad_norm": 0.26317882537841797, "learning_rate": 4.103560936726575e-06, "loss": 0.467, "num_input_tokens_seen": 116786848, "step": 96035 }, { "epoch": 12.033579751910787, "grad_norm": 0.25112172961235046, "learning_rate": 4.103023086836718e-06, "loss": 0.4705, "num_input_tokens_seen": 116792864, "step": 96040 }, { "epoch": 12.034206239819571, "grad_norm": 0.1903909295797348, "learning_rate": 4.102485247670975e-06, "loss": 0.4696, "num_input_tokens_seen": 116799104, "step": 96045 }, { "epoch": 12.034832727728356, "grad_norm": 0.20264898240566254, "learning_rate": 4.1019474192357765e-06, "loss": 0.4668, "num_input_tokens_seen": 116805440, "step": 96050 }, { "epoch": 12.035459215637138, "grad_norm": 0.26039963960647583, "learning_rate": 4.101409601537551e-06, "loss": 0.4582, "num_input_tokens_seen": 116811616, "step": 96055 }, { "epoch": 12.036085703545922, "grad_norm": 0.30064326524734497, "learning_rate": 4.100871794582728e-06, "loss": 0.4686, "num_input_tokens_seen": 116817632, "step": 96060 }, { "epoch": 12.036712191454704, "grad_norm": 0.2292732149362564, "learning_rate": 4.1003339983777405e-06, "loss": 0.4644, "num_input_tokens_seen": 116823648, "step": 96065 }, { "epoch": 12.037338679363488, "grad_norm": 0.20506495237350464, "learning_rate": 4.099796212929016e-06, "loss": 0.4576, "num_input_tokens_seen": 116829760, "step": 96070 }, { "epoch": 12.037965167272272, "grad_norm": 0.19666032493114471, "learning_rate": 4.0992584382429825e-06, "loss": 0.4533, "num_input_tokens_seen": 116835776, "step": 96075 }, { "epoch": 12.038591655181055, "grad_norm": 0.3769230246543884, "learning_rate": 4.098720674326075e-06, "loss": 0.463, "num_input_tokens_seen": 116842048, "step": 96080 }, { "epoch": 12.039218143089839, "grad_norm": 0.2211107611656189, "learning_rate": 4.098182921184718e-06, "loss": 0.4711, "num_input_tokens_seen": 116848416, "step": 96085 }, { "epoch": 12.039844630998621, "grad_norm": 0.18694357573986053, "learning_rate": 4.0976451788253435e-06, "loss": 0.461, "num_input_tokens_seen": 116854304, "step": 96090 }, { "epoch": 12.040471118907405, "grad_norm": 0.18820153176784515, "learning_rate": 4.097107447254377e-06, "loss": 0.4644, "num_input_tokens_seen": 116860480, "step": 96095 }, { "epoch": 12.041097606816189, "grad_norm": 0.2297682762145996, "learning_rate": 4.096569726478251e-06, "loss": 0.4629, "num_input_tokens_seen": 116866656, "step": 96100 }, { "epoch": 12.041724094724971, "grad_norm": 0.2676009237766266, "learning_rate": 4.096032016503395e-06, "loss": 0.4584, "num_input_tokens_seen": 116873120, "step": 96105 }, { "epoch": 12.042350582633755, "grad_norm": 0.1772189736366272, "learning_rate": 4.095494317336235e-06, "loss": 0.4589, "num_input_tokens_seen": 116879520, "step": 96110 }, { "epoch": 12.04297707054254, "grad_norm": 0.2604844868183136, "learning_rate": 4.094956628983201e-06, "loss": 0.461, "num_input_tokens_seen": 116885920, "step": 96115 }, { "epoch": 12.043603558451322, "grad_norm": 0.25455111265182495, "learning_rate": 4.094418951450721e-06, "loss": 0.4531, "num_input_tokens_seen": 116892160, "step": 96120 }, { "epoch": 12.044230046360106, "grad_norm": 0.23339760303497314, "learning_rate": 4.093881284745226e-06, "loss": 0.4624, "num_input_tokens_seen": 116898112, "step": 96125 }, { "epoch": 12.044856534268888, "grad_norm": 0.3521123230457306, "learning_rate": 4.093343628873139e-06, "loss": 0.4636, "num_input_tokens_seen": 116903616, "step": 96130 }, { "epoch": 12.045483022177672, "grad_norm": 0.3047453463077545, "learning_rate": 4.092805983840894e-06, "loss": 0.4564, "num_input_tokens_seen": 116909632, "step": 96135 }, { "epoch": 12.046109510086456, "grad_norm": 0.24690428376197815, "learning_rate": 4.092268349654914e-06, "loss": 0.4589, "num_input_tokens_seen": 116915904, "step": 96140 }, { "epoch": 12.046735997995238, "grad_norm": 0.3810194730758667, "learning_rate": 4.091730726321631e-06, "loss": 0.4595, "num_input_tokens_seen": 116921952, "step": 96145 }, { "epoch": 12.047362485904022, "grad_norm": 0.26739639043807983, "learning_rate": 4.091193113847469e-06, "loss": 0.4696, "num_input_tokens_seen": 116928192, "step": 96150 }, { "epoch": 12.047988973812805, "grad_norm": 0.28560444712638855, "learning_rate": 4.090655512238858e-06, "loss": 0.4568, "num_input_tokens_seen": 116933664, "step": 96155 }, { "epoch": 12.048615461721589, "grad_norm": 0.20557637512683868, "learning_rate": 4.0901179215022254e-06, "loss": 0.4528, "num_input_tokens_seen": 116939616, "step": 96160 }, { "epoch": 12.049241949630373, "grad_norm": 0.2588553726673126, "learning_rate": 4.089580341643997e-06, "loss": 0.4549, "num_input_tokens_seen": 116945888, "step": 96165 }, { "epoch": 12.049868437539155, "grad_norm": 0.2527804672718048, "learning_rate": 4.089042772670602e-06, "loss": 0.4526, "num_input_tokens_seen": 116952096, "step": 96170 }, { "epoch": 12.05049492544794, "grad_norm": 0.28609511256217957, "learning_rate": 4.0885052145884655e-06, "loss": 0.4538, "num_input_tokens_seen": 116958144, "step": 96175 }, { "epoch": 12.051121413356721, "grad_norm": 0.24826191365718842, "learning_rate": 4.087967667404017e-06, "loss": 0.4604, "num_input_tokens_seen": 116963360, "step": 96180 }, { "epoch": 12.051747901265506, "grad_norm": 0.23756009340286255, "learning_rate": 4.0874301311236805e-06, "loss": 0.463, "num_input_tokens_seen": 116969600, "step": 96185 }, { "epoch": 12.05237438917429, "grad_norm": 0.519637942314148, "learning_rate": 4.086892605753886e-06, "loss": 0.4711, "num_input_tokens_seen": 116975648, "step": 96190 }, { "epoch": 12.053000877083072, "grad_norm": 0.250904381275177, "learning_rate": 4.086355091301056e-06, "loss": 0.4718, "num_input_tokens_seen": 116981760, "step": 96195 }, { "epoch": 12.053627364991856, "grad_norm": 0.33396294713020325, "learning_rate": 4.085817587771621e-06, "loss": 0.4615, "num_input_tokens_seen": 116987808, "step": 96200 }, { "epoch": 12.054253852900638, "grad_norm": 0.2403445988893509, "learning_rate": 4.085280095172003e-06, "loss": 0.4522, "num_input_tokens_seen": 116994016, "step": 96205 }, { "epoch": 12.054880340809422, "grad_norm": 0.3405880928039551, "learning_rate": 4.084742613508632e-06, "loss": 0.4658, "num_input_tokens_seen": 117000160, "step": 96210 }, { "epoch": 12.055506828718206, "grad_norm": 0.3272424638271332, "learning_rate": 4.084205142787933e-06, "loss": 0.4586, "num_input_tokens_seen": 117006144, "step": 96215 }, { "epoch": 12.056133316626989, "grad_norm": 0.23636570572853088, "learning_rate": 4.08366768301633e-06, "loss": 0.4649, "num_input_tokens_seen": 117012384, "step": 96220 }, { "epoch": 12.056759804535773, "grad_norm": 0.3297704756259918, "learning_rate": 4.08313023420025e-06, "loss": 0.4585, "num_input_tokens_seen": 117018720, "step": 96225 }, { "epoch": 12.057386292444555, "grad_norm": 0.4289088845252991, "learning_rate": 4.082592796346119e-06, "loss": 0.4612, "num_input_tokens_seen": 117024352, "step": 96230 }, { "epoch": 12.058012780353339, "grad_norm": 0.34312695264816284, "learning_rate": 4.0820553694603646e-06, "loss": 0.4504, "num_input_tokens_seen": 117030144, "step": 96235 }, { "epoch": 12.058639268262123, "grad_norm": 0.3427548408508301, "learning_rate": 4.081517953549408e-06, "loss": 0.4454, "num_input_tokens_seen": 117036544, "step": 96240 }, { "epoch": 12.059265756170905, "grad_norm": 0.2351706176996231, "learning_rate": 4.080980548619679e-06, "loss": 0.4606, "num_input_tokens_seen": 117042752, "step": 96245 }, { "epoch": 12.05989224407969, "grad_norm": 0.27351635694503784, "learning_rate": 4.080443154677597e-06, "loss": 0.4598, "num_input_tokens_seen": 117048864, "step": 96250 }, { "epoch": 12.060518731988473, "grad_norm": 0.400346040725708, "learning_rate": 4.079905771729592e-06, "loss": 0.4637, "num_input_tokens_seen": 117055008, "step": 96255 }, { "epoch": 12.061145219897256, "grad_norm": 0.2755449116230011, "learning_rate": 4.079368399782086e-06, "loss": 0.4569, "num_input_tokens_seen": 117061376, "step": 96260 }, { "epoch": 12.06177170780604, "grad_norm": 0.2609354853630066, "learning_rate": 4.078831038841506e-06, "loss": 0.4579, "num_input_tokens_seen": 117067136, "step": 96265 }, { "epoch": 12.062398195714822, "grad_norm": 0.2733920216560364, "learning_rate": 4.078293688914274e-06, "loss": 0.4676, "num_input_tokens_seen": 117073152, "step": 96270 }, { "epoch": 12.063024683623606, "grad_norm": 0.3128570020198822, "learning_rate": 4.077756350006818e-06, "loss": 0.4655, "num_input_tokens_seen": 117079424, "step": 96275 }, { "epoch": 12.06365117153239, "grad_norm": 0.2752140164375305, "learning_rate": 4.077219022125558e-06, "loss": 0.4504, "num_input_tokens_seen": 117085600, "step": 96280 }, { "epoch": 12.064277659441172, "grad_norm": 0.43470633029937744, "learning_rate": 4.07668170527692e-06, "loss": 0.4556, "num_input_tokens_seen": 117091872, "step": 96285 }, { "epoch": 12.064904147349957, "grad_norm": 0.25700244307518005, "learning_rate": 4.076144399467331e-06, "loss": 0.4593, "num_input_tokens_seen": 117097440, "step": 96290 }, { "epoch": 12.065530635258739, "grad_norm": 0.33628812432289124, "learning_rate": 4.075607104703211e-06, "loss": 0.4609, "num_input_tokens_seen": 117103680, "step": 96295 }, { "epoch": 12.066157123167523, "grad_norm": 0.2509665787220001, "learning_rate": 4.075069820990987e-06, "loss": 0.4655, "num_input_tokens_seen": 117109984, "step": 96300 }, { "epoch": 12.066783611076307, "grad_norm": 0.25066205859184265, "learning_rate": 4.074532548337078e-06, "loss": 0.4483, "num_input_tokens_seen": 117116288, "step": 96305 }, { "epoch": 12.06741009898509, "grad_norm": 0.2695580720901489, "learning_rate": 4.0739952867479135e-06, "loss": 0.4597, "num_input_tokens_seen": 117122528, "step": 96310 }, { "epoch": 12.068036586893873, "grad_norm": 0.23390477895736694, "learning_rate": 4.073458036229912e-06, "loss": 0.4529, "num_input_tokens_seen": 117128800, "step": 96315 }, { "epoch": 12.068663074802656, "grad_norm": 0.3270919620990753, "learning_rate": 4.072920796789499e-06, "loss": 0.4652, "num_input_tokens_seen": 117134880, "step": 96320 }, { "epoch": 12.06928956271144, "grad_norm": 0.27379652857780457, "learning_rate": 4.072383568433096e-06, "loss": 0.4508, "num_input_tokens_seen": 117140896, "step": 96325 }, { "epoch": 12.069916050620224, "grad_norm": 1.874710202217102, "learning_rate": 4.0718463511671304e-06, "loss": 0.4545, "num_input_tokens_seen": 117146944, "step": 96330 }, { "epoch": 12.070542538529006, "grad_norm": 0.2547382712364197, "learning_rate": 4.071309144998019e-06, "loss": 0.4375, "num_input_tokens_seen": 117152864, "step": 96335 }, { "epoch": 12.07116902643779, "grad_norm": 0.4115822911262512, "learning_rate": 4.070771949932191e-06, "loss": 0.4707, "num_input_tokens_seen": 117159168, "step": 96340 }, { "epoch": 12.071795514346572, "grad_norm": 0.3326176404953003, "learning_rate": 4.0702347659760626e-06, "loss": 0.47, "num_input_tokens_seen": 117165312, "step": 96345 }, { "epoch": 12.072422002255356, "grad_norm": 0.3568112850189209, "learning_rate": 4.069697593136058e-06, "loss": 0.4367, "num_input_tokens_seen": 117171008, "step": 96350 }, { "epoch": 12.07304849016414, "grad_norm": 0.26999977231025696, "learning_rate": 4.0691604314186055e-06, "loss": 0.4553, "num_input_tokens_seen": 117176960, "step": 96355 }, { "epoch": 12.073674978072923, "grad_norm": 0.4915923774242401, "learning_rate": 4.068623280830119e-06, "loss": 0.469, "num_input_tokens_seen": 117183328, "step": 96360 }, { "epoch": 12.074301465981707, "grad_norm": 0.41613221168518066, "learning_rate": 4.068086141377026e-06, "loss": 0.4471, "num_input_tokens_seen": 117189664, "step": 96365 }, { "epoch": 12.07492795389049, "grad_norm": 0.3204829692840576, "learning_rate": 4.067549013065745e-06, "loss": 0.458, "num_input_tokens_seen": 117195648, "step": 96370 }, { "epoch": 12.075554441799273, "grad_norm": 0.3238886296749115, "learning_rate": 4.0670118959027015e-06, "loss": 0.4564, "num_input_tokens_seen": 117201312, "step": 96375 }, { "epoch": 12.076180929708057, "grad_norm": 0.36033472418785095, "learning_rate": 4.066474789894313e-06, "loss": 0.4546, "num_input_tokens_seen": 117206912, "step": 96380 }, { "epoch": 12.07680741761684, "grad_norm": 0.48233962059020996, "learning_rate": 4.065937695047006e-06, "loss": 0.4786, "num_input_tokens_seen": 117213152, "step": 96385 }, { "epoch": 12.077433905525623, "grad_norm": 0.34962278604507446, "learning_rate": 4.065400611367196e-06, "loss": 0.4566, "num_input_tokens_seen": 117219200, "step": 96390 }, { "epoch": 12.078060393434408, "grad_norm": 0.25722429156303406, "learning_rate": 4.064863538861311e-06, "loss": 0.4581, "num_input_tokens_seen": 117225408, "step": 96395 }, { "epoch": 12.07868688134319, "grad_norm": 0.2679958641529083, "learning_rate": 4.064326477535766e-06, "loss": 0.457, "num_input_tokens_seen": 117231744, "step": 96400 }, { "epoch": 12.079313369251974, "grad_norm": 0.2757944166660309, "learning_rate": 4.063789427396986e-06, "loss": 0.4587, "num_input_tokens_seen": 117237952, "step": 96405 }, { "epoch": 12.079939857160756, "grad_norm": 0.35924285650253296, "learning_rate": 4.06325238845139e-06, "loss": 0.4434, "num_input_tokens_seen": 117244160, "step": 96410 }, { "epoch": 12.08056634506954, "grad_norm": 0.54338538646698, "learning_rate": 4.062715360705398e-06, "loss": 0.4668, "num_input_tokens_seen": 117250048, "step": 96415 }, { "epoch": 12.081192832978324, "grad_norm": 0.3056754469871521, "learning_rate": 4.062178344165432e-06, "loss": 0.4535, "num_input_tokens_seen": 117256128, "step": 96420 }, { "epoch": 12.081819320887107, "grad_norm": 0.35744374990463257, "learning_rate": 4.061641338837915e-06, "loss": 0.4613, "num_input_tokens_seen": 117261664, "step": 96425 }, { "epoch": 12.08244580879589, "grad_norm": 0.2932782769203186, "learning_rate": 4.061104344729263e-06, "loss": 0.4616, "num_input_tokens_seen": 117267712, "step": 96430 }, { "epoch": 12.083072296704673, "grad_norm": 0.3552858233451843, "learning_rate": 4.060567361845897e-06, "loss": 0.4659, "num_input_tokens_seen": 117273856, "step": 96435 }, { "epoch": 12.083698784613457, "grad_norm": 0.28699588775634766, "learning_rate": 4.06003039019424e-06, "loss": 0.4617, "num_input_tokens_seen": 117279776, "step": 96440 }, { "epoch": 12.084325272522241, "grad_norm": 0.34499430656433105, "learning_rate": 4.059493429780709e-06, "loss": 0.4649, "num_input_tokens_seen": 117285472, "step": 96445 }, { "epoch": 12.084951760431023, "grad_norm": 0.3677873909473419, "learning_rate": 4.058956480611726e-06, "loss": 0.4477, "num_input_tokens_seen": 117291584, "step": 96450 }, { "epoch": 12.085578248339807, "grad_norm": 0.3237733244895935, "learning_rate": 4.058419542693707e-06, "loss": 0.4656, "num_input_tokens_seen": 117298080, "step": 96455 }, { "epoch": 12.08620473624859, "grad_norm": 0.3352545201778412, "learning_rate": 4.057882616033077e-06, "loss": 0.457, "num_input_tokens_seen": 117304064, "step": 96460 }, { "epoch": 12.086831224157374, "grad_norm": 0.32090410590171814, "learning_rate": 4.057345700636249e-06, "loss": 0.4616, "num_input_tokens_seen": 117309568, "step": 96465 }, { "epoch": 12.087457712066158, "grad_norm": 0.4320673644542694, "learning_rate": 4.056808796509647e-06, "loss": 0.4621, "num_input_tokens_seen": 117315680, "step": 96470 }, { "epoch": 12.08808419997494, "grad_norm": 0.5082090497016907, "learning_rate": 4.0562719036596894e-06, "loss": 0.4672, "num_input_tokens_seen": 117321952, "step": 96475 }, { "epoch": 12.088710687883724, "grad_norm": 0.41616544127464294, "learning_rate": 4.055735022092795e-06, "loss": 0.4614, "num_input_tokens_seen": 117328000, "step": 96480 }, { "epoch": 12.089337175792506, "grad_norm": 0.4153927266597748, "learning_rate": 4.055198151815381e-06, "loss": 0.4629, "num_input_tokens_seen": 117334208, "step": 96485 }, { "epoch": 12.08996366370129, "grad_norm": 0.29104095697402954, "learning_rate": 4.054661292833868e-06, "loss": 0.4553, "num_input_tokens_seen": 117340128, "step": 96490 }, { "epoch": 12.090590151610074, "grad_norm": 0.4746647775173187, "learning_rate": 4.054124445154675e-06, "loss": 0.4487, "num_input_tokens_seen": 117345952, "step": 96495 }, { "epoch": 12.091216639518857, "grad_norm": 0.3918133080005646, "learning_rate": 4.053587608784218e-06, "loss": 0.4549, "num_input_tokens_seen": 117352000, "step": 96500 }, { "epoch": 12.09184312742764, "grad_norm": 0.2531566619873047, "learning_rate": 4.053050783728919e-06, "loss": 0.4535, "num_input_tokens_seen": 117357728, "step": 96505 }, { "epoch": 12.092469615336425, "grad_norm": 0.5084396600723267, "learning_rate": 4.0525139699951925e-06, "loss": 0.4602, "num_input_tokens_seen": 117363680, "step": 96510 }, { "epoch": 12.093096103245207, "grad_norm": 0.3703406751155853, "learning_rate": 4.051977167589459e-06, "loss": 0.4856, "num_input_tokens_seen": 117369600, "step": 96515 }, { "epoch": 12.093722591153991, "grad_norm": 0.45946183800697327, "learning_rate": 4.051440376518134e-06, "loss": 0.4504, "num_input_tokens_seen": 117375712, "step": 96520 }, { "epoch": 12.094349079062773, "grad_norm": 0.5642426013946533, "learning_rate": 4.0509035967876385e-06, "loss": 0.4669, "num_input_tokens_seen": 117381632, "step": 96525 }, { "epoch": 12.094975566971558, "grad_norm": 0.35713526606559753, "learning_rate": 4.050366828404386e-06, "loss": 0.4486, "num_input_tokens_seen": 117387744, "step": 96530 }, { "epoch": 12.095602054880342, "grad_norm": 0.3356284499168396, "learning_rate": 4.049830071374799e-06, "loss": 0.4621, "num_input_tokens_seen": 117393792, "step": 96535 }, { "epoch": 12.096228542789124, "grad_norm": 0.29517608880996704, "learning_rate": 4.049293325705291e-06, "loss": 0.4542, "num_input_tokens_seen": 117399776, "step": 96540 }, { "epoch": 12.096855030697908, "grad_norm": 0.37709954380989075, "learning_rate": 4.0487565914022795e-06, "loss": 0.457, "num_input_tokens_seen": 117405696, "step": 96545 }, { "epoch": 12.09748151860669, "grad_norm": 7.212655544281006, "learning_rate": 4.048219868472185e-06, "loss": 0.5032, "num_input_tokens_seen": 117411936, "step": 96550 }, { "epoch": 12.098108006515474, "grad_norm": 0.41865283250808716, "learning_rate": 4.047683156921422e-06, "loss": 0.4576, "num_input_tokens_seen": 117418336, "step": 96555 }, { "epoch": 12.098734494424258, "grad_norm": 0.4084320366382599, "learning_rate": 4.047146456756407e-06, "loss": 0.4433, "num_input_tokens_seen": 117424640, "step": 96560 }, { "epoch": 12.09936098233304, "grad_norm": 0.39602768421173096, "learning_rate": 4.046609767983557e-06, "loss": 0.4707, "num_input_tokens_seen": 117430880, "step": 96565 }, { "epoch": 12.099987470241825, "grad_norm": 0.3507007658481598, "learning_rate": 4.046073090609289e-06, "loss": 0.4397, "num_input_tokens_seen": 117436960, "step": 96570 }, { "epoch": 12.100613958150607, "grad_norm": 0.5574498772621155, "learning_rate": 4.045536424640019e-06, "loss": 0.4565, "num_input_tokens_seen": 117443168, "step": 96575 }, { "epoch": 12.101240446059391, "grad_norm": 0.39997178316116333, "learning_rate": 4.044999770082164e-06, "loss": 0.4492, "num_input_tokens_seen": 117449248, "step": 96580 }, { "epoch": 12.101866933968175, "grad_norm": 0.376949280500412, "learning_rate": 4.044463126942138e-06, "loss": 0.4839, "num_input_tokens_seen": 117455520, "step": 96585 }, { "epoch": 12.102493421876957, "grad_norm": 0.41682446002960205, "learning_rate": 4.0439264952263615e-06, "loss": 0.45, "num_input_tokens_seen": 117461632, "step": 96590 }, { "epoch": 12.103119909785741, "grad_norm": 0.5499851107597351, "learning_rate": 4.043389874941245e-06, "loss": 0.4592, "num_input_tokens_seen": 117467712, "step": 96595 }, { "epoch": 12.103746397694524, "grad_norm": 0.566578209400177, "learning_rate": 4.042853266093207e-06, "loss": 0.4682, "num_input_tokens_seen": 117473792, "step": 96600 }, { "epoch": 12.104372885603308, "grad_norm": 0.40580999851226807, "learning_rate": 4.042316668688665e-06, "loss": 0.4552, "num_input_tokens_seen": 117479872, "step": 96605 }, { "epoch": 12.104999373512092, "grad_norm": 0.37036994099617004, "learning_rate": 4.041780082734031e-06, "loss": 0.4581, "num_input_tokens_seen": 117485760, "step": 96610 }, { "epoch": 12.105625861420874, "grad_norm": 0.6413335204124451, "learning_rate": 4.041243508235723e-06, "loss": 0.4567, "num_input_tokens_seen": 117491776, "step": 96615 }, { "epoch": 12.106252349329658, "grad_norm": 0.4573367238044739, "learning_rate": 4.040706945200152e-06, "loss": 0.4393, "num_input_tokens_seen": 117497632, "step": 96620 }, { "epoch": 12.106878837238442, "grad_norm": 0.5613833665847778, "learning_rate": 4.0401703936337386e-06, "loss": 0.4735, "num_input_tokens_seen": 117503456, "step": 96625 }, { "epoch": 12.107505325147224, "grad_norm": 0.5679619908332825, "learning_rate": 4.039633853542895e-06, "loss": 0.4585, "num_input_tokens_seen": 117509536, "step": 96630 }, { "epoch": 12.108131813056008, "grad_norm": 0.6487178206443787, "learning_rate": 4.039097324934035e-06, "loss": 0.472, "num_input_tokens_seen": 117515584, "step": 96635 }, { "epoch": 12.10875830096479, "grad_norm": 0.6856000423431396, "learning_rate": 4.038560807813574e-06, "loss": 0.4551, "num_input_tokens_seen": 117521632, "step": 96640 }, { "epoch": 12.109384788873575, "grad_norm": 0.5453717112541199, "learning_rate": 4.03802430218793e-06, "loss": 0.4597, "num_input_tokens_seen": 117527808, "step": 96645 }, { "epoch": 12.110011276782359, "grad_norm": 0.5171849131584167, "learning_rate": 4.03748780806351e-06, "loss": 0.4514, "num_input_tokens_seen": 117534048, "step": 96650 }, { "epoch": 12.110637764691141, "grad_norm": 0.8860195875167847, "learning_rate": 4.0369513254467345e-06, "loss": 0.4433, "num_input_tokens_seen": 117539872, "step": 96655 }, { "epoch": 12.111264252599925, "grad_norm": 0.6773802638053894, "learning_rate": 4.036414854344016e-06, "loss": 0.4673, "num_input_tokens_seen": 117545984, "step": 96660 }, { "epoch": 12.111890740508708, "grad_norm": 0.9345718622207642, "learning_rate": 4.035878394761767e-06, "loss": 0.4263, "num_input_tokens_seen": 117552224, "step": 96665 }, { "epoch": 12.112517228417492, "grad_norm": 0.6923988461494446, "learning_rate": 4.035341946706405e-06, "loss": 0.4782, "num_input_tokens_seen": 117557792, "step": 96670 }, { "epoch": 12.113143716326276, "grad_norm": 2.1123640537261963, "learning_rate": 4.034805510184338e-06, "loss": 0.453, "num_input_tokens_seen": 117563648, "step": 96675 }, { "epoch": 12.113770204235058, "grad_norm": 1.7175939083099365, "learning_rate": 4.0342690852019835e-06, "loss": 0.4383, "num_input_tokens_seen": 117570080, "step": 96680 }, { "epoch": 12.114396692143842, "grad_norm": 0.9226595759391785, "learning_rate": 4.033732671765755e-06, "loss": 0.476, "num_input_tokens_seen": 117576192, "step": 96685 }, { "epoch": 12.115023180052624, "grad_norm": 1.4489402770996094, "learning_rate": 4.033196269882063e-06, "loss": 0.4703, "num_input_tokens_seen": 117582400, "step": 96690 }, { "epoch": 12.115649667961408, "grad_norm": 2.149064779281616, "learning_rate": 4.032659879557323e-06, "loss": 0.4836, "num_input_tokens_seen": 117589056, "step": 96695 }, { "epoch": 12.116276155870192, "grad_norm": 0.916454017162323, "learning_rate": 4.032123500797949e-06, "loss": 0.4374, "num_input_tokens_seen": 117595072, "step": 96700 }, { "epoch": 12.116902643778975, "grad_norm": 0.8104841709136963, "learning_rate": 4.031587133610351e-06, "loss": 0.4582, "num_input_tokens_seen": 117600736, "step": 96705 }, { "epoch": 12.117529131687759, "grad_norm": 1.8632910251617432, "learning_rate": 4.031050778000945e-06, "loss": 0.4317, "num_input_tokens_seen": 117606656, "step": 96710 }, { "epoch": 12.118155619596541, "grad_norm": 1.125922679901123, "learning_rate": 4.0305144339761385e-06, "loss": 0.4128, "num_input_tokens_seen": 117612736, "step": 96715 }, { "epoch": 12.118782107505325, "grad_norm": 1.5079807043075562, "learning_rate": 4.029978101542346e-06, "loss": 0.5183, "num_input_tokens_seen": 117618304, "step": 96720 }, { "epoch": 12.119408595414109, "grad_norm": 1.1605249643325806, "learning_rate": 4.029441780705985e-06, "loss": 0.4836, "num_input_tokens_seen": 117624352, "step": 96725 }, { "epoch": 12.120035083322891, "grad_norm": 0.7784322500228882, "learning_rate": 4.028905471473461e-06, "loss": 0.4642, "num_input_tokens_seen": 117630336, "step": 96730 }, { "epoch": 12.120661571231675, "grad_norm": 4.625369071960449, "learning_rate": 4.0283691738511895e-06, "loss": 0.4507, "num_input_tokens_seen": 117636480, "step": 96735 }, { "epoch": 12.121288059140458, "grad_norm": 1.1468678712844849, "learning_rate": 4.0278328878455805e-06, "loss": 0.4387, "num_input_tokens_seen": 117642592, "step": 96740 }, { "epoch": 12.121914547049242, "grad_norm": 2.4429991245269775, "learning_rate": 4.027296613463049e-06, "loss": 0.505, "num_input_tokens_seen": 117648800, "step": 96745 }, { "epoch": 12.122541034958026, "grad_norm": 1.0095741748809814, "learning_rate": 4.026760350710001e-06, "loss": 0.4707, "num_input_tokens_seen": 117655072, "step": 96750 }, { "epoch": 12.123167522866808, "grad_norm": 0.8034944534301758, "learning_rate": 4.026224099592854e-06, "loss": 0.4538, "num_input_tokens_seen": 117661184, "step": 96755 }, { "epoch": 12.123794010775592, "grad_norm": 0.4440939724445343, "learning_rate": 4.025687860118015e-06, "loss": 0.4996, "num_input_tokens_seen": 117667488, "step": 96760 }, { "epoch": 12.124420498684376, "grad_norm": 0.5752756595611572, "learning_rate": 4.0251516322918985e-06, "loss": 0.4772, "num_input_tokens_seen": 117673600, "step": 96765 }, { "epoch": 12.125046986593158, "grad_norm": 0.3388996720314026, "learning_rate": 4.024615416120912e-06, "loss": 0.4496, "num_input_tokens_seen": 117679456, "step": 96770 }, { "epoch": 12.125673474501943, "grad_norm": 0.9695019125938416, "learning_rate": 4.024079211611469e-06, "loss": 0.4795, "num_input_tokens_seen": 117685632, "step": 96775 }, { "epoch": 12.126299962410725, "grad_norm": 0.37283608317375183, "learning_rate": 4.02354301876998e-06, "loss": 0.4532, "num_input_tokens_seen": 117691872, "step": 96780 }, { "epoch": 12.126926450319509, "grad_norm": 0.4824283719062805, "learning_rate": 4.023006837602853e-06, "loss": 0.4655, "num_input_tokens_seen": 117697952, "step": 96785 }, { "epoch": 12.127552938228293, "grad_norm": 0.9706869125366211, "learning_rate": 4.0224706681165025e-06, "loss": 0.4589, "num_input_tokens_seen": 117704000, "step": 96790 }, { "epoch": 12.128179426137075, "grad_norm": 0.613161563873291, "learning_rate": 4.021934510317336e-06, "loss": 0.4833, "num_input_tokens_seen": 117710176, "step": 96795 }, { "epoch": 12.12880591404586, "grad_norm": 0.7010231614112854, "learning_rate": 4.021398364211767e-06, "loss": 0.4619, "num_input_tokens_seen": 117716320, "step": 96800 }, { "epoch": 12.129432401954642, "grad_norm": 2.419438123703003, "learning_rate": 4.020862229806201e-06, "loss": 0.5365, "num_input_tokens_seen": 117722720, "step": 96805 }, { "epoch": 12.130058889863426, "grad_norm": 0.46370992064476013, "learning_rate": 4.0203261071070525e-06, "loss": 0.4465, "num_input_tokens_seen": 117728896, "step": 96810 }, { "epoch": 12.13068537777221, "grad_norm": 0.4718969762325287, "learning_rate": 4.0197899961207265e-06, "loss": 0.4676, "num_input_tokens_seen": 117735168, "step": 96815 }, { "epoch": 12.131311865680992, "grad_norm": 0.5862104296684265, "learning_rate": 4.0192538968536375e-06, "loss": 0.4581, "num_input_tokens_seen": 117741024, "step": 96820 }, { "epoch": 12.131938353589776, "grad_norm": 0.78713059425354, "learning_rate": 4.018717809312191e-06, "loss": 0.4675, "num_input_tokens_seen": 117746336, "step": 96825 }, { "epoch": 12.132564841498558, "grad_norm": 0.2425789088010788, "learning_rate": 4.018181733502799e-06, "loss": 0.4732, "num_input_tokens_seen": 117751840, "step": 96830 }, { "epoch": 12.133191329407342, "grad_norm": 0.36825159192085266, "learning_rate": 4.01764566943187e-06, "loss": 0.4517, "num_input_tokens_seen": 117758496, "step": 96835 }, { "epoch": 12.133817817316126, "grad_norm": 0.34506288170814514, "learning_rate": 4.017109617105812e-06, "loss": 0.4578, "num_input_tokens_seen": 117765056, "step": 96840 }, { "epoch": 12.134444305224909, "grad_norm": 0.5290457010269165, "learning_rate": 4.016573576531034e-06, "loss": 0.4491, "num_input_tokens_seen": 117770720, "step": 96845 }, { "epoch": 12.135070793133693, "grad_norm": 0.9441006779670715, "learning_rate": 4.016037547713947e-06, "loss": 0.4598, "num_input_tokens_seen": 117776896, "step": 96850 }, { "epoch": 12.135697281042475, "grad_norm": 0.37248894572257996, "learning_rate": 4.0155015306609595e-06, "loss": 0.4582, "num_input_tokens_seen": 117782944, "step": 96855 }, { "epoch": 12.136323768951259, "grad_norm": 0.40371549129486084, "learning_rate": 4.014965525378477e-06, "loss": 0.4605, "num_input_tokens_seen": 117788864, "step": 96860 }, { "epoch": 12.136950256860043, "grad_norm": 0.527023434638977, "learning_rate": 4.014429531872912e-06, "loss": 0.4522, "num_input_tokens_seen": 117794752, "step": 96865 }, { "epoch": 12.137576744768825, "grad_norm": 0.5301707983016968, "learning_rate": 4.013893550150668e-06, "loss": 0.4557, "num_input_tokens_seen": 117800832, "step": 96870 }, { "epoch": 12.13820323267761, "grad_norm": 1.1821831464767456, "learning_rate": 4.013357580218159e-06, "loss": 0.4593, "num_input_tokens_seen": 117807040, "step": 96875 }, { "epoch": 12.138829720586394, "grad_norm": 2.6342194080352783, "learning_rate": 4.012821622081786e-06, "loss": 0.4501, "num_input_tokens_seen": 117813088, "step": 96880 }, { "epoch": 12.139456208495176, "grad_norm": 5.433765411376953, "learning_rate": 4.012285675747963e-06, "loss": 0.4763, "num_input_tokens_seen": 117819168, "step": 96885 }, { "epoch": 12.14008269640396, "grad_norm": 1.4678218364715576, "learning_rate": 4.011749741223093e-06, "loss": 0.4568, "num_input_tokens_seen": 117825408, "step": 96890 }, { "epoch": 12.140709184312742, "grad_norm": 0.46592292189598083, "learning_rate": 4.011213818513589e-06, "loss": 0.4609, "num_input_tokens_seen": 117831648, "step": 96895 }, { "epoch": 12.141335672221526, "grad_norm": 0.2836517095565796, "learning_rate": 4.0106779076258515e-06, "loss": 0.4632, "num_input_tokens_seen": 117837920, "step": 96900 }, { "epoch": 12.14196216013031, "grad_norm": 0.3322550654411316, "learning_rate": 4.010142008566292e-06, "loss": 0.4636, "num_input_tokens_seen": 117844096, "step": 96905 }, { "epoch": 12.142588648039093, "grad_norm": 0.4776509404182434, "learning_rate": 4.00960612134132e-06, "loss": 0.4506, "num_input_tokens_seen": 117850080, "step": 96910 }, { "epoch": 12.143215135947877, "grad_norm": 1.2927682399749756, "learning_rate": 4.009070245957336e-06, "loss": 0.4567, "num_input_tokens_seen": 117856256, "step": 96915 }, { "epoch": 12.143841623856659, "grad_norm": 0.3689250349998474, "learning_rate": 4.008534382420754e-06, "loss": 0.466, "num_input_tokens_seen": 117862496, "step": 96920 }, { "epoch": 12.144468111765443, "grad_norm": 0.7406563758850098, "learning_rate": 4.007998530737973e-06, "loss": 0.4606, "num_input_tokens_seen": 117868448, "step": 96925 }, { "epoch": 12.145094599674227, "grad_norm": 0.50711590051651, "learning_rate": 4.0074626909154064e-06, "loss": 0.4633, "num_input_tokens_seen": 117874816, "step": 96930 }, { "epoch": 12.14572108758301, "grad_norm": 0.2555745840072632, "learning_rate": 4.006926862959456e-06, "loss": 0.451, "num_input_tokens_seen": 117880736, "step": 96935 }, { "epoch": 12.146347575491793, "grad_norm": 0.32909753918647766, "learning_rate": 4.006391046876532e-06, "loss": 0.4486, "num_input_tokens_seen": 117886656, "step": 96940 }, { "epoch": 12.146974063400576, "grad_norm": 0.6187157034873962, "learning_rate": 4.0058552426730355e-06, "loss": 0.4744, "num_input_tokens_seen": 117893088, "step": 96945 }, { "epoch": 12.14760055130936, "grad_norm": 0.37551358342170715, "learning_rate": 4.005319450355379e-06, "loss": 0.4743, "num_input_tokens_seen": 117899296, "step": 96950 }, { "epoch": 12.148227039218144, "grad_norm": 0.37828564643859863, "learning_rate": 4.004783669929961e-06, "loss": 0.4692, "num_input_tokens_seen": 117905184, "step": 96955 }, { "epoch": 12.148853527126926, "grad_norm": 0.7169419527053833, "learning_rate": 4.004247901403195e-06, "loss": 0.4661, "num_input_tokens_seen": 117911232, "step": 96960 }, { "epoch": 12.14948001503571, "grad_norm": 0.5681232213973999, "learning_rate": 4.003712144781479e-06, "loss": 0.4611, "num_input_tokens_seen": 117917664, "step": 96965 }, { "epoch": 12.150106502944492, "grad_norm": 0.40199124813079834, "learning_rate": 4.003176400071223e-06, "loss": 0.4585, "num_input_tokens_seen": 117923936, "step": 96970 }, { "epoch": 12.150732990853276, "grad_norm": 0.46799707412719727, "learning_rate": 4.002640667278832e-06, "loss": 0.4984, "num_input_tokens_seen": 117930208, "step": 96975 }, { "epoch": 12.15135947876206, "grad_norm": 0.5958288311958313, "learning_rate": 4.0021049464107095e-06, "loss": 0.4482, "num_input_tokens_seen": 117936416, "step": 96980 }, { "epoch": 12.151985966670843, "grad_norm": 0.39047616720199585, "learning_rate": 4.001569237473263e-06, "loss": 0.4736, "num_input_tokens_seen": 117942624, "step": 96985 }, { "epoch": 12.152612454579627, "grad_norm": 0.348389595746994, "learning_rate": 4.001033540472893e-06, "loss": 0.4657, "num_input_tokens_seen": 117948768, "step": 96990 }, { "epoch": 12.15323894248841, "grad_norm": 0.47115856409072876, "learning_rate": 4.000497855416009e-06, "loss": 0.4553, "num_input_tokens_seen": 117953952, "step": 96995 }, { "epoch": 12.153865430397193, "grad_norm": 6.87177038192749, "learning_rate": 3.999962182309012e-06, "loss": 0.5035, "num_input_tokens_seen": 117960192, "step": 97000 }, { "epoch": 12.154491918305977, "grad_norm": 0.523418664932251, "learning_rate": 3.99942652115831e-06, "loss": 0.4533, "num_input_tokens_seen": 117966496, "step": 97005 }, { "epoch": 12.15511840621476, "grad_norm": 0.473827064037323, "learning_rate": 3.998890871970302e-06, "loss": 0.4516, "num_input_tokens_seen": 117972576, "step": 97010 }, { "epoch": 12.155744894123544, "grad_norm": 0.6078771352767944, "learning_rate": 3.998355234751398e-06, "loss": 0.5018, "num_input_tokens_seen": 117978720, "step": 97015 }, { "epoch": 12.156371382032328, "grad_norm": 0.5407159924507141, "learning_rate": 3.997819609507998e-06, "loss": 0.4699, "num_input_tokens_seen": 117984608, "step": 97020 }, { "epoch": 12.15699786994111, "grad_norm": 0.44939038157463074, "learning_rate": 3.997283996246509e-06, "loss": 0.4586, "num_input_tokens_seen": 117990560, "step": 97025 }, { "epoch": 12.157624357849894, "grad_norm": 0.2968454957008362, "learning_rate": 3.996748394973331e-06, "loss": 0.4721, "num_input_tokens_seen": 117996768, "step": 97030 }, { "epoch": 12.158250845758676, "grad_norm": 0.32001301646232605, "learning_rate": 3.996212805694869e-06, "loss": 0.4891, "num_input_tokens_seen": 118002656, "step": 97035 }, { "epoch": 12.15887733366746, "grad_norm": 0.36396539211273193, "learning_rate": 3.9956772284175285e-06, "loss": 0.447, "num_input_tokens_seen": 118008576, "step": 97040 }, { "epoch": 12.159503821576244, "grad_norm": 1.198667049407959, "learning_rate": 3.99514166314771e-06, "loss": 0.4643, "num_input_tokens_seen": 118014816, "step": 97045 }, { "epoch": 12.160130309485027, "grad_norm": 0.41986772418022156, "learning_rate": 3.994606109891818e-06, "loss": 0.4578, "num_input_tokens_seen": 118020960, "step": 97050 }, { "epoch": 12.16075679739381, "grad_norm": 0.44169872999191284, "learning_rate": 3.994070568656254e-06, "loss": 0.4937, "num_input_tokens_seen": 118027136, "step": 97055 }, { "epoch": 12.161383285302593, "grad_norm": 0.5658502578735352, "learning_rate": 3.993535039447426e-06, "loss": 0.4924, "num_input_tokens_seen": 118033056, "step": 97060 }, { "epoch": 12.162009773211377, "grad_norm": 0.307937353849411, "learning_rate": 3.9929995222717285e-06, "loss": 0.4504, "num_input_tokens_seen": 118039008, "step": 97065 }, { "epoch": 12.162636261120161, "grad_norm": 1.914090633392334, "learning_rate": 3.992464017135572e-06, "loss": 0.4743, "num_input_tokens_seen": 118045120, "step": 97070 }, { "epoch": 12.163262749028943, "grad_norm": 0.3909760117530823, "learning_rate": 3.991928524045352e-06, "loss": 0.4542, "num_input_tokens_seen": 118051328, "step": 97075 }, { "epoch": 12.163889236937727, "grad_norm": 0.31005826592445374, "learning_rate": 3.9913930430074775e-06, "loss": 0.4678, "num_input_tokens_seen": 118057664, "step": 97080 }, { "epoch": 12.16451572484651, "grad_norm": 0.46393683552742004, "learning_rate": 3.990857574028344e-06, "loss": 0.4342, "num_input_tokens_seen": 118064032, "step": 97085 }, { "epoch": 12.165142212755294, "grad_norm": 0.3749698996543884, "learning_rate": 3.9903221171143565e-06, "loss": 0.4902, "num_input_tokens_seen": 118070304, "step": 97090 }, { "epoch": 12.165768700664078, "grad_norm": 0.6302978992462158, "learning_rate": 3.989786672271918e-06, "loss": 0.4677, "num_input_tokens_seen": 118076384, "step": 97095 }, { "epoch": 12.16639518857286, "grad_norm": 0.3036552369594574, "learning_rate": 3.98925123950743e-06, "loss": 0.4614, "num_input_tokens_seen": 118082400, "step": 97100 }, { "epoch": 12.167021676481644, "grad_norm": 0.6338998675346375, "learning_rate": 3.988715818827293e-06, "loss": 0.4673, "num_input_tokens_seen": 118088672, "step": 97105 }, { "epoch": 12.167648164390426, "grad_norm": 0.5155916810035706, "learning_rate": 3.988180410237907e-06, "loss": 0.4434, "num_input_tokens_seen": 118094688, "step": 97110 }, { "epoch": 12.16827465229921, "grad_norm": 0.5305122137069702, "learning_rate": 3.987645013745677e-06, "loss": 0.4477, "num_input_tokens_seen": 118100864, "step": 97115 }, { "epoch": 12.168901140207995, "grad_norm": 0.3567751944065094, "learning_rate": 3.987109629357e-06, "loss": 0.4619, "num_input_tokens_seen": 118106816, "step": 97120 }, { "epoch": 12.169527628116777, "grad_norm": 0.4485304057598114, "learning_rate": 3.986574257078282e-06, "loss": 0.5023, "num_input_tokens_seen": 118113120, "step": 97125 }, { "epoch": 12.17015411602556, "grad_norm": 0.42607852816581726, "learning_rate": 3.986038896915918e-06, "loss": 0.4463, "num_input_tokens_seen": 118118592, "step": 97130 }, { "epoch": 12.170780603934345, "grad_norm": 0.3848348557949066, "learning_rate": 3.985503548876314e-06, "loss": 0.4634, "num_input_tokens_seen": 118124448, "step": 97135 }, { "epoch": 12.171407091843127, "grad_norm": 0.5306214094161987, "learning_rate": 3.984968212965865e-06, "loss": 0.5, "num_input_tokens_seen": 118130592, "step": 97140 }, { "epoch": 12.172033579751911, "grad_norm": 2.476616144180298, "learning_rate": 3.984432889190976e-06, "loss": 0.4701, "num_input_tokens_seen": 118136800, "step": 97145 }, { "epoch": 12.172660067660694, "grad_norm": 0.5035714507102966, "learning_rate": 3.983897577558045e-06, "loss": 0.4494, "num_input_tokens_seen": 118142976, "step": 97150 }, { "epoch": 12.173286555569478, "grad_norm": 0.3380952477455139, "learning_rate": 3.983362278073473e-06, "loss": 0.4595, "num_input_tokens_seen": 118148064, "step": 97155 }, { "epoch": 12.173913043478262, "grad_norm": 0.27597880363464355, "learning_rate": 3.98282699074366e-06, "loss": 0.4621, "num_input_tokens_seen": 118153664, "step": 97160 }, { "epoch": 12.174539531387044, "grad_norm": 0.4185211658477783, "learning_rate": 3.982291715575005e-06, "loss": 0.4593, "num_input_tokens_seen": 118159968, "step": 97165 }, { "epoch": 12.175166019295828, "grad_norm": 0.316890686750412, "learning_rate": 3.9817564525739094e-06, "loss": 0.4539, "num_input_tokens_seen": 118165568, "step": 97170 }, { "epoch": 12.17579250720461, "grad_norm": 0.6009266376495361, "learning_rate": 3.98122120174677e-06, "loss": 0.4534, "num_input_tokens_seen": 118171680, "step": 97175 }, { "epoch": 12.176418995113394, "grad_norm": 0.45335549116134644, "learning_rate": 3.9806859630999896e-06, "loss": 0.4503, "num_input_tokens_seen": 118177760, "step": 97180 }, { "epoch": 12.177045483022178, "grad_norm": 0.6298711895942688, "learning_rate": 3.9801507366399635e-06, "loss": 0.4765, "num_input_tokens_seen": 118184128, "step": 97185 }, { "epoch": 12.17767197093096, "grad_norm": 0.3944997787475586, "learning_rate": 3.979615522373094e-06, "loss": 0.4834, "num_input_tokens_seen": 118190080, "step": 97190 }, { "epoch": 12.178298458839745, "grad_norm": 0.3121766448020935, "learning_rate": 3.979080320305778e-06, "loss": 0.4544, "num_input_tokens_seen": 118196064, "step": 97195 }, { "epoch": 12.178924946748527, "grad_norm": 0.6564731001853943, "learning_rate": 3.9785451304444154e-06, "loss": 0.4393, "num_input_tokens_seen": 118202304, "step": 97200 }, { "epoch": 12.179551434657311, "grad_norm": 0.846439778804779, "learning_rate": 3.978009952795404e-06, "loss": 0.4656, "num_input_tokens_seen": 118208192, "step": 97205 }, { "epoch": 12.180177922566095, "grad_norm": 5.489109992980957, "learning_rate": 3.977474787365145e-06, "loss": 0.4761, "num_input_tokens_seen": 118214688, "step": 97210 }, { "epoch": 12.180804410474877, "grad_norm": 0.36174798011779785, "learning_rate": 3.976939634160032e-06, "loss": 0.4587, "num_input_tokens_seen": 118220960, "step": 97215 }, { "epoch": 12.181430898383661, "grad_norm": 0.48346999287605286, "learning_rate": 3.976404493186466e-06, "loss": 0.4623, "num_input_tokens_seen": 118226784, "step": 97220 }, { "epoch": 12.182057386292444, "grad_norm": 0.5641931295394897, "learning_rate": 3.9758693644508474e-06, "loss": 0.4337, "num_input_tokens_seen": 118232576, "step": 97225 }, { "epoch": 12.182683874201228, "grad_norm": 0.5098819732666016, "learning_rate": 3.975334247959569e-06, "loss": 0.4563, "num_input_tokens_seen": 118238816, "step": 97230 }, { "epoch": 12.183310362110012, "grad_norm": 0.42579472064971924, "learning_rate": 3.974799143719034e-06, "loss": 0.4573, "num_input_tokens_seen": 118244736, "step": 97235 }, { "epoch": 12.183936850018794, "grad_norm": 0.5909974575042725, "learning_rate": 3.974264051735633e-06, "loss": 0.4565, "num_input_tokens_seen": 118250976, "step": 97240 }, { "epoch": 12.184563337927578, "grad_norm": 0.8433597683906555, "learning_rate": 3.973728972015771e-06, "loss": 0.4655, "num_input_tokens_seen": 118256288, "step": 97245 }, { "epoch": 12.185189825836362, "grad_norm": 0.4300001263618469, "learning_rate": 3.973193904565841e-06, "loss": 0.4578, "num_input_tokens_seen": 118262048, "step": 97250 }, { "epoch": 12.185816313745145, "grad_norm": 2.950652599334717, "learning_rate": 3.972658849392242e-06, "loss": 0.4531, "num_input_tokens_seen": 118268224, "step": 97255 }, { "epoch": 12.186442801653929, "grad_norm": 0.7733498215675354, "learning_rate": 3.972123806501368e-06, "loss": 0.4615, "num_input_tokens_seen": 118274048, "step": 97260 }, { "epoch": 12.18706928956271, "grad_norm": 0.610611617565155, "learning_rate": 3.971588775899621e-06, "loss": 0.4496, "num_input_tokens_seen": 118280192, "step": 97265 }, { "epoch": 12.187695777471495, "grad_norm": 0.9177600145339966, "learning_rate": 3.971053757593391e-06, "loss": 0.4583, "num_input_tokens_seen": 118286464, "step": 97270 }, { "epoch": 12.188322265380279, "grad_norm": 0.5193812251091003, "learning_rate": 3.970518751589081e-06, "loss": 0.4784, "num_input_tokens_seen": 118292352, "step": 97275 }, { "epoch": 12.188948753289061, "grad_norm": 0.7409490346908569, "learning_rate": 3.969983757893086e-06, "loss": 0.4693, "num_input_tokens_seen": 118297856, "step": 97280 }, { "epoch": 12.189575241197845, "grad_norm": 0.7428659200668335, "learning_rate": 3.969448776511798e-06, "loss": 0.449, "num_input_tokens_seen": 118304000, "step": 97285 }, { "epoch": 12.190201729106628, "grad_norm": 0.5543946027755737, "learning_rate": 3.968913807451619e-06, "loss": 0.4479, "num_input_tokens_seen": 118310240, "step": 97290 }, { "epoch": 12.190828217015412, "grad_norm": 0.6345803141593933, "learning_rate": 3.9683788507189405e-06, "loss": 0.503, "num_input_tokens_seen": 118316288, "step": 97295 }, { "epoch": 12.191454704924196, "grad_norm": 0.6038469076156616, "learning_rate": 3.967843906320161e-06, "loss": 0.4633, "num_input_tokens_seen": 118321184, "step": 97300 }, { "epoch": 12.192081192832978, "grad_norm": 0.4390488266944885, "learning_rate": 3.967308974261676e-06, "loss": 0.4513, "num_input_tokens_seen": 118326720, "step": 97305 }, { "epoch": 12.192707680741762, "grad_norm": 1.5293841361999512, "learning_rate": 3.96677405454988e-06, "loss": 0.4536, "num_input_tokens_seen": 118332832, "step": 97310 }, { "epoch": 12.193334168650544, "grad_norm": 0.9117580056190491, "learning_rate": 3.966239147191168e-06, "loss": 0.459, "num_input_tokens_seen": 118338880, "step": 97315 }, { "epoch": 12.193960656559328, "grad_norm": 0.41927826404571533, "learning_rate": 3.965704252191938e-06, "loss": 0.4559, "num_input_tokens_seen": 118345056, "step": 97320 }, { "epoch": 12.194587144468112, "grad_norm": 1.118461012840271, "learning_rate": 3.965169369558582e-06, "loss": 0.4638, "num_input_tokens_seen": 118351008, "step": 97325 }, { "epoch": 12.195213632376895, "grad_norm": 0.6283279061317444, "learning_rate": 3.964634499297497e-06, "loss": 0.5117, "num_input_tokens_seen": 118357184, "step": 97330 }, { "epoch": 12.195840120285679, "grad_norm": 0.6708713173866272, "learning_rate": 3.964099641415075e-06, "loss": 0.4604, "num_input_tokens_seen": 118363136, "step": 97335 }, { "epoch": 12.196466608194461, "grad_norm": 0.5004523396492004, "learning_rate": 3.9635647959177135e-06, "loss": 0.435, "num_input_tokens_seen": 118369344, "step": 97340 }, { "epoch": 12.197093096103245, "grad_norm": 0.5568159222602844, "learning_rate": 3.963029962811808e-06, "loss": 0.4589, "num_input_tokens_seen": 118375392, "step": 97345 }, { "epoch": 12.19771958401203, "grad_norm": 1.9012417793273926, "learning_rate": 3.96249514210375e-06, "loss": 0.5017, "num_input_tokens_seen": 118381568, "step": 97350 }, { "epoch": 12.198346071920811, "grad_norm": 0.4869324564933777, "learning_rate": 3.961960333799934e-06, "loss": 0.4601, "num_input_tokens_seen": 118387136, "step": 97355 }, { "epoch": 12.198972559829595, "grad_norm": 0.6923199892044067, "learning_rate": 3.961425537906755e-06, "loss": 0.4536, "num_input_tokens_seen": 118393728, "step": 97360 }, { "epoch": 12.199599047738378, "grad_norm": 0.7144750356674194, "learning_rate": 3.960890754430609e-06, "loss": 0.4752, "num_input_tokens_seen": 118399936, "step": 97365 }, { "epoch": 12.200225535647162, "grad_norm": 0.8816289901733398, "learning_rate": 3.9603559833778854e-06, "loss": 0.4428, "num_input_tokens_seen": 118406048, "step": 97370 }, { "epoch": 12.200852023555946, "grad_norm": 0.6742644906044006, "learning_rate": 3.959821224754982e-06, "loss": 0.4643, "num_input_tokens_seen": 118412320, "step": 97375 }, { "epoch": 12.201478511464728, "grad_norm": 0.7127034664154053, "learning_rate": 3.959286478568289e-06, "loss": 0.4558, "num_input_tokens_seen": 118418400, "step": 97380 }, { "epoch": 12.202104999373512, "grad_norm": 0.3505922257900238, "learning_rate": 3.958751744824202e-06, "loss": 0.4818, "num_input_tokens_seen": 118424576, "step": 97385 }, { "epoch": 12.202731487282296, "grad_norm": 0.5545566082000732, "learning_rate": 3.958217023529113e-06, "loss": 0.4616, "num_input_tokens_seen": 118430912, "step": 97390 }, { "epoch": 12.203357975191079, "grad_norm": 0.3738231360912323, "learning_rate": 3.957682314689415e-06, "loss": 0.4775, "num_input_tokens_seen": 118436960, "step": 97395 }, { "epoch": 12.203984463099863, "grad_norm": 0.7929114699363708, "learning_rate": 3.957147618311502e-06, "loss": 0.4466, "num_input_tokens_seen": 118443680, "step": 97400 }, { "epoch": 12.204610951008645, "grad_norm": 0.3742031157016754, "learning_rate": 3.956612934401765e-06, "loss": 0.4712, "num_input_tokens_seen": 118450016, "step": 97405 }, { "epoch": 12.205237438917429, "grad_norm": 8.221713066101074, "learning_rate": 3.956078262966598e-06, "loss": 0.5044, "num_input_tokens_seen": 118456160, "step": 97410 }, { "epoch": 12.205863926826213, "grad_norm": 0.28918877243995667, "learning_rate": 3.955543604012392e-06, "loss": 0.4543, "num_input_tokens_seen": 118462304, "step": 97415 }, { "epoch": 12.206490414734995, "grad_norm": 0.40862390398979187, "learning_rate": 3.955008957545542e-06, "loss": 0.4702, "num_input_tokens_seen": 118468192, "step": 97420 }, { "epoch": 12.20711690264378, "grad_norm": 0.32745853066444397, "learning_rate": 3.954474323572438e-06, "loss": 0.4677, "num_input_tokens_seen": 118473984, "step": 97425 }, { "epoch": 12.207743390552562, "grad_norm": 0.3299293518066406, "learning_rate": 3.953939702099474e-06, "loss": 0.4472, "num_input_tokens_seen": 118480192, "step": 97430 }, { "epoch": 12.208369878461346, "grad_norm": 0.3127520978450775, "learning_rate": 3.953405093133037e-06, "loss": 0.4681, "num_input_tokens_seen": 118486208, "step": 97435 }, { "epoch": 12.20899636637013, "grad_norm": 0.3310264050960541, "learning_rate": 3.952870496679526e-06, "loss": 0.4729, "num_input_tokens_seen": 118492000, "step": 97440 }, { "epoch": 12.209622854278912, "grad_norm": 0.6859933733940125, "learning_rate": 3.9523359127453256e-06, "loss": 0.452, "num_input_tokens_seen": 118497824, "step": 97445 }, { "epoch": 12.210249342187696, "grad_norm": 0.2652989327907562, "learning_rate": 3.951801341336831e-06, "loss": 0.4598, "num_input_tokens_seen": 118503840, "step": 97450 }, { "epoch": 12.210875830096478, "grad_norm": 0.42453065514564514, "learning_rate": 3.9512667824604325e-06, "loss": 0.4627, "num_input_tokens_seen": 118509152, "step": 97455 }, { "epoch": 12.211502318005262, "grad_norm": 0.373990535736084, "learning_rate": 3.9507322361225205e-06, "loss": 0.4583, "num_input_tokens_seen": 118515104, "step": 97460 }, { "epoch": 12.212128805914046, "grad_norm": 0.39300039410591125, "learning_rate": 3.950197702329488e-06, "loss": 0.4702, "num_input_tokens_seen": 118521216, "step": 97465 }, { "epoch": 12.212755293822829, "grad_norm": 0.6196732521057129, "learning_rate": 3.949663181087724e-06, "loss": 0.4624, "num_input_tokens_seen": 118527424, "step": 97470 }, { "epoch": 12.213381781731613, "grad_norm": 0.3841114938259125, "learning_rate": 3.949128672403621e-06, "loss": 0.4562, "num_input_tokens_seen": 118533504, "step": 97475 }, { "epoch": 12.214008269640395, "grad_norm": 6.19097375869751, "learning_rate": 3.948594176283567e-06, "loss": 0.4825, "num_input_tokens_seen": 118539488, "step": 97480 }, { "epoch": 12.21463475754918, "grad_norm": 0.34113413095474243, "learning_rate": 3.948059692733955e-06, "loss": 0.4665, "num_input_tokens_seen": 118544480, "step": 97485 }, { "epoch": 12.215261245457963, "grad_norm": 0.31429117918014526, "learning_rate": 3.947525221761172e-06, "loss": 0.4673, "num_input_tokens_seen": 118550016, "step": 97490 }, { "epoch": 12.215887733366745, "grad_norm": 2.4607789516448975, "learning_rate": 3.946990763371612e-06, "loss": 0.5478, "num_input_tokens_seen": 118556256, "step": 97495 }, { "epoch": 12.21651422127553, "grad_norm": 0.3526839017868042, "learning_rate": 3.94645631757166e-06, "loss": 0.4451, "num_input_tokens_seen": 118562464, "step": 97500 }, { "epoch": 12.217140709184314, "grad_norm": 0.45008304715156555, "learning_rate": 3.9459218843677105e-06, "loss": 0.4594, "num_input_tokens_seen": 118568576, "step": 97505 }, { "epoch": 12.217767197093096, "grad_norm": 0.323863685131073, "learning_rate": 3.945387463766149e-06, "loss": 0.4647, "num_input_tokens_seen": 118574752, "step": 97510 }, { "epoch": 12.21839368500188, "grad_norm": 0.36484891176223755, "learning_rate": 3.94485305577337e-06, "loss": 0.4583, "num_input_tokens_seen": 118580960, "step": 97515 }, { "epoch": 12.219020172910662, "grad_norm": 0.48042556643486023, "learning_rate": 3.9443186603957575e-06, "loss": 0.4539, "num_input_tokens_seen": 118587168, "step": 97520 }, { "epoch": 12.219646660819446, "grad_norm": 0.32929691672325134, "learning_rate": 3.943784277639703e-06, "loss": 0.4576, "num_input_tokens_seen": 118593344, "step": 97525 }, { "epoch": 12.22027314872823, "grad_norm": 0.4420184791088104, "learning_rate": 3.943249907511597e-06, "loss": 0.4657, "num_input_tokens_seen": 118599520, "step": 97530 }, { "epoch": 12.220899636637013, "grad_norm": 4.446604251861572, "learning_rate": 3.942715550017826e-06, "loss": 0.4892, "num_input_tokens_seen": 118605600, "step": 97535 }, { "epoch": 12.221526124545797, "grad_norm": 0.3860345780849457, "learning_rate": 3.9421812051647806e-06, "loss": 0.4608, "num_input_tokens_seen": 118611776, "step": 97540 }, { "epoch": 12.222152612454579, "grad_norm": 0.5578690767288208, "learning_rate": 3.941646872958847e-06, "loss": 0.469, "num_input_tokens_seen": 118617824, "step": 97545 }, { "epoch": 12.222779100363363, "grad_norm": 0.2837993800640106, "learning_rate": 3.941112553406416e-06, "loss": 0.4628, "num_input_tokens_seen": 118623264, "step": 97550 }, { "epoch": 12.223405588272147, "grad_norm": 0.3108014762401581, "learning_rate": 3.940578246513873e-06, "loss": 0.4463, "num_input_tokens_seen": 118629888, "step": 97555 }, { "epoch": 12.22403207618093, "grad_norm": 0.26972243189811707, "learning_rate": 3.940043952287609e-06, "loss": 0.4539, "num_input_tokens_seen": 118636224, "step": 97560 }, { "epoch": 12.224658564089713, "grad_norm": 0.4378962218761444, "learning_rate": 3.939509670734009e-06, "loss": 0.4688, "num_input_tokens_seen": 118642208, "step": 97565 }, { "epoch": 12.225285051998496, "grad_norm": 0.29208338260650635, "learning_rate": 3.938975401859465e-06, "loss": 0.4554, "num_input_tokens_seen": 118648224, "step": 97570 }, { "epoch": 12.22591153990728, "grad_norm": 0.6334444880485535, "learning_rate": 3.9384411456703604e-06, "loss": 0.4682, "num_input_tokens_seen": 118654336, "step": 97575 }, { "epoch": 12.226538027816064, "grad_norm": 0.4149874746799469, "learning_rate": 3.937906902173087e-06, "loss": 0.4627, "num_input_tokens_seen": 118660512, "step": 97580 }, { "epoch": 12.227164515724846, "grad_norm": 0.2894129753112793, "learning_rate": 3.9373726713740255e-06, "loss": 0.4531, "num_input_tokens_seen": 118666784, "step": 97585 }, { "epoch": 12.22779100363363, "grad_norm": 0.2821228504180908, "learning_rate": 3.936838453279569e-06, "loss": 0.4632, "num_input_tokens_seen": 118672384, "step": 97590 }, { "epoch": 12.228417491542412, "grad_norm": 0.410440593957901, "learning_rate": 3.936304247896104e-06, "loss": 0.4516, "num_input_tokens_seen": 118678752, "step": 97595 }, { "epoch": 12.229043979451196, "grad_norm": 0.3057199716567993, "learning_rate": 3.9357700552300135e-06, "loss": 0.4553, "num_input_tokens_seen": 118684704, "step": 97600 }, { "epoch": 12.22967046735998, "grad_norm": 0.450336754322052, "learning_rate": 3.9352358752876895e-06, "loss": 0.4572, "num_input_tokens_seen": 118691296, "step": 97605 }, { "epoch": 12.230296955268763, "grad_norm": 0.25437819957733154, "learning_rate": 3.9347017080755135e-06, "loss": 0.47, "num_input_tokens_seen": 118697248, "step": 97610 }, { "epoch": 12.230923443177547, "grad_norm": 0.39028501510620117, "learning_rate": 3.934167553599875e-06, "loss": 0.4689, "num_input_tokens_seen": 118703424, "step": 97615 }, { "epoch": 12.231549931086331, "grad_norm": 0.340576171875, "learning_rate": 3.9336334118671585e-06, "loss": 0.4481, "num_input_tokens_seen": 118709696, "step": 97620 }, { "epoch": 12.232176418995113, "grad_norm": 0.33105704188346863, "learning_rate": 3.933099282883754e-06, "loss": 0.4565, "num_input_tokens_seen": 118715872, "step": 97625 }, { "epoch": 12.232802906903897, "grad_norm": 0.8310757875442505, "learning_rate": 3.932565166656042e-06, "loss": 0.4875, "num_input_tokens_seen": 118721824, "step": 97630 }, { "epoch": 12.23342939481268, "grad_norm": 0.4334230124950409, "learning_rate": 3.9320310631904125e-06, "loss": 0.4421, "num_input_tokens_seen": 118728000, "step": 97635 }, { "epoch": 12.234055882721464, "grad_norm": 0.4785553514957428, "learning_rate": 3.9314969724932485e-06, "loss": 0.4573, "num_input_tokens_seen": 118734080, "step": 97640 }, { "epoch": 12.234682370630248, "grad_norm": 0.42533910274505615, "learning_rate": 3.930962894570935e-06, "loss": 0.4555, "num_input_tokens_seen": 118740064, "step": 97645 }, { "epoch": 12.23530885853903, "grad_norm": 0.40618109703063965, "learning_rate": 3.930428829429862e-06, "loss": 0.4584, "num_input_tokens_seen": 118746304, "step": 97650 }, { "epoch": 12.235935346447814, "grad_norm": 0.5712488293647766, "learning_rate": 3.9298947770764094e-06, "loss": 0.4648, "num_input_tokens_seen": 118752320, "step": 97655 }, { "epoch": 12.236561834356596, "grad_norm": 0.5627173185348511, "learning_rate": 3.929360737516966e-06, "loss": 0.4599, "num_input_tokens_seen": 118758848, "step": 97660 }, { "epoch": 12.23718832226538, "grad_norm": 0.5810679793357849, "learning_rate": 3.928826710757914e-06, "loss": 0.5246, "num_input_tokens_seen": 118765280, "step": 97665 }, { "epoch": 12.237814810174164, "grad_norm": 0.5101006627082825, "learning_rate": 3.92829269680564e-06, "loss": 0.4392, "num_input_tokens_seen": 118771520, "step": 97670 }, { "epoch": 12.238441298082947, "grad_norm": 0.9663249254226685, "learning_rate": 3.927758695666527e-06, "loss": 0.4637, "num_input_tokens_seen": 118777664, "step": 97675 }, { "epoch": 12.23906778599173, "grad_norm": 0.28310757875442505, "learning_rate": 3.927224707346962e-06, "loss": 0.4676, "num_input_tokens_seen": 118783488, "step": 97680 }, { "epoch": 12.239694273900513, "grad_norm": 1.3260835409164429, "learning_rate": 3.926690731853325e-06, "loss": 0.4435, "num_input_tokens_seen": 118789504, "step": 97685 }, { "epoch": 12.240320761809297, "grad_norm": 0.33433297276496887, "learning_rate": 3.926156769192005e-06, "loss": 0.5086, "num_input_tokens_seen": 118795872, "step": 97690 }, { "epoch": 12.240947249718081, "grad_norm": 0.885598361492157, "learning_rate": 3.925622819369383e-06, "loss": 0.4794, "num_input_tokens_seen": 118802304, "step": 97695 }, { "epoch": 12.241573737626863, "grad_norm": 0.5417501926422119, "learning_rate": 3.925088882391843e-06, "loss": 0.5051, "num_input_tokens_seen": 118808416, "step": 97700 }, { "epoch": 12.242200225535647, "grad_norm": 0.3979577422142029, "learning_rate": 3.924554958265769e-06, "loss": 0.4626, "num_input_tokens_seen": 118814464, "step": 97705 }, { "epoch": 12.24282671344443, "grad_norm": 0.5514377355575562, "learning_rate": 3.924021046997544e-06, "loss": 0.4553, "num_input_tokens_seen": 118820640, "step": 97710 }, { "epoch": 12.243453201353214, "grad_norm": 0.21981081366539001, "learning_rate": 3.9234871485935534e-06, "loss": 0.4642, "num_input_tokens_seen": 118826880, "step": 97715 }, { "epoch": 12.244079689261998, "grad_norm": 0.702263593673706, "learning_rate": 3.922953263060179e-06, "loss": 0.4532, "num_input_tokens_seen": 118833024, "step": 97720 }, { "epoch": 12.24470617717078, "grad_norm": 0.3050842583179474, "learning_rate": 3.922419390403804e-06, "loss": 0.4561, "num_input_tokens_seen": 118839072, "step": 97725 }, { "epoch": 12.245332665079564, "grad_norm": 0.7559760212898254, "learning_rate": 3.9218855306308095e-06, "loss": 0.5038, "num_input_tokens_seen": 118845504, "step": 97730 }, { "epoch": 12.245959152988346, "grad_norm": 0.3839169442653656, "learning_rate": 3.921351683747582e-06, "loss": 0.4561, "num_input_tokens_seen": 118851648, "step": 97735 }, { "epoch": 12.24658564089713, "grad_norm": 0.3126123249530792, "learning_rate": 3.9208178497605005e-06, "loss": 0.4805, "num_input_tokens_seen": 118857536, "step": 97740 }, { "epoch": 12.247212128805915, "grad_norm": 0.4172314405441284, "learning_rate": 3.9202840286759506e-06, "loss": 0.4523, "num_input_tokens_seen": 118863712, "step": 97745 }, { "epoch": 12.247838616714697, "grad_norm": 0.358596533536911, "learning_rate": 3.9197502205003114e-06, "loss": 0.4598, "num_input_tokens_seen": 118869888, "step": 97750 }, { "epoch": 12.248465104623481, "grad_norm": 0.3738136291503906, "learning_rate": 3.919216425239968e-06, "loss": 0.4717, "num_input_tokens_seen": 118876128, "step": 97755 }, { "epoch": 12.249091592532265, "grad_norm": 0.40842869877815247, "learning_rate": 3.9186826429013e-06, "loss": 0.4576, "num_input_tokens_seen": 118882144, "step": 97760 }, { "epoch": 12.249718080441047, "grad_norm": 0.28578996658325195, "learning_rate": 3.918148873490689e-06, "loss": 0.4525, "num_input_tokens_seen": 118888192, "step": 97765 }, { "epoch": 12.250344568349831, "grad_norm": 0.27563607692718506, "learning_rate": 3.917615117014518e-06, "loss": 0.4505, "num_input_tokens_seen": 118894240, "step": 97770 }, { "epoch": 12.250971056258614, "grad_norm": 0.7001422047615051, "learning_rate": 3.917081373479168e-06, "loss": 0.4574, "num_input_tokens_seen": 118900384, "step": 97775 }, { "epoch": 12.251597544167398, "grad_norm": 0.3506772220134735, "learning_rate": 3.916547642891023e-06, "loss": 0.4661, "num_input_tokens_seen": 118906720, "step": 97780 }, { "epoch": 12.252224032076182, "grad_norm": 0.42834821343421936, "learning_rate": 3.91601392525646e-06, "loss": 0.4434, "num_input_tokens_seen": 118913024, "step": 97785 }, { "epoch": 12.252850519984964, "grad_norm": 0.6484857797622681, "learning_rate": 3.915480220581864e-06, "loss": 0.4635, "num_input_tokens_seen": 118919264, "step": 97790 }, { "epoch": 12.253477007893748, "grad_norm": 1.0816675424575806, "learning_rate": 3.914946528873611e-06, "loss": 0.4684, "num_input_tokens_seen": 118925184, "step": 97795 }, { "epoch": 12.25410349580253, "grad_norm": 0.32823848724365234, "learning_rate": 3.9144128501380875e-06, "loss": 0.4528, "num_input_tokens_seen": 118931264, "step": 97800 }, { "epoch": 12.254729983711314, "grad_norm": 0.4360457956790924, "learning_rate": 3.913879184381668e-06, "loss": 0.4565, "num_input_tokens_seen": 118937344, "step": 97805 }, { "epoch": 12.255356471620098, "grad_norm": 0.27801114320755005, "learning_rate": 3.913345531610737e-06, "loss": 0.4461, "num_input_tokens_seen": 118943680, "step": 97810 }, { "epoch": 12.25598295952888, "grad_norm": 0.3194575011730194, "learning_rate": 3.912811891831675e-06, "loss": 0.4452, "num_input_tokens_seen": 118949312, "step": 97815 }, { "epoch": 12.256609447437665, "grad_norm": 2.733520746231079, "learning_rate": 3.91227826505086e-06, "loss": 0.4619, "num_input_tokens_seen": 118955552, "step": 97820 }, { "epoch": 12.257235935346447, "grad_norm": 0.2968050241470337, "learning_rate": 3.911744651274672e-06, "loss": 0.4318, "num_input_tokens_seen": 118961792, "step": 97825 }, { "epoch": 12.257862423255231, "grad_norm": 0.48317039012908936, "learning_rate": 3.911211050509494e-06, "loss": 0.4669, "num_input_tokens_seen": 118968256, "step": 97830 }, { "epoch": 12.258488911164015, "grad_norm": 0.41350287199020386, "learning_rate": 3.910677462761701e-06, "loss": 0.4453, "num_input_tokens_seen": 118973760, "step": 97835 }, { "epoch": 12.259115399072797, "grad_norm": 0.628800094127655, "learning_rate": 3.910143888037674e-06, "loss": 0.5106, "num_input_tokens_seen": 118980320, "step": 97840 }, { "epoch": 12.259741886981582, "grad_norm": 0.319160521030426, "learning_rate": 3.9096103263437965e-06, "loss": 0.4424, "num_input_tokens_seen": 118986496, "step": 97845 }, { "epoch": 12.260368374890364, "grad_norm": 0.40656235814094543, "learning_rate": 3.909076777686441e-06, "loss": 0.4529, "num_input_tokens_seen": 118992512, "step": 97850 }, { "epoch": 12.260994862799148, "grad_norm": 0.49434351921081543, "learning_rate": 3.9085432420719934e-06, "loss": 0.4668, "num_input_tokens_seen": 118998496, "step": 97855 }, { "epoch": 12.261621350707932, "grad_norm": 1.3261903524398804, "learning_rate": 3.9080097195068254e-06, "loss": 0.4898, "num_input_tokens_seen": 119004736, "step": 97860 }, { "epoch": 12.262247838616714, "grad_norm": 0.47851553559303284, "learning_rate": 3.907476209997321e-06, "loss": 0.455, "num_input_tokens_seen": 119010432, "step": 97865 }, { "epoch": 12.262874326525498, "grad_norm": 0.6975341439247131, "learning_rate": 3.9069427135498574e-06, "loss": 0.508, "num_input_tokens_seen": 119016256, "step": 97870 }, { "epoch": 12.26350081443428, "grad_norm": 0.4127854108810425, "learning_rate": 3.906409230170812e-06, "loss": 0.4691, "num_input_tokens_seen": 119022432, "step": 97875 }, { "epoch": 12.264127302343065, "grad_norm": 7.722893238067627, "learning_rate": 3.905875759866563e-06, "loss": 0.5049, "num_input_tokens_seen": 119027904, "step": 97880 }, { "epoch": 12.264753790251849, "grad_norm": 0.35005107522010803, "learning_rate": 3.905342302643492e-06, "loss": 0.4585, "num_input_tokens_seen": 119034016, "step": 97885 }, { "epoch": 12.265380278160631, "grad_norm": 9.200374603271484, "learning_rate": 3.904808858507971e-06, "loss": 0.4998, "num_input_tokens_seen": 119040288, "step": 97890 }, { "epoch": 12.266006766069415, "grad_norm": 0.9637333154678345, "learning_rate": 3.9042754274663805e-06, "loss": 0.4725, "num_input_tokens_seen": 119046528, "step": 97895 }, { "epoch": 12.266633253978199, "grad_norm": 0.43589067459106445, "learning_rate": 3.903742009525102e-06, "loss": 0.4853, "num_input_tokens_seen": 119052832, "step": 97900 }, { "epoch": 12.267259741886981, "grad_norm": 0.33158209919929504, "learning_rate": 3.903208604690506e-06, "loss": 0.5178, "num_input_tokens_seen": 119059008, "step": 97905 }, { "epoch": 12.267886229795765, "grad_norm": 0.4768064320087433, "learning_rate": 3.902675212968976e-06, "loss": 0.4512, "num_input_tokens_seen": 119065280, "step": 97910 }, { "epoch": 12.268512717704548, "grad_norm": 1.3706471920013428, "learning_rate": 3.902141834366884e-06, "loss": 0.4289, "num_input_tokens_seen": 119072000, "step": 97915 }, { "epoch": 12.269139205613332, "grad_norm": 6.383363246917725, "learning_rate": 3.901608468890611e-06, "loss": 0.4912, "num_input_tokens_seen": 119078144, "step": 97920 }, { "epoch": 12.269765693522116, "grad_norm": 0.3661743700504303, "learning_rate": 3.901075116546531e-06, "loss": 0.4919, "num_input_tokens_seen": 119084224, "step": 97925 }, { "epoch": 12.270392181430898, "grad_norm": 4.584989547729492, "learning_rate": 3.9005417773410235e-06, "loss": 0.4885, "num_input_tokens_seen": 119090208, "step": 97930 }, { "epoch": 12.271018669339682, "grad_norm": 4.165674209594727, "learning_rate": 3.900008451280462e-06, "loss": 0.5133, "num_input_tokens_seen": 119096384, "step": 97935 }, { "epoch": 12.271645157248464, "grad_norm": 0.4350232779979706, "learning_rate": 3.899475138371226e-06, "loss": 0.4369, "num_input_tokens_seen": 119102336, "step": 97940 }, { "epoch": 12.272271645157248, "grad_norm": 0.4951973259449005, "learning_rate": 3.898941838619688e-06, "loss": 0.5013, "num_input_tokens_seen": 119108384, "step": 97945 }, { "epoch": 12.272898133066032, "grad_norm": 0.5035688877105713, "learning_rate": 3.898408552032228e-06, "loss": 0.5065, "num_input_tokens_seen": 119114368, "step": 97950 }, { "epoch": 12.273524620974815, "grad_norm": 0.6092706322669983, "learning_rate": 3.8978752786152185e-06, "loss": 0.5089, "num_input_tokens_seen": 119120448, "step": 97955 }, { "epoch": 12.274151108883599, "grad_norm": 2.377624034881592, "learning_rate": 3.897342018375035e-06, "loss": 0.5491, "num_input_tokens_seen": 119126752, "step": 97960 }, { "epoch": 12.274777596792381, "grad_norm": 0.9198460578918457, "learning_rate": 3.896808771318059e-06, "loss": 0.4601, "num_input_tokens_seen": 119133088, "step": 97965 }, { "epoch": 12.275404084701165, "grad_norm": 0.5279901027679443, "learning_rate": 3.896275537450659e-06, "loss": 0.4573, "num_input_tokens_seen": 119139264, "step": 97970 }, { "epoch": 12.27603057260995, "grad_norm": 0.39910775423049927, "learning_rate": 3.895742316779213e-06, "loss": 0.4434, "num_input_tokens_seen": 119145120, "step": 97975 }, { "epoch": 12.276657060518732, "grad_norm": 0.5948305726051331, "learning_rate": 3.895209109310095e-06, "loss": 0.4437, "num_input_tokens_seen": 119151104, "step": 97980 }, { "epoch": 12.277283548427516, "grad_norm": 0.3909289240837097, "learning_rate": 3.894675915049684e-06, "loss": 0.4579, "num_input_tokens_seen": 119157184, "step": 97985 }, { "epoch": 12.277910036336298, "grad_norm": 0.5138290524482727, "learning_rate": 3.894142734004349e-06, "loss": 0.4824, "num_input_tokens_seen": 119163328, "step": 97990 }, { "epoch": 12.278536524245082, "grad_norm": 0.5091397166252136, "learning_rate": 3.893609566180469e-06, "loss": 0.4549, "num_input_tokens_seen": 119168928, "step": 97995 }, { "epoch": 12.279163012153866, "grad_norm": 0.6714984774589539, "learning_rate": 3.893076411584416e-06, "loss": 0.4564, "num_input_tokens_seen": 119175136, "step": 98000 }, { "epoch": 12.279789500062648, "grad_norm": 0.5641455054283142, "learning_rate": 3.8925432702225664e-06, "loss": 0.457, "num_input_tokens_seen": 119181440, "step": 98005 }, { "epoch": 12.280415987971432, "grad_norm": 1.3638981580734253, "learning_rate": 3.8920101421012915e-06, "loss": 0.4577, "num_input_tokens_seen": 119187584, "step": 98010 }, { "epoch": 12.281042475880216, "grad_norm": 6.863709926605225, "learning_rate": 3.891477027226967e-06, "loss": 0.5104, "num_input_tokens_seen": 119193632, "step": 98015 }, { "epoch": 12.281668963788999, "grad_norm": 0.5496301054954529, "learning_rate": 3.890943925605967e-06, "loss": 0.4501, "num_input_tokens_seen": 119199008, "step": 98020 }, { "epoch": 12.282295451697783, "grad_norm": 0.42338937520980835, "learning_rate": 3.890410837244664e-06, "loss": 0.4533, "num_input_tokens_seen": 119204640, "step": 98025 }, { "epoch": 12.282921939606565, "grad_norm": 0.5970906615257263, "learning_rate": 3.889877762149434e-06, "loss": 0.496, "num_input_tokens_seen": 119210560, "step": 98030 }, { "epoch": 12.283548427515349, "grad_norm": 1.811753511428833, "learning_rate": 3.889344700326647e-06, "loss": 0.525, "num_input_tokens_seen": 119216832, "step": 98035 }, { "epoch": 12.284174915424133, "grad_norm": 0.5107364058494568, "learning_rate": 3.88881165178268e-06, "loss": 0.5377, "num_input_tokens_seen": 119223104, "step": 98040 }, { "epoch": 12.284801403332915, "grad_norm": 0.5172864198684692, "learning_rate": 3.888278616523902e-06, "loss": 0.4225, "num_input_tokens_seen": 119229152, "step": 98045 }, { "epoch": 12.2854278912417, "grad_norm": 0.6255874633789062, "learning_rate": 3.88774559455669e-06, "loss": 0.5759, "num_input_tokens_seen": 119235008, "step": 98050 }, { "epoch": 12.286054379150482, "grad_norm": 0.7147630453109741, "learning_rate": 3.887212585887412e-06, "loss": 0.4413, "num_input_tokens_seen": 119241056, "step": 98055 }, { "epoch": 12.286680867059266, "grad_norm": 0.5856289863586426, "learning_rate": 3.886679590522445e-06, "loss": 0.4368, "num_input_tokens_seen": 119247072, "step": 98060 }, { "epoch": 12.28730735496805, "grad_norm": 5.937307357788086, "learning_rate": 3.886146608468158e-06, "loss": 0.4651, "num_input_tokens_seen": 119253184, "step": 98065 }, { "epoch": 12.287933842876832, "grad_norm": 14.329763412475586, "learning_rate": 3.885613639730926e-06, "loss": 0.6734, "num_input_tokens_seen": 119259168, "step": 98070 }, { "epoch": 12.288560330785616, "grad_norm": 0.6249111294746399, "learning_rate": 3.88508068431712e-06, "loss": 0.4776, "num_input_tokens_seen": 119265184, "step": 98075 }, { "epoch": 12.289186818694398, "grad_norm": 0.723625123500824, "learning_rate": 3.88454774223311e-06, "loss": 0.449, "num_input_tokens_seen": 119270176, "step": 98080 }, { "epoch": 12.289813306603182, "grad_norm": 0.6341580748558044, "learning_rate": 3.884014813485271e-06, "loss": 0.4307, "num_input_tokens_seen": 119276160, "step": 98085 }, { "epoch": 12.290439794511967, "grad_norm": 0.6921449303627014, "learning_rate": 3.883481898079971e-06, "loss": 0.4418, "num_input_tokens_seen": 119282720, "step": 98090 }, { "epoch": 12.291066282420749, "grad_norm": 6.840766906738281, "learning_rate": 3.882948996023588e-06, "loss": 0.5794, "num_input_tokens_seen": 119288992, "step": 98095 }, { "epoch": 12.291692770329533, "grad_norm": 0.9794548153877258, "learning_rate": 3.882416107322485e-06, "loss": 0.5354, "num_input_tokens_seen": 119295200, "step": 98100 }, { "epoch": 12.292319258238315, "grad_norm": 0.8787664175033569, "learning_rate": 3.88188323198304e-06, "loss": 0.4554, "num_input_tokens_seen": 119301280, "step": 98105 }, { "epoch": 12.2929457461471, "grad_norm": 9.692502975463867, "learning_rate": 3.881350370011619e-06, "loss": 0.4845, "num_input_tokens_seen": 119307200, "step": 98110 }, { "epoch": 12.293572234055883, "grad_norm": 1.7796733379364014, "learning_rate": 3.880817521414596e-06, "loss": 0.4366, "num_input_tokens_seen": 119313504, "step": 98115 }, { "epoch": 12.294198721964666, "grad_norm": 2.0025179386138916, "learning_rate": 3.880284686198339e-06, "loss": 0.4715, "num_input_tokens_seen": 119319360, "step": 98120 }, { "epoch": 12.29482520987345, "grad_norm": 0.4860195219516754, "learning_rate": 3.879751864369221e-06, "loss": 0.4436, "num_input_tokens_seen": 119325568, "step": 98125 }, { "epoch": 12.295451697782234, "grad_norm": 16.834373474121094, "learning_rate": 3.879219055933611e-06, "loss": 0.4902, "num_input_tokens_seen": 119332160, "step": 98130 }, { "epoch": 12.296078185691016, "grad_norm": 7.735718727111816, "learning_rate": 3.878686260897881e-06, "loss": 0.6208, "num_input_tokens_seen": 119338368, "step": 98135 }, { "epoch": 12.2967046735998, "grad_norm": 0.7739781737327576, "learning_rate": 3.878153479268397e-06, "loss": 0.5599, "num_input_tokens_seen": 119344672, "step": 98140 }, { "epoch": 12.297331161508582, "grad_norm": 0.924466073513031, "learning_rate": 3.877620711051531e-06, "loss": 0.4556, "num_input_tokens_seen": 119350720, "step": 98145 }, { "epoch": 12.297957649417366, "grad_norm": 0.8786453604698181, "learning_rate": 3.877087956253656e-06, "loss": 0.5739, "num_input_tokens_seen": 119357024, "step": 98150 }, { "epoch": 12.29858413732615, "grad_norm": 0.8788633346557617, "learning_rate": 3.876555214881136e-06, "loss": 0.4662, "num_input_tokens_seen": 119362848, "step": 98155 }, { "epoch": 12.299210625234933, "grad_norm": 1.7306941747665405, "learning_rate": 3.876022486940346e-06, "loss": 0.4314, "num_input_tokens_seen": 119368992, "step": 98160 }, { "epoch": 12.299837113143717, "grad_norm": 0.9912679195404053, "learning_rate": 3.8754897724376485e-06, "loss": 0.4545, "num_input_tokens_seen": 119374848, "step": 98165 }, { "epoch": 12.300463601052499, "grad_norm": 12.951574325561523, "learning_rate": 3.874957071379419e-06, "loss": 0.5491, "num_input_tokens_seen": 119381056, "step": 98170 }, { "epoch": 12.301090088961283, "grad_norm": 1.1937264204025269, "learning_rate": 3.874424383772022e-06, "loss": 0.464, "num_input_tokens_seen": 119387680, "step": 98175 }, { "epoch": 12.301716576870067, "grad_norm": 5.276427268981934, "learning_rate": 3.873891709621829e-06, "loss": 0.4659, "num_input_tokens_seen": 119393952, "step": 98180 }, { "epoch": 12.30234306477885, "grad_norm": 8.03048038482666, "learning_rate": 3.873359048935205e-06, "loss": 0.4721, "num_input_tokens_seen": 119400192, "step": 98185 }, { "epoch": 12.302969552687633, "grad_norm": 0.959938645362854, "learning_rate": 3.8728264017185244e-06, "loss": 0.4496, "num_input_tokens_seen": 119406240, "step": 98190 }, { "epoch": 12.303596040596416, "grad_norm": 4.042705059051514, "learning_rate": 3.8722937679781495e-06, "loss": 0.5126, "num_input_tokens_seen": 119412128, "step": 98195 }, { "epoch": 12.3042225285052, "grad_norm": 1.1771634817123413, "learning_rate": 3.871761147720452e-06, "loss": 0.4481, "num_input_tokens_seen": 119418080, "step": 98200 }, { "epoch": 12.304849016413984, "grad_norm": 0.8712286949157715, "learning_rate": 3.871228540951797e-06, "loss": 0.4777, "num_input_tokens_seen": 119424192, "step": 98205 }, { "epoch": 12.305475504322766, "grad_norm": 2.760392427444458, "learning_rate": 3.870695947678553e-06, "loss": 0.4795, "num_input_tokens_seen": 119430048, "step": 98210 }, { "epoch": 12.30610199223155, "grad_norm": 0.8486343622207642, "learning_rate": 3.870163367907091e-06, "loss": 0.4777, "num_input_tokens_seen": 119436032, "step": 98215 }, { "epoch": 12.306728480140332, "grad_norm": 1.116940975189209, "learning_rate": 3.869630801643774e-06, "loss": 0.5735, "num_input_tokens_seen": 119442272, "step": 98220 }, { "epoch": 12.307354968049117, "grad_norm": 3.596548080444336, "learning_rate": 3.869098248894974e-06, "loss": 0.4622, "num_input_tokens_seen": 119448512, "step": 98225 }, { "epoch": 12.3079814559579, "grad_norm": 1.361445665359497, "learning_rate": 3.8685657096670526e-06, "loss": 0.4344, "num_input_tokens_seen": 119454656, "step": 98230 }, { "epoch": 12.308607943866683, "grad_norm": 0.8224740624427795, "learning_rate": 3.86803318396638e-06, "loss": 0.4386, "num_input_tokens_seen": 119460640, "step": 98235 }, { "epoch": 12.309234431775467, "grad_norm": 0.6803727746009827, "learning_rate": 3.8675006717993215e-06, "loss": 0.5685, "num_input_tokens_seen": 119466560, "step": 98240 }, { "epoch": 12.309860919684251, "grad_norm": 1.844378113746643, "learning_rate": 3.866968173172247e-06, "loss": 0.4339, "num_input_tokens_seen": 119472512, "step": 98245 }, { "epoch": 12.310487407593033, "grad_norm": 1.1855535507202148, "learning_rate": 3.8664356880915175e-06, "loss": 0.788, "num_input_tokens_seen": 119478400, "step": 98250 }, { "epoch": 12.311113895501817, "grad_norm": 1.1607213020324707, "learning_rate": 3.865903216563506e-06, "loss": 0.5211, "num_input_tokens_seen": 119484512, "step": 98255 }, { "epoch": 12.3117403834106, "grad_norm": 4.182154655456543, "learning_rate": 3.865370758594572e-06, "loss": 0.4881, "num_input_tokens_seen": 119490336, "step": 98260 }, { "epoch": 12.312366871319384, "grad_norm": 2.848848581314087, "learning_rate": 3.864838314191084e-06, "loss": 0.4913, "num_input_tokens_seen": 119496416, "step": 98265 }, { "epoch": 12.312993359228168, "grad_norm": 5.933834075927734, "learning_rate": 3.864305883359411e-06, "loss": 0.4861, "num_input_tokens_seen": 119502624, "step": 98270 }, { "epoch": 12.31361984713695, "grad_norm": 0.9028193950653076, "learning_rate": 3.863773466105914e-06, "loss": 0.4784, "num_input_tokens_seen": 119508640, "step": 98275 }, { "epoch": 12.314246335045734, "grad_norm": 1.3274866342544556, "learning_rate": 3.863241062436962e-06, "loss": 0.4634, "num_input_tokens_seen": 119514688, "step": 98280 }, { "epoch": 12.314872822954516, "grad_norm": 10.146430015563965, "learning_rate": 3.862708672358919e-06, "loss": 0.5257, "num_input_tokens_seen": 119520576, "step": 98285 }, { "epoch": 12.3154993108633, "grad_norm": 0.9839488863945007, "learning_rate": 3.86217629587815e-06, "loss": 0.5188, "num_input_tokens_seen": 119526976, "step": 98290 }, { "epoch": 12.316125798772084, "grad_norm": 0.879790186882019, "learning_rate": 3.861643933001018e-06, "loss": 0.4724, "num_input_tokens_seen": 119532896, "step": 98295 }, { "epoch": 12.316752286680867, "grad_norm": 0.8751269578933716, "learning_rate": 3.861111583733892e-06, "loss": 0.4653, "num_input_tokens_seen": 119539328, "step": 98300 }, { "epoch": 12.31737877458965, "grad_norm": 8.801332473754883, "learning_rate": 3.860579248083133e-06, "loss": 0.488, "num_input_tokens_seen": 119545440, "step": 98305 }, { "epoch": 12.318005262498433, "grad_norm": 1.387092113494873, "learning_rate": 3.8600469260551085e-06, "loss": 0.4571, "num_input_tokens_seen": 119551264, "step": 98310 }, { "epoch": 12.318631750407217, "grad_norm": 0.5436713695526123, "learning_rate": 3.8595146176561795e-06, "loss": 0.504, "num_input_tokens_seen": 119556768, "step": 98315 }, { "epoch": 12.319258238316001, "grad_norm": 0.6172117590904236, "learning_rate": 3.858982322892715e-06, "loss": 0.4546, "num_input_tokens_seen": 119563008, "step": 98320 }, { "epoch": 12.319884726224783, "grad_norm": 0.856092631816864, "learning_rate": 3.8584500417710735e-06, "loss": 0.4379, "num_input_tokens_seen": 119569408, "step": 98325 }, { "epoch": 12.320511214133568, "grad_norm": 1.011549711227417, "learning_rate": 3.857917774297621e-06, "loss": 0.4586, "num_input_tokens_seen": 119575648, "step": 98330 }, { "epoch": 12.32113770204235, "grad_norm": 1.864163875579834, "learning_rate": 3.857385520478723e-06, "loss": 0.4782, "num_input_tokens_seen": 119581024, "step": 98335 }, { "epoch": 12.321764189951134, "grad_norm": 0.8325024843215942, "learning_rate": 3.856853280320743e-06, "loss": 0.4396, "num_input_tokens_seen": 119587104, "step": 98340 }, { "epoch": 12.322390677859918, "grad_norm": 1.0130459070205688, "learning_rate": 3.856321053830042e-06, "loss": 0.4426, "num_input_tokens_seen": 119593216, "step": 98345 }, { "epoch": 12.3230171657687, "grad_norm": 0.6306982040405273, "learning_rate": 3.855788841012983e-06, "loss": 0.4525, "num_input_tokens_seen": 119599456, "step": 98350 }, { "epoch": 12.323643653677484, "grad_norm": 0.9191412925720215, "learning_rate": 3.855256641875934e-06, "loss": 0.4993, "num_input_tokens_seen": 119605760, "step": 98355 }, { "epoch": 12.324270141586267, "grad_norm": 1.1153113842010498, "learning_rate": 3.854724456425251e-06, "loss": 0.52, "num_input_tokens_seen": 119611616, "step": 98360 }, { "epoch": 12.32489662949505, "grad_norm": 1.1087104082107544, "learning_rate": 3.854192284667302e-06, "loss": 0.4774, "num_input_tokens_seen": 119617824, "step": 98365 }, { "epoch": 12.325523117403835, "grad_norm": 13.22619915008545, "learning_rate": 3.853660126608446e-06, "loss": 0.5283, "num_input_tokens_seen": 119624096, "step": 98370 }, { "epoch": 12.326149605312617, "grad_norm": 1.007256031036377, "learning_rate": 3.853127982255049e-06, "loss": 0.5065, "num_input_tokens_seen": 119629792, "step": 98375 }, { "epoch": 12.326776093221401, "grad_norm": 11.514076232910156, "learning_rate": 3.852595851613468e-06, "loss": 0.5145, "num_input_tokens_seen": 119636192, "step": 98380 }, { "epoch": 12.327402581130183, "grad_norm": 6.509907245635986, "learning_rate": 3.852063734690072e-06, "loss": 0.5142, "num_input_tokens_seen": 119642368, "step": 98385 }, { "epoch": 12.328029069038967, "grad_norm": 5.374858379364014, "learning_rate": 3.851531631491215e-06, "loss": 0.4985, "num_input_tokens_seen": 119648352, "step": 98390 }, { "epoch": 12.328655556947751, "grad_norm": 4.824783802032471, "learning_rate": 3.850999542023265e-06, "loss": 0.4601, "num_input_tokens_seen": 119654656, "step": 98395 }, { "epoch": 12.329282044856534, "grad_norm": 4.087451934814453, "learning_rate": 3.850467466292583e-06, "loss": 0.5026, "num_input_tokens_seen": 119660480, "step": 98400 }, { "epoch": 12.329908532765318, "grad_norm": 0.6467992663383484, "learning_rate": 3.849935404305527e-06, "loss": 0.4804, "num_input_tokens_seen": 119666496, "step": 98405 }, { "epoch": 12.330535020674102, "grad_norm": 0.6278555393218994, "learning_rate": 3.849403356068462e-06, "loss": 0.4734, "num_input_tokens_seen": 119672864, "step": 98410 }, { "epoch": 12.331161508582884, "grad_norm": 0.49705928564071655, "learning_rate": 3.848871321587745e-06, "loss": 0.4486, "num_input_tokens_seen": 119678880, "step": 98415 }, { "epoch": 12.331787996491668, "grad_norm": 0.5757879018783569, "learning_rate": 3.848339300869742e-06, "loss": 0.4949, "num_input_tokens_seen": 119685216, "step": 98420 }, { "epoch": 12.33241448440045, "grad_norm": 1.549654483795166, "learning_rate": 3.847807293920808e-06, "loss": 0.4627, "num_input_tokens_seen": 119690688, "step": 98425 }, { "epoch": 12.333040972309234, "grad_norm": 0.6140710115432739, "learning_rate": 3.847275300747308e-06, "loss": 0.4584, "num_input_tokens_seen": 119696832, "step": 98430 }, { "epoch": 12.333667460218019, "grad_norm": 0.5999268889427185, "learning_rate": 3.8467433213556e-06, "loss": 0.443, "num_input_tokens_seen": 119702848, "step": 98435 }, { "epoch": 12.3342939481268, "grad_norm": 0.5679378509521484, "learning_rate": 3.846211355752046e-06, "loss": 0.4722, "num_input_tokens_seen": 119708992, "step": 98440 }, { "epoch": 12.334920436035585, "grad_norm": 0.4345709979534149, "learning_rate": 3.8456794039430045e-06, "loss": 0.4816, "num_input_tokens_seen": 119715424, "step": 98445 }, { "epoch": 12.335546923944367, "grad_norm": 0.6892040371894836, "learning_rate": 3.845147465934837e-06, "loss": 0.4549, "num_input_tokens_seen": 119721728, "step": 98450 }, { "epoch": 12.336173411853151, "grad_norm": 0.6102933883666992, "learning_rate": 3.844615541733902e-06, "loss": 0.4808, "num_input_tokens_seen": 119728128, "step": 98455 }, { "epoch": 12.336799899761935, "grad_norm": 0.7332789301872253, "learning_rate": 3.84408363134656e-06, "loss": 0.4449, "num_input_tokens_seen": 119734272, "step": 98460 }, { "epoch": 12.337426387670718, "grad_norm": 0.47724607586860657, "learning_rate": 3.843551734779171e-06, "loss": 0.4667, "num_input_tokens_seen": 119740288, "step": 98465 }, { "epoch": 12.338052875579502, "grad_norm": 0.49215731024742126, "learning_rate": 3.843019852038091e-06, "loss": 0.4803, "num_input_tokens_seen": 119745792, "step": 98470 }, { "epoch": 12.338679363488284, "grad_norm": 1.0108647346496582, "learning_rate": 3.842487983129684e-06, "loss": 0.4362, "num_input_tokens_seen": 119751744, "step": 98475 }, { "epoch": 12.339305851397068, "grad_norm": 1.3670793771743774, "learning_rate": 3.841956128060304e-06, "loss": 0.4969, "num_input_tokens_seen": 119757920, "step": 98480 }, { "epoch": 12.339932339305852, "grad_norm": 0.8245300054550171, "learning_rate": 3.841424286836314e-06, "loss": 0.4477, "num_input_tokens_seen": 119764096, "step": 98485 }, { "epoch": 12.340558827214634, "grad_norm": 1.4531961679458618, "learning_rate": 3.84089245946407e-06, "loss": 0.4691, "num_input_tokens_seen": 119770496, "step": 98490 }, { "epoch": 12.341185315123418, "grad_norm": 2.7174770832061768, "learning_rate": 3.840360645949931e-06, "loss": 0.4743, "num_input_tokens_seen": 119776736, "step": 98495 }, { "epoch": 12.3418118030322, "grad_norm": 0.4754117429256439, "learning_rate": 3.8398288463002544e-06, "loss": 0.4788, "num_input_tokens_seen": 119782624, "step": 98500 }, { "epoch": 12.342438290940985, "grad_norm": 0.5231506824493408, "learning_rate": 3.839297060521402e-06, "loss": 0.5328, "num_input_tokens_seen": 119788960, "step": 98505 }, { "epoch": 12.343064778849769, "grad_norm": 8.304048538208008, "learning_rate": 3.8387652886197264e-06, "loss": 0.5278, "num_input_tokens_seen": 119795136, "step": 98510 }, { "epoch": 12.343691266758551, "grad_norm": 0.5458284020423889, "learning_rate": 3.8382335306015884e-06, "loss": 0.4582, "num_input_tokens_seen": 119801376, "step": 98515 }, { "epoch": 12.344317754667335, "grad_norm": 1.935943603515625, "learning_rate": 3.8377017864733475e-06, "loss": 0.4531, "num_input_tokens_seen": 119807552, "step": 98520 }, { "epoch": 12.344944242576119, "grad_norm": 0.5261032581329346, "learning_rate": 3.837170056241356e-06, "loss": 0.4613, "num_input_tokens_seen": 119813824, "step": 98525 }, { "epoch": 12.345570730484901, "grad_norm": 2.09122371673584, "learning_rate": 3.836638339911978e-06, "loss": 0.4922, "num_input_tokens_seen": 119819936, "step": 98530 }, { "epoch": 12.346197218393685, "grad_norm": 0.6796274185180664, "learning_rate": 3.836106637491563e-06, "loss": 0.4445, "num_input_tokens_seen": 119826432, "step": 98535 }, { "epoch": 12.346823706302468, "grad_norm": 0.4860174357891083, "learning_rate": 3.8355749489864736e-06, "loss": 0.442, "num_input_tokens_seen": 119832352, "step": 98540 }, { "epoch": 12.347450194211252, "grad_norm": 1.0719114542007446, "learning_rate": 3.835043274403063e-06, "loss": 0.4738, "num_input_tokens_seen": 119838528, "step": 98545 }, { "epoch": 12.348076682120036, "grad_norm": 0.7679216861724854, "learning_rate": 3.834511613747692e-06, "loss": 0.4279, "num_input_tokens_seen": 119844032, "step": 98550 }, { "epoch": 12.348703170028818, "grad_norm": 0.8207065463066101, "learning_rate": 3.833979967026712e-06, "loss": 0.4527, "num_input_tokens_seen": 119850176, "step": 98555 }, { "epoch": 12.349329657937602, "grad_norm": 0.5125429630279541, "learning_rate": 3.833448334246485e-06, "loss": 0.475, "num_input_tokens_seen": 119856032, "step": 98560 }, { "epoch": 12.349956145846384, "grad_norm": 0.69033282995224, "learning_rate": 3.8329167154133605e-06, "loss": 0.4298, "num_input_tokens_seen": 119862560, "step": 98565 }, { "epoch": 12.350582633755169, "grad_norm": 2.676250696182251, "learning_rate": 3.8323851105337006e-06, "loss": 0.4877, "num_input_tokens_seen": 119868704, "step": 98570 }, { "epoch": 12.351209121663953, "grad_norm": 0.9666460156440735, "learning_rate": 3.831853519613857e-06, "loss": 0.494, "num_input_tokens_seen": 119874656, "step": 98575 }, { "epoch": 12.351835609572735, "grad_norm": 0.9262910485267639, "learning_rate": 3.831321942660185e-06, "loss": 0.454, "num_input_tokens_seen": 119881120, "step": 98580 }, { "epoch": 12.352462097481519, "grad_norm": 6.539839267730713, "learning_rate": 3.830790379679046e-06, "loss": 0.5537, "num_input_tokens_seen": 119887584, "step": 98585 }, { "epoch": 12.353088585390301, "grad_norm": 2.6210968494415283, "learning_rate": 3.830258830676787e-06, "loss": 0.4617, "num_input_tokens_seen": 119893504, "step": 98590 }, { "epoch": 12.353715073299085, "grad_norm": 0.5821602940559387, "learning_rate": 3.82972729565977e-06, "loss": 0.4722, "num_input_tokens_seen": 119899616, "step": 98595 }, { "epoch": 12.35434156120787, "grad_norm": 0.7154118418693542, "learning_rate": 3.8291957746343445e-06, "loss": 0.4658, "num_input_tokens_seen": 119905728, "step": 98600 }, { "epoch": 12.354968049116652, "grad_norm": 7.850401878356934, "learning_rate": 3.828664267606871e-06, "loss": 0.553, "num_input_tokens_seen": 119911936, "step": 98605 }, { "epoch": 12.355594537025436, "grad_norm": 1.5879713296890259, "learning_rate": 3.8281327745836975e-06, "loss": 0.4452, "num_input_tokens_seen": 119917824, "step": 98610 }, { "epoch": 12.356221024934218, "grad_norm": 0.5176603198051453, "learning_rate": 3.827601295571185e-06, "loss": 0.4656, "num_input_tokens_seen": 119924128, "step": 98615 }, { "epoch": 12.356847512843002, "grad_norm": 1.274355173110962, "learning_rate": 3.827069830575683e-06, "loss": 0.4678, "num_input_tokens_seen": 119929952, "step": 98620 }, { "epoch": 12.357474000751786, "grad_norm": 0.32736697793006897, "learning_rate": 3.826538379603549e-06, "loss": 0.4604, "num_input_tokens_seen": 119935488, "step": 98625 }, { "epoch": 12.358100488660568, "grad_norm": 0.7018870711326599, "learning_rate": 3.826006942661133e-06, "loss": 0.447, "num_input_tokens_seen": 119941824, "step": 98630 }, { "epoch": 12.358726976569352, "grad_norm": 0.6758805513381958, "learning_rate": 3.825475519754792e-06, "loss": 0.4485, "num_input_tokens_seen": 119947872, "step": 98635 }, { "epoch": 12.359353464478136, "grad_norm": 0.932148277759552, "learning_rate": 3.824944110890879e-06, "loss": 0.4473, "num_input_tokens_seen": 119954080, "step": 98640 }, { "epoch": 12.359979952386919, "grad_norm": 0.6309313178062439, "learning_rate": 3.824412716075746e-06, "loss": 0.4698, "num_input_tokens_seen": 119960416, "step": 98645 }, { "epoch": 12.360606440295703, "grad_norm": 0.7516720294952393, "learning_rate": 3.823881335315748e-06, "loss": 0.4642, "num_input_tokens_seen": 119966720, "step": 98650 }, { "epoch": 12.361232928204485, "grad_norm": 0.6044897437095642, "learning_rate": 3.8233499686172374e-06, "loss": 0.472, "num_input_tokens_seen": 119972800, "step": 98655 }, { "epoch": 12.361859416113269, "grad_norm": 1.2112432718276978, "learning_rate": 3.8228186159865684e-06, "loss": 0.4508, "num_input_tokens_seen": 119979072, "step": 98660 }, { "epoch": 12.362485904022053, "grad_norm": 0.6931598782539368, "learning_rate": 3.8222872774300904e-06, "loss": 0.4788, "num_input_tokens_seen": 119985312, "step": 98665 }, { "epoch": 12.363112391930835, "grad_norm": 0.7136783599853516, "learning_rate": 3.821755952954161e-06, "loss": 0.4619, "num_input_tokens_seen": 119991296, "step": 98670 }, { "epoch": 12.36373887983962, "grad_norm": 1.8019647598266602, "learning_rate": 3.821224642565127e-06, "loss": 0.4579, "num_input_tokens_seen": 119997312, "step": 98675 }, { "epoch": 12.364365367748402, "grad_norm": 0.48613429069519043, "learning_rate": 3.820693346269345e-06, "loss": 0.4697, "num_input_tokens_seen": 120003520, "step": 98680 }, { "epoch": 12.364991855657186, "grad_norm": 3.156747817993164, "learning_rate": 3.820162064073165e-06, "loss": 0.4567, "num_input_tokens_seen": 120009696, "step": 98685 }, { "epoch": 12.36561834356597, "grad_norm": 0.4761626422405243, "learning_rate": 3.81963079598294e-06, "loss": 0.4519, "num_input_tokens_seen": 120015584, "step": 98690 }, { "epoch": 12.366244831474752, "grad_norm": 1.032243251800537, "learning_rate": 3.81909954200502e-06, "loss": 0.4618, "num_input_tokens_seen": 120021664, "step": 98695 }, { "epoch": 12.366871319383536, "grad_norm": 0.5660455226898193, "learning_rate": 3.818568302145759e-06, "loss": 0.4556, "num_input_tokens_seen": 120027968, "step": 98700 }, { "epoch": 12.367497807292319, "grad_norm": 0.8747832775115967, "learning_rate": 3.818037076411506e-06, "loss": 0.4649, "num_input_tokens_seen": 120034176, "step": 98705 }, { "epoch": 12.368124295201103, "grad_norm": 0.5107426047325134, "learning_rate": 3.8175058648086145e-06, "loss": 0.4538, "num_input_tokens_seen": 120040096, "step": 98710 }, { "epoch": 12.368750783109887, "grad_norm": 0.7083408832550049, "learning_rate": 3.816974667343435e-06, "loss": 0.4987, "num_input_tokens_seen": 120046432, "step": 98715 }, { "epoch": 12.369377271018669, "grad_norm": 0.8844016790390015, "learning_rate": 3.816443484022317e-06, "loss": 0.4563, "num_input_tokens_seen": 120052064, "step": 98720 }, { "epoch": 12.370003758927453, "grad_norm": 0.49207425117492676, "learning_rate": 3.815912314851614e-06, "loss": 0.4497, "num_input_tokens_seen": 120058368, "step": 98725 }, { "epoch": 12.370630246836235, "grad_norm": 2.0889320373535156, "learning_rate": 3.815381159837672e-06, "loss": 0.4601, "num_input_tokens_seen": 120064416, "step": 98730 }, { "epoch": 12.37125673474502, "grad_norm": 0.679287314414978, "learning_rate": 3.814850018986848e-06, "loss": 0.4861, "num_input_tokens_seen": 120070752, "step": 98735 }, { "epoch": 12.371883222653803, "grad_norm": 0.5461857318878174, "learning_rate": 3.8143188923054845e-06, "loss": 0.4487, "num_input_tokens_seen": 120076480, "step": 98740 }, { "epoch": 12.372509710562586, "grad_norm": 1.0804311037063599, "learning_rate": 3.813787779799938e-06, "loss": 0.4904, "num_input_tokens_seen": 120082688, "step": 98745 }, { "epoch": 12.37313619847137, "grad_norm": 0.9084296226501465, "learning_rate": 3.8132566814765537e-06, "loss": 0.44, "num_input_tokens_seen": 120089024, "step": 98750 }, { "epoch": 12.373762686380154, "grad_norm": 0.6669435501098633, "learning_rate": 3.812725597341686e-06, "loss": 0.46, "num_input_tokens_seen": 120095200, "step": 98755 }, { "epoch": 12.374389174288936, "grad_norm": 2.0266833305358887, "learning_rate": 3.8121945274016802e-06, "loss": 0.4691, "num_input_tokens_seen": 120101376, "step": 98760 }, { "epoch": 12.37501566219772, "grad_norm": 0.5532862544059753, "learning_rate": 3.8116634716628876e-06, "loss": 0.4587, "num_input_tokens_seen": 120107264, "step": 98765 }, { "epoch": 12.375642150106502, "grad_norm": 1.5284217596054077, "learning_rate": 3.811132430131659e-06, "loss": 0.4494, "num_input_tokens_seen": 120113376, "step": 98770 }, { "epoch": 12.376268638015286, "grad_norm": 0.7466383576393127, "learning_rate": 3.8106014028143394e-06, "loss": 0.4786, "num_input_tokens_seen": 120119488, "step": 98775 }, { "epoch": 12.37689512592407, "grad_norm": 0.9557005763053894, "learning_rate": 3.8100703897172825e-06, "loss": 0.4855, "num_input_tokens_seen": 120125472, "step": 98780 }, { "epoch": 12.377521613832853, "grad_norm": 1.0566990375518799, "learning_rate": 3.8095393908468327e-06, "loss": 0.4586, "num_input_tokens_seen": 120131488, "step": 98785 }, { "epoch": 12.378148101741637, "grad_norm": 0.45421212911605835, "learning_rate": 3.8090084062093417e-06, "loss": 0.4565, "num_input_tokens_seen": 120137856, "step": 98790 }, { "epoch": 12.378774589650419, "grad_norm": 0.5891417860984802, "learning_rate": 3.8084774358111548e-06, "loss": 0.4563, "num_input_tokens_seen": 120143968, "step": 98795 }, { "epoch": 12.379401077559203, "grad_norm": 0.46494877338409424, "learning_rate": 3.807946479658623e-06, "loss": 0.4482, "num_input_tokens_seen": 120150112, "step": 98800 }, { "epoch": 12.380027565467987, "grad_norm": 0.8793208003044128, "learning_rate": 3.8074155377580925e-06, "loss": 0.4419, "num_input_tokens_seen": 120155520, "step": 98805 }, { "epoch": 12.38065405337677, "grad_norm": 3.402092695236206, "learning_rate": 3.8068846101159133e-06, "loss": 0.481, "num_input_tokens_seen": 120161504, "step": 98810 }, { "epoch": 12.381280541285554, "grad_norm": 0.6796026229858398, "learning_rate": 3.806353696738431e-06, "loss": 0.4435, "num_input_tokens_seen": 120167584, "step": 98815 }, { "epoch": 12.381907029194336, "grad_norm": 0.8952332139015198, "learning_rate": 3.8058227976319946e-06, "loss": 0.4596, "num_input_tokens_seen": 120173824, "step": 98820 }, { "epoch": 12.38253351710312, "grad_norm": 1.5032896995544434, "learning_rate": 3.805291912802949e-06, "loss": 0.4689, "num_input_tokens_seen": 120179808, "step": 98825 }, { "epoch": 12.383160005011904, "grad_norm": 0.9516672492027283, "learning_rate": 3.8047610422576423e-06, "loss": 0.5021, "num_input_tokens_seen": 120185952, "step": 98830 }, { "epoch": 12.383786492920686, "grad_norm": 0.5962842702865601, "learning_rate": 3.8042301860024254e-06, "loss": 0.439, "num_input_tokens_seen": 120192032, "step": 98835 }, { "epoch": 12.38441298082947, "grad_norm": 2.367565393447876, "learning_rate": 3.80369934404364e-06, "loss": 0.4942, "num_input_tokens_seen": 120198208, "step": 98840 }, { "epoch": 12.385039468738253, "grad_norm": 2.048818826675415, "learning_rate": 3.8031685163876357e-06, "loss": 0.4499, "num_input_tokens_seen": 120204832, "step": 98845 }, { "epoch": 12.385665956647037, "grad_norm": 10.543363571166992, "learning_rate": 3.802637703040757e-06, "loss": 0.5234, "num_input_tokens_seen": 120210688, "step": 98850 }, { "epoch": 12.38629244455582, "grad_norm": 2.0253772735595703, "learning_rate": 3.8021069040093517e-06, "loss": 0.4384, "num_input_tokens_seen": 120216224, "step": 98855 }, { "epoch": 12.386918932464603, "grad_norm": 2.883790969848633, "learning_rate": 3.8015761192997646e-06, "loss": 0.5141, "num_input_tokens_seen": 120222368, "step": 98860 }, { "epoch": 12.387545420373387, "grad_norm": 13.232470512390137, "learning_rate": 3.8010453489183445e-06, "loss": 0.5697, "num_input_tokens_seen": 120227904, "step": 98865 }, { "epoch": 12.38817190828217, "grad_norm": 12.34766674041748, "learning_rate": 3.8005145928714337e-06, "loss": 0.4924, "num_input_tokens_seen": 120233952, "step": 98870 }, { "epoch": 12.388798396190953, "grad_norm": 4.999361038208008, "learning_rate": 3.7999838511653807e-06, "loss": 0.5376, "num_input_tokens_seen": 120240224, "step": 98875 }, { "epoch": 12.389424884099737, "grad_norm": 0.9937637448310852, "learning_rate": 3.7994531238065284e-06, "loss": 0.5115, "num_input_tokens_seen": 120246528, "step": 98880 }, { "epoch": 12.39005137200852, "grad_norm": 1.5500614643096924, "learning_rate": 3.798922410801223e-06, "loss": 0.5073, "num_input_tokens_seen": 120252800, "step": 98885 }, { "epoch": 12.390677859917304, "grad_norm": 5.6927337646484375, "learning_rate": 3.798391712155812e-06, "loss": 0.4641, "num_input_tokens_seen": 120258880, "step": 98890 }, { "epoch": 12.391304347826088, "grad_norm": 0.7793647050857544, "learning_rate": 3.7978610278766366e-06, "loss": 0.6385, "num_input_tokens_seen": 120264832, "step": 98895 }, { "epoch": 12.39193083573487, "grad_norm": 2.7545292377471924, "learning_rate": 3.7973303579700437e-06, "loss": 0.4666, "num_input_tokens_seen": 120271168, "step": 98900 }, { "epoch": 12.392557323643654, "grad_norm": 1.254845380783081, "learning_rate": 3.796799702442378e-06, "loss": 0.4546, "num_input_tokens_seen": 120276768, "step": 98905 }, { "epoch": 12.393183811552436, "grad_norm": 0.7493630051612854, "learning_rate": 3.7962690612999838e-06, "loss": 0.5476, "num_input_tokens_seen": 120283136, "step": 98910 }, { "epoch": 12.39381029946122, "grad_norm": 1.8625351190567017, "learning_rate": 3.7957384345492033e-06, "loss": 0.479, "num_input_tokens_seen": 120288960, "step": 98915 }, { "epoch": 12.394436787370005, "grad_norm": 7.257376194000244, "learning_rate": 3.795207822196385e-06, "loss": 0.5852, "num_input_tokens_seen": 120295200, "step": 98920 }, { "epoch": 12.395063275278787, "grad_norm": 5.432877063751221, "learning_rate": 3.7946772242478675e-06, "loss": 0.4637, "num_input_tokens_seen": 120301216, "step": 98925 }, { "epoch": 12.39568976318757, "grad_norm": 0.6403555870056152, "learning_rate": 3.794146640709999e-06, "loss": 0.4519, "num_input_tokens_seen": 120307392, "step": 98930 }, { "epoch": 12.396316251096353, "grad_norm": 3.1966311931610107, "learning_rate": 3.7936160715891202e-06, "loss": 0.4743, "num_input_tokens_seen": 120313216, "step": 98935 }, { "epoch": 12.396942739005137, "grad_norm": 0.8931219577789307, "learning_rate": 3.793085516891577e-06, "loss": 0.4445, "num_input_tokens_seen": 120319456, "step": 98940 }, { "epoch": 12.397569226913921, "grad_norm": 1.803017258644104, "learning_rate": 3.792554976623709e-06, "loss": 0.4685, "num_input_tokens_seen": 120325536, "step": 98945 }, { "epoch": 12.398195714822704, "grad_norm": 3.5749289989471436, "learning_rate": 3.7920244507918613e-06, "loss": 0.4485, "num_input_tokens_seen": 120332000, "step": 98950 }, { "epoch": 12.398822202731488, "grad_norm": 4.87736701965332, "learning_rate": 3.791493939402378e-06, "loss": 0.4878, "num_input_tokens_seen": 120338048, "step": 98955 }, { "epoch": 12.39944869064027, "grad_norm": 1.3075882196426392, "learning_rate": 3.790963442461601e-06, "loss": 0.4916, "num_input_tokens_seen": 120344160, "step": 98960 }, { "epoch": 12.400075178549054, "grad_norm": 0.8478695750236511, "learning_rate": 3.7904329599758726e-06, "loss": 0.4701, "num_input_tokens_seen": 120350080, "step": 98965 }, { "epoch": 12.400701666457838, "grad_norm": 0.7838382720947266, "learning_rate": 3.7899024919515337e-06, "loss": 0.4596, "num_input_tokens_seen": 120356416, "step": 98970 }, { "epoch": 12.40132815436662, "grad_norm": 0.4868006408214569, "learning_rate": 3.78937203839493e-06, "loss": 0.4598, "num_input_tokens_seen": 120362432, "step": 98975 }, { "epoch": 12.401954642275404, "grad_norm": 0.7005283832550049, "learning_rate": 3.7888415993123996e-06, "loss": 0.4578, "num_input_tokens_seen": 120368352, "step": 98980 }, { "epoch": 12.402581130184187, "grad_norm": 0.7287269830703735, "learning_rate": 3.788311174710288e-06, "loss": 0.4634, "num_input_tokens_seen": 120374592, "step": 98985 }, { "epoch": 12.40320761809297, "grad_norm": 0.4761187434196472, "learning_rate": 3.787780764594933e-06, "loss": 0.479, "num_input_tokens_seen": 120380512, "step": 98990 }, { "epoch": 12.403834106001755, "grad_norm": 0.7505013346672058, "learning_rate": 3.7872503689726802e-06, "loss": 0.4557, "num_input_tokens_seen": 120386560, "step": 98995 }, { "epoch": 12.404460593910537, "grad_norm": 0.7617992758750916, "learning_rate": 3.7867199878498674e-06, "loss": 0.4666, "num_input_tokens_seen": 120392608, "step": 99000 }, { "epoch": 12.405087081819321, "grad_norm": 4.539683818817139, "learning_rate": 3.7861896212328377e-06, "loss": 0.4696, "num_input_tokens_seen": 120398816, "step": 99005 }, { "epoch": 12.405713569728103, "grad_norm": 6.803866386413574, "learning_rate": 3.7856592691279303e-06, "loss": 0.4739, "num_input_tokens_seen": 120404864, "step": 99010 }, { "epoch": 12.406340057636887, "grad_norm": 0.9269886612892151, "learning_rate": 3.7851289315414874e-06, "loss": 0.4561, "num_input_tokens_seen": 120410944, "step": 99015 }, { "epoch": 12.406966545545671, "grad_norm": 2.704746961593628, "learning_rate": 3.7845986084798524e-06, "loss": 0.5193, "num_input_tokens_seen": 120416832, "step": 99020 }, { "epoch": 12.407593033454454, "grad_norm": 0.9677926898002625, "learning_rate": 3.78406829994936e-06, "loss": 0.4557, "num_input_tokens_seen": 120423008, "step": 99025 }, { "epoch": 12.408219521363238, "grad_norm": 0.7972661852836609, "learning_rate": 3.783538005956356e-06, "loss": 0.4534, "num_input_tokens_seen": 120429088, "step": 99030 }, { "epoch": 12.408846009272022, "grad_norm": 1.107246994972229, "learning_rate": 3.7830077265071753e-06, "loss": 0.5026, "num_input_tokens_seen": 120435040, "step": 99035 }, { "epoch": 12.409472497180804, "grad_norm": 6.700737476348877, "learning_rate": 3.782477461608163e-06, "loss": 0.4756, "num_input_tokens_seen": 120441184, "step": 99040 }, { "epoch": 12.410098985089588, "grad_norm": 0.49357542395591736, "learning_rate": 3.7819472112656537e-06, "loss": 0.4947, "num_input_tokens_seen": 120447040, "step": 99045 }, { "epoch": 12.41072547299837, "grad_norm": 0.553724467754364, "learning_rate": 3.781416975485991e-06, "loss": 0.4624, "num_input_tokens_seen": 120453504, "step": 99050 }, { "epoch": 12.411351960907155, "grad_norm": 0.7472769021987915, "learning_rate": 3.7808867542755124e-06, "loss": 0.4731, "num_input_tokens_seen": 120459584, "step": 99055 }, { "epoch": 12.411978448815939, "grad_norm": 9.950709342956543, "learning_rate": 3.780356547640558e-06, "loss": 0.5912, "num_input_tokens_seen": 120465888, "step": 99060 }, { "epoch": 12.41260493672472, "grad_norm": 0.373126357793808, "learning_rate": 3.779826355587465e-06, "loss": 0.4676, "num_input_tokens_seen": 120471968, "step": 99065 }, { "epoch": 12.413231424633505, "grad_norm": 0.4781351387500763, "learning_rate": 3.779296178122574e-06, "loss": 0.4637, "num_input_tokens_seen": 120478176, "step": 99070 }, { "epoch": 12.413857912542287, "grad_norm": 0.5513576865196228, "learning_rate": 3.778766015252226e-06, "loss": 0.4468, "num_input_tokens_seen": 120484128, "step": 99075 }, { "epoch": 12.414484400451071, "grad_norm": 0.6942193508148193, "learning_rate": 3.7782358669827547e-06, "loss": 0.5286, "num_input_tokens_seen": 120490176, "step": 99080 }, { "epoch": 12.415110888359855, "grad_norm": 0.6083086133003235, "learning_rate": 3.777705733320503e-06, "loss": 0.4528, "num_input_tokens_seen": 120496032, "step": 99085 }, { "epoch": 12.415737376268638, "grad_norm": 1.4054256677627563, "learning_rate": 3.7771756142718047e-06, "loss": 0.4839, "num_input_tokens_seen": 120502272, "step": 99090 }, { "epoch": 12.416363864177422, "grad_norm": 1.0572220087051392, "learning_rate": 3.776645509843002e-06, "loss": 0.4667, "num_input_tokens_seen": 120508640, "step": 99095 }, { "epoch": 12.416990352086204, "grad_norm": 0.7255652546882629, "learning_rate": 3.7761154200404286e-06, "loss": 0.4805, "num_input_tokens_seen": 120514880, "step": 99100 }, { "epoch": 12.417616839994988, "grad_norm": 0.585790753364563, "learning_rate": 3.775585344870426e-06, "loss": 0.4597, "num_input_tokens_seen": 120520832, "step": 99105 }, { "epoch": 12.418243327903772, "grad_norm": 0.5365356802940369, "learning_rate": 3.7750552843393295e-06, "loss": 0.4522, "num_input_tokens_seen": 120526976, "step": 99110 }, { "epoch": 12.418869815812554, "grad_norm": 0.6706509590148926, "learning_rate": 3.7745252384534768e-06, "loss": 0.4499, "num_input_tokens_seen": 120533184, "step": 99115 }, { "epoch": 12.419496303721338, "grad_norm": 0.7257413268089294, "learning_rate": 3.7739952072192044e-06, "loss": 0.4515, "num_input_tokens_seen": 120539296, "step": 99120 }, { "epoch": 12.42012279163012, "grad_norm": 0.507842481136322, "learning_rate": 3.773465190642852e-06, "loss": 0.4743, "num_input_tokens_seen": 120545536, "step": 99125 }, { "epoch": 12.420749279538905, "grad_norm": 0.7119664549827576, "learning_rate": 3.772935188730753e-06, "loss": 0.4584, "num_input_tokens_seen": 120551584, "step": 99130 }, { "epoch": 12.421375767447689, "grad_norm": 0.7541036605834961, "learning_rate": 3.7724052014892447e-06, "loss": 0.4654, "num_input_tokens_seen": 120557600, "step": 99135 }, { "epoch": 12.422002255356471, "grad_norm": 1.9127864837646484, "learning_rate": 3.7718752289246667e-06, "loss": 0.4751, "num_input_tokens_seen": 120563584, "step": 99140 }, { "epoch": 12.422628743265255, "grad_norm": 0.5583811402320862, "learning_rate": 3.7713452710433514e-06, "loss": 0.4609, "num_input_tokens_seen": 120569536, "step": 99145 }, { "epoch": 12.42325523117404, "grad_norm": 0.41276565194129944, "learning_rate": 3.770815327851638e-06, "loss": 0.4653, "num_input_tokens_seen": 120575360, "step": 99150 }, { "epoch": 12.423881719082821, "grad_norm": 0.7388409376144409, "learning_rate": 3.7702853993558587e-06, "loss": 0.454, "num_input_tokens_seen": 120581600, "step": 99155 }, { "epoch": 12.424508206991606, "grad_norm": 0.48584529757499695, "learning_rate": 3.769755485562352e-06, "loss": 0.4707, "num_input_tokens_seen": 120587616, "step": 99160 }, { "epoch": 12.425134694900388, "grad_norm": 0.42683881521224976, "learning_rate": 3.769225586477452e-06, "loss": 0.464, "num_input_tokens_seen": 120593760, "step": 99165 }, { "epoch": 12.425761182809172, "grad_norm": 4.648959159851074, "learning_rate": 3.7686957021074973e-06, "loss": 0.4783, "num_input_tokens_seen": 120599904, "step": 99170 }, { "epoch": 12.426387670717956, "grad_norm": 2.523149013519287, "learning_rate": 3.7681658324588178e-06, "loss": 0.4761, "num_input_tokens_seen": 120605856, "step": 99175 }, { "epoch": 12.427014158626738, "grad_norm": 0.36378127336502075, "learning_rate": 3.7676359775377542e-06, "loss": 0.4678, "num_input_tokens_seen": 120612064, "step": 99180 }, { "epoch": 12.427640646535522, "grad_norm": 0.30902284383773804, "learning_rate": 3.767106137350637e-06, "loss": 0.4591, "num_input_tokens_seen": 120618240, "step": 99185 }, { "epoch": 12.428267134444305, "grad_norm": 0.34370601177215576, "learning_rate": 3.7665763119038034e-06, "loss": 0.4654, "num_input_tokens_seen": 120624384, "step": 99190 }, { "epoch": 12.428893622353089, "grad_norm": 0.3550108075141907, "learning_rate": 3.7660465012035857e-06, "loss": 0.4571, "num_input_tokens_seen": 120630176, "step": 99195 }, { "epoch": 12.429520110261873, "grad_norm": 0.48134899139404297, "learning_rate": 3.765516705256319e-06, "loss": 0.462, "num_input_tokens_seen": 120636160, "step": 99200 }, { "epoch": 12.430146598170655, "grad_norm": 0.6590526700019836, "learning_rate": 3.7649869240683407e-06, "loss": 0.4774, "num_input_tokens_seen": 120642592, "step": 99205 }, { "epoch": 12.430773086079439, "grad_norm": 0.384734183549881, "learning_rate": 3.76445715764598e-06, "loss": 0.4671, "num_input_tokens_seen": 120648480, "step": 99210 }, { "epoch": 12.431399573988221, "grad_norm": 0.9745060801506042, "learning_rate": 3.763927405995573e-06, "loss": 0.4591, "num_input_tokens_seen": 120654528, "step": 99215 }, { "epoch": 12.432026061897005, "grad_norm": 0.672948956489563, "learning_rate": 3.7633976691234527e-06, "loss": 0.4545, "num_input_tokens_seen": 120660704, "step": 99220 }, { "epoch": 12.43265254980579, "grad_norm": 0.35915806889533997, "learning_rate": 3.7628679470359547e-06, "loss": 0.4546, "num_input_tokens_seen": 120666944, "step": 99225 }, { "epoch": 12.433279037714572, "grad_norm": 0.7380768656730652, "learning_rate": 3.762338239739409e-06, "loss": 0.4556, "num_input_tokens_seen": 120672160, "step": 99230 }, { "epoch": 12.433905525623356, "grad_norm": 0.6643322110176086, "learning_rate": 3.7618085472401515e-06, "loss": 0.4477, "num_input_tokens_seen": 120678336, "step": 99235 }, { "epoch": 12.434532013532138, "grad_norm": 0.6505078077316284, "learning_rate": 3.7612788695445124e-06, "loss": 0.4552, "num_input_tokens_seen": 120684480, "step": 99240 }, { "epoch": 12.435158501440922, "grad_norm": 0.4684896767139435, "learning_rate": 3.7607492066588276e-06, "loss": 0.4684, "num_input_tokens_seen": 120690624, "step": 99245 }, { "epoch": 12.435784989349706, "grad_norm": 0.378516286611557, "learning_rate": 3.7602195585894257e-06, "loss": 0.4525, "num_input_tokens_seen": 120696512, "step": 99250 }, { "epoch": 12.436411477258488, "grad_norm": 0.39284074306488037, "learning_rate": 3.7596899253426417e-06, "loss": 0.468, "num_input_tokens_seen": 120702208, "step": 99255 }, { "epoch": 12.437037965167272, "grad_norm": 1.6897141933441162, "learning_rate": 3.7591603069248094e-06, "loss": 0.4625, "num_input_tokens_seen": 120708128, "step": 99260 }, { "epoch": 12.437664453076057, "grad_norm": 0.5698498487472534, "learning_rate": 3.7586307033422564e-06, "loss": 0.4624, "num_input_tokens_seen": 120714496, "step": 99265 }, { "epoch": 12.438290940984839, "grad_norm": 0.5140824317932129, "learning_rate": 3.7581011146013185e-06, "loss": 0.4679, "num_input_tokens_seen": 120720768, "step": 99270 }, { "epoch": 12.438917428893623, "grad_norm": 0.38917404413223267, "learning_rate": 3.7575715407083253e-06, "loss": 0.4641, "num_input_tokens_seen": 120726944, "step": 99275 }, { "epoch": 12.439543916802405, "grad_norm": 0.5594062209129333, "learning_rate": 3.7570419816696104e-06, "loss": 0.4557, "num_input_tokens_seen": 120732928, "step": 99280 }, { "epoch": 12.44017040471119, "grad_norm": 1.3993420600891113, "learning_rate": 3.756512437491501e-06, "loss": 0.4467, "num_input_tokens_seen": 120739264, "step": 99285 }, { "epoch": 12.440796892619973, "grad_norm": 0.4967655539512634, "learning_rate": 3.755982908180334e-06, "loss": 0.4508, "num_input_tokens_seen": 120745536, "step": 99290 }, { "epoch": 12.441423380528756, "grad_norm": 5.025129318237305, "learning_rate": 3.7554533937424346e-06, "loss": 0.4948, "num_input_tokens_seen": 120751648, "step": 99295 }, { "epoch": 12.44204986843754, "grad_norm": 0.8273148536682129, "learning_rate": 3.754923894184139e-06, "loss": 0.452, "num_input_tokens_seen": 120758112, "step": 99300 }, { "epoch": 12.442676356346322, "grad_norm": 0.5570318102836609, "learning_rate": 3.754394409511772e-06, "loss": 0.4699, "num_input_tokens_seen": 120764288, "step": 99305 }, { "epoch": 12.443302844255106, "grad_norm": 3.8706886768341064, "learning_rate": 3.753864939731669e-06, "loss": 0.4774, "num_input_tokens_seen": 120770624, "step": 99310 }, { "epoch": 12.44392933216389, "grad_norm": 0.6342436671257019, "learning_rate": 3.753335484850157e-06, "loss": 0.4509, "num_input_tokens_seen": 120776672, "step": 99315 }, { "epoch": 12.444555820072672, "grad_norm": 0.9303682446479797, "learning_rate": 3.752806044873568e-06, "loss": 0.4435, "num_input_tokens_seen": 120782720, "step": 99320 }, { "epoch": 12.445182307981456, "grad_norm": 10.622833251953125, "learning_rate": 3.7522766198082317e-06, "loss": 0.5007, "num_input_tokens_seen": 120789184, "step": 99325 }, { "epoch": 12.445808795890239, "grad_norm": 0.8917068839073181, "learning_rate": 3.751747209660476e-06, "loss": 0.4457, "num_input_tokens_seen": 120795456, "step": 99330 }, { "epoch": 12.446435283799023, "grad_norm": 0.31695321202278137, "learning_rate": 3.7512178144366347e-06, "loss": 0.4802, "num_input_tokens_seen": 120801056, "step": 99335 }, { "epoch": 12.447061771707807, "grad_norm": 4.105093479156494, "learning_rate": 3.750688434143032e-06, "loss": 0.4617, "num_input_tokens_seen": 120807488, "step": 99340 }, { "epoch": 12.447688259616589, "grad_norm": 0.8336755633354187, "learning_rate": 3.750159068786001e-06, "loss": 0.4626, "num_input_tokens_seen": 120813472, "step": 99345 }, { "epoch": 12.448314747525373, "grad_norm": 3.737086057662964, "learning_rate": 3.749629718371868e-06, "loss": 0.4715, "num_input_tokens_seen": 120819776, "step": 99350 }, { "epoch": 12.448941235434155, "grad_norm": 0.5458763837814331, "learning_rate": 3.749100382906964e-06, "loss": 0.4381, "num_input_tokens_seen": 120825248, "step": 99355 }, { "epoch": 12.44956772334294, "grad_norm": 1.2968742847442627, "learning_rate": 3.7485710623976153e-06, "loss": 0.4485, "num_input_tokens_seen": 120831424, "step": 99360 }, { "epoch": 12.450194211251723, "grad_norm": 0.4621085226535797, "learning_rate": 3.748041756850153e-06, "loss": 0.4706, "num_input_tokens_seen": 120837728, "step": 99365 }, { "epoch": 12.450820699160506, "grad_norm": 0.9010168313980103, "learning_rate": 3.7475124662709017e-06, "loss": 0.4404, "num_input_tokens_seen": 120843936, "step": 99370 }, { "epoch": 12.45144718706929, "grad_norm": 0.7703090310096741, "learning_rate": 3.7469831906661945e-06, "loss": 0.4738, "num_input_tokens_seen": 120850176, "step": 99375 }, { "epoch": 12.452073674978074, "grad_norm": 0.5151521563529968, "learning_rate": 3.746453930042355e-06, "loss": 0.4415, "num_input_tokens_seen": 120856256, "step": 99380 }, { "epoch": 12.452700162886856, "grad_norm": 0.445292592048645, "learning_rate": 3.745924684405713e-06, "loss": 0.4805, "num_input_tokens_seen": 120862656, "step": 99385 }, { "epoch": 12.45332665079564, "grad_norm": 0.5058919191360474, "learning_rate": 3.7453954537625974e-06, "loss": 0.4647, "num_input_tokens_seen": 120868672, "step": 99390 }, { "epoch": 12.453953138704422, "grad_norm": 0.5858509540557861, "learning_rate": 3.744866238119332e-06, "loss": 0.4551, "num_input_tokens_seen": 120874560, "step": 99395 }, { "epoch": 12.454579626613206, "grad_norm": 0.5225805044174194, "learning_rate": 3.7443370374822483e-06, "loss": 0.5014, "num_input_tokens_seen": 120880800, "step": 99400 }, { "epoch": 12.45520611452199, "grad_norm": 0.4896034002304077, "learning_rate": 3.743807851857668e-06, "loss": 0.5037, "num_input_tokens_seen": 120886624, "step": 99405 }, { "epoch": 12.455832602430773, "grad_norm": 0.4326300621032715, "learning_rate": 3.743278681251924e-06, "loss": 0.4463, "num_input_tokens_seen": 120892832, "step": 99410 }, { "epoch": 12.456459090339557, "grad_norm": 3.149827003479004, "learning_rate": 3.7427495256713377e-06, "loss": 0.4623, "num_input_tokens_seen": 120898848, "step": 99415 }, { "epoch": 12.45708557824834, "grad_norm": 0.9308534860610962, "learning_rate": 3.7422203851222392e-06, "loss": 0.4502, "num_input_tokens_seen": 120904768, "step": 99420 }, { "epoch": 12.457712066157123, "grad_norm": 0.5806142091751099, "learning_rate": 3.7416912596109524e-06, "loss": 0.4545, "num_input_tokens_seen": 120910912, "step": 99425 }, { "epoch": 12.458338554065907, "grad_norm": 0.8659807443618774, "learning_rate": 3.7411621491438055e-06, "loss": 0.4493, "num_input_tokens_seen": 120917056, "step": 99430 }, { "epoch": 12.45896504197469, "grad_norm": 0.763460099697113, "learning_rate": 3.7406330537271225e-06, "loss": 0.4789, "num_input_tokens_seen": 120923008, "step": 99435 }, { "epoch": 12.459591529883474, "grad_norm": 0.9416800737380981, "learning_rate": 3.7401039733672293e-06, "loss": 0.4466, "num_input_tokens_seen": 120929152, "step": 99440 }, { "epoch": 12.460218017792256, "grad_norm": 2.1170918941497803, "learning_rate": 3.739574908070455e-06, "loss": 0.459, "num_input_tokens_seen": 120935104, "step": 99445 }, { "epoch": 12.46084450570104, "grad_norm": 0.5468951463699341, "learning_rate": 3.7390458578431204e-06, "loss": 0.4777, "num_input_tokens_seen": 120941664, "step": 99450 }, { "epoch": 12.461470993609824, "grad_norm": 0.7188022136688232, "learning_rate": 3.7385168226915547e-06, "loss": 0.4723, "num_input_tokens_seen": 120947936, "step": 99455 }, { "epoch": 12.462097481518606, "grad_norm": 0.6937189698219299, "learning_rate": 3.737987802622079e-06, "loss": 0.4737, "num_input_tokens_seen": 120954464, "step": 99460 }, { "epoch": 12.46272396942739, "grad_norm": 2.1446213722229004, "learning_rate": 3.737458797641021e-06, "loss": 0.4864, "num_input_tokens_seen": 120960800, "step": 99465 }, { "epoch": 12.463350457336173, "grad_norm": 0.5227108597755432, "learning_rate": 3.7369298077547046e-06, "loss": 0.4926, "num_input_tokens_seen": 120966976, "step": 99470 }, { "epoch": 12.463976945244957, "grad_norm": 2.627265453338623, "learning_rate": 3.7364008329694546e-06, "loss": 0.4744, "num_input_tokens_seen": 120973280, "step": 99475 }, { "epoch": 12.46460343315374, "grad_norm": 0.4180985987186432, "learning_rate": 3.7358718732915932e-06, "loss": 0.4478, "num_input_tokens_seen": 120979232, "step": 99480 }, { "epoch": 12.465229921062523, "grad_norm": 1.6214966773986816, "learning_rate": 3.735342928727449e-06, "loss": 0.4694, "num_input_tokens_seen": 120985152, "step": 99485 }, { "epoch": 12.465856408971307, "grad_norm": 0.5372492671012878, "learning_rate": 3.734813999283341e-06, "loss": 0.4549, "num_input_tokens_seen": 120990976, "step": 99490 }, { "epoch": 12.46648289688009, "grad_norm": 0.47136837244033813, "learning_rate": 3.734285084965598e-06, "loss": 0.4643, "num_input_tokens_seen": 120997056, "step": 99495 }, { "epoch": 12.467109384788873, "grad_norm": 0.6392550468444824, "learning_rate": 3.7337561857805382e-06, "loss": 0.4537, "num_input_tokens_seen": 121002976, "step": 99500 }, { "epoch": 12.467735872697657, "grad_norm": 0.414792537689209, "learning_rate": 3.7332273017344885e-06, "loss": 0.5073, "num_input_tokens_seen": 121008960, "step": 99505 }, { "epoch": 12.46836236060644, "grad_norm": 0.8548450469970703, "learning_rate": 3.7326984328337736e-06, "loss": 0.4887, "num_input_tokens_seen": 121015232, "step": 99510 }, { "epoch": 12.468988848515224, "grad_norm": 0.5782966613769531, "learning_rate": 3.7321695790847112e-06, "loss": 0.4921, "num_input_tokens_seen": 121021312, "step": 99515 }, { "epoch": 12.469615336424006, "grad_norm": 0.5769079327583313, "learning_rate": 3.7316407404936296e-06, "loss": 0.4584, "num_input_tokens_seen": 121027616, "step": 99520 }, { "epoch": 12.47024182433279, "grad_norm": 0.885691225528717, "learning_rate": 3.7311119170668488e-06, "loss": 0.4575, "num_input_tokens_seen": 121033824, "step": 99525 }, { "epoch": 12.470868312241574, "grad_norm": 0.683720588684082, "learning_rate": 3.7305831088106915e-06, "loss": 0.4643, "num_input_tokens_seen": 121039968, "step": 99530 }, { "epoch": 12.471494800150356, "grad_norm": 0.4625270366668701, "learning_rate": 3.73005431573148e-06, "loss": 0.4689, "num_input_tokens_seen": 121045856, "step": 99535 }, { "epoch": 12.47212128805914, "grad_norm": 0.5709003210067749, "learning_rate": 3.729525537835539e-06, "loss": 0.4442, "num_input_tokens_seen": 121051840, "step": 99540 }, { "epoch": 12.472747775967925, "grad_norm": 0.6131364107131958, "learning_rate": 3.7289967751291857e-06, "loss": 0.475, "num_input_tokens_seen": 121057952, "step": 99545 }, { "epoch": 12.473374263876707, "grad_norm": 0.740790069103241, "learning_rate": 3.7284680276187467e-06, "loss": 0.4526, "num_input_tokens_seen": 121064352, "step": 99550 }, { "epoch": 12.474000751785491, "grad_norm": 0.8551731705665588, "learning_rate": 3.72793929531054e-06, "loss": 0.4472, "num_input_tokens_seen": 121070400, "step": 99555 }, { "epoch": 12.474627239694273, "grad_norm": 0.8682243824005127, "learning_rate": 3.7274105782108906e-06, "loss": 0.4531, "num_input_tokens_seen": 121076384, "step": 99560 }, { "epoch": 12.475253727603057, "grad_norm": 0.7043622136116028, "learning_rate": 3.726881876326115e-06, "loss": 0.462, "num_input_tokens_seen": 121082720, "step": 99565 }, { "epoch": 12.475880215511841, "grad_norm": 0.6302136182785034, "learning_rate": 3.7263531896625372e-06, "loss": 0.4471, "num_input_tokens_seen": 121088288, "step": 99570 }, { "epoch": 12.476506703420624, "grad_norm": 1.286728024482727, "learning_rate": 3.7258245182264786e-06, "loss": 0.4579, "num_input_tokens_seen": 121094368, "step": 99575 }, { "epoch": 12.477133191329408, "grad_norm": 1.2593709230422974, "learning_rate": 3.7252958620242596e-06, "loss": 0.4695, "num_input_tokens_seen": 121100832, "step": 99580 }, { "epoch": 12.47775967923819, "grad_norm": 0.6923837065696716, "learning_rate": 3.7247672210622e-06, "loss": 0.4512, "num_input_tokens_seen": 121106464, "step": 99585 }, { "epoch": 12.478386167146974, "grad_norm": 2.938636302947998, "learning_rate": 3.724238595346619e-06, "loss": 0.464, "num_input_tokens_seen": 121112960, "step": 99590 }, { "epoch": 12.479012655055758, "grad_norm": 0.8700851798057556, "learning_rate": 3.723709984883841e-06, "loss": 0.4651, "num_input_tokens_seen": 121118720, "step": 99595 }, { "epoch": 12.47963914296454, "grad_norm": 0.8831267952919006, "learning_rate": 3.7231813896801805e-06, "loss": 0.4574, "num_input_tokens_seen": 121125152, "step": 99600 }, { "epoch": 12.480265630873324, "grad_norm": 1.6661992073059082, "learning_rate": 3.722652809741963e-06, "loss": 0.4879, "num_input_tokens_seen": 121131264, "step": 99605 }, { "epoch": 12.480892118782107, "grad_norm": 0.43485695123672485, "learning_rate": 3.7221242450755025e-06, "loss": 0.4415, "num_input_tokens_seen": 121137600, "step": 99610 }, { "epoch": 12.48151860669089, "grad_norm": 7.024482250213623, "learning_rate": 3.7215956956871214e-06, "loss": 0.4558, "num_input_tokens_seen": 121143904, "step": 99615 }, { "epoch": 12.482145094599675, "grad_norm": 0.8038467168807983, "learning_rate": 3.7210671615831394e-06, "loss": 0.4503, "num_input_tokens_seen": 121150144, "step": 99620 }, { "epoch": 12.482771582508457, "grad_norm": 0.5296044945716858, "learning_rate": 3.720538642769874e-06, "loss": 0.4624, "num_input_tokens_seen": 121156064, "step": 99625 }, { "epoch": 12.483398070417241, "grad_norm": 0.524933934211731, "learning_rate": 3.720010139253645e-06, "loss": 0.5074, "num_input_tokens_seen": 121162016, "step": 99630 }, { "epoch": 12.484024558326023, "grad_norm": 0.6277041435241699, "learning_rate": 3.71948165104077e-06, "loss": 0.4615, "num_input_tokens_seen": 121168512, "step": 99635 }, { "epoch": 12.484651046234807, "grad_norm": 0.8190556168556213, "learning_rate": 3.718953178137571e-06, "loss": 0.443, "num_input_tokens_seen": 121174400, "step": 99640 }, { "epoch": 12.485277534143592, "grad_norm": 0.7280982136726379, "learning_rate": 3.718424720550361e-06, "loss": 0.4602, "num_input_tokens_seen": 121180480, "step": 99645 }, { "epoch": 12.485904022052374, "grad_norm": 5.611357688903809, "learning_rate": 3.717896278285464e-06, "loss": 0.4459, "num_input_tokens_seen": 121186464, "step": 99650 }, { "epoch": 12.486530509961158, "grad_norm": 1.1016829013824463, "learning_rate": 3.7173678513491924e-06, "loss": 0.4729, "num_input_tokens_seen": 121192064, "step": 99655 }, { "epoch": 12.487156997869942, "grad_norm": 0.7529669404029846, "learning_rate": 3.7168394397478684e-06, "loss": 0.4734, "num_input_tokens_seen": 121198240, "step": 99660 }, { "epoch": 12.487783485778724, "grad_norm": 1.0034319162368774, "learning_rate": 3.7163110434878057e-06, "loss": 0.4313, "num_input_tokens_seen": 121204736, "step": 99665 }, { "epoch": 12.488409973687508, "grad_norm": 3.8872978687286377, "learning_rate": 3.715782662575325e-06, "loss": 0.5549, "num_input_tokens_seen": 121211168, "step": 99670 }, { "epoch": 12.48903646159629, "grad_norm": 0.43280595541000366, "learning_rate": 3.715254297016743e-06, "loss": 0.4576, "num_input_tokens_seen": 121216576, "step": 99675 }, { "epoch": 12.489662949505075, "grad_norm": 0.6404187679290771, "learning_rate": 3.7147259468183745e-06, "loss": 0.4959, "num_input_tokens_seen": 121222304, "step": 99680 }, { "epoch": 12.490289437413859, "grad_norm": 5.788346290588379, "learning_rate": 3.7141976119865374e-06, "loss": 0.4871, "num_input_tokens_seen": 121228256, "step": 99685 }, { "epoch": 12.490915925322641, "grad_norm": 1.897786021232605, "learning_rate": 3.713669292527549e-06, "loss": 0.5159, "num_input_tokens_seen": 121234592, "step": 99690 }, { "epoch": 12.491542413231425, "grad_norm": 0.4440650939941406, "learning_rate": 3.713140988447728e-06, "loss": 0.4454, "num_input_tokens_seen": 121240800, "step": 99695 }, { "epoch": 12.492168901140207, "grad_norm": 0.9835367202758789, "learning_rate": 3.712612699753386e-06, "loss": 0.4516, "num_input_tokens_seen": 121246880, "step": 99700 }, { "epoch": 12.492795389048991, "grad_norm": 0.45520561933517456, "learning_rate": 3.712084426450844e-06, "loss": 0.4442, "num_input_tokens_seen": 121252992, "step": 99705 }, { "epoch": 12.493421876957775, "grad_norm": 0.9088945984840393, "learning_rate": 3.711556168546413e-06, "loss": 0.4583, "num_input_tokens_seen": 121258912, "step": 99710 }, { "epoch": 12.494048364866558, "grad_norm": 0.827483057975769, "learning_rate": 3.7110279260464144e-06, "loss": 0.4386, "num_input_tokens_seen": 121265120, "step": 99715 }, { "epoch": 12.494674852775342, "grad_norm": 0.6536388397216797, "learning_rate": 3.7104996989571572e-06, "loss": 0.4668, "num_input_tokens_seen": 121271040, "step": 99720 }, { "epoch": 12.495301340684124, "grad_norm": 0.8590784072875977, "learning_rate": 3.7099714872849623e-06, "loss": 0.4551, "num_input_tokens_seen": 121277024, "step": 99725 }, { "epoch": 12.495927828592908, "grad_norm": 1.17824387550354, "learning_rate": 3.709443291036142e-06, "loss": 0.4915, "num_input_tokens_seen": 121283360, "step": 99730 }, { "epoch": 12.496554316501692, "grad_norm": 0.4327552616596222, "learning_rate": 3.708915110217013e-06, "loss": 0.4723, "num_input_tokens_seen": 121289056, "step": 99735 }, { "epoch": 12.497180804410474, "grad_norm": 0.859703540802002, "learning_rate": 3.708386944833888e-06, "loss": 0.4248, "num_input_tokens_seen": 121295072, "step": 99740 }, { "epoch": 12.497807292319258, "grad_norm": 1.4892337322235107, "learning_rate": 3.7078587948930844e-06, "loss": 0.5271, "num_input_tokens_seen": 121301088, "step": 99745 }, { "epoch": 12.49843378022804, "grad_norm": 0.8127922415733337, "learning_rate": 3.707330660400914e-06, "loss": 0.4535, "num_input_tokens_seen": 121306432, "step": 99750 }, { "epoch": 12.499060268136825, "grad_norm": 0.6344691514968872, "learning_rate": 3.7068025413636923e-06, "loss": 0.4331, "num_input_tokens_seen": 121312736, "step": 99755 }, { "epoch": 12.499686756045609, "grad_norm": 0.7075971364974976, "learning_rate": 3.706274437787735e-06, "loss": 0.4653, "num_input_tokens_seen": 121319200, "step": 99760 }, { "epoch": 12.500313243954391, "grad_norm": 1.0204941034317017, "learning_rate": 3.7057463496793533e-06, "loss": 0.5207, "num_input_tokens_seen": 121324896, "step": 99765 }, { "epoch": 12.500939731863175, "grad_norm": 4.192567348480225, "learning_rate": 3.7052182770448638e-06, "loss": 0.4396, "num_input_tokens_seen": 121330944, "step": 99770 }, { "epoch": 12.50156621977196, "grad_norm": 9.212210655212402, "learning_rate": 3.704690219890576e-06, "loss": 0.4997, "num_input_tokens_seen": 121336640, "step": 99775 }, { "epoch": 12.502192707680742, "grad_norm": 3.601856231689453, "learning_rate": 3.704162178222807e-06, "loss": 0.5083, "num_input_tokens_seen": 121342752, "step": 99780 }, { "epoch": 12.502819195589526, "grad_norm": 0.8711172342300415, "learning_rate": 3.703634152047868e-06, "loss": 0.4303, "num_input_tokens_seen": 121348832, "step": 99785 }, { "epoch": 12.503445683498308, "grad_norm": 0.7100706696510315, "learning_rate": 3.7031061413720735e-06, "loss": 0.4944, "num_input_tokens_seen": 121354976, "step": 99790 }, { "epoch": 12.504072171407092, "grad_norm": 1.043472409248352, "learning_rate": 3.7025781462017345e-06, "loss": 0.4129, "num_input_tokens_seen": 121361408, "step": 99795 }, { "epoch": 12.504698659315876, "grad_norm": 1.2527225017547607, "learning_rate": 3.702050166543166e-06, "loss": 0.4677, "num_input_tokens_seen": 121366816, "step": 99800 }, { "epoch": 12.505325147224658, "grad_norm": 0.6366878747940063, "learning_rate": 3.7015222024026777e-06, "loss": 0.4727, "num_input_tokens_seen": 121372608, "step": 99805 }, { "epoch": 12.505951635133442, "grad_norm": 6.8985676765441895, "learning_rate": 3.700994253786585e-06, "loss": 0.5548, "num_input_tokens_seen": 121378880, "step": 99810 }, { "epoch": 12.506578123042225, "grad_norm": 1.3085771799087524, "learning_rate": 3.700466320701196e-06, "loss": 0.4683, "num_input_tokens_seen": 121385024, "step": 99815 }, { "epoch": 12.507204610951009, "grad_norm": 5.5595011711120605, "learning_rate": 3.6999384031528242e-06, "loss": 0.4506, "num_input_tokens_seen": 121391360, "step": 99820 }, { "epoch": 12.507831098859793, "grad_norm": 0.6498156785964966, "learning_rate": 3.6994105011477844e-06, "loss": 0.4766, "num_input_tokens_seen": 121397056, "step": 99825 }, { "epoch": 12.508457586768575, "grad_norm": 0.6902185082435608, "learning_rate": 3.698882614692383e-06, "loss": 0.5405, "num_input_tokens_seen": 121403648, "step": 99830 }, { "epoch": 12.509084074677359, "grad_norm": 1.070915699005127, "learning_rate": 3.6983547437929358e-06, "loss": 0.5047, "num_input_tokens_seen": 121409216, "step": 99835 }, { "epoch": 12.509710562586141, "grad_norm": 0.9299883246421814, "learning_rate": 3.69782688845575e-06, "loss": 0.5379, "num_input_tokens_seen": 121415872, "step": 99840 }, { "epoch": 12.510337050494925, "grad_norm": 0.7346524596214294, "learning_rate": 3.6972990486871407e-06, "loss": 0.4171, "num_input_tokens_seen": 121421824, "step": 99845 }, { "epoch": 12.51096353840371, "grad_norm": 3.0788016319274902, "learning_rate": 3.696771224493415e-06, "loss": 0.534, "num_input_tokens_seen": 121427968, "step": 99850 }, { "epoch": 12.511590026312492, "grad_norm": 1.6690330505371094, "learning_rate": 3.696243415880887e-06, "loss": 0.4611, "num_input_tokens_seen": 121434272, "step": 99855 }, { "epoch": 12.512216514221276, "grad_norm": 2.3189125061035156, "learning_rate": 3.6957156228558634e-06, "loss": 0.4238, "num_input_tokens_seen": 121440736, "step": 99860 }, { "epoch": 12.512843002130058, "grad_norm": 0.5922918319702148, "learning_rate": 3.695187845424657e-06, "loss": 0.4728, "num_input_tokens_seen": 121446912, "step": 99865 }, { "epoch": 12.513469490038842, "grad_norm": 0.6848410964012146, "learning_rate": 3.694660083593576e-06, "loss": 0.5427, "num_input_tokens_seen": 121453088, "step": 99870 }, { "epoch": 12.514095977947626, "grad_norm": 3.759235143661499, "learning_rate": 3.694132337368931e-06, "loss": 0.5257, "num_input_tokens_seen": 121459008, "step": 99875 }, { "epoch": 12.514722465856408, "grad_norm": 1.775039792060852, "learning_rate": 3.6936046067570332e-06, "loss": 0.451, "num_input_tokens_seen": 121465120, "step": 99880 }, { "epoch": 12.515348953765193, "grad_norm": 1.3897253274917603, "learning_rate": 3.693076891764189e-06, "loss": 0.4748, "num_input_tokens_seen": 121471328, "step": 99885 }, { "epoch": 12.515975441673977, "grad_norm": 1.0261131525039673, "learning_rate": 3.6925491923967104e-06, "loss": 0.4932, "num_input_tokens_seen": 121477440, "step": 99890 }, { "epoch": 12.516601929582759, "grad_norm": 1.3554861545562744, "learning_rate": 3.6920215086609047e-06, "loss": 0.4637, "num_input_tokens_seen": 121483488, "step": 99895 }, { "epoch": 12.517228417491543, "grad_norm": 0.5488874316215515, "learning_rate": 3.6914938405630828e-06, "loss": 0.4559, "num_input_tokens_seen": 121489792, "step": 99900 }, { "epoch": 12.517854905400325, "grad_norm": 0.7586521506309509, "learning_rate": 3.6909661881095502e-06, "loss": 0.5045, "num_input_tokens_seen": 121495776, "step": 99905 }, { "epoch": 12.51848139330911, "grad_norm": 1.0087523460388184, "learning_rate": 3.6904385513066205e-06, "loss": 0.4561, "num_input_tokens_seen": 121501600, "step": 99910 }, { "epoch": 12.519107881217893, "grad_norm": 0.8041266202926636, "learning_rate": 3.6899109301605963e-06, "loss": 0.4399, "num_input_tokens_seen": 121507840, "step": 99915 }, { "epoch": 12.519734369126676, "grad_norm": 5.216547012329102, "learning_rate": 3.6893833246777905e-06, "loss": 0.4946, "num_input_tokens_seen": 121514144, "step": 99920 }, { "epoch": 12.52036085703546, "grad_norm": 1.1835254430770874, "learning_rate": 3.688855734864507e-06, "loss": 0.4317, "num_input_tokens_seen": 121520288, "step": 99925 }, { "epoch": 12.520987344944242, "grad_norm": 0.8778144717216492, "learning_rate": 3.6883281607270562e-06, "loss": 0.449, "num_input_tokens_seen": 121526272, "step": 99930 }, { "epoch": 12.521613832853026, "grad_norm": 0.7925438284873962, "learning_rate": 3.6878006022717438e-06, "loss": 0.411, "num_input_tokens_seen": 121532288, "step": 99935 }, { "epoch": 12.52224032076181, "grad_norm": 3.4580891132354736, "learning_rate": 3.68727305950488e-06, "loss": 0.4514, "num_input_tokens_seen": 121538464, "step": 99940 }, { "epoch": 12.522866808670592, "grad_norm": 0.712826132774353, "learning_rate": 3.6867455324327705e-06, "loss": 0.4701, "num_input_tokens_seen": 121544672, "step": 99945 }, { "epoch": 12.523493296579376, "grad_norm": 1.4174259901046753, "learning_rate": 3.686218021061721e-06, "loss": 0.5991, "num_input_tokens_seen": 121550432, "step": 99950 }, { "epoch": 12.524119784488159, "grad_norm": 2.060025215148926, "learning_rate": 3.685690525398042e-06, "loss": 0.5407, "num_input_tokens_seen": 121556544, "step": 99955 }, { "epoch": 12.524746272396943, "grad_norm": 0.5374955534934998, "learning_rate": 3.685163045448035e-06, "loss": 0.4255, "num_input_tokens_seen": 121562880, "step": 99960 }, { "epoch": 12.525372760305727, "grad_norm": 4.365777969360352, "learning_rate": 3.6846355812180124e-06, "loss": 0.5345, "num_input_tokens_seen": 121569088, "step": 99965 }, { "epoch": 12.525999248214509, "grad_norm": 1.0663388967514038, "learning_rate": 3.684108132714274e-06, "loss": 0.483, "num_input_tokens_seen": 121574944, "step": 99970 }, { "epoch": 12.526625736123293, "grad_norm": 5.504086971282959, "learning_rate": 3.6835806999431324e-06, "loss": 0.4837, "num_input_tokens_seen": 121581152, "step": 99975 }, { "epoch": 12.527252224032075, "grad_norm": 0.6370041966438293, "learning_rate": 3.683053282910888e-06, "loss": 0.5311, "num_input_tokens_seen": 121587360, "step": 99980 }, { "epoch": 12.52787871194086, "grad_norm": 1.1455223560333252, "learning_rate": 3.6825258816238496e-06, "loss": 0.5202, "num_input_tokens_seen": 121593600, "step": 99985 }, { "epoch": 12.528505199849644, "grad_norm": 0.6224149465560913, "learning_rate": 3.6819984960883202e-06, "loss": 0.4718, "num_input_tokens_seen": 121599840, "step": 99990 }, { "epoch": 12.529131687758426, "grad_norm": 0.801398754119873, "learning_rate": 3.6814711263106103e-06, "loss": 0.4419, "num_input_tokens_seen": 121605984, "step": 99995 }, { "epoch": 12.52975817566721, "grad_norm": 0.5026163458824158, "learning_rate": 3.6809437722970186e-06, "loss": 0.4692, "num_input_tokens_seen": 121611936, "step": 100000 }, { "epoch": 12.530384663575994, "grad_norm": 0.808402955532074, "learning_rate": 3.680416434053854e-06, "loss": 0.4682, "num_input_tokens_seen": 121618240, "step": 100005 }, { "epoch": 12.531011151484776, "grad_norm": 0.711229681968689, "learning_rate": 3.6798891115874213e-06, "loss": 0.4401, "num_input_tokens_seen": 121623520, "step": 100010 }, { "epoch": 12.53163763939356, "grad_norm": 0.8222119212150574, "learning_rate": 3.6793618049040232e-06, "loss": 0.4822, "num_input_tokens_seen": 121629632, "step": 100015 }, { "epoch": 12.532264127302343, "grad_norm": 0.7681658267974854, "learning_rate": 3.678834514009967e-06, "loss": 0.5045, "num_input_tokens_seen": 121635712, "step": 100020 }, { "epoch": 12.532890615211127, "grad_norm": 2.2248146533966064, "learning_rate": 3.678307238911552e-06, "loss": 0.5287, "num_input_tokens_seen": 121641920, "step": 100025 }, { "epoch": 12.533517103119909, "grad_norm": 0.6370404958724976, "learning_rate": 3.6777799796150883e-06, "loss": 0.5428, "num_input_tokens_seen": 121648160, "step": 100030 }, { "epoch": 12.534143591028693, "grad_norm": 0.8674458265304565, "learning_rate": 3.677252736126874e-06, "loss": 0.5501, "num_input_tokens_seen": 121654464, "step": 100035 }, { "epoch": 12.534770078937477, "grad_norm": 1.0026075839996338, "learning_rate": 3.676725508453216e-06, "loss": 0.4828, "num_input_tokens_seen": 121660544, "step": 100040 }, { "epoch": 12.53539656684626, "grad_norm": 0.6141026616096497, "learning_rate": 3.676198296600416e-06, "loss": 0.5015, "num_input_tokens_seen": 121666784, "step": 100045 }, { "epoch": 12.536023054755043, "grad_norm": 0.30519622564315796, "learning_rate": 3.6756711005747813e-06, "loss": 0.4376, "num_input_tokens_seen": 121672768, "step": 100050 }, { "epoch": 12.536649542663827, "grad_norm": 1.5534937381744385, "learning_rate": 3.67514392038261e-06, "loss": 0.458, "num_input_tokens_seen": 121679040, "step": 100055 }, { "epoch": 12.53727603057261, "grad_norm": 0.869253396987915, "learning_rate": 3.6746167560302058e-06, "loss": 0.4534, "num_input_tokens_seen": 121684832, "step": 100060 }, { "epoch": 12.537902518481394, "grad_norm": 8.78364372253418, "learning_rate": 3.6740896075238757e-06, "loss": 0.4996, "num_input_tokens_seen": 121691296, "step": 100065 }, { "epoch": 12.538529006390176, "grad_norm": 0.8154341578483582, "learning_rate": 3.673562474869917e-06, "loss": 0.4632, "num_input_tokens_seen": 121696704, "step": 100070 }, { "epoch": 12.53915549429896, "grad_norm": 1.8773454427719116, "learning_rate": 3.673035358074636e-06, "loss": 0.4674, "num_input_tokens_seen": 121702784, "step": 100075 }, { "epoch": 12.539781982207744, "grad_norm": 1.1361806392669678, "learning_rate": 3.6725082571443304e-06, "loss": 0.5436, "num_input_tokens_seen": 121708896, "step": 100080 }, { "epoch": 12.540408470116526, "grad_norm": 2.079315423965454, "learning_rate": 3.671981172085306e-06, "loss": 0.4602, "num_input_tokens_seen": 121714624, "step": 100085 }, { "epoch": 12.54103495802531, "grad_norm": 0.6673673391342163, "learning_rate": 3.6714541029038643e-06, "loss": 0.4763, "num_input_tokens_seen": 121721024, "step": 100090 }, { "epoch": 12.541661445934093, "grad_norm": 12.881261825561523, "learning_rate": 3.670927049606304e-06, "loss": 0.4737, "num_input_tokens_seen": 121727488, "step": 100095 }, { "epoch": 12.542287933842877, "grad_norm": 7.487638473510742, "learning_rate": 3.670400012198928e-06, "loss": 0.5001, "num_input_tokens_seen": 121733312, "step": 100100 }, { "epoch": 12.54291442175166, "grad_norm": 1.7581846714019775, "learning_rate": 3.6698729906880405e-06, "loss": 0.4548, "num_input_tokens_seen": 121739744, "step": 100105 }, { "epoch": 12.543540909660443, "grad_norm": 1.116145133972168, "learning_rate": 3.6693459850799366e-06, "loss": 0.4699, "num_input_tokens_seen": 121745856, "step": 100110 }, { "epoch": 12.544167397569227, "grad_norm": 0.9669046401977539, "learning_rate": 3.668818995380923e-06, "loss": 0.4759, "num_input_tokens_seen": 121752192, "step": 100115 }, { "epoch": 12.54479388547801, "grad_norm": 1.583884358406067, "learning_rate": 3.668292021597295e-06, "loss": 0.445, "num_input_tokens_seen": 121758176, "step": 100120 }, { "epoch": 12.545420373386794, "grad_norm": 0.6588723063468933, "learning_rate": 3.6677650637353557e-06, "loss": 0.5077, "num_input_tokens_seen": 121764448, "step": 100125 }, { "epoch": 12.546046861295578, "grad_norm": 0.7291082143783569, "learning_rate": 3.667238121801407e-06, "loss": 0.5063, "num_input_tokens_seen": 121769888, "step": 100130 }, { "epoch": 12.54667334920436, "grad_norm": 3.168877124786377, "learning_rate": 3.6667111958017455e-06, "loss": 0.4654, "num_input_tokens_seen": 121775936, "step": 100135 }, { "epoch": 12.547299837113144, "grad_norm": 3.3327560424804688, "learning_rate": 3.666184285742673e-06, "loss": 0.4903, "num_input_tokens_seen": 121782208, "step": 100140 }, { "epoch": 12.547926325021926, "grad_norm": 0.8849281668663025, "learning_rate": 3.665657391630489e-06, "loss": 0.4531, "num_input_tokens_seen": 121788224, "step": 100145 }, { "epoch": 12.54855281293071, "grad_norm": 0.43313759565353394, "learning_rate": 3.665130513471493e-06, "loss": 0.5045, "num_input_tokens_seen": 121794304, "step": 100150 }, { "epoch": 12.549179300839494, "grad_norm": 0.651229739189148, "learning_rate": 3.664603651271983e-06, "loss": 0.4918, "num_input_tokens_seen": 121800256, "step": 100155 }, { "epoch": 12.549805788748277, "grad_norm": 0.6824826598167419, "learning_rate": 3.6640768050382603e-06, "loss": 0.4321, "num_input_tokens_seen": 121806496, "step": 100160 }, { "epoch": 12.55043227665706, "grad_norm": 4.306558132171631, "learning_rate": 3.663549974776621e-06, "loss": 0.4761, "num_input_tokens_seen": 121812768, "step": 100165 }, { "epoch": 12.551058764565845, "grad_norm": 0.9810101985931396, "learning_rate": 3.663023160493368e-06, "loss": 0.4625, "num_input_tokens_seen": 121818976, "step": 100170 }, { "epoch": 12.551685252474627, "grad_norm": 1.168170690536499, "learning_rate": 3.6624963621947945e-06, "loss": 0.5453, "num_input_tokens_seen": 121825152, "step": 100175 }, { "epoch": 12.552311740383411, "grad_norm": 0.7590976357460022, "learning_rate": 3.661969579887204e-06, "loss": 0.4738, "num_input_tokens_seen": 121831328, "step": 100180 }, { "epoch": 12.552938228292193, "grad_norm": 1.0177820920944214, "learning_rate": 3.66144281357689e-06, "loss": 0.4427, "num_input_tokens_seen": 121836960, "step": 100185 }, { "epoch": 12.553564716200977, "grad_norm": 2.1444742679595947, "learning_rate": 3.660916063270152e-06, "loss": 0.5492, "num_input_tokens_seen": 121843232, "step": 100190 }, { "epoch": 12.554191204109761, "grad_norm": 0.41713279485702515, "learning_rate": 3.6603893289732895e-06, "loss": 0.4668, "num_input_tokens_seen": 121849632, "step": 100195 }, { "epoch": 12.554817692018544, "grad_norm": 0.47736191749572754, "learning_rate": 3.6598626106925994e-06, "loss": 0.4585, "num_input_tokens_seen": 121855744, "step": 100200 }, { "epoch": 12.555444179927328, "grad_norm": 0.740233302116394, "learning_rate": 3.659335908434378e-06, "loss": 0.4643, "num_input_tokens_seen": 121861792, "step": 100205 }, { "epoch": 12.55607066783611, "grad_norm": 0.7312894463539124, "learning_rate": 3.658809222204922e-06, "loss": 0.4847, "num_input_tokens_seen": 121867840, "step": 100210 }, { "epoch": 12.556697155744894, "grad_norm": 0.8622242212295532, "learning_rate": 3.658282552010532e-06, "loss": 0.45, "num_input_tokens_seen": 121874272, "step": 100215 }, { "epoch": 12.557323643653678, "grad_norm": 1.559104323387146, "learning_rate": 3.6577558978575e-06, "loss": 0.4863, "num_input_tokens_seen": 121880384, "step": 100220 }, { "epoch": 12.55795013156246, "grad_norm": 0.526229977607727, "learning_rate": 3.6572292597521266e-06, "loss": 0.4437, "num_input_tokens_seen": 121886400, "step": 100225 }, { "epoch": 12.558576619471244, "grad_norm": 0.4852167069911957, "learning_rate": 3.6567026377007036e-06, "loss": 0.4743, "num_input_tokens_seen": 121892896, "step": 100230 }, { "epoch": 12.559203107380027, "grad_norm": 0.4127257466316223, "learning_rate": 3.6561760317095318e-06, "loss": 0.4608, "num_input_tokens_seen": 121899264, "step": 100235 }, { "epoch": 12.55982959528881, "grad_norm": 0.5394884347915649, "learning_rate": 3.6556494417849046e-06, "loss": 0.4932, "num_input_tokens_seen": 121905952, "step": 100240 }, { "epoch": 12.560456083197595, "grad_norm": 0.4698067605495453, "learning_rate": 3.6551228679331186e-06, "loss": 0.467, "num_input_tokens_seen": 121911872, "step": 100245 }, { "epoch": 12.561082571106377, "grad_norm": 1.1831161975860596, "learning_rate": 3.6545963101604696e-06, "loss": 0.4621, "num_input_tokens_seen": 121918240, "step": 100250 }, { "epoch": 12.561709059015161, "grad_norm": 0.5449997186660767, "learning_rate": 3.6540697684732516e-06, "loss": 0.4532, "num_input_tokens_seen": 121924192, "step": 100255 }, { "epoch": 12.562335546923943, "grad_norm": 0.459566205739975, "learning_rate": 3.653543242877764e-06, "loss": 0.4691, "num_input_tokens_seen": 121930496, "step": 100260 }, { "epoch": 12.562962034832728, "grad_norm": 0.7381657361984253, "learning_rate": 3.6530167333802967e-06, "loss": 0.4576, "num_input_tokens_seen": 121936672, "step": 100265 }, { "epoch": 12.563588522741512, "grad_norm": 0.3869238793849945, "learning_rate": 3.6524902399871487e-06, "loss": 0.4644, "num_input_tokens_seen": 121942592, "step": 100270 }, { "epoch": 12.564215010650294, "grad_norm": 0.54753178358078, "learning_rate": 3.651963762704611e-06, "loss": 0.4454, "num_input_tokens_seen": 121948672, "step": 100275 }, { "epoch": 12.564841498559078, "grad_norm": 0.5311992168426514, "learning_rate": 3.651437301538982e-06, "loss": 0.46, "num_input_tokens_seen": 121954816, "step": 100280 }, { "epoch": 12.565467986467862, "grad_norm": 0.6159552335739136, "learning_rate": 3.650910856496551e-06, "loss": 0.4607, "num_input_tokens_seen": 121960992, "step": 100285 }, { "epoch": 12.566094474376644, "grad_norm": 2.3579206466674805, "learning_rate": 3.650384427583617e-06, "loss": 0.4689, "num_input_tokens_seen": 121967072, "step": 100290 }, { "epoch": 12.566720962285428, "grad_norm": 0.47859832644462585, "learning_rate": 3.6498580148064714e-06, "loss": 0.4664, "num_input_tokens_seen": 121973056, "step": 100295 }, { "epoch": 12.56734745019421, "grad_norm": 0.5687803626060486, "learning_rate": 3.649331618171409e-06, "loss": 0.4694, "num_input_tokens_seen": 121979360, "step": 100300 }, { "epoch": 12.567973938102995, "grad_norm": 0.36130955815315247, "learning_rate": 3.6488052376847206e-06, "loss": 0.4643, "num_input_tokens_seen": 121985088, "step": 100305 }, { "epoch": 12.568600426011779, "grad_norm": 0.4371764361858368, "learning_rate": 3.6482788733527018e-06, "loss": 0.453, "num_input_tokens_seen": 121991200, "step": 100310 }, { "epoch": 12.569226913920561, "grad_norm": 0.7232449054718018, "learning_rate": 3.647752525181648e-06, "loss": 0.4742, "num_input_tokens_seen": 121997568, "step": 100315 }, { "epoch": 12.569853401829345, "grad_norm": 0.7565457820892334, "learning_rate": 3.6472261931778473e-06, "loss": 0.4581, "num_input_tokens_seen": 122003840, "step": 100320 }, { "epoch": 12.570479889738127, "grad_norm": 0.5500786900520325, "learning_rate": 3.6466998773475974e-06, "loss": 0.4604, "num_input_tokens_seen": 122009120, "step": 100325 }, { "epoch": 12.571106377646911, "grad_norm": 0.3821565806865692, "learning_rate": 3.646173577697186e-06, "loss": 0.4615, "num_input_tokens_seen": 122015040, "step": 100330 }, { "epoch": 12.571732865555695, "grad_norm": 0.5189857482910156, "learning_rate": 3.6456472942329096e-06, "loss": 0.4681, "num_input_tokens_seen": 122021088, "step": 100335 }, { "epoch": 12.572359353464478, "grad_norm": 0.4172319769859314, "learning_rate": 3.6451210269610566e-06, "loss": 0.4468, "num_input_tokens_seen": 122027264, "step": 100340 }, { "epoch": 12.572985841373262, "grad_norm": 0.8248504996299744, "learning_rate": 3.6445947758879218e-06, "loss": 0.4712, "num_input_tokens_seen": 122033408, "step": 100345 }, { "epoch": 12.573612329282044, "grad_norm": 0.5824587345123291, "learning_rate": 3.6440685410197957e-06, "loss": 0.4523, "num_input_tokens_seen": 122039264, "step": 100350 }, { "epoch": 12.574238817190828, "grad_norm": 1.037887692451477, "learning_rate": 3.6435423223629705e-06, "loss": 0.4754, "num_input_tokens_seen": 122045408, "step": 100355 }, { "epoch": 12.574865305099612, "grad_norm": 0.39437136054039, "learning_rate": 3.6430161199237363e-06, "loss": 0.449, "num_input_tokens_seen": 122051232, "step": 100360 }, { "epoch": 12.575491793008394, "grad_norm": 0.500498354434967, "learning_rate": 3.6424899337083873e-06, "loss": 0.4691, "num_input_tokens_seen": 122056960, "step": 100365 }, { "epoch": 12.576118280917179, "grad_norm": 0.5396818518638611, "learning_rate": 3.6419637637232103e-06, "loss": 0.4675, "num_input_tokens_seen": 122063072, "step": 100370 }, { "epoch": 12.57674476882596, "grad_norm": 0.4508858621120453, "learning_rate": 3.641437609974498e-06, "loss": 0.4474, "num_input_tokens_seen": 122069152, "step": 100375 }, { "epoch": 12.577371256734745, "grad_norm": 0.507819414138794, "learning_rate": 3.6409114724685433e-06, "loss": 0.4588, "num_input_tokens_seen": 122075392, "step": 100380 }, { "epoch": 12.577997744643529, "grad_norm": 0.4578285813331604, "learning_rate": 3.640385351211633e-06, "loss": 0.4521, "num_input_tokens_seen": 122081472, "step": 100385 }, { "epoch": 12.578624232552311, "grad_norm": 1.2277357578277588, "learning_rate": 3.639859246210061e-06, "loss": 0.4806, "num_input_tokens_seen": 122087488, "step": 100390 }, { "epoch": 12.579250720461095, "grad_norm": 0.7155337929725647, "learning_rate": 3.6393331574701134e-06, "loss": 0.4589, "num_input_tokens_seen": 122093600, "step": 100395 }, { "epoch": 12.57987720836988, "grad_norm": 0.5338997840881348, "learning_rate": 3.6388070849980827e-06, "loss": 0.4586, "num_input_tokens_seen": 122099712, "step": 100400 }, { "epoch": 12.580503696278662, "grad_norm": 0.46427249908447266, "learning_rate": 3.638281028800257e-06, "loss": 0.4462, "num_input_tokens_seen": 122105952, "step": 100405 }, { "epoch": 12.581130184187446, "grad_norm": 0.4924684464931488, "learning_rate": 3.6377549888829276e-06, "loss": 0.4651, "num_input_tokens_seen": 122112160, "step": 100410 }, { "epoch": 12.581756672096228, "grad_norm": 0.2945261597633362, "learning_rate": 3.637228965252381e-06, "loss": 0.462, "num_input_tokens_seen": 122118144, "step": 100415 }, { "epoch": 12.582383160005012, "grad_norm": 0.8701935410499573, "learning_rate": 3.6367029579149107e-06, "loss": 0.4597, "num_input_tokens_seen": 122124416, "step": 100420 }, { "epoch": 12.583009647913796, "grad_norm": 0.6935011744499207, "learning_rate": 3.6361769668768e-06, "loss": 0.4497, "num_input_tokens_seen": 122130240, "step": 100425 }, { "epoch": 12.583636135822578, "grad_norm": 1.2419365644454956, "learning_rate": 3.6356509921443394e-06, "loss": 0.4601, "num_input_tokens_seen": 122135488, "step": 100430 }, { "epoch": 12.584262623731362, "grad_norm": 0.3692871630191803, "learning_rate": 3.635125033723822e-06, "loss": 0.4744, "num_input_tokens_seen": 122141280, "step": 100435 }, { "epoch": 12.584889111640145, "grad_norm": 0.6822432279586792, "learning_rate": 3.634599091621529e-06, "loss": 0.4533, "num_input_tokens_seen": 122146912, "step": 100440 }, { "epoch": 12.585515599548929, "grad_norm": 0.5597171187400818, "learning_rate": 3.6340731658437545e-06, "loss": 0.4623, "num_input_tokens_seen": 122153056, "step": 100445 }, { "epoch": 12.586142087457713, "grad_norm": 0.3957681953907013, "learning_rate": 3.6335472563967815e-06, "loss": 0.4498, "num_input_tokens_seen": 122159008, "step": 100450 }, { "epoch": 12.586768575366495, "grad_norm": 0.4567853510379791, "learning_rate": 3.633021363286901e-06, "loss": 0.4561, "num_input_tokens_seen": 122165056, "step": 100455 }, { "epoch": 12.58739506327528, "grad_norm": 0.47696930170059204, "learning_rate": 3.6324954865203987e-06, "loss": 0.4534, "num_input_tokens_seen": 122171520, "step": 100460 }, { "epoch": 12.588021551184061, "grad_norm": 0.4869205355644226, "learning_rate": 3.631969626103564e-06, "loss": 0.45, "num_input_tokens_seen": 122177984, "step": 100465 }, { "epoch": 12.588648039092845, "grad_norm": 0.8659664392471313, "learning_rate": 3.6314437820426806e-06, "loss": 0.4684, "num_input_tokens_seen": 122184064, "step": 100470 }, { "epoch": 12.58927452700163, "grad_norm": 0.5712703466415405, "learning_rate": 3.6309179543440397e-06, "loss": 0.4638, "num_input_tokens_seen": 122190304, "step": 100475 }, { "epoch": 12.589901014910412, "grad_norm": 0.38113346695899963, "learning_rate": 3.6303921430139244e-06, "loss": 0.4705, "num_input_tokens_seen": 122196608, "step": 100480 }, { "epoch": 12.590527502819196, "grad_norm": 0.8319748640060425, "learning_rate": 3.6298663480586237e-06, "loss": 0.4508, "num_input_tokens_seen": 122202560, "step": 100485 }, { "epoch": 12.591153990727978, "grad_norm": 0.33072802424430847, "learning_rate": 3.629340569484421e-06, "loss": 0.4582, "num_input_tokens_seen": 122208416, "step": 100490 }, { "epoch": 12.591780478636762, "grad_norm": 0.47682860493659973, "learning_rate": 3.6288148072976037e-06, "loss": 0.4551, "num_input_tokens_seen": 122214432, "step": 100495 }, { "epoch": 12.592406966545546, "grad_norm": 0.3364948332309723, "learning_rate": 3.6282890615044607e-06, "loss": 0.4589, "num_input_tokens_seen": 122220896, "step": 100500 }, { "epoch": 12.593033454454329, "grad_norm": 0.37431392073631287, "learning_rate": 3.627763332111273e-06, "loss": 0.4578, "num_input_tokens_seen": 122227232, "step": 100505 }, { "epoch": 12.593659942363113, "grad_norm": 0.49524274468421936, "learning_rate": 3.6272376191243296e-06, "loss": 0.4601, "num_input_tokens_seen": 122233312, "step": 100510 }, { "epoch": 12.594286430271897, "grad_norm": 0.3784298598766327, "learning_rate": 3.6267119225499135e-06, "loss": 0.4548, "num_input_tokens_seen": 122239360, "step": 100515 }, { "epoch": 12.594912918180679, "grad_norm": 0.5227912664413452, "learning_rate": 3.626186242394313e-06, "loss": 0.4627, "num_input_tokens_seen": 122245408, "step": 100520 }, { "epoch": 12.595539406089463, "grad_norm": 0.29282546043395996, "learning_rate": 3.625660578663809e-06, "loss": 0.4642, "num_input_tokens_seen": 122251456, "step": 100525 }, { "epoch": 12.596165893998245, "grad_norm": 0.6025554537773132, "learning_rate": 3.62513493136469e-06, "loss": 0.4547, "num_input_tokens_seen": 122257856, "step": 100530 }, { "epoch": 12.59679238190703, "grad_norm": 0.39706915616989136, "learning_rate": 3.624609300503237e-06, "loss": 0.4581, "num_input_tokens_seen": 122263776, "step": 100535 }, { "epoch": 12.597418869815813, "grad_norm": 0.5469326972961426, "learning_rate": 3.624083686085739e-06, "loss": 0.4625, "num_input_tokens_seen": 122269888, "step": 100540 }, { "epoch": 12.598045357724596, "grad_norm": 0.5490431189537048, "learning_rate": 3.623558088118474e-06, "loss": 0.4589, "num_input_tokens_seen": 122276032, "step": 100545 }, { "epoch": 12.59867184563338, "grad_norm": 0.555686354637146, "learning_rate": 3.623032506607731e-06, "loss": 0.4662, "num_input_tokens_seen": 122282400, "step": 100550 }, { "epoch": 12.599298333542162, "grad_norm": 0.6368445158004761, "learning_rate": 3.6225069415597907e-06, "loss": 0.4599, "num_input_tokens_seen": 122288224, "step": 100555 }, { "epoch": 12.599924821450946, "grad_norm": 1.2388123273849487, "learning_rate": 3.6219813929809396e-06, "loss": 0.4557, "num_input_tokens_seen": 122293568, "step": 100560 }, { "epoch": 12.60055130935973, "grad_norm": 0.5789791345596313, "learning_rate": 3.6214558608774587e-06, "loss": 0.4633, "num_input_tokens_seen": 122299808, "step": 100565 }, { "epoch": 12.601177797268512, "grad_norm": 0.507591724395752, "learning_rate": 3.620930345255632e-06, "loss": 0.4568, "num_input_tokens_seen": 122306080, "step": 100570 }, { "epoch": 12.601804285177296, "grad_norm": 0.36161795258522034, "learning_rate": 3.6204048461217437e-06, "loss": 0.4496, "num_input_tokens_seen": 122312032, "step": 100575 }, { "epoch": 12.602430773086079, "grad_norm": 0.6387673020362854, "learning_rate": 3.619879363482074e-06, "loss": 0.4809, "num_input_tokens_seen": 122318208, "step": 100580 }, { "epoch": 12.603057260994863, "grad_norm": 0.47290509939193726, "learning_rate": 3.6193538973429086e-06, "loss": 0.4802, "num_input_tokens_seen": 122324448, "step": 100585 }, { "epoch": 12.603683748903647, "grad_norm": 0.8172298669815063, "learning_rate": 3.6188284477105255e-06, "loss": 0.4604, "num_input_tokens_seen": 122330080, "step": 100590 }, { "epoch": 12.60431023681243, "grad_norm": 0.546325147151947, "learning_rate": 3.6183030145912123e-06, "loss": 0.4542, "num_input_tokens_seen": 122336064, "step": 100595 }, { "epoch": 12.604936724721213, "grad_norm": 0.6949074864387512, "learning_rate": 3.617777597991246e-06, "loss": 0.4601, "num_input_tokens_seen": 122342336, "step": 100600 }, { "epoch": 12.605563212629995, "grad_norm": 0.3572442829608917, "learning_rate": 3.6172521979169117e-06, "loss": 0.445, "num_input_tokens_seen": 122348544, "step": 100605 }, { "epoch": 12.60618970053878, "grad_norm": 0.34093889594078064, "learning_rate": 3.6167268143744886e-06, "loss": 0.4596, "num_input_tokens_seen": 122353536, "step": 100610 }, { "epoch": 12.606816188447564, "grad_norm": 0.37671008706092834, "learning_rate": 3.6162014473702623e-06, "loss": 0.4696, "num_input_tokens_seen": 122359488, "step": 100615 }, { "epoch": 12.607442676356346, "grad_norm": 0.4534968435764313, "learning_rate": 3.615676096910508e-06, "loss": 0.455, "num_input_tokens_seen": 122365760, "step": 100620 }, { "epoch": 12.60806916426513, "grad_norm": 0.6929065585136414, "learning_rate": 3.61515076300151e-06, "loss": 0.4589, "num_input_tokens_seen": 122371872, "step": 100625 }, { "epoch": 12.608695652173914, "grad_norm": 0.47807854413986206, "learning_rate": 3.6146254456495523e-06, "loss": 0.4606, "num_input_tokens_seen": 122378304, "step": 100630 }, { "epoch": 12.609322140082696, "grad_norm": 0.5652031302452087, "learning_rate": 3.6141001448609086e-06, "loss": 0.4754, "num_input_tokens_seen": 122384384, "step": 100635 }, { "epoch": 12.60994862799148, "grad_norm": 0.40651735663414, "learning_rate": 3.6135748606418653e-06, "loss": 0.4548, "num_input_tokens_seen": 122390592, "step": 100640 }, { "epoch": 12.610575115900263, "grad_norm": 0.5148233771324158, "learning_rate": 3.6130495929986987e-06, "loss": 0.4412, "num_input_tokens_seen": 122396800, "step": 100645 }, { "epoch": 12.611201603809047, "grad_norm": 4.027881145477295, "learning_rate": 3.612524341937692e-06, "loss": 0.4722, "num_input_tokens_seen": 122402656, "step": 100650 }, { "epoch": 12.611828091717829, "grad_norm": 0.49083685874938965, "learning_rate": 3.611999107465121e-06, "loss": 0.4559, "num_input_tokens_seen": 122408960, "step": 100655 }, { "epoch": 12.612454579626613, "grad_norm": 0.4887380003929138, "learning_rate": 3.6114738895872692e-06, "loss": 0.4596, "num_input_tokens_seen": 122415168, "step": 100660 }, { "epoch": 12.613081067535397, "grad_norm": 0.6398794651031494, "learning_rate": 3.6109486883104126e-06, "loss": 0.4543, "num_input_tokens_seen": 122421184, "step": 100665 }, { "epoch": 12.61370755544418, "grad_norm": 0.6177347302436829, "learning_rate": 3.610423503640835e-06, "loss": 0.4539, "num_input_tokens_seen": 122427456, "step": 100670 }, { "epoch": 12.614334043352963, "grad_norm": 4.856113910675049, "learning_rate": 3.6098983355848105e-06, "loss": 0.457, "num_input_tokens_seen": 122433504, "step": 100675 }, { "epoch": 12.614960531261747, "grad_norm": 0.6161254048347473, "learning_rate": 3.6093731841486202e-06, "loss": 0.4351, "num_input_tokens_seen": 122439456, "step": 100680 }, { "epoch": 12.61558701917053, "grad_norm": 0.5252882838249207, "learning_rate": 3.608848049338545e-06, "loss": 0.457, "num_input_tokens_seen": 122445376, "step": 100685 }, { "epoch": 12.616213507079314, "grad_norm": 2.056204319000244, "learning_rate": 3.608322931160858e-06, "loss": 0.4685, "num_input_tokens_seen": 122451648, "step": 100690 }, { "epoch": 12.616839994988096, "grad_norm": 11.500567436218262, "learning_rate": 3.607797829621843e-06, "loss": 0.5346, "num_input_tokens_seen": 122457568, "step": 100695 }, { "epoch": 12.61746648289688, "grad_norm": 0.33395496010780334, "learning_rate": 3.6072727447277732e-06, "loss": 0.4459, "num_input_tokens_seen": 122463520, "step": 100700 }, { "epoch": 12.618092970805664, "grad_norm": 1.8635101318359375, "learning_rate": 3.606747676484931e-06, "loss": 0.4471, "num_input_tokens_seen": 122469760, "step": 100705 }, { "epoch": 12.618719458714446, "grad_norm": 0.5290547609329224, "learning_rate": 3.6062226248995903e-06, "loss": 0.4944, "num_input_tokens_seen": 122476096, "step": 100710 }, { "epoch": 12.61934594662323, "grad_norm": 0.7126273512840271, "learning_rate": 3.60569758997803e-06, "loss": 0.4457, "num_input_tokens_seen": 122482080, "step": 100715 }, { "epoch": 12.619972434532013, "grad_norm": 3.7275266647338867, "learning_rate": 3.6051725717265264e-06, "loss": 0.4749, "num_input_tokens_seen": 122488544, "step": 100720 }, { "epoch": 12.620598922440797, "grad_norm": 0.554539680480957, "learning_rate": 3.6046475701513594e-06, "loss": 0.5004, "num_input_tokens_seen": 122494624, "step": 100725 }, { "epoch": 12.621225410349581, "grad_norm": 0.6481305360794067, "learning_rate": 3.6041225852588025e-06, "loss": 0.4399, "num_input_tokens_seen": 122500864, "step": 100730 }, { "epoch": 12.621851898258363, "grad_norm": 0.4847293198108673, "learning_rate": 3.6035976170551353e-06, "loss": 0.4671, "num_input_tokens_seen": 122507168, "step": 100735 }, { "epoch": 12.622478386167147, "grad_norm": 0.8353317975997925, "learning_rate": 3.6030726655466303e-06, "loss": 0.5166, "num_input_tokens_seen": 122513408, "step": 100740 }, { "epoch": 12.62310487407593, "grad_norm": 1.4941368103027344, "learning_rate": 3.602547730739566e-06, "loss": 0.5016, "num_input_tokens_seen": 122519520, "step": 100745 }, { "epoch": 12.623731361984714, "grad_norm": 1.3500999212265015, "learning_rate": 3.602022812640221e-06, "loss": 0.4596, "num_input_tokens_seen": 122525504, "step": 100750 }, { "epoch": 12.624357849893498, "grad_norm": 0.5861911177635193, "learning_rate": 3.6014979112548666e-06, "loss": 0.4496, "num_input_tokens_seen": 122531680, "step": 100755 }, { "epoch": 12.62498433780228, "grad_norm": 0.48451337218284607, "learning_rate": 3.6009730265897815e-06, "loss": 0.444, "num_input_tokens_seen": 122537280, "step": 100760 }, { "epoch": 12.625610825711064, "grad_norm": 0.6609593629837036, "learning_rate": 3.6004481586512394e-06, "loss": 0.4505, "num_input_tokens_seen": 122543392, "step": 100765 }, { "epoch": 12.626237313619846, "grad_norm": 0.5050226449966431, "learning_rate": 3.5999233074455165e-06, "loss": 0.4607, "num_input_tokens_seen": 122549568, "step": 100770 }, { "epoch": 12.62686380152863, "grad_norm": 0.508033037185669, "learning_rate": 3.5993984729788865e-06, "loss": 0.4518, "num_input_tokens_seen": 122555840, "step": 100775 }, { "epoch": 12.627490289437414, "grad_norm": 0.44701632857322693, "learning_rate": 3.5988736552576283e-06, "loss": 0.4819, "num_input_tokens_seen": 122561952, "step": 100780 }, { "epoch": 12.628116777346197, "grad_norm": 0.6800654530525208, "learning_rate": 3.5983488542880106e-06, "loss": 0.4708, "num_input_tokens_seen": 122567584, "step": 100785 }, { "epoch": 12.62874326525498, "grad_norm": 0.5387970209121704, "learning_rate": 3.5978240700763134e-06, "loss": 0.4769, "num_input_tokens_seen": 122573504, "step": 100790 }, { "epoch": 12.629369753163765, "grad_norm": 0.46135979890823364, "learning_rate": 3.5972993026288056e-06, "loss": 0.4596, "num_input_tokens_seen": 122579296, "step": 100795 }, { "epoch": 12.629996241072547, "grad_norm": 0.4839496910572052, "learning_rate": 3.5967745519517666e-06, "loss": 0.4565, "num_input_tokens_seen": 122585216, "step": 100800 }, { "epoch": 12.630622728981331, "grad_norm": 0.9067797064781189, "learning_rate": 3.596249818051465e-06, "loss": 0.4634, "num_input_tokens_seen": 122591648, "step": 100805 }, { "epoch": 12.631249216890113, "grad_norm": 0.6035816669464111, "learning_rate": 3.5957251009341775e-06, "loss": 0.5092, "num_input_tokens_seen": 122597920, "step": 100810 }, { "epoch": 12.631875704798897, "grad_norm": 0.7636588215827942, "learning_rate": 3.595200400606178e-06, "loss": 0.4557, "num_input_tokens_seen": 122604288, "step": 100815 }, { "epoch": 12.632502192707681, "grad_norm": 0.38869112730026245, "learning_rate": 3.594675717073738e-06, "loss": 0.4545, "num_input_tokens_seen": 122610624, "step": 100820 }, { "epoch": 12.633128680616464, "grad_norm": 0.5129815340042114, "learning_rate": 3.594151050343133e-06, "loss": 0.4577, "num_input_tokens_seen": 122617056, "step": 100825 }, { "epoch": 12.633755168525248, "grad_norm": 0.6782649755477905, "learning_rate": 3.593626400420632e-06, "loss": 0.476, "num_input_tokens_seen": 122623200, "step": 100830 }, { "epoch": 12.63438165643403, "grad_norm": 2.1977381706237793, "learning_rate": 3.5931017673125125e-06, "loss": 0.4656, "num_input_tokens_seen": 122629056, "step": 100835 }, { "epoch": 12.635008144342814, "grad_norm": 0.6039264798164368, "learning_rate": 3.592577151025041e-06, "loss": 0.4589, "num_input_tokens_seen": 122635040, "step": 100840 }, { "epoch": 12.635634632251598, "grad_norm": 0.6448973417282104, "learning_rate": 3.592052551564496e-06, "loss": 0.4502, "num_input_tokens_seen": 122641376, "step": 100845 }, { "epoch": 12.63626112016038, "grad_norm": 0.9999242424964905, "learning_rate": 3.591527968937144e-06, "loss": 0.4535, "num_input_tokens_seen": 122647520, "step": 100850 }, { "epoch": 12.636887608069165, "grad_norm": 0.8051192760467529, "learning_rate": 3.591003403149261e-06, "loss": 0.4631, "num_input_tokens_seen": 122653696, "step": 100855 }, { "epoch": 12.637514095977947, "grad_norm": 0.5031642317771912, "learning_rate": 3.590478854207116e-06, "loss": 0.4458, "num_input_tokens_seen": 122659584, "step": 100860 }, { "epoch": 12.638140583886731, "grad_norm": 2.137739419937134, "learning_rate": 3.589954322116981e-06, "loss": 0.4559, "num_input_tokens_seen": 122666112, "step": 100865 }, { "epoch": 12.638767071795515, "grad_norm": 0.4574773609638214, "learning_rate": 3.5894298068851285e-06, "loss": 0.4563, "num_input_tokens_seen": 122672480, "step": 100870 }, { "epoch": 12.639393559704297, "grad_norm": 1.1634819507598877, "learning_rate": 3.588905308517827e-06, "loss": 0.4548, "num_input_tokens_seen": 122678528, "step": 100875 }, { "epoch": 12.640020047613081, "grad_norm": 0.8802534341812134, "learning_rate": 3.5883808270213514e-06, "loss": 0.4478, "num_input_tokens_seen": 122684736, "step": 100880 }, { "epoch": 12.640646535521864, "grad_norm": 0.49157020449638367, "learning_rate": 3.587856362401968e-06, "loss": 0.4528, "num_input_tokens_seen": 122691072, "step": 100885 }, { "epoch": 12.641273023430648, "grad_norm": 0.7005857825279236, "learning_rate": 3.587331914665951e-06, "loss": 0.4701, "num_input_tokens_seen": 122697216, "step": 100890 }, { "epoch": 12.641899511339432, "grad_norm": 0.7541335821151733, "learning_rate": 3.5868074838195665e-06, "loss": 0.4534, "num_input_tokens_seen": 122703360, "step": 100895 }, { "epoch": 12.642525999248214, "grad_norm": 0.5787824392318726, "learning_rate": 3.5862830698690886e-06, "loss": 0.4706, "num_input_tokens_seen": 122709312, "step": 100900 }, { "epoch": 12.643152487156998, "grad_norm": 0.7018730640411377, "learning_rate": 3.585758672820784e-06, "loss": 0.4643, "num_input_tokens_seen": 122715392, "step": 100905 }, { "epoch": 12.643778975065782, "grad_norm": 2.0611276626586914, "learning_rate": 3.5852342926809237e-06, "loss": 0.4504, "num_input_tokens_seen": 122721696, "step": 100910 }, { "epoch": 12.644405462974564, "grad_norm": 0.8870927691459656, "learning_rate": 3.5847099294557762e-06, "loss": 0.4666, "num_input_tokens_seen": 122728032, "step": 100915 }, { "epoch": 12.645031950883348, "grad_norm": 0.6022337079048157, "learning_rate": 3.584185583151613e-06, "loss": 0.4435, "num_input_tokens_seen": 122734080, "step": 100920 }, { "epoch": 12.64565843879213, "grad_norm": 4.532413959503174, "learning_rate": 3.5836612537746993e-06, "loss": 0.5513, "num_input_tokens_seen": 122740448, "step": 100925 }, { "epoch": 12.646284926700915, "grad_norm": 0.6348663568496704, "learning_rate": 3.5831369413313065e-06, "loss": 0.5231, "num_input_tokens_seen": 122746816, "step": 100930 }, { "epoch": 12.646911414609699, "grad_norm": 4.387592315673828, "learning_rate": 3.5826126458277048e-06, "loss": 0.4662, "num_input_tokens_seen": 122752960, "step": 100935 }, { "epoch": 12.647537902518481, "grad_norm": 0.7266507148742676, "learning_rate": 3.582088367270159e-06, "loss": 0.4724, "num_input_tokens_seen": 122759264, "step": 100940 }, { "epoch": 12.648164390427265, "grad_norm": 0.870486319065094, "learning_rate": 3.58156410566494e-06, "loss": 0.4774, "num_input_tokens_seen": 122765248, "step": 100945 }, { "epoch": 12.648790878336047, "grad_norm": 0.7857332825660706, "learning_rate": 3.5810398610183135e-06, "loss": 0.4545, "num_input_tokens_seen": 122771552, "step": 100950 }, { "epoch": 12.649417366244831, "grad_norm": 0.9670165181159973, "learning_rate": 3.58051563333655e-06, "loss": 0.4496, "num_input_tokens_seen": 122777920, "step": 100955 }, { "epoch": 12.650043854153616, "grad_norm": 1.669022798538208, "learning_rate": 3.5799914226259138e-06, "loss": 0.4453, "num_input_tokens_seen": 122784128, "step": 100960 }, { "epoch": 12.650670342062398, "grad_norm": 0.7232565879821777, "learning_rate": 3.579467228892675e-06, "loss": 0.4528, "num_input_tokens_seen": 122790176, "step": 100965 }, { "epoch": 12.651296829971182, "grad_norm": 0.9967702031135559, "learning_rate": 3.5789430521430986e-06, "loss": 0.4607, "num_input_tokens_seen": 122796128, "step": 100970 }, { "epoch": 12.651923317879964, "grad_norm": 1.0339505672454834, "learning_rate": 3.578418892383455e-06, "loss": 0.4644, "num_input_tokens_seen": 122802144, "step": 100975 }, { "epoch": 12.652549805788748, "grad_norm": 0.5306000113487244, "learning_rate": 3.5778947496200065e-06, "loss": 0.4813, "num_input_tokens_seen": 122807680, "step": 100980 }, { "epoch": 12.653176293697532, "grad_norm": 0.9367247223854065, "learning_rate": 3.5773706238590245e-06, "loss": 0.4898, "num_input_tokens_seen": 122813440, "step": 100985 }, { "epoch": 12.653802781606315, "grad_norm": 10.100194931030273, "learning_rate": 3.57684651510677e-06, "loss": 0.5345, "num_input_tokens_seen": 122819584, "step": 100990 }, { "epoch": 12.654429269515099, "grad_norm": 1.0558571815490723, "learning_rate": 3.5763224233695123e-06, "loss": 0.4742, "num_input_tokens_seen": 122825472, "step": 100995 }, { "epoch": 12.655055757423881, "grad_norm": 0.654006838798523, "learning_rate": 3.5757983486535186e-06, "loss": 0.4676, "num_input_tokens_seen": 122831712, "step": 101000 }, { "epoch": 12.655682245332665, "grad_norm": 0.5937505960464478, "learning_rate": 3.5752742909650518e-06, "loss": 0.4581, "num_input_tokens_seen": 122837568, "step": 101005 }, { "epoch": 12.656308733241449, "grad_norm": 5.300865650177002, "learning_rate": 3.5747502503103798e-06, "loss": 0.4878, "num_input_tokens_seen": 122843840, "step": 101010 }, { "epoch": 12.656935221150231, "grad_norm": 0.3936266601085663, "learning_rate": 3.5742262266957652e-06, "loss": 0.4676, "num_input_tokens_seen": 122850208, "step": 101015 }, { "epoch": 12.657561709059015, "grad_norm": 1.2413973808288574, "learning_rate": 3.573702220127476e-06, "loss": 0.4764, "num_input_tokens_seen": 122856320, "step": 101020 }, { "epoch": 12.6581881969678, "grad_norm": 1.3579717874526978, "learning_rate": 3.5731782306117746e-06, "loss": 0.4628, "num_input_tokens_seen": 122862336, "step": 101025 }, { "epoch": 12.658814684876582, "grad_norm": 2.0598561763763428, "learning_rate": 3.5726542581549295e-06, "loss": 0.4723, "num_input_tokens_seen": 122868736, "step": 101030 }, { "epoch": 12.659441172785366, "grad_norm": 0.44460245966911316, "learning_rate": 3.5721303027632005e-06, "loss": 0.4522, "num_input_tokens_seen": 122874464, "step": 101035 }, { "epoch": 12.660067660694148, "grad_norm": 0.6807486414909363, "learning_rate": 3.571606364442857e-06, "loss": 0.4526, "num_input_tokens_seen": 122880992, "step": 101040 }, { "epoch": 12.660694148602932, "grad_norm": 0.434926837682724, "learning_rate": 3.571082443200157e-06, "loss": 0.4606, "num_input_tokens_seen": 122886976, "step": 101045 }, { "epoch": 12.661320636511716, "grad_norm": 0.5530365705490112, "learning_rate": 3.5705585390413685e-06, "loss": 0.4653, "num_input_tokens_seen": 122893088, "step": 101050 }, { "epoch": 12.661947124420498, "grad_norm": 0.6745936870574951, "learning_rate": 3.5700346519727556e-06, "loss": 0.4619, "num_input_tokens_seen": 122899104, "step": 101055 }, { "epoch": 12.662573612329282, "grad_norm": 0.7607670426368713, "learning_rate": 3.5695107820005796e-06, "loss": 0.466, "num_input_tokens_seen": 122905472, "step": 101060 }, { "epoch": 12.663200100238065, "grad_norm": 1.0741894245147705, "learning_rate": 3.5689869291311075e-06, "loss": 0.46, "num_input_tokens_seen": 122911424, "step": 101065 }, { "epoch": 12.663826588146849, "grad_norm": 0.5770478844642639, "learning_rate": 3.5684630933705965e-06, "loss": 0.4696, "num_input_tokens_seen": 122917632, "step": 101070 }, { "epoch": 12.664453076055633, "grad_norm": 0.8054764270782471, "learning_rate": 3.5679392747253144e-06, "loss": 0.4551, "num_input_tokens_seen": 122923840, "step": 101075 }, { "epoch": 12.665079563964415, "grad_norm": 0.6149209141731262, "learning_rate": 3.567415473201521e-06, "loss": 0.4495, "num_input_tokens_seen": 122930400, "step": 101080 }, { "epoch": 12.6657060518732, "grad_norm": 0.6973726153373718, "learning_rate": 3.5668916888054826e-06, "loss": 0.4575, "num_input_tokens_seen": 122936128, "step": 101085 }, { "epoch": 12.666332539781981, "grad_norm": 0.3721894919872284, "learning_rate": 3.5663679215434566e-06, "loss": 0.4578, "num_input_tokens_seen": 122941984, "step": 101090 }, { "epoch": 12.666959027690766, "grad_norm": 0.6057558655738831, "learning_rate": 3.5658441714217094e-06, "loss": 0.4604, "num_input_tokens_seen": 122947968, "step": 101095 }, { "epoch": 12.66758551559955, "grad_norm": 0.6202504634857178, "learning_rate": 3.565320438446499e-06, "loss": 0.4698, "num_input_tokens_seen": 122954048, "step": 101100 }, { "epoch": 12.668212003508332, "grad_norm": 0.45818862318992615, "learning_rate": 3.564796722624091e-06, "loss": 0.4418, "num_input_tokens_seen": 122960128, "step": 101105 }, { "epoch": 12.668838491417116, "grad_norm": 0.6341710686683655, "learning_rate": 3.5642730239607424e-06, "loss": 0.4662, "num_input_tokens_seen": 122966464, "step": 101110 }, { "epoch": 12.669464979325898, "grad_norm": 0.5120534300804138, "learning_rate": 3.5637493424627166e-06, "loss": 0.4625, "num_input_tokens_seen": 122972448, "step": 101115 }, { "epoch": 12.670091467234682, "grad_norm": 1.690399169921875, "learning_rate": 3.5632256781362785e-06, "loss": 0.4693, "num_input_tokens_seen": 122978336, "step": 101120 }, { "epoch": 12.670717955143466, "grad_norm": 0.8727991580963135, "learning_rate": 3.562702030987682e-06, "loss": 0.4604, "num_input_tokens_seen": 122984672, "step": 101125 }, { "epoch": 12.671344443052249, "grad_norm": 0.3894404172897339, "learning_rate": 3.562178401023193e-06, "loss": 0.4721, "num_input_tokens_seen": 122990912, "step": 101130 }, { "epoch": 12.671970930961033, "grad_norm": 0.6358688473701477, "learning_rate": 3.5616547882490684e-06, "loss": 0.466, "num_input_tokens_seen": 122996896, "step": 101135 }, { "epoch": 12.672597418869817, "grad_norm": 0.2837803065776825, "learning_rate": 3.561131192671572e-06, "loss": 0.4544, "num_input_tokens_seen": 123002944, "step": 101140 }, { "epoch": 12.673223906778599, "grad_norm": 5.864727973937988, "learning_rate": 3.56060761429696e-06, "loss": 0.4755, "num_input_tokens_seen": 123009152, "step": 101145 }, { "epoch": 12.673850394687383, "grad_norm": 0.7113412618637085, "learning_rate": 3.5600840531314973e-06, "loss": 0.4619, "num_input_tokens_seen": 123015392, "step": 101150 }, { "epoch": 12.674476882596165, "grad_norm": 0.45701873302459717, "learning_rate": 3.5595605091814374e-06, "loss": 0.4587, "num_input_tokens_seen": 123021792, "step": 101155 }, { "epoch": 12.67510337050495, "grad_norm": 0.4519546329975128, "learning_rate": 3.5590369824530447e-06, "loss": 0.4573, "num_input_tokens_seen": 123027808, "step": 101160 }, { "epoch": 12.675729858413732, "grad_norm": 0.5439101457595825, "learning_rate": 3.5585134729525737e-06, "loss": 0.451, "num_input_tokens_seen": 123033824, "step": 101165 }, { "epoch": 12.676356346322516, "grad_norm": 0.44745633006095886, "learning_rate": 3.5579899806862883e-06, "loss": 0.459, "num_input_tokens_seen": 123039680, "step": 101170 }, { "epoch": 12.6769828342313, "grad_norm": 0.4239732623100281, "learning_rate": 3.5574665056604436e-06, "loss": 0.4662, "num_input_tokens_seen": 123045696, "step": 101175 }, { "epoch": 12.677609322140082, "grad_norm": 1.3502082824707031, "learning_rate": 3.5569430478813007e-06, "loss": 0.4725, "num_input_tokens_seen": 123051264, "step": 101180 }, { "epoch": 12.678235810048866, "grad_norm": 0.357617050409317, "learning_rate": 3.5564196073551165e-06, "loss": 0.458, "num_input_tokens_seen": 123057504, "step": 101185 }, { "epoch": 12.67886229795765, "grad_norm": 0.6636757254600525, "learning_rate": 3.555896184088149e-06, "loss": 0.4623, "num_input_tokens_seen": 123063488, "step": 101190 }, { "epoch": 12.679488785866432, "grad_norm": 0.47964224219322205, "learning_rate": 3.5553727780866587e-06, "loss": 0.4545, "num_input_tokens_seen": 123069504, "step": 101195 }, { "epoch": 12.680115273775217, "grad_norm": 0.5490251779556274, "learning_rate": 3.554849389356899e-06, "loss": 0.4497, "num_input_tokens_seen": 123075840, "step": 101200 }, { "epoch": 12.680741761683999, "grad_norm": 0.3587533235549927, "learning_rate": 3.5543260179051333e-06, "loss": 0.4653, "num_input_tokens_seen": 123081440, "step": 101205 }, { "epoch": 12.681368249592783, "grad_norm": 0.5933510065078735, "learning_rate": 3.553802663737612e-06, "loss": 0.4606, "num_input_tokens_seen": 123087680, "step": 101210 }, { "epoch": 12.681994737501567, "grad_norm": 0.5055623054504395, "learning_rate": 3.553279326860598e-06, "loss": 0.4817, "num_input_tokens_seen": 123094080, "step": 101215 }, { "epoch": 12.68262122541035, "grad_norm": 0.5492262244224548, "learning_rate": 3.552756007280344e-06, "loss": 0.4458, "num_input_tokens_seen": 123100224, "step": 101220 }, { "epoch": 12.683247713319133, "grad_norm": 0.5445957779884338, "learning_rate": 3.5522327050031103e-06, "loss": 0.4585, "num_input_tokens_seen": 123106336, "step": 101225 }, { "epoch": 12.683874201227916, "grad_norm": 0.5503641963005066, "learning_rate": 3.55170942003515e-06, "loss": 0.4822, "num_input_tokens_seen": 123112416, "step": 101230 }, { "epoch": 12.6845006891367, "grad_norm": 0.5441524982452393, "learning_rate": 3.551186152382723e-06, "loss": 0.4514, "num_input_tokens_seen": 123118656, "step": 101235 }, { "epoch": 12.685127177045484, "grad_norm": 1.6460798978805542, "learning_rate": 3.5506629020520833e-06, "loss": 0.4707, "num_input_tokens_seen": 123124864, "step": 101240 }, { "epoch": 12.685753664954266, "grad_norm": 0.5961933732032776, "learning_rate": 3.550139669049486e-06, "loss": 0.4569, "num_input_tokens_seen": 123131200, "step": 101245 }, { "epoch": 12.68638015286305, "grad_norm": 1.2357239723205566, "learning_rate": 3.54961645338119e-06, "loss": 0.4473, "num_input_tokens_seen": 123137216, "step": 101250 }, { "epoch": 12.687006640771834, "grad_norm": 0.3518643081188202, "learning_rate": 3.5490932550534467e-06, "loss": 0.4652, "num_input_tokens_seen": 123143264, "step": 101255 }, { "epoch": 12.687633128680616, "grad_norm": 0.7788282036781311, "learning_rate": 3.548570074072516e-06, "loss": 0.4719, "num_input_tokens_seen": 123149824, "step": 101260 }, { "epoch": 12.6882596165894, "grad_norm": 0.29881584644317627, "learning_rate": 3.5480469104446473e-06, "loss": 0.4579, "num_input_tokens_seen": 123156288, "step": 101265 }, { "epoch": 12.688886104498183, "grad_norm": 0.6393423080444336, "learning_rate": 3.5475237641761013e-06, "loss": 0.4595, "num_input_tokens_seen": 123162656, "step": 101270 }, { "epoch": 12.689512592406967, "grad_norm": 0.32256969809532166, "learning_rate": 3.547000635273128e-06, "loss": 0.4583, "num_input_tokens_seen": 123168480, "step": 101275 }, { "epoch": 12.690139080315749, "grad_norm": 0.5804187655448914, "learning_rate": 3.546477523741984e-06, "loss": 0.4665, "num_input_tokens_seen": 123174624, "step": 101280 }, { "epoch": 12.690765568224533, "grad_norm": 0.8033111691474915, "learning_rate": 3.5459544295889227e-06, "loss": 0.4661, "num_input_tokens_seen": 123180864, "step": 101285 }, { "epoch": 12.691392056133317, "grad_norm": 1.0366100072860718, "learning_rate": 3.5454313528202004e-06, "loss": 0.4626, "num_input_tokens_seen": 123187200, "step": 101290 }, { "epoch": 12.6920185440421, "grad_norm": 0.34979331493377686, "learning_rate": 3.5449082934420674e-06, "loss": 0.4633, "num_input_tokens_seen": 123193216, "step": 101295 }, { "epoch": 12.692645031950883, "grad_norm": 2.2682530879974365, "learning_rate": 3.544385251460779e-06, "loss": 0.4702, "num_input_tokens_seen": 123199296, "step": 101300 }, { "epoch": 12.693271519859668, "grad_norm": 0.6115771532058716, "learning_rate": 3.543862226882591e-06, "loss": 0.4786, "num_input_tokens_seen": 123205376, "step": 101305 }, { "epoch": 12.69389800776845, "grad_norm": 0.5289609432220459, "learning_rate": 3.5433392197137524e-06, "loss": 0.4827, "num_input_tokens_seen": 123211808, "step": 101310 }, { "epoch": 12.694524495677234, "grad_norm": 9.044922828674316, "learning_rate": 3.5428162299605194e-06, "loss": 0.5167, "num_input_tokens_seen": 123217664, "step": 101315 }, { "epoch": 12.695150983586016, "grad_norm": 0.37618106603622437, "learning_rate": 3.5422932576291426e-06, "loss": 0.4558, "num_input_tokens_seen": 123223744, "step": 101320 }, { "epoch": 12.6957774714948, "grad_norm": 0.32592809200286865, "learning_rate": 3.5417703027258752e-06, "loss": 0.4694, "num_input_tokens_seen": 123229984, "step": 101325 }, { "epoch": 12.696403959403584, "grad_norm": 0.49459972977638245, "learning_rate": 3.5412473652569714e-06, "loss": 0.4671, "num_input_tokens_seen": 123236032, "step": 101330 }, { "epoch": 12.697030447312367, "grad_norm": 0.34406787157058716, "learning_rate": 3.5407244452286806e-06, "loss": 0.4634, "num_input_tokens_seen": 123242240, "step": 101335 }, { "epoch": 12.69765693522115, "grad_norm": 0.34095990657806396, "learning_rate": 3.5402015426472554e-06, "loss": 0.4662, "num_input_tokens_seen": 123248448, "step": 101340 }, { "epoch": 12.698283423129933, "grad_norm": 0.8586322069168091, "learning_rate": 3.53967865751895e-06, "loss": 0.4655, "num_input_tokens_seen": 123254944, "step": 101345 }, { "epoch": 12.698909911038717, "grad_norm": 0.5370267033576965, "learning_rate": 3.539155789850012e-06, "loss": 0.4586, "num_input_tokens_seen": 123261120, "step": 101350 }, { "epoch": 12.699536398947501, "grad_norm": 0.3465731739997864, "learning_rate": 3.5386329396466966e-06, "loss": 0.4507, "num_input_tokens_seen": 123267488, "step": 101355 }, { "epoch": 12.700162886856283, "grad_norm": 0.9107437133789062, "learning_rate": 3.5381101069152514e-06, "loss": 0.4624, "num_input_tokens_seen": 123273664, "step": 101360 }, { "epoch": 12.700789374765067, "grad_norm": 0.3407723605632782, "learning_rate": 3.5375872916619287e-06, "loss": 0.4483, "num_input_tokens_seen": 123280192, "step": 101365 }, { "epoch": 12.70141586267385, "grad_norm": 0.5621222257614136, "learning_rate": 3.5370644938929818e-06, "loss": 0.4635, "num_input_tokens_seen": 123286144, "step": 101370 }, { "epoch": 12.702042350582634, "grad_norm": 0.6185240745544434, "learning_rate": 3.536541713614656e-06, "loss": 0.4673, "num_input_tokens_seen": 123292032, "step": 101375 }, { "epoch": 12.702668838491418, "grad_norm": 0.3263372480869293, "learning_rate": 3.5360189508332064e-06, "loss": 0.4553, "num_input_tokens_seen": 123298240, "step": 101380 }, { "epoch": 12.7032953264002, "grad_norm": 0.41266748309135437, "learning_rate": 3.5354962055548802e-06, "loss": 0.4496, "num_input_tokens_seen": 123303488, "step": 101385 }, { "epoch": 12.703921814308984, "grad_norm": 0.30441880226135254, "learning_rate": 3.534973477785929e-06, "loss": 0.458, "num_input_tokens_seen": 123309536, "step": 101390 }, { "epoch": 12.704548302217766, "grad_norm": 0.2525253891944885, "learning_rate": 3.5344507675326e-06, "loss": 0.4503, "num_input_tokens_seen": 123315040, "step": 101395 }, { "epoch": 12.70517479012655, "grad_norm": 0.4538530111312866, "learning_rate": 3.5339280748011464e-06, "loss": 0.4646, "num_input_tokens_seen": 123320608, "step": 101400 }, { "epoch": 12.705801278035334, "grad_norm": 0.3888002336025238, "learning_rate": 3.5334053995978135e-06, "loss": 0.4501, "num_input_tokens_seen": 123326912, "step": 101405 }, { "epoch": 12.706427765944117, "grad_norm": 1.2254289388656616, "learning_rate": 3.5328827419288534e-06, "loss": 0.4625, "num_input_tokens_seen": 123332864, "step": 101410 }, { "epoch": 12.7070542538529, "grad_norm": 0.36246323585510254, "learning_rate": 3.532360101800513e-06, "loss": 0.4693, "num_input_tokens_seen": 123338784, "step": 101415 }, { "epoch": 12.707680741761685, "grad_norm": 0.45704397559165955, "learning_rate": 3.531837479219042e-06, "loss": 0.4568, "num_input_tokens_seen": 123344896, "step": 101420 }, { "epoch": 12.708307229670467, "grad_norm": 0.30846190452575684, "learning_rate": 3.5313148741906874e-06, "loss": 0.4677, "num_input_tokens_seen": 123351264, "step": 101425 }, { "epoch": 12.708933717579251, "grad_norm": 0.4591214954853058, "learning_rate": 3.530792286721698e-06, "loss": 0.4619, "num_input_tokens_seen": 123357312, "step": 101430 }, { "epoch": 12.709560205488033, "grad_norm": 0.4837477207183838, "learning_rate": 3.5302697168183225e-06, "loss": 0.4585, "num_input_tokens_seen": 123363616, "step": 101435 }, { "epoch": 12.710186693396818, "grad_norm": 0.4509445130825043, "learning_rate": 3.5297471644868075e-06, "loss": 0.4484, "num_input_tokens_seen": 123369792, "step": 101440 }, { "epoch": 12.710813181305602, "grad_norm": 0.5479472279548645, "learning_rate": 3.5292246297334034e-06, "loss": 0.4592, "num_input_tokens_seen": 123375968, "step": 101445 }, { "epoch": 12.711439669214384, "grad_norm": 0.3997727930545807, "learning_rate": 3.5287021125643533e-06, "loss": 0.4632, "num_input_tokens_seen": 123382016, "step": 101450 }, { "epoch": 12.712066157123168, "grad_norm": 0.4014122784137726, "learning_rate": 3.5281796129859088e-06, "loss": 0.4645, "num_input_tokens_seen": 123388064, "step": 101455 }, { "epoch": 12.71269264503195, "grad_norm": 0.41843852400779724, "learning_rate": 3.5276571310043115e-06, "loss": 0.4566, "num_input_tokens_seen": 123394208, "step": 101460 }, { "epoch": 12.713319132940734, "grad_norm": 0.3529820144176483, "learning_rate": 3.5271346666258145e-06, "loss": 0.4692, "num_input_tokens_seen": 123400416, "step": 101465 }, { "epoch": 12.713945620849518, "grad_norm": 0.3361992835998535, "learning_rate": 3.5266122198566576e-06, "loss": 0.4619, "num_input_tokens_seen": 123406464, "step": 101470 }, { "epoch": 12.7145721087583, "grad_norm": 0.6297674775123596, "learning_rate": 3.526089790703092e-06, "loss": 0.4454, "num_input_tokens_seen": 123412288, "step": 101475 }, { "epoch": 12.715198596667085, "grad_norm": 0.4673093557357788, "learning_rate": 3.5255673791713625e-06, "loss": 0.4589, "num_input_tokens_seen": 123418592, "step": 101480 }, { "epoch": 12.715825084575867, "grad_norm": 0.40079259872436523, "learning_rate": 3.5250449852677125e-06, "loss": 0.4612, "num_input_tokens_seen": 123424480, "step": 101485 }, { "epoch": 12.716451572484651, "grad_norm": 0.4715852439403534, "learning_rate": 3.524522608998392e-06, "loss": 0.4591, "num_input_tokens_seen": 123430432, "step": 101490 }, { "epoch": 12.717078060393435, "grad_norm": 0.46882563829421997, "learning_rate": 3.5240002503696425e-06, "loss": 0.4527, "num_input_tokens_seen": 123436320, "step": 101495 }, { "epoch": 12.717704548302217, "grad_norm": 0.38593125343322754, "learning_rate": 3.5234779093877137e-06, "loss": 0.4541, "num_input_tokens_seen": 123442560, "step": 101500 }, { "epoch": 12.718331036211001, "grad_norm": 0.33853083848953247, "learning_rate": 3.522955586058845e-06, "loss": 0.4566, "num_input_tokens_seen": 123448864, "step": 101505 }, { "epoch": 12.718957524119784, "grad_norm": 0.3742143511772156, "learning_rate": 3.522433280389286e-06, "loss": 0.4499, "num_input_tokens_seen": 123455040, "step": 101510 }, { "epoch": 12.719584012028568, "grad_norm": 0.3998810350894928, "learning_rate": 3.5219109923852778e-06, "loss": 0.4605, "num_input_tokens_seen": 123460896, "step": 101515 }, { "epoch": 12.720210499937352, "grad_norm": 0.8749526143074036, "learning_rate": 3.5213887220530686e-06, "loss": 0.4798, "num_input_tokens_seen": 123467136, "step": 101520 }, { "epoch": 12.720836987846134, "grad_norm": 0.3413683772087097, "learning_rate": 3.520866469398898e-06, "loss": 0.458, "num_input_tokens_seen": 123473216, "step": 101525 }, { "epoch": 12.721463475754918, "grad_norm": 0.3905704915523529, "learning_rate": 3.520344234429014e-06, "loss": 0.4823, "num_input_tokens_seen": 123479648, "step": 101530 }, { "epoch": 12.722089963663702, "grad_norm": 0.3501685857772827, "learning_rate": 3.519822017149658e-06, "loss": 0.4678, "num_input_tokens_seen": 123485728, "step": 101535 }, { "epoch": 12.722716451572484, "grad_norm": 0.8908385038375854, "learning_rate": 3.5192998175670746e-06, "loss": 0.4724, "num_input_tokens_seen": 123491904, "step": 101540 }, { "epoch": 12.723342939481268, "grad_norm": 0.49343809485435486, "learning_rate": 3.5187776356875057e-06, "loss": 0.4498, "num_input_tokens_seen": 123497248, "step": 101545 }, { "epoch": 12.72396942739005, "grad_norm": 0.3042260408401489, "learning_rate": 3.518255471517195e-06, "loss": 0.4476, "num_input_tokens_seen": 123502944, "step": 101550 }, { "epoch": 12.724595915298835, "grad_norm": 0.5046653151512146, "learning_rate": 3.517733325062389e-06, "loss": 0.4607, "num_input_tokens_seen": 123509120, "step": 101555 }, { "epoch": 12.725222403207619, "grad_norm": 0.4884108901023865, "learning_rate": 3.5172111963293242e-06, "loss": 0.4646, "num_input_tokens_seen": 123515456, "step": 101560 }, { "epoch": 12.725848891116401, "grad_norm": 0.5351448059082031, "learning_rate": 3.5166890853242487e-06, "loss": 0.4584, "num_input_tokens_seen": 123521472, "step": 101565 }, { "epoch": 12.726475379025185, "grad_norm": 0.39642736315727234, "learning_rate": 3.5161669920534004e-06, "loss": 0.4696, "num_input_tokens_seen": 123527264, "step": 101570 }, { "epoch": 12.727101866933968, "grad_norm": 0.3885650038719177, "learning_rate": 3.5156449165230256e-06, "loss": 0.4642, "num_input_tokens_seen": 123533280, "step": 101575 }, { "epoch": 12.727728354842752, "grad_norm": 0.333368718624115, "learning_rate": 3.515122858739361e-06, "loss": 0.4723, "num_input_tokens_seen": 123539552, "step": 101580 }, { "epoch": 12.728354842751536, "grad_norm": 0.530949592590332, "learning_rate": 3.5146008187086523e-06, "loss": 0.461, "num_input_tokens_seen": 123545664, "step": 101585 }, { "epoch": 12.728981330660318, "grad_norm": 0.4116237759590149, "learning_rate": 3.5140787964371394e-06, "loss": 0.4757, "num_input_tokens_seen": 123551488, "step": 101590 }, { "epoch": 12.729607818569102, "grad_norm": 1.123557448387146, "learning_rate": 3.5135567919310647e-06, "loss": 0.4724, "num_input_tokens_seen": 123557664, "step": 101595 }, { "epoch": 12.730234306477884, "grad_norm": 0.24782752990722656, "learning_rate": 3.5130348051966667e-06, "loss": 0.452, "num_input_tokens_seen": 123563136, "step": 101600 }, { "epoch": 12.730860794386668, "grad_norm": 0.24772688746452332, "learning_rate": 3.5125128362401895e-06, "loss": 0.4745, "num_input_tokens_seen": 123568544, "step": 101605 }, { "epoch": 12.731487282295452, "grad_norm": 0.5371416211128235, "learning_rate": 3.511990885067871e-06, "loss": 0.4466, "num_input_tokens_seen": 123574272, "step": 101610 }, { "epoch": 12.732113770204235, "grad_norm": 0.5912415981292725, "learning_rate": 3.5114689516859513e-06, "loss": 0.4523, "num_input_tokens_seen": 123580416, "step": 101615 }, { "epoch": 12.732740258113019, "grad_norm": 0.34845399856567383, "learning_rate": 3.510947036100674e-06, "loss": 0.4661, "num_input_tokens_seen": 123585984, "step": 101620 }, { "epoch": 12.733366746021801, "grad_norm": 0.6593161225318909, "learning_rate": 3.5104251383182754e-06, "loss": 0.448, "num_input_tokens_seen": 123592320, "step": 101625 }, { "epoch": 12.733993233930585, "grad_norm": 0.5934200882911682, "learning_rate": 3.5099032583449984e-06, "loss": 0.4625, "num_input_tokens_seen": 123598560, "step": 101630 }, { "epoch": 12.734619721839369, "grad_norm": 0.7725238800048828, "learning_rate": 3.5093813961870783e-06, "loss": 0.4535, "num_input_tokens_seen": 123605024, "step": 101635 }, { "epoch": 12.735246209748151, "grad_norm": 0.47305089235305786, "learning_rate": 3.5088595518507585e-06, "loss": 0.4635, "num_input_tokens_seen": 123610784, "step": 101640 }, { "epoch": 12.735872697656935, "grad_norm": 0.7359774708747864, "learning_rate": 3.5083377253422745e-06, "loss": 0.4598, "num_input_tokens_seen": 123616928, "step": 101645 }, { "epoch": 12.73649918556572, "grad_norm": 0.3236864507198334, "learning_rate": 3.5078159166678705e-06, "loss": 0.4637, "num_input_tokens_seen": 123622624, "step": 101650 }, { "epoch": 12.737125673474502, "grad_norm": 0.5234194397926331, "learning_rate": 3.507294125833779e-06, "loss": 0.4593, "num_input_tokens_seen": 123629056, "step": 101655 }, { "epoch": 12.737752161383286, "grad_norm": 0.4861714243888855, "learning_rate": 3.5067723528462428e-06, "loss": 0.4694, "num_input_tokens_seen": 123635136, "step": 101660 }, { "epoch": 12.738378649292068, "grad_norm": 0.5490262508392334, "learning_rate": 3.5062505977114967e-06, "loss": 0.4621, "num_input_tokens_seen": 123641152, "step": 101665 }, { "epoch": 12.739005137200852, "grad_norm": 0.7977192997932434, "learning_rate": 3.5057288604357813e-06, "loss": 0.4517, "num_input_tokens_seen": 123647584, "step": 101670 }, { "epoch": 12.739631625109636, "grad_norm": 0.43572816252708435, "learning_rate": 3.505207141025335e-06, "loss": 0.4479, "num_input_tokens_seen": 123653856, "step": 101675 }, { "epoch": 12.740258113018418, "grad_norm": 0.6413635015487671, "learning_rate": 3.504685439486393e-06, "loss": 0.441, "num_input_tokens_seen": 123660352, "step": 101680 }, { "epoch": 12.740884600927203, "grad_norm": 0.5457260012626648, "learning_rate": 3.504163755825195e-06, "loss": 0.4561, "num_input_tokens_seen": 123666400, "step": 101685 }, { "epoch": 12.741511088835985, "grad_norm": 0.36531946063041687, "learning_rate": 3.503642090047975e-06, "loss": 0.4624, "num_input_tokens_seen": 123671392, "step": 101690 }, { "epoch": 12.742137576744769, "grad_norm": 0.5377457141876221, "learning_rate": 3.5031204421609733e-06, "loss": 0.4597, "num_input_tokens_seen": 123677568, "step": 101695 }, { "epoch": 12.742764064653553, "grad_norm": 0.6543101668357849, "learning_rate": 3.5025988121704235e-06, "loss": 0.4537, "num_input_tokens_seen": 123683968, "step": 101700 }, { "epoch": 12.743390552562335, "grad_norm": 0.5830851793289185, "learning_rate": 3.5020772000825664e-06, "loss": 0.4573, "num_input_tokens_seen": 123690016, "step": 101705 }, { "epoch": 12.74401704047112, "grad_norm": 0.4000169336795807, "learning_rate": 3.5015556059036332e-06, "loss": 0.4582, "num_input_tokens_seen": 123695840, "step": 101710 }, { "epoch": 12.744643528379902, "grad_norm": 0.31064048409461975, "learning_rate": 3.501034029639865e-06, "loss": 0.4644, "num_input_tokens_seen": 123701440, "step": 101715 }, { "epoch": 12.745270016288686, "grad_norm": 1.0333640575408936, "learning_rate": 3.500512471297493e-06, "loss": 0.4432, "num_input_tokens_seen": 123707488, "step": 101720 }, { "epoch": 12.74589650419747, "grad_norm": 0.5462929010391235, "learning_rate": 3.4999909308827573e-06, "loss": 0.4619, "num_input_tokens_seen": 123713568, "step": 101725 }, { "epoch": 12.746522992106252, "grad_norm": 0.5571595430374146, "learning_rate": 3.4994694084018893e-06, "loss": 0.4606, "num_input_tokens_seen": 123719584, "step": 101730 }, { "epoch": 12.747149480015036, "grad_norm": 0.5068851113319397, "learning_rate": 3.4989479038611258e-06, "loss": 0.4543, "num_input_tokens_seen": 123725536, "step": 101735 }, { "epoch": 12.747775967923818, "grad_norm": 1.7747300863265991, "learning_rate": 3.498426417266703e-06, "loss": 0.4736, "num_input_tokens_seen": 123731744, "step": 101740 }, { "epoch": 12.748402455832602, "grad_norm": 0.6339448690414429, "learning_rate": 3.497904948624855e-06, "loss": 0.4676, "num_input_tokens_seen": 123738112, "step": 101745 }, { "epoch": 12.749028943741386, "grad_norm": 0.7054805755615234, "learning_rate": 3.4973834979418163e-06, "loss": 0.4568, "num_input_tokens_seen": 123744160, "step": 101750 }, { "epoch": 12.749655431650169, "grad_norm": 0.5376635193824768, "learning_rate": 3.4968620652238207e-06, "loss": 0.4558, "num_input_tokens_seen": 123750592, "step": 101755 }, { "epoch": 12.750281919558953, "grad_norm": 0.5217346549034119, "learning_rate": 3.496340650477104e-06, "loss": 0.4619, "num_input_tokens_seen": 123756704, "step": 101760 }, { "epoch": 12.750908407467737, "grad_norm": 0.5945132374763489, "learning_rate": 3.495819253707897e-06, "loss": 0.46, "num_input_tokens_seen": 123762784, "step": 101765 }, { "epoch": 12.751534895376519, "grad_norm": 4.152393817901611, "learning_rate": 3.4952978749224387e-06, "loss": 0.4636, "num_input_tokens_seen": 123769184, "step": 101770 }, { "epoch": 12.752161383285303, "grad_norm": 1.0359816551208496, "learning_rate": 3.4947765141269564e-06, "loss": 0.4664, "num_input_tokens_seen": 123775136, "step": 101775 }, { "epoch": 12.752787871194085, "grad_norm": 0.4194660186767578, "learning_rate": 3.4942551713276885e-06, "loss": 0.4625, "num_input_tokens_seen": 123780448, "step": 101780 }, { "epoch": 12.75341435910287, "grad_norm": 0.6628751754760742, "learning_rate": 3.4937338465308646e-06, "loss": 0.4718, "num_input_tokens_seen": 123786496, "step": 101785 }, { "epoch": 12.754040847011652, "grad_norm": 0.5093795657157898, "learning_rate": 3.4932125397427193e-06, "loss": 0.4606, "num_input_tokens_seen": 123792896, "step": 101790 }, { "epoch": 12.754667334920436, "grad_norm": 0.6375194787979126, "learning_rate": 3.4926912509694837e-06, "loss": 0.4577, "num_input_tokens_seen": 123798400, "step": 101795 }, { "epoch": 12.75529382282922, "grad_norm": 0.6184926629066467, "learning_rate": 3.4921699802173924e-06, "loss": 0.4562, "num_input_tokens_seen": 123803872, "step": 101800 }, { "epoch": 12.755920310738002, "grad_norm": 0.4972846806049347, "learning_rate": 3.4916487274926776e-06, "loss": 0.4716, "num_input_tokens_seen": 123810112, "step": 101805 }, { "epoch": 12.756546798646786, "grad_norm": 1.5797598361968994, "learning_rate": 3.4911274928015694e-06, "loss": 0.4642, "num_input_tokens_seen": 123816384, "step": 101810 }, { "epoch": 12.75717328655557, "grad_norm": 0.5421385169029236, "learning_rate": 3.4906062761503017e-06, "loss": 0.4557, "num_input_tokens_seen": 123822880, "step": 101815 }, { "epoch": 12.757799774464353, "grad_norm": 0.6107172966003418, "learning_rate": 3.490085077545103e-06, "loss": 0.4585, "num_input_tokens_seen": 123828960, "step": 101820 }, { "epoch": 12.758426262373137, "grad_norm": 0.6121050715446472, "learning_rate": 3.4895638969922096e-06, "loss": 0.4495, "num_input_tokens_seen": 123835136, "step": 101825 }, { "epoch": 12.759052750281919, "grad_norm": 0.5682500004768372, "learning_rate": 3.489042734497847e-06, "loss": 0.4475, "num_input_tokens_seen": 123841216, "step": 101830 }, { "epoch": 12.759679238190703, "grad_norm": 0.6163602471351624, "learning_rate": 3.4885215900682513e-06, "loss": 0.4928, "num_input_tokens_seen": 123847584, "step": 101835 }, { "epoch": 12.760305726099487, "grad_norm": 0.4112185537815094, "learning_rate": 3.488000463709649e-06, "loss": 0.4502, "num_input_tokens_seen": 123853536, "step": 101840 }, { "epoch": 12.76093221400827, "grad_norm": 0.967512309551239, "learning_rate": 3.487479355428273e-06, "loss": 0.4574, "num_input_tokens_seen": 123860224, "step": 101845 }, { "epoch": 12.761558701917053, "grad_norm": 1.7889289855957031, "learning_rate": 3.4869582652303514e-06, "loss": 0.4483, "num_input_tokens_seen": 123866176, "step": 101850 }, { "epoch": 12.762185189825836, "grad_norm": 0.6427450776100159, "learning_rate": 3.4864371931221175e-06, "loss": 0.4705, "num_input_tokens_seen": 123872064, "step": 101855 }, { "epoch": 12.76281167773462, "grad_norm": 0.41956302523612976, "learning_rate": 3.485916139109799e-06, "loss": 0.4588, "num_input_tokens_seen": 123878112, "step": 101860 }, { "epoch": 12.763438165643404, "grad_norm": 0.6525378227233887, "learning_rate": 3.485395103199625e-06, "loss": 0.4924, "num_input_tokens_seen": 123883872, "step": 101865 }, { "epoch": 12.764064653552186, "grad_norm": 0.8410851359367371, "learning_rate": 3.4848740853978287e-06, "loss": 0.4507, "num_input_tokens_seen": 123890144, "step": 101870 }, { "epoch": 12.76469114146097, "grad_norm": 0.4170912504196167, "learning_rate": 3.4843530857106344e-06, "loss": 0.4597, "num_input_tokens_seen": 123896416, "step": 101875 }, { "epoch": 12.765317629369752, "grad_norm": 0.8490118384361267, "learning_rate": 3.4838321041442747e-06, "loss": 0.4676, "num_input_tokens_seen": 123902144, "step": 101880 }, { "epoch": 12.765944117278536, "grad_norm": 0.9802693724632263, "learning_rate": 3.4833111407049753e-06, "loss": 0.4615, "num_input_tokens_seen": 123908384, "step": 101885 }, { "epoch": 12.76657060518732, "grad_norm": 0.8769055008888245, "learning_rate": 3.4827901953989674e-06, "loss": 0.4482, "num_input_tokens_seen": 123914048, "step": 101890 }, { "epoch": 12.767197093096103, "grad_norm": 0.5296270251274109, "learning_rate": 3.4822692682324775e-06, "loss": 0.4623, "num_input_tokens_seen": 123920384, "step": 101895 }, { "epoch": 12.767823581004887, "grad_norm": 0.717876136302948, "learning_rate": 3.4817483592117356e-06, "loss": 0.4679, "num_input_tokens_seen": 123926656, "step": 101900 }, { "epoch": 12.768450068913669, "grad_norm": 0.8018010854721069, "learning_rate": 3.481227468342967e-06, "loss": 0.4596, "num_input_tokens_seen": 123932640, "step": 101905 }, { "epoch": 12.769076556822453, "grad_norm": 5.471245288848877, "learning_rate": 3.4807065956324036e-06, "loss": 0.4739, "num_input_tokens_seen": 123938944, "step": 101910 }, { "epoch": 12.769703044731237, "grad_norm": 1.156699776649475, "learning_rate": 3.4801857410862678e-06, "loss": 0.4612, "num_input_tokens_seen": 123945088, "step": 101915 }, { "epoch": 12.77032953264002, "grad_norm": 2.6525609493255615, "learning_rate": 3.479664904710789e-06, "loss": 0.4807, "num_input_tokens_seen": 123950784, "step": 101920 }, { "epoch": 12.770956020548804, "grad_norm": 1.0478332042694092, "learning_rate": 3.479144086512197e-06, "loss": 0.4649, "num_input_tokens_seen": 123956800, "step": 101925 }, { "epoch": 12.771582508457588, "grad_norm": 0.6180326342582703, "learning_rate": 3.4786232864967133e-06, "loss": 0.4533, "num_input_tokens_seen": 123962688, "step": 101930 }, { "epoch": 12.77220899636637, "grad_norm": 0.6224949955940247, "learning_rate": 3.4781025046705697e-06, "loss": 0.4595, "num_input_tokens_seen": 123968736, "step": 101935 }, { "epoch": 12.772835484275154, "grad_norm": 0.6217867732048035, "learning_rate": 3.477581741039988e-06, "loss": 0.4635, "num_input_tokens_seen": 123974848, "step": 101940 }, { "epoch": 12.773461972183936, "grad_norm": 0.5830503106117249, "learning_rate": 3.4770609956111984e-06, "loss": 0.4628, "num_input_tokens_seen": 123980928, "step": 101945 }, { "epoch": 12.77408846009272, "grad_norm": 0.35037463903427124, "learning_rate": 3.4765402683904242e-06, "loss": 0.4578, "num_input_tokens_seen": 123987488, "step": 101950 }, { "epoch": 12.774714948001504, "grad_norm": 0.605318009853363, "learning_rate": 3.4760195593838924e-06, "loss": 0.4902, "num_input_tokens_seen": 123993792, "step": 101955 }, { "epoch": 12.775341435910287, "grad_norm": 1.1160989999771118, "learning_rate": 3.475498868597827e-06, "loss": 0.465, "num_input_tokens_seen": 123999776, "step": 101960 }, { "epoch": 12.77596792381907, "grad_norm": 0.5960017442703247, "learning_rate": 3.474978196038456e-06, "loss": 0.4515, "num_input_tokens_seen": 124005536, "step": 101965 }, { "epoch": 12.776594411727853, "grad_norm": 0.7965646982192993, "learning_rate": 3.4744575417120006e-06, "loss": 0.4512, "num_input_tokens_seen": 124011360, "step": 101970 }, { "epoch": 12.777220899636637, "grad_norm": 0.6821297407150269, "learning_rate": 3.473936905624691e-06, "loss": 0.4584, "num_input_tokens_seen": 124017472, "step": 101975 }, { "epoch": 12.777847387545421, "grad_norm": 0.36391618847846985, "learning_rate": 3.473416287782746e-06, "loss": 0.4929, "num_input_tokens_seen": 124023744, "step": 101980 }, { "epoch": 12.778473875454203, "grad_norm": 1.2355612516403198, "learning_rate": 3.4728956881923924e-06, "loss": 0.447, "num_input_tokens_seen": 124029856, "step": 101985 }, { "epoch": 12.779100363362987, "grad_norm": 0.6377154588699341, "learning_rate": 3.4723751068598576e-06, "loss": 0.4759, "num_input_tokens_seen": 124036128, "step": 101990 }, { "epoch": 12.77972685127177, "grad_norm": 0.7085718512535095, "learning_rate": 3.47185454379136e-06, "loss": 0.455, "num_input_tokens_seen": 124042432, "step": 101995 }, { "epoch": 12.780353339180554, "grad_norm": 1.2005009651184082, "learning_rate": 3.471333998993128e-06, "loss": 0.4653, "num_input_tokens_seen": 124048576, "step": 102000 }, { "epoch": 12.780979827089338, "grad_norm": 0.7682890892028809, "learning_rate": 3.4708134724713828e-06, "loss": 0.4952, "num_input_tokens_seen": 124055040, "step": 102005 }, { "epoch": 12.78160631499812, "grad_norm": 1.95343816280365, "learning_rate": 3.470292964232348e-06, "loss": 0.4517, "num_input_tokens_seen": 124061088, "step": 102010 }, { "epoch": 12.782232802906904, "grad_norm": 0.7508478760719299, "learning_rate": 3.469772474282246e-06, "loss": 0.48, "num_input_tokens_seen": 124067392, "step": 102015 }, { "epoch": 12.782859290815686, "grad_norm": 10.293313980102539, "learning_rate": 3.4692520026273026e-06, "loss": 0.6272, "num_input_tokens_seen": 124072960, "step": 102020 }, { "epoch": 12.78348577872447, "grad_norm": 0.5914853811264038, "learning_rate": 3.468731549273737e-06, "loss": 0.4646, "num_input_tokens_seen": 124079040, "step": 102025 }, { "epoch": 12.784112266633255, "grad_norm": 0.9200565814971924, "learning_rate": 3.4682111142277748e-06, "loss": 0.4555, "num_input_tokens_seen": 124085184, "step": 102030 }, { "epoch": 12.784738754542037, "grad_norm": 0.5546284317970276, "learning_rate": 3.4676906974956344e-06, "loss": 0.4614, "num_input_tokens_seen": 124091424, "step": 102035 }, { "epoch": 12.78536524245082, "grad_norm": 0.7728657722473145, "learning_rate": 3.46717029908354e-06, "loss": 0.462, "num_input_tokens_seen": 124097440, "step": 102040 }, { "epoch": 12.785991730359605, "grad_norm": 3.2319114208221436, "learning_rate": 3.4666499189977155e-06, "loss": 0.5448, "num_input_tokens_seen": 124103456, "step": 102045 }, { "epoch": 12.786618218268387, "grad_norm": 0.7127497792243958, "learning_rate": 3.466129557244379e-06, "loss": 0.4793, "num_input_tokens_seen": 124109312, "step": 102050 }, { "epoch": 12.787244706177171, "grad_norm": 0.4784439206123352, "learning_rate": 3.4656092138297532e-06, "loss": 0.4685, "num_input_tokens_seen": 124115008, "step": 102055 }, { "epoch": 12.787871194085954, "grad_norm": 1.1432969570159912, "learning_rate": 3.4650888887600588e-06, "loss": 0.4738, "num_input_tokens_seen": 124121120, "step": 102060 }, { "epoch": 12.788497681994738, "grad_norm": 0.7253658175468445, "learning_rate": 3.4645685820415194e-06, "loss": 0.4767, "num_input_tokens_seen": 124127232, "step": 102065 }, { "epoch": 12.789124169903522, "grad_norm": 0.33051154017448425, "learning_rate": 3.464048293680351e-06, "loss": 0.4611, "num_input_tokens_seen": 124132864, "step": 102070 }, { "epoch": 12.789750657812304, "grad_norm": 0.5190842151641846, "learning_rate": 3.463528023682779e-06, "loss": 0.4585, "num_input_tokens_seen": 124138880, "step": 102075 }, { "epoch": 12.790377145721088, "grad_norm": 0.8094553351402283, "learning_rate": 3.4630077720550194e-06, "loss": 0.461, "num_input_tokens_seen": 124144384, "step": 102080 }, { "epoch": 12.79100363362987, "grad_norm": 0.6088561415672302, "learning_rate": 3.4624875388032958e-06, "loss": 0.4597, "num_input_tokens_seen": 124149600, "step": 102085 }, { "epoch": 12.791630121538654, "grad_norm": 0.3882814049720764, "learning_rate": 3.461967323933824e-06, "loss": 0.463, "num_input_tokens_seen": 124155872, "step": 102090 }, { "epoch": 12.792256609447438, "grad_norm": 0.5201578736305237, "learning_rate": 3.4614471274528276e-06, "loss": 0.4633, "num_input_tokens_seen": 124161856, "step": 102095 }, { "epoch": 12.79288309735622, "grad_norm": 0.5274671912193298, "learning_rate": 3.4609269493665243e-06, "loss": 0.4745, "num_input_tokens_seen": 124168032, "step": 102100 }, { "epoch": 12.793509585265005, "grad_norm": 0.9442441463470459, "learning_rate": 3.460406789681132e-06, "loss": 0.4563, "num_input_tokens_seen": 124174176, "step": 102105 }, { "epoch": 12.794136073173787, "grad_norm": 0.5172989368438721, "learning_rate": 3.459886648402872e-06, "loss": 0.4746, "num_input_tokens_seen": 124180096, "step": 102110 }, { "epoch": 12.794762561082571, "grad_norm": 0.7929336428642273, "learning_rate": 3.459366525537961e-06, "loss": 0.4573, "num_input_tokens_seen": 124185696, "step": 102115 }, { "epoch": 12.795389048991355, "grad_norm": 0.4037732779979706, "learning_rate": 3.458846421092621e-06, "loss": 0.4559, "num_input_tokens_seen": 124191904, "step": 102120 }, { "epoch": 12.796015536900137, "grad_norm": 0.3442389667034149, "learning_rate": 3.458326335073065e-06, "loss": 0.4653, "num_input_tokens_seen": 124197824, "step": 102125 }, { "epoch": 12.796642024808921, "grad_norm": 0.8033331632614136, "learning_rate": 3.457806267485516e-06, "loss": 0.4495, "num_input_tokens_seen": 124203968, "step": 102130 }, { "epoch": 12.797268512717704, "grad_norm": 0.8428611755371094, "learning_rate": 3.457286218336188e-06, "loss": 0.4616, "num_input_tokens_seen": 124209824, "step": 102135 }, { "epoch": 12.797895000626488, "grad_norm": 0.47357794642448425, "learning_rate": 3.456766187631302e-06, "loss": 0.454, "num_input_tokens_seen": 124215968, "step": 102140 }, { "epoch": 12.798521488535272, "grad_norm": 1.4013352394104004, "learning_rate": 3.456246175377072e-06, "loss": 0.4587, "num_input_tokens_seen": 124222016, "step": 102145 }, { "epoch": 12.799147976444054, "grad_norm": 0.4509248733520508, "learning_rate": 3.4557261815797176e-06, "loss": 0.4509, "num_input_tokens_seen": 124227936, "step": 102150 }, { "epoch": 12.799774464352838, "grad_norm": 0.2637985944747925, "learning_rate": 3.455206206245455e-06, "loss": 0.4705, "num_input_tokens_seen": 124234016, "step": 102155 }, { "epoch": 12.800400952261622, "grad_norm": 3.32391357421875, "learning_rate": 3.4546862493805013e-06, "loss": 0.4548, "num_input_tokens_seen": 124239968, "step": 102160 }, { "epoch": 12.801027440170405, "grad_norm": 0.6435948610305786, "learning_rate": 3.4541663109910716e-06, "loss": 0.4687, "num_input_tokens_seen": 124245792, "step": 102165 }, { "epoch": 12.801653928079189, "grad_norm": 0.5524287819862366, "learning_rate": 3.4536463910833833e-06, "loss": 0.4774, "num_input_tokens_seen": 124251840, "step": 102170 }, { "epoch": 12.80228041598797, "grad_norm": 1.0227528810501099, "learning_rate": 3.4531264896636544e-06, "loss": 0.4489, "num_input_tokens_seen": 124258016, "step": 102175 }, { "epoch": 12.802906903896755, "grad_norm": 0.6620119214057922, "learning_rate": 3.452606606738097e-06, "loss": 0.4338, "num_input_tokens_seen": 124264096, "step": 102180 }, { "epoch": 12.803533391805539, "grad_norm": 0.6971582770347595, "learning_rate": 3.4520867423129307e-06, "loss": 0.528, "num_input_tokens_seen": 124269984, "step": 102185 }, { "epoch": 12.804159879714321, "grad_norm": 5.064218044281006, "learning_rate": 3.4515668963943666e-06, "loss": 0.4532, "num_input_tokens_seen": 124276000, "step": 102190 }, { "epoch": 12.804786367623105, "grad_norm": 0.3704310357570648, "learning_rate": 3.4510470689886244e-06, "loss": 0.4525, "num_input_tokens_seen": 124281728, "step": 102195 }, { "epoch": 12.805412855531888, "grad_norm": 13.156900405883789, "learning_rate": 3.4505272601019146e-06, "loss": 0.5693, "num_input_tokens_seen": 124287872, "step": 102200 }, { "epoch": 12.806039343440672, "grad_norm": 2.0823075771331787, "learning_rate": 3.450007469740455e-06, "loss": 0.5001, "num_input_tokens_seen": 124293920, "step": 102205 }, { "epoch": 12.806665831349456, "grad_norm": 0.651206374168396, "learning_rate": 3.449487697910458e-06, "loss": 0.484, "num_input_tokens_seen": 124300064, "step": 102210 }, { "epoch": 12.807292319258238, "grad_norm": 0.5012741684913635, "learning_rate": 3.448967944618141e-06, "loss": 0.4589, "num_input_tokens_seen": 124306368, "step": 102215 }, { "epoch": 12.807918807167022, "grad_norm": 17.465307235717773, "learning_rate": 3.4484482098697154e-06, "loss": 0.4954, "num_input_tokens_seen": 124312512, "step": 102220 }, { "epoch": 12.808545295075804, "grad_norm": 0.7144815921783447, "learning_rate": 3.447928493671395e-06, "loss": 0.467, "num_input_tokens_seen": 124318720, "step": 102225 }, { "epoch": 12.809171782984588, "grad_norm": 0.525818943977356, "learning_rate": 3.4474087960293967e-06, "loss": 0.4701, "num_input_tokens_seen": 124324768, "step": 102230 }, { "epoch": 12.809798270893372, "grad_norm": 0.635852038860321, "learning_rate": 3.4468891169499295e-06, "loss": 0.4584, "num_input_tokens_seen": 124330880, "step": 102235 }, { "epoch": 12.810424758802155, "grad_norm": 0.4946320354938507, "learning_rate": 3.446369456439211e-06, "loss": 0.4423, "num_input_tokens_seen": 124337376, "step": 102240 }, { "epoch": 12.811051246710939, "grad_norm": 0.9590745568275452, "learning_rate": 3.445849814503449e-06, "loss": 0.4497, "num_input_tokens_seen": 124343168, "step": 102245 }, { "epoch": 12.811677734619721, "grad_norm": 0.4096568524837494, "learning_rate": 3.4453301911488624e-06, "loss": 0.4639, "num_input_tokens_seen": 124349024, "step": 102250 }, { "epoch": 12.812304222528505, "grad_norm": 1.1378471851348877, "learning_rate": 3.444810586381657e-06, "loss": 0.4411, "num_input_tokens_seen": 124354656, "step": 102255 }, { "epoch": 12.81293071043729, "grad_norm": 1.139736533164978, "learning_rate": 3.444291000208051e-06, "loss": 0.4547, "num_input_tokens_seen": 124360768, "step": 102260 }, { "epoch": 12.813557198346071, "grad_norm": 0.5884052515029907, "learning_rate": 3.443771432634252e-06, "loss": 0.501, "num_input_tokens_seen": 124366688, "step": 102265 }, { "epoch": 12.814183686254855, "grad_norm": 0.6245504021644592, "learning_rate": 3.443251883666477e-06, "loss": 0.4665, "num_input_tokens_seen": 124372288, "step": 102270 }, { "epoch": 12.81481017416364, "grad_norm": 1.6379064321517944, "learning_rate": 3.4427323533109314e-06, "loss": 0.4679, "num_input_tokens_seen": 124378208, "step": 102275 }, { "epoch": 12.815436662072422, "grad_norm": 0.5038604140281677, "learning_rate": 3.4422128415738328e-06, "loss": 0.4588, "num_input_tokens_seen": 124384480, "step": 102280 }, { "epoch": 12.816063149981206, "grad_norm": 0.6435167193412781, "learning_rate": 3.441693348461387e-06, "loss": 0.4566, "num_input_tokens_seen": 124390464, "step": 102285 }, { "epoch": 12.816689637889988, "grad_norm": 0.617487370967865, "learning_rate": 3.4411738739798072e-06, "loss": 0.4633, "num_input_tokens_seen": 124396480, "step": 102290 }, { "epoch": 12.817316125798772, "grad_norm": 0.7455511689186096, "learning_rate": 3.4406544181353063e-06, "loss": 0.4563, "num_input_tokens_seen": 124402656, "step": 102295 }, { "epoch": 12.817942613707556, "grad_norm": 0.7630762457847595, "learning_rate": 3.440134980934091e-06, "loss": 0.4605, "num_input_tokens_seen": 124409024, "step": 102300 }, { "epoch": 12.818569101616339, "grad_norm": 0.4050174355506897, "learning_rate": 3.4396155623823752e-06, "loss": 0.4654, "num_input_tokens_seen": 124415520, "step": 102305 }, { "epoch": 12.819195589525123, "grad_norm": 0.5267147421836853, "learning_rate": 3.4390961624863656e-06, "loss": 0.4607, "num_input_tokens_seen": 124421344, "step": 102310 }, { "epoch": 12.819822077433905, "grad_norm": 0.5384925603866577, "learning_rate": 3.4385767812522743e-06, "loss": 0.4588, "num_input_tokens_seen": 124427488, "step": 102315 }, { "epoch": 12.820448565342689, "grad_norm": 0.6317761540412903, "learning_rate": 3.43805741868631e-06, "loss": 0.4444, "num_input_tokens_seen": 124433824, "step": 102320 }, { "epoch": 12.821075053251473, "grad_norm": 0.43492358922958374, "learning_rate": 3.437538074794684e-06, "loss": 0.4851, "num_input_tokens_seen": 124439936, "step": 102325 }, { "epoch": 12.821701541160255, "grad_norm": 0.3826947808265686, "learning_rate": 3.4370187495836015e-06, "loss": 0.46, "num_input_tokens_seen": 124445664, "step": 102330 }, { "epoch": 12.82232802906904, "grad_norm": 1.512784481048584, "learning_rate": 3.436499443059277e-06, "loss": 0.457, "num_input_tokens_seen": 124451296, "step": 102335 }, { "epoch": 12.822954516977822, "grad_norm": 0.5553330183029175, "learning_rate": 3.4359801552279136e-06, "loss": 0.4533, "num_input_tokens_seen": 124457120, "step": 102340 }, { "epoch": 12.823581004886606, "grad_norm": 0.4447641372680664, "learning_rate": 3.4354608860957246e-06, "loss": 0.454, "num_input_tokens_seen": 124462912, "step": 102345 }, { "epoch": 12.82420749279539, "grad_norm": 14.244848251342773, "learning_rate": 3.4349416356689136e-06, "loss": 0.5734, "num_input_tokens_seen": 124469088, "step": 102350 }, { "epoch": 12.824833980704172, "grad_norm": 0.5315660834312439, "learning_rate": 3.4344224039536923e-06, "loss": 0.4741, "num_input_tokens_seen": 124475264, "step": 102355 }, { "epoch": 12.825460468612956, "grad_norm": 0.6545169949531555, "learning_rate": 3.433903190956267e-06, "loss": 0.4631, "num_input_tokens_seen": 124481472, "step": 102360 }, { "epoch": 12.826086956521738, "grad_norm": 0.4877849519252777, "learning_rate": 3.433383996682846e-06, "loss": 0.4558, "num_input_tokens_seen": 124487648, "step": 102365 }, { "epoch": 12.826713444430522, "grad_norm": 0.9055448174476624, "learning_rate": 3.4328648211396376e-06, "loss": 0.4681, "num_input_tokens_seen": 124493856, "step": 102370 }, { "epoch": 12.827339932339306, "grad_norm": 0.7177290320396423, "learning_rate": 3.4323456643328456e-06, "loss": 0.4424, "num_input_tokens_seen": 124499808, "step": 102375 }, { "epoch": 12.827966420248089, "grad_norm": 0.40296509861946106, "learning_rate": 3.4318265262686824e-06, "loss": 0.4536, "num_input_tokens_seen": 124506272, "step": 102380 }, { "epoch": 12.828592908156873, "grad_norm": 0.5157871246337891, "learning_rate": 3.4313074069533488e-06, "loss": 0.4525, "num_input_tokens_seen": 124512192, "step": 102385 }, { "epoch": 12.829219396065657, "grad_norm": 0.6398285627365112, "learning_rate": 3.430788306393056e-06, "loss": 0.4668, "num_input_tokens_seen": 124518400, "step": 102390 }, { "epoch": 12.82984588397444, "grad_norm": 0.5437848567962646, "learning_rate": 3.430269224594006e-06, "loss": 0.4721, "num_input_tokens_seen": 124524672, "step": 102395 }, { "epoch": 12.830472371883223, "grad_norm": 0.8202944397926331, "learning_rate": 3.4297501615624097e-06, "loss": 0.4364, "num_input_tokens_seen": 124530880, "step": 102400 }, { "epoch": 12.831098859792005, "grad_norm": 0.5431341528892517, "learning_rate": 3.4292311173044684e-06, "loss": 0.4602, "num_input_tokens_seen": 124537120, "step": 102405 }, { "epoch": 12.83172534770079, "grad_norm": 0.7718085646629333, "learning_rate": 3.4287120918263906e-06, "loss": 0.4669, "num_input_tokens_seen": 124543744, "step": 102410 }, { "epoch": 12.832351835609572, "grad_norm": 0.9373202919960022, "learning_rate": 3.42819308513438e-06, "loss": 0.4691, "num_input_tokens_seen": 124549984, "step": 102415 }, { "epoch": 12.832978323518356, "grad_norm": 0.6348602175712585, "learning_rate": 3.4276740972346436e-06, "loss": 0.4506, "num_input_tokens_seen": 124556192, "step": 102420 }, { "epoch": 12.83360481142714, "grad_norm": 1.0056313276290894, "learning_rate": 3.4271551281333847e-06, "loss": 0.4382, "num_input_tokens_seen": 124561824, "step": 102425 }, { "epoch": 12.834231299335922, "grad_norm": 0.7443456649780273, "learning_rate": 3.4266361778368086e-06, "loss": 0.485, "num_input_tokens_seen": 124567808, "step": 102430 }, { "epoch": 12.834857787244706, "grad_norm": 0.4932417571544647, "learning_rate": 3.4261172463511203e-06, "loss": 0.4527, "num_input_tokens_seen": 124573504, "step": 102435 }, { "epoch": 12.83548427515349, "grad_norm": 0.7108883857727051, "learning_rate": 3.4255983336825227e-06, "loss": 0.4446, "num_input_tokens_seen": 124579040, "step": 102440 }, { "epoch": 12.836110763062273, "grad_norm": 1.2383277416229248, "learning_rate": 3.4250794398372227e-06, "loss": 0.4505, "num_input_tokens_seen": 124585248, "step": 102445 }, { "epoch": 12.836737250971057, "grad_norm": 0.706153929233551, "learning_rate": 3.42456056482142e-06, "loss": 0.4555, "num_input_tokens_seen": 124591072, "step": 102450 }, { "epoch": 12.837363738879839, "grad_norm": 4.414951324462891, "learning_rate": 3.4240417086413224e-06, "loss": 0.4432, "num_input_tokens_seen": 124597440, "step": 102455 }, { "epoch": 12.837990226788623, "grad_norm": 1.5223784446716309, "learning_rate": 3.423522871303129e-06, "loss": 0.4602, "num_input_tokens_seen": 124603232, "step": 102460 }, { "epoch": 12.838616714697407, "grad_norm": 1.6521841287612915, "learning_rate": 3.423004052813046e-06, "loss": 0.433, "num_input_tokens_seen": 124609376, "step": 102465 }, { "epoch": 12.83924320260619, "grad_norm": 10.43387222290039, "learning_rate": 3.4224852531772747e-06, "loss": 0.6127, "num_input_tokens_seen": 124615328, "step": 102470 }, { "epoch": 12.839869690514973, "grad_norm": 1.200722575187683, "learning_rate": 3.4219664724020187e-06, "loss": 0.4648, "num_input_tokens_seen": 124621440, "step": 102475 }, { "epoch": 12.840496178423756, "grad_norm": 1.4359028339385986, "learning_rate": 3.4214477104934806e-06, "loss": 0.4379, "num_input_tokens_seen": 124627040, "step": 102480 }, { "epoch": 12.84112266633254, "grad_norm": 3.3219993114471436, "learning_rate": 3.4209289674578614e-06, "loss": 0.5428, "num_input_tokens_seen": 124632992, "step": 102485 }, { "epoch": 12.841749154241324, "grad_norm": 4.071456432342529, "learning_rate": 3.4204102433013666e-06, "loss": 0.4522, "num_input_tokens_seen": 124639040, "step": 102490 }, { "epoch": 12.842375642150106, "grad_norm": 1.3956884145736694, "learning_rate": 3.4198915380301934e-06, "loss": 0.4624, "num_input_tokens_seen": 124645216, "step": 102495 }, { "epoch": 12.84300213005889, "grad_norm": 0.9771703481674194, "learning_rate": 3.4193728516505464e-06, "loss": 0.456, "num_input_tokens_seen": 124651008, "step": 102500 }, { "epoch": 12.843628617967672, "grad_norm": 14.655156135559082, "learning_rate": 3.4188541841686246e-06, "loss": 0.5455, "num_input_tokens_seen": 124656896, "step": 102505 }, { "epoch": 12.844255105876456, "grad_norm": 1.7499727010726929, "learning_rate": 3.418335535590631e-06, "loss": 0.4925, "num_input_tokens_seen": 124662976, "step": 102510 }, { "epoch": 12.84488159378524, "grad_norm": 0.9467104077339172, "learning_rate": 3.417816905922766e-06, "loss": 0.4368, "num_input_tokens_seen": 124669280, "step": 102515 }, { "epoch": 12.845508081694023, "grad_norm": 0.5706795454025269, "learning_rate": 3.4172982951712304e-06, "loss": 0.4869, "num_input_tokens_seen": 124675424, "step": 102520 }, { "epoch": 12.846134569602807, "grad_norm": 0.7077073454856873, "learning_rate": 3.4167797033422227e-06, "loss": 0.4814, "num_input_tokens_seen": 124681376, "step": 102525 }, { "epoch": 12.84676105751159, "grad_norm": 0.776419460773468, "learning_rate": 3.4162611304419475e-06, "loss": 0.428, "num_input_tokens_seen": 124687040, "step": 102530 }, { "epoch": 12.847387545420373, "grad_norm": 3.833587646484375, "learning_rate": 3.4157425764765996e-06, "loss": 0.4973, "num_input_tokens_seen": 124693344, "step": 102535 }, { "epoch": 12.848014033329157, "grad_norm": 0.8595812916755676, "learning_rate": 3.4152240414523806e-06, "loss": 0.4364, "num_input_tokens_seen": 124699520, "step": 102540 }, { "epoch": 12.84864052123794, "grad_norm": 0.6107895970344543, "learning_rate": 3.4147055253754937e-06, "loss": 0.4395, "num_input_tokens_seen": 124705632, "step": 102545 }, { "epoch": 12.849267009146724, "grad_norm": 9.939516067504883, "learning_rate": 3.4141870282521327e-06, "loss": 0.5319, "num_input_tokens_seen": 124712000, "step": 102550 }, { "epoch": 12.849893497055508, "grad_norm": 3.459611654281616, "learning_rate": 3.4136685500885e-06, "loss": 0.4547, "num_input_tokens_seen": 124718400, "step": 102555 }, { "epoch": 12.85051998496429, "grad_norm": 0.7713649868965149, "learning_rate": 3.4131500908907934e-06, "loss": 0.6768, "num_input_tokens_seen": 124724576, "step": 102560 }, { "epoch": 12.851146472873074, "grad_norm": 1.0499504804611206, "learning_rate": 3.4126316506652114e-06, "loss": 0.5178, "num_input_tokens_seen": 124731072, "step": 102565 }, { "epoch": 12.851772960781856, "grad_norm": 1.1668086051940918, "learning_rate": 3.4121132294179527e-06, "loss": 0.4433, "num_input_tokens_seen": 124737472, "step": 102570 }, { "epoch": 12.85239944869064, "grad_norm": 1.7019317150115967, "learning_rate": 3.411594827155216e-06, "loss": 0.4418, "num_input_tokens_seen": 124743648, "step": 102575 }, { "epoch": 12.853025936599424, "grad_norm": 0.9544985294342041, "learning_rate": 3.4110764438831966e-06, "loss": 0.4753, "num_input_tokens_seen": 124749440, "step": 102580 }, { "epoch": 12.853652424508207, "grad_norm": 9.708498001098633, "learning_rate": 3.410558079608097e-06, "loss": 0.5176, "num_input_tokens_seen": 124755904, "step": 102585 }, { "epoch": 12.85427891241699, "grad_norm": 1.1982663869857788, "learning_rate": 3.4100397343361093e-06, "loss": 0.4147, "num_input_tokens_seen": 124761984, "step": 102590 }, { "epoch": 12.854905400325773, "grad_norm": 5.14225959777832, "learning_rate": 3.4095214080734355e-06, "loss": 0.5075, "num_input_tokens_seen": 124767584, "step": 102595 }, { "epoch": 12.855531888234557, "grad_norm": 2.0085227489471436, "learning_rate": 3.4090031008262693e-06, "loss": 0.6533, "num_input_tokens_seen": 124773824, "step": 102600 }, { "epoch": 12.856158376143341, "grad_norm": 1.3532283306121826, "learning_rate": 3.4084848126008073e-06, "loss": 0.5031, "num_input_tokens_seen": 124779552, "step": 102605 }, { "epoch": 12.856784864052123, "grad_norm": 1.998572826385498, "learning_rate": 3.4079665434032505e-06, "loss": 0.5343, "num_input_tokens_seen": 124785632, "step": 102610 }, { "epoch": 12.857411351960907, "grad_norm": 8.380414009094238, "learning_rate": 3.4074482932397895e-06, "loss": 0.4986, "num_input_tokens_seen": 124791584, "step": 102615 }, { "epoch": 12.85803783986969, "grad_norm": 1.0068082809448242, "learning_rate": 3.4069300621166245e-06, "loss": 0.431, "num_input_tokens_seen": 124797728, "step": 102620 }, { "epoch": 12.858664327778474, "grad_norm": 12.889683723449707, "learning_rate": 3.40641185003995e-06, "loss": 0.5469, "num_input_tokens_seen": 124804128, "step": 102625 }, { "epoch": 12.859290815687258, "grad_norm": 3.6842246055603027, "learning_rate": 3.405893657015962e-06, "loss": 0.4885, "num_input_tokens_seen": 124810240, "step": 102630 }, { "epoch": 12.85991730359604, "grad_norm": 0.6989956498146057, "learning_rate": 3.4053754830508535e-06, "loss": 0.4604, "num_input_tokens_seen": 124816352, "step": 102635 }, { "epoch": 12.860543791504824, "grad_norm": 0.8610712289810181, "learning_rate": 3.4048573281508244e-06, "loss": 0.5853, "num_input_tokens_seen": 124822432, "step": 102640 }, { "epoch": 12.861170279413606, "grad_norm": 0.48147478699684143, "learning_rate": 3.404339192322065e-06, "loss": 0.4431, "num_input_tokens_seen": 124828448, "step": 102645 }, { "epoch": 12.86179676732239, "grad_norm": 1.48738431930542, "learning_rate": 3.4038210755707747e-06, "loss": 0.4301, "num_input_tokens_seen": 124834560, "step": 102650 }, { "epoch": 12.862423255231175, "grad_norm": 2.9977164268493652, "learning_rate": 3.4033029779031423e-06, "loss": 0.5621, "num_input_tokens_seen": 124840928, "step": 102655 }, { "epoch": 12.863049743139957, "grad_norm": 0.5765055418014526, "learning_rate": 3.402784899325366e-06, "loss": 0.5335, "num_input_tokens_seen": 124846144, "step": 102660 }, { "epoch": 12.863676231048741, "grad_norm": 2.4869346618652344, "learning_rate": 3.402266839843641e-06, "loss": 0.497, "num_input_tokens_seen": 124852320, "step": 102665 }, { "epoch": 12.864302718957525, "grad_norm": 4.60497522354126, "learning_rate": 3.401748799464157e-06, "loss": 0.6069, "num_input_tokens_seen": 124858208, "step": 102670 }, { "epoch": 12.864929206866307, "grad_norm": 1.2697421312332153, "learning_rate": 3.401230778193111e-06, "loss": 0.4552, "num_input_tokens_seen": 124864384, "step": 102675 }, { "epoch": 12.865555694775091, "grad_norm": 0.6316392421722412, "learning_rate": 3.4007127760366944e-06, "loss": 0.533, "num_input_tokens_seen": 124870560, "step": 102680 }, { "epoch": 12.866182182683874, "grad_norm": 1.5532138347625732, "learning_rate": 3.4001947930011026e-06, "loss": 0.4568, "num_input_tokens_seen": 124876416, "step": 102685 }, { "epoch": 12.866808670592658, "grad_norm": 0.5178511142730713, "learning_rate": 3.399676829092526e-06, "loss": 0.4774, "num_input_tokens_seen": 124882848, "step": 102690 }, { "epoch": 12.867435158501442, "grad_norm": 1.8195726871490479, "learning_rate": 3.3991588843171596e-06, "loss": 0.4788, "num_input_tokens_seen": 124888672, "step": 102695 }, { "epoch": 12.868061646410224, "grad_norm": 0.605211079120636, "learning_rate": 3.398640958681193e-06, "loss": 0.4577, "num_input_tokens_seen": 124894432, "step": 102700 }, { "epoch": 12.868688134319008, "grad_norm": 0.6086347699165344, "learning_rate": 3.398123052190822e-06, "loss": 0.4853, "num_input_tokens_seen": 124900512, "step": 102705 }, { "epoch": 12.86931462222779, "grad_norm": 2.17592716217041, "learning_rate": 3.3976051648522346e-06, "loss": 0.4831, "num_input_tokens_seen": 124906464, "step": 102710 }, { "epoch": 12.869941110136574, "grad_norm": 0.8257770538330078, "learning_rate": 3.397087296671626e-06, "loss": 0.4598, "num_input_tokens_seen": 124912608, "step": 102715 }, { "epoch": 12.870567598045358, "grad_norm": 0.7522441148757935, "learning_rate": 3.396569447655186e-06, "loss": 0.4375, "num_input_tokens_seen": 124918080, "step": 102720 }, { "epoch": 12.87119408595414, "grad_norm": 1.4131916761398315, "learning_rate": 3.396051617809105e-06, "loss": 0.4671, "num_input_tokens_seen": 124923776, "step": 102725 }, { "epoch": 12.871820573862925, "grad_norm": 1.4562686681747437, "learning_rate": 3.3955338071395773e-06, "loss": 0.4557, "num_input_tokens_seen": 124929920, "step": 102730 }, { "epoch": 12.872447061771707, "grad_norm": 4.509438991546631, "learning_rate": 3.3950160156527912e-06, "loss": 0.5435, "num_input_tokens_seen": 124936320, "step": 102735 }, { "epoch": 12.873073549680491, "grad_norm": 2.971003532409668, "learning_rate": 3.394498243354939e-06, "loss": 0.4329, "num_input_tokens_seen": 124942432, "step": 102740 }, { "epoch": 12.873700037589275, "grad_norm": 2.418473482131958, "learning_rate": 3.393980490252209e-06, "loss": 0.4642, "num_input_tokens_seen": 124948480, "step": 102745 }, { "epoch": 12.874326525498057, "grad_norm": 1.3824098110198975, "learning_rate": 3.393462756350794e-06, "loss": 0.4468, "num_input_tokens_seen": 124954304, "step": 102750 }, { "epoch": 12.874953013406842, "grad_norm": 0.6838006973266602, "learning_rate": 3.39294504165688e-06, "loss": 0.4603, "num_input_tokens_seen": 124960384, "step": 102755 }, { "epoch": 12.875579501315624, "grad_norm": 0.905217170715332, "learning_rate": 3.392427346176662e-06, "loss": 0.4651, "num_input_tokens_seen": 124966496, "step": 102760 }, { "epoch": 12.876205989224408, "grad_norm": 1.1677772998809814, "learning_rate": 3.391909669916324e-06, "loss": 0.474, "num_input_tokens_seen": 124972864, "step": 102765 }, { "epoch": 12.876832477133192, "grad_norm": 5.9816131591796875, "learning_rate": 3.3913920128820595e-06, "loss": 0.5614, "num_input_tokens_seen": 124979104, "step": 102770 }, { "epoch": 12.877458965041974, "grad_norm": 1.882158637046814, "learning_rate": 3.3908743750800555e-06, "loss": 0.4645, "num_input_tokens_seen": 124985728, "step": 102775 }, { "epoch": 12.878085452950758, "grad_norm": 0.7591733336448669, "learning_rate": 3.3903567565165012e-06, "loss": 0.4393, "num_input_tokens_seen": 124992256, "step": 102780 }, { "epoch": 12.878711940859542, "grad_norm": 2.805715799331665, "learning_rate": 3.389839157197584e-06, "loss": 0.4512, "num_input_tokens_seen": 124998336, "step": 102785 }, { "epoch": 12.879338428768325, "grad_norm": 0.7620090842247009, "learning_rate": 3.389321577129494e-06, "loss": 0.4577, "num_input_tokens_seen": 125004320, "step": 102790 }, { "epoch": 12.879964916677109, "grad_norm": 1.294355869293213, "learning_rate": 3.3888040163184207e-06, "loss": 0.4625, "num_input_tokens_seen": 125009152, "step": 102795 }, { "epoch": 12.880591404585891, "grad_norm": 0.7875469326972961, "learning_rate": 3.388286474770548e-06, "loss": 0.5317, "num_input_tokens_seen": 125015360, "step": 102800 }, { "epoch": 12.881217892494675, "grad_norm": 5.757319450378418, "learning_rate": 3.387768952492067e-06, "loss": 0.5556, "num_input_tokens_seen": 125021536, "step": 102805 }, { "epoch": 12.881844380403459, "grad_norm": 3.2025279998779297, "learning_rate": 3.3872514494891616e-06, "loss": 0.445, "num_input_tokens_seen": 125027584, "step": 102810 }, { "epoch": 12.882470868312241, "grad_norm": 1.169057011604309, "learning_rate": 3.3867339657680242e-06, "loss": 0.457, "num_input_tokens_seen": 125033760, "step": 102815 }, { "epoch": 12.883097356221025, "grad_norm": 1.3838529586791992, "learning_rate": 3.386216501334836e-06, "loss": 0.4928, "num_input_tokens_seen": 125040032, "step": 102820 }, { "epoch": 12.883723844129808, "grad_norm": 2.2704849243164062, "learning_rate": 3.385699056195787e-06, "loss": 0.443, "num_input_tokens_seen": 125046208, "step": 102825 }, { "epoch": 12.884350332038592, "grad_norm": 7.932778358459473, "learning_rate": 3.3851816303570615e-06, "loss": 0.4605, "num_input_tokens_seen": 125052352, "step": 102830 }, { "epoch": 12.884976819947376, "grad_norm": 0.7759791612625122, "learning_rate": 3.3846642238248507e-06, "loss": 0.504, "num_input_tokens_seen": 125058560, "step": 102835 }, { "epoch": 12.885603307856158, "grad_norm": 5.921678066253662, "learning_rate": 3.384146836605334e-06, "loss": 0.4708, "num_input_tokens_seen": 125064448, "step": 102840 }, { "epoch": 12.886229795764942, "grad_norm": 0.9141153693199158, "learning_rate": 3.383629468704701e-06, "loss": 0.4366, "num_input_tokens_seen": 125070656, "step": 102845 }, { "epoch": 12.886856283673724, "grad_norm": 1.462704062461853, "learning_rate": 3.3831121201291384e-06, "loss": 0.4735, "num_input_tokens_seen": 125076640, "step": 102850 }, { "epoch": 12.887482771582508, "grad_norm": 0.9289476275444031, "learning_rate": 3.382594790884828e-06, "loss": 0.4692, "num_input_tokens_seen": 125082080, "step": 102855 }, { "epoch": 12.888109259491292, "grad_norm": 12.391935348510742, "learning_rate": 3.3820774809779586e-06, "loss": 0.5118, "num_input_tokens_seen": 125088128, "step": 102860 }, { "epoch": 12.888735747400075, "grad_norm": 3.460620164871216, "learning_rate": 3.3815601904147117e-06, "loss": 0.5283, "num_input_tokens_seen": 125094112, "step": 102865 }, { "epoch": 12.889362235308859, "grad_norm": 1.6188393831253052, "learning_rate": 3.381042919201275e-06, "loss": 0.4568, "num_input_tokens_seen": 125100160, "step": 102870 }, { "epoch": 12.889988723217641, "grad_norm": 0.9488401412963867, "learning_rate": 3.380525667343829e-06, "loss": 0.4657, "num_input_tokens_seen": 125105728, "step": 102875 }, { "epoch": 12.890615211126425, "grad_norm": 1.2873358726501465, "learning_rate": 3.380008434848562e-06, "loss": 0.4423, "num_input_tokens_seen": 125112064, "step": 102880 }, { "epoch": 12.89124169903521, "grad_norm": 5.152677536010742, "learning_rate": 3.3794912217216545e-06, "loss": 0.5195, "num_input_tokens_seen": 125117760, "step": 102885 }, { "epoch": 12.891868186943992, "grad_norm": 5.845114707946777, "learning_rate": 3.3789740279692938e-06, "loss": 0.5114, "num_input_tokens_seen": 125123712, "step": 102890 }, { "epoch": 12.892494674852776, "grad_norm": 13.708100318908691, "learning_rate": 3.37845685359766e-06, "loss": 0.5355, "num_input_tokens_seen": 125129792, "step": 102895 }, { "epoch": 12.89312116276156, "grad_norm": 1.5633748769760132, "learning_rate": 3.37793969861294e-06, "loss": 0.4752, "num_input_tokens_seen": 125135584, "step": 102900 }, { "epoch": 12.893747650670342, "grad_norm": 14.625133514404297, "learning_rate": 3.377422563021312e-06, "loss": 0.5227, "num_input_tokens_seen": 125141472, "step": 102905 }, { "epoch": 12.894374138579126, "grad_norm": 0.5911950469017029, "learning_rate": 3.376905446828962e-06, "loss": 0.4555, "num_input_tokens_seen": 125147520, "step": 102910 }, { "epoch": 12.895000626487908, "grad_norm": 1.8413184881210327, "learning_rate": 3.3763883500420737e-06, "loss": 0.4683, "num_input_tokens_seen": 125153600, "step": 102915 }, { "epoch": 12.895627114396692, "grad_norm": 1.390583872795105, "learning_rate": 3.375871272666825e-06, "loss": 0.456, "num_input_tokens_seen": 125159264, "step": 102920 }, { "epoch": 12.896253602305475, "grad_norm": 1.6925727128982544, "learning_rate": 3.375354214709404e-06, "loss": 0.4889, "num_input_tokens_seen": 125164992, "step": 102925 }, { "epoch": 12.896880090214259, "grad_norm": 0.5662847757339478, "learning_rate": 3.374837176175987e-06, "loss": 0.4662, "num_input_tokens_seen": 125169984, "step": 102930 }, { "epoch": 12.897506578123043, "grad_norm": 0.9110219478607178, "learning_rate": 3.3743201570727586e-06, "loss": 0.4518, "num_input_tokens_seen": 125175840, "step": 102935 }, { "epoch": 12.898133066031825, "grad_norm": 8.34466552734375, "learning_rate": 3.3738031574058983e-06, "loss": 0.5208, "num_input_tokens_seen": 125181600, "step": 102940 }, { "epoch": 12.898759553940609, "grad_norm": 1.2001547813415527, "learning_rate": 3.3732861771815913e-06, "loss": 0.4678, "num_input_tokens_seen": 125187648, "step": 102945 }, { "epoch": 12.899386041849393, "grad_norm": 10.755026817321777, "learning_rate": 3.3727692164060132e-06, "loss": 0.4563, "num_input_tokens_seen": 125193792, "step": 102950 }, { "epoch": 12.900012529758175, "grad_norm": 1.9747776985168457, "learning_rate": 3.3722522750853492e-06, "loss": 0.4723, "num_input_tokens_seen": 125199456, "step": 102955 }, { "epoch": 12.90063901766696, "grad_norm": 11.535269737243652, "learning_rate": 3.371735353225776e-06, "loss": 0.4807, "num_input_tokens_seen": 125205600, "step": 102960 }, { "epoch": 12.901265505575742, "grad_norm": 8.169048309326172, "learning_rate": 3.3712184508334773e-06, "loss": 0.4902, "num_input_tokens_seen": 125211136, "step": 102965 }, { "epoch": 12.901891993484526, "grad_norm": 12.461352348327637, "learning_rate": 3.370701567914629e-06, "loss": 0.4504, "num_input_tokens_seen": 125217376, "step": 102970 }, { "epoch": 12.90251848139331, "grad_norm": 13.422571182250977, "learning_rate": 3.3701847044754145e-06, "loss": 0.4839, "num_input_tokens_seen": 125223392, "step": 102975 }, { "epoch": 12.903144969302092, "grad_norm": 13.780649185180664, "learning_rate": 3.3696678605220124e-06, "loss": 0.688, "num_input_tokens_seen": 125229504, "step": 102980 }, { "epoch": 12.903771457210876, "grad_norm": 1.503043532371521, "learning_rate": 3.3691510360606015e-06, "loss": 0.5687, "num_input_tokens_seen": 125235136, "step": 102985 }, { "epoch": 12.904397945119658, "grad_norm": 0.9256539344787598, "learning_rate": 3.3686342310973618e-06, "loss": 0.5209, "num_input_tokens_seen": 125241216, "step": 102990 }, { "epoch": 12.905024433028442, "grad_norm": 8.495115280151367, "learning_rate": 3.36811744563847e-06, "loss": 0.5288, "num_input_tokens_seen": 125247680, "step": 102995 }, { "epoch": 12.905650920937227, "grad_norm": 8.229248046875, "learning_rate": 3.3676006796901084e-06, "loss": 0.5579, "num_input_tokens_seen": 125253856, "step": 103000 }, { "epoch": 12.906277408846009, "grad_norm": 1.8218810558319092, "learning_rate": 3.367083933258451e-06, "loss": 0.4458, "num_input_tokens_seen": 125259808, "step": 103005 }, { "epoch": 12.906903896754793, "grad_norm": 1.1090010404586792, "learning_rate": 3.36656720634968e-06, "loss": 0.4241, "num_input_tokens_seen": 125266336, "step": 103010 }, { "epoch": 12.907530384663575, "grad_norm": 14.588688850402832, "learning_rate": 3.3660504989699692e-06, "loss": 0.5073, "num_input_tokens_seen": 125272672, "step": 103015 }, { "epoch": 12.90815687257236, "grad_norm": 1.0109376907348633, "learning_rate": 3.365533811125501e-06, "loss": 0.5654, "num_input_tokens_seen": 125279168, "step": 103020 }, { "epoch": 12.908783360481143, "grad_norm": 1.394798755645752, "learning_rate": 3.3650171428224475e-06, "loss": 0.5652, "num_input_tokens_seen": 125285536, "step": 103025 }, { "epoch": 12.909409848389926, "grad_norm": 10.260095596313477, "learning_rate": 3.364500494066989e-06, "loss": 0.7166, "num_input_tokens_seen": 125291456, "step": 103030 }, { "epoch": 12.91003633629871, "grad_norm": 10.299373626708984, "learning_rate": 3.3639838648653033e-06, "loss": 0.5245, "num_input_tokens_seen": 125297696, "step": 103035 }, { "epoch": 12.910662824207492, "grad_norm": 11.283297538757324, "learning_rate": 3.363467255223566e-06, "loss": 0.5078, "num_input_tokens_seen": 125304352, "step": 103040 }, { "epoch": 12.911289312116276, "grad_norm": 3.261101722717285, "learning_rate": 3.3629506651479537e-06, "loss": 0.4578, "num_input_tokens_seen": 125310304, "step": 103045 }, { "epoch": 12.91191580002506, "grad_norm": 2.128364324569702, "learning_rate": 3.362434094644641e-06, "loss": 0.4687, "num_input_tokens_seen": 125315680, "step": 103050 }, { "epoch": 12.912542287933842, "grad_norm": 1.5642893314361572, "learning_rate": 3.3619175437198083e-06, "loss": 0.4475, "num_input_tokens_seen": 125321984, "step": 103055 }, { "epoch": 12.913168775842626, "grad_norm": 1.084848403930664, "learning_rate": 3.3614010123796257e-06, "loss": 0.4383, "num_input_tokens_seen": 125328288, "step": 103060 }, { "epoch": 12.91379526375141, "grad_norm": 0.7721410989761353, "learning_rate": 3.3608845006302742e-06, "loss": 0.4742, "num_input_tokens_seen": 125334272, "step": 103065 }, { "epoch": 12.914421751660193, "grad_norm": 2.2006773948669434, "learning_rate": 3.3603680084779244e-06, "loss": 0.4858, "num_input_tokens_seen": 125340480, "step": 103070 }, { "epoch": 12.915048239568977, "grad_norm": 4.345160484313965, "learning_rate": 3.359851535928756e-06, "loss": 0.4561, "num_input_tokens_seen": 125346688, "step": 103075 }, { "epoch": 12.915674727477759, "grad_norm": 3.9620361328125, "learning_rate": 3.359335082988939e-06, "loss": 0.4715, "num_input_tokens_seen": 125352672, "step": 103080 }, { "epoch": 12.916301215386543, "grad_norm": 5.226048469543457, "learning_rate": 3.3588186496646512e-06, "loss": 0.4883, "num_input_tokens_seen": 125358848, "step": 103085 }, { "epoch": 12.916927703295327, "grad_norm": 0.9846441745758057, "learning_rate": 3.358302235962066e-06, "loss": 0.5002, "num_input_tokens_seen": 125365120, "step": 103090 }, { "epoch": 12.91755419120411, "grad_norm": 0.8325673341751099, "learning_rate": 3.3577858418873577e-06, "loss": 0.4352, "num_input_tokens_seen": 125371104, "step": 103095 }, { "epoch": 12.918180679112893, "grad_norm": 10.686120986938477, "learning_rate": 3.3572694674467022e-06, "loss": 0.5281, "num_input_tokens_seen": 125377056, "step": 103100 }, { "epoch": 12.918807167021676, "grad_norm": 8.787320137023926, "learning_rate": 3.3567531126462687e-06, "loss": 0.5537, "num_input_tokens_seen": 125383168, "step": 103105 }, { "epoch": 12.91943365493046, "grad_norm": 1.1237198114395142, "learning_rate": 3.356236777492236e-06, "loss": 0.4494, "num_input_tokens_seen": 125389504, "step": 103110 }, { "epoch": 12.920060142839244, "grad_norm": 8.118117332458496, "learning_rate": 3.3557204619907722e-06, "loss": 0.5154, "num_input_tokens_seen": 125395648, "step": 103115 }, { "epoch": 12.920686630748026, "grad_norm": 6.036667823791504, "learning_rate": 3.3552041661480556e-06, "loss": 0.4563, "num_input_tokens_seen": 125401760, "step": 103120 }, { "epoch": 12.92131311865681, "grad_norm": 3.6646502017974854, "learning_rate": 3.354687889970253e-06, "loss": 0.4801, "num_input_tokens_seen": 125408192, "step": 103125 }, { "epoch": 12.921939606565592, "grad_norm": 4.515143871307373, "learning_rate": 3.354171633463542e-06, "loss": 0.5528, "num_input_tokens_seen": 125414464, "step": 103130 }, { "epoch": 12.922566094474377, "grad_norm": 0.9298688173294067, "learning_rate": 3.3536553966340924e-06, "loss": 0.467, "num_input_tokens_seen": 125420960, "step": 103135 }, { "epoch": 12.92319258238316, "grad_norm": 0.7401295304298401, "learning_rate": 3.353139179488076e-06, "loss": 0.483, "num_input_tokens_seen": 125426880, "step": 103140 }, { "epoch": 12.923819070291943, "grad_norm": 9.751256942749023, "learning_rate": 3.352622982031665e-06, "loss": 0.517, "num_input_tokens_seen": 125433248, "step": 103145 }, { "epoch": 12.924445558200727, "grad_norm": 11.169028282165527, "learning_rate": 3.3521068042710338e-06, "loss": 0.599, "num_input_tokens_seen": 125438976, "step": 103150 }, { "epoch": 12.92507204610951, "grad_norm": 8.461859703063965, "learning_rate": 3.351590646212348e-06, "loss": 0.5111, "num_input_tokens_seen": 125445376, "step": 103155 }, { "epoch": 12.925698534018293, "grad_norm": 0.906794011592865, "learning_rate": 3.3510745078617827e-06, "loss": 0.52, "num_input_tokens_seen": 125451808, "step": 103160 }, { "epoch": 12.926325021927077, "grad_norm": 3.283447504043579, "learning_rate": 3.35055838922551e-06, "loss": 0.4778, "num_input_tokens_seen": 125457856, "step": 103165 }, { "epoch": 12.92695150983586, "grad_norm": 0.9270105957984924, "learning_rate": 3.3500422903096964e-06, "loss": 0.4628, "num_input_tokens_seen": 125464032, "step": 103170 }, { "epoch": 12.927577997744644, "grad_norm": 1.2602626085281372, "learning_rate": 3.349526211120517e-06, "loss": 0.4746, "num_input_tokens_seen": 125470304, "step": 103175 }, { "epoch": 12.928204485653428, "grad_norm": 0.7512910962104797, "learning_rate": 3.3490101516641366e-06, "loss": 0.5167, "num_input_tokens_seen": 125476256, "step": 103180 }, { "epoch": 12.92883097356221, "grad_norm": 1.5940965414047241, "learning_rate": 3.348494111946729e-06, "loss": 0.4774, "num_input_tokens_seen": 125482432, "step": 103185 }, { "epoch": 12.929457461470994, "grad_norm": 1.050881266593933, "learning_rate": 3.347978091974463e-06, "loss": 0.435, "num_input_tokens_seen": 125488608, "step": 103190 }, { "epoch": 12.930083949379776, "grad_norm": 1.0435858964920044, "learning_rate": 3.3474620917535078e-06, "loss": 0.4304, "num_input_tokens_seen": 125494880, "step": 103195 }, { "epoch": 12.93071043728856, "grad_norm": 1.3415052890777588, "learning_rate": 3.3469461112900315e-06, "loss": 0.4125, "num_input_tokens_seen": 125501376, "step": 103200 }, { "epoch": 12.931336925197344, "grad_norm": 6.147839546203613, "learning_rate": 3.3464301505902064e-06, "loss": 0.4413, "num_input_tokens_seen": 125507616, "step": 103205 }, { "epoch": 12.931963413106127, "grad_norm": 9.156789779663086, "learning_rate": 3.345914209660197e-06, "loss": 0.6362, "num_input_tokens_seen": 125513632, "step": 103210 }, { "epoch": 12.93258990101491, "grad_norm": 9.750956535339355, "learning_rate": 3.345398288506174e-06, "loss": 0.6111, "num_input_tokens_seen": 125519712, "step": 103215 }, { "epoch": 12.933216388923693, "grad_norm": 5.914976596832275, "learning_rate": 3.3448823871343078e-06, "loss": 0.5345, "num_input_tokens_seen": 125525920, "step": 103220 }, { "epoch": 12.933842876832477, "grad_norm": 3.889491558074951, "learning_rate": 3.3443665055507625e-06, "loss": 0.5405, "num_input_tokens_seen": 125531872, "step": 103225 }, { "epoch": 12.934469364741261, "grad_norm": 0.9230214357376099, "learning_rate": 3.3438506437617093e-06, "loss": 0.5279, "num_input_tokens_seen": 125538112, "step": 103230 }, { "epoch": 12.935095852650043, "grad_norm": 5.785409450531006, "learning_rate": 3.343334801773311e-06, "loss": 0.4555, "num_input_tokens_seen": 125544416, "step": 103235 }, { "epoch": 12.935722340558828, "grad_norm": 1.1232657432556152, "learning_rate": 3.3428189795917397e-06, "loss": 0.5091, "num_input_tokens_seen": 125550784, "step": 103240 }, { "epoch": 12.93634882846761, "grad_norm": 1.0227099657058716, "learning_rate": 3.3423031772231607e-06, "loss": 0.4231, "num_input_tokens_seen": 125556800, "step": 103245 }, { "epoch": 12.936975316376394, "grad_norm": 8.869695663452148, "learning_rate": 3.3417873946737424e-06, "loss": 0.6243, "num_input_tokens_seen": 125562688, "step": 103250 }, { "epoch": 12.937601804285178, "grad_norm": 11.526305198669434, "learning_rate": 3.3412716319496474e-06, "loss": 0.4748, "num_input_tokens_seen": 125568000, "step": 103255 }, { "epoch": 12.93822829219396, "grad_norm": 5.50909423828125, "learning_rate": 3.3407558890570466e-06, "loss": 0.7702, "num_input_tokens_seen": 125574240, "step": 103260 }, { "epoch": 12.938854780102744, "grad_norm": 9.937063217163086, "learning_rate": 3.3402401660021024e-06, "loss": 0.5727, "num_input_tokens_seen": 125580352, "step": 103265 }, { "epoch": 12.939481268011527, "grad_norm": 0.8612741231918335, "learning_rate": 3.3397244627909848e-06, "loss": 0.7131, "num_input_tokens_seen": 125586528, "step": 103270 }, { "epoch": 12.94010775592031, "grad_norm": 4.233434677124023, "learning_rate": 3.339208779429855e-06, "loss": 0.5168, "num_input_tokens_seen": 125592992, "step": 103275 }, { "epoch": 12.940734243829095, "grad_norm": 1.283400297164917, "learning_rate": 3.3386931159248804e-06, "loss": 0.4416, "num_input_tokens_seen": 125598816, "step": 103280 }, { "epoch": 12.941360731737877, "grad_norm": 6.744304180145264, "learning_rate": 3.338177472282228e-06, "loss": 0.5704, "num_input_tokens_seen": 125605440, "step": 103285 }, { "epoch": 12.941987219646661, "grad_norm": 0.7744529843330383, "learning_rate": 3.3376618485080596e-06, "loss": 0.4546, "num_input_tokens_seen": 125611648, "step": 103290 }, { "epoch": 12.942613707555445, "grad_norm": 0.7255001068115234, "learning_rate": 3.337146244608542e-06, "loss": 0.4524, "num_input_tokens_seen": 125617696, "step": 103295 }, { "epoch": 12.943240195464227, "grad_norm": 0.8467555046081543, "learning_rate": 3.3366306605898392e-06, "loss": 0.4781, "num_input_tokens_seen": 125623808, "step": 103300 }, { "epoch": 12.943866683373011, "grad_norm": 3.4230220317840576, "learning_rate": 3.3361150964581163e-06, "loss": 0.4815, "num_input_tokens_seen": 125629984, "step": 103305 }, { "epoch": 12.944493171281794, "grad_norm": 0.8759778738021851, "learning_rate": 3.3355995522195352e-06, "loss": 0.4595, "num_input_tokens_seen": 125636160, "step": 103310 }, { "epoch": 12.945119659190578, "grad_norm": 0.7437314391136169, "learning_rate": 3.3350840278802623e-06, "loss": 0.4606, "num_input_tokens_seen": 125641920, "step": 103315 }, { "epoch": 12.945746147099362, "grad_norm": 1.712456226348877, "learning_rate": 3.334568523446457e-06, "loss": 0.4457, "num_input_tokens_seen": 125648064, "step": 103320 }, { "epoch": 12.946372635008144, "grad_norm": 0.5911202430725098, "learning_rate": 3.3340530389242875e-06, "loss": 0.4706, "num_input_tokens_seen": 125654336, "step": 103325 }, { "epoch": 12.946999122916928, "grad_norm": 2.309403419494629, "learning_rate": 3.3335375743199126e-06, "loss": 0.5187, "num_input_tokens_seen": 125660672, "step": 103330 }, { "epoch": 12.94762561082571, "grad_norm": 1.7552894353866577, "learning_rate": 3.3330221296394987e-06, "loss": 0.51, "num_input_tokens_seen": 125666880, "step": 103335 }, { "epoch": 12.948252098734494, "grad_norm": 2.230631113052368, "learning_rate": 3.3325067048892068e-06, "loss": 0.498, "num_input_tokens_seen": 125672800, "step": 103340 }, { "epoch": 12.948878586643279, "grad_norm": 1.8133982419967651, "learning_rate": 3.3319913000751973e-06, "loss": 0.4673, "num_input_tokens_seen": 125679104, "step": 103345 }, { "epoch": 12.94950507455206, "grad_norm": 0.7861396670341492, "learning_rate": 3.331475915203636e-06, "loss": 0.4704, "num_input_tokens_seen": 125685280, "step": 103350 }, { "epoch": 12.950131562460845, "grad_norm": 0.8523511290550232, "learning_rate": 3.3309605502806815e-06, "loss": 0.4571, "num_input_tokens_seen": 125691520, "step": 103355 }, { "epoch": 12.950758050369627, "grad_norm": 0.7518089413642883, "learning_rate": 3.330445205312499e-06, "loss": 0.4366, "num_input_tokens_seen": 125697824, "step": 103360 }, { "epoch": 12.951384538278411, "grad_norm": 1.0979186296463013, "learning_rate": 3.3299298803052453e-06, "loss": 0.4842, "num_input_tokens_seen": 125704192, "step": 103365 }, { "epoch": 12.952011026187195, "grad_norm": 0.6444380879402161, "learning_rate": 3.3294145752650865e-06, "loss": 0.4643, "num_input_tokens_seen": 125710176, "step": 103370 }, { "epoch": 12.952637514095978, "grad_norm": 0.7946580052375793, "learning_rate": 3.328899290198179e-06, "loss": 0.4406, "num_input_tokens_seen": 125715936, "step": 103375 }, { "epoch": 12.953264002004762, "grad_norm": 0.8614348769187927, "learning_rate": 3.3283840251106875e-06, "loss": 0.4564, "num_input_tokens_seen": 125721856, "step": 103380 }, { "epoch": 12.953890489913544, "grad_norm": 0.6132208704948425, "learning_rate": 3.3278687800087682e-06, "loss": 0.4709, "num_input_tokens_seen": 125728224, "step": 103385 }, { "epoch": 12.954516977822328, "grad_norm": 2.349843740463257, "learning_rate": 3.3273535548985846e-06, "loss": 0.4746, "num_input_tokens_seen": 125734336, "step": 103390 }, { "epoch": 12.955143465731112, "grad_norm": 1.5014601945877075, "learning_rate": 3.3268383497862953e-06, "loss": 0.4611, "num_input_tokens_seen": 125740192, "step": 103395 }, { "epoch": 12.955769953639894, "grad_norm": 0.41647517681121826, "learning_rate": 3.326323164678061e-06, "loss": 0.4823, "num_input_tokens_seen": 125745536, "step": 103400 }, { "epoch": 12.956396441548678, "grad_norm": 0.7199099063873291, "learning_rate": 3.3258079995800387e-06, "loss": 0.4591, "num_input_tokens_seen": 125751488, "step": 103405 }, { "epoch": 12.957022929457462, "grad_norm": 2.064514636993408, "learning_rate": 3.32529285449839e-06, "loss": 0.4515, "num_input_tokens_seen": 125757504, "step": 103410 }, { "epoch": 12.957649417366245, "grad_norm": 0.8411338329315186, "learning_rate": 3.324777729439274e-06, "loss": 0.4915, "num_input_tokens_seen": 125763488, "step": 103415 }, { "epoch": 12.958275905275029, "grad_norm": 0.5535696744918823, "learning_rate": 3.3242626244088476e-06, "loss": 0.4981, "num_input_tokens_seen": 125769568, "step": 103420 }, { "epoch": 12.958902393183811, "grad_norm": 0.47975021600723267, "learning_rate": 3.3237475394132723e-06, "loss": 0.4654, "num_input_tokens_seen": 125775520, "step": 103425 }, { "epoch": 12.959528881092595, "grad_norm": 0.6288431882858276, "learning_rate": 3.3232324744587023e-06, "loss": 0.4672, "num_input_tokens_seen": 125781824, "step": 103430 }, { "epoch": 12.960155369001379, "grad_norm": 0.5761226415634155, "learning_rate": 3.3227174295513003e-06, "loss": 0.4632, "num_input_tokens_seen": 125788128, "step": 103435 }, { "epoch": 12.960781856910161, "grad_norm": 1.3066943883895874, "learning_rate": 3.322202404697219e-06, "loss": 0.4503, "num_input_tokens_seen": 125794368, "step": 103440 }, { "epoch": 12.961408344818945, "grad_norm": 0.569024920463562, "learning_rate": 3.32168739990262e-06, "loss": 0.447, "num_input_tokens_seen": 125800224, "step": 103445 }, { "epoch": 12.962034832727728, "grad_norm": 0.6561444997787476, "learning_rate": 3.321172415173658e-06, "loss": 0.4574, "num_input_tokens_seen": 125806368, "step": 103450 }, { "epoch": 12.962661320636512, "grad_norm": 0.9978922009468079, "learning_rate": 3.3206574505164934e-06, "loss": 0.4537, "num_input_tokens_seen": 125812736, "step": 103455 }, { "epoch": 12.963287808545296, "grad_norm": 0.5930786728858948, "learning_rate": 3.320142505937279e-06, "loss": 0.459, "num_input_tokens_seen": 125818816, "step": 103460 }, { "epoch": 12.963914296454078, "grad_norm": 0.8537828922271729, "learning_rate": 3.319627581442173e-06, "loss": 0.4485, "num_input_tokens_seen": 125824704, "step": 103465 }, { "epoch": 12.964540784362862, "grad_norm": 0.6465638875961304, "learning_rate": 3.3191126770373337e-06, "loss": 0.4698, "num_input_tokens_seen": 125830528, "step": 103470 }, { "epoch": 12.965167272271644, "grad_norm": 0.5793489217758179, "learning_rate": 3.318597792728914e-06, "loss": 0.4508, "num_input_tokens_seen": 125836928, "step": 103475 }, { "epoch": 12.965793760180429, "grad_norm": 1.027722716331482, "learning_rate": 3.3180829285230737e-06, "loss": 0.4706, "num_input_tokens_seen": 125842976, "step": 103480 }, { "epoch": 12.966420248089213, "grad_norm": 0.7738630175590515, "learning_rate": 3.317568084425964e-06, "loss": 0.5033, "num_input_tokens_seen": 125849536, "step": 103485 }, { "epoch": 12.967046735997995, "grad_norm": 4.651191711425781, "learning_rate": 3.3170532604437434e-06, "loss": 0.5001, "num_input_tokens_seen": 125855776, "step": 103490 }, { "epoch": 12.967673223906779, "grad_norm": 0.7034714818000793, "learning_rate": 3.3165384565825648e-06, "loss": 0.4544, "num_input_tokens_seen": 125861760, "step": 103495 }, { "epoch": 12.968299711815561, "grad_norm": 0.6048309206962585, "learning_rate": 3.316023672848585e-06, "loss": 0.4695, "num_input_tokens_seen": 125867520, "step": 103500 }, { "epoch": 12.968926199724345, "grad_norm": 1.1040558815002441, "learning_rate": 3.3155089092479573e-06, "loss": 0.4833, "num_input_tokens_seen": 125873600, "step": 103505 }, { "epoch": 12.96955268763313, "grad_norm": 1.0239986181259155, "learning_rate": 3.314994165786839e-06, "loss": 0.447, "num_input_tokens_seen": 125879392, "step": 103510 }, { "epoch": 12.970179175541912, "grad_norm": 0.9259747266769409, "learning_rate": 3.31447944247138e-06, "loss": 0.474, "num_input_tokens_seen": 125885664, "step": 103515 }, { "epoch": 12.970805663450696, "grad_norm": 0.7995878458023071, "learning_rate": 3.3139647393077377e-06, "loss": 0.4587, "num_input_tokens_seen": 125891168, "step": 103520 }, { "epoch": 12.97143215135948, "grad_norm": 8.246827125549316, "learning_rate": 3.313450056302063e-06, "loss": 0.5083, "num_input_tokens_seen": 125897152, "step": 103525 }, { "epoch": 12.972058639268262, "grad_norm": 2.473559617996216, "learning_rate": 3.312935393460511e-06, "loss": 0.4646, "num_input_tokens_seen": 125903584, "step": 103530 }, { "epoch": 12.972685127177046, "grad_norm": 14.390228271484375, "learning_rate": 3.3124207507892364e-06, "loss": 0.4855, "num_input_tokens_seen": 125909728, "step": 103535 }, { "epoch": 12.973311615085828, "grad_norm": 16.38481330871582, "learning_rate": 3.3119061282943886e-06, "loss": 0.6476, "num_input_tokens_seen": 125916064, "step": 103540 }, { "epoch": 12.973938102994612, "grad_norm": 1.0192433595657349, "learning_rate": 3.3113915259821248e-06, "loss": 0.4614, "num_input_tokens_seen": 125922208, "step": 103545 }, { "epoch": 12.974564590903395, "grad_norm": 0.8421374559402466, "learning_rate": 3.310876943858593e-06, "loss": 0.4922, "num_input_tokens_seen": 125928768, "step": 103550 }, { "epoch": 12.975191078812179, "grad_norm": 0.5582891702651978, "learning_rate": 3.310362381929948e-06, "loss": 0.4589, "num_input_tokens_seen": 125935040, "step": 103555 }, { "epoch": 12.975817566720963, "grad_norm": 0.46915265917778015, "learning_rate": 3.3098478402023405e-06, "loss": 0.4437, "num_input_tokens_seen": 125940640, "step": 103560 }, { "epoch": 12.976444054629745, "grad_norm": 0.7601418495178223, "learning_rate": 3.309333318681925e-06, "loss": 0.4566, "num_input_tokens_seen": 125947040, "step": 103565 }, { "epoch": 12.977070542538529, "grad_norm": 0.5626574158668518, "learning_rate": 3.3088188173748493e-06, "loss": 0.4719, "num_input_tokens_seen": 125953216, "step": 103570 }, { "epoch": 12.977697030447313, "grad_norm": 0.8356487154960632, "learning_rate": 3.308304336287268e-06, "loss": 0.4976, "num_input_tokens_seen": 125959488, "step": 103575 }, { "epoch": 12.978323518356095, "grad_norm": 0.3351375460624695, "learning_rate": 3.307789875425329e-06, "loss": 0.4597, "num_input_tokens_seen": 125965696, "step": 103580 }, { "epoch": 12.97895000626488, "grad_norm": 5.999464511871338, "learning_rate": 3.307275434795187e-06, "loss": 0.4869, "num_input_tokens_seen": 125971008, "step": 103585 }, { "epoch": 12.979576494173662, "grad_norm": 1.384202480316162, "learning_rate": 3.3067610144029883e-06, "loss": 0.4856, "num_input_tokens_seen": 125977120, "step": 103590 }, { "epoch": 12.980202982082446, "grad_norm": 0.4515262842178345, "learning_rate": 3.306246614254884e-06, "loss": 0.4496, "num_input_tokens_seen": 125983584, "step": 103595 }, { "epoch": 12.98082946999123, "grad_norm": 0.35764768719673157, "learning_rate": 3.3057322343570284e-06, "loss": 0.4593, "num_input_tokens_seen": 125989856, "step": 103600 }, { "epoch": 12.981455957900012, "grad_norm": 0.7807551622390747, "learning_rate": 3.305217874715567e-06, "loss": 0.4613, "num_input_tokens_seen": 125996096, "step": 103605 }, { "epoch": 12.982082445808796, "grad_norm": 1.9065794944763184, "learning_rate": 3.304703535336652e-06, "loss": 0.4692, "num_input_tokens_seen": 126001920, "step": 103610 }, { "epoch": 12.982708933717579, "grad_norm": 0.8431287407875061, "learning_rate": 3.30418921622643e-06, "loss": 0.4511, "num_input_tokens_seen": 126008128, "step": 103615 }, { "epoch": 12.983335421626363, "grad_norm": 0.34433144330978394, "learning_rate": 3.3036749173910533e-06, "loss": 0.4773, "num_input_tokens_seen": 126014304, "step": 103620 }, { "epoch": 12.983961909535147, "grad_norm": 0.7544437646865845, "learning_rate": 3.3031606388366675e-06, "loss": 0.4498, "num_input_tokens_seen": 126020480, "step": 103625 }, { "epoch": 12.984588397443929, "grad_norm": 0.5294344425201416, "learning_rate": 3.3026463805694247e-06, "loss": 0.4797, "num_input_tokens_seen": 126026624, "step": 103630 }, { "epoch": 12.985214885352713, "grad_norm": 0.47825127840042114, "learning_rate": 3.30213214259547e-06, "loss": 0.4488, "num_input_tokens_seen": 126033024, "step": 103635 }, { "epoch": 12.985841373261495, "grad_norm": 0.41556552052497864, "learning_rate": 3.3016179249209544e-06, "loss": 0.4495, "num_input_tokens_seen": 126038080, "step": 103640 }, { "epoch": 12.98646786117028, "grad_norm": 1.5770549774169922, "learning_rate": 3.3011037275520225e-06, "loss": 0.4905, "num_input_tokens_seen": 126044096, "step": 103645 }, { "epoch": 12.987094349079063, "grad_norm": 0.47727543115615845, "learning_rate": 3.3005895504948237e-06, "loss": 0.4604, "num_input_tokens_seen": 126050400, "step": 103650 }, { "epoch": 12.987720836987846, "grad_norm": 0.5499053597450256, "learning_rate": 3.3000753937555076e-06, "loss": 0.4583, "num_input_tokens_seen": 126056352, "step": 103655 }, { "epoch": 12.98834732489663, "grad_norm": 1.0511550903320312, "learning_rate": 3.2995612573402185e-06, "loss": 0.4424, "num_input_tokens_seen": 126062720, "step": 103660 }, { "epoch": 12.988973812805412, "grad_norm": 0.5900497436523438, "learning_rate": 3.2990471412551046e-06, "loss": 0.4692, "num_input_tokens_seen": 126068992, "step": 103665 }, { "epoch": 12.989600300714196, "grad_norm": 0.9902445673942566, "learning_rate": 3.2985330455063107e-06, "loss": 0.46, "num_input_tokens_seen": 126074880, "step": 103670 }, { "epoch": 12.99022678862298, "grad_norm": 0.7601921558380127, "learning_rate": 3.2980189700999876e-06, "loss": 0.4482, "num_input_tokens_seen": 126081152, "step": 103675 }, { "epoch": 12.990853276531762, "grad_norm": 0.5716251134872437, "learning_rate": 3.297504915042276e-06, "loss": 0.472, "num_input_tokens_seen": 126087360, "step": 103680 }, { "epoch": 12.991479764440546, "grad_norm": 4.799342155456543, "learning_rate": 3.296990880339327e-06, "loss": 0.4724, "num_input_tokens_seen": 126093280, "step": 103685 }, { "epoch": 12.99210625234933, "grad_norm": 0.879144012928009, "learning_rate": 3.2964768659972813e-06, "loss": 0.4661, "num_input_tokens_seen": 126099520, "step": 103690 }, { "epoch": 12.992732740258113, "grad_norm": 0.44519761204719543, "learning_rate": 3.295962872022289e-06, "loss": 0.455, "num_input_tokens_seen": 126105440, "step": 103695 }, { "epoch": 12.993359228166897, "grad_norm": 0.3863636255264282, "learning_rate": 3.2954488984204913e-06, "loss": 0.4722, "num_input_tokens_seen": 126111360, "step": 103700 }, { "epoch": 12.993985716075679, "grad_norm": 0.5710883140563965, "learning_rate": 3.2949349451980363e-06, "loss": 0.4548, "num_input_tokens_seen": 126117376, "step": 103705 }, { "epoch": 12.994612203984463, "grad_norm": 0.45224979519844055, "learning_rate": 3.2944210123610666e-06, "loss": 0.4545, "num_input_tokens_seen": 126123392, "step": 103710 }, { "epoch": 12.995238691893247, "grad_norm": 0.5876604318618774, "learning_rate": 3.2939070999157273e-06, "loss": 0.4671, "num_input_tokens_seen": 126129568, "step": 103715 }, { "epoch": 12.99586517980203, "grad_norm": 0.5122970342636108, "learning_rate": 3.2933932078681653e-06, "loss": 0.4704, "num_input_tokens_seen": 126135552, "step": 103720 }, { "epoch": 12.996491667710814, "grad_norm": 0.5798671841621399, "learning_rate": 3.2928793362245203e-06, "loss": 0.4694, "num_input_tokens_seen": 126141664, "step": 103725 }, { "epoch": 12.997118155619596, "grad_norm": 0.9458259344100952, "learning_rate": 3.29236548499094e-06, "loss": 0.4467, "num_input_tokens_seen": 126147872, "step": 103730 }, { "epoch": 12.99774464352838, "grad_norm": 0.459803968667984, "learning_rate": 3.2918516541735646e-06, "loss": 0.4842, "num_input_tokens_seen": 126154080, "step": 103735 }, { "epoch": 12.998371131437164, "grad_norm": 0.7514359951019287, "learning_rate": 3.2913378437785405e-06, "loss": 0.4516, "num_input_tokens_seen": 126159904, "step": 103740 }, { "epoch": 12.998997619345946, "grad_norm": 0.590893566608429, "learning_rate": 3.2908240538120067e-06, "loss": 0.457, "num_input_tokens_seen": 126166208, "step": 103745 }, { "epoch": 12.99962410725473, "grad_norm": 3.8895928859710693, "learning_rate": 3.2903102842801092e-06, "loss": 0.4906, "num_input_tokens_seen": 126172288, "step": 103750 }, { "epoch": 13.000250595163513, "grad_norm": 0.6433263421058655, "learning_rate": 3.2897965351889905e-06, "loss": 0.4966, "num_input_tokens_seen": 126177152, "step": 103755 }, { "epoch": 13.000877083072297, "grad_norm": 0.3103019595146179, "learning_rate": 3.2892828065447914e-06, "loss": 0.4663, "num_input_tokens_seen": 126183168, "step": 103760 }, { "epoch": 13.00150357098108, "grad_norm": 0.6960554718971252, "learning_rate": 3.2887690983536537e-06, "loss": 0.4538, "num_input_tokens_seen": 126189472, "step": 103765 }, { "epoch": 13.002130058889863, "grad_norm": 0.33009305596351624, "learning_rate": 3.2882554106217224e-06, "loss": 0.4668, "num_input_tokens_seen": 126195264, "step": 103770 }, { "epoch": 13.002756546798647, "grad_norm": 0.5712080001831055, "learning_rate": 3.287741743355135e-06, "loss": 0.4494, "num_input_tokens_seen": 126201120, "step": 103775 }, { "epoch": 13.00338303470743, "grad_norm": 0.6020228862762451, "learning_rate": 3.2872280965600344e-06, "loss": 0.4561, "num_input_tokens_seen": 126207168, "step": 103780 }, { "epoch": 13.004009522616213, "grad_norm": 0.5232945084571838, "learning_rate": 3.2867144702425635e-06, "loss": 0.4402, "num_input_tokens_seen": 126212736, "step": 103785 }, { "epoch": 13.004636010524997, "grad_norm": 0.7092466354370117, "learning_rate": 3.28620086440886e-06, "loss": 0.4423, "num_input_tokens_seen": 126218784, "step": 103790 }, { "epoch": 13.00526249843378, "grad_norm": 0.4078976511955261, "learning_rate": 3.2856872790650685e-06, "loss": 0.478, "num_input_tokens_seen": 126225120, "step": 103795 }, { "epoch": 13.005888986342564, "grad_norm": 2.0983612537384033, "learning_rate": 3.285173714217324e-06, "loss": 0.446, "num_input_tokens_seen": 126231168, "step": 103800 }, { "epoch": 13.006515474251348, "grad_norm": 0.7379494905471802, "learning_rate": 3.2846601698717704e-06, "loss": 0.4813, "num_input_tokens_seen": 126236960, "step": 103805 }, { "epoch": 13.00714196216013, "grad_norm": 0.6832419037818909, "learning_rate": 3.284146646034547e-06, "loss": 0.4526, "num_input_tokens_seen": 126243200, "step": 103810 }, { "epoch": 13.007768450068914, "grad_norm": 0.6667872667312622, "learning_rate": 3.283633142711794e-06, "loss": 0.4808, "num_input_tokens_seen": 126249536, "step": 103815 }, { "epoch": 13.008394937977696, "grad_norm": 1.0579309463500977, "learning_rate": 3.2831196599096475e-06, "loss": 0.4721, "num_input_tokens_seen": 126255584, "step": 103820 }, { "epoch": 13.00902142588648, "grad_norm": 5.287835121154785, "learning_rate": 3.2826061976342516e-06, "loss": 0.4556, "num_input_tokens_seen": 126262048, "step": 103825 }, { "epoch": 13.009647913795265, "grad_norm": 0.7576152086257935, "learning_rate": 3.28209275589174e-06, "loss": 0.456, "num_input_tokens_seen": 126268256, "step": 103830 }, { "epoch": 13.010274401704047, "grad_norm": 0.39556020498275757, "learning_rate": 3.2815793346882548e-06, "loss": 0.4659, "num_input_tokens_seen": 126274016, "step": 103835 }, { "epoch": 13.01090088961283, "grad_norm": 0.520822286605835, "learning_rate": 3.2810659340299343e-06, "loss": 0.4977, "num_input_tokens_seen": 126280384, "step": 103840 }, { "epoch": 13.011527377521613, "grad_norm": 0.6987144351005554, "learning_rate": 3.2805525539229145e-06, "loss": 0.4538, "num_input_tokens_seen": 126286464, "step": 103845 }, { "epoch": 13.012153865430397, "grad_norm": 0.9378827214241028, "learning_rate": 3.2800391943733366e-06, "loss": 0.4434, "num_input_tokens_seen": 126293088, "step": 103850 }, { "epoch": 13.012780353339181, "grad_norm": 0.6029555201530457, "learning_rate": 3.2795258553873338e-06, "loss": 0.4407, "num_input_tokens_seen": 126299072, "step": 103855 }, { "epoch": 13.013406841247964, "grad_norm": 0.7504973411560059, "learning_rate": 3.2790125369710466e-06, "loss": 0.4691, "num_input_tokens_seen": 126305152, "step": 103860 }, { "epoch": 13.014033329156748, "grad_norm": 6.668743133544922, "learning_rate": 3.2784992391306116e-06, "loss": 0.4717, "num_input_tokens_seen": 126311200, "step": 103865 }, { "epoch": 13.01465981706553, "grad_norm": 0.9906325936317444, "learning_rate": 3.2779859618721665e-06, "loss": 0.492, "num_input_tokens_seen": 126317376, "step": 103870 }, { "epoch": 13.015286304974314, "grad_norm": 0.8369306325912476, "learning_rate": 3.277472705201845e-06, "loss": 0.4604, "num_input_tokens_seen": 126323360, "step": 103875 }, { "epoch": 13.015912792883098, "grad_norm": 0.9033542275428772, "learning_rate": 3.2769594691257877e-06, "loss": 0.4597, "num_input_tokens_seen": 126329568, "step": 103880 }, { "epoch": 13.01653928079188, "grad_norm": 0.6146185398101807, "learning_rate": 3.2764462536501275e-06, "loss": 0.4413, "num_input_tokens_seen": 126335776, "step": 103885 }, { "epoch": 13.017165768700664, "grad_norm": 0.5696796178817749, "learning_rate": 3.275933058781002e-06, "loss": 0.515, "num_input_tokens_seen": 126341664, "step": 103890 }, { "epoch": 13.017792256609447, "grad_norm": 0.8381715416908264, "learning_rate": 3.2754198845245456e-06, "loss": 0.4805, "num_input_tokens_seen": 126347680, "step": 103895 }, { "epoch": 13.01841874451823, "grad_norm": 0.8521120548248291, "learning_rate": 3.2749067308868933e-06, "loss": 0.432, "num_input_tokens_seen": 126353600, "step": 103900 }, { "epoch": 13.019045232427015, "grad_norm": 0.6012864112854004, "learning_rate": 3.274393597874184e-06, "loss": 0.4542, "num_input_tokens_seen": 126359456, "step": 103905 }, { "epoch": 13.019671720335797, "grad_norm": 3.0704874992370605, "learning_rate": 3.2738804854925483e-06, "loss": 0.455, "num_input_tokens_seen": 126365824, "step": 103910 }, { "epoch": 13.020298208244581, "grad_norm": 1.136950969696045, "learning_rate": 3.2733673937481233e-06, "loss": 0.4398, "num_input_tokens_seen": 126371584, "step": 103915 }, { "epoch": 13.020924696153365, "grad_norm": 0.8450345993041992, "learning_rate": 3.2728543226470414e-06, "loss": 0.5077, "num_input_tokens_seen": 126377632, "step": 103920 }, { "epoch": 13.021551184062147, "grad_norm": 2.472879409790039, "learning_rate": 3.2723412721954408e-06, "loss": 0.612, "num_input_tokens_seen": 126383744, "step": 103925 }, { "epoch": 13.022177671970931, "grad_norm": 5.3398261070251465, "learning_rate": 3.271828242399451e-06, "loss": 0.6306, "num_input_tokens_seen": 126389856, "step": 103930 }, { "epoch": 13.022804159879714, "grad_norm": 0.9915297031402588, "learning_rate": 3.2713152332652086e-06, "loss": 0.4446, "num_input_tokens_seen": 126396416, "step": 103935 }, { "epoch": 13.023430647788498, "grad_norm": 0.8791563510894775, "learning_rate": 3.2708022447988446e-06, "loss": 0.4694, "num_input_tokens_seen": 126402944, "step": 103940 }, { "epoch": 13.024057135697282, "grad_norm": 0.9104170203208923, "learning_rate": 3.2702892770064953e-06, "loss": 0.4285, "num_input_tokens_seen": 126409152, "step": 103945 }, { "epoch": 13.024683623606064, "grad_norm": 0.8965786695480347, "learning_rate": 3.26977632989429e-06, "loss": 0.4749, "num_input_tokens_seen": 126415168, "step": 103950 }, { "epoch": 13.025310111514848, "grad_norm": 2.5985324382781982, "learning_rate": 3.2692634034683644e-06, "loss": 0.435, "num_input_tokens_seen": 126421184, "step": 103955 }, { "epoch": 13.02593659942363, "grad_norm": 7.164261341094971, "learning_rate": 3.2687504977348505e-06, "loss": 0.5129, "num_input_tokens_seen": 126427296, "step": 103960 }, { "epoch": 13.026563087332415, "grad_norm": 7.381282806396484, "learning_rate": 3.2682376126998783e-06, "loss": 0.4559, "num_input_tokens_seen": 126433344, "step": 103965 }, { "epoch": 13.027189575241199, "grad_norm": 3.920640707015991, "learning_rate": 3.267724748369583e-06, "loss": 0.5048, "num_input_tokens_seen": 126439744, "step": 103970 }, { "epoch": 13.02781606314998, "grad_norm": 2.781201124191284, "learning_rate": 3.267211904750093e-06, "loss": 0.6513, "num_input_tokens_seen": 126445888, "step": 103975 }, { "epoch": 13.028442551058765, "grad_norm": 0.9391545653343201, "learning_rate": 3.266699081847544e-06, "loss": 0.4853, "num_input_tokens_seen": 126451872, "step": 103980 }, { "epoch": 13.029069038967547, "grad_norm": 3.6305975914001465, "learning_rate": 3.2661862796680626e-06, "loss": 0.5317, "num_input_tokens_seen": 126458016, "step": 103985 }, { "epoch": 13.029695526876331, "grad_norm": 1.0139557123184204, "learning_rate": 3.265673498217784e-06, "loss": 0.4943, "num_input_tokens_seen": 126464352, "step": 103990 }, { "epoch": 13.030322014785115, "grad_norm": 0.7530843019485474, "learning_rate": 3.265160737502835e-06, "loss": 0.4491, "num_input_tokens_seen": 126470240, "step": 103995 }, { "epoch": 13.030948502693898, "grad_norm": 11.904817581176758, "learning_rate": 3.26464799752935e-06, "loss": 0.5332, "num_input_tokens_seen": 126476480, "step": 104000 }, { "epoch": 13.031574990602682, "grad_norm": 1.1606649160385132, "learning_rate": 3.264135278303455e-06, "loss": 0.4569, "num_input_tokens_seen": 126482912, "step": 104005 }, { "epoch": 13.032201478511464, "grad_norm": 0.7885345220565796, "learning_rate": 3.263622579831283e-06, "loss": 0.4847, "num_input_tokens_seen": 126488896, "step": 104010 }, { "epoch": 13.032827966420248, "grad_norm": 0.44827842712402344, "learning_rate": 3.2631099021189622e-06, "loss": 0.4332, "num_input_tokens_seen": 126495008, "step": 104015 }, { "epoch": 13.033454454329032, "grad_norm": 1.2295620441436768, "learning_rate": 3.262597245172624e-06, "loss": 0.4602, "num_input_tokens_seen": 126501536, "step": 104020 }, { "epoch": 13.034080942237814, "grad_norm": 0.6702165603637695, "learning_rate": 3.2620846089983958e-06, "loss": 0.4525, "num_input_tokens_seen": 126506464, "step": 104025 }, { "epoch": 13.034707430146598, "grad_norm": 0.7486661076545715, "learning_rate": 3.261571993602407e-06, "loss": 0.4436, "num_input_tokens_seen": 126512544, "step": 104030 }, { "epoch": 13.03533391805538, "grad_norm": 1.0878452062606812, "learning_rate": 3.261059398990788e-06, "loss": 0.4555, "num_input_tokens_seen": 126518592, "step": 104035 }, { "epoch": 13.035960405964165, "grad_norm": 1.482431173324585, "learning_rate": 3.2605468251696644e-06, "loss": 0.504, "num_input_tokens_seen": 126524704, "step": 104040 }, { "epoch": 13.036586893872949, "grad_norm": 4.517853260040283, "learning_rate": 3.2600342721451674e-06, "loss": 0.4765, "num_input_tokens_seen": 126530976, "step": 104045 }, { "epoch": 13.037213381781731, "grad_norm": 0.6390486359596252, "learning_rate": 3.259521739923422e-06, "loss": 0.5115, "num_input_tokens_seen": 126537280, "step": 104050 }, { "epoch": 13.037839869690515, "grad_norm": 0.99154132604599, "learning_rate": 3.2590092285105598e-06, "loss": 0.4303, "num_input_tokens_seen": 126543392, "step": 104055 }, { "epoch": 13.0384663575993, "grad_norm": 7.472101211547852, "learning_rate": 3.258496737912703e-06, "loss": 0.551, "num_input_tokens_seen": 126549632, "step": 104060 }, { "epoch": 13.039092845508081, "grad_norm": 1.0334832668304443, "learning_rate": 3.257984268135984e-06, "loss": 0.4729, "num_input_tokens_seen": 126555424, "step": 104065 }, { "epoch": 13.039719333416866, "grad_norm": 0.5635339617729187, "learning_rate": 3.257471819186526e-06, "loss": 0.4563, "num_input_tokens_seen": 126561888, "step": 104070 }, { "epoch": 13.040345821325648, "grad_norm": 0.6416473984718323, "learning_rate": 3.25695939107046e-06, "loss": 0.4579, "num_input_tokens_seen": 126568384, "step": 104075 }, { "epoch": 13.040972309234432, "grad_norm": 0.6870306730270386, "learning_rate": 3.256446983793907e-06, "loss": 0.4913, "num_input_tokens_seen": 126574752, "step": 104080 }, { "epoch": 13.041598797143216, "grad_norm": 1.0364563465118408, "learning_rate": 3.255934597362997e-06, "loss": 0.4753, "num_input_tokens_seen": 126580928, "step": 104085 }, { "epoch": 13.042225285051998, "grad_norm": 5.654852867126465, "learning_rate": 3.255422231783857e-06, "loss": 0.4597, "num_input_tokens_seen": 126587072, "step": 104090 }, { "epoch": 13.042851772960782, "grad_norm": 0.568269670009613, "learning_rate": 3.254909887062609e-06, "loss": 0.4564, "num_input_tokens_seen": 126592864, "step": 104095 }, { "epoch": 13.043478260869565, "grad_norm": 1.605724573135376, "learning_rate": 3.2543975632053825e-06, "loss": 0.4655, "num_input_tokens_seen": 126599264, "step": 104100 }, { "epoch": 13.044104748778349, "grad_norm": 1.0191153287887573, "learning_rate": 3.2538852602182984e-06, "loss": 0.4637, "num_input_tokens_seen": 126605472, "step": 104105 }, { "epoch": 13.044731236687133, "grad_norm": 0.5257105231285095, "learning_rate": 3.2533729781074864e-06, "loss": 0.4528, "num_input_tokens_seen": 126611744, "step": 104110 }, { "epoch": 13.045357724595915, "grad_norm": 0.5920507311820984, "learning_rate": 3.252860716879067e-06, "loss": 0.4502, "num_input_tokens_seen": 126617696, "step": 104115 }, { "epoch": 13.045984212504699, "grad_norm": 0.5573622584342957, "learning_rate": 3.2523484765391677e-06, "loss": 0.4969, "num_input_tokens_seen": 126623616, "step": 104120 }, { "epoch": 13.046610700413481, "grad_norm": 1.2063839435577393, "learning_rate": 3.2518362570939106e-06, "loss": 0.4504, "num_input_tokens_seen": 126629888, "step": 104125 }, { "epoch": 13.047237188322265, "grad_norm": 4.658121585845947, "learning_rate": 3.2513240585494228e-06, "loss": 0.4563, "num_input_tokens_seen": 126635936, "step": 104130 }, { "epoch": 13.04786367623105, "grad_norm": 0.8537753820419312, "learning_rate": 3.250811880911824e-06, "loss": 0.4672, "num_input_tokens_seen": 126642432, "step": 104135 }, { "epoch": 13.048490164139832, "grad_norm": 4.250620365142822, "learning_rate": 3.250299724187241e-06, "loss": 0.4818, "num_input_tokens_seen": 126648672, "step": 104140 }, { "epoch": 13.049116652048616, "grad_norm": 0.9733129739761353, "learning_rate": 3.2497875883817955e-06, "loss": 0.4637, "num_input_tokens_seen": 126654592, "step": 104145 }, { "epoch": 13.049743139957398, "grad_norm": 0.5912908315658569, "learning_rate": 3.2492754735016098e-06, "loss": 0.4814, "num_input_tokens_seen": 126660960, "step": 104150 }, { "epoch": 13.050369627866182, "grad_norm": 1.2228649854660034, "learning_rate": 3.2487633795528105e-06, "loss": 0.4525, "num_input_tokens_seen": 126666688, "step": 104155 }, { "epoch": 13.050996115774966, "grad_norm": 1.7675104141235352, "learning_rate": 3.2482513065415146e-06, "loss": 0.4514, "num_input_tokens_seen": 126672544, "step": 104160 }, { "epoch": 13.051622603683748, "grad_norm": 1.0069407224655151, "learning_rate": 3.2477392544738486e-06, "loss": 0.5428, "num_input_tokens_seen": 126678656, "step": 104165 }, { "epoch": 13.052249091592532, "grad_norm": 1.0320801734924316, "learning_rate": 3.2472272233559328e-06, "loss": 0.4675, "num_input_tokens_seen": 126684960, "step": 104170 }, { "epoch": 13.052875579501316, "grad_norm": 0.8432939648628235, "learning_rate": 3.2467152131938894e-06, "loss": 0.4535, "num_input_tokens_seen": 126691232, "step": 104175 }, { "epoch": 13.053502067410099, "grad_norm": 1.6630982160568237, "learning_rate": 3.2462032239938385e-06, "loss": 0.4587, "num_input_tokens_seen": 126697120, "step": 104180 }, { "epoch": 13.054128555318883, "grad_norm": 0.8223594427108765, "learning_rate": 3.2456912557619047e-06, "loss": 0.4568, "num_input_tokens_seen": 126703456, "step": 104185 }, { "epoch": 13.054755043227665, "grad_norm": 0.9213127493858337, "learning_rate": 3.245179308504205e-06, "loss": 0.4541, "num_input_tokens_seen": 126710048, "step": 104190 }, { "epoch": 13.05538153113645, "grad_norm": 0.8997503519058228, "learning_rate": 3.244667382226864e-06, "loss": 0.4672, "num_input_tokens_seen": 126716384, "step": 104195 }, { "epoch": 13.056008019045233, "grad_norm": 0.774457573890686, "learning_rate": 3.2441554769359973e-06, "loss": 0.448, "num_input_tokens_seen": 126722304, "step": 104200 }, { "epoch": 13.056634506954016, "grad_norm": 0.9945064783096313, "learning_rate": 3.2436435926377314e-06, "loss": 0.478, "num_input_tokens_seen": 126728544, "step": 104205 }, { "epoch": 13.0572609948628, "grad_norm": 1.2267240285873413, "learning_rate": 3.24313172933818e-06, "loss": 0.4947, "num_input_tokens_seen": 126734592, "step": 104210 }, { "epoch": 13.057887482771582, "grad_norm": 4.414038181304932, "learning_rate": 3.2426198870434667e-06, "loss": 0.4602, "num_input_tokens_seen": 126740832, "step": 104215 }, { "epoch": 13.058513970680366, "grad_norm": 3.7062811851501465, "learning_rate": 3.242108065759711e-06, "loss": 0.4827, "num_input_tokens_seen": 126747040, "step": 104220 }, { "epoch": 13.05914045858915, "grad_norm": 0.6374360918998718, "learning_rate": 3.241596265493031e-06, "loss": 0.4527, "num_input_tokens_seen": 126752800, "step": 104225 }, { "epoch": 13.059766946497932, "grad_norm": 3.5432655811309814, "learning_rate": 3.241084486249546e-06, "loss": 0.4587, "num_input_tokens_seen": 126758912, "step": 104230 }, { "epoch": 13.060393434406716, "grad_norm": 3.5827741622924805, "learning_rate": 3.240572728035374e-06, "loss": 0.4698, "num_input_tokens_seen": 126764928, "step": 104235 }, { "epoch": 13.061019922315499, "grad_norm": 0.6556239724159241, "learning_rate": 3.2400609908566365e-06, "loss": 0.456, "num_input_tokens_seen": 126770784, "step": 104240 }, { "epoch": 13.061646410224283, "grad_norm": 2.0649280548095703, "learning_rate": 3.2395492747194472e-06, "loss": 0.4566, "num_input_tokens_seen": 126777024, "step": 104245 }, { "epoch": 13.062272898133067, "grad_norm": 1.3977137804031372, "learning_rate": 3.239037579629929e-06, "loss": 0.4573, "num_input_tokens_seen": 126783424, "step": 104250 }, { "epoch": 13.062899386041849, "grad_norm": 0.9463582634925842, "learning_rate": 3.2385259055941943e-06, "loss": 0.5095, "num_input_tokens_seen": 126789312, "step": 104255 }, { "epoch": 13.063525873950633, "grad_norm": 6.288334846496582, "learning_rate": 3.2380142526183656e-06, "loss": 0.4625, "num_input_tokens_seen": 126795584, "step": 104260 }, { "epoch": 13.064152361859415, "grad_norm": 0.9722813963890076, "learning_rate": 3.237502620708556e-06, "loss": 0.48, "num_input_tokens_seen": 126801536, "step": 104265 }, { "epoch": 13.0647788497682, "grad_norm": 1.245599627494812, "learning_rate": 3.2369910098708845e-06, "loss": 0.4761, "num_input_tokens_seen": 126807392, "step": 104270 }, { "epoch": 13.065405337676983, "grad_norm": 1.0011886358261108, "learning_rate": 3.2364794201114684e-06, "loss": 0.4449, "num_input_tokens_seen": 126813504, "step": 104275 }, { "epoch": 13.066031825585766, "grad_norm": 5.516716480255127, "learning_rate": 3.2359678514364235e-06, "loss": 0.4792, "num_input_tokens_seen": 126819648, "step": 104280 }, { "epoch": 13.06665831349455, "grad_norm": 1.227848768234253, "learning_rate": 3.2354563038518667e-06, "loss": 0.4763, "num_input_tokens_seen": 126826240, "step": 104285 }, { "epoch": 13.067284801403332, "grad_norm": 0.724686861038208, "learning_rate": 3.234944777363912e-06, "loss": 0.4575, "num_input_tokens_seen": 126832320, "step": 104290 }, { "epoch": 13.067911289312116, "grad_norm": 4.003359317779541, "learning_rate": 3.234433271978679e-06, "loss": 0.4765, "num_input_tokens_seen": 126838208, "step": 104295 }, { "epoch": 13.0685377772209, "grad_norm": 0.8403142094612122, "learning_rate": 3.233921787702277e-06, "loss": 0.4427, "num_input_tokens_seen": 126844224, "step": 104300 }, { "epoch": 13.069164265129682, "grad_norm": 0.8590046167373657, "learning_rate": 3.2334103245408284e-06, "loss": 0.4808, "num_input_tokens_seen": 126850176, "step": 104305 }, { "epoch": 13.069790753038466, "grad_norm": 1.989972472190857, "learning_rate": 3.2328988825004414e-06, "loss": 0.4475, "num_input_tokens_seen": 126856320, "step": 104310 }, { "epoch": 13.07041724094725, "grad_norm": 3.880793333053589, "learning_rate": 3.2323874615872377e-06, "loss": 0.4807, "num_input_tokens_seen": 126862496, "step": 104315 }, { "epoch": 13.071043728856033, "grad_norm": 0.4766528308391571, "learning_rate": 3.2318760618073243e-06, "loss": 0.4508, "num_input_tokens_seen": 126868704, "step": 104320 }, { "epoch": 13.071670216764817, "grad_norm": 0.7024231553077698, "learning_rate": 3.231364683166821e-06, "loss": 0.4538, "num_input_tokens_seen": 126874816, "step": 104325 }, { "epoch": 13.0722967046736, "grad_norm": 1.0583276748657227, "learning_rate": 3.230853325671839e-06, "loss": 0.4583, "num_input_tokens_seen": 126880864, "step": 104330 }, { "epoch": 13.072923192582383, "grad_norm": 2.415414810180664, "learning_rate": 3.2303419893284925e-06, "loss": 0.4631, "num_input_tokens_seen": 126886944, "step": 104335 }, { "epoch": 13.073549680491167, "grad_norm": 0.7790862321853638, "learning_rate": 3.2298306741428975e-06, "loss": 0.4636, "num_input_tokens_seen": 126893152, "step": 104340 }, { "epoch": 13.07417616839995, "grad_norm": 2.3322207927703857, "learning_rate": 3.229319380121163e-06, "loss": 0.4524, "num_input_tokens_seen": 126897504, "step": 104345 }, { "epoch": 13.074802656308734, "grad_norm": 1.4173296689987183, "learning_rate": 3.2288081072694062e-06, "loss": 0.4726, "num_input_tokens_seen": 126903712, "step": 104350 }, { "epoch": 13.075429144217516, "grad_norm": 0.5227436423301697, "learning_rate": 3.2282968555937356e-06, "loss": 0.5173, "num_input_tokens_seen": 126909344, "step": 104355 }, { "epoch": 13.0760556321263, "grad_norm": 0.989903450012207, "learning_rate": 3.227785625100267e-06, "loss": 0.4576, "num_input_tokens_seen": 126915488, "step": 104360 }, { "epoch": 13.076682120035084, "grad_norm": 0.841813325881958, "learning_rate": 3.22727441579511e-06, "loss": 0.4626, "num_input_tokens_seen": 126921792, "step": 104365 }, { "epoch": 13.077308607943866, "grad_norm": 0.7996644377708435, "learning_rate": 3.2267632276843784e-06, "loss": 0.4568, "num_input_tokens_seen": 126927904, "step": 104370 }, { "epoch": 13.07793509585265, "grad_norm": 0.9879739880561829, "learning_rate": 3.2262520607741835e-06, "loss": 0.4474, "num_input_tokens_seen": 126934048, "step": 104375 }, { "epoch": 13.078561583761433, "grad_norm": 1.2629144191741943, "learning_rate": 3.2257409150706364e-06, "loss": 0.4698, "num_input_tokens_seen": 126940256, "step": 104380 }, { "epoch": 13.079188071670217, "grad_norm": 1.2241653203964233, "learning_rate": 3.225229790579848e-06, "loss": 0.4431, "num_input_tokens_seen": 126945600, "step": 104385 }, { "epoch": 13.079814559579, "grad_norm": 4.246826648712158, "learning_rate": 3.2247186873079307e-06, "loss": 0.5042, "num_input_tokens_seen": 126951936, "step": 104390 }, { "epoch": 13.080441047487783, "grad_norm": 7.961634635925293, "learning_rate": 3.224207605260993e-06, "loss": 0.5288, "num_input_tokens_seen": 126958176, "step": 104395 }, { "epoch": 13.081067535396567, "grad_norm": 4.41804838180542, "learning_rate": 3.223696544445145e-06, "loss": 0.4735, "num_input_tokens_seen": 126964352, "step": 104400 }, { "epoch": 13.08169402330535, "grad_norm": 1.679545283317566, "learning_rate": 3.223185504866501e-06, "loss": 0.4433, "num_input_tokens_seen": 126970368, "step": 104405 }, { "epoch": 13.082320511214133, "grad_norm": 1.009271264076233, "learning_rate": 3.2226744865311664e-06, "loss": 0.4708, "num_input_tokens_seen": 126976352, "step": 104410 }, { "epoch": 13.082946999122917, "grad_norm": 0.6050167679786682, "learning_rate": 3.2221634894452546e-06, "loss": 0.4538, "num_input_tokens_seen": 126982144, "step": 104415 }, { "epoch": 13.0835734870317, "grad_norm": 3.2977662086486816, "learning_rate": 3.2216525136148705e-06, "loss": 0.4362, "num_input_tokens_seen": 126987872, "step": 104420 }, { "epoch": 13.084199974940484, "grad_norm": 1.7376717329025269, "learning_rate": 3.221141559046127e-06, "loss": 0.5247, "num_input_tokens_seen": 126994016, "step": 104425 }, { "epoch": 13.084826462849268, "grad_norm": 1.5918173789978027, "learning_rate": 3.220630625745132e-06, "loss": 0.4572, "num_input_tokens_seen": 127000256, "step": 104430 }, { "epoch": 13.08545295075805, "grad_norm": 0.7181606292724609, "learning_rate": 3.220119713717994e-06, "loss": 0.4757, "num_input_tokens_seen": 127006368, "step": 104435 }, { "epoch": 13.086079438666834, "grad_norm": 7.370806694030762, "learning_rate": 3.21960882297082e-06, "loss": 0.5132, "num_input_tokens_seen": 127012160, "step": 104440 }, { "epoch": 13.086705926575616, "grad_norm": 1.0487552881240845, "learning_rate": 3.219097953509721e-06, "loss": 0.4845, "num_input_tokens_seen": 127018400, "step": 104445 }, { "epoch": 13.0873324144844, "grad_norm": 0.9067810773849487, "learning_rate": 3.2185871053408013e-06, "loss": 0.4452, "num_input_tokens_seen": 127024384, "step": 104450 }, { "epoch": 13.087958902393185, "grad_norm": 1.7073934078216553, "learning_rate": 3.2180762784701713e-06, "loss": 0.4647, "num_input_tokens_seen": 127029632, "step": 104455 }, { "epoch": 13.088585390301967, "grad_norm": 8.407686233520508, "learning_rate": 3.2175654729039383e-06, "loss": 0.4668, "num_input_tokens_seen": 127035936, "step": 104460 }, { "epoch": 13.089211878210751, "grad_norm": 3.568432569503784, "learning_rate": 3.217054688648208e-06, "loss": 0.4858, "num_input_tokens_seen": 127042400, "step": 104465 }, { "epoch": 13.089838366119533, "grad_norm": 2.8982934951782227, "learning_rate": 3.216543925709089e-06, "loss": 0.4619, "num_input_tokens_seen": 127048480, "step": 104470 }, { "epoch": 13.090464854028317, "grad_norm": 1.3116095066070557, "learning_rate": 3.216033184092685e-06, "loss": 0.4705, "num_input_tokens_seen": 127054752, "step": 104475 }, { "epoch": 13.091091341937101, "grad_norm": 16.37087059020996, "learning_rate": 3.215522463805105e-06, "loss": 0.5286, "num_input_tokens_seen": 127060800, "step": 104480 }, { "epoch": 13.091717829845884, "grad_norm": 1.0577223300933838, "learning_rate": 3.2150117648524536e-06, "loss": 0.5498, "num_input_tokens_seen": 127067200, "step": 104485 }, { "epoch": 13.092344317754668, "grad_norm": 2.3100011348724365, "learning_rate": 3.21450108724084e-06, "loss": 0.4469, "num_input_tokens_seen": 127073376, "step": 104490 }, { "epoch": 13.09297080566345, "grad_norm": 0.6653856635093689, "learning_rate": 3.2139904309763637e-06, "loss": 0.4428, "num_input_tokens_seen": 127079552, "step": 104495 }, { "epoch": 13.093597293572234, "grad_norm": 1.1265933513641357, "learning_rate": 3.2134797960651366e-06, "loss": 0.5268, "num_input_tokens_seen": 127085856, "step": 104500 }, { "epoch": 13.094223781481018, "grad_norm": 12.435467720031738, "learning_rate": 3.2129691825132577e-06, "loss": 0.4935, "num_input_tokens_seen": 127091520, "step": 104505 }, { "epoch": 13.0948502693898, "grad_norm": 0.8680897355079651, "learning_rate": 3.2124585903268366e-06, "loss": 0.4262, "num_input_tokens_seen": 127097120, "step": 104510 }, { "epoch": 13.095476757298584, "grad_norm": 2.79407000541687, "learning_rate": 3.2119480195119746e-06, "loss": 0.4254, "num_input_tokens_seen": 127103360, "step": 104515 }, { "epoch": 13.096103245207367, "grad_norm": 10.606650352478027, "learning_rate": 3.211437470074777e-06, "loss": 0.5625, "num_input_tokens_seen": 127108480, "step": 104520 }, { "epoch": 13.09672973311615, "grad_norm": 0.5824764966964722, "learning_rate": 3.21092694202135e-06, "loss": 0.6227, "num_input_tokens_seen": 127114272, "step": 104525 }, { "epoch": 13.097356221024935, "grad_norm": 1.7030885219573975, "learning_rate": 3.2104164353577937e-06, "loss": 0.4556, "num_input_tokens_seen": 127120192, "step": 104530 }, { "epoch": 13.097982708933717, "grad_norm": 9.366787910461426, "learning_rate": 3.2099059500902147e-06, "loss": 0.5244, "num_input_tokens_seen": 127126208, "step": 104535 }, { "epoch": 13.098609196842501, "grad_norm": 0.9199259281158447, "learning_rate": 3.209395486224714e-06, "loss": 0.5089, "num_input_tokens_seen": 127132384, "step": 104540 }, { "epoch": 13.099235684751285, "grad_norm": 2.7092344760894775, "learning_rate": 3.208885043767398e-06, "loss": 0.4879, "num_input_tokens_seen": 127138336, "step": 104545 }, { "epoch": 13.099862172660067, "grad_norm": 0.8799484372138977, "learning_rate": 3.2083746227243645e-06, "loss": 0.436, "num_input_tokens_seen": 127144576, "step": 104550 }, { "epoch": 13.100488660568852, "grad_norm": 1.006356954574585, "learning_rate": 3.2078642231017208e-06, "loss": 0.5581, "num_input_tokens_seen": 127150592, "step": 104555 }, { "epoch": 13.101115148477634, "grad_norm": 0.9744505286216736, "learning_rate": 3.2073538449055653e-06, "loss": 0.4757, "num_input_tokens_seen": 127156768, "step": 104560 }, { "epoch": 13.101741636386418, "grad_norm": 3.831069231033325, "learning_rate": 3.206843488142004e-06, "loss": 0.4565, "num_input_tokens_seen": 127163168, "step": 104565 }, { "epoch": 13.102368124295202, "grad_norm": 1.1481542587280273, "learning_rate": 3.2063331528171345e-06, "loss": 0.4205, "num_input_tokens_seen": 127169184, "step": 104570 }, { "epoch": 13.102994612203984, "grad_norm": 2.0266611576080322, "learning_rate": 3.2058228389370606e-06, "loss": 0.4327, "num_input_tokens_seen": 127175456, "step": 104575 }, { "epoch": 13.103621100112768, "grad_norm": 4.050867557525635, "learning_rate": 3.205312546507884e-06, "loss": 0.5335, "num_input_tokens_seen": 127181440, "step": 104580 }, { "epoch": 13.10424758802155, "grad_norm": 20.761642456054688, "learning_rate": 3.2048022755357023e-06, "loss": 0.6307, "num_input_tokens_seen": 127187296, "step": 104585 }, { "epoch": 13.104874075930335, "grad_norm": 9.640728950500488, "learning_rate": 3.204292026026621e-06, "loss": 0.4735, "num_input_tokens_seen": 127193120, "step": 104590 }, { "epoch": 13.105500563839119, "grad_norm": 1.4017068147659302, "learning_rate": 3.203781797986737e-06, "loss": 0.4686, "num_input_tokens_seen": 127199104, "step": 104595 }, { "epoch": 13.106127051747901, "grad_norm": 1.5106287002563477, "learning_rate": 3.2032715914221535e-06, "loss": 0.4818, "num_input_tokens_seen": 127205280, "step": 104600 }, { "epoch": 13.106753539656685, "grad_norm": 4.797919273376465, "learning_rate": 3.2027614063389665e-06, "loss": 0.4739, "num_input_tokens_seen": 127211008, "step": 104605 }, { "epoch": 13.107380027565467, "grad_norm": 1.250760793685913, "learning_rate": 3.2022512427432805e-06, "loss": 0.4485, "num_input_tokens_seen": 127217216, "step": 104610 }, { "epoch": 13.108006515474251, "grad_norm": 0.6731710433959961, "learning_rate": 3.20174110064119e-06, "loss": 0.6433, "num_input_tokens_seen": 127223072, "step": 104615 }, { "epoch": 13.108633003383035, "grad_norm": 7.5367560386657715, "learning_rate": 3.2012309800387993e-06, "loss": 0.4489, "num_input_tokens_seen": 127228992, "step": 104620 }, { "epoch": 13.109259491291818, "grad_norm": 2.3312489986419678, "learning_rate": 3.200720880942202e-06, "loss": 0.5346, "num_input_tokens_seen": 127235296, "step": 104625 }, { "epoch": 13.109885979200602, "grad_norm": 9.575098991394043, "learning_rate": 3.2002108033575e-06, "loss": 0.5954, "num_input_tokens_seen": 127241376, "step": 104630 }, { "epoch": 13.110512467109384, "grad_norm": 8.118914604187012, "learning_rate": 3.1997007472907905e-06, "loss": 0.6004, "num_input_tokens_seen": 127247616, "step": 104635 }, { "epoch": 13.111138955018168, "grad_norm": 1.0480856895446777, "learning_rate": 3.1991907127481723e-06, "loss": 0.4352, "num_input_tokens_seen": 127253600, "step": 104640 }, { "epoch": 13.111765442926952, "grad_norm": 1.2401047945022583, "learning_rate": 3.1986806997357446e-06, "loss": 0.5302, "num_input_tokens_seen": 127259648, "step": 104645 }, { "epoch": 13.112391930835734, "grad_norm": 10.025999069213867, "learning_rate": 3.1981707082596014e-06, "loss": 0.5412, "num_input_tokens_seen": 127265440, "step": 104650 }, { "epoch": 13.113018418744518, "grad_norm": 0.7478901147842407, "learning_rate": 3.197660738325845e-06, "loss": 0.4245, "num_input_tokens_seen": 127271520, "step": 104655 }, { "epoch": 13.1136449066533, "grad_norm": 1.041339635848999, "learning_rate": 3.1971507899405676e-06, "loss": 0.4422, "num_input_tokens_seen": 127277600, "step": 104660 }, { "epoch": 13.114271394562085, "grad_norm": 9.504798889160156, "learning_rate": 3.19664086310987e-06, "loss": 0.6865, "num_input_tokens_seen": 127284032, "step": 104665 }, { "epoch": 13.114897882470869, "grad_norm": 1.5814208984375, "learning_rate": 3.196130957839845e-06, "loss": 0.4945, "num_input_tokens_seen": 127289856, "step": 104670 }, { "epoch": 13.115524370379651, "grad_norm": 11.971513748168945, "learning_rate": 3.1956210741365933e-06, "loss": 0.4848, "num_input_tokens_seen": 127296064, "step": 104675 }, { "epoch": 13.116150858288435, "grad_norm": 0.8210788369178772, "learning_rate": 3.1951112120062068e-06, "loss": 0.457, "num_input_tokens_seen": 127301984, "step": 104680 }, { "epoch": 13.11677734619722, "grad_norm": 0.987749457359314, "learning_rate": 3.194601371454784e-06, "loss": 0.4618, "num_input_tokens_seen": 127308032, "step": 104685 }, { "epoch": 13.117403834106002, "grad_norm": 12.978161811828613, "learning_rate": 3.194091552488419e-06, "loss": 0.5132, "num_input_tokens_seen": 127314048, "step": 104690 }, { "epoch": 13.118030322014786, "grad_norm": 3.3437156677246094, "learning_rate": 3.19358175511321e-06, "loss": 0.4551, "num_input_tokens_seen": 127319712, "step": 104695 }, { "epoch": 13.118656809923568, "grad_norm": 0.6811162233352661, "learning_rate": 3.1930719793352472e-06, "loss": 0.4324, "num_input_tokens_seen": 127326016, "step": 104700 }, { "epoch": 13.119283297832352, "grad_norm": 1.1059529781341553, "learning_rate": 3.192562225160628e-06, "loss": 0.4364, "num_input_tokens_seen": 127332576, "step": 104705 }, { "epoch": 13.119909785741136, "grad_norm": 3.3953542709350586, "learning_rate": 3.19205249259545e-06, "loss": 0.5785, "num_input_tokens_seen": 127338912, "step": 104710 }, { "epoch": 13.120536273649918, "grad_norm": 1.0572190284729004, "learning_rate": 3.191542781645801e-06, "loss": 0.4294, "num_input_tokens_seen": 127345152, "step": 104715 }, { "epoch": 13.121162761558702, "grad_norm": 2.92149019241333, "learning_rate": 3.191033092317782e-06, "loss": 0.4964, "num_input_tokens_seen": 127351584, "step": 104720 }, { "epoch": 13.121789249467485, "grad_norm": 0.9616589546203613, "learning_rate": 3.1905234246174803e-06, "loss": 0.4353, "num_input_tokens_seen": 127357856, "step": 104725 }, { "epoch": 13.122415737376269, "grad_norm": 1.0513203144073486, "learning_rate": 3.190013778550995e-06, "loss": 0.4438, "num_input_tokens_seen": 127364096, "step": 104730 }, { "epoch": 13.123042225285053, "grad_norm": 7.172179222106934, "learning_rate": 3.1895041541244145e-06, "loss": 0.5184, "num_input_tokens_seen": 127370080, "step": 104735 }, { "epoch": 13.123668713193835, "grad_norm": 1.1708115339279175, "learning_rate": 3.188994551343835e-06, "loss": 0.524, "num_input_tokens_seen": 127376384, "step": 104740 }, { "epoch": 13.124295201102619, "grad_norm": 1.1809566020965576, "learning_rate": 3.188484970215347e-06, "loss": 0.4491, "num_input_tokens_seen": 127382368, "step": 104745 }, { "epoch": 13.124921689011401, "grad_norm": 7.163401126861572, "learning_rate": 3.1879754107450465e-06, "loss": 0.4904, "num_input_tokens_seen": 127388384, "step": 104750 }, { "epoch": 13.125548176920185, "grad_norm": 0.8819869756698608, "learning_rate": 3.1874658729390214e-06, "loss": 0.4249, "num_input_tokens_seen": 127394880, "step": 104755 }, { "epoch": 13.12617466482897, "grad_norm": 3.8815345764160156, "learning_rate": 3.1869563568033678e-06, "loss": 0.5774, "num_input_tokens_seen": 127400992, "step": 104760 }, { "epoch": 13.126801152737752, "grad_norm": 2.126861095428467, "learning_rate": 3.1864468623441724e-06, "loss": 0.4744, "num_input_tokens_seen": 127406848, "step": 104765 }, { "epoch": 13.127427640646536, "grad_norm": 1.0615434646606445, "learning_rate": 3.1859373895675306e-06, "loss": 0.5351, "num_input_tokens_seen": 127412960, "step": 104770 }, { "epoch": 13.128054128555318, "grad_norm": 0.8143943548202515, "learning_rate": 3.185427938479534e-06, "loss": 0.464, "num_input_tokens_seen": 127419200, "step": 104775 }, { "epoch": 13.128680616464102, "grad_norm": 3.9898760318756104, "learning_rate": 3.1849185090862698e-06, "loss": 0.4448, "num_input_tokens_seen": 127425440, "step": 104780 }, { "epoch": 13.129307104372886, "grad_norm": 1.4194400310516357, "learning_rate": 3.184409101393833e-06, "loss": 0.4658, "num_input_tokens_seen": 127431264, "step": 104785 }, { "epoch": 13.129933592281668, "grad_norm": 4.3527727127075195, "learning_rate": 3.1838997154083106e-06, "loss": 0.5133, "num_input_tokens_seen": 127437504, "step": 104790 }, { "epoch": 13.130560080190453, "grad_norm": 0.6556124687194824, "learning_rate": 3.1833903511357943e-06, "loss": 0.4666, "num_input_tokens_seen": 127443520, "step": 104795 }, { "epoch": 13.131186568099235, "grad_norm": 1.0858008861541748, "learning_rate": 3.182881008582373e-06, "loss": 0.4327, "num_input_tokens_seen": 127449792, "step": 104800 }, { "epoch": 13.131813056008019, "grad_norm": 8.020355224609375, "learning_rate": 3.182371687754139e-06, "loss": 0.5834, "num_input_tokens_seen": 127455840, "step": 104805 }, { "epoch": 13.132439543916803, "grad_norm": 6.419483661651611, "learning_rate": 3.1818623886571776e-06, "loss": 0.5181, "num_input_tokens_seen": 127462080, "step": 104810 }, { "epoch": 13.133066031825585, "grad_norm": 0.7430528998374939, "learning_rate": 3.1813531112975813e-06, "loss": 0.4476, "num_input_tokens_seen": 127468288, "step": 104815 }, { "epoch": 13.13369251973437, "grad_norm": 2.970926284790039, "learning_rate": 3.1808438556814364e-06, "loss": 0.4704, "num_input_tokens_seen": 127474304, "step": 104820 }, { "epoch": 13.134319007643153, "grad_norm": 0.6990871429443359, "learning_rate": 3.180334621814832e-06, "loss": 0.4385, "num_input_tokens_seen": 127479904, "step": 104825 }, { "epoch": 13.134945495551936, "grad_norm": 3.5851798057556152, "learning_rate": 3.1798254097038596e-06, "loss": 0.519, "num_input_tokens_seen": 127485984, "step": 104830 }, { "epoch": 13.13557198346072, "grad_norm": 0.7707346081733704, "learning_rate": 3.179316219354602e-06, "loss": 0.4606, "num_input_tokens_seen": 127491456, "step": 104835 }, { "epoch": 13.136198471369502, "grad_norm": 3.0177080631256104, "learning_rate": 3.1788070507731517e-06, "loss": 0.4678, "num_input_tokens_seen": 127497440, "step": 104840 }, { "epoch": 13.136824959278286, "grad_norm": 0.7970864772796631, "learning_rate": 3.178297903965594e-06, "loss": 0.4582, "num_input_tokens_seen": 127503616, "step": 104845 }, { "epoch": 13.13745144718707, "grad_norm": 1.1396392583847046, "learning_rate": 3.177788778938017e-06, "loss": 0.4389, "num_input_tokens_seen": 127509184, "step": 104850 }, { "epoch": 13.138077935095852, "grad_norm": 2.1131632328033447, "learning_rate": 3.177279675696505e-06, "loss": 0.4189, "num_input_tokens_seen": 127515552, "step": 104855 }, { "epoch": 13.138704423004636, "grad_norm": 4.979452610015869, "learning_rate": 3.1767705942471495e-06, "loss": 0.5339, "num_input_tokens_seen": 127521632, "step": 104860 }, { "epoch": 13.139330910913419, "grad_norm": 1.6374768018722534, "learning_rate": 3.1762615345960325e-06, "loss": 0.5382, "num_input_tokens_seen": 127527680, "step": 104865 }, { "epoch": 13.139957398822203, "grad_norm": 3.2128539085388184, "learning_rate": 3.175752496749244e-06, "loss": 0.4957, "num_input_tokens_seen": 127533184, "step": 104870 }, { "epoch": 13.140583886730987, "grad_norm": 0.8040635585784912, "learning_rate": 3.175243480712866e-06, "loss": 0.4412, "num_input_tokens_seen": 127539488, "step": 104875 }, { "epoch": 13.141210374639769, "grad_norm": 1.4957355260849, "learning_rate": 3.174734486492989e-06, "loss": 0.552, "num_input_tokens_seen": 127545824, "step": 104880 }, { "epoch": 13.141836862548553, "grad_norm": 1.7778114080429077, "learning_rate": 3.1742255140956936e-06, "loss": 0.4508, "num_input_tokens_seen": 127551936, "step": 104885 }, { "epoch": 13.142463350457335, "grad_norm": 0.6199907064437866, "learning_rate": 3.1737165635270665e-06, "loss": 0.4683, "num_input_tokens_seen": 127557664, "step": 104890 }, { "epoch": 13.14308983836612, "grad_norm": 0.9045100212097168, "learning_rate": 3.173207634793195e-06, "loss": 0.5058, "num_input_tokens_seen": 127563840, "step": 104895 }, { "epoch": 13.143716326274903, "grad_norm": 0.7379000186920166, "learning_rate": 3.172698727900162e-06, "loss": 0.4556, "num_input_tokens_seen": 127570016, "step": 104900 }, { "epoch": 13.144342814183686, "grad_norm": 1.0176182985305786, "learning_rate": 3.172189842854052e-06, "loss": 0.4826, "num_input_tokens_seen": 127576224, "step": 104905 }, { "epoch": 13.14496930209247, "grad_norm": 5.375240325927734, "learning_rate": 3.171680979660948e-06, "loss": 0.5732, "num_input_tokens_seen": 127582176, "step": 104910 }, { "epoch": 13.145595790001252, "grad_norm": 1.0342273712158203, "learning_rate": 3.171172138326938e-06, "loss": 0.4452, "num_input_tokens_seen": 127588352, "step": 104915 }, { "epoch": 13.146222277910036, "grad_norm": 2.9734373092651367, "learning_rate": 3.170663318858099e-06, "loss": 0.4587, "num_input_tokens_seen": 127594560, "step": 104920 }, { "epoch": 13.14684876581882, "grad_norm": 2.2320735454559326, "learning_rate": 3.1701545212605213e-06, "loss": 0.4465, "num_input_tokens_seen": 127600576, "step": 104925 }, { "epoch": 13.147475253727603, "grad_norm": 0.6369748711585999, "learning_rate": 3.1696457455402824e-06, "loss": 0.4646, "num_input_tokens_seen": 127606400, "step": 104930 }, { "epoch": 13.148101741636387, "grad_norm": 0.8912169337272644, "learning_rate": 3.169136991703469e-06, "loss": 0.4565, "num_input_tokens_seen": 127612608, "step": 104935 }, { "epoch": 13.14872822954517, "grad_norm": 4.055182933807373, "learning_rate": 3.1686282597561613e-06, "loss": 0.5026, "num_input_tokens_seen": 127618080, "step": 104940 }, { "epoch": 13.149354717453953, "grad_norm": 1.001121997833252, "learning_rate": 3.1681195497044424e-06, "loss": 0.522, "num_input_tokens_seen": 127624192, "step": 104945 }, { "epoch": 13.149981205362737, "grad_norm": 1.7147579193115234, "learning_rate": 3.1676108615543944e-06, "loss": 0.4542, "num_input_tokens_seen": 127630176, "step": 104950 }, { "epoch": 13.15060769327152, "grad_norm": 1.6598409414291382, "learning_rate": 3.1671021953120983e-06, "loss": 0.4295, "num_input_tokens_seen": 127636416, "step": 104955 }, { "epoch": 13.151234181180303, "grad_norm": 4.216672420501709, "learning_rate": 3.1665935509836386e-06, "loss": 0.5428, "num_input_tokens_seen": 127642368, "step": 104960 }, { "epoch": 13.151860669089087, "grad_norm": 1.6753038167953491, "learning_rate": 3.1660849285750926e-06, "loss": 0.4681, "num_input_tokens_seen": 127648480, "step": 104965 }, { "epoch": 13.15248715699787, "grad_norm": 0.8149169683456421, "learning_rate": 3.1655763280925455e-06, "loss": 0.5004, "num_input_tokens_seen": 127654400, "step": 104970 }, { "epoch": 13.153113644906654, "grad_norm": 1.1109626293182373, "learning_rate": 3.1650677495420733e-06, "loss": 0.4676, "num_input_tokens_seen": 127660672, "step": 104975 }, { "epoch": 13.153740132815436, "grad_norm": 0.9048909544944763, "learning_rate": 3.164559192929761e-06, "loss": 0.46, "num_input_tokens_seen": 127666880, "step": 104980 }, { "epoch": 13.15436662072422, "grad_norm": 3.489543914794922, "learning_rate": 3.1640506582616848e-06, "loss": 0.445, "num_input_tokens_seen": 127672864, "step": 104985 }, { "epoch": 13.154993108633004, "grad_norm": 0.7419159412384033, "learning_rate": 3.163542145543927e-06, "loss": 0.4834, "num_input_tokens_seen": 127678976, "step": 104990 }, { "epoch": 13.155619596541786, "grad_norm": 8.86344051361084, "learning_rate": 3.1630336547825673e-06, "loss": 0.5024, "num_input_tokens_seen": 127685216, "step": 104995 }, { "epoch": 13.15624608445057, "grad_norm": 0.8455169200897217, "learning_rate": 3.1625251859836848e-06, "loss": 0.4519, "num_input_tokens_seen": 127691232, "step": 105000 }, { "epoch": 13.156872572359353, "grad_norm": 4.654175281524658, "learning_rate": 3.1620167391533572e-06, "loss": 0.5438, "num_input_tokens_seen": 127697024, "step": 105005 }, { "epoch": 13.157499060268137, "grad_norm": 1.8168084621429443, "learning_rate": 3.1615083142976648e-06, "loss": 0.5239, "num_input_tokens_seen": 127703008, "step": 105010 }, { "epoch": 13.15812554817692, "grad_norm": 9.959869384765625, "learning_rate": 3.1609999114226886e-06, "loss": 0.588, "num_input_tokens_seen": 127709248, "step": 105015 }, { "epoch": 13.158752036085703, "grad_norm": 1.7199455499649048, "learning_rate": 3.1604915305345024e-06, "loss": 0.4626, "num_input_tokens_seen": 127715328, "step": 105020 }, { "epoch": 13.159378523994487, "grad_norm": 3.7213034629821777, "learning_rate": 3.1599831716391883e-06, "loss": 0.4755, "num_input_tokens_seen": 127721536, "step": 105025 }, { "epoch": 13.16000501190327, "grad_norm": 2.366159439086914, "learning_rate": 3.1594748347428208e-06, "loss": 0.4433, "num_input_tokens_seen": 127727840, "step": 105030 }, { "epoch": 13.160631499812053, "grad_norm": 1.2618483304977417, "learning_rate": 3.1589665198514807e-06, "loss": 0.4552, "num_input_tokens_seen": 127733920, "step": 105035 }, { "epoch": 13.161257987720838, "grad_norm": 0.7551782727241516, "learning_rate": 3.158458226971242e-06, "loss": 0.4704, "num_input_tokens_seen": 127740224, "step": 105040 }, { "epoch": 13.16188447562962, "grad_norm": 13.087820053100586, "learning_rate": 3.157949956108185e-06, "loss": 0.4901, "num_input_tokens_seen": 127746304, "step": 105045 }, { "epoch": 13.162510963538404, "grad_norm": 1.3663010597229004, "learning_rate": 3.157441707268384e-06, "loss": 0.4703, "num_input_tokens_seen": 127752160, "step": 105050 }, { "epoch": 13.163137451447188, "grad_norm": 1.556363582611084, "learning_rate": 3.1569334804579167e-06, "loss": 0.446, "num_input_tokens_seen": 127758528, "step": 105055 }, { "epoch": 13.16376393935597, "grad_norm": 5.658961772918701, "learning_rate": 3.1564252756828583e-06, "loss": 0.4655, "num_input_tokens_seen": 127764416, "step": 105060 }, { "epoch": 13.164390427264754, "grad_norm": 8.405777931213379, "learning_rate": 3.1559170929492883e-06, "loss": 0.546, "num_input_tokens_seen": 127770592, "step": 105065 }, { "epoch": 13.165016915173537, "grad_norm": 9.196253776550293, "learning_rate": 3.1554089322632774e-06, "loss": 0.4871, "num_input_tokens_seen": 127776800, "step": 105070 }, { "epoch": 13.16564340308232, "grad_norm": 1.1221338510513306, "learning_rate": 3.1549007936309035e-06, "loss": 0.5437, "num_input_tokens_seen": 127782688, "step": 105075 }, { "epoch": 13.166269890991105, "grad_norm": 1.0542864799499512, "learning_rate": 3.154392677058244e-06, "loss": 0.4565, "num_input_tokens_seen": 127789216, "step": 105080 }, { "epoch": 13.166896378899887, "grad_norm": 1.1984158754348755, "learning_rate": 3.15388458255137e-06, "loss": 0.4516, "num_input_tokens_seen": 127795360, "step": 105085 }, { "epoch": 13.167522866808671, "grad_norm": 0.8610321283340454, "learning_rate": 3.1533765101163593e-06, "loss": 0.4367, "num_input_tokens_seen": 127801216, "step": 105090 }, { "epoch": 13.168149354717453, "grad_norm": 5.48072624206543, "learning_rate": 3.1528684597592833e-06, "loss": 0.5016, "num_input_tokens_seen": 127807840, "step": 105095 }, { "epoch": 13.168775842626237, "grad_norm": 1.1238161325454712, "learning_rate": 3.1523604314862187e-06, "loss": 0.4324, "num_input_tokens_seen": 127814016, "step": 105100 }, { "epoch": 13.169402330535021, "grad_norm": 3.2143328189849854, "learning_rate": 3.1518524253032377e-06, "loss": 0.5003, "num_input_tokens_seen": 127819808, "step": 105105 }, { "epoch": 13.170028818443804, "grad_norm": 1.574622392654419, "learning_rate": 3.1513444412164173e-06, "loss": 0.4267, "num_input_tokens_seen": 127826144, "step": 105110 }, { "epoch": 13.170655306352588, "grad_norm": 1.6655668020248413, "learning_rate": 3.150836479231826e-06, "loss": 0.4527, "num_input_tokens_seen": 127832448, "step": 105115 }, { "epoch": 13.17128179426137, "grad_norm": 9.755475044250488, "learning_rate": 3.1503285393555404e-06, "loss": 0.5139, "num_input_tokens_seen": 127837984, "step": 105120 }, { "epoch": 13.171908282170154, "grad_norm": 8.11357307434082, "learning_rate": 3.1498206215936312e-06, "loss": 0.6218, "num_input_tokens_seen": 127844064, "step": 105125 }, { "epoch": 13.172534770078938, "grad_norm": 1.972527027130127, "learning_rate": 3.149312725952174e-06, "loss": 0.4615, "num_input_tokens_seen": 127850432, "step": 105130 }, { "epoch": 13.17316125798772, "grad_norm": 3.7099900245666504, "learning_rate": 3.148804852437237e-06, "loss": 0.6057, "num_input_tokens_seen": 127856064, "step": 105135 }, { "epoch": 13.173787745896504, "grad_norm": 3.405376434326172, "learning_rate": 3.148297001054894e-06, "loss": 0.4305, "num_input_tokens_seen": 127861888, "step": 105140 }, { "epoch": 13.174414233805287, "grad_norm": 6.0905022621154785, "learning_rate": 3.1477891718112195e-06, "loss": 0.5088, "num_input_tokens_seen": 127868384, "step": 105145 }, { "epoch": 13.17504072171407, "grad_norm": 0.8141002655029297, "learning_rate": 3.147281364712281e-06, "loss": 0.549, "num_input_tokens_seen": 127874592, "step": 105150 }, { "epoch": 13.175667209622855, "grad_norm": 1.6908992528915405, "learning_rate": 3.1467735797641525e-06, "loss": 0.4925, "num_input_tokens_seen": 127880832, "step": 105155 }, { "epoch": 13.176293697531637, "grad_norm": 2.079810380935669, "learning_rate": 3.146265816972902e-06, "loss": 0.4364, "num_input_tokens_seen": 127887232, "step": 105160 }, { "epoch": 13.176920185440421, "grad_norm": 6.999488830566406, "learning_rate": 3.1457580763446047e-06, "loss": 0.5189, "num_input_tokens_seen": 127893504, "step": 105165 }, { "epoch": 13.177546673349203, "grad_norm": 1.8449596166610718, "learning_rate": 3.1452503578853273e-06, "loss": 0.4722, "num_input_tokens_seen": 127899840, "step": 105170 }, { "epoch": 13.178173161257988, "grad_norm": 1.0201281309127808, "learning_rate": 3.1447426616011425e-06, "loss": 0.4308, "num_input_tokens_seen": 127905952, "step": 105175 }, { "epoch": 13.178799649166772, "grad_norm": 11.122435569763184, "learning_rate": 3.144234987498117e-06, "loss": 0.491, "num_input_tokens_seen": 127912224, "step": 105180 }, { "epoch": 13.179426137075554, "grad_norm": 1.5963081121444702, "learning_rate": 3.1437273355823252e-06, "loss": 0.4417, "num_input_tokens_seen": 127918432, "step": 105185 }, { "epoch": 13.180052624984338, "grad_norm": 1.0126885175704956, "learning_rate": 3.1432197058598312e-06, "loss": 0.5702, "num_input_tokens_seen": 127924640, "step": 105190 }, { "epoch": 13.180679112893122, "grad_norm": 1.9276034832000732, "learning_rate": 3.1427120983367075e-06, "loss": 0.5105, "num_input_tokens_seen": 127931072, "step": 105195 }, { "epoch": 13.181305600801904, "grad_norm": 1.6006168127059937, "learning_rate": 3.1422045130190226e-06, "loss": 0.4999, "num_input_tokens_seen": 127937184, "step": 105200 }, { "epoch": 13.181932088710688, "grad_norm": 6.203571319580078, "learning_rate": 3.141696949912843e-06, "loss": 0.4936, "num_input_tokens_seen": 127943136, "step": 105205 }, { "epoch": 13.18255857661947, "grad_norm": 6.925804615020752, "learning_rate": 3.14118940902424e-06, "loss": 0.5016, "num_input_tokens_seen": 127949440, "step": 105210 }, { "epoch": 13.183185064528255, "grad_norm": 7.019958972930908, "learning_rate": 3.14068189035928e-06, "loss": 0.5376, "num_input_tokens_seen": 127955648, "step": 105215 }, { "epoch": 13.183811552437039, "grad_norm": 1.1632869243621826, "learning_rate": 3.1401743939240327e-06, "loss": 0.453, "num_input_tokens_seen": 127961696, "step": 105220 }, { "epoch": 13.184438040345821, "grad_norm": 0.9229123592376709, "learning_rate": 3.1396669197245623e-06, "loss": 0.4357, "num_input_tokens_seen": 127968288, "step": 105225 }, { "epoch": 13.185064528254605, "grad_norm": 8.893054962158203, "learning_rate": 3.13915946776694e-06, "loss": 0.4843, "num_input_tokens_seen": 127974688, "step": 105230 }, { "epoch": 13.185691016163387, "grad_norm": 1.9134314060211182, "learning_rate": 3.1386520380572293e-06, "loss": 0.524, "num_input_tokens_seen": 127981056, "step": 105235 }, { "epoch": 13.186317504072171, "grad_norm": 7.337924480438232, "learning_rate": 3.1381446306014997e-06, "loss": 0.4781, "num_input_tokens_seen": 127987072, "step": 105240 }, { "epoch": 13.186943991980955, "grad_norm": 6.118975639343262, "learning_rate": 3.1376372454058145e-06, "loss": 0.5369, "num_input_tokens_seen": 127993280, "step": 105245 }, { "epoch": 13.187570479889738, "grad_norm": 10.889697074890137, "learning_rate": 3.137129882476243e-06, "loss": 0.4691, "num_input_tokens_seen": 127999552, "step": 105250 }, { "epoch": 13.188196967798522, "grad_norm": 11.631746292114258, "learning_rate": 3.136622541818849e-06, "loss": 0.4912, "num_input_tokens_seen": 128005824, "step": 105255 }, { "epoch": 13.188823455707304, "grad_norm": 3.6356287002563477, "learning_rate": 3.1361152234396997e-06, "loss": 0.6211, "num_input_tokens_seen": 128012128, "step": 105260 }, { "epoch": 13.189449943616088, "grad_norm": 1.580928921699524, "learning_rate": 3.1356079273448603e-06, "loss": 0.5711, "num_input_tokens_seen": 128018432, "step": 105265 }, { "epoch": 13.190076431524872, "grad_norm": 1.1057873964309692, "learning_rate": 3.1351006535403947e-06, "loss": 0.4415, "num_input_tokens_seen": 128024896, "step": 105270 }, { "epoch": 13.190702919433654, "grad_norm": 1.184104323387146, "learning_rate": 3.134593402032371e-06, "loss": 0.4594, "num_input_tokens_seen": 128030912, "step": 105275 }, { "epoch": 13.191329407342439, "grad_norm": 1.0108428001403809, "learning_rate": 3.134086172826849e-06, "loss": 0.4901, "num_input_tokens_seen": 128037056, "step": 105280 }, { "epoch": 13.19195589525122, "grad_norm": 1.2920897006988525, "learning_rate": 3.1335789659298975e-06, "loss": 0.4448, "num_input_tokens_seen": 128043136, "step": 105285 }, { "epoch": 13.192582383160005, "grad_norm": 0.7237048149108887, "learning_rate": 3.133071781347577e-06, "loss": 0.4638, "num_input_tokens_seen": 128048960, "step": 105290 }, { "epoch": 13.193208871068789, "grad_norm": 2.544624090194702, "learning_rate": 3.1325646190859547e-06, "loss": 0.4821, "num_input_tokens_seen": 128054976, "step": 105295 }, { "epoch": 13.193835358977571, "grad_norm": 2.260040044784546, "learning_rate": 3.1320574791510904e-06, "loss": 0.471, "num_input_tokens_seen": 128061184, "step": 105300 }, { "epoch": 13.194461846886355, "grad_norm": 1.14760422706604, "learning_rate": 3.131550361549051e-06, "loss": 0.4365, "num_input_tokens_seen": 128067488, "step": 105305 }, { "epoch": 13.19508833479514, "grad_norm": 2.2678122520446777, "learning_rate": 3.131043266285897e-06, "loss": 0.4547, "num_input_tokens_seen": 128073600, "step": 105310 }, { "epoch": 13.195714822703922, "grad_norm": 2.593657970428467, "learning_rate": 3.130536193367694e-06, "loss": 0.4691, "num_input_tokens_seen": 128079872, "step": 105315 }, { "epoch": 13.196341310612706, "grad_norm": 0.7199656367301941, "learning_rate": 3.1300291428004992e-06, "loss": 0.446, "num_input_tokens_seen": 128085760, "step": 105320 }, { "epoch": 13.196967798521488, "grad_norm": 1.307797908782959, "learning_rate": 3.1295221145903797e-06, "loss": 0.4511, "num_input_tokens_seen": 128091968, "step": 105325 }, { "epoch": 13.197594286430272, "grad_norm": 5.568229675292969, "learning_rate": 3.1290151087433974e-06, "loss": 0.4454, "num_input_tokens_seen": 128098048, "step": 105330 }, { "epoch": 13.198220774339056, "grad_norm": 1.2626183032989502, "learning_rate": 3.1285081252656104e-06, "loss": 0.4957, "num_input_tokens_seen": 128104128, "step": 105335 }, { "epoch": 13.198847262247838, "grad_norm": 0.6297543048858643, "learning_rate": 3.1280011641630847e-06, "loss": 0.4304, "num_input_tokens_seen": 128110272, "step": 105340 }, { "epoch": 13.199473750156622, "grad_norm": 7.981135845184326, "learning_rate": 3.127494225441876e-06, "loss": 0.531, "num_input_tokens_seen": 128115328, "step": 105345 }, { "epoch": 13.200100238065405, "grad_norm": 1.2181856632232666, "learning_rate": 3.1269873091080512e-06, "loss": 0.6137, "num_input_tokens_seen": 128121632, "step": 105350 }, { "epoch": 13.200726725974189, "grad_norm": 4.776931285858154, "learning_rate": 3.1264804151676654e-06, "loss": 0.4635, "num_input_tokens_seen": 128127872, "step": 105355 }, { "epoch": 13.201353213882973, "grad_norm": 0.9639062881469727, "learning_rate": 3.1259735436267824e-06, "loss": 0.4444, "num_input_tokens_seen": 128134048, "step": 105360 }, { "epoch": 13.201979701791755, "grad_norm": 11.08309268951416, "learning_rate": 3.12546669449146e-06, "loss": 0.5919, "num_input_tokens_seen": 128140096, "step": 105365 }, { "epoch": 13.20260618970054, "grad_norm": 12.24908447265625, "learning_rate": 3.1249598677677616e-06, "loss": 0.4963, "num_input_tokens_seen": 128146592, "step": 105370 }, { "epoch": 13.203232677609321, "grad_norm": 9.35289192199707, "learning_rate": 3.124453063461743e-06, "loss": 0.4826, "num_input_tokens_seen": 128153024, "step": 105375 }, { "epoch": 13.203859165518105, "grad_norm": 2.13521409034729, "learning_rate": 3.123946281579466e-06, "loss": 0.4663, "num_input_tokens_seen": 128159200, "step": 105380 }, { "epoch": 13.20448565342689, "grad_norm": 0.7639063000679016, "learning_rate": 3.123439522126987e-06, "loss": 0.456, "num_input_tokens_seen": 128164352, "step": 105385 }, { "epoch": 13.205112141335672, "grad_norm": 0.7908143997192383, "learning_rate": 3.1229327851103663e-06, "loss": 0.4216, "num_input_tokens_seen": 128170432, "step": 105390 }, { "epoch": 13.205738629244456, "grad_norm": 1.1899206638336182, "learning_rate": 3.122426070535664e-06, "loss": 0.5042, "num_input_tokens_seen": 128176448, "step": 105395 }, { "epoch": 13.206365117153238, "grad_norm": 7.5771484375, "learning_rate": 3.1219193784089347e-06, "loss": 0.5018, "num_input_tokens_seen": 128182528, "step": 105400 }, { "epoch": 13.206991605062022, "grad_norm": 1.298617959022522, "learning_rate": 3.1214127087362387e-06, "loss": 0.4817, "num_input_tokens_seen": 128188960, "step": 105405 }, { "epoch": 13.207618092970806, "grad_norm": 1.6579241752624512, "learning_rate": 3.120906061523634e-06, "loss": 0.4426, "num_input_tokens_seen": 128195072, "step": 105410 }, { "epoch": 13.208244580879589, "grad_norm": 3.3976433277130127, "learning_rate": 3.120399436777177e-06, "loss": 0.481, "num_input_tokens_seen": 128200992, "step": 105415 }, { "epoch": 13.208871068788373, "grad_norm": 1.0783084630966187, "learning_rate": 3.1198928345029246e-06, "loss": 0.5026, "num_input_tokens_seen": 128207360, "step": 105420 }, { "epoch": 13.209497556697155, "grad_norm": 0.9877417683601379, "learning_rate": 3.1193862547069354e-06, "loss": 0.455, "num_input_tokens_seen": 128213664, "step": 105425 }, { "epoch": 13.210124044605939, "grad_norm": 7.102928161621094, "learning_rate": 3.1188796973952623e-06, "loss": 0.5138, "num_input_tokens_seen": 128219808, "step": 105430 }, { "epoch": 13.210750532514723, "grad_norm": 9.972819328308105, "learning_rate": 3.118373162573967e-06, "loss": 0.5323, "num_input_tokens_seen": 128225856, "step": 105435 }, { "epoch": 13.211377020423505, "grad_norm": 1.849465012550354, "learning_rate": 3.1178666502490994e-06, "loss": 0.4285, "num_input_tokens_seen": 128231904, "step": 105440 }, { "epoch": 13.21200350833229, "grad_norm": 2.05625319480896, "learning_rate": 3.117360160426719e-06, "loss": 0.4624, "num_input_tokens_seen": 128238048, "step": 105445 }, { "epoch": 13.212629996241073, "grad_norm": 0.8207572102546692, "learning_rate": 3.116853693112882e-06, "loss": 0.4178, "num_input_tokens_seen": 128244384, "step": 105450 }, { "epoch": 13.213256484149856, "grad_norm": 1.551086187362671, "learning_rate": 3.1163472483136415e-06, "loss": 0.532, "num_input_tokens_seen": 128250624, "step": 105455 }, { "epoch": 13.21388297205864, "grad_norm": 4.482003211975098, "learning_rate": 3.1158408260350536e-06, "loss": 0.4992, "num_input_tokens_seen": 128256768, "step": 105460 }, { "epoch": 13.214509459967422, "grad_norm": 7.203778266906738, "learning_rate": 3.115334426283173e-06, "loss": 0.4826, "num_input_tokens_seen": 128262656, "step": 105465 }, { "epoch": 13.215135947876206, "grad_norm": 14.79084300994873, "learning_rate": 3.1148280490640537e-06, "loss": 0.6566, "num_input_tokens_seen": 128268544, "step": 105470 }, { "epoch": 13.21576243578499, "grad_norm": 8.888970375061035, "learning_rate": 3.1143216943837497e-06, "loss": 0.5265, "num_input_tokens_seen": 128274752, "step": 105475 }, { "epoch": 13.216388923693772, "grad_norm": 5.371665000915527, "learning_rate": 3.113815362248316e-06, "loss": 0.5075, "num_input_tokens_seen": 128280896, "step": 105480 }, { "epoch": 13.217015411602556, "grad_norm": 2.633237600326538, "learning_rate": 3.113309052663804e-06, "loss": 0.5602, "num_input_tokens_seen": 128286656, "step": 105485 }, { "epoch": 13.217641899511339, "grad_norm": 1.2372058629989624, "learning_rate": 3.1128027656362715e-06, "loss": 0.4467, "num_input_tokens_seen": 128293024, "step": 105490 }, { "epoch": 13.218268387420123, "grad_norm": 1.2574524879455566, "learning_rate": 3.1122965011717653e-06, "loss": 0.4465, "num_input_tokens_seen": 128299008, "step": 105495 }, { "epoch": 13.218894875328907, "grad_norm": 1.366603136062622, "learning_rate": 3.111790259276344e-06, "loss": 0.5018, "num_input_tokens_seen": 128305056, "step": 105500 }, { "epoch": 13.21952136323769, "grad_norm": 10.692458152770996, "learning_rate": 3.111284039956056e-06, "loss": 0.5324, "num_input_tokens_seen": 128310400, "step": 105505 }, { "epoch": 13.220147851146473, "grad_norm": 1.3293946981430054, "learning_rate": 3.1107778432169545e-06, "loss": 0.4189, "num_input_tokens_seen": 128316544, "step": 105510 }, { "epoch": 13.220774339055255, "grad_norm": 0.7916676998138428, "learning_rate": 3.1102716690650938e-06, "loss": 0.4313, "num_input_tokens_seen": 128322720, "step": 105515 }, { "epoch": 13.22140082696404, "grad_norm": 1.0116065740585327, "learning_rate": 3.109765517506524e-06, "loss": 0.4236, "num_input_tokens_seen": 128328608, "step": 105520 }, { "epoch": 13.222027314872824, "grad_norm": 1.7023097276687622, "learning_rate": 3.1092593885472965e-06, "loss": 0.4455, "num_input_tokens_seen": 128334688, "step": 105525 }, { "epoch": 13.222653802781606, "grad_norm": 3.9542973041534424, "learning_rate": 3.108753282193462e-06, "loss": 0.4767, "num_input_tokens_seen": 128340608, "step": 105530 }, { "epoch": 13.22328029069039, "grad_norm": 7.41102933883667, "learning_rate": 3.108247198451073e-06, "loss": 0.4747, "num_input_tokens_seen": 128346528, "step": 105535 }, { "epoch": 13.223906778599172, "grad_norm": 1.5251777172088623, "learning_rate": 3.1077411373261788e-06, "loss": 0.6021, "num_input_tokens_seen": 128352384, "step": 105540 }, { "epoch": 13.224533266507956, "grad_norm": 5.302507400512695, "learning_rate": 3.107235098824831e-06, "loss": 0.5667, "num_input_tokens_seen": 128358560, "step": 105545 }, { "epoch": 13.22515975441674, "grad_norm": 7.916860580444336, "learning_rate": 3.1067290829530777e-06, "loss": 0.4861, "num_input_tokens_seen": 128364736, "step": 105550 }, { "epoch": 13.225786242325523, "grad_norm": 0.933098316192627, "learning_rate": 3.106223089716971e-06, "loss": 0.5087, "num_input_tokens_seen": 128371040, "step": 105555 }, { "epoch": 13.226412730234307, "grad_norm": 0.8771154284477234, "learning_rate": 3.1057171191225586e-06, "loss": 0.5323, "num_input_tokens_seen": 128377056, "step": 105560 }, { "epoch": 13.22703921814309, "grad_norm": 6.508874416351318, "learning_rate": 3.1052111711758914e-06, "loss": 0.4956, "num_input_tokens_seen": 128382912, "step": 105565 }, { "epoch": 13.227665706051873, "grad_norm": 1.1941767930984497, "learning_rate": 3.104705245883017e-06, "loss": 0.4464, "num_input_tokens_seen": 128388992, "step": 105570 }, { "epoch": 13.228292193960657, "grad_norm": 1.060766577720642, "learning_rate": 3.104199343249984e-06, "loss": 0.4496, "num_input_tokens_seen": 128395488, "step": 105575 }, { "epoch": 13.22891868186944, "grad_norm": 2.486830711364746, "learning_rate": 3.1036934632828448e-06, "loss": 0.4998, "num_input_tokens_seen": 128401376, "step": 105580 }, { "epoch": 13.229545169778223, "grad_norm": 7.3489909172058105, "learning_rate": 3.103187605987642e-06, "loss": 0.5589, "num_input_tokens_seen": 128407648, "step": 105585 }, { "epoch": 13.230171657687007, "grad_norm": 1.1112526655197144, "learning_rate": 3.1026817713704283e-06, "loss": 0.4635, "num_input_tokens_seen": 128413696, "step": 105590 }, { "epoch": 13.23079814559579, "grad_norm": 5.479420185089111, "learning_rate": 3.1021759594372467e-06, "loss": 0.4491, "num_input_tokens_seen": 128419840, "step": 105595 }, { "epoch": 13.231424633504574, "grad_norm": 8.353900909423828, "learning_rate": 3.101670170194149e-06, "loss": 0.5148, "num_input_tokens_seen": 128426048, "step": 105600 }, { "epoch": 13.232051121413356, "grad_norm": 13.868706703186035, "learning_rate": 3.1011644036471787e-06, "loss": 0.6043, "num_input_tokens_seen": 128432288, "step": 105605 }, { "epoch": 13.23267760932214, "grad_norm": 9.649216651916504, "learning_rate": 3.1006586598023857e-06, "loss": 0.5029, "num_input_tokens_seen": 128438560, "step": 105610 }, { "epoch": 13.233304097230924, "grad_norm": 9.69088077545166, "learning_rate": 3.100152938665816e-06, "loss": 0.4565, "num_input_tokens_seen": 128444128, "step": 105615 }, { "epoch": 13.233930585139706, "grad_norm": 1.371082067489624, "learning_rate": 3.099647240243514e-06, "loss": 0.4515, "num_input_tokens_seen": 128450304, "step": 105620 }, { "epoch": 13.23455707304849, "grad_norm": 3.4274842739105225, "learning_rate": 3.0991415645415256e-06, "loss": 0.4858, "num_input_tokens_seen": 128456608, "step": 105625 }, { "epoch": 13.235183560957273, "grad_norm": 7.480402946472168, "learning_rate": 3.098635911565899e-06, "loss": 0.5435, "num_input_tokens_seen": 128462752, "step": 105630 }, { "epoch": 13.235810048866057, "grad_norm": 7.956171035766602, "learning_rate": 3.0981302813226802e-06, "loss": 0.4495, "num_input_tokens_seen": 128468896, "step": 105635 }, { "epoch": 13.236436536774841, "grad_norm": 8.609206199645996, "learning_rate": 3.0976246738179118e-06, "loss": 0.6041, "num_input_tokens_seen": 128474720, "step": 105640 }, { "epoch": 13.237063024683623, "grad_norm": 4.840290546417236, "learning_rate": 3.0971190890576407e-06, "loss": 0.5126, "num_input_tokens_seen": 128481120, "step": 105645 }, { "epoch": 13.237689512592407, "grad_norm": 3.959688663482666, "learning_rate": 3.096613527047909e-06, "loss": 0.5156, "num_input_tokens_seen": 128487008, "step": 105650 }, { "epoch": 13.23831600050119, "grad_norm": 6.370753765106201, "learning_rate": 3.0961079877947652e-06, "loss": 0.4707, "num_input_tokens_seen": 128493504, "step": 105655 }, { "epoch": 13.238942488409974, "grad_norm": 1.101505994796753, "learning_rate": 3.0956024713042485e-06, "loss": 0.522, "num_input_tokens_seen": 128499520, "step": 105660 }, { "epoch": 13.239568976318758, "grad_norm": 10.095322608947754, "learning_rate": 3.095096977582407e-06, "loss": 0.5485, "num_input_tokens_seen": 128505952, "step": 105665 }, { "epoch": 13.24019546422754, "grad_norm": 8.351728439331055, "learning_rate": 3.0945915066352826e-06, "loss": 0.4629, "num_input_tokens_seen": 128512288, "step": 105670 }, { "epoch": 13.240821952136324, "grad_norm": 1.2520672082901, "learning_rate": 3.0940860584689185e-06, "loss": 0.4617, "num_input_tokens_seen": 128518080, "step": 105675 }, { "epoch": 13.241448440045108, "grad_norm": 0.963474452495575, "learning_rate": 3.093580633089357e-06, "loss": 0.5051, "num_input_tokens_seen": 128523872, "step": 105680 }, { "epoch": 13.24207492795389, "grad_norm": 1.2108603715896606, "learning_rate": 3.0930752305026444e-06, "loss": 0.4586, "num_input_tokens_seen": 128529888, "step": 105685 }, { "epoch": 13.242701415862674, "grad_norm": 5.717260360717773, "learning_rate": 3.0925698507148182e-06, "loss": 0.5338, "num_input_tokens_seen": 128535872, "step": 105690 }, { "epoch": 13.243327903771457, "grad_norm": 0.9460443258285522, "learning_rate": 3.0920644937319227e-06, "loss": 0.4284, "num_input_tokens_seen": 128542080, "step": 105695 }, { "epoch": 13.24395439168024, "grad_norm": 1.224236249923706, "learning_rate": 3.0915591595600026e-06, "loss": 0.4473, "num_input_tokens_seen": 128548032, "step": 105700 }, { "epoch": 13.244580879589025, "grad_norm": 7.206700801849365, "learning_rate": 3.0910538482050957e-06, "loss": 0.509, "num_input_tokens_seen": 128554400, "step": 105705 }, { "epoch": 13.245207367497807, "grad_norm": 11.341590881347656, "learning_rate": 3.0905485596732463e-06, "loss": 0.5992, "num_input_tokens_seen": 128559872, "step": 105710 }, { "epoch": 13.245833855406591, "grad_norm": 1.251029372215271, "learning_rate": 3.0900432939704925e-06, "loss": 0.4604, "num_input_tokens_seen": 128565664, "step": 105715 }, { "epoch": 13.246460343315373, "grad_norm": 1.0123224258422852, "learning_rate": 3.0895380511028784e-06, "loss": 0.5105, "num_input_tokens_seen": 128571424, "step": 105720 }, { "epoch": 13.247086831224157, "grad_norm": 4.107227802276611, "learning_rate": 3.0890328310764415e-06, "loss": 0.5519, "num_input_tokens_seen": 128577568, "step": 105725 }, { "epoch": 13.247713319132941, "grad_norm": 1.294427752494812, "learning_rate": 3.0885276338972263e-06, "loss": 0.4995, "num_input_tokens_seen": 128583616, "step": 105730 }, { "epoch": 13.248339807041724, "grad_norm": 9.985570907592773, "learning_rate": 3.0880224595712684e-06, "loss": 0.4722, "num_input_tokens_seen": 128590048, "step": 105735 }, { "epoch": 13.248966294950508, "grad_norm": 2.8991479873657227, "learning_rate": 3.0875173081046105e-06, "loss": 0.5266, "num_input_tokens_seen": 128596384, "step": 105740 }, { "epoch": 13.24959278285929, "grad_norm": 1.2018462419509888, "learning_rate": 3.0870121795032905e-06, "loss": 0.439, "num_input_tokens_seen": 128602464, "step": 105745 }, { "epoch": 13.250219270768074, "grad_norm": 3.047365665435791, "learning_rate": 3.0865070737733493e-06, "loss": 0.4158, "num_input_tokens_seen": 128608384, "step": 105750 }, { "epoch": 13.250845758676858, "grad_norm": 6.29939079284668, "learning_rate": 3.0860019909208226e-06, "loss": 0.475, "num_input_tokens_seen": 128614432, "step": 105755 }, { "epoch": 13.25147224658564, "grad_norm": 6.886186122894287, "learning_rate": 3.0854969309517525e-06, "loss": 0.498, "num_input_tokens_seen": 128620064, "step": 105760 }, { "epoch": 13.252098734494425, "grad_norm": 3.374187707901001, "learning_rate": 3.084991893872177e-06, "loss": 0.5713, "num_input_tokens_seen": 128625984, "step": 105765 }, { "epoch": 13.252725222403207, "grad_norm": 1.1795904636383057, "learning_rate": 3.0844868796881323e-06, "loss": 0.4985, "num_input_tokens_seen": 128632416, "step": 105770 }, { "epoch": 13.25335171031199, "grad_norm": 1.2303094863891602, "learning_rate": 3.0839818884056576e-06, "loss": 0.5835, "num_input_tokens_seen": 128638816, "step": 105775 }, { "epoch": 13.253978198220775, "grad_norm": 7.670955657958984, "learning_rate": 3.08347692003079e-06, "loss": 0.5176, "num_input_tokens_seen": 128644800, "step": 105780 }, { "epoch": 13.254604686129557, "grad_norm": 3.897400379180908, "learning_rate": 3.0829719745695684e-06, "loss": 0.4459, "num_input_tokens_seen": 128651200, "step": 105785 }, { "epoch": 13.255231174038341, "grad_norm": 7.720455169677734, "learning_rate": 3.0824670520280275e-06, "loss": 0.4612, "num_input_tokens_seen": 128657408, "step": 105790 }, { "epoch": 13.255857661947124, "grad_norm": 4.0988922119140625, "learning_rate": 3.0819621524122067e-06, "loss": 0.5629, "num_input_tokens_seen": 128663264, "step": 105795 }, { "epoch": 13.256484149855908, "grad_norm": 9.722800254821777, "learning_rate": 3.081457275728139e-06, "loss": 0.4968, "num_input_tokens_seen": 128669120, "step": 105800 }, { "epoch": 13.257110637764692, "grad_norm": 8.042767524719238, "learning_rate": 3.0809524219818644e-06, "loss": 0.4829, "num_input_tokens_seen": 128675328, "step": 105805 }, { "epoch": 13.257737125673474, "grad_norm": 0.9531787037849426, "learning_rate": 3.0804475911794153e-06, "loss": 0.456, "num_input_tokens_seen": 128681408, "step": 105810 }, { "epoch": 13.258363613582258, "grad_norm": 1.5691659450531006, "learning_rate": 3.079942783326829e-06, "loss": 0.4766, "num_input_tokens_seen": 128687168, "step": 105815 }, { "epoch": 13.258990101491042, "grad_norm": 1.8688225746154785, "learning_rate": 3.0794379984301433e-06, "loss": 0.4515, "num_input_tokens_seen": 128693344, "step": 105820 }, { "epoch": 13.259616589399824, "grad_norm": 1.5830278396606445, "learning_rate": 3.0789332364953882e-06, "loss": 0.4408, "num_input_tokens_seen": 128699296, "step": 105825 }, { "epoch": 13.260243077308608, "grad_norm": 5.276926517486572, "learning_rate": 3.078428497528604e-06, "loss": 0.4609, "num_input_tokens_seen": 128705824, "step": 105830 }, { "epoch": 13.26086956521739, "grad_norm": 1.7185875177383423, "learning_rate": 3.07792378153582e-06, "loss": 0.4268, "num_input_tokens_seen": 128711904, "step": 105835 }, { "epoch": 13.261496053126175, "grad_norm": 1.7534886598587036, "learning_rate": 3.077419088523076e-06, "loss": 0.4675, "num_input_tokens_seen": 128717792, "step": 105840 }, { "epoch": 13.262122541034959, "grad_norm": 1.0096018314361572, "learning_rate": 3.0769144184964013e-06, "loss": 0.5356, "num_input_tokens_seen": 128724064, "step": 105845 }, { "epoch": 13.262749028943741, "grad_norm": 1.152771234512329, "learning_rate": 3.0764097714618336e-06, "loss": 0.4513, "num_input_tokens_seen": 128730208, "step": 105850 }, { "epoch": 13.263375516852525, "grad_norm": 2.4898483753204346, "learning_rate": 3.075905147425402e-06, "loss": 0.4498, "num_input_tokens_seen": 128736320, "step": 105855 }, { "epoch": 13.264002004761307, "grad_norm": 7.871858596801758, "learning_rate": 3.0754005463931443e-06, "loss": 0.475, "num_input_tokens_seen": 128742432, "step": 105860 }, { "epoch": 13.264628492670091, "grad_norm": 3.499262809753418, "learning_rate": 3.0748959683710887e-06, "loss": 0.504, "num_input_tokens_seen": 128748608, "step": 105865 }, { "epoch": 13.265254980578876, "grad_norm": 1.1649208068847656, "learning_rate": 3.0743914133652718e-06, "loss": 0.4363, "num_input_tokens_seen": 128754688, "step": 105870 }, { "epoch": 13.265881468487658, "grad_norm": 1.7587919235229492, "learning_rate": 3.0738868813817235e-06, "loss": 0.4579, "num_input_tokens_seen": 128760672, "step": 105875 }, { "epoch": 13.266507956396442, "grad_norm": 2.424271821975708, "learning_rate": 3.073382372426478e-06, "loss": 0.4434, "num_input_tokens_seen": 128766784, "step": 105880 }, { "epoch": 13.267134444305224, "grad_norm": 2.416121006011963, "learning_rate": 3.0728778865055655e-06, "loss": 0.5512, "num_input_tokens_seen": 128772288, "step": 105885 }, { "epoch": 13.267760932214008, "grad_norm": 1.6982951164245605, "learning_rate": 3.0723734236250174e-06, "loss": 0.4336, "num_input_tokens_seen": 128778368, "step": 105890 }, { "epoch": 13.268387420122792, "grad_norm": 2.0975944995880127, "learning_rate": 3.071868983790868e-06, "loss": 0.4169, "num_input_tokens_seen": 128784576, "step": 105895 }, { "epoch": 13.269013908031575, "grad_norm": 5.195136547088623, "learning_rate": 3.0713645670091436e-06, "loss": 0.6273, "num_input_tokens_seen": 128790688, "step": 105900 }, { "epoch": 13.269640395940359, "grad_norm": 5.858278751373291, "learning_rate": 3.0708601732858795e-06, "loss": 0.4914, "num_input_tokens_seen": 128796768, "step": 105905 }, { "epoch": 13.27026688384914, "grad_norm": 1.4067710638046265, "learning_rate": 3.070355802627102e-06, "loss": 0.4742, "num_input_tokens_seen": 128802944, "step": 105910 }, { "epoch": 13.270893371757925, "grad_norm": 0.9760613441467285, "learning_rate": 3.0698514550388447e-06, "loss": 0.5267, "num_input_tokens_seen": 128808960, "step": 105915 }, { "epoch": 13.271519859666709, "grad_norm": 1.2974079847335815, "learning_rate": 3.069347130527135e-06, "loss": 0.4217, "num_input_tokens_seen": 128815200, "step": 105920 }, { "epoch": 13.272146347575491, "grad_norm": 8.916949272155762, "learning_rate": 3.068842829098004e-06, "loss": 0.4943, "num_input_tokens_seen": 128821088, "step": 105925 }, { "epoch": 13.272772835484275, "grad_norm": 1.0119175910949707, "learning_rate": 3.0683385507574793e-06, "loss": 0.5898, "num_input_tokens_seen": 128827008, "step": 105930 }, { "epoch": 13.273399323393058, "grad_norm": 1.5136597156524658, "learning_rate": 3.0678342955115936e-06, "loss": 0.4941, "num_input_tokens_seen": 128833376, "step": 105935 }, { "epoch": 13.274025811301842, "grad_norm": 5.531595230102539, "learning_rate": 3.0673300633663715e-06, "loss": 0.4943, "num_input_tokens_seen": 128839520, "step": 105940 }, { "epoch": 13.274652299210626, "grad_norm": 9.638765335083008, "learning_rate": 3.0668258543278432e-06, "loss": 0.4587, "num_input_tokens_seen": 128845600, "step": 105945 }, { "epoch": 13.275278787119408, "grad_norm": 12.837699890136719, "learning_rate": 3.066321668402039e-06, "loss": 0.6219, "num_input_tokens_seen": 128851296, "step": 105950 }, { "epoch": 13.275905275028192, "grad_norm": 3.5795741081237793, "learning_rate": 3.0658175055949833e-06, "loss": 0.4844, "num_input_tokens_seen": 128857344, "step": 105955 }, { "epoch": 13.276531762936976, "grad_norm": 0.9549291729927063, "learning_rate": 3.065313365912708e-06, "loss": 0.5325, "num_input_tokens_seen": 128863488, "step": 105960 }, { "epoch": 13.277158250845758, "grad_norm": 7.290334701538086, "learning_rate": 3.0648092493612353e-06, "loss": 0.5759, "num_input_tokens_seen": 128869184, "step": 105965 }, { "epoch": 13.277784738754542, "grad_norm": 1.7382934093475342, "learning_rate": 3.0643051559465976e-06, "loss": 0.4442, "num_input_tokens_seen": 128875488, "step": 105970 }, { "epoch": 13.278411226663325, "grad_norm": 2.5059754848480225, "learning_rate": 3.063801085674816e-06, "loss": 0.5642, "num_input_tokens_seen": 128880928, "step": 105975 }, { "epoch": 13.279037714572109, "grad_norm": 4.880483150482178, "learning_rate": 3.0632970385519224e-06, "loss": 0.4932, "num_input_tokens_seen": 128886912, "step": 105980 }, { "epoch": 13.279664202480893, "grad_norm": 1.2939074039459229, "learning_rate": 3.0627930145839394e-06, "loss": 0.4743, "num_input_tokens_seen": 128893088, "step": 105985 }, { "epoch": 13.280290690389675, "grad_norm": 1.382222056388855, "learning_rate": 3.0622890137768967e-06, "loss": 0.482, "num_input_tokens_seen": 128899328, "step": 105990 }, { "epoch": 13.28091717829846, "grad_norm": 3.257936954498291, "learning_rate": 3.061785036136816e-06, "loss": 0.4753, "num_input_tokens_seen": 128905376, "step": 105995 }, { "epoch": 13.281543666207241, "grad_norm": 0.8379711508750916, "learning_rate": 3.0612810816697253e-06, "loss": 0.4773, "num_input_tokens_seen": 128911104, "step": 106000 }, { "epoch": 13.282170154116026, "grad_norm": 0.7543079853057861, "learning_rate": 3.060777150381651e-06, "loss": 0.4653, "num_input_tokens_seen": 128917152, "step": 106005 }, { "epoch": 13.28279664202481, "grad_norm": 0.9436494708061218, "learning_rate": 3.060273242278614e-06, "loss": 0.4689, "num_input_tokens_seen": 128923424, "step": 106010 }, { "epoch": 13.283423129933592, "grad_norm": 5.764598846435547, "learning_rate": 3.0597693573666433e-06, "loss": 0.4525, "num_input_tokens_seen": 128929824, "step": 106015 }, { "epoch": 13.284049617842376, "grad_norm": 0.7001617550849915, "learning_rate": 3.0592654956517596e-06, "loss": 0.4456, "num_input_tokens_seen": 128935712, "step": 106020 }, { "epoch": 13.284676105751158, "grad_norm": 0.7150103449821472, "learning_rate": 3.058761657139989e-06, "loss": 0.465, "num_input_tokens_seen": 128941536, "step": 106025 }, { "epoch": 13.285302593659942, "grad_norm": 1.014062762260437, "learning_rate": 3.0582578418373555e-06, "loss": 0.4741, "num_input_tokens_seen": 128947424, "step": 106030 }, { "epoch": 13.285929081568726, "grad_norm": 0.8209912180900574, "learning_rate": 3.0577540497498815e-06, "loss": 0.4725, "num_input_tokens_seen": 128953696, "step": 106035 }, { "epoch": 13.286555569477509, "grad_norm": 0.707713782787323, "learning_rate": 3.0572502808835896e-06, "loss": 0.4483, "num_input_tokens_seen": 128959360, "step": 106040 }, { "epoch": 13.287182057386293, "grad_norm": 0.8060660362243652, "learning_rate": 3.0567465352445065e-06, "loss": 0.4449, "num_input_tokens_seen": 128965248, "step": 106045 }, { "epoch": 13.287808545295075, "grad_norm": 1.473531723022461, "learning_rate": 3.0562428128386502e-06, "loss": 0.465, "num_input_tokens_seen": 128971360, "step": 106050 }, { "epoch": 13.288435033203859, "grad_norm": 2.041378974914551, "learning_rate": 3.055739113672047e-06, "loss": 0.4719, "num_input_tokens_seen": 128977440, "step": 106055 }, { "epoch": 13.289061521112643, "grad_norm": 0.6525106430053711, "learning_rate": 3.0552354377507152e-06, "loss": 0.4593, "num_input_tokens_seen": 128983424, "step": 106060 }, { "epoch": 13.289688009021425, "grad_norm": 0.9542002081871033, "learning_rate": 3.05473178508068e-06, "loss": 0.4449, "num_input_tokens_seen": 128989408, "step": 106065 }, { "epoch": 13.29031449693021, "grad_norm": 1.1576015949249268, "learning_rate": 3.054228155667963e-06, "loss": 0.47, "num_input_tokens_seen": 128995744, "step": 106070 }, { "epoch": 13.290940984838993, "grad_norm": 0.901793897151947, "learning_rate": 3.053724549518582e-06, "loss": 0.4463, "num_input_tokens_seen": 129002016, "step": 106075 }, { "epoch": 13.291567472747776, "grad_norm": 2.171862840652466, "learning_rate": 3.0532209666385615e-06, "loss": 0.4462, "num_input_tokens_seen": 129008256, "step": 106080 }, { "epoch": 13.29219396065656, "grad_norm": 1.0772552490234375, "learning_rate": 3.0527174070339217e-06, "loss": 0.4503, "num_input_tokens_seen": 129014592, "step": 106085 }, { "epoch": 13.292820448565342, "grad_norm": 2.2986645698547363, "learning_rate": 3.0522138707106817e-06, "loss": 0.4811, "num_input_tokens_seen": 129020672, "step": 106090 }, { "epoch": 13.293446936474126, "grad_norm": 5.181249618530273, "learning_rate": 3.0517103576748618e-06, "loss": 0.4858, "num_input_tokens_seen": 129026784, "step": 106095 }, { "epoch": 13.29407342438291, "grad_norm": 2.171586036682129, "learning_rate": 3.0512068679324847e-06, "loss": 0.4735, "num_input_tokens_seen": 129032960, "step": 106100 }, { "epoch": 13.294699912291692, "grad_norm": 1.2509679794311523, "learning_rate": 3.0507034014895666e-06, "loss": 0.4709, "num_input_tokens_seen": 129039104, "step": 106105 }, { "epoch": 13.295326400200477, "grad_norm": 1.0499022006988525, "learning_rate": 3.050199958352129e-06, "loss": 0.4303, "num_input_tokens_seen": 129045728, "step": 106110 }, { "epoch": 13.295952888109259, "grad_norm": 1.1238926649093628, "learning_rate": 3.049696538526189e-06, "loss": 0.4337, "num_input_tokens_seen": 129051392, "step": 106115 }, { "epoch": 13.296579376018043, "grad_norm": 1.404298186302185, "learning_rate": 3.0491931420177676e-06, "loss": 0.4618, "num_input_tokens_seen": 129057568, "step": 106120 }, { "epoch": 13.297205863926827, "grad_norm": 3.360036611557007, "learning_rate": 3.0486897688328814e-06, "loss": 0.4885, "num_input_tokens_seen": 129063904, "step": 106125 }, { "epoch": 13.29783235183561, "grad_norm": 3.628201723098755, "learning_rate": 3.048186418977549e-06, "loss": 0.5564, "num_input_tokens_seen": 129069952, "step": 106130 }, { "epoch": 13.298458839744393, "grad_norm": 0.8870489001274109, "learning_rate": 3.0476830924577905e-06, "loss": 0.4796, "num_input_tokens_seen": 129076096, "step": 106135 }, { "epoch": 13.299085327653176, "grad_norm": 3.179818868637085, "learning_rate": 3.0471797892796205e-06, "loss": 0.466, "num_input_tokens_seen": 129081920, "step": 106140 }, { "epoch": 13.29971181556196, "grad_norm": 1.416809320449829, "learning_rate": 3.0466765094490602e-06, "loss": 0.4551, "num_input_tokens_seen": 129088128, "step": 106145 }, { "epoch": 13.300338303470744, "grad_norm": 1.617437481880188, "learning_rate": 3.046173252972123e-06, "loss": 0.4754, "num_input_tokens_seen": 129094368, "step": 106150 }, { "epoch": 13.300964791379526, "grad_norm": 0.6873038411140442, "learning_rate": 3.0456700198548283e-06, "loss": 0.4658, "num_input_tokens_seen": 129100544, "step": 106155 }, { "epoch": 13.30159127928831, "grad_norm": 0.6305018067359924, "learning_rate": 3.0451668101031904e-06, "loss": 0.4834, "num_input_tokens_seen": 129106656, "step": 106160 }, { "epoch": 13.302217767197092, "grad_norm": 1.105342149734497, "learning_rate": 3.0446636237232286e-06, "loss": 0.5079, "num_input_tokens_seen": 129112800, "step": 106165 }, { "epoch": 13.302844255105876, "grad_norm": 0.9056013226509094, "learning_rate": 3.044160460720955e-06, "loss": 0.4591, "num_input_tokens_seen": 129119200, "step": 106170 }, { "epoch": 13.30347074301466, "grad_norm": 0.7015939950942993, "learning_rate": 3.0436573211023883e-06, "loss": 0.463, "num_input_tokens_seen": 129125248, "step": 106175 }, { "epoch": 13.304097230923443, "grad_norm": 1.4896034002304077, "learning_rate": 3.0431542048735428e-06, "loss": 0.4438, "num_input_tokens_seen": 129131520, "step": 106180 }, { "epoch": 13.304723718832227, "grad_norm": 0.829431414604187, "learning_rate": 3.042651112040435e-06, "loss": 0.4537, "num_input_tokens_seen": 129137504, "step": 106185 }, { "epoch": 13.30535020674101, "grad_norm": 0.7807146310806274, "learning_rate": 3.0421480426090777e-06, "loss": 0.4455, "num_input_tokens_seen": 129143616, "step": 106190 }, { "epoch": 13.305976694649793, "grad_norm": 1.1170845031738281, "learning_rate": 3.041644996585486e-06, "loss": 0.4554, "num_input_tokens_seen": 129149888, "step": 106195 }, { "epoch": 13.306603182558577, "grad_norm": 0.9396120309829712, "learning_rate": 3.041141973975677e-06, "loss": 0.4393, "num_input_tokens_seen": 129155648, "step": 106200 }, { "epoch": 13.30722967046736, "grad_norm": 0.8940368294715881, "learning_rate": 3.0406389747856606e-06, "loss": 0.4671, "num_input_tokens_seen": 129161472, "step": 106205 }, { "epoch": 13.307856158376143, "grad_norm": 0.889110803604126, "learning_rate": 3.040135999021455e-06, "loss": 0.4534, "num_input_tokens_seen": 129166912, "step": 106210 }, { "epoch": 13.308482646284927, "grad_norm": 0.8404989242553711, "learning_rate": 3.039633046689069e-06, "loss": 0.4607, "num_input_tokens_seen": 129173088, "step": 106215 }, { "epoch": 13.30910913419371, "grad_norm": 0.9051666259765625, "learning_rate": 3.03913011779452e-06, "loss": 0.4254, "num_input_tokens_seen": 129178944, "step": 106220 }, { "epoch": 13.309735622102494, "grad_norm": 1.185438632965088, "learning_rate": 3.0386272123438175e-06, "loss": 0.4326, "num_input_tokens_seen": 129185280, "step": 106225 }, { "epoch": 13.310362110011276, "grad_norm": 1.0760502815246582, "learning_rate": 3.0381243303429765e-06, "loss": 0.4681, "num_input_tokens_seen": 129191200, "step": 106230 }, { "epoch": 13.31098859792006, "grad_norm": 3.093334913253784, "learning_rate": 3.037621471798008e-06, "loss": 0.4247, "num_input_tokens_seen": 129197408, "step": 106235 }, { "epoch": 13.311615085828844, "grad_norm": 5.148753643035889, "learning_rate": 3.0371186367149254e-06, "loss": 0.5874, "num_input_tokens_seen": 129203712, "step": 106240 }, { "epoch": 13.312241573737627, "grad_norm": 1.0571645498275757, "learning_rate": 3.0366158250997385e-06, "loss": 0.4648, "num_input_tokens_seen": 129209440, "step": 106245 }, { "epoch": 13.31286806164641, "grad_norm": 1.4416742324829102, "learning_rate": 3.03611303695846e-06, "loss": 0.5113, "num_input_tokens_seen": 129215808, "step": 106250 }, { "epoch": 13.313494549555193, "grad_norm": 11.449250221252441, "learning_rate": 3.0356102722971038e-06, "loss": 0.508, "num_input_tokens_seen": 129221504, "step": 106255 }, { "epoch": 13.314121037463977, "grad_norm": 4.8045830726623535, "learning_rate": 3.035107531121676e-06, "loss": 0.4641, "num_input_tokens_seen": 129227808, "step": 106260 }, { "epoch": 13.314747525372761, "grad_norm": 1.821794033050537, "learning_rate": 3.034604813438192e-06, "loss": 0.6562, "num_input_tokens_seen": 129233952, "step": 106265 }, { "epoch": 13.315374013281543, "grad_norm": 3.6203112602233887, "learning_rate": 3.0341021192526573e-06, "loss": 0.51, "num_input_tokens_seen": 129240160, "step": 106270 }, { "epoch": 13.316000501190327, "grad_norm": 1.1752880811691284, "learning_rate": 3.0335994485710867e-06, "loss": 0.4465, "num_input_tokens_seen": 129245984, "step": 106275 }, { "epoch": 13.31662698909911, "grad_norm": 1.2263473272323608, "learning_rate": 3.0330968013994865e-06, "loss": 0.4529, "num_input_tokens_seen": 129252032, "step": 106280 }, { "epoch": 13.317253477007894, "grad_norm": 0.8928846120834351, "learning_rate": 3.0325941777438685e-06, "loss": 0.4475, "num_input_tokens_seen": 129258176, "step": 106285 }, { "epoch": 13.317879964916678, "grad_norm": 0.9169678688049316, "learning_rate": 3.03209157761024e-06, "loss": 0.465, "num_input_tokens_seen": 129264320, "step": 106290 }, { "epoch": 13.31850645282546, "grad_norm": 1.1073719263076782, "learning_rate": 3.0315890010046127e-06, "loss": 0.4386, "num_input_tokens_seen": 129269600, "step": 106295 }, { "epoch": 13.319132940734244, "grad_norm": 1.1434741020202637, "learning_rate": 3.0310864479329925e-06, "loss": 0.5554, "num_input_tokens_seen": 129275552, "step": 106300 }, { "epoch": 13.319759428643028, "grad_norm": 7.0448527336120605, "learning_rate": 3.030583918401391e-06, "loss": 0.4601, "num_input_tokens_seen": 129281632, "step": 106305 }, { "epoch": 13.32038591655181, "grad_norm": 1.2944585084915161, "learning_rate": 3.030081412415813e-06, "loss": 0.4857, "num_input_tokens_seen": 129287712, "step": 106310 }, { "epoch": 13.321012404460594, "grad_norm": 5.464768886566162, "learning_rate": 3.0295789299822676e-06, "loss": 0.4663, "num_input_tokens_seen": 129294016, "step": 106315 }, { "epoch": 13.321638892369377, "grad_norm": 5.4267120361328125, "learning_rate": 3.0290764711067645e-06, "loss": 0.4763, "num_input_tokens_seen": 129299648, "step": 106320 }, { "epoch": 13.32226538027816, "grad_norm": 1.3022348880767822, "learning_rate": 3.0285740357953077e-06, "loss": 0.4338, "num_input_tokens_seen": 129305824, "step": 106325 }, { "epoch": 13.322891868186945, "grad_norm": 5.079034805297852, "learning_rate": 3.0280716240539076e-06, "loss": 0.5095, "num_input_tokens_seen": 129312096, "step": 106330 }, { "epoch": 13.323518356095727, "grad_norm": 2.272773504257202, "learning_rate": 3.0275692358885667e-06, "loss": 0.4425, "num_input_tokens_seen": 129318304, "step": 106335 }, { "epoch": 13.324144844004511, "grad_norm": 0.867620050907135, "learning_rate": 3.0270668713052952e-06, "loss": 0.4704, "num_input_tokens_seen": 129323456, "step": 106340 }, { "epoch": 13.324771331913293, "grad_norm": 1.009297490119934, "learning_rate": 3.0265645303100973e-06, "loss": 0.4935, "num_input_tokens_seen": 129329312, "step": 106345 }, { "epoch": 13.325397819822077, "grad_norm": 3.878084182739258, "learning_rate": 3.026062212908981e-06, "loss": 0.4527, "num_input_tokens_seen": 129335040, "step": 106350 }, { "epoch": 13.326024307730862, "grad_norm": 3.4381167888641357, "learning_rate": 3.025559919107949e-06, "loss": 0.4503, "num_input_tokens_seen": 129340608, "step": 106355 }, { "epoch": 13.326650795639644, "grad_norm": 0.7249317765235901, "learning_rate": 3.02505764891301e-06, "loss": 0.4958, "num_input_tokens_seen": 129346304, "step": 106360 }, { "epoch": 13.327277283548428, "grad_norm": 3.2373383045196533, "learning_rate": 3.024555402330166e-06, "loss": 0.4938, "num_input_tokens_seen": 129352192, "step": 106365 }, { "epoch": 13.32790377145721, "grad_norm": 1.9225841760635376, "learning_rate": 3.0240531793654245e-06, "loss": 0.4619, "num_input_tokens_seen": 129358368, "step": 106370 }, { "epoch": 13.328530259365994, "grad_norm": 6.511632442474365, "learning_rate": 3.0235509800247865e-06, "loss": 0.4984, "num_input_tokens_seen": 129364448, "step": 106375 }, { "epoch": 13.329156747274778, "grad_norm": 1.0587453842163086, "learning_rate": 3.0230488043142582e-06, "loss": 0.4866, "num_input_tokens_seen": 129370208, "step": 106380 }, { "epoch": 13.32978323518356, "grad_norm": 0.7698457837104797, "learning_rate": 3.022546652239845e-06, "loss": 0.4748, "num_input_tokens_seen": 129376608, "step": 106385 }, { "epoch": 13.330409723092345, "grad_norm": 7.514752388000488, "learning_rate": 3.022044523807548e-06, "loss": 0.5103, "num_input_tokens_seen": 129382784, "step": 106390 }, { "epoch": 13.331036211001127, "grad_norm": 0.8845035433769226, "learning_rate": 3.021542419023372e-06, "loss": 0.4767, "num_input_tokens_seen": 129388768, "step": 106395 }, { "epoch": 13.331662698909911, "grad_norm": 0.9274584054946899, "learning_rate": 3.0210403378933195e-06, "loss": 0.422, "num_input_tokens_seen": 129394976, "step": 106400 }, { "epoch": 13.332289186818695, "grad_norm": 0.8442849516868591, "learning_rate": 3.020538280423395e-06, "loss": 0.4923, "num_input_tokens_seen": 129400928, "step": 106405 }, { "epoch": 13.332915674727477, "grad_norm": 0.8674041628837585, "learning_rate": 3.0200362466195977e-06, "loss": 0.408, "num_input_tokens_seen": 129406528, "step": 106410 }, { "epoch": 13.333542162636261, "grad_norm": 1.3904154300689697, "learning_rate": 3.0195342364879343e-06, "loss": 0.4677, "num_input_tokens_seen": 129412768, "step": 106415 }, { "epoch": 13.334168650545044, "grad_norm": 1.2084912061691284, "learning_rate": 3.0190322500344015e-06, "loss": 0.5217, "num_input_tokens_seen": 129418912, "step": 106420 }, { "epoch": 13.334795138453828, "grad_norm": 3.5103859901428223, "learning_rate": 3.0185302872650055e-06, "loss": 0.542, "num_input_tokens_seen": 129424448, "step": 106425 }, { "epoch": 13.335421626362612, "grad_norm": 7.827134132385254, "learning_rate": 3.0180283481857447e-06, "loss": 0.5062, "num_input_tokens_seen": 129430560, "step": 106430 }, { "epoch": 13.336048114271394, "grad_norm": 10.835250854492188, "learning_rate": 3.017526432802621e-06, "loss": 0.4834, "num_input_tokens_seen": 129436896, "step": 106435 }, { "epoch": 13.336674602180178, "grad_norm": 1.3934143781661987, "learning_rate": 3.017024541121637e-06, "loss": 0.4745, "num_input_tokens_seen": 129443008, "step": 106440 }, { "epoch": 13.337301090088962, "grad_norm": 1.0438213348388672, "learning_rate": 3.0165226731487905e-06, "loss": 0.4508, "num_input_tokens_seen": 129449408, "step": 106445 }, { "epoch": 13.337927577997744, "grad_norm": 0.9468353390693665, "learning_rate": 3.016020828890084e-06, "loss": 0.5048, "num_input_tokens_seen": 129455712, "step": 106450 }, { "epoch": 13.338554065906528, "grad_norm": 1.2777506113052368, "learning_rate": 3.015519008351515e-06, "loss": 0.4843, "num_input_tokens_seen": 129461824, "step": 106455 }, { "epoch": 13.33918055381531, "grad_norm": 0.7342795729637146, "learning_rate": 3.0150172115390876e-06, "loss": 0.4273, "num_input_tokens_seen": 129467744, "step": 106460 }, { "epoch": 13.339807041724095, "grad_norm": 0.9063032865524292, "learning_rate": 3.0145154384587954e-06, "loss": 0.4119, "num_input_tokens_seen": 129473792, "step": 106465 }, { "epoch": 13.340433529632879, "grad_norm": 1.1634162664413452, "learning_rate": 3.0140136891166426e-06, "loss": 0.4873, "num_input_tokens_seen": 129479840, "step": 106470 }, { "epoch": 13.341060017541661, "grad_norm": 8.896405220031738, "learning_rate": 3.013511963518624e-06, "loss": 0.4954, "num_input_tokens_seen": 129485856, "step": 106475 }, { "epoch": 13.341686505450445, "grad_norm": 9.525721549987793, "learning_rate": 3.0130102616707426e-06, "loss": 0.529, "num_input_tokens_seen": 129491904, "step": 106480 }, { "epoch": 13.342312993359227, "grad_norm": 6.471750259399414, "learning_rate": 3.0125085835789913e-06, "loss": 0.5161, "num_input_tokens_seen": 129498112, "step": 106485 }, { "epoch": 13.342939481268012, "grad_norm": 0.9802844524383545, "learning_rate": 3.0120069292493726e-06, "loss": 0.5173, "num_input_tokens_seen": 129504192, "step": 106490 }, { "epoch": 13.343565969176796, "grad_norm": 3.3342955112457275, "learning_rate": 3.0115052986878812e-06, "loss": 0.5039, "num_input_tokens_seen": 129510432, "step": 106495 }, { "epoch": 13.344192457085578, "grad_norm": 3.6586718559265137, "learning_rate": 3.011003691900516e-06, "loss": 0.4792, "num_input_tokens_seen": 129516736, "step": 106500 }, { "epoch": 13.344818944994362, "grad_norm": 2.0364677906036377, "learning_rate": 3.0105021088932753e-06, "loss": 0.4316, "num_input_tokens_seen": 129523104, "step": 106505 }, { "epoch": 13.345445432903144, "grad_norm": 1.1712106466293335, "learning_rate": 3.0100005496721526e-06, "loss": 0.4652, "num_input_tokens_seen": 129528960, "step": 106510 }, { "epoch": 13.346071920811928, "grad_norm": 2.670868396759033, "learning_rate": 3.0094990142431486e-06, "loss": 0.4471, "num_input_tokens_seen": 129535040, "step": 106515 }, { "epoch": 13.346698408720712, "grad_norm": 1.248090386390686, "learning_rate": 3.0089975026122553e-06, "loss": 0.4631, "num_input_tokens_seen": 129541344, "step": 106520 }, { "epoch": 13.347324896629495, "grad_norm": 2.3126626014709473, "learning_rate": 3.008496014785473e-06, "loss": 0.4413, "num_input_tokens_seen": 129547040, "step": 106525 }, { "epoch": 13.347951384538279, "grad_norm": 0.7904645800590515, "learning_rate": 3.007994550768793e-06, "loss": 0.4609, "num_input_tokens_seen": 129552608, "step": 106530 }, { "epoch": 13.348577872447061, "grad_norm": 1.1628577709197998, "learning_rate": 3.007493110568215e-06, "loss": 0.4674, "num_input_tokens_seen": 129558912, "step": 106535 }, { "epoch": 13.349204360355845, "grad_norm": 5.323347568511963, "learning_rate": 3.00699169418973e-06, "loss": 0.5033, "num_input_tokens_seen": 129565408, "step": 106540 }, { "epoch": 13.349830848264629, "grad_norm": 1.3098326921463013, "learning_rate": 3.0064903016393364e-06, "loss": 0.507, "num_input_tokens_seen": 129570304, "step": 106545 }, { "epoch": 13.350457336173411, "grad_norm": 3.205475091934204, "learning_rate": 3.0059889329230264e-06, "loss": 0.5144, "num_input_tokens_seen": 129576224, "step": 106550 }, { "epoch": 13.351083824082195, "grad_norm": 3.2393875122070312, "learning_rate": 3.0054875880467966e-06, "loss": 0.4603, "num_input_tokens_seen": 129582336, "step": 106555 }, { "epoch": 13.351710311990978, "grad_norm": 0.9448840618133545, "learning_rate": 3.0049862670166373e-06, "loss": 0.4631, "num_input_tokens_seen": 129588608, "step": 106560 }, { "epoch": 13.352336799899762, "grad_norm": 0.8727447390556335, "learning_rate": 3.0044849698385446e-06, "loss": 0.4686, "num_input_tokens_seen": 129594752, "step": 106565 }, { "epoch": 13.352963287808546, "grad_norm": 0.6790705919265747, "learning_rate": 3.0039836965185147e-06, "loss": 0.4784, "num_input_tokens_seen": 129600832, "step": 106570 }, { "epoch": 13.353589775717328, "grad_norm": 0.9548208713531494, "learning_rate": 3.0034824470625357e-06, "loss": 0.5015, "num_input_tokens_seen": 129606784, "step": 106575 }, { "epoch": 13.354216263626112, "grad_norm": 1.007265329360962, "learning_rate": 3.002981221476604e-06, "loss": 0.4662, "num_input_tokens_seen": 129613088, "step": 106580 }, { "epoch": 13.354842751534896, "grad_norm": 9.677342414855957, "learning_rate": 3.0024800197667093e-06, "loss": 0.5143, "num_input_tokens_seen": 129618336, "step": 106585 }, { "epoch": 13.355469239443678, "grad_norm": 1.146545171737671, "learning_rate": 3.0019788419388467e-06, "loss": 0.4764, "num_input_tokens_seen": 129623968, "step": 106590 }, { "epoch": 13.356095727352463, "grad_norm": 2.551849126815796, "learning_rate": 3.0014776879990055e-06, "loss": 0.4383, "num_input_tokens_seen": 129629984, "step": 106595 }, { "epoch": 13.356722215261245, "grad_norm": 9.353489875793457, "learning_rate": 3.00097655795318e-06, "loss": 0.5176, "num_input_tokens_seen": 129635744, "step": 106600 }, { "epoch": 13.357348703170029, "grad_norm": 2.5559356212615967, "learning_rate": 3.0004754518073587e-06, "loss": 0.4807, "num_input_tokens_seen": 129642016, "step": 106605 }, { "epoch": 13.357975191078813, "grad_norm": 1.0936638116836548, "learning_rate": 2.9999743695675364e-06, "loss": 0.4776, "num_input_tokens_seen": 129648288, "step": 106610 }, { "epoch": 13.358601678987595, "grad_norm": 1.0549720525741577, "learning_rate": 2.999473311239701e-06, "loss": 0.4933, "num_input_tokens_seen": 129654272, "step": 106615 }, { "epoch": 13.35922816689638, "grad_norm": 1.0329504013061523, "learning_rate": 2.9989722768298435e-06, "loss": 0.4918, "num_input_tokens_seen": 129660160, "step": 106620 }, { "epoch": 13.359854654805162, "grad_norm": 0.9284854531288147, "learning_rate": 2.9984712663439567e-06, "loss": 0.4426, "num_input_tokens_seen": 129666112, "step": 106625 }, { "epoch": 13.360481142713946, "grad_norm": 1.0258584022521973, "learning_rate": 2.997970279788027e-06, "loss": 0.5116, "num_input_tokens_seen": 129672160, "step": 106630 }, { "epoch": 13.36110763062273, "grad_norm": 7.829309463500977, "learning_rate": 2.997469317168048e-06, "loss": 0.4996, "num_input_tokens_seen": 129678272, "step": 106635 }, { "epoch": 13.361734118531512, "grad_norm": 0.8403858542442322, "learning_rate": 2.9969683784900045e-06, "loss": 0.4644, "num_input_tokens_seen": 129684384, "step": 106640 }, { "epoch": 13.362360606440296, "grad_norm": 1.5682737827301025, "learning_rate": 2.9964674637598888e-06, "loss": 0.4571, "num_input_tokens_seen": 129690528, "step": 106645 }, { "epoch": 13.362987094349078, "grad_norm": 4.756556987762451, "learning_rate": 2.99596657298369e-06, "loss": 0.5118, "num_input_tokens_seen": 129696672, "step": 106650 }, { "epoch": 13.363613582257862, "grad_norm": 1.2023578882217407, "learning_rate": 2.995465706167395e-06, "loss": 0.466, "num_input_tokens_seen": 129702496, "step": 106655 }, { "epoch": 13.364240070166646, "grad_norm": 0.8243162035942078, "learning_rate": 2.9949648633169926e-06, "loss": 0.4356, "num_input_tokens_seen": 129708192, "step": 106660 }, { "epoch": 13.364866558075429, "grad_norm": 1.0314496755599976, "learning_rate": 2.994464044438472e-06, "loss": 0.4355, "num_input_tokens_seen": 129714592, "step": 106665 }, { "epoch": 13.365493045984213, "grad_norm": 1.342288851737976, "learning_rate": 2.9939632495378185e-06, "loss": 0.4641, "num_input_tokens_seen": 129720960, "step": 106670 }, { "epoch": 13.366119533892995, "grad_norm": 1.0844182968139648, "learning_rate": 2.9934624786210224e-06, "loss": 0.4999, "num_input_tokens_seen": 129726912, "step": 106675 }, { "epoch": 13.366746021801779, "grad_norm": 4.108333587646484, "learning_rate": 2.9929617316940683e-06, "loss": 0.4529, "num_input_tokens_seen": 129733056, "step": 106680 }, { "epoch": 13.367372509710563, "grad_norm": 1.112529993057251, "learning_rate": 2.9924610087629435e-06, "loss": 0.4647, "num_input_tokens_seen": 129739200, "step": 106685 }, { "epoch": 13.367998997619345, "grad_norm": 1.1353076696395874, "learning_rate": 2.991960309833637e-06, "loss": 0.4555, "num_input_tokens_seen": 129745504, "step": 106690 }, { "epoch": 13.36862548552813, "grad_norm": 3.8053297996520996, "learning_rate": 2.9914596349121316e-06, "loss": 0.4637, "num_input_tokens_seen": 129751456, "step": 106695 }, { "epoch": 13.369251973436914, "grad_norm": 9.175069808959961, "learning_rate": 2.990958984004416e-06, "loss": 0.4713, "num_input_tokens_seen": 129757408, "step": 106700 }, { "epoch": 13.369878461345696, "grad_norm": 0.943804919719696, "learning_rate": 2.9904583571164747e-06, "loss": 0.4339, "num_input_tokens_seen": 129763040, "step": 106705 }, { "epoch": 13.37050494925448, "grad_norm": 2.2725720405578613, "learning_rate": 2.9899577542542933e-06, "loss": 0.4852, "num_input_tokens_seen": 129769152, "step": 106710 }, { "epoch": 13.371131437163262, "grad_norm": 5.622493267059326, "learning_rate": 2.9894571754238554e-06, "loss": 0.5358, "num_input_tokens_seen": 129775456, "step": 106715 }, { "epoch": 13.371757925072046, "grad_norm": 1.4269047975540161, "learning_rate": 2.9889566206311504e-06, "loss": 0.4427, "num_input_tokens_seen": 129781600, "step": 106720 }, { "epoch": 13.37238441298083, "grad_norm": 1.3353784084320068, "learning_rate": 2.988456089882157e-06, "loss": 0.444, "num_input_tokens_seen": 129787968, "step": 106725 }, { "epoch": 13.373010900889613, "grad_norm": 1.1557207107543945, "learning_rate": 2.987955583182864e-06, "loss": 0.4549, "num_input_tokens_seen": 129793920, "step": 106730 }, { "epoch": 13.373637388798397, "grad_norm": 1.1045292615890503, "learning_rate": 2.987455100539253e-06, "loss": 0.5853, "num_input_tokens_seen": 129800000, "step": 106735 }, { "epoch": 13.374263876707179, "grad_norm": 0.8847702145576477, "learning_rate": 2.986954641957309e-06, "loss": 0.4434, "num_input_tokens_seen": 129806016, "step": 106740 }, { "epoch": 13.374890364615963, "grad_norm": 3.47802734375, "learning_rate": 2.9864542074430136e-06, "loss": 0.4588, "num_input_tokens_seen": 129811872, "step": 106745 }, { "epoch": 13.375516852524747, "grad_norm": 6.432276725769043, "learning_rate": 2.98595379700235e-06, "loss": 0.5007, "num_input_tokens_seen": 129817888, "step": 106750 }, { "epoch": 13.37614334043353, "grad_norm": 6.044256210327148, "learning_rate": 2.985453410641304e-06, "loss": 0.453, "num_input_tokens_seen": 129824192, "step": 106755 }, { "epoch": 13.376769828342313, "grad_norm": 9.642497062683105, "learning_rate": 2.9849530483658553e-06, "loss": 0.4848, "num_input_tokens_seen": 129830144, "step": 106760 }, { "epoch": 13.377396316251096, "grad_norm": 12.122795104980469, "learning_rate": 2.9844527101819886e-06, "loss": 0.5076, "num_input_tokens_seen": 129836384, "step": 106765 }, { "epoch": 13.37802280415988, "grad_norm": 4.888241291046143, "learning_rate": 2.9839523960956824e-06, "loss": 0.5623, "num_input_tokens_seen": 129842272, "step": 106770 }, { "epoch": 13.378649292068664, "grad_norm": 1.761087417602539, "learning_rate": 2.9834521061129218e-06, "loss": 0.5069, "num_input_tokens_seen": 129848384, "step": 106775 }, { "epoch": 13.379275779977446, "grad_norm": 1.0163496732711792, "learning_rate": 2.9829518402396856e-06, "loss": 0.4405, "num_input_tokens_seen": 129854528, "step": 106780 }, { "epoch": 13.37990226788623, "grad_norm": 6.866768836975098, "learning_rate": 2.9824515984819573e-06, "loss": 0.5003, "num_input_tokens_seen": 129860416, "step": 106785 }, { "epoch": 13.380528755795012, "grad_norm": 1.088019847869873, "learning_rate": 2.9819513808457148e-06, "loss": 0.4361, "num_input_tokens_seen": 129866720, "step": 106790 }, { "epoch": 13.381155243703796, "grad_norm": 3.5251190662384033, "learning_rate": 2.981451187336941e-06, "loss": 0.4519, "num_input_tokens_seen": 129872928, "step": 106795 }, { "epoch": 13.38178173161258, "grad_norm": 2.123866558074951, "learning_rate": 2.9809510179616158e-06, "loss": 0.4931, "num_input_tokens_seen": 129878944, "step": 106800 }, { "epoch": 13.382408219521363, "grad_norm": 6.245803356170654, "learning_rate": 2.9804508727257177e-06, "loss": 0.4861, "num_input_tokens_seen": 129885152, "step": 106805 }, { "epoch": 13.383034707430147, "grad_norm": 1.1372054815292358, "learning_rate": 2.9799507516352277e-06, "loss": 0.43, "num_input_tokens_seen": 129891136, "step": 106810 }, { "epoch": 13.38366119533893, "grad_norm": 4.725252151489258, "learning_rate": 2.9794506546961242e-06, "loss": 0.4731, "num_input_tokens_seen": 129897376, "step": 106815 }, { "epoch": 13.384287683247713, "grad_norm": 0.9321286678314209, "learning_rate": 2.978950581914389e-06, "loss": 0.4719, "num_input_tokens_seen": 129903200, "step": 106820 }, { "epoch": 13.384914171156497, "grad_norm": 1.1361668109893799, "learning_rate": 2.978450533295997e-06, "loss": 0.4521, "num_input_tokens_seen": 129909472, "step": 106825 }, { "epoch": 13.38554065906528, "grad_norm": 8.870320320129395, "learning_rate": 2.9779505088469298e-06, "loss": 0.5309, "num_input_tokens_seen": 129915744, "step": 106830 }, { "epoch": 13.386167146974064, "grad_norm": 1.1285173892974854, "learning_rate": 2.9774505085731625e-06, "loss": 0.5174, "num_input_tokens_seen": 129922208, "step": 106835 }, { "epoch": 13.386793634882848, "grad_norm": 1.036436676979065, "learning_rate": 2.976950532480677e-06, "loss": 0.4534, "num_input_tokens_seen": 129927616, "step": 106840 }, { "epoch": 13.38742012279163, "grad_norm": 0.5964645147323608, "learning_rate": 2.9764505805754467e-06, "loss": 0.4792, "num_input_tokens_seen": 129933760, "step": 106845 }, { "epoch": 13.388046610700414, "grad_norm": 3.1439731121063232, "learning_rate": 2.9759506528634524e-06, "loss": 0.5234, "num_input_tokens_seen": 129940192, "step": 106850 }, { "epoch": 13.388673098609196, "grad_norm": 0.9037541747093201, "learning_rate": 2.975450749350669e-06, "loss": 0.4738, "num_input_tokens_seen": 129946368, "step": 106855 }, { "epoch": 13.38929958651798, "grad_norm": 2.20229172706604, "learning_rate": 2.974950870043075e-06, "loss": 0.4595, "num_input_tokens_seen": 129952128, "step": 106860 }, { "epoch": 13.389926074426764, "grad_norm": 0.9340308308601379, "learning_rate": 2.9744510149466445e-06, "loss": 0.4761, "num_input_tokens_seen": 129958080, "step": 106865 }, { "epoch": 13.390552562335547, "grad_norm": 0.9507232308387756, "learning_rate": 2.973951184067355e-06, "loss": 0.4893, "num_input_tokens_seen": 129964288, "step": 106870 }, { "epoch": 13.39117905024433, "grad_norm": 0.8582973480224609, "learning_rate": 2.9734513774111846e-06, "loss": 0.4508, "num_input_tokens_seen": 129970336, "step": 106875 }, { "epoch": 13.391805538153113, "grad_norm": 8.700409889221191, "learning_rate": 2.9729515949841046e-06, "loss": 0.4664, "num_input_tokens_seen": 129976736, "step": 106880 }, { "epoch": 13.392432026061897, "grad_norm": 0.9379875659942627, "learning_rate": 2.9724518367920945e-06, "loss": 0.4364, "num_input_tokens_seen": 129982560, "step": 106885 }, { "epoch": 13.393058513970681, "grad_norm": 4.380315780639648, "learning_rate": 2.971952102841126e-06, "loss": 0.5183, "num_input_tokens_seen": 129988736, "step": 106890 }, { "epoch": 13.393685001879463, "grad_norm": 1.3581565618515015, "learning_rate": 2.9714523931371764e-06, "loss": 0.4265, "num_input_tokens_seen": 129994080, "step": 106895 }, { "epoch": 13.394311489788247, "grad_norm": 1.9125761985778809, "learning_rate": 2.9709527076862175e-06, "loss": 0.5047, "num_input_tokens_seen": 129999776, "step": 106900 }, { "epoch": 13.39493797769703, "grad_norm": 0.9256846308708191, "learning_rate": 2.9704530464942254e-06, "loss": 0.4288, "num_input_tokens_seen": 130006048, "step": 106905 }, { "epoch": 13.395564465605814, "grad_norm": 1.602099895477295, "learning_rate": 2.9699534095671735e-06, "loss": 0.4852, "num_input_tokens_seen": 130012384, "step": 106910 }, { "epoch": 13.396190953514598, "grad_norm": 0.8194280862808228, "learning_rate": 2.969453796911036e-06, "loss": 0.4802, "num_input_tokens_seen": 130018208, "step": 106915 }, { "epoch": 13.39681744142338, "grad_norm": 1.176868200302124, "learning_rate": 2.9689542085317844e-06, "loss": 0.4306, "num_input_tokens_seen": 130024128, "step": 106920 }, { "epoch": 13.397443929332164, "grad_norm": 0.9147713780403137, "learning_rate": 2.9684546444353947e-06, "loss": 0.4806, "num_input_tokens_seen": 130029920, "step": 106925 }, { "epoch": 13.398070417240948, "grad_norm": 0.8359001278877258, "learning_rate": 2.9679551046278356e-06, "loss": 0.4454, "num_input_tokens_seen": 130036096, "step": 106930 }, { "epoch": 13.39869690514973, "grad_norm": 0.7703829407691956, "learning_rate": 2.9674555891150814e-06, "loss": 0.5516, "num_input_tokens_seen": 130042304, "step": 106935 }, { "epoch": 13.399323393058514, "grad_norm": 4.991776466369629, "learning_rate": 2.9669560979031074e-06, "loss": 0.5069, "num_input_tokens_seen": 130048512, "step": 106940 }, { "epoch": 13.399949880967297, "grad_norm": 5.291804313659668, "learning_rate": 2.9664566309978797e-06, "loss": 0.4824, "num_input_tokens_seen": 130054176, "step": 106945 }, { "epoch": 13.40057636887608, "grad_norm": 1.6966218948364258, "learning_rate": 2.965957188405375e-06, "loss": 0.4936, "num_input_tokens_seen": 130060352, "step": 106950 }, { "epoch": 13.401202856784865, "grad_norm": 1.0889102220535278, "learning_rate": 2.96545777013156e-06, "loss": 0.4877, "num_input_tokens_seen": 130066752, "step": 106955 }, { "epoch": 13.401829344693647, "grad_norm": 1.7618212699890137, "learning_rate": 2.9649583761824096e-06, "loss": 0.4075, "num_input_tokens_seen": 130072800, "step": 106960 }, { "epoch": 13.402455832602431, "grad_norm": 8.748747825622559, "learning_rate": 2.9644590065638913e-06, "loss": 0.4677, "num_input_tokens_seen": 130078720, "step": 106965 }, { "epoch": 13.403082320511214, "grad_norm": 1.5100191831588745, "learning_rate": 2.9639596612819786e-06, "loss": 0.4874, "num_input_tokens_seen": 130084768, "step": 106970 }, { "epoch": 13.403708808419998, "grad_norm": 5.5937700271606445, "learning_rate": 2.9634603403426386e-06, "loss": 0.4906, "num_input_tokens_seen": 130090752, "step": 106975 }, { "epoch": 13.404335296328782, "grad_norm": 1.2303502559661865, "learning_rate": 2.9629610437518443e-06, "loss": 0.4301, "num_input_tokens_seen": 130096352, "step": 106980 }, { "epoch": 13.404961784237564, "grad_norm": 1.3420907258987427, "learning_rate": 2.9624617715155614e-06, "loss": 0.4501, "num_input_tokens_seen": 130102304, "step": 106985 }, { "epoch": 13.405588272146348, "grad_norm": 0.8748968839645386, "learning_rate": 2.961962523639763e-06, "loss": 0.4579, "num_input_tokens_seen": 130108352, "step": 106990 }, { "epoch": 13.40621476005513, "grad_norm": 1.8184884786605835, "learning_rate": 2.9614633001304136e-06, "loss": 0.5397, "num_input_tokens_seen": 130114368, "step": 106995 }, { "epoch": 13.406841247963914, "grad_norm": 7.861583709716797, "learning_rate": 2.9609641009934853e-06, "loss": 0.5397, "num_input_tokens_seen": 130120320, "step": 107000 }, { "epoch": 13.407467735872698, "grad_norm": 5.7377471923828125, "learning_rate": 2.9604649262349474e-06, "loss": 0.5799, "num_input_tokens_seen": 130126016, "step": 107005 }, { "epoch": 13.40809422378148, "grad_norm": 1.131884217262268, "learning_rate": 2.959965775860763e-06, "loss": 0.4879, "num_input_tokens_seen": 130131904, "step": 107010 }, { "epoch": 13.408720711690265, "grad_norm": 4.175921440124512, "learning_rate": 2.9594666498769047e-06, "loss": 0.4977, "num_input_tokens_seen": 130137728, "step": 107015 }, { "epoch": 13.409347199599047, "grad_norm": 4.006992340087891, "learning_rate": 2.958967548289337e-06, "loss": 0.4806, "num_input_tokens_seen": 130143968, "step": 107020 }, { "epoch": 13.409973687507831, "grad_norm": 1.1275731325149536, "learning_rate": 2.9584684711040303e-06, "loss": 0.5937, "num_input_tokens_seen": 130150048, "step": 107025 }, { "epoch": 13.410600175416615, "grad_norm": 0.795151948928833, "learning_rate": 2.9579694183269476e-06, "loss": 0.4714, "num_input_tokens_seen": 130156128, "step": 107030 }, { "epoch": 13.411226663325397, "grad_norm": 0.9642850756645203, "learning_rate": 2.9574703899640596e-06, "loss": 0.4245, "num_input_tokens_seen": 130161888, "step": 107035 }, { "epoch": 13.411853151234181, "grad_norm": 1.127437949180603, "learning_rate": 2.956971386021328e-06, "loss": 0.4763, "num_input_tokens_seen": 130167936, "step": 107040 }, { "epoch": 13.412479639142964, "grad_norm": 1.0060418844223022, "learning_rate": 2.956472406504723e-06, "loss": 0.4454, "num_input_tokens_seen": 130173920, "step": 107045 }, { "epoch": 13.413106127051748, "grad_norm": 0.9793289303779602, "learning_rate": 2.9559734514202064e-06, "loss": 0.4258, "num_input_tokens_seen": 130180256, "step": 107050 }, { "epoch": 13.413732614960532, "grad_norm": 0.7362227439880371, "learning_rate": 2.955474520773746e-06, "loss": 0.5125, "num_input_tokens_seen": 130186400, "step": 107055 }, { "epoch": 13.414359102869314, "grad_norm": 1.0442081689834595, "learning_rate": 2.9549756145713076e-06, "loss": 0.4461, "num_input_tokens_seen": 130192576, "step": 107060 }, { "epoch": 13.414985590778098, "grad_norm": 4.377502918243408, "learning_rate": 2.9544767328188553e-06, "loss": 0.5286, "num_input_tokens_seen": 130198816, "step": 107065 }, { "epoch": 13.41561207868688, "grad_norm": 1.3290265798568726, "learning_rate": 2.9539778755223537e-06, "loss": 0.4913, "num_input_tokens_seen": 130205024, "step": 107070 }, { "epoch": 13.416238566595664, "grad_norm": 0.9297099113464355, "learning_rate": 2.9534790426877647e-06, "loss": 0.4632, "num_input_tokens_seen": 130211040, "step": 107075 }, { "epoch": 13.416865054504449, "grad_norm": 1.3830357789993286, "learning_rate": 2.952980234321057e-06, "loss": 0.5202, "num_input_tokens_seen": 130216384, "step": 107080 }, { "epoch": 13.41749154241323, "grad_norm": 0.9935014247894287, "learning_rate": 2.9524814504281895e-06, "loss": 0.5225, "num_input_tokens_seen": 130222688, "step": 107085 }, { "epoch": 13.418118030322015, "grad_norm": 8.887564659118652, "learning_rate": 2.9519826910151304e-06, "loss": 0.4775, "num_input_tokens_seen": 130229184, "step": 107090 }, { "epoch": 13.418744518230799, "grad_norm": 1.0554845333099365, "learning_rate": 2.951483956087837e-06, "loss": 0.5218, "num_input_tokens_seen": 130235520, "step": 107095 }, { "epoch": 13.419371006139581, "grad_norm": 0.7782735228538513, "learning_rate": 2.950985245652278e-06, "loss": 0.4969, "num_input_tokens_seen": 130241760, "step": 107100 }, { "epoch": 13.419997494048365, "grad_norm": 1.731910228729248, "learning_rate": 2.9504865597144113e-06, "loss": 0.4814, "num_input_tokens_seen": 130247936, "step": 107105 }, { "epoch": 13.420623981957148, "grad_norm": 2.772277355194092, "learning_rate": 2.949987898280201e-06, "loss": 0.4615, "num_input_tokens_seen": 130254080, "step": 107110 }, { "epoch": 13.421250469865932, "grad_norm": 0.9252865314483643, "learning_rate": 2.9494892613556088e-06, "loss": 0.5103, "num_input_tokens_seen": 130260160, "step": 107115 }, { "epoch": 13.421876957774716, "grad_norm": 0.5690701603889465, "learning_rate": 2.948990648946597e-06, "loss": 0.4234, "num_input_tokens_seen": 130266432, "step": 107120 }, { "epoch": 13.422503445683498, "grad_norm": 1.286856770515442, "learning_rate": 2.948492061059126e-06, "loss": 0.4478, "num_input_tokens_seen": 130272832, "step": 107125 }, { "epoch": 13.423129933592282, "grad_norm": 1.452481746673584, "learning_rate": 2.947993497699157e-06, "loss": 0.4412, "num_input_tokens_seen": 130279168, "step": 107130 }, { "epoch": 13.423756421501064, "grad_norm": 1.6717792749404907, "learning_rate": 2.947494958872653e-06, "loss": 0.5361, "num_input_tokens_seen": 130285664, "step": 107135 }, { "epoch": 13.424382909409848, "grad_norm": 2.5538628101348877, "learning_rate": 2.9469964445855697e-06, "loss": 0.4345, "num_input_tokens_seen": 130291168, "step": 107140 }, { "epoch": 13.425009397318632, "grad_norm": 1.8286103010177612, "learning_rate": 2.9464979548438723e-06, "loss": 0.4587, "num_input_tokens_seen": 130296480, "step": 107145 }, { "epoch": 13.425635885227415, "grad_norm": 1.0708268880844116, "learning_rate": 2.9459994896535164e-06, "loss": 0.4833, "num_input_tokens_seen": 130302208, "step": 107150 }, { "epoch": 13.426262373136199, "grad_norm": 5.091820240020752, "learning_rate": 2.945501049020466e-06, "loss": 0.4944, "num_input_tokens_seen": 130308416, "step": 107155 }, { "epoch": 13.426888861044981, "grad_norm": 0.9877656698226929, "learning_rate": 2.9450026329506755e-06, "loss": 0.4908, "num_input_tokens_seen": 130314464, "step": 107160 }, { "epoch": 13.427515348953765, "grad_norm": 1.18825364112854, "learning_rate": 2.944504241450107e-06, "loss": 0.4652, "num_input_tokens_seen": 130320192, "step": 107165 }, { "epoch": 13.42814183686255, "grad_norm": 2.012300729751587, "learning_rate": 2.9440058745247175e-06, "loss": 0.446, "num_input_tokens_seen": 130326336, "step": 107170 }, { "epoch": 13.428768324771331, "grad_norm": 1.2196544408798218, "learning_rate": 2.9435075321804685e-06, "loss": 0.4678, "num_input_tokens_seen": 130332736, "step": 107175 }, { "epoch": 13.429394812680115, "grad_norm": 1.1351878643035889, "learning_rate": 2.9430092144233134e-06, "loss": 0.4444, "num_input_tokens_seen": 130338272, "step": 107180 }, { "epoch": 13.430021300588898, "grad_norm": 9.297667503356934, "learning_rate": 2.9425109212592136e-06, "loss": 0.496, "num_input_tokens_seen": 130344512, "step": 107185 }, { "epoch": 13.430647788497682, "grad_norm": 2.880950450897217, "learning_rate": 2.942012652694127e-06, "loss": 0.5287, "num_input_tokens_seen": 130350656, "step": 107190 }, { "epoch": 13.431274276406466, "grad_norm": 0.7736678123474121, "learning_rate": 2.9415144087340077e-06, "loss": 0.4997, "num_input_tokens_seen": 130356608, "step": 107195 }, { "epoch": 13.431900764315248, "grad_norm": 0.9390116333961487, "learning_rate": 2.9410161893848156e-06, "loss": 0.4663, "num_input_tokens_seen": 130362432, "step": 107200 }, { "epoch": 13.432527252224032, "grad_norm": 0.8486035466194153, "learning_rate": 2.940517994652505e-06, "loss": 0.5005, "num_input_tokens_seen": 130368352, "step": 107205 }, { "epoch": 13.433153740132816, "grad_norm": 6.943808078765869, "learning_rate": 2.9400198245430334e-06, "loss": 0.4831, "num_input_tokens_seen": 130373792, "step": 107210 }, { "epoch": 13.433780228041599, "grad_norm": 2.64322829246521, "learning_rate": 2.9395216790623572e-06, "loss": 0.5095, "num_input_tokens_seen": 130379936, "step": 107215 }, { "epoch": 13.434406715950383, "grad_norm": 1.4850795269012451, "learning_rate": 2.9390235582164316e-06, "loss": 0.4354, "num_input_tokens_seen": 130386208, "step": 107220 }, { "epoch": 13.435033203859165, "grad_norm": 3.656862735748291, "learning_rate": 2.9385254620112115e-06, "loss": 0.4803, "num_input_tokens_seen": 130392640, "step": 107225 }, { "epoch": 13.435659691767949, "grad_norm": 1.0846312046051025, "learning_rate": 2.9380273904526545e-06, "loss": 0.4823, "num_input_tokens_seen": 130398944, "step": 107230 }, { "epoch": 13.436286179676733, "grad_norm": 0.7929471731185913, "learning_rate": 2.937529343546711e-06, "loss": 0.459, "num_input_tokens_seen": 130404544, "step": 107235 }, { "epoch": 13.436912667585515, "grad_norm": 0.8780882954597473, "learning_rate": 2.937031321299339e-06, "loss": 0.4586, "num_input_tokens_seen": 130410272, "step": 107240 }, { "epoch": 13.4375391554943, "grad_norm": 1.3988639116287231, "learning_rate": 2.9365333237164938e-06, "loss": 0.4712, "num_input_tokens_seen": 130416736, "step": 107245 }, { "epoch": 13.438165643403082, "grad_norm": 1.231945276260376, "learning_rate": 2.9360353508041257e-06, "loss": 0.4741, "num_input_tokens_seen": 130423040, "step": 107250 }, { "epoch": 13.438792131311866, "grad_norm": 1.0196470022201538, "learning_rate": 2.9355374025681923e-06, "loss": 0.4456, "num_input_tokens_seen": 130429312, "step": 107255 }, { "epoch": 13.43941861922065, "grad_norm": 1.0855915546417236, "learning_rate": 2.9350394790146427e-06, "loss": 0.4461, "num_input_tokens_seen": 130435392, "step": 107260 }, { "epoch": 13.440045107129432, "grad_norm": 0.9209189414978027, "learning_rate": 2.9345415801494332e-06, "loss": 0.4419, "num_input_tokens_seen": 130441504, "step": 107265 }, { "epoch": 13.440671595038216, "grad_norm": 1.029547929763794, "learning_rate": 2.9340437059785156e-06, "loss": 0.4462, "num_input_tokens_seen": 130447648, "step": 107270 }, { "epoch": 13.441298082946998, "grad_norm": 5.126194477081299, "learning_rate": 2.9335458565078423e-06, "loss": 0.484, "num_input_tokens_seen": 130453696, "step": 107275 }, { "epoch": 13.441924570855782, "grad_norm": 1.9225225448608398, "learning_rate": 2.933048031743365e-06, "loss": 0.4589, "num_input_tokens_seen": 130459520, "step": 107280 }, { "epoch": 13.442551058764566, "grad_norm": 4.399959087371826, "learning_rate": 2.9325502316910388e-06, "loss": 0.5143, "num_input_tokens_seen": 130465632, "step": 107285 }, { "epoch": 13.443177546673349, "grad_norm": 1.0765960216522217, "learning_rate": 2.93205245635681e-06, "loss": 0.4686, "num_input_tokens_seen": 130472000, "step": 107290 }, { "epoch": 13.443804034582133, "grad_norm": 0.7210515141487122, "learning_rate": 2.9315547057466353e-06, "loss": 0.4357, "num_input_tokens_seen": 130478208, "step": 107295 }, { "epoch": 13.444430522490915, "grad_norm": 1.3982645273208618, "learning_rate": 2.9310569798664606e-06, "loss": 0.4576, "num_input_tokens_seen": 130484288, "step": 107300 }, { "epoch": 13.4450570103997, "grad_norm": 2.643615484237671, "learning_rate": 2.9305592787222394e-06, "loss": 0.4608, "num_input_tokens_seen": 130490592, "step": 107305 }, { "epoch": 13.445683498308483, "grad_norm": 1.0410984754562378, "learning_rate": 2.9300616023199243e-06, "loss": 0.4747, "num_input_tokens_seen": 130496960, "step": 107310 }, { "epoch": 13.446309986217265, "grad_norm": 6.990119457244873, "learning_rate": 2.9295639506654606e-06, "loss": 0.4555, "num_input_tokens_seen": 130502912, "step": 107315 }, { "epoch": 13.44693647412605, "grad_norm": 0.8637681007385254, "learning_rate": 2.9290663237648017e-06, "loss": 0.4766, "num_input_tokens_seen": 130509184, "step": 107320 }, { "epoch": 13.447562962034834, "grad_norm": 3.2741334438323975, "learning_rate": 2.9285687216238954e-06, "loss": 0.4821, "num_input_tokens_seen": 130515104, "step": 107325 }, { "epoch": 13.448189449943616, "grad_norm": 6.86322546005249, "learning_rate": 2.9280711442486926e-06, "loss": 0.4611, "num_input_tokens_seen": 130521600, "step": 107330 }, { "epoch": 13.4488159378524, "grad_norm": 2.753342866897583, "learning_rate": 2.92757359164514e-06, "loss": 0.462, "num_input_tokens_seen": 130527936, "step": 107335 }, { "epoch": 13.449442425761182, "grad_norm": 1.2249045372009277, "learning_rate": 2.9270760638191893e-06, "loss": 0.4533, "num_input_tokens_seen": 130533920, "step": 107340 }, { "epoch": 13.450068913669966, "grad_norm": 1.8227466344833374, "learning_rate": 2.926578560776785e-06, "loss": 0.47, "num_input_tokens_seen": 130539936, "step": 107345 }, { "epoch": 13.45069540157875, "grad_norm": 1.2296162843704224, "learning_rate": 2.9260810825238793e-06, "loss": 0.4855, "num_input_tokens_seen": 130546272, "step": 107350 }, { "epoch": 13.451321889487533, "grad_norm": 0.9795113205909729, "learning_rate": 2.9255836290664165e-06, "loss": 0.4545, "num_input_tokens_seen": 130552576, "step": 107355 }, { "epoch": 13.451948377396317, "grad_norm": 6.608220100402832, "learning_rate": 2.9250862004103465e-06, "loss": 0.4547, "num_input_tokens_seen": 130558976, "step": 107360 }, { "epoch": 13.452574865305099, "grad_norm": 1.0201104879379272, "learning_rate": 2.924588796561614e-06, "loss": 0.4222, "num_input_tokens_seen": 130565248, "step": 107365 }, { "epoch": 13.453201353213883, "grad_norm": 1.4271550178527832, "learning_rate": 2.9240914175261677e-06, "loss": 0.4523, "num_input_tokens_seen": 130571232, "step": 107370 }, { "epoch": 13.453827841122667, "grad_norm": 1.5258055925369263, "learning_rate": 2.9235940633099548e-06, "loss": 0.4231, "num_input_tokens_seen": 130577184, "step": 107375 }, { "epoch": 13.45445432903145, "grad_norm": 2.8404150009155273, "learning_rate": 2.923096733918919e-06, "loss": 0.5602, "num_input_tokens_seen": 130583232, "step": 107380 }, { "epoch": 13.455080816940233, "grad_norm": 1.9628409147262573, "learning_rate": 2.92259942935901e-06, "loss": 0.4917, "num_input_tokens_seen": 130589312, "step": 107385 }, { "epoch": 13.455707304849016, "grad_norm": 8.800149917602539, "learning_rate": 2.9221021496361696e-06, "loss": 0.5853, "num_input_tokens_seen": 130595296, "step": 107390 }, { "epoch": 13.4563337927578, "grad_norm": 1.8938719034194946, "learning_rate": 2.921604894756347e-06, "loss": 0.4199, "num_input_tokens_seen": 130601760, "step": 107395 }, { "epoch": 13.456960280666584, "grad_norm": 1.4013397693634033, "learning_rate": 2.9211076647254834e-06, "loss": 0.4942, "num_input_tokens_seen": 130607968, "step": 107400 }, { "epoch": 13.457586768575366, "grad_norm": 1.3835031986236572, "learning_rate": 2.920610459549527e-06, "loss": 0.4443, "num_input_tokens_seen": 130614560, "step": 107405 }, { "epoch": 13.45821325648415, "grad_norm": 4.741488456726074, "learning_rate": 2.9201132792344195e-06, "loss": 0.5255, "num_input_tokens_seen": 130620832, "step": 107410 }, { "epoch": 13.458839744392932, "grad_norm": 8.449566841125488, "learning_rate": 2.919616123786107e-06, "loss": 0.6608, "num_input_tokens_seen": 130626400, "step": 107415 }, { "epoch": 13.459466232301716, "grad_norm": 1.0875473022460938, "learning_rate": 2.919118993210533e-06, "loss": 0.5027, "num_input_tokens_seen": 130632896, "step": 107420 }, { "epoch": 13.4600927202105, "grad_norm": 8.3347749710083, "learning_rate": 2.9186218875136397e-06, "loss": 0.584, "num_input_tokens_seen": 130639040, "step": 107425 }, { "epoch": 13.460719208119283, "grad_norm": 10.260239601135254, "learning_rate": 2.918124806701374e-06, "loss": 0.5102, "num_input_tokens_seen": 130645152, "step": 107430 }, { "epoch": 13.461345696028067, "grad_norm": 4.565914154052734, "learning_rate": 2.9176277507796747e-06, "loss": 0.4604, "num_input_tokens_seen": 130651072, "step": 107435 }, { "epoch": 13.461972183936851, "grad_norm": 1.2943743467330933, "learning_rate": 2.917130719754488e-06, "loss": 0.4845, "num_input_tokens_seen": 130657184, "step": 107440 }, { "epoch": 13.462598671845633, "grad_norm": 3.295773983001709, "learning_rate": 2.9166337136317523e-06, "loss": 0.5061, "num_input_tokens_seen": 130663328, "step": 107445 }, { "epoch": 13.463225159754417, "grad_norm": 3.145524024963379, "learning_rate": 2.916136732417413e-06, "loss": 0.4384, "num_input_tokens_seen": 130669344, "step": 107450 }, { "epoch": 13.4638516476632, "grad_norm": 2.1694726943969727, "learning_rate": 2.9156397761174106e-06, "loss": 0.4784, "num_input_tokens_seen": 130675264, "step": 107455 }, { "epoch": 13.464478135571984, "grad_norm": 1.3055700063705444, "learning_rate": 2.9151428447376893e-06, "loss": 0.5112, "num_input_tokens_seen": 130681120, "step": 107460 }, { "epoch": 13.465104623480768, "grad_norm": 1.6761411428451538, "learning_rate": 2.9146459382841863e-06, "loss": 0.5035, "num_input_tokens_seen": 130687296, "step": 107465 }, { "epoch": 13.46573111138955, "grad_norm": 1.2761163711547852, "learning_rate": 2.914149056762846e-06, "loss": 0.4446, "num_input_tokens_seen": 130693376, "step": 107470 }, { "epoch": 13.466357599298334, "grad_norm": 8.068077087402344, "learning_rate": 2.9136522001796065e-06, "loss": 0.4573, "num_input_tokens_seen": 130699584, "step": 107475 }, { "epoch": 13.466984087207116, "grad_norm": 1.3539329767227173, "learning_rate": 2.9131553685404098e-06, "loss": 0.4826, "num_input_tokens_seen": 130705888, "step": 107480 }, { "epoch": 13.4676105751159, "grad_norm": 1.397929072380066, "learning_rate": 2.9126585618511936e-06, "loss": 0.4873, "num_input_tokens_seen": 130712320, "step": 107485 }, { "epoch": 13.468237063024684, "grad_norm": 2.27970027923584, "learning_rate": 2.912161780117899e-06, "loss": 0.5495, "num_input_tokens_seen": 130718304, "step": 107490 }, { "epoch": 13.468863550933467, "grad_norm": 4.6437811851501465, "learning_rate": 2.911665023346468e-06, "loss": 0.4701, "num_input_tokens_seen": 130724320, "step": 107495 }, { "epoch": 13.46949003884225, "grad_norm": 2.4213809967041016, "learning_rate": 2.911168291542835e-06, "loss": 0.4433, "num_input_tokens_seen": 130730368, "step": 107500 }, { "epoch": 13.470116526751033, "grad_norm": 7.307735919952393, "learning_rate": 2.910671584712944e-06, "loss": 0.5025, "num_input_tokens_seen": 130736512, "step": 107505 }, { "epoch": 13.470743014659817, "grad_norm": 1.3947347402572632, "learning_rate": 2.910174902862728e-06, "loss": 0.4518, "num_input_tokens_seen": 130742464, "step": 107510 }, { "epoch": 13.471369502568601, "grad_norm": 2.7794313430786133, "learning_rate": 2.909678245998131e-06, "loss": 0.5425, "num_input_tokens_seen": 130748832, "step": 107515 }, { "epoch": 13.471995990477383, "grad_norm": 0.7053338885307312, "learning_rate": 2.909181614125085e-06, "loss": 0.508, "num_input_tokens_seen": 130754976, "step": 107520 }, { "epoch": 13.472622478386167, "grad_norm": 1.3077250719070435, "learning_rate": 2.908685007249533e-06, "loss": 0.4534, "num_input_tokens_seen": 130760864, "step": 107525 }, { "epoch": 13.47324896629495, "grad_norm": 1.042839527130127, "learning_rate": 2.9081884253774084e-06, "loss": 0.4424, "num_input_tokens_seen": 130767200, "step": 107530 }, { "epoch": 13.473875454203734, "grad_norm": 5.287259101867676, "learning_rate": 2.9076918685146516e-06, "loss": 0.4926, "num_input_tokens_seen": 130773312, "step": 107535 }, { "epoch": 13.474501942112518, "grad_norm": 1.3573065996170044, "learning_rate": 2.9071953366671955e-06, "loss": 0.4473, "num_input_tokens_seen": 130779584, "step": 107540 }, { "epoch": 13.4751284300213, "grad_norm": 0.9359427094459534, "learning_rate": 2.906698829840979e-06, "loss": 0.5024, "num_input_tokens_seen": 130785760, "step": 107545 }, { "epoch": 13.475754917930084, "grad_norm": 2.5976316928863525, "learning_rate": 2.90620234804194e-06, "loss": 0.4469, "num_input_tokens_seen": 130792480, "step": 107550 }, { "epoch": 13.476381405838866, "grad_norm": 3.8276443481445312, "learning_rate": 2.90570589127601e-06, "loss": 0.4794, "num_input_tokens_seen": 130798752, "step": 107555 }, { "epoch": 13.47700789374765, "grad_norm": 0.9779282808303833, "learning_rate": 2.9052094595491266e-06, "loss": 0.464, "num_input_tokens_seen": 130804768, "step": 107560 }, { "epoch": 13.477634381656435, "grad_norm": 1.9421477317810059, "learning_rate": 2.9047130528672265e-06, "loss": 0.4555, "num_input_tokens_seen": 130810848, "step": 107565 }, { "epoch": 13.478260869565217, "grad_norm": 0.8396840691566467, "learning_rate": 2.904216671236244e-06, "loss": 0.4701, "num_input_tokens_seen": 130817120, "step": 107570 }, { "epoch": 13.478887357474001, "grad_norm": 2.207228660583496, "learning_rate": 2.9037203146621117e-06, "loss": 0.4469, "num_input_tokens_seen": 130822624, "step": 107575 }, { "epoch": 13.479513845382785, "grad_norm": 4.20691442489624, "learning_rate": 2.903223983150768e-06, "loss": 0.4883, "num_input_tokens_seen": 130828640, "step": 107580 }, { "epoch": 13.480140333291567, "grad_norm": 1.0710176229476929, "learning_rate": 2.9027276767081418e-06, "loss": 0.4777, "num_input_tokens_seen": 130835104, "step": 107585 }, { "epoch": 13.480766821200351, "grad_norm": 1.2367361783981323, "learning_rate": 2.9022313953401722e-06, "loss": 0.4564, "num_input_tokens_seen": 130841344, "step": 107590 }, { "epoch": 13.481393309109134, "grad_norm": 0.8212607502937317, "learning_rate": 2.901735139052787e-06, "loss": 0.4772, "num_input_tokens_seen": 130847520, "step": 107595 }, { "epoch": 13.482019797017918, "grad_norm": 1.422568440437317, "learning_rate": 2.9012389078519255e-06, "loss": 0.481, "num_input_tokens_seen": 130853792, "step": 107600 }, { "epoch": 13.482646284926702, "grad_norm": 0.9819068312644958, "learning_rate": 2.9007427017435147e-06, "loss": 0.4386, "num_input_tokens_seen": 130859648, "step": 107605 }, { "epoch": 13.483272772835484, "grad_norm": 0.8790198564529419, "learning_rate": 2.9002465207334894e-06, "loss": 0.4803, "num_input_tokens_seen": 130865888, "step": 107610 }, { "epoch": 13.483899260744268, "grad_norm": 6.727658271789551, "learning_rate": 2.899750364827785e-06, "loss": 0.4659, "num_input_tokens_seen": 130872160, "step": 107615 }, { "epoch": 13.48452574865305, "grad_norm": 6.945577621459961, "learning_rate": 2.8992542340323293e-06, "loss": 0.5036, "num_input_tokens_seen": 130878464, "step": 107620 }, { "epoch": 13.485152236561834, "grad_norm": 1.1097387075424194, "learning_rate": 2.8987581283530563e-06, "loss": 0.4371, "num_input_tokens_seen": 130884288, "step": 107625 }, { "epoch": 13.485778724470618, "grad_norm": 1.101389765739441, "learning_rate": 2.898262047795896e-06, "loss": 0.4689, "num_input_tokens_seen": 130890432, "step": 107630 }, { "epoch": 13.4864052123794, "grad_norm": 0.82403963804245, "learning_rate": 2.89776599236678e-06, "loss": 0.4832, "num_input_tokens_seen": 130896384, "step": 107635 }, { "epoch": 13.487031700288185, "grad_norm": 3.533327341079712, "learning_rate": 2.897269962071638e-06, "loss": 0.4622, "num_input_tokens_seen": 130902432, "step": 107640 }, { "epoch": 13.487658188196967, "grad_norm": 0.8868834972381592, "learning_rate": 2.896773956916404e-06, "loss": 0.4978, "num_input_tokens_seen": 130908576, "step": 107645 }, { "epoch": 13.488284676105751, "grad_norm": 2.806035041809082, "learning_rate": 2.8962779769070027e-06, "loss": 0.4575, "num_input_tokens_seen": 130915008, "step": 107650 }, { "epoch": 13.488911164014535, "grad_norm": 1.077351689338684, "learning_rate": 2.8957820220493677e-06, "loss": 0.4358, "num_input_tokens_seen": 130921344, "step": 107655 }, { "epoch": 13.489537651923317, "grad_norm": 2.697399377822876, "learning_rate": 2.8952860923494273e-06, "loss": 0.4806, "num_input_tokens_seen": 130927616, "step": 107660 }, { "epoch": 13.490164139832101, "grad_norm": 1.8999994993209839, "learning_rate": 2.8947901878131134e-06, "loss": 0.4703, "num_input_tokens_seen": 130933152, "step": 107665 }, { "epoch": 13.490790627740884, "grad_norm": 10.52044677734375, "learning_rate": 2.8942943084463504e-06, "loss": 0.5913, "num_input_tokens_seen": 130939360, "step": 107670 }, { "epoch": 13.491417115649668, "grad_norm": 1.067938208580017, "learning_rate": 2.8937984542550702e-06, "loss": 0.4849, "num_input_tokens_seen": 130945792, "step": 107675 }, { "epoch": 13.492043603558452, "grad_norm": 9.025798797607422, "learning_rate": 2.893302625245201e-06, "loss": 0.5292, "num_input_tokens_seen": 130951680, "step": 107680 }, { "epoch": 13.492670091467234, "grad_norm": 2.114527940750122, "learning_rate": 2.892806821422669e-06, "loss": 0.4716, "num_input_tokens_seen": 130957920, "step": 107685 }, { "epoch": 13.493296579376018, "grad_norm": 1.0546869039535522, "learning_rate": 2.8923110427934046e-06, "loss": 0.4804, "num_input_tokens_seen": 130963904, "step": 107690 }, { "epoch": 13.4939230672848, "grad_norm": 0.9798876047134399, "learning_rate": 2.891815289363332e-06, "loss": 0.4724, "num_input_tokens_seen": 130969696, "step": 107695 }, { "epoch": 13.494549555193585, "grad_norm": 8.80272388458252, "learning_rate": 2.8913195611383826e-06, "loss": 0.5089, "num_input_tokens_seen": 130975936, "step": 107700 }, { "epoch": 13.495176043102369, "grad_norm": 1.9175509214401245, "learning_rate": 2.890823858124478e-06, "loss": 0.4624, "num_input_tokens_seen": 130982144, "step": 107705 }, { "epoch": 13.495802531011151, "grad_norm": 2.8696541786193848, "learning_rate": 2.8903281803275496e-06, "loss": 0.452, "num_input_tokens_seen": 130988128, "step": 107710 }, { "epoch": 13.496429018919935, "grad_norm": 2.563502311706543, "learning_rate": 2.88983252775352e-06, "loss": 0.4307, "num_input_tokens_seen": 130994464, "step": 107715 }, { "epoch": 13.497055506828719, "grad_norm": 1.9467477798461914, "learning_rate": 2.889336900408318e-06, "loss": 0.4713, "num_input_tokens_seen": 131000864, "step": 107720 }, { "epoch": 13.497681994737501, "grad_norm": 4.703101634979248, "learning_rate": 2.8888412982978666e-06, "loss": 0.497, "num_input_tokens_seen": 131006752, "step": 107725 }, { "epoch": 13.498308482646285, "grad_norm": 0.9503417015075684, "learning_rate": 2.8883457214280935e-06, "loss": 0.513, "num_input_tokens_seen": 131013056, "step": 107730 }, { "epoch": 13.498934970555068, "grad_norm": 1.2779260873794556, "learning_rate": 2.887850169804921e-06, "loss": 0.5249, "num_input_tokens_seen": 131019040, "step": 107735 }, { "epoch": 13.499561458463852, "grad_norm": 2.152538537979126, "learning_rate": 2.887354643434276e-06, "loss": 0.4394, "num_input_tokens_seen": 131025152, "step": 107740 }, { "epoch": 13.500187946372636, "grad_norm": 1.6490254402160645, "learning_rate": 2.8868591423220843e-06, "loss": 0.4365, "num_input_tokens_seen": 131031616, "step": 107745 }, { "epoch": 13.500814434281418, "grad_norm": 4.1962738037109375, "learning_rate": 2.8863636664742657e-06, "loss": 0.4832, "num_input_tokens_seen": 131037984, "step": 107750 }, { "epoch": 13.501440922190202, "grad_norm": 0.9832065105438232, "learning_rate": 2.885868215896747e-06, "loss": 0.4404, "num_input_tokens_seen": 131044224, "step": 107755 }, { "epoch": 13.502067410098984, "grad_norm": 1.3280659914016724, "learning_rate": 2.8853727905954533e-06, "loss": 0.464, "num_input_tokens_seen": 131050624, "step": 107760 }, { "epoch": 13.502693898007768, "grad_norm": 6.37965202331543, "learning_rate": 2.884877390576304e-06, "loss": 0.4915, "num_input_tokens_seen": 131057088, "step": 107765 }, { "epoch": 13.503320385916552, "grad_norm": 5.081561088562012, "learning_rate": 2.8843820158452228e-06, "loss": 0.4703, "num_input_tokens_seen": 131063296, "step": 107770 }, { "epoch": 13.503946873825335, "grad_norm": 4.434529781341553, "learning_rate": 2.883886666408136e-06, "loss": 0.482, "num_input_tokens_seen": 131069248, "step": 107775 }, { "epoch": 13.504573361734119, "grad_norm": 2.361891984939575, "learning_rate": 2.8833913422709615e-06, "loss": 0.4351, "num_input_tokens_seen": 131075296, "step": 107780 }, { "epoch": 13.505199849642901, "grad_norm": 0.8928894400596619, "learning_rate": 2.882896043439624e-06, "loss": 0.447, "num_input_tokens_seen": 131080928, "step": 107785 }, { "epoch": 13.505826337551685, "grad_norm": 1.0250808000564575, "learning_rate": 2.882400769920043e-06, "loss": 0.4509, "num_input_tokens_seen": 131086880, "step": 107790 }, { "epoch": 13.50645282546047, "grad_norm": 0.8682527542114258, "learning_rate": 2.881905521718141e-06, "loss": 0.4567, "num_input_tokens_seen": 131092512, "step": 107795 }, { "epoch": 13.507079313369251, "grad_norm": 1.071826457977295, "learning_rate": 2.881410298839842e-06, "loss": 0.4672, "num_input_tokens_seen": 131098880, "step": 107800 }, { "epoch": 13.507705801278036, "grad_norm": 1.017831802368164, "learning_rate": 2.880915101291061e-06, "loss": 0.4872, "num_input_tokens_seen": 131105088, "step": 107805 }, { "epoch": 13.508332289186818, "grad_norm": 1.166140079498291, "learning_rate": 2.8804199290777248e-06, "loss": 0.48, "num_input_tokens_seen": 131111296, "step": 107810 }, { "epoch": 13.508958777095602, "grad_norm": 0.8583563566207886, "learning_rate": 2.8799247822057473e-06, "loss": 0.4734, "num_input_tokens_seen": 131117408, "step": 107815 }, { "epoch": 13.509585265004386, "grad_norm": 1.0938937664031982, "learning_rate": 2.8794296606810534e-06, "loss": 0.4494, "num_input_tokens_seen": 131123008, "step": 107820 }, { "epoch": 13.510211752913168, "grad_norm": 2.8888614177703857, "learning_rate": 2.8789345645095595e-06, "loss": 0.4775, "num_input_tokens_seen": 131129088, "step": 107825 }, { "epoch": 13.510838240821952, "grad_norm": 2.4828808307647705, "learning_rate": 2.8784394936971884e-06, "loss": 0.4482, "num_input_tokens_seen": 131134720, "step": 107830 }, { "epoch": 13.511464728730736, "grad_norm": 5.8745436668396, "learning_rate": 2.8779444482498543e-06, "loss": 0.4919, "num_input_tokens_seen": 131140640, "step": 107835 }, { "epoch": 13.512091216639519, "grad_norm": 7.5074310302734375, "learning_rate": 2.87744942817348e-06, "loss": 0.4867, "num_input_tokens_seen": 131146816, "step": 107840 }, { "epoch": 13.512717704548303, "grad_norm": 0.7988860607147217, "learning_rate": 2.876954433473981e-06, "loss": 0.4644, "num_input_tokens_seen": 131152576, "step": 107845 }, { "epoch": 13.513344192457085, "grad_norm": 1.7193061113357544, "learning_rate": 2.8764594641572786e-06, "loss": 0.4475, "num_input_tokens_seen": 131158720, "step": 107850 }, { "epoch": 13.513970680365869, "grad_norm": 0.9142247438430786, "learning_rate": 2.8759645202292864e-06, "loss": 0.4438, "num_input_tokens_seen": 131164768, "step": 107855 }, { "epoch": 13.514597168274653, "grad_norm": 1.1897940635681152, "learning_rate": 2.875469601695925e-06, "loss": 0.466, "num_input_tokens_seen": 131170784, "step": 107860 }, { "epoch": 13.515223656183435, "grad_norm": 1.0868821144104004, "learning_rate": 2.8749747085631098e-06, "loss": 0.4635, "num_input_tokens_seen": 131176864, "step": 107865 }, { "epoch": 13.51585014409222, "grad_norm": 1.1176791191101074, "learning_rate": 2.874479840836761e-06, "loss": 0.4544, "num_input_tokens_seen": 131183168, "step": 107870 }, { "epoch": 13.516476632001002, "grad_norm": 0.8795763850212097, "learning_rate": 2.8739849985227908e-06, "loss": 0.4944, "num_input_tokens_seen": 131189152, "step": 107875 }, { "epoch": 13.517103119909786, "grad_norm": 0.9456660747528076, "learning_rate": 2.8734901816271165e-06, "loss": 0.4845, "num_input_tokens_seen": 131194592, "step": 107880 }, { "epoch": 13.51772960781857, "grad_norm": 0.814613401889801, "learning_rate": 2.8729953901556583e-06, "loss": 0.436, "num_input_tokens_seen": 131200672, "step": 107885 }, { "epoch": 13.518356095727352, "grad_norm": 0.5598906874656677, "learning_rate": 2.872500624114326e-06, "loss": 0.4808, "num_input_tokens_seen": 131206240, "step": 107890 }, { "epoch": 13.518982583636136, "grad_norm": 3.0445938110351562, "learning_rate": 2.8720058835090393e-06, "loss": 0.4622, "num_input_tokens_seen": 131212256, "step": 107895 }, { "epoch": 13.519609071544918, "grad_norm": 3.0420939922332764, "learning_rate": 2.871511168345709e-06, "loss": 0.4921, "num_input_tokens_seen": 131218016, "step": 107900 }, { "epoch": 13.520235559453702, "grad_norm": 1.1398924589157104, "learning_rate": 2.8710164786302547e-06, "loss": 0.4789, "num_input_tokens_seen": 131224384, "step": 107905 }, { "epoch": 13.520862047362487, "grad_norm": 0.864477276802063, "learning_rate": 2.8705218143685864e-06, "loss": 0.4584, "num_input_tokens_seen": 131230336, "step": 107910 }, { "epoch": 13.521488535271269, "grad_norm": 0.9780364036560059, "learning_rate": 2.870027175566622e-06, "loss": 0.4548, "num_input_tokens_seen": 131236576, "step": 107915 }, { "epoch": 13.522115023180053, "grad_norm": 4.1868205070495605, "learning_rate": 2.869532562230272e-06, "loss": 0.4925, "num_input_tokens_seen": 131243072, "step": 107920 }, { "epoch": 13.522741511088835, "grad_norm": 3.8549890518188477, "learning_rate": 2.8690379743654505e-06, "loss": 0.4722, "num_input_tokens_seen": 131249152, "step": 107925 }, { "epoch": 13.52336799899762, "grad_norm": 2.937973737716675, "learning_rate": 2.868543411978074e-06, "loss": 0.4405, "num_input_tokens_seen": 131255168, "step": 107930 }, { "epoch": 13.523994486906403, "grad_norm": 3.694758176803589, "learning_rate": 2.868048875074051e-06, "loss": 0.4726, "num_input_tokens_seen": 131261120, "step": 107935 }, { "epoch": 13.524620974815186, "grad_norm": 1.2047271728515625, "learning_rate": 2.8675543636592983e-06, "loss": 0.4816, "num_input_tokens_seen": 131267488, "step": 107940 }, { "epoch": 13.52524746272397, "grad_norm": 1.1264301538467407, "learning_rate": 2.867059877739724e-06, "loss": 0.4528, "num_input_tokens_seen": 131273696, "step": 107945 }, { "epoch": 13.525873950632754, "grad_norm": 3.0439438819885254, "learning_rate": 2.8665654173212445e-06, "loss": 0.4494, "num_input_tokens_seen": 131279552, "step": 107950 }, { "epoch": 13.526500438541536, "grad_norm": 1.1964787244796753, "learning_rate": 2.8660709824097665e-06, "loss": 0.4472, "num_input_tokens_seen": 131285824, "step": 107955 }, { "epoch": 13.52712692645032, "grad_norm": 1.1255594491958618, "learning_rate": 2.8655765730112043e-06, "loss": 0.4973, "num_input_tokens_seen": 131291744, "step": 107960 }, { "epoch": 13.527753414359102, "grad_norm": 1.1662222146987915, "learning_rate": 2.8650821891314705e-06, "loss": 0.443, "num_input_tokens_seen": 131297728, "step": 107965 }, { "epoch": 13.528379902267886, "grad_norm": 3.428201913833618, "learning_rate": 2.864587830776473e-06, "loss": 0.4983, "num_input_tokens_seen": 131303872, "step": 107970 }, { "epoch": 13.52900639017667, "grad_norm": 0.9322996735572815, "learning_rate": 2.8640934979521225e-06, "loss": 0.5453, "num_input_tokens_seen": 131309920, "step": 107975 }, { "epoch": 13.529632878085453, "grad_norm": 1.1112223863601685, "learning_rate": 2.863599190664332e-06, "loss": 0.481, "num_input_tokens_seen": 131316288, "step": 107980 }, { "epoch": 13.530259365994237, "grad_norm": 0.8067287802696228, "learning_rate": 2.8631049089190076e-06, "loss": 0.5121, "num_input_tokens_seen": 131322304, "step": 107985 }, { "epoch": 13.530885853903019, "grad_norm": 1.1626847982406616, "learning_rate": 2.86261065272206e-06, "loss": 0.5007, "num_input_tokens_seen": 131328384, "step": 107990 }, { "epoch": 13.531512341811803, "grad_norm": 1.449684500694275, "learning_rate": 2.862116422079402e-06, "loss": 0.4835, "num_input_tokens_seen": 131333760, "step": 107995 }, { "epoch": 13.532138829720587, "grad_norm": 2.0804901123046875, "learning_rate": 2.8616222169969366e-06, "loss": 0.4578, "num_input_tokens_seen": 131339872, "step": 108000 }, { "epoch": 13.53276531762937, "grad_norm": 2.3076162338256836, "learning_rate": 2.8611280374805785e-06, "loss": 0.4627, "num_input_tokens_seen": 131346176, "step": 108005 }, { "epoch": 13.533391805538153, "grad_norm": 3.4587082862854004, "learning_rate": 2.8606338835362303e-06, "loss": 0.501, "num_input_tokens_seen": 131352128, "step": 108010 }, { "epoch": 13.534018293446936, "grad_norm": 1.7968305349349976, "learning_rate": 2.860139755169805e-06, "loss": 0.4426, "num_input_tokens_seen": 131358304, "step": 108015 }, { "epoch": 13.53464478135572, "grad_norm": 2.164762020111084, "learning_rate": 2.8596456523872056e-06, "loss": 0.4534, "num_input_tokens_seen": 131364352, "step": 108020 }, { "epoch": 13.535271269264504, "grad_norm": 2.246748208999634, "learning_rate": 2.8591515751943444e-06, "loss": 0.4729, "num_input_tokens_seen": 131370304, "step": 108025 }, { "epoch": 13.535897757173286, "grad_norm": 0.9492861032485962, "learning_rate": 2.8586575235971235e-06, "loss": 0.4434, "num_input_tokens_seen": 131376512, "step": 108030 }, { "epoch": 13.53652424508207, "grad_norm": 2.49556040763855, "learning_rate": 2.858163497601455e-06, "loss": 0.4501, "num_input_tokens_seen": 131382656, "step": 108035 }, { "epoch": 13.537150732990852, "grad_norm": 1.3983290195465088, "learning_rate": 2.85766949721324e-06, "loss": 0.4398, "num_input_tokens_seen": 131388640, "step": 108040 }, { "epoch": 13.537777220899637, "grad_norm": 7.344402313232422, "learning_rate": 2.857175522438388e-06, "loss": 0.4612, "num_input_tokens_seen": 131394784, "step": 108045 }, { "epoch": 13.53840370880842, "grad_norm": 1.1605067253112793, "learning_rate": 2.8566815732828057e-06, "loss": 0.4448, "num_input_tokens_seen": 131400800, "step": 108050 }, { "epoch": 13.539030196717203, "grad_norm": 0.9616284370422363, "learning_rate": 2.8561876497523955e-06, "loss": 0.4467, "num_input_tokens_seen": 131406816, "step": 108055 }, { "epoch": 13.539656684625987, "grad_norm": 0.8535749912261963, "learning_rate": 2.855693751853066e-06, "loss": 0.4633, "num_input_tokens_seen": 131413152, "step": 108060 }, { "epoch": 13.540283172534771, "grad_norm": 0.8681482076644897, "learning_rate": 2.855199879590719e-06, "loss": 0.5113, "num_input_tokens_seen": 131419264, "step": 108065 }, { "epoch": 13.540909660443553, "grad_norm": 7.926076889038086, "learning_rate": 2.8547060329712607e-06, "loss": 0.5072, "num_input_tokens_seen": 131424960, "step": 108070 }, { "epoch": 13.541536148352337, "grad_norm": 1.5195404291152954, "learning_rate": 2.8542122120005946e-06, "loss": 0.4667, "num_input_tokens_seen": 131431072, "step": 108075 }, { "epoch": 13.54216263626112, "grad_norm": 1.2798222303390503, "learning_rate": 2.853718416684628e-06, "loss": 0.4412, "num_input_tokens_seen": 131437152, "step": 108080 }, { "epoch": 13.542789124169904, "grad_norm": 1.0865309238433838, "learning_rate": 2.8532246470292605e-06, "loss": 0.4238, "num_input_tokens_seen": 131443264, "step": 108085 }, { "epoch": 13.543415612078686, "grad_norm": 2.160466194152832, "learning_rate": 2.8527309030403994e-06, "loss": 0.4608, "num_input_tokens_seen": 131449216, "step": 108090 }, { "epoch": 13.54404209998747, "grad_norm": 0.8290646076202393, "learning_rate": 2.852237184723943e-06, "loss": 0.4501, "num_input_tokens_seen": 131455456, "step": 108095 }, { "epoch": 13.544668587896254, "grad_norm": 4.6634321212768555, "learning_rate": 2.8517434920857993e-06, "loss": 0.5028, "num_input_tokens_seen": 131461344, "step": 108100 }, { "epoch": 13.545295075805036, "grad_norm": 5.931684494018555, "learning_rate": 2.8512498251318666e-06, "loss": 0.485, "num_input_tokens_seen": 131468096, "step": 108105 }, { "epoch": 13.54592156371382, "grad_norm": 10.845794677734375, "learning_rate": 2.850756183868048e-06, "loss": 0.5081, "num_input_tokens_seen": 131474112, "step": 108110 }, { "epoch": 13.546548051622604, "grad_norm": 1.0534321069717407, "learning_rate": 2.8502625683002484e-06, "loss": 0.453, "num_input_tokens_seen": 131479712, "step": 108115 }, { "epoch": 13.547174539531387, "grad_norm": 1.1726715564727783, "learning_rate": 2.849768978434366e-06, "loss": 0.4422, "num_input_tokens_seen": 131485760, "step": 108120 }, { "epoch": 13.54780102744017, "grad_norm": 6.503267765045166, "learning_rate": 2.8492754142763046e-06, "loss": 0.4807, "num_input_tokens_seen": 131492000, "step": 108125 }, { "epoch": 13.548427515348953, "grad_norm": 4.781414985656738, "learning_rate": 2.848781875831962e-06, "loss": 0.4954, "num_input_tokens_seen": 131498048, "step": 108130 }, { "epoch": 13.549054003257737, "grad_norm": 1.2761497497558594, "learning_rate": 2.848288363107242e-06, "loss": 0.4708, "num_input_tokens_seen": 131504032, "step": 108135 }, { "epoch": 13.549680491166521, "grad_norm": 8.153626441955566, "learning_rate": 2.8477948761080426e-06, "loss": 0.5754, "num_input_tokens_seen": 131510272, "step": 108140 }, { "epoch": 13.550306979075303, "grad_norm": 1.3201950788497925, "learning_rate": 2.8473014148402665e-06, "loss": 0.4908, "num_input_tokens_seen": 131516608, "step": 108145 }, { "epoch": 13.550933466984088, "grad_norm": 9.202651023864746, "learning_rate": 2.84680797930981e-06, "loss": 0.6381, "num_input_tokens_seen": 131522560, "step": 108150 }, { "epoch": 13.55155995489287, "grad_norm": 0.6256171464920044, "learning_rate": 2.846314569522577e-06, "loss": 0.4203, "num_input_tokens_seen": 131528480, "step": 108155 }, { "epoch": 13.552186442801654, "grad_norm": 1.2070661783218384, "learning_rate": 2.845821185484461e-06, "loss": 0.5074, "num_input_tokens_seen": 131534656, "step": 108160 }, { "epoch": 13.552812930710438, "grad_norm": 1.392500638961792, "learning_rate": 2.8453278272013646e-06, "loss": 0.4124, "num_input_tokens_seen": 131540640, "step": 108165 }, { "epoch": 13.55343941861922, "grad_norm": 1.0198687314987183, "learning_rate": 2.8448344946791873e-06, "loss": 0.4221, "num_input_tokens_seen": 131545984, "step": 108170 }, { "epoch": 13.554065906528004, "grad_norm": 0.9256555438041687, "learning_rate": 2.8443411879238237e-06, "loss": 0.4793, "num_input_tokens_seen": 131552128, "step": 108175 }, { "epoch": 13.554692394436788, "grad_norm": 1.2381335496902466, "learning_rate": 2.8438479069411736e-06, "loss": 0.4995, "num_input_tokens_seen": 131558368, "step": 108180 }, { "epoch": 13.55531888234557, "grad_norm": 1.1999683380126953, "learning_rate": 2.843354651737134e-06, "loss": 0.5279, "num_input_tokens_seen": 131564672, "step": 108185 }, { "epoch": 13.555945370254355, "grad_norm": 1.8037420511245728, "learning_rate": 2.8428614223176055e-06, "loss": 0.5566, "num_input_tokens_seen": 131570912, "step": 108190 }, { "epoch": 13.556571858163137, "grad_norm": 3.7691867351531982, "learning_rate": 2.84236821868848e-06, "loss": 0.6695, "num_input_tokens_seen": 131577152, "step": 108195 }, { "epoch": 13.557198346071921, "grad_norm": 2.706307888031006, "learning_rate": 2.8418750408556582e-06, "loss": 0.5601, "num_input_tokens_seen": 131583232, "step": 108200 }, { "epoch": 13.557824833980703, "grad_norm": 1.2598700523376465, "learning_rate": 2.8413818888250333e-06, "loss": 0.5096, "num_input_tokens_seen": 131589216, "step": 108205 }, { "epoch": 13.558451321889487, "grad_norm": 7.7266526222229, "learning_rate": 2.8408887626025046e-06, "loss": 0.4893, "num_input_tokens_seen": 131595392, "step": 108210 }, { "epoch": 13.559077809798271, "grad_norm": 1.445381999015808, "learning_rate": 2.840395662193964e-06, "loss": 0.447, "num_input_tokens_seen": 131601664, "step": 108215 }, { "epoch": 13.559704297707054, "grad_norm": 5.028985977172852, "learning_rate": 2.839902587605311e-06, "loss": 0.7068, "num_input_tokens_seen": 131607584, "step": 108220 }, { "epoch": 13.560330785615838, "grad_norm": 2.9870028495788574, "learning_rate": 2.839409538842436e-06, "loss": 0.4983, "num_input_tokens_seen": 131613696, "step": 108225 }, { "epoch": 13.560957273524622, "grad_norm": 1.3980233669281006, "learning_rate": 2.8389165159112375e-06, "loss": 0.4346, "num_input_tokens_seen": 131619936, "step": 108230 }, { "epoch": 13.561583761433404, "grad_norm": 8.043675422668457, "learning_rate": 2.83842351881761e-06, "loss": 0.5156, "num_input_tokens_seen": 131626304, "step": 108235 }, { "epoch": 13.562210249342188, "grad_norm": 7.550076007843018, "learning_rate": 2.8379305475674455e-06, "loss": 0.4886, "num_input_tokens_seen": 131632576, "step": 108240 }, { "epoch": 13.56283673725097, "grad_norm": 1.6095597743988037, "learning_rate": 2.8374376021666406e-06, "loss": 0.5169, "num_input_tokens_seen": 131638528, "step": 108245 }, { "epoch": 13.563463225159754, "grad_norm": 7.389866352081299, "learning_rate": 2.8369446826210856e-06, "loss": 0.5823, "num_input_tokens_seen": 131644736, "step": 108250 }, { "epoch": 13.564089713068539, "grad_norm": 9.319598197937012, "learning_rate": 2.8364517889366777e-06, "loss": 0.5654, "num_input_tokens_seen": 131651168, "step": 108255 }, { "epoch": 13.56471620097732, "grad_norm": 1.0248290300369263, "learning_rate": 2.8359589211193054e-06, "loss": 0.4635, "num_input_tokens_seen": 131657056, "step": 108260 }, { "epoch": 13.565342688886105, "grad_norm": 1.152691125869751, "learning_rate": 2.835466079174866e-06, "loss": 0.4444, "num_input_tokens_seen": 131663264, "step": 108265 }, { "epoch": 13.565969176794887, "grad_norm": 1.7092267274856567, "learning_rate": 2.834973263109247e-06, "loss": 0.4638, "num_input_tokens_seen": 131669440, "step": 108270 }, { "epoch": 13.566595664703671, "grad_norm": 2.828463315963745, "learning_rate": 2.8344804729283426e-06, "loss": 0.4768, "num_input_tokens_seen": 131675488, "step": 108275 }, { "epoch": 13.567222152612455, "grad_norm": 4.001436710357666, "learning_rate": 2.833987708638045e-06, "loss": 0.4546, "num_input_tokens_seen": 131681344, "step": 108280 }, { "epoch": 13.567848640521238, "grad_norm": 2.0938079357147217, "learning_rate": 2.833494970244248e-06, "loss": 0.4441, "num_input_tokens_seen": 131687520, "step": 108285 }, { "epoch": 13.568475128430022, "grad_norm": 0.903917133808136, "learning_rate": 2.833002257752838e-06, "loss": 0.4216, "num_input_tokens_seen": 131693888, "step": 108290 }, { "epoch": 13.569101616338804, "grad_norm": 1.3531993627548218, "learning_rate": 2.832509571169708e-06, "loss": 0.4343, "num_input_tokens_seen": 131699840, "step": 108295 }, { "epoch": 13.569728104247588, "grad_norm": 8.968293190002441, "learning_rate": 2.83201691050075e-06, "loss": 0.4757, "num_input_tokens_seen": 131705440, "step": 108300 }, { "epoch": 13.570354592156372, "grad_norm": 8.782008171081543, "learning_rate": 2.83152427575185e-06, "loss": 0.5193, "num_input_tokens_seen": 131711552, "step": 108305 }, { "epoch": 13.570981080065154, "grad_norm": 3.946883201599121, "learning_rate": 2.831031666928904e-06, "loss": 0.5509, "num_input_tokens_seen": 131717600, "step": 108310 }, { "epoch": 13.571607567973938, "grad_norm": 1.056261420249939, "learning_rate": 2.8305390840377943e-06, "loss": 0.4274, "num_input_tokens_seen": 131723456, "step": 108315 }, { "epoch": 13.57223405588272, "grad_norm": 1.2563292980194092, "learning_rate": 2.830046527084417e-06, "loss": 0.4961, "num_input_tokens_seen": 131729376, "step": 108320 }, { "epoch": 13.572860543791505, "grad_norm": 3.7450520992279053, "learning_rate": 2.8295539960746553e-06, "loss": 0.5493, "num_input_tokens_seen": 131735840, "step": 108325 }, { "epoch": 13.573487031700289, "grad_norm": 4.8806023597717285, "learning_rate": 2.8290614910144033e-06, "loss": 0.4467, "num_input_tokens_seen": 131741888, "step": 108330 }, { "epoch": 13.574113519609071, "grad_norm": 10.308575630187988, "learning_rate": 2.8285690119095434e-06, "loss": 0.4525, "num_input_tokens_seen": 131747840, "step": 108335 }, { "epoch": 13.574740007517855, "grad_norm": 4.074384689331055, "learning_rate": 2.8280765587659685e-06, "loss": 0.5185, "num_input_tokens_seen": 131754272, "step": 108340 }, { "epoch": 13.575366495426639, "grad_norm": 2.1117918491363525, "learning_rate": 2.827584131589563e-06, "loss": 0.4623, "num_input_tokens_seen": 131760544, "step": 108345 }, { "epoch": 13.575992983335421, "grad_norm": 1.1776344776153564, "learning_rate": 2.8270917303862177e-06, "loss": 0.4518, "num_input_tokens_seen": 131766592, "step": 108350 }, { "epoch": 13.576619471244205, "grad_norm": 1.31356680393219, "learning_rate": 2.8265993551618155e-06, "loss": 0.4673, "num_input_tokens_seen": 131772960, "step": 108355 }, { "epoch": 13.577245959152988, "grad_norm": 1.4556742906570435, "learning_rate": 2.826107005922245e-06, "loss": 0.5061, "num_input_tokens_seen": 131779296, "step": 108360 }, { "epoch": 13.577872447061772, "grad_norm": 1.9616312980651855, "learning_rate": 2.8256146826733956e-06, "loss": 0.4631, "num_input_tokens_seen": 131785664, "step": 108365 }, { "epoch": 13.578498934970556, "grad_norm": 1.0887200832366943, "learning_rate": 2.825122385421148e-06, "loss": 0.5238, "num_input_tokens_seen": 131791712, "step": 108370 }, { "epoch": 13.579125422879338, "grad_norm": 1.88418710231781, "learning_rate": 2.824630114171391e-06, "loss": 0.4632, "num_input_tokens_seen": 131797856, "step": 108375 }, { "epoch": 13.579751910788122, "grad_norm": 8.764931678771973, "learning_rate": 2.8241378689300125e-06, "loss": 0.5032, "num_input_tokens_seen": 131803808, "step": 108380 }, { "epoch": 13.580378398696904, "grad_norm": 3.930410146713257, "learning_rate": 2.8236456497028923e-06, "loss": 0.4849, "num_input_tokens_seen": 131810080, "step": 108385 }, { "epoch": 13.581004886605688, "grad_norm": 2.6043448448181152, "learning_rate": 2.8231534564959184e-06, "loss": 0.4487, "num_input_tokens_seen": 131816320, "step": 108390 }, { "epoch": 13.581631374514473, "grad_norm": 1.2344086170196533, "learning_rate": 2.8226612893149774e-06, "loss": 0.4442, "num_input_tokens_seen": 131822240, "step": 108395 }, { "epoch": 13.582257862423255, "grad_norm": 9.269856452941895, "learning_rate": 2.8221691481659496e-06, "loss": 0.4859, "num_input_tokens_seen": 131828448, "step": 108400 }, { "epoch": 13.582884350332039, "grad_norm": 1.1607688665390015, "learning_rate": 2.821677033054722e-06, "loss": 0.5588, "num_input_tokens_seen": 131834144, "step": 108405 }, { "epoch": 13.583510838240821, "grad_norm": 1.1133102178573608, "learning_rate": 2.8211849439871753e-06, "loss": 0.4669, "num_input_tokens_seen": 131840128, "step": 108410 }, { "epoch": 13.584137326149605, "grad_norm": 4.090734481811523, "learning_rate": 2.8206928809691945e-06, "loss": 0.464, "num_input_tokens_seen": 131846400, "step": 108415 }, { "epoch": 13.58476381405839, "grad_norm": 1.4982341527938843, "learning_rate": 2.8202008440066642e-06, "loss": 0.494, "num_input_tokens_seen": 131852672, "step": 108420 }, { "epoch": 13.585390301967172, "grad_norm": 1.4854894876480103, "learning_rate": 2.8197088331054633e-06, "loss": 0.4578, "num_input_tokens_seen": 131858656, "step": 108425 }, { "epoch": 13.586016789875956, "grad_norm": 2.004565477371216, "learning_rate": 2.819216848271479e-06, "loss": 0.4814, "num_input_tokens_seen": 131864256, "step": 108430 }, { "epoch": 13.586643277784738, "grad_norm": 8.439970970153809, "learning_rate": 2.818724889510589e-06, "loss": 0.5731, "num_input_tokens_seen": 131869920, "step": 108435 }, { "epoch": 13.587269765693522, "grad_norm": 1.435780644416809, "learning_rate": 2.8182329568286783e-06, "loss": 0.502, "num_input_tokens_seen": 131876096, "step": 108440 }, { "epoch": 13.587896253602306, "grad_norm": 1.1948240995407104, "learning_rate": 2.8177410502316247e-06, "loss": 0.4618, "num_input_tokens_seen": 131882336, "step": 108445 }, { "epoch": 13.588522741511088, "grad_norm": 2.9585742950439453, "learning_rate": 2.817249169725314e-06, "loss": 0.4711, "num_input_tokens_seen": 131888192, "step": 108450 }, { "epoch": 13.589149229419872, "grad_norm": 1.2524334192276, "learning_rate": 2.8167573153156224e-06, "loss": 0.4431, "num_input_tokens_seen": 131894368, "step": 108455 }, { "epoch": 13.589775717328656, "grad_norm": 1.1167281866073608, "learning_rate": 2.816265487008435e-06, "loss": 0.4695, "num_input_tokens_seen": 131900544, "step": 108460 }, { "epoch": 13.590402205237439, "grad_norm": 1.1016355752944946, "learning_rate": 2.815773684809627e-06, "loss": 0.4504, "num_input_tokens_seen": 131906848, "step": 108465 }, { "epoch": 13.591028693146223, "grad_norm": 0.7358190417289734, "learning_rate": 2.815281908725084e-06, "loss": 0.4829, "num_input_tokens_seen": 131912224, "step": 108470 }, { "epoch": 13.591655181055005, "grad_norm": 1.1117504835128784, "learning_rate": 2.8147901587606794e-06, "loss": 0.4393, "num_input_tokens_seen": 131918336, "step": 108475 }, { "epoch": 13.592281668963789, "grad_norm": 0.8242648243904114, "learning_rate": 2.814298434922297e-06, "loss": 0.5151, "num_input_tokens_seen": 131924224, "step": 108480 }, { "epoch": 13.592908156872573, "grad_norm": 5.362663269042969, "learning_rate": 2.813806737215814e-06, "loss": 0.4702, "num_input_tokens_seen": 131930336, "step": 108485 }, { "epoch": 13.593534644781355, "grad_norm": 7.757140636444092, "learning_rate": 2.8133150656471088e-06, "loss": 0.4947, "num_input_tokens_seen": 131936800, "step": 108490 }, { "epoch": 13.59416113269014, "grad_norm": 1.4860142469406128, "learning_rate": 2.8128234202220626e-06, "loss": 0.5193, "num_input_tokens_seen": 131942368, "step": 108495 }, { "epoch": 13.594787620598922, "grad_norm": 2.6919147968292236, "learning_rate": 2.8123318009465507e-06, "loss": 0.5462, "num_input_tokens_seen": 131948320, "step": 108500 }, { "epoch": 13.595414108507706, "grad_norm": 2.209078788757324, "learning_rate": 2.811840207826452e-06, "loss": 0.461, "num_input_tokens_seen": 131953984, "step": 108505 }, { "epoch": 13.59604059641649, "grad_norm": 1.9157370328903198, "learning_rate": 2.8113486408676416e-06, "loss": 0.4697, "num_input_tokens_seen": 131960416, "step": 108510 }, { "epoch": 13.596667084325272, "grad_norm": 4.150958061218262, "learning_rate": 2.8108571000760003e-06, "loss": 0.5135, "num_input_tokens_seen": 131966912, "step": 108515 }, { "epoch": 13.597293572234056, "grad_norm": 1.9692407846450806, "learning_rate": 2.810365585457401e-06, "loss": 0.4715, "num_input_tokens_seen": 131973472, "step": 108520 }, { "epoch": 13.597920060142838, "grad_norm": 1.123091220855713, "learning_rate": 2.809874097017724e-06, "loss": 0.4596, "num_input_tokens_seen": 131979360, "step": 108525 }, { "epoch": 13.598546548051623, "grad_norm": 0.9427697062492371, "learning_rate": 2.8093826347628417e-06, "loss": 0.4484, "num_input_tokens_seen": 131985408, "step": 108530 }, { "epoch": 13.599173035960407, "grad_norm": 1.7841804027557373, "learning_rate": 2.808891198698633e-06, "loss": 0.452, "num_input_tokens_seen": 131991264, "step": 108535 }, { "epoch": 13.599799523869189, "grad_norm": 1.286232590675354, "learning_rate": 2.8083997888309712e-06, "loss": 0.5033, "num_input_tokens_seen": 131997792, "step": 108540 }, { "epoch": 13.600426011777973, "grad_norm": 3.223470449447632, "learning_rate": 2.8079084051657323e-06, "loss": 0.482, "num_input_tokens_seen": 132003712, "step": 108545 }, { "epoch": 13.601052499686755, "grad_norm": 1.0037086009979248, "learning_rate": 2.8074170477087935e-06, "loss": 0.4614, "num_input_tokens_seen": 132009792, "step": 108550 }, { "epoch": 13.60167898759554, "grad_norm": 1.1465990543365479, "learning_rate": 2.8069257164660247e-06, "loss": 0.4691, "num_input_tokens_seen": 132015680, "step": 108555 }, { "epoch": 13.602305475504323, "grad_norm": 1.8946688175201416, "learning_rate": 2.8064344114433056e-06, "loss": 0.4913, "num_input_tokens_seen": 132021952, "step": 108560 }, { "epoch": 13.602931963413106, "grad_norm": 0.8932148218154907, "learning_rate": 2.805943132646505e-06, "loss": 0.4438, "num_input_tokens_seen": 132027392, "step": 108565 }, { "epoch": 13.60355845132189, "grad_norm": 2.7397890090942383, "learning_rate": 2.805451880081501e-06, "loss": 0.4768, "num_input_tokens_seen": 132033408, "step": 108570 }, { "epoch": 13.604184939230674, "grad_norm": 1.6102449893951416, "learning_rate": 2.804960653754163e-06, "loss": 0.4645, "num_input_tokens_seen": 132039712, "step": 108575 }, { "epoch": 13.604811427139456, "grad_norm": 2.000732183456421, "learning_rate": 2.8044694536703652e-06, "loss": 0.5415, "num_input_tokens_seen": 132045888, "step": 108580 }, { "epoch": 13.60543791504824, "grad_norm": 0.7030625939369202, "learning_rate": 2.8039782798359832e-06, "loss": 0.4656, "num_input_tokens_seen": 132051936, "step": 108585 }, { "epoch": 13.606064402957022, "grad_norm": 1.399980902671814, "learning_rate": 2.803487132256886e-06, "loss": 0.4925, "num_input_tokens_seen": 132058080, "step": 108590 }, { "epoch": 13.606690890865806, "grad_norm": 1.3946269750595093, "learning_rate": 2.802996010938945e-06, "loss": 0.4624, "num_input_tokens_seen": 132064352, "step": 108595 }, { "epoch": 13.60731737877459, "grad_norm": 1.646422266960144, "learning_rate": 2.8025049158880357e-06, "loss": 0.4698, "num_input_tokens_seen": 132070368, "step": 108600 }, { "epoch": 13.607943866683373, "grad_norm": 1.2071046829223633, "learning_rate": 2.8020138471100287e-06, "loss": 0.4663, "num_input_tokens_seen": 132076480, "step": 108605 }, { "epoch": 13.608570354592157, "grad_norm": 0.951473593711853, "learning_rate": 2.801522804610793e-06, "loss": 0.4425, "num_input_tokens_seen": 132082848, "step": 108610 }, { "epoch": 13.609196842500939, "grad_norm": 1.204595923423767, "learning_rate": 2.801031788396202e-06, "loss": 0.4899, "num_input_tokens_seen": 132089056, "step": 108615 }, { "epoch": 13.609823330409723, "grad_norm": 4.261934757232666, "learning_rate": 2.8005407984721223e-06, "loss": 0.4683, "num_input_tokens_seen": 132095136, "step": 108620 }, { "epoch": 13.610449818318507, "grad_norm": 1.2524477243423462, "learning_rate": 2.800049834844429e-06, "loss": 0.4695, "num_input_tokens_seen": 132101376, "step": 108625 }, { "epoch": 13.61107630622729, "grad_norm": 1.042495846748352, "learning_rate": 2.799558897518988e-06, "loss": 0.4601, "num_input_tokens_seen": 132107552, "step": 108630 }, { "epoch": 13.611702794136074, "grad_norm": 0.7335934638977051, "learning_rate": 2.799067986501672e-06, "loss": 0.449, "num_input_tokens_seen": 132114016, "step": 108635 }, { "epoch": 13.612329282044856, "grad_norm": 2.8907737731933594, "learning_rate": 2.7985771017983464e-06, "loss": 0.4484, "num_input_tokens_seen": 132120192, "step": 108640 }, { "epoch": 13.61295576995364, "grad_norm": 3.692016363143921, "learning_rate": 2.798086243414885e-06, "loss": 0.4554, "num_input_tokens_seen": 132126336, "step": 108645 }, { "epoch": 13.613582257862424, "grad_norm": 0.6319310069084167, "learning_rate": 2.797595411357151e-06, "loss": 0.4374, "num_input_tokens_seen": 132131488, "step": 108650 }, { "epoch": 13.614208745771206, "grad_norm": 1.0226726531982422, "learning_rate": 2.797104605631018e-06, "loss": 0.5046, "num_input_tokens_seen": 132137728, "step": 108655 }, { "epoch": 13.61483523367999, "grad_norm": 3.672006368637085, "learning_rate": 2.7966138262423497e-06, "loss": 0.4594, "num_input_tokens_seen": 132144448, "step": 108660 }, { "epoch": 13.615461721588773, "grad_norm": 1.0547701120376587, "learning_rate": 2.7961230731970156e-06, "loss": 0.4297, "num_input_tokens_seen": 132150944, "step": 108665 }, { "epoch": 13.616088209497557, "grad_norm": 0.6945909261703491, "learning_rate": 2.795632346500885e-06, "loss": 0.4453, "num_input_tokens_seen": 132156992, "step": 108670 }, { "epoch": 13.61671469740634, "grad_norm": 3.0586912631988525, "learning_rate": 2.795141646159821e-06, "loss": 0.5157, "num_input_tokens_seen": 132163360, "step": 108675 }, { "epoch": 13.617341185315123, "grad_norm": 1.321498155593872, "learning_rate": 2.7946509721796944e-06, "loss": 0.4613, "num_input_tokens_seen": 132169280, "step": 108680 }, { "epoch": 13.617967673223907, "grad_norm": 1.1255704164505005, "learning_rate": 2.7941603245663672e-06, "loss": 0.4523, "num_input_tokens_seen": 132175232, "step": 108685 }, { "epoch": 13.618594161132691, "grad_norm": 2.0594000816345215, "learning_rate": 2.7936697033257082e-06, "loss": 0.4847, "num_input_tokens_seen": 132181344, "step": 108690 }, { "epoch": 13.619220649041473, "grad_norm": 1.1086792945861816, "learning_rate": 2.793179108463583e-06, "loss": 0.4491, "num_input_tokens_seen": 132187392, "step": 108695 }, { "epoch": 13.619847136950257, "grad_norm": 2.520063877105713, "learning_rate": 2.7926885399858583e-06, "loss": 0.435, "num_input_tokens_seen": 132193632, "step": 108700 }, { "epoch": 13.62047362485904, "grad_norm": 1.8577560186386108, "learning_rate": 2.792197997898397e-06, "loss": 0.4605, "num_input_tokens_seen": 132199776, "step": 108705 }, { "epoch": 13.621100112767824, "grad_norm": 1.1834039688110352, "learning_rate": 2.7917074822070662e-06, "loss": 0.4416, "num_input_tokens_seen": 132206112, "step": 108710 }, { "epoch": 13.621726600676606, "grad_norm": 1.1480629444122314, "learning_rate": 2.791216992917727e-06, "loss": 0.4701, "num_input_tokens_seen": 132211840, "step": 108715 }, { "epoch": 13.62235308858539, "grad_norm": 1.0168436765670776, "learning_rate": 2.790726530036248e-06, "loss": 0.4681, "num_input_tokens_seen": 132217984, "step": 108720 }, { "epoch": 13.622979576494174, "grad_norm": 1.2815862894058228, "learning_rate": 2.7902360935684886e-06, "loss": 0.4674, "num_input_tokens_seen": 132224160, "step": 108725 }, { "epoch": 13.623606064402956, "grad_norm": 5.051223278045654, "learning_rate": 2.7897456835203145e-06, "loss": 0.4908, "num_input_tokens_seen": 132229888, "step": 108730 }, { "epoch": 13.62423255231174, "grad_norm": 1.1255601644515991, "learning_rate": 2.789255299897591e-06, "loss": 0.4398, "num_input_tokens_seen": 132235360, "step": 108735 }, { "epoch": 13.624859040220525, "grad_norm": 0.8324353098869324, "learning_rate": 2.7887649427061773e-06, "loss": 0.5284, "num_input_tokens_seen": 132241632, "step": 108740 }, { "epoch": 13.625485528129307, "grad_norm": 1.0414987802505493, "learning_rate": 2.78827461195194e-06, "loss": 0.4625, "num_input_tokens_seen": 132247392, "step": 108745 }, { "epoch": 13.62611201603809, "grad_norm": 4.988051414489746, "learning_rate": 2.787784307640737e-06, "loss": 0.5561, "num_input_tokens_seen": 132253696, "step": 108750 }, { "epoch": 13.626738503946873, "grad_norm": 0.876253068447113, "learning_rate": 2.7872940297784345e-06, "loss": 0.4642, "num_input_tokens_seen": 132259936, "step": 108755 }, { "epoch": 13.627364991855657, "grad_norm": 2.3327407836914062, "learning_rate": 2.78680377837089e-06, "loss": 0.4757, "num_input_tokens_seen": 132266080, "step": 108760 }, { "epoch": 13.627991479764441, "grad_norm": 1.2737687826156616, "learning_rate": 2.7863135534239695e-06, "loss": 0.5433, "num_input_tokens_seen": 132272032, "step": 108765 }, { "epoch": 13.628617967673224, "grad_norm": 1.261923909187317, "learning_rate": 2.7858233549435294e-06, "loss": 0.44, "num_input_tokens_seen": 132278240, "step": 108770 }, { "epoch": 13.629244455582008, "grad_norm": 1.1539115905761719, "learning_rate": 2.7853331829354345e-06, "loss": 0.4409, "num_input_tokens_seen": 132284544, "step": 108775 }, { "epoch": 13.62987094349079, "grad_norm": 0.8978973627090454, "learning_rate": 2.7848430374055413e-06, "loss": 0.4875, "num_input_tokens_seen": 132290176, "step": 108780 }, { "epoch": 13.630497431399574, "grad_norm": 3.6376006603240967, "learning_rate": 2.784352918359713e-06, "loss": 0.4514, "num_input_tokens_seen": 132295872, "step": 108785 }, { "epoch": 13.631123919308358, "grad_norm": 4.694334030151367, "learning_rate": 2.783862825803809e-06, "loss": 0.4523, "num_input_tokens_seen": 132302080, "step": 108790 }, { "epoch": 13.63175040721714, "grad_norm": 0.8971593379974365, "learning_rate": 2.783372759743687e-06, "loss": 0.4566, "num_input_tokens_seen": 132308096, "step": 108795 }, { "epoch": 13.632376895125924, "grad_norm": 3.28192400932312, "learning_rate": 2.782882720185207e-06, "loss": 0.5312, "num_input_tokens_seen": 132314112, "step": 108800 }, { "epoch": 13.633003383034707, "grad_norm": 2.156179189682007, "learning_rate": 2.782392707134228e-06, "loss": 0.4421, "num_input_tokens_seen": 132320384, "step": 108805 }, { "epoch": 13.63362987094349, "grad_norm": 2.4683220386505127, "learning_rate": 2.7819027205966107e-06, "loss": 0.475, "num_input_tokens_seen": 132326208, "step": 108810 }, { "epoch": 13.634256358852275, "grad_norm": 3.706390380859375, "learning_rate": 2.78141276057821e-06, "loss": 0.4853, "num_input_tokens_seen": 132332288, "step": 108815 }, { "epoch": 13.634882846761057, "grad_norm": 0.9310105443000793, "learning_rate": 2.7809228270848865e-06, "loss": 0.476, "num_input_tokens_seen": 132338528, "step": 108820 }, { "epoch": 13.635509334669841, "grad_norm": 1.2514050006866455, "learning_rate": 2.7804329201224946e-06, "loss": 0.4664, "num_input_tokens_seen": 132344480, "step": 108825 }, { "epoch": 13.636135822578623, "grad_norm": 0.9248504638671875, "learning_rate": 2.779943039696894e-06, "loss": 0.4678, "num_input_tokens_seen": 132350208, "step": 108830 }, { "epoch": 13.636762310487407, "grad_norm": 1.1157355308532715, "learning_rate": 2.77945318581394e-06, "loss": 0.4454, "num_input_tokens_seen": 132355904, "step": 108835 }, { "epoch": 13.637388798396191, "grad_norm": 0.9568939208984375, "learning_rate": 2.7789633584794916e-06, "loss": 0.471, "num_input_tokens_seen": 132361920, "step": 108840 }, { "epoch": 13.638015286304974, "grad_norm": 0.8081312775611877, "learning_rate": 2.7784735576994026e-06, "loss": 0.463, "num_input_tokens_seen": 132367968, "step": 108845 }, { "epoch": 13.638641774213758, "grad_norm": 2.415900230407715, "learning_rate": 2.7779837834795287e-06, "loss": 0.4737, "num_input_tokens_seen": 132374176, "step": 108850 }, { "epoch": 13.639268262122542, "grad_norm": 1.0379782915115356, "learning_rate": 2.77749403582573e-06, "loss": 0.448, "num_input_tokens_seen": 132380256, "step": 108855 }, { "epoch": 13.639894750031324, "grad_norm": 1.0833892822265625, "learning_rate": 2.777004314743856e-06, "loss": 0.4502, "num_input_tokens_seen": 132386304, "step": 108860 }, { "epoch": 13.640521237940108, "grad_norm": 1.0975695848464966, "learning_rate": 2.7765146202397663e-06, "loss": 0.4664, "num_input_tokens_seen": 132391712, "step": 108865 }, { "epoch": 13.64114772584889, "grad_norm": 1.0422708988189697, "learning_rate": 2.7760249523193116e-06, "loss": 0.4629, "num_input_tokens_seen": 132397952, "step": 108870 }, { "epoch": 13.641774213757675, "grad_norm": 5.240686893463135, "learning_rate": 2.775535310988351e-06, "loss": 0.4998, "num_input_tokens_seen": 132404096, "step": 108875 }, { "epoch": 13.642400701666459, "grad_norm": 3.112051486968994, "learning_rate": 2.7750456962527337e-06, "loss": 0.4887, "num_input_tokens_seen": 132409920, "step": 108880 }, { "epoch": 13.64302718957524, "grad_norm": 5.791599750518799, "learning_rate": 2.7745561081183175e-06, "loss": 0.5604, "num_input_tokens_seen": 132415840, "step": 108885 }, { "epoch": 13.643653677484025, "grad_norm": 2.6729631423950195, "learning_rate": 2.7740665465909523e-06, "loss": 0.4752, "num_input_tokens_seen": 132421664, "step": 108890 }, { "epoch": 13.644280165392807, "grad_norm": 3.0744266510009766, "learning_rate": 2.773577011676493e-06, "loss": 0.4883, "num_input_tokens_seen": 132428096, "step": 108895 }, { "epoch": 13.644906653301591, "grad_norm": 0.7168944478034973, "learning_rate": 2.7730875033807924e-06, "loss": 0.436, "num_input_tokens_seen": 132434208, "step": 108900 }, { "epoch": 13.645533141210375, "grad_norm": 1.1774934530258179, "learning_rate": 2.7725980217097046e-06, "loss": 0.4508, "num_input_tokens_seen": 132440416, "step": 108905 }, { "epoch": 13.646159629119158, "grad_norm": 1.3170422315597534, "learning_rate": 2.772108566669079e-06, "loss": 0.5204, "num_input_tokens_seen": 132446496, "step": 108910 }, { "epoch": 13.646786117027942, "grad_norm": 3.3300771713256836, "learning_rate": 2.7716191382647686e-06, "loss": 0.5261, "num_input_tokens_seen": 132452736, "step": 108915 }, { "epoch": 13.647412604936724, "grad_norm": 4.430502414703369, "learning_rate": 2.771129736502626e-06, "loss": 0.5401, "num_input_tokens_seen": 132458624, "step": 108920 }, { "epoch": 13.648039092845508, "grad_norm": 1.0918511152267456, "learning_rate": 2.7706403613884997e-06, "loss": 0.4825, "num_input_tokens_seen": 132464576, "step": 108925 }, { "epoch": 13.648665580754292, "grad_norm": 5.579526424407959, "learning_rate": 2.7701510129282443e-06, "loss": 0.4743, "num_input_tokens_seen": 132470816, "step": 108930 }, { "epoch": 13.649292068663074, "grad_norm": 1.9670991897583008, "learning_rate": 2.7696616911277063e-06, "loss": 0.4474, "num_input_tokens_seen": 132477056, "step": 108935 }, { "epoch": 13.649918556571858, "grad_norm": 1.8921799659729004, "learning_rate": 2.7691723959927396e-06, "loss": 0.4879, "num_input_tokens_seen": 132483104, "step": 108940 }, { "epoch": 13.65054504448064, "grad_norm": 0.9036088585853577, "learning_rate": 2.7686831275291916e-06, "loss": 0.4831, "num_input_tokens_seen": 132488640, "step": 108945 }, { "epoch": 13.651171532389425, "grad_norm": 0.8762128949165344, "learning_rate": 2.7681938857429137e-06, "loss": 0.4857, "num_input_tokens_seen": 132494752, "step": 108950 }, { "epoch": 13.651798020298209, "grad_norm": 1.0538314580917358, "learning_rate": 2.7677046706397525e-06, "loss": 0.4544, "num_input_tokens_seen": 132500896, "step": 108955 }, { "epoch": 13.652424508206991, "grad_norm": 1.2164515256881714, "learning_rate": 2.767215482225561e-06, "loss": 0.4739, "num_input_tokens_seen": 132506976, "step": 108960 }, { "epoch": 13.653050996115775, "grad_norm": 2.4701502323150635, "learning_rate": 2.7667263205061834e-06, "loss": 0.4932, "num_input_tokens_seen": 132513088, "step": 108965 }, { "epoch": 13.65367748402456, "grad_norm": 1.3068774938583374, "learning_rate": 2.7662371854874725e-06, "loss": 0.4465, "num_input_tokens_seen": 132518496, "step": 108970 }, { "epoch": 13.654303971933341, "grad_norm": 1.2618886232376099, "learning_rate": 2.765748077175272e-06, "loss": 0.4652, "num_input_tokens_seen": 132524704, "step": 108975 }, { "epoch": 13.654930459842126, "grad_norm": 0.8877208232879639, "learning_rate": 2.7652589955754314e-06, "loss": 0.456, "num_input_tokens_seen": 132530592, "step": 108980 }, { "epoch": 13.655556947750908, "grad_norm": 2.6861190795898438, "learning_rate": 2.7647699406938013e-06, "loss": 0.4531, "num_input_tokens_seen": 132536416, "step": 108985 }, { "epoch": 13.656183435659692, "grad_norm": 1.0801042318344116, "learning_rate": 2.7642809125362224e-06, "loss": 0.4683, "num_input_tokens_seen": 132542944, "step": 108990 }, { "epoch": 13.656809923568476, "grad_norm": 4.105196952819824, "learning_rate": 2.763791911108546e-06, "loss": 0.4685, "num_input_tokens_seen": 132549152, "step": 108995 }, { "epoch": 13.657436411477258, "grad_norm": 1.0050115585327148, "learning_rate": 2.7633029364166186e-06, "loss": 0.4415, "num_input_tokens_seen": 132555232, "step": 109000 }, { "epoch": 13.658062899386042, "grad_norm": 1.6553659439086914, "learning_rate": 2.762813988466284e-06, "loss": 0.4675, "num_input_tokens_seen": 132561120, "step": 109005 }, { "epoch": 13.658689387294825, "grad_norm": 5.908102512359619, "learning_rate": 2.7623250672633885e-06, "loss": 0.502, "num_input_tokens_seen": 132567200, "step": 109010 }, { "epoch": 13.659315875203609, "grad_norm": 1.0061885118484497, "learning_rate": 2.7618361728137804e-06, "loss": 0.4473, "num_input_tokens_seen": 132573344, "step": 109015 }, { "epoch": 13.659942363112393, "grad_norm": 8.851333618164062, "learning_rate": 2.761347305123301e-06, "loss": 0.4839, "num_input_tokens_seen": 132579392, "step": 109020 }, { "epoch": 13.660568851021175, "grad_norm": 7.184745788574219, "learning_rate": 2.7608584641977977e-06, "loss": 0.474, "num_input_tokens_seen": 132585600, "step": 109025 }, { "epoch": 13.661195338929959, "grad_norm": 3.9521331787109375, "learning_rate": 2.760369650043113e-06, "loss": 0.5542, "num_input_tokens_seen": 132592000, "step": 109030 }, { "epoch": 13.661821826838741, "grad_norm": 6.5556769371032715, "learning_rate": 2.7598808626650914e-06, "loss": 0.4616, "num_input_tokens_seen": 132598048, "step": 109035 }, { "epoch": 13.662448314747525, "grad_norm": 1.268480658531189, "learning_rate": 2.7593921020695797e-06, "loss": 0.5104, "num_input_tokens_seen": 132603936, "step": 109040 }, { "epoch": 13.66307480265631, "grad_norm": 0.8805636763572693, "learning_rate": 2.758903368262417e-06, "loss": 0.427, "num_input_tokens_seen": 132609504, "step": 109045 }, { "epoch": 13.663701290565092, "grad_norm": 1.15036940574646, "learning_rate": 2.758414661249451e-06, "loss": 0.4684, "num_input_tokens_seen": 132615648, "step": 109050 }, { "epoch": 13.664327778473876, "grad_norm": 4.994114398956299, "learning_rate": 2.757925981036521e-06, "loss": 0.4767, "num_input_tokens_seen": 132621888, "step": 109055 }, { "epoch": 13.664954266382658, "grad_norm": 0.9020048975944519, "learning_rate": 2.7574373276294717e-06, "loss": 0.5037, "num_input_tokens_seen": 132628064, "step": 109060 }, { "epoch": 13.665580754291442, "grad_norm": 4.275696277618408, "learning_rate": 2.7569487010341432e-06, "loss": 0.5205, "num_input_tokens_seen": 132634144, "step": 109065 }, { "epoch": 13.666207242200226, "grad_norm": 3.937230110168457, "learning_rate": 2.7564601012563806e-06, "loss": 0.5091, "num_input_tokens_seen": 132640576, "step": 109070 }, { "epoch": 13.666833730109008, "grad_norm": 0.8964629173278809, "learning_rate": 2.7559715283020227e-06, "loss": 0.4625, "num_input_tokens_seen": 132646368, "step": 109075 }, { "epoch": 13.667460218017792, "grad_norm": 1.2588388919830322, "learning_rate": 2.7554829821769126e-06, "loss": 0.4709, "num_input_tokens_seen": 132652224, "step": 109080 }, { "epoch": 13.668086705926576, "grad_norm": 5.059631824493408, "learning_rate": 2.7549944628868896e-06, "loss": 0.52, "num_input_tokens_seen": 132657888, "step": 109085 }, { "epoch": 13.668713193835359, "grad_norm": 3.6587653160095215, "learning_rate": 2.7545059704377965e-06, "loss": 0.4697, "num_input_tokens_seen": 132664064, "step": 109090 }, { "epoch": 13.669339681744143, "grad_norm": 1.0917127132415771, "learning_rate": 2.7540175048354707e-06, "loss": 0.5036, "num_input_tokens_seen": 132670080, "step": 109095 }, { "epoch": 13.669966169652925, "grad_norm": 2.175283432006836, "learning_rate": 2.7535290660857538e-06, "loss": 0.4809, "num_input_tokens_seen": 132675808, "step": 109100 }, { "epoch": 13.67059265756171, "grad_norm": 1.428491473197937, "learning_rate": 2.753040654194486e-06, "loss": 0.486, "num_input_tokens_seen": 132681824, "step": 109105 }, { "epoch": 13.671219145470493, "grad_norm": 0.9236899018287659, "learning_rate": 2.752552269167506e-06, "loss": 0.4707, "num_input_tokens_seen": 132687552, "step": 109110 }, { "epoch": 13.671845633379275, "grad_norm": 0.7432823777198792, "learning_rate": 2.752063911010655e-06, "loss": 0.4517, "num_input_tokens_seen": 132693984, "step": 109115 }, { "epoch": 13.67247212128806, "grad_norm": 1.0720077753067017, "learning_rate": 2.751575579729768e-06, "loss": 0.4356, "num_input_tokens_seen": 132700384, "step": 109120 }, { "epoch": 13.673098609196842, "grad_norm": 0.9678235650062561, "learning_rate": 2.751087275330688e-06, "loss": 0.4468, "num_input_tokens_seen": 132706432, "step": 109125 }, { "epoch": 13.673725097105626, "grad_norm": 1.0667394399642944, "learning_rate": 2.7505989978192478e-06, "loss": 0.4464, "num_input_tokens_seen": 132712416, "step": 109130 }, { "epoch": 13.67435158501441, "grad_norm": 1.3899109363555908, "learning_rate": 2.75011074720129e-06, "loss": 0.4443, "num_input_tokens_seen": 132718496, "step": 109135 }, { "epoch": 13.674978072923192, "grad_norm": 0.736270010471344, "learning_rate": 2.749622523482648e-06, "loss": 0.4613, "num_input_tokens_seen": 132724864, "step": 109140 }, { "epoch": 13.675604560831976, "grad_norm": 1.5142796039581299, "learning_rate": 2.749134326669162e-06, "loss": 0.4588, "num_input_tokens_seen": 132731008, "step": 109145 }, { "epoch": 13.676231048740759, "grad_norm": 2.195509910583496, "learning_rate": 2.7486461567666656e-06, "loss": 0.4688, "num_input_tokens_seen": 132736832, "step": 109150 }, { "epoch": 13.676857536649543, "grad_norm": 3.6959214210510254, "learning_rate": 2.748158013781e-06, "loss": 0.4768, "num_input_tokens_seen": 132742976, "step": 109155 }, { "epoch": 13.677484024558327, "grad_norm": 0.705730676651001, "learning_rate": 2.747669897717996e-06, "loss": 0.4701, "num_input_tokens_seen": 132748960, "step": 109160 }, { "epoch": 13.678110512467109, "grad_norm": 1.060465931892395, "learning_rate": 2.7471818085834924e-06, "loss": 0.444, "num_input_tokens_seen": 132754976, "step": 109165 }, { "epoch": 13.678737000375893, "grad_norm": 0.8805150389671326, "learning_rate": 2.7466937463833267e-06, "loss": 0.4503, "num_input_tokens_seen": 132761088, "step": 109170 }, { "epoch": 13.679363488284675, "grad_norm": 1.0722243785858154, "learning_rate": 2.7462057111233286e-06, "loss": 0.4517, "num_input_tokens_seen": 132767200, "step": 109175 }, { "epoch": 13.67998997619346, "grad_norm": 0.7936142683029175, "learning_rate": 2.7457177028093386e-06, "loss": 0.4674, "num_input_tokens_seen": 132773536, "step": 109180 }, { "epoch": 13.680616464102243, "grad_norm": 1.1516673564910889, "learning_rate": 2.745229721447187e-06, "loss": 0.4457, "num_input_tokens_seen": 132779648, "step": 109185 }, { "epoch": 13.681242952011026, "grad_norm": 0.8972444534301758, "learning_rate": 2.744741767042712e-06, "loss": 0.4628, "num_input_tokens_seen": 132785632, "step": 109190 }, { "epoch": 13.68186943991981, "grad_norm": 4.376847743988037, "learning_rate": 2.7442538396017425e-06, "loss": 0.4622, "num_input_tokens_seen": 132791104, "step": 109195 }, { "epoch": 13.682495927828594, "grad_norm": 0.8258553743362427, "learning_rate": 2.7437659391301154e-06, "loss": 0.4658, "num_input_tokens_seen": 132796480, "step": 109200 }, { "epoch": 13.683122415737376, "grad_norm": 4.049808502197266, "learning_rate": 2.7432780656336654e-06, "loss": 0.4591, "num_input_tokens_seen": 132802912, "step": 109205 }, { "epoch": 13.68374890364616, "grad_norm": 0.7553808689117432, "learning_rate": 2.7427902191182217e-06, "loss": 0.4657, "num_input_tokens_seen": 132809248, "step": 109210 }, { "epoch": 13.684375391554942, "grad_norm": 3.7069220542907715, "learning_rate": 2.7423023995896183e-06, "loss": 0.452, "num_input_tokens_seen": 132815040, "step": 109215 }, { "epoch": 13.685001879463726, "grad_norm": 2.238187551498413, "learning_rate": 2.741814607053688e-06, "loss": 0.4578, "num_input_tokens_seen": 132821120, "step": 109220 }, { "epoch": 13.68562836737251, "grad_norm": 6.94687032699585, "learning_rate": 2.7413268415162642e-06, "loss": 0.474, "num_input_tokens_seen": 132827424, "step": 109225 }, { "epoch": 13.686254855281293, "grad_norm": 5.880433082580566, "learning_rate": 2.740839102983176e-06, "loss": 0.4608, "num_input_tokens_seen": 132833536, "step": 109230 }, { "epoch": 13.686881343190077, "grad_norm": 4.682766437530518, "learning_rate": 2.740351391460257e-06, "loss": 0.4727, "num_input_tokens_seen": 132839296, "step": 109235 }, { "epoch": 13.68750783109886, "grad_norm": 1.1714938879013062, "learning_rate": 2.7398637069533352e-06, "loss": 0.4664, "num_input_tokens_seen": 132845664, "step": 109240 }, { "epoch": 13.688134319007643, "grad_norm": 0.8848634958267212, "learning_rate": 2.7393760494682447e-06, "loss": 0.5079, "num_input_tokens_seen": 132851968, "step": 109245 }, { "epoch": 13.688760806916427, "grad_norm": 1.4563584327697754, "learning_rate": 2.738888419010812e-06, "loss": 0.4816, "num_input_tokens_seen": 132857856, "step": 109250 }, { "epoch": 13.68938729482521, "grad_norm": 1.4374264478683472, "learning_rate": 2.7384008155868723e-06, "loss": 0.4482, "num_input_tokens_seen": 132864096, "step": 109255 }, { "epoch": 13.690013782733994, "grad_norm": 4.9673075675964355, "learning_rate": 2.7379132392022494e-06, "loss": 0.4609, "num_input_tokens_seen": 132869824, "step": 109260 }, { "epoch": 13.690640270642776, "grad_norm": 1.7073177099227905, "learning_rate": 2.7374256898627778e-06, "loss": 0.4332, "num_input_tokens_seen": 132876096, "step": 109265 }, { "epoch": 13.69126675855156, "grad_norm": 0.8512271046638489, "learning_rate": 2.7369381675742825e-06, "loss": 0.4502, "num_input_tokens_seen": 132882240, "step": 109270 }, { "epoch": 13.691893246460344, "grad_norm": 2.3340556621551514, "learning_rate": 2.7364506723425954e-06, "loss": 0.4527, "num_input_tokens_seen": 132888448, "step": 109275 }, { "epoch": 13.692519734369126, "grad_norm": 1.6121296882629395, "learning_rate": 2.7359632041735427e-06, "loss": 0.5108, "num_input_tokens_seen": 132894752, "step": 109280 }, { "epoch": 13.69314622227791, "grad_norm": 2.211361885070801, "learning_rate": 2.735475763072953e-06, "loss": 0.5001, "num_input_tokens_seen": 132900768, "step": 109285 }, { "epoch": 13.693772710186693, "grad_norm": 2.5170722007751465, "learning_rate": 2.734988349046656e-06, "loss": 0.4759, "num_input_tokens_seen": 132907072, "step": 109290 }, { "epoch": 13.694399198095477, "grad_norm": 3.9350082874298096, "learning_rate": 2.734500962100476e-06, "loss": 0.4735, "num_input_tokens_seen": 132912832, "step": 109295 }, { "epoch": 13.69502568600426, "grad_norm": 1.0343163013458252, "learning_rate": 2.7340136022402435e-06, "loss": 0.5003, "num_input_tokens_seen": 132918720, "step": 109300 }, { "epoch": 13.695652173913043, "grad_norm": 0.7981060743331909, "learning_rate": 2.7335262694717817e-06, "loss": 0.4364, "num_input_tokens_seen": 132925088, "step": 109305 }, { "epoch": 13.696278661821827, "grad_norm": 1.046554684638977, "learning_rate": 2.733038963800919e-06, "loss": 0.4796, "num_input_tokens_seen": 132931200, "step": 109310 }, { "epoch": 13.696905149730611, "grad_norm": 1.846826434135437, "learning_rate": 2.732551685233481e-06, "loss": 0.4716, "num_input_tokens_seen": 132937088, "step": 109315 }, { "epoch": 13.697531637639393, "grad_norm": 0.9071036577224731, "learning_rate": 2.7320644337752966e-06, "loss": 0.4806, "num_input_tokens_seen": 132943488, "step": 109320 }, { "epoch": 13.698158125548177, "grad_norm": 2.0399487018585205, "learning_rate": 2.7315772094321863e-06, "loss": 0.4573, "num_input_tokens_seen": 132949664, "step": 109325 }, { "epoch": 13.69878461345696, "grad_norm": 0.7004787921905518, "learning_rate": 2.7310900122099792e-06, "loss": 0.4714, "num_input_tokens_seen": 132955680, "step": 109330 }, { "epoch": 13.699411101365744, "grad_norm": 8.39739990234375, "learning_rate": 2.7306028421144974e-06, "loss": 0.4682, "num_input_tokens_seen": 132962016, "step": 109335 }, { "epoch": 13.700037589274526, "grad_norm": 7.238790512084961, "learning_rate": 2.7301156991515687e-06, "loss": 0.5159, "num_input_tokens_seen": 132967520, "step": 109340 }, { "epoch": 13.70066407718331, "grad_norm": 7.589674472808838, "learning_rate": 2.7296285833270124e-06, "loss": 0.5119, "num_input_tokens_seen": 132973824, "step": 109345 }, { "epoch": 13.701290565092094, "grad_norm": 0.737282395362854, "learning_rate": 2.7291414946466552e-06, "loss": 0.4711, "num_input_tokens_seen": 132979616, "step": 109350 }, { "epoch": 13.701917053000876, "grad_norm": 0.9549614787101746, "learning_rate": 2.7286544331163235e-06, "loss": 0.4452, "num_input_tokens_seen": 132985600, "step": 109355 }, { "epoch": 13.70254354090966, "grad_norm": 1.1845617294311523, "learning_rate": 2.7281673987418354e-06, "loss": 0.4652, "num_input_tokens_seen": 132991712, "step": 109360 }, { "epoch": 13.703170028818445, "grad_norm": 0.7308906316757202, "learning_rate": 2.7276803915290185e-06, "loss": 0.4409, "num_input_tokens_seen": 132997568, "step": 109365 }, { "epoch": 13.703796516727227, "grad_norm": 0.9272662997245789, "learning_rate": 2.72719341148369e-06, "loss": 0.4498, "num_input_tokens_seen": 133003744, "step": 109370 }, { "epoch": 13.704423004636011, "grad_norm": 0.9897448420524597, "learning_rate": 2.7267064586116776e-06, "loss": 0.4551, "num_input_tokens_seen": 133009472, "step": 109375 }, { "epoch": 13.705049492544793, "grad_norm": 2.76631498336792, "learning_rate": 2.726219532918799e-06, "loss": 0.4737, "num_input_tokens_seen": 133015168, "step": 109380 }, { "epoch": 13.705675980453577, "grad_norm": 1.452497124671936, "learning_rate": 2.725732634410879e-06, "loss": 0.4684, "num_input_tokens_seen": 133021248, "step": 109385 }, { "epoch": 13.706302468362361, "grad_norm": 0.8462126851081848, "learning_rate": 2.7252457630937355e-06, "loss": 0.4497, "num_input_tokens_seen": 133027360, "step": 109390 }, { "epoch": 13.706928956271144, "grad_norm": 1.0411702394485474, "learning_rate": 2.724758918973194e-06, "loss": 0.4799, "num_input_tokens_seen": 133033632, "step": 109395 }, { "epoch": 13.707555444179928, "grad_norm": 3.423304319381714, "learning_rate": 2.7242721020550704e-06, "loss": 0.4761, "num_input_tokens_seen": 133039904, "step": 109400 }, { "epoch": 13.70818193208871, "grad_norm": 0.8355057835578918, "learning_rate": 2.7237853123451873e-06, "loss": 0.4638, "num_input_tokens_seen": 133045344, "step": 109405 }, { "epoch": 13.708808419997494, "grad_norm": 0.8857468366622925, "learning_rate": 2.723298549849364e-06, "loss": 0.4507, "num_input_tokens_seen": 133051488, "step": 109410 }, { "epoch": 13.709434907906278, "grad_norm": 3.352915048599243, "learning_rate": 2.722811814573423e-06, "loss": 0.4414, "num_input_tokens_seen": 133058176, "step": 109415 }, { "epoch": 13.71006139581506, "grad_norm": 1.1474034786224365, "learning_rate": 2.722325106523179e-06, "loss": 0.4644, "num_input_tokens_seen": 133064224, "step": 109420 }, { "epoch": 13.710687883723844, "grad_norm": 3.222097158432007, "learning_rate": 2.721838425704454e-06, "loss": 0.4582, "num_input_tokens_seen": 133070240, "step": 109425 }, { "epoch": 13.711314371632627, "grad_norm": 0.7674055695533752, "learning_rate": 2.7213517721230674e-06, "loss": 0.4458, "num_input_tokens_seen": 133076416, "step": 109430 }, { "epoch": 13.71194085954141, "grad_norm": 0.857365071773529, "learning_rate": 2.7208651457848346e-06, "loss": 0.4431, "num_input_tokens_seen": 133082592, "step": 109435 }, { "epoch": 13.712567347450195, "grad_norm": 4.250764846801758, "learning_rate": 2.720378546695577e-06, "loss": 0.5369, "num_input_tokens_seen": 133088768, "step": 109440 }, { "epoch": 13.713193835358977, "grad_norm": 0.8441844582557678, "learning_rate": 2.7198919748611076e-06, "loss": 0.4866, "num_input_tokens_seen": 133094848, "step": 109445 }, { "epoch": 13.713820323267761, "grad_norm": 0.7409499883651733, "learning_rate": 2.71940543028725e-06, "loss": 0.4707, "num_input_tokens_seen": 133100992, "step": 109450 }, { "epoch": 13.714446811176543, "grad_norm": 0.826957106590271, "learning_rate": 2.718918912979815e-06, "loss": 0.4623, "num_input_tokens_seen": 133106784, "step": 109455 }, { "epoch": 13.715073299085327, "grad_norm": 1.4706473350524902, "learning_rate": 2.7184324229446245e-06, "loss": 0.4342, "num_input_tokens_seen": 133112544, "step": 109460 }, { "epoch": 13.715699786994112, "grad_norm": 1.5432991981506348, "learning_rate": 2.7179459601874904e-06, "loss": 0.4509, "num_input_tokens_seen": 133118752, "step": 109465 }, { "epoch": 13.716326274902894, "grad_norm": 2.4584007263183594, "learning_rate": 2.717459524714231e-06, "loss": 0.4652, "num_input_tokens_seen": 133124704, "step": 109470 }, { "epoch": 13.716952762811678, "grad_norm": 1.5256686210632324, "learning_rate": 2.7169731165306643e-06, "loss": 0.4528, "num_input_tokens_seen": 133130976, "step": 109475 }, { "epoch": 13.717579250720462, "grad_norm": 1.370083212852478, "learning_rate": 2.7164867356426016e-06, "loss": 0.4529, "num_input_tokens_seen": 133136704, "step": 109480 }, { "epoch": 13.718205738629244, "grad_norm": 1.9137892723083496, "learning_rate": 2.7160003820558616e-06, "loss": 0.461, "num_input_tokens_seen": 133142496, "step": 109485 }, { "epoch": 13.718832226538028, "grad_norm": 0.6882033944129944, "learning_rate": 2.7155140557762557e-06, "loss": 0.4805, "num_input_tokens_seen": 133148864, "step": 109490 }, { "epoch": 13.71945871444681, "grad_norm": 0.9699307084083557, "learning_rate": 2.715027756809602e-06, "loss": 0.4393, "num_input_tokens_seen": 133154880, "step": 109495 }, { "epoch": 13.720085202355595, "grad_norm": 0.7750782370567322, "learning_rate": 2.7145414851617096e-06, "loss": 0.4581, "num_input_tokens_seen": 133161216, "step": 109500 }, { "epoch": 13.720711690264379, "grad_norm": 4.26488733291626, "learning_rate": 2.714055240838398e-06, "loss": 0.4673, "num_input_tokens_seen": 133167264, "step": 109505 }, { "epoch": 13.721338178173161, "grad_norm": 2.3616955280303955, "learning_rate": 2.713569023845476e-06, "loss": 0.456, "num_input_tokens_seen": 133172992, "step": 109510 }, { "epoch": 13.721964666081945, "grad_norm": 0.5441784858703613, "learning_rate": 2.7130828341887584e-06, "loss": 0.4615, "num_input_tokens_seen": 133179040, "step": 109515 }, { "epoch": 13.722591153990727, "grad_norm": 0.868261456489563, "learning_rate": 2.7125966718740584e-06, "loss": 0.4258, "num_input_tokens_seen": 133185728, "step": 109520 }, { "epoch": 13.723217641899511, "grad_norm": 1.3556187152862549, "learning_rate": 2.71211053690719e-06, "loss": 0.4564, "num_input_tokens_seen": 133191584, "step": 109525 }, { "epoch": 13.723844129808295, "grad_norm": 1.249926209449768, "learning_rate": 2.7116244292939615e-06, "loss": 0.4756, "num_input_tokens_seen": 133197632, "step": 109530 }, { "epoch": 13.724470617717078, "grad_norm": 1.1528770923614502, "learning_rate": 2.7111383490401873e-06, "loss": 0.4436, "num_input_tokens_seen": 133203776, "step": 109535 }, { "epoch": 13.725097105625862, "grad_norm": 1.520321249961853, "learning_rate": 2.71065229615168e-06, "loss": 0.475, "num_input_tokens_seen": 133210208, "step": 109540 }, { "epoch": 13.725723593534644, "grad_norm": 6.3824310302734375, "learning_rate": 2.710166270634248e-06, "loss": 0.485, "num_input_tokens_seen": 133216224, "step": 109545 }, { "epoch": 13.726350081443428, "grad_norm": 0.5511623024940491, "learning_rate": 2.7096802724937055e-06, "loss": 0.4467, "num_input_tokens_seen": 133221856, "step": 109550 }, { "epoch": 13.726976569352212, "grad_norm": 4.782314300537109, "learning_rate": 2.7091943017358584e-06, "loss": 0.4921, "num_input_tokens_seen": 133227968, "step": 109555 }, { "epoch": 13.727603057260994, "grad_norm": 7.7545905113220215, "learning_rate": 2.7087083583665218e-06, "loss": 0.4548, "num_input_tokens_seen": 133234080, "step": 109560 }, { "epoch": 13.728229545169778, "grad_norm": 1.2924410104751587, "learning_rate": 2.708222442391501e-06, "loss": 0.4407, "num_input_tokens_seen": 133240576, "step": 109565 }, { "epoch": 13.72885603307856, "grad_norm": 2.4407622814178467, "learning_rate": 2.70773655381661e-06, "loss": 0.4597, "num_input_tokens_seen": 133246848, "step": 109570 }, { "epoch": 13.729482520987345, "grad_norm": 1.015674352645874, "learning_rate": 2.7072506926476534e-06, "loss": 0.4913, "num_input_tokens_seen": 133253344, "step": 109575 }, { "epoch": 13.730109008896129, "grad_norm": 2.475956916809082, "learning_rate": 2.7067648588904437e-06, "loss": 0.4874, "num_input_tokens_seen": 133259680, "step": 109580 }, { "epoch": 13.730735496804911, "grad_norm": 3.3726022243499756, "learning_rate": 2.7062790525507875e-06, "loss": 0.5696, "num_input_tokens_seen": 133265792, "step": 109585 }, { "epoch": 13.731361984713695, "grad_norm": 1.1150531768798828, "learning_rate": 2.7057932736344925e-06, "loss": 0.4746, "num_input_tokens_seen": 133271776, "step": 109590 }, { "epoch": 13.73198847262248, "grad_norm": 4.9125447273254395, "learning_rate": 2.705307522147371e-06, "loss": 0.4819, "num_input_tokens_seen": 133278112, "step": 109595 }, { "epoch": 13.732614960531262, "grad_norm": 0.8638976812362671, "learning_rate": 2.704821798095225e-06, "loss": 0.4419, "num_input_tokens_seen": 133284288, "step": 109600 }, { "epoch": 13.733241448440046, "grad_norm": 0.9805962443351746, "learning_rate": 2.7043361014838655e-06, "loss": 0.454, "num_input_tokens_seen": 133290272, "step": 109605 }, { "epoch": 13.733867936348828, "grad_norm": 0.6979625225067139, "learning_rate": 2.703850432319096e-06, "loss": 0.5231, "num_input_tokens_seen": 133296256, "step": 109610 }, { "epoch": 13.734494424257612, "grad_norm": 4.839804172515869, "learning_rate": 2.703364790606726e-06, "loss": 0.4471, "num_input_tokens_seen": 133302528, "step": 109615 }, { "epoch": 13.735120912166396, "grad_norm": 1.6045912504196167, "learning_rate": 2.702879176352562e-06, "loss": 0.5285, "num_input_tokens_seen": 133308672, "step": 109620 }, { "epoch": 13.735747400075178, "grad_norm": 5.532922744750977, "learning_rate": 2.702393589562407e-06, "loss": 0.4968, "num_input_tokens_seen": 133314336, "step": 109625 }, { "epoch": 13.736373887983962, "grad_norm": 0.6221693754196167, "learning_rate": 2.7019080302420685e-06, "loss": 0.464, "num_input_tokens_seen": 133320544, "step": 109630 }, { "epoch": 13.737000375892745, "grad_norm": 2.1365721225738525, "learning_rate": 2.7014224983973535e-06, "loss": 0.4841, "num_input_tokens_seen": 133326368, "step": 109635 }, { "epoch": 13.737626863801529, "grad_norm": 4.574345111846924, "learning_rate": 2.700936994034063e-06, "loss": 0.4512, "num_input_tokens_seen": 133332480, "step": 109640 }, { "epoch": 13.738253351710313, "grad_norm": 1.0274503231048584, "learning_rate": 2.700451517158006e-06, "loss": 0.4373, "num_input_tokens_seen": 133338912, "step": 109645 }, { "epoch": 13.738879839619095, "grad_norm": 1.314579963684082, "learning_rate": 2.6999660677749816e-06, "loss": 0.4474, "num_input_tokens_seen": 133345056, "step": 109650 }, { "epoch": 13.739506327527879, "grad_norm": 1.580647587776184, "learning_rate": 2.6994806458907976e-06, "loss": 0.4808, "num_input_tokens_seen": 133351136, "step": 109655 }, { "epoch": 13.740132815436661, "grad_norm": 1.431837558746338, "learning_rate": 2.6989952515112582e-06, "loss": 0.4501, "num_input_tokens_seen": 133357376, "step": 109660 }, { "epoch": 13.740759303345445, "grad_norm": 0.9034704566001892, "learning_rate": 2.698509884642163e-06, "loss": 0.4332, "num_input_tokens_seen": 133363584, "step": 109665 }, { "epoch": 13.74138579125423, "grad_norm": 1.7077913284301758, "learning_rate": 2.6980245452893196e-06, "loss": 0.5279, "num_input_tokens_seen": 133369856, "step": 109670 }, { "epoch": 13.742012279163012, "grad_norm": 3.707965135574341, "learning_rate": 2.697539233458526e-06, "loss": 0.5227, "num_input_tokens_seen": 133376320, "step": 109675 }, { "epoch": 13.742638767071796, "grad_norm": 1.0535894632339478, "learning_rate": 2.697053949155588e-06, "loss": 0.4419, "num_input_tokens_seen": 133382432, "step": 109680 }, { "epoch": 13.743265254980578, "grad_norm": 1.0368696451187134, "learning_rate": 2.6965686923863056e-06, "loss": 0.4561, "num_input_tokens_seen": 133388736, "step": 109685 }, { "epoch": 13.743891742889362, "grad_norm": 1.8574632406234741, "learning_rate": 2.696083463156482e-06, "loss": 0.4434, "num_input_tokens_seen": 133394880, "step": 109690 }, { "epoch": 13.744518230798146, "grad_norm": 8.540136337280273, "learning_rate": 2.6955982614719155e-06, "loss": 0.5717, "num_input_tokens_seen": 133401248, "step": 109695 }, { "epoch": 13.745144718706928, "grad_norm": 0.899259626865387, "learning_rate": 2.6951130873384117e-06, "loss": 0.3945, "num_input_tokens_seen": 133407296, "step": 109700 }, { "epoch": 13.745771206615713, "grad_norm": 6.679574966430664, "learning_rate": 2.694627940761767e-06, "loss": 0.4925, "num_input_tokens_seen": 133413632, "step": 109705 }, { "epoch": 13.746397694524497, "grad_norm": 6.501696586608887, "learning_rate": 2.694142821747785e-06, "loss": 0.4827, "num_input_tokens_seen": 133419840, "step": 109710 }, { "epoch": 13.747024182433279, "grad_norm": 0.9303263425827026, "learning_rate": 2.6936577303022628e-06, "loss": 0.442, "num_input_tokens_seen": 133425920, "step": 109715 }, { "epoch": 13.747650670342063, "grad_norm": 0.8668895959854126, "learning_rate": 2.693172666431001e-06, "loss": 0.4657, "num_input_tokens_seen": 133431488, "step": 109720 }, { "epoch": 13.748277158250845, "grad_norm": 0.9386427402496338, "learning_rate": 2.6926876301397997e-06, "loss": 0.4788, "num_input_tokens_seen": 133437216, "step": 109725 }, { "epoch": 13.74890364615963, "grad_norm": 7.08550500869751, "learning_rate": 2.6922026214344575e-06, "loss": 0.4787, "num_input_tokens_seen": 133443168, "step": 109730 }, { "epoch": 13.749530134068413, "grad_norm": 9.886778831481934, "learning_rate": 2.6917176403207756e-06, "loss": 0.4914, "num_input_tokens_seen": 133449216, "step": 109735 }, { "epoch": 13.750156621977196, "grad_norm": 1.127213478088379, "learning_rate": 2.6912326868045474e-06, "loss": 0.4994, "num_input_tokens_seen": 133455584, "step": 109740 }, { "epoch": 13.75078310988598, "grad_norm": 7.755661487579346, "learning_rate": 2.6907477608915762e-06, "loss": 0.4984, "num_input_tokens_seen": 133461536, "step": 109745 }, { "epoch": 13.751409597794762, "grad_norm": 0.8747332096099854, "learning_rate": 2.6902628625876546e-06, "loss": 0.49, "num_input_tokens_seen": 133467552, "step": 109750 }, { "epoch": 13.752036085703546, "grad_norm": 5.057647705078125, "learning_rate": 2.6897779918985846e-06, "loss": 0.5119, "num_input_tokens_seen": 133473920, "step": 109755 }, { "epoch": 13.75266257361233, "grad_norm": 7.283141613006592, "learning_rate": 2.6892931488301587e-06, "loss": 0.4977, "num_input_tokens_seen": 133480064, "step": 109760 }, { "epoch": 13.753289061521112, "grad_norm": 0.6849706768989563, "learning_rate": 2.6888083333881783e-06, "loss": 0.453, "num_input_tokens_seen": 133486464, "step": 109765 }, { "epoch": 13.753915549429896, "grad_norm": 1.6639028787612915, "learning_rate": 2.688323545578435e-06, "loss": 0.4607, "num_input_tokens_seen": 133493024, "step": 109770 }, { "epoch": 13.754542037338679, "grad_norm": 0.9522356390953064, "learning_rate": 2.68783878540673e-06, "loss": 0.4495, "num_input_tokens_seen": 133498528, "step": 109775 }, { "epoch": 13.755168525247463, "grad_norm": 1.1687003374099731, "learning_rate": 2.6873540528788535e-06, "loss": 0.4839, "num_input_tokens_seen": 133504736, "step": 109780 }, { "epoch": 13.755795013156247, "grad_norm": 1.2211366891860962, "learning_rate": 2.686869348000604e-06, "loss": 0.5114, "num_input_tokens_seen": 133510880, "step": 109785 }, { "epoch": 13.756421501065029, "grad_norm": 2.1007776260375977, "learning_rate": 2.6863846707777786e-06, "loss": 0.5036, "num_input_tokens_seen": 133517024, "step": 109790 }, { "epoch": 13.757047988973813, "grad_norm": 2.206634521484375, "learning_rate": 2.6859000212161666e-06, "loss": 0.4343, "num_input_tokens_seen": 133522496, "step": 109795 }, { "epoch": 13.757674476882595, "grad_norm": 0.9724286198616028, "learning_rate": 2.6854153993215682e-06, "loss": 0.4678, "num_input_tokens_seen": 133528416, "step": 109800 }, { "epoch": 13.75830096479138, "grad_norm": 1.564010739326477, "learning_rate": 2.6849308050997726e-06, "loss": 0.4511, "num_input_tokens_seen": 133534624, "step": 109805 }, { "epoch": 13.758927452700163, "grad_norm": 1.1029324531555176, "learning_rate": 2.6844462385565776e-06, "loss": 0.4653, "num_input_tokens_seen": 133540768, "step": 109810 }, { "epoch": 13.759553940608946, "grad_norm": 7.046934604644775, "learning_rate": 2.6839616996977723e-06, "loss": 0.5236, "num_input_tokens_seen": 133546720, "step": 109815 }, { "epoch": 13.76018042851773, "grad_norm": 1.4060978889465332, "learning_rate": 2.6834771885291534e-06, "loss": 0.4733, "num_input_tokens_seen": 133552864, "step": 109820 }, { "epoch": 13.760806916426514, "grad_norm": 1.047735333442688, "learning_rate": 2.682992705056513e-06, "loss": 0.5131, "num_input_tokens_seen": 133559392, "step": 109825 }, { "epoch": 13.761433404335296, "grad_norm": 1.2325700521469116, "learning_rate": 2.6825082492856414e-06, "loss": 0.4795, "num_input_tokens_seen": 133565376, "step": 109830 }, { "epoch": 13.76205989224408, "grad_norm": 1.3799892663955688, "learning_rate": 2.6820238212223316e-06, "loss": 0.5822, "num_input_tokens_seen": 133571648, "step": 109835 }, { "epoch": 13.762686380152862, "grad_norm": 1.6318912506103516, "learning_rate": 2.6815394208723765e-06, "loss": 0.4483, "num_input_tokens_seen": 133577728, "step": 109840 }, { "epoch": 13.763312868061647, "grad_norm": 3.3449740409851074, "learning_rate": 2.6810550482415688e-06, "loss": 0.5856, "num_input_tokens_seen": 133583520, "step": 109845 }, { "epoch": 13.763939355970429, "grad_norm": 0.906257688999176, "learning_rate": 2.680570703335695e-06, "loss": 0.4766, "num_input_tokens_seen": 133589216, "step": 109850 }, { "epoch": 13.764565843879213, "grad_norm": 1.179418683052063, "learning_rate": 2.6800863861605508e-06, "loss": 0.4749, "num_input_tokens_seen": 133595392, "step": 109855 }, { "epoch": 13.765192331787997, "grad_norm": 1.2106612920761108, "learning_rate": 2.679602096721923e-06, "loss": 0.4547, "num_input_tokens_seen": 133601504, "step": 109860 }, { "epoch": 13.76581881969678, "grad_norm": 6.381853103637695, "learning_rate": 2.679117835025604e-06, "loss": 0.4707, "num_input_tokens_seen": 133607808, "step": 109865 }, { "epoch": 13.766445307605563, "grad_norm": 10.02111530303955, "learning_rate": 2.678633601077382e-06, "loss": 0.5549, "num_input_tokens_seen": 133614304, "step": 109870 }, { "epoch": 13.767071795514347, "grad_norm": 2.5697312355041504, "learning_rate": 2.678149394883047e-06, "loss": 0.4557, "num_input_tokens_seen": 133620384, "step": 109875 }, { "epoch": 13.76769828342313, "grad_norm": 1.26483154296875, "learning_rate": 2.6776652164483873e-06, "loss": 0.4865, "num_input_tokens_seen": 133626560, "step": 109880 }, { "epoch": 13.768324771331914, "grad_norm": 1.32537841796875, "learning_rate": 2.6771810657791943e-06, "loss": 0.4679, "num_input_tokens_seen": 133632832, "step": 109885 }, { "epoch": 13.768951259240696, "grad_norm": 1.262638807296753, "learning_rate": 2.676696942881253e-06, "loss": 0.4504, "num_input_tokens_seen": 133638752, "step": 109890 }, { "epoch": 13.76957774714948, "grad_norm": 5.096048355102539, "learning_rate": 2.6762128477603544e-06, "loss": 0.474, "num_input_tokens_seen": 133644704, "step": 109895 }, { "epoch": 13.770204235058264, "grad_norm": 0.9390041828155518, "learning_rate": 2.675728780422283e-06, "loss": 0.4313, "num_input_tokens_seen": 133651008, "step": 109900 }, { "epoch": 13.770830722967046, "grad_norm": 1.180918574333191, "learning_rate": 2.675244740872829e-06, "loss": 0.4165, "num_input_tokens_seen": 133656864, "step": 109905 }, { "epoch": 13.77145721087583, "grad_norm": 7.898351669311523, "learning_rate": 2.6747607291177795e-06, "loss": 0.5234, "num_input_tokens_seen": 133662784, "step": 109910 }, { "epoch": 13.772083698784613, "grad_norm": 1.5237326622009277, "learning_rate": 2.6742767451629194e-06, "loss": 0.4525, "num_input_tokens_seen": 133669024, "step": 109915 }, { "epoch": 13.772710186693397, "grad_norm": 8.112460136413574, "learning_rate": 2.6737927890140376e-06, "loss": 0.4928, "num_input_tokens_seen": 133675456, "step": 109920 }, { "epoch": 13.77333667460218, "grad_norm": 3.5260658264160156, "learning_rate": 2.6733088606769164e-06, "loss": 0.5224, "num_input_tokens_seen": 133681792, "step": 109925 }, { "epoch": 13.773963162510963, "grad_norm": 1.1837064027786255, "learning_rate": 2.6728249601573452e-06, "loss": 0.4904, "num_input_tokens_seen": 133688256, "step": 109930 }, { "epoch": 13.774589650419747, "grad_norm": 1.1795332431793213, "learning_rate": 2.6723410874611073e-06, "loss": 0.4718, "num_input_tokens_seen": 133694432, "step": 109935 }, { "epoch": 13.775216138328531, "grad_norm": 8.565895080566406, "learning_rate": 2.6718572425939902e-06, "loss": 0.4841, "num_input_tokens_seen": 133700800, "step": 109940 }, { "epoch": 13.775842626237313, "grad_norm": 2.2887866497039795, "learning_rate": 2.6713734255617764e-06, "loss": 0.4825, "num_input_tokens_seen": 133706816, "step": 109945 }, { "epoch": 13.776469114146098, "grad_norm": 1.0750561952590942, "learning_rate": 2.6708896363702523e-06, "loss": 0.4164, "num_input_tokens_seen": 133712832, "step": 109950 }, { "epoch": 13.77709560205488, "grad_norm": 1.2950578927993774, "learning_rate": 2.6704058750251993e-06, "loss": 0.4299, "num_input_tokens_seen": 133719360, "step": 109955 }, { "epoch": 13.777722089963664, "grad_norm": 0.747595489025116, "learning_rate": 2.669922141532404e-06, "loss": 0.5446, "num_input_tokens_seen": 133725344, "step": 109960 }, { "epoch": 13.778348577872446, "grad_norm": 1.010074257850647, "learning_rate": 2.6694384358976465e-06, "loss": 0.4461, "num_input_tokens_seen": 133731264, "step": 109965 }, { "epoch": 13.77897506578123, "grad_norm": 0.6111488342285156, "learning_rate": 2.668954758126713e-06, "loss": 0.4303, "num_input_tokens_seen": 133737344, "step": 109970 }, { "epoch": 13.779601553690014, "grad_norm": 1.3667042255401611, "learning_rate": 2.6684711082253867e-06, "loss": 0.5254, "num_input_tokens_seen": 133743584, "step": 109975 }, { "epoch": 13.780228041598797, "grad_norm": 1.9999125003814697, "learning_rate": 2.667987486199446e-06, "loss": 0.4714, "num_input_tokens_seen": 133750080, "step": 109980 }, { "epoch": 13.78085452950758, "grad_norm": 0.8777110576629639, "learning_rate": 2.667503892054678e-06, "loss": 0.5158, "num_input_tokens_seen": 133756352, "step": 109985 }, { "epoch": 13.781481017416365, "grad_norm": 1.4217604398727417, "learning_rate": 2.6670203257968598e-06, "loss": 0.554, "num_input_tokens_seen": 133762784, "step": 109990 }, { "epoch": 13.782107505325147, "grad_norm": 1.4682472944259644, "learning_rate": 2.666536787431777e-06, "loss": 0.4697, "num_input_tokens_seen": 133768800, "step": 109995 }, { "epoch": 13.782733993233931, "grad_norm": 1.0604438781738281, "learning_rate": 2.666053276965207e-06, "loss": 0.5706, "num_input_tokens_seen": 133774848, "step": 110000 }, { "epoch": 13.783360481142713, "grad_norm": 1.170068621635437, "learning_rate": 2.6655697944029347e-06, "loss": 0.4728, "num_input_tokens_seen": 133781184, "step": 110005 }, { "epoch": 13.783986969051497, "grad_norm": 1.0394833087921143, "learning_rate": 2.6650863397507353e-06, "loss": 0.4827, "num_input_tokens_seen": 133787296, "step": 110010 }, { "epoch": 13.784613456960281, "grad_norm": 1.0122014284133911, "learning_rate": 2.6646029130143946e-06, "loss": 0.4488, "num_input_tokens_seen": 133793376, "step": 110015 }, { "epoch": 13.785239944869064, "grad_norm": 3.244828701019287, "learning_rate": 2.664119514199688e-06, "loss": 0.4867, "num_input_tokens_seen": 133799520, "step": 110020 }, { "epoch": 13.785866432777848, "grad_norm": 3.8161568641662598, "learning_rate": 2.663636143312395e-06, "loss": 0.4801, "num_input_tokens_seen": 133805440, "step": 110025 }, { "epoch": 13.78649292068663, "grad_norm": 0.9703801870346069, "learning_rate": 2.6631528003582975e-06, "loss": 0.5533, "num_input_tokens_seen": 133811584, "step": 110030 }, { "epoch": 13.787119408595414, "grad_norm": 1.2976760864257812, "learning_rate": 2.6626694853431746e-06, "loss": 0.4789, "num_input_tokens_seen": 133817504, "step": 110035 }, { "epoch": 13.787745896504198, "grad_norm": 10.286528587341309, "learning_rate": 2.6621861982728003e-06, "loss": 0.549, "num_input_tokens_seen": 133822976, "step": 110040 }, { "epoch": 13.78837238441298, "grad_norm": 0.7970094084739685, "learning_rate": 2.661702939152957e-06, "loss": 0.4587, "num_input_tokens_seen": 133829504, "step": 110045 }, { "epoch": 13.788998872321764, "grad_norm": 0.7940139174461365, "learning_rate": 2.6612197079894218e-06, "loss": 0.4611, "num_input_tokens_seen": 133835744, "step": 110050 }, { "epoch": 13.789625360230547, "grad_norm": 5.243019104003906, "learning_rate": 2.66073650478797e-06, "loss": 0.4566, "num_input_tokens_seen": 133841952, "step": 110055 }, { "epoch": 13.79025184813933, "grad_norm": 1.4154218435287476, "learning_rate": 2.660253329554382e-06, "loss": 0.4512, "num_input_tokens_seen": 133847968, "step": 110060 }, { "epoch": 13.790878336048115, "grad_norm": 1.1362985372543335, "learning_rate": 2.65977018229443e-06, "loss": 0.4403, "num_input_tokens_seen": 133854208, "step": 110065 }, { "epoch": 13.791504823956897, "grad_norm": 1.2501035928726196, "learning_rate": 2.6592870630138954e-06, "loss": 0.4421, "num_input_tokens_seen": 133860224, "step": 110070 }, { "epoch": 13.792131311865681, "grad_norm": 0.7560455799102783, "learning_rate": 2.65880397171855e-06, "loss": 0.4533, "num_input_tokens_seen": 133866432, "step": 110075 }, { "epoch": 13.792757799774463, "grad_norm": 1.3207008838653564, "learning_rate": 2.6583209084141738e-06, "loss": 0.4491, "num_input_tokens_seen": 133872768, "step": 110080 }, { "epoch": 13.793384287683248, "grad_norm": 1.3151942491531372, "learning_rate": 2.6578378731065367e-06, "loss": 0.4423, "num_input_tokens_seen": 133878592, "step": 110085 }, { "epoch": 13.794010775592032, "grad_norm": 7.592694282531738, "learning_rate": 2.657354865801418e-06, "loss": 0.5594, "num_input_tokens_seen": 133884864, "step": 110090 }, { "epoch": 13.794637263500814, "grad_norm": 6.106083869934082, "learning_rate": 2.656871886504594e-06, "loss": 0.5509, "num_input_tokens_seen": 133890912, "step": 110095 }, { "epoch": 13.795263751409598, "grad_norm": 2.6891260147094727, "learning_rate": 2.6563889352218336e-06, "loss": 0.5251, "num_input_tokens_seen": 133896928, "step": 110100 }, { "epoch": 13.795890239318382, "grad_norm": 2.7125160694122314, "learning_rate": 2.6559060119589164e-06, "loss": 0.479, "num_input_tokens_seen": 133902944, "step": 110105 }, { "epoch": 13.796516727227164, "grad_norm": 1.1369383335113525, "learning_rate": 2.655423116721611e-06, "loss": 0.4637, "num_input_tokens_seen": 133909472, "step": 110110 }, { "epoch": 13.797143215135948, "grad_norm": 1.0853588581085205, "learning_rate": 2.654940249515696e-06, "loss": 0.4715, "num_input_tokens_seen": 133915424, "step": 110115 }, { "epoch": 13.79776970304473, "grad_norm": 1.2553609609603882, "learning_rate": 2.65445741034694e-06, "loss": 0.4901, "num_input_tokens_seen": 133921184, "step": 110120 }, { "epoch": 13.798396190953515, "grad_norm": 1.1510229110717773, "learning_rate": 2.653974599221119e-06, "loss": 0.4354, "num_input_tokens_seen": 133927072, "step": 110125 }, { "epoch": 13.799022678862299, "grad_norm": 0.8917310833930969, "learning_rate": 2.653491816144003e-06, "loss": 0.4379, "num_input_tokens_seen": 133932448, "step": 110130 }, { "epoch": 13.799649166771081, "grad_norm": 1.0815231800079346, "learning_rate": 2.6530090611213646e-06, "loss": 0.5128, "num_input_tokens_seen": 133938656, "step": 110135 }, { "epoch": 13.800275654679865, "grad_norm": 4.574168682098389, "learning_rate": 2.6525263341589763e-06, "loss": 0.5123, "num_input_tokens_seen": 133944928, "step": 110140 }, { "epoch": 13.800902142588647, "grad_norm": 10.979986190795898, "learning_rate": 2.6520436352626113e-06, "loss": 0.5144, "num_input_tokens_seen": 133951264, "step": 110145 }, { "epoch": 13.801528630497431, "grad_norm": 7.718080997467041, "learning_rate": 2.6515609644380367e-06, "loss": 0.5593, "num_input_tokens_seen": 133957376, "step": 110150 }, { "epoch": 13.802155118406215, "grad_norm": 4.626342296600342, "learning_rate": 2.651078321691025e-06, "loss": 0.5551, "num_input_tokens_seen": 133963744, "step": 110155 }, { "epoch": 13.802781606314998, "grad_norm": 0.8863458633422852, "learning_rate": 2.6505957070273493e-06, "loss": 0.4386, "num_input_tokens_seen": 133970016, "step": 110160 }, { "epoch": 13.803408094223782, "grad_norm": 0.9167355895042419, "learning_rate": 2.650113120452775e-06, "loss": 0.5171, "num_input_tokens_seen": 133976032, "step": 110165 }, { "epoch": 13.804034582132564, "grad_norm": 1.2814747095108032, "learning_rate": 2.6496305619730755e-06, "loss": 0.45, "num_input_tokens_seen": 133981600, "step": 110170 }, { "epoch": 13.804661070041348, "grad_norm": 1.1888591051101685, "learning_rate": 2.649148031594018e-06, "loss": 0.5436, "num_input_tokens_seen": 133987616, "step": 110175 }, { "epoch": 13.805287557950132, "grad_norm": 6.716562747955322, "learning_rate": 2.648665529321373e-06, "loss": 0.4677, "num_input_tokens_seen": 133993728, "step": 110180 }, { "epoch": 13.805914045858914, "grad_norm": 8.097073554992676, "learning_rate": 2.6481830551609066e-06, "loss": 0.5358, "num_input_tokens_seen": 133999936, "step": 110185 }, { "epoch": 13.806540533767699, "grad_norm": 0.825859010219574, "learning_rate": 2.647700609118392e-06, "loss": 0.4322, "num_input_tokens_seen": 134006144, "step": 110190 }, { "epoch": 13.80716702167648, "grad_norm": 1.3584545850753784, "learning_rate": 2.6472181911995913e-06, "loss": 0.4788, "num_input_tokens_seen": 134012032, "step": 110195 }, { "epoch": 13.807793509585265, "grad_norm": 1.1312586069107056, "learning_rate": 2.646735801410278e-06, "loss": 0.4741, "num_input_tokens_seen": 134018048, "step": 110200 }, { "epoch": 13.808419997494049, "grad_norm": 1.8571007251739502, "learning_rate": 2.6462534397562147e-06, "loss": 0.4701, "num_input_tokens_seen": 134024000, "step": 110205 }, { "epoch": 13.809046485402831, "grad_norm": 0.9433350563049316, "learning_rate": 2.6457711062431698e-06, "loss": 0.4953, "num_input_tokens_seen": 134029728, "step": 110210 }, { "epoch": 13.809672973311615, "grad_norm": 0.9237014651298523, "learning_rate": 2.645288800876913e-06, "loss": 0.4199, "num_input_tokens_seen": 134035744, "step": 110215 }, { "epoch": 13.8102994612204, "grad_norm": 5.432063579559326, "learning_rate": 2.6448065236632057e-06, "loss": 0.5778, "num_input_tokens_seen": 134042144, "step": 110220 }, { "epoch": 13.810925949129182, "grad_norm": 7.766635894775391, "learning_rate": 2.6443242746078194e-06, "loss": 0.4844, "num_input_tokens_seen": 134048352, "step": 110225 }, { "epoch": 13.811552437037966, "grad_norm": 0.8554152250289917, "learning_rate": 2.643842053716515e-06, "loss": 0.5188, "num_input_tokens_seen": 134053920, "step": 110230 }, { "epoch": 13.812178924946748, "grad_norm": 1.5476808547973633, "learning_rate": 2.6433598609950596e-06, "loss": 0.4712, "num_input_tokens_seen": 134060224, "step": 110235 }, { "epoch": 13.812805412855532, "grad_norm": 7.8286542892456055, "learning_rate": 2.64287769644922e-06, "loss": 0.5268, "num_input_tokens_seen": 134066080, "step": 110240 }, { "epoch": 13.813431900764316, "grad_norm": 2.5388870239257812, "learning_rate": 2.6423955600847574e-06, "loss": 0.5433, "num_input_tokens_seen": 134072064, "step": 110245 }, { "epoch": 13.814058388673098, "grad_norm": 3.381237268447876, "learning_rate": 2.6419134519074385e-06, "loss": 0.4963, "num_input_tokens_seen": 134078144, "step": 110250 }, { "epoch": 13.814684876581882, "grad_norm": 0.9199079871177673, "learning_rate": 2.641431371923029e-06, "loss": 0.4583, "num_input_tokens_seen": 134084128, "step": 110255 }, { "epoch": 13.815311364490665, "grad_norm": 0.7879277467727661, "learning_rate": 2.6409493201372877e-06, "loss": 0.5034, "num_input_tokens_seen": 134090400, "step": 110260 }, { "epoch": 13.815937852399449, "grad_norm": 6.125608444213867, "learning_rate": 2.640467296555983e-06, "loss": 0.558, "num_input_tokens_seen": 134096448, "step": 110265 }, { "epoch": 13.816564340308233, "grad_norm": 2.6177494525909424, "learning_rate": 2.639985301184873e-06, "loss": 0.4729, "num_input_tokens_seen": 134102656, "step": 110270 }, { "epoch": 13.817190828217015, "grad_norm": 1.0458643436431885, "learning_rate": 2.639503334029724e-06, "loss": 0.4585, "num_input_tokens_seen": 134108832, "step": 110275 }, { "epoch": 13.817817316125799, "grad_norm": 0.8899807929992676, "learning_rate": 2.6390213950962986e-06, "loss": 0.4555, "num_input_tokens_seen": 134115136, "step": 110280 }, { "epoch": 13.818443804034581, "grad_norm": 0.5931566953659058, "learning_rate": 2.6385394843903554e-06, "loss": 0.4673, "num_input_tokens_seen": 134121696, "step": 110285 }, { "epoch": 13.819070291943365, "grad_norm": 1.0641111135482788, "learning_rate": 2.6380576019176603e-06, "loss": 0.4279, "num_input_tokens_seen": 134128096, "step": 110290 }, { "epoch": 13.81969677985215, "grad_norm": 1.0323678255081177, "learning_rate": 2.6375757476839704e-06, "loss": 0.4617, "num_input_tokens_seen": 134133824, "step": 110295 }, { "epoch": 13.820323267760932, "grad_norm": 1.2596238851547241, "learning_rate": 2.6370939216950502e-06, "loss": 0.4696, "num_input_tokens_seen": 134139904, "step": 110300 }, { "epoch": 13.820949755669716, "grad_norm": 2.1377172470092773, "learning_rate": 2.636612123956657e-06, "loss": 0.4747, "num_input_tokens_seen": 134146208, "step": 110305 }, { "epoch": 13.821576243578498, "grad_norm": 4.391860485076904, "learning_rate": 2.6361303544745556e-06, "loss": 0.4607, "num_input_tokens_seen": 134152384, "step": 110310 }, { "epoch": 13.822202731487282, "grad_norm": 0.9512318968772888, "learning_rate": 2.6356486132545002e-06, "loss": 0.4414, "num_input_tokens_seen": 134158720, "step": 110315 }, { "epoch": 13.822829219396066, "grad_norm": 1.8153576850891113, "learning_rate": 2.635166900302256e-06, "loss": 0.4527, "num_input_tokens_seen": 134164736, "step": 110320 }, { "epoch": 13.823455707304849, "grad_norm": 1.317129373550415, "learning_rate": 2.6346852156235774e-06, "loss": 0.4254, "num_input_tokens_seen": 134171104, "step": 110325 }, { "epoch": 13.824082195213633, "grad_norm": 1.3605047464370728, "learning_rate": 2.634203559224226e-06, "loss": 0.4414, "num_input_tokens_seen": 134177472, "step": 110330 }, { "epoch": 13.824708683122417, "grad_norm": 1.2264854907989502, "learning_rate": 2.633721931109962e-06, "loss": 0.4474, "num_input_tokens_seen": 134183200, "step": 110335 }, { "epoch": 13.825335171031199, "grad_norm": 1.283610224723816, "learning_rate": 2.6332403312865397e-06, "loss": 0.4814, "num_input_tokens_seen": 134189632, "step": 110340 }, { "epoch": 13.825961658939983, "grad_norm": 0.8394026756286621, "learning_rate": 2.632758759759719e-06, "loss": 0.4858, "num_input_tokens_seen": 134195488, "step": 110345 }, { "epoch": 13.826588146848765, "grad_norm": 1.157402515411377, "learning_rate": 2.6322772165352585e-06, "loss": 0.4722, "num_input_tokens_seen": 134201728, "step": 110350 }, { "epoch": 13.82721463475755, "grad_norm": 2.6983397006988525, "learning_rate": 2.6317957016189155e-06, "loss": 0.4618, "num_input_tokens_seen": 134207968, "step": 110355 }, { "epoch": 13.827841122666333, "grad_norm": 2.9169230461120605, "learning_rate": 2.6313142150164444e-06, "loss": 0.4668, "num_input_tokens_seen": 134214016, "step": 110360 }, { "epoch": 13.828467610575116, "grad_norm": 1.374427318572998, "learning_rate": 2.6308327567336056e-06, "loss": 0.4658, "num_input_tokens_seen": 134220448, "step": 110365 }, { "epoch": 13.8290940984839, "grad_norm": 1.0327662229537964, "learning_rate": 2.6303513267761504e-06, "loss": 0.4674, "num_input_tokens_seen": 134226560, "step": 110370 }, { "epoch": 13.829720586392682, "grad_norm": 4.38763427734375, "learning_rate": 2.6298699251498394e-06, "loss": 0.476, "num_input_tokens_seen": 134232512, "step": 110375 }, { "epoch": 13.830347074301466, "grad_norm": 1.4275522232055664, "learning_rate": 2.6293885518604244e-06, "loss": 0.4776, "num_input_tokens_seen": 134238720, "step": 110380 }, { "epoch": 13.83097356221025, "grad_norm": 0.6359381079673767, "learning_rate": 2.6289072069136644e-06, "loss": 0.4856, "num_input_tokens_seen": 134244768, "step": 110385 }, { "epoch": 13.831600050119032, "grad_norm": 2.2420971393585205, "learning_rate": 2.6284258903153103e-06, "loss": 0.4892, "num_input_tokens_seen": 134250848, "step": 110390 }, { "epoch": 13.832226538027816, "grad_norm": 1.183149814605713, "learning_rate": 2.6279446020711187e-06, "loss": 0.4237, "num_input_tokens_seen": 134256608, "step": 110395 }, { "epoch": 13.832853025936599, "grad_norm": 5.07854700088501, "learning_rate": 2.6274633421868458e-06, "loss": 0.4519, "num_input_tokens_seen": 134262976, "step": 110400 }, { "epoch": 13.833479513845383, "grad_norm": 1.102203130722046, "learning_rate": 2.6269821106682414e-06, "loss": 0.4351, "num_input_tokens_seen": 134269408, "step": 110405 }, { "epoch": 13.834106001754167, "grad_norm": 0.9242337942123413, "learning_rate": 2.6265009075210622e-06, "loss": 0.4813, "num_input_tokens_seen": 134275488, "step": 110410 }, { "epoch": 13.834732489662949, "grad_norm": 4.528264045715332, "learning_rate": 2.626019732751059e-06, "loss": 0.4665, "num_input_tokens_seen": 134281504, "step": 110415 }, { "epoch": 13.835358977571733, "grad_norm": 2.445159912109375, "learning_rate": 2.625538586363987e-06, "loss": 0.4443, "num_input_tokens_seen": 134287296, "step": 110420 }, { "epoch": 13.835985465480515, "grad_norm": 1.8796335458755493, "learning_rate": 2.625057468365596e-06, "loss": 0.4463, "num_input_tokens_seen": 134293088, "step": 110425 }, { "epoch": 13.8366119533893, "grad_norm": 0.7774565815925598, "learning_rate": 2.6245763787616418e-06, "loss": 0.4878, "num_input_tokens_seen": 134298464, "step": 110430 }, { "epoch": 13.837238441298084, "grad_norm": 1.330641269683838, "learning_rate": 2.6240953175578727e-06, "loss": 0.4544, "num_input_tokens_seen": 134304576, "step": 110435 }, { "epoch": 13.837864929206866, "grad_norm": 1.1268337965011597, "learning_rate": 2.623614284760041e-06, "loss": 0.4756, "num_input_tokens_seen": 134310624, "step": 110440 }, { "epoch": 13.83849141711565, "grad_norm": 1.214685320854187, "learning_rate": 2.6231332803739006e-06, "loss": 0.4418, "num_input_tokens_seen": 134316288, "step": 110445 }, { "epoch": 13.839117905024434, "grad_norm": 0.7879270911216736, "learning_rate": 2.6226523044051978e-06, "loss": 0.4599, "num_input_tokens_seen": 134322336, "step": 110450 }, { "epoch": 13.839744392933216, "grad_norm": 1.2314852476119995, "learning_rate": 2.622171356859686e-06, "loss": 0.4719, "num_input_tokens_seen": 134328352, "step": 110455 }, { "epoch": 13.840370880842, "grad_norm": 1.7437899112701416, "learning_rate": 2.621690437743114e-06, "loss": 0.4839, "num_input_tokens_seen": 134334432, "step": 110460 }, { "epoch": 13.840997368750783, "grad_norm": 0.8245232105255127, "learning_rate": 2.6212095470612354e-06, "loss": 0.4739, "num_input_tokens_seen": 134340704, "step": 110465 }, { "epoch": 13.841623856659567, "grad_norm": 1.844234585762024, "learning_rate": 2.6207286848197945e-06, "loss": 0.5713, "num_input_tokens_seen": 134346784, "step": 110470 }, { "epoch": 13.842250344568349, "grad_norm": 9.592022895812988, "learning_rate": 2.620247851024544e-06, "loss": 0.5043, "num_input_tokens_seen": 134352800, "step": 110475 }, { "epoch": 13.842876832477133, "grad_norm": 1.0069355964660645, "learning_rate": 2.6197670456812297e-06, "loss": 0.4486, "num_input_tokens_seen": 134358816, "step": 110480 }, { "epoch": 13.843503320385917, "grad_norm": 1.3002249002456665, "learning_rate": 2.6192862687956034e-06, "loss": 0.4371, "num_input_tokens_seen": 134364512, "step": 110485 }, { "epoch": 13.8441298082947, "grad_norm": 1.7537128925323486, "learning_rate": 2.6188055203734085e-06, "loss": 0.442, "num_input_tokens_seen": 134369824, "step": 110490 }, { "epoch": 13.844756296203483, "grad_norm": 1.9663043022155762, "learning_rate": 2.6183248004203987e-06, "loss": 0.4686, "num_input_tokens_seen": 134375808, "step": 110495 }, { "epoch": 13.845382784112267, "grad_norm": 4.49656867980957, "learning_rate": 2.6178441089423158e-06, "loss": 0.4721, "num_input_tokens_seen": 134382016, "step": 110500 }, { "epoch": 13.84600927202105, "grad_norm": 1.1476383209228516, "learning_rate": 2.6173634459449114e-06, "loss": 0.485, "num_input_tokens_seen": 134387616, "step": 110505 }, { "epoch": 13.846635759929834, "grad_norm": 2.1070146560668945, "learning_rate": 2.616882811433929e-06, "loss": 0.4526, "num_input_tokens_seen": 134393824, "step": 110510 }, { "epoch": 13.847262247838616, "grad_norm": 3.935957431793213, "learning_rate": 2.616402205415117e-06, "loss": 0.4556, "num_input_tokens_seen": 134399712, "step": 110515 }, { "epoch": 13.8478887357474, "grad_norm": 1.4737920761108398, "learning_rate": 2.61592162789422e-06, "loss": 0.4902, "num_input_tokens_seen": 134405888, "step": 110520 }, { "epoch": 13.848515223656184, "grad_norm": 0.7687427401542664, "learning_rate": 2.6154410788769833e-06, "loss": 0.4611, "num_input_tokens_seen": 134412160, "step": 110525 }, { "epoch": 13.849141711564966, "grad_norm": 0.7734866142272949, "learning_rate": 2.614960558369156e-06, "loss": 0.4451, "num_input_tokens_seen": 134418112, "step": 110530 }, { "epoch": 13.84976819947375, "grad_norm": 2.3706090450286865, "learning_rate": 2.614480066376478e-06, "loss": 0.4757, "num_input_tokens_seen": 134423904, "step": 110535 }, { "epoch": 13.850394687382533, "grad_norm": 1.4941648244857788, "learning_rate": 2.6139996029046984e-06, "loss": 0.4512, "num_input_tokens_seen": 134430400, "step": 110540 }, { "epoch": 13.851021175291317, "grad_norm": 5.157606601715088, "learning_rate": 2.6135191679595573e-06, "loss": 0.4921, "num_input_tokens_seen": 134436384, "step": 110545 }, { "epoch": 13.8516476632001, "grad_norm": 1.2637383937835693, "learning_rate": 2.613038761546802e-06, "loss": 0.4874, "num_input_tokens_seen": 134442624, "step": 110550 }, { "epoch": 13.852274151108883, "grad_norm": 7.437060356140137, "learning_rate": 2.6125583836721745e-06, "loss": 0.5393, "num_input_tokens_seen": 134448736, "step": 110555 }, { "epoch": 13.852900639017667, "grad_norm": 1.2666640281677246, "learning_rate": 2.612078034341421e-06, "loss": 0.4723, "num_input_tokens_seen": 134455008, "step": 110560 }, { "epoch": 13.85352712692645, "grad_norm": 1.1870166063308716, "learning_rate": 2.61159771356028e-06, "loss": 0.5084, "num_input_tokens_seen": 134461440, "step": 110565 }, { "epoch": 13.854153614835234, "grad_norm": 0.6866478323936462, "learning_rate": 2.6111174213344982e-06, "loss": 0.4893, "num_input_tokens_seen": 134467264, "step": 110570 }, { "epoch": 13.854780102744018, "grad_norm": 1.3118305206298828, "learning_rate": 2.6106371576698142e-06, "loss": 0.4834, "num_input_tokens_seen": 134473216, "step": 110575 }, { "epoch": 13.8554065906528, "grad_norm": 0.8402828574180603, "learning_rate": 2.6101569225719724e-06, "loss": 0.4756, "num_input_tokens_seen": 134478656, "step": 110580 }, { "epoch": 13.856033078561584, "grad_norm": 1.0387365818023682, "learning_rate": 2.609676716046715e-06, "loss": 0.4501, "num_input_tokens_seen": 134485088, "step": 110585 }, { "epoch": 13.856659566470366, "grad_norm": 8.141141891479492, "learning_rate": 2.6091965380997808e-06, "loss": 0.5328, "num_input_tokens_seen": 134491520, "step": 110590 }, { "epoch": 13.85728605437915, "grad_norm": 0.9679593443870544, "learning_rate": 2.608716388736914e-06, "loss": 0.4795, "num_input_tokens_seen": 134497792, "step": 110595 }, { "epoch": 13.857912542287934, "grad_norm": 0.763565182685852, "learning_rate": 2.6082362679638513e-06, "loss": 0.4379, "num_input_tokens_seen": 134504000, "step": 110600 }, { "epoch": 13.858539030196717, "grad_norm": 3.18013334274292, "learning_rate": 2.607756175786337e-06, "loss": 0.4729, "num_input_tokens_seen": 134509888, "step": 110605 }, { "epoch": 13.8591655181055, "grad_norm": 1.5683904886245728, "learning_rate": 2.6072761122101063e-06, "loss": 0.5136, "num_input_tokens_seen": 134516384, "step": 110610 }, { "epoch": 13.859792006014285, "grad_norm": 2.6486563682556152, "learning_rate": 2.606796077240904e-06, "loss": 0.4862, "num_input_tokens_seen": 134522592, "step": 110615 }, { "epoch": 13.860418493923067, "grad_norm": 2.8911280632019043, "learning_rate": 2.6063160708844637e-06, "loss": 0.4624, "num_input_tokens_seen": 134528960, "step": 110620 }, { "epoch": 13.861044981831851, "grad_norm": 1.0446423292160034, "learning_rate": 2.6058360931465304e-06, "loss": 0.4594, "num_input_tokens_seen": 134535296, "step": 110625 }, { "epoch": 13.861671469740633, "grad_norm": 1.2441117763519287, "learning_rate": 2.6053561440328367e-06, "loss": 0.4632, "num_input_tokens_seen": 134541952, "step": 110630 }, { "epoch": 13.862297957649417, "grad_norm": 2.2348899841308594, "learning_rate": 2.6048762235491265e-06, "loss": 0.522, "num_input_tokens_seen": 134547776, "step": 110635 }, { "epoch": 13.862924445558201, "grad_norm": 0.9570850729942322, "learning_rate": 2.6043963317011323e-06, "loss": 0.5114, "num_input_tokens_seen": 134553792, "step": 110640 }, { "epoch": 13.863550933466984, "grad_norm": 1.0732344388961792, "learning_rate": 2.6039164684945937e-06, "loss": 0.4546, "num_input_tokens_seen": 134559520, "step": 110645 }, { "epoch": 13.864177421375768, "grad_norm": 5.044926643371582, "learning_rate": 2.603436633935249e-06, "loss": 0.4812, "num_input_tokens_seen": 134564992, "step": 110650 }, { "epoch": 13.86480390928455, "grad_norm": 1.1695224046707153, "learning_rate": 2.602956828028835e-06, "loss": 0.4902, "num_input_tokens_seen": 134571264, "step": 110655 }, { "epoch": 13.865430397193334, "grad_norm": 0.9133639335632324, "learning_rate": 2.6024770507810872e-06, "loss": 0.428, "num_input_tokens_seen": 134577344, "step": 110660 }, { "epoch": 13.866056885102118, "grad_norm": 1.0783101320266724, "learning_rate": 2.6019973021977407e-06, "loss": 0.5058, "num_input_tokens_seen": 134583168, "step": 110665 }, { "epoch": 13.8666833730109, "grad_norm": 1.9344298839569092, "learning_rate": 2.601517582284535e-06, "loss": 0.4583, "num_input_tokens_seen": 134588896, "step": 110670 }, { "epoch": 13.867309860919685, "grad_norm": 1.5373343229293823, "learning_rate": 2.601037891047201e-06, "loss": 0.4406, "num_input_tokens_seen": 134595168, "step": 110675 }, { "epoch": 13.867936348828467, "grad_norm": 0.9478954076766968, "learning_rate": 2.6005582284914775e-06, "loss": 0.4603, "num_input_tokens_seen": 134601056, "step": 110680 }, { "epoch": 13.86856283673725, "grad_norm": 1.1058827638626099, "learning_rate": 2.6000785946230964e-06, "loss": 0.4788, "num_input_tokens_seen": 134607392, "step": 110685 }, { "epoch": 13.869189324646035, "grad_norm": 1.0876721143722534, "learning_rate": 2.599598989447796e-06, "loss": 0.4494, "num_input_tokens_seen": 134612928, "step": 110690 }, { "epoch": 13.869815812554817, "grad_norm": 1.5513249635696411, "learning_rate": 2.599119412971305e-06, "loss": 0.4321, "num_input_tokens_seen": 134618688, "step": 110695 }, { "epoch": 13.870442300463601, "grad_norm": 8.73095417022705, "learning_rate": 2.5986398651993616e-06, "loss": 0.5108, "num_input_tokens_seen": 134624928, "step": 110700 }, { "epoch": 13.871068788372384, "grad_norm": 0.9842029809951782, "learning_rate": 2.5981603461376957e-06, "loss": 0.4527, "num_input_tokens_seen": 134631168, "step": 110705 }, { "epoch": 13.871695276281168, "grad_norm": 9.275162696838379, "learning_rate": 2.597680855792043e-06, "loss": 0.5282, "num_input_tokens_seen": 134637088, "step": 110710 }, { "epoch": 13.872321764189952, "grad_norm": 8.216024398803711, "learning_rate": 2.597201394168137e-06, "loss": 0.5489, "num_input_tokens_seen": 134643328, "step": 110715 }, { "epoch": 13.872948252098734, "grad_norm": 1.5231094360351562, "learning_rate": 2.5967219612717066e-06, "loss": 0.5174, "num_input_tokens_seen": 134649216, "step": 110720 }, { "epoch": 13.873574740007518, "grad_norm": 1.0907307863235474, "learning_rate": 2.596242557108487e-06, "loss": 0.4514, "num_input_tokens_seen": 134655200, "step": 110725 }, { "epoch": 13.874201227916302, "grad_norm": 0.7424392700195312, "learning_rate": 2.5957631816842077e-06, "loss": 0.4713, "num_input_tokens_seen": 134661088, "step": 110730 }, { "epoch": 13.874827715825084, "grad_norm": 1.4181572198867798, "learning_rate": 2.5952838350046022e-06, "loss": 0.4703, "num_input_tokens_seen": 134667360, "step": 110735 }, { "epoch": 13.875454203733868, "grad_norm": 0.8367835879325867, "learning_rate": 2.594804517075398e-06, "loss": 0.4221, "num_input_tokens_seen": 134673216, "step": 110740 }, { "epoch": 13.87608069164265, "grad_norm": 1.099532127380371, "learning_rate": 2.5943252279023307e-06, "loss": 0.4769, "num_input_tokens_seen": 134679712, "step": 110745 }, { "epoch": 13.876707179551435, "grad_norm": 0.6513012051582336, "learning_rate": 2.5938459674911254e-06, "loss": 0.4639, "num_input_tokens_seen": 134686048, "step": 110750 }, { "epoch": 13.877333667460219, "grad_norm": 3.782935380935669, "learning_rate": 2.593366735847515e-06, "loss": 0.4788, "num_input_tokens_seen": 134692288, "step": 110755 }, { "epoch": 13.877960155369001, "grad_norm": 1.050555944442749, "learning_rate": 2.5928875329772285e-06, "loss": 0.4383, "num_input_tokens_seen": 134698080, "step": 110760 }, { "epoch": 13.878586643277785, "grad_norm": 4.679452419281006, "learning_rate": 2.5924083588859973e-06, "loss": 0.4724, "num_input_tokens_seen": 134704160, "step": 110765 }, { "epoch": 13.879213131186567, "grad_norm": 0.9008493423461914, "learning_rate": 2.591929213579546e-06, "loss": 0.4118, "num_input_tokens_seen": 134710432, "step": 110770 }, { "epoch": 13.879839619095351, "grad_norm": 1.1381875276565552, "learning_rate": 2.591450097063606e-06, "loss": 0.5443, "num_input_tokens_seen": 134716832, "step": 110775 }, { "epoch": 13.880466107004136, "grad_norm": 0.6605166792869568, "learning_rate": 2.590971009343907e-06, "loss": 0.451, "num_input_tokens_seen": 134722720, "step": 110780 }, { "epoch": 13.881092594912918, "grad_norm": 5.419179916381836, "learning_rate": 2.5904919504261726e-06, "loss": 0.5006, "num_input_tokens_seen": 134728128, "step": 110785 }, { "epoch": 13.881719082821702, "grad_norm": 9.814953804016113, "learning_rate": 2.590012920316135e-06, "loss": 0.622, "num_input_tokens_seen": 134733952, "step": 110790 }, { "epoch": 13.882345570730484, "grad_norm": 8.752779006958008, "learning_rate": 2.589533919019518e-06, "loss": 0.51, "num_input_tokens_seen": 134740128, "step": 110795 }, { "epoch": 13.882972058639268, "grad_norm": 0.8893490433692932, "learning_rate": 2.58905494654205e-06, "loss": 0.4931, "num_input_tokens_seen": 134745984, "step": 110800 }, { "epoch": 13.883598546548052, "grad_norm": 9.268193244934082, "learning_rate": 2.5885760028894562e-06, "loss": 0.5355, "num_input_tokens_seen": 134752416, "step": 110805 }, { "epoch": 13.884225034456835, "grad_norm": 2.4882524013519287, "learning_rate": 2.5880970880674656e-06, "loss": 0.5203, "num_input_tokens_seen": 134758432, "step": 110810 }, { "epoch": 13.884851522365619, "grad_norm": 4.824323654174805, "learning_rate": 2.5876182020818e-06, "loss": 0.4885, "num_input_tokens_seen": 134764480, "step": 110815 }, { "epoch": 13.8854780102744, "grad_norm": 1.91731858253479, "learning_rate": 2.5871393449381887e-06, "loss": 0.5093, "num_input_tokens_seen": 134770816, "step": 110820 }, { "epoch": 13.886104498183185, "grad_norm": 2.2290539741516113, "learning_rate": 2.586660516642353e-06, "loss": 0.4977, "num_input_tokens_seen": 134776832, "step": 110825 }, { "epoch": 13.886730986091969, "grad_norm": 0.897855818271637, "learning_rate": 2.5861817172000204e-06, "loss": 0.4211, "num_input_tokens_seen": 134783008, "step": 110830 }, { "epoch": 13.887357474000751, "grad_norm": 1.0193864107131958, "learning_rate": 2.585702946616917e-06, "loss": 0.4554, "num_input_tokens_seen": 134788704, "step": 110835 }, { "epoch": 13.887983961909535, "grad_norm": 2.4114272594451904, "learning_rate": 2.585224204898762e-06, "loss": 0.4672, "num_input_tokens_seen": 134794560, "step": 110840 }, { "epoch": 13.88861044981832, "grad_norm": 1.0867011547088623, "learning_rate": 2.5847454920512847e-06, "loss": 0.5104, "num_input_tokens_seen": 134800672, "step": 110845 }, { "epoch": 13.889236937727102, "grad_norm": 1.079810380935669, "learning_rate": 2.584266808080203e-06, "loss": 0.4484, "num_input_tokens_seen": 134806560, "step": 110850 }, { "epoch": 13.889863425635886, "grad_norm": 0.8860686421394348, "learning_rate": 2.583788152991243e-06, "loss": 0.4454, "num_input_tokens_seen": 134812448, "step": 110855 }, { "epoch": 13.890489913544668, "grad_norm": 3.3539226055145264, "learning_rate": 2.5833095267901288e-06, "loss": 0.4878, "num_input_tokens_seen": 134818432, "step": 110860 }, { "epoch": 13.891116401453452, "grad_norm": 7.271198749542236, "learning_rate": 2.582830929482579e-06, "loss": 0.4669, "num_input_tokens_seen": 134824576, "step": 110865 }, { "epoch": 13.891742889362236, "grad_norm": 3.0888845920562744, "learning_rate": 2.5823523610743186e-06, "loss": 0.4745, "num_input_tokens_seen": 134829792, "step": 110870 }, { "epoch": 13.892369377271018, "grad_norm": 1.2443387508392334, "learning_rate": 2.581873821571069e-06, "loss": 0.5644, "num_input_tokens_seen": 134835040, "step": 110875 }, { "epoch": 13.892995865179802, "grad_norm": 3.132551908493042, "learning_rate": 2.5813953109785493e-06, "loss": 0.4377, "num_input_tokens_seen": 134841184, "step": 110880 }, { "epoch": 13.893622353088585, "grad_norm": 1.5890558958053589, "learning_rate": 2.5809168293024845e-06, "loss": 0.57, "num_input_tokens_seen": 134847104, "step": 110885 }, { "epoch": 13.894248840997369, "grad_norm": 1.176589012145996, "learning_rate": 2.580438376548591e-06, "loss": 0.4867, "num_input_tokens_seen": 134853408, "step": 110890 }, { "epoch": 13.894875328906153, "grad_norm": 0.8442320227622986, "learning_rate": 2.5799599527225905e-06, "loss": 0.4824, "num_input_tokens_seen": 134859328, "step": 110895 }, { "epoch": 13.895501816814935, "grad_norm": 1.1206836700439453, "learning_rate": 2.5794815578302057e-06, "loss": 0.4582, "num_input_tokens_seen": 134865632, "step": 110900 }, { "epoch": 13.89612830472372, "grad_norm": 1.9979506731033325, "learning_rate": 2.5790031918771518e-06, "loss": 0.5207, "num_input_tokens_seen": 134871936, "step": 110905 }, { "epoch": 13.896754792632501, "grad_norm": 0.8542848825454712, "learning_rate": 2.578524854869152e-06, "loss": 0.4858, "num_input_tokens_seen": 134877568, "step": 110910 }, { "epoch": 13.897381280541286, "grad_norm": 1.053205132484436, "learning_rate": 2.578046546811922e-06, "loss": 0.4994, "num_input_tokens_seen": 134883712, "step": 110915 }, { "epoch": 13.89800776845007, "grad_norm": 0.7170020937919617, "learning_rate": 2.577568267711183e-06, "loss": 0.4522, "num_input_tokens_seen": 134889920, "step": 110920 }, { "epoch": 13.898634256358852, "grad_norm": 1.0160300731658936, "learning_rate": 2.577090017572651e-06, "loss": 0.4974, "num_input_tokens_seen": 134895968, "step": 110925 }, { "epoch": 13.899260744267636, "grad_norm": 3.4438045024871826, "learning_rate": 2.576611796402046e-06, "loss": 0.4669, "num_input_tokens_seen": 134901952, "step": 110930 }, { "epoch": 13.899887232176418, "grad_norm": 9.084853172302246, "learning_rate": 2.576133604205083e-06, "loss": 0.5597, "num_input_tokens_seen": 134908160, "step": 110935 }, { "epoch": 13.900513720085202, "grad_norm": 1.8205592632293701, "learning_rate": 2.5756554409874823e-06, "loss": 0.5234, "num_input_tokens_seen": 134914272, "step": 110940 }, { "epoch": 13.901140207993986, "grad_norm": 1.014269232749939, "learning_rate": 2.5751773067549566e-06, "loss": 0.4393, "num_input_tokens_seen": 134920288, "step": 110945 }, { "epoch": 13.901766695902769, "grad_norm": 1.1522572040557861, "learning_rate": 2.5746992015132262e-06, "loss": 0.4439, "num_input_tokens_seen": 134926304, "step": 110950 }, { "epoch": 13.902393183811553, "grad_norm": 0.7677949666976929, "learning_rate": 2.5742211252680073e-06, "loss": 0.5178, "num_input_tokens_seen": 134932192, "step": 110955 }, { "epoch": 13.903019671720337, "grad_norm": 2.9103055000305176, "learning_rate": 2.5737430780250123e-06, "loss": 0.4401, "num_input_tokens_seen": 134937920, "step": 110960 }, { "epoch": 13.903646159629119, "grad_norm": 0.9298157095909119, "learning_rate": 2.573265059789959e-06, "loss": 0.4251, "num_input_tokens_seen": 134944000, "step": 110965 }, { "epoch": 13.904272647537903, "grad_norm": 1.9605498313903809, "learning_rate": 2.5727870705685616e-06, "loss": 0.4656, "num_input_tokens_seen": 134950208, "step": 110970 }, { "epoch": 13.904899135446685, "grad_norm": 1.2009650468826294, "learning_rate": 2.5723091103665376e-06, "loss": 0.4338, "num_input_tokens_seen": 134956320, "step": 110975 }, { "epoch": 13.90552562335547, "grad_norm": 2.7623560428619385, "learning_rate": 2.571831179189597e-06, "loss": 0.4923, "num_input_tokens_seen": 134962624, "step": 110980 }, { "epoch": 13.906152111264252, "grad_norm": 8.508816719055176, "learning_rate": 2.571353277043459e-06, "loss": 0.487, "num_input_tokens_seen": 134968896, "step": 110985 }, { "epoch": 13.906778599173036, "grad_norm": 8.084983825683594, "learning_rate": 2.570875403933832e-06, "loss": 0.5184, "num_input_tokens_seen": 134975104, "step": 110990 }, { "epoch": 13.90740508708182, "grad_norm": 2.5421993732452393, "learning_rate": 2.570397559866434e-06, "loss": 0.5103, "num_input_tokens_seen": 134981568, "step": 110995 }, { "epoch": 13.908031574990602, "grad_norm": 1.2134616374969482, "learning_rate": 2.569919744846974e-06, "loss": 0.4861, "num_input_tokens_seen": 134987840, "step": 111000 }, { "epoch": 13.908658062899386, "grad_norm": 3.2103381156921387, "learning_rate": 2.5694419588811674e-06, "loss": 0.4486, "num_input_tokens_seen": 134994304, "step": 111005 }, { "epoch": 13.90928455080817, "grad_norm": 0.9095789790153503, "learning_rate": 2.5689642019747252e-06, "loss": 0.429, "num_input_tokens_seen": 135000768, "step": 111010 }, { "epoch": 13.909911038716952, "grad_norm": 0.931152880191803, "learning_rate": 2.5684864741333593e-06, "loss": 0.4569, "num_input_tokens_seen": 135006976, "step": 111015 }, { "epoch": 13.910537526625737, "grad_norm": 3.038431406021118, "learning_rate": 2.5680087753627838e-06, "loss": 0.5106, "num_input_tokens_seen": 135013344, "step": 111020 }, { "epoch": 13.911164014534519, "grad_norm": 3.599745988845825, "learning_rate": 2.567531105668706e-06, "loss": 0.5275, "num_input_tokens_seen": 135019328, "step": 111025 }, { "epoch": 13.911790502443303, "grad_norm": 1.2175475358963013, "learning_rate": 2.5670534650568406e-06, "loss": 0.573, "num_input_tokens_seen": 135025344, "step": 111030 }, { "epoch": 13.912416990352087, "grad_norm": 0.9532032012939453, "learning_rate": 2.5665758535328956e-06, "loss": 0.4418, "num_input_tokens_seen": 135030784, "step": 111035 }, { "epoch": 13.91304347826087, "grad_norm": 0.8114574551582336, "learning_rate": 2.566098271102583e-06, "loss": 0.4388, "num_input_tokens_seen": 135036928, "step": 111040 }, { "epoch": 13.913669966169653, "grad_norm": 1.0448129177093506, "learning_rate": 2.5656207177716107e-06, "loss": 0.4389, "num_input_tokens_seen": 135043264, "step": 111045 }, { "epoch": 13.914296454078436, "grad_norm": 0.98091059923172, "learning_rate": 2.565143193545691e-06, "loss": 0.4903, "num_input_tokens_seen": 135048832, "step": 111050 }, { "epoch": 13.91492294198722, "grad_norm": 1.0944888591766357, "learning_rate": 2.5646656984305295e-06, "loss": 0.4556, "num_input_tokens_seen": 135054784, "step": 111055 }, { "epoch": 13.915549429896004, "grad_norm": 1.1062384843826294, "learning_rate": 2.5641882324318378e-06, "loss": 0.5142, "num_input_tokens_seen": 135060832, "step": 111060 }, { "epoch": 13.916175917804786, "grad_norm": 1.9898627996444702, "learning_rate": 2.5637107955553254e-06, "loss": 0.451, "num_input_tokens_seen": 135067072, "step": 111065 }, { "epoch": 13.91680240571357, "grad_norm": 2.565140962600708, "learning_rate": 2.5632333878066964e-06, "loss": 0.4885, "num_input_tokens_seen": 135073408, "step": 111070 }, { "epoch": 13.917428893622354, "grad_norm": 0.920317530632019, "learning_rate": 2.562756009191661e-06, "loss": 0.486, "num_input_tokens_seen": 135079392, "step": 111075 }, { "epoch": 13.918055381531136, "grad_norm": 2.368964433670044, "learning_rate": 2.5622786597159264e-06, "loss": 0.5131, "num_input_tokens_seen": 135085696, "step": 111080 }, { "epoch": 13.91868186943992, "grad_norm": 1.175834059715271, "learning_rate": 2.5618013393852027e-06, "loss": 0.4308, "num_input_tokens_seen": 135091744, "step": 111085 }, { "epoch": 13.919308357348703, "grad_norm": 4.897165775299072, "learning_rate": 2.5613240482051915e-06, "loss": 0.4763, "num_input_tokens_seen": 135098016, "step": 111090 }, { "epoch": 13.919934845257487, "grad_norm": 1.1495838165283203, "learning_rate": 2.5608467861816032e-06, "loss": 0.5137, "num_input_tokens_seen": 135104160, "step": 111095 }, { "epoch": 13.920561333166269, "grad_norm": 1.285203218460083, "learning_rate": 2.5603695533201413e-06, "loss": 0.4468, "num_input_tokens_seen": 135109952, "step": 111100 }, { "epoch": 13.921187821075053, "grad_norm": 4.343295097351074, "learning_rate": 2.5598923496265138e-06, "loss": 0.4614, "num_input_tokens_seen": 135115648, "step": 111105 }, { "epoch": 13.921814308983837, "grad_norm": 2.707125425338745, "learning_rate": 2.5594151751064234e-06, "loss": 0.4629, "num_input_tokens_seen": 135121472, "step": 111110 }, { "epoch": 13.92244079689262, "grad_norm": 1.9861847162246704, "learning_rate": 2.5589380297655776e-06, "loss": 0.4663, "num_input_tokens_seen": 135127168, "step": 111115 }, { "epoch": 13.923067284801403, "grad_norm": 1.0495667457580566, "learning_rate": 2.5584609136096784e-06, "loss": 0.4272, "num_input_tokens_seen": 135133152, "step": 111120 }, { "epoch": 13.923693772710187, "grad_norm": 0.9657841920852661, "learning_rate": 2.557983826644434e-06, "loss": 0.4855, "num_input_tokens_seen": 135139296, "step": 111125 }, { "epoch": 13.92432026061897, "grad_norm": 1.205154299736023, "learning_rate": 2.557506768875544e-06, "loss": 0.4672, "num_input_tokens_seen": 135144928, "step": 111130 }, { "epoch": 13.924946748527754, "grad_norm": 4.927030563354492, "learning_rate": 2.5570297403087157e-06, "loss": 0.5324, "num_input_tokens_seen": 135151264, "step": 111135 }, { "epoch": 13.925573236436536, "grad_norm": 2.513587474822998, "learning_rate": 2.556552740949649e-06, "loss": 0.4461, "num_input_tokens_seen": 135157472, "step": 111140 }, { "epoch": 13.92619972434532, "grad_norm": 4.820844650268555, "learning_rate": 2.556075770804049e-06, "loss": 0.4775, "num_input_tokens_seen": 135163136, "step": 111145 }, { "epoch": 13.926826212254104, "grad_norm": 0.6696598529815674, "learning_rate": 2.555598829877619e-06, "loss": 0.4719, "num_input_tokens_seen": 135168416, "step": 111150 }, { "epoch": 13.927452700162887, "grad_norm": 9.78349781036377, "learning_rate": 2.5551219181760588e-06, "loss": 0.4616, "num_input_tokens_seen": 135174784, "step": 111155 }, { "epoch": 13.92807918807167, "grad_norm": 1.0093324184417725, "learning_rate": 2.554645035705074e-06, "loss": 0.5436, "num_input_tokens_seen": 135181344, "step": 111160 }, { "epoch": 13.928705675980453, "grad_norm": 3.66125226020813, "learning_rate": 2.5541681824703612e-06, "loss": 0.4747, "num_input_tokens_seen": 135187104, "step": 111165 }, { "epoch": 13.929332163889237, "grad_norm": 1.224910855293274, "learning_rate": 2.553691358477624e-06, "loss": 0.5035, "num_input_tokens_seen": 135193280, "step": 111170 }, { "epoch": 13.929958651798021, "grad_norm": 1.4434103965759277, "learning_rate": 2.553214563732563e-06, "loss": 0.4953, "num_input_tokens_seen": 135199168, "step": 111175 }, { "epoch": 13.930585139706803, "grad_norm": 0.5962479710578918, "learning_rate": 2.552737798240882e-06, "loss": 0.4582, "num_input_tokens_seen": 135204896, "step": 111180 }, { "epoch": 13.931211627615587, "grad_norm": 0.5448831915855408, "learning_rate": 2.552261062008276e-06, "loss": 0.4793, "num_input_tokens_seen": 135210912, "step": 111185 }, { "epoch": 13.93183811552437, "grad_norm": 5.484479904174805, "learning_rate": 2.5517843550404488e-06, "loss": 0.5591, "num_input_tokens_seen": 135216768, "step": 111190 }, { "epoch": 13.932464603433154, "grad_norm": 2.3955819606781006, "learning_rate": 2.5513076773430956e-06, "loss": 0.4791, "num_input_tokens_seen": 135223072, "step": 111195 }, { "epoch": 13.933091091341938, "grad_norm": 4.815401554107666, "learning_rate": 2.5508310289219184e-06, "loss": 0.4696, "num_input_tokens_seen": 135229248, "step": 111200 }, { "epoch": 13.93371757925072, "grad_norm": 1.0904433727264404, "learning_rate": 2.5503544097826172e-06, "loss": 0.4638, "num_input_tokens_seen": 135235712, "step": 111205 }, { "epoch": 13.934344067159504, "grad_norm": 1.0263845920562744, "learning_rate": 2.549877819930886e-06, "loss": 0.4837, "num_input_tokens_seen": 135242016, "step": 111210 }, { "epoch": 13.934970555068286, "grad_norm": 1.0041553974151611, "learning_rate": 2.5494012593724282e-06, "loss": 0.4474, "num_input_tokens_seen": 135248224, "step": 111215 }, { "epoch": 13.93559704297707, "grad_norm": 5.8126373291015625, "learning_rate": 2.5489247281129364e-06, "loss": 0.4615, "num_input_tokens_seen": 135254208, "step": 111220 }, { "epoch": 13.936223530885854, "grad_norm": 1.0648837089538574, "learning_rate": 2.548448226158112e-06, "loss": 0.4851, "num_input_tokens_seen": 135260672, "step": 111225 }, { "epoch": 13.936850018794637, "grad_norm": 1.4253008365631104, "learning_rate": 2.5479717535136487e-06, "loss": 0.4606, "num_input_tokens_seen": 135266464, "step": 111230 }, { "epoch": 13.93747650670342, "grad_norm": 0.9530190229415894, "learning_rate": 2.547495310185246e-06, "loss": 0.4867, "num_input_tokens_seen": 135272608, "step": 111235 }, { "epoch": 13.938102994612205, "grad_norm": 0.9895336031913757, "learning_rate": 2.547018896178598e-06, "loss": 0.4284, "num_input_tokens_seen": 135278656, "step": 111240 }, { "epoch": 13.938729482520987, "grad_norm": 1.124840497970581, "learning_rate": 2.5465425114994024e-06, "loss": 0.4434, "num_input_tokens_seen": 135284352, "step": 111245 }, { "epoch": 13.939355970429771, "grad_norm": 0.8014840483665466, "learning_rate": 2.5460661561533516e-06, "loss": 0.4484, "num_input_tokens_seen": 135290592, "step": 111250 }, { "epoch": 13.939982458338553, "grad_norm": 5.732104778289795, "learning_rate": 2.5455898301461458e-06, "loss": 0.4745, "num_input_tokens_seen": 135296544, "step": 111255 }, { "epoch": 13.940608946247337, "grad_norm": 2.5320217609405518, "learning_rate": 2.5451135334834752e-06, "loss": 0.5153, "num_input_tokens_seen": 135302496, "step": 111260 }, { "epoch": 13.941235434156122, "grad_norm": 5.752190113067627, "learning_rate": 2.5446372661710363e-06, "loss": 0.4787, "num_input_tokens_seen": 135308448, "step": 111265 }, { "epoch": 13.941861922064904, "grad_norm": 0.9018705487251282, "learning_rate": 2.5441610282145223e-06, "loss": 0.5032, "num_input_tokens_seen": 135314656, "step": 111270 }, { "epoch": 13.942488409973688, "grad_norm": 9.820638656616211, "learning_rate": 2.543684819619631e-06, "loss": 0.5315, "num_input_tokens_seen": 135320352, "step": 111275 }, { "epoch": 13.94311489788247, "grad_norm": 1.0776689052581787, "learning_rate": 2.5432086403920497e-06, "loss": 0.4736, "num_input_tokens_seen": 135326208, "step": 111280 }, { "epoch": 13.943741385791254, "grad_norm": 0.8350073099136353, "learning_rate": 2.5427324905374756e-06, "loss": 0.4727, "num_input_tokens_seen": 135332288, "step": 111285 }, { "epoch": 13.944367873700038, "grad_norm": 1.1092437505722046, "learning_rate": 2.5422563700616023e-06, "loss": 0.4417, "num_input_tokens_seen": 135338080, "step": 111290 }, { "epoch": 13.94499436160882, "grad_norm": 5.161988735198975, "learning_rate": 2.5417802789701185e-06, "loss": 0.4879, "num_input_tokens_seen": 135343968, "step": 111295 }, { "epoch": 13.945620849517605, "grad_norm": 9.999858856201172, "learning_rate": 2.5413042172687196e-06, "loss": 0.5612, "num_input_tokens_seen": 135350368, "step": 111300 }, { "epoch": 13.946247337426387, "grad_norm": 1.1207104921340942, "learning_rate": 2.540828184963095e-06, "loss": 0.4801, "num_input_tokens_seen": 135356800, "step": 111305 }, { "epoch": 13.946873825335171, "grad_norm": 0.9229699969291687, "learning_rate": 2.5403521820589385e-06, "loss": 0.5166, "num_input_tokens_seen": 135363136, "step": 111310 }, { "epoch": 13.947500313243955, "grad_norm": 0.7501866817474365, "learning_rate": 2.539876208561938e-06, "loss": 0.4516, "num_input_tokens_seen": 135368704, "step": 111315 }, { "epoch": 13.948126801152737, "grad_norm": 3.45721173286438, "learning_rate": 2.539400264477787e-06, "loss": 0.48, "num_input_tokens_seen": 135374784, "step": 111320 }, { "epoch": 13.948753289061521, "grad_norm": 6.593519687652588, "learning_rate": 2.538924349812173e-06, "loss": 0.5883, "num_input_tokens_seen": 135380800, "step": 111325 }, { "epoch": 13.949379776970304, "grad_norm": 1.087281346321106, "learning_rate": 2.5384484645707873e-06, "loss": 0.456, "num_input_tokens_seen": 135386848, "step": 111330 }, { "epoch": 13.950006264879088, "grad_norm": 3.653489589691162, "learning_rate": 2.5379726087593225e-06, "loss": 0.4649, "num_input_tokens_seen": 135393088, "step": 111335 }, { "epoch": 13.950632752787872, "grad_norm": 2.745936870574951, "learning_rate": 2.5374967823834622e-06, "loss": 0.4574, "num_input_tokens_seen": 135399456, "step": 111340 }, { "epoch": 13.951259240696654, "grad_norm": 5.0880537033081055, "learning_rate": 2.537020985448901e-06, "loss": 0.4914, "num_input_tokens_seen": 135405536, "step": 111345 }, { "epoch": 13.951885728605438, "grad_norm": 0.7199167609214783, "learning_rate": 2.5365452179613225e-06, "loss": 0.4618, "num_input_tokens_seen": 135411584, "step": 111350 }, { "epoch": 13.952512216514222, "grad_norm": 0.7824429273605347, "learning_rate": 2.5360694799264187e-06, "loss": 0.4544, "num_input_tokens_seen": 135417728, "step": 111355 }, { "epoch": 13.953138704423004, "grad_norm": 3.958369255065918, "learning_rate": 2.535593771349874e-06, "loss": 0.4518, "num_input_tokens_seen": 135423840, "step": 111360 }, { "epoch": 13.953765192331788, "grad_norm": 6.67022705078125, "learning_rate": 2.5351180922373798e-06, "loss": 0.5006, "num_input_tokens_seen": 135430304, "step": 111365 }, { "epoch": 13.95439168024057, "grad_norm": 0.7448890209197998, "learning_rate": 2.534642442594619e-06, "loss": 0.5098, "num_input_tokens_seen": 135436128, "step": 111370 }, { "epoch": 13.955018168149355, "grad_norm": 1.7564289569854736, "learning_rate": 2.534166822427281e-06, "loss": 0.5036, "num_input_tokens_seen": 135442240, "step": 111375 }, { "epoch": 13.955644656058139, "grad_norm": 2.862241506576538, "learning_rate": 2.533691231741051e-06, "loss": 0.479, "num_input_tokens_seen": 135448064, "step": 111380 }, { "epoch": 13.956271143966921, "grad_norm": 1.3496193885803223, "learning_rate": 2.5332156705416166e-06, "loss": 0.4515, "num_input_tokens_seen": 135454208, "step": 111385 }, { "epoch": 13.956897631875705, "grad_norm": 0.8783047199249268, "learning_rate": 2.532740138834665e-06, "loss": 0.4396, "num_input_tokens_seen": 135460256, "step": 111390 }, { "epoch": 13.957524119784487, "grad_norm": 0.8665643334388733, "learning_rate": 2.5322646366258764e-06, "loss": 0.4429, "num_input_tokens_seen": 135466624, "step": 111395 }, { "epoch": 13.958150607693272, "grad_norm": 1.4338780641555786, "learning_rate": 2.5317891639209415e-06, "loss": 0.4456, "num_input_tokens_seen": 135472896, "step": 111400 }, { "epoch": 13.958777095602056, "grad_norm": 6.049811363220215, "learning_rate": 2.53131372072554e-06, "loss": 0.5015, "num_input_tokens_seen": 135478720, "step": 111405 }, { "epoch": 13.959403583510838, "grad_norm": 0.9650844931602478, "learning_rate": 2.5308383070453613e-06, "loss": 0.4953, "num_input_tokens_seen": 135484768, "step": 111410 }, { "epoch": 13.960030071419622, "grad_norm": 7.831123352050781, "learning_rate": 2.5303629228860836e-06, "loss": 0.5217, "num_input_tokens_seen": 135490304, "step": 111415 }, { "epoch": 13.960656559328404, "grad_norm": 5.760173320770264, "learning_rate": 2.529887568253396e-06, "loss": 0.4434, "num_input_tokens_seen": 135496480, "step": 111420 }, { "epoch": 13.961283047237188, "grad_norm": 8.196053504943848, "learning_rate": 2.529412243152978e-06, "loss": 0.5229, "num_input_tokens_seen": 135502880, "step": 111425 }, { "epoch": 13.961909535145972, "grad_norm": 2.531789541244507, "learning_rate": 2.5289369475905144e-06, "loss": 0.4621, "num_input_tokens_seen": 135509024, "step": 111430 }, { "epoch": 13.962536023054755, "grad_norm": 1.0472322702407837, "learning_rate": 2.5284616815716857e-06, "loss": 0.4504, "num_input_tokens_seen": 135515168, "step": 111435 }, { "epoch": 13.963162510963539, "grad_norm": 0.6228255033493042, "learning_rate": 2.527986445102177e-06, "loss": 0.5064, "num_input_tokens_seen": 135520640, "step": 111440 }, { "epoch": 13.963788998872321, "grad_norm": 1.2648073434829712, "learning_rate": 2.527511238187668e-06, "loss": 0.467, "num_input_tokens_seen": 135526784, "step": 111445 }, { "epoch": 13.964415486781105, "grad_norm": 0.7677726745605469, "learning_rate": 2.5270360608338405e-06, "loss": 0.4791, "num_input_tokens_seen": 135532928, "step": 111450 }, { "epoch": 13.965041974689889, "grad_norm": 2.731100082397461, "learning_rate": 2.5265609130463777e-06, "loss": 0.4759, "num_input_tokens_seen": 135538720, "step": 111455 }, { "epoch": 13.965668462598671, "grad_norm": 1.2310824394226074, "learning_rate": 2.526085794830957e-06, "loss": 0.4765, "num_input_tokens_seen": 135544544, "step": 111460 }, { "epoch": 13.966294950507455, "grad_norm": 0.7241401672363281, "learning_rate": 2.525610706193262e-06, "loss": 0.5426, "num_input_tokens_seen": 135550784, "step": 111465 }, { "epoch": 13.96692143841624, "grad_norm": 1.2843598127365112, "learning_rate": 2.5251356471389697e-06, "loss": 0.4607, "num_input_tokens_seen": 135556608, "step": 111470 }, { "epoch": 13.967547926325022, "grad_norm": 1.1547666788101196, "learning_rate": 2.524660617673762e-06, "loss": 0.4487, "num_input_tokens_seen": 135562752, "step": 111475 }, { "epoch": 13.968174414233806, "grad_norm": 1.0855767726898193, "learning_rate": 2.5241856178033186e-06, "loss": 0.4638, "num_input_tokens_seen": 135568608, "step": 111480 }, { "epoch": 13.968800902142588, "grad_norm": 0.9550610780715942, "learning_rate": 2.5237106475333168e-06, "loss": 0.4835, "num_input_tokens_seen": 135574688, "step": 111485 }, { "epoch": 13.969427390051372, "grad_norm": 5.0199875831604, "learning_rate": 2.5232357068694357e-06, "loss": 0.4908, "num_input_tokens_seen": 135581024, "step": 111490 }, { "epoch": 13.970053877960156, "grad_norm": 1.1873890161514282, "learning_rate": 2.5227607958173563e-06, "loss": 0.501, "num_input_tokens_seen": 135587136, "step": 111495 }, { "epoch": 13.970680365868938, "grad_norm": 1.11273193359375, "learning_rate": 2.5222859143827517e-06, "loss": 0.4394, "num_input_tokens_seen": 135593664, "step": 111500 }, { "epoch": 13.971306853777723, "grad_norm": 5.31639289855957, "learning_rate": 2.521811062571305e-06, "loss": 0.4755, "num_input_tokens_seen": 135599968, "step": 111505 }, { "epoch": 13.971933341686505, "grad_norm": 0.92317134141922, "learning_rate": 2.5213362403886882e-06, "loss": 0.4348, "num_input_tokens_seen": 135605856, "step": 111510 }, { "epoch": 13.972559829595289, "grad_norm": 2.9821202754974365, "learning_rate": 2.52086144784058e-06, "loss": 0.488, "num_input_tokens_seen": 135612160, "step": 111515 }, { "epoch": 13.973186317504073, "grad_norm": 5.995126247406006, "learning_rate": 2.520386684932661e-06, "loss": 0.4841, "num_input_tokens_seen": 135618592, "step": 111520 }, { "epoch": 13.973812805412855, "grad_norm": 6.311069011688232, "learning_rate": 2.5199119516706007e-06, "loss": 0.5164, "num_input_tokens_seen": 135624896, "step": 111525 }, { "epoch": 13.97443929332164, "grad_norm": 2.521395206451416, "learning_rate": 2.5194372480600805e-06, "loss": 0.503, "num_input_tokens_seen": 135630848, "step": 111530 }, { "epoch": 13.975065781230422, "grad_norm": 1.21089506149292, "learning_rate": 2.5189625741067714e-06, "loss": 0.449, "num_input_tokens_seen": 135636960, "step": 111535 }, { "epoch": 13.975692269139206, "grad_norm": 2.071605920791626, "learning_rate": 2.518487929816353e-06, "loss": 0.4755, "num_input_tokens_seen": 135643200, "step": 111540 }, { "epoch": 13.97631875704799, "grad_norm": 1.2072994709014893, "learning_rate": 2.5180133151944953e-06, "loss": 0.4405, "num_input_tokens_seen": 135649312, "step": 111545 }, { "epoch": 13.976945244956772, "grad_norm": 1.1858264207839966, "learning_rate": 2.517538730246878e-06, "loss": 0.4811, "num_input_tokens_seen": 135655712, "step": 111550 }, { "epoch": 13.977571732865556, "grad_norm": 1.158686876296997, "learning_rate": 2.517064174979169e-06, "loss": 0.4393, "num_input_tokens_seen": 135661760, "step": 111555 }, { "epoch": 13.978198220774338, "grad_norm": 1.186769962310791, "learning_rate": 2.516589649397048e-06, "loss": 0.4519, "num_input_tokens_seen": 135668192, "step": 111560 }, { "epoch": 13.978824708683122, "grad_norm": 1.0431545972824097, "learning_rate": 2.516115153506184e-06, "loss": 0.4741, "num_input_tokens_seen": 135674208, "step": 111565 }, { "epoch": 13.979451196591906, "grad_norm": 11.011514663696289, "learning_rate": 2.515640687312252e-06, "loss": 0.5237, "num_input_tokens_seen": 135680256, "step": 111570 }, { "epoch": 13.980077684500689, "grad_norm": 9.533503532409668, "learning_rate": 2.515166250820925e-06, "loss": 0.4669, "num_input_tokens_seen": 135686144, "step": 111575 }, { "epoch": 13.980704172409473, "grad_norm": 1.5886259078979492, "learning_rate": 2.5146918440378733e-06, "loss": 0.4976, "num_input_tokens_seen": 135692448, "step": 111580 }, { "epoch": 13.981330660318257, "grad_norm": 1.1249737739562988, "learning_rate": 2.5142174669687703e-06, "loss": 0.4733, "num_input_tokens_seen": 135698240, "step": 111585 }, { "epoch": 13.981957148227039, "grad_norm": 2.704813003540039, "learning_rate": 2.5137431196192874e-06, "loss": 0.5594, "num_input_tokens_seen": 135704352, "step": 111590 }, { "epoch": 13.982583636135823, "grad_norm": 1.3228800296783447, "learning_rate": 2.513268801995098e-06, "loss": 0.464, "num_input_tokens_seen": 135710624, "step": 111595 }, { "epoch": 13.983210124044605, "grad_norm": 0.8375871181488037, "learning_rate": 2.5127945141018685e-06, "loss": 0.4505, "num_input_tokens_seen": 135716384, "step": 111600 }, { "epoch": 13.98383661195339, "grad_norm": 0.9422481060028076, "learning_rate": 2.512320255945273e-06, "loss": 0.5035, "num_input_tokens_seen": 135722560, "step": 111605 }, { "epoch": 13.984463099862172, "grad_norm": 1.1605162620544434, "learning_rate": 2.5118460275309796e-06, "loss": 0.4694, "num_input_tokens_seen": 135728352, "step": 111610 }, { "epoch": 13.985089587770956, "grad_norm": 1.1744954586029053, "learning_rate": 2.5113718288646606e-06, "loss": 0.4608, "num_input_tokens_seen": 135734624, "step": 111615 }, { "epoch": 13.98571607567974, "grad_norm": 5.969718933105469, "learning_rate": 2.510897659951981e-06, "loss": 0.4915, "num_input_tokens_seen": 135740832, "step": 111620 }, { "epoch": 13.986342563588522, "grad_norm": 1.1876806020736694, "learning_rate": 2.5104235207986148e-06, "loss": 0.4396, "num_input_tokens_seen": 135746912, "step": 111625 }, { "epoch": 13.986969051497306, "grad_norm": 0.8857967853546143, "learning_rate": 2.5099494114102275e-06, "loss": 0.4816, "num_input_tokens_seen": 135752960, "step": 111630 }, { "epoch": 13.98759553940609, "grad_norm": 1.5120656490325928, "learning_rate": 2.5094753317924877e-06, "loss": 0.4358, "num_input_tokens_seen": 135758976, "step": 111635 }, { "epoch": 13.988222027314873, "grad_norm": 1.4396981000900269, "learning_rate": 2.5090012819510663e-06, "loss": 0.4433, "num_input_tokens_seen": 135765408, "step": 111640 }, { "epoch": 13.988848515223657, "grad_norm": 8.723686218261719, "learning_rate": 2.5085272618916266e-06, "loss": 0.4892, "num_input_tokens_seen": 135771648, "step": 111645 }, { "epoch": 13.989475003132439, "grad_norm": 3.2877249717712402, "learning_rate": 2.5080532716198403e-06, "loss": 0.4627, "num_input_tokens_seen": 135777824, "step": 111650 }, { "epoch": 13.990101491041223, "grad_norm": 1.124875783920288, "learning_rate": 2.5075793111413706e-06, "loss": 0.5006, "num_input_tokens_seen": 135784160, "step": 111655 }, { "epoch": 13.990727978950007, "grad_norm": 4.549171447753906, "learning_rate": 2.5071053804618876e-06, "loss": 0.5079, "num_input_tokens_seen": 135790336, "step": 111660 }, { "epoch": 13.99135446685879, "grad_norm": 1.3669018745422363, "learning_rate": 2.5066314795870538e-06, "loss": 0.4353, "num_input_tokens_seen": 135796448, "step": 111665 }, { "epoch": 13.991980954767573, "grad_norm": 2.765291452407837, "learning_rate": 2.506157608522539e-06, "loss": 0.466, "num_input_tokens_seen": 135802624, "step": 111670 }, { "epoch": 13.992607442676356, "grad_norm": 0.9790638089179993, "learning_rate": 2.5056837672740042e-06, "loss": 0.4432, "num_input_tokens_seen": 135808512, "step": 111675 }, { "epoch": 13.99323393058514, "grad_norm": 1.270536184310913, "learning_rate": 2.5052099558471176e-06, "loss": 0.4401, "num_input_tokens_seen": 135814048, "step": 111680 }, { "epoch": 13.993860418493924, "grad_norm": 1.1690422296524048, "learning_rate": 2.5047361742475433e-06, "loss": 0.5051, "num_input_tokens_seen": 135819968, "step": 111685 }, { "epoch": 13.994486906402706, "grad_norm": 1.375241994857788, "learning_rate": 2.504262422480948e-06, "loss": 0.4466, "num_input_tokens_seen": 135825952, "step": 111690 }, { "epoch": 13.99511339431149, "grad_norm": 1.3148068189620972, "learning_rate": 2.5037887005529915e-06, "loss": 0.4932, "num_input_tokens_seen": 135832384, "step": 111695 }, { "epoch": 13.995739882220272, "grad_norm": 3.2344493865966797, "learning_rate": 2.50331500846934e-06, "loss": 0.5017, "num_input_tokens_seen": 135838464, "step": 111700 }, { "epoch": 13.996366370129056, "grad_norm": 4.816258907318115, "learning_rate": 2.5028413462356592e-06, "loss": 0.5186, "num_input_tokens_seen": 135844352, "step": 111705 }, { "epoch": 13.99699285803784, "grad_norm": 4.939507007598877, "learning_rate": 2.502367713857607e-06, "loss": 0.5347, "num_input_tokens_seen": 135850400, "step": 111710 }, { "epoch": 13.997619345946623, "grad_norm": 1.2672845125198364, "learning_rate": 2.5018941113408508e-06, "loss": 0.5195, "num_input_tokens_seen": 135856576, "step": 111715 }, { "epoch": 13.998245833855407, "grad_norm": 3.4908268451690674, "learning_rate": 2.5014205386910485e-06, "loss": 0.5062, "num_input_tokens_seen": 135862464, "step": 111720 }, { "epoch": 13.998872321764189, "grad_norm": 6.113173961639404, "learning_rate": 2.5009469959138668e-06, "loss": 0.4651, "num_input_tokens_seen": 135868416, "step": 111725 }, { "epoch": 13.999498809672973, "grad_norm": 1.1079139709472656, "learning_rate": 2.5004734830149625e-06, "loss": 0.4518, "num_input_tokens_seen": 135874592, "step": 111730 }, { "epoch": 14.0, "eval_loss": 0.4826337695121765, "eval_runtime": 223.5725, "eval_samples_per_second": 35.698, "eval_steps_per_second": 8.928, "num_input_tokens_seen": 135879680, "step": 111734 }, { "epoch": 14.000125297581757, "grad_norm": 4.020562171936035, "learning_rate": 2.5000000000000015e-06, "loss": 0.4512, "num_input_tokens_seen": 135880864, "step": 111735 }, { "epoch": 14.00075178549054, "grad_norm": 1.8430187702178955, "learning_rate": 2.4995265468746398e-06, "loss": 0.4801, "num_input_tokens_seen": 135887168, "step": 111740 }, { "epoch": 14.001378273399324, "grad_norm": 1.4009138345718384, "learning_rate": 2.499053123644543e-06, "loss": 0.4313, "num_input_tokens_seen": 135893184, "step": 111745 }, { "epoch": 14.002004761308108, "grad_norm": 0.9661164879798889, "learning_rate": 2.498579730315367e-06, "loss": 0.484, "num_input_tokens_seen": 135899296, "step": 111750 }, { "epoch": 14.00263124921689, "grad_norm": 5.121647834777832, "learning_rate": 2.4981063668927747e-06, "loss": 0.4736, "num_input_tokens_seen": 135905760, "step": 111755 }, { "epoch": 14.003257737125674, "grad_norm": 6.440863132476807, "learning_rate": 2.497633033382423e-06, "loss": 0.4925, "num_input_tokens_seen": 135912000, "step": 111760 }, { "epoch": 14.003884225034456, "grad_norm": 0.9332125186920166, "learning_rate": 2.497159729789972e-06, "loss": 0.4331, "num_input_tokens_seen": 135918112, "step": 111765 }, { "epoch": 14.00451071294324, "grad_norm": 13.28355884552002, "learning_rate": 2.4966864561210823e-06, "loss": 0.5513, "num_input_tokens_seen": 135924064, "step": 111770 }, { "epoch": 14.005137200852024, "grad_norm": 0.4776444435119629, "learning_rate": 2.496213212381409e-06, "loss": 0.4374, "num_input_tokens_seen": 135929216, "step": 111775 }, { "epoch": 14.005763688760807, "grad_norm": 4.239932537078857, "learning_rate": 2.495739998576614e-06, "loss": 0.4556, "num_input_tokens_seen": 135934560, "step": 111780 }, { "epoch": 14.00639017666959, "grad_norm": 0.8222007751464844, "learning_rate": 2.4952668147123506e-06, "loss": 0.4344, "num_input_tokens_seen": 135940384, "step": 111785 }, { "epoch": 14.007016664578373, "grad_norm": 1.4913928508758545, "learning_rate": 2.4947936607942786e-06, "loss": 0.4591, "num_input_tokens_seen": 135946624, "step": 111790 }, { "epoch": 14.007643152487157, "grad_norm": 1.0625993013381958, "learning_rate": 2.4943205368280543e-06, "loss": 0.4932, "num_input_tokens_seen": 135952800, "step": 111795 }, { "epoch": 14.008269640395941, "grad_norm": 3.7388908863067627, "learning_rate": 2.4938474428193363e-06, "loss": 0.4589, "num_input_tokens_seen": 135958816, "step": 111800 }, { "epoch": 14.008896128304723, "grad_norm": 1.1637279987335205, "learning_rate": 2.493374378773778e-06, "loss": 0.473, "num_input_tokens_seen": 135964928, "step": 111805 }, { "epoch": 14.009522616213507, "grad_norm": 1.2073633670806885, "learning_rate": 2.492901344697038e-06, "loss": 0.4304, "num_input_tokens_seen": 135971040, "step": 111810 }, { "epoch": 14.01014910412229, "grad_norm": 1.066429853439331, "learning_rate": 2.492428340594768e-06, "loss": 0.4463, "num_input_tokens_seen": 135977056, "step": 111815 }, { "epoch": 14.010775592031074, "grad_norm": 1.1525486707687378, "learning_rate": 2.491955366472626e-06, "loss": 0.4997, "num_input_tokens_seen": 135982976, "step": 111820 }, { "epoch": 14.011402079939858, "grad_norm": 1.2879287004470825, "learning_rate": 2.4914824223362676e-06, "loss": 0.5162, "num_input_tokens_seen": 135988992, "step": 111825 }, { "epoch": 14.01202856784864, "grad_norm": 1.1988500356674194, "learning_rate": 2.4910095081913445e-06, "loss": 0.497, "num_input_tokens_seen": 135994944, "step": 111830 }, { "epoch": 14.012655055757424, "grad_norm": 1.0315343141555786, "learning_rate": 2.4905366240435138e-06, "loss": 0.49, "num_input_tokens_seen": 136000832, "step": 111835 }, { "epoch": 14.013281543666206, "grad_norm": 2.31506609916687, "learning_rate": 2.490063769898426e-06, "loss": 0.4789, "num_input_tokens_seen": 136007104, "step": 111840 }, { "epoch": 14.01390803157499, "grad_norm": 2.3652050495147705, "learning_rate": 2.489590945761738e-06, "loss": 0.4777, "num_input_tokens_seen": 136013216, "step": 111845 }, { "epoch": 14.014534519483774, "grad_norm": 1.1497973203659058, "learning_rate": 2.489118151639098e-06, "loss": 0.4236, "num_input_tokens_seen": 136019552, "step": 111850 }, { "epoch": 14.015161007392557, "grad_norm": 7.698460102081299, "learning_rate": 2.4886453875361644e-06, "loss": 0.46, "num_input_tokens_seen": 136025472, "step": 111855 }, { "epoch": 14.01578749530134, "grad_norm": 0.8164559006690979, "learning_rate": 2.488172653458585e-06, "loss": 0.4486, "num_input_tokens_seen": 136031616, "step": 111860 }, { "epoch": 14.016413983210125, "grad_norm": 1.0587124824523926, "learning_rate": 2.4876999494120142e-06, "loss": 0.4768, "num_input_tokens_seen": 136036896, "step": 111865 }, { "epoch": 14.017040471118907, "grad_norm": 7.537003517150879, "learning_rate": 2.4872272754021014e-06, "loss": 0.4952, "num_input_tokens_seen": 136042976, "step": 111870 }, { "epoch": 14.017666959027691, "grad_norm": 2.712191343307495, "learning_rate": 2.4867546314345007e-06, "loss": 0.459, "num_input_tokens_seen": 136049216, "step": 111875 }, { "epoch": 14.018293446936474, "grad_norm": 1.0021533966064453, "learning_rate": 2.4862820175148606e-06, "loss": 0.518, "num_input_tokens_seen": 136055232, "step": 111880 }, { "epoch": 14.018919934845258, "grad_norm": 1.1511898040771484, "learning_rate": 2.4858094336488313e-06, "loss": 0.43, "num_input_tokens_seen": 136061440, "step": 111885 }, { "epoch": 14.019546422754042, "grad_norm": 0.8915131092071533, "learning_rate": 2.4853368798420645e-06, "loss": 0.4844, "num_input_tokens_seen": 136067552, "step": 111890 }, { "epoch": 14.020172910662824, "grad_norm": 8.656770706176758, "learning_rate": 2.484864356100211e-06, "loss": 0.5038, "num_input_tokens_seen": 136073920, "step": 111895 }, { "epoch": 14.020799398571608, "grad_norm": 0.8868156671524048, "learning_rate": 2.484391862428917e-06, "loss": 0.421, "num_input_tokens_seen": 136080352, "step": 111900 }, { "epoch": 14.02142588648039, "grad_norm": 4.6627678871154785, "learning_rate": 2.4839193988338335e-06, "loss": 0.474, "num_input_tokens_seen": 136086432, "step": 111905 }, { "epoch": 14.022052374389174, "grad_norm": 4.042056083679199, "learning_rate": 2.483446965320611e-06, "loss": 0.4921, "num_input_tokens_seen": 136092800, "step": 111910 }, { "epoch": 14.022678862297958, "grad_norm": 1.155827522277832, "learning_rate": 2.4829745618948934e-06, "loss": 0.4576, "num_input_tokens_seen": 136099136, "step": 111915 }, { "epoch": 14.02330535020674, "grad_norm": 6.3573899269104, "learning_rate": 2.4825021885623336e-06, "loss": 0.5037, "num_input_tokens_seen": 136105344, "step": 111920 }, { "epoch": 14.023931838115525, "grad_norm": 7.629158973693848, "learning_rate": 2.4820298453285745e-06, "loss": 0.521, "num_input_tokens_seen": 136110752, "step": 111925 }, { "epoch": 14.024558326024307, "grad_norm": 0.9683946371078491, "learning_rate": 2.4815575321992672e-06, "loss": 0.4915, "num_input_tokens_seen": 136116960, "step": 111930 }, { "epoch": 14.025184813933091, "grad_norm": 1.3400214910507202, "learning_rate": 2.481085249180056e-06, "loss": 0.4202, "num_input_tokens_seen": 136123296, "step": 111935 }, { "epoch": 14.025811301841875, "grad_norm": 1.3037092685699463, "learning_rate": 2.4806129962765894e-06, "loss": 0.5241, "num_input_tokens_seen": 136129408, "step": 111940 }, { "epoch": 14.026437789750657, "grad_norm": 11.79175853729248, "learning_rate": 2.4801407734945116e-06, "loss": 0.4991, "num_input_tokens_seen": 136135136, "step": 111945 }, { "epoch": 14.027064277659441, "grad_norm": 3.826469898223877, "learning_rate": 2.4796685808394686e-06, "loss": 0.4521, "num_input_tokens_seen": 136141088, "step": 111950 }, { "epoch": 14.027690765568224, "grad_norm": 1.1250511407852173, "learning_rate": 2.479196418317109e-06, "loss": 0.4183, "num_input_tokens_seen": 136147328, "step": 111955 }, { "epoch": 14.028317253477008, "grad_norm": 8.217988014221191, "learning_rate": 2.4787242859330733e-06, "loss": 0.5372, "num_input_tokens_seen": 136153440, "step": 111960 }, { "epoch": 14.028943741385792, "grad_norm": 3.512688398361206, "learning_rate": 2.4782521836930114e-06, "loss": 0.478, "num_input_tokens_seen": 136159840, "step": 111965 }, { "epoch": 14.029570229294574, "grad_norm": 1.282395601272583, "learning_rate": 2.4777801116025616e-06, "loss": 0.4335, "num_input_tokens_seen": 136165600, "step": 111970 }, { "epoch": 14.030196717203358, "grad_norm": 1.886823058128357, "learning_rate": 2.4773080696673742e-06, "loss": 0.4709, "num_input_tokens_seen": 136171744, "step": 111975 }, { "epoch": 14.030823205112142, "grad_norm": 7.552643299102783, "learning_rate": 2.4768360578930874e-06, "loss": 0.5248, "num_input_tokens_seen": 136177984, "step": 111980 }, { "epoch": 14.031449693020924, "grad_norm": 8.089151382446289, "learning_rate": 2.4763640762853487e-06, "loss": 0.4748, "num_input_tokens_seen": 136184192, "step": 111985 }, { "epoch": 14.032076180929709, "grad_norm": 1.265497088432312, "learning_rate": 2.4758921248497973e-06, "loss": 0.4637, "num_input_tokens_seen": 136190272, "step": 111990 }, { "epoch": 14.03270266883849, "grad_norm": 7.310914993286133, "learning_rate": 2.4754202035920775e-06, "loss": 0.4766, "num_input_tokens_seen": 136196576, "step": 111995 }, { "epoch": 14.033329156747275, "grad_norm": 1.2669388055801392, "learning_rate": 2.474948312517832e-06, "loss": 0.4466, "num_input_tokens_seen": 136202752, "step": 112000 }, { "epoch": 14.033955644656059, "grad_norm": 1.6926932334899902, "learning_rate": 2.4744764516327023e-06, "loss": 0.4418, "num_input_tokens_seen": 136209024, "step": 112005 }, { "epoch": 14.034582132564841, "grad_norm": 7.76718282699585, "learning_rate": 2.4740046209423324e-06, "loss": 0.5396, "num_input_tokens_seen": 136214496, "step": 112010 }, { "epoch": 14.035208620473625, "grad_norm": 1.5641441345214844, "learning_rate": 2.4735328204523585e-06, "loss": 0.4563, "num_input_tokens_seen": 136220640, "step": 112015 }, { "epoch": 14.035835108382408, "grad_norm": 0.9149344563484192, "learning_rate": 2.473061050168426e-06, "loss": 0.4228, "num_input_tokens_seen": 136226880, "step": 112020 }, { "epoch": 14.036461596291192, "grad_norm": 8.146920204162598, "learning_rate": 2.472589310096171e-06, "loss": 0.4633, "num_input_tokens_seen": 136233216, "step": 112025 }, { "epoch": 14.037088084199976, "grad_norm": 13.797798156738281, "learning_rate": 2.472117600241238e-06, "loss": 0.5558, "num_input_tokens_seen": 136239680, "step": 112030 }, { "epoch": 14.037714572108758, "grad_norm": 1.5688886642456055, "learning_rate": 2.4716459206092626e-06, "loss": 0.4373, "num_input_tokens_seen": 136245664, "step": 112035 }, { "epoch": 14.038341060017542, "grad_norm": 5.70811128616333, "learning_rate": 2.471174271205887e-06, "loss": 0.4605, "num_input_tokens_seen": 136251424, "step": 112040 }, { "epoch": 14.038967547926324, "grad_norm": 3.1140618324279785, "learning_rate": 2.470702652036748e-06, "loss": 0.4728, "num_input_tokens_seen": 136257472, "step": 112045 }, { "epoch": 14.039594035835108, "grad_norm": 8.71827507019043, "learning_rate": 2.470231063107487e-06, "loss": 0.5696, "num_input_tokens_seen": 136263808, "step": 112050 }, { "epoch": 14.040220523743892, "grad_norm": 1.157184362411499, "learning_rate": 2.469759504423739e-06, "loss": 0.5574, "num_input_tokens_seen": 136270048, "step": 112055 }, { "epoch": 14.040847011652675, "grad_norm": 10.917421340942383, "learning_rate": 2.469287975991145e-06, "loss": 0.4762, "num_input_tokens_seen": 136276288, "step": 112060 }, { "epoch": 14.041473499561459, "grad_norm": 1.780086636543274, "learning_rate": 2.4688164778153386e-06, "loss": 0.4303, "num_input_tokens_seen": 136282528, "step": 112065 }, { "epoch": 14.042099987470241, "grad_norm": 5.7332916259765625, "learning_rate": 2.4683450099019595e-06, "loss": 0.4526, "num_input_tokens_seen": 136288640, "step": 112070 }, { "epoch": 14.042726475379025, "grad_norm": 2.105536460876465, "learning_rate": 2.4678735722566467e-06, "loss": 0.4439, "num_input_tokens_seen": 136294880, "step": 112075 }, { "epoch": 14.04335296328781, "grad_norm": 4.908353328704834, "learning_rate": 2.4674021648850322e-06, "loss": 0.5116, "num_input_tokens_seen": 136301024, "step": 112080 }, { "epoch": 14.043979451196591, "grad_norm": 1.6094799041748047, "learning_rate": 2.4669307877927555e-06, "loss": 0.4859, "num_input_tokens_seen": 136307104, "step": 112085 }, { "epoch": 14.044605939105375, "grad_norm": 2.32243013381958, "learning_rate": 2.46645944098545e-06, "loss": 0.4822, "num_input_tokens_seen": 136313120, "step": 112090 }, { "epoch": 14.045232427014158, "grad_norm": 2.421124219894409, "learning_rate": 2.4659881244687513e-06, "loss": 0.5007, "num_input_tokens_seen": 136319424, "step": 112095 }, { "epoch": 14.045858914922942, "grad_norm": 2.395277738571167, "learning_rate": 2.465516838248297e-06, "loss": 0.4648, "num_input_tokens_seen": 136325600, "step": 112100 }, { "epoch": 14.046485402831726, "grad_norm": 1.2332003116607666, "learning_rate": 2.4650455823297174e-06, "loss": 0.5158, "num_input_tokens_seen": 136332064, "step": 112105 }, { "epoch": 14.047111890740508, "grad_norm": 1.4125385284423828, "learning_rate": 2.4645743567186497e-06, "loss": 0.4889, "num_input_tokens_seen": 136337856, "step": 112110 }, { "epoch": 14.047738378649292, "grad_norm": 4.4832963943481445, "learning_rate": 2.4641031614207295e-06, "loss": 0.4972, "num_input_tokens_seen": 136343968, "step": 112115 }, { "epoch": 14.048364866558076, "grad_norm": 8.51626968383789, "learning_rate": 2.4636319964415854e-06, "loss": 0.4567, "num_input_tokens_seen": 136350112, "step": 112120 }, { "epoch": 14.048991354466859, "grad_norm": 1.0250431299209595, "learning_rate": 2.4631608617868557e-06, "loss": 0.4738, "num_input_tokens_seen": 136355936, "step": 112125 }, { "epoch": 14.049617842375643, "grad_norm": 0.7304319143295288, "learning_rate": 2.462689757462169e-06, "loss": 0.4718, "num_input_tokens_seen": 136362528, "step": 112130 }, { "epoch": 14.050244330284425, "grad_norm": 0.9714838862419128, "learning_rate": 2.4622186834731586e-06, "loss": 0.4548, "num_input_tokens_seen": 136368608, "step": 112135 }, { "epoch": 14.050870818193209, "grad_norm": 2.204284429550171, "learning_rate": 2.46174763982546e-06, "loss": 0.4572, "num_input_tokens_seen": 136374816, "step": 112140 }, { "epoch": 14.051497306101993, "grad_norm": 4.456338405609131, "learning_rate": 2.4612766265247007e-06, "loss": 0.4515, "num_input_tokens_seen": 136380160, "step": 112145 }, { "epoch": 14.052123794010775, "grad_norm": 0.9727126955986023, "learning_rate": 2.4608056435765152e-06, "loss": 0.449, "num_input_tokens_seen": 136385952, "step": 112150 }, { "epoch": 14.05275028191956, "grad_norm": 1.2172980308532715, "learning_rate": 2.4603346909865315e-06, "loss": 0.5367, "num_input_tokens_seen": 136392320, "step": 112155 }, { "epoch": 14.053376769828342, "grad_norm": 1.6027581691741943, "learning_rate": 2.459863768760384e-06, "loss": 0.458, "num_input_tokens_seen": 136398720, "step": 112160 }, { "epoch": 14.054003257737126, "grad_norm": 1.7523188591003418, "learning_rate": 2.459392876903698e-06, "loss": 0.49, "num_input_tokens_seen": 136405024, "step": 112165 }, { "epoch": 14.05462974564591, "grad_norm": 1.3779971599578857, "learning_rate": 2.4589220154221085e-06, "loss": 0.4139, "num_input_tokens_seen": 136411456, "step": 112170 }, { "epoch": 14.055256233554692, "grad_norm": 1.6128743886947632, "learning_rate": 2.4584511843212413e-06, "loss": 0.495, "num_input_tokens_seen": 136417696, "step": 112175 }, { "epoch": 14.055882721463476, "grad_norm": 1.270792007446289, "learning_rate": 2.4579803836067277e-06, "loss": 0.4641, "num_input_tokens_seen": 136424000, "step": 112180 }, { "epoch": 14.056509209372258, "grad_norm": 0.8890421390533447, "learning_rate": 2.4575096132841954e-06, "loss": 0.4901, "num_input_tokens_seen": 136430208, "step": 112185 }, { "epoch": 14.057135697281042, "grad_norm": 1.3531304597854614, "learning_rate": 2.4570388733592722e-06, "loss": 0.4793, "num_input_tokens_seen": 136436128, "step": 112190 }, { "epoch": 14.057762185189826, "grad_norm": 1.1745842695236206, "learning_rate": 2.45656816383759e-06, "loss": 0.5043, "num_input_tokens_seen": 136441984, "step": 112195 }, { "epoch": 14.058388673098609, "grad_norm": 2.8142318725585938, "learning_rate": 2.456097484724771e-06, "loss": 0.4573, "num_input_tokens_seen": 136448032, "step": 112200 }, { "epoch": 14.059015161007393, "grad_norm": 1.4906343221664429, "learning_rate": 2.455626836026446e-06, "loss": 0.4376, "num_input_tokens_seen": 136454208, "step": 112205 }, { "epoch": 14.059641648916175, "grad_norm": 1.1701141595840454, "learning_rate": 2.4551562177482415e-06, "loss": 0.4373, "num_input_tokens_seen": 136460160, "step": 112210 }, { "epoch": 14.06026813682496, "grad_norm": 1.21006441116333, "learning_rate": 2.454685629895785e-06, "loss": 0.4126, "num_input_tokens_seen": 136466240, "step": 112215 }, { "epoch": 14.060894624733743, "grad_norm": 9.271099090576172, "learning_rate": 2.4542150724747003e-06, "loss": 0.5091, "num_input_tokens_seen": 136472160, "step": 112220 }, { "epoch": 14.061521112642525, "grad_norm": 1.1519660949707031, "learning_rate": 2.453744545490617e-06, "loss": 0.503, "num_input_tokens_seen": 136478304, "step": 112225 }, { "epoch": 14.06214760055131, "grad_norm": 9.918745040893555, "learning_rate": 2.453274048949157e-06, "loss": 0.5151, "num_input_tokens_seen": 136484256, "step": 112230 }, { "epoch": 14.062774088460094, "grad_norm": 3.60520339012146, "learning_rate": 2.4528035828559477e-06, "loss": 0.434, "num_input_tokens_seen": 136490176, "step": 112235 }, { "epoch": 14.063400576368876, "grad_norm": 4.273421287536621, "learning_rate": 2.4523331472166116e-06, "loss": 0.4788, "num_input_tokens_seen": 136495808, "step": 112240 }, { "epoch": 14.06402706427766, "grad_norm": 1.158956527709961, "learning_rate": 2.451862742036777e-06, "loss": 0.4681, "num_input_tokens_seen": 136502208, "step": 112245 }, { "epoch": 14.064653552186442, "grad_norm": 1.2311773300170898, "learning_rate": 2.4513923673220634e-06, "loss": 0.4884, "num_input_tokens_seen": 136508640, "step": 112250 }, { "epoch": 14.065280040095226, "grad_norm": 2.4960274696350098, "learning_rate": 2.4509220230780973e-06, "loss": 0.4534, "num_input_tokens_seen": 136514976, "step": 112255 }, { "epoch": 14.06590652800401, "grad_norm": 8.565133094787598, "learning_rate": 2.450451709310503e-06, "loss": 0.5462, "num_input_tokens_seen": 136521056, "step": 112260 }, { "epoch": 14.066533015912793, "grad_norm": 1.1231600046157837, "learning_rate": 2.4499814260249e-06, "loss": 0.4325, "num_input_tokens_seen": 136527200, "step": 112265 }, { "epoch": 14.067159503821577, "grad_norm": 3.389707565307617, "learning_rate": 2.449511173226915e-06, "loss": 0.4969, "num_input_tokens_seen": 136533376, "step": 112270 }, { "epoch": 14.067785991730359, "grad_norm": 1.3806071281433105, "learning_rate": 2.4490409509221663e-06, "loss": 0.5075, "num_input_tokens_seen": 136539584, "step": 112275 }, { "epoch": 14.068412479639143, "grad_norm": 3.5817079544067383, "learning_rate": 2.44857075911628e-06, "loss": 0.4436, "num_input_tokens_seen": 136545472, "step": 112280 }, { "epoch": 14.069038967547927, "grad_norm": 2.0523650646209717, "learning_rate": 2.4481005978148735e-06, "loss": 0.5007, "num_input_tokens_seen": 136551680, "step": 112285 }, { "epoch": 14.06966545545671, "grad_norm": 0.9010412693023682, "learning_rate": 2.4476304670235714e-06, "loss": 0.4677, "num_input_tokens_seen": 136557248, "step": 112290 }, { "epoch": 14.070291943365493, "grad_norm": 1.0928622484207153, "learning_rate": 2.4471603667479916e-06, "loss": 0.4679, "num_input_tokens_seen": 136562944, "step": 112295 }, { "epoch": 14.070918431274276, "grad_norm": 1.3964112997055054, "learning_rate": 2.4466902969937557e-06, "loss": 0.459, "num_input_tokens_seen": 136569376, "step": 112300 }, { "epoch": 14.07154491918306, "grad_norm": 1.6514675617218018, "learning_rate": 2.4462202577664836e-06, "loss": 0.4541, "num_input_tokens_seen": 136575648, "step": 112305 }, { "epoch": 14.072171407091844, "grad_norm": 2.670128345489502, "learning_rate": 2.4457502490717977e-06, "loss": 0.4506, "num_input_tokens_seen": 136581920, "step": 112310 }, { "epoch": 14.072797895000626, "grad_norm": 1.8705137968063354, "learning_rate": 2.445280270915313e-06, "loss": 0.5204, "num_input_tokens_seen": 136588288, "step": 112315 }, { "epoch": 14.07342438290941, "grad_norm": 2.7080535888671875, "learning_rate": 2.4448103233026505e-06, "loss": 0.5262, "num_input_tokens_seen": 136594656, "step": 112320 }, { "epoch": 14.074050870818192, "grad_norm": 1.119040608406067, "learning_rate": 2.444340406239431e-06, "loss": 0.4248, "num_input_tokens_seen": 136600736, "step": 112325 }, { "epoch": 14.074677358726976, "grad_norm": 1.527965784072876, "learning_rate": 2.4438705197312684e-06, "loss": 0.5086, "num_input_tokens_seen": 136606976, "step": 112330 }, { "epoch": 14.07530384663576, "grad_norm": 3.2094249725341797, "learning_rate": 2.4434006637837856e-06, "loss": 0.4828, "num_input_tokens_seen": 136613408, "step": 112335 }, { "epoch": 14.075930334544543, "grad_norm": 7.546951770782471, "learning_rate": 2.4429308384025945e-06, "loss": 0.4471, "num_input_tokens_seen": 136619744, "step": 112340 }, { "epoch": 14.076556822453327, "grad_norm": 4.036311626434326, "learning_rate": 2.442461043593317e-06, "loss": 0.4542, "num_input_tokens_seen": 136625376, "step": 112345 }, { "epoch": 14.07718331036211, "grad_norm": 1.31010901927948, "learning_rate": 2.4419912793615665e-06, "loss": 0.4635, "num_input_tokens_seen": 136631072, "step": 112350 }, { "epoch": 14.077809798270893, "grad_norm": 5.393043041229248, "learning_rate": 2.441521545712963e-06, "loss": 0.5511, "num_input_tokens_seen": 136637344, "step": 112355 }, { "epoch": 14.078436286179677, "grad_norm": 3.1719789505004883, "learning_rate": 2.4410518426531177e-06, "loss": 0.4976, "num_input_tokens_seen": 136643392, "step": 112360 }, { "epoch": 14.07906277408846, "grad_norm": 1.3876984119415283, "learning_rate": 2.440582170187652e-06, "loss": 0.4431, "num_input_tokens_seen": 136649536, "step": 112365 }, { "epoch": 14.079689261997244, "grad_norm": 5.614668369293213, "learning_rate": 2.4401125283221756e-06, "loss": 0.4594, "num_input_tokens_seen": 136655776, "step": 112370 }, { "epoch": 14.080315749906028, "grad_norm": 3.1580018997192383, "learning_rate": 2.4396429170623065e-06, "loss": 0.4445, "num_input_tokens_seen": 136661600, "step": 112375 }, { "epoch": 14.08094223781481, "grad_norm": 8.503491401672363, "learning_rate": 2.439173336413661e-06, "loss": 0.5429, "num_input_tokens_seen": 136667680, "step": 112380 }, { "epoch": 14.081568725723594, "grad_norm": 6.74398136138916, "learning_rate": 2.4387037863818492e-06, "loss": 0.449, "num_input_tokens_seen": 136673312, "step": 112385 }, { "epoch": 14.082195213632376, "grad_norm": 2.6778008937835693, "learning_rate": 2.438234266972489e-06, "loss": 0.4627, "num_input_tokens_seen": 136679584, "step": 112390 }, { "epoch": 14.08282170154116, "grad_norm": 1.2691634893417358, "learning_rate": 2.4377647781911894e-06, "loss": 0.4948, "num_input_tokens_seen": 136685760, "step": 112395 }, { "epoch": 14.083448189449944, "grad_norm": 3.5036160945892334, "learning_rate": 2.4372953200435687e-06, "loss": 0.4605, "num_input_tokens_seen": 136691872, "step": 112400 }, { "epoch": 14.084074677358727, "grad_norm": 4.153135299682617, "learning_rate": 2.436825892535235e-06, "loss": 0.4464, "num_input_tokens_seen": 136697984, "step": 112405 }, { "epoch": 14.08470116526751, "grad_norm": 0.9863459467887878, "learning_rate": 2.4363564956718034e-06, "loss": 0.4624, "num_input_tokens_seen": 136703872, "step": 112410 }, { "epoch": 14.085327653176293, "grad_norm": 1.7612775564193726, "learning_rate": 2.4358871294588844e-06, "loss": 0.465, "num_input_tokens_seen": 136710016, "step": 112415 }, { "epoch": 14.085954141085077, "grad_norm": 2.9012269973754883, "learning_rate": 2.4354177939020927e-06, "loss": 0.4448, "num_input_tokens_seen": 136716192, "step": 112420 }, { "epoch": 14.086580628993861, "grad_norm": 1.1543691158294678, "learning_rate": 2.4349484890070357e-06, "loss": 0.4732, "num_input_tokens_seen": 136722432, "step": 112425 }, { "epoch": 14.087207116902643, "grad_norm": 1.6020132303237915, "learning_rate": 2.434479214779328e-06, "loss": 0.4935, "num_input_tokens_seen": 136728640, "step": 112430 }, { "epoch": 14.087833604811427, "grad_norm": 1.0521541833877563, "learning_rate": 2.434009971224576e-06, "loss": 0.4159, "num_input_tokens_seen": 136734976, "step": 112435 }, { "epoch": 14.08846009272021, "grad_norm": 1.6154252290725708, "learning_rate": 2.433540758348392e-06, "loss": 0.5339, "num_input_tokens_seen": 136740832, "step": 112440 }, { "epoch": 14.089086580628994, "grad_norm": 2.057737350463867, "learning_rate": 2.433071576156389e-06, "loss": 0.5135, "num_input_tokens_seen": 136747136, "step": 112445 }, { "epoch": 14.089713068537778, "grad_norm": 1.0348148345947266, "learning_rate": 2.432602424654171e-06, "loss": 0.4792, "num_input_tokens_seen": 136753344, "step": 112450 }, { "epoch": 14.09033955644656, "grad_norm": 1.2605942487716675, "learning_rate": 2.432133303847351e-06, "loss": 0.414, "num_input_tokens_seen": 136758976, "step": 112455 }, { "epoch": 14.090966044355344, "grad_norm": 1.7942358255386353, "learning_rate": 2.4316642137415346e-06, "loss": 0.4567, "num_input_tokens_seen": 136765312, "step": 112460 }, { "epoch": 14.091592532264126, "grad_norm": 1.4974101781845093, "learning_rate": 2.431195154342334e-06, "loss": 0.4601, "num_input_tokens_seen": 136771872, "step": 112465 }, { "epoch": 14.09221902017291, "grad_norm": 2.092251777648926, "learning_rate": 2.4307261256553533e-06, "loss": 0.5551, "num_input_tokens_seen": 136777696, "step": 112470 }, { "epoch": 14.092845508081695, "grad_norm": 9.610685348510742, "learning_rate": 2.430257127686203e-06, "loss": 0.4898, "num_input_tokens_seen": 136784000, "step": 112475 }, { "epoch": 14.093471995990477, "grad_norm": 4.05852746963501, "learning_rate": 2.4297881604404883e-06, "loss": 0.436, "num_input_tokens_seen": 136790560, "step": 112480 }, { "epoch": 14.094098483899261, "grad_norm": 2.1592600345611572, "learning_rate": 2.4293192239238183e-06, "loss": 0.504, "num_input_tokens_seen": 136796672, "step": 112485 }, { "epoch": 14.094724971808045, "grad_norm": 4.035705089569092, "learning_rate": 2.4288503181417964e-06, "loss": 0.5911, "num_input_tokens_seen": 136802784, "step": 112490 }, { "epoch": 14.095351459716827, "grad_norm": 1.027984619140625, "learning_rate": 2.428381443100033e-06, "loss": 0.4313, "num_input_tokens_seen": 136809184, "step": 112495 }, { "epoch": 14.095977947625611, "grad_norm": 3.830521821975708, "learning_rate": 2.4279125988041297e-06, "loss": 0.4688, "num_input_tokens_seen": 136815456, "step": 112500 }, { "epoch": 14.096604435534394, "grad_norm": 6.592408180236816, "learning_rate": 2.4274437852596933e-06, "loss": 0.5143, "num_input_tokens_seen": 136821344, "step": 112505 }, { "epoch": 14.097230923443178, "grad_norm": 7.207492351531982, "learning_rate": 2.4269750024723297e-06, "loss": 0.4732, "num_input_tokens_seen": 136827552, "step": 112510 }, { "epoch": 14.097857411351962, "grad_norm": 1.6099685430526733, "learning_rate": 2.426506250447645e-06, "loss": 0.5345, "num_input_tokens_seen": 136834016, "step": 112515 }, { "epoch": 14.098483899260744, "grad_norm": 1.1724810600280762, "learning_rate": 2.4260375291912395e-06, "loss": 0.4576, "num_input_tokens_seen": 136840064, "step": 112520 }, { "epoch": 14.099110387169528, "grad_norm": 7.887811660766602, "learning_rate": 2.42556883870872e-06, "loss": 0.4804, "num_input_tokens_seen": 136846112, "step": 112525 }, { "epoch": 14.09973687507831, "grad_norm": 2.117161512374878, "learning_rate": 2.425100179005691e-06, "loss": 0.4482, "num_input_tokens_seen": 136851808, "step": 112530 }, { "epoch": 14.100363362987094, "grad_norm": 1.2618486881256104, "learning_rate": 2.4246315500877525e-06, "loss": 0.4363, "num_input_tokens_seen": 136857728, "step": 112535 }, { "epoch": 14.100989850895878, "grad_norm": 4.126824855804443, "learning_rate": 2.4241629519605108e-06, "loss": 0.4738, "num_input_tokens_seen": 136863968, "step": 112540 }, { "epoch": 14.10161633880466, "grad_norm": 1.4396346807479858, "learning_rate": 2.423694384629565e-06, "loss": 0.4649, "num_input_tokens_seen": 136870368, "step": 112545 }, { "epoch": 14.102242826713445, "grad_norm": 2.196645736694336, "learning_rate": 2.4232258481005205e-06, "loss": 0.5472, "num_input_tokens_seen": 136876672, "step": 112550 }, { "epoch": 14.102869314622227, "grad_norm": 5.4281158447265625, "learning_rate": 2.4227573423789756e-06, "loss": 0.4845, "num_input_tokens_seen": 136882944, "step": 112555 }, { "epoch": 14.103495802531011, "grad_norm": 0.9963645935058594, "learning_rate": 2.4222888674705357e-06, "loss": 0.4062, "num_input_tokens_seen": 136888960, "step": 112560 }, { "epoch": 14.104122290439795, "grad_norm": 2.264458179473877, "learning_rate": 2.4218204233807975e-06, "loss": 0.4912, "num_input_tokens_seen": 136895168, "step": 112565 }, { "epoch": 14.104748778348577, "grad_norm": 4.5574822425842285, "learning_rate": 2.4213520101153638e-06, "loss": 0.487, "num_input_tokens_seen": 136900960, "step": 112570 }, { "epoch": 14.105375266257361, "grad_norm": 10.781790733337402, "learning_rate": 2.4208836276798363e-06, "loss": 0.5226, "num_input_tokens_seen": 136907072, "step": 112575 }, { "epoch": 14.106001754166144, "grad_norm": 1.464460849761963, "learning_rate": 2.420415276079812e-06, "loss": 0.4926, "num_input_tokens_seen": 136912672, "step": 112580 }, { "epoch": 14.106628242074928, "grad_norm": 1.528923749923706, "learning_rate": 2.419946955320894e-06, "loss": 0.4433, "num_input_tokens_seen": 136918752, "step": 112585 }, { "epoch": 14.107254729983712, "grad_norm": 1.021990418434143, "learning_rate": 2.4194786654086768e-06, "loss": 0.4856, "num_input_tokens_seen": 136924352, "step": 112590 }, { "epoch": 14.107881217892494, "grad_norm": 1.0062601566314697, "learning_rate": 2.419010406348764e-06, "loss": 0.5009, "num_input_tokens_seen": 136930432, "step": 112595 }, { "epoch": 14.108507705801278, "grad_norm": 3.4270412921905518, "learning_rate": 2.41854217814675e-06, "loss": 0.5104, "num_input_tokens_seen": 136936512, "step": 112600 }, { "epoch": 14.109134193710062, "grad_norm": 1.5529656410217285, "learning_rate": 2.4180739808082344e-06, "loss": 0.4888, "num_input_tokens_seen": 136942720, "step": 112605 }, { "epoch": 14.109760681618845, "grad_norm": 1.6236960887908936, "learning_rate": 2.417605814338817e-06, "loss": 0.4676, "num_input_tokens_seen": 136948928, "step": 112610 }, { "epoch": 14.110387169527629, "grad_norm": 1.3464504480361938, "learning_rate": 2.4171376787440915e-06, "loss": 0.4545, "num_input_tokens_seen": 136955200, "step": 112615 }, { "epoch": 14.111013657436411, "grad_norm": 0.8826930522918701, "learning_rate": 2.4166695740296564e-06, "loss": 0.4469, "num_input_tokens_seen": 136961184, "step": 112620 }, { "epoch": 14.111640145345195, "grad_norm": 2.836216926574707, "learning_rate": 2.4162015002011087e-06, "loss": 0.4526, "num_input_tokens_seen": 136966816, "step": 112625 }, { "epoch": 14.112266633253979, "grad_norm": 1.379671335220337, "learning_rate": 2.4157334572640457e-06, "loss": 0.4592, "num_input_tokens_seen": 136973184, "step": 112630 }, { "epoch": 14.112893121162761, "grad_norm": 1.5312420129776, "learning_rate": 2.415265445224061e-06, "loss": 0.5028, "num_input_tokens_seen": 136979360, "step": 112635 }, { "epoch": 14.113519609071545, "grad_norm": 1.0660505294799805, "learning_rate": 2.414797464086752e-06, "loss": 0.4496, "num_input_tokens_seen": 136985472, "step": 112640 }, { "epoch": 14.114146096980328, "grad_norm": 1.2676070928573608, "learning_rate": 2.4143295138577116e-06, "loss": 0.4446, "num_input_tokens_seen": 136991840, "step": 112645 }, { "epoch": 14.114772584889112, "grad_norm": 4.638522624969482, "learning_rate": 2.4138615945425374e-06, "loss": 0.4542, "num_input_tokens_seen": 136997664, "step": 112650 }, { "epoch": 14.115399072797896, "grad_norm": 5.4693603515625, "learning_rate": 2.4133937061468203e-06, "loss": 0.4581, "num_input_tokens_seen": 137004032, "step": 112655 }, { "epoch": 14.116025560706678, "grad_norm": 1.339243769645691, "learning_rate": 2.412925848676158e-06, "loss": 0.4623, "num_input_tokens_seen": 137010240, "step": 112660 }, { "epoch": 14.116652048615462, "grad_norm": 4.348977088928223, "learning_rate": 2.4124580221361404e-06, "loss": 0.4506, "num_input_tokens_seen": 137015936, "step": 112665 }, { "epoch": 14.117278536524244, "grad_norm": 1.322726845741272, "learning_rate": 2.4119902265323647e-06, "loss": 0.4453, "num_input_tokens_seen": 137022176, "step": 112670 }, { "epoch": 14.117905024433028, "grad_norm": 4.632680416107178, "learning_rate": 2.4115224618704197e-06, "loss": 0.4607, "num_input_tokens_seen": 137028192, "step": 112675 }, { "epoch": 14.118531512341812, "grad_norm": 1.4039536714553833, "learning_rate": 2.4110547281559017e-06, "loss": 0.4489, "num_input_tokens_seen": 137034368, "step": 112680 }, { "epoch": 14.119158000250595, "grad_norm": 0.8748988509178162, "learning_rate": 2.410587025394399e-06, "loss": 0.439, "num_input_tokens_seen": 137040640, "step": 112685 }, { "epoch": 14.119784488159379, "grad_norm": 1.3351771831512451, "learning_rate": 2.410119353591506e-06, "loss": 0.4155, "num_input_tokens_seen": 137046880, "step": 112690 }, { "epoch": 14.120410976068161, "grad_norm": 3.470766067504883, "learning_rate": 2.409651712752815e-06, "loss": 0.454, "num_input_tokens_seen": 137052384, "step": 112695 }, { "epoch": 14.121037463976945, "grad_norm": 1.956953763961792, "learning_rate": 2.4091841028839143e-06, "loss": 0.4469, "num_input_tokens_seen": 137058688, "step": 112700 }, { "epoch": 14.12166395188573, "grad_norm": 2.168708562850952, "learning_rate": 2.4087165239903977e-06, "loss": 0.4454, "num_input_tokens_seen": 137065312, "step": 112705 }, { "epoch": 14.122290439794511, "grad_norm": 7.43047571182251, "learning_rate": 2.4082489760778515e-06, "loss": 0.5175, "num_input_tokens_seen": 137071488, "step": 112710 }, { "epoch": 14.122916927703296, "grad_norm": 2.7485945224761963, "learning_rate": 2.4077814591518677e-06, "loss": 0.479, "num_input_tokens_seen": 137077696, "step": 112715 }, { "epoch": 14.123543415612078, "grad_norm": 4.501704692840576, "learning_rate": 2.4073139732180377e-06, "loss": 0.4676, "num_input_tokens_seen": 137083840, "step": 112720 }, { "epoch": 14.124169903520862, "grad_norm": 1.6930930614471436, "learning_rate": 2.406846518281948e-06, "loss": 0.4417, "num_input_tokens_seen": 137090368, "step": 112725 }, { "epoch": 14.124796391429646, "grad_norm": 1.2509050369262695, "learning_rate": 2.4063790943491878e-06, "loss": 0.4399, "num_input_tokens_seen": 137096512, "step": 112730 }, { "epoch": 14.125422879338428, "grad_norm": 2.3017585277557373, "learning_rate": 2.405911701425348e-06, "loss": 0.4668, "num_input_tokens_seen": 137102432, "step": 112735 }, { "epoch": 14.126049367247212, "grad_norm": 10.183564186096191, "learning_rate": 2.4054443395160128e-06, "loss": 0.489, "num_input_tokens_seen": 137108704, "step": 112740 }, { "epoch": 14.126675855155996, "grad_norm": 1.3251144886016846, "learning_rate": 2.4049770086267738e-06, "loss": 0.4493, "num_input_tokens_seen": 137114528, "step": 112745 }, { "epoch": 14.127302343064779, "grad_norm": 1.673174500465393, "learning_rate": 2.4045097087632145e-06, "loss": 0.4325, "num_input_tokens_seen": 137120672, "step": 112750 }, { "epoch": 14.127928830973563, "grad_norm": 1.5702258348464966, "learning_rate": 2.404042439930924e-06, "loss": 0.4605, "num_input_tokens_seen": 137126976, "step": 112755 }, { "epoch": 14.128555318882345, "grad_norm": 1.43293297290802, "learning_rate": 2.4035752021354907e-06, "loss": 0.4619, "num_input_tokens_seen": 137133024, "step": 112760 }, { "epoch": 14.129181806791129, "grad_norm": 1.0768128633499146, "learning_rate": 2.4031079953824966e-06, "loss": 0.4513, "num_input_tokens_seen": 137138944, "step": 112765 }, { "epoch": 14.129808294699913, "grad_norm": 2.0431222915649414, "learning_rate": 2.402640819677532e-06, "loss": 0.4059, "num_input_tokens_seen": 137145184, "step": 112770 }, { "epoch": 14.130434782608695, "grad_norm": 7.042647361755371, "learning_rate": 2.4021736750261784e-06, "loss": 0.4594, "num_input_tokens_seen": 137151008, "step": 112775 }, { "epoch": 14.13106127051748, "grad_norm": 1.2333790063858032, "learning_rate": 2.4017065614340233e-06, "loss": 0.4351, "num_input_tokens_seen": 137157376, "step": 112780 }, { "epoch": 14.131687758426262, "grad_norm": 12.415312767028809, "learning_rate": 2.40123947890665e-06, "loss": 0.5476, "num_input_tokens_seen": 137163584, "step": 112785 }, { "epoch": 14.132314246335046, "grad_norm": 3.2567944526672363, "learning_rate": 2.4007724274496453e-06, "loss": 0.4789, "num_input_tokens_seen": 137169920, "step": 112790 }, { "epoch": 14.13294073424383, "grad_norm": 6.88140869140625, "learning_rate": 2.40030540706859e-06, "loss": 0.4592, "num_input_tokens_seen": 137175840, "step": 112795 }, { "epoch": 14.133567222152612, "grad_norm": 1.941693663597107, "learning_rate": 2.399838417769071e-06, "loss": 0.4736, "num_input_tokens_seen": 137181824, "step": 112800 }, { "epoch": 14.134193710061396, "grad_norm": 3.0405383110046387, "learning_rate": 2.3993714595566677e-06, "loss": 0.4622, "num_input_tokens_seen": 137187904, "step": 112805 }, { "epoch": 14.134820197970178, "grad_norm": 1.6715490818023682, "learning_rate": 2.3989045324369648e-06, "loss": 0.4707, "num_input_tokens_seen": 137193888, "step": 112810 }, { "epoch": 14.135446685878962, "grad_norm": 3.751309633255005, "learning_rate": 2.3984376364155475e-06, "loss": 0.4373, "num_input_tokens_seen": 137199680, "step": 112815 }, { "epoch": 14.136073173787747, "grad_norm": 1.3825185298919678, "learning_rate": 2.3979707714979932e-06, "loss": 0.4182, "num_input_tokens_seen": 137205600, "step": 112820 }, { "epoch": 14.136699661696529, "grad_norm": 7.406491756439209, "learning_rate": 2.397503937689886e-06, "loss": 0.4757, "num_input_tokens_seen": 137211712, "step": 112825 }, { "epoch": 14.137326149605313, "grad_norm": 1.8717372417449951, "learning_rate": 2.397037134996808e-06, "loss": 0.4247, "num_input_tokens_seen": 137218272, "step": 112830 }, { "epoch": 14.137952637514095, "grad_norm": 2.363023281097412, "learning_rate": 2.396570363424341e-06, "loss": 0.5305, "num_input_tokens_seen": 137224416, "step": 112835 }, { "epoch": 14.13857912542288, "grad_norm": 2.537155866622925, "learning_rate": 2.396103622978062e-06, "loss": 0.4739, "num_input_tokens_seen": 137230464, "step": 112840 }, { "epoch": 14.139205613331663, "grad_norm": 9.549513816833496, "learning_rate": 2.3956369136635554e-06, "loss": 0.5377, "num_input_tokens_seen": 137236480, "step": 112845 }, { "epoch": 14.139832101240446, "grad_norm": 3.6618685722351074, "learning_rate": 2.3951702354863977e-06, "loss": 0.4872, "num_input_tokens_seen": 137241760, "step": 112850 }, { "epoch": 14.14045858914923, "grad_norm": 3.059861660003662, "learning_rate": 2.394703588452171e-06, "loss": 0.4475, "num_input_tokens_seen": 137248096, "step": 112855 }, { "epoch": 14.141085077058014, "grad_norm": 5.800483703613281, "learning_rate": 2.394236972566452e-06, "loss": 0.4552, "num_input_tokens_seen": 137254176, "step": 112860 }, { "epoch": 14.141711564966796, "grad_norm": 2.052603244781494, "learning_rate": 2.3937703878348222e-06, "loss": 0.4658, "num_input_tokens_seen": 137260352, "step": 112865 }, { "epoch": 14.14233805287558, "grad_norm": 11.698436737060547, "learning_rate": 2.3933038342628574e-06, "loss": 0.5267, "num_input_tokens_seen": 137265792, "step": 112870 }, { "epoch": 14.142964540784362, "grad_norm": 3.8275935649871826, "learning_rate": 2.392837311856136e-06, "loss": 0.4781, "num_input_tokens_seen": 137272000, "step": 112875 }, { "epoch": 14.143591028693146, "grad_norm": 6.575387477874756, "learning_rate": 2.392370820620239e-06, "loss": 0.4844, "num_input_tokens_seen": 137278368, "step": 112880 }, { "epoch": 14.14421751660193, "grad_norm": 1.5200114250183105, "learning_rate": 2.3919043605607397e-06, "loss": 0.4531, "num_input_tokens_seen": 137284544, "step": 112885 }, { "epoch": 14.144844004510713, "grad_norm": 9.901422500610352, "learning_rate": 2.3914379316832178e-06, "loss": 0.4818, "num_input_tokens_seen": 137290720, "step": 112890 }, { "epoch": 14.145470492419497, "grad_norm": 2.441370964050293, "learning_rate": 2.3909715339932465e-06, "loss": 0.4491, "num_input_tokens_seen": 137296736, "step": 112895 }, { "epoch": 14.146096980328279, "grad_norm": 1.8145864009857178, "learning_rate": 2.3905051674964062e-06, "loss": 0.4237, "num_input_tokens_seen": 137302944, "step": 112900 }, { "epoch": 14.146723468237063, "grad_norm": 12.197999954223633, "learning_rate": 2.3900388321982686e-06, "loss": 0.5141, "num_input_tokens_seen": 137309120, "step": 112905 }, { "epoch": 14.147349956145847, "grad_norm": 8.207259178161621, "learning_rate": 2.3895725281044136e-06, "loss": 0.5166, "num_input_tokens_seen": 137314976, "step": 112910 }, { "epoch": 14.14797644405463, "grad_norm": 2.3333075046539307, "learning_rate": 2.3891062552204116e-06, "loss": 0.4475, "num_input_tokens_seen": 137320640, "step": 112915 }, { "epoch": 14.148602931963413, "grad_norm": 1.8036868572235107, "learning_rate": 2.388640013551839e-06, "loss": 0.4374, "num_input_tokens_seen": 137327008, "step": 112920 }, { "epoch": 14.149229419872196, "grad_norm": 3.062321901321411, "learning_rate": 2.3881738031042704e-06, "loss": 0.4575, "num_input_tokens_seen": 137333248, "step": 112925 }, { "epoch": 14.14985590778098, "grad_norm": 10.098008155822754, "learning_rate": 2.3877076238832824e-06, "loss": 0.5957, "num_input_tokens_seen": 137339040, "step": 112930 }, { "epoch": 14.150482395689764, "grad_norm": 1.7711501121520996, "learning_rate": 2.387241475894444e-06, "loss": 0.4442, "num_input_tokens_seen": 137345408, "step": 112935 }, { "epoch": 14.151108883598546, "grad_norm": 2.979452610015869, "learning_rate": 2.38677535914333e-06, "loss": 0.4486, "num_input_tokens_seen": 137351744, "step": 112940 }, { "epoch": 14.15173537150733, "grad_norm": 9.971996307373047, "learning_rate": 2.386309273635516e-06, "loss": 0.4856, "num_input_tokens_seen": 137357760, "step": 112945 }, { "epoch": 14.152361859416112, "grad_norm": 2.332235097885132, "learning_rate": 2.38584321937657e-06, "loss": 0.487, "num_input_tokens_seen": 137363520, "step": 112950 }, { "epoch": 14.152988347324897, "grad_norm": 2.647592544555664, "learning_rate": 2.3853771963720685e-06, "loss": 0.4451, "num_input_tokens_seen": 137369120, "step": 112955 }, { "epoch": 14.15361483523368, "grad_norm": 2.635831594467163, "learning_rate": 2.3849112046275784e-06, "loss": 0.4867, "num_input_tokens_seen": 137375360, "step": 112960 }, { "epoch": 14.154241323142463, "grad_norm": 1.5528608560562134, "learning_rate": 2.384445244148676e-06, "loss": 0.4657, "num_input_tokens_seen": 137381472, "step": 112965 }, { "epoch": 14.154867811051247, "grad_norm": 1.8538213968276978, "learning_rate": 2.3839793149409275e-06, "loss": 0.5543, "num_input_tokens_seen": 137387360, "step": 112970 }, { "epoch": 14.15549429896003, "grad_norm": 1.1936615705490112, "learning_rate": 2.3835134170099073e-06, "loss": 0.5222, "num_input_tokens_seen": 137392960, "step": 112975 }, { "epoch": 14.156120786868813, "grad_norm": 1.9132204055786133, "learning_rate": 2.3830475503611827e-06, "loss": 0.4421, "num_input_tokens_seen": 137399296, "step": 112980 }, { "epoch": 14.156747274777597, "grad_norm": 1.1893562078475952, "learning_rate": 2.3825817150003262e-06, "loss": 0.4453, "num_input_tokens_seen": 137405056, "step": 112985 }, { "epoch": 14.15737376268638, "grad_norm": 1.353090524673462, "learning_rate": 2.3821159109329047e-06, "loss": 0.4412, "num_input_tokens_seen": 137410880, "step": 112990 }, { "epoch": 14.158000250595164, "grad_norm": 1.3400311470031738, "learning_rate": 2.381650138164488e-06, "loss": 0.4942, "num_input_tokens_seen": 137417248, "step": 112995 }, { "epoch": 14.158626738503948, "grad_norm": 1.637980580329895, "learning_rate": 2.381184396700647e-06, "loss": 0.4796, "num_input_tokens_seen": 137422784, "step": 113000 }, { "epoch": 14.15925322641273, "grad_norm": 2.200576066970825, "learning_rate": 2.3807186865469467e-06, "loss": 0.4354, "num_input_tokens_seen": 137428704, "step": 113005 }, { "epoch": 14.159879714321514, "grad_norm": 1.2326642274856567, "learning_rate": 2.3802530077089587e-06, "loss": 0.4383, "num_input_tokens_seen": 137434624, "step": 113010 }, { "epoch": 14.160506202230296, "grad_norm": 7.780882358551025, "learning_rate": 2.379787360192246e-06, "loss": 0.5149, "num_input_tokens_seen": 137440704, "step": 113015 }, { "epoch": 14.16113269013908, "grad_norm": 1.1141859292984009, "learning_rate": 2.379321744002381e-06, "loss": 0.4513, "num_input_tokens_seen": 137446816, "step": 113020 }, { "epoch": 14.161759178047864, "grad_norm": 4.532703399658203, "learning_rate": 2.378856159144926e-06, "loss": 0.4693, "num_input_tokens_seen": 137452800, "step": 113025 }, { "epoch": 14.162385665956647, "grad_norm": 1.8411885499954224, "learning_rate": 2.378390605625449e-06, "loss": 0.4593, "num_input_tokens_seen": 137458336, "step": 113030 }, { "epoch": 14.16301215386543, "grad_norm": 2.6996655464172363, "learning_rate": 2.3779250834495167e-06, "loss": 0.4238, "num_input_tokens_seen": 137464384, "step": 113035 }, { "epoch": 14.163638641774213, "grad_norm": 5.019911766052246, "learning_rate": 2.3774595926226963e-06, "loss": 0.4561, "num_input_tokens_seen": 137470496, "step": 113040 }, { "epoch": 14.164265129682997, "grad_norm": 2.0019609928131104, "learning_rate": 2.376994133150549e-06, "loss": 0.462, "num_input_tokens_seen": 137476832, "step": 113045 }, { "epoch": 14.164891617591781, "grad_norm": 1.3692407608032227, "learning_rate": 2.376528705038645e-06, "loss": 0.4493, "num_input_tokens_seen": 137482688, "step": 113050 }, { "epoch": 14.165518105500563, "grad_norm": 5.502516269683838, "learning_rate": 2.3760633082925435e-06, "loss": 0.4766, "num_input_tokens_seen": 137488544, "step": 113055 }, { "epoch": 14.166144593409348, "grad_norm": 7.426446914672852, "learning_rate": 2.375597942917811e-06, "loss": 0.6047, "num_input_tokens_seen": 137494496, "step": 113060 }, { "epoch": 14.16677108131813, "grad_norm": 1.856563687324524, "learning_rate": 2.3751326089200138e-06, "loss": 0.4597, "num_input_tokens_seen": 137500544, "step": 113065 }, { "epoch": 14.167397569226914, "grad_norm": 4.054283618927002, "learning_rate": 2.3746673063047117e-06, "loss": 0.4505, "num_input_tokens_seen": 137506656, "step": 113070 }, { "epoch": 14.168024057135698, "grad_norm": 7.044795036315918, "learning_rate": 2.3742020350774704e-06, "loss": 0.4613, "num_input_tokens_seen": 137512768, "step": 113075 }, { "epoch": 14.16865054504448, "grad_norm": 1.2830084562301636, "learning_rate": 2.37373679524385e-06, "loss": 0.4593, "num_input_tokens_seen": 137518720, "step": 113080 }, { "epoch": 14.169277032953264, "grad_norm": 4.0153584480285645, "learning_rate": 2.3732715868094156e-06, "loss": 0.5036, "num_input_tokens_seen": 137524992, "step": 113085 }, { "epoch": 14.169903520862047, "grad_norm": 1.990383267402649, "learning_rate": 2.3728064097797267e-06, "loss": 0.4431, "num_input_tokens_seen": 137531520, "step": 113090 }, { "epoch": 14.17053000877083, "grad_norm": 1.8793753385543823, "learning_rate": 2.372341264160347e-06, "loss": 0.4497, "num_input_tokens_seen": 137538016, "step": 113095 }, { "epoch": 14.171156496679615, "grad_norm": 5.305673599243164, "learning_rate": 2.3718761499568354e-06, "loss": 0.5594, "num_input_tokens_seen": 137544288, "step": 113100 }, { "epoch": 14.171782984588397, "grad_norm": 1.2552670240402222, "learning_rate": 2.371411067174756e-06, "loss": 0.4585, "num_input_tokens_seen": 137550496, "step": 113105 }, { "epoch": 14.172409472497181, "grad_norm": 1.5892510414123535, "learning_rate": 2.370946015819665e-06, "loss": 0.4234, "num_input_tokens_seen": 137556672, "step": 113110 }, { "epoch": 14.173035960405965, "grad_norm": 3.8730404376983643, "learning_rate": 2.370480995897127e-06, "loss": 0.4527, "num_input_tokens_seen": 137563040, "step": 113115 }, { "epoch": 14.173662448314747, "grad_norm": 2.2560651302337646, "learning_rate": 2.3700160074126972e-06, "loss": 0.4481, "num_input_tokens_seen": 137569376, "step": 113120 }, { "epoch": 14.174288936223531, "grad_norm": 1.5159400701522827, "learning_rate": 2.3695510503719375e-06, "loss": 0.4546, "num_input_tokens_seen": 137575648, "step": 113125 }, { "epoch": 14.174915424132314, "grad_norm": 1.6195194721221924, "learning_rate": 2.3690861247804066e-06, "loss": 0.4754, "num_input_tokens_seen": 137582112, "step": 113130 }, { "epoch": 14.175541912041098, "grad_norm": 1.3317582607269287, "learning_rate": 2.3686212306436644e-06, "loss": 0.4886, "num_input_tokens_seen": 137588256, "step": 113135 }, { "epoch": 14.176168399949882, "grad_norm": 7.823795795440674, "learning_rate": 2.3681563679672666e-06, "loss": 0.4778, "num_input_tokens_seen": 137594528, "step": 113140 }, { "epoch": 14.176794887858664, "grad_norm": 8.311469078063965, "learning_rate": 2.3676915367567714e-06, "loss": 0.4696, "num_input_tokens_seen": 137600672, "step": 113145 }, { "epoch": 14.177421375767448, "grad_norm": 1.1467970609664917, "learning_rate": 2.3672267370177397e-06, "loss": 0.4586, "num_input_tokens_seen": 137606592, "step": 113150 }, { "epoch": 14.17804786367623, "grad_norm": 8.059317588806152, "learning_rate": 2.3667619687557237e-06, "loss": 0.4685, "num_input_tokens_seen": 137612960, "step": 113155 }, { "epoch": 14.178674351585014, "grad_norm": 7.56568717956543, "learning_rate": 2.3662972319762837e-06, "loss": 0.4586, "num_input_tokens_seen": 137618848, "step": 113160 }, { "epoch": 14.179300839493798, "grad_norm": 1.9619133472442627, "learning_rate": 2.365832526684973e-06, "loss": 0.4435, "num_input_tokens_seen": 137625152, "step": 113165 }, { "epoch": 14.17992732740258, "grad_norm": 2.812952756881714, "learning_rate": 2.3653678528873514e-06, "loss": 0.4338, "num_input_tokens_seen": 137631552, "step": 113170 }, { "epoch": 14.180553815311365, "grad_norm": 1.5275300741195679, "learning_rate": 2.3649032105889695e-06, "loss": 0.4741, "num_input_tokens_seen": 137637568, "step": 113175 }, { "epoch": 14.181180303220147, "grad_norm": 1.3803213834762573, "learning_rate": 2.3644385997953857e-06, "loss": 0.5032, "num_input_tokens_seen": 137643744, "step": 113180 }, { "epoch": 14.181806791128931, "grad_norm": 1.7981928586959839, "learning_rate": 2.3639740205121563e-06, "loss": 0.4485, "num_input_tokens_seen": 137649600, "step": 113185 }, { "epoch": 14.182433279037715, "grad_norm": 1.8288570642471313, "learning_rate": 2.3635094727448316e-06, "loss": 0.4328, "num_input_tokens_seen": 137655648, "step": 113190 }, { "epoch": 14.183059766946498, "grad_norm": 2.0811662673950195, "learning_rate": 2.36304495649897e-06, "loss": 0.5018, "num_input_tokens_seen": 137660448, "step": 113195 }, { "epoch": 14.183686254855282, "grad_norm": 1.289808988571167, "learning_rate": 2.3625804717801203e-06, "loss": 0.4948, "num_input_tokens_seen": 137666560, "step": 113200 }, { "epoch": 14.184312742764064, "grad_norm": 7.156312465667725, "learning_rate": 2.3621160185938413e-06, "loss": 0.5209, "num_input_tokens_seen": 137672480, "step": 113205 }, { "epoch": 14.184939230672848, "grad_norm": 4.209235668182373, "learning_rate": 2.3616515969456805e-06, "loss": 0.4451, "num_input_tokens_seen": 137678752, "step": 113210 }, { "epoch": 14.185565718581632, "grad_norm": 5.910212516784668, "learning_rate": 2.3611872068411955e-06, "loss": 0.4415, "num_input_tokens_seen": 137684896, "step": 113215 }, { "epoch": 14.186192206490414, "grad_norm": 1.2525347471237183, "learning_rate": 2.360722848285934e-06, "loss": 0.4652, "num_input_tokens_seen": 137690912, "step": 113220 }, { "epoch": 14.186818694399198, "grad_norm": 1.6244829893112183, "learning_rate": 2.3602585212854495e-06, "loss": 0.4596, "num_input_tokens_seen": 137697184, "step": 113225 }, { "epoch": 14.187445182307982, "grad_norm": 1.8225452899932861, "learning_rate": 2.359794225845296e-06, "loss": 0.4439, "num_input_tokens_seen": 137703232, "step": 113230 }, { "epoch": 14.188071670216765, "grad_norm": 2.79073166847229, "learning_rate": 2.35932996197102e-06, "loss": 0.4536, "num_input_tokens_seen": 137709440, "step": 113235 }, { "epoch": 14.188698158125549, "grad_norm": 1.6777781248092651, "learning_rate": 2.358865729668175e-06, "loss": 0.4487, "num_input_tokens_seen": 137715744, "step": 113240 }, { "epoch": 14.189324646034331, "grad_norm": 2.310559034347534, "learning_rate": 2.358401528942311e-06, "loss": 0.5266, "num_input_tokens_seen": 137721760, "step": 113245 }, { "epoch": 14.189951133943115, "grad_norm": 6.793608665466309, "learning_rate": 2.3579373597989787e-06, "loss": 0.4817, "num_input_tokens_seen": 137728192, "step": 113250 }, { "epoch": 14.190577621851899, "grad_norm": 12.136589050292969, "learning_rate": 2.3574732222437248e-06, "loss": 0.5616, "num_input_tokens_seen": 137733760, "step": 113255 }, { "epoch": 14.191204109760681, "grad_norm": 4.099392414093018, "learning_rate": 2.3570091162821023e-06, "loss": 0.4815, "num_input_tokens_seen": 137740160, "step": 113260 }, { "epoch": 14.191830597669465, "grad_norm": 3.2129435539245605, "learning_rate": 2.3565450419196563e-06, "loss": 0.4403, "num_input_tokens_seen": 137746112, "step": 113265 }, { "epoch": 14.192457085578248, "grad_norm": 3.764312744140625, "learning_rate": 2.3560809991619383e-06, "loss": 0.4422, "num_input_tokens_seen": 137752192, "step": 113270 }, { "epoch": 14.193083573487032, "grad_norm": 1.765547752380371, "learning_rate": 2.355616988014493e-06, "loss": 0.504, "num_input_tokens_seen": 137758400, "step": 113275 }, { "epoch": 14.193710061395816, "grad_norm": 2.4278149604797363, "learning_rate": 2.355153008482872e-06, "loss": 0.5121, "num_input_tokens_seen": 137763776, "step": 113280 }, { "epoch": 14.194336549304598, "grad_norm": 10.352580070495605, "learning_rate": 2.3546890605726186e-06, "loss": 0.595, "num_input_tokens_seen": 137769792, "step": 113285 }, { "epoch": 14.194963037213382, "grad_norm": 1.3112874031066895, "learning_rate": 2.3542251442892833e-06, "loss": 0.4853, "num_input_tokens_seen": 137775680, "step": 113290 }, { "epoch": 14.195589525122164, "grad_norm": 1.963293433189392, "learning_rate": 2.353761259638409e-06, "loss": 0.4507, "num_input_tokens_seen": 137781824, "step": 113295 }, { "epoch": 14.196216013030948, "grad_norm": 2.8809897899627686, "learning_rate": 2.3532974066255454e-06, "loss": 0.5017, "num_input_tokens_seen": 137787936, "step": 113300 }, { "epoch": 14.196842500939733, "grad_norm": 4.328559875488281, "learning_rate": 2.352833585256235e-06, "loss": 0.4818, "num_input_tokens_seen": 137793856, "step": 113305 }, { "epoch": 14.197468988848515, "grad_norm": 1.5892993211746216, "learning_rate": 2.352369795536024e-06, "loss": 0.4758, "num_input_tokens_seen": 137799936, "step": 113310 }, { "epoch": 14.198095476757299, "grad_norm": 2.3973636627197266, "learning_rate": 2.3519060374704604e-06, "loss": 0.4815, "num_input_tokens_seen": 137806528, "step": 113315 }, { "epoch": 14.198721964666081, "grad_norm": 1.4680979251861572, "learning_rate": 2.351442311065085e-06, "loss": 0.4539, "num_input_tokens_seen": 137812736, "step": 113320 }, { "epoch": 14.199348452574865, "grad_norm": 1.7513831853866577, "learning_rate": 2.350978616325445e-06, "loss": 0.432, "num_input_tokens_seen": 137818688, "step": 113325 }, { "epoch": 14.19997494048365, "grad_norm": 1.5219773054122925, "learning_rate": 2.3505149532570814e-06, "loss": 0.4828, "num_input_tokens_seen": 137824896, "step": 113330 }, { "epoch": 14.200601428392432, "grad_norm": 2.7558693885803223, "learning_rate": 2.3500513218655386e-06, "loss": 0.4974, "num_input_tokens_seen": 137830880, "step": 113335 }, { "epoch": 14.201227916301216, "grad_norm": 1.3700361251831055, "learning_rate": 2.3495877221563623e-06, "loss": 0.4659, "num_input_tokens_seen": 137837024, "step": 113340 }, { "epoch": 14.201854404209998, "grad_norm": 3.9113738536834717, "learning_rate": 2.349124154135091e-06, "loss": 0.4803, "num_input_tokens_seen": 137843200, "step": 113345 }, { "epoch": 14.202480892118782, "grad_norm": 1.3127762079238892, "learning_rate": 2.3486606178072697e-06, "loss": 0.4595, "num_input_tokens_seen": 137849120, "step": 113350 }, { "epoch": 14.203107380027566, "grad_norm": 1.3200575113296509, "learning_rate": 2.348197113178441e-06, "loss": 0.4904, "num_input_tokens_seen": 137855392, "step": 113355 }, { "epoch": 14.203733867936348, "grad_norm": 1.2830995321273804, "learning_rate": 2.347733640254144e-06, "loss": 0.4303, "num_input_tokens_seen": 137861664, "step": 113360 }, { "epoch": 14.204360355845132, "grad_norm": 1.4439369440078735, "learning_rate": 2.347270199039921e-06, "loss": 0.5107, "num_input_tokens_seen": 137867712, "step": 113365 }, { "epoch": 14.204986843753916, "grad_norm": 3.30171537399292, "learning_rate": 2.3468067895413147e-06, "loss": 0.4792, "num_input_tokens_seen": 137873984, "step": 113370 }, { "epoch": 14.205613331662699, "grad_norm": 6.1051812171936035, "learning_rate": 2.3463434117638627e-06, "loss": 0.4775, "num_input_tokens_seen": 137880160, "step": 113375 }, { "epoch": 14.206239819571483, "grad_norm": 8.681990623474121, "learning_rate": 2.3458800657131076e-06, "loss": 0.458, "num_input_tokens_seen": 137886688, "step": 113380 }, { "epoch": 14.206866307480265, "grad_norm": 1.7045232057571411, "learning_rate": 2.3454167513945864e-06, "loss": 0.4523, "num_input_tokens_seen": 137892768, "step": 113385 }, { "epoch": 14.207492795389049, "grad_norm": 1.4557113647460938, "learning_rate": 2.344953468813841e-06, "loss": 0.4521, "num_input_tokens_seen": 137898816, "step": 113390 }, { "epoch": 14.208119283297833, "grad_norm": 1.4384242296218872, "learning_rate": 2.344490217976408e-06, "loss": 0.4558, "num_input_tokens_seen": 137905184, "step": 113395 }, { "epoch": 14.208745771206615, "grad_norm": 1.2120788097381592, "learning_rate": 2.3440269988878284e-06, "loss": 0.4785, "num_input_tokens_seen": 137911136, "step": 113400 }, { "epoch": 14.2093722591154, "grad_norm": 1.182087779045105, "learning_rate": 2.3435638115536373e-06, "loss": 0.4709, "num_input_tokens_seen": 137916896, "step": 113405 }, { "epoch": 14.209998747024182, "grad_norm": 1.1061160564422607, "learning_rate": 2.3431006559793763e-06, "loss": 0.4705, "num_input_tokens_seen": 137923200, "step": 113410 }, { "epoch": 14.210625234932966, "grad_norm": 1.5369987487792969, "learning_rate": 2.3426375321705785e-06, "loss": 0.4565, "num_input_tokens_seen": 137929088, "step": 113415 }, { "epoch": 14.21125172284175, "grad_norm": 2.6036252975463867, "learning_rate": 2.342174440132785e-06, "loss": 0.4522, "num_input_tokens_seen": 137934720, "step": 113420 }, { "epoch": 14.211878210750532, "grad_norm": 1.9996267557144165, "learning_rate": 2.341711379871529e-06, "loss": 0.4778, "num_input_tokens_seen": 137941056, "step": 113425 }, { "epoch": 14.212504698659316, "grad_norm": 1.676267147064209, "learning_rate": 2.341248351392349e-06, "loss": 0.4881, "num_input_tokens_seen": 137947520, "step": 113430 }, { "epoch": 14.213131186568098, "grad_norm": 1.331175684928894, "learning_rate": 2.3407853547007814e-06, "loss": 0.4707, "num_input_tokens_seen": 137954080, "step": 113435 }, { "epoch": 14.213757674476883, "grad_norm": 3.6694562435150146, "learning_rate": 2.3403223898023592e-06, "loss": 0.4865, "num_input_tokens_seen": 137960064, "step": 113440 }, { "epoch": 14.214384162385667, "grad_norm": 1.2298387289047241, "learning_rate": 2.339859456702618e-06, "loss": 0.4444, "num_input_tokens_seen": 137966048, "step": 113445 }, { "epoch": 14.215010650294449, "grad_norm": 9.757538795471191, "learning_rate": 2.3393965554070945e-06, "loss": 0.4515, "num_input_tokens_seen": 137972416, "step": 113450 }, { "epoch": 14.215637138203233, "grad_norm": 1.8815443515777588, "learning_rate": 2.338933685921324e-06, "loss": 0.4508, "num_input_tokens_seen": 137978592, "step": 113455 }, { "epoch": 14.216263626112015, "grad_norm": 4.435957908630371, "learning_rate": 2.338470848250837e-06, "loss": 0.4393, "num_input_tokens_seen": 137984800, "step": 113460 }, { "epoch": 14.2168901140208, "grad_norm": 1.6143747568130493, "learning_rate": 2.3380080424011697e-06, "loss": 0.4422, "num_input_tokens_seen": 137990816, "step": 113465 }, { "epoch": 14.217516601929583, "grad_norm": 4.507412433624268, "learning_rate": 2.337545268377853e-06, "loss": 0.4656, "num_input_tokens_seen": 137996704, "step": 113470 }, { "epoch": 14.218143089838366, "grad_norm": 1.3203871250152588, "learning_rate": 2.3370825261864226e-06, "loss": 0.4682, "num_input_tokens_seen": 138003008, "step": 113475 }, { "epoch": 14.21876957774715, "grad_norm": 1.534019112586975, "learning_rate": 2.336619815832408e-06, "loss": 0.4293, "num_input_tokens_seen": 138009280, "step": 113480 }, { "epoch": 14.219396065655932, "grad_norm": 1.6713918447494507, "learning_rate": 2.3361571373213444e-06, "loss": 0.448, "num_input_tokens_seen": 138015520, "step": 113485 }, { "epoch": 14.220022553564716, "grad_norm": 2.183439016342163, "learning_rate": 2.3356944906587596e-06, "loss": 0.432, "num_input_tokens_seen": 138021696, "step": 113490 }, { "epoch": 14.2206490414735, "grad_norm": 2.3528425693511963, "learning_rate": 2.3352318758501875e-06, "loss": 0.4574, "num_input_tokens_seen": 138028000, "step": 113495 }, { "epoch": 14.221275529382282, "grad_norm": 1.8936680555343628, "learning_rate": 2.3347692929011605e-06, "loss": 0.4477, "num_input_tokens_seen": 138034272, "step": 113500 }, { "epoch": 14.221902017291066, "grad_norm": 1.5896576642990112, "learning_rate": 2.334306741817206e-06, "loss": 0.4899, "num_input_tokens_seen": 138040544, "step": 113505 }, { "epoch": 14.22252850519985, "grad_norm": 3.3269829750061035, "learning_rate": 2.333844222603857e-06, "loss": 0.4917, "num_input_tokens_seen": 138046240, "step": 113510 }, { "epoch": 14.223154993108633, "grad_norm": 5.256940841674805, "learning_rate": 2.3333817352666404e-06, "loss": 0.5396, "num_input_tokens_seen": 138052256, "step": 113515 }, { "epoch": 14.223781481017417, "grad_norm": 6.15231466293335, "learning_rate": 2.332919279811089e-06, "loss": 0.4446, "num_input_tokens_seen": 138058240, "step": 113520 }, { "epoch": 14.224407968926199, "grad_norm": 2.591041088104248, "learning_rate": 2.3324568562427273e-06, "loss": 0.4496, "num_input_tokens_seen": 138064224, "step": 113525 }, { "epoch": 14.225034456834983, "grad_norm": 1.2036405801773071, "learning_rate": 2.3319944645670895e-06, "loss": 0.4311, "num_input_tokens_seen": 138070688, "step": 113530 }, { "epoch": 14.225660944743767, "grad_norm": 0.9706861972808838, "learning_rate": 2.331532104789698e-06, "loss": 0.4703, "num_input_tokens_seen": 138076896, "step": 113535 }, { "epoch": 14.22628743265255, "grad_norm": 1.32300865650177, "learning_rate": 2.3310697769160853e-06, "loss": 0.4652, "num_input_tokens_seen": 138083040, "step": 113540 }, { "epoch": 14.226913920561334, "grad_norm": 1.2861288785934448, "learning_rate": 2.330607480951776e-06, "loss": 0.4499, "num_input_tokens_seen": 138088896, "step": 113545 }, { "epoch": 14.227540408470116, "grad_norm": 1.4442856311798096, "learning_rate": 2.330145216902301e-06, "loss": 0.4481, "num_input_tokens_seen": 138094848, "step": 113550 }, { "epoch": 14.2281668963789, "grad_norm": 1.466374397277832, "learning_rate": 2.3296829847731827e-06, "loss": 0.4285, "num_input_tokens_seen": 138100288, "step": 113555 }, { "epoch": 14.228793384287684, "grad_norm": 1.1836212873458862, "learning_rate": 2.3292207845699495e-06, "loss": 0.4598, "num_input_tokens_seen": 138106240, "step": 113560 }, { "epoch": 14.229419872196466, "grad_norm": 3.0827200412750244, "learning_rate": 2.328758616298129e-06, "loss": 0.4627, "num_input_tokens_seen": 138112384, "step": 113565 }, { "epoch": 14.23004636010525, "grad_norm": 1.7602711915969849, "learning_rate": 2.3282964799632427e-06, "loss": 0.4439, "num_input_tokens_seen": 138118560, "step": 113570 }, { "epoch": 14.230672848014033, "grad_norm": 2.4270894527435303, "learning_rate": 2.327834375570821e-06, "loss": 0.5205, "num_input_tokens_seen": 138124384, "step": 113575 }, { "epoch": 14.231299335922817, "grad_norm": 12.227524757385254, "learning_rate": 2.327372303126384e-06, "loss": 0.5139, "num_input_tokens_seen": 138130784, "step": 113580 }, { "epoch": 14.2319258238316, "grad_norm": 1.2963758707046509, "learning_rate": 2.3269102626354596e-06, "loss": 0.4676, "num_input_tokens_seen": 138136832, "step": 113585 }, { "epoch": 14.232552311740383, "grad_norm": 1.9258103370666504, "learning_rate": 2.3264482541035686e-06, "loss": 0.4266, "num_input_tokens_seen": 138142784, "step": 113590 }, { "epoch": 14.233178799649167, "grad_norm": 1.8623502254486084, "learning_rate": 2.3259862775362387e-06, "loss": 0.4499, "num_input_tokens_seen": 138148768, "step": 113595 }, { "epoch": 14.23380528755795, "grad_norm": 10.445121765136719, "learning_rate": 2.3255243329389887e-06, "loss": 0.4472, "num_input_tokens_seen": 138154816, "step": 113600 }, { "epoch": 14.234431775466733, "grad_norm": 1.8758597373962402, "learning_rate": 2.325062420317346e-06, "loss": 0.4267, "num_input_tokens_seen": 138161056, "step": 113605 }, { "epoch": 14.235058263375517, "grad_norm": 1.411596655845642, "learning_rate": 2.324600539676829e-06, "loss": 0.5048, "num_input_tokens_seen": 138167264, "step": 113610 }, { "epoch": 14.2356847512843, "grad_norm": 15.606818199157715, "learning_rate": 2.3241386910229624e-06, "loss": 0.6192, "num_input_tokens_seen": 138173664, "step": 113615 }, { "epoch": 14.236311239193084, "grad_norm": 1.781856656074524, "learning_rate": 2.3236768743612685e-06, "loss": 0.4873, "num_input_tokens_seen": 138179616, "step": 113620 }, { "epoch": 14.236937727101868, "grad_norm": 7.1216044425964355, "learning_rate": 2.3232150896972666e-06, "loss": 0.46, "num_input_tokens_seen": 138186048, "step": 113625 }, { "epoch": 14.23756421501065, "grad_norm": 1.6187890768051147, "learning_rate": 2.32275333703648e-06, "loss": 0.4397, "num_input_tokens_seen": 138192480, "step": 113630 }, { "epoch": 14.238190702919434, "grad_norm": 7.600651264190674, "learning_rate": 2.3222916163844274e-06, "loss": 0.4845, "num_input_tokens_seen": 138198368, "step": 113635 }, { "epoch": 14.238817190828216, "grad_norm": 1.5664608478546143, "learning_rate": 2.321829927746631e-06, "loss": 0.4358, "num_input_tokens_seen": 138204608, "step": 113640 }, { "epoch": 14.239443678737, "grad_norm": 3.0633556842803955, "learning_rate": 2.3213682711286073e-06, "loss": 0.4891, "num_input_tokens_seen": 138210912, "step": 113645 }, { "epoch": 14.240070166645785, "grad_norm": 2.1202962398529053, "learning_rate": 2.3209066465358786e-06, "loss": 0.4649, "num_input_tokens_seen": 138216896, "step": 113650 }, { "epoch": 14.240696654554567, "grad_norm": 1.3324257135391235, "learning_rate": 2.320445053973963e-06, "loss": 0.4676, "num_input_tokens_seen": 138223200, "step": 113655 }, { "epoch": 14.24132314246335, "grad_norm": 1.8833619356155396, "learning_rate": 2.3199834934483815e-06, "loss": 0.4362, "num_input_tokens_seen": 138229280, "step": 113660 }, { "epoch": 14.241949630372133, "grad_norm": 1.3160202503204346, "learning_rate": 2.319521964964649e-06, "loss": 0.4664, "num_input_tokens_seen": 138235360, "step": 113665 }, { "epoch": 14.242576118280917, "grad_norm": 6.604840278625488, "learning_rate": 2.3190604685282865e-06, "loss": 0.46, "num_input_tokens_seen": 138241344, "step": 113670 }, { "epoch": 14.243202606189701, "grad_norm": 1.9665385484695435, "learning_rate": 2.3185990041448087e-06, "loss": 0.4649, "num_input_tokens_seen": 138247616, "step": 113675 }, { "epoch": 14.243829094098484, "grad_norm": 1.8651220798492432, "learning_rate": 2.3181375718197342e-06, "loss": 0.4454, "num_input_tokens_seen": 138253504, "step": 113680 }, { "epoch": 14.244455582007268, "grad_norm": 2.524923801422119, "learning_rate": 2.317676171558581e-06, "loss": 0.4552, "num_input_tokens_seen": 138259296, "step": 113685 }, { "epoch": 14.24508206991605, "grad_norm": 4.201436996459961, "learning_rate": 2.3172148033668627e-06, "loss": 0.5266, "num_input_tokens_seen": 138265280, "step": 113690 }, { "epoch": 14.245708557824834, "grad_norm": 5.883857250213623, "learning_rate": 2.3167534672500995e-06, "loss": 0.4219, "num_input_tokens_seen": 138271328, "step": 113695 }, { "epoch": 14.246335045733618, "grad_norm": 1.5693403482437134, "learning_rate": 2.316292163213802e-06, "loss": 0.4404, "num_input_tokens_seen": 138277024, "step": 113700 }, { "epoch": 14.2469615336424, "grad_norm": 4.5050554275512695, "learning_rate": 2.31583089126349e-06, "loss": 0.4489, "num_input_tokens_seen": 138283008, "step": 113705 }, { "epoch": 14.247588021551184, "grad_norm": 2.0376360416412354, "learning_rate": 2.3153696514046747e-06, "loss": 0.6195, "num_input_tokens_seen": 138289568, "step": 113710 }, { "epoch": 14.248214509459967, "grad_norm": 2.3528544902801514, "learning_rate": 2.314908443642874e-06, "loss": 0.4281, "num_input_tokens_seen": 138295744, "step": 113715 }, { "epoch": 14.24884099736875, "grad_norm": 5.674635410308838, "learning_rate": 2.3144472679835995e-06, "loss": 0.4818, "num_input_tokens_seen": 138302144, "step": 113720 }, { "epoch": 14.249467485277535, "grad_norm": 13.423948287963867, "learning_rate": 2.3139861244323665e-06, "loss": 0.4776, "num_input_tokens_seen": 138308064, "step": 113725 }, { "epoch": 14.250093973186317, "grad_norm": 1.862133264541626, "learning_rate": 2.313525012994686e-06, "loss": 0.4848, "num_input_tokens_seen": 138313632, "step": 113730 }, { "epoch": 14.250720461095101, "grad_norm": 1.688193440437317, "learning_rate": 2.3130639336760744e-06, "loss": 0.4206, "num_input_tokens_seen": 138319904, "step": 113735 }, { "epoch": 14.251346949003885, "grad_norm": 6.719305992126465, "learning_rate": 2.3126028864820416e-06, "loss": 0.4694, "num_input_tokens_seen": 138325824, "step": 113740 }, { "epoch": 14.251973436912667, "grad_norm": 11.979767799377441, "learning_rate": 2.3121418714181e-06, "loss": 0.5545, "num_input_tokens_seen": 138331904, "step": 113745 }, { "epoch": 14.252599924821451, "grad_norm": 2.386713981628418, "learning_rate": 2.3116808884897616e-06, "loss": 0.4901, "num_input_tokens_seen": 138338080, "step": 113750 }, { "epoch": 14.253226412730234, "grad_norm": 1.5418745279312134, "learning_rate": 2.311219937702541e-06, "loss": 0.4144, "num_input_tokens_seen": 138344256, "step": 113755 }, { "epoch": 14.253852900639018, "grad_norm": 1.9334518909454346, "learning_rate": 2.310759019061945e-06, "loss": 0.4444, "num_input_tokens_seen": 138349792, "step": 113760 }, { "epoch": 14.254479388547802, "grad_norm": 2.161613941192627, "learning_rate": 2.310298132573485e-06, "loss": 0.4507, "num_input_tokens_seen": 138356064, "step": 113765 }, { "epoch": 14.255105876456584, "grad_norm": 2.042060136795044, "learning_rate": 2.3098372782426747e-06, "loss": 0.4338, "num_input_tokens_seen": 138362208, "step": 113770 }, { "epoch": 14.255732364365368, "grad_norm": 10.219501495361328, "learning_rate": 2.30937645607502e-06, "loss": 0.5234, "num_input_tokens_seen": 138368096, "step": 113775 }, { "epoch": 14.25635885227415, "grad_norm": 2.3729076385498047, "learning_rate": 2.308915666076033e-06, "loss": 0.4797, "num_input_tokens_seen": 138373888, "step": 113780 }, { "epoch": 14.256985340182935, "grad_norm": 11.161155700683594, "learning_rate": 2.30845490825122e-06, "loss": 0.4736, "num_input_tokens_seen": 138380128, "step": 113785 }, { "epoch": 14.257611828091719, "grad_norm": 2.053199529647827, "learning_rate": 2.307994182606094e-06, "loss": 0.4683, "num_input_tokens_seen": 138386208, "step": 113790 }, { "epoch": 14.2582383160005, "grad_norm": 3.420713186264038, "learning_rate": 2.3075334891461587e-06, "loss": 0.4712, "num_input_tokens_seen": 138391840, "step": 113795 }, { "epoch": 14.258864803909285, "grad_norm": 2.060988187789917, "learning_rate": 2.307072827876924e-06, "loss": 0.6139, "num_input_tokens_seen": 138397760, "step": 113800 }, { "epoch": 14.259491291818067, "grad_norm": 9.41100788116455, "learning_rate": 2.3066121988038996e-06, "loss": 0.5017, "num_input_tokens_seen": 138403648, "step": 113805 }, { "epoch": 14.260117779726851, "grad_norm": 2.180377960205078, "learning_rate": 2.3061516019325893e-06, "loss": 0.5066, "num_input_tokens_seen": 138409728, "step": 113810 }, { "epoch": 14.260744267635635, "grad_norm": 5.558079719543457, "learning_rate": 2.3056910372685033e-06, "loss": 0.5316, "num_input_tokens_seen": 138416032, "step": 113815 }, { "epoch": 14.261370755544418, "grad_norm": 9.728089332580566, "learning_rate": 2.3052305048171438e-06, "loss": 0.482, "num_input_tokens_seen": 138422112, "step": 113820 }, { "epoch": 14.261997243453202, "grad_norm": 4.008591175079346, "learning_rate": 2.3047700045840216e-06, "loss": 0.515, "num_input_tokens_seen": 138428032, "step": 113825 }, { "epoch": 14.262623731361984, "grad_norm": 1.8029911518096924, "learning_rate": 2.304309536574638e-06, "loss": 0.4784, "num_input_tokens_seen": 138434016, "step": 113830 }, { "epoch": 14.263250219270768, "grad_norm": 9.878432273864746, "learning_rate": 2.3038491007945018e-06, "loss": 0.5041, "num_input_tokens_seen": 138440288, "step": 113835 }, { "epoch": 14.263876707179552, "grad_norm": 4.9996418952941895, "learning_rate": 2.3033886972491156e-06, "loss": 0.5027, "num_input_tokens_seen": 138446272, "step": 113840 }, { "epoch": 14.264503195088334, "grad_norm": 4.914570331573486, "learning_rate": 2.3029283259439837e-06, "loss": 0.4706, "num_input_tokens_seen": 138452352, "step": 113845 }, { "epoch": 14.265129682997118, "grad_norm": 11.344548225402832, "learning_rate": 2.3024679868846134e-06, "loss": 0.4903, "num_input_tokens_seen": 138458464, "step": 113850 }, { "epoch": 14.265756170905902, "grad_norm": 1.8406356573104858, "learning_rate": 2.3020076800765045e-06, "loss": 0.4631, "num_input_tokens_seen": 138464736, "step": 113855 }, { "epoch": 14.266382658814685, "grad_norm": 1.5500218868255615, "learning_rate": 2.301547405525162e-06, "loss": 0.4618, "num_input_tokens_seen": 138470720, "step": 113860 }, { "epoch": 14.267009146723469, "grad_norm": 11.509556770324707, "learning_rate": 2.3010871632360893e-06, "loss": 0.5154, "num_input_tokens_seen": 138476832, "step": 113865 }, { "epoch": 14.267635634632251, "grad_norm": 12.347696304321289, "learning_rate": 2.30062695321479e-06, "loss": 0.4876, "num_input_tokens_seen": 138482848, "step": 113870 }, { "epoch": 14.268262122541035, "grad_norm": 1.9758269786834717, "learning_rate": 2.3001667754667633e-06, "loss": 0.4387, "num_input_tokens_seen": 138489056, "step": 113875 }, { "epoch": 14.26888861044982, "grad_norm": 3.989246129989624, "learning_rate": 2.299706629997515e-06, "loss": 0.4766, "num_input_tokens_seen": 138495200, "step": 113880 }, { "epoch": 14.269515098358601, "grad_norm": 1.449103832244873, "learning_rate": 2.2992465168125423e-06, "loss": 0.4731, "num_input_tokens_seen": 138501568, "step": 113885 }, { "epoch": 14.270141586267385, "grad_norm": 4.354246139526367, "learning_rate": 2.2987864359173493e-06, "loss": 0.4698, "num_input_tokens_seen": 138507744, "step": 113890 }, { "epoch": 14.270768074176168, "grad_norm": 4.943124294281006, "learning_rate": 2.2983263873174343e-06, "loss": 0.4493, "num_input_tokens_seen": 138513696, "step": 113895 }, { "epoch": 14.271394562084952, "grad_norm": 2.1088359355926514, "learning_rate": 2.2978663710183007e-06, "loss": 0.4662, "num_input_tokens_seen": 138519232, "step": 113900 }, { "epoch": 14.272021049993736, "grad_norm": 11.05268383026123, "learning_rate": 2.2974063870254442e-06, "loss": 0.4888, "num_input_tokens_seen": 138525696, "step": 113905 }, { "epoch": 14.272647537902518, "grad_norm": 1.7137459516525269, "learning_rate": 2.296946435344369e-06, "loss": 0.4626, "num_input_tokens_seen": 138531200, "step": 113910 }, { "epoch": 14.273274025811302, "grad_norm": 2.2242908477783203, "learning_rate": 2.2964865159805692e-06, "loss": 0.4603, "num_input_tokens_seen": 138537312, "step": 113915 }, { "epoch": 14.273900513720085, "grad_norm": 4.423632621765137, "learning_rate": 2.296026628939548e-06, "loss": 0.5359, "num_input_tokens_seen": 138543200, "step": 113920 }, { "epoch": 14.274527001628869, "grad_norm": 1.339925765991211, "learning_rate": 2.2955667742268008e-06, "loss": 0.4438, "num_input_tokens_seen": 138549184, "step": 113925 }, { "epoch": 14.275153489537653, "grad_norm": 1.8138844966888428, "learning_rate": 2.295106951847826e-06, "loss": 0.5316, "num_input_tokens_seen": 138555200, "step": 113930 }, { "epoch": 14.275779977446435, "grad_norm": 1.7784734964370728, "learning_rate": 2.2946471618081235e-06, "loss": 0.4421, "num_input_tokens_seen": 138561440, "step": 113935 }, { "epoch": 14.276406465355219, "grad_norm": 1.8266232013702393, "learning_rate": 2.294187404113187e-06, "loss": 0.4429, "num_input_tokens_seen": 138567648, "step": 113940 }, { "epoch": 14.277032953264001, "grad_norm": 12.373664855957031, "learning_rate": 2.2937276787685164e-06, "loss": 0.4674, "num_input_tokens_seen": 138573824, "step": 113945 }, { "epoch": 14.277659441172785, "grad_norm": 5.147034168243408, "learning_rate": 2.2932679857796053e-06, "loss": 0.476, "num_input_tokens_seen": 138579808, "step": 113950 }, { "epoch": 14.27828592908157, "grad_norm": 5.083148002624512, "learning_rate": 2.2928083251519513e-06, "loss": 0.4742, "num_input_tokens_seen": 138585824, "step": 113955 }, { "epoch": 14.278912416990352, "grad_norm": 10.261033058166504, "learning_rate": 2.2923486968910512e-06, "loss": 0.4826, "num_input_tokens_seen": 138591392, "step": 113960 }, { "epoch": 14.279538904899136, "grad_norm": 11.3525972366333, "learning_rate": 2.2918891010023973e-06, "loss": 0.4699, "num_input_tokens_seen": 138597152, "step": 113965 }, { "epoch": 14.280165392807918, "grad_norm": 1.6031324863433838, "learning_rate": 2.2914295374914863e-06, "loss": 0.5076, "num_input_tokens_seen": 138602976, "step": 113970 }, { "epoch": 14.280791880716702, "grad_norm": 1.4513146877288818, "learning_rate": 2.290970006363814e-06, "loss": 0.4528, "num_input_tokens_seen": 138608544, "step": 113975 }, { "epoch": 14.281418368625486, "grad_norm": 4.328566551208496, "learning_rate": 2.290510507624871e-06, "loss": 0.4496, "num_input_tokens_seen": 138614496, "step": 113980 }, { "epoch": 14.282044856534268, "grad_norm": 1.4665801525115967, "learning_rate": 2.2900510412801537e-06, "loss": 0.4502, "num_input_tokens_seen": 138620672, "step": 113985 }, { "epoch": 14.282671344443052, "grad_norm": 3.554607629776001, "learning_rate": 2.2895916073351555e-06, "loss": 0.4656, "num_input_tokens_seen": 138626720, "step": 113990 }, { "epoch": 14.283297832351835, "grad_norm": 1.6597477197647095, "learning_rate": 2.289132205795367e-06, "loss": 0.4524, "num_input_tokens_seen": 138632736, "step": 113995 }, { "epoch": 14.283924320260619, "grad_norm": 3.0627987384796143, "learning_rate": 2.2886728366662837e-06, "loss": 0.4543, "num_input_tokens_seen": 138639040, "step": 114000 }, { "epoch": 14.284550808169403, "grad_norm": 3.97904896736145, "learning_rate": 2.2882134999533944e-06, "loss": 0.4463, "num_input_tokens_seen": 138645184, "step": 114005 }, { "epoch": 14.285177296078185, "grad_norm": 6.980892658233643, "learning_rate": 2.2877541956621946e-06, "loss": 0.4405, "num_input_tokens_seen": 138651328, "step": 114010 }, { "epoch": 14.28580378398697, "grad_norm": 1.7735121250152588, "learning_rate": 2.287294923798172e-06, "loss": 0.4729, "num_input_tokens_seen": 138657440, "step": 114015 }, { "epoch": 14.286430271895753, "grad_norm": 10.332533836364746, "learning_rate": 2.286835684366821e-06, "loss": 0.5288, "num_input_tokens_seen": 138663584, "step": 114020 }, { "epoch": 14.287056759804535, "grad_norm": 12.657320022583008, "learning_rate": 2.2863764773736287e-06, "loss": 0.5335, "num_input_tokens_seen": 138669792, "step": 114025 }, { "epoch": 14.28768324771332, "grad_norm": 2.311098575592041, "learning_rate": 2.285917302824089e-06, "loss": 0.5185, "num_input_tokens_seen": 138675840, "step": 114030 }, { "epoch": 14.288309735622102, "grad_norm": 1.8695180416107178, "learning_rate": 2.2854581607236874e-06, "loss": 0.559, "num_input_tokens_seen": 138681856, "step": 114035 }, { "epoch": 14.288936223530886, "grad_norm": 2.7276687622070312, "learning_rate": 2.2849990510779183e-06, "loss": 0.5145, "num_input_tokens_seen": 138688512, "step": 114040 }, { "epoch": 14.28956271143967, "grad_norm": 2.7336602210998535, "learning_rate": 2.284539973892266e-06, "loss": 0.4757, "num_input_tokens_seen": 138694656, "step": 114045 }, { "epoch": 14.290189199348452, "grad_norm": 1.3641024827957153, "learning_rate": 2.2840809291722217e-06, "loss": 0.4585, "num_input_tokens_seen": 138700800, "step": 114050 }, { "epoch": 14.290815687257236, "grad_norm": 1.5983211994171143, "learning_rate": 2.2836219169232747e-06, "loss": 0.4429, "num_input_tokens_seen": 138706400, "step": 114055 }, { "epoch": 14.291442175166019, "grad_norm": 1.5169949531555176, "learning_rate": 2.28316293715091e-06, "loss": 0.4891, "num_input_tokens_seen": 138712608, "step": 114060 }, { "epoch": 14.292068663074803, "grad_norm": 1.6795079708099365, "learning_rate": 2.282703989860616e-06, "loss": 0.4632, "num_input_tokens_seen": 138718528, "step": 114065 }, { "epoch": 14.292695150983587, "grad_norm": 1.7480436563491821, "learning_rate": 2.2822450750578812e-06, "loss": 0.4704, "num_input_tokens_seen": 138723840, "step": 114070 }, { "epoch": 14.293321638892369, "grad_norm": 5.882448196411133, "learning_rate": 2.2817861927481933e-06, "loss": 0.4729, "num_input_tokens_seen": 138730080, "step": 114075 }, { "epoch": 14.293948126801153, "grad_norm": 1.5720882415771484, "learning_rate": 2.2813273429370344e-06, "loss": 0.4935, "num_input_tokens_seen": 138736288, "step": 114080 }, { "epoch": 14.294574614709935, "grad_norm": 11.93233585357666, "learning_rate": 2.2808685256298953e-06, "loss": 0.5497, "num_input_tokens_seen": 138742688, "step": 114085 }, { "epoch": 14.29520110261872, "grad_norm": 5.744451522827148, "learning_rate": 2.280409740832257e-06, "loss": 0.4552, "num_input_tokens_seen": 138748544, "step": 114090 }, { "epoch": 14.295827590527503, "grad_norm": 1.1098603010177612, "learning_rate": 2.2799509885496078e-06, "loss": 0.4961, "num_input_tokens_seen": 138754656, "step": 114095 }, { "epoch": 14.296454078436286, "grad_norm": 2.250762701034546, "learning_rate": 2.2794922687874305e-06, "loss": 0.4525, "num_input_tokens_seen": 138760608, "step": 114100 }, { "epoch": 14.29708056634507, "grad_norm": 1.5950239896774292, "learning_rate": 2.279033581551212e-06, "loss": 0.454, "num_input_tokens_seen": 138767040, "step": 114105 }, { "epoch": 14.297707054253852, "grad_norm": 1.5771636962890625, "learning_rate": 2.278574926846433e-06, "loss": 0.444, "num_input_tokens_seen": 138773056, "step": 114110 }, { "epoch": 14.298333542162636, "grad_norm": 2.741818904876709, "learning_rate": 2.2781163046785785e-06, "loss": 0.4841, "num_input_tokens_seen": 138779200, "step": 114115 }, { "epoch": 14.29896003007142, "grad_norm": 3.761502504348755, "learning_rate": 2.277657715053134e-06, "loss": 0.489, "num_input_tokens_seen": 138785184, "step": 114120 }, { "epoch": 14.299586517980202, "grad_norm": 2.087489128112793, "learning_rate": 2.277199157975579e-06, "loss": 0.4258, "num_input_tokens_seen": 138791616, "step": 114125 }, { "epoch": 14.300213005888986, "grad_norm": 1.6634573936462402, "learning_rate": 2.276740633451398e-06, "loss": 0.425, "num_input_tokens_seen": 138797664, "step": 114130 }, { "epoch": 14.30083949379777, "grad_norm": 1.831046223640442, "learning_rate": 2.2762821414860716e-06, "loss": 0.4677, "num_input_tokens_seen": 138803744, "step": 114135 }, { "epoch": 14.301465981706553, "grad_norm": 10.339539527893066, "learning_rate": 2.275823682085083e-06, "loss": 0.468, "num_input_tokens_seen": 138809888, "step": 114140 }, { "epoch": 14.302092469615337, "grad_norm": 6.774473190307617, "learning_rate": 2.2753652552539114e-06, "loss": 0.4649, "num_input_tokens_seen": 138816192, "step": 114145 }, { "epoch": 14.30271895752412, "grad_norm": 1.8719533681869507, "learning_rate": 2.2749068609980406e-06, "loss": 0.5451, "num_input_tokens_seen": 138822368, "step": 114150 }, { "epoch": 14.303345445432903, "grad_norm": 5.7157206535339355, "learning_rate": 2.2744484993229475e-06, "loss": 0.5312, "num_input_tokens_seen": 138828608, "step": 114155 }, { "epoch": 14.303971933341687, "grad_norm": 2.534877300262451, "learning_rate": 2.2739901702341143e-06, "loss": 0.5087, "num_input_tokens_seen": 138834624, "step": 114160 }, { "epoch": 14.30459842125047, "grad_norm": 6.136229515075684, "learning_rate": 2.2735318737370205e-06, "loss": 0.4639, "num_input_tokens_seen": 138840832, "step": 114165 }, { "epoch": 14.305224909159254, "grad_norm": 6.762418746948242, "learning_rate": 2.2730736098371474e-06, "loss": 0.4587, "num_input_tokens_seen": 138846944, "step": 114170 }, { "epoch": 14.305851397068036, "grad_norm": 3.771113634109497, "learning_rate": 2.27261537853997e-06, "loss": 0.4255, "num_input_tokens_seen": 138853344, "step": 114175 }, { "epoch": 14.30647788497682, "grad_norm": 4.864681243896484, "learning_rate": 2.272157179850969e-06, "loss": 0.4334, "num_input_tokens_seen": 138859744, "step": 114180 }, { "epoch": 14.307104372885604, "grad_norm": 1.8110480308532715, "learning_rate": 2.2716990137756233e-06, "loss": 0.4511, "num_input_tokens_seen": 138866272, "step": 114185 }, { "epoch": 14.307730860794386, "grad_norm": 1.6731542348861694, "learning_rate": 2.271240880319409e-06, "loss": 0.463, "num_input_tokens_seen": 138872448, "step": 114190 }, { "epoch": 14.30835734870317, "grad_norm": 1.812995195388794, "learning_rate": 2.2707827794878057e-06, "loss": 0.446, "num_input_tokens_seen": 138878464, "step": 114195 }, { "epoch": 14.308983836611953, "grad_norm": 1.6157392263412476, "learning_rate": 2.270324711286287e-06, "loss": 0.5195, "num_input_tokens_seen": 138884768, "step": 114200 }, { "epoch": 14.309610324520737, "grad_norm": 2.150902032852173, "learning_rate": 2.269866675720333e-06, "loss": 0.4613, "num_input_tokens_seen": 138890528, "step": 114205 }, { "epoch": 14.31023681242952, "grad_norm": 1.8566869497299194, "learning_rate": 2.2694086727954164e-06, "loss": 0.4369, "num_input_tokens_seen": 138896704, "step": 114210 }, { "epoch": 14.310863300338303, "grad_norm": 9.486382484436035, "learning_rate": 2.268950702517017e-06, "loss": 0.4795, "num_input_tokens_seen": 138902880, "step": 114215 }, { "epoch": 14.311489788247087, "grad_norm": 7.446542739868164, "learning_rate": 2.2684927648906056e-06, "loss": 0.5174, "num_input_tokens_seen": 138908704, "step": 114220 }, { "epoch": 14.31211627615587, "grad_norm": 2.4372057914733887, "learning_rate": 2.268034859921662e-06, "loss": 0.4881, "num_input_tokens_seen": 138914304, "step": 114225 }, { "epoch": 14.312742764064653, "grad_norm": 12.764440536499023, "learning_rate": 2.267576987615657e-06, "loss": 0.5408, "num_input_tokens_seen": 138920832, "step": 114230 }, { "epoch": 14.313369251973437, "grad_norm": 5.122719764709473, "learning_rate": 2.2671191479780664e-06, "loss": 0.4962, "num_input_tokens_seen": 138926976, "step": 114235 }, { "epoch": 14.31399573988222, "grad_norm": 2.9703235626220703, "learning_rate": 2.2666613410143658e-06, "loss": 0.5084, "num_input_tokens_seen": 138933184, "step": 114240 }, { "epoch": 14.314622227791004, "grad_norm": 2.5162336826324463, "learning_rate": 2.2662035667300247e-06, "loss": 0.4803, "num_input_tokens_seen": 138939520, "step": 114245 }, { "epoch": 14.315248715699788, "grad_norm": 1.4385960102081299, "learning_rate": 2.265745825130521e-06, "loss": 0.4319, "num_input_tokens_seen": 138945696, "step": 114250 }, { "epoch": 14.31587520360857, "grad_norm": 1.8541072607040405, "learning_rate": 2.265288116221322e-06, "loss": 0.4978, "num_input_tokens_seen": 138951840, "step": 114255 }, { "epoch": 14.316501691517354, "grad_norm": 2.14917254447937, "learning_rate": 2.264830440007905e-06, "loss": 0.4452, "num_input_tokens_seen": 138957824, "step": 114260 }, { "epoch": 14.317128179426136, "grad_norm": 1.643437385559082, "learning_rate": 2.2643727964957386e-06, "loss": 0.4746, "num_input_tokens_seen": 138964128, "step": 114265 }, { "epoch": 14.31775466733492, "grad_norm": 1.5268818140029907, "learning_rate": 2.2639151856902945e-06, "loss": 0.4866, "num_input_tokens_seen": 138970080, "step": 114270 }, { "epoch": 14.318381155243705, "grad_norm": 2.0544135570526123, "learning_rate": 2.263457607597045e-06, "loss": 0.4764, "num_input_tokens_seen": 138976672, "step": 114275 }, { "epoch": 14.319007643152487, "grad_norm": 2.1764602661132812, "learning_rate": 2.2630000622214625e-06, "loss": 0.4557, "num_input_tokens_seen": 138983008, "step": 114280 }, { "epoch": 14.319634131061271, "grad_norm": 1.6987379789352417, "learning_rate": 2.2625425495690135e-06, "loss": 0.533, "num_input_tokens_seen": 138989312, "step": 114285 }, { "epoch": 14.320260618970053, "grad_norm": 7.964550018310547, "learning_rate": 2.2620850696451713e-06, "loss": 0.4733, "num_input_tokens_seen": 138995328, "step": 114290 }, { "epoch": 14.320887106878837, "grad_norm": 1.5038388967514038, "learning_rate": 2.2616276224554027e-06, "loss": 0.4835, "num_input_tokens_seen": 139001280, "step": 114295 }, { "epoch": 14.321513594787621, "grad_norm": 1.3675376176834106, "learning_rate": 2.261170208005178e-06, "loss": 0.4686, "num_input_tokens_seen": 139007488, "step": 114300 }, { "epoch": 14.322140082696404, "grad_norm": 10.621647834777832, "learning_rate": 2.2607128262999678e-06, "loss": 0.4579, "num_input_tokens_seen": 139013504, "step": 114305 }, { "epoch": 14.322766570605188, "grad_norm": 3.685845375061035, "learning_rate": 2.2602554773452374e-06, "loss": 0.4846, "num_input_tokens_seen": 139019168, "step": 114310 }, { "epoch": 14.32339305851397, "grad_norm": 5.004531383514404, "learning_rate": 2.2597981611464574e-06, "loss": 0.4727, "num_input_tokens_seen": 139025408, "step": 114315 }, { "epoch": 14.324019546422754, "grad_norm": 1.2088192701339722, "learning_rate": 2.259340877709093e-06, "loss": 0.4426, "num_input_tokens_seen": 139031872, "step": 114320 }, { "epoch": 14.324646034331538, "grad_norm": 6.5775275230407715, "learning_rate": 2.258883627038614e-06, "loss": 0.4839, "num_input_tokens_seen": 139037952, "step": 114325 }, { "epoch": 14.32527252224032, "grad_norm": 1.1800310611724854, "learning_rate": 2.2584264091404845e-06, "loss": 0.43, "num_input_tokens_seen": 139044032, "step": 114330 }, { "epoch": 14.325899010149104, "grad_norm": 2.1513442993164062, "learning_rate": 2.2579692240201738e-06, "loss": 0.503, "num_input_tokens_seen": 139049312, "step": 114335 }, { "epoch": 14.326525498057887, "grad_norm": 1.4489136934280396, "learning_rate": 2.257512071683145e-06, "loss": 0.458, "num_input_tokens_seen": 139055008, "step": 114340 }, { "epoch": 14.32715198596667, "grad_norm": 1.52232825756073, "learning_rate": 2.2570549521348667e-06, "loss": 0.4576, "num_input_tokens_seen": 139060960, "step": 114345 }, { "epoch": 14.327778473875455, "grad_norm": 3.8197574615478516, "learning_rate": 2.256597865380801e-06, "loss": 0.4643, "num_input_tokens_seen": 139066400, "step": 114350 }, { "epoch": 14.328404961784237, "grad_norm": 5.700592517852783, "learning_rate": 2.256140811426414e-06, "loss": 0.4789, "num_input_tokens_seen": 139072352, "step": 114355 }, { "epoch": 14.329031449693021, "grad_norm": 6.296690940856934, "learning_rate": 2.2556837902771718e-06, "loss": 0.4922, "num_input_tokens_seen": 139078176, "step": 114360 }, { "epoch": 14.329657937601805, "grad_norm": 1.7668009996414185, "learning_rate": 2.255226801938536e-06, "loss": 0.4853, "num_input_tokens_seen": 139084288, "step": 114365 }, { "epoch": 14.330284425510587, "grad_norm": 5.1282219886779785, "learning_rate": 2.254769846415971e-06, "loss": 0.4557, "num_input_tokens_seen": 139090496, "step": 114370 }, { "epoch": 14.330910913419372, "grad_norm": 1.6452230215072632, "learning_rate": 2.254312923714943e-06, "loss": 0.4589, "num_input_tokens_seen": 139096736, "step": 114375 }, { "epoch": 14.331537401328154, "grad_norm": 1.6604284048080444, "learning_rate": 2.2538560338409104e-06, "loss": 0.4519, "num_input_tokens_seen": 139102912, "step": 114380 }, { "epoch": 14.332163889236938, "grad_norm": 1.4522024393081665, "learning_rate": 2.253399176799338e-06, "loss": 0.447, "num_input_tokens_seen": 139109024, "step": 114385 }, { "epoch": 14.332790377145722, "grad_norm": 1.5327825546264648, "learning_rate": 2.252942352595689e-06, "loss": 0.4982, "num_input_tokens_seen": 139114784, "step": 114390 }, { "epoch": 14.333416865054504, "grad_norm": 5.5552167892456055, "learning_rate": 2.2524855612354223e-06, "loss": 0.4589, "num_input_tokens_seen": 139120768, "step": 114395 }, { "epoch": 14.334043352963288, "grad_norm": 2.3933284282684326, "learning_rate": 2.252028802724002e-06, "loss": 0.4569, "num_input_tokens_seen": 139126592, "step": 114400 }, { "epoch": 14.33466984087207, "grad_norm": 1.3925749063491821, "learning_rate": 2.2515720770668864e-06, "loss": 0.4509, "num_input_tokens_seen": 139132640, "step": 114405 }, { "epoch": 14.335296328780855, "grad_norm": 1.5419635772705078, "learning_rate": 2.2511153842695393e-06, "loss": 0.4665, "num_input_tokens_seen": 139138656, "step": 114410 }, { "epoch": 14.335922816689639, "grad_norm": 1.4359560012817383, "learning_rate": 2.2506587243374165e-06, "loss": 0.4397, "num_input_tokens_seen": 139144544, "step": 114415 }, { "epoch": 14.336549304598421, "grad_norm": 4.4141435623168945, "learning_rate": 2.2502020972759806e-06, "loss": 0.5202, "num_input_tokens_seen": 139150304, "step": 114420 }, { "epoch": 14.337175792507205, "grad_norm": 1.3520029783248901, "learning_rate": 2.2497455030906925e-06, "loss": 0.4686, "num_input_tokens_seen": 139156320, "step": 114425 }, { "epoch": 14.337802280415987, "grad_norm": 3.1670000553131104, "learning_rate": 2.249288941787007e-06, "loss": 0.4888, "num_input_tokens_seen": 139162240, "step": 114430 }, { "epoch": 14.338428768324771, "grad_norm": 6.432058334350586, "learning_rate": 2.248832413370387e-06, "loss": 0.4545, "num_input_tokens_seen": 139168352, "step": 114435 }, { "epoch": 14.339055256233555, "grad_norm": 1.6997472047805786, "learning_rate": 2.2483759178462864e-06, "loss": 0.458, "num_input_tokens_seen": 139173888, "step": 114440 }, { "epoch": 14.339681744142338, "grad_norm": 4.578794479370117, "learning_rate": 2.2479194552201672e-06, "loss": 0.4808, "num_input_tokens_seen": 139179936, "step": 114445 }, { "epoch": 14.340308232051122, "grad_norm": 3.892752170562744, "learning_rate": 2.247463025497483e-06, "loss": 0.4877, "num_input_tokens_seen": 139186272, "step": 114450 }, { "epoch": 14.340934719959904, "grad_norm": 4.697083950042725, "learning_rate": 2.247006628683694e-06, "loss": 0.465, "num_input_tokens_seen": 139192160, "step": 114455 }, { "epoch": 14.341561207868688, "grad_norm": 2.2483861446380615, "learning_rate": 2.2465502647842536e-06, "loss": 0.4735, "num_input_tokens_seen": 139198272, "step": 114460 }, { "epoch": 14.342187695777472, "grad_norm": 1.2544488906860352, "learning_rate": 2.24609393380462e-06, "loss": 0.4285, "num_input_tokens_seen": 139204416, "step": 114465 }, { "epoch": 14.342814183686254, "grad_norm": 1.4235022068023682, "learning_rate": 2.245637635750251e-06, "loss": 0.4933, "num_input_tokens_seen": 139210720, "step": 114470 }, { "epoch": 14.343440671595038, "grad_norm": 1.653459906578064, "learning_rate": 2.245181370626597e-06, "loss": 0.4321, "num_input_tokens_seen": 139216576, "step": 114475 }, { "epoch": 14.34406715950382, "grad_norm": 1.9690721035003662, "learning_rate": 2.244725138439116e-06, "loss": 0.4379, "num_input_tokens_seen": 139222368, "step": 114480 }, { "epoch": 14.344693647412605, "grad_norm": 2.140500068664551, "learning_rate": 2.244268939193263e-06, "loss": 0.4681, "num_input_tokens_seen": 139228800, "step": 114485 }, { "epoch": 14.345320135321389, "grad_norm": 14.239019393920898, "learning_rate": 2.2438127728944937e-06, "loss": 0.4971, "num_input_tokens_seen": 139235008, "step": 114490 }, { "epoch": 14.345946623230171, "grad_norm": 8.736472129821777, "learning_rate": 2.2433566395482577e-06, "loss": 0.4593, "num_input_tokens_seen": 139241472, "step": 114495 }, { "epoch": 14.346573111138955, "grad_norm": 1.6008665561676025, "learning_rate": 2.2429005391600122e-06, "loss": 0.4609, "num_input_tokens_seen": 139247520, "step": 114500 }, { "epoch": 14.34719959904774, "grad_norm": 4.710679531097412, "learning_rate": 2.242444471735208e-06, "loss": 0.4736, "num_input_tokens_seen": 139253792, "step": 114505 }, { "epoch": 14.347826086956522, "grad_norm": 10.87995433807373, "learning_rate": 2.2419884372792996e-06, "loss": 0.4974, "num_input_tokens_seen": 139260224, "step": 114510 }, { "epoch": 14.348452574865306, "grad_norm": 5.908297061920166, "learning_rate": 2.2415324357977366e-06, "loss": 0.4587, "num_input_tokens_seen": 139266208, "step": 114515 }, { "epoch": 14.349079062774088, "grad_norm": 4.966862678527832, "learning_rate": 2.2410764672959744e-06, "loss": 0.4429, "num_input_tokens_seen": 139272096, "step": 114520 }, { "epoch": 14.349705550682872, "grad_norm": 1.3608694076538086, "learning_rate": 2.2406205317794603e-06, "loss": 0.4264, "num_input_tokens_seen": 139278368, "step": 114525 }, { "epoch": 14.350332038591656, "grad_norm": 6.727028846740723, "learning_rate": 2.2401646292536504e-06, "loss": 0.4786, "num_input_tokens_seen": 139284544, "step": 114530 }, { "epoch": 14.350958526500438, "grad_norm": 1.6415159702301025, "learning_rate": 2.2397087597239904e-06, "loss": 0.4748, "num_input_tokens_seen": 139290208, "step": 114535 }, { "epoch": 14.351585014409222, "grad_norm": 1.6544368267059326, "learning_rate": 2.2392529231959347e-06, "loss": 0.4759, "num_input_tokens_seen": 139296224, "step": 114540 }, { "epoch": 14.352211502318005, "grad_norm": 7.676844596862793, "learning_rate": 2.2387971196749297e-06, "loss": 0.4998, "num_input_tokens_seen": 139302432, "step": 114545 }, { "epoch": 14.352837990226789, "grad_norm": 1.6505095958709717, "learning_rate": 2.2383413491664265e-06, "loss": 0.5036, "num_input_tokens_seen": 139308512, "step": 114550 }, { "epoch": 14.353464478135573, "grad_norm": 10.938273429870605, "learning_rate": 2.237885611675876e-06, "loss": 0.4991, "num_input_tokens_seen": 139313856, "step": 114555 }, { "epoch": 14.354090966044355, "grad_norm": 1.359349250793457, "learning_rate": 2.2374299072087243e-06, "loss": 0.4945, "num_input_tokens_seen": 139320000, "step": 114560 }, { "epoch": 14.354717453953139, "grad_norm": 6.025094985961914, "learning_rate": 2.2369742357704215e-06, "loss": 0.494, "num_input_tokens_seen": 139325888, "step": 114565 }, { "epoch": 14.355343941861921, "grad_norm": 1.3996119499206543, "learning_rate": 2.2365185973664134e-06, "loss": 0.4547, "num_input_tokens_seen": 139331840, "step": 114570 }, { "epoch": 14.355970429770705, "grad_norm": 4.916224002838135, "learning_rate": 2.2360629920021486e-06, "loss": 0.4653, "num_input_tokens_seen": 139337760, "step": 114575 }, { "epoch": 14.35659691767949, "grad_norm": 8.623583793640137, "learning_rate": 2.235607419683075e-06, "loss": 0.5476, "num_input_tokens_seen": 139343968, "step": 114580 }, { "epoch": 14.357223405588272, "grad_norm": 1.3295769691467285, "learning_rate": 2.2351518804146406e-06, "loss": 0.4537, "num_input_tokens_seen": 139350016, "step": 114585 }, { "epoch": 14.357849893497056, "grad_norm": 1.142452359199524, "learning_rate": 2.2346963742022877e-06, "loss": 0.4579, "num_input_tokens_seen": 139356224, "step": 114590 }, { "epoch": 14.358476381405838, "grad_norm": 1.6048940420150757, "learning_rate": 2.234240901051467e-06, "loss": 0.4522, "num_input_tokens_seen": 139361920, "step": 114595 }, { "epoch": 14.359102869314622, "grad_norm": 9.947513580322266, "learning_rate": 2.2337854609676194e-06, "loss": 0.462, "num_input_tokens_seen": 139368064, "step": 114600 }, { "epoch": 14.359729357223406, "grad_norm": 1.376047134399414, "learning_rate": 2.2333300539561926e-06, "loss": 0.4771, "num_input_tokens_seen": 139373664, "step": 114605 }, { "epoch": 14.360355845132188, "grad_norm": 1.65121328830719, "learning_rate": 2.2328746800226326e-06, "loss": 0.4127, "num_input_tokens_seen": 139379968, "step": 114610 }, { "epoch": 14.360982333040972, "grad_norm": 12.002523422241211, "learning_rate": 2.232419339172381e-06, "loss": 0.5827, "num_input_tokens_seen": 139385792, "step": 114615 }, { "epoch": 14.361608820949755, "grad_norm": 3.74068284034729, "learning_rate": 2.2319640314108843e-06, "loss": 0.4646, "num_input_tokens_seen": 139392000, "step": 114620 }, { "epoch": 14.362235308858539, "grad_norm": 1.6211875677108765, "learning_rate": 2.2315087567435833e-06, "loss": 0.4456, "num_input_tokens_seen": 139398560, "step": 114625 }, { "epoch": 14.362861796767323, "grad_norm": 2.2120065689086914, "learning_rate": 2.2310535151759242e-06, "loss": 0.4175, "num_input_tokens_seen": 139404928, "step": 114630 }, { "epoch": 14.363488284676105, "grad_norm": 4.136756420135498, "learning_rate": 2.230598306713347e-06, "loss": 0.4926, "num_input_tokens_seen": 139410208, "step": 114635 }, { "epoch": 14.36411477258489, "grad_norm": 5.184199810028076, "learning_rate": 2.230143131361297e-06, "loss": 0.4998, "num_input_tokens_seen": 139416672, "step": 114640 }, { "epoch": 14.364741260493673, "grad_norm": 1.4801369905471802, "learning_rate": 2.229687989125213e-06, "loss": 0.4837, "num_input_tokens_seen": 139423040, "step": 114645 }, { "epoch": 14.365367748402456, "grad_norm": 17.87152099609375, "learning_rate": 2.22923288001054e-06, "loss": 0.5615, "num_input_tokens_seen": 139428896, "step": 114650 }, { "epoch": 14.36599423631124, "grad_norm": 1.8970246315002441, "learning_rate": 2.2287778040227154e-06, "loss": 0.4831, "num_input_tokens_seen": 139434848, "step": 114655 }, { "epoch": 14.366620724220022, "grad_norm": 1.4271140098571777, "learning_rate": 2.228322761167184e-06, "loss": 0.5465, "num_input_tokens_seen": 139441088, "step": 114660 }, { "epoch": 14.367247212128806, "grad_norm": 1.531202793121338, "learning_rate": 2.227867751449383e-06, "loss": 0.5695, "num_input_tokens_seen": 139447104, "step": 114665 }, { "epoch": 14.36787370003759, "grad_norm": 6.591156482696533, "learning_rate": 2.227412774874753e-06, "loss": 0.5061, "num_input_tokens_seen": 139453280, "step": 114670 }, { "epoch": 14.368500187946372, "grad_norm": 4.6685285568237305, "learning_rate": 2.226957831448736e-06, "loss": 0.4819, "num_input_tokens_seen": 139459168, "step": 114675 }, { "epoch": 14.369126675855156, "grad_norm": 3.0534329414367676, "learning_rate": 2.2265029211767687e-06, "loss": 0.498, "num_input_tokens_seen": 139465504, "step": 114680 }, { "epoch": 14.369753163763939, "grad_norm": 4.41579532623291, "learning_rate": 2.22604804406429e-06, "loss": 0.4616, "num_input_tokens_seen": 139471744, "step": 114685 }, { "epoch": 14.370379651672723, "grad_norm": 1.4989575147628784, "learning_rate": 2.2255932001167396e-06, "loss": 0.496, "num_input_tokens_seen": 139478080, "step": 114690 }, { "epoch": 14.371006139581507, "grad_norm": 5.65446138381958, "learning_rate": 2.2251383893395568e-06, "loss": 0.4935, "num_input_tokens_seen": 139483904, "step": 114695 }, { "epoch": 14.371632627490289, "grad_norm": 1.8013091087341309, "learning_rate": 2.2246836117381752e-06, "loss": 0.484, "num_input_tokens_seen": 139489920, "step": 114700 }, { "epoch": 14.372259115399073, "grad_norm": 4.226200580596924, "learning_rate": 2.2242288673180368e-06, "loss": 0.4921, "num_input_tokens_seen": 139495808, "step": 114705 }, { "epoch": 14.372885603307855, "grad_norm": 1.486928105354309, "learning_rate": 2.2237741560845738e-06, "loss": 0.4587, "num_input_tokens_seen": 139501280, "step": 114710 }, { "epoch": 14.37351209121664, "grad_norm": 2.8186562061309814, "learning_rate": 2.223319478043227e-06, "loss": 0.456, "num_input_tokens_seen": 139507456, "step": 114715 }, { "epoch": 14.374138579125423, "grad_norm": 2.2793078422546387, "learning_rate": 2.2228648331994286e-06, "loss": 0.4796, "num_input_tokens_seen": 139513600, "step": 114720 }, { "epoch": 14.374765067034206, "grad_norm": 1.3725955486297607, "learning_rate": 2.2224102215586176e-06, "loss": 0.4824, "num_input_tokens_seen": 139519840, "step": 114725 }, { "epoch": 14.37539155494299, "grad_norm": 1.283889651298523, "learning_rate": 2.2219556431262255e-06, "loss": 0.4408, "num_input_tokens_seen": 139525952, "step": 114730 }, { "epoch": 14.376018042851772, "grad_norm": 1.7350956201553345, "learning_rate": 2.22150109790769e-06, "loss": 0.518, "num_input_tokens_seen": 139532064, "step": 114735 }, { "epoch": 14.376644530760556, "grad_norm": 1.475142240524292, "learning_rate": 2.221046585908447e-06, "loss": 0.4082, "num_input_tokens_seen": 139538176, "step": 114740 }, { "epoch": 14.37727101866934, "grad_norm": 2.226330280303955, "learning_rate": 2.220592107133926e-06, "loss": 0.4786, "num_input_tokens_seen": 139544352, "step": 114745 }, { "epoch": 14.377897506578122, "grad_norm": 2.952544689178467, "learning_rate": 2.220137661589565e-06, "loss": 0.4981, "num_input_tokens_seen": 139550688, "step": 114750 }, { "epoch": 14.378523994486907, "grad_norm": 1.4709949493408203, "learning_rate": 2.2196832492807933e-06, "loss": 0.5538, "num_input_tokens_seen": 139556864, "step": 114755 }, { "epoch": 14.37915048239569, "grad_norm": 13.603729248046875, "learning_rate": 2.219228870213049e-06, "loss": 0.5919, "num_input_tokens_seen": 139562912, "step": 114760 }, { "epoch": 14.379776970304473, "grad_norm": 1.1620547771453857, "learning_rate": 2.2187745243917584e-06, "loss": 0.4821, "num_input_tokens_seen": 139568960, "step": 114765 }, { "epoch": 14.380403458213257, "grad_norm": 1.5340303182601929, "learning_rate": 2.218320211822359e-06, "loss": 0.4436, "num_input_tokens_seen": 139575136, "step": 114770 }, { "epoch": 14.38102994612204, "grad_norm": 13.606778144836426, "learning_rate": 2.217865932510278e-06, "loss": 0.5177, "num_input_tokens_seen": 139581408, "step": 114775 }, { "epoch": 14.381656434030823, "grad_norm": 1.8503814935684204, "learning_rate": 2.21741168646095e-06, "loss": 0.4441, "num_input_tokens_seen": 139587744, "step": 114780 }, { "epoch": 14.382282921939607, "grad_norm": 1.3573384284973145, "learning_rate": 2.216957473679803e-06, "loss": 0.5112, "num_input_tokens_seen": 139593952, "step": 114785 }, { "epoch": 14.38290940984839, "grad_norm": 1.0762202739715576, "learning_rate": 2.2165032941722727e-06, "loss": 0.4918, "num_input_tokens_seen": 139599744, "step": 114790 }, { "epoch": 14.383535897757174, "grad_norm": 3.7435812950134277, "learning_rate": 2.216049147943783e-06, "loss": 0.4585, "num_input_tokens_seen": 139606112, "step": 114795 }, { "epoch": 14.384162385665956, "grad_norm": 1.1184533834457397, "learning_rate": 2.215595034999767e-06, "loss": 0.4406, "num_input_tokens_seen": 139612352, "step": 114800 }, { "epoch": 14.38478887357474, "grad_norm": 6.4289326667785645, "learning_rate": 2.2151409553456544e-06, "loss": 0.4601, "num_input_tokens_seen": 139618208, "step": 114805 }, { "epoch": 14.385415361483524, "grad_norm": 9.630847930908203, "learning_rate": 2.214686908986872e-06, "loss": 0.5501, "num_input_tokens_seen": 139624416, "step": 114810 }, { "epoch": 14.386041849392306, "grad_norm": 1.3114978075027466, "learning_rate": 2.2142328959288507e-06, "loss": 0.438, "num_input_tokens_seen": 139630432, "step": 114815 }, { "epoch": 14.38666833730109, "grad_norm": 2.5865447521209717, "learning_rate": 2.213778916177015e-06, "loss": 0.4514, "num_input_tokens_seen": 139636928, "step": 114820 }, { "epoch": 14.387294825209873, "grad_norm": 8.837084770202637, "learning_rate": 2.2133249697367974e-06, "loss": 0.4689, "num_input_tokens_seen": 139643008, "step": 114825 }, { "epoch": 14.387921313118657, "grad_norm": 5.321413516998291, "learning_rate": 2.212871056613621e-06, "loss": 0.5044, "num_input_tokens_seen": 139648992, "step": 114830 }, { "epoch": 14.38854780102744, "grad_norm": 1.4825093746185303, "learning_rate": 2.212417176812916e-06, "loss": 0.4546, "num_input_tokens_seen": 139654880, "step": 114835 }, { "epoch": 14.389174288936223, "grad_norm": 10.374807357788086, "learning_rate": 2.2119633303401057e-06, "loss": 0.4898, "num_input_tokens_seen": 139660960, "step": 114840 }, { "epoch": 14.389800776845007, "grad_norm": 1.588461995124817, "learning_rate": 2.211509517200619e-06, "loss": 0.4778, "num_input_tokens_seen": 139666720, "step": 114845 }, { "epoch": 14.39042726475379, "grad_norm": 3.259082794189453, "learning_rate": 2.2110557373998793e-06, "loss": 0.4702, "num_input_tokens_seen": 139672864, "step": 114850 }, { "epoch": 14.391053752662573, "grad_norm": 6.028190612792969, "learning_rate": 2.2106019909433125e-06, "loss": 0.4582, "num_input_tokens_seen": 139678880, "step": 114855 }, { "epoch": 14.391680240571358, "grad_norm": 9.46837043762207, "learning_rate": 2.2101482778363463e-06, "loss": 0.5747, "num_input_tokens_seen": 139685152, "step": 114860 }, { "epoch": 14.39230672848014, "grad_norm": 7.799195766448975, "learning_rate": 2.2096945980844016e-06, "loss": 0.5078, "num_input_tokens_seen": 139691584, "step": 114865 }, { "epoch": 14.392933216388924, "grad_norm": 2.1052136421203613, "learning_rate": 2.2092409516929047e-06, "loss": 0.4693, "num_input_tokens_seen": 139697824, "step": 114870 }, { "epoch": 14.393559704297708, "grad_norm": 4.237576484680176, "learning_rate": 2.2087873386672777e-06, "loss": 0.4736, "num_input_tokens_seen": 139704256, "step": 114875 }, { "epoch": 14.39418619220649, "grad_norm": 2.408930778503418, "learning_rate": 2.208333759012946e-06, "loss": 0.4601, "num_input_tokens_seen": 139710336, "step": 114880 }, { "epoch": 14.394812680115274, "grad_norm": 1.799702525138855, "learning_rate": 2.20788021273533e-06, "loss": 0.4822, "num_input_tokens_seen": 139716544, "step": 114885 }, { "epoch": 14.395439168024057, "grad_norm": 1.4069737195968628, "learning_rate": 2.207426699839853e-06, "loss": 0.4533, "num_input_tokens_seen": 139722752, "step": 114890 }, { "epoch": 14.39606565593284, "grad_norm": 1.9400235414505005, "learning_rate": 2.2069732203319383e-06, "loss": 0.4797, "num_input_tokens_seen": 139728608, "step": 114895 }, { "epoch": 14.396692143841625, "grad_norm": 4.87476110458374, "learning_rate": 2.2065197742170085e-06, "loss": 0.4641, "num_input_tokens_seen": 139734400, "step": 114900 }, { "epoch": 14.397318631750407, "grad_norm": 2.2247886657714844, "learning_rate": 2.206066361500481e-06, "loss": 0.446, "num_input_tokens_seen": 139741056, "step": 114905 }, { "epoch": 14.397945119659191, "grad_norm": 2.502314567565918, "learning_rate": 2.205612982187782e-06, "loss": 0.4524, "num_input_tokens_seen": 139747392, "step": 114910 }, { "epoch": 14.398571607567973, "grad_norm": 1.3469864130020142, "learning_rate": 2.205159636284327e-06, "loss": 0.4317, "num_input_tokens_seen": 139753504, "step": 114915 }, { "epoch": 14.399198095476757, "grad_norm": 5.102011203765869, "learning_rate": 2.2047063237955384e-06, "loss": 0.4736, "num_input_tokens_seen": 139759392, "step": 114920 }, { "epoch": 14.399824583385541, "grad_norm": 1.426790714263916, "learning_rate": 2.2042530447268374e-06, "loss": 0.4108, "num_input_tokens_seen": 139765632, "step": 114925 }, { "epoch": 14.400451071294324, "grad_norm": 2.9658162593841553, "learning_rate": 2.20379979908364e-06, "loss": 0.4476, "num_input_tokens_seen": 139771200, "step": 114930 }, { "epoch": 14.401077559203108, "grad_norm": 9.539146423339844, "learning_rate": 2.203346586871369e-06, "loss": 0.4686, "num_input_tokens_seen": 139777312, "step": 114935 }, { "epoch": 14.40170404711189, "grad_norm": 3.512244701385498, "learning_rate": 2.202893408095439e-06, "loss": 0.4595, "num_input_tokens_seen": 139783392, "step": 114940 }, { "epoch": 14.402330535020674, "grad_norm": 18.56374168395996, "learning_rate": 2.2024402627612717e-06, "loss": 0.5064, "num_input_tokens_seen": 139789760, "step": 114945 }, { "epoch": 14.402957022929458, "grad_norm": 1.6866552829742432, "learning_rate": 2.201987150874281e-06, "loss": 0.4687, "num_input_tokens_seen": 139795488, "step": 114950 }, { "epoch": 14.40358351083824, "grad_norm": 7.451189041137695, "learning_rate": 2.2015340724398887e-06, "loss": 0.4688, "num_input_tokens_seen": 139801376, "step": 114955 }, { "epoch": 14.404209998747024, "grad_norm": 4.446533679962158, "learning_rate": 2.201081027463507e-06, "loss": 0.4868, "num_input_tokens_seen": 139807776, "step": 114960 }, { "epoch": 14.404836486655807, "grad_norm": 12.607644081115723, "learning_rate": 2.200628015950557e-06, "loss": 0.5053, "num_input_tokens_seen": 139814144, "step": 114965 }, { "epoch": 14.40546297456459, "grad_norm": 2.4080967903137207, "learning_rate": 2.2001750379064507e-06, "loss": 0.4477, "num_input_tokens_seen": 139820160, "step": 114970 }, { "epoch": 14.406089462473375, "grad_norm": 1.5313740968704224, "learning_rate": 2.199722093336606e-06, "loss": 0.4066, "num_input_tokens_seen": 139826176, "step": 114975 }, { "epoch": 14.406715950382157, "grad_norm": 11.586812973022461, "learning_rate": 2.1992691822464397e-06, "loss": 0.4826, "num_input_tokens_seen": 139831680, "step": 114980 }, { "epoch": 14.407342438290941, "grad_norm": 0.9786155223846436, "learning_rate": 2.198816304641363e-06, "loss": 0.4263, "num_input_tokens_seen": 139837792, "step": 114985 }, { "epoch": 14.407968926199725, "grad_norm": 2.1080679893493652, "learning_rate": 2.1983634605267927e-06, "loss": 0.418, "num_input_tokens_seen": 139844064, "step": 114990 }, { "epoch": 14.408595414108508, "grad_norm": 3.2605419158935547, "learning_rate": 2.197910649908145e-06, "loss": 0.4458, "num_input_tokens_seen": 139850208, "step": 114995 }, { "epoch": 14.409221902017292, "grad_norm": 2.1525542736053467, "learning_rate": 2.197457872790829e-06, "loss": 0.5201, "num_input_tokens_seen": 139855936, "step": 115000 }, { "epoch": 14.409848389926074, "grad_norm": 1.7179228067398071, "learning_rate": 2.1970051291802602e-06, "loss": 0.5284, "num_input_tokens_seen": 139861856, "step": 115005 }, { "epoch": 14.410474877834858, "grad_norm": 2.230924129486084, "learning_rate": 2.1965524190818536e-06, "loss": 0.4427, "num_input_tokens_seen": 139867872, "step": 115010 }, { "epoch": 14.411101365743642, "grad_norm": 1.5621834993362427, "learning_rate": 2.1960997425010184e-06, "loss": 0.4526, "num_input_tokens_seen": 139874240, "step": 115015 }, { "epoch": 14.411727853652424, "grad_norm": 1.3092163801193237, "learning_rate": 2.19564709944317e-06, "loss": 0.4862, "num_input_tokens_seen": 139880384, "step": 115020 }, { "epoch": 14.412354341561208, "grad_norm": 9.071233749389648, "learning_rate": 2.195194489913716e-06, "loss": 0.544, "num_input_tokens_seen": 139886240, "step": 115025 }, { "epoch": 14.41298082946999, "grad_norm": 11.110326766967773, "learning_rate": 2.1947419139180726e-06, "loss": 0.6637, "num_input_tokens_seen": 139891808, "step": 115030 }, { "epoch": 14.413607317378775, "grad_norm": 1.3431037664413452, "learning_rate": 2.1942893714616452e-06, "loss": 0.4526, "num_input_tokens_seen": 139898240, "step": 115035 }, { "epoch": 14.414233805287559, "grad_norm": 6.5708842277526855, "learning_rate": 2.1938368625498486e-06, "loss": 0.4639, "num_input_tokens_seen": 139904480, "step": 115040 }, { "epoch": 14.414860293196341, "grad_norm": 1.7115272283554077, "learning_rate": 2.1933843871880924e-06, "loss": 0.4169, "num_input_tokens_seen": 139910656, "step": 115045 }, { "epoch": 14.415486781105125, "grad_norm": 1.32048499584198, "learning_rate": 2.192931945381784e-06, "loss": 0.4965, "num_input_tokens_seen": 139916704, "step": 115050 }, { "epoch": 14.416113269013907, "grad_norm": 3.8042328357696533, "learning_rate": 2.1924795371363363e-06, "loss": 0.4912, "num_input_tokens_seen": 139922848, "step": 115055 }, { "epoch": 14.416739756922691, "grad_norm": 1.360013484954834, "learning_rate": 2.192027162457154e-06, "loss": 0.4372, "num_input_tokens_seen": 139928672, "step": 115060 }, { "epoch": 14.417366244831475, "grad_norm": 7.1630754470825195, "learning_rate": 2.1915748213496496e-06, "loss": 0.5698, "num_input_tokens_seen": 139934496, "step": 115065 }, { "epoch": 14.417992732740258, "grad_norm": 1.0236375331878662, "learning_rate": 2.1911225138192277e-06, "loss": 0.5265, "num_input_tokens_seen": 139940832, "step": 115070 }, { "epoch": 14.418619220649042, "grad_norm": 2.6235597133636475, "learning_rate": 2.1906702398712998e-06, "loss": 0.4274, "num_input_tokens_seen": 139947424, "step": 115075 }, { "epoch": 14.419245708557824, "grad_norm": 3.4390206336975098, "learning_rate": 2.1902179995112695e-06, "loss": 0.4394, "num_input_tokens_seen": 139953248, "step": 115080 }, { "epoch": 14.419872196466608, "grad_norm": 6.384740829467773, "learning_rate": 2.1897657927445443e-06, "loss": 0.4652, "num_input_tokens_seen": 139959680, "step": 115085 }, { "epoch": 14.420498684375392, "grad_norm": 6.7404608726501465, "learning_rate": 2.1893136195765346e-06, "loss": 0.4701, "num_input_tokens_seen": 139965664, "step": 115090 }, { "epoch": 14.421125172284174, "grad_norm": 1.5055454969406128, "learning_rate": 2.188861480012641e-06, "loss": 0.4834, "num_input_tokens_seen": 139971904, "step": 115095 }, { "epoch": 14.421751660192959, "grad_norm": 1.9505040645599365, "learning_rate": 2.188409374058273e-06, "loss": 0.4625, "num_input_tokens_seen": 139978176, "step": 115100 }, { "epoch": 14.42237814810174, "grad_norm": 1.6078211069107056, "learning_rate": 2.1879573017188337e-06, "loss": 0.4463, "num_input_tokens_seen": 139984000, "step": 115105 }, { "epoch": 14.423004636010525, "grad_norm": 1.987808346748352, "learning_rate": 2.187505262999732e-06, "loss": 0.4438, "num_input_tokens_seen": 139990016, "step": 115110 }, { "epoch": 14.423631123919309, "grad_norm": 1.363876223564148, "learning_rate": 2.1870532579063663e-06, "loss": 0.5942, "num_input_tokens_seen": 139996128, "step": 115115 }, { "epoch": 14.424257611828091, "grad_norm": 1.180674433708191, "learning_rate": 2.1866012864441466e-06, "loss": 0.4696, "num_input_tokens_seen": 140001824, "step": 115120 }, { "epoch": 14.424884099736875, "grad_norm": 5.871799468994141, "learning_rate": 2.186149348618472e-06, "loss": 0.5454, "num_input_tokens_seen": 140007776, "step": 115125 }, { "epoch": 14.425510587645658, "grad_norm": 12.331355094909668, "learning_rate": 2.1856974444347495e-06, "loss": 0.6176, "num_input_tokens_seen": 140013920, "step": 115130 }, { "epoch": 14.426137075554442, "grad_norm": 5.810042858123779, "learning_rate": 2.1852455738983787e-06, "loss": 0.4651, "num_input_tokens_seen": 140019968, "step": 115135 }, { "epoch": 14.426763563463226, "grad_norm": 3.085481882095337, "learning_rate": 2.184793737014765e-06, "loss": 0.4992, "num_input_tokens_seen": 140026112, "step": 115140 }, { "epoch": 14.427390051372008, "grad_norm": 1.1844207048416138, "learning_rate": 2.1843419337893068e-06, "loss": 0.4569, "num_input_tokens_seen": 140032160, "step": 115145 }, { "epoch": 14.428016539280792, "grad_norm": 1.6475768089294434, "learning_rate": 2.183890164227411e-06, "loss": 0.4782, "num_input_tokens_seen": 140038432, "step": 115150 }, { "epoch": 14.428643027189576, "grad_norm": 15.738303184509277, "learning_rate": 2.1834384283344728e-06, "loss": 0.5552, "num_input_tokens_seen": 140044480, "step": 115155 }, { "epoch": 14.429269515098358, "grad_norm": 5.945679664611816, "learning_rate": 2.182986726115897e-06, "loss": 0.5027, "num_input_tokens_seen": 140050496, "step": 115160 }, { "epoch": 14.429896003007142, "grad_norm": 5.844874858856201, "learning_rate": 2.1825350575770855e-06, "loss": 0.4653, "num_input_tokens_seen": 140056768, "step": 115165 }, { "epoch": 14.430522490915925, "grad_norm": 3.9056079387664795, "learning_rate": 2.1820834227234335e-06, "loss": 0.4453, "num_input_tokens_seen": 140063296, "step": 115170 }, { "epoch": 14.431148978824709, "grad_norm": 1.3618930578231812, "learning_rate": 2.1816318215603455e-06, "loss": 0.4315, "num_input_tokens_seen": 140069024, "step": 115175 }, { "epoch": 14.431775466733493, "grad_norm": 0.9553079605102539, "learning_rate": 2.181180254093217e-06, "loss": 0.4381, "num_input_tokens_seen": 140075520, "step": 115180 }, { "epoch": 14.432401954642275, "grad_norm": 4.113407611846924, "learning_rate": 2.1807287203274504e-06, "loss": 0.4904, "num_input_tokens_seen": 140081600, "step": 115185 }, { "epoch": 14.433028442551059, "grad_norm": 1.1790955066680908, "learning_rate": 2.1802772202684396e-06, "loss": 0.4878, "num_input_tokens_seen": 140087744, "step": 115190 }, { "epoch": 14.433654930459841, "grad_norm": 1.2561947107315063, "learning_rate": 2.1798257539215863e-06, "loss": 0.5245, "num_input_tokens_seen": 140093792, "step": 115195 }, { "epoch": 14.434281418368625, "grad_norm": 1.633443832397461, "learning_rate": 2.1793743212922867e-06, "loss": 0.4707, "num_input_tokens_seen": 140100064, "step": 115200 }, { "epoch": 14.43490790627741, "grad_norm": 6.387917518615723, "learning_rate": 2.1789229223859403e-06, "loss": 0.4513, "num_input_tokens_seen": 140106496, "step": 115205 }, { "epoch": 14.435534394186192, "grad_norm": 1.373923897743225, "learning_rate": 2.1784715572079407e-06, "loss": 0.4059, "num_input_tokens_seen": 140112992, "step": 115210 }, { "epoch": 14.436160882094976, "grad_norm": 3.023063898086548, "learning_rate": 2.178020225763688e-06, "loss": 0.4641, "num_input_tokens_seen": 140118816, "step": 115215 }, { "epoch": 14.436787370003758, "grad_norm": 1.6226469278335571, "learning_rate": 2.1775689280585732e-06, "loss": 0.4669, "num_input_tokens_seen": 140124896, "step": 115220 }, { "epoch": 14.437413857912542, "grad_norm": 2.7078592777252197, "learning_rate": 2.1771176640979958e-06, "loss": 0.4526, "num_input_tokens_seen": 140131360, "step": 115225 }, { "epoch": 14.438040345821326, "grad_norm": 7.9329962730407715, "learning_rate": 2.1766664338873517e-06, "loss": 0.4389, "num_input_tokens_seen": 140137536, "step": 115230 }, { "epoch": 14.438666833730109, "grad_norm": 10.179203033447266, "learning_rate": 2.176215237432032e-06, "loss": 0.5375, "num_input_tokens_seen": 140143456, "step": 115235 }, { "epoch": 14.439293321638893, "grad_norm": 5.564070224761963, "learning_rate": 2.175764074737435e-06, "loss": 0.4483, "num_input_tokens_seen": 140149792, "step": 115240 }, { "epoch": 14.439919809547675, "grad_norm": 3.3326382637023926, "learning_rate": 2.175312945808951e-06, "loss": 0.4594, "num_input_tokens_seen": 140156160, "step": 115245 }, { "epoch": 14.440546297456459, "grad_norm": 2.7866933345794678, "learning_rate": 2.1748618506519782e-06, "loss": 0.4325, "num_input_tokens_seen": 140162208, "step": 115250 }, { "epoch": 14.441172785365243, "grad_norm": 1.6337225437164307, "learning_rate": 2.1744107892719047e-06, "loss": 0.4729, "num_input_tokens_seen": 140168512, "step": 115255 }, { "epoch": 14.441799273274025, "grad_norm": 1.2771190404891968, "learning_rate": 2.173959761674128e-06, "loss": 0.4815, "num_input_tokens_seen": 140174400, "step": 115260 }, { "epoch": 14.44242576118281, "grad_norm": 1.6035810708999634, "learning_rate": 2.1735087678640365e-06, "loss": 0.447, "num_input_tokens_seen": 140180288, "step": 115265 }, { "epoch": 14.443052249091593, "grad_norm": 14.06135368347168, "learning_rate": 2.173057807847026e-06, "loss": 0.4624, "num_input_tokens_seen": 140186464, "step": 115270 }, { "epoch": 14.443678737000376, "grad_norm": 7.324270725250244, "learning_rate": 2.172606881628484e-06, "loss": 0.4957, "num_input_tokens_seen": 140192640, "step": 115275 }, { "epoch": 14.44430522490916, "grad_norm": 2.1163415908813477, "learning_rate": 2.172155989213806e-06, "loss": 0.4409, "num_input_tokens_seen": 140199008, "step": 115280 }, { "epoch": 14.444931712817942, "grad_norm": 7.5016703605651855, "learning_rate": 2.1717051306083785e-06, "loss": 0.4667, "num_input_tokens_seen": 140205024, "step": 115285 }, { "epoch": 14.445558200726726, "grad_norm": 9.62662410736084, "learning_rate": 2.1712543058175943e-06, "loss": 0.4873, "num_input_tokens_seen": 140211200, "step": 115290 }, { "epoch": 14.44618468863551, "grad_norm": 1.6900761127471924, "learning_rate": 2.1708035148468448e-06, "loss": 0.4546, "num_input_tokens_seen": 140217216, "step": 115295 }, { "epoch": 14.446811176544292, "grad_norm": 2.0496695041656494, "learning_rate": 2.1703527577015165e-06, "loss": 0.4684, "num_input_tokens_seen": 140223360, "step": 115300 }, { "epoch": 14.447437664453076, "grad_norm": 2.003917694091797, "learning_rate": 2.1699020343869998e-06, "loss": 0.4489, "num_input_tokens_seen": 140229472, "step": 115305 }, { "epoch": 14.448064152361859, "grad_norm": 4.77644157409668, "learning_rate": 2.1694513449086837e-06, "loss": 0.4449, "num_input_tokens_seen": 140235648, "step": 115310 }, { "epoch": 14.448690640270643, "grad_norm": 1.4412624835968018, "learning_rate": 2.1690006892719583e-06, "loss": 0.4663, "num_input_tokens_seen": 140242048, "step": 115315 }, { "epoch": 14.449317128179427, "grad_norm": 7.3986053466796875, "learning_rate": 2.168550067482208e-06, "loss": 0.5101, "num_input_tokens_seen": 140248544, "step": 115320 }, { "epoch": 14.449943616088209, "grad_norm": 1.6086164712905884, "learning_rate": 2.168099479544824e-06, "loss": 0.4586, "num_input_tokens_seen": 140254656, "step": 115325 }, { "epoch": 14.450570103996993, "grad_norm": 2.322052478790283, "learning_rate": 2.16764892546519e-06, "loss": 0.4508, "num_input_tokens_seen": 140260864, "step": 115330 }, { "epoch": 14.451196591905775, "grad_norm": 1.52694571018219, "learning_rate": 2.167198405248696e-06, "loss": 0.4406, "num_input_tokens_seen": 140267040, "step": 115335 }, { "epoch": 14.45182307981456, "grad_norm": 1.6197530031204224, "learning_rate": 2.166747918900725e-06, "loss": 0.4474, "num_input_tokens_seen": 140273216, "step": 115340 }, { "epoch": 14.452449567723344, "grad_norm": 1.873542308807373, "learning_rate": 2.166297466426667e-06, "loss": 0.4783, "num_input_tokens_seen": 140279488, "step": 115345 }, { "epoch": 14.453076055632126, "grad_norm": 1.2771615982055664, "learning_rate": 2.1658470478319034e-06, "loss": 0.439, "num_input_tokens_seen": 140285568, "step": 115350 }, { "epoch": 14.45370254354091, "grad_norm": 4.833067893981934, "learning_rate": 2.16539666312182e-06, "loss": 0.4932, "num_input_tokens_seen": 140291776, "step": 115355 }, { "epoch": 14.454329031449692, "grad_norm": 2.269819498062134, "learning_rate": 2.1649463123018056e-06, "loss": 0.4433, "num_input_tokens_seen": 140297824, "step": 115360 }, { "epoch": 14.454955519358476, "grad_norm": 7.7879838943481445, "learning_rate": 2.16449599537724e-06, "loss": 0.4467, "num_input_tokens_seen": 140304128, "step": 115365 }, { "epoch": 14.45558200726726, "grad_norm": 1.7226355075836182, "learning_rate": 2.16404571235351e-06, "loss": 0.4625, "num_input_tokens_seen": 140310368, "step": 115370 }, { "epoch": 14.456208495176043, "grad_norm": 3.6374738216400146, "learning_rate": 2.1635954632359957e-06, "loss": 0.4676, "num_input_tokens_seen": 140316512, "step": 115375 }, { "epoch": 14.456834983084827, "grad_norm": 6.045766353607178, "learning_rate": 2.163145248030085e-06, "loss": 0.5005, "num_input_tokens_seen": 140322752, "step": 115380 }, { "epoch": 14.45746147099361, "grad_norm": 1.6439765691757202, "learning_rate": 2.162695066741155e-06, "loss": 0.4471, "num_input_tokens_seen": 140328992, "step": 115385 }, { "epoch": 14.458087958902393, "grad_norm": 2.660818099975586, "learning_rate": 2.1622449193745938e-06, "loss": 0.4655, "num_input_tokens_seen": 140335104, "step": 115390 }, { "epoch": 14.458714446811177, "grad_norm": 1.4427005052566528, "learning_rate": 2.1617948059357786e-06, "loss": 0.4606, "num_input_tokens_seen": 140341152, "step": 115395 }, { "epoch": 14.45934093471996, "grad_norm": 1.3949145078659058, "learning_rate": 2.1613447264300918e-06, "loss": 0.5093, "num_input_tokens_seen": 140347168, "step": 115400 }, { "epoch": 14.459967422628743, "grad_norm": 1.4780055284500122, "learning_rate": 2.160894680862916e-06, "loss": 0.4415, "num_input_tokens_seen": 140353280, "step": 115405 }, { "epoch": 14.460593910537527, "grad_norm": 2.1741151809692383, "learning_rate": 2.1604446692396335e-06, "loss": 0.4603, "num_input_tokens_seen": 140358368, "step": 115410 }, { "epoch": 14.46122039844631, "grad_norm": 4.803468227386475, "learning_rate": 2.15999469156562e-06, "loss": 0.4536, "num_input_tokens_seen": 140364416, "step": 115415 }, { "epoch": 14.461846886355094, "grad_norm": 1.3469067811965942, "learning_rate": 2.159544747846258e-06, "loss": 0.4425, "num_input_tokens_seen": 140370656, "step": 115420 }, { "epoch": 14.462473374263876, "grad_norm": 6.947502613067627, "learning_rate": 2.159094838086928e-06, "loss": 0.5333, "num_input_tokens_seen": 140376896, "step": 115425 }, { "epoch": 14.46309986217266, "grad_norm": 10.161657333374023, "learning_rate": 2.1586449622930067e-06, "loss": 0.4784, "num_input_tokens_seen": 140383008, "step": 115430 }, { "epoch": 14.463726350081444, "grad_norm": 1.4415607452392578, "learning_rate": 2.1581951204698746e-06, "loss": 0.4463, "num_input_tokens_seen": 140389024, "step": 115435 }, { "epoch": 14.464352837990226, "grad_norm": 7.122632026672363, "learning_rate": 2.157745312622908e-06, "loss": 0.4745, "num_input_tokens_seen": 140394880, "step": 115440 }, { "epoch": 14.46497932589901, "grad_norm": 4.5910491943359375, "learning_rate": 2.1572955387574874e-06, "loss": 0.48, "num_input_tokens_seen": 140400800, "step": 115445 }, { "epoch": 14.465605813807793, "grad_norm": 1.7624309062957764, "learning_rate": 2.156845798878987e-06, "loss": 0.5027, "num_input_tokens_seen": 140406368, "step": 115450 }, { "epoch": 14.466232301716577, "grad_norm": 9.55275821685791, "learning_rate": 2.156396092992787e-06, "loss": 0.5239, "num_input_tokens_seen": 140412672, "step": 115455 }, { "epoch": 14.46685878962536, "grad_norm": 1.6554508209228516, "learning_rate": 2.155946421104261e-06, "loss": 0.4471, "num_input_tokens_seen": 140417696, "step": 115460 }, { "epoch": 14.467485277534143, "grad_norm": 1.4531468152999878, "learning_rate": 2.1554967832187884e-06, "loss": 0.4659, "num_input_tokens_seen": 140423584, "step": 115465 }, { "epoch": 14.468111765442927, "grad_norm": 6.055571556091309, "learning_rate": 2.1550471793417412e-06, "loss": 0.5574, "num_input_tokens_seen": 140429696, "step": 115470 }, { "epoch": 14.46873825335171, "grad_norm": 2.3644073009490967, "learning_rate": 2.154597609478497e-06, "loss": 0.5025, "num_input_tokens_seen": 140435904, "step": 115475 }, { "epoch": 14.469364741260494, "grad_norm": 10.10827350616455, "learning_rate": 2.154148073634433e-06, "loss": 0.4489, "num_input_tokens_seen": 140442112, "step": 115480 }, { "epoch": 14.469991229169278, "grad_norm": 8.41452693939209, "learning_rate": 2.1536985718149188e-06, "loss": 0.4693, "num_input_tokens_seen": 140448000, "step": 115485 }, { "epoch": 14.47061771707806, "grad_norm": 9.284521102905273, "learning_rate": 2.1532491040253333e-06, "loss": 0.5087, "num_input_tokens_seen": 140453824, "step": 115490 }, { "epoch": 14.471244204986844, "grad_norm": 1.6493326425552368, "learning_rate": 2.152799670271046e-06, "loss": 0.4604, "num_input_tokens_seen": 140459840, "step": 115495 }, { "epoch": 14.471870692895628, "grad_norm": 6.077147006988525, "learning_rate": 2.1523502705574334e-06, "loss": 0.4637, "num_input_tokens_seen": 140465696, "step": 115500 }, { "epoch": 14.47249718080441, "grad_norm": 3.7290947437286377, "learning_rate": 2.1519009048898687e-06, "loss": 0.4284, "num_input_tokens_seen": 140471872, "step": 115505 }, { "epoch": 14.473123668713194, "grad_norm": 5.291234970092773, "learning_rate": 2.1514515732737213e-06, "loss": 0.455, "num_input_tokens_seen": 140478048, "step": 115510 }, { "epoch": 14.473750156621977, "grad_norm": 9.491737365722656, "learning_rate": 2.1510022757143657e-06, "loss": 0.4719, "num_input_tokens_seen": 140484352, "step": 115515 }, { "epoch": 14.47437664453076, "grad_norm": 1.348456859588623, "learning_rate": 2.1505530122171746e-06, "loss": 0.5124, "num_input_tokens_seen": 140490528, "step": 115520 }, { "epoch": 14.475003132439545, "grad_norm": 1.4226466417312622, "learning_rate": 2.1501037827875165e-06, "loss": 0.4601, "num_input_tokens_seen": 140496928, "step": 115525 }, { "epoch": 14.475629620348327, "grad_norm": 5.812534809112549, "learning_rate": 2.149654587430765e-06, "loss": 0.4613, "num_input_tokens_seen": 140502848, "step": 115530 }, { "epoch": 14.476256108257111, "grad_norm": 3.3956263065338135, "learning_rate": 2.149205426152288e-06, "loss": 0.4467, "num_input_tokens_seen": 140508992, "step": 115535 }, { "epoch": 14.476882596165893, "grad_norm": 1.708203911781311, "learning_rate": 2.1487562989574566e-06, "loss": 0.4499, "num_input_tokens_seen": 140515424, "step": 115540 }, { "epoch": 14.477509084074677, "grad_norm": 1.9462461471557617, "learning_rate": 2.148307205851643e-06, "loss": 0.4182, "num_input_tokens_seen": 140521952, "step": 115545 }, { "epoch": 14.478135571983461, "grad_norm": 8.099699974060059, "learning_rate": 2.147858146840213e-06, "loss": 0.4729, "num_input_tokens_seen": 140528224, "step": 115550 }, { "epoch": 14.478762059892244, "grad_norm": 1.8504101037979126, "learning_rate": 2.1474091219285378e-06, "loss": 0.5034, "num_input_tokens_seen": 140534112, "step": 115555 }, { "epoch": 14.479388547801028, "grad_norm": 1.60112726688385, "learning_rate": 2.1469601311219828e-06, "loss": 0.4279, "num_input_tokens_seen": 140540256, "step": 115560 }, { "epoch": 14.48001503570981, "grad_norm": 12.858887672424316, "learning_rate": 2.1465111744259206e-06, "loss": 0.543, "num_input_tokens_seen": 140546624, "step": 115565 }, { "epoch": 14.480641523618594, "grad_norm": 2.115185260772705, "learning_rate": 2.146062251845714e-06, "loss": 0.4377, "num_input_tokens_seen": 140552640, "step": 115570 }, { "epoch": 14.481268011527378, "grad_norm": 1.8713635206222534, "learning_rate": 2.1456133633867355e-06, "loss": 0.5097, "num_input_tokens_seen": 140558592, "step": 115575 }, { "epoch": 14.48189449943616, "grad_norm": 5.06919002532959, "learning_rate": 2.145164509054346e-06, "loss": 0.4686, "num_input_tokens_seen": 140564800, "step": 115580 }, { "epoch": 14.482520987344945, "grad_norm": 2.1367979049682617, "learning_rate": 2.1447156888539178e-06, "loss": 0.4379, "num_input_tokens_seen": 140570848, "step": 115585 }, { "epoch": 14.483147475253727, "grad_norm": 1.6575983762741089, "learning_rate": 2.144266902790812e-06, "loss": 0.4255, "num_input_tokens_seen": 140576832, "step": 115590 }, { "epoch": 14.48377396316251, "grad_norm": 2.8088324069976807, "learning_rate": 2.143818150870396e-06, "loss": 0.4791, "num_input_tokens_seen": 140583040, "step": 115595 }, { "epoch": 14.484400451071295, "grad_norm": 2.048426866531372, "learning_rate": 2.1433694330980378e-06, "loss": 0.4742, "num_input_tokens_seen": 140589120, "step": 115600 }, { "epoch": 14.485026938980077, "grad_norm": 1.5503813028335571, "learning_rate": 2.142920749479097e-06, "loss": 0.5117, "num_input_tokens_seen": 140595360, "step": 115605 }, { "epoch": 14.485653426888861, "grad_norm": 6.784367561340332, "learning_rate": 2.142472100018942e-06, "loss": 0.5479, "num_input_tokens_seen": 140601376, "step": 115610 }, { "epoch": 14.486279914797645, "grad_norm": 2.137261152267456, "learning_rate": 2.1420234847229363e-06, "loss": 0.4897, "num_input_tokens_seen": 140607168, "step": 115615 }, { "epoch": 14.486906402706428, "grad_norm": 1.3766093254089355, "learning_rate": 2.1415749035964413e-06, "loss": 0.4606, "num_input_tokens_seen": 140613216, "step": 115620 }, { "epoch": 14.487532890615212, "grad_norm": 2.5081076622009277, "learning_rate": 2.1411263566448216e-06, "loss": 0.4629, "num_input_tokens_seen": 140619584, "step": 115625 }, { "epoch": 14.488159378523994, "grad_norm": 1.9609624147415161, "learning_rate": 2.1406778438734416e-06, "loss": 0.4494, "num_input_tokens_seen": 140625952, "step": 115630 }, { "epoch": 14.488785866432778, "grad_norm": 2.275519371032715, "learning_rate": 2.14022936528766e-06, "loss": 0.4653, "num_input_tokens_seen": 140632352, "step": 115635 }, { "epoch": 14.489412354341562, "grad_norm": 4.562646865844727, "learning_rate": 2.1397809208928427e-06, "loss": 0.4427, "num_input_tokens_seen": 140638624, "step": 115640 }, { "epoch": 14.490038842250344, "grad_norm": 1.5700769424438477, "learning_rate": 2.1393325106943476e-06, "loss": 0.4422, "num_input_tokens_seen": 140644224, "step": 115645 }, { "epoch": 14.490665330159128, "grad_norm": 4.804327964782715, "learning_rate": 2.1388841346975386e-06, "loss": 0.4412, "num_input_tokens_seen": 140650080, "step": 115650 }, { "epoch": 14.49129181806791, "grad_norm": 1.1469473838806152, "learning_rate": 2.138435792907774e-06, "loss": 0.4482, "num_input_tokens_seen": 140656128, "step": 115655 }, { "epoch": 14.491918305976695, "grad_norm": 1.7133086919784546, "learning_rate": 2.1379874853304156e-06, "loss": 0.414, "num_input_tokens_seen": 140662336, "step": 115660 }, { "epoch": 14.492544793885479, "grad_norm": 2.9002525806427, "learning_rate": 2.137539211970824e-06, "loss": 0.4528, "num_input_tokens_seen": 140668512, "step": 115665 }, { "epoch": 14.493171281794261, "grad_norm": 1.6683391332626343, "learning_rate": 2.137090972834357e-06, "loss": 0.4525, "num_input_tokens_seen": 140674976, "step": 115670 }, { "epoch": 14.493797769703045, "grad_norm": 1.7528972625732422, "learning_rate": 2.1366427679263756e-06, "loss": 0.4657, "num_input_tokens_seen": 140680992, "step": 115675 }, { "epoch": 14.494424257611827, "grad_norm": 1.608812928199768, "learning_rate": 2.1361945972522356e-06, "loss": 0.4306, "num_input_tokens_seen": 140687648, "step": 115680 }, { "epoch": 14.495050745520611, "grad_norm": 13.550711631774902, "learning_rate": 2.135746460817299e-06, "loss": 0.4654, "num_input_tokens_seen": 140693568, "step": 115685 }, { "epoch": 14.495677233429396, "grad_norm": 1.8141086101531982, "learning_rate": 2.1352983586269195e-06, "loss": 0.4402, "num_input_tokens_seen": 140700096, "step": 115690 }, { "epoch": 14.496303721338178, "grad_norm": 2.450040817260742, "learning_rate": 2.1348502906864583e-06, "loss": 0.4129, "num_input_tokens_seen": 140706240, "step": 115695 }, { "epoch": 14.496930209246962, "grad_norm": 16.42530632019043, "learning_rate": 2.134402257001269e-06, "loss": 0.4691, "num_input_tokens_seen": 140712704, "step": 115700 }, { "epoch": 14.497556697155744, "grad_norm": 1.535529375076294, "learning_rate": 2.1339542575767102e-06, "loss": 0.4091, "num_input_tokens_seen": 140718496, "step": 115705 }, { "epoch": 14.498183185064528, "grad_norm": 3.240964651107788, "learning_rate": 2.13350629241814e-06, "loss": 0.4303, "num_input_tokens_seen": 140724800, "step": 115710 }, { "epoch": 14.498809672973312, "grad_norm": 2.8762784004211426, "learning_rate": 2.1330583615309097e-06, "loss": 0.4437, "num_input_tokens_seen": 140730752, "step": 115715 }, { "epoch": 14.499436160882095, "grad_norm": 17.16592788696289, "learning_rate": 2.132610464920377e-06, "loss": 0.5117, "num_input_tokens_seen": 140736864, "step": 115720 }, { "epoch": 14.500062648790879, "grad_norm": 8.419355392456055, "learning_rate": 2.1321626025918975e-06, "loss": 0.4492, "num_input_tokens_seen": 140742688, "step": 115725 }, { "epoch": 14.50068913669966, "grad_norm": 1.2995842695236206, "learning_rate": 2.1317147745508267e-06, "loss": 0.4969, "num_input_tokens_seen": 140748704, "step": 115730 }, { "epoch": 14.501315624608445, "grad_norm": 2.1694889068603516, "learning_rate": 2.1312669808025153e-06, "loss": 0.4565, "num_input_tokens_seen": 140754944, "step": 115735 }, { "epoch": 14.501942112517229, "grad_norm": 18.050979614257812, "learning_rate": 2.1308192213523214e-06, "loss": 0.4995, "num_input_tokens_seen": 140761216, "step": 115740 }, { "epoch": 14.502568600426011, "grad_norm": 2.576148271560669, "learning_rate": 2.130371496205594e-06, "loss": 0.4197, "num_input_tokens_seen": 140767296, "step": 115745 }, { "epoch": 14.503195088334795, "grad_norm": 4.252341270446777, "learning_rate": 2.12992380536769e-06, "loss": 0.4967, "num_input_tokens_seen": 140773344, "step": 115750 }, { "epoch": 14.503821576243578, "grad_norm": 9.885310173034668, "learning_rate": 2.1294761488439574e-06, "loss": 0.5125, "num_input_tokens_seen": 140779552, "step": 115755 }, { "epoch": 14.504448064152362, "grad_norm": 2.220710515975952, "learning_rate": 2.129028526639753e-06, "loss": 0.4927, "num_input_tokens_seen": 140785568, "step": 115760 }, { "epoch": 14.505074552061146, "grad_norm": 13.465761184692383, "learning_rate": 2.1285809387604246e-06, "loss": 0.5533, "num_input_tokens_seen": 140791584, "step": 115765 }, { "epoch": 14.505701039969928, "grad_norm": 2.6875903606414795, "learning_rate": 2.1281333852113274e-06, "loss": 0.4598, "num_input_tokens_seen": 140797664, "step": 115770 }, { "epoch": 14.506327527878712, "grad_norm": 2.6884677410125732, "learning_rate": 2.1276858659978076e-06, "loss": 0.4467, "num_input_tokens_seen": 140803776, "step": 115775 }, { "epoch": 14.506954015787496, "grad_norm": 1.1739367246627808, "learning_rate": 2.1272383811252185e-06, "loss": 0.4406, "num_input_tokens_seen": 140809504, "step": 115780 }, { "epoch": 14.507580503696278, "grad_norm": 2.624058723449707, "learning_rate": 2.126790930598912e-06, "loss": 0.4765, "num_input_tokens_seen": 140815264, "step": 115785 }, { "epoch": 14.508206991605062, "grad_norm": 10.8551025390625, "learning_rate": 2.1263435144242334e-06, "loss": 0.4839, "num_input_tokens_seen": 140821376, "step": 115790 }, { "epoch": 14.508833479513845, "grad_norm": 1.997875690460205, "learning_rate": 2.125896132606536e-06, "loss": 0.4555, "num_input_tokens_seen": 140827392, "step": 115795 }, { "epoch": 14.509459967422629, "grad_norm": 2.758881092071533, "learning_rate": 2.1254487851511647e-06, "loss": 0.4387, "num_input_tokens_seen": 140833888, "step": 115800 }, { "epoch": 14.510086455331413, "grad_norm": 1.4198859930038452, "learning_rate": 2.1250014720634722e-06, "loss": 0.4705, "num_input_tokens_seen": 140840320, "step": 115805 }, { "epoch": 14.510712943240195, "grad_norm": 19.168678283691406, "learning_rate": 2.1245541933488024e-06, "loss": 0.5405, "num_input_tokens_seen": 140846720, "step": 115810 }, { "epoch": 14.51133943114898, "grad_norm": 7.994639873504639, "learning_rate": 2.1241069490125046e-06, "loss": 0.4339, "num_input_tokens_seen": 140852384, "step": 115815 }, { "epoch": 14.511965919057761, "grad_norm": 4.755047798156738, "learning_rate": 2.123659739059926e-06, "loss": 0.4239, "num_input_tokens_seen": 140858464, "step": 115820 }, { "epoch": 14.512592406966546, "grad_norm": 4.848444938659668, "learning_rate": 2.1232125634964157e-06, "loss": 0.4815, "num_input_tokens_seen": 140864640, "step": 115825 }, { "epoch": 14.51321889487533, "grad_norm": 12.316904067993164, "learning_rate": 2.122765422327316e-06, "loss": 0.46, "num_input_tokens_seen": 140870624, "step": 115830 }, { "epoch": 14.513845382784112, "grad_norm": 12.986662864685059, "learning_rate": 2.1223183155579762e-06, "loss": 0.4978, "num_input_tokens_seen": 140876256, "step": 115835 }, { "epoch": 14.514471870692896, "grad_norm": 1.7451828718185425, "learning_rate": 2.121871243193739e-06, "loss": 0.5051, "num_input_tokens_seen": 140881568, "step": 115840 }, { "epoch": 14.515098358601678, "grad_norm": 4.943418979644775, "learning_rate": 2.12142420523995e-06, "loss": 0.4299, "num_input_tokens_seen": 140887424, "step": 115845 }, { "epoch": 14.515724846510462, "grad_norm": 2.423488140106201, "learning_rate": 2.1209772017019577e-06, "loss": 0.4552, "num_input_tokens_seen": 140893568, "step": 115850 }, { "epoch": 14.516351334419246, "grad_norm": 7.333180904388428, "learning_rate": 2.1205302325851005e-06, "loss": 0.4254, "num_input_tokens_seen": 140899584, "step": 115855 }, { "epoch": 14.516977822328029, "grad_norm": 11.69376277923584, "learning_rate": 2.1200832978947282e-06, "loss": 0.4936, "num_input_tokens_seen": 140905536, "step": 115860 }, { "epoch": 14.517604310236813, "grad_norm": 4.852939605712891, "learning_rate": 2.1196363976361796e-06, "loss": 0.4535, "num_input_tokens_seen": 140911328, "step": 115865 }, { "epoch": 14.518230798145595, "grad_norm": 13.364081382751465, "learning_rate": 2.119189531814801e-06, "loss": 0.5474, "num_input_tokens_seen": 140917536, "step": 115870 }, { "epoch": 14.518857286054379, "grad_norm": 1.6728415489196777, "learning_rate": 2.118742700435931e-06, "loss": 0.437, "num_input_tokens_seen": 140923488, "step": 115875 }, { "epoch": 14.519483773963163, "grad_norm": 4.285080909729004, "learning_rate": 2.1182959035049174e-06, "loss": 0.4427, "num_input_tokens_seen": 140929760, "step": 115880 }, { "epoch": 14.520110261871945, "grad_norm": 2.0557992458343506, "learning_rate": 2.1178491410270967e-06, "loss": 0.4836, "num_input_tokens_seen": 140935456, "step": 115885 }, { "epoch": 14.52073674978073, "grad_norm": 9.126893997192383, "learning_rate": 2.1174024130078147e-06, "loss": 0.5533, "num_input_tokens_seen": 140941568, "step": 115890 }, { "epoch": 14.521363237689513, "grad_norm": 14.988859176635742, "learning_rate": 2.1169557194524086e-06, "loss": 0.5572, "num_input_tokens_seen": 140948192, "step": 115895 }, { "epoch": 14.521989725598296, "grad_norm": 1.3952516317367554, "learning_rate": 2.1165090603662226e-06, "loss": 0.5744, "num_input_tokens_seen": 140954208, "step": 115900 }, { "epoch": 14.52261621350708, "grad_norm": 16.827590942382812, "learning_rate": 2.1160624357545934e-06, "loss": 0.5226, "num_input_tokens_seen": 140960448, "step": 115905 }, { "epoch": 14.523242701415862, "grad_norm": 2.358158588409424, "learning_rate": 2.115615845622862e-06, "loss": 0.4691, "num_input_tokens_seen": 140966560, "step": 115910 }, { "epoch": 14.523869189324646, "grad_norm": 2.737464427947998, "learning_rate": 2.11516928997637e-06, "loss": 0.5087, "num_input_tokens_seen": 140972736, "step": 115915 }, { "epoch": 14.52449567723343, "grad_norm": 10.656420707702637, "learning_rate": 2.1147227688204538e-06, "loss": 0.4796, "num_input_tokens_seen": 140978912, "step": 115920 }, { "epoch": 14.525122165142212, "grad_norm": 1.8834636211395264, "learning_rate": 2.1142762821604517e-06, "loss": 0.4532, "num_input_tokens_seen": 140984864, "step": 115925 }, { "epoch": 14.525748653050996, "grad_norm": 1.371840000152588, "learning_rate": 2.1138298300017034e-06, "loss": 0.4768, "num_input_tokens_seen": 140991360, "step": 115930 }, { "epoch": 14.526375140959779, "grad_norm": 1.5620158910751343, "learning_rate": 2.113383412349548e-06, "loss": 0.4585, "num_input_tokens_seen": 140997504, "step": 115935 }, { "epoch": 14.527001628868563, "grad_norm": 9.310295104980469, "learning_rate": 2.112937029209318e-06, "loss": 0.4767, "num_input_tokens_seen": 141003424, "step": 115940 }, { "epoch": 14.527628116777347, "grad_norm": 1.8449143171310425, "learning_rate": 2.1124906805863558e-06, "loss": 0.5107, "num_input_tokens_seen": 141008960, "step": 115945 }, { "epoch": 14.52825460468613, "grad_norm": 2.460347890853882, "learning_rate": 2.112044366485993e-06, "loss": 0.4254, "num_input_tokens_seen": 141015104, "step": 115950 }, { "epoch": 14.528881092594913, "grad_norm": 5.924063682556152, "learning_rate": 2.11159808691357e-06, "loss": 0.4449, "num_input_tokens_seen": 141021216, "step": 115955 }, { "epoch": 14.529507580503696, "grad_norm": 14.438751220703125, "learning_rate": 2.1111518418744183e-06, "loss": 0.4891, "num_input_tokens_seen": 141027424, "step": 115960 }, { "epoch": 14.53013406841248, "grad_norm": 2.1330788135528564, "learning_rate": 2.110705631373875e-06, "loss": 0.4537, "num_input_tokens_seen": 141033504, "step": 115965 }, { "epoch": 14.530760556321264, "grad_norm": 3.6236839294433594, "learning_rate": 2.110259455417277e-06, "loss": 0.446, "num_input_tokens_seen": 141039648, "step": 115970 }, { "epoch": 14.531387044230046, "grad_norm": 9.010279655456543, "learning_rate": 2.109813314009955e-06, "loss": 0.4691, "num_input_tokens_seen": 141045632, "step": 115975 }, { "epoch": 14.53201353213883, "grad_norm": 1.7469587326049805, "learning_rate": 2.109367207157247e-06, "loss": 0.4481, "num_input_tokens_seen": 141051840, "step": 115980 }, { "epoch": 14.532640020047612, "grad_norm": 2.0107808113098145, "learning_rate": 2.108921134864482e-06, "loss": 0.4432, "num_input_tokens_seen": 141058016, "step": 115985 }, { "epoch": 14.533266507956396, "grad_norm": 10.862919807434082, "learning_rate": 2.108475097136997e-06, "loss": 0.4739, "num_input_tokens_seen": 141063936, "step": 115990 }, { "epoch": 14.53389299586518, "grad_norm": 14.125377655029297, "learning_rate": 2.1080290939801225e-06, "loss": 0.4835, "num_input_tokens_seen": 141069824, "step": 115995 }, { "epoch": 14.534519483773963, "grad_norm": 9.471457481384277, "learning_rate": 2.1075831253991923e-06, "loss": 0.4671, "num_input_tokens_seen": 141075264, "step": 116000 }, { "epoch": 14.535145971682747, "grad_norm": 1.5131396055221558, "learning_rate": 2.1071371913995366e-06, "loss": 0.4684, "num_input_tokens_seen": 141081120, "step": 116005 }, { "epoch": 14.53577245959153, "grad_norm": 3.909975051879883, "learning_rate": 2.1066912919864897e-06, "loss": 0.4597, "num_input_tokens_seen": 141087232, "step": 116010 }, { "epoch": 14.536398947500313, "grad_norm": 10.561206817626953, "learning_rate": 2.1062454271653785e-06, "loss": 0.485, "num_input_tokens_seen": 141093280, "step": 116015 }, { "epoch": 14.537025435409097, "grad_norm": 3.209475040435791, "learning_rate": 2.105799596941537e-06, "loss": 0.6963, "num_input_tokens_seen": 141099360, "step": 116020 }, { "epoch": 14.53765192331788, "grad_norm": 2.9202444553375244, "learning_rate": 2.105353801320294e-06, "loss": 0.4169, "num_input_tokens_seen": 141105184, "step": 116025 }, { "epoch": 14.538278411226663, "grad_norm": 2.850750684738159, "learning_rate": 2.104908040306982e-06, "loss": 0.468, "num_input_tokens_seen": 141111008, "step": 116030 }, { "epoch": 14.538904899135447, "grad_norm": 2.0314829349517822, "learning_rate": 2.104462313906927e-06, "loss": 0.4291, "num_input_tokens_seen": 141116544, "step": 116035 }, { "epoch": 14.53953138704423, "grad_norm": 11.166872024536133, "learning_rate": 2.1040166221254588e-06, "loss": 0.5048, "num_input_tokens_seen": 141122912, "step": 116040 }, { "epoch": 14.540157874953014, "grad_norm": 2.136960029602051, "learning_rate": 2.1035709649679094e-06, "loss": 0.4456, "num_input_tokens_seen": 141129056, "step": 116045 }, { "epoch": 14.540784362861796, "grad_norm": 13.327156066894531, "learning_rate": 2.1031253424396013e-06, "loss": 0.5924, "num_input_tokens_seen": 141135424, "step": 116050 }, { "epoch": 14.54141085077058, "grad_norm": 15.460890769958496, "learning_rate": 2.102679754545868e-06, "loss": 0.5735, "num_input_tokens_seen": 141141728, "step": 116055 }, { "epoch": 14.542037338679364, "grad_norm": 1.933045744895935, "learning_rate": 2.1022342012920322e-06, "loss": 0.5142, "num_input_tokens_seen": 141147744, "step": 116060 }, { "epoch": 14.542663826588146, "grad_norm": 16.203067779541016, "learning_rate": 2.1017886826834245e-06, "loss": 0.4932, "num_input_tokens_seen": 141153760, "step": 116065 }, { "epoch": 14.54329031449693, "grad_norm": 2.0592761039733887, "learning_rate": 2.101343198725368e-06, "loss": 0.4536, "num_input_tokens_seen": 141159136, "step": 116070 }, { "epoch": 14.543916802405713, "grad_norm": 16.896862030029297, "learning_rate": 2.1008977494231923e-06, "loss": 0.6234, "num_input_tokens_seen": 141165088, "step": 116075 }, { "epoch": 14.544543290314497, "grad_norm": 10.167740821838379, "learning_rate": 2.10045233478222e-06, "loss": 0.444, "num_input_tokens_seen": 141170496, "step": 116080 }, { "epoch": 14.545169778223281, "grad_norm": 3.0602948665618896, "learning_rate": 2.1000069548077796e-06, "loss": 0.4737, "num_input_tokens_seen": 141176640, "step": 116085 }, { "epoch": 14.545796266132063, "grad_norm": 8.743050575256348, "learning_rate": 2.0995616095051918e-06, "loss": 0.5121, "num_input_tokens_seen": 141182560, "step": 116090 }, { "epoch": 14.546422754040847, "grad_norm": 14.165332794189453, "learning_rate": 2.0991162988797843e-06, "loss": 0.6343, "num_input_tokens_seen": 141188544, "step": 116095 }, { "epoch": 14.54704924194963, "grad_norm": 2.5079028606414795, "learning_rate": 2.0986710229368816e-06, "loss": 0.4844, "num_input_tokens_seen": 141194656, "step": 116100 }, { "epoch": 14.547675729858414, "grad_norm": 3.2922921180725098, "learning_rate": 2.0982257816818047e-06, "loss": 0.4447, "num_input_tokens_seen": 141200672, "step": 116105 }, { "epoch": 14.548302217767198, "grad_norm": 4.161960124969482, "learning_rate": 2.0977805751198797e-06, "loss": 0.4882, "num_input_tokens_seen": 141206432, "step": 116110 }, { "epoch": 14.54892870567598, "grad_norm": 3.3573591709136963, "learning_rate": 2.0973354032564265e-06, "loss": 0.4327, "num_input_tokens_seen": 141212416, "step": 116115 }, { "epoch": 14.549555193584764, "grad_norm": 16.89861488342285, "learning_rate": 2.0968902660967687e-06, "loss": 0.4961, "num_input_tokens_seen": 141218304, "step": 116120 }, { "epoch": 14.550181681493548, "grad_norm": 1.8022695779800415, "learning_rate": 2.0964451636462306e-06, "loss": 0.4654, "num_input_tokens_seen": 141224672, "step": 116125 }, { "epoch": 14.55080816940233, "grad_norm": 4.076315879821777, "learning_rate": 2.09600009591013e-06, "loss": 0.4482, "num_input_tokens_seen": 141231136, "step": 116130 }, { "epoch": 14.551434657311114, "grad_norm": 14.133981704711914, "learning_rate": 2.0955550628937897e-06, "loss": 0.5251, "num_input_tokens_seen": 141237760, "step": 116135 }, { "epoch": 14.552061145219897, "grad_norm": 5.435035228729248, "learning_rate": 2.0951100646025324e-06, "loss": 0.4845, "num_input_tokens_seen": 141243392, "step": 116140 }, { "epoch": 14.55268763312868, "grad_norm": 3.3028464317321777, "learning_rate": 2.094665101041675e-06, "loss": 0.4375, "num_input_tokens_seen": 141249568, "step": 116145 }, { "epoch": 14.553314121037465, "grad_norm": 5.70092248916626, "learning_rate": 2.0942201722165386e-06, "loss": 0.4861, "num_input_tokens_seen": 141255616, "step": 116150 }, { "epoch": 14.553940608946247, "grad_norm": 1.5437899827957153, "learning_rate": 2.093775278132445e-06, "loss": 0.4945, "num_input_tokens_seen": 141262208, "step": 116155 }, { "epoch": 14.554567096855031, "grad_norm": 6.795412540435791, "learning_rate": 2.0933304187947096e-06, "loss": 0.4502, "num_input_tokens_seen": 141268512, "step": 116160 }, { "epoch": 14.555193584763813, "grad_norm": 2.268446445465088, "learning_rate": 2.0928855942086547e-06, "loss": 0.4301, "num_input_tokens_seen": 141274464, "step": 116165 }, { "epoch": 14.555820072672597, "grad_norm": 8.556809425354004, "learning_rate": 2.0924408043795947e-06, "loss": 0.465, "num_input_tokens_seen": 141280416, "step": 116170 }, { "epoch": 14.556446560581382, "grad_norm": 1.218759298324585, "learning_rate": 2.091996049312851e-06, "loss": 0.4835, "num_input_tokens_seen": 141286432, "step": 116175 }, { "epoch": 14.557073048490164, "grad_norm": 1.5564234256744385, "learning_rate": 2.091551329013738e-06, "loss": 0.4701, "num_input_tokens_seen": 141292640, "step": 116180 }, { "epoch": 14.557699536398948, "grad_norm": 10.743273735046387, "learning_rate": 2.091106643487576e-06, "loss": 0.4424, "num_input_tokens_seen": 141298784, "step": 116185 }, { "epoch": 14.55832602430773, "grad_norm": 2.3517422676086426, "learning_rate": 2.090661992739678e-06, "loss": 0.4452, "num_input_tokens_seen": 141304672, "step": 116190 }, { "epoch": 14.558952512216514, "grad_norm": 1.5924996137619019, "learning_rate": 2.0902173767753636e-06, "loss": 0.4295, "num_input_tokens_seen": 141309664, "step": 116195 }, { "epoch": 14.559579000125298, "grad_norm": 4.114315032958984, "learning_rate": 2.0897727955999445e-06, "loss": 0.4771, "num_input_tokens_seen": 141315840, "step": 116200 }, { "epoch": 14.56020548803408, "grad_norm": 2.3965983390808105, "learning_rate": 2.089328249218741e-06, "loss": 0.4675, "num_input_tokens_seen": 141321952, "step": 116205 }, { "epoch": 14.560831975942865, "grad_norm": 1.8561345338821411, "learning_rate": 2.0888837376370634e-06, "loss": 0.4431, "num_input_tokens_seen": 141327712, "step": 116210 }, { "epoch": 14.561458463851647, "grad_norm": 10.78350830078125, "learning_rate": 2.0884392608602283e-06, "loss": 0.4645, "num_input_tokens_seen": 141334176, "step": 116215 }, { "epoch": 14.562084951760431, "grad_norm": 2.2498362064361572, "learning_rate": 2.087994818893552e-06, "loss": 0.4504, "num_input_tokens_seen": 141340384, "step": 116220 }, { "epoch": 14.562711439669215, "grad_norm": 2.5164594650268555, "learning_rate": 2.087550411742343e-06, "loss": 0.5379, "num_input_tokens_seen": 141346784, "step": 116225 }, { "epoch": 14.563337927577997, "grad_norm": 2.1865394115448, "learning_rate": 2.087106039411918e-06, "loss": 0.5365, "num_input_tokens_seen": 141352992, "step": 116230 }, { "epoch": 14.563964415486781, "grad_norm": 1.9923601150512695, "learning_rate": 2.0866617019075913e-06, "loss": 0.4457, "num_input_tokens_seen": 141359200, "step": 116235 }, { "epoch": 14.564590903395565, "grad_norm": 8.043821334838867, "learning_rate": 2.086217399234671e-06, "loss": 0.5064, "num_input_tokens_seen": 141365120, "step": 116240 }, { "epoch": 14.565217391304348, "grad_norm": 2.0381791591644287, "learning_rate": 2.0857731313984723e-06, "loss": 0.4788, "num_input_tokens_seen": 141371264, "step": 116245 }, { "epoch": 14.565843879213132, "grad_norm": 1.7570545673370361, "learning_rate": 2.0853288984043072e-06, "loss": 0.4503, "num_input_tokens_seen": 141377536, "step": 116250 }, { "epoch": 14.566470367121914, "grad_norm": 12.173796653747559, "learning_rate": 2.084884700257484e-06, "loss": 0.4728, "num_input_tokens_seen": 141384256, "step": 116255 }, { "epoch": 14.567096855030698, "grad_norm": 1.953981876373291, "learning_rate": 2.084440536963317e-06, "loss": 0.4281, "num_input_tokens_seen": 141390368, "step": 116260 }, { "epoch": 14.56772334293948, "grad_norm": 4.278301239013672, "learning_rate": 2.0839964085271132e-06, "loss": 0.5432, "num_input_tokens_seen": 141396448, "step": 116265 }, { "epoch": 14.568349830848264, "grad_norm": 2.263974189758301, "learning_rate": 2.0835523149541854e-06, "loss": 0.3998, "num_input_tokens_seen": 141402464, "step": 116270 }, { "epoch": 14.568976318757048, "grad_norm": 5.286231994628906, "learning_rate": 2.0831082562498395e-06, "loss": 0.4216, "num_input_tokens_seen": 141408576, "step": 116275 }, { "epoch": 14.56960280666583, "grad_norm": 4.751737117767334, "learning_rate": 2.0826642324193874e-06, "loss": 0.5025, "num_input_tokens_seen": 141414496, "step": 116280 }, { "epoch": 14.570229294574615, "grad_norm": 8.06493091583252, "learning_rate": 2.082220243468138e-06, "loss": 0.5697, "num_input_tokens_seen": 141420320, "step": 116285 }, { "epoch": 14.570855782483399, "grad_norm": 8.944278717041016, "learning_rate": 2.0817762894013983e-06, "loss": 0.4255, "num_input_tokens_seen": 141426656, "step": 116290 }, { "epoch": 14.571482270392181, "grad_norm": 1.7575485706329346, "learning_rate": 2.0813323702244775e-06, "loss": 0.5083, "num_input_tokens_seen": 141432800, "step": 116295 }, { "epoch": 14.572108758300965, "grad_norm": 2.6006810665130615, "learning_rate": 2.0808884859426802e-06, "loss": 0.4301, "num_input_tokens_seen": 141439072, "step": 116300 }, { "epoch": 14.572735246209747, "grad_norm": 11.50367259979248, "learning_rate": 2.0804446365613175e-06, "loss": 0.4882, "num_input_tokens_seen": 141445376, "step": 116305 }, { "epoch": 14.573361734118532, "grad_norm": 4.725276947021484, "learning_rate": 2.080000822085692e-06, "loss": 0.4681, "num_input_tokens_seen": 141451520, "step": 116310 }, { "epoch": 14.573988222027316, "grad_norm": 3.2492737770080566, "learning_rate": 2.079557042521113e-06, "loss": 0.4972, "num_input_tokens_seen": 141457792, "step": 116315 }, { "epoch": 14.574614709936098, "grad_norm": 1.8133074045181274, "learning_rate": 2.079113297872884e-06, "loss": 0.5199, "num_input_tokens_seen": 141463264, "step": 116320 }, { "epoch": 14.575241197844882, "grad_norm": 1.511707067489624, "learning_rate": 2.0786695881463104e-06, "loss": 0.428, "num_input_tokens_seen": 141468864, "step": 116325 }, { "epoch": 14.575867685753664, "grad_norm": 1.9666283130645752, "learning_rate": 2.0782259133467005e-06, "loss": 0.3991, "num_input_tokens_seen": 141475072, "step": 116330 }, { "epoch": 14.576494173662448, "grad_norm": 4.995168685913086, "learning_rate": 2.077782273479354e-06, "loss": 0.5275, "num_input_tokens_seen": 141480928, "step": 116335 }, { "epoch": 14.577120661571232, "grad_norm": 9.04111099243164, "learning_rate": 2.077338668549577e-06, "loss": 0.4417, "num_input_tokens_seen": 141487264, "step": 116340 }, { "epoch": 14.577747149480015, "grad_norm": 7.867653846740723, "learning_rate": 2.076895098562674e-06, "loss": 0.4654, "num_input_tokens_seen": 141493248, "step": 116345 }, { "epoch": 14.578373637388799, "grad_norm": 2.2245237827301025, "learning_rate": 2.0764515635239497e-06, "loss": 0.4787, "num_input_tokens_seen": 141499680, "step": 116350 }, { "epoch": 14.579000125297581, "grad_norm": 1.9494497776031494, "learning_rate": 2.0760080634387023e-06, "loss": 0.4888, "num_input_tokens_seen": 141505440, "step": 116355 }, { "epoch": 14.579626613206365, "grad_norm": 3.5854153633117676, "learning_rate": 2.07556459831224e-06, "loss": 0.4921, "num_input_tokens_seen": 141511744, "step": 116360 }, { "epoch": 14.580253101115149, "grad_norm": 11.336108207702637, "learning_rate": 2.0751211681498583e-06, "loss": 0.5156, "num_input_tokens_seen": 141517376, "step": 116365 }, { "epoch": 14.580879589023931, "grad_norm": 2.0906567573547363, "learning_rate": 2.0746777729568652e-06, "loss": 0.5109, "num_input_tokens_seen": 141523168, "step": 116370 }, { "epoch": 14.581506076932715, "grad_norm": 1.7663987874984741, "learning_rate": 2.0742344127385563e-06, "loss": 0.4914, "num_input_tokens_seen": 141528960, "step": 116375 }, { "epoch": 14.582132564841498, "grad_norm": 8.821477890014648, "learning_rate": 2.0737910875002364e-06, "loss": 0.4801, "num_input_tokens_seen": 141535008, "step": 116380 }, { "epoch": 14.582759052750282, "grad_norm": 1.796729564666748, "learning_rate": 2.0733477972472034e-06, "loss": 0.4837, "num_input_tokens_seen": 141541408, "step": 116385 }, { "epoch": 14.583385540659066, "grad_norm": 2.5830769538879395, "learning_rate": 2.072904541984759e-06, "loss": 0.5281, "num_input_tokens_seen": 141546912, "step": 116390 }, { "epoch": 14.584012028567848, "grad_norm": 5.145742416381836, "learning_rate": 2.0724613217182003e-06, "loss": 0.4373, "num_input_tokens_seen": 141552640, "step": 116395 }, { "epoch": 14.584638516476632, "grad_norm": 6.277169227600098, "learning_rate": 2.072018136452828e-06, "loss": 0.6002, "num_input_tokens_seen": 141559104, "step": 116400 }, { "epoch": 14.585265004385416, "grad_norm": 8.264004707336426, "learning_rate": 2.071574986193942e-06, "loss": 0.4767, "num_input_tokens_seen": 141564864, "step": 116405 }, { "epoch": 14.585891492294198, "grad_norm": 5.251799583435059, "learning_rate": 2.071131870946838e-06, "loss": 0.4551, "num_input_tokens_seen": 141570912, "step": 116410 }, { "epoch": 14.586517980202983, "grad_norm": 1.684937596321106, "learning_rate": 2.070688790716816e-06, "loss": 0.4248, "num_input_tokens_seen": 141577312, "step": 116415 }, { "epoch": 14.587144468111765, "grad_norm": 3.2196638584136963, "learning_rate": 2.0702457455091713e-06, "loss": 0.4701, "num_input_tokens_seen": 141583680, "step": 116420 }, { "epoch": 14.587770956020549, "grad_norm": 1.7801804542541504, "learning_rate": 2.069802735329203e-06, "loss": 0.4803, "num_input_tokens_seen": 141589728, "step": 116425 }, { "epoch": 14.588397443929333, "grad_norm": 10.17084789276123, "learning_rate": 2.069359760182205e-06, "loss": 0.5372, "num_input_tokens_seen": 141595616, "step": 116430 }, { "epoch": 14.589023931838115, "grad_norm": 2.7771944999694824, "learning_rate": 2.0689168200734757e-06, "loss": 0.4547, "num_input_tokens_seen": 141601952, "step": 116435 }, { "epoch": 14.5896504197469, "grad_norm": 7.370028972625732, "learning_rate": 2.0684739150083087e-06, "loss": 0.4862, "num_input_tokens_seen": 141608064, "step": 116440 }, { "epoch": 14.590276907655682, "grad_norm": 1.604775071144104, "learning_rate": 2.068031044992004e-06, "loss": 0.4367, "num_input_tokens_seen": 141613952, "step": 116445 }, { "epoch": 14.590903395564466, "grad_norm": 2.320587396621704, "learning_rate": 2.0675882100298504e-06, "loss": 0.4645, "num_input_tokens_seen": 141620128, "step": 116450 }, { "epoch": 14.59152988347325, "grad_norm": 5.654774188995361, "learning_rate": 2.0671454101271475e-06, "loss": 0.4718, "num_input_tokens_seen": 141625632, "step": 116455 }, { "epoch": 14.592156371382032, "grad_norm": 1.721681833267212, "learning_rate": 2.0667026452891853e-06, "loss": 0.4866, "num_input_tokens_seen": 141631616, "step": 116460 }, { "epoch": 14.592782859290816, "grad_norm": 2.4928696155548096, "learning_rate": 2.0662599155212586e-06, "loss": 0.5138, "num_input_tokens_seen": 141638112, "step": 116465 }, { "epoch": 14.593409347199598, "grad_norm": 1.7121410369873047, "learning_rate": 2.065817220828663e-06, "loss": 0.446, "num_input_tokens_seen": 141643840, "step": 116470 }, { "epoch": 14.594035835108382, "grad_norm": 1.8104243278503418, "learning_rate": 2.065374561216688e-06, "loss": 0.4974, "num_input_tokens_seen": 141649760, "step": 116475 }, { "epoch": 14.594662323017166, "grad_norm": 3.9581258296966553, "learning_rate": 2.0649319366906297e-06, "loss": 0.4714, "num_input_tokens_seen": 141655232, "step": 116480 }, { "epoch": 14.595288810925949, "grad_norm": 1.3351529836654663, "learning_rate": 2.0644893472557755e-06, "loss": 0.4447, "num_input_tokens_seen": 141661408, "step": 116485 }, { "epoch": 14.595915298834733, "grad_norm": 2.576724052429199, "learning_rate": 2.0640467929174205e-06, "loss": 0.4441, "num_input_tokens_seen": 141667392, "step": 116490 }, { "epoch": 14.596541786743515, "grad_norm": 1.7011685371398926, "learning_rate": 2.0636042736808533e-06, "loss": 0.4556, "num_input_tokens_seen": 141673504, "step": 116495 }, { "epoch": 14.597168274652299, "grad_norm": 1.6715301275253296, "learning_rate": 2.0631617895513674e-06, "loss": 0.4893, "num_input_tokens_seen": 141679648, "step": 116500 }, { "epoch": 14.597794762561083, "grad_norm": 10.96092700958252, "learning_rate": 2.0627193405342495e-06, "loss": 0.5242, "num_input_tokens_seen": 141685728, "step": 116505 }, { "epoch": 14.598421250469865, "grad_norm": 5.58746337890625, "learning_rate": 2.0622769266347937e-06, "loss": 0.4657, "num_input_tokens_seen": 141692128, "step": 116510 }, { "epoch": 14.59904773837865, "grad_norm": 1.6977431774139404, "learning_rate": 2.0618345478582845e-06, "loss": 0.511, "num_input_tokens_seen": 141698176, "step": 116515 }, { "epoch": 14.599674226287433, "grad_norm": 2.030290365219116, "learning_rate": 2.0613922042100167e-06, "loss": 0.4246, "num_input_tokens_seen": 141704480, "step": 116520 }, { "epoch": 14.600300714196216, "grad_norm": 8.732705116271973, "learning_rate": 2.060949895695273e-06, "loss": 0.4968, "num_input_tokens_seen": 141710432, "step": 116525 }, { "epoch": 14.600927202105, "grad_norm": 1.6445611715316772, "learning_rate": 2.0605076223193445e-06, "loss": 0.4563, "num_input_tokens_seen": 141716448, "step": 116530 }, { "epoch": 14.601553690013782, "grad_norm": 3.215883255004883, "learning_rate": 2.0600653840875203e-06, "loss": 0.4948, "num_input_tokens_seen": 141722240, "step": 116535 }, { "epoch": 14.602180177922566, "grad_norm": 3.6495652198791504, "learning_rate": 2.059623181005084e-06, "loss": 0.4677, "num_input_tokens_seen": 141728768, "step": 116540 }, { "epoch": 14.60280666583135, "grad_norm": 1.6884812116622925, "learning_rate": 2.059181013077326e-06, "loss": 0.5127, "num_input_tokens_seen": 141735136, "step": 116545 }, { "epoch": 14.603433153740133, "grad_norm": 1.924382209777832, "learning_rate": 2.0587388803095303e-06, "loss": 0.4775, "num_input_tokens_seen": 141741344, "step": 116550 }, { "epoch": 14.604059641648917, "grad_norm": 6.969365119934082, "learning_rate": 2.0582967827069865e-06, "loss": 0.4615, "num_input_tokens_seen": 141747072, "step": 116555 }, { "epoch": 14.604686129557699, "grad_norm": 12.471210479736328, "learning_rate": 2.0578547202749764e-06, "loss": 0.5206, "num_input_tokens_seen": 141753376, "step": 116560 }, { "epoch": 14.605312617466483, "grad_norm": 15.617406845092773, "learning_rate": 2.0574126930187882e-06, "loss": 0.5232, "num_input_tokens_seen": 141759424, "step": 116565 }, { "epoch": 14.605939105375267, "grad_norm": 3.944936513900757, "learning_rate": 2.0569707009437035e-06, "loss": 0.459, "num_input_tokens_seen": 141765728, "step": 116570 }, { "epoch": 14.60656559328405, "grad_norm": 1.4377371072769165, "learning_rate": 2.0565287440550103e-06, "loss": 0.4613, "num_input_tokens_seen": 141771680, "step": 116575 }, { "epoch": 14.607192081192833, "grad_norm": 2.0190021991729736, "learning_rate": 2.0560868223579887e-06, "loss": 0.4904, "num_input_tokens_seen": 141777760, "step": 116580 }, { "epoch": 14.607818569101616, "grad_norm": 14.242857933044434, "learning_rate": 2.0556449358579244e-06, "loss": 0.5581, "num_input_tokens_seen": 141783936, "step": 116585 }, { "epoch": 14.6084450570104, "grad_norm": 2.7742300033569336, "learning_rate": 2.0552030845601022e-06, "loss": 0.4698, "num_input_tokens_seen": 141790080, "step": 116590 }, { "epoch": 14.609071544919184, "grad_norm": 8.297445297241211, "learning_rate": 2.054761268469801e-06, "loss": 0.4588, "num_input_tokens_seen": 141796192, "step": 116595 }, { "epoch": 14.609698032827966, "grad_norm": 1.828307867050171, "learning_rate": 2.054319487592307e-06, "loss": 0.4086, "num_input_tokens_seen": 141802208, "step": 116600 }, { "epoch": 14.61032452073675, "grad_norm": 4.113823890686035, "learning_rate": 2.053877741932898e-06, "loss": 0.5283, "num_input_tokens_seen": 141807808, "step": 116605 }, { "epoch": 14.610951008645532, "grad_norm": 4.398806095123291, "learning_rate": 2.05343603149686e-06, "loss": 0.4747, "num_input_tokens_seen": 141813824, "step": 116610 }, { "epoch": 14.611577496554316, "grad_norm": 3.340691089630127, "learning_rate": 2.052994356289469e-06, "loss": 0.4646, "num_input_tokens_seen": 141819968, "step": 116615 }, { "epoch": 14.6122039844631, "grad_norm": 1.8491514921188354, "learning_rate": 2.0525527163160107e-06, "loss": 0.4454, "num_input_tokens_seen": 141826176, "step": 116620 }, { "epoch": 14.612830472371883, "grad_norm": 1.5801578760147095, "learning_rate": 2.0521111115817604e-06, "loss": 0.4671, "num_input_tokens_seen": 141832480, "step": 116625 }, { "epoch": 14.613456960280667, "grad_norm": 6.565319061279297, "learning_rate": 2.051669542092002e-06, "loss": 0.4424, "num_input_tokens_seen": 141838240, "step": 116630 }, { "epoch": 14.61408344818945, "grad_norm": 1.6472830772399902, "learning_rate": 2.051228007852012e-06, "loss": 0.4259, "num_input_tokens_seen": 141844320, "step": 116635 }, { "epoch": 14.614709936098233, "grad_norm": 1.632358193397522, "learning_rate": 2.0507865088670696e-06, "loss": 0.398, "num_input_tokens_seen": 141850336, "step": 116640 }, { "epoch": 14.615336424007017, "grad_norm": 2.9087276458740234, "learning_rate": 2.0503450451424543e-06, "loss": 0.4653, "num_input_tokens_seen": 141856384, "step": 116645 }, { "epoch": 14.6159629119158, "grad_norm": 1.5325349569320679, "learning_rate": 2.0499036166834455e-06, "loss": 0.4495, "num_input_tokens_seen": 141862592, "step": 116650 }, { "epoch": 14.616589399824583, "grad_norm": 1.4493237733840942, "learning_rate": 2.049462223495318e-06, "loss": 0.4656, "num_input_tokens_seen": 141868416, "step": 116655 }, { "epoch": 14.617215887733368, "grad_norm": 2.0101354122161865, "learning_rate": 2.04902086558335e-06, "loss": 0.4345, "num_input_tokens_seen": 141874624, "step": 116660 }, { "epoch": 14.61784237564215, "grad_norm": 15.683646202087402, "learning_rate": 2.0485795429528205e-06, "loss": 0.5973, "num_input_tokens_seen": 141880480, "step": 116665 }, { "epoch": 14.618468863550934, "grad_norm": 2.388082265853882, "learning_rate": 2.0481382556090018e-06, "loss": 0.4804, "num_input_tokens_seen": 141886304, "step": 116670 }, { "epoch": 14.619095351459716, "grad_norm": 1.2626293897628784, "learning_rate": 2.047697003557174e-06, "loss": 0.4546, "num_input_tokens_seen": 141892416, "step": 116675 }, { "epoch": 14.6197218393685, "grad_norm": 3.168931722640991, "learning_rate": 2.047255786802609e-06, "loss": 0.4735, "num_input_tokens_seen": 141898560, "step": 116680 }, { "epoch": 14.620348327277284, "grad_norm": 7.774017333984375, "learning_rate": 2.0468146053505856e-06, "loss": 0.4663, "num_input_tokens_seen": 141904640, "step": 116685 }, { "epoch": 14.620974815186067, "grad_norm": 11.056309700012207, "learning_rate": 2.0463734592063745e-06, "loss": 0.5404, "num_input_tokens_seen": 141910656, "step": 116690 }, { "epoch": 14.62160130309485, "grad_norm": 9.654356002807617, "learning_rate": 2.0459323483752536e-06, "loss": 0.5096, "num_input_tokens_seen": 141916800, "step": 116695 }, { "epoch": 14.622227791003633, "grad_norm": 3.8538620471954346, "learning_rate": 2.0454912728624932e-06, "loss": 0.4894, "num_input_tokens_seen": 141922848, "step": 116700 }, { "epoch": 14.622854278912417, "grad_norm": 3.2976040840148926, "learning_rate": 2.045050232673371e-06, "loss": 0.4569, "num_input_tokens_seen": 141928864, "step": 116705 }, { "epoch": 14.623480766821201, "grad_norm": 1.1693063974380493, "learning_rate": 2.044609227813156e-06, "loss": 0.463, "num_input_tokens_seen": 141934304, "step": 116710 }, { "epoch": 14.624107254729983, "grad_norm": 6.717918872833252, "learning_rate": 2.0441682582871214e-06, "loss": 0.4948, "num_input_tokens_seen": 141940576, "step": 116715 }, { "epoch": 14.624733742638767, "grad_norm": 15.358269691467285, "learning_rate": 2.043727324100543e-06, "loss": 0.5373, "num_input_tokens_seen": 141946112, "step": 116720 }, { "epoch": 14.62536023054755, "grad_norm": 2.026754856109619, "learning_rate": 2.0432864252586874e-06, "loss": 0.4725, "num_input_tokens_seen": 141952256, "step": 116725 }, { "epoch": 14.625986718456334, "grad_norm": 11.982600212097168, "learning_rate": 2.0428455617668314e-06, "loss": 0.53, "num_input_tokens_seen": 141958368, "step": 116730 }, { "epoch": 14.626613206365118, "grad_norm": 1.92879056930542, "learning_rate": 2.0424047336302406e-06, "loss": 0.4691, "num_input_tokens_seen": 141964320, "step": 116735 }, { "epoch": 14.6272396942739, "grad_norm": 4.0208659172058105, "learning_rate": 2.0419639408541876e-06, "loss": 0.462, "num_input_tokens_seen": 141970272, "step": 116740 }, { "epoch": 14.627866182182684, "grad_norm": 2.456291437149048, "learning_rate": 2.0415231834439446e-06, "loss": 0.4582, "num_input_tokens_seen": 141976448, "step": 116745 }, { "epoch": 14.628492670091468, "grad_norm": 8.386723518371582, "learning_rate": 2.0410824614047776e-06, "loss": 0.5117, "num_input_tokens_seen": 141982208, "step": 116750 }, { "epoch": 14.62911915800025, "grad_norm": 5.0636796951293945, "learning_rate": 2.040641774741957e-06, "loss": 0.4666, "num_input_tokens_seen": 141988000, "step": 116755 }, { "epoch": 14.629745645909034, "grad_norm": 3.1932849884033203, "learning_rate": 2.0402011234607545e-06, "loss": 0.4572, "num_input_tokens_seen": 141994080, "step": 116760 }, { "epoch": 14.630372133817817, "grad_norm": 1.933081030845642, "learning_rate": 2.039760507566434e-06, "loss": 0.4368, "num_input_tokens_seen": 142000256, "step": 116765 }, { "epoch": 14.6309986217266, "grad_norm": 2.4522275924682617, "learning_rate": 2.0393199270642653e-06, "loss": 0.4687, "num_input_tokens_seen": 142006528, "step": 116770 }, { "epoch": 14.631625109635383, "grad_norm": 1.5918750762939453, "learning_rate": 2.0388793819595182e-06, "loss": 0.4452, "num_input_tokens_seen": 142012672, "step": 116775 }, { "epoch": 14.632251597544167, "grad_norm": 1.3787795305252075, "learning_rate": 2.0384388722574555e-06, "loss": 0.4657, "num_input_tokens_seen": 142018144, "step": 116780 }, { "epoch": 14.632878085452951, "grad_norm": 1.6925020217895508, "learning_rate": 2.037998397963348e-06, "loss": 0.4758, "num_input_tokens_seen": 142023552, "step": 116785 }, { "epoch": 14.633504573361733, "grad_norm": 1.203600287437439, "learning_rate": 2.0375579590824585e-06, "loss": 0.4852, "num_input_tokens_seen": 142029888, "step": 116790 }, { "epoch": 14.634131061270518, "grad_norm": 1.922092080116272, "learning_rate": 2.037117555620056e-06, "loss": 0.4669, "num_input_tokens_seen": 142036096, "step": 116795 }, { "epoch": 14.634757549179302, "grad_norm": 2.639028310775757, "learning_rate": 2.0366771875814016e-06, "loss": 0.4476, "num_input_tokens_seen": 142042400, "step": 116800 }, { "epoch": 14.635384037088084, "grad_norm": 3.714205265045166, "learning_rate": 2.0362368549717655e-06, "loss": 0.436, "num_input_tokens_seen": 142048448, "step": 116805 }, { "epoch": 14.636010524996868, "grad_norm": 1.4857097864151, "learning_rate": 2.0357965577964072e-06, "loss": 0.4294, "num_input_tokens_seen": 142054688, "step": 116810 }, { "epoch": 14.63663701290565, "grad_norm": 1.6632241010665894, "learning_rate": 2.0353562960605954e-06, "loss": 0.5, "num_input_tokens_seen": 142060512, "step": 116815 }, { "epoch": 14.637263500814434, "grad_norm": 3.3012478351593018, "learning_rate": 2.0349160697695904e-06, "loss": 0.4462, "num_input_tokens_seen": 142066944, "step": 116820 }, { "epoch": 14.637889988723218, "grad_norm": 1.6629070043563843, "learning_rate": 2.0344758789286574e-06, "loss": 0.4254, "num_input_tokens_seen": 142072960, "step": 116825 }, { "epoch": 14.638516476632, "grad_norm": 3.6359565258026123, "learning_rate": 2.034035723543057e-06, "loss": 0.4667, "num_input_tokens_seen": 142078944, "step": 116830 }, { "epoch": 14.639142964540785, "grad_norm": 1.6924138069152832, "learning_rate": 2.033595603618054e-06, "loss": 0.444, "num_input_tokens_seen": 142085280, "step": 116835 }, { "epoch": 14.639769452449567, "grad_norm": 4.626288414001465, "learning_rate": 2.03315551915891e-06, "loss": 0.5186, "num_input_tokens_seen": 142091328, "step": 116840 }, { "epoch": 14.640395940358351, "grad_norm": 4.150033473968506, "learning_rate": 2.032715470170885e-06, "loss": 0.4388, "num_input_tokens_seen": 142097408, "step": 116845 }, { "epoch": 14.641022428267135, "grad_norm": 4.07596492767334, "learning_rate": 2.032275456659242e-06, "loss": 0.4422, "num_input_tokens_seen": 142103136, "step": 116850 }, { "epoch": 14.641648916175917, "grad_norm": 8.034207344055176, "learning_rate": 2.03183547862924e-06, "loss": 0.5104, "num_input_tokens_seen": 142109248, "step": 116855 }, { "epoch": 14.642275404084701, "grad_norm": 2.8168609142303467, "learning_rate": 2.0313955360861425e-06, "loss": 0.5121, "num_input_tokens_seen": 142115808, "step": 116860 }, { "epoch": 14.642901891993485, "grad_norm": 2.2254292964935303, "learning_rate": 2.0309556290352056e-06, "loss": 0.4628, "num_input_tokens_seen": 142121888, "step": 116865 }, { "epoch": 14.643528379902268, "grad_norm": 6.312901020050049, "learning_rate": 2.030515757481692e-06, "loss": 0.4784, "num_input_tokens_seen": 142127840, "step": 116870 }, { "epoch": 14.644154867811052, "grad_norm": 5.195350170135498, "learning_rate": 2.0300759214308573e-06, "loss": 0.5, "num_input_tokens_seen": 142133824, "step": 116875 }, { "epoch": 14.644781355719834, "grad_norm": 1.8586466312408447, "learning_rate": 2.029636120887964e-06, "loss": 0.4557, "num_input_tokens_seen": 142140032, "step": 116880 }, { "epoch": 14.645407843628618, "grad_norm": 1.684805154800415, "learning_rate": 2.0291963558582663e-06, "loss": 0.4264, "num_input_tokens_seen": 142146112, "step": 116885 }, { "epoch": 14.6460343315374, "grad_norm": 12.351815223693848, "learning_rate": 2.028756626347026e-06, "loss": 0.5121, "num_input_tokens_seen": 142152032, "step": 116890 }, { "epoch": 14.646660819446184, "grad_norm": 6.104981899261475, "learning_rate": 2.0283169323594957e-06, "loss": 0.4365, "num_input_tokens_seen": 142157984, "step": 116895 }, { "epoch": 14.647287307354969, "grad_norm": 10.877589225769043, "learning_rate": 2.0278772739009357e-06, "loss": 0.5623, "num_input_tokens_seen": 142164320, "step": 116900 }, { "epoch": 14.64791379526375, "grad_norm": 3.6400961875915527, "learning_rate": 2.027437650976603e-06, "loss": 0.4541, "num_input_tokens_seen": 142170560, "step": 116905 }, { "epoch": 14.648540283172535, "grad_norm": 1.4440948963165283, "learning_rate": 2.026998063591751e-06, "loss": 0.4545, "num_input_tokens_seen": 142176768, "step": 116910 }, { "epoch": 14.649166771081319, "grad_norm": 7.02486515045166, "learning_rate": 2.0265585117516377e-06, "loss": 0.5125, "num_input_tokens_seen": 142182784, "step": 116915 }, { "epoch": 14.649793258990101, "grad_norm": 5.684693813323975, "learning_rate": 2.026118995461517e-06, "loss": 0.5129, "num_input_tokens_seen": 142188768, "step": 116920 }, { "epoch": 14.650419746898885, "grad_norm": 2.0319035053253174, "learning_rate": 2.0256795147266445e-06, "loss": 0.4301, "num_input_tokens_seen": 142194496, "step": 116925 }, { "epoch": 14.651046234807668, "grad_norm": 5.561419486999512, "learning_rate": 2.0252400695522728e-06, "loss": 0.4255, "num_input_tokens_seen": 142200480, "step": 116930 }, { "epoch": 14.651672722716452, "grad_norm": 1.971092939376831, "learning_rate": 2.0248006599436587e-06, "loss": 0.4279, "num_input_tokens_seen": 142206144, "step": 116935 }, { "epoch": 14.652299210625236, "grad_norm": 3.904719829559326, "learning_rate": 2.0243612859060526e-06, "loss": 0.4406, "num_input_tokens_seen": 142212480, "step": 116940 }, { "epoch": 14.652925698534018, "grad_norm": 4.629012107849121, "learning_rate": 2.0239219474447087e-06, "loss": 0.4286, "num_input_tokens_seen": 142218368, "step": 116945 }, { "epoch": 14.653552186442802, "grad_norm": 7.466657638549805, "learning_rate": 2.023482644564882e-06, "loss": 0.4385, "num_input_tokens_seen": 142224576, "step": 116950 }, { "epoch": 14.654178674351584, "grad_norm": 2.344169855117798, "learning_rate": 2.023043377271821e-06, "loss": 0.4538, "num_input_tokens_seen": 142230464, "step": 116955 }, { "epoch": 14.654805162260368, "grad_norm": 1.836089849472046, "learning_rate": 2.0226041455707795e-06, "loss": 0.5701, "num_input_tokens_seen": 142236800, "step": 116960 }, { "epoch": 14.655431650169152, "grad_norm": 1.4172595739364624, "learning_rate": 2.0221649494670088e-06, "loss": 0.4227, "num_input_tokens_seen": 142243008, "step": 116965 }, { "epoch": 14.656058138077935, "grad_norm": 1.4775848388671875, "learning_rate": 2.0217257889657616e-06, "loss": 0.508, "num_input_tokens_seen": 142249344, "step": 116970 }, { "epoch": 14.656684625986719, "grad_norm": 2.173175573348999, "learning_rate": 2.0212866640722845e-06, "loss": 0.4334, "num_input_tokens_seen": 142255488, "step": 116975 }, { "epoch": 14.657311113895501, "grad_norm": 4.612349033355713, "learning_rate": 2.0208475747918316e-06, "loss": 0.4856, "num_input_tokens_seen": 142261440, "step": 116980 }, { "epoch": 14.657937601804285, "grad_norm": 13.312496185302734, "learning_rate": 2.0204085211296486e-06, "loss": 0.537, "num_input_tokens_seen": 142267424, "step": 116985 }, { "epoch": 14.65856408971307, "grad_norm": 1.8728790283203125, "learning_rate": 2.019969503090989e-06, "loss": 0.5475, "num_input_tokens_seen": 142273344, "step": 116990 }, { "epoch": 14.659190577621851, "grad_norm": 7.900208473205566, "learning_rate": 2.019530520681097e-06, "loss": 0.5182, "num_input_tokens_seen": 142279424, "step": 116995 }, { "epoch": 14.659817065530635, "grad_norm": 6.540067195892334, "learning_rate": 2.0190915739052263e-06, "loss": 0.4586, "num_input_tokens_seen": 142285696, "step": 117000 }, { "epoch": 14.660443553439418, "grad_norm": 2.5085628032684326, "learning_rate": 2.01865266276862e-06, "loss": 0.5457, "num_input_tokens_seen": 142292000, "step": 117005 }, { "epoch": 14.661070041348202, "grad_norm": 1.0158801078796387, "learning_rate": 2.018213787276529e-06, "loss": 0.46, "num_input_tokens_seen": 142297440, "step": 117010 }, { "epoch": 14.661696529256986, "grad_norm": 0.9201894402503967, "learning_rate": 2.0177749474341974e-06, "loss": 0.4483, "num_input_tokens_seen": 142303616, "step": 117015 }, { "epoch": 14.662323017165768, "grad_norm": 5.158227920532227, "learning_rate": 2.0173361432468738e-06, "loss": 0.4759, "num_input_tokens_seen": 142309472, "step": 117020 }, { "epoch": 14.662949505074552, "grad_norm": 2.1266112327575684, "learning_rate": 2.016897374719806e-06, "loss": 0.5794, "num_input_tokens_seen": 142315552, "step": 117025 }, { "epoch": 14.663575992983336, "grad_norm": 5.405184745788574, "learning_rate": 2.0164586418582365e-06, "loss": 0.5231, "num_input_tokens_seen": 142321952, "step": 117030 }, { "epoch": 14.664202480892119, "grad_norm": 1.2303531169891357, "learning_rate": 2.016019944667414e-06, "loss": 0.4285, "num_input_tokens_seen": 142327936, "step": 117035 }, { "epoch": 14.664828968800903, "grad_norm": 2.605095624923706, "learning_rate": 2.01558128315258e-06, "loss": 0.4735, "num_input_tokens_seen": 142334272, "step": 117040 }, { "epoch": 14.665455456709685, "grad_norm": 4.480688571929932, "learning_rate": 2.015142657318983e-06, "loss": 0.4755, "num_input_tokens_seen": 142340544, "step": 117045 }, { "epoch": 14.666081944618469, "grad_norm": 4.797887802124023, "learning_rate": 2.014704067171863e-06, "loss": 0.4708, "num_input_tokens_seen": 142346976, "step": 117050 }, { "epoch": 14.666708432527253, "grad_norm": 1.4562113285064697, "learning_rate": 2.0142655127164655e-06, "loss": 0.4476, "num_input_tokens_seen": 142352640, "step": 117055 }, { "epoch": 14.667334920436035, "grad_norm": 10.603913307189941, "learning_rate": 2.0138269939580346e-06, "loss": 0.5002, "num_input_tokens_seen": 142358368, "step": 117060 }, { "epoch": 14.66796140834482, "grad_norm": 2.5939908027648926, "learning_rate": 2.013388510901815e-06, "loss": 0.4448, "num_input_tokens_seen": 142364608, "step": 117065 }, { "epoch": 14.668587896253602, "grad_norm": 2.188774585723877, "learning_rate": 2.0129500635530437e-06, "loss": 0.4531, "num_input_tokens_seen": 142370880, "step": 117070 }, { "epoch": 14.669214384162386, "grad_norm": 2.1276793479919434, "learning_rate": 2.012511651916968e-06, "loss": 0.4257, "num_input_tokens_seen": 142377024, "step": 117075 }, { "epoch": 14.66984087207117, "grad_norm": 3.0740208625793457, "learning_rate": 2.0120732759988254e-06, "loss": 0.5264, "num_input_tokens_seen": 142383168, "step": 117080 }, { "epoch": 14.670467359979952, "grad_norm": 1.8458389043807983, "learning_rate": 2.0116349358038594e-06, "loss": 0.455, "num_input_tokens_seen": 142389536, "step": 117085 }, { "epoch": 14.671093847888736, "grad_norm": 4.100248336791992, "learning_rate": 2.0111966313373115e-06, "loss": 0.4695, "num_input_tokens_seen": 142395712, "step": 117090 }, { "epoch": 14.671720335797518, "grad_norm": 1.7138044834136963, "learning_rate": 2.0107583626044192e-06, "loss": 0.4403, "num_input_tokens_seen": 142401728, "step": 117095 }, { "epoch": 14.672346823706302, "grad_norm": 2.0989980697631836, "learning_rate": 2.0103201296104254e-06, "loss": 0.4979, "num_input_tokens_seen": 142408224, "step": 117100 }, { "epoch": 14.672973311615086, "grad_norm": 1.671359896659851, "learning_rate": 2.0098819323605664e-06, "loss": 0.4568, "num_input_tokens_seen": 142414272, "step": 117105 }, { "epoch": 14.673599799523869, "grad_norm": 8.215669631958008, "learning_rate": 2.0094437708600844e-06, "loss": 0.4458, "num_input_tokens_seen": 142419968, "step": 117110 }, { "epoch": 14.674226287432653, "grad_norm": 2.1482656002044678, "learning_rate": 2.0090056451142154e-06, "loss": 0.5027, "num_input_tokens_seen": 142426112, "step": 117115 }, { "epoch": 14.674852775341435, "grad_norm": 6.916046619415283, "learning_rate": 2.0085675551281996e-06, "loss": 0.4783, "num_input_tokens_seen": 142431904, "step": 117120 }, { "epoch": 14.67547926325022, "grad_norm": 2.7010550498962402, "learning_rate": 2.008129500907272e-06, "loss": 0.5038, "num_input_tokens_seen": 142438112, "step": 117125 }, { "epoch": 14.676105751159003, "grad_norm": 1.8214930295944214, "learning_rate": 2.007691482456674e-06, "loss": 0.4902, "num_input_tokens_seen": 142444512, "step": 117130 }, { "epoch": 14.676732239067785, "grad_norm": 4.487683296203613, "learning_rate": 2.007253499781638e-06, "loss": 0.4412, "num_input_tokens_seen": 142450880, "step": 117135 }, { "epoch": 14.67735872697657, "grad_norm": 1.0915311574935913, "learning_rate": 2.006815552887404e-06, "loss": 0.4261, "num_input_tokens_seen": 142456768, "step": 117140 }, { "epoch": 14.677985214885354, "grad_norm": 1.8575255870819092, "learning_rate": 2.006377641779205e-06, "loss": 0.4379, "num_input_tokens_seen": 142462528, "step": 117145 }, { "epoch": 14.678611702794136, "grad_norm": 4.976967811584473, "learning_rate": 2.0059397664622786e-06, "loss": 0.5179, "num_input_tokens_seen": 142468960, "step": 117150 }, { "epoch": 14.67923819070292, "grad_norm": 1.6498208045959473, "learning_rate": 2.0055019269418614e-06, "loss": 0.459, "num_input_tokens_seen": 142475168, "step": 117155 }, { "epoch": 14.679864678611702, "grad_norm": 10.389432907104492, "learning_rate": 2.005064123223184e-06, "loss": 0.4569, "num_input_tokens_seen": 142481216, "step": 117160 }, { "epoch": 14.680491166520486, "grad_norm": 2.1385111808776855, "learning_rate": 2.004626355311483e-06, "loss": 0.5782, "num_input_tokens_seen": 142487040, "step": 117165 }, { "epoch": 14.68111765442927, "grad_norm": 2.4928061962127686, "learning_rate": 2.004188623211993e-06, "loss": 0.5318, "num_input_tokens_seen": 142493152, "step": 117170 }, { "epoch": 14.681744142338053, "grad_norm": 8.001646041870117, "learning_rate": 2.0037509269299476e-06, "loss": 0.5221, "num_input_tokens_seen": 142499296, "step": 117175 }, { "epoch": 14.682370630246837, "grad_norm": 10.7969388961792, "learning_rate": 2.003313266470578e-06, "loss": 0.4586, "num_input_tokens_seen": 142505600, "step": 117180 }, { "epoch": 14.682997118155619, "grad_norm": 1.4600098133087158, "learning_rate": 2.0028756418391187e-06, "loss": 0.5051, "num_input_tokens_seen": 142511552, "step": 117185 }, { "epoch": 14.683623606064403, "grad_norm": 6.485509872436523, "learning_rate": 2.0024380530408e-06, "loss": 0.4751, "num_input_tokens_seen": 142518016, "step": 117190 }, { "epoch": 14.684250093973187, "grad_norm": 1.4238018989562988, "learning_rate": 2.0020005000808563e-06, "loss": 0.4603, "num_input_tokens_seen": 142524288, "step": 117195 }, { "epoch": 14.68487658188197, "grad_norm": 1.3101348876953125, "learning_rate": 2.001562982964515e-06, "loss": 0.4519, "num_input_tokens_seen": 142530528, "step": 117200 }, { "epoch": 14.685503069790753, "grad_norm": 4.27968692779541, "learning_rate": 2.0011255016970093e-06, "loss": 0.5246, "num_input_tokens_seen": 142536608, "step": 117205 }, { "epoch": 14.686129557699536, "grad_norm": 2.44118070602417, "learning_rate": 2.000688056283571e-06, "loss": 0.4872, "num_input_tokens_seen": 142542912, "step": 117210 }, { "epoch": 14.68675604560832, "grad_norm": 12.974506378173828, "learning_rate": 2.0002506467294276e-06, "loss": 0.5201, "num_input_tokens_seen": 142549024, "step": 117215 }, { "epoch": 14.687382533517104, "grad_norm": 2.7616920471191406, "learning_rate": 1.999813273039811e-06, "loss": 0.4789, "num_input_tokens_seen": 142555456, "step": 117220 }, { "epoch": 14.688009021425886, "grad_norm": 1.1074491739273071, "learning_rate": 1.9993759352199477e-06, "loss": 0.4456, "num_input_tokens_seen": 142561248, "step": 117225 }, { "epoch": 14.68863550933467, "grad_norm": 1.774381399154663, "learning_rate": 1.998938633275069e-06, "loss": 0.4419, "num_input_tokens_seen": 142567488, "step": 117230 }, { "epoch": 14.689261997243452, "grad_norm": 4.696117401123047, "learning_rate": 1.9985013672104013e-06, "loss": 0.4963, "num_input_tokens_seen": 142573440, "step": 117235 }, { "epoch": 14.689888485152236, "grad_norm": 3.3928444385528564, "learning_rate": 1.9980641370311744e-06, "loss": 0.4654, "num_input_tokens_seen": 142579680, "step": 117240 }, { "epoch": 14.69051497306102, "grad_norm": 2.4134626388549805, "learning_rate": 1.997626942742613e-06, "loss": 0.4777, "num_input_tokens_seen": 142585600, "step": 117245 }, { "epoch": 14.691141460969803, "grad_norm": 2.5830605030059814, "learning_rate": 1.9971897843499478e-06, "loss": 0.4406, "num_input_tokens_seen": 142591872, "step": 117250 }, { "epoch": 14.691767948878587, "grad_norm": 1.355689287185669, "learning_rate": 1.9967526618584016e-06, "loss": 0.4819, "num_input_tokens_seen": 142598048, "step": 117255 }, { "epoch": 14.692394436787371, "grad_norm": 1.2914364337921143, "learning_rate": 1.9963155752732017e-06, "loss": 0.5014, "num_input_tokens_seen": 142604256, "step": 117260 }, { "epoch": 14.693020924696153, "grad_norm": 3.342317819595337, "learning_rate": 1.995878524599575e-06, "loss": 0.4658, "num_input_tokens_seen": 142610368, "step": 117265 }, { "epoch": 14.693647412604937, "grad_norm": 1.7757869958877563, "learning_rate": 1.9954415098427475e-06, "loss": 0.407, "num_input_tokens_seen": 142616640, "step": 117270 }, { "epoch": 14.69427390051372, "grad_norm": 4.928986549377441, "learning_rate": 1.9950045310079418e-06, "loss": 0.4731, "num_input_tokens_seen": 142622848, "step": 117275 }, { "epoch": 14.694900388422504, "grad_norm": 1.9022352695465088, "learning_rate": 1.994567588100383e-06, "loss": 0.4415, "num_input_tokens_seen": 142628768, "step": 117280 }, { "epoch": 14.695526876331288, "grad_norm": 3.084123134613037, "learning_rate": 1.994130681125297e-06, "loss": 0.4605, "num_input_tokens_seen": 142634848, "step": 117285 }, { "epoch": 14.69615336424007, "grad_norm": 5.852829456329346, "learning_rate": 1.9936938100879045e-06, "loss": 0.5035, "num_input_tokens_seen": 142640800, "step": 117290 }, { "epoch": 14.696779852148854, "grad_norm": 1.7682325839996338, "learning_rate": 1.993256974993432e-06, "loss": 0.4265, "num_input_tokens_seen": 142647328, "step": 117295 }, { "epoch": 14.697406340057636, "grad_norm": 2.871460199356079, "learning_rate": 1.9928201758470984e-06, "loss": 0.5107, "num_input_tokens_seen": 142653472, "step": 117300 }, { "epoch": 14.69803282796642, "grad_norm": 3.791119337081909, "learning_rate": 1.992383412654129e-06, "loss": 0.5142, "num_input_tokens_seen": 142659776, "step": 117305 }, { "epoch": 14.698659315875204, "grad_norm": 1.7990343570709229, "learning_rate": 1.991946685419744e-06, "loss": 0.4488, "num_input_tokens_seen": 142665728, "step": 117310 }, { "epoch": 14.699285803783987, "grad_norm": 1.8518112897872925, "learning_rate": 1.9915099941491663e-06, "loss": 0.4857, "num_input_tokens_seen": 142671840, "step": 117315 }, { "epoch": 14.69991229169277, "grad_norm": 1.4191322326660156, "learning_rate": 1.9910733388476144e-06, "loss": 0.4529, "num_input_tokens_seen": 142678208, "step": 117320 }, { "epoch": 14.700538779601553, "grad_norm": 3.0742812156677246, "learning_rate": 1.9906367195203124e-06, "loss": 0.4438, "num_input_tokens_seen": 142684352, "step": 117325 }, { "epoch": 14.701165267510337, "grad_norm": 5.873449802398682, "learning_rate": 1.9902001361724763e-06, "loss": 0.4721, "num_input_tokens_seen": 142690528, "step": 117330 }, { "epoch": 14.701791755419121, "grad_norm": 3.3618593215942383, "learning_rate": 1.989763588809329e-06, "loss": 0.4345, "num_input_tokens_seen": 142696448, "step": 117335 }, { "epoch": 14.702418243327903, "grad_norm": 1.5865308046340942, "learning_rate": 1.9893270774360903e-06, "loss": 0.4462, "num_input_tokens_seen": 142702304, "step": 117340 }, { "epoch": 14.703044731236687, "grad_norm": 7.542102336883545, "learning_rate": 1.9888906020579757e-06, "loss": 0.4576, "num_input_tokens_seen": 142708768, "step": 117345 }, { "epoch": 14.70367121914547, "grad_norm": 1.6631735563278198, "learning_rate": 1.9884541626802073e-06, "loss": 0.4431, "num_input_tokens_seen": 142714880, "step": 117350 }, { "epoch": 14.704297707054254, "grad_norm": 2.7130649089813232, "learning_rate": 1.988017759308e-06, "loss": 0.4705, "num_input_tokens_seen": 142721024, "step": 117355 }, { "epoch": 14.704924194963038, "grad_norm": 2.1099491119384766, "learning_rate": 1.987581391946572e-06, "loss": 0.466, "num_input_tokens_seen": 142727296, "step": 117360 }, { "epoch": 14.70555068287182, "grad_norm": 1.6754990816116333, "learning_rate": 1.9871450606011426e-06, "loss": 0.4635, "num_input_tokens_seen": 142733248, "step": 117365 }, { "epoch": 14.706177170780604, "grad_norm": 4.325772285461426, "learning_rate": 1.9867087652769263e-06, "loss": 0.4743, "num_input_tokens_seen": 142738624, "step": 117370 }, { "epoch": 14.706803658689388, "grad_norm": 1.6253912448883057, "learning_rate": 1.98627250597914e-06, "loss": 0.418, "num_input_tokens_seen": 142744768, "step": 117375 }, { "epoch": 14.70743014659817, "grad_norm": 2.1104531288146973, "learning_rate": 1.9858362827130012e-06, "loss": 0.4493, "num_input_tokens_seen": 142750560, "step": 117380 }, { "epoch": 14.708056634506955, "grad_norm": 12.125448226928711, "learning_rate": 1.985400095483722e-06, "loss": 0.4914, "num_input_tokens_seen": 142756832, "step": 117385 }, { "epoch": 14.708683122415737, "grad_norm": 5.183652400970459, "learning_rate": 1.9849639442965193e-06, "loss": 0.4538, "num_input_tokens_seen": 142762848, "step": 117390 }, { "epoch": 14.709309610324521, "grad_norm": 1.696529507637024, "learning_rate": 1.9845278291566095e-06, "loss": 0.4176, "num_input_tokens_seen": 142769248, "step": 117395 }, { "epoch": 14.709936098233303, "grad_norm": 8.138124465942383, "learning_rate": 1.9840917500692027e-06, "loss": 0.5967, "num_input_tokens_seen": 142775008, "step": 117400 }, { "epoch": 14.710562586142087, "grad_norm": 7.113468647003174, "learning_rate": 1.9836557070395167e-06, "loss": 0.531, "num_input_tokens_seen": 142781216, "step": 117405 }, { "epoch": 14.711189074050871, "grad_norm": 1.5256444215774536, "learning_rate": 1.9832197000727604e-06, "loss": 0.4525, "num_input_tokens_seen": 142786976, "step": 117410 }, { "epoch": 14.711815561959654, "grad_norm": 3.855896234512329, "learning_rate": 1.9827837291741513e-06, "loss": 0.4255, "num_input_tokens_seen": 142793216, "step": 117415 }, { "epoch": 14.712442049868438, "grad_norm": 3.293159246444702, "learning_rate": 1.982347794348898e-06, "loss": 0.4434, "num_input_tokens_seen": 142799200, "step": 117420 }, { "epoch": 14.713068537777222, "grad_norm": 1.6643402576446533, "learning_rate": 1.9819118956022147e-06, "loss": 0.4591, "num_input_tokens_seen": 142805504, "step": 117425 }, { "epoch": 14.713695025686004, "grad_norm": 3.2532613277435303, "learning_rate": 1.981476032939311e-06, "loss": 0.5014, "num_input_tokens_seen": 142811424, "step": 117430 }, { "epoch": 14.714321513594788, "grad_norm": 4.120553493499756, "learning_rate": 1.9810402063654012e-06, "loss": 0.4533, "num_input_tokens_seen": 142817408, "step": 117435 }, { "epoch": 14.71494800150357, "grad_norm": 1.673218011856079, "learning_rate": 1.9806044158856913e-06, "loss": 0.4868, "num_input_tokens_seen": 142823712, "step": 117440 }, { "epoch": 14.715574489412354, "grad_norm": 2.362643003463745, "learning_rate": 1.980168661505397e-06, "loss": 0.461, "num_input_tokens_seen": 142829792, "step": 117445 }, { "epoch": 14.716200977321138, "grad_norm": 1.5363441705703735, "learning_rate": 1.9797329432297226e-06, "loss": 0.486, "num_input_tokens_seen": 142835808, "step": 117450 }, { "epoch": 14.71682746522992, "grad_norm": 7.343235492706299, "learning_rate": 1.9792972610638806e-06, "loss": 0.497, "num_input_tokens_seen": 142841888, "step": 117455 }, { "epoch": 14.717453953138705, "grad_norm": 1.6848695278167725, "learning_rate": 1.978861615013081e-06, "loss": 0.4203, "num_input_tokens_seen": 142848224, "step": 117460 }, { "epoch": 14.718080441047487, "grad_norm": 2.130929470062256, "learning_rate": 1.978426005082529e-06, "loss": 0.4302, "num_input_tokens_seen": 142853920, "step": 117465 }, { "epoch": 14.718706928956271, "grad_norm": 1.8763997554779053, "learning_rate": 1.977990431277435e-06, "loss": 0.4416, "num_input_tokens_seen": 142859968, "step": 117470 }, { "epoch": 14.719333416865055, "grad_norm": 1.6987026929855347, "learning_rate": 1.977554893603006e-06, "loss": 0.486, "num_input_tokens_seen": 142866272, "step": 117475 }, { "epoch": 14.719959904773837, "grad_norm": 1.2804253101348877, "learning_rate": 1.977119392064451e-06, "loss": 0.4454, "num_input_tokens_seen": 142872320, "step": 117480 }, { "epoch": 14.720586392682621, "grad_norm": 1.3678913116455078, "learning_rate": 1.9766839266669736e-06, "loss": 0.464, "num_input_tokens_seen": 142878496, "step": 117485 }, { "epoch": 14.721212880591404, "grad_norm": 6.286864757537842, "learning_rate": 1.976248497415783e-06, "loss": 0.4981, "num_input_tokens_seen": 142884928, "step": 117490 }, { "epoch": 14.721839368500188, "grad_norm": 1.6031816005706787, "learning_rate": 1.975813104316083e-06, "loss": 0.4697, "num_input_tokens_seen": 142891136, "step": 117495 }, { "epoch": 14.722465856408972, "grad_norm": 1.9831403493881226, "learning_rate": 1.9753777473730808e-06, "loss": 0.4403, "num_input_tokens_seen": 142897376, "step": 117500 }, { "epoch": 14.723092344317754, "grad_norm": 7.692092418670654, "learning_rate": 1.9749424265919793e-06, "loss": 0.4895, "num_input_tokens_seen": 142903456, "step": 117505 }, { "epoch": 14.723718832226538, "grad_norm": 12.800573348999023, "learning_rate": 1.974507141977986e-06, "loss": 0.6788, "num_input_tokens_seen": 142910016, "step": 117510 }, { "epoch": 14.72434532013532, "grad_norm": 7.180987358093262, "learning_rate": 1.9740718935363017e-06, "loss": 0.4811, "num_input_tokens_seen": 142916288, "step": 117515 }, { "epoch": 14.724971808044105, "grad_norm": 5.211909294128418, "learning_rate": 1.973636681272132e-06, "loss": 0.4678, "num_input_tokens_seen": 142922464, "step": 117520 }, { "epoch": 14.725598295952889, "grad_norm": 1.5573467016220093, "learning_rate": 1.9732015051906822e-06, "loss": 0.4892, "num_input_tokens_seen": 142928736, "step": 117525 }, { "epoch": 14.72622478386167, "grad_norm": 5.1518659591674805, "learning_rate": 1.9727663652971513e-06, "loss": 0.4921, "num_input_tokens_seen": 142934816, "step": 117530 }, { "epoch": 14.726851271770455, "grad_norm": 2.7999846935272217, "learning_rate": 1.9723312615967457e-06, "loss": 0.461, "num_input_tokens_seen": 142940928, "step": 117535 }, { "epoch": 14.727477759679239, "grad_norm": 1.165034294128418, "learning_rate": 1.971896194094663e-06, "loss": 0.4178, "num_input_tokens_seen": 142946976, "step": 117540 }, { "epoch": 14.728104247588021, "grad_norm": 3.7823948860168457, "learning_rate": 1.9714611627961094e-06, "loss": 0.525, "num_input_tokens_seen": 142952992, "step": 117545 }, { "epoch": 14.728730735496805, "grad_norm": 2.987565755844116, "learning_rate": 1.971026167706282e-06, "loss": 0.5299, "num_input_tokens_seen": 142958720, "step": 117550 }, { "epoch": 14.729357223405588, "grad_norm": 1.2826297283172607, "learning_rate": 1.970591208830385e-06, "loss": 0.4351, "num_input_tokens_seen": 142964704, "step": 117555 }, { "epoch": 14.729983711314372, "grad_norm": 9.800980567932129, "learning_rate": 1.9701562861736157e-06, "loss": 0.5162, "num_input_tokens_seen": 142970688, "step": 117560 }, { "epoch": 14.730610199223156, "grad_norm": 5.769771099090576, "learning_rate": 1.9697213997411748e-06, "loss": 0.4789, "num_input_tokens_seen": 142976384, "step": 117565 }, { "epoch": 14.731236687131938, "grad_norm": 10.40443229675293, "learning_rate": 1.9692865495382646e-06, "loss": 0.4897, "num_input_tokens_seen": 142983040, "step": 117570 }, { "epoch": 14.731863175040722, "grad_norm": 1.260753870010376, "learning_rate": 1.9688517355700793e-06, "loss": 0.4522, "num_input_tokens_seen": 142989440, "step": 117575 }, { "epoch": 14.732489662949504, "grad_norm": 1.7326968908309937, "learning_rate": 1.9684169578418204e-06, "loss": 0.4408, "num_input_tokens_seen": 142995584, "step": 117580 }, { "epoch": 14.733116150858288, "grad_norm": 1.4502151012420654, "learning_rate": 1.9679822163586858e-06, "loss": 0.4419, "num_input_tokens_seen": 143001472, "step": 117585 }, { "epoch": 14.733742638767072, "grad_norm": 1.6795201301574707, "learning_rate": 1.9675475111258745e-06, "loss": 0.4505, "num_input_tokens_seen": 143007552, "step": 117590 }, { "epoch": 14.734369126675855, "grad_norm": 7.124711990356445, "learning_rate": 1.967112842148581e-06, "loss": 0.5162, "num_input_tokens_seen": 143013952, "step": 117595 }, { "epoch": 14.734995614584639, "grad_norm": 2.8011488914489746, "learning_rate": 1.966678209432004e-06, "loss": 0.4142, "num_input_tokens_seen": 143019936, "step": 117600 }, { "epoch": 14.735622102493421, "grad_norm": 2.2756967544555664, "learning_rate": 1.966243612981338e-06, "loss": 0.4353, "num_input_tokens_seen": 143026336, "step": 117605 }, { "epoch": 14.736248590402205, "grad_norm": 5.834819793701172, "learning_rate": 1.9658090528017824e-06, "loss": 0.4389, "num_input_tokens_seen": 143032416, "step": 117610 }, { "epoch": 14.73687507831099, "grad_norm": 1.3368475437164307, "learning_rate": 1.965374528898529e-06, "loss": 0.4678, "num_input_tokens_seen": 143038528, "step": 117615 }, { "epoch": 14.737501566219771, "grad_norm": 13.648441314697266, "learning_rate": 1.9649400412767755e-06, "loss": 0.4829, "num_input_tokens_seen": 143044640, "step": 117620 }, { "epoch": 14.738128054128556, "grad_norm": 1.6422250270843506, "learning_rate": 1.964505589941714e-06, "loss": 0.4808, "num_input_tokens_seen": 143050720, "step": 117625 }, { "epoch": 14.738754542037338, "grad_norm": 1.4266904592514038, "learning_rate": 1.9640711748985425e-06, "loss": 0.507, "num_input_tokens_seen": 143057120, "step": 117630 }, { "epoch": 14.739381029946122, "grad_norm": 2.2955687046051025, "learning_rate": 1.9636367961524505e-06, "loss": 0.4899, "num_input_tokens_seen": 143063328, "step": 117635 }, { "epoch": 14.740007517854906, "grad_norm": 2.828927993774414, "learning_rate": 1.9632024537086337e-06, "loss": 0.4617, "num_input_tokens_seen": 143069568, "step": 117640 }, { "epoch": 14.740634005763688, "grad_norm": 2.011870861053467, "learning_rate": 1.9627681475722858e-06, "loss": 0.4395, "num_input_tokens_seen": 143075552, "step": 117645 }, { "epoch": 14.741260493672472, "grad_norm": 8.403194427490234, "learning_rate": 1.9623338777485973e-06, "loss": 0.5071, "num_input_tokens_seen": 143081920, "step": 117650 }, { "epoch": 14.741886981581256, "grad_norm": 5.706380367279053, "learning_rate": 1.9618996442427624e-06, "loss": 0.4815, "num_input_tokens_seen": 143087040, "step": 117655 }, { "epoch": 14.742513469490039, "grad_norm": 2.515207529067993, "learning_rate": 1.9614654470599703e-06, "loss": 0.4971, "num_input_tokens_seen": 143093056, "step": 117660 }, { "epoch": 14.743139957398823, "grad_norm": 1.6203728914260864, "learning_rate": 1.961031286205415e-06, "loss": 0.4668, "num_input_tokens_seen": 143098496, "step": 117665 }, { "epoch": 14.743766445307605, "grad_norm": 1.8468199968338013, "learning_rate": 1.9605971616842833e-06, "loss": 0.4732, "num_input_tokens_seen": 143104544, "step": 117670 }, { "epoch": 14.744392933216389, "grad_norm": 2.483797073364258, "learning_rate": 1.9601630735017684e-06, "loss": 0.4585, "num_input_tokens_seen": 143110720, "step": 117675 }, { "epoch": 14.745019421125173, "grad_norm": 12.646228790283203, "learning_rate": 1.9597290216630597e-06, "loss": 0.5372, "num_input_tokens_seen": 143116256, "step": 117680 }, { "epoch": 14.745645909033955, "grad_norm": 2.870086431503296, "learning_rate": 1.959295006173349e-06, "loss": 0.4857, "num_input_tokens_seen": 143122208, "step": 117685 }, { "epoch": 14.74627239694274, "grad_norm": 1.8388594388961792, "learning_rate": 1.9588610270378206e-06, "loss": 0.5197, "num_input_tokens_seen": 143128032, "step": 117690 }, { "epoch": 14.746898884851522, "grad_norm": 2.5165817737579346, "learning_rate": 1.9584270842616677e-06, "loss": 0.4764, "num_input_tokens_seen": 143133856, "step": 117695 }, { "epoch": 14.747525372760306, "grad_norm": 6.811651229858398, "learning_rate": 1.9579931778500736e-06, "loss": 0.5271, "num_input_tokens_seen": 143140000, "step": 117700 }, { "epoch": 14.74815186066909, "grad_norm": 1.456017017364502, "learning_rate": 1.9575593078082295e-06, "loss": 0.454, "num_input_tokens_seen": 143146176, "step": 117705 }, { "epoch": 14.748778348577872, "grad_norm": 1.2654709815979004, "learning_rate": 1.957125474141323e-06, "loss": 0.4647, "num_input_tokens_seen": 143152448, "step": 117710 }, { "epoch": 14.749404836486656, "grad_norm": 1.4782794713974, "learning_rate": 1.9566916768545386e-06, "loss": 0.4426, "num_input_tokens_seen": 143158048, "step": 117715 }, { "epoch": 14.750031324395438, "grad_norm": 2.2618415355682373, "learning_rate": 1.9562579159530655e-06, "loss": 0.4389, "num_input_tokens_seen": 143164352, "step": 117720 }, { "epoch": 14.750657812304222, "grad_norm": 10.714362144470215, "learning_rate": 1.9558241914420857e-06, "loss": 0.5001, "num_input_tokens_seen": 143170560, "step": 117725 }, { "epoch": 14.751284300213007, "grad_norm": 1.8737599849700928, "learning_rate": 1.9553905033267895e-06, "loss": 0.4602, "num_input_tokens_seen": 143176768, "step": 117730 }, { "epoch": 14.751910788121789, "grad_norm": 4.931023120880127, "learning_rate": 1.954956851612357e-06, "loss": 0.4385, "num_input_tokens_seen": 143182272, "step": 117735 }, { "epoch": 14.752537276030573, "grad_norm": 3.249478340148926, "learning_rate": 1.9545232363039778e-06, "loss": 0.4654, "num_input_tokens_seen": 143188512, "step": 117740 }, { "epoch": 14.753163763939355, "grad_norm": 1.8230334520339966, "learning_rate": 1.9540896574068326e-06, "loss": 0.4118, "num_input_tokens_seen": 143194784, "step": 117745 }, { "epoch": 14.75379025184814, "grad_norm": 3.795539379119873, "learning_rate": 1.9536561149261076e-06, "loss": 0.456, "num_input_tokens_seen": 143200960, "step": 117750 }, { "epoch": 14.754416739756923, "grad_norm": 1.9074034690856934, "learning_rate": 1.953222608866983e-06, "loss": 0.4702, "num_input_tokens_seen": 143206784, "step": 117755 }, { "epoch": 14.755043227665706, "grad_norm": 3.903322458267212, "learning_rate": 1.9527891392346436e-06, "loss": 0.4851, "num_input_tokens_seen": 143212960, "step": 117760 }, { "epoch": 14.75566971557449, "grad_norm": 1.9254075288772583, "learning_rate": 1.9523557060342745e-06, "loss": 0.5397, "num_input_tokens_seen": 143219200, "step": 117765 }, { "epoch": 14.756296203483274, "grad_norm": 6.845865249633789, "learning_rate": 1.9519223092710533e-06, "loss": 0.4425, "num_input_tokens_seen": 143225536, "step": 117770 }, { "epoch": 14.756922691392056, "grad_norm": 1.5087273120880127, "learning_rate": 1.951488948950163e-06, "loss": 0.4574, "num_input_tokens_seen": 143231168, "step": 117775 }, { "epoch": 14.75754917930084, "grad_norm": 3.910829782485962, "learning_rate": 1.9510556250767875e-06, "loss": 0.4988, "num_input_tokens_seen": 143236736, "step": 117780 }, { "epoch": 14.758175667209622, "grad_norm": 11.001323699951172, "learning_rate": 1.9506223376561033e-06, "loss": 0.5217, "num_input_tokens_seen": 143242336, "step": 117785 }, { "epoch": 14.758802155118406, "grad_norm": 3.5138165950775146, "learning_rate": 1.950189086693294e-06, "loss": 0.5113, "num_input_tokens_seen": 143248352, "step": 117790 }, { "epoch": 14.75942864302719, "grad_norm": 5.069687843322754, "learning_rate": 1.949755872193539e-06, "loss": 0.4765, "num_input_tokens_seen": 143254688, "step": 117795 }, { "epoch": 14.760055130935973, "grad_norm": 2.1059234142303467, "learning_rate": 1.9493226941620155e-06, "loss": 0.5396, "num_input_tokens_seen": 143261120, "step": 117800 }, { "epoch": 14.760681618844757, "grad_norm": 1.4140113592147827, "learning_rate": 1.948889552603906e-06, "loss": 0.4535, "num_input_tokens_seen": 143267328, "step": 117805 }, { "epoch": 14.761308106753539, "grad_norm": 1.6047987937927246, "learning_rate": 1.948456447524385e-06, "loss": 0.4336, "num_input_tokens_seen": 143273568, "step": 117810 }, { "epoch": 14.761934594662323, "grad_norm": 6.275028705596924, "learning_rate": 1.9480233789286345e-06, "loss": 0.5261, "num_input_tokens_seen": 143279648, "step": 117815 }, { "epoch": 14.762561082571107, "grad_norm": 1.3403252363204956, "learning_rate": 1.9475903468218288e-06, "loss": 0.4623, "num_input_tokens_seen": 143285216, "step": 117820 }, { "epoch": 14.76318757047989, "grad_norm": 1.0373607873916626, "learning_rate": 1.9471573512091474e-06, "loss": 0.477, "num_input_tokens_seen": 143291424, "step": 117825 }, { "epoch": 14.763814058388673, "grad_norm": 4.273222923278809, "learning_rate": 1.946724392095768e-06, "loss": 0.4354, "num_input_tokens_seen": 143298080, "step": 117830 }, { "epoch": 14.764440546297456, "grad_norm": 1.936191201210022, "learning_rate": 1.9462914694868633e-06, "loss": 0.5477, "num_input_tokens_seen": 143304320, "step": 117835 }, { "epoch": 14.76506703420624, "grad_norm": 7.265858173370361, "learning_rate": 1.945858583387614e-06, "loss": 0.4717, "num_input_tokens_seen": 143310304, "step": 117840 }, { "epoch": 14.765693522115024, "grad_norm": 3.7009072303771973, "learning_rate": 1.9454257338031906e-06, "loss": 0.4677, "num_input_tokens_seen": 143316256, "step": 117845 }, { "epoch": 14.766320010023806, "grad_norm": 1.272242784500122, "learning_rate": 1.944992920738773e-06, "loss": 0.4479, "num_input_tokens_seen": 143322080, "step": 117850 }, { "epoch": 14.76694649793259, "grad_norm": 1.551137089729309, "learning_rate": 1.9445601441995316e-06, "loss": 0.4137, "num_input_tokens_seen": 143328352, "step": 117855 }, { "epoch": 14.767572985841372, "grad_norm": 2.057018280029297, "learning_rate": 1.9441274041906443e-06, "loss": 0.435, "num_input_tokens_seen": 143334592, "step": 117860 }, { "epoch": 14.768199473750157, "grad_norm": 6.961371898651123, "learning_rate": 1.9436947007172814e-06, "loss": 0.5057, "num_input_tokens_seen": 143340544, "step": 117865 }, { "epoch": 14.76882596165894, "grad_norm": 2.567805528640747, "learning_rate": 1.9432620337846197e-06, "loss": 0.4586, "num_input_tokens_seen": 143346752, "step": 117870 }, { "epoch": 14.769452449567723, "grad_norm": 7.296723365783691, "learning_rate": 1.9428294033978284e-06, "loss": 0.4709, "num_input_tokens_seen": 143352896, "step": 117875 }, { "epoch": 14.770078937476507, "grad_norm": 2.141350507736206, "learning_rate": 1.942396809562082e-06, "loss": 0.4907, "num_input_tokens_seen": 143358944, "step": 117880 }, { "epoch": 14.770705425385291, "grad_norm": 5.204419136047363, "learning_rate": 1.9419642522825525e-06, "loss": 0.4948, "num_input_tokens_seen": 143365088, "step": 117885 }, { "epoch": 14.771331913294073, "grad_norm": 7.687134742736816, "learning_rate": 1.941531731564413e-06, "loss": 0.4621, "num_input_tokens_seen": 143371520, "step": 117890 }, { "epoch": 14.771958401202857, "grad_norm": 1.491087794303894, "learning_rate": 1.9410992474128323e-06, "loss": 0.4357, "num_input_tokens_seen": 143377920, "step": 117895 }, { "epoch": 14.77258488911164, "grad_norm": 1.4345523118972778, "learning_rate": 1.940666799832981e-06, "loss": 0.4377, "num_input_tokens_seen": 143384192, "step": 117900 }, { "epoch": 14.773211377020424, "grad_norm": 1.2369072437286377, "learning_rate": 1.9402343888300316e-06, "loss": 0.4704, "num_input_tokens_seen": 143390080, "step": 117905 }, { "epoch": 14.773837864929208, "grad_norm": 2.601632595062256, "learning_rate": 1.9398020144091516e-06, "loss": 0.4515, "num_input_tokens_seen": 143396288, "step": 117910 }, { "epoch": 14.77446435283799, "grad_norm": 8.654672622680664, "learning_rate": 1.9393696765755133e-06, "loss": 0.5115, "num_input_tokens_seen": 143402528, "step": 117915 }, { "epoch": 14.775090840746774, "grad_norm": 2.183727502822876, "learning_rate": 1.938937375334281e-06, "loss": 0.4317, "num_input_tokens_seen": 143408448, "step": 117920 }, { "epoch": 14.775717328655556, "grad_norm": 2.093757152557373, "learning_rate": 1.938505110690629e-06, "loss": 0.4812, "num_input_tokens_seen": 143414752, "step": 117925 }, { "epoch": 14.77634381656434, "grad_norm": 10.002802848815918, "learning_rate": 1.93807288264972e-06, "loss": 0.579, "num_input_tokens_seen": 143420512, "step": 117930 }, { "epoch": 14.776970304473124, "grad_norm": 4.245323657989502, "learning_rate": 1.937640691216725e-06, "loss": 0.4688, "num_input_tokens_seen": 143426368, "step": 117935 }, { "epoch": 14.777596792381907, "grad_norm": 2.360562801361084, "learning_rate": 1.9372085363968094e-06, "loss": 0.4864, "num_input_tokens_seen": 143432320, "step": 117940 }, { "epoch": 14.77822328029069, "grad_norm": 1.8903820514678955, "learning_rate": 1.9367764181951403e-06, "loss": 0.4494, "num_input_tokens_seen": 143438144, "step": 117945 }, { "epoch": 14.778849768199473, "grad_norm": 1.7065281867980957, "learning_rate": 1.9363443366168868e-06, "loss": 0.4192, "num_input_tokens_seen": 143444352, "step": 117950 }, { "epoch": 14.779476256108257, "grad_norm": 5.582797527313232, "learning_rate": 1.9359122916672106e-06, "loss": 0.4431, "num_input_tokens_seen": 143450400, "step": 117955 }, { "epoch": 14.780102744017041, "grad_norm": 3.4321107864379883, "learning_rate": 1.9354802833512807e-06, "loss": 0.509, "num_input_tokens_seen": 143456800, "step": 117960 }, { "epoch": 14.780729231925823, "grad_norm": 1.5303081274032593, "learning_rate": 1.935048311674259e-06, "loss": 0.4708, "num_input_tokens_seen": 143463264, "step": 117965 }, { "epoch": 14.781355719834607, "grad_norm": 4.0821733474731445, "learning_rate": 1.9346163766413134e-06, "loss": 0.4496, "num_input_tokens_seen": 143469312, "step": 117970 }, { "epoch": 14.78198220774339, "grad_norm": 5.7430596351623535, "learning_rate": 1.934184478257604e-06, "loss": 0.459, "num_input_tokens_seen": 143474912, "step": 117975 }, { "epoch": 14.782608695652174, "grad_norm": 11.095166206359863, "learning_rate": 1.933752616528297e-06, "loss": 0.4805, "num_input_tokens_seen": 143480768, "step": 117980 }, { "epoch": 14.783235183560958, "grad_norm": 2.2136917114257812, "learning_rate": 1.9333207914585573e-06, "loss": 0.5861, "num_input_tokens_seen": 143486880, "step": 117985 }, { "epoch": 14.78386167146974, "grad_norm": 3.3050458431243896, "learning_rate": 1.932889003053544e-06, "loss": 0.4475, "num_input_tokens_seen": 143492960, "step": 117990 }, { "epoch": 14.784488159378524, "grad_norm": 1.6984515190124512, "learning_rate": 1.9324572513184215e-06, "loss": 0.4903, "num_input_tokens_seen": 143498784, "step": 117995 }, { "epoch": 14.785114647287308, "grad_norm": 1.9108984470367432, "learning_rate": 1.9320255362583533e-06, "loss": 0.4584, "num_input_tokens_seen": 143504576, "step": 118000 }, { "epoch": 14.78574113519609, "grad_norm": 2.3349661827087402, "learning_rate": 1.9315938578784976e-06, "loss": 0.508, "num_input_tokens_seen": 143510976, "step": 118005 }, { "epoch": 14.786367623104875, "grad_norm": 5.678391933441162, "learning_rate": 1.931162216184017e-06, "loss": 0.4548, "num_input_tokens_seen": 143517440, "step": 118010 }, { "epoch": 14.786994111013657, "grad_norm": 1.4849570989608765, "learning_rate": 1.930730611180074e-06, "loss": 0.453, "num_input_tokens_seen": 143523072, "step": 118015 }, { "epoch": 14.787620598922441, "grad_norm": 3.4053008556365967, "learning_rate": 1.930299042871825e-06, "loss": 0.4482, "num_input_tokens_seen": 143528608, "step": 118020 }, { "epoch": 14.788247086831223, "grad_norm": 4.903459072113037, "learning_rate": 1.929867511264434e-06, "loss": 0.4781, "num_input_tokens_seen": 143534848, "step": 118025 }, { "epoch": 14.788873574740007, "grad_norm": 1.896936297416687, "learning_rate": 1.929436016363056e-06, "loss": 0.4624, "num_input_tokens_seen": 143540832, "step": 118030 }, { "epoch": 14.789500062648791, "grad_norm": 1.881614327430725, "learning_rate": 1.929004558172854e-06, "loss": 0.4172, "num_input_tokens_seen": 143546784, "step": 118035 }, { "epoch": 14.790126550557574, "grad_norm": 1.8627629280090332, "learning_rate": 1.928573136698982e-06, "loss": 0.4322, "num_input_tokens_seen": 143552512, "step": 118040 }, { "epoch": 14.790753038466358, "grad_norm": 3.991368055343628, "learning_rate": 1.9281417519466035e-06, "loss": 0.4347, "num_input_tokens_seen": 143558528, "step": 118045 }, { "epoch": 14.791379526375142, "grad_norm": 2.930126667022705, "learning_rate": 1.9277104039208704e-06, "loss": 0.4656, "num_input_tokens_seen": 143564096, "step": 118050 }, { "epoch": 14.792006014283924, "grad_norm": 4.1490278244018555, "learning_rate": 1.9272790926269447e-06, "loss": 0.4881, "num_input_tokens_seen": 143570112, "step": 118055 }, { "epoch": 14.792632502192708, "grad_norm": 12.399079322814941, "learning_rate": 1.926847818069979e-06, "loss": 0.5575, "num_input_tokens_seen": 143576416, "step": 118060 }, { "epoch": 14.79325899010149, "grad_norm": 2.008920192718506, "learning_rate": 1.9264165802551326e-06, "loss": 0.4388, "num_input_tokens_seen": 143582752, "step": 118065 }, { "epoch": 14.793885478010274, "grad_norm": 2.341815233230591, "learning_rate": 1.925985379187559e-06, "loss": 0.4932, "num_input_tokens_seen": 143588512, "step": 118070 }, { "epoch": 14.794511965919058, "grad_norm": 2.0806071758270264, "learning_rate": 1.9255542148724143e-06, "loss": 0.4415, "num_input_tokens_seen": 143594496, "step": 118075 }, { "epoch": 14.79513845382784, "grad_norm": 2.068192720413208, "learning_rate": 1.9251230873148557e-06, "loss": 0.4854, "num_input_tokens_seen": 143600640, "step": 118080 }, { "epoch": 14.795764941736625, "grad_norm": 10.399911880493164, "learning_rate": 1.9246919965200343e-06, "loss": 0.4741, "num_input_tokens_seen": 143606464, "step": 118085 }, { "epoch": 14.796391429645407, "grad_norm": 11.164902687072754, "learning_rate": 1.9242609424931052e-06, "loss": 0.5186, "num_input_tokens_seen": 143612704, "step": 118090 }, { "epoch": 14.797017917554191, "grad_norm": 1.8559720516204834, "learning_rate": 1.923829925239223e-06, "loss": 0.4672, "num_input_tokens_seen": 143619168, "step": 118095 }, { "epoch": 14.797644405462975, "grad_norm": 4.304411888122559, "learning_rate": 1.9233989447635414e-06, "loss": 0.5188, "num_input_tokens_seen": 143625056, "step": 118100 }, { "epoch": 14.798270893371757, "grad_norm": 4.0237812995910645, "learning_rate": 1.922968001071211e-06, "loss": 0.4759, "num_input_tokens_seen": 143631168, "step": 118105 }, { "epoch": 14.798897381280542, "grad_norm": 8.031661033630371, "learning_rate": 1.9225370941673867e-06, "loss": 0.484, "num_input_tokens_seen": 143637152, "step": 118110 }, { "epoch": 14.799523869189324, "grad_norm": 1.957002878189087, "learning_rate": 1.922106224057217e-06, "loss": 0.4557, "num_input_tokens_seen": 143643136, "step": 118115 }, { "epoch": 14.800150357098108, "grad_norm": 3.153614044189453, "learning_rate": 1.921675390745857e-06, "loss": 0.4546, "num_input_tokens_seen": 143649088, "step": 118120 }, { "epoch": 14.800776845006892, "grad_norm": 2.1285500526428223, "learning_rate": 1.921244594238454e-06, "loss": 0.4554, "num_input_tokens_seen": 143655200, "step": 118125 }, { "epoch": 14.801403332915674, "grad_norm": 1.9796096086502075, "learning_rate": 1.9208138345401617e-06, "loss": 0.4853, "num_input_tokens_seen": 143661408, "step": 118130 }, { "epoch": 14.802029820824458, "grad_norm": 1.8226444721221924, "learning_rate": 1.920383111656128e-06, "loss": 0.419, "num_input_tokens_seen": 143667648, "step": 118135 }, { "epoch": 14.80265630873324, "grad_norm": 1.4866646528244019, "learning_rate": 1.9199524255915025e-06, "loss": 0.5017, "num_input_tokens_seen": 143673664, "step": 118140 }, { "epoch": 14.803282796642025, "grad_norm": 1.555755853652954, "learning_rate": 1.919521776351437e-06, "loss": 0.4687, "num_input_tokens_seen": 143679936, "step": 118145 }, { "epoch": 14.803909284550809, "grad_norm": 7.666669845581055, "learning_rate": 1.9190911639410775e-06, "loss": 0.4902, "num_input_tokens_seen": 143686112, "step": 118150 }, { "epoch": 14.804535772459591, "grad_norm": 1.8546130657196045, "learning_rate": 1.9186605883655753e-06, "loss": 0.4811, "num_input_tokens_seen": 143692384, "step": 118155 }, { "epoch": 14.805162260368375, "grad_norm": 1.9640244245529175, "learning_rate": 1.918230049630074e-06, "loss": 0.46, "num_input_tokens_seen": 143698688, "step": 118160 }, { "epoch": 14.805788748277159, "grad_norm": 3.4429383277893066, "learning_rate": 1.9177995477397256e-06, "loss": 0.5018, "num_input_tokens_seen": 143704768, "step": 118165 }, { "epoch": 14.806415236185941, "grad_norm": 4.590447902679443, "learning_rate": 1.9173690826996733e-06, "loss": 0.4736, "num_input_tokens_seen": 143711008, "step": 118170 }, { "epoch": 14.807041724094725, "grad_norm": 5.304888725280762, "learning_rate": 1.916938654515067e-06, "loss": 0.4649, "num_input_tokens_seen": 143717120, "step": 118175 }, { "epoch": 14.807668212003508, "grad_norm": 4.585526466369629, "learning_rate": 1.9165082631910493e-06, "loss": 0.4542, "num_input_tokens_seen": 143723104, "step": 118180 }, { "epoch": 14.808294699912292, "grad_norm": 1.5615360736846924, "learning_rate": 1.916077908732768e-06, "loss": 0.4758, "num_input_tokens_seen": 143729088, "step": 118185 }, { "epoch": 14.808921187821076, "grad_norm": 1.5305649042129517, "learning_rate": 1.9156475911453694e-06, "loss": 0.4376, "num_input_tokens_seen": 143734976, "step": 118190 }, { "epoch": 14.809547675729858, "grad_norm": 1.6789207458496094, "learning_rate": 1.9152173104339953e-06, "loss": 0.4567, "num_input_tokens_seen": 143741152, "step": 118195 }, { "epoch": 14.810174163638642, "grad_norm": 1.5009099245071411, "learning_rate": 1.9147870666037928e-06, "loss": 0.4407, "num_input_tokens_seen": 143746976, "step": 118200 }, { "epoch": 14.810800651547424, "grad_norm": 4.222131729125977, "learning_rate": 1.9143568596599036e-06, "loss": 0.545, "num_input_tokens_seen": 143752800, "step": 118205 }, { "epoch": 14.811427139456208, "grad_norm": 4.017520904541016, "learning_rate": 1.913926689607475e-06, "loss": 0.4501, "num_input_tokens_seen": 143758880, "step": 118210 }, { "epoch": 14.812053627364993, "grad_norm": 1.6024583578109741, "learning_rate": 1.913496556451645e-06, "loss": 0.4223, "num_input_tokens_seen": 143764896, "step": 118215 }, { "epoch": 14.812680115273775, "grad_norm": 1.840757966041565, "learning_rate": 1.9130664601975614e-06, "loss": 0.4712, "num_input_tokens_seen": 143771136, "step": 118220 }, { "epoch": 14.813306603182559, "grad_norm": 1.6703053712844849, "learning_rate": 1.912636400850361e-06, "loss": 0.4637, "num_input_tokens_seen": 143776960, "step": 118225 }, { "epoch": 14.813933091091341, "grad_norm": 2.306741714477539, "learning_rate": 1.9122063784151903e-06, "loss": 0.4621, "num_input_tokens_seen": 143782880, "step": 118230 }, { "epoch": 14.814559579000125, "grad_norm": 1.6457611322402954, "learning_rate": 1.911776392897186e-06, "loss": 0.5122, "num_input_tokens_seen": 143788928, "step": 118235 }, { "epoch": 14.81518606690891, "grad_norm": 3.0075631141662598, "learning_rate": 1.911346444301494e-06, "loss": 0.4605, "num_input_tokens_seen": 143795232, "step": 118240 }, { "epoch": 14.815812554817692, "grad_norm": 2.1078128814697266, "learning_rate": 1.9109165326332495e-06, "loss": 0.4245, "num_input_tokens_seen": 143801312, "step": 118245 }, { "epoch": 14.816439042726476, "grad_norm": 2.118281841278076, "learning_rate": 1.9104866578975974e-06, "loss": 0.4595, "num_input_tokens_seen": 143807424, "step": 118250 }, { "epoch": 14.817065530635258, "grad_norm": 1.9896873235702515, "learning_rate": 1.9100568200996727e-06, "loss": 0.4435, "num_input_tokens_seen": 143813440, "step": 118255 }, { "epoch": 14.817692018544042, "grad_norm": 2.15226411819458, "learning_rate": 1.909627019244617e-06, "loss": 0.44, "num_input_tokens_seen": 143819808, "step": 118260 }, { "epoch": 14.818318506452826, "grad_norm": 4.016933441162109, "learning_rate": 1.90919725533757e-06, "loss": 0.4135, "num_input_tokens_seen": 143826016, "step": 118265 }, { "epoch": 14.818944994361608, "grad_norm": 1.62004816532135, "learning_rate": 1.9087675283836663e-06, "loss": 0.4587, "num_input_tokens_seen": 143832480, "step": 118270 }, { "epoch": 14.819571482270392, "grad_norm": 4.465023517608643, "learning_rate": 1.9083378383880485e-06, "loss": 0.4722, "num_input_tokens_seen": 143838304, "step": 118275 }, { "epoch": 14.820197970179176, "grad_norm": 3.3793888092041016, "learning_rate": 1.907908185355849e-06, "loss": 0.4084, "num_input_tokens_seen": 143843744, "step": 118280 }, { "epoch": 14.820824458087959, "grad_norm": 2.895270586013794, "learning_rate": 1.907478569292208e-06, "loss": 0.5261, "num_input_tokens_seen": 143849312, "step": 118285 }, { "epoch": 14.821450945996743, "grad_norm": 3.923069953918457, "learning_rate": 1.9070489902022598e-06, "loss": 0.4789, "num_input_tokens_seen": 143855584, "step": 118290 }, { "epoch": 14.822077433905525, "grad_norm": 3.6485302448272705, "learning_rate": 1.9066194480911409e-06, "loss": 0.4564, "num_input_tokens_seen": 143861152, "step": 118295 }, { "epoch": 14.822703921814309, "grad_norm": 3.0585827827453613, "learning_rate": 1.9061899429639874e-06, "loss": 0.4805, "num_input_tokens_seen": 143867264, "step": 118300 }, { "epoch": 14.823330409723093, "grad_norm": 2.0872316360473633, "learning_rate": 1.9057604748259357e-06, "loss": 0.4341, "num_input_tokens_seen": 143873472, "step": 118305 }, { "epoch": 14.823956897631875, "grad_norm": 2.4919028282165527, "learning_rate": 1.9053310436821176e-06, "loss": 0.504, "num_input_tokens_seen": 143879680, "step": 118310 }, { "epoch": 14.82458338554066, "grad_norm": 3.258324146270752, "learning_rate": 1.9049016495376704e-06, "loss": 0.5803, "num_input_tokens_seen": 143885760, "step": 118315 }, { "epoch": 14.825209873449442, "grad_norm": 2.6652255058288574, "learning_rate": 1.9044722923977243e-06, "loss": 0.52, "num_input_tokens_seen": 143891424, "step": 118320 }, { "epoch": 14.825836361358226, "grad_norm": 13.45811939239502, "learning_rate": 1.9040429722674148e-06, "loss": 0.5039, "num_input_tokens_seen": 143897440, "step": 118325 }, { "epoch": 14.82646284926701, "grad_norm": 4.363578796386719, "learning_rate": 1.9036136891518758e-06, "loss": 0.4863, "num_input_tokens_seen": 143904096, "step": 118330 }, { "epoch": 14.827089337175792, "grad_norm": 2.0334835052490234, "learning_rate": 1.9031844430562373e-06, "loss": 0.4314, "num_input_tokens_seen": 143910208, "step": 118335 }, { "epoch": 14.827715825084576, "grad_norm": 3.8367862701416016, "learning_rate": 1.9027552339856337e-06, "loss": 0.4584, "num_input_tokens_seen": 143916288, "step": 118340 }, { "epoch": 14.828342312993358, "grad_norm": 2.836655378341675, "learning_rate": 1.9023260619451939e-06, "loss": 0.4409, "num_input_tokens_seen": 143922112, "step": 118345 }, { "epoch": 14.828968800902143, "grad_norm": 1.563012957572937, "learning_rate": 1.9018969269400516e-06, "loss": 0.4484, "num_input_tokens_seen": 143928128, "step": 118350 }, { "epoch": 14.829595288810927, "grad_norm": 3.3165509700775146, "learning_rate": 1.9014678289753353e-06, "loss": 0.4879, "num_input_tokens_seen": 143934144, "step": 118355 }, { "epoch": 14.830221776719709, "grad_norm": 1.592150330543518, "learning_rate": 1.9010387680561777e-06, "loss": 0.4727, "num_input_tokens_seen": 143940384, "step": 118360 }, { "epoch": 14.830848264628493, "grad_norm": 3.174018383026123, "learning_rate": 1.9006097441877053e-06, "loss": 0.4893, "num_input_tokens_seen": 143946464, "step": 118365 }, { "epoch": 14.831474752537275, "grad_norm": 6.650032043457031, "learning_rate": 1.9001807573750507e-06, "loss": 0.4823, "num_input_tokens_seen": 143951808, "step": 118370 }, { "epoch": 14.83210124044606, "grad_norm": 1.9140464067459106, "learning_rate": 1.8997518076233396e-06, "loss": 0.4745, "num_input_tokens_seen": 143957856, "step": 118375 }, { "epoch": 14.832727728354843, "grad_norm": 4.328829288482666, "learning_rate": 1.8993228949377023e-06, "loss": 0.4761, "num_input_tokens_seen": 143963968, "step": 118380 }, { "epoch": 14.833354216263626, "grad_norm": 1.5397495031356812, "learning_rate": 1.8988940193232686e-06, "loss": 0.4545, "num_input_tokens_seen": 143969856, "step": 118385 }, { "epoch": 14.83398070417241, "grad_norm": 6.2561936378479, "learning_rate": 1.8984651807851623e-06, "loss": 0.5023, "num_input_tokens_seen": 143975872, "step": 118390 }, { "epoch": 14.834607192081194, "grad_norm": 3.1975202560424805, "learning_rate": 1.8980363793285128e-06, "loss": 0.4562, "num_input_tokens_seen": 143981984, "step": 118395 }, { "epoch": 14.835233679989976, "grad_norm": 2.081918239593506, "learning_rate": 1.8976076149584478e-06, "loss": 0.4564, "num_input_tokens_seen": 143987904, "step": 118400 }, { "epoch": 14.83586016789876, "grad_norm": 1.780392050743103, "learning_rate": 1.8971788876800901e-06, "loss": 0.4483, "num_input_tokens_seen": 143994240, "step": 118405 }, { "epoch": 14.836486655807542, "grad_norm": 1.9764022827148438, "learning_rate": 1.8967501974985675e-06, "loss": 0.4748, "num_input_tokens_seen": 144000352, "step": 118410 }, { "epoch": 14.837113143716326, "grad_norm": 1.6504908800125122, "learning_rate": 1.8963215444190075e-06, "loss": 0.4845, "num_input_tokens_seen": 144006208, "step": 118415 }, { "epoch": 14.83773963162511, "grad_norm": 1.7094647884368896, "learning_rate": 1.895892928446531e-06, "loss": 0.4376, "num_input_tokens_seen": 144012416, "step": 118420 }, { "epoch": 14.838366119533893, "grad_norm": 1.6328775882720947, "learning_rate": 1.8954643495862663e-06, "loss": 0.4606, "num_input_tokens_seen": 144018272, "step": 118425 }, { "epoch": 14.838992607442677, "grad_norm": 1.7346711158752441, "learning_rate": 1.895035807843333e-06, "loss": 0.4416, "num_input_tokens_seen": 144024160, "step": 118430 }, { "epoch": 14.839619095351459, "grad_norm": 1.8234286308288574, "learning_rate": 1.8946073032228596e-06, "loss": 0.4913, "num_input_tokens_seen": 144030208, "step": 118435 }, { "epoch": 14.840245583260243, "grad_norm": 2.203718423843384, "learning_rate": 1.8941788357299645e-06, "loss": 0.4751, "num_input_tokens_seen": 144036512, "step": 118440 }, { "epoch": 14.840872071169027, "grad_norm": 2.49450421333313, "learning_rate": 1.893750405369773e-06, "loss": 0.4857, "num_input_tokens_seen": 144042848, "step": 118445 }, { "epoch": 14.84149855907781, "grad_norm": 1.3940987586975098, "learning_rate": 1.8933220121474083e-06, "loss": 0.4531, "num_input_tokens_seen": 144048832, "step": 118450 }, { "epoch": 14.842125046986594, "grad_norm": 1.3528306484222412, "learning_rate": 1.8928936560679894e-06, "loss": 0.4452, "num_input_tokens_seen": 144054464, "step": 118455 }, { "epoch": 14.842751534895376, "grad_norm": 5.262456893920898, "learning_rate": 1.8924653371366415e-06, "loss": 0.4855, "num_input_tokens_seen": 144060320, "step": 118460 }, { "epoch": 14.84337802280416, "grad_norm": 1.3364096879959106, "learning_rate": 1.8920370553584805e-06, "loss": 0.4834, "num_input_tokens_seen": 144066144, "step": 118465 }, { "epoch": 14.844004510712944, "grad_norm": 3.7872962951660156, "learning_rate": 1.8916088107386316e-06, "loss": 0.5165, "num_input_tokens_seen": 144072416, "step": 118470 }, { "epoch": 14.844630998621726, "grad_norm": 8.813772201538086, "learning_rate": 1.891180603282211e-06, "loss": 0.5853, "num_input_tokens_seen": 144078880, "step": 118475 }, { "epoch": 14.84525748653051, "grad_norm": 4.852014541625977, "learning_rate": 1.8907524329943421e-06, "loss": 0.4523, "num_input_tokens_seen": 144084896, "step": 118480 }, { "epoch": 14.845883974439293, "grad_norm": 1.6952992677688599, "learning_rate": 1.8903242998801397e-06, "loss": 0.4977, "num_input_tokens_seen": 144091040, "step": 118485 }, { "epoch": 14.846510462348077, "grad_norm": 1.4018887281417847, "learning_rate": 1.8898962039447272e-06, "loss": 0.5057, "num_input_tokens_seen": 144097184, "step": 118490 }, { "epoch": 14.84713695025686, "grad_norm": 8.539190292358398, "learning_rate": 1.8894681451932178e-06, "loss": 0.552, "num_input_tokens_seen": 144103232, "step": 118495 }, { "epoch": 14.847763438165643, "grad_norm": 1.3995932340621948, "learning_rate": 1.8890401236307327e-06, "loss": 0.4756, "num_input_tokens_seen": 144109376, "step": 118500 }, { "epoch": 14.848389926074427, "grad_norm": 6.385759353637695, "learning_rate": 1.888612139262388e-06, "loss": 0.4531, "num_input_tokens_seen": 144115552, "step": 118505 }, { "epoch": 14.849016413983211, "grad_norm": 1.9740948677062988, "learning_rate": 1.888184192093303e-06, "loss": 0.4278, "num_input_tokens_seen": 144121920, "step": 118510 }, { "epoch": 14.849642901891993, "grad_norm": 7.089101791381836, "learning_rate": 1.8877562821285906e-06, "loss": 0.4863, "num_input_tokens_seen": 144127776, "step": 118515 }, { "epoch": 14.850269389800777, "grad_norm": 1.370964765548706, "learning_rate": 1.8873284093733685e-06, "loss": 0.4793, "num_input_tokens_seen": 144133920, "step": 118520 }, { "epoch": 14.85089587770956, "grad_norm": 7.879724025726318, "learning_rate": 1.8869005738327534e-06, "loss": 0.4969, "num_input_tokens_seen": 144139936, "step": 118525 }, { "epoch": 14.851522365618344, "grad_norm": 2.004706859588623, "learning_rate": 1.8864727755118583e-06, "loss": 0.5205, "num_input_tokens_seen": 144145952, "step": 118530 }, { "epoch": 14.852148853527126, "grad_norm": 6.63716983795166, "learning_rate": 1.8860450144158005e-06, "loss": 0.4997, "num_input_tokens_seen": 144151968, "step": 118535 }, { "epoch": 14.85277534143591, "grad_norm": 6.849236011505127, "learning_rate": 1.8856172905496905e-06, "loss": 0.4738, "num_input_tokens_seen": 144157856, "step": 118540 }, { "epoch": 14.853401829344694, "grad_norm": 2.0908968448638916, "learning_rate": 1.8851896039186462e-06, "loss": 0.4901, "num_input_tokens_seen": 144164160, "step": 118545 }, { "epoch": 14.854028317253476, "grad_norm": 2.5295047760009766, "learning_rate": 1.8847619545277773e-06, "loss": 0.4902, "num_input_tokens_seen": 144170272, "step": 118550 }, { "epoch": 14.85465480516226, "grad_norm": 1.6779608726501465, "learning_rate": 1.8843343423821996e-06, "loss": 0.5238, "num_input_tokens_seen": 144176448, "step": 118555 }, { "epoch": 14.855281293071045, "grad_norm": 5.399550437927246, "learning_rate": 1.8839067674870232e-06, "loss": 0.457, "num_input_tokens_seen": 144182464, "step": 118560 }, { "epoch": 14.855907780979827, "grad_norm": 2.7372984886169434, "learning_rate": 1.8834792298473614e-06, "loss": 0.4396, "num_input_tokens_seen": 144188736, "step": 118565 }, { "epoch": 14.85653426888861, "grad_norm": 4.453981876373291, "learning_rate": 1.883051729468327e-06, "loss": 0.4431, "num_input_tokens_seen": 144194784, "step": 118570 }, { "epoch": 14.857160756797393, "grad_norm": 8.688984870910645, "learning_rate": 1.882624266355028e-06, "loss": 0.5098, "num_input_tokens_seen": 144201024, "step": 118575 }, { "epoch": 14.857787244706177, "grad_norm": 3.52732515335083, "learning_rate": 1.8821968405125785e-06, "loss": 0.4661, "num_input_tokens_seen": 144207424, "step": 118580 }, { "epoch": 14.858413732614961, "grad_norm": 1.4650053977966309, "learning_rate": 1.8817694519460856e-06, "loss": 0.4224, "num_input_tokens_seen": 144213248, "step": 118585 }, { "epoch": 14.859040220523744, "grad_norm": 2.5917892456054688, "learning_rate": 1.8813421006606625e-06, "loss": 0.4476, "num_input_tokens_seen": 144219168, "step": 118590 }, { "epoch": 14.859666708432528, "grad_norm": 1.6631537675857544, "learning_rate": 1.880914786661414e-06, "loss": 0.4289, "num_input_tokens_seen": 144225504, "step": 118595 }, { "epoch": 14.86029319634131, "grad_norm": 1.9325867891311646, "learning_rate": 1.8804875099534526e-06, "loss": 0.423, "num_input_tokens_seen": 144231776, "step": 118600 }, { "epoch": 14.860919684250094, "grad_norm": 1.593574047088623, "learning_rate": 1.8800602705418869e-06, "loss": 0.4754, "num_input_tokens_seen": 144237984, "step": 118605 }, { "epoch": 14.861546172158878, "grad_norm": 1.8287121057510376, "learning_rate": 1.8796330684318225e-06, "loss": 0.4398, "num_input_tokens_seen": 144243840, "step": 118610 }, { "epoch": 14.86217266006766, "grad_norm": 1.482491135597229, "learning_rate": 1.8792059036283677e-06, "loss": 0.4767, "num_input_tokens_seen": 144249888, "step": 118615 }, { "epoch": 14.862799147976444, "grad_norm": 1.7234578132629395, "learning_rate": 1.8787787761366321e-06, "loss": 0.4935, "num_input_tokens_seen": 144256224, "step": 118620 }, { "epoch": 14.863425635885227, "grad_norm": 2.1064321994781494, "learning_rate": 1.878351685961719e-06, "loss": 0.4794, "num_input_tokens_seen": 144262368, "step": 118625 }, { "epoch": 14.86405212379401, "grad_norm": 10.724512100219727, "learning_rate": 1.8779246331087353e-06, "loss": 0.4773, "num_input_tokens_seen": 144268544, "step": 118630 }, { "epoch": 14.864678611702795, "grad_norm": 2.072902202606201, "learning_rate": 1.8774976175827898e-06, "loss": 0.4346, "num_input_tokens_seen": 144274880, "step": 118635 }, { "epoch": 14.865305099611577, "grad_norm": 1.6002552509307861, "learning_rate": 1.8770706393889837e-06, "loss": 0.5036, "num_input_tokens_seen": 144280736, "step": 118640 }, { "epoch": 14.865931587520361, "grad_norm": 10.72207260131836, "learning_rate": 1.8766436985324255e-06, "loss": 0.5085, "num_input_tokens_seen": 144287072, "step": 118645 }, { "epoch": 14.866558075429143, "grad_norm": 1.5446187257766724, "learning_rate": 1.8762167950182158e-06, "loss": 0.4499, "num_input_tokens_seen": 144293152, "step": 118650 }, { "epoch": 14.867184563337927, "grad_norm": 8.25906753540039, "learning_rate": 1.8757899288514631e-06, "loss": 0.4889, "num_input_tokens_seen": 144299616, "step": 118655 }, { "epoch": 14.867811051246711, "grad_norm": 6.505195140838623, "learning_rate": 1.8753631000372663e-06, "loss": 0.5692, "num_input_tokens_seen": 144304800, "step": 118660 }, { "epoch": 14.868437539155494, "grad_norm": 9.281135559082031, "learning_rate": 1.874936308580732e-06, "loss": 0.4765, "num_input_tokens_seen": 144310816, "step": 118665 }, { "epoch": 14.869064027064278, "grad_norm": 9.465954780578613, "learning_rate": 1.8745095544869602e-06, "loss": 0.5232, "num_input_tokens_seen": 144316704, "step": 118670 }, { "epoch": 14.869690514973062, "grad_norm": 2.8570826053619385, "learning_rate": 1.8740828377610564e-06, "loss": 0.4253, "num_input_tokens_seen": 144322592, "step": 118675 }, { "epoch": 14.870317002881844, "grad_norm": 1.6552276611328125, "learning_rate": 1.8736561584081186e-06, "loss": 0.4529, "num_input_tokens_seen": 144328704, "step": 118680 }, { "epoch": 14.870943490790628, "grad_norm": 3.0332274436950684, "learning_rate": 1.8732295164332515e-06, "loss": 0.4859, "num_input_tokens_seen": 144334784, "step": 118685 }, { "epoch": 14.87156997869941, "grad_norm": 1.9081528186798096, "learning_rate": 1.8728029118415524e-06, "loss": 0.4373, "num_input_tokens_seen": 144341088, "step": 118690 }, { "epoch": 14.872196466608194, "grad_norm": 1.6376628875732422, "learning_rate": 1.8723763446381244e-06, "loss": 0.5078, "num_input_tokens_seen": 144347200, "step": 118695 }, { "epoch": 14.872822954516979, "grad_norm": 5.901031494140625, "learning_rate": 1.871949814828068e-06, "loss": 0.5137, "num_input_tokens_seen": 144353248, "step": 118700 }, { "epoch": 14.87344944242576, "grad_norm": 1.7575359344482422, "learning_rate": 1.8715233224164796e-06, "loss": 0.4654, "num_input_tokens_seen": 144359296, "step": 118705 }, { "epoch": 14.874075930334545, "grad_norm": 4.981379985809326, "learning_rate": 1.8710968674084602e-06, "loss": 0.4724, "num_input_tokens_seen": 144365472, "step": 118710 }, { "epoch": 14.874702418243327, "grad_norm": 5.197607517242432, "learning_rate": 1.8706704498091087e-06, "loss": 0.4656, "num_input_tokens_seen": 144371968, "step": 118715 }, { "epoch": 14.875328906152111, "grad_norm": 3.9271700382232666, "learning_rate": 1.8702440696235247e-06, "loss": 0.4516, "num_input_tokens_seen": 144377920, "step": 118720 }, { "epoch": 14.875955394060895, "grad_norm": 2.0921592712402344, "learning_rate": 1.8698177268568018e-06, "loss": 0.4823, "num_input_tokens_seen": 144384032, "step": 118725 }, { "epoch": 14.876581881969678, "grad_norm": 4.832650661468506, "learning_rate": 1.8693914215140424e-06, "loss": 0.4906, "num_input_tokens_seen": 144390208, "step": 118730 }, { "epoch": 14.877208369878462, "grad_norm": 2.297239065170288, "learning_rate": 1.8689651536003383e-06, "loss": 0.4265, "num_input_tokens_seen": 144396576, "step": 118735 }, { "epoch": 14.877834857787244, "grad_norm": 5.495321273803711, "learning_rate": 1.8685389231207901e-06, "loss": 0.4966, "num_input_tokens_seen": 144402432, "step": 118740 }, { "epoch": 14.878461345696028, "grad_norm": 1.5783679485321045, "learning_rate": 1.8681127300804901e-06, "loss": 0.4509, "num_input_tokens_seen": 144408832, "step": 118745 }, { "epoch": 14.879087833604812, "grad_norm": 2.4122002124786377, "learning_rate": 1.8676865744845352e-06, "loss": 0.4615, "num_input_tokens_seen": 144414656, "step": 118750 }, { "epoch": 14.879714321513594, "grad_norm": 2.3196380138397217, "learning_rate": 1.8672604563380231e-06, "loss": 0.4529, "num_input_tokens_seen": 144420544, "step": 118755 }, { "epoch": 14.880340809422378, "grad_norm": 2.240183115005493, "learning_rate": 1.8668343756460445e-06, "loss": 0.4341, "num_input_tokens_seen": 144426784, "step": 118760 }, { "epoch": 14.88096729733116, "grad_norm": 2.1483020782470703, "learning_rate": 1.8664083324136962e-06, "loss": 0.4757, "num_input_tokens_seen": 144432736, "step": 118765 }, { "epoch": 14.881593785239945, "grad_norm": 2.879866361618042, "learning_rate": 1.8659823266460698e-06, "loss": 0.4593, "num_input_tokens_seen": 144438944, "step": 118770 }, { "epoch": 14.882220273148729, "grad_norm": 3.750894546508789, "learning_rate": 1.8655563583482606e-06, "loss": 0.5012, "num_input_tokens_seen": 144444672, "step": 118775 }, { "epoch": 14.882846761057511, "grad_norm": 1.597718596458435, "learning_rate": 1.8651304275253596e-06, "loss": 0.4587, "num_input_tokens_seen": 144450784, "step": 118780 }, { "epoch": 14.883473248966295, "grad_norm": 2.7536582946777344, "learning_rate": 1.864704534182461e-06, "loss": 0.4634, "num_input_tokens_seen": 144456352, "step": 118785 }, { "epoch": 14.88409973687508, "grad_norm": 2.0478219985961914, "learning_rate": 1.8642786783246542e-06, "loss": 0.4585, "num_input_tokens_seen": 144462656, "step": 118790 }, { "epoch": 14.884726224783861, "grad_norm": 3.728780746459961, "learning_rate": 1.8638528599570338e-06, "loss": 0.4833, "num_input_tokens_seen": 144468672, "step": 118795 }, { "epoch": 14.885352712692645, "grad_norm": 1.8617274761199951, "learning_rate": 1.8634270790846875e-06, "loss": 0.434, "num_input_tokens_seen": 144474400, "step": 118800 }, { "epoch": 14.885979200601428, "grad_norm": 1.3108563423156738, "learning_rate": 1.863001335712708e-06, "loss": 0.4135, "num_input_tokens_seen": 144480448, "step": 118805 }, { "epoch": 14.886605688510212, "grad_norm": 9.84204387664795, "learning_rate": 1.862575629846186e-06, "loss": 0.547, "num_input_tokens_seen": 144486752, "step": 118810 }, { "epoch": 14.887232176418996, "grad_norm": 8.44161319732666, "learning_rate": 1.8621499614902089e-06, "loss": 0.4408, "num_input_tokens_seen": 144492928, "step": 118815 }, { "epoch": 14.887858664327778, "grad_norm": 4.780941963195801, "learning_rate": 1.8617243306498672e-06, "loss": 0.489, "num_input_tokens_seen": 144499104, "step": 118820 }, { "epoch": 14.888485152236562, "grad_norm": 2.958380937576294, "learning_rate": 1.8612987373302493e-06, "loss": 0.4663, "num_input_tokens_seen": 144505344, "step": 118825 }, { "epoch": 14.889111640145344, "grad_norm": 10.177691459655762, "learning_rate": 1.8608731815364461e-06, "loss": 0.4686, "num_input_tokens_seen": 144511392, "step": 118830 }, { "epoch": 14.889738128054129, "grad_norm": 1.9087047576904297, "learning_rate": 1.8604476632735412e-06, "loss": 0.436, "num_input_tokens_seen": 144517632, "step": 118835 }, { "epoch": 14.890364615962913, "grad_norm": 2.0448453426361084, "learning_rate": 1.860022182546626e-06, "loss": 0.5122, "num_input_tokens_seen": 144523840, "step": 118840 }, { "epoch": 14.890991103871695, "grad_norm": 8.46186351776123, "learning_rate": 1.8595967393607834e-06, "loss": 0.4759, "num_input_tokens_seen": 144530112, "step": 118845 }, { "epoch": 14.891617591780479, "grad_norm": 1.7664058208465576, "learning_rate": 1.8591713337211044e-06, "loss": 0.4297, "num_input_tokens_seen": 144535968, "step": 118850 }, { "epoch": 14.892244079689261, "grad_norm": 6.157578468322754, "learning_rate": 1.8587459656326702e-06, "loss": 0.477, "num_input_tokens_seen": 144541760, "step": 118855 }, { "epoch": 14.892870567598045, "grad_norm": 1.8795082569122314, "learning_rate": 1.8583206351005712e-06, "loss": 0.4785, "num_input_tokens_seen": 144547232, "step": 118860 }, { "epoch": 14.89349705550683, "grad_norm": 1.4310872554779053, "learning_rate": 1.857895342129889e-06, "loss": 0.4753, "num_input_tokens_seen": 144553376, "step": 118865 }, { "epoch": 14.894123543415612, "grad_norm": 2.325190305709839, "learning_rate": 1.857470086725711e-06, "loss": 0.4617, "num_input_tokens_seen": 144559264, "step": 118870 }, { "epoch": 14.894750031324396, "grad_norm": 10.120076179504395, "learning_rate": 1.8570448688931186e-06, "loss": 0.5276, "num_input_tokens_seen": 144565600, "step": 118875 }, { "epoch": 14.895376519233178, "grad_norm": 2.445152997970581, "learning_rate": 1.8566196886371973e-06, "loss": 0.451, "num_input_tokens_seen": 144572032, "step": 118880 }, { "epoch": 14.896003007141962, "grad_norm": 5.422937870025635, "learning_rate": 1.8561945459630321e-06, "loss": 0.4822, "num_input_tokens_seen": 144577696, "step": 118885 }, { "epoch": 14.896629495050746, "grad_norm": 3.11171817779541, "learning_rate": 1.8557694408757027e-06, "loss": 0.4517, "num_input_tokens_seen": 144584032, "step": 118890 }, { "epoch": 14.897255982959528, "grad_norm": 1.7295771837234497, "learning_rate": 1.8553443733802945e-06, "loss": 0.5213, "num_input_tokens_seen": 144590016, "step": 118895 }, { "epoch": 14.897882470868312, "grad_norm": 2.4364640712738037, "learning_rate": 1.8549193434818868e-06, "loss": 0.4381, "num_input_tokens_seen": 144595808, "step": 118900 }, { "epoch": 14.898508958777096, "grad_norm": 1.721855878829956, "learning_rate": 1.8544943511855639e-06, "loss": 0.4317, "num_input_tokens_seen": 144601792, "step": 118905 }, { "epoch": 14.899135446685879, "grad_norm": 3.2648868560791016, "learning_rate": 1.8540693964964046e-06, "loss": 0.4584, "num_input_tokens_seen": 144608064, "step": 118910 }, { "epoch": 14.899761934594663, "grad_norm": 12.732365608215332, "learning_rate": 1.8536444794194901e-06, "loss": 0.4981, "num_input_tokens_seen": 144614208, "step": 118915 }, { "epoch": 14.900388422503445, "grad_norm": 1.7057665586471558, "learning_rate": 1.853219599959901e-06, "loss": 0.4174, "num_input_tokens_seen": 144620352, "step": 118920 }, { "epoch": 14.90101491041223, "grad_norm": 12.456171035766602, "learning_rate": 1.8527947581227195e-06, "loss": 0.5189, "num_input_tokens_seen": 144626208, "step": 118925 }, { "epoch": 14.901641398321013, "grad_norm": 18.16663932800293, "learning_rate": 1.8523699539130208e-06, "loss": 0.5276, "num_input_tokens_seen": 144632608, "step": 118930 }, { "epoch": 14.902267886229795, "grad_norm": 2.2288031578063965, "learning_rate": 1.8519451873358857e-06, "loss": 0.4441, "num_input_tokens_seen": 144638592, "step": 118935 }, { "epoch": 14.90289437413858, "grad_norm": 1.8772536516189575, "learning_rate": 1.8515204583963941e-06, "loss": 0.4238, "num_input_tokens_seen": 144644544, "step": 118940 }, { "epoch": 14.903520862047362, "grad_norm": 2.4509010314941406, "learning_rate": 1.851095767099621e-06, "loss": 0.4344, "num_input_tokens_seen": 144650720, "step": 118945 }, { "epoch": 14.904147349956146, "grad_norm": 7.584517955780029, "learning_rate": 1.8506711134506472e-06, "loss": 0.5309, "num_input_tokens_seen": 144655872, "step": 118950 }, { "epoch": 14.90477383786493, "grad_norm": 1.7393983602523804, "learning_rate": 1.8502464974545463e-06, "loss": 0.4874, "num_input_tokens_seen": 144662048, "step": 118955 }, { "epoch": 14.905400325773712, "grad_norm": 1.6963077783584595, "learning_rate": 1.8498219191163985e-06, "loss": 0.4661, "num_input_tokens_seen": 144668320, "step": 118960 }, { "epoch": 14.906026813682496, "grad_norm": 2.200258493423462, "learning_rate": 1.849397378441276e-06, "loss": 0.4475, "num_input_tokens_seen": 144674688, "step": 118965 }, { "epoch": 14.906653301591279, "grad_norm": 2.707361936569214, "learning_rate": 1.848972875434259e-06, "loss": 0.5164, "num_input_tokens_seen": 144680768, "step": 118970 }, { "epoch": 14.907279789500063, "grad_norm": 1.56436026096344, "learning_rate": 1.8485484101004185e-06, "loss": 0.452, "num_input_tokens_seen": 144685760, "step": 118975 }, { "epoch": 14.907906277408847, "grad_norm": 1.5587939023971558, "learning_rate": 1.8481239824448332e-06, "loss": 0.4664, "num_input_tokens_seen": 144691968, "step": 118980 }, { "epoch": 14.908532765317629, "grad_norm": 6.565985679626465, "learning_rate": 1.8476995924725738e-06, "loss": 0.4747, "num_input_tokens_seen": 144698144, "step": 118985 }, { "epoch": 14.909159253226413, "grad_norm": 2.0893070697784424, "learning_rate": 1.847275240188718e-06, "loss": 0.4523, "num_input_tokens_seen": 144704512, "step": 118990 }, { "epoch": 14.909785741135195, "grad_norm": 1.734168529510498, "learning_rate": 1.8468509255983358e-06, "loss": 0.4379, "num_input_tokens_seen": 144710112, "step": 118995 }, { "epoch": 14.91041222904398, "grad_norm": 2.557687282562256, "learning_rate": 1.8464266487065014e-06, "loss": 0.492, "num_input_tokens_seen": 144716288, "step": 119000 }, { "epoch": 14.911038716952763, "grad_norm": 2.5637261867523193, "learning_rate": 1.8460024095182894e-06, "loss": 0.4324, "num_input_tokens_seen": 144722016, "step": 119005 }, { "epoch": 14.911665204861546, "grad_norm": 5.995399475097656, "learning_rate": 1.845578208038769e-06, "loss": 0.468, "num_input_tokens_seen": 144728096, "step": 119010 }, { "epoch": 14.91229169277033, "grad_norm": 1.3437427282333374, "learning_rate": 1.8451540442730132e-06, "loss": 0.4114, "num_input_tokens_seen": 144733984, "step": 119015 }, { "epoch": 14.912918180679114, "grad_norm": 15.553461074829102, "learning_rate": 1.8447299182260947e-06, "loss": 0.5762, "num_input_tokens_seen": 144739872, "step": 119020 }, { "epoch": 14.913544668587896, "grad_norm": 11.316245079040527, "learning_rate": 1.8443058299030815e-06, "loss": 0.5506, "num_input_tokens_seen": 144745408, "step": 119025 }, { "epoch": 14.91417115649668, "grad_norm": 1.7073994874954224, "learning_rate": 1.8438817793090447e-06, "loss": 0.4603, "num_input_tokens_seen": 144751168, "step": 119030 }, { "epoch": 14.914797644405462, "grad_norm": 2.4817066192626953, "learning_rate": 1.8434577664490572e-06, "loss": 0.4435, "num_input_tokens_seen": 144757536, "step": 119035 }, { "epoch": 14.915424132314246, "grad_norm": 7.803102493286133, "learning_rate": 1.8430337913281838e-06, "loss": 0.4767, "num_input_tokens_seen": 144763680, "step": 119040 }, { "epoch": 14.91605062022303, "grad_norm": 8.268712997436523, "learning_rate": 1.8426098539514969e-06, "loss": 0.5097, "num_input_tokens_seen": 144769728, "step": 119045 }, { "epoch": 14.916677108131813, "grad_norm": 2.3965375423431396, "learning_rate": 1.8421859543240627e-06, "loss": 0.5339, "num_input_tokens_seen": 144775968, "step": 119050 }, { "epoch": 14.917303596040597, "grad_norm": 2.770723342895508, "learning_rate": 1.841762092450952e-06, "loss": 0.4456, "num_input_tokens_seen": 144782112, "step": 119055 }, { "epoch": 14.91793008394938, "grad_norm": 1.9942686557769775, "learning_rate": 1.8413382683372288e-06, "loss": 0.4377, "num_input_tokens_seen": 144788128, "step": 119060 }, { "epoch": 14.918556571858163, "grad_norm": 1.670079231262207, "learning_rate": 1.8409144819879626e-06, "loss": 0.541, "num_input_tokens_seen": 144794112, "step": 119065 }, { "epoch": 14.919183059766947, "grad_norm": 4.939838409423828, "learning_rate": 1.8404907334082218e-06, "loss": 0.4564, "num_input_tokens_seen": 144800352, "step": 119070 }, { "epoch": 14.91980954767573, "grad_norm": 1.6464393138885498, "learning_rate": 1.8400670226030682e-06, "loss": 0.3995, "num_input_tokens_seen": 144806624, "step": 119075 }, { "epoch": 14.920436035584514, "grad_norm": 1.6871569156646729, "learning_rate": 1.8396433495775722e-06, "loss": 0.4341, "num_input_tokens_seen": 144812672, "step": 119080 }, { "epoch": 14.921062523493296, "grad_norm": 1.3154467344284058, "learning_rate": 1.8392197143367952e-06, "loss": 0.4108, "num_input_tokens_seen": 144818816, "step": 119085 }, { "epoch": 14.92168901140208, "grad_norm": 9.835294723510742, "learning_rate": 1.8387961168858058e-06, "loss": 0.4808, "num_input_tokens_seen": 144825152, "step": 119090 }, { "epoch": 14.922315499310864, "grad_norm": 4.740323066711426, "learning_rate": 1.8383725572296646e-06, "loss": 0.5263, "num_input_tokens_seen": 144830688, "step": 119095 }, { "epoch": 14.922941987219646, "grad_norm": 7.34869909286499, "learning_rate": 1.8379490353734398e-06, "loss": 0.4642, "num_input_tokens_seen": 144836800, "step": 119100 }, { "epoch": 14.92356847512843, "grad_norm": 1.9104559421539307, "learning_rate": 1.8375255513221906e-06, "loss": 0.4641, "num_input_tokens_seen": 144842464, "step": 119105 }, { "epoch": 14.924194963037213, "grad_norm": 2.227907419204712, "learning_rate": 1.8371021050809845e-06, "loss": 0.4604, "num_input_tokens_seen": 144848160, "step": 119110 }, { "epoch": 14.924821450945997, "grad_norm": 10.075535774230957, "learning_rate": 1.83667869665488e-06, "loss": 0.5045, "num_input_tokens_seen": 144854208, "step": 119115 }, { "epoch": 14.92544793885478, "grad_norm": 8.426469802856445, "learning_rate": 1.8362553260489408e-06, "loss": 0.5586, "num_input_tokens_seen": 144860064, "step": 119120 }, { "epoch": 14.926074426763563, "grad_norm": 2.3296632766723633, "learning_rate": 1.8358319932682295e-06, "loss": 0.4433, "num_input_tokens_seen": 144866272, "step": 119125 }, { "epoch": 14.926700914672347, "grad_norm": 5.155834674835205, "learning_rate": 1.835408698317807e-06, "loss": 0.4668, "num_input_tokens_seen": 144872800, "step": 119130 }, { "epoch": 14.927327402581131, "grad_norm": 4.35062837600708, "learning_rate": 1.8349854412027363e-06, "loss": 0.5834, "num_input_tokens_seen": 144879040, "step": 119135 }, { "epoch": 14.927953890489913, "grad_norm": 8.837159156799316, "learning_rate": 1.8345622219280735e-06, "loss": 0.4841, "num_input_tokens_seen": 144885088, "step": 119140 }, { "epoch": 14.928580378398697, "grad_norm": 1.4145100116729736, "learning_rate": 1.8341390404988824e-06, "loss": 0.4392, "num_input_tokens_seen": 144891232, "step": 119145 }, { "epoch": 14.92920686630748, "grad_norm": 5.264620304107666, "learning_rate": 1.8337158969202191e-06, "loss": 0.4687, "num_input_tokens_seen": 144897120, "step": 119150 }, { "epoch": 14.929833354216264, "grad_norm": 4.046725749969482, "learning_rate": 1.833292791197146e-06, "loss": 0.4508, "num_input_tokens_seen": 144903264, "step": 119155 }, { "epoch": 14.930459842125046, "grad_norm": 9.157430648803711, "learning_rate": 1.8328697233347182e-06, "loss": 0.547, "num_input_tokens_seen": 144909568, "step": 119160 }, { "epoch": 14.93108633003383, "grad_norm": 1.1164103746414185, "learning_rate": 1.8324466933379976e-06, "loss": 0.4162, "num_input_tokens_seen": 144915488, "step": 119165 }, { "epoch": 14.931712817942614, "grad_norm": 8.880128860473633, "learning_rate": 1.8320237012120379e-06, "loss": 0.5206, "num_input_tokens_seen": 144921792, "step": 119170 }, { "epoch": 14.932339305851396, "grad_norm": 1.812820315361023, "learning_rate": 1.8316007469619e-06, "loss": 0.4597, "num_input_tokens_seen": 144927488, "step": 119175 }, { "epoch": 14.93296579376018, "grad_norm": 4.589968204498291, "learning_rate": 1.8311778305926376e-06, "loss": 0.4461, "num_input_tokens_seen": 144933728, "step": 119180 }, { "epoch": 14.933592281668965, "grad_norm": 7.794849872589111, "learning_rate": 1.8307549521093083e-06, "loss": 0.5294, "num_input_tokens_seen": 144939136, "step": 119185 }, { "epoch": 14.934218769577747, "grad_norm": 1.3874924182891846, "learning_rate": 1.8303321115169698e-06, "loss": 0.5107, "num_input_tokens_seen": 144945280, "step": 119190 }, { "epoch": 14.934845257486531, "grad_norm": 2.0226917266845703, "learning_rate": 1.8299093088206743e-06, "loss": 0.4378, "num_input_tokens_seen": 144951424, "step": 119195 }, { "epoch": 14.935471745395313, "grad_norm": 1.3329828977584839, "learning_rate": 1.82948654402548e-06, "loss": 0.4303, "num_input_tokens_seen": 144957504, "step": 119200 }, { "epoch": 14.936098233304097, "grad_norm": 3.775705099105835, "learning_rate": 1.8290638171364379e-06, "loss": 0.4544, "num_input_tokens_seen": 144963712, "step": 119205 }, { "epoch": 14.936724721212881, "grad_norm": 4.626840591430664, "learning_rate": 1.8286411281586053e-06, "loss": 0.4352, "num_input_tokens_seen": 144970016, "step": 119210 }, { "epoch": 14.937351209121664, "grad_norm": 3.855942726135254, "learning_rate": 1.8282184770970328e-06, "loss": 0.4843, "num_input_tokens_seen": 144976128, "step": 119215 }, { "epoch": 14.937977697030448, "grad_norm": 2.698209047317505, "learning_rate": 1.827795863956775e-06, "loss": 0.4362, "num_input_tokens_seen": 144982304, "step": 119220 }, { "epoch": 14.93860418493923, "grad_norm": 1.642874836921692, "learning_rate": 1.8273732887428868e-06, "loss": 0.4851, "num_input_tokens_seen": 144988608, "step": 119225 }, { "epoch": 14.939230672848014, "grad_norm": 1.4931941032409668, "learning_rate": 1.8269507514604162e-06, "loss": 0.412, "num_input_tokens_seen": 144994848, "step": 119230 }, { "epoch": 14.939857160756798, "grad_norm": 15.890474319458008, "learning_rate": 1.8265282521144172e-06, "loss": 0.4987, "num_input_tokens_seen": 145000768, "step": 119235 }, { "epoch": 14.94048364866558, "grad_norm": 1.4985837936401367, "learning_rate": 1.8261057907099428e-06, "loss": 0.4323, "num_input_tokens_seen": 145007008, "step": 119240 }, { "epoch": 14.941110136574364, "grad_norm": 1.506420612335205, "learning_rate": 1.8256833672520409e-06, "loss": 0.446, "num_input_tokens_seen": 145012864, "step": 119245 }, { "epoch": 14.941736624483147, "grad_norm": 1.4494850635528564, "learning_rate": 1.8252609817457623e-06, "loss": 0.5212, "num_input_tokens_seen": 145018752, "step": 119250 }, { "epoch": 14.94236311239193, "grad_norm": 1.9720629453659058, "learning_rate": 1.82483863419616e-06, "loss": 0.4761, "num_input_tokens_seen": 145024992, "step": 119255 }, { "epoch": 14.942989600300715, "grad_norm": 1.7693275213241577, "learning_rate": 1.8244163246082798e-06, "loss": 0.3863, "num_input_tokens_seen": 145031072, "step": 119260 }, { "epoch": 14.943616088209497, "grad_norm": 2.74133563041687, "learning_rate": 1.8239940529871735e-06, "loss": 0.4941, "num_input_tokens_seen": 145037312, "step": 119265 }, { "epoch": 14.944242576118281, "grad_norm": 2.9715774059295654, "learning_rate": 1.8235718193378875e-06, "loss": 0.4403, "num_input_tokens_seen": 145043360, "step": 119270 }, { "epoch": 14.944869064027063, "grad_norm": 1.3314865827560425, "learning_rate": 1.8231496236654721e-06, "loss": 0.415, "num_input_tokens_seen": 145049344, "step": 119275 }, { "epoch": 14.945495551935847, "grad_norm": 2.640190839767456, "learning_rate": 1.8227274659749728e-06, "loss": 0.536, "num_input_tokens_seen": 145055680, "step": 119280 }, { "epoch": 14.946122039844632, "grad_norm": 1.6696784496307373, "learning_rate": 1.8223053462714391e-06, "loss": 0.4649, "num_input_tokens_seen": 145061760, "step": 119285 }, { "epoch": 14.946748527753414, "grad_norm": 2.00215220451355, "learning_rate": 1.821883264559915e-06, "loss": 0.5531, "num_input_tokens_seen": 145067808, "step": 119290 }, { "epoch": 14.947375015662198, "grad_norm": 2.013164758682251, "learning_rate": 1.8214612208454507e-06, "loss": 0.46, "num_input_tokens_seen": 145073984, "step": 119295 }, { "epoch": 14.948001503570982, "grad_norm": 1.1404668092727661, "learning_rate": 1.8210392151330874e-06, "loss": 0.496, "num_input_tokens_seen": 145079936, "step": 119300 }, { "epoch": 14.948627991479764, "grad_norm": 2.9144110679626465, "learning_rate": 1.8206172474278755e-06, "loss": 0.4269, "num_input_tokens_seen": 145086176, "step": 119305 }, { "epoch": 14.949254479388548, "grad_norm": 2.9271819591522217, "learning_rate": 1.8201953177348553e-06, "loss": 0.5899, "num_input_tokens_seen": 145092480, "step": 119310 }, { "epoch": 14.94988096729733, "grad_norm": 1.7949120998382568, "learning_rate": 1.8197734260590739e-06, "loss": 0.4534, "num_input_tokens_seen": 145098304, "step": 119315 }, { "epoch": 14.950507455206115, "grad_norm": 1.986944317817688, "learning_rate": 1.819351572405576e-06, "loss": 0.4669, "num_input_tokens_seen": 145104480, "step": 119320 }, { "epoch": 14.951133943114899, "grad_norm": 11.054144859313965, "learning_rate": 1.8189297567794029e-06, "loss": 0.5089, "num_input_tokens_seen": 145110432, "step": 119325 }, { "epoch": 14.951760431023681, "grad_norm": 1.8040460348129272, "learning_rate": 1.8185079791855992e-06, "loss": 0.4783, "num_input_tokens_seen": 145116608, "step": 119330 }, { "epoch": 14.952386918932465, "grad_norm": 10.802860260009766, "learning_rate": 1.818086239629207e-06, "loss": 0.5268, "num_input_tokens_seen": 145122784, "step": 119335 }, { "epoch": 14.953013406841247, "grad_norm": 1.8157556056976318, "learning_rate": 1.8176645381152713e-06, "loss": 0.4134, "num_input_tokens_seen": 145129024, "step": 119340 }, { "epoch": 14.953639894750031, "grad_norm": 1.5333688259124756, "learning_rate": 1.8172428746488296e-06, "loss": 0.473, "num_input_tokens_seen": 145134976, "step": 119345 }, { "epoch": 14.954266382658815, "grad_norm": 1.805513620376587, "learning_rate": 1.816821249234927e-06, "loss": 0.49, "num_input_tokens_seen": 145140768, "step": 119350 }, { "epoch": 14.954892870567598, "grad_norm": 7.512496471405029, "learning_rate": 1.8163996618786006e-06, "loss": 0.4993, "num_input_tokens_seen": 145146336, "step": 119355 }, { "epoch": 14.955519358476382, "grad_norm": 1.703365683555603, "learning_rate": 1.8159781125848952e-06, "loss": 0.4631, "num_input_tokens_seen": 145152672, "step": 119360 }, { "epoch": 14.956145846385164, "grad_norm": 1.346648097038269, "learning_rate": 1.815556601358846e-06, "loss": 0.4734, "num_input_tokens_seen": 145158720, "step": 119365 }, { "epoch": 14.956772334293948, "grad_norm": 2.2803397178649902, "learning_rate": 1.8151351282054959e-06, "loss": 0.4807, "num_input_tokens_seen": 145164864, "step": 119370 }, { "epoch": 14.957398822202732, "grad_norm": 2.0865256786346436, "learning_rate": 1.8147136931298836e-06, "loss": 0.5003, "num_input_tokens_seen": 145170912, "step": 119375 }, { "epoch": 14.958025310111514, "grad_norm": 1.382114052772522, "learning_rate": 1.8142922961370463e-06, "loss": 0.4883, "num_input_tokens_seen": 145177280, "step": 119380 }, { "epoch": 14.958651798020298, "grad_norm": 1.904225468635559, "learning_rate": 1.8138709372320246e-06, "loss": 0.4489, "num_input_tokens_seen": 145183136, "step": 119385 }, { "epoch": 14.95927828592908, "grad_norm": 2.46059513092041, "learning_rate": 1.813449616419853e-06, "loss": 0.4911, "num_input_tokens_seen": 145188928, "step": 119390 }, { "epoch": 14.959904773837865, "grad_norm": 5.1398468017578125, "learning_rate": 1.8130283337055715e-06, "loss": 0.4825, "num_input_tokens_seen": 145194880, "step": 119395 }, { "epoch": 14.960531261746649, "grad_norm": 4.66764497756958, "learning_rate": 1.8126070890942144e-06, "loss": 0.4436, "num_input_tokens_seen": 145201120, "step": 119400 }, { "epoch": 14.961157749655431, "grad_norm": 2.23586106300354, "learning_rate": 1.8121858825908205e-06, "loss": 0.4413, "num_input_tokens_seen": 145207296, "step": 119405 }, { "epoch": 14.961784237564215, "grad_norm": 1.8513591289520264, "learning_rate": 1.8117647142004235e-06, "loss": 0.4343, "num_input_tokens_seen": 145213216, "step": 119410 }, { "epoch": 14.962410725473, "grad_norm": 2.860917091369629, "learning_rate": 1.8113435839280613e-06, "loss": 0.4671, "num_input_tokens_seen": 145218592, "step": 119415 }, { "epoch": 14.963037213381781, "grad_norm": 6.99416446685791, "learning_rate": 1.810922491778766e-06, "loss": 0.5017, "num_input_tokens_seen": 145224992, "step": 119420 }, { "epoch": 14.963663701290566, "grad_norm": 4.467785835266113, "learning_rate": 1.810501437757573e-06, "loss": 0.46, "num_input_tokens_seen": 145231040, "step": 119425 }, { "epoch": 14.964290189199348, "grad_norm": 1.8335767984390259, "learning_rate": 1.810080421869519e-06, "loss": 0.4369, "num_input_tokens_seen": 145237408, "step": 119430 }, { "epoch": 14.964916677108132, "grad_norm": 3.6317596435546875, "learning_rate": 1.8096594441196342e-06, "loss": 0.4484, "num_input_tokens_seen": 145243456, "step": 119435 }, { "epoch": 14.965543165016916, "grad_norm": 1.856642484664917, "learning_rate": 1.8092385045129525e-06, "loss": 0.4139, "num_input_tokens_seen": 145249792, "step": 119440 }, { "epoch": 14.966169652925698, "grad_norm": 1.7148746252059937, "learning_rate": 1.8088176030545067e-06, "loss": 0.4989, "num_input_tokens_seen": 145255840, "step": 119445 }, { "epoch": 14.966796140834482, "grad_norm": 2.789469003677368, "learning_rate": 1.8083967397493318e-06, "loss": 0.4466, "num_input_tokens_seen": 145262016, "step": 119450 }, { "epoch": 14.967422628743265, "grad_norm": 1.6301237344741821, "learning_rate": 1.8079759146024555e-06, "loss": 0.457, "num_input_tokens_seen": 145267360, "step": 119455 }, { "epoch": 14.968049116652049, "grad_norm": 8.380910873413086, "learning_rate": 1.8075551276189124e-06, "loss": 0.496, "num_input_tokens_seen": 145273504, "step": 119460 }, { "epoch": 14.968675604560833, "grad_norm": 1.3332488536834717, "learning_rate": 1.8071343788037299e-06, "loss": 0.4127, "num_input_tokens_seen": 145279488, "step": 119465 }, { "epoch": 14.969302092469615, "grad_norm": 2.253457546234131, "learning_rate": 1.8067136681619424e-06, "loss": 0.4627, "num_input_tokens_seen": 145285952, "step": 119470 }, { "epoch": 14.969928580378399, "grad_norm": 9.343385696411133, "learning_rate": 1.8062929956985759e-06, "loss": 0.4743, "num_input_tokens_seen": 145291872, "step": 119475 }, { "epoch": 14.970555068287181, "grad_norm": 2.770564079284668, "learning_rate": 1.8058723614186636e-06, "loss": 0.4432, "num_input_tokens_seen": 145298016, "step": 119480 }, { "epoch": 14.971181556195965, "grad_norm": 1.7409367561340332, "learning_rate": 1.8054517653272308e-06, "loss": 0.4382, "num_input_tokens_seen": 145303648, "step": 119485 }, { "epoch": 14.97180804410475, "grad_norm": 2.50303053855896, "learning_rate": 1.8050312074293096e-06, "loss": 0.5156, "num_input_tokens_seen": 145309824, "step": 119490 }, { "epoch": 14.972434532013532, "grad_norm": 10.343280792236328, "learning_rate": 1.804610687729925e-06, "loss": 0.5053, "num_input_tokens_seen": 145315808, "step": 119495 }, { "epoch": 14.973061019922316, "grad_norm": 2.794194221496582, "learning_rate": 1.804190206234106e-06, "loss": 0.4386, "num_input_tokens_seen": 145322048, "step": 119500 }, { "epoch": 14.973687507831098, "grad_norm": 1.9675699472427368, "learning_rate": 1.803769762946882e-06, "loss": 0.4505, "num_input_tokens_seen": 145327680, "step": 119505 }, { "epoch": 14.974313995739882, "grad_norm": 1.8426131010055542, "learning_rate": 1.8033493578732759e-06, "loss": 0.4818, "num_input_tokens_seen": 145333472, "step": 119510 }, { "epoch": 14.974940483648666, "grad_norm": 2.714937448501587, "learning_rate": 1.8029289910183173e-06, "loss": 0.4934, "num_input_tokens_seen": 145339520, "step": 119515 }, { "epoch": 14.975566971557448, "grad_norm": 2.2913870811462402, "learning_rate": 1.8025086623870286e-06, "loss": 0.4838, "num_input_tokens_seen": 145345696, "step": 119520 }, { "epoch": 14.976193459466232, "grad_norm": 2.4142303466796875, "learning_rate": 1.8020883719844396e-06, "loss": 0.5054, "num_input_tokens_seen": 145351712, "step": 119525 }, { "epoch": 14.976819947375017, "grad_norm": 2.1303350925445557, "learning_rate": 1.8016681198155705e-06, "loss": 0.4691, "num_input_tokens_seen": 145358080, "step": 119530 }, { "epoch": 14.977446435283799, "grad_norm": 1.9516807794570923, "learning_rate": 1.8012479058854483e-06, "loss": 0.4824, "num_input_tokens_seen": 145364160, "step": 119535 }, { "epoch": 14.978072923192583, "grad_norm": 1.4245139360427856, "learning_rate": 1.8008277301990968e-06, "loss": 0.4566, "num_input_tokens_seen": 145370624, "step": 119540 }, { "epoch": 14.978699411101365, "grad_norm": 4.030703544616699, "learning_rate": 1.800407592761541e-06, "loss": 0.4477, "num_input_tokens_seen": 145377184, "step": 119545 }, { "epoch": 14.97932589901015, "grad_norm": 3.8000075817108154, "learning_rate": 1.7999874935778005e-06, "loss": 0.4938, "num_input_tokens_seen": 145383328, "step": 119550 }, { "epoch": 14.979952386918933, "grad_norm": 3.4942026138305664, "learning_rate": 1.7995674326529005e-06, "loss": 0.5085, "num_input_tokens_seen": 145389024, "step": 119555 }, { "epoch": 14.980578874827716, "grad_norm": 2.7133123874664307, "learning_rate": 1.7991474099918643e-06, "loss": 0.4617, "num_input_tokens_seen": 145395232, "step": 119560 }, { "epoch": 14.9812053627365, "grad_norm": 7.235739231109619, "learning_rate": 1.79872742559971e-06, "loss": 0.4269, "num_input_tokens_seen": 145401440, "step": 119565 }, { "epoch": 14.981831850645282, "grad_norm": 5.008696556091309, "learning_rate": 1.7983074794814626e-06, "loss": 0.4651, "num_input_tokens_seen": 145407904, "step": 119570 }, { "epoch": 14.982458338554066, "grad_norm": 1.6600230932235718, "learning_rate": 1.797887571642139e-06, "loss": 0.472, "num_input_tokens_seen": 145413920, "step": 119575 }, { "epoch": 14.98308482646285, "grad_norm": 2.0791282653808594, "learning_rate": 1.7974677020867642e-06, "loss": 0.465, "num_input_tokens_seen": 145420128, "step": 119580 }, { "epoch": 14.983711314371632, "grad_norm": 2.984318256378174, "learning_rate": 1.7970478708203532e-06, "loss": 0.489, "num_input_tokens_seen": 145426240, "step": 119585 }, { "epoch": 14.984337802280416, "grad_norm": 1.6006501913070679, "learning_rate": 1.7966280778479295e-06, "loss": 0.4578, "num_input_tokens_seen": 145432480, "step": 119590 }, { "epoch": 14.984964290189199, "grad_norm": 1.3975682258605957, "learning_rate": 1.7962083231745092e-06, "loss": 0.4202, "num_input_tokens_seen": 145438464, "step": 119595 }, { "epoch": 14.985590778097983, "grad_norm": 2.075080633163452, "learning_rate": 1.7957886068051133e-06, "loss": 0.5119, "num_input_tokens_seen": 145444896, "step": 119600 }, { "epoch": 14.986217266006767, "grad_norm": 8.345182418823242, "learning_rate": 1.7953689287447568e-06, "loss": 0.4654, "num_input_tokens_seen": 145451168, "step": 119605 }, { "epoch": 14.986843753915549, "grad_norm": 2.027341842651367, "learning_rate": 1.794949288998461e-06, "loss": 0.4473, "num_input_tokens_seen": 145457088, "step": 119610 }, { "epoch": 14.987470241824333, "grad_norm": 8.259435653686523, "learning_rate": 1.7945296875712392e-06, "loss": 0.4511, "num_input_tokens_seen": 145463712, "step": 119615 }, { "epoch": 14.988096729733115, "grad_norm": 2.9606831073760986, "learning_rate": 1.7941101244681102e-06, "loss": 0.4763, "num_input_tokens_seen": 145469760, "step": 119620 }, { "epoch": 14.9887232176419, "grad_norm": 4.462769985198975, "learning_rate": 1.7936905996940918e-06, "loss": 0.4474, "num_input_tokens_seen": 145475808, "step": 119625 }, { "epoch": 14.989349705550683, "grad_norm": 5.439028263092041, "learning_rate": 1.7932711132541963e-06, "loss": 0.4755, "num_input_tokens_seen": 145481088, "step": 119630 }, { "epoch": 14.989976193459466, "grad_norm": 1.8233692646026611, "learning_rate": 1.7928516651534406e-06, "loss": 0.4199, "num_input_tokens_seen": 145487488, "step": 119635 }, { "epoch": 14.99060268136825, "grad_norm": 2.6210556030273438, "learning_rate": 1.7924322553968409e-06, "loss": 0.4423, "num_input_tokens_seen": 145492960, "step": 119640 }, { "epoch": 14.991229169277034, "grad_norm": 1.312649130821228, "learning_rate": 1.792012883989409e-06, "loss": 0.4714, "num_input_tokens_seen": 145499264, "step": 119645 }, { "epoch": 14.991855657185816, "grad_norm": 5.733264446258545, "learning_rate": 1.7915935509361603e-06, "loss": 0.4469, "num_input_tokens_seen": 145505664, "step": 119650 }, { "epoch": 14.9924821450946, "grad_norm": 1.56138277053833, "learning_rate": 1.7911742562421103e-06, "loss": 0.4321, "num_input_tokens_seen": 145511680, "step": 119655 }, { "epoch": 14.993108633003382, "grad_norm": 2.0494000911712646, "learning_rate": 1.7907549999122676e-06, "loss": 0.4853, "num_input_tokens_seen": 145517472, "step": 119660 }, { "epoch": 14.993735120912167, "grad_norm": 2.1879472732543945, "learning_rate": 1.7903357819516486e-06, "loss": 0.4888, "num_input_tokens_seen": 145523648, "step": 119665 }, { "epoch": 14.994361608820949, "grad_norm": 3.984347343444824, "learning_rate": 1.7899166023652625e-06, "loss": 0.4468, "num_input_tokens_seen": 145529792, "step": 119670 }, { "epoch": 14.994988096729733, "grad_norm": 3.4728269577026367, "learning_rate": 1.7894974611581245e-06, "loss": 0.4683, "num_input_tokens_seen": 145536032, "step": 119675 }, { "epoch": 14.995614584638517, "grad_norm": 1.8097426891326904, "learning_rate": 1.789078358335241e-06, "loss": 0.4708, "num_input_tokens_seen": 145542016, "step": 119680 }, { "epoch": 14.9962410725473, "grad_norm": 8.942315101623535, "learning_rate": 1.7886592939016257e-06, "loss": 0.4566, "num_input_tokens_seen": 145548256, "step": 119685 }, { "epoch": 14.996867560456083, "grad_norm": 1.1041536331176758, "learning_rate": 1.78824026786229e-06, "loss": 0.4144, "num_input_tokens_seen": 145553952, "step": 119690 }, { "epoch": 14.997494048364867, "grad_norm": 1.9385710954666138, "learning_rate": 1.787821280222241e-06, "loss": 0.4574, "num_input_tokens_seen": 145560480, "step": 119695 }, { "epoch": 14.99812053627365, "grad_norm": 1.8739595413208008, "learning_rate": 1.7874023309864908e-06, "loss": 0.4824, "num_input_tokens_seen": 145566816, "step": 119700 }, { "epoch": 14.998747024182434, "grad_norm": 1.5352016687393188, "learning_rate": 1.7869834201600445e-06, "loss": 0.4795, "num_input_tokens_seen": 145572864, "step": 119705 }, { "epoch": 14.999373512091216, "grad_norm": 4.099992752075195, "learning_rate": 1.7865645477479149e-06, "loss": 0.4566, "num_input_tokens_seen": 145579040, "step": 119710 }, { "epoch": 15.0, "grad_norm": 2.015230894088745, "learning_rate": 1.7861457137551064e-06, "loss": 0.4113, "num_input_tokens_seen": 145585440, "step": 119715 }, { "epoch": 15.000626487908784, "grad_norm": 8.032868385314941, "learning_rate": 1.7857269181866294e-06, "loss": 0.4993, "num_input_tokens_seen": 145591360, "step": 119720 }, { "epoch": 15.001252975817566, "grad_norm": 2.1022963523864746, "learning_rate": 1.7853081610474876e-06, "loss": 0.4248, "num_input_tokens_seen": 145597408, "step": 119725 }, { "epoch": 15.00187946372635, "grad_norm": 3.695643663406372, "learning_rate": 1.784889442342691e-06, "loss": 0.4558, "num_input_tokens_seen": 145602880, "step": 119730 }, { "epoch": 15.002505951635133, "grad_norm": 6.004842758178711, "learning_rate": 1.784470762077243e-06, "loss": 0.4891, "num_input_tokens_seen": 145609088, "step": 119735 }, { "epoch": 15.003132439543917, "grad_norm": 5.9703688621521, "learning_rate": 1.7840521202561506e-06, "loss": 0.4925, "num_input_tokens_seen": 145615136, "step": 119740 }, { "epoch": 15.0037589274527, "grad_norm": 2.1556077003479004, "learning_rate": 1.7836335168844189e-06, "loss": 0.513, "num_input_tokens_seen": 145621216, "step": 119745 }, { "epoch": 15.004385415361483, "grad_norm": 1.7405469417572021, "learning_rate": 1.7832149519670523e-06, "loss": 0.4576, "num_input_tokens_seen": 145627200, "step": 119750 }, { "epoch": 15.005011903270267, "grad_norm": 2.0363667011260986, "learning_rate": 1.7827964255090574e-06, "loss": 0.4296, "num_input_tokens_seen": 145633120, "step": 119755 }, { "epoch": 15.00563839117905, "grad_norm": 5.608747959136963, "learning_rate": 1.7823779375154344e-06, "loss": 0.4308, "num_input_tokens_seen": 145638624, "step": 119760 }, { "epoch": 15.006264879087833, "grad_norm": 2.1258389949798584, "learning_rate": 1.78195948799119e-06, "loss": 0.4376, "num_input_tokens_seen": 145644512, "step": 119765 }, { "epoch": 15.006891366996618, "grad_norm": 1.4552055597305298, "learning_rate": 1.7815410769413237e-06, "loss": 0.4257, "num_input_tokens_seen": 145650656, "step": 119770 }, { "epoch": 15.0075178549054, "grad_norm": 7.640494346618652, "learning_rate": 1.7811227043708412e-06, "loss": 0.5352, "num_input_tokens_seen": 145656288, "step": 119775 }, { "epoch": 15.008144342814184, "grad_norm": 1.7222723960876465, "learning_rate": 1.7807043702847421e-06, "loss": 0.4303, "num_input_tokens_seen": 145662496, "step": 119780 }, { "epoch": 15.008770830722968, "grad_norm": 5.803128242492676, "learning_rate": 1.7802860746880303e-06, "loss": 0.4717, "num_input_tokens_seen": 145668352, "step": 119785 }, { "epoch": 15.00939731863175, "grad_norm": 5.772167205810547, "learning_rate": 1.7798678175857037e-06, "loss": 0.486, "num_input_tokens_seen": 145674464, "step": 119790 }, { "epoch": 15.010023806540534, "grad_norm": 6.59321928024292, "learning_rate": 1.7794495989827666e-06, "loss": 0.5358, "num_input_tokens_seen": 145680320, "step": 119795 }, { "epoch": 15.010650294449317, "grad_norm": 1.4901068210601807, "learning_rate": 1.779031418884215e-06, "loss": 0.5551, "num_input_tokens_seen": 145686336, "step": 119800 }, { "epoch": 15.0112767823581, "grad_norm": 8.985877990722656, "learning_rate": 1.778613277295051e-06, "loss": 0.4813, "num_input_tokens_seen": 145692512, "step": 119805 }, { "epoch": 15.011903270266885, "grad_norm": 7.770429611206055, "learning_rate": 1.7781951742202758e-06, "loss": 0.5037, "num_input_tokens_seen": 145698752, "step": 119810 }, { "epoch": 15.012529758175667, "grad_norm": 6.229807376861572, "learning_rate": 1.7777771096648838e-06, "loss": 0.484, "num_input_tokens_seen": 145704896, "step": 119815 }, { "epoch": 15.013156246084451, "grad_norm": 1.9309550523757935, "learning_rate": 1.7773590836338778e-06, "loss": 0.4207, "num_input_tokens_seen": 145711296, "step": 119820 }, { "epoch": 15.013782733993233, "grad_norm": 2.9569761753082275, "learning_rate": 1.7769410961322514e-06, "loss": 0.4167, "num_input_tokens_seen": 145716480, "step": 119825 }, { "epoch": 15.014409221902017, "grad_norm": 7.548628330230713, "learning_rate": 1.7765231471650053e-06, "loss": 0.445, "num_input_tokens_seen": 145722816, "step": 119830 }, { "epoch": 15.015035709810801, "grad_norm": 2.5139386653900146, "learning_rate": 1.776105236737134e-06, "loss": 0.493, "num_input_tokens_seen": 145728992, "step": 119835 }, { "epoch": 15.015662197719584, "grad_norm": 3.956735610961914, "learning_rate": 1.7756873648536348e-06, "loss": 0.512, "num_input_tokens_seen": 145735168, "step": 119840 }, { "epoch": 15.016288685628368, "grad_norm": 3.217982292175293, "learning_rate": 1.7752695315195057e-06, "loss": 0.421, "num_input_tokens_seen": 145741184, "step": 119845 }, { "epoch": 15.01691517353715, "grad_norm": 8.970597267150879, "learning_rate": 1.7748517367397388e-06, "loss": 0.4571, "num_input_tokens_seen": 145747168, "step": 119850 }, { "epoch": 15.017541661445934, "grad_norm": 2.010275363922119, "learning_rate": 1.7744339805193317e-06, "loss": 0.4442, "num_input_tokens_seen": 145753184, "step": 119855 }, { "epoch": 15.018168149354718, "grad_norm": 3.652055025100708, "learning_rate": 1.774016262863279e-06, "loss": 0.4645, "num_input_tokens_seen": 145759392, "step": 119860 }, { "epoch": 15.0187946372635, "grad_norm": 2.2468888759613037, "learning_rate": 1.7735985837765728e-06, "loss": 0.4762, "num_input_tokens_seen": 145764928, "step": 119865 }, { "epoch": 15.019421125172284, "grad_norm": 1.3116028308868408, "learning_rate": 1.7731809432642082e-06, "loss": 0.4375, "num_input_tokens_seen": 145771072, "step": 119870 }, { "epoch": 15.020047613081067, "grad_norm": 1.7345974445343018, "learning_rate": 1.77276334133118e-06, "loss": 0.462, "num_input_tokens_seen": 145777152, "step": 119875 }, { "epoch": 15.02067410098985, "grad_norm": 1.4085060358047485, "learning_rate": 1.7723457779824776e-06, "loss": 0.4191, "num_input_tokens_seen": 145783328, "step": 119880 }, { "epoch": 15.021300588898635, "grad_norm": 1.929288387298584, "learning_rate": 1.7719282532230969e-06, "loss": 0.4729, "num_input_tokens_seen": 145789248, "step": 119885 }, { "epoch": 15.021927076807417, "grad_norm": 1.6676394939422607, "learning_rate": 1.7715107670580267e-06, "loss": 0.4568, "num_input_tokens_seen": 145795040, "step": 119890 }, { "epoch": 15.022553564716201, "grad_norm": 5.3263349533081055, "learning_rate": 1.771093319492261e-06, "loss": 0.4444, "num_input_tokens_seen": 145801280, "step": 119895 }, { "epoch": 15.023180052624983, "grad_norm": 1.595676064491272, "learning_rate": 1.7706759105307874e-06, "loss": 0.475, "num_input_tokens_seen": 145807584, "step": 119900 }, { "epoch": 15.023806540533768, "grad_norm": 9.626532554626465, "learning_rate": 1.7702585401785998e-06, "loss": 0.5542, "num_input_tokens_seen": 145813696, "step": 119905 }, { "epoch": 15.024433028442552, "grad_norm": 4.104804515838623, "learning_rate": 1.7698412084406858e-06, "loss": 0.4382, "num_input_tokens_seen": 145819776, "step": 119910 }, { "epoch": 15.025059516351334, "grad_norm": 7.927309036254883, "learning_rate": 1.7694239153220373e-06, "loss": 0.4517, "num_input_tokens_seen": 145825952, "step": 119915 }, { "epoch": 15.025686004260118, "grad_norm": 2.3454439640045166, "learning_rate": 1.76900666082764e-06, "loss": 0.4835, "num_input_tokens_seen": 145832160, "step": 119920 }, { "epoch": 15.026312492168902, "grad_norm": 3.9122917652130127, "learning_rate": 1.7685894449624869e-06, "loss": 0.4755, "num_input_tokens_seen": 145838464, "step": 119925 }, { "epoch": 15.026938980077684, "grad_norm": 4.232452869415283, "learning_rate": 1.7681722677315616e-06, "loss": 0.4596, "num_input_tokens_seen": 145844384, "step": 119930 }, { "epoch": 15.027565467986468, "grad_norm": 2.543713331222534, "learning_rate": 1.7677551291398543e-06, "loss": 0.5042, "num_input_tokens_seen": 145850496, "step": 119935 }, { "epoch": 15.02819195589525, "grad_norm": 4.606014251708984, "learning_rate": 1.7673380291923536e-06, "loss": 0.4414, "num_input_tokens_seen": 145856800, "step": 119940 }, { "epoch": 15.028818443804035, "grad_norm": 2.152838945388794, "learning_rate": 1.766920967894043e-06, "loss": 0.4192, "num_input_tokens_seen": 145863200, "step": 119945 }, { "epoch": 15.029444931712819, "grad_norm": 4.50827169418335, "learning_rate": 1.76650394524991e-06, "loss": 0.4416, "num_input_tokens_seen": 145868864, "step": 119950 }, { "epoch": 15.030071419621601, "grad_norm": 1.9748998880386353, "learning_rate": 1.7660869612649413e-06, "loss": 0.4079, "num_input_tokens_seen": 145875232, "step": 119955 }, { "epoch": 15.030697907530385, "grad_norm": 6.1778764724731445, "learning_rate": 1.7656700159441237e-06, "loss": 0.467, "num_input_tokens_seen": 145881376, "step": 119960 }, { "epoch": 15.031324395439167, "grad_norm": 5.839168071746826, "learning_rate": 1.7652531092924385e-06, "loss": 0.4663, "num_input_tokens_seen": 145887520, "step": 119965 }, { "epoch": 15.031950883347951, "grad_norm": 3.1839845180511475, "learning_rate": 1.7648362413148734e-06, "loss": 0.4683, "num_input_tokens_seen": 145893504, "step": 119970 }, { "epoch": 15.032577371256735, "grad_norm": 8.853673934936523, "learning_rate": 1.7644194120164099e-06, "loss": 0.4927, "num_input_tokens_seen": 145899584, "step": 119975 }, { "epoch": 15.033203859165518, "grad_norm": 8.306681632995605, "learning_rate": 1.7640026214020339e-06, "loss": 0.5117, "num_input_tokens_seen": 145905664, "step": 119980 }, { "epoch": 15.033830347074302, "grad_norm": 6.783142566680908, "learning_rate": 1.7635858694767254e-06, "loss": 0.4479, "num_input_tokens_seen": 145911872, "step": 119985 }, { "epoch": 15.034456834983084, "grad_norm": 2.443568229675293, "learning_rate": 1.7631691562454696e-06, "loss": 0.4721, "num_input_tokens_seen": 145918112, "step": 119990 }, { "epoch": 15.035083322891868, "grad_norm": 2.163304328918457, "learning_rate": 1.7627524817132492e-06, "loss": 0.4587, "num_input_tokens_seen": 145924288, "step": 119995 }, { "epoch": 15.035709810800652, "grad_norm": 9.998464584350586, "learning_rate": 1.7623358458850427e-06, "loss": 0.4663, "num_input_tokens_seen": 145930176, "step": 120000 }, { "epoch": 15.036336298709434, "grad_norm": 7.0162034034729, "learning_rate": 1.7619192487658348e-06, "loss": 0.5244, "num_input_tokens_seen": 145936416, "step": 120005 }, { "epoch": 15.036962786618219, "grad_norm": 4.982954502105713, "learning_rate": 1.7615026903606035e-06, "loss": 0.4361, "num_input_tokens_seen": 145942560, "step": 120010 }, { "epoch": 15.037589274527, "grad_norm": 2.4394099712371826, "learning_rate": 1.7610861706743316e-06, "loss": 0.4658, "num_input_tokens_seen": 145948704, "step": 120015 }, { "epoch": 15.038215762435785, "grad_norm": 1.7256892919540405, "learning_rate": 1.7606696897119962e-06, "loss": 0.4677, "num_input_tokens_seen": 145954720, "step": 120020 }, { "epoch": 15.038842250344569, "grad_norm": 2.01396107673645, "learning_rate": 1.76025324747858e-06, "loss": 0.4651, "num_input_tokens_seen": 145960960, "step": 120025 }, { "epoch": 15.039468738253351, "grad_norm": 2.0759220123291016, "learning_rate": 1.759836843979058e-06, "loss": 0.4851, "num_input_tokens_seen": 145966688, "step": 120030 }, { "epoch": 15.040095226162135, "grad_norm": 2.776174545288086, "learning_rate": 1.7594204792184129e-06, "loss": 0.4521, "num_input_tokens_seen": 145972640, "step": 120035 }, { "epoch": 15.04072171407092, "grad_norm": 1.821861982345581, "learning_rate": 1.7590041532016183e-06, "loss": 0.4919, "num_input_tokens_seen": 145978752, "step": 120040 }, { "epoch": 15.041348201979702, "grad_norm": 8.168598175048828, "learning_rate": 1.7585878659336543e-06, "loss": 0.5522, "num_input_tokens_seen": 145984960, "step": 120045 }, { "epoch": 15.041974689888486, "grad_norm": 2.8443236351013184, "learning_rate": 1.7581716174194973e-06, "loss": 0.517, "num_input_tokens_seen": 145991008, "step": 120050 }, { "epoch": 15.042601177797268, "grad_norm": 3.5037119388580322, "learning_rate": 1.7577554076641258e-06, "loss": 0.4704, "num_input_tokens_seen": 145997184, "step": 120055 }, { "epoch": 15.043227665706052, "grad_norm": 3.3489694595336914, "learning_rate": 1.7573392366725127e-06, "loss": 0.4326, "num_input_tokens_seen": 146002976, "step": 120060 }, { "epoch": 15.043854153614836, "grad_norm": 10.452616691589355, "learning_rate": 1.7569231044496354e-06, "loss": 0.5101, "num_input_tokens_seen": 146009120, "step": 120065 }, { "epoch": 15.044480641523618, "grad_norm": 2.669598340988159, "learning_rate": 1.7565070110004711e-06, "loss": 0.4575, "num_input_tokens_seen": 146015392, "step": 120070 }, { "epoch": 15.045107129432402, "grad_norm": 2.363058090209961, "learning_rate": 1.7560909563299905e-06, "loss": 0.4195, "num_input_tokens_seen": 146021632, "step": 120075 }, { "epoch": 15.045733617341185, "grad_norm": 19.300159454345703, "learning_rate": 1.7556749404431712e-06, "loss": 0.4947, "num_input_tokens_seen": 146027360, "step": 120080 }, { "epoch": 15.046360105249969, "grad_norm": 1.6644482612609863, "learning_rate": 1.755258963344984e-06, "loss": 0.4891, "num_input_tokens_seen": 146032928, "step": 120085 }, { "epoch": 15.046986593158753, "grad_norm": 1.9073474407196045, "learning_rate": 1.7548430250404058e-06, "loss": 0.4184, "num_input_tokens_seen": 146039040, "step": 120090 }, { "epoch": 15.047613081067535, "grad_norm": 2.8322677612304688, "learning_rate": 1.7544271255344063e-06, "loss": 0.4451, "num_input_tokens_seen": 146044672, "step": 120095 }, { "epoch": 15.048239568976319, "grad_norm": 1.7231483459472656, "learning_rate": 1.7540112648319607e-06, "loss": 0.4521, "num_input_tokens_seen": 146050912, "step": 120100 }, { "epoch": 15.048866056885101, "grad_norm": 1.5068426132202148, "learning_rate": 1.7535954429380375e-06, "loss": 0.4384, "num_input_tokens_seen": 146056832, "step": 120105 }, { "epoch": 15.049492544793885, "grad_norm": 3.871913433074951, "learning_rate": 1.7531796598576122e-06, "loss": 0.479, "num_input_tokens_seen": 146063200, "step": 120110 }, { "epoch": 15.05011903270267, "grad_norm": 1.696691632270813, "learning_rate": 1.7527639155956522e-06, "loss": 0.4127, "num_input_tokens_seen": 146069504, "step": 120115 }, { "epoch": 15.050745520611452, "grad_norm": 9.915971755981445, "learning_rate": 1.7523482101571294e-06, "loss": 0.5344, "num_input_tokens_seen": 146075520, "step": 120120 }, { "epoch": 15.051372008520236, "grad_norm": 2.048847198486328, "learning_rate": 1.7519325435470159e-06, "loss": 0.4271, "num_input_tokens_seen": 146081760, "step": 120125 }, { "epoch": 15.051998496429018, "grad_norm": 2.7311601638793945, "learning_rate": 1.7515169157702784e-06, "loss": 0.4882, "num_input_tokens_seen": 146088128, "step": 120130 }, { "epoch": 15.052624984337802, "grad_norm": 2.191387414932251, "learning_rate": 1.7511013268318888e-06, "loss": 0.4217, "num_input_tokens_seen": 146094016, "step": 120135 }, { "epoch": 15.053251472246586, "grad_norm": 1.6991132497787476, "learning_rate": 1.7506857767368125e-06, "loss": 0.4849, "num_input_tokens_seen": 146099840, "step": 120140 }, { "epoch": 15.053877960155368, "grad_norm": 4.591337203979492, "learning_rate": 1.7502702654900217e-06, "loss": 0.4785, "num_input_tokens_seen": 146105824, "step": 120145 }, { "epoch": 15.054504448064153, "grad_norm": 3.602842092514038, "learning_rate": 1.7498547930964798e-06, "loss": 0.4722, "num_input_tokens_seen": 146111840, "step": 120150 }, { "epoch": 15.055130935972937, "grad_norm": 6.485071659088135, "learning_rate": 1.7494393595611569e-06, "loss": 0.4977, "num_input_tokens_seen": 146117920, "step": 120155 }, { "epoch": 15.055757423881719, "grad_norm": 2.0644335746765137, "learning_rate": 1.7490239648890195e-06, "loss": 0.4659, "num_input_tokens_seen": 146123840, "step": 120160 }, { "epoch": 15.056383911790503, "grad_norm": 2.6837446689605713, "learning_rate": 1.7486086090850352e-06, "loss": 0.4738, "num_input_tokens_seen": 146130208, "step": 120165 }, { "epoch": 15.057010399699285, "grad_norm": 2.4778218269348145, "learning_rate": 1.7481932921541673e-06, "loss": 0.4508, "num_input_tokens_seen": 146136416, "step": 120170 }, { "epoch": 15.05763688760807, "grad_norm": 1.7095365524291992, "learning_rate": 1.7477780141013823e-06, "loss": 0.4632, "num_input_tokens_seen": 146142720, "step": 120175 }, { "epoch": 15.058263375516853, "grad_norm": 1.6380844116210938, "learning_rate": 1.747362774931647e-06, "loss": 0.4505, "num_input_tokens_seen": 146148960, "step": 120180 }, { "epoch": 15.058889863425636, "grad_norm": 2.6989352703094482, "learning_rate": 1.7469475746499226e-06, "loss": 0.4239, "num_input_tokens_seen": 146155072, "step": 120185 }, { "epoch": 15.05951635133442, "grad_norm": 2.3775880336761475, "learning_rate": 1.7465324132611772e-06, "loss": 0.4483, "num_input_tokens_seen": 146161152, "step": 120190 }, { "epoch": 15.060142839243202, "grad_norm": 8.691091537475586, "learning_rate": 1.74611729077037e-06, "loss": 0.4578, "num_input_tokens_seen": 146167296, "step": 120195 }, { "epoch": 15.060769327151986, "grad_norm": 3.766667366027832, "learning_rate": 1.745702207182468e-06, "loss": 0.4819, "num_input_tokens_seen": 146173280, "step": 120200 }, { "epoch": 15.06139581506077, "grad_norm": 2.2336716651916504, "learning_rate": 1.745287162502431e-06, "loss": 0.4292, "num_input_tokens_seen": 146179360, "step": 120205 }, { "epoch": 15.062022302969552, "grad_norm": 2.3629331588745117, "learning_rate": 1.7448721567352233e-06, "loss": 0.422, "num_input_tokens_seen": 146185376, "step": 120210 }, { "epoch": 15.062648790878336, "grad_norm": 2.478595018386841, "learning_rate": 1.7444571898858048e-06, "loss": 0.4699, "num_input_tokens_seen": 146191328, "step": 120215 }, { "epoch": 15.063275278787119, "grad_norm": 4.996864318847656, "learning_rate": 1.744042261959139e-06, "loss": 0.4568, "num_input_tokens_seen": 146197440, "step": 120220 }, { "epoch": 15.063901766695903, "grad_norm": 2.295135021209717, "learning_rate": 1.7436273729601837e-06, "loss": 0.4169, "num_input_tokens_seen": 146203744, "step": 120225 }, { "epoch": 15.064528254604687, "grad_norm": 2.0616824626922607, "learning_rate": 1.7432125228939029e-06, "loss": 0.4343, "num_input_tokens_seen": 146209696, "step": 120230 }, { "epoch": 15.065154742513469, "grad_norm": 2.512669086456299, "learning_rate": 1.7427977117652529e-06, "loss": 0.4899, "num_input_tokens_seen": 146216000, "step": 120235 }, { "epoch": 15.065781230422253, "grad_norm": 1.7102279663085938, "learning_rate": 1.7423829395791942e-06, "loss": 0.4737, "num_input_tokens_seen": 146221888, "step": 120240 }, { "epoch": 15.066407718331035, "grad_norm": 19.482812881469727, "learning_rate": 1.7419682063406884e-06, "loss": 0.4666, "num_input_tokens_seen": 146228160, "step": 120245 }, { "epoch": 15.06703420623982, "grad_norm": 1.7714107036590576, "learning_rate": 1.74155351205469e-06, "loss": 0.4195, "num_input_tokens_seen": 146234304, "step": 120250 }, { "epoch": 15.067660694148604, "grad_norm": 5.875267028808594, "learning_rate": 1.7411388567261595e-06, "loss": 0.4845, "num_input_tokens_seen": 146240000, "step": 120255 }, { "epoch": 15.068287182057386, "grad_norm": 6.6841654777526855, "learning_rate": 1.740724240360055e-06, "loss": 0.4556, "num_input_tokens_seen": 146245952, "step": 120260 }, { "epoch": 15.06891366996617, "grad_norm": 2.198983907699585, "learning_rate": 1.740309662961331e-06, "loss": 0.4355, "num_input_tokens_seen": 146251904, "step": 120265 }, { "epoch": 15.069540157874952, "grad_norm": 10.658613204956055, "learning_rate": 1.7398951245349455e-06, "loss": 0.4828, "num_input_tokens_seen": 146258368, "step": 120270 }, { "epoch": 15.070166645783736, "grad_norm": 13.56387710571289, "learning_rate": 1.7394806250858565e-06, "loss": 0.5437, "num_input_tokens_seen": 146264512, "step": 120275 }, { "epoch": 15.07079313369252, "grad_norm": 2.63714861869812, "learning_rate": 1.739066164619016e-06, "loss": 0.4155, "num_input_tokens_seen": 146270656, "step": 120280 }, { "epoch": 15.071419621601303, "grad_norm": 5.908754348754883, "learning_rate": 1.7386517431393828e-06, "loss": 0.455, "num_input_tokens_seen": 146276480, "step": 120285 }, { "epoch": 15.072046109510087, "grad_norm": 2.079460620880127, "learning_rate": 1.7382373606519087e-06, "loss": 0.4382, "num_input_tokens_seen": 146282496, "step": 120290 }, { "epoch": 15.07267259741887, "grad_norm": 10.754142761230469, "learning_rate": 1.7378230171615506e-06, "loss": 0.5473, "num_input_tokens_seen": 146288736, "step": 120295 }, { "epoch": 15.073299085327653, "grad_norm": 2.659515857696533, "learning_rate": 1.7374087126732598e-06, "loss": 0.418, "num_input_tokens_seen": 146295264, "step": 120300 }, { "epoch": 15.073925573236437, "grad_norm": 6.178595542907715, "learning_rate": 1.7369944471919903e-06, "loss": 0.4898, "num_input_tokens_seen": 146301600, "step": 120305 }, { "epoch": 15.07455206114522, "grad_norm": 3.3259174823760986, "learning_rate": 1.7365802207226973e-06, "loss": 0.4717, "num_input_tokens_seen": 146307456, "step": 120310 }, { "epoch": 15.075178549054003, "grad_norm": 1.7861862182617188, "learning_rate": 1.73616603327033e-06, "loss": 0.4409, "num_input_tokens_seen": 146313856, "step": 120315 }, { "epoch": 15.075805036962787, "grad_norm": 8.149994850158691, "learning_rate": 1.7357518848398437e-06, "loss": 0.5505, "num_input_tokens_seen": 146319648, "step": 120320 }, { "epoch": 15.07643152487157, "grad_norm": 1.7656581401824951, "learning_rate": 1.735337775436186e-06, "loss": 0.4163, "num_input_tokens_seen": 146325152, "step": 120325 }, { "epoch": 15.077058012780354, "grad_norm": 5.506532669067383, "learning_rate": 1.7349237050643125e-06, "loss": 0.4655, "num_input_tokens_seen": 146331360, "step": 120330 }, { "epoch": 15.077684500689136, "grad_norm": 15.335862159729004, "learning_rate": 1.7345096737291689e-06, "loss": 0.535, "num_input_tokens_seen": 146337088, "step": 120335 }, { "epoch": 15.07831098859792, "grad_norm": 2.155304431915283, "learning_rate": 1.734095681435709e-06, "loss": 0.4604, "num_input_tokens_seen": 146343136, "step": 120340 }, { "epoch": 15.078937476506704, "grad_norm": 3.038393497467041, "learning_rate": 1.7336817281888797e-06, "loss": 0.4298, "num_input_tokens_seen": 146349600, "step": 120345 }, { "epoch": 15.079563964415486, "grad_norm": 2.2544586658477783, "learning_rate": 1.7332678139936332e-06, "loss": 0.3862, "num_input_tokens_seen": 146355776, "step": 120350 }, { "epoch": 15.08019045232427, "grad_norm": 3.6677443981170654, "learning_rate": 1.7328539388549147e-06, "loss": 0.4527, "num_input_tokens_seen": 146362144, "step": 120355 }, { "epoch": 15.080816940233053, "grad_norm": 4.131664752960205, "learning_rate": 1.7324401027776744e-06, "loss": 0.4993, "num_input_tokens_seen": 146368352, "step": 120360 }, { "epoch": 15.081443428141837, "grad_norm": 2.0377347469329834, "learning_rate": 1.7320263057668601e-06, "loss": 0.487, "num_input_tokens_seen": 146374656, "step": 120365 }, { "epoch": 15.08206991605062, "grad_norm": 2.5538816452026367, "learning_rate": 1.7316125478274188e-06, "loss": 0.4805, "num_input_tokens_seen": 146380608, "step": 120370 }, { "epoch": 15.082696403959403, "grad_norm": 2.485707998275757, "learning_rate": 1.7311988289642983e-06, "loss": 0.5196, "num_input_tokens_seen": 146386816, "step": 120375 }, { "epoch": 15.083322891868187, "grad_norm": 7.02427339553833, "learning_rate": 1.7307851491824435e-06, "loss": 0.4821, "num_input_tokens_seen": 146392672, "step": 120380 }, { "epoch": 15.08394937977697, "grad_norm": 4.201318264007568, "learning_rate": 1.7303715084868017e-06, "loss": 0.5332, "num_input_tokens_seen": 146399008, "step": 120385 }, { "epoch": 15.084575867685754, "grad_norm": 7.469859600067139, "learning_rate": 1.7299579068823163e-06, "loss": 0.4448, "num_input_tokens_seen": 146405312, "step": 120390 }, { "epoch": 15.085202355594538, "grad_norm": 2.3479723930358887, "learning_rate": 1.7295443443739346e-06, "loss": 0.4686, "num_input_tokens_seen": 146411072, "step": 120395 }, { "epoch": 15.08582884350332, "grad_norm": 3.786205291748047, "learning_rate": 1.7291308209665985e-06, "loss": 0.4515, "num_input_tokens_seen": 146417216, "step": 120400 }, { "epoch": 15.086455331412104, "grad_norm": 2.6987431049346924, "learning_rate": 1.7287173366652555e-06, "loss": 0.4412, "num_input_tokens_seen": 146423104, "step": 120405 }, { "epoch": 15.087081819320888, "grad_norm": 1.8608934879302979, "learning_rate": 1.7283038914748446e-06, "loss": 0.4646, "num_input_tokens_seen": 146429088, "step": 120410 }, { "epoch": 15.08770830722967, "grad_norm": 2.6359524726867676, "learning_rate": 1.7278904854003136e-06, "loss": 0.4302, "num_input_tokens_seen": 146435360, "step": 120415 }, { "epoch": 15.088334795138454, "grad_norm": 3.156794548034668, "learning_rate": 1.727477118446601e-06, "loss": 0.4979, "num_input_tokens_seen": 146441504, "step": 120420 }, { "epoch": 15.088961283047237, "grad_norm": 4.324417591094971, "learning_rate": 1.727063790618651e-06, "loss": 0.4771, "num_input_tokens_seen": 146447488, "step": 120425 }, { "epoch": 15.08958777095602, "grad_norm": 2.1730360984802246, "learning_rate": 1.7266505019214063e-06, "loss": 0.4532, "num_input_tokens_seen": 146453664, "step": 120430 }, { "epoch": 15.090214258864805, "grad_norm": 15.415594100952148, "learning_rate": 1.726237252359806e-06, "loss": 0.4671, "num_input_tokens_seen": 146459776, "step": 120435 }, { "epoch": 15.090840746773587, "grad_norm": 2.703399181365967, "learning_rate": 1.7258240419387934e-06, "loss": 0.422, "num_input_tokens_seen": 146465696, "step": 120440 }, { "epoch": 15.091467234682371, "grad_norm": 2.0493674278259277, "learning_rate": 1.725410870663305e-06, "loss": 0.4584, "num_input_tokens_seen": 146471616, "step": 120445 }, { "epoch": 15.092093722591153, "grad_norm": 1.7649431228637695, "learning_rate": 1.7249977385382844e-06, "loss": 0.4292, "num_input_tokens_seen": 146477664, "step": 120450 }, { "epoch": 15.092720210499937, "grad_norm": 5.123475074768066, "learning_rate": 1.7245846455686681e-06, "loss": 0.4715, "num_input_tokens_seen": 146484000, "step": 120455 }, { "epoch": 15.093346698408721, "grad_norm": 4.190038681030273, "learning_rate": 1.724171591759396e-06, "loss": 0.462, "num_input_tokens_seen": 146489952, "step": 120460 }, { "epoch": 15.093973186317504, "grad_norm": 2.7447080612182617, "learning_rate": 1.7237585771154081e-06, "loss": 0.4334, "num_input_tokens_seen": 146496384, "step": 120465 }, { "epoch": 15.094599674226288, "grad_norm": 1.6361476182937622, "learning_rate": 1.7233456016416395e-06, "loss": 0.4495, "num_input_tokens_seen": 146502368, "step": 120470 }, { "epoch": 15.09522616213507, "grad_norm": 2.449697971343994, "learning_rate": 1.722932665343029e-06, "loss": 0.4098, "num_input_tokens_seen": 146508512, "step": 120475 }, { "epoch": 15.095852650043854, "grad_norm": 2.697960376739502, "learning_rate": 1.7225197682245153e-06, "loss": 0.4672, "num_input_tokens_seen": 146514272, "step": 120480 }, { "epoch": 15.096479137952638, "grad_norm": 3.504695177078247, "learning_rate": 1.7221069102910321e-06, "loss": 0.4078, "num_input_tokens_seen": 146520480, "step": 120485 }, { "epoch": 15.09710562586142, "grad_norm": 2.069791316986084, "learning_rate": 1.7216940915475162e-06, "loss": 0.4227, "num_input_tokens_seen": 146526432, "step": 120490 }, { "epoch": 15.097732113770205, "grad_norm": 7.24624490737915, "learning_rate": 1.7212813119989053e-06, "loss": 0.4589, "num_input_tokens_seen": 146532352, "step": 120495 }, { "epoch": 15.098358601678987, "grad_norm": 9.886149406433105, "learning_rate": 1.7208685716501316e-06, "loss": 0.5475, "num_input_tokens_seen": 146538400, "step": 120500 }, { "epoch": 15.09898508958777, "grad_norm": 2.470038890838623, "learning_rate": 1.7204558705061326e-06, "loss": 0.4362, "num_input_tokens_seen": 146544352, "step": 120505 }, { "epoch": 15.099611577496555, "grad_norm": 1.9754971265792847, "learning_rate": 1.7200432085718394e-06, "loss": 0.4544, "num_input_tokens_seen": 146550688, "step": 120510 }, { "epoch": 15.100238065405337, "grad_norm": 2.0315170288085938, "learning_rate": 1.7196305858521883e-06, "loss": 0.4637, "num_input_tokens_seen": 146556576, "step": 120515 }, { "epoch": 15.100864553314121, "grad_norm": 2.838456392288208, "learning_rate": 1.7192180023521104e-06, "loss": 0.421, "num_input_tokens_seen": 146562784, "step": 120520 }, { "epoch": 15.101491041222904, "grad_norm": 2.3127963542938232, "learning_rate": 1.7188054580765418e-06, "loss": 0.4904, "num_input_tokens_seen": 146569312, "step": 120525 }, { "epoch": 15.102117529131688, "grad_norm": 2.0863821506500244, "learning_rate": 1.71839295303041e-06, "loss": 0.4675, "num_input_tokens_seen": 146574752, "step": 120530 }, { "epoch": 15.102744017040472, "grad_norm": 15.37965202331543, "learning_rate": 1.7179804872186518e-06, "loss": 0.4396, "num_input_tokens_seen": 146581024, "step": 120535 }, { "epoch": 15.103370504949254, "grad_norm": 6.218201160430908, "learning_rate": 1.7175680606461943e-06, "loss": 0.5222, "num_input_tokens_seen": 146587456, "step": 120540 }, { "epoch": 15.103996992858038, "grad_norm": 6.428837299346924, "learning_rate": 1.7171556733179701e-06, "loss": 0.47, "num_input_tokens_seen": 146593248, "step": 120545 }, { "epoch": 15.104623480766822, "grad_norm": 2.756892681121826, "learning_rate": 1.7167433252389122e-06, "loss": 0.4372, "num_input_tokens_seen": 146599552, "step": 120550 }, { "epoch": 15.105249968675604, "grad_norm": 3.2557015419006348, "learning_rate": 1.7163310164139463e-06, "loss": 0.4307, "num_input_tokens_seen": 146605888, "step": 120555 }, { "epoch": 15.105876456584388, "grad_norm": 2.48475980758667, "learning_rate": 1.7159187468480053e-06, "loss": 0.4324, "num_input_tokens_seen": 146611616, "step": 120560 }, { "epoch": 15.10650294449317, "grad_norm": 3.1044623851776123, "learning_rate": 1.715506516546015e-06, "loss": 0.4672, "num_input_tokens_seen": 146617728, "step": 120565 }, { "epoch": 15.107129432401955, "grad_norm": 3.1668622493743896, "learning_rate": 1.715094325512906e-06, "loss": 0.5658, "num_input_tokens_seen": 146623872, "step": 120570 }, { "epoch": 15.107755920310739, "grad_norm": 3.195918083190918, "learning_rate": 1.7146821737536062e-06, "loss": 0.4672, "num_input_tokens_seen": 146629728, "step": 120575 }, { "epoch": 15.108382408219521, "grad_norm": 8.200042724609375, "learning_rate": 1.714270061273045e-06, "loss": 0.4445, "num_input_tokens_seen": 146636000, "step": 120580 }, { "epoch": 15.109008896128305, "grad_norm": 2.732604742050171, "learning_rate": 1.7138579880761458e-06, "loss": 0.4469, "num_input_tokens_seen": 146641920, "step": 120585 }, { "epoch": 15.109635384037087, "grad_norm": 6.253906726837158, "learning_rate": 1.713445954167839e-06, "loss": 0.5449, "num_input_tokens_seen": 146648224, "step": 120590 }, { "epoch": 15.110261871945871, "grad_norm": 2.3563976287841797, "learning_rate": 1.713033959553047e-06, "loss": 0.4637, "num_input_tokens_seen": 146654240, "step": 120595 }, { "epoch": 15.110888359854656, "grad_norm": 2.8234028816223145, "learning_rate": 1.7126220042366992e-06, "loss": 0.4549, "num_input_tokens_seen": 146660512, "step": 120600 }, { "epoch": 15.111514847763438, "grad_norm": 6.649415016174316, "learning_rate": 1.712210088223718e-06, "loss": 0.4948, "num_input_tokens_seen": 146666880, "step": 120605 }, { "epoch": 15.112141335672222, "grad_norm": 2.312891721725464, "learning_rate": 1.711798211519029e-06, "loss": 0.4862, "num_input_tokens_seen": 146672480, "step": 120610 }, { "epoch": 15.112767823581004, "grad_norm": 2.801680564880371, "learning_rate": 1.7113863741275589e-06, "loss": 0.4466, "num_input_tokens_seen": 146678560, "step": 120615 }, { "epoch": 15.113394311489788, "grad_norm": 1.8268601894378662, "learning_rate": 1.7109745760542278e-06, "loss": 0.4234, "num_input_tokens_seen": 146684640, "step": 120620 }, { "epoch": 15.114020799398572, "grad_norm": 2.4588024616241455, "learning_rate": 1.7105628173039623e-06, "loss": 0.455, "num_input_tokens_seen": 146690624, "step": 120625 }, { "epoch": 15.114647287307355, "grad_norm": 16.132665634155273, "learning_rate": 1.7101510978816826e-06, "loss": 0.495, "num_input_tokens_seen": 146697152, "step": 120630 }, { "epoch": 15.115273775216139, "grad_norm": 8.718146324157715, "learning_rate": 1.7097394177923143e-06, "loss": 0.4563, "num_input_tokens_seen": 146703168, "step": 120635 }, { "epoch": 15.11590026312492, "grad_norm": 6.498722553253174, "learning_rate": 1.7093277770407757e-06, "loss": 0.4288, "num_input_tokens_seen": 146709312, "step": 120640 }, { "epoch": 15.116526751033705, "grad_norm": 5.681334495544434, "learning_rate": 1.7089161756319916e-06, "loss": 0.438, "num_input_tokens_seen": 146714304, "step": 120645 }, { "epoch": 15.117153238942489, "grad_norm": 1.9802794456481934, "learning_rate": 1.7085046135708804e-06, "loss": 0.5112, "num_input_tokens_seen": 146720032, "step": 120650 }, { "epoch": 15.117779726851271, "grad_norm": 1.7585896253585815, "learning_rate": 1.7080930908623654e-06, "loss": 0.5048, "num_input_tokens_seen": 146726016, "step": 120655 }, { "epoch": 15.118406214760055, "grad_norm": 2.650175094604492, "learning_rate": 1.7076816075113634e-06, "loss": 0.4031, "num_input_tokens_seen": 146731968, "step": 120660 }, { "epoch": 15.11903270266884, "grad_norm": 2.525193214416504, "learning_rate": 1.7072701635227955e-06, "loss": 0.4368, "num_input_tokens_seen": 146737984, "step": 120665 }, { "epoch": 15.119659190577622, "grad_norm": 2.1426632404327393, "learning_rate": 1.7068587589015817e-06, "loss": 0.4525, "num_input_tokens_seen": 146744064, "step": 120670 }, { "epoch": 15.120285678486406, "grad_norm": 4.041459083557129, "learning_rate": 1.7064473936526416e-06, "loss": 0.4512, "num_input_tokens_seen": 146749888, "step": 120675 }, { "epoch": 15.120912166395188, "grad_norm": 14.544669151306152, "learning_rate": 1.7060360677808906e-06, "loss": 0.4657, "num_input_tokens_seen": 146755744, "step": 120680 }, { "epoch": 15.121538654303972, "grad_norm": 3.1574106216430664, "learning_rate": 1.7056247812912474e-06, "loss": 0.4216, "num_input_tokens_seen": 146762112, "step": 120685 }, { "epoch": 15.122165142212756, "grad_norm": 11.202071189880371, "learning_rate": 1.7052135341886317e-06, "loss": 0.4494, "num_input_tokens_seen": 146767840, "step": 120690 }, { "epoch": 15.122791630121538, "grad_norm": 2.823606252670288, "learning_rate": 1.7048023264779567e-06, "loss": 0.4586, "num_input_tokens_seen": 146774304, "step": 120695 }, { "epoch": 15.123418118030322, "grad_norm": 3.68876314163208, "learning_rate": 1.7043911581641415e-06, "loss": 0.4501, "num_input_tokens_seen": 146779904, "step": 120700 }, { "epoch": 15.124044605939105, "grad_norm": 2.310844898223877, "learning_rate": 1.7039800292520997e-06, "loss": 0.4865, "num_input_tokens_seen": 146786240, "step": 120705 }, { "epoch": 15.124671093847889, "grad_norm": 3.2817587852478027, "learning_rate": 1.7035689397467493e-06, "loss": 0.4289, "num_input_tokens_seen": 146792448, "step": 120710 }, { "epoch": 15.125297581756673, "grad_norm": 3.2870254516601562, "learning_rate": 1.7031578896530015e-06, "loss": 0.4605, "num_input_tokens_seen": 146798528, "step": 120715 }, { "epoch": 15.125924069665455, "grad_norm": 6.65792989730835, "learning_rate": 1.702746878975775e-06, "loss": 0.5355, "num_input_tokens_seen": 146804640, "step": 120720 }, { "epoch": 15.12655055757424, "grad_norm": 6.8982648849487305, "learning_rate": 1.7023359077199802e-06, "loss": 0.4729, "num_input_tokens_seen": 146811136, "step": 120725 }, { "epoch": 15.127177045483021, "grad_norm": 16.400768280029297, "learning_rate": 1.7019249758905321e-06, "loss": 0.4998, "num_input_tokens_seen": 146817216, "step": 120730 }, { "epoch": 15.127803533391806, "grad_norm": 2.799560785293579, "learning_rate": 1.7015140834923454e-06, "loss": 0.5265, "num_input_tokens_seen": 146823552, "step": 120735 }, { "epoch": 15.12843002130059, "grad_norm": 2.0313162803649902, "learning_rate": 1.7011032305303294e-06, "loss": 0.4761, "num_input_tokens_seen": 146829664, "step": 120740 }, { "epoch": 15.129056509209372, "grad_norm": 2.221721887588501, "learning_rate": 1.7006924170093997e-06, "loss": 0.4379, "num_input_tokens_seen": 146835840, "step": 120745 }, { "epoch": 15.129682997118156, "grad_norm": 8.885693550109863, "learning_rate": 1.7002816429344636e-06, "loss": 0.5069, "num_input_tokens_seen": 146842176, "step": 120750 }, { "epoch": 15.130309485026938, "grad_norm": 2.2810287475585938, "learning_rate": 1.6998709083104375e-06, "loss": 0.4343, "num_input_tokens_seen": 146848384, "step": 120755 }, { "epoch": 15.130935972935722, "grad_norm": 2.616027593612671, "learning_rate": 1.6994602131422272e-06, "loss": 0.4808, "num_input_tokens_seen": 146854592, "step": 120760 }, { "epoch": 15.131562460844506, "grad_norm": 1.4754884243011475, "learning_rate": 1.6990495574347465e-06, "loss": 0.4761, "num_input_tokens_seen": 146860224, "step": 120765 }, { "epoch": 15.132188948753289, "grad_norm": 1.862342119216919, "learning_rate": 1.6986389411929016e-06, "loss": 0.4318, "num_input_tokens_seen": 146866016, "step": 120770 }, { "epoch": 15.132815436662073, "grad_norm": 3.099273681640625, "learning_rate": 1.6982283644216046e-06, "loss": 0.4149, "num_input_tokens_seen": 146871712, "step": 120775 }, { "epoch": 15.133441924570855, "grad_norm": 12.8973388671875, "learning_rate": 1.6978178271257628e-06, "loss": 0.5014, "num_input_tokens_seen": 146877184, "step": 120780 }, { "epoch": 15.134068412479639, "grad_norm": 3.7991766929626465, "learning_rate": 1.6974073293102873e-06, "loss": 0.4311, "num_input_tokens_seen": 146883488, "step": 120785 }, { "epoch": 15.134694900388423, "grad_norm": 6.128881454467773, "learning_rate": 1.6969968709800822e-06, "loss": 0.4354, "num_input_tokens_seen": 146889792, "step": 120790 }, { "epoch": 15.135321388297205, "grad_norm": 1.4233548641204834, "learning_rate": 1.696586452140056e-06, "loss": 0.4715, "num_input_tokens_seen": 146896128, "step": 120795 }, { "epoch": 15.13594787620599, "grad_norm": 2.461535930633545, "learning_rate": 1.6961760727951183e-06, "loss": 0.4275, "num_input_tokens_seen": 146902144, "step": 120800 }, { "epoch": 15.136574364114773, "grad_norm": 3.045403242111206, "learning_rate": 1.6957657329501714e-06, "loss": 0.4497, "num_input_tokens_seen": 146908320, "step": 120805 }, { "epoch": 15.137200852023556, "grad_norm": 5.69305944442749, "learning_rate": 1.695355432610125e-06, "loss": 0.4592, "num_input_tokens_seen": 146914656, "step": 120810 }, { "epoch": 15.13782733993234, "grad_norm": 2.3702449798583984, "learning_rate": 1.694945171779881e-06, "loss": 0.4364, "num_input_tokens_seen": 146920992, "step": 120815 }, { "epoch": 15.138453827841122, "grad_norm": 2.4889872074127197, "learning_rate": 1.6945349504643483e-06, "loss": 0.5067, "num_input_tokens_seen": 146926912, "step": 120820 }, { "epoch": 15.139080315749906, "grad_norm": 17.321523666381836, "learning_rate": 1.694124768668427e-06, "loss": 0.519, "num_input_tokens_seen": 146932992, "step": 120825 }, { "epoch": 15.13970680365869, "grad_norm": 5.187906265258789, "learning_rate": 1.6937146263970256e-06, "loss": 0.4569, "num_input_tokens_seen": 146939296, "step": 120830 }, { "epoch": 15.140333291567472, "grad_norm": 3.564221143722534, "learning_rate": 1.6933045236550438e-06, "loss": 0.4774, "num_input_tokens_seen": 146945248, "step": 120835 }, { "epoch": 15.140959779476256, "grad_norm": 2.5786027908325195, "learning_rate": 1.6928944604473886e-06, "loss": 0.4124, "num_input_tokens_seen": 146951264, "step": 120840 }, { "epoch": 15.141586267385039, "grad_norm": 3.4260947704315186, "learning_rate": 1.692484436778959e-06, "loss": 0.4499, "num_input_tokens_seen": 146957344, "step": 120845 }, { "epoch": 15.142212755293823, "grad_norm": 4.7384562492370605, "learning_rate": 1.6920744526546596e-06, "loss": 0.4787, "num_input_tokens_seen": 146963584, "step": 120850 }, { "epoch": 15.142839243202607, "grad_norm": 1.8766286373138428, "learning_rate": 1.6916645080793903e-06, "loss": 0.4739, "num_input_tokens_seen": 146969248, "step": 120855 }, { "epoch": 15.14346573111139, "grad_norm": 3.481369972229004, "learning_rate": 1.6912546030580529e-06, "loss": 0.4478, "num_input_tokens_seen": 146974976, "step": 120860 }, { "epoch": 15.144092219020173, "grad_norm": 3.3930749893188477, "learning_rate": 1.6908447375955505e-06, "loss": 0.4418, "num_input_tokens_seen": 146981280, "step": 120865 }, { "epoch": 15.144718706928955, "grad_norm": 2.849465847015381, "learning_rate": 1.6904349116967795e-06, "loss": 0.4862, "num_input_tokens_seen": 146987392, "step": 120870 }, { "epoch": 15.14534519483774, "grad_norm": 3.3405532836914062, "learning_rate": 1.6900251253666421e-06, "loss": 0.4537, "num_input_tokens_seen": 146993440, "step": 120875 }, { "epoch": 15.145971682746524, "grad_norm": 7.971585273742676, "learning_rate": 1.6896153786100384e-06, "loss": 0.4667, "num_input_tokens_seen": 146999040, "step": 120880 }, { "epoch": 15.146598170655306, "grad_norm": 5.8466949462890625, "learning_rate": 1.6892056714318644e-06, "loss": 0.4246, "num_input_tokens_seen": 147005120, "step": 120885 }, { "epoch": 15.14722465856409, "grad_norm": 2.629112958908081, "learning_rate": 1.6887960038370204e-06, "loss": 0.4352, "num_input_tokens_seen": 147010752, "step": 120890 }, { "epoch": 15.147851146472872, "grad_norm": 2.145923614501953, "learning_rate": 1.688386375830406e-06, "loss": 0.4332, "num_input_tokens_seen": 147016032, "step": 120895 }, { "epoch": 15.148477634381656, "grad_norm": 2.488151788711548, "learning_rate": 1.6879767874169144e-06, "loss": 0.4376, "num_input_tokens_seen": 147021920, "step": 120900 }, { "epoch": 15.14910412229044, "grad_norm": 6.595211029052734, "learning_rate": 1.6875672386014475e-06, "loss": 0.5637, "num_input_tokens_seen": 147027808, "step": 120905 }, { "epoch": 15.149730610199223, "grad_norm": 4.034656047821045, "learning_rate": 1.687157729388897e-06, "loss": 0.4213, "num_input_tokens_seen": 147034048, "step": 120910 }, { "epoch": 15.150357098108007, "grad_norm": 6.358089923858643, "learning_rate": 1.6867482597841623e-06, "loss": 0.4608, "num_input_tokens_seen": 147040096, "step": 120915 }, { "epoch": 15.15098358601679, "grad_norm": 20.696237564086914, "learning_rate": 1.6863388297921367e-06, "loss": 0.4915, "num_input_tokens_seen": 147046368, "step": 120920 }, { "epoch": 15.151610073925573, "grad_norm": 2.6942505836486816, "learning_rate": 1.6859294394177162e-06, "loss": 0.4512, "num_input_tokens_seen": 147052416, "step": 120925 }, { "epoch": 15.152236561834357, "grad_norm": 2.8952043056488037, "learning_rate": 1.6855200886657974e-06, "loss": 0.4422, "num_input_tokens_seen": 147058272, "step": 120930 }, { "epoch": 15.15286304974314, "grad_norm": 3.5126140117645264, "learning_rate": 1.6851107775412706e-06, "loss": 0.4368, "num_input_tokens_seen": 147064256, "step": 120935 }, { "epoch": 15.153489537651923, "grad_norm": 2.198258638381958, "learning_rate": 1.6847015060490336e-06, "loss": 0.4308, "num_input_tokens_seen": 147070432, "step": 120940 }, { "epoch": 15.154116025560707, "grad_norm": 2.4025051593780518, "learning_rate": 1.6842922741939755e-06, "loss": 0.5176, "num_input_tokens_seen": 147076512, "step": 120945 }, { "epoch": 15.15474251346949, "grad_norm": 2.770826816558838, "learning_rate": 1.6838830819809925e-06, "loss": 0.4717, "num_input_tokens_seen": 147082528, "step": 120950 }, { "epoch": 15.155369001378274, "grad_norm": 11.189336776733398, "learning_rate": 1.6834739294149738e-06, "loss": 0.5005, "num_input_tokens_seen": 147088384, "step": 120955 }, { "epoch": 15.155995489287056, "grad_norm": 4.0303144454956055, "learning_rate": 1.6830648165008145e-06, "loss": 0.4574, "num_input_tokens_seen": 147094656, "step": 120960 }, { "epoch": 15.15662197719584, "grad_norm": 2.2101404666900635, "learning_rate": 1.682655743243402e-06, "loss": 0.4338, "num_input_tokens_seen": 147100800, "step": 120965 }, { "epoch": 15.157248465104624, "grad_norm": 2.3639121055603027, "learning_rate": 1.6822467096476296e-06, "loss": 0.4395, "num_input_tokens_seen": 147107104, "step": 120970 }, { "epoch": 15.157874953013406, "grad_norm": 5.236638069152832, "learning_rate": 1.6818377157183886e-06, "loss": 0.4613, "num_input_tokens_seen": 147113248, "step": 120975 }, { "epoch": 15.15850144092219, "grad_norm": 5.9536566734313965, "learning_rate": 1.6814287614605656e-06, "loss": 0.4477, "num_input_tokens_seen": 147119456, "step": 120980 }, { "epoch": 15.159127928830973, "grad_norm": 2.522437810897827, "learning_rate": 1.6810198468790524e-06, "loss": 0.4288, "num_input_tokens_seen": 147125568, "step": 120985 }, { "epoch": 15.159754416739757, "grad_norm": 10.646470069885254, "learning_rate": 1.6806109719787368e-06, "loss": 0.4783, "num_input_tokens_seen": 147131776, "step": 120990 }, { "epoch": 15.160380904648541, "grad_norm": 4.095200538635254, "learning_rate": 1.6802021367645099e-06, "loss": 0.4776, "num_input_tokens_seen": 147137504, "step": 120995 }, { "epoch": 15.161007392557323, "grad_norm": 14.736533164978027, "learning_rate": 1.6797933412412554e-06, "loss": 0.4768, "num_input_tokens_seen": 147144224, "step": 121000 }, { "epoch": 15.161633880466107, "grad_norm": 3.116600751876831, "learning_rate": 1.6793845854138652e-06, "loss": 0.468, "num_input_tokens_seen": 147150208, "step": 121005 }, { "epoch": 15.16226036837489, "grad_norm": 1.8704787492752075, "learning_rate": 1.678975869287222e-06, "loss": 0.5085, "num_input_tokens_seen": 147156160, "step": 121010 }, { "epoch": 15.162886856283674, "grad_norm": 2.3994176387786865, "learning_rate": 1.6785671928662167e-06, "loss": 0.4313, "num_input_tokens_seen": 147162432, "step": 121015 }, { "epoch": 15.163513344192458, "grad_norm": 11.878377914428711, "learning_rate": 1.6781585561557307e-06, "loss": 0.5843, "num_input_tokens_seen": 147168512, "step": 121020 }, { "epoch": 15.16413983210124, "grad_norm": 5.616515636444092, "learning_rate": 1.6777499591606543e-06, "loss": 0.4202, "num_input_tokens_seen": 147175104, "step": 121025 }, { "epoch": 15.164766320010024, "grad_norm": 2.936584234237671, "learning_rate": 1.6773414018858685e-06, "loss": 0.4365, "num_input_tokens_seen": 147181504, "step": 121030 }, { "epoch": 15.165392807918806, "grad_norm": 15.193991661071777, "learning_rate": 1.6769328843362615e-06, "loss": 0.5906, "num_input_tokens_seen": 147187616, "step": 121035 }, { "epoch": 15.16601929582759, "grad_norm": 1.8574166297912598, "learning_rate": 1.6765244065167136e-06, "loss": 0.4089, "num_input_tokens_seen": 147193728, "step": 121040 }, { "epoch": 15.166645783736374, "grad_norm": 2.2883901596069336, "learning_rate": 1.676115968432111e-06, "loss": 0.4413, "num_input_tokens_seen": 147199808, "step": 121045 }, { "epoch": 15.167272271645157, "grad_norm": 4.161973476409912, "learning_rate": 1.6757075700873382e-06, "loss": 0.4638, "num_input_tokens_seen": 147206176, "step": 121050 }, { "epoch": 15.16789875955394, "grad_norm": 2.3274128437042236, "learning_rate": 1.6752992114872746e-06, "loss": 0.4977, "num_input_tokens_seen": 147212352, "step": 121055 }, { "epoch": 15.168525247462725, "grad_norm": 3.9991157054901123, "learning_rate": 1.6748908926368062e-06, "loss": 0.4412, "num_input_tokens_seen": 147218112, "step": 121060 }, { "epoch": 15.169151735371507, "grad_norm": 2.0817322731018066, "learning_rate": 1.674482613540811e-06, "loss": 0.4193, "num_input_tokens_seen": 147223968, "step": 121065 }, { "epoch": 15.169778223280291, "grad_norm": 1.9035298824310303, "learning_rate": 1.6740743742041733e-06, "loss": 0.5468, "num_input_tokens_seen": 147230144, "step": 121070 }, { "epoch": 15.170404711189073, "grad_norm": 2.4315378665924072, "learning_rate": 1.6736661746317712e-06, "loss": 0.4416, "num_input_tokens_seen": 147236096, "step": 121075 }, { "epoch": 15.171031199097857, "grad_norm": 2.354358434677124, "learning_rate": 1.6732580148284873e-06, "loss": 0.415, "num_input_tokens_seen": 147242048, "step": 121080 }, { "epoch": 15.171657687006642, "grad_norm": 3.62467098236084, "learning_rate": 1.6728498947992017e-06, "loss": 0.4286, "num_input_tokens_seen": 147248128, "step": 121085 }, { "epoch": 15.172284174915424, "grad_norm": 1.9697678089141846, "learning_rate": 1.6724418145487913e-06, "loss": 0.433, "num_input_tokens_seen": 147254400, "step": 121090 }, { "epoch": 15.172910662824208, "grad_norm": 9.148845672607422, "learning_rate": 1.672033774082137e-06, "loss": 0.4522, "num_input_tokens_seen": 147260128, "step": 121095 }, { "epoch": 15.17353715073299, "grad_norm": 2.0899574756622314, "learning_rate": 1.6716257734041186e-06, "loss": 0.4496, "num_input_tokens_seen": 147266176, "step": 121100 }, { "epoch": 15.174163638641774, "grad_norm": 2.548943042755127, "learning_rate": 1.6712178125196105e-06, "loss": 0.4691, "num_input_tokens_seen": 147272224, "step": 121105 }, { "epoch": 15.174790126550558, "grad_norm": 6.536190509796143, "learning_rate": 1.6708098914334926e-06, "loss": 0.4762, "num_input_tokens_seen": 147278400, "step": 121110 }, { "epoch": 15.17541661445934, "grad_norm": 2.096407413482666, "learning_rate": 1.6704020101506425e-06, "loss": 0.4092, "num_input_tokens_seen": 147284352, "step": 121115 }, { "epoch": 15.176043102368125, "grad_norm": 2.4426589012145996, "learning_rate": 1.6699941686759341e-06, "loss": 0.4826, "num_input_tokens_seen": 147290496, "step": 121120 }, { "epoch": 15.176669590276907, "grad_norm": 4.222733974456787, "learning_rate": 1.6695863670142476e-06, "loss": 0.5107, "num_input_tokens_seen": 147296640, "step": 121125 }, { "epoch": 15.177296078185691, "grad_norm": 2.4232676029205322, "learning_rate": 1.669178605170454e-06, "loss": 0.4571, "num_input_tokens_seen": 147302816, "step": 121130 }, { "epoch": 15.177922566094475, "grad_norm": 6.697182655334473, "learning_rate": 1.668770883149432e-06, "loss": 0.5113, "num_input_tokens_seen": 147309024, "step": 121135 }, { "epoch": 15.178549054003257, "grad_norm": 2.634202718734741, "learning_rate": 1.668363200956053e-06, "loss": 0.45, "num_input_tokens_seen": 147315040, "step": 121140 }, { "epoch": 15.179175541912041, "grad_norm": 2.6329431533813477, "learning_rate": 1.6679555585951951e-06, "loss": 0.4606, "num_input_tokens_seen": 147320480, "step": 121145 }, { "epoch": 15.179802029820824, "grad_norm": 2.263903856277466, "learning_rate": 1.6675479560717285e-06, "loss": 0.466, "num_input_tokens_seen": 147326624, "step": 121150 }, { "epoch": 15.180428517729608, "grad_norm": 2.8087480068206787, "learning_rate": 1.667140393390529e-06, "loss": 0.4626, "num_input_tokens_seen": 147332640, "step": 121155 }, { "epoch": 15.181055005638392, "grad_norm": 1.7905954122543335, "learning_rate": 1.6667328705564666e-06, "loss": 0.5083, "num_input_tokens_seen": 147338432, "step": 121160 }, { "epoch": 15.181681493547174, "grad_norm": 8.474937438964844, "learning_rate": 1.6663253875744157e-06, "loss": 0.4681, "num_input_tokens_seen": 147344416, "step": 121165 }, { "epoch": 15.182307981455958, "grad_norm": 2.1100752353668213, "learning_rate": 1.665917944449249e-06, "loss": 0.457, "num_input_tokens_seen": 147350496, "step": 121170 }, { "epoch": 15.182934469364742, "grad_norm": 7.714465618133545, "learning_rate": 1.6655105411858346e-06, "loss": 0.4465, "num_input_tokens_seen": 147356608, "step": 121175 }, { "epoch": 15.183560957273524, "grad_norm": 3.0040440559387207, "learning_rate": 1.665103177789047e-06, "loss": 0.4669, "num_input_tokens_seen": 147362656, "step": 121180 }, { "epoch": 15.184187445182308, "grad_norm": 1.8580474853515625, "learning_rate": 1.6646958542637526e-06, "loss": 0.4519, "num_input_tokens_seen": 147369024, "step": 121185 }, { "epoch": 15.18481393309109, "grad_norm": 2.711052417755127, "learning_rate": 1.6642885706148237e-06, "loss": 0.5126, "num_input_tokens_seen": 147375360, "step": 121190 }, { "epoch": 15.185440420999875, "grad_norm": 3.356722354888916, "learning_rate": 1.6638813268471298e-06, "loss": 0.4587, "num_input_tokens_seen": 147381632, "step": 121195 }, { "epoch": 15.186066908908659, "grad_norm": 2.501582145690918, "learning_rate": 1.663474122965541e-06, "loss": 0.4274, "num_input_tokens_seen": 147387616, "step": 121200 }, { "epoch": 15.186693396817441, "grad_norm": 9.110184669494629, "learning_rate": 1.6630669589749226e-06, "loss": 0.4976, "num_input_tokens_seen": 147393440, "step": 121205 }, { "epoch": 15.187319884726225, "grad_norm": 4.073011875152588, "learning_rate": 1.6626598348801453e-06, "loss": 0.4624, "num_input_tokens_seen": 147399616, "step": 121210 }, { "epoch": 15.187946372635007, "grad_norm": 2.3660619258880615, "learning_rate": 1.6622527506860747e-06, "loss": 0.431, "num_input_tokens_seen": 147405824, "step": 121215 }, { "epoch": 15.188572860543792, "grad_norm": 4.283559799194336, "learning_rate": 1.6618457063975802e-06, "loss": 0.5032, "num_input_tokens_seen": 147411904, "step": 121220 }, { "epoch": 15.189199348452576, "grad_norm": 4.604049205780029, "learning_rate": 1.661438702019525e-06, "loss": 0.471, "num_input_tokens_seen": 147418016, "step": 121225 }, { "epoch": 15.189825836361358, "grad_norm": 15.715550422668457, "learning_rate": 1.6610317375567769e-06, "loss": 0.4573, "num_input_tokens_seen": 147424128, "step": 121230 }, { "epoch": 15.190452324270142, "grad_norm": 1.5256296396255493, "learning_rate": 1.6606248130142034e-06, "loss": 0.4695, "num_input_tokens_seen": 147430176, "step": 121235 }, { "epoch": 15.191078812178924, "grad_norm": 2.3233683109283447, "learning_rate": 1.6602179283966662e-06, "loss": 0.4243, "num_input_tokens_seen": 147436608, "step": 121240 }, { "epoch": 15.191705300087708, "grad_norm": 6.510334491729736, "learning_rate": 1.6598110837090338e-06, "loss": 0.4578, "num_input_tokens_seen": 147443040, "step": 121245 }, { "epoch": 15.192331787996492, "grad_norm": 4.307237148284912, "learning_rate": 1.6594042789561653e-06, "loss": 0.499, "num_input_tokens_seen": 147448928, "step": 121250 }, { "epoch": 15.192958275905275, "grad_norm": 3.075024366378784, "learning_rate": 1.6589975141429298e-06, "loss": 0.4337, "num_input_tokens_seen": 147455264, "step": 121255 }, { "epoch": 15.193584763814059, "grad_norm": 4.075333118438721, "learning_rate": 1.6585907892741865e-06, "loss": 0.4886, "num_input_tokens_seen": 147461632, "step": 121260 }, { "epoch": 15.194211251722841, "grad_norm": 16.292438507080078, "learning_rate": 1.6581841043548007e-06, "loss": 0.5046, "num_input_tokens_seen": 147467616, "step": 121265 }, { "epoch": 15.194837739631625, "grad_norm": 12.51984691619873, "learning_rate": 1.6577774593896323e-06, "loss": 0.5242, "num_input_tokens_seen": 147473856, "step": 121270 }, { "epoch": 15.195464227540409, "grad_norm": 6.831933498382568, "learning_rate": 1.657370854383546e-06, "loss": 0.4691, "num_input_tokens_seen": 147479584, "step": 121275 }, { "epoch": 15.196090715449191, "grad_norm": 2.117739677429199, "learning_rate": 1.6569642893413995e-06, "loss": 0.441, "num_input_tokens_seen": 147485824, "step": 121280 }, { "epoch": 15.196717203357975, "grad_norm": 10.84305191040039, "learning_rate": 1.656557764268056e-06, "loss": 0.444, "num_input_tokens_seen": 147491648, "step": 121285 }, { "epoch": 15.19734369126676, "grad_norm": 11.15491008758545, "learning_rate": 1.6561512791683748e-06, "loss": 0.5142, "num_input_tokens_seen": 147497440, "step": 121290 }, { "epoch": 15.197970179175542, "grad_norm": 2.202636480331421, "learning_rate": 1.6557448340472188e-06, "loss": 0.4085, "num_input_tokens_seen": 147503936, "step": 121295 }, { "epoch": 15.198596667084326, "grad_norm": 2.161949396133423, "learning_rate": 1.6553384289094427e-06, "loss": 0.4596, "num_input_tokens_seen": 147510208, "step": 121300 }, { "epoch": 15.199223154993108, "grad_norm": 2.3359005451202393, "learning_rate": 1.6549320637599082e-06, "loss": 0.4616, "num_input_tokens_seen": 147516448, "step": 121305 }, { "epoch": 15.199849642901892, "grad_norm": 4.085965633392334, "learning_rate": 1.6545257386034747e-06, "loss": 0.5063, "num_input_tokens_seen": 147522656, "step": 121310 }, { "epoch": 15.200476130810676, "grad_norm": 2.2268669605255127, "learning_rate": 1.6541194534449971e-06, "loss": 0.4418, "num_input_tokens_seen": 147528416, "step": 121315 }, { "epoch": 15.201102618719458, "grad_norm": 6.7278151512146, "learning_rate": 1.6537132082893359e-06, "loss": 0.4741, "num_input_tokens_seen": 147534432, "step": 121320 }, { "epoch": 15.201729106628243, "grad_norm": 2.1394433975219727, "learning_rate": 1.6533070031413457e-06, "loss": 0.4361, "num_input_tokens_seen": 147540448, "step": 121325 }, { "epoch": 15.202355594537025, "grad_norm": 2.034968852996826, "learning_rate": 1.6529008380058854e-06, "loss": 0.3919, "num_input_tokens_seen": 147546464, "step": 121330 }, { "epoch": 15.202982082445809, "grad_norm": 9.660964965820312, "learning_rate": 1.6524947128878078e-06, "loss": 0.5165, "num_input_tokens_seen": 147552576, "step": 121335 }, { "epoch": 15.203608570354593, "grad_norm": 4.435619831085205, "learning_rate": 1.6520886277919717e-06, "loss": 0.5337, "num_input_tokens_seen": 147558720, "step": 121340 }, { "epoch": 15.204235058263375, "grad_norm": 2.5320041179656982, "learning_rate": 1.6516825827232297e-06, "loss": 0.4143, "num_input_tokens_seen": 147565088, "step": 121345 }, { "epoch": 15.20486154617216, "grad_norm": 5.892343997955322, "learning_rate": 1.6512765776864376e-06, "loss": 0.4336, "num_input_tokens_seen": 147571200, "step": 121350 }, { "epoch": 15.205488034080942, "grad_norm": 3.1626269817352295, "learning_rate": 1.650870612686451e-06, "loss": 0.4329, "num_input_tokens_seen": 147577152, "step": 121355 }, { "epoch": 15.206114521989726, "grad_norm": 6.787192344665527, "learning_rate": 1.65046468772812e-06, "loss": 0.4415, "num_input_tokens_seen": 147583200, "step": 121360 }, { "epoch": 15.20674100989851, "grad_norm": 11.658916473388672, "learning_rate": 1.6500588028163017e-06, "loss": 0.5206, "num_input_tokens_seen": 147589120, "step": 121365 }, { "epoch": 15.207367497807292, "grad_norm": 7.352103233337402, "learning_rate": 1.6496529579558451e-06, "loss": 0.5267, "num_input_tokens_seen": 147595072, "step": 121370 }, { "epoch": 15.207993985716076, "grad_norm": 12.50153636932373, "learning_rate": 1.6492471531516063e-06, "loss": 0.5719, "num_input_tokens_seen": 147601056, "step": 121375 }, { "epoch": 15.208620473624858, "grad_norm": 10.21163272857666, "learning_rate": 1.6488413884084331e-06, "loss": 0.4809, "num_input_tokens_seen": 147607488, "step": 121380 }, { "epoch": 15.209246961533642, "grad_norm": 13.570051193237305, "learning_rate": 1.6484356637311799e-06, "loss": 0.4818, "num_input_tokens_seen": 147613600, "step": 121385 }, { "epoch": 15.209873449442426, "grad_norm": 2.426785945892334, "learning_rate": 1.6480299791246946e-06, "loss": 0.4051, "num_input_tokens_seen": 147620160, "step": 121390 }, { "epoch": 15.210499937351209, "grad_norm": 5.607825756072998, "learning_rate": 1.6476243345938293e-06, "loss": 0.4579, "num_input_tokens_seen": 147626272, "step": 121395 }, { "epoch": 15.211126425259993, "grad_norm": 3.4944400787353516, "learning_rate": 1.6472187301434339e-06, "loss": 0.4415, "num_input_tokens_seen": 147632480, "step": 121400 }, { "epoch": 15.211752913168775, "grad_norm": 2.337498664855957, "learning_rate": 1.6468131657783588e-06, "loss": 0.4829, "num_input_tokens_seen": 147638432, "step": 121405 }, { "epoch": 15.212379401077559, "grad_norm": 5.626097679138184, "learning_rate": 1.6464076415034496e-06, "loss": 0.4607, "num_input_tokens_seen": 147644704, "step": 121410 }, { "epoch": 15.213005888986343, "grad_norm": 4.677975177764893, "learning_rate": 1.646002157323557e-06, "loss": 0.4397, "num_input_tokens_seen": 147650336, "step": 121415 }, { "epoch": 15.213632376895125, "grad_norm": 4.626664161682129, "learning_rate": 1.6455967132435296e-06, "loss": 0.4728, "num_input_tokens_seen": 147656544, "step": 121420 }, { "epoch": 15.21425886480391, "grad_norm": 4.04959774017334, "learning_rate": 1.6451913092682126e-06, "loss": 0.4575, "num_input_tokens_seen": 147662688, "step": 121425 }, { "epoch": 15.214885352712693, "grad_norm": 10.589441299438477, "learning_rate": 1.6447859454024557e-06, "loss": 0.506, "num_input_tokens_seen": 147668416, "step": 121430 }, { "epoch": 15.215511840621476, "grad_norm": 2.6779978275299072, "learning_rate": 1.6443806216511015e-06, "loss": 0.44, "num_input_tokens_seen": 147674400, "step": 121435 }, { "epoch": 15.21613832853026, "grad_norm": 2.2365407943725586, "learning_rate": 1.6439753380190004e-06, "loss": 0.4567, "num_input_tokens_seen": 147680320, "step": 121440 }, { "epoch": 15.216764816439042, "grad_norm": 2.369241237640381, "learning_rate": 1.643570094510994e-06, "loss": 0.4701, "num_input_tokens_seen": 147687008, "step": 121445 }, { "epoch": 15.217391304347826, "grad_norm": 2.2780160903930664, "learning_rate": 1.643164891131931e-06, "loss": 0.4188, "num_input_tokens_seen": 147693216, "step": 121450 }, { "epoch": 15.21801779225661, "grad_norm": 2.7324397563934326, "learning_rate": 1.6427597278866513e-06, "loss": 0.4144, "num_input_tokens_seen": 147699584, "step": 121455 }, { "epoch": 15.218644280165393, "grad_norm": 4.424346923828125, "learning_rate": 1.6423546047800043e-06, "loss": 0.4306, "num_input_tokens_seen": 147705792, "step": 121460 }, { "epoch": 15.219270768074177, "grad_norm": 5.527271270751953, "learning_rate": 1.641949521816829e-06, "loss": 0.5129, "num_input_tokens_seen": 147711744, "step": 121465 }, { "epoch": 15.219897255982959, "grad_norm": 10.013846397399902, "learning_rate": 1.6415444790019725e-06, "loss": 0.4968, "num_input_tokens_seen": 147717824, "step": 121470 }, { "epoch": 15.220523743891743, "grad_norm": 2.7056095600128174, "learning_rate": 1.6411394763402734e-06, "loss": 0.458, "num_input_tokens_seen": 147723936, "step": 121475 }, { "epoch": 15.221150231800527, "grad_norm": 7.318714618682861, "learning_rate": 1.6407345138365754e-06, "loss": 0.5168, "num_input_tokens_seen": 147730400, "step": 121480 }, { "epoch": 15.22177671970931, "grad_norm": 12.927587509155273, "learning_rate": 1.640329591495723e-06, "loss": 0.5218, "num_input_tokens_seen": 147736032, "step": 121485 }, { "epoch": 15.222403207618093, "grad_norm": 3.8970699310302734, "learning_rate": 1.6399247093225528e-06, "loss": 0.4311, "num_input_tokens_seen": 147741696, "step": 121490 }, { "epoch": 15.223029695526876, "grad_norm": 2.951491355895996, "learning_rate": 1.6395198673219076e-06, "loss": 0.45, "num_input_tokens_seen": 147747872, "step": 121495 }, { "epoch": 15.22365618343566, "grad_norm": 2.2391176223754883, "learning_rate": 1.6391150654986299e-06, "loss": 0.4695, "num_input_tokens_seen": 147754176, "step": 121500 }, { "epoch": 15.224282671344444, "grad_norm": 3.133955717086792, "learning_rate": 1.6387103038575553e-06, "loss": 0.4797, "num_input_tokens_seen": 147759648, "step": 121505 }, { "epoch": 15.224909159253226, "grad_norm": 2.2017171382904053, "learning_rate": 1.6383055824035254e-06, "loss": 0.4162, "num_input_tokens_seen": 147765792, "step": 121510 }, { "epoch": 15.22553564716201, "grad_norm": 8.049161911010742, "learning_rate": 1.63790090114138e-06, "loss": 0.4607, "num_input_tokens_seen": 147771296, "step": 121515 }, { "epoch": 15.226162135070792, "grad_norm": 1.762558102607727, "learning_rate": 1.6374962600759543e-06, "loss": 0.4401, "num_input_tokens_seen": 147777408, "step": 121520 }, { "epoch": 15.226788622979576, "grad_norm": 1.8067957162857056, "learning_rate": 1.6370916592120895e-06, "loss": 0.4572, "num_input_tokens_seen": 147783584, "step": 121525 }, { "epoch": 15.22741511088836, "grad_norm": 4.594141960144043, "learning_rate": 1.6366870985546195e-06, "loss": 0.4588, "num_input_tokens_seen": 147789664, "step": 121530 }, { "epoch": 15.228041598797143, "grad_norm": 1.9290170669555664, "learning_rate": 1.6362825781083836e-06, "loss": 0.4333, "num_input_tokens_seen": 147795712, "step": 121535 }, { "epoch": 15.228668086705927, "grad_norm": 11.94222640991211, "learning_rate": 1.6358780978782185e-06, "loss": 0.4957, "num_input_tokens_seen": 147802112, "step": 121540 }, { "epoch": 15.229294574614709, "grad_norm": 17.8464298248291, "learning_rate": 1.6354736578689579e-06, "loss": 0.5208, "num_input_tokens_seen": 147808288, "step": 121545 }, { "epoch": 15.229921062523493, "grad_norm": 6.398714542388916, "learning_rate": 1.6350692580854399e-06, "loss": 0.4936, "num_input_tokens_seen": 147814816, "step": 121550 }, { "epoch": 15.230547550432277, "grad_norm": 1.9810682535171509, "learning_rate": 1.6346648985324965e-06, "loss": 0.4317, "num_input_tokens_seen": 147820992, "step": 121555 }, { "epoch": 15.23117403834106, "grad_norm": 2.9372575283050537, "learning_rate": 1.6342605792149647e-06, "loss": 0.4253, "num_input_tokens_seen": 147827040, "step": 121560 }, { "epoch": 15.231800526249843, "grad_norm": 1.154659628868103, "learning_rate": 1.6338563001376768e-06, "loss": 0.4706, "num_input_tokens_seen": 147832928, "step": 121565 }, { "epoch": 15.232427014158628, "grad_norm": 1.7498195171356201, "learning_rate": 1.6334520613054678e-06, "loss": 0.5501, "num_input_tokens_seen": 147838848, "step": 121570 }, { "epoch": 15.23305350206741, "grad_norm": 9.054466247558594, "learning_rate": 1.6330478627231682e-06, "loss": 0.517, "num_input_tokens_seen": 147845632, "step": 121575 }, { "epoch": 15.233679989976194, "grad_norm": 3.2212421894073486, "learning_rate": 1.6326437043956145e-06, "loss": 0.4227, "num_input_tokens_seen": 147851648, "step": 121580 }, { "epoch": 15.234306477884976, "grad_norm": 7.907224178314209, "learning_rate": 1.632239586327634e-06, "loss": 0.502, "num_input_tokens_seen": 147858112, "step": 121585 }, { "epoch": 15.23493296579376, "grad_norm": 11.430177688598633, "learning_rate": 1.6318355085240612e-06, "loss": 0.4805, "num_input_tokens_seen": 147864480, "step": 121590 }, { "epoch": 15.235559453702544, "grad_norm": 2.366323471069336, "learning_rate": 1.6314314709897283e-06, "loss": 0.4355, "num_input_tokens_seen": 147870720, "step": 121595 }, { "epoch": 15.236185941611327, "grad_norm": 2.906848192214966, "learning_rate": 1.6310274737294624e-06, "loss": 0.4964, "num_input_tokens_seen": 147876608, "step": 121600 }, { "epoch": 15.23681242952011, "grad_norm": 2.0929665565490723, "learning_rate": 1.6306235167480956e-06, "loss": 0.4336, "num_input_tokens_seen": 147882816, "step": 121605 }, { "epoch": 15.237438917428893, "grad_norm": 2.445272445678711, "learning_rate": 1.6302196000504578e-06, "loss": 0.4082, "num_input_tokens_seen": 147888928, "step": 121610 }, { "epoch": 15.238065405337677, "grad_norm": 3.251086473464966, "learning_rate": 1.6298157236413793e-06, "loss": 0.4892, "num_input_tokens_seen": 147895232, "step": 121615 }, { "epoch": 15.238691893246461, "grad_norm": 1.994735598564148, "learning_rate": 1.6294118875256854e-06, "loss": 0.4382, "num_input_tokens_seen": 147901440, "step": 121620 }, { "epoch": 15.239318381155243, "grad_norm": 5.897361755371094, "learning_rate": 1.6290080917082079e-06, "loss": 0.4663, "num_input_tokens_seen": 147907552, "step": 121625 }, { "epoch": 15.239944869064027, "grad_norm": 10.27469253540039, "learning_rate": 1.6286043361937715e-06, "loss": 0.4641, "num_input_tokens_seen": 147912992, "step": 121630 }, { "epoch": 15.24057135697281, "grad_norm": 2.8780648708343506, "learning_rate": 1.6282006209872064e-06, "loss": 0.4817, "num_input_tokens_seen": 147919520, "step": 121635 }, { "epoch": 15.241197844881594, "grad_norm": 2.4724268913269043, "learning_rate": 1.6277969460933357e-06, "loss": 0.4117, "num_input_tokens_seen": 147925728, "step": 121640 }, { "epoch": 15.241824332790378, "grad_norm": 2.1776793003082275, "learning_rate": 1.6273933115169898e-06, "loss": 0.478, "num_input_tokens_seen": 147931744, "step": 121645 }, { "epoch": 15.24245082069916, "grad_norm": 7.589892864227295, "learning_rate": 1.6269897172629905e-06, "loss": 0.4851, "num_input_tokens_seen": 147938080, "step": 121650 }, { "epoch": 15.243077308607944, "grad_norm": 3.196397304534912, "learning_rate": 1.6265861633361667e-06, "loss": 0.4372, "num_input_tokens_seen": 147944096, "step": 121655 }, { "epoch": 15.243703796516726, "grad_norm": 2.402611255645752, "learning_rate": 1.6261826497413402e-06, "loss": 0.4174, "num_input_tokens_seen": 147949696, "step": 121660 }, { "epoch": 15.24433028442551, "grad_norm": 2.9564106464385986, "learning_rate": 1.6257791764833363e-06, "loss": 0.4483, "num_input_tokens_seen": 147955872, "step": 121665 }, { "epoch": 15.244956772334294, "grad_norm": 7.009005546569824, "learning_rate": 1.625375743566981e-06, "loss": 0.4847, "num_input_tokens_seen": 147961888, "step": 121670 }, { "epoch": 15.245583260243077, "grad_norm": 7.6575846672058105, "learning_rate": 1.624972350997094e-06, "loss": 0.6227, "num_input_tokens_seen": 147967840, "step": 121675 }, { "epoch": 15.24620974815186, "grad_norm": 3.1129586696624756, "learning_rate": 1.624568998778502e-06, "loss": 0.4992, "num_input_tokens_seen": 147973952, "step": 121680 }, { "epoch": 15.246836236060645, "grad_norm": 11.429112434387207, "learning_rate": 1.6241656869160239e-06, "loss": 0.5036, "num_input_tokens_seen": 147980000, "step": 121685 }, { "epoch": 15.247462723969427, "grad_norm": 5.332163333892822, "learning_rate": 1.623762415414485e-06, "loss": 0.4169, "num_input_tokens_seen": 147985984, "step": 121690 }, { "epoch": 15.248089211878211, "grad_norm": 4.552380561828613, "learning_rate": 1.6233591842787032e-06, "loss": 0.4531, "num_input_tokens_seen": 147992128, "step": 121695 }, { "epoch": 15.248715699786993, "grad_norm": 1.9922184944152832, "learning_rate": 1.6229559935135013e-06, "loss": 0.415, "num_input_tokens_seen": 147998400, "step": 121700 }, { "epoch": 15.249342187695778, "grad_norm": 12.830376625061035, "learning_rate": 1.6225528431237014e-06, "loss": 0.4467, "num_input_tokens_seen": 148004320, "step": 121705 }, { "epoch": 15.249968675604562, "grad_norm": 8.845471382141113, "learning_rate": 1.62214973311412e-06, "loss": 0.4837, "num_input_tokens_seen": 148010496, "step": 121710 }, { "epoch": 15.250595163513344, "grad_norm": 3.7438132762908936, "learning_rate": 1.621746663489579e-06, "loss": 0.4304, "num_input_tokens_seen": 148016768, "step": 121715 }, { "epoch": 15.251221651422128, "grad_norm": 14.145771980285645, "learning_rate": 1.6213436342548965e-06, "loss": 0.4844, "num_input_tokens_seen": 148023040, "step": 121720 }, { "epoch": 15.25184813933091, "grad_norm": 1.605873942375183, "learning_rate": 1.620940645414893e-06, "loss": 0.4895, "num_input_tokens_seen": 148029056, "step": 121725 }, { "epoch": 15.252474627239694, "grad_norm": 2.1361193656921387, "learning_rate": 1.6205376969743836e-06, "loss": 0.4663, "num_input_tokens_seen": 148035136, "step": 121730 }, { "epoch": 15.253101115148478, "grad_norm": 2.2933106422424316, "learning_rate": 1.620134788938189e-06, "loss": 0.4674, "num_input_tokens_seen": 148041344, "step": 121735 }, { "epoch": 15.25372760305726, "grad_norm": 3.0047430992126465, "learning_rate": 1.6197319213111235e-06, "loss": 0.4675, "num_input_tokens_seen": 148047360, "step": 121740 }, { "epoch": 15.254354090966045, "grad_norm": 16.279664993286133, "learning_rate": 1.6193290940980055e-06, "loss": 0.5142, "num_input_tokens_seen": 148053760, "step": 121745 }, { "epoch": 15.254980578874827, "grad_norm": 2.3468947410583496, "learning_rate": 1.6189263073036493e-06, "loss": 0.4543, "num_input_tokens_seen": 148059264, "step": 121750 }, { "epoch": 15.255607066783611, "grad_norm": 1.8508692979812622, "learning_rate": 1.6185235609328736e-06, "loss": 0.4607, "num_input_tokens_seen": 148065696, "step": 121755 }, { "epoch": 15.256233554692395, "grad_norm": 1.8918298482894897, "learning_rate": 1.6181208549904897e-06, "loss": 0.4461, "num_input_tokens_seen": 148072000, "step": 121760 }, { "epoch": 15.256860042601177, "grad_norm": 3.6605985164642334, "learning_rate": 1.6177181894813165e-06, "loss": 0.4304, "num_input_tokens_seen": 148077952, "step": 121765 }, { "epoch": 15.257486530509961, "grad_norm": 7.088416576385498, "learning_rate": 1.617315564410164e-06, "loss": 0.4297, "num_input_tokens_seen": 148084160, "step": 121770 }, { "epoch": 15.258113018418744, "grad_norm": 2.435612440109253, "learning_rate": 1.6169129797818494e-06, "loss": 0.4869, "num_input_tokens_seen": 148090048, "step": 121775 }, { "epoch": 15.258739506327528, "grad_norm": 9.51204776763916, "learning_rate": 1.6165104356011829e-06, "loss": 0.4985, "num_input_tokens_seen": 148095648, "step": 121780 }, { "epoch": 15.259365994236312, "grad_norm": 3.906048536300659, "learning_rate": 1.6161079318729788e-06, "loss": 0.4803, "num_input_tokens_seen": 148101664, "step": 121785 }, { "epoch": 15.259992482145094, "grad_norm": 2.810230255126953, "learning_rate": 1.6157054686020513e-06, "loss": 0.4527, "num_input_tokens_seen": 148107712, "step": 121790 }, { "epoch": 15.260618970053878, "grad_norm": 7.440272331237793, "learning_rate": 1.615303045793209e-06, "loss": 0.4431, "num_input_tokens_seen": 148112672, "step": 121795 }, { "epoch": 15.261245457962662, "grad_norm": 2.0316860675811768, "learning_rate": 1.6149006634512654e-06, "loss": 0.4945, "num_input_tokens_seen": 148118368, "step": 121800 }, { "epoch": 15.261871945871444, "grad_norm": 2.314682722091675, "learning_rate": 1.614498321581029e-06, "loss": 0.4626, "num_input_tokens_seen": 148123680, "step": 121805 }, { "epoch": 15.262498433780229, "grad_norm": 2.541212797164917, "learning_rate": 1.6140960201873113e-06, "loss": 0.4381, "num_input_tokens_seen": 148129504, "step": 121810 }, { "epoch": 15.26312492168901, "grad_norm": 4.51503849029541, "learning_rate": 1.613693759274923e-06, "loss": 0.4207, "num_input_tokens_seen": 148135744, "step": 121815 }, { "epoch": 15.263751409597795, "grad_norm": 3.0689165592193604, "learning_rate": 1.6132915388486742e-06, "loss": 0.4688, "num_input_tokens_seen": 148141696, "step": 121820 }, { "epoch": 15.264377897506579, "grad_norm": 2.3835225105285645, "learning_rate": 1.6128893589133704e-06, "loss": 0.4394, "num_input_tokens_seen": 148147968, "step": 121825 }, { "epoch": 15.265004385415361, "grad_norm": 14.391700744628906, "learning_rate": 1.612487219473824e-06, "loss": 0.5324, "num_input_tokens_seen": 148154112, "step": 121830 }, { "epoch": 15.265630873324145, "grad_norm": 1.5364004373550415, "learning_rate": 1.6120851205348393e-06, "loss": 0.4233, "num_input_tokens_seen": 148160032, "step": 121835 }, { "epoch": 15.266257361232928, "grad_norm": 6.955813884735107, "learning_rate": 1.6116830621012276e-06, "loss": 0.4349, "num_input_tokens_seen": 148166016, "step": 121840 }, { "epoch": 15.266883849141712, "grad_norm": 2.003563165664673, "learning_rate": 1.611281044177791e-06, "loss": 0.4234, "num_input_tokens_seen": 148172416, "step": 121845 }, { "epoch": 15.267510337050496, "grad_norm": 1.7124348878860474, "learning_rate": 1.610879066769339e-06, "loss": 0.4374, "num_input_tokens_seen": 148178112, "step": 121850 }, { "epoch": 15.268136824959278, "grad_norm": 3.097745418548584, "learning_rate": 1.6104771298806788e-06, "loss": 0.4843, "num_input_tokens_seen": 148184096, "step": 121855 }, { "epoch": 15.268763312868062, "grad_norm": 9.094337463378906, "learning_rate": 1.6100752335166131e-06, "loss": 0.497, "num_input_tokens_seen": 148190592, "step": 121860 }, { "epoch": 15.269389800776844, "grad_norm": 2.3032853603363037, "learning_rate": 1.6096733776819489e-06, "loss": 0.4822, "num_input_tokens_seen": 148196832, "step": 121865 }, { "epoch": 15.270016288685628, "grad_norm": 16.575075149536133, "learning_rate": 1.6092715623814886e-06, "loss": 0.518, "num_input_tokens_seen": 148202912, "step": 121870 }, { "epoch": 15.270642776594412, "grad_norm": 3.0561273097991943, "learning_rate": 1.6088697876200393e-06, "loss": 0.4453, "num_input_tokens_seen": 148208896, "step": 121875 }, { "epoch": 15.271269264503195, "grad_norm": 4.1217780113220215, "learning_rate": 1.608468053402401e-06, "loss": 0.4804, "num_input_tokens_seen": 148214976, "step": 121880 }, { "epoch": 15.271895752411979, "grad_norm": 2.1584084033966064, "learning_rate": 1.6080663597333806e-06, "loss": 0.5386, "num_input_tokens_seen": 148221248, "step": 121885 }, { "epoch": 15.272522240320761, "grad_norm": 1.8612658977508545, "learning_rate": 1.6076647066177764e-06, "loss": 0.449, "num_input_tokens_seen": 148227360, "step": 121890 }, { "epoch": 15.273148728229545, "grad_norm": 11.589700698852539, "learning_rate": 1.6072630940603952e-06, "loss": 0.4538, "num_input_tokens_seen": 148233472, "step": 121895 }, { "epoch": 15.27377521613833, "grad_norm": 17.329021453857422, "learning_rate": 1.6068615220660338e-06, "loss": 0.4938, "num_input_tokens_seen": 148239328, "step": 121900 }, { "epoch": 15.274401704047111, "grad_norm": 10.130297660827637, "learning_rate": 1.6064599906394956e-06, "loss": 0.5411, "num_input_tokens_seen": 148245856, "step": 121905 }, { "epoch": 15.275028191955895, "grad_norm": 8.510009765625, "learning_rate": 1.606058499785582e-06, "loss": 0.5416, "num_input_tokens_seen": 148252032, "step": 121910 }, { "epoch": 15.27565467986468, "grad_norm": 2.3239872455596924, "learning_rate": 1.605657049509094e-06, "loss": 0.4675, "num_input_tokens_seen": 148258112, "step": 121915 }, { "epoch": 15.276281167773462, "grad_norm": 1.9671541452407837, "learning_rate": 1.6052556398148277e-06, "loss": 0.4414, "num_input_tokens_seen": 148264320, "step": 121920 }, { "epoch": 15.276907655682246, "grad_norm": 2.0951802730560303, "learning_rate": 1.6048542707075849e-06, "loss": 0.5005, "num_input_tokens_seen": 148270400, "step": 121925 }, { "epoch": 15.277534143591028, "grad_norm": 7.603703022003174, "learning_rate": 1.6044529421921651e-06, "loss": 0.5029, "num_input_tokens_seen": 148276352, "step": 121930 }, { "epoch": 15.278160631499812, "grad_norm": 3.412806987762451, "learning_rate": 1.6040516542733631e-06, "loss": 0.4281, "num_input_tokens_seen": 148282560, "step": 121935 }, { "epoch": 15.278787119408596, "grad_norm": 4.651824474334717, "learning_rate": 1.6036504069559812e-06, "loss": 0.5768, "num_input_tokens_seen": 148288768, "step": 121940 }, { "epoch": 15.279413607317379, "grad_norm": 5.6563615798950195, "learning_rate": 1.603249200244812e-06, "loss": 0.5023, "num_input_tokens_seen": 148295072, "step": 121945 }, { "epoch": 15.280040095226163, "grad_norm": 1.751685380935669, "learning_rate": 1.6028480341446562e-06, "loss": 0.436, "num_input_tokens_seen": 148300320, "step": 121950 }, { "epoch": 15.280666583134945, "grad_norm": 6.171924591064453, "learning_rate": 1.6024469086603067e-06, "loss": 0.4707, "num_input_tokens_seen": 148306624, "step": 121955 }, { "epoch": 15.281293071043729, "grad_norm": 4.261707782745361, "learning_rate": 1.602045823796563e-06, "loss": 0.5187, "num_input_tokens_seen": 148313120, "step": 121960 }, { "epoch": 15.281919558952513, "grad_norm": 4.975033760070801, "learning_rate": 1.6016447795582163e-06, "loss": 0.4609, "num_input_tokens_seen": 148319392, "step": 121965 }, { "epoch": 15.282546046861295, "grad_norm": 10.00411605834961, "learning_rate": 1.6012437759500632e-06, "loss": 0.5385, "num_input_tokens_seen": 148325088, "step": 121970 }, { "epoch": 15.28317253477008, "grad_norm": 8.023097038269043, "learning_rate": 1.6008428129769e-06, "loss": 0.4553, "num_input_tokens_seen": 148331488, "step": 121975 }, { "epoch": 15.283799022678862, "grad_norm": 20.445886611938477, "learning_rate": 1.6004418906435177e-06, "loss": 0.5441, "num_input_tokens_seen": 148337536, "step": 121980 }, { "epoch": 15.284425510587646, "grad_norm": 2.214348554611206, "learning_rate": 1.6000410089547118e-06, "loss": 0.4602, "num_input_tokens_seen": 148343936, "step": 121985 }, { "epoch": 15.28505199849643, "grad_norm": 9.213373184204102, "learning_rate": 1.599640167915273e-06, "loss": 0.5301, "num_input_tokens_seen": 148350144, "step": 121990 }, { "epoch": 15.285678486405212, "grad_norm": 2.1388847827911377, "learning_rate": 1.5992393675299962e-06, "loss": 0.4482, "num_input_tokens_seen": 148355648, "step": 121995 }, { "epoch": 15.286304974313996, "grad_norm": 9.013592720031738, "learning_rate": 1.5988386078036706e-06, "loss": 0.4865, "num_input_tokens_seen": 148361824, "step": 122000 }, { "epoch": 15.286931462222778, "grad_norm": 16.978721618652344, "learning_rate": 1.5984378887410901e-06, "loss": 0.5536, "num_input_tokens_seen": 148368352, "step": 122005 }, { "epoch": 15.287557950131562, "grad_norm": 13.377685546875, "learning_rate": 1.5980372103470432e-06, "loss": 0.5129, "num_input_tokens_seen": 148374528, "step": 122010 }, { "epoch": 15.288184438040346, "grad_norm": 3.654271364212036, "learning_rate": 1.5976365726263216e-06, "loss": 0.4636, "num_input_tokens_seen": 148380192, "step": 122015 }, { "epoch": 15.288810925949129, "grad_norm": 3.0406579971313477, "learning_rate": 1.5972359755837153e-06, "loss": 0.4592, "num_input_tokens_seen": 148386336, "step": 122020 }, { "epoch": 15.289437413857913, "grad_norm": 8.591263771057129, "learning_rate": 1.596835419224016e-06, "loss": 0.4999, "num_input_tokens_seen": 148392640, "step": 122025 }, { "epoch": 15.290063901766695, "grad_norm": 1.9852879047393799, "learning_rate": 1.5964349035520081e-06, "loss": 0.4568, "num_input_tokens_seen": 148398976, "step": 122030 }, { "epoch": 15.290690389675479, "grad_norm": 1.8443416357040405, "learning_rate": 1.5960344285724827e-06, "loss": 0.4605, "num_input_tokens_seen": 148404768, "step": 122035 }, { "epoch": 15.291316877584263, "grad_norm": 2.4402542114257812, "learning_rate": 1.5956339942902294e-06, "loss": 0.4804, "num_input_tokens_seen": 148410944, "step": 122040 }, { "epoch": 15.291943365493045, "grad_norm": 2.616058111190796, "learning_rate": 1.5952336007100323e-06, "loss": 0.4484, "num_input_tokens_seen": 148417152, "step": 122045 }, { "epoch": 15.29256985340183, "grad_norm": 2.958876132965088, "learning_rate": 1.5948332478366813e-06, "loss": 0.4239, "num_input_tokens_seen": 148423520, "step": 122050 }, { "epoch": 15.293196341310614, "grad_norm": 3.350188970565796, "learning_rate": 1.5944329356749604e-06, "loss": 0.4577, "num_input_tokens_seen": 148428832, "step": 122055 }, { "epoch": 15.293822829219396, "grad_norm": 2.841156482696533, "learning_rate": 1.5940326642296588e-06, "loss": 0.4593, "num_input_tokens_seen": 148434848, "step": 122060 }, { "epoch": 15.29444931712818, "grad_norm": 3.136420965194702, "learning_rate": 1.5936324335055587e-06, "loss": 0.4633, "num_input_tokens_seen": 148440960, "step": 122065 }, { "epoch": 15.295075805036962, "grad_norm": 4.270324230194092, "learning_rate": 1.5932322435074481e-06, "loss": 0.4573, "num_input_tokens_seen": 148447424, "step": 122070 }, { "epoch": 15.295702292945746, "grad_norm": 2.122525453567505, "learning_rate": 1.5928320942401088e-06, "loss": 0.4306, "num_input_tokens_seen": 148453440, "step": 122075 }, { "epoch": 15.29632878085453, "grad_norm": 2.694714307785034, "learning_rate": 1.5924319857083276e-06, "loss": 0.4717, "num_input_tokens_seen": 148459584, "step": 122080 }, { "epoch": 15.296955268763313, "grad_norm": 2.3170413970947266, "learning_rate": 1.5920319179168859e-06, "loss": 0.452, "num_input_tokens_seen": 148465696, "step": 122085 }, { "epoch": 15.297581756672097, "grad_norm": 5.024282455444336, "learning_rate": 1.5916318908705692e-06, "loss": 0.4754, "num_input_tokens_seen": 148471584, "step": 122090 }, { "epoch": 15.298208244580879, "grad_norm": 2.0808212757110596, "learning_rate": 1.5912319045741575e-06, "loss": 0.487, "num_input_tokens_seen": 148478080, "step": 122095 }, { "epoch": 15.298834732489663, "grad_norm": 2.8257527351379395, "learning_rate": 1.590831959032434e-06, "loss": 0.4783, "num_input_tokens_seen": 148484192, "step": 122100 }, { "epoch": 15.299461220398447, "grad_norm": 5.556972026824951, "learning_rate": 1.5904320542501823e-06, "loss": 0.4639, "num_input_tokens_seen": 148490208, "step": 122105 }, { "epoch": 15.30008770830723, "grad_norm": 8.119293212890625, "learning_rate": 1.5900321902321807e-06, "loss": 0.5158, "num_input_tokens_seen": 148495648, "step": 122110 }, { "epoch": 15.300714196216013, "grad_norm": 3.2503156661987305, "learning_rate": 1.589632366983211e-06, "loss": 0.4377, "num_input_tokens_seen": 148501472, "step": 122115 }, { "epoch": 15.301340684124796, "grad_norm": 7.447240829467773, "learning_rate": 1.5892325845080552e-06, "loss": 0.4626, "num_input_tokens_seen": 148507520, "step": 122120 }, { "epoch": 15.30196717203358, "grad_norm": 4.316669464111328, "learning_rate": 1.5888328428114902e-06, "loss": 0.4332, "num_input_tokens_seen": 148513760, "step": 122125 }, { "epoch": 15.302593659942364, "grad_norm": 3.5404841899871826, "learning_rate": 1.5884331418982962e-06, "loss": 0.4586, "num_input_tokens_seen": 148519744, "step": 122130 }, { "epoch": 15.303220147851146, "grad_norm": 2.1846060752868652, "learning_rate": 1.5880334817732535e-06, "loss": 0.4361, "num_input_tokens_seen": 148525504, "step": 122135 }, { "epoch": 15.30384663575993, "grad_norm": 6.487710475921631, "learning_rate": 1.587633862441138e-06, "loss": 0.4846, "num_input_tokens_seen": 148531840, "step": 122140 }, { "epoch": 15.304473123668712, "grad_norm": 5.452970504760742, "learning_rate": 1.5872342839067305e-06, "loss": 0.451, "num_input_tokens_seen": 148537984, "step": 122145 }, { "epoch": 15.305099611577496, "grad_norm": 1.9046529531478882, "learning_rate": 1.5868347461748046e-06, "loss": 0.4274, "num_input_tokens_seen": 148543936, "step": 122150 }, { "epoch": 15.30572609948628, "grad_norm": 2.4465293884277344, "learning_rate": 1.5864352492501388e-06, "loss": 0.4449, "num_input_tokens_seen": 148550176, "step": 122155 }, { "epoch": 15.306352587395063, "grad_norm": 7.487669467926025, "learning_rate": 1.5860357931375114e-06, "loss": 0.4532, "num_input_tokens_seen": 148556288, "step": 122160 }, { "epoch": 15.306979075303847, "grad_norm": 1.9438568353652954, "learning_rate": 1.5856363778416944e-06, "loss": 0.4351, "num_input_tokens_seen": 148561792, "step": 122165 }, { "epoch": 15.307605563212629, "grad_norm": 4.896255970001221, "learning_rate": 1.585237003367467e-06, "loss": 0.4639, "num_input_tokens_seen": 148568160, "step": 122170 }, { "epoch": 15.308232051121413, "grad_norm": 3.3025646209716797, "learning_rate": 1.5848376697196005e-06, "loss": 0.5108, "num_input_tokens_seen": 148574144, "step": 122175 }, { "epoch": 15.308858539030197, "grad_norm": 17.059301376342773, "learning_rate": 1.5844383769028726e-06, "loss": 0.5416, "num_input_tokens_seen": 148580352, "step": 122180 }, { "epoch": 15.30948502693898, "grad_norm": 3.8981099128723145, "learning_rate": 1.5840391249220543e-06, "loss": 0.4448, "num_input_tokens_seen": 148586688, "step": 122185 }, { "epoch": 15.310111514847764, "grad_norm": 2.0352728366851807, "learning_rate": 1.5836399137819213e-06, "loss": 0.4901, "num_input_tokens_seen": 148592608, "step": 122190 }, { "epoch": 15.310738002756548, "grad_norm": 2.2101986408233643, "learning_rate": 1.5832407434872443e-06, "loss": 0.4578, "num_input_tokens_seen": 148598688, "step": 122195 }, { "epoch": 15.31136449066533, "grad_norm": 2.8623833656311035, "learning_rate": 1.5828416140427983e-06, "loss": 0.4448, "num_input_tokens_seen": 148604640, "step": 122200 }, { "epoch": 15.311990978574114, "grad_norm": 2.720435380935669, "learning_rate": 1.582442525453352e-06, "loss": 0.4371, "num_input_tokens_seen": 148610432, "step": 122205 }, { "epoch": 15.312617466482896, "grad_norm": 5.839838981628418, "learning_rate": 1.5820434777236788e-06, "loss": 0.4475, "num_input_tokens_seen": 148616288, "step": 122210 }, { "epoch": 15.31324395439168, "grad_norm": 3.376228094100952, "learning_rate": 1.581644470858551e-06, "loss": 0.4282, "num_input_tokens_seen": 148622016, "step": 122215 }, { "epoch": 15.313870442300464, "grad_norm": 11.256026268005371, "learning_rate": 1.5812455048627362e-06, "loss": 0.5091, "num_input_tokens_seen": 148628128, "step": 122220 }, { "epoch": 15.314496930209247, "grad_norm": 5.346744060516357, "learning_rate": 1.5808465797410054e-06, "loss": 0.43, "num_input_tokens_seen": 148634112, "step": 122225 }, { "epoch": 15.31512341811803, "grad_norm": 2.674612045288086, "learning_rate": 1.5804476954981285e-06, "loss": 0.4344, "num_input_tokens_seen": 148640320, "step": 122230 }, { "epoch": 15.315749906026813, "grad_norm": 12.107959747314453, "learning_rate": 1.5800488521388762e-06, "loss": 0.5159, "num_input_tokens_seen": 148645920, "step": 122235 }, { "epoch": 15.316376393935597, "grad_norm": 8.709493637084961, "learning_rate": 1.5796500496680134e-06, "loss": 0.4645, "num_input_tokens_seen": 148651712, "step": 122240 }, { "epoch": 15.317002881844381, "grad_norm": 2.556375503540039, "learning_rate": 1.5792512880903117e-06, "loss": 0.3989, "num_input_tokens_seen": 148658304, "step": 122245 }, { "epoch": 15.317629369753163, "grad_norm": 2.7705836296081543, "learning_rate": 1.5788525674105349e-06, "loss": 0.4226, "num_input_tokens_seen": 148664448, "step": 122250 }, { "epoch": 15.318255857661947, "grad_norm": 13.290525436401367, "learning_rate": 1.5784538876334537e-06, "loss": 0.4462, "num_input_tokens_seen": 148670208, "step": 122255 }, { "epoch": 15.31888234557073, "grad_norm": 1.7432899475097656, "learning_rate": 1.5780552487638317e-06, "loss": 0.6004, "num_input_tokens_seen": 148676000, "step": 122260 }, { "epoch": 15.319508833479514, "grad_norm": 6.0649847984313965, "learning_rate": 1.5776566508064372e-06, "loss": 0.4623, "num_input_tokens_seen": 148682176, "step": 122265 }, { "epoch": 15.320135321388298, "grad_norm": 2.903502941131592, "learning_rate": 1.5772580937660338e-06, "loss": 0.4088, "num_input_tokens_seen": 148688384, "step": 122270 }, { "epoch": 15.32076180929708, "grad_norm": 6.1629557609558105, "learning_rate": 1.5768595776473889e-06, "loss": 0.4754, "num_input_tokens_seen": 148694688, "step": 122275 }, { "epoch": 15.321388297205864, "grad_norm": 3.5488996505737305, "learning_rate": 1.5764611024552645e-06, "loss": 0.4551, "num_input_tokens_seen": 148701184, "step": 122280 }, { "epoch": 15.322014785114646, "grad_norm": 13.421629905700684, "learning_rate": 1.5760626681944258e-06, "loss": 0.447, "num_input_tokens_seen": 148707424, "step": 122285 }, { "epoch": 15.32264127302343, "grad_norm": 25.565622329711914, "learning_rate": 1.5756642748696377e-06, "loss": 0.5392, "num_input_tokens_seen": 148713312, "step": 122290 }, { "epoch": 15.323267760932215, "grad_norm": 2.975893974304199, "learning_rate": 1.5752659224856614e-06, "loss": 0.5079, "num_input_tokens_seen": 148719456, "step": 122295 }, { "epoch": 15.323894248840997, "grad_norm": 2.3888959884643555, "learning_rate": 1.5748676110472616e-06, "loss": 0.45, "num_input_tokens_seen": 148725600, "step": 122300 }, { "epoch": 15.32452073674978, "grad_norm": 3.257722854614258, "learning_rate": 1.574469340559197e-06, "loss": 0.5068, "num_input_tokens_seen": 148731968, "step": 122305 }, { "epoch": 15.325147224658565, "grad_norm": 5.265312194824219, "learning_rate": 1.5740711110262341e-06, "loss": 0.528, "num_input_tokens_seen": 148738240, "step": 122310 }, { "epoch": 15.325773712567347, "grad_norm": 3.0747265815734863, "learning_rate": 1.5736729224531295e-06, "loss": 0.4388, "num_input_tokens_seen": 148744512, "step": 122315 }, { "epoch": 15.326400200476131, "grad_norm": 4.086183547973633, "learning_rate": 1.5732747748446454e-06, "loss": 0.4125, "num_input_tokens_seen": 148750880, "step": 122320 }, { "epoch": 15.327026688384914, "grad_norm": 7.486037731170654, "learning_rate": 1.5728766682055424e-06, "loss": 0.4784, "num_input_tokens_seen": 148756928, "step": 122325 }, { "epoch": 15.327653176293698, "grad_norm": 2.4749984741210938, "learning_rate": 1.5724786025405824e-06, "loss": 0.5114, "num_input_tokens_seen": 148762976, "step": 122330 }, { "epoch": 15.328279664202482, "grad_norm": 7.492421627044678, "learning_rate": 1.5720805778545206e-06, "loss": 0.5328, "num_input_tokens_seen": 148768736, "step": 122335 }, { "epoch": 15.328906152111264, "grad_norm": 10.767641067504883, "learning_rate": 1.5716825941521168e-06, "loss": 0.5248, "num_input_tokens_seen": 148774816, "step": 122340 }, { "epoch": 15.329532640020048, "grad_norm": 9.625884056091309, "learning_rate": 1.5712846514381325e-06, "loss": 0.4512, "num_input_tokens_seen": 148781088, "step": 122345 }, { "epoch": 15.33015912792883, "grad_norm": 3.340344190597534, "learning_rate": 1.5708867497173208e-06, "loss": 0.4755, "num_input_tokens_seen": 148787360, "step": 122350 }, { "epoch": 15.330785615837614, "grad_norm": 2.4911606311798096, "learning_rate": 1.5704888889944431e-06, "loss": 0.4985, "num_input_tokens_seen": 148793568, "step": 122355 }, { "epoch": 15.331412103746398, "grad_norm": 5.199281215667725, "learning_rate": 1.5700910692742522e-06, "loss": 0.5022, "num_input_tokens_seen": 148799456, "step": 122360 }, { "epoch": 15.33203859165518, "grad_norm": 8.86430549621582, "learning_rate": 1.5696932905615075e-06, "loss": 0.4388, "num_input_tokens_seen": 148805600, "step": 122365 }, { "epoch": 15.332665079563965, "grad_norm": 19.86414909362793, "learning_rate": 1.5692955528609627e-06, "loss": 0.5666, "num_input_tokens_seen": 148811968, "step": 122370 }, { "epoch": 15.333291567472747, "grad_norm": 3.598919630050659, "learning_rate": 1.5688978561773748e-06, "loss": 0.4822, "num_input_tokens_seen": 148818304, "step": 122375 }, { "epoch": 15.333918055381531, "grad_norm": 3.161592721939087, "learning_rate": 1.568500200515497e-06, "loss": 0.4315, "num_input_tokens_seen": 148824448, "step": 122380 }, { "epoch": 15.334544543290315, "grad_norm": 4.365389347076416, "learning_rate": 1.5681025858800853e-06, "loss": 0.4676, "num_input_tokens_seen": 148830880, "step": 122385 }, { "epoch": 15.335171031199097, "grad_norm": 2.9583725929260254, "learning_rate": 1.5677050122758913e-06, "loss": 0.4229, "num_input_tokens_seen": 148837056, "step": 122390 }, { "epoch": 15.335797519107881, "grad_norm": 6.752386569976807, "learning_rate": 1.567307479707671e-06, "loss": 0.4443, "num_input_tokens_seen": 148843040, "step": 122395 }, { "epoch": 15.336424007016664, "grad_norm": 2.349456548690796, "learning_rate": 1.566909988180174e-06, "loss": 0.43, "num_input_tokens_seen": 148849376, "step": 122400 }, { "epoch": 15.337050494925448, "grad_norm": 4.190820693969727, "learning_rate": 1.5665125376981548e-06, "loss": 0.4282, "num_input_tokens_seen": 148854720, "step": 122405 }, { "epoch": 15.337676982834232, "grad_norm": 4.480166912078857, "learning_rate": 1.5661151282663666e-06, "loss": 0.4609, "num_input_tokens_seen": 148861056, "step": 122410 }, { "epoch": 15.338303470743014, "grad_norm": 1.6539965867996216, "learning_rate": 1.5657177598895572e-06, "loss": 0.472, "num_input_tokens_seen": 148866496, "step": 122415 }, { "epoch": 15.338929958651798, "grad_norm": 2.7449820041656494, "learning_rate": 1.5653204325724813e-06, "loss": 0.5129, "num_input_tokens_seen": 148872672, "step": 122420 }, { "epoch": 15.339556446560582, "grad_norm": 2.986909866333008, "learning_rate": 1.5649231463198855e-06, "loss": 0.461, "num_input_tokens_seen": 148878432, "step": 122425 }, { "epoch": 15.340182934469365, "grad_norm": 2.369847297668457, "learning_rate": 1.5645259011365216e-06, "loss": 0.4018, "num_input_tokens_seen": 148884576, "step": 122430 }, { "epoch": 15.340809422378149, "grad_norm": 3.337125062942505, "learning_rate": 1.5641286970271391e-06, "loss": 0.4479, "num_input_tokens_seen": 148890656, "step": 122435 }, { "epoch": 15.34143591028693, "grad_norm": 6.243619918823242, "learning_rate": 1.5637315339964882e-06, "loss": 0.4327, "num_input_tokens_seen": 148896544, "step": 122440 }, { "epoch": 15.342062398195715, "grad_norm": 4.757411479949951, "learning_rate": 1.5633344120493139e-06, "loss": 0.4547, "num_input_tokens_seen": 148902688, "step": 122445 }, { "epoch": 15.342688886104499, "grad_norm": 2.8483047485351562, "learning_rate": 1.5629373311903684e-06, "loss": 0.4205, "num_input_tokens_seen": 148908864, "step": 122450 }, { "epoch": 15.343315374013281, "grad_norm": 3.5706918239593506, "learning_rate": 1.5625402914243949e-06, "loss": 0.4721, "num_input_tokens_seen": 148914816, "step": 122455 }, { "epoch": 15.343941861922065, "grad_norm": 7.193317413330078, "learning_rate": 1.5621432927561435e-06, "loss": 0.4755, "num_input_tokens_seen": 148920768, "step": 122460 }, { "epoch": 15.344568349830848, "grad_norm": 8.673694610595703, "learning_rate": 1.5617463351903584e-06, "loss": 0.5405, "num_input_tokens_seen": 148927104, "step": 122465 }, { "epoch": 15.345194837739632, "grad_norm": 2.66023325920105, "learning_rate": 1.5613494187317862e-06, "loss": 0.4665, "num_input_tokens_seen": 148933216, "step": 122470 }, { "epoch": 15.345821325648416, "grad_norm": 4.5549116134643555, "learning_rate": 1.5609525433851748e-06, "loss": 0.4891, "num_input_tokens_seen": 148939424, "step": 122475 }, { "epoch": 15.346447813557198, "grad_norm": 2.438760757446289, "learning_rate": 1.5605557091552653e-06, "loss": 0.5327, "num_input_tokens_seen": 148944992, "step": 122480 }, { "epoch": 15.347074301465982, "grad_norm": 4.856987476348877, "learning_rate": 1.5601589160468055e-06, "loss": 0.445, "num_input_tokens_seen": 148951136, "step": 122485 }, { "epoch": 15.347700789374764, "grad_norm": 2.000562906265259, "learning_rate": 1.5597621640645361e-06, "loss": 0.4194, "num_input_tokens_seen": 148957024, "step": 122490 }, { "epoch": 15.348327277283548, "grad_norm": 3.323286771774292, "learning_rate": 1.5593654532132047e-06, "loss": 0.4401, "num_input_tokens_seen": 148963264, "step": 122495 }, { "epoch": 15.348953765192332, "grad_norm": 4.183625221252441, "learning_rate": 1.5589687834975497e-06, "loss": 0.433, "num_input_tokens_seen": 148969280, "step": 122500 }, { "epoch": 15.349580253101115, "grad_norm": 3.3809151649475098, "learning_rate": 1.558572154922318e-06, "loss": 0.4712, "num_input_tokens_seen": 148974784, "step": 122505 }, { "epoch": 15.350206741009899, "grad_norm": 2.181469678878784, "learning_rate": 1.5581755674922476e-06, "loss": 0.4722, "num_input_tokens_seen": 148980672, "step": 122510 }, { "epoch": 15.350833228918681, "grad_norm": 7.33654260635376, "learning_rate": 1.557779021212084e-06, "loss": 0.484, "num_input_tokens_seen": 148986784, "step": 122515 }, { "epoch": 15.351459716827465, "grad_norm": 1.5602967739105225, "learning_rate": 1.5573825160865647e-06, "loss": 0.4747, "num_input_tokens_seen": 148992480, "step": 122520 }, { "epoch": 15.35208620473625, "grad_norm": 10.748613357543945, "learning_rate": 1.5569860521204317e-06, "loss": 0.4621, "num_input_tokens_seen": 148998752, "step": 122525 }, { "epoch": 15.352712692645031, "grad_norm": 3.3827178478240967, "learning_rate": 1.5565896293184251e-06, "loss": 0.4308, "num_input_tokens_seen": 149004928, "step": 122530 }, { "epoch": 15.353339180553816, "grad_norm": 3.1017324924468994, "learning_rate": 1.5561932476852865e-06, "loss": 0.4801, "num_input_tokens_seen": 149011136, "step": 122535 }, { "epoch": 15.3539656684626, "grad_norm": 6.752301216125488, "learning_rate": 1.5557969072257511e-06, "loss": 0.4527, "num_input_tokens_seen": 149017248, "step": 122540 }, { "epoch": 15.354592156371382, "grad_norm": 2.7411787509918213, "learning_rate": 1.5554006079445594e-06, "loss": 0.4407, "num_input_tokens_seen": 149022976, "step": 122545 }, { "epoch": 15.355218644280166, "grad_norm": 3.198185443878174, "learning_rate": 1.5550043498464512e-06, "loss": 0.4397, "num_input_tokens_seen": 149029152, "step": 122550 }, { "epoch": 15.355845132188948, "grad_norm": 7.7732625007629395, "learning_rate": 1.5546081329361607e-06, "loss": 0.4503, "num_input_tokens_seen": 149035488, "step": 122555 }, { "epoch": 15.356471620097732, "grad_norm": 2.7422165870666504, "learning_rate": 1.5542119572184284e-06, "loss": 0.411, "num_input_tokens_seen": 149041056, "step": 122560 }, { "epoch": 15.357098108006516, "grad_norm": 2.4100422859191895, "learning_rate": 1.5538158226979872e-06, "loss": 0.4553, "num_input_tokens_seen": 149046688, "step": 122565 }, { "epoch": 15.357724595915299, "grad_norm": 4.064896106719971, "learning_rate": 1.5534197293795772e-06, "loss": 0.4487, "num_input_tokens_seen": 149052864, "step": 122570 }, { "epoch": 15.358351083824083, "grad_norm": 1.9756877422332764, "learning_rate": 1.5530236772679303e-06, "loss": 0.5374, "num_input_tokens_seen": 149059104, "step": 122575 }, { "epoch": 15.358977571732865, "grad_norm": 6.816775798797607, "learning_rate": 1.5526276663677853e-06, "loss": 0.4396, "num_input_tokens_seen": 149064896, "step": 122580 }, { "epoch": 15.359604059641649, "grad_norm": 3.1265337467193604, "learning_rate": 1.552231696683873e-06, "loss": 0.4314, "num_input_tokens_seen": 149071264, "step": 122585 }, { "epoch": 15.360230547550433, "grad_norm": 8.317814826965332, "learning_rate": 1.5518357682209295e-06, "loss": 0.4807, "num_input_tokens_seen": 149077504, "step": 122590 }, { "epoch": 15.360857035459215, "grad_norm": 3.2780821323394775, "learning_rate": 1.5514398809836906e-06, "loss": 0.4623, "num_input_tokens_seen": 149083840, "step": 122595 }, { "epoch": 15.361483523368, "grad_norm": 2.4219822883605957, "learning_rate": 1.5510440349768851e-06, "loss": 0.4804, "num_input_tokens_seen": 149089440, "step": 122600 }, { "epoch": 15.362110011276782, "grad_norm": 5.248170852661133, "learning_rate": 1.5506482302052495e-06, "loss": 0.4282, "num_input_tokens_seen": 149095488, "step": 122605 }, { "epoch": 15.362736499185566, "grad_norm": 4.975104808807373, "learning_rate": 1.5502524666735131e-06, "loss": 0.4653, "num_input_tokens_seen": 149101888, "step": 122610 }, { "epoch": 15.36336298709435, "grad_norm": 1.7945829629898071, "learning_rate": 1.5498567443864105e-06, "loss": 0.4282, "num_input_tokens_seen": 149108128, "step": 122615 }, { "epoch": 15.363989475003132, "grad_norm": 4.274087429046631, "learning_rate": 1.5494610633486689e-06, "loss": 0.4339, "num_input_tokens_seen": 149114368, "step": 122620 }, { "epoch": 15.364615962911916, "grad_norm": 1.9053821563720703, "learning_rate": 1.5490654235650237e-06, "loss": 0.4705, "num_input_tokens_seen": 149120576, "step": 122625 }, { "epoch": 15.365242450820698, "grad_norm": 5.832639694213867, "learning_rate": 1.548669825040201e-06, "loss": 0.4384, "num_input_tokens_seen": 149126464, "step": 122630 }, { "epoch": 15.365868938729482, "grad_norm": 2.5252113342285156, "learning_rate": 1.548274267778932e-06, "loss": 0.5181, "num_input_tokens_seen": 149132576, "step": 122635 }, { "epoch": 15.366495426638267, "grad_norm": 10.387177467346191, "learning_rate": 1.5478787517859468e-06, "loss": 0.4825, "num_input_tokens_seen": 149138976, "step": 122640 }, { "epoch": 15.367121914547049, "grad_norm": 9.056476593017578, "learning_rate": 1.5474832770659741e-06, "loss": 0.4341, "num_input_tokens_seen": 149144896, "step": 122645 }, { "epoch": 15.367748402455833, "grad_norm": 2.602832317352295, "learning_rate": 1.5470878436237407e-06, "loss": 0.473, "num_input_tokens_seen": 149150624, "step": 122650 }, { "epoch": 15.368374890364615, "grad_norm": 26.68798065185547, "learning_rate": 1.5466924514639748e-06, "loss": 0.593, "num_input_tokens_seen": 149156800, "step": 122655 }, { "epoch": 15.3690013782734, "grad_norm": 2.645404100418091, "learning_rate": 1.5462971005914062e-06, "loss": 0.4993, "num_input_tokens_seen": 149163200, "step": 122660 }, { "epoch": 15.369627866182183, "grad_norm": 1.698542594909668, "learning_rate": 1.545901791010757e-06, "loss": 0.4836, "num_input_tokens_seen": 149169248, "step": 122665 }, { "epoch": 15.370254354090966, "grad_norm": 1.8974952697753906, "learning_rate": 1.5455065227267585e-06, "loss": 0.4182, "num_input_tokens_seen": 149175200, "step": 122670 }, { "epoch": 15.37088084199975, "grad_norm": 2.112952470779419, "learning_rate": 1.5451112957441316e-06, "loss": 0.4268, "num_input_tokens_seen": 149180512, "step": 122675 }, { "epoch": 15.371507329908532, "grad_norm": 7.254515647888184, "learning_rate": 1.5447161100676055e-06, "loss": 0.4105, "num_input_tokens_seen": 149186560, "step": 122680 }, { "epoch": 15.372133817817316, "grad_norm": 3.2417473793029785, "learning_rate": 1.544320965701902e-06, "loss": 0.4616, "num_input_tokens_seen": 149192448, "step": 122685 }, { "epoch": 15.3727603057261, "grad_norm": 7.479846477508545, "learning_rate": 1.5439258626517484e-06, "loss": 0.4895, "num_input_tokens_seen": 149198496, "step": 122690 }, { "epoch": 15.373386793634882, "grad_norm": 7.615756511688232, "learning_rate": 1.5435308009218652e-06, "loss": 0.4548, "num_input_tokens_seen": 149204384, "step": 122695 }, { "epoch": 15.374013281543666, "grad_norm": 7.735569953918457, "learning_rate": 1.5431357805169788e-06, "loss": 0.4756, "num_input_tokens_seen": 149210464, "step": 122700 }, { "epoch": 15.37463976945245, "grad_norm": 3.3779759407043457, "learning_rate": 1.5427408014418087e-06, "loss": 0.4466, "num_input_tokens_seen": 149216640, "step": 122705 }, { "epoch": 15.375266257361233, "grad_norm": 2.2845916748046875, "learning_rate": 1.5423458637010808e-06, "loss": 0.5181, "num_input_tokens_seen": 149222944, "step": 122710 }, { "epoch": 15.375892745270017, "grad_norm": 3.6532912254333496, "learning_rate": 1.5419509672995137e-06, "loss": 0.4435, "num_input_tokens_seen": 149229024, "step": 122715 }, { "epoch": 15.376519233178799, "grad_norm": 3.103430986404419, "learning_rate": 1.5415561122418298e-06, "loss": 0.4238, "num_input_tokens_seen": 149235072, "step": 122720 }, { "epoch": 15.377145721087583, "grad_norm": 5.273199081420898, "learning_rate": 1.5411612985327523e-06, "loss": 0.4487, "num_input_tokens_seen": 149241312, "step": 122725 }, { "epoch": 15.377772208996367, "grad_norm": 5.95803689956665, "learning_rate": 1.5407665261769977e-06, "loss": 0.462, "num_input_tokens_seen": 149246880, "step": 122730 }, { "epoch": 15.37839869690515, "grad_norm": 2.120882034301758, "learning_rate": 1.5403717951792874e-06, "loss": 0.4614, "num_input_tokens_seen": 149253056, "step": 122735 }, { "epoch": 15.379025184813933, "grad_norm": 2.2086079120635986, "learning_rate": 1.539977105544343e-06, "loss": 0.4709, "num_input_tokens_seen": 149258368, "step": 122740 }, { "epoch": 15.379651672722716, "grad_norm": 2.509218692779541, "learning_rate": 1.5395824572768797e-06, "loss": 0.4604, "num_input_tokens_seen": 149264384, "step": 122745 }, { "epoch": 15.3802781606315, "grad_norm": 2.1770083904266357, "learning_rate": 1.5391878503816171e-06, "loss": 0.4543, "num_input_tokens_seen": 149270400, "step": 122750 }, { "epoch": 15.380904648540284, "grad_norm": 11.223592758178711, "learning_rate": 1.5387932848632752e-06, "loss": 0.4916, "num_input_tokens_seen": 149276000, "step": 122755 }, { "epoch": 15.381531136449066, "grad_norm": 6.504849433898926, "learning_rate": 1.5383987607265683e-06, "loss": 0.4694, "num_input_tokens_seen": 149282080, "step": 122760 }, { "epoch": 15.38215762435785, "grad_norm": 10.322053909301758, "learning_rate": 1.5380042779762162e-06, "loss": 0.5119, "num_input_tokens_seen": 149288320, "step": 122765 }, { "epoch": 15.382784112266632, "grad_norm": 2.679108142852783, "learning_rate": 1.5376098366169318e-06, "loss": 0.4477, "num_input_tokens_seen": 149294144, "step": 122770 }, { "epoch": 15.383410600175417, "grad_norm": 6.206238269805908, "learning_rate": 1.5372154366534325e-06, "loss": 0.432, "num_input_tokens_seen": 149300192, "step": 122775 }, { "epoch": 15.3840370880842, "grad_norm": 2.824648857116699, "learning_rate": 1.5368210780904359e-06, "loss": 0.4491, "num_input_tokens_seen": 149306240, "step": 122780 }, { "epoch": 15.384663575992983, "grad_norm": 2.639479160308838, "learning_rate": 1.5364267609326533e-06, "loss": 0.4822, "num_input_tokens_seen": 149311936, "step": 122785 }, { "epoch": 15.385290063901767, "grad_norm": 2.823230504989624, "learning_rate": 1.5360324851848029e-06, "loss": 0.4532, "num_input_tokens_seen": 149318144, "step": 122790 }, { "epoch": 15.38591655181055, "grad_norm": 3.953160047531128, "learning_rate": 1.5356382508515944e-06, "loss": 0.5275, "num_input_tokens_seen": 149324320, "step": 122795 }, { "epoch": 15.386543039719333, "grad_norm": 3.3848161697387695, "learning_rate": 1.5352440579377449e-06, "loss": 0.4492, "num_input_tokens_seen": 149330368, "step": 122800 }, { "epoch": 15.387169527628117, "grad_norm": 13.542452812194824, "learning_rate": 1.5348499064479645e-06, "loss": 0.4831, "num_input_tokens_seen": 149336672, "step": 122805 }, { "epoch": 15.3877960155369, "grad_norm": 2.0614616870880127, "learning_rate": 1.5344557963869677e-06, "loss": 0.5183, "num_input_tokens_seen": 149342656, "step": 122810 }, { "epoch": 15.388422503445684, "grad_norm": 2.252250909805298, "learning_rate": 1.5340617277594644e-06, "loss": 0.4176, "num_input_tokens_seen": 149348320, "step": 122815 }, { "epoch": 15.389048991354468, "grad_norm": 6.209364891052246, "learning_rate": 1.5336677005701683e-06, "loss": 0.4252, "num_input_tokens_seen": 149354560, "step": 122820 }, { "epoch": 15.38967547926325, "grad_norm": 2.411402702331543, "learning_rate": 1.533273714823788e-06, "loss": 0.4441, "num_input_tokens_seen": 149360992, "step": 122825 }, { "epoch": 15.390301967172034, "grad_norm": 14.768766403198242, "learning_rate": 1.5328797705250353e-06, "loss": 0.4656, "num_input_tokens_seen": 149367040, "step": 122830 }, { "epoch": 15.390928455080816, "grad_norm": 7.333043098449707, "learning_rate": 1.5324858676786214e-06, "loss": 0.4405, "num_input_tokens_seen": 149373312, "step": 122835 }, { "epoch": 15.3915549429896, "grad_norm": 2.259514331817627, "learning_rate": 1.5320920062892526e-06, "loss": 0.4432, "num_input_tokens_seen": 149379264, "step": 122840 }, { "epoch": 15.392181430898384, "grad_norm": 2.8540167808532715, "learning_rate": 1.5316981863616393e-06, "loss": 0.4675, "num_input_tokens_seen": 149385344, "step": 122845 }, { "epoch": 15.392807918807167, "grad_norm": 14.251672744750977, "learning_rate": 1.53130440790049e-06, "loss": 0.5359, "num_input_tokens_seen": 149390848, "step": 122850 }, { "epoch": 15.39343440671595, "grad_norm": 11.806241989135742, "learning_rate": 1.5309106709105149e-06, "loss": 0.4322, "num_input_tokens_seen": 149396864, "step": 122855 }, { "epoch": 15.394060894624733, "grad_norm": 3.5006070137023926, "learning_rate": 1.5305169753964178e-06, "loss": 0.4455, "num_input_tokens_seen": 149402912, "step": 122860 }, { "epoch": 15.394687382533517, "grad_norm": 2.13962984085083, "learning_rate": 1.5301233213629085e-06, "loss": 0.4541, "num_input_tokens_seen": 149409024, "step": 122865 }, { "epoch": 15.395313870442301, "grad_norm": 4.113478660583496, "learning_rate": 1.5297297088146906e-06, "loss": 0.3969, "num_input_tokens_seen": 149415392, "step": 122870 }, { "epoch": 15.395940358351083, "grad_norm": 3.0156331062316895, "learning_rate": 1.5293361377564735e-06, "loss": 0.4526, "num_input_tokens_seen": 149421760, "step": 122875 }, { "epoch": 15.396566846259867, "grad_norm": 5.1124267578125, "learning_rate": 1.5289426081929593e-06, "loss": 0.469, "num_input_tokens_seen": 149427968, "step": 122880 }, { "epoch": 15.39719333416865, "grad_norm": 15.64664363861084, "learning_rate": 1.528549120128856e-06, "loss": 0.5081, "num_input_tokens_seen": 149434208, "step": 122885 }, { "epoch": 15.397819822077434, "grad_norm": 14.81412124633789, "learning_rate": 1.5281556735688653e-06, "loss": 0.4927, "num_input_tokens_seen": 149440160, "step": 122890 }, { "epoch": 15.398446309986218, "grad_norm": 7.195440769195557, "learning_rate": 1.5277622685176936e-06, "loss": 0.4591, "num_input_tokens_seen": 149446592, "step": 122895 }, { "epoch": 15.399072797895, "grad_norm": 4.517303943634033, "learning_rate": 1.5273689049800417e-06, "loss": 0.4601, "num_input_tokens_seen": 149452832, "step": 122900 }, { "epoch": 15.399699285803784, "grad_norm": 2.8718159198760986, "learning_rate": 1.5269755829606142e-06, "loss": 0.4302, "num_input_tokens_seen": 149458656, "step": 122905 }, { "epoch": 15.400325773712567, "grad_norm": 2.2010226249694824, "learning_rate": 1.5265823024641157e-06, "loss": 0.4104, "num_input_tokens_seen": 149464544, "step": 122910 }, { "epoch": 15.40095226162135, "grad_norm": 23.81168556213379, "learning_rate": 1.5261890634952436e-06, "loss": 0.5081, "num_input_tokens_seen": 149470656, "step": 122915 }, { "epoch": 15.401578749530135, "grad_norm": 12.741910934448242, "learning_rate": 1.5257958660587042e-06, "loss": 0.5529, "num_input_tokens_seen": 149476672, "step": 122920 }, { "epoch": 15.402205237438917, "grad_norm": 2.155505895614624, "learning_rate": 1.5254027101591945e-06, "loss": 0.4195, "num_input_tokens_seen": 149482880, "step": 122925 }, { "epoch": 15.402831725347701, "grad_norm": 10.055671691894531, "learning_rate": 1.5250095958014177e-06, "loss": 0.5219, "num_input_tokens_seen": 149489120, "step": 122930 }, { "epoch": 15.403458213256485, "grad_norm": 5.8029046058654785, "learning_rate": 1.5246165229900716e-06, "loss": 0.4756, "num_input_tokens_seen": 149495360, "step": 122935 }, { "epoch": 15.404084701165267, "grad_norm": 10.77305793762207, "learning_rate": 1.524223491729857e-06, "loss": 0.5168, "num_input_tokens_seen": 149501248, "step": 122940 }, { "epoch": 15.404711189074051, "grad_norm": 5.236483097076416, "learning_rate": 1.5238305020254729e-06, "loss": 0.456, "num_input_tokens_seen": 149507296, "step": 122945 }, { "epoch": 15.405337676982834, "grad_norm": 5.187534809112549, "learning_rate": 1.5234375538816193e-06, "loss": 0.4326, "num_input_tokens_seen": 149513280, "step": 122950 }, { "epoch": 15.405964164891618, "grad_norm": 2.174440383911133, "learning_rate": 1.5230446473029908e-06, "loss": 0.4184, "num_input_tokens_seen": 149519264, "step": 122955 }, { "epoch": 15.406590652800402, "grad_norm": 21.44498062133789, "learning_rate": 1.522651782294287e-06, "loss": 0.5325, "num_input_tokens_seen": 149525312, "step": 122960 }, { "epoch": 15.407217140709184, "grad_norm": 3.820993423461914, "learning_rate": 1.522258958860206e-06, "loss": 0.4338, "num_input_tokens_seen": 149531360, "step": 122965 }, { "epoch": 15.407843628617968, "grad_norm": 2.976053476333618, "learning_rate": 1.5218661770054422e-06, "loss": 0.4526, "num_input_tokens_seen": 149537504, "step": 122970 }, { "epoch": 15.40847011652675, "grad_norm": 5.132102966308594, "learning_rate": 1.5214734367346935e-06, "loss": 0.4267, "num_input_tokens_seen": 149543232, "step": 122975 }, { "epoch": 15.409096604435534, "grad_norm": 8.605234146118164, "learning_rate": 1.5210807380526526e-06, "loss": 0.4265, "num_input_tokens_seen": 149548864, "step": 122980 }, { "epoch": 15.409723092344318, "grad_norm": 2.433880567550659, "learning_rate": 1.520688080964019e-06, "loss": 0.5156, "num_input_tokens_seen": 149554816, "step": 122985 }, { "epoch": 15.4103495802531, "grad_norm": 1.5552937984466553, "learning_rate": 1.520295465473482e-06, "loss": 0.4795, "num_input_tokens_seen": 149560352, "step": 122990 }, { "epoch": 15.410976068161885, "grad_norm": 21.014671325683594, "learning_rate": 1.5199028915857405e-06, "loss": 0.4903, "num_input_tokens_seen": 149565696, "step": 122995 }, { "epoch": 15.411602556070667, "grad_norm": 3.2000632286071777, "learning_rate": 1.519510359305484e-06, "loss": 0.4429, "num_input_tokens_seen": 149572032, "step": 123000 }, { "epoch": 15.412229043979451, "grad_norm": 8.034587860107422, "learning_rate": 1.5191178686374092e-06, "loss": 0.4791, "num_input_tokens_seen": 149578272, "step": 123005 }, { "epoch": 15.412855531888235, "grad_norm": 3.603546380996704, "learning_rate": 1.518725419586205e-06, "loss": 0.4468, "num_input_tokens_seen": 149584384, "step": 123010 }, { "epoch": 15.413482019797017, "grad_norm": 2.3774092197418213, "learning_rate": 1.518333012156567e-06, "loss": 0.5051, "num_input_tokens_seen": 149590496, "step": 123015 }, { "epoch": 15.414108507705802, "grad_norm": 5.3562703132629395, "learning_rate": 1.5179406463531837e-06, "loss": 0.5255, "num_input_tokens_seen": 149597024, "step": 123020 }, { "epoch": 15.414734995614584, "grad_norm": 8.466326713562012, "learning_rate": 1.517548322180747e-06, "loss": 0.5305, "num_input_tokens_seen": 149602880, "step": 123025 }, { "epoch": 15.415361483523368, "grad_norm": 8.302535057067871, "learning_rate": 1.51715603964395e-06, "loss": 0.4399, "num_input_tokens_seen": 149609312, "step": 123030 }, { "epoch": 15.415987971432152, "grad_norm": 12.62143611907959, "learning_rate": 1.5167637987474793e-06, "loss": 0.4864, "num_input_tokens_seen": 149615360, "step": 123035 }, { "epoch": 15.416614459340934, "grad_norm": 8.539071083068848, "learning_rate": 1.5163715994960271e-06, "loss": 0.4212, "num_input_tokens_seen": 149621600, "step": 123040 }, { "epoch": 15.417240947249718, "grad_norm": 15.512496948242188, "learning_rate": 1.5159794418942796e-06, "loss": 0.4973, "num_input_tokens_seen": 149627936, "step": 123045 }, { "epoch": 15.417867435158502, "grad_norm": 2.5176613330841064, "learning_rate": 1.5155873259469272e-06, "loss": 0.4256, "num_input_tokens_seen": 149633952, "step": 123050 }, { "epoch": 15.418493923067285, "grad_norm": 2.078204870223999, "learning_rate": 1.5151952516586583e-06, "loss": 0.4686, "num_input_tokens_seen": 149639776, "step": 123055 }, { "epoch": 15.419120410976069, "grad_norm": 1.8875787258148193, "learning_rate": 1.514803219034161e-06, "loss": 0.4242, "num_input_tokens_seen": 149645600, "step": 123060 }, { "epoch": 15.419746898884851, "grad_norm": 12.118220329284668, "learning_rate": 1.5144112280781203e-06, "loss": 0.4544, "num_input_tokens_seen": 149651872, "step": 123065 }, { "epoch": 15.420373386793635, "grad_norm": 4.984118938446045, "learning_rate": 1.5140192787952257e-06, "loss": 0.4662, "num_input_tokens_seen": 149658112, "step": 123070 }, { "epoch": 15.420999874702419, "grad_norm": 7.637845039367676, "learning_rate": 1.5136273711901594e-06, "loss": 0.5322, "num_input_tokens_seen": 149664288, "step": 123075 }, { "epoch": 15.421626362611201, "grad_norm": 4.103061199188232, "learning_rate": 1.5132355052676107e-06, "loss": 0.4622, "num_input_tokens_seen": 149670144, "step": 123080 }, { "epoch": 15.422252850519985, "grad_norm": 5.183483123779297, "learning_rate": 1.5128436810322617e-06, "loss": 0.5246, "num_input_tokens_seen": 149675936, "step": 123085 }, { "epoch": 15.422879338428768, "grad_norm": 4.9493021965026855, "learning_rate": 1.512451898488798e-06, "loss": 0.4608, "num_input_tokens_seen": 149682176, "step": 123090 }, { "epoch": 15.423505826337552, "grad_norm": 14.662078857421875, "learning_rate": 1.5120601576419058e-06, "loss": 0.5349, "num_input_tokens_seen": 149687840, "step": 123095 }, { "epoch": 15.424132314246336, "grad_norm": 2.4552717208862305, "learning_rate": 1.5116684584962654e-06, "loss": 0.427, "num_input_tokens_seen": 149693888, "step": 123100 }, { "epoch": 15.424758802155118, "grad_norm": 2.921248435974121, "learning_rate": 1.5112768010565626e-06, "loss": 0.4244, "num_input_tokens_seen": 149699904, "step": 123105 }, { "epoch": 15.425385290063902, "grad_norm": 7.72694730758667, "learning_rate": 1.5108851853274775e-06, "loss": 0.5447, "num_input_tokens_seen": 149706080, "step": 123110 }, { "epoch": 15.426011777972684, "grad_norm": 5.984151363372803, "learning_rate": 1.5104936113136953e-06, "loss": 0.4816, "num_input_tokens_seen": 149712416, "step": 123115 }, { "epoch": 15.426638265881468, "grad_norm": 3.668656826019287, "learning_rate": 1.5101020790198938e-06, "loss": 0.4612, "num_input_tokens_seen": 149718368, "step": 123120 }, { "epoch": 15.427264753790253, "grad_norm": 2.3486368656158447, "learning_rate": 1.509710588450758e-06, "loss": 0.4255, "num_input_tokens_seen": 149724576, "step": 123125 }, { "epoch": 15.427891241699035, "grad_norm": 7.317808628082275, "learning_rate": 1.5093191396109646e-06, "loss": 0.4173, "num_input_tokens_seen": 149730176, "step": 123130 }, { "epoch": 15.428517729607819, "grad_norm": 4.867157936096191, "learning_rate": 1.5089277325051976e-06, "loss": 0.4156, "num_input_tokens_seen": 149736576, "step": 123135 }, { "epoch": 15.429144217516601, "grad_norm": 2.7790489196777344, "learning_rate": 1.5085363671381332e-06, "loss": 0.428, "num_input_tokens_seen": 149742880, "step": 123140 }, { "epoch": 15.429770705425385, "grad_norm": 2.503199815750122, "learning_rate": 1.5081450435144518e-06, "loss": 0.4254, "num_input_tokens_seen": 149748992, "step": 123145 }, { "epoch": 15.43039719333417, "grad_norm": 2.258270025253296, "learning_rate": 1.5077537616388321e-06, "loss": 0.5174, "num_input_tokens_seen": 149754912, "step": 123150 }, { "epoch": 15.431023681242952, "grad_norm": 3.8145503997802734, "learning_rate": 1.5073625215159537e-06, "loss": 0.4527, "num_input_tokens_seen": 149761408, "step": 123155 }, { "epoch": 15.431650169151736, "grad_norm": 2.4983222484588623, "learning_rate": 1.5069713231504917e-06, "loss": 0.4151, "num_input_tokens_seen": 149767776, "step": 123160 }, { "epoch": 15.432276657060518, "grad_norm": 6.533345699310303, "learning_rate": 1.5065801665471242e-06, "loss": 0.4265, "num_input_tokens_seen": 149773376, "step": 123165 }, { "epoch": 15.432903144969302, "grad_norm": 8.604901313781738, "learning_rate": 1.5061890517105294e-06, "loss": 0.481, "num_input_tokens_seen": 149779200, "step": 123170 }, { "epoch": 15.433529632878086, "grad_norm": 2.0938634872436523, "learning_rate": 1.5057979786453802e-06, "loss": 0.4302, "num_input_tokens_seen": 149785248, "step": 123175 }, { "epoch": 15.434156120786868, "grad_norm": 3.597339153289795, "learning_rate": 1.505406947356356e-06, "loss": 0.5735, "num_input_tokens_seen": 149791520, "step": 123180 }, { "epoch": 15.434782608695652, "grad_norm": 3.2688894271850586, "learning_rate": 1.5050159578481277e-06, "loss": 0.4796, "num_input_tokens_seen": 149797856, "step": 123185 }, { "epoch": 15.435409096604436, "grad_norm": 3.3091797828674316, "learning_rate": 1.504625010125374e-06, "loss": 0.4954, "num_input_tokens_seen": 149804096, "step": 123190 }, { "epoch": 15.436035584513219, "grad_norm": 3.9777660369873047, "learning_rate": 1.5042341041927654e-06, "loss": 0.4122, "num_input_tokens_seen": 149810112, "step": 123195 }, { "epoch": 15.436662072422003, "grad_norm": 2.5575900077819824, "learning_rate": 1.5038432400549784e-06, "loss": 0.4261, "num_input_tokens_seen": 149816576, "step": 123200 }, { "epoch": 15.437288560330785, "grad_norm": 10.33779525756836, "learning_rate": 1.5034524177166836e-06, "loss": 0.4958, "num_input_tokens_seen": 149822496, "step": 123205 }, { "epoch": 15.437915048239569, "grad_norm": 4.977193355560303, "learning_rate": 1.5030616371825552e-06, "loss": 0.4687, "num_input_tokens_seen": 149828800, "step": 123210 }, { "epoch": 15.438541536148353, "grad_norm": 2.790367364883423, "learning_rate": 1.5026708984572664e-06, "loss": 0.4284, "num_input_tokens_seen": 149834912, "step": 123215 }, { "epoch": 15.439168024057135, "grad_norm": 2.2582461833953857, "learning_rate": 1.502280201545485e-06, "loss": 0.4635, "num_input_tokens_seen": 149840960, "step": 123220 }, { "epoch": 15.43979451196592, "grad_norm": 18.530868530273438, "learning_rate": 1.5018895464518874e-06, "loss": 0.5107, "num_input_tokens_seen": 149847104, "step": 123225 }, { "epoch": 15.440420999874702, "grad_norm": 2.7155914306640625, "learning_rate": 1.5014989331811386e-06, "loss": 0.4323, "num_input_tokens_seen": 149853312, "step": 123230 }, { "epoch": 15.441047487783486, "grad_norm": 13.72166633605957, "learning_rate": 1.5011083617379136e-06, "loss": 0.4409, "num_input_tokens_seen": 149859648, "step": 123235 }, { "epoch": 15.44167397569227, "grad_norm": 3.1128170490264893, "learning_rate": 1.5007178321268783e-06, "loss": 0.4364, "num_input_tokens_seen": 149866016, "step": 123240 }, { "epoch": 15.442300463601052, "grad_norm": 2.8090128898620605, "learning_rate": 1.5003273443527034e-06, "loss": 0.4152, "num_input_tokens_seen": 149872224, "step": 123245 }, { "epoch": 15.442926951509836, "grad_norm": 2.8491697311401367, "learning_rate": 1.4999368984200585e-06, "loss": 0.4189, "num_input_tokens_seen": 149878592, "step": 123250 }, { "epoch": 15.443553439418618, "grad_norm": 4.616067409515381, "learning_rate": 1.4995464943336097e-06, "loss": 0.4773, "num_input_tokens_seen": 149884704, "step": 123255 }, { "epoch": 15.444179927327403, "grad_norm": 15.0562105178833, "learning_rate": 1.4991561320980253e-06, "loss": 0.4814, "num_input_tokens_seen": 149890464, "step": 123260 }, { "epoch": 15.444806415236187, "grad_norm": 2.2814977169036865, "learning_rate": 1.4987658117179743e-06, "loss": 0.5448, "num_input_tokens_seen": 149896480, "step": 123265 }, { "epoch": 15.445432903144969, "grad_norm": 2.805483341217041, "learning_rate": 1.4983755331981197e-06, "loss": 0.4281, "num_input_tokens_seen": 149902624, "step": 123270 }, { "epoch": 15.446059391053753, "grad_norm": 10.85744571685791, "learning_rate": 1.4979852965431301e-06, "loss": 0.4548, "num_input_tokens_seen": 149908704, "step": 123275 }, { "epoch": 15.446685878962535, "grad_norm": 2.2218446731567383, "learning_rate": 1.4975951017576722e-06, "loss": 0.5663, "num_input_tokens_seen": 149915072, "step": 123280 }, { "epoch": 15.44731236687132, "grad_norm": 20.943115234375, "learning_rate": 1.497204948846408e-06, "loss": 0.5224, "num_input_tokens_seen": 149921792, "step": 123285 }, { "epoch": 15.447938854780103, "grad_norm": 2.3472132682800293, "learning_rate": 1.4968148378140045e-06, "loss": 0.4408, "num_input_tokens_seen": 149928032, "step": 123290 }, { "epoch": 15.448565342688886, "grad_norm": 2.9110357761383057, "learning_rate": 1.496424768665124e-06, "loss": 0.4431, "num_input_tokens_seen": 149934304, "step": 123295 }, { "epoch": 15.44919183059767, "grad_norm": 3.434976816177368, "learning_rate": 1.496034741404433e-06, "loss": 0.4548, "num_input_tokens_seen": 149939936, "step": 123300 }, { "epoch": 15.449818318506452, "grad_norm": 1.8549368381500244, "learning_rate": 1.4956447560365906e-06, "loss": 0.4974, "num_input_tokens_seen": 149945568, "step": 123305 }, { "epoch": 15.450444806415236, "grad_norm": 1.9959728717803955, "learning_rate": 1.4952548125662626e-06, "loss": 0.4611, "num_input_tokens_seen": 149951936, "step": 123310 }, { "epoch": 15.45107129432402, "grad_norm": 15.012420654296875, "learning_rate": 1.494864910998109e-06, "loss": 0.4906, "num_input_tokens_seen": 149958016, "step": 123315 }, { "epoch": 15.451697782232802, "grad_norm": 2.814185619354248, "learning_rate": 1.4944750513367933e-06, "loss": 0.4682, "num_input_tokens_seen": 149963488, "step": 123320 }, { "epoch": 15.452324270141586, "grad_norm": 3.0076401233673096, "learning_rate": 1.4940852335869743e-06, "loss": 0.418, "num_input_tokens_seen": 149969504, "step": 123325 }, { "epoch": 15.45295075805037, "grad_norm": 1.6399147510528564, "learning_rate": 1.4936954577533142e-06, "loss": 0.4362, "num_input_tokens_seen": 149975776, "step": 123330 }, { "epoch": 15.453577245959153, "grad_norm": 5.186948299407959, "learning_rate": 1.4933057238404747e-06, "loss": 0.4946, "num_input_tokens_seen": 149982144, "step": 123335 }, { "epoch": 15.454203733867937, "grad_norm": 2.284085273742676, "learning_rate": 1.492916031853111e-06, "loss": 0.4627, "num_input_tokens_seen": 149988480, "step": 123340 }, { "epoch": 15.454830221776719, "grad_norm": 12.077268600463867, "learning_rate": 1.4925263817958874e-06, "loss": 0.4819, "num_input_tokens_seen": 149994880, "step": 123345 }, { "epoch": 15.455456709685503, "grad_norm": 2.771296739578247, "learning_rate": 1.4921367736734576e-06, "loss": 0.4596, "num_input_tokens_seen": 150000832, "step": 123350 }, { "epoch": 15.456083197594287, "grad_norm": 2.2995059490203857, "learning_rate": 1.4917472074904821e-06, "loss": 0.4105, "num_input_tokens_seen": 150006816, "step": 123355 }, { "epoch": 15.45670968550307, "grad_norm": 2.42081618309021, "learning_rate": 1.4913576832516198e-06, "loss": 0.4799, "num_input_tokens_seen": 150012320, "step": 123360 }, { "epoch": 15.457336173411854, "grad_norm": 10.061322212219238, "learning_rate": 1.4909682009615245e-06, "loss": 0.493, "num_input_tokens_seen": 150018592, "step": 123365 }, { "epoch": 15.457962661320636, "grad_norm": 8.32505989074707, "learning_rate": 1.4905787606248551e-06, "loss": 0.4609, "num_input_tokens_seen": 150024544, "step": 123370 }, { "epoch": 15.45858914922942, "grad_norm": 2.877134084701538, "learning_rate": 1.4901893622462682e-06, "loss": 0.4665, "num_input_tokens_seen": 150030784, "step": 123375 }, { "epoch": 15.459215637138204, "grad_norm": 2.790039300918579, "learning_rate": 1.4898000058304174e-06, "loss": 0.4637, "num_input_tokens_seen": 150036768, "step": 123380 }, { "epoch": 15.459842125046986, "grad_norm": 10.893385887145996, "learning_rate": 1.48941069138196e-06, "loss": 0.5032, "num_input_tokens_seen": 150043072, "step": 123385 }, { "epoch": 15.46046861295577, "grad_norm": 8.505386352539062, "learning_rate": 1.4890214189055475e-06, "loss": 0.5015, "num_input_tokens_seen": 150049472, "step": 123390 }, { "epoch": 15.461095100864553, "grad_norm": 2.3454785346984863, "learning_rate": 1.4886321884058363e-06, "loss": 0.5223, "num_input_tokens_seen": 150055424, "step": 123395 }, { "epoch": 15.461721588773337, "grad_norm": 2.0925233364105225, "learning_rate": 1.488242999887481e-06, "loss": 0.4229, "num_input_tokens_seen": 150061632, "step": 123400 }, { "epoch": 15.46234807668212, "grad_norm": 2.655211925506592, "learning_rate": 1.4878538533551312e-06, "loss": 0.42, "num_input_tokens_seen": 150067808, "step": 123405 }, { "epoch": 15.462974564590903, "grad_norm": 2.7724578380584717, "learning_rate": 1.487464748813443e-06, "loss": 0.4539, "num_input_tokens_seen": 150073632, "step": 123410 }, { "epoch": 15.463601052499687, "grad_norm": 2.5786032676696777, "learning_rate": 1.4870756862670655e-06, "loss": 0.4408, "num_input_tokens_seen": 150079744, "step": 123415 }, { "epoch": 15.46422754040847, "grad_norm": 9.718103408813477, "learning_rate": 1.4866866657206536e-06, "loss": 0.4743, "num_input_tokens_seen": 150085984, "step": 123420 }, { "epoch": 15.464854028317253, "grad_norm": 6.257485866546631, "learning_rate": 1.4862976871788542e-06, "loss": 0.5153, "num_input_tokens_seen": 150092160, "step": 123425 }, { "epoch": 15.465480516226037, "grad_norm": 6.771975517272949, "learning_rate": 1.4859087506463222e-06, "loss": 0.4736, "num_input_tokens_seen": 150098304, "step": 123430 }, { "epoch": 15.46610700413482, "grad_norm": 3.0179603099823, "learning_rate": 1.485519856127704e-06, "loss": 0.454, "num_input_tokens_seen": 150104128, "step": 123435 }, { "epoch": 15.466733492043604, "grad_norm": 2.541771411895752, "learning_rate": 1.4851310036276517e-06, "loss": 0.4457, "num_input_tokens_seen": 150110176, "step": 123440 }, { "epoch": 15.467359979952388, "grad_norm": 5.635023593902588, "learning_rate": 1.484742193150812e-06, "loss": 0.4287, "num_input_tokens_seen": 150116128, "step": 123445 }, { "epoch": 15.46798646786117, "grad_norm": 13.3443021774292, "learning_rate": 1.4843534247018354e-06, "loss": 0.4717, "num_input_tokens_seen": 150122304, "step": 123450 }, { "epoch": 15.468612955769954, "grad_norm": 7.547295570373535, "learning_rate": 1.4839646982853706e-06, "loss": 0.4706, "num_input_tokens_seen": 150128768, "step": 123455 }, { "epoch": 15.469239443678736, "grad_norm": 5.116946697235107, "learning_rate": 1.4835760139060629e-06, "loss": 0.5017, "num_input_tokens_seen": 150135264, "step": 123460 }, { "epoch": 15.46986593158752, "grad_norm": 2.9952235221862793, "learning_rate": 1.4831873715685597e-06, "loss": 0.453, "num_input_tokens_seen": 150141376, "step": 123465 }, { "epoch": 15.470492419496304, "grad_norm": 2.1262123584747314, "learning_rate": 1.4827987712775082e-06, "loss": 0.4318, "num_input_tokens_seen": 150147584, "step": 123470 }, { "epoch": 15.471118907405087, "grad_norm": 1.8700687885284424, "learning_rate": 1.4824102130375567e-06, "loss": 0.503, "num_input_tokens_seen": 150153472, "step": 123475 }, { "epoch": 15.47174539531387, "grad_norm": 16.337156295776367, "learning_rate": 1.4820216968533462e-06, "loss": 0.48, "num_input_tokens_seen": 150159488, "step": 123480 }, { "epoch": 15.472371883222653, "grad_norm": 3.280365228652954, "learning_rate": 1.4816332227295266e-06, "loss": 0.4182, "num_input_tokens_seen": 150165504, "step": 123485 }, { "epoch": 15.472998371131437, "grad_norm": 5.218988418579102, "learning_rate": 1.4812447906707378e-06, "loss": 0.4477, "num_input_tokens_seen": 150171808, "step": 123490 }, { "epoch": 15.473624859040221, "grad_norm": 3.186897039413452, "learning_rate": 1.4808564006816272e-06, "loss": 0.4392, "num_input_tokens_seen": 150177536, "step": 123495 }, { "epoch": 15.474251346949004, "grad_norm": 12.291437149047852, "learning_rate": 1.4804680527668359e-06, "loss": 0.4569, "num_input_tokens_seen": 150183744, "step": 123500 }, { "epoch": 15.474877834857788, "grad_norm": 12.685979843139648, "learning_rate": 1.4800797469310097e-06, "loss": 0.4506, "num_input_tokens_seen": 150189888, "step": 123505 }, { "epoch": 15.47550432276657, "grad_norm": 2.144003391265869, "learning_rate": 1.4796914831787878e-06, "loss": 0.4782, "num_input_tokens_seen": 150196128, "step": 123510 }, { "epoch": 15.476130810675354, "grad_norm": 2.797053098678589, "learning_rate": 1.479303261514814e-06, "loss": 0.4654, "num_input_tokens_seen": 150202368, "step": 123515 }, { "epoch": 15.476757298584138, "grad_norm": 2.5632636547088623, "learning_rate": 1.478915081943731e-06, "loss": 0.4429, "num_input_tokens_seen": 150208384, "step": 123520 }, { "epoch": 15.47738378649292, "grad_norm": 2.209374189376831, "learning_rate": 1.4785269444701772e-06, "loss": 0.4842, "num_input_tokens_seen": 150214432, "step": 123525 }, { "epoch": 15.478010274401704, "grad_norm": 5.645022869110107, "learning_rate": 1.4781388490987959e-06, "loss": 0.4926, "num_input_tokens_seen": 150220896, "step": 123530 }, { "epoch": 15.478636762310487, "grad_norm": 2.740938663482666, "learning_rate": 1.4777507958342247e-06, "loss": 0.4248, "num_input_tokens_seen": 150226784, "step": 123535 }, { "epoch": 15.47926325021927, "grad_norm": 2.1206042766571045, "learning_rate": 1.477362784681105e-06, "loss": 0.4367, "num_input_tokens_seen": 150232704, "step": 123540 }, { "epoch": 15.479889738128055, "grad_norm": 3.3490474224090576, "learning_rate": 1.4769748156440733e-06, "loss": 0.4189, "num_input_tokens_seen": 150238752, "step": 123545 }, { "epoch": 15.480516226036837, "grad_norm": 2.0728774070739746, "learning_rate": 1.4765868887277718e-06, "loss": 0.4535, "num_input_tokens_seen": 150244896, "step": 123550 }, { "epoch": 15.481142713945621, "grad_norm": 2.617648124694824, "learning_rate": 1.4761990039368346e-06, "loss": 0.4454, "num_input_tokens_seen": 150250848, "step": 123555 }, { "epoch": 15.481769201854405, "grad_norm": 3.624554395675659, "learning_rate": 1.4758111612759007e-06, "loss": 0.448, "num_input_tokens_seen": 150257056, "step": 123560 }, { "epoch": 15.482395689763187, "grad_norm": 2.120180368423462, "learning_rate": 1.4754233607496071e-06, "loss": 0.4134, "num_input_tokens_seen": 150263200, "step": 123565 }, { "epoch": 15.483022177671971, "grad_norm": 4.919936656951904, "learning_rate": 1.4750356023625928e-06, "loss": 0.5145, "num_input_tokens_seen": 150269472, "step": 123570 }, { "epoch": 15.483648665580754, "grad_norm": 3.1707923412323, "learning_rate": 1.4746478861194896e-06, "loss": 0.5726, "num_input_tokens_seen": 150275904, "step": 123575 }, { "epoch": 15.484275153489538, "grad_norm": 16.4373779296875, "learning_rate": 1.474260212024935e-06, "loss": 0.5906, "num_input_tokens_seen": 150282336, "step": 123580 }, { "epoch": 15.484901641398322, "grad_norm": 11.619460105895996, "learning_rate": 1.4738725800835651e-06, "loss": 0.4335, "num_input_tokens_seen": 150288640, "step": 123585 }, { "epoch": 15.485528129307104, "grad_norm": 6.597102165222168, "learning_rate": 1.4734849903000114e-06, "loss": 0.496, "num_input_tokens_seen": 150294656, "step": 123590 }, { "epoch": 15.486154617215888, "grad_norm": 3.2056570053100586, "learning_rate": 1.473097442678912e-06, "loss": 0.4667, "num_input_tokens_seen": 150300896, "step": 123595 }, { "epoch": 15.48678110512467, "grad_norm": 10.35183334350586, "learning_rate": 1.4727099372248953e-06, "loss": 0.4695, "num_input_tokens_seen": 150306784, "step": 123600 }, { "epoch": 15.487407593033454, "grad_norm": 2.538931131362915, "learning_rate": 1.472322473942599e-06, "loss": 0.4523, "num_input_tokens_seen": 150312832, "step": 123605 }, { "epoch": 15.488034080942239, "grad_norm": 2.6099729537963867, "learning_rate": 1.471935052836652e-06, "loss": 0.4494, "num_input_tokens_seen": 150318240, "step": 123610 }, { "epoch": 15.48866056885102, "grad_norm": 5.715949058532715, "learning_rate": 1.4715476739116891e-06, "loss": 0.4571, "num_input_tokens_seen": 150323744, "step": 123615 }, { "epoch": 15.489287056759805, "grad_norm": 2.17580509185791, "learning_rate": 1.4711603371723388e-06, "loss": 0.4512, "num_input_tokens_seen": 150329888, "step": 123620 }, { "epoch": 15.489913544668587, "grad_norm": 2.7009284496307373, "learning_rate": 1.4707730426232352e-06, "loss": 0.4834, "num_input_tokens_seen": 150336032, "step": 123625 }, { "epoch": 15.490540032577371, "grad_norm": 1.9951046705245972, "learning_rate": 1.470385790269005e-06, "loss": 0.5708, "num_input_tokens_seen": 150342272, "step": 123630 }, { "epoch": 15.491166520486155, "grad_norm": 9.340068817138672, "learning_rate": 1.4699985801142824e-06, "loss": 0.4269, "num_input_tokens_seen": 150348192, "step": 123635 }, { "epoch": 15.491793008394938, "grad_norm": 2.329967737197876, "learning_rate": 1.469611412163693e-06, "loss": 0.469, "num_input_tokens_seen": 150354400, "step": 123640 }, { "epoch": 15.492419496303722, "grad_norm": 2.104992389678955, "learning_rate": 1.469224286421867e-06, "loss": 0.4339, "num_input_tokens_seen": 150360320, "step": 123645 }, { "epoch": 15.493045984212504, "grad_norm": 2.9776058197021484, "learning_rate": 1.468837202893435e-06, "loss": 0.4631, "num_input_tokens_seen": 150366272, "step": 123650 }, { "epoch": 15.493672472121288, "grad_norm": 6.894135475158691, "learning_rate": 1.4684501615830216e-06, "loss": 0.466, "num_input_tokens_seen": 150372256, "step": 123655 }, { "epoch": 15.494298960030072, "grad_norm": 1.9877111911773682, "learning_rate": 1.4680631624952567e-06, "loss": 0.4227, "num_input_tokens_seen": 150378208, "step": 123660 }, { "epoch": 15.494925447938854, "grad_norm": 8.435639381408691, "learning_rate": 1.467676205634765e-06, "loss": 0.48, "num_input_tokens_seen": 150383840, "step": 123665 }, { "epoch": 15.495551935847638, "grad_norm": 2.1060776710510254, "learning_rate": 1.4672892910061742e-06, "loss": 0.4356, "num_input_tokens_seen": 150390080, "step": 123670 }, { "epoch": 15.496178423756422, "grad_norm": 2.9855501651763916, "learning_rate": 1.46690241861411e-06, "loss": 0.5274, "num_input_tokens_seen": 150396096, "step": 123675 }, { "epoch": 15.496804911665205, "grad_norm": 8.79495906829834, "learning_rate": 1.4665155884631994e-06, "loss": 0.4794, "num_input_tokens_seen": 150402368, "step": 123680 }, { "epoch": 15.497431399573989, "grad_norm": 2.0499181747436523, "learning_rate": 1.4661288005580637e-06, "loss": 0.4699, "num_input_tokens_seen": 150408288, "step": 123685 }, { "epoch": 15.498057887482771, "grad_norm": 11.761130332946777, "learning_rate": 1.4657420549033313e-06, "loss": 0.4889, "num_input_tokens_seen": 150414400, "step": 123690 }, { "epoch": 15.498684375391555, "grad_norm": 9.697346687316895, "learning_rate": 1.4653553515036228e-06, "loss": 0.4149, "num_input_tokens_seen": 150420768, "step": 123695 }, { "epoch": 15.49931086330034, "grad_norm": 1.9657460451126099, "learning_rate": 1.464968690363564e-06, "loss": 0.4478, "num_input_tokens_seen": 150426496, "step": 123700 }, { "epoch": 15.499937351209121, "grad_norm": 2.5803465843200684, "learning_rate": 1.4645820714877751e-06, "loss": 0.5227, "num_input_tokens_seen": 150432576, "step": 123705 }, { "epoch": 15.500563839117905, "grad_norm": 2.3293747901916504, "learning_rate": 1.46419549488088e-06, "loss": 0.4511, "num_input_tokens_seen": 150438784, "step": 123710 }, { "epoch": 15.501190327026688, "grad_norm": 2.708146572113037, "learning_rate": 1.463808960547502e-06, "loss": 0.447, "num_input_tokens_seen": 150444736, "step": 123715 }, { "epoch": 15.501816814935472, "grad_norm": 2.234781503677368, "learning_rate": 1.4634224684922594e-06, "loss": 0.4489, "num_input_tokens_seen": 150450496, "step": 123720 }, { "epoch": 15.502443302844256, "grad_norm": 2.5669896602630615, "learning_rate": 1.4630360187197762e-06, "loss": 0.4488, "num_input_tokens_seen": 150456640, "step": 123725 }, { "epoch": 15.503069790753038, "grad_norm": 6.523495197296143, "learning_rate": 1.4626496112346695e-06, "loss": 0.454, "num_input_tokens_seen": 150462720, "step": 123730 }, { "epoch": 15.503696278661822, "grad_norm": 3.739271402359009, "learning_rate": 1.4622632460415625e-06, "loss": 0.5208, "num_input_tokens_seen": 150468640, "step": 123735 }, { "epoch": 15.504322766570604, "grad_norm": 4.201099872589111, "learning_rate": 1.4618769231450709e-06, "loss": 0.487, "num_input_tokens_seen": 150475072, "step": 123740 }, { "epoch": 15.504949254479389, "grad_norm": 10.138575553894043, "learning_rate": 1.4614906425498172e-06, "loss": 0.4703, "num_input_tokens_seen": 150481312, "step": 123745 }, { "epoch": 15.505575742388173, "grad_norm": 9.38625717163086, "learning_rate": 1.461104404260416e-06, "loss": 0.4514, "num_input_tokens_seen": 150487392, "step": 123750 }, { "epoch": 15.506202230296955, "grad_norm": 2.2250723838806152, "learning_rate": 1.4607182082814886e-06, "loss": 0.4403, "num_input_tokens_seen": 150493664, "step": 123755 }, { "epoch": 15.506828718205739, "grad_norm": 1.973511815071106, "learning_rate": 1.4603320546176497e-06, "loss": 0.4351, "num_input_tokens_seen": 150499616, "step": 123760 }, { "epoch": 15.507455206114521, "grad_norm": 3.1569571495056152, "learning_rate": 1.4599459432735164e-06, "loss": 0.4338, "num_input_tokens_seen": 150505376, "step": 123765 }, { "epoch": 15.508081694023305, "grad_norm": 2.4968786239624023, "learning_rate": 1.459559874253706e-06, "loss": 0.4281, "num_input_tokens_seen": 150511360, "step": 123770 }, { "epoch": 15.50870818193209, "grad_norm": 6.869561195373535, "learning_rate": 1.4591738475628352e-06, "loss": 0.4372, "num_input_tokens_seen": 150517248, "step": 123775 }, { "epoch": 15.509334669840872, "grad_norm": 4.453369140625, "learning_rate": 1.458787863205517e-06, "loss": 0.4681, "num_input_tokens_seen": 150523488, "step": 123780 }, { "epoch": 15.509961157749656, "grad_norm": 2.929145336151123, "learning_rate": 1.4584019211863665e-06, "loss": 0.4966, "num_input_tokens_seen": 150529696, "step": 123785 }, { "epoch": 15.51058764565844, "grad_norm": 15.852619171142578, "learning_rate": 1.4580160215100003e-06, "loss": 0.5008, "num_input_tokens_seen": 150535904, "step": 123790 }, { "epoch": 15.511214133567222, "grad_norm": 9.64048957824707, "learning_rate": 1.4576301641810291e-06, "loss": 0.4563, "num_input_tokens_seen": 150542016, "step": 123795 }, { "epoch": 15.511840621476006, "grad_norm": 11.269380569458008, "learning_rate": 1.4572443492040693e-06, "loss": 0.5084, "num_input_tokens_seen": 150547840, "step": 123800 }, { "epoch": 15.512467109384788, "grad_norm": 8.09291934967041, "learning_rate": 1.45685857658373e-06, "loss": 0.4448, "num_input_tokens_seen": 150554240, "step": 123805 }, { "epoch": 15.513093597293572, "grad_norm": 3.069636583328247, "learning_rate": 1.4564728463246274e-06, "loss": 0.4403, "num_input_tokens_seen": 150560192, "step": 123810 }, { "epoch": 15.513720085202355, "grad_norm": 5.540674209594727, "learning_rate": 1.456087158431369e-06, "loss": 0.4429, "num_input_tokens_seen": 150566240, "step": 123815 }, { "epoch": 15.514346573111139, "grad_norm": 3.3808467388153076, "learning_rate": 1.4557015129085706e-06, "loss": 0.4583, "num_input_tokens_seen": 150571616, "step": 123820 }, { "epoch": 15.514973061019923, "grad_norm": 2.9167487621307373, "learning_rate": 1.4553159097608383e-06, "loss": 0.4111, "num_input_tokens_seen": 150577728, "step": 123825 }, { "epoch": 15.515599548928705, "grad_norm": 11.749171257019043, "learning_rate": 1.4549303489927847e-06, "loss": 0.4497, "num_input_tokens_seen": 150583936, "step": 123830 }, { "epoch": 15.51622603683749, "grad_norm": 2.3603713512420654, "learning_rate": 1.4545448306090205e-06, "loss": 0.4277, "num_input_tokens_seen": 150590208, "step": 123835 }, { "epoch": 15.516852524746273, "grad_norm": 2.914501428604126, "learning_rate": 1.4541593546141524e-06, "loss": 0.4384, "num_input_tokens_seen": 150596480, "step": 123840 }, { "epoch": 15.517479012655055, "grad_norm": 2.2002289295196533, "learning_rate": 1.4537739210127916e-06, "loss": 0.5535, "num_input_tokens_seen": 150602688, "step": 123845 }, { "epoch": 15.51810550056384, "grad_norm": 2.839156150817871, "learning_rate": 1.453388529809544e-06, "loss": 0.4667, "num_input_tokens_seen": 150608896, "step": 123850 }, { "epoch": 15.518731988472622, "grad_norm": 2.3986167907714844, "learning_rate": 1.45300318100902e-06, "loss": 0.4207, "num_input_tokens_seen": 150615168, "step": 123855 }, { "epoch": 15.519358476381406, "grad_norm": 3.2547566890716553, "learning_rate": 1.4526178746158231e-06, "loss": 0.4483, "num_input_tokens_seen": 150621056, "step": 123860 }, { "epoch": 15.51998496429019, "grad_norm": 7.463996410369873, "learning_rate": 1.4522326106345625e-06, "loss": 0.4959, "num_input_tokens_seen": 150627232, "step": 123865 }, { "epoch": 15.520611452198972, "grad_norm": 2.372903347015381, "learning_rate": 1.4518473890698447e-06, "loss": 0.4049, "num_input_tokens_seen": 150632768, "step": 123870 }, { "epoch": 15.521237940107756, "grad_norm": 15.40166187286377, "learning_rate": 1.4514622099262737e-06, "loss": 0.5507, "num_input_tokens_seen": 150638400, "step": 123875 }, { "epoch": 15.521864428016539, "grad_norm": 2.7379255294799805, "learning_rate": 1.451077073208455e-06, "loss": 0.4286, "num_input_tokens_seen": 150644576, "step": 123880 }, { "epoch": 15.522490915925323, "grad_norm": 3.1995673179626465, "learning_rate": 1.450691978920995e-06, "loss": 0.4752, "num_input_tokens_seen": 150650688, "step": 123885 }, { "epoch": 15.523117403834107, "grad_norm": 2.641462802886963, "learning_rate": 1.4503069270684955e-06, "loss": 0.4647, "num_input_tokens_seen": 150656768, "step": 123890 }, { "epoch": 15.523743891742889, "grad_norm": 1.6881941556930542, "learning_rate": 1.4499219176555606e-06, "loss": 0.4625, "num_input_tokens_seen": 150663104, "step": 123895 }, { "epoch": 15.524370379651673, "grad_norm": 5.857716083526611, "learning_rate": 1.4495369506867957e-06, "loss": 0.4765, "num_input_tokens_seen": 150668992, "step": 123900 }, { "epoch": 15.524996867560455, "grad_norm": 10.365821838378906, "learning_rate": 1.4491520261668001e-06, "loss": 0.4836, "num_input_tokens_seen": 150674240, "step": 123905 }, { "epoch": 15.52562335546924, "grad_norm": 2.5699453353881836, "learning_rate": 1.4487671441001788e-06, "loss": 0.4814, "num_input_tokens_seen": 150680384, "step": 123910 }, { "epoch": 15.526249843378023, "grad_norm": 8.104719161987305, "learning_rate": 1.4483823044915306e-06, "loss": 0.4717, "num_input_tokens_seen": 150686560, "step": 123915 }, { "epoch": 15.526876331286806, "grad_norm": 2.1260533332824707, "learning_rate": 1.4479975073454593e-06, "loss": 0.426, "num_input_tokens_seen": 150692576, "step": 123920 }, { "epoch": 15.52750281919559, "grad_norm": 1.3383584022521973, "learning_rate": 1.4476127526665628e-06, "loss": 0.4142, "num_input_tokens_seen": 150698624, "step": 123925 }, { "epoch": 15.528129307104372, "grad_norm": 4.353028774261475, "learning_rate": 1.4472280404594442e-06, "loss": 0.4455, "num_input_tokens_seen": 150704992, "step": 123930 }, { "epoch": 15.528755795013156, "grad_norm": 18.3089599609375, "learning_rate": 1.4468433707287e-06, "loss": 0.5257, "num_input_tokens_seen": 150711456, "step": 123935 }, { "epoch": 15.52938228292194, "grad_norm": 15.633345603942871, "learning_rate": 1.4464587434789317e-06, "loss": 0.4852, "num_input_tokens_seen": 150717664, "step": 123940 }, { "epoch": 15.530008770830722, "grad_norm": 2.4107367992401123, "learning_rate": 1.4460741587147354e-06, "loss": 0.4714, "num_input_tokens_seen": 150723744, "step": 123945 }, { "epoch": 15.530635258739506, "grad_norm": 8.46686840057373, "learning_rate": 1.4456896164407108e-06, "loss": 0.4632, "num_input_tokens_seen": 150729728, "step": 123950 }, { "epoch": 15.53126174664829, "grad_norm": 3.0733489990234375, "learning_rate": 1.4453051166614562e-06, "loss": 0.4673, "num_input_tokens_seen": 150736064, "step": 123955 }, { "epoch": 15.531888234557073, "grad_norm": 2.839871644973755, "learning_rate": 1.4449206593815663e-06, "loss": 0.5563, "num_input_tokens_seen": 150741888, "step": 123960 }, { "epoch": 15.532514722465857, "grad_norm": 10.927399635314941, "learning_rate": 1.4445362446056405e-06, "loss": 0.4829, "num_input_tokens_seen": 150747968, "step": 123965 }, { "epoch": 15.53314121037464, "grad_norm": 5.393661975860596, "learning_rate": 1.4441518723382713e-06, "loss": 0.4963, "num_input_tokens_seen": 150754432, "step": 123970 }, { "epoch": 15.533767698283423, "grad_norm": 6.898400783538818, "learning_rate": 1.4437675425840564e-06, "loss": 0.5322, "num_input_tokens_seen": 150759904, "step": 123975 }, { "epoch": 15.534394186192207, "grad_norm": 2.5025475025177, "learning_rate": 1.4433832553475918e-06, "loss": 0.3994, "num_input_tokens_seen": 150766240, "step": 123980 }, { "epoch": 15.53502067410099, "grad_norm": 14.306586265563965, "learning_rate": 1.442999010633469e-06, "loss": 0.5473, "num_input_tokens_seen": 150772320, "step": 123985 }, { "epoch": 15.535647162009774, "grad_norm": 2.1740305423736572, "learning_rate": 1.4426148084462838e-06, "loss": 0.4763, "num_input_tokens_seen": 150778368, "step": 123990 }, { "epoch": 15.536273649918556, "grad_norm": 5.432601451873779, "learning_rate": 1.4422306487906307e-06, "loss": 0.4455, "num_input_tokens_seen": 150784800, "step": 123995 }, { "epoch": 15.53690013782734, "grad_norm": 4.197641372680664, "learning_rate": 1.4418465316711e-06, "loss": 0.5207, "num_input_tokens_seen": 150790336, "step": 124000 }, { "epoch": 15.537526625736124, "grad_norm": 2.388758897781372, "learning_rate": 1.441462457092287e-06, "loss": 0.4522, "num_input_tokens_seen": 150796288, "step": 124005 }, { "epoch": 15.538153113644906, "grad_norm": 3.5791399478912354, "learning_rate": 1.4410784250587805e-06, "loss": 0.433, "num_input_tokens_seen": 150802560, "step": 124010 }, { "epoch": 15.53877960155369, "grad_norm": 5.820555686950684, "learning_rate": 1.440694435575174e-06, "loss": 0.4581, "num_input_tokens_seen": 150808704, "step": 124015 }, { "epoch": 15.539406089462473, "grad_norm": 4.704153060913086, "learning_rate": 1.4403104886460595e-06, "loss": 0.4463, "num_input_tokens_seen": 150814368, "step": 124020 }, { "epoch": 15.540032577371257, "grad_norm": 3.1788220405578613, "learning_rate": 1.439926584276024e-06, "loss": 0.5278, "num_input_tokens_seen": 150820544, "step": 124025 }, { "epoch": 15.54065906528004, "grad_norm": 4.3843255043029785, "learning_rate": 1.4395427224696617e-06, "loss": 0.4338, "num_input_tokens_seen": 150826368, "step": 124030 }, { "epoch": 15.541285553188823, "grad_norm": 2.502331018447876, "learning_rate": 1.4391589032315572e-06, "loss": 0.4639, "num_input_tokens_seen": 150832608, "step": 124035 }, { "epoch": 15.541912041097607, "grad_norm": 2.832057476043701, "learning_rate": 1.4387751265663036e-06, "loss": 0.4299, "num_input_tokens_seen": 150839040, "step": 124040 }, { "epoch": 15.54253852900639, "grad_norm": 2.5198075771331787, "learning_rate": 1.4383913924784866e-06, "loss": 0.5503, "num_input_tokens_seen": 150845440, "step": 124045 }, { "epoch": 15.543165016915173, "grad_norm": 7.047368049621582, "learning_rate": 1.4380077009726956e-06, "loss": 0.4202, "num_input_tokens_seen": 150851648, "step": 124050 }, { "epoch": 15.543791504823957, "grad_norm": 2.5489397048950195, "learning_rate": 1.4376240520535162e-06, "loss": 0.4719, "num_input_tokens_seen": 150857696, "step": 124055 }, { "epoch": 15.54441799273274, "grad_norm": 2.757016658782959, "learning_rate": 1.4372404457255373e-06, "loss": 0.4834, "num_input_tokens_seen": 150863744, "step": 124060 }, { "epoch": 15.545044480641524, "grad_norm": 11.034491539001465, "learning_rate": 1.4368568819933436e-06, "loss": 0.4822, "num_input_tokens_seen": 150869760, "step": 124065 }, { "epoch": 15.545670968550308, "grad_norm": 6.8584699630737305, "learning_rate": 1.436473360861521e-06, "loss": 0.4646, "num_input_tokens_seen": 150875232, "step": 124070 }, { "epoch": 15.54629745645909, "grad_norm": 7.53872013092041, "learning_rate": 1.436089882334657e-06, "loss": 0.4111, "num_input_tokens_seen": 150881312, "step": 124075 }, { "epoch": 15.546923944367874, "grad_norm": 3.3430938720703125, "learning_rate": 1.4357064464173337e-06, "loss": 0.416, "num_input_tokens_seen": 150887680, "step": 124080 }, { "epoch": 15.547550432276656, "grad_norm": 18.935428619384766, "learning_rate": 1.435323053114136e-06, "loss": 0.593, "num_input_tokens_seen": 150893856, "step": 124085 }, { "epoch": 15.54817692018544, "grad_norm": 8.47463321685791, "learning_rate": 1.4349397024296486e-06, "loss": 0.5956, "num_input_tokens_seen": 150900096, "step": 124090 }, { "epoch": 15.548803408094225, "grad_norm": 3.0476014614105225, "learning_rate": 1.434556394368456e-06, "loss": 0.4888, "num_input_tokens_seen": 150906656, "step": 124095 }, { "epoch": 15.549429896003007, "grad_norm": 3.0116751194000244, "learning_rate": 1.4341731289351374e-06, "loss": 0.4661, "num_input_tokens_seen": 150912736, "step": 124100 }, { "epoch": 15.550056383911791, "grad_norm": 4.2784881591796875, "learning_rate": 1.4337899061342786e-06, "loss": 0.4879, "num_input_tokens_seen": 150918784, "step": 124105 }, { "epoch": 15.550682871820573, "grad_norm": 12.753981590270996, "learning_rate": 1.4334067259704586e-06, "loss": 0.4317, "num_input_tokens_seen": 150925088, "step": 124110 }, { "epoch": 15.551309359729357, "grad_norm": 7.521724224090576, "learning_rate": 1.4330235884482614e-06, "loss": 0.4527, "num_input_tokens_seen": 150931296, "step": 124115 }, { "epoch": 15.551935847638141, "grad_norm": 5.198881149291992, "learning_rate": 1.4326404935722642e-06, "loss": 0.4853, "num_input_tokens_seen": 150936448, "step": 124120 }, { "epoch": 15.552562335546924, "grad_norm": 2.4040441513061523, "learning_rate": 1.4322574413470507e-06, "loss": 0.4912, "num_input_tokens_seen": 150941984, "step": 124125 }, { "epoch": 15.553188823455708, "grad_norm": 16.181764602661133, "learning_rate": 1.4318744317771982e-06, "loss": 0.524, "num_input_tokens_seen": 150948064, "step": 124130 }, { "epoch": 15.55381531136449, "grad_norm": 9.192259788513184, "learning_rate": 1.4314914648672863e-06, "loss": 0.5087, "num_input_tokens_seen": 150954592, "step": 124135 }, { "epoch": 15.554441799273274, "grad_norm": 11.502493858337402, "learning_rate": 1.4311085406218956e-06, "loss": 0.4786, "num_input_tokens_seen": 150960896, "step": 124140 }, { "epoch": 15.555068287182058, "grad_norm": 2.536461353302002, "learning_rate": 1.4307256590456014e-06, "loss": 0.4888, "num_input_tokens_seen": 150967104, "step": 124145 }, { "epoch": 15.55569477509084, "grad_norm": 2.0088629722595215, "learning_rate": 1.4303428201429842e-06, "loss": 0.4326, "num_input_tokens_seen": 150973312, "step": 124150 }, { "epoch": 15.556321262999624, "grad_norm": 4.595499038696289, "learning_rate": 1.429960023918619e-06, "loss": 0.4393, "num_input_tokens_seen": 150979712, "step": 124155 }, { "epoch": 15.556947750908407, "grad_norm": 7.264260292053223, "learning_rate": 1.4295772703770838e-06, "loss": 0.4223, "num_input_tokens_seen": 150985888, "step": 124160 }, { "epoch": 15.55757423881719, "grad_norm": 10.92698860168457, "learning_rate": 1.4291945595229533e-06, "loss": 0.5187, "num_input_tokens_seen": 150992128, "step": 124165 }, { "epoch": 15.558200726725975, "grad_norm": 6.93610143661499, "learning_rate": 1.4288118913608051e-06, "loss": 0.5107, "num_input_tokens_seen": 150998144, "step": 124170 }, { "epoch": 15.558827214634757, "grad_norm": 3.9957025051116943, "learning_rate": 1.428429265895212e-06, "loss": 0.4528, "num_input_tokens_seen": 151004224, "step": 124175 }, { "epoch": 15.559453702543541, "grad_norm": 2.701192855834961, "learning_rate": 1.4280466831307499e-06, "loss": 0.4904, "num_input_tokens_seen": 151010400, "step": 124180 }, { "epoch": 15.560080190452325, "grad_norm": 3.8496339321136475, "learning_rate": 1.4276641430719929e-06, "loss": 0.4407, "num_input_tokens_seen": 151016576, "step": 124185 }, { "epoch": 15.560706678361107, "grad_norm": 2.4838333129882812, "learning_rate": 1.4272816457235155e-06, "loss": 0.4402, "num_input_tokens_seen": 151022784, "step": 124190 }, { "epoch": 15.561333166269891, "grad_norm": 3.293850898742676, "learning_rate": 1.4268991910898888e-06, "loss": 0.4562, "num_input_tokens_seen": 151029056, "step": 124195 }, { "epoch": 15.561959654178674, "grad_norm": 4.527597904205322, "learning_rate": 1.4265167791756863e-06, "loss": 0.4336, "num_input_tokens_seen": 151034944, "step": 124200 }, { "epoch": 15.562586142087458, "grad_norm": 2.038806915283203, "learning_rate": 1.4261344099854813e-06, "loss": 0.448, "num_input_tokens_seen": 151041344, "step": 124205 }, { "epoch": 15.563212629996242, "grad_norm": 2.7403151988983154, "learning_rate": 1.4257520835238432e-06, "loss": 0.4564, "num_input_tokens_seen": 151047296, "step": 124210 }, { "epoch": 15.563839117905024, "grad_norm": 13.391193389892578, "learning_rate": 1.4253697997953453e-06, "loss": 0.4688, "num_input_tokens_seen": 151053024, "step": 124215 }, { "epoch": 15.564465605813808, "grad_norm": 15.95328426361084, "learning_rate": 1.4249875588045554e-06, "loss": 0.4823, "num_input_tokens_seen": 151059296, "step": 124220 }, { "epoch": 15.56509209372259, "grad_norm": 2.0244436264038086, "learning_rate": 1.424605360556046e-06, "loss": 0.482, "num_input_tokens_seen": 151065376, "step": 124225 }, { "epoch": 15.565718581631375, "grad_norm": 2.8395416736602783, "learning_rate": 1.4242232050543842e-06, "loss": 0.4254, "num_input_tokens_seen": 151071616, "step": 124230 }, { "epoch": 15.566345069540159, "grad_norm": 6.23350191116333, "learning_rate": 1.4238410923041418e-06, "loss": 0.4457, "num_input_tokens_seen": 151077792, "step": 124235 }, { "epoch": 15.566971557448941, "grad_norm": 12.959423065185547, "learning_rate": 1.423459022309885e-06, "loss": 0.4546, "num_input_tokens_seen": 151084128, "step": 124240 }, { "epoch": 15.567598045357725, "grad_norm": 6.659416198730469, "learning_rate": 1.423076995076183e-06, "loss": 0.4287, "num_input_tokens_seen": 151090144, "step": 124245 }, { "epoch": 15.568224533266507, "grad_norm": 2.5116376876831055, "learning_rate": 1.422695010607602e-06, "loss": 0.4382, "num_input_tokens_seen": 151096320, "step": 124250 }, { "epoch": 15.568851021175291, "grad_norm": 13.99889850616455, "learning_rate": 1.422313068908711e-06, "loss": 0.4564, "num_input_tokens_seen": 151102368, "step": 124255 }, { "epoch": 15.569477509084075, "grad_norm": 1.8565455675125122, "learning_rate": 1.4219311699840744e-06, "loss": 0.4649, "num_input_tokens_seen": 151108896, "step": 124260 }, { "epoch": 15.570103996992858, "grad_norm": 2.1936886310577393, "learning_rate": 1.421549313838258e-06, "loss": 0.4778, "num_input_tokens_seen": 151115200, "step": 124265 }, { "epoch": 15.570730484901642, "grad_norm": 5.421035289764404, "learning_rate": 1.4211675004758302e-06, "loss": 0.4939, "num_input_tokens_seen": 151121088, "step": 124270 }, { "epoch": 15.571356972810424, "grad_norm": 1.9752846956253052, "learning_rate": 1.4207857299013521e-06, "loss": 0.4324, "num_input_tokens_seen": 151126880, "step": 124275 }, { "epoch": 15.571983460719208, "grad_norm": 6.9997148513793945, "learning_rate": 1.4204040021193915e-06, "loss": 0.4513, "num_input_tokens_seen": 151132640, "step": 124280 }, { "epoch": 15.572609948627992, "grad_norm": 2.179114818572998, "learning_rate": 1.420022317134509e-06, "loss": 0.4395, "num_input_tokens_seen": 151138816, "step": 124285 }, { "epoch": 15.573236436536774, "grad_norm": 17.59048843383789, "learning_rate": 1.4196406749512698e-06, "loss": 0.4678, "num_input_tokens_seen": 151144704, "step": 124290 }, { "epoch": 15.573862924445558, "grad_norm": 9.409512519836426, "learning_rate": 1.4192590755742365e-06, "loss": 0.5083, "num_input_tokens_seen": 151150624, "step": 124295 }, { "epoch": 15.574489412354342, "grad_norm": 2.6549360752105713, "learning_rate": 1.4188775190079729e-06, "loss": 0.5211, "num_input_tokens_seen": 151156736, "step": 124300 }, { "epoch": 15.575115900263125, "grad_norm": 3.3356051445007324, "learning_rate": 1.4184960052570373e-06, "loss": 0.4434, "num_input_tokens_seen": 151162752, "step": 124305 }, { "epoch": 15.575742388171909, "grad_norm": 12.046141624450684, "learning_rate": 1.4181145343259955e-06, "loss": 0.4732, "num_input_tokens_seen": 151168512, "step": 124310 }, { "epoch": 15.576368876080691, "grad_norm": 4.44263219833374, "learning_rate": 1.4177331062194038e-06, "loss": 0.4196, "num_input_tokens_seen": 151174912, "step": 124315 }, { "epoch": 15.576995363989475, "grad_norm": 2.869009017944336, "learning_rate": 1.4173517209418252e-06, "loss": 0.4928, "num_input_tokens_seen": 151181088, "step": 124320 }, { "epoch": 15.577621851898257, "grad_norm": 5.437363147735596, "learning_rate": 1.41697037849782e-06, "loss": 0.4656, "num_input_tokens_seen": 151187360, "step": 124325 }, { "epoch": 15.578248339807041, "grad_norm": 2.18376088142395, "learning_rate": 1.416589078891945e-06, "loss": 0.4509, "num_input_tokens_seen": 151193344, "step": 124330 }, { "epoch": 15.578874827715826, "grad_norm": 3.446553945541382, "learning_rate": 1.4162078221287618e-06, "loss": 0.511, "num_input_tokens_seen": 151199712, "step": 124335 }, { "epoch": 15.579501315624608, "grad_norm": 10.240102767944336, "learning_rate": 1.4158266082128257e-06, "loss": 0.4591, "num_input_tokens_seen": 151205760, "step": 124340 }, { "epoch": 15.580127803533392, "grad_norm": 4.477545738220215, "learning_rate": 1.4154454371486975e-06, "loss": 0.4706, "num_input_tokens_seen": 151212096, "step": 124345 }, { "epoch": 15.580754291442176, "grad_norm": 3.4585158824920654, "learning_rate": 1.4150643089409311e-06, "loss": 0.4598, "num_input_tokens_seen": 151218464, "step": 124350 }, { "epoch": 15.581380779350958, "grad_norm": 3.5618174076080322, "learning_rate": 1.4146832235940867e-06, "loss": 0.4014, "num_input_tokens_seen": 151224640, "step": 124355 }, { "epoch": 15.582007267259742, "grad_norm": 2.4656972885131836, "learning_rate": 1.4143021811127167e-06, "loss": 0.5005, "num_input_tokens_seen": 151230944, "step": 124360 }, { "epoch": 15.582633755168525, "grad_norm": 5.574743747711182, "learning_rate": 1.4139211815013808e-06, "loss": 0.4699, "num_input_tokens_seen": 151237472, "step": 124365 }, { "epoch": 15.583260243077309, "grad_norm": 16.63436508178711, "learning_rate": 1.4135402247646307e-06, "loss": 0.5595, "num_input_tokens_seen": 151243712, "step": 124370 }, { "epoch": 15.583886730986093, "grad_norm": 11.505227088928223, "learning_rate": 1.4131593109070241e-06, "loss": 0.4547, "num_input_tokens_seen": 151250208, "step": 124375 }, { "epoch": 15.584513218894875, "grad_norm": 2.322444438934326, "learning_rate": 1.412778439933112e-06, "loss": 0.4458, "num_input_tokens_seen": 151256288, "step": 124380 }, { "epoch": 15.585139706803659, "grad_norm": 2.6534337997436523, "learning_rate": 1.4123976118474492e-06, "loss": 0.3927, "num_input_tokens_seen": 151262464, "step": 124385 }, { "epoch": 15.585766194712441, "grad_norm": 3.253164768218994, "learning_rate": 1.4120168266545896e-06, "loss": 0.4534, "num_input_tokens_seen": 151268512, "step": 124390 }, { "epoch": 15.586392682621225, "grad_norm": 5.431300163269043, "learning_rate": 1.4116360843590871e-06, "loss": 0.436, "num_input_tokens_seen": 151274208, "step": 124395 }, { "epoch": 15.58701917053001, "grad_norm": 18.73078155517578, "learning_rate": 1.4112553849654904e-06, "loss": 0.5163, "num_input_tokens_seen": 151280224, "step": 124400 }, { "epoch": 15.587645658438792, "grad_norm": 9.692614555358887, "learning_rate": 1.4108747284783525e-06, "loss": 0.4952, "num_input_tokens_seen": 151286400, "step": 124405 }, { "epoch": 15.588272146347576, "grad_norm": 3.618788242340088, "learning_rate": 1.4104941149022266e-06, "loss": 0.5433, "num_input_tokens_seen": 151292480, "step": 124410 }, { "epoch": 15.588898634256358, "grad_norm": 6.210723876953125, "learning_rate": 1.4101135442416603e-06, "loss": 0.4587, "num_input_tokens_seen": 151298752, "step": 124415 }, { "epoch": 15.589525122165142, "grad_norm": 1.7862681150436401, "learning_rate": 1.4097330165012059e-06, "loss": 0.4101, "num_input_tokens_seen": 151304800, "step": 124420 }, { "epoch": 15.590151610073926, "grad_norm": 12.192861557006836, "learning_rate": 1.4093525316854107e-06, "loss": 0.5726, "num_input_tokens_seen": 151311136, "step": 124425 }, { "epoch": 15.590778097982708, "grad_norm": 10.374302864074707, "learning_rate": 1.4089720897988258e-06, "loss": 0.4406, "num_input_tokens_seen": 151317760, "step": 124430 }, { "epoch": 15.591404585891492, "grad_norm": 5.849028587341309, "learning_rate": 1.4085916908459973e-06, "loss": 0.4389, "num_input_tokens_seen": 151323936, "step": 124435 }, { "epoch": 15.592031073800275, "grad_norm": 12.047751426696777, "learning_rate": 1.408211334831477e-06, "loss": 0.4508, "num_input_tokens_seen": 151330048, "step": 124440 }, { "epoch": 15.592657561709059, "grad_norm": 3.3678174018859863, "learning_rate": 1.4078310217598073e-06, "loss": 0.4918, "num_input_tokens_seen": 151336288, "step": 124445 }, { "epoch": 15.593284049617843, "grad_norm": 2.9331016540527344, "learning_rate": 1.4074507516355384e-06, "loss": 0.4541, "num_input_tokens_seen": 151342304, "step": 124450 }, { "epoch": 15.593910537526625, "grad_norm": 8.106242179870605, "learning_rate": 1.4070705244632176e-06, "loss": 0.456, "num_input_tokens_seen": 151348416, "step": 124455 }, { "epoch": 15.59453702543541, "grad_norm": 6.55149507522583, "learning_rate": 1.406690340247388e-06, "loss": 0.4423, "num_input_tokens_seen": 151354432, "step": 124460 }, { "epoch": 15.595163513344193, "grad_norm": 2.5625317096710205, "learning_rate": 1.4063101989925982e-06, "loss": 0.4472, "num_input_tokens_seen": 151360576, "step": 124465 }, { "epoch": 15.595790001252976, "grad_norm": 8.576868057250977, "learning_rate": 1.4059301007033898e-06, "loss": 0.424, "num_input_tokens_seen": 151366656, "step": 124470 }, { "epoch": 15.59641648916176, "grad_norm": 6.165541172027588, "learning_rate": 1.4055500453843102e-06, "loss": 0.4744, "num_input_tokens_seen": 151372576, "step": 124475 }, { "epoch": 15.597042977070542, "grad_norm": 3.009506940841675, "learning_rate": 1.4051700330399005e-06, "loss": 0.5029, "num_input_tokens_seen": 151378752, "step": 124480 }, { "epoch": 15.597669464979326, "grad_norm": 2.1659743785858154, "learning_rate": 1.4047900636747047e-06, "loss": 0.5069, "num_input_tokens_seen": 151384960, "step": 124485 }, { "epoch": 15.59829595288811, "grad_norm": 1.7452447414398193, "learning_rate": 1.404410137293269e-06, "loss": 0.4141, "num_input_tokens_seen": 151391040, "step": 124490 }, { "epoch": 15.598922440796892, "grad_norm": 2.534424304962158, "learning_rate": 1.4040302539001304e-06, "loss": 0.4121, "num_input_tokens_seen": 151397184, "step": 124495 }, { "epoch": 15.599548928705676, "grad_norm": 3.2345337867736816, "learning_rate": 1.4036504134998346e-06, "loss": 0.4954, "num_input_tokens_seen": 151403552, "step": 124500 }, { "epoch": 15.600175416614459, "grad_norm": 1.8797142505645752, "learning_rate": 1.4032706160969212e-06, "loss": 0.4763, "num_input_tokens_seen": 151409600, "step": 124505 }, { "epoch": 15.600801904523243, "grad_norm": 1.6183445453643799, "learning_rate": 1.402890861695933e-06, "loss": 0.4898, "num_input_tokens_seen": 151415648, "step": 124510 }, { "epoch": 15.601428392432027, "grad_norm": 10.89472770690918, "learning_rate": 1.4025111503014073e-06, "loss": 0.5101, "num_input_tokens_seen": 151421856, "step": 124515 }, { "epoch": 15.602054880340809, "grad_norm": 2.4392008781433105, "learning_rate": 1.402131481917887e-06, "loss": 0.4717, "num_input_tokens_seen": 151428160, "step": 124520 }, { "epoch": 15.602681368249593, "grad_norm": 16.15201759338379, "learning_rate": 1.4017518565499083e-06, "loss": 0.4905, "num_input_tokens_seen": 151434400, "step": 124525 }, { "epoch": 15.603307856158375, "grad_norm": 13.623078346252441, "learning_rate": 1.401372274202013e-06, "loss": 0.4804, "num_input_tokens_seen": 151439936, "step": 124530 }, { "epoch": 15.60393434406716, "grad_norm": 3.334543466567993, "learning_rate": 1.400992734878736e-06, "loss": 0.459, "num_input_tokens_seen": 151445952, "step": 124535 }, { "epoch": 15.604560831975943, "grad_norm": 2.384514331817627, "learning_rate": 1.4006132385846183e-06, "loss": 0.439, "num_input_tokens_seen": 151452160, "step": 124540 }, { "epoch": 15.605187319884726, "grad_norm": 2.2075345516204834, "learning_rate": 1.4002337853241938e-06, "loss": 0.4229, "num_input_tokens_seen": 151458432, "step": 124545 }, { "epoch": 15.60581380779351, "grad_norm": 2.463205099105835, "learning_rate": 1.3998543751020028e-06, "loss": 0.4016, "num_input_tokens_seen": 151464448, "step": 124550 }, { "epoch": 15.606440295702292, "grad_norm": 17.28410530090332, "learning_rate": 1.3994750079225777e-06, "loss": 0.4799, "num_input_tokens_seen": 151470400, "step": 124555 }, { "epoch": 15.607066783611076, "grad_norm": 2.701225757598877, "learning_rate": 1.399095683790458e-06, "loss": 0.472, "num_input_tokens_seen": 151476384, "step": 124560 }, { "epoch": 15.60769327151986, "grad_norm": 5.368597984313965, "learning_rate": 1.3987164027101751e-06, "loss": 0.5479, "num_input_tokens_seen": 151482272, "step": 124565 }, { "epoch": 15.608319759428642, "grad_norm": 4.210590362548828, "learning_rate": 1.3983371646862654e-06, "loss": 0.4651, "num_input_tokens_seen": 151488384, "step": 124570 }, { "epoch": 15.608946247337427, "grad_norm": 12.240254402160645, "learning_rate": 1.3979579697232648e-06, "loss": 0.4486, "num_input_tokens_seen": 151494336, "step": 124575 }, { "epoch": 15.60957273524621, "grad_norm": 5.795899868011475, "learning_rate": 1.3975788178257033e-06, "loss": 0.5045, "num_input_tokens_seen": 151500288, "step": 124580 }, { "epoch": 15.610199223154993, "grad_norm": 15.475347518920898, "learning_rate": 1.397199708998117e-06, "loss": 0.5118, "num_input_tokens_seen": 151506528, "step": 124585 }, { "epoch": 15.610825711063777, "grad_norm": 19.626937866210938, "learning_rate": 1.3968206432450366e-06, "loss": 0.5116, "num_input_tokens_seen": 151512704, "step": 124590 }, { "epoch": 15.61145219897256, "grad_norm": 10.51766300201416, "learning_rate": 1.396441620570994e-06, "loss": 0.4895, "num_input_tokens_seen": 151518400, "step": 124595 }, { "epoch": 15.612078686881343, "grad_norm": 6.935277462005615, "learning_rate": 1.3960626409805216e-06, "loss": 0.4547, "num_input_tokens_seen": 151524288, "step": 124600 }, { "epoch": 15.612705174790127, "grad_norm": 11.35374641418457, "learning_rate": 1.3956837044781524e-06, "loss": 0.4608, "num_input_tokens_seen": 151530624, "step": 124605 }, { "epoch": 15.61333166269891, "grad_norm": 7.412500858306885, "learning_rate": 1.3953048110684131e-06, "loss": 0.4881, "num_input_tokens_seen": 151536768, "step": 124610 }, { "epoch": 15.613958150607694, "grad_norm": 2.575662612915039, "learning_rate": 1.3949259607558368e-06, "loss": 0.4055, "num_input_tokens_seen": 151542784, "step": 124615 }, { "epoch": 15.614584638516476, "grad_norm": 20.32662582397461, "learning_rate": 1.39454715354495e-06, "loss": 0.462, "num_input_tokens_seen": 151548896, "step": 124620 }, { "epoch": 15.61521112642526, "grad_norm": 10.572819709777832, "learning_rate": 1.3941683894402851e-06, "loss": 0.4566, "num_input_tokens_seen": 151554592, "step": 124625 }, { "epoch": 15.615837614334044, "grad_norm": 2.6889750957489014, "learning_rate": 1.393789668446367e-06, "loss": 0.433, "num_input_tokens_seen": 151560224, "step": 124630 }, { "epoch": 15.616464102242826, "grad_norm": 9.04505443572998, "learning_rate": 1.3934109905677257e-06, "loss": 0.4893, "num_input_tokens_seen": 151566208, "step": 124635 }, { "epoch": 15.61709059015161, "grad_norm": 15.92370891571045, "learning_rate": 1.3930323558088894e-06, "loss": 0.489, "num_input_tokens_seen": 151572544, "step": 124640 }, { "epoch": 15.617717078060393, "grad_norm": 4.162525653839111, "learning_rate": 1.392653764174382e-06, "loss": 0.4931, "num_input_tokens_seen": 151578720, "step": 124645 }, { "epoch": 15.618343565969177, "grad_norm": 3.032991409301758, "learning_rate": 1.3922752156687342e-06, "loss": 0.4327, "num_input_tokens_seen": 151585024, "step": 124650 }, { "epoch": 15.61897005387796, "grad_norm": 2.337576150894165, "learning_rate": 1.391896710296467e-06, "loss": 0.4637, "num_input_tokens_seen": 151591168, "step": 124655 }, { "epoch": 15.619596541786743, "grad_norm": 11.451857566833496, "learning_rate": 1.3915182480621102e-06, "loss": 0.5078, "num_input_tokens_seen": 151597312, "step": 124660 }, { "epoch": 15.620223029695527, "grad_norm": 16.920759201049805, "learning_rate": 1.391139828970185e-06, "loss": 0.5533, "num_input_tokens_seen": 151603296, "step": 124665 }, { "epoch": 15.62084951760431, "grad_norm": 2.177738904953003, "learning_rate": 1.3907614530252188e-06, "loss": 0.4738, "num_input_tokens_seen": 151609504, "step": 124670 }, { "epoch": 15.621476005513093, "grad_norm": 2.9565186500549316, "learning_rate": 1.3903831202317326e-06, "loss": 0.4664, "num_input_tokens_seen": 151615584, "step": 124675 }, { "epoch": 15.622102493421878, "grad_norm": 2.0706353187561035, "learning_rate": 1.390004830594252e-06, "loss": 0.429, "num_input_tokens_seen": 151621184, "step": 124680 }, { "epoch": 15.62272898133066, "grad_norm": 7.913815021514893, "learning_rate": 1.3896265841172974e-06, "loss": 0.4416, "num_input_tokens_seen": 151627648, "step": 124685 }, { "epoch": 15.623355469239444, "grad_norm": 2.0041654109954834, "learning_rate": 1.3892483808053925e-06, "loss": 0.461, "num_input_tokens_seen": 151633888, "step": 124690 }, { "epoch": 15.623981957148228, "grad_norm": 18.762556076049805, "learning_rate": 1.3888702206630605e-06, "loss": 0.5105, "num_input_tokens_seen": 151640000, "step": 124695 }, { "epoch": 15.62460844505701, "grad_norm": 2.2876195907592773, "learning_rate": 1.3884921036948196e-06, "loss": 0.4475, "num_input_tokens_seen": 151646208, "step": 124700 }, { "epoch": 15.625234932965794, "grad_norm": 5.8969597816467285, "learning_rate": 1.388114029905192e-06, "loss": 0.4601, "num_input_tokens_seen": 151652704, "step": 124705 }, { "epoch": 15.625861420874577, "grad_norm": 3.3310115337371826, "learning_rate": 1.3877359992986978e-06, "loss": 0.5048, "num_input_tokens_seen": 151658912, "step": 124710 }, { "epoch": 15.62648790878336, "grad_norm": 1.732380986213684, "learning_rate": 1.3873580118798585e-06, "loss": 0.4167, "num_input_tokens_seen": 151665184, "step": 124715 }, { "epoch": 15.627114396692145, "grad_norm": 2.487898588180542, "learning_rate": 1.3869800676531897e-06, "loss": 0.4458, "num_input_tokens_seen": 151671328, "step": 124720 }, { "epoch": 15.627740884600927, "grad_norm": 8.20079517364502, "learning_rate": 1.3866021666232132e-06, "loss": 0.499, "num_input_tokens_seen": 151677888, "step": 124725 }, { "epoch": 15.628367372509711, "grad_norm": 6.859373569488525, "learning_rate": 1.386224308794445e-06, "loss": 0.4951, "num_input_tokens_seen": 151683872, "step": 124730 }, { "epoch": 15.628993860418493, "grad_norm": 2.8076417446136475, "learning_rate": 1.3858464941714046e-06, "loss": 0.427, "num_input_tokens_seen": 151690176, "step": 124735 }, { "epoch": 15.629620348327277, "grad_norm": 7.807387351989746, "learning_rate": 1.385468722758606e-06, "loss": 0.4378, "num_input_tokens_seen": 151696448, "step": 124740 }, { "epoch": 15.630246836236061, "grad_norm": 17.72797393798828, "learning_rate": 1.3850909945605694e-06, "loss": 0.5194, "num_input_tokens_seen": 151702464, "step": 124745 }, { "epoch": 15.630873324144844, "grad_norm": 1.2886935472488403, "learning_rate": 1.3847133095818082e-06, "loss": 0.4437, "num_input_tokens_seen": 151708160, "step": 124750 }, { "epoch": 15.631499812053628, "grad_norm": 1.4973760843276978, "learning_rate": 1.3843356678268388e-06, "loss": 0.4569, "num_input_tokens_seen": 151714304, "step": 124755 }, { "epoch": 15.63212629996241, "grad_norm": 2.5422186851501465, "learning_rate": 1.3839580693001775e-06, "loss": 0.4512, "num_input_tokens_seen": 151720768, "step": 124760 }, { "epoch": 15.632752787871194, "grad_norm": 5.957478046417236, "learning_rate": 1.3835805140063369e-06, "loss": 0.4785, "num_input_tokens_seen": 151726496, "step": 124765 }, { "epoch": 15.633379275779978, "grad_norm": 2.660895347595215, "learning_rate": 1.3832030019498326e-06, "loss": 0.4815, "num_input_tokens_seen": 151732704, "step": 124770 }, { "epoch": 15.63400576368876, "grad_norm": 8.844599723815918, "learning_rate": 1.382825533135176e-06, "loss": 0.4822, "num_input_tokens_seen": 151738688, "step": 124775 }, { "epoch": 15.634632251597544, "grad_norm": 7.154055595397949, "learning_rate": 1.3824481075668827e-06, "loss": 0.4957, "num_input_tokens_seen": 151744800, "step": 124780 }, { "epoch": 15.635258739506327, "grad_norm": 3.16706919670105, "learning_rate": 1.3820707252494625e-06, "loss": 0.4493, "num_input_tokens_seen": 151751264, "step": 124785 }, { "epoch": 15.63588522741511, "grad_norm": 3.171379327774048, "learning_rate": 1.38169338618743e-06, "loss": 0.5011, "num_input_tokens_seen": 151757728, "step": 124790 }, { "epoch": 15.636511715323895, "grad_norm": 15.534825325012207, "learning_rate": 1.3813160903852934e-06, "loss": 0.4763, "num_input_tokens_seen": 151763936, "step": 124795 }, { "epoch": 15.637138203232677, "grad_norm": 3.5559422969818115, "learning_rate": 1.3809388378475657e-06, "loss": 0.4148, "num_input_tokens_seen": 151770112, "step": 124800 }, { "epoch": 15.637764691141461, "grad_norm": 4.945182800292969, "learning_rate": 1.3805616285787565e-06, "loss": 0.4638, "num_input_tokens_seen": 151776448, "step": 124805 }, { "epoch": 15.638391179050245, "grad_norm": 2.1521799564361572, "learning_rate": 1.3801844625833777e-06, "loss": 0.498, "num_input_tokens_seen": 151782496, "step": 124810 }, { "epoch": 15.639017666959028, "grad_norm": 23.76788902282715, "learning_rate": 1.3798073398659361e-06, "loss": 0.6131, "num_input_tokens_seen": 151788576, "step": 124815 }, { "epoch": 15.639644154867812, "grad_norm": 12.066506385803223, "learning_rate": 1.3794302604309407e-06, "loss": 0.5178, "num_input_tokens_seen": 151794688, "step": 124820 }, { "epoch": 15.640270642776594, "grad_norm": 7.606757640838623, "learning_rate": 1.3790532242829024e-06, "loss": 0.4838, "num_input_tokens_seen": 151800928, "step": 124825 }, { "epoch": 15.640897130685378, "grad_norm": 3.1482036113739014, "learning_rate": 1.3786762314263258e-06, "loss": 0.5115, "num_input_tokens_seen": 151806400, "step": 124830 }, { "epoch": 15.641523618594162, "grad_norm": 2.3635237216949463, "learning_rate": 1.3782992818657209e-06, "loss": 0.4395, "num_input_tokens_seen": 151812448, "step": 124835 }, { "epoch": 15.642150106502944, "grad_norm": 2.0786705017089844, "learning_rate": 1.377922375605591e-06, "loss": 0.4315, "num_input_tokens_seen": 151818656, "step": 124840 }, { "epoch": 15.642776594411728, "grad_norm": 3.1116418838500977, "learning_rate": 1.3775455126504466e-06, "loss": 0.4713, "num_input_tokens_seen": 151824896, "step": 124845 }, { "epoch": 15.64340308232051, "grad_norm": 2.1447339057922363, "learning_rate": 1.3771686930047895e-06, "loss": 0.4123, "num_input_tokens_seen": 151831392, "step": 124850 }, { "epoch": 15.644029570229295, "grad_norm": 13.250727653503418, "learning_rate": 1.3767919166731276e-06, "loss": 0.4629, "num_input_tokens_seen": 151837504, "step": 124855 }, { "epoch": 15.644656058138079, "grad_norm": 1.5828428268432617, "learning_rate": 1.3764151836599638e-06, "loss": 0.4483, "num_input_tokens_seen": 151843520, "step": 124860 }, { "epoch": 15.645282546046861, "grad_norm": 4.074512958526611, "learning_rate": 1.376038493969804e-06, "loss": 0.4491, "num_input_tokens_seen": 151849664, "step": 124865 }, { "epoch": 15.645909033955645, "grad_norm": 3.0828323364257812, "learning_rate": 1.3756618476071492e-06, "loss": 0.4711, "num_input_tokens_seen": 151855840, "step": 124870 }, { "epoch": 15.646535521864427, "grad_norm": 3.597114324569702, "learning_rate": 1.3752852445765063e-06, "loss": 0.5215, "num_input_tokens_seen": 151861760, "step": 124875 }, { "epoch": 15.647162009773211, "grad_norm": 1.824184775352478, "learning_rate": 1.374908684882374e-06, "loss": 0.4165, "num_input_tokens_seen": 151868128, "step": 124880 }, { "epoch": 15.647788497681995, "grad_norm": 3.1765835285186768, "learning_rate": 1.3745321685292562e-06, "loss": 0.4554, "num_input_tokens_seen": 151874240, "step": 124885 }, { "epoch": 15.648414985590778, "grad_norm": 6.636383533477783, "learning_rate": 1.3741556955216563e-06, "loss": 0.4215, "num_input_tokens_seen": 151880384, "step": 124890 }, { "epoch": 15.649041473499562, "grad_norm": 2.4932303428649902, "learning_rate": 1.3737792658640714e-06, "loss": 0.4259, "num_input_tokens_seen": 151886496, "step": 124895 }, { "epoch": 15.649667961408344, "grad_norm": 5.979862213134766, "learning_rate": 1.3734028795610054e-06, "loss": 0.4737, "num_input_tokens_seen": 151892896, "step": 124900 }, { "epoch": 15.650294449317128, "grad_norm": 25.585718154907227, "learning_rate": 1.3730265366169565e-06, "loss": 0.5427, "num_input_tokens_seen": 151899104, "step": 124905 }, { "epoch": 15.650920937225912, "grad_norm": 1.6445903778076172, "learning_rate": 1.3726502370364241e-06, "loss": 0.4337, "num_input_tokens_seen": 151905056, "step": 124910 }, { "epoch": 15.651547425134694, "grad_norm": 2.4154052734375, "learning_rate": 1.3722739808239077e-06, "loss": 0.4682, "num_input_tokens_seen": 151911392, "step": 124915 }, { "epoch": 15.652173913043478, "grad_norm": 2.959949493408203, "learning_rate": 1.371897767983908e-06, "loss": 0.441, "num_input_tokens_seen": 151917312, "step": 124920 }, { "epoch": 15.652800400952263, "grad_norm": 7.2603654861450195, "learning_rate": 1.371521598520919e-06, "loss": 0.4294, "num_input_tokens_seen": 151923840, "step": 124925 }, { "epoch": 15.653426888861045, "grad_norm": 2.7422893047332764, "learning_rate": 1.3711454724394412e-06, "loss": 0.5052, "num_input_tokens_seen": 151929856, "step": 124930 }, { "epoch": 15.654053376769829, "grad_norm": 8.25025463104248, "learning_rate": 1.3707693897439695e-06, "loss": 0.4393, "num_input_tokens_seen": 151935936, "step": 124935 }, { "epoch": 15.654679864678611, "grad_norm": 2.0912153720855713, "learning_rate": 1.3703933504390005e-06, "loss": 0.4947, "num_input_tokens_seen": 151942016, "step": 124940 }, { "epoch": 15.655306352587395, "grad_norm": 2.4062659740448, "learning_rate": 1.3700173545290319e-06, "loss": 0.4691, "num_input_tokens_seen": 151948192, "step": 124945 }, { "epoch": 15.655932840496178, "grad_norm": 10.945101737976074, "learning_rate": 1.3696414020185566e-06, "loss": 0.4706, "num_input_tokens_seen": 151953920, "step": 124950 }, { "epoch": 15.656559328404962, "grad_norm": 2.5601770877838135, "learning_rate": 1.3692654929120719e-06, "loss": 0.4117, "num_input_tokens_seen": 151960224, "step": 124955 }, { "epoch": 15.657185816313746, "grad_norm": 10.936016082763672, "learning_rate": 1.3688896272140695e-06, "loss": 0.4719, "num_input_tokens_seen": 151966112, "step": 124960 }, { "epoch": 15.657812304222528, "grad_norm": 8.019803047180176, "learning_rate": 1.3685138049290459e-06, "loss": 0.4576, "num_input_tokens_seen": 151972352, "step": 124965 }, { "epoch": 15.658438792131312, "grad_norm": 4.1897053718566895, "learning_rate": 1.3681380260614914e-06, "loss": 0.4757, "num_input_tokens_seen": 151978528, "step": 124970 }, { "epoch": 15.659065280040096, "grad_norm": 2.0061724185943604, "learning_rate": 1.3677622906159016e-06, "loss": 0.4123, "num_input_tokens_seen": 151984064, "step": 124975 }, { "epoch": 15.659691767948878, "grad_norm": 12.018034934997559, "learning_rate": 1.367386598596766e-06, "loss": 0.5859, "num_input_tokens_seen": 151990208, "step": 124980 }, { "epoch": 15.660318255857662, "grad_norm": 21.932706832885742, "learning_rate": 1.3670109500085793e-06, "loss": 0.5085, "num_input_tokens_seen": 151996288, "step": 124985 }, { "epoch": 15.660944743766445, "grad_norm": 11.5235595703125, "learning_rate": 1.3666353448558294e-06, "loss": 0.5315, "num_input_tokens_seen": 152002304, "step": 124990 }, { "epoch": 15.661571231675229, "grad_norm": 10.318412780761719, "learning_rate": 1.3662597831430108e-06, "loss": 0.5708, "num_input_tokens_seen": 152007968, "step": 124995 }, { "epoch": 15.662197719584013, "grad_norm": 3.033726692199707, "learning_rate": 1.3658842648746096e-06, "loss": 0.5164, "num_input_tokens_seen": 152014208, "step": 125000 }, { "epoch": 15.662824207492795, "grad_norm": 2.2778072357177734, "learning_rate": 1.365508790055118e-06, "loss": 0.4477, "num_input_tokens_seen": 152020512, "step": 125005 }, { "epoch": 15.663450695401579, "grad_norm": 11.339723587036133, "learning_rate": 1.3651333586890236e-06, "loss": 0.4709, "num_input_tokens_seen": 152026528, "step": 125010 }, { "epoch": 15.664077183310361, "grad_norm": 11.370678901672363, "learning_rate": 1.3647579707808183e-06, "loss": 0.3992, "num_input_tokens_seen": 152032640, "step": 125015 }, { "epoch": 15.664703671219145, "grad_norm": 1.7760685682296753, "learning_rate": 1.3643826263349857e-06, "loss": 0.4707, "num_input_tokens_seen": 152038848, "step": 125020 }, { "epoch": 15.66533015912793, "grad_norm": 3.492396593093872, "learning_rate": 1.364007325356016e-06, "loss": 0.5767, "num_input_tokens_seen": 152044992, "step": 125025 }, { "epoch": 15.665956647036712, "grad_norm": 2.4807686805725098, "learning_rate": 1.3636320678483966e-06, "loss": 0.4723, "num_input_tokens_seen": 152051488, "step": 125030 }, { "epoch": 15.666583134945496, "grad_norm": 1.7492936849594116, "learning_rate": 1.363256853816612e-06, "loss": 0.4939, "num_input_tokens_seen": 152057408, "step": 125035 }, { "epoch": 15.667209622854278, "grad_norm": 2.5673553943634033, "learning_rate": 1.362881683265151e-06, "loss": 0.4457, "num_input_tokens_seen": 152063840, "step": 125040 }, { "epoch": 15.667836110763062, "grad_norm": 1.9581003189086914, "learning_rate": 1.3625065561984951e-06, "loss": 0.5283, "num_input_tokens_seen": 152069472, "step": 125045 }, { "epoch": 15.668462598671846, "grad_norm": 9.308187484741211, "learning_rate": 1.3621314726211338e-06, "loss": 0.492, "num_input_tokens_seen": 152075680, "step": 125050 }, { "epoch": 15.669089086580628, "grad_norm": 8.253682136535645, "learning_rate": 1.3617564325375476e-06, "loss": 0.4694, "num_input_tokens_seen": 152081600, "step": 125055 }, { "epoch": 15.669715574489413, "grad_norm": 1.774818778038025, "learning_rate": 1.3613814359522237e-06, "loss": 0.4379, "num_input_tokens_seen": 152088096, "step": 125060 }, { "epoch": 15.670342062398195, "grad_norm": 17.617403030395508, "learning_rate": 1.3610064828696424e-06, "loss": 0.5732, "num_input_tokens_seen": 152094176, "step": 125065 }, { "epoch": 15.670968550306979, "grad_norm": 2.802622079849243, "learning_rate": 1.3606315732942883e-06, "loss": 0.4399, "num_input_tokens_seen": 152100544, "step": 125070 }, { "epoch": 15.671595038215763, "grad_norm": 3.9776692390441895, "learning_rate": 1.3602567072306444e-06, "loss": 0.458, "num_input_tokens_seen": 152106656, "step": 125075 }, { "epoch": 15.672221526124545, "grad_norm": 4.208819389343262, "learning_rate": 1.3598818846831908e-06, "loss": 0.4231, "num_input_tokens_seen": 152112576, "step": 125080 }, { "epoch": 15.67284801403333, "grad_norm": 1.949591040611267, "learning_rate": 1.3595071056564108e-06, "loss": 0.4791, "num_input_tokens_seen": 152118720, "step": 125085 }, { "epoch": 15.673474501942113, "grad_norm": 13.003644943237305, "learning_rate": 1.3591323701547827e-06, "loss": 0.5211, "num_input_tokens_seen": 152125024, "step": 125090 }, { "epoch": 15.674100989850896, "grad_norm": 11.014530181884766, "learning_rate": 1.35875767818279e-06, "loss": 0.512, "num_input_tokens_seen": 152130464, "step": 125095 }, { "epoch": 15.67472747775968, "grad_norm": 2.83909273147583, "learning_rate": 1.3583830297449085e-06, "loss": 0.4605, "num_input_tokens_seen": 152136288, "step": 125100 }, { "epoch": 15.675353965668462, "grad_norm": 1.4188908338546753, "learning_rate": 1.3580084248456205e-06, "loss": 0.5061, "num_input_tokens_seen": 152142528, "step": 125105 }, { "epoch": 15.675980453577246, "grad_norm": 2.3926942348480225, "learning_rate": 1.3576338634894043e-06, "loss": 0.4365, "num_input_tokens_seen": 152148320, "step": 125110 }, { "epoch": 15.67660694148603, "grad_norm": 2.4366955757141113, "learning_rate": 1.357259345680737e-06, "loss": 0.4419, "num_input_tokens_seen": 152154496, "step": 125115 }, { "epoch": 15.677233429394812, "grad_norm": 5.334864616394043, "learning_rate": 1.3568848714240962e-06, "loss": 0.4723, "num_input_tokens_seen": 152160128, "step": 125120 }, { "epoch": 15.677859917303596, "grad_norm": 8.22213077545166, "learning_rate": 1.35651044072396e-06, "loss": 0.4627, "num_input_tokens_seen": 152166528, "step": 125125 }, { "epoch": 15.678486405212379, "grad_norm": 6.260595798492432, "learning_rate": 1.3561360535848062e-06, "loss": 0.4455, "num_input_tokens_seen": 152172608, "step": 125130 }, { "epoch": 15.679112893121163, "grad_norm": 3.8323445320129395, "learning_rate": 1.3557617100111082e-06, "loss": 0.4387, "num_input_tokens_seen": 152179104, "step": 125135 }, { "epoch": 15.679739381029947, "grad_norm": 3.509207010269165, "learning_rate": 1.3553874100073438e-06, "loss": 0.4133, "num_input_tokens_seen": 152185120, "step": 125140 }, { "epoch": 15.680365868938729, "grad_norm": 17.330821990966797, "learning_rate": 1.3550131535779864e-06, "loss": 0.492, "num_input_tokens_seen": 152191008, "step": 125145 }, { "epoch": 15.680992356847513, "grad_norm": 2.514573335647583, "learning_rate": 1.3546389407275123e-06, "loss": 0.4403, "num_input_tokens_seen": 152197088, "step": 125150 }, { "epoch": 15.681618844756295, "grad_norm": 5.487761497497559, "learning_rate": 1.354264771460393e-06, "loss": 0.4494, "num_input_tokens_seen": 152203328, "step": 125155 }, { "epoch": 15.68224533266508, "grad_norm": 3.1300036907196045, "learning_rate": 1.3538906457811051e-06, "loss": 0.4952, "num_input_tokens_seen": 152209408, "step": 125160 }, { "epoch": 15.682871820573864, "grad_norm": 14.628584861755371, "learning_rate": 1.3535165636941188e-06, "loss": 0.4724, "num_input_tokens_seen": 152215552, "step": 125165 }, { "epoch": 15.683498308482646, "grad_norm": 2.7853522300720215, "learning_rate": 1.3531425252039088e-06, "loss": 0.4211, "num_input_tokens_seen": 152221696, "step": 125170 }, { "epoch": 15.68412479639143, "grad_norm": 6.701420783996582, "learning_rate": 1.3527685303149452e-06, "loss": 0.461, "num_input_tokens_seen": 152227872, "step": 125175 }, { "epoch": 15.684751284300212, "grad_norm": 5.022371292114258, "learning_rate": 1.3523945790317012e-06, "loss": 0.4298, "num_input_tokens_seen": 152233984, "step": 125180 }, { "epoch": 15.685377772208996, "grad_norm": 2.1492843627929688, "learning_rate": 1.3520206713586452e-06, "loss": 0.4301, "num_input_tokens_seen": 152240192, "step": 125185 }, { "epoch": 15.68600426011778, "grad_norm": 8.978782653808594, "learning_rate": 1.3516468073002492e-06, "loss": 0.4655, "num_input_tokens_seen": 152246368, "step": 125190 }, { "epoch": 15.686630748026563, "grad_norm": 3.4137301445007324, "learning_rate": 1.3512729868609848e-06, "loss": 0.4268, "num_input_tokens_seen": 152252512, "step": 125195 }, { "epoch": 15.687257235935347, "grad_norm": 18.979686737060547, "learning_rate": 1.3508992100453177e-06, "loss": 0.5329, "num_input_tokens_seen": 152258656, "step": 125200 }, { "epoch": 15.68788372384413, "grad_norm": 11.567055702209473, "learning_rate": 1.3505254768577197e-06, "loss": 0.497, "num_input_tokens_seen": 152264992, "step": 125205 }, { "epoch": 15.688510211752913, "grad_norm": 5.960781574249268, "learning_rate": 1.350151787302657e-06, "loss": 0.4841, "num_input_tokens_seen": 152271104, "step": 125210 }, { "epoch": 15.689136699661697, "grad_norm": 3.7847578525543213, "learning_rate": 1.3497781413845985e-06, "loss": 0.4527, "num_input_tokens_seen": 152277088, "step": 125215 }, { "epoch": 15.68976318757048, "grad_norm": 14.09448528289795, "learning_rate": 1.3494045391080107e-06, "loss": 0.4716, "num_input_tokens_seen": 152282976, "step": 125220 }, { "epoch": 15.690389675479263, "grad_norm": 2.3047142028808594, "learning_rate": 1.3490309804773626e-06, "loss": 0.4827, "num_input_tokens_seen": 152289120, "step": 125225 }, { "epoch": 15.691016163388047, "grad_norm": 2.430823564529419, "learning_rate": 1.348657465497117e-06, "loss": 0.4737, "num_input_tokens_seen": 152295264, "step": 125230 }, { "epoch": 15.69164265129683, "grad_norm": 4.466106414794922, "learning_rate": 1.348283994171743e-06, "loss": 0.4547, "num_input_tokens_seen": 152301472, "step": 125235 }, { "epoch": 15.692269139205614, "grad_norm": 6.067661762237549, "learning_rate": 1.3479105665057024e-06, "loss": 0.4845, "num_input_tokens_seen": 152307168, "step": 125240 }, { "epoch": 15.692895627114396, "grad_norm": 1.975832223892212, "learning_rate": 1.3475371825034633e-06, "loss": 0.5013, "num_input_tokens_seen": 152312800, "step": 125245 }, { "epoch": 15.69352211502318, "grad_norm": 4.340840816497803, "learning_rate": 1.3471638421694861e-06, "loss": 0.4284, "num_input_tokens_seen": 152319008, "step": 125250 }, { "epoch": 15.694148602931964, "grad_norm": 9.329375267028809, "learning_rate": 1.3467905455082364e-06, "loss": 0.4977, "num_input_tokens_seen": 152325024, "step": 125255 }, { "epoch": 15.694775090840746, "grad_norm": 10.770334243774414, "learning_rate": 1.3464172925241791e-06, "loss": 0.5201, "num_input_tokens_seen": 152331008, "step": 125260 }, { "epoch": 15.69540157874953, "grad_norm": 3.117583990097046, "learning_rate": 1.346044083221773e-06, "loss": 0.4121, "num_input_tokens_seen": 152336864, "step": 125265 }, { "epoch": 15.696028066658313, "grad_norm": 2.20747447013855, "learning_rate": 1.3456709176054832e-06, "loss": 0.4412, "num_input_tokens_seen": 152342304, "step": 125270 }, { "epoch": 15.696654554567097, "grad_norm": 3.27862548828125, "learning_rate": 1.3452977956797691e-06, "loss": 0.469, "num_input_tokens_seen": 152348448, "step": 125275 }, { "epoch": 15.69728104247588, "grad_norm": 1.696724772453308, "learning_rate": 1.3449247174490937e-06, "loss": 0.4404, "num_input_tokens_seen": 152354304, "step": 125280 }, { "epoch": 15.697907530384663, "grad_norm": 2.1063408851623535, "learning_rate": 1.3445516829179144e-06, "loss": 0.4758, "num_input_tokens_seen": 152360608, "step": 125285 }, { "epoch": 15.698534018293447, "grad_norm": 2.355874538421631, "learning_rate": 1.3441786920906952e-06, "loss": 0.4303, "num_input_tokens_seen": 152367008, "step": 125290 }, { "epoch": 15.69916050620223, "grad_norm": 3.0791220664978027, "learning_rate": 1.3438057449718916e-06, "loss": 0.4186, "num_input_tokens_seen": 152373056, "step": 125295 }, { "epoch": 15.699786994111014, "grad_norm": 3.106157064437866, "learning_rate": 1.343432841565966e-06, "loss": 0.4602, "num_input_tokens_seen": 152379328, "step": 125300 }, { "epoch": 15.700413482019798, "grad_norm": 7.782926082611084, "learning_rate": 1.3430599818773731e-06, "loss": 0.4639, "num_input_tokens_seen": 152385088, "step": 125305 }, { "epoch": 15.70103996992858, "grad_norm": 2.4295461177825928, "learning_rate": 1.3426871659105733e-06, "loss": 0.5072, "num_input_tokens_seen": 152391360, "step": 125310 }, { "epoch": 15.701666457837364, "grad_norm": 6.89103364944458, "learning_rate": 1.3423143936700245e-06, "loss": 0.4542, "num_input_tokens_seen": 152397216, "step": 125315 }, { "epoch": 15.702292945746148, "grad_norm": 2.1410133838653564, "learning_rate": 1.3419416651601808e-06, "loss": 0.5173, "num_input_tokens_seen": 152403168, "step": 125320 }, { "epoch": 15.70291943365493, "grad_norm": 10.489031791687012, "learning_rate": 1.3415689803855003e-06, "loss": 0.4875, "num_input_tokens_seen": 152409408, "step": 125325 }, { "epoch": 15.703545921563714, "grad_norm": 1.8162541389465332, "learning_rate": 1.341196339350438e-06, "loss": 0.4696, "num_input_tokens_seen": 152415584, "step": 125330 }, { "epoch": 15.704172409472497, "grad_norm": 2.4715750217437744, "learning_rate": 1.3408237420594516e-06, "loss": 0.4767, "num_input_tokens_seen": 152421568, "step": 125335 }, { "epoch": 15.70479889738128, "grad_norm": 3.328432083129883, "learning_rate": 1.340451188516992e-06, "loss": 0.4026, "num_input_tokens_seen": 152427872, "step": 125340 }, { "epoch": 15.705425385290065, "grad_norm": 10.323090553283691, "learning_rate": 1.3400786787275166e-06, "loss": 0.4838, "num_input_tokens_seen": 152433728, "step": 125345 }, { "epoch": 15.706051873198847, "grad_norm": 2.346501588821411, "learning_rate": 1.3397062126954769e-06, "loss": 0.4291, "num_input_tokens_seen": 152439616, "step": 125350 }, { "epoch": 15.706678361107631, "grad_norm": 2.0014595985412598, "learning_rate": 1.3393337904253279e-06, "loss": 0.4308, "num_input_tokens_seen": 152445600, "step": 125355 }, { "epoch": 15.707304849016413, "grad_norm": 2.19157338142395, "learning_rate": 1.3389614119215194e-06, "loss": 0.399, "num_input_tokens_seen": 152451552, "step": 125360 }, { "epoch": 15.707931336925197, "grad_norm": 2.2702300548553467, "learning_rate": 1.3385890771885068e-06, "loss": 0.4154, "num_input_tokens_seen": 152457568, "step": 125365 }, { "epoch": 15.708557824833981, "grad_norm": 2.775960922241211, "learning_rate": 1.3382167862307388e-06, "loss": 0.497, "num_input_tokens_seen": 152463744, "step": 125370 }, { "epoch": 15.709184312742764, "grad_norm": 10.219520568847656, "learning_rate": 1.337844539052668e-06, "loss": 0.6207, "num_input_tokens_seen": 152470112, "step": 125375 }, { "epoch": 15.709810800651548, "grad_norm": 2.738995313644409, "learning_rate": 1.337472335658746e-06, "loss": 0.4176, "num_input_tokens_seen": 152476160, "step": 125380 }, { "epoch": 15.71043728856033, "grad_norm": 2.9183504581451416, "learning_rate": 1.3371001760534197e-06, "loss": 0.4308, "num_input_tokens_seen": 152482336, "step": 125385 }, { "epoch": 15.711063776469114, "grad_norm": 9.652889251708984, "learning_rate": 1.3367280602411426e-06, "loss": 0.5128, "num_input_tokens_seen": 152488416, "step": 125390 }, { "epoch": 15.711690264377898, "grad_norm": 15.902667999267578, "learning_rate": 1.3363559882263594e-06, "loss": 0.5655, "num_input_tokens_seen": 152494528, "step": 125395 }, { "epoch": 15.71231675228668, "grad_norm": 4.899646759033203, "learning_rate": 1.335983960013522e-06, "loss": 0.4473, "num_input_tokens_seen": 152500448, "step": 125400 }, { "epoch": 15.712943240195465, "grad_norm": 2.2912402153015137, "learning_rate": 1.3356119756070756e-06, "loss": 0.4586, "num_input_tokens_seen": 152506720, "step": 125405 }, { "epoch": 15.713569728104247, "grad_norm": 2.191824436187744, "learning_rate": 1.3352400350114703e-06, "loss": 0.5342, "num_input_tokens_seen": 152512928, "step": 125410 }, { "epoch": 15.71419621601303, "grad_norm": 1.9681802988052368, "learning_rate": 1.3348681382311502e-06, "loss": 0.476, "num_input_tokens_seen": 152519136, "step": 125415 }, { "epoch": 15.714822703921815, "grad_norm": 1.9254918098449707, "learning_rate": 1.334496285270563e-06, "loss": 0.4459, "num_input_tokens_seen": 152525088, "step": 125420 }, { "epoch": 15.715449191830597, "grad_norm": 11.890520095825195, "learning_rate": 1.3341244761341543e-06, "loss": 0.663, "num_input_tokens_seen": 152531616, "step": 125425 }, { "epoch": 15.716075679739381, "grad_norm": 3.8629772663116455, "learning_rate": 1.3337527108263709e-06, "loss": 0.4249, "num_input_tokens_seen": 152537856, "step": 125430 }, { "epoch": 15.716702167648165, "grad_norm": 1.6058775186538696, "learning_rate": 1.3333809893516552e-06, "loss": 0.4505, "num_input_tokens_seen": 152544384, "step": 125435 }, { "epoch": 15.717328655556948, "grad_norm": 9.328682899475098, "learning_rate": 1.3330093117144528e-06, "loss": 0.4505, "num_input_tokens_seen": 152550848, "step": 125440 }, { "epoch": 15.717955143465732, "grad_norm": 2.958548069000244, "learning_rate": 1.332637677919208e-06, "loss": 0.4228, "num_input_tokens_seen": 152557120, "step": 125445 }, { "epoch": 15.718581631374514, "grad_norm": 2.138169288635254, "learning_rate": 1.3322660879703614e-06, "loss": 0.435, "num_input_tokens_seen": 152563264, "step": 125450 }, { "epoch": 15.719208119283298, "grad_norm": 1.6321704387664795, "learning_rate": 1.3318945418723593e-06, "loss": 0.5421, "num_input_tokens_seen": 152569440, "step": 125455 }, { "epoch": 15.71983460719208, "grad_norm": 2.656750440597534, "learning_rate": 1.3315230396296402e-06, "loss": 0.4663, "num_input_tokens_seen": 152576032, "step": 125460 }, { "epoch": 15.720461095100864, "grad_norm": 9.545897483825684, "learning_rate": 1.331151581246649e-06, "loss": 0.4773, "num_input_tokens_seen": 152582368, "step": 125465 }, { "epoch": 15.721087583009648, "grad_norm": 20.900358200073242, "learning_rate": 1.3307801667278242e-06, "loss": 0.4968, "num_input_tokens_seen": 152588320, "step": 125470 }, { "epoch": 15.72171407091843, "grad_norm": 17.839797973632812, "learning_rate": 1.3304087960776085e-06, "loss": 0.6009, "num_input_tokens_seen": 152594560, "step": 125475 }, { "epoch": 15.722340558827215, "grad_norm": 12.561065673828125, "learning_rate": 1.3300374693004397e-06, "loss": 0.4961, "num_input_tokens_seen": 152600992, "step": 125480 }, { "epoch": 15.722967046735999, "grad_norm": 17.26190185546875, "learning_rate": 1.3296661864007593e-06, "loss": 0.5479, "num_input_tokens_seen": 152607136, "step": 125485 }, { "epoch": 15.723593534644781, "grad_norm": 2.724698305130005, "learning_rate": 1.3292949473830047e-06, "loss": 0.5238, "num_input_tokens_seen": 152613184, "step": 125490 }, { "epoch": 15.724220022553565, "grad_norm": 3.919484853744507, "learning_rate": 1.3289237522516163e-06, "loss": 0.4822, "num_input_tokens_seen": 152619296, "step": 125495 }, { "epoch": 15.724846510462347, "grad_norm": 13.857196807861328, "learning_rate": 1.3285526010110294e-06, "loss": 0.4968, "num_input_tokens_seen": 152625728, "step": 125500 }, { "epoch": 15.725472998371131, "grad_norm": 7.540931701660156, "learning_rate": 1.3281814936656833e-06, "loss": 0.4306, "num_input_tokens_seen": 152631456, "step": 125505 }, { "epoch": 15.726099486279915, "grad_norm": 1.6241930723190308, "learning_rate": 1.3278104302200157e-06, "loss": 0.4426, "num_input_tokens_seen": 152637600, "step": 125510 }, { "epoch": 15.726725974188698, "grad_norm": 1.9199045896530151, "learning_rate": 1.3274394106784605e-06, "loss": 0.446, "num_input_tokens_seen": 152643936, "step": 125515 }, { "epoch": 15.727352462097482, "grad_norm": 2.76997447013855, "learning_rate": 1.3270684350454542e-06, "loss": 0.4819, "num_input_tokens_seen": 152649760, "step": 125520 }, { "epoch": 15.727978950006264, "grad_norm": 2.923752546310425, "learning_rate": 1.3266975033254354e-06, "loss": 0.4397, "num_input_tokens_seen": 152656032, "step": 125525 }, { "epoch": 15.728605437915048, "grad_norm": 7.599374771118164, "learning_rate": 1.3263266155228338e-06, "loss": 0.43, "num_input_tokens_seen": 152662112, "step": 125530 }, { "epoch": 15.729231925823832, "grad_norm": 2.9492430686950684, "learning_rate": 1.3259557716420868e-06, "loss": 0.4256, "num_input_tokens_seen": 152668576, "step": 125535 }, { "epoch": 15.729858413732615, "grad_norm": 4.280838489532471, "learning_rate": 1.325584971687629e-06, "loss": 0.4501, "num_input_tokens_seen": 152674496, "step": 125540 }, { "epoch": 15.730484901641399, "grad_norm": 1.5587596893310547, "learning_rate": 1.3252142156638903e-06, "loss": 0.4933, "num_input_tokens_seen": 152680576, "step": 125545 }, { "epoch": 15.731111389550183, "grad_norm": 1.6720831394195557, "learning_rate": 1.3248435035753066e-06, "loss": 0.4467, "num_input_tokens_seen": 152686464, "step": 125550 }, { "epoch": 15.731737877458965, "grad_norm": 3.869682550430298, "learning_rate": 1.3244728354263075e-06, "loss": 0.5199, "num_input_tokens_seen": 152692896, "step": 125555 }, { "epoch": 15.732364365367749, "grad_norm": 2.369593858718872, "learning_rate": 1.324102211221326e-06, "loss": 0.4318, "num_input_tokens_seen": 152699008, "step": 125560 }, { "epoch": 15.732990853276531, "grad_norm": 1.182952880859375, "learning_rate": 1.3237316309647946e-06, "loss": 0.4797, "num_input_tokens_seen": 152704992, "step": 125565 }, { "epoch": 15.733617341185315, "grad_norm": 1.9907232522964478, "learning_rate": 1.3233610946611408e-06, "loss": 0.465, "num_input_tokens_seen": 152711072, "step": 125570 }, { "epoch": 15.734243829094098, "grad_norm": 2.916667938232422, "learning_rate": 1.3229906023147975e-06, "loss": 0.4489, "num_input_tokens_seen": 152717184, "step": 125575 }, { "epoch": 15.734870317002882, "grad_norm": 3.305558443069458, "learning_rate": 1.3226201539301919e-06, "loss": 0.4503, "num_input_tokens_seen": 152723040, "step": 125580 }, { "epoch": 15.735496804911666, "grad_norm": 1.9938676357269287, "learning_rate": 1.3222497495117552e-06, "loss": 0.4481, "num_input_tokens_seen": 152728896, "step": 125585 }, { "epoch": 15.736123292820448, "grad_norm": 2.179340124130249, "learning_rate": 1.3218793890639136e-06, "loss": 0.488, "num_input_tokens_seen": 152735104, "step": 125590 }, { "epoch": 15.736749780729232, "grad_norm": 3.1762936115264893, "learning_rate": 1.3215090725910978e-06, "loss": 0.5664, "num_input_tokens_seen": 152741312, "step": 125595 }, { "epoch": 15.737376268638016, "grad_norm": 20.135257720947266, "learning_rate": 1.321138800097732e-06, "loss": 0.4368, "num_input_tokens_seen": 152747296, "step": 125600 }, { "epoch": 15.738002756546798, "grad_norm": 9.673172950744629, "learning_rate": 1.3207685715882467e-06, "loss": 0.4921, "num_input_tokens_seen": 152753376, "step": 125605 }, { "epoch": 15.738629244455582, "grad_norm": 3.5937600135803223, "learning_rate": 1.320398387067065e-06, "loss": 0.4673, "num_input_tokens_seen": 152759776, "step": 125610 }, { "epoch": 15.739255732364365, "grad_norm": 4.657852649688721, "learning_rate": 1.3200282465386156e-06, "loss": 0.4457, "num_input_tokens_seen": 152765344, "step": 125615 }, { "epoch": 15.739882220273149, "grad_norm": 3.937730073928833, "learning_rate": 1.3196581500073208e-06, "loss": 0.4681, "num_input_tokens_seen": 152771392, "step": 125620 }, { "epoch": 15.740508708181933, "grad_norm": 2.5030126571655273, "learning_rate": 1.3192880974776073e-06, "loss": 0.4245, "num_input_tokens_seen": 152777600, "step": 125625 }, { "epoch": 15.741135196090715, "grad_norm": 3.961545705795288, "learning_rate": 1.3189180889538988e-06, "loss": 0.4982, "num_input_tokens_seen": 152783520, "step": 125630 }, { "epoch": 15.7417616839995, "grad_norm": 2.09395694732666, "learning_rate": 1.3185481244406213e-06, "loss": 0.4375, "num_input_tokens_seen": 152789184, "step": 125635 }, { "epoch": 15.742388171908281, "grad_norm": 8.758543014526367, "learning_rate": 1.3181782039421947e-06, "loss": 0.4908, "num_input_tokens_seen": 152795296, "step": 125640 }, { "epoch": 15.743014659817065, "grad_norm": 16.246585845947266, "learning_rate": 1.3178083274630432e-06, "loss": 0.5253, "num_input_tokens_seen": 152801632, "step": 125645 }, { "epoch": 15.74364114772585, "grad_norm": 3.4332594871520996, "learning_rate": 1.3174384950075903e-06, "loss": 0.4675, "num_input_tokens_seen": 152808160, "step": 125650 }, { "epoch": 15.744267635634632, "grad_norm": 16.064868927001953, "learning_rate": 1.3170687065802545e-06, "loss": 0.4908, "num_input_tokens_seen": 152814112, "step": 125655 }, { "epoch": 15.744894123543416, "grad_norm": 5.014090538024902, "learning_rate": 1.316698962185461e-06, "loss": 0.4711, "num_input_tokens_seen": 152820096, "step": 125660 }, { "epoch": 15.745520611452198, "grad_norm": 12.282404899597168, "learning_rate": 1.3163292618276257e-06, "loss": 0.5856, "num_input_tokens_seen": 152826016, "step": 125665 }, { "epoch": 15.746147099360982, "grad_norm": 2.9386343955993652, "learning_rate": 1.3159596055111735e-06, "loss": 0.4575, "num_input_tokens_seen": 152831872, "step": 125670 }, { "epoch": 15.746773587269766, "grad_norm": 2.322584867477417, "learning_rate": 1.3155899932405196e-06, "loss": 0.482, "num_input_tokens_seen": 152837728, "step": 125675 }, { "epoch": 15.747400075178549, "grad_norm": 10.510790824890137, "learning_rate": 1.3152204250200868e-06, "loss": 0.5552, "num_input_tokens_seen": 152844032, "step": 125680 }, { "epoch": 15.748026563087333, "grad_norm": 3.225809335708618, "learning_rate": 1.3148509008542898e-06, "loss": 0.438, "num_input_tokens_seen": 152850144, "step": 125685 }, { "epoch": 15.748653050996115, "grad_norm": 9.72976303100586, "learning_rate": 1.3144814207475492e-06, "loss": 0.4918, "num_input_tokens_seen": 152855872, "step": 125690 }, { "epoch": 15.749279538904899, "grad_norm": 11.43492603302002, "learning_rate": 1.3141119847042832e-06, "loss": 0.4688, "num_input_tokens_seen": 152861664, "step": 125695 }, { "epoch": 15.749906026813683, "grad_norm": 5.9459757804870605, "learning_rate": 1.3137425927289055e-06, "loss": 0.4587, "num_input_tokens_seen": 152867776, "step": 125700 }, { "epoch": 15.750532514722465, "grad_norm": 2.8776040077209473, "learning_rate": 1.3133732448258363e-06, "loss": 0.4843, "num_input_tokens_seen": 152873728, "step": 125705 }, { "epoch": 15.75115900263125, "grad_norm": 5.941892147064209, "learning_rate": 1.3130039409994877e-06, "loss": 0.4679, "num_input_tokens_seen": 152880000, "step": 125710 }, { "epoch": 15.751785490540033, "grad_norm": 4.227366924285889, "learning_rate": 1.3126346812542783e-06, "loss": 0.4215, "num_input_tokens_seen": 152885600, "step": 125715 }, { "epoch": 15.752411978448816, "grad_norm": 1.86453378200531, "learning_rate": 1.3122654655946198e-06, "loss": 0.4879, "num_input_tokens_seen": 152891680, "step": 125720 }, { "epoch": 15.7530384663576, "grad_norm": 4.826174736022949, "learning_rate": 1.3118962940249286e-06, "loss": 0.4664, "num_input_tokens_seen": 152897568, "step": 125725 }, { "epoch": 15.753664954266382, "grad_norm": 1.7478649616241455, "learning_rate": 1.3115271665496193e-06, "loss": 0.436, "num_input_tokens_seen": 152903488, "step": 125730 }, { "epoch": 15.754291442175166, "grad_norm": 3.189405918121338, "learning_rate": 1.3111580831731025e-06, "loss": 0.4402, "num_input_tokens_seen": 152909568, "step": 125735 }, { "epoch": 15.75491793008395, "grad_norm": 2.559713840484619, "learning_rate": 1.3107890438997916e-06, "loss": 0.4583, "num_input_tokens_seen": 152915456, "step": 125740 }, { "epoch": 15.755544417992732, "grad_norm": 1.5911868810653687, "learning_rate": 1.3104200487341e-06, "loss": 0.4252, "num_input_tokens_seen": 152921696, "step": 125745 }, { "epoch": 15.756170905901516, "grad_norm": 10.895788192749023, "learning_rate": 1.3100510976804403e-06, "loss": 0.5851, "num_input_tokens_seen": 152927968, "step": 125750 }, { "epoch": 15.756797393810299, "grad_norm": 1.8357890844345093, "learning_rate": 1.30968219074322e-06, "loss": 0.4105, "num_input_tokens_seen": 152934208, "step": 125755 }, { "epoch": 15.757423881719083, "grad_norm": 14.138117790222168, "learning_rate": 1.3093133279268538e-06, "loss": 0.5126, "num_input_tokens_seen": 152940288, "step": 125760 }, { "epoch": 15.758050369627867, "grad_norm": 6.828485488891602, "learning_rate": 1.3089445092357478e-06, "loss": 0.4566, "num_input_tokens_seen": 152946784, "step": 125765 }, { "epoch": 15.75867685753665, "grad_norm": 2.493220567703247, "learning_rate": 1.3085757346743154e-06, "loss": 0.4206, "num_input_tokens_seen": 152952768, "step": 125770 }, { "epoch": 15.759303345445433, "grad_norm": 14.963122367858887, "learning_rate": 1.3082070042469618e-06, "loss": 0.4826, "num_input_tokens_seen": 152958336, "step": 125775 }, { "epoch": 15.759929833354215, "grad_norm": 3.866483688354492, "learning_rate": 1.3078383179580988e-06, "loss": 0.4783, "num_input_tokens_seen": 152964672, "step": 125780 }, { "epoch": 15.760556321263, "grad_norm": 12.82929801940918, "learning_rate": 1.307469675812132e-06, "loss": 0.5177, "num_input_tokens_seen": 152970368, "step": 125785 }, { "epoch": 15.761182809171784, "grad_norm": 3.4693257808685303, "learning_rate": 1.3071010778134707e-06, "loss": 0.4296, "num_input_tokens_seen": 152976320, "step": 125790 }, { "epoch": 15.761809297080566, "grad_norm": 2.1328718662261963, "learning_rate": 1.3067325239665197e-06, "loss": 0.4831, "num_input_tokens_seen": 152981792, "step": 125795 }, { "epoch": 15.76243578498935, "grad_norm": 2.494907855987549, "learning_rate": 1.3063640142756878e-06, "loss": 0.4334, "num_input_tokens_seen": 152987808, "step": 125800 }, { "epoch": 15.763062272898132, "grad_norm": 2.2530040740966797, "learning_rate": 1.305995548745378e-06, "loss": 0.4381, "num_input_tokens_seen": 152993952, "step": 125805 }, { "epoch": 15.763688760806916, "grad_norm": 5.025701999664307, "learning_rate": 1.305627127379997e-06, "loss": 0.5101, "num_input_tokens_seen": 152999968, "step": 125810 }, { "epoch": 15.7643152487157, "grad_norm": 8.073125839233398, "learning_rate": 1.3052587501839515e-06, "loss": 0.5045, "num_input_tokens_seen": 153006176, "step": 125815 }, { "epoch": 15.764941736624483, "grad_norm": 16.49526023864746, "learning_rate": 1.3048904171616423e-06, "loss": 0.5102, "num_input_tokens_seen": 153012128, "step": 125820 }, { "epoch": 15.765568224533267, "grad_norm": 1.6936537027359009, "learning_rate": 1.304522128317477e-06, "loss": 0.4519, "num_input_tokens_seen": 153018304, "step": 125825 }, { "epoch": 15.76619471244205, "grad_norm": 2.038358211517334, "learning_rate": 1.3041538836558543e-06, "loss": 0.4357, "num_input_tokens_seen": 153023648, "step": 125830 }, { "epoch": 15.766821200350833, "grad_norm": 1.631636381149292, "learning_rate": 1.3037856831811797e-06, "loss": 0.4638, "num_input_tokens_seen": 153029920, "step": 125835 }, { "epoch": 15.767447688259617, "grad_norm": 6.731732368469238, "learning_rate": 1.3034175268978544e-06, "loss": 0.456, "num_input_tokens_seen": 153035840, "step": 125840 }, { "epoch": 15.7680741761684, "grad_norm": 4.249788761138916, "learning_rate": 1.3030494148102823e-06, "loss": 0.5057, "num_input_tokens_seen": 153042080, "step": 125845 }, { "epoch": 15.768700664077183, "grad_norm": 1.9454454183578491, "learning_rate": 1.3026813469228612e-06, "loss": 0.462, "num_input_tokens_seen": 153047712, "step": 125850 }, { "epoch": 15.769327151985967, "grad_norm": 2.054042100906372, "learning_rate": 1.3023133232399949e-06, "loss": 0.4671, "num_input_tokens_seen": 153053632, "step": 125855 }, { "epoch": 15.76995363989475, "grad_norm": 6.051397323608398, "learning_rate": 1.3019453437660795e-06, "loss": 0.5043, "num_input_tokens_seen": 153059712, "step": 125860 }, { "epoch": 15.770580127803534, "grad_norm": 2.46822190284729, "learning_rate": 1.3015774085055187e-06, "loss": 0.4358, "num_input_tokens_seen": 153065568, "step": 125865 }, { "epoch": 15.771206615712316, "grad_norm": 4.594525337219238, "learning_rate": 1.3012095174627077e-06, "loss": 0.4776, "num_input_tokens_seen": 153071648, "step": 125870 }, { "epoch": 15.7718331036211, "grad_norm": 2.877965211868286, "learning_rate": 1.3008416706420468e-06, "loss": 0.4799, "num_input_tokens_seen": 153077440, "step": 125875 }, { "epoch": 15.772459591529884, "grad_norm": 3.155210018157959, "learning_rate": 1.3004738680479357e-06, "loss": 0.4141, "num_input_tokens_seen": 153083264, "step": 125880 }, { "epoch": 15.773086079438666, "grad_norm": 10.521246910095215, "learning_rate": 1.3001061096847678e-06, "loss": 0.4727, "num_input_tokens_seen": 153088512, "step": 125885 }, { "epoch": 15.77371256734745, "grad_norm": 3.1249911785125732, "learning_rate": 1.2997383955569437e-06, "loss": 0.4656, "num_input_tokens_seen": 153094720, "step": 125890 }, { "epoch": 15.774339055256233, "grad_norm": 3.4149229526519775, "learning_rate": 1.2993707256688564e-06, "loss": 0.4216, "num_input_tokens_seen": 153100704, "step": 125895 }, { "epoch": 15.774965543165017, "grad_norm": 3.172220230102539, "learning_rate": 1.2990031000249053e-06, "loss": 0.4355, "num_input_tokens_seen": 153106752, "step": 125900 }, { "epoch": 15.775592031073801, "grad_norm": 4.896515369415283, "learning_rate": 1.2986355186294819e-06, "loss": 0.4867, "num_input_tokens_seen": 153113056, "step": 125905 }, { "epoch": 15.776218518982583, "grad_norm": 5.323290824890137, "learning_rate": 1.2982679814869848e-06, "loss": 0.4449, "num_input_tokens_seen": 153119296, "step": 125910 }, { "epoch": 15.776845006891367, "grad_norm": 2.5355308055877686, "learning_rate": 1.2979004886018044e-06, "loss": 0.4924, "num_input_tokens_seen": 153125440, "step": 125915 }, { "epoch": 15.77747149480015, "grad_norm": 10.098042488098145, "learning_rate": 1.2975330399783376e-06, "loss": 0.4927, "num_input_tokens_seen": 153131616, "step": 125920 }, { "epoch": 15.778097982708934, "grad_norm": 2.815858840942383, "learning_rate": 1.297165635620975e-06, "loss": 0.4431, "num_input_tokens_seen": 153138016, "step": 125925 }, { "epoch": 15.778724470617718, "grad_norm": 3.013836622238159, "learning_rate": 1.2967982755341102e-06, "loss": 0.4849, "num_input_tokens_seen": 153143872, "step": 125930 }, { "epoch": 15.7793509585265, "grad_norm": 3.8044424057006836, "learning_rate": 1.296430959722137e-06, "loss": 0.4491, "num_input_tokens_seen": 153150336, "step": 125935 }, { "epoch": 15.779977446435284, "grad_norm": 5.064908981323242, "learning_rate": 1.296063688189444e-06, "loss": 0.4031, "num_input_tokens_seen": 153156672, "step": 125940 }, { "epoch": 15.780603934344068, "grad_norm": 13.421201705932617, "learning_rate": 1.2956964609404232e-06, "loss": 0.4894, "num_input_tokens_seen": 153162656, "step": 125945 }, { "epoch": 15.78123042225285, "grad_norm": 4.41129207611084, "learning_rate": 1.2953292779794662e-06, "loss": 0.4325, "num_input_tokens_seen": 153168160, "step": 125950 }, { "epoch": 15.781856910161634, "grad_norm": 4.528615951538086, "learning_rate": 1.294962139310964e-06, "loss": 0.4608, "num_input_tokens_seen": 153174432, "step": 125955 }, { "epoch": 15.782483398070417, "grad_norm": 1.9733012914657593, "learning_rate": 1.2945950449393025e-06, "loss": 0.4722, "num_input_tokens_seen": 153180832, "step": 125960 }, { "epoch": 15.7831098859792, "grad_norm": 6.202042579650879, "learning_rate": 1.2942279948688747e-06, "loss": 0.4844, "num_input_tokens_seen": 153187072, "step": 125965 }, { "epoch": 15.783736373887985, "grad_norm": 2.5300614833831787, "learning_rate": 1.2938609891040648e-06, "loss": 0.4637, "num_input_tokens_seen": 153193152, "step": 125970 }, { "epoch": 15.784362861796767, "grad_norm": 1.7105493545532227, "learning_rate": 1.2934940276492641e-06, "loss": 0.4294, "num_input_tokens_seen": 153199328, "step": 125975 }, { "epoch": 15.784989349705551, "grad_norm": 2.3433549404144287, "learning_rate": 1.2931271105088578e-06, "loss": 0.4663, "num_input_tokens_seen": 153205504, "step": 125980 }, { "epoch": 15.785615837614333, "grad_norm": 2.439779281616211, "learning_rate": 1.2927602376872345e-06, "loss": 0.4503, "num_input_tokens_seen": 153211296, "step": 125985 }, { "epoch": 15.786242325523117, "grad_norm": 1.6988656520843506, "learning_rate": 1.2923934091887774e-06, "loss": 0.4356, "num_input_tokens_seen": 153216704, "step": 125990 }, { "epoch": 15.786868813431902, "grad_norm": 6.757774829864502, "learning_rate": 1.2920266250178747e-06, "loss": 0.4667, "num_input_tokens_seen": 153222816, "step": 125995 }, { "epoch": 15.787495301340684, "grad_norm": 2.9384102821350098, "learning_rate": 1.291659885178913e-06, "loss": 0.4189, "num_input_tokens_seen": 153228288, "step": 126000 }, { "epoch": 15.788121789249468, "grad_norm": 4.442910194396973, "learning_rate": 1.2912931896762736e-06, "loss": 0.4398, "num_input_tokens_seen": 153234272, "step": 126005 }, { "epoch": 15.78874827715825, "grad_norm": 3.322864294052124, "learning_rate": 1.2909265385143437e-06, "loss": 0.4858, "num_input_tokens_seen": 153240608, "step": 126010 }, { "epoch": 15.789374765067034, "grad_norm": 3.956603527069092, "learning_rate": 1.2905599316975037e-06, "loss": 0.4329, "num_input_tokens_seen": 153246912, "step": 126015 }, { "epoch": 15.790001252975818, "grad_norm": 3.4290671348571777, "learning_rate": 1.2901933692301405e-06, "loss": 0.4141, "num_input_tokens_seen": 153253152, "step": 126020 }, { "epoch": 15.7906277408846, "grad_norm": 8.05950927734375, "learning_rate": 1.2898268511166322e-06, "loss": 0.4432, "num_input_tokens_seen": 153259200, "step": 126025 }, { "epoch": 15.791254228793385, "grad_norm": 3.3335492610931396, "learning_rate": 1.2894603773613657e-06, "loss": 0.4257, "num_input_tokens_seen": 153265824, "step": 126030 }, { "epoch": 15.791880716702167, "grad_norm": 2.3584163188934326, "learning_rate": 1.2890939479687182e-06, "loss": 0.4793, "num_input_tokens_seen": 153272064, "step": 126035 }, { "epoch": 15.792507204610951, "grad_norm": 6.082821369171143, "learning_rate": 1.2887275629430724e-06, "loss": 0.4696, "num_input_tokens_seen": 153278112, "step": 126040 }, { "epoch": 15.793133692519735, "grad_norm": 4.758626937866211, "learning_rate": 1.2883612222888088e-06, "loss": 0.4419, "num_input_tokens_seen": 153284416, "step": 126045 }, { "epoch": 15.793760180428517, "grad_norm": 9.69020938873291, "learning_rate": 1.2879949260103086e-06, "loss": 0.4862, "num_input_tokens_seen": 153290496, "step": 126050 }, { "epoch": 15.794386668337301, "grad_norm": 13.561919212341309, "learning_rate": 1.2876286741119487e-06, "loss": 0.4665, "num_input_tokens_seen": 153296640, "step": 126055 }, { "epoch": 15.795013156246085, "grad_norm": 14.296032905578613, "learning_rate": 1.2872624665981092e-06, "loss": 0.5071, "num_input_tokens_seen": 153302624, "step": 126060 }, { "epoch": 15.795639644154868, "grad_norm": 2.748128652572632, "learning_rate": 1.2868963034731701e-06, "loss": 0.4523, "num_input_tokens_seen": 153308704, "step": 126065 }, { "epoch": 15.796266132063652, "grad_norm": 2.7607717514038086, "learning_rate": 1.2865301847415057e-06, "loss": 0.5366, "num_input_tokens_seen": 153315040, "step": 126070 }, { "epoch": 15.796892619972434, "grad_norm": 1.612195372581482, "learning_rate": 1.2861641104074969e-06, "loss": 0.4289, "num_input_tokens_seen": 153321120, "step": 126075 }, { "epoch": 15.797519107881218, "grad_norm": 2.5375916957855225, "learning_rate": 1.2857980804755165e-06, "loss": 0.4438, "num_input_tokens_seen": 153327136, "step": 126080 }, { "epoch": 15.79814559579, "grad_norm": 12.126648902893066, "learning_rate": 1.2854320949499449e-06, "loss": 0.4922, "num_input_tokens_seen": 153333472, "step": 126085 }, { "epoch": 15.798772083698784, "grad_norm": 9.346573829650879, "learning_rate": 1.2850661538351539e-06, "loss": 0.4302, "num_input_tokens_seen": 153339296, "step": 126090 }, { "epoch": 15.799398571607568, "grad_norm": 5.907742500305176, "learning_rate": 1.2847002571355222e-06, "loss": 0.4431, "num_input_tokens_seen": 153345376, "step": 126095 }, { "epoch": 15.80002505951635, "grad_norm": 2.622955322265625, "learning_rate": 1.2843344048554207e-06, "loss": 0.457, "num_input_tokens_seen": 153351584, "step": 126100 }, { "epoch": 15.800651547425135, "grad_norm": 4.153990268707275, "learning_rate": 1.2839685969992267e-06, "loss": 0.3903, "num_input_tokens_seen": 153357664, "step": 126105 }, { "epoch": 15.801278035333919, "grad_norm": 8.298538208007812, "learning_rate": 1.2836028335713113e-06, "loss": 0.5, "num_input_tokens_seen": 153363104, "step": 126110 }, { "epoch": 15.801904523242701, "grad_norm": 7.7574896812438965, "learning_rate": 1.283237114576048e-06, "loss": 0.4866, "num_input_tokens_seen": 153369344, "step": 126115 }, { "epoch": 15.802531011151485, "grad_norm": 16.34596061706543, "learning_rate": 1.2828714400178122e-06, "loss": 0.5585, "num_input_tokens_seen": 153375488, "step": 126120 }, { "epoch": 15.803157499060267, "grad_norm": 4.361399173736572, "learning_rate": 1.2825058099009714e-06, "loss": 0.4847, "num_input_tokens_seen": 153381856, "step": 126125 }, { "epoch": 15.803783986969052, "grad_norm": 2.9346718788146973, "learning_rate": 1.2821402242299008e-06, "loss": 0.4444, "num_input_tokens_seen": 153388128, "step": 126130 }, { "epoch": 15.804410474877836, "grad_norm": 12.532841682434082, "learning_rate": 1.2817746830089683e-06, "loss": 0.4343, "num_input_tokens_seen": 153394208, "step": 126135 }, { "epoch": 15.805036962786618, "grad_norm": 9.642844200134277, "learning_rate": 1.2814091862425453e-06, "loss": 0.5424, "num_input_tokens_seen": 153400384, "step": 126140 }, { "epoch": 15.805663450695402, "grad_norm": 2.7469265460968018, "learning_rate": 1.2810437339350035e-06, "loss": 0.4407, "num_input_tokens_seen": 153406368, "step": 126145 }, { "epoch": 15.806289938604184, "grad_norm": 4.818539619445801, "learning_rate": 1.280678326090709e-06, "loss": 0.4212, "num_input_tokens_seen": 153412832, "step": 126150 }, { "epoch": 15.806916426512968, "grad_norm": 16.840171813964844, "learning_rate": 1.2803129627140325e-06, "loss": 0.4515, "num_input_tokens_seen": 153419072, "step": 126155 }, { "epoch": 15.807542914421752, "grad_norm": 4.428297519683838, "learning_rate": 1.2799476438093427e-06, "loss": 0.4759, "num_input_tokens_seen": 153425024, "step": 126160 }, { "epoch": 15.808169402330535, "grad_norm": 7.740049839019775, "learning_rate": 1.2795823693810055e-06, "loss": 0.4341, "num_input_tokens_seen": 153430880, "step": 126165 }, { "epoch": 15.808795890239319, "grad_norm": 4.727623462677002, "learning_rate": 1.2792171394333907e-06, "loss": 0.4671, "num_input_tokens_seen": 153437024, "step": 126170 }, { "epoch": 15.809422378148101, "grad_norm": 3.459226131439209, "learning_rate": 1.2788519539708616e-06, "loss": 0.4638, "num_input_tokens_seen": 153443072, "step": 126175 }, { "epoch": 15.810048866056885, "grad_norm": 3.6834218502044678, "learning_rate": 1.2784868129977857e-06, "loss": 0.4551, "num_input_tokens_seen": 153449440, "step": 126180 }, { "epoch": 15.810675353965669, "grad_norm": 2.778808832168579, "learning_rate": 1.2781217165185306e-06, "loss": 0.4661, "num_input_tokens_seen": 153455552, "step": 126185 }, { "epoch": 15.811301841874451, "grad_norm": 3.184828996658325, "learning_rate": 1.2777566645374579e-06, "loss": 0.5129, "num_input_tokens_seen": 153461632, "step": 126190 }, { "epoch": 15.811928329783235, "grad_norm": 7.080048084259033, "learning_rate": 1.277391657058935e-06, "loss": 0.4796, "num_input_tokens_seen": 153467840, "step": 126195 }, { "epoch": 15.812554817692018, "grad_norm": 5.291894435882568, "learning_rate": 1.2770266940873239e-06, "loss": 0.4941, "num_input_tokens_seen": 153473408, "step": 126200 }, { "epoch": 15.813181305600802, "grad_norm": 2.552032470703125, "learning_rate": 1.2766617756269894e-06, "loss": 0.4563, "num_input_tokens_seen": 153479456, "step": 126205 }, { "epoch": 15.813807793509586, "grad_norm": 6.007048606872559, "learning_rate": 1.276296901682293e-06, "loss": 0.5582, "num_input_tokens_seen": 153485824, "step": 126210 }, { "epoch": 15.814434281418368, "grad_norm": 2.565687656402588, "learning_rate": 1.275932072257599e-06, "loss": 0.4917, "num_input_tokens_seen": 153492064, "step": 126215 }, { "epoch": 15.815060769327152, "grad_norm": 2.0220561027526855, "learning_rate": 1.2755672873572666e-06, "loss": 0.5041, "num_input_tokens_seen": 153497600, "step": 126220 }, { "epoch": 15.815687257235936, "grad_norm": 2.829582691192627, "learning_rate": 1.2752025469856598e-06, "loss": 0.4656, "num_input_tokens_seen": 153503712, "step": 126225 }, { "epoch": 15.816313745144718, "grad_norm": 3.232041835784912, "learning_rate": 1.2748378511471377e-06, "loss": 0.4349, "num_input_tokens_seen": 153509728, "step": 126230 }, { "epoch": 15.816940233053502, "grad_norm": 9.91690731048584, "learning_rate": 1.274473199846062e-06, "loss": 0.4331, "num_input_tokens_seen": 153516096, "step": 126235 }, { "epoch": 15.817566720962285, "grad_norm": 19.641279220581055, "learning_rate": 1.2741085930867897e-06, "loss": 0.5497, "num_input_tokens_seen": 153522496, "step": 126240 }, { "epoch": 15.818193208871069, "grad_norm": 3.30424165725708, "learning_rate": 1.2737440308736815e-06, "loss": 0.4085, "num_input_tokens_seen": 153528672, "step": 126245 }, { "epoch": 15.818819696779853, "grad_norm": 3.834433078765869, "learning_rate": 1.2733795132110966e-06, "loss": 0.4646, "num_input_tokens_seen": 153534912, "step": 126250 }, { "epoch": 15.819446184688635, "grad_norm": 4.682734489440918, "learning_rate": 1.2730150401033946e-06, "loss": 0.4409, "num_input_tokens_seen": 153540992, "step": 126255 }, { "epoch": 15.82007267259742, "grad_norm": 2.7253379821777344, "learning_rate": 1.2726506115549292e-06, "loss": 0.4514, "num_input_tokens_seen": 153547136, "step": 126260 }, { "epoch": 15.820699160506202, "grad_norm": 3.7117817401885986, "learning_rate": 1.2722862275700597e-06, "loss": 0.4361, "num_input_tokens_seen": 153553120, "step": 126265 }, { "epoch": 15.821325648414986, "grad_norm": 2.8956778049468994, "learning_rate": 1.2719218881531442e-06, "loss": 0.4278, "num_input_tokens_seen": 153559040, "step": 126270 }, { "epoch": 15.82195213632377, "grad_norm": 3.3634634017944336, "learning_rate": 1.2715575933085356e-06, "loss": 0.4108, "num_input_tokens_seen": 153565184, "step": 126275 }, { "epoch": 15.822578624232552, "grad_norm": 5.460035800933838, "learning_rate": 1.2711933430405914e-06, "loss": 0.428, "num_input_tokens_seen": 153571520, "step": 126280 }, { "epoch": 15.823205112141336, "grad_norm": 10.544893264770508, "learning_rate": 1.2708291373536652e-06, "loss": 0.4377, "num_input_tokens_seen": 153577696, "step": 126285 }, { "epoch": 15.823831600050118, "grad_norm": 7.303797245025635, "learning_rate": 1.2704649762521125e-06, "loss": 0.5557, "num_input_tokens_seen": 153583680, "step": 126290 }, { "epoch": 15.824458087958902, "grad_norm": 9.542223930358887, "learning_rate": 1.2701008597402858e-06, "loss": 0.5025, "num_input_tokens_seen": 153589216, "step": 126295 }, { "epoch": 15.825084575867686, "grad_norm": 7.288978576660156, "learning_rate": 1.2697367878225391e-06, "loss": 0.5486, "num_input_tokens_seen": 153595168, "step": 126300 }, { "epoch": 15.825711063776469, "grad_norm": 6.862716197967529, "learning_rate": 1.2693727605032269e-06, "loss": 0.4169, "num_input_tokens_seen": 153601344, "step": 126305 }, { "epoch": 15.826337551685253, "grad_norm": 4.215839862823486, "learning_rate": 1.2690087777866984e-06, "loss": 0.4638, "num_input_tokens_seen": 153607488, "step": 126310 }, { "epoch": 15.826964039594035, "grad_norm": 23.749380111694336, "learning_rate": 1.2686448396773082e-06, "loss": 0.4711, "num_input_tokens_seen": 153613920, "step": 126315 }, { "epoch": 15.827590527502819, "grad_norm": 2.3630874156951904, "learning_rate": 1.2682809461794049e-06, "loss": 0.5506, "num_input_tokens_seen": 153620096, "step": 126320 }, { "epoch": 15.828217015411603, "grad_norm": 5.698893070220947, "learning_rate": 1.2679170972973414e-06, "loss": 0.5231, "num_input_tokens_seen": 153626432, "step": 126325 }, { "epoch": 15.828843503320385, "grad_norm": 2.7827394008636475, "learning_rate": 1.2675532930354655e-06, "loss": 0.5438, "num_input_tokens_seen": 153632384, "step": 126330 }, { "epoch": 15.82946999122917, "grad_norm": 4.224429607391357, "learning_rate": 1.2671895333981294e-06, "loss": 0.4715, "num_input_tokens_seen": 153638624, "step": 126335 }, { "epoch": 15.830096479137953, "grad_norm": 3.653780937194824, "learning_rate": 1.2668258183896791e-06, "loss": 0.5001, "num_input_tokens_seen": 153644704, "step": 126340 }, { "epoch": 15.830722967046736, "grad_norm": 4.331582069396973, "learning_rate": 1.2664621480144656e-06, "loss": 0.4387, "num_input_tokens_seen": 153650784, "step": 126345 }, { "epoch": 15.83134945495552, "grad_norm": 2.4905991554260254, "learning_rate": 1.2660985222768368e-06, "loss": 0.471, "num_input_tokens_seen": 153657344, "step": 126350 }, { "epoch": 15.831975942864302, "grad_norm": 3.850381374359131, "learning_rate": 1.2657349411811387e-06, "loss": 0.4, "num_input_tokens_seen": 153663552, "step": 126355 }, { "epoch": 15.832602430773086, "grad_norm": 2.1335291862487793, "learning_rate": 1.2653714047317183e-06, "loss": 0.4689, "num_input_tokens_seen": 153669056, "step": 126360 }, { "epoch": 15.83322891868187, "grad_norm": 18.589330673217773, "learning_rate": 1.265007912932923e-06, "loss": 0.4993, "num_input_tokens_seen": 153675200, "step": 126365 }, { "epoch": 15.833855406590652, "grad_norm": 6.911520957946777, "learning_rate": 1.2646444657890995e-06, "loss": 0.5572, "num_input_tokens_seen": 153681600, "step": 126370 }, { "epoch": 15.834481894499437, "grad_norm": 13.11852741241455, "learning_rate": 1.264281063304591e-06, "loss": 0.4585, "num_input_tokens_seen": 153687648, "step": 126375 }, { "epoch": 15.835108382408219, "grad_norm": 2.606062650680542, "learning_rate": 1.2639177054837442e-06, "loss": 0.4547, "num_input_tokens_seen": 153693760, "step": 126380 }, { "epoch": 15.835734870317003, "grad_norm": 2.5627944469451904, "learning_rate": 1.2635543923309006e-06, "loss": 0.4412, "num_input_tokens_seen": 153699808, "step": 126385 }, { "epoch": 15.836361358225787, "grad_norm": 2.059512138366699, "learning_rate": 1.263191123850408e-06, "loss": 0.5082, "num_input_tokens_seen": 153705728, "step": 126390 }, { "epoch": 15.83698784613457, "grad_norm": 15.47765827178955, "learning_rate": 1.2628279000466054e-06, "loss": 0.4783, "num_input_tokens_seen": 153712000, "step": 126395 }, { "epoch": 15.837614334043353, "grad_norm": 4.444088459014893, "learning_rate": 1.2624647209238384e-06, "loss": 0.4189, "num_input_tokens_seen": 153718368, "step": 126400 }, { "epoch": 15.838240821952136, "grad_norm": 2.735175132751465, "learning_rate": 1.2621015864864472e-06, "loss": 0.4591, "num_input_tokens_seen": 153724768, "step": 126405 }, { "epoch": 15.83886730986092, "grad_norm": 13.552419662475586, "learning_rate": 1.2617384967387752e-06, "loss": 0.5078, "num_input_tokens_seen": 153730816, "step": 126410 }, { "epoch": 15.839493797769704, "grad_norm": 4.875233173370361, "learning_rate": 1.2613754516851612e-06, "loss": 0.4751, "num_input_tokens_seen": 153736928, "step": 126415 }, { "epoch": 15.840120285678486, "grad_norm": 4.711588382720947, "learning_rate": 1.2610124513299482e-06, "loss": 0.454, "num_input_tokens_seen": 153742912, "step": 126420 }, { "epoch": 15.84074677358727, "grad_norm": 5.319457054138184, "learning_rate": 1.2606494956774734e-06, "loss": 0.4193, "num_input_tokens_seen": 153749152, "step": 126425 }, { "epoch": 15.841373261496052, "grad_norm": 7.26522159576416, "learning_rate": 1.2602865847320777e-06, "loss": 0.4387, "num_input_tokens_seen": 153754976, "step": 126430 }, { "epoch": 15.841999749404836, "grad_norm": 2.827993392944336, "learning_rate": 1.2599237184981016e-06, "loss": 0.395, "num_input_tokens_seen": 153761088, "step": 126435 }, { "epoch": 15.84262623731362, "grad_norm": 4.059634208679199, "learning_rate": 1.2595608969798805e-06, "loss": 0.5444, "num_input_tokens_seen": 153767200, "step": 126440 }, { "epoch": 15.843252725222403, "grad_norm": 15.583688735961914, "learning_rate": 1.2591981201817554e-06, "loss": 0.5816, "num_input_tokens_seen": 153772384, "step": 126445 }, { "epoch": 15.843879213131187, "grad_norm": 7.103806018829346, "learning_rate": 1.25883538810806e-06, "loss": 0.4411, "num_input_tokens_seen": 153778624, "step": 126450 }, { "epoch": 15.84450570103997, "grad_norm": 4.419291019439697, "learning_rate": 1.258472700763133e-06, "loss": 0.5031, "num_input_tokens_seen": 153784928, "step": 126455 }, { "epoch": 15.845132188948753, "grad_norm": 4.964443206787109, "learning_rate": 1.2581100581513111e-06, "loss": 0.4841, "num_input_tokens_seen": 153791392, "step": 126460 }, { "epoch": 15.845758676857537, "grad_norm": 3.451230049133301, "learning_rate": 1.2577474602769303e-06, "loss": 0.4418, "num_input_tokens_seen": 153797728, "step": 126465 }, { "epoch": 15.84638516476632, "grad_norm": 4.836559772491455, "learning_rate": 1.2573849071443238e-06, "loss": 0.4577, "num_input_tokens_seen": 153803552, "step": 126470 }, { "epoch": 15.847011652675103, "grad_norm": 4.973491191864014, "learning_rate": 1.2570223987578284e-06, "loss": 0.4722, "num_input_tokens_seen": 153809792, "step": 126475 }, { "epoch": 15.847638140583888, "grad_norm": 11.528573036193848, "learning_rate": 1.2566599351217762e-06, "loss": 0.4627, "num_input_tokens_seen": 153815872, "step": 126480 }, { "epoch": 15.84826462849267, "grad_norm": 13.173447608947754, "learning_rate": 1.2562975162405028e-06, "loss": 0.4649, "num_input_tokens_seen": 153821984, "step": 126485 }, { "epoch": 15.848891116401454, "grad_norm": 3.673393964767456, "learning_rate": 1.255935142118339e-06, "loss": 0.4712, "num_input_tokens_seen": 153828192, "step": 126490 }, { "epoch": 15.849517604310236, "grad_norm": 2.4746289253234863, "learning_rate": 1.2555728127596178e-06, "loss": 0.4586, "num_input_tokens_seen": 153834560, "step": 126495 }, { "epoch": 15.85014409221902, "grad_norm": 2.8401620388031006, "learning_rate": 1.2552105281686738e-06, "loss": 0.3905, "num_input_tokens_seen": 153840512, "step": 126500 }, { "epoch": 15.850770580127804, "grad_norm": 9.325895309448242, "learning_rate": 1.2548482883498346e-06, "loss": 0.4106, "num_input_tokens_seen": 153846560, "step": 126505 }, { "epoch": 15.851397068036587, "grad_norm": 5.067841529846191, "learning_rate": 1.254486093307435e-06, "loss": 0.455, "num_input_tokens_seen": 153852320, "step": 126510 }, { "epoch": 15.85202355594537, "grad_norm": 10.581988334655762, "learning_rate": 1.2541239430458007e-06, "loss": 0.4809, "num_input_tokens_seen": 153858528, "step": 126515 }, { "epoch": 15.852650043854153, "grad_norm": 3.233440399169922, "learning_rate": 1.2537618375692662e-06, "loss": 0.448, "num_input_tokens_seen": 153864544, "step": 126520 }, { "epoch": 15.853276531762937, "grad_norm": 2.3190383911132812, "learning_rate": 1.253399776882157e-06, "loss": 0.4476, "num_input_tokens_seen": 153870336, "step": 126525 }, { "epoch": 15.853903019671721, "grad_norm": 4.445699691772461, "learning_rate": 1.253037760988805e-06, "loss": 0.4931, "num_input_tokens_seen": 153875968, "step": 126530 }, { "epoch": 15.854529507580503, "grad_norm": 2.0625991821289062, "learning_rate": 1.2526757898935349e-06, "loss": 0.4583, "num_input_tokens_seen": 153881792, "step": 126535 }, { "epoch": 15.855155995489287, "grad_norm": 3.4499294757843018, "learning_rate": 1.2523138636006781e-06, "loss": 0.4533, "num_input_tokens_seen": 153887872, "step": 126540 }, { "epoch": 15.85578248339807, "grad_norm": 2.5962588787078857, "learning_rate": 1.2519519821145588e-06, "loss": 0.5188, "num_input_tokens_seen": 153893920, "step": 126545 }, { "epoch": 15.856408971306854, "grad_norm": 3.1448283195495605, "learning_rate": 1.2515901454395042e-06, "loss": 0.4555, "num_input_tokens_seen": 153900064, "step": 126550 }, { "epoch": 15.857035459215638, "grad_norm": 8.114173889160156, "learning_rate": 1.2512283535798426e-06, "loss": 0.4609, "num_input_tokens_seen": 153906400, "step": 126555 }, { "epoch": 15.85766194712442, "grad_norm": 7.100866317749023, "learning_rate": 1.2508666065398962e-06, "loss": 0.4174, "num_input_tokens_seen": 153912640, "step": 126560 }, { "epoch": 15.858288435033204, "grad_norm": 2.6891837120056152, "learning_rate": 1.250504904323992e-06, "loss": 0.5075, "num_input_tokens_seen": 153918784, "step": 126565 }, { "epoch": 15.858914922941988, "grad_norm": 4.858277320861816, "learning_rate": 1.2501432469364539e-06, "loss": 0.4812, "num_input_tokens_seen": 153924704, "step": 126570 }, { "epoch": 15.85954141085077, "grad_norm": 2.458353281021118, "learning_rate": 1.2497816343816071e-06, "loss": 0.4575, "num_input_tokens_seen": 153931040, "step": 126575 }, { "epoch": 15.860167898759554, "grad_norm": 1.7017426490783691, "learning_rate": 1.249420066663773e-06, "loss": 0.4576, "num_input_tokens_seen": 153937344, "step": 126580 }, { "epoch": 15.860794386668337, "grad_norm": 4.626310348510742, "learning_rate": 1.249058543787276e-06, "loss": 0.4571, "num_input_tokens_seen": 153943424, "step": 126585 }, { "epoch": 15.86142087457712, "grad_norm": 5.491273880004883, "learning_rate": 1.2486970657564367e-06, "loss": 0.4797, "num_input_tokens_seen": 153949440, "step": 126590 }, { "epoch": 15.862047362485903, "grad_norm": 2.378328561782837, "learning_rate": 1.2483356325755797e-06, "loss": 0.4397, "num_input_tokens_seen": 153955584, "step": 126595 }, { "epoch": 15.862673850394687, "grad_norm": 4.841922760009766, "learning_rate": 1.2479742442490227e-06, "loss": 0.4308, "num_input_tokens_seen": 153961760, "step": 126600 }, { "epoch": 15.863300338303471, "grad_norm": 3.4607555866241455, "learning_rate": 1.2476129007810894e-06, "loss": 0.4227, "num_input_tokens_seen": 153967744, "step": 126605 }, { "epoch": 15.863926826212253, "grad_norm": 5.330154895782471, "learning_rate": 1.2472516021760972e-06, "loss": 0.474, "num_input_tokens_seen": 153973760, "step": 126610 }, { "epoch": 15.864553314121038, "grad_norm": 4.464949131011963, "learning_rate": 1.2468903484383677e-06, "loss": 0.4478, "num_input_tokens_seen": 153979904, "step": 126615 }, { "epoch": 15.865179802029822, "grad_norm": 5.1334757804870605, "learning_rate": 1.2465291395722207e-06, "loss": 0.4233, "num_input_tokens_seen": 153985728, "step": 126620 }, { "epoch": 15.865806289938604, "grad_norm": 4.080574035644531, "learning_rate": 1.2461679755819722e-06, "loss": 0.4823, "num_input_tokens_seen": 153992000, "step": 126625 }, { "epoch": 15.866432777847388, "grad_norm": 4.631392955780029, "learning_rate": 1.2458068564719427e-06, "loss": 0.4164, "num_input_tokens_seen": 153998368, "step": 126630 }, { "epoch": 15.86705926575617, "grad_norm": 7.051239013671875, "learning_rate": 1.2454457822464476e-06, "loss": 0.4429, "num_input_tokens_seen": 154004160, "step": 126635 }, { "epoch": 15.867685753664954, "grad_norm": 2.6548361778259277, "learning_rate": 1.2450847529098065e-06, "loss": 0.4753, "num_input_tokens_seen": 154010272, "step": 126640 }, { "epoch": 15.868312241573738, "grad_norm": 2.316014051437378, "learning_rate": 1.2447237684663322e-06, "loss": 0.4567, "num_input_tokens_seen": 154016576, "step": 126645 }, { "epoch": 15.86893872948252, "grad_norm": 4.19951057434082, "learning_rate": 1.244362828920344e-06, "loss": 0.4769, "num_input_tokens_seen": 154022752, "step": 126650 }, { "epoch": 15.869565217391305, "grad_norm": 2.5492398738861084, "learning_rate": 1.2440019342761544e-06, "loss": 0.4482, "num_input_tokens_seen": 154029184, "step": 126655 }, { "epoch": 15.870191705300087, "grad_norm": 10.066347122192383, "learning_rate": 1.2436410845380792e-06, "loss": 0.4283, "num_input_tokens_seen": 154035232, "step": 126660 }, { "epoch": 15.870818193208871, "grad_norm": 10.232466697692871, "learning_rate": 1.2432802797104332e-06, "loss": 0.4882, "num_input_tokens_seen": 154040544, "step": 126665 }, { "epoch": 15.871444681117655, "grad_norm": 10.337692260742188, "learning_rate": 1.2429195197975319e-06, "loss": 0.4502, "num_input_tokens_seen": 154046880, "step": 126670 }, { "epoch": 15.872071169026437, "grad_norm": 2.0846309661865234, "learning_rate": 1.2425588048036846e-06, "loss": 0.4391, "num_input_tokens_seen": 154051648, "step": 126675 }, { "epoch": 15.872697656935221, "grad_norm": 2.4161808490753174, "learning_rate": 1.2421981347332056e-06, "loss": 0.4644, "num_input_tokens_seen": 154057568, "step": 126680 }, { "epoch": 15.873324144844005, "grad_norm": 13.911294937133789, "learning_rate": 1.2418375095904094e-06, "loss": 0.5871, "num_input_tokens_seen": 154063744, "step": 126685 }, { "epoch": 15.873950632752788, "grad_norm": 3.149195432662964, "learning_rate": 1.2414769293796037e-06, "loss": 0.4684, "num_input_tokens_seen": 154069952, "step": 126690 }, { "epoch": 15.874577120661572, "grad_norm": 2.4155995845794678, "learning_rate": 1.2411163941051024e-06, "loss": 0.436, "num_input_tokens_seen": 154076192, "step": 126695 }, { "epoch": 15.875203608570354, "grad_norm": 2.819638252258301, "learning_rate": 1.240755903771214e-06, "loss": 0.4598, "num_input_tokens_seen": 154081888, "step": 126700 }, { "epoch": 15.875830096479138, "grad_norm": 4.837086200714111, "learning_rate": 1.2403954583822508e-06, "loss": 0.5277, "num_input_tokens_seen": 154087904, "step": 126705 }, { "epoch": 15.87645658438792, "grad_norm": 2.747227430343628, "learning_rate": 1.2400350579425192e-06, "loss": 0.5042, "num_input_tokens_seen": 154093952, "step": 126710 }, { "epoch": 15.877083072296704, "grad_norm": 13.029219627380371, "learning_rate": 1.239674702456331e-06, "loss": 0.476, "num_input_tokens_seen": 154100000, "step": 126715 }, { "epoch": 15.877709560205489, "grad_norm": 17.24297332763672, "learning_rate": 1.2393143919279914e-06, "loss": 0.5008, "num_input_tokens_seen": 154106432, "step": 126720 }, { "epoch": 15.87833604811427, "grad_norm": 12.327491760253906, "learning_rate": 1.2389541263618122e-06, "loss": 0.4821, "num_input_tokens_seen": 154112928, "step": 126725 }, { "epoch": 15.878962536023055, "grad_norm": 2.0830650329589844, "learning_rate": 1.2385939057620973e-06, "loss": 0.5175, "num_input_tokens_seen": 154119232, "step": 126730 }, { "epoch": 15.879589023931839, "grad_norm": 8.98454761505127, "learning_rate": 1.2382337301331538e-06, "loss": 0.446, "num_input_tokens_seen": 154124672, "step": 126735 }, { "epoch": 15.880215511840621, "grad_norm": 2.5399911403656006, "learning_rate": 1.2378735994792907e-06, "loss": 0.5365, "num_input_tokens_seen": 154130784, "step": 126740 }, { "epoch": 15.880841999749405, "grad_norm": 3.908344030380249, "learning_rate": 1.2375135138048105e-06, "loss": 0.4436, "num_input_tokens_seen": 154136608, "step": 126745 }, { "epoch": 15.881468487658188, "grad_norm": 6.258768558502197, "learning_rate": 1.2371534731140205e-06, "loss": 0.4815, "num_input_tokens_seen": 154142688, "step": 126750 }, { "epoch": 15.882094975566972, "grad_norm": 7.922509670257568, "learning_rate": 1.2367934774112234e-06, "loss": 0.4356, "num_input_tokens_seen": 154148896, "step": 126755 }, { "epoch": 15.882721463475756, "grad_norm": 8.403318405151367, "learning_rate": 1.2364335267007239e-06, "loss": 0.4925, "num_input_tokens_seen": 154154528, "step": 126760 }, { "epoch": 15.883347951384538, "grad_norm": 7.72749662399292, "learning_rate": 1.2360736209868274e-06, "loss": 0.4727, "num_input_tokens_seen": 154160448, "step": 126765 }, { "epoch": 15.883974439293322, "grad_norm": 20.158849716186523, "learning_rate": 1.235713760273834e-06, "loss": 0.5016, "num_input_tokens_seen": 154166720, "step": 126770 }, { "epoch": 15.884600927202104, "grad_norm": 5.7692084312438965, "learning_rate": 1.2353539445660468e-06, "loss": 0.4904, "num_input_tokens_seen": 154172960, "step": 126775 }, { "epoch": 15.885227415110888, "grad_norm": 5.2907633781433105, "learning_rate": 1.2349941738677702e-06, "loss": 0.4162, "num_input_tokens_seen": 154179104, "step": 126780 }, { "epoch": 15.885853903019672, "grad_norm": 3.5487260818481445, "learning_rate": 1.2346344481833022e-06, "loss": 0.3947, "num_input_tokens_seen": 154184928, "step": 126785 }, { "epoch": 15.886480390928455, "grad_norm": 9.165453910827637, "learning_rate": 1.234274767516947e-06, "loss": 0.4897, "num_input_tokens_seen": 154191232, "step": 126790 }, { "epoch": 15.887106878837239, "grad_norm": 11.99585247039795, "learning_rate": 1.2339151318730007e-06, "loss": 0.4668, "num_input_tokens_seen": 154197472, "step": 126795 }, { "epoch": 15.887733366746021, "grad_norm": 12.918111801147461, "learning_rate": 1.233555541255766e-06, "loss": 0.5184, "num_input_tokens_seen": 154203616, "step": 126800 }, { "epoch": 15.888359854654805, "grad_norm": 7.021298885345459, "learning_rate": 1.2331959956695428e-06, "loss": 0.4241, "num_input_tokens_seen": 154209344, "step": 126805 }, { "epoch": 15.888986342563589, "grad_norm": 2.5204296112060547, "learning_rate": 1.2328364951186272e-06, "loss": 0.4739, "num_input_tokens_seen": 154215488, "step": 126810 }, { "epoch": 15.889612830472371, "grad_norm": 6.071809768676758, "learning_rate": 1.2324770396073193e-06, "loss": 0.4747, "num_input_tokens_seen": 154221728, "step": 126815 }, { "epoch": 15.890239318381155, "grad_norm": 3.3820338249206543, "learning_rate": 1.2321176291399152e-06, "loss": 0.5004, "num_input_tokens_seen": 154227424, "step": 126820 }, { "epoch": 15.890865806289938, "grad_norm": 3.392982006072998, "learning_rate": 1.2317582637207138e-06, "loss": 0.4522, "num_input_tokens_seen": 154233504, "step": 126825 }, { "epoch": 15.891492294198722, "grad_norm": 1.974489688873291, "learning_rate": 1.231398943354009e-06, "loss": 0.4331, "num_input_tokens_seen": 154239520, "step": 126830 }, { "epoch": 15.892118782107506, "grad_norm": 2.593329668045044, "learning_rate": 1.2310396680441e-06, "loss": 0.4608, "num_input_tokens_seen": 154245568, "step": 126835 }, { "epoch": 15.892745270016288, "grad_norm": 2.218125343322754, "learning_rate": 1.230680437795279e-06, "loss": 0.4517, "num_input_tokens_seen": 154251936, "step": 126840 }, { "epoch": 15.893371757925072, "grad_norm": 10.079992294311523, "learning_rate": 1.2303212526118442e-06, "loss": 0.4751, "num_input_tokens_seen": 154258208, "step": 126845 }, { "epoch": 15.893998245833856, "grad_norm": 2.757080316543579, "learning_rate": 1.229962112498086e-06, "loss": 0.4935, "num_input_tokens_seen": 154264192, "step": 126850 }, { "epoch": 15.894624733742639, "grad_norm": 10.845321655273438, "learning_rate": 1.2296030174583023e-06, "loss": 0.4896, "num_input_tokens_seen": 154270016, "step": 126855 }, { "epoch": 15.895251221651423, "grad_norm": 4.113204002380371, "learning_rate": 1.2292439674967833e-06, "loss": 0.4523, "num_input_tokens_seen": 154275968, "step": 126860 }, { "epoch": 15.895877709560205, "grad_norm": 3.0923259258270264, "learning_rate": 1.2288849626178223e-06, "loss": 0.477, "num_input_tokens_seen": 154282176, "step": 126865 }, { "epoch": 15.896504197468989, "grad_norm": 6.410973072052002, "learning_rate": 1.228526002825713e-06, "loss": 0.4408, "num_input_tokens_seen": 154288384, "step": 126870 }, { "epoch": 15.897130685377773, "grad_norm": 4.830929279327393, "learning_rate": 1.228167088124747e-06, "loss": 0.4473, "num_input_tokens_seen": 154294240, "step": 126875 }, { "epoch": 15.897757173286555, "grad_norm": 3.938312530517578, "learning_rate": 1.227808218519213e-06, "loss": 0.551, "num_input_tokens_seen": 154300384, "step": 126880 }, { "epoch": 15.89838366119534, "grad_norm": 7.212985515594482, "learning_rate": 1.2274493940134042e-06, "loss": 0.4725, "num_input_tokens_seen": 154306752, "step": 126885 }, { "epoch": 15.899010149104122, "grad_norm": 8.162418365478516, "learning_rate": 1.2270906146116102e-06, "loss": 0.5589, "num_input_tokens_seen": 154313248, "step": 126890 }, { "epoch": 15.899636637012906, "grad_norm": 4.548216342926025, "learning_rate": 1.2267318803181188e-06, "loss": 0.4304, "num_input_tokens_seen": 154319616, "step": 126895 }, { "epoch": 15.90026312492169, "grad_norm": 2.746798276901245, "learning_rate": 1.2263731911372217e-06, "loss": 0.4788, "num_input_tokens_seen": 154325504, "step": 126900 }, { "epoch": 15.900889612830472, "grad_norm": 2.9416236877441406, "learning_rate": 1.2260145470732043e-06, "loss": 0.4393, "num_input_tokens_seen": 154331584, "step": 126905 }, { "epoch": 15.901516100739256, "grad_norm": 3.3582282066345215, "learning_rate": 1.2256559481303576e-06, "loss": 0.5031, "num_input_tokens_seen": 154337792, "step": 126910 }, { "epoch": 15.902142588648038, "grad_norm": 2.8938117027282715, "learning_rate": 1.225297394312966e-06, "loss": 0.4468, "num_input_tokens_seen": 154343840, "step": 126915 }, { "epoch": 15.902769076556822, "grad_norm": 4.944098949432373, "learning_rate": 1.2249388856253175e-06, "loss": 0.4737, "num_input_tokens_seen": 154349184, "step": 126920 }, { "epoch": 15.903395564465606, "grad_norm": 8.599722862243652, "learning_rate": 1.2245804220716995e-06, "loss": 0.4984, "num_input_tokens_seen": 154355168, "step": 126925 }, { "epoch": 15.904022052374389, "grad_norm": 6.062053203582764, "learning_rate": 1.2242220036563956e-06, "loss": 0.5012, "num_input_tokens_seen": 154361120, "step": 126930 }, { "epoch": 15.904648540283173, "grad_norm": 2.5983192920684814, "learning_rate": 1.2238636303836937e-06, "loss": 0.4334, "num_input_tokens_seen": 154367264, "step": 126935 }, { "epoch": 15.905275028191955, "grad_norm": 15.847603797912598, "learning_rate": 1.2235053022578758e-06, "loss": 0.4544, "num_input_tokens_seen": 154373024, "step": 126940 }, { "epoch": 15.905901516100739, "grad_norm": 2.310451030731201, "learning_rate": 1.2231470192832279e-06, "loss": 0.4476, "num_input_tokens_seen": 154379456, "step": 126945 }, { "epoch": 15.906528004009523, "grad_norm": 3.466709852218628, "learning_rate": 1.2227887814640316e-06, "loss": 0.42, "num_input_tokens_seen": 154385760, "step": 126950 }, { "epoch": 15.907154491918305, "grad_norm": 10.886136054992676, "learning_rate": 1.2224305888045728e-06, "loss": 0.5577, "num_input_tokens_seen": 154391808, "step": 126955 }, { "epoch": 15.90778097982709, "grad_norm": 5.261674404144287, "learning_rate": 1.2220724413091306e-06, "loss": 0.4574, "num_input_tokens_seen": 154397824, "step": 126960 }, { "epoch": 15.908407467735874, "grad_norm": 11.11817741394043, "learning_rate": 1.2217143389819886e-06, "loss": 0.4561, "num_input_tokens_seen": 154403520, "step": 126965 }, { "epoch": 15.909033955644656, "grad_norm": 6.058305263519287, "learning_rate": 1.2213562818274298e-06, "loss": 0.4347, "num_input_tokens_seen": 154409536, "step": 126970 }, { "epoch": 15.90966044355344, "grad_norm": 10.758554458618164, "learning_rate": 1.2209982698497325e-06, "loss": 0.5039, "num_input_tokens_seen": 154414784, "step": 126975 }, { "epoch": 15.910286931462222, "grad_norm": 4.579913139343262, "learning_rate": 1.2206403030531778e-06, "loss": 0.4867, "num_input_tokens_seen": 154420832, "step": 126980 }, { "epoch": 15.910913419371006, "grad_norm": 18.003061294555664, "learning_rate": 1.2202823814420457e-06, "loss": 0.478, "num_input_tokens_seen": 154426848, "step": 126985 }, { "epoch": 15.91153990727979, "grad_norm": 3.2807416915893555, "learning_rate": 1.219924505020617e-06, "loss": 0.4721, "num_input_tokens_seen": 154433088, "step": 126990 }, { "epoch": 15.912166395188573, "grad_norm": 4.051612854003906, "learning_rate": 1.2195666737931676e-06, "loss": 0.4242, "num_input_tokens_seen": 154439264, "step": 126995 }, { "epoch": 15.912792883097357, "grad_norm": 3.2752108573913574, "learning_rate": 1.2192088877639785e-06, "loss": 0.423, "num_input_tokens_seen": 154444800, "step": 127000 }, { "epoch": 15.913419371006139, "grad_norm": 5.003026962280273, "learning_rate": 1.2188511469373242e-06, "loss": 0.4518, "num_input_tokens_seen": 154450880, "step": 127005 }, { "epoch": 15.914045858914923, "grad_norm": 3.4388012886047363, "learning_rate": 1.218493451317485e-06, "loss": 0.4542, "num_input_tokens_seen": 154457088, "step": 127010 }, { "epoch": 15.914672346823707, "grad_norm": 6.926634311676025, "learning_rate": 1.2181358009087345e-06, "loss": 0.4454, "num_input_tokens_seen": 154463360, "step": 127015 }, { "epoch": 15.91529883473249, "grad_norm": 3.6537926197052, "learning_rate": 1.2177781957153517e-06, "loss": 0.4411, "num_input_tokens_seen": 154469216, "step": 127020 }, { "epoch": 15.915925322641273, "grad_norm": 8.851405143737793, "learning_rate": 1.2174206357416085e-06, "loss": 0.4686, "num_input_tokens_seen": 154475104, "step": 127025 }, { "epoch": 15.916551810550056, "grad_norm": 1.9291335344314575, "learning_rate": 1.217063120991784e-06, "loss": 0.4151, "num_input_tokens_seen": 154481440, "step": 127030 }, { "epoch": 15.91717829845884, "grad_norm": 1.5510494709014893, "learning_rate": 1.2167056514701486e-06, "loss": 0.4233, "num_input_tokens_seen": 154487648, "step": 127035 }, { "epoch": 15.917804786367624, "grad_norm": 1.7600008249282837, "learning_rate": 1.2163482271809796e-06, "loss": 0.4471, "num_input_tokens_seen": 154493088, "step": 127040 }, { "epoch": 15.918431274276406, "grad_norm": 2.4564361572265625, "learning_rate": 1.2159908481285471e-06, "loss": 0.4509, "num_input_tokens_seen": 154499424, "step": 127045 }, { "epoch": 15.91905776218519, "grad_norm": 13.94851303100586, "learning_rate": 1.215633514317125e-06, "loss": 0.4615, "num_input_tokens_seen": 154505696, "step": 127050 }, { "epoch": 15.919684250093972, "grad_norm": 4.971142292022705, "learning_rate": 1.2152762257509882e-06, "loss": 0.4978, "num_input_tokens_seen": 154511872, "step": 127055 }, { "epoch": 15.920310738002756, "grad_norm": 13.899758338928223, "learning_rate": 1.2149189824344039e-06, "loss": 0.4652, "num_input_tokens_seen": 154518048, "step": 127060 }, { "epoch": 15.92093722591154, "grad_norm": 2.9441640377044678, "learning_rate": 1.2145617843716472e-06, "loss": 0.4394, "num_input_tokens_seen": 154524000, "step": 127065 }, { "epoch": 15.921563713820323, "grad_norm": 11.709407806396484, "learning_rate": 1.2142046315669858e-06, "loss": 0.5167, "num_input_tokens_seen": 154530208, "step": 127070 }, { "epoch": 15.922190201729107, "grad_norm": 9.135396003723145, "learning_rate": 1.2138475240246912e-06, "loss": 0.4507, "num_input_tokens_seen": 154536192, "step": 127075 }, { "epoch": 15.92281668963789, "grad_norm": 15.232192039489746, "learning_rate": 1.2134904617490318e-06, "loss": 0.5089, "num_input_tokens_seen": 154542080, "step": 127080 }, { "epoch": 15.923443177546673, "grad_norm": 2.238067388534546, "learning_rate": 1.2131334447442794e-06, "loss": 0.424, "num_input_tokens_seen": 154548416, "step": 127085 }, { "epoch": 15.924069665455457, "grad_norm": 5.8248066902160645, "learning_rate": 1.212776473014699e-06, "loss": 0.4945, "num_input_tokens_seen": 154554432, "step": 127090 }, { "epoch": 15.92469615336424, "grad_norm": 10.548114776611328, "learning_rate": 1.2124195465645616e-06, "loss": 0.47, "num_input_tokens_seen": 154560640, "step": 127095 }, { "epoch": 15.925322641273024, "grad_norm": 6.82084846496582, "learning_rate": 1.2120626653981315e-06, "loss": 0.5008, "num_input_tokens_seen": 154566560, "step": 127100 }, { "epoch": 15.925949129181808, "grad_norm": 2.952404022216797, "learning_rate": 1.2117058295196766e-06, "loss": 0.5057, "num_input_tokens_seen": 154572672, "step": 127105 }, { "epoch": 15.92657561709059, "grad_norm": 4.0337018966674805, "learning_rate": 1.2113490389334654e-06, "loss": 0.4552, "num_input_tokens_seen": 154578752, "step": 127110 }, { "epoch": 15.927202104999374, "grad_norm": 3.2054314613342285, "learning_rate": 1.21099229364376e-06, "loss": 0.4611, "num_input_tokens_seen": 154584672, "step": 127115 }, { "epoch": 15.927828592908156, "grad_norm": 2.860945463180542, "learning_rate": 1.2106355936548286e-06, "loss": 0.4946, "num_input_tokens_seen": 154590880, "step": 127120 }, { "epoch": 15.92845508081694, "grad_norm": 3.6731925010681152, "learning_rate": 1.2102789389709335e-06, "loss": 0.4706, "num_input_tokens_seen": 154596864, "step": 127125 }, { "epoch": 15.929081568725724, "grad_norm": 7.596631050109863, "learning_rate": 1.2099223295963413e-06, "loss": 0.4725, "num_input_tokens_seen": 154603040, "step": 127130 }, { "epoch": 15.929708056634507, "grad_norm": 3.5828332901000977, "learning_rate": 1.2095657655353126e-06, "loss": 0.4466, "num_input_tokens_seen": 154608896, "step": 127135 }, { "epoch": 15.93033454454329, "grad_norm": 6.376881122589111, "learning_rate": 1.209209246792114e-06, "loss": 0.4638, "num_input_tokens_seen": 154615136, "step": 127140 }, { "epoch": 15.930961032452073, "grad_norm": 3.2730753421783447, "learning_rate": 1.2088527733710043e-06, "loss": 0.4108, "num_input_tokens_seen": 154621408, "step": 127145 }, { "epoch": 15.931587520360857, "grad_norm": 2.5542426109313965, "learning_rate": 1.2084963452762483e-06, "loss": 0.4601, "num_input_tokens_seen": 154627680, "step": 127150 }, { "epoch": 15.932214008269641, "grad_norm": 4.630523681640625, "learning_rate": 1.208139962512105e-06, "loss": 0.4249, "num_input_tokens_seen": 154633600, "step": 127155 }, { "epoch": 15.932840496178423, "grad_norm": 10.960165977478027, "learning_rate": 1.2077836250828374e-06, "loss": 0.4592, "num_input_tokens_seen": 154639936, "step": 127160 }, { "epoch": 15.933466984087207, "grad_norm": 3.9480302333831787, "learning_rate": 1.2074273329927039e-06, "loss": 0.4659, "num_input_tokens_seen": 154645760, "step": 127165 }, { "epoch": 15.93409347199599, "grad_norm": 15.237834930419922, "learning_rate": 1.207071086245965e-06, "loss": 0.4888, "num_input_tokens_seen": 154652096, "step": 127170 }, { "epoch": 15.934719959904774, "grad_norm": 6.8958940505981445, "learning_rate": 1.2067148848468818e-06, "loss": 0.4667, "num_input_tokens_seen": 154658112, "step": 127175 }, { "epoch": 15.935346447813558, "grad_norm": 3.9700372219085693, "learning_rate": 1.2063587287997096e-06, "loss": 0.4456, "num_input_tokens_seen": 154664640, "step": 127180 }, { "epoch": 15.93597293572234, "grad_norm": 5.123936176300049, "learning_rate": 1.2060026181087086e-06, "loss": 0.4626, "num_input_tokens_seen": 154670880, "step": 127185 }, { "epoch": 15.936599423631124, "grad_norm": 2.724355697631836, "learning_rate": 1.2056465527781364e-06, "loss": 0.4468, "num_input_tokens_seen": 154677152, "step": 127190 }, { "epoch": 15.937225911539908, "grad_norm": 2.9376659393310547, "learning_rate": 1.2052905328122511e-06, "loss": 0.45, "num_input_tokens_seen": 154683648, "step": 127195 }, { "epoch": 15.93785239944869, "grad_norm": 2.2684106826782227, "learning_rate": 1.204934558215306e-06, "loss": 0.4254, "num_input_tokens_seen": 154689760, "step": 127200 }, { "epoch": 15.938478887357475, "grad_norm": 2.817141056060791, "learning_rate": 1.2045786289915605e-06, "loss": 0.4383, "num_input_tokens_seen": 154695360, "step": 127205 }, { "epoch": 15.939105375266257, "grad_norm": 2.8036513328552246, "learning_rate": 1.2042227451452675e-06, "loss": 0.4506, "num_input_tokens_seen": 154701664, "step": 127210 }, { "epoch": 15.93973186317504, "grad_norm": 5.397036552429199, "learning_rate": 1.203866906680684e-06, "loss": 0.4217, "num_input_tokens_seen": 154708064, "step": 127215 }, { "epoch": 15.940358351083823, "grad_norm": 3.8761637210845947, "learning_rate": 1.203511113602062e-06, "loss": 0.475, "num_input_tokens_seen": 154714336, "step": 127220 }, { "epoch": 15.940984838992607, "grad_norm": 3.270003080368042, "learning_rate": 1.203155365913658e-06, "loss": 0.4517, "num_input_tokens_seen": 154720608, "step": 127225 }, { "epoch": 15.941611326901391, "grad_norm": 3.1564383506774902, "learning_rate": 1.2027996636197226e-06, "loss": 0.4606, "num_input_tokens_seen": 154726816, "step": 127230 }, { "epoch": 15.942237814810174, "grad_norm": 2.9590320587158203, "learning_rate": 1.202444006724509e-06, "loss": 0.467, "num_input_tokens_seen": 154732704, "step": 127235 }, { "epoch": 15.942864302718958, "grad_norm": 10.52098560333252, "learning_rate": 1.2020883952322721e-06, "loss": 0.471, "num_input_tokens_seen": 154738816, "step": 127240 }, { "epoch": 15.943490790627742, "grad_norm": 2.629863977432251, "learning_rate": 1.2017328291472601e-06, "loss": 0.4517, "num_input_tokens_seen": 154744928, "step": 127245 }, { "epoch": 15.944117278536524, "grad_norm": 10.240187644958496, "learning_rate": 1.2013773084737268e-06, "loss": 0.4569, "num_input_tokens_seen": 154750752, "step": 127250 }, { "epoch": 15.944743766445308, "grad_norm": 9.275253295898438, "learning_rate": 1.20102183321592e-06, "loss": 0.4972, "num_input_tokens_seen": 154756736, "step": 127255 }, { "epoch": 15.94537025435409, "grad_norm": 3.9765279293060303, "learning_rate": 1.2006664033780924e-06, "loss": 0.4637, "num_input_tokens_seen": 154762976, "step": 127260 }, { "epoch": 15.945996742262874, "grad_norm": 2.5679728984832764, "learning_rate": 1.2003110189644911e-06, "loss": 0.4472, "num_input_tokens_seen": 154769344, "step": 127265 }, { "epoch": 15.946623230171658, "grad_norm": 4.386092185974121, "learning_rate": 1.1999556799793677e-06, "loss": 0.4606, "num_input_tokens_seen": 154775136, "step": 127270 }, { "epoch": 15.94724971808044, "grad_norm": 4.226742744445801, "learning_rate": 1.1996003864269678e-06, "loss": 0.4051, "num_input_tokens_seen": 154781120, "step": 127275 }, { "epoch": 15.947876205989225, "grad_norm": 3.3251450061798096, "learning_rate": 1.19924513831154e-06, "loss": 0.5116, "num_input_tokens_seen": 154787200, "step": 127280 }, { "epoch": 15.948502693898007, "grad_norm": 8.52251148223877, "learning_rate": 1.1988899356373325e-06, "loss": 0.4257, "num_input_tokens_seen": 154793312, "step": 127285 }, { "epoch": 15.949129181806791, "grad_norm": 5.360141754150391, "learning_rate": 1.1985347784085926e-06, "loss": 0.4891, "num_input_tokens_seen": 154799552, "step": 127290 }, { "epoch": 15.949755669715575, "grad_norm": 3.292994737625122, "learning_rate": 1.1981796666295648e-06, "loss": 0.4406, "num_input_tokens_seen": 154805728, "step": 127295 }, { "epoch": 15.950382157624357, "grad_norm": 2.2950689792633057, "learning_rate": 1.1978246003044947e-06, "loss": 0.5145, "num_input_tokens_seen": 154811936, "step": 127300 }, { "epoch": 15.951008645533141, "grad_norm": 11.892332077026367, "learning_rate": 1.19746957943763e-06, "loss": 0.4928, "num_input_tokens_seen": 154818048, "step": 127305 }, { "epoch": 15.951635133441924, "grad_norm": 3.2868313789367676, "learning_rate": 1.1971146040332121e-06, "loss": 0.4677, "num_input_tokens_seen": 154823904, "step": 127310 }, { "epoch": 15.952261621350708, "grad_norm": 3.5118353366851807, "learning_rate": 1.1967596740954873e-06, "loss": 0.4766, "num_input_tokens_seen": 154830048, "step": 127315 }, { "epoch": 15.952888109259492, "grad_norm": 2.5940914154052734, "learning_rate": 1.1964047896286973e-06, "loss": 0.4005, "num_input_tokens_seen": 154836128, "step": 127320 }, { "epoch": 15.953514597168274, "grad_norm": 2.2528140544891357, "learning_rate": 1.1960499506370871e-06, "loss": 0.4708, "num_input_tokens_seen": 154842080, "step": 127325 }, { "epoch": 15.954141085077058, "grad_norm": 14.756070137023926, "learning_rate": 1.195695157124897e-06, "loss": 0.5684, "num_input_tokens_seen": 154848512, "step": 127330 }, { "epoch": 15.95476757298584, "grad_norm": 2.355315685272217, "learning_rate": 1.1953404090963705e-06, "loss": 0.4722, "num_input_tokens_seen": 154854560, "step": 127335 }, { "epoch": 15.955394060894625, "grad_norm": 2.6858162879943848, "learning_rate": 1.1949857065557474e-06, "loss": 0.413, "num_input_tokens_seen": 154860672, "step": 127340 }, { "epoch": 15.956020548803409, "grad_norm": 6.451364994049072, "learning_rate": 1.1946310495072699e-06, "loss": 0.4817, "num_input_tokens_seen": 154867008, "step": 127345 }, { "epoch": 15.95664703671219, "grad_norm": 3.2776641845703125, "learning_rate": 1.194276437955177e-06, "loss": 0.5284, "num_input_tokens_seen": 154873120, "step": 127350 }, { "epoch": 15.957273524620975, "grad_norm": 5.118459224700928, "learning_rate": 1.1939218719037083e-06, "loss": 0.4276, "num_input_tokens_seen": 154879136, "step": 127355 }, { "epoch": 15.957900012529759, "grad_norm": 3.925464630126953, "learning_rate": 1.1935673513571056e-06, "loss": 0.4712, "num_input_tokens_seen": 154885408, "step": 127360 }, { "epoch": 15.958526500438541, "grad_norm": 2.824092149734497, "learning_rate": 1.1932128763196037e-06, "loss": 0.4419, "num_input_tokens_seen": 154891296, "step": 127365 }, { "epoch": 15.959152988347325, "grad_norm": 2.42836332321167, "learning_rate": 1.192858446795444e-06, "loss": 0.4857, "num_input_tokens_seen": 154897504, "step": 127370 }, { "epoch": 15.959779476256108, "grad_norm": 2.813556671142578, "learning_rate": 1.1925040627888606e-06, "loss": 0.453, "num_input_tokens_seen": 154903712, "step": 127375 }, { "epoch": 15.960405964164892, "grad_norm": 12.691375732421875, "learning_rate": 1.192149724304093e-06, "loss": 0.5538, "num_input_tokens_seen": 154909952, "step": 127380 }, { "epoch": 15.961032452073676, "grad_norm": 5.42051887512207, "learning_rate": 1.1917954313453777e-06, "loss": 0.4186, "num_input_tokens_seen": 154916032, "step": 127385 }, { "epoch": 15.961658939982458, "grad_norm": 3.2709460258483887, "learning_rate": 1.1914411839169487e-06, "loss": 0.4361, "num_input_tokens_seen": 154921984, "step": 127390 }, { "epoch": 15.962285427891242, "grad_norm": 7.218592643737793, "learning_rate": 1.1910869820230426e-06, "loss": 0.45, "num_input_tokens_seen": 154928288, "step": 127395 }, { "epoch": 15.962911915800024, "grad_norm": 4.544053554534912, "learning_rate": 1.190732825667895e-06, "loss": 0.538, "num_input_tokens_seen": 154934400, "step": 127400 }, { "epoch": 15.963538403708808, "grad_norm": 14.979517936706543, "learning_rate": 1.1903787148557378e-06, "loss": 0.5541, "num_input_tokens_seen": 154940768, "step": 127405 }, { "epoch": 15.964164891617592, "grad_norm": 3.343679666519165, "learning_rate": 1.1900246495908074e-06, "loss": 0.5332, "num_input_tokens_seen": 154946848, "step": 127410 }, { "epoch": 15.964791379526375, "grad_norm": 6.481024265289307, "learning_rate": 1.1896706298773342e-06, "loss": 0.464, "num_input_tokens_seen": 154953024, "step": 127415 }, { "epoch": 15.965417867435159, "grad_norm": 2.725818634033203, "learning_rate": 1.1893166557195523e-06, "loss": 0.4592, "num_input_tokens_seen": 154958944, "step": 127420 }, { "epoch": 15.966044355343941, "grad_norm": 5.559958457946777, "learning_rate": 1.1889627271216947e-06, "loss": 0.4279, "num_input_tokens_seen": 154965376, "step": 127425 }, { "epoch": 15.966670843252725, "grad_norm": 3.084367036819458, "learning_rate": 1.1886088440879907e-06, "loss": 0.4327, "num_input_tokens_seen": 154971328, "step": 127430 }, { "epoch": 15.96729733116151, "grad_norm": 17.954561233520508, "learning_rate": 1.188255006622674e-06, "loss": 0.4675, "num_input_tokens_seen": 154977312, "step": 127435 }, { "epoch": 15.967923819070291, "grad_norm": 5.6704487800598145, "learning_rate": 1.1879012147299718e-06, "loss": 0.4292, "num_input_tokens_seen": 154983520, "step": 127440 }, { "epoch": 15.968550306979076, "grad_norm": 4.6969685554504395, "learning_rate": 1.187547468414117e-06, "loss": 0.4499, "num_input_tokens_seen": 154989920, "step": 127445 }, { "epoch": 15.969176794887858, "grad_norm": 3.7067980766296387, "learning_rate": 1.187193767679336e-06, "loss": 0.4589, "num_input_tokens_seen": 154995808, "step": 127450 }, { "epoch": 15.969803282796642, "grad_norm": 3.2774603366851807, "learning_rate": 1.1868401125298613e-06, "loss": 0.4213, "num_input_tokens_seen": 155001600, "step": 127455 }, { "epoch": 15.970429770705426, "grad_norm": 4.515273094177246, "learning_rate": 1.1864865029699168e-06, "loss": 0.4417, "num_input_tokens_seen": 155007552, "step": 127460 }, { "epoch": 15.971056258614208, "grad_norm": 3.158734083175659, "learning_rate": 1.186132939003734e-06, "loss": 0.4662, "num_input_tokens_seen": 155013728, "step": 127465 }, { "epoch": 15.971682746522992, "grad_norm": 3.3095192909240723, "learning_rate": 1.1857794206355367e-06, "loss": 0.4188, "num_input_tokens_seen": 155019712, "step": 127470 }, { "epoch": 15.972309234431776, "grad_norm": 2.3710548877716064, "learning_rate": 1.185425947869555e-06, "loss": 0.4472, "num_input_tokens_seen": 155025888, "step": 127475 }, { "epoch": 15.972935722340559, "grad_norm": 2.87016224861145, "learning_rate": 1.1850725207100118e-06, "loss": 0.4619, "num_input_tokens_seen": 155032224, "step": 127480 }, { "epoch": 15.973562210249343, "grad_norm": 7.350378513336182, "learning_rate": 1.184719139161134e-06, "loss": 0.4482, "num_input_tokens_seen": 155038400, "step": 127485 }, { "epoch": 15.974188698158125, "grad_norm": 6.4767279624938965, "learning_rate": 1.1843658032271465e-06, "loss": 0.497, "num_input_tokens_seen": 155044544, "step": 127490 }, { "epoch": 15.974815186066909, "grad_norm": 3.127323627471924, "learning_rate": 1.1840125129122743e-06, "loss": 0.4809, "num_input_tokens_seen": 155049728, "step": 127495 }, { "epoch": 15.975441673975693, "grad_norm": 2.6273512840270996, "learning_rate": 1.1836592682207415e-06, "loss": 0.4239, "num_input_tokens_seen": 155056192, "step": 127500 }, { "epoch": 15.976068161884475, "grad_norm": 3.3299710750579834, "learning_rate": 1.18330606915677e-06, "loss": 0.4797, "num_input_tokens_seen": 155062304, "step": 127505 }, { "epoch": 15.97669464979326, "grad_norm": 3.8005409240722656, "learning_rate": 1.1829529157245845e-06, "loss": 0.4413, "num_input_tokens_seen": 155068192, "step": 127510 }, { "epoch": 15.977321137702042, "grad_norm": 5.593929290771484, "learning_rate": 1.1825998079284046e-06, "loss": 0.4381, "num_input_tokens_seen": 155074400, "step": 127515 }, { "epoch": 15.977947625610826, "grad_norm": 2.469460964202881, "learning_rate": 1.1822467457724546e-06, "loss": 0.4514, "num_input_tokens_seen": 155080640, "step": 127520 }, { "epoch": 15.97857411351961, "grad_norm": 1.9043502807617188, "learning_rate": 1.181893729260954e-06, "loss": 0.4756, "num_input_tokens_seen": 155087136, "step": 127525 }, { "epoch": 15.979200601428392, "grad_norm": 2.1527695655822754, "learning_rate": 1.1815407583981248e-06, "loss": 0.4726, "num_input_tokens_seen": 155093152, "step": 127530 }, { "epoch": 15.979827089337176, "grad_norm": 3.0116958618164062, "learning_rate": 1.1811878331881843e-06, "loss": 0.464, "num_input_tokens_seen": 155099104, "step": 127535 }, { "epoch": 15.980453577245958, "grad_norm": 2.5480592250823975, "learning_rate": 1.1808349536353548e-06, "loss": 0.4592, "num_input_tokens_seen": 155105120, "step": 127540 }, { "epoch": 15.981080065154742, "grad_norm": 5.199459075927734, "learning_rate": 1.1804821197438554e-06, "loss": 0.4329, "num_input_tokens_seen": 155111264, "step": 127545 }, { "epoch": 15.981706553063526, "grad_norm": 2.342827320098877, "learning_rate": 1.1801293315179023e-06, "loss": 0.441, "num_input_tokens_seen": 155117312, "step": 127550 }, { "epoch": 15.982333040972309, "grad_norm": 2.3530032634735107, "learning_rate": 1.1797765889617158e-06, "loss": 0.4294, "num_input_tokens_seen": 155123296, "step": 127555 }, { "epoch": 15.982959528881093, "grad_norm": 8.001747131347656, "learning_rate": 1.179423892079511e-06, "loss": 0.5029, "num_input_tokens_seen": 155129216, "step": 127560 }, { "epoch": 15.983586016789875, "grad_norm": 2.206158399581909, "learning_rate": 1.1790712408755067e-06, "loss": 0.4996, "num_input_tokens_seen": 155135200, "step": 127565 }, { "epoch": 15.98421250469866, "grad_norm": 30.60382843017578, "learning_rate": 1.178718635353917e-06, "loss": 0.5303, "num_input_tokens_seen": 155140800, "step": 127570 }, { "epoch": 15.984838992607443, "grad_norm": 4.52516508102417, "learning_rate": 1.1783660755189602e-06, "loss": 0.4179, "num_input_tokens_seen": 155146912, "step": 127575 }, { "epoch": 15.985465480516226, "grad_norm": 5.577269554138184, "learning_rate": 1.1780135613748484e-06, "loss": 0.4495, "num_input_tokens_seen": 155153312, "step": 127580 }, { "epoch": 15.98609196842501, "grad_norm": 2.1240344047546387, "learning_rate": 1.1776610929257982e-06, "loss": 0.486, "num_input_tokens_seen": 155159424, "step": 127585 }, { "epoch": 15.986718456333794, "grad_norm": 3.5717499256134033, "learning_rate": 1.1773086701760245e-06, "loss": 0.4431, "num_input_tokens_seen": 155165184, "step": 127590 }, { "epoch": 15.987344944242576, "grad_norm": 3.083122491836548, "learning_rate": 1.176956293129738e-06, "loss": 0.4625, "num_input_tokens_seen": 155171424, "step": 127595 }, { "epoch": 15.98797143215136, "grad_norm": 10.420867919921875, "learning_rate": 1.1766039617911535e-06, "loss": 0.5043, "num_input_tokens_seen": 155177600, "step": 127600 }, { "epoch": 15.988597920060142, "grad_norm": 2.0398309230804443, "learning_rate": 1.1762516761644831e-06, "loss": 0.489, "num_input_tokens_seen": 155183552, "step": 127605 }, { "epoch": 15.989224407968926, "grad_norm": 8.471118927001953, "learning_rate": 1.17589943625394e-06, "loss": 0.5175, "num_input_tokens_seen": 155190080, "step": 127610 }, { "epoch": 15.98985089587771, "grad_norm": 11.798624992370605, "learning_rate": 1.1755472420637332e-06, "loss": 0.56, "num_input_tokens_seen": 155196416, "step": 127615 }, { "epoch": 15.990477383786493, "grad_norm": 3.4561855792999268, "learning_rate": 1.1751950935980754e-06, "loss": 0.4131, "num_input_tokens_seen": 155201952, "step": 127620 }, { "epoch": 15.991103871695277, "grad_norm": 6.699010848999023, "learning_rate": 1.1748429908611753e-06, "loss": 0.5582, "num_input_tokens_seen": 155208000, "step": 127625 }, { "epoch": 15.991730359604059, "grad_norm": 1.672579050064087, "learning_rate": 1.1744909338572441e-06, "loss": 0.421, "num_input_tokens_seen": 155214304, "step": 127630 }, { "epoch": 15.992356847512843, "grad_norm": 1.2829818725585938, "learning_rate": 1.1741389225904892e-06, "loss": 0.4658, "num_input_tokens_seen": 155220096, "step": 127635 }, { "epoch": 15.992983335421627, "grad_norm": 20.33749771118164, "learning_rate": 1.173786957065121e-06, "loss": 0.5171, "num_input_tokens_seen": 155226528, "step": 127640 }, { "epoch": 15.99360982333041, "grad_norm": 2.921466112136841, "learning_rate": 1.1734350372853453e-06, "loss": 0.4756, "num_input_tokens_seen": 155232832, "step": 127645 }, { "epoch": 15.994236311239193, "grad_norm": 2.6291310787200928, "learning_rate": 1.1730831632553724e-06, "loss": 0.4041, "num_input_tokens_seen": 155238592, "step": 127650 }, { "epoch": 15.994862799147976, "grad_norm": 3.580538511276245, "learning_rate": 1.1727313349794068e-06, "loss": 0.4809, "num_input_tokens_seen": 155244768, "step": 127655 }, { "epoch": 15.99548928705676, "grad_norm": 7.7861552238464355, "learning_rate": 1.1723795524616566e-06, "loss": 0.464, "num_input_tokens_seen": 155250432, "step": 127660 }, { "epoch": 15.996115774965544, "grad_norm": 15.606895446777344, "learning_rate": 1.172027815706326e-06, "loss": 0.4336, "num_input_tokens_seen": 155256160, "step": 127665 }, { "epoch": 15.996742262874326, "grad_norm": 2.9173951148986816, "learning_rate": 1.1716761247176206e-06, "loss": 0.4667, "num_input_tokens_seen": 155262368, "step": 127670 }, { "epoch": 15.99736875078311, "grad_norm": 4.116376876831055, "learning_rate": 1.171324479499748e-06, "loss": 0.4653, "num_input_tokens_seen": 155268320, "step": 127675 }, { "epoch": 15.997995238691892, "grad_norm": 13.724201202392578, "learning_rate": 1.1709728800569087e-06, "loss": 0.531, "num_input_tokens_seen": 155274496, "step": 127680 }, { "epoch": 15.998621726600676, "grad_norm": 3.734229803085327, "learning_rate": 1.1706213263933087e-06, "loss": 0.4522, "num_input_tokens_seen": 155280608, "step": 127685 }, { "epoch": 15.99924821450946, "grad_norm": 4.619047164916992, "learning_rate": 1.1702698185131495e-06, "loss": 0.4676, "num_input_tokens_seen": 155286816, "step": 127690 }, { "epoch": 15.999874702418243, "grad_norm": 7.975259780883789, "learning_rate": 1.169918356420634e-06, "loss": 0.4148, "num_input_tokens_seen": 155292832, "step": 127695 }, { "epoch": 16.0, "eval_loss": 0.49875274300575256, "eval_runtime": 224.1875, "eval_samples_per_second": 35.6, "eval_steps_per_second": 8.903, "num_input_tokens_seen": 155294048, "step": 127696 }, { "epoch": 16.000501190327025, "grad_norm": 3.141101837158203, "learning_rate": 1.169566940119965e-06, "loss": 0.4224, "num_input_tokens_seen": 155298976, "step": 127700 }, { "epoch": 16.00112767823581, "grad_norm": 5.517821788787842, "learning_rate": 1.169215569615345e-06, "loss": 0.4498, "num_input_tokens_seen": 155305312, "step": 127705 }, { "epoch": 16.001754166144593, "grad_norm": 2.4662177562713623, "learning_rate": 1.1688642449109721e-06, "loss": 0.4244, "num_input_tokens_seen": 155311712, "step": 127710 }, { "epoch": 16.002380654053376, "grad_norm": 3.7803356647491455, "learning_rate": 1.1685129660110501e-06, "loss": 0.4695, "num_input_tokens_seen": 155317888, "step": 127715 }, { "epoch": 16.00300714196216, "grad_norm": 5.602506637573242, "learning_rate": 1.1681617329197753e-06, "loss": 0.4602, "num_input_tokens_seen": 155324064, "step": 127720 }, { "epoch": 16.003633629870944, "grad_norm": 6.258903503417969, "learning_rate": 1.1678105456413485e-06, "loss": 0.5352, "num_input_tokens_seen": 155330272, "step": 127725 }, { "epoch": 16.004260117779726, "grad_norm": 13.152139663696289, "learning_rate": 1.1674594041799697e-06, "loss": 0.4724, "num_input_tokens_seen": 155336480, "step": 127730 }, { "epoch": 16.00488660568851, "grad_norm": 3.827895164489746, "learning_rate": 1.167108308539835e-06, "loss": 0.4173, "num_input_tokens_seen": 155342496, "step": 127735 }, { "epoch": 16.005513093597294, "grad_norm": 8.431222915649414, "learning_rate": 1.166757258725144e-06, "loss": 0.4305, "num_input_tokens_seen": 155348576, "step": 127740 }, { "epoch": 16.006139581506076, "grad_norm": 7.3513970375061035, "learning_rate": 1.1664062547400916e-06, "loss": 0.449, "num_input_tokens_seen": 155354272, "step": 127745 }, { "epoch": 16.00676606941486, "grad_norm": 6.292593955993652, "learning_rate": 1.1660552965888772e-06, "loss": 0.451, "num_input_tokens_seen": 155360480, "step": 127750 }, { "epoch": 16.007392557323644, "grad_norm": 4.984676361083984, "learning_rate": 1.1657043842756933e-06, "loss": 0.4612, "num_input_tokens_seen": 155366592, "step": 127755 }, { "epoch": 16.008019045232427, "grad_norm": 5.166128635406494, "learning_rate": 1.1653535178047386e-06, "loss": 0.4543, "num_input_tokens_seen": 155372032, "step": 127760 }, { "epoch": 16.00864553314121, "grad_norm": 4.851844310760498, "learning_rate": 1.1650026971802052e-06, "loss": 0.4462, "num_input_tokens_seen": 155378368, "step": 127765 }, { "epoch": 16.009272021049995, "grad_norm": 20.716068267822266, "learning_rate": 1.1646519224062907e-06, "loss": 0.5165, "num_input_tokens_seen": 155384672, "step": 127770 }, { "epoch": 16.009898508958777, "grad_norm": 3.606621265411377, "learning_rate": 1.164301193487185e-06, "loss": 0.4339, "num_input_tokens_seen": 155391072, "step": 127775 }, { "epoch": 16.01052499686756, "grad_norm": 14.579519271850586, "learning_rate": 1.1639505104270855e-06, "loss": 0.4413, "num_input_tokens_seen": 155396832, "step": 127780 }, { "epoch": 16.011151484776345, "grad_norm": 2.0644962787628174, "learning_rate": 1.1635998732301806e-06, "loss": 0.432, "num_input_tokens_seen": 155403136, "step": 127785 }, { "epoch": 16.011777972685127, "grad_norm": 13.215699195861816, "learning_rate": 1.1632492819006652e-06, "loss": 0.5034, "num_input_tokens_seen": 155408960, "step": 127790 }, { "epoch": 16.01240446059391, "grad_norm": 2.553679943084717, "learning_rate": 1.1628987364427318e-06, "loss": 0.4868, "num_input_tokens_seen": 155415104, "step": 127795 }, { "epoch": 16.013030948502696, "grad_norm": 4.549200057983398, "learning_rate": 1.1625482368605685e-06, "loss": 0.4768, "num_input_tokens_seen": 155421408, "step": 127800 }, { "epoch": 16.013657436411478, "grad_norm": 6.126703262329102, "learning_rate": 1.1621977831583676e-06, "loss": 0.4542, "num_input_tokens_seen": 155427712, "step": 127805 }, { "epoch": 16.01428392432026, "grad_norm": 2.8932762145996094, "learning_rate": 1.1618473753403186e-06, "loss": 0.4276, "num_input_tokens_seen": 155434048, "step": 127810 }, { "epoch": 16.014910412229042, "grad_norm": 10.711071014404297, "learning_rate": 1.1614970134106129e-06, "loss": 0.4559, "num_input_tokens_seen": 155439424, "step": 127815 }, { "epoch": 16.01553690013783, "grad_norm": 2.964038848876953, "learning_rate": 1.161146697373436e-06, "loss": 0.4338, "num_input_tokens_seen": 155445472, "step": 127820 }, { "epoch": 16.01616338804661, "grad_norm": 6.933943748474121, "learning_rate": 1.1607964272329792e-06, "loss": 0.4219, "num_input_tokens_seen": 155451712, "step": 127825 }, { "epoch": 16.016789875955393, "grad_norm": 5.86157751083374, "learning_rate": 1.1604462029934276e-06, "loss": 0.4835, "num_input_tokens_seen": 155457824, "step": 127830 }, { "epoch": 16.01741636386418, "grad_norm": 2.3544857501983643, "learning_rate": 1.160096024658971e-06, "loss": 0.4121, "num_input_tokens_seen": 155463840, "step": 127835 }, { "epoch": 16.01804285177296, "grad_norm": 3.7419848442077637, "learning_rate": 1.159745892233794e-06, "loss": 0.4361, "num_input_tokens_seen": 155470144, "step": 127840 }, { "epoch": 16.018669339681743, "grad_norm": 2.920653820037842, "learning_rate": 1.1593958057220844e-06, "loss": 0.437, "num_input_tokens_seen": 155475968, "step": 127845 }, { "epoch": 16.01929582759053, "grad_norm": 3.824044704437256, "learning_rate": 1.1590457651280258e-06, "loss": 0.4377, "num_input_tokens_seen": 155481728, "step": 127850 }, { "epoch": 16.01992231549931, "grad_norm": 2.8300013542175293, "learning_rate": 1.1586957704558044e-06, "loss": 0.4644, "num_input_tokens_seen": 155488160, "step": 127855 }, { "epoch": 16.020548803408094, "grad_norm": 5.674229621887207, "learning_rate": 1.1583458217096062e-06, "loss": 0.4432, "num_input_tokens_seen": 155494304, "step": 127860 }, { "epoch": 16.021175291316876, "grad_norm": 4.44911527633667, "learning_rate": 1.157995918893612e-06, "loss": 0.4758, "num_input_tokens_seen": 155500608, "step": 127865 }, { "epoch": 16.02180177922566, "grad_norm": 4.332970142364502, "learning_rate": 1.1576460620120077e-06, "loss": 0.4755, "num_input_tokens_seen": 155506240, "step": 127870 }, { "epoch": 16.022428267134444, "grad_norm": 2.965792417526245, "learning_rate": 1.1572962510689745e-06, "loss": 0.449, "num_input_tokens_seen": 155512736, "step": 127875 }, { "epoch": 16.023054755043226, "grad_norm": 4.418545722961426, "learning_rate": 1.1569464860686962e-06, "loss": 0.4622, "num_input_tokens_seen": 155518944, "step": 127880 }, { "epoch": 16.023681242952012, "grad_norm": 14.79632568359375, "learning_rate": 1.1565967670153528e-06, "loss": 0.5192, "num_input_tokens_seen": 155524864, "step": 127885 }, { "epoch": 16.024307730860794, "grad_norm": 3.294692277908325, "learning_rate": 1.1562470939131277e-06, "loss": 0.4458, "num_input_tokens_seen": 155531040, "step": 127890 }, { "epoch": 16.024934218769577, "grad_norm": 3.294931411743164, "learning_rate": 1.1558974667661988e-06, "loss": 0.4774, "num_input_tokens_seen": 155536960, "step": 127895 }, { "epoch": 16.025560706678363, "grad_norm": 3.038457155227661, "learning_rate": 1.1555478855787473e-06, "loss": 0.4486, "num_input_tokens_seen": 155542464, "step": 127900 }, { "epoch": 16.026187194587145, "grad_norm": 2.961308717727661, "learning_rate": 1.1551983503549535e-06, "loss": 0.4389, "num_input_tokens_seen": 155548672, "step": 127905 }, { "epoch": 16.026813682495927, "grad_norm": 12.304336547851562, "learning_rate": 1.1548488610989972e-06, "loss": 0.4182, "num_input_tokens_seen": 155554688, "step": 127910 }, { "epoch": 16.027440170404713, "grad_norm": 6.558525085449219, "learning_rate": 1.1544994178150543e-06, "loss": 0.5134, "num_input_tokens_seen": 155560256, "step": 127915 }, { "epoch": 16.028066658313495, "grad_norm": 9.39266300201416, "learning_rate": 1.1541500205073036e-06, "loss": 0.4465, "num_input_tokens_seen": 155566496, "step": 127920 }, { "epoch": 16.028693146222277, "grad_norm": 4.6937408447265625, "learning_rate": 1.1538006691799247e-06, "loss": 0.4345, "num_input_tokens_seen": 155572736, "step": 127925 }, { "epoch": 16.02931963413106, "grad_norm": 7.085140228271484, "learning_rate": 1.1534513638370908e-06, "loss": 0.4227, "num_input_tokens_seen": 155578944, "step": 127930 }, { "epoch": 16.029946122039846, "grad_norm": 2.9339351654052734, "learning_rate": 1.1531021044829816e-06, "loss": 0.4273, "num_input_tokens_seen": 155585408, "step": 127935 }, { "epoch": 16.030572609948628, "grad_norm": 2.7602837085723877, "learning_rate": 1.1527528911217696e-06, "loss": 0.4873, "num_input_tokens_seen": 155591328, "step": 127940 }, { "epoch": 16.03119909785741, "grad_norm": 9.428234100341797, "learning_rate": 1.1524037237576325e-06, "loss": 0.4619, "num_input_tokens_seen": 155597568, "step": 127945 }, { "epoch": 16.031825585766196, "grad_norm": 2.41202449798584, "learning_rate": 1.1520546023947421e-06, "loss": 0.4396, "num_input_tokens_seen": 155604000, "step": 127950 }, { "epoch": 16.03245207367498, "grad_norm": 6.20564079284668, "learning_rate": 1.1517055270372758e-06, "loss": 0.4917, "num_input_tokens_seen": 155609408, "step": 127955 }, { "epoch": 16.03307856158376, "grad_norm": 6.280276298522949, "learning_rate": 1.1513564976894043e-06, "loss": 0.4355, "num_input_tokens_seen": 155615456, "step": 127960 }, { "epoch": 16.033705049492546, "grad_norm": 2.520108699798584, "learning_rate": 1.1510075143553024e-06, "loss": 0.4244, "num_input_tokens_seen": 155621344, "step": 127965 }, { "epoch": 16.03433153740133, "grad_norm": 3.183417797088623, "learning_rate": 1.150658577039141e-06, "loss": 0.4374, "num_input_tokens_seen": 155627456, "step": 127970 }, { "epoch": 16.03495802531011, "grad_norm": 9.75160026550293, "learning_rate": 1.1503096857450924e-06, "loss": 0.5297, "num_input_tokens_seen": 155633632, "step": 127975 }, { "epoch": 16.035584513218893, "grad_norm": 2.693450450897217, "learning_rate": 1.1499608404773294e-06, "loss": 0.4081, "num_input_tokens_seen": 155640000, "step": 127980 }, { "epoch": 16.03621100112768, "grad_norm": 2.358612060546875, "learning_rate": 1.1496120412400203e-06, "loss": 0.5011, "num_input_tokens_seen": 155646240, "step": 127985 }, { "epoch": 16.03683748903646, "grad_norm": 4.29962682723999, "learning_rate": 1.1492632880373378e-06, "loss": 0.4672, "num_input_tokens_seen": 155652224, "step": 127990 }, { "epoch": 16.037463976945244, "grad_norm": 2.757221221923828, "learning_rate": 1.1489145808734486e-06, "loss": 0.4828, "num_input_tokens_seen": 155658496, "step": 127995 }, { "epoch": 16.03809046485403, "grad_norm": 11.51102352142334, "learning_rate": 1.1485659197525234e-06, "loss": 0.5043, "num_input_tokens_seen": 155664512, "step": 128000 }, { "epoch": 16.03871695276281, "grad_norm": 12.828668594360352, "learning_rate": 1.1482173046787325e-06, "loss": 0.5824, "num_input_tokens_seen": 155670720, "step": 128005 }, { "epoch": 16.039343440671594, "grad_norm": 3.081963062286377, "learning_rate": 1.14786873565624e-06, "loss": 0.4752, "num_input_tokens_seen": 155676864, "step": 128010 }, { "epoch": 16.03996992858038, "grad_norm": 2.7004897594451904, "learning_rate": 1.1475202126892166e-06, "loss": 0.3999, "num_input_tokens_seen": 155683136, "step": 128015 }, { "epoch": 16.040596416489162, "grad_norm": 2.608092784881592, "learning_rate": 1.1471717357818284e-06, "loss": 0.453, "num_input_tokens_seen": 155689312, "step": 128020 }, { "epoch": 16.041222904397944, "grad_norm": 2.434309244155884, "learning_rate": 1.1468233049382404e-06, "loss": 0.4801, "num_input_tokens_seen": 155695136, "step": 128025 }, { "epoch": 16.04184939230673, "grad_norm": 18.798311233520508, "learning_rate": 1.1464749201626208e-06, "loss": 0.5054, "num_input_tokens_seen": 155701312, "step": 128030 }, { "epoch": 16.042475880215513, "grad_norm": 3.963101387023926, "learning_rate": 1.1461265814591322e-06, "loss": 0.4484, "num_input_tokens_seen": 155707584, "step": 128035 }, { "epoch": 16.043102368124295, "grad_norm": 3.1381635665893555, "learning_rate": 1.1457782888319402e-06, "loss": 0.4377, "num_input_tokens_seen": 155713760, "step": 128040 }, { "epoch": 16.043728856033077, "grad_norm": 4.395956039428711, "learning_rate": 1.1454300422852104e-06, "loss": 0.4215, "num_input_tokens_seen": 155719744, "step": 128045 }, { "epoch": 16.044355343941863, "grad_norm": 9.258347511291504, "learning_rate": 1.1450818418231042e-06, "loss": 0.4564, "num_input_tokens_seen": 155725824, "step": 128050 }, { "epoch": 16.044981831850645, "grad_norm": 16.1711483001709, "learning_rate": 1.1447336874497867e-06, "loss": 0.4862, "num_input_tokens_seen": 155732224, "step": 128055 }, { "epoch": 16.045608319759427, "grad_norm": 26.41300392150879, "learning_rate": 1.1443855791694181e-06, "loss": 0.578, "num_input_tokens_seen": 155738368, "step": 128060 }, { "epoch": 16.046234807668213, "grad_norm": 3.7874996662139893, "learning_rate": 1.1440375169861629e-06, "loss": 0.4637, "num_input_tokens_seen": 155744192, "step": 128065 }, { "epoch": 16.046861295576996, "grad_norm": 3.843883752822876, "learning_rate": 1.14368950090418e-06, "loss": 0.4337, "num_input_tokens_seen": 155750336, "step": 128070 }, { "epoch": 16.047487783485778, "grad_norm": 3.964531421661377, "learning_rate": 1.143341530927632e-06, "loss": 0.423, "num_input_tokens_seen": 155756384, "step": 128075 }, { "epoch": 16.048114271394564, "grad_norm": 19.591724395751953, "learning_rate": 1.142993607060678e-06, "loss": 0.4915, "num_input_tokens_seen": 155762464, "step": 128080 }, { "epoch": 16.048740759303346, "grad_norm": 6.031155586242676, "learning_rate": 1.142645729307479e-06, "loss": 0.456, "num_input_tokens_seen": 155768768, "step": 128085 }, { "epoch": 16.04936724721213, "grad_norm": 13.509580612182617, "learning_rate": 1.1422978976721922e-06, "loss": 0.4507, "num_input_tokens_seen": 155775040, "step": 128090 }, { "epoch": 16.04999373512091, "grad_norm": 2.5049452781677246, "learning_rate": 1.1419501121589772e-06, "loss": 0.4974, "num_input_tokens_seen": 155781280, "step": 128095 }, { "epoch": 16.050620223029696, "grad_norm": 11.143965721130371, "learning_rate": 1.1416023727719938e-06, "loss": 0.4658, "num_input_tokens_seen": 155787488, "step": 128100 }, { "epoch": 16.05124671093848, "grad_norm": 3.68630313873291, "learning_rate": 1.141254679515396e-06, "loss": 0.5003, "num_input_tokens_seen": 155793568, "step": 128105 }, { "epoch": 16.05187319884726, "grad_norm": 2.825205087661743, "learning_rate": 1.1409070323933435e-06, "loss": 0.4824, "num_input_tokens_seen": 155799744, "step": 128110 }, { "epoch": 16.052499686756047, "grad_norm": 16.81386947631836, "learning_rate": 1.1405594314099922e-06, "loss": 0.5558, "num_input_tokens_seen": 155804864, "step": 128115 }, { "epoch": 16.05312617466483, "grad_norm": 2.0318732261657715, "learning_rate": 1.1402118765694986e-06, "loss": 0.4579, "num_input_tokens_seen": 155810848, "step": 128120 }, { "epoch": 16.05375266257361, "grad_norm": 20.33938217163086, "learning_rate": 1.1398643678760158e-06, "loss": 0.4904, "num_input_tokens_seen": 155816928, "step": 128125 }, { "epoch": 16.054379150482397, "grad_norm": 2.6067469120025635, "learning_rate": 1.1395169053337012e-06, "loss": 0.4821, "num_input_tokens_seen": 155823008, "step": 128130 }, { "epoch": 16.05500563839118, "grad_norm": 15.381999969482422, "learning_rate": 1.139169488946707e-06, "loss": 0.4488, "num_input_tokens_seen": 155828640, "step": 128135 }, { "epoch": 16.05563212629996, "grad_norm": 2.685544490814209, "learning_rate": 1.1388221187191883e-06, "loss": 0.4191, "num_input_tokens_seen": 155835104, "step": 128140 }, { "epoch": 16.056258614208744, "grad_norm": 3.928272008895874, "learning_rate": 1.1384747946552965e-06, "loss": 0.4464, "num_input_tokens_seen": 155840992, "step": 128145 }, { "epoch": 16.05688510211753, "grad_norm": 2.76839280128479, "learning_rate": 1.138127516759186e-06, "loss": 0.4248, "num_input_tokens_seen": 155846976, "step": 128150 }, { "epoch": 16.057511590026312, "grad_norm": 8.032366752624512, "learning_rate": 1.1377802850350067e-06, "loss": 0.4506, "num_input_tokens_seen": 155853216, "step": 128155 }, { "epoch": 16.058138077935094, "grad_norm": 6.405862331390381, "learning_rate": 1.1374330994869114e-06, "loss": 0.5741, "num_input_tokens_seen": 155859040, "step": 128160 }, { "epoch": 16.05876456584388, "grad_norm": 8.78599739074707, "learning_rate": 1.1370859601190526e-06, "loss": 0.4462, "num_input_tokens_seen": 155864992, "step": 128165 }, { "epoch": 16.059391053752663, "grad_norm": 3.7688658237457275, "learning_rate": 1.1367388669355771e-06, "loss": 0.475, "num_input_tokens_seen": 155871200, "step": 128170 }, { "epoch": 16.060017541661445, "grad_norm": 5.774738788604736, "learning_rate": 1.1363918199406376e-06, "loss": 0.4836, "num_input_tokens_seen": 155877312, "step": 128175 }, { "epoch": 16.06064402957023, "grad_norm": 3.564178943634033, "learning_rate": 1.1360448191383815e-06, "loss": 0.4372, "num_input_tokens_seen": 155883072, "step": 128180 }, { "epoch": 16.061270517479013, "grad_norm": 4.116815090179443, "learning_rate": 1.1356978645329591e-06, "loss": 0.4291, "num_input_tokens_seen": 155889472, "step": 128185 }, { "epoch": 16.061897005387795, "grad_norm": 14.593818664550781, "learning_rate": 1.1353509561285164e-06, "loss": 0.4364, "num_input_tokens_seen": 155895680, "step": 128190 }, { "epoch": 16.06252349329658, "grad_norm": 10.03901195526123, "learning_rate": 1.1350040939292039e-06, "loss": 0.4387, "num_input_tokens_seen": 155901984, "step": 128195 }, { "epoch": 16.063149981205363, "grad_norm": 6.710862636566162, "learning_rate": 1.1346572779391652e-06, "loss": 0.481, "num_input_tokens_seen": 155908384, "step": 128200 }, { "epoch": 16.063776469114146, "grad_norm": 10.758916854858398, "learning_rate": 1.1343105081625489e-06, "loss": 0.4405, "num_input_tokens_seen": 155914752, "step": 128205 }, { "epoch": 16.064402957022928, "grad_norm": 4.274270534515381, "learning_rate": 1.1339637846035019e-06, "loss": 0.4543, "num_input_tokens_seen": 155920384, "step": 128210 }, { "epoch": 16.065029444931714, "grad_norm": 5.800247669219971, "learning_rate": 1.1336171072661667e-06, "loss": 0.496, "num_input_tokens_seen": 155926240, "step": 128215 }, { "epoch": 16.065655932840496, "grad_norm": 16.76356315612793, "learning_rate": 1.1332704761546892e-06, "loss": 0.5349, "num_input_tokens_seen": 155931552, "step": 128220 }, { "epoch": 16.06628242074928, "grad_norm": 4.725891590118408, "learning_rate": 1.1329238912732149e-06, "loss": 0.4554, "num_input_tokens_seen": 155938080, "step": 128225 }, { "epoch": 16.066908908658064, "grad_norm": 4.751467227935791, "learning_rate": 1.1325773526258877e-06, "loss": 0.4573, "num_input_tokens_seen": 155943936, "step": 128230 }, { "epoch": 16.067535396566846, "grad_norm": 7.319157600402832, "learning_rate": 1.1322308602168486e-06, "loss": 0.4928, "num_input_tokens_seen": 155949984, "step": 128235 }, { "epoch": 16.06816188447563, "grad_norm": 11.2413911819458, "learning_rate": 1.1318844140502427e-06, "loss": 0.4792, "num_input_tokens_seen": 155956224, "step": 128240 }, { "epoch": 16.068788372384414, "grad_norm": 3.98763370513916, "learning_rate": 1.1315380141302096e-06, "loss": 0.3993, "num_input_tokens_seen": 155961728, "step": 128245 }, { "epoch": 16.069414860293197, "grad_norm": 2.2397985458374023, "learning_rate": 1.131191660460893e-06, "loss": 0.4616, "num_input_tokens_seen": 155967872, "step": 128250 }, { "epoch": 16.07004134820198, "grad_norm": 10.455591201782227, "learning_rate": 1.1308453530464313e-06, "loss": 0.4485, "num_input_tokens_seen": 155973824, "step": 128255 }, { "epoch": 16.07066783611076, "grad_norm": 3.9882752895355225, "learning_rate": 1.1304990918909685e-06, "loss": 0.5019, "num_input_tokens_seen": 155979904, "step": 128260 }, { "epoch": 16.071294324019547, "grad_norm": 3.2743117809295654, "learning_rate": 1.1301528769986404e-06, "loss": 0.4303, "num_input_tokens_seen": 155985472, "step": 128265 }, { "epoch": 16.07192081192833, "grad_norm": 6.8483357429504395, "learning_rate": 1.1298067083735898e-06, "loss": 0.4922, "num_input_tokens_seen": 155991648, "step": 128270 }, { "epoch": 16.07254729983711, "grad_norm": 3.4880378246307373, "learning_rate": 1.1294605860199526e-06, "loss": 0.4048, "num_input_tokens_seen": 155998080, "step": 128275 }, { "epoch": 16.073173787745898, "grad_norm": 11.910029411315918, "learning_rate": 1.1291145099418694e-06, "loss": 0.45, "num_input_tokens_seen": 156004224, "step": 128280 }, { "epoch": 16.07380027565468, "grad_norm": 6.842698574066162, "learning_rate": 1.1287684801434756e-06, "loss": 0.5134, "num_input_tokens_seen": 156010336, "step": 128285 }, { "epoch": 16.074426763563462, "grad_norm": 3.624732494354248, "learning_rate": 1.1284224966289092e-06, "loss": 0.4222, "num_input_tokens_seen": 156016512, "step": 128290 }, { "epoch": 16.075053251472248, "grad_norm": 3.803009033203125, "learning_rate": 1.128076559402308e-06, "loss": 0.4822, "num_input_tokens_seen": 156022752, "step": 128295 }, { "epoch": 16.07567973938103, "grad_norm": 3.491567850112915, "learning_rate": 1.1277306684678058e-06, "loss": 0.4427, "num_input_tokens_seen": 156029120, "step": 128300 }, { "epoch": 16.076306227289813, "grad_norm": 2.4610228538513184, "learning_rate": 1.1273848238295403e-06, "loss": 0.4575, "num_input_tokens_seen": 156034848, "step": 128305 }, { "epoch": 16.0769327151986, "grad_norm": 3.6477890014648438, "learning_rate": 1.1270390254916442e-06, "loss": 0.407, "num_input_tokens_seen": 156040960, "step": 128310 }, { "epoch": 16.07755920310738, "grad_norm": 8.141901969909668, "learning_rate": 1.1266932734582524e-06, "loss": 0.4499, "num_input_tokens_seen": 156047328, "step": 128315 }, { "epoch": 16.078185691016163, "grad_norm": 9.550799369812012, "learning_rate": 1.126347567733499e-06, "loss": 0.4653, "num_input_tokens_seen": 156053248, "step": 128320 }, { "epoch": 16.078812178924945, "grad_norm": 7.733867168426514, "learning_rate": 1.126001908321519e-06, "loss": 0.4653, "num_input_tokens_seen": 156059520, "step": 128325 }, { "epoch": 16.07943866683373, "grad_norm": 3.64340877532959, "learning_rate": 1.125656295226441e-06, "loss": 0.442, "num_input_tokens_seen": 156065888, "step": 128330 }, { "epoch": 16.080065154742513, "grad_norm": 5.65678596496582, "learning_rate": 1.1253107284524018e-06, "loss": 0.4644, "num_input_tokens_seen": 156071872, "step": 128335 }, { "epoch": 16.080691642651296, "grad_norm": 7.282541751861572, "learning_rate": 1.1249652080035283e-06, "loss": 0.5054, "num_input_tokens_seen": 156077888, "step": 128340 }, { "epoch": 16.08131813056008, "grad_norm": 4.209053993225098, "learning_rate": 1.1246197338839544e-06, "loss": 0.4379, "num_input_tokens_seen": 156084256, "step": 128345 }, { "epoch": 16.081944618468864, "grad_norm": 4.476081371307373, "learning_rate": 1.1242743060978107e-06, "loss": 0.4545, "num_input_tokens_seen": 156090336, "step": 128350 }, { "epoch": 16.082571106377646, "grad_norm": 8.53083324432373, "learning_rate": 1.1239289246492248e-06, "loss": 0.4757, "num_input_tokens_seen": 156096160, "step": 128355 }, { "epoch": 16.083197594286432, "grad_norm": 10.871781349182129, "learning_rate": 1.1235835895423285e-06, "loss": 0.4747, "num_input_tokens_seen": 156102272, "step": 128360 }, { "epoch": 16.083824082195214, "grad_norm": 3.057373523712158, "learning_rate": 1.1232383007812487e-06, "loss": 0.4666, "num_input_tokens_seen": 156108320, "step": 128365 }, { "epoch": 16.084450570103996, "grad_norm": 2.844266176223755, "learning_rate": 1.1228930583701158e-06, "loss": 0.4223, "num_input_tokens_seen": 156114464, "step": 128370 }, { "epoch": 16.08507705801278, "grad_norm": 3.9018914699554443, "learning_rate": 1.1225478623130547e-06, "loss": 0.4307, "num_input_tokens_seen": 156120224, "step": 128375 }, { "epoch": 16.085703545921564, "grad_norm": 10.9715576171875, "learning_rate": 1.122202712614195e-06, "loss": 0.4474, "num_input_tokens_seen": 156125760, "step": 128380 }, { "epoch": 16.086330033830347, "grad_norm": 4.502547264099121, "learning_rate": 1.1218576092776607e-06, "loss": 0.4432, "num_input_tokens_seen": 156132352, "step": 128385 }, { "epoch": 16.08695652173913, "grad_norm": 8.986730575561523, "learning_rate": 1.121512552307581e-06, "loss": 0.4573, "num_input_tokens_seen": 156137824, "step": 128390 }, { "epoch": 16.087583009647915, "grad_norm": 3.5640902519226074, "learning_rate": 1.121167541708078e-06, "loss": 0.4078, "num_input_tokens_seen": 156143616, "step": 128395 }, { "epoch": 16.088209497556697, "grad_norm": 4.344362735748291, "learning_rate": 1.12082257748328e-06, "loss": 0.5114, "num_input_tokens_seen": 156149280, "step": 128400 }, { "epoch": 16.08883598546548, "grad_norm": 9.049737930297852, "learning_rate": 1.1204776596373074e-06, "loss": 0.5113, "num_input_tokens_seen": 156155456, "step": 128405 }, { "epoch": 16.089462473374265, "grad_norm": 21.359371185302734, "learning_rate": 1.1201327881742864e-06, "loss": 0.4136, "num_input_tokens_seen": 156161600, "step": 128410 }, { "epoch": 16.090088961283048, "grad_norm": 4.556467533111572, "learning_rate": 1.11978796309834e-06, "loss": 0.4205, "num_input_tokens_seen": 156167200, "step": 128415 }, { "epoch": 16.09071544919183, "grad_norm": 2.9766368865966797, "learning_rate": 1.1194431844135922e-06, "loss": 0.4489, "num_input_tokens_seen": 156173440, "step": 128420 }, { "epoch": 16.091341937100616, "grad_norm": 5.110979080200195, "learning_rate": 1.1190984521241622e-06, "loss": 0.3936, "num_input_tokens_seen": 156179872, "step": 128425 }, { "epoch": 16.091968425009398, "grad_norm": 6.766071319580078, "learning_rate": 1.118753766234173e-06, "loss": 0.4365, "num_input_tokens_seen": 156186208, "step": 128430 }, { "epoch": 16.09259491291818, "grad_norm": 15.741233825683594, "learning_rate": 1.118409126747747e-06, "loss": 0.4914, "num_input_tokens_seen": 156192096, "step": 128435 }, { "epoch": 16.093221400826963, "grad_norm": 3.7427687644958496, "learning_rate": 1.1180645336690022e-06, "loss": 0.4704, "num_input_tokens_seen": 156198144, "step": 128440 }, { "epoch": 16.09384788873575, "grad_norm": 15.677923202514648, "learning_rate": 1.1177199870020611e-06, "loss": 0.4571, "num_input_tokens_seen": 156204224, "step": 128445 }, { "epoch": 16.09447437664453, "grad_norm": 18.062808990478516, "learning_rate": 1.1173754867510405e-06, "loss": 0.513, "num_input_tokens_seen": 156210368, "step": 128450 }, { "epoch": 16.095100864553313, "grad_norm": 3.6414999961853027, "learning_rate": 1.1170310329200611e-06, "loss": 0.4283, "num_input_tokens_seen": 156216480, "step": 128455 }, { "epoch": 16.0957273524621, "grad_norm": 2.8156051635742188, "learning_rate": 1.1166866255132392e-06, "loss": 0.434, "num_input_tokens_seen": 156222528, "step": 128460 }, { "epoch": 16.09635384037088, "grad_norm": 7.983834266662598, "learning_rate": 1.116342264534695e-06, "loss": 0.4474, "num_input_tokens_seen": 156228928, "step": 128465 }, { "epoch": 16.096980328279663, "grad_norm": 2.6928303241729736, "learning_rate": 1.1159979499885426e-06, "loss": 0.4331, "num_input_tokens_seen": 156234912, "step": 128470 }, { "epoch": 16.09760681618845, "grad_norm": 4.935002326965332, "learning_rate": 1.1156536818789004e-06, "loss": 0.4752, "num_input_tokens_seen": 156240736, "step": 128475 }, { "epoch": 16.09823330409723, "grad_norm": 12.939886093139648, "learning_rate": 1.1153094602098858e-06, "loss": 0.4867, "num_input_tokens_seen": 156247040, "step": 128480 }, { "epoch": 16.098859792006014, "grad_norm": 5.25253963470459, "learning_rate": 1.1149652849856107e-06, "loss": 0.4894, "num_input_tokens_seen": 156253184, "step": 128485 }, { "epoch": 16.099486279914796, "grad_norm": 10.858175277709961, "learning_rate": 1.114621156210194e-06, "loss": 0.4798, "num_input_tokens_seen": 156259552, "step": 128490 }, { "epoch": 16.100112767823582, "grad_norm": 2.9248151779174805, "learning_rate": 1.1142770738877462e-06, "loss": 0.3927, "num_input_tokens_seen": 156265728, "step": 128495 }, { "epoch": 16.100739255732364, "grad_norm": 3.660187005996704, "learning_rate": 1.1139330380223846e-06, "loss": 0.4078, "num_input_tokens_seen": 156271648, "step": 128500 }, { "epoch": 16.101365743641146, "grad_norm": 8.945352554321289, "learning_rate": 1.1135890486182194e-06, "loss": 0.4127, "num_input_tokens_seen": 156277888, "step": 128505 }, { "epoch": 16.101992231549932, "grad_norm": 3.2982866764068604, "learning_rate": 1.1132451056793658e-06, "loss": 0.4273, "num_input_tokens_seen": 156284032, "step": 128510 }, { "epoch": 16.102618719458714, "grad_norm": 10.295903205871582, "learning_rate": 1.1129012092099333e-06, "loss": 0.4419, "num_input_tokens_seen": 156290304, "step": 128515 }, { "epoch": 16.103245207367497, "grad_norm": 3.6192691326141357, "learning_rate": 1.1125573592140354e-06, "loss": 0.504, "num_input_tokens_seen": 156295904, "step": 128520 }, { "epoch": 16.103871695276283, "grad_norm": 6.191572189331055, "learning_rate": 1.112213555695782e-06, "loss": 0.4494, "num_input_tokens_seen": 156302208, "step": 128525 }, { "epoch": 16.104498183185065, "grad_norm": 3.294569253921509, "learning_rate": 1.1118697986592858e-06, "loss": 0.4276, "num_input_tokens_seen": 156308160, "step": 128530 }, { "epoch": 16.105124671093847, "grad_norm": 14.823663711547852, "learning_rate": 1.1115260881086543e-06, "loss": 0.4729, "num_input_tokens_seen": 156314240, "step": 128535 }, { "epoch": 16.105751159002633, "grad_norm": 4.754938125610352, "learning_rate": 1.1111824240479967e-06, "loss": 0.4461, "num_input_tokens_seen": 156320352, "step": 128540 }, { "epoch": 16.106377646911415, "grad_norm": 3.084083080291748, "learning_rate": 1.1108388064814251e-06, "loss": 0.384, "num_input_tokens_seen": 156326432, "step": 128545 }, { "epoch": 16.107004134820198, "grad_norm": 4.8720855712890625, "learning_rate": 1.1104952354130433e-06, "loss": 0.4502, "num_input_tokens_seen": 156332672, "step": 128550 }, { "epoch": 16.10763062272898, "grad_norm": 17.04447364807129, "learning_rate": 1.1101517108469628e-06, "loss": 0.4502, "num_input_tokens_seen": 156338688, "step": 128555 }, { "epoch": 16.108257110637766, "grad_norm": 4.031318187713623, "learning_rate": 1.1098082327872879e-06, "loss": 0.4181, "num_input_tokens_seen": 156344832, "step": 128560 }, { "epoch": 16.108883598546548, "grad_norm": 8.575188636779785, "learning_rate": 1.109464801238127e-06, "loss": 0.4222, "num_input_tokens_seen": 156351008, "step": 128565 }, { "epoch": 16.10951008645533, "grad_norm": 4.365559101104736, "learning_rate": 1.1091214162035847e-06, "loss": 0.419, "num_input_tokens_seen": 156357344, "step": 128570 }, { "epoch": 16.110136574364116, "grad_norm": 3.2928950786590576, "learning_rate": 1.1087780776877677e-06, "loss": 0.4727, "num_input_tokens_seen": 156363456, "step": 128575 }, { "epoch": 16.1107630622729, "grad_norm": 4.672163486480713, "learning_rate": 1.1084347856947798e-06, "loss": 0.4092, "num_input_tokens_seen": 156369344, "step": 128580 }, { "epoch": 16.11138955018168, "grad_norm": 4.573586463928223, "learning_rate": 1.1080915402287267e-06, "loss": 0.427, "num_input_tokens_seen": 156375200, "step": 128585 }, { "epoch": 16.112016038090466, "grad_norm": 2.7332465648651123, "learning_rate": 1.10774834129371e-06, "loss": 0.4519, "num_input_tokens_seen": 156381408, "step": 128590 }, { "epoch": 16.11264252599925, "grad_norm": 4.035784721374512, "learning_rate": 1.1074051888938346e-06, "loss": 0.4029, "num_input_tokens_seen": 156387616, "step": 128595 }, { "epoch": 16.11326901390803, "grad_norm": 3.033392906188965, "learning_rate": 1.107062083033204e-06, "loss": 0.4817, "num_input_tokens_seen": 156393856, "step": 128600 }, { "epoch": 16.113895501816813, "grad_norm": 15.400813102722168, "learning_rate": 1.1067190237159182e-06, "loss": 0.5504, "num_input_tokens_seen": 156399168, "step": 128605 }, { "epoch": 16.1145219897256, "grad_norm": 3.9769413471221924, "learning_rate": 1.1063760109460809e-06, "loss": 0.4391, "num_input_tokens_seen": 156405408, "step": 128610 }, { "epoch": 16.11514847763438, "grad_norm": 11.734834671020508, "learning_rate": 1.1060330447277907e-06, "loss": 0.4675, "num_input_tokens_seen": 156411616, "step": 128615 }, { "epoch": 16.115774965543164, "grad_norm": 2.6127588748931885, "learning_rate": 1.1056901250651486e-06, "loss": 0.4382, "num_input_tokens_seen": 156417280, "step": 128620 }, { "epoch": 16.11640145345195, "grad_norm": 4.416255950927734, "learning_rate": 1.1053472519622576e-06, "loss": 0.5586, "num_input_tokens_seen": 156423328, "step": 128625 }, { "epoch": 16.117027941360732, "grad_norm": 14.847928047180176, "learning_rate": 1.1050044254232129e-06, "loss": 0.566, "num_input_tokens_seen": 156429600, "step": 128630 }, { "epoch": 16.117654429269514, "grad_norm": 2.929218292236328, "learning_rate": 1.1046616454521147e-06, "loss": 0.4308, "num_input_tokens_seen": 156435648, "step": 128635 }, { "epoch": 16.1182809171783, "grad_norm": 4.795868396759033, "learning_rate": 1.1043189120530634e-06, "loss": 0.4784, "num_input_tokens_seen": 156441312, "step": 128640 }, { "epoch": 16.118907405087082, "grad_norm": 7.58711051940918, "learning_rate": 1.1039762252301534e-06, "loss": 0.4349, "num_input_tokens_seen": 156447360, "step": 128645 }, { "epoch": 16.119533892995864, "grad_norm": 11.672511100769043, "learning_rate": 1.1036335849874841e-06, "loss": 0.54, "num_input_tokens_seen": 156453536, "step": 128650 }, { "epoch": 16.12016038090465, "grad_norm": 3.0372493267059326, "learning_rate": 1.1032909913291501e-06, "loss": 0.4312, "num_input_tokens_seen": 156459872, "step": 128655 }, { "epoch": 16.120786868813433, "grad_norm": 4.0127482414245605, "learning_rate": 1.1029484442592487e-06, "loss": 0.4493, "num_input_tokens_seen": 156465824, "step": 128660 }, { "epoch": 16.121413356722215, "grad_norm": 13.780217170715332, "learning_rate": 1.1026059437818765e-06, "loss": 0.4499, "num_input_tokens_seen": 156471360, "step": 128665 }, { "epoch": 16.122039844630997, "grad_norm": 6.162118911743164, "learning_rate": 1.1022634899011253e-06, "loss": 0.4987, "num_input_tokens_seen": 156477344, "step": 128670 }, { "epoch": 16.122666332539783, "grad_norm": 3.5777807235717773, "learning_rate": 1.1019210826210924e-06, "loss": 0.4581, "num_input_tokens_seen": 156483232, "step": 128675 }, { "epoch": 16.123292820448565, "grad_norm": 2.465641498565674, "learning_rate": 1.101578721945869e-06, "loss": 0.4473, "num_input_tokens_seen": 156489024, "step": 128680 }, { "epoch": 16.123919308357348, "grad_norm": 3.6481246948242188, "learning_rate": 1.1012364078795511e-06, "loss": 0.4962, "num_input_tokens_seen": 156494688, "step": 128685 }, { "epoch": 16.124545796266133, "grad_norm": 6.618407726287842, "learning_rate": 1.1008941404262285e-06, "loss": 0.6078, "num_input_tokens_seen": 156501024, "step": 128690 }, { "epoch": 16.125172284174916, "grad_norm": 7.742334365844727, "learning_rate": 1.100551919589996e-06, "loss": 0.5227, "num_input_tokens_seen": 156506880, "step": 128695 }, { "epoch": 16.125798772083698, "grad_norm": 4.048473834991455, "learning_rate": 1.1002097453749423e-06, "loss": 0.4765, "num_input_tokens_seen": 156513088, "step": 128700 }, { "epoch": 16.126425259992484, "grad_norm": 6.614978790283203, "learning_rate": 1.0998676177851614e-06, "loss": 0.4444, "num_input_tokens_seen": 156519232, "step": 128705 }, { "epoch": 16.127051747901266, "grad_norm": 3.4949750900268555, "learning_rate": 1.0995255368247404e-06, "loss": 0.4321, "num_input_tokens_seen": 156525568, "step": 128710 }, { "epoch": 16.12767823581005, "grad_norm": 5.683924674987793, "learning_rate": 1.0991835024977715e-06, "loss": 0.4578, "num_input_tokens_seen": 156531840, "step": 128715 }, { "epoch": 16.12830472371883, "grad_norm": 3.5458555221557617, "learning_rate": 1.0988415148083447e-06, "loss": 0.4352, "num_input_tokens_seen": 156537824, "step": 128720 }, { "epoch": 16.128931211627616, "grad_norm": 3.4599688053131104, "learning_rate": 1.0984995737605458e-06, "loss": 0.4865, "num_input_tokens_seen": 156543328, "step": 128725 }, { "epoch": 16.1295576995364, "grad_norm": 11.74213981628418, "learning_rate": 1.0981576793584648e-06, "loss": 0.4688, "num_input_tokens_seen": 156549568, "step": 128730 }, { "epoch": 16.13018418744518, "grad_norm": 18.291805267333984, "learning_rate": 1.0978158316061893e-06, "loss": 0.4521, "num_input_tokens_seen": 156555616, "step": 128735 }, { "epoch": 16.130810675353967, "grad_norm": 3.2468388080596924, "learning_rate": 1.097474030507808e-06, "loss": 0.4498, "num_input_tokens_seen": 156561824, "step": 128740 }, { "epoch": 16.13143716326275, "grad_norm": 6.68651008605957, "learning_rate": 1.0971322760674042e-06, "loss": 0.5067, "num_input_tokens_seen": 156567872, "step": 128745 }, { "epoch": 16.13206365117153, "grad_norm": 18.311742782592773, "learning_rate": 1.0967905682890662e-06, "loss": 0.5003, "num_input_tokens_seen": 156574400, "step": 128750 }, { "epoch": 16.132690139080317, "grad_norm": 2.466379404067993, "learning_rate": 1.0964489071768781e-06, "loss": 0.4586, "num_input_tokens_seen": 156580608, "step": 128755 }, { "epoch": 16.1333166269891, "grad_norm": 10.114703178405762, "learning_rate": 1.096107292734926e-06, "loss": 0.4313, "num_input_tokens_seen": 156586752, "step": 128760 }, { "epoch": 16.133943114897882, "grad_norm": 2.560088634490967, "learning_rate": 1.095765724967292e-06, "loss": 0.4467, "num_input_tokens_seen": 156593184, "step": 128765 }, { "epoch": 16.134569602806664, "grad_norm": 3.2132387161254883, "learning_rate": 1.0954242038780627e-06, "loss": 0.4276, "num_input_tokens_seen": 156599584, "step": 128770 }, { "epoch": 16.13519609071545, "grad_norm": 3.069683074951172, "learning_rate": 1.095082729471319e-06, "loss": 0.521, "num_input_tokens_seen": 156605632, "step": 128775 }, { "epoch": 16.135822578624232, "grad_norm": 5.547204494476318, "learning_rate": 1.0947413017511438e-06, "loss": 0.3963, "num_input_tokens_seen": 156611872, "step": 128780 }, { "epoch": 16.136449066533014, "grad_norm": 3.6564865112304688, "learning_rate": 1.094399920721621e-06, "loss": 0.4218, "num_input_tokens_seen": 156617504, "step": 128785 }, { "epoch": 16.1370755544418, "grad_norm": 20.80777359008789, "learning_rate": 1.0940585863868297e-06, "loss": 0.4633, "num_input_tokens_seen": 156623520, "step": 128790 }, { "epoch": 16.137702042350583, "grad_norm": 1.9082062244415283, "learning_rate": 1.0937172987508533e-06, "loss": 0.4468, "num_input_tokens_seen": 156629600, "step": 128795 }, { "epoch": 16.138328530259365, "grad_norm": 23.630895614624023, "learning_rate": 1.0933760578177693e-06, "loss": 0.4743, "num_input_tokens_seen": 156636000, "step": 128800 }, { "epoch": 16.13895501816815, "grad_norm": 3.87040114402771, "learning_rate": 1.0930348635916605e-06, "loss": 0.4374, "num_input_tokens_seen": 156641952, "step": 128805 }, { "epoch": 16.139581506076933, "grad_norm": 4.743003845214844, "learning_rate": 1.0926937160766033e-06, "loss": 0.4632, "num_input_tokens_seen": 156648096, "step": 128810 }, { "epoch": 16.140207993985715, "grad_norm": 15.991658210754395, "learning_rate": 1.0923526152766794e-06, "loss": 0.4421, "num_input_tokens_seen": 156654688, "step": 128815 }, { "epoch": 16.1408344818945, "grad_norm": 3.8938939571380615, "learning_rate": 1.0920115611959636e-06, "loss": 0.4933, "num_input_tokens_seen": 156661024, "step": 128820 }, { "epoch": 16.141460969803283, "grad_norm": 2.4779224395751953, "learning_rate": 1.0916705538385358e-06, "loss": 0.4591, "num_input_tokens_seen": 156667232, "step": 128825 }, { "epoch": 16.142087457712066, "grad_norm": 7.701337814331055, "learning_rate": 1.0913295932084734e-06, "loss": 0.4747, "num_input_tokens_seen": 156673376, "step": 128830 }, { "epoch": 16.142713945620848, "grad_norm": 18.442785263061523, "learning_rate": 1.0909886793098507e-06, "loss": 0.5408, "num_input_tokens_seen": 156679456, "step": 128835 }, { "epoch": 16.143340433529634, "grad_norm": 3.015803575515747, "learning_rate": 1.0906478121467444e-06, "loss": 0.4464, "num_input_tokens_seen": 156685600, "step": 128840 }, { "epoch": 16.143966921438416, "grad_norm": 17.08367347717285, "learning_rate": 1.0903069917232313e-06, "loss": 0.4809, "num_input_tokens_seen": 156691584, "step": 128845 }, { "epoch": 16.1445934093472, "grad_norm": 3.1195099353790283, "learning_rate": 1.0899662180433857e-06, "loss": 0.4126, "num_input_tokens_seen": 156697440, "step": 128850 }, { "epoch": 16.145219897255984, "grad_norm": 2.244405508041382, "learning_rate": 1.0896254911112802e-06, "loss": 0.4189, "num_input_tokens_seen": 156703776, "step": 128855 }, { "epoch": 16.145846385164766, "grad_norm": 2.0310349464416504, "learning_rate": 1.0892848109309911e-06, "loss": 0.4039, "num_input_tokens_seen": 156710016, "step": 128860 }, { "epoch": 16.14647287307355, "grad_norm": 9.912060737609863, "learning_rate": 1.0889441775065885e-06, "loss": 0.5373, "num_input_tokens_seen": 156716256, "step": 128865 }, { "epoch": 16.147099360982335, "grad_norm": 6.0652313232421875, "learning_rate": 1.0886035908421476e-06, "loss": 0.5122, "num_input_tokens_seen": 156722272, "step": 128870 }, { "epoch": 16.147725848891117, "grad_norm": 8.827568054199219, "learning_rate": 1.0882630509417385e-06, "loss": 0.5251, "num_input_tokens_seen": 156728864, "step": 128875 }, { "epoch": 16.1483523367999, "grad_norm": 6.717014312744141, "learning_rate": 1.0879225578094348e-06, "loss": 0.4727, "num_input_tokens_seen": 156735200, "step": 128880 }, { "epoch": 16.14897882470868, "grad_norm": 2.73313570022583, "learning_rate": 1.0875821114493046e-06, "loss": 0.4284, "num_input_tokens_seen": 156740864, "step": 128885 }, { "epoch": 16.149605312617467, "grad_norm": 2.3019466400146484, "learning_rate": 1.0872417118654205e-06, "loss": 0.4164, "num_input_tokens_seen": 156747168, "step": 128890 }, { "epoch": 16.15023180052625, "grad_norm": 7.926339149475098, "learning_rate": 1.0869013590618504e-06, "loss": 0.4694, "num_input_tokens_seen": 156753504, "step": 128895 }, { "epoch": 16.150858288435032, "grad_norm": 4.55835485458374, "learning_rate": 1.0865610530426646e-06, "loss": 0.5295, "num_input_tokens_seen": 156759520, "step": 128900 }, { "epoch": 16.151484776343818, "grad_norm": 4.50660514831543, "learning_rate": 1.086220793811933e-06, "loss": 0.4484, "num_input_tokens_seen": 156765536, "step": 128905 }, { "epoch": 16.1521112642526, "grad_norm": 6.6229658126831055, "learning_rate": 1.0858805813737205e-06, "loss": 0.474, "num_input_tokens_seen": 156771776, "step": 128910 }, { "epoch": 16.152737752161382, "grad_norm": 17.852630615234375, "learning_rate": 1.0855404157320986e-06, "loss": 0.5302, "num_input_tokens_seen": 156777920, "step": 128915 }, { "epoch": 16.153364240070168, "grad_norm": 2.6835124492645264, "learning_rate": 1.0852002968911301e-06, "loss": 0.4401, "num_input_tokens_seen": 156783968, "step": 128920 }, { "epoch": 16.15399072797895, "grad_norm": 28.267763137817383, "learning_rate": 1.084860224854885e-06, "loss": 0.5976, "num_input_tokens_seen": 156789952, "step": 128925 }, { "epoch": 16.154617215887733, "grad_norm": 4.05265998840332, "learning_rate": 1.0845201996274264e-06, "loss": 0.4469, "num_input_tokens_seen": 156796000, "step": 128930 }, { "epoch": 16.15524370379652, "grad_norm": 9.663934707641602, "learning_rate": 1.0841802212128205e-06, "loss": 0.4861, "num_input_tokens_seen": 156801824, "step": 128935 }, { "epoch": 16.1558701917053, "grad_norm": 3.004770040512085, "learning_rate": 1.0838402896151323e-06, "loss": 0.4785, "num_input_tokens_seen": 156808256, "step": 128940 }, { "epoch": 16.156496679614083, "grad_norm": 27.52102279663086, "learning_rate": 1.0835004048384278e-06, "loss": 0.5448, "num_input_tokens_seen": 156814592, "step": 128945 }, { "epoch": 16.157123167522865, "grad_norm": 5.493142604827881, "learning_rate": 1.0831605668867668e-06, "loss": 0.4236, "num_input_tokens_seen": 156820832, "step": 128950 }, { "epoch": 16.15774965543165, "grad_norm": 11.621062278747559, "learning_rate": 1.0828207757642161e-06, "loss": 0.4264, "num_input_tokens_seen": 156826752, "step": 128955 }, { "epoch": 16.158376143340433, "grad_norm": 4.617481708526611, "learning_rate": 1.0824810314748347e-06, "loss": 0.4578, "num_input_tokens_seen": 156832512, "step": 128960 }, { "epoch": 16.159002631249216, "grad_norm": 11.077555656433105, "learning_rate": 1.0821413340226867e-06, "loss": 0.4899, "num_input_tokens_seen": 156838624, "step": 128965 }, { "epoch": 16.159629119158, "grad_norm": 2.3280763626098633, "learning_rate": 1.0818016834118338e-06, "loss": 0.4721, "num_input_tokens_seen": 156844512, "step": 128970 }, { "epoch": 16.160255607066784, "grad_norm": 3.389392852783203, "learning_rate": 1.081462079646335e-06, "loss": 0.4115, "num_input_tokens_seen": 156850400, "step": 128975 }, { "epoch": 16.160882094975566, "grad_norm": 3.0717756748199463, "learning_rate": 1.081122522730253e-06, "loss": 0.4416, "num_input_tokens_seen": 156856672, "step": 128980 }, { "epoch": 16.161508582884352, "grad_norm": 2.4614481925964355, "learning_rate": 1.0807830126676444e-06, "loss": 0.4882, "num_input_tokens_seen": 156862464, "step": 128985 }, { "epoch": 16.162135070793134, "grad_norm": 18.471294403076172, "learning_rate": 1.0804435494625714e-06, "loss": 0.4883, "num_input_tokens_seen": 156868768, "step": 128990 }, { "epoch": 16.162761558701916, "grad_norm": 4.662271976470947, "learning_rate": 1.0801041331190904e-06, "loss": 0.4623, "num_input_tokens_seen": 156874624, "step": 128995 }, { "epoch": 16.1633880466107, "grad_norm": 5.523079872131348, "learning_rate": 1.0797647636412607e-06, "loss": 0.4424, "num_input_tokens_seen": 156880928, "step": 129000 }, { "epoch": 16.164014534519485, "grad_norm": 3.2657227516174316, "learning_rate": 1.0794254410331383e-06, "loss": 0.458, "num_input_tokens_seen": 156886848, "step": 129005 }, { "epoch": 16.164641022428267, "grad_norm": 2.313004732131958, "learning_rate": 1.0790861652987823e-06, "loss": 0.4263, "num_input_tokens_seen": 156892768, "step": 129010 }, { "epoch": 16.16526751033705, "grad_norm": 12.53622817993164, "learning_rate": 1.078746936442246e-06, "loss": 0.5119, "num_input_tokens_seen": 156898848, "step": 129015 }, { "epoch": 16.165893998245835, "grad_norm": 3.2968902587890625, "learning_rate": 1.0784077544675885e-06, "loss": 0.4049, "num_input_tokens_seen": 156904800, "step": 129020 }, { "epoch": 16.166520486154617, "grad_norm": 6.653660297393799, "learning_rate": 1.078068619378862e-06, "loss": 0.4331, "num_input_tokens_seen": 156910848, "step": 129025 }, { "epoch": 16.1671469740634, "grad_norm": 8.475212097167969, "learning_rate": 1.0777295311801228e-06, "loss": 0.443, "num_input_tokens_seen": 156917024, "step": 129030 }, { "epoch": 16.167773461972185, "grad_norm": 15.478401184082031, "learning_rate": 1.0773904898754244e-06, "loss": 0.5552, "num_input_tokens_seen": 156923776, "step": 129035 }, { "epoch": 16.168399949880968, "grad_norm": 4.533370494842529, "learning_rate": 1.077051495468822e-06, "loss": 0.4283, "num_input_tokens_seen": 156929888, "step": 129040 }, { "epoch": 16.16902643778975, "grad_norm": 6.220825672149658, "learning_rate": 1.076712547964366e-06, "loss": 0.4666, "num_input_tokens_seen": 156935968, "step": 129045 }, { "epoch": 16.169652925698536, "grad_norm": 2.324781894683838, "learning_rate": 1.07637364736611e-06, "loss": 0.4866, "num_input_tokens_seen": 156942048, "step": 129050 }, { "epoch": 16.170279413607318, "grad_norm": 4.35183572769165, "learning_rate": 1.0760347936781069e-06, "loss": 0.4835, "num_input_tokens_seen": 156947968, "step": 129055 }, { "epoch": 16.1709059015161, "grad_norm": 5.21576452255249, "learning_rate": 1.0756959869044059e-06, "loss": 0.4334, "num_input_tokens_seen": 156954240, "step": 129060 }, { "epoch": 16.171532389424883, "grad_norm": 4.404665470123291, "learning_rate": 1.0753572270490602e-06, "loss": 0.4038, "num_input_tokens_seen": 156960224, "step": 129065 }, { "epoch": 16.17215887733367, "grad_norm": 4.309146404266357, "learning_rate": 1.075018514116117e-06, "loss": 0.4662, "num_input_tokens_seen": 156966112, "step": 129070 }, { "epoch": 16.17278536524245, "grad_norm": 3.323552131652832, "learning_rate": 1.0746798481096287e-06, "loss": 0.4975, "num_input_tokens_seen": 156972576, "step": 129075 }, { "epoch": 16.173411853151233, "grad_norm": 5.764776229858398, "learning_rate": 1.0743412290336414e-06, "loss": 0.4695, "num_input_tokens_seen": 156978240, "step": 129080 }, { "epoch": 16.17403834106002, "grad_norm": 2.418478488922119, "learning_rate": 1.0740026568922058e-06, "loss": 0.4188, "num_input_tokens_seen": 156984672, "step": 129085 }, { "epoch": 16.1746648289688, "grad_norm": 5.204670429229736, "learning_rate": 1.0736641316893703e-06, "loss": 0.4821, "num_input_tokens_seen": 156990912, "step": 129090 }, { "epoch": 16.175291316877583, "grad_norm": 5.025991439819336, "learning_rate": 1.0733256534291797e-06, "loss": 0.4308, "num_input_tokens_seen": 156997056, "step": 129095 }, { "epoch": 16.17591780478637, "grad_norm": 4.2471089363098145, "learning_rate": 1.072987222115684e-06, "loss": 0.5155, "num_input_tokens_seen": 157003328, "step": 129100 }, { "epoch": 16.17654429269515, "grad_norm": 13.162311553955078, "learning_rate": 1.072648837752926e-06, "loss": 0.6148, "num_input_tokens_seen": 157009280, "step": 129105 }, { "epoch": 16.177170780603934, "grad_norm": 20.412199020385742, "learning_rate": 1.0723105003449547e-06, "loss": 0.4411, "num_input_tokens_seen": 157015392, "step": 129110 }, { "epoch": 16.177797268512716, "grad_norm": 9.42132568359375, "learning_rate": 1.0719722098958125e-06, "loss": 0.4634, "num_input_tokens_seen": 157021760, "step": 129115 }, { "epoch": 16.178423756421502, "grad_norm": 4.950297832489014, "learning_rate": 1.0716339664095455e-06, "loss": 0.4277, "num_input_tokens_seen": 157028192, "step": 129120 }, { "epoch": 16.179050244330284, "grad_norm": 2.2431752681732178, "learning_rate": 1.0712957698901965e-06, "loss": 0.4155, "num_input_tokens_seen": 157034400, "step": 129125 }, { "epoch": 16.179676732239066, "grad_norm": 2.480517864227295, "learning_rate": 1.070957620341811e-06, "loss": 0.4734, "num_input_tokens_seen": 157040352, "step": 129130 }, { "epoch": 16.180303220147852, "grad_norm": 3.730409622192383, "learning_rate": 1.0706195177684293e-06, "loss": 0.4677, "num_input_tokens_seen": 157046656, "step": 129135 }, { "epoch": 16.180929708056635, "grad_norm": 4.080391883850098, "learning_rate": 1.0702814621740943e-06, "loss": 0.4628, "num_input_tokens_seen": 157052256, "step": 129140 }, { "epoch": 16.181556195965417, "grad_norm": 4.5793962478637695, "learning_rate": 1.0699434535628488e-06, "loss": 0.4036, "num_input_tokens_seen": 157058400, "step": 129145 }, { "epoch": 16.182182683874203, "grad_norm": 6.110180854797363, "learning_rate": 1.0696054919387344e-06, "loss": 0.4237, "num_input_tokens_seen": 157064544, "step": 129150 }, { "epoch": 16.182809171782985, "grad_norm": 3.386293411254883, "learning_rate": 1.0692675773057898e-06, "loss": 0.399, "num_input_tokens_seen": 157071328, "step": 129155 }, { "epoch": 16.183435659691767, "grad_norm": 13.712431907653809, "learning_rate": 1.0689297096680563e-06, "loss": 0.4531, "num_input_tokens_seen": 157077376, "step": 129160 }, { "epoch": 16.184062147600553, "grad_norm": 5.949748516082764, "learning_rate": 1.0685918890295743e-06, "loss": 0.4012, "num_input_tokens_seen": 157083616, "step": 129165 }, { "epoch": 16.184688635509335, "grad_norm": 4.67910623550415, "learning_rate": 1.0682541153943803e-06, "loss": 0.4244, "num_input_tokens_seen": 157089856, "step": 129170 }, { "epoch": 16.185315123418118, "grad_norm": 3.7982888221740723, "learning_rate": 1.0679163887665156e-06, "loss": 0.4433, "num_input_tokens_seen": 157095808, "step": 129175 }, { "epoch": 16.1859416113269, "grad_norm": 4.331221103668213, "learning_rate": 1.0675787091500145e-06, "loss": 0.5035, "num_input_tokens_seen": 157101696, "step": 129180 }, { "epoch": 16.186568099235686, "grad_norm": 20.50255584716797, "learning_rate": 1.0672410765489176e-06, "loss": 0.4625, "num_input_tokens_seen": 157107136, "step": 129185 }, { "epoch": 16.187194587144468, "grad_norm": 3.622495412826538, "learning_rate": 1.0669034909672594e-06, "loss": 0.4548, "num_input_tokens_seen": 157113088, "step": 129190 }, { "epoch": 16.18782107505325, "grad_norm": 3.9414896965026855, "learning_rate": 1.0665659524090778e-06, "loss": 0.4121, "num_input_tokens_seen": 157119072, "step": 129195 }, { "epoch": 16.188447562962036, "grad_norm": 2.5962226390838623, "learning_rate": 1.0662284608784057e-06, "loss": 0.3996, "num_input_tokens_seen": 157124672, "step": 129200 }, { "epoch": 16.18907405087082, "grad_norm": 7.331752300262451, "learning_rate": 1.0658910163792807e-06, "loss": 0.4533, "num_input_tokens_seen": 157130976, "step": 129205 }, { "epoch": 16.1897005387796, "grad_norm": 5.900221824645996, "learning_rate": 1.0655536189157357e-06, "loss": 0.4064, "num_input_tokens_seen": 157137344, "step": 129210 }, { "epoch": 16.190327026688387, "grad_norm": 9.6556978225708, "learning_rate": 1.0652162684918044e-06, "loss": 0.5187, "num_input_tokens_seen": 157143456, "step": 129215 }, { "epoch": 16.19095351459717, "grad_norm": 5.437034606933594, "learning_rate": 1.064878965111522e-06, "loss": 0.4306, "num_input_tokens_seen": 157149440, "step": 129220 }, { "epoch": 16.19158000250595, "grad_norm": 4.31026554107666, "learning_rate": 1.0645417087789195e-06, "loss": 0.4506, "num_input_tokens_seen": 157155776, "step": 129225 }, { "epoch": 16.192206490414733, "grad_norm": 9.016221046447754, "learning_rate": 1.0642044994980304e-06, "loss": 0.4312, "num_input_tokens_seen": 157161760, "step": 129230 }, { "epoch": 16.19283297832352, "grad_norm": 16.03314781188965, "learning_rate": 1.0638673372728847e-06, "loss": 0.5098, "num_input_tokens_seen": 157167776, "step": 129235 }, { "epoch": 16.1934594662323, "grad_norm": 3.7379260063171387, "learning_rate": 1.063530222107514e-06, "loss": 0.459, "num_input_tokens_seen": 157173824, "step": 129240 }, { "epoch": 16.194085954141084, "grad_norm": 5.829891204833984, "learning_rate": 1.0631931540059498e-06, "loss": 0.4038, "num_input_tokens_seen": 157180192, "step": 129245 }, { "epoch": 16.19471244204987, "grad_norm": 16.275482177734375, "learning_rate": 1.062856132972221e-06, "loss": 0.4796, "num_input_tokens_seen": 157186432, "step": 129250 }, { "epoch": 16.195338929958652, "grad_norm": 15.37474250793457, "learning_rate": 1.0625191590103568e-06, "loss": 0.5148, "num_input_tokens_seen": 157192320, "step": 129255 }, { "epoch": 16.195965417867434, "grad_norm": 4.603282928466797, "learning_rate": 1.062182232124388e-06, "loss": 0.4216, "num_input_tokens_seen": 157198976, "step": 129260 }, { "epoch": 16.19659190577622, "grad_norm": 3.558823585510254, "learning_rate": 1.0618453523183397e-06, "loss": 0.4595, "num_input_tokens_seen": 157205152, "step": 129265 }, { "epoch": 16.197218393685002, "grad_norm": 3.6136765480041504, "learning_rate": 1.0615085195962426e-06, "loss": 0.4584, "num_input_tokens_seen": 157211136, "step": 129270 }, { "epoch": 16.197844881593785, "grad_norm": 2.030515432357788, "learning_rate": 1.0611717339621214e-06, "loss": 0.3966, "num_input_tokens_seen": 157217408, "step": 129275 }, { "epoch": 16.19847136950257, "grad_norm": 16.215848922729492, "learning_rate": 1.0608349954200037e-06, "loss": 0.4615, "num_input_tokens_seen": 157223552, "step": 129280 }, { "epoch": 16.199097857411353, "grad_norm": 7.320559978485107, "learning_rate": 1.060498303973917e-06, "loss": 0.4704, "num_input_tokens_seen": 157230016, "step": 129285 }, { "epoch": 16.199724345320135, "grad_norm": 3.305431604385376, "learning_rate": 1.0601616596278835e-06, "loss": 0.4247, "num_input_tokens_seen": 157236032, "step": 129290 }, { "epoch": 16.200350833228917, "grad_norm": 4.328920841217041, "learning_rate": 1.0598250623859314e-06, "loss": 0.4442, "num_input_tokens_seen": 157242240, "step": 129295 }, { "epoch": 16.200977321137703, "grad_norm": 3.727795362472534, "learning_rate": 1.0594885122520815e-06, "loss": 0.4604, "num_input_tokens_seen": 157248192, "step": 129300 }, { "epoch": 16.201603809046485, "grad_norm": 4.209309101104736, "learning_rate": 1.0591520092303614e-06, "loss": 0.422, "num_input_tokens_seen": 157254464, "step": 129305 }, { "epoch": 16.202230296955268, "grad_norm": 6.82912540435791, "learning_rate": 1.0588155533247907e-06, "loss": 0.525, "num_input_tokens_seen": 157260320, "step": 129310 }, { "epoch": 16.202856784864053, "grad_norm": 10.025436401367188, "learning_rate": 1.0584791445393949e-06, "loss": 0.4846, "num_input_tokens_seen": 157266528, "step": 129315 }, { "epoch": 16.203483272772836, "grad_norm": 14.786689758300781, "learning_rate": 1.0581427828781937e-06, "loss": 0.5046, "num_input_tokens_seen": 157272640, "step": 129320 }, { "epoch": 16.204109760681618, "grad_norm": 17.282672882080078, "learning_rate": 1.057806468345211e-06, "loss": 0.4889, "num_input_tokens_seen": 157279072, "step": 129325 }, { "epoch": 16.204736248590404, "grad_norm": 5.168534755706787, "learning_rate": 1.0574702009444648e-06, "loss": 0.42, "num_input_tokens_seen": 157285120, "step": 129330 }, { "epoch": 16.205362736499186, "grad_norm": 6.2852983474731445, "learning_rate": 1.0571339806799773e-06, "loss": 0.4322, "num_input_tokens_seen": 157291552, "step": 129335 }, { "epoch": 16.20598922440797, "grad_norm": 2.241325616836548, "learning_rate": 1.0567978075557694e-06, "loss": 0.4387, "num_input_tokens_seen": 157297248, "step": 129340 }, { "epoch": 16.20661571231675, "grad_norm": 3.1119794845581055, "learning_rate": 1.0564616815758572e-06, "loss": 0.4639, "num_input_tokens_seen": 157303648, "step": 129345 }, { "epoch": 16.207242200225537, "grad_norm": 6.15377140045166, "learning_rate": 1.0561256027442619e-06, "loss": 0.4996, "num_input_tokens_seen": 157309504, "step": 129350 }, { "epoch": 16.20786868813432, "grad_norm": 4.688692092895508, "learning_rate": 1.0557895710650001e-06, "loss": 0.426, "num_input_tokens_seen": 157315936, "step": 129355 }, { "epoch": 16.2084951760431, "grad_norm": 4.9596381187438965, "learning_rate": 1.0554535865420916e-06, "loss": 0.4639, "num_input_tokens_seen": 157321568, "step": 129360 }, { "epoch": 16.209121663951887, "grad_norm": 17.20773696899414, "learning_rate": 1.0551176491795507e-06, "loss": 0.5138, "num_input_tokens_seen": 157327776, "step": 129365 }, { "epoch": 16.20974815186067, "grad_norm": 3.4650027751922607, "learning_rate": 1.0547817589813962e-06, "loss": 0.4197, "num_input_tokens_seen": 157333504, "step": 129370 }, { "epoch": 16.21037463976945, "grad_norm": 3.2647173404693604, "learning_rate": 1.0544459159516413e-06, "loss": 0.4056, "num_input_tokens_seen": 157339648, "step": 129375 }, { "epoch": 16.211001127678237, "grad_norm": 4.319361209869385, "learning_rate": 1.0541101200943043e-06, "loss": 0.4517, "num_input_tokens_seen": 157345600, "step": 129380 }, { "epoch": 16.21162761558702, "grad_norm": 28.11903190612793, "learning_rate": 1.0537743714133968e-06, "loss": 0.5742, "num_input_tokens_seen": 157351648, "step": 129385 }, { "epoch": 16.212254103495802, "grad_norm": 5.856790542602539, "learning_rate": 1.0534386699129363e-06, "loss": 0.4158, "num_input_tokens_seen": 157357440, "step": 129390 }, { "epoch": 16.212880591404584, "grad_norm": 4.272249221801758, "learning_rate": 1.0531030155969324e-06, "loss": 0.4397, "num_input_tokens_seen": 157363360, "step": 129395 }, { "epoch": 16.21350707931337, "grad_norm": 15.540141105651855, "learning_rate": 1.0527674084694007e-06, "loss": 0.5059, "num_input_tokens_seen": 157369536, "step": 129400 }, { "epoch": 16.214133567222152, "grad_norm": 3.067871570587158, "learning_rate": 1.052431848534355e-06, "loss": 0.4173, "num_input_tokens_seen": 157375776, "step": 129405 }, { "epoch": 16.214760055130935, "grad_norm": 3.487607479095459, "learning_rate": 1.0520963357958035e-06, "loss": 0.4429, "num_input_tokens_seen": 157381856, "step": 129410 }, { "epoch": 16.21538654303972, "grad_norm": 3.6948564052581787, "learning_rate": 1.0517608702577614e-06, "loss": 0.4539, "num_input_tokens_seen": 157387744, "step": 129415 }, { "epoch": 16.216013030948503, "grad_norm": 3.5694258213043213, "learning_rate": 1.051425451924236e-06, "loss": 0.4074, "num_input_tokens_seen": 157393984, "step": 129420 }, { "epoch": 16.216639518857285, "grad_norm": 4.702886581420898, "learning_rate": 1.0510900807992408e-06, "loss": 0.4435, "num_input_tokens_seen": 157399712, "step": 129425 }, { "epoch": 16.21726600676607, "grad_norm": 20.531049728393555, "learning_rate": 1.050754756886782e-06, "loss": 0.4812, "num_input_tokens_seen": 157405664, "step": 129430 }, { "epoch": 16.217892494674853, "grad_norm": 15.55964469909668, "learning_rate": 1.0504194801908718e-06, "loss": 0.4704, "num_input_tokens_seen": 157411744, "step": 129435 }, { "epoch": 16.218518982583635, "grad_norm": 9.9736909866333, "learning_rate": 1.0500842507155162e-06, "loss": 0.4363, "num_input_tokens_seen": 157417792, "step": 129440 }, { "epoch": 16.21914547049242, "grad_norm": 14.942025184631348, "learning_rate": 1.0497490684647238e-06, "loss": 0.4518, "num_input_tokens_seen": 157424000, "step": 129445 }, { "epoch": 16.219771958401203, "grad_norm": 7.562455654144287, "learning_rate": 1.049413933442504e-06, "loss": 0.4917, "num_input_tokens_seen": 157429504, "step": 129450 }, { "epoch": 16.220398446309986, "grad_norm": 8.774164199829102, "learning_rate": 1.049078845652861e-06, "loss": 0.5137, "num_input_tokens_seen": 157435840, "step": 129455 }, { "epoch": 16.221024934218768, "grad_norm": 14.391460418701172, "learning_rate": 1.0487438050998023e-06, "loss": 0.4831, "num_input_tokens_seen": 157442080, "step": 129460 }, { "epoch": 16.221651422127554, "grad_norm": 3.287914276123047, "learning_rate": 1.048408811787333e-06, "loss": 0.4587, "num_input_tokens_seen": 157448256, "step": 129465 }, { "epoch": 16.222277910036336, "grad_norm": 9.42112922668457, "learning_rate": 1.0480738657194606e-06, "loss": 0.5007, "num_input_tokens_seen": 157454208, "step": 129470 }, { "epoch": 16.22290439794512, "grad_norm": 4.823591232299805, "learning_rate": 1.0477389669001858e-06, "loss": 0.3992, "num_input_tokens_seen": 157460320, "step": 129475 }, { "epoch": 16.223530885853904, "grad_norm": 7.678213119506836, "learning_rate": 1.0474041153335162e-06, "loss": 0.428, "num_input_tokens_seen": 157466528, "step": 129480 }, { "epoch": 16.224157373762687, "grad_norm": 4.114541053771973, "learning_rate": 1.047069311023452e-06, "loss": 0.3908, "num_input_tokens_seen": 157472416, "step": 129485 }, { "epoch": 16.22478386167147, "grad_norm": 13.32023811340332, "learning_rate": 1.0467345539739993e-06, "loss": 0.4492, "num_input_tokens_seen": 157478592, "step": 129490 }, { "epoch": 16.225410349580255, "grad_norm": 5.623045444488525, "learning_rate": 1.046399844189157e-06, "loss": 0.4502, "num_input_tokens_seen": 157484480, "step": 129495 }, { "epoch": 16.226036837489037, "grad_norm": 13.387317657470703, "learning_rate": 1.04606518167293e-06, "loss": 0.5508, "num_input_tokens_seen": 157490912, "step": 129500 }, { "epoch": 16.22666332539782, "grad_norm": 30.07042121887207, "learning_rate": 1.0457305664293167e-06, "loss": 0.4854, "num_input_tokens_seen": 157497376, "step": 129505 }, { "epoch": 16.2272898133066, "grad_norm": 5.262478351593018, "learning_rate": 1.0453959984623208e-06, "loss": 0.409, "num_input_tokens_seen": 157503360, "step": 129510 }, { "epoch": 16.227916301215387, "grad_norm": 3.9828147888183594, "learning_rate": 1.0450614777759388e-06, "loss": 0.4561, "num_input_tokens_seen": 157509280, "step": 129515 }, { "epoch": 16.22854278912417, "grad_norm": 3.4572532176971436, "learning_rate": 1.044727004374172e-06, "loss": 0.411, "num_input_tokens_seen": 157515296, "step": 129520 }, { "epoch": 16.229169277032952, "grad_norm": 7.775263786315918, "learning_rate": 1.0443925782610203e-06, "loss": 0.4524, "num_input_tokens_seen": 157521216, "step": 129525 }, { "epoch": 16.229795764941738, "grad_norm": 3.8715224266052246, "learning_rate": 1.0440581994404802e-06, "loss": 0.4277, "num_input_tokens_seen": 157527680, "step": 129530 }, { "epoch": 16.23042225285052, "grad_norm": 4.204778671264648, "learning_rate": 1.0437238679165518e-06, "loss": 0.4696, "num_input_tokens_seen": 157534016, "step": 129535 }, { "epoch": 16.231048740759302, "grad_norm": 7.8687238693237305, "learning_rate": 1.043389583693229e-06, "loss": 0.4079, "num_input_tokens_seen": 157540672, "step": 129540 }, { "epoch": 16.231675228668088, "grad_norm": 8.817365646362305, "learning_rate": 1.043055346774512e-06, "loss": 0.4448, "num_input_tokens_seen": 157546656, "step": 129545 }, { "epoch": 16.23230171657687, "grad_norm": 3.0382864475250244, "learning_rate": 1.0427211571643936e-06, "loss": 0.501, "num_input_tokens_seen": 157552576, "step": 129550 }, { "epoch": 16.232928204485653, "grad_norm": 3.469223737716675, "learning_rate": 1.0423870148668707e-06, "loss": 0.493, "num_input_tokens_seen": 157558944, "step": 129555 }, { "epoch": 16.23355469239444, "grad_norm": 4.013027667999268, "learning_rate": 1.0420529198859386e-06, "loss": 0.4034, "num_input_tokens_seen": 157564896, "step": 129560 }, { "epoch": 16.23418118030322, "grad_norm": 10.015584945678711, "learning_rate": 1.0417188722255928e-06, "loss": 0.4256, "num_input_tokens_seen": 157571328, "step": 129565 }, { "epoch": 16.234807668212003, "grad_norm": 2.685633897781372, "learning_rate": 1.0413848718898246e-06, "loss": 0.4899, "num_input_tokens_seen": 157577152, "step": 129570 }, { "epoch": 16.235434156120785, "grad_norm": 3.730374574661255, "learning_rate": 1.0410509188826296e-06, "loss": 0.5711, "num_input_tokens_seen": 157583424, "step": 129575 }, { "epoch": 16.23606064402957, "grad_norm": 4.432338714599609, "learning_rate": 1.040717013207998e-06, "loss": 0.4293, "num_input_tokens_seen": 157589760, "step": 129580 }, { "epoch": 16.236687131938353, "grad_norm": 3.246574878692627, "learning_rate": 1.0403831548699233e-06, "loss": 0.4147, "num_input_tokens_seen": 157596096, "step": 129585 }, { "epoch": 16.237313619847136, "grad_norm": 1.9872095584869385, "learning_rate": 1.0400493438723986e-06, "loss": 0.4161, "num_input_tokens_seen": 157602240, "step": 129590 }, { "epoch": 16.23794010775592, "grad_norm": 9.875322341918945, "learning_rate": 1.039715580219412e-06, "loss": 0.4476, "num_input_tokens_seen": 157608352, "step": 129595 }, { "epoch": 16.238566595664704, "grad_norm": 3.328404188156128, "learning_rate": 1.039381863914956e-06, "loss": 0.5002, "num_input_tokens_seen": 157614656, "step": 129600 }, { "epoch": 16.239193083573486, "grad_norm": 2.5568251609802246, "learning_rate": 1.0390481949630193e-06, "loss": 0.4497, "num_input_tokens_seen": 157620096, "step": 129605 }, { "epoch": 16.239819571482272, "grad_norm": 11.969171524047852, "learning_rate": 1.0387145733675923e-06, "loss": 0.4937, "num_input_tokens_seen": 157626272, "step": 129610 }, { "epoch": 16.240446059391054, "grad_norm": 4.204505443572998, "learning_rate": 1.0383809991326622e-06, "loss": 0.4546, "num_input_tokens_seen": 157632384, "step": 129615 }, { "epoch": 16.241072547299837, "grad_norm": 3.190011739730835, "learning_rate": 1.038047472262219e-06, "loss": 0.4649, "num_input_tokens_seen": 157638784, "step": 129620 }, { "epoch": 16.24169903520862, "grad_norm": 15.619592666625977, "learning_rate": 1.037713992760248e-06, "loss": 0.4939, "num_input_tokens_seen": 157644352, "step": 129625 }, { "epoch": 16.242325523117405, "grad_norm": 5.259335994720459, "learning_rate": 1.037380560630739e-06, "loss": 0.4237, "num_input_tokens_seen": 157650528, "step": 129630 }, { "epoch": 16.242952011026187, "grad_norm": 8.536702156066895, "learning_rate": 1.0370471758776757e-06, "loss": 0.4494, "num_input_tokens_seen": 157656864, "step": 129635 }, { "epoch": 16.24357849893497, "grad_norm": 16.436071395874023, "learning_rate": 1.0367138385050463e-06, "loss": 0.4418, "num_input_tokens_seen": 157663008, "step": 129640 }, { "epoch": 16.244204986843755, "grad_norm": 15.119982719421387, "learning_rate": 1.0363805485168338e-06, "loss": 0.4591, "num_input_tokens_seen": 157669536, "step": 129645 }, { "epoch": 16.244831474752537, "grad_norm": 24.246835708618164, "learning_rate": 1.0360473059170244e-06, "loss": 0.4963, "num_input_tokens_seen": 157675648, "step": 129650 }, { "epoch": 16.24545796266132, "grad_norm": 6.68393611907959, "learning_rate": 1.0357141107096019e-06, "loss": 0.4898, "num_input_tokens_seen": 157681248, "step": 129655 }, { "epoch": 16.246084450570105, "grad_norm": 3.167867660522461, "learning_rate": 1.0353809628985523e-06, "loss": 0.5144, "num_input_tokens_seen": 157687424, "step": 129660 }, { "epoch": 16.246710938478888, "grad_norm": 5.380507469177246, "learning_rate": 1.0350478624878546e-06, "loss": 0.3924, "num_input_tokens_seen": 157693440, "step": 129665 }, { "epoch": 16.24733742638767, "grad_norm": 3.4229676723480225, "learning_rate": 1.034714809481493e-06, "loss": 0.5605, "num_input_tokens_seen": 157699200, "step": 129670 }, { "epoch": 16.247963914296456, "grad_norm": 3.8315343856811523, "learning_rate": 1.0343818038834513e-06, "loss": 0.4587, "num_input_tokens_seen": 157705504, "step": 129675 }, { "epoch": 16.248590402205238, "grad_norm": 4.705244541168213, "learning_rate": 1.0340488456977083e-06, "loss": 0.3941, "num_input_tokens_seen": 157711584, "step": 129680 }, { "epoch": 16.24921689011402, "grad_norm": 5.252212047576904, "learning_rate": 1.0337159349282466e-06, "loss": 0.481, "num_input_tokens_seen": 157717504, "step": 129685 }, { "epoch": 16.249843378022803, "grad_norm": 3.5008957386016846, "learning_rate": 1.0333830715790445e-06, "loss": 0.3848, "num_input_tokens_seen": 157723680, "step": 129690 }, { "epoch": 16.25046986593159, "grad_norm": 3.1053335666656494, "learning_rate": 1.033050255654084e-06, "loss": 0.4696, "num_input_tokens_seen": 157729600, "step": 129695 }, { "epoch": 16.25109635384037, "grad_norm": 3.5962324142456055, "learning_rate": 1.0327174871573413e-06, "loss": 0.3988, "num_input_tokens_seen": 157735840, "step": 129700 }, { "epoch": 16.251722841749153, "grad_norm": 12.054729461669922, "learning_rate": 1.0323847660927972e-06, "loss": 0.4693, "num_input_tokens_seen": 157742208, "step": 129705 }, { "epoch": 16.25234932965794, "grad_norm": 3.905710458755493, "learning_rate": 1.0320520924644295e-06, "loss": 0.5008, "num_input_tokens_seen": 157748384, "step": 129710 }, { "epoch": 16.25297581756672, "grad_norm": 3.4460155963897705, "learning_rate": 1.0317194662762142e-06, "loss": 0.5213, "num_input_tokens_seen": 157754432, "step": 129715 }, { "epoch": 16.253602305475503, "grad_norm": 8.052717208862305, "learning_rate": 1.0313868875321308e-06, "loss": 0.4194, "num_input_tokens_seen": 157760032, "step": 129720 }, { "epoch": 16.25422879338429, "grad_norm": 6.6484832763671875, "learning_rate": 1.031054356236152e-06, "loss": 0.4405, "num_input_tokens_seen": 157765984, "step": 129725 }, { "epoch": 16.25485528129307, "grad_norm": 7.215700149536133, "learning_rate": 1.0307218723922564e-06, "loss": 0.543, "num_input_tokens_seen": 157772000, "step": 129730 }, { "epoch": 16.255481769201854, "grad_norm": 9.967738151550293, "learning_rate": 1.030389436004417e-06, "loss": 0.4437, "num_input_tokens_seen": 157778240, "step": 129735 }, { "epoch": 16.256108257110636, "grad_norm": 20.17255401611328, "learning_rate": 1.0300570470766108e-06, "loss": 0.5714, "num_input_tokens_seen": 157784672, "step": 129740 }, { "epoch": 16.256734745019422, "grad_norm": 15.668776512145996, "learning_rate": 1.0297247056128084e-06, "loss": 0.5453, "num_input_tokens_seen": 157790624, "step": 129745 }, { "epoch": 16.257361232928204, "grad_norm": 5.704115390777588, "learning_rate": 1.0293924116169868e-06, "loss": 0.4286, "num_input_tokens_seen": 157796608, "step": 129750 }, { "epoch": 16.257987720836987, "grad_norm": 2.8333323001861572, "learning_rate": 1.0290601650931165e-06, "loss": 0.4132, "num_input_tokens_seen": 157802720, "step": 129755 }, { "epoch": 16.258614208745772, "grad_norm": 2.6612203121185303, "learning_rate": 1.02872796604517e-06, "loss": 0.4433, "num_input_tokens_seen": 157808832, "step": 129760 }, { "epoch": 16.259240696654555, "grad_norm": 4.953832149505615, "learning_rate": 1.0283958144771195e-06, "loss": 0.4589, "num_input_tokens_seen": 157814624, "step": 129765 }, { "epoch": 16.259867184563337, "grad_norm": 5.965444087982178, "learning_rate": 1.0280637103929359e-06, "loss": 0.4946, "num_input_tokens_seen": 157820960, "step": 129770 }, { "epoch": 16.260493672472123, "grad_norm": 7.770020961761475, "learning_rate": 1.027731653796592e-06, "loss": 0.4197, "num_input_tokens_seen": 157826560, "step": 129775 }, { "epoch": 16.261120160380905, "grad_norm": 15.955860137939453, "learning_rate": 1.0273996446920542e-06, "loss": 0.4554, "num_input_tokens_seen": 157832064, "step": 129780 }, { "epoch": 16.261746648289687, "grad_norm": 4.864817142486572, "learning_rate": 1.027067683083295e-06, "loss": 0.448, "num_input_tokens_seen": 157838400, "step": 129785 }, { "epoch": 16.26237313619847, "grad_norm": 19.448261260986328, "learning_rate": 1.0267357689742802e-06, "loss": 0.5196, "num_input_tokens_seen": 157844352, "step": 129790 }, { "epoch": 16.262999624107255, "grad_norm": 6.296407222747803, "learning_rate": 1.0264039023689815e-06, "loss": 0.4768, "num_input_tokens_seen": 157850304, "step": 129795 }, { "epoch": 16.263626112016038, "grad_norm": 3.8153765201568604, "learning_rate": 1.0260720832713638e-06, "loss": 0.4441, "num_input_tokens_seen": 157856128, "step": 129800 }, { "epoch": 16.26425259992482, "grad_norm": 12.829282760620117, "learning_rate": 1.0257403116853964e-06, "loss": 0.4753, "num_input_tokens_seen": 157862304, "step": 129805 }, { "epoch": 16.264879087833606, "grad_norm": 3.3966572284698486, "learning_rate": 1.0254085876150443e-06, "loss": 0.4063, "num_input_tokens_seen": 157868320, "step": 129810 }, { "epoch": 16.265505575742388, "grad_norm": 15.130634307861328, "learning_rate": 1.025076911064275e-06, "loss": 0.5615, "num_input_tokens_seen": 157874880, "step": 129815 }, { "epoch": 16.26613206365117, "grad_norm": 4.649120807647705, "learning_rate": 1.0247452820370518e-06, "loss": 0.496, "num_input_tokens_seen": 157880512, "step": 129820 }, { "epoch": 16.266758551559956, "grad_norm": 11.792082786560059, "learning_rate": 1.0244137005373423e-06, "loss": 0.4426, "num_input_tokens_seen": 157886944, "step": 129825 }, { "epoch": 16.26738503946874, "grad_norm": 3.141939401626587, "learning_rate": 1.024082166569108e-06, "loss": 0.4742, "num_input_tokens_seen": 157893536, "step": 129830 }, { "epoch": 16.26801152737752, "grad_norm": 3.5152640342712402, "learning_rate": 1.0237506801363146e-06, "loss": 0.4891, "num_input_tokens_seen": 157899584, "step": 129835 }, { "epoch": 16.268638015286307, "grad_norm": 3.116429090499878, "learning_rate": 1.0234192412429261e-06, "loss": 0.5198, "num_input_tokens_seen": 157905664, "step": 129840 }, { "epoch": 16.26926450319509, "grad_norm": 4.222991466522217, "learning_rate": 1.0230878498929026e-06, "loss": 0.44, "num_input_tokens_seen": 157911840, "step": 129845 }, { "epoch": 16.26989099110387, "grad_norm": 6.187704086303711, "learning_rate": 1.0227565060902083e-06, "loss": 0.4582, "num_input_tokens_seen": 157917824, "step": 129850 }, { "epoch": 16.270517479012653, "grad_norm": 8.172322273254395, "learning_rate": 1.0224252098388032e-06, "loss": 0.4409, "num_input_tokens_seen": 157923680, "step": 129855 }, { "epoch": 16.27114396692144, "grad_norm": 4.052621841430664, "learning_rate": 1.0220939611426483e-06, "loss": 0.4201, "num_input_tokens_seen": 157929792, "step": 129860 }, { "epoch": 16.27177045483022, "grad_norm": 16.91624641418457, "learning_rate": 1.0217627600057062e-06, "loss": 0.4967, "num_input_tokens_seen": 157936032, "step": 129865 }, { "epoch": 16.272396942739004, "grad_norm": 4.325881481170654, "learning_rate": 1.021431606431934e-06, "loss": 0.4214, "num_input_tokens_seen": 157941952, "step": 129870 }, { "epoch": 16.27302343064779, "grad_norm": 3.39345121383667, "learning_rate": 1.0211005004252916e-06, "loss": 0.443, "num_input_tokens_seen": 157948288, "step": 129875 }, { "epoch": 16.273649918556572, "grad_norm": 3.4545910358428955, "learning_rate": 1.020769441989739e-06, "loss": 0.4026, "num_input_tokens_seen": 157954656, "step": 129880 }, { "epoch": 16.274276406465354, "grad_norm": 13.742920875549316, "learning_rate": 1.0204384311292326e-06, "loss": 0.4609, "num_input_tokens_seen": 157960288, "step": 129885 }, { "epoch": 16.27490289437414, "grad_norm": 5.698444843292236, "learning_rate": 1.0201074678477301e-06, "loss": 0.4936, "num_input_tokens_seen": 157966688, "step": 129890 }, { "epoch": 16.275529382282922, "grad_norm": 7.85064172744751, "learning_rate": 1.0197765521491903e-06, "loss": 0.4353, "num_input_tokens_seen": 157972832, "step": 129895 }, { "epoch": 16.276155870191705, "grad_norm": 3.360413074493408, "learning_rate": 1.0194456840375672e-06, "loss": 0.42, "num_input_tokens_seen": 157978944, "step": 129900 }, { "epoch": 16.27678235810049, "grad_norm": 3.9599640369415283, "learning_rate": 1.0191148635168186e-06, "loss": 0.4385, "num_input_tokens_seen": 157984416, "step": 129905 }, { "epoch": 16.277408846009273, "grad_norm": 3.8774290084838867, "learning_rate": 1.0187840905908974e-06, "loss": 0.4035, "num_input_tokens_seen": 157990560, "step": 129910 }, { "epoch": 16.278035333918055, "grad_norm": 9.969260215759277, "learning_rate": 1.0184533652637612e-06, "loss": 0.4761, "num_input_tokens_seen": 157996512, "step": 129915 }, { "epoch": 16.278661821826837, "grad_norm": 3.45723819732666, "learning_rate": 1.0181226875393608e-06, "loss": 0.4135, "num_input_tokens_seen": 158002752, "step": 129920 }, { "epoch": 16.279288309735623, "grad_norm": 11.706340789794922, "learning_rate": 1.0177920574216531e-06, "loss": 0.5602, "num_input_tokens_seen": 158009344, "step": 129925 }, { "epoch": 16.279914797644405, "grad_norm": 8.981756210327148, "learning_rate": 1.0174614749145879e-06, "loss": 0.4207, "num_input_tokens_seen": 158015392, "step": 129930 }, { "epoch": 16.280541285553188, "grad_norm": 4.081550121307373, "learning_rate": 1.0171309400221207e-06, "loss": 0.4498, "num_input_tokens_seen": 158021504, "step": 129935 }, { "epoch": 16.281167773461974, "grad_norm": 3.9908483028411865, "learning_rate": 1.0168004527482e-06, "loss": 0.397, "num_input_tokens_seen": 158027776, "step": 129940 }, { "epoch": 16.281794261370756, "grad_norm": 8.937273979187012, "learning_rate": 1.0164700130967808e-06, "loss": 0.4593, "num_input_tokens_seen": 158033888, "step": 129945 }, { "epoch": 16.282420749279538, "grad_norm": 3.291106700897217, "learning_rate": 1.01613962107181e-06, "loss": 0.4144, "num_input_tokens_seen": 158040160, "step": 129950 }, { "epoch": 16.283047237188324, "grad_norm": 6.227025032043457, "learning_rate": 1.0158092766772393e-06, "loss": 0.534, "num_input_tokens_seen": 158046272, "step": 129955 }, { "epoch": 16.283673725097106, "grad_norm": 6.378763198852539, "learning_rate": 1.01547897991702e-06, "loss": 0.3912, "num_input_tokens_seen": 158052448, "step": 129960 }, { "epoch": 16.28430021300589, "grad_norm": 2.6195690631866455, "learning_rate": 1.015148730795098e-06, "loss": 0.4749, "num_input_tokens_seen": 158058752, "step": 129965 }, { "epoch": 16.28492670091467, "grad_norm": 4.533618450164795, "learning_rate": 1.0148185293154234e-06, "loss": 0.4745, "num_input_tokens_seen": 158064704, "step": 129970 }, { "epoch": 16.285553188823457, "grad_norm": 4.310634136199951, "learning_rate": 1.0144883754819434e-06, "loss": 0.413, "num_input_tokens_seen": 158070880, "step": 129975 }, { "epoch": 16.28617967673224, "grad_norm": 3.4321327209472656, "learning_rate": 1.0141582692986074e-06, "loss": 0.4436, "num_input_tokens_seen": 158076160, "step": 129980 }, { "epoch": 16.28680616464102, "grad_norm": 10.0723295211792, "learning_rate": 1.0138282107693592e-06, "loss": 0.4975, "num_input_tokens_seen": 158082592, "step": 129985 }, { "epoch": 16.287432652549807, "grad_norm": 8.873066902160645, "learning_rate": 1.013498199898147e-06, "loss": 0.5098, "num_input_tokens_seen": 158089056, "step": 129990 }, { "epoch": 16.28805914045859, "grad_norm": 3.526505708694458, "learning_rate": 1.0131682366889144e-06, "loss": 0.4949, "num_input_tokens_seen": 158094912, "step": 129995 }, { "epoch": 16.28868562836737, "grad_norm": 3.926321029663086, "learning_rate": 1.0128383211456088e-06, "loss": 0.4943, "num_input_tokens_seen": 158100704, "step": 130000 }, { "epoch": 16.289312116276157, "grad_norm": 8.007303237915039, "learning_rate": 1.0125084532721723e-06, "loss": 0.4239, "num_input_tokens_seen": 158106400, "step": 130005 }, { "epoch": 16.28993860418494, "grad_norm": 3.0349009037017822, "learning_rate": 1.0121786330725508e-06, "loss": 0.4576, "num_input_tokens_seen": 158111936, "step": 130010 }, { "epoch": 16.290565092093722, "grad_norm": 3.6917595863342285, "learning_rate": 1.011848860550685e-06, "loss": 0.4306, "num_input_tokens_seen": 158117888, "step": 130015 }, { "epoch": 16.291191580002504, "grad_norm": 5.58454704284668, "learning_rate": 1.0115191357105192e-06, "loss": 0.4784, "num_input_tokens_seen": 158123936, "step": 130020 }, { "epoch": 16.29181806791129, "grad_norm": 9.230055809020996, "learning_rate": 1.0111894585559973e-06, "loss": 0.4344, "num_input_tokens_seen": 158129920, "step": 130025 }, { "epoch": 16.292444555820072, "grad_norm": 6.680941104888916, "learning_rate": 1.010859829091057e-06, "loss": 0.5072, "num_input_tokens_seen": 158135616, "step": 130030 }, { "epoch": 16.293071043728855, "grad_norm": 21.766155242919922, "learning_rate": 1.0105302473196427e-06, "loss": 0.4768, "num_input_tokens_seen": 158141664, "step": 130035 }, { "epoch": 16.29369753163764, "grad_norm": 3.2279863357543945, "learning_rate": 1.0102007132456925e-06, "loss": 0.5162, "num_input_tokens_seen": 158147744, "step": 130040 }, { "epoch": 16.294324019546423, "grad_norm": 7.178700923919678, "learning_rate": 1.0098712268731487e-06, "loss": 0.4093, "num_input_tokens_seen": 158153824, "step": 130045 }, { "epoch": 16.294950507455205, "grad_norm": 2.462101697921753, "learning_rate": 1.0095417882059472e-06, "loss": 0.4528, "num_input_tokens_seen": 158159968, "step": 130050 }, { "epoch": 16.29557699536399, "grad_norm": 3.7853024005889893, "learning_rate": 1.00921239724803e-06, "loss": 0.4248, "num_input_tokens_seen": 158166144, "step": 130055 }, { "epoch": 16.296203483272773, "grad_norm": 4.221606731414795, "learning_rate": 1.0088830540033328e-06, "loss": 0.4515, "num_input_tokens_seen": 158172448, "step": 130060 }, { "epoch": 16.296829971181555, "grad_norm": 4.743152141571045, "learning_rate": 1.0085537584757942e-06, "loss": 0.449, "num_input_tokens_seen": 158178624, "step": 130065 }, { "epoch": 16.29745645909034, "grad_norm": 3.252371311187744, "learning_rate": 1.0082245106693529e-06, "loss": 0.4282, "num_input_tokens_seen": 158184928, "step": 130070 }, { "epoch": 16.298082946999124, "grad_norm": 10.900900840759277, "learning_rate": 1.007895310587942e-06, "loss": 0.4548, "num_input_tokens_seen": 158191104, "step": 130075 }, { "epoch": 16.298709434907906, "grad_norm": 3.131744623184204, "learning_rate": 1.007566158235499e-06, "loss": 0.4196, "num_input_tokens_seen": 158197440, "step": 130080 }, { "epoch": 16.299335922816688, "grad_norm": 3.894810914993286, "learning_rate": 1.0072370536159593e-06, "loss": 0.4086, "num_input_tokens_seen": 158203872, "step": 130085 }, { "epoch": 16.299962410725474, "grad_norm": 4.538484573364258, "learning_rate": 1.0069079967332585e-06, "loss": 0.4485, "num_input_tokens_seen": 158209952, "step": 130090 }, { "epoch": 16.300588898634256, "grad_norm": 2.085843324661255, "learning_rate": 1.006578987591329e-06, "loss": 0.4097, "num_input_tokens_seen": 158216064, "step": 130095 }, { "epoch": 16.30121538654304, "grad_norm": 16.704944610595703, "learning_rate": 1.0062500261941067e-06, "loss": 0.465, "num_input_tokens_seen": 158222240, "step": 130100 }, { "epoch": 16.301841874451824, "grad_norm": 19.727018356323242, "learning_rate": 1.0059211125455209e-06, "loss": 0.4603, "num_input_tokens_seen": 158228160, "step": 130105 }, { "epoch": 16.302468362360607, "grad_norm": 4.950258255004883, "learning_rate": 1.0055922466495082e-06, "loss": 0.4347, "num_input_tokens_seen": 158234560, "step": 130110 }, { "epoch": 16.30309485026939, "grad_norm": 6.0223708152771, "learning_rate": 1.005263428509997e-06, "loss": 0.4677, "num_input_tokens_seen": 158241024, "step": 130115 }, { "epoch": 16.303721338178175, "grad_norm": 4.910357475280762, "learning_rate": 1.0049346581309217e-06, "loss": 0.4121, "num_input_tokens_seen": 158247136, "step": 130120 }, { "epoch": 16.304347826086957, "grad_norm": 3.779932737350464, "learning_rate": 1.0046059355162097e-06, "loss": 0.4238, "num_input_tokens_seen": 158253120, "step": 130125 }, { "epoch": 16.30497431399574, "grad_norm": 6.08052396774292, "learning_rate": 1.0042772606697942e-06, "loss": 0.5132, "num_input_tokens_seen": 158258560, "step": 130130 }, { "epoch": 16.30560080190452, "grad_norm": 3.3699452877044678, "learning_rate": 1.0039486335956028e-06, "loss": 0.4228, "num_input_tokens_seen": 158264960, "step": 130135 }, { "epoch": 16.306227289813307, "grad_norm": 4.458138465881348, "learning_rate": 1.0036200542975643e-06, "loss": 0.4015, "num_input_tokens_seen": 158270816, "step": 130140 }, { "epoch": 16.30685377772209, "grad_norm": 12.355780601501465, "learning_rate": 1.0032915227796097e-06, "loss": 0.4256, "num_input_tokens_seen": 158276864, "step": 130145 }, { "epoch": 16.307480265630872, "grad_norm": 6.877316474914551, "learning_rate": 1.002963039045664e-06, "loss": 0.4431, "num_input_tokens_seen": 158283456, "step": 130150 }, { "epoch": 16.308106753539658, "grad_norm": 6.8071465492248535, "learning_rate": 1.002634603099657e-06, "loss": 0.4919, "num_input_tokens_seen": 158289696, "step": 130155 }, { "epoch": 16.30873324144844, "grad_norm": 10.938322067260742, "learning_rate": 1.0023062149455132e-06, "loss": 0.5765, "num_input_tokens_seen": 158296032, "step": 130160 }, { "epoch": 16.309359729357222, "grad_norm": 7.03407096862793, "learning_rate": 1.00197787458716e-06, "loss": 0.4995, "num_input_tokens_seen": 158302144, "step": 130165 }, { "epoch": 16.309986217266008, "grad_norm": 8.16868782043457, "learning_rate": 1.001649582028522e-06, "loss": 0.4309, "num_input_tokens_seen": 158308352, "step": 130170 }, { "epoch": 16.31061270517479, "grad_norm": 11.191267013549805, "learning_rate": 1.0013213372735248e-06, "loss": 0.4862, "num_input_tokens_seen": 158314496, "step": 130175 }, { "epoch": 16.311239193083573, "grad_norm": 3.7447891235351562, "learning_rate": 1.000993140326093e-06, "loss": 0.4163, "num_input_tokens_seen": 158320736, "step": 130180 }, { "epoch": 16.31186568099236, "grad_norm": 14.419422149658203, "learning_rate": 1.0006649911901517e-06, "loss": 0.4436, "num_input_tokens_seen": 158326560, "step": 130185 }, { "epoch": 16.31249216890114, "grad_norm": 3.5376274585723877, "learning_rate": 1.0003368898696215e-06, "loss": 0.4708, "num_input_tokens_seen": 158332640, "step": 130190 }, { "epoch": 16.313118656809923, "grad_norm": 4.436043739318848, "learning_rate": 1.0000088363684274e-06, "loss": 0.4097, "num_input_tokens_seen": 158338976, "step": 130195 }, { "epoch": 16.313745144718705, "grad_norm": 10.146780967712402, "learning_rate": 9.996808306904899e-07, "loss": 0.4602, "num_input_tokens_seen": 158345056, "step": 130200 }, { "epoch": 16.31437163262749, "grad_norm": 4.480187892913818, "learning_rate": 9.993528728397312e-07, "loss": 0.4122, "num_input_tokens_seen": 158351808, "step": 130205 }, { "epoch": 16.314998120536274, "grad_norm": 5.182222366333008, "learning_rate": 9.990249628200738e-07, "loss": 0.4408, "num_input_tokens_seen": 158357760, "step": 130210 }, { "epoch": 16.315624608445056, "grad_norm": 18.892925262451172, "learning_rate": 9.986971006354358e-07, "loss": 0.4593, "num_input_tokens_seen": 158364160, "step": 130215 }, { "epoch": 16.31625109635384, "grad_norm": 4.05388069152832, "learning_rate": 9.983692862897386e-07, "loss": 0.491, "num_input_tokens_seen": 158370240, "step": 130220 }, { "epoch": 16.316877584262624, "grad_norm": 12.096479415893555, "learning_rate": 9.980415197869003e-07, "loss": 0.4704, "num_input_tokens_seen": 158375360, "step": 130225 }, { "epoch": 16.317504072171406, "grad_norm": 8.913326263427734, "learning_rate": 9.977138011308414e-07, "loss": 0.6366, "num_input_tokens_seen": 158381472, "step": 130230 }, { "epoch": 16.318130560080192, "grad_norm": 20.549287796020508, "learning_rate": 9.973861303254773e-07, "loss": 0.4756, "num_input_tokens_seen": 158387648, "step": 130235 }, { "epoch": 16.318757047988974, "grad_norm": 3.857449769973755, "learning_rate": 9.970585073747292e-07, "loss": 0.4728, "num_input_tokens_seen": 158393376, "step": 130240 }, { "epoch": 16.319383535897757, "grad_norm": 5.845798015594482, "learning_rate": 9.967309322825104e-07, "loss": 0.5525, "num_input_tokens_seen": 158399488, "step": 130245 }, { "epoch": 16.32001002380654, "grad_norm": 20.298730850219727, "learning_rate": 9.964034050527404e-07, "loss": 0.4958, "num_input_tokens_seen": 158405600, "step": 130250 }, { "epoch": 16.320636511715325, "grad_norm": 3.181485891342163, "learning_rate": 9.960759256893327e-07, "loss": 0.4861, "num_input_tokens_seen": 158411584, "step": 130255 }, { "epoch": 16.321262999624107, "grad_norm": 4.252168655395508, "learning_rate": 9.957484941962048e-07, "loss": 0.4576, "num_input_tokens_seen": 158417696, "step": 130260 }, { "epoch": 16.32188948753289, "grad_norm": 4.11417818069458, "learning_rate": 9.954211105772688e-07, "loss": 0.4026, "num_input_tokens_seen": 158423808, "step": 130265 }, { "epoch": 16.322515975441675, "grad_norm": 11.97570514678955, "learning_rate": 9.950937748364402e-07, "loss": 0.5342, "num_input_tokens_seen": 158429952, "step": 130270 }, { "epoch": 16.323142463350457, "grad_norm": 5.402320861816406, "learning_rate": 9.947664869776325e-07, "loss": 0.4388, "num_input_tokens_seen": 158436192, "step": 130275 }, { "epoch": 16.32376895125924, "grad_norm": 5.238694667816162, "learning_rate": 9.944392470047609e-07, "loss": 0.436, "num_input_tokens_seen": 158442432, "step": 130280 }, { "epoch": 16.324395439168025, "grad_norm": 2.488285779953003, "learning_rate": 9.941120549217337e-07, "loss": 0.413, "num_input_tokens_seen": 158448160, "step": 130285 }, { "epoch": 16.325021927076808, "grad_norm": 12.11816120147705, "learning_rate": 9.937849107324654e-07, "loss": 0.4904, "num_input_tokens_seen": 158454080, "step": 130290 }, { "epoch": 16.32564841498559, "grad_norm": 3.468496799468994, "learning_rate": 9.93457814440868e-07, "loss": 0.4456, "num_input_tokens_seen": 158460320, "step": 130295 }, { "epoch": 16.326274902894376, "grad_norm": 4.5676188468933105, "learning_rate": 9.931307660508494e-07, "loss": 0.5166, "num_input_tokens_seen": 158466304, "step": 130300 }, { "epoch": 16.326901390803158, "grad_norm": 6.515930652618408, "learning_rate": 9.928037655663225e-07, "loss": 0.4848, "num_input_tokens_seen": 158472320, "step": 130305 }, { "epoch": 16.32752787871194, "grad_norm": 5.2658209800720215, "learning_rate": 9.924768129911943e-07, "loss": 0.3877, "num_input_tokens_seen": 158478368, "step": 130310 }, { "epoch": 16.328154366620723, "grad_norm": 9.297270774841309, "learning_rate": 9.921499083293768e-07, "loss": 0.4211, "num_input_tokens_seen": 158484160, "step": 130315 }, { "epoch": 16.32878085452951, "grad_norm": 4.0556817054748535, "learning_rate": 9.918230515847755e-07, "loss": 0.4387, "num_input_tokens_seen": 158490464, "step": 130320 }, { "epoch": 16.32940734243829, "grad_norm": 5.398169994354248, "learning_rate": 9.914962427612995e-07, "loss": 0.4396, "num_input_tokens_seen": 158496576, "step": 130325 }, { "epoch": 16.330033830347073, "grad_norm": 20.03526496887207, "learning_rate": 9.91169481862857e-07, "loss": 0.4361, "num_input_tokens_seen": 158502816, "step": 130330 }, { "epoch": 16.33066031825586, "grad_norm": 22.0757999420166, "learning_rate": 9.908427688933526e-07, "loss": 0.5274, "num_input_tokens_seen": 158508704, "step": 130335 }, { "epoch": 16.33128680616464, "grad_norm": 2.990186929702759, "learning_rate": 9.905161038566952e-07, "loss": 0.4034, "num_input_tokens_seen": 158514720, "step": 130340 }, { "epoch": 16.331913294073424, "grad_norm": 4.848383903503418, "learning_rate": 9.901894867567873e-07, "loss": 0.4364, "num_input_tokens_seen": 158520800, "step": 130345 }, { "epoch": 16.33253978198221, "grad_norm": 11.383659362792969, "learning_rate": 9.898629175975365e-07, "loss": 0.4288, "num_input_tokens_seen": 158527040, "step": 130350 }, { "epoch": 16.33316626989099, "grad_norm": 3.0737926959991455, "learning_rate": 9.895363963828446e-07, "loss": 0.4045, "num_input_tokens_seen": 158533120, "step": 130355 }, { "epoch": 16.333792757799774, "grad_norm": 11.997495651245117, "learning_rate": 9.892099231166185e-07, "loss": 0.4514, "num_input_tokens_seen": 158538432, "step": 130360 }, { "epoch": 16.334419245708556, "grad_norm": 4.414079189300537, "learning_rate": 9.888834978027589e-07, "loss": 0.4097, "num_input_tokens_seen": 158544960, "step": 130365 }, { "epoch": 16.335045733617342, "grad_norm": 2.4848718643188477, "learning_rate": 9.885571204451705e-07, "loss": 0.4555, "num_input_tokens_seen": 158550752, "step": 130370 }, { "epoch": 16.335672221526124, "grad_norm": 2.9442672729492188, "learning_rate": 9.882307910477529e-07, "loss": 0.384, "num_input_tokens_seen": 158556800, "step": 130375 }, { "epoch": 16.336298709434907, "grad_norm": 5.105380058288574, "learning_rate": 9.879045096144092e-07, "loss": 0.4539, "num_input_tokens_seen": 158562976, "step": 130380 }, { "epoch": 16.336925197343692, "grad_norm": 7.877651214599609, "learning_rate": 9.8757827614904e-07, "loss": 0.3993, "num_input_tokens_seen": 158568672, "step": 130385 }, { "epoch": 16.337551685252475, "grad_norm": 4.194386959075928, "learning_rate": 9.87252090655546e-07, "loss": 0.4359, "num_input_tokens_seen": 158574720, "step": 130390 }, { "epoch": 16.338178173161257, "grad_norm": 4.306453704833984, "learning_rate": 9.869259531378284e-07, "loss": 0.401, "num_input_tokens_seen": 158580448, "step": 130395 }, { "epoch": 16.338804661070043, "grad_norm": 3.9995155334472656, "learning_rate": 9.865998635997836e-07, "loss": 0.4851, "num_input_tokens_seen": 158586656, "step": 130400 }, { "epoch": 16.339431148978825, "grad_norm": 9.900378227233887, "learning_rate": 9.862738220453134e-07, "loss": 0.4365, "num_input_tokens_seen": 158592896, "step": 130405 }, { "epoch": 16.340057636887607, "grad_norm": 3.2873430252075195, "learning_rate": 9.85947828478313e-07, "loss": 0.4192, "num_input_tokens_seen": 158599232, "step": 130410 }, { "epoch": 16.34068412479639, "grad_norm": 4.765025615692139, "learning_rate": 9.85621882902682e-07, "loss": 0.482, "num_input_tokens_seen": 158605600, "step": 130415 }, { "epoch": 16.341310612705175, "grad_norm": 16.792211532592773, "learning_rate": 9.852959853223154e-07, "loss": 0.565, "num_input_tokens_seen": 158611776, "step": 130420 }, { "epoch": 16.341937100613958, "grad_norm": 4.868408679962158, "learning_rate": 9.849701357411118e-07, "loss": 0.4586, "num_input_tokens_seen": 158618208, "step": 130425 }, { "epoch": 16.34256358852274, "grad_norm": 10.692485809326172, "learning_rate": 9.846443341629646e-07, "loss": 0.4576, "num_input_tokens_seen": 158624032, "step": 130430 }, { "epoch": 16.343190076431526, "grad_norm": 9.224879264831543, "learning_rate": 9.84318580591772e-07, "loss": 0.5009, "num_input_tokens_seen": 158630336, "step": 130435 }, { "epoch": 16.343816564340308, "grad_norm": 3.8425352573394775, "learning_rate": 9.839928750314255e-07, "loss": 0.5082, "num_input_tokens_seen": 158636448, "step": 130440 }, { "epoch": 16.34444305224909, "grad_norm": 6.557330131530762, "learning_rate": 9.836672174858219e-07, "loss": 0.4358, "num_input_tokens_seen": 158642624, "step": 130445 }, { "epoch": 16.345069540157876, "grad_norm": 14.940923690795898, "learning_rate": 9.833416079588521e-07, "loss": 0.4499, "num_input_tokens_seen": 158648864, "step": 130450 }, { "epoch": 16.34569602806666, "grad_norm": 2.7999088764190674, "learning_rate": 9.830160464544102e-07, "loss": 0.442, "num_input_tokens_seen": 158654624, "step": 130455 }, { "epoch": 16.34632251597544, "grad_norm": 3.1435964107513428, "learning_rate": 9.826905329763902e-07, "loss": 0.4572, "num_input_tokens_seen": 158660832, "step": 130460 }, { "epoch": 16.346949003884227, "grad_norm": 3.7347025871276855, "learning_rate": 9.82365067528681e-07, "loss": 0.4349, "num_input_tokens_seen": 158667072, "step": 130465 }, { "epoch": 16.34757549179301, "grad_norm": 3.7570033073425293, "learning_rate": 9.82039650115177e-07, "loss": 0.4038, "num_input_tokens_seen": 158673472, "step": 130470 }, { "epoch": 16.34820197970179, "grad_norm": 7.1763081550598145, "learning_rate": 9.817142807397655e-07, "loss": 0.4666, "num_input_tokens_seen": 158678912, "step": 130475 }, { "epoch": 16.348828467610574, "grad_norm": 7.0520195960998535, "learning_rate": 9.813889594063381e-07, "loss": 0.461, "num_input_tokens_seen": 158684960, "step": 130480 }, { "epoch": 16.34945495551936, "grad_norm": 2.751962423324585, "learning_rate": 9.810636861187862e-07, "loss": 0.4307, "num_input_tokens_seen": 158691488, "step": 130485 }, { "epoch": 16.35008144342814, "grad_norm": 23.19379234313965, "learning_rate": 9.80738460880995e-07, "loss": 0.532, "num_input_tokens_seen": 158697984, "step": 130490 }, { "epoch": 16.350707931336924, "grad_norm": 2.9306886196136475, "learning_rate": 9.80413283696855e-07, "loss": 0.5212, "num_input_tokens_seen": 158704192, "step": 130495 }, { "epoch": 16.35133441924571, "grad_norm": 4.074435234069824, "learning_rate": 9.800881545702545e-07, "loss": 0.4361, "num_input_tokens_seen": 158710560, "step": 130500 }, { "epoch": 16.351960907154492, "grad_norm": 15.95753002166748, "learning_rate": 9.797630735050789e-07, "loss": 0.484, "num_input_tokens_seen": 158716736, "step": 130505 }, { "epoch": 16.352587395063274, "grad_norm": 5.656392574310303, "learning_rate": 9.794380405052156e-07, "loss": 0.4561, "num_input_tokens_seen": 158722880, "step": 130510 }, { "epoch": 16.35321388297206, "grad_norm": 6.549691677093506, "learning_rate": 9.791130555745525e-07, "loss": 0.4474, "num_input_tokens_seen": 158728992, "step": 130515 }, { "epoch": 16.353840370880842, "grad_norm": 5.1725077629089355, "learning_rate": 9.787881187169718e-07, "loss": 0.3884, "num_input_tokens_seen": 158735584, "step": 130520 }, { "epoch": 16.354466858789625, "grad_norm": 4.219973564147949, "learning_rate": 9.784632299363616e-07, "loss": 0.4531, "num_input_tokens_seen": 158741536, "step": 130525 }, { "epoch": 16.355093346698407, "grad_norm": 4.0765180587768555, "learning_rate": 9.78138389236603e-07, "loss": 0.4109, "num_input_tokens_seen": 158748032, "step": 130530 }, { "epoch": 16.355719834607193, "grad_norm": 13.336241722106934, "learning_rate": 9.77813596621583e-07, "loss": 0.4421, "num_input_tokens_seen": 158754368, "step": 130535 }, { "epoch": 16.356346322515975, "grad_norm": 15.68936824798584, "learning_rate": 9.774888520951815e-07, "loss": 0.4813, "num_input_tokens_seen": 158760416, "step": 130540 }, { "epoch": 16.356972810424757, "grad_norm": 3.363055467605591, "learning_rate": 9.771641556612837e-07, "loss": 0.4169, "num_input_tokens_seen": 158766272, "step": 130545 }, { "epoch": 16.357599298333543, "grad_norm": 8.070942878723145, "learning_rate": 9.768395073237703e-07, "loss": 0.4656, "num_input_tokens_seen": 158772544, "step": 130550 }, { "epoch": 16.358225786242325, "grad_norm": 5.136432647705078, "learning_rate": 9.765149070865238e-07, "loss": 0.4959, "num_input_tokens_seen": 158778624, "step": 130555 }, { "epoch": 16.358852274151108, "grad_norm": 3.9827187061309814, "learning_rate": 9.76190354953423e-07, "loss": 0.4206, "num_input_tokens_seen": 158784448, "step": 130560 }, { "epoch": 16.359478762059894, "grad_norm": 19.915390014648438, "learning_rate": 9.75865850928351e-07, "loss": 0.4322, "num_input_tokens_seen": 158790592, "step": 130565 }, { "epoch": 16.360105249968676, "grad_norm": 4.348659992218018, "learning_rate": 9.755413950151843e-07, "loss": 0.4484, "num_input_tokens_seen": 158796544, "step": 130570 }, { "epoch": 16.360731737877458, "grad_norm": 5.0553436279296875, "learning_rate": 9.75216987217804e-07, "loss": 0.4476, "num_input_tokens_seen": 158802816, "step": 130575 }, { "epoch": 16.361358225786244, "grad_norm": 11.669548034667969, "learning_rate": 9.7489262754009e-07, "loss": 0.4386, "num_input_tokens_seen": 158808864, "step": 130580 }, { "epoch": 16.361984713695026, "grad_norm": 3.437220573425293, "learning_rate": 9.745683159859177e-07, "loss": 0.453, "num_input_tokens_seen": 158815072, "step": 130585 }, { "epoch": 16.36261120160381, "grad_norm": 34.185298919677734, "learning_rate": 9.74244052559165e-07, "loss": 0.5124, "num_input_tokens_seen": 158821120, "step": 130590 }, { "epoch": 16.36323768951259, "grad_norm": 3.5859949588775635, "learning_rate": 9.739198372637093e-07, "loss": 0.4572, "num_input_tokens_seen": 158826656, "step": 130595 }, { "epoch": 16.363864177421377, "grad_norm": 14.957036972045898, "learning_rate": 9.73595670103429e-07, "loss": 0.4803, "num_input_tokens_seen": 158832544, "step": 130600 }, { "epoch": 16.36449066533016, "grad_norm": 6.69332218170166, "learning_rate": 9.732715510821955e-07, "loss": 0.4784, "num_input_tokens_seen": 158838464, "step": 130605 }, { "epoch": 16.36511715323894, "grad_norm": 3.3872148990631104, "learning_rate": 9.72947480203888e-07, "loss": 0.4652, "num_input_tokens_seen": 158844480, "step": 130610 }, { "epoch": 16.365743641147727, "grad_norm": 4.787823677062988, "learning_rate": 9.726234574723774e-07, "loss": 0.4398, "num_input_tokens_seen": 158850944, "step": 130615 }, { "epoch": 16.36637012905651, "grad_norm": 6.748215198516846, "learning_rate": 9.722994828915412e-07, "loss": 0.3932, "num_input_tokens_seen": 158856960, "step": 130620 }, { "epoch": 16.36699661696529, "grad_norm": 18.882274627685547, "learning_rate": 9.71975556465249e-07, "loss": 0.5309, "num_input_tokens_seen": 158862880, "step": 130625 }, { "epoch": 16.367623104874077, "grad_norm": 3.619647264480591, "learning_rate": 9.716516781973773e-07, "loss": 0.4125, "num_input_tokens_seen": 158868672, "step": 130630 }, { "epoch": 16.36824959278286, "grad_norm": 8.581596374511719, "learning_rate": 9.713278480917954e-07, "loss": 0.4403, "num_input_tokens_seen": 158874784, "step": 130635 }, { "epoch": 16.368876080691642, "grad_norm": 8.897340774536133, "learning_rate": 9.710040661523762e-07, "loss": 0.4172, "num_input_tokens_seen": 158880704, "step": 130640 }, { "epoch": 16.369502568600424, "grad_norm": 5.858882427215576, "learning_rate": 9.706803323829916e-07, "loss": 0.3886, "num_input_tokens_seen": 158886816, "step": 130645 }, { "epoch": 16.37012905650921, "grad_norm": 11.121596336364746, "learning_rate": 9.703566467875104e-07, "loss": 0.4484, "num_input_tokens_seen": 158892832, "step": 130650 }, { "epoch": 16.370755544417992, "grad_norm": 4.437315464019775, "learning_rate": 9.700330093698047e-07, "loss": 0.4974, "num_input_tokens_seen": 158898752, "step": 130655 }, { "epoch": 16.371382032326775, "grad_norm": 4.18144416809082, "learning_rate": 9.697094201337415e-07, "loss": 0.4768, "num_input_tokens_seen": 158904960, "step": 130660 }, { "epoch": 16.37200852023556, "grad_norm": 34.863243103027344, "learning_rate": 9.693858790831917e-07, "loss": 0.4673, "num_input_tokens_seen": 158911488, "step": 130665 }, { "epoch": 16.372635008144343, "grad_norm": 10.169617652893066, "learning_rate": 9.69062386222021e-07, "loss": 0.4547, "num_input_tokens_seen": 158917824, "step": 130670 }, { "epoch": 16.373261496053125, "grad_norm": 7.008152008056641, "learning_rate": 9.687389415540998e-07, "loss": 0.482, "num_input_tokens_seen": 158923744, "step": 130675 }, { "epoch": 16.37388798396191, "grad_norm": 14.27207088470459, "learning_rate": 9.684155450832933e-07, "loss": 0.5031, "num_input_tokens_seen": 158929856, "step": 130680 }, { "epoch": 16.374514471870693, "grad_norm": 4.932504653930664, "learning_rate": 9.680921968134677e-07, "loss": 0.514, "num_input_tokens_seen": 158936128, "step": 130685 }, { "epoch": 16.375140959779475, "grad_norm": 18.995759963989258, "learning_rate": 9.677688967484899e-07, "loss": 0.4541, "num_input_tokens_seen": 158942272, "step": 130690 }, { "epoch": 16.37576744768826, "grad_norm": 3.0595078468322754, "learning_rate": 9.674456448922264e-07, "loss": 0.4289, "num_input_tokens_seen": 158948640, "step": 130695 }, { "epoch": 16.376393935597044, "grad_norm": 3.690610647201538, "learning_rate": 9.671224412485398e-07, "loss": 0.4121, "num_input_tokens_seen": 158955008, "step": 130700 }, { "epoch": 16.377020423505826, "grad_norm": 3.2982423305511475, "learning_rate": 9.667992858212944e-07, "loss": 0.4837, "num_input_tokens_seen": 158961088, "step": 130705 }, { "epoch": 16.377646911414608, "grad_norm": 20.445066452026367, "learning_rate": 9.664761786143561e-07, "loss": 0.4924, "num_input_tokens_seen": 158967040, "step": 130710 }, { "epoch": 16.378273399323394, "grad_norm": 12.427719116210938, "learning_rate": 9.661531196315849e-07, "loss": 0.4748, "num_input_tokens_seen": 158973184, "step": 130715 }, { "epoch": 16.378899887232176, "grad_norm": 2.762221097946167, "learning_rate": 9.658301088768456e-07, "loss": 0.454, "num_input_tokens_seen": 158979360, "step": 130720 }, { "epoch": 16.37952637514096, "grad_norm": 3.17671537399292, "learning_rate": 9.655071463539984e-07, "loss": 0.4283, "num_input_tokens_seen": 158985312, "step": 130725 }, { "epoch": 16.380152863049744, "grad_norm": 3.0805587768554688, "learning_rate": 9.651842320669064e-07, "loss": 0.418, "num_input_tokens_seen": 158991328, "step": 130730 }, { "epoch": 16.380779350958527, "grad_norm": 6.921243190765381, "learning_rate": 9.648613660194279e-07, "loss": 0.4408, "num_input_tokens_seen": 158997664, "step": 130735 }, { "epoch": 16.38140583886731, "grad_norm": 3.3489482402801514, "learning_rate": 9.645385482154252e-07, "loss": 0.4863, "num_input_tokens_seen": 159003456, "step": 130740 }, { "epoch": 16.382032326776095, "grad_norm": 3.7185637950897217, "learning_rate": 9.642157786587558e-07, "loss": 0.4584, "num_input_tokens_seen": 159009568, "step": 130745 }, { "epoch": 16.382658814684877, "grad_norm": 4.6974406242370605, "learning_rate": 9.638930573532817e-07, "loss": 0.4471, "num_input_tokens_seen": 159015392, "step": 130750 }, { "epoch": 16.38328530259366, "grad_norm": 5.671010494232178, "learning_rate": 9.635703843028576e-07, "loss": 0.4036, "num_input_tokens_seen": 159021056, "step": 130755 }, { "epoch": 16.38391179050244, "grad_norm": 4.38709831237793, "learning_rate": 9.632477595113433e-07, "loss": 0.4219, "num_input_tokens_seen": 159027200, "step": 130760 }, { "epoch": 16.384538278411227, "grad_norm": 13.30508041381836, "learning_rate": 9.629251829825976e-07, "loss": 0.4773, "num_input_tokens_seen": 159032544, "step": 130765 }, { "epoch": 16.38516476632001, "grad_norm": 3.7886908054351807, "learning_rate": 9.626026547204742e-07, "loss": 0.4755, "num_input_tokens_seen": 159038400, "step": 130770 }, { "epoch": 16.385791254228792, "grad_norm": 3.939160108566284, "learning_rate": 9.622801747288314e-07, "loss": 0.4716, "num_input_tokens_seen": 159044288, "step": 130775 }, { "epoch": 16.386417742137578, "grad_norm": 7.923166751861572, "learning_rate": 9.619577430115228e-07, "loss": 0.4311, "num_input_tokens_seen": 159050368, "step": 130780 }, { "epoch": 16.38704423004636, "grad_norm": 7.268779277801514, "learning_rate": 9.616353595724055e-07, "loss": 0.4382, "num_input_tokens_seen": 159056160, "step": 130785 }, { "epoch": 16.387670717955142, "grad_norm": 4.1416096687316895, "learning_rate": 9.613130244153318e-07, "loss": 0.4476, "num_input_tokens_seen": 159062176, "step": 130790 }, { "epoch": 16.38829720586393, "grad_norm": 18.107187271118164, "learning_rate": 9.609907375441564e-07, "loss": 0.4551, "num_input_tokens_seen": 159068256, "step": 130795 }, { "epoch": 16.38892369377271, "grad_norm": 7.426303386688232, "learning_rate": 9.606684989627318e-07, "loss": 0.5214, "num_input_tokens_seen": 159074496, "step": 130800 }, { "epoch": 16.389550181681493, "grad_norm": 10.043188095092773, "learning_rate": 9.60346308674913e-07, "loss": 0.479, "num_input_tokens_seen": 159080448, "step": 130805 }, { "epoch": 16.39017666959028, "grad_norm": 4.484737873077393, "learning_rate": 9.6002416668455e-07, "loss": 0.5275, "num_input_tokens_seen": 159086688, "step": 130810 }, { "epoch": 16.39080315749906, "grad_norm": 4.270693302154541, "learning_rate": 9.597020729954954e-07, "loss": 0.449, "num_input_tokens_seen": 159092832, "step": 130815 }, { "epoch": 16.391429645407843, "grad_norm": 12.078585624694824, "learning_rate": 9.593800276115978e-07, "loss": 0.4279, "num_input_tokens_seen": 159098848, "step": 130820 }, { "epoch": 16.392056133316625, "grad_norm": 10.823820114135742, "learning_rate": 9.590580305367098e-07, "loss": 0.4328, "num_input_tokens_seen": 159104928, "step": 130825 }, { "epoch": 16.39268262122541, "grad_norm": 5.268106937408447, "learning_rate": 9.58736081774681e-07, "loss": 0.4236, "num_input_tokens_seen": 159110784, "step": 130830 }, { "epoch": 16.393309109134194, "grad_norm": 6.696734428405762, "learning_rate": 9.584141813293595e-07, "loss": 0.4397, "num_input_tokens_seen": 159116928, "step": 130835 }, { "epoch": 16.393935597042976, "grad_norm": 3.194673776626587, "learning_rate": 9.58092329204595e-07, "loss": 0.4439, "num_input_tokens_seen": 159122944, "step": 130840 }, { "epoch": 16.39456208495176, "grad_norm": 21.748445510864258, "learning_rate": 9.57770525404234e-07, "loss": 0.4514, "num_input_tokens_seen": 159128736, "step": 130845 }, { "epoch": 16.395188572860544, "grad_norm": 3.8396644592285156, "learning_rate": 9.574487699321266e-07, "loss": 0.459, "num_input_tokens_seen": 159134912, "step": 130850 }, { "epoch": 16.395815060769326, "grad_norm": 4.279225826263428, "learning_rate": 9.571270627921164e-07, "loss": 0.4937, "num_input_tokens_seen": 159141408, "step": 130855 }, { "epoch": 16.396441548678112, "grad_norm": 3.9556448459625244, "learning_rate": 9.568054039880526e-07, "loss": 0.4657, "num_input_tokens_seen": 159147744, "step": 130860 }, { "epoch": 16.397068036586894, "grad_norm": 3.228722333908081, "learning_rate": 9.564837935237775e-07, "loss": 0.5363, "num_input_tokens_seen": 159153760, "step": 130865 }, { "epoch": 16.397694524495677, "grad_norm": 3.363297462463379, "learning_rate": 9.561622314031404e-07, "loss": 0.4498, "num_input_tokens_seen": 159159968, "step": 130870 }, { "epoch": 16.39832101240446, "grad_norm": 20.937114715576172, "learning_rate": 9.558407176299823e-07, "loss": 0.4745, "num_input_tokens_seen": 159166080, "step": 130875 }, { "epoch": 16.398947500313245, "grad_norm": 11.466002464294434, "learning_rate": 9.55519252208148e-07, "loss": 0.4525, "num_input_tokens_seen": 159172160, "step": 130880 }, { "epoch": 16.399573988222027, "grad_norm": 14.917414665222168, "learning_rate": 9.551978351414832e-07, "loss": 0.4568, "num_input_tokens_seen": 159178048, "step": 130885 }, { "epoch": 16.40020047613081, "grad_norm": 32.73080825805664, "learning_rate": 9.548764664338278e-07, "loss": 0.5953, "num_input_tokens_seen": 159183904, "step": 130890 }, { "epoch": 16.400826964039595, "grad_norm": 11.05118179321289, "learning_rate": 9.545551460890245e-07, "loss": 0.5117, "num_input_tokens_seen": 159190400, "step": 130895 }, { "epoch": 16.401453451948377, "grad_norm": 14.198491096496582, "learning_rate": 9.542338741109174e-07, "loss": 0.4798, "num_input_tokens_seen": 159196416, "step": 130900 }, { "epoch": 16.40207993985716, "grad_norm": 5.078753471374512, "learning_rate": 9.539126505033447e-07, "loss": 0.4613, "num_input_tokens_seen": 159202592, "step": 130905 }, { "epoch": 16.402706427765946, "grad_norm": 19.25115966796875, "learning_rate": 9.535914752701481e-07, "loss": 0.5091, "num_input_tokens_seen": 159208832, "step": 130910 }, { "epoch": 16.403332915674728, "grad_norm": 5.063948631286621, "learning_rate": 9.532703484151684e-07, "loss": 0.4791, "num_input_tokens_seen": 159215072, "step": 130915 }, { "epoch": 16.40395940358351, "grad_norm": 6.271337985992432, "learning_rate": 9.529492699422426e-07, "loss": 0.5998, "num_input_tokens_seen": 159221216, "step": 130920 }, { "epoch": 16.404585891492296, "grad_norm": 3.119413375854492, "learning_rate": 9.526282398552128e-07, "loss": 0.501, "num_input_tokens_seen": 159227296, "step": 130925 }, { "epoch": 16.40521237940108, "grad_norm": 7.532918453216553, "learning_rate": 9.523072581579135e-07, "loss": 0.5149, "num_input_tokens_seen": 159233248, "step": 130930 }, { "epoch": 16.40583886730986, "grad_norm": 3.4404752254486084, "learning_rate": 9.51986324854185e-07, "loss": 0.4231, "num_input_tokens_seen": 159239360, "step": 130935 }, { "epoch": 16.406465355218643, "grad_norm": 14.784841537475586, "learning_rate": 9.516654399478625e-07, "loss": 0.5014, "num_input_tokens_seen": 159245216, "step": 130940 }, { "epoch": 16.40709184312743, "grad_norm": 4.13702917098999, "learning_rate": 9.513446034427831e-07, "loss": 0.4313, "num_input_tokens_seen": 159250752, "step": 130945 }, { "epoch": 16.40771833103621, "grad_norm": 3.204989194869995, "learning_rate": 9.510238153427843e-07, "loss": 0.4585, "num_input_tokens_seen": 159256800, "step": 130950 }, { "epoch": 16.408344818944993, "grad_norm": 5.230934143066406, "learning_rate": 9.507030756516989e-07, "loss": 0.4401, "num_input_tokens_seen": 159263040, "step": 130955 }, { "epoch": 16.40897130685378, "grad_norm": 2.8618431091308594, "learning_rate": 9.50382384373364e-07, "loss": 0.4516, "num_input_tokens_seen": 159269472, "step": 130960 }, { "epoch": 16.40959779476256, "grad_norm": 9.88066577911377, "learning_rate": 9.500617415116104e-07, "loss": 0.4418, "num_input_tokens_seen": 159274976, "step": 130965 }, { "epoch": 16.410224282671344, "grad_norm": 2.8369667530059814, "learning_rate": 9.497411470702755e-07, "loss": 0.4828, "num_input_tokens_seen": 159280576, "step": 130970 }, { "epoch": 16.41085077058013, "grad_norm": 6.711128234863281, "learning_rate": 9.494206010531893e-07, "loss": 0.4435, "num_input_tokens_seen": 159286944, "step": 130975 }, { "epoch": 16.41147725848891, "grad_norm": 5.1730732917785645, "learning_rate": 9.491001034641867e-07, "loss": 0.4365, "num_input_tokens_seen": 159293248, "step": 130980 }, { "epoch": 16.412103746397694, "grad_norm": 20.984020233154297, "learning_rate": 9.487796543070965e-07, "loss": 0.5152, "num_input_tokens_seen": 159299712, "step": 130985 }, { "epoch": 16.412730234306476, "grad_norm": 11.38991641998291, "learning_rate": 9.484592535857528e-07, "loss": 0.4339, "num_input_tokens_seen": 159305856, "step": 130990 }, { "epoch": 16.413356722215262, "grad_norm": 1.9905824661254883, "learning_rate": 9.481389013039843e-07, "loss": 0.4917, "num_input_tokens_seen": 159312256, "step": 130995 }, { "epoch": 16.413983210124044, "grad_norm": 6.926337718963623, "learning_rate": 9.478185974656218e-07, "loss": 0.4451, "num_input_tokens_seen": 159318528, "step": 131000 }, { "epoch": 16.414609698032827, "grad_norm": 5.962010383605957, "learning_rate": 9.474983420744943e-07, "loss": 0.4649, "num_input_tokens_seen": 159324832, "step": 131005 }, { "epoch": 16.415236185941612, "grad_norm": 3.3919363021850586, "learning_rate": 9.471781351344316e-07, "loss": 0.4315, "num_input_tokens_seen": 159331104, "step": 131010 }, { "epoch": 16.415862673850395, "grad_norm": 7.631089687347412, "learning_rate": 9.46857976649263e-07, "loss": 0.4416, "num_input_tokens_seen": 159336800, "step": 131015 }, { "epoch": 16.416489161759177, "grad_norm": 25.06416130065918, "learning_rate": 9.465378666228142e-07, "loss": 0.4604, "num_input_tokens_seen": 159343040, "step": 131020 }, { "epoch": 16.417115649667963, "grad_norm": 9.561320304870605, "learning_rate": 9.462178050589138e-07, "loss": 0.4429, "num_input_tokens_seen": 159349088, "step": 131025 }, { "epoch": 16.417742137576745, "grad_norm": 9.934964179992676, "learning_rate": 9.458977919613866e-07, "loss": 0.4162, "num_input_tokens_seen": 159355008, "step": 131030 }, { "epoch": 16.418368625485527, "grad_norm": 6.643451690673828, "learning_rate": 9.455778273340616e-07, "loss": 0.5168, "num_input_tokens_seen": 159361216, "step": 131035 }, { "epoch": 16.41899511339431, "grad_norm": 4.200155735015869, "learning_rate": 9.452579111807603e-07, "loss": 0.4711, "num_input_tokens_seen": 159367328, "step": 131040 }, { "epoch": 16.419621601303096, "grad_norm": 3.0158097743988037, "learning_rate": 9.449380435053118e-07, "loss": 0.4736, "num_input_tokens_seen": 159373504, "step": 131045 }, { "epoch": 16.420248089211878, "grad_norm": 3.300250768661499, "learning_rate": 9.446182243115365e-07, "loss": 0.4458, "num_input_tokens_seen": 159379584, "step": 131050 }, { "epoch": 16.42087457712066, "grad_norm": 11.781327247619629, "learning_rate": 9.442984536032612e-07, "loss": 0.4352, "num_input_tokens_seen": 159385408, "step": 131055 }, { "epoch": 16.421501065029446, "grad_norm": 3.568202495574951, "learning_rate": 9.439787313843068e-07, "loss": 0.4795, "num_input_tokens_seen": 159391808, "step": 131060 }, { "epoch": 16.42212755293823, "grad_norm": 2.987520933151245, "learning_rate": 9.436590576584975e-07, "loss": 0.4803, "num_input_tokens_seen": 159397696, "step": 131065 }, { "epoch": 16.42275404084701, "grad_norm": 4.132308483123779, "learning_rate": 9.433394324296536e-07, "loss": 0.4044, "num_input_tokens_seen": 159403744, "step": 131070 }, { "epoch": 16.423380528755796, "grad_norm": 5.2004218101501465, "learning_rate": 9.430198557015973e-07, "loss": 0.4651, "num_input_tokens_seen": 159409344, "step": 131075 }, { "epoch": 16.42400701666458, "grad_norm": 2.4985625743865967, "learning_rate": 9.427003274781516e-07, "loss": 0.4384, "num_input_tokens_seen": 159414720, "step": 131080 }, { "epoch": 16.42463350457336, "grad_norm": 3.805281162261963, "learning_rate": 9.423808477631325e-07, "loss": 0.3971, "num_input_tokens_seen": 159420768, "step": 131085 }, { "epoch": 16.425259992482147, "grad_norm": 7.923956394195557, "learning_rate": 9.420614165603631e-07, "loss": 0.4377, "num_input_tokens_seen": 159426944, "step": 131090 }, { "epoch": 16.42588648039093, "grad_norm": 12.380860328674316, "learning_rate": 9.417420338736605e-07, "loss": 0.4986, "num_input_tokens_seen": 159432224, "step": 131095 }, { "epoch": 16.42651296829971, "grad_norm": 4.010117053985596, "learning_rate": 9.414226997068438e-07, "loss": 0.4307, "num_input_tokens_seen": 159438752, "step": 131100 }, { "epoch": 16.427139456208494, "grad_norm": 22.79440689086914, "learning_rate": 9.411034140637315e-07, "loss": 0.5542, "num_input_tokens_seen": 159445024, "step": 131105 }, { "epoch": 16.42776594411728, "grad_norm": 18.29397964477539, "learning_rate": 9.407841769481401e-07, "loss": 0.4988, "num_input_tokens_seen": 159451168, "step": 131110 }, { "epoch": 16.42839243202606, "grad_norm": 3.143362522125244, "learning_rate": 9.404649883638861e-07, "loss": 0.3995, "num_input_tokens_seen": 159457280, "step": 131115 }, { "epoch": 16.429018919934844, "grad_norm": 11.457252502441406, "learning_rate": 9.401458483147874e-07, "loss": 0.4523, "num_input_tokens_seen": 159463360, "step": 131120 }, { "epoch": 16.42964540784363, "grad_norm": 5.155732154846191, "learning_rate": 9.398267568046576e-07, "loss": 0.4333, "num_input_tokens_seen": 159469792, "step": 131125 }, { "epoch": 16.430271895752412, "grad_norm": 5.701308250427246, "learning_rate": 9.395077138373121e-07, "loss": 0.4161, "num_input_tokens_seen": 159475904, "step": 131130 }, { "epoch": 16.430898383661194, "grad_norm": 19.55963706970215, "learning_rate": 9.391887194165672e-07, "loss": 0.4851, "num_input_tokens_seen": 159481792, "step": 131135 }, { "epoch": 16.43152487156998, "grad_norm": 4.2053728103637695, "learning_rate": 9.388697735462344e-07, "loss": 0.4869, "num_input_tokens_seen": 159487712, "step": 131140 }, { "epoch": 16.432151359478762, "grad_norm": 5.460760593414307, "learning_rate": 9.385508762301288e-07, "loss": 0.5346, "num_input_tokens_seen": 159493984, "step": 131145 }, { "epoch": 16.432777847387545, "grad_norm": 21.64903450012207, "learning_rate": 9.382320274720608e-07, "loss": 0.5494, "num_input_tokens_seen": 159500320, "step": 131150 }, { "epoch": 16.433404335296327, "grad_norm": 3.0600244998931885, "learning_rate": 9.379132272758451e-07, "loss": 0.4206, "num_input_tokens_seen": 159506080, "step": 131155 }, { "epoch": 16.434030823205113, "grad_norm": 9.917743682861328, "learning_rate": 9.37594475645291e-07, "loss": 0.5274, "num_input_tokens_seen": 159512128, "step": 131160 }, { "epoch": 16.434657311113895, "grad_norm": 14.464452743530273, "learning_rate": 9.372757725842119e-07, "loss": 0.4431, "num_input_tokens_seen": 159518240, "step": 131165 }, { "epoch": 16.435283799022677, "grad_norm": 5.6732707023620605, "learning_rate": 9.369571180964149e-07, "loss": 0.4018, "num_input_tokens_seen": 159524320, "step": 131170 }, { "epoch": 16.435910286931463, "grad_norm": 4.207605361938477, "learning_rate": 9.36638512185713e-07, "loss": 0.4266, "num_input_tokens_seen": 159530688, "step": 131175 }, { "epoch": 16.436536774840246, "grad_norm": 3.672332763671875, "learning_rate": 9.363199548559126e-07, "loss": 0.4688, "num_input_tokens_seen": 159536736, "step": 131180 }, { "epoch": 16.437163262749028, "grad_norm": 5.55706262588501, "learning_rate": 9.360014461108252e-07, "loss": 0.4364, "num_input_tokens_seen": 159542848, "step": 131185 }, { "epoch": 16.437789750657814, "grad_norm": 6.152373313903809, "learning_rate": 9.35682985954256e-07, "loss": 0.428, "num_input_tokens_seen": 159548608, "step": 131190 }, { "epoch": 16.438416238566596, "grad_norm": 5.920435905456543, "learning_rate": 9.353645743900136e-07, "loss": 0.5597, "num_input_tokens_seen": 159554656, "step": 131195 }, { "epoch": 16.43904272647538, "grad_norm": 2.491048812866211, "learning_rate": 9.350462114219061e-07, "loss": 0.4819, "num_input_tokens_seen": 159560384, "step": 131200 }, { "epoch": 16.439669214384164, "grad_norm": 5.151942253112793, "learning_rate": 9.347278970537382e-07, "loss": 0.4333, "num_input_tokens_seen": 159566592, "step": 131205 }, { "epoch": 16.440295702292946, "grad_norm": 5.697195053100586, "learning_rate": 9.344096312893159e-07, "loss": 0.4266, "num_input_tokens_seen": 159572544, "step": 131210 }, { "epoch": 16.44092219020173, "grad_norm": 8.398231506347656, "learning_rate": 9.340914141324442e-07, "loss": 0.4503, "num_input_tokens_seen": 159578592, "step": 131215 }, { "epoch": 16.44154867811051, "grad_norm": 3.867251396179199, "learning_rate": 9.337732455869298e-07, "loss": 0.4473, "num_input_tokens_seen": 159584544, "step": 131220 }, { "epoch": 16.442175166019297, "grad_norm": 5.533018589019775, "learning_rate": 9.334551256565738e-07, "loss": 0.4433, "num_input_tokens_seen": 159590848, "step": 131225 }, { "epoch": 16.44280165392808, "grad_norm": 3.298980236053467, "learning_rate": 9.331370543451818e-07, "loss": 0.4734, "num_input_tokens_seen": 159596992, "step": 131230 }, { "epoch": 16.44342814183686, "grad_norm": 3.8469364643096924, "learning_rate": 9.328190316565544e-07, "loss": 0.4285, "num_input_tokens_seen": 159603072, "step": 131235 }, { "epoch": 16.444054629745647, "grad_norm": 3.049408197402954, "learning_rate": 9.32501057594496e-07, "loss": 0.5221, "num_input_tokens_seen": 159608928, "step": 131240 }, { "epoch": 16.44468111765443, "grad_norm": 9.570296287536621, "learning_rate": 9.321831321628061e-07, "loss": 0.5108, "num_input_tokens_seen": 159614944, "step": 131245 }, { "epoch": 16.44530760556321, "grad_norm": 4.387509822845459, "learning_rate": 9.318652553652879e-07, "loss": 0.4546, "num_input_tokens_seen": 159620832, "step": 131250 }, { "epoch": 16.445934093471998, "grad_norm": 4.863070487976074, "learning_rate": 9.3154742720574e-07, "loss": 0.441, "num_input_tokens_seen": 159626976, "step": 131255 }, { "epoch": 16.44656058138078, "grad_norm": 8.326571464538574, "learning_rate": 9.312296476879629e-07, "loss": 0.5159, "num_input_tokens_seen": 159633312, "step": 131260 }, { "epoch": 16.447187069289562, "grad_norm": 10.499571800231934, "learning_rate": 9.309119168157576e-07, "loss": 0.4589, "num_input_tokens_seen": 159639232, "step": 131265 }, { "epoch": 16.447813557198344, "grad_norm": 12.700063705444336, "learning_rate": 9.305942345929198e-07, "loss": 0.4562, "num_input_tokens_seen": 159645376, "step": 131270 }, { "epoch": 16.44844004510713, "grad_norm": 6.293216228485107, "learning_rate": 9.302766010232511e-07, "loss": 0.4411, "num_input_tokens_seen": 159650688, "step": 131275 }, { "epoch": 16.449066533015912, "grad_norm": 3.9307427406311035, "learning_rate": 9.299590161105459e-07, "loss": 0.4783, "num_input_tokens_seen": 159656992, "step": 131280 }, { "epoch": 16.449693020924695, "grad_norm": 5.700310230255127, "learning_rate": 9.296414798586034e-07, "loss": 0.4574, "num_input_tokens_seen": 159662912, "step": 131285 }, { "epoch": 16.45031950883348, "grad_norm": 5.517252445220947, "learning_rate": 9.293239922712183e-07, "loss": 0.4397, "num_input_tokens_seen": 159668416, "step": 131290 }, { "epoch": 16.450945996742263, "grad_norm": 7.268392086029053, "learning_rate": 9.290065533521885e-07, "loss": 0.4567, "num_input_tokens_seen": 159674400, "step": 131295 }, { "epoch": 16.451572484651045, "grad_norm": 3.080306053161621, "learning_rate": 9.286891631053069e-07, "loss": 0.399, "num_input_tokens_seen": 159680256, "step": 131300 }, { "epoch": 16.45219897255983, "grad_norm": 3.2085421085357666, "learning_rate": 9.28371821534369e-07, "loss": 0.3828, "num_input_tokens_seen": 159685824, "step": 131305 }, { "epoch": 16.452825460468613, "grad_norm": 5.753179550170898, "learning_rate": 9.280545286431697e-07, "loss": 0.441, "num_input_tokens_seen": 159691552, "step": 131310 }, { "epoch": 16.453451948377396, "grad_norm": 10.887800216674805, "learning_rate": 9.277372844355031e-07, "loss": 0.4368, "num_input_tokens_seen": 159697440, "step": 131315 }, { "epoch": 16.45407843628618, "grad_norm": 4.515318870544434, "learning_rate": 9.274200889151596e-07, "loss": 0.4405, "num_input_tokens_seen": 159703520, "step": 131320 }, { "epoch": 16.454704924194964, "grad_norm": 5.301748752593994, "learning_rate": 9.271029420859328e-07, "loss": 0.4608, "num_input_tokens_seen": 159709760, "step": 131325 }, { "epoch": 16.455331412103746, "grad_norm": 6.863186836242676, "learning_rate": 9.267858439516164e-07, "loss": 0.4113, "num_input_tokens_seen": 159715872, "step": 131330 }, { "epoch": 16.45595790001253, "grad_norm": 5.582934379577637, "learning_rate": 9.264687945159983e-07, "loss": 0.4996, "num_input_tokens_seen": 159722496, "step": 131335 }, { "epoch": 16.456584387921314, "grad_norm": 4.888243675231934, "learning_rate": 9.261517937828717e-07, "loss": 0.4524, "num_input_tokens_seen": 159728192, "step": 131340 }, { "epoch": 16.457210875830096, "grad_norm": 14.768940925598145, "learning_rate": 9.258348417560242e-07, "loss": 0.4847, "num_input_tokens_seen": 159734368, "step": 131345 }, { "epoch": 16.45783736373888, "grad_norm": 4.635445594787598, "learning_rate": 9.255179384392482e-07, "loss": 0.4834, "num_input_tokens_seen": 159740096, "step": 131350 }, { "epoch": 16.458463851647664, "grad_norm": 5.6477179527282715, "learning_rate": 9.252010838363296e-07, "loss": 0.4749, "num_input_tokens_seen": 159746112, "step": 131355 }, { "epoch": 16.459090339556447, "grad_norm": 9.331472396850586, "learning_rate": 9.248842779510591e-07, "loss": 0.4315, "num_input_tokens_seen": 159752064, "step": 131360 }, { "epoch": 16.45971682746523, "grad_norm": 2.0866215229034424, "learning_rate": 9.245675207872218e-07, "loss": 0.4565, "num_input_tokens_seen": 159758400, "step": 131365 }, { "epoch": 16.460343315374015, "grad_norm": 5.632879734039307, "learning_rate": 9.242508123486077e-07, "loss": 0.4375, "num_input_tokens_seen": 159763872, "step": 131370 }, { "epoch": 16.460969803282797, "grad_norm": 8.301152229309082, "learning_rate": 9.23934152639e-07, "loss": 0.4738, "num_input_tokens_seen": 159770080, "step": 131375 }, { "epoch": 16.46159629119158, "grad_norm": 14.186182022094727, "learning_rate": 9.236175416621873e-07, "loss": 0.4061, "num_input_tokens_seen": 159775712, "step": 131380 }, { "epoch": 16.46222277910036, "grad_norm": 9.98340129852295, "learning_rate": 9.233009794219549e-07, "loss": 0.5077, "num_input_tokens_seen": 159780672, "step": 131385 }, { "epoch": 16.462849267009148, "grad_norm": 4.955284595489502, "learning_rate": 9.229844659220854e-07, "loss": 0.42, "num_input_tokens_seen": 159786784, "step": 131390 }, { "epoch": 16.46347575491793, "grad_norm": 4.788747787475586, "learning_rate": 9.226680011663658e-07, "loss": 0.4162, "num_input_tokens_seen": 159792736, "step": 131395 }, { "epoch": 16.464102242826712, "grad_norm": 15.68528938293457, "learning_rate": 9.223515851585774e-07, "loss": 0.4812, "num_input_tokens_seen": 159798976, "step": 131400 }, { "epoch": 16.464728730735498, "grad_norm": 5.401501655578613, "learning_rate": 9.220352179025049e-07, "loss": 0.4472, "num_input_tokens_seen": 159804768, "step": 131405 }, { "epoch": 16.46535521864428, "grad_norm": 3.3590385913848877, "learning_rate": 9.217188994019288e-07, "loss": 0.4437, "num_input_tokens_seen": 159810880, "step": 131410 }, { "epoch": 16.465981706553062, "grad_norm": 3.3751797676086426, "learning_rate": 9.214026296606321e-07, "loss": 0.4089, "num_input_tokens_seen": 159817088, "step": 131415 }, { "epoch": 16.46660819446185, "grad_norm": 4.283722877502441, "learning_rate": 9.210864086823962e-07, "loss": 0.4096, "num_input_tokens_seen": 159823264, "step": 131420 }, { "epoch": 16.46723468237063, "grad_norm": 4.219213008880615, "learning_rate": 9.207702364710025e-07, "loss": 0.4184, "num_input_tokens_seen": 159829408, "step": 131425 }, { "epoch": 16.467861170279413, "grad_norm": 4.654146194458008, "learning_rate": 9.204541130302297e-07, "loss": 0.4287, "num_input_tokens_seen": 159834944, "step": 131430 }, { "epoch": 16.4684876581882, "grad_norm": 14.604179382324219, "learning_rate": 9.201380383638586e-07, "loss": 0.4961, "num_input_tokens_seen": 159840960, "step": 131435 }, { "epoch": 16.46911414609698, "grad_norm": 5.433627128601074, "learning_rate": 9.198220124756663e-07, "loss": 0.4222, "num_input_tokens_seen": 159846848, "step": 131440 }, { "epoch": 16.469740634005763, "grad_norm": 3.8893556594848633, "learning_rate": 9.195060353694319e-07, "loss": 0.4297, "num_input_tokens_seen": 159853024, "step": 131445 }, { "epoch": 16.470367121914546, "grad_norm": 3.888782024383545, "learning_rate": 9.19190107048935e-07, "loss": 0.4811, "num_input_tokens_seen": 159858848, "step": 131450 }, { "epoch": 16.47099360982333, "grad_norm": 22.19500160217285, "learning_rate": 9.188742275179502e-07, "loss": 0.5672, "num_input_tokens_seen": 159864960, "step": 131455 }, { "epoch": 16.471620097732114, "grad_norm": 5.303121089935303, "learning_rate": 9.185583967802569e-07, "loss": 0.3918, "num_input_tokens_seen": 159871264, "step": 131460 }, { "epoch": 16.472246585640896, "grad_norm": 4.163009166717529, "learning_rate": 9.182426148396273e-07, "loss": 0.4285, "num_input_tokens_seen": 159877344, "step": 131465 }, { "epoch": 16.472873073549682, "grad_norm": 13.730283737182617, "learning_rate": 9.17926881699841e-07, "loss": 0.4145, "num_input_tokens_seen": 159883328, "step": 131470 }, { "epoch": 16.473499561458464, "grad_norm": 4.963067531585693, "learning_rate": 9.176111973646696e-07, "loss": 0.441, "num_input_tokens_seen": 159889280, "step": 131475 }, { "epoch": 16.474126049367246, "grad_norm": 3.8570384979248047, "learning_rate": 9.172955618378892e-07, "loss": 0.4884, "num_input_tokens_seen": 159895520, "step": 131480 }, { "epoch": 16.474752537276032, "grad_norm": 4.010743618011475, "learning_rate": 9.169799751232722e-07, "loss": 0.4706, "num_input_tokens_seen": 159901760, "step": 131485 }, { "epoch": 16.475379025184814, "grad_norm": 9.826618194580078, "learning_rate": 9.166644372245937e-07, "loss": 0.4323, "num_input_tokens_seen": 159907872, "step": 131490 }, { "epoch": 16.476005513093597, "grad_norm": 15.525178909301758, "learning_rate": 9.163489481456234e-07, "loss": 0.5006, "num_input_tokens_seen": 159913856, "step": 131495 }, { "epoch": 16.47663200100238, "grad_norm": 7.186042308807373, "learning_rate": 9.160335078901344e-07, "loss": 0.3993, "num_input_tokens_seen": 159919872, "step": 131500 }, { "epoch": 16.477258488911165, "grad_norm": 5.770536422729492, "learning_rate": 9.157181164619006e-07, "loss": 0.5321, "num_input_tokens_seen": 159926400, "step": 131505 }, { "epoch": 16.477884976819947, "grad_norm": 4.209968090057373, "learning_rate": 9.154027738646887e-07, "loss": 0.485, "num_input_tokens_seen": 159932544, "step": 131510 }, { "epoch": 16.47851146472873, "grad_norm": 13.949203491210938, "learning_rate": 9.150874801022708e-07, "loss": 0.5192, "num_input_tokens_seen": 159938592, "step": 131515 }, { "epoch": 16.479137952637515, "grad_norm": 5.988714218139648, "learning_rate": 9.147722351784177e-07, "loss": 0.4047, "num_input_tokens_seen": 159944672, "step": 131520 }, { "epoch": 16.479764440546298, "grad_norm": 9.589373588562012, "learning_rate": 9.144570390968965e-07, "loss": 0.4686, "num_input_tokens_seen": 159950912, "step": 131525 }, { "epoch": 16.48039092845508, "grad_norm": 4.465071201324463, "learning_rate": 9.141418918614758e-07, "loss": 0.4204, "num_input_tokens_seen": 159957088, "step": 131530 }, { "epoch": 16.481017416363866, "grad_norm": 15.710736274719238, "learning_rate": 9.138267934759254e-07, "loss": 0.4613, "num_input_tokens_seen": 159963296, "step": 131535 }, { "epoch": 16.481643904272648, "grad_norm": 3.97821307182312, "learning_rate": 9.1351174394401e-07, "loss": 0.3933, "num_input_tokens_seen": 159969600, "step": 131540 }, { "epoch": 16.48227039218143, "grad_norm": 4.931314945220947, "learning_rate": 9.131967432694983e-07, "loss": 0.4812, "num_input_tokens_seen": 159976064, "step": 131545 }, { "epoch": 16.482896880090216, "grad_norm": 20.887121200561523, "learning_rate": 9.128817914561544e-07, "loss": 0.4703, "num_input_tokens_seen": 159982048, "step": 131550 }, { "epoch": 16.483523367999, "grad_norm": 3.596869468688965, "learning_rate": 9.125668885077465e-07, "loss": 0.457, "num_input_tokens_seen": 159988544, "step": 131555 }, { "epoch": 16.48414985590778, "grad_norm": 25.88228988647461, "learning_rate": 9.122520344280367e-07, "loss": 0.4567, "num_input_tokens_seen": 159994720, "step": 131560 }, { "epoch": 16.484776343816563, "grad_norm": 15.043676376342773, "learning_rate": 9.119372292207906e-07, "loss": 0.46, "num_input_tokens_seen": 160000608, "step": 131565 }, { "epoch": 16.48540283172535, "grad_norm": 3.9016621112823486, "learning_rate": 9.116224728897733e-07, "loss": 0.4278, "num_input_tokens_seen": 160006848, "step": 131570 }, { "epoch": 16.48602931963413, "grad_norm": 4.994378089904785, "learning_rate": 9.113077654387448e-07, "loss": 0.4253, "num_input_tokens_seen": 160012864, "step": 131575 }, { "epoch": 16.486655807542913, "grad_norm": 3.714096784591675, "learning_rate": 9.109931068714711e-07, "loss": 0.433, "num_input_tokens_seen": 160018848, "step": 131580 }, { "epoch": 16.4872822954517, "grad_norm": 16.31290054321289, "learning_rate": 9.106784971917116e-07, "loss": 0.4352, "num_input_tokens_seen": 160024832, "step": 131585 }, { "epoch": 16.48790878336048, "grad_norm": 16.774028778076172, "learning_rate": 9.103639364032302e-07, "loss": 0.4681, "num_input_tokens_seen": 160030816, "step": 131590 }, { "epoch": 16.488535271269264, "grad_norm": 5.068740367889404, "learning_rate": 9.100494245097847e-07, "loss": 0.4695, "num_input_tokens_seen": 160036672, "step": 131595 }, { "epoch": 16.48916175917805, "grad_norm": 6.8266072273254395, "learning_rate": 9.097349615151385e-07, "loss": 0.3988, "num_input_tokens_seen": 160043104, "step": 131600 }, { "epoch": 16.48978824708683, "grad_norm": 4.291849136352539, "learning_rate": 9.094205474230483e-07, "loss": 0.4542, "num_input_tokens_seen": 160049312, "step": 131605 }, { "epoch": 16.490414734995614, "grad_norm": 9.57498550415039, "learning_rate": 9.091061822372743e-07, "loss": 0.4982, "num_input_tokens_seen": 160055648, "step": 131610 }, { "epoch": 16.491041222904396, "grad_norm": 10.359395027160645, "learning_rate": 9.087918659615769e-07, "loss": 0.4215, "num_input_tokens_seen": 160061952, "step": 131615 }, { "epoch": 16.491667710813182, "grad_norm": 22.395294189453125, "learning_rate": 9.084775985997113e-07, "loss": 0.5922, "num_input_tokens_seen": 160067744, "step": 131620 }, { "epoch": 16.492294198721964, "grad_norm": 25.358787536621094, "learning_rate": 9.081633801554362e-07, "loss": 0.4781, "num_input_tokens_seen": 160073696, "step": 131625 }, { "epoch": 16.492920686630747, "grad_norm": 13.612748146057129, "learning_rate": 9.078492106325077e-07, "loss": 0.496, "num_input_tokens_seen": 160079776, "step": 131630 }, { "epoch": 16.493547174539533, "grad_norm": 13.73986530303955, "learning_rate": 9.075350900346836e-07, "loss": 0.4479, "num_input_tokens_seen": 160086048, "step": 131635 }, { "epoch": 16.494173662448315, "grad_norm": 19.31117057800293, "learning_rate": 9.07221018365717e-07, "loss": 0.4765, "num_input_tokens_seen": 160092352, "step": 131640 }, { "epoch": 16.494800150357097, "grad_norm": 11.784419059753418, "learning_rate": 9.069069956293658e-07, "loss": 0.4857, "num_input_tokens_seen": 160098592, "step": 131645 }, { "epoch": 16.495426638265883, "grad_norm": 7.573051452636719, "learning_rate": 9.065930218293811e-07, "loss": 0.3775, "num_input_tokens_seen": 160104608, "step": 131650 }, { "epoch": 16.496053126174665, "grad_norm": 11.692153930664062, "learning_rate": 9.062790969695195e-07, "loss": 0.4223, "num_input_tokens_seen": 160110816, "step": 131655 }, { "epoch": 16.496679614083448, "grad_norm": 4.575053691864014, "learning_rate": 9.059652210535324e-07, "loss": 0.4542, "num_input_tokens_seen": 160117152, "step": 131660 }, { "epoch": 16.49730610199223, "grad_norm": 3.267429828643799, "learning_rate": 9.056513940851741e-07, "loss": 0.4102, "num_input_tokens_seen": 160123104, "step": 131665 }, { "epoch": 16.497932589901016, "grad_norm": 7.66203498840332, "learning_rate": 9.053376160681942e-07, "loss": 0.4327, "num_input_tokens_seen": 160128928, "step": 131670 }, { "epoch": 16.498559077809798, "grad_norm": 3.5542895793914795, "learning_rate": 9.050238870063477e-07, "loss": 0.4595, "num_input_tokens_seen": 160135040, "step": 131675 }, { "epoch": 16.49918556571858, "grad_norm": 4.266297817230225, "learning_rate": 9.047102069033815e-07, "loss": 0.4161, "num_input_tokens_seen": 160140576, "step": 131680 }, { "epoch": 16.499812053627366, "grad_norm": 3.5964550971984863, "learning_rate": 9.043965757630485e-07, "loss": 0.4378, "num_input_tokens_seen": 160146656, "step": 131685 }, { "epoch": 16.50043854153615, "grad_norm": 6.098418712615967, "learning_rate": 9.040829935890988e-07, "loss": 0.4475, "num_input_tokens_seen": 160152640, "step": 131690 }, { "epoch": 16.50106502944493, "grad_norm": 19.294981002807617, "learning_rate": 9.037694603852793e-07, "loss": 0.4296, "num_input_tokens_seen": 160159328, "step": 131695 }, { "epoch": 16.501691517353716, "grad_norm": 4.87738561630249, "learning_rate": 9.034559761553407e-07, "loss": 0.4602, "num_input_tokens_seen": 160165440, "step": 131700 }, { "epoch": 16.5023180052625, "grad_norm": 4.424263954162598, "learning_rate": 9.031425409030293e-07, "loss": 0.4063, "num_input_tokens_seen": 160171520, "step": 131705 }, { "epoch": 16.50294449317128, "grad_norm": 8.122448921203613, "learning_rate": 9.028291546320945e-07, "loss": 0.4296, "num_input_tokens_seen": 160177312, "step": 131710 }, { "epoch": 16.503570981080067, "grad_norm": 3.1295368671417236, "learning_rate": 9.025158173462805e-07, "loss": 0.4825, "num_input_tokens_seen": 160183328, "step": 131715 }, { "epoch": 16.50419746898885, "grad_norm": 10.51034927368164, "learning_rate": 9.022025290493347e-07, "loss": 0.4564, "num_input_tokens_seen": 160189216, "step": 131720 }, { "epoch": 16.50482395689763, "grad_norm": 4.839381217956543, "learning_rate": 9.018892897450049e-07, "loss": 0.4195, "num_input_tokens_seen": 160195520, "step": 131725 }, { "epoch": 16.505450444806414, "grad_norm": 4.673698425292969, "learning_rate": 9.01576099437032e-07, "loss": 0.4194, "num_input_tokens_seen": 160201792, "step": 131730 }, { "epoch": 16.5060769327152, "grad_norm": 4.298248291015625, "learning_rate": 9.012629581291631e-07, "loss": 0.4776, "num_input_tokens_seen": 160207840, "step": 131735 }, { "epoch": 16.50670342062398, "grad_norm": 14.263993263244629, "learning_rate": 9.009498658251425e-07, "loss": 0.5718, "num_input_tokens_seen": 160214048, "step": 131740 }, { "epoch": 16.507329908532764, "grad_norm": 3.6960270404815674, "learning_rate": 9.006368225287116e-07, "loss": 0.4725, "num_input_tokens_seen": 160220256, "step": 131745 }, { "epoch": 16.50795639644155, "grad_norm": 3.6539015769958496, "learning_rate": 9.003238282436139e-07, "loss": 0.4813, "num_input_tokens_seen": 160226240, "step": 131750 }, { "epoch": 16.508582884350332, "grad_norm": 4.8126959800720215, "learning_rate": 9.000108829735932e-07, "loss": 0.4525, "num_input_tokens_seen": 160232320, "step": 131755 }, { "epoch": 16.509209372259114, "grad_norm": 10.79672622680664, "learning_rate": 8.996979867223876e-07, "loss": 0.421, "num_input_tokens_seen": 160238496, "step": 131760 }, { "epoch": 16.5098358601679, "grad_norm": 20.74486541748047, "learning_rate": 8.993851394937414e-07, "loss": 0.487, "num_input_tokens_seen": 160244704, "step": 131765 }, { "epoch": 16.510462348076683, "grad_norm": 17.564645767211914, "learning_rate": 8.990723412913926e-07, "loss": 0.5495, "num_input_tokens_seen": 160250656, "step": 131770 }, { "epoch": 16.511088835985465, "grad_norm": 7.552527904510498, "learning_rate": 8.987595921190828e-07, "loss": 0.4696, "num_input_tokens_seen": 160257216, "step": 131775 }, { "epoch": 16.511715323894247, "grad_norm": 2.6269543170928955, "learning_rate": 8.98446891980549e-07, "loss": 0.4171, "num_input_tokens_seen": 160263744, "step": 131780 }, { "epoch": 16.512341811803033, "grad_norm": 3.243582010269165, "learning_rate": 8.981342408795324e-07, "loss": 0.5158, "num_input_tokens_seen": 160269888, "step": 131785 }, { "epoch": 16.512968299711815, "grad_norm": 4.225444793701172, "learning_rate": 8.978216388197681e-07, "loss": 0.3609, "num_input_tokens_seen": 160276032, "step": 131790 }, { "epoch": 16.513594787620598, "grad_norm": 3.529627799987793, "learning_rate": 8.975090858049962e-07, "loss": 0.4008, "num_input_tokens_seen": 160282080, "step": 131795 }, { "epoch": 16.514221275529383, "grad_norm": 6.229700088500977, "learning_rate": 8.97196581838951e-07, "loss": 0.4123, "num_input_tokens_seen": 160288256, "step": 131800 }, { "epoch": 16.514847763438166, "grad_norm": 3.699277877807617, "learning_rate": 8.96884126925372e-07, "loss": 0.4968, "num_input_tokens_seen": 160294368, "step": 131805 }, { "epoch": 16.515474251346948, "grad_norm": 7.154468059539795, "learning_rate": 8.965717210679914e-07, "loss": 0.4705, "num_input_tokens_seen": 160300416, "step": 131810 }, { "epoch": 16.516100739255734, "grad_norm": 16.39466667175293, "learning_rate": 8.962593642705458e-07, "loss": 0.4742, "num_input_tokens_seen": 160306624, "step": 131815 }, { "epoch": 16.516727227164516, "grad_norm": 21.97943878173828, "learning_rate": 8.959470565367706e-07, "loss": 0.5111, "num_input_tokens_seen": 160312992, "step": 131820 }, { "epoch": 16.5173537150733, "grad_norm": 5.818472862243652, "learning_rate": 8.956347978703983e-07, "loss": 0.4679, "num_input_tokens_seen": 160318720, "step": 131825 }, { "epoch": 16.517980202982084, "grad_norm": 4.156600475311279, "learning_rate": 8.953225882751626e-07, "loss": 0.4143, "num_input_tokens_seen": 160325120, "step": 131830 }, { "epoch": 16.518606690890866, "grad_norm": 4.744755268096924, "learning_rate": 8.950104277547961e-07, "loss": 0.4352, "num_input_tokens_seen": 160331296, "step": 131835 }, { "epoch": 16.51923317879965, "grad_norm": 5.277687072753906, "learning_rate": 8.946983163130324e-07, "loss": 0.4637, "num_input_tokens_seen": 160337280, "step": 131840 }, { "epoch": 16.51985966670843, "grad_norm": 4.128399848937988, "learning_rate": 8.943862539536008e-07, "loss": 0.4361, "num_input_tokens_seen": 160343456, "step": 131845 }, { "epoch": 16.520486154617217, "grad_norm": 3.6651885509490967, "learning_rate": 8.940742406802344e-07, "loss": 0.448, "num_input_tokens_seen": 160349472, "step": 131850 }, { "epoch": 16.521112642526, "grad_norm": 5.0516676902771, "learning_rate": 8.937622764966619e-07, "loss": 0.4056, "num_input_tokens_seen": 160354848, "step": 131855 }, { "epoch": 16.52173913043478, "grad_norm": 2.3041529655456543, "learning_rate": 8.934503614066142e-07, "loss": 0.4148, "num_input_tokens_seen": 160361152, "step": 131860 }, { "epoch": 16.522365618343567, "grad_norm": 9.54969310760498, "learning_rate": 8.931384954138195e-07, "loss": 0.4911, "num_input_tokens_seen": 160366944, "step": 131865 }, { "epoch": 16.52299210625235, "grad_norm": 3.527681350708008, "learning_rate": 8.928266785220063e-07, "loss": 0.4967, "num_input_tokens_seen": 160373216, "step": 131870 }, { "epoch": 16.52361859416113, "grad_norm": 12.005448341369629, "learning_rate": 8.925149107349046e-07, "loss": 0.4073, "num_input_tokens_seen": 160379680, "step": 131875 }, { "epoch": 16.524245082069918, "grad_norm": 2.681394577026367, "learning_rate": 8.922031920562396e-07, "loss": 0.5532, "num_input_tokens_seen": 160385600, "step": 131880 }, { "epoch": 16.5248715699787, "grad_norm": 3.5886523723602295, "learning_rate": 8.918915224897401e-07, "loss": 0.4378, "num_input_tokens_seen": 160391904, "step": 131885 }, { "epoch": 16.525498057887482, "grad_norm": 16.287025451660156, "learning_rate": 8.915799020391303e-07, "loss": 0.474, "num_input_tokens_seen": 160398080, "step": 131890 }, { "epoch": 16.526124545796264, "grad_norm": 15.194788932800293, "learning_rate": 8.912683307081382e-07, "loss": 0.4939, "num_input_tokens_seen": 160404128, "step": 131895 }, { "epoch": 16.52675103370505, "grad_norm": 7.013011932373047, "learning_rate": 8.909568085004861e-07, "loss": 0.4245, "num_input_tokens_seen": 160410304, "step": 131900 }, { "epoch": 16.527377521613833, "grad_norm": 4.638625621795654, "learning_rate": 8.906453354199019e-07, "loss": 0.5548, "num_input_tokens_seen": 160416544, "step": 131905 }, { "epoch": 16.528004009522615, "grad_norm": 13.043590545654297, "learning_rate": 8.90333911470106e-07, "loss": 0.4462, "num_input_tokens_seen": 160422592, "step": 131910 }, { "epoch": 16.5286304974314, "grad_norm": 22.289079666137695, "learning_rate": 8.900225366548243e-07, "loss": 0.6589, "num_input_tokens_seen": 160428640, "step": 131915 }, { "epoch": 16.529256985340183, "grad_norm": 2.551837921142578, "learning_rate": 8.897112109777778e-07, "loss": 0.4696, "num_input_tokens_seen": 160434656, "step": 131920 }, { "epoch": 16.529883473248965, "grad_norm": 3.5710983276367188, "learning_rate": 8.893999344426896e-07, "loss": 0.4889, "num_input_tokens_seen": 160440672, "step": 131925 }, { "epoch": 16.53050996115775, "grad_norm": 4.876740455627441, "learning_rate": 8.890887070532811e-07, "loss": 0.4889, "num_input_tokens_seen": 160447040, "step": 131930 }, { "epoch": 16.531136449066533, "grad_norm": 18.908170700073242, "learning_rate": 8.887775288132744e-07, "loss": 0.5414, "num_input_tokens_seen": 160453376, "step": 131935 }, { "epoch": 16.531762936975316, "grad_norm": 14.297172546386719, "learning_rate": 8.884663997263882e-07, "loss": 0.4364, "num_input_tokens_seen": 160459520, "step": 131940 }, { "epoch": 16.5323894248841, "grad_norm": 5.172262668609619, "learning_rate": 8.881553197963422e-07, "loss": 0.4649, "num_input_tokens_seen": 160465568, "step": 131945 }, { "epoch": 16.533015912792884, "grad_norm": 4.261308670043945, "learning_rate": 8.87844289026858e-07, "loss": 0.4524, "num_input_tokens_seen": 160471776, "step": 131950 }, { "epoch": 16.533642400701666, "grad_norm": 2.872018575668335, "learning_rate": 8.875333074216513e-07, "loss": 0.4467, "num_input_tokens_seen": 160478112, "step": 131955 }, { "epoch": 16.53426888861045, "grad_norm": 4.786820411682129, "learning_rate": 8.872223749844433e-07, "loss": 0.4137, "num_input_tokens_seen": 160484320, "step": 131960 }, { "epoch": 16.534895376519234, "grad_norm": 2.984518051147461, "learning_rate": 8.869114917189475e-07, "loss": 0.416, "num_input_tokens_seen": 160490528, "step": 131965 }, { "epoch": 16.535521864428016, "grad_norm": 14.211807250976562, "learning_rate": 8.86600657628885e-07, "loss": 0.4135, "num_input_tokens_seen": 160496480, "step": 131970 }, { "epoch": 16.5361483523368, "grad_norm": 4.177526473999023, "learning_rate": 8.862898727179681e-07, "loss": 0.4338, "num_input_tokens_seen": 160502688, "step": 131975 }, { "epoch": 16.536774840245585, "grad_norm": 8.00578784942627, "learning_rate": 8.859791369899163e-07, "loss": 0.4746, "num_input_tokens_seen": 160508224, "step": 131980 }, { "epoch": 16.537401328154367, "grad_norm": 3.6389026641845703, "learning_rate": 8.856684504484409e-07, "loss": 0.4608, "num_input_tokens_seen": 160514112, "step": 131985 }, { "epoch": 16.53802781606315, "grad_norm": 3.2799155712127686, "learning_rate": 8.853578130972596e-07, "loss": 0.4686, "num_input_tokens_seen": 160520352, "step": 131990 }, { "epoch": 16.538654303971935, "grad_norm": 17.212371826171875, "learning_rate": 8.850472249400844e-07, "loss": 0.4796, "num_input_tokens_seen": 160526496, "step": 131995 }, { "epoch": 16.539280791880717, "grad_norm": 5.52182674407959, "learning_rate": 8.847366859806283e-07, "loss": 0.484, "num_input_tokens_seen": 160532608, "step": 132000 }, { "epoch": 16.5399072797895, "grad_norm": 4.764048099517822, "learning_rate": 8.844261962226069e-07, "loss": 0.4361, "num_input_tokens_seen": 160538656, "step": 132005 }, { "epoch": 16.54053376769828, "grad_norm": 8.345047950744629, "learning_rate": 8.841157556697294e-07, "loss": 0.5019, "num_input_tokens_seen": 160544992, "step": 132010 }, { "epoch": 16.541160255607068, "grad_norm": 18.29698944091797, "learning_rate": 8.838053643257094e-07, "loss": 0.4972, "num_input_tokens_seen": 160551136, "step": 132015 }, { "epoch": 16.54178674351585, "grad_norm": 5.272006988525391, "learning_rate": 8.834950221942562e-07, "loss": 0.4584, "num_input_tokens_seen": 160556960, "step": 132020 }, { "epoch": 16.542413231424632, "grad_norm": 3.336392402648926, "learning_rate": 8.831847292790818e-07, "loss": 0.4335, "num_input_tokens_seen": 160562784, "step": 132025 }, { "epoch": 16.543039719333418, "grad_norm": 4.042759895324707, "learning_rate": 8.828744855838938e-07, "loss": 0.4426, "num_input_tokens_seen": 160568864, "step": 132030 }, { "epoch": 16.5436662072422, "grad_norm": 23.198049545288086, "learning_rate": 8.825642911124033e-07, "loss": 0.4724, "num_input_tokens_seen": 160575008, "step": 132035 }, { "epoch": 16.544292695150983, "grad_norm": 4.169193744659424, "learning_rate": 8.822541458683181e-07, "loss": 0.5564, "num_input_tokens_seen": 160581120, "step": 132040 }, { "epoch": 16.54491918305977, "grad_norm": 4.923473834991455, "learning_rate": 8.819440498553484e-07, "loss": 0.4672, "num_input_tokens_seen": 160587552, "step": 132045 }, { "epoch": 16.54554567096855, "grad_norm": 23.65302276611328, "learning_rate": 8.81634003077198e-07, "loss": 0.4547, "num_input_tokens_seen": 160593664, "step": 132050 }, { "epoch": 16.546172158877333, "grad_norm": 3.515284776687622, "learning_rate": 8.813240055375772e-07, "loss": 0.4593, "num_input_tokens_seen": 160599872, "step": 132055 }, { "epoch": 16.546798646786115, "grad_norm": 3.743198871612549, "learning_rate": 8.810140572401888e-07, "loss": 0.3945, "num_input_tokens_seen": 160605696, "step": 132060 }, { "epoch": 16.5474251346949, "grad_norm": 8.522690773010254, "learning_rate": 8.80704158188741e-07, "loss": 0.4646, "num_input_tokens_seen": 160611712, "step": 132065 }, { "epoch": 16.548051622603683, "grad_norm": 6.511836051940918, "learning_rate": 8.803943083869393e-07, "loss": 0.5283, "num_input_tokens_seen": 160617888, "step": 132070 }, { "epoch": 16.548678110512466, "grad_norm": 16.40097427368164, "learning_rate": 8.800845078384862e-07, "loss": 0.5145, "num_input_tokens_seen": 160624160, "step": 132075 }, { "epoch": 16.54930459842125, "grad_norm": 3.050727367401123, "learning_rate": 8.797747565470882e-07, "loss": 0.4758, "num_input_tokens_seen": 160630048, "step": 132080 }, { "epoch": 16.549931086330034, "grad_norm": 5.762407302856445, "learning_rate": 8.794650545164452e-07, "loss": 0.4163, "num_input_tokens_seen": 160636448, "step": 132085 }, { "epoch": 16.550557574238816, "grad_norm": 3.577477216720581, "learning_rate": 8.791554017502634e-07, "loss": 0.438, "num_input_tokens_seen": 160642816, "step": 132090 }, { "epoch": 16.551184062147602, "grad_norm": 16.48271942138672, "learning_rate": 8.788457982522419e-07, "loss": 0.5714, "num_input_tokens_seen": 160648800, "step": 132095 }, { "epoch": 16.551810550056384, "grad_norm": 9.857815742492676, "learning_rate": 8.785362440260852e-07, "loss": 0.4873, "num_input_tokens_seen": 160655008, "step": 132100 }, { "epoch": 16.552437037965166, "grad_norm": 3.563081979751587, "learning_rate": 8.782267390754917e-07, "loss": 0.4376, "num_input_tokens_seen": 160661408, "step": 132105 }, { "epoch": 16.553063525873952, "grad_norm": 4.03478479385376, "learning_rate": 8.779172834041638e-07, "loss": 0.4054, "num_input_tokens_seen": 160667392, "step": 132110 }, { "epoch": 16.553690013782735, "grad_norm": 16.982257843017578, "learning_rate": 8.776078770157998e-07, "loss": 0.4419, "num_input_tokens_seen": 160673376, "step": 132115 }, { "epoch": 16.554316501691517, "grad_norm": 5.298900604248047, "learning_rate": 8.772985199140988e-07, "loss": 0.5076, "num_input_tokens_seen": 160679552, "step": 132120 }, { "epoch": 16.5549429896003, "grad_norm": 12.678422927856445, "learning_rate": 8.769892121027618e-07, "loss": 0.4745, "num_input_tokens_seen": 160685856, "step": 132125 }, { "epoch": 16.555569477509085, "grad_norm": 2.8818740844726562, "learning_rate": 8.766799535854842e-07, "loss": 0.4527, "num_input_tokens_seen": 160691648, "step": 132130 }, { "epoch": 16.556195965417867, "grad_norm": 15.900665283203125, "learning_rate": 8.763707443659641e-07, "loss": 0.465, "num_input_tokens_seen": 160697440, "step": 132135 }, { "epoch": 16.55682245332665, "grad_norm": 5.06111478805542, "learning_rate": 8.760615844478997e-07, "loss": 0.4989, "num_input_tokens_seen": 160703776, "step": 132140 }, { "epoch": 16.557448941235435, "grad_norm": 18.713191986083984, "learning_rate": 8.757524738349854e-07, "loss": 0.4732, "num_input_tokens_seen": 160709920, "step": 132145 }, { "epoch": 16.558075429144218, "grad_norm": 4.504457950592041, "learning_rate": 8.75443412530918e-07, "loss": 0.4088, "num_input_tokens_seen": 160716224, "step": 132150 }, { "epoch": 16.558701917053, "grad_norm": 4.876123428344727, "learning_rate": 8.751344005393931e-07, "loss": 0.4864, "num_input_tokens_seen": 160722400, "step": 132155 }, { "epoch": 16.559328404961786, "grad_norm": 2.947932004928589, "learning_rate": 8.748254378641035e-07, "loss": 0.4351, "num_input_tokens_seen": 160728832, "step": 132160 }, { "epoch": 16.559954892870568, "grad_norm": 3.36352276802063, "learning_rate": 8.74516524508745e-07, "loss": 0.4279, "num_input_tokens_seen": 160734848, "step": 132165 }, { "epoch": 16.56058138077935, "grad_norm": 5.39489221572876, "learning_rate": 8.742076604770094e-07, "loss": 0.4182, "num_input_tokens_seen": 160741120, "step": 132170 }, { "epoch": 16.561207868688136, "grad_norm": 4.875870227813721, "learning_rate": 8.738988457725906e-07, "loss": 0.442, "num_input_tokens_seen": 160747136, "step": 132175 }, { "epoch": 16.56183435659692, "grad_norm": 15.223809242248535, "learning_rate": 8.735900803991792e-07, "loss": 0.5267, "num_input_tokens_seen": 160753504, "step": 132180 }, { "epoch": 16.5624608445057, "grad_norm": 5.260226726531982, "learning_rate": 8.732813643604671e-07, "loss": 0.4261, "num_input_tokens_seen": 160759456, "step": 132185 }, { "epoch": 16.563087332414483, "grad_norm": 5.9047112464904785, "learning_rate": 8.729726976601477e-07, "loss": 0.4713, "num_input_tokens_seen": 160765696, "step": 132190 }, { "epoch": 16.56371382032327, "grad_norm": 11.792911529541016, "learning_rate": 8.726640803019082e-07, "loss": 0.4204, "num_input_tokens_seen": 160771712, "step": 132195 }, { "epoch": 16.56434030823205, "grad_norm": 6.835394859313965, "learning_rate": 8.723555122894406e-07, "loss": 0.5567, "num_input_tokens_seen": 160778144, "step": 132200 }, { "epoch": 16.564966796140833, "grad_norm": 3.9982962608337402, "learning_rate": 8.720469936264325e-07, "loss": 0.4413, "num_input_tokens_seen": 160784128, "step": 132205 }, { "epoch": 16.56559328404962, "grad_norm": 4.200848579406738, "learning_rate": 8.717385243165738e-07, "loss": 0.4296, "num_input_tokens_seen": 160790048, "step": 132210 }, { "epoch": 16.5662197719584, "grad_norm": 3.2742249965667725, "learning_rate": 8.71430104363551e-07, "loss": 0.5109, "num_input_tokens_seen": 160796352, "step": 132215 }, { "epoch": 16.566846259867184, "grad_norm": 2.6099376678466797, "learning_rate": 8.711217337710537e-07, "loss": 0.4277, "num_input_tokens_seen": 160802880, "step": 132220 }, { "epoch": 16.56747274777597, "grad_norm": 8.711576461791992, "learning_rate": 8.708134125427658e-07, "loss": 0.5137, "num_input_tokens_seen": 160809024, "step": 132225 }, { "epoch": 16.568099235684752, "grad_norm": 5.022647380828857, "learning_rate": 8.705051406823755e-07, "loss": 0.4905, "num_input_tokens_seen": 160815264, "step": 132230 }, { "epoch": 16.568725723593534, "grad_norm": 5.2302703857421875, "learning_rate": 8.701969181935694e-07, "loss": 0.4889, "num_input_tokens_seen": 160821536, "step": 132235 }, { "epoch": 16.569352211502316, "grad_norm": 13.026495933532715, "learning_rate": 8.698887450800303e-07, "loss": 0.4568, "num_input_tokens_seen": 160827456, "step": 132240 }, { "epoch": 16.569978699411102, "grad_norm": 4.625502109527588, "learning_rate": 8.695806213454433e-07, "loss": 0.4389, "num_input_tokens_seen": 160833472, "step": 132245 }, { "epoch": 16.570605187319885, "grad_norm": 20.697404861450195, "learning_rate": 8.692725469934926e-07, "loss": 0.5291, "num_input_tokens_seen": 160838528, "step": 132250 }, { "epoch": 16.571231675228667, "grad_norm": 3.4969232082366943, "learning_rate": 8.689645220278631e-07, "loss": 0.4085, "num_input_tokens_seen": 160844480, "step": 132255 }, { "epoch": 16.571858163137453, "grad_norm": 4.338602542877197, "learning_rate": 8.686565464522339e-07, "loss": 0.4414, "num_input_tokens_seen": 160851040, "step": 132260 }, { "epoch": 16.572484651046235, "grad_norm": 2.4787347316741943, "learning_rate": 8.683486202702912e-07, "loss": 0.4373, "num_input_tokens_seen": 160857088, "step": 132265 }, { "epoch": 16.573111138955017, "grad_norm": 4.028609275817871, "learning_rate": 8.68040743485713e-07, "loss": 0.4612, "num_input_tokens_seen": 160863456, "step": 132270 }, { "epoch": 16.573737626863803, "grad_norm": 3.867783546447754, "learning_rate": 8.677329161021825e-07, "loss": 0.3929, "num_input_tokens_seen": 160869440, "step": 132275 }, { "epoch": 16.574364114772585, "grad_norm": 4.8428730964660645, "learning_rate": 8.674251381233784e-07, "loss": 0.4235, "num_input_tokens_seen": 160875264, "step": 132280 }, { "epoch": 16.574990602681368, "grad_norm": 3.0107226371765137, "learning_rate": 8.671174095529828e-07, "loss": 0.4264, "num_input_tokens_seen": 160881696, "step": 132285 }, { "epoch": 16.57561709059015, "grad_norm": 3.4115943908691406, "learning_rate": 8.668097303946716e-07, "loss": 0.4539, "num_input_tokens_seen": 160887552, "step": 132290 }, { "epoch": 16.576243578498936, "grad_norm": 8.762100219726562, "learning_rate": 8.665021006521263e-07, "loss": 0.4391, "num_input_tokens_seen": 160893408, "step": 132295 }, { "epoch": 16.576870066407718, "grad_norm": 5.588210582733154, "learning_rate": 8.661945203290223e-07, "loss": 0.427, "num_input_tokens_seen": 160899456, "step": 132300 }, { "epoch": 16.5774965543165, "grad_norm": 17.859710693359375, "learning_rate": 8.658869894290384e-07, "loss": 0.5212, "num_input_tokens_seen": 160905952, "step": 132305 }, { "epoch": 16.578123042225286, "grad_norm": 3.723180055618286, "learning_rate": 8.655795079558527e-07, "loss": 0.4226, "num_input_tokens_seen": 160912064, "step": 132310 }, { "epoch": 16.57874953013407, "grad_norm": 27.74175262451172, "learning_rate": 8.652720759131389e-07, "loss": 0.5705, "num_input_tokens_seen": 160918336, "step": 132315 }, { "epoch": 16.57937601804285, "grad_norm": 13.955873489379883, "learning_rate": 8.649646933045746e-07, "loss": 0.4486, "num_input_tokens_seen": 160924512, "step": 132320 }, { "epoch": 16.580002505951636, "grad_norm": 4.720781326293945, "learning_rate": 8.646573601338327e-07, "loss": 0.4613, "num_input_tokens_seen": 160930560, "step": 132325 }, { "epoch": 16.58062899386042, "grad_norm": 6.644184589385986, "learning_rate": 8.643500764045904e-07, "loss": 0.4269, "num_input_tokens_seen": 160936480, "step": 132330 }, { "epoch": 16.5812554817692, "grad_norm": 4.462205410003662, "learning_rate": 8.640428421205188e-07, "loss": 0.4236, "num_input_tokens_seen": 160942624, "step": 132335 }, { "epoch": 16.581881969677987, "grad_norm": 6.085463047027588, "learning_rate": 8.637356572852918e-07, "loss": 0.4451, "num_input_tokens_seen": 160948992, "step": 132340 }, { "epoch": 16.58250845758677, "grad_norm": 6.620335578918457, "learning_rate": 8.634285219025845e-07, "loss": 0.4852, "num_input_tokens_seen": 160954976, "step": 132345 }, { "epoch": 16.58313494549555, "grad_norm": 4.44081974029541, "learning_rate": 8.631214359760654e-07, "loss": 0.5058, "num_input_tokens_seen": 160961120, "step": 132350 }, { "epoch": 16.583761433404334, "grad_norm": 8.520336151123047, "learning_rate": 8.628143995094079e-07, "loss": 0.4682, "num_input_tokens_seen": 160967328, "step": 132355 }, { "epoch": 16.58438792131312, "grad_norm": 8.923948287963867, "learning_rate": 8.625074125062838e-07, "loss": 0.5732, "num_input_tokens_seen": 160973376, "step": 132360 }, { "epoch": 16.585014409221902, "grad_norm": 3.5454983711242676, "learning_rate": 8.622004749703605e-07, "loss": 0.4595, "num_input_tokens_seen": 160979008, "step": 132365 }, { "epoch": 16.585640897130684, "grad_norm": 15.746516227722168, "learning_rate": 8.618935869053096e-07, "loss": 0.6013, "num_input_tokens_seen": 160984576, "step": 132370 }, { "epoch": 16.58626738503947, "grad_norm": 5.28494930267334, "learning_rate": 8.615867483148016e-07, "loss": 0.4536, "num_input_tokens_seen": 160991072, "step": 132375 }, { "epoch": 16.586893872948252, "grad_norm": 3.9908556938171387, "learning_rate": 8.612799592025023e-07, "loss": 0.4242, "num_input_tokens_seen": 160997472, "step": 132380 }, { "epoch": 16.587520360857035, "grad_norm": 3.605346918106079, "learning_rate": 8.609732195720815e-07, "loss": 0.4512, "num_input_tokens_seen": 161003584, "step": 132385 }, { "epoch": 16.58814684876582, "grad_norm": 4.897922515869141, "learning_rate": 8.606665294272043e-07, "loss": 0.4185, "num_input_tokens_seen": 161009344, "step": 132390 }, { "epoch": 16.588773336674603, "grad_norm": 3.733851194381714, "learning_rate": 8.603598887715408e-07, "loss": 0.4546, "num_input_tokens_seen": 161015392, "step": 132395 }, { "epoch": 16.589399824583385, "grad_norm": 12.907280921936035, "learning_rate": 8.600532976087539e-07, "loss": 0.4226, "num_input_tokens_seen": 161021600, "step": 132400 }, { "epoch": 16.590026312492167, "grad_norm": 3.0953540802001953, "learning_rate": 8.597467559425115e-07, "loss": 0.4083, "num_input_tokens_seen": 161027808, "step": 132405 }, { "epoch": 16.590652800400953, "grad_norm": 2.308469295501709, "learning_rate": 8.59440263776476e-07, "loss": 0.4196, "num_input_tokens_seen": 161033760, "step": 132410 }, { "epoch": 16.591279288309735, "grad_norm": 5.010642051696777, "learning_rate": 8.591338211143152e-07, "loss": 0.455, "num_input_tokens_seen": 161040000, "step": 132415 }, { "epoch": 16.591905776218518, "grad_norm": 4.563924312591553, "learning_rate": 8.588274279596892e-07, "loss": 0.4391, "num_input_tokens_seen": 161046080, "step": 132420 }, { "epoch": 16.592532264127303, "grad_norm": 5.064391136169434, "learning_rate": 8.58521084316265e-07, "loss": 0.5065, "num_input_tokens_seen": 161052352, "step": 132425 }, { "epoch": 16.593158752036086, "grad_norm": 8.205349922180176, "learning_rate": 8.582147901877014e-07, "loss": 0.4371, "num_input_tokens_seen": 161058144, "step": 132430 }, { "epoch": 16.593785239944868, "grad_norm": 5.864038944244385, "learning_rate": 8.57908545577662e-07, "loss": 0.4543, "num_input_tokens_seen": 161063776, "step": 132435 }, { "epoch": 16.594411727853654, "grad_norm": 3.263090133666992, "learning_rate": 8.576023504898101e-07, "loss": 0.4709, "num_input_tokens_seen": 161070048, "step": 132440 }, { "epoch": 16.595038215762436, "grad_norm": 12.072535514831543, "learning_rate": 8.572962049278038e-07, "loss": 0.4758, "num_input_tokens_seen": 161075744, "step": 132445 }, { "epoch": 16.59566470367122, "grad_norm": 4.4701972007751465, "learning_rate": 8.569901088953037e-07, "loss": 0.4444, "num_input_tokens_seen": 161081792, "step": 132450 }, { "epoch": 16.596291191580004, "grad_norm": 4.989410400390625, "learning_rate": 8.566840623959704e-07, "loss": 0.4471, "num_input_tokens_seen": 161087648, "step": 132455 }, { "epoch": 16.596917679488786, "grad_norm": 3.8775765895843506, "learning_rate": 8.563780654334642e-07, "loss": 0.4668, "num_input_tokens_seen": 161093856, "step": 132460 }, { "epoch": 16.59754416739757, "grad_norm": 17.77680206298828, "learning_rate": 8.560721180114407e-07, "loss": 0.5575, "num_input_tokens_seen": 161099968, "step": 132465 }, { "epoch": 16.59817065530635, "grad_norm": 3.20279860496521, "learning_rate": 8.557662201335598e-07, "loss": 0.4225, "num_input_tokens_seen": 161106208, "step": 132470 }, { "epoch": 16.598797143215137, "grad_norm": 4.7776007652282715, "learning_rate": 8.554603718034771e-07, "loss": 0.4248, "num_input_tokens_seen": 161112256, "step": 132475 }, { "epoch": 16.59942363112392, "grad_norm": 7.309926509857178, "learning_rate": 8.551545730248511e-07, "loss": 0.4296, "num_input_tokens_seen": 161118400, "step": 132480 }, { "epoch": 16.6000501190327, "grad_norm": 5.3756794929504395, "learning_rate": 8.548488238013364e-07, "loss": 0.4488, "num_input_tokens_seen": 161124448, "step": 132485 }, { "epoch": 16.600676606941487, "grad_norm": 4.089465618133545, "learning_rate": 8.545431241365887e-07, "loss": 0.4438, "num_input_tokens_seen": 161130720, "step": 132490 }, { "epoch": 16.60130309485027, "grad_norm": 15.57693862915039, "learning_rate": 8.542374740342646e-07, "loss": 0.4713, "num_input_tokens_seen": 161137248, "step": 132495 }, { "epoch": 16.601929582759052, "grad_norm": 5.304459571838379, "learning_rate": 8.539318734980157e-07, "loss": 0.4679, "num_input_tokens_seen": 161143040, "step": 132500 }, { "epoch": 16.602556070667838, "grad_norm": 3.968756675720215, "learning_rate": 8.536263225314984e-07, "loss": 0.4374, "num_input_tokens_seen": 161149184, "step": 132505 }, { "epoch": 16.60318255857662, "grad_norm": 4.758384704589844, "learning_rate": 8.533208211383631e-07, "loss": 0.5137, "num_input_tokens_seen": 161155360, "step": 132510 }, { "epoch": 16.603809046485402, "grad_norm": 3.6647732257843018, "learning_rate": 8.530153693222648e-07, "loss": 0.4409, "num_input_tokens_seen": 161161664, "step": 132515 }, { "epoch": 16.604435534394185, "grad_norm": 4.692984580993652, "learning_rate": 8.527099670868532e-07, "loss": 0.4695, "num_input_tokens_seen": 161168000, "step": 132520 }, { "epoch": 16.60506202230297, "grad_norm": 29.623950958251953, "learning_rate": 8.524046144357822e-07, "loss": 0.4932, "num_input_tokens_seen": 161174240, "step": 132525 }, { "epoch": 16.605688510211753, "grad_norm": 9.763688087463379, "learning_rate": 8.520993113727e-07, "loss": 0.4768, "num_input_tokens_seen": 161180416, "step": 132530 }, { "epoch": 16.606314998120535, "grad_norm": 5.078039646148682, "learning_rate": 8.51794057901259e-07, "loss": 0.4265, "num_input_tokens_seen": 161186048, "step": 132535 }, { "epoch": 16.60694148602932, "grad_norm": 2.7884883880615234, "learning_rate": 8.514888540251065e-07, "loss": 0.442, "num_input_tokens_seen": 161192192, "step": 132540 }, { "epoch": 16.607567973938103, "grad_norm": 3.3394272327423096, "learning_rate": 8.511836997478929e-07, "loss": 0.505, "num_input_tokens_seen": 161198112, "step": 132545 }, { "epoch": 16.608194461846885, "grad_norm": 3.732884645462036, "learning_rate": 8.508785950732656e-07, "loss": 0.5321, "num_input_tokens_seen": 161204256, "step": 132550 }, { "epoch": 16.60882094975567, "grad_norm": 28.430923461914062, "learning_rate": 8.505735400048748e-07, "loss": 0.5031, "num_input_tokens_seen": 161210208, "step": 132555 }, { "epoch": 16.609447437664453, "grad_norm": 16.011945724487305, "learning_rate": 8.502685345463646e-07, "loss": 0.4751, "num_input_tokens_seen": 161216416, "step": 132560 }, { "epoch": 16.610073925573236, "grad_norm": 2.7498483657836914, "learning_rate": 8.499635787013832e-07, "loss": 0.4403, "num_input_tokens_seen": 161222560, "step": 132565 }, { "epoch": 16.61070041348202, "grad_norm": 2.5961532592773438, "learning_rate": 8.496586724735773e-07, "loss": 0.413, "num_input_tokens_seen": 161228640, "step": 132570 }, { "epoch": 16.611326901390804, "grad_norm": 9.732110977172852, "learning_rate": 8.493538158665903e-07, "loss": 0.4589, "num_input_tokens_seen": 161234976, "step": 132575 }, { "epoch": 16.611953389299586, "grad_norm": 3.0788073539733887, "learning_rate": 8.490490088840692e-07, "loss": 0.478, "num_input_tokens_seen": 161240832, "step": 132580 }, { "epoch": 16.61257987720837, "grad_norm": 8.272378921508789, "learning_rate": 8.487442515296557e-07, "loss": 0.4259, "num_input_tokens_seen": 161247008, "step": 132585 }, { "epoch": 16.613206365117154, "grad_norm": 6.188782215118408, "learning_rate": 8.484395438069964e-07, "loss": 0.4061, "num_input_tokens_seen": 161252960, "step": 132590 }, { "epoch": 16.613832853025936, "grad_norm": 10.885247230529785, "learning_rate": 8.481348857197319e-07, "loss": 0.4145, "num_input_tokens_seen": 161258624, "step": 132595 }, { "epoch": 16.61445934093472, "grad_norm": 15.140748023986816, "learning_rate": 8.478302772715063e-07, "loss": 0.4757, "num_input_tokens_seen": 161264768, "step": 132600 }, { "epoch": 16.615085828843505, "grad_norm": 5.767332077026367, "learning_rate": 8.475257184659597e-07, "loss": 0.4532, "num_input_tokens_seen": 161270592, "step": 132605 }, { "epoch": 16.615712316752287, "grad_norm": 21.95043182373047, "learning_rate": 8.472212093067351e-07, "loss": 0.5003, "num_input_tokens_seen": 161276480, "step": 132610 }, { "epoch": 16.61633880466107, "grad_norm": 3.6188254356384277, "learning_rate": 8.469167497974718e-07, "loss": 0.4162, "num_input_tokens_seen": 161282432, "step": 132615 }, { "epoch": 16.616965292569855, "grad_norm": 3.0738301277160645, "learning_rate": 8.466123399418103e-07, "loss": 0.4582, "num_input_tokens_seen": 161288320, "step": 132620 }, { "epoch": 16.617591780478637, "grad_norm": 3.4530982971191406, "learning_rate": 8.463079797433909e-07, "loss": 0.5283, "num_input_tokens_seen": 161294464, "step": 132625 }, { "epoch": 16.61821826838742, "grad_norm": 3.711930990219116, "learning_rate": 8.460036692058516e-07, "loss": 0.5446, "num_input_tokens_seen": 161300448, "step": 132630 }, { "epoch": 16.618844756296202, "grad_norm": 11.119093894958496, "learning_rate": 8.456994083328313e-07, "loss": 0.4734, "num_input_tokens_seen": 161305920, "step": 132635 }, { "epoch": 16.619471244204988, "grad_norm": 3.8179829120635986, "learning_rate": 8.453951971279667e-07, "loss": 0.4676, "num_input_tokens_seen": 161312096, "step": 132640 }, { "epoch": 16.62009773211377, "grad_norm": 7.379951000213623, "learning_rate": 8.450910355948961e-07, "loss": 0.4287, "num_input_tokens_seen": 161318304, "step": 132645 }, { "epoch": 16.620724220022552, "grad_norm": 5.4032793045043945, "learning_rate": 8.447869237372547e-07, "loss": 0.4528, "num_input_tokens_seen": 161324576, "step": 132650 }, { "epoch": 16.621350707931338, "grad_norm": 3.515113353729248, "learning_rate": 8.444828615586792e-07, "loss": 0.5322, "num_input_tokens_seen": 161330656, "step": 132655 }, { "epoch": 16.62197719584012, "grad_norm": 7.4968976974487305, "learning_rate": 8.441788490628045e-07, "loss": 0.4744, "num_input_tokens_seen": 161336704, "step": 132660 }, { "epoch": 16.622603683748903, "grad_norm": 6.737349033355713, "learning_rate": 8.438748862532669e-07, "loss": 0.4276, "num_input_tokens_seen": 161343200, "step": 132665 }, { "epoch": 16.62323017165769, "grad_norm": 3.188162088394165, "learning_rate": 8.435709731336983e-07, "loss": 0.4656, "num_input_tokens_seen": 161349312, "step": 132670 }, { "epoch": 16.62385665956647, "grad_norm": 5.373003959655762, "learning_rate": 8.432671097077332e-07, "loss": 0.39, "num_input_tokens_seen": 161355616, "step": 132675 }, { "epoch": 16.624483147475253, "grad_norm": 7.062582015991211, "learning_rate": 8.429632959790057e-07, "loss": 0.6252, "num_input_tokens_seen": 161361600, "step": 132680 }, { "epoch": 16.625109635384035, "grad_norm": 4.427280426025391, "learning_rate": 8.42659531951146e-07, "loss": 0.4712, "num_input_tokens_seen": 161367616, "step": 132685 }, { "epoch": 16.62573612329282, "grad_norm": 3.3786516189575195, "learning_rate": 8.42355817627788e-07, "loss": 0.4957, "num_input_tokens_seen": 161373536, "step": 132690 }, { "epoch": 16.626362611201603, "grad_norm": 5.408435821533203, "learning_rate": 8.420521530125608e-07, "loss": 0.4599, "num_input_tokens_seen": 161379648, "step": 132695 }, { "epoch": 16.626989099110386, "grad_norm": 8.159038543701172, "learning_rate": 8.417485381090967e-07, "loss": 0.4416, "num_input_tokens_seen": 161386080, "step": 132700 }, { "epoch": 16.62761558701917, "grad_norm": 10.751440048217773, "learning_rate": 8.414449729210239e-07, "loss": 0.5301, "num_input_tokens_seen": 161392736, "step": 132705 }, { "epoch": 16.628242074927954, "grad_norm": 2.5018131732940674, "learning_rate": 8.411414574519744e-07, "loss": 0.3997, "num_input_tokens_seen": 161398656, "step": 132710 }, { "epoch": 16.628868562836736, "grad_norm": 18.50103759765625, "learning_rate": 8.408379917055737e-07, "loss": 0.4928, "num_input_tokens_seen": 161404384, "step": 132715 }, { "epoch": 16.629495050745522, "grad_norm": 9.055962562561035, "learning_rate": 8.405345756854527e-07, "loss": 0.4526, "num_input_tokens_seen": 161410784, "step": 132720 }, { "epoch": 16.630121538654304, "grad_norm": 9.679313659667969, "learning_rate": 8.40231209395237e-07, "loss": 0.4232, "num_input_tokens_seen": 161416992, "step": 132725 }, { "epoch": 16.630748026563086, "grad_norm": 12.993629455566406, "learning_rate": 8.399278928385557e-07, "loss": 0.489, "num_input_tokens_seen": 161422976, "step": 132730 }, { "epoch": 16.631374514471872, "grad_norm": 8.639333724975586, "learning_rate": 8.396246260190327e-07, "loss": 0.4131, "num_input_tokens_seen": 161429088, "step": 132735 }, { "epoch": 16.632001002380655, "grad_norm": 4.488162517547607, "learning_rate": 8.393214089402951e-07, "loss": 0.4704, "num_input_tokens_seen": 161435104, "step": 132740 }, { "epoch": 16.632627490289437, "grad_norm": 16.52889633178711, "learning_rate": 8.390182416059694e-07, "loss": 0.5381, "num_input_tokens_seen": 161441120, "step": 132745 }, { "epoch": 16.63325397819822, "grad_norm": 4.842810153961182, "learning_rate": 8.387151240196778e-07, "loss": 0.4475, "num_input_tokens_seen": 161447136, "step": 132750 }, { "epoch": 16.633880466107005, "grad_norm": 3.6349050998687744, "learning_rate": 8.384120561850457e-07, "loss": 0.4437, "num_input_tokens_seen": 161453408, "step": 132755 }, { "epoch": 16.634506954015787, "grad_norm": 4.435835838317871, "learning_rate": 8.381090381056973e-07, "loss": 0.4907, "num_input_tokens_seen": 161459360, "step": 132760 }, { "epoch": 16.63513344192457, "grad_norm": 4.859012603759766, "learning_rate": 8.378060697852536e-07, "loss": 0.4658, "num_input_tokens_seen": 161465664, "step": 132765 }, { "epoch": 16.635759929833355, "grad_norm": 3.3436057567596436, "learning_rate": 8.375031512273374e-07, "loss": 0.4255, "num_input_tokens_seen": 161471648, "step": 132770 }, { "epoch": 16.636386417742138, "grad_norm": 3.3443970680236816, "learning_rate": 8.372002824355718e-07, "loss": 0.4631, "num_input_tokens_seen": 161477312, "step": 132775 }, { "epoch": 16.63701290565092, "grad_norm": 3.419750690460205, "learning_rate": 8.368974634135757e-07, "loss": 0.4737, "num_input_tokens_seen": 161483424, "step": 132780 }, { "epoch": 16.637639393559706, "grad_norm": 15.22566032409668, "learning_rate": 8.365946941649721e-07, "loss": 0.4343, "num_input_tokens_seen": 161489472, "step": 132785 }, { "epoch": 16.638265881468488, "grad_norm": 10.777191162109375, "learning_rate": 8.362919746933773e-07, "loss": 0.4777, "num_input_tokens_seen": 161495520, "step": 132790 }, { "epoch": 16.63889236937727, "grad_norm": 2.9375879764556885, "learning_rate": 8.359893050024149e-07, "loss": 0.4091, "num_input_tokens_seen": 161501792, "step": 132795 }, { "epoch": 16.639518857286056, "grad_norm": 4.784639358520508, "learning_rate": 8.35686685095699e-07, "loss": 0.4148, "num_input_tokens_seen": 161508288, "step": 132800 }, { "epoch": 16.64014534519484, "grad_norm": 3.310922145843506, "learning_rate": 8.353841149768504e-07, "loss": 0.3757, "num_input_tokens_seen": 161514336, "step": 132805 }, { "epoch": 16.64077183310362, "grad_norm": 6.64955997467041, "learning_rate": 8.350815946494873e-07, "loss": 0.4771, "num_input_tokens_seen": 161520352, "step": 132810 }, { "epoch": 16.641398321012403, "grad_norm": 4.176937580108643, "learning_rate": 8.347791241172243e-07, "loss": 0.4035, "num_input_tokens_seen": 161526272, "step": 132815 }, { "epoch": 16.64202480892119, "grad_norm": 14.893954277038574, "learning_rate": 8.344767033836798e-07, "loss": 0.518, "num_input_tokens_seen": 161532544, "step": 132820 }, { "epoch": 16.64265129682997, "grad_norm": 5.045041561126709, "learning_rate": 8.341743324524676e-07, "loss": 0.4601, "num_input_tokens_seen": 161538528, "step": 132825 }, { "epoch": 16.643277784738753, "grad_norm": 17.220443725585938, "learning_rate": 8.338720113272048e-07, "loss": 0.4956, "num_input_tokens_seen": 161544704, "step": 132830 }, { "epoch": 16.64390427264754, "grad_norm": 4.368035793304443, "learning_rate": 8.335697400115033e-07, "loss": 0.4346, "num_input_tokens_seen": 161551040, "step": 132835 }, { "epoch": 16.64453076055632, "grad_norm": 19.18154525756836, "learning_rate": 8.3326751850898e-07, "loss": 0.5706, "num_input_tokens_seen": 161556672, "step": 132840 }, { "epoch": 16.645157248465104, "grad_norm": 3.6967713832855225, "learning_rate": 8.329653468232452e-07, "loss": 0.4343, "num_input_tokens_seen": 161562592, "step": 132845 }, { "epoch": 16.64578373637389, "grad_norm": 8.896112442016602, "learning_rate": 8.32663224957913e-07, "loss": 0.5069, "num_input_tokens_seen": 161568832, "step": 132850 }, { "epoch": 16.646410224282672, "grad_norm": 3.808417797088623, "learning_rate": 8.323611529165976e-07, "loss": 0.4699, "num_input_tokens_seen": 161574944, "step": 132855 }, { "epoch": 16.647036712191454, "grad_norm": 11.22531795501709, "learning_rate": 8.320591307029069e-07, "loss": 0.4438, "num_input_tokens_seen": 161581088, "step": 132860 }, { "epoch": 16.647663200100236, "grad_norm": 9.860432624816895, "learning_rate": 8.317571583204537e-07, "loss": 0.4283, "num_input_tokens_seen": 161587104, "step": 132865 }, { "epoch": 16.648289688009022, "grad_norm": 3.476457357406616, "learning_rate": 8.314552357728478e-07, "loss": 0.3838, "num_input_tokens_seen": 161593152, "step": 132870 }, { "epoch": 16.648916175917805, "grad_norm": 2.585200548171997, "learning_rate": 8.311533630637009e-07, "loss": 0.4292, "num_input_tokens_seen": 161599296, "step": 132875 }, { "epoch": 16.649542663826587, "grad_norm": 9.125197410583496, "learning_rate": 8.308515401966188e-07, "loss": 0.4692, "num_input_tokens_seen": 161605440, "step": 132880 }, { "epoch": 16.650169151735373, "grad_norm": 22.302120208740234, "learning_rate": 8.305497671752132e-07, "loss": 0.478, "num_input_tokens_seen": 161611776, "step": 132885 }, { "epoch": 16.650795639644155, "grad_norm": 9.009468078613281, "learning_rate": 8.302480440030897e-07, "loss": 0.5681, "num_input_tokens_seen": 161617920, "step": 132890 }, { "epoch": 16.651422127552937, "grad_norm": 10.386971473693848, "learning_rate": 8.299463706838573e-07, "loss": 0.4267, "num_input_tokens_seen": 161623936, "step": 132895 }, { "epoch": 16.652048615461723, "grad_norm": 3.193924903869629, "learning_rate": 8.296447472211211e-07, "loss": 0.4554, "num_input_tokens_seen": 161630048, "step": 132900 }, { "epoch": 16.652675103370505, "grad_norm": 7.24802827835083, "learning_rate": 8.293431736184892e-07, "loss": 0.5005, "num_input_tokens_seen": 161636224, "step": 132905 }, { "epoch": 16.653301591279288, "grad_norm": 15.410131454467773, "learning_rate": 8.290416498795651e-07, "loss": 0.4878, "num_input_tokens_seen": 161642240, "step": 132910 }, { "epoch": 16.65392807918807, "grad_norm": 12.500218391418457, "learning_rate": 8.287401760079561e-07, "loss": 0.4506, "num_input_tokens_seen": 161648288, "step": 132915 }, { "epoch": 16.654554567096856, "grad_norm": 5.448307991027832, "learning_rate": 8.28438752007264e-07, "loss": 0.4612, "num_input_tokens_seen": 161654432, "step": 132920 }, { "epoch": 16.655181055005638, "grad_norm": 3.7742929458618164, "learning_rate": 8.281373778810942e-07, "loss": 0.5028, "num_input_tokens_seen": 161660128, "step": 132925 }, { "epoch": 16.65580754291442, "grad_norm": 4.154561996459961, "learning_rate": 8.278360536330505e-07, "loss": 0.4853, "num_input_tokens_seen": 161665888, "step": 132930 }, { "epoch": 16.656434030823206, "grad_norm": 18.789615631103516, "learning_rate": 8.275347792667332e-07, "loss": 0.4557, "num_input_tokens_seen": 161671744, "step": 132935 }, { "epoch": 16.65706051873199, "grad_norm": 4.283198356628418, "learning_rate": 8.272335547857469e-07, "loss": 0.464, "num_input_tokens_seen": 161678048, "step": 132940 }, { "epoch": 16.65768700664077, "grad_norm": 3.3700194358825684, "learning_rate": 8.269323801936902e-07, "loss": 0.4223, "num_input_tokens_seen": 161684480, "step": 132945 }, { "epoch": 16.658313494549557, "grad_norm": 4.691250324249268, "learning_rate": 8.266312554941669e-07, "loss": 0.4701, "num_input_tokens_seen": 161690080, "step": 132950 }, { "epoch": 16.65893998245834, "grad_norm": 5.042318820953369, "learning_rate": 8.263301806907748e-07, "loss": 0.465, "num_input_tokens_seen": 161696096, "step": 132955 }, { "epoch": 16.65956647036712, "grad_norm": 2.8878772258758545, "learning_rate": 8.260291557871142e-07, "loss": 0.5091, "num_input_tokens_seen": 161702304, "step": 132960 }, { "epoch": 16.660192958275907, "grad_norm": 17.21424102783203, "learning_rate": 8.25728180786784e-07, "loss": 0.5597, "num_input_tokens_seen": 161708544, "step": 132965 }, { "epoch": 16.66081944618469, "grad_norm": 3.571686029434204, "learning_rate": 8.254272556933846e-07, "loss": 0.4558, "num_input_tokens_seen": 161714560, "step": 132970 }, { "epoch": 16.66144593409347, "grad_norm": 4.276949882507324, "learning_rate": 8.251263805105108e-07, "loss": 0.4647, "num_input_tokens_seen": 161721056, "step": 132975 }, { "epoch": 16.662072422002254, "grad_norm": 4.193318843841553, "learning_rate": 8.248255552417622e-07, "loss": 0.4831, "num_input_tokens_seen": 161726752, "step": 132980 }, { "epoch": 16.66269890991104, "grad_norm": 4.570187568664551, "learning_rate": 8.245247798907335e-07, "loss": 0.4355, "num_input_tokens_seen": 161733088, "step": 132985 }, { "epoch": 16.663325397819822, "grad_norm": 13.597577095031738, "learning_rate": 8.242240544610214e-07, "loss": 0.5085, "num_input_tokens_seen": 161739392, "step": 132990 }, { "epoch": 16.663951885728604, "grad_norm": 23.397920608520508, "learning_rate": 8.239233789562229e-07, "loss": 0.5105, "num_input_tokens_seen": 161745728, "step": 132995 }, { "epoch": 16.66457837363739, "grad_norm": 8.049261093139648, "learning_rate": 8.236227533799302e-07, "loss": 0.4492, "num_input_tokens_seen": 161751904, "step": 133000 }, { "epoch": 16.665204861546172, "grad_norm": 10.422745704650879, "learning_rate": 8.233221777357397e-07, "loss": 0.4882, "num_input_tokens_seen": 161757952, "step": 133005 }, { "epoch": 16.665831349454955, "grad_norm": 3.6568830013275146, "learning_rate": 8.230216520272433e-07, "loss": 0.514, "num_input_tokens_seen": 161764000, "step": 133010 }, { "epoch": 16.66645783736374, "grad_norm": 4.11447286605835, "learning_rate": 8.227211762580356e-07, "loss": 0.4262, "num_input_tokens_seen": 161770112, "step": 133015 }, { "epoch": 16.667084325272523, "grad_norm": 7.621695041656494, "learning_rate": 8.224207504317078e-07, "loss": 0.5139, "num_input_tokens_seen": 161776032, "step": 133020 }, { "epoch": 16.667710813181305, "grad_norm": 4.609335899353027, "learning_rate": 8.22120374551853e-07, "loss": 0.4235, "num_input_tokens_seen": 161782240, "step": 133025 }, { "epoch": 16.668337301090087, "grad_norm": 9.147455215454102, "learning_rate": 8.218200486220607e-07, "loss": 0.4763, "num_input_tokens_seen": 161788480, "step": 133030 }, { "epoch": 16.668963788998873, "grad_norm": 3.9326906204223633, "learning_rate": 8.215197726459234e-07, "loss": 0.4234, "num_input_tokens_seen": 161793984, "step": 133035 }, { "epoch": 16.669590276907655, "grad_norm": 5.168720722198486, "learning_rate": 8.21219546627029e-07, "loss": 0.489, "num_input_tokens_seen": 161799904, "step": 133040 }, { "epoch": 16.670216764816438, "grad_norm": 4.262362957000732, "learning_rate": 8.209193705689699e-07, "loss": 0.4765, "num_input_tokens_seen": 161806144, "step": 133045 }, { "epoch": 16.670843252725223, "grad_norm": 5.066708564758301, "learning_rate": 8.206192444753319e-07, "loss": 0.4694, "num_input_tokens_seen": 161812320, "step": 133050 }, { "epoch": 16.671469740634006, "grad_norm": 3.346343755722046, "learning_rate": 8.203191683497047e-07, "loss": 0.4671, "num_input_tokens_seen": 161817728, "step": 133055 }, { "epoch": 16.672096228542788, "grad_norm": 25.854202270507812, "learning_rate": 8.200191421956772e-07, "loss": 0.4687, "num_input_tokens_seen": 161823264, "step": 133060 }, { "epoch": 16.672722716451574, "grad_norm": 8.305914878845215, "learning_rate": 8.197191660168335e-07, "loss": 0.4287, "num_input_tokens_seen": 161829088, "step": 133065 }, { "epoch": 16.673349204360356, "grad_norm": 3.7886157035827637, "learning_rate": 8.194192398167622e-07, "loss": 0.4712, "num_input_tokens_seen": 161834496, "step": 133070 }, { "epoch": 16.67397569226914, "grad_norm": 28.79918098449707, "learning_rate": 8.191193635990486e-07, "loss": 0.5057, "num_input_tokens_seen": 161840800, "step": 133075 }, { "epoch": 16.674602180177924, "grad_norm": 16.01481819152832, "learning_rate": 8.188195373672797e-07, "loss": 0.441, "num_input_tokens_seen": 161846912, "step": 133080 }, { "epoch": 16.675228668086707, "grad_norm": 5.013172626495361, "learning_rate": 8.185197611250368e-07, "loss": 0.4559, "num_input_tokens_seen": 161852544, "step": 133085 }, { "epoch": 16.67585515599549, "grad_norm": 10.840858459472656, "learning_rate": 8.182200348759072e-07, "loss": 0.4441, "num_input_tokens_seen": 161858880, "step": 133090 }, { "epoch": 16.67648164390427, "grad_norm": 4.691988468170166, "learning_rate": 8.179203586234718e-07, "loss": 0.4585, "num_input_tokens_seen": 161865120, "step": 133095 }, { "epoch": 16.677108131813057, "grad_norm": 9.942866325378418, "learning_rate": 8.176207323713159e-07, "loss": 0.48, "num_input_tokens_seen": 161871008, "step": 133100 }, { "epoch": 16.67773461972184, "grad_norm": 12.116169929504395, "learning_rate": 8.173211561230188e-07, "loss": 0.454, "num_input_tokens_seen": 161876992, "step": 133105 }, { "epoch": 16.67836110763062, "grad_norm": 10.249956130981445, "learning_rate": 8.170216298821642e-07, "loss": 0.4353, "num_input_tokens_seen": 161883392, "step": 133110 }, { "epoch": 16.678987595539407, "grad_norm": 3.768807888031006, "learning_rate": 8.16722153652334e-07, "loss": 0.4573, "num_input_tokens_seen": 161889600, "step": 133115 }, { "epoch": 16.67961408344819, "grad_norm": 10.008859634399414, "learning_rate": 8.164227274371067e-07, "loss": 0.4952, "num_input_tokens_seen": 161895648, "step": 133120 }, { "epoch": 16.680240571356972, "grad_norm": 27.645883560180664, "learning_rate": 8.161233512400641e-07, "loss": 0.4833, "num_input_tokens_seen": 161901984, "step": 133125 }, { "epoch": 16.680867059265758, "grad_norm": 11.931628227233887, "learning_rate": 8.158240250647831e-07, "loss": 0.5353, "num_input_tokens_seen": 161908320, "step": 133130 }, { "epoch": 16.68149354717454, "grad_norm": 3.6102466583251953, "learning_rate": 8.155247489148449e-07, "loss": 0.3904, "num_input_tokens_seen": 161914624, "step": 133135 }, { "epoch": 16.682120035083322, "grad_norm": 2.6697306632995605, "learning_rate": 8.152255227938255e-07, "loss": 0.428, "num_input_tokens_seen": 161920704, "step": 133140 }, { "epoch": 16.682746522992105, "grad_norm": 2.9958248138427734, "learning_rate": 8.149263467053042e-07, "loss": 0.4443, "num_input_tokens_seen": 161926688, "step": 133145 }, { "epoch": 16.68337301090089, "grad_norm": 8.470916748046875, "learning_rate": 8.14627220652856e-07, "loss": 0.4816, "num_input_tokens_seen": 161932512, "step": 133150 }, { "epoch": 16.683999498809673, "grad_norm": 2.6294329166412354, "learning_rate": 8.143281446400592e-07, "loss": 0.4947, "num_input_tokens_seen": 161938688, "step": 133155 }, { "epoch": 16.684625986718455, "grad_norm": 2.539799690246582, "learning_rate": 8.140291186704879e-07, "loss": 0.5605, "num_input_tokens_seen": 161944768, "step": 133160 }, { "epoch": 16.68525247462724, "grad_norm": 5.792645454406738, "learning_rate": 8.137301427477173e-07, "loss": 0.4425, "num_input_tokens_seen": 161950976, "step": 133165 }, { "epoch": 16.685878962536023, "grad_norm": 6.2406697273254395, "learning_rate": 8.134312168753228e-07, "loss": 0.5179, "num_input_tokens_seen": 161957024, "step": 133170 }, { "epoch": 16.686505450444805, "grad_norm": 5.980640888214111, "learning_rate": 8.131323410568792e-07, "loss": 0.4897, "num_input_tokens_seen": 161963264, "step": 133175 }, { "epoch": 16.68713193835359, "grad_norm": 8.457079887390137, "learning_rate": 8.128335152959571e-07, "loss": 0.4253, "num_input_tokens_seen": 161968800, "step": 133180 }, { "epoch": 16.687758426262373, "grad_norm": 17.347627639770508, "learning_rate": 8.125347395961309e-07, "loss": 0.4657, "num_input_tokens_seen": 161975040, "step": 133185 }, { "epoch": 16.688384914171156, "grad_norm": 5.830145359039307, "learning_rate": 8.122360139609736e-07, "loss": 0.4602, "num_input_tokens_seen": 161981024, "step": 133190 }, { "epoch": 16.68901140207994, "grad_norm": 3.911869525909424, "learning_rate": 8.119373383940549e-07, "loss": 0.421, "num_input_tokens_seen": 161987072, "step": 133195 }, { "epoch": 16.689637889988724, "grad_norm": 3.5227298736572266, "learning_rate": 8.116387128989472e-07, "loss": 0.431, "num_input_tokens_seen": 161993344, "step": 133200 }, { "epoch": 16.690264377897506, "grad_norm": 6.221925258636475, "learning_rate": 8.11340137479219e-07, "loss": 0.4334, "num_input_tokens_seen": 161999360, "step": 133205 }, { "epoch": 16.69089086580629, "grad_norm": 6.883822917938232, "learning_rate": 8.110416121384424e-07, "loss": 0.4317, "num_input_tokens_seen": 162005376, "step": 133210 }, { "epoch": 16.691517353715074, "grad_norm": 4.638497352600098, "learning_rate": 8.107431368801833e-07, "loss": 0.4148, "num_input_tokens_seen": 162011552, "step": 133215 }, { "epoch": 16.692143841623857, "grad_norm": 4.087562561035156, "learning_rate": 8.104447117080139e-07, "loss": 0.4219, "num_input_tokens_seen": 162017344, "step": 133220 }, { "epoch": 16.69277032953264, "grad_norm": 4.279877662658691, "learning_rate": 8.101463366254991e-07, "loss": 0.4321, "num_input_tokens_seen": 162023744, "step": 133225 }, { "epoch": 16.693396817441425, "grad_norm": 3.2344751358032227, "learning_rate": 8.098480116362089e-07, "loss": 0.44, "num_input_tokens_seen": 162030048, "step": 133230 }, { "epoch": 16.694023305350207, "grad_norm": 2.1126902103424072, "learning_rate": 8.095497367437071e-07, "loss": 0.4051, "num_input_tokens_seen": 162036192, "step": 133235 }, { "epoch": 16.69464979325899, "grad_norm": 10.781035423278809, "learning_rate": 8.092515119515615e-07, "loss": 0.4701, "num_input_tokens_seen": 162042624, "step": 133240 }, { "epoch": 16.695276281167775, "grad_norm": 8.740303993225098, "learning_rate": 8.089533372633384e-07, "loss": 0.4575, "num_input_tokens_seen": 162048544, "step": 133245 }, { "epoch": 16.695902769076557, "grad_norm": 14.392263412475586, "learning_rate": 8.086552126826008e-07, "loss": 0.5772, "num_input_tokens_seen": 162054464, "step": 133250 }, { "epoch": 16.69652925698534, "grad_norm": 2.9341914653778076, "learning_rate": 8.083571382129157e-07, "loss": 0.4513, "num_input_tokens_seen": 162060160, "step": 133255 }, { "epoch": 16.697155744894122, "grad_norm": 3.360715627670288, "learning_rate": 8.080591138578431e-07, "loss": 0.412, "num_input_tokens_seen": 162066368, "step": 133260 }, { "epoch": 16.697782232802908, "grad_norm": 3.823613405227661, "learning_rate": 8.077611396209501e-07, "loss": 0.3887, "num_input_tokens_seen": 162072736, "step": 133265 }, { "epoch": 16.69840872071169, "grad_norm": 2.98005747795105, "learning_rate": 8.074632155057954e-07, "loss": 0.4797, "num_input_tokens_seen": 162078432, "step": 133270 }, { "epoch": 16.699035208620472, "grad_norm": 4.601221561431885, "learning_rate": 8.071653415159436e-07, "loss": 0.4794, "num_input_tokens_seen": 162084544, "step": 133275 }, { "epoch": 16.699661696529258, "grad_norm": 4.231015205383301, "learning_rate": 8.068675176549545e-07, "loss": 0.47, "num_input_tokens_seen": 162090560, "step": 133280 }, { "epoch": 16.70028818443804, "grad_norm": 3.3676319122314453, "learning_rate": 8.065697439263914e-07, "loss": 0.4819, "num_input_tokens_seen": 162096736, "step": 133285 }, { "epoch": 16.700914672346823, "grad_norm": 11.543792724609375, "learning_rate": 8.062720203338115e-07, "loss": 0.5346, "num_input_tokens_seen": 162102720, "step": 133290 }, { "epoch": 16.70154116025561, "grad_norm": 4.837297439575195, "learning_rate": 8.059743468807751e-07, "loss": 0.4221, "num_input_tokens_seen": 162108608, "step": 133295 }, { "epoch": 16.70216764816439, "grad_norm": 12.069391250610352, "learning_rate": 8.05676723570843e-07, "loss": 0.4445, "num_input_tokens_seen": 162114272, "step": 133300 }, { "epoch": 16.702794136073173, "grad_norm": 4.241367340087891, "learning_rate": 8.053791504075709e-07, "loss": 0.4295, "num_input_tokens_seen": 162120480, "step": 133305 }, { "epoch": 16.703420623981955, "grad_norm": 6.995175838470459, "learning_rate": 8.050816273945184e-07, "loss": 0.4627, "num_input_tokens_seen": 162126624, "step": 133310 }, { "epoch": 16.70404711189074, "grad_norm": 5.718862533569336, "learning_rate": 8.047841545352414e-07, "loss": 0.4504, "num_input_tokens_seen": 162132960, "step": 133315 }, { "epoch": 16.704673599799523, "grad_norm": 2.3987154960632324, "learning_rate": 8.044867318332977e-07, "loss": 0.4323, "num_input_tokens_seen": 162139264, "step": 133320 }, { "epoch": 16.705300087708306, "grad_norm": 6.510611057281494, "learning_rate": 8.041893592922417e-07, "loss": 0.456, "num_input_tokens_seen": 162145280, "step": 133325 }, { "epoch": 16.70592657561709, "grad_norm": 13.762664794921875, "learning_rate": 8.038920369156306e-07, "loss": 0.4877, "num_input_tokens_seen": 162151456, "step": 133330 }, { "epoch": 16.706553063525874, "grad_norm": 2.96219801902771, "learning_rate": 8.035947647070164e-07, "loss": 0.4509, "num_input_tokens_seen": 162157664, "step": 133335 }, { "epoch": 16.707179551434656, "grad_norm": 5.805216312408447, "learning_rate": 8.032975426699569e-07, "loss": 0.4632, "num_input_tokens_seen": 162163616, "step": 133340 }, { "epoch": 16.707806039343442, "grad_norm": 5.92905330657959, "learning_rate": 8.030003708080019e-07, "loss": 0.4422, "num_input_tokens_seen": 162169760, "step": 133345 }, { "epoch": 16.708432527252224, "grad_norm": 4.935084819793701, "learning_rate": 8.027032491247078e-07, "loss": 0.4458, "num_input_tokens_seen": 162175936, "step": 133350 }, { "epoch": 16.709059015161007, "grad_norm": 4.073440074920654, "learning_rate": 8.024061776236236e-07, "loss": 0.4497, "num_input_tokens_seen": 162181664, "step": 133355 }, { "epoch": 16.709685503069792, "grad_norm": 7.347398281097412, "learning_rate": 8.021091563083028e-07, "loss": 0.4994, "num_input_tokens_seen": 162187904, "step": 133360 }, { "epoch": 16.710311990978575, "grad_norm": 3.244248867034912, "learning_rate": 8.018121851822979e-07, "loss": 0.4619, "num_input_tokens_seen": 162194176, "step": 133365 }, { "epoch": 16.710938478887357, "grad_norm": 4.0637335777282715, "learning_rate": 8.015152642491564e-07, "loss": 0.4237, "num_input_tokens_seen": 162200224, "step": 133370 }, { "epoch": 16.71156496679614, "grad_norm": 4.474462985992432, "learning_rate": 8.012183935124301e-07, "loss": 0.4299, "num_input_tokens_seen": 162206464, "step": 133375 }, { "epoch": 16.712191454704925, "grad_norm": 2.9206767082214355, "learning_rate": 8.009215729756692e-07, "loss": 0.4157, "num_input_tokens_seen": 162212544, "step": 133380 }, { "epoch": 16.712817942613707, "grad_norm": 4.212372779846191, "learning_rate": 8.006248026424202e-07, "loss": 0.4631, "num_input_tokens_seen": 162218432, "step": 133385 }, { "epoch": 16.71344443052249, "grad_norm": 3.6773104667663574, "learning_rate": 8.003280825162319e-07, "loss": 0.4441, "num_input_tokens_seen": 162224352, "step": 133390 }, { "epoch": 16.714070918431275, "grad_norm": 3.2983381748199463, "learning_rate": 8.000314126006536e-07, "loss": 0.4771, "num_input_tokens_seen": 162230336, "step": 133395 }, { "epoch": 16.714697406340058, "grad_norm": 3.994863510131836, "learning_rate": 7.997347928992299e-07, "loss": 0.4213, "num_input_tokens_seen": 162236224, "step": 133400 }, { "epoch": 16.71532389424884, "grad_norm": 10.743456840515137, "learning_rate": 7.99438223415509e-07, "loss": 0.4757, "num_input_tokens_seen": 162242336, "step": 133405 }, { "epoch": 16.715950382157626, "grad_norm": 3.6991286277770996, "learning_rate": 7.991417041530352e-07, "loss": 0.4351, "num_input_tokens_seen": 162248544, "step": 133410 }, { "epoch": 16.716576870066408, "grad_norm": 7.076496124267578, "learning_rate": 7.988452351153547e-07, "loss": 0.4555, "num_input_tokens_seen": 162253952, "step": 133415 }, { "epoch": 16.71720335797519, "grad_norm": 14.219261169433594, "learning_rate": 7.985488163060107e-07, "loss": 0.5043, "num_input_tokens_seen": 162260128, "step": 133420 }, { "epoch": 16.717829845883976, "grad_norm": 2.727407217025757, "learning_rate": 7.98252447728548e-07, "loss": 0.4221, "num_input_tokens_seen": 162265856, "step": 133425 }, { "epoch": 16.71845633379276, "grad_norm": 3.6573898792266846, "learning_rate": 7.979561293865112e-07, "loss": 0.4785, "num_input_tokens_seen": 162271968, "step": 133430 }, { "epoch": 16.71908282170154, "grad_norm": 25.445091247558594, "learning_rate": 7.976598612834407e-07, "loss": 0.4675, "num_input_tokens_seen": 162278208, "step": 133435 }, { "epoch": 16.719709309610323, "grad_norm": 14.644625663757324, "learning_rate": 7.973636434228804e-07, "loss": 0.5267, "num_input_tokens_seen": 162284128, "step": 133440 }, { "epoch": 16.72033579751911, "grad_norm": 4.444646835327148, "learning_rate": 7.970674758083702e-07, "loss": 0.4088, "num_input_tokens_seen": 162290368, "step": 133445 }, { "epoch": 16.72096228542789, "grad_norm": 4.108583927154541, "learning_rate": 7.967713584434533e-07, "loss": 0.4218, "num_input_tokens_seen": 162296544, "step": 133450 }, { "epoch": 16.721588773336673, "grad_norm": 3.7237324714660645, "learning_rate": 7.96475291331667e-07, "loss": 0.4284, "num_input_tokens_seen": 162302720, "step": 133455 }, { "epoch": 16.72221526124546, "grad_norm": 20.672903060913086, "learning_rate": 7.961792744765545e-07, "loss": 0.5941, "num_input_tokens_seen": 162308960, "step": 133460 }, { "epoch": 16.72284174915424, "grad_norm": 17.95284652709961, "learning_rate": 7.958833078816514e-07, "loss": 0.4751, "num_input_tokens_seen": 162315168, "step": 133465 }, { "epoch": 16.723468237063024, "grad_norm": 3.3369596004486084, "learning_rate": 7.955873915504986e-07, "loss": 0.4894, "num_input_tokens_seen": 162321696, "step": 133470 }, { "epoch": 16.72409472497181, "grad_norm": 22.384685516357422, "learning_rate": 7.952915254866339e-07, "loss": 0.5217, "num_input_tokens_seen": 162327488, "step": 133475 }, { "epoch": 16.724721212880592, "grad_norm": 4.6617512702941895, "learning_rate": 7.949957096935934e-07, "loss": 0.4386, "num_input_tokens_seen": 162333728, "step": 133480 }, { "epoch": 16.725347700789374, "grad_norm": 2.4402997493743896, "learning_rate": 7.946999441749148e-07, "loss": 0.4467, "num_input_tokens_seen": 162340064, "step": 133485 }, { "epoch": 16.725974188698157, "grad_norm": 8.650611877441406, "learning_rate": 7.944042289341336e-07, "loss": 0.4578, "num_input_tokens_seen": 162346016, "step": 133490 }, { "epoch": 16.726600676606942, "grad_norm": 3.693542957305908, "learning_rate": 7.94108563974787e-07, "loss": 0.4341, "num_input_tokens_seen": 162352064, "step": 133495 }, { "epoch": 16.727227164515725, "grad_norm": 3.0123183727264404, "learning_rate": 7.938129493004076e-07, "loss": 0.4441, "num_input_tokens_seen": 162358176, "step": 133500 }, { "epoch": 16.727853652424507, "grad_norm": 17.676189422607422, "learning_rate": 7.935173849145317e-07, "loss": 0.4888, "num_input_tokens_seen": 162364224, "step": 133505 }, { "epoch": 16.728480140333293, "grad_norm": 3.6697099208831787, "learning_rate": 7.932218708206907e-07, "loss": 0.41, "num_input_tokens_seen": 162370592, "step": 133510 }, { "epoch": 16.729106628242075, "grad_norm": 10.846648216247559, "learning_rate": 7.929264070224208e-07, "loss": 0.5235, "num_input_tokens_seen": 162376192, "step": 133515 }, { "epoch": 16.729733116150857, "grad_norm": 4.013625621795654, "learning_rate": 7.926309935232512e-07, "loss": 0.4298, "num_input_tokens_seen": 162382272, "step": 133520 }, { "epoch": 16.730359604059643, "grad_norm": 9.12552547454834, "learning_rate": 7.923356303267171e-07, "loss": 0.4299, "num_input_tokens_seen": 162388288, "step": 133525 }, { "epoch": 16.730986091968425, "grad_norm": 15.609253883361816, "learning_rate": 7.920403174363467e-07, "loss": 0.4507, "num_input_tokens_seen": 162394368, "step": 133530 }, { "epoch": 16.731612579877208, "grad_norm": 5.455974578857422, "learning_rate": 7.917450548556732e-07, "loss": 0.4581, "num_input_tokens_seen": 162400320, "step": 133535 }, { "epoch": 16.73223906778599, "grad_norm": 3.4531402587890625, "learning_rate": 7.914498425882245e-07, "loss": 0.4319, "num_input_tokens_seen": 162406592, "step": 133540 }, { "epoch": 16.732865555694776, "grad_norm": 3.861961603164673, "learning_rate": 7.911546806375314e-07, "loss": 0.405, "num_input_tokens_seen": 162412544, "step": 133545 }, { "epoch": 16.733492043603558, "grad_norm": 3.029754400253296, "learning_rate": 7.908595690071235e-07, "loss": 0.4388, "num_input_tokens_seen": 162418720, "step": 133550 }, { "epoch": 16.73411853151234, "grad_norm": 5.517910957336426, "learning_rate": 7.90564507700528e-07, "loss": 0.4425, "num_input_tokens_seen": 162424672, "step": 133555 }, { "epoch": 16.734745019421126, "grad_norm": 5.7717366218566895, "learning_rate": 7.902694967212732e-07, "loss": 0.4374, "num_input_tokens_seen": 162430208, "step": 133560 }, { "epoch": 16.73537150732991, "grad_norm": 3.684704542160034, "learning_rate": 7.899745360728855e-07, "loss": 0.4495, "num_input_tokens_seen": 162436288, "step": 133565 }, { "epoch": 16.73599799523869, "grad_norm": 6.1879472732543945, "learning_rate": 7.896796257588923e-07, "loss": 0.4838, "num_input_tokens_seen": 162442400, "step": 133570 }, { "epoch": 16.736624483147477, "grad_norm": 3.970471143722534, "learning_rate": 7.89384765782818e-07, "loss": 0.4886, "num_input_tokens_seen": 162448640, "step": 133575 }, { "epoch": 16.73725097105626, "grad_norm": 6.625153064727783, "learning_rate": 7.89089956148189e-07, "loss": 0.4478, "num_input_tokens_seen": 162454976, "step": 133580 }, { "epoch": 16.73787745896504, "grad_norm": 3.693247079849243, "learning_rate": 7.887951968585301e-07, "loss": 0.4485, "num_input_tokens_seen": 162461120, "step": 133585 }, { "epoch": 16.738503946873827, "grad_norm": 10.178775787353516, "learning_rate": 7.885004879173663e-07, "loss": 0.4802, "num_input_tokens_seen": 162466944, "step": 133590 }, { "epoch": 16.73913043478261, "grad_norm": 6.242011070251465, "learning_rate": 7.882058293282191e-07, "loss": 0.4936, "num_input_tokens_seen": 162473248, "step": 133595 }, { "epoch": 16.73975692269139, "grad_norm": 2.216032028198242, "learning_rate": 7.879112210946127e-07, "loss": 0.4359, "num_input_tokens_seen": 162479424, "step": 133600 }, { "epoch": 16.740383410600174, "grad_norm": 6.985373497009277, "learning_rate": 7.876166632200688e-07, "loss": 0.4674, "num_input_tokens_seen": 162485632, "step": 133605 }, { "epoch": 16.74100989850896, "grad_norm": 6.015120029449463, "learning_rate": 7.873221557081084e-07, "loss": 0.4815, "num_input_tokens_seen": 162491488, "step": 133610 }, { "epoch": 16.741636386417742, "grad_norm": 8.562731742858887, "learning_rate": 7.870276985622549e-07, "loss": 0.4319, "num_input_tokens_seen": 162497408, "step": 133615 }, { "epoch": 16.742262874326524, "grad_norm": 4.592680931091309, "learning_rate": 7.867332917860265e-07, "loss": 0.473, "num_input_tokens_seen": 162503488, "step": 133620 }, { "epoch": 16.74288936223531, "grad_norm": 2.923295497894287, "learning_rate": 7.864389353829443e-07, "loss": 0.4638, "num_input_tokens_seen": 162509632, "step": 133625 }, { "epoch": 16.743515850144092, "grad_norm": 4.074931621551514, "learning_rate": 7.861446293565267e-07, "loss": 0.473, "num_input_tokens_seen": 162516192, "step": 133630 }, { "epoch": 16.744142338052875, "grad_norm": 5.8719706535339355, "learning_rate": 7.85850373710294e-07, "loss": 0.4632, "num_input_tokens_seen": 162522560, "step": 133635 }, { "epoch": 16.74476882596166, "grad_norm": 15.279402732849121, "learning_rate": 7.855561684477619e-07, "loss": 0.5134, "num_input_tokens_seen": 162528960, "step": 133640 }, { "epoch": 16.745395313870443, "grad_norm": 2.6151115894317627, "learning_rate": 7.852620135724504e-07, "loss": 0.577, "num_input_tokens_seen": 162534592, "step": 133645 }, { "epoch": 16.746021801779225, "grad_norm": 4.778472423553467, "learning_rate": 7.849679090878737e-07, "loss": 0.5014, "num_input_tokens_seen": 162540832, "step": 133650 }, { "epoch": 16.746648289688007, "grad_norm": 3.335484743118286, "learning_rate": 7.846738549975508e-07, "loss": 0.4008, "num_input_tokens_seen": 162546400, "step": 133655 }, { "epoch": 16.747274777596793, "grad_norm": 7.859398365020752, "learning_rate": 7.843798513049944e-07, "loss": 0.5347, "num_input_tokens_seen": 162552224, "step": 133660 }, { "epoch": 16.747901265505575, "grad_norm": 3.331812858581543, "learning_rate": 7.840858980137217e-07, "loss": 0.4238, "num_input_tokens_seen": 162557728, "step": 133665 }, { "epoch": 16.748527753414358, "grad_norm": 3.2946908473968506, "learning_rate": 7.837919951272471e-07, "loss": 0.4377, "num_input_tokens_seen": 162563872, "step": 133670 }, { "epoch": 16.749154241323144, "grad_norm": 9.568293571472168, "learning_rate": 7.834981426490834e-07, "loss": 0.4816, "num_input_tokens_seen": 162570304, "step": 133675 }, { "epoch": 16.749780729231926, "grad_norm": 7.681570053100586, "learning_rate": 7.832043405827455e-07, "loss": 0.4108, "num_input_tokens_seen": 162576576, "step": 133680 }, { "epoch": 16.750407217140708, "grad_norm": 4.541157245635986, "learning_rate": 7.82910588931744e-07, "loss": 0.4649, "num_input_tokens_seen": 162582560, "step": 133685 }, { "epoch": 16.751033705049494, "grad_norm": 4.067909240722656, "learning_rate": 7.826168876995921e-07, "loss": 0.4964, "num_input_tokens_seen": 162588544, "step": 133690 }, { "epoch": 16.751660192958276, "grad_norm": 6.718236923217773, "learning_rate": 7.823232368898004e-07, "loss": 0.4223, "num_input_tokens_seen": 162594816, "step": 133695 }, { "epoch": 16.75228668086706, "grad_norm": 4.214921951293945, "learning_rate": 7.820296365058822e-07, "loss": 0.5326, "num_input_tokens_seen": 162600992, "step": 133700 }, { "epoch": 16.75291316877584, "grad_norm": 25.243236541748047, "learning_rate": 7.817360865513445e-07, "loss": 0.5209, "num_input_tokens_seen": 162607136, "step": 133705 }, { "epoch": 16.753539656684627, "grad_norm": 5.123386383056641, "learning_rate": 7.814425870296999e-07, "loss": 0.5286, "num_input_tokens_seen": 162613504, "step": 133710 }, { "epoch": 16.75416614459341, "grad_norm": 4.701345920562744, "learning_rate": 7.811491379444541e-07, "loss": 0.4443, "num_input_tokens_seen": 162619840, "step": 133715 }, { "epoch": 16.75479263250219, "grad_norm": 3.520704507827759, "learning_rate": 7.80855739299119e-07, "loss": 0.498, "num_input_tokens_seen": 162626144, "step": 133720 }, { "epoch": 16.755419120410977, "grad_norm": 23.797433853149414, "learning_rate": 7.805623910971999e-07, "loss": 0.4993, "num_input_tokens_seen": 162632352, "step": 133725 }, { "epoch": 16.75604560831976, "grad_norm": 5.8536858558654785, "learning_rate": 7.802690933422047e-07, "loss": 0.4587, "num_input_tokens_seen": 162638272, "step": 133730 }, { "epoch": 16.75667209622854, "grad_norm": 8.118156433105469, "learning_rate": 7.799758460376411e-07, "loss": 0.4573, "num_input_tokens_seen": 162643712, "step": 133735 }, { "epoch": 16.757298584137327, "grad_norm": 3.852051019668579, "learning_rate": 7.796826491870135e-07, "loss": 0.4699, "num_input_tokens_seen": 162649824, "step": 133740 }, { "epoch": 16.75792507204611, "grad_norm": 19.476022720336914, "learning_rate": 7.793895027938286e-07, "loss": 0.5297, "num_input_tokens_seen": 162655424, "step": 133745 }, { "epoch": 16.758551559954892, "grad_norm": 4.401954650878906, "learning_rate": 7.790964068615902e-07, "loss": 0.5012, "num_input_tokens_seen": 162661344, "step": 133750 }, { "epoch": 16.759178047863678, "grad_norm": 2.9685451984405518, "learning_rate": 7.78803361393804e-07, "loss": 0.4312, "num_input_tokens_seen": 162667264, "step": 133755 }, { "epoch": 16.75980453577246, "grad_norm": 9.148530960083008, "learning_rate": 7.785103663939714e-07, "loss": 0.4418, "num_input_tokens_seen": 162673376, "step": 133760 }, { "epoch": 16.760431023681242, "grad_norm": 3.083216905593872, "learning_rate": 7.78217421865598e-07, "loss": 0.4132, "num_input_tokens_seen": 162679744, "step": 133765 }, { "epoch": 16.761057511590025, "grad_norm": 4.4136576652526855, "learning_rate": 7.779245278121833e-07, "loss": 0.4414, "num_input_tokens_seen": 162685888, "step": 133770 }, { "epoch": 16.76168399949881, "grad_norm": 6.999688625335693, "learning_rate": 7.776316842372317e-07, "loss": 0.4271, "num_input_tokens_seen": 162691936, "step": 133775 }, { "epoch": 16.762310487407593, "grad_norm": 4.408888816833496, "learning_rate": 7.773388911442425e-07, "loss": 0.42, "num_input_tokens_seen": 162697952, "step": 133780 }, { "epoch": 16.762936975316375, "grad_norm": 3.484983205795288, "learning_rate": 7.770461485367164e-07, "loss": 0.4604, "num_input_tokens_seen": 162704160, "step": 133785 }, { "epoch": 16.76356346322516, "grad_norm": 17.348173141479492, "learning_rate": 7.767534564181545e-07, "loss": 0.5182, "num_input_tokens_seen": 162710464, "step": 133790 }, { "epoch": 16.764189951133943, "grad_norm": 2.8767032623291016, "learning_rate": 7.764608147920572e-07, "loss": 0.4596, "num_input_tokens_seen": 162716704, "step": 133795 }, { "epoch": 16.764816439042725, "grad_norm": 4.309902191162109, "learning_rate": 7.761682236619206e-07, "loss": 0.4505, "num_input_tokens_seen": 162723328, "step": 133800 }, { "epoch": 16.76544292695151, "grad_norm": 2.6845531463623047, "learning_rate": 7.758756830312436e-07, "loss": 0.4269, "num_input_tokens_seen": 162729312, "step": 133805 }, { "epoch": 16.766069414860294, "grad_norm": 4.084418773651123, "learning_rate": 7.755831929035256e-07, "loss": 0.5001, "num_input_tokens_seen": 162735296, "step": 133810 }, { "epoch": 16.766695902769076, "grad_norm": 22.400693893432617, "learning_rate": 7.752907532822613e-07, "loss": 0.574, "num_input_tokens_seen": 162740896, "step": 133815 }, { "epoch": 16.76732239067786, "grad_norm": 17.98805046081543, "learning_rate": 7.749983641709491e-07, "loss": 0.4736, "num_input_tokens_seen": 162746496, "step": 133820 }, { "epoch": 16.767948878586644, "grad_norm": 3.7152607440948486, "learning_rate": 7.747060255730821e-07, "loss": 0.43, "num_input_tokens_seen": 162752288, "step": 133825 }, { "epoch": 16.768575366495426, "grad_norm": 2.7979841232299805, "learning_rate": 7.744137374921584e-07, "loss": 0.4073, "num_input_tokens_seen": 162758592, "step": 133830 }, { "epoch": 16.76920185440421, "grad_norm": 4.489095211029053, "learning_rate": 7.741214999316698e-07, "loss": 0.5001, "num_input_tokens_seen": 162764832, "step": 133835 }, { "epoch": 16.769828342312994, "grad_norm": 3.1944007873535156, "learning_rate": 7.73829312895113e-07, "loss": 0.4451, "num_input_tokens_seen": 162770432, "step": 133840 }, { "epoch": 16.770454830221777, "grad_norm": 10.916169166564941, "learning_rate": 7.735371763859789e-07, "loss": 0.423, "num_input_tokens_seen": 162776544, "step": 133845 }, { "epoch": 16.77108131813056, "grad_norm": 2.985182762145996, "learning_rate": 7.73245090407762e-07, "loss": 0.4246, "num_input_tokens_seen": 162782528, "step": 133850 }, { "epoch": 16.771707806039345, "grad_norm": 8.155797004699707, "learning_rate": 7.72953054963953e-07, "loss": 0.4285, "num_input_tokens_seen": 162788608, "step": 133855 }, { "epoch": 16.772334293948127, "grad_norm": 3.844329595565796, "learning_rate": 7.726610700580439e-07, "loss": 0.4079, "num_input_tokens_seen": 162794560, "step": 133860 }, { "epoch": 16.77296078185691, "grad_norm": 9.665783882141113, "learning_rate": 7.72369135693527e-07, "loss": 0.4152, "num_input_tokens_seen": 162800640, "step": 133865 }, { "epoch": 16.773587269765695, "grad_norm": 2.9794154167175293, "learning_rate": 7.720772518738906e-07, "loss": 0.5276, "num_input_tokens_seen": 162806848, "step": 133870 }, { "epoch": 16.774213757674477, "grad_norm": 4.388887882232666, "learning_rate": 7.717854186026264e-07, "loss": 0.4302, "num_input_tokens_seen": 162812512, "step": 133875 }, { "epoch": 16.77484024558326, "grad_norm": 14.455911636352539, "learning_rate": 7.714936358832209e-07, "loss": 0.4523, "num_input_tokens_seen": 162817888, "step": 133880 }, { "epoch": 16.775466733492042, "grad_norm": 6.661158084869385, "learning_rate": 7.712019037191643e-07, "loss": 0.4259, "num_input_tokens_seen": 162824224, "step": 133885 }, { "epoch": 16.776093221400828, "grad_norm": 7.183045387268066, "learning_rate": 7.709102221139453e-07, "loss": 0.4208, "num_input_tokens_seen": 162830304, "step": 133890 }, { "epoch": 16.77671970930961, "grad_norm": 2.928687334060669, "learning_rate": 7.706185910710495e-07, "loss": 0.4421, "num_input_tokens_seen": 162836352, "step": 133895 }, { "epoch": 16.777346197218392, "grad_norm": 2.6040563583374023, "learning_rate": 7.703270105939636e-07, "loss": 0.4651, "num_input_tokens_seen": 162842208, "step": 133900 }, { "epoch": 16.777972685127178, "grad_norm": 3.5324137210845947, "learning_rate": 7.700354806861765e-07, "loss": 0.4721, "num_input_tokens_seen": 162848000, "step": 133905 }, { "epoch": 16.77859917303596, "grad_norm": 7.0295939445495605, "learning_rate": 7.697440013511698e-07, "loss": 0.4591, "num_input_tokens_seen": 162854400, "step": 133910 }, { "epoch": 16.779225660944743, "grad_norm": 5.146313190460205, "learning_rate": 7.694525725924301e-07, "loss": 0.473, "num_input_tokens_seen": 162860224, "step": 133915 }, { "epoch": 16.77985214885353, "grad_norm": 6.903825283050537, "learning_rate": 7.691611944134431e-07, "loss": 0.5189, "num_input_tokens_seen": 162866208, "step": 133920 }, { "epoch": 16.78047863676231, "grad_norm": 6.406134605407715, "learning_rate": 7.688698668176903e-07, "loss": 0.4282, "num_input_tokens_seen": 162872640, "step": 133925 }, { "epoch": 16.781105124671093, "grad_norm": 4.227046966552734, "learning_rate": 7.685785898086562e-07, "loss": 0.4792, "num_input_tokens_seen": 162878592, "step": 133930 }, { "epoch": 16.781731612579875, "grad_norm": 11.913315773010254, "learning_rate": 7.682873633898213e-07, "loss": 0.4584, "num_input_tokens_seen": 162884480, "step": 133935 }, { "epoch": 16.78235810048866, "grad_norm": 6.131431579589844, "learning_rate": 7.679961875646707e-07, "loss": 0.4386, "num_input_tokens_seen": 162890912, "step": 133940 }, { "epoch": 16.782984588397444, "grad_norm": 3.444603204727173, "learning_rate": 7.677050623366822e-07, "loss": 0.4548, "num_input_tokens_seen": 162897024, "step": 133945 }, { "epoch": 16.783611076306226, "grad_norm": 11.383943557739258, "learning_rate": 7.674139877093389e-07, "loss": 0.4816, "num_input_tokens_seen": 162903136, "step": 133950 }, { "epoch": 16.78423756421501, "grad_norm": 11.99293041229248, "learning_rate": 7.671229636861193e-07, "loss": 0.5089, "num_input_tokens_seen": 162908928, "step": 133955 }, { "epoch": 16.784864052123794, "grad_norm": 4.221827983856201, "learning_rate": 7.668319902705046e-07, "loss": 0.4436, "num_input_tokens_seen": 162915008, "step": 133960 }, { "epoch": 16.785490540032576, "grad_norm": 3.8845503330230713, "learning_rate": 7.665410674659707e-07, "loss": 0.5359, "num_input_tokens_seen": 162921024, "step": 133965 }, { "epoch": 16.786117027941362, "grad_norm": 4.1063690185546875, "learning_rate": 7.662501952759993e-07, "loss": 0.3862, "num_input_tokens_seen": 162927232, "step": 133970 }, { "epoch": 16.786743515850144, "grad_norm": 8.039740562438965, "learning_rate": 7.659593737040655e-07, "loss": 0.4772, "num_input_tokens_seen": 162933312, "step": 133975 }, { "epoch": 16.787370003758927, "grad_norm": 3.3860843181610107, "learning_rate": 7.656686027536464e-07, "loss": 0.452, "num_input_tokens_seen": 162939360, "step": 133980 }, { "epoch": 16.787996491667712, "grad_norm": 3.7363219261169434, "learning_rate": 7.653778824282209e-07, "loss": 0.5162, "num_input_tokens_seen": 162945728, "step": 133985 }, { "epoch": 16.788622979576495, "grad_norm": 5.028650283813477, "learning_rate": 7.650872127312619e-07, "loss": 0.4383, "num_input_tokens_seen": 162951968, "step": 133990 }, { "epoch": 16.789249467485277, "grad_norm": 20.926471710205078, "learning_rate": 7.647965936662455e-07, "loss": 0.496, "num_input_tokens_seen": 162957984, "step": 133995 }, { "epoch": 16.78987595539406, "grad_norm": 3.013753652572632, "learning_rate": 7.645060252366482e-07, "loss": 0.4151, "num_input_tokens_seen": 162964064, "step": 134000 }, { "epoch": 16.790502443302845, "grad_norm": 8.216954231262207, "learning_rate": 7.642155074459407e-07, "loss": 0.6286, "num_input_tokens_seen": 162970080, "step": 134005 }, { "epoch": 16.791128931211627, "grad_norm": 4.48217248916626, "learning_rate": 7.639250402975983e-07, "loss": 0.4179, "num_input_tokens_seen": 162976160, "step": 134010 }, { "epoch": 16.79175541912041, "grad_norm": 14.456205368041992, "learning_rate": 7.636346237950948e-07, "loss": 0.5146, "num_input_tokens_seen": 162982528, "step": 134015 }, { "epoch": 16.792381907029196, "grad_norm": 2.6528215408325195, "learning_rate": 7.633442579418998e-07, "loss": 0.512, "num_input_tokens_seen": 162988192, "step": 134020 }, { "epoch": 16.793008394937978, "grad_norm": 4.116849899291992, "learning_rate": 7.630539427414874e-07, "loss": 0.5002, "num_input_tokens_seen": 162994688, "step": 134025 }, { "epoch": 16.79363488284676, "grad_norm": 8.49262809753418, "learning_rate": 7.627636781973264e-07, "loss": 0.4183, "num_input_tokens_seen": 163001216, "step": 134030 }, { "epoch": 16.794261370755546, "grad_norm": 6.082705974578857, "learning_rate": 7.62473464312889e-07, "loss": 0.4917, "num_input_tokens_seen": 163007360, "step": 134035 }, { "epoch": 16.794887858664328, "grad_norm": 3.79744029045105, "learning_rate": 7.62183301091643e-07, "loss": 0.4715, "num_input_tokens_seen": 163013344, "step": 134040 }, { "epoch": 16.79551434657311, "grad_norm": 12.278985977172852, "learning_rate": 7.618931885370584e-07, "loss": 0.4607, "num_input_tokens_seen": 163019488, "step": 134045 }, { "epoch": 16.796140834481896, "grad_norm": 7.8782429695129395, "learning_rate": 7.616031266526053e-07, "loss": 0.4374, "num_input_tokens_seen": 163025536, "step": 134050 }, { "epoch": 16.79676732239068, "grad_norm": 3.4314000606536865, "learning_rate": 7.613131154417486e-07, "loss": 0.4017, "num_input_tokens_seen": 163031456, "step": 134055 }, { "epoch": 16.79739381029946, "grad_norm": 2.987440824508667, "learning_rate": 7.610231549079589e-07, "loss": 0.4402, "num_input_tokens_seen": 163037504, "step": 134060 }, { "epoch": 16.798020298208243, "grad_norm": 4.695411205291748, "learning_rate": 7.607332450547e-07, "loss": 0.444, "num_input_tokens_seen": 163043904, "step": 134065 }, { "epoch": 16.79864678611703, "grad_norm": 3.618013620376587, "learning_rate": 7.60443385885441e-07, "loss": 0.4735, "num_input_tokens_seen": 163050176, "step": 134070 }, { "epoch": 16.79927327402581, "grad_norm": 4.191347599029541, "learning_rate": 7.601535774036439e-07, "loss": 0.477, "num_input_tokens_seen": 163056256, "step": 134075 }, { "epoch": 16.799899761934594, "grad_norm": 24.317853927612305, "learning_rate": 7.598638196127773e-07, "loss": 0.5286, "num_input_tokens_seen": 163061888, "step": 134080 }, { "epoch": 16.80052624984338, "grad_norm": 3.511353015899658, "learning_rate": 7.59574112516302e-07, "loss": 0.4326, "num_input_tokens_seen": 163068032, "step": 134085 }, { "epoch": 16.80115273775216, "grad_norm": 6.113600730895996, "learning_rate": 7.592844561176832e-07, "loss": 0.4237, "num_input_tokens_seen": 163073792, "step": 134090 }, { "epoch": 16.801779225660944, "grad_norm": 3.803534984588623, "learning_rate": 7.589948504203859e-07, "loss": 0.4478, "num_input_tokens_seen": 163079808, "step": 134095 }, { "epoch": 16.80240571356973, "grad_norm": 3.6824381351470947, "learning_rate": 7.587052954278689e-07, "loss": 0.4365, "num_input_tokens_seen": 163085888, "step": 134100 }, { "epoch": 16.803032201478512, "grad_norm": 4.248626708984375, "learning_rate": 7.584157911435963e-07, "loss": 0.5151, "num_input_tokens_seen": 163091808, "step": 134105 }, { "epoch": 16.803658689387294, "grad_norm": 6.7533040046691895, "learning_rate": 7.581263375710296e-07, "loss": 0.4868, "num_input_tokens_seen": 163098080, "step": 134110 }, { "epoch": 16.804285177296077, "grad_norm": 8.521960258483887, "learning_rate": 7.578369347136294e-07, "loss": 0.4914, "num_input_tokens_seen": 163104192, "step": 134115 }, { "epoch": 16.804911665204862, "grad_norm": 16.287803649902344, "learning_rate": 7.57547582574854e-07, "loss": 0.644, "num_input_tokens_seen": 163110240, "step": 134120 }, { "epoch": 16.805538153113645, "grad_norm": 3.844540596008301, "learning_rate": 7.572582811581658e-07, "loss": 0.4664, "num_input_tokens_seen": 163116704, "step": 134125 }, { "epoch": 16.806164641022427, "grad_norm": 16.16499900817871, "learning_rate": 7.569690304670202e-07, "loss": 0.4512, "num_input_tokens_seen": 163123008, "step": 134130 }, { "epoch": 16.806791128931213, "grad_norm": 6.6703572273254395, "learning_rate": 7.56679830504879e-07, "loss": 0.4634, "num_input_tokens_seen": 163129248, "step": 134135 }, { "epoch": 16.807417616839995, "grad_norm": 3.2491514682769775, "learning_rate": 7.563906812751964e-07, "loss": 0.4867, "num_input_tokens_seen": 163135648, "step": 134140 }, { "epoch": 16.808044104748777, "grad_norm": 11.443148612976074, "learning_rate": 7.561015827814322e-07, "loss": 0.4723, "num_input_tokens_seen": 163142112, "step": 134145 }, { "epoch": 16.808670592657563, "grad_norm": 3.386988401412964, "learning_rate": 7.558125350270407e-07, "loss": 0.4123, "num_input_tokens_seen": 163147808, "step": 134150 }, { "epoch": 16.809297080566346, "grad_norm": 3.449099540710449, "learning_rate": 7.555235380154796e-07, "loss": 0.4414, "num_input_tokens_seen": 163153984, "step": 134155 }, { "epoch": 16.809923568475128, "grad_norm": 5.748510837554932, "learning_rate": 7.552345917502025e-07, "loss": 0.437, "num_input_tokens_seen": 163160320, "step": 134160 }, { "epoch": 16.81055005638391, "grad_norm": 3.348485231399536, "learning_rate": 7.54945696234664e-07, "loss": 0.4836, "num_input_tokens_seen": 163166336, "step": 134165 }, { "epoch": 16.811176544292696, "grad_norm": 3.153507709503174, "learning_rate": 7.546568514723202e-07, "loss": 0.4634, "num_input_tokens_seen": 163172192, "step": 134170 }, { "epoch": 16.811803032201478, "grad_norm": 2.5108089447021484, "learning_rate": 7.543680574666218e-07, "loss": 0.4948, "num_input_tokens_seen": 163178464, "step": 134175 }, { "epoch": 16.81242952011026, "grad_norm": 2.9070963859558105, "learning_rate": 7.54079314221024e-07, "loss": 0.4929, "num_input_tokens_seen": 163184384, "step": 134180 }, { "epoch": 16.813056008019046, "grad_norm": 18.161508560180664, "learning_rate": 7.537906217389768e-07, "loss": 0.4466, "num_input_tokens_seen": 163190336, "step": 134185 }, { "epoch": 16.81368249592783, "grad_norm": 3.6325302124023438, "learning_rate": 7.53501980023934e-07, "loss": 0.4182, "num_input_tokens_seen": 163196608, "step": 134190 }, { "epoch": 16.81430898383661, "grad_norm": 3.1530165672302246, "learning_rate": 7.532133890793436e-07, "loss": 0.4354, "num_input_tokens_seen": 163202688, "step": 134195 }, { "epoch": 16.814935471745397, "grad_norm": 4.187230110168457, "learning_rate": 7.529248489086577e-07, "loss": 0.4119, "num_input_tokens_seen": 163208352, "step": 134200 }, { "epoch": 16.81556195965418, "grad_norm": 4.193190574645996, "learning_rate": 7.526363595153263e-07, "loss": 0.4459, "num_input_tokens_seen": 163214432, "step": 134205 }, { "epoch": 16.81618844756296, "grad_norm": 13.781880378723145, "learning_rate": 7.523479209027995e-07, "loss": 0.4548, "num_input_tokens_seen": 163220640, "step": 134210 }, { "epoch": 16.816814935471747, "grad_norm": 11.898597717285156, "learning_rate": 7.520595330745228e-07, "loss": 0.415, "num_input_tokens_seen": 163226656, "step": 134215 }, { "epoch": 16.81744142338053, "grad_norm": 8.902817726135254, "learning_rate": 7.517711960339475e-07, "loss": 0.4901, "num_input_tokens_seen": 163232832, "step": 134220 }, { "epoch": 16.81806791128931, "grad_norm": 2.6987338066101074, "learning_rate": 7.514829097845178e-07, "loss": 0.4207, "num_input_tokens_seen": 163239008, "step": 134225 }, { "epoch": 16.818694399198094, "grad_norm": 4.968898773193359, "learning_rate": 7.511946743296822e-07, "loss": 0.4269, "num_input_tokens_seen": 163245120, "step": 134230 }, { "epoch": 16.81932088710688, "grad_norm": 19.380937576293945, "learning_rate": 7.509064896728873e-07, "loss": 0.5503, "num_input_tokens_seen": 163251200, "step": 134235 }, { "epoch": 16.819947375015662, "grad_norm": 4.358263969421387, "learning_rate": 7.506183558175767e-07, "loss": 0.4145, "num_input_tokens_seen": 163257472, "step": 134240 }, { "epoch": 16.820573862924444, "grad_norm": 2.8090715408325195, "learning_rate": 7.503302727671974e-07, "loss": 0.4464, "num_input_tokens_seen": 163263712, "step": 134245 }, { "epoch": 16.82120035083323, "grad_norm": 13.349498748779297, "learning_rate": 7.500422405251917e-07, "loss": 0.4962, "num_input_tokens_seen": 163269632, "step": 134250 }, { "epoch": 16.821826838742012, "grad_norm": 4.003846645355225, "learning_rate": 7.497542590950052e-07, "loss": 0.4464, "num_input_tokens_seen": 163275616, "step": 134255 }, { "epoch": 16.822453326650795, "grad_norm": 12.450143814086914, "learning_rate": 7.49466328480079e-07, "loss": 0.4765, "num_input_tokens_seen": 163281376, "step": 134260 }, { "epoch": 16.82307981455958, "grad_norm": 8.975881576538086, "learning_rate": 7.491784486838572e-07, "loss": 0.5549, "num_input_tokens_seen": 163287328, "step": 134265 }, { "epoch": 16.823706302468363, "grad_norm": 3.8246376514434814, "learning_rate": 7.4889061970978e-07, "loss": 0.525, "num_input_tokens_seen": 163293184, "step": 134270 }, { "epoch": 16.824332790377145, "grad_norm": 3.1941399574279785, "learning_rate": 7.486028415612911e-07, "loss": 0.5535, "num_input_tokens_seen": 163298880, "step": 134275 }, { "epoch": 16.824959278285927, "grad_norm": 4.172652721405029, "learning_rate": 7.483151142418282e-07, "loss": 0.3695, "num_input_tokens_seen": 163305184, "step": 134280 }, { "epoch": 16.825585766194713, "grad_norm": 2.5964207649230957, "learning_rate": 7.480274377548324e-07, "loss": 0.4359, "num_input_tokens_seen": 163311296, "step": 134285 }, { "epoch": 16.826212254103496, "grad_norm": 15.847528457641602, "learning_rate": 7.477398121037449e-07, "loss": 0.5312, "num_input_tokens_seen": 163316832, "step": 134290 }, { "epoch": 16.826838742012278, "grad_norm": 12.129143714904785, "learning_rate": 7.474522372920018e-07, "loss": 0.4366, "num_input_tokens_seen": 163323232, "step": 134295 }, { "epoch": 16.827465229921064, "grad_norm": 17.593843460083008, "learning_rate": 7.471647133230437e-07, "loss": 0.4385, "num_input_tokens_seen": 163328960, "step": 134300 }, { "epoch": 16.828091717829846, "grad_norm": 22.013574600219727, "learning_rate": 7.468772402003061e-07, "loss": 0.5414, "num_input_tokens_seen": 163335232, "step": 134305 }, { "epoch": 16.828718205738628, "grad_norm": 4.940921783447266, "learning_rate": 7.465898179272268e-07, "loss": 0.4737, "num_input_tokens_seen": 163341312, "step": 134310 }, { "epoch": 16.829344693647414, "grad_norm": 3.0206522941589355, "learning_rate": 7.463024465072421e-07, "loss": 0.3825, "num_input_tokens_seen": 163347200, "step": 134315 }, { "epoch": 16.829971181556196, "grad_norm": 23.89155387878418, "learning_rate": 7.460151259437893e-07, "loss": 0.4711, "num_input_tokens_seen": 163353312, "step": 134320 }, { "epoch": 16.83059766946498, "grad_norm": 3.9955718517303467, "learning_rate": 7.457278562403014e-07, "loss": 0.4556, "num_input_tokens_seen": 163359488, "step": 134325 }, { "epoch": 16.83122415737376, "grad_norm": 4.927738189697266, "learning_rate": 7.454406374002143e-07, "loss": 0.4378, "num_input_tokens_seen": 163365760, "step": 134330 }, { "epoch": 16.831850645282547, "grad_norm": 4.053106307983398, "learning_rate": 7.451534694269608e-07, "loss": 0.4625, "num_input_tokens_seen": 163371872, "step": 134335 }, { "epoch": 16.83247713319133, "grad_norm": 3.692009449005127, "learning_rate": 7.448663523239758e-07, "loss": 0.4607, "num_input_tokens_seen": 163377856, "step": 134340 }, { "epoch": 16.83310362110011, "grad_norm": 5.348911762237549, "learning_rate": 7.445792860946899e-07, "loss": 0.4388, "num_input_tokens_seen": 163383904, "step": 134345 }, { "epoch": 16.833730109008897, "grad_norm": 9.762838363647461, "learning_rate": 7.44292270742536e-07, "loss": 0.4684, "num_input_tokens_seen": 163389856, "step": 134350 }, { "epoch": 16.83435659691768, "grad_norm": 2.2198197841644287, "learning_rate": 7.440053062709474e-07, "loss": 0.4847, "num_input_tokens_seen": 163396000, "step": 134355 }, { "epoch": 16.83498308482646, "grad_norm": 12.149091720581055, "learning_rate": 7.43718392683353e-07, "loss": 0.4423, "num_input_tokens_seen": 163401632, "step": 134360 }, { "epoch": 16.835609572735247, "grad_norm": 5.274730205535889, "learning_rate": 7.43431529983184e-07, "loss": 0.5391, "num_input_tokens_seen": 163407456, "step": 134365 }, { "epoch": 16.83623606064403, "grad_norm": 10.507970809936523, "learning_rate": 7.431447181738693e-07, "loss": 0.4103, "num_input_tokens_seen": 163413376, "step": 134370 }, { "epoch": 16.836862548552812, "grad_norm": 6.993100643157959, "learning_rate": 7.428579572588396e-07, "loss": 0.4354, "num_input_tokens_seen": 163419840, "step": 134375 }, { "epoch": 16.837489036461598, "grad_norm": 22.738922119140625, "learning_rate": 7.425712472415208e-07, "loss": 0.4381, "num_input_tokens_seen": 163426144, "step": 134380 }, { "epoch": 16.83811552437038, "grad_norm": 28.153993606567383, "learning_rate": 7.422845881253432e-07, "loss": 0.5389, "num_input_tokens_seen": 163432288, "step": 134385 }, { "epoch": 16.838742012279162, "grad_norm": 7.713720321655273, "learning_rate": 7.419979799137323e-07, "loss": 0.429, "num_input_tokens_seen": 163438432, "step": 134390 }, { "epoch": 16.839368500187945, "grad_norm": 4.921554088592529, "learning_rate": 7.417114226101163e-07, "loss": 0.4069, "num_input_tokens_seen": 163444384, "step": 134395 }, { "epoch": 16.83999498809673, "grad_norm": 5.881450176239014, "learning_rate": 7.414249162179193e-07, "loss": 0.4863, "num_input_tokens_seen": 163450272, "step": 134400 }, { "epoch": 16.840621476005513, "grad_norm": 12.526847839355469, "learning_rate": 7.411384607405681e-07, "loss": 0.444, "num_input_tokens_seen": 163456448, "step": 134405 }, { "epoch": 16.841247963914295, "grad_norm": 6.806944847106934, "learning_rate": 7.408520561814869e-07, "loss": 0.4299, "num_input_tokens_seen": 163462752, "step": 134410 }, { "epoch": 16.84187445182308, "grad_norm": 9.292560577392578, "learning_rate": 7.405657025441015e-07, "loss": 0.4448, "num_input_tokens_seen": 163468896, "step": 134415 }, { "epoch": 16.842500939731863, "grad_norm": 4.035379409790039, "learning_rate": 7.402793998318331e-07, "loss": 0.4412, "num_input_tokens_seen": 163475040, "step": 134420 }, { "epoch": 16.843127427640646, "grad_norm": 17.46052360534668, "learning_rate": 7.39993148048106e-07, "loss": 0.4959, "num_input_tokens_seen": 163481248, "step": 134425 }, { "epoch": 16.84375391554943, "grad_norm": 8.008020401000977, "learning_rate": 7.397069471963436e-07, "loss": 0.4166, "num_input_tokens_seen": 163487328, "step": 134430 }, { "epoch": 16.844380403458214, "grad_norm": 3.0798327922821045, "learning_rate": 7.39420797279965e-07, "loss": 0.4401, "num_input_tokens_seen": 163493792, "step": 134435 }, { "epoch": 16.845006891366996, "grad_norm": 4.04657506942749, "learning_rate": 7.391346983023945e-07, "loss": 0.432, "num_input_tokens_seen": 163500256, "step": 134440 }, { "epoch": 16.84563337927578, "grad_norm": 8.959396362304688, "learning_rate": 7.388486502670495e-07, "loss": 0.5471, "num_input_tokens_seen": 163506432, "step": 134445 }, { "epoch": 16.846259867184564, "grad_norm": 15.453102111816406, "learning_rate": 7.385626531773532e-07, "loss": 0.5272, "num_input_tokens_seen": 163512640, "step": 134450 }, { "epoch": 16.846886355093346, "grad_norm": 5.0819525718688965, "learning_rate": 7.382767070367214e-07, "loss": 0.406, "num_input_tokens_seen": 163518880, "step": 134455 }, { "epoch": 16.84751284300213, "grad_norm": 8.675799369812012, "learning_rate": 7.379908118485762e-07, "loss": 0.4378, "num_input_tokens_seen": 163525056, "step": 134460 }, { "epoch": 16.848139330910914, "grad_norm": 3.5559511184692383, "learning_rate": 7.377049676163328e-07, "loss": 0.4863, "num_input_tokens_seen": 163531456, "step": 134465 }, { "epoch": 16.848765818819697, "grad_norm": 16.626535415649414, "learning_rate": 7.3741917434341e-07, "loss": 0.4241, "num_input_tokens_seen": 163537536, "step": 134470 }, { "epoch": 16.84939230672848, "grad_norm": 8.463519096374512, "learning_rate": 7.371334320332258e-07, "loss": 0.4201, "num_input_tokens_seen": 163543872, "step": 134475 }, { "epoch": 16.850018794637265, "grad_norm": 8.112981796264648, "learning_rate": 7.368477406891944e-07, "loss": 0.4982, "num_input_tokens_seen": 163550144, "step": 134480 }, { "epoch": 16.850645282546047, "grad_norm": 5.376782417297363, "learning_rate": 7.365621003147339e-07, "loss": 0.4416, "num_input_tokens_seen": 163556480, "step": 134485 }, { "epoch": 16.85127177045483, "grad_norm": 4.122056484222412, "learning_rate": 7.362765109132564e-07, "loss": 0.487, "num_input_tokens_seen": 163561952, "step": 134490 }, { "epoch": 16.851898258363615, "grad_norm": 3.597388744354248, "learning_rate": 7.359909724881797e-07, "loss": 0.4835, "num_input_tokens_seen": 163568000, "step": 134495 }, { "epoch": 16.852524746272397, "grad_norm": 23.201967239379883, "learning_rate": 7.357054850429141e-07, "loss": 0.5226, "num_input_tokens_seen": 163574400, "step": 134500 }, { "epoch": 16.85315123418118, "grad_norm": 3.080899238586426, "learning_rate": 7.354200485808749e-07, "loss": 0.5031, "num_input_tokens_seen": 163580512, "step": 134505 }, { "epoch": 16.853777722089962, "grad_norm": 4.010647296905518, "learning_rate": 7.351346631054762e-07, "loss": 0.4525, "num_input_tokens_seen": 163586720, "step": 134510 }, { "epoch": 16.854404209998748, "grad_norm": 4.029262542724609, "learning_rate": 7.348493286201264e-07, "loss": 0.4545, "num_input_tokens_seen": 163592672, "step": 134515 }, { "epoch": 16.85503069790753, "grad_norm": 11.59868335723877, "learning_rate": 7.345640451282393e-07, "loss": 0.4594, "num_input_tokens_seen": 163598720, "step": 134520 }, { "epoch": 16.855657185816312, "grad_norm": 14.930968284606934, "learning_rate": 7.342788126332256e-07, "loss": 0.514, "num_input_tokens_seen": 163605024, "step": 134525 }, { "epoch": 16.8562836737251, "grad_norm": 15.381576538085938, "learning_rate": 7.339936311384948e-07, "loss": 0.4216, "num_input_tokens_seen": 163611136, "step": 134530 }, { "epoch": 16.85691016163388, "grad_norm": 5.45922327041626, "learning_rate": 7.337085006474565e-07, "loss": 0.4517, "num_input_tokens_seen": 163617216, "step": 134535 }, { "epoch": 16.857536649542663, "grad_norm": 10.969724655151367, "learning_rate": 7.334234211635211e-07, "loss": 0.4646, "num_input_tokens_seen": 163623360, "step": 134540 }, { "epoch": 16.85816313745145, "grad_norm": 7.987739562988281, "learning_rate": 7.331383926900947e-07, "loss": 0.4942, "num_input_tokens_seen": 163629312, "step": 134545 }, { "epoch": 16.85878962536023, "grad_norm": 4.7785539627075195, "learning_rate": 7.328534152305878e-07, "loss": 0.4101, "num_input_tokens_seen": 163634880, "step": 134550 }, { "epoch": 16.859416113269013, "grad_norm": 2.514151096343994, "learning_rate": 7.325684887884038e-07, "loss": 0.4781, "num_input_tokens_seen": 163640704, "step": 134555 }, { "epoch": 16.860042601177796, "grad_norm": 10.796313285827637, "learning_rate": 7.322836133669531e-07, "loss": 0.4854, "num_input_tokens_seen": 163646752, "step": 134560 }, { "epoch": 16.86066908908658, "grad_norm": 7.8028483390808105, "learning_rate": 7.319987889696389e-07, "loss": 0.506, "num_input_tokens_seen": 163652896, "step": 134565 }, { "epoch": 16.861295576995364, "grad_norm": 4.611413955688477, "learning_rate": 7.317140155998687e-07, "loss": 0.4528, "num_input_tokens_seen": 163659008, "step": 134570 }, { "epoch": 16.861922064904146, "grad_norm": 21.04829978942871, "learning_rate": 7.314292932610445e-07, "loss": 0.4513, "num_input_tokens_seen": 163665056, "step": 134575 }, { "epoch": 16.86254855281293, "grad_norm": 15.863853454589844, "learning_rate": 7.311446219565732e-07, "loss": 0.4427, "num_input_tokens_seen": 163671424, "step": 134580 }, { "epoch": 16.863175040721714, "grad_norm": 12.835273742675781, "learning_rate": 7.30860001689856e-07, "loss": 0.4812, "num_input_tokens_seen": 163677696, "step": 134585 }, { "epoch": 16.863801528630496, "grad_norm": 4.536593437194824, "learning_rate": 7.305754324642977e-07, "loss": 0.4272, "num_input_tokens_seen": 163683840, "step": 134590 }, { "epoch": 16.864428016539282, "grad_norm": 3.979109525680542, "learning_rate": 7.302909142832986e-07, "loss": 0.4455, "num_input_tokens_seen": 163690080, "step": 134595 }, { "epoch": 16.865054504448064, "grad_norm": 3.5605151653289795, "learning_rate": 7.300064471502616e-07, "loss": 0.5233, "num_input_tokens_seen": 163696064, "step": 134600 }, { "epoch": 16.865680992356847, "grad_norm": 5.579098224639893, "learning_rate": 7.297220310685882e-07, "loss": 0.37, "num_input_tokens_seen": 163702400, "step": 134605 }, { "epoch": 16.866307480265633, "grad_norm": 10.877963066101074, "learning_rate": 7.294376660416774e-07, "loss": 0.4531, "num_input_tokens_seen": 163708448, "step": 134610 }, { "epoch": 16.866933968174415, "grad_norm": 3.686352491378784, "learning_rate": 7.291533520729299e-07, "loss": 0.5645, "num_input_tokens_seen": 163714880, "step": 134615 }, { "epoch": 16.867560456083197, "grad_norm": 10.565743446350098, "learning_rate": 7.288690891657457e-07, "loss": 0.4661, "num_input_tokens_seen": 163721152, "step": 134620 }, { "epoch": 16.86818694399198, "grad_norm": 4.87080717086792, "learning_rate": 7.285848773235216e-07, "loss": 0.4518, "num_input_tokens_seen": 163727168, "step": 134625 }, { "epoch": 16.868813431900765, "grad_norm": 11.33675479888916, "learning_rate": 7.283007165496558e-07, "loss": 0.4494, "num_input_tokens_seen": 163732896, "step": 134630 }, { "epoch": 16.869439919809547, "grad_norm": 12.206010818481445, "learning_rate": 7.28016606847548e-07, "loss": 0.5148, "num_input_tokens_seen": 163739040, "step": 134635 }, { "epoch": 16.87006640771833, "grad_norm": 3.474355459213257, "learning_rate": 7.277325482205927e-07, "loss": 0.4382, "num_input_tokens_seen": 163745600, "step": 134640 }, { "epoch": 16.870692895627116, "grad_norm": 2.370645523071289, "learning_rate": 7.27448540672187e-07, "loss": 0.4498, "num_input_tokens_seen": 163751232, "step": 134645 }, { "epoch": 16.871319383535898, "grad_norm": 21.951114654541016, "learning_rate": 7.271645842057256e-07, "loss": 0.4621, "num_input_tokens_seen": 163757568, "step": 134650 }, { "epoch": 16.87194587144468, "grad_norm": 4.501873970031738, "learning_rate": 7.268806788246041e-07, "loss": 0.4377, "num_input_tokens_seen": 163763840, "step": 134655 }, { "epoch": 16.872572359353466, "grad_norm": 4.34014368057251, "learning_rate": 7.265968245322175e-07, "loss": 0.4351, "num_input_tokens_seen": 163769856, "step": 134660 }, { "epoch": 16.87319884726225, "grad_norm": 17.374801635742188, "learning_rate": 7.263130213319575e-07, "loss": 0.5261, "num_input_tokens_seen": 163776032, "step": 134665 }, { "epoch": 16.87382533517103, "grad_norm": 13.11044692993164, "learning_rate": 7.260292692272198e-07, "loss": 0.4849, "num_input_tokens_seen": 163782272, "step": 134670 }, { "epoch": 16.874451823079813, "grad_norm": 3.279458522796631, "learning_rate": 7.257455682213943e-07, "loss": 0.4449, "num_input_tokens_seen": 163788224, "step": 134675 }, { "epoch": 16.8750783109886, "grad_norm": 5.082493782043457, "learning_rate": 7.254619183178757e-07, "loss": 0.4927, "num_input_tokens_seen": 163794336, "step": 134680 }, { "epoch": 16.87570479889738, "grad_norm": 3.171172618865967, "learning_rate": 7.251783195200518e-07, "loss": 0.4262, "num_input_tokens_seen": 163799968, "step": 134685 }, { "epoch": 16.876331286806163, "grad_norm": 3.676065683364868, "learning_rate": 7.24894771831317e-07, "loss": 0.4296, "num_input_tokens_seen": 163806176, "step": 134690 }, { "epoch": 16.87695777471495, "grad_norm": 3.4375734329223633, "learning_rate": 7.246112752550577e-07, "loss": 0.6045, "num_input_tokens_seen": 163812000, "step": 134695 }, { "epoch": 16.87758426262373, "grad_norm": 28.999435424804688, "learning_rate": 7.243278297946671e-07, "loss": 0.4514, "num_input_tokens_seen": 163818304, "step": 134700 }, { "epoch": 16.878210750532514, "grad_norm": 14.260562896728516, "learning_rate": 7.240444354535304e-07, "loss": 0.4293, "num_input_tokens_seen": 163824256, "step": 134705 }, { "epoch": 16.8788372384413, "grad_norm": 7.324979305267334, "learning_rate": 7.237610922350374e-07, "loss": 0.4725, "num_input_tokens_seen": 163830144, "step": 134710 }, { "epoch": 16.87946372635008, "grad_norm": 13.182462692260742, "learning_rate": 7.234778001425774e-07, "loss": 0.4525, "num_input_tokens_seen": 163836160, "step": 134715 }, { "epoch": 16.880090214258864, "grad_norm": 7.634872913360596, "learning_rate": 7.231945591795342e-07, "loss": 0.4364, "num_input_tokens_seen": 163842368, "step": 134720 }, { "epoch": 16.88071670216765, "grad_norm": 9.250289916992188, "learning_rate": 7.229113693492962e-07, "loss": 0.5016, "num_input_tokens_seen": 163848416, "step": 134725 }, { "epoch": 16.881343190076432, "grad_norm": 2.3246045112609863, "learning_rate": 7.226282306552485e-07, "loss": 0.4458, "num_input_tokens_seen": 163854432, "step": 134730 }, { "epoch": 16.881969677985214, "grad_norm": 5.291812896728516, "learning_rate": 7.223451431007778e-07, "loss": 0.4388, "num_input_tokens_seen": 163860736, "step": 134735 }, { "epoch": 16.882596165893997, "grad_norm": 24.73979949951172, "learning_rate": 7.220621066892664e-07, "loss": 0.5095, "num_input_tokens_seen": 163866944, "step": 134740 }, { "epoch": 16.883222653802783, "grad_norm": 13.84367847442627, "learning_rate": 7.217791214240999e-07, "loss": 0.5148, "num_input_tokens_seen": 163873248, "step": 134745 }, { "epoch": 16.883849141711565, "grad_norm": 4.7266130447387695, "learning_rate": 7.214961873086601e-07, "loss": 0.5255, "num_input_tokens_seen": 163879328, "step": 134750 }, { "epoch": 16.884475629620347, "grad_norm": 21.69457244873047, "learning_rate": 7.212133043463315e-07, "loss": 0.5064, "num_input_tokens_seen": 163885216, "step": 134755 }, { "epoch": 16.885102117529133, "grad_norm": 6.739204406738281, "learning_rate": 7.209304725404942e-07, "loss": 0.4715, "num_input_tokens_seen": 163891456, "step": 134760 }, { "epoch": 16.885728605437915, "grad_norm": 14.248993873596191, "learning_rate": 7.206476918945315e-07, "loss": 0.473, "num_input_tokens_seen": 163897152, "step": 134765 }, { "epoch": 16.886355093346697, "grad_norm": 8.180647850036621, "learning_rate": 7.20364962411823e-07, "loss": 0.4787, "num_input_tokens_seen": 163902976, "step": 134770 }, { "epoch": 16.886981581255483, "grad_norm": 14.730032920837402, "learning_rate": 7.200822840957505e-07, "loss": 0.4121, "num_input_tokens_seen": 163908832, "step": 134775 }, { "epoch": 16.887608069164266, "grad_norm": 4.951107978820801, "learning_rate": 7.197996569496912e-07, "loss": 0.4414, "num_input_tokens_seen": 163914848, "step": 134780 }, { "epoch": 16.888234557073048, "grad_norm": 9.989296913146973, "learning_rate": 7.195170809770258e-07, "loss": 0.4482, "num_input_tokens_seen": 163920128, "step": 134785 }, { "epoch": 16.88886104498183, "grad_norm": 4.164159297943115, "learning_rate": 7.192345561811337e-07, "loss": 0.4109, "num_input_tokens_seen": 163926176, "step": 134790 }, { "epoch": 16.889487532890616, "grad_norm": 6.822646617889404, "learning_rate": 7.189520825653906e-07, "loss": 0.4254, "num_input_tokens_seen": 163932000, "step": 134795 }, { "epoch": 16.8901140207994, "grad_norm": 6.591881275177002, "learning_rate": 7.186696601331755e-07, "loss": 0.4398, "num_input_tokens_seen": 163937920, "step": 134800 }, { "epoch": 16.89074050870818, "grad_norm": 6.876553535461426, "learning_rate": 7.183872888878629e-07, "loss": 0.4598, "num_input_tokens_seen": 163944128, "step": 134805 }, { "epoch": 16.891366996616966, "grad_norm": 4.194390773773193, "learning_rate": 7.181049688328318e-07, "loss": 0.4538, "num_input_tokens_seen": 163950400, "step": 134810 }, { "epoch": 16.89199348452575, "grad_norm": 8.233601570129395, "learning_rate": 7.178226999714538e-07, "loss": 0.4989, "num_input_tokens_seen": 163956704, "step": 134815 }, { "epoch": 16.89261997243453, "grad_norm": 4.013323783874512, "learning_rate": 7.175404823071064e-07, "loss": 0.4606, "num_input_tokens_seen": 163962720, "step": 134820 }, { "epoch": 16.893246460343317, "grad_norm": 3.703786611557007, "learning_rate": 7.172583158431628e-07, "loss": 0.4235, "num_input_tokens_seen": 163968960, "step": 134825 }, { "epoch": 16.8938729482521, "grad_norm": 6.036000728607178, "learning_rate": 7.169762005829978e-07, "loss": 0.4782, "num_input_tokens_seen": 163975200, "step": 134830 }, { "epoch": 16.89449943616088, "grad_norm": 11.316203117370605, "learning_rate": 7.166941365299823e-07, "loss": 0.454, "num_input_tokens_seen": 163981504, "step": 134835 }, { "epoch": 16.895125924069667, "grad_norm": 2.9168202877044678, "learning_rate": 7.164121236874904e-07, "loss": 0.4207, "num_input_tokens_seen": 163987936, "step": 134840 }, { "epoch": 16.89575241197845, "grad_norm": 5.660635471343994, "learning_rate": 7.161301620588923e-07, "loss": 0.4366, "num_input_tokens_seen": 163993920, "step": 134845 }, { "epoch": 16.89637889988723, "grad_norm": 3.262624979019165, "learning_rate": 7.158482516475595e-07, "loss": 0.4139, "num_input_tokens_seen": 164000032, "step": 134850 }, { "epoch": 16.897005387796014, "grad_norm": 5.341784954071045, "learning_rate": 7.15566392456864e-07, "loss": 0.4836, "num_input_tokens_seen": 164006144, "step": 134855 }, { "epoch": 16.8976318757048, "grad_norm": 11.541678428649902, "learning_rate": 7.152845844901735e-07, "loss": 0.445, "num_input_tokens_seen": 164012320, "step": 134860 }, { "epoch": 16.898258363613582, "grad_norm": 7.3810529708862305, "learning_rate": 7.150028277508592e-07, "loss": 0.4606, "num_input_tokens_seen": 164018368, "step": 134865 }, { "epoch": 16.898884851522364, "grad_norm": 6.419658184051514, "learning_rate": 7.147211222422873e-07, "loss": 0.4559, "num_input_tokens_seen": 164024224, "step": 134870 }, { "epoch": 16.89951133943115, "grad_norm": 3.0109381675720215, "learning_rate": 7.144394679678291e-07, "loss": 0.4001, "num_input_tokens_seen": 164030624, "step": 134875 }, { "epoch": 16.900137827339933, "grad_norm": 4.718813419342041, "learning_rate": 7.141578649308484e-07, "loss": 0.4668, "num_input_tokens_seen": 164036736, "step": 134880 }, { "epoch": 16.900764315248715, "grad_norm": 14.772706031799316, "learning_rate": 7.138763131347149e-07, "loss": 0.516, "num_input_tokens_seen": 164042816, "step": 134885 }, { "epoch": 16.9013908031575, "grad_norm": 4.098206043243408, "learning_rate": 7.135948125827924e-07, "loss": 0.4149, "num_input_tokens_seen": 164049056, "step": 134890 }, { "epoch": 16.902017291066283, "grad_norm": 5.603576183319092, "learning_rate": 7.133133632784495e-07, "loss": 0.5352, "num_input_tokens_seen": 164055360, "step": 134895 }, { "epoch": 16.902643778975065, "grad_norm": 9.094164848327637, "learning_rate": 7.130319652250478e-07, "loss": 0.4553, "num_input_tokens_seen": 164061408, "step": 134900 }, { "epoch": 16.903270266883847, "grad_norm": 7.573968410491943, "learning_rate": 7.127506184259531e-07, "loss": 0.4523, "num_input_tokens_seen": 164067200, "step": 134905 }, { "epoch": 16.903896754792633, "grad_norm": 8.892040252685547, "learning_rate": 7.124693228845308e-07, "loss": 0.4564, "num_input_tokens_seen": 164073344, "step": 134910 }, { "epoch": 16.904523242701416, "grad_norm": 8.828901290893555, "learning_rate": 7.121880786041413e-07, "loss": 0.4335, "num_input_tokens_seen": 164079456, "step": 134915 }, { "epoch": 16.905149730610198, "grad_norm": 3.3325538635253906, "learning_rate": 7.119068855881489e-07, "loss": 0.416, "num_input_tokens_seen": 164085536, "step": 134920 }, { "epoch": 16.905776218518984, "grad_norm": 11.505598068237305, "learning_rate": 7.116257438399144e-07, "loss": 0.4359, "num_input_tokens_seen": 164091488, "step": 134925 }, { "epoch": 16.906402706427766, "grad_norm": 14.79156494140625, "learning_rate": 7.113446533627993e-07, "loss": 0.4694, "num_input_tokens_seen": 164097504, "step": 134930 }, { "epoch": 16.90702919433655, "grad_norm": 3.9150288105010986, "learning_rate": 7.110636141601651e-07, "loss": 0.5119, "num_input_tokens_seen": 164103456, "step": 134935 }, { "epoch": 16.907655682245334, "grad_norm": 14.248787879943848, "learning_rate": 7.107826262353717e-07, "loss": 0.4747, "num_input_tokens_seen": 164109664, "step": 134940 }, { "epoch": 16.908282170154116, "grad_norm": 11.246870040893555, "learning_rate": 7.105016895917777e-07, "loss": 0.4587, "num_input_tokens_seen": 164115904, "step": 134945 }, { "epoch": 16.9089086580629, "grad_norm": 3.173520803451538, "learning_rate": 7.10220804232743e-07, "loss": 0.4108, "num_input_tokens_seen": 164121600, "step": 134950 }, { "epoch": 16.90953514597168, "grad_norm": 3.769894599914551, "learning_rate": 7.099399701616244e-07, "loss": 0.4465, "num_input_tokens_seen": 164127616, "step": 134955 }, { "epoch": 16.910161633880467, "grad_norm": 3.587362766265869, "learning_rate": 7.096591873817815e-07, "loss": 0.4232, "num_input_tokens_seen": 164134304, "step": 134960 }, { "epoch": 16.91078812178925, "grad_norm": 3.3695175647735596, "learning_rate": 7.093784558965688e-07, "loss": 0.3821, "num_input_tokens_seen": 164140160, "step": 134965 }, { "epoch": 16.91141460969803, "grad_norm": 14.127304077148438, "learning_rate": 7.090977757093442e-07, "loss": 0.5788, "num_input_tokens_seen": 164146496, "step": 134970 }, { "epoch": 16.912041097606817, "grad_norm": 17.917470932006836, "learning_rate": 7.088171468234645e-07, "loss": 0.4834, "num_input_tokens_seen": 164152576, "step": 134975 }, { "epoch": 16.9126675855156, "grad_norm": 4.535177707672119, "learning_rate": 7.085365692422824e-07, "loss": 0.4371, "num_input_tokens_seen": 164158464, "step": 134980 }, { "epoch": 16.91329407342438, "grad_norm": 7.679396152496338, "learning_rate": 7.082560429691543e-07, "loss": 0.4357, "num_input_tokens_seen": 164164480, "step": 134985 }, { "epoch": 16.913920561333168, "grad_norm": 7.582113742828369, "learning_rate": 7.07975568007433e-07, "loss": 0.4727, "num_input_tokens_seen": 164170496, "step": 134990 }, { "epoch": 16.91454704924195, "grad_norm": 5.914700508117676, "learning_rate": 7.076951443604735e-07, "loss": 0.4747, "num_input_tokens_seen": 164176192, "step": 134995 }, { "epoch": 16.915173537150732, "grad_norm": 7.520395278930664, "learning_rate": 7.07414772031626e-07, "loss": 0.4931, "num_input_tokens_seen": 164182080, "step": 135000 }, { "epoch": 16.915800025059518, "grad_norm": 2.0735888481140137, "learning_rate": 7.071344510242451e-07, "loss": 0.3853, "num_input_tokens_seen": 164188256, "step": 135005 }, { "epoch": 16.9164265129683, "grad_norm": 9.094947814941406, "learning_rate": 7.068541813416796e-07, "loss": 0.4315, "num_input_tokens_seen": 164194144, "step": 135010 }, { "epoch": 16.917053000877083, "grad_norm": 5.8237528800964355, "learning_rate": 7.065739629872831e-07, "loss": 0.477, "num_input_tokens_seen": 164200512, "step": 135015 }, { "epoch": 16.917679488785865, "grad_norm": 5.151181221008301, "learning_rate": 7.062937959644039e-07, "loss": 0.5066, "num_input_tokens_seen": 164206752, "step": 135020 }, { "epoch": 16.91830597669465, "grad_norm": 19.530677795410156, "learning_rate": 7.060136802763917e-07, "loss": 0.4431, "num_input_tokens_seen": 164213120, "step": 135025 }, { "epoch": 16.918932464603433, "grad_norm": 4.975173473358154, "learning_rate": 7.057336159265965e-07, "loss": 0.4836, "num_input_tokens_seen": 164219392, "step": 135030 }, { "epoch": 16.919558952512215, "grad_norm": 4.15329122543335, "learning_rate": 7.054536029183678e-07, "loss": 0.4601, "num_input_tokens_seen": 164225440, "step": 135035 }, { "epoch": 16.920185440421, "grad_norm": 3.587078809738159, "learning_rate": 7.051736412550502e-07, "loss": 0.5333, "num_input_tokens_seen": 164232000, "step": 135040 }, { "epoch": 16.920811928329783, "grad_norm": 2.882612943649292, "learning_rate": 7.048937309399928e-07, "loss": 0.5183, "num_input_tokens_seen": 164237856, "step": 135045 }, { "epoch": 16.921438416238566, "grad_norm": 9.79796314239502, "learning_rate": 7.046138719765432e-07, "loss": 0.4485, "num_input_tokens_seen": 164243904, "step": 135050 }, { "epoch": 16.92206490414735, "grad_norm": 5.269806861877441, "learning_rate": 7.043340643680452e-07, "loss": 0.4458, "num_input_tokens_seen": 164250272, "step": 135055 }, { "epoch": 16.922691392056134, "grad_norm": 10.496694564819336, "learning_rate": 7.040543081178458e-07, "loss": 0.4363, "num_input_tokens_seen": 164256544, "step": 135060 }, { "epoch": 16.923317879964916, "grad_norm": 3.163663148880005, "learning_rate": 7.03774603229288e-07, "loss": 0.4341, "num_input_tokens_seen": 164262752, "step": 135065 }, { "epoch": 16.923944367873702, "grad_norm": 3.380272388458252, "learning_rate": 7.034949497057181e-07, "loss": 0.514, "num_input_tokens_seen": 164268544, "step": 135070 }, { "epoch": 16.924570855782484, "grad_norm": 11.888472557067871, "learning_rate": 7.032153475504766e-07, "loss": 0.4366, "num_input_tokens_seen": 164274688, "step": 135075 }, { "epoch": 16.925197343691266, "grad_norm": 3.8066585063934326, "learning_rate": 7.029357967669098e-07, "loss": 0.4304, "num_input_tokens_seen": 164280128, "step": 135080 }, { "epoch": 16.92582383160005, "grad_norm": 2.7173309326171875, "learning_rate": 7.026562973583572e-07, "loss": 0.4264, "num_input_tokens_seen": 164286048, "step": 135085 }, { "epoch": 16.926450319508834, "grad_norm": 4.972403526306152, "learning_rate": 7.023768493281619e-07, "loss": 0.4477, "num_input_tokens_seen": 164292096, "step": 135090 }, { "epoch": 16.927076807417617, "grad_norm": 7.608156681060791, "learning_rate": 7.020974526796654e-07, "loss": 0.4213, "num_input_tokens_seen": 164298240, "step": 135095 }, { "epoch": 16.9277032953264, "grad_norm": 12.154260635375977, "learning_rate": 7.018181074162062e-07, "loss": 0.3979, "num_input_tokens_seen": 164304352, "step": 135100 }, { "epoch": 16.928329783235185, "grad_norm": 13.089385032653809, "learning_rate": 7.015388135411266e-07, "loss": 0.5483, "num_input_tokens_seen": 164309408, "step": 135105 }, { "epoch": 16.928956271143967, "grad_norm": 11.557007789611816, "learning_rate": 7.01259571057763e-07, "loss": 0.455, "num_input_tokens_seen": 164315840, "step": 135110 }, { "epoch": 16.92958275905275, "grad_norm": 4.059820652008057, "learning_rate": 7.009803799694564e-07, "loss": 0.4489, "num_input_tokens_seen": 164321728, "step": 135115 }, { "epoch": 16.930209246961535, "grad_norm": 2.706557512283325, "learning_rate": 7.007012402795432e-07, "loss": 0.4913, "num_input_tokens_seen": 164327776, "step": 135120 }, { "epoch": 16.930835734870318, "grad_norm": 4.50850248336792, "learning_rate": 7.004221519913607e-07, "loss": 0.4385, "num_input_tokens_seen": 164333984, "step": 135125 }, { "epoch": 16.9314622227791, "grad_norm": 26.298194885253906, "learning_rate": 7.001431151082477e-07, "loss": 0.5617, "num_input_tokens_seen": 164340128, "step": 135130 }, { "epoch": 16.932088710687882, "grad_norm": 3.4020681381225586, "learning_rate": 6.998641296335379e-07, "loss": 0.4404, "num_input_tokens_seen": 164345952, "step": 135135 }, { "epoch": 16.932715198596668, "grad_norm": 3.5810317993164062, "learning_rate": 6.995851955705673e-07, "loss": 0.4128, "num_input_tokens_seen": 164352192, "step": 135140 }, { "epoch": 16.93334168650545, "grad_norm": 4.803893089294434, "learning_rate": 6.993063129226729e-07, "loss": 0.4899, "num_input_tokens_seen": 164358304, "step": 135145 }, { "epoch": 16.933968174414233, "grad_norm": 4.151735782623291, "learning_rate": 6.990274816931863e-07, "loss": 0.4446, "num_input_tokens_seen": 164364672, "step": 135150 }, { "epoch": 16.93459466232302, "grad_norm": 5.514181613922119, "learning_rate": 6.987487018854417e-07, "loss": 0.4401, "num_input_tokens_seen": 164370720, "step": 135155 }, { "epoch": 16.9352211502318, "grad_norm": 2.900752544403076, "learning_rate": 6.984699735027745e-07, "loss": 0.4635, "num_input_tokens_seen": 164376672, "step": 135160 }, { "epoch": 16.935847638140583, "grad_norm": 13.007088661193848, "learning_rate": 6.981912965485133e-07, "loss": 0.5597, "num_input_tokens_seen": 164382944, "step": 135165 }, { "epoch": 16.93647412604937, "grad_norm": 19.06065559387207, "learning_rate": 6.979126710259937e-07, "loss": 0.4595, "num_input_tokens_seen": 164389280, "step": 135170 }, { "epoch": 16.93710061395815, "grad_norm": 7.2886834144592285, "learning_rate": 6.976340969385437e-07, "loss": 0.4792, "num_input_tokens_seen": 164395456, "step": 135175 }, { "epoch": 16.937727101866933, "grad_norm": 12.563854217529297, "learning_rate": 6.973555742894967e-07, "loss": 0.4463, "num_input_tokens_seen": 164401120, "step": 135180 }, { "epoch": 16.938353589775716, "grad_norm": 3.4620771408081055, "learning_rate": 6.970771030821799e-07, "loss": 0.4531, "num_input_tokens_seen": 164407104, "step": 135185 }, { "epoch": 16.9389800776845, "grad_norm": 9.140119552612305, "learning_rate": 6.967986833199253e-07, "loss": 0.4667, "num_input_tokens_seen": 164413088, "step": 135190 }, { "epoch": 16.939606565593284, "grad_norm": 3.0897529125213623, "learning_rate": 6.96520315006059e-07, "loss": 0.5031, "num_input_tokens_seen": 164419424, "step": 135195 }, { "epoch": 16.940233053502066, "grad_norm": 24.766399383544922, "learning_rate": 6.962419981439117e-07, "loss": 0.5024, "num_input_tokens_seen": 164425472, "step": 135200 }, { "epoch": 16.940859541410852, "grad_norm": 3.4348483085632324, "learning_rate": 6.959637327368085e-07, "loss": 0.4621, "num_input_tokens_seen": 164431488, "step": 135205 }, { "epoch": 16.941486029319634, "grad_norm": 10.725217819213867, "learning_rate": 6.95685518788079e-07, "loss": 0.4816, "num_input_tokens_seen": 164437760, "step": 135210 }, { "epoch": 16.942112517228416, "grad_norm": 5.420355319976807, "learning_rate": 6.954073563010466e-07, "loss": 0.4133, "num_input_tokens_seen": 164444128, "step": 135215 }, { "epoch": 16.942739005137202, "grad_norm": 6.3410868644714355, "learning_rate": 6.951292452790381e-07, "loss": 0.4181, "num_input_tokens_seen": 164450464, "step": 135220 }, { "epoch": 16.943365493045984, "grad_norm": 3.1122171878814697, "learning_rate": 6.948511857253803e-07, "loss": 0.3967, "num_input_tokens_seen": 164456128, "step": 135225 }, { "epoch": 16.943991980954767, "grad_norm": 24.078035354614258, "learning_rate": 6.945731776433951e-07, "loss": 0.5483, "num_input_tokens_seen": 164462016, "step": 135230 }, { "epoch": 16.944618468863553, "grad_norm": 14.73581600189209, "learning_rate": 6.942952210364074e-07, "loss": 0.468, "num_input_tokens_seen": 164468096, "step": 135235 }, { "epoch": 16.945244956772335, "grad_norm": 3.2712392807006836, "learning_rate": 6.940173159077401e-07, "loss": 0.427, "num_input_tokens_seen": 164474304, "step": 135240 }, { "epoch": 16.945871444681117, "grad_norm": 4.188559532165527, "learning_rate": 6.937394622607174e-07, "loss": 0.4425, "num_input_tokens_seen": 164480192, "step": 135245 }, { "epoch": 16.9464979325899, "grad_norm": 3.6434412002563477, "learning_rate": 6.934616600986587e-07, "loss": 0.4253, "num_input_tokens_seen": 164486112, "step": 135250 }, { "epoch": 16.947124420498685, "grad_norm": 16.816383361816406, "learning_rate": 6.931839094248877e-07, "loss": 0.4832, "num_input_tokens_seen": 164492224, "step": 135255 }, { "epoch": 16.947750908407468, "grad_norm": 8.90774917602539, "learning_rate": 6.929062102427225e-07, "loss": 0.4408, "num_input_tokens_seen": 164498304, "step": 135260 }, { "epoch": 16.94837739631625, "grad_norm": 3.5313034057617188, "learning_rate": 6.926285625554869e-07, "loss": 0.4367, "num_input_tokens_seen": 164504384, "step": 135265 }, { "epoch": 16.949003884225036, "grad_norm": 5.180436134338379, "learning_rate": 6.923509663664963e-07, "loss": 0.4566, "num_input_tokens_seen": 164510304, "step": 135270 }, { "epoch": 16.949630372133818, "grad_norm": 29.82090950012207, "learning_rate": 6.92073421679072e-07, "loss": 0.5088, "num_input_tokens_seen": 164516544, "step": 135275 }, { "epoch": 16.9502568600426, "grad_norm": 3.695485830307007, "learning_rate": 6.917959284965326e-07, "loss": 0.5927, "num_input_tokens_seen": 164522528, "step": 135280 }, { "epoch": 16.950883347951386, "grad_norm": 3.9403345584869385, "learning_rate": 6.915184868221941e-07, "loss": 0.4547, "num_input_tokens_seen": 164528608, "step": 135285 }, { "epoch": 16.95150983586017, "grad_norm": 16.016735076904297, "learning_rate": 6.912410966593758e-07, "loss": 0.3994, "num_input_tokens_seen": 164534784, "step": 135290 }, { "epoch": 16.95213632376895, "grad_norm": 7.2661943435668945, "learning_rate": 6.90963758011391e-07, "loss": 0.5036, "num_input_tokens_seen": 164541504, "step": 135295 }, { "epoch": 16.952762811677733, "grad_norm": 7.663029193878174, "learning_rate": 6.906864708815591e-07, "loss": 0.4721, "num_input_tokens_seen": 164547456, "step": 135300 }, { "epoch": 16.95338929958652, "grad_norm": 3.6296885013580322, "learning_rate": 6.904092352731923e-07, "loss": 0.4971, "num_input_tokens_seen": 164553664, "step": 135305 }, { "epoch": 16.9540157874953, "grad_norm": 3.1654469966888428, "learning_rate": 6.90132051189607e-07, "loss": 0.4495, "num_input_tokens_seen": 164559936, "step": 135310 }, { "epoch": 16.954642275404083, "grad_norm": 15.973943710327148, "learning_rate": 6.898549186341158e-07, "loss": 0.4449, "num_input_tokens_seen": 164566144, "step": 135315 }, { "epoch": 16.95526876331287, "grad_norm": 5.452497482299805, "learning_rate": 6.895778376100343e-07, "loss": 0.4585, "num_input_tokens_seen": 164572000, "step": 135320 }, { "epoch": 16.95589525122165, "grad_norm": 5.159186840057373, "learning_rate": 6.89300808120672e-07, "loss": 0.4563, "num_input_tokens_seen": 164578080, "step": 135325 }, { "epoch": 16.956521739130434, "grad_norm": 9.73067855834961, "learning_rate": 6.890238301693431e-07, "loss": 0.4904, "num_input_tokens_seen": 164584576, "step": 135330 }, { "epoch": 16.95714822703922, "grad_norm": 2.5255441665649414, "learning_rate": 6.887469037593602e-07, "loss": 0.5238, "num_input_tokens_seen": 164590656, "step": 135335 }, { "epoch": 16.957774714948002, "grad_norm": 4.995167255401611, "learning_rate": 6.884700288940311e-07, "loss": 0.4919, "num_input_tokens_seen": 164596384, "step": 135340 }, { "epoch": 16.958401202856784, "grad_norm": 6.171607971191406, "learning_rate": 6.881932055766682e-07, "loss": 0.4563, "num_input_tokens_seen": 164601824, "step": 135345 }, { "epoch": 16.95902769076557, "grad_norm": 2.23559308052063, "learning_rate": 6.879164338105804e-07, "loss": 0.4064, "num_input_tokens_seen": 164607904, "step": 135350 }, { "epoch": 16.959654178674352, "grad_norm": 3.5199294090270996, "learning_rate": 6.876397135990781e-07, "loss": 0.4309, "num_input_tokens_seen": 164614144, "step": 135355 }, { "epoch": 16.960280666583134, "grad_norm": 12.202098846435547, "learning_rate": 6.873630449454677e-07, "loss": 0.455, "num_input_tokens_seen": 164620256, "step": 135360 }, { "epoch": 16.960907154491917, "grad_norm": 22.56957244873047, "learning_rate": 6.870864278530592e-07, "loss": 0.4919, "num_input_tokens_seen": 164626688, "step": 135365 }, { "epoch": 16.961533642400703, "grad_norm": 4.498696327209473, "learning_rate": 6.868098623251568e-07, "loss": 0.4305, "num_input_tokens_seen": 164632704, "step": 135370 }, { "epoch": 16.962160130309485, "grad_norm": 4.7423529624938965, "learning_rate": 6.865333483650704e-07, "loss": 0.429, "num_input_tokens_seen": 164638464, "step": 135375 }, { "epoch": 16.962786618218267, "grad_norm": 4.076234817504883, "learning_rate": 6.862568859761037e-07, "loss": 0.4528, "num_input_tokens_seen": 164644288, "step": 135380 }, { "epoch": 16.963413106127053, "grad_norm": 3.18515682220459, "learning_rate": 6.859804751615634e-07, "loss": 0.4373, "num_input_tokens_seen": 164650560, "step": 135385 }, { "epoch": 16.964039594035835, "grad_norm": 3.625546455383301, "learning_rate": 6.857041159247524e-07, "loss": 0.4349, "num_input_tokens_seen": 164656800, "step": 135390 }, { "epoch": 16.964666081944618, "grad_norm": 7.038537502288818, "learning_rate": 6.854278082689769e-07, "loss": 0.4192, "num_input_tokens_seen": 164662848, "step": 135395 }, { "epoch": 16.965292569853403, "grad_norm": 5.74507474899292, "learning_rate": 6.851515521975388e-07, "loss": 0.4973, "num_input_tokens_seen": 164669024, "step": 135400 }, { "epoch": 16.965919057762186, "grad_norm": 13.036558151245117, "learning_rate": 6.84875347713741e-07, "loss": 0.4473, "num_input_tokens_seen": 164674976, "step": 135405 }, { "epoch": 16.966545545670968, "grad_norm": 4.863698959350586, "learning_rate": 6.84599194820888e-07, "loss": 0.4458, "num_input_tokens_seen": 164680992, "step": 135410 }, { "epoch": 16.96717203357975, "grad_norm": 13.875112533569336, "learning_rate": 6.843230935222789e-07, "loss": 0.4365, "num_input_tokens_seen": 164686880, "step": 135415 }, { "epoch": 16.967798521488536, "grad_norm": 5.037657260894775, "learning_rate": 6.84047043821216e-07, "loss": 0.5281, "num_input_tokens_seen": 164692608, "step": 135420 }, { "epoch": 16.96842500939732, "grad_norm": 6.021206855773926, "learning_rate": 6.837710457209989e-07, "loss": 0.4674, "num_input_tokens_seen": 164698784, "step": 135425 }, { "epoch": 16.9690514973061, "grad_norm": 3.506458282470703, "learning_rate": 6.834950992249284e-07, "loss": 0.4778, "num_input_tokens_seen": 164704768, "step": 135430 }, { "epoch": 16.969677985214886, "grad_norm": 4.303443908691406, "learning_rate": 6.832192043363023e-07, "loss": 0.4755, "num_input_tokens_seen": 164710880, "step": 135435 }, { "epoch": 16.97030447312367, "grad_norm": 14.144878387451172, "learning_rate": 6.829433610584202e-07, "loss": 0.4248, "num_input_tokens_seen": 164717056, "step": 135440 }, { "epoch": 16.97093096103245, "grad_norm": 4.363700866699219, "learning_rate": 6.826675693945794e-07, "loss": 0.4921, "num_input_tokens_seen": 164722912, "step": 135445 }, { "epoch": 16.971557448941237, "grad_norm": 4.145548343658447, "learning_rate": 6.823918293480791e-07, "loss": 0.4246, "num_input_tokens_seen": 164729312, "step": 135450 }, { "epoch": 16.97218393685002, "grad_norm": 9.822813987731934, "learning_rate": 6.821161409222133e-07, "loss": 0.4787, "num_input_tokens_seen": 164735328, "step": 135455 }, { "epoch": 16.9728104247588, "grad_norm": 4.136324882507324, "learning_rate": 6.818405041202792e-07, "loss": 0.4363, "num_input_tokens_seen": 164741440, "step": 135460 }, { "epoch": 16.973436912667587, "grad_norm": 6.069043159484863, "learning_rate": 6.815649189455731e-07, "loss": 0.4125, "num_input_tokens_seen": 164747456, "step": 135465 }, { "epoch": 16.97406340057637, "grad_norm": 28.508020401000977, "learning_rate": 6.812893854013886e-07, "loss": 0.4689, "num_input_tokens_seen": 164753440, "step": 135470 }, { "epoch": 16.974689888485152, "grad_norm": 3.6560986042022705, "learning_rate": 6.810139034910212e-07, "loss": 0.4446, "num_input_tokens_seen": 164759360, "step": 135475 }, { "epoch": 16.975316376393934, "grad_norm": 6.253562927246094, "learning_rate": 6.80738473217763e-07, "loss": 0.4381, "num_input_tokens_seen": 164765344, "step": 135480 }, { "epoch": 16.97594286430272, "grad_norm": 3.2398457527160645, "learning_rate": 6.804630945849089e-07, "loss": 0.4076, "num_input_tokens_seen": 164771360, "step": 135485 }, { "epoch": 16.976569352211502, "grad_norm": 3.2713875770568848, "learning_rate": 6.801877675957491e-07, "loss": 0.3972, "num_input_tokens_seen": 164777088, "step": 135490 }, { "epoch": 16.977195840120284, "grad_norm": 14.290634155273438, "learning_rate": 6.799124922535771e-07, "loss": 0.4405, "num_input_tokens_seen": 164783392, "step": 135495 }, { "epoch": 16.97782232802907, "grad_norm": 2.7739503383636475, "learning_rate": 6.796372685616826e-07, "loss": 0.4562, "num_input_tokens_seen": 164789440, "step": 135500 }, { "epoch": 16.978448815937853, "grad_norm": 6.460955619812012, "learning_rate": 6.793620965233577e-07, "loss": 0.4175, "num_input_tokens_seen": 164795200, "step": 135505 }, { "epoch": 16.979075303846635, "grad_norm": 4.324209213256836, "learning_rate": 6.790869761418911e-07, "loss": 0.52, "num_input_tokens_seen": 164800768, "step": 135510 }, { "epoch": 16.97970179175542, "grad_norm": 4.289942264556885, "learning_rate": 6.788119074205729e-07, "loss": 0.4359, "num_input_tokens_seen": 164807040, "step": 135515 }, { "epoch": 16.980328279664203, "grad_norm": 4.984341621398926, "learning_rate": 6.78536890362691e-07, "loss": 0.4268, "num_input_tokens_seen": 164812544, "step": 135520 }, { "epoch": 16.980954767572985, "grad_norm": 5.931657791137695, "learning_rate": 6.782619249715333e-07, "loss": 0.4122, "num_input_tokens_seen": 164818496, "step": 135525 }, { "epoch": 16.981581255481768, "grad_norm": 5.889762878417969, "learning_rate": 6.779870112503889e-07, "loss": 0.5212, "num_input_tokens_seen": 164824544, "step": 135530 }, { "epoch": 16.982207743390553, "grad_norm": 5.175833225250244, "learning_rate": 6.77712149202543e-07, "loss": 0.4904, "num_input_tokens_seen": 164830560, "step": 135535 }, { "epoch": 16.982834231299336, "grad_norm": 6.444320201873779, "learning_rate": 6.774373388312827e-07, "loss": 0.4307, "num_input_tokens_seen": 164836160, "step": 135540 }, { "epoch": 16.983460719208118, "grad_norm": 3.63320255279541, "learning_rate": 6.771625801398928e-07, "loss": 0.4538, "num_input_tokens_seen": 164841824, "step": 135545 }, { "epoch": 16.984087207116904, "grad_norm": 3.4352407455444336, "learning_rate": 6.76887873131658e-07, "loss": 0.3823, "num_input_tokens_seen": 164847872, "step": 135550 }, { "epoch": 16.984713695025686, "grad_norm": 7.8854265213012695, "learning_rate": 6.766132178098639e-07, "loss": 0.418, "num_input_tokens_seen": 164853728, "step": 135555 }, { "epoch": 16.98534018293447, "grad_norm": 3.061319351196289, "learning_rate": 6.763386141777951e-07, "loss": 0.4767, "num_input_tokens_seen": 164860000, "step": 135560 }, { "epoch": 16.985966670843254, "grad_norm": 3.598703145980835, "learning_rate": 6.760640622387315e-07, "loss": 0.4318, "num_input_tokens_seen": 164866272, "step": 135565 }, { "epoch": 16.986593158752036, "grad_norm": 23.38507080078125, "learning_rate": 6.757895619959587e-07, "loss": 0.5097, "num_input_tokens_seen": 164872256, "step": 135570 }, { "epoch": 16.98721964666082, "grad_norm": 4.010263442993164, "learning_rate": 6.75515113452756e-07, "loss": 0.4186, "num_input_tokens_seen": 164878304, "step": 135575 }, { "epoch": 16.9878461345696, "grad_norm": 4.134119987487793, "learning_rate": 6.752407166124076e-07, "loss": 0.4133, "num_input_tokens_seen": 164884000, "step": 135580 }, { "epoch": 16.988472622478387, "grad_norm": 4.892251968383789, "learning_rate": 6.749663714781906e-07, "loss": 0.4281, "num_input_tokens_seen": 164890240, "step": 135585 }, { "epoch": 16.98909911038717, "grad_norm": 15.083582878112793, "learning_rate": 6.746920780533878e-07, "loss": 0.5145, "num_input_tokens_seen": 164896320, "step": 135590 }, { "epoch": 16.98972559829595, "grad_norm": 3.4225242137908936, "learning_rate": 6.744178363412779e-07, "loss": 0.4401, "num_input_tokens_seen": 164901184, "step": 135595 }, { "epoch": 16.990352086204737, "grad_norm": 2.2949812412261963, "learning_rate": 6.74143646345139e-07, "loss": 0.4158, "num_input_tokens_seen": 164907392, "step": 135600 }, { "epoch": 16.99097857411352, "grad_norm": 4.387548446655273, "learning_rate": 6.738695080682511e-07, "loss": 0.4037, "num_input_tokens_seen": 164913824, "step": 135605 }, { "epoch": 16.991605062022302, "grad_norm": 8.897130012512207, "learning_rate": 6.73595421513889e-07, "loss": 0.5344, "num_input_tokens_seen": 164920160, "step": 135610 }, { "epoch": 16.992231549931088, "grad_norm": 3.2574362754821777, "learning_rate": 6.733213866853322e-07, "loss": 0.4047, "num_input_tokens_seen": 164926336, "step": 135615 }, { "epoch": 16.99285803783987, "grad_norm": 6.225818634033203, "learning_rate": 6.730474035858547e-07, "loss": 0.3964, "num_input_tokens_seen": 164932384, "step": 135620 }, { "epoch": 16.993484525748652, "grad_norm": 17.04428482055664, "learning_rate": 6.727734722187346e-07, "loss": 0.5406, "num_input_tokens_seen": 164938496, "step": 135625 }, { "epoch": 16.994111013657438, "grad_norm": 22.637683868408203, "learning_rate": 6.724995925872446e-07, "loss": 0.4892, "num_input_tokens_seen": 164944800, "step": 135630 }, { "epoch": 16.99473750156622, "grad_norm": 2.9287612438201904, "learning_rate": 6.722257646946617e-07, "loss": 0.5008, "num_input_tokens_seen": 164950592, "step": 135635 }, { "epoch": 16.995363989475003, "grad_norm": 6.0827155113220215, "learning_rate": 6.719519885442566e-07, "loss": 0.4711, "num_input_tokens_seen": 164956672, "step": 135640 }, { "epoch": 16.995990477383785, "grad_norm": 9.224716186523438, "learning_rate": 6.716782641393049e-07, "loss": 0.4449, "num_input_tokens_seen": 164962944, "step": 135645 }, { "epoch": 16.99661696529257, "grad_norm": 19.535947799682617, "learning_rate": 6.714045914830785e-07, "loss": 0.4616, "num_input_tokens_seen": 164969312, "step": 135650 }, { "epoch": 16.997243453201353, "grad_norm": 5.107517242431641, "learning_rate": 6.711309705788505e-07, "loss": 0.4144, "num_input_tokens_seen": 164975584, "step": 135655 }, { "epoch": 16.997869941110135, "grad_norm": 15.422149658203125, "learning_rate": 6.708574014298902e-07, "loss": 0.4834, "num_input_tokens_seen": 164981472, "step": 135660 }, { "epoch": 16.99849642901892, "grad_norm": 14.397247314453125, "learning_rate": 6.705838840394696e-07, "loss": 0.4574, "num_input_tokens_seen": 164987584, "step": 135665 }, { "epoch": 16.999122916927703, "grad_norm": 6.1793036460876465, "learning_rate": 6.703104184108594e-07, "loss": 0.4495, "num_input_tokens_seen": 164993952, "step": 135670 }, { "epoch": 16.999749404836486, "grad_norm": 4.177015781402588, "learning_rate": 6.700370045473275e-07, "loss": 0.4855, "num_input_tokens_seen": 164999680, "step": 135675 }, { "epoch": 17.00037589274527, "grad_norm": 3.76261043548584, "learning_rate": 6.697636424521448e-07, "loss": 0.4523, "num_input_tokens_seen": 165006112, "step": 135680 }, { "epoch": 17.001002380654054, "grad_norm": 7.492097854614258, "learning_rate": 6.694903321285773e-07, "loss": 0.4525, "num_input_tokens_seen": 165012480, "step": 135685 }, { "epoch": 17.001628868562836, "grad_norm": 3.965071201324463, "learning_rate": 6.692170735798947e-07, "loss": 0.5302, "num_input_tokens_seen": 165018560, "step": 135690 }, { "epoch": 17.00225535647162, "grad_norm": 6.108999729156494, "learning_rate": 6.689438668093617e-07, "loss": 0.4105, "num_input_tokens_seen": 165024864, "step": 135695 }, { "epoch": 17.002881844380404, "grad_norm": 7.37058687210083, "learning_rate": 6.686707118202479e-07, "loss": 0.3989, "num_input_tokens_seen": 165031296, "step": 135700 }, { "epoch": 17.003508332289186, "grad_norm": 22.481830596923828, "learning_rate": 6.683976086158156e-07, "loss": 0.5089, "num_input_tokens_seen": 165037216, "step": 135705 }, { "epoch": 17.00413482019797, "grad_norm": 5.348717212677002, "learning_rate": 6.681245571993322e-07, "loss": 0.4371, "num_input_tokens_seen": 165042912, "step": 135710 }, { "epoch": 17.004761308106755, "grad_norm": 17.367916107177734, "learning_rate": 6.678515575740629e-07, "loss": 0.4568, "num_input_tokens_seen": 165048512, "step": 135715 }, { "epoch": 17.005387796015537, "grad_norm": 4.340876579284668, "learning_rate": 6.675786097432691e-07, "loss": 0.4368, "num_input_tokens_seen": 165054848, "step": 135720 }, { "epoch": 17.00601428392432, "grad_norm": 5.811354637145996, "learning_rate": 6.673057137102168e-07, "loss": 0.4782, "num_input_tokens_seen": 165060960, "step": 135725 }, { "epoch": 17.006640771833105, "grad_norm": 8.646971702575684, "learning_rate": 6.670328694781658e-07, "loss": 0.4176, "num_input_tokens_seen": 165066848, "step": 135730 }, { "epoch": 17.007267259741887, "grad_norm": 14.963698387145996, "learning_rate": 6.667600770503812e-07, "loss": 0.5161, "num_input_tokens_seen": 165072608, "step": 135735 }, { "epoch": 17.00789374765067, "grad_norm": 3.894493341445923, "learning_rate": 6.664873364301222e-07, "loss": 0.4284, "num_input_tokens_seen": 165078816, "step": 135740 }, { "epoch": 17.008520235559455, "grad_norm": 3.8563759326934814, "learning_rate": 6.662146476206504e-07, "loss": 0.4459, "num_input_tokens_seen": 165085408, "step": 135745 }, { "epoch": 17.009146723468238, "grad_norm": 4.575611114501953, "learning_rate": 6.659420106252273e-07, "loss": 0.3913, "num_input_tokens_seen": 165091648, "step": 135750 }, { "epoch": 17.00977321137702, "grad_norm": 10.208016395568848, "learning_rate": 6.656694254471102e-07, "loss": 0.4436, "num_input_tokens_seen": 165097856, "step": 135755 }, { "epoch": 17.010399699285802, "grad_norm": 22.83837127685547, "learning_rate": 6.653968920895593e-07, "loss": 0.429, "num_input_tokens_seen": 165104448, "step": 135760 }, { "epoch": 17.011026187194588, "grad_norm": 3.9502933025360107, "learning_rate": 6.651244105558335e-07, "loss": 0.4516, "num_input_tokens_seen": 165111040, "step": 135765 }, { "epoch": 17.01165267510337, "grad_norm": 3.926255941390991, "learning_rate": 6.648519808491893e-07, "loss": 0.4337, "num_input_tokens_seen": 165117120, "step": 135770 }, { "epoch": 17.012279163012153, "grad_norm": 4.436666011810303, "learning_rate": 6.645796029728845e-07, "loss": 0.4289, "num_input_tokens_seen": 165123232, "step": 135775 }, { "epoch": 17.01290565092094, "grad_norm": 8.141702651977539, "learning_rate": 6.643072769301767e-07, "loss": 0.4486, "num_input_tokens_seen": 165129344, "step": 135780 }, { "epoch": 17.01353213882972, "grad_norm": 16.487703323364258, "learning_rate": 6.640350027243192e-07, "loss": 0.4481, "num_input_tokens_seen": 165134848, "step": 135785 }, { "epoch": 17.014158626738503, "grad_norm": 6.5050811767578125, "learning_rate": 6.6376278035857e-07, "loss": 0.4337, "num_input_tokens_seen": 165140864, "step": 135790 }, { "epoch": 17.01478511464729, "grad_norm": 3.4031693935394287, "learning_rate": 6.634906098361815e-07, "loss": 0.4373, "num_input_tokens_seen": 165146976, "step": 135795 }, { "epoch": 17.01541160255607, "grad_norm": 14.367255210876465, "learning_rate": 6.632184911604095e-07, "loss": 0.5006, "num_input_tokens_seen": 165153056, "step": 135800 }, { "epoch": 17.016038090464853, "grad_norm": 32.063602447509766, "learning_rate": 6.629464243345057e-07, "loss": 0.615, "num_input_tokens_seen": 165158976, "step": 135805 }, { "epoch": 17.016664578373636, "grad_norm": 7.922085762023926, "learning_rate": 6.626744093617249e-07, "loss": 0.5068, "num_input_tokens_seen": 165165248, "step": 135810 }, { "epoch": 17.01729106628242, "grad_norm": 3.187164783477783, "learning_rate": 6.624024462453171e-07, "loss": 0.386, "num_input_tokens_seen": 165171360, "step": 135815 }, { "epoch": 17.017917554191204, "grad_norm": 6.351507663726807, "learning_rate": 6.621305349885359e-07, "loss": 0.4388, "num_input_tokens_seen": 165176864, "step": 135820 }, { "epoch": 17.018544042099986, "grad_norm": 4.523996829986572, "learning_rate": 6.618586755946299e-07, "loss": 0.4054, "num_input_tokens_seen": 165183104, "step": 135825 }, { "epoch": 17.019170530008772, "grad_norm": 7.68438720703125, "learning_rate": 6.615868680668525e-07, "loss": 0.4496, "num_input_tokens_seen": 165189280, "step": 135830 }, { "epoch": 17.019797017917554, "grad_norm": 19.364046096801758, "learning_rate": 6.613151124084499e-07, "loss": 0.4426, "num_input_tokens_seen": 165195520, "step": 135835 }, { "epoch": 17.020423505826336, "grad_norm": 7.208966255187988, "learning_rate": 6.61043408622673e-07, "loss": 0.4367, "num_input_tokens_seen": 165201024, "step": 135840 }, { "epoch": 17.021049993735122, "grad_norm": 4.882714748382568, "learning_rate": 6.607717567127714e-07, "loss": 0.4858, "num_input_tokens_seen": 165207200, "step": 135845 }, { "epoch": 17.021676481643905, "grad_norm": 6.334982872009277, "learning_rate": 6.605001566819907e-07, "loss": 0.4313, "num_input_tokens_seen": 165213504, "step": 135850 }, { "epoch": 17.022302969552687, "grad_norm": 8.119375228881836, "learning_rate": 6.60228608533579e-07, "loss": 0.5101, "num_input_tokens_seen": 165219456, "step": 135855 }, { "epoch": 17.022929457461473, "grad_norm": 4.578775882720947, "learning_rate": 6.599571122707827e-07, "loss": 0.4399, "num_input_tokens_seen": 165225696, "step": 135860 }, { "epoch": 17.023555945370255, "grad_norm": 2.801077127456665, "learning_rate": 6.596856678968494e-07, "loss": 0.5196, "num_input_tokens_seen": 165231680, "step": 135865 }, { "epoch": 17.024182433279037, "grad_norm": 9.76326847076416, "learning_rate": 6.594142754150218e-07, "loss": 0.4477, "num_input_tokens_seen": 165237632, "step": 135870 }, { "epoch": 17.02480892118782, "grad_norm": 4.308030128479004, "learning_rate": 6.591429348285472e-07, "loss": 0.4065, "num_input_tokens_seen": 165243744, "step": 135875 }, { "epoch": 17.025435409096605, "grad_norm": 4.1840691566467285, "learning_rate": 6.588716461406674e-07, "loss": 0.4337, "num_input_tokens_seen": 165249632, "step": 135880 }, { "epoch": 17.026061897005388, "grad_norm": 6.289889812469482, "learning_rate": 6.586004093546277e-07, "loss": 0.4556, "num_input_tokens_seen": 165255648, "step": 135885 }, { "epoch": 17.02668838491417, "grad_norm": 9.129667282104492, "learning_rate": 6.583292244736689e-07, "loss": 0.4351, "num_input_tokens_seen": 165262048, "step": 135890 }, { "epoch": 17.027314872822956, "grad_norm": 9.427597999572754, "learning_rate": 6.580580915010348e-07, "loss": 0.4836, "num_input_tokens_seen": 165268032, "step": 135895 }, { "epoch": 17.027941360731738, "grad_norm": 6.544575214385986, "learning_rate": 6.57787010439968e-07, "loss": 0.4702, "num_input_tokens_seen": 165274400, "step": 135900 }, { "epoch": 17.02856784864052, "grad_norm": 7.490420341491699, "learning_rate": 6.575159812937071e-07, "loss": 0.4926, "num_input_tokens_seen": 165280352, "step": 135905 }, { "epoch": 17.029194336549306, "grad_norm": 4.08450984954834, "learning_rate": 6.572450040654943e-07, "loss": 0.4763, "num_input_tokens_seen": 165286208, "step": 135910 }, { "epoch": 17.02982082445809, "grad_norm": 5.428312301635742, "learning_rate": 6.569740787585677e-07, "loss": 0.4416, "num_input_tokens_seen": 165292256, "step": 135915 }, { "epoch": 17.03044731236687, "grad_norm": 10.142394065856934, "learning_rate": 6.567032053761685e-07, "loss": 0.4326, "num_input_tokens_seen": 165298656, "step": 135920 }, { "epoch": 17.031073800275653, "grad_norm": 3.1847219467163086, "learning_rate": 6.564323839215331e-07, "loss": 0.4065, "num_input_tokens_seen": 165304640, "step": 135925 }, { "epoch": 17.03170028818444, "grad_norm": 3.2500011920928955, "learning_rate": 6.561616143979016e-07, "loss": 0.4099, "num_input_tokens_seen": 165311104, "step": 135930 }, { "epoch": 17.03232677609322, "grad_norm": 21.931116104125977, "learning_rate": 6.558908968085087e-07, "loss": 0.5205, "num_input_tokens_seen": 165317344, "step": 135935 }, { "epoch": 17.032953264002003, "grad_norm": 7.9251837730407715, "learning_rate": 6.55620231156594e-07, "loss": 0.4487, "num_input_tokens_seen": 165323392, "step": 135940 }, { "epoch": 17.03357975191079, "grad_norm": 15.946759223937988, "learning_rate": 6.553496174453905e-07, "loss": 0.5016, "num_input_tokens_seen": 165329440, "step": 135945 }, { "epoch": 17.03420623981957, "grad_norm": 12.354182243347168, "learning_rate": 6.55079055678135e-07, "loss": 0.5734, "num_input_tokens_seen": 165335136, "step": 135950 }, { "epoch": 17.034832727728354, "grad_norm": 4.135675430297852, "learning_rate": 6.548085458580639e-07, "loss": 0.4171, "num_input_tokens_seen": 165341856, "step": 135955 }, { "epoch": 17.03545921563714, "grad_norm": 15.513751983642578, "learning_rate": 6.545380879884089e-07, "loss": 0.4987, "num_input_tokens_seen": 165348032, "step": 135960 }, { "epoch": 17.036085703545922, "grad_norm": 4.9372711181640625, "learning_rate": 6.542676820724042e-07, "loss": 0.4761, "num_input_tokens_seen": 165354048, "step": 135965 }, { "epoch": 17.036712191454704, "grad_norm": 9.957592964172363, "learning_rate": 6.539973281132828e-07, "loss": 0.464, "num_input_tokens_seen": 165360512, "step": 135970 }, { "epoch": 17.03733867936349, "grad_norm": 3.807208776473999, "learning_rate": 6.537270261142786e-07, "loss": 0.4078, "num_input_tokens_seen": 165366624, "step": 135975 }, { "epoch": 17.037965167272272, "grad_norm": 11.850828170776367, "learning_rate": 6.534567760786209e-07, "loss": 0.4399, "num_input_tokens_seen": 165372960, "step": 135980 }, { "epoch": 17.038591655181055, "grad_norm": 4.532702922821045, "learning_rate": 6.531865780095431e-07, "loss": 0.4319, "num_input_tokens_seen": 165378912, "step": 135985 }, { "epoch": 17.039218143089837, "grad_norm": 8.226322174072266, "learning_rate": 6.529164319102732e-07, "loss": 0.452, "num_input_tokens_seen": 165385152, "step": 135990 }, { "epoch": 17.039844630998623, "grad_norm": 5.913725852966309, "learning_rate": 6.526463377840431e-07, "loss": 0.4254, "num_input_tokens_seen": 165391104, "step": 135995 }, { "epoch": 17.040471118907405, "grad_norm": 4.282927513122559, "learning_rate": 6.523762956340802e-07, "loss": 0.4454, "num_input_tokens_seen": 165397376, "step": 136000 }, { "epoch": 17.041097606816187, "grad_norm": 8.766547203063965, "learning_rate": 6.521063054636146e-07, "loss": 0.4547, "num_input_tokens_seen": 165403520, "step": 136005 }, { "epoch": 17.041724094724973, "grad_norm": 3.0385944843292236, "learning_rate": 6.518363672758732e-07, "loss": 0.4566, "num_input_tokens_seen": 165409184, "step": 136010 }, { "epoch": 17.042350582633755, "grad_norm": 2.619884490966797, "learning_rate": 6.51566481074084e-07, "loss": 0.3977, "num_input_tokens_seen": 165415200, "step": 136015 }, { "epoch": 17.042977070542538, "grad_norm": 5.196488380432129, "learning_rate": 6.512966468614729e-07, "loss": 0.4368, "num_input_tokens_seen": 165420352, "step": 136020 }, { "epoch": 17.043603558451323, "grad_norm": 4.978334903717041, "learning_rate": 6.510268646412665e-07, "loss": 0.4598, "num_input_tokens_seen": 165426208, "step": 136025 }, { "epoch": 17.044230046360106, "grad_norm": 17.895301818847656, "learning_rate": 6.507571344166918e-07, "loss": 0.4809, "num_input_tokens_seen": 165432064, "step": 136030 }, { "epoch": 17.044856534268888, "grad_norm": 6.512283802032471, "learning_rate": 6.504874561909708e-07, "loss": 0.4478, "num_input_tokens_seen": 165438080, "step": 136035 }, { "epoch": 17.04548302217767, "grad_norm": 7.252399921417236, "learning_rate": 6.502178299673301e-07, "loss": 0.4666, "num_input_tokens_seen": 165443712, "step": 136040 }, { "epoch": 17.046109510086456, "grad_norm": 6.475400924682617, "learning_rate": 6.499482557489917e-07, "loss": 0.449, "num_input_tokens_seen": 165449664, "step": 136045 }, { "epoch": 17.04673599799524, "grad_norm": 4.662345886230469, "learning_rate": 6.496787335391796e-07, "loss": 0.4667, "num_input_tokens_seen": 165455200, "step": 136050 }, { "epoch": 17.04736248590402, "grad_norm": 19.445880889892578, "learning_rate": 6.494092633411153e-07, "loss": 0.5383, "num_input_tokens_seen": 165461312, "step": 136055 }, { "epoch": 17.047988973812807, "grad_norm": 5.327178001403809, "learning_rate": 6.491398451580205e-07, "loss": 0.4266, "num_input_tokens_seen": 165466912, "step": 136060 }, { "epoch": 17.04861546172159, "grad_norm": 5.376925945281982, "learning_rate": 6.488704789931171e-07, "loss": 0.426, "num_input_tokens_seen": 165472736, "step": 136065 }, { "epoch": 17.04924194963037, "grad_norm": 3.054175615310669, "learning_rate": 6.486011648496271e-07, "loss": 0.4231, "num_input_tokens_seen": 165478944, "step": 136070 }, { "epoch": 17.049868437539157, "grad_norm": 3.7490174770355225, "learning_rate": 6.483319027307667e-07, "loss": 0.4503, "num_input_tokens_seen": 165484416, "step": 136075 }, { "epoch": 17.05049492544794, "grad_norm": 4.346591949462891, "learning_rate": 6.480626926397571e-07, "loss": 0.4821, "num_input_tokens_seen": 165490976, "step": 136080 }, { "epoch": 17.05112141335672, "grad_norm": 4.248308181762695, "learning_rate": 6.477935345798186e-07, "loss": 0.4333, "num_input_tokens_seen": 165497248, "step": 136085 }, { "epoch": 17.051747901265507, "grad_norm": 4.45438814163208, "learning_rate": 6.475244285541659e-07, "loss": 0.4474, "num_input_tokens_seen": 165503264, "step": 136090 }, { "epoch": 17.05237438917429, "grad_norm": 5.564306735992432, "learning_rate": 6.472553745660193e-07, "loss": 0.4217, "num_input_tokens_seen": 165509344, "step": 136095 }, { "epoch": 17.053000877083072, "grad_norm": 4.93416166305542, "learning_rate": 6.469863726185932e-07, "loss": 0.5042, "num_input_tokens_seen": 165515744, "step": 136100 }, { "epoch": 17.053627364991854, "grad_norm": 10.803313255310059, "learning_rate": 6.467174227151057e-07, "loss": 0.4256, "num_input_tokens_seen": 165521728, "step": 136105 }, { "epoch": 17.05425385290064, "grad_norm": 3.61959171295166, "learning_rate": 6.464485248587709e-07, "loss": 0.4201, "num_input_tokens_seen": 165527456, "step": 136110 }, { "epoch": 17.054880340809422, "grad_norm": 20.136911392211914, "learning_rate": 6.461796790528047e-07, "loss": 0.4575, "num_input_tokens_seen": 165533536, "step": 136115 }, { "epoch": 17.055506828718205, "grad_norm": 4.17764949798584, "learning_rate": 6.459108853004204e-07, "loss": 0.4253, "num_input_tokens_seen": 165539872, "step": 136120 }, { "epoch": 17.05613331662699, "grad_norm": 2.622262954711914, "learning_rate": 6.456421436048327e-07, "loss": 0.4469, "num_input_tokens_seen": 165545952, "step": 136125 }, { "epoch": 17.056759804535773, "grad_norm": 10.984461784362793, "learning_rate": 6.453734539692536e-07, "loss": 0.41, "num_input_tokens_seen": 165551648, "step": 136130 }, { "epoch": 17.057386292444555, "grad_norm": 7.786723613739014, "learning_rate": 6.451048163968971e-07, "loss": 0.4455, "num_input_tokens_seen": 165558336, "step": 136135 }, { "epoch": 17.05801278035334, "grad_norm": 2.9101998805999756, "learning_rate": 6.448362308909722e-07, "loss": 0.4829, "num_input_tokens_seen": 165564416, "step": 136140 }, { "epoch": 17.058639268262123, "grad_norm": 11.981392860412598, "learning_rate": 6.445676974546921e-07, "loss": 0.4648, "num_input_tokens_seen": 165570272, "step": 136145 }, { "epoch": 17.059265756170905, "grad_norm": 15.503663063049316, "learning_rate": 6.442992160912682e-07, "loss": 0.51, "num_input_tokens_seen": 165575424, "step": 136150 }, { "epoch": 17.059892244079688, "grad_norm": 26.92796516418457, "learning_rate": 6.44030786803908e-07, "loss": 0.543, "num_input_tokens_seen": 165581696, "step": 136155 }, { "epoch": 17.060518731988473, "grad_norm": 4.295252323150635, "learning_rate": 6.437624095958222e-07, "loss": 0.4735, "num_input_tokens_seen": 165587840, "step": 136160 }, { "epoch": 17.061145219897256, "grad_norm": 5.623703479766846, "learning_rate": 6.434940844702203e-07, "loss": 0.4542, "num_input_tokens_seen": 165593600, "step": 136165 }, { "epoch": 17.061771707806038, "grad_norm": 2.8773183822631836, "learning_rate": 6.432258114303086e-07, "loss": 0.3875, "num_input_tokens_seen": 165599808, "step": 136170 }, { "epoch": 17.062398195714824, "grad_norm": 3.6384479999542236, "learning_rate": 6.429575904792951e-07, "loss": 0.386, "num_input_tokens_seen": 165606112, "step": 136175 }, { "epoch": 17.063024683623606, "grad_norm": 8.058882713317871, "learning_rate": 6.426894216203877e-07, "loss": 0.4139, "num_input_tokens_seen": 165612416, "step": 136180 }, { "epoch": 17.06365117153239, "grad_norm": 5.044676780700684, "learning_rate": 6.424213048567906e-07, "loss": 0.4261, "num_input_tokens_seen": 165618272, "step": 136185 }, { "epoch": 17.064277659441174, "grad_norm": 3.9905784130096436, "learning_rate": 6.421532401917119e-07, "loss": 0.4703, "num_input_tokens_seen": 165624352, "step": 136190 }, { "epoch": 17.064904147349957, "grad_norm": 24.398080825805664, "learning_rate": 6.418852276283539e-07, "loss": 0.5517, "num_input_tokens_seen": 165630528, "step": 136195 }, { "epoch": 17.06553063525874, "grad_norm": 3.6737804412841797, "learning_rate": 6.416172671699227e-07, "loss": 0.3996, "num_input_tokens_seen": 165636672, "step": 136200 }, { "epoch": 17.06615712316752, "grad_norm": 7.027325630187988, "learning_rate": 6.413493588196213e-07, "loss": 0.3977, "num_input_tokens_seen": 165642816, "step": 136205 }, { "epoch": 17.066783611076307, "grad_norm": 3.821840763092041, "learning_rate": 6.410815025806521e-07, "loss": 0.4896, "num_input_tokens_seen": 165648896, "step": 136210 }, { "epoch": 17.06741009898509, "grad_norm": 3.8814241886138916, "learning_rate": 6.408136984562197e-07, "loss": 0.4238, "num_input_tokens_seen": 165655232, "step": 136215 }, { "epoch": 17.06803658689387, "grad_norm": 14.360833168029785, "learning_rate": 6.405459464495229e-07, "loss": 0.4581, "num_input_tokens_seen": 165661792, "step": 136220 }, { "epoch": 17.068663074802657, "grad_norm": 3.9328982830047607, "learning_rate": 6.402782465637664e-07, "loss": 0.5647, "num_input_tokens_seen": 165668224, "step": 136225 }, { "epoch": 17.06928956271144, "grad_norm": 4.120890140533447, "learning_rate": 6.400105988021471e-07, "loss": 0.4121, "num_input_tokens_seen": 165674560, "step": 136230 }, { "epoch": 17.069916050620222, "grad_norm": 11.407634735107422, "learning_rate": 6.397430031678681e-07, "loss": 0.4337, "num_input_tokens_seen": 165680704, "step": 136235 }, { "epoch": 17.070542538529008, "grad_norm": 6.292044639587402, "learning_rate": 6.394754596641268e-07, "loss": 0.4342, "num_input_tokens_seen": 165686912, "step": 136240 }, { "epoch": 17.07116902643779, "grad_norm": 3.680861711502075, "learning_rate": 6.392079682941232e-07, "loss": 0.437, "num_input_tokens_seen": 165693120, "step": 136245 }, { "epoch": 17.071795514346572, "grad_norm": 6.475399017333984, "learning_rate": 6.389405290610534e-07, "loss": 0.4072, "num_input_tokens_seen": 165699072, "step": 136250 }, { "epoch": 17.072422002255358, "grad_norm": 4.004952430725098, "learning_rate": 6.386731419681175e-07, "loss": 0.4723, "num_input_tokens_seen": 165705344, "step": 136255 }, { "epoch": 17.07304849016414, "grad_norm": 5.115331172943115, "learning_rate": 6.384058070185101e-07, "loss": 0.3855, "num_input_tokens_seen": 165711200, "step": 136260 }, { "epoch": 17.073674978072923, "grad_norm": 3.8110978603363037, "learning_rate": 6.381385242154281e-07, "loss": 0.4376, "num_input_tokens_seen": 165717056, "step": 136265 }, { "epoch": 17.074301465981705, "grad_norm": 4.26186990737915, "learning_rate": 6.378712935620668e-07, "loss": 0.4738, "num_input_tokens_seen": 165723040, "step": 136270 }, { "epoch": 17.07492795389049, "grad_norm": 3.172752857208252, "learning_rate": 6.376041150616236e-07, "loss": 0.475, "num_input_tokens_seen": 165728896, "step": 136275 }, { "epoch": 17.075554441799273, "grad_norm": 6.575165748596191, "learning_rate": 6.373369887172893e-07, "loss": 0.4617, "num_input_tokens_seen": 165735328, "step": 136280 }, { "epoch": 17.076180929708055, "grad_norm": 6.426489353179932, "learning_rate": 6.370699145322595e-07, "loss": 0.4349, "num_input_tokens_seen": 165741472, "step": 136285 }, { "epoch": 17.07680741761684, "grad_norm": 6.172354221343994, "learning_rate": 6.36802892509728e-07, "loss": 0.4701, "num_input_tokens_seen": 165747200, "step": 136290 }, { "epoch": 17.077433905525623, "grad_norm": 13.498559951782227, "learning_rate": 6.36535922652885e-07, "loss": 0.4091, "num_input_tokens_seen": 165753440, "step": 136295 }, { "epoch": 17.078060393434406, "grad_norm": 5.181379318237305, "learning_rate": 6.362690049649251e-07, "loss": 0.4182, "num_input_tokens_seen": 165759744, "step": 136300 }, { "epoch": 17.07868688134319, "grad_norm": 12.146562576293945, "learning_rate": 6.360021394490368e-07, "loss": 0.4563, "num_input_tokens_seen": 165765824, "step": 136305 }, { "epoch": 17.079313369251974, "grad_norm": 2.6055407524108887, "learning_rate": 6.357353261084126e-07, "loss": 0.3973, "num_input_tokens_seen": 165772160, "step": 136310 }, { "epoch": 17.079939857160756, "grad_norm": 10.330913543701172, "learning_rate": 6.354685649462417e-07, "loss": 0.4723, "num_input_tokens_seen": 165778304, "step": 136315 }, { "epoch": 17.08056634506954, "grad_norm": 19.514225006103516, "learning_rate": 6.352018559657142e-07, "loss": 0.4652, "num_input_tokens_seen": 165783808, "step": 136320 }, { "epoch": 17.081192832978324, "grad_norm": 4.880465030670166, "learning_rate": 6.34935199170017e-07, "loss": 0.4621, "num_input_tokens_seen": 165789248, "step": 136325 }, { "epoch": 17.081819320887107, "grad_norm": 30.146343231201172, "learning_rate": 6.346685945623393e-07, "loss": 0.5354, "num_input_tokens_seen": 165795232, "step": 136330 }, { "epoch": 17.08244580879589, "grad_norm": 3.873436212539673, "learning_rate": 6.344020421458702e-07, "loss": 0.4167, "num_input_tokens_seen": 165801792, "step": 136335 }, { "epoch": 17.083072296704675, "grad_norm": 5.329395771026611, "learning_rate": 6.341355419237938e-07, "loss": 0.4186, "num_input_tokens_seen": 165807840, "step": 136340 }, { "epoch": 17.083698784613457, "grad_norm": 7.927135467529297, "learning_rate": 6.338690938992986e-07, "loss": 0.4301, "num_input_tokens_seen": 165814048, "step": 136345 }, { "epoch": 17.08432527252224, "grad_norm": 3.7858176231384277, "learning_rate": 6.336026980755683e-07, "loss": 0.4243, "num_input_tokens_seen": 165820000, "step": 136350 }, { "epoch": 17.084951760431025, "grad_norm": 15.000725746154785, "learning_rate": 6.333363544557902e-07, "loss": 0.5247, "num_input_tokens_seen": 165825856, "step": 136355 }, { "epoch": 17.085578248339807, "grad_norm": 19.363052368164062, "learning_rate": 6.330700630431458e-07, "loss": 0.4478, "num_input_tokens_seen": 165831328, "step": 136360 }, { "epoch": 17.08620473624859, "grad_norm": 3.4865944385528564, "learning_rate": 6.328038238408202e-07, "loss": 0.4434, "num_input_tokens_seen": 165837760, "step": 136365 }, { "epoch": 17.086831224157375, "grad_norm": 6.768154621124268, "learning_rate": 6.325376368519981e-07, "loss": 0.451, "num_input_tokens_seen": 165844096, "step": 136370 }, { "epoch": 17.087457712066158, "grad_norm": 5.674596309661865, "learning_rate": 6.322715020798593e-07, "loss": 0.4533, "num_input_tokens_seen": 165850048, "step": 136375 }, { "epoch": 17.08808419997494, "grad_norm": 10.621667861938477, "learning_rate": 6.320054195275866e-07, "loss": 0.4366, "num_input_tokens_seen": 165856064, "step": 136380 }, { "epoch": 17.088710687883722, "grad_norm": 8.112919807434082, "learning_rate": 6.317393891983631e-07, "loss": 0.4129, "num_input_tokens_seen": 165861920, "step": 136385 }, { "epoch": 17.089337175792508, "grad_norm": 4.328261375427246, "learning_rate": 6.314734110953663e-07, "loss": 0.5893, "num_input_tokens_seen": 165868096, "step": 136390 }, { "epoch": 17.08996366370129, "grad_norm": 18.413066864013672, "learning_rate": 6.312074852217781e-07, "loss": 0.4908, "num_input_tokens_seen": 165874272, "step": 136395 }, { "epoch": 17.090590151610073, "grad_norm": 4.693507194519043, "learning_rate": 6.309416115807781e-07, "loss": 0.4811, "num_input_tokens_seen": 165880224, "step": 136400 }, { "epoch": 17.09121663951886, "grad_norm": 7.268014907836914, "learning_rate": 6.306757901755439e-07, "loss": 0.4663, "num_input_tokens_seen": 165886368, "step": 136405 }, { "epoch": 17.09184312742764, "grad_norm": 21.48710060119629, "learning_rate": 6.304100210092545e-07, "loss": 0.502, "num_input_tokens_seen": 165892384, "step": 136410 }, { "epoch": 17.092469615336423, "grad_norm": 4.000252723693848, "learning_rate": 6.301443040850869e-07, "loss": 0.458, "num_input_tokens_seen": 165898432, "step": 136415 }, { "epoch": 17.09309610324521, "grad_norm": 4.915462017059326, "learning_rate": 6.298786394062184e-07, "loss": 0.4032, "num_input_tokens_seen": 165904544, "step": 136420 }, { "epoch": 17.09372259115399, "grad_norm": 5.519409656524658, "learning_rate": 6.296130269758244e-07, "loss": 0.445, "num_input_tokens_seen": 165910720, "step": 136425 }, { "epoch": 17.094349079062773, "grad_norm": 4.9443039894104, "learning_rate": 6.293474667970817e-07, "loss": 0.386, "num_input_tokens_seen": 165916928, "step": 136430 }, { "epoch": 17.094975566971556, "grad_norm": 18.136837005615234, "learning_rate": 6.290819588731639e-07, "loss": 0.4467, "num_input_tokens_seen": 165923104, "step": 136435 }, { "epoch": 17.09560205488034, "grad_norm": 16.877422332763672, "learning_rate": 6.288165032072474e-07, "loss": 0.4466, "num_input_tokens_seen": 165929120, "step": 136440 }, { "epoch": 17.096228542789124, "grad_norm": 21.12589454650879, "learning_rate": 6.285510998025035e-07, "loss": 0.4044, "num_input_tokens_seen": 165935136, "step": 136445 }, { "epoch": 17.096855030697906, "grad_norm": 5.410910129547119, "learning_rate": 6.282857486621063e-07, "loss": 0.457, "num_input_tokens_seen": 165941344, "step": 136450 }, { "epoch": 17.097481518606692, "grad_norm": 5.318517684936523, "learning_rate": 6.280204497892295e-07, "loss": 0.486, "num_input_tokens_seen": 165947424, "step": 136455 }, { "epoch": 17.098108006515474, "grad_norm": 5.056979656219482, "learning_rate": 6.277552031870432e-07, "loss": 0.467, "num_input_tokens_seen": 165953728, "step": 136460 }, { "epoch": 17.098734494424257, "grad_norm": 8.123931884765625, "learning_rate": 6.274900088587205e-07, "loss": 0.5022, "num_input_tokens_seen": 165959936, "step": 136465 }, { "epoch": 17.099360982333042, "grad_norm": 3.9201791286468506, "learning_rate": 6.272248668074299e-07, "loss": 0.4297, "num_input_tokens_seen": 165965824, "step": 136470 }, { "epoch": 17.099987470241825, "grad_norm": 7.2401251792907715, "learning_rate": 6.269597770363422e-07, "loss": 0.4812, "num_input_tokens_seen": 165971904, "step": 136475 }, { "epoch": 17.100613958150607, "grad_norm": 6.566042423248291, "learning_rate": 6.266947395486272e-07, "loss": 0.4795, "num_input_tokens_seen": 165978272, "step": 136480 }, { "epoch": 17.101240446059393, "grad_norm": 2.7392966747283936, "learning_rate": 6.264297543474546e-07, "loss": 0.4265, "num_input_tokens_seen": 165984512, "step": 136485 }, { "epoch": 17.101866933968175, "grad_norm": 4.201715469360352, "learning_rate": 6.2616482143599e-07, "loss": 0.4784, "num_input_tokens_seen": 165990336, "step": 136490 }, { "epoch": 17.102493421876957, "grad_norm": 3.723080635070801, "learning_rate": 6.258999408174032e-07, "loss": 0.4138, "num_input_tokens_seen": 165996192, "step": 136495 }, { "epoch": 17.10311990978574, "grad_norm": 8.78580379486084, "learning_rate": 6.256351124948595e-07, "loss": 0.4608, "num_input_tokens_seen": 166001408, "step": 136500 }, { "epoch": 17.103746397694525, "grad_norm": 5.628267765045166, "learning_rate": 6.253703364715269e-07, "loss": 0.4268, "num_input_tokens_seen": 166007680, "step": 136505 }, { "epoch": 17.104372885603308, "grad_norm": 10.861501693725586, "learning_rate": 6.251056127505689e-07, "loss": 0.4268, "num_input_tokens_seen": 166012672, "step": 136510 }, { "epoch": 17.10499937351209, "grad_norm": 5.302073001861572, "learning_rate": 6.248409413351514e-07, "loss": 0.4166, "num_input_tokens_seen": 166018816, "step": 136515 }, { "epoch": 17.105625861420876, "grad_norm": 25.216655731201172, "learning_rate": 6.245763222284396e-07, "loss": 0.5135, "num_input_tokens_seen": 166024352, "step": 136520 }, { "epoch": 17.106252349329658, "grad_norm": 11.046109199523926, "learning_rate": 6.243117554335959e-07, "loss": 0.4759, "num_input_tokens_seen": 166030656, "step": 136525 }, { "epoch": 17.10687883723844, "grad_norm": 4.7144246101379395, "learning_rate": 6.240472409537851e-07, "loss": 0.4733, "num_input_tokens_seen": 166037024, "step": 136530 }, { "epoch": 17.107505325147226, "grad_norm": 4.144355773925781, "learning_rate": 6.237827787921674e-07, "loss": 0.4206, "num_input_tokens_seen": 166043072, "step": 136535 }, { "epoch": 17.10813181305601, "grad_norm": 3.0999138355255127, "learning_rate": 6.23518368951907e-07, "loss": 0.3971, "num_input_tokens_seen": 166049408, "step": 136540 }, { "epoch": 17.10875830096479, "grad_norm": 7.209519863128662, "learning_rate": 6.23254011436163e-07, "loss": 0.6261, "num_input_tokens_seen": 166055712, "step": 136545 }, { "epoch": 17.109384788873573, "grad_norm": 14.384584426879883, "learning_rate": 6.229897062480977e-07, "loss": 0.5136, "num_input_tokens_seen": 166061856, "step": 136550 }, { "epoch": 17.11001127678236, "grad_norm": 6.527169227600098, "learning_rate": 6.2272545339087e-07, "loss": 0.491, "num_input_tokens_seen": 166067648, "step": 136555 }, { "epoch": 17.11063776469114, "grad_norm": 4.730278491973877, "learning_rate": 6.224612528676404e-07, "loss": 0.4021, "num_input_tokens_seen": 166073184, "step": 136560 }, { "epoch": 17.111264252599923, "grad_norm": 16.267065048217773, "learning_rate": 6.221971046815661e-07, "loss": 0.4844, "num_input_tokens_seen": 166079488, "step": 136565 }, { "epoch": 17.11189074050871, "grad_norm": 3.762662887573242, "learning_rate": 6.219330088358061e-07, "loss": 0.3723, "num_input_tokens_seen": 166085216, "step": 136570 }, { "epoch": 17.11251722841749, "grad_norm": 15.311256408691406, "learning_rate": 6.216689653335184e-07, "loss": 0.6254, "num_input_tokens_seen": 166091200, "step": 136575 }, { "epoch": 17.113143716326274, "grad_norm": 10.48734188079834, "learning_rate": 6.214049741778588e-07, "loss": 0.4535, "num_input_tokens_seen": 166096928, "step": 136580 }, { "epoch": 17.11377020423506, "grad_norm": 4.813714981079102, "learning_rate": 6.211410353719838e-07, "loss": 0.4611, "num_input_tokens_seen": 166103232, "step": 136585 }, { "epoch": 17.114396692143842, "grad_norm": 15.432817459106445, "learning_rate": 6.208771489190496e-07, "loss": 0.4383, "num_input_tokens_seen": 166109600, "step": 136590 }, { "epoch": 17.115023180052624, "grad_norm": 8.963129997253418, "learning_rate": 6.206133148222115e-07, "loss": 0.4215, "num_input_tokens_seen": 166115712, "step": 136595 }, { "epoch": 17.11564966796141, "grad_norm": 17.633256912231445, "learning_rate": 6.203495330846226e-07, "loss": 0.4711, "num_input_tokens_seen": 166121760, "step": 136600 }, { "epoch": 17.116276155870192, "grad_norm": 18.31960678100586, "learning_rate": 6.200858037094376e-07, "loss": 0.524, "num_input_tokens_seen": 166127968, "step": 136605 }, { "epoch": 17.116902643778975, "grad_norm": 5.974833965301514, "learning_rate": 6.198221266998089e-07, "loss": 0.4393, "num_input_tokens_seen": 166134336, "step": 136610 }, { "epoch": 17.117529131687757, "grad_norm": 25.594358444213867, "learning_rate": 6.195585020588902e-07, "loss": 0.4874, "num_input_tokens_seen": 166140160, "step": 136615 }, { "epoch": 17.118155619596543, "grad_norm": 2.9405014514923096, "learning_rate": 6.19294929789831e-07, "loss": 0.4063, "num_input_tokens_seen": 166146272, "step": 136620 }, { "epoch": 17.118782107505325, "grad_norm": 6.640766620635986, "learning_rate": 6.190314098957856e-07, "loss": 0.4578, "num_input_tokens_seen": 166152384, "step": 136625 }, { "epoch": 17.119408595414107, "grad_norm": 5.2258429527282715, "learning_rate": 6.18767942379902e-07, "loss": 0.3812, "num_input_tokens_seen": 166158336, "step": 136630 }, { "epoch": 17.120035083322893, "grad_norm": 6.217926025390625, "learning_rate": 6.185045272453317e-07, "loss": 0.437, "num_input_tokens_seen": 166164608, "step": 136635 }, { "epoch": 17.120661571231675, "grad_norm": 19.55069351196289, "learning_rate": 6.18241164495223e-07, "loss": 0.5345, "num_input_tokens_seen": 166170176, "step": 136640 }, { "epoch": 17.121288059140458, "grad_norm": 7.276327610015869, "learning_rate": 6.179778541327247e-07, "loss": 0.573, "num_input_tokens_seen": 166176480, "step": 136645 }, { "epoch": 17.121914547049244, "grad_norm": 4.134303092956543, "learning_rate": 6.177145961609865e-07, "loss": 0.4548, "num_input_tokens_seen": 166182528, "step": 136650 }, { "epoch": 17.122541034958026, "grad_norm": 9.020938873291016, "learning_rate": 6.174513905831542e-07, "loss": 0.4642, "num_input_tokens_seen": 166188256, "step": 136655 }, { "epoch": 17.123167522866808, "grad_norm": 4.17869234085083, "learning_rate": 6.171882374023758e-07, "loss": 0.4801, "num_input_tokens_seen": 166194240, "step": 136660 }, { "epoch": 17.12379401077559, "grad_norm": 9.428306579589844, "learning_rate": 6.16925136621796e-07, "loss": 0.4427, "num_input_tokens_seen": 166200160, "step": 136665 }, { "epoch": 17.124420498684376, "grad_norm": 9.135997772216797, "learning_rate": 6.166620882445618e-07, "loss": 0.503, "num_input_tokens_seen": 166206048, "step": 136670 }, { "epoch": 17.12504698659316, "grad_norm": 14.24485969543457, "learning_rate": 6.163990922738172e-07, "loss": 0.4312, "num_input_tokens_seen": 166212288, "step": 136675 }, { "epoch": 17.12567347450194, "grad_norm": 12.432127952575684, "learning_rate": 6.161361487127065e-07, "loss": 0.4535, "num_input_tokens_seen": 166218336, "step": 136680 }, { "epoch": 17.126299962410727, "grad_norm": 5.946043968200684, "learning_rate": 6.158732575643744e-07, "loss": 0.4613, "num_input_tokens_seen": 166224448, "step": 136685 }, { "epoch": 17.12692645031951, "grad_norm": 10.818170547485352, "learning_rate": 6.156104188319645e-07, "loss": 0.5109, "num_input_tokens_seen": 166230528, "step": 136690 }, { "epoch": 17.12755293822829, "grad_norm": 5.119533538818359, "learning_rate": 6.153476325186169e-07, "loss": 0.3875, "num_input_tokens_seen": 166236576, "step": 136695 }, { "epoch": 17.128179426137077, "grad_norm": 9.750298500061035, "learning_rate": 6.150848986274749e-07, "loss": 0.4126, "num_input_tokens_seen": 166242624, "step": 136700 }, { "epoch": 17.12880591404586, "grad_norm": 8.13182544708252, "learning_rate": 6.148222171616802e-07, "loss": 0.4742, "num_input_tokens_seen": 166248608, "step": 136705 }, { "epoch": 17.12943240195464, "grad_norm": 4.243578910827637, "learning_rate": 6.145595881243721e-07, "loss": 0.4181, "num_input_tokens_seen": 166254624, "step": 136710 }, { "epoch": 17.130058889863427, "grad_norm": 4.713190078735352, "learning_rate": 6.142970115186924e-07, "loss": 0.3874, "num_input_tokens_seen": 166260896, "step": 136715 }, { "epoch": 17.13068537777221, "grad_norm": 6.072312831878662, "learning_rate": 6.140344873477777e-07, "loss": 0.4731, "num_input_tokens_seen": 166266688, "step": 136720 }, { "epoch": 17.131311865680992, "grad_norm": 20.41132164001465, "learning_rate": 6.137720156147697e-07, "loss": 0.4621, "num_input_tokens_seen": 166272864, "step": 136725 }, { "epoch": 17.131938353589774, "grad_norm": 14.683667182922363, "learning_rate": 6.135095963228038e-07, "loss": 0.4832, "num_input_tokens_seen": 166278528, "step": 136730 }, { "epoch": 17.13256484149856, "grad_norm": 11.260990142822266, "learning_rate": 6.132472294750197e-07, "loss": 0.4557, "num_input_tokens_seen": 166284832, "step": 136735 }, { "epoch": 17.133191329407342, "grad_norm": 12.814542770385742, "learning_rate": 6.12984915074552e-07, "loss": 0.5033, "num_input_tokens_seen": 166291040, "step": 136740 }, { "epoch": 17.133817817316125, "grad_norm": 5.935450553894043, "learning_rate": 6.127226531245395e-07, "loss": 0.4956, "num_input_tokens_seen": 166297088, "step": 136745 }, { "epoch": 17.13444430522491, "grad_norm": 6.006180763244629, "learning_rate": 6.124604436281146e-07, "loss": 0.4435, "num_input_tokens_seen": 166302944, "step": 136750 }, { "epoch": 17.135070793133693, "grad_norm": 3.665959596633911, "learning_rate": 6.121982865884152e-07, "loss": 0.4598, "num_input_tokens_seen": 166309408, "step": 136755 }, { "epoch": 17.135697281042475, "grad_norm": 2.480618476867676, "learning_rate": 6.119361820085734e-07, "loss": 0.4161, "num_input_tokens_seen": 166315360, "step": 136760 }, { "epoch": 17.13632376895126, "grad_norm": 16.52138900756836, "learning_rate": 6.11674129891724e-07, "loss": 0.444, "num_input_tokens_seen": 166321408, "step": 136765 }, { "epoch": 17.136950256860043, "grad_norm": 7.6656012535095215, "learning_rate": 6.114121302410009e-07, "loss": 0.4697, "num_input_tokens_seen": 166327136, "step": 136770 }, { "epoch": 17.137576744768825, "grad_norm": 5.241622447967529, "learning_rate": 6.11150183059534e-07, "loss": 0.5922, "num_input_tokens_seen": 166332864, "step": 136775 }, { "epoch": 17.138203232677608, "grad_norm": 17.573633193969727, "learning_rate": 6.108882883504568e-07, "loss": 0.521, "num_input_tokens_seen": 166338848, "step": 136780 }, { "epoch": 17.138829720586394, "grad_norm": 9.324416160583496, "learning_rate": 6.106264461169015e-07, "loss": 0.4827, "num_input_tokens_seen": 166344800, "step": 136785 }, { "epoch": 17.139456208495176, "grad_norm": 11.100388526916504, "learning_rate": 6.103646563619964e-07, "loss": 0.4398, "num_input_tokens_seen": 166350752, "step": 136790 }, { "epoch": 17.140082696403958, "grad_norm": 8.253196716308594, "learning_rate": 6.101029190888724e-07, "loss": 0.4476, "num_input_tokens_seen": 166356672, "step": 136795 }, { "epoch": 17.140709184312744, "grad_norm": 8.80935287475586, "learning_rate": 6.098412343006599e-07, "loss": 0.4621, "num_input_tokens_seen": 166362720, "step": 136800 }, { "epoch": 17.141335672221526, "grad_norm": 10.87740707397461, "learning_rate": 6.095796020004852e-07, "loss": 0.4041, "num_input_tokens_seen": 166368736, "step": 136805 }, { "epoch": 17.14196216013031, "grad_norm": 4.608487129211426, "learning_rate": 6.093180221914791e-07, "loss": 0.3871, "num_input_tokens_seen": 166374944, "step": 136810 }, { "epoch": 17.142588648039094, "grad_norm": 10.898780822753906, "learning_rate": 6.090564948767663e-07, "loss": 0.46, "num_input_tokens_seen": 166381376, "step": 136815 }, { "epoch": 17.143215135947877, "grad_norm": 12.041606903076172, "learning_rate": 6.087950200594755e-07, "loss": 0.4518, "num_input_tokens_seen": 166387424, "step": 136820 }, { "epoch": 17.14384162385666, "grad_norm": 4.668090343475342, "learning_rate": 6.085335977427315e-07, "loss": 0.4486, "num_input_tokens_seen": 166393728, "step": 136825 }, { "epoch": 17.14446811176544, "grad_norm": 5.648159027099609, "learning_rate": 6.082722279296605e-07, "loss": 0.4376, "num_input_tokens_seen": 166400064, "step": 136830 }, { "epoch": 17.145094599674227, "grad_norm": 5.061699867248535, "learning_rate": 6.080109106233878e-07, "loss": 0.4184, "num_input_tokens_seen": 166406528, "step": 136835 }, { "epoch": 17.14572108758301, "grad_norm": 5.146949291229248, "learning_rate": 6.077496458270372e-07, "loss": 0.4096, "num_input_tokens_seen": 166412768, "step": 136840 }, { "epoch": 17.14634757549179, "grad_norm": 14.716392517089844, "learning_rate": 6.074884335437326e-07, "loss": 0.4346, "num_input_tokens_seen": 166418784, "step": 136845 }, { "epoch": 17.146974063400577, "grad_norm": 4.063816070556641, "learning_rate": 6.07227273776596e-07, "loss": 0.4237, "num_input_tokens_seen": 166424960, "step": 136850 }, { "epoch": 17.14760055130936, "grad_norm": 3.7004942893981934, "learning_rate": 6.069661665287518e-07, "loss": 0.4989, "num_input_tokens_seen": 166430464, "step": 136855 }, { "epoch": 17.148227039218142, "grad_norm": 3.930548667907715, "learning_rate": 6.06705111803319e-07, "loss": 0.4401, "num_input_tokens_seen": 166436480, "step": 136860 }, { "epoch": 17.148853527126928, "grad_norm": 4.4557881355285645, "learning_rate": 6.064441096034218e-07, "loss": 0.4713, "num_input_tokens_seen": 166442368, "step": 136865 }, { "epoch": 17.14948001503571, "grad_norm": 4.727822303771973, "learning_rate": 6.061831599321782e-07, "loss": 0.4015, "num_input_tokens_seen": 166448032, "step": 136870 }, { "epoch": 17.150106502944492, "grad_norm": 5.33003044128418, "learning_rate": 6.059222627927097e-07, "loss": 0.4181, "num_input_tokens_seen": 166454048, "step": 136875 }, { "epoch": 17.150732990853278, "grad_norm": 13.70350456237793, "learning_rate": 6.056614181881343e-07, "loss": 0.4549, "num_input_tokens_seen": 166459840, "step": 136880 }, { "epoch": 17.15135947876206, "grad_norm": 4.012731075286865, "learning_rate": 6.054006261215706e-07, "loss": 0.4372, "num_input_tokens_seen": 166465888, "step": 136885 }, { "epoch": 17.151985966670843, "grad_norm": 16.635774612426758, "learning_rate": 6.051398865961378e-07, "loss": 0.5174, "num_input_tokens_seen": 166472032, "step": 136890 }, { "epoch": 17.152612454579625, "grad_norm": 3.88358998298645, "learning_rate": 6.048791996149534e-07, "loss": 0.41, "num_input_tokens_seen": 166478496, "step": 136895 }, { "epoch": 17.15323894248841, "grad_norm": 9.953657150268555, "learning_rate": 6.046185651811321e-07, "loss": 0.419, "num_input_tokens_seen": 166484512, "step": 136900 }, { "epoch": 17.153865430397193, "grad_norm": 9.305266380310059, "learning_rate": 6.043579832977919e-07, "loss": 0.4519, "num_input_tokens_seen": 166490528, "step": 136905 }, { "epoch": 17.154491918305975, "grad_norm": 5.461354732513428, "learning_rate": 6.040974539680483e-07, "loss": 0.4276, "num_input_tokens_seen": 166496480, "step": 136910 }, { "epoch": 17.15511840621476, "grad_norm": 5.704992294311523, "learning_rate": 6.038369771950142e-07, "loss": 0.4235, "num_input_tokens_seen": 166502816, "step": 136915 }, { "epoch": 17.155744894123544, "grad_norm": 4.111382007598877, "learning_rate": 6.035765529818066e-07, "loss": 0.4233, "num_input_tokens_seen": 166508832, "step": 136920 }, { "epoch": 17.156371382032326, "grad_norm": 4.69581937789917, "learning_rate": 6.033161813315364e-07, "loss": 0.3949, "num_input_tokens_seen": 166514912, "step": 136925 }, { "epoch": 17.15699786994111, "grad_norm": 5.236852645874023, "learning_rate": 6.030558622473193e-07, "loss": 0.5096, "num_input_tokens_seen": 166521280, "step": 136930 }, { "epoch": 17.157624357849894, "grad_norm": 4.669739723205566, "learning_rate": 6.027955957322645e-07, "loss": 0.5217, "num_input_tokens_seen": 166527328, "step": 136935 }, { "epoch": 17.158250845758676, "grad_norm": 4.2602410316467285, "learning_rate": 6.025353817894863e-07, "loss": 0.4495, "num_input_tokens_seen": 166533376, "step": 136940 }, { "epoch": 17.15887733366746, "grad_norm": 3.4610238075256348, "learning_rate": 6.022752204220938e-07, "loss": 0.4792, "num_input_tokens_seen": 166539520, "step": 136945 }, { "epoch": 17.159503821576244, "grad_norm": 2.924769878387451, "learning_rate": 6.02015111633199e-07, "loss": 0.4208, "num_input_tokens_seen": 166545120, "step": 136950 }, { "epoch": 17.160130309485027, "grad_norm": 4.588968753814697, "learning_rate": 6.017550554259116e-07, "loss": 0.4456, "num_input_tokens_seen": 166551392, "step": 136955 }, { "epoch": 17.16075679739381, "grad_norm": 13.987908363342285, "learning_rate": 6.014950518033396e-07, "loss": 0.4705, "num_input_tokens_seen": 166557536, "step": 136960 }, { "epoch": 17.161383285302595, "grad_norm": 13.302569389343262, "learning_rate": 6.012351007685935e-07, "loss": 0.4876, "num_input_tokens_seen": 166563744, "step": 136965 }, { "epoch": 17.162009773211377, "grad_norm": 23.183956146240234, "learning_rate": 6.009752023247789e-07, "loss": 0.4901, "num_input_tokens_seen": 166569184, "step": 136970 }, { "epoch": 17.16263626112016, "grad_norm": 6.229264259338379, "learning_rate": 6.007153564750051e-07, "loss": 0.4087, "num_input_tokens_seen": 166575232, "step": 136975 }, { "epoch": 17.163262749028945, "grad_norm": 6.635953903198242, "learning_rate": 6.004555632223768e-07, "loss": 0.4109, "num_input_tokens_seen": 166581312, "step": 136980 }, { "epoch": 17.163889236937727, "grad_norm": 2.3463032245635986, "learning_rate": 6.001958225700017e-07, "loss": 0.465, "num_input_tokens_seen": 166587712, "step": 136985 }, { "epoch": 17.16451572484651, "grad_norm": 4.286516189575195, "learning_rate": 5.999361345209854e-07, "loss": 0.4611, "num_input_tokens_seen": 166593760, "step": 136990 }, { "epoch": 17.165142212755296, "grad_norm": 6.7066426277160645, "learning_rate": 5.996764990784304e-07, "loss": 0.4027, "num_input_tokens_seen": 166599872, "step": 136995 }, { "epoch": 17.165768700664078, "grad_norm": 16.876672744750977, "learning_rate": 5.994169162454432e-07, "loss": 0.5604, "num_input_tokens_seen": 166606048, "step": 137000 }, { "epoch": 17.16639518857286, "grad_norm": 5.396038055419922, "learning_rate": 5.991573860251277e-07, "loss": 0.4676, "num_input_tokens_seen": 166611968, "step": 137005 }, { "epoch": 17.167021676481642, "grad_norm": 6.552663803100586, "learning_rate": 5.988979084205838e-07, "loss": 0.4419, "num_input_tokens_seen": 166618016, "step": 137010 }, { "epoch": 17.167648164390428, "grad_norm": 15.305192947387695, "learning_rate": 5.986384834349163e-07, "loss": 0.4256, "num_input_tokens_seen": 166623456, "step": 137015 }, { "epoch": 17.16827465229921, "grad_norm": 4.694634914398193, "learning_rate": 5.983791110712273e-07, "loss": 0.4182, "num_input_tokens_seen": 166629472, "step": 137020 }, { "epoch": 17.168901140207993, "grad_norm": 7.562679290771484, "learning_rate": 5.981197913326154e-07, "loss": 0.4512, "num_input_tokens_seen": 166635584, "step": 137025 }, { "epoch": 17.16952762811678, "grad_norm": 5.745554447174072, "learning_rate": 5.978605242221835e-07, "loss": 0.497, "num_input_tokens_seen": 166641920, "step": 137030 }, { "epoch": 17.17015411602556, "grad_norm": 3.767878770828247, "learning_rate": 5.976013097430288e-07, "loss": 0.417, "num_input_tokens_seen": 166648256, "step": 137035 }, { "epoch": 17.170780603934343, "grad_norm": 10.757964134216309, "learning_rate": 5.973421478982533e-07, "loss": 0.4834, "num_input_tokens_seen": 166654208, "step": 137040 }, { "epoch": 17.17140709184313, "grad_norm": 4.13840913772583, "learning_rate": 5.970830386909527e-07, "loss": 0.407, "num_input_tokens_seen": 166660352, "step": 137045 }, { "epoch": 17.17203357975191, "grad_norm": 4.667991638183594, "learning_rate": 5.968239821242266e-07, "loss": 0.4041, "num_input_tokens_seen": 166666272, "step": 137050 }, { "epoch": 17.172660067660694, "grad_norm": 10.595033645629883, "learning_rate": 5.965649782011712e-07, "loss": 0.4409, "num_input_tokens_seen": 166672320, "step": 137055 }, { "epoch": 17.173286555569476, "grad_norm": 4.775176525115967, "learning_rate": 5.96306026924885e-07, "loss": 0.416, "num_input_tokens_seen": 166678240, "step": 137060 }, { "epoch": 17.17391304347826, "grad_norm": 9.17395305633545, "learning_rate": 5.960471282984614e-07, "loss": 0.4189, "num_input_tokens_seen": 166684416, "step": 137065 }, { "epoch": 17.174539531387044, "grad_norm": 25.499801635742188, "learning_rate": 5.957882823249971e-07, "loss": 0.4561, "num_input_tokens_seen": 166690304, "step": 137070 }, { "epoch": 17.175166019295826, "grad_norm": 7.682519435882568, "learning_rate": 5.95529489007588e-07, "loss": 0.3922, "num_input_tokens_seen": 166696544, "step": 137075 }, { "epoch": 17.175792507204612, "grad_norm": 4.1689653396606445, "learning_rate": 5.952707483493253e-07, "loss": 0.4662, "num_input_tokens_seen": 166702528, "step": 137080 }, { "epoch": 17.176418995113394, "grad_norm": 10.019991874694824, "learning_rate": 5.950120603533055e-07, "loss": 0.4233, "num_input_tokens_seen": 166708736, "step": 137085 }, { "epoch": 17.177045483022177, "grad_norm": 14.654836654663086, "learning_rate": 5.947534250226189e-07, "loss": 0.4883, "num_input_tokens_seen": 166715040, "step": 137090 }, { "epoch": 17.177671970930962, "grad_norm": 7.06631326675415, "learning_rate": 5.944948423603592e-07, "loss": 0.4398, "num_input_tokens_seen": 166720800, "step": 137095 }, { "epoch": 17.178298458839745, "grad_norm": 3.8616926670074463, "learning_rate": 5.942363123696171e-07, "loss": 0.3911, "num_input_tokens_seen": 166727104, "step": 137100 }, { "epoch": 17.178924946748527, "grad_norm": 4.243505954742432, "learning_rate": 5.939778350534853e-07, "loss": 0.4719, "num_input_tokens_seen": 166733184, "step": 137105 }, { "epoch": 17.179551434657313, "grad_norm": 5.867673397064209, "learning_rate": 5.937194104150518e-07, "loss": 0.4677, "num_input_tokens_seen": 166739328, "step": 137110 }, { "epoch": 17.180177922566095, "grad_norm": 10.783666610717773, "learning_rate": 5.934610384574085e-07, "loss": 0.4865, "num_input_tokens_seen": 166744896, "step": 137115 }, { "epoch": 17.180804410474877, "grad_norm": 17.218339920043945, "learning_rate": 5.93202719183642e-07, "loss": 0.5112, "num_input_tokens_seen": 166750848, "step": 137120 }, { "epoch": 17.18143089838366, "grad_norm": 3.5023751258850098, "learning_rate": 5.929444525968431e-07, "loss": 0.4252, "num_input_tokens_seen": 166756960, "step": 137125 }, { "epoch": 17.182057386292445, "grad_norm": 11.504292488098145, "learning_rate": 5.926862387000975e-07, "loss": 0.4708, "num_input_tokens_seen": 166763264, "step": 137130 }, { "epoch": 17.182683874201228, "grad_norm": 7.439836502075195, "learning_rate": 5.924280774964936e-07, "loss": 0.4467, "num_input_tokens_seen": 166769376, "step": 137135 }, { "epoch": 17.18331036211001, "grad_norm": 3.047852039337158, "learning_rate": 5.921699689891186e-07, "loss": 0.4774, "num_input_tokens_seen": 166775296, "step": 137140 }, { "epoch": 17.183936850018796, "grad_norm": 4.070349216461182, "learning_rate": 5.919119131810569e-07, "loss": 0.4551, "num_input_tokens_seen": 166780800, "step": 137145 }, { "epoch": 17.184563337927578, "grad_norm": 5.600231170654297, "learning_rate": 5.916539100753948e-07, "loss": 0.4138, "num_input_tokens_seen": 166786112, "step": 137150 }, { "epoch": 17.18518982583636, "grad_norm": 3.613039970397949, "learning_rate": 5.913959596752156e-07, "loss": 0.4553, "num_input_tokens_seen": 166791904, "step": 137155 }, { "epoch": 17.185816313745146, "grad_norm": 5.503023624420166, "learning_rate": 5.911380619836054e-07, "loss": 0.4866, "num_input_tokens_seen": 166798112, "step": 137160 }, { "epoch": 17.18644280165393, "grad_norm": 20.25609588623047, "learning_rate": 5.908802170036454e-07, "loss": 0.477, "num_input_tokens_seen": 166804352, "step": 137165 }, { "epoch": 17.18706928956271, "grad_norm": 3.9008359909057617, "learning_rate": 5.906224247384207e-07, "loss": 0.3957, "num_input_tokens_seen": 166810848, "step": 137170 }, { "epoch": 17.187695777471493, "grad_norm": 15.970471382141113, "learning_rate": 5.903646851910105e-07, "loss": 0.4747, "num_input_tokens_seen": 166816864, "step": 137175 }, { "epoch": 17.18832226538028, "grad_norm": 4.9643168449401855, "learning_rate": 5.901069983644997e-07, "loss": 0.4183, "num_input_tokens_seen": 166822912, "step": 137180 }, { "epoch": 17.18894875328906, "grad_norm": 5.295205593109131, "learning_rate": 5.898493642619657e-07, "loss": 0.4081, "num_input_tokens_seen": 166829280, "step": 137185 }, { "epoch": 17.189575241197844, "grad_norm": 13.09340763092041, "learning_rate": 5.895917828864906e-07, "loss": 0.4912, "num_input_tokens_seen": 166835456, "step": 137190 }, { "epoch": 17.19020172910663, "grad_norm": 17.556217193603516, "learning_rate": 5.893342542411551e-07, "loss": 0.5074, "num_input_tokens_seen": 166841664, "step": 137195 }, { "epoch": 17.19082821701541, "grad_norm": 7.630196571350098, "learning_rate": 5.890767783290352e-07, "loss": 0.4496, "num_input_tokens_seen": 166848224, "step": 137200 }, { "epoch": 17.191454704924194, "grad_norm": 6.184897422790527, "learning_rate": 5.888193551532112e-07, "loss": 0.4285, "num_input_tokens_seen": 166853792, "step": 137205 }, { "epoch": 17.19208119283298, "grad_norm": 8.72315788269043, "learning_rate": 5.885619847167607e-07, "loss": 0.4515, "num_input_tokens_seen": 166859712, "step": 137210 }, { "epoch": 17.192707680741762, "grad_norm": 9.363895416259766, "learning_rate": 5.883046670227616e-07, "loss": 0.4124, "num_input_tokens_seen": 166865952, "step": 137215 }, { "epoch": 17.193334168650544, "grad_norm": 5.391444683074951, "learning_rate": 5.880474020742882e-07, "loss": 0.5436, "num_input_tokens_seen": 166872192, "step": 137220 }, { "epoch": 17.19396065655933, "grad_norm": 6.818904399871826, "learning_rate": 5.877901898744187e-07, "loss": 0.4545, "num_input_tokens_seen": 166878400, "step": 137225 }, { "epoch": 17.194587144468112, "grad_norm": 25.14707374572754, "learning_rate": 5.875330304262261e-07, "loss": 0.4611, "num_input_tokens_seen": 166884640, "step": 137230 }, { "epoch": 17.195213632376895, "grad_norm": 16.71019744873047, "learning_rate": 5.872759237327868e-07, "loss": 0.5125, "num_input_tokens_seen": 166890368, "step": 137235 }, { "epoch": 17.195840120285677, "grad_norm": 5.240744113922119, "learning_rate": 5.870188697971729e-07, "loss": 0.4302, "num_input_tokens_seen": 166896384, "step": 137240 }, { "epoch": 17.196466608194463, "grad_norm": 10.067182540893555, "learning_rate": 5.867618686224591e-07, "loss": 0.4663, "num_input_tokens_seen": 166902368, "step": 137245 }, { "epoch": 17.197093096103245, "grad_norm": 29.962669372558594, "learning_rate": 5.865049202117173e-07, "loss": 0.5642, "num_input_tokens_seen": 166908288, "step": 137250 }, { "epoch": 17.197719584012027, "grad_norm": 5.083534240722656, "learning_rate": 5.862480245680197e-07, "loss": 0.4175, "num_input_tokens_seen": 166914496, "step": 137255 }, { "epoch": 17.198346071920813, "grad_norm": 4.946244716644287, "learning_rate": 5.859911816944385e-07, "loss": 0.4471, "num_input_tokens_seen": 166920448, "step": 137260 }, { "epoch": 17.198972559829595, "grad_norm": 4.509218692779541, "learning_rate": 5.857343915940434e-07, "loss": 0.4783, "num_input_tokens_seen": 166926688, "step": 137265 }, { "epoch": 17.199599047738378, "grad_norm": 6.719784259796143, "learning_rate": 5.854776542699053e-07, "loss": 0.4224, "num_input_tokens_seen": 166932352, "step": 137270 }, { "epoch": 17.200225535647164, "grad_norm": 5.867454528808594, "learning_rate": 5.852209697250927e-07, "loss": 0.4794, "num_input_tokens_seen": 166938368, "step": 137275 }, { "epoch": 17.200852023555946, "grad_norm": 30.64972496032715, "learning_rate": 5.849643379626763e-07, "loss": 0.5039, "num_input_tokens_seen": 166944832, "step": 137280 }, { "epoch": 17.201478511464728, "grad_norm": 8.672270774841309, "learning_rate": 5.84707758985722e-07, "loss": 0.397, "num_input_tokens_seen": 166950816, "step": 137285 }, { "epoch": 17.20210499937351, "grad_norm": 6.203240394592285, "learning_rate": 5.844512327972995e-07, "loss": 0.452, "num_input_tokens_seen": 166957376, "step": 137290 }, { "epoch": 17.202731487282296, "grad_norm": 5.4606804847717285, "learning_rate": 5.841947594004743e-07, "loss": 0.51, "num_input_tokens_seen": 166963776, "step": 137295 }, { "epoch": 17.20335797519108, "grad_norm": 11.338315963745117, "learning_rate": 5.839383387983128e-07, "loss": 0.4526, "num_input_tokens_seen": 166969760, "step": 137300 }, { "epoch": 17.20398446309986, "grad_norm": 4.045825481414795, "learning_rate": 5.836819709938818e-07, "loss": 0.4685, "num_input_tokens_seen": 166975904, "step": 137305 }, { "epoch": 17.204610951008647, "grad_norm": 15.245992660522461, "learning_rate": 5.834256559902468e-07, "loss": 0.4122, "num_input_tokens_seen": 166981856, "step": 137310 }, { "epoch": 17.20523743891743, "grad_norm": 8.294427871704102, "learning_rate": 5.831693937904698e-07, "loss": 0.4646, "num_input_tokens_seen": 166987584, "step": 137315 }, { "epoch": 17.20586392682621, "grad_norm": 2.846233367919922, "learning_rate": 5.829131843976166e-07, "loss": 0.4223, "num_input_tokens_seen": 166993088, "step": 137320 }, { "epoch": 17.206490414734997, "grad_norm": 4.593052864074707, "learning_rate": 5.826570278147508e-07, "loss": 0.4479, "num_input_tokens_seen": 166997760, "step": 137325 }, { "epoch": 17.20711690264378, "grad_norm": 4.124207973480225, "learning_rate": 5.824009240449335e-07, "loss": 0.4327, "num_input_tokens_seen": 167003712, "step": 137330 }, { "epoch": 17.20774339055256, "grad_norm": 5.072921276092529, "learning_rate": 5.821448730912277e-07, "loss": 0.4473, "num_input_tokens_seen": 167009568, "step": 137335 }, { "epoch": 17.208369878461347, "grad_norm": 5.266751289367676, "learning_rate": 5.818888749566936e-07, "loss": 0.4193, "num_input_tokens_seen": 167015616, "step": 137340 }, { "epoch": 17.20899636637013, "grad_norm": 6.730984687805176, "learning_rate": 5.81632929644394e-07, "loss": 0.4934, "num_input_tokens_seen": 167021856, "step": 137345 }, { "epoch": 17.209622854278912, "grad_norm": 5.311814308166504, "learning_rate": 5.813770371573857e-07, "loss": 0.4726, "num_input_tokens_seen": 167028096, "step": 137350 }, { "epoch": 17.210249342187694, "grad_norm": 8.079096794128418, "learning_rate": 5.811211974987313e-07, "loss": 0.3647, "num_input_tokens_seen": 167034496, "step": 137355 }, { "epoch": 17.21087583009648, "grad_norm": 3.5544073581695557, "learning_rate": 5.808654106714873e-07, "loss": 0.4307, "num_input_tokens_seen": 167040448, "step": 137360 }, { "epoch": 17.211502318005262, "grad_norm": 8.231424331665039, "learning_rate": 5.806096766787134e-07, "loss": 0.5548, "num_input_tokens_seen": 167046912, "step": 137365 }, { "epoch": 17.212128805914045, "grad_norm": 6.579805850982666, "learning_rate": 5.803539955234655e-07, "loss": 0.4593, "num_input_tokens_seen": 167052672, "step": 137370 }, { "epoch": 17.21275529382283, "grad_norm": 5.111191272735596, "learning_rate": 5.800983672088023e-07, "loss": 0.4201, "num_input_tokens_seen": 167058944, "step": 137375 }, { "epoch": 17.213381781731613, "grad_norm": 4.800299167633057, "learning_rate": 5.79842791737778e-07, "loss": 0.4354, "num_input_tokens_seen": 167064768, "step": 137380 }, { "epoch": 17.214008269640395, "grad_norm": 10.987421035766602, "learning_rate": 5.795872691134502e-07, "loss": 0.4304, "num_input_tokens_seen": 167070880, "step": 137385 }, { "epoch": 17.21463475754918, "grad_norm": 5.933948040008545, "learning_rate": 5.793317993388736e-07, "loss": 0.4165, "num_input_tokens_seen": 167076928, "step": 137390 }, { "epoch": 17.215261245457963, "grad_norm": 17.988534927368164, "learning_rate": 5.790763824171008e-07, "loss": 0.5241, "num_input_tokens_seen": 167083104, "step": 137395 }, { "epoch": 17.215887733366745, "grad_norm": 5.669295310974121, "learning_rate": 5.788210183511866e-07, "loss": 0.4415, "num_input_tokens_seen": 167089280, "step": 137400 }, { "epoch": 17.216514221275528, "grad_norm": 4.9116973876953125, "learning_rate": 5.785657071441858e-07, "loss": 0.6079, "num_input_tokens_seen": 167095808, "step": 137405 }, { "epoch": 17.217140709184314, "grad_norm": 8.581289291381836, "learning_rate": 5.783104487991481e-07, "loss": 0.4501, "num_input_tokens_seen": 167101856, "step": 137410 }, { "epoch": 17.217767197093096, "grad_norm": 12.979928970336914, "learning_rate": 5.780552433191266e-07, "loss": 0.448, "num_input_tokens_seen": 167107840, "step": 137415 }, { "epoch": 17.218393685001878, "grad_norm": 4.832613945007324, "learning_rate": 5.778000907071734e-07, "loss": 0.3808, "num_input_tokens_seen": 167114272, "step": 137420 }, { "epoch": 17.219020172910664, "grad_norm": 5.78109073638916, "learning_rate": 5.775449909663372e-07, "loss": 0.4428, "num_input_tokens_seen": 167120864, "step": 137425 }, { "epoch": 17.219646660819446, "grad_norm": 7.212496757507324, "learning_rate": 5.772899440996698e-07, "loss": 0.4266, "num_input_tokens_seen": 167127136, "step": 137430 }, { "epoch": 17.22027314872823, "grad_norm": 6.616163730621338, "learning_rate": 5.770349501102185e-07, "loss": 0.4195, "num_input_tokens_seen": 167133024, "step": 137435 }, { "epoch": 17.220899636637014, "grad_norm": 9.579485893249512, "learning_rate": 5.767800090010328e-07, "loss": 0.4663, "num_input_tokens_seen": 167138880, "step": 137440 }, { "epoch": 17.221526124545797, "grad_norm": 5.3420586585998535, "learning_rate": 5.765251207751621e-07, "loss": 0.4732, "num_input_tokens_seen": 167144704, "step": 137445 }, { "epoch": 17.22215261245458, "grad_norm": 6.494565963745117, "learning_rate": 5.762702854356517e-07, "loss": 0.434, "num_input_tokens_seen": 167150496, "step": 137450 }, { "epoch": 17.22277910036336, "grad_norm": 5.292993545532227, "learning_rate": 5.760155029855502e-07, "loss": 0.4309, "num_input_tokens_seen": 167157056, "step": 137455 }, { "epoch": 17.223405588272147, "grad_norm": 5.407408237457275, "learning_rate": 5.75760773427902e-07, "loss": 0.4585, "num_input_tokens_seen": 167162624, "step": 137460 }, { "epoch": 17.22403207618093, "grad_norm": 3.5914180278778076, "learning_rate": 5.755060967657539e-07, "loss": 0.5019, "num_input_tokens_seen": 167168864, "step": 137465 }, { "epoch": 17.22465856408971, "grad_norm": 17.726539611816406, "learning_rate": 5.752514730021496e-07, "loss": 0.4188, "num_input_tokens_seen": 167175104, "step": 137470 }, { "epoch": 17.225285051998497, "grad_norm": 6.105963230133057, "learning_rate": 5.749969021401352e-07, "loss": 0.4386, "num_input_tokens_seen": 167181088, "step": 137475 }, { "epoch": 17.22591153990728, "grad_norm": 11.976602554321289, "learning_rate": 5.747423841827526e-07, "loss": 0.4224, "num_input_tokens_seen": 167186976, "step": 137480 }, { "epoch": 17.226538027816062, "grad_norm": 6.026648044586182, "learning_rate": 5.74487919133046e-07, "loss": 0.3636, "num_input_tokens_seen": 167192928, "step": 137485 }, { "epoch": 17.227164515724848, "grad_norm": 4.718698024749756, "learning_rate": 5.742335069940558e-07, "loss": 0.4076, "num_input_tokens_seen": 167198976, "step": 137490 }, { "epoch": 17.22779100363363, "grad_norm": 3.4283032417297363, "learning_rate": 5.739791477688262e-07, "loss": 0.4225, "num_input_tokens_seen": 167204736, "step": 137495 }, { "epoch": 17.228417491542412, "grad_norm": 7.867160320281982, "learning_rate": 5.737248414603963e-07, "loss": 0.4139, "num_input_tokens_seen": 167210784, "step": 137500 }, { "epoch": 17.2290439794512, "grad_norm": 4.030826568603516, "learning_rate": 5.734705880718067e-07, "loss": 0.4082, "num_input_tokens_seen": 167216768, "step": 137505 }, { "epoch": 17.22967046735998, "grad_norm": 24.852651596069336, "learning_rate": 5.732163876060987e-07, "loss": 0.4843, "num_input_tokens_seen": 167222880, "step": 137510 }, { "epoch": 17.230296955268763, "grad_norm": 7.895704746246338, "learning_rate": 5.729622400663099e-07, "loss": 0.4327, "num_input_tokens_seen": 167228768, "step": 137515 }, { "epoch": 17.230923443177545, "grad_norm": 2.973078489303589, "learning_rate": 5.727081454554806e-07, "loss": 0.5436, "num_input_tokens_seen": 167234880, "step": 137520 }, { "epoch": 17.23154993108633, "grad_norm": 4.519725322723389, "learning_rate": 5.724541037766473e-07, "loss": 0.437, "num_input_tokens_seen": 167241024, "step": 137525 }, { "epoch": 17.232176418995113, "grad_norm": 30.220579147338867, "learning_rate": 5.722001150328483e-07, "loss": 0.4614, "num_input_tokens_seen": 167247136, "step": 137530 }, { "epoch": 17.232802906903895, "grad_norm": 4.059074878692627, "learning_rate": 5.719461792271186e-07, "loss": 0.4842, "num_input_tokens_seen": 167253216, "step": 137535 }, { "epoch": 17.23342939481268, "grad_norm": 5.482490539550781, "learning_rate": 5.71692296362496e-07, "loss": 0.4763, "num_input_tokens_seen": 167259392, "step": 137540 }, { "epoch": 17.234055882721464, "grad_norm": 7.460955619812012, "learning_rate": 5.714384664420148e-07, "loss": 0.4514, "num_input_tokens_seen": 167265504, "step": 137545 }, { "epoch": 17.234682370630246, "grad_norm": 5.332696437835693, "learning_rate": 5.711846894687101e-07, "loss": 0.4156, "num_input_tokens_seen": 167271712, "step": 137550 }, { "epoch": 17.23530885853903, "grad_norm": 4.2856059074401855, "learning_rate": 5.709309654456158e-07, "loss": 0.4535, "num_input_tokens_seen": 167277632, "step": 137555 }, { "epoch": 17.235935346447814, "grad_norm": 6.046494007110596, "learning_rate": 5.706772943757661e-07, "loss": 0.3991, "num_input_tokens_seen": 167284160, "step": 137560 }, { "epoch": 17.236561834356596, "grad_norm": 7.393606185913086, "learning_rate": 5.704236762621923e-07, "loss": 0.5278, "num_input_tokens_seen": 167290432, "step": 137565 }, { "epoch": 17.23718832226538, "grad_norm": 16.299638748168945, "learning_rate": 5.701701111079283e-07, "loss": 0.5075, "num_input_tokens_seen": 167296800, "step": 137570 }, { "epoch": 17.237814810174164, "grad_norm": 14.789497375488281, "learning_rate": 5.699165989160055e-07, "loss": 0.4602, "num_input_tokens_seen": 167302880, "step": 137575 }, { "epoch": 17.238441298082947, "grad_norm": 7.1243062019348145, "learning_rate": 5.696631396894531e-07, "loss": 0.4061, "num_input_tokens_seen": 167309344, "step": 137580 }, { "epoch": 17.23906778599173, "grad_norm": 7.238008975982666, "learning_rate": 5.694097334313043e-07, "loss": 0.4085, "num_input_tokens_seen": 167315488, "step": 137585 }, { "epoch": 17.239694273900515, "grad_norm": 4.166100025177002, "learning_rate": 5.691563801445854e-07, "loss": 0.4445, "num_input_tokens_seen": 167321312, "step": 137590 }, { "epoch": 17.240320761809297, "grad_norm": 29.22039222717285, "learning_rate": 5.689030798323292e-07, "loss": 0.5307, "num_input_tokens_seen": 167327264, "step": 137595 }, { "epoch": 17.24094724971808, "grad_norm": 25.663837432861328, "learning_rate": 5.686498324975604e-07, "loss": 0.4823, "num_input_tokens_seen": 167333536, "step": 137600 }, { "epoch": 17.241573737626865, "grad_norm": 6.447836399078369, "learning_rate": 5.683966381433087e-07, "loss": 0.4107, "num_input_tokens_seen": 167339776, "step": 137605 }, { "epoch": 17.242200225535647, "grad_norm": 35.232421875, "learning_rate": 5.681434967726019e-07, "loss": 0.5787, "num_input_tokens_seen": 167345984, "step": 137610 }, { "epoch": 17.24282671344443, "grad_norm": 3.9228460788726807, "learning_rate": 5.678904083884651e-07, "loss": 0.4292, "num_input_tokens_seen": 167352384, "step": 137615 }, { "epoch": 17.243453201353216, "grad_norm": 6.114632606506348, "learning_rate": 5.676373729939244e-07, "loss": 0.4844, "num_input_tokens_seen": 167358656, "step": 137620 }, { "epoch": 17.244079689261998, "grad_norm": 6.7264604568481445, "learning_rate": 5.673843905920068e-07, "loss": 0.4565, "num_input_tokens_seen": 167364768, "step": 137625 }, { "epoch": 17.24470617717078, "grad_norm": 6.487232208251953, "learning_rate": 5.671314611857342e-07, "loss": 0.4206, "num_input_tokens_seen": 167370432, "step": 137630 }, { "epoch": 17.245332665079562, "grad_norm": 5.933019638061523, "learning_rate": 5.668785847781322e-07, "loss": 0.4767, "num_input_tokens_seen": 167376672, "step": 137635 }, { "epoch": 17.24595915298835, "grad_norm": 3.7625627517700195, "learning_rate": 5.666257613722248e-07, "loss": 0.4516, "num_input_tokens_seen": 167382816, "step": 137640 }, { "epoch": 17.24658564089713, "grad_norm": 10.008360862731934, "learning_rate": 5.66372990971033e-07, "loss": 0.4059, "num_input_tokens_seen": 167388864, "step": 137645 }, { "epoch": 17.247212128805913, "grad_norm": 8.382097244262695, "learning_rate": 5.661202735775806e-07, "loss": 0.4002, "num_input_tokens_seen": 167395232, "step": 137650 }, { "epoch": 17.2478386167147, "grad_norm": 4.01633882522583, "learning_rate": 5.658676091948867e-07, "loss": 0.4251, "num_input_tokens_seen": 167400992, "step": 137655 }, { "epoch": 17.24846510462348, "grad_norm": 12.098191261291504, "learning_rate": 5.656149978259751e-07, "loss": 0.4847, "num_input_tokens_seen": 167407264, "step": 137660 }, { "epoch": 17.249091592532263, "grad_norm": 13.347197532653809, "learning_rate": 5.653624394738633e-07, "loss": 0.4407, "num_input_tokens_seen": 167413728, "step": 137665 }, { "epoch": 17.24971808044105, "grad_norm": 30.13300895690918, "learning_rate": 5.651099341415728e-07, "loss": 0.5674, "num_input_tokens_seen": 167419744, "step": 137670 }, { "epoch": 17.25034456834983, "grad_norm": 5.866044998168945, "learning_rate": 5.648574818321206e-07, "loss": 0.4174, "num_input_tokens_seen": 167425792, "step": 137675 }, { "epoch": 17.250971056258614, "grad_norm": 6.984557628631592, "learning_rate": 5.646050825485272e-07, "loss": 0.443, "num_input_tokens_seen": 167431680, "step": 137680 }, { "epoch": 17.251597544167396, "grad_norm": 5.2037129402160645, "learning_rate": 5.643527362938078e-07, "loss": 0.3828, "num_input_tokens_seen": 167438016, "step": 137685 }, { "epoch": 17.25222403207618, "grad_norm": 7.167893886566162, "learning_rate": 5.641004430709812e-07, "loss": 0.4458, "num_input_tokens_seen": 167444448, "step": 137690 }, { "epoch": 17.252850519984964, "grad_norm": 8.415419578552246, "learning_rate": 5.638482028830644e-07, "loss": 0.4978, "num_input_tokens_seen": 167450528, "step": 137695 }, { "epoch": 17.253477007893746, "grad_norm": 6.068635940551758, "learning_rate": 5.635960157330705e-07, "loss": 0.5223, "num_input_tokens_seen": 167456064, "step": 137700 }, { "epoch": 17.254103495802532, "grad_norm": 9.531000137329102, "learning_rate": 5.633438816240178e-07, "loss": 0.4027, "num_input_tokens_seen": 167462368, "step": 137705 }, { "epoch": 17.254729983711314, "grad_norm": 3.492774248123169, "learning_rate": 5.630918005589175e-07, "loss": 0.4304, "num_input_tokens_seen": 167468672, "step": 137710 }, { "epoch": 17.255356471620097, "grad_norm": 4.118274211883545, "learning_rate": 5.628397725407852e-07, "loss": 0.4333, "num_input_tokens_seen": 167474816, "step": 137715 }, { "epoch": 17.255982959528883, "grad_norm": 2.8288941383361816, "learning_rate": 5.625877975726346e-07, "loss": 0.4178, "num_input_tokens_seen": 167479968, "step": 137720 }, { "epoch": 17.256609447437665, "grad_norm": 11.458257675170898, "learning_rate": 5.623358756574782e-07, "loss": 0.4954, "num_input_tokens_seen": 167485952, "step": 137725 }, { "epoch": 17.257235935346447, "grad_norm": 5.324160575866699, "learning_rate": 5.620840067983263e-07, "loss": 0.4558, "num_input_tokens_seen": 167491840, "step": 137730 }, { "epoch": 17.257862423255233, "grad_norm": 5.753625869750977, "learning_rate": 5.618321909981922e-07, "loss": 0.4509, "num_input_tokens_seen": 167497888, "step": 137735 }, { "epoch": 17.258488911164015, "grad_norm": 6.208230495452881, "learning_rate": 5.615804282600851e-07, "loss": 0.3872, "num_input_tokens_seen": 167504096, "step": 137740 }, { "epoch": 17.259115399072797, "grad_norm": 4.1407246589660645, "learning_rate": 5.613287185870159e-07, "loss": 0.4009, "num_input_tokens_seen": 167510272, "step": 137745 }, { "epoch": 17.25974188698158, "grad_norm": 7.129852294921875, "learning_rate": 5.610770619819933e-07, "loss": 0.3844, "num_input_tokens_seen": 167516672, "step": 137750 }, { "epoch": 17.260368374890366, "grad_norm": 6.140944480895996, "learning_rate": 5.60825458448026e-07, "loss": 0.4973, "num_input_tokens_seen": 167522688, "step": 137755 }, { "epoch": 17.260994862799148, "grad_norm": 4.244229316711426, "learning_rate": 5.60573907988124e-07, "loss": 0.4841, "num_input_tokens_seen": 167528128, "step": 137760 }, { "epoch": 17.26162135070793, "grad_norm": 5.495572566986084, "learning_rate": 5.603224106052923e-07, "loss": 0.4545, "num_input_tokens_seen": 167534304, "step": 137765 }, { "epoch": 17.262247838616716, "grad_norm": 19.205686569213867, "learning_rate": 5.6007096630254e-07, "loss": 0.4155, "num_input_tokens_seen": 167540224, "step": 137770 }, { "epoch": 17.2628743265255, "grad_norm": 5.941279411315918, "learning_rate": 5.598195750828711e-07, "loss": 0.4694, "num_input_tokens_seen": 167545952, "step": 137775 }, { "epoch": 17.26350081443428, "grad_norm": 3.133725643157959, "learning_rate": 5.595682369492933e-07, "loss": 0.5643, "num_input_tokens_seen": 167552032, "step": 137780 }, { "epoch": 17.264127302343066, "grad_norm": 16.126220703125, "learning_rate": 5.593169519048091e-07, "loss": 0.4734, "num_input_tokens_seen": 167558144, "step": 137785 }, { "epoch": 17.26475379025185, "grad_norm": 14.43743896484375, "learning_rate": 5.590657199524258e-07, "loss": 0.5067, "num_input_tokens_seen": 167564192, "step": 137790 }, { "epoch": 17.26538027816063, "grad_norm": 10.010788917541504, "learning_rate": 5.58814541095144e-07, "loss": 0.5236, "num_input_tokens_seen": 167570464, "step": 137795 }, { "epoch": 17.266006766069413, "grad_norm": 10.81155776977539, "learning_rate": 5.585634153359698e-07, "loss": 0.5226, "num_input_tokens_seen": 167576032, "step": 137800 }, { "epoch": 17.2666332539782, "grad_norm": 13.94050407409668, "learning_rate": 5.58312342677903e-07, "loss": 0.4614, "num_input_tokens_seen": 167581952, "step": 137805 }, { "epoch": 17.26725974188698, "grad_norm": 3.6770167350769043, "learning_rate": 5.580613231239467e-07, "loss": 0.463, "num_input_tokens_seen": 167588064, "step": 137810 }, { "epoch": 17.267886229795764, "grad_norm": 7.280880928039551, "learning_rate": 5.578103566771026e-07, "loss": 0.4856, "num_input_tokens_seen": 167594112, "step": 137815 }, { "epoch": 17.26851271770455, "grad_norm": 14.945013999938965, "learning_rate": 5.575594433403692e-07, "loss": 0.4534, "num_input_tokens_seen": 167600672, "step": 137820 }, { "epoch": 17.26913920561333, "grad_norm": 22.247560501098633, "learning_rate": 5.57308583116748e-07, "loss": 0.4873, "num_input_tokens_seen": 167607232, "step": 137825 }, { "epoch": 17.269765693522114, "grad_norm": 5.431046962738037, "learning_rate": 5.570577760092382e-07, "loss": 0.4804, "num_input_tokens_seen": 167613344, "step": 137830 }, { "epoch": 17.2703921814309, "grad_norm": 3.441232681274414, "learning_rate": 5.568070220208388e-07, "loss": 0.4434, "num_input_tokens_seen": 167619744, "step": 137835 }, { "epoch": 17.271018669339682, "grad_norm": 11.574913024902344, "learning_rate": 5.565563211545461e-07, "loss": 0.418, "num_input_tokens_seen": 167626112, "step": 137840 }, { "epoch": 17.271645157248464, "grad_norm": 4.641493797302246, "learning_rate": 5.563056734133593e-07, "loss": 0.3743, "num_input_tokens_seen": 167632352, "step": 137845 }, { "epoch": 17.272271645157247, "grad_norm": 11.389833450317383, "learning_rate": 5.560550788002733e-07, "loss": 0.5699, "num_input_tokens_seen": 167638496, "step": 137850 }, { "epoch": 17.272898133066032, "grad_norm": 8.033108711242676, "learning_rate": 5.558045373182857e-07, "loss": 0.4487, "num_input_tokens_seen": 167644096, "step": 137855 }, { "epoch": 17.273524620974815, "grad_norm": 4.829160213470459, "learning_rate": 5.555540489703909e-07, "loss": 0.4498, "num_input_tokens_seen": 167650144, "step": 137860 }, { "epoch": 17.274151108883597, "grad_norm": 4.968551158905029, "learning_rate": 5.553036137595851e-07, "loss": 0.4427, "num_input_tokens_seen": 167656032, "step": 137865 }, { "epoch": 17.274777596792383, "grad_norm": 8.621515274047852, "learning_rate": 5.550532316888602e-07, "loss": 0.527, "num_input_tokens_seen": 167662176, "step": 137870 }, { "epoch": 17.275404084701165, "grad_norm": 28.37493133544922, "learning_rate": 5.548029027612112e-07, "loss": 0.4315, "num_input_tokens_seen": 167668384, "step": 137875 }, { "epoch": 17.276030572609947, "grad_norm": 19.611526489257812, "learning_rate": 5.54552626979632e-07, "loss": 0.4888, "num_input_tokens_seen": 167674240, "step": 137880 }, { "epoch": 17.276657060518733, "grad_norm": 4.800602436065674, "learning_rate": 5.543024043471124e-07, "loss": 0.3806, "num_input_tokens_seen": 167680128, "step": 137885 }, { "epoch": 17.277283548427516, "grad_norm": 17.145000457763672, "learning_rate": 5.540522348666461e-07, "loss": 0.4742, "num_input_tokens_seen": 167685824, "step": 137890 }, { "epoch": 17.277910036336298, "grad_norm": 5.879024505615234, "learning_rate": 5.538021185412229e-07, "loss": 0.4035, "num_input_tokens_seen": 167691904, "step": 137895 }, { "epoch": 17.278536524245084, "grad_norm": 3.947537899017334, "learning_rate": 5.535520553738339e-07, "loss": 0.4499, "num_input_tokens_seen": 167697888, "step": 137900 }, { "epoch": 17.279163012153866, "grad_norm": 14.272337913513184, "learning_rate": 5.533020453674675e-07, "loss": 0.5616, "num_input_tokens_seen": 167703360, "step": 137905 }, { "epoch": 17.27978950006265, "grad_norm": 15.434558868408203, "learning_rate": 5.53052088525115e-07, "loss": 0.5331, "num_input_tokens_seen": 167709248, "step": 137910 }, { "epoch": 17.28041598797143, "grad_norm": 5.60516357421875, "learning_rate": 5.528021848497627e-07, "loss": 0.4056, "num_input_tokens_seen": 167715424, "step": 137915 }, { "epoch": 17.281042475880216, "grad_norm": 4.847439765930176, "learning_rate": 5.525523343443989e-07, "loss": 0.4283, "num_input_tokens_seen": 167721728, "step": 137920 }, { "epoch": 17.281668963789, "grad_norm": 5.942897319793701, "learning_rate": 5.523025370120116e-07, "loss": 0.4262, "num_input_tokens_seen": 167727776, "step": 137925 }, { "epoch": 17.28229545169778, "grad_norm": 8.024802207946777, "learning_rate": 5.520527928555875e-07, "loss": 0.4192, "num_input_tokens_seen": 167734048, "step": 137930 }, { "epoch": 17.282921939606567, "grad_norm": 11.789137840270996, "learning_rate": 5.518031018781112e-07, "loss": 0.476, "num_input_tokens_seen": 167740000, "step": 137935 }, { "epoch": 17.28354842751535, "grad_norm": 4.236698150634766, "learning_rate": 5.515534640825681e-07, "loss": 0.4702, "num_input_tokens_seen": 167746272, "step": 137940 }, { "epoch": 17.28417491542413, "grad_norm": 20.276775360107422, "learning_rate": 5.51303879471945e-07, "loss": 0.4837, "num_input_tokens_seen": 167752032, "step": 137945 }, { "epoch": 17.284801403332917, "grad_norm": 5.074797630310059, "learning_rate": 5.510543480492231e-07, "loss": 0.4372, "num_input_tokens_seen": 167758080, "step": 137950 }, { "epoch": 17.2854278912417, "grad_norm": 19.951763153076172, "learning_rate": 5.508048698173879e-07, "loss": 0.4947, "num_input_tokens_seen": 167764096, "step": 137955 }, { "epoch": 17.28605437915048, "grad_norm": 12.150861740112305, "learning_rate": 5.505554447794198e-07, "loss": 0.4743, "num_input_tokens_seen": 167770240, "step": 137960 }, { "epoch": 17.286680867059268, "grad_norm": 17.264612197875977, "learning_rate": 5.503060729383037e-07, "loss": 0.4642, "num_input_tokens_seen": 167775968, "step": 137965 }, { "epoch": 17.28730735496805, "grad_norm": 4.246500492095947, "learning_rate": 5.500567542970186e-07, "loss": 0.433, "num_input_tokens_seen": 167782016, "step": 137970 }, { "epoch": 17.287933842876832, "grad_norm": 13.266199111938477, "learning_rate": 5.49807488858547e-07, "loss": 0.4875, "num_input_tokens_seen": 167787840, "step": 137975 }, { "epoch": 17.288560330785614, "grad_norm": 5.6098480224609375, "learning_rate": 5.495582766258678e-07, "loss": 0.4708, "num_input_tokens_seen": 167794080, "step": 137980 }, { "epoch": 17.2891868186944, "grad_norm": 8.601789474487305, "learning_rate": 5.493091176019616e-07, "loss": 0.4313, "num_input_tokens_seen": 167800288, "step": 137985 }, { "epoch": 17.289813306603182, "grad_norm": 11.13900089263916, "learning_rate": 5.490600117898065e-07, "loss": 0.461, "num_input_tokens_seen": 167806656, "step": 137990 }, { "epoch": 17.290439794511965, "grad_norm": 8.725057601928711, "learning_rate": 5.488109591923818e-07, "loss": 0.4338, "num_input_tokens_seen": 167812800, "step": 137995 }, { "epoch": 17.29106628242075, "grad_norm": 16.62723731994629, "learning_rate": 5.485619598126629e-07, "loss": 0.4416, "num_input_tokens_seen": 167818912, "step": 138000 }, { "epoch": 17.291692770329533, "grad_norm": 4.361374378204346, "learning_rate": 5.483130136536291e-07, "loss": 0.443, "num_input_tokens_seen": 167825152, "step": 138005 }, { "epoch": 17.292319258238315, "grad_norm": 4.61823844909668, "learning_rate": 5.480641207182564e-07, "loss": 0.4541, "num_input_tokens_seen": 167831456, "step": 138010 }, { "epoch": 17.2929457461471, "grad_norm": 13.938024520874023, "learning_rate": 5.478152810095194e-07, "loss": 0.409, "num_input_tokens_seen": 167837536, "step": 138015 }, { "epoch": 17.293572234055883, "grad_norm": 20.107379913330078, "learning_rate": 5.475664945303943e-07, "loss": 0.5007, "num_input_tokens_seen": 167843776, "step": 138020 }, { "epoch": 17.294198721964666, "grad_norm": 24.62816619873047, "learning_rate": 5.473177612838553e-07, "loss": 0.4502, "num_input_tokens_seen": 167850048, "step": 138025 }, { "epoch": 17.294825209873448, "grad_norm": 7.072213649749756, "learning_rate": 5.470690812728757e-07, "loss": 0.4692, "num_input_tokens_seen": 167856384, "step": 138030 }, { "epoch": 17.295451697782234, "grad_norm": 6.688957214355469, "learning_rate": 5.46820454500429e-07, "loss": 0.472, "num_input_tokens_seen": 167862688, "step": 138035 }, { "epoch": 17.296078185691016, "grad_norm": 5.915968894958496, "learning_rate": 5.465718809694887e-07, "loss": 0.4162, "num_input_tokens_seen": 167868352, "step": 138040 }, { "epoch": 17.2967046735998, "grad_norm": 5.017320156097412, "learning_rate": 5.463233606830248e-07, "loss": 0.4851, "num_input_tokens_seen": 167874496, "step": 138045 }, { "epoch": 17.297331161508584, "grad_norm": 9.998454093933105, "learning_rate": 5.460748936440108e-07, "loss": 0.3826, "num_input_tokens_seen": 167880864, "step": 138050 }, { "epoch": 17.297957649417366, "grad_norm": 7.03495979309082, "learning_rate": 5.458264798554147e-07, "loss": 0.4997, "num_input_tokens_seen": 167887296, "step": 138055 }, { "epoch": 17.29858413732615, "grad_norm": 13.473505020141602, "learning_rate": 5.455781193202082e-07, "loss": 0.4605, "num_input_tokens_seen": 167893120, "step": 138060 }, { "epoch": 17.299210625234934, "grad_norm": 4.226480960845947, "learning_rate": 5.453298120413609e-07, "loss": 0.4003, "num_input_tokens_seen": 167899200, "step": 138065 }, { "epoch": 17.299837113143717, "grad_norm": 29.050962448120117, "learning_rate": 5.450815580218405e-07, "loss": 0.4879, "num_input_tokens_seen": 167904640, "step": 138070 }, { "epoch": 17.3004636010525, "grad_norm": 5.340409755706787, "learning_rate": 5.448333572646164e-07, "loss": 0.5073, "num_input_tokens_seen": 167910656, "step": 138075 }, { "epoch": 17.30109008896128, "grad_norm": 4.264516353607178, "learning_rate": 5.445852097726539e-07, "loss": 0.407, "num_input_tokens_seen": 167916640, "step": 138080 }, { "epoch": 17.301716576870067, "grad_norm": 3.525254487991333, "learning_rate": 5.443371155489224e-07, "loss": 0.5112, "num_input_tokens_seen": 167922752, "step": 138085 }, { "epoch": 17.30234306477885, "grad_norm": 5.744342803955078, "learning_rate": 5.440890745963856e-07, "loss": 0.416, "num_input_tokens_seen": 167928928, "step": 138090 }, { "epoch": 17.30296955268763, "grad_norm": 9.168062210083008, "learning_rate": 5.438410869180116e-07, "loss": 0.3929, "num_input_tokens_seen": 167934912, "step": 138095 }, { "epoch": 17.303596040596418, "grad_norm": 4.419804573059082, "learning_rate": 5.435931525167626e-07, "loss": 0.3779, "num_input_tokens_seen": 167940896, "step": 138100 }, { "epoch": 17.3042225285052, "grad_norm": 4.437439918518066, "learning_rate": 5.433452713956055e-07, "loss": 0.5195, "num_input_tokens_seen": 167947200, "step": 138105 }, { "epoch": 17.304849016413982, "grad_norm": 4.395330429077148, "learning_rate": 5.430974435575015e-07, "loss": 0.4118, "num_input_tokens_seen": 167953408, "step": 138110 }, { "epoch": 17.305475504322768, "grad_norm": 23.05634117126465, "learning_rate": 5.428496690054158e-07, "loss": 0.4793, "num_input_tokens_seen": 167959808, "step": 138115 }, { "epoch": 17.30610199223155, "grad_norm": 8.203076362609863, "learning_rate": 5.426019477423083e-07, "loss": 0.4616, "num_input_tokens_seen": 167965728, "step": 138120 }, { "epoch": 17.306728480140332, "grad_norm": 8.445049285888672, "learning_rate": 5.423542797711429e-07, "loss": 0.4438, "num_input_tokens_seen": 167971648, "step": 138125 }, { "epoch": 17.30735496804912, "grad_norm": 3.2507402896881104, "learning_rate": 5.421066650948792e-07, "loss": 0.4136, "num_input_tokens_seen": 167977728, "step": 138130 }, { "epoch": 17.3079814559579, "grad_norm": 3.3288042545318604, "learning_rate": 5.418591037164783e-07, "loss": 0.418, "num_input_tokens_seen": 167983840, "step": 138135 }, { "epoch": 17.308607943866683, "grad_norm": 12.822713851928711, "learning_rate": 5.416115956389012e-07, "loss": 0.4411, "num_input_tokens_seen": 167989920, "step": 138140 }, { "epoch": 17.309234431775465, "grad_norm": 29.33563804626465, "learning_rate": 5.413641408651049e-07, "loss": 0.4933, "num_input_tokens_seen": 167996064, "step": 138145 }, { "epoch": 17.30986091968425, "grad_norm": 6.070565223693848, "learning_rate": 5.411167393980498e-07, "loss": 0.5288, "num_input_tokens_seen": 168001376, "step": 138150 }, { "epoch": 17.310487407593033, "grad_norm": 13.48155689239502, "learning_rate": 5.40869391240692e-07, "loss": 0.4879, "num_input_tokens_seen": 168007200, "step": 138155 }, { "epoch": 17.311113895501816, "grad_norm": 3.5315744876861572, "learning_rate": 5.406220963959907e-07, "loss": 0.4552, "num_input_tokens_seen": 168013312, "step": 138160 }, { "epoch": 17.3117403834106, "grad_norm": 4.567129611968994, "learning_rate": 5.403748548669002e-07, "loss": 0.4331, "num_input_tokens_seen": 168019232, "step": 138165 }, { "epoch": 17.312366871319384, "grad_norm": 10.66889762878418, "learning_rate": 5.401276666563793e-07, "loss": 0.4435, "num_input_tokens_seen": 168025504, "step": 138170 }, { "epoch": 17.312993359228166, "grad_norm": 7.80183744430542, "learning_rate": 5.3988053176738e-07, "loss": 0.4774, "num_input_tokens_seen": 168031776, "step": 138175 }, { "epoch": 17.313619847136952, "grad_norm": 4.441996097564697, "learning_rate": 5.396334502028605e-07, "loss": 0.3917, "num_input_tokens_seen": 168038016, "step": 138180 }, { "epoch": 17.314246335045734, "grad_norm": 23.6574764251709, "learning_rate": 5.393864219657719e-07, "loss": 0.4148, "num_input_tokens_seen": 168044384, "step": 138185 }, { "epoch": 17.314872822954516, "grad_norm": 5.973925590515137, "learning_rate": 5.391394470590688e-07, "loss": 0.441, "num_input_tokens_seen": 168050560, "step": 138190 }, { "epoch": 17.3154993108633, "grad_norm": 5.157613277435303, "learning_rate": 5.388925254857052e-07, "loss": 0.446, "num_input_tokens_seen": 168056672, "step": 138195 }, { "epoch": 17.316125798772084, "grad_norm": 12.481050491333008, "learning_rate": 5.386456572486315e-07, "loss": 0.4557, "num_input_tokens_seen": 168062688, "step": 138200 }, { "epoch": 17.316752286680867, "grad_norm": 5.686174392700195, "learning_rate": 5.383988423508002e-07, "loss": 0.4409, "num_input_tokens_seen": 168069056, "step": 138205 }, { "epoch": 17.31737877458965, "grad_norm": 3.848456859588623, "learning_rate": 5.381520807951612e-07, "loss": 0.4575, "num_input_tokens_seen": 168075424, "step": 138210 }, { "epoch": 17.318005262498435, "grad_norm": 6.469409942626953, "learning_rate": 5.379053725846667e-07, "loss": 0.4397, "num_input_tokens_seen": 168081472, "step": 138215 }, { "epoch": 17.318631750407217, "grad_norm": 9.194250106811523, "learning_rate": 5.376587177222636e-07, "loss": 0.4181, "num_input_tokens_seen": 168087456, "step": 138220 }, { "epoch": 17.319258238316, "grad_norm": 10.799065589904785, "learning_rate": 5.374121162109025e-07, "loss": 0.4275, "num_input_tokens_seen": 168093664, "step": 138225 }, { "epoch": 17.319884726224785, "grad_norm": 4.282564640045166, "learning_rate": 5.37165568053532e-07, "loss": 0.4378, "num_input_tokens_seen": 168099808, "step": 138230 }, { "epoch": 17.320511214133568, "grad_norm": 4.430396556854248, "learning_rate": 5.369190732530988e-07, "loss": 0.5029, "num_input_tokens_seen": 168106208, "step": 138235 }, { "epoch": 17.32113770204235, "grad_norm": 3.4114742279052734, "learning_rate": 5.366726318125504e-07, "loss": 0.4704, "num_input_tokens_seen": 168111936, "step": 138240 }, { "epoch": 17.321764189951136, "grad_norm": 5.1951775550842285, "learning_rate": 5.364262437348328e-07, "loss": 0.4112, "num_input_tokens_seen": 168117952, "step": 138245 }, { "epoch": 17.322390677859918, "grad_norm": 6.135193347930908, "learning_rate": 5.361799090228936e-07, "loss": 0.4086, "num_input_tokens_seen": 168124160, "step": 138250 }, { "epoch": 17.3230171657687, "grad_norm": 11.729069709777832, "learning_rate": 5.359336276796756e-07, "loss": 0.411, "num_input_tokens_seen": 168130080, "step": 138255 }, { "epoch": 17.323643653677482, "grad_norm": 10.944324493408203, "learning_rate": 5.356873997081247e-07, "loss": 0.5482, "num_input_tokens_seen": 168136160, "step": 138260 }, { "epoch": 17.32427014158627, "grad_norm": 5.738656997680664, "learning_rate": 5.35441225111184e-07, "loss": 0.4343, "num_input_tokens_seen": 168142304, "step": 138265 }, { "epoch": 17.32489662949505, "grad_norm": 8.787907600402832, "learning_rate": 5.35195103891798e-07, "loss": 0.6, "num_input_tokens_seen": 168148704, "step": 138270 }, { "epoch": 17.325523117403833, "grad_norm": 5.696108341217041, "learning_rate": 5.349490360529069e-07, "loss": 0.4338, "num_input_tokens_seen": 168155072, "step": 138275 }, { "epoch": 17.32614960531262, "grad_norm": 5.609674453735352, "learning_rate": 5.347030215974552e-07, "loss": 0.4606, "num_input_tokens_seen": 168161152, "step": 138280 }, { "epoch": 17.3267760932214, "grad_norm": 8.520602226257324, "learning_rate": 5.344570605283822e-07, "loss": 0.3914, "num_input_tokens_seen": 168167520, "step": 138285 }, { "epoch": 17.327402581130183, "grad_norm": 16.745880126953125, "learning_rate": 5.342111528486305e-07, "loss": 0.4411, "num_input_tokens_seen": 168173696, "step": 138290 }, { "epoch": 17.32802906903897, "grad_norm": 5.931042671203613, "learning_rate": 5.339652985611383e-07, "loss": 0.4012, "num_input_tokens_seen": 168179328, "step": 138295 }, { "epoch": 17.32865555694775, "grad_norm": 5.580260753631592, "learning_rate": 5.337194976688464e-07, "loss": 0.5452, "num_input_tokens_seen": 168185696, "step": 138300 }, { "epoch": 17.329282044856534, "grad_norm": 31.506275177001953, "learning_rate": 5.334737501746923e-07, "loss": 0.465, "num_input_tokens_seen": 168192000, "step": 138305 }, { "epoch": 17.329908532765316, "grad_norm": 4.67647647857666, "learning_rate": 5.332280560816144e-07, "loss": 0.4016, "num_input_tokens_seen": 168198144, "step": 138310 }, { "epoch": 17.330535020674102, "grad_norm": 4.999637603759766, "learning_rate": 5.329824153925517e-07, "loss": 0.3988, "num_input_tokens_seen": 168204352, "step": 138315 }, { "epoch": 17.331161508582884, "grad_norm": 6.0409111976623535, "learning_rate": 5.327368281104389e-07, "loss": 0.4027, "num_input_tokens_seen": 168210432, "step": 138320 }, { "epoch": 17.331787996491666, "grad_norm": 11.619738578796387, "learning_rate": 5.324912942382143e-07, "loss": 0.4386, "num_input_tokens_seen": 168216192, "step": 138325 }, { "epoch": 17.332414484400452, "grad_norm": 15.160055160522461, "learning_rate": 5.322458137788111e-07, "loss": 0.4523, "num_input_tokens_seen": 168222272, "step": 138330 }, { "epoch": 17.333040972309234, "grad_norm": 3.428424596786499, "learning_rate": 5.320003867351658e-07, "loss": 0.3967, "num_input_tokens_seen": 168228448, "step": 138335 }, { "epoch": 17.333667460218017, "grad_norm": 11.57104206085205, "learning_rate": 5.317550131102117e-07, "loss": 0.4187, "num_input_tokens_seen": 168234592, "step": 138340 }, { "epoch": 17.334293948126803, "grad_norm": 10.3102388381958, "learning_rate": 5.315096929068847e-07, "loss": 0.4535, "num_input_tokens_seen": 168240800, "step": 138345 }, { "epoch": 17.334920436035585, "grad_norm": 5.496435642242432, "learning_rate": 5.312644261281152e-07, "loss": 0.407, "num_input_tokens_seen": 168246912, "step": 138350 }, { "epoch": 17.335546923944367, "grad_norm": 7.054457187652588, "learning_rate": 5.310192127768376e-07, "loss": 0.4318, "num_input_tokens_seen": 168253088, "step": 138355 }, { "epoch": 17.336173411853153, "grad_norm": 7.946369647979736, "learning_rate": 5.307740528559813e-07, "loss": 0.4384, "num_input_tokens_seen": 168258912, "step": 138360 }, { "epoch": 17.336799899761935, "grad_norm": 4.3523054122924805, "learning_rate": 5.305289463684804e-07, "loss": 0.4292, "num_input_tokens_seen": 168264384, "step": 138365 }, { "epoch": 17.337426387670718, "grad_norm": 16.93362045288086, "learning_rate": 5.302838933172622e-07, "loss": 0.4862, "num_input_tokens_seen": 168270624, "step": 138370 }, { "epoch": 17.3380528755795, "grad_norm": 13.71426773071289, "learning_rate": 5.300388937052575e-07, "loss": 0.3977, "num_input_tokens_seen": 168276832, "step": 138375 }, { "epoch": 17.338679363488286, "grad_norm": 4.934311389923096, "learning_rate": 5.297939475353975e-07, "loss": 0.4402, "num_input_tokens_seen": 168282304, "step": 138380 }, { "epoch": 17.339305851397068, "grad_norm": 8.078091621398926, "learning_rate": 5.295490548106086e-07, "loss": 0.4739, "num_input_tokens_seen": 168288928, "step": 138385 }, { "epoch": 17.33993233930585, "grad_norm": 6.743898868560791, "learning_rate": 5.293042155338196e-07, "loss": 0.4543, "num_input_tokens_seen": 168294848, "step": 138390 }, { "epoch": 17.340558827214636, "grad_norm": 4.18951416015625, "learning_rate": 5.290594297079565e-07, "loss": 0.4005, "num_input_tokens_seen": 168300640, "step": 138395 }, { "epoch": 17.34118531512342, "grad_norm": 3.999751329421997, "learning_rate": 5.288146973359482e-07, "loss": 0.4262, "num_input_tokens_seen": 168306848, "step": 138400 }, { "epoch": 17.3418118030322, "grad_norm": 14.372584342956543, "learning_rate": 5.285700184207182e-07, "loss": 0.4416, "num_input_tokens_seen": 168312864, "step": 138405 }, { "epoch": 17.342438290940986, "grad_norm": 14.461955070495605, "learning_rate": 5.283253929651943e-07, "loss": 0.4803, "num_input_tokens_seen": 168318880, "step": 138410 }, { "epoch": 17.34306477884977, "grad_norm": 10.195028305053711, "learning_rate": 5.280808209722987e-07, "loss": 0.5098, "num_input_tokens_seen": 168324928, "step": 138415 }, { "epoch": 17.34369126675855, "grad_norm": 8.201726913452148, "learning_rate": 5.278363024449573e-07, "loss": 0.421, "num_input_tokens_seen": 168331328, "step": 138420 }, { "epoch": 17.344317754667333, "grad_norm": 8.816267013549805, "learning_rate": 5.275918373860922e-07, "loss": 0.4708, "num_input_tokens_seen": 168336512, "step": 138425 }, { "epoch": 17.34494424257612, "grad_norm": 5.665018558502197, "learning_rate": 5.273474257986272e-07, "loss": 0.452, "num_input_tokens_seen": 168342784, "step": 138430 }, { "epoch": 17.3455707304849, "grad_norm": 6.593222141265869, "learning_rate": 5.271030676854838e-07, "loss": 0.4226, "num_input_tokens_seen": 168348992, "step": 138435 }, { "epoch": 17.346197218393684, "grad_norm": 8.405411720275879, "learning_rate": 5.268587630495847e-07, "loss": 0.4748, "num_input_tokens_seen": 168355008, "step": 138440 }, { "epoch": 17.34682370630247, "grad_norm": 4.716174125671387, "learning_rate": 5.266145118938493e-07, "loss": 0.456, "num_input_tokens_seen": 168360800, "step": 138445 }, { "epoch": 17.347450194211252, "grad_norm": 6.144161701202393, "learning_rate": 5.263703142211979e-07, "loss": 0.4926, "num_input_tokens_seen": 168367136, "step": 138450 }, { "epoch": 17.348076682120034, "grad_norm": 8.717339515686035, "learning_rate": 5.261261700345522e-07, "loss": 0.4476, "num_input_tokens_seen": 168373408, "step": 138455 }, { "epoch": 17.34870317002882, "grad_norm": 3.255000114440918, "learning_rate": 5.25882079336828e-07, "loss": 0.4356, "num_input_tokens_seen": 168379648, "step": 138460 }, { "epoch": 17.349329657937602, "grad_norm": 9.541791915893555, "learning_rate": 5.256380421309465e-07, "loss": 0.448, "num_input_tokens_seen": 168385856, "step": 138465 }, { "epoch": 17.349956145846384, "grad_norm": 22.705503463745117, "learning_rate": 5.25394058419823e-07, "loss": 0.4348, "num_input_tokens_seen": 168391872, "step": 138470 }, { "epoch": 17.350582633755167, "grad_norm": 4.80502462387085, "learning_rate": 5.251501282063764e-07, "loss": 0.3987, "num_input_tokens_seen": 168398080, "step": 138475 }, { "epoch": 17.351209121663953, "grad_norm": 10.117936134338379, "learning_rate": 5.249062514935216e-07, "loss": 0.5035, "num_input_tokens_seen": 168404160, "step": 138480 }, { "epoch": 17.351835609572735, "grad_norm": 6.319458961486816, "learning_rate": 5.246624282841755e-07, "loss": 0.4647, "num_input_tokens_seen": 168409792, "step": 138485 }, { "epoch": 17.352462097481517, "grad_norm": 4.376633644104004, "learning_rate": 5.244186585812521e-07, "loss": 0.4189, "num_input_tokens_seen": 168415808, "step": 138490 }, { "epoch": 17.353088585390303, "grad_norm": 3.9463307857513428, "learning_rate": 5.241749423876669e-07, "loss": 0.4134, "num_input_tokens_seen": 168422208, "step": 138495 }, { "epoch": 17.353715073299085, "grad_norm": 16.536556243896484, "learning_rate": 5.239312797063334e-07, "loss": 0.4948, "num_input_tokens_seen": 168427744, "step": 138500 }, { "epoch": 17.354341561207868, "grad_norm": 7.940333843231201, "learning_rate": 5.236876705401645e-07, "loss": 0.4054, "num_input_tokens_seen": 168433952, "step": 138505 }, { "epoch": 17.354968049116653, "grad_norm": 6.5846638679504395, "learning_rate": 5.234441148920738e-07, "loss": 0.3795, "num_input_tokens_seen": 168440256, "step": 138510 }, { "epoch": 17.355594537025436, "grad_norm": 12.586116790771484, "learning_rate": 5.232006127649714e-07, "loss": 0.4169, "num_input_tokens_seen": 168445984, "step": 138515 }, { "epoch": 17.356221024934218, "grad_norm": 14.561430931091309, "learning_rate": 5.229571641617703e-07, "loss": 0.4742, "num_input_tokens_seen": 168451872, "step": 138520 }, { "epoch": 17.356847512843004, "grad_norm": 13.114164352416992, "learning_rate": 5.227137690853795e-07, "loss": 0.4234, "num_input_tokens_seen": 168457696, "step": 138525 }, { "epoch": 17.357474000751786, "grad_norm": 15.33033275604248, "learning_rate": 5.224704275387105e-07, "loss": 0.5014, "num_input_tokens_seen": 168463744, "step": 138530 }, { "epoch": 17.35810048866057, "grad_norm": 3.8588736057281494, "learning_rate": 5.222271395246714e-07, "loss": 0.4176, "num_input_tokens_seen": 168469728, "step": 138535 }, { "epoch": 17.35872697656935, "grad_norm": 15.589414596557617, "learning_rate": 5.219839050461717e-07, "loss": 0.4729, "num_input_tokens_seen": 168476064, "step": 138540 }, { "epoch": 17.359353464478136, "grad_norm": 4.079567909240723, "learning_rate": 5.217407241061195e-07, "loss": 0.4042, "num_input_tokens_seen": 168481984, "step": 138545 }, { "epoch": 17.35997995238692, "grad_norm": 5.198920249938965, "learning_rate": 5.214975967074226e-07, "loss": 0.4688, "num_input_tokens_seen": 168488192, "step": 138550 }, { "epoch": 17.3606064402957, "grad_norm": 5.625753879547119, "learning_rate": 5.212545228529864e-07, "loss": 0.4303, "num_input_tokens_seen": 168493920, "step": 138555 }, { "epoch": 17.361232928204487, "grad_norm": 10.116607666015625, "learning_rate": 5.21011502545718e-07, "loss": 0.4186, "num_input_tokens_seen": 168500064, "step": 138560 }, { "epoch": 17.36185941611327, "grad_norm": 2.974653720855713, "learning_rate": 5.207685357885239e-07, "loss": 0.4055, "num_input_tokens_seen": 168505888, "step": 138565 }, { "epoch": 17.36248590402205, "grad_norm": 12.617728233337402, "learning_rate": 5.205256225843064e-07, "loss": 0.4403, "num_input_tokens_seen": 168511744, "step": 138570 }, { "epoch": 17.363112391930837, "grad_norm": 8.34576416015625, "learning_rate": 5.202827629359724e-07, "loss": 0.451, "num_input_tokens_seen": 168517600, "step": 138575 }, { "epoch": 17.36373887983962, "grad_norm": 13.068452835083008, "learning_rate": 5.200399568464237e-07, "loss": 0.4699, "num_input_tokens_seen": 168523744, "step": 138580 }, { "epoch": 17.364365367748402, "grad_norm": 19.673416137695312, "learning_rate": 5.197972043185645e-07, "loss": 0.4885, "num_input_tokens_seen": 168529952, "step": 138585 }, { "epoch": 17.364991855657188, "grad_norm": 3.8191843032836914, "learning_rate": 5.195545053552958e-07, "loss": 0.5412, "num_input_tokens_seen": 168536512, "step": 138590 }, { "epoch": 17.36561834356597, "grad_norm": 6.0973801612854, "learning_rate": 5.193118599595204e-07, "loss": 0.4113, "num_input_tokens_seen": 168542496, "step": 138595 }, { "epoch": 17.366244831474752, "grad_norm": 12.616753578186035, "learning_rate": 5.190692681341386e-07, "loss": 0.4166, "num_input_tokens_seen": 168548960, "step": 138600 }, { "epoch": 17.366871319383534, "grad_norm": 4.416622161865234, "learning_rate": 5.188267298820515e-07, "loss": 0.3925, "num_input_tokens_seen": 168555040, "step": 138605 }, { "epoch": 17.36749780729232, "grad_norm": 20.45250129699707, "learning_rate": 5.185842452061574e-07, "loss": 0.4799, "num_input_tokens_seen": 168561120, "step": 138610 }, { "epoch": 17.368124295201103, "grad_norm": 4.088669300079346, "learning_rate": 5.183418141093577e-07, "loss": 0.445, "num_input_tokens_seen": 168567264, "step": 138615 }, { "epoch": 17.368750783109885, "grad_norm": 3.2543623447418213, "learning_rate": 5.180994365945486e-07, "loss": 0.4152, "num_input_tokens_seen": 168573344, "step": 138620 }, { "epoch": 17.36937727101867, "grad_norm": 3.843242645263672, "learning_rate": 5.178571126646286e-07, "loss": 0.3946, "num_input_tokens_seen": 168579264, "step": 138625 }, { "epoch": 17.370003758927453, "grad_norm": 14.9893217086792, "learning_rate": 5.176148423224963e-07, "loss": 0.449, "num_input_tokens_seen": 168585472, "step": 138630 }, { "epoch": 17.370630246836235, "grad_norm": 8.613338470458984, "learning_rate": 5.173726255710459e-07, "loss": 0.4248, "num_input_tokens_seen": 168591584, "step": 138635 }, { "epoch": 17.37125673474502, "grad_norm": 8.231928825378418, "learning_rate": 5.171304624131751e-07, "loss": 0.5151, "num_input_tokens_seen": 168597856, "step": 138640 }, { "epoch": 17.371883222653803, "grad_norm": 27.120269775390625, "learning_rate": 5.168883528517793e-07, "loss": 0.4324, "num_input_tokens_seen": 168604352, "step": 138645 }, { "epoch": 17.372509710562586, "grad_norm": 10.201308250427246, "learning_rate": 5.166462968897518e-07, "loss": 0.4408, "num_input_tokens_seen": 168610272, "step": 138650 }, { "epoch": 17.373136198471368, "grad_norm": 39.30794143676758, "learning_rate": 5.164042945299869e-07, "loss": 0.7068, "num_input_tokens_seen": 168616416, "step": 138655 }, { "epoch": 17.373762686380154, "grad_norm": 10.953009605407715, "learning_rate": 5.16162345775379e-07, "loss": 0.4322, "num_input_tokens_seen": 168622080, "step": 138660 }, { "epoch": 17.374389174288936, "grad_norm": 5.735638618469238, "learning_rate": 5.159204506288195e-07, "loss": 0.4462, "num_input_tokens_seen": 168627872, "step": 138665 }, { "epoch": 17.37501566219772, "grad_norm": 4.1817402839660645, "learning_rate": 5.15678609093202e-07, "loss": 0.4381, "num_input_tokens_seen": 168634240, "step": 138670 }, { "epoch": 17.375642150106504, "grad_norm": 7.244335174560547, "learning_rate": 5.154368211714156e-07, "loss": 0.4182, "num_input_tokens_seen": 168639840, "step": 138675 }, { "epoch": 17.376268638015286, "grad_norm": 17.282365798950195, "learning_rate": 5.151950868663524e-07, "loss": 0.4737, "num_input_tokens_seen": 168645920, "step": 138680 }, { "epoch": 17.37689512592407, "grad_norm": 5.430160999298096, "learning_rate": 5.149534061809036e-07, "loss": 0.4413, "num_input_tokens_seen": 168652128, "step": 138685 }, { "epoch": 17.377521613832855, "grad_norm": 21.224504470825195, "learning_rate": 5.147117791179574e-07, "loss": 0.4543, "num_input_tokens_seen": 168657920, "step": 138690 }, { "epoch": 17.378148101741637, "grad_norm": 13.834884643554688, "learning_rate": 5.14470205680403e-07, "loss": 0.4259, "num_input_tokens_seen": 168664096, "step": 138695 }, { "epoch": 17.37877458965042, "grad_norm": 5.469505786895752, "learning_rate": 5.142286858711276e-07, "loss": 0.5761, "num_input_tokens_seen": 168670528, "step": 138700 }, { "epoch": 17.3794010775592, "grad_norm": 4.554914474487305, "learning_rate": 5.139872196930213e-07, "loss": 0.4019, "num_input_tokens_seen": 168677024, "step": 138705 }, { "epoch": 17.380027565467987, "grad_norm": 5.174314975738525, "learning_rate": 5.137458071489676e-07, "loss": 0.4306, "num_input_tokens_seen": 168683232, "step": 138710 }, { "epoch": 17.38065405337677, "grad_norm": 12.241544723510742, "learning_rate": 5.13504448241856e-07, "loss": 0.5245, "num_input_tokens_seen": 168689280, "step": 138715 }, { "epoch": 17.381280541285552, "grad_norm": 8.539946556091309, "learning_rate": 5.132631429745699e-07, "loss": 0.4357, "num_input_tokens_seen": 168695648, "step": 138720 }, { "epoch": 17.381907029194338, "grad_norm": 27.16435432434082, "learning_rate": 5.130218913499962e-07, "loss": 0.5689, "num_input_tokens_seen": 168701760, "step": 138725 }, { "epoch": 17.38253351710312, "grad_norm": 8.623309135437012, "learning_rate": 5.127806933710172e-07, "loss": 0.4037, "num_input_tokens_seen": 168707872, "step": 138730 }, { "epoch": 17.383160005011902, "grad_norm": 17.014076232910156, "learning_rate": 5.125395490405183e-07, "loss": 0.4813, "num_input_tokens_seen": 168714048, "step": 138735 }, { "epoch": 17.383786492920688, "grad_norm": 18.053442001342773, "learning_rate": 5.122984583613816e-07, "loss": 0.4762, "num_input_tokens_seen": 168720192, "step": 138740 }, { "epoch": 17.38441298082947, "grad_norm": 5.982721328735352, "learning_rate": 5.120574213364893e-07, "loss": 0.3944, "num_input_tokens_seen": 168726656, "step": 138745 }, { "epoch": 17.385039468738253, "grad_norm": 6.0692853927612305, "learning_rate": 5.118164379687241e-07, "loss": 0.4497, "num_input_tokens_seen": 168732768, "step": 138750 }, { "epoch": 17.38566595664704, "grad_norm": 23.8488712310791, "learning_rate": 5.115755082609664e-07, "loss": 0.4884, "num_input_tokens_seen": 168739040, "step": 138755 }, { "epoch": 17.38629244455582, "grad_norm": 3.853210210800171, "learning_rate": 5.113346322160984e-07, "loss": 0.3933, "num_input_tokens_seen": 168745344, "step": 138760 }, { "epoch": 17.386918932464603, "grad_norm": 13.896759033203125, "learning_rate": 5.110938098369978e-07, "loss": 0.4615, "num_input_tokens_seen": 168751104, "step": 138765 }, { "epoch": 17.387545420373385, "grad_norm": 3.84907865524292, "learning_rate": 5.10853041126546e-07, "loss": 0.4193, "num_input_tokens_seen": 168756960, "step": 138770 }, { "epoch": 17.38817190828217, "grad_norm": 7.916423320770264, "learning_rate": 5.10612326087619e-07, "loss": 0.4618, "num_input_tokens_seen": 168763232, "step": 138775 }, { "epoch": 17.388798396190953, "grad_norm": 25.104480743408203, "learning_rate": 5.103716647230971e-07, "loss": 0.5047, "num_input_tokens_seen": 168769376, "step": 138780 }, { "epoch": 17.389424884099736, "grad_norm": 3.997704029083252, "learning_rate": 5.101310570358559e-07, "loss": 0.3884, "num_input_tokens_seen": 168775456, "step": 138785 }, { "epoch": 17.39005137200852, "grad_norm": 13.182596206665039, "learning_rate": 5.098905030287738e-07, "loss": 0.486, "num_input_tokens_seen": 168781888, "step": 138790 }, { "epoch": 17.390677859917304, "grad_norm": 4.137272834777832, "learning_rate": 5.096500027047246e-07, "loss": 0.4069, "num_input_tokens_seen": 168788128, "step": 138795 }, { "epoch": 17.391304347826086, "grad_norm": 10.03747844696045, "learning_rate": 5.094095560665857e-07, "loss": 0.42, "num_input_tokens_seen": 168794400, "step": 138800 }, { "epoch": 17.391930835734872, "grad_norm": 5.767351150512695, "learning_rate": 5.091691631172307e-07, "loss": 0.3968, "num_input_tokens_seen": 168800352, "step": 138805 }, { "epoch": 17.392557323643654, "grad_norm": 16.96805763244629, "learning_rate": 5.089288238595335e-07, "loss": 0.4304, "num_input_tokens_seen": 168806592, "step": 138810 }, { "epoch": 17.393183811552436, "grad_norm": 8.430974960327148, "learning_rate": 5.086885382963691e-07, "loss": 0.4435, "num_input_tokens_seen": 168812832, "step": 138815 }, { "epoch": 17.39381029946122, "grad_norm": 22.34101676940918, "learning_rate": 5.084483064306083e-07, "loss": 0.4265, "num_input_tokens_seen": 168818784, "step": 138820 }, { "epoch": 17.394436787370005, "grad_norm": 7.788459300994873, "learning_rate": 5.082081282651258e-07, "loss": 0.4892, "num_input_tokens_seen": 168824736, "step": 138825 }, { "epoch": 17.395063275278787, "grad_norm": 20.07832145690918, "learning_rate": 5.079680038027901e-07, "loss": 0.481, "num_input_tokens_seen": 168831200, "step": 138830 }, { "epoch": 17.39568976318757, "grad_norm": 3.6816046237945557, "learning_rate": 5.077279330464746e-07, "loss": 0.4558, "num_input_tokens_seen": 168837184, "step": 138835 }, { "epoch": 17.396316251096355, "grad_norm": 7.254002571105957, "learning_rate": 5.074879159990475e-07, "loss": 0.4436, "num_input_tokens_seen": 168843424, "step": 138840 }, { "epoch": 17.396942739005137, "grad_norm": 15.889699935913086, "learning_rate": 5.0724795266338e-07, "loss": 0.4704, "num_input_tokens_seen": 168849728, "step": 138845 }, { "epoch": 17.39756922691392, "grad_norm": 24.931360244750977, "learning_rate": 5.070080430423413e-07, "loss": 0.6195, "num_input_tokens_seen": 168855552, "step": 138850 }, { "epoch": 17.398195714822705, "grad_norm": 12.942814826965332, "learning_rate": 5.067681871387975e-07, "loss": 0.4182, "num_input_tokens_seen": 168861664, "step": 138855 }, { "epoch": 17.398822202731488, "grad_norm": 10.611530303955078, "learning_rate": 5.06528384955618e-07, "loss": 0.4597, "num_input_tokens_seen": 168867680, "step": 138860 }, { "epoch": 17.39944869064027, "grad_norm": 13.451004981994629, "learning_rate": 5.062886364956698e-07, "loss": 0.4171, "num_input_tokens_seen": 168873696, "step": 138865 }, { "epoch": 17.400075178549056, "grad_norm": 6.937234878540039, "learning_rate": 5.060489417618197e-07, "loss": 0.4183, "num_input_tokens_seen": 168880000, "step": 138870 }, { "epoch": 17.400701666457838, "grad_norm": 6.935276031494141, "learning_rate": 5.05809300756932e-07, "loss": 0.4334, "num_input_tokens_seen": 168886304, "step": 138875 }, { "epoch": 17.40132815436662, "grad_norm": 4.556704044342041, "learning_rate": 5.055697134838738e-07, "loss": 0.4763, "num_input_tokens_seen": 168892416, "step": 138880 }, { "epoch": 17.401954642275403, "grad_norm": 17.71820831298828, "learning_rate": 5.053301799455074e-07, "loss": 0.4958, "num_input_tokens_seen": 168898464, "step": 138885 }, { "epoch": 17.40258113018419, "grad_norm": 6.710416316986084, "learning_rate": 5.050907001446981e-07, "loss": 0.3998, "num_input_tokens_seen": 168904800, "step": 138890 }, { "epoch": 17.40320761809297, "grad_norm": 6.470067501068115, "learning_rate": 5.048512740843081e-07, "loss": 0.4044, "num_input_tokens_seen": 168910880, "step": 138895 }, { "epoch": 17.403834106001753, "grad_norm": 4.116293430328369, "learning_rate": 5.046119017672008e-07, "loss": 0.4599, "num_input_tokens_seen": 168916960, "step": 138900 }, { "epoch": 17.40446059391054, "grad_norm": 6.862153053283691, "learning_rate": 5.043725831962371e-07, "loss": 0.4191, "num_input_tokens_seen": 168922912, "step": 138905 }, { "epoch": 17.40508708181932, "grad_norm": 13.099180221557617, "learning_rate": 5.041333183742803e-07, "loss": 0.4713, "num_input_tokens_seen": 168928992, "step": 138910 }, { "epoch": 17.405713569728103, "grad_norm": 5.02878475189209, "learning_rate": 5.038941073041881e-07, "loss": 0.4246, "num_input_tokens_seen": 168935264, "step": 138915 }, { "epoch": 17.40634005763689, "grad_norm": 22.7679386138916, "learning_rate": 5.036549499888233e-07, "loss": 0.4943, "num_input_tokens_seen": 168941216, "step": 138920 }, { "epoch": 17.40696654554567, "grad_norm": 3.943669080734253, "learning_rate": 5.034158464310424e-07, "loss": 0.5519, "num_input_tokens_seen": 168947328, "step": 138925 }, { "epoch": 17.407593033454454, "grad_norm": 5.109848499298096, "learning_rate": 5.031767966337059e-07, "loss": 0.4133, "num_input_tokens_seen": 168953312, "step": 138930 }, { "epoch": 17.408219521363236, "grad_norm": 4.652463436126709, "learning_rate": 5.029378005996721e-07, "loss": 0.4311, "num_input_tokens_seen": 168959520, "step": 138935 }, { "epoch": 17.408846009272022, "grad_norm": 5.872360706329346, "learning_rate": 5.026988583317977e-07, "loss": 0.4439, "num_input_tokens_seen": 168965984, "step": 138940 }, { "epoch": 17.409472497180804, "grad_norm": 4.303594589233398, "learning_rate": 5.024599698329397e-07, "loss": 0.3832, "num_input_tokens_seen": 168972032, "step": 138945 }, { "epoch": 17.410098985089586, "grad_norm": 10.667292594909668, "learning_rate": 5.022211351059536e-07, "loss": 0.4104, "num_input_tokens_seen": 168978112, "step": 138950 }, { "epoch": 17.410725472998372, "grad_norm": 6.019985198974609, "learning_rate": 5.01982354153695e-07, "loss": 0.4916, "num_input_tokens_seen": 168984064, "step": 138955 }, { "epoch": 17.411351960907155, "grad_norm": 8.921636581420898, "learning_rate": 5.017436269790194e-07, "loss": 0.4713, "num_input_tokens_seen": 168989728, "step": 138960 }, { "epoch": 17.411978448815937, "grad_norm": 11.49542236328125, "learning_rate": 5.015049535847815e-07, "loss": 0.4855, "num_input_tokens_seen": 168995104, "step": 138965 }, { "epoch": 17.412604936724723, "grad_norm": 6.811870574951172, "learning_rate": 5.012663339738333e-07, "loss": 0.4179, "num_input_tokens_seen": 169001216, "step": 138970 }, { "epoch": 17.413231424633505, "grad_norm": 23.394981384277344, "learning_rate": 5.010277681490289e-07, "loss": 0.4285, "num_input_tokens_seen": 169007360, "step": 138975 }, { "epoch": 17.413857912542287, "grad_norm": 5.683957099914551, "learning_rate": 5.007892561132194e-07, "loss": 0.4842, "num_input_tokens_seen": 169013472, "step": 138980 }, { "epoch": 17.414484400451073, "grad_norm": 6.667746543884277, "learning_rate": 5.005507978692581e-07, "loss": 0.4594, "num_input_tokens_seen": 169019712, "step": 138985 }, { "epoch": 17.415110888359855, "grad_norm": 7.904945373535156, "learning_rate": 5.003123934199938e-07, "loss": 0.4143, "num_input_tokens_seen": 169025728, "step": 138990 }, { "epoch": 17.415737376268638, "grad_norm": 3.762791395187378, "learning_rate": 5.000740427682777e-07, "loss": 0.4613, "num_input_tokens_seen": 169032000, "step": 138995 }, { "epoch": 17.41636386417742, "grad_norm": 8.087839126586914, "learning_rate": 4.998357459169612e-07, "loss": 0.5029, "num_input_tokens_seen": 169038112, "step": 139000 }, { "epoch": 17.416990352086206, "grad_norm": 6.349715232849121, "learning_rate": 4.995975028688909e-07, "loss": 0.4083, "num_input_tokens_seen": 169044032, "step": 139005 }, { "epoch": 17.417616839994988, "grad_norm": 7.307320594787598, "learning_rate": 4.993593136269171e-07, "loss": 0.4112, "num_input_tokens_seen": 169050176, "step": 139010 }, { "epoch": 17.41824332790377, "grad_norm": 13.403851509094238, "learning_rate": 4.991211781938854e-07, "loss": 0.4366, "num_input_tokens_seen": 169056320, "step": 139015 }, { "epoch": 17.418869815812556, "grad_norm": 6.572590351104736, "learning_rate": 4.988830965726449e-07, "loss": 0.4273, "num_input_tokens_seen": 169062080, "step": 139020 }, { "epoch": 17.41949630372134, "grad_norm": 18.467079162597656, "learning_rate": 4.98645068766041e-07, "loss": 0.4372, "num_input_tokens_seen": 169068064, "step": 139025 }, { "epoch": 17.42012279163012, "grad_norm": 4.416502475738525, "learning_rate": 4.984070947769204e-07, "loss": 0.4352, "num_input_tokens_seen": 169074880, "step": 139030 }, { "epoch": 17.420749279538907, "grad_norm": 12.180014610290527, "learning_rate": 4.981691746081268e-07, "loss": 0.4117, "num_input_tokens_seen": 169081024, "step": 139035 }, { "epoch": 17.42137576744769, "grad_norm": 4.615877151489258, "learning_rate": 4.97931308262507e-07, "loss": 0.4266, "num_input_tokens_seen": 169086688, "step": 139040 }, { "epoch": 17.42200225535647, "grad_norm": 13.247599601745605, "learning_rate": 4.976934957429019e-07, "loss": 0.4554, "num_input_tokens_seen": 169092608, "step": 139045 }, { "epoch": 17.422628743265253, "grad_norm": 4.263489246368408, "learning_rate": 4.974557370521571e-07, "loss": 0.4304, "num_input_tokens_seen": 169098752, "step": 139050 }, { "epoch": 17.42325523117404, "grad_norm": 6.857271194458008, "learning_rate": 4.972180321931142e-07, "loss": 0.4343, "num_input_tokens_seen": 169104992, "step": 139055 }, { "epoch": 17.42388171908282, "grad_norm": 9.57459545135498, "learning_rate": 4.969803811686158e-07, "loss": 0.4553, "num_input_tokens_seen": 169111040, "step": 139060 }, { "epoch": 17.424508206991604, "grad_norm": 10.825801849365234, "learning_rate": 4.967427839815025e-07, "loss": 0.462, "num_input_tokens_seen": 169116992, "step": 139065 }, { "epoch": 17.42513469490039, "grad_norm": 5.775276184082031, "learning_rate": 4.965052406346155e-07, "loss": 0.4444, "num_input_tokens_seen": 169122912, "step": 139070 }, { "epoch": 17.425761182809172, "grad_norm": 8.206108093261719, "learning_rate": 4.962677511307956e-07, "loss": 0.4723, "num_input_tokens_seen": 169128768, "step": 139075 }, { "epoch": 17.426387670717954, "grad_norm": 4.819430351257324, "learning_rate": 4.960303154728801e-07, "loss": 0.4231, "num_input_tokens_seen": 169134304, "step": 139080 }, { "epoch": 17.42701415862674, "grad_norm": 7.317779064178467, "learning_rate": 4.957929336637096e-07, "loss": 0.4089, "num_input_tokens_seen": 169140160, "step": 139085 }, { "epoch": 17.427640646535522, "grad_norm": 5.629814147949219, "learning_rate": 4.955556057061211e-07, "loss": 0.4337, "num_input_tokens_seen": 169146400, "step": 139090 }, { "epoch": 17.428267134444305, "grad_norm": 12.454715728759766, "learning_rate": 4.95318331602953e-07, "loss": 0.4726, "num_input_tokens_seen": 169152448, "step": 139095 }, { "epoch": 17.428893622353087, "grad_norm": 10.685707092285156, "learning_rate": 4.950811113570408e-07, "loss": 0.4429, "num_input_tokens_seen": 169158624, "step": 139100 }, { "epoch": 17.429520110261873, "grad_norm": 5.582054615020752, "learning_rate": 4.948439449712228e-07, "loss": 0.4153, "num_input_tokens_seen": 169164800, "step": 139105 }, { "epoch": 17.430146598170655, "grad_norm": 11.868484497070312, "learning_rate": 4.946068324483317e-07, "loss": 0.4474, "num_input_tokens_seen": 169171136, "step": 139110 }, { "epoch": 17.430773086079437, "grad_norm": 4.67863655090332, "learning_rate": 4.94369773791204e-07, "loss": 0.5219, "num_input_tokens_seen": 169177504, "step": 139115 }, { "epoch": 17.431399573988223, "grad_norm": 18.189668655395508, "learning_rate": 4.941327690026749e-07, "loss": 0.4507, "num_input_tokens_seen": 169183520, "step": 139120 }, { "epoch": 17.432026061897005, "grad_norm": 8.763434410095215, "learning_rate": 4.938958180855752e-07, "loss": 0.4172, "num_input_tokens_seen": 169189632, "step": 139125 }, { "epoch": 17.432652549805788, "grad_norm": 21.39536476135254, "learning_rate": 4.936589210427412e-07, "loss": 0.4839, "num_input_tokens_seen": 169195712, "step": 139130 }, { "epoch": 17.433279037714573, "grad_norm": 5.547187805175781, "learning_rate": 4.934220778770022e-07, "loss": 0.4018, "num_input_tokens_seen": 169201216, "step": 139135 }, { "epoch": 17.433905525623356, "grad_norm": 7.105356693267822, "learning_rate": 4.931852885911926e-07, "loss": 0.3659, "num_input_tokens_seen": 169207232, "step": 139140 }, { "epoch": 17.434532013532138, "grad_norm": 6.642939567565918, "learning_rate": 4.929485531881407e-07, "loss": 0.4125, "num_input_tokens_seen": 169213152, "step": 139145 }, { "epoch": 17.435158501440924, "grad_norm": 4.753735065460205, "learning_rate": 4.927118716706791e-07, "loss": 0.4696, "num_input_tokens_seen": 169219104, "step": 139150 }, { "epoch": 17.435784989349706, "grad_norm": 19.038349151611328, "learning_rate": 4.924752440416353e-07, "loss": 0.4725, "num_input_tokens_seen": 169225376, "step": 139155 }, { "epoch": 17.43641147725849, "grad_norm": 3.513927698135376, "learning_rate": 4.9223867030384e-07, "loss": 0.5062, "num_input_tokens_seen": 169231776, "step": 139160 }, { "epoch": 17.43703796516727, "grad_norm": 4.573476314544678, "learning_rate": 4.920021504601219e-07, "loss": 0.4539, "num_input_tokens_seen": 169238080, "step": 139165 }, { "epoch": 17.437664453076057, "grad_norm": 11.772775650024414, "learning_rate": 4.917656845133085e-07, "loss": 0.4266, "num_input_tokens_seen": 169244256, "step": 139170 }, { "epoch": 17.43829094098484, "grad_norm": 8.996089935302734, "learning_rate": 4.915292724662257e-07, "loss": 0.459, "num_input_tokens_seen": 169250496, "step": 139175 }, { "epoch": 17.43891742889362, "grad_norm": 6.280040264129639, "learning_rate": 4.91292914321701e-07, "loss": 0.4277, "num_input_tokens_seen": 169256224, "step": 139180 }, { "epoch": 17.439543916802407, "grad_norm": 26.46794319152832, "learning_rate": 4.910566100825615e-07, "loss": 0.5417, "num_input_tokens_seen": 169262304, "step": 139185 }, { "epoch": 17.44017040471119, "grad_norm": 4.026333332061768, "learning_rate": 4.908203597516298e-07, "loss": 0.5348, "num_input_tokens_seen": 169268544, "step": 139190 }, { "epoch": 17.44079689261997, "grad_norm": 6.8882060050964355, "learning_rate": 4.905841633317332e-07, "loss": 0.3882, "num_input_tokens_seen": 169274624, "step": 139195 }, { "epoch": 17.441423380528757, "grad_norm": 7.944040298461914, "learning_rate": 4.903480208256928e-07, "loss": 0.4124, "num_input_tokens_seen": 169280608, "step": 139200 }, { "epoch": 17.44204986843754, "grad_norm": 3.0094003677368164, "learning_rate": 4.901119322363345e-07, "loss": 0.4128, "num_input_tokens_seen": 169286784, "step": 139205 }, { "epoch": 17.442676356346322, "grad_norm": 6.802610397338867, "learning_rate": 4.898758975664786e-07, "loss": 0.4375, "num_input_tokens_seen": 169292736, "step": 139210 }, { "epoch": 17.443302844255104, "grad_norm": 4.194633483886719, "learning_rate": 4.896399168189498e-07, "loss": 0.4454, "num_input_tokens_seen": 169298528, "step": 139215 }, { "epoch": 17.44392933216389, "grad_norm": 10.212031364440918, "learning_rate": 4.894039899965663e-07, "loss": 0.3973, "num_input_tokens_seen": 169304736, "step": 139220 }, { "epoch": 17.444555820072672, "grad_norm": 9.140731811523438, "learning_rate": 4.891681171021517e-07, "loss": 0.4315, "num_input_tokens_seen": 169311104, "step": 139225 }, { "epoch": 17.445182307981455, "grad_norm": 22.628061294555664, "learning_rate": 4.889322981385236e-07, "loss": 0.4634, "num_input_tokens_seen": 169316992, "step": 139230 }, { "epoch": 17.44580879589024, "grad_norm": 2.878253698348999, "learning_rate": 4.886965331085025e-07, "loss": 0.4423, "num_input_tokens_seen": 169323168, "step": 139235 }, { "epoch": 17.446435283799023, "grad_norm": 4.2958784103393555, "learning_rate": 4.884608220149085e-07, "loss": 0.4087, "num_input_tokens_seen": 169329152, "step": 139240 }, { "epoch": 17.447061771707805, "grad_norm": 20.170175552368164, "learning_rate": 4.882251648605574e-07, "loss": 0.4367, "num_input_tokens_seen": 169335232, "step": 139245 }, { "epoch": 17.44768825961659, "grad_norm": 31.179201126098633, "learning_rate": 4.879895616482683e-07, "loss": 0.5434, "num_input_tokens_seen": 169340800, "step": 139250 }, { "epoch": 17.448314747525373, "grad_norm": 5.829073905944824, "learning_rate": 4.877540123808572e-07, "loss": 0.4678, "num_input_tokens_seen": 169346304, "step": 139255 }, { "epoch": 17.448941235434155, "grad_norm": 3.188910722732544, "learning_rate": 4.875185170611401e-07, "loss": 0.4014, "num_input_tokens_seen": 169352192, "step": 139260 }, { "epoch": 17.44956772334294, "grad_norm": 16.045129776000977, "learning_rate": 4.872830756919344e-07, "loss": 0.4374, "num_input_tokens_seen": 169357824, "step": 139265 }, { "epoch": 17.450194211251723, "grad_norm": 4.66394567489624, "learning_rate": 4.870476882760522e-07, "loss": 0.3965, "num_input_tokens_seen": 169364160, "step": 139270 }, { "epoch": 17.450820699160506, "grad_norm": 10.385293960571289, "learning_rate": 4.868123548163095e-07, "loss": 0.4093, "num_input_tokens_seen": 169370592, "step": 139275 }, { "epoch": 17.451447187069288, "grad_norm": 4.556210517883301, "learning_rate": 4.865770753155202e-07, "loss": 0.5112, "num_input_tokens_seen": 169376768, "step": 139280 }, { "epoch": 17.452073674978074, "grad_norm": 28.32125473022461, "learning_rate": 4.86341849776496e-07, "loss": 0.5362, "num_input_tokens_seen": 169382528, "step": 139285 }, { "epoch": 17.452700162886856, "grad_norm": 4.079992771148682, "learning_rate": 4.861066782020507e-07, "loss": 0.4685, "num_input_tokens_seen": 169388704, "step": 139290 }, { "epoch": 17.45332665079564, "grad_norm": 5.327644348144531, "learning_rate": 4.858715605949943e-07, "loss": 0.4433, "num_input_tokens_seen": 169394880, "step": 139295 }, { "epoch": 17.453953138704424, "grad_norm": 8.017544746398926, "learning_rate": 4.856364969581385e-07, "loss": 0.4282, "num_input_tokens_seen": 169401088, "step": 139300 }, { "epoch": 17.454579626613206, "grad_norm": 3.9201745986938477, "learning_rate": 4.854014872942948e-07, "loss": 0.6457, "num_input_tokens_seen": 169407168, "step": 139305 }, { "epoch": 17.45520611452199, "grad_norm": 16.736881256103516, "learning_rate": 4.851665316062709e-07, "loss": 0.4519, "num_input_tokens_seen": 169413344, "step": 139310 }, { "epoch": 17.455832602430775, "grad_norm": 14.93009090423584, "learning_rate": 4.849316298968782e-07, "loss": 0.422, "num_input_tokens_seen": 169419616, "step": 139315 }, { "epoch": 17.456459090339557, "grad_norm": 6.472033500671387, "learning_rate": 4.846967821689225e-07, "loss": 0.4274, "num_input_tokens_seen": 169426048, "step": 139320 }, { "epoch": 17.45708557824834, "grad_norm": 15.711514472961426, "learning_rate": 4.844619884252144e-07, "loss": 0.4435, "num_input_tokens_seen": 169431936, "step": 139325 }, { "epoch": 17.45771206615712, "grad_norm": 5.843460559844971, "learning_rate": 4.84227248668559e-07, "loss": 0.4553, "num_input_tokens_seen": 169438080, "step": 139330 }, { "epoch": 17.458338554065907, "grad_norm": 24.648563385009766, "learning_rate": 4.839925629017638e-07, "loss": 0.4907, "num_input_tokens_seen": 169444032, "step": 139335 }, { "epoch": 17.45896504197469, "grad_norm": 22.48635482788086, "learning_rate": 4.837579311276341e-07, "loss": 0.5239, "num_input_tokens_seen": 169450144, "step": 139340 }, { "epoch": 17.459591529883472, "grad_norm": 5.181106090545654, "learning_rate": 4.835233533489758e-07, "loss": 0.4286, "num_input_tokens_seen": 169456192, "step": 139345 }, { "epoch": 17.460218017792258, "grad_norm": 5.4520158767700195, "learning_rate": 4.832888295685923e-07, "loss": 0.4273, "num_input_tokens_seen": 169462080, "step": 139350 }, { "epoch": 17.46084450570104, "grad_norm": 5.600273132324219, "learning_rate": 4.830543597892889e-07, "loss": 0.4147, "num_input_tokens_seen": 169468448, "step": 139355 }, { "epoch": 17.461470993609822, "grad_norm": 3.6333184242248535, "learning_rate": 4.828199440138681e-07, "loss": 0.429, "num_input_tokens_seen": 169474688, "step": 139360 }, { "epoch": 17.462097481518608, "grad_norm": 25.08538055419922, "learning_rate": 4.82585582245132e-07, "loss": 0.4765, "num_input_tokens_seen": 169480608, "step": 139365 }, { "epoch": 17.46272396942739, "grad_norm": 14.60399055480957, "learning_rate": 4.823512744858833e-07, "loss": 0.4457, "num_input_tokens_seen": 169486720, "step": 139370 }, { "epoch": 17.463350457336173, "grad_norm": 10.73227310180664, "learning_rate": 4.821170207389236e-07, "loss": 0.5788, "num_input_tokens_seen": 169492704, "step": 139375 }, { "epoch": 17.46397694524496, "grad_norm": 11.402616500854492, "learning_rate": 4.818828210070542e-07, "loss": 0.5035, "num_input_tokens_seen": 169498816, "step": 139380 }, { "epoch": 17.46460343315374, "grad_norm": 4.737865924835205, "learning_rate": 4.816486752930738e-07, "loss": 0.451, "num_input_tokens_seen": 169504928, "step": 139385 }, { "epoch": 17.465229921062523, "grad_norm": 4.669229507446289, "learning_rate": 4.814145835997825e-07, "loss": 0.4004, "num_input_tokens_seen": 169511392, "step": 139390 }, { "epoch": 17.465856408971305, "grad_norm": 4.313527584075928, "learning_rate": 4.811805459299784e-07, "loss": 0.4586, "num_input_tokens_seen": 169517344, "step": 139395 }, { "epoch": 17.46648289688009, "grad_norm": 3.38301944732666, "learning_rate": 4.809465622864612e-07, "loss": 0.4836, "num_input_tokens_seen": 169523296, "step": 139400 }, { "epoch": 17.467109384788873, "grad_norm": 13.07672119140625, "learning_rate": 4.807126326720258e-07, "loss": 0.4637, "num_input_tokens_seen": 169529344, "step": 139405 }, { "epoch": 17.467735872697656, "grad_norm": 26.506481170654297, "learning_rate": 4.804787570894715e-07, "loss": 0.5446, "num_input_tokens_seen": 169535680, "step": 139410 }, { "epoch": 17.46836236060644, "grad_norm": 19.7903995513916, "learning_rate": 4.802449355415928e-07, "loss": 0.4883, "num_input_tokens_seen": 169541984, "step": 139415 }, { "epoch": 17.468988848515224, "grad_norm": 4.315293788909912, "learning_rate": 4.800111680311869e-07, "loss": 0.49, "num_input_tokens_seen": 169548480, "step": 139420 }, { "epoch": 17.469615336424006, "grad_norm": 6.793566703796387, "learning_rate": 4.797774545610468e-07, "loss": 0.4617, "num_input_tokens_seen": 169554848, "step": 139425 }, { "epoch": 17.470241824332792, "grad_norm": 7.875442028045654, "learning_rate": 4.795437951339671e-07, "loss": 0.4503, "num_input_tokens_seen": 169560768, "step": 139430 }, { "epoch": 17.470868312241574, "grad_norm": 3.3438849449157715, "learning_rate": 4.793101897527431e-07, "loss": 0.4515, "num_input_tokens_seen": 169566400, "step": 139435 }, { "epoch": 17.471494800150356, "grad_norm": 26.692626953125, "learning_rate": 4.790766384201656e-07, "loss": 0.455, "num_input_tokens_seen": 169572384, "step": 139440 }, { "epoch": 17.47212128805914, "grad_norm": 4.491485595703125, "learning_rate": 4.788431411390288e-07, "loss": 0.447, "num_input_tokens_seen": 169578528, "step": 139445 }, { "epoch": 17.472747775967925, "grad_norm": 4.421853542327881, "learning_rate": 4.78609697912123e-07, "loss": 0.4421, "num_input_tokens_seen": 169584864, "step": 139450 }, { "epoch": 17.473374263876707, "grad_norm": 3.1076438426971436, "learning_rate": 4.783763087422399e-07, "loss": 0.4132, "num_input_tokens_seen": 169590848, "step": 139455 }, { "epoch": 17.47400075178549, "grad_norm": 7.073635101318359, "learning_rate": 4.781429736321691e-07, "loss": 0.4074, "num_input_tokens_seen": 169596416, "step": 139460 }, { "epoch": 17.474627239694275, "grad_norm": 10.571656227111816, "learning_rate": 4.779096925847004e-07, "loss": 0.4536, "num_input_tokens_seen": 169602304, "step": 139465 }, { "epoch": 17.475253727603057, "grad_norm": 4.897950649261475, "learning_rate": 4.776764656026245e-07, "loss": 0.4555, "num_input_tokens_seen": 169608416, "step": 139470 }, { "epoch": 17.47588021551184, "grad_norm": 5.982226848602295, "learning_rate": 4.774432926887279e-07, "loss": 0.5127, "num_input_tokens_seen": 169614688, "step": 139475 }, { "epoch": 17.476506703420625, "grad_norm": 4.233229160308838, "learning_rate": 4.772101738457985e-07, "loss": 0.3652, "num_input_tokens_seen": 169621024, "step": 139480 }, { "epoch": 17.477133191329408, "grad_norm": 5.949040412902832, "learning_rate": 4.769771090766245e-07, "loss": 0.4292, "num_input_tokens_seen": 169627328, "step": 139485 }, { "epoch": 17.47775967923819, "grad_norm": 3.7590842247009277, "learning_rate": 4.767440983839927e-07, "loss": 0.4335, "num_input_tokens_seen": 169633312, "step": 139490 }, { "epoch": 17.478386167146976, "grad_norm": 16.13188934326172, "learning_rate": 4.7651114177068694e-07, "loss": 0.4451, "num_input_tokens_seen": 169639456, "step": 139495 }, { "epoch": 17.479012655055758, "grad_norm": 5.733701229095459, "learning_rate": 4.762782392394949e-07, "loss": 0.3966, "num_input_tokens_seen": 169645632, "step": 139500 }, { "epoch": 17.47963914296454, "grad_norm": 9.963505744934082, "learning_rate": 4.760453907931989e-07, "loss": 0.4108, "num_input_tokens_seen": 169651712, "step": 139505 }, { "epoch": 17.480265630873323, "grad_norm": 23.204421997070312, "learning_rate": 4.75812596434585e-07, "loss": 0.5519, "num_input_tokens_seen": 169657792, "step": 139510 }, { "epoch": 17.48089211878211, "grad_norm": 8.485516548156738, "learning_rate": 4.7557985616643375e-07, "loss": 0.4541, "num_input_tokens_seen": 169663616, "step": 139515 }, { "epoch": 17.48151860669089, "grad_norm": 8.473403930664062, "learning_rate": 4.753471699915302e-07, "loss": 0.4205, "num_input_tokens_seen": 169669376, "step": 139520 }, { "epoch": 17.482145094599673, "grad_norm": 21.20619010925293, "learning_rate": 4.7511453791265483e-07, "loss": 0.4405, "num_input_tokens_seen": 169675616, "step": 139525 }, { "epoch": 17.48277158250846, "grad_norm": 6.6012163162231445, "learning_rate": 4.748819599325899e-07, "loss": 0.4895, "num_input_tokens_seen": 169681280, "step": 139530 }, { "epoch": 17.48339807041724, "grad_norm": 9.622188568115234, "learning_rate": 4.746494360541154e-07, "loss": 0.5099, "num_input_tokens_seen": 169687520, "step": 139535 }, { "epoch": 17.484024558326023, "grad_norm": 5.099967956542969, "learning_rate": 4.744169662800119e-07, "loss": 0.4261, "num_input_tokens_seen": 169693312, "step": 139540 }, { "epoch": 17.48465104623481, "grad_norm": 13.70880126953125, "learning_rate": 4.741845506130577e-07, "loss": 0.4085, "num_input_tokens_seen": 169699456, "step": 139545 }, { "epoch": 17.48527753414359, "grad_norm": 19.224327087402344, "learning_rate": 4.7395218905603227e-07, "loss": 0.5112, "num_input_tokens_seen": 169705408, "step": 139550 }, { "epoch": 17.485904022052374, "grad_norm": 2.8764188289642334, "learning_rate": 4.73719881611715e-07, "loss": 0.4566, "num_input_tokens_seen": 169711520, "step": 139555 }, { "epoch": 17.486530509961156, "grad_norm": 6.668630123138428, "learning_rate": 4.7348762828288096e-07, "loss": 0.4595, "num_input_tokens_seen": 169717504, "step": 139560 }, { "epoch": 17.487156997869942, "grad_norm": 7.9365668296813965, "learning_rate": 4.732554290723085e-07, "loss": 0.3979, "num_input_tokens_seen": 169723712, "step": 139565 }, { "epoch": 17.487783485778724, "grad_norm": 6.260842800140381, "learning_rate": 4.7302328398277255e-07, "loss": 0.4606, "num_input_tokens_seen": 169729920, "step": 139570 }, { "epoch": 17.488409973687506, "grad_norm": 20.165428161621094, "learning_rate": 4.727911930170498e-07, "loss": 0.5221, "num_input_tokens_seen": 169736256, "step": 139575 }, { "epoch": 17.489036461596292, "grad_norm": 4.2652764320373535, "learning_rate": 4.7255915617791414e-07, "loss": 0.4231, "num_input_tokens_seen": 169742432, "step": 139580 }, { "epoch": 17.489662949505075, "grad_norm": 4.026089668273926, "learning_rate": 4.7232717346814114e-07, "loss": 0.4825, "num_input_tokens_seen": 169748576, "step": 139585 }, { "epoch": 17.490289437413857, "grad_norm": 17.04958152770996, "learning_rate": 4.720952448905025e-07, "loss": 0.4873, "num_input_tokens_seen": 169754592, "step": 139590 }, { "epoch": 17.490915925322643, "grad_norm": 5.785054683685303, "learning_rate": 4.718633704477732e-07, "loss": 0.4257, "num_input_tokens_seen": 169760640, "step": 139595 }, { "epoch": 17.491542413231425, "grad_norm": 4.007252216339111, "learning_rate": 4.716315501427232e-07, "loss": 0.4371, "num_input_tokens_seen": 169767008, "step": 139600 }, { "epoch": 17.492168901140207, "grad_norm": 4.294191360473633, "learning_rate": 4.7139978397812644e-07, "loss": 0.4505, "num_input_tokens_seen": 169772512, "step": 139605 }, { "epoch": 17.492795389048993, "grad_norm": 13.932452201843262, "learning_rate": 4.711680719567513e-07, "loss": 0.4892, "num_input_tokens_seen": 169778368, "step": 139610 }, { "epoch": 17.493421876957775, "grad_norm": 5.450967311859131, "learning_rate": 4.709364140813699e-07, "loss": 0.422, "num_input_tokens_seen": 169784448, "step": 139615 }, { "epoch": 17.494048364866558, "grad_norm": 9.172382354736328, "learning_rate": 4.7070481035475236e-07, "loss": 0.4085, "num_input_tokens_seen": 169790784, "step": 139620 }, { "epoch": 17.49467485277534, "grad_norm": 18.11790657043457, "learning_rate": 4.704732607796658e-07, "loss": 0.4962, "num_input_tokens_seen": 169796672, "step": 139625 }, { "epoch": 17.495301340684126, "grad_norm": 13.152188301086426, "learning_rate": 4.702417653588809e-07, "loss": 0.5474, "num_input_tokens_seen": 169802336, "step": 139630 }, { "epoch": 17.495927828592908, "grad_norm": 9.438603401184082, "learning_rate": 4.700103240951631e-07, "loss": 0.4768, "num_input_tokens_seen": 169808192, "step": 139635 }, { "epoch": 17.49655431650169, "grad_norm": 5.500075817108154, "learning_rate": 4.697789369912809e-07, "loss": 0.4208, "num_input_tokens_seen": 169814368, "step": 139640 }, { "epoch": 17.497180804410476, "grad_norm": 16.694067001342773, "learning_rate": 4.695476040499997e-07, "loss": 0.4531, "num_input_tokens_seen": 169819904, "step": 139645 }, { "epoch": 17.49780729231926, "grad_norm": 21.314651489257812, "learning_rate": 4.6931632527408735e-07, "loss": 0.4284, "num_input_tokens_seen": 169826208, "step": 139650 }, { "epoch": 17.49843378022804, "grad_norm": 12.529561996459961, "learning_rate": 4.6908510066630554e-07, "loss": 0.4575, "num_input_tokens_seen": 169832096, "step": 139655 }, { "epoch": 17.499060268136827, "grad_norm": 6.337967872619629, "learning_rate": 4.688539302294226e-07, "loss": 0.42, "num_input_tokens_seen": 169838336, "step": 139660 }, { "epoch": 17.49968675604561, "grad_norm": 22.356826782226562, "learning_rate": 4.686228139661991e-07, "loss": 0.4385, "num_input_tokens_seen": 169844512, "step": 139665 }, { "epoch": 17.50031324395439, "grad_norm": 14.74990463256836, "learning_rate": 4.683917518793996e-07, "loss": 0.4694, "num_input_tokens_seen": 169849920, "step": 139670 }, { "epoch": 17.500939731863173, "grad_norm": 11.279048919677734, "learning_rate": 4.681607439717867e-07, "loss": 0.4464, "num_input_tokens_seen": 169856224, "step": 139675 }, { "epoch": 17.50156621977196, "grad_norm": 3.888993978500366, "learning_rate": 4.679297902461233e-07, "loss": 0.4068, "num_input_tokens_seen": 169862496, "step": 139680 }, { "epoch": 17.50219270768074, "grad_norm": 4.776918411254883, "learning_rate": 4.6769889070516883e-07, "loss": 0.4181, "num_input_tokens_seen": 169868480, "step": 139685 }, { "epoch": 17.502819195589524, "grad_norm": 5.025388240814209, "learning_rate": 4.674680453516844e-07, "loss": 0.4643, "num_input_tokens_seen": 169874464, "step": 139690 }, { "epoch": 17.50344568349831, "grad_norm": 4.530566692352295, "learning_rate": 4.6723725418843125e-07, "loss": 0.4581, "num_input_tokens_seen": 169880640, "step": 139695 }, { "epoch": 17.504072171407092, "grad_norm": 4.221120357513428, "learning_rate": 4.670065172181665e-07, "loss": 0.4199, "num_input_tokens_seen": 169886656, "step": 139700 }, { "epoch": 17.504698659315874, "grad_norm": 8.243714332580566, "learning_rate": 4.6677583444365127e-07, "loss": 0.4166, "num_input_tokens_seen": 169893024, "step": 139705 }, { "epoch": 17.50532514722466, "grad_norm": 4.076972961425781, "learning_rate": 4.665452058676412e-07, "loss": 0.4318, "num_input_tokens_seen": 169899200, "step": 139710 }, { "epoch": 17.505951635133442, "grad_norm": 9.87552261352539, "learning_rate": 4.663146314928957e-07, "loss": 0.4188, "num_input_tokens_seen": 169905088, "step": 139715 }, { "epoch": 17.506578123042225, "grad_norm": 7.564526557922363, "learning_rate": 4.660841113221698e-07, "loss": 0.5276, "num_input_tokens_seen": 169911008, "step": 139720 }, { "epoch": 17.507204610951007, "grad_norm": 22.939586639404297, "learning_rate": 4.6585364535822076e-07, "loss": 0.4637, "num_input_tokens_seen": 169917120, "step": 139725 }, { "epoch": 17.507831098859793, "grad_norm": 9.24575424194336, "learning_rate": 4.6562323360380244e-07, "loss": 0.4755, "num_input_tokens_seen": 169923136, "step": 139730 }, { "epoch": 17.508457586768575, "grad_norm": 3.843472719192505, "learning_rate": 4.6539287606167106e-07, "loss": 0.433, "num_input_tokens_seen": 169929088, "step": 139735 }, { "epoch": 17.509084074677357, "grad_norm": 3.4840199947357178, "learning_rate": 4.65162572734581e-07, "loss": 0.4503, "num_input_tokens_seen": 169935360, "step": 139740 }, { "epoch": 17.509710562586143, "grad_norm": 4.6026763916015625, "learning_rate": 4.649323236252845e-07, "loss": 0.416, "num_input_tokens_seen": 169941408, "step": 139745 }, { "epoch": 17.510337050494925, "grad_norm": 11.538566589355469, "learning_rate": 4.647021287365355e-07, "loss": 0.4597, "num_input_tokens_seen": 169947872, "step": 139750 }, { "epoch": 17.510963538403708, "grad_norm": 22.56307029724121, "learning_rate": 4.6447198807108463e-07, "loss": 0.4926, "num_input_tokens_seen": 169953824, "step": 139755 }, { "epoch": 17.511590026312494, "grad_norm": 5.423969268798828, "learning_rate": 4.642419016316857e-07, "loss": 0.5277, "num_input_tokens_seen": 169959936, "step": 139760 }, { "epoch": 17.512216514221276, "grad_norm": 20.595827102661133, "learning_rate": 4.640118694210871e-07, "loss": 0.5127, "num_input_tokens_seen": 169966208, "step": 139765 }, { "epoch": 17.512843002130058, "grad_norm": 4.975620269775391, "learning_rate": 4.637818914420411e-07, "loss": 0.4954, "num_input_tokens_seen": 169972608, "step": 139770 }, { "epoch": 17.513469490038844, "grad_norm": 5.386746406555176, "learning_rate": 4.6355196769729603e-07, "loss": 0.4405, "num_input_tokens_seen": 169978688, "step": 139775 }, { "epoch": 17.514095977947626, "grad_norm": 4.537362098693848, "learning_rate": 4.6332209818960084e-07, "loss": 0.5217, "num_input_tokens_seen": 169984832, "step": 139780 }, { "epoch": 17.51472246585641, "grad_norm": 5.2612833976745605, "learning_rate": 4.6309228292170437e-07, "loss": 0.4586, "num_input_tokens_seen": 169990496, "step": 139785 }, { "epoch": 17.51534895376519, "grad_norm": 11.104134559631348, "learning_rate": 4.628625218963545e-07, "loss": 0.4641, "num_input_tokens_seen": 169996384, "step": 139790 }, { "epoch": 17.515975441673977, "grad_norm": 3.3108978271484375, "learning_rate": 4.626328151162973e-07, "loss": 0.4477, "num_input_tokens_seen": 170002272, "step": 139795 }, { "epoch": 17.51660192958276, "grad_norm": 12.656229019165039, "learning_rate": 4.624031625842795e-07, "loss": 0.5878, "num_input_tokens_seen": 170008160, "step": 139800 }, { "epoch": 17.51722841749154, "grad_norm": 5.039726257324219, "learning_rate": 4.621735643030478e-07, "loss": 0.3892, "num_input_tokens_seen": 170014496, "step": 139805 }, { "epoch": 17.517854905400327, "grad_norm": 3.383188247680664, "learning_rate": 4.6194402027534557e-07, "loss": 0.4188, "num_input_tokens_seen": 170020320, "step": 139810 }, { "epoch": 17.51848139330911, "grad_norm": 22.600736618041992, "learning_rate": 4.617145305039189e-07, "loss": 0.477, "num_input_tokens_seen": 170026432, "step": 139815 }, { "epoch": 17.51910788121789, "grad_norm": 8.788765907287598, "learning_rate": 4.6148509499150953e-07, "loss": 0.5141, "num_input_tokens_seen": 170032384, "step": 139820 }, { "epoch": 17.519734369126677, "grad_norm": 4.845000267028809, "learning_rate": 4.612557137408624e-07, "loss": 0.4028, "num_input_tokens_seen": 170038688, "step": 139825 }, { "epoch": 17.52036085703546, "grad_norm": 11.7592191696167, "learning_rate": 4.6102638675471823e-07, "loss": 0.5044, "num_input_tokens_seen": 170044384, "step": 139830 }, { "epoch": 17.520987344944242, "grad_norm": 18.470977783203125, "learning_rate": 4.607971140358208e-07, "loss": 0.4431, "num_input_tokens_seen": 170050592, "step": 139835 }, { "epoch": 17.521613832853028, "grad_norm": 6.984568119049072, "learning_rate": 4.6056789558690975e-07, "loss": 0.4046, "num_input_tokens_seen": 170056992, "step": 139840 }, { "epoch": 17.52224032076181, "grad_norm": 4.429718017578125, "learning_rate": 4.603387314107266e-07, "loss": 0.4493, "num_input_tokens_seen": 170063200, "step": 139845 }, { "epoch": 17.522866808670592, "grad_norm": 8.744257926940918, "learning_rate": 4.601096215100098e-07, "loss": 0.4785, "num_input_tokens_seen": 170069504, "step": 139850 }, { "epoch": 17.523493296579375, "grad_norm": 15.699190139770508, "learning_rate": 4.5988056588749883e-07, "loss": 0.571, "num_input_tokens_seen": 170075648, "step": 139855 }, { "epoch": 17.52411978448816, "grad_norm": 24.117887496948242, "learning_rate": 4.596515645459343e-07, "loss": 0.4783, "num_input_tokens_seen": 170081600, "step": 139860 }, { "epoch": 17.524746272396943, "grad_norm": 21.814254760742188, "learning_rate": 4.5942261748805174e-07, "loss": 0.4486, "num_input_tokens_seen": 170087904, "step": 139865 }, { "epoch": 17.525372760305725, "grad_norm": 6.763541221618652, "learning_rate": 4.5919372471659007e-07, "loss": 0.4625, "num_input_tokens_seen": 170094304, "step": 139870 }, { "epoch": 17.52599924821451, "grad_norm": 7.651285171508789, "learning_rate": 4.5896488623428435e-07, "loss": 0.512, "num_input_tokens_seen": 170100416, "step": 139875 }, { "epoch": 17.526625736123293, "grad_norm": 6.069094181060791, "learning_rate": 4.587361020438713e-07, "loss": 0.5002, "num_input_tokens_seen": 170106720, "step": 139880 }, { "epoch": 17.527252224032075, "grad_norm": 2.9479336738586426, "learning_rate": 4.58507372148087e-07, "loss": 0.4387, "num_input_tokens_seen": 170112544, "step": 139885 }, { "epoch": 17.52787871194086, "grad_norm": 4.447889804840088, "learning_rate": 4.582786965496644e-07, "loss": 0.4472, "num_input_tokens_seen": 170118464, "step": 139890 }, { "epoch": 17.528505199849644, "grad_norm": 6.252945899963379, "learning_rate": 4.5805007525133893e-07, "loss": 0.4278, "num_input_tokens_seen": 170124320, "step": 139895 }, { "epoch": 17.529131687758426, "grad_norm": 4.744882106781006, "learning_rate": 4.5782150825584405e-07, "loss": 0.4369, "num_input_tokens_seen": 170130272, "step": 139900 }, { "epoch": 17.529758175667208, "grad_norm": 5.534260272979736, "learning_rate": 4.5759299556591083e-07, "loss": 0.4787, "num_input_tokens_seen": 170136384, "step": 139905 }, { "epoch": 17.530384663575994, "grad_norm": 13.50366497039795, "learning_rate": 4.573645371842733e-07, "loss": 0.4615, "num_input_tokens_seen": 170142336, "step": 139910 }, { "epoch": 17.531011151484776, "grad_norm": 23.10833740234375, "learning_rate": 4.571361331136609e-07, "loss": 0.4742, "num_input_tokens_seen": 170148448, "step": 139915 }, { "epoch": 17.53163763939356, "grad_norm": 3.8764796257019043, "learning_rate": 4.5690778335680584e-07, "loss": 0.4351, "num_input_tokens_seen": 170154432, "step": 139920 }, { "epoch": 17.532264127302344, "grad_norm": 3.7708241939544678, "learning_rate": 4.566794879164388e-07, "loss": 0.418, "num_input_tokens_seen": 170160736, "step": 139925 }, { "epoch": 17.532890615211127, "grad_norm": 3.320488929748535, "learning_rate": 4.5645124679528697e-07, "loss": 0.4568, "num_input_tokens_seen": 170167008, "step": 139930 }, { "epoch": 17.53351710311991, "grad_norm": 3.913942575454712, "learning_rate": 4.562230599960821e-07, "loss": 0.469, "num_input_tokens_seen": 170172992, "step": 139935 }, { "epoch": 17.534143591028695, "grad_norm": 13.87111759185791, "learning_rate": 4.559949275215492e-07, "loss": 0.4584, "num_input_tokens_seen": 170179296, "step": 139940 }, { "epoch": 17.534770078937477, "grad_norm": 4.030784606933594, "learning_rate": 4.557668493744183e-07, "loss": 0.4227, "num_input_tokens_seen": 170185664, "step": 139945 }, { "epoch": 17.53539656684626, "grad_norm": 8.722413063049316, "learning_rate": 4.555388255574145e-07, "loss": 0.4572, "num_input_tokens_seen": 170191872, "step": 139950 }, { "epoch": 17.53602305475504, "grad_norm": 3.890192985534668, "learning_rate": 4.553108560732661e-07, "loss": 0.4048, "num_input_tokens_seen": 170198176, "step": 139955 }, { "epoch": 17.536649542663827, "grad_norm": 3.555738925933838, "learning_rate": 4.5508294092469596e-07, "loss": 0.5005, "num_input_tokens_seen": 170204256, "step": 139960 }, { "epoch": 17.53727603057261, "grad_norm": 4.240046501159668, "learning_rate": 4.548550801144314e-07, "loss": 0.4893, "num_input_tokens_seen": 170209920, "step": 139965 }, { "epoch": 17.537902518481392, "grad_norm": 11.446088790893555, "learning_rate": 4.546272736451951e-07, "loss": 0.4829, "num_input_tokens_seen": 170215968, "step": 139970 }, { "epoch": 17.538529006390178, "grad_norm": 5.367794990539551, "learning_rate": 4.5439952151971057e-07, "loss": 0.4069, "num_input_tokens_seen": 170222336, "step": 139975 }, { "epoch": 17.53915549429896, "grad_norm": 5.3283467292785645, "learning_rate": 4.5417182374070336e-07, "loss": 0.4109, "num_input_tokens_seen": 170228512, "step": 139980 }, { "epoch": 17.539781982207742, "grad_norm": 4.666759490966797, "learning_rate": 4.539441803108924e-07, "loss": 0.4303, "num_input_tokens_seen": 170234720, "step": 139985 }, { "epoch": 17.540408470116528, "grad_norm": 4.131818771362305, "learning_rate": 4.5371659123300107e-07, "loss": 0.4341, "num_input_tokens_seen": 170240544, "step": 139990 }, { "epoch": 17.54103495802531, "grad_norm": 10.039854049682617, "learning_rate": 4.5348905650974997e-07, "loss": 0.4493, "num_input_tokens_seen": 170246080, "step": 139995 }, { "epoch": 17.541661445934093, "grad_norm": 10.882369995117188, "learning_rate": 4.532615761438608e-07, "loss": 0.4765, "num_input_tokens_seen": 170252064, "step": 140000 }, { "epoch": 17.54228793384288, "grad_norm": 5.587396144866943, "learning_rate": 4.5303415013805144e-07, "loss": 0.5296, "num_input_tokens_seen": 170257952, "step": 140005 }, { "epoch": 17.54291442175166, "grad_norm": 5.46144437789917, "learning_rate": 4.5280677849504237e-07, "loss": 0.5059, "num_input_tokens_seen": 170263744, "step": 140010 }, { "epoch": 17.543540909660443, "grad_norm": 3.024406909942627, "learning_rate": 4.5257946121755046e-07, "loss": 0.4585, "num_input_tokens_seen": 170269664, "step": 140015 }, { "epoch": 17.544167397569225, "grad_norm": 11.51857852935791, "learning_rate": 4.5235219830829514e-07, "loss": 0.4079, "num_input_tokens_seen": 170275680, "step": 140020 }, { "epoch": 17.54479388547801, "grad_norm": 6.473467826843262, "learning_rate": 4.5212498976999196e-07, "loss": 0.4376, "num_input_tokens_seen": 170281920, "step": 140025 }, { "epoch": 17.545420373386794, "grad_norm": 15.849103927612305, "learning_rate": 4.5189783560535883e-07, "loss": 0.4604, "num_input_tokens_seen": 170287744, "step": 140030 }, { "epoch": 17.546046861295576, "grad_norm": 7.648138046264648, "learning_rate": 4.5167073581711017e-07, "loss": 0.4963, "num_input_tokens_seen": 170293984, "step": 140035 }, { "epoch": 17.54667334920436, "grad_norm": 4.281147480010986, "learning_rate": 4.514436904079622e-07, "loss": 0.381, "num_input_tokens_seen": 170300000, "step": 140040 }, { "epoch": 17.547299837113144, "grad_norm": 4.172616958618164, "learning_rate": 4.5121669938062995e-07, "loss": 0.4178, "num_input_tokens_seen": 170305920, "step": 140045 }, { "epoch": 17.547926325021926, "grad_norm": 4.972691535949707, "learning_rate": 4.509897627378251e-07, "loss": 0.4215, "num_input_tokens_seen": 170311936, "step": 140050 }, { "epoch": 17.548552812930712, "grad_norm": 5.406732082366943, "learning_rate": 4.5076288048226336e-07, "loss": 0.4288, "num_input_tokens_seen": 170318144, "step": 140055 }, { "epoch": 17.549179300839494, "grad_norm": 17.695199966430664, "learning_rate": 4.505360526166558e-07, "loss": 0.4541, "num_input_tokens_seen": 170324384, "step": 140060 }, { "epoch": 17.549805788748277, "grad_norm": 5.549117565155029, "learning_rate": 4.503092791437147e-07, "loss": 0.4685, "num_input_tokens_seen": 170330752, "step": 140065 }, { "epoch": 17.55043227665706, "grad_norm": 7.732110500335693, "learning_rate": 4.5008256006615125e-07, "loss": 0.4377, "num_input_tokens_seen": 170336640, "step": 140070 }, { "epoch": 17.551058764565845, "grad_norm": 17.20606803894043, "learning_rate": 4.4985589538667665e-07, "loss": 0.4742, "num_input_tokens_seen": 170343008, "step": 140075 }, { "epoch": 17.551685252474627, "grad_norm": 5.012139797210693, "learning_rate": 4.4962928510799917e-07, "loss": 0.4569, "num_input_tokens_seen": 170349120, "step": 140080 }, { "epoch": 17.55231174038341, "grad_norm": 16.52041244506836, "learning_rate": 4.4940272923283013e-07, "loss": 0.4588, "num_input_tokens_seen": 170355168, "step": 140085 }, { "epoch": 17.552938228292195, "grad_norm": 6.960598468780518, "learning_rate": 4.491762277638778e-07, "loss": 0.4914, "num_input_tokens_seen": 170361312, "step": 140090 }, { "epoch": 17.553564716200977, "grad_norm": 5.777799129486084, "learning_rate": 4.4894978070384897e-07, "loss": 0.4381, "num_input_tokens_seen": 170367552, "step": 140095 }, { "epoch": 17.55419120410976, "grad_norm": 6.978896141052246, "learning_rate": 4.4872338805545145e-07, "loss": 0.4465, "num_input_tokens_seen": 170373888, "step": 140100 }, { "epoch": 17.554817692018545, "grad_norm": 17.289457321166992, "learning_rate": 4.4849704982139306e-07, "loss": 0.4811, "num_input_tokens_seen": 170379808, "step": 140105 }, { "epoch": 17.555444179927328, "grad_norm": 11.086193084716797, "learning_rate": 4.482707660043795e-07, "loss": 0.427, "num_input_tokens_seen": 170386240, "step": 140110 }, { "epoch": 17.55607066783611, "grad_norm": 8.424638748168945, "learning_rate": 4.480445366071151e-07, "loss": 0.4284, "num_input_tokens_seen": 170392480, "step": 140115 }, { "epoch": 17.556697155744892, "grad_norm": 20.788192749023438, "learning_rate": 4.478183616323062e-07, "loss": 0.4804, "num_input_tokens_seen": 170398080, "step": 140120 }, { "epoch": 17.557323643653678, "grad_norm": 8.393001556396484, "learning_rate": 4.47592241082655e-07, "loss": 0.396, "num_input_tokens_seen": 170404160, "step": 140125 }, { "epoch": 17.55795013156246, "grad_norm": 5.2977824211120605, "learning_rate": 4.473661749608671e-07, "loss": 0.51, "num_input_tokens_seen": 170410208, "step": 140130 }, { "epoch": 17.558576619471243, "grad_norm": 18.657995223999023, "learning_rate": 4.4714016326964317e-07, "loss": 0.4578, "num_input_tokens_seen": 170416352, "step": 140135 }, { "epoch": 17.55920310738003, "grad_norm": 8.705183029174805, "learning_rate": 4.4691420601168766e-07, "loss": 0.4601, "num_input_tokens_seen": 170422304, "step": 140140 }, { "epoch": 17.55982959528881, "grad_norm": 8.946181297302246, "learning_rate": 4.4668830318969956e-07, "loss": 0.4416, "num_input_tokens_seen": 170428192, "step": 140145 }, { "epoch": 17.560456083197593, "grad_norm": 12.09727954864502, "learning_rate": 4.464624548063823e-07, "loss": 0.433, "num_input_tokens_seen": 170434368, "step": 140150 }, { "epoch": 17.56108257110638, "grad_norm": 4.668676853179932, "learning_rate": 4.462366608644336e-07, "loss": 0.4143, "num_input_tokens_seen": 170440512, "step": 140155 }, { "epoch": 17.56170905901516, "grad_norm": 4.446012496948242, "learning_rate": 4.460109213665548e-07, "loss": 0.4192, "num_input_tokens_seen": 170446624, "step": 140160 }, { "epoch": 17.562335546923943, "grad_norm": 3.5444557666778564, "learning_rate": 4.457852363154441e-07, "loss": 0.4195, "num_input_tokens_seen": 170452416, "step": 140165 }, { "epoch": 17.56296203483273, "grad_norm": 11.95580005645752, "learning_rate": 4.45559605713799e-07, "loss": 0.417, "num_input_tokens_seen": 170458752, "step": 140170 }, { "epoch": 17.56358852274151, "grad_norm": 4.628478050231934, "learning_rate": 4.453340295643194e-07, "loss": 0.4617, "num_input_tokens_seen": 170464640, "step": 140175 }, { "epoch": 17.564215010650294, "grad_norm": 4.5155816078186035, "learning_rate": 4.4510850786969984e-07, "loss": 0.4304, "num_input_tokens_seen": 170470528, "step": 140180 }, { "epoch": 17.564841498559076, "grad_norm": 11.604809761047363, "learning_rate": 4.4488304063263876e-07, "loss": 0.4732, "num_input_tokens_seen": 170476704, "step": 140185 }, { "epoch": 17.565467986467862, "grad_norm": 5.632910251617432, "learning_rate": 4.446576278558296e-07, "loss": 0.4788, "num_input_tokens_seen": 170482848, "step": 140190 }, { "epoch": 17.566094474376644, "grad_norm": 6.000384330749512, "learning_rate": 4.444322695419684e-07, "loss": 0.4049, "num_input_tokens_seen": 170488960, "step": 140195 }, { "epoch": 17.566720962285427, "grad_norm": 4.369495868682861, "learning_rate": 4.4420696569374924e-07, "loss": 0.4506, "num_input_tokens_seen": 170495008, "step": 140200 }, { "epoch": 17.567347450194212, "grad_norm": 3.5426244735717773, "learning_rate": 4.439817163138671e-07, "loss": 0.4804, "num_input_tokens_seen": 170501088, "step": 140205 }, { "epoch": 17.567973938102995, "grad_norm": 7.539799690246582, "learning_rate": 4.437565214050138e-07, "loss": 0.4307, "num_input_tokens_seen": 170507104, "step": 140210 }, { "epoch": 17.568600426011777, "grad_norm": 5.0029425621032715, "learning_rate": 4.4353138096988214e-07, "loss": 0.4063, "num_input_tokens_seen": 170513600, "step": 140215 }, { "epoch": 17.569226913920563, "grad_norm": 6.694085597991943, "learning_rate": 4.4330629501116275e-07, "loss": 0.4661, "num_input_tokens_seen": 170519008, "step": 140220 }, { "epoch": 17.569853401829345, "grad_norm": 26.182313919067383, "learning_rate": 4.430812635315479e-07, "loss": 0.4794, "num_input_tokens_seen": 170525088, "step": 140225 }, { "epoch": 17.570479889738127, "grad_norm": 8.737579345703125, "learning_rate": 4.428562865337288e-07, "loss": 0.4434, "num_input_tokens_seen": 170531360, "step": 140230 }, { "epoch": 17.571106377646913, "grad_norm": 5.231114387512207, "learning_rate": 4.426313640203933e-07, "loss": 0.463, "num_input_tokens_seen": 170537216, "step": 140235 }, { "epoch": 17.571732865555695, "grad_norm": 6.538999557495117, "learning_rate": 4.424064959942326e-07, "loss": 0.5529, "num_input_tokens_seen": 170543296, "step": 140240 }, { "epoch": 17.572359353464478, "grad_norm": 27.044830322265625, "learning_rate": 4.421816824579328e-07, "loss": 0.4394, "num_input_tokens_seen": 170549888, "step": 140245 }, { "epoch": 17.57298584137326, "grad_norm": 4.795629501342773, "learning_rate": 4.419569234141835e-07, "loss": 0.4536, "num_input_tokens_seen": 170556192, "step": 140250 }, { "epoch": 17.573612329282046, "grad_norm": 5.678085803985596, "learning_rate": 4.4173221886567087e-07, "loss": 0.4433, "num_input_tokens_seen": 170562400, "step": 140255 }, { "epoch": 17.574238817190828, "grad_norm": 5.2712202072143555, "learning_rate": 4.415075688150833e-07, "loss": 0.4371, "num_input_tokens_seen": 170568288, "step": 140260 }, { "epoch": 17.57486530509961, "grad_norm": 5.471915245056152, "learning_rate": 4.412829732651036e-07, "loss": 0.4221, "num_input_tokens_seen": 170574432, "step": 140265 }, { "epoch": 17.575491793008396, "grad_norm": 3.4807989597320557, "learning_rate": 4.4105843221842024e-07, "loss": 0.4442, "num_input_tokens_seen": 170579936, "step": 140270 }, { "epoch": 17.57611828091718, "grad_norm": 8.708502769470215, "learning_rate": 4.40833945677715e-07, "loss": 0.417, "num_input_tokens_seen": 170585984, "step": 140275 }, { "epoch": 17.57674476882596, "grad_norm": 21.914758682250977, "learning_rate": 4.406095136456745e-07, "loss": 0.4478, "num_input_tokens_seen": 170592256, "step": 140280 }, { "epoch": 17.577371256734747, "grad_norm": 21.78509521484375, "learning_rate": 4.403851361249789e-07, "loss": 0.4596, "num_input_tokens_seen": 170598496, "step": 140285 }, { "epoch": 17.57799774464353, "grad_norm": 5.6894097328186035, "learning_rate": 4.401608131183133e-07, "loss": 0.4465, "num_input_tokens_seen": 170604832, "step": 140290 }, { "epoch": 17.57862423255231, "grad_norm": 4.830845832824707, "learning_rate": 4.3993654462835823e-07, "loss": 0.3938, "num_input_tokens_seen": 170610944, "step": 140295 }, { "epoch": 17.579250720461093, "grad_norm": 3.599436044692993, "learning_rate": 4.397123306577966e-07, "loss": 0.4276, "num_input_tokens_seen": 170616544, "step": 140300 }, { "epoch": 17.57987720836988, "grad_norm": 11.118793487548828, "learning_rate": 4.39488171209308e-07, "loss": 0.4683, "num_input_tokens_seen": 170622880, "step": 140305 }, { "epoch": 17.58050369627866, "grad_norm": 26.754322052001953, "learning_rate": 4.392640662855718e-07, "loss": 0.4641, "num_input_tokens_seen": 170629024, "step": 140310 }, { "epoch": 17.581130184187444, "grad_norm": 6.6985650062561035, "learning_rate": 4.390400158892694e-07, "loss": 0.4793, "num_input_tokens_seen": 170635104, "step": 140315 }, { "epoch": 17.58175667209623, "grad_norm": 3.2010414600372314, "learning_rate": 4.3881602002307733e-07, "loss": 0.4339, "num_input_tokens_seen": 170641216, "step": 140320 }, { "epoch": 17.582383160005012, "grad_norm": 4.606931686401367, "learning_rate": 4.385920786896752e-07, "loss": 0.4991, "num_input_tokens_seen": 170647680, "step": 140325 }, { "epoch": 17.583009647913794, "grad_norm": 6.245966911315918, "learning_rate": 4.3836819189173874e-07, "loss": 0.3993, "num_input_tokens_seen": 170653344, "step": 140330 }, { "epoch": 17.58363613582258, "grad_norm": 5.603745460510254, "learning_rate": 4.3814435963194737e-07, "loss": 0.4333, "num_input_tokens_seen": 170659616, "step": 140335 }, { "epoch": 17.584262623731362, "grad_norm": 3.7043964862823486, "learning_rate": 4.379205819129739e-07, "loss": 0.4553, "num_input_tokens_seen": 170665664, "step": 140340 }, { "epoch": 17.584889111640145, "grad_norm": 11.032938957214355, "learning_rate": 4.376968587374969e-07, "loss": 0.5097, "num_input_tokens_seen": 170672096, "step": 140345 }, { "epoch": 17.585515599548927, "grad_norm": 3.688316583633423, "learning_rate": 4.3747319010818854e-07, "loss": 0.4205, "num_input_tokens_seen": 170677888, "step": 140350 }, { "epoch": 17.586142087457713, "grad_norm": 10.905415534973145, "learning_rate": 4.372495760277246e-07, "loss": 0.4914, "num_input_tokens_seen": 170684256, "step": 140355 }, { "epoch": 17.586768575366495, "grad_norm": 4.828312873840332, "learning_rate": 4.3702601649877884e-07, "loss": 0.5297, "num_input_tokens_seen": 170690272, "step": 140360 }, { "epoch": 17.587395063275277, "grad_norm": 12.950675964355469, "learning_rate": 4.3680251152402206e-07, "loss": 0.4891, "num_input_tokens_seen": 170696096, "step": 140365 }, { "epoch": 17.588021551184063, "grad_norm": 3.5684707164764404, "learning_rate": 4.3657906110612935e-07, "loss": 0.4186, "num_input_tokens_seen": 170701984, "step": 140370 }, { "epoch": 17.588648039092845, "grad_norm": 4.563416481018066, "learning_rate": 4.3635566524776907e-07, "loss": 0.4466, "num_input_tokens_seen": 170708128, "step": 140375 }, { "epoch": 17.589274527001628, "grad_norm": 3.385132312774658, "learning_rate": 4.361323239516152e-07, "loss": 0.4512, "num_input_tokens_seen": 170714272, "step": 140380 }, { "epoch": 17.589901014910414, "grad_norm": 8.713994026184082, "learning_rate": 4.3590903722033507e-07, "loss": 0.4194, "num_input_tokens_seen": 170720672, "step": 140385 }, { "epoch": 17.590527502819196, "grad_norm": 25.627824783325195, "learning_rate": 4.3568580505660095e-07, "loss": 0.5403, "num_input_tokens_seen": 170726784, "step": 140390 }, { "epoch": 17.591153990727978, "grad_norm": 9.778803825378418, "learning_rate": 4.3546262746307965e-07, "loss": 0.4479, "num_input_tokens_seen": 170732672, "step": 140395 }, { "epoch": 17.591780478636764, "grad_norm": 19.79983139038086, "learning_rate": 4.352395044424401e-07, "loss": 0.4861, "num_input_tokens_seen": 170738656, "step": 140400 }, { "epoch": 17.592406966545546, "grad_norm": 3.811828374862671, "learning_rate": 4.350164359973502e-07, "loss": 0.4678, "num_input_tokens_seen": 170745056, "step": 140405 }, { "epoch": 17.59303345445433, "grad_norm": 7.443769931793213, "learning_rate": 4.347934221304778e-07, "loss": 0.4151, "num_input_tokens_seen": 170751424, "step": 140410 }, { "epoch": 17.59365994236311, "grad_norm": 5.257497787475586, "learning_rate": 4.345704628444869e-07, "loss": 0.4734, "num_input_tokens_seen": 170757888, "step": 140415 }, { "epoch": 17.594286430271897, "grad_norm": 2.561234712600708, "learning_rate": 4.343475581420448e-07, "loss": 0.4825, "num_input_tokens_seen": 170763584, "step": 140420 }, { "epoch": 17.59491291818068, "grad_norm": 16.363853454589844, "learning_rate": 4.341247080258176e-07, "loss": 0.4514, "num_input_tokens_seen": 170769728, "step": 140425 }, { "epoch": 17.59553940608946, "grad_norm": 4.399496078491211, "learning_rate": 4.339019124984667e-07, "loss": 0.4162, "num_input_tokens_seen": 170776160, "step": 140430 }, { "epoch": 17.596165893998247, "grad_norm": 6.831462383270264, "learning_rate": 4.336791715626587e-07, "loss": 0.4386, "num_input_tokens_seen": 170782112, "step": 140435 }, { "epoch": 17.59679238190703, "grad_norm": 19.41961097717285, "learning_rate": 4.3345648522105434e-07, "loss": 0.5567, "num_input_tokens_seen": 170788320, "step": 140440 }, { "epoch": 17.59741886981581, "grad_norm": 13.900494575500488, "learning_rate": 4.332338534763181e-07, "loss": 0.4338, "num_input_tokens_seen": 170794336, "step": 140445 }, { "epoch": 17.598045357724597, "grad_norm": 11.38290786743164, "learning_rate": 4.3301127633110953e-07, "loss": 0.4923, "num_input_tokens_seen": 170800192, "step": 140450 }, { "epoch": 17.59867184563338, "grad_norm": 4.808470249176025, "learning_rate": 4.327887537880915e-07, "loss": 0.4195, "num_input_tokens_seen": 170806112, "step": 140455 }, { "epoch": 17.599298333542162, "grad_norm": 3.6014418601989746, "learning_rate": 4.3256628584992354e-07, "loss": 0.4435, "num_input_tokens_seen": 170812480, "step": 140460 }, { "epoch": 17.599924821450944, "grad_norm": 6.64951753616333, "learning_rate": 4.3234387251926633e-07, "loss": 0.5053, "num_input_tokens_seen": 170818464, "step": 140465 }, { "epoch": 17.60055130935973, "grad_norm": 14.041115760803223, "learning_rate": 4.321215137987772e-07, "loss": 0.4685, "num_input_tokens_seen": 170824768, "step": 140470 }, { "epoch": 17.601177797268512, "grad_norm": 4.71065092086792, "learning_rate": 4.318992096911162e-07, "loss": 0.4402, "num_input_tokens_seen": 170830720, "step": 140475 }, { "epoch": 17.601804285177295, "grad_norm": 13.488405227661133, "learning_rate": 4.316769601989418e-07, "loss": 0.4259, "num_input_tokens_seen": 170836864, "step": 140480 }, { "epoch": 17.60243077308608, "grad_norm": 14.245728492736816, "learning_rate": 4.31454765324909e-07, "loss": 0.4929, "num_input_tokens_seen": 170842656, "step": 140485 }, { "epoch": 17.603057260994863, "grad_norm": 7.791660308837891, "learning_rate": 4.3123262507167643e-07, "loss": 0.491, "num_input_tokens_seen": 170848544, "step": 140490 }, { "epoch": 17.603683748903645, "grad_norm": 5.915641784667969, "learning_rate": 4.310105394418984e-07, "loss": 0.47, "num_input_tokens_seen": 170854912, "step": 140495 }, { "epoch": 17.60431023681243, "grad_norm": 7.196000099182129, "learning_rate": 4.307885084382307e-07, "loss": 0.4373, "num_input_tokens_seen": 170860832, "step": 140500 }, { "epoch": 17.604936724721213, "grad_norm": 3.5306544303894043, "learning_rate": 4.3056653206332846e-07, "loss": 0.4808, "num_input_tokens_seen": 170867040, "step": 140505 }, { "epoch": 17.605563212629995, "grad_norm": 4.461031913757324, "learning_rate": 4.3034461031984496e-07, "loss": 0.4302, "num_input_tokens_seen": 170872960, "step": 140510 }, { "epoch": 17.60618970053878, "grad_norm": 9.978671073913574, "learning_rate": 4.3012274321043267e-07, "loss": 0.4582, "num_input_tokens_seen": 170878304, "step": 140515 }, { "epoch": 17.606816188447564, "grad_norm": 6.48233699798584, "learning_rate": 4.2990093073774663e-07, "loss": 0.4304, "num_input_tokens_seen": 170884512, "step": 140520 }, { "epoch": 17.607442676356346, "grad_norm": 7.401784896850586, "learning_rate": 4.296791729044364e-07, "loss": 0.4636, "num_input_tokens_seen": 170890208, "step": 140525 }, { "epoch": 17.608069164265128, "grad_norm": 5.012988090515137, "learning_rate": 4.2945746971315485e-07, "loss": 0.4162, "num_input_tokens_seen": 170896640, "step": 140530 }, { "epoch": 17.608695652173914, "grad_norm": 5.0494771003723145, "learning_rate": 4.2923582116655096e-07, "loss": 0.4505, "num_input_tokens_seen": 170902656, "step": 140535 }, { "epoch": 17.609322140082696, "grad_norm": 9.535394668579102, "learning_rate": 4.2901422726727593e-07, "loss": 0.4398, "num_input_tokens_seen": 170909120, "step": 140540 }, { "epoch": 17.60994862799148, "grad_norm": 5.2102789878845215, "learning_rate": 4.287926880179799e-07, "loss": 0.4216, "num_input_tokens_seen": 170915680, "step": 140545 }, { "epoch": 17.610575115900264, "grad_norm": 7.003206253051758, "learning_rate": 4.285712034213091e-07, "loss": 0.4733, "num_input_tokens_seen": 170921376, "step": 140550 }, { "epoch": 17.611201603809047, "grad_norm": 5.683271408081055, "learning_rate": 4.283497734799147e-07, "loss": 0.412, "num_input_tokens_seen": 170927488, "step": 140555 }, { "epoch": 17.61182809171783, "grad_norm": 2.0226387977600098, "learning_rate": 4.2812839819644067e-07, "loss": 0.4493, "num_input_tokens_seen": 170933472, "step": 140560 }, { "epoch": 17.612454579626615, "grad_norm": 7.521029949188232, "learning_rate": 4.2790707757353724e-07, "loss": 0.4943, "num_input_tokens_seen": 170939808, "step": 140565 }, { "epoch": 17.613081067535397, "grad_norm": 3.294837713241577, "learning_rate": 4.2768581161384714e-07, "loss": 0.4058, "num_input_tokens_seen": 170945728, "step": 140570 }, { "epoch": 17.61370755544418, "grad_norm": 11.061908721923828, "learning_rate": 4.2746460032001837e-07, "loss": 0.3895, "num_input_tokens_seen": 170951968, "step": 140575 }, { "epoch": 17.61433404335296, "grad_norm": 4.033257484436035, "learning_rate": 4.272434436946937e-07, "loss": 0.4417, "num_input_tokens_seen": 170957440, "step": 140580 }, { "epoch": 17.614960531261747, "grad_norm": 5.408461093902588, "learning_rate": 4.270223417405195e-07, "loss": 0.5003, "num_input_tokens_seen": 170963136, "step": 140585 }, { "epoch": 17.61558701917053, "grad_norm": 13.280308723449707, "learning_rate": 4.2680129446013685e-07, "loss": 0.4495, "num_input_tokens_seen": 170968928, "step": 140590 }, { "epoch": 17.616213507079312, "grad_norm": 13.677409172058105, "learning_rate": 4.2658030185618925e-07, "loss": 0.486, "num_input_tokens_seen": 170975008, "step": 140595 }, { "epoch": 17.616839994988098, "grad_norm": 6.487752914428711, "learning_rate": 4.263593639313207e-07, "loss": 0.4543, "num_input_tokens_seen": 170981184, "step": 140600 }, { "epoch": 17.61746648289688, "grad_norm": 4.8583478927612305, "learning_rate": 4.261384806881702e-07, "loss": 0.5459, "num_input_tokens_seen": 170987808, "step": 140605 }, { "epoch": 17.618092970805662, "grad_norm": 13.756333351135254, "learning_rate": 4.259176521293795e-07, "loss": 0.4106, "num_input_tokens_seen": 170993888, "step": 140610 }, { "epoch": 17.618719458714448, "grad_norm": 3.904062509536743, "learning_rate": 4.256968782575893e-07, "loss": 0.4393, "num_input_tokens_seen": 171000224, "step": 140615 }, { "epoch": 17.61934594662323, "grad_norm": 10.456099510192871, "learning_rate": 4.2547615907543915e-07, "loss": 0.4788, "num_input_tokens_seen": 171006592, "step": 140620 }, { "epoch": 17.619972434532013, "grad_norm": 5.384151935577393, "learning_rate": 4.252554945855675e-07, "loss": 0.4196, "num_input_tokens_seen": 171012832, "step": 140625 }, { "epoch": 17.6205989224408, "grad_norm": 6.49554967880249, "learning_rate": 4.2503488479061274e-07, "loss": 0.5094, "num_input_tokens_seen": 171019360, "step": 140630 }, { "epoch": 17.62122541034958, "grad_norm": 7.632710933685303, "learning_rate": 4.248143296932117e-07, "loss": 0.5366, "num_input_tokens_seen": 171025664, "step": 140635 }, { "epoch": 17.621851898258363, "grad_norm": 18.416015625, "learning_rate": 4.245938292960028e-07, "loss": 0.4593, "num_input_tokens_seen": 171031904, "step": 140640 }, { "epoch": 17.622478386167145, "grad_norm": 11.919034004211426, "learning_rate": 4.2437338360162063e-07, "loss": 0.4743, "num_input_tokens_seen": 171037856, "step": 140645 }, { "epoch": 17.62310487407593, "grad_norm": 11.843790054321289, "learning_rate": 4.2415299261270303e-07, "loss": 0.5333, "num_input_tokens_seen": 171044320, "step": 140650 }, { "epoch": 17.623731361984714, "grad_norm": 5.408261775970459, "learning_rate": 4.2393265633188187e-07, "loss": 0.459, "num_input_tokens_seen": 171050272, "step": 140655 }, { "epoch": 17.624357849893496, "grad_norm": 21.343624114990234, "learning_rate": 4.237123747617938e-07, "loss": 0.4253, "num_input_tokens_seen": 171056768, "step": 140660 }, { "epoch": 17.62498433780228, "grad_norm": 7.733154773712158, "learning_rate": 4.2349214790507243e-07, "loss": 0.4323, "num_input_tokens_seen": 171062880, "step": 140665 }, { "epoch": 17.625610825711064, "grad_norm": 6.974773406982422, "learning_rate": 4.2327197576434884e-07, "loss": 0.4355, "num_input_tokens_seen": 171069056, "step": 140670 }, { "epoch": 17.626237313619846, "grad_norm": 5.477303504943848, "learning_rate": 4.2305185834225826e-07, "loss": 0.4408, "num_input_tokens_seen": 171075104, "step": 140675 }, { "epoch": 17.626863801528632, "grad_norm": 14.069906234741211, "learning_rate": 4.228317956414296e-07, "loss": 0.5221, "num_input_tokens_seen": 171081216, "step": 140680 }, { "epoch": 17.627490289437414, "grad_norm": 8.179641723632812, "learning_rate": 4.2261178766449583e-07, "loss": 0.4412, "num_input_tokens_seen": 171087648, "step": 140685 }, { "epoch": 17.628116777346197, "grad_norm": 18.593490600585938, "learning_rate": 4.223918344140859e-07, "loss": 0.5627, "num_input_tokens_seen": 171093824, "step": 140690 }, { "epoch": 17.62874326525498, "grad_norm": 22.515474319458008, "learning_rate": 4.221719358928311e-07, "loss": 0.4712, "num_input_tokens_seen": 171099744, "step": 140695 }, { "epoch": 17.629369753163765, "grad_norm": 5.9899582862854, "learning_rate": 4.2195209210335873e-07, "loss": 0.4276, "num_input_tokens_seen": 171105664, "step": 140700 }, { "epoch": 17.629996241072547, "grad_norm": 6.033967971801758, "learning_rate": 4.217323030482978e-07, "loss": 0.5433, "num_input_tokens_seen": 171112096, "step": 140705 }, { "epoch": 17.63062272898133, "grad_norm": 12.014586448669434, "learning_rate": 4.215125687302779e-07, "loss": 0.4418, "num_input_tokens_seen": 171118432, "step": 140710 }, { "epoch": 17.631249216890115, "grad_norm": 6.105536937713623, "learning_rate": 4.2129288915192355e-07, "loss": 0.4666, "num_input_tokens_seen": 171125024, "step": 140715 }, { "epoch": 17.631875704798897, "grad_norm": 9.512657165527344, "learning_rate": 4.2107326431586217e-07, "loss": 0.4236, "num_input_tokens_seen": 171131040, "step": 140720 }, { "epoch": 17.63250219270768, "grad_norm": 3.612454414367676, "learning_rate": 4.2085369422471934e-07, "loss": 0.4944, "num_input_tokens_seen": 171136864, "step": 140725 }, { "epoch": 17.633128680616466, "grad_norm": 4.660764217376709, "learning_rate": 4.206341788811219e-07, "loss": 0.4016, "num_input_tokens_seen": 171142912, "step": 140730 }, { "epoch": 17.633755168525248, "grad_norm": 26.155353546142578, "learning_rate": 4.204147182876916e-07, "loss": 0.5239, "num_input_tokens_seen": 171148896, "step": 140735 }, { "epoch": 17.63438165643403, "grad_norm": 3.6559624671936035, "learning_rate": 4.201953124470548e-07, "loss": 0.4087, "num_input_tokens_seen": 171154944, "step": 140740 }, { "epoch": 17.635008144342812, "grad_norm": 14.338789939880371, "learning_rate": 4.199759613618326e-07, "loss": 0.4205, "num_input_tokens_seen": 171160800, "step": 140745 }, { "epoch": 17.635634632251598, "grad_norm": 2.7915732860565186, "learning_rate": 4.1975666503464965e-07, "loss": 0.464, "num_input_tokens_seen": 171167072, "step": 140750 }, { "epoch": 17.63626112016038, "grad_norm": 3.23649001121521, "learning_rate": 4.1953742346812544e-07, "loss": 0.4168, "num_input_tokens_seen": 171173248, "step": 140755 }, { "epoch": 17.636887608069163, "grad_norm": 11.16692066192627, "learning_rate": 4.1931823666488347e-07, "loss": 0.4561, "num_input_tokens_seen": 171178976, "step": 140760 }, { "epoch": 17.63751409597795, "grad_norm": 4.277553558349609, "learning_rate": 4.190991046275422e-07, "loss": 0.3561, "num_input_tokens_seen": 171185056, "step": 140765 }, { "epoch": 17.63814058388673, "grad_norm": 18.706172943115234, "learning_rate": 4.188800273587235e-07, "loss": 0.4723, "num_input_tokens_seen": 171191072, "step": 140770 }, { "epoch": 17.638767071795513, "grad_norm": 25.257062911987305, "learning_rate": 4.1866100486104457e-07, "loss": 0.5081, "num_input_tokens_seen": 171197216, "step": 140775 }, { "epoch": 17.6393935597043, "grad_norm": 2.98956036567688, "learning_rate": 4.184420371371262e-07, "loss": 0.4916, "num_input_tokens_seen": 171203424, "step": 140780 }, { "epoch": 17.64002004761308, "grad_norm": 8.10798454284668, "learning_rate": 4.1822312418958397e-07, "loss": 0.4994, "num_input_tokens_seen": 171209856, "step": 140785 }, { "epoch": 17.640646535521864, "grad_norm": 3.4356744289398193, "learning_rate": 4.180042660210365e-07, "loss": 0.4411, "num_input_tokens_seen": 171215904, "step": 140790 }, { "epoch": 17.64127302343065, "grad_norm": 12.853970527648926, "learning_rate": 4.1778546263410156e-07, "loss": 0.4298, "num_input_tokens_seen": 171221984, "step": 140795 }, { "epoch": 17.64189951133943, "grad_norm": 11.596150398254395, "learning_rate": 4.1756671403139326e-07, "loss": 0.547, "num_input_tokens_seen": 171228288, "step": 140800 }, { "epoch": 17.642525999248214, "grad_norm": 5.7972846031188965, "learning_rate": 4.173480202155278e-07, "loss": 0.4403, "num_input_tokens_seen": 171233632, "step": 140805 }, { "epoch": 17.643152487156996, "grad_norm": 9.287310600280762, "learning_rate": 4.1712938118911917e-07, "loss": 0.5245, "num_input_tokens_seen": 171239776, "step": 140810 }, { "epoch": 17.643778975065782, "grad_norm": 4.011091232299805, "learning_rate": 4.1691079695478196e-07, "loss": 0.5356, "num_input_tokens_seen": 171246048, "step": 140815 }, { "epoch": 17.644405462974564, "grad_norm": 4.685200214385986, "learning_rate": 4.1669226751512915e-07, "loss": 0.4134, "num_input_tokens_seen": 171252224, "step": 140820 }, { "epoch": 17.645031950883347, "grad_norm": 4.612904071807861, "learning_rate": 4.1647379287277466e-07, "loss": 0.416, "num_input_tokens_seen": 171258304, "step": 140825 }, { "epoch": 17.645658438792132, "grad_norm": 11.239899635314941, "learning_rate": 4.162553730303287e-07, "loss": 0.5127, "num_input_tokens_seen": 171264320, "step": 140830 }, { "epoch": 17.646284926700915, "grad_norm": 10.673859596252441, "learning_rate": 4.160370079904047e-07, "loss": 0.4431, "num_input_tokens_seen": 171270496, "step": 140835 }, { "epoch": 17.646911414609697, "grad_norm": 3.5897393226623535, "learning_rate": 4.158186977556111e-07, "loss": 0.4564, "num_input_tokens_seen": 171276608, "step": 140840 }, { "epoch": 17.647537902518483, "grad_norm": 9.204889297485352, "learning_rate": 4.156004423285592e-07, "loss": 0.4703, "num_input_tokens_seen": 171282816, "step": 140845 }, { "epoch": 17.648164390427265, "grad_norm": 23.792184829711914, "learning_rate": 4.153822417118591e-07, "loss": 0.401, "num_input_tokens_seen": 171289184, "step": 140850 }, { "epoch": 17.648790878336047, "grad_norm": 15.30691909790039, "learning_rate": 4.1516409590811826e-07, "loss": 0.4045, "num_input_tokens_seen": 171295616, "step": 140855 }, { "epoch": 17.649417366244833, "grad_norm": 10.08851432800293, "learning_rate": 4.149460049199461e-07, "loss": 0.4581, "num_input_tokens_seen": 171301664, "step": 140860 }, { "epoch": 17.650043854153616, "grad_norm": 10.631783485412598, "learning_rate": 4.1472796874994836e-07, "loss": 0.4414, "num_input_tokens_seen": 171307744, "step": 140865 }, { "epoch": 17.650670342062398, "grad_norm": 9.940658569335938, "learning_rate": 4.145099874007341e-07, "loss": 0.4125, "num_input_tokens_seen": 171313824, "step": 140870 }, { "epoch": 17.65129682997118, "grad_norm": 19.625150680541992, "learning_rate": 4.142920608749068e-07, "loss": 0.4733, "num_input_tokens_seen": 171320160, "step": 140875 }, { "epoch": 17.651923317879966, "grad_norm": 3.167590379714966, "learning_rate": 4.1407418917507435e-07, "loss": 0.4245, "num_input_tokens_seen": 171326176, "step": 140880 }, { "epoch": 17.652549805788748, "grad_norm": 10.056316375732422, "learning_rate": 4.1385637230384024e-07, "loss": 0.514, "num_input_tokens_seen": 171332448, "step": 140885 }, { "epoch": 17.65317629369753, "grad_norm": 6.1094794273376465, "learning_rate": 4.136386102638096e-07, "loss": 0.4628, "num_input_tokens_seen": 171338656, "step": 140890 }, { "epoch": 17.653802781606316, "grad_norm": 3.6399121284484863, "learning_rate": 4.134209030575842e-07, "loss": 0.4772, "num_input_tokens_seen": 171344864, "step": 140895 }, { "epoch": 17.6544292695151, "grad_norm": 5.294198989868164, "learning_rate": 4.132032506877692e-07, "loss": 0.5091, "num_input_tokens_seen": 171351136, "step": 140900 }, { "epoch": 17.65505575742388, "grad_norm": 7.968606948852539, "learning_rate": 4.1298565315696484e-07, "loss": 0.4292, "num_input_tokens_seen": 171356960, "step": 140905 }, { "epoch": 17.655682245332667, "grad_norm": 15.396319389343262, "learning_rate": 4.1276811046777335e-07, "loss": 0.4227, "num_input_tokens_seen": 171362848, "step": 140910 }, { "epoch": 17.65630873324145, "grad_norm": 7.770616054534912, "learning_rate": 4.12550622622796e-07, "loss": 0.5734, "num_input_tokens_seen": 171369056, "step": 140915 }, { "epoch": 17.65693522115023, "grad_norm": 37.716304779052734, "learning_rate": 4.123331896246341e-07, "loss": 0.5099, "num_input_tokens_seen": 171375072, "step": 140920 }, { "epoch": 17.657561709059014, "grad_norm": 4.035401821136475, "learning_rate": 4.121158114758844e-07, "loss": 0.4365, "num_input_tokens_seen": 171381088, "step": 140925 }, { "epoch": 17.6581881969678, "grad_norm": 10.329593658447266, "learning_rate": 4.1189848817914826e-07, "loss": 0.4502, "num_input_tokens_seen": 171387328, "step": 140930 }, { "epoch": 17.65881468487658, "grad_norm": 11.819811820983887, "learning_rate": 4.1168121973702345e-07, "loss": 0.4469, "num_input_tokens_seen": 171393376, "step": 140935 }, { "epoch": 17.659441172785364, "grad_norm": 4.487571716308594, "learning_rate": 4.1146400615210634e-07, "loss": 0.4045, "num_input_tokens_seen": 171399488, "step": 140940 }, { "epoch": 17.66006766069415, "grad_norm": 2.6491053104400635, "learning_rate": 4.1124684742699596e-07, "loss": 0.4508, "num_input_tokens_seen": 171405664, "step": 140945 }, { "epoch": 17.660694148602932, "grad_norm": 5.491279125213623, "learning_rate": 4.110297435642868e-07, "loss": 0.5057, "num_input_tokens_seen": 171411968, "step": 140950 }, { "epoch": 17.661320636511714, "grad_norm": 5.866313457489014, "learning_rate": 4.108126945665758e-07, "loss": 0.429, "num_input_tokens_seen": 171417856, "step": 140955 }, { "epoch": 17.6619471244205, "grad_norm": 4.805586338043213, "learning_rate": 4.105957004364569e-07, "loss": 0.4646, "num_input_tokens_seen": 171423584, "step": 140960 }, { "epoch": 17.662573612329282, "grad_norm": 3.350896120071411, "learning_rate": 4.1037876117652585e-07, "loss": 0.457, "num_input_tokens_seen": 171429888, "step": 140965 }, { "epoch": 17.663200100238065, "grad_norm": 4.368664264678955, "learning_rate": 4.1016187678937445e-07, "loss": 0.4692, "num_input_tokens_seen": 171435648, "step": 140970 }, { "epoch": 17.663826588146847, "grad_norm": 17.233478546142578, "learning_rate": 4.0994504727759675e-07, "loss": 0.499, "num_input_tokens_seen": 171442336, "step": 140975 }, { "epoch": 17.664453076055633, "grad_norm": 10.205157279968262, "learning_rate": 4.097282726437857e-07, "loss": 0.394, "num_input_tokens_seen": 171448576, "step": 140980 }, { "epoch": 17.665079563964415, "grad_norm": 6.033178329467773, "learning_rate": 4.095115528905319e-07, "loss": 0.4524, "num_input_tokens_seen": 171454784, "step": 140985 }, { "epoch": 17.665706051873197, "grad_norm": 11.551053047180176, "learning_rate": 4.092948880204278e-07, "loss": 0.4367, "num_input_tokens_seen": 171460896, "step": 140990 }, { "epoch": 17.666332539781983, "grad_norm": 7.648316383361816, "learning_rate": 4.090782780360619e-07, "loss": 0.408, "num_input_tokens_seen": 171466912, "step": 140995 }, { "epoch": 17.666959027690766, "grad_norm": 13.233342170715332, "learning_rate": 4.08861722940026e-07, "loss": 0.5098, "num_input_tokens_seen": 171472896, "step": 141000 }, { "epoch": 17.667585515599548, "grad_norm": 24.155517578125, "learning_rate": 4.0864522273490747e-07, "loss": 0.4729, "num_input_tokens_seen": 171478912, "step": 141005 }, { "epoch": 17.668212003508334, "grad_norm": 5.042160511016846, "learning_rate": 4.084287774232964e-07, "loss": 0.4324, "num_input_tokens_seen": 171485056, "step": 141010 }, { "epoch": 17.668838491417116, "grad_norm": 5.228074073791504, "learning_rate": 4.082123870077787e-07, "loss": 0.4639, "num_input_tokens_seen": 171491456, "step": 141015 }, { "epoch": 17.669464979325898, "grad_norm": 11.230437278747559, "learning_rate": 4.07996051490942e-07, "loss": 0.4139, "num_input_tokens_seen": 171497440, "step": 141020 }, { "epoch": 17.670091467234684, "grad_norm": 5.513956069946289, "learning_rate": 4.077797708753739e-07, "loss": 0.4768, "num_input_tokens_seen": 171503424, "step": 141025 }, { "epoch": 17.670717955143466, "grad_norm": 4.366191387176514, "learning_rate": 4.075635451636595e-07, "loss": 0.3884, "num_input_tokens_seen": 171509312, "step": 141030 }, { "epoch": 17.67134444305225, "grad_norm": 27.441205978393555, "learning_rate": 4.0734737435838447e-07, "loss": 0.4435, "num_input_tokens_seen": 171515008, "step": 141035 }, { "epoch": 17.67197093096103, "grad_norm": 4.713563919067383, "learning_rate": 4.071312584621323e-07, "loss": 0.4598, "num_input_tokens_seen": 171521152, "step": 141040 }, { "epoch": 17.672597418869817, "grad_norm": 13.533613204956055, "learning_rate": 4.0691519747748877e-07, "loss": 0.4448, "num_input_tokens_seen": 171526848, "step": 141045 }, { "epoch": 17.6732239067786, "grad_norm": 6.647159099578857, "learning_rate": 4.0669919140703394e-07, "loss": 0.4205, "num_input_tokens_seen": 171533152, "step": 141050 }, { "epoch": 17.67385039468738, "grad_norm": 6.573318958282471, "learning_rate": 4.064832402533536e-07, "loss": 0.4097, "num_input_tokens_seen": 171539104, "step": 141055 }, { "epoch": 17.674476882596167, "grad_norm": 11.077434539794922, "learning_rate": 4.062673440190268e-07, "loss": 0.4419, "num_input_tokens_seen": 171545088, "step": 141060 }, { "epoch": 17.67510337050495, "grad_norm": 10.531679153442383, "learning_rate": 4.060515027066375e-07, "loss": 0.3795, "num_input_tokens_seen": 171550880, "step": 141065 }, { "epoch": 17.67572985841373, "grad_norm": 19.09004783630371, "learning_rate": 4.058357163187637e-07, "loss": 0.4546, "num_input_tokens_seen": 171557056, "step": 141070 }, { "epoch": 17.676356346322518, "grad_norm": 6.230641841888428, "learning_rate": 4.0561998485798725e-07, "loss": 0.5042, "num_input_tokens_seen": 171562848, "step": 141075 }, { "epoch": 17.6769828342313, "grad_norm": 10.675960540771484, "learning_rate": 4.0540430832688603e-07, "loss": 0.4112, "num_input_tokens_seen": 171568672, "step": 141080 }, { "epoch": 17.677609322140082, "grad_norm": 3.5073001384735107, "learning_rate": 4.051886867280397e-07, "loss": 0.4367, "num_input_tokens_seen": 171574816, "step": 141085 }, { "epoch": 17.678235810048864, "grad_norm": 6.733827590942383, "learning_rate": 4.04973120064025e-07, "loss": 0.4242, "num_input_tokens_seen": 171580896, "step": 141090 }, { "epoch": 17.67886229795765, "grad_norm": 7.772133827209473, "learning_rate": 4.0475760833741995e-07, "loss": 0.5176, "num_input_tokens_seen": 171587040, "step": 141095 }, { "epoch": 17.679488785866432, "grad_norm": 7.6937737464904785, "learning_rate": 4.0454215155080246e-07, "loss": 0.3953, "num_input_tokens_seen": 171593216, "step": 141100 }, { "epoch": 17.680115273775215, "grad_norm": 4.903363227844238, "learning_rate": 4.043267497067455e-07, "loss": 0.4044, "num_input_tokens_seen": 171599584, "step": 141105 }, { "epoch": 17.680741761684, "grad_norm": 5.7141313552856445, "learning_rate": 4.0411140280782743e-07, "loss": 0.4828, "num_input_tokens_seen": 171605440, "step": 141110 }, { "epoch": 17.681368249592783, "grad_norm": 3.46772837638855, "learning_rate": 4.038961108566208e-07, "loss": 0.4066, "num_input_tokens_seen": 171611968, "step": 141115 }, { "epoch": 17.681994737501565, "grad_norm": 5.024909973144531, "learning_rate": 4.0368087385570023e-07, "loss": 0.4152, "num_input_tokens_seen": 171618176, "step": 141120 }, { "epoch": 17.68262122541035, "grad_norm": 7.392875671386719, "learning_rate": 4.034656918076402e-07, "loss": 0.483, "num_input_tokens_seen": 171624416, "step": 141125 }, { "epoch": 17.683247713319133, "grad_norm": 16.46287727355957, "learning_rate": 4.032505647150115e-07, "loss": 0.4933, "num_input_tokens_seen": 171629760, "step": 141130 }, { "epoch": 17.683874201227916, "grad_norm": 17.18351173400879, "learning_rate": 4.0303549258038644e-07, "loss": 0.4694, "num_input_tokens_seen": 171636000, "step": 141135 }, { "epoch": 17.6845006891367, "grad_norm": 5.402112007141113, "learning_rate": 4.028204754063386e-07, "loss": 0.4333, "num_input_tokens_seen": 171642560, "step": 141140 }, { "epoch": 17.685127177045484, "grad_norm": 7.775296211242676, "learning_rate": 4.0260551319543597e-07, "loss": 0.4756, "num_input_tokens_seen": 171648448, "step": 141145 }, { "epoch": 17.685753664954266, "grad_norm": 12.600430488586426, "learning_rate": 4.023906059502503e-07, "loss": 0.5268, "num_input_tokens_seen": 171654688, "step": 141150 }, { "epoch": 17.686380152863048, "grad_norm": 4.90301513671875, "learning_rate": 4.021757536733495e-07, "loss": 0.4257, "num_input_tokens_seen": 171661088, "step": 141155 }, { "epoch": 17.687006640771834, "grad_norm": 22.29054069519043, "learning_rate": 4.0196095636730383e-07, "loss": 0.4615, "num_input_tokens_seen": 171667136, "step": 141160 }, { "epoch": 17.687633128680616, "grad_norm": 9.310700416564941, "learning_rate": 4.0174621403468117e-07, "loss": 0.403, "num_input_tokens_seen": 171672864, "step": 141165 }, { "epoch": 17.6882596165894, "grad_norm": 4.296838283538818, "learning_rate": 4.015315266780473e-07, "loss": 0.4669, "num_input_tokens_seen": 171678304, "step": 141170 }, { "epoch": 17.688886104498184, "grad_norm": 12.600295066833496, "learning_rate": 4.013168942999718e-07, "loss": 0.4458, "num_input_tokens_seen": 171684064, "step": 141175 }, { "epoch": 17.689512592406967, "grad_norm": 5.952449798583984, "learning_rate": 4.011023169030176e-07, "loss": 0.4861, "num_input_tokens_seen": 171689696, "step": 141180 }, { "epoch": 17.69013908031575, "grad_norm": 6.229874134063721, "learning_rate": 4.008877944897532e-07, "loss": 0.5706, "num_input_tokens_seen": 171695488, "step": 141185 }, { "epoch": 17.690765568224535, "grad_norm": 5.25499153137207, "learning_rate": 4.006733270627411e-07, "loss": 0.4822, "num_input_tokens_seen": 171701632, "step": 141190 }, { "epoch": 17.691392056133317, "grad_norm": 9.712106704711914, "learning_rate": 4.004589146245469e-07, "loss": 0.4573, "num_input_tokens_seen": 171707872, "step": 141195 }, { "epoch": 17.6920185440421, "grad_norm": 15.094772338867188, "learning_rate": 4.0024455717773245e-07, "loss": 0.4339, "num_input_tokens_seen": 171713952, "step": 141200 }, { "epoch": 17.69264503195088, "grad_norm": 10.778871536254883, "learning_rate": 4.0003025472486246e-07, "loss": 0.4182, "num_input_tokens_seen": 171720032, "step": 141205 }, { "epoch": 17.693271519859668, "grad_norm": 5.980403900146484, "learning_rate": 3.9981600726849754e-07, "loss": 0.4131, "num_input_tokens_seen": 171725952, "step": 141210 }, { "epoch": 17.69389800776845, "grad_norm": 4.460077285766602, "learning_rate": 3.9960181481119966e-07, "loss": 0.4853, "num_input_tokens_seen": 171731808, "step": 141215 }, { "epoch": 17.694524495677232, "grad_norm": 4.758157253265381, "learning_rate": 3.993876773555311e-07, "loss": 0.4722, "num_input_tokens_seen": 171738016, "step": 141220 }, { "epoch": 17.695150983586018, "grad_norm": 11.26778793334961, "learning_rate": 3.991735949040498e-07, "loss": 0.4437, "num_input_tokens_seen": 171744000, "step": 141225 }, { "epoch": 17.6957774714948, "grad_norm": 8.32305908203125, "learning_rate": 3.989595674593161e-07, "loss": 0.5712, "num_input_tokens_seen": 171749600, "step": 141230 }, { "epoch": 17.696403959403582, "grad_norm": 4.273103713989258, "learning_rate": 3.9874559502388946e-07, "loss": 0.454, "num_input_tokens_seen": 171755840, "step": 141235 }, { "epoch": 17.69703044731237, "grad_norm": 4.906257152557373, "learning_rate": 3.985316776003284e-07, "loss": 0.4474, "num_input_tokens_seen": 171762048, "step": 141240 }, { "epoch": 17.69765693522115, "grad_norm": 4.502274036407471, "learning_rate": 3.9831781519118926e-07, "loss": 0.4516, "num_input_tokens_seen": 171768192, "step": 141245 }, { "epoch": 17.698283423129933, "grad_norm": 14.53481674194336, "learning_rate": 3.9810400779903003e-07, "loss": 0.4609, "num_input_tokens_seen": 171774240, "step": 141250 }, { "epoch": 17.69890991103872, "grad_norm": 3.3037612438201904, "learning_rate": 3.978902554264058e-07, "loss": 0.4298, "num_input_tokens_seen": 171780480, "step": 141255 }, { "epoch": 17.6995363989475, "grad_norm": 21.038022994995117, "learning_rate": 3.9767655807587346e-07, "loss": 0.4858, "num_input_tokens_seen": 171786496, "step": 141260 }, { "epoch": 17.700162886856283, "grad_norm": 10.04118824005127, "learning_rate": 3.9746291574998706e-07, "loss": 0.5305, "num_input_tokens_seen": 171792768, "step": 141265 }, { "epoch": 17.700789374765066, "grad_norm": 6.604879856109619, "learning_rate": 3.9724932845130126e-07, "loss": 0.4771, "num_input_tokens_seen": 171799040, "step": 141270 }, { "epoch": 17.70141586267385, "grad_norm": 27.485055923461914, "learning_rate": 3.9703579618236896e-07, "loss": 0.499, "num_input_tokens_seen": 171804736, "step": 141275 }, { "epoch": 17.702042350582634, "grad_norm": 6.035110950469971, "learning_rate": 3.9682231894574375e-07, "loss": 0.4241, "num_input_tokens_seen": 171810784, "step": 141280 }, { "epoch": 17.702668838491416, "grad_norm": 17.731931686401367, "learning_rate": 3.966088967439785e-07, "loss": 0.4312, "num_input_tokens_seen": 171817152, "step": 141285 }, { "epoch": 17.7032953264002, "grad_norm": 10.423956871032715, "learning_rate": 3.9639552957962344e-07, "loss": 0.4478, "num_input_tokens_seen": 171823584, "step": 141290 }, { "epoch": 17.703921814308984, "grad_norm": 16.68813133239746, "learning_rate": 3.96182217455231e-07, "loss": 0.516, "num_input_tokens_seen": 171829760, "step": 141295 }, { "epoch": 17.704548302217766, "grad_norm": 4.1346435546875, "learning_rate": 3.959689603733502e-07, "loss": 0.4298, "num_input_tokens_seen": 171835744, "step": 141300 }, { "epoch": 17.705174790126552, "grad_norm": 2.8095269203186035, "learning_rate": 3.957557583365318e-07, "loss": 0.4571, "num_input_tokens_seen": 171842208, "step": 141305 }, { "epoch": 17.705801278035334, "grad_norm": 3.7444677352905273, "learning_rate": 3.9554261134732374e-07, "loss": 0.4242, "num_input_tokens_seen": 171848256, "step": 141310 }, { "epoch": 17.706427765944117, "grad_norm": 5.4795660972595215, "learning_rate": 3.953295194082762e-07, "loss": 0.4305, "num_input_tokens_seen": 171854656, "step": 141315 }, { "epoch": 17.7070542538529, "grad_norm": 14.944295883178711, "learning_rate": 3.9511648252193445e-07, "loss": 0.5086, "num_input_tokens_seen": 171860640, "step": 141320 }, { "epoch": 17.707680741761685, "grad_norm": 4.455508708953857, "learning_rate": 3.949035006908469e-07, "loss": 0.415, "num_input_tokens_seen": 171866816, "step": 141325 }, { "epoch": 17.708307229670467, "grad_norm": 4.7861480712890625, "learning_rate": 3.946905739175594e-07, "loss": 0.4431, "num_input_tokens_seen": 171873024, "step": 141330 }, { "epoch": 17.70893371757925, "grad_norm": 4.339994430541992, "learning_rate": 3.9447770220461923e-07, "loss": 0.4646, "num_input_tokens_seen": 171879168, "step": 141335 }, { "epoch": 17.709560205488035, "grad_norm": 28.401790618896484, "learning_rate": 3.9426488555456885e-07, "loss": 0.6001, "num_input_tokens_seen": 171885472, "step": 141340 }, { "epoch": 17.710186693396818, "grad_norm": 18.54036521911621, "learning_rate": 3.940521239699546e-07, "loss": 0.4752, "num_input_tokens_seen": 171891648, "step": 141345 }, { "epoch": 17.7108131813056, "grad_norm": 5.187594890594482, "learning_rate": 3.9383941745332053e-07, "loss": 0.5058, "num_input_tokens_seen": 171897600, "step": 141350 }, { "epoch": 17.711439669214386, "grad_norm": 17.5253963470459, "learning_rate": 3.9362676600720797e-07, "loss": 0.4567, "num_input_tokens_seen": 171903680, "step": 141355 }, { "epoch": 17.712066157123168, "grad_norm": 4.052552700042725, "learning_rate": 3.934141696341609e-07, "loss": 0.4336, "num_input_tokens_seen": 171909664, "step": 141360 }, { "epoch": 17.71269264503195, "grad_norm": 3.1163225173950195, "learning_rate": 3.932016283367196e-07, "loss": 0.4521, "num_input_tokens_seen": 171915552, "step": 141365 }, { "epoch": 17.713319132940732, "grad_norm": 14.368910789489746, "learning_rate": 3.92989142117427e-07, "loss": 0.4606, "num_input_tokens_seen": 171921600, "step": 141370 }, { "epoch": 17.71394562084952, "grad_norm": 4.303576469421387, "learning_rate": 3.927767109788216e-07, "loss": 0.4325, "num_input_tokens_seen": 171927904, "step": 141375 }, { "epoch": 17.7145721087583, "grad_norm": 5.01884651184082, "learning_rate": 3.925643349234454e-07, "loss": 0.4391, "num_input_tokens_seen": 171934144, "step": 141380 }, { "epoch": 17.715198596667083, "grad_norm": 3.919532299041748, "learning_rate": 3.9235201395383505e-07, "loss": 0.4432, "num_input_tokens_seen": 171940480, "step": 141385 }, { "epoch": 17.71582508457587, "grad_norm": 4.326003551483154, "learning_rate": 3.9213974807253085e-07, "loss": 0.4315, "num_input_tokens_seen": 171946688, "step": 141390 }, { "epoch": 17.71645157248465, "grad_norm": 19.529132843017578, "learning_rate": 3.9192753728206965e-07, "loss": 0.4297, "num_input_tokens_seen": 171952928, "step": 141395 }, { "epoch": 17.717078060393433, "grad_norm": 3.1601879596710205, "learning_rate": 3.917153815849895e-07, "loss": 0.4337, "num_input_tokens_seen": 171959040, "step": 141400 }, { "epoch": 17.71770454830222, "grad_norm": 3.906994581222534, "learning_rate": 3.9150328098382593e-07, "loss": 0.4253, "num_input_tokens_seen": 171964576, "step": 141405 }, { "epoch": 17.718331036211, "grad_norm": 3.486686944961548, "learning_rate": 3.912912354811149e-07, "loss": 0.423, "num_input_tokens_seen": 171970560, "step": 141410 }, { "epoch": 17.718957524119784, "grad_norm": 15.144451141357422, "learning_rate": 3.910792450793932e-07, "loss": 0.5355, "num_input_tokens_seen": 171976576, "step": 141415 }, { "epoch": 17.71958401202857, "grad_norm": 14.19892406463623, "learning_rate": 3.9086730978119327e-07, "loss": 0.4497, "num_input_tokens_seen": 171982560, "step": 141420 }, { "epoch": 17.72021049993735, "grad_norm": 6.549557209014893, "learning_rate": 3.9065542958905024e-07, "loss": 0.4256, "num_input_tokens_seen": 171988864, "step": 141425 }, { "epoch": 17.720836987846134, "grad_norm": 7.752634048461914, "learning_rate": 3.90443604505496e-07, "loss": 0.4426, "num_input_tokens_seen": 171995040, "step": 141430 }, { "epoch": 17.721463475754916, "grad_norm": 30.462984085083008, "learning_rate": 3.9023183453306415e-07, "loss": 0.4771, "num_input_tokens_seen": 172001248, "step": 141435 }, { "epoch": 17.722089963663702, "grad_norm": 4.892297267913818, "learning_rate": 3.9002011967428646e-07, "loss": 0.4723, "num_input_tokens_seen": 172007264, "step": 141440 }, { "epoch": 17.722716451572484, "grad_norm": 10.089212417602539, "learning_rate": 3.898084599316948e-07, "loss": 0.4372, "num_input_tokens_seen": 172013376, "step": 141445 }, { "epoch": 17.723342939481267, "grad_norm": 3.862661838531494, "learning_rate": 3.895968553078183e-07, "loss": 0.4149, "num_input_tokens_seen": 172019712, "step": 141450 }, { "epoch": 17.723969427390053, "grad_norm": 6.0012898445129395, "learning_rate": 3.893853058051883e-07, "loss": 0.4675, "num_input_tokens_seen": 172025280, "step": 141455 }, { "epoch": 17.724595915298835, "grad_norm": 27.51448631286621, "learning_rate": 3.891738114263327e-07, "loss": 0.5312, "num_input_tokens_seen": 172030464, "step": 141460 }, { "epoch": 17.725222403207617, "grad_norm": 4.7498087882995605, "learning_rate": 3.8896237217378065e-07, "loss": 0.4432, "num_input_tokens_seen": 172036736, "step": 141465 }, { "epoch": 17.725848891116403, "grad_norm": 10.945197105407715, "learning_rate": 3.8875098805006115e-07, "loss": 0.4163, "num_input_tokens_seen": 172043040, "step": 141470 }, { "epoch": 17.726475379025185, "grad_norm": 4.858973979949951, "learning_rate": 3.885396590576995e-07, "loss": 0.4289, "num_input_tokens_seen": 172048928, "step": 141475 }, { "epoch": 17.727101866933968, "grad_norm": 10.908546447753906, "learning_rate": 3.883283851992242e-07, "loss": 0.4267, "num_input_tokens_seen": 172054400, "step": 141480 }, { "epoch": 17.727728354842753, "grad_norm": 6.5510735511779785, "learning_rate": 3.881171664771599e-07, "loss": 0.3987, "num_input_tokens_seen": 172060640, "step": 141485 }, { "epoch": 17.728354842751536, "grad_norm": 8.534401893615723, "learning_rate": 3.879060028940329e-07, "loss": 0.4941, "num_input_tokens_seen": 172066336, "step": 141490 }, { "epoch": 17.728981330660318, "grad_norm": 3.7579915523529053, "learning_rate": 3.8769489445236617e-07, "loss": 0.4407, "num_input_tokens_seen": 172072384, "step": 141495 }, { "epoch": 17.7296078185691, "grad_norm": 11.601996421813965, "learning_rate": 3.874838411546861e-07, "loss": 0.436, "num_input_tokens_seen": 172078176, "step": 141500 }, { "epoch": 17.730234306477886, "grad_norm": 19.292020797729492, "learning_rate": 3.872728430035133e-07, "loss": 0.5403, "num_input_tokens_seen": 172084480, "step": 141505 }, { "epoch": 17.73086079438667, "grad_norm": 7.429999828338623, "learning_rate": 3.8706190000137313e-07, "loss": 0.4516, "num_input_tokens_seen": 172090624, "step": 141510 }, { "epoch": 17.73148728229545, "grad_norm": 7.057535648345947, "learning_rate": 3.868510121507851e-07, "loss": 0.4132, "num_input_tokens_seen": 172096736, "step": 141515 }, { "epoch": 17.732113770204236, "grad_norm": 7.728360176086426, "learning_rate": 3.866401794542729e-07, "loss": 0.4356, "num_input_tokens_seen": 172102432, "step": 141520 }, { "epoch": 17.73274025811302, "grad_norm": 13.657740592956543, "learning_rate": 3.8642940191435496e-07, "loss": 0.4232, "num_input_tokens_seen": 172108768, "step": 141525 }, { "epoch": 17.7333667460218, "grad_norm": 8.479496002197266, "learning_rate": 3.862186795335526e-07, "loss": 0.5226, "num_input_tokens_seen": 172114848, "step": 141530 }, { "epoch": 17.733993233930587, "grad_norm": 12.966571807861328, "learning_rate": 3.8600801231438447e-07, "loss": 0.4182, "num_input_tokens_seen": 172121120, "step": 141535 }, { "epoch": 17.73461972183937, "grad_norm": 7.029272556304932, "learning_rate": 3.857974002593712e-07, "loss": 0.4322, "num_input_tokens_seen": 172126304, "step": 141540 }, { "epoch": 17.73524620974815, "grad_norm": 10.90646743774414, "learning_rate": 3.855868433710286e-07, "loss": 0.5376, "num_input_tokens_seen": 172132608, "step": 141545 }, { "epoch": 17.735872697656934, "grad_norm": 8.102206230163574, "learning_rate": 3.8537634165187464e-07, "loss": 0.447, "num_input_tokens_seen": 172138880, "step": 141550 }, { "epoch": 17.73649918556572, "grad_norm": 3.4584643840789795, "learning_rate": 3.851658951044268e-07, "loss": 0.4031, "num_input_tokens_seen": 172145184, "step": 141555 }, { "epoch": 17.7371256734745, "grad_norm": 4.756138324737549, "learning_rate": 3.8495550373120026e-07, "loss": 0.3991, "num_input_tokens_seen": 172151616, "step": 141560 }, { "epoch": 17.737752161383284, "grad_norm": 4.002596378326416, "learning_rate": 3.8474516753471193e-07, "loss": 0.4413, "num_input_tokens_seen": 172157056, "step": 141565 }, { "epoch": 17.73837864929207, "grad_norm": 5.859323024749756, "learning_rate": 3.845348865174742e-07, "loss": 0.4637, "num_input_tokens_seen": 172163232, "step": 141570 }, { "epoch": 17.739005137200852, "grad_norm": 5.942935466766357, "learning_rate": 3.843246606820028e-07, "loss": 0.4186, "num_input_tokens_seen": 172169568, "step": 141575 }, { "epoch": 17.739631625109634, "grad_norm": 3.9514589309692383, "learning_rate": 3.8411449003081024e-07, "loss": 0.4264, "num_input_tokens_seen": 172176000, "step": 141580 }, { "epoch": 17.74025811301842, "grad_norm": 31.698671340942383, "learning_rate": 3.839043745664106e-07, "loss": 0.4827, "num_input_tokens_seen": 172182048, "step": 141585 }, { "epoch": 17.740884600927203, "grad_norm": 5.565348148345947, "learning_rate": 3.8369431429131464e-07, "loss": 0.4511, "num_input_tokens_seen": 172188032, "step": 141590 }, { "epoch": 17.741511088835985, "grad_norm": 17.54668617248535, "learning_rate": 3.834843092080337e-07, "loss": 0.4536, "num_input_tokens_seen": 172194144, "step": 141595 }, { "epoch": 17.742137576744767, "grad_norm": 3.4879093170166016, "learning_rate": 3.8327435931908075e-07, "loss": 0.4452, "num_input_tokens_seen": 172200096, "step": 141600 }, { "epoch": 17.742764064653553, "grad_norm": 4.697160243988037, "learning_rate": 3.8306446462696324e-07, "loss": 0.4164, "num_input_tokens_seen": 172206112, "step": 141605 }, { "epoch": 17.743390552562335, "grad_norm": 30.78042984008789, "learning_rate": 3.828546251341925e-07, "loss": 0.5254, "num_input_tokens_seen": 172212352, "step": 141610 }, { "epoch": 17.744017040471117, "grad_norm": 9.978585243225098, "learning_rate": 3.826448408432754e-07, "loss": 0.4577, "num_input_tokens_seen": 172218432, "step": 141615 }, { "epoch": 17.744643528379903, "grad_norm": 4.381900787353516, "learning_rate": 3.8243511175672264e-07, "loss": 0.4301, "num_input_tokens_seen": 172224576, "step": 141620 }, { "epoch": 17.745270016288686, "grad_norm": 6.4482316970825195, "learning_rate": 3.822254378770396e-07, "loss": 0.5068, "num_input_tokens_seen": 172230784, "step": 141625 }, { "epoch": 17.745896504197468, "grad_norm": 7.427662372589111, "learning_rate": 3.820158192067341e-07, "loss": 0.4178, "num_input_tokens_seen": 172236832, "step": 141630 }, { "epoch": 17.746522992106254, "grad_norm": 7.732644081115723, "learning_rate": 3.8180625574831154e-07, "loss": 0.4555, "num_input_tokens_seen": 172242304, "step": 141635 }, { "epoch": 17.747149480015036, "grad_norm": 5.147383689880371, "learning_rate": 3.8159674750427813e-07, "loss": 0.4811, "num_input_tokens_seen": 172248224, "step": 141640 }, { "epoch": 17.74777596792382, "grad_norm": 9.726954460144043, "learning_rate": 3.813872944771385e-07, "loss": 0.4274, "num_input_tokens_seen": 172254752, "step": 141645 }, { "epoch": 17.748402455832604, "grad_norm": 20.883243560791016, "learning_rate": 3.8117789666939687e-07, "loss": 0.4739, "num_input_tokens_seen": 172260960, "step": 141650 }, { "epoch": 17.749028943741386, "grad_norm": 18.07538604736328, "learning_rate": 3.8096855408355726e-07, "loss": 0.4418, "num_input_tokens_seen": 172267232, "step": 141655 }, { "epoch": 17.74965543165017, "grad_norm": 6.354978084564209, "learning_rate": 3.807592667221216e-07, "loss": 0.403, "num_input_tokens_seen": 172273408, "step": 141660 }, { "epoch": 17.75028191955895, "grad_norm": 9.260370254516602, "learning_rate": 3.8055003458759345e-07, "loss": 0.5031, "num_input_tokens_seen": 172279616, "step": 141665 }, { "epoch": 17.750908407467737, "grad_norm": 11.994758605957031, "learning_rate": 3.803408576824724e-07, "loss": 0.4219, "num_input_tokens_seen": 172285536, "step": 141670 }, { "epoch": 17.75153489537652, "grad_norm": 3.9011709690093994, "learning_rate": 3.8013173600926156e-07, "loss": 0.4265, "num_input_tokens_seen": 172291808, "step": 141675 }, { "epoch": 17.7521613832853, "grad_norm": 14.758586883544922, "learning_rate": 3.7992266957045887e-07, "loss": 0.5191, "num_input_tokens_seen": 172297920, "step": 141680 }, { "epoch": 17.752787871194087, "grad_norm": 3.932176113128662, "learning_rate": 3.7971365836856567e-07, "loss": 0.4587, "num_input_tokens_seen": 172303904, "step": 141685 }, { "epoch": 17.75341435910287, "grad_norm": 3.9946370124816895, "learning_rate": 3.795047024060794e-07, "loss": 0.4141, "num_input_tokens_seen": 172310272, "step": 141690 }, { "epoch": 17.75404084701165, "grad_norm": 3.1551709175109863, "learning_rate": 3.792958016855003e-07, "loss": 0.447, "num_input_tokens_seen": 172316128, "step": 141695 }, { "epoch": 17.754667334920438, "grad_norm": 4.355669975280762, "learning_rate": 3.7908695620932356e-07, "loss": 0.4773, "num_input_tokens_seen": 172322240, "step": 141700 }, { "epoch": 17.75529382282922, "grad_norm": 5.478231906890869, "learning_rate": 3.7887816598004833e-07, "loss": 0.421, "num_input_tokens_seen": 172328448, "step": 141705 }, { "epoch": 17.755920310738002, "grad_norm": 6.096095561981201, "learning_rate": 3.786694310001693e-07, "loss": 0.4504, "num_input_tokens_seen": 172334432, "step": 141710 }, { "epoch": 17.756546798646784, "grad_norm": 29.32428741455078, "learning_rate": 3.7846075127218217e-07, "loss": 0.4843, "num_input_tokens_seen": 172340000, "step": 141715 }, { "epoch": 17.75717328655557, "grad_norm": 11.725406646728516, "learning_rate": 3.782521267985834e-07, "loss": 0.4362, "num_input_tokens_seen": 172345856, "step": 141720 }, { "epoch": 17.757799774464353, "grad_norm": 4.550012588500977, "learning_rate": 3.7804355758186526e-07, "loss": 0.4051, "num_input_tokens_seen": 172352160, "step": 141725 }, { "epoch": 17.758426262373135, "grad_norm": 3.735372304916382, "learning_rate": 3.778350436245232e-07, "loss": 0.4079, "num_input_tokens_seen": 172358624, "step": 141730 }, { "epoch": 17.75905275028192, "grad_norm": 8.193026542663574, "learning_rate": 3.7762658492904837e-07, "loss": 0.4597, "num_input_tokens_seen": 172365024, "step": 141735 }, { "epoch": 17.759679238190703, "grad_norm": 8.45430850982666, "learning_rate": 3.774181814979344e-07, "loss": 0.4399, "num_input_tokens_seen": 172371136, "step": 141740 }, { "epoch": 17.760305726099485, "grad_norm": 30.401294708251953, "learning_rate": 3.7720983333367323e-07, "loss": 0.5458, "num_input_tokens_seen": 172377056, "step": 141745 }, { "epoch": 17.76093221400827, "grad_norm": 32.46292495727539, "learning_rate": 3.7700154043875394e-07, "loss": 0.4836, "num_input_tokens_seen": 172382880, "step": 141750 }, { "epoch": 17.761558701917053, "grad_norm": 6.039651393890381, "learning_rate": 3.7679330281566783e-07, "loss": 0.4891, "num_input_tokens_seen": 172388800, "step": 141755 }, { "epoch": 17.762185189825836, "grad_norm": 6.186629295349121, "learning_rate": 3.765851204669063e-07, "loss": 0.4299, "num_input_tokens_seen": 172394848, "step": 141760 }, { "epoch": 17.76281167773462, "grad_norm": 6.626985549926758, "learning_rate": 3.763769933949557e-07, "loss": 0.5044, "num_input_tokens_seen": 172400960, "step": 141765 }, { "epoch": 17.763438165643404, "grad_norm": 8.736167907714844, "learning_rate": 3.7616892160230625e-07, "loss": 0.4186, "num_input_tokens_seen": 172406784, "step": 141770 }, { "epoch": 17.764064653552186, "grad_norm": 5.092956066131592, "learning_rate": 3.7596090509144365e-07, "loss": 0.4711, "num_input_tokens_seen": 172413120, "step": 141775 }, { "epoch": 17.76469114146097, "grad_norm": 7.300709247589111, "learning_rate": 3.7575294386485663e-07, "loss": 0.4515, "num_input_tokens_seen": 172419328, "step": 141780 }, { "epoch": 17.765317629369754, "grad_norm": 11.304275512695312, "learning_rate": 3.755450379250319e-07, "loss": 0.4763, "num_input_tokens_seen": 172425216, "step": 141785 }, { "epoch": 17.765944117278536, "grad_norm": 5.512197494506836, "learning_rate": 3.753371872744532e-07, "loss": 0.4662, "num_input_tokens_seen": 172431136, "step": 141790 }, { "epoch": 17.76657060518732, "grad_norm": 4.352666854858398, "learning_rate": 3.7512939191560737e-07, "loss": 0.4774, "num_input_tokens_seen": 172437152, "step": 141795 }, { "epoch": 17.767197093096105, "grad_norm": 6.0356526374816895, "learning_rate": 3.749216518509774e-07, "loss": 0.4996, "num_input_tokens_seen": 172443424, "step": 141800 }, { "epoch": 17.767823581004887, "grad_norm": 2.799370765686035, "learning_rate": 3.747139670830485e-07, "loss": 0.4798, "num_input_tokens_seen": 172449408, "step": 141805 }, { "epoch": 17.76845006891367, "grad_norm": 17.092819213867188, "learning_rate": 3.745063376143021e-07, "loss": 0.5031, "num_input_tokens_seen": 172455328, "step": 141810 }, { "epoch": 17.769076556822455, "grad_norm": 4.4636149406433105, "learning_rate": 3.742987634472217e-07, "loss": 0.4325, "num_input_tokens_seen": 172460384, "step": 141815 }, { "epoch": 17.769703044731237, "grad_norm": 4.247578144073486, "learning_rate": 3.7409124458428814e-07, "loss": 0.4497, "num_input_tokens_seen": 172466688, "step": 141820 }, { "epoch": 17.77032953264002, "grad_norm": 21.803348541259766, "learning_rate": 3.738837810279838e-07, "loss": 0.5813, "num_input_tokens_seen": 172472128, "step": 141825 }, { "epoch": 17.7709560205488, "grad_norm": 4.663419246673584, "learning_rate": 3.7367637278078736e-07, "loss": 0.4113, "num_input_tokens_seen": 172478336, "step": 141830 }, { "epoch": 17.771582508457588, "grad_norm": 3.22538423538208, "learning_rate": 3.734690198451796e-07, "loss": 0.4634, "num_input_tokens_seen": 172483872, "step": 141835 }, { "epoch": 17.77220899636637, "grad_norm": 10.010096549987793, "learning_rate": 3.732617222236401e-07, "loss": 0.4537, "num_input_tokens_seen": 172490016, "step": 141840 }, { "epoch": 17.772835484275152, "grad_norm": 4.265926837921143, "learning_rate": 3.730544799186464e-07, "loss": 0.4907, "num_input_tokens_seen": 172496416, "step": 141845 }, { "epoch": 17.773461972183938, "grad_norm": 16.9896297454834, "learning_rate": 3.7284729293267595e-07, "loss": 0.465, "num_input_tokens_seen": 172502624, "step": 141850 }, { "epoch": 17.77408846009272, "grad_norm": 9.810667037963867, "learning_rate": 3.7264016126820677e-07, "loss": 0.501, "num_input_tokens_seen": 172508064, "step": 141855 }, { "epoch": 17.774714948001503, "grad_norm": 12.080474853515625, "learning_rate": 3.7243308492771524e-07, "loss": 0.4665, "num_input_tokens_seen": 172513760, "step": 141860 }, { "epoch": 17.77534143591029, "grad_norm": 12.199601173400879, "learning_rate": 3.7222606391367655e-07, "loss": 0.4578, "num_input_tokens_seen": 172519232, "step": 141865 }, { "epoch": 17.77596792381907, "grad_norm": 5.709797382354736, "learning_rate": 3.720190982285665e-07, "loss": 0.4355, "num_input_tokens_seen": 172524768, "step": 141870 }, { "epoch": 17.776594411727853, "grad_norm": 4.467593193054199, "learning_rate": 3.7181218787485876e-07, "loss": 0.4598, "num_input_tokens_seen": 172531168, "step": 141875 }, { "epoch": 17.77722089963664, "grad_norm": 6.767517566680908, "learning_rate": 3.7160533285502786e-07, "loss": 0.4465, "num_input_tokens_seen": 172537632, "step": 141880 }, { "epoch": 17.77784738754542, "grad_norm": 5.838311195373535, "learning_rate": 3.7139853317154585e-07, "loss": 0.4171, "num_input_tokens_seen": 172543648, "step": 141885 }, { "epoch": 17.778473875454203, "grad_norm": 23.361793518066406, "learning_rate": 3.711917888268868e-07, "loss": 0.4918, "num_input_tokens_seen": 172550080, "step": 141890 }, { "epoch": 17.779100363362986, "grad_norm": 11.342669486999512, "learning_rate": 3.709850998235209e-07, "loss": 0.4185, "num_input_tokens_seen": 172555488, "step": 141895 }, { "epoch": 17.77972685127177, "grad_norm": 4.722879886627197, "learning_rate": 3.7077846616391965e-07, "loss": 0.4206, "num_input_tokens_seen": 172561760, "step": 141900 }, { "epoch": 17.780353339180554, "grad_norm": 8.05152702331543, "learning_rate": 3.705718878505549e-07, "loss": 0.4159, "num_input_tokens_seen": 172567648, "step": 141905 }, { "epoch": 17.780979827089336, "grad_norm": 4.659300804138184, "learning_rate": 3.7036536488589457e-07, "loss": 0.4345, "num_input_tokens_seen": 172572928, "step": 141910 }, { "epoch": 17.781606314998122, "grad_norm": 8.913554191589355, "learning_rate": 3.701588972724096e-07, "loss": 0.4939, "num_input_tokens_seen": 172579360, "step": 141915 }, { "epoch": 17.782232802906904, "grad_norm": 13.348954200744629, "learning_rate": 3.6995248501256686e-07, "loss": 0.4415, "num_input_tokens_seen": 172584832, "step": 141920 }, { "epoch": 17.782859290815686, "grad_norm": 5.704085826873779, "learning_rate": 3.697461281088355e-07, "loss": 0.4199, "num_input_tokens_seen": 172591232, "step": 141925 }, { "epoch": 17.783485778724472, "grad_norm": 6.685460567474365, "learning_rate": 3.695398265636813e-07, "loss": 0.4119, "num_input_tokens_seen": 172597536, "step": 141930 }, { "epoch": 17.784112266633255, "grad_norm": 21.476716995239258, "learning_rate": 3.6933358037957225e-07, "loss": 0.4553, "num_input_tokens_seen": 172603680, "step": 141935 }, { "epoch": 17.784738754542037, "grad_norm": 3.9447596073150635, "learning_rate": 3.691273895589731e-07, "loss": 0.4432, "num_input_tokens_seen": 172609600, "step": 141940 }, { "epoch": 17.78536524245082, "grad_norm": 4.22993278503418, "learning_rate": 3.689212541043491e-07, "loss": 0.447, "num_input_tokens_seen": 172615616, "step": 141945 }, { "epoch": 17.785991730359605, "grad_norm": 24.986631393432617, "learning_rate": 3.6871517401816494e-07, "loss": 0.415, "num_input_tokens_seen": 172621568, "step": 141950 }, { "epoch": 17.786618218268387, "grad_norm": 13.212543487548828, "learning_rate": 3.6850914930288585e-07, "loss": 0.4195, "num_input_tokens_seen": 172627840, "step": 141955 }, { "epoch": 17.78724470617717, "grad_norm": 9.022135734558105, "learning_rate": 3.6830317996097267e-07, "loss": 0.4654, "num_input_tokens_seen": 172633504, "step": 141960 }, { "epoch": 17.787871194085955, "grad_norm": 4.638226509094238, "learning_rate": 3.6809726599488895e-07, "loss": 0.4161, "num_input_tokens_seen": 172639584, "step": 141965 }, { "epoch": 17.788497681994738, "grad_norm": 13.273797988891602, "learning_rate": 3.678914074070977e-07, "loss": 0.4369, "num_input_tokens_seen": 172645888, "step": 141970 }, { "epoch": 17.78912416990352, "grad_norm": 5.8082756996154785, "learning_rate": 3.676856042000587e-07, "loss": 0.457, "num_input_tokens_seen": 172651968, "step": 141975 }, { "epoch": 17.789750657812306, "grad_norm": 4.4535112380981445, "learning_rate": 3.674798563762333e-07, "loss": 0.4469, "num_input_tokens_seen": 172657792, "step": 141980 }, { "epoch": 17.790377145721088, "grad_norm": 5.300412654876709, "learning_rate": 3.6727416393808056e-07, "loss": 0.4244, "num_input_tokens_seen": 172663904, "step": 141985 }, { "epoch": 17.79100363362987, "grad_norm": 11.114782333374023, "learning_rate": 3.670685268880608e-07, "loss": 0.454, "num_input_tokens_seen": 172670144, "step": 141990 }, { "epoch": 17.791630121538653, "grad_norm": 1.8539730310440063, "learning_rate": 3.6686294522863095e-07, "loss": 0.4626, "num_input_tokens_seen": 172675936, "step": 141995 }, { "epoch": 17.79225660944744, "grad_norm": 5.310004234313965, "learning_rate": 3.6665741896225127e-07, "loss": 0.4464, "num_input_tokens_seen": 172681952, "step": 142000 }, { "epoch": 17.79288309735622, "grad_norm": 5.649998188018799, "learning_rate": 3.664519480913764e-07, "loss": 0.4248, "num_input_tokens_seen": 172687392, "step": 142005 }, { "epoch": 17.793509585265003, "grad_norm": 4.631661415100098, "learning_rate": 3.66246532618465e-07, "loss": 0.459, "num_input_tokens_seen": 172693056, "step": 142010 }, { "epoch": 17.79413607317379, "grad_norm": 33.45915985107422, "learning_rate": 3.6604117254597116e-07, "loss": 0.4922, "num_input_tokens_seen": 172699360, "step": 142015 }, { "epoch": 17.79476256108257, "grad_norm": 6.025306224822998, "learning_rate": 3.658358678763518e-07, "loss": 0.4121, "num_input_tokens_seen": 172705376, "step": 142020 }, { "epoch": 17.795389048991353, "grad_norm": 10.960033416748047, "learning_rate": 3.656306186120612e-07, "loss": 0.4608, "num_input_tokens_seen": 172711584, "step": 142025 }, { "epoch": 17.79601553690014, "grad_norm": 13.588727951049805, "learning_rate": 3.654254247555522e-07, "loss": 0.4444, "num_input_tokens_seen": 172717728, "step": 142030 }, { "epoch": 17.79664202480892, "grad_norm": 5.152683734893799, "learning_rate": 3.6522028630927906e-07, "loss": 0.525, "num_input_tokens_seen": 172723680, "step": 142035 }, { "epoch": 17.797268512717704, "grad_norm": 7.581774711608887, "learning_rate": 3.6501520327569375e-07, "loss": 0.4405, "num_input_tokens_seen": 172730016, "step": 142040 }, { "epoch": 17.79789500062649, "grad_norm": 5.658674240112305, "learning_rate": 3.6481017565724976e-07, "loss": 0.476, "num_input_tokens_seen": 172736224, "step": 142045 }, { "epoch": 17.798521488535272, "grad_norm": 4.770010471343994, "learning_rate": 3.646052034563957e-07, "loss": 0.4015, "num_input_tokens_seen": 172742400, "step": 142050 }, { "epoch": 17.799147976444054, "grad_norm": 11.472542762756348, "learning_rate": 3.6440028667558415e-07, "loss": 0.4439, "num_input_tokens_seen": 172748352, "step": 142055 }, { "epoch": 17.799774464352836, "grad_norm": 10.406352996826172, "learning_rate": 3.6419542531726415e-07, "loss": 0.4319, "num_input_tokens_seen": 172754656, "step": 142060 }, { "epoch": 17.800400952261622, "grad_norm": 3.7950403690338135, "learning_rate": 3.639906193838866e-07, "loss": 0.4311, "num_input_tokens_seen": 172760800, "step": 142065 }, { "epoch": 17.801027440170405, "grad_norm": 22.684717178344727, "learning_rate": 3.637858688778978e-07, "loss": 0.5372, "num_input_tokens_seen": 172767008, "step": 142070 }, { "epoch": 17.801653928079187, "grad_norm": 13.225932121276855, "learning_rate": 3.635811738017481e-07, "loss": 0.4399, "num_input_tokens_seen": 172773344, "step": 142075 }, { "epoch": 17.802280415987973, "grad_norm": 8.731736183166504, "learning_rate": 3.633765341578821e-07, "loss": 0.4286, "num_input_tokens_seen": 172779648, "step": 142080 }, { "epoch": 17.802906903896755, "grad_norm": 4.888347625732422, "learning_rate": 3.631719499487485e-07, "loss": 0.4832, "num_input_tokens_seen": 172785472, "step": 142085 }, { "epoch": 17.803533391805537, "grad_norm": 4.626302719116211, "learning_rate": 3.629674211767931e-07, "loss": 0.4424, "num_input_tokens_seen": 172791264, "step": 142090 }, { "epoch": 17.804159879714323, "grad_norm": 7.649410724639893, "learning_rate": 3.6276294784446e-07, "loss": 0.4429, "num_input_tokens_seen": 172797536, "step": 142095 }, { "epoch": 17.804786367623105, "grad_norm": 10.414896965026855, "learning_rate": 3.625585299541956e-07, "loss": 0.491, "num_input_tokens_seen": 172803584, "step": 142100 }, { "epoch": 17.805412855531888, "grad_norm": 3.617194414138794, "learning_rate": 3.623541675084419e-07, "loss": 0.4869, "num_input_tokens_seen": 172809696, "step": 142105 }, { "epoch": 17.806039343440673, "grad_norm": 7.770703315734863, "learning_rate": 3.621498605096446e-07, "loss": 0.489, "num_input_tokens_seen": 172815840, "step": 142110 }, { "epoch": 17.806665831349456, "grad_norm": 7.032366752624512, "learning_rate": 3.6194560896024354e-07, "loss": 0.4713, "num_input_tokens_seen": 172821792, "step": 142115 }, { "epoch": 17.807292319258238, "grad_norm": 7.498884677886963, "learning_rate": 3.6174141286268336e-07, "loss": 0.4029, "num_input_tokens_seen": 172827968, "step": 142120 }, { "epoch": 17.80791880716702, "grad_norm": 4.442911624908447, "learning_rate": 3.615372722194033e-07, "loss": 0.423, "num_input_tokens_seen": 172834176, "step": 142125 }, { "epoch": 17.808545295075806, "grad_norm": 7.973198413848877, "learning_rate": 3.613331870328457e-07, "loss": 0.4303, "num_input_tokens_seen": 172840352, "step": 142130 }, { "epoch": 17.80917178298459, "grad_norm": 4.528838634490967, "learning_rate": 3.611291573054493e-07, "loss": 0.459, "num_input_tokens_seen": 172846016, "step": 142135 }, { "epoch": 17.80979827089337, "grad_norm": 13.636750221252441, "learning_rate": 3.609251830396543e-07, "loss": 0.3949, "num_input_tokens_seen": 172851936, "step": 142140 }, { "epoch": 17.810424758802156, "grad_norm": 9.98261547088623, "learning_rate": 3.607212642378988e-07, "loss": 0.4214, "num_input_tokens_seen": 172858144, "step": 142145 }, { "epoch": 17.81105124671094, "grad_norm": 5.057412624359131, "learning_rate": 3.6051740090262075e-07, "loss": 0.4376, "num_input_tokens_seen": 172864320, "step": 142150 }, { "epoch": 17.81167773461972, "grad_norm": 20.058393478393555, "learning_rate": 3.603135930362578e-07, "loss": 0.4349, "num_input_tokens_seen": 172870400, "step": 142155 }, { "epoch": 17.812304222528507, "grad_norm": 7.096052169799805, "learning_rate": 3.601098406412473e-07, "loss": 0.4397, "num_input_tokens_seen": 172876608, "step": 142160 }, { "epoch": 17.81293071043729, "grad_norm": 3.9075822830200195, "learning_rate": 3.599061437200241e-07, "loss": 0.3991, "num_input_tokens_seen": 172882208, "step": 142165 }, { "epoch": 17.81355719834607, "grad_norm": 14.118297576904297, "learning_rate": 3.5970250227502445e-07, "loss": 0.4819, "num_input_tokens_seen": 172888192, "step": 142170 }, { "epoch": 17.814183686254854, "grad_norm": 5.079843044281006, "learning_rate": 3.5949891630868315e-07, "loss": 0.4032, "num_input_tokens_seen": 172893984, "step": 142175 }, { "epoch": 17.81481017416364, "grad_norm": 17.982887268066406, "learning_rate": 3.5929538582343327e-07, "loss": 0.4446, "num_input_tokens_seen": 172899840, "step": 142180 }, { "epoch": 17.815436662072422, "grad_norm": 9.214118003845215, "learning_rate": 3.590919108217095e-07, "loss": 0.5042, "num_input_tokens_seen": 172905728, "step": 142185 }, { "epoch": 17.816063149981204, "grad_norm": 3.8359603881835938, "learning_rate": 3.588884913059426e-07, "loss": 0.412, "num_input_tokens_seen": 172912032, "step": 142190 }, { "epoch": 17.81668963788999, "grad_norm": 5.816701889038086, "learning_rate": 3.5868512727856685e-07, "loss": 0.4643, "num_input_tokens_seen": 172917824, "step": 142195 }, { "epoch": 17.817316125798772, "grad_norm": 5.9734320640563965, "learning_rate": 3.584818187420119e-07, "loss": 0.4376, "num_input_tokens_seen": 172923840, "step": 142200 }, { "epoch": 17.817942613707555, "grad_norm": 11.748173713684082, "learning_rate": 3.582785656987098e-07, "loss": 0.4512, "num_input_tokens_seen": 172930080, "step": 142205 }, { "epoch": 17.81856910161634, "grad_norm": 14.514670372009277, "learning_rate": 3.580753681510896e-07, "loss": 0.487, "num_input_tokens_seen": 172935872, "step": 142210 }, { "epoch": 17.819195589525123, "grad_norm": 6.750591278076172, "learning_rate": 3.5787222610158047e-07, "loss": 0.4345, "num_input_tokens_seen": 172941920, "step": 142215 }, { "epoch": 17.819822077433905, "grad_norm": 4.472956657409668, "learning_rate": 3.576691395526133e-07, "loss": 0.4548, "num_input_tokens_seen": 172948448, "step": 142220 }, { "epoch": 17.820448565342687, "grad_norm": 5.633545398712158, "learning_rate": 3.574661085066133e-07, "loss": 0.4467, "num_input_tokens_seen": 172954560, "step": 142225 }, { "epoch": 17.821075053251473, "grad_norm": 9.941995620727539, "learning_rate": 3.5726313296601036e-07, "loss": 0.4194, "num_input_tokens_seen": 172960864, "step": 142230 }, { "epoch": 17.821701541160255, "grad_norm": 5.070372104644775, "learning_rate": 3.57060212933229e-07, "loss": 0.4403, "num_input_tokens_seen": 172967072, "step": 142235 }, { "epoch": 17.822328029069038, "grad_norm": 3.4241466522216797, "learning_rate": 3.5685734841069684e-07, "loss": 0.4748, "num_input_tokens_seen": 172972800, "step": 142240 }, { "epoch": 17.822954516977823, "grad_norm": 19.930984497070312, "learning_rate": 3.566545394008386e-07, "loss": 0.4462, "num_input_tokens_seen": 172979104, "step": 142245 }, { "epoch": 17.823581004886606, "grad_norm": 6.404172420501709, "learning_rate": 3.56451785906079e-07, "loss": 0.411, "num_input_tokens_seen": 172985472, "step": 142250 }, { "epoch": 17.824207492795388, "grad_norm": 5.395769119262695, "learning_rate": 3.5624908792884325e-07, "loss": 0.4453, "num_input_tokens_seen": 172991680, "step": 142255 }, { "epoch": 17.824833980704174, "grad_norm": 6.3276519775390625, "learning_rate": 3.5604644547155286e-07, "loss": 0.3833, "num_input_tokens_seen": 172997856, "step": 142260 }, { "epoch": 17.825460468612956, "grad_norm": 24.754526138305664, "learning_rate": 3.5584385853663196e-07, "loss": 0.4885, "num_input_tokens_seen": 173004192, "step": 142265 }, { "epoch": 17.82608695652174, "grad_norm": 17.93453598022461, "learning_rate": 3.5564132712650243e-07, "loss": 0.4773, "num_input_tokens_seen": 173010208, "step": 142270 }, { "epoch": 17.826713444430524, "grad_norm": 5.4249091148376465, "learning_rate": 3.554388512435858e-07, "loss": 0.4299, "num_input_tokens_seen": 173016000, "step": 142275 }, { "epoch": 17.827339932339306, "grad_norm": 5.763720512390137, "learning_rate": 3.5523643089030227e-07, "loss": 0.4335, "num_input_tokens_seen": 173021728, "step": 142280 }, { "epoch": 17.82796642024809, "grad_norm": 3.947092056274414, "learning_rate": 3.550340660690732e-07, "loss": 0.4005, "num_input_tokens_seen": 173026912, "step": 142285 }, { "epoch": 17.82859290815687, "grad_norm": 3.955519676208496, "learning_rate": 3.5483175678231616e-07, "loss": 0.4307, "num_input_tokens_seen": 173033088, "step": 142290 }, { "epoch": 17.829219396065657, "grad_norm": 4.570888996124268, "learning_rate": 3.5462950303245146e-07, "loss": 0.4315, "num_input_tokens_seen": 173039168, "step": 142295 }, { "epoch": 17.82984588397444, "grad_norm": 6.07878303527832, "learning_rate": 3.544273048218966e-07, "loss": 0.4411, "num_input_tokens_seen": 173045280, "step": 142300 }, { "epoch": 17.83047237188322, "grad_norm": 12.929259300231934, "learning_rate": 3.54225162153069e-07, "loss": 0.4606, "num_input_tokens_seen": 173051680, "step": 142305 }, { "epoch": 17.831098859792007, "grad_norm": 5.045856952667236, "learning_rate": 3.5402307502838516e-07, "loss": 0.4494, "num_input_tokens_seen": 173057888, "step": 142310 }, { "epoch": 17.83172534770079, "grad_norm": 3.1526718139648438, "learning_rate": 3.538210434502626e-07, "loss": 0.5341, "num_input_tokens_seen": 173063904, "step": 142315 }, { "epoch": 17.832351835609572, "grad_norm": 18.72180938720703, "learning_rate": 3.5361906742111483e-07, "loss": 0.4879, "num_input_tokens_seen": 173070112, "step": 142320 }, { "epoch": 17.832978323518358, "grad_norm": 16.3681697845459, "learning_rate": 3.534171469433584e-07, "loss": 0.4902, "num_input_tokens_seen": 173076544, "step": 142325 }, { "epoch": 17.83360481142714, "grad_norm": 3.6685967445373535, "learning_rate": 3.532152820194057e-07, "loss": 0.4414, "num_input_tokens_seen": 173082848, "step": 142330 }, { "epoch": 17.834231299335922, "grad_norm": 5.193741798400879, "learning_rate": 3.5301347265167094e-07, "loss": 0.4698, "num_input_tokens_seen": 173089280, "step": 142335 }, { "epoch": 17.834857787244704, "grad_norm": 16.398075103759766, "learning_rate": 3.528117188425684e-07, "loss": 0.4335, "num_input_tokens_seen": 173095616, "step": 142340 }, { "epoch": 17.83548427515349, "grad_norm": 19.05569839477539, "learning_rate": 3.5261002059450765e-07, "loss": 0.4475, "num_input_tokens_seen": 173101984, "step": 142345 }, { "epoch": 17.836110763062273, "grad_norm": 13.194687843322754, "learning_rate": 3.524083779099019e-07, "loss": 0.5753, "num_input_tokens_seen": 173108256, "step": 142350 }, { "epoch": 17.836737250971055, "grad_norm": 10.226728439331055, "learning_rate": 3.522067907911614e-07, "loss": 0.5587, "num_input_tokens_seen": 173114656, "step": 142355 }, { "epoch": 17.83736373887984, "grad_norm": 7.200260639190674, "learning_rate": 3.5200525924069584e-07, "loss": 0.446, "num_input_tokens_seen": 173120960, "step": 142360 }, { "epoch": 17.837990226788623, "grad_norm": 10.291390419006348, "learning_rate": 3.5180378326091614e-07, "loss": 0.5225, "num_input_tokens_seen": 173127072, "step": 142365 }, { "epoch": 17.838616714697405, "grad_norm": 4.644790172576904, "learning_rate": 3.5160236285422975e-07, "loss": 0.4372, "num_input_tokens_seen": 173133312, "step": 142370 }, { "epoch": 17.83924320260619, "grad_norm": 12.839056015014648, "learning_rate": 3.5140099802304486e-07, "loss": 0.4139, "num_input_tokens_seen": 173139392, "step": 142375 }, { "epoch": 17.839869690514973, "grad_norm": 6.771136283874512, "learning_rate": 3.511996887697705e-07, "loss": 0.4285, "num_input_tokens_seen": 173145632, "step": 142380 }, { "epoch": 17.840496178423756, "grad_norm": 16.039731979370117, "learning_rate": 3.509984350968115e-07, "loss": 0.4931, "num_input_tokens_seen": 173152128, "step": 142385 }, { "epoch": 17.841122666332538, "grad_norm": 15.008264541625977, "learning_rate": 3.5079723700657544e-07, "loss": 0.4645, "num_input_tokens_seen": 173158368, "step": 142390 }, { "epoch": 17.841749154241324, "grad_norm": 7.644927501678467, "learning_rate": 3.5059609450146637e-07, "loss": 0.4812, "num_input_tokens_seen": 173164544, "step": 142395 }, { "epoch": 17.842375642150106, "grad_norm": 14.037610054016113, "learning_rate": 3.503950075838902e-07, "loss": 0.4293, "num_input_tokens_seen": 173170848, "step": 142400 }, { "epoch": 17.84300213005889, "grad_norm": 15.330891609191895, "learning_rate": 3.501939762562517e-07, "loss": 0.4248, "num_input_tokens_seen": 173177152, "step": 142405 }, { "epoch": 17.843628617967674, "grad_norm": 20.461576461791992, "learning_rate": 3.4999300052095285e-07, "loss": 0.5198, "num_input_tokens_seen": 173183264, "step": 142410 }, { "epoch": 17.844255105876456, "grad_norm": 5.564640998840332, "learning_rate": 3.497920803803978e-07, "loss": 0.4771, "num_input_tokens_seen": 173188992, "step": 142415 }, { "epoch": 17.84488159378524, "grad_norm": 11.601400375366211, "learning_rate": 3.495912158369874e-07, "loss": 0.5297, "num_input_tokens_seen": 173195040, "step": 142420 }, { "epoch": 17.845508081694025, "grad_norm": 3.7266132831573486, "learning_rate": 3.4939040689312475e-07, "loss": 0.4042, "num_input_tokens_seen": 173200448, "step": 142425 }, { "epoch": 17.846134569602807, "grad_norm": 19.239517211914062, "learning_rate": 3.4918965355120905e-07, "loss": 0.4767, "num_input_tokens_seen": 173206496, "step": 142430 }, { "epoch": 17.84676105751159, "grad_norm": 17.8883113861084, "learning_rate": 3.4898895581364166e-07, "loss": 0.5067, "num_input_tokens_seen": 173213024, "step": 142435 }, { "epoch": 17.847387545420375, "grad_norm": 5.7427287101745605, "learning_rate": 3.487883136828213e-07, "loss": 0.4321, "num_input_tokens_seen": 173218944, "step": 142440 }, { "epoch": 17.848014033329157, "grad_norm": 15.80964183807373, "learning_rate": 3.4858772716114763e-07, "loss": 0.4414, "num_input_tokens_seen": 173225216, "step": 142445 }, { "epoch": 17.84864052123794, "grad_norm": 8.316611289978027, "learning_rate": 3.483871962510177e-07, "loss": 0.5861, "num_input_tokens_seen": 173230944, "step": 142450 }, { "epoch": 17.849267009146722, "grad_norm": 4.71290397644043, "learning_rate": 3.4818672095482955e-07, "loss": 0.4174, "num_input_tokens_seen": 173237408, "step": 142455 }, { "epoch": 17.849893497055508, "grad_norm": 18.87396240234375, "learning_rate": 3.479863012749812e-07, "loss": 0.5043, "num_input_tokens_seen": 173243424, "step": 142460 }, { "epoch": 17.85051998496429, "grad_norm": 6.399981498718262, "learning_rate": 3.47785937213867e-07, "loss": 0.3981, "num_input_tokens_seen": 173249600, "step": 142465 }, { "epoch": 17.851146472873072, "grad_norm": 5.173951148986816, "learning_rate": 3.4758562877388326e-07, "loss": 0.4677, "num_input_tokens_seen": 173255488, "step": 142470 }, { "epoch": 17.851772960781858, "grad_norm": 5.004727363586426, "learning_rate": 3.4738537595742473e-07, "loss": 0.5448, "num_input_tokens_seen": 173261760, "step": 142475 }, { "epoch": 17.85239944869064, "grad_norm": 20.414730072021484, "learning_rate": 3.4718517876688675e-07, "loss": 0.4772, "num_input_tokens_seen": 173267520, "step": 142480 }, { "epoch": 17.853025936599423, "grad_norm": 6.421382904052734, "learning_rate": 3.469850372046607e-07, "loss": 0.4466, "num_input_tokens_seen": 173273632, "step": 142485 }, { "epoch": 17.85365242450821, "grad_norm": 8.14590835571289, "learning_rate": 3.467849512731419e-07, "loss": 0.443, "num_input_tokens_seen": 173279296, "step": 142490 }, { "epoch": 17.85427891241699, "grad_norm": 6.794290065765381, "learning_rate": 3.4658492097471965e-07, "loss": 0.459, "num_input_tokens_seen": 173285632, "step": 142495 }, { "epoch": 17.854905400325773, "grad_norm": 9.326462745666504, "learning_rate": 3.4638494631178854e-07, "loss": 0.4192, "num_input_tokens_seen": 173292000, "step": 142500 }, { "epoch": 17.85553188823456, "grad_norm": 4.331320285797119, "learning_rate": 3.4618502728673676e-07, "loss": 0.4274, "num_input_tokens_seen": 173298016, "step": 142505 }, { "epoch": 17.85615837614334, "grad_norm": 3.3871817588806152, "learning_rate": 3.459851639019568e-07, "loss": 0.38, "num_input_tokens_seen": 173304224, "step": 142510 }, { "epoch": 17.856784864052123, "grad_norm": 42.843692779541016, "learning_rate": 3.457853561598362e-07, "loss": 0.4706, "num_input_tokens_seen": 173310688, "step": 142515 }, { "epoch": 17.857411351960906, "grad_norm": 3.227388858795166, "learning_rate": 3.455856040627642e-07, "loss": 0.44, "num_input_tokens_seen": 173317056, "step": 142520 }, { "epoch": 17.85803783986969, "grad_norm": 3.1666765213012695, "learning_rate": 3.4538590761313107e-07, "loss": 0.4369, "num_input_tokens_seen": 173323040, "step": 142525 }, { "epoch": 17.858664327778474, "grad_norm": 7.712306976318359, "learning_rate": 3.4518626681332156e-07, "loss": 0.3888, "num_input_tokens_seen": 173329024, "step": 142530 }, { "epoch": 17.859290815687256, "grad_norm": 10.968762397766113, "learning_rate": 3.449866816657249e-07, "loss": 0.4892, "num_input_tokens_seen": 173335104, "step": 142535 }, { "epoch": 17.859917303596042, "grad_norm": 5.023439884185791, "learning_rate": 3.447871521727253e-07, "loss": 0.5036, "num_input_tokens_seen": 173341280, "step": 142540 }, { "epoch": 17.860543791504824, "grad_norm": 34.80742645263672, "learning_rate": 3.445876783367097e-07, "loss": 0.6085, "num_input_tokens_seen": 173347456, "step": 142545 }, { "epoch": 17.861170279413606, "grad_norm": 15.764578819274902, "learning_rate": 3.4438826016006177e-07, "loss": 0.4597, "num_input_tokens_seen": 173353536, "step": 142550 }, { "epoch": 17.861796767322392, "grad_norm": 10.87802505493164, "learning_rate": 3.441888976451674e-07, "loss": 0.4893, "num_input_tokens_seen": 173359232, "step": 142555 }, { "epoch": 17.862423255231175, "grad_norm": 8.829715728759766, "learning_rate": 3.4398959079440795e-07, "loss": 0.4741, "num_input_tokens_seen": 173365664, "step": 142560 }, { "epoch": 17.863049743139957, "grad_norm": 9.888959884643555, "learning_rate": 3.437903396101677e-07, "loss": 0.408, "num_input_tokens_seen": 173371616, "step": 142565 }, { "epoch": 17.86367623104874, "grad_norm": 9.066771507263184, "learning_rate": 3.435911440948286e-07, "loss": 0.4271, "num_input_tokens_seen": 173378016, "step": 142570 }, { "epoch": 17.864302718957525, "grad_norm": 4.380942344665527, "learning_rate": 3.4339200425077323e-07, "loss": 0.4206, "num_input_tokens_seen": 173384480, "step": 142575 }, { "epoch": 17.864929206866307, "grad_norm": 5.57657527923584, "learning_rate": 3.431929200803807e-07, "loss": 0.4186, "num_input_tokens_seen": 173390976, "step": 142580 }, { "epoch": 17.86555569477509, "grad_norm": 6.499839782714844, "learning_rate": 3.4299389158603204e-07, "loss": 0.4361, "num_input_tokens_seen": 173396992, "step": 142585 }, { "epoch": 17.866182182683875, "grad_norm": 5.634348392486572, "learning_rate": 3.4279491877010797e-07, "loss": 0.4311, "num_input_tokens_seen": 173402528, "step": 142590 }, { "epoch": 17.866808670592658, "grad_norm": 5.0615715980529785, "learning_rate": 3.42596001634985e-07, "loss": 0.4231, "num_input_tokens_seen": 173408896, "step": 142595 }, { "epoch": 17.86743515850144, "grad_norm": 9.826844215393066, "learning_rate": 3.423971401830434e-07, "loss": 0.4081, "num_input_tokens_seen": 173414336, "step": 142600 }, { "epoch": 17.868061646410226, "grad_norm": 25.061779022216797, "learning_rate": 3.421983344166591e-07, "loss": 0.4422, "num_input_tokens_seen": 173420416, "step": 142605 }, { "epoch": 17.868688134319008, "grad_norm": 13.576586723327637, "learning_rate": 3.4199958433821014e-07, "loss": 0.4554, "num_input_tokens_seen": 173426784, "step": 142610 }, { "epoch": 17.86931462222779, "grad_norm": 15.205883979797363, "learning_rate": 3.4180088995007186e-07, "loss": 0.5544, "num_input_tokens_seen": 173432736, "step": 142615 }, { "epoch": 17.869941110136573, "grad_norm": 3.9934279918670654, "learning_rate": 3.4160225125462133e-07, "loss": 0.4361, "num_input_tokens_seen": 173437920, "step": 142620 }, { "epoch": 17.87056759804536, "grad_norm": 5.1775336265563965, "learning_rate": 3.4140366825423153e-07, "loss": 0.4264, "num_input_tokens_seen": 173444352, "step": 142625 }, { "epoch": 17.87119408595414, "grad_norm": 8.771180152893066, "learning_rate": 3.412051409512784e-07, "loss": 0.4389, "num_input_tokens_seen": 173450464, "step": 142630 }, { "epoch": 17.871820573862923, "grad_norm": 5.817814350128174, "learning_rate": 3.41006669348134e-07, "loss": 0.4033, "num_input_tokens_seen": 173456448, "step": 142635 }, { "epoch": 17.87244706177171, "grad_norm": 27.04110336303711, "learning_rate": 3.4080825344717184e-07, "loss": 0.4602, "num_input_tokens_seen": 173462560, "step": 142640 }, { "epoch": 17.87307354968049, "grad_norm": 15.618110656738281, "learning_rate": 3.406098932507651e-07, "loss": 0.4435, "num_input_tokens_seen": 173468800, "step": 142645 }, { "epoch": 17.873700037589273, "grad_norm": 3.4576549530029297, "learning_rate": 3.404115887612841e-07, "loss": 0.4789, "num_input_tokens_seen": 173474944, "step": 142650 }, { "epoch": 17.87432652549806, "grad_norm": 3.7283966541290283, "learning_rate": 3.402133399811003e-07, "loss": 0.4614, "num_input_tokens_seen": 173480704, "step": 142655 }, { "epoch": 17.87495301340684, "grad_norm": 5.290258884429932, "learning_rate": 3.400151469125834e-07, "loss": 0.4511, "num_input_tokens_seen": 173486848, "step": 142660 }, { "epoch": 17.875579501315624, "grad_norm": 4.3307414054870605, "learning_rate": 3.398170095581038e-07, "loss": 0.459, "num_input_tokens_seen": 173492896, "step": 142665 }, { "epoch": 17.87620598922441, "grad_norm": 14.548287391662598, "learning_rate": 3.39618927920029e-07, "loss": 0.4366, "num_input_tokens_seen": 173499072, "step": 142670 }, { "epoch": 17.876832477133192, "grad_norm": 5.90899658203125, "learning_rate": 3.394209020007283e-07, "loss": 0.4509, "num_input_tokens_seen": 173504928, "step": 142675 }, { "epoch": 17.877458965041974, "grad_norm": 4.453490257263184, "learning_rate": 3.3922293180256915e-07, "loss": 0.43, "num_input_tokens_seen": 173511072, "step": 142680 }, { "epoch": 17.878085452950756, "grad_norm": 3.029139995574951, "learning_rate": 3.3902501732791913e-07, "loss": 0.4311, "num_input_tokens_seen": 173517280, "step": 142685 }, { "epoch": 17.878711940859542, "grad_norm": 6.287151336669922, "learning_rate": 3.3882715857914304e-07, "loss": 0.4832, "num_input_tokens_seen": 173523072, "step": 142690 }, { "epoch": 17.879338428768325, "grad_norm": 11.388986587524414, "learning_rate": 3.3862935555860843e-07, "loss": 0.4968, "num_input_tokens_seen": 173529088, "step": 142695 }, { "epoch": 17.879964916677107, "grad_norm": 3.690579414367676, "learning_rate": 3.3843160826867727e-07, "loss": 0.4725, "num_input_tokens_seen": 173534688, "step": 142700 }, { "epoch": 17.880591404585893, "grad_norm": 4.374353408813477, "learning_rate": 3.38233916711716e-07, "loss": 0.4692, "num_input_tokens_seen": 173540960, "step": 142705 }, { "epoch": 17.881217892494675, "grad_norm": 20.55378532409668, "learning_rate": 3.3803628089008835e-07, "loss": 0.4714, "num_input_tokens_seen": 173547136, "step": 142710 }, { "epoch": 17.881844380403457, "grad_norm": 5.549751281738281, "learning_rate": 3.378387008061557e-07, "loss": 0.4128, "num_input_tokens_seen": 173553056, "step": 142715 }, { "epoch": 17.882470868312243, "grad_norm": 4.515749454498291, "learning_rate": 3.3764117646228224e-07, "loss": 0.4438, "num_input_tokens_seen": 173559040, "step": 142720 }, { "epoch": 17.883097356221025, "grad_norm": 8.71551513671875, "learning_rate": 3.3744370786082723e-07, "loss": 0.4596, "num_input_tokens_seen": 173565344, "step": 142725 }, { "epoch": 17.883723844129808, "grad_norm": 14.035361289978027, "learning_rate": 3.372462950041533e-07, "loss": 0.4984, "num_input_tokens_seen": 173570944, "step": 142730 }, { "epoch": 17.884350332038593, "grad_norm": 4.210746765136719, "learning_rate": 3.3704893789461955e-07, "loss": 0.4171, "num_input_tokens_seen": 173577152, "step": 142735 }, { "epoch": 17.884976819947376, "grad_norm": 4.510561466217041, "learning_rate": 3.3685163653458754e-07, "loss": 0.5322, "num_input_tokens_seen": 173582944, "step": 142740 }, { "epoch": 17.885603307856158, "grad_norm": 6.597057342529297, "learning_rate": 3.3665439092641304e-07, "loss": 0.5317, "num_input_tokens_seen": 173588992, "step": 142745 }, { "epoch": 17.88622979576494, "grad_norm": 16.557781219482422, "learning_rate": 3.3645720107245763e-07, "loss": 0.4934, "num_input_tokens_seen": 173595200, "step": 142750 }, { "epoch": 17.886856283673726, "grad_norm": 25.580659866333008, "learning_rate": 3.3626006697507595e-07, "loss": 0.423, "num_input_tokens_seen": 173601312, "step": 142755 }, { "epoch": 17.88748277158251, "grad_norm": 5.236385822296143, "learning_rate": 3.3606298863662736e-07, "loss": 0.4151, "num_input_tokens_seen": 173607680, "step": 142760 }, { "epoch": 17.88810925949129, "grad_norm": 15.438558578491211, "learning_rate": 3.35865966059466e-07, "loss": 0.4757, "num_input_tokens_seen": 173613920, "step": 142765 }, { "epoch": 17.888735747400077, "grad_norm": 4.734508037567139, "learning_rate": 3.356689992459483e-07, "loss": 0.4198, "num_input_tokens_seen": 173620096, "step": 142770 }, { "epoch": 17.88936223530886, "grad_norm": 4.365057468414307, "learning_rate": 3.3547208819842914e-07, "loss": 0.485, "num_input_tokens_seen": 173626336, "step": 142775 }, { "epoch": 17.88998872321764, "grad_norm": 10.644278526306152, "learning_rate": 3.3527523291926435e-07, "loss": 0.4412, "num_input_tokens_seen": 173632448, "step": 142780 }, { "epoch": 17.890615211126427, "grad_norm": 14.474011421203613, "learning_rate": 3.350784334108048e-07, "loss": 0.4451, "num_input_tokens_seen": 173638304, "step": 142785 }, { "epoch": 17.89124169903521, "grad_norm": 3.934746742248535, "learning_rate": 3.3488168967540425e-07, "loss": 0.4852, "num_input_tokens_seen": 173644480, "step": 142790 }, { "epoch": 17.89186818694399, "grad_norm": 20.07784652709961, "learning_rate": 3.3468500171541685e-07, "loss": 0.4878, "num_input_tokens_seen": 173650240, "step": 142795 }, { "epoch": 17.892494674852774, "grad_norm": 5.117671012878418, "learning_rate": 3.344883695331913e-07, "loss": 0.4018, "num_input_tokens_seen": 173656672, "step": 142800 }, { "epoch": 17.89312116276156, "grad_norm": 8.06440258026123, "learning_rate": 3.3429179313108073e-07, "loss": 0.4401, "num_input_tokens_seen": 173662720, "step": 142805 }, { "epoch": 17.893747650670342, "grad_norm": 6.196957588195801, "learning_rate": 3.340952725114338e-07, "loss": 0.409, "num_input_tokens_seen": 173668992, "step": 142810 }, { "epoch": 17.894374138579124, "grad_norm": 5.799719333648682, "learning_rate": 3.3389880767660145e-07, "loss": 0.4515, "num_input_tokens_seen": 173675072, "step": 142815 }, { "epoch": 17.89500062648791, "grad_norm": 9.241617202758789, "learning_rate": 3.3370239862893117e-07, "loss": 0.4323, "num_input_tokens_seen": 173681248, "step": 142820 }, { "epoch": 17.895627114396692, "grad_norm": 7.109478950500488, "learning_rate": 3.3350604537077223e-07, "loss": 0.4707, "num_input_tokens_seen": 173687584, "step": 142825 }, { "epoch": 17.896253602305475, "grad_norm": 10.91523265838623, "learning_rate": 3.3330974790447226e-07, "loss": 0.614, "num_input_tokens_seen": 173692992, "step": 142830 }, { "epoch": 17.89688009021426, "grad_norm": 4.0047197341918945, "learning_rate": 3.3311350623237704e-07, "loss": 0.4604, "num_input_tokens_seen": 173699008, "step": 142835 }, { "epoch": 17.897506578123043, "grad_norm": 4.370335102081299, "learning_rate": 3.3291732035683424e-07, "loss": 0.4698, "num_input_tokens_seen": 173705248, "step": 142840 }, { "epoch": 17.898133066031825, "grad_norm": 4.378129005432129, "learning_rate": 3.3272119028018855e-07, "loss": 0.4873, "num_input_tokens_seen": 173711200, "step": 142845 }, { "epoch": 17.898759553940607, "grad_norm": 26.124469757080078, "learning_rate": 3.325251160047854e-07, "loss": 0.4519, "num_input_tokens_seen": 173717344, "step": 142850 }, { "epoch": 17.899386041849393, "grad_norm": 22.33896827697754, "learning_rate": 3.323290975329679e-07, "loss": 0.5352, "num_input_tokens_seen": 173723104, "step": 142855 }, { "epoch": 17.900012529758175, "grad_norm": 12.338074684143066, "learning_rate": 3.321331348670809e-07, "loss": 0.4451, "num_input_tokens_seen": 173729088, "step": 142860 }, { "epoch": 17.900639017666958, "grad_norm": 6.090047836303711, "learning_rate": 3.319372280094663e-07, "loss": 0.4392, "num_input_tokens_seen": 173735296, "step": 142865 }, { "epoch": 17.901265505575743, "grad_norm": 19.308698654174805, "learning_rate": 3.317413769624672e-07, "loss": 0.4495, "num_input_tokens_seen": 173741248, "step": 142870 }, { "epoch": 17.901891993484526, "grad_norm": 19.10480308532715, "learning_rate": 3.3154558172842524e-07, "loss": 0.4644, "num_input_tokens_seen": 173747488, "step": 142875 }, { "epoch": 17.902518481393308, "grad_norm": 9.760883331298828, "learning_rate": 3.3134984230968003e-07, "loss": 0.5108, "num_input_tokens_seen": 173753728, "step": 142880 }, { "epoch": 17.903144969302094, "grad_norm": 3.052985668182373, "learning_rate": 3.311541587085726e-07, "loss": 0.4105, "num_input_tokens_seen": 173759584, "step": 142885 }, { "epoch": 17.903771457210876, "grad_norm": 18.417985916137695, "learning_rate": 3.309585309274427e-07, "loss": 0.5029, "num_input_tokens_seen": 173765088, "step": 142890 }, { "epoch": 17.90439794511966, "grad_norm": 6.2324442863464355, "learning_rate": 3.307629589686301e-07, "loss": 0.4327, "num_input_tokens_seen": 173771328, "step": 142895 }, { "epoch": 17.905024433028444, "grad_norm": 22.173709869384766, "learning_rate": 3.305674428344707e-07, "loss": 0.5058, "num_input_tokens_seen": 173777376, "step": 142900 }, { "epoch": 17.905650920937227, "grad_norm": 5.00345516204834, "learning_rate": 3.303719825273044e-07, "loss": 0.3972, "num_input_tokens_seen": 173783840, "step": 142905 }, { "epoch": 17.90627740884601, "grad_norm": 25.93121910095215, "learning_rate": 3.3017657804946644e-07, "loss": 0.5251, "num_input_tokens_seen": 173790080, "step": 142910 }, { "epoch": 17.90690389675479, "grad_norm": 7.345402240753174, "learning_rate": 3.2998122940329394e-07, "loss": 0.4616, "num_input_tokens_seen": 173795680, "step": 142915 }, { "epoch": 17.907530384663577, "grad_norm": 9.463438034057617, "learning_rate": 3.2978593659112155e-07, "loss": 0.4088, "num_input_tokens_seen": 173801632, "step": 142920 }, { "epoch": 17.90815687257236, "grad_norm": 7.541561603546143, "learning_rate": 3.2959069961528587e-07, "loss": 0.4126, "num_input_tokens_seen": 173807744, "step": 142925 }, { "epoch": 17.90878336048114, "grad_norm": 4.506428241729736, "learning_rate": 3.2939551847811947e-07, "loss": 0.4537, "num_input_tokens_seen": 173813920, "step": 142930 }, { "epoch": 17.909409848389927, "grad_norm": 4.706231594085693, "learning_rate": 3.292003931819565e-07, "loss": 0.4194, "num_input_tokens_seen": 173819872, "step": 142935 }, { "epoch": 17.91003633629871, "grad_norm": 4.397332191467285, "learning_rate": 3.2900532372912965e-07, "loss": 0.4081, "num_input_tokens_seen": 173826016, "step": 142940 }, { "epoch": 17.910662824207492, "grad_norm": 21.019683837890625, "learning_rate": 3.2881031012197195e-07, "loss": 0.4802, "num_input_tokens_seen": 173831968, "step": 142945 }, { "epoch": 17.911289312116278, "grad_norm": 3.751865863800049, "learning_rate": 3.286153523628133e-07, "loss": 0.4211, "num_input_tokens_seen": 173838112, "step": 142950 }, { "epoch": 17.91191580002506, "grad_norm": 16.387453079223633, "learning_rate": 3.2842045045398564e-07, "loss": 0.5761, "num_input_tokens_seen": 173844256, "step": 142955 }, { "epoch": 17.912542287933842, "grad_norm": 7.078763484954834, "learning_rate": 3.2822560439782047e-07, "loss": 0.4507, "num_input_tokens_seen": 173850432, "step": 142960 }, { "epoch": 17.913168775842625, "grad_norm": 12.734492301940918, "learning_rate": 3.2803081419664483e-07, "loss": 0.4485, "num_input_tokens_seen": 173856960, "step": 142965 }, { "epoch": 17.91379526375141, "grad_norm": 15.324626922607422, "learning_rate": 3.2783607985278967e-07, "loss": 0.4433, "num_input_tokens_seen": 173863392, "step": 142970 }, { "epoch": 17.914421751660193, "grad_norm": 5.666295051574707, "learning_rate": 3.2764140136858135e-07, "loss": 0.4402, "num_input_tokens_seen": 173869088, "step": 142975 }, { "epoch": 17.915048239568975, "grad_norm": 3.2718725204467773, "learning_rate": 3.274467787463481e-07, "loss": 0.4639, "num_input_tokens_seen": 173875360, "step": 142980 }, { "epoch": 17.91567472747776, "grad_norm": 21.829317092895508, "learning_rate": 3.272522119884186e-07, "loss": 0.4326, "num_input_tokens_seen": 173881728, "step": 142985 }, { "epoch": 17.916301215386543, "grad_norm": 4.451247215270996, "learning_rate": 3.2705770109711597e-07, "loss": 0.4609, "num_input_tokens_seen": 173888192, "step": 142990 }, { "epoch": 17.916927703295325, "grad_norm": 13.46420669555664, "learning_rate": 3.2686324607476784e-07, "loss": 0.4811, "num_input_tokens_seen": 173894368, "step": 142995 }, { "epoch": 17.91755419120411, "grad_norm": 12.135114669799805, "learning_rate": 3.2666884692369895e-07, "loss": 0.4758, "num_input_tokens_seen": 173900480, "step": 143000 }, { "epoch": 17.918180679112893, "grad_norm": 7.293056011199951, "learning_rate": 3.26474503646233e-07, "loss": 0.4268, "num_input_tokens_seen": 173905920, "step": 143005 }, { "epoch": 17.918807167021676, "grad_norm": 6.5983171463012695, "learning_rate": 3.262802162446932e-07, "loss": 0.4838, "num_input_tokens_seen": 173912160, "step": 143010 }, { "epoch": 17.919433654930458, "grad_norm": 7.139578819274902, "learning_rate": 3.2608598472140374e-07, "loss": 0.4411, "num_input_tokens_seen": 173918336, "step": 143015 }, { "epoch": 17.920060142839244, "grad_norm": 6.37981653213501, "learning_rate": 3.2589180907868503e-07, "loss": 0.468, "num_input_tokens_seen": 173924512, "step": 143020 }, { "epoch": 17.920686630748026, "grad_norm": 8.662396430969238, "learning_rate": 3.2569768931886015e-07, "loss": 0.4352, "num_input_tokens_seen": 173930656, "step": 143025 }, { "epoch": 17.92131311865681, "grad_norm": 5.226478099822998, "learning_rate": 3.2550362544424895e-07, "loss": 0.4848, "num_input_tokens_seen": 173936864, "step": 143030 }, { "epoch": 17.921939606565594, "grad_norm": 7.584041118621826, "learning_rate": 3.2530961745717235e-07, "loss": 0.4386, "num_input_tokens_seen": 173942816, "step": 143035 }, { "epoch": 17.922566094474377, "grad_norm": 3.028820753097534, "learning_rate": 3.2511566535994907e-07, "loss": 0.4785, "num_input_tokens_seen": 173948768, "step": 143040 }, { "epoch": 17.92319258238316, "grad_norm": 9.985797882080078, "learning_rate": 3.249217691548995e-07, "loss": 0.4698, "num_input_tokens_seen": 173954848, "step": 143045 }, { "epoch": 17.923819070291945, "grad_norm": 3.4602410793304443, "learning_rate": 3.2472792884433946e-07, "loss": 0.4661, "num_input_tokens_seen": 173961024, "step": 143050 }, { "epoch": 17.924445558200727, "grad_norm": 5.458100318908691, "learning_rate": 3.2453414443058883e-07, "loss": 0.4041, "num_input_tokens_seen": 173967232, "step": 143055 }, { "epoch": 17.92507204610951, "grad_norm": 6.705326557159424, "learning_rate": 3.2434041591596245e-07, "loss": 0.4335, "num_input_tokens_seen": 173973760, "step": 143060 }, { "epoch": 17.925698534018295, "grad_norm": 3.8859479427337646, "learning_rate": 3.241467433027784e-07, "loss": 0.3842, "num_input_tokens_seen": 173979936, "step": 143065 }, { "epoch": 17.926325021927077, "grad_norm": 7.157989025115967, "learning_rate": 3.2395312659335044e-07, "loss": 0.4684, "num_input_tokens_seen": 173985984, "step": 143070 }, { "epoch": 17.92695150983586, "grad_norm": 4.996705532073975, "learning_rate": 3.237595657899945e-07, "loss": 0.4582, "num_input_tokens_seen": 173992064, "step": 143075 }, { "epoch": 17.927577997744642, "grad_norm": 3.994602918624878, "learning_rate": 3.2356606089502476e-07, "loss": 0.4648, "num_input_tokens_seen": 173997536, "step": 143080 }, { "epoch": 17.928204485653428, "grad_norm": 7.986446857452393, "learning_rate": 3.2337261191075387e-07, "loss": 0.4437, "num_input_tokens_seen": 174003808, "step": 143085 }, { "epoch": 17.92883097356221, "grad_norm": 6.391386032104492, "learning_rate": 3.2317921883949555e-07, "loss": 0.4267, "num_input_tokens_seen": 174009856, "step": 143090 }, { "epoch": 17.929457461470992, "grad_norm": 3.3867173194885254, "learning_rate": 3.2298588168356183e-07, "loss": 0.3985, "num_input_tokens_seen": 174015072, "step": 143095 }, { "epoch": 17.930083949379778, "grad_norm": 5.352211952209473, "learning_rate": 3.2279260044526473e-07, "loss": 0.4577, "num_input_tokens_seen": 174020992, "step": 143100 }, { "epoch": 17.93071043728856, "grad_norm": 3.2718658447265625, "learning_rate": 3.2259937512691354e-07, "loss": 0.5003, "num_input_tokens_seen": 174027232, "step": 143105 }, { "epoch": 17.931336925197343, "grad_norm": 5.282943248748779, "learning_rate": 3.224062057308203e-07, "loss": 0.4174, "num_input_tokens_seen": 174033280, "step": 143110 }, { "epoch": 17.93196341310613, "grad_norm": 6.688146114349365, "learning_rate": 3.222130922592925e-07, "loss": 0.481, "num_input_tokens_seen": 174039456, "step": 143115 }, { "epoch": 17.93258990101491, "grad_norm": 2.8761398792266846, "learning_rate": 3.2202003471464126e-07, "loss": 0.475, "num_input_tokens_seen": 174044992, "step": 143120 }, { "epoch": 17.933216388923693, "grad_norm": 7.17022180557251, "learning_rate": 3.218270330991724e-07, "loss": 0.4789, "num_input_tokens_seen": 174050720, "step": 143125 }, { "epoch": 17.93384287683248, "grad_norm": 10.039352416992188, "learning_rate": 3.2163408741519576e-07, "loss": 0.4542, "num_input_tokens_seen": 174057024, "step": 143130 }, { "epoch": 17.93446936474126, "grad_norm": 13.966680526733398, "learning_rate": 3.2144119766501557e-07, "loss": 0.4368, "num_input_tokens_seen": 174063136, "step": 143135 }, { "epoch": 17.935095852650043, "grad_norm": 4.260354518890381, "learning_rate": 3.2124836385094003e-07, "loss": 0.4454, "num_input_tokens_seen": 174069312, "step": 143140 }, { "epoch": 17.935722340558826, "grad_norm": 19.57448959350586, "learning_rate": 3.2105558597527455e-07, "loss": 0.4494, "num_input_tokens_seen": 174075584, "step": 143145 }, { "epoch": 17.93634882846761, "grad_norm": 20.74140167236328, "learning_rate": 3.208628640403222e-07, "loss": 0.5873, "num_input_tokens_seen": 174081440, "step": 143150 }, { "epoch": 17.936975316376394, "grad_norm": 5.313946723937988, "learning_rate": 3.206701980483895e-07, "loss": 0.5042, "num_input_tokens_seen": 174087680, "step": 143155 }, { "epoch": 17.937601804285176, "grad_norm": 16.411996841430664, "learning_rate": 3.20477588001778e-07, "loss": 0.4467, "num_input_tokens_seen": 174093056, "step": 143160 }, { "epoch": 17.938228292193962, "grad_norm": 5.892820358276367, "learning_rate": 3.2028503390279186e-07, "loss": 0.4649, "num_input_tokens_seen": 174099424, "step": 143165 }, { "epoch": 17.938854780102744, "grad_norm": 22.4226131439209, "learning_rate": 3.2009253575373157e-07, "loss": 0.4886, "num_input_tokens_seen": 174105952, "step": 143170 }, { "epoch": 17.939481268011527, "grad_norm": 16.90472984313965, "learning_rate": 3.199000935569008e-07, "loss": 0.5048, "num_input_tokens_seen": 174112224, "step": 143175 }, { "epoch": 17.940107755920312, "grad_norm": 10.200129508972168, "learning_rate": 3.1970770731459877e-07, "loss": 0.4422, "num_input_tokens_seen": 174118656, "step": 143180 }, { "epoch": 17.940734243829095, "grad_norm": 7.501737594604492, "learning_rate": 3.195153770291254e-07, "loss": 0.5258, "num_input_tokens_seen": 174125472, "step": 143185 }, { "epoch": 17.941360731737877, "grad_norm": 4.686482906341553, "learning_rate": 3.19323102702781e-07, "loss": 0.4538, "num_input_tokens_seen": 174131648, "step": 143190 }, { "epoch": 17.94198721964666, "grad_norm": 4.444458484649658, "learning_rate": 3.1913088433786546e-07, "loss": 0.4156, "num_input_tokens_seen": 174137728, "step": 143195 }, { "epoch": 17.942613707555445, "grad_norm": 4.1608991622924805, "learning_rate": 3.1893872193667464e-07, "loss": 0.4463, "num_input_tokens_seen": 174144032, "step": 143200 }, { "epoch": 17.943240195464227, "grad_norm": 8.550604820251465, "learning_rate": 3.1874661550150674e-07, "loss": 0.4467, "num_input_tokens_seen": 174149696, "step": 143205 }, { "epoch": 17.94386668337301, "grad_norm": 7.4524102210998535, "learning_rate": 3.1855456503466e-07, "loss": 0.4062, "num_input_tokens_seen": 174155840, "step": 143210 }, { "epoch": 17.944493171281795, "grad_norm": 7.188170433044434, "learning_rate": 3.18362570538428e-07, "loss": 0.3883, "num_input_tokens_seen": 174161696, "step": 143215 }, { "epoch": 17.945119659190578, "grad_norm": 4.807637691497803, "learning_rate": 3.18170632015109e-07, "loss": 0.4074, "num_input_tokens_seen": 174167616, "step": 143220 }, { "epoch": 17.94574614709936, "grad_norm": 5.039294719696045, "learning_rate": 3.1797874946699505e-07, "loss": 0.4509, "num_input_tokens_seen": 174173760, "step": 143225 }, { "epoch": 17.946372635008146, "grad_norm": 13.351644515991211, "learning_rate": 3.1778692289638256e-07, "loss": 0.4401, "num_input_tokens_seen": 174180256, "step": 143230 }, { "epoch": 17.946999122916928, "grad_norm": 10.029417037963867, "learning_rate": 3.175951523055637e-07, "loss": 0.4344, "num_input_tokens_seen": 174186336, "step": 143235 }, { "epoch": 17.94762561082571, "grad_norm": 3.786975145339966, "learning_rate": 3.1740343769683156e-07, "loss": 0.4528, "num_input_tokens_seen": 174192096, "step": 143240 }, { "epoch": 17.948252098734493, "grad_norm": 5.774515628814697, "learning_rate": 3.1721177907247824e-07, "loss": 0.4474, "num_input_tokens_seen": 174198272, "step": 143245 }, { "epoch": 17.94887858664328, "grad_norm": 16.70154571533203, "learning_rate": 3.170201764347952e-07, "loss": 0.4509, "num_input_tokens_seen": 174204448, "step": 143250 }, { "epoch": 17.94950507455206, "grad_norm": 8.645756721496582, "learning_rate": 3.1682862978607285e-07, "loss": 0.4406, "num_input_tokens_seen": 174210496, "step": 143255 }, { "epoch": 17.950131562460843, "grad_norm": 6.797872066497803, "learning_rate": 3.1663713912860163e-07, "loss": 0.4188, "num_input_tokens_seen": 174216800, "step": 143260 }, { "epoch": 17.95075805036963, "grad_norm": 15.83849811553955, "learning_rate": 3.1644570446467125e-07, "loss": 0.4586, "num_input_tokens_seen": 174223040, "step": 143265 }, { "epoch": 17.95138453827841, "grad_norm": 3.9266951084136963, "learning_rate": 3.1625432579657e-07, "loss": 0.4112, "num_input_tokens_seen": 174229120, "step": 143270 }, { "epoch": 17.952011026187193, "grad_norm": 4.2783966064453125, "learning_rate": 3.1606300312658656e-07, "loss": 0.4562, "num_input_tokens_seen": 174235328, "step": 143275 }, { "epoch": 17.95263751409598, "grad_norm": 2.8296210765838623, "learning_rate": 3.158717364570074e-07, "loss": 0.4461, "num_input_tokens_seen": 174241472, "step": 143280 }, { "epoch": 17.95326400200476, "grad_norm": 4.9526777267456055, "learning_rate": 3.156805257901208e-07, "loss": 0.4537, "num_input_tokens_seen": 174247712, "step": 143285 }, { "epoch": 17.953890489913544, "grad_norm": 13.372696876525879, "learning_rate": 3.154893711282103e-07, "loss": 0.4403, "num_input_tokens_seen": 174253984, "step": 143290 }, { "epoch": 17.95451697782233, "grad_norm": 12.096598625183105, "learning_rate": 3.152982724735637e-07, "loss": 0.4419, "num_input_tokens_seen": 174260224, "step": 143295 }, { "epoch": 17.955143465731112, "grad_norm": 6.810247898101807, "learning_rate": 3.1510722982846464e-07, "loss": 0.4168, "num_input_tokens_seen": 174266432, "step": 143300 }, { "epoch": 17.955769953639894, "grad_norm": 6.116417407989502, "learning_rate": 3.14916243195198e-07, "loss": 0.4608, "num_input_tokens_seen": 174272544, "step": 143305 }, { "epoch": 17.956396441548677, "grad_norm": 5.372863292694092, "learning_rate": 3.147253125760463e-07, "loss": 0.4397, "num_input_tokens_seen": 174278144, "step": 143310 }, { "epoch": 17.957022929457462, "grad_norm": 5.2942023277282715, "learning_rate": 3.145344379732934e-07, "loss": 0.3876, "num_input_tokens_seen": 174284448, "step": 143315 }, { "epoch": 17.957649417366245, "grad_norm": 9.508492469787598, "learning_rate": 3.143436193892196e-07, "loss": 0.4102, "num_input_tokens_seen": 174290400, "step": 143320 }, { "epoch": 17.958275905275027, "grad_norm": 5.497370719909668, "learning_rate": 3.141528568261076e-07, "loss": 0.4454, "num_input_tokens_seen": 174296608, "step": 143325 }, { "epoch": 17.958902393183813, "grad_norm": 6.6204938888549805, "learning_rate": 3.139621502862383e-07, "loss": 0.4356, "num_input_tokens_seen": 174302976, "step": 143330 }, { "epoch": 17.959528881092595, "grad_norm": 4.841511249542236, "learning_rate": 3.1377149977189103e-07, "loss": 0.4287, "num_input_tokens_seen": 174309376, "step": 143335 }, { "epoch": 17.960155369001377, "grad_norm": 4.50714111328125, "learning_rate": 3.135809052853456e-07, "loss": 0.4683, "num_input_tokens_seen": 174315456, "step": 143340 }, { "epoch": 17.960781856910163, "grad_norm": 5.671990871429443, "learning_rate": 3.133903668288801e-07, "loss": 0.4612, "num_input_tokens_seen": 174321824, "step": 143345 }, { "epoch": 17.961408344818945, "grad_norm": 10.298888206481934, "learning_rate": 3.13199884404774e-07, "loss": 0.4851, "num_input_tokens_seen": 174327840, "step": 143350 }, { "epoch": 17.962034832727728, "grad_norm": 9.411726951599121, "learning_rate": 3.1300945801530256e-07, "loss": 0.5102, "num_input_tokens_seen": 174333888, "step": 143355 }, { "epoch": 17.96266132063651, "grad_norm": 13.754820823669434, "learning_rate": 3.128190876627446e-07, "loss": 0.4482, "num_input_tokens_seen": 174339968, "step": 143360 }, { "epoch": 17.963287808545296, "grad_norm": 5.056269645690918, "learning_rate": 3.1262877334937437e-07, "loss": 0.3902, "num_input_tokens_seen": 174346048, "step": 143365 }, { "epoch": 17.963914296454078, "grad_norm": 8.367932319641113, "learning_rate": 3.124385150774689e-07, "loss": 0.4157, "num_input_tokens_seen": 174352256, "step": 143370 }, { "epoch": 17.96454078436286, "grad_norm": 12.855074882507324, "learning_rate": 3.122483128493015e-07, "loss": 0.4403, "num_input_tokens_seen": 174358752, "step": 143375 }, { "epoch": 17.965167272271646, "grad_norm": 6.766657829284668, "learning_rate": 3.1205816666714736e-07, "loss": 0.4089, "num_input_tokens_seen": 174364800, "step": 143380 }, { "epoch": 17.96579376018043, "grad_norm": 8.127968788146973, "learning_rate": 3.1186807653327877e-07, "loss": 0.4578, "num_input_tokens_seen": 174370656, "step": 143385 }, { "epoch": 17.96642024808921, "grad_norm": 14.897957801818848, "learning_rate": 3.1167804244996824e-07, "loss": 0.4167, "num_input_tokens_seen": 174376960, "step": 143390 }, { "epoch": 17.967046735997997, "grad_norm": 10.233541488647461, "learning_rate": 3.11488064419489e-07, "loss": 0.4751, "num_input_tokens_seen": 174383264, "step": 143395 }, { "epoch": 17.96767322390678, "grad_norm": 22.41049575805664, "learning_rate": 3.112981424441125e-07, "loss": 0.5383, "num_input_tokens_seen": 174389088, "step": 143400 }, { "epoch": 17.96829971181556, "grad_norm": 21.39389991760254, "learning_rate": 3.1110827652610755e-07, "loss": 0.45, "num_input_tokens_seen": 174395168, "step": 143405 }, { "epoch": 17.968926199724347, "grad_norm": 14.052205085754395, "learning_rate": 3.109184666677456e-07, "loss": 0.5317, "num_input_tokens_seen": 174401248, "step": 143410 }, { "epoch": 17.96955268763313, "grad_norm": 3.0315589904785156, "learning_rate": 3.107287128712971e-07, "loss": 0.4601, "num_input_tokens_seen": 174407424, "step": 143415 }, { "epoch": 17.97017917554191, "grad_norm": 14.015061378479004, "learning_rate": 3.1053901513902807e-07, "loss": 0.4383, "num_input_tokens_seen": 174413728, "step": 143420 }, { "epoch": 17.970805663450694, "grad_norm": 13.937766075134277, "learning_rate": 3.1034937347320827e-07, "loss": 0.5231, "num_input_tokens_seen": 174420032, "step": 143425 }, { "epoch": 17.97143215135948, "grad_norm": 4.719489574432373, "learning_rate": 3.101597878761042e-07, "loss": 0.4402, "num_input_tokens_seen": 174426400, "step": 143430 }, { "epoch": 17.972058639268262, "grad_norm": 18.610567092895508, "learning_rate": 3.0997025834998364e-07, "loss": 0.471, "num_input_tokens_seen": 174432032, "step": 143435 }, { "epoch": 17.972685127177044, "grad_norm": 5.431517601013184, "learning_rate": 3.097807848971107e-07, "loss": 0.4113, "num_input_tokens_seen": 174438272, "step": 143440 }, { "epoch": 17.97331161508583, "grad_norm": 3.746011734008789, "learning_rate": 3.0959136751975206e-07, "loss": 0.4109, "num_input_tokens_seen": 174444384, "step": 143445 }, { "epoch": 17.973938102994612, "grad_norm": 4.401145935058594, "learning_rate": 3.09402006220173e-07, "loss": 0.3981, "num_input_tokens_seen": 174450176, "step": 143450 }, { "epoch": 17.974564590903395, "grad_norm": 7.078250408172607, "learning_rate": 3.092127010006357e-07, "loss": 0.4302, "num_input_tokens_seen": 174456128, "step": 143455 }, { "epoch": 17.97519107881218, "grad_norm": 18.946115493774414, "learning_rate": 3.09023451863405e-07, "loss": 0.4652, "num_input_tokens_seen": 174462112, "step": 143460 }, { "epoch": 17.975817566720963, "grad_norm": 10.12106990814209, "learning_rate": 3.088342588107424e-07, "loss": 0.4853, "num_input_tokens_seen": 174468288, "step": 143465 }, { "epoch": 17.976444054629745, "grad_norm": 24.019155502319336, "learning_rate": 3.086451218449105e-07, "loss": 0.4597, "num_input_tokens_seen": 174474400, "step": 143470 }, { "epoch": 17.977070542538527, "grad_norm": 17.672313690185547, "learning_rate": 3.084560409681703e-07, "loss": 0.442, "num_input_tokens_seen": 174480672, "step": 143475 }, { "epoch": 17.977697030447313, "grad_norm": 13.107699394226074, "learning_rate": 3.082670161827833e-07, "loss": 0.4483, "num_input_tokens_seen": 174486272, "step": 143480 }, { "epoch": 17.978323518356095, "grad_norm": 12.249485969543457, "learning_rate": 3.080780474910078e-07, "loss": 0.4577, "num_input_tokens_seen": 174492608, "step": 143485 }, { "epoch": 17.978950006264878, "grad_norm": 6.98331356048584, "learning_rate": 3.078891348951041e-07, "loss": 0.48, "num_input_tokens_seen": 174498912, "step": 143490 }, { "epoch": 17.979576494173664, "grad_norm": 8.675094604492188, "learning_rate": 3.0770027839733096e-07, "loss": 0.4103, "num_input_tokens_seen": 174504896, "step": 143495 }, { "epoch": 17.980202982082446, "grad_norm": 4.885408878326416, "learning_rate": 3.075114779999455e-07, "loss": 0.4961, "num_input_tokens_seen": 174510592, "step": 143500 }, { "epoch": 17.980829469991228, "grad_norm": 4.707784652709961, "learning_rate": 3.0732273370520595e-07, "loss": 0.4444, "num_input_tokens_seen": 174516704, "step": 143505 }, { "epoch": 17.981455957900014, "grad_norm": 4.112641334533691, "learning_rate": 3.071340455153682e-07, "loss": 0.4941, "num_input_tokens_seen": 174522656, "step": 143510 }, { "epoch": 17.982082445808796, "grad_norm": 4.830410957336426, "learning_rate": 3.069454134326894e-07, "loss": 0.4189, "num_input_tokens_seen": 174528736, "step": 143515 }, { "epoch": 17.98270893371758, "grad_norm": 12.411447525024414, "learning_rate": 3.067568374594232e-07, "loss": 0.4648, "num_input_tokens_seen": 174533952, "step": 143520 }, { "epoch": 17.983335421626364, "grad_norm": 4.788145542144775, "learning_rate": 3.065683175978251e-07, "loss": 0.4091, "num_input_tokens_seen": 174539808, "step": 143525 }, { "epoch": 17.983961909535147, "grad_norm": 4.748983383178711, "learning_rate": 3.063798538501489e-07, "loss": 0.4144, "num_input_tokens_seen": 174545760, "step": 143530 }, { "epoch": 17.98458839744393, "grad_norm": 8.094596862792969, "learning_rate": 3.061914462186477e-07, "loss": 0.4446, "num_input_tokens_seen": 174551840, "step": 143535 }, { "epoch": 17.98521488535271, "grad_norm": 5.625664710998535, "learning_rate": 3.0600309470557364e-07, "loss": 0.4285, "num_input_tokens_seen": 174558208, "step": 143540 }, { "epoch": 17.985841373261497, "grad_norm": 7.036417484283447, "learning_rate": 3.058147993131799e-07, "loss": 0.4239, "num_input_tokens_seen": 174564416, "step": 143545 }, { "epoch": 17.98646786117028, "grad_norm": 9.032302856445312, "learning_rate": 3.0562656004371583e-07, "loss": 0.4617, "num_input_tokens_seen": 174570528, "step": 143550 }, { "epoch": 17.98709434907906, "grad_norm": 13.172907829284668, "learning_rate": 3.05438376899434e-07, "loss": 0.4085, "num_input_tokens_seen": 174577024, "step": 143555 }, { "epoch": 17.987720836987847, "grad_norm": 3.768291711807251, "learning_rate": 3.0525024988258267e-07, "loss": 0.4909, "num_input_tokens_seen": 174583008, "step": 143560 }, { "epoch": 17.98834732489663, "grad_norm": 28.771743774414062, "learning_rate": 3.0506217899541223e-07, "loss": 0.4762, "num_input_tokens_seen": 174588992, "step": 143565 }, { "epoch": 17.988973812805412, "grad_norm": 9.991127014160156, "learning_rate": 3.0487416424017034e-07, "loss": 0.4433, "num_input_tokens_seen": 174595104, "step": 143570 }, { "epoch": 17.989600300714198, "grad_norm": 9.015854835510254, "learning_rate": 3.0468620561910523e-07, "loss": 0.5099, "num_input_tokens_seen": 174601088, "step": 143575 }, { "epoch": 17.99022678862298, "grad_norm": 5.98470401763916, "learning_rate": 3.044983031344645e-07, "loss": 0.4378, "num_input_tokens_seen": 174607168, "step": 143580 }, { "epoch": 17.990853276531762, "grad_norm": 9.433592796325684, "learning_rate": 3.0431045678849415e-07, "loss": 0.3952, "num_input_tokens_seen": 174612864, "step": 143585 }, { "epoch": 17.991479764440545, "grad_norm": 3.1437718868255615, "learning_rate": 3.0412266658344067e-07, "loss": 0.4477, "num_input_tokens_seen": 174619072, "step": 143590 }, { "epoch": 17.99210625234933, "grad_norm": 4.935615062713623, "learning_rate": 3.0393493252154793e-07, "loss": 0.5261, "num_input_tokens_seen": 174624576, "step": 143595 }, { "epoch": 17.992732740258113, "grad_norm": 4.530348777770996, "learning_rate": 3.0374725460506125e-07, "loss": 0.4293, "num_input_tokens_seen": 174630400, "step": 143600 }, { "epoch": 17.993359228166895, "grad_norm": 13.279601097106934, "learning_rate": 3.035596328362245e-07, "loss": 0.4277, "num_input_tokens_seen": 174636864, "step": 143605 }, { "epoch": 17.99398571607568, "grad_norm": 4.29703426361084, "learning_rate": 3.033720672172819e-07, "loss": 0.4168, "num_input_tokens_seen": 174643072, "step": 143610 }, { "epoch": 17.994612203984463, "grad_norm": 5.321613311767578, "learning_rate": 3.0318455775047384e-07, "loss": 0.4681, "num_input_tokens_seen": 174649280, "step": 143615 }, { "epoch": 17.995238691893245, "grad_norm": 14.464327812194824, "learning_rate": 3.0299710443804476e-07, "loss": 0.4841, "num_input_tokens_seen": 174655552, "step": 143620 }, { "epoch": 17.99586517980203, "grad_norm": 6.1791157722473145, "learning_rate": 3.028097072822328e-07, "loss": 0.4697, "num_input_tokens_seen": 174661856, "step": 143625 }, { "epoch": 17.996491667710814, "grad_norm": 3.938401222229004, "learning_rate": 3.0262236628528006e-07, "loss": 0.4614, "num_input_tokens_seen": 174668128, "step": 143630 }, { "epoch": 17.997118155619596, "grad_norm": 5.120665550231934, "learning_rate": 3.024350814494276e-07, "loss": 0.3999, "num_input_tokens_seen": 174674272, "step": 143635 }, { "epoch": 17.997744643528378, "grad_norm": 4.914694309234619, "learning_rate": 3.0224785277691184e-07, "loss": 0.4712, "num_input_tokens_seen": 174679840, "step": 143640 }, { "epoch": 17.998371131437164, "grad_norm": 11.301390647888184, "learning_rate": 3.0206068026997383e-07, "loss": 0.4759, "num_input_tokens_seen": 174685696, "step": 143645 }, { "epoch": 17.998997619345946, "grad_norm": 3.4512972831726074, "learning_rate": 3.0187356393084897e-07, "loss": 0.4107, "num_input_tokens_seen": 174691520, "step": 143650 }, { "epoch": 17.99962410725473, "grad_norm": 3.879444122314453, "learning_rate": 3.0168650376177666e-07, "loss": 0.5048, "num_input_tokens_seen": 174697216, "step": 143655 }, { "epoch": 18.0, "eval_loss": 0.5134217739105225, "eval_runtime": 223.9301, "eval_samples_per_second": 35.641, "eval_steps_per_second": 8.913, "num_input_tokens_seen": 174700864, "step": 143658 }, { "epoch": 18.000250595163514, "grad_norm": 5.5367937088012695, "learning_rate": 3.014994997649917e-07, "loss": 0.4471, "num_input_tokens_seen": 174703232, "step": 143660 }, { "epoch": 18.000877083072297, "grad_norm": 5.945342063903809, "learning_rate": 3.0131255194273124e-07, "loss": 0.4492, "num_input_tokens_seen": 174709312, "step": 143665 }, { "epoch": 18.00150357098108, "grad_norm": 4.135803699493408, "learning_rate": 3.0112566029722844e-07, "loss": 0.4026, "num_input_tokens_seen": 174715424, "step": 143670 }, { "epoch": 18.002130058889865, "grad_norm": 7.765787601470947, "learning_rate": 3.009388248307199e-07, "loss": 0.4757, "num_input_tokens_seen": 174721248, "step": 143675 }, { "epoch": 18.002756546798647, "grad_norm": 16.68355941772461, "learning_rate": 3.0075204554543823e-07, "loss": 0.396, "num_input_tokens_seen": 174726848, "step": 143680 }, { "epoch": 18.00338303470743, "grad_norm": 9.746281623840332, "learning_rate": 3.005653224436172e-07, "loss": 0.4199, "num_input_tokens_seen": 174732960, "step": 143685 }, { "epoch": 18.004009522616215, "grad_norm": 5.497888565063477, "learning_rate": 3.003786555274879e-07, "loss": 0.4519, "num_input_tokens_seen": 174739136, "step": 143690 }, { "epoch": 18.004636010524997, "grad_norm": 4.726734638214111, "learning_rate": 3.001920447992829e-07, "loss": 0.4513, "num_input_tokens_seen": 174745248, "step": 143695 }, { "epoch": 18.00526249843378, "grad_norm": 19.535991668701172, "learning_rate": 3.000054902612343e-07, "loss": 0.4574, "num_input_tokens_seen": 174751584, "step": 143700 }, { "epoch": 18.005888986342562, "grad_norm": 6.85105037689209, "learning_rate": 2.998189919155714e-07, "loss": 0.4889, "num_input_tokens_seen": 174757856, "step": 143705 }, { "epoch": 18.006515474251348, "grad_norm": 8.678131103515625, "learning_rate": 2.996325497645236e-07, "loss": 0.3992, "num_input_tokens_seen": 174763904, "step": 143710 }, { "epoch": 18.00714196216013, "grad_norm": 5.55860710144043, "learning_rate": 2.994461638103202e-07, "loss": 0.4211, "num_input_tokens_seen": 174770304, "step": 143715 }, { "epoch": 18.007768450068912, "grad_norm": 4.163975238800049, "learning_rate": 2.992598340551911e-07, "loss": 0.3934, "num_input_tokens_seen": 174776640, "step": 143720 }, { "epoch": 18.008394937977698, "grad_norm": 3.3401365280151367, "learning_rate": 2.990735605013623e-07, "loss": 0.4699, "num_input_tokens_seen": 174782880, "step": 143725 }, { "epoch": 18.00902142588648, "grad_norm": 5.4282684326171875, "learning_rate": 2.988873431510619e-07, "loss": 0.4092, "num_input_tokens_seen": 174788608, "step": 143730 }, { "epoch": 18.009647913795263, "grad_norm": 4.5955729484558105, "learning_rate": 2.98701182006515e-07, "loss": 0.5004, "num_input_tokens_seen": 174794560, "step": 143735 }, { "epoch": 18.01027440170405, "grad_norm": 10.087625503540039, "learning_rate": 2.9851507706994855e-07, "loss": 0.4414, "num_input_tokens_seen": 174800416, "step": 143740 }, { "epoch": 18.01090088961283, "grad_norm": 2.5458295345306396, "learning_rate": 2.983290283435869e-07, "loss": 0.4296, "num_input_tokens_seen": 174806656, "step": 143745 }, { "epoch": 18.011527377521613, "grad_norm": 5.848944664001465, "learning_rate": 2.98143035829655e-07, "loss": 0.4222, "num_input_tokens_seen": 174812640, "step": 143750 }, { "epoch": 18.012153865430395, "grad_norm": 2.7440035343170166, "learning_rate": 2.97957099530376e-07, "loss": 0.4322, "num_input_tokens_seen": 174819040, "step": 143755 }, { "epoch": 18.01278035333918, "grad_norm": 3.0312373638153076, "learning_rate": 2.977712194479726e-07, "loss": 0.4246, "num_input_tokens_seen": 174825024, "step": 143760 }, { "epoch": 18.013406841247964, "grad_norm": 5.485624313354492, "learning_rate": 2.975853955846686e-07, "loss": 0.4061, "num_input_tokens_seen": 174831264, "step": 143765 }, { "epoch": 18.014033329156746, "grad_norm": 4.280416488647461, "learning_rate": 2.973996279426844e-07, "loss": 0.4169, "num_input_tokens_seen": 174837600, "step": 143770 }, { "epoch": 18.01465981706553, "grad_norm": 10.214305877685547, "learning_rate": 2.9721391652424157e-07, "loss": 0.4609, "num_input_tokens_seen": 174843616, "step": 143775 }, { "epoch": 18.015286304974314, "grad_norm": 3.1463584899902344, "learning_rate": 2.9702826133156003e-07, "loss": 0.4473, "num_input_tokens_seen": 174849824, "step": 143780 }, { "epoch": 18.015912792883096, "grad_norm": 4.537925720214844, "learning_rate": 2.968426623668602e-07, "loss": 0.4355, "num_input_tokens_seen": 174855840, "step": 143785 }, { "epoch": 18.016539280791882, "grad_norm": 36.798160552978516, "learning_rate": 2.966571196323598e-07, "loss": 0.5187, "num_input_tokens_seen": 174862016, "step": 143790 }, { "epoch": 18.017165768700664, "grad_norm": 16.716266632080078, "learning_rate": 2.964716331302786e-07, "loss": 0.4102, "num_input_tokens_seen": 174868448, "step": 143795 }, { "epoch": 18.017792256609447, "grad_norm": 4.670019626617432, "learning_rate": 2.962862028628327e-07, "loss": 0.4426, "num_input_tokens_seen": 174874272, "step": 143800 }, { "epoch": 18.018418744518232, "grad_norm": 4.787405490875244, "learning_rate": 2.961008288322398e-07, "loss": 0.4524, "num_input_tokens_seen": 174880352, "step": 143805 }, { "epoch": 18.019045232427015, "grad_norm": 10.13097858428955, "learning_rate": 2.959155110407164e-07, "loss": 0.4534, "num_input_tokens_seen": 174886240, "step": 143810 }, { "epoch": 18.019671720335797, "grad_norm": 5.003570556640625, "learning_rate": 2.95730249490479e-07, "loss": 0.4083, "num_input_tokens_seen": 174892224, "step": 143815 }, { "epoch": 18.02029820824458, "grad_norm": 5.275812149047852, "learning_rate": 2.955450441837404e-07, "loss": 0.4434, "num_input_tokens_seen": 174898048, "step": 143820 }, { "epoch": 18.020924696153365, "grad_norm": 8.427604675292969, "learning_rate": 2.9535989512271655e-07, "loss": 0.393, "num_input_tokens_seen": 174904128, "step": 143825 }, { "epoch": 18.021551184062147, "grad_norm": 19.526456832885742, "learning_rate": 2.9517480230962126e-07, "loss": 0.4714, "num_input_tokens_seen": 174910080, "step": 143830 }, { "epoch": 18.02217767197093, "grad_norm": 14.216774940490723, "learning_rate": 2.9498976574666607e-07, "loss": 0.4409, "num_input_tokens_seen": 174916224, "step": 143835 }, { "epoch": 18.022804159879716, "grad_norm": 6.7256879806518555, "learning_rate": 2.948047854360647e-07, "loss": 0.3798, "num_input_tokens_seen": 174922752, "step": 143840 }, { "epoch": 18.023430647788498, "grad_norm": 12.996354103088379, "learning_rate": 2.946198613800272e-07, "loss": 0.4151, "num_input_tokens_seen": 174929120, "step": 143845 }, { "epoch": 18.02405713569728, "grad_norm": 5.089250087738037, "learning_rate": 2.944349935807661e-07, "loss": 0.4123, "num_input_tokens_seen": 174934688, "step": 143850 }, { "epoch": 18.024683623606066, "grad_norm": 8.847192764282227, "learning_rate": 2.942501820404903e-07, "loss": 0.4944, "num_input_tokens_seen": 174940448, "step": 143855 }, { "epoch": 18.025310111514848, "grad_norm": 8.452720642089844, "learning_rate": 2.9406542676141016e-07, "loss": 0.4649, "num_input_tokens_seen": 174945952, "step": 143860 }, { "epoch": 18.02593659942363, "grad_norm": 3.998199462890625, "learning_rate": 2.9388072774573397e-07, "loss": 0.447, "num_input_tokens_seen": 174951968, "step": 143865 }, { "epoch": 18.026563087332413, "grad_norm": 5.762453079223633, "learning_rate": 2.936960849956716e-07, "loss": 0.4205, "num_input_tokens_seen": 174958176, "step": 143870 }, { "epoch": 18.0271895752412, "grad_norm": 3.734320640563965, "learning_rate": 2.93511498513428e-07, "loss": 0.4654, "num_input_tokens_seen": 174963680, "step": 143875 }, { "epoch": 18.02781606314998, "grad_norm": 4.0741095542907715, "learning_rate": 2.9332696830121133e-07, "loss": 0.5872, "num_input_tokens_seen": 174969856, "step": 143880 }, { "epoch": 18.028442551058763, "grad_norm": 6.787759304046631, "learning_rate": 2.931424943612293e-07, "loss": 0.4249, "num_input_tokens_seen": 174976160, "step": 143885 }, { "epoch": 18.02906903896755, "grad_norm": 3.956240177154541, "learning_rate": 2.929580766956847e-07, "loss": 0.4088, "num_input_tokens_seen": 174981856, "step": 143890 }, { "epoch": 18.02969552687633, "grad_norm": 25.537479400634766, "learning_rate": 2.92773715306785e-07, "loss": 0.5467, "num_input_tokens_seen": 174988160, "step": 143895 }, { "epoch": 18.030322014785114, "grad_norm": 9.5492582321167, "learning_rate": 2.925894101967319e-07, "loss": 0.4972, "num_input_tokens_seen": 174994400, "step": 143900 }, { "epoch": 18.0309485026939, "grad_norm": 8.293606758117676, "learning_rate": 2.9240516136773144e-07, "loss": 0.4835, "num_input_tokens_seen": 175000416, "step": 143905 }, { "epoch": 18.03157499060268, "grad_norm": 4.649541854858398, "learning_rate": 2.9222096882198404e-07, "loss": 0.4901, "num_input_tokens_seen": 175006656, "step": 143910 }, { "epoch": 18.032201478511464, "grad_norm": 4.073836326599121, "learning_rate": 2.9203683256169344e-07, "loss": 0.3989, "num_input_tokens_seen": 175012800, "step": 143915 }, { "epoch": 18.03282796642025, "grad_norm": 13.842813491821289, "learning_rate": 2.918527525890602e-07, "loss": 0.4127, "num_input_tokens_seen": 175019008, "step": 143920 }, { "epoch": 18.033454454329032, "grad_norm": 10.569122314453125, "learning_rate": 2.9166872890628693e-07, "loss": 0.4543, "num_input_tokens_seen": 175025216, "step": 143925 }, { "epoch": 18.034080942237814, "grad_norm": 5.731705665588379, "learning_rate": 2.9148476151557135e-07, "loss": 0.3939, "num_input_tokens_seen": 175031392, "step": 143930 }, { "epoch": 18.034707430146597, "grad_norm": 6.988111972808838, "learning_rate": 2.9130085041911506e-07, "loss": 0.3632, "num_input_tokens_seen": 175037600, "step": 143935 }, { "epoch": 18.035333918055382, "grad_norm": 7.673766613006592, "learning_rate": 2.911169956191157e-07, "loss": 0.4572, "num_input_tokens_seen": 175043168, "step": 143940 }, { "epoch": 18.035960405964165, "grad_norm": 6.3652567863464355, "learning_rate": 2.909331971177709e-07, "loss": 0.4414, "num_input_tokens_seen": 175049248, "step": 143945 }, { "epoch": 18.036586893872947, "grad_norm": 6.554676055908203, "learning_rate": 2.9074945491728015e-07, "loss": 0.4033, "num_input_tokens_seen": 175055424, "step": 143950 }, { "epoch": 18.037213381781733, "grad_norm": 16.45207405090332, "learning_rate": 2.9056576901983824e-07, "loss": 0.499, "num_input_tokens_seen": 175061568, "step": 143955 }, { "epoch": 18.037839869690515, "grad_norm": 13.550769805908203, "learning_rate": 2.903821394276424e-07, "loss": 0.4259, "num_input_tokens_seen": 175067776, "step": 143960 }, { "epoch": 18.038466357599297, "grad_norm": 16.059236526489258, "learning_rate": 2.9019856614288745e-07, "loss": 0.5033, "num_input_tokens_seen": 175073696, "step": 143965 }, { "epoch": 18.039092845508083, "grad_norm": 12.907669067382812, "learning_rate": 2.900150491677689e-07, "loss": 0.452, "num_input_tokens_seen": 175079808, "step": 143970 }, { "epoch": 18.039719333416866, "grad_norm": 19.201698303222656, "learning_rate": 2.8983158850447947e-07, "loss": 0.4359, "num_input_tokens_seen": 175085856, "step": 143975 }, { "epoch": 18.040345821325648, "grad_norm": 8.770115852355957, "learning_rate": 2.896481841552146e-07, "loss": 0.3957, "num_input_tokens_seen": 175091968, "step": 143980 }, { "epoch": 18.04097230923443, "grad_norm": 11.131340026855469, "learning_rate": 2.8946483612216525e-07, "loss": 0.4693, "num_input_tokens_seen": 175098176, "step": 143985 }, { "epoch": 18.041598797143216, "grad_norm": 3.7055556774139404, "learning_rate": 2.8928154440752475e-07, "loss": 0.4479, "num_input_tokens_seen": 175104192, "step": 143990 }, { "epoch": 18.042225285051998, "grad_norm": 19.003808975219727, "learning_rate": 2.8909830901348304e-07, "loss": 0.4601, "num_input_tokens_seen": 175110560, "step": 143995 }, { "epoch": 18.04285177296078, "grad_norm": 3.5104448795318604, "learning_rate": 2.889151299422327e-07, "loss": 0.3939, "num_input_tokens_seen": 175116832, "step": 144000 }, { "epoch": 18.043478260869566, "grad_norm": 5.294409275054932, "learning_rate": 2.8873200719596207e-07, "loss": 0.4388, "num_input_tokens_seen": 175122688, "step": 144005 }, { "epoch": 18.04410474877835, "grad_norm": 7.016822814941406, "learning_rate": 2.88548940776861e-07, "loss": 0.4395, "num_input_tokens_seen": 175128800, "step": 144010 }, { "epoch": 18.04473123668713, "grad_norm": 5.214782238006592, "learning_rate": 2.8836593068711894e-07, "loss": 0.4404, "num_input_tokens_seen": 175134944, "step": 144015 }, { "epoch": 18.045357724595917, "grad_norm": 11.372149467468262, "learning_rate": 2.8818297692892407e-07, "loss": 0.4677, "num_input_tokens_seen": 175141024, "step": 144020 }, { "epoch": 18.0459842125047, "grad_norm": 12.115192413330078, "learning_rate": 2.880000795044624e-07, "loss": 0.4547, "num_input_tokens_seen": 175147296, "step": 144025 }, { "epoch": 18.04661070041348, "grad_norm": 5.032780170440674, "learning_rate": 2.878172384159211e-07, "loss": 0.473, "num_input_tokens_seen": 175153472, "step": 144030 }, { "epoch": 18.047237188322267, "grad_norm": 19.704710006713867, "learning_rate": 2.8763445366548795e-07, "loss": 0.4136, "num_input_tokens_seen": 175159584, "step": 144035 }, { "epoch": 18.04786367623105, "grad_norm": 22.366615295410156, "learning_rate": 2.874517252553455e-07, "loss": 0.4781, "num_input_tokens_seen": 175166144, "step": 144040 }, { "epoch": 18.04849016413983, "grad_norm": 9.974209785461426, "learning_rate": 2.8726905318768096e-07, "loss": 0.456, "num_input_tokens_seen": 175171808, "step": 144045 }, { "epoch": 18.049116652048614, "grad_norm": 4.8950419425964355, "learning_rate": 2.870864374646759e-07, "loss": 0.4212, "num_input_tokens_seen": 175177888, "step": 144050 }, { "epoch": 18.0497431399574, "grad_norm": 6.550784111022949, "learning_rate": 2.8690387808851584e-07, "loss": 0.4489, "num_input_tokens_seen": 175183776, "step": 144055 }, { "epoch": 18.050369627866182, "grad_norm": 5.533430099487305, "learning_rate": 2.8672137506138233e-07, "loss": 0.4481, "num_input_tokens_seen": 175189984, "step": 144060 }, { "epoch": 18.050996115774964, "grad_norm": 4.750585556030273, "learning_rate": 2.8653892838545695e-07, "loss": 0.4279, "num_input_tokens_seen": 175195584, "step": 144065 }, { "epoch": 18.05162260368375, "grad_norm": 12.51725959777832, "learning_rate": 2.8635653806292186e-07, "loss": 0.4857, "num_input_tokens_seen": 175201536, "step": 144070 }, { "epoch": 18.052249091592532, "grad_norm": 21.041547775268555, "learning_rate": 2.8617420409595695e-07, "loss": 0.4363, "num_input_tokens_seen": 175207712, "step": 144075 }, { "epoch": 18.052875579501315, "grad_norm": 4.7094879150390625, "learning_rate": 2.859919264867439e-07, "loss": 0.4791, "num_input_tokens_seen": 175213664, "step": 144080 }, { "epoch": 18.0535020674101, "grad_norm": 3.494527816772461, "learning_rate": 2.858097052374592e-07, "loss": 0.4733, "num_input_tokens_seen": 175219424, "step": 144085 }, { "epoch": 18.054128555318883, "grad_norm": 13.642875671386719, "learning_rate": 2.8562754035028393e-07, "loss": 0.4501, "num_input_tokens_seen": 175225504, "step": 144090 }, { "epoch": 18.054755043227665, "grad_norm": 16.1612548828125, "learning_rate": 2.854454318273947e-07, "loss": 0.425, "num_input_tokens_seen": 175231648, "step": 144095 }, { "epoch": 18.055381531136447, "grad_norm": 17.907169342041016, "learning_rate": 2.852633796709692e-07, "loss": 0.5446, "num_input_tokens_seen": 175237984, "step": 144100 }, { "epoch": 18.056008019045233, "grad_norm": 3.454010486602783, "learning_rate": 2.850813838831834e-07, "loss": 0.3935, "num_input_tokens_seen": 175244000, "step": 144105 }, { "epoch": 18.056634506954016, "grad_norm": 4.813449382781982, "learning_rate": 2.84899444466214e-07, "loss": 0.4137, "num_input_tokens_seen": 175250240, "step": 144110 }, { "epoch": 18.057260994862798, "grad_norm": 18.993379592895508, "learning_rate": 2.847175614222364e-07, "loss": 0.4382, "num_input_tokens_seen": 175256064, "step": 144115 }, { "epoch": 18.057887482771584, "grad_norm": 5.945857048034668, "learning_rate": 2.8453573475342444e-07, "loss": 0.3873, "num_input_tokens_seen": 175262240, "step": 144120 }, { "epoch": 18.058513970680366, "grad_norm": 7.896078586578369, "learning_rate": 2.84353964461952e-07, "loss": 0.405, "num_input_tokens_seen": 175268128, "step": 144125 }, { "epoch": 18.059140458589148, "grad_norm": 4.044579029083252, "learning_rate": 2.841722505499928e-07, "loss": 0.443, "num_input_tokens_seen": 175274048, "step": 144130 }, { "epoch": 18.059766946497934, "grad_norm": 3.3880386352539062, "learning_rate": 2.8399059301971953e-07, "loss": 0.4258, "num_input_tokens_seen": 175280384, "step": 144135 }, { "epoch": 18.060393434406716, "grad_norm": 5.881494522094727, "learning_rate": 2.8380899187330335e-07, "loss": 0.4068, "num_input_tokens_seen": 175286560, "step": 144140 }, { "epoch": 18.0610199223155, "grad_norm": 6.831358909606934, "learning_rate": 2.8362744711291633e-07, "loss": 0.436, "num_input_tokens_seen": 175292672, "step": 144145 }, { "epoch": 18.061646410224284, "grad_norm": 4.379965305328369, "learning_rate": 2.834459587407279e-07, "loss": 0.469, "num_input_tokens_seen": 175298880, "step": 144150 }, { "epoch": 18.062272898133067, "grad_norm": 5.011648178100586, "learning_rate": 2.8326452675890903e-07, "loss": 0.4492, "num_input_tokens_seen": 175305152, "step": 144155 }, { "epoch": 18.06289938604185, "grad_norm": 5.3298258781433105, "learning_rate": 2.8308315116962805e-07, "loss": 0.449, "num_input_tokens_seen": 175311136, "step": 144160 }, { "epoch": 18.06352587395063, "grad_norm": 13.093510627746582, "learning_rate": 2.829018319750543e-07, "loss": 0.4877, "num_input_tokens_seen": 175317600, "step": 144165 }, { "epoch": 18.064152361859417, "grad_norm": 5.68737268447876, "learning_rate": 2.8272056917735433e-07, "loss": 0.4046, "num_input_tokens_seen": 175323680, "step": 144170 }, { "epoch": 18.0647788497682, "grad_norm": 26.319772720336914, "learning_rate": 2.8253936277869656e-07, "loss": 0.4997, "num_input_tokens_seen": 175329888, "step": 144175 }, { "epoch": 18.06540533767698, "grad_norm": 3.8512299060821533, "learning_rate": 2.823582127812463e-07, "loss": 0.6147, "num_input_tokens_seen": 175336288, "step": 144180 }, { "epoch": 18.066031825585767, "grad_norm": 9.442584037780762, "learning_rate": 2.821771191871708e-07, "loss": 0.4793, "num_input_tokens_seen": 175341344, "step": 144185 }, { "epoch": 18.06665831349455, "grad_norm": 5.593336582183838, "learning_rate": 2.8199608199863335e-07, "loss": 0.4401, "num_input_tokens_seen": 175347296, "step": 144190 }, { "epoch": 18.067284801403332, "grad_norm": 6.83065128326416, "learning_rate": 2.818151012178e-07, "loss": 0.4478, "num_input_tokens_seen": 175353504, "step": 144195 }, { "epoch": 18.067911289312118, "grad_norm": 5.228863716125488, "learning_rate": 2.816341768468345e-07, "loss": 0.4103, "num_input_tokens_seen": 175359456, "step": 144200 }, { "epoch": 18.0685377772209, "grad_norm": 16.629772186279297, "learning_rate": 2.814533088878985e-07, "loss": 0.4226, "num_input_tokens_seen": 175365792, "step": 144205 }, { "epoch": 18.069164265129682, "grad_norm": 6.397195816040039, "learning_rate": 2.8127249734315574e-07, "loss": 0.4742, "num_input_tokens_seen": 175372032, "step": 144210 }, { "epoch": 18.069790753038465, "grad_norm": 7.38091516494751, "learning_rate": 2.8109174221476743e-07, "loss": 0.3805, "num_input_tokens_seen": 175378080, "step": 144215 }, { "epoch": 18.07041724094725, "grad_norm": 4.573322772979736, "learning_rate": 2.809110435048945e-07, "loss": 0.4297, "num_input_tokens_seen": 175384000, "step": 144220 }, { "epoch": 18.071043728856033, "grad_norm": 8.691218376159668, "learning_rate": 2.807304012156975e-07, "loss": 0.4181, "num_input_tokens_seen": 175389856, "step": 144225 }, { "epoch": 18.071670216764815, "grad_norm": 4.5424418449401855, "learning_rate": 2.805498153493369e-07, "loss": 0.4024, "num_input_tokens_seen": 175395232, "step": 144230 }, { "epoch": 18.0722967046736, "grad_norm": 5.953023910522461, "learning_rate": 2.8036928590797095e-07, "loss": 0.4279, "num_input_tokens_seen": 175401440, "step": 144235 }, { "epoch": 18.072923192582383, "grad_norm": 8.705883026123047, "learning_rate": 2.8018881289375857e-07, "loss": 0.4283, "num_input_tokens_seen": 175407712, "step": 144240 }, { "epoch": 18.073549680491166, "grad_norm": 7.872920989990234, "learning_rate": 2.800083963088568e-07, "loss": 0.3998, "num_input_tokens_seen": 175413952, "step": 144245 }, { "epoch": 18.07417616839995, "grad_norm": 15.556374549865723, "learning_rate": 2.798280361554223e-07, "loss": 0.4914, "num_input_tokens_seen": 175420064, "step": 144250 }, { "epoch": 18.074802656308734, "grad_norm": 6.653556823730469, "learning_rate": 2.796477324356134e-07, "loss": 0.4321, "num_input_tokens_seen": 175426368, "step": 144255 }, { "epoch": 18.075429144217516, "grad_norm": 10.623513221740723, "learning_rate": 2.7946748515158383e-07, "loss": 0.4463, "num_input_tokens_seen": 175432928, "step": 144260 }, { "epoch": 18.0760556321263, "grad_norm": 5.136168003082275, "learning_rate": 2.792872943054897e-07, "loss": 0.5509, "num_input_tokens_seen": 175438816, "step": 144265 }, { "epoch": 18.076682120035084, "grad_norm": 7.616183280944824, "learning_rate": 2.7910715989948433e-07, "loss": 0.4677, "num_input_tokens_seen": 175444768, "step": 144270 }, { "epoch": 18.077308607943866, "grad_norm": 13.65654182434082, "learning_rate": 2.7892708193572314e-07, "loss": 0.434, "num_input_tokens_seen": 175451200, "step": 144275 }, { "epoch": 18.07793509585265, "grad_norm": 3.3406102657318115, "learning_rate": 2.7874706041635667e-07, "loss": 0.416, "num_input_tokens_seen": 175456736, "step": 144280 }, { "epoch": 18.078561583761434, "grad_norm": 7.048240661621094, "learning_rate": 2.7856709534353934e-07, "loss": 0.4234, "num_input_tokens_seen": 175462656, "step": 144285 }, { "epoch": 18.079188071670217, "grad_norm": 20.300691604614258, "learning_rate": 2.78387186719421e-07, "loss": 0.4943, "num_input_tokens_seen": 175468832, "step": 144290 }, { "epoch": 18.079814559579, "grad_norm": 6.880227565765381, "learning_rate": 2.782073345461545e-07, "loss": 0.3956, "num_input_tokens_seen": 175474784, "step": 144295 }, { "epoch": 18.080441047487785, "grad_norm": 14.7724609375, "learning_rate": 2.780275388258885e-07, "loss": 0.4357, "num_input_tokens_seen": 175480576, "step": 144300 }, { "epoch": 18.081067535396567, "grad_norm": 12.453506469726562, "learning_rate": 2.778477995607742e-07, "loss": 0.3872, "num_input_tokens_seen": 175486976, "step": 144305 }, { "epoch": 18.08169402330535, "grad_norm": 5.452236652374268, "learning_rate": 2.776681167529588e-07, "loss": 0.4316, "num_input_tokens_seen": 175493216, "step": 144310 }, { "epoch": 18.082320511214135, "grad_norm": 4.370434761047363, "learning_rate": 2.77488490404591e-07, "loss": 0.4531, "num_input_tokens_seen": 175499296, "step": 144315 }, { "epoch": 18.082946999122917, "grad_norm": 6.511350154876709, "learning_rate": 2.7730892051781976e-07, "loss": 0.4499, "num_input_tokens_seen": 175505536, "step": 144320 }, { "epoch": 18.0835734870317, "grad_norm": 9.953269958496094, "learning_rate": 2.771294070947905e-07, "loss": 0.4527, "num_input_tokens_seen": 175511424, "step": 144325 }, { "epoch": 18.084199974940482, "grad_norm": 10.21981143951416, "learning_rate": 2.7694995013764993e-07, "loss": 0.4958, "num_input_tokens_seen": 175517440, "step": 144330 }, { "epoch": 18.084826462849268, "grad_norm": 7.9699554443359375, "learning_rate": 2.767705496485429e-07, "loss": 0.4348, "num_input_tokens_seen": 175523776, "step": 144335 }, { "epoch": 18.08545295075805, "grad_norm": 24.590747833251953, "learning_rate": 2.7659120562961663e-07, "loss": 0.4603, "num_input_tokens_seen": 175530016, "step": 144340 }, { "epoch": 18.086079438666832, "grad_norm": 17.417600631713867, "learning_rate": 2.764119180830127e-07, "loss": 0.4738, "num_input_tokens_seen": 175536192, "step": 144345 }, { "epoch": 18.08670592657562, "grad_norm": 14.618321418762207, "learning_rate": 2.7623268701087615e-07, "loss": 0.4199, "num_input_tokens_seen": 175542240, "step": 144350 }, { "epoch": 18.0873324144844, "grad_norm": 6.36583137512207, "learning_rate": 2.7605351241534903e-07, "loss": 0.4379, "num_input_tokens_seen": 175548704, "step": 144355 }, { "epoch": 18.087958902393183, "grad_norm": 12.047505378723145, "learning_rate": 2.758743942985742e-07, "loss": 0.4702, "num_input_tokens_seen": 175554944, "step": 144360 }, { "epoch": 18.08858539030197, "grad_norm": 9.756662368774414, "learning_rate": 2.7569533266269213e-07, "loss": 0.4012, "num_input_tokens_seen": 175560448, "step": 144365 }, { "epoch": 18.08921187821075, "grad_norm": 5.233820915222168, "learning_rate": 2.755163275098449e-07, "loss": 0.4277, "num_input_tokens_seen": 175566304, "step": 144370 }, { "epoch": 18.089838366119533, "grad_norm": 35.48137283325195, "learning_rate": 2.7533737884217203e-07, "loss": 0.5206, "num_input_tokens_seen": 175572384, "step": 144375 }, { "epoch": 18.090464854028316, "grad_norm": 6.0478835105896, "learning_rate": 2.7515848666181236e-07, "loss": 0.4748, "num_input_tokens_seen": 175578112, "step": 144380 }, { "epoch": 18.0910913419371, "grad_norm": 6.442427158355713, "learning_rate": 2.7497965097090685e-07, "loss": 0.5346, "num_input_tokens_seen": 175584224, "step": 144385 }, { "epoch": 18.091717829845884, "grad_norm": 4.632148742675781, "learning_rate": 2.748008717715911e-07, "loss": 0.4436, "num_input_tokens_seen": 175590144, "step": 144390 }, { "epoch": 18.092344317754666, "grad_norm": 5.325977325439453, "learning_rate": 2.7462214906600395e-07, "loss": 0.4599, "num_input_tokens_seen": 175596384, "step": 144395 }, { "epoch": 18.09297080566345, "grad_norm": 6.186206340789795, "learning_rate": 2.7444348285628196e-07, "loss": 0.4306, "num_input_tokens_seen": 175602560, "step": 144400 }, { "epoch": 18.093597293572234, "grad_norm": 19.4103946685791, "learning_rate": 2.742648731445613e-07, "loss": 0.5521, "num_input_tokens_seen": 175608576, "step": 144405 }, { "epoch": 18.094223781481016, "grad_norm": 4.500852108001709, "learning_rate": 2.7408631993297686e-07, "loss": 0.4369, "num_input_tokens_seen": 175614592, "step": 144410 }, { "epoch": 18.094850269389802, "grad_norm": 7.1560258865356445, "learning_rate": 2.7390782322366417e-07, "loss": 0.4001, "num_input_tokens_seen": 175620608, "step": 144415 }, { "epoch": 18.095476757298584, "grad_norm": 14.364279747009277, "learning_rate": 2.7372938301875653e-07, "loss": 0.4642, "num_input_tokens_seen": 175626304, "step": 144420 }, { "epoch": 18.096103245207367, "grad_norm": 3.7971487045288086, "learning_rate": 2.7355099932038776e-07, "loss": 0.4158, "num_input_tokens_seen": 175632416, "step": 144425 }, { "epoch": 18.096729733116153, "grad_norm": 3.9852514266967773, "learning_rate": 2.7337267213069006e-07, "loss": 0.4127, "num_input_tokens_seen": 175638944, "step": 144430 }, { "epoch": 18.097356221024935, "grad_norm": 9.131773948669434, "learning_rate": 2.731944014517973e-07, "loss": 0.4068, "num_input_tokens_seen": 175645216, "step": 144435 }, { "epoch": 18.097982708933717, "grad_norm": 19.464092254638672, "learning_rate": 2.730161872858389e-07, "loss": 0.4602, "num_input_tokens_seen": 175651584, "step": 144440 }, { "epoch": 18.0986091968425, "grad_norm": 4.454925060272217, "learning_rate": 2.728380296349459e-07, "loss": 0.3936, "num_input_tokens_seen": 175657568, "step": 144445 }, { "epoch": 18.099235684751285, "grad_norm": 8.271430015563965, "learning_rate": 2.7265992850124933e-07, "loss": 0.4493, "num_input_tokens_seen": 175663488, "step": 144450 }, { "epoch": 18.099862172660067, "grad_norm": 4.948551654815674, "learning_rate": 2.72481883886877e-07, "loss": 0.443, "num_input_tokens_seen": 175669632, "step": 144455 }, { "epoch": 18.10048866056885, "grad_norm": 19.38121223449707, "learning_rate": 2.723038957939594e-07, "loss": 0.4986, "num_input_tokens_seen": 175675328, "step": 144460 }, { "epoch": 18.101115148477636, "grad_norm": 5.043856143951416, "learning_rate": 2.7212596422462267e-07, "loss": 0.4309, "num_input_tokens_seen": 175681408, "step": 144465 }, { "epoch": 18.101741636386418, "grad_norm": 5.623944282531738, "learning_rate": 2.719480891809961e-07, "loss": 0.4535, "num_input_tokens_seen": 175687456, "step": 144470 }, { "epoch": 18.1023681242952, "grad_norm": 5.70167350769043, "learning_rate": 2.7177027066520423e-07, "loss": 0.4149, "num_input_tokens_seen": 175693888, "step": 144475 }, { "epoch": 18.102994612203986, "grad_norm": 13.842988014221191, "learning_rate": 2.715925086793747e-07, "loss": 0.4661, "num_input_tokens_seen": 175699936, "step": 144480 }, { "epoch": 18.10362110011277, "grad_norm": 4.83145809173584, "learning_rate": 2.714148032256314e-07, "loss": 0.4989, "num_input_tokens_seen": 175705984, "step": 144485 }, { "epoch": 18.10424758802155, "grad_norm": 11.460617065429688, "learning_rate": 2.7123715430610096e-07, "loss": 0.4547, "num_input_tokens_seen": 175712160, "step": 144490 }, { "epoch": 18.104874075930333, "grad_norm": 4.4104790687561035, "learning_rate": 2.7105956192290507e-07, "loss": 0.4208, "num_input_tokens_seen": 175718016, "step": 144495 }, { "epoch": 18.10550056383912, "grad_norm": 21.972412109375, "learning_rate": 2.708820260781675e-07, "loss": 0.4841, "num_input_tokens_seen": 175723936, "step": 144500 }, { "epoch": 18.1061270517479, "grad_norm": 5.2744245529174805, "learning_rate": 2.707045467740127e-07, "loss": 0.4505, "num_input_tokens_seen": 175730240, "step": 144505 }, { "epoch": 18.106753539656683, "grad_norm": 4.434210300445557, "learning_rate": 2.705271240125601e-07, "loss": 0.456, "num_input_tokens_seen": 175736352, "step": 144510 }, { "epoch": 18.10738002756547, "grad_norm": 4.029216289520264, "learning_rate": 2.70349757795933e-07, "loss": 0.3944, "num_input_tokens_seen": 175742432, "step": 144515 }, { "epoch": 18.10800651547425, "grad_norm": 10.23626708984375, "learning_rate": 2.701724481262502e-07, "loss": 0.3998, "num_input_tokens_seen": 175748544, "step": 144520 }, { "epoch": 18.108633003383034, "grad_norm": 4.398923397064209, "learning_rate": 2.6999519500563234e-07, "loss": 0.4208, "num_input_tokens_seen": 175754432, "step": 144525 }, { "epoch": 18.10925949129182, "grad_norm": 7.7816162109375, "learning_rate": 2.6981799843619926e-07, "loss": 0.423, "num_input_tokens_seen": 175760672, "step": 144530 }, { "epoch": 18.1098859792006, "grad_norm": 5.354947090148926, "learning_rate": 2.696408584200688e-07, "loss": 0.4738, "num_input_tokens_seen": 175766912, "step": 144535 }, { "epoch": 18.110512467109384, "grad_norm": 5.390902996063232, "learning_rate": 2.6946377495935816e-07, "loss": 0.4512, "num_input_tokens_seen": 175772768, "step": 144540 }, { "epoch": 18.11113895501817, "grad_norm": 7.029723167419434, "learning_rate": 2.692867480561867e-07, "loss": 0.4783, "num_input_tokens_seen": 175778496, "step": 144545 }, { "epoch": 18.111765442926952, "grad_norm": 23.663301467895508, "learning_rate": 2.691097777126683e-07, "loss": 0.456, "num_input_tokens_seen": 175784768, "step": 144550 }, { "epoch": 18.112391930835734, "grad_norm": 6.439579010009766, "learning_rate": 2.689328639309213e-07, "loss": 0.5124, "num_input_tokens_seen": 175791200, "step": 144555 }, { "epoch": 18.113018418744517, "grad_norm": 3.9943788051605225, "learning_rate": 2.687560067130585e-07, "loss": 0.4566, "num_input_tokens_seen": 175797376, "step": 144560 }, { "epoch": 18.113644906653303, "grad_norm": 16.66429901123047, "learning_rate": 2.6857920606119524e-07, "loss": 0.4658, "num_input_tokens_seen": 175803264, "step": 144565 }, { "epoch": 18.114271394562085, "grad_norm": 5.012002944946289, "learning_rate": 2.6840246197744667e-07, "loss": 0.5207, "num_input_tokens_seen": 175809408, "step": 144570 }, { "epoch": 18.114897882470867, "grad_norm": 15.71320915222168, "learning_rate": 2.6822577446392385e-07, "loss": 0.4533, "num_input_tokens_seen": 175815744, "step": 144575 }, { "epoch": 18.115524370379653, "grad_norm": 3.666633129119873, "learning_rate": 2.680491435227411e-07, "loss": 0.4416, "num_input_tokens_seen": 175821984, "step": 144580 }, { "epoch": 18.116150858288435, "grad_norm": 19.315013885498047, "learning_rate": 2.67872569156008e-07, "loss": 0.4874, "num_input_tokens_seen": 175828096, "step": 144585 }, { "epoch": 18.116777346197217, "grad_norm": 2.8761909008026123, "learning_rate": 2.676960513658383e-07, "loss": 0.4542, "num_input_tokens_seen": 175834432, "step": 144590 }, { "epoch": 18.117403834106003, "grad_norm": 22.526994705200195, "learning_rate": 2.6751959015433983e-07, "loss": 0.4685, "num_input_tokens_seen": 175840544, "step": 144595 }, { "epoch": 18.118030322014786, "grad_norm": 13.873865127563477, "learning_rate": 2.6734318552362414e-07, "loss": 0.4699, "num_input_tokens_seen": 175846592, "step": 144600 }, { "epoch": 18.118656809923568, "grad_norm": 6.55513334274292, "learning_rate": 2.671668374757991e-07, "loss": 0.4339, "num_input_tokens_seen": 175852832, "step": 144605 }, { "epoch": 18.11928329783235, "grad_norm": 22.760831832885742, "learning_rate": 2.669905460129746e-07, "loss": 0.47, "num_input_tokens_seen": 175859328, "step": 144610 }, { "epoch": 18.119909785741136, "grad_norm": 12.089612007141113, "learning_rate": 2.668143111372568e-07, "loss": 0.4093, "num_input_tokens_seen": 175865312, "step": 144615 }, { "epoch": 18.12053627364992, "grad_norm": 6.935822010040283, "learning_rate": 2.666381328507528e-07, "loss": 0.4921, "num_input_tokens_seen": 175871488, "step": 144620 }, { "epoch": 18.1211627615587, "grad_norm": 6.572680950164795, "learning_rate": 2.6646201115557047e-07, "loss": 0.3801, "num_input_tokens_seen": 175877504, "step": 144625 }, { "epoch": 18.121789249467486, "grad_norm": 4.525746822357178, "learning_rate": 2.6628594605381364e-07, "loss": 0.4173, "num_input_tokens_seen": 175883424, "step": 144630 }, { "epoch": 18.12241573737627, "grad_norm": 9.986448287963867, "learning_rate": 2.661099375475884e-07, "loss": 0.3864, "num_input_tokens_seen": 175889312, "step": 144635 }, { "epoch": 18.12304222528505, "grad_norm": 4.119117736816406, "learning_rate": 2.6593398563899975e-07, "loss": 0.4201, "num_input_tokens_seen": 175895392, "step": 144640 }, { "epoch": 18.123668713193837, "grad_norm": 10.524497985839844, "learning_rate": 2.6575809033014986e-07, "loss": 0.4739, "num_input_tokens_seen": 175901504, "step": 144645 }, { "epoch": 18.12429520110262, "grad_norm": 21.529277801513672, "learning_rate": 2.655822516231421e-07, "loss": 0.4402, "num_input_tokens_seen": 175907808, "step": 144650 }, { "epoch": 18.1249216890114, "grad_norm": 4.7623066902160645, "learning_rate": 2.6540646952007976e-07, "loss": 0.4248, "num_input_tokens_seen": 175913504, "step": 144655 }, { "epoch": 18.125548176920187, "grad_norm": 19.06557846069336, "learning_rate": 2.652307440230628e-07, "loss": 0.46, "num_input_tokens_seen": 175919648, "step": 144660 }, { "epoch": 18.12617466482897, "grad_norm": 6.629925727844238, "learning_rate": 2.6505507513419405e-07, "loss": 0.3758, "num_input_tokens_seen": 175925792, "step": 144665 }, { "epoch": 18.12680115273775, "grad_norm": 8.331964492797852, "learning_rate": 2.648794628555723e-07, "loss": 0.4722, "num_input_tokens_seen": 175931968, "step": 144670 }, { "epoch": 18.127427640646534, "grad_norm": 6.536118030548096, "learning_rate": 2.647039071892976e-07, "loss": 0.496, "num_input_tokens_seen": 175937984, "step": 144675 }, { "epoch": 18.12805412855532, "grad_norm": 4.192702293395996, "learning_rate": 2.6452840813746884e-07, "loss": 0.4174, "num_input_tokens_seen": 175943808, "step": 144680 }, { "epoch": 18.128680616464102, "grad_norm": 5.43107795715332, "learning_rate": 2.643529657021843e-07, "loss": 0.4297, "num_input_tokens_seen": 175949792, "step": 144685 }, { "epoch": 18.129307104372884, "grad_norm": 4.933820724487305, "learning_rate": 2.641775798855417e-07, "loss": 0.4667, "num_input_tokens_seen": 175955776, "step": 144690 }, { "epoch": 18.12993359228167, "grad_norm": 10.89222240447998, "learning_rate": 2.640022506896378e-07, "loss": 0.4217, "num_input_tokens_seen": 175961920, "step": 144695 }, { "epoch": 18.130560080190453, "grad_norm": 13.541147232055664, "learning_rate": 2.638269781165692e-07, "loss": 0.461, "num_input_tokens_seen": 175967904, "step": 144700 }, { "epoch": 18.131186568099235, "grad_norm": 3.8607006072998047, "learning_rate": 2.636517621684304e-07, "loss": 0.4664, "num_input_tokens_seen": 175973984, "step": 144705 }, { "epoch": 18.13181305600802, "grad_norm": 13.981234550476074, "learning_rate": 2.634766028473174e-07, "loss": 0.4426, "num_input_tokens_seen": 175980192, "step": 144710 }, { "epoch": 18.132439543916803, "grad_norm": 5.212531566619873, "learning_rate": 2.633015001553235e-07, "loss": 0.3937, "num_input_tokens_seen": 175986560, "step": 144715 }, { "epoch": 18.133066031825585, "grad_norm": 6.449211597442627, "learning_rate": 2.6312645409454275e-07, "loss": 0.4467, "num_input_tokens_seen": 175992832, "step": 144720 }, { "epoch": 18.133692519734367, "grad_norm": 8.32594108581543, "learning_rate": 2.629514646670672e-07, "loss": 0.4521, "num_input_tokens_seen": 175998848, "step": 144725 }, { "epoch": 18.134319007643153, "grad_norm": 7.221286296844482, "learning_rate": 2.6277653187498974e-07, "loss": 0.4477, "num_input_tokens_seen": 176004832, "step": 144730 }, { "epoch": 18.134945495551936, "grad_norm": 5.790024280548096, "learning_rate": 2.6260165572040253e-07, "loss": 0.4088, "num_input_tokens_seen": 176010880, "step": 144735 }, { "epoch": 18.135571983460718, "grad_norm": 5.882676124572754, "learning_rate": 2.62426836205395e-07, "loss": 0.5016, "num_input_tokens_seen": 176016992, "step": 144740 }, { "epoch": 18.136198471369504, "grad_norm": 33.52177047729492, "learning_rate": 2.6225207333205715e-07, "loss": 0.5086, "num_input_tokens_seen": 176022784, "step": 144745 }, { "epoch": 18.136824959278286, "grad_norm": 6.351577281951904, "learning_rate": 2.6207736710247956e-07, "loss": 0.4147, "num_input_tokens_seen": 176028640, "step": 144750 }, { "epoch": 18.13745144718707, "grad_norm": 16.393617630004883, "learning_rate": 2.619027175187511e-07, "loss": 0.5272, "num_input_tokens_seen": 176034880, "step": 144755 }, { "epoch": 18.138077935095854, "grad_norm": 16.61565399169922, "learning_rate": 2.617281245829584e-07, "loss": 0.4776, "num_input_tokens_seen": 176040896, "step": 144760 }, { "epoch": 18.138704423004636, "grad_norm": 5.945459365844727, "learning_rate": 2.6155358829719035e-07, "loss": 0.4013, "num_input_tokens_seen": 176046656, "step": 144765 }, { "epoch": 18.13933091091342, "grad_norm": 6.319753646850586, "learning_rate": 2.61379108663532e-07, "loss": 0.48, "num_input_tokens_seen": 176052640, "step": 144770 }, { "epoch": 18.139957398822204, "grad_norm": 4.240478992462158, "learning_rate": 2.6120468568407163e-07, "loss": 0.4643, "num_input_tokens_seen": 176058656, "step": 144775 }, { "epoch": 18.140583886730987, "grad_norm": 16.268638610839844, "learning_rate": 2.610303193608926e-07, "loss": 0.4888, "num_input_tokens_seen": 176064736, "step": 144780 }, { "epoch": 18.14121037463977, "grad_norm": 4.618423938751221, "learning_rate": 2.6085600969608103e-07, "loss": 0.4675, "num_input_tokens_seen": 176070976, "step": 144785 }, { "epoch": 18.14183686254855, "grad_norm": 7.439777374267578, "learning_rate": 2.606817566917197e-07, "loss": 0.4024, "num_input_tokens_seen": 176077280, "step": 144790 }, { "epoch": 18.142463350457337, "grad_norm": 8.349989891052246, "learning_rate": 2.60507560349893e-07, "loss": 0.4404, "num_input_tokens_seen": 176083168, "step": 144795 }, { "epoch": 18.14308983836612, "grad_norm": 4.804322242736816, "learning_rate": 2.603334206726826e-07, "loss": 0.4116, "num_input_tokens_seen": 176089568, "step": 144800 }, { "epoch": 18.1437163262749, "grad_norm": 9.833264350891113, "learning_rate": 2.601593376621714e-07, "loss": 0.4811, "num_input_tokens_seen": 176095456, "step": 144805 }, { "epoch": 18.144342814183688, "grad_norm": 13.17553997039795, "learning_rate": 2.5998531132044037e-07, "loss": 0.4742, "num_input_tokens_seen": 176101664, "step": 144810 }, { "epoch": 18.14496930209247, "grad_norm": 33.70292663574219, "learning_rate": 2.5981134164957013e-07, "loss": 0.4687, "num_input_tokens_seen": 176107488, "step": 144815 }, { "epoch": 18.145595790001252, "grad_norm": 7.199434280395508, "learning_rate": 2.5963742865164066e-07, "loss": 0.4039, "num_input_tokens_seen": 176113536, "step": 144820 }, { "epoch": 18.146222277910038, "grad_norm": 4.376612186431885, "learning_rate": 2.5946357232873085e-07, "loss": 0.4728, "num_input_tokens_seen": 176119840, "step": 144825 }, { "epoch": 18.14684876581882, "grad_norm": 12.917223930358887, "learning_rate": 2.5928977268292075e-07, "loss": 0.4813, "num_input_tokens_seen": 176126112, "step": 144830 }, { "epoch": 18.147475253727603, "grad_norm": 3.9703338146209717, "learning_rate": 2.591160297162859e-07, "loss": 0.4601, "num_input_tokens_seen": 176132256, "step": 144835 }, { "epoch": 18.148101741636385, "grad_norm": 2.87859845161438, "learning_rate": 2.589423434309052e-07, "loss": 0.5168, "num_input_tokens_seen": 176137984, "step": 144840 }, { "epoch": 18.14872822954517, "grad_norm": 15.980480194091797, "learning_rate": 2.587687138288547e-07, "loss": 0.4105, "num_input_tokens_seen": 176143776, "step": 144845 }, { "epoch": 18.149354717453953, "grad_norm": 8.85314655303955, "learning_rate": 2.585951409122112e-07, "loss": 0.4724, "num_input_tokens_seen": 176149856, "step": 144850 }, { "epoch": 18.149981205362735, "grad_norm": 3.1958158016204834, "learning_rate": 2.5842162468304845e-07, "loss": 0.5465, "num_input_tokens_seen": 176155872, "step": 144855 }, { "epoch": 18.15060769327152, "grad_norm": 18.843433380126953, "learning_rate": 2.582481651434421e-07, "loss": 0.5084, "num_input_tokens_seen": 176162016, "step": 144860 }, { "epoch": 18.151234181180303, "grad_norm": 4.1786322593688965, "learning_rate": 2.580747622954649e-07, "loss": 0.4643, "num_input_tokens_seen": 176168160, "step": 144865 }, { "epoch": 18.151860669089086, "grad_norm": 6.513335704803467, "learning_rate": 2.579014161411908e-07, "loss": 0.4812, "num_input_tokens_seen": 176174176, "step": 144870 }, { "epoch": 18.15248715699787, "grad_norm": 5.178064346313477, "learning_rate": 2.577281266826931e-07, "loss": 0.4365, "num_input_tokens_seen": 176179808, "step": 144875 }, { "epoch": 18.153113644906654, "grad_norm": 9.12425422668457, "learning_rate": 2.575548939220418e-07, "loss": 0.5236, "num_input_tokens_seen": 176185888, "step": 144880 }, { "epoch": 18.153740132815436, "grad_norm": 6.1657538414001465, "learning_rate": 2.573817178613103e-07, "loss": 0.4616, "num_input_tokens_seen": 176191904, "step": 144885 }, { "epoch": 18.15436662072422, "grad_norm": 3.1763744354248047, "learning_rate": 2.572085985025663e-07, "loss": 0.3714, "num_input_tokens_seen": 176197440, "step": 144890 }, { "epoch": 18.154993108633004, "grad_norm": 3.227410078048706, "learning_rate": 2.570355358478821e-07, "loss": 0.4446, "num_input_tokens_seen": 176203840, "step": 144895 }, { "epoch": 18.155619596541786, "grad_norm": 5.445268630981445, "learning_rate": 2.568625298993249e-07, "loss": 0.4133, "num_input_tokens_seen": 176210080, "step": 144900 }, { "epoch": 18.15624608445057, "grad_norm": 4.78477668762207, "learning_rate": 2.5668958065896476e-07, "loss": 0.5068, "num_input_tokens_seen": 176216512, "step": 144905 }, { "epoch": 18.156872572359354, "grad_norm": 5.571083068847656, "learning_rate": 2.565166881288678e-07, "loss": 0.428, "num_input_tokens_seen": 176222720, "step": 144910 }, { "epoch": 18.157499060268137, "grad_norm": 11.925517082214355, "learning_rate": 2.5634385231110226e-07, "loss": 0.529, "num_input_tokens_seen": 176228480, "step": 144915 }, { "epoch": 18.15812554817692, "grad_norm": 22.006113052368164, "learning_rate": 2.561710732077344e-07, "loss": 0.4547, "num_input_tokens_seen": 176234848, "step": 144920 }, { "epoch": 18.158752036085705, "grad_norm": 4.423760890960693, "learning_rate": 2.559983508208297e-07, "loss": 0.4369, "num_input_tokens_seen": 176240320, "step": 144925 }, { "epoch": 18.159378523994487, "grad_norm": 6.794974327087402, "learning_rate": 2.5582568515245264e-07, "loss": 0.4269, "num_input_tokens_seen": 176246208, "step": 144930 }, { "epoch": 18.16000501190327, "grad_norm": 5.887693881988525, "learning_rate": 2.556530762046683e-07, "loss": 0.4341, "num_input_tokens_seen": 176252352, "step": 144935 }, { "epoch": 18.160631499812055, "grad_norm": 13.509218215942383, "learning_rate": 2.5548052397954103e-07, "loss": 0.4188, "num_input_tokens_seen": 176258336, "step": 144940 }, { "epoch": 18.161257987720838, "grad_norm": 9.358206748962402, "learning_rate": 2.553080284791326e-07, "loss": 0.4371, "num_input_tokens_seen": 176264064, "step": 144945 }, { "epoch": 18.16188447562962, "grad_norm": 4.849619388580322, "learning_rate": 2.551355897055058e-07, "loss": 0.4674, "num_input_tokens_seen": 176269888, "step": 144950 }, { "epoch": 18.162510963538402, "grad_norm": 11.322663307189941, "learning_rate": 2.549632076607217e-07, "loss": 0.4226, "num_input_tokens_seen": 176276256, "step": 144955 }, { "epoch": 18.163137451447188, "grad_norm": 26.313152313232422, "learning_rate": 2.5479088234684315e-07, "loss": 0.4652, "num_input_tokens_seen": 176282496, "step": 144960 }, { "epoch": 18.16376393935597, "grad_norm": 5.519413471221924, "learning_rate": 2.5461861376592847e-07, "loss": 0.4586, "num_input_tokens_seen": 176288544, "step": 144965 }, { "epoch": 18.164390427264753, "grad_norm": 3.469074249267578, "learning_rate": 2.544464019200382e-07, "loss": 0.4811, "num_input_tokens_seen": 176294816, "step": 144970 }, { "epoch": 18.16501691517354, "grad_norm": 2.9265553951263428, "learning_rate": 2.5427424681123083e-07, "loss": 0.4184, "num_input_tokens_seen": 176300704, "step": 144975 }, { "epoch": 18.16564340308232, "grad_norm": 4.151432991027832, "learning_rate": 2.541021484415651e-07, "loss": 0.4233, "num_input_tokens_seen": 176306208, "step": 144980 }, { "epoch": 18.166269890991103, "grad_norm": 5.027901649475098, "learning_rate": 2.539301068130978e-07, "loss": 0.4331, "num_input_tokens_seen": 176312288, "step": 144985 }, { "epoch": 18.16689637889989, "grad_norm": 15.7142333984375, "learning_rate": 2.537581219278867e-07, "loss": 0.4247, "num_input_tokens_seen": 176318560, "step": 144990 }, { "epoch": 18.16752286680867, "grad_norm": 4.150732517242432, "learning_rate": 2.5358619378798744e-07, "loss": 0.4249, "num_input_tokens_seen": 176324608, "step": 144995 }, { "epoch": 18.168149354717453, "grad_norm": 5.146249771118164, "learning_rate": 2.534143223954555e-07, "loss": 0.4453, "num_input_tokens_seen": 176330656, "step": 145000 }, { "epoch": 18.168775842626236, "grad_norm": 5.6874237060546875, "learning_rate": 2.5324250775234705e-07, "loss": 0.4382, "num_input_tokens_seen": 176336768, "step": 145005 }, { "epoch": 18.16940233053502, "grad_norm": 6.494210720062256, "learning_rate": 2.530707498607143e-07, "loss": 0.4935, "num_input_tokens_seen": 176342880, "step": 145010 }, { "epoch": 18.170028818443804, "grad_norm": 8.394477844238281, "learning_rate": 2.5289904872261295e-07, "loss": 0.3828, "num_input_tokens_seen": 176349376, "step": 145015 }, { "epoch": 18.170655306352586, "grad_norm": 28.30556869506836, "learning_rate": 2.527274043400935e-07, "loss": 0.4371, "num_input_tokens_seen": 176355584, "step": 145020 }, { "epoch": 18.171281794261372, "grad_norm": 13.86140251159668, "learning_rate": 2.525558167152098e-07, "loss": 0.4379, "num_input_tokens_seen": 176360896, "step": 145025 }, { "epoch": 18.171908282170154, "grad_norm": 9.066948890686035, "learning_rate": 2.52384285850012e-07, "loss": 0.4321, "num_input_tokens_seen": 176366688, "step": 145030 }, { "epoch": 18.172534770078936, "grad_norm": 31.034719467163086, "learning_rate": 2.5221281174655275e-07, "loss": 0.5054, "num_input_tokens_seen": 176372544, "step": 145035 }, { "epoch": 18.173161257987722, "grad_norm": 16.16948890686035, "learning_rate": 2.5204139440688e-07, "loss": 0.4772, "num_input_tokens_seen": 176378720, "step": 145040 }, { "epoch": 18.173787745896504, "grad_norm": 17.971519470214844, "learning_rate": 2.518700338330443e-07, "loss": 0.4317, "num_input_tokens_seen": 176384544, "step": 145045 }, { "epoch": 18.174414233805287, "grad_norm": 5.255812644958496, "learning_rate": 2.5169873002709453e-07, "loss": 0.4232, "num_input_tokens_seen": 176390464, "step": 145050 }, { "epoch": 18.175040721714073, "grad_norm": 8.724075317382812, "learning_rate": 2.515274829910791e-07, "loss": 0.4537, "num_input_tokens_seen": 176396576, "step": 145055 }, { "epoch": 18.175667209622855, "grad_norm": 9.518674850463867, "learning_rate": 2.5135629272704463e-07, "loss": 0.4706, "num_input_tokens_seen": 176401856, "step": 145060 }, { "epoch": 18.176293697531637, "grad_norm": 12.981438636779785, "learning_rate": 2.511851592370379e-07, "loss": 0.5039, "num_input_tokens_seen": 176407936, "step": 145065 }, { "epoch": 18.17692018544042, "grad_norm": 17.41501235961914, "learning_rate": 2.5101408252310553e-07, "loss": 0.5047, "num_input_tokens_seen": 176414304, "step": 145070 }, { "epoch": 18.177546673349205, "grad_norm": 15.282402992248535, "learning_rate": 2.5084306258729207e-07, "loss": 0.4278, "num_input_tokens_seen": 176420640, "step": 145075 }, { "epoch": 18.178173161257988, "grad_norm": 10.654417991638184, "learning_rate": 2.506720994316436e-07, "loss": 0.4535, "num_input_tokens_seen": 176426880, "step": 145080 }, { "epoch": 18.17879964916677, "grad_norm": 23.401363372802734, "learning_rate": 2.5050119305820187e-07, "loss": 0.4523, "num_input_tokens_seen": 176432448, "step": 145085 }, { "epoch": 18.179426137075556, "grad_norm": 6.7536115646362305, "learning_rate": 2.5033034346901296e-07, "loss": 0.4169, "num_input_tokens_seen": 176438912, "step": 145090 }, { "epoch": 18.180052624984338, "grad_norm": 4.6594672203063965, "learning_rate": 2.5015955066611695e-07, "loss": 0.4195, "num_input_tokens_seen": 176445056, "step": 145095 }, { "epoch": 18.18067911289312, "grad_norm": 16.920120239257812, "learning_rate": 2.4998881465155777e-07, "loss": 0.5072, "num_input_tokens_seen": 176451392, "step": 145100 }, { "epoch": 18.181305600801906, "grad_norm": 7.134270668029785, "learning_rate": 2.498181354273749e-07, "loss": 0.4604, "num_input_tokens_seen": 176457856, "step": 145105 }, { "epoch": 18.18193208871069, "grad_norm": 7.071461200714111, "learning_rate": 2.496475129956111e-07, "loss": 0.4805, "num_input_tokens_seen": 176463872, "step": 145110 }, { "epoch": 18.18255857661947, "grad_norm": 10.18769645690918, "learning_rate": 2.4947694735830417e-07, "loss": 0.4108, "num_input_tokens_seen": 176470208, "step": 145115 }, { "epoch": 18.183185064528253, "grad_norm": 4.907618999481201, "learning_rate": 2.4930643851749483e-07, "loss": 0.4892, "num_input_tokens_seen": 176475904, "step": 145120 }, { "epoch": 18.18381155243704, "grad_norm": 4.071300506591797, "learning_rate": 2.491359864752213e-07, "loss": 0.4964, "num_input_tokens_seen": 176481760, "step": 145125 }, { "epoch": 18.18443804034582, "grad_norm": 6.198493480682373, "learning_rate": 2.4896559123352093e-07, "loss": 0.4935, "num_input_tokens_seen": 176488032, "step": 145130 }, { "epoch": 18.185064528254603, "grad_norm": 3.2714662551879883, "learning_rate": 2.487952527944321e-07, "loss": 0.4393, "num_input_tokens_seen": 176494240, "step": 145135 }, { "epoch": 18.18569101616339, "grad_norm": 3.8598556518554688, "learning_rate": 2.4862497115998975e-07, "loss": 0.3898, "num_input_tokens_seen": 176500320, "step": 145140 }, { "epoch": 18.18631750407217, "grad_norm": 6.479445934295654, "learning_rate": 2.4845474633223066e-07, "loss": 0.3841, "num_input_tokens_seen": 176506272, "step": 145145 }, { "epoch": 18.186943991980954, "grad_norm": 5.350576400756836, "learning_rate": 2.48284578313191e-07, "loss": 0.4007, "num_input_tokens_seen": 176512384, "step": 145150 }, { "epoch": 18.18757047988974, "grad_norm": 4.097391128540039, "learning_rate": 2.4811446710490293e-07, "loss": 0.5106, "num_input_tokens_seen": 176518592, "step": 145155 }, { "epoch": 18.188196967798522, "grad_norm": 13.74234390258789, "learning_rate": 2.4794441270940216e-07, "loss": 0.4439, "num_input_tokens_seen": 176524672, "step": 145160 }, { "epoch": 18.188823455707304, "grad_norm": 6.161200523376465, "learning_rate": 2.4777441512872146e-07, "loss": 0.4375, "num_input_tokens_seen": 176531296, "step": 145165 }, { "epoch": 18.18944994361609, "grad_norm": 4.6720781326293945, "learning_rate": 2.476044743648925e-07, "loss": 0.4138, "num_input_tokens_seen": 176537344, "step": 145170 }, { "epoch": 18.190076431524872, "grad_norm": 4.223610877990723, "learning_rate": 2.474345904199488e-07, "loss": 0.4659, "num_input_tokens_seen": 176543424, "step": 145175 }, { "epoch": 18.190702919433654, "grad_norm": 21.535259246826172, "learning_rate": 2.472647632959191e-07, "loss": 0.4561, "num_input_tokens_seen": 176549024, "step": 145180 }, { "epoch": 18.191329407342437, "grad_norm": 21.590255737304688, "learning_rate": 2.470949929948352e-07, "loss": 0.4415, "num_input_tokens_seen": 176555232, "step": 145185 }, { "epoch": 18.191955895251223, "grad_norm": 5.455982208251953, "learning_rate": 2.4692527951872713e-07, "loss": 0.4588, "num_input_tokens_seen": 176561312, "step": 145190 }, { "epoch": 18.192582383160005, "grad_norm": 8.620133399963379, "learning_rate": 2.467556228696233e-07, "loss": 0.4633, "num_input_tokens_seen": 176567392, "step": 145195 }, { "epoch": 18.193208871068787, "grad_norm": 5.846451282501221, "learning_rate": 2.465860230495526e-07, "loss": 0.4309, "num_input_tokens_seen": 176573472, "step": 145200 }, { "epoch": 18.193835358977573, "grad_norm": 29.361825942993164, "learning_rate": 2.464164800605423e-07, "loss": 0.5412, "num_input_tokens_seen": 176579808, "step": 145205 }, { "epoch": 18.194461846886355, "grad_norm": 11.604058265686035, "learning_rate": 2.462469939046197e-07, "loss": 0.4429, "num_input_tokens_seen": 176586048, "step": 145210 }, { "epoch": 18.195088334795138, "grad_norm": 6.671835422515869, "learning_rate": 2.4607756458381036e-07, "loss": 0.4625, "num_input_tokens_seen": 176592000, "step": 145215 }, { "epoch": 18.195714822703923, "grad_norm": 22.318601608276367, "learning_rate": 2.459081921001416e-07, "loss": 0.462, "num_input_tokens_seen": 176597888, "step": 145220 }, { "epoch": 18.196341310612706, "grad_norm": 16.79924201965332, "learning_rate": 2.457388764556362e-07, "loss": 0.4249, "num_input_tokens_seen": 176604096, "step": 145225 }, { "epoch": 18.196967798521488, "grad_norm": 14.929754257202148, "learning_rate": 2.455696176523209e-07, "loss": 0.4968, "num_input_tokens_seen": 176610016, "step": 145230 }, { "epoch": 18.19759428643027, "grad_norm": 2.9347355365753174, "learning_rate": 2.454004156922174e-07, "loss": 0.4489, "num_input_tokens_seen": 176615360, "step": 145235 }, { "epoch": 18.198220774339056, "grad_norm": 5.301558017730713, "learning_rate": 2.4523127057734907e-07, "loss": 0.4715, "num_input_tokens_seen": 176621536, "step": 145240 }, { "epoch": 18.19884726224784, "grad_norm": 6.367093563079834, "learning_rate": 2.4506218230973933e-07, "loss": 0.4373, "num_input_tokens_seen": 176628000, "step": 145245 }, { "epoch": 18.19947375015662, "grad_norm": 17.1795597076416, "learning_rate": 2.4489315089140875e-07, "loss": 0.5587, "num_input_tokens_seen": 176633696, "step": 145250 }, { "epoch": 18.200100238065406, "grad_norm": 12.210886001586914, "learning_rate": 2.4472417632437793e-07, "loss": 0.4506, "num_input_tokens_seen": 176639968, "step": 145255 }, { "epoch": 18.20072672597419, "grad_norm": 4.682960510253906, "learning_rate": 2.445552586106681e-07, "loss": 0.4823, "num_input_tokens_seen": 176646016, "step": 145260 }, { "epoch": 18.20135321388297, "grad_norm": 7.515878677368164, "learning_rate": 2.4438639775229754e-07, "loss": 0.419, "num_input_tokens_seen": 176652224, "step": 145265 }, { "epoch": 18.201979701791757, "grad_norm": 8.427062034606934, "learning_rate": 2.442175937512864e-07, "loss": 0.3708, "num_input_tokens_seen": 176657920, "step": 145270 }, { "epoch": 18.20260618970054, "grad_norm": 3.6006457805633545, "learning_rate": 2.44048846609653e-07, "loss": 0.4212, "num_input_tokens_seen": 176663904, "step": 145275 }, { "epoch": 18.20323267760932, "grad_norm": 4.867333889007568, "learning_rate": 2.438801563294135e-07, "loss": 0.4414, "num_input_tokens_seen": 176669824, "step": 145280 }, { "epoch": 18.203859165518107, "grad_norm": 13.033716201782227, "learning_rate": 2.437115229125858e-07, "loss": 0.414, "num_input_tokens_seen": 176676064, "step": 145285 }, { "epoch": 18.20448565342689, "grad_norm": 4.241277694702148, "learning_rate": 2.4354294636118545e-07, "loss": 0.4223, "num_input_tokens_seen": 176682144, "step": 145290 }, { "epoch": 18.205112141335672, "grad_norm": 3.794290542602539, "learning_rate": 2.433744266772287e-07, "loss": 0.4132, "num_input_tokens_seen": 176688480, "step": 145295 }, { "epoch": 18.205738629244454, "grad_norm": 19.397897720336914, "learning_rate": 2.4320596386272877e-07, "loss": 0.5392, "num_input_tokens_seen": 176694528, "step": 145300 }, { "epoch": 18.20636511715324, "grad_norm": 5.062679767608643, "learning_rate": 2.430375579197014e-07, "loss": 0.5015, "num_input_tokens_seen": 176700736, "step": 145305 }, { "epoch": 18.206991605062022, "grad_norm": 3.987809658050537, "learning_rate": 2.4286920885015997e-07, "loss": 0.4378, "num_input_tokens_seen": 176706848, "step": 145310 }, { "epoch": 18.207618092970804, "grad_norm": 4.0921101570129395, "learning_rate": 2.427009166561162e-07, "loss": 0.414, "num_input_tokens_seen": 176712960, "step": 145315 }, { "epoch": 18.20824458087959, "grad_norm": 11.240053176879883, "learning_rate": 2.4253268133958284e-07, "loss": 0.4105, "num_input_tokens_seen": 176719136, "step": 145320 }, { "epoch": 18.208871068788373, "grad_norm": 28.68234634399414, "learning_rate": 2.4236450290257065e-07, "loss": 0.4746, "num_input_tokens_seen": 176725312, "step": 145325 }, { "epoch": 18.209497556697155, "grad_norm": 13.794109344482422, "learning_rate": 2.421963813470918e-07, "loss": 0.5104, "num_input_tokens_seen": 176731264, "step": 145330 }, { "epoch": 18.21012404460594, "grad_norm": 7.056636810302734, "learning_rate": 2.420283166751547e-07, "loss": 0.4686, "num_input_tokens_seen": 176737216, "step": 145335 }, { "epoch": 18.210750532514723, "grad_norm": 4.698622703552246, "learning_rate": 2.418603088887694e-07, "loss": 0.4205, "num_input_tokens_seen": 176743424, "step": 145340 }, { "epoch": 18.211377020423505, "grad_norm": 5.872308731079102, "learning_rate": 2.416923579899444e-07, "loss": 0.4053, "num_input_tokens_seen": 176749760, "step": 145345 }, { "epoch": 18.212003508332288, "grad_norm": 34.29082107543945, "learning_rate": 2.4152446398068797e-07, "loss": 0.4849, "num_input_tokens_seen": 176755776, "step": 145350 }, { "epoch": 18.212629996241073, "grad_norm": 7.045538425445557, "learning_rate": 2.4135662686300743e-07, "loss": 0.4297, "num_input_tokens_seen": 176761728, "step": 145355 }, { "epoch": 18.213256484149856, "grad_norm": 19.163238525390625, "learning_rate": 2.4118884663890896e-07, "loss": 0.4925, "num_input_tokens_seen": 176767648, "step": 145360 }, { "epoch": 18.213882972058638, "grad_norm": 3.8854944705963135, "learning_rate": 2.4102112331039874e-07, "loss": 0.4358, "num_input_tokens_seen": 176773728, "step": 145365 }, { "epoch": 18.214509459967424, "grad_norm": 16.001523971557617, "learning_rate": 2.4085345687948234e-07, "loss": 0.4667, "num_input_tokens_seen": 176779744, "step": 145370 }, { "epoch": 18.215135947876206, "grad_norm": 5.688230037689209, "learning_rate": 2.4068584734816436e-07, "loss": 0.4212, "num_input_tokens_seen": 176785344, "step": 145375 }, { "epoch": 18.21576243578499, "grad_norm": 9.410857200622559, "learning_rate": 2.405182947184481e-07, "loss": 0.4526, "num_input_tokens_seen": 176791200, "step": 145380 }, { "epoch": 18.216388923693774, "grad_norm": 17.001239776611328, "learning_rate": 2.4035079899233757e-07, "loss": 0.4517, "num_input_tokens_seen": 176797408, "step": 145385 }, { "epoch": 18.217015411602556, "grad_norm": 4.727973461151123, "learning_rate": 2.4018336017183454e-07, "loss": 0.392, "num_input_tokens_seen": 176803584, "step": 145390 }, { "epoch": 18.21764189951134, "grad_norm": 22.67744255065918, "learning_rate": 2.4001597825894175e-07, "loss": 0.5235, "num_input_tokens_seen": 176809888, "step": 145395 }, { "epoch": 18.218268387420125, "grad_norm": 18.386926651000977, "learning_rate": 2.398486532556593e-07, "loss": 0.463, "num_input_tokens_seen": 176815936, "step": 145400 }, { "epoch": 18.218894875328907, "grad_norm": 15.925457000732422, "learning_rate": 2.3968138516398953e-07, "loss": 0.4537, "num_input_tokens_seen": 176822080, "step": 145405 }, { "epoch": 18.21952136323769, "grad_norm": 5.334912300109863, "learning_rate": 2.395141739859297e-07, "loss": 0.4332, "num_input_tokens_seen": 176828256, "step": 145410 }, { "epoch": 18.22014785114647, "grad_norm": 22.306291580200195, "learning_rate": 2.3934701972348093e-07, "loss": 0.461, "num_input_tokens_seen": 176834624, "step": 145415 }, { "epoch": 18.220774339055257, "grad_norm": 5.492055892944336, "learning_rate": 2.391799223786412e-07, "loss": 0.3677, "num_input_tokens_seen": 176840864, "step": 145420 }, { "epoch": 18.22140082696404, "grad_norm": 8.830946922302246, "learning_rate": 2.390128819534071e-07, "loss": 0.448, "num_input_tokens_seen": 176846624, "step": 145425 }, { "epoch": 18.222027314872822, "grad_norm": 7.556232929229736, "learning_rate": 2.3884589844977825e-07, "loss": 0.4333, "num_input_tokens_seen": 176852448, "step": 145430 }, { "epoch": 18.222653802781608, "grad_norm": 4.20020055770874, "learning_rate": 2.386789718697485e-07, "loss": 0.466, "num_input_tokens_seen": 176858752, "step": 145435 }, { "epoch": 18.22328029069039, "grad_norm": 3.12318754196167, "learning_rate": 2.3851210221531587e-07, "loss": 0.43, "num_input_tokens_seen": 176864896, "step": 145440 }, { "epoch": 18.223906778599172, "grad_norm": 3.3248398303985596, "learning_rate": 2.3834528948847357e-07, "loss": 0.4656, "num_input_tokens_seen": 176870784, "step": 145445 }, { "epoch": 18.224533266507958, "grad_norm": 4.566865921020508, "learning_rate": 2.3817853369121735e-07, "loss": 0.4323, "num_input_tokens_seen": 176876704, "step": 145450 }, { "epoch": 18.22515975441674, "grad_norm": 5.4742960929870605, "learning_rate": 2.3801183482553947e-07, "loss": 0.5114, "num_input_tokens_seen": 176882528, "step": 145455 }, { "epoch": 18.225786242325523, "grad_norm": 22.65117645263672, "learning_rate": 2.3784519289343334e-07, "loss": 0.5364, "num_input_tokens_seen": 176888864, "step": 145460 }, { "epoch": 18.226412730234305, "grad_norm": 19.117923736572266, "learning_rate": 2.3767860789689235e-07, "loss": 0.5478, "num_input_tokens_seen": 176894848, "step": 145465 }, { "epoch": 18.22703921814309, "grad_norm": 5.6843461990356445, "learning_rate": 2.3751207983790824e-07, "loss": 0.4149, "num_input_tokens_seen": 176901024, "step": 145470 }, { "epoch": 18.227665706051873, "grad_norm": 4.265415191650391, "learning_rate": 2.3734560871847002e-07, "loss": 0.3948, "num_input_tokens_seen": 176906464, "step": 145475 }, { "epoch": 18.228292193960655, "grad_norm": 4.380497932434082, "learning_rate": 2.3717919454057047e-07, "loss": 0.4256, "num_input_tokens_seen": 176912640, "step": 145480 }, { "epoch": 18.22891868186944, "grad_norm": 4.284635543823242, "learning_rate": 2.370128373061975e-07, "loss": 0.441, "num_input_tokens_seen": 176919040, "step": 145485 }, { "epoch": 18.229545169778223, "grad_norm": 10.990541458129883, "learning_rate": 2.3684653701734005e-07, "loss": 0.4685, "num_input_tokens_seen": 176925536, "step": 145490 }, { "epoch": 18.230171657687006, "grad_norm": 4.872529983520508, "learning_rate": 2.3668029367598767e-07, "loss": 0.447, "num_input_tokens_seen": 176931328, "step": 145495 }, { "epoch": 18.23079814559579, "grad_norm": 5.77857780456543, "learning_rate": 2.365141072841265e-07, "loss": 0.4597, "num_input_tokens_seen": 176937632, "step": 145500 }, { "epoch": 18.231424633504574, "grad_norm": 24.829370498657227, "learning_rate": 2.3634797784374497e-07, "loss": 0.4234, "num_input_tokens_seen": 176943840, "step": 145505 }, { "epoch": 18.232051121413356, "grad_norm": 6.311888217926025, "learning_rate": 2.361819053568276e-07, "loss": 0.3912, "num_input_tokens_seen": 176950016, "step": 145510 }, { "epoch": 18.23267760932214, "grad_norm": 4.541516304016113, "learning_rate": 2.3601588982536172e-07, "loss": 0.376, "num_input_tokens_seen": 176956192, "step": 145515 }, { "epoch": 18.233304097230924, "grad_norm": 9.176857948303223, "learning_rate": 2.3584993125133016e-07, "loss": 0.4227, "num_input_tokens_seen": 176962304, "step": 145520 }, { "epoch": 18.233930585139706, "grad_norm": 4.749417304992676, "learning_rate": 2.356840296367191e-07, "loss": 0.4228, "num_input_tokens_seen": 176967744, "step": 145525 }, { "epoch": 18.23455707304849, "grad_norm": 10.751810073852539, "learning_rate": 2.3551818498351032e-07, "loss": 0.4801, "num_input_tokens_seen": 176973792, "step": 145530 }, { "epoch": 18.235183560957275, "grad_norm": 9.18925666809082, "learning_rate": 2.3535239729368775e-07, "loss": 0.4119, "num_input_tokens_seen": 176980128, "step": 145535 }, { "epoch": 18.235810048866057, "grad_norm": 6.910944938659668, "learning_rate": 2.3518666656923262e-07, "loss": 0.4286, "num_input_tokens_seen": 176986240, "step": 145540 }, { "epoch": 18.23643653677484, "grad_norm": 6.002365589141846, "learning_rate": 2.3502099281212775e-07, "loss": 0.4256, "num_input_tokens_seen": 176992864, "step": 145545 }, { "epoch": 18.237063024683625, "grad_norm": 3.4909446239471436, "learning_rate": 2.348553760243527e-07, "loss": 0.4525, "num_input_tokens_seen": 176999008, "step": 145550 }, { "epoch": 18.237689512592407, "grad_norm": 6.468424320220947, "learning_rate": 2.3468981620788755e-07, "loss": 0.436, "num_input_tokens_seen": 177004928, "step": 145555 }, { "epoch": 18.23831600050119, "grad_norm": 5.200901985168457, "learning_rate": 2.3452431336471292e-07, "loss": 0.4421, "num_input_tokens_seen": 177011232, "step": 145560 }, { "epoch": 18.238942488409975, "grad_norm": 7.63052225112915, "learning_rate": 2.3435886749680558e-07, "loss": 0.4536, "num_input_tokens_seen": 177017696, "step": 145565 }, { "epoch": 18.239568976318758, "grad_norm": 6.375015735626221, "learning_rate": 2.3419347860614506e-07, "loss": 0.4309, "num_input_tokens_seen": 177024096, "step": 145570 }, { "epoch": 18.24019546422754, "grad_norm": 7.257413387298584, "learning_rate": 2.3402814669470864e-07, "loss": 0.4793, "num_input_tokens_seen": 177030208, "step": 145575 }, { "epoch": 18.240821952136322, "grad_norm": 8.23486042022705, "learning_rate": 2.3386287176447309e-07, "loss": 0.4484, "num_input_tokens_seen": 177035808, "step": 145580 }, { "epoch": 18.241448440045108, "grad_norm": 11.609031677246094, "learning_rate": 2.336976538174135e-07, "loss": 0.4063, "num_input_tokens_seen": 177041952, "step": 145585 }, { "epoch": 18.24207492795389, "grad_norm": 15.448967933654785, "learning_rate": 2.335324928555066e-07, "loss": 0.4449, "num_input_tokens_seen": 177048000, "step": 145590 }, { "epoch": 18.242701415862673, "grad_norm": 6.1974077224731445, "learning_rate": 2.333673888807253e-07, "loss": 0.4161, "num_input_tokens_seen": 177053984, "step": 145595 }, { "epoch": 18.24332790377146, "grad_norm": 23.43514060974121, "learning_rate": 2.3320234189504465e-07, "loss": 0.4523, "num_input_tokens_seen": 177059904, "step": 145600 }, { "epoch": 18.24395439168024, "grad_norm": 3.533651113510132, "learning_rate": 2.3303735190043753e-07, "loss": 0.4437, "num_input_tokens_seen": 177065952, "step": 145605 }, { "epoch": 18.244580879589023, "grad_norm": 14.11005973815918, "learning_rate": 2.328724188988768e-07, "loss": 0.5157, "num_input_tokens_seen": 177072096, "step": 145610 }, { "epoch": 18.24520736749781, "grad_norm": 20.318490982055664, "learning_rate": 2.3270754289233476e-07, "loss": 0.4825, "num_input_tokens_seen": 177078304, "step": 145615 }, { "epoch": 18.24583385540659, "grad_norm": 11.147153854370117, "learning_rate": 2.3254272388278154e-07, "loss": 0.4296, "num_input_tokens_seen": 177083840, "step": 145620 }, { "epoch": 18.246460343315373, "grad_norm": 3.3753371238708496, "learning_rate": 2.3237796187218886e-07, "loss": 0.396, "num_input_tokens_seen": 177089696, "step": 145625 }, { "epoch": 18.247086831224156, "grad_norm": 4.3659210205078125, "learning_rate": 2.3221325686252573e-07, "loss": 0.5117, "num_input_tokens_seen": 177095616, "step": 145630 }, { "epoch": 18.24771331913294, "grad_norm": 7.231491565704346, "learning_rate": 2.320486088557622e-07, "loss": 0.3776, "num_input_tokens_seen": 177101856, "step": 145635 }, { "epoch": 18.248339807041724, "grad_norm": 6.8003830909729, "learning_rate": 2.3188401785386506e-07, "loss": 0.4188, "num_input_tokens_seen": 177108224, "step": 145640 }, { "epoch": 18.248966294950506, "grad_norm": 3.428571939468384, "learning_rate": 2.317194838588044e-07, "loss": 0.4696, "num_input_tokens_seen": 177114112, "step": 145645 }, { "epoch": 18.249592782859292, "grad_norm": 3.7568299770355225, "learning_rate": 2.3155500687254585e-07, "loss": 0.4361, "num_input_tokens_seen": 177120352, "step": 145650 }, { "epoch": 18.250219270768074, "grad_norm": 9.440142631530762, "learning_rate": 2.313905868970562e-07, "loss": 0.4868, "num_input_tokens_seen": 177126688, "step": 145655 }, { "epoch": 18.250845758676856, "grad_norm": 16.960939407348633, "learning_rate": 2.312262239343016e-07, "loss": 0.4682, "num_input_tokens_seen": 177132864, "step": 145660 }, { "epoch": 18.251472246585642, "grad_norm": 8.093859672546387, "learning_rate": 2.3106191798624612e-07, "loss": 0.4637, "num_input_tokens_seen": 177138496, "step": 145665 }, { "epoch": 18.252098734494425, "grad_norm": 7.915256500244141, "learning_rate": 2.3089766905485533e-07, "loss": 0.4633, "num_input_tokens_seen": 177144800, "step": 145670 }, { "epoch": 18.252725222403207, "grad_norm": 5.623493671417236, "learning_rate": 2.307334771420927e-07, "loss": 0.4004, "num_input_tokens_seen": 177151008, "step": 145675 }, { "epoch": 18.253351710311993, "grad_norm": 11.5518798828125, "learning_rate": 2.3056934224992057e-07, "loss": 0.4377, "num_input_tokens_seen": 177156672, "step": 145680 }, { "epoch": 18.253978198220775, "grad_norm": 29.500112533569336, "learning_rate": 2.3040526438030232e-07, "loss": 0.5167, "num_input_tokens_seen": 177162784, "step": 145685 }, { "epoch": 18.254604686129557, "grad_norm": 4.005356311798096, "learning_rate": 2.302412435351997e-07, "loss": 0.4296, "num_input_tokens_seen": 177168800, "step": 145690 }, { "epoch": 18.25523117403834, "grad_norm": 14.678586959838867, "learning_rate": 2.3007727971657233e-07, "loss": 0.5117, "num_input_tokens_seen": 177174880, "step": 145695 }, { "epoch": 18.255857661947125, "grad_norm": 15.124788284301758, "learning_rate": 2.299133729263825e-07, "loss": 0.3749, "num_input_tokens_seen": 177180704, "step": 145700 }, { "epoch": 18.256484149855908, "grad_norm": 4.340108871459961, "learning_rate": 2.297495231665875e-07, "loss": 0.4059, "num_input_tokens_seen": 177186464, "step": 145705 }, { "epoch": 18.25711063776469, "grad_norm": 6.459095478057861, "learning_rate": 2.2958573043914855e-07, "loss": 0.4251, "num_input_tokens_seen": 177192800, "step": 145710 }, { "epoch": 18.257737125673476, "grad_norm": 9.961444854736328, "learning_rate": 2.2942199474602245e-07, "loss": 0.5353, "num_input_tokens_seen": 177198688, "step": 145715 }, { "epoch": 18.258363613582258, "grad_norm": 3.950920581817627, "learning_rate": 2.292583160891676e-07, "loss": 0.4333, "num_input_tokens_seen": 177204800, "step": 145720 }, { "epoch": 18.25899010149104, "grad_norm": 13.803851127624512, "learning_rate": 2.2909469447054023e-07, "loss": 0.4924, "num_input_tokens_seen": 177211136, "step": 145725 }, { "epoch": 18.259616589399826, "grad_norm": 6.175239562988281, "learning_rate": 2.289311298920971e-07, "loss": 0.3844, "num_input_tokens_seen": 177217216, "step": 145730 }, { "epoch": 18.26024307730861, "grad_norm": 5.082642078399658, "learning_rate": 2.287676223557933e-07, "loss": 0.3946, "num_input_tokens_seen": 177223424, "step": 145735 }, { "epoch": 18.26086956521739, "grad_norm": 12.526277542114258, "learning_rate": 2.2860417186358396e-07, "loss": 0.4919, "num_input_tokens_seen": 177229696, "step": 145740 }, { "epoch": 18.261496053126173, "grad_norm": 5.145725727081299, "learning_rate": 2.2844077841742418e-07, "loss": 0.4482, "num_input_tokens_seen": 177235456, "step": 145745 }, { "epoch": 18.26212254103496, "grad_norm": 5.762531280517578, "learning_rate": 2.2827744201926573e-07, "loss": 0.4404, "num_input_tokens_seen": 177241568, "step": 145750 }, { "epoch": 18.26274902894374, "grad_norm": 3.5188584327697754, "learning_rate": 2.2811416267106313e-07, "loss": 0.4026, "num_input_tokens_seen": 177247200, "step": 145755 }, { "epoch": 18.263375516852523, "grad_norm": 15.392134666442871, "learning_rate": 2.2795094037476705e-07, "loss": 0.5027, "num_input_tokens_seen": 177253280, "step": 145760 }, { "epoch": 18.26400200476131, "grad_norm": 3.3471620082855225, "learning_rate": 2.2778777513232932e-07, "loss": 0.4656, "num_input_tokens_seen": 177259488, "step": 145765 }, { "epoch": 18.26462849267009, "grad_norm": 6.056241512298584, "learning_rate": 2.2762466694570163e-07, "loss": 0.4843, "num_input_tokens_seen": 177265504, "step": 145770 }, { "epoch": 18.265254980578874, "grad_norm": 10.300256729125977, "learning_rate": 2.27461615816833e-07, "loss": 0.4045, "num_input_tokens_seen": 177271488, "step": 145775 }, { "epoch": 18.26588146848766, "grad_norm": 7.036635875701904, "learning_rate": 2.2729862174767302e-07, "loss": 0.3959, "num_input_tokens_seen": 177277728, "step": 145780 }, { "epoch": 18.266507956396442, "grad_norm": 7.055919170379639, "learning_rate": 2.2713568474017177e-07, "loss": 0.4889, "num_input_tokens_seen": 177283712, "step": 145785 }, { "epoch": 18.267134444305224, "grad_norm": 10.408761024475098, "learning_rate": 2.2697280479627548e-07, "loss": 0.4639, "num_input_tokens_seen": 177289856, "step": 145790 }, { "epoch": 18.26776093221401, "grad_norm": 4.124989032745361, "learning_rate": 2.2680998191793203e-07, "loss": 0.4774, "num_input_tokens_seen": 177296032, "step": 145795 }, { "epoch": 18.268387420122792, "grad_norm": 13.853269577026367, "learning_rate": 2.2664721610708873e-07, "loss": 0.5372, "num_input_tokens_seen": 177302464, "step": 145800 }, { "epoch": 18.269013908031575, "grad_norm": 12.43225383758545, "learning_rate": 2.2648450736569127e-07, "loss": 0.4063, "num_input_tokens_seen": 177308640, "step": 145805 }, { "epoch": 18.269640395940357, "grad_norm": 4.7452778816223145, "learning_rate": 2.2632185569568477e-07, "loss": 0.4957, "num_input_tokens_seen": 177315008, "step": 145810 }, { "epoch": 18.270266883849143, "grad_norm": 10.44234561920166, "learning_rate": 2.2615926109901376e-07, "loss": 0.4165, "num_input_tokens_seen": 177321280, "step": 145815 }, { "epoch": 18.270893371757925, "grad_norm": 3.357605218887329, "learning_rate": 2.259967235776228e-07, "loss": 0.4356, "num_input_tokens_seen": 177326944, "step": 145820 }, { "epoch": 18.271519859666707, "grad_norm": 5.8880157470703125, "learning_rate": 2.2583424313345427e-07, "loss": 0.4264, "num_input_tokens_seen": 177333216, "step": 145825 }, { "epoch": 18.272146347575493, "grad_norm": 13.259711265563965, "learning_rate": 2.256718197684521e-07, "loss": 0.5164, "num_input_tokens_seen": 177339104, "step": 145830 }, { "epoch": 18.272772835484275, "grad_norm": 3.9781923294067383, "learning_rate": 2.2550945348455643e-07, "loss": 0.4182, "num_input_tokens_seen": 177345088, "step": 145835 }, { "epoch": 18.273399323393058, "grad_norm": 11.09213638305664, "learning_rate": 2.253471442837102e-07, "loss": 0.4736, "num_input_tokens_seen": 177351360, "step": 145840 }, { "epoch": 18.274025811301843, "grad_norm": 15.297762870788574, "learning_rate": 2.2518489216785233e-07, "loss": 0.4363, "num_input_tokens_seen": 177357696, "step": 145845 }, { "epoch": 18.274652299210626, "grad_norm": 4.324273586273193, "learning_rate": 2.2502269713892466e-07, "loss": 0.4813, "num_input_tokens_seen": 177363552, "step": 145850 }, { "epoch": 18.275278787119408, "grad_norm": 9.984740257263184, "learning_rate": 2.2486055919886397e-07, "loss": 0.4403, "num_input_tokens_seen": 177369888, "step": 145855 }, { "epoch": 18.27590527502819, "grad_norm": 16.44707679748535, "learning_rate": 2.2469847834961034e-07, "loss": 0.4386, "num_input_tokens_seen": 177376064, "step": 145860 }, { "epoch": 18.276531762936976, "grad_norm": 8.516533851623535, "learning_rate": 2.245364545931017e-07, "loss": 0.396, "num_input_tokens_seen": 177381952, "step": 145865 }, { "epoch": 18.27715825084576, "grad_norm": 8.691608428955078, "learning_rate": 2.2437448793127425e-07, "loss": 0.3781, "num_input_tokens_seen": 177388160, "step": 145870 }, { "epoch": 18.27778473875454, "grad_norm": 4.543501377105713, "learning_rate": 2.242125783660648e-07, "loss": 0.4574, "num_input_tokens_seen": 177394208, "step": 145875 }, { "epoch": 18.278411226663327, "grad_norm": 20.794124603271484, "learning_rate": 2.2405072589940958e-07, "loss": 0.5499, "num_input_tokens_seen": 177400416, "step": 145880 }, { "epoch": 18.27903771457211, "grad_norm": 6.424139499664307, "learning_rate": 2.2388893053324367e-07, "loss": 0.4261, "num_input_tokens_seen": 177406752, "step": 145885 }, { "epoch": 18.27966420248089, "grad_norm": 5.683815956115723, "learning_rate": 2.2372719226950056e-07, "loss": 0.5088, "num_input_tokens_seen": 177412704, "step": 145890 }, { "epoch": 18.280290690389677, "grad_norm": 22.833637237548828, "learning_rate": 2.2356551111011537e-07, "loss": 0.4403, "num_input_tokens_seen": 177418880, "step": 145895 }, { "epoch": 18.28091717829846, "grad_norm": 11.867313385009766, "learning_rate": 2.2340388705701931e-07, "loss": 0.4204, "num_input_tokens_seen": 177424224, "step": 145900 }, { "epoch": 18.28154366620724, "grad_norm": 3.795199155807495, "learning_rate": 2.2324232011214643e-07, "loss": 0.4824, "num_input_tokens_seen": 177430400, "step": 145905 }, { "epoch": 18.282170154116027, "grad_norm": 5.23449182510376, "learning_rate": 2.2308081027742733e-07, "loss": 0.4446, "num_input_tokens_seen": 177436608, "step": 145910 }, { "epoch": 18.28279664202481, "grad_norm": 3.2753636837005615, "learning_rate": 2.2291935755479333e-07, "loss": 0.3982, "num_input_tokens_seen": 177442656, "step": 145915 }, { "epoch": 18.283423129933592, "grad_norm": 3.1835625171661377, "learning_rate": 2.2275796194617506e-07, "loss": 0.4845, "num_input_tokens_seen": 177447904, "step": 145920 }, { "epoch": 18.284049617842374, "grad_norm": 4.450154781341553, "learning_rate": 2.2259662345350096e-07, "loss": 0.4628, "num_input_tokens_seen": 177453984, "step": 145925 }, { "epoch": 18.28467610575116, "grad_norm": 4.194380283355713, "learning_rate": 2.2243534207870176e-07, "loss": 0.4543, "num_input_tokens_seen": 177459904, "step": 145930 }, { "epoch": 18.285302593659942, "grad_norm": 4.2887349128723145, "learning_rate": 2.2227411782370423e-07, "loss": 0.4539, "num_input_tokens_seen": 177466112, "step": 145935 }, { "epoch": 18.285929081568725, "grad_norm": 5.3298115730285645, "learning_rate": 2.2211295069043736e-07, "loss": 0.4343, "num_input_tokens_seen": 177472384, "step": 145940 }, { "epoch": 18.28655556947751, "grad_norm": 6.532764911651611, "learning_rate": 2.2195184068082632e-07, "loss": 0.4079, "num_input_tokens_seen": 177478400, "step": 145945 }, { "epoch": 18.287182057386293, "grad_norm": 4.978826999664307, "learning_rate": 2.2179078779679897e-07, "loss": 0.5006, "num_input_tokens_seen": 177484512, "step": 145950 }, { "epoch": 18.287808545295075, "grad_norm": 5.129790306091309, "learning_rate": 2.2162979204027935e-07, "loss": 0.4666, "num_input_tokens_seen": 177490432, "step": 145955 }, { "epoch": 18.28843503320386, "grad_norm": 4.908120155334473, "learning_rate": 2.214688534131937e-07, "loss": 0.5916, "num_input_tokens_seen": 177496704, "step": 145960 }, { "epoch": 18.289061521112643, "grad_norm": 14.231212615966797, "learning_rate": 2.213079719174649e-07, "loss": 0.4891, "num_input_tokens_seen": 177502816, "step": 145965 }, { "epoch": 18.289688009021425, "grad_norm": 3.931544542312622, "learning_rate": 2.2114714755501643e-07, "loss": 0.6085, "num_input_tokens_seen": 177508992, "step": 145970 }, { "epoch": 18.290314496930208, "grad_norm": 8.698881149291992, "learning_rate": 2.2098638032777287e-07, "loss": 0.4562, "num_input_tokens_seen": 177514944, "step": 145975 }, { "epoch": 18.290940984838993, "grad_norm": 6.653519630432129, "learning_rate": 2.2082567023765433e-07, "loss": 0.446, "num_input_tokens_seen": 177521056, "step": 145980 }, { "epoch": 18.291567472747776, "grad_norm": 7.689611911773682, "learning_rate": 2.2066501728658317e-07, "loss": 0.4136, "num_input_tokens_seen": 177527424, "step": 145985 }, { "epoch": 18.292193960656558, "grad_norm": 14.0150728225708, "learning_rate": 2.2050442147647956e-07, "loss": 0.4374, "num_input_tokens_seen": 177533728, "step": 145990 }, { "epoch": 18.292820448565344, "grad_norm": 4.381026268005371, "learning_rate": 2.2034388280926523e-07, "loss": 0.4171, "num_input_tokens_seen": 177539840, "step": 145995 }, { "epoch": 18.293446936474126, "grad_norm": 2.429062604904175, "learning_rate": 2.20183401286857e-07, "loss": 0.3558, "num_input_tokens_seen": 177546144, "step": 146000 }, { "epoch": 18.29407342438291, "grad_norm": 8.178281784057617, "learning_rate": 2.2002297691117558e-07, "loss": 0.5285, "num_input_tokens_seen": 177552064, "step": 146005 }, { "epoch": 18.294699912291694, "grad_norm": 11.32828426361084, "learning_rate": 2.1986260968413776e-07, "loss": 0.5172, "num_input_tokens_seen": 177557952, "step": 146010 }, { "epoch": 18.295326400200477, "grad_norm": 16.90614891052246, "learning_rate": 2.1970229960766254e-07, "loss": 0.4665, "num_input_tokens_seen": 177564032, "step": 146015 }, { "epoch": 18.29595288810926, "grad_norm": 5.626687049865723, "learning_rate": 2.1954204668366396e-07, "loss": 0.4001, "num_input_tokens_seen": 177570112, "step": 146020 }, { "epoch": 18.296579376018045, "grad_norm": 13.792963027954102, "learning_rate": 2.193818509140605e-07, "loss": 0.4588, "num_input_tokens_seen": 177575616, "step": 146025 }, { "epoch": 18.297205863926827, "grad_norm": 37.4825553894043, "learning_rate": 2.1922171230076561e-07, "loss": 0.5659, "num_input_tokens_seen": 177581952, "step": 146030 }, { "epoch": 18.29783235183561, "grad_norm": 14.005746841430664, "learning_rate": 2.19061630845695e-07, "loss": 0.4925, "num_input_tokens_seen": 177588256, "step": 146035 }, { "epoch": 18.29845883974439, "grad_norm": 10.825281143188477, "learning_rate": 2.1890160655076153e-07, "loss": 0.4189, "num_input_tokens_seen": 177594592, "step": 146040 }, { "epoch": 18.299085327653177, "grad_norm": 14.826093673706055, "learning_rate": 2.1874163941787929e-07, "loss": 0.4458, "num_input_tokens_seen": 177600768, "step": 146045 }, { "epoch": 18.29971181556196, "grad_norm": 5.233677864074707, "learning_rate": 2.1858172944896117e-07, "loss": 0.5267, "num_input_tokens_seen": 177606944, "step": 146050 }, { "epoch": 18.300338303470742, "grad_norm": 13.40878677368164, "learning_rate": 2.1842187664591786e-07, "loss": 0.4101, "num_input_tokens_seen": 177613184, "step": 146055 }, { "epoch": 18.300964791379528, "grad_norm": 12.000261306762695, "learning_rate": 2.182620810106617e-07, "loss": 0.4334, "num_input_tokens_seen": 177618720, "step": 146060 }, { "epoch": 18.30159127928831, "grad_norm": 5.777533054351807, "learning_rate": 2.181023425451023e-07, "loss": 0.4447, "num_input_tokens_seen": 177624800, "step": 146065 }, { "epoch": 18.302217767197092, "grad_norm": 23.82649803161621, "learning_rate": 2.1794266125115037e-07, "loss": 0.4586, "num_input_tokens_seen": 177630784, "step": 146070 }, { "epoch": 18.302844255105878, "grad_norm": 11.72386646270752, "learning_rate": 2.1778303713071324e-07, "loss": 0.4316, "num_input_tokens_seen": 177637152, "step": 146075 }, { "epoch": 18.30347074301466, "grad_norm": 4.283461570739746, "learning_rate": 2.1762347018570107e-07, "loss": 0.4491, "num_input_tokens_seen": 177643392, "step": 146080 }, { "epoch": 18.304097230923443, "grad_norm": 12.928479194641113, "learning_rate": 2.174639604180212e-07, "loss": 0.5317, "num_input_tokens_seen": 177649632, "step": 146085 }, { "epoch": 18.304723718832225, "grad_norm": 4.606206893920898, "learning_rate": 2.17304507829581e-07, "loss": 0.3946, "num_input_tokens_seen": 177655712, "step": 146090 }, { "epoch": 18.30535020674101, "grad_norm": 6.221558094024658, "learning_rate": 2.171451124222862e-07, "loss": 0.4531, "num_input_tokens_seen": 177661728, "step": 146095 }, { "epoch": 18.305976694649793, "grad_norm": 27.542282104492188, "learning_rate": 2.16985774198043e-07, "loss": 0.5052, "num_input_tokens_seen": 177668480, "step": 146100 }, { "epoch": 18.306603182558575, "grad_norm": 4.625575542449951, "learning_rate": 2.1682649315875547e-07, "loss": 0.3656, "num_input_tokens_seen": 177674592, "step": 146105 }, { "epoch": 18.30722967046736, "grad_norm": 28.736082077026367, "learning_rate": 2.1666726930632932e-07, "loss": 0.4539, "num_input_tokens_seen": 177680768, "step": 146110 }, { "epoch": 18.307856158376143, "grad_norm": 4.491607666015625, "learning_rate": 2.1650810264266742e-07, "loss": 0.5155, "num_input_tokens_seen": 177687104, "step": 146115 }, { "epoch": 18.308482646284926, "grad_norm": 5.218983173370361, "learning_rate": 2.1634899316967272e-07, "loss": 0.4786, "num_input_tokens_seen": 177693280, "step": 146120 }, { "epoch": 18.30910913419371, "grad_norm": 22.593151092529297, "learning_rate": 2.1618994088924817e-07, "loss": 0.4338, "num_input_tokens_seen": 177699616, "step": 146125 }, { "epoch": 18.309735622102494, "grad_norm": 7.6168532371521, "learning_rate": 2.160309458032944e-07, "loss": 0.4254, "num_input_tokens_seen": 177705824, "step": 146130 }, { "epoch": 18.310362110011276, "grad_norm": 4.163544178009033, "learning_rate": 2.1587200791371333e-07, "loss": 0.4225, "num_input_tokens_seen": 177712064, "step": 146135 }, { "epoch": 18.31098859792006, "grad_norm": 4.790045738220215, "learning_rate": 2.1571312722240444e-07, "loss": 0.4109, "num_input_tokens_seen": 177717920, "step": 146140 }, { "epoch": 18.311615085828844, "grad_norm": 11.071069717407227, "learning_rate": 2.155543037312674e-07, "loss": 0.4567, "num_input_tokens_seen": 177724128, "step": 146145 }, { "epoch": 18.312241573737627, "grad_norm": 10.176860809326172, "learning_rate": 2.1539553744220122e-07, "loss": 0.4867, "num_input_tokens_seen": 177730624, "step": 146150 }, { "epoch": 18.31286806164641, "grad_norm": 8.662766456604004, "learning_rate": 2.1523682835710435e-07, "loss": 0.4973, "num_input_tokens_seen": 177736928, "step": 146155 }, { "epoch": 18.313494549555195, "grad_norm": 5.551916599273682, "learning_rate": 2.1507817647787367e-07, "loss": 0.4812, "num_input_tokens_seen": 177743072, "step": 146160 }, { "epoch": 18.314121037463977, "grad_norm": 25.56263542175293, "learning_rate": 2.1491958180640648e-07, "loss": 0.478, "num_input_tokens_seen": 177749344, "step": 146165 }, { "epoch": 18.31474752537276, "grad_norm": 4.140408992767334, "learning_rate": 2.1476104434459855e-07, "loss": 0.386, "num_input_tokens_seen": 177755520, "step": 146170 }, { "epoch": 18.315374013281545, "grad_norm": 18.731168746948242, "learning_rate": 2.1460256409434553e-07, "loss": 0.5079, "num_input_tokens_seen": 177761472, "step": 146175 }, { "epoch": 18.316000501190327, "grad_norm": 22.987091064453125, "learning_rate": 2.1444414105754262e-07, "loss": 0.4322, "num_input_tokens_seen": 177767680, "step": 146180 }, { "epoch": 18.31662698909911, "grad_norm": 5.23577356338501, "learning_rate": 2.1428577523608274e-07, "loss": 0.4672, "num_input_tokens_seen": 177774240, "step": 146185 }, { "epoch": 18.317253477007895, "grad_norm": 4.902998447418213, "learning_rate": 2.1412746663186047e-07, "loss": 0.4986, "num_input_tokens_seen": 177780352, "step": 146190 }, { "epoch": 18.317879964916678, "grad_norm": 6.3049750328063965, "learning_rate": 2.139692152467676e-07, "loss": 0.44, "num_input_tokens_seen": 177786336, "step": 146195 }, { "epoch": 18.31850645282546, "grad_norm": 5.097901344299316, "learning_rate": 2.1381102108269768e-07, "loss": 0.4613, "num_input_tokens_seen": 177792512, "step": 146200 }, { "epoch": 18.319132940734242, "grad_norm": 6.453112602233887, "learning_rate": 2.1365288414154028e-07, "loss": 0.4141, "num_input_tokens_seen": 177798688, "step": 146205 }, { "epoch": 18.319759428643028, "grad_norm": 19.08184814453125, "learning_rate": 2.1349480442518722e-07, "loss": 0.4267, "num_input_tokens_seen": 177804704, "step": 146210 }, { "epoch": 18.32038591655181, "grad_norm": 6.822463512420654, "learning_rate": 2.1333678193552753e-07, "loss": 0.4175, "num_input_tokens_seen": 177810880, "step": 146215 }, { "epoch": 18.321012404460593, "grad_norm": 4.219747066497803, "learning_rate": 2.131788166744514e-07, "loss": 0.4243, "num_input_tokens_seen": 177817184, "step": 146220 }, { "epoch": 18.32163889236938, "grad_norm": 21.919326782226562, "learning_rate": 2.1302090864384673e-07, "loss": 0.4068, "num_input_tokens_seen": 177823104, "step": 146225 }, { "epoch": 18.32226538027816, "grad_norm": 12.577404022216797, "learning_rate": 2.12863057845602e-07, "loss": 0.4281, "num_input_tokens_seen": 177829280, "step": 146230 }, { "epoch": 18.322891868186943, "grad_norm": 5.444172382354736, "learning_rate": 2.1270526428160466e-07, "loss": 0.4557, "num_input_tokens_seen": 177835584, "step": 146235 }, { "epoch": 18.32351835609573, "grad_norm": 4.338856220245361, "learning_rate": 2.1254752795373979e-07, "loss": 0.4143, "num_input_tokens_seen": 177841856, "step": 146240 }, { "epoch": 18.32414484400451, "grad_norm": 5.338629722595215, "learning_rate": 2.1238984886389535e-07, "loss": 0.5049, "num_input_tokens_seen": 177847360, "step": 146245 }, { "epoch": 18.324771331913293, "grad_norm": 5.794131755828857, "learning_rate": 2.122322270139543e-07, "loss": 0.4385, "num_input_tokens_seen": 177853504, "step": 146250 }, { "epoch": 18.325397819822076, "grad_norm": 4.406290531158447, "learning_rate": 2.1207466240580344e-07, "loss": 0.3771, "num_input_tokens_seen": 177859584, "step": 146255 }, { "epoch": 18.32602430773086, "grad_norm": 12.211482048034668, "learning_rate": 2.1191715504132516e-07, "loss": 0.4851, "num_input_tokens_seen": 177865728, "step": 146260 }, { "epoch": 18.326650795639644, "grad_norm": 5.494380474090576, "learning_rate": 2.1175970492240294e-07, "loss": 0.4438, "num_input_tokens_seen": 177871840, "step": 146265 }, { "epoch": 18.327277283548426, "grad_norm": 6.110360145568848, "learning_rate": 2.1160231205091863e-07, "loss": 0.4692, "num_input_tokens_seen": 177877472, "step": 146270 }, { "epoch": 18.327903771457212, "grad_norm": 5.74289608001709, "learning_rate": 2.1144497642875518e-07, "loss": 0.4042, "num_input_tokens_seen": 177883552, "step": 146275 }, { "epoch": 18.328530259365994, "grad_norm": 11.690154075622559, "learning_rate": 2.112876980577927e-07, "loss": 0.4154, "num_input_tokens_seen": 177889504, "step": 146280 }, { "epoch": 18.329156747274777, "grad_norm": 3.8510642051696777, "learning_rate": 2.111304769399114e-07, "loss": 0.4007, "num_input_tokens_seen": 177895584, "step": 146285 }, { "epoch": 18.329783235183562, "grad_norm": 14.65818977355957, "learning_rate": 2.1097331307699198e-07, "loss": 0.4949, "num_input_tokens_seen": 177901728, "step": 146290 }, { "epoch": 18.330409723092345, "grad_norm": 6.867459297180176, "learning_rate": 2.108162064709135e-07, "loss": 0.4227, "num_input_tokens_seen": 177907552, "step": 146295 }, { "epoch": 18.331036211001127, "grad_norm": 6.0553107261657715, "learning_rate": 2.1065915712355277e-07, "loss": 0.4609, "num_input_tokens_seen": 177913632, "step": 146300 }, { "epoch": 18.331662698909913, "grad_norm": 5.0104875564575195, "learning_rate": 2.105021650367889e-07, "loss": 0.4696, "num_input_tokens_seen": 177919712, "step": 146305 }, { "epoch": 18.332289186818695, "grad_norm": 5.116230010986328, "learning_rate": 2.103452302124992e-07, "loss": 0.5141, "num_input_tokens_seen": 177925664, "step": 146310 }, { "epoch": 18.332915674727477, "grad_norm": 28.944421768188477, "learning_rate": 2.1018835265255832e-07, "loss": 0.5323, "num_input_tokens_seen": 177931808, "step": 146315 }, { "epoch": 18.33354216263626, "grad_norm": 26.4354248046875, "learning_rate": 2.100315323588431e-07, "loss": 0.4899, "num_input_tokens_seen": 177938272, "step": 146320 }, { "epoch": 18.334168650545045, "grad_norm": 8.139409065246582, "learning_rate": 2.0987476933322758e-07, "loss": 0.4605, "num_input_tokens_seen": 177944704, "step": 146325 }, { "epoch": 18.334795138453828, "grad_norm": 4.094125747680664, "learning_rate": 2.097180635775875e-07, "loss": 0.4252, "num_input_tokens_seen": 177951008, "step": 146330 }, { "epoch": 18.33542162636261, "grad_norm": 24.22261619567871, "learning_rate": 2.095614150937947e-07, "loss": 0.4977, "num_input_tokens_seen": 177956992, "step": 146335 }, { "epoch": 18.336048114271396, "grad_norm": 3.403778553009033, "learning_rate": 2.094048238837232e-07, "loss": 0.4467, "num_input_tokens_seen": 177963200, "step": 146340 }, { "epoch": 18.336674602180178, "grad_norm": 14.166586875915527, "learning_rate": 2.092482899492443e-07, "loss": 0.5338, "num_input_tokens_seen": 177969120, "step": 146345 }, { "epoch": 18.33730109008896, "grad_norm": 5.722856044769287, "learning_rate": 2.0909181329223038e-07, "loss": 0.3677, "num_input_tokens_seen": 177975360, "step": 146350 }, { "epoch": 18.337927577997746, "grad_norm": 4.0551438331604, "learning_rate": 2.089353939145511e-07, "loss": 0.4097, "num_input_tokens_seen": 177981376, "step": 146355 }, { "epoch": 18.33855406590653, "grad_norm": 20.121591567993164, "learning_rate": 2.0877903181807768e-07, "loss": 0.4995, "num_input_tokens_seen": 177987488, "step": 146360 }, { "epoch": 18.33918055381531, "grad_norm": 4.564813137054443, "learning_rate": 2.0862272700467924e-07, "loss": 0.4208, "num_input_tokens_seen": 177993728, "step": 146365 }, { "epoch": 18.339807041724093, "grad_norm": 4.545350074768066, "learning_rate": 2.0846647947622424e-07, "loss": 0.422, "num_input_tokens_seen": 178000192, "step": 146370 }, { "epoch": 18.34043352963288, "grad_norm": 8.10528564453125, "learning_rate": 2.0831028923458173e-07, "loss": 0.4944, "num_input_tokens_seen": 178006432, "step": 146375 }, { "epoch": 18.34106001754166, "grad_norm": 6.759464263916016, "learning_rate": 2.081541562816175e-07, "loss": 0.4718, "num_input_tokens_seen": 178012416, "step": 146380 }, { "epoch": 18.341686505450443, "grad_norm": 15.283966064453125, "learning_rate": 2.079980806191989e-07, "loss": 0.4216, "num_input_tokens_seen": 178018176, "step": 146385 }, { "epoch": 18.34231299335923, "grad_norm": 4.801230430603027, "learning_rate": 2.0784206224919278e-07, "loss": 0.4245, "num_input_tokens_seen": 178024320, "step": 146390 }, { "epoch": 18.34293948126801, "grad_norm": 5.266897678375244, "learning_rate": 2.0768610117346376e-07, "loss": 0.4061, "num_input_tokens_seen": 178030496, "step": 146395 }, { "epoch": 18.343565969176794, "grad_norm": 9.029387474060059, "learning_rate": 2.075301973938759e-07, "loss": 0.4257, "num_input_tokens_seen": 178036672, "step": 146400 }, { "epoch": 18.34419245708558, "grad_norm": 7.902953147888184, "learning_rate": 2.0737435091229497e-07, "loss": 0.4023, "num_input_tokens_seen": 178042624, "step": 146405 }, { "epoch": 18.344818944994362, "grad_norm": 12.58253002166748, "learning_rate": 2.072185617305822e-07, "loss": 0.416, "num_input_tokens_seen": 178048864, "step": 146410 }, { "epoch": 18.345445432903144, "grad_norm": 9.750990867614746, "learning_rate": 2.0706282985060112e-07, "loss": 0.4526, "num_input_tokens_seen": 178055040, "step": 146415 }, { "epoch": 18.34607192081193, "grad_norm": 4.274483680725098, "learning_rate": 2.0690715527421413e-07, "loss": 0.4416, "num_input_tokens_seen": 178060672, "step": 146420 }, { "epoch": 18.346698408720712, "grad_norm": 21.65410804748535, "learning_rate": 2.067515380032814e-07, "loss": 0.4315, "num_input_tokens_seen": 178067008, "step": 146425 }, { "epoch": 18.347324896629495, "grad_norm": 8.196954727172852, "learning_rate": 2.0659597803966426e-07, "loss": 0.403, "num_input_tokens_seen": 178073088, "step": 146430 }, { "epoch": 18.347951384538277, "grad_norm": 5.147919654846191, "learning_rate": 2.0644047538522226e-07, "loss": 0.509, "num_input_tokens_seen": 178078848, "step": 146435 }, { "epoch": 18.348577872447063, "grad_norm": 3.953369379043579, "learning_rate": 2.0628503004181455e-07, "loss": 0.369, "num_input_tokens_seen": 178084992, "step": 146440 }, { "epoch": 18.349204360355845, "grad_norm": 3.8208999633789062, "learning_rate": 2.0612964201129958e-07, "loss": 0.424, "num_input_tokens_seen": 178090720, "step": 146445 }, { "epoch": 18.349830848264627, "grad_norm": 5.956305503845215, "learning_rate": 2.0597431129553536e-07, "loss": 0.438, "num_input_tokens_seen": 178096672, "step": 146450 }, { "epoch": 18.350457336173413, "grad_norm": 4.855175971984863, "learning_rate": 2.058190378963787e-07, "loss": 0.4097, "num_input_tokens_seen": 178102688, "step": 146455 }, { "epoch": 18.351083824082195, "grad_norm": 7.804774761199951, "learning_rate": 2.0566382181568645e-07, "loss": 0.4391, "num_input_tokens_seen": 178108896, "step": 146460 }, { "epoch": 18.351710311990978, "grad_norm": 5.3365020751953125, "learning_rate": 2.055086630553138e-07, "loss": 0.4191, "num_input_tokens_seen": 178114720, "step": 146465 }, { "epoch": 18.352336799899764, "grad_norm": 13.31378173828125, "learning_rate": 2.053535616171165e-07, "loss": 0.425, "num_input_tokens_seen": 178121088, "step": 146470 }, { "epoch": 18.352963287808546, "grad_norm": 4.604372024536133, "learning_rate": 2.051985175029475e-07, "loss": 0.4779, "num_input_tokens_seen": 178127328, "step": 146475 }, { "epoch": 18.353589775717328, "grad_norm": 25.07779884338379, "learning_rate": 2.0504353071466198e-07, "loss": 0.5117, "num_input_tokens_seen": 178133568, "step": 146480 }, { "epoch": 18.35421626362611, "grad_norm": 3.7501821517944336, "learning_rate": 2.048886012541129e-07, "loss": 0.4155, "num_input_tokens_seen": 178139136, "step": 146485 }, { "epoch": 18.354842751534896, "grad_norm": 4.867094993591309, "learning_rate": 2.0473372912315103e-07, "loss": 0.4004, "num_input_tokens_seen": 178145024, "step": 146490 }, { "epoch": 18.35546923944368, "grad_norm": 9.166341781616211, "learning_rate": 2.0457891432362985e-07, "loss": 0.4981, "num_input_tokens_seen": 178151104, "step": 146495 }, { "epoch": 18.35609572735246, "grad_norm": 6.5615997314453125, "learning_rate": 2.0442415685739903e-07, "loss": 0.4368, "num_input_tokens_seen": 178157280, "step": 146500 }, { "epoch": 18.356722215261247, "grad_norm": 16.595224380493164, "learning_rate": 2.0426945672630982e-07, "loss": 0.4996, "num_input_tokens_seen": 178162592, "step": 146505 }, { "epoch": 18.35734870317003, "grad_norm": 4.44370698928833, "learning_rate": 2.0411481393221077e-07, "loss": 0.4434, "num_input_tokens_seen": 178168640, "step": 146510 }, { "epoch": 18.35797519107881, "grad_norm": 4.784815311431885, "learning_rate": 2.0396022847695206e-07, "loss": 0.5362, "num_input_tokens_seen": 178174720, "step": 146515 }, { "epoch": 18.358601678987597, "grad_norm": 4.450496673583984, "learning_rate": 2.0380570036238056e-07, "loss": 0.4384, "num_input_tokens_seen": 178180384, "step": 146520 }, { "epoch": 18.35922816689638, "grad_norm": 12.29008960723877, "learning_rate": 2.0365122959034479e-07, "loss": 0.4522, "num_input_tokens_seen": 178186528, "step": 146525 }, { "epoch": 18.35985465480516, "grad_norm": 3.3038833141326904, "learning_rate": 2.0349681616269045e-07, "loss": 0.4587, "num_input_tokens_seen": 178193088, "step": 146530 }, { "epoch": 18.360481142713944, "grad_norm": 13.325638771057129, "learning_rate": 2.0334246008126503e-07, "loss": 0.4603, "num_input_tokens_seen": 178199200, "step": 146535 }, { "epoch": 18.36110763062273, "grad_norm": 3.8342654705047607, "learning_rate": 2.0318816134791252e-07, "loss": 0.5436, "num_input_tokens_seen": 178205472, "step": 146540 }, { "epoch": 18.361734118531512, "grad_norm": 6.522303581237793, "learning_rate": 2.0303391996447874e-07, "loss": 0.449, "num_input_tokens_seen": 178211840, "step": 146545 }, { "epoch": 18.362360606440294, "grad_norm": 22.225217819213867, "learning_rate": 2.028797359328083e-07, "loss": 0.4763, "num_input_tokens_seen": 178217312, "step": 146550 }, { "epoch": 18.36298709434908, "grad_norm": 4.102651119232178, "learning_rate": 2.0272560925474304e-07, "loss": 0.4027, "num_input_tokens_seen": 178223136, "step": 146555 }, { "epoch": 18.363613582257862, "grad_norm": 21.219284057617188, "learning_rate": 2.0257153993212709e-07, "loss": 0.4696, "num_input_tokens_seen": 178229312, "step": 146560 }, { "epoch": 18.364240070166645, "grad_norm": 23.47344207763672, "learning_rate": 2.0241752796680058e-07, "loss": 0.4687, "num_input_tokens_seen": 178234752, "step": 146565 }, { "epoch": 18.36486655807543, "grad_norm": 5.354794025421143, "learning_rate": 2.0226357336060766e-07, "loss": 0.4463, "num_input_tokens_seen": 178240704, "step": 146570 }, { "epoch": 18.365493045984213, "grad_norm": 4.710776329040527, "learning_rate": 2.0210967611538623e-07, "loss": 0.4322, "num_input_tokens_seen": 178246912, "step": 146575 }, { "epoch": 18.366119533892995, "grad_norm": 4.478226184844971, "learning_rate": 2.019558362329782e-07, "loss": 0.443, "num_input_tokens_seen": 178253120, "step": 146580 }, { "epoch": 18.36674602180178, "grad_norm": 4.698506832122803, "learning_rate": 2.0180205371522155e-07, "loss": 0.398, "num_input_tokens_seen": 178259360, "step": 146585 }, { "epoch": 18.367372509710563, "grad_norm": 18.83683967590332, "learning_rate": 2.0164832856395534e-07, "loss": 0.4551, "num_input_tokens_seen": 178265248, "step": 146590 }, { "epoch": 18.367998997619345, "grad_norm": 26.567642211914062, "learning_rate": 2.0149466078101864e-07, "loss": 0.47, "num_input_tokens_seen": 178270912, "step": 146595 }, { "epoch": 18.368625485528128, "grad_norm": 23.475399017333984, "learning_rate": 2.013410503682467e-07, "loss": 0.4943, "num_input_tokens_seen": 178275904, "step": 146600 }, { "epoch": 18.369251973436914, "grad_norm": 6.140089511871338, "learning_rate": 2.0118749732747688e-07, "loss": 0.4135, "num_input_tokens_seen": 178282240, "step": 146605 }, { "epoch": 18.369878461345696, "grad_norm": 7.974173545837402, "learning_rate": 2.0103400166054554e-07, "loss": 0.427, "num_input_tokens_seen": 178288384, "step": 146610 }, { "epoch": 18.370504949254478, "grad_norm": 8.858697891235352, "learning_rate": 2.008805633692873e-07, "loss": 0.4418, "num_input_tokens_seen": 178294464, "step": 146615 }, { "epoch": 18.371131437163264, "grad_norm": 5.410797595977783, "learning_rate": 2.007271824555368e-07, "loss": 0.5129, "num_input_tokens_seen": 178300288, "step": 146620 }, { "epoch": 18.371757925072046, "grad_norm": 7.521639347076416, "learning_rate": 2.0057385892112868e-07, "loss": 0.4644, "num_input_tokens_seen": 178306272, "step": 146625 }, { "epoch": 18.37238441298083, "grad_norm": 7.385921001434326, "learning_rate": 2.0042059276789428e-07, "loss": 0.4343, "num_input_tokens_seen": 178312320, "step": 146630 }, { "epoch": 18.373010900889614, "grad_norm": 6.268392562866211, "learning_rate": 2.002673839976671e-07, "loss": 0.4274, "num_input_tokens_seen": 178318368, "step": 146635 }, { "epoch": 18.373637388798397, "grad_norm": 9.483339309692383, "learning_rate": 2.0011423261227902e-07, "loss": 0.4745, "num_input_tokens_seen": 178324576, "step": 146640 }, { "epoch": 18.37426387670718, "grad_norm": 5.712663650512695, "learning_rate": 1.999611386135608e-07, "loss": 0.5346, "num_input_tokens_seen": 178330304, "step": 146645 }, { "epoch": 18.374890364615965, "grad_norm": 4.7750468254089355, "learning_rate": 1.9980810200334267e-07, "loss": 0.449, "num_input_tokens_seen": 178336256, "step": 146650 }, { "epoch": 18.375516852524747, "grad_norm": 6.942087650299072, "learning_rate": 1.9965512278345478e-07, "loss": 0.3782, "num_input_tokens_seen": 178342176, "step": 146655 }, { "epoch": 18.37614334043353, "grad_norm": 16.880151748657227, "learning_rate": 1.9950220095572513e-07, "loss": 0.4973, "num_input_tokens_seen": 178348288, "step": 146660 }, { "epoch": 18.37676982834231, "grad_norm": 4.886198997497559, "learning_rate": 1.9934933652198286e-07, "loss": 0.3969, "num_input_tokens_seen": 178354784, "step": 146665 }, { "epoch": 18.377396316251097, "grad_norm": 5.963059425354004, "learning_rate": 1.9919652948405587e-07, "loss": 0.41, "num_input_tokens_seen": 178361056, "step": 146670 }, { "epoch": 18.37802280415988, "grad_norm": 5.816525936126709, "learning_rate": 1.9904377984377e-07, "loss": 0.4337, "num_input_tokens_seen": 178367072, "step": 146675 }, { "epoch": 18.378649292068662, "grad_norm": 16.623950958251953, "learning_rate": 1.9889108760295318e-07, "loss": 0.4425, "num_input_tokens_seen": 178373216, "step": 146680 }, { "epoch": 18.379275779977448, "grad_norm": 12.825873374938965, "learning_rate": 1.9873845276342896e-07, "loss": 0.4818, "num_input_tokens_seen": 178379584, "step": 146685 }, { "epoch": 18.37990226788623, "grad_norm": 13.3615140914917, "learning_rate": 1.9858587532702367e-07, "loss": 0.414, "num_input_tokens_seen": 178385824, "step": 146690 }, { "epoch": 18.380528755795012, "grad_norm": 4.895733833312988, "learning_rate": 1.9843335529556085e-07, "loss": 0.41, "num_input_tokens_seen": 178391360, "step": 146695 }, { "epoch": 18.381155243703798, "grad_norm": 5.800622940063477, "learning_rate": 1.9828089267086402e-07, "loss": 0.4427, "num_input_tokens_seen": 178397056, "step": 146700 }, { "epoch": 18.38178173161258, "grad_norm": 25.182043075561523, "learning_rate": 1.9812848745475622e-07, "loss": 0.4705, "num_input_tokens_seen": 178403200, "step": 146705 }, { "epoch": 18.382408219521363, "grad_norm": 20.763473510742188, "learning_rate": 1.9797613964906038e-07, "loss": 0.4158, "num_input_tokens_seen": 178409696, "step": 146710 }, { "epoch": 18.383034707430145, "grad_norm": 7.274779796600342, "learning_rate": 1.9782384925559616e-07, "loss": 0.4599, "num_input_tokens_seen": 178414912, "step": 146715 }, { "epoch": 18.38366119533893, "grad_norm": 5.6320881843566895, "learning_rate": 1.9767161627618603e-07, "loss": 0.4357, "num_input_tokens_seen": 178420864, "step": 146720 }, { "epoch": 18.384287683247713, "grad_norm": 8.488419532775879, "learning_rate": 1.9751944071264851e-07, "loss": 0.3853, "num_input_tokens_seen": 178426912, "step": 146725 }, { "epoch": 18.384914171156495, "grad_norm": 6.68581485748291, "learning_rate": 1.9736732256680435e-07, "loss": 0.474, "num_input_tokens_seen": 178432992, "step": 146730 }, { "epoch": 18.38554065906528, "grad_norm": 3.9093830585479736, "learning_rate": 1.9721526184047157e-07, "loss": 0.5507, "num_input_tokens_seen": 178439072, "step": 146735 }, { "epoch": 18.386167146974064, "grad_norm": 10.154501914978027, "learning_rate": 1.9706325853546816e-07, "loss": 0.4469, "num_input_tokens_seen": 178445504, "step": 146740 }, { "epoch": 18.386793634882846, "grad_norm": 4.38157844543457, "learning_rate": 1.9691131265361208e-07, "loss": 0.4205, "num_input_tokens_seen": 178451584, "step": 146745 }, { "epoch": 18.38742012279163, "grad_norm": 3.5081582069396973, "learning_rate": 1.9675942419671856e-07, "loss": 0.4608, "num_input_tokens_seen": 178457504, "step": 146750 }, { "epoch": 18.388046610700414, "grad_norm": 19.477933883666992, "learning_rate": 1.966075931666056e-07, "loss": 0.4632, "num_input_tokens_seen": 178462592, "step": 146755 }, { "epoch": 18.388673098609196, "grad_norm": 6.318033695220947, "learning_rate": 1.964558195650862e-07, "loss": 0.4947, "num_input_tokens_seen": 178468800, "step": 146760 }, { "epoch": 18.38929958651798, "grad_norm": 10.753538131713867, "learning_rate": 1.9630410339397722e-07, "loss": 0.412, "num_input_tokens_seen": 178474784, "step": 146765 }, { "epoch": 18.389926074426764, "grad_norm": 5.036837577819824, "learning_rate": 1.9615244465509053e-07, "loss": 0.4199, "num_input_tokens_seen": 178480800, "step": 146770 }, { "epoch": 18.390552562335547, "grad_norm": 5.20465612411499, "learning_rate": 1.9600084335024084e-07, "loss": 0.4547, "num_input_tokens_seen": 178486848, "step": 146775 }, { "epoch": 18.39117905024433, "grad_norm": 7.909121036529541, "learning_rate": 1.9584929948123943e-07, "loss": 0.3782, "num_input_tokens_seen": 178493056, "step": 146780 }, { "epoch": 18.391805538153115, "grad_norm": 4.6770100593566895, "learning_rate": 1.956978130498993e-07, "loss": 0.4942, "num_input_tokens_seen": 178499488, "step": 146785 }, { "epoch": 18.392432026061897, "grad_norm": 6.791855812072754, "learning_rate": 1.9554638405803016e-07, "loss": 0.4654, "num_input_tokens_seen": 178505792, "step": 146790 }, { "epoch": 18.39305851397068, "grad_norm": 28.282102584838867, "learning_rate": 1.9539501250744386e-07, "loss": 0.446, "num_input_tokens_seen": 178511872, "step": 146795 }, { "epoch": 18.393685001879465, "grad_norm": 8.255107879638672, "learning_rate": 1.952436983999495e-07, "loss": 0.4623, "num_input_tokens_seen": 178517344, "step": 146800 }, { "epoch": 18.394311489788247, "grad_norm": 3.8229050636291504, "learning_rate": 1.9509244173735676e-07, "loss": 0.4446, "num_input_tokens_seen": 178523392, "step": 146805 }, { "epoch": 18.39493797769703, "grad_norm": 11.568195343017578, "learning_rate": 1.9494124252147361e-07, "loss": 0.4274, "num_input_tokens_seen": 178529888, "step": 146810 }, { "epoch": 18.395564465605815, "grad_norm": 5.429938793182373, "learning_rate": 1.9479010075410698e-07, "loss": 0.4229, "num_input_tokens_seen": 178535872, "step": 146815 }, { "epoch": 18.396190953514598, "grad_norm": 8.004232406616211, "learning_rate": 1.9463901643706596e-07, "loss": 0.4425, "num_input_tokens_seen": 178541696, "step": 146820 }, { "epoch": 18.39681744142338, "grad_norm": 5.7349748611450195, "learning_rate": 1.9448798957215464e-07, "loss": 0.4206, "num_input_tokens_seen": 178547872, "step": 146825 }, { "epoch": 18.397443929332162, "grad_norm": 4.42344856262207, "learning_rate": 1.943370201611805e-07, "loss": 0.4039, "num_input_tokens_seen": 178554144, "step": 146830 }, { "epoch": 18.398070417240948, "grad_norm": 4.571386814117432, "learning_rate": 1.9418610820594652e-07, "loss": 0.4065, "num_input_tokens_seen": 178560512, "step": 146835 }, { "epoch": 18.39869690514973, "grad_norm": 6.4522480964660645, "learning_rate": 1.940352537082596e-07, "loss": 0.5264, "num_input_tokens_seen": 178566688, "step": 146840 }, { "epoch": 18.399323393058513, "grad_norm": 11.393410682678223, "learning_rate": 1.938844566699205e-07, "loss": 0.5456, "num_input_tokens_seen": 178572832, "step": 146845 }, { "epoch": 18.3999498809673, "grad_norm": 10.43433666229248, "learning_rate": 1.9373371709273392e-07, "loss": 0.424, "num_input_tokens_seen": 178579008, "step": 146850 }, { "epoch": 18.40057636887608, "grad_norm": 4.321961402893066, "learning_rate": 1.935830349785023e-07, "loss": 0.4472, "num_input_tokens_seen": 178585184, "step": 146855 }, { "epoch": 18.401202856784863, "grad_norm": 5.174942970275879, "learning_rate": 1.9343241032902528e-07, "loss": 0.4558, "num_input_tokens_seen": 178591520, "step": 146860 }, { "epoch": 18.40182934469365, "grad_norm": 6.704780578613281, "learning_rate": 1.932818431461059e-07, "loss": 0.4781, "num_input_tokens_seen": 178597440, "step": 146865 }, { "epoch": 18.40245583260243, "grad_norm": 6.407668113708496, "learning_rate": 1.9313133343154322e-07, "loss": 0.4532, "num_input_tokens_seen": 178603648, "step": 146870 }, { "epoch": 18.403082320511214, "grad_norm": 4.419633388519287, "learning_rate": 1.92980881187137e-07, "loss": 0.4545, "num_input_tokens_seen": 178609088, "step": 146875 }, { "epoch": 18.403708808419996, "grad_norm": 6.6895527839660645, "learning_rate": 1.9283048641468515e-07, "loss": 0.4387, "num_input_tokens_seen": 178614560, "step": 146880 }, { "epoch": 18.40433529632878, "grad_norm": 9.353833198547363, "learning_rate": 1.9268014911598744e-07, "loss": 0.4415, "num_input_tokens_seen": 178620672, "step": 146885 }, { "epoch": 18.404961784237564, "grad_norm": 5.989616870880127, "learning_rate": 1.92529869292839e-07, "loss": 0.3973, "num_input_tokens_seen": 178627072, "step": 146890 }, { "epoch": 18.405588272146346, "grad_norm": 4.87750768661499, "learning_rate": 1.9237964694703902e-07, "loss": 0.4846, "num_input_tokens_seen": 178633248, "step": 146895 }, { "epoch": 18.406214760055132, "grad_norm": 5.815914154052734, "learning_rate": 1.9222948208038218e-07, "loss": 0.3839, "num_input_tokens_seen": 178639264, "step": 146900 }, { "epoch": 18.406841247963914, "grad_norm": 5.39628267288208, "learning_rate": 1.920793746946631e-07, "loss": 0.4307, "num_input_tokens_seen": 178645408, "step": 146905 }, { "epoch": 18.407467735872697, "grad_norm": 6.08644962310791, "learning_rate": 1.9192932479167813e-07, "loss": 0.4333, "num_input_tokens_seen": 178651712, "step": 146910 }, { "epoch": 18.408094223781482, "grad_norm": 13.845110893249512, "learning_rate": 1.9177933237322087e-07, "loss": 0.4963, "num_input_tokens_seen": 178657984, "step": 146915 }, { "epoch": 18.408720711690265, "grad_norm": 15.489550590515137, "learning_rate": 1.916293974410832e-07, "loss": 0.5416, "num_input_tokens_seen": 178664384, "step": 146920 }, { "epoch": 18.409347199599047, "grad_norm": 3.3125081062316895, "learning_rate": 1.9147951999705928e-07, "loss": 0.4044, "num_input_tokens_seen": 178670624, "step": 146925 }, { "epoch": 18.409973687507833, "grad_norm": 36.3415412902832, "learning_rate": 1.9132970004294095e-07, "loss": 0.5053, "num_input_tokens_seen": 178676704, "step": 146930 }, { "epoch": 18.410600175416615, "grad_norm": 5.788298606872559, "learning_rate": 1.9117993758051846e-07, "loss": 0.4898, "num_input_tokens_seen": 178682112, "step": 146935 }, { "epoch": 18.411226663325397, "grad_norm": 4.1638970375061035, "learning_rate": 1.9103023261158315e-07, "loss": 0.4483, "num_input_tokens_seen": 178688224, "step": 146940 }, { "epoch": 18.41185315123418, "grad_norm": 6.193140029907227, "learning_rate": 1.9088058513792418e-07, "loss": 0.4719, "num_input_tokens_seen": 178694688, "step": 146945 }, { "epoch": 18.412479639142965, "grad_norm": 7.749250411987305, "learning_rate": 1.9073099516133174e-07, "loss": 0.43, "num_input_tokens_seen": 178701024, "step": 146950 }, { "epoch": 18.413106127051748, "grad_norm": 10.025907516479492, "learning_rate": 1.905814626835928e-07, "loss": 0.4829, "num_input_tokens_seen": 178706784, "step": 146955 }, { "epoch": 18.41373261496053, "grad_norm": 6.641342639923096, "learning_rate": 1.9043198770649695e-07, "loss": 0.472, "num_input_tokens_seen": 178712768, "step": 146960 }, { "epoch": 18.414359102869316, "grad_norm": 27.98011016845703, "learning_rate": 1.902825702318295e-07, "loss": 0.4292, "num_input_tokens_seen": 178718944, "step": 146965 }, { "epoch": 18.414985590778098, "grad_norm": 7.176637172698975, "learning_rate": 1.9013321026137843e-07, "loss": 0.5153, "num_input_tokens_seen": 178725184, "step": 146970 }, { "epoch": 18.41561207868688, "grad_norm": 3.8900997638702393, "learning_rate": 1.8998390779692787e-07, "loss": 0.4124, "num_input_tokens_seen": 178731296, "step": 146975 }, { "epoch": 18.416238566595666, "grad_norm": 31.435569763183594, "learning_rate": 1.898346628402642e-07, "loss": 0.4665, "num_input_tokens_seen": 178737184, "step": 146980 }, { "epoch": 18.41686505450445, "grad_norm": 21.58401107788086, "learning_rate": 1.8968547539317152e-07, "loss": 0.4338, "num_input_tokens_seen": 178743264, "step": 146985 }, { "epoch": 18.41749154241323, "grad_norm": 5.402585029602051, "learning_rate": 1.8953634545743283e-07, "loss": 0.4549, "num_input_tokens_seen": 178749312, "step": 146990 }, { "epoch": 18.418118030322013, "grad_norm": 15.479527473449707, "learning_rate": 1.8938727303483228e-07, "loss": 0.4702, "num_input_tokens_seen": 178755296, "step": 146995 }, { "epoch": 18.4187445182308, "grad_norm": 7.190760135650635, "learning_rate": 1.8923825812715068e-07, "loss": 0.4268, "num_input_tokens_seen": 178760928, "step": 147000 }, { "epoch": 18.41937100613958, "grad_norm": 10.654053688049316, "learning_rate": 1.8908930073617048e-07, "loss": 0.4278, "num_input_tokens_seen": 178767008, "step": 147005 }, { "epoch": 18.419997494048364, "grad_norm": 3.985593557357788, "learning_rate": 1.8894040086367303e-07, "loss": 0.4599, "num_input_tokens_seen": 178772992, "step": 147010 }, { "epoch": 18.42062398195715, "grad_norm": 7.587100028991699, "learning_rate": 1.8879155851143693e-07, "loss": 0.4079, "num_input_tokens_seen": 178779072, "step": 147015 }, { "epoch": 18.42125046986593, "grad_norm": 6.172248840332031, "learning_rate": 1.886427736812435e-07, "loss": 0.4129, "num_input_tokens_seen": 178785152, "step": 147020 }, { "epoch": 18.421876957774714, "grad_norm": 9.398516654968262, "learning_rate": 1.8849404637487135e-07, "loss": 0.5131, "num_input_tokens_seen": 178791488, "step": 147025 }, { "epoch": 18.4225034456835, "grad_norm": 5.902615070343018, "learning_rate": 1.8834537659409734e-07, "loss": 0.4508, "num_input_tokens_seen": 178797344, "step": 147030 }, { "epoch": 18.423129933592282, "grad_norm": 7.194270133972168, "learning_rate": 1.8819676434069957e-07, "loss": 0.5038, "num_input_tokens_seen": 178803520, "step": 147035 }, { "epoch": 18.423756421501064, "grad_norm": 6.865591049194336, "learning_rate": 1.8804820961645598e-07, "loss": 0.4363, "num_input_tokens_seen": 178809440, "step": 147040 }, { "epoch": 18.42438290940985, "grad_norm": 8.1948881149292, "learning_rate": 1.8789971242314076e-07, "loss": 0.4565, "num_input_tokens_seen": 178815744, "step": 147045 }, { "epoch": 18.425009397318632, "grad_norm": 8.029202461242676, "learning_rate": 1.8775127276253136e-07, "loss": 0.4487, "num_input_tokens_seen": 178821728, "step": 147050 }, { "epoch": 18.425635885227415, "grad_norm": 4.922410488128662, "learning_rate": 1.8760289063640025e-07, "loss": 0.4468, "num_input_tokens_seen": 178827584, "step": 147055 }, { "epoch": 18.426262373136197, "grad_norm": 6.508065223693848, "learning_rate": 1.8745456604652378e-07, "loss": 0.4662, "num_input_tokens_seen": 178833472, "step": 147060 }, { "epoch": 18.426888861044983, "grad_norm": 4.226357936859131, "learning_rate": 1.8730629899467335e-07, "loss": 0.471, "num_input_tokens_seen": 178839584, "step": 147065 }, { "epoch": 18.427515348953765, "grad_norm": 7.130727291107178, "learning_rate": 1.8715808948262303e-07, "loss": 0.4119, "num_input_tokens_seen": 178845440, "step": 147070 }, { "epoch": 18.428141836862547, "grad_norm": 13.169651985168457, "learning_rate": 1.8700993751214313e-07, "loss": 0.4523, "num_input_tokens_seen": 178850976, "step": 147075 }, { "epoch": 18.428768324771333, "grad_norm": 5.561473369598389, "learning_rate": 1.868618430850072e-07, "loss": 0.4724, "num_input_tokens_seen": 178857216, "step": 147080 }, { "epoch": 18.429394812680115, "grad_norm": 8.961189270019531, "learning_rate": 1.8671380620298385e-07, "loss": 0.4285, "num_input_tokens_seen": 178863424, "step": 147085 }, { "epoch": 18.430021300588898, "grad_norm": 5.802813529968262, "learning_rate": 1.8656582686784387e-07, "loss": 0.4474, "num_input_tokens_seen": 178869536, "step": 147090 }, { "epoch": 18.430647788497684, "grad_norm": 3.281010389328003, "learning_rate": 1.8641790508135637e-07, "loss": 0.4552, "num_input_tokens_seen": 178875488, "step": 147095 }, { "epoch": 18.431274276406466, "grad_norm": 3.711292266845703, "learning_rate": 1.8627004084528944e-07, "loss": 0.4345, "num_input_tokens_seen": 178881664, "step": 147100 }, { "epoch": 18.431900764315248, "grad_norm": 4.429257392883301, "learning_rate": 1.8612223416141217e-07, "loss": 0.4261, "num_input_tokens_seen": 178887648, "step": 147105 }, { "epoch": 18.43252725222403, "grad_norm": 8.50538158416748, "learning_rate": 1.8597448503148986e-07, "loss": 0.4878, "num_input_tokens_seen": 178893664, "step": 147110 }, { "epoch": 18.433153740132816, "grad_norm": 3.999126672744751, "learning_rate": 1.858267934572905e-07, "loss": 0.4578, "num_input_tokens_seen": 178900064, "step": 147115 }, { "epoch": 18.4337802280416, "grad_norm": 6.516581058502197, "learning_rate": 1.8567915944057934e-07, "loss": 0.4173, "num_input_tokens_seen": 178906080, "step": 147120 }, { "epoch": 18.43440671595038, "grad_norm": 9.458380699157715, "learning_rate": 1.8553158298312167e-07, "loss": 0.438, "num_input_tokens_seen": 178911616, "step": 147125 }, { "epoch": 18.435033203859167, "grad_norm": 7.855798721313477, "learning_rate": 1.8538406408668163e-07, "loss": 0.4373, "num_input_tokens_seen": 178917856, "step": 147130 }, { "epoch": 18.43565969176795, "grad_norm": 7.990271091461182, "learning_rate": 1.8523660275302335e-07, "loss": 0.4602, "num_input_tokens_seen": 178923904, "step": 147135 }, { "epoch": 18.43628617967673, "grad_norm": 8.69583511352539, "learning_rate": 1.8508919898390876e-07, "loss": 0.4262, "num_input_tokens_seen": 178930048, "step": 147140 }, { "epoch": 18.436912667585517, "grad_norm": 30.396106719970703, "learning_rate": 1.8494185278110145e-07, "loss": 0.5006, "num_input_tokens_seen": 178936352, "step": 147145 }, { "epoch": 18.4375391554943, "grad_norm": 3.947524309158325, "learning_rate": 1.847945641463622e-07, "loss": 0.4204, "num_input_tokens_seen": 178942656, "step": 147150 }, { "epoch": 18.43816564340308, "grad_norm": 12.499787330627441, "learning_rate": 1.84647333081453e-07, "loss": 0.4343, "num_input_tokens_seen": 178949024, "step": 147155 }, { "epoch": 18.438792131311864, "grad_norm": 6.309139728546143, "learning_rate": 1.8450015958813294e-07, "loss": 0.4286, "num_input_tokens_seen": 178955136, "step": 147160 }, { "epoch": 18.43941861922065, "grad_norm": 31.99810028076172, "learning_rate": 1.8435304366816175e-07, "loss": 0.5185, "num_input_tokens_seen": 178961184, "step": 147165 }, { "epoch": 18.440045107129432, "grad_norm": 5.086050033569336, "learning_rate": 1.842059853232997e-07, "loss": 0.4298, "num_input_tokens_seen": 178967328, "step": 147170 }, { "epoch": 18.440671595038214, "grad_norm": 7.44923210144043, "learning_rate": 1.8405898455530314e-07, "loss": 0.4343, "num_input_tokens_seen": 178973600, "step": 147175 }, { "epoch": 18.441298082947, "grad_norm": 5.837561130523682, "learning_rate": 1.839120413659312e-07, "loss": 0.4309, "num_input_tokens_seen": 178979776, "step": 147180 }, { "epoch": 18.441924570855782, "grad_norm": 8.296457290649414, "learning_rate": 1.837651557569392e-07, "loss": 0.404, "num_input_tokens_seen": 178985760, "step": 147185 }, { "epoch": 18.442551058764565, "grad_norm": 11.702153205871582, "learning_rate": 1.8361832773008458e-07, "loss": 0.4298, "num_input_tokens_seen": 178991392, "step": 147190 }, { "epoch": 18.44317754667335, "grad_norm": 5.338576316833496, "learning_rate": 1.8347155728712206e-07, "loss": 0.4817, "num_input_tokens_seen": 178996416, "step": 147195 }, { "epoch": 18.443804034582133, "grad_norm": 3.672025680541992, "learning_rate": 1.833248444298069e-07, "loss": 0.46, "num_input_tokens_seen": 179001792, "step": 147200 }, { "epoch": 18.444430522490915, "grad_norm": 6.392733097076416, "learning_rate": 1.8317818915989216e-07, "loss": 0.367, "num_input_tokens_seen": 179007968, "step": 147205 }, { "epoch": 18.4450570103997, "grad_norm": 7.116328716278076, "learning_rate": 1.8303159147913196e-07, "loss": 0.4315, "num_input_tokens_seen": 179014080, "step": 147210 }, { "epoch": 18.445683498308483, "grad_norm": 7.625176429748535, "learning_rate": 1.8288505138927936e-07, "loss": 0.3956, "num_input_tokens_seen": 179019968, "step": 147215 }, { "epoch": 18.446309986217265, "grad_norm": 8.737661361694336, "learning_rate": 1.8273856889208576e-07, "loss": 0.5275, "num_input_tokens_seen": 179026048, "step": 147220 }, { "epoch": 18.446936474126048, "grad_norm": 7.1933159828186035, "learning_rate": 1.8259214398930247e-07, "loss": 0.4049, "num_input_tokens_seen": 179031936, "step": 147225 }, { "epoch": 18.447562962034834, "grad_norm": 14.732437133789062, "learning_rate": 1.8244577668268036e-07, "loss": 0.4881, "num_input_tokens_seen": 179038080, "step": 147230 }, { "epoch": 18.448189449943616, "grad_norm": 4.889946937561035, "learning_rate": 1.8229946697397026e-07, "loss": 0.4721, "num_input_tokens_seen": 179044032, "step": 147235 }, { "epoch": 18.448815937852398, "grad_norm": 4.970931053161621, "learning_rate": 1.8215321486491965e-07, "loss": 0.3789, "num_input_tokens_seen": 179050272, "step": 147240 }, { "epoch": 18.449442425761184, "grad_norm": 15.09018325805664, "learning_rate": 1.820070203572788e-07, "loss": 0.4142, "num_input_tokens_seen": 179056448, "step": 147245 }, { "epoch": 18.450068913669966, "grad_norm": 4.350208759307861, "learning_rate": 1.8186088345279408e-07, "loss": 0.4685, "num_input_tokens_seen": 179062560, "step": 147250 }, { "epoch": 18.45069540157875, "grad_norm": 6.502542018890381, "learning_rate": 1.8171480415321409e-07, "loss": 0.4563, "num_input_tokens_seen": 179068736, "step": 147255 }, { "epoch": 18.451321889487534, "grad_norm": 4.989405632019043, "learning_rate": 1.8156878246028353e-07, "loss": 0.4167, "num_input_tokens_seen": 179075008, "step": 147260 }, { "epoch": 18.451948377396317, "grad_norm": 9.432239532470703, "learning_rate": 1.8142281837575048e-07, "loss": 0.401, "num_input_tokens_seen": 179080960, "step": 147265 }, { "epoch": 18.4525748653051, "grad_norm": 11.293929100036621, "learning_rate": 1.81276911901358e-07, "loss": 0.4567, "num_input_tokens_seen": 179087232, "step": 147270 }, { "epoch": 18.453201353213885, "grad_norm": 20.200607299804688, "learning_rate": 1.8113106303885187e-07, "loss": 0.5137, "num_input_tokens_seen": 179093440, "step": 147275 }, { "epoch": 18.453827841122667, "grad_norm": 19.54911231994629, "learning_rate": 1.809852717899746e-07, "loss": 0.465, "num_input_tokens_seen": 179099296, "step": 147280 }, { "epoch": 18.45445432903145, "grad_norm": 6.871166706085205, "learning_rate": 1.8083953815647037e-07, "loss": 0.4078, "num_input_tokens_seen": 179105568, "step": 147285 }, { "epoch": 18.45508081694023, "grad_norm": 3.8595519065856934, "learning_rate": 1.8069386214008167e-07, "loss": 0.4905, "num_input_tokens_seen": 179111488, "step": 147290 }, { "epoch": 18.455707304849017, "grad_norm": 18.62755584716797, "learning_rate": 1.805482437425493e-07, "loss": 0.5514, "num_input_tokens_seen": 179117696, "step": 147295 }, { "epoch": 18.4563337927578, "grad_norm": 13.085888862609863, "learning_rate": 1.8040268296561525e-07, "loss": 0.4023, "num_input_tokens_seen": 179123648, "step": 147300 }, { "epoch": 18.456960280666582, "grad_norm": 10.797165870666504, "learning_rate": 1.802571798110181e-07, "loss": 0.4291, "num_input_tokens_seen": 179129792, "step": 147305 }, { "epoch": 18.457586768575368, "grad_norm": 7.8581013679504395, "learning_rate": 1.8011173428049978e-07, "loss": 0.4152, "num_input_tokens_seen": 179135552, "step": 147310 }, { "epoch": 18.45821325648415, "grad_norm": 19.55783462524414, "learning_rate": 1.7996634637579723e-07, "loss": 0.4771, "num_input_tokens_seen": 179141760, "step": 147315 }, { "epoch": 18.458839744392932, "grad_norm": 5.853822708129883, "learning_rate": 1.798210160986491e-07, "loss": 0.4369, "num_input_tokens_seen": 179147872, "step": 147320 }, { "epoch": 18.45946623230172, "grad_norm": 3.678891658782959, "learning_rate": 1.7967574345079397e-07, "loss": 0.519, "num_input_tokens_seen": 179153600, "step": 147325 }, { "epoch": 18.4600927202105, "grad_norm": 7.519381523132324, "learning_rate": 1.7953052843396823e-07, "loss": 0.4116, "num_input_tokens_seen": 179159424, "step": 147330 }, { "epoch": 18.460719208119283, "grad_norm": 19.77199935913086, "learning_rate": 1.7938537104990717e-07, "loss": 0.4811, "num_input_tokens_seen": 179165632, "step": 147335 }, { "epoch": 18.461345696028065, "grad_norm": 22.76494026184082, "learning_rate": 1.792402713003477e-07, "loss": 0.4661, "num_input_tokens_seen": 179171808, "step": 147340 }, { "epoch": 18.46197218393685, "grad_norm": 13.759815216064453, "learning_rate": 1.7909522918702349e-07, "loss": 0.4137, "num_input_tokens_seen": 179177824, "step": 147345 }, { "epoch": 18.462598671845633, "grad_norm": 9.484726905822754, "learning_rate": 1.7895024471166867e-07, "loss": 0.4479, "num_input_tokens_seen": 179184192, "step": 147350 }, { "epoch": 18.463225159754415, "grad_norm": 5.176574230194092, "learning_rate": 1.788053178760174e-07, "loss": 0.4366, "num_input_tokens_seen": 179190528, "step": 147355 }, { "epoch": 18.4638516476632, "grad_norm": 3.680957317352295, "learning_rate": 1.7866044868180164e-07, "loss": 0.446, "num_input_tokens_seen": 179196416, "step": 147360 }, { "epoch": 18.464478135571984, "grad_norm": 8.459593772888184, "learning_rate": 1.7851563713075448e-07, "loss": 0.4244, "num_input_tokens_seen": 179202624, "step": 147365 }, { "epoch": 18.465104623480766, "grad_norm": 4.8706207275390625, "learning_rate": 1.7837088322460617e-07, "loss": 0.4117, "num_input_tokens_seen": 179208640, "step": 147370 }, { "epoch": 18.46573111138955, "grad_norm": 5.07064962387085, "learning_rate": 1.782261869650881e-07, "loss": 0.4536, "num_input_tokens_seen": 179214496, "step": 147375 }, { "epoch": 18.466357599298334, "grad_norm": 3.8074235916137695, "learning_rate": 1.7808154835392887e-07, "loss": 0.4249, "num_input_tokens_seen": 179220896, "step": 147380 }, { "epoch": 18.466984087207116, "grad_norm": 28.890167236328125, "learning_rate": 1.7793696739285992e-07, "loss": 0.4431, "num_input_tokens_seen": 179227008, "step": 147385 }, { "epoch": 18.4676105751159, "grad_norm": 13.940855026245117, "learning_rate": 1.777924440836082e-07, "loss": 0.4385, "num_input_tokens_seen": 179232576, "step": 147390 }, { "epoch": 18.468237063024684, "grad_norm": 3.383025884628296, "learning_rate": 1.7764797842790228e-07, "loss": 0.4293, "num_input_tokens_seen": 179238912, "step": 147395 }, { "epoch": 18.468863550933467, "grad_norm": 8.175899505615234, "learning_rate": 1.7750357042746857e-07, "loss": 0.3978, "num_input_tokens_seen": 179244320, "step": 147400 }, { "epoch": 18.46949003884225, "grad_norm": 5.740516662597656, "learning_rate": 1.773592200840346e-07, "loss": 0.4781, "num_input_tokens_seen": 179250176, "step": 147405 }, { "epoch": 18.470116526751035, "grad_norm": 24.676549911499023, "learning_rate": 1.772149273993262e-07, "loss": 0.5017, "num_input_tokens_seen": 179256160, "step": 147410 }, { "epoch": 18.470743014659817, "grad_norm": 14.407246589660645, "learning_rate": 1.7707069237506757e-07, "loss": 0.4116, "num_input_tokens_seen": 179262336, "step": 147415 }, { "epoch": 18.4713695025686, "grad_norm": 6.472207069396973, "learning_rate": 1.7692651501298342e-07, "loss": 0.4186, "num_input_tokens_seen": 179268000, "step": 147420 }, { "epoch": 18.471995990477385, "grad_norm": 4.988900661468506, "learning_rate": 1.7678239531479845e-07, "loss": 0.4791, "num_input_tokens_seen": 179273984, "step": 147425 }, { "epoch": 18.472622478386167, "grad_norm": 6.732059955596924, "learning_rate": 1.7663833328223467e-07, "loss": 0.5076, "num_input_tokens_seen": 179280000, "step": 147430 }, { "epoch": 18.47324896629495, "grad_norm": 5.904979228973389, "learning_rate": 1.7649432891701512e-07, "loss": 0.4231, "num_input_tokens_seen": 179286624, "step": 147435 }, { "epoch": 18.473875454203736, "grad_norm": 5.825008392333984, "learning_rate": 1.7635038222086122e-07, "loss": 0.5034, "num_input_tokens_seen": 179293152, "step": 147440 }, { "epoch": 18.474501942112518, "grad_norm": 15.626656532287598, "learning_rate": 1.7620649319549377e-07, "loss": 0.4526, "num_input_tokens_seen": 179299360, "step": 147445 }, { "epoch": 18.4751284300213, "grad_norm": 10.053201675415039, "learning_rate": 1.7606266184263422e-07, "loss": 0.4114, "num_input_tokens_seen": 179305408, "step": 147450 }, { "epoch": 18.475754917930082, "grad_norm": 5.7766900062561035, "learning_rate": 1.759188881640006e-07, "loss": 0.4647, "num_input_tokens_seen": 179311104, "step": 147455 }, { "epoch": 18.47638140583887, "grad_norm": 5.354074478149414, "learning_rate": 1.7577517216131267e-07, "loss": 0.4044, "num_input_tokens_seen": 179317248, "step": 147460 }, { "epoch": 18.47700789374765, "grad_norm": 6.294731616973877, "learning_rate": 1.7563151383628796e-07, "loss": 0.428, "num_input_tokens_seen": 179323424, "step": 147465 }, { "epoch": 18.477634381656433, "grad_norm": 14.132451057434082, "learning_rate": 1.7548791319064506e-07, "loss": 0.4333, "num_input_tokens_seen": 179329888, "step": 147470 }, { "epoch": 18.47826086956522, "grad_norm": 6.638960361480713, "learning_rate": 1.7534437022610097e-07, "loss": 0.5386, "num_input_tokens_seen": 179335808, "step": 147475 }, { "epoch": 18.478887357474, "grad_norm": 9.101555824279785, "learning_rate": 1.7520088494437038e-07, "loss": 0.4392, "num_input_tokens_seen": 179342368, "step": 147480 }, { "epoch": 18.479513845382783, "grad_norm": 5.839547634124756, "learning_rate": 1.750574573471703e-07, "loss": 0.3724, "num_input_tokens_seen": 179348480, "step": 147485 }, { "epoch": 18.48014033329157, "grad_norm": 19.21822166442871, "learning_rate": 1.749140874362143e-07, "loss": 0.4418, "num_input_tokens_seen": 179354688, "step": 147490 }, { "epoch": 18.48076682120035, "grad_norm": 9.524393081665039, "learning_rate": 1.7477077521321773e-07, "loss": 0.4905, "num_input_tokens_seen": 179360832, "step": 147495 }, { "epoch": 18.481393309109134, "grad_norm": 12.229260444641113, "learning_rate": 1.7462752067989307e-07, "loss": 0.4474, "num_input_tokens_seen": 179366976, "step": 147500 }, { "epoch": 18.482019797017916, "grad_norm": 6.617541790008545, "learning_rate": 1.7448432383795344e-07, "loss": 0.4231, "num_input_tokens_seen": 179373344, "step": 147505 }, { "epoch": 18.4826462849267, "grad_norm": 7.7049736976623535, "learning_rate": 1.7434118468911077e-07, "loss": 0.3725, "num_input_tokens_seen": 179379456, "step": 147510 }, { "epoch": 18.483272772835484, "grad_norm": 5.333646774291992, "learning_rate": 1.7419810323507646e-07, "loss": 0.4014, "num_input_tokens_seen": 179385664, "step": 147515 }, { "epoch": 18.483899260744266, "grad_norm": 19.377653121948242, "learning_rate": 1.7405507947756083e-07, "loss": 0.5007, "num_input_tokens_seen": 179391936, "step": 147520 }, { "epoch": 18.484525748653052, "grad_norm": 12.006237983703613, "learning_rate": 1.7391211341827418e-07, "loss": 0.4897, "num_input_tokens_seen": 179396832, "step": 147525 }, { "epoch": 18.485152236561834, "grad_norm": 6.836721897125244, "learning_rate": 1.737692050589257e-07, "loss": 0.4395, "num_input_tokens_seen": 179403040, "step": 147530 }, { "epoch": 18.485778724470617, "grad_norm": 6.465260982513428, "learning_rate": 1.7362635440122456e-07, "loss": 0.5077, "num_input_tokens_seen": 179408928, "step": 147535 }, { "epoch": 18.486405212379402, "grad_norm": 27.945480346679688, "learning_rate": 1.734835614468777e-07, "loss": 0.4751, "num_input_tokens_seen": 179414848, "step": 147540 }, { "epoch": 18.487031700288185, "grad_norm": 18.363197326660156, "learning_rate": 1.7334082619759274e-07, "loss": 0.455, "num_input_tokens_seen": 179420960, "step": 147545 }, { "epoch": 18.487658188196967, "grad_norm": 3.3353068828582764, "learning_rate": 1.7319814865507656e-07, "loss": 0.4297, "num_input_tokens_seen": 179427136, "step": 147550 }, { "epoch": 18.488284676105753, "grad_norm": 4.149209976196289, "learning_rate": 1.7305552882103448e-07, "loss": 0.4771, "num_input_tokens_seen": 179433504, "step": 147555 }, { "epoch": 18.488911164014535, "grad_norm": 5.978426456451416, "learning_rate": 1.729129666971724e-07, "loss": 0.4387, "num_input_tokens_seen": 179439616, "step": 147560 }, { "epoch": 18.489537651923317, "grad_norm": 6.656038284301758, "learning_rate": 1.7277046228519333e-07, "loss": 0.4477, "num_input_tokens_seen": 179445888, "step": 147565 }, { "epoch": 18.4901641398321, "grad_norm": 7.894781112670898, "learning_rate": 1.7262801558680265e-07, "loss": 0.4485, "num_input_tokens_seen": 179452256, "step": 147570 }, { "epoch": 18.490790627740886, "grad_norm": 7.188126087188721, "learning_rate": 1.724856266037017e-07, "loss": 0.4974, "num_input_tokens_seen": 179458528, "step": 147575 }, { "epoch": 18.491417115649668, "grad_norm": 9.34946346282959, "learning_rate": 1.7234329533759476e-07, "loss": 0.4543, "num_input_tokens_seen": 179464672, "step": 147580 }, { "epoch": 18.49204360355845, "grad_norm": 7.949962615966797, "learning_rate": 1.7220102179018205e-07, "loss": 0.4829, "num_input_tokens_seen": 179470880, "step": 147585 }, { "epoch": 18.492670091467236, "grad_norm": 9.324424743652344, "learning_rate": 1.7205880596316506e-07, "loss": 0.4336, "num_input_tokens_seen": 179477088, "step": 147590 }, { "epoch": 18.49329657937602, "grad_norm": 10.69836139678955, "learning_rate": 1.7191664785824513e-07, "loss": 0.4455, "num_input_tokens_seen": 179482880, "step": 147595 }, { "epoch": 18.4939230672848, "grad_norm": 6.005423069000244, "learning_rate": 1.7177454747711985e-07, "loss": 0.4894, "num_input_tokens_seen": 179488736, "step": 147600 }, { "epoch": 18.494549555193586, "grad_norm": 5.928592681884766, "learning_rate": 1.7163250482148952e-07, "loss": 0.4576, "num_input_tokens_seen": 179494944, "step": 147605 }, { "epoch": 18.49517604310237, "grad_norm": 16.15915870666504, "learning_rate": 1.714905198930522e-07, "loss": 0.4746, "num_input_tokens_seen": 179501024, "step": 147610 }, { "epoch": 18.49580253101115, "grad_norm": 4.678489685058594, "learning_rate": 1.7134859269350546e-07, "loss": 0.4503, "num_input_tokens_seen": 179507392, "step": 147615 }, { "epoch": 18.496429018919933, "grad_norm": 6.740105152130127, "learning_rate": 1.7120672322454513e-07, "loss": 0.441, "num_input_tokens_seen": 179513152, "step": 147620 }, { "epoch": 18.49705550682872, "grad_norm": 8.062355995178223, "learning_rate": 1.7106491148786874e-07, "loss": 0.505, "num_input_tokens_seen": 179519488, "step": 147625 }, { "epoch": 18.4976819947375, "grad_norm": 5.531294822692871, "learning_rate": 1.7092315748517162e-07, "loss": 0.4303, "num_input_tokens_seen": 179525728, "step": 147630 }, { "epoch": 18.498308482646284, "grad_norm": 5.743072032928467, "learning_rate": 1.70781461218148e-07, "loss": 0.4386, "num_input_tokens_seen": 179532128, "step": 147635 }, { "epoch": 18.49893497055507, "grad_norm": 6.897784233093262, "learning_rate": 1.70639822688492e-07, "loss": 0.4805, "num_input_tokens_seen": 179538048, "step": 147640 }, { "epoch": 18.49956145846385, "grad_norm": 8.604201316833496, "learning_rate": 1.7049824189789733e-07, "loss": 0.4552, "num_input_tokens_seen": 179543936, "step": 147645 }, { "epoch": 18.500187946372634, "grad_norm": 5.21373176574707, "learning_rate": 1.7035671884805648e-07, "loss": 0.4094, "num_input_tokens_seen": 179549760, "step": 147650 }, { "epoch": 18.50081443428142, "grad_norm": 6.504849910736084, "learning_rate": 1.7021525354066148e-07, "loss": 0.3973, "num_input_tokens_seen": 179556064, "step": 147655 }, { "epoch": 18.501440922190202, "grad_norm": 3.7706525325775146, "learning_rate": 1.7007384597740483e-07, "loss": 0.5438, "num_input_tokens_seen": 179562272, "step": 147660 }, { "epoch": 18.502067410098984, "grad_norm": 4.743684768676758, "learning_rate": 1.699324961599752e-07, "loss": 0.4393, "num_input_tokens_seen": 179568576, "step": 147665 }, { "epoch": 18.50269389800777, "grad_norm": 28.641468048095703, "learning_rate": 1.69791204090064e-07, "loss": 0.5079, "num_input_tokens_seen": 179574624, "step": 147670 }, { "epoch": 18.503320385916552, "grad_norm": 13.282620429992676, "learning_rate": 1.6964996976935932e-07, "loss": 0.4438, "num_input_tokens_seen": 179580416, "step": 147675 }, { "epoch": 18.503946873825335, "grad_norm": 5.987240314483643, "learning_rate": 1.6950879319955093e-07, "loss": 0.4725, "num_input_tokens_seen": 179586304, "step": 147680 }, { "epoch": 18.504573361734117, "grad_norm": 11.322819709777832, "learning_rate": 1.693676743823258e-07, "loss": 0.4484, "num_input_tokens_seen": 179592192, "step": 147685 }, { "epoch": 18.505199849642903, "grad_norm": 5.943746566772461, "learning_rate": 1.692266133193715e-07, "loss": 0.4102, "num_input_tokens_seen": 179598144, "step": 147690 }, { "epoch": 18.505826337551685, "grad_norm": 3.5369913578033447, "learning_rate": 1.6908561001237443e-07, "loss": 0.3842, "num_input_tokens_seen": 179603936, "step": 147695 }, { "epoch": 18.506452825460467, "grad_norm": 7.816272735595703, "learning_rate": 1.6894466446302104e-07, "loss": 0.4246, "num_input_tokens_seen": 179609920, "step": 147700 }, { "epoch": 18.507079313369253, "grad_norm": 6.106656551361084, "learning_rate": 1.688037766729955e-07, "loss": 0.4405, "num_input_tokens_seen": 179615904, "step": 147705 }, { "epoch": 18.507705801278036, "grad_norm": 5.982693672180176, "learning_rate": 1.6866294664398318e-07, "loss": 0.3731, "num_input_tokens_seen": 179622208, "step": 147710 }, { "epoch": 18.508332289186818, "grad_norm": 6.718263626098633, "learning_rate": 1.685221743776666e-07, "loss": 0.4039, "num_input_tokens_seen": 179628064, "step": 147715 }, { "epoch": 18.508958777095604, "grad_norm": 5.432086944580078, "learning_rate": 1.683814598757294e-07, "loss": 0.4857, "num_input_tokens_seen": 179634272, "step": 147720 }, { "epoch": 18.509585265004386, "grad_norm": 20.928625106811523, "learning_rate": 1.6824080313985526e-07, "loss": 0.456, "num_input_tokens_seen": 179640672, "step": 147725 }, { "epoch": 18.51021175291317, "grad_norm": 9.046751022338867, "learning_rate": 1.6810020417172336e-07, "loss": 0.4164, "num_input_tokens_seen": 179646784, "step": 147730 }, { "epoch": 18.51083824082195, "grad_norm": 8.176816940307617, "learning_rate": 1.6795966297301625e-07, "loss": 0.4592, "num_input_tokens_seen": 179652992, "step": 147735 }, { "epoch": 18.511464728730736, "grad_norm": 9.328262329101562, "learning_rate": 1.6781917954541372e-07, "loss": 0.4771, "num_input_tokens_seen": 179659456, "step": 147740 }, { "epoch": 18.51209121663952, "grad_norm": 5.704545497894287, "learning_rate": 1.6767875389059607e-07, "loss": 0.4217, "num_input_tokens_seen": 179665504, "step": 147745 }, { "epoch": 18.5127177045483, "grad_norm": 4.700335502624512, "learning_rate": 1.6753838601024142e-07, "loss": 0.4028, "num_input_tokens_seen": 179671072, "step": 147750 }, { "epoch": 18.513344192457087, "grad_norm": 5.085549354553223, "learning_rate": 1.6739807590602896e-07, "loss": 0.4138, "num_input_tokens_seen": 179677408, "step": 147755 }, { "epoch": 18.51397068036587, "grad_norm": 5.125445365905762, "learning_rate": 1.672578235796346e-07, "loss": 0.4084, "num_input_tokens_seen": 179683456, "step": 147760 }, { "epoch": 18.51459716827465, "grad_norm": 3.82277512550354, "learning_rate": 1.6711762903273698e-07, "loss": 0.466, "num_input_tokens_seen": 179689600, "step": 147765 }, { "epoch": 18.515223656183437, "grad_norm": 11.255892753601074, "learning_rate": 1.6697749226701031e-07, "loss": 0.4302, "num_input_tokens_seen": 179695680, "step": 147770 }, { "epoch": 18.51585014409222, "grad_norm": 16.706235885620117, "learning_rate": 1.6683741328413216e-07, "loss": 0.5072, "num_input_tokens_seen": 179701472, "step": 147775 }, { "epoch": 18.516476632001, "grad_norm": 5.926191806793213, "learning_rate": 1.6669739208577563e-07, "loss": 0.4343, "num_input_tokens_seen": 179707712, "step": 147780 }, { "epoch": 18.517103119909784, "grad_norm": 12.47533130645752, "learning_rate": 1.6655742867361546e-07, "loss": 0.4183, "num_input_tokens_seen": 179713568, "step": 147785 }, { "epoch": 18.51772960781857, "grad_norm": 8.724367141723633, "learning_rate": 1.6641752304932534e-07, "loss": 0.4266, "num_input_tokens_seen": 179719328, "step": 147790 }, { "epoch": 18.518356095727352, "grad_norm": 5.224247932434082, "learning_rate": 1.6627767521457673e-07, "loss": 0.4369, "num_input_tokens_seen": 179724672, "step": 147795 }, { "epoch": 18.518982583636134, "grad_norm": 6.772347450256348, "learning_rate": 1.6613788517104378e-07, "loss": 0.4599, "num_input_tokens_seen": 179730816, "step": 147800 }, { "epoch": 18.51960907154492, "grad_norm": 6.548843860626221, "learning_rate": 1.6599815292039524e-07, "loss": 0.464, "num_input_tokens_seen": 179736768, "step": 147805 }, { "epoch": 18.520235559453702, "grad_norm": 25.655954360961914, "learning_rate": 1.658584784643036e-07, "loss": 0.4731, "num_input_tokens_seen": 179742976, "step": 147810 }, { "epoch": 18.520862047362485, "grad_norm": 5.017922401428223, "learning_rate": 1.6571886180443807e-07, "loss": 0.4341, "num_input_tokens_seen": 179748544, "step": 147815 }, { "epoch": 18.52148853527127, "grad_norm": 9.12903118133545, "learning_rate": 1.6557930294246848e-07, "loss": 0.4037, "num_input_tokens_seen": 179754656, "step": 147820 }, { "epoch": 18.522115023180053, "grad_norm": 5.1888108253479, "learning_rate": 1.654398018800618e-07, "loss": 0.4339, "num_input_tokens_seen": 179760896, "step": 147825 }, { "epoch": 18.522741511088835, "grad_norm": 5.649564743041992, "learning_rate": 1.6530035861888728e-07, "loss": 0.4348, "num_input_tokens_seen": 179766976, "step": 147830 }, { "epoch": 18.52336799899762, "grad_norm": 7.126667022705078, "learning_rate": 1.6516097316061242e-07, "loss": 0.4405, "num_input_tokens_seen": 179772832, "step": 147835 }, { "epoch": 18.523994486906403, "grad_norm": 6.691122531890869, "learning_rate": 1.650216455069026e-07, "loss": 0.4415, "num_input_tokens_seen": 179778400, "step": 147840 }, { "epoch": 18.524620974815186, "grad_norm": 7.4869208335876465, "learning_rate": 1.6488237565942366e-07, "loss": 0.4432, "num_input_tokens_seen": 179784640, "step": 147845 }, { "epoch": 18.525247462723968, "grad_norm": 23.07906150817871, "learning_rate": 1.6474316361984155e-07, "loss": 0.5089, "num_input_tokens_seen": 179790720, "step": 147850 }, { "epoch": 18.525873950632754, "grad_norm": 23.929048538208008, "learning_rate": 1.6460400938982046e-07, "loss": 0.5047, "num_input_tokens_seen": 179796864, "step": 147855 }, { "epoch": 18.526500438541536, "grad_norm": 5.922513484954834, "learning_rate": 1.6446491297102296e-07, "loss": 0.3727, "num_input_tokens_seen": 179803008, "step": 147860 }, { "epoch": 18.527126926450318, "grad_norm": 4.536120891571045, "learning_rate": 1.6432587436511382e-07, "loss": 0.4172, "num_input_tokens_seen": 179809120, "step": 147865 }, { "epoch": 18.527753414359104, "grad_norm": 13.955735206604004, "learning_rate": 1.6418689357375394e-07, "loss": 0.4335, "num_input_tokens_seen": 179815584, "step": 147870 }, { "epoch": 18.528379902267886, "grad_norm": 5.532036304473877, "learning_rate": 1.640479705986059e-07, "loss": 0.371, "num_input_tokens_seen": 179821632, "step": 147875 }, { "epoch": 18.52900639017667, "grad_norm": 9.08171558380127, "learning_rate": 1.6390910544132944e-07, "loss": 0.477, "num_input_tokens_seen": 179827776, "step": 147880 }, { "epoch": 18.529632878085454, "grad_norm": 3.4972984790802, "learning_rate": 1.6377029810358658e-07, "loss": 0.4562, "num_input_tokens_seen": 179833920, "step": 147885 }, { "epoch": 18.530259365994237, "grad_norm": 4.437992095947266, "learning_rate": 1.6363154858703488e-07, "loss": 0.4297, "num_input_tokens_seen": 179840128, "step": 147890 }, { "epoch": 18.53088585390302, "grad_norm": 22.925003051757812, "learning_rate": 1.6349285689333526e-07, "loss": 0.4685, "num_input_tokens_seen": 179846496, "step": 147895 }, { "epoch": 18.531512341811805, "grad_norm": 6.302866458892822, "learning_rate": 1.6335422302414415e-07, "loss": 0.456, "num_input_tokens_seen": 179852352, "step": 147900 }, { "epoch": 18.532138829720587, "grad_norm": 14.427918434143066, "learning_rate": 1.632156469811197e-07, "loss": 0.524, "num_input_tokens_seen": 179858400, "step": 147905 }, { "epoch": 18.53276531762937, "grad_norm": 8.326260566711426, "learning_rate": 1.6307712876591886e-07, "loss": 0.4377, "num_input_tokens_seen": 179864608, "step": 147910 }, { "epoch": 18.53339180553815, "grad_norm": 9.816259384155273, "learning_rate": 1.6293866838019756e-07, "loss": 0.4032, "num_input_tokens_seen": 179870080, "step": 147915 }, { "epoch": 18.534018293446938, "grad_norm": 4.170977592468262, "learning_rate": 1.628002658256117e-07, "loss": 0.4275, "num_input_tokens_seen": 179876256, "step": 147920 }, { "epoch": 18.53464478135572, "grad_norm": 7.047199726104736, "learning_rate": 1.6266192110381496e-07, "loss": 0.4727, "num_input_tokens_seen": 179882656, "step": 147925 }, { "epoch": 18.535271269264502, "grad_norm": 5.862823009490967, "learning_rate": 1.6252363421646266e-07, "loss": 0.4528, "num_input_tokens_seen": 179888832, "step": 147930 }, { "epoch": 18.535897757173288, "grad_norm": 27.40644073486328, "learning_rate": 1.6238540516520684e-07, "loss": 0.5442, "num_input_tokens_seen": 179895104, "step": 147935 }, { "epoch": 18.53652424508207, "grad_norm": 12.038878440856934, "learning_rate": 1.622472339517006e-07, "loss": 0.5188, "num_input_tokens_seen": 179901408, "step": 147940 }, { "epoch": 18.537150732990852, "grad_norm": 4.4486775398254395, "learning_rate": 1.6210912057759597e-07, "loss": 0.3845, "num_input_tokens_seen": 179907552, "step": 147945 }, { "epoch": 18.53777722089964, "grad_norm": 9.841463088989258, "learning_rate": 1.6197106504454497e-07, "loss": 0.442, "num_input_tokens_seen": 179913472, "step": 147950 }, { "epoch": 18.53840370880842, "grad_norm": 7.230851173400879, "learning_rate": 1.618330673541968e-07, "loss": 0.4127, "num_input_tokens_seen": 179919808, "step": 147955 }, { "epoch": 18.539030196717203, "grad_norm": 15.14123249053955, "learning_rate": 1.6169512750820294e-07, "loss": 0.413, "num_input_tokens_seen": 179925824, "step": 147960 }, { "epoch": 18.539656684625985, "grad_norm": 5.560218811035156, "learning_rate": 1.6155724550821094e-07, "loss": 0.4371, "num_input_tokens_seen": 179931808, "step": 147965 }, { "epoch": 18.54028317253477, "grad_norm": 4.035639762878418, "learning_rate": 1.6141942135586953e-07, "loss": 0.3987, "num_input_tokens_seen": 179937888, "step": 147970 }, { "epoch": 18.540909660443553, "grad_norm": 2.4925479888916016, "learning_rate": 1.6128165505282788e-07, "loss": 0.4496, "num_input_tokens_seen": 179944000, "step": 147975 }, { "epoch": 18.541536148352336, "grad_norm": 18.39095115661621, "learning_rate": 1.6114394660073196e-07, "loss": 0.5936, "num_input_tokens_seen": 179950080, "step": 147980 }, { "epoch": 18.54216263626112, "grad_norm": 4.486668109893799, "learning_rate": 1.6100629600122875e-07, "loss": 0.4083, "num_input_tokens_seen": 179955424, "step": 147985 }, { "epoch": 18.542789124169904, "grad_norm": 3.2972850799560547, "learning_rate": 1.6086870325596305e-07, "loss": 0.4512, "num_input_tokens_seen": 179961152, "step": 147990 }, { "epoch": 18.543415612078686, "grad_norm": 11.30068302154541, "learning_rate": 1.6073116836658131e-07, "loss": 0.4758, "num_input_tokens_seen": 179966400, "step": 147995 }, { "epoch": 18.54404209998747, "grad_norm": 11.361199378967285, "learning_rate": 1.6059369133472612e-07, "loss": 0.5166, "num_input_tokens_seen": 179971776, "step": 148000 }, { "epoch": 18.544668587896254, "grad_norm": 30.88274574279785, "learning_rate": 1.6045627216204285e-07, "loss": 0.4534, "num_input_tokens_seen": 179977856, "step": 148005 }, { "epoch": 18.545295075805036, "grad_norm": 3.471006155014038, "learning_rate": 1.603189108501735e-07, "loss": 0.3961, "num_input_tokens_seen": 179983936, "step": 148010 }, { "epoch": 18.54592156371382, "grad_norm": 5.093684196472168, "learning_rate": 1.6018160740076062e-07, "loss": 0.3862, "num_input_tokens_seen": 179990208, "step": 148015 }, { "epoch": 18.546548051622604, "grad_norm": 7.504204273223877, "learning_rate": 1.600443618154457e-07, "loss": 0.4308, "num_input_tokens_seen": 179996256, "step": 148020 }, { "epoch": 18.547174539531387, "grad_norm": 3.6329145431518555, "learning_rate": 1.599071740958691e-07, "loss": 0.4216, "num_input_tokens_seen": 180002304, "step": 148025 }, { "epoch": 18.54780102744017, "grad_norm": 33.041725158691406, "learning_rate": 1.5977004424367225e-07, "loss": 0.4925, "num_input_tokens_seen": 180008320, "step": 148030 }, { "epoch": 18.548427515348955, "grad_norm": 5.4314775466918945, "learning_rate": 1.5963297226049335e-07, "loss": 0.4332, "num_input_tokens_seen": 180014496, "step": 148035 }, { "epoch": 18.549054003257737, "grad_norm": 6.9835205078125, "learning_rate": 1.5949595814797215e-07, "loss": 0.5433, "num_input_tokens_seen": 180020384, "step": 148040 }, { "epoch": 18.54968049116652, "grad_norm": 4.676926136016846, "learning_rate": 1.5935900190774678e-07, "loss": 0.4351, "num_input_tokens_seen": 180026464, "step": 148045 }, { "epoch": 18.550306979075305, "grad_norm": 28.08585548400879, "learning_rate": 1.5922210354145374e-07, "loss": 0.4375, "num_input_tokens_seen": 180032192, "step": 148050 }, { "epoch": 18.550933466984088, "grad_norm": 25.439319610595703, "learning_rate": 1.5908526305073058e-07, "loss": 0.4248, "num_input_tokens_seen": 180038336, "step": 148055 }, { "epoch": 18.55155995489287, "grad_norm": 7.689571380615234, "learning_rate": 1.5894848043721323e-07, "loss": 0.499, "num_input_tokens_seen": 180044384, "step": 148060 }, { "epoch": 18.552186442801656, "grad_norm": 4.437832832336426, "learning_rate": 1.5881175570253648e-07, "loss": 0.422, "num_input_tokens_seen": 180050240, "step": 148065 }, { "epoch": 18.552812930710438, "grad_norm": 13.149046897888184, "learning_rate": 1.5867508884833628e-07, "loss": 0.4257, "num_input_tokens_seen": 180056416, "step": 148070 }, { "epoch": 18.55343941861922, "grad_norm": 4.518500328063965, "learning_rate": 1.5853847987624516e-07, "loss": 0.4286, "num_input_tokens_seen": 180062464, "step": 148075 }, { "epoch": 18.554065906528002, "grad_norm": 5.8778252601623535, "learning_rate": 1.584019287878974e-07, "loss": 0.4335, "num_input_tokens_seen": 180068672, "step": 148080 }, { "epoch": 18.55469239443679, "grad_norm": 4.107324123382568, "learning_rate": 1.5826543558492446e-07, "loss": 0.4628, "num_input_tokens_seen": 180074720, "step": 148085 }, { "epoch": 18.55531888234557, "grad_norm": 5.788047790527344, "learning_rate": 1.581290002689595e-07, "loss": 0.4547, "num_input_tokens_seen": 180080896, "step": 148090 }, { "epoch": 18.555945370254353, "grad_norm": 5.857211112976074, "learning_rate": 1.579926228416334e-07, "loss": 0.4838, "num_input_tokens_seen": 180086816, "step": 148095 }, { "epoch": 18.55657185816314, "grad_norm": 6.333033084869385, "learning_rate": 1.5785630330457547e-07, "loss": 0.4135, "num_input_tokens_seen": 180093312, "step": 148100 }, { "epoch": 18.55719834607192, "grad_norm": 4.447852611541748, "learning_rate": 1.5772004165941767e-07, "loss": 0.3941, "num_input_tokens_seen": 180099712, "step": 148105 }, { "epoch": 18.557824833980703, "grad_norm": 16.866600036621094, "learning_rate": 1.5758383790778708e-07, "loss": 0.4636, "num_input_tokens_seen": 180105664, "step": 148110 }, { "epoch": 18.55845132188949, "grad_norm": 21.194398880004883, "learning_rate": 1.5744769205131348e-07, "loss": 0.4187, "num_input_tokens_seen": 180111872, "step": 148115 }, { "epoch": 18.55907780979827, "grad_norm": 6.2396559715271, "learning_rate": 1.5731160409162394e-07, "loss": 0.4054, "num_input_tokens_seen": 180117920, "step": 148120 }, { "epoch": 18.559704297707054, "grad_norm": 21.908418655395508, "learning_rate": 1.57175574030346e-07, "loss": 0.5053, "num_input_tokens_seen": 180124064, "step": 148125 }, { "epoch": 18.560330785615836, "grad_norm": 5.115862846374512, "learning_rate": 1.5703960186910505e-07, "loss": 0.4493, "num_input_tokens_seen": 180130432, "step": 148130 }, { "epoch": 18.56095727352462, "grad_norm": 5.545619487762451, "learning_rate": 1.5690368760952813e-07, "loss": 0.4123, "num_input_tokens_seen": 180136576, "step": 148135 }, { "epoch": 18.561583761433404, "grad_norm": 25.39546012878418, "learning_rate": 1.5676783125323835e-07, "loss": 0.4682, "num_input_tokens_seen": 180142816, "step": 148140 }, { "epoch": 18.562210249342186, "grad_norm": 4.796711444854736, "learning_rate": 1.5663203280186167e-07, "loss": 0.4941, "num_input_tokens_seen": 180148960, "step": 148145 }, { "epoch": 18.562836737250972, "grad_norm": 22.328027725219727, "learning_rate": 1.5649629225702124e-07, "loss": 0.4449, "num_input_tokens_seen": 180155168, "step": 148150 }, { "epoch": 18.563463225159754, "grad_norm": 21.558286666870117, "learning_rate": 1.563606096203396e-07, "loss": 0.4784, "num_input_tokens_seen": 180161120, "step": 148155 }, { "epoch": 18.564089713068537, "grad_norm": 6.238034725189209, "learning_rate": 1.562249848934394e-07, "loss": 0.4161, "num_input_tokens_seen": 180166848, "step": 148160 }, { "epoch": 18.564716200977323, "grad_norm": 10.243520736694336, "learning_rate": 1.560894180779421e-07, "loss": 0.5492, "num_input_tokens_seen": 180173024, "step": 148165 }, { "epoch": 18.565342688886105, "grad_norm": 7.54171085357666, "learning_rate": 1.559539091754686e-07, "loss": 0.3988, "num_input_tokens_seen": 180179264, "step": 148170 }, { "epoch": 18.565969176794887, "grad_norm": 10.678969383239746, "learning_rate": 1.5581845818763764e-07, "loss": 0.4458, "num_input_tokens_seen": 180185312, "step": 148175 }, { "epoch": 18.56659566470367, "grad_norm": 4.16749906539917, "learning_rate": 1.5568306511607123e-07, "loss": 0.4863, "num_input_tokens_seen": 180191456, "step": 148180 }, { "epoch": 18.567222152612455, "grad_norm": 19.589080810546875, "learning_rate": 1.5554772996238532e-07, "loss": 0.5703, "num_input_tokens_seen": 180196736, "step": 148185 }, { "epoch": 18.567848640521238, "grad_norm": 5.445307731628418, "learning_rate": 1.5541245272820027e-07, "loss": 0.4878, "num_input_tokens_seen": 180203232, "step": 148190 }, { "epoch": 18.56847512843002, "grad_norm": 25.302061080932617, "learning_rate": 1.55277233415132e-07, "loss": 0.4703, "num_input_tokens_seen": 180209568, "step": 148195 }, { "epoch": 18.569101616338806, "grad_norm": 7.805586338043213, "learning_rate": 1.5514207202479813e-07, "loss": 0.4477, "num_input_tokens_seen": 180215776, "step": 148200 }, { "epoch": 18.569728104247588, "grad_norm": 13.99141788482666, "learning_rate": 1.5500696855881347e-07, "loss": 0.4331, "num_input_tokens_seen": 180221152, "step": 148205 }, { "epoch": 18.57035459215637, "grad_norm": 21.307647705078125, "learning_rate": 1.5487192301879394e-07, "loss": 0.5008, "num_input_tokens_seen": 180227264, "step": 148210 }, { "epoch": 18.570981080065156, "grad_norm": 7.333461284637451, "learning_rate": 1.5473693540635436e-07, "loss": 0.429, "num_input_tokens_seen": 180233312, "step": 148215 }, { "epoch": 18.57160756797394, "grad_norm": 11.492695808410645, "learning_rate": 1.546020057231079e-07, "loss": 0.425, "num_input_tokens_seen": 180239360, "step": 148220 }, { "epoch": 18.57223405588272, "grad_norm": 7.486364841461182, "learning_rate": 1.5446713397066938e-07, "loss": 0.4804, "num_input_tokens_seen": 180245472, "step": 148225 }, { "epoch": 18.572860543791506, "grad_norm": 11.88440227508545, "learning_rate": 1.5433232015064913e-07, "loss": 0.4269, "num_input_tokens_seen": 180251168, "step": 148230 }, { "epoch": 18.57348703170029, "grad_norm": 3.8165054321289062, "learning_rate": 1.5419756426466092e-07, "loss": 0.4409, "num_input_tokens_seen": 180257536, "step": 148235 }, { "epoch": 18.57411351960907, "grad_norm": 15.992375373840332, "learning_rate": 1.54062866314314e-07, "loss": 0.4151, "num_input_tokens_seen": 180263616, "step": 148240 }, { "epoch": 18.574740007517853, "grad_norm": 5.605353355407715, "learning_rate": 1.5392822630121984e-07, "loss": 0.4746, "num_input_tokens_seen": 180269536, "step": 148245 }, { "epoch": 18.57536649542664, "grad_norm": 28.261995315551758, "learning_rate": 1.5379364422698883e-07, "loss": 0.5856, "num_input_tokens_seen": 180275968, "step": 148250 }, { "epoch": 18.57599298333542, "grad_norm": 11.166028022766113, "learning_rate": 1.5365912009322857e-07, "loss": 0.6151, "num_input_tokens_seen": 180282304, "step": 148255 }, { "epoch": 18.576619471244204, "grad_norm": 4.286324501037598, "learning_rate": 1.5352465390154836e-07, "loss": 0.4621, "num_input_tokens_seen": 180288288, "step": 148260 }, { "epoch": 18.57724595915299, "grad_norm": 10.5263671875, "learning_rate": 1.5339024565355576e-07, "loss": 0.4176, "num_input_tokens_seen": 180294464, "step": 148265 }, { "epoch": 18.57787244706177, "grad_norm": 14.483894348144531, "learning_rate": 1.5325589535085727e-07, "loss": 0.4288, "num_input_tokens_seen": 180299680, "step": 148270 }, { "epoch": 18.578498934970554, "grad_norm": 12.614463806152344, "learning_rate": 1.531216029950594e-07, "loss": 0.4374, "num_input_tokens_seen": 180305824, "step": 148275 }, { "epoch": 18.57912542287934, "grad_norm": 11.785745620727539, "learning_rate": 1.5298736858776863e-07, "loss": 0.4183, "num_input_tokens_seen": 180311904, "step": 148280 }, { "epoch": 18.579751910788122, "grad_norm": 5.290894508361816, "learning_rate": 1.5285319213058814e-07, "loss": 0.4194, "num_input_tokens_seen": 180318592, "step": 148285 }, { "epoch": 18.580378398696904, "grad_norm": 5.524477958679199, "learning_rate": 1.5271907362512385e-07, "loss": 0.4962, "num_input_tokens_seen": 180324576, "step": 148290 }, { "epoch": 18.58100488660569, "grad_norm": 3.4926705360412598, "learning_rate": 1.5258501307297724e-07, "loss": 0.4485, "num_input_tokens_seen": 180330720, "step": 148295 }, { "epoch": 18.581631374514473, "grad_norm": 8.38226318359375, "learning_rate": 1.5245101047575316e-07, "loss": 0.4347, "num_input_tokens_seen": 180336480, "step": 148300 }, { "epoch": 18.582257862423255, "grad_norm": 4.524319648742676, "learning_rate": 1.5231706583505256e-07, "loss": 0.3613, "num_input_tokens_seen": 180343040, "step": 148305 }, { "epoch": 18.582884350332037, "grad_norm": 11.707338333129883, "learning_rate": 1.521831791524775e-07, "loss": 0.4296, "num_input_tokens_seen": 180349344, "step": 148310 }, { "epoch": 18.583510838240823, "grad_norm": 17.9350528717041, "learning_rate": 1.5204935042962775e-07, "loss": 0.4538, "num_input_tokens_seen": 180355392, "step": 148315 }, { "epoch": 18.584137326149605, "grad_norm": 12.309328079223633, "learning_rate": 1.5191557966810433e-07, "loss": 0.5047, "num_input_tokens_seen": 180361088, "step": 148320 }, { "epoch": 18.584763814058388, "grad_norm": 4.033437728881836, "learning_rate": 1.5178186686950592e-07, "loss": 0.459, "num_input_tokens_seen": 180366944, "step": 148325 }, { "epoch": 18.585390301967173, "grad_norm": 31.001670837402344, "learning_rate": 1.5164821203543235e-07, "loss": 0.516, "num_input_tokens_seen": 180372896, "step": 148330 }, { "epoch": 18.586016789875956, "grad_norm": 6.111515522003174, "learning_rate": 1.5151461516747956e-07, "loss": 0.4988, "num_input_tokens_seen": 180379456, "step": 148335 }, { "epoch": 18.586643277784738, "grad_norm": 21.558828353881836, "learning_rate": 1.5138107626724573e-07, "loss": 0.4496, "num_input_tokens_seen": 180385632, "step": 148340 }, { "epoch": 18.587269765693524, "grad_norm": 4.9152445793151855, "learning_rate": 1.5124759533632848e-07, "loss": 0.4166, "num_input_tokens_seen": 180391968, "step": 148345 }, { "epoch": 18.587896253602306, "grad_norm": 20.597761154174805, "learning_rate": 1.5111417237632263e-07, "loss": 0.5072, "num_input_tokens_seen": 180398144, "step": 148350 }, { "epoch": 18.58852274151109, "grad_norm": 3.4124341011047363, "learning_rate": 1.5098080738882304e-07, "loss": 0.4289, "num_input_tokens_seen": 180404288, "step": 148355 }, { "epoch": 18.58914922941987, "grad_norm": 8.881959915161133, "learning_rate": 1.5084750037542505e-07, "loss": 0.459, "num_input_tokens_seen": 180410496, "step": 148360 }, { "epoch": 18.589775717328656, "grad_norm": 7.801432132720947, "learning_rate": 1.5071425133772243e-07, "loss": 0.4577, "num_input_tokens_seen": 180416832, "step": 148365 }, { "epoch": 18.59040220523744, "grad_norm": 8.105220794677734, "learning_rate": 1.5058106027730778e-07, "loss": 0.4776, "num_input_tokens_seen": 180422592, "step": 148370 }, { "epoch": 18.59102869314622, "grad_norm": 6.152912139892578, "learning_rate": 1.5044792719577427e-07, "loss": 0.49, "num_input_tokens_seen": 180428320, "step": 148375 }, { "epoch": 18.591655181055007, "grad_norm": 9.859375953674316, "learning_rate": 1.503148520947123e-07, "loss": 0.4644, "num_input_tokens_seen": 180434624, "step": 148380 }, { "epoch": 18.59228166896379, "grad_norm": 5.957566738128662, "learning_rate": 1.5018183497571448e-07, "loss": 0.43, "num_input_tokens_seen": 180440800, "step": 148385 }, { "epoch": 18.59290815687257, "grad_norm": 13.385318756103516, "learning_rate": 1.5004887584036954e-07, "loss": 0.4336, "num_input_tokens_seen": 180446528, "step": 148390 }, { "epoch": 18.593534644781357, "grad_norm": 4.99269962310791, "learning_rate": 1.4991597469026786e-07, "loss": 0.4448, "num_input_tokens_seen": 180452160, "step": 148395 }, { "epoch": 18.59416113269014, "grad_norm": 8.270516395568848, "learning_rate": 1.4978313152699874e-07, "loss": 0.3871, "num_input_tokens_seen": 180458368, "step": 148400 }, { "epoch": 18.59478762059892, "grad_norm": 20.67146873474121, "learning_rate": 1.4965034635214982e-07, "loss": 0.453, "num_input_tokens_seen": 180464256, "step": 148405 }, { "epoch": 18.595414108507704, "grad_norm": 4.036557197570801, "learning_rate": 1.4951761916730978e-07, "loss": 0.4404, "num_input_tokens_seen": 180470368, "step": 148410 }, { "epoch": 18.59604059641649, "grad_norm": 14.374564170837402, "learning_rate": 1.4938494997406405e-07, "loss": 0.4218, "num_input_tokens_seen": 180476576, "step": 148415 }, { "epoch": 18.596667084325272, "grad_norm": 5.042880058288574, "learning_rate": 1.492523387739997e-07, "loss": 0.4783, "num_input_tokens_seen": 180483008, "step": 148420 }, { "epoch": 18.597293572234054, "grad_norm": 4.077017784118652, "learning_rate": 1.4911978556870154e-07, "loss": 0.485, "num_input_tokens_seen": 180489632, "step": 148425 }, { "epoch": 18.59792006014284, "grad_norm": 8.948989868164062, "learning_rate": 1.48987290359755e-07, "loss": 0.4791, "num_input_tokens_seen": 180495968, "step": 148430 }, { "epoch": 18.598546548051623, "grad_norm": 4.5524702072143555, "learning_rate": 1.4885485314874382e-07, "loss": 0.4258, "num_input_tokens_seen": 180502528, "step": 148435 }, { "epoch": 18.599173035960405, "grad_norm": 7.789414882659912, "learning_rate": 1.487224739372517e-07, "loss": 0.4555, "num_input_tokens_seen": 180508640, "step": 148440 }, { "epoch": 18.59979952386919, "grad_norm": 12.569493293762207, "learning_rate": 1.4859015272686073e-07, "loss": 0.4204, "num_input_tokens_seen": 180514880, "step": 148445 }, { "epoch": 18.600426011777973, "grad_norm": 12.725948333740234, "learning_rate": 1.4845788951915296e-07, "loss": 0.4218, "num_input_tokens_seen": 180520672, "step": 148450 }, { "epoch": 18.601052499686755, "grad_norm": 11.953336715698242, "learning_rate": 1.4832568431571104e-07, "loss": 0.4076, "num_input_tokens_seen": 180527232, "step": 148455 }, { "epoch": 18.60167898759554, "grad_norm": 5.20521879196167, "learning_rate": 1.4819353711811424e-07, "loss": 0.4179, "num_input_tokens_seen": 180533408, "step": 148460 }, { "epoch": 18.602305475504323, "grad_norm": 29.716554641723633, "learning_rate": 1.480614479279424e-07, "loss": 0.4882, "num_input_tokens_seen": 180539808, "step": 148465 }, { "epoch": 18.602931963413106, "grad_norm": 21.29573631286621, "learning_rate": 1.479294167467754e-07, "loss": 0.5024, "num_input_tokens_seen": 180545728, "step": 148470 }, { "epoch": 18.603558451321888, "grad_norm": 7.152844429016113, "learning_rate": 1.4779744357619252e-07, "loss": 0.4242, "num_input_tokens_seen": 180551968, "step": 148475 }, { "epoch": 18.604184939230674, "grad_norm": 12.096467018127441, "learning_rate": 1.476655284177697e-07, "loss": 0.4689, "num_input_tokens_seen": 180558112, "step": 148480 }, { "epoch": 18.604811427139456, "grad_norm": 4.402569770812988, "learning_rate": 1.475336712730857e-07, "loss": 0.4391, "num_input_tokens_seen": 180563936, "step": 148485 }, { "epoch": 18.60543791504824, "grad_norm": 29.237747192382812, "learning_rate": 1.4740187214371592e-07, "loss": 0.5692, "num_input_tokens_seen": 180570048, "step": 148490 }, { "epoch": 18.606064402957024, "grad_norm": 11.702824592590332, "learning_rate": 1.4727013103123743e-07, "loss": 0.5052, "num_input_tokens_seen": 180576160, "step": 148495 }, { "epoch": 18.606690890865806, "grad_norm": 6.325376033782959, "learning_rate": 1.4713844793722344e-07, "loss": 0.546, "num_input_tokens_seen": 180582176, "step": 148500 }, { "epoch": 18.60731737877459, "grad_norm": 13.745118141174316, "learning_rate": 1.470068228632504e-07, "loss": 0.5136, "num_input_tokens_seen": 180588352, "step": 148505 }, { "epoch": 18.607943866683375, "grad_norm": 11.529573440551758, "learning_rate": 1.4687525581088991e-07, "loss": 0.4253, "num_input_tokens_seen": 180594336, "step": 148510 }, { "epoch": 18.608570354592157, "grad_norm": 5.104511737823486, "learning_rate": 1.4674374678171732e-07, "loss": 0.413, "num_input_tokens_seen": 180600064, "step": 148515 }, { "epoch": 18.60919684250094, "grad_norm": 5.63383150100708, "learning_rate": 1.4661229577730252e-07, "loss": 0.4094, "num_input_tokens_seen": 180606304, "step": 148520 }, { "epoch": 18.609823330409725, "grad_norm": 6.531872749328613, "learning_rate": 1.464809027992181e-07, "loss": 0.4257, "num_input_tokens_seen": 180612384, "step": 148525 }, { "epoch": 18.610449818318507, "grad_norm": 23.78739356994629, "learning_rate": 1.4634956784903565e-07, "loss": 0.4955, "num_input_tokens_seen": 180618624, "step": 148530 }, { "epoch": 18.61107630622729, "grad_norm": 22.307886123657227, "learning_rate": 1.4621829092832496e-07, "loss": 0.5472, "num_input_tokens_seen": 180624544, "step": 148535 }, { "epoch": 18.61170279413607, "grad_norm": 5.725029468536377, "learning_rate": 1.4608707203865535e-07, "loss": 0.4806, "num_input_tokens_seen": 180630336, "step": 148540 }, { "epoch": 18.612329282044858, "grad_norm": 4.211987018585205, "learning_rate": 1.459559111815956e-07, "loss": 0.396, "num_input_tokens_seen": 180636352, "step": 148545 }, { "epoch": 18.61295576995364, "grad_norm": 8.7822847366333, "learning_rate": 1.458248083587138e-07, "loss": 0.3837, "num_input_tokens_seen": 180642368, "step": 148550 }, { "epoch": 18.613582257862422, "grad_norm": 25.90846061706543, "learning_rate": 1.4569376357157772e-07, "loss": 0.4729, "num_input_tokens_seen": 180648224, "step": 148555 }, { "epoch": 18.614208745771208, "grad_norm": 9.647894859313965, "learning_rate": 1.455627768217538e-07, "loss": 0.4113, "num_input_tokens_seen": 180654528, "step": 148560 }, { "epoch": 18.61483523367999, "grad_norm": 22.540983200073242, "learning_rate": 1.4543184811080857e-07, "loss": 0.4555, "num_input_tokens_seen": 180660768, "step": 148565 }, { "epoch": 18.615461721588773, "grad_norm": 6.664167404174805, "learning_rate": 1.4530097744030692e-07, "loss": 0.4113, "num_input_tokens_seen": 180666816, "step": 148570 }, { "epoch": 18.61608820949756, "grad_norm": 13.396793365478516, "learning_rate": 1.4517016481181367e-07, "loss": 0.4438, "num_input_tokens_seen": 180672864, "step": 148575 }, { "epoch": 18.61671469740634, "grad_norm": 5.066447734832764, "learning_rate": 1.450394102268926e-07, "loss": 0.4275, "num_input_tokens_seen": 180679232, "step": 148580 }, { "epoch": 18.617341185315123, "grad_norm": 4.097264289855957, "learning_rate": 1.4490871368710803e-07, "loss": 0.4089, "num_input_tokens_seen": 180685280, "step": 148585 }, { "epoch": 18.617967673223905, "grad_norm": 6.333364963531494, "learning_rate": 1.4477807519402087e-07, "loss": 0.4668, "num_input_tokens_seen": 180691584, "step": 148590 }, { "epoch": 18.61859416113269, "grad_norm": 8.217768669128418, "learning_rate": 1.4464749474919437e-07, "loss": 0.4627, "num_input_tokens_seen": 180697440, "step": 148595 }, { "epoch": 18.619220649041473, "grad_norm": 7.483664512634277, "learning_rate": 1.4451697235418895e-07, "loss": 0.4785, "num_input_tokens_seen": 180703680, "step": 148600 }, { "epoch": 18.619847136950256, "grad_norm": 4.943946838378906, "learning_rate": 1.443865080105661e-07, "loss": 0.4277, "num_input_tokens_seen": 180709792, "step": 148605 }, { "epoch": 18.62047362485904, "grad_norm": 29.08639144897461, "learning_rate": 1.4425610171988402e-07, "loss": 0.41, "num_input_tokens_seen": 180715840, "step": 148610 }, { "epoch": 18.621100112767824, "grad_norm": 7.1662797927856445, "learning_rate": 1.4412575348370317e-07, "loss": 0.5135, "num_input_tokens_seen": 180722016, "step": 148615 }, { "epoch": 18.621726600676606, "grad_norm": 6.304308891296387, "learning_rate": 1.4399546330358116e-07, "loss": 0.4136, "num_input_tokens_seen": 180728384, "step": 148620 }, { "epoch": 18.622353088585392, "grad_norm": 5.490256309509277, "learning_rate": 1.4386523118107676e-07, "loss": 0.3961, "num_input_tokens_seen": 180734848, "step": 148625 }, { "epoch": 18.622979576494174, "grad_norm": 9.175946235656738, "learning_rate": 1.4373505711774592e-07, "loss": 0.4354, "num_input_tokens_seen": 180740576, "step": 148630 }, { "epoch": 18.623606064402956, "grad_norm": 15.822184562683105, "learning_rate": 1.4360494111514578e-07, "loss": 0.4235, "num_input_tokens_seen": 180746336, "step": 148635 }, { "epoch": 18.62423255231174, "grad_norm": 40.17839050292969, "learning_rate": 1.434748831748306e-07, "loss": 0.5248, "num_input_tokens_seen": 180752416, "step": 148640 }, { "epoch": 18.624859040220525, "grad_norm": 8.093711853027344, "learning_rate": 1.4334488329835694e-07, "loss": 0.4169, "num_input_tokens_seen": 180758272, "step": 148645 }, { "epoch": 18.625485528129307, "grad_norm": 10.098031997680664, "learning_rate": 1.4321494148727855e-07, "loss": 0.4209, "num_input_tokens_seen": 180764384, "step": 148650 }, { "epoch": 18.62611201603809, "grad_norm": 11.783955574035645, "learning_rate": 1.4308505774314863e-07, "loss": 0.4399, "num_input_tokens_seen": 180770560, "step": 148655 }, { "epoch": 18.626738503946875, "grad_norm": 5.644643306732178, "learning_rate": 1.4295523206752038e-07, "loss": 0.4032, "num_input_tokens_seen": 180776896, "step": 148660 }, { "epoch": 18.627364991855657, "grad_norm": 6.900822162628174, "learning_rate": 1.4282546446194646e-07, "loss": 0.4944, "num_input_tokens_seen": 180783136, "step": 148665 }, { "epoch": 18.62799147976444, "grad_norm": 11.409886360168457, "learning_rate": 1.4269575492797727e-07, "loss": 0.4798, "num_input_tokens_seen": 180789376, "step": 148670 }, { "epoch": 18.628617967673225, "grad_norm": 18.721494674682617, "learning_rate": 1.4256610346716383e-07, "loss": 0.5569, "num_input_tokens_seen": 180795872, "step": 148675 }, { "epoch": 18.629244455582008, "grad_norm": 6.611042022705078, "learning_rate": 1.4243651008105707e-07, "loss": 0.4137, "num_input_tokens_seen": 180802112, "step": 148680 }, { "epoch": 18.62987094349079, "grad_norm": 4.871006488800049, "learning_rate": 1.4230697477120525e-07, "loss": 0.3825, "num_input_tokens_seen": 180807840, "step": 148685 }, { "epoch": 18.630497431399576, "grad_norm": 10.928213119506836, "learning_rate": 1.4217749753915878e-07, "loss": 0.6285, "num_input_tokens_seen": 180814112, "step": 148690 }, { "epoch": 18.631123919308358, "grad_norm": 7.6728835105896, "learning_rate": 1.4204807838646362e-07, "loss": 0.4609, "num_input_tokens_seen": 180820128, "step": 148695 }, { "epoch": 18.63175040721714, "grad_norm": 3.604767084121704, "learning_rate": 1.4191871731466855e-07, "loss": 0.4044, "num_input_tokens_seen": 180826496, "step": 148700 }, { "epoch": 18.632376895125923, "grad_norm": 19.081331253051758, "learning_rate": 1.41789414325319e-07, "loss": 0.4748, "num_input_tokens_seen": 180831840, "step": 148705 }, { "epoch": 18.63300338303471, "grad_norm": 8.993565559387207, "learning_rate": 1.4166016941996208e-07, "loss": 0.3957, "num_input_tokens_seen": 180837824, "step": 148710 }, { "epoch": 18.63362987094349, "grad_norm": 14.460941314697266, "learning_rate": 1.4153098260014263e-07, "loss": 0.4993, "num_input_tokens_seen": 180844128, "step": 148715 }, { "epoch": 18.634256358852273, "grad_norm": 12.35865306854248, "learning_rate": 1.4140185386740447e-07, "loss": 0.4505, "num_input_tokens_seen": 180849504, "step": 148720 }, { "epoch": 18.63488284676106, "grad_norm": 7.601664066314697, "learning_rate": 1.4127278322329298e-07, "loss": 0.3851, "num_input_tokens_seen": 180855328, "step": 148725 }, { "epoch": 18.63550933466984, "grad_norm": 15.076055526733398, "learning_rate": 1.4114377066934915e-07, "loss": 0.5468, "num_input_tokens_seen": 180861856, "step": 148730 }, { "epoch": 18.636135822578623, "grad_norm": 8.353208541870117, "learning_rate": 1.410148162071179e-07, "loss": 0.4427, "num_input_tokens_seen": 180867968, "step": 148735 }, { "epoch": 18.63676231048741, "grad_norm": 6.851463317871094, "learning_rate": 1.408859198381385e-07, "loss": 0.4378, "num_input_tokens_seen": 180874016, "step": 148740 }, { "epoch": 18.63738879839619, "grad_norm": 11.135991096496582, "learning_rate": 1.407570815639542e-07, "loss": 0.4519, "num_input_tokens_seen": 180880352, "step": 148745 }, { "epoch": 18.638015286304974, "grad_norm": 7.484590530395508, "learning_rate": 1.4062830138610373e-07, "loss": 0.4618, "num_input_tokens_seen": 180886624, "step": 148750 }, { "epoch": 18.638641774213756, "grad_norm": 9.365166664123535, "learning_rate": 1.404995793061281e-07, "loss": 0.4974, "num_input_tokens_seen": 180892768, "step": 148755 }, { "epoch": 18.639268262122542, "grad_norm": 8.74917221069336, "learning_rate": 1.4037091532556502e-07, "loss": 0.4603, "num_input_tokens_seen": 180899008, "step": 148760 }, { "epoch": 18.639894750031324, "grad_norm": 4.322482585906982, "learning_rate": 1.4024230944595374e-07, "loss": 0.5016, "num_input_tokens_seen": 180904448, "step": 148765 }, { "epoch": 18.640521237940106, "grad_norm": 12.83420467376709, "learning_rate": 1.401137616688314e-07, "loss": 0.5019, "num_input_tokens_seen": 180910336, "step": 148770 }, { "epoch": 18.641147725848892, "grad_norm": 5.482497692108154, "learning_rate": 1.3998527199573453e-07, "loss": 0.5204, "num_input_tokens_seen": 180915968, "step": 148775 }, { "epoch": 18.641774213757675, "grad_norm": 5.625445365905762, "learning_rate": 1.398568404282008e-07, "loss": 0.4447, "num_input_tokens_seen": 180922016, "step": 148780 }, { "epoch": 18.642400701666457, "grad_norm": 8.755784034729004, "learning_rate": 1.3972846696776398e-07, "loss": 0.4082, "num_input_tokens_seen": 180928160, "step": 148785 }, { "epoch": 18.643027189575243, "grad_norm": 6.931070327758789, "learning_rate": 1.3960015161596008e-07, "loss": 0.4081, "num_input_tokens_seen": 180934624, "step": 148790 }, { "epoch": 18.643653677484025, "grad_norm": 8.404946327209473, "learning_rate": 1.3947189437432284e-07, "loss": 0.4076, "num_input_tokens_seen": 180940640, "step": 148795 }, { "epoch": 18.644280165392807, "grad_norm": 5.042049884796143, "learning_rate": 1.393436952443855e-07, "loss": 0.5151, "num_input_tokens_seen": 180946912, "step": 148800 }, { "epoch": 18.64490665330159, "grad_norm": 7.300734043121338, "learning_rate": 1.3921555422768073e-07, "loss": 0.4602, "num_input_tokens_seen": 180953088, "step": 148805 }, { "epoch": 18.645533141210375, "grad_norm": 17.288955688476562, "learning_rate": 1.3908747132574062e-07, "loss": 0.42, "num_input_tokens_seen": 180959328, "step": 148810 }, { "epoch": 18.646159629119158, "grad_norm": 8.775620460510254, "learning_rate": 1.3895944654009674e-07, "loss": 0.3758, "num_input_tokens_seen": 180965728, "step": 148815 }, { "epoch": 18.64678611702794, "grad_norm": 6.212303161621094, "learning_rate": 1.3883147987228009e-07, "loss": 0.3861, "num_input_tokens_seen": 180971872, "step": 148820 }, { "epoch": 18.647412604936726, "grad_norm": 5.572683334350586, "learning_rate": 1.387035713238194e-07, "loss": 0.4541, "num_input_tokens_seen": 180977792, "step": 148825 }, { "epoch": 18.648039092845508, "grad_norm": 6.313645839691162, "learning_rate": 1.385757208962446e-07, "loss": 0.4235, "num_input_tokens_seen": 180983808, "step": 148830 }, { "epoch": 18.64866558075429, "grad_norm": 5.851120471954346, "learning_rate": 1.3844792859108503e-07, "loss": 0.4593, "num_input_tokens_seen": 180990112, "step": 148835 }, { "epoch": 18.649292068663076, "grad_norm": 4.618867874145508, "learning_rate": 1.3832019440986723e-07, "loss": 0.4684, "num_input_tokens_seen": 180996032, "step": 148840 }, { "epoch": 18.64991855657186, "grad_norm": 8.411964416503906, "learning_rate": 1.3819251835411996e-07, "loss": 0.4779, "num_input_tokens_seen": 181002176, "step": 148845 }, { "epoch": 18.65054504448064, "grad_norm": 5.450412273406982, "learning_rate": 1.3806490042536758e-07, "loss": 0.5742, "num_input_tokens_seen": 181008224, "step": 148850 }, { "epoch": 18.651171532389426, "grad_norm": 6.4586100578308105, "learning_rate": 1.3793734062513775e-07, "loss": 0.4126, "num_input_tokens_seen": 181014400, "step": 148855 }, { "epoch": 18.65179802029821, "grad_norm": 4.868048191070557, "learning_rate": 1.3780983895495424e-07, "loss": 0.4239, "num_input_tokens_seen": 181020448, "step": 148860 }, { "epoch": 18.65242450820699, "grad_norm": 22.484268188476562, "learning_rate": 1.3768239541634198e-07, "loss": 0.4903, "num_input_tokens_seen": 181026592, "step": 148865 }, { "epoch": 18.653050996115773, "grad_norm": 5.8789448738098145, "learning_rate": 1.3755501001082527e-07, "loss": 0.5246, "num_input_tokens_seen": 181032448, "step": 148870 }, { "epoch": 18.65367748402456, "grad_norm": 16.357126235961914, "learning_rate": 1.3742768273992623e-07, "loss": 0.4601, "num_input_tokens_seen": 181038432, "step": 148875 }, { "epoch": 18.65430397193334, "grad_norm": 12.038792610168457, "learning_rate": 1.3730041360516699e-07, "loss": 0.4003, "num_input_tokens_seen": 181044256, "step": 148880 }, { "epoch": 18.654930459842124, "grad_norm": 4.436081886291504, "learning_rate": 1.3717320260807076e-07, "loss": 0.4298, "num_input_tokens_seen": 181050400, "step": 148885 }, { "epoch": 18.65555694775091, "grad_norm": 5.9773430824279785, "learning_rate": 1.3704604975015635e-07, "loss": 0.3847, "num_input_tokens_seen": 181056672, "step": 148890 }, { "epoch": 18.656183435659692, "grad_norm": 4.733478546142578, "learning_rate": 1.3691895503294527e-07, "loss": 0.4109, "num_input_tokens_seen": 181062272, "step": 148895 }, { "epoch": 18.656809923568474, "grad_norm": 4.005463600158691, "learning_rate": 1.3679191845795693e-07, "loss": 0.4228, "num_input_tokens_seen": 181068320, "step": 148900 }, { "epoch": 18.65743641147726, "grad_norm": 6.391800403594971, "learning_rate": 1.366649400267095e-07, "loss": 0.4173, "num_input_tokens_seen": 181073824, "step": 148905 }, { "epoch": 18.658062899386042, "grad_norm": 4.225095748901367, "learning_rate": 1.3653801974072234e-07, "loss": 0.498, "num_input_tokens_seen": 181080096, "step": 148910 }, { "epoch": 18.658689387294825, "grad_norm": 5.900073051452637, "learning_rate": 1.3641115760151148e-07, "loss": 0.4356, "num_input_tokens_seen": 181086272, "step": 148915 }, { "epoch": 18.65931587520361, "grad_norm": 27.028900146484375, "learning_rate": 1.3628435361059457e-07, "loss": 0.4313, "num_input_tokens_seen": 181091680, "step": 148920 }, { "epoch": 18.659942363112393, "grad_norm": 9.404948234558105, "learning_rate": 1.3615760776948705e-07, "loss": 0.5092, "num_input_tokens_seen": 181097984, "step": 148925 }, { "epoch": 18.660568851021175, "grad_norm": 6.260648727416992, "learning_rate": 1.3603092007970552e-07, "loss": 0.4307, "num_input_tokens_seen": 181104416, "step": 148930 }, { "epoch": 18.661195338929957, "grad_norm": 8.798592567443848, "learning_rate": 1.3590429054276266e-07, "loss": 0.4353, "num_input_tokens_seen": 181110784, "step": 148935 }, { "epoch": 18.661821826838743, "grad_norm": 3.3301751613616943, "learning_rate": 1.3577771916017446e-07, "loss": 0.4092, "num_input_tokens_seen": 181116992, "step": 148940 }, { "epoch": 18.662448314747525, "grad_norm": 13.527565956115723, "learning_rate": 1.356512059334525e-07, "loss": 0.5378, "num_input_tokens_seen": 181123680, "step": 148945 }, { "epoch": 18.663074802656308, "grad_norm": 15.88830280303955, "learning_rate": 1.3552475086411054e-07, "loss": 0.4361, "num_input_tokens_seen": 181129536, "step": 148950 }, { "epoch": 18.663701290565093, "grad_norm": 3.7311480045318604, "learning_rate": 1.3539835395365964e-07, "loss": 0.4648, "num_input_tokens_seen": 181135712, "step": 148955 }, { "epoch": 18.664327778473876, "grad_norm": 8.023974418640137, "learning_rate": 1.3527201520361134e-07, "loss": 0.4609, "num_input_tokens_seen": 181141344, "step": 148960 }, { "epoch": 18.664954266382658, "grad_norm": 10.076179504394531, "learning_rate": 1.3514573461547664e-07, "loss": 0.511, "num_input_tokens_seen": 181147328, "step": 148965 }, { "epoch": 18.665580754291444, "grad_norm": 8.494986534118652, "learning_rate": 1.3501951219076437e-07, "loss": 0.4107, "num_input_tokens_seen": 181153504, "step": 148970 }, { "epoch": 18.666207242200226, "grad_norm": 7.79119348526001, "learning_rate": 1.3489334793098384e-07, "loss": 0.4582, "num_input_tokens_seen": 181159776, "step": 148975 }, { "epoch": 18.66683373010901, "grad_norm": 3.4120442867279053, "learning_rate": 1.3476724183764388e-07, "loss": 0.4753, "num_input_tokens_seen": 181165888, "step": 148980 }, { "epoch": 18.66746021801779, "grad_norm": 9.7144136428833, "learning_rate": 1.3464119391225272e-07, "loss": 0.472, "num_input_tokens_seen": 181171872, "step": 148985 }, { "epoch": 18.668086705926576, "grad_norm": 4.907928943634033, "learning_rate": 1.345152041563158e-07, "loss": 0.3959, "num_input_tokens_seen": 181177728, "step": 148990 }, { "epoch": 18.66871319383536, "grad_norm": 8.662070274353027, "learning_rate": 1.3438927257134083e-07, "loss": 0.4579, "num_input_tokens_seen": 181184000, "step": 148995 }, { "epoch": 18.66933968174414, "grad_norm": 4.0496110916137695, "learning_rate": 1.342633991588327e-07, "loss": 0.4971, "num_input_tokens_seen": 181190176, "step": 149000 }, { "epoch": 18.669966169652927, "grad_norm": 5.258671760559082, "learning_rate": 1.3413758392029685e-07, "loss": 0.4264, "num_input_tokens_seen": 181196192, "step": 149005 }, { "epoch": 18.67059265756171, "grad_norm": 9.669469833374023, "learning_rate": 1.3401182685723657e-07, "loss": 0.4303, "num_input_tokens_seen": 181201792, "step": 149010 }, { "epoch": 18.67121914547049, "grad_norm": 4.2248735427856445, "learning_rate": 1.338861279711562e-07, "loss": 0.404, "num_input_tokens_seen": 181207232, "step": 149015 }, { "epoch": 18.671845633379277, "grad_norm": 4.438258171081543, "learning_rate": 1.337604872635584e-07, "loss": 0.4205, "num_input_tokens_seen": 181213408, "step": 149020 }, { "epoch": 18.67247212128806, "grad_norm": 4.890200138092041, "learning_rate": 1.336349047359453e-07, "loss": 0.4261, "num_input_tokens_seen": 181219456, "step": 149025 }, { "epoch": 18.673098609196842, "grad_norm": 4.973643779754639, "learning_rate": 1.3350938038981852e-07, "loss": 0.3806, "num_input_tokens_seen": 181225664, "step": 149030 }, { "epoch": 18.673725097105624, "grad_norm": 3.8857176303863525, "learning_rate": 1.3338391422667851e-07, "loss": 0.4098, "num_input_tokens_seen": 181231744, "step": 149035 }, { "epoch": 18.67435158501441, "grad_norm": 14.54486083984375, "learning_rate": 1.3325850624802572e-07, "loss": 0.4058, "num_input_tokens_seen": 181237952, "step": 149040 }, { "epoch": 18.674978072923192, "grad_norm": 9.282008171081543, "learning_rate": 1.3313315645535842e-07, "loss": 0.5339, "num_input_tokens_seen": 181243840, "step": 149045 }, { "epoch": 18.675604560831975, "grad_norm": 5.684211730957031, "learning_rate": 1.3300786485017703e-07, "loss": 0.3847, "num_input_tokens_seen": 181249920, "step": 149050 }, { "epoch": 18.67623104874076, "grad_norm": 4.607480049133301, "learning_rate": 1.3288263143397762e-07, "loss": 0.4934, "num_input_tokens_seen": 181255936, "step": 149055 }, { "epoch": 18.676857536649543, "grad_norm": 6.24183464050293, "learning_rate": 1.3275745620825953e-07, "loss": 0.44, "num_input_tokens_seen": 181262112, "step": 149060 }, { "epoch": 18.677484024558325, "grad_norm": 4.408573150634766, "learning_rate": 1.3263233917451711e-07, "loss": 0.4338, "num_input_tokens_seen": 181268320, "step": 149065 }, { "epoch": 18.67811051246711, "grad_norm": 4.626911163330078, "learning_rate": 1.3250728033424753e-07, "loss": 0.4222, "num_input_tokens_seen": 181274464, "step": 149070 }, { "epoch": 18.678737000375893, "grad_norm": 7.391019344329834, "learning_rate": 1.323822796889457e-07, "loss": 0.4197, "num_input_tokens_seen": 181279904, "step": 149075 }, { "epoch": 18.679363488284675, "grad_norm": 4.3145246505737305, "learning_rate": 1.3225733724010647e-07, "loss": 0.4826, "num_input_tokens_seen": 181286240, "step": 149080 }, { "epoch": 18.67998997619346, "grad_norm": 23.5628662109375, "learning_rate": 1.3213245298922316e-07, "loss": 0.5031, "num_input_tokens_seen": 181292352, "step": 149085 }, { "epoch": 18.680616464102243, "grad_norm": 6.640621185302734, "learning_rate": 1.32007626937789e-07, "loss": 0.4317, "num_input_tokens_seen": 181298752, "step": 149090 }, { "epoch": 18.681242952011026, "grad_norm": 14.161381721496582, "learning_rate": 1.3188285908729726e-07, "loss": 0.4681, "num_input_tokens_seen": 181304960, "step": 149095 }, { "epoch": 18.681869439919808, "grad_norm": 3.9606714248657227, "learning_rate": 1.3175814943923782e-07, "loss": 0.3958, "num_input_tokens_seen": 181310816, "step": 149100 }, { "epoch": 18.682495927828594, "grad_norm": 20.35064697265625, "learning_rate": 1.3163349799510339e-07, "loss": 0.4654, "num_input_tokens_seen": 181317088, "step": 149105 }, { "epoch": 18.683122415737376, "grad_norm": 16.112377166748047, "learning_rate": 1.3150890475638333e-07, "loss": 0.465, "num_input_tokens_seen": 181323008, "step": 149110 }, { "epoch": 18.68374890364616, "grad_norm": 6.293659687042236, "learning_rate": 1.313843697245676e-07, "loss": 0.4796, "num_input_tokens_seen": 181329280, "step": 149115 }, { "epoch": 18.684375391554944, "grad_norm": 12.135614395141602, "learning_rate": 1.3125989290114494e-07, "loss": 0.389, "num_input_tokens_seen": 181335168, "step": 149120 }, { "epoch": 18.685001879463726, "grad_norm": 7.038810729980469, "learning_rate": 1.3113547428760365e-07, "loss": 0.4107, "num_input_tokens_seen": 181341344, "step": 149125 }, { "epoch": 18.68562836737251, "grad_norm": 16.483240127563477, "learning_rate": 1.310111138854314e-07, "loss": 0.4217, "num_input_tokens_seen": 181347904, "step": 149130 }, { "epoch": 18.686254855281295, "grad_norm": 10.934754371643066, "learning_rate": 1.3088681169611484e-07, "loss": 0.3999, "num_input_tokens_seen": 181354432, "step": 149135 }, { "epoch": 18.686881343190077, "grad_norm": 23.750993728637695, "learning_rate": 1.3076256772113993e-07, "loss": 0.5672, "num_input_tokens_seen": 181360608, "step": 149140 }, { "epoch": 18.68750783109886, "grad_norm": 10.335089683532715, "learning_rate": 1.306383819619922e-07, "loss": 0.4931, "num_input_tokens_seen": 181366752, "step": 149145 }, { "epoch": 18.68813431900764, "grad_norm": 4.8143415451049805, "learning_rate": 1.3051425442015764e-07, "loss": 0.4223, "num_input_tokens_seen": 181372960, "step": 149150 }, { "epoch": 18.688760806916427, "grad_norm": 5.095506191253662, "learning_rate": 1.3039018509711786e-07, "loss": 0.4429, "num_input_tokens_seen": 181379328, "step": 149155 }, { "epoch": 18.68938729482521, "grad_norm": 10.520976066589355, "learning_rate": 1.3026617399435893e-07, "loss": 0.4343, "num_input_tokens_seen": 181385376, "step": 149160 }, { "epoch": 18.690013782733992, "grad_norm": 5.8356242179870605, "learning_rate": 1.3014222111336073e-07, "loss": 0.4002, "num_input_tokens_seen": 181390720, "step": 149165 }, { "epoch": 18.690640270642778, "grad_norm": 10.438237190246582, "learning_rate": 1.3001832645560763e-07, "loss": 0.4919, "num_input_tokens_seen": 181396800, "step": 149170 }, { "epoch": 18.69126675855156, "grad_norm": 10.25129508972168, "learning_rate": 1.2989449002257958e-07, "loss": 0.4662, "num_input_tokens_seen": 181403040, "step": 149175 }, { "epoch": 18.691893246460342, "grad_norm": 4.950095176696777, "learning_rate": 1.2977071181575706e-07, "loss": 0.5104, "num_input_tokens_seen": 181409152, "step": 149180 }, { "epoch": 18.692519734369128, "grad_norm": 11.526372909545898, "learning_rate": 1.2964699183662055e-07, "loss": 0.4554, "num_input_tokens_seen": 181415616, "step": 149185 }, { "epoch": 18.69314622227791, "grad_norm": 11.872961044311523, "learning_rate": 1.295233300866494e-07, "loss": 0.4239, "num_input_tokens_seen": 181421760, "step": 149190 }, { "epoch": 18.693772710186693, "grad_norm": 9.603999137878418, "learning_rate": 1.2939972656732135e-07, "loss": 0.4339, "num_input_tokens_seen": 181427648, "step": 149195 }, { "epoch": 18.69439919809548, "grad_norm": 4.26294469833374, "learning_rate": 1.2927618128011466e-07, "loss": 0.4218, "num_input_tokens_seen": 181433600, "step": 149200 }, { "epoch": 18.69502568600426, "grad_norm": 8.277653694152832, "learning_rate": 1.2915269422650645e-07, "loss": 0.4588, "num_input_tokens_seen": 181440064, "step": 149205 }, { "epoch": 18.695652173913043, "grad_norm": 11.563441276550293, "learning_rate": 1.2902926540797222e-07, "loss": 0.4768, "num_input_tokens_seen": 181445952, "step": 149210 }, { "epoch": 18.696278661821825, "grad_norm": 33.6645393371582, "learning_rate": 1.289058948259897e-07, "loss": 0.4867, "num_input_tokens_seen": 181452064, "step": 149215 }, { "epoch": 18.69690514973061, "grad_norm": 25.577062606811523, "learning_rate": 1.2878258248203156e-07, "loss": 0.4541, "num_input_tokens_seen": 181457728, "step": 149220 }, { "epoch": 18.697531637639393, "grad_norm": 26.19601058959961, "learning_rate": 1.286593283775739e-07, "loss": 0.4441, "num_input_tokens_seen": 181463744, "step": 149225 }, { "epoch": 18.698158125548176, "grad_norm": 7.015007019042969, "learning_rate": 1.2853613251408882e-07, "loss": 0.5349, "num_input_tokens_seen": 181469888, "step": 149230 }, { "epoch": 18.69878461345696, "grad_norm": 6.147305965423584, "learning_rate": 1.284129948930507e-07, "loss": 0.4499, "num_input_tokens_seen": 181475808, "step": 149235 }, { "epoch": 18.699411101365744, "grad_norm": 7.500572681427002, "learning_rate": 1.2828991551593063e-07, "loss": 0.4282, "num_input_tokens_seen": 181482048, "step": 149240 }, { "epoch": 18.700037589274526, "grad_norm": 7.405003547668457, "learning_rate": 1.2816689438420072e-07, "loss": 0.4735, "num_input_tokens_seen": 181487584, "step": 149245 }, { "epoch": 18.700664077183312, "grad_norm": 6.218685150146484, "learning_rate": 1.2804393149933092e-07, "loss": 0.5238, "num_input_tokens_seen": 181493792, "step": 149250 }, { "epoch": 18.701290565092094, "grad_norm": 8.249533653259277, "learning_rate": 1.2792102686279283e-07, "loss": 0.4397, "num_input_tokens_seen": 181499744, "step": 149255 }, { "epoch": 18.701917053000876, "grad_norm": 16.2177734375, "learning_rate": 1.2779818047605473e-07, "loss": 0.3946, "num_input_tokens_seen": 181505792, "step": 149260 }, { "epoch": 18.70254354090966, "grad_norm": 6.559939384460449, "learning_rate": 1.2767539234058547e-07, "loss": 0.4522, "num_input_tokens_seen": 181511648, "step": 149265 }, { "epoch": 18.703170028818445, "grad_norm": 4.3542256355285645, "learning_rate": 1.275526624578538e-07, "loss": 0.5047, "num_input_tokens_seen": 181518112, "step": 149270 }, { "epoch": 18.703796516727227, "grad_norm": 7.952455043792725, "learning_rate": 1.274299908293264e-07, "loss": 0.4986, "num_input_tokens_seen": 181524256, "step": 149275 }, { "epoch": 18.70442300463601, "grad_norm": 7.861008167266846, "learning_rate": 1.2730737745646982e-07, "loss": 0.3923, "num_input_tokens_seen": 181530496, "step": 149280 }, { "epoch": 18.705049492544795, "grad_norm": 5.4534220695495605, "learning_rate": 1.271848223407507e-07, "loss": 0.4289, "num_input_tokens_seen": 181536448, "step": 149285 }, { "epoch": 18.705675980453577, "grad_norm": 9.115091323852539, "learning_rate": 1.2706232548363394e-07, "loss": 0.4564, "num_input_tokens_seen": 181542528, "step": 149290 }, { "epoch": 18.70630246836236, "grad_norm": 5.189185619354248, "learning_rate": 1.2693988688658344e-07, "loss": 0.4182, "num_input_tokens_seen": 181548416, "step": 149295 }, { "epoch": 18.706928956271145, "grad_norm": 12.869900703430176, "learning_rate": 1.2681750655106407e-07, "loss": 0.4813, "num_input_tokens_seen": 181554432, "step": 149300 }, { "epoch": 18.707555444179928, "grad_norm": 6.530112266540527, "learning_rate": 1.2669518447853856e-07, "loss": 0.4423, "num_input_tokens_seen": 181560672, "step": 149305 }, { "epoch": 18.70818193208871, "grad_norm": 4.830183506011963, "learning_rate": 1.2657292067046968e-07, "loss": 0.3937, "num_input_tokens_seen": 181567168, "step": 149310 }, { "epoch": 18.708808419997496, "grad_norm": 3.701758861541748, "learning_rate": 1.2645071512831787e-07, "loss": 0.4319, "num_input_tokens_seen": 181573312, "step": 149315 }, { "epoch": 18.709434907906278, "grad_norm": 20.585844039916992, "learning_rate": 1.263285678535464e-07, "loss": 0.4486, "num_input_tokens_seen": 181579488, "step": 149320 }, { "epoch": 18.71006139581506, "grad_norm": 8.136435508728027, "learning_rate": 1.2620647884761361e-07, "loss": 0.4693, "num_input_tokens_seen": 181585824, "step": 149325 }, { "epoch": 18.710687883723843, "grad_norm": 26.06183433532715, "learning_rate": 1.2608444811197994e-07, "loss": 0.4858, "num_input_tokens_seen": 181592192, "step": 149330 }, { "epoch": 18.71131437163263, "grad_norm": 9.227882385253906, "learning_rate": 1.2596247564810538e-07, "loss": 0.4322, "num_input_tokens_seen": 181598336, "step": 149335 }, { "epoch": 18.71194085954141, "grad_norm": 12.870146751403809, "learning_rate": 1.2584056145744649e-07, "loss": 0.4438, "num_input_tokens_seen": 181604416, "step": 149340 }, { "epoch": 18.712567347450193, "grad_norm": 12.616250038146973, "learning_rate": 1.2571870554146214e-07, "loss": 0.4471, "num_input_tokens_seen": 181610496, "step": 149345 }, { "epoch": 18.71319383535898, "grad_norm": 7.5736165046691895, "learning_rate": 1.2559690790160838e-07, "loss": 0.4519, "num_input_tokens_seen": 181616992, "step": 149350 }, { "epoch": 18.71382032326776, "grad_norm": 4.014123439788818, "learning_rate": 1.2547516853934183e-07, "loss": 0.4462, "num_input_tokens_seen": 181623424, "step": 149355 }, { "epoch": 18.714446811176543, "grad_norm": 23.76102066040039, "learning_rate": 1.2535348745611797e-07, "loss": 0.4497, "num_input_tokens_seen": 181629504, "step": 149360 }, { "epoch": 18.71507329908533, "grad_norm": 5.551925182342529, "learning_rate": 1.2523186465339177e-07, "loss": 0.4942, "num_input_tokens_seen": 181635488, "step": 149365 }, { "epoch": 18.71569978699411, "grad_norm": 18.158628463745117, "learning_rate": 1.251103001326165e-07, "loss": 0.4374, "num_input_tokens_seen": 181641984, "step": 149370 }, { "epoch": 18.716326274902894, "grad_norm": 4.651543617248535, "learning_rate": 1.249887938952471e-07, "loss": 0.4219, "num_input_tokens_seen": 181648064, "step": 149375 }, { "epoch": 18.716952762811676, "grad_norm": 6.509354114532471, "learning_rate": 1.2486734594273463e-07, "loss": 0.4456, "num_input_tokens_seen": 181654048, "step": 149380 }, { "epoch": 18.717579250720462, "grad_norm": 5.108114719390869, "learning_rate": 1.2474595627653185e-07, "loss": 0.4682, "num_input_tokens_seen": 181660224, "step": 149385 }, { "epoch": 18.718205738629244, "grad_norm": 4.491879463195801, "learning_rate": 1.246246248980898e-07, "loss": 0.4539, "num_input_tokens_seen": 181666144, "step": 149390 }, { "epoch": 18.718832226538026, "grad_norm": 13.553647994995117, "learning_rate": 1.2450335180885952e-07, "loss": 0.4673, "num_input_tokens_seen": 181672192, "step": 149395 }, { "epoch": 18.719458714446812, "grad_norm": 30.366840362548828, "learning_rate": 1.24382137010291e-07, "loss": 0.6261, "num_input_tokens_seen": 181678432, "step": 149400 }, { "epoch": 18.720085202355595, "grad_norm": 19.9169921875, "learning_rate": 1.2426098050383307e-07, "loss": 0.4442, "num_input_tokens_seen": 181684672, "step": 149405 }, { "epoch": 18.720711690264377, "grad_norm": 4.511250972747803, "learning_rate": 1.2413988229093509e-07, "loss": 0.3768, "num_input_tokens_seen": 181690592, "step": 149410 }, { "epoch": 18.721338178173163, "grad_norm": 12.149120330810547, "learning_rate": 1.2401884237304319e-07, "loss": 0.4401, "num_input_tokens_seen": 181696384, "step": 149415 }, { "epoch": 18.721964666081945, "grad_norm": 11.281222343444824, "learning_rate": 1.238978607516067e-07, "loss": 0.4238, "num_input_tokens_seen": 181702464, "step": 149420 }, { "epoch": 18.722591153990727, "grad_norm": 6.0084099769592285, "learning_rate": 1.2377693742807006e-07, "loss": 0.4824, "num_input_tokens_seen": 181708480, "step": 149425 }, { "epoch": 18.72321764189951, "grad_norm": 6.50253963470459, "learning_rate": 1.236560724038799e-07, "loss": 0.4205, "num_input_tokens_seen": 181714400, "step": 149430 }, { "epoch": 18.723844129808295, "grad_norm": 7.780025482177734, "learning_rate": 1.2353526568048113e-07, "loss": 0.3844, "num_input_tokens_seen": 181720544, "step": 149435 }, { "epoch": 18.724470617717078, "grad_norm": 19.063377380371094, "learning_rate": 1.2341451725931873e-07, "loss": 0.5436, "num_input_tokens_seen": 181725984, "step": 149440 }, { "epoch": 18.72509710562586, "grad_norm": 18.57390785217285, "learning_rate": 1.232938271418349e-07, "loss": 0.4657, "num_input_tokens_seen": 181732192, "step": 149445 }, { "epoch": 18.725723593534646, "grad_norm": 6.723109245300293, "learning_rate": 1.2317319532947404e-07, "loss": 0.4446, "num_input_tokens_seen": 181738304, "step": 149450 }, { "epoch": 18.726350081443428, "grad_norm": 22.238304138183594, "learning_rate": 1.2305262182367772e-07, "loss": 0.4723, "num_input_tokens_seen": 181744352, "step": 149455 }, { "epoch": 18.72697656935221, "grad_norm": 4.86754846572876, "learning_rate": 1.2293210662588762e-07, "loss": 0.4749, "num_input_tokens_seen": 181750560, "step": 149460 }, { "epoch": 18.727603057260996, "grad_norm": 7.55257511138916, "learning_rate": 1.2281164973754478e-07, "loss": 0.4236, "num_input_tokens_seen": 181756896, "step": 149465 }, { "epoch": 18.72822954516978, "grad_norm": 7.102635860443115, "learning_rate": 1.2269125116008863e-07, "loss": 0.383, "num_input_tokens_seen": 181763008, "step": 149470 }, { "epoch": 18.72885603307856, "grad_norm": 7.4918107986450195, "learning_rate": 1.225709108949602e-07, "loss": 0.409, "num_input_tokens_seen": 181769024, "step": 149475 }, { "epoch": 18.729482520987347, "grad_norm": 14.434783935546875, "learning_rate": 1.2245062894359615e-07, "loss": 0.5108, "num_input_tokens_seen": 181775264, "step": 149480 }, { "epoch": 18.73010900889613, "grad_norm": 4.001907825469971, "learning_rate": 1.223304053074359e-07, "loss": 0.4478, "num_input_tokens_seen": 181781536, "step": 149485 }, { "epoch": 18.73073549680491, "grad_norm": 8.872804641723633, "learning_rate": 1.2221023998791713e-07, "loss": 0.4879, "num_input_tokens_seen": 181787616, "step": 149490 }, { "epoch": 18.731361984713693, "grad_norm": 5.7866435050964355, "learning_rate": 1.2209013298647543e-07, "loss": 0.5133, "num_input_tokens_seen": 181793792, "step": 149495 }, { "epoch": 18.73198847262248, "grad_norm": 5.881004810333252, "learning_rate": 1.2197008430454737e-07, "loss": 0.3952, "num_input_tokens_seen": 181799776, "step": 149500 }, { "epoch": 18.73261496053126, "grad_norm": 8.537076950073242, "learning_rate": 1.2185009394356851e-07, "loss": 0.4708, "num_input_tokens_seen": 181805760, "step": 149505 }, { "epoch": 18.733241448440044, "grad_norm": 9.50258731842041, "learning_rate": 1.2173016190497267e-07, "loss": 0.4522, "num_input_tokens_seen": 181812000, "step": 149510 }, { "epoch": 18.73386793634883, "grad_norm": 15.881462097167969, "learning_rate": 1.2161028819019426e-07, "loss": 0.4144, "num_input_tokens_seen": 181817984, "step": 149515 }, { "epoch": 18.734494424257612, "grad_norm": 5.341072082519531, "learning_rate": 1.2149047280066716e-07, "loss": 0.4097, "num_input_tokens_seen": 181823616, "step": 149520 }, { "epoch": 18.735120912166394, "grad_norm": 4.81752347946167, "learning_rate": 1.213707157378219e-07, "loss": 0.4487, "num_input_tokens_seen": 181829632, "step": 149525 }, { "epoch": 18.73574740007518, "grad_norm": 7.700404644012451, "learning_rate": 1.2125101700309227e-07, "loss": 0.4115, "num_input_tokens_seen": 181835872, "step": 149530 }, { "epoch": 18.736373887983962, "grad_norm": 5.415541172027588, "learning_rate": 1.211313765979083e-07, "loss": 0.4098, "num_input_tokens_seen": 181842048, "step": 149535 }, { "epoch": 18.737000375892745, "grad_norm": 9.498249053955078, "learning_rate": 1.2101179452370103e-07, "loss": 0.4288, "num_input_tokens_seen": 181847616, "step": 149540 }, { "epoch": 18.73762686380153, "grad_norm": 5.251649379730225, "learning_rate": 1.208922707818988e-07, "loss": 0.4412, "num_input_tokens_seen": 181854016, "step": 149545 }, { "epoch": 18.738253351710313, "grad_norm": 4.247061729431152, "learning_rate": 1.2077280537393265e-07, "loss": 0.421, "num_input_tokens_seen": 181860160, "step": 149550 }, { "epoch": 18.738879839619095, "grad_norm": 12.028939247131348, "learning_rate": 1.2065339830122924e-07, "loss": 0.4102, "num_input_tokens_seen": 181865920, "step": 149555 }, { "epoch": 18.739506327527877, "grad_norm": 10.507491111755371, "learning_rate": 1.2053404956521686e-07, "loss": 0.4451, "num_input_tokens_seen": 181871808, "step": 149560 }, { "epoch": 18.740132815436663, "grad_norm": 20.648801803588867, "learning_rate": 1.204147591673227e-07, "loss": 0.5242, "num_input_tokens_seen": 181878016, "step": 149565 }, { "epoch": 18.740759303345445, "grad_norm": 5.532679557800293, "learning_rate": 1.202955271089723e-07, "loss": 0.4126, "num_input_tokens_seen": 181884320, "step": 149570 }, { "epoch": 18.741385791254228, "grad_norm": 5.13394832611084, "learning_rate": 1.2017635339159118e-07, "loss": 0.4251, "num_input_tokens_seen": 181890560, "step": 149575 }, { "epoch": 18.742012279163013, "grad_norm": 19.73982048034668, "learning_rate": 1.2005723801660485e-07, "loss": 0.4332, "num_input_tokens_seen": 181896800, "step": 149580 }, { "epoch": 18.742638767071796, "grad_norm": 5.030874729156494, "learning_rate": 1.199381809854372e-07, "loss": 0.4912, "num_input_tokens_seen": 181903104, "step": 149585 }, { "epoch": 18.743265254980578, "grad_norm": 10.143412590026855, "learning_rate": 1.198191822995115e-07, "loss": 0.4655, "num_input_tokens_seen": 181909568, "step": 149590 }, { "epoch": 18.743891742889364, "grad_norm": 20.82651138305664, "learning_rate": 1.1970024196024999e-07, "loss": 0.4847, "num_input_tokens_seen": 181915104, "step": 149595 }, { "epoch": 18.744518230798146, "grad_norm": 35.90261459350586, "learning_rate": 1.1958135996907538e-07, "loss": 0.4629, "num_input_tokens_seen": 181921312, "step": 149600 }, { "epoch": 18.74514471870693, "grad_norm": 7.685717582702637, "learning_rate": 1.1946253632740934e-07, "loss": 0.455, "num_input_tokens_seen": 181927456, "step": 149605 }, { "epoch": 18.74577120661571, "grad_norm": 17.044105529785156, "learning_rate": 1.1934377103667182e-07, "loss": 0.4034, "num_input_tokens_seen": 181933856, "step": 149610 }, { "epoch": 18.746397694524497, "grad_norm": 5.374337196350098, "learning_rate": 1.1922506409828283e-07, "loss": 0.4649, "num_input_tokens_seen": 181940064, "step": 149615 }, { "epoch": 18.74702418243328, "grad_norm": 9.978410720825195, "learning_rate": 1.1910641551366176e-07, "loss": 0.4566, "num_input_tokens_seen": 181946048, "step": 149620 }, { "epoch": 18.74765067034206, "grad_norm": 14.713567733764648, "learning_rate": 1.1898782528422747e-07, "loss": 0.4303, "num_input_tokens_seen": 181952256, "step": 149625 }, { "epoch": 18.748277158250847, "grad_norm": 4.172111988067627, "learning_rate": 1.1886929341139663e-07, "loss": 0.4096, "num_input_tokens_seen": 181958400, "step": 149630 }, { "epoch": 18.74890364615963, "grad_norm": 6.534554958343506, "learning_rate": 1.1875081989658755e-07, "loss": 0.4795, "num_input_tokens_seen": 181963840, "step": 149635 }, { "epoch": 18.74953013406841, "grad_norm": 39.011627197265625, "learning_rate": 1.1863240474121685e-07, "loss": 0.4834, "num_input_tokens_seen": 181969856, "step": 149640 }, { "epoch": 18.750156621977197, "grad_norm": 5.88433313369751, "learning_rate": 1.1851404794669951e-07, "loss": 0.3789, "num_input_tokens_seen": 181976000, "step": 149645 }, { "epoch": 18.75078310988598, "grad_norm": 32.22105407714844, "learning_rate": 1.183957495144511e-07, "loss": 0.5221, "num_input_tokens_seen": 181982432, "step": 149650 }, { "epoch": 18.751409597794762, "grad_norm": 15.158076286315918, "learning_rate": 1.182775094458849e-07, "loss": 0.5443, "num_input_tokens_seen": 181988096, "step": 149655 }, { "epoch": 18.752036085703544, "grad_norm": 7.099496364593506, "learning_rate": 1.1815932774241645e-07, "loss": 0.4056, "num_input_tokens_seen": 181994368, "step": 149660 }, { "epoch": 18.75266257361233, "grad_norm": 4.237961769104004, "learning_rate": 1.1804120440545686e-07, "loss": 0.5449, "num_input_tokens_seen": 182000384, "step": 149665 }, { "epoch": 18.753289061521112, "grad_norm": 20.518238067626953, "learning_rate": 1.1792313943641942e-07, "loss": 0.4734, "num_input_tokens_seen": 182006592, "step": 149670 }, { "epoch": 18.753915549429895, "grad_norm": 18.480377197265625, "learning_rate": 1.178051328367158e-07, "loss": 0.4875, "num_input_tokens_seen": 182012224, "step": 149675 }, { "epoch": 18.75454203733868, "grad_norm": 6.713038444519043, "learning_rate": 1.1768718460775652e-07, "loss": 0.4654, "num_input_tokens_seen": 182018432, "step": 149680 }, { "epoch": 18.755168525247463, "grad_norm": 13.117481231689453, "learning_rate": 1.1756929475095103e-07, "loss": 0.3957, "num_input_tokens_seen": 182024576, "step": 149685 }, { "epoch": 18.755795013156245, "grad_norm": 3.295475482940674, "learning_rate": 1.1745146326770984e-07, "loss": 0.4238, "num_input_tokens_seen": 182030048, "step": 149690 }, { "epoch": 18.75642150106503, "grad_norm": 4.150712490081787, "learning_rate": 1.173336901594413e-07, "loss": 0.4417, "num_input_tokens_seen": 182035968, "step": 149695 }, { "epoch": 18.757047988973813, "grad_norm": 12.443787574768066, "learning_rate": 1.1721597542755425e-07, "loss": 0.4333, "num_input_tokens_seen": 182042176, "step": 149700 }, { "epoch": 18.757674476882595, "grad_norm": 5.556978702545166, "learning_rate": 1.1709831907345482e-07, "loss": 0.3961, "num_input_tokens_seen": 182048352, "step": 149705 }, { "epoch": 18.75830096479138, "grad_norm": 20.725385665893555, "learning_rate": 1.1698072109855075e-07, "loss": 0.4843, "num_input_tokens_seen": 182054464, "step": 149710 }, { "epoch": 18.758927452700163, "grad_norm": 6.52719783782959, "learning_rate": 1.1686318150424758e-07, "loss": 0.5068, "num_input_tokens_seen": 182060480, "step": 149715 }, { "epoch": 18.759553940608946, "grad_norm": 12.582662582397461, "learning_rate": 1.1674570029195032e-07, "loss": 0.4206, "num_input_tokens_seen": 182066432, "step": 149720 }, { "epoch": 18.760180428517728, "grad_norm": 5.565160274505615, "learning_rate": 1.1662827746306393e-07, "loss": 0.3745, "num_input_tokens_seen": 182072672, "step": 149725 }, { "epoch": 18.760806916426514, "grad_norm": 28.98779296875, "learning_rate": 1.1651091301899231e-07, "loss": 0.409, "num_input_tokens_seen": 182078848, "step": 149730 }, { "epoch": 18.761433404335296, "grad_norm": 12.642561912536621, "learning_rate": 1.1639360696113877e-07, "loss": 0.4456, "num_input_tokens_seen": 182085280, "step": 149735 }, { "epoch": 18.76205989224408, "grad_norm": 17.89958381652832, "learning_rate": 1.1627635929090498e-07, "loss": 0.4389, "num_input_tokens_seen": 182091328, "step": 149740 }, { "epoch": 18.762686380152864, "grad_norm": 4.858703136444092, "learning_rate": 1.1615917000969368e-07, "loss": 0.4335, "num_input_tokens_seen": 182097408, "step": 149745 }, { "epoch": 18.763312868061647, "grad_norm": 10.34274959564209, "learning_rate": 1.1604203911890544e-07, "loss": 0.4435, "num_input_tokens_seen": 182103776, "step": 149750 }, { "epoch": 18.76393935597043, "grad_norm": 7.519331455230713, "learning_rate": 1.1592496661994079e-07, "loss": 0.4281, "num_input_tokens_seen": 182110016, "step": 149755 }, { "epoch": 18.764565843879215, "grad_norm": 10.151397705078125, "learning_rate": 1.1580795251419974e-07, "loss": 0.3888, "num_input_tokens_seen": 182115936, "step": 149760 }, { "epoch": 18.765192331787997, "grad_norm": 11.604029655456543, "learning_rate": 1.156909968030806e-07, "loss": 0.4205, "num_input_tokens_seen": 182122144, "step": 149765 }, { "epoch": 18.76581881969678, "grad_norm": 6.5929412841796875, "learning_rate": 1.1557409948798226e-07, "loss": 0.4841, "num_input_tokens_seen": 182128192, "step": 149770 }, { "epoch": 18.76644530760556, "grad_norm": 31.88715171813965, "learning_rate": 1.1545726057030248e-07, "loss": 0.5111, "num_input_tokens_seen": 182134208, "step": 149775 }, { "epoch": 18.767071795514347, "grad_norm": 5.227466106414795, "learning_rate": 1.1534048005143795e-07, "loss": 0.5002, "num_input_tokens_seen": 182140384, "step": 149780 }, { "epoch": 18.76769828342313, "grad_norm": 4.83988094329834, "learning_rate": 1.1522375793278417e-07, "loss": 0.5222, "num_input_tokens_seen": 182146144, "step": 149785 }, { "epoch": 18.768324771331912, "grad_norm": 4.248225688934326, "learning_rate": 1.1510709421573785e-07, "loss": 0.4324, "num_input_tokens_seen": 182151968, "step": 149790 }, { "epoch": 18.768951259240698, "grad_norm": 5.354589462280273, "learning_rate": 1.1499048890169228e-07, "loss": 0.3568, "num_input_tokens_seen": 182158016, "step": 149795 }, { "epoch": 18.76957774714948, "grad_norm": 5.546135425567627, "learning_rate": 1.1487394199204305e-07, "loss": 0.4781, "num_input_tokens_seen": 182163808, "step": 149800 }, { "epoch": 18.770204235058262, "grad_norm": 9.307631492614746, "learning_rate": 1.147574534881829e-07, "loss": 0.3977, "num_input_tokens_seen": 182169824, "step": 149805 }, { "epoch": 18.770830722967048, "grad_norm": 6.81404447555542, "learning_rate": 1.1464102339150518e-07, "loss": 0.5071, "num_input_tokens_seen": 182175680, "step": 149810 }, { "epoch": 18.77145721087583, "grad_norm": 5.567633152008057, "learning_rate": 1.14524651703401e-07, "loss": 0.4311, "num_input_tokens_seen": 182181568, "step": 149815 }, { "epoch": 18.772083698784613, "grad_norm": 8.570680618286133, "learning_rate": 1.14408338425262e-07, "loss": 0.4473, "num_input_tokens_seen": 182187040, "step": 149820 }, { "epoch": 18.7727101866934, "grad_norm": 4.986842632293701, "learning_rate": 1.1429208355847931e-07, "loss": 0.4922, "num_input_tokens_seen": 182193152, "step": 149825 }, { "epoch": 18.77333667460218, "grad_norm": 7.403465270996094, "learning_rate": 1.1417588710444183e-07, "loss": 0.4279, "num_input_tokens_seen": 182199424, "step": 149830 }, { "epoch": 18.773963162510963, "grad_norm": 5.465939044952393, "learning_rate": 1.1405974906454009e-07, "loss": 0.443, "num_input_tokens_seen": 182205408, "step": 149835 }, { "epoch": 18.774589650419745, "grad_norm": 7.865548133850098, "learning_rate": 1.1394366944016133e-07, "loss": 0.444, "num_input_tokens_seen": 182211424, "step": 149840 }, { "epoch": 18.77521613832853, "grad_norm": 11.170110702514648, "learning_rate": 1.1382764823269443e-07, "loss": 0.4209, "num_input_tokens_seen": 182217312, "step": 149845 }, { "epoch": 18.775842626237313, "grad_norm": 30.00981903076172, "learning_rate": 1.1371168544352607e-07, "loss": 0.4658, "num_input_tokens_seen": 182223616, "step": 149850 }, { "epoch": 18.776469114146096, "grad_norm": 18.557905197143555, "learning_rate": 1.1359578107404235e-07, "loss": 0.4202, "num_input_tokens_seen": 182229856, "step": 149855 }, { "epoch": 18.77709560205488, "grad_norm": 8.048872947692871, "learning_rate": 1.1347993512562938e-07, "loss": 0.3951, "num_input_tokens_seen": 182236032, "step": 149860 }, { "epoch": 18.777722089963664, "grad_norm": 9.543630599975586, "learning_rate": 1.1336414759967274e-07, "loss": 0.497, "num_input_tokens_seen": 182242176, "step": 149865 }, { "epoch": 18.778348577872446, "grad_norm": 3.331231117248535, "learning_rate": 1.1324841849755519e-07, "loss": 0.3909, "num_input_tokens_seen": 182248416, "step": 149870 }, { "epoch": 18.778975065781232, "grad_norm": 16.055335998535156, "learning_rate": 1.131327478206623e-07, "loss": 0.5232, "num_input_tokens_seen": 182254432, "step": 149875 }, { "epoch": 18.779601553690014, "grad_norm": 13.877531051635742, "learning_rate": 1.1301713557037519e-07, "loss": 0.4528, "num_input_tokens_seen": 182260320, "step": 149880 }, { "epoch": 18.780228041598797, "grad_norm": 4.1889519691467285, "learning_rate": 1.1290158174807719e-07, "loss": 0.4202, "num_input_tokens_seen": 182266496, "step": 149885 }, { "epoch": 18.78085452950758, "grad_norm": 6.46940279006958, "learning_rate": 1.1278608635514998e-07, "loss": 0.4513, "num_input_tokens_seen": 182272384, "step": 149890 }, { "epoch": 18.781481017416365, "grad_norm": 6.3731842041015625, "learning_rate": 1.1267064939297412e-07, "loss": 0.4423, "num_input_tokens_seen": 182278880, "step": 149895 }, { "epoch": 18.782107505325147, "grad_norm": 6.0387282371521, "learning_rate": 1.1255527086292906e-07, "loss": 0.424, "num_input_tokens_seen": 182285056, "step": 149900 }, { "epoch": 18.78273399323393, "grad_norm": 5.218231201171875, "learning_rate": 1.1243995076639535e-07, "loss": 0.3941, "num_input_tokens_seen": 182290912, "step": 149905 }, { "epoch": 18.783360481142715, "grad_norm": 6.452658176422119, "learning_rate": 1.1232468910475137e-07, "loss": 0.4186, "num_input_tokens_seen": 182297120, "step": 149910 }, { "epoch": 18.783986969051497, "grad_norm": 5.557392597198486, "learning_rate": 1.122094858793743e-07, "loss": 0.445, "num_input_tokens_seen": 182303296, "step": 149915 }, { "epoch": 18.78461345696028, "grad_norm": 5.296529769897461, "learning_rate": 1.1209434109164308e-07, "loss": 0.4239, "num_input_tokens_seen": 182309440, "step": 149920 }, { "epoch": 18.785239944869065, "grad_norm": 5.284756183624268, "learning_rate": 1.1197925474293325e-07, "loss": 0.4164, "num_input_tokens_seen": 182314976, "step": 149925 }, { "epoch": 18.785866432777848, "grad_norm": 14.461307525634766, "learning_rate": 1.1186422683462151e-07, "loss": 0.484, "num_input_tokens_seen": 182321344, "step": 149930 }, { "epoch": 18.78649292068663, "grad_norm": 20.057588577270508, "learning_rate": 1.1174925736808174e-07, "loss": 0.4983, "num_input_tokens_seen": 182328064, "step": 149935 }, { "epoch": 18.787119408595416, "grad_norm": 19.21044921875, "learning_rate": 1.1163434634469006e-07, "loss": 0.5101, "num_input_tokens_seen": 182333568, "step": 149940 }, { "epoch": 18.787745896504198, "grad_norm": 16.424848556518555, "learning_rate": 1.1151949376581928e-07, "loss": 0.3984, "num_input_tokens_seen": 182339968, "step": 149945 }, { "epoch": 18.78837238441298, "grad_norm": 6.569555759429932, "learning_rate": 1.1140469963284328e-07, "loss": 0.4364, "num_input_tokens_seen": 182345952, "step": 149950 }, { "epoch": 18.788998872321763, "grad_norm": 4.763686656951904, "learning_rate": 1.1128996394713432e-07, "loss": 0.4106, "num_input_tokens_seen": 182351936, "step": 149955 }, { "epoch": 18.78962536023055, "grad_norm": 25.063627243041992, "learning_rate": 1.111752867100635e-07, "loss": 0.4617, "num_input_tokens_seen": 182358176, "step": 149960 }, { "epoch": 18.79025184813933, "grad_norm": 15.504632949829102, "learning_rate": 1.1106066792300307e-07, "loss": 0.4961, "num_input_tokens_seen": 182363904, "step": 149965 }, { "epoch": 18.790878336048113, "grad_norm": 3.3994874954223633, "learning_rate": 1.1094610758732249e-07, "loss": 0.3949, "num_input_tokens_seen": 182370112, "step": 149970 }, { "epoch": 18.7915048239569, "grad_norm": 12.75410270690918, "learning_rate": 1.1083160570439177e-07, "loss": 0.4264, "num_input_tokens_seen": 182376288, "step": 149975 }, { "epoch": 18.79213131186568, "grad_norm": 3.2878103256225586, "learning_rate": 1.1071716227557927e-07, "loss": 0.3835, "num_input_tokens_seen": 182382176, "step": 149980 }, { "epoch": 18.792757799774463, "grad_norm": 21.084218978881836, "learning_rate": 1.1060277730225444e-07, "loss": 0.4838, "num_input_tokens_seen": 182388576, "step": 149985 }, { "epoch": 18.79338428768325, "grad_norm": 7.156714916229248, "learning_rate": 1.1048845078578396e-07, "loss": 0.4252, "num_input_tokens_seen": 182394624, "step": 149990 }, { "epoch": 18.79401077559203, "grad_norm": 9.664717674255371, "learning_rate": 1.1037418272753452e-07, "loss": 0.4221, "num_input_tokens_seen": 182400928, "step": 149995 }, { "epoch": 18.794637263500814, "grad_norm": 5.890068531036377, "learning_rate": 1.1025997312887338e-07, "loss": 0.4592, "num_input_tokens_seen": 182407104, "step": 150000 }, { "epoch": 18.795263751409596, "grad_norm": 5.181509017944336, "learning_rate": 1.1014582199116497e-07, "loss": 0.4249, "num_input_tokens_seen": 182413216, "step": 150005 }, { "epoch": 18.795890239318382, "grad_norm": 21.057823181152344, "learning_rate": 1.1003172931577433e-07, "loss": 0.4648, "num_input_tokens_seen": 182418720, "step": 150010 }, { "epoch": 18.796516727227164, "grad_norm": 4.886347770690918, "learning_rate": 1.0991769510406535e-07, "loss": 0.4359, "num_input_tokens_seen": 182424832, "step": 150015 }, { "epoch": 18.797143215135947, "grad_norm": 7.9099321365356445, "learning_rate": 1.0980371935740253e-07, "loss": 0.5073, "num_input_tokens_seen": 182430528, "step": 150020 }, { "epoch": 18.797769703044732, "grad_norm": 4.282578945159912, "learning_rate": 1.0968980207714696e-07, "loss": 0.4203, "num_input_tokens_seen": 182436864, "step": 150025 }, { "epoch": 18.798396190953515, "grad_norm": 5.1811089515686035, "learning_rate": 1.0957594326466203e-07, "loss": 0.4759, "num_input_tokens_seen": 182442976, "step": 150030 }, { "epoch": 18.799022678862297, "grad_norm": 10.376654624938965, "learning_rate": 1.0946214292130775e-07, "loss": 0.4895, "num_input_tokens_seen": 182449152, "step": 150035 }, { "epoch": 18.799649166771083, "grad_norm": 4.524719715118408, "learning_rate": 1.0934840104844579e-07, "loss": 0.3913, "num_input_tokens_seen": 182455488, "step": 150040 }, { "epoch": 18.800275654679865, "grad_norm": 6.505956649780273, "learning_rate": 1.0923471764743509e-07, "loss": 0.4662, "num_input_tokens_seen": 182461408, "step": 150045 }, { "epoch": 18.800902142588647, "grad_norm": 4.174581050872803, "learning_rate": 1.0912109271963566e-07, "loss": 0.4229, "num_input_tokens_seen": 182467232, "step": 150050 }, { "epoch": 18.80152863049743, "grad_norm": 17.755693435668945, "learning_rate": 1.0900752626640476e-07, "loss": 0.4306, "num_input_tokens_seen": 182473280, "step": 150055 }, { "epoch": 18.802155118406215, "grad_norm": 15.685015678405762, "learning_rate": 1.0889401828910184e-07, "loss": 0.4908, "num_input_tokens_seen": 182478656, "step": 150060 }, { "epoch": 18.802781606314998, "grad_norm": 4.710458755493164, "learning_rate": 1.0878056878908305e-07, "loss": 0.4911, "num_input_tokens_seen": 182484896, "step": 150065 }, { "epoch": 18.80340809422378, "grad_norm": 6.31961727142334, "learning_rate": 1.0866717776770452e-07, "loss": 0.4335, "num_input_tokens_seen": 182491040, "step": 150070 }, { "epoch": 18.804034582132566, "grad_norm": 7.717809200286865, "learning_rate": 1.085538452263224e-07, "loss": 0.4223, "num_input_tokens_seen": 182496896, "step": 150075 }, { "epoch": 18.804661070041348, "grad_norm": 4.4850029945373535, "learning_rate": 1.084405711662917e-07, "loss": 0.4062, "num_input_tokens_seen": 182503296, "step": 150080 }, { "epoch": 18.80528755795013, "grad_norm": 13.592992782592773, "learning_rate": 1.083273555889669e-07, "loss": 0.4069, "num_input_tokens_seen": 182509440, "step": 150085 }, { "epoch": 18.805914045858916, "grad_norm": 9.960110664367676, "learning_rate": 1.0821419849570025e-07, "loss": 0.5124, "num_input_tokens_seen": 182515296, "step": 150090 }, { "epoch": 18.8065405337677, "grad_norm": 4.561639308929443, "learning_rate": 1.0810109988784678e-07, "loss": 0.3911, "num_input_tokens_seen": 182521536, "step": 150095 }, { "epoch": 18.80716702167648, "grad_norm": 17.307262420654297, "learning_rate": 1.0798805976675708e-07, "loss": 0.4411, "num_input_tokens_seen": 182527648, "step": 150100 }, { "epoch": 18.807793509585267, "grad_norm": 23.55890655517578, "learning_rate": 1.0787507813378284e-07, "loss": 0.5185, "num_input_tokens_seen": 182533824, "step": 150105 }, { "epoch": 18.80841999749405, "grad_norm": 5.88932991027832, "learning_rate": 1.077621549902752e-07, "loss": 0.4063, "num_input_tokens_seen": 182540064, "step": 150110 }, { "epoch": 18.80904648540283, "grad_norm": 4.663318157196045, "learning_rate": 1.0764929033758421e-07, "loss": 0.4508, "num_input_tokens_seen": 182546112, "step": 150115 }, { "epoch": 18.809672973311613, "grad_norm": 8.3089017868042, "learning_rate": 1.0753648417705931e-07, "loss": 0.4855, "num_input_tokens_seen": 182552352, "step": 150120 }, { "epoch": 18.8102994612204, "grad_norm": 21.691373825073242, "learning_rate": 1.0742373651004889e-07, "loss": 0.4758, "num_input_tokens_seen": 182557664, "step": 150125 }, { "epoch": 18.81092594912918, "grad_norm": 20.566802978515625, "learning_rate": 1.0731104733790132e-07, "loss": 0.5187, "num_input_tokens_seen": 182563872, "step": 150130 }, { "epoch": 18.811552437037964, "grad_norm": 13.107460975646973, "learning_rate": 1.0719841666196385e-07, "loss": 0.4041, "num_input_tokens_seen": 182570048, "step": 150135 }, { "epoch": 18.81217892494675, "grad_norm": 35.64999008178711, "learning_rate": 1.0708584448358261e-07, "loss": 0.5228, "num_input_tokens_seen": 182575648, "step": 150140 }, { "epoch": 18.812805412855532, "grad_norm": 7.127460479736328, "learning_rate": 1.0697333080410432e-07, "loss": 0.5083, "num_input_tokens_seen": 182581664, "step": 150145 }, { "epoch": 18.813431900764314, "grad_norm": 9.234105110168457, "learning_rate": 1.0686087562487347e-07, "loss": 0.406, "num_input_tokens_seen": 182587872, "step": 150150 }, { "epoch": 18.8140583886731, "grad_norm": 14.499133110046387, "learning_rate": 1.0674847894723451e-07, "loss": 0.4202, "num_input_tokens_seen": 182593952, "step": 150155 }, { "epoch": 18.814684876581882, "grad_norm": 3.5671980381011963, "learning_rate": 1.0663614077253192e-07, "loss": 0.449, "num_input_tokens_seen": 182600416, "step": 150160 }, { "epoch": 18.815311364490665, "grad_norm": 6.172123432159424, "learning_rate": 1.0652386110210799e-07, "loss": 0.3909, "num_input_tokens_seen": 182606016, "step": 150165 }, { "epoch": 18.81593785239945, "grad_norm": 10.587891578674316, "learning_rate": 1.0641163993730608e-07, "loss": 0.4466, "num_input_tokens_seen": 182611968, "step": 150170 }, { "epoch": 18.816564340308233, "grad_norm": 21.066984176635742, "learning_rate": 1.0629947727946677e-07, "loss": 0.4207, "num_input_tokens_seen": 182618176, "step": 150175 }, { "epoch": 18.817190828217015, "grad_norm": 10.346632957458496, "learning_rate": 1.0618737312993233e-07, "loss": 0.4793, "num_input_tokens_seen": 182624256, "step": 150180 }, { "epoch": 18.817817316125797, "grad_norm": 15.296926498413086, "learning_rate": 1.0607532749004168e-07, "loss": 0.3883, "num_input_tokens_seen": 182630240, "step": 150185 }, { "epoch": 18.818443804034583, "grad_norm": 13.630692481994629, "learning_rate": 1.0596334036113488e-07, "loss": 0.494, "num_input_tokens_seen": 182636288, "step": 150190 }, { "epoch": 18.819070291943365, "grad_norm": 11.421722412109375, "learning_rate": 1.0585141174455138e-07, "loss": 0.4642, "num_input_tokens_seen": 182642528, "step": 150195 }, { "epoch": 18.819696779852148, "grad_norm": 4.644240856170654, "learning_rate": 1.0573954164162903e-07, "loss": 0.4472, "num_input_tokens_seen": 182648480, "step": 150200 }, { "epoch": 18.820323267760934, "grad_norm": 24.596158981323242, "learning_rate": 1.0562773005370563e-07, "loss": 0.4266, "num_input_tokens_seen": 182654592, "step": 150205 }, { "epoch": 18.820949755669716, "grad_norm": 5.615537166595459, "learning_rate": 1.0551597698211679e-07, "loss": 0.45, "num_input_tokens_seen": 182660704, "step": 150210 }, { "epoch": 18.821576243578498, "grad_norm": 13.996286392211914, "learning_rate": 1.0540428242819978e-07, "loss": 0.493, "num_input_tokens_seen": 182667040, "step": 150215 }, { "epoch": 18.822202731487284, "grad_norm": 5.03551721572876, "learning_rate": 1.0529264639329017e-07, "loss": 0.4802, "num_input_tokens_seen": 182673504, "step": 150220 }, { "epoch": 18.822829219396066, "grad_norm": 4.284599304199219, "learning_rate": 1.0518106887872192e-07, "loss": 0.4788, "num_input_tokens_seen": 182679584, "step": 150225 }, { "epoch": 18.82345570730485, "grad_norm": 5.527303218841553, "learning_rate": 1.0506954988582951e-07, "loss": 0.411, "num_input_tokens_seen": 182685760, "step": 150230 }, { "epoch": 18.82408219521363, "grad_norm": 10.444406509399414, "learning_rate": 1.0495808941594631e-07, "loss": 0.4913, "num_input_tokens_seen": 182692000, "step": 150235 }, { "epoch": 18.824708683122417, "grad_norm": 7.836266994476318, "learning_rate": 1.0484668747040405e-07, "loss": 0.357, "num_input_tokens_seen": 182698432, "step": 150240 }, { "epoch": 18.8253351710312, "grad_norm": 6.416188716888428, "learning_rate": 1.047353440505361e-07, "loss": 0.4413, "num_input_tokens_seen": 182704608, "step": 150245 }, { "epoch": 18.82596165893998, "grad_norm": 4.555511951446533, "learning_rate": 1.0462405915767193e-07, "loss": 0.4414, "num_input_tokens_seen": 182710944, "step": 150250 }, { "epoch": 18.826588146848767, "grad_norm": 28.097612380981445, "learning_rate": 1.0451283279314329e-07, "loss": 0.4603, "num_input_tokens_seen": 182717024, "step": 150255 }, { "epoch": 18.82721463475755, "grad_norm": 6.093654155731201, "learning_rate": 1.0440166495827963e-07, "loss": 0.5734, "num_input_tokens_seen": 182723040, "step": 150260 }, { "epoch": 18.82784112266633, "grad_norm": 21.257408142089844, "learning_rate": 1.0429055565440993e-07, "loss": 0.5088, "num_input_tokens_seen": 182729344, "step": 150265 }, { "epoch": 18.828467610575117, "grad_norm": 6.959063529968262, "learning_rate": 1.041795048828631e-07, "loss": 0.4447, "num_input_tokens_seen": 182735392, "step": 150270 }, { "epoch": 18.8290940984839, "grad_norm": 3.7835891246795654, "learning_rate": 1.0406851264496587e-07, "loss": 0.4366, "num_input_tokens_seen": 182741376, "step": 150275 }, { "epoch": 18.829720586392682, "grad_norm": 11.313977241516113, "learning_rate": 1.039575789420466e-07, "loss": 0.4098, "num_input_tokens_seen": 182747744, "step": 150280 }, { "epoch": 18.830347074301464, "grad_norm": 4.060204029083252, "learning_rate": 1.0384670377543038e-07, "loss": 0.4046, "num_input_tokens_seen": 182754016, "step": 150285 }, { "epoch": 18.83097356221025, "grad_norm": 6.262150764465332, "learning_rate": 1.0373588714644334e-07, "loss": 0.4045, "num_input_tokens_seen": 182760448, "step": 150290 }, { "epoch": 18.831600050119032, "grad_norm": 11.083066940307617, "learning_rate": 1.0362512905641054e-07, "loss": 0.502, "num_input_tokens_seen": 182766624, "step": 150295 }, { "epoch": 18.832226538027815, "grad_norm": 6.046058177947998, "learning_rate": 1.0351442950665591e-07, "loss": 0.3804, "num_input_tokens_seen": 182772800, "step": 150300 }, { "epoch": 18.8328530259366, "grad_norm": 6.3402099609375, "learning_rate": 1.034037884985023e-07, "loss": 0.4695, "num_input_tokens_seen": 182779104, "step": 150305 }, { "epoch": 18.833479513845383, "grad_norm": 5.6065521240234375, "learning_rate": 1.0329320603327364e-07, "loss": 0.4646, "num_input_tokens_seen": 182785472, "step": 150310 }, { "epoch": 18.834106001754165, "grad_norm": 3.336775541305542, "learning_rate": 1.0318268211229165e-07, "loss": 0.4437, "num_input_tokens_seen": 182791648, "step": 150315 }, { "epoch": 18.83473248966295, "grad_norm": 9.528230667114258, "learning_rate": 1.0307221673687806e-07, "loss": 0.4507, "num_input_tokens_seen": 182797568, "step": 150320 }, { "epoch": 18.835358977571733, "grad_norm": 4.620484352111816, "learning_rate": 1.0296180990835292e-07, "loss": 0.3985, "num_input_tokens_seen": 182803680, "step": 150325 }, { "epoch": 18.835985465480515, "grad_norm": 18.006511688232422, "learning_rate": 1.0285146162803627e-07, "loss": 0.5017, "num_input_tokens_seen": 182810016, "step": 150330 }, { "epoch": 18.8366119533893, "grad_norm": 6.585190773010254, "learning_rate": 1.0274117189724874e-07, "loss": 0.4937, "num_input_tokens_seen": 182815904, "step": 150335 }, { "epoch": 18.837238441298084, "grad_norm": 9.576876640319824, "learning_rate": 1.0263094071730705e-07, "loss": 0.475, "num_input_tokens_seen": 182822336, "step": 150340 }, { "epoch": 18.837864929206866, "grad_norm": 23.798189163208008, "learning_rate": 1.0252076808953015e-07, "loss": 0.4353, "num_input_tokens_seen": 182828736, "step": 150345 }, { "epoch": 18.838491417115648, "grad_norm": 26.226472854614258, "learning_rate": 1.0241065401523475e-07, "loss": 0.4639, "num_input_tokens_seen": 182834976, "step": 150350 }, { "epoch": 18.839117905024434, "grad_norm": 5.6244306564331055, "learning_rate": 1.0230059849573815e-07, "loss": 0.4144, "num_input_tokens_seen": 182841184, "step": 150355 }, { "epoch": 18.839744392933216, "grad_norm": 5.520465850830078, "learning_rate": 1.0219060153235538e-07, "loss": 0.3988, "num_input_tokens_seen": 182847616, "step": 150360 }, { "epoch": 18.840370880842, "grad_norm": 11.179057121276855, "learning_rate": 1.0208066312640209e-07, "loss": 0.4251, "num_input_tokens_seen": 182853760, "step": 150365 }, { "epoch": 18.840997368750784, "grad_norm": 4.070518493652344, "learning_rate": 1.0197078327919163e-07, "loss": 0.4606, "num_input_tokens_seen": 182859776, "step": 150370 }, { "epoch": 18.841623856659567, "grad_norm": 4.260440826416016, "learning_rate": 1.018609619920391e-07, "loss": 0.4641, "num_input_tokens_seen": 182865760, "step": 150375 }, { "epoch": 18.84225034456835, "grad_norm": 8.428311347961426, "learning_rate": 1.0175119926625676e-07, "loss": 0.4603, "num_input_tokens_seen": 182871808, "step": 150380 }, { "epoch": 18.842876832477135, "grad_norm": 18.552978515625, "learning_rate": 1.0164149510315691e-07, "loss": 0.4242, "num_input_tokens_seen": 182877952, "step": 150385 }, { "epoch": 18.843503320385917, "grad_norm": 13.358434677124023, "learning_rate": 1.0153184950405182e-07, "loss": 0.4679, "num_input_tokens_seen": 182884064, "step": 150390 }, { "epoch": 18.8441298082947, "grad_norm": 3.925581455230713, "learning_rate": 1.0142226247025155e-07, "loss": 0.4481, "num_input_tokens_seen": 182890080, "step": 150395 }, { "epoch": 18.84475629620348, "grad_norm": 12.34673023223877, "learning_rate": 1.0131273400306674e-07, "loss": 0.5086, "num_input_tokens_seen": 182896416, "step": 150400 }, { "epoch": 18.845382784112267, "grad_norm": 15.21041488647461, "learning_rate": 1.0120326410380632e-07, "loss": 0.4654, "num_input_tokens_seen": 182902592, "step": 150405 }, { "epoch": 18.84600927202105, "grad_norm": 20.572906494140625, "learning_rate": 1.0109385277378037e-07, "loss": 0.4378, "num_input_tokens_seen": 182908864, "step": 150410 }, { "epoch": 18.846635759929832, "grad_norm": 4.681315898895264, "learning_rate": 1.009845000142956e-07, "loss": 0.4209, "num_input_tokens_seen": 182914944, "step": 150415 }, { "epoch": 18.847262247838618, "grad_norm": 2.6904757022857666, "learning_rate": 1.0087520582665988e-07, "loss": 0.4044, "num_input_tokens_seen": 182920672, "step": 150420 }, { "epoch": 18.8478887357474, "grad_norm": 17.841758728027344, "learning_rate": 1.0076597021217993e-07, "loss": 0.4722, "num_input_tokens_seen": 182926976, "step": 150425 }, { "epoch": 18.848515223656182, "grad_norm": 5.584997177124023, "learning_rate": 1.0065679317216248e-07, "loss": 0.5634, "num_input_tokens_seen": 182933184, "step": 150430 }, { "epoch": 18.849141711564968, "grad_norm": 4.1116533279418945, "learning_rate": 1.0054767470791205e-07, "loss": 0.4796, "num_input_tokens_seen": 182939040, "step": 150435 }, { "epoch": 18.84976819947375, "grad_norm": 6.3386311531066895, "learning_rate": 1.0043861482073314e-07, "loss": 0.4623, "num_input_tokens_seen": 182944800, "step": 150440 }, { "epoch": 18.850394687382533, "grad_norm": 6.079556941986084, "learning_rate": 1.0032961351193027e-07, "loss": 0.4289, "num_input_tokens_seen": 182950464, "step": 150445 }, { "epoch": 18.851021175291315, "grad_norm": 9.986058235168457, "learning_rate": 1.0022067078280572e-07, "loss": 0.465, "num_input_tokens_seen": 182956928, "step": 150450 }, { "epoch": 18.8516476632001, "grad_norm": 8.55917739868164, "learning_rate": 1.0011178663466348e-07, "loss": 0.4314, "num_input_tokens_seen": 182963040, "step": 150455 }, { "epoch": 18.852274151108883, "grad_norm": 9.944295883178711, "learning_rate": 1.0000296106880359e-07, "loss": 0.4607, "num_input_tokens_seen": 182969376, "step": 150460 }, { "epoch": 18.852900639017665, "grad_norm": 28.910293579101562, "learning_rate": 9.989419408652834e-08, "loss": 0.4461, "num_input_tokens_seen": 182975584, "step": 150465 }, { "epoch": 18.85352712692645, "grad_norm": 5.146899223327637, "learning_rate": 9.978548568913782e-08, "loss": 0.4378, "num_input_tokens_seen": 182981760, "step": 150470 }, { "epoch": 18.854153614835234, "grad_norm": 7.90673303604126, "learning_rate": 9.967683587793153e-08, "loss": 0.4442, "num_input_tokens_seen": 182987744, "step": 150475 }, { "epoch": 18.854780102744016, "grad_norm": 18.293914794921875, "learning_rate": 9.956824465420844e-08, "loss": 0.4879, "num_input_tokens_seen": 182993792, "step": 150480 }, { "epoch": 18.8554065906528, "grad_norm": 3.6838107109069824, "learning_rate": 9.945971201926752e-08, "loss": 0.3673, "num_input_tokens_seen": 182999968, "step": 150485 }, { "epoch": 18.856033078561584, "grad_norm": 9.26700496673584, "learning_rate": 9.935123797440549e-08, "loss": 0.4271, "num_input_tokens_seen": 183005824, "step": 150490 }, { "epoch": 18.856659566470366, "grad_norm": 14.521828651428223, "learning_rate": 9.924282252092021e-08, "loss": 0.4666, "num_input_tokens_seen": 183011680, "step": 150495 }, { "epoch": 18.857286054379152, "grad_norm": 5.526365280151367, "learning_rate": 9.913446566010676e-08, "loss": 0.4492, "num_input_tokens_seen": 183017952, "step": 150500 }, { "epoch": 18.857912542287934, "grad_norm": 4.196359157562256, "learning_rate": 9.902616739326132e-08, "loss": 0.4727, "num_input_tokens_seen": 183024064, "step": 150505 }, { "epoch": 18.858539030196717, "grad_norm": 5.2851457595825195, "learning_rate": 9.891792772167841e-08, "loss": 0.4457, "num_input_tokens_seen": 183029312, "step": 150510 }, { "epoch": 18.8591655181055, "grad_norm": 20.190305709838867, "learning_rate": 9.880974664665255e-08, "loss": 0.487, "num_input_tokens_seen": 183035264, "step": 150515 }, { "epoch": 18.859792006014285, "grad_norm": 8.238815307617188, "learning_rate": 9.87016241694766e-08, "loss": 0.4176, "num_input_tokens_seen": 183041504, "step": 150520 }, { "epoch": 18.860418493923067, "grad_norm": 10.588849067687988, "learning_rate": 9.859356029144395e-08, "loss": 0.5635, "num_input_tokens_seen": 183047552, "step": 150525 }, { "epoch": 18.86104498183185, "grad_norm": 13.45526123046875, "learning_rate": 9.848555501384583e-08, "loss": 0.4251, "num_input_tokens_seen": 183053440, "step": 150530 }, { "epoch": 18.861671469740635, "grad_norm": 8.76494026184082, "learning_rate": 9.837760833797339e-08, "loss": 0.4291, "num_input_tokens_seen": 183059744, "step": 150535 }, { "epoch": 18.862297957649417, "grad_norm": 6.541250228881836, "learning_rate": 9.826972026511894e-08, "loss": 0.4398, "num_input_tokens_seen": 183065824, "step": 150540 }, { "epoch": 18.8629244455582, "grad_norm": 17.120378494262695, "learning_rate": 9.81618907965698e-08, "loss": 0.4384, "num_input_tokens_seen": 183071904, "step": 150545 }, { "epoch": 18.863550933466986, "grad_norm": 14.101640701293945, "learning_rate": 9.805411993361768e-08, "loss": 0.4522, "num_input_tokens_seen": 183078304, "step": 150550 }, { "epoch": 18.864177421375768, "grad_norm": 6.466597557067871, "learning_rate": 9.794640767754882e-08, "loss": 0.4017, "num_input_tokens_seen": 183084576, "step": 150555 }, { "epoch": 18.86480390928455, "grad_norm": 7.4485673904418945, "learning_rate": 9.783875402965326e-08, "loss": 0.4566, "num_input_tokens_seen": 183090880, "step": 150560 }, { "epoch": 18.865430397193336, "grad_norm": 5.403663158416748, "learning_rate": 9.773115899121611e-08, "loss": 0.458, "num_input_tokens_seen": 183096896, "step": 150565 }, { "epoch": 18.866056885102118, "grad_norm": 3.297337055206299, "learning_rate": 9.762362256352409e-08, "loss": 0.3776, "num_input_tokens_seen": 183103424, "step": 150570 }, { "epoch": 18.8666833730109, "grad_norm": 5.162166118621826, "learning_rate": 9.751614474786453e-08, "loss": 0.4055, "num_input_tokens_seen": 183109632, "step": 150575 }, { "epoch": 18.867309860919683, "grad_norm": 5.182093620300293, "learning_rate": 9.740872554552027e-08, "loss": 0.4283, "num_input_tokens_seen": 183115712, "step": 150580 }, { "epoch": 18.86793634882847, "grad_norm": 33.4409294128418, "learning_rate": 9.730136495777753e-08, "loss": 0.4511, "num_input_tokens_seen": 183122144, "step": 150585 }, { "epoch": 18.86856283673725, "grad_norm": 7.718140602111816, "learning_rate": 9.719406298591804e-08, "loss": 0.4116, "num_input_tokens_seen": 183128224, "step": 150590 }, { "epoch": 18.869189324646033, "grad_norm": 5.773443222045898, "learning_rate": 9.708681963122635e-08, "loss": 0.4423, "num_input_tokens_seen": 183134016, "step": 150595 }, { "epoch": 18.86981581255482, "grad_norm": 3.919389486312866, "learning_rate": 9.697963489498308e-08, "loss": 0.4424, "num_input_tokens_seen": 183140000, "step": 150600 }, { "epoch": 18.8704423004636, "grad_norm": 4.728980541229248, "learning_rate": 9.68725087784711e-08, "loss": 0.4227, "num_input_tokens_seen": 183145344, "step": 150605 }, { "epoch": 18.871068788372384, "grad_norm": 18.630931854248047, "learning_rate": 9.676544128296994e-08, "loss": 0.4363, "num_input_tokens_seen": 183151552, "step": 150610 }, { "epoch": 18.87169527628117, "grad_norm": 20.71038246154785, "learning_rate": 9.66584324097608e-08, "loss": 0.5289, "num_input_tokens_seen": 183157792, "step": 150615 }, { "epoch": 18.87232176418995, "grad_norm": 8.506847381591797, "learning_rate": 9.655148216012266e-08, "loss": 0.4536, "num_input_tokens_seen": 183163296, "step": 150620 }, { "epoch": 18.872948252098734, "grad_norm": 5.182231426239014, "learning_rate": 9.644459053533395e-08, "loss": 0.4175, "num_input_tokens_seen": 183169600, "step": 150625 }, { "epoch": 18.873574740007516, "grad_norm": 14.23363208770752, "learning_rate": 9.633775753667252e-08, "loss": 0.4561, "num_input_tokens_seen": 183175744, "step": 150630 }, { "epoch": 18.874201227916302, "grad_norm": 13.834053993225098, "learning_rate": 9.623098316541623e-08, "loss": 0.5068, "num_input_tokens_seen": 183181760, "step": 150635 }, { "epoch": 18.874827715825084, "grad_norm": 9.663054466247559, "learning_rate": 9.612426742284186e-08, "loss": 0.502, "num_input_tokens_seen": 183187968, "step": 150640 }, { "epoch": 18.875454203733867, "grad_norm": 4.021018028259277, "learning_rate": 9.60176103102245e-08, "loss": 0.4767, "num_input_tokens_seen": 183194368, "step": 150645 }, { "epoch": 18.876080691642652, "grad_norm": 23.163597106933594, "learning_rate": 9.591101182883977e-08, "loss": 0.4722, "num_input_tokens_seen": 183200608, "step": 150650 }, { "epoch": 18.876707179551435, "grad_norm": 9.110921859741211, "learning_rate": 9.580447197996168e-08, "loss": 0.3845, "num_input_tokens_seen": 183206496, "step": 150655 }, { "epoch": 18.877333667460217, "grad_norm": 4.556296348571777, "learning_rate": 9.56979907648653e-08, "loss": 0.4935, "num_input_tokens_seen": 183212608, "step": 150660 }, { "epoch": 18.877960155369003, "grad_norm": 7.034170627593994, "learning_rate": 9.559156818482184e-08, "loss": 0.438, "num_input_tokens_seen": 183218912, "step": 150665 }, { "epoch": 18.878586643277785, "grad_norm": 5.14370584487915, "learning_rate": 9.548520424110475e-08, "loss": 0.3863, "num_input_tokens_seen": 183225024, "step": 150670 }, { "epoch": 18.879213131186567, "grad_norm": 6.282452583312988, "learning_rate": 9.537889893498519e-08, "loss": 0.4519, "num_input_tokens_seen": 183231328, "step": 150675 }, { "epoch": 18.87983961909535, "grad_norm": 4.9686174392700195, "learning_rate": 9.527265226773552e-08, "loss": 0.4583, "num_input_tokens_seen": 183237792, "step": 150680 }, { "epoch": 18.880466107004136, "grad_norm": 5.164062023162842, "learning_rate": 9.516646424062415e-08, "loss": 0.4494, "num_input_tokens_seen": 183244000, "step": 150685 }, { "epoch": 18.881092594912918, "grad_norm": 4.942605018615723, "learning_rate": 9.506033485492117e-08, "loss": 0.4897, "num_input_tokens_seen": 183250496, "step": 150690 }, { "epoch": 18.8817190828217, "grad_norm": 5.94655704498291, "learning_rate": 9.495426411189667e-08, "loss": 0.4017, "num_input_tokens_seen": 183256704, "step": 150695 }, { "epoch": 18.882345570730486, "grad_norm": 6.957239627838135, "learning_rate": 9.484825201281688e-08, "loss": 0.455, "num_input_tokens_seen": 183263008, "step": 150700 }, { "epoch": 18.882972058639268, "grad_norm": 8.430254936218262, "learning_rate": 9.474229855895134e-08, "loss": 0.4385, "num_input_tokens_seen": 183269152, "step": 150705 }, { "epoch": 18.88359854654805, "grad_norm": 4.019227504730225, "learning_rate": 9.463640375156457e-08, "loss": 0.408, "num_input_tokens_seen": 183275264, "step": 150710 }, { "epoch": 18.884225034456836, "grad_norm": 9.098193168640137, "learning_rate": 9.453056759192503e-08, "loss": 0.4124, "num_input_tokens_seen": 183281344, "step": 150715 }, { "epoch": 18.88485152236562, "grad_norm": 6.7702107429504395, "learning_rate": 9.442479008129557e-08, "loss": 0.4306, "num_input_tokens_seen": 183287072, "step": 150720 }, { "epoch": 18.8854780102744, "grad_norm": 33.43608856201172, "learning_rate": 9.431907122094296e-08, "loss": 0.493, "num_input_tokens_seen": 183292640, "step": 150725 }, { "epoch": 18.886104498183187, "grad_norm": 6.266536235809326, "learning_rate": 9.42134110121301e-08, "loss": 0.4866, "num_input_tokens_seen": 183298976, "step": 150730 }, { "epoch": 18.88673098609197, "grad_norm": 4.463433265686035, "learning_rate": 9.410780945612042e-08, "loss": 0.4415, "num_input_tokens_seen": 183305120, "step": 150735 }, { "epoch": 18.88735747400075, "grad_norm": 3.9615092277526855, "learning_rate": 9.400226655417621e-08, "loss": 0.4358, "num_input_tokens_seen": 183311360, "step": 150740 }, { "epoch": 18.887983961909534, "grad_norm": 11.328803062438965, "learning_rate": 9.389678230756039e-08, "loss": 0.3656, "num_input_tokens_seen": 183317664, "step": 150745 }, { "epoch": 18.88861044981832, "grad_norm": 20.603107452392578, "learning_rate": 9.379135671753303e-08, "loss": 0.4802, "num_input_tokens_seen": 183323808, "step": 150750 }, { "epoch": 18.8892369377271, "grad_norm": 8.82815933227539, "learning_rate": 9.368598978535425e-08, "loss": 0.4081, "num_input_tokens_seen": 183329664, "step": 150755 }, { "epoch": 18.889863425635884, "grad_norm": 14.520332336425781, "learning_rate": 9.358068151228528e-08, "loss": 0.4811, "num_input_tokens_seen": 183335872, "step": 150760 }, { "epoch": 18.89048991354467, "grad_norm": 4.870615482330322, "learning_rate": 9.347543189958397e-08, "loss": 0.4443, "num_input_tokens_seen": 183340928, "step": 150765 }, { "epoch": 18.891116401453452, "grad_norm": 6.524567127227783, "learning_rate": 9.337024094850988e-08, "loss": 0.4288, "num_input_tokens_seen": 183347008, "step": 150770 }, { "epoch": 18.891742889362234, "grad_norm": 6.005559921264648, "learning_rate": 9.326510866031869e-08, "loss": 0.5052, "num_input_tokens_seen": 183353184, "step": 150775 }, { "epoch": 18.89236937727102, "grad_norm": 5.3914690017700195, "learning_rate": 9.316003503626936e-08, "loss": 0.4181, "num_input_tokens_seen": 183358848, "step": 150780 }, { "epoch": 18.892995865179802, "grad_norm": 4.2849955558776855, "learning_rate": 9.305502007761646e-08, "loss": 0.4204, "num_input_tokens_seen": 183365056, "step": 150785 }, { "epoch": 18.893622353088585, "grad_norm": 5.35603141784668, "learning_rate": 9.295006378561733e-08, "loss": 0.4274, "num_input_tokens_seen": 183371648, "step": 150790 }, { "epoch": 18.89424884099737, "grad_norm": 11.694864273071289, "learning_rate": 9.284516616152483e-08, "loss": 0.5067, "num_input_tokens_seen": 183377312, "step": 150795 }, { "epoch": 18.894875328906153, "grad_norm": 6.065974235534668, "learning_rate": 9.274032720659465e-08, "loss": 0.4413, "num_input_tokens_seen": 183383392, "step": 150800 }, { "epoch": 18.895501816814935, "grad_norm": 15.938480377197266, "learning_rate": 9.263554692207966e-08, "loss": 0.4187, "num_input_tokens_seen": 183389312, "step": 150805 }, { "epoch": 18.896128304723717, "grad_norm": 12.678282737731934, "learning_rate": 9.253082530923219e-08, "loss": 0.4557, "num_input_tokens_seen": 183395040, "step": 150810 }, { "epoch": 18.896754792632503, "grad_norm": 6.375213146209717, "learning_rate": 9.242616236930512e-08, "loss": 0.4091, "num_input_tokens_seen": 183401152, "step": 150815 }, { "epoch": 18.897381280541286, "grad_norm": 11.195100784301758, "learning_rate": 9.23215581035486e-08, "loss": 0.4549, "num_input_tokens_seen": 183407360, "step": 150820 }, { "epoch": 18.898007768450068, "grad_norm": 8.710299491882324, "learning_rate": 9.221701251321491e-08, "loss": 0.4306, "num_input_tokens_seen": 183413056, "step": 150825 }, { "epoch": 18.898634256358854, "grad_norm": 8.255440711975098, "learning_rate": 9.211252559955252e-08, "loss": 0.4201, "num_input_tokens_seen": 183418464, "step": 150830 }, { "epoch": 18.899260744267636, "grad_norm": 15.160923957824707, "learning_rate": 9.2008097363811e-08, "loss": 0.4324, "num_input_tokens_seen": 183424832, "step": 150835 }, { "epoch": 18.899887232176418, "grad_norm": 8.961302757263184, "learning_rate": 9.190372780723933e-08, "loss": 0.4274, "num_input_tokens_seen": 183431264, "step": 150840 }, { "epoch": 18.900513720085204, "grad_norm": 3.47727108001709, "learning_rate": 9.17994169310854e-08, "loss": 0.3698, "num_input_tokens_seen": 183437280, "step": 150845 }, { "epoch": 18.901140207993986, "grad_norm": 17.66275405883789, "learning_rate": 9.169516473659601e-08, "loss": 0.4127, "num_input_tokens_seen": 183443456, "step": 150850 }, { "epoch": 18.90176669590277, "grad_norm": 5.0971150398254395, "learning_rate": 9.159097122501736e-08, "loss": 0.4353, "num_input_tokens_seen": 183449568, "step": 150855 }, { "epoch": 18.90239318381155, "grad_norm": 19.76819610595703, "learning_rate": 9.148683639759515e-08, "loss": 0.4409, "num_input_tokens_seen": 183455168, "step": 150860 }, { "epoch": 18.903019671720337, "grad_norm": 4.867936134338379, "learning_rate": 9.138276025557558e-08, "loss": 0.446, "num_input_tokens_seen": 183461248, "step": 150865 }, { "epoch": 18.90364615962912, "grad_norm": 5.2629523277282715, "learning_rate": 9.1278742800201e-08, "loss": 0.4372, "num_input_tokens_seen": 183467232, "step": 150870 }, { "epoch": 18.9042726475379, "grad_norm": 29.47509765625, "learning_rate": 9.117478403271651e-08, "loss": 0.6333, "num_input_tokens_seen": 183473184, "step": 150875 }, { "epoch": 18.904899135446687, "grad_norm": 9.471907615661621, "learning_rate": 9.107088395436503e-08, "loss": 0.4695, "num_input_tokens_seen": 183478848, "step": 150880 }, { "epoch": 18.90552562335547, "grad_norm": 5.065328121185303, "learning_rate": 9.096704256638777e-08, "loss": 0.3948, "num_input_tokens_seen": 183485280, "step": 150885 }, { "epoch": 18.90615211126425, "grad_norm": 5.692752838134766, "learning_rate": 9.086325987002708e-08, "loss": 0.4347, "num_input_tokens_seen": 183491008, "step": 150890 }, { "epoch": 18.906778599173037, "grad_norm": 29.612937927246094, "learning_rate": 9.075953586652309e-08, "loss": 0.4718, "num_input_tokens_seen": 183497312, "step": 150895 }, { "epoch": 18.90740508708182, "grad_norm": 6.122263431549072, "learning_rate": 9.0655870557117e-08, "loss": 0.5135, "num_input_tokens_seen": 183503456, "step": 150900 }, { "epoch": 18.908031574990602, "grad_norm": 6.176807880401611, "learning_rate": 9.055226394304673e-08, "loss": 0.4313, "num_input_tokens_seen": 183509280, "step": 150905 }, { "epoch": 18.908658062899384, "grad_norm": 6.140519142150879, "learning_rate": 9.044871602555238e-08, "loss": 0.3954, "num_input_tokens_seen": 183515360, "step": 150910 }, { "epoch": 18.90928455080817, "grad_norm": 20.241350173950195, "learning_rate": 9.034522680587132e-08, "loss": 0.473, "num_input_tokens_seen": 183521408, "step": 150915 }, { "epoch": 18.909911038716952, "grad_norm": 8.357527732849121, "learning_rate": 9.024179628524088e-08, "loss": 0.434, "num_input_tokens_seen": 183527584, "step": 150920 }, { "epoch": 18.910537526625735, "grad_norm": 9.575640678405762, "learning_rate": 9.013842446489729e-08, "loss": 0.4224, "num_input_tokens_seen": 183533856, "step": 150925 }, { "epoch": 18.91116401453452, "grad_norm": 8.297494888305664, "learning_rate": 9.003511134607678e-08, "loss": 0.3775, "num_input_tokens_seen": 183539904, "step": 150930 }, { "epoch": 18.911790502443303, "grad_norm": 23.72849464416504, "learning_rate": 8.99318569300145e-08, "loss": 0.4999, "num_input_tokens_seen": 183546016, "step": 150935 }, { "epoch": 18.912416990352085, "grad_norm": 6.181870460510254, "learning_rate": 8.982866121794554e-08, "loss": 0.4039, "num_input_tokens_seen": 183552288, "step": 150940 }, { "epoch": 18.91304347826087, "grad_norm": 5.0802202224731445, "learning_rate": 8.97255242111028e-08, "loss": 0.4025, "num_input_tokens_seen": 183558400, "step": 150945 }, { "epoch": 18.913669966169653, "grad_norm": 7.042725563049316, "learning_rate": 8.962244591071979e-08, "loss": 0.4639, "num_input_tokens_seen": 183564480, "step": 150950 }, { "epoch": 18.914296454078436, "grad_norm": 15.858773231506348, "learning_rate": 8.951942631802935e-08, "loss": 0.4512, "num_input_tokens_seen": 183570656, "step": 150955 }, { "epoch": 18.91492294198722, "grad_norm": 9.079487800598145, "learning_rate": 8.941646543426219e-08, "loss": 0.5254, "num_input_tokens_seen": 183576928, "step": 150960 }, { "epoch": 18.915549429896004, "grad_norm": 5.424539089202881, "learning_rate": 8.931356326065011e-08, "loss": 0.3914, "num_input_tokens_seen": 183583168, "step": 150965 }, { "epoch": 18.916175917804786, "grad_norm": 3.6019082069396973, "learning_rate": 8.921071979842266e-08, "loss": 0.4262, "num_input_tokens_seen": 183589024, "step": 150970 }, { "epoch": 18.916802405713568, "grad_norm": 15.896668434143066, "learning_rate": 8.910793504881055e-08, "loss": 0.4396, "num_input_tokens_seen": 183594816, "step": 150975 }, { "epoch": 18.917428893622354, "grad_norm": 6.586805820465088, "learning_rate": 8.90052090130411e-08, "loss": 0.4557, "num_input_tokens_seen": 183600704, "step": 150980 }, { "epoch": 18.918055381531136, "grad_norm": 4.201447486877441, "learning_rate": 8.890254169234391e-08, "loss": 0.4213, "num_input_tokens_seen": 183606304, "step": 150985 }, { "epoch": 18.91868186943992, "grad_norm": 4.635209083557129, "learning_rate": 8.879993308794522e-08, "loss": 0.4217, "num_input_tokens_seen": 183612416, "step": 150990 }, { "epoch": 18.919308357348704, "grad_norm": 6.807751655578613, "learning_rate": 8.86973832010729e-08, "loss": 0.3819, "num_input_tokens_seen": 183618528, "step": 150995 }, { "epoch": 18.919934845257487, "grad_norm": 4.614530086517334, "learning_rate": 8.859489203295268e-08, "loss": 0.4188, "num_input_tokens_seen": 183624832, "step": 151000 }, { "epoch": 18.92056133316627, "grad_norm": 18.000078201293945, "learning_rate": 8.849245958480912e-08, "loss": 0.5288, "num_input_tokens_seen": 183630976, "step": 151005 }, { "epoch": 18.921187821075055, "grad_norm": 27.469751358032227, "learning_rate": 8.839008585786846e-08, "loss": 0.4763, "num_input_tokens_seen": 183637120, "step": 151010 }, { "epoch": 18.921814308983837, "grad_norm": 9.26057243347168, "learning_rate": 8.828777085335305e-08, "loss": 0.4183, "num_input_tokens_seen": 183643136, "step": 151015 }, { "epoch": 18.92244079689262, "grad_norm": 16.652101516723633, "learning_rate": 8.818551457248747e-08, "loss": 0.5029, "num_input_tokens_seen": 183649312, "step": 151020 }, { "epoch": 18.9230672848014, "grad_norm": 3.2253572940826416, "learning_rate": 8.808331701649297e-08, "loss": 0.4854, "num_input_tokens_seen": 183655616, "step": 151025 }, { "epoch": 18.923693772710187, "grad_norm": 31.219097137451172, "learning_rate": 8.798117818659302e-08, "loss": 0.4372, "num_input_tokens_seen": 183661600, "step": 151030 }, { "epoch": 18.92432026061897, "grad_norm": 7.410858631134033, "learning_rate": 8.787909808400718e-08, "loss": 0.4114, "num_input_tokens_seen": 183667680, "step": 151035 }, { "epoch": 18.924946748527752, "grad_norm": 26.995498657226562, "learning_rate": 8.777707670995672e-08, "loss": 0.559, "num_input_tokens_seen": 183673728, "step": 151040 }, { "epoch": 18.925573236436538, "grad_norm": 17.082447052001953, "learning_rate": 8.76751140656612e-08, "loss": 0.4804, "num_input_tokens_seen": 183680032, "step": 151045 }, { "epoch": 18.92619972434532, "grad_norm": 6.984182834625244, "learning_rate": 8.757321015234022e-08, "loss": 0.4114, "num_input_tokens_seen": 183686048, "step": 151050 }, { "epoch": 18.926826212254102, "grad_norm": 8.229111671447754, "learning_rate": 8.747136497121111e-08, "loss": 0.4362, "num_input_tokens_seen": 183691936, "step": 151055 }, { "epoch": 18.92745270016289, "grad_norm": 17.671445846557617, "learning_rate": 8.736957852349237e-08, "loss": 0.4796, "num_input_tokens_seen": 183698176, "step": 151060 }, { "epoch": 18.92807918807167, "grad_norm": 3.7466094493865967, "learning_rate": 8.72678508104008e-08, "loss": 0.4191, "num_input_tokens_seen": 183703808, "step": 151065 }, { "epoch": 18.928705675980453, "grad_norm": 3.69966459274292, "learning_rate": 8.716618183315206e-08, "loss": 0.3877, "num_input_tokens_seen": 183710176, "step": 151070 }, { "epoch": 18.929332163889235, "grad_norm": 17.94025421142578, "learning_rate": 8.706457159296245e-08, "loss": 0.436, "num_input_tokens_seen": 183716640, "step": 151075 }, { "epoch": 18.92995865179802, "grad_norm": 7.907773971557617, "learning_rate": 8.696302009104596e-08, "loss": 0.3994, "num_input_tokens_seen": 183722720, "step": 151080 }, { "epoch": 18.930585139706803, "grad_norm": 7.167979717254639, "learning_rate": 8.686152732861774e-08, "loss": 0.4351, "num_input_tokens_seen": 183728096, "step": 151085 }, { "epoch": 18.931211627615586, "grad_norm": 6.5201735496521, "learning_rate": 8.676009330689073e-08, "loss": 0.4405, "num_input_tokens_seen": 183733856, "step": 151090 }, { "epoch": 18.93183811552437, "grad_norm": 19.00609016418457, "learning_rate": 8.665871802707727e-08, "loss": 0.5056, "num_input_tokens_seen": 183740032, "step": 151095 }, { "epoch": 18.932464603433154, "grad_norm": 6.024680137634277, "learning_rate": 8.655740149038971e-08, "loss": 0.4017, "num_input_tokens_seen": 183746208, "step": 151100 }, { "epoch": 18.933091091341936, "grad_norm": 10.569866180419922, "learning_rate": 8.64561436980399e-08, "loss": 0.4432, "num_input_tokens_seen": 183752192, "step": 151105 }, { "epoch": 18.93371757925072, "grad_norm": 9.287585258483887, "learning_rate": 8.635494465123795e-08, "loss": 0.4194, "num_input_tokens_seen": 183758496, "step": 151110 }, { "epoch": 18.934344067159504, "grad_norm": 8.58687686920166, "learning_rate": 8.625380435119346e-08, "loss": 0.4328, "num_input_tokens_seen": 183764832, "step": 151115 }, { "epoch": 18.934970555068286, "grad_norm": 6.107619762420654, "learning_rate": 8.615272279911602e-08, "loss": 0.4313, "num_input_tokens_seen": 183771072, "step": 151120 }, { "epoch": 18.935597042977072, "grad_norm": 8.045929908752441, "learning_rate": 8.60516999962141e-08, "loss": 0.4707, "num_input_tokens_seen": 183777120, "step": 151125 }, { "epoch": 18.936223530885854, "grad_norm": 7.495816707611084, "learning_rate": 8.595073594369562e-08, "loss": 0.4562, "num_input_tokens_seen": 183783264, "step": 151130 }, { "epoch": 18.936850018794637, "grad_norm": 8.182971954345703, "learning_rate": 8.584983064276741e-08, "loss": 0.4717, "num_input_tokens_seen": 183789408, "step": 151135 }, { "epoch": 18.93747650670342, "grad_norm": 12.751070022583008, "learning_rate": 8.574898409463573e-08, "loss": 0.4749, "num_input_tokens_seen": 183795680, "step": 151140 }, { "epoch": 18.938102994612205, "grad_norm": 5.857755184173584, "learning_rate": 8.564819630050735e-08, "loss": 0.4182, "num_input_tokens_seen": 183801984, "step": 151145 }, { "epoch": 18.938729482520987, "grad_norm": 5.9250102043151855, "learning_rate": 8.554746726158636e-08, "loss": 0.4001, "num_input_tokens_seen": 183808224, "step": 151150 }, { "epoch": 18.93935597042977, "grad_norm": 6.078082084655762, "learning_rate": 8.544679697907677e-08, "loss": 0.438, "num_input_tokens_seen": 183814592, "step": 151155 }, { "epoch": 18.939982458338555, "grad_norm": 5.930434703826904, "learning_rate": 8.534618545418371e-08, "loss": 0.5056, "num_input_tokens_seen": 183820480, "step": 151160 }, { "epoch": 18.940608946247337, "grad_norm": 14.167570114135742, "learning_rate": 8.524563268810792e-08, "loss": 0.4509, "num_input_tokens_seen": 183826752, "step": 151165 }, { "epoch": 18.94123543415612, "grad_norm": 6.060065269470215, "learning_rate": 8.514513868205343e-08, "loss": 0.3961, "num_input_tokens_seen": 183832896, "step": 151170 }, { "epoch": 18.941861922064906, "grad_norm": 13.054608345031738, "learning_rate": 8.504470343722038e-08, "loss": 0.4191, "num_input_tokens_seen": 183839072, "step": 151175 }, { "epoch": 18.942488409973688, "grad_norm": 19.819072723388672, "learning_rate": 8.494432695481058e-08, "loss": 0.4283, "num_input_tokens_seen": 183845408, "step": 151180 }, { "epoch": 18.94311489788247, "grad_norm": 15.783145904541016, "learning_rate": 8.484400923602365e-08, "loss": 0.4644, "num_input_tokens_seen": 183851104, "step": 151185 }, { "epoch": 18.943741385791256, "grad_norm": 8.171286582946777, "learning_rate": 8.474375028205861e-08, "loss": 0.4367, "num_input_tokens_seen": 183857184, "step": 151190 }, { "epoch": 18.94436787370004, "grad_norm": 6.302472114562988, "learning_rate": 8.464355009411563e-08, "loss": 0.4115, "num_input_tokens_seen": 183863264, "step": 151195 }, { "epoch": 18.94499436160882, "grad_norm": 9.93266487121582, "learning_rate": 8.454340867339039e-08, "loss": 0.4391, "num_input_tokens_seen": 183868832, "step": 151200 }, { "epoch": 18.945620849517603, "grad_norm": 8.290246963500977, "learning_rate": 8.444332602108252e-08, "loss": 0.4014, "num_input_tokens_seen": 183874784, "step": 151205 }, { "epoch": 18.94624733742639, "grad_norm": 4.927300453186035, "learning_rate": 8.43433021383866e-08, "loss": 0.4573, "num_input_tokens_seen": 183880544, "step": 151210 }, { "epoch": 18.94687382533517, "grad_norm": 14.528914451599121, "learning_rate": 8.424333702649945e-08, "loss": 0.404, "num_input_tokens_seen": 183886688, "step": 151215 }, { "epoch": 18.947500313243953, "grad_norm": 4.066286563873291, "learning_rate": 8.414343068661624e-08, "loss": 0.4476, "num_input_tokens_seen": 183892992, "step": 151220 }, { "epoch": 18.94812680115274, "grad_norm": 9.114949226379395, "learning_rate": 8.404358311993154e-08, "loss": 0.4373, "num_input_tokens_seen": 183899200, "step": 151225 }, { "epoch": 18.94875328906152, "grad_norm": 5.87642240524292, "learning_rate": 8.394379432763889e-08, "loss": 0.3751, "num_input_tokens_seen": 183905248, "step": 151230 }, { "epoch": 18.949379776970304, "grad_norm": 5.590041160583496, "learning_rate": 8.384406431093117e-08, "loss": 0.4614, "num_input_tokens_seen": 183911616, "step": 151235 }, { "epoch": 18.95000626487909, "grad_norm": 13.189658164978027, "learning_rate": 8.37443930710008e-08, "loss": 0.4138, "num_input_tokens_seen": 183917760, "step": 151240 }, { "epoch": 18.95063275278787, "grad_norm": 11.065853118896484, "learning_rate": 8.364478060904013e-08, "loss": 0.4104, "num_input_tokens_seen": 183924096, "step": 151245 }, { "epoch": 18.951259240696654, "grad_norm": 5.612077236175537, "learning_rate": 8.354522692623878e-08, "loss": 0.3814, "num_input_tokens_seen": 183930080, "step": 151250 }, { "epoch": 18.951885728605436, "grad_norm": 12.258594512939453, "learning_rate": 8.344573202378803e-08, "loss": 0.3839, "num_input_tokens_seen": 183936256, "step": 151255 }, { "epoch": 18.952512216514222, "grad_norm": 4.1097259521484375, "learning_rate": 8.334629590287747e-08, "loss": 0.3879, "num_input_tokens_seen": 183941984, "step": 151260 }, { "epoch": 18.953138704423004, "grad_norm": 5.303573131561279, "learning_rate": 8.324691856469558e-08, "loss": 0.439, "num_input_tokens_seen": 183948000, "step": 151265 }, { "epoch": 18.953765192331787, "grad_norm": 6.760383605957031, "learning_rate": 8.314760001043031e-08, "loss": 0.4286, "num_input_tokens_seen": 183954304, "step": 151270 }, { "epoch": 18.954391680240573, "grad_norm": 4.523797035217285, "learning_rate": 8.304834024126963e-08, "loss": 0.4138, "num_input_tokens_seen": 183960256, "step": 151275 }, { "epoch": 18.955018168149355, "grad_norm": 18.329540252685547, "learning_rate": 8.294913925840032e-08, "loss": 0.4197, "num_input_tokens_seen": 183966464, "step": 151280 }, { "epoch": 18.955644656058137, "grad_norm": 12.655498504638672, "learning_rate": 8.2849997063007e-08, "loss": 0.4286, "num_input_tokens_seen": 183972288, "step": 151285 }, { "epoch": 18.956271143966923, "grad_norm": 12.314176559448242, "learning_rate": 8.275091365627708e-08, "loss": 0.5839, "num_input_tokens_seen": 183978432, "step": 151290 }, { "epoch": 18.956897631875705, "grad_norm": 4.210605621337891, "learning_rate": 8.265188903939348e-08, "loss": 0.4549, "num_input_tokens_seen": 183984416, "step": 151295 }, { "epoch": 18.957524119784487, "grad_norm": 18.259593963623047, "learning_rate": 8.255292321354191e-08, "loss": 0.4278, "num_input_tokens_seen": 183990240, "step": 151300 }, { "epoch": 18.95815060769327, "grad_norm": 31.67523193359375, "learning_rate": 8.245401617990312e-08, "loss": 0.4856, "num_input_tokens_seen": 183996544, "step": 151305 }, { "epoch": 18.958777095602056, "grad_norm": 20.229923248291016, "learning_rate": 8.23551679396617e-08, "loss": 0.4764, "num_input_tokens_seen": 184002848, "step": 151310 }, { "epoch": 18.959403583510838, "grad_norm": 6.95272159576416, "learning_rate": 8.225637849399892e-08, "loss": 0.4404, "num_input_tokens_seen": 184008832, "step": 151315 }, { "epoch": 18.96003007141962, "grad_norm": 6.89857292175293, "learning_rate": 8.215764784409552e-08, "loss": 0.4201, "num_input_tokens_seen": 184014688, "step": 151320 }, { "epoch": 18.960656559328406, "grad_norm": 8.879044532775879, "learning_rate": 8.205897599113221e-08, "loss": 0.393, "num_input_tokens_seen": 184021056, "step": 151325 }, { "epoch": 18.96128304723719, "grad_norm": 30.604385375976562, "learning_rate": 8.196036293628862e-08, "loss": 0.5363, "num_input_tokens_seen": 184027072, "step": 151330 }, { "epoch": 18.96190953514597, "grad_norm": 4.912461280822754, "learning_rate": 8.186180868074378e-08, "loss": 0.4357, "num_input_tokens_seen": 184033344, "step": 151335 }, { "epoch": 18.962536023054756, "grad_norm": 4.100583553314209, "learning_rate": 8.176331322567565e-08, "loss": 0.4106, "num_input_tokens_seen": 184039488, "step": 151340 }, { "epoch": 18.96316251096354, "grad_norm": 19.7302303314209, "learning_rate": 8.166487657226218e-08, "loss": 0.4769, "num_input_tokens_seen": 184045792, "step": 151345 }, { "epoch": 18.96378899887232, "grad_norm": 9.869668006896973, "learning_rate": 8.156649872168076e-08, "loss": 0.4714, "num_input_tokens_seen": 184052160, "step": 151350 }, { "epoch": 18.964415486781107, "grad_norm": 8.663909912109375, "learning_rate": 8.146817967510657e-08, "loss": 0.4178, "num_input_tokens_seen": 184058304, "step": 151355 }, { "epoch": 18.96504197468989, "grad_norm": 7.105409145355225, "learning_rate": 8.136991943371531e-08, "loss": 0.4025, "num_input_tokens_seen": 184064512, "step": 151360 }, { "epoch": 18.96566846259867, "grad_norm": 5.455399990081787, "learning_rate": 8.127171799868216e-08, "loss": 0.5267, "num_input_tokens_seen": 184070688, "step": 151365 }, { "epoch": 18.966294950507454, "grad_norm": 7.772367477416992, "learning_rate": 8.117357537118175e-08, "loss": 0.5519, "num_input_tokens_seen": 184076704, "step": 151370 }, { "epoch": 18.96692143841624, "grad_norm": 14.322945594787598, "learning_rate": 8.107549155238592e-08, "loss": 0.5125, "num_input_tokens_seen": 184082432, "step": 151375 }, { "epoch": 18.96754792632502, "grad_norm": 5.095350742340088, "learning_rate": 8.097746654346872e-08, "loss": 0.5001, "num_input_tokens_seen": 184088224, "step": 151380 }, { "epoch": 18.968174414233804, "grad_norm": 11.86153507232666, "learning_rate": 8.087950034560144e-08, "loss": 0.5174, "num_input_tokens_seen": 184094560, "step": 151385 }, { "epoch": 18.96880090214259, "grad_norm": 10.765719413757324, "learning_rate": 8.078159295995592e-08, "loss": 0.4988, "num_input_tokens_seen": 184101024, "step": 151390 }, { "epoch": 18.969427390051372, "grad_norm": 4.242192268371582, "learning_rate": 8.068374438770177e-08, "loss": 0.4524, "num_input_tokens_seen": 184107104, "step": 151395 }, { "epoch": 18.970053877960154, "grad_norm": 25.802581787109375, "learning_rate": 8.058595463000974e-08, "loss": 0.477, "num_input_tokens_seen": 184113120, "step": 151400 }, { "epoch": 18.97068036586894, "grad_norm": 6.361908435821533, "learning_rate": 8.048822368804832e-08, "loss": 0.4164, "num_input_tokens_seen": 184119520, "step": 151405 }, { "epoch": 18.971306853777723, "grad_norm": 4.683972358703613, "learning_rate": 8.039055156298658e-08, "loss": 0.3784, "num_input_tokens_seen": 184125920, "step": 151410 }, { "epoch": 18.971933341686505, "grad_norm": 4.675624370574951, "learning_rate": 8.029293825599139e-08, "loss": 0.4222, "num_input_tokens_seen": 184132128, "step": 151415 }, { "epoch": 18.97255982959529, "grad_norm": 4.624443054199219, "learning_rate": 8.019538376823122e-08, "loss": 0.4809, "num_input_tokens_seen": 184137920, "step": 151420 }, { "epoch": 18.973186317504073, "grad_norm": 5.40739107131958, "learning_rate": 8.009788810087072e-08, "loss": 0.408, "num_input_tokens_seen": 184144128, "step": 151425 }, { "epoch": 18.973812805412855, "grad_norm": 26.295854568481445, "learning_rate": 8.000045125507672e-08, "loss": 0.4743, "num_input_tokens_seen": 184150208, "step": 151430 }, { "epoch": 18.974439293321637, "grad_norm": 5.731980800628662, "learning_rate": 7.99030732320144e-08, "loss": 0.4368, "num_input_tokens_seen": 184156256, "step": 151435 }, { "epoch": 18.975065781230423, "grad_norm": 19.031919479370117, "learning_rate": 7.980575403284673e-08, "loss": 0.4552, "num_input_tokens_seen": 184162720, "step": 151440 }, { "epoch": 18.975692269139206, "grad_norm": 3.8712308406829834, "learning_rate": 7.970849365873833e-08, "loss": 0.4058, "num_input_tokens_seen": 184168832, "step": 151445 }, { "epoch": 18.976318757047988, "grad_norm": 5.18900203704834, "learning_rate": 7.961129211085105e-08, "loss": 0.4517, "num_input_tokens_seen": 184174944, "step": 151450 }, { "epoch": 18.976945244956774, "grad_norm": 6.947927951812744, "learning_rate": 7.951414939034785e-08, "loss": 0.4574, "num_input_tokens_seen": 184180928, "step": 151455 }, { "epoch": 18.977571732865556, "grad_norm": 12.062480926513672, "learning_rate": 7.941706549839002e-08, "loss": 0.4586, "num_input_tokens_seen": 184187200, "step": 151460 }, { "epoch": 18.97819822077434, "grad_norm": 7.115492820739746, "learning_rate": 7.932004043613884e-08, "loss": 0.446, "num_input_tokens_seen": 184193280, "step": 151465 }, { "epoch": 18.978824708683124, "grad_norm": 7.810657978057861, "learning_rate": 7.922307420475284e-08, "loss": 0.4309, "num_input_tokens_seen": 184199168, "step": 151470 }, { "epoch": 18.979451196591906, "grad_norm": 32.9688606262207, "learning_rate": 7.91261668053922e-08, "loss": 0.4727, "num_input_tokens_seen": 184205472, "step": 151475 }, { "epoch": 18.98007768450069, "grad_norm": 6.7367072105407715, "learning_rate": 7.9029318239216e-08, "loss": 0.4038, "num_input_tokens_seen": 184211584, "step": 151480 }, { "epoch": 18.98070417240947, "grad_norm": 6.074687957763672, "learning_rate": 7.893252850738109e-08, "loss": 0.544, "num_input_tokens_seen": 184217696, "step": 151485 }, { "epoch": 18.981330660318257, "grad_norm": 6.538610458374023, "learning_rate": 7.883579761104542e-08, "loss": 0.3993, "num_input_tokens_seen": 184223968, "step": 151490 }, { "epoch": 18.98195714822704, "grad_norm": 6.174429416656494, "learning_rate": 7.873912555136532e-08, "loss": 0.4407, "num_input_tokens_seen": 184230112, "step": 151495 }, { "epoch": 18.98258363613582, "grad_norm": 6.993476867675781, "learning_rate": 7.864251232949649e-08, "loss": 0.5687, "num_input_tokens_seen": 184236224, "step": 151500 }, { "epoch": 18.983210124044607, "grad_norm": 4.4389472007751465, "learning_rate": 7.854595794659414e-08, "loss": 0.5037, "num_input_tokens_seen": 184242080, "step": 151505 }, { "epoch": 18.98383661195339, "grad_norm": 15.463994979858398, "learning_rate": 7.844946240381235e-08, "loss": 0.4958, "num_input_tokens_seen": 184248288, "step": 151510 }, { "epoch": 18.98446309986217, "grad_norm": 5.506060600280762, "learning_rate": 7.835302570230519e-08, "loss": 0.4488, "num_input_tokens_seen": 184253280, "step": 151515 }, { "epoch": 18.985089587770958, "grad_norm": 6.724870204925537, "learning_rate": 7.825664784322561e-08, "loss": 0.393, "num_input_tokens_seen": 184259744, "step": 151520 }, { "epoch": 18.98571607567974, "grad_norm": 7.859842777252197, "learning_rate": 7.816032882772551e-08, "loss": 0.4349, "num_input_tokens_seen": 184264960, "step": 151525 }, { "epoch": 18.986342563588522, "grad_norm": 7.795412063598633, "learning_rate": 7.806406865695726e-08, "loss": 0.5328, "num_input_tokens_seen": 184271072, "step": 151530 }, { "epoch": 18.986969051497304, "grad_norm": 4.804797172546387, "learning_rate": 7.796786733207051e-08, "loss": 0.4593, "num_input_tokens_seen": 184277312, "step": 151535 }, { "epoch": 18.98759553940609, "grad_norm": 6.931375026702881, "learning_rate": 7.787172485421713e-08, "loss": 0.5078, "num_input_tokens_seen": 184283232, "step": 151540 }, { "epoch": 18.988222027314873, "grad_norm": 5.95780086517334, "learning_rate": 7.77756412245445e-08, "loss": 0.4809, "num_input_tokens_seen": 184289536, "step": 151545 }, { "epoch": 18.988848515223655, "grad_norm": 19.203920364379883, "learning_rate": 7.767961644420286e-08, "loss": 0.4735, "num_input_tokens_seen": 184295744, "step": 151550 }, { "epoch": 18.98947500313244, "grad_norm": 4.543773174285889, "learning_rate": 7.758365051433958e-08, "loss": 0.4439, "num_input_tokens_seen": 184302304, "step": 151555 }, { "epoch": 18.990101491041223, "grad_norm": 11.165889739990234, "learning_rate": 7.748774343610266e-08, "loss": 0.4629, "num_input_tokens_seen": 184308512, "step": 151560 }, { "epoch": 18.990727978950005, "grad_norm": 12.384096145629883, "learning_rate": 7.739189521063839e-08, "loss": 0.4412, "num_input_tokens_seen": 184314816, "step": 151565 }, { "epoch": 18.99135446685879, "grad_norm": 3.5974714756011963, "learning_rate": 7.729610583909253e-08, "loss": 0.4145, "num_input_tokens_seen": 184320800, "step": 151570 }, { "epoch": 18.991980954767573, "grad_norm": 28.99732780456543, "learning_rate": 7.720037532261082e-08, "loss": 0.483, "num_input_tokens_seen": 184326848, "step": 151575 }, { "epoch": 18.992607442676356, "grad_norm": 7.194638252258301, "learning_rate": 7.710470366233736e-08, "loss": 0.4808, "num_input_tokens_seen": 184332800, "step": 151580 }, { "epoch": 18.99323393058514, "grad_norm": 3.489849805831909, "learning_rate": 7.700909085941622e-08, "loss": 0.4088, "num_input_tokens_seen": 184338528, "step": 151585 }, { "epoch": 18.993860418493924, "grad_norm": 6.157803535461426, "learning_rate": 7.691353691498981e-08, "loss": 0.4235, "num_input_tokens_seen": 184344480, "step": 151590 }, { "epoch": 18.994486906402706, "grad_norm": 10.677104949951172, "learning_rate": 7.68180418302017e-08, "loss": 0.4652, "num_input_tokens_seen": 184350624, "step": 151595 }, { "epoch": 18.99511339431149, "grad_norm": 3.466615915298462, "learning_rate": 7.672260560619316e-08, "loss": 0.3844, "num_input_tokens_seen": 184356832, "step": 151600 }, { "epoch": 18.995739882220274, "grad_norm": 23.6124324798584, "learning_rate": 7.662722824410495e-08, "loss": 0.466, "num_input_tokens_seen": 184363232, "step": 151605 }, { "epoch": 18.996366370129056, "grad_norm": 6.965620517730713, "learning_rate": 7.65319097450773e-08, "loss": 0.4918, "num_input_tokens_seen": 184369408, "step": 151610 }, { "epoch": 18.99699285803784, "grad_norm": 13.641197204589844, "learning_rate": 7.643665011025036e-08, "loss": 0.4435, "num_input_tokens_seen": 184375392, "step": 151615 }, { "epoch": 18.997619345946624, "grad_norm": 7.041078567504883, "learning_rate": 7.634144934076326e-08, "loss": 0.4336, "num_input_tokens_seen": 184381440, "step": 151620 }, { "epoch": 18.998245833855407, "grad_norm": 10.347430229187012, "learning_rate": 7.624630743775286e-08, "loss": 0.4271, "num_input_tokens_seen": 184387392, "step": 151625 }, { "epoch": 18.99887232176419, "grad_norm": 5.949550151824951, "learning_rate": 7.615122440235822e-08, "loss": 0.4412, "num_input_tokens_seen": 184393120, "step": 151630 }, { "epoch": 18.999498809672975, "grad_norm": 16.446453094482422, "learning_rate": 7.605620023571513e-08, "loss": 0.4792, "num_input_tokens_seen": 184399040, "step": 151635 }, { "epoch": 19.000125297581757, "grad_norm": 14.583985328674316, "learning_rate": 7.59612349389599e-08, "loss": 0.4411, "num_input_tokens_seen": 184405088, "step": 151640 }, { "epoch": 19.00075178549054, "grad_norm": 13.170024871826172, "learning_rate": 7.586632851322829e-08, "loss": 0.4538, "num_input_tokens_seen": 184411584, "step": 151645 }, { "epoch": 19.00137827339932, "grad_norm": 17.867155075073242, "learning_rate": 7.577148095965436e-08, "loss": 0.4752, "num_input_tokens_seen": 184417856, "step": 151650 }, { "epoch": 19.002004761308108, "grad_norm": 5.613503456115723, "learning_rate": 7.56766922793728e-08, "loss": 0.4352, "num_input_tokens_seen": 184423904, "step": 151655 }, { "epoch": 19.00263124921689, "grad_norm": 17.587177276611328, "learning_rate": 7.558196247351601e-08, "loss": 0.4474, "num_input_tokens_seen": 184429856, "step": 151660 }, { "epoch": 19.003257737125672, "grad_norm": 4.195199966430664, "learning_rate": 7.548729154321699e-08, "loss": 0.411, "num_input_tokens_seen": 184435968, "step": 151665 }, { "epoch": 19.003884225034458, "grad_norm": 10.783341407775879, "learning_rate": 7.539267948960871e-08, "loss": 0.5017, "num_input_tokens_seen": 184442016, "step": 151670 }, { "epoch": 19.00451071294324, "grad_norm": 15.20586109161377, "learning_rate": 7.529812631382028e-08, "loss": 0.4642, "num_input_tokens_seen": 184448352, "step": 151675 }, { "epoch": 19.005137200852023, "grad_norm": 6.354846954345703, "learning_rate": 7.520363201698356e-08, "loss": 0.5222, "num_input_tokens_seen": 184454304, "step": 151680 }, { "epoch": 19.00576368876081, "grad_norm": 5.467634201049805, "learning_rate": 7.510919660022819e-08, "loss": 0.4764, "num_input_tokens_seen": 184459968, "step": 151685 }, { "epoch": 19.00639017666959, "grad_norm": 6.099558353424072, "learning_rate": 7.501482006468275e-08, "loss": 0.4032, "num_input_tokens_seen": 184465856, "step": 151690 }, { "epoch": 19.007016664578373, "grad_norm": 5.233859062194824, "learning_rate": 7.49205024114763e-08, "loss": 0.4131, "num_input_tokens_seen": 184472032, "step": 151695 }, { "epoch": 19.00764315248716, "grad_norm": 6.243783950805664, "learning_rate": 7.482624364173518e-08, "loss": 0.5054, "num_input_tokens_seen": 184478272, "step": 151700 }, { "epoch": 19.00826964039594, "grad_norm": 8.242229461669922, "learning_rate": 7.473204375658794e-08, "loss": 0.4729, "num_input_tokens_seen": 184484576, "step": 151705 }, { "epoch": 19.008896128304723, "grad_norm": 7.423269748687744, "learning_rate": 7.463790275715976e-08, "loss": 0.4178, "num_input_tokens_seen": 184490880, "step": 151710 }, { "epoch": 19.009522616213506, "grad_norm": 3.632589817047119, "learning_rate": 7.454382064457643e-08, "loss": 0.4305, "num_input_tokens_seen": 184496416, "step": 151715 }, { "epoch": 19.01014910412229, "grad_norm": 5.1940598487854, "learning_rate": 7.444979741996317e-08, "loss": 0.4209, "num_input_tokens_seen": 184502144, "step": 151720 }, { "epoch": 19.010775592031074, "grad_norm": 6.78325891494751, "learning_rate": 7.435583308444349e-08, "loss": 0.403, "num_input_tokens_seen": 184508416, "step": 151725 }, { "epoch": 19.011402079939856, "grad_norm": 6.055868625640869, "learning_rate": 7.426192763914097e-08, "loss": 0.4665, "num_input_tokens_seen": 184514688, "step": 151730 }, { "epoch": 19.012028567848642, "grad_norm": 10.205084800720215, "learning_rate": 7.41680810851786e-08, "loss": 0.4479, "num_input_tokens_seen": 184520896, "step": 151735 }, { "epoch": 19.012655055757424, "grad_norm": 5.747213363647461, "learning_rate": 7.407429342367823e-08, "loss": 0.4652, "num_input_tokens_seen": 184527168, "step": 151740 }, { "epoch": 19.013281543666206, "grad_norm": 22.404211044311523, "learning_rate": 7.398056465576065e-08, "loss": 0.4885, "num_input_tokens_seen": 184533024, "step": 151745 }, { "epoch": 19.013908031574992, "grad_norm": 4.127632141113281, "learning_rate": 7.388689478254774e-08, "loss": 0.4256, "num_input_tokens_seen": 184539040, "step": 151750 }, { "epoch": 19.014534519483774, "grad_norm": 3.9573843479156494, "learning_rate": 7.379328380515805e-08, "loss": 0.4625, "num_input_tokens_seen": 184545088, "step": 151755 }, { "epoch": 19.015161007392557, "grad_norm": 5.137096881866455, "learning_rate": 7.369973172471179e-08, "loss": 0.4728, "num_input_tokens_seen": 184551520, "step": 151760 }, { "epoch": 19.01578749530134, "grad_norm": 10.39947509765625, "learning_rate": 7.360623854232696e-08, "loss": 0.4901, "num_input_tokens_seen": 184556768, "step": 151765 }, { "epoch": 19.016413983210125, "grad_norm": 23.274200439453125, "learning_rate": 7.351280425912099e-08, "loss": 0.465, "num_input_tokens_seen": 184562912, "step": 151770 }, { "epoch": 19.017040471118907, "grad_norm": 8.904126167297363, "learning_rate": 7.341942887621189e-08, "loss": 0.4731, "num_input_tokens_seen": 184569408, "step": 151775 }, { "epoch": 19.01766695902769, "grad_norm": 6.809628009796143, "learning_rate": 7.332611239471543e-08, "loss": 0.53, "num_input_tokens_seen": 184575456, "step": 151780 }, { "epoch": 19.018293446936475, "grad_norm": 5.335586071014404, "learning_rate": 7.323285481574738e-08, "loss": 0.4242, "num_input_tokens_seen": 184581408, "step": 151785 }, { "epoch": 19.018919934845258, "grad_norm": 7.110034942626953, "learning_rate": 7.31396561404235e-08, "loss": 0.4441, "num_input_tokens_seen": 184587424, "step": 151790 }, { "epoch": 19.01954642275404, "grad_norm": 6.927878379821777, "learning_rate": 7.304651636985627e-08, "loss": 0.4257, "num_input_tokens_seen": 184593216, "step": 151795 }, { "epoch": 19.020172910662826, "grad_norm": 7.750085353851318, "learning_rate": 7.295343550516032e-08, "loss": 0.4509, "num_input_tokens_seen": 184599136, "step": 151800 }, { "epoch": 19.020799398571608, "grad_norm": 6.788219928741455, "learning_rate": 7.286041354744922e-08, "loss": 0.4507, "num_input_tokens_seen": 184605344, "step": 151805 }, { "epoch": 19.02142588648039, "grad_norm": 15.942262649536133, "learning_rate": 7.276745049783374e-08, "loss": 0.4217, "num_input_tokens_seen": 184611744, "step": 151810 }, { "epoch": 19.022052374389176, "grad_norm": 4.0557780265808105, "learning_rate": 7.267454635742632e-08, "loss": 0.4335, "num_input_tokens_seen": 184617696, "step": 151815 }, { "epoch": 19.02267886229796, "grad_norm": 7.017756462097168, "learning_rate": 7.25817011273372e-08, "loss": 0.542, "num_input_tokens_seen": 184623840, "step": 151820 }, { "epoch": 19.02330535020674, "grad_norm": 9.126172065734863, "learning_rate": 7.248891480867715e-08, "loss": 0.4549, "num_input_tokens_seen": 184629632, "step": 151825 }, { "epoch": 19.023931838115523, "grad_norm": 12.87857437133789, "learning_rate": 7.239618740255417e-08, "loss": 0.4506, "num_input_tokens_seen": 184635680, "step": 151830 }, { "epoch": 19.02455832602431, "grad_norm": 15.204890251159668, "learning_rate": 7.230351891007792e-08, "loss": 0.4301, "num_input_tokens_seen": 184641920, "step": 151835 }, { "epoch": 19.02518481393309, "grad_norm": 5.16971492767334, "learning_rate": 7.221090933235642e-08, "loss": 0.4634, "num_input_tokens_seen": 184648128, "step": 151840 }, { "epoch": 19.025811301841873, "grad_norm": 5.059301376342773, "learning_rate": 7.2118358670496e-08, "loss": 0.4872, "num_input_tokens_seen": 184654176, "step": 151845 }, { "epoch": 19.02643778975066, "grad_norm": 16.357057571411133, "learning_rate": 7.20258669256041e-08, "loss": 0.4732, "num_input_tokens_seen": 184659616, "step": 151850 }, { "epoch": 19.02706427765944, "grad_norm": 30.015226364135742, "learning_rate": 7.193343409878594e-08, "loss": 0.4355, "num_input_tokens_seen": 184665856, "step": 151855 }, { "epoch": 19.027690765568224, "grad_norm": 26.199596405029297, "learning_rate": 7.184106019114734e-08, "loss": 0.509, "num_input_tokens_seen": 184671936, "step": 151860 }, { "epoch": 19.02831725347701, "grad_norm": 16.751523971557617, "learning_rate": 7.174874520379127e-08, "loss": 0.4329, "num_input_tokens_seen": 184678080, "step": 151865 }, { "epoch": 19.028943741385792, "grad_norm": 4.849814414978027, "learning_rate": 7.165648913782297e-08, "loss": 0.4205, "num_input_tokens_seen": 184684256, "step": 151870 }, { "epoch": 19.029570229294574, "grad_norm": 28.394182205200195, "learning_rate": 7.15642919943449e-08, "loss": 0.442, "num_input_tokens_seen": 184690240, "step": 151875 }, { "epoch": 19.030196717203356, "grad_norm": 7.68861198425293, "learning_rate": 7.147215377445949e-08, "loss": 0.4643, "num_input_tokens_seen": 184696928, "step": 151880 }, { "epoch": 19.030823205112142, "grad_norm": 5.2609968185424805, "learning_rate": 7.138007447926809e-08, "loss": 0.3819, "num_input_tokens_seen": 184703328, "step": 151885 }, { "epoch": 19.031449693020924, "grad_norm": 5.1676740646362305, "learning_rate": 7.12880541098715e-08, "loss": 0.4115, "num_input_tokens_seen": 184709248, "step": 151890 }, { "epoch": 19.032076180929707, "grad_norm": 3.8287887573242188, "learning_rate": 7.119609266736993e-08, "loss": 0.4324, "num_input_tokens_seen": 184715200, "step": 151895 }, { "epoch": 19.032702668838493, "grad_norm": 4.277255058288574, "learning_rate": 7.110419015286363e-08, "loss": 0.4325, "num_input_tokens_seen": 184721088, "step": 151900 }, { "epoch": 19.033329156747275, "grad_norm": 21.701005935668945, "learning_rate": 7.101234656745004e-08, "loss": 0.4947, "num_input_tokens_seen": 184727200, "step": 151905 }, { "epoch": 19.033955644656057, "grad_norm": 3.9644227027893066, "learning_rate": 7.092056191222829e-08, "loss": 0.384, "num_input_tokens_seen": 184733440, "step": 151910 }, { "epoch": 19.034582132564843, "grad_norm": 5.487144947052002, "learning_rate": 7.082883618829528e-08, "loss": 0.5174, "num_input_tokens_seen": 184739392, "step": 151915 }, { "epoch": 19.035208620473625, "grad_norm": 3.961122512817383, "learning_rate": 7.07371693967479e-08, "loss": 0.4793, "num_input_tokens_seen": 184745664, "step": 151920 }, { "epoch": 19.035835108382408, "grad_norm": 5.621424674987793, "learning_rate": 7.064556153868141e-08, "loss": 0.5972, "num_input_tokens_seen": 184751712, "step": 151925 }, { "epoch": 19.03646159629119, "grad_norm": 10.563261032104492, "learning_rate": 7.055401261519157e-08, "loss": 0.4789, "num_input_tokens_seen": 184757728, "step": 151930 }, { "epoch": 19.037088084199976, "grad_norm": 3.58803653717041, "learning_rate": 7.046252262737363e-08, "loss": 0.4317, "num_input_tokens_seen": 184763904, "step": 151935 }, { "epoch": 19.037714572108758, "grad_norm": 6.937755584716797, "learning_rate": 7.037109157632061e-08, "loss": 0.4841, "num_input_tokens_seen": 184769408, "step": 151940 }, { "epoch": 19.03834106001754, "grad_norm": 4.566143035888672, "learning_rate": 7.027971946312551e-08, "loss": 0.4507, "num_input_tokens_seen": 184775648, "step": 151945 }, { "epoch": 19.038967547926326, "grad_norm": 4.72178840637207, "learning_rate": 7.018840628888135e-08, "loss": 0.4501, "num_input_tokens_seen": 184781568, "step": 151950 }, { "epoch": 19.03959403583511, "grad_norm": 11.494073867797852, "learning_rate": 7.009715205467949e-08, "loss": 0.4207, "num_input_tokens_seen": 184787968, "step": 151955 }, { "epoch": 19.04022052374389, "grad_norm": 6.25821590423584, "learning_rate": 7.00059567616107e-08, "loss": 0.4247, "num_input_tokens_seen": 184794400, "step": 151960 }, { "epoch": 19.040847011652676, "grad_norm": 26.190410614013672, "learning_rate": 6.99148204107658e-08, "loss": 0.5126, "num_input_tokens_seen": 184800864, "step": 151965 }, { "epoch": 19.04147349956146, "grad_norm": 7.369274616241455, "learning_rate": 6.982374300323392e-08, "loss": 0.4473, "num_input_tokens_seen": 184807232, "step": 151970 }, { "epoch": 19.04209998747024, "grad_norm": 24.709365844726562, "learning_rate": 6.973272454010471e-08, "loss": 0.5057, "num_input_tokens_seen": 184813120, "step": 151975 }, { "epoch": 19.042726475379027, "grad_norm": 13.121996879577637, "learning_rate": 6.964176502246567e-08, "loss": 0.4703, "num_input_tokens_seen": 184819136, "step": 151980 }, { "epoch": 19.04335296328781, "grad_norm": 7.469198703765869, "learning_rate": 6.955086445140425e-08, "loss": 0.4621, "num_input_tokens_seen": 184824736, "step": 151985 }, { "epoch": 19.04397945119659, "grad_norm": 9.253582954406738, "learning_rate": 6.946002282800846e-08, "loss": 0.4643, "num_input_tokens_seen": 184830720, "step": 151990 }, { "epoch": 19.044605939105374, "grad_norm": 6.476449489593506, "learning_rate": 6.936924015336244e-08, "loss": 0.4575, "num_input_tokens_seen": 184836832, "step": 151995 }, { "epoch": 19.04523242701416, "grad_norm": 7.654634952545166, "learning_rate": 6.927851642855366e-08, "loss": 0.4135, "num_input_tokens_seen": 184843040, "step": 152000 }, { "epoch": 19.045858914922942, "grad_norm": 4.194513320922852, "learning_rate": 6.918785165466458e-08, "loss": 0.4547, "num_input_tokens_seen": 184848640, "step": 152005 }, { "epoch": 19.046485402831724, "grad_norm": 4.903320789337158, "learning_rate": 6.909724583278154e-08, "loss": 0.4845, "num_input_tokens_seen": 184854656, "step": 152010 }, { "epoch": 19.04711189074051, "grad_norm": 5.954339027404785, "learning_rate": 6.900669896398537e-08, "loss": 0.4838, "num_input_tokens_seen": 184860832, "step": 152015 }, { "epoch": 19.047738378649292, "grad_norm": 3.5967652797698975, "learning_rate": 6.891621104936074e-08, "loss": 0.4375, "num_input_tokens_seen": 184866592, "step": 152020 }, { "epoch": 19.048364866558074, "grad_norm": 3.0659005641937256, "learning_rate": 6.882578208998846e-08, "loss": 0.4156, "num_input_tokens_seen": 184872608, "step": 152025 }, { "epoch": 19.04899135446686, "grad_norm": 4.817276954650879, "learning_rate": 6.873541208694989e-08, "loss": 0.4212, "num_input_tokens_seen": 184879008, "step": 152030 }, { "epoch": 19.049617842375643, "grad_norm": 31.93211555480957, "learning_rate": 6.864510104132527e-08, "loss": 0.5966, "num_input_tokens_seen": 184884608, "step": 152035 }, { "epoch": 19.050244330284425, "grad_norm": 4.342573642730713, "learning_rate": 6.855484895419484e-08, "loss": 0.5235, "num_input_tokens_seen": 184890784, "step": 152040 }, { "epoch": 19.050870818193207, "grad_norm": 7.426815509796143, "learning_rate": 6.846465582663664e-08, "loss": 0.4287, "num_input_tokens_seen": 184896160, "step": 152045 }, { "epoch": 19.051497306101993, "grad_norm": 5.744129657745361, "learning_rate": 6.83745216597298e-08, "loss": 0.4399, "num_input_tokens_seen": 184902400, "step": 152050 }, { "epoch": 19.052123794010775, "grad_norm": 8.552492141723633, "learning_rate": 6.828444645455235e-08, "loss": 0.4548, "num_input_tokens_seen": 184908064, "step": 152055 }, { "epoch": 19.052750281919558, "grad_norm": 19.384586334228516, "learning_rate": 6.819443021218063e-08, "loss": 0.4149, "num_input_tokens_seen": 184914592, "step": 152060 }, { "epoch": 19.053376769828343, "grad_norm": 11.388970375061035, "learning_rate": 6.810447293369105e-08, "loss": 0.4477, "num_input_tokens_seen": 184920928, "step": 152065 }, { "epoch": 19.054003257737126, "grad_norm": 4.747447967529297, "learning_rate": 6.801457462015826e-08, "loss": 0.5184, "num_input_tokens_seen": 184926816, "step": 152070 }, { "epoch": 19.054629745645908, "grad_norm": 4.403864860534668, "learning_rate": 6.792473527265863e-08, "loss": 0.4117, "num_input_tokens_seen": 184932768, "step": 152075 }, { "epoch": 19.055256233554694, "grad_norm": 4.6768479347229, "learning_rate": 6.783495489226466e-08, "loss": 0.3762, "num_input_tokens_seen": 184938912, "step": 152080 }, { "epoch": 19.055882721463476, "grad_norm": 12.087390899658203, "learning_rate": 6.774523348005157e-08, "loss": 0.4136, "num_input_tokens_seen": 184945632, "step": 152085 }, { "epoch": 19.05650920937226, "grad_norm": 5.7082929611206055, "learning_rate": 6.76555710370902e-08, "loss": 0.424, "num_input_tokens_seen": 184951584, "step": 152090 }, { "epoch": 19.057135697281044, "grad_norm": 8.371758460998535, "learning_rate": 6.756596756445355e-08, "loss": 0.3814, "num_input_tokens_seen": 184957536, "step": 152095 }, { "epoch": 19.057762185189826, "grad_norm": 19.022945404052734, "learning_rate": 6.7476423063213e-08, "loss": 0.4342, "num_input_tokens_seen": 184964096, "step": 152100 }, { "epoch": 19.05838867309861, "grad_norm": 12.537690162658691, "learning_rate": 6.738693753443882e-08, "loss": 0.5032, "num_input_tokens_seen": 184970496, "step": 152105 }, { "epoch": 19.05901516100739, "grad_norm": 5.198465347290039, "learning_rate": 6.729751097920012e-08, "loss": 0.4021, "num_input_tokens_seen": 184976448, "step": 152110 }, { "epoch": 19.059641648916177, "grad_norm": 7.567957878112793, "learning_rate": 6.720814339856773e-08, "loss": 0.4663, "num_input_tokens_seen": 184982784, "step": 152115 }, { "epoch": 19.06026813682496, "grad_norm": 5.528916835784912, "learning_rate": 6.711883479360915e-08, "loss": 0.4515, "num_input_tokens_seen": 184988832, "step": 152120 }, { "epoch": 19.06089462473374, "grad_norm": 5.498947620391846, "learning_rate": 6.702958516539181e-08, "loss": 0.4376, "num_input_tokens_seen": 184995072, "step": 152125 }, { "epoch": 19.061521112642527, "grad_norm": 6.101199626922607, "learning_rate": 6.694039451498379e-08, "loss": 0.4272, "num_input_tokens_seen": 185001088, "step": 152130 }, { "epoch": 19.06214760055131, "grad_norm": 7.369645118713379, "learning_rate": 6.685126284345033e-08, "loss": 0.5424, "num_input_tokens_seen": 185007296, "step": 152135 }, { "epoch": 19.062774088460092, "grad_norm": 6.181682109832764, "learning_rate": 6.676219015185781e-08, "loss": 0.468, "num_input_tokens_seen": 185012864, "step": 152140 }, { "epoch": 19.063400576368878, "grad_norm": 5.42335319519043, "learning_rate": 6.667317644127091e-08, "loss": 0.431, "num_input_tokens_seen": 185018560, "step": 152145 }, { "epoch": 19.06402706427766, "grad_norm": 11.722270965576172, "learning_rate": 6.658422171275437e-08, "loss": 0.4187, "num_input_tokens_seen": 185024864, "step": 152150 }, { "epoch": 19.064653552186442, "grad_norm": 26.358949661254883, "learning_rate": 6.649532596737063e-08, "loss": 0.4259, "num_input_tokens_seen": 185031328, "step": 152155 }, { "epoch": 19.065280040095224, "grad_norm": 31.38018798828125, "learning_rate": 6.640648920618331e-08, "loss": 0.4572, "num_input_tokens_seen": 185036832, "step": 152160 }, { "epoch": 19.06590652800401, "grad_norm": 17.417573928833008, "learning_rate": 6.631771143025434e-08, "loss": 0.3952, "num_input_tokens_seen": 185042944, "step": 152165 }, { "epoch": 19.066533015912793, "grad_norm": 5.021567344665527, "learning_rate": 6.622899264064453e-08, "loss": 0.4108, "num_input_tokens_seen": 185048992, "step": 152170 }, { "epoch": 19.067159503821575, "grad_norm": 19.974571228027344, "learning_rate": 6.614033283841581e-08, "loss": 0.5369, "num_input_tokens_seen": 185055072, "step": 152175 }, { "epoch": 19.06778599173036, "grad_norm": 5.646735191345215, "learning_rate": 6.605173202462789e-08, "loss": 0.5617, "num_input_tokens_seen": 185061088, "step": 152180 }, { "epoch": 19.068412479639143, "grad_norm": 4.101066589355469, "learning_rate": 6.596319020033937e-08, "loss": 0.3798, "num_input_tokens_seen": 185067200, "step": 152185 }, { "epoch": 19.069038967547925, "grad_norm": 45.69940948486328, "learning_rate": 6.587470736660884e-08, "loss": 0.523, "num_input_tokens_seen": 185073792, "step": 152190 }, { "epoch": 19.06966545545671, "grad_norm": 4.633298397064209, "learning_rate": 6.578628352449545e-08, "loss": 0.4238, "num_input_tokens_seen": 185080096, "step": 152195 }, { "epoch": 19.070291943365493, "grad_norm": 5.561607360839844, "learning_rate": 6.569791867505448e-08, "loss": 0.4666, "num_input_tokens_seen": 185086240, "step": 152200 }, { "epoch": 19.070918431274276, "grad_norm": 6.548210144042969, "learning_rate": 6.560961281934452e-08, "loss": 0.4067, "num_input_tokens_seen": 185092576, "step": 152205 }, { "epoch": 19.07154491918306, "grad_norm": 4.177508354187012, "learning_rate": 6.552136595841973e-08, "loss": 0.4454, "num_input_tokens_seen": 185098528, "step": 152210 }, { "epoch": 19.072171407091844, "grad_norm": 20.929704666137695, "learning_rate": 6.543317809333538e-08, "loss": 0.4783, "num_input_tokens_seen": 185104576, "step": 152215 }, { "epoch": 19.072797895000626, "grad_norm": 27.915542602539062, "learning_rate": 6.534504922514673e-08, "loss": 0.4897, "num_input_tokens_seen": 185110816, "step": 152220 }, { "epoch": 19.07342438290941, "grad_norm": 12.685322761535645, "learning_rate": 6.525697935490682e-08, "loss": 0.4131, "num_input_tokens_seen": 185116768, "step": 152225 }, { "epoch": 19.074050870818194, "grad_norm": 8.742520332336426, "learning_rate": 6.516896848366816e-08, "loss": 0.486, "num_input_tokens_seen": 185123168, "step": 152230 }, { "epoch": 19.074677358726976, "grad_norm": 6.773947715759277, "learning_rate": 6.508101661248323e-08, "loss": 0.4717, "num_input_tokens_seen": 185129536, "step": 152235 }, { "epoch": 19.07530384663576, "grad_norm": 3.744900941848755, "learning_rate": 6.499312374240451e-08, "loss": 0.4464, "num_input_tokens_seen": 185135296, "step": 152240 }, { "epoch": 19.075930334544545, "grad_norm": 9.642263412475586, "learning_rate": 6.490528987448175e-08, "loss": 0.4186, "num_input_tokens_seen": 185140864, "step": 152245 }, { "epoch": 19.076556822453327, "grad_norm": 7.220871448516846, "learning_rate": 6.481751500976574e-08, "loss": 0.4263, "num_input_tokens_seen": 185147104, "step": 152250 }, { "epoch": 19.07718331036211, "grad_norm": 6.561002731323242, "learning_rate": 6.47297991493051e-08, "loss": 0.5027, "num_input_tokens_seen": 185153536, "step": 152255 }, { "epoch": 19.077809798270895, "grad_norm": 11.331474304199219, "learning_rate": 6.464214229414956e-08, "loss": 0.4019, "num_input_tokens_seen": 185159968, "step": 152260 }, { "epoch": 19.078436286179677, "grad_norm": 24.539993286132812, "learning_rate": 6.455454444534659e-08, "loss": 0.4628, "num_input_tokens_seen": 185166528, "step": 152265 }, { "epoch": 19.07906277408846, "grad_norm": 14.241281509399414, "learning_rate": 6.446700560394314e-08, "loss": 0.428, "num_input_tokens_seen": 185173024, "step": 152270 }, { "epoch": 19.079689261997242, "grad_norm": 5.86927604675293, "learning_rate": 6.437952577098672e-08, "loss": 0.4584, "num_input_tokens_seen": 185179168, "step": 152275 }, { "epoch": 19.080315749906028, "grad_norm": 5.468666076660156, "learning_rate": 6.429210494752202e-08, "loss": 0.3845, "num_input_tokens_seen": 185185344, "step": 152280 }, { "epoch": 19.08094223781481, "grad_norm": 17.704261779785156, "learning_rate": 6.420474313459546e-08, "loss": 0.473, "num_input_tokens_seen": 185191520, "step": 152285 }, { "epoch": 19.081568725723592, "grad_norm": 12.031723976135254, "learning_rate": 6.411744033325062e-08, "loss": 0.4352, "num_input_tokens_seen": 185197504, "step": 152290 }, { "epoch": 19.082195213632378, "grad_norm": 7.3215250968933105, "learning_rate": 6.403019654453168e-08, "loss": 0.4362, "num_input_tokens_seen": 185203360, "step": 152295 }, { "epoch": 19.08282170154116, "grad_norm": 4.687819004058838, "learning_rate": 6.394301176948226e-08, "loss": 0.3765, "num_input_tokens_seen": 185208768, "step": 152300 }, { "epoch": 19.083448189449943, "grad_norm": 4.603945732116699, "learning_rate": 6.385588600914372e-08, "loss": 0.5177, "num_input_tokens_seen": 185214976, "step": 152305 }, { "epoch": 19.08407467735873, "grad_norm": 5.510281085968018, "learning_rate": 6.376881926455802e-08, "loss": 0.4124, "num_input_tokens_seen": 185220896, "step": 152310 }, { "epoch": 19.08470116526751, "grad_norm": 5.035051345825195, "learning_rate": 6.368181153676656e-08, "loss": 0.446, "num_input_tokens_seen": 185226976, "step": 152315 }, { "epoch": 19.085327653176293, "grad_norm": 7.325882911682129, "learning_rate": 6.359486282680904e-08, "loss": 0.4833, "num_input_tokens_seen": 185232864, "step": 152320 }, { "epoch": 19.08595414108508, "grad_norm": 9.52906608581543, "learning_rate": 6.350797313572521e-08, "loss": 0.5282, "num_input_tokens_seen": 185239072, "step": 152325 }, { "epoch": 19.08658062899386, "grad_norm": 7.90297269821167, "learning_rate": 6.342114246455422e-08, "loss": 0.4676, "num_input_tokens_seen": 185244832, "step": 152330 }, { "epoch": 19.087207116902643, "grad_norm": 7.531834125518799, "learning_rate": 6.333437081433414e-08, "loss": 0.4068, "num_input_tokens_seen": 185251136, "step": 152335 }, { "epoch": 19.087833604811426, "grad_norm": 4.442178726196289, "learning_rate": 6.324765818610135e-08, "loss": 0.4641, "num_input_tokens_seen": 185257312, "step": 152340 }, { "epoch": 19.08846009272021, "grad_norm": 4.968215465545654, "learning_rate": 6.316100458089392e-08, "loss": 0.4148, "num_input_tokens_seen": 185263456, "step": 152345 }, { "epoch": 19.089086580628994, "grad_norm": 25.4642276763916, "learning_rate": 6.307440999974713e-08, "loss": 0.5226, "num_input_tokens_seen": 185269568, "step": 152350 }, { "epoch": 19.089713068537776, "grad_norm": 10.172123908996582, "learning_rate": 6.298787444369681e-08, "loss": 0.3809, "num_input_tokens_seen": 185275584, "step": 152355 }, { "epoch": 19.090339556446562, "grad_norm": 26.216636657714844, "learning_rate": 6.290139791377714e-08, "loss": 0.4916, "num_input_tokens_seen": 185281568, "step": 152360 }, { "epoch": 19.090966044355344, "grad_norm": 6.186068534851074, "learning_rate": 6.281498041102174e-08, "loss": 0.4753, "num_input_tokens_seen": 185287616, "step": 152365 }, { "epoch": 19.091592532264126, "grad_norm": 5.039506912231445, "learning_rate": 6.272862193646478e-08, "loss": 0.4717, "num_input_tokens_seen": 185293760, "step": 152370 }, { "epoch": 19.092219020172912, "grad_norm": 4.5154032707214355, "learning_rate": 6.264232249113766e-08, "loss": 0.4485, "num_input_tokens_seen": 185299936, "step": 152375 }, { "epoch": 19.092845508081695, "grad_norm": 8.969132423400879, "learning_rate": 6.255608207607289e-08, "loss": 0.4793, "num_input_tokens_seen": 185306336, "step": 152380 }, { "epoch": 19.093471995990477, "grad_norm": 5.025639533996582, "learning_rate": 6.246990069230185e-08, "loss": 0.43, "num_input_tokens_seen": 185312416, "step": 152385 }, { "epoch": 19.09409848389926, "grad_norm": 4.663956165313721, "learning_rate": 6.238377834085373e-08, "loss": 0.3678, "num_input_tokens_seen": 185318464, "step": 152390 }, { "epoch": 19.094724971808045, "grad_norm": 7.841678142547607, "learning_rate": 6.229771502275883e-08, "loss": 0.4214, "num_input_tokens_seen": 185324800, "step": 152395 }, { "epoch": 19.095351459716827, "grad_norm": 17.929346084594727, "learning_rate": 6.22117107390463e-08, "loss": 0.5102, "num_input_tokens_seen": 185331008, "step": 152400 }, { "epoch": 19.09597794762561, "grad_norm": 19.396894454956055, "learning_rate": 6.212576549074423e-08, "loss": 0.4904, "num_input_tokens_seen": 185336992, "step": 152405 }, { "epoch": 19.096604435534395, "grad_norm": 4.199740409851074, "learning_rate": 6.203987927888066e-08, "loss": 0.3957, "num_input_tokens_seen": 185342208, "step": 152410 }, { "epoch": 19.097230923443178, "grad_norm": 7.778866291046143, "learning_rate": 6.195405210448146e-08, "loss": 0.5058, "num_input_tokens_seen": 185348480, "step": 152415 }, { "epoch": 19.09785741135196, "grad_norm": 18.792295455932617, "learning_rate": 6.186828396857303e-08, "loss": 0.4415, "num_input_tokens_seen": 185354656, "step": 152420 }, { "epoch": 19.098483899260746, "grad_norm": 5.3690714836120605, "learning_rate": 6.178257487218176e-08, "loss": 0.4094, "num_input_tokens_seen": 185360704, "step": 152425 }, { "epoch": 19.099110387169528, "grad_norm": 10.016865730285645, "learning_rate": 6.169692481633072e-08, "loss": 0.4746, "num_input_tokens_seen": 185366496, "step": 152430 }, { "epoch": 19.09973687507831, "grad_norm": 23.302677154541016, "learning_rate": 6.161133380204576e-08, "loss": 0.4847, "num_input_tokens_seen": 185372576, "step": 152435 }, { "epoch": 19.100363362987093, "grad_norm": 6.683319568634033, "learning_rate": 6.152580183034829e-08, "loss": 0.4135, "num_input_tokens_seen": 185378656, "step": 152440 }, { "epoch": 19.10098985089588, "grad_norm": 10.300782203674316, "learning_rate": 6.144032890226304e-08, "loss": 0.5245, "num_input_tokens_seen": 185385120, "step": 152445 }, { "epoch": 19.10161633880466, "grad_norm": 7.842737197875977, "learning_rate": 6.135491501880975e-08, "loss": 0.4963, "num_input_tokens_seen": 185391104, "step": 152450 }, { "epoch": 19.102242826713443, "grad_norm": 4.835518836975098, "learning_rate": 6.12695601810115e-08, "loss": 0.3903, "num_input_tokens_seen": 185396896, "step": 152455 }, { "epoch": 19.10286931462223, "grad_norm": 16.011751174926758, "learning_rate": 6.118426438988689e-08, "loss": 0.4035, "num_input_tokens_seen": 185403040, "step": 152460 }, { "epoch": 19.10349580253101, "grad_norm": 33.29381561279297, "learning_rate": 6.109902764645792e-08, "loss": 0.511, "num_input_tokens_seen": 185409536, "step": 152465 }, { "epoch": 19.104122290439793, "grad_norm": 4.302855968475342, "learning_rate": 6.101384995174153e-08, "loss": 0.4469, "num_input_tokens_seen": 185415456, "step": 152470 }, { "epoch": 19.10474877834858, "grad_norm": 7.105640411376953, "learning_rate": 6.092873130675692e-08, "loss": 0.5014, "num_input_tokens_seen": 185422016, "step": 152475 }, { "epoch": 19.10537526625736, "grad_norm": 4.678335666656494, "learning_rate": 6.08436717125227e-08, "loss": 0.3979, "num_input_tokens_seen": 185428192, "step": 152480 }, { "epoch": 19.106001754166144, "grad_norm": 14.69908618927002, "learning_rate": 6.075867117005473e-08, "loss": 0.4012, "num_input_tokens_seen": 185434400, "step": 152485 }, { "epoch": 19.10662824207493, "grad_norm": 9.485343933105469, "learning_rate": 6.067372968036889e-08, "loss": 0.4188, "num_input_tokens_seen": 185440704, "step": 152490 }, { "epoch": 19.107254729983712, "grad_norm": 6.068175315856934, "learning_rate": 6.058884724448211e-08, "loss": 0.5125, "num_input_tokens_seen": 185446688, "step": 152495 }, { "epoch": 19.107881217892494, "grad_norm": 19.147214889526367, "learning_rate": 6.050402386340804e-08, "loss": 0.4541, "num_input_tokens_seen": 185452256, "step": 152500 }, { "epoch": 19.108507705801276, "grad_norm": 8.11416244506836, "learning_rate": 6.041925953816141e-08, "loss": 0.4968, "num_input_tokens_seen": 185457952, "step": 152505 }, { "epoch": 19.109134193710062, "grad_norm": 4.772413730621338, "learning_rate": 6.033455426975588e-08, "loss": 0.4317, "num_input_tokens_seen": 185463904, "step": 152510 }, { "epoch": 19.109760681618845, "grad_norm": 10.045836448669434, "learning_rate": 6.024990805920339e-08, "loss": 0.4222, "num_input_tokens_seen": 185469984, "step": 152515 }, { "epoch": 19.110387169527627, "grad_norm": 21.195768356323242, "learning_rate": 6.016532090751648e-08, "loss": 0.4746, "num_input_tokens_seen": 185476256, "step": 152520 }, { "epoch": 19.111013657436413, "grad_norm": 18.694982528686523, "learning_rate": 6.008079281570655e-08, "loss": 0.4669, "num_input_tokens_seen": 185482272, "step": 152525 }, { "epoch": 19.111640145345195, "grad_norm": 6.699217319488525, "learning_rate": 5.999632378478392e-08, "loss": 0.411, "num_input_tokens_seen": 185488064, "step": 152530 }, { "epoch": 19.112266633253977, "grad_norm": 13.473129272460938, "learning_rate": 5.991191381575834e-08, "loss": 0.39, "num_input_tokens_seen": 185494240, "step": 152535 }, { "epoch": 19.112893121162763, "grad_norm": 5.193467140197754, "learning_rate": 5.982756290963953e-08, "loss": 0.4224, "num_input_tokens_seen": 185500384, "step": 152540 }, { "epoch": 19.113519609071545, "grad_norm": 7.214680194854736, "learning_rate": 5.974327106743504e-08, "loss": 0.4208, "num_input_tokens_seen": 185506688, "step": 152545 }, { "epoch": 19.114146096980328, "grad_norm": 5.217155456542969, "learning_rate": 5.965903829015352e-08, "loss": 0.4524, "num_input_tokens_seen": 185512704, "step": 152550 }, { "epoch": 19.11477258488911, "grad_norm": 8.558982849121094, "learning_rate": 5.957486457880246e-08, "loss": 0.4099, "num_input_tokens_seen": 185518432, "step": 152555 }, { "epoch": 19.115399072797896, "grad_norm": 3.7659764289855957, "learning_rate": 5.949074993438664e-08, "loss": 0.4488, "num_input_tokens_seen": 185524640, "step": 152560 }, { "epoch": 19.116025560706678, "grad_norm": 6.312963485717773, "learning_rate": 5.940669435791357e-08, "loss": 0.4287, "num_input_tokens_seen": 185530816, "step": 152565 }, { "epoch": 19.11665204861546, "grad_norm": 4.165204048156738, "learning_rate": 5.932269785038636e-08, "loss": 0.3964, "num_input_tokens_seen": 185536992, "step": 152570 }, { "epoch": 19.117278536524246, "grad_norm": 10.123552322387695, "learning_rate": 5.9238760412810846e-08, "loss": 0.439, "num_input_tokens_seen": 185543360, "step": 152575 }, { "epoch": 19.11790502443303, "grad_norm": 4.7934675216674805, "learning_rate": 5.915488204618958e-08, "loss": 0.4153, "num_input_tokens_seen": 185549120, "step": 152580 }, { "epoch": 19.11853151234181, "grad_norm": 28.463848114013672, "learning_rate": 5.9071062751525635e-08, "loss": 0.4929, "num_input_tokens_seen": 185554560, "step": 152585 }, { "epoch": 19.119158000250597, "grad_norm": 16.748693466186523, "learning_rate": 5.8987302529821545e-08, "loss": 0.4858, "num_input_tokens_seen": 185560544, "step": 152590 }, { "epoch": 19.11978448815938, "grad_norm": 10.349297523498535, "learning_rate": 5.890360138207873e-08, "loss": 0.4708, "num_input_tokens_seen": 185566336, "step": 152595 }, { "epoch": 19.12041097606816, "grad_norm": 3.724705457687378, "learning_rate": 5.8819959309296956e-08, "loss": 0.4715, "num_input_tokens_seen": 185572096, "step": 152600 }, { "epoch": 19.121037463976947, "grad_norm": 7.645556926727295, "learning_rate": 5.873637631247708e-08, "loss": 0.3735, "num_input_tokens_seen": 185578016, "step": 152605 }, { "epoch": 19.12166395188573, "grad_norm": 10.530330657958984, "learning_rate": 5.865285239261831e-08, "loss": 0.4738, "num_input_tokens_seen": 185584288, "step": 152610 }, { "epoch": 19.12229043979451, "grad_norm": 4.051266193389893, "learning_rate": 5.8569387550719283e-08, "loss": 0.4277, "num_input_tokens_seen": 185590272, "step": 152615 }, { "epoch": 19.122916927703294, "grad_norm": 32.79237365722656, "learning_rate": 5.8485981787777535e-08, "loss": 0.4391, "num_input_tokens_seen": 185596832, "step": 152620 }, { "epoch": 19.12354341561208, "grad_norm": 11.712061882019043, "learning_rate": 5.8402635104790605e-08, "loss": 0.4856, "num_input_tokens_seen": 185603040, "step": 152625 }, { "epoch": 19.124169903520862, "grad_norm": 5.995782852172852, "learning_rate": 5.831934750275492e-08, "loss": 0.4519, "num_input_tokens_seen": 185609024, "step": 152630 }, { "epoch": 19.124796391429644, "grad_norm": 18.433860778808594, "learning_rate": 5.823611898266579e-08, "loss": 0.4677, "num_input_tokens_seen": 185615232, "step": 152635 }, { "epoch": 19.12542287933843, "grad_norm": 13.835948944091797, "learning_rate": 5.815294954551909e-08, "loss": 0.4543, "num_input_tokens_seen": 185621536, "step": 152640 }, { "epoch": 19.126049367247212, "grad_norm": 11.581939697265625, "learning_rate": 5.806983919230846e-08, "loss": 0.4348, "num_input_tokens_seen": 185627648, "step": 152645 }, { "epoch": 19.126675855155995, "grad_norm": 21.47395133972168, "learning_rate": 5.798678792402812e-08, "loss": 0.5332, "num_input_tokens_seen": 185633568, "step": 152650 }, { "epoch": 19.12730234306478, "grad_norm": 19.589988708496094, "learning_rate": 5.7903795741670596e-08, "loss": 0.483, "num_input_tokens_seen": 185639936, "step": 152655 }, { "epoch": 19.127928830973563, "grad_norm": 23.85323143005371, "learning_rate": 5.7820862646228436e-08, "loss": 0.5065, "num_input_tokens_seen": 185646048, "step": 152660 }, { "epoch": 19.128555318882345, "grad_norm": 4.218106269836426, "learning_rate": 5.773798863869251e-08, "loss": 0.4256, "num_input_tokens_seen": 185652224, "step": 152665 }, { "epoch": 19.129181806791127, "grad_norm": 3.8768374919891357, "learning_rate": 5.76551737200548e-08, "loss": 0.4079, "num_input_tokens_seen": 185657856, "step": 152670 }, { "epoch": 19.129808294699913, "grad_norm": 20.773916244506836, "learning_rate": 5.757241789130452e-08, "loss": 0.4423, "num_input_tokens_seen": 185664128, "step": 152675 }, { "epoch": 19.130434782608695, "grad_norm": 10.762260437011719, "learning_rate": 5.748972115343143e-08, "loss": 0.4373, "num_input_tokens_seen": 185670240, "step": 152680 }, { "epoch": 19.131061270517478, "grad_norm": 5.420276641845703, "learning_rate": 5.740708350742419e-08, "loss": 0.4407, "num_input_tokens_seen": 185676352, "step": 152685 }, { "epoch": 19.131687758426263, "grad_norm": 3.9883158206939697, "learning_rate": 5.7324504954270335e-08, "loss": 0.4773, "num_input_tokens_seen": 185681952, "step": 152690 }, { "epoch": 19.132314246335046, "grad_norm": 16.486352920532227, "learning_rate": 5.7241985494957405e-08, "loss": 0.4433, "num_input_tokens_seen": 185688032, "step": 152695 }, { "epoch": 19.132940734243828, "grad_norm": 3.4541819095611572, "learning_rate": 5.715952513047296e-08, "loss": 0.4172, "num_input_tokens_seen": 185694304, "step": 152700 }, { "epoch": 19.133567222152614, "grad_norm": 7.141734600067139, "learning_rate": 5.707712386180175e-08, "loss": 0.4103, "num_input_tokens_seen": 185700480, "step": 152705 }, { "epoch": 19.134193710061396, "grad_norm": 5.397764682769775, "learning_rate": 5.6994781689929116e-08, "loss": 0.4521, "num_input_tokens_seen": 185706496, "step": 152710 }, { "epoch": 19.13482019797018, "grad_norm": 15.125371932983398, "learning_rate": 5.691249861584036e-08, "loss": 0.4714, "num_input_tokens_seen": 185712224, "step": 152715 }, { "epoch": 19.135446685878964, "grad_norm": 14.329187393188477, "learning_rate": 5.6830274640518044e-08, "loss": 0.43, "num_input_tokens_seen": 185718240, "step": 152720 }, { "epoch": 19.136073173787747, "grad_norm": 4.257279872894287, "learning_rate": 5.6748109764945824e-08, "loss": 0.3921, "num_input_tokens_seen": 185724544, "step": 152725 }, { "epoch": 19.13669966169653, "grad_norm": 17.924407958984375, "learning_rate": 5.6666003990106245e-08, "loss": 0.5351, "num_input_tokens_seen": 185730400, "step": 152730 }, { "epoch": 19.13732614960531, "grad_norm": 6.028094291687012, "learning_rate": 5.6583957316980185e-08, "loss": 0.4354, "num_input_tokens_seen": 185736512, "step": 152735 }, { "epoch": 19.137952637514097, "grad_norm": 5.252111434936523, "learning_rate": 5.65019697465502e-08, "loss": 0.4554, "num_input_tokens_seen": 185742720, "step": 152740 }, { "epoch": 19.13857912542288, "grad_norm": 10.023933410644531, "learning_rate": 5.642004127979439e-08, "loss": 0.4424, "num_input_tokens_seen": 185748928, "step": 152745 }, { "epoch": 19.13920561333166, "grad_norm": 7.710527420043945, "learning_rate": 5.633817191769364e-08, "loss": 0.4538, "num_input_tokens_seen": 185754976, "step": 152750 }, { "epoch": 19.139832101240447, "grad_norm": 6.956901550292969, "learning_rate": 5.625636166122661e-08, "loss": 0.4158, "num_input_tokens_seen": 185760736, "step": 152755 }, { "epoch": 19.14045858914923, "grad_norm": 20.54878044128418, "learning_rate": 5.6174610511371406e-08, "loss": 0.5272, "num_input_tokens_seen": 185766528, "step": 152760 }, { "epoch": 19.141085077058012, "grad_norm": 33.21842575073242, "learning_rate": 5.6092918469104473e-08, "loss": 0.4817, "num_input_tokens_seen": 185772608, "step": 152765 }, { "epoch": 19.141711564966798, "grad_norm": 6.106203556060791, "learning_rate": 5.601128553540447e-08, "loss": 0.3978, "num_input_tokens_seen": 185778528, "step": 152770 }, { "epoch": 19.14233805287558, "grad_norm": 6.167496204376221, "learning_rate": 5.592971171124506e-08, "loss": 0.4525, "num_input_tokens_seen": 185784416, "step": 152775 }, { "epoch": 19.142964540784362, "grad_norm": 3.76179575920105, "learning_rate": 5.58481969976038e-08, "loss": 0.3956, "num_input_tokens_seen": 185789920, "step": 152780 }, { "epoch": 19.143591028693145, "grad_norm": 5.020731449127197, "learning_rate": 5.576674139545324e-08, "loss": 0.416, "num_input_tokens_seen": 185796096, "step": 152785 }, { "epoch": 19.14421751660193, "grad_norm": 5.575927257537842, "learning_rate": 5.568534490576816e-08, "loss": 0.3882, "num_input_tokens_seen": 185802368, "step": 152790 }, { "epoch": 19.144844004510713, "grad_norm": 6.776293754577637, "learning_rate": 5.560400752952222e-08, "loss": 0.463, "num_input_tokens_seen": 185807488, "step": 152795 }, { "epoch": 19.145470492419495, "grad_norm": 4.633934497833252, "learning_rate": 5.552272926768743e-08, "loss": 0.5383, "num_input_tokens_seen": 185813440, "step": 152800 }, { "epoch": 19.14609698032828, "grad_norm": 18.685598373413086, "learning_rate": 5.544151012123522e-08, "loss": 0.4745, "num_input_tokens_seen": 185819776, "step": 152805 }, { "epoch": 19.146723468237063, "grad_norm": 12.924270629882812, "learning_rate": 5.5360350091136497e-08, "loss": 0.4547, "num_input_tokens_seen": 185826048, "step": 152810 }, { "epoch": 19.147349956145845, "grad_norm": 11.636075973510742, "learning_rate": 5.527924917836269e-08, "loss": 0.4846, "num_input_tokens_seen": 185832160, "step": 152815 }, { "epoch": 19.14797644405463, "grad_norm": 27.066448211669922, "learning_rate": 5.5198207383882484e-08, "loss": 0.4621, "num_input_tokens_seen": 185838208, "step": 152820 }, { "epoch": 19.148602931963413, "grad_norm": 11.120660781860352, "learning_rate": 5.511722470866565e-08, "loss": 0.4476, "num_input_tokens_seen": 185844096, "step": 152825 }, { "epoch": 19.149229419872196, "grad_norm": 8.079507827758789, "learning_rate": 5.5036301153679197e-08, "loss": 0.4174, "num_input_tokens_seen": 185849664, "step": 152830 }, { "epoch": 19.14985590778098, "grad_norm": 8.341584205627441, "learning_rate": 5.4955436719891785e-08, "loss": 0.5527, "num_input_tokens_seen": 185855424, "step": 152835 }, { "epoch": 19.150482395689764, "grad_norm": 10.922401428222656, "learning_rate": 5.487463140826876e-08, "loss": 0.4593, "num_input_tokens_seen": 185861184, "step": 152840 }, { "epoch": 19.151108883598546, "grad_norm": 18.671836853027344, "learning_rate": 5.479388521977824e-08, "loss": 0.424, "num_input_tokens_seen": 185867328, "step": 152845 }, { "epoch": 19.15173537150733, "grad_norm": 4.9901885986328125, "learning_rate": 5.47131981553839e-08, "loss": 0.4997, "num_input_tokens_seen": 185873600, "step": 152850 }, { "epoch": 19.152361859416114, "grad_norm": 20.52735137939453, "learning_rate": 5.463257021605051e-08, "loss": 0.4484, "num_input_tokens_seen": 185879744, "step": 152855 }, { "epoch": 19.152988347324897, "grad_norm": 7.335433483123779, "learning_rate": 5.4552001402743415e-08, "loss": 0.4365, "num_input_tokens_seen": 185885728, "step": 152860 }, { "epoch": 19.15361483523368, "grad_norm": 27.43691635131836, "learning_rate": 5.447149171642463e-08, "loss": 0.4297, "num_input_tokens_seen": 185891968, "step": 152865 }, { "epoch": 19.154241323142465, "grad_norm": 5.720200061798096, "learning_rate": 5.439104115805782e-08, "loss": 0.4835, "num_input_tokens_seen": 185897888, "step": 152870 }, { "epoch": 19.154867811051247, "grad_norm": 5.790455341339111, "learning_rate": 5.4310649728603335e-08, "loss": 0.414, "num_input_tokens_seen": 185903808, "step": 152875 }, { "epoch": 19.15549429896003, "grad_norm": 6.016263008117676, "learning_rate": 5.423031742902374e-08, "loss": 0.4145, "num_input_tokens_seen": 185909856, "step": 152880 }, { "epoch": 19.156120786868815, "grad_norm": 4.055266380310059, "learning_rate": 5.4150044260278255e-08, "loss": 0.4622, "num_input_tokens_seen": 185915136, "step": 152885 }, { "epoch": 19.156747274777597, "grad_norm": 20.892770767211914, "learning_rate": 5.406983022332779e-08, "loss": 0.5515, "num_input_tokens_seen": 185921152, "step": 152890 }, { "epoch": 19.15737376268638, "grad_norm": 16.544654846191406, "learning_rate": 5.3989675319130464e-08, "loss": 0.427, "num_input_tokens_seen": 185927200, "step": 152895 }, { "epoch": 19.158000250595162, "grad_norm": 13.311705589294434, "learning_rate": 5.390957954864495e-08, "loss": 0.503, "num_input_tokens_seen": 185933088, "step": 152900 }, { "epoch": 19.158626738503948, "grad_norm": 6.317192077636719, "learning_rate": 5.3829542912828825e-08, "loss": 0.392, "num_input_tokens_seen": 185939392, "step": 152905 }, { "epoch": 19.15925322641273, "grad_norm": 5.5647501945495605, "learning_rate": 5.374956541263965e-08, "loss": 0.3856, "num_input_tokens_seen": 185945728, "step": 152910 }, { "epoch": 19.159879714321512, "grad_norm": 12.58107852935791, "learning_rate": 5.3669647049032216e-08, "loss": 0.4287, "num_input_tokens_seen": 185951840, "step": 152915 }, { "epoch": 19.160506202230298, "grad_norm": 7.611630439758301, "learning_rate": 5.358978782296298e-08, "loss": 0.4912, "num_input_tokens_seen": 185957920, "step": 152920 }, { "epoch": 19.16113269013908, "grad_norm": 5.748990535736084, "learning_rate": 5.3509987735387295e-08, "loss": 0.4465, "num_input_tokens_seen": 185963520, "step": 152925 }, { "epoch": 19.161759178047863, "grad_norm": 7.802751064300537, "learning_rate": 5.343024678725772e-08, "loss": 0.4483, "num_input_tokens_seen": 185969664, "step": 152930 }, { "epoch": 19.16238566595665, "grad_norm": 18.810802459716797, "learning_rate": 5.335056497952851e-08, "loss": 0.4211, "num_input_tokens_seen": 185975616, "step": 152935 }, { "epoch": 19.16301215386543, "grad_norm": 14.419990539550781, "learning_rate": 5.327094231315222e-08, "loss": 0.3958, "num_input_tokens_seen": 185981920, "step": 152940 }, { "epoch": 19.163638641774213, "grad_norm": 4.258374214172363, "learning_rate": 5.319137878908087e-08, "loss": 0.4244, "num_input_tokens_seen": 185988032, "step": 152945 }, { "epoch": 19.164265129682995, "grad_norm": 5.314012050628662, "learning_rate": 5.311187440826537e-08, "loss": 0.4844, "num_input_tokens_seen": 185993952, "step": 152950 }, { "epoch": 19.16489161759178, "grad_norm": 12.531922340393066, "learning_rate": 5.3032429171657185e-08, "loss": 0.4174, "num_input_tokens_seen": 186000448, "step": 152955 }, { "epoch": 19.165518105500563, "grad_norm": 6.446976661682129, "learning_rate": 5.295304308020499e-08, "loss": 0.4347, "num_input_tokens_seen": 186006752, "step": 152960 }, { "epoch": 19.166144593409346, "grad_norm": 8.475712776184082, "learning_rate": 5.287371613485859e-08, "loss": 0.4774, "num_input_tokens_seen": 186013152, "step": 152965 }, { "epoch": 19.16677108131813, "grad_norm": 5.373295783996582, "learning_rate": 5.2794448336566106e-08, "loss": 0.4254, "num_input_tokens_seen": 186018976, "step": 152970 }, { "epoch": 19.167397569226914, "grad_norm": 33.9724006652832, "learning_rate": 5.271523968627512e-08, "loss": 0.5115, "num_input_tokens_seen": 186025312, "step": 152975 }, { "epoch": 19.168024057135696, "grad_norm": 12.564001083374023, "learning_rate": 5.263609018493321e-08, "loss": 0.5314, "num_input_tokens_seen": 186031488, "step": 152980 }, { "epoch": 19.168650545044482, "grad_norm": 4.505061149597168, "learning_rate": 5.255699983348628e-08, "loss": 0.4381, "num_input_tokens_seen": 186037920, "step": 152985 }, { "epoch": 19.169277032953264, "grad_norm": 12.468374252319336, "learning_rate": 5.247796863288024e-08, "loss": 0.45, "num_input_tokens_seen": 186043456, "step": 152990 }, { "epoch": 19.169903520862047, "grad_norm": 7.989639759063721, "learning_rate": 5.239899658405934e-08, "loss": 0.4814, "num_input_tokens_seen": 186049536, "step": 152995 }, { "epoch": 19.170530008770832, "grad_norm": 3.5884904861450195, "learning_rate": 5.2320083687967814e-08, "loss": 0.4873, "num_input_tokens_seen": 186055456, "step": 153000 }, { "epoch": 19.171156496679615, "grad_norm": 13.232484817504883, "learning_rate": 5.224122994555048e-08, "loss": 0.4991, "num_input_tokens_seen": 186061792, "step": 153005 }, { "epoch": 19.171782984588397, "grad_norm": 6.500168800354004, "learning_rate": 5.2162435357748234e-08, "loss": 0.5433, "num_input_tokens_seen": 186067904, "step": 153010 }, { "epoch": 19.17240947249718, "grad_norm": 5.702090740203857, "learning_rate": 5.2083699925504215e-08, "loss": 0.4482, "num_input_tokens_seen": 186074144, "step": 153015 }, { "epoch": 19.173035960405965, "grad_norm": 4.48261022567749, "learning_rate": 5.20050236497599e-08, "loss": 0.4155, "num_input_tokens_seen": 186080288, "step": 153020 }, { "epoch": 19.173662448314747, "grad_norm": 15.226104736328125, "learning_rate": 5.192640653145509e-08, "loss": 0.3896, "num_input_tokens_seen": 186086144, "step": 153025 }, { "epoch": 19.17428893622353, "grad_norm": 32.964385986328125, "learning_rate": 5.184784857153069e-08, "loss": 0.4687, "num_input_tokens_seen": 186092096, "step": 153030 }, { "epoch": 19.174915424132315, "grad_norm": 6.12952995300293, "learning_rate": 5.17693497709254e-08, "loss": 0.4234, "num_input_tokens_seen": 186098368, "step": 153035 }, { "epoch": 19.175541912041098, "grad_norm": 5.397068977355957, "learning_rate": 5.169091013057736e-08, "loss": 0.4164, "num_input_tokens_seen": 186104768, "step": 153040 }, { "epoch": 19.17616839994988, "grad_norm": 7.355011463165283, "learning_rate": 5.161252965142527e-08, "loss": 0.4284, "num_input_tokens_seen": 186110784, "step": 153045 }, { "epoch": 19.176794887858666, "grad_norm": 4.115920066833496, "learning_rate": 5.153420833440503e-08, "loss": 0.3992, "num_input_tokens_seen": 186116512, "step": 153050 }, { "epoch": 19.177421375767448, "grad_norm": 8.005919456481934, "learning_rate": 5.1455946180454795e-08, "loss": 0.4224, "num_input_tokens_seen": 186122720, "step": 153055 }, { "epoch": 19.17804786367623, "grad_norm": 7.747102737426758, "learning_rate": 5.137774319050826e-08, "loss": 0.4398, "num_input_tokens_seen": 186128480, "step": 153060 }, { "epoch": 19.178674351585013, "grad_norm": 8.740918159484863, "learning_rate": 5.129959936550189e-08, "loss": 0.4575, "num_input_tokens_seen": 186134592, "step": 153065 }, { "epoch": 19.1793008394938, "grad_norm": 5.200645923614502, "learning_rate": 5.122151470636938e-08, "loss": 0.4531, "num_input_tokens_seen": 186140832, "step": 153070 }, { "epoch": 19.17992732740258, "grad_norm": 4.960744857788086, "learning_rate": 5.114348921404444e-08, "loss": 0.4612, "num_input_tokens_seen": 186147136, "step": 153075 }, { "epoch": 19.180553815311363, "grad_norm": 9.629280090332031, "learning_rate": 5.106552288945965e-08, "loss": 0.4977, "num_input_tokens_seen": 186153120, "step": 153080 }, { "epoch": 19.18118030322015, "grad_norm": 4.8785247802734375, "learning_rate": 5.0987615733547605e-08, "loss": 0.4322, "num_input_tokens_seen": 186159456, "step": 153085 }, { "epoch": 19.18180679112893, "grad_norm": 4.607615947723389, "learning_rate": 5.090976774723922e-08, "loss": 0.4767, "num_input_tokens_seen": 186165280, "step": 153090 }, { "epoch": 19.182433279037713, "grad_norm": 5.481633186340332, "learning_rate": 5.083197893146541e-08, "loss": 0.4668, "num_input_tokens_seen": 186171328, "step": 153095 }, { "epoch": 19.1830597669465, "grad_norm": 10.014702796936035, "learning_rate": 5.075424928715711e-08, "loss": 0.4576, "num_input_tokens_seen": 186177568, "step": 153100 }, { "epoch": 19.18368625485528, "grad_norm": 3.716656446456909, "learning_rate": 5.0676578815241904e-08, "loss": 0.3998, "num_input_tokens_seen": 186183616, "step": 153105 }, { "epoch": 19.184312742764064, "grad_norm": 17.492780685424805, "learning_rate": 5.059896751665016e-08, "loss": 0.4866, "num_input_tokens_seen": 186188960, "step": 153110 }, { "epoch": 19.18493923067285, "grad_norm": 14.451546669006348, "learning_rate": 5.052141539230837e-08, "loss": 0.3885, "num_input_tokens_seen": 186194848, "step": 153115 }, { "epoch": 19.185565718581632, "grad_norm": 4.995043754577637, "learning_rate": 5.044392244314467e-08, "loss": 0.359, "num_input_tokens_seen": 186200704, "step": 153120 }, { "epoch": 19.186192206490414, "grad_norm": 16.42182159423828, "learning_rate": 5.036648867008498e-08, "loss": 0.4725, "num_input_tokens_seen": 186206656, "step": 153125 }, { "epoch": 19.186818694399197, "grad_norm": 3.7009212970733643, "learning_rate": 5.028911407405579e-08, "loss": 0.4297, "num_input_tokens_seen": 186212416, "step": 153130 }, { "epoch": 19.187445182307982, "grad_norm": 6.549929618835449, "learning_rate": 5.021179865598136e-08, "loss": 0.4087, "num_input_tokens_seen": 186218432, "step": 153135 }, { "epoch": 19.188071670216765, "grad_norm": 4.407039165496826, "learning_rate": 5.0134542416786504e-08, "loss": 0.3756, "num_input_tokens_seen": 186224608, "step": 153140 }, { "epoch": 19.188698158125547, "grad_norm": 8.366751670837402, "learning_rate": 5.005734535739493e-08, "loss": 0.466, "num_input_tokens_seen": 186229984, "step": 153145 }, { "epoch": 19.189324646034333, "grad_norm": 4.375504493713379, "learning_rate": 4.998020747872978e-08, "loss": 0.4635, "num_input_tokens_seen": 186236160, "step": 153150 }, { "epoch": 19.189951133943115, "grad_norm": 7.537975788116455, "learning_rate": 4.9903128781712554e-08, "loss": 0.4594, "num_input_tokens_seen": 186242368, "step": 153155 }, { "epoch": 19.190577621851897, "grad_norm": 5.873282432556152, "learning_rate": 4.982610926726472e-08, "loss": 0.4661, "num_input_tokens_seen": 186248480, "step": 153160 }, { "epoch": 19.191204109760683, "grad_norm": 10.143898010253906, "learning_rate": 4.974914893630833e-08, "loss": 0.4024, "num_input_tokens_seen": 186254848, "step": 153165 }, { "epoch": 19.191830597669465, "grad_norm": 24.221046447753906, "learning_rate": 4.967224778976265e-08, "loss": 0.5054, "num_input_tokens_seen": 186260896, "step": 153170 }, { "epoch": 19.192457085578248, "grad_norm": 10.281065940856934, "learning_rate": 4.9595405828547494e-08, "loss": 0.523, "num_input_tokens_seen": 186267008, "step": 153175 }, { "epoch": 19.19308357348703, "grad_norm": 7.711524486541748, "learning_rate": 4.951862305358102e-08, "loss": 0.4409, "num_input_tokens_seen": 186273312, "step": 153180 }, { "epoch": 19.193710061395816, "grad_norm": 4.5800275802612305, "learning_rate": 4.9441899465781927e-08, "loss": 0.4659, "num_input_tokens_seen": 186278720, "step": 153185 }, { "epoch": 19.194336549304598, "grad_norm": 20.47570037841797, "learning_rate": 4.936523506606672e-08, "loss": 0.4702, "num_input_tokens_seen": 186284768, "step": 153190 }, { "epoch": 19.19496303721338, "grad_norm": 12.59604263305664, "learning_rate": 4.9288629855352984e-08, "loss": 0.4291, "num_input_tokens_seen": 186291232, "step": 153195 }, { "epoch": 19.195589525122166, "grad_norm": 9.872176170349121, "learning_rate": 4.9212083834555e-08, "loss": 0.4238, "num_input_tokens_seen": 186297248, "step": 153200 }, { "epoch": 19.19621601303095, "grad_norm": 24.46099281311035, "learning_rate": 4.9135597004589256e-08, "loss": 0.4723, "num_input_tokens_seen": 186303488, "step": 153205 }, { "epoch": 19.19684250093973, "grad_norm": 4.102286338806152, "learning_rate": 4.9059169366370007e-08, "loss": 0.3985, "num_input_tokens_seen": 186309760, "step": 153210 }, { "epoch": 19.197468988848517, "grad_norm": 5.292072772979736, "learning_rate": 4.8982800920811534e-08, "loss": 0.4181, "num_input_tokens_seen": 186315712, "step": 153215 }, { "epoch": 19.1980954767573, "grad_norm": 4.787707805633545, "learning_rate": 4.890649166882533e-08, "loss": 0.4194, "num_input_tokens_seen": 186321536, "step": 153220 }, { "epoch": 19.19872196466608, "grad_norm": 7.354067325592041, "learning_rate": 4.88302416113251e-08, "loss": 0.3962, "num_input_tokens_seen": 186327648, "step": 153225 }, { "epoch": 19.199348452574867, "grad_norm": 5.55666971206665, "learning_rate": 4.875405074922235e-08, "loss": 0.4392, "num_input_tokens_seen": 186333664, "step": 153230 }, { "epoch": 19.19997494048365, "grad_norm": 19.066627502441406, "learning_rate": 4.867791908342745e-08, "loss": 0.4632, "num_input_tokens_seen": 186339424, "step": 153235 }, { "epoch": 19.20060142839243, "grad_norm": 5.775944232940674, "learning_rate": 4.860184661485079e-08, "loss": 0.3486, "num_input_tokens_seen": 186345120, "step": 153240 }, { "epoch": 19.201227916301214, "grad_norm": 4.986824035644531, "learning_rate": 4.85258333444022e-08, "loss": 0.4103, "num_input_tokens_seen": 186351424, "step": 153245 }, { "epoch": 19.20185440421, "grad_norm": 19.267854690551758, "learning_rate": 4.844987927298983e-08, "loss": 0.4874, "num_input_tokens_seen": 186357120, "step": 153250 }, { "epoch": 19.202480892118782, "grad_norm": 10.827932357788086, "learning_rate": 4.837398440152241e-08, "loss": 0.4602, "num_input_tokens_seen": 186363104, "step": 153255 }, { "epoch": 19.203107380027564, "grad_norm": 4.443205833435059, "learning_rate": 4.8298148730907544e-08, "loss": 0.4641, "num_input_tokens_seen": 186369120, "step": 153260 }, { "epoch": 19.20373386793635, "grad_norm": 14.36684799194336, "learning_rate": 4.822237226205062e-08, "loss": 0.5692, "num_input_tokens_seen": 186375360, "step": 153265 }, { "epoch": 19.204360355845132, "grad_norm": 15.358633995056152, "learning_rate": 4.8146654995859246e-08, "loss": 0.5393, "num_input_tokens_seen": 186381344, "step": 153270 }, { "epoch": 19.204986843753915, "grad_norm": 6.221506595611572, "learning_rate": 4.807099693323714e-08, "loss": 0.4879, "num_input_tokens_seen": 186387520, "step": 153275 }, { "epoch": 19.2056133316627, "grad_norm": 4.432089805603027, "learning_rate": 4.799539807509024e-08, "loss": 0.4245, "num_input_tokens_seen": 186393696, "step": 153280 }, { "epoch": 19.206239819571483, "grad_norm": 5.985428810119629, "learning_rate": 4.791985842232116e-08, "loss": 0.3763, "num_input_tokens_seen": 186399840, "step": 153285 }, { "epoch": 19.206866307480265, "grad_norm": 7.781672477722168, "learning_rate": 4.7844377975834186e-08, "loss": 0.4222, "num_input_tokens_seen": 186405952, "step": 153290 }, { "epoch": 19.207492795389047, "grad_norm": 4.995540142059326, "learning_rate": 4.7768956736531367e-08, "loss": 0.4278, "num_input_tokens_seen": 186412064, "step": 153295 }, { "epoch": 19.208119283297833, "grad_norm": 4.883903980255127, "learning_rate": 4.769359470531365e-08, "loss": 0.4984, "num_input_tokens_seen": 186418240, "step": 153300 }, { "epoch": 19.208745771206615, "grad_norm": 12.236394882202148, "learning_rate": 4.761829188308309e-08, "loss": 0.4523, "num_input_tokens_seen": 186424448, "step": 153305 }, { "epoch": 19.209372259115398, "grad_norm": 25.77215003967285, "learning_rate": 4.754304827073952e-08, "loss": 0.4398, "num_input_tokens_seen": 186430784, "step": 153310 }, { "epoch": 19.209998747024184, "grad_norm": 7.50622034072876, "learning_rate": 4.746786386918223e-08, "loss": 0.4253, "num_input_tokens_seen": 186436864, "step": 153315 }, { "epoch": 19.210625234932966, "grad_norm": 7.255483150482178, "learning_rate": 4.739273867931104e-08, "loss": 0.4213, "num_input_tokens_seen": 186443200, "step": 153320 }, { "epoch": 19.211251722841748, "grad_norm": 23.678497314453125, "learning_rate": 4.731767270202359e-08, "loss": 0.4875, "num_input_tokens_seen": 186449216, "step": 153325 }, { "epoch": 19.211878210750534, "grad_norm": 21.927078247070312, "learning_rate": 4.7242665938216915e-08, "loss": 0.4699, "num_input_tokens_seen": 186455488, "step": 153330 }, { "epoch": 19.212504698659316, "grad_norm": 5.203076362609863, "learning_rate": 4.7167718388788644e-08, "loss": 0.3836, "num_input_tokens_seen": 186461696, "step": 153335 }, { "epoch": 19.2131311865681, "grad_norm": 12.982855796813965, "learning_rate": 4.709283005463361e-08, "loss": 0.4389, "num_input_tokens_seen": 186467840, "step": 153340 }, { "epoch": 19.213757674476884, "grad_norm": 25.10832977294922, "learning_rate": 4.701800093664888e-08, "loss": 0.4847, "num_input_tokens_seen": 186474176, "step": 153345 }, { "epoch": 19.214384162385667, "grad_norm": 12.976115226745605, "learning_rate": 4.694323103572762e-08, "loss": 0.4826, "num_input_tokens_seen": 186480288, "step": 153350 }, { "epoch": 19.21501065029445, "grad_norm": 16.77126693725586, "learning_rate": 4.686852035276468e-08, "loss": 0.4939, "num_input_tokens_seen": 186486368, "step": 153355 }, { "epoch": 19.21563713820323, "grad_norm": 4.711679458618164, "learning_rate": 4.679386888865323e-08, "loss": 0.5102, "num_input_tokens_seen": 186492416, "step": 153360 }, { "epoch": 19.216263626112017, "grad_norm": 9.660711288452148, "learning_rate": 4.671927664428477e-08, "loss": 0.4737, "num_input_tokens_seen": 186498208, "step": 153365 }, { "epoch": 19.2168901140208, "grad_norm": 10.643540382385254, "learning_rate": 4.664474362055249e-08, "loss": 0.4336, "num_input_tokens_seen": 186504608, "step": 153370 }, { "epoch": 19.21751660192958, "grad_norm": 9.000649452209473, "learning_rate": 4.657026981834623e-08, "loss": 0.4917, "num_input_tokens_seen": 186510496, "step": 153375 }, { "epoch": 19.218143089838367, "grad_norm": 14.516672134399414, "learning_rate": 4.6495855238557484e-08, "loss": 0.468, "num_input_tokens_seen": 186516800, "step": 153380 }, { "epoch": 19.21876957774715, "grad_norm": 10.511080741882324, "learning_rate": 4.6421499882075004e-08, "loss": 0.4213, "num_input_tokens_seen": 186523264, "step": 153385 }, { "epoch": 19.219396065655932, "grad_norm": 9.990872383117676, "learning_rate": 4.6347203749789184e-08, "loss": 0.4787, "num_input_tokens_seen": 186529408, "step": 153390 }, { "epoch": 19.220022553564718, "grad_norm": 16.142736434936523, "learning_rate": 4.627296684258653e-08, "loss": 0.4532, "num_input_tokens_seen": 186535744, "step": 153395 }, { "epoch": 19.2206490414735, "grad_norm": 4.846827030181885, "learning_rate": 4.6198789161355784e-08, "loss": 0.422, "num_input_tokens_seen": 186541664, "step": 153400 }, { "epoch": 19.221275529382282, "grad_norm": 21.747411727905273, "learning_rate": 4.61246707069829e-08, "loss": 0.4345, "num_input_tokens_seen": 186547136, "step": 153405 }, { "epoch": 19.221902017291065, "grad_norm": 4.982183933258057, "learning_rate": 4.6050611480354944e-08, "loss": 0.4559, "num_input_tokens_seen": 186553312, "step": 153410 }, { "epoch": 19.22252850519985, "grad_norm": 13.449098587036133, "learning_rate": 4.597661148235677e-08, "loss": 0.43, "num_input_tokens_seen": 186559456, "step": 153415 }, { "epoch": 19.223154993108633, "grad_norm": 8.68974781036377, "learning_rate": 4.5902670713873774e-08, "loss": 0.5122, "num_input_tokens_seen": 186565888, "step": 153420 }, { "epoch": 19.223781481017415, "grad_norm": 5.708876132965088, "learning_rate": 4.5828789175789143e-08, "loss": 0.4099, "num_input_tokens_seen": 186571648, "step": 153425 }, { "epoch": 19.2244079689262, "grad_norm": 15.672026634216309, "learning_rate": 4.5754966868986616e-08, "loss": 0.4734, "num_input_tokens_seen": 186576896, "step": 153430 }, { "epoch": 19.225034456834983, "grad_norm": 28.28049659729004, "learning_rate": 4.5681203794348814e-08, "loss": 0.5219, "num_input_tokens_seen": 186582240, "step": 153435 }, { "epoch": 19.225660944743765, "grad_norm": 6.194867134094238, "learning_rate": 4.560749995275726e-08, "loss": 0.4336, "num_input_tokens_seen": 186588288, "step": 153440 }, { "epoch": 19.22628743265255, "grad_norm": 7.376177787780762, "learning_rate": 4.553385534509403e-08, "loss": 0.4883, "num_input_tokens_seen": 186594176, "step": 153445 }, { "epoch": 19.226913920561334, "grad_norm": 3.4562766551971436, "learning_rate": 4.5460269972238957e-08, "loss": 0.4504, "num_input_tokens_seen": 186599968, "step": 153450 }, { "epoch": 19.227540408470116, "grad_norm": 11.54710865020752, "learning_rate": 4.538674383507191e-08, "loss": 0.4211, "num_input_tokens_seen": 186605888, "step": 153455 }, { "epoch": 19.2281668963789, "grad_norm": 6.88014554977417, "learning_rate": 4.531327693447163e-08, "loss": 0.4581, "num_input_tokens_seen": 186611904, "step": 153460 }, { "epoch": 19.228793384287684, "grad_norm": 3.8279101848602295, "learning_rate": 4.523986927131685e-08, "loss": 0.4222, "num_input_tokens_seen": 186618368, "step": 153465 }, { "epoch": 19.229419872196466, "grad_norm": 8.400066375732422, "learning_rate": 4.5166520846485204e-08, "loss": 0.401, "num_input_tokens_seen": 186624000, "step": 153470 }, { "epoch": 19.23004636010525, "grad_norm": 5.41544246673584, "learning_rate": 4.509323166085378e-08, "loss": 0.373, "num_input_tokens_seen": 186630368, "step": 153475 }, { "epoch": 19.230672848014034, "grad_norm": 6.659729957580566, "learning_rate": 4.5020001715298524e-08, "loss": 0.3859, "num_input_tokens_seen": 186635936, "step": 153480 }, { "epoch": 19.231299335922817, "grad_norm": 14.712669372558594, "learning_rate": 4.4946831010695413e-08, "loss": 0.4486, "num_input_tokens_seen": 186642272, "step": 153485 }, { "epoch": 19.2319258238316, "grad_norm": 5.558819770812988, "learning_rate": 4.487371954791875e-08, "loss": 0.4042, "num_input_tokens_seen": 186647584, "step": 153490 }, { "epoch": 19.232552311740385, "grad_norm": 11.911924362182617, "learning_rate": 4.480066732784227e-08, "loss": 0.4044, "num_input_tokens_seen": 186653664, "step": 153495 }, { "epoch": 19.233178799649167, "grad_norm": 30.84892463684082, "learning_rate": 4.472767435134029e-08, "loss": 0.5052, "num_input_tokens_seen": 186659808, "step": 153500 }, { "epoch": 19.23380528755795, "grad_norm": 20.76903533935547, "learning_rate": 4.4654740619284874e-08, "loss": 0.4305, "num_input_tokens_seen": 186665952, "step": 153505 }, { "epoch": 19.234431775466735, "grad_norm": 5.706088542938232, "learning_rate": 4.458186613254867e-08, "loss": 0.4166, "num_input_tokens_seen": 186672320, "step": 153510 }, { "epoch": 19.235058263375517, "grad_norm": 4.346258163452148, "learning_rate": 4.4509050892002636e-08, "loss": 0.3732, "num_input_tokens_seen": 186678272, "step": 153515 }, { "epoch": 19.2356847512843, "grad_norm": 19.094648361206055, "learning_rate": 4.443629489851664e-08, "loss": 0.4698, "num_input_tokens_seen": 186684128, "step": 153520 }, { "epoch": 19.236311239193082, "grad_norm": 10.844672203063965, "learning_rate": 4.4363598152961654e-08, "loss": 0.3835, "num_input_tokens_seen": 186689792, "step": 153525 }, { "epoch": 19.236937727101868, "grad_norm": 35.11355972290039, "learning_rate": 4.4290960656205864e-08, "loss": 0.576, "num_input_tokens_seen": 186696000, "step": 153530 }, { "epoch": 19.23756421501065, "grad_norm": 9.810760498046875, "learning_rate": 4.4218382409118576e-08, "loss": 0.5013, "num_input_tokens_seen": 186702464, "step": 153535 }, { "epoch": 19.238190702919432, "grad_norm": 8.081524848937988, "learning_rate": 4.414586341256688e-08, "loss": 0.4304, "num_input_tokens_seen": 186708736, "step": 153540 }, { "epoch": 19.238817190828218, "grad_norm": 4.6957478523254395, "learning_rate": 4.407340366741841e-08, "loss": 0.5346, "num_input_tokens_seen": 186715040, "step": 153545 }, { "epoch": 19.239443678737, "grad_norm": 18.282068252563477, "learning_rate": 4.400100317453859e-08, "loss": 0.5285, "num_input_tokens_seen": 186721024, "step": 153550 }, { "epoch": 19.240070166645783, "grad_norm": 6.037290096282959, "learning_rate": 4.392866193479395e-08, "loss": 0.4344, "num_input_tokens_seen": 186726848, "step": 153555 }, { "epoch": 19.24069665455457, "grad_norm": 6.135081768035889, "learning_rate": 4.3856379949048786e-08, "loss": 0.424, "num_input_tokens_seen": 186733120, "step": 153560 }, { "epoch": 19.24132314246335, "grad_norm": 5.031234264373779, "learning_rate": 4.378415721816798e-08, "loss": 0.4024, "num_input_tokens_seen": 186739392, "step": 153565 }, { "epoch": 19.241949630372133, "grad_norm": 16.245868682861328, "learning_rate": 4.371199374301416e-08, "loss": 0.4319, "num_input_tokens_seen": 186745344, "step": 153570 }, { "epoch": 19.242576118280915, "grad_norm": 8.724383354187012, "learning_rate": 4.363988952445053e-08, "loss": 0.4492, "num_input_tokens_seen": 186751232, "step": 153575 }, { "epoch": 19.2432026061897, "grad_norm": 6.500828742980957, "learning_rate": 4.3567844563338624e-08, "loss": 0.3916, "num_input_tokens_seen": 186756672, "step": 153580 }, { "epoch": 19.243829094098484, "grad_norm": 13.589787483215332, "learning_rate": 4.349585886054109e-08, "loss": 0.4343, "num_input_tokens_seen": 186763072, "step": 153585 }, { "epoch": 19.244455582007266, "grad_norm": 7.979119300842285, "learning_rate": 4.3423932416917224e-08, "loss": 0.4646, "num_input_tokens_seen": 186768928, "step": 153590 }, { "epoch": 19.24508206991605, "grad_norm": 6.167308807373047, "learning_rate": 4.335206523332747e-08, "loss": 0.4556, "num_input_tokens_seen": 186775008, "step": 153595 }, { "epoch": 19.245708557824834, "grad_norm": 5.620704650878906, "learning_rate": 4.328025731063112e-08, "loss": 0.4411, "num_input_tokens_seen": 186781152, "step": 153600 }, { "epoch": 19.246335045733616, "grad_norm": 5.58180046081543, "learning_rate": 4.320850864968695e-08, "loss": 0.4432, "num_input_tokens_seen": 186786912, "step": 153605 }, { "epoch": 19.246961533642402, "grad_norm": 14.245512008666992, "learning_rate": 4.31368192513526e-08, "loss": 0.4134, "num_input_tokens_seen": 186792640, "step": 153610 }, { "epoch": 19.247588021551184, "grad_norm": 14.770638465881348, "learning_rate": 4.3065189116484606e-08, "loss": 0.4365, "num_input_tokens_seen": 186798752, "step": 153615 }, { "epoch": 19.248214509459967, "grad_norm": 27.39506721496582, "learning_rate": 4.299361824594006e-08, "loss": 0.502, "num_input_tokens_seen": 186804960, "step": 153620 }, { "epoch": 19.248840997368752, "grad_norm": 3.590838670730591, "learning_rate": 4.292210664057439e-08, "loss": 0.4131, "num_input_tokens_seen": 186811136, "step": 153625 }, { "epoch": 19.249467485277535, "grad_norm": 3.850172758102417, "learning_rate": 4.285065430124247e-08, "loss": 0.419, "num_input_tokens_seen": 186816960, "step": 153630 }, { "epoch": 19.250093973186317, "grad_norm": 6.4264936447143555, "learning_rate": 4.2779261228799166e-08, "loss": 0.4392, "num_input_tokens_seen": 186823168, "step": 153635 }, { "epoch": 19.2507204610951, "grad_norm": 5.754574775695801, "learning_rate": 4.270792742409713e-08, "loss": 0.4175, "num_input_tokens_seen": 186829216, "step": 153640 }, { "epoch": 19.251346949003885, "grad_norm": 8.842368125915527, "learning_rate": 4.263665288799013e-08, "loss": 0.4697, "num_input_tokens_seen": 186835552, "step": 153645 }, { "epoch": 19.251973436912667, "grad_norm": 4.937235355377197, "learning_rate": 4.25654376213297e-08, "loss": 0.3939, "num_input_tokens_seen": 186841536, "step": 153650 }, { "epoch": 19.25259992482145, "grad_norm": 7.321616172790527, "learning_rate": 4.249428162496738e-08, "loss": 0.4338, "num_input_tokens_seen": 186848000, "step": 153655 }, { "epoch": 19.253226412730235, "grad_norm": 22.800701141357422, "learning_rate": 4.242318489975361e-08, "loss": 0.4223, "num_input_tokens_seen": 186854240, "step": 153660 }, { "epoch": 19.253852900639018, "grad_norm": 13.972179412841797, "learning_rate": 4.235214744653937e-08, "loss": 0.4248, "num_input_tokens_seen": 186860704, "step": 153665 }, { "epoch": 19.2544793885478, "grad_norm": 5.266969203948975, "learning_rate": 4.2281169266172875e-08, "loss": 0.452, "num_input_tokens_seen": 186866688, "step": 153670 }, { "epoch": 19.255105876456586, "grad_norm": 5.263580322265625, "learning_rate": 4.2210250359503433e-08, "loss": 0.4485, "num_input_tokens_seen": 186872256, "step": 153675 }, { "epoch": 19.255732364365368, "grad_norm": 5.069552421569824, "learning_rate": 4.2139390727378714e-08, "loss": 0.4283, "num_input_tokens_seen": 186878432, "step": 153680 }, { "epoch": 19.25635885227415, "grad_norm": 7.597701072692871, "learning_rate": 4.2068590370646364e-08, "loss": 0.4407, "num_input_tokens_seen": 186884704, "step": 153685 }, { "epoch": 19.256985340182933, "grad_norm": 4.88675594329834, "learning_rate": 4.199784929015183e-08, "loss": 0.4326, "num_input_tokens_seen": 186891104, "step": 153690 }, { "epoch": 19.25761182809172, "grad_norm": 11.324592590332031, "learning_rate": 4.192716748674164e-08, "loss": 0.4445, "num_input_tokens_seen": 186897216, "step": 153695 }, { "epoch": 19.2582383160005, "grad_norm": 20.06987953186035, "learning_rate": 4.185654496126068e-08, "loss": 0.4564, "num_input_tokens_seen": 186903392, "step": 153700 }, { "epoch": 19.258864803909283, "grad_norm": 7.699454307556152, "learning_rate": 4.178598171455328e-08, "loss": 0.4878, "num_input_tokens_seen": 186909408, "step": 153705 }, { "epoch": 19.25949129181807, "grad_norm": 5.470555305480957, "learning_rate": 4.171547774746321e-08, "loss": 0.4378, "num_input_tokens_seen": 186915840, "step": 153710 }, { "epoch": 19.26011777972685, "grad_norm": 24.554536819458008, "learning_rate": 4.164503306083312e-08, "loss": 0.4434, "num_input_tokens_seen": 186921824, "step": 153715 }, { "epoch": 19.260744267635634, "grad_norm": 8.094866752624512, "learning_rate": 4.157464765550567e-08, "loss": 0.4313, "num_input_tokens_seen": 186927616, "step": 153720 }, { "epoch": 19.26137075554442, "grad_norm": 14.181025505065918, "learning_rate": 4.1504321532321865e-08, "loss": 0.3976, "num_input_tokens_seen": 186933696, "step": 153725 }, { "epoch": 19.2619972434532, "grad_norm": 4.602836608886719, "learning_rate": 4.1434054692122694e-08, "loss": 0.4257, "num_input_tokens_seen": 186939936, "step": 153730 }, { "epoch": 19.262623731361984, "grad_norm": 3.3154971599578857, "learning_rate": 4.136384713574859e-08, "loss": 0.4289, "num_input_tokens_seen": 186945952, "step": 153735 }, { "epoch": 19.26325021927077, "grad_norm": 12.54023265838623, "learning_rate": 4.129369886403889e-08, "loss": 0.4935, "num_input_tokens_seen": 186952320, "step": 153740 }, { "epoch": 19.263876707179552, "grad_norm": 8.18780517578125, "learning_rate": 4.122360987783125e-08, "loss": 0.3891, "num_input_tokens_seen": 186958400, "step": 153745 }, { "epoch": 19.264503195088334, "grad_norm": 9.338966369628906, "learning_rate": 4.115358017796556e-08, "loss": 0.4967, "num_input_tokens_seen": 186964032, "step": 153750 }, { "epoch": 19.265129682997117, "grad_norm": 8.7564697265625, "learning_rate": 4.108360976527726e-08, "loss": 0.4211, "num_input_tokens_seen": 186970048, "step": 153755 }, { "epoch": 19.265756170905902, "grad_norm": 4.46083927154541, "learning_rate": 4.1013698640604006e-08, "loss": 0.3934, "num_input_tokens_seen": 186976384, "step": 153760 }, { "epoch": 19.266382658814685, "grad_norm": 25.496559143066406, "learning_rate": 4.09438468047807e-08, "loss": 0.4373, "num_input_tokens_seen": 186982592, "step": 153765 }, { "epoch": 19.267009146723467, "grad_norm": 12.573200225830078, "learning_rate": 4.087405425864388e-08, "loss": 0.412, "num_input_tokens_seen": 186988576, "step": 153770 }, { "epoch": 19.267635634632253, "grad_norm": 25.05252456665039, "learning_rate": 4.080432100302622e-08, "loss": 0.5131, "num_input_tokens_seen": 186994752, "step": 153775 }, { "epoch": 19.268262122541035, "grad_norm": 13.14957046508789, "learning_rate": 4.073464703876262e-08, "loss": 0.49, "num_input_tokens_seen": 187000896, "step": 153780 }, { "epoch": 19.268888610449817, "grad_norm": 19.888708114624023, "learning_rate": 4.0665032366686284e-08, "loss": 0.4898, "num_input_tokens_seen": 187006944, "step": 153785 }, { "epoch": 19.269515098358603, "grad_norm": 4.116987705230713, "learning_rate": 4.0595476987628776e-08, "loss": 0.4344, "num_input_tokens_seen": 187013088, "step": 153790 }, { "epoch": 19.270141586267385, "grad_norm": 5.362717151641846, "learning_rate": 4.0525980902422214e-08, "loss": 0.3982, "num_input_tokens_seen": 187019296, "step": 153795 }, { "epoch": 19.270768074176168, "grad_norm": 6.722880840301514, "learning_rate": 4.045654411189703e-08, "loss": 0.4408, "num_input_tokens_seen": 187025216, "step": 153800 }, { "epoch": 19.27139456208495, "grad_norm": 9.128238677978516, "learning_rate": 4.038716661688369e-08, "loss": 0.4082, "num_input_tokens_seen": 187031168, "step": 153805 }, { "epoch": 19.272021049993736, "grad_norm": 10.559208869934082, "learning_rate": 4.0317848418211515e-08, "loss": 0.4011, "num_input_tokens_seen": 187037152, "step": 153810 }, { "epoch": 19.272647537902518, "grad_norm": 9.832343101501465, "learning_rate": 4.024858951670929e-08, "loss": 0.4808, "num_input_tokens_seen": 187042912, "step": 153815 }, { "epoch": 19.2732740258113, "grad_norm": 12.326163291931152, "learning_rate": 4.0179389913205246e-08, "loss": 0.4754, "num_input_tokens_seen": 187048992, "step": 153820 }, { "epoch": 19.273900513720086, "grad_norm": 8.171427726745605, "learning_rate": 4.01102496085265e-08, "loss": 0.4327, "num_input_tokens_seen": 187054944, "step": 153825 }, { "epoch": 19.27452700162887, "grad_norm": 31.17525291442871, "learning_rate": 4.00411686034996e-08, "loss": 0.4728, "num_input_tokens_seen": 187060480, "step": 153830 }, { "epoch": 19.27515348953765, "grad_norm": 27.72332191467285, "learning_rate": 3.9972146898951126e-08, "loss": 0.6073, "num_input_tokens_seen": 187066624, "step": 153835 }, { "epoch": 19.275779977446437, "grad_norm": 25.320301055908203, "learning_rate": 3.99031844957054e-08, "loss": 0.495, "num_input_tokens_seen": 187072672, "step": 153840 }, { "epoch": 19.27640646535522, "grad_norm": 4.4700093269348145, "learning_rate": 3.983428139458734e-08, "loss": 0.4875, "num_input_tokens_seen": 187078720, "step": 153845 }, { "epoch": 19.277032953264, "grad_norm": 16.963306427001953, "learning_rate": 3.976543759642071e-08, "loss": 0.4422, "num_input_tokens_seen": 187084576, "step": 153850 }, { "epoch": 19.277659441172787, "grad_norm": 6.088016986846924, "learning_rate": 3.969665310202875e-08, "loss": 0.4084, "num_input_tokens_seen": 187090688, "step": 153855 }, { "epoch": 19.27828592908157, "grad_norm": 20.663026809692383, "learning_rate": 3.9627927912233575e-08, "loss": 0.4589, "num_input_tokens_seen": 187096928, "step": 153860 }, { "epoch": 19.27891241699035, "grad_norm": 17.875642776489258, "learning_rate": 3.955926202785676e-08, "loss": 0.4763, "num_input_tokens_seen": 187102784, "step": 153865 }, { "epoch": 19.279538904899134, "grad_norm": 30.212106704711914, "learning_rate": 3.9490655449720416e-08, "loss": 0.5209, "num_input_tokens_seen": 187108704, "step": 153870 }, { "epoch": 19.28016539280792, "grad_norm": 14.069499015808105, "learning_rate": 3.9422108178642784e-08, "loss": 0.44, "num_input_tokens_seen": 187115136, "step": 153875 }, { "epoch": 19.280791880716702, "grad_norm": 5.363625526428223, "learning_rate": 3.935362021544486e-08, "loss": 0.4718, "num_input_tokens_seen": 187121280, "step": 153880 }, { "epoch": 19.281418368625484, "grad_norm": 27.157732009887695, "learning_rate": 3.9285191560945454e-08, "loss": 0.4286, "num_input_tokens_seen": 187127648, "step": 153885 }, { "epoch": 19.28204485653427, "grad_norm": 10.380542755126953, "learning_rate": 3.921682221596168e-08, "loss": 0.4989, "num_input_tokens_seen": 187133632, "step": 153890 }, { "epoch": 19.282671344443052, "grad_norm": 6.749963760375977, "learning_rate": 3.914851218131177e-08, "loss": 0.4231, "num_input_tokens_seen": 187139936, "step": 153895 }, { "epoch": 19.283297832351835, "grad_norm": 7.324812889099121, "learning_rate": 3.9080261457812855e-08, "loss": 0.4462, "num_input_tokens_seen": 187146144, "step": 153900 }, { "epoch": 19.28392432026062, "grad_norm": 19.786537170410156, "learning_rate": 3.9012070046279824e-08, "loss": 0.5259, "num_input_tokens_seen": 187152416, "step": 153905 }, { "epoch": 19.284550808169403, "grad_norm": 12.206748008728027, "learning_rate": 3.8943937947528156e-08, "loss": 0.4249, "num_input_tokens_seen": 187157888, "step": 153910 }, { "epoch": 19.285177296078185, "grad_norm": 23.08021354675293, "learning_rate": 3.8875865162373295e-08, "loss": 0.556, "num_input_tokens_seen": 187164128, "step": 153915 }, { "epoch": 19.285803783986967, "grad_norm": 32.25218200683594, "learning_rate": 3.8807851691627926e-08, "loss": 0.4655, "num_input_tokens_seen": 187170368, "step": 153920 }, { "epoch": 19.286430271895753, "grad_norm": 6.652832984924316, "learning_rate": 3.873989753610641e-08, "loss": 0.4347, "num_input_tokens_seen": 187176352, "step": 153925 }, { "epoch": 19.287056759804535, "grad_norm": 7.088463306427002, "learning_rate": 3.86720026966203e-08, "loss": 0.4443, "num_input_tokens_seen": 187181824, "step": 153930 }, { "epoch": 19.287683247713318, "grad_norm": 7.992974758148193, "learning_rate": 3.860416717398174e-08, "loss": 0.4379, "num_input_tokens_seen": 187187936, "step": 153935 }, { "epoch": 19.288309735622104, "grad_norm": 10.690946578979492, "learning_rate": 3.853639096900175e-08, "loss": 0.4105, "num_input_tokens_seen": 187194400, "step": 153940 }, { "epoch": 19.288936223530886, "grad_norm": 5.0743937492370605, "learning_rate": 3.846867408249077e-08, "loss": 0.4542, "num_input_tokens_seen": 187200480, "step": 153945 }, { "epoch": 19.289562711439668, "grad_norm": 4.139834880828857, "learning_rate": 3.8401016515257626e-08, "loss": 0.4342, "num_input_tokens_seen": 187205888, "step": 153950 }, { "epoch": 19.290189199348454, "grad_norm": 6.673772811889648, "learning_rate": 3.833341826811221e-08, "loss": 0.395, "num_input_tokens_seen": 187211712, "step": 153955 }, { "epoch": 19.290815687257236, "grad_norm": 7.770442008972168, "learning_rate": 3.826587934186221e-08, "loss": 0.4367, "num_input_tokens_seen": 187217856, "step": 153960 }, { "epoch": 19.29144217516602, "grad_norm": 8.670650482177734, "learning_rate": 3.8198399737315314e-08, "loss": 0.4548, "num_input_tokens_seen": 187223968, "step": 153965 }, { "epoch": 19.292068663074804, "grad_norm": 14.847134590148926, "learning_rate": 3.813097945527866e-08, "loss": 0.449, "num_input_tokens_seen": 187230080, "step": 153970 }, { "epoch": 19.292695150983587, "grad_norm": 14.934288024902344, "learning_rate": 3.806361849655715e-08, "loss": 0.3977, "num_input_tokens_seen": 187236160, "step": 153975 }, { "epoch": 19.29332163889237, "grad_norm": 15.365848541259766, "learning_rate": 3.799631686195737e-08, "loss": 0.405, "num_input_tokens_seen": 187242464, "step": 153980 }, { "epoch": 19.29394812680115, "grad_norm": 23.361209869384766, "learning_rate": 3.792907455228256e-08, "loss": 0.464, "num_input_tokens_seen": 187248416, "step": 153985 }, { "epoch": 19.294574614709937, "grad_norm": 8.042837142944336, "learning_rate": 3.7861891568338746e-08, "loss": 0.4125, "num_input_tokens_seen": 187254752, "step": 153990 }, { "epoch": 19.29520110261872, "grad_norm": 4.083435535430908, "learning_rate": 3.7794767910926956e-08, "loss": 0.4598, "num_input_tokens_seen": 187260192, "step": 153995 }, { "epoch": 19.2958275905275, "grad_norm": 9.712508201599121, "learning_rate": 3.772770358085154e-08, "loss": 0.4443, "num_input_tokens_seen": 187266432, "step": 154000 }, { "epoch": 19.296454078436287, "grad_norm": 8.337708473205566, "learning_rate": 3.7660698578912415e-08, "loss": 0.465, "num_input_tokens_seen": 187272800, "step": 154005 }, { "epoch": 19.29708056634507, "grad_norm": 5.374607563018799, "learning_rate": 3.759375290591283e-08, "loss": 0.4283, "num_input_tokens_seen": 187278816, "step": 154010 }, { "epoch": 19.297707054253852, "grad_norm": 4.742234230041504, "learning_rate": 3.752686656265159e-08, "loss": 0.4028, "num_input_tokens_seen": 187284608, "step": 154015 }, { "epoch": 19.298333542162638, "grad_norm": 23.052978515625, "learning_rate": 3.746003954992916e-08, "loss": 0.495, "num_input_tokens_seen": 187290624, "step": 154020 }, { "epoch": 19.29896003007142, "grad_norm": 16.270469665527344, "learning_rate": 3.73932718685438e-08, "loss": 0.4827, "num_input_tokens_seen": 187296672, "step": 154025 }, { "epoch": 19.299586517980202, "grad_norm": 8.53902816772461, "learning_rate": 3.732656351929431e-08, "loss": 0.4384, "num_input_tokens_seen": 187302592, "step": 154030 }, { "epoch": 19.300213005888985, "grad_norm": 5.516881465911865, "learning_rate": 3.7259914502977835e-08, "loss": 0.3909, "num_input_tokens_seen": 187308736, "step": 154035 }, { "epoch": 19.30083949379777, "grad_norm": 4.442803382873535, "learning_rate": 3.7193324820392065e-08, "loss": 0.4239, "num_input_tokens_seen": 187314880, "step": 154040 }, { "epoch": 19.301465981706553, "grad_norm": 9.47081184387207, "learning_rate": 3.712679447233247e-08, "loss": 0.4553, "num_input_tokens_seen": 187320992, "step": 154045 }, { "epoch": 19.302092469615335, "grad_norm": 13.454805374145508, "learning_rate": 3.706032345959454e-08, "loss": 0.4944, "num_input_tokens_seen": 187326784, "step": 154050 }, { "epoch": 19.30271895752412, "grad_norm": 3.676198720932007, "learning_rate": 3.699391178297318e-08, "loss": 0.437, "num_input_tokens_seen": 187332832, "step": 154055 }, { "epoch": 19.303345445432903, "grad_norm": 15.290791511535645, "learning_rate": 3.692755944326165e-08, "loss": 0.4419, "num_input_tokens_seen": 187339296, "step": 154060 }, { "epoch": 19.303971933341685, "grad_norm": 28.50010108947754, "learning_rate": 3.686126644125487e-08, "loss": 0.5203, "num_input_tokens_seen": 187345696, "step": 154065 }, { "epoch": 19.30459842125047, "grad_norm": 4.741760730743408, "learning_rate": 3.6795032777743875e-08, "loss": 0.429, "num_input_tokens_seen": 187351648, "step": 154070 }, { "epoch": 19.305224909159254, "grad_norm": 21.318723678588867, "learning_rate": 3.6728858453521364e-08, "loss": 0.5188, "num_input_tokens_seen": 187357344, "step": 154075 }, { "epoch": 19.305851397068036, "grad_norm": 17.570648193359375, "learning_rate": 3.666274346937782e-08, "loss": 0.5501, "num_input_tokens_seen": 187363616, "step": 154080 }, { "epoch": 19.30647788497682, "grad_norm": 21.633323669433594, "learning_rate": 3.659668782610426e-08, "loss": 0.5176, "num_input_tokens_seen": 187370048, "step": 154085 }, { "epoch": 19.307104372885604, "grad_norm": 4.052549839019775, "learning_rate": 3.6530691524490625e-08, "loss": 0.3565, "num_input_tokens_seen": 187376000, "step": 154090 }, { "epoch": 19.307730860794386, "grad_norm": 10.524222373962402, "learning_rate": 3.6464754565325165e-08, "loss": 0.5345, "num_input_tokens_seen": 187381792, "step": 154095 }, { "epoch": 19.30835734870317, "grad_norm": 17.62491798400879, "learning_rate": 3.63988769493967e-08, "loss": 0.4618, "num_input_tokens_seen": 187388192, "step": 154100 }, { "epoch": 19.308983836611954, "grad_norm": 11.408204078674316, "learning_rate": 3.633305867749293e-08, "loss": 0.4971, "num_input_tokens_seen": 187394176, "step": 154105 }, { "epoch": 19.309610324520737, "grad_norm": 6.024444580078125, "learning_rate": 3.6267299750400996e-08, "loss": 0.4599, "num_input_tokens_seen": 187400384, "step": 154110 }, { "epoch": 19.31023681242952, "grad_norm": 6.029700756072998, "learning_rate": 3.620160016890639e-08, "loss": 0.511, "num_input_tokens_seen": 187406560, "step": 154115 }, { "epoch": 19.310863300338305, "grad_norm": 18.026418685913086, "learning_rate": 3.613595993379571e-08, "loss": 0.4916, "num_input_tokens_seen": 187412640, "step": 154120 }, { "epoch": 19.311489788247087, "grad_norm": 4.393353462219238, "learning_rate": 3.60703790458522e-08, "loss": 0.419, "num_input_tokens_seen": 187418624, "step": 154125 }, { "epoch": 19.31211627615587, "grad_norm": 6.406695365905762, "learning_rate": 3.60048575058608e-08, "loss": 0.4381, "num_input_tokens_seen": 187424288, "step": 154130 }, { "epoch": 19.312742764064655, "grad_norm": 9.888002395629883, "learning_rate": 3.5939395314605327e-08, "loss": 0.438, "num_input_tokens_seen": 187430272, "step": 154135 }, { "epoch": 19.313369251973437, "grad_norm": 8.182662010192871, "learning_rate": 3.587399247286738e-08, "loss": 0.4634, "num_input_tokens_seen": 187436384, "step": 154140 }, { "epoch": 19.31399573988222, "grad_norm": 6.231614112854004, "learning_rate": 3.580864898142966e-08, "loss": 0.492, "num_input_tokens_seen": 187442368, "step": 154145 }, { "epoch": 19.314622227791002, "grad_norm": 6.499593734741211, "learning_rate": 3.5743364841072656e-08, "loss": 0.4865, "num_input_tokens_seen": 187448672, "step": 154150 }, { "epoch": 19.315248715699788, "grad_norm": 11.079846382141113, "learning_rate": 3.5678140052578527e-08, "loss": 0.4258, "num_input_tokens_seen": 187454848, "step": 154155 }, { "epoch": 19.31587520360857, "grad_norm": 5.855464935302734, "learning_rate": 3.5612974616725524e-08, "loss": 0.4414, "num_input_tokens_seen": 187460704, "step": 154160 }, { "epoch": 19.316501691517352, "grad_norm": 7.139854907989502, "learning_rate": 3.5547868534293037e-08, "loss": 0.4397, "num_input_tokens_seen": 187466304, "step": 154165 }, { "epoch": 19.31712817942614, "grad_norm": 5.019951820373535, "learning_rate": 3.5482821806059886e-08, "loss": 0.4225, "num_input_tokens_seen": 187472096, "step": 154170 }, { "epoch": 19.31775466733492, "grad_norm": 10.297148704528809, "learning_rate": 3.541783443280378e-08, "loss": 0.4813, "num_input_tokens_seen": 187478080, "step": 154175 }, { "epoch": 19.318381155243703, "grad_norm": 3.316497564315796, "learning_rate": 3.535290641530076e-08, "loss": 0.4306, "num_input_tokens_seen": 187484512, "step": 154180 }, { "epoch": 19.31900764315249, "grad_norm": 13.32080078125, "learning_rate": 3.528803775432854e-08, "loss": 0.4312, "num_input_tokens_seen": 187490720, "step": 154185 }, { "epoch": 19.31963413106127, "grad_norm": 7.653013229370117, "learning_rate": 3.522322845066151e-08, "loss": 0.3698, "num_input_tokens_seen": 187496864, "step": 154190 }, { "epoch": 19.320260618970053, "grad_norm": 7.451376914978027, "learning_rate": 3.515847850507514e-08, "loss": 0.4443, "num_input_tokens_seen": 187502656, "step": 154195 }, { "epoch": 19.320887106878835, "grad_norm": 5.146883010864258, "learning_rate": 3.509378791834328e-08, "loss": 0.4357, "num_input_tokens_seen": 187508800, "step": 154200 }, { "epoch": 19.32151359478762, "grad_norm": 13.549196243286133, "learning_rate": 3.502915669123919e-08, "loss": 0.4264, "num_input_tokens_seen": 187514752, "step": 154205 }, { "epoch": 19.322140082696404, "grad_norm": 13.65036678314209, "learning_rate": 3.496458482453613e-08, "loss": 0.4788, "num_input_tokens_seen": 187521152, "step": 154210 }, { "epoch": 19.322766570605186, "grad_norm": 4.8521409034729, "learning_rate": 3.4900072319005715e-08, "loss": 0.3921, "num_input_tokens_seen": 187526848, "step": 154215 }, { "epoch": 19.32339305851397, "grad_norm": 9.18237590789795, "learning_rate": 3.483561917541955e-08, "loss": 0.4695, "num_input_tokens_seen": 187533120, "step": 154220 }, { "epoch": 19.324019546422754, "grad_norm": 4.884147644042969, "learning_rate": 3.477122539454758e-08, "loss": 0.475, "num_input_tokens_seen": 187539424, "step": 154225 }, { "epoch": 19.324646034331536, "grad_norm": 4.144320964813232, "learning_rate": 3.4706890977160845e-08, "loss": 0.4325, "num_input_tokens_seen": 187545312, "step": 154230 }, { "epoch": 19.325272522240322, "grad_norm": 3.922243595123291, "learning_rate": 3.4642615924027065e-08, "loss": 0.4212, "num_input_tokens_seen": 187551680, "step": 154235 }, { "epoch": 19.325899010149104, "grad_norm": 17.10074806213379, "learning_rate": 3.457840023591619e-08, "loss": 0.4065, "num_input_tokens_seen": 187557728, "step": 154240 }, { "epoch": 19.326525498057887, "grad_norm": 7.565595626831055, "learning_rate": 3.451424391359426e-08, "loss": 0.492, "num_input_tokens_seen": 187563232, "step": 154245 }, { "epoch": 19.327151985966672, "grad_norm": 25.284868240356445, "learning_rate": 3.445014695783011e-08, "loss": 0.4137, "num_input_tokens_seen": 187569280, "step": 154250 }, { "epoch": 19.327778473875455, "grad_norm": 5.534329414367676, "learning_rate": 3.438610936938924e-08, "loss": 0.4352, "num_input_tokens_seen": 187575616, "step": 154255 }, { "epoch": 19.328404961784237, "grad_norm": 4.110820293426514, "learning_rate": 3.4322131149037154e-08, "loss": 0.4467, "num_input_tokens_seen": 187581632, "step": 154260 }, { "epoch": 19.32903144969302, "grad_norm": 11.742380142211914, "learning_rate": 3.425821229753878e-08, "loss": 0.4338, "num_input_tokens_seen": 187588032, "step": 154265 }, { "epoch": 19.329657937601805, "grad_norm": 4.624228477478027, "learning_rate": 3.4194352815658524e-08, "loss": 0.4668, "num_input_tokens_seen": 187593248, "step": 154270 }, { "epoch": 19.330284425510587, "grad_norm": 5.485140323638916, "learning_rate": 3.413055270415966e-08, "loss": 0.3991, "num_input_tokens_seen": 187599584, "step": 154275 }, { "epoch": 19.33091091341937, "grad_norm": 7.421794414520264, "learning_rate": 3.406681196380546e-08, "loss": 0.4319, "num_input_tokens_seen": 187605824, "step": 154280 }, { "epoch": 19.331537401328156, "grad_norm": 26.001718521118164, "learning_rate": 3.4003130595357534e-08, "loss": 0.4663, "num_input_tokens_seen": 187612224, "step": 154285 }, { "epoch": 19.332163889236938, "grad_norm": 24.495580673217773, "learning_rate": 3.393950859957695e-08, "loss": 0.5024, "num_input_tokens_seen": 187618336, "step": 154290 }, { "epoch": 19.33279037714572, "grad_norm": 7.107899188995361, "learning_rate": 3.387594597722532e-08, "loss": 0.4721, "num_input_tokens_seen": 187624640, "step": 154295 }, { "epoch": 19.333416865054506, "grad_norm": 3.842775821685791, "learning_rate": 3.381244272906204e-08, "loss": 0.4796, "num_input_tokens_seen": 187631008, "step": 154300 }, { "epoch": 19.33404335296329, "grad_norm": 5.171138286590576, "learning_rate": 3.374899885584593e-08, "loss": 0.4611, "num_input_tokens_seen": 187636480, "step": 154305 }, { "epoch": 19.33466984087207, "grad_norm": 3.5296010971069336, "learning_rate": 3.368561435833639e-08, "loss": 0.4148, "num_input_tokens_seen": 187642560, "step": 154310 }, { "epoch": 19.335296328780853, "grad_norm": 3.779210329055786, "learning_rate": 3.362228923729061e-08, "loss": 0.5029, "num_input_tokens_seen": 187648544, "step": 154315 }, { "epoch": 19.33592281668964, "grad_norm": 10.812527656555176, "learning_rate": 3.355902349346518e-08, "loss": 0.4451, "num_input_tokens_seen": 187654656, "step": 154320 }, { "epoch": 19.33654930459842, "grad_norm": 18.857200622558594, "learning_rate": 3.34958171276184e-08, "loss": 0.574, "num_input_tokens_seen": 187660800, "step": 154325 }, { "epoch": 19.337175792507203, "grad_norm": 4.937990188598633, "learning_rate": 3.3432670140503555e-08, "loss": 0.4066, "num_input_tokens_seen": 187666528, "step": 154330 }, { "epoch": 19.33780228041599, "grad_norm": 6.632377624511719, "learning_rate": 3.3369582532877254e-08, "loss": 0.388, "num_input_tokens_seen": 187672800, "step": 154335 }, { "epoch": 19.33842876832477, "grad_norm": 6.355655193328857, "learning_rate": 3.33065543054939e-08, "loss": 0.4508, "num_input_tokens_seen": 187679200, "step": 154340 }, { "epoch": 19.339055256233554, "grad_norm": 18.159469604492188, "learning_rate": 3.324358545910567e-08, "loss": 0.4695, "num_input_tokens_seen": 187685408, "step": 154345 }, { "epoch": 19.33968174414234, "grad_norm": 7.725470066070557, "learning_rate": 3.3180675994466394e-08, "loss": 0.5627, "num_input_tokens_seen": 187691744, "step": 154350 }, { "epoch": 19.34030823205112, "grad_norm": 5.983949661254883, "learning_rate": 3.3117825912327706e-08, "loss": 0.5697, "num_input_tokens_seen": 187697472, "step": 154355 }, { "epoch": 19.340934719959904, "grad_norm": 5.714109897613525, "learning_rate": 3.305503521344233e-08, "loss": 0.4469, "num_input_tokens_seen": 187703488, "step": 154360 }, { "epoch": 19.34156120786869, "grad_norm": 4.545801639556885, "learning_rate": 3.299230389855912e-08, "loss": 0.3991, "num_input_tokens_seen": 187709824, "step": 154365 }, { "epoch": 19.342187695777472, "grad_norm": 21.768917083740234, "learning_rate": 3.292963196842913e-08, "loss": 0.433, "num_input_tokens_seen": 187716000, "step": 154370 }, { "epoch": 19.342814183686254, "grad_norm": 6.264053821563721, "learning_rate": 3.2867019423801773e-08, "loss": 0.4527, "num_input_tokens_seen": 187721376, "step": 154375 }, { "epoch": 19.343440671595037, "grad_norm": 7.703120708465576, "learning_rate": 3.280446626542533e-08, "loss": 0.4563, "num_input_tokens_seen": 187727616, "step": 154380 }, { "epoch": 19.344067159503822, "grad_norm": 7.763079643249512, "learning_rate": 3.274197249404698e-08, "loss": 0.4083, "num_input_tokens_seen": 187733952, "step": 154385 }, { "epoch": 19.344693647412605, "grad_norm": 6.65741491317749, "learning_rate": 3.267953811041558e-08, "loss": 0.4871, "num_input_tokens_seen": 187740288, "step": 154390 }, { "epoch": 19.345320135321387, "grad_norm": 4.301203727722168, "learning_rate": 3.2617163115275515e-08, "loss": 0.4406, "num_input_tokens_seen": 187746304, "step": 154395 }, { "epoch": 19.345946623230173, "grad_norm": 5.438210487365723, "learning_rate": 3.255484750937454e-08, "loss": 0.4017, "num_input_tokens_seen": 187752544, "step": 154400 }, { "epoch": 19.346573111138955, "grad_norm": 12.970919609069824, "learning_rate": 3.24925912934565e-08, "loss": 0.413, "num_input_tokens_seen": 187758528, "step": 154405 }, { "epoch": 19.347199599047737, "grad_norm": 6.129416465759277, "learning_rate": 3.243039446826579e-08, "loss": 0.4713, "num_input_tokens_seen": 187764800, "step": 154410 }, { "epoch": 19.347826086956523, "grad_norm": 18.82930564880371, "learning_rate": 3.236825703454627e-08, "loss": 0.4731, "num_input_tokens_seen": 187770496, "step": 154415 }, { "epoch": 19.348452574865306, "grad_norm": 6.19986629486084, "learning_rate": 3.230617899304067e-08, "loss": 0.4789, "num_input_tokens_seen": 187776576, "step": 154420 }, { "epoch": 19.349079062774088, "grad_norm": 21.759492874145508, "learning_rate": 3.224416034449174e-08, "loss": 0.3879, "num_input_tokens_seen": 187782656, "step": 154425 }, { "epoch": 19.34970555068287, "grad_norm": 4.422676086425781, "learning_rate": 3.2182201089639986e-08, "loss": 0.487, "num_input_tokens_seen": 187788864, "step": 154430 }, { "epoch": 19.350332038591656, "grad_norm": 9.074798583984375, "learning_rate": 3.2120301229227046e-08, "loss": 0.4661, "num_input_tokens_seen": 187795136, "step": 154435 }, { "epoch": 19.35095852650044, "grad_norm": 4.803723335266113, "learning_rate": 3.2058460763992884e-08, "loss": 0.4529, "num_input_tokens_seen": 187801280, "step": 154440 }, { "epoch": 19.35158501440922, "grad_norm": 12.944561958312988, "learning_rate": 3.199667969467579e-08, "loss": 0.4142, "num_input_tokens_seen": 187807584, "step": 154445 }, { "epoch": 19.352211502318006, "grad_norm": 13.873493194580078, "learning_rate": 3.193495802201629e-08, "loss": 0.497, "num_input_tokens_seen": 187813312, "step": 154450 }, { "epoch": 19.35283799022679, "grad_norm": 7.122284889221191, "learning_rate": 3.187329574675047e-08, "loss": 0.3781, "num_input_tokens_seen": 187819584, "step": 154455 }, { "epoch": 19.35346447813557, "grad_norm": 27.392620086669922, "learning_rate": 3.181169286961661e-08, "loss": 0.5091, "num_input_tokens_seen": 187826080, "step": 154460 }, { "epoch": 19.354090966044357, "grad_norm": 5.404457092285156, "learning_rate": 3.175014939135135e-08, "loss": 0.4463, "num_input_tokens_seen": 187832000, "step": 154465 }, { "epoch": 19.35471745395314, "grad_norm": 5.978999137878418, "learning_rate": 3.1688665312690215e-08, "loss": 0.4588, "num_input_tokens_seen": 187838112, "step": 154470 }, { "epoch": 19.35534394186192, "grad_norm": 5.120262622833252, "learning_rate": 3.162724063436762e-08, "loss": 0.4292, "num_input_tokens_seen": 187844352, "step": 154475 }, { "epoch": 19.355970429770707, "grad_norm": 3.2946455478668213, "learning_rate": 3.156587535711908e-08, "loss": 0.4258, "num_input_tokens_seen": 187850336, "step": 154480 }, { "epoch": 19.35659691767949, "grad_norm": 5.9907708168029785, "learning_rate": 3.150456948167735e-08, "loss": 0.4638, "num_input_tokens_seen": 187856352, "step": 154485 }, { "epoch": 19.35722340558827, "grad_norm": 19.41510009765625, "learning_rate": 3.144332300877573e-08, "loss": 0.4377, "num_input_tokens_seen": 187862560, "step": 154490 }, { "epoch": 19.357849893497054, "grad_norm": 3.1409554481506348, "learning_rate": 3.138213593914696e-08, "loss": 0.44, "num_input_tokens_seen": 187868096, "step": 154495 }, { "epoch": 19.35847638140584, "grad_norm": 7.450859546661377, "learning_rate": 3.132100827352158e-08, "loss": 0.4595, "num_input_tokens_seen": 187874144, "step": 154500 }, { "epoch": 19.359102869314622, "grad_norm": 10.326208114624023, "learning_rate": 3.125994001263122e-08, "loss": 0.4215, "num_input_tokens_seen": 187880512, "step": 154505 }, { "epoch": 19.359729357223404, "grad_norm": 21.90757942199707, "learning_rate": 3.119893115720585e-08, "loss": 0.4756, "num_input_tokens_seen": 187886848, "step": 154510 }, { "epoch": 19.36035584513219, "grad_norm": 2.713167428970337, "learning_rate": 3.113798170797489e-08, "loss": 0.4801, "num_input_tokens_seen": 187892672, "step": 154515 }, { "epoch": 19.360982333040972, "grad_norm": 11.083270072937012, "learning_rate": 3.1077091665666637e-08, "loss": 0.4386, "num_input_tokens_seen": 187898880, "step": 154520 }, { "epoch": 19.361608820949755, "grad_norm": 4.374444007873535, "learning_rate": 3.101626103100941e-08, "loss": 0.4665, "num_input_tokens_seen": 187904992, "step": 154525 }, { "epoch": 19.36223530885854, "grad_norm": 16.66891098022461, "learning_rate": 3.095548980473095e-08, "loss": 0.5475, "num_input_tokens_seen": 187911520, "step": 154530 }, { "epoch": 19.362861796767323, "grad_norm": 11.017374038696289, "learning_rate": 3.08947779875568e-08, "loss": 0.4269, "num_input_tokens_seen": 187917472, "step": 154535 }, { "epoch": 19.363488284676105, "grad_norm": 5.073217868804932, "learning_rate": 3.083412558021359e-08, "loss": 0.4385, "num_input_tokens_seen": 187923520, "step": 154540 }, { "epoch": 19.364114772584887, "grad_norm": 10.020049095153809, "learning_rate": 3.07735325834263e-08, "loss": 0.4512, "num_input_tokens_seen": 187928832, "step": 154545 }, { "epoch": 19.364741260493673, "grad_norm": 10.225214004516602, "learning_rate": 3.071299899791935e-08, "loss": 0.4499, "num_input_tokens_seen": 187934528, "step": 154550 }, { "epoch": 19.365367748402456, "grad_norm": 4.79915714263916, "learning_rate": 3.065252482441605e-08, "loss": 0.4195, "num_input_tokens_seen": 187940640, "step": 154555 }, { "epoch": 19.365994236311238, "grad_norm": 11.118175506591797, "learning_rate": 3.059211006364027e-08, "loss": 0.5019, "num_input_tokens_seen": 187946656, "step": 154560 }, { "epoch": 19.366620724220024, "grad_norm": 7.021004676818848, "learning_rate": 3.053175471631365e-08, "loss": 0.4376, "num_input_tokens_seen": 187952800, "step": 154565 }, { "epoch": 19.367247212128806, "grad_norm": 7.315429210662842, "learning_rate": 3.047145878315838e-08, "loss": 0.4042, "num_input_tokens_seen": 187959168, "step": 154570 }, { "epoch": 19.36787370003759, "grad_norm": 7.787156581878662, "learning_rate": 3.041122226489446e-08, "loss": 0.455, "num_input_tokens_seen": 187964992, "step": 154575 }, { "epoch": 19.368500187946374, "grad_norm": 8.242944717407227, "learning_rate": 3.035104516224296e-08, "loss": 0.3929, "num_input_tokens_seen": 187971136, "step": 154580 }, { "epoch": 19.369126675855156, "grad_norm": 8.946617126464844, "learning_rate": 3.029092747592277e-08, "loss": 0.3758, "num_input_tokens_seen": 187977280, "step": 154585 }, { "epoch": 19.36975316376394, "grad_norm": 26.350322723388672, "learning_rate": 3.023086920665275e-08, "loss": 0.575, "num_input_tokens_seen": 187983680, "step": 154590 }, { "epoch": 19.370379651672724, "grad_norm": 24.647260665893555, "learning_rate": 3.0170870355151205e-08, "loss": 0.6209, "num_input_tokens_seen": 187989760, "step": 154595 }, { "epoch": 19.371006139581507, "grad_norm": 6.915020942687988, "learning_rate": 3.011093092213535e-08, "loss": 0.4239, "num_input_tokens_seen": 187995936, "step": 154600 }, { "epoch": 19.37163262749029, "grad_norm": 4.323150634765625, "learning_rate": 3.0051050908321834e-08, "loss": 0.3854, "num_input_tokens_seen": 188001984, "step": 154605 }, { "epoch": 19.37225911539907, "grad_norm": 7.6330060958862305, "learning_rate": 2.999123031442619e-08, "loss": 0.4202, "num_input_tokens_seen": 188008224, "step": 154610 }, { "epoch": 19.372885603307857, "grad_norm": 5.484438419342041, "learning_rate": 2.993146914116396e-08, "loss": 0.4414, "num_input_tokens_seen": 188014592, "step": 154615 }, { "epoch": 19.37351209121664, "grad_norm": 5.156764984130859, "learning_rate": 2.9871767389250126e-08, "loss": 0.474, "num_input_tokens_seen": 188020544, "step": 154620 }, { "epoch": 19.37413857912542, "grad_norm": 4.790592193603516, "learning_rate": 2.981212505939746e-08, "loss": 0.4273, "num_input_tokens_seen": 188026432, "step": 154625 }, { "epoch": 19.374765067034208, "grad_norm": 4.7311601638793945, "learning_rate": 2.975254215231982e-08, "loss": 0.3758, "num_input_tokens_seen": 188032544, "step": 154630 }, { "epoch": 19.37539155494299, "grad_norm": 6.188992023468018, "learning_rate": 2.9693018668728868e-08, "loss": 0.4798, "num_input_tokens_seen": 188038624, "step": 154635 }, { "epoch": 19.376018042851772, "grad_norm": 4.3624982833862305, "learning_rate": 2.9633554609337366e-08, "loss": 0.4336, "num_input_tokens_seen": 188044864, "step": 154640 }, { "epoch": 19.376644530760558, "grad_norm": 6.0013957023620605, "learning_rate": 2.957414997485475e-08, "loss": 0.3876, "num_input_tokens_seen": 188050912, "step": 154645 }, { "epoch": 19.37727101866934, "grad_norm": 22.35020637512207, "learning_rate": 2.9514804765992667e-08, "loss": 0.4667, "num_input_tokens_seen": 188057312, "step": 154650 }, { "epoch": 19.377897506578122, "grad_norm": 14.887771606445312, "learning_rate": 2.9455518983459996e-08, "loss": 0.4973, "num_input_tokens_seen": 188063104, "step": 154655 }, { "epoch": 19.378523994486905, "grad_norm": 6.533364772796631, "learning_rate": 2.9396292627965618e-08, "loss": 0.3997, "num_input_tokens_seen": 188069184, "step": 154660 }, { "epoch": 19.37915048239569, "grad_norm": 3.1249544620513916, "learning_rate": 2.9337125700217296e-08, "loss": 0.4433, "num_input_tokens_seen": 188075456, "step": 154665 }, { "epoch": 19.379776970304473, "grad_norm": 20.195199966430664, "learning_rate": 2.9278018200922798e-08, "loss": 0.4637, "num_input_tokens_seen": 188081536, "step": 154670 }, { "epoch": 19.380403458213255, "grad_norm": 4.540926456451416, "learning_rate": 2.921897013078878e-08, "loss": 0.4322, "num_input_tokens_seen": 188087520, "step": 154675 }, { "epoch": 19.38102994612204, "grad_norm": 32.23853302001953, "learning_rate": 2.915998149052135e-08, "loss": 0.4607, "num_input_tokens_seen": 188093664, "step": 154680 }, { "epoch": 19.381656434030823, "grad_norm": 5.3056793212890625, "learning_rate": 2.9101052280825492e-08, "loss": 0.3947, "num_input_tokens_seen": 188099872, "step": 154685 }, { "epoch": 19.382282921939606, "grad_norm": 3.098843812942505, "learning_rate": 2.9042182502406202e-08, "loss": 0.3747, "num_input_tokens_seen": 188106272, "step": 154690 }, { "epoch": 19.38290940984839, "grad_norm": 11.585780143737793, "learning_rate": 2.8983372155966805e-08, "loss": 0.4345, "num_input_tokens_seen": 188112288, "step": 154695 }, { "epoch": 19.383535897757174, "grad_norm": 9.021413803100586, "learning_rate": 2.892462124221007e-08, "loss": 0.4198, "num_input_tokens_seen": 188118560, "step": 154700 }, { "epoch": 19.384162385665956, "grad_norm": 7.7067389488220215, "learning_rate": 2.886592976183933e-08, "loss": 0.4311, "num_input_tokens_seen": 188124640, "step": 154705 }, { "epoch": 19.384788873574742, "grad_norm": 4.7515645027160645, "learning_rate": 2.8807297715555682e-08, "loss": 0.528, "num_input_tokens_seen": 188130976, "step": 154710 }, { "epoch": 19.385415361483524, "grad_norm": 5.7501630783081055, "learning_rate": 2.8748725104060793e-08, "loss": 0.4011, "num_input_tokens_seen": 188136480, "step": 154715 }, { "epoch": 19.386041849392306, "grad_norm": 4.511610507965088, "learning_rate": 2.8690211928054102e-08, "loss": 0.4441, "num_input_tokens_seen": 188142816, "step": 154720 }, { "epoch": 19.38666833730109, "grad_norm": 20.855371475219727, "learning_rate": 2.8631758188235603e-08, "loss": 0.5182, "num_input_tokens_seen": 188149152, "step": 154725 }, { "epoch": 19.387294825209874, "grad_norm": 10.862733840942383, "learning_rate": 2.857336388530474e-08, "loss": 0.4672, "num_input_tokens_seen": 188155232, "step": 154730 }, { "epoch": 19.387921313118657, "grad_norm": 6.129744529724121, "learning_rate": 2.8515029019958174e-08, "loss": 0.4445, "num_input_tokens_seen": 188161376, "step": 154735 }, { "epoch": 19.38854780102744, "grad_norm": 8.66439437866211, "learning_rate": 2.8456753592894792e-08, "loss": 0.4893, "num_input_tokens_seen": 188167488, "step": 154740 }, { "epoch": 19.389174288936225, "grad_norm": 5.560525894165039, "learning_rate": 2.8398537604810704e-08, "loss": 0.4556, "num_input_tokens_seen": 188172992, "step": 154745 }, { "epoch": 19.389800776845007, "grad_norm": 6.543185234069824, "learning_rate": 2.8340381056401466e-08, "loss": 0.5289, "num_input_tokens_seen": 188178976, "step": 154750 }, { "epoch": 19.39042726475379, "grad_norm": 7.52673864364624, "learning_rate": 2.8282283948363743e-08, "loss": 0.4323, "num_input_tokens_seen": 188185216, "step": 154755 }, { "epoch": 19.391053752662575, "grad_norm": 21.431257247924805, "learning_rate": 2.8224246281390867e-08, "loss": 0.4497, "num_input_tokens_seen": 188191168, "step": 154760 }, { "epoch": 19.391680240571358, "grad_norm": 17.319759368896484, "learning_rate": 2.8166268056176726e-08, "loss": 0.4057, "num_input_tokens_seen": 188197440, "step": 154765 }, { "epoch": 19.39230672848014, "grad_norm": 5.242209434509277, "learning_rate": 2.8108349273415216e-08, "loss": 0.4171, "num_input_tokens_seen": 188203552, "step": 154770 }, { "epoch": 19.392933216388922, "grad_norm": 7.351499557495117, "learning_rate": 2.8050489933799108e-08, "loss": 0.4904, "num_input_tokens_seen": 188209856, "step": 154775 }, { "epoch": 19.393559704297708, "grad_norm": 6.12777853012085, "learning_rate": 2.7992690038018966e-08, "loss": 0.4392, "num_input_tokens_seen": 188216064, "step": 154780 }, { "epoch": 19.39418619220649, "grad_norm": 5.8480024337768555, "learning_rate": 2.793494958676701e-08, "loss": 0.438, "num_input_tokens_seen": 188221056, "step": 154785 }, { "epoch": 19.394812680115272, "grad_norm": 17.68494415283203, "learning_rate": 2.7877268580732143e-08, "loss": 0.4454, "num_input_tokens_seen": 188226912, "step": 154790 }, { "epoch": 19.39543916802406, "grad_norm": 5.0662031173706055, "learning_rate": 2.7819647020606022e-08, "loss": 0.4216, "num_input_tokens_seen": 188233024, "step": 154795 }, { "epoch": 19.39606565593284, "grad_norm": 6.094033718109131, "learning_rate": 2.7762084907075325e-08, "loss": 0.4401, "num_input_tokens_seen": 188239040, "step": 154800 }, { "epoch": 19.396692143841623, "grad_norm": 7.191147327423096, "learning_rate": 2.7704582240830057e-08, "loss": 0.4552, "num_input_tokens_seen": 188244256, "step": 154805 }, { "epoch": 19.39731863175041, "grad_norm": 17.921743392944336, "learning_rate": 2.7647139022556336e-08, "loss": 0.5493, "num_input_tokens_seen": 188250208, "step": 154810 }, { "epoch": 19.39794511965919, "grad_norm": 6.899609088897705, "learning_rate": 2.7589755252942496e-08, "loss": 0.4906, "num_input_tokens_seen": 188256160, "step": 154815 }, { "epoch": 19.398571607567973, "grad_norm": 6.917669773101807, "learning_rate": 2.7532430932672994e-08, "loss": 0.3956, "num_input_tokens_seen": 188261728, "step": 154820 }, { "epoch": 19.399198095476756, "grad_norm": 12.092580795288086, "learning_rate": 2.7475166062433944e-08, "loss": 0.4937, "num_input_tokens_seen": 188267904, "step": 154825 }, { "epoch": 19.39982458338554, "grad_norm": 11.30064582824707, "learning_rate": 2.74179606429098e-08, "loss": 0.4507, "num_input_tokens_seen": 188273888, "step": 154830 }, { "epoch": 19.400451071294324, "grad_norm": 7.993271350860596, "learning_rate": 2.736081467478502e-08, "loss": 0.4112, "num_input_tokens_seen": 188280384, "step": 154835 }, { "epoch": 19.401077559203106, "grad_norm": 5.400484085083008, "learning_rate": 2.7303728158742382e-08, "loss": 0.4338, "num_input_tokens_seen": 188286176, "step": 154840 }, { "epoch": 19.401704047111892, "grad_norm": 23.11698341369629, "learning_rate": 2.724670109546468e-08, "loss": 0.5202, "num_input_tokens_seen": 188292288, "step": 154845 }, { "epoch": 19.402330535020674, "grad_norm": 3.8844101428985596, "learning_rate": 2.7189733485633583e-08, "loss": 0.4523, "num_input_tokens_seen": 188298528, "step": 154850 }, { "epoch": 19.402957022929456, "grad_norm": 4.785308837890625, "learning_rate": 2.713282532992967e-08, "loss": 0.3936, "num_input_tokens_seen": 188304640, "step": 154855 }, { "epoch": 19.403583510838242, "grad_norm": 5.599616050720215, "learning_rate": 2.7075976629033496e-08, "loss": 0.4626, "num_input_tokens_seen": 188310528, "step": 154860 }, { "epoch": 19.404209998747024, "grad_norm": 13.067138671875, "learning_rate": 2.7019187383625634e-08, "loss": 0.4775, "num_input_tokens_seen": 188316576, "step": 154865 }, { "epoch": 19.404836486655807, "grad_norm": 17.875944137573242, "learning_rate": 2.6962457594384427e-08, "loss": 0.4615, "num_input_tokens_seen": 188322720, "step": 154870 }, { "epoch": 19.405462974564593, "grad_norm": 18.50028419494629, "learning_rate": 2.6905787261988225e-08, "loss": 0.5499, "num_input_tokens_seen": 188328704, "step": 154875 }, { "epoch": 19.406089462473375, "grad_norm": 15.259666442871094, "learning_rate": 2.684917638711426e-08, "loss": 0.6144, "num_input_tokens_seen": 188334848, "step": 154880 }, { "epoch": 19.406715950382157, "grad_norm": 13.150529861450195, "learning_rate": 2.679262497043922e-08, "loss": 0.4483, "num_input_tokens_seen": 188340832, "step": 154885 }, { "epoch": 19.40734243829094, "grad_norm": 29.26520538330078, "learning_rate": 2.673613301264033e-08, "loss": 0.4946, "num_input_tokens_seen": 188346784, "step": 154890 }, { "epoch": 19.407968926199725, "grad_norm": 17.994476318359375, "learning_rate": 2.667970051439206e-08, "loss": 0.4316, "num_input_tokens_seen": 188352416, "step": 154895 }, { "epoch": 19.408595414108508, "grad_norm": 4.934402942657471, "learning_rate": 2.6623327476368866e-08, "loss": 0.4475, "num_input_tokens_seen": 188358560, "step": 154900 }, { "epoch": 19.40922190201729, "grad_norm": 5.671901702880859, "learning_rate": 2.6567013899245765e-08, "loss": 0.5111, "num_input_tokens_seen": 188364800, "step": 154905 }, { "epoch": 19.409848389926076, "grad_norm": 9.005884170532227, "learning_rate": 2.6510759783694994e-08, "loss": 0.4069, "num_input_tokens_seen": 188371008, "step": 154910 }, { "epoch": 19.410474877834858, "grad_norm": 5.079419136047363, "learning_rate": 2.6454565130389907e-08, "loss": 0.4179, "num_input_tokens_seen": 188377216, "step": 154915 }, { "epoch": 19.41110136574364, "grad_norm": 5.1277241706848145, "learning_rate": 2.6398429940001636e-08, "loss": 0.4349, "num_input_tokens_seen": 188383520, "step": 154920 }, { "epoch": 19.411727853652426, "grad_norm": 5.054850101470947, "learning_rate": 2.634235421320186e-08, "loss": 0.4592, "num_input_tokens_seen": 188389408, "step": 154925 }, { "epoch": 19.41235434156121, "grad_norm": 7.630918979644775, "learning_rate": 2.6286337950661155e-08, "loss": 0.4134, "num_input_tokens_seen": 188395552, "step": 154930 }, { "epoch": 19.41298082946999, "grad_norm": 5.545123100280762, "learning_rate": 2.6230381153048434e-08, "loss": 0.4401, "num_input_tokens_seen": 188401376, "step": 154935 }, { "epoch": 19.413607317378773, "grad_norm": 6.441939830780029, "learning_rate": 2.6174483821033713e-08, "loss": 0.4638, "num_input_tokens_seen": 188407680, "step": 154940 }, { "epoch": 19.41423380528756, "grad_norm": 5.2901506423950195, "learning_rate": 2.6118645955284793e-08, "loss": 0.5117, "num_input_tokens_seen": 188414016, "step": 154945 }, { "epoch": 19.41486029319634, "grad_norm": 14.911859512329102, "learning_rate": 2.606286755646892e-08, "loss": 0.4542, "num_input_tokens_seen": 188420448, "step": 154950 }, { "epoch": 19.415486781105123, "grad_norm": 10.65318775177002, "learning_rate": 2.600714862525333e-08, "loss": 0.4565, "num_input_tokens_seen": 188426624, "step": 154955 }, { "epoch": 19.41611326901391, "grad_norm": 7.503287315368652, "learning_rate": 2.5951489162304168e-08, "loss": 0.5505, "num_input_tokens_seen": 188432704, "step": 154960 }, { "epoch": 19.41673975692269, "grad_norm": 6.507822036743164, "learning_rate": 2.5895889168287004e-08, "loss": 0.4691, "num_input_tokens_seen": 188438912, "step": 154965 }, { "epoch": 19.417366244831474, "grad_norm": 6.792926788330078, "learning_rate": 2.5840348643866864e-08, "loss": 0.4478, "num_input_tokens_seen": 188445312, "step": 154970 }, { "epoch": 19.41799273274026, "grad_norm": 8.6442232131958, "learning_rate": 2.578486758970655e-08, "loss": 0.4994, "num_input_tokens_seen": 188451360, "step": 154975 }, { "epoch": 19.418619220649042, "grad_norm": 32.83554458618164, "learning_rate": 2.5729446006471092e-08, "loss": 0.4657, "num_input_tokens_seen": 188457536, "step": 154980 }, { "epoch": 19.419245708557824, "grad_norm": 14.601789474487305, "learning_rate": 2.5674083894821622e-08, "loss": 0.51, "num_input_tokens_seen": 188463872, "step": 154985 }, { "epoch": 19.41987219646661, "grad_norm": 10.567177772521973, "learning_rate": 2.5618781255421497e-08, "loss": 0.4382, "num_input_tokens_seen": 188469952, "step": 154990 }, { "epoch": 19.420498684375392, "grad_norm": 5.479039192199707, "learning_rate": 2.5563538088930196e-08, "loss": 0.4546, "num_input_tokens_seen": 188476000, "step": 154995 }, { "epoch": 19.421125172284174, "grad_norm": 21.46242332458496, "learning_rate": 2.5508354396009406e-08, "loss": 0.477, "num_input_tokens_seen": 188482176, "step": 155000 }, { "epoch": 19.421751660192957, "grad_norm": 9.755671501159668, "learning_rate": 2.54532301773186e-08, "loss": 0.449, "num_input_tokens_seen": 188488224, "step": 155005 }, { "epoch": 19.422378148101743, "grad_norm": 3.6037206649780273, "learning_rate": 2.5398165433516697e-08, "loss": 0.4477, "num_input_tokens_seen": 188493280, "step": 155010 }, { "epoch": 19.423004636010525, "grad_norm": 6.555286407470703, "learning_rate": 2.534316016526206e-08, "loss": 0.3939, "num_input_tokens_seen": 188499520, "step": 155015 }, { "epoch": 19.423631123919307, "grad_norm": 4.087812900543213, "learning_rate": 2.5288214373212494e-08, "loss": 0.4214, "num_input_tokens_seen": 188505472, "step": 155020 }, { "epoch": 19.424257611828093, "grad_norm": 10.277117729187012, "learning_rate": 2.523332805802525e-08, "loss": 0.4231, "num_input_tokens_seen": 188511296, "step": 155025 }, { "epoch": 19.424884099736875, "grad_norm": 7.424834728240967, "learning_rate": 2.5178501220355366e-08, "loss": 0.4502, "num_input_tokens_seen": 188517248, "step": 155030 }, { "epoch": 19.425510587645658, "grad_norm": 7.923213005065918, "learning_rate": 2.5123733860859534e-08, "loss": 0.3966, "num_input_tokens_seen": 188522784, "step": 155035 }, { "epoch": 19.426137075554443, "grad_norm": 14.668600082397461, "learning_rate": 2.506902598019223e-08, "loss": 0.4305, "num_input_tokens_seen": 188528672, "step": 155040 }, { "epoch": 19.426763563463226, "grad_norm": 10.923377990722656, "learning_rate": 2.5014377579007376e-08, "loss": 0.4891, "num_input_tokens_seen": 188534816, "step": 155045 }, { "epoch": 19.427390051372008, "grad_norm": 18.6074275970459, "learning_rate": 2.495978865795834e-08, "loss": 0.453, "num_input_tokens_seen": 188540608, "step": 155050 }, { "epoch": 19.42801653928079, "grad_norm": 7.084512233734131, "learning_rate": 2.4905259217697374e-08, "loss": 0.4275, "num_input_tokens_seen": 188547168, "step": 155055 }, { "epoch": 19.428643027189576, "grad_norm": 5.8156046867370605, "learning_rate": 2.4850789258877294e-08, "loss": 0.4604, "num_input_tokens_seen": 188553408, "step": 155060 }, { "epoch": 19.42926951509836, "grad_norm": 14.434013366699219, "learning_rate": 2.479637878214869e-08, "loss": 0.5082, "num_input_tokens_seen": 188559616, "step": 155065 }, { "epoch": 19.42989600300714, "grad_norm": 7.287076473236084, "learning_rate": 2.474202778816215e-08, "loss": 0.4177, "num_input_tokens_seen": 188565632, "step": 155070 }, { "epoch": 19.430522490915926, "grad_norm": 6.185164928436279, "learning_rate": 2.4687736277568264e-08, "loss": 0.4435, "num_input_tokens_seen": 188571552, "step": 155075 }, { "epoch": 19.43114897882471, "grad_norm": 7.782191753387451, "learning_rate": 2.463350425101485e-08, "loss": 0.4834, "num_input_tokens_seen": 188577504, "step": 155080 }, { "epoch": 19.43177546673349, "grad_norm": 18.11566162109375, "learning_rate": 2.457933170915139e-08, "loss": 0.4675, "num_input_tokens_seen": 188583904, "step": 155085 }, { "epoch": 19.432401954642277, "grad_norm": 11.017189979553223, "learning_rate": 2.4525218652625138e-08, "loss": 0.4519, "num_input_tokens_seen": 188589344, "step": 155090 }, { "epoch": 19.43302844255106, "grad_norm": 3.3142266273498535, "learning_rate": 2.4471165082082804e-08, "loss": 0.4131, "num_input_tokens_seen": 188595520, "step": 155095 }, { "epoch": 19.43365493045984, "grad_norm": 10.690228462219238, "learning_rate": 2.441717099817109e-08, "loss": 0.4796, "num_input_tokens_seen": 188601728, "step": 155100 }, { "epoch": 19.434281418368627, "grad_norm": 4.558322429656982, "learning_rate": 2.436323640153504e-08, "loss": 0.4648, "num_input_tokens_seen": 188607744, "step": 155105 }, { "epoch": 19.43490790627741, "grad_norm": 4.792394638061523, "learning_rate": 2.4309361292820245e-08, "loss": 0.4084, "num_input_tokens_seen": 188613696, "step": 155110 }, { "epoch": 19.43553439418619, "grad_norm": 3.9852442741394043, "learning_rate": 2.4255545672670076e-08, "loss": 0.4117, "num_input_tokens_seen": 188619584, "step": 155115 }, { "epoch": 19.436160882094974, "grad_norm": 9.512899398803711, "learning_rate": 2.4201789541727915e-08, "loss": 0.3884, "num_input_tokens_seen": 188625536, "step": 155120 }, { "epoch": 19.43678737000376, "grad_norm": 4.6758551597595215, "learning_rate": 2.414809290063713e-08, "loss": 0.397, "num_input_tokens_seen": 188631456, "step": 155125 }, { "epoch": 19.437413857912542, "grad_norm": 27.342041015625, "learning_rate": 2.4094455750039436e-08, "loss": 0.4903, "num_input_tokens_seen": 188637760, "step": 155130 }, { "epoch": 19.438040345821324, "grad_norm": 9.526246070861816, "learning_rate": 2.404087809057598e-08, "loss": 0.4532, "num_input_tokens_seen": 188643136, "step": 155135 }, { "epoch": 19.43866683373011, "grad_norm": 5.113968372344971, "learning_rate": 2.3987359922887365e-08, "loss": 0.4281, "num_input_tokens_seen": 188649440, "step": 155140 }, { "epoch": 19.439293321638893, "grad_norm": 12.143173217773438, "learning_rate": 2.3933901247613634e-08, "loss": 0.4813, "num_input_tokens_seen": 188655392, "step": 155145 }, { "epoch": 19.439919809547675, "grad_norm": 16.031091690063477, "learning_rate": 2.3880502065393164e-08, "loss": 0.4589, "num_input_tokens_seen": 188661632, "step": 155150 }, { "epoch": 19.44054629745646, "grad_norm": 4.364701747894287, "learning_rate": 2.3827162376865443e-08, "loss": 0.5146, "num_input_tokens_seen": 188667648, "step": 155155 }, { "epoch": 19.441172785365243, "grad_norm": 7.045782566070557, "learning_rate": 2.377388218266774e-08, "loss": 0.4356, "num_input_tokens_seen": 188673984, "step": 155160 }, { "epoch": 19.441799273274025, "grad_norm": 5.317216873168945, "learning_rate": 2.3720661483436768e-08, "loss": 0.4062, "num_input_tokens_seen": 188680160, "step": 155165 }, { "epoch": 19.442425761182808, "grad_norm": 11.101457595825195, "learning_rate": 2.366750027980924e-08, "loss": 0.4593, "num_input_tokens_seen": 188686432, "step": 155170 }, { "epoch": 19.443052249091593, "grad_norm": 4.814892768859863, "learning_rate": 2.361439857242076e-08, "loss": 0.4313, "num_input_tokens_seen": 188692352, "step": 155175 }, { "epoch": 19.443678737000376, "grad_norm": 16.64969825744629, "learning_rate": 2.3561356361905818e-08, "loss": 0.4551, "num_input_tokens_seen": 188698208, "step": 155180 }, { "epoch": 19.444305224909158, "grad_norm": 7.263765811920166, "learning_rate": 2.350837364889946e-08, "loss": 0.3852, "num_input_tokens_seen": 188704320, "step": 155185 }, { "epoch": 19.444931712817944, "grad_norm": 6.141375541687012, "learning_rate": 2.3455450434033968e-08, "loss": 0.4889, "num_input_tokens_seen": 188710432, "step": 155190 }, { "epoch": 19.445558200726726, "grad_norm": 25.63015365600586, "learning_rate": 2.340258671794271e-08, "loss": 0.5203, "num_input_tokens_seen": 188716544, "step": 155195 }, { "epoch": 19.44618468863551, "grad_norm": 9.805500984191895, "learning_rate": 2.3349782501257413e-08, "loss": 0.4377, "num_input_tokens_seen": 188722752, "step": 155200 }, { "epoch": 19.446811176544294, "grad_norm": 5.883247375488281, "learning_rate": 2.3297037784609787e-08, "loss": 0.4342, "num_input_tokens_seen": 188728992, "step": 155205 }, { "epoch": 19.447437664453076, "grad_norm": 4.127749443054199, "learning_rate": 2.3244352568630445e-08, "loss": 0.4416, "num_input_tokens_seen": 188735168, "step": 155210 }, { "epoch": 19.44806415236186, "grad_norm": 4.988032341003418, "learning_rate": 2.319172685394888e-08, "loss": 0.3919, "num_input_tokens_seen": 188741184, "step": 155215 }, { "epoch": 19.44869064027064, "grad_norm": 5.693800449371338, "learning_rate": 2.313916064119459e-08, "loss": 0.4595, "num_input_tokens_seen": 188747040, "step": 155220 }, { "epoch": 19.449317128179427, "grad_norm": 13.406240463256836, "learning_rate": 2.308665393099596e-08, "loss": 0.43, "num_input_tokens_seen": 188752896, "step": 155225 }, { "epoch": 19.44994361608821, "grad_norm": 11.678905487060547, "learning_rate": 2.3034206723980268e-08, "loss": 0.4543, "num_input_tokens_seen": 188759040, "step": 155230 }, { "epoch": 19.45057010399699, "grad_norm": 6.221916675567627, "learning_rate": 2.298181902077534e-08, "loss": 0.4875, "num_input_tokens_seen": 188765408, "step": 155235 }, { "epoch": 19.451196591905777, "grad_norm": 4.166777610778809, "learning_rate": 2.2929490822007905e-08, "loss": 0.3925, "num_input_tokens_seen": 188771552, "step": 155240 }, { "epoch": 19.45182307981456, "grad_norm": 8.589136123657227, "learning_rate": 2.2877222128301902e-08, "loss": 0.4167, "num_input_tokens_seen": 188777504, "step": 155245 }, { "epoch": 19.45244956772334, "grad_norm": 6.793211460113525, "learning_rate": 2.2825012940284053e-08, "loss": 0.457, "num_input_tokens_seen": 188783584, "step": 155250 }, { "epoch": 19.453076055632128, "grad_norm": 16.58559226989746, "learning_rate": 2.2772863258577193e-08, "loss": 0.4292, "num_input_tokens_seen": 188789600, "step": 155255 }, { "epoch": 19.45370254354091, "grad_norm": 15.89223575592041, "learning_rate": 2.272077308380527e-08, "loss": 0.4739, "num_input_tokens_seen": 188796064, "step": 155260 }, { "epoch": 19.454329031449692, "grad_norm": 7.444750785827637, "learning_rate": 2.266874241659167e-08, "loss": 0.4406, "num_input_tokens_seen": 188802080, "step": 155265 }, { "epoch": 19.454955519358478, "grad_norm": 6.07996129989624, "learning_rate": 2.261677125755757e-08, "loss": 0.4133, "num_input_tokens_seen": 188808672, "step": 155270 }, { "epoch": 19.45558200726726, "grad_norm": 7.165733814239502, "learning_rate": 2.2564859607324686e-08, "loss": 0.3863, "num_input_tokens_seen": 188814976, "step": 155275 }, { "epoch": 19.456208495176043, "grad_norm": 10.091631889343262, "learning_rate": 2.2513007466514192e-08, "loss": 0.3939, "num_input_tokens_seen": 188821024, "step": 155280 }, { "epoch": 19.456834983084825, "grad_norm": 5.573678016662598, "learning_rate": 2.246121483574504e-08, "loss": 0.5391, "num_input_tokens_seen": 188827264, "step": 155285 }, { "epoch": 19.45746147099361, "grad_norm": 15.106717109680176, "learning_rate": 2.2409481715636727e-08, "loss": 0.4159, "num_input_tokens_seen": 188833184, "step": 155290 }, { "epoch": 19.458087958902393, "grad_norm": 18.125904083251953, "learning_rate": 2.2357808106808766e-08, "loss": 0.4943, "num_input_tokens_seen": 188839392, "step": 155295 }, { "epoch": 19.458714446811175, "grad_norm": 11.161402702331543, "learning_rate": 2.230619400987788e-08, "loss": 0.489, "num_input_tokens_seen": 188844704, "step": 155300 }, { "epoch": 19.45934093471996, "grad_norm": 8.507698059082031, "learning_rate": 2.2254639425461354e-08, "loss": 0.436, "num_input_tokens_seen": 188850880, "step": 155305 }, { "epoch": 19.459967422628743, "grad_norm": 22.88502311706543, "learning_rate": 2.2203144354175365e-08, "loss": 0.4035, "num_input_tokens_seen": 188856992, "step": 155310 }, { "epoch": 19.460593910537526, "grad_norm": 14.525898933410645, "learning_rate": 2.215170879663664e-08, "loss": 0.4194, "num_input_tokens_seen": 188863392, "step": 155315 }, { "epoch": 19.46122039844631, "grad_norm": 3.879065752029419, "learning_rate": 2.2100332753458577e-08, "loss": 0.4209, "num_input_tokens_seen": 188869504, "step": 155320 }, { "epoch": 19.461846886355094, "grad_norm": 38.83277893066406, "learning_rate": 2.2049016225256792e-08, "loss": 0.5199, "num_input_tokens_seen": 188875872, "step": 155325 }, { "epoch": 19.462473374263876, "grad_norm": 7.032590389251709, "learning_rate": 2.1997759212644133e-08, "loss": 0.416, "num_input_tokens_seen": 188881984, "step": 155330 }, { "epoch": 19.463099862172662, "grad_norm": 10.660432815551758, "learning_rate": 2.1946561716233438e-08, "loss": 0.4088, "num_input_tokens_seen": 188888160, "step": 155335 }, { "epoch": 19.463726350081444, "grad_norm": 4.78642463684082, "learning_rate": 2.1895423736637e-08, "loss": 0.4419, "num_input_tokens_seen": 188894368, "step": 155340 }, { "epoch": 19.464352837990226, "grad_norm": 6.668210506439209, "learning_rate": 2.1844345274465996e-08, "loss": 0.4785, "num_input_tokens_seen": 188900256, "step": 155345 }, { "epoch": 19.46497932589901, "grad_norm": 3.922426223754883, "learning_rate": 2.17933263303316e-08, "loss": 0.3933, "num_input_tokens_seen": 188906176, "step": 155350 }, { "epoch": 19.465605813807795, "grad_norm": 7.295553684234619, "learning_rate": 2.1742366904842772e-08, "loss": 0.4311, "num_input_tokens_seen": 188912320, "step": 155355 }, { "epoch": 19.466232301716577, "grad_norm": 10.155112266540527, "learning_rate": 2.1691466998610134e-08, "loss": 0.4487, "num_input_tokens_seen": 188918368, "step": 155360 }, { "epoch": 19.46685878962536, "grad_norm": 6.628993034362793, "learning_rate": 2.1640626612240978e-08, "loss": 0.4155, "num_input_tokens_seen": 188924640, "step": 155365 }, { "epoch": 19.467485277534145, "grad_norm": 28.4637508392334, "learning_rate": 2.1589845746344262e-08, "loss": 0.4801, "num_input_tokens_seen": 188931008, "step": 155370 }, { "epoch": 19.468111765442927, "grad_norm": 27.734458923339844, "learning_rate": 2.1539124401526167e-08, "loss": 0.4462, "num_input_tokens_seen": 188937248, "step": 155375 }, { "epoch": 19.46873825335171, "grad_norm": 5.0835700035095215, "learning_rate": 2.1488462578393432e-08, "loss": 0.4195, "num_input_tokens_seen": 188943328, "step": 155380 }, { "epoch": 19.469364741260495, "grad_norm": 5.79398250579834, "learning_rate": 2.1437860277552237e-08, "loss": 0.4537, "num_input_tokens_seen": 188949344, "step": 155385 }, { "epoch": 19.469991229169278, "grad_norm": 7.971628665924072, "learning_rate": 2.1387317499606542e-08, "loss": 0.3735, "num_input_tokens_seen": 188955904, "step": 155390 }, { "epoch": 19.47061771707806, "grad_norm": 11.826911926269531, "learning_rate": 2.1336834245161975e-08, "loss": 0.3979, "num_input_tokens_seen": 188961952, "step": 155395 }, { "epoch": 19.471244204986842, "grad_norm": 5.371611595153809, "learning_rate": 2.1286410514820833e-08, "loss": 0.36, "num_input_tokens_seen": 188968160, "step": 155400 }, { "epoch": 19.471870692895628, "grad_norm": 19.1668643951416, "learning_rate": 2.123604630918652e-08, "loss": 0.4386, "num_input_tokens_seen": 188974400, "step": 155405 }, { "epoch": 19.47249718080441, "grad_norm": 14.586162567138672, "learning_rate": 2.1185741628861333e-08, "loss": 0.4171, "num_input_tokens_seen": 188980480, "step": 155410 }, { "epoch": 19.473123668713193, "grad_norm": 5.5237250328063965, "learning_rate": 2.1135496474447016e-08, "loss": 0.4422, "num_input_tokens_seen": 188986336, "step": 155415 }, { "epoch": 19.47375015662198, "grad_norm": 13.135844230651855, "learning_rate": 2.1085310846543084e-08, "loss": 0.4204, "num_input_tokens_seen": 188992512, "step": 155420 }, { "epoch": 19.47437664453076, "grad_norm": 4.530538082122803, "learning_rate": 2.103518474575128e-08, "loss": 0.3787, "num_input_tokens_seen": 188998688, "step": 155425 }, { "epoch": 19.475003132439543, "grad_norm": 10.238204956054688, "learning_rate": 2.0985118172668907e-08, "loss": 0.5078, "num_input_tokens_seen": 189004960, "step": 155430 }, { "epoch": 19.47562962034833, "grad_norm": 13.294856071472168, "learning_rate": 2.0935111127896037e-08, "loss": 0.4882, "num_input_tokens_seen": 189011040, "step": 155435 }, { "epoch": 19.47625610825711, "grad_norm": 13.288626670837402, "learning_rate": 2.088516361202997e-08, "loss": 0.4662, "num_input_tokens_seen": 189017152, "step": 155440 }, { "epoch": 19.476882596165893, "grad_norm": 17.115690231323242, "learning_rate": 2.0835275625668006e-08, "loss": 0.4675, "num_input_tokens_seen": 189023136, "step": 155445 }, { "epoch": 19.477509084074676, "grad_norm": 6.172759532928467, "learning_rate": 2.078544716940689e-08, "loss": 0.476, "num_input_tokens_seen": 189029504, "step": 155450 }, { "epoch": 19.47813557198346, "grad_norm": 4.699851989746094, "learning_rate": 2.0735678243841705e-08, "loss": 0.382, "num_input_tokens_seen": 189035552, "step": 155455 }, { "epoch": 19.478762059892244, "grad_norm": 18.976703643798828, "learning_rate": 2.068596884956753e-08, "loss": 0.5172, "num_input_tokens_seen": 189041824, "step": 155460 }, { "epoch": 19.479388547801026, "grad_norm": 5.301174163818359, "learning_rate": 2.0636318987179438e-08, "loss": 0.4347, "num_input_tokens_seen": 189047456, "step": 155465 }, { "epoch": 19.480015035709812, "grad_norm": 4.567154884338379, "learning_rate": 2.058672865727085e-08, "loss": 0.422, "num_input_tokens_seen": 189053792, "step": 155470 }, { "epoch": 19.480641523618594, "grad_norm": 5.663530349731445, "learning_rate": 2.0537197860433512e-08, "loss": 0.3995, "num_input_tokens_seen": 189060096, "step": 155475 }, { "epoch": 19.481268011527376, "grad_norm": 9.314103126525879, "learning_rate": 2.0487726597260838e-08, "loss": 0.4004, "num_input_tokens_seen": 189066208, "step": 155480 }, { "epoch": 19.481894499436162, "grad_norm": 5.107113838195801, "learning_rate": 2.0438314868344023e-08, "loss": 0.3808, "num_input_tokens_seen": 189072512, "step": 155485 }, { "epoch": 19.482520987344945, "grad_norm": 35.39574432373047, "learning_rate": 2.038896267427426e-08, "loss": 0.5381, "num_input_tokens_seen": 189078688, "step": 155490 }, { "epoch": 19.483147475253727, "grad_norm": 11.380622863769531, "learning_rate": 2.0339670015639967e-08, "loss": 0.5002, "num_input_tokens_seen": 189084704, "step": 155495 }, { "epoch": 19.483773963162513, "grad_norm": 7.455352783203125, "learning_rate": 2.029043689303234e-08, "loss": 0.4214, "num_input_tokens_seen": 189090784, "step": 155500 }, { "epoch": 19.484400451071295, "grad_norm": 4.221345901489258, "learning_rate": 2.024126330703924e-08, "loss": 0.4088, "num_input_tokens_seen": 189096736, "step": 155505 }, { "epoch": 19.485026938980077, "grad_norm": 12.112915992736816, "learning_rate": 2.0192149258248528e-08, "loss": 0.4791, "num_input_tokens_seen": 189103008, "step": 155510 }, { "epoch": 19.48565342688886, "grad_norm": 7.642195224761963, "learning_rate": 2.0143094747247517e-08, "loss": 0.41, "num_input_tokens_seen": 189108992, "step": 155515 }, { "epoch": 19.486279914797645, "grad_norm": 16.013242721557617, "learning_rate": 2.00940997746224e-08, "loss": 0.5179, "num_input_tokens_seen": 189114816, "step": 155520 }, { "epoch": 19.486906402706428, "grad_norm": 3.6867480278015137, "learning_rate": 2.0045164340959376e-08, "loss": 0.3966, "num_input_tokens_seen": 189120864, "step": 155525 }, { "epoch": 19.48753289061521, "grad_norm": 12.800071716308594, "learning_rate": 1.9996288446842982e-08, "loss": 0.5054, "num_input_tokens_seen": 189127392, "step": 155530 }, { "epoch": 19.488159378523996, "grad_norm": 4.636980056762695, "learning_rate": 1.99474720928583e-08, "loss": 0.41, "num_input_tokens_seen": 189133504, "step": 155535 }, { "epoch": 19.488785866432778, "grad_norm": 7.759599208831787, "learning_rate": 1.98987152795882e-08, "loss": 0.4273, "num_input_tokens_seen": 189139936, "step": 155540 }, { "epoch": 19.48941235434156, "grad_norm": 4.042569160461426, "learning_rate": 1.9850018007616657e-08, "loss": 0.413, "num_input_tokens_seen": 189145792, "step": 155545 }, { "epoch": 19.490038842250346, "grad_norm": 15.35268783569336, "learning_rate": 1.9801380277524874e-08, "loss": 0.476, "num_input_tokens_seen": 189151840, "step": 155550 }, { "epoch": 19.49066533015913, "grad_norm": 28.727506637573242, "learning_rate": 1.9752802089894605e-08, "loss": 0.4592, "num_input_tokens_seen": 189157760, "step": 155555 }, { "epoch": 19.49129181806791, "grad_norm": 6.055774211883545, "learning_rate": 1.970428344530706e-08, "loss": 0.438, "num_input_tokens_seen": 189163040, "step": 155560 }, { "epoch": 19.491918305976693, "grad_norm": 17.226530075073242, "learning_rate": 1.9655824344341767e-08, "loss": 0.5836, "num_input_tokens_seen": 189169344, "step": 155565 }, { "epoch": 19.49254479388548, "grad_norm": 22.851179122924805, "learning_rate": 1.9607424787578266e-08, "loss": 0.4308, "num_input_tokens_seen": 189175776, "step": 155570 }, { "epoch": 19.49317128179426, "grad_norm": 8.34648323059082, "learning_rate": 1.9559084775594983e-08, "loss": 0.4911, "num_input_tokens_seen": 189182016, "step": 155575 }, { "epoch": 19.493797769703043, "grad_norm": 5.472936153411865, "learning_rate": 1.95108043089709e-08, "loss": 0.5007, "num_input_tokens_seen": 189187840, "step": 155580 }, { "epoch": 19.49442425761183, "grad_norm": 8.528524398803711, "learning_rate": 1.946258338828222e-08, "loss": 0.5073, "num_input_tokens_seen": 189194208, "step": 155585 }, { "epoch": 19.49505074552061, "grad_norm": 17.53444480895996, "learning_rate": 1.941442201410515e-08, "loss": 0.4397, "num_input_tokens_seen": 189200544, "step": 155590 }, { "epoch": 19.495677233429394, "grad_norm": 9.219225883483887, "learning_rate": 1.9366320187017006e-08, "loss": 0.4178, "num_input_tokens_seen": 189206560, "step": 155595 }, { "epoch": 19.49630372133818, "grad_norm": 7.49537992477417, "learning_rate": 1.9318277907591777e-08, "loss": 0.4036, "num_input_tokens_seen": 189211904, "step": 155600 }, { "epoch": 19.496930209246962, "grad_norm": 22.14378547668457, "learning_rate": 1.9270295176403443e-08, "loss": 0.5658, "num_input_tokens_seen": 189217408, "step": 155605 }, { "epoch": 19.497556697155744, "grad_norm": 5.642396450042725, "learning_rate": 1.92223719940271e-08, "loss": 0.4431, "num_input_tokens_seen": 189222848, "step": 155610 }, { "epoch": 19.49818318506453, "grad_norm": 11.739970207214355, "learning_rate": 1.9174508361034516e-08, "loss": 0.4361, "num_input_tokens_seen": 189229152, "step": 155615 }, { "epoch": 19.498809672973312, "grad_norm": 8.709797859191895, "learning_rate": 1.9126704277998564e-08, "loss": 0.4549, "num_input_tokens_seen": 189235328, "step": 155620 }, { "epoch": 19.499436160882095, "grad_norm": 5.122087001800537, "learning_rate": 1.9078959745490454e-08, "loss": 0.4233, "num_input_tokens_seen": 189241184, "step": 155625 }, { "epoch": 19.500062648790877, "grad_norm": 5.68109655380249, "learning_rate": 1.9031274764080842e-08, "loss": 0.4603, "num_input_tokens_seen": 189247328, "step": 155630 }, { "epoch": 19.500689136699663, "grad_norm": 4.632761001586914, "learning_rate": 1.8983649334340382e-08, "loss": 0.4076, "num_input_tokens_seen": 189253664, "step": 155635 }, { "epoch": 19.501315624608445, "grad_norm": 16.68857765197754, "learning_rate": 1.893608345683806e-08, "loss": 0.5182, "num_input_tokens_seen": 189259936, "step": 155640 }, { "epoch": 19.501942112517227, "grad_norm": 4.594710350036621, "learning_rate": 1.8888577132142873e-08, "loss": 0.4508, "num_input_tokens_seen": 189266080, "step": 155645 }, { "epoch": 19.502568600426013, "grad_norm": 32.643009185791016, "learning_rate": 1.884113036082269e-08, "loss": 0.4419, "num_input_tokens_seen": 189272224, "step": 155650 }, { "epoch": 19.503195088334795, "grad_norm": 4.249451160430908, "learning_rate": 1.879374314344429e-08, "loss": 0.5056, "num_input_tokens_seen": 189278176, "step": 155655 }, { "epoch": 19.503821576243578, "grad_norm": 8.762002944946289, "learning_rate": 1.8746415480574986e-08, "loss": 0.4652, "num_input_tokens_seen": 189284192, "step": 155660 }, { "epoch": 19.504448064152363, "grad_norm": 7.408813953399658, "learning_rate": 1.8699147372780446e-08, "loss": 0.4153, "num_input_tokens_seen": 189290208, "step": 155665 }, { "epoch": 19.505074552061146, "grad_norm": 7.77890157699585, "learning_rate": 1.8651938820625213e-08, "loss": 0.4282, "num_input_tokens_seen": 189296320, "step": 155670 }, { "epoch": 19.505701039969928, "grad_norm": 16.982051849365234, "learning_rate": 1.8604789824674953e-08, "loss": 0.4053, "num_input_tokens_seen": 189302464, "step": 155675 }, { "epoch": 19.50632752787871, "grad_norm": 14.075927734375, "learning_rate": 1.8557700385491428e-08, "loss": 0.4368, "num_input_tokens_seen": 189308704, "step": 155680 }, { "epoch": 19.506954015787496, "grad_norm": 6.439369201660156, "learning_rate": 1.8510670503639193e-08, "loss": 0.4185, "num_input_tokens_seen": 189314656, "step": 155685 }, { "epoch": 19.50758050369628, "grad_norm": 8.262935638427734, "learning_rate": 1.8463700179680576e-08, "loss": 0.4641, "num_input_tokens_seen": 189320448, "step": 155690 }, { "epoch": 19.50820699160506, "grad_norm": 6.778791904449463, "learning_rate": 1.841678941417624e-08, "loss": 0.3803, "num_input_tokens_seen": 189326624, "step": 155695 }, { "epoch": 19.508833479513846, "grad_norm": 11.82536506652832, "learning_rate": 1.8369938207687398e-08, "loss": 0.4265, "num_input_tokens_seen": 189332320, "step": 155700 }, { "epoch": 19.50945996742263, "grad_norm": 4.253251552581787, "learning_rate": 1.8323146560774164e-08, "loss": 0.3777, "num_input_tokens_seen": 189338368, "step": 155705 }, { "epoch": 19.51008645533141, "grad_norm": 9.762985229492188, "learning_rate": 1.827641447399664e-08, "loss": 0.3961, "num_input_tokens_seen": 189344320, "step": 155710 }, { "epoch": 19.510712943240197, "grad_norm": 27.619035720825195, "learning_rate": 1.8229741947912717e-08, "loss": 0.4913, "num_input_tokens_seen": 189350208, "step": 155715 }, { "epoch": 19.51133943114898, "grad_norm": 3.8473496437072754, "learning_rate": 1.8183128983080834e-08, "loss": 0.4014, "num_input_tokens_seen": 189356544, "step": 155720 }, { "epoch": 19.51196591905776, "grad_norm": 8.444318771362305, "learning_rate": 1.813657558005777e-08, "loss": 0.4508, "num_input_tokens_seen": 189362880, "step": 155725 }, { "epoch": 19.512592406966547, "grad_norm": 41.7706413269043, "learning_rate": 1.8090081739400856e-08, "loss": 0.5582, "num_input_tokens_seen": 189369248, "step": 155730 }, { "epoch": 19.51321889487533, "grad_norm": 6.065275192260742, "learning_rate": 1.8043647461665202e-08, "loss": 0.446, "num_input_tokens_seen": 189375200, "step": 155735 }, { "epoch": 19.513845382784112, "grad_norm": 5.11458158493042, "learning_rate": 1.799727274740648e-08, "loss": 0.3532, "num_input_tokens_seen": 189381312, "step": 155740 }, { "epoch": 19.514471870692894, "grad_norm": 5.935478210449219, "learning_rate": 1.7950957597178685e-08, "loss": 0.4894, "num_input_tokens_seen": 189387648, "step": 155745 }, { "epoch": 19.51509835860168, "grad_norm": 6.559335708618164, "learning_rate": 1.790470201153638e-08, "loss": 0.3717, "num_input_tokens_seen": 189393888, "step": 155750 }, { "epoch": 19.515724846510462, "grad_norm": 5.3556742668151855, "learning_rate": 1.7858505991031892e-08, "loss": 0.4239, "num_input_tokens_seen": 189399680, "step": 155755 }, { "epoch": 19.516351334419245, "grad_norm": 18.618181228637695, "learning_rate": 1.7812369536217565e-08, "loss": 0.4831, "num_input_tokens_seen": 189405376, "step": 155760 }, { "epoch": 19.51697782232803, "grad_norm": 28.34695053100586, "learning_rate": 1.776629264764518e-08, "loss": 0.4414, "num_input_tokens_seen": 189411520, "step": 155765 }, { "epoch": 19.517604310236813, "grad_norm": 6.09425163269043, "learning_rate": 1.7720275325865954e-08, "loss": 0.4621, "num_input_tokens_seen": 189417472, "step": 155770 }, { "epoch": 19.518230798145595, "grad_norm": 6.76065731048584, "learning_rate": 1.7674317571429455e-08, "loss": 0.4227, "num_input_tokens_seen": 189423392, "step": 155775 }, { "epoch": 19.51885728605438, "grad_norm": 11.10818099975586, "learning_rate": 1.7628419384885242e-08, "loss": 0.52, "num_input_tokens_seen": 189429312, "step": 155780 }, { "epoch": 19.519483773963163, "grad_norm": 15.022310256958008, "learning_rate": 1.758258076678232e-08, "loss": 0.4259, "num_input_tokens_seen": 189435328, "step": 155785 }, { "epoch": 19.520110261871945, "grad_norm": 4.227572441101074, "learning_rate": 1.7536801717668583e-08, "loss": 0.3936, "num_input_tokens_seen": 189441184, "step": 155790 }, { "epoch": 19.520736749780728, "grad_norm": 5.556304931640625, "learning_rate": 1.749108223809193e-08, "loss": 0.5321, "num_input_tokens_seen": 189447744, "step": 155795 }, { "epoch": 19.521363237689513, "grad_norm": 5.273725986480713, "learning_rate": 1.7445422328597472e-08, "loss": 0.4404, "num_input_tokens_seen": 189453952, "step": 155800 }, { "epoch": 19.521989725598296, "grad_norm": 7.74094820022583, "learning_rate": 1.7399821989733113e-08, "loss": 0.4571, "num_input_tokens_seen": 189460128, "step": 155805 }, { "epoch": 19.522616213507078, "grad_norm": 7.864814281463623, "learning_rate": 1.7354281222042303e-08, "loss": 0.4013, "num_input_tokens_seen": 189466400, "step": 155810 }, { "epoch": 19.523242701415864, "grad_norm": 21.039278030395508, "learning_rate": 1.7308800026070717e-08, "loss": 0.4387, "num_input_tokens_seen": 189472512, "step": 155815 }, { "epoch": 19.523869189324646, "grad_norm": 7.858004093170166, "learning_rate": 1.7263378402361252e-08, "loss": 0.4517, "num_input_tokens_seen": 189478944, "step": 155820 }, { "epoch": 19.52449567723343, "grad_norm": 9.600249290466309, "learning_rate": 1.7218016351457367e-08, "loss": 0.4871, "num_input_tokens_seen": 189484928, "step": 155825 }, { "epoch": 19.525122165142214, "grad_norm": 4.91057014465332, "learning_rate": 1.717271387390196e-08, "loss": 0.458, "num_input_tokens_seen": 189490912, "step": 155830 }, { "epoch": 19.525748653050996, "grad_norm": 6.207953453063965, "learning_rate": 1.7127470970235705e-08, "loss": 0.417, "num_input_tokens_seen": 189497216, "step": 155835 }, { "epoch": 19.52637514095978, "grad_norm": 5.770716190338135, "learning_rate": 1.708228764099984e-08, "loss": 0.4309, "num_input_tokens_seen": 189503264, "step": 155840 }, { "epoch": 19.52700162886856, "grad_norm": 6.035470962524414, "learning_rate": 1.7037163886734488e-08, "loss": 0.4537, "num_input_tokens_seen": 189509344, "step": 155845 }, { "epoch": 19.527628116777347, "grad_norm": 5.477353572845459, "learning_rate": 1.699209970797977e-08, "loss": 0.3883, "num_input_tokens_seen": 189515456, "step": 155850 }, { "epoch": 19.52825460468613, "grad_norm": 3.818492889404297, "learning_rate": 1.6947095105273592e-08, "loss": 0.5037, "num_input_tokens_seen": 189521280, "step": 155855 }, { "epoch": 19.52888109259491, "grad_norm": 10.325759887695312, "learning_rate": 1.690215007915441e-08, "loss": 0.4904, "num_input_tokens_seen": 189527456, "step": 155860 }, { "epoch": 19.529507580503697, "grad_norm": 5.794528961181641, "learning_rate": 1.6857264630159576e-08, "loss": 0.3917, "num_input_tokens_seen": 189533696, "step": 155865 }, { "epoch": 19.53013406841248, "grad_norm": 5.109079360961914, "learning_rate": 1.6812438758825877e-08, "loss": 0.4994, "num_input_tokens_seen": 189540000, "step": 155870 }, { "epoch": 19.530760556321262, "grad_norm": 4.175611972808838, "learning_rate": 1.6767672465689e-08, "loss": 0.4124, "num_input_tokens_seen": 189546176, "step": 155875 }, { "epoch": 19.531387044230048, "grad_norm": 4.141369819641113, "learning_rate": 1.6722965751284626e-08, "loss": 0.4065, "num_input_tokens_seen": 189551968, "step": 155880 }, { "epoch": 19.53201353213883, "grad_norm": 4.578768730163574, "learning_rate": 1.6678318616146216e-08, "loss": 0.3851, "num_input_tokens_seen": 189557856, "step": 155885 }, { "epoch": 19.532640020047612, "grad_norm": 5.754526615142822, "learning_rate": 1.66337310608089e-08, "loss": 0.433, "num_input_tokens_seen": 189564320, "step": 155890 }, { "epoch": 19.533266507956398, "grad_norm": 5.115817546844482, "learning_rate": 1.6589203085804473e-08, "loss": 0.4375, "num_input_tokens_seen": 189570592, "step": 155895 }, { "epoch": 19.53389299586518, "grad_norm": 6.757443904876709, "learning_rate": 1.6544734691666397e-08, "loss": 0.4556, "num_input_tokens_seen": 189576480, "step": 155900 }, { "epoch": 19.534519483773963, "grad_norm": 9.11725902557373, "learning_rate": 1.6500325878925915e-08, "loss": 0.3895, "num_input_tokens_seen": 189582656, "step": 155905 }, { "epoch": 19.535145971682745, "grad_norm": 6.875350475311279, "learning_rate": 1.6455976648113712e-08, "loss": 0.4221, "num_input_tokens_seen": 189588640, "step": 155910 }, { "epoch": 19.53577245959153, "grad_norm": 14.122123718261719, "learning_rate": 1.641168699976048e-08, "loss": 0.451, "num_input_tokens_seen": 189594816, "step": 155915 }, { "epoch": 19.536398947500313, "grad_norm": 8.076043128967285, "learning_rate": 1.636745693439523e-08, "loss": 0.4902, "num_input_tokens_seen": 189601024, "step": 155920 }, { "epoch": 19.537025435409095, "grad_norm": 4.507192134857178, "learning_rate": 1.632328645254755e-08, "loss": 0.457, "num_input_tokens_seen": 189607552, "step": 155925 }, { "epoch": 19.53765192331788, "grad_norm": 24.02219581604004, "learning_rate": 1.6279175554744786e-08, "loss": 0.4232, "num_input_tokens_seen": 189613664, "step": 155930 }, { "epoch": 19.538278411226663, "grad_norm": 25.730998992919922, "learning_rate": 1.62351242415143e-08, "loss": 0.4265, "num_input_tokens_seen": 189619744, "step": 155935 }, { "epoch": 19.538904899135446, "grad_norm": 10.925774574279785, "learning_rate": 1.619113251338289e-08, "loss": 0.4111, "num_input_tokens_seen": 189625888, "step": 155940 }, { "epoch": 19.53953138704423, "grad_norm": 25.235637664794922, "learning_rate": 1.6147200370876804e-08, "loss": 0.4826, "num_input_tokens_seen": 189632064, "step": 155945 }, { "epoch": 19.540157874953014, "grad_norm": 4.528311729431152, "learning_rate": 1.610332781452173e-08, "loss": 0.3988, "num_input_tokens_seen": 189638464, "step": 155950 }, { "epoch": 19.540784362861796, "grad_norm": 6.0427937507629395, "learning_rate": 1.605951484484114e-08, "loss": 0.4553, "num_input_tokens_seen": 189644832, "step": 155955 }, { "epoch": 19.541410850770582, "grad_norm": 4.655359745025635, "learning_rate": 1.6015761462359058e-08, "loss": 0.4027, "num_input_tokens_seen": 189651040, "step": 155960 }, { "epoch": 19.542037338679364, "grad_norm": 3.9762954711914062, "learning_rate": 1.5972067667598957e-08, "loss": 0.4421, "num_input_tokens_seen": 189657120, "step": 155965 }, { "epoch": 19.542663826588146, "grad_norm": 4.5518879890441895, "learning_rate": 1.592843346108375e-08, "loss": 0.4082, "num_input_tokens_seen": 189663168, "step": 155970 }, { "epoch": 19.54329031449693, "grad_norm": 7.238092422485352, "learning_rate": 1.588485884333357e-08, "loss": 0.4673, "num_input_tokens_seen": 189669408, "step": 155975 }, { "epoch": 19.543916802405715, "grad_norm": 7.067939758300781, "learning_rate": 1.584134381487079e-08, "loss": 0.4095, "num_input_tokens_seen": 189675648, "step": 155980 }, { "epoch": 19.544543290314497, "grad_norm": 8.627470970153809, "learning_rate": 1.579788837621554e-08, "loss": 0.4689, "num_input_tokens_seen": 189681408, "step": 155985 }, { "epoch": 19.54516977822328, "grad_norm": 5.0691351890563965, "learning_rate": 1.57544925278863e-08, "loss": 0.3894, "num_input_tokens_seen": 189687840, "step": 155990 }, { "epoch": 19.545796266132065, "grad_norm": 13.819038391113281, "learning_rate": 1.5711156270403206e-08, "loss": 0.4653, "num_input_tokens_seen": 189694400, "step": 155995 }, { "epoch": 19.546422754040847, "grad_norm": 6.393338680267334, "learning_rate": 1.5667879604283065e-08, "loss": 0.469, "num_input_tokens_seen": 189700672, "step": 156000 }, { "epoch": 19.54704924194963, "grad_norm": 12.103328704833984, "learning_rate": 1.562466253004491e-08, "loss": 0.4875, "num_input_tokens_seen": 189707040, "step": 156005 }, { "epoch": 19.547675729858415, "grad_norm": 13.101383209228516, "learning_rate": 1.5581505048203882e-08, "loss": 0.5077, "num_input_tokens_seen": 189712576, "step": 156010 }, { "epoch": 19.548302217767198, "grad_norm": 34.99287414550781, "learning_rate": 1.553840715927679e-08, "loss": 0.5263, "num_input_tokens_seen": 189718304, "step": 156015 }, { "epoch": 19.54892870567598, "grad_norm": 11.867448806762695, "learning_rate": 1.549536886377878e-08, "loss": 0.4565, "num_input_tokens_seen": 189724832, "step": 156020 }, { "epoch": 19.549555193584762, "grad_norm": 3.7449285984039307, "learning_rate": 1.5452390162224994e-08, "loss": 0.4433, "num_input_tokens_seen": 189731040, "step": 156025 }, { "epoch": 19.550181681493548, "grad_norm": 6.3280768394470215, "learning_rate": 1.54094710551278e-08, "loss": 0.5014, "num_input_tokens_seen": 189737152, "step": 156030 }, { "epoch": 19.55080816940233, "grad_norm": 14.621513366699219, "learning_rate": 1.5366611543001786e-08, "loss": 0.3859, "num_input_tokens_seen": 189743168, "step": 156035 }, { "epoch": 19.551434657311113, "grad_norm": 7.026313781738281, "learning_rate": 1.5323811626358208e-08, "loss": 0.4168, "num_input_tokens_seen": 189749184, "step": 156040 }, { "epoch": 19.5520611452199, "grad_norm": 6.904962539672852, "learning_rate": 1.5281071305709992e-08, "loss": 0.4281, "num_input_tokens_seen": 189755360, "step": 156045 }, { "epoch": 19.55268763312868, "grad_norm": 10.253703117370605, "learning_rate": 1.5238390581566733e-08, "loss": 0.4903, "num_input_tokens_seen": 189761600, "step": 156050 }, { "epoch": 19.553314121037463, "grad_norm": 24.98105812072754, "learning_rate": 1.5195769454440235e-08, "loss": 0.4837, "num_input_tokens_seen": 189767712, "step": 156055 }, { "epoch": 19.55394060894625, "grad_norm": 30.1606388092041, "learning_rate": 1.5153207924838432e-08, "loss": 0.5458, "num_input_tokens_seen": 189773568, "step": 156060 }, { "epoch": 19.55456709685503, "grad_norm": 5.576061248779297, "learning_rate": 1.5110705993272025e-08, "loss": 0.428, "num_input_tokens_seen": 189779520, "step": 156065 }, { "epoch": 19.555193584763813, "grad_norm": 5.504059791564941, "learning_rate": 1.5068263660247827e-08, "loss": 0.474, "num_input_tokens_seen": 189784896, "step": 156070 }, { "epoch": 19.555820072672596, "grad_norm": 10.780135154724121, "learning_rate": 1.5025880926273217e-08, "loss": 0.4142, "num_input_tokens_seen": 189790848, "step": 156075 }, { "epoch": 19.55644656058138, "grad_norm": 17.691417694091797, "learning_rate": 1.498355779185556e-08, "loss": 0.3996, "num_input_tokens_seen": 189797184, "step": 156080 }, { "epoch": 19.557073048490164, "grad_norm": 21.632543563842773, "learning_rate": 1.4941294257501125e-08, "loss": 0.4597, "num_input_tokens_seen": 189803712, "step": 156085 }, { "epoch": 19.557699536398946, "grad_norm": 3.396491289138794, "learning_rate": 1.489909032371395e-08, "loss": 0.4571, "num_input_tokens_seen": 189809856, "step": 156090 }, { "epoch": 19.558326024307732, "grad_norm": 13.88940715789795, "learning_rate": 1.4856945990999739e-08, "loss": 0.4281, "num_input_tokens_seen": 189816000, "step": 156095 }, { "epoch": 19.558952512216514, "grad_norm": 4.409671783447266, "learning_rate": 1.4814861259861978e-08, "loss": 0.4285, "num_input_tokens_seen": 189822272, "step": 156100 }, { "epoch": 19.559579000125296, "grad_norm": 6.424086093902588, "learning_rate": 1.4772836130804158e-08, "loss": 0.5489, "num_input_tokens_seen": 189828416, "step": 156105 }, { "epoch": 19.560205488034082, "grad_norm": 5.78458309173584, "learning_rate": 1.4730870604328095e-08, "loss": 0.3934, "num_input_tokens_seen": 189834688, "step": 156110 }, { "epoch": 19.560831975942865, "grad_norm": 17.799863815307617, "learning_rate": 1.4688964680936169e-08, "loss": 0.4186, "num_input_tokens_seen": 189840864, "step": 156115 }, { "epoch": 19.561458463851647, "grad_norm": 10.299485206604004, "learning_rate": 1.4647118361129087e-08, "loss": 0.4195, "num_input_tokens_seen": 189847072, "step": 156120 }, { "epoch": 19.562084951760433, "grad_norm": 18.500850677490234, "learning_rate": 1.4605331645407007e-08, "loss": 0.4723, "num_input_tokens_seen": 189853472, "step": 156125 }, { "epoch": 19.562711439669215, "grad_norm": 6.5536789894104, "learning_rate": 1.4563604534269527e-08, "loss": 0.4332, "num_input_tokens_seen": 189859712, "step": 156130 }, { "epoch": 19.563337927577997, "grad_norm": 27.782093048095703, "learning_rate": 1.4521937028215694e-08, "loss": 0.5034, "num_input_tokens_seen": 189865952, "step": 156135 }, { "epoch": 19.56396441548678, "grad_norm": 12.066381454467773, "learning_rate": 1.4480329127743443e-08, "loss": 0.4426, "num_input_tokens_seen": 189872000, "step": 156140 }, { "epoch": 19.564590903395565, "grad_norm": 15.506444931030273, "learning_rate": 1.4438780833351263e-08, "loss": 0.5729, "num_input_tokens_seen": 189878144, "step": 156145 }, { "epoch": 19.565217391304348, "grad_norm": 7.011914253234863, "learning_rate": 1.4397292145534314e-08, "loss": 0.4332, "num_input_tokens_seen": 189884384, "step": 156150 }, { "epoch": 19.56584387921313, "grad_norm": 9.877708435058594, "learning_rate": 1.4355863064789976e-08, "loss": 0.5405, "num_input_tokens_seen": 189890048, "step": 156155 }, { "epoch": 19.566470367121916, "grad_norm": 7.656589031219482, "learning_rate": 1.43144935916123e-08, "loss": 0.4976, "num_input_tokens_seen": 189896160, "step": 156160 }, { "epoch": 19.567096855030698, "grad_norm": 3.756047248840332, "learning_rate": 1.4273183726497552e-08, "loss": 0.4274, "num_input_tokens_seen": 189902368, "step": 156165 }, { "epoch": 19.56772334293948, "grad_norm": 6.866871356964111, "learning_rate": 1.4231933469937565e-08, "loss": 0.4123, "num_input_tokens_seen": 189908448, "step": 156170 }, { "epoch": 19.568349830848266, "grad_norm": 5.498847007751465, "learning_rate": 1.4190742822427494e-08, "loss": 0.3946, "num_input_tokens_seen": 189914656, "step": 156175 }, { "epoch": 19.56897631875705, "grad_norm": 8.790595054626465, "learning_rate": 1.4149611784458616e-08, "loss": 0.4047, "num_input_tokens_seen": 189920832, "step": 156180 }, { "epoch": 19.56960280666583, "grad_norm": 17.050737380981445, "learning_rate": 1.410854035652276e-08, "loss": 0.4976, "num_input_tokens_seen": 189927232, "step": 156185 }, { "epoch": 19.570229294574613, "grad_norm": 6.174343585968018, "learning_rate": 1.4067528539111752e-08, "loss": 0.4194, "num_input_tokens_seen": 189933184, "step": 156190 }, { "epoch": 19.5708557824834, "grad_norm": 14.32563304901123, "learning_rate": 1.4026576332714647e-08, "loss": 0.458, "num_input_tokens_seen": 189939104, "step": 156195 }, { "epoch": 19.57148227039218, "grad_norm": 5.407043933868408, "learning_rate": 1.3985683737822164e-08, "loss": 0.3723, "num_input_tokens_seen": 189945312, "step": 156200 }, { "epoch": 19.572108758300963, "grad_norm": 8.938302040100098, "learning_rate": 1.3944850754923356e-08, "loss": 0.4474, "num_input_tokens_seen": 189951264, "step": 156205 }, { "epoch": 19.57273524620975, "grad_norm": 14.176200866699219, "learning_rate": 1.3904077384505054e-08, "loss": 0.4267, "num_input_tokens_seen": 189957408, "step": 156210 }, { "epoch": 19.57336173411853, "grad_norm": 14.226447105407715, "learning_rate": 1.386336362705576e-08, "loss": 0.412, "num_input_tokens_seen": 189963584, "step": 156215 }, { "epoch": 19.573988222027314, "grad_norm": 10.295361518859863, "learning_rate": 1.3822709483062302e-08, "loss": 0.5156, "num_input_tokens_seen": 189969952, "step": 156220 }, { "epoch": 19.5746147099361, "grad_norm": 9.344218254089355, "learning_rate": 1.3782114953010406e-08, "loss": 0.4019, "num_input_tokens_seen": 189976416, "step": 156225 }, { "epoch": 19.575241197844882, "grad_norm": 9.120819091796875, "learning_rate": 1.3741580037385793e-08, "loss": 0.4018, "num_input_tokens_seen": 189982720, "step": 156230 }, { "epoch": 19.575867685753664, "grad_norm": 18.63920783996582, "learning_rate": 1.3701104736672521e-08, "loss": 0.5234, "num_input_tokens_seen": 189989024, "step": 156235 }, { "epoch": 19.576494173662446, "grad_norm": 19.786619186401367, "learning_rate": 1.3660689051355203e-08, "loss": 0.45, "num_input_tokens_seen": 189995136, "step": 156240 }, { "epoch": 19.577120661571232, "grad_norm": 5.418338298797607, "learning_rate": 1.362033298191623e-08, "loss": 0.5823, "num_input_tokens_seen": 190001184, "step": 156245 }, { "epoch": 19.577747149480015, "grad_norm": 5.177588939666748, "learning_rate": 1.3580036528838547e-08, "loss": 0.3889, "num_input_tokens_seen": 190007392, "step": 156250 }, { "epoch": 19.578373637388797, "grad_norm": 5.105847358703613, "learning_rate": 1.3539799692603438e-08, "loss": 0.4537, "num_input_tokens_seen": 190013280, "step": 156255 }, { "epoch": 19.579000125297583, "grad_norm": 4.857492923736572, "learning_rate": 1.3499622473693296e-08, "loss": 0.4248, "num_input_tokens_seen": 190019552, "step": 156260 }, { "epoch": 19.579626613206365, "grad_norm": 13.455121040344238, "learning_rate": 1.3459504872587182e-08, "loss": 0.4687, "num_input_tokens_seen": 190025632, "step": 156265 }, { "epoch": 19.580253101115147, "grad_norm": 4.653132915496826, "learning_rate": 1.341944688976471e-08, "loss": 0.4798, "num_input_tokens_seen": 190031616, "step": 156270 }, { "epoch": 19.580879589023933, "grad_norm": 3.595463275909424, "learning_rate": 1.3379448525706051e-08, "loss": 0.3964, "num_input_tokens_seen": 190037696, "step": 156275 }, { "epoch": 19.581506076932715, "grad_norm": 5.479983329772949, "learning_rate": 1.333950978088805e-08, "loss": 0.4156, "num_input_tokens_seen": 190043168, "step": 156280 }, { "epoch": 19.582132564841498, "grad_norm": 3.258186101913452, "learning_rate": 1.3299630655788653e-08, "loss": 0.448, "num_input_tokens_seen": 190049376, "step": 156285 }, { "epoch": 19.582759052750284, "grad_norm": 6.666043758392334, "learning_rate": 1.325981115088526e-08, "loss": 0.4244, "num_input_tokens_seen": 190055712, "step": 156290 }, { "epoch": 19.583385540659066, "grad_norm": 6.748082637786865, "learning_rate": 1.3220051266653044e-08, "loss": 0.4206, "num_input_tokens_seen": 190062080, "step": 156295 }, { "epoch": 19.584012028567848, "grad_norm": 13.163166046142578, "learning_rate": 1.3180351003567738e-08, "loss": 0.4831, "num_input_tokens_seen": 190068032, "step": 156300 }, { "epoch": 19.58463851647663, "grad_norm": 18.66222381591797, "learning_rate": 1.314071036210396e-08, "loss": 0.4871, "num_input_tokens_seen": 190074624, "step": 156305 }, { "epoch": 19.585265004385416, "grad_norm": 8.351860046386719, "learning_rate": 1.310112934273633e-08, "loss": 0.4165, "num_input_tokens_seen": 190080864, "step": 156310 }, { "epoch": 19.5858914922942, "grad_norm": 24.376251220703125, "learning_rate": 1.3061607945936693e-08, "loss": 0.5222, "num_input_tokens_seen": 190087104, "step": 156315 }, { "epoch": 19.58651798020298, "grad_norm": 19.145261764526367, "learning_rate": 1.3022146172179118e-08, "loss": 0.4694, "num_input_tokens_seen": 190093152, "step": 156320 }, { "epoch": 19.587144468111767, "grad_norm": 12.769206047058105, "learning_rate": 1.298274402193378e-08, "loss": 0.4078, "num_input_tokens_seen": 190099488, "step": 156325 }, { "epoch": 19.58777095602055, "grad_norm": 10.178936958312988, "learning_rate": 1.294340149567308e-08, "loss": 0.4412, "num_input_tokens_seen": 190105760, "step": 156330 }, { "epoch": 19.58839744392933, "grad_norm": 8.771594047546387, "learning_rate": 1.2904118593866643e-08, "loss": 0.4126, "num_input_tokens_seen": 190111936, "step": 156335 }, { "epoch": 19.589023931838117, "grad_norm": 8.530159950256348, "learning_rate": 1.286489531698465e-08, "loss": 0.5478, "num_input_tokens_seen": 190118016, "step": 156340 }, { "epoch": 19.5896504197469, "grad_norm": 6.751772880554199, "learning_rate": 1.282573166549561e-08, "loss": 0.4545, "num_input_tokens_seen": 190124160, "step": 156345 }, { "epoch": 19.59027690765568, "grad_norm": 13.388819694519043, "learning_rate": 1.2786627639868044e-08, "loss": 0.4243, "num_input_tokens_seen": 190130272, "step": 156350 }, { "epoch": 19.590903395564467, "grad_norm": 5.532272815704346, "learning_rate": 1.274758324056935e-08, "loss": 0.4397, "num_input_tokens_seen": 190135456, "step": 156355 }, { "epoch": 19.59152988347325, "grad_norm": 16.38470458984375, "learning_rate": 1.270859846806638e-08, "loss": 0.5077, "num_input_tokens_seen": 190141888, "step": 156360 }, { "epoch": 19.592156371382032, "grad_norm": 6.300779819488525, "learning_rate": 1.2669673322825426e-08, "loss": 0.447, "num_input_tokens_seen": 190148064, "step": 156365 }, { "epoch": 19.592782859290814, "grad_norm": 5.413578987121582, "learning_rate": 1.2630807805311119e-08, "loss": 0.4267, "num_input_tokens_seen": 190153152, "step": 156370 }, { "epoch": 19.5934093471996, "grad_norm": 7.280312538146973, "learning_rate": 1.2592001915989193e-08, "loss": 0.3846, "num_input_tokens_seen": 190159104, "step": 156375 }, { "epoch": 19.594035835108382, "grad_norm": 10.10534954071045, "learning_rate": 1.2553255655322616e-08, "loss": 0.4637, "num_input_tokens_seen": 190165216, "step": 156380 }, { "epoch": 19.594662323017165, "grad_norm": 19.69411849975586, "learning_rate": 1.2514569023775459e-08, "loss": 0.4988, "num_input_tokens_seen": 190171328, "step": 156385 }, { "epoch": 19.59528881092595, "grad_norm": 6.092004776000977, "learning_rate": 1.2475942021809573e-08, "loss": 0.3961, "num_input_tokens_seen": 190177824, "step": 156390 }, { "epoch": 19.595915298834733, "grad_norm": 9.81521987915039, "learning_rate": 1.2437374649886814e-08, "loss": 0.4533, "num_input_tokens_seen": 190183872, "step": 156395 }, { "epoch": 19.596541786743515, "grad_norm": 20.747629165649414, "learning_rate": 1.2398866908469032e-08, "loss": 0.4814, "num_input_tokens_seen": 190189792, "step": 156400 }, { "epoch": 19.5971682746523, "grad_norm": 28.994083404541016, "learning_rate": 1.236041879801586e-08, "loss": 0.5403, "num_input_tokens_seen": 190196064, "step": 156405 }, { "epoch": 19.597794762561083, "grad_norm": 12.053095817565918, "learning_rate": 1.2322030318987488e-08, "loss": 0.4661, "num_input_tokens_seen": 190202496, "step": 156410 }, { "epoch": 19.598421250469865, "grad_norm": 3.9777917861938477, "learning_rate": 1.2283701471842435e-08, "loss": 0.4055, "num_input_tokens_seen": 190208640, "step": 156415 }, { "epoch": 19.599047738378648, "grad_norm": 5.4788007736206055, "learning_rate": 1.2245432257039225e-08, "loss": 0.4313, "num_input_tokens_seen": 190214752, "step": 156420 }, { "epoch": 19.599674226287433, "grad_norm": 20.454029083251953, "learning_rate": 1.2207222675035824e-08, "loss": 0.431, "num_input_tokens_seen": 190221056, "step": 156425 }, { "epoch": 19.600300714196216, "grad_norm": 10.35576057434082, "learning_rate": 1.2169072726287978e-08, "loss": 0.4558, "num_input_tokens_seen": 190227008, "step": 156430 }, { "epoch": 19.600927202104998, "grad_norm": 12.691393852233887, "learning_rate": 1.2130982411252545e-08, "loss": 0.4123, "num_input_tokens_seen": 190233248, "step": 156435 }, { "epoch": 19.601553690013784, "grad_norm": 5.520246982574463, "learning_rate": 1.2092951730384717e-08, "loss": 0.4108, "num_input_tokens_seen": 190239424, "step": 156440 }, { "epoch": 19.602180177922566, "grad_norm": 5.182805061340332, "learning_rate": 1.2054980684139683e-08, "loss": 0.4113, "num_input_tokens_seen": 190245600, "step": 156445 }, { "epoch": 19.60280666583135, "grad_norm": 5.544539928436279, "learning_rate": 1.2017069272970416e-08, "loss": 0.4949, "num_input_tokens_seen": 190251360, "step": 156450 }, { "epoch": 19.603433153740134, "grad_norm": 19.04935646057129, "learning_rate": 1.1979217497330997e-08, "loss": 0.4163, "num_input_tokens_seen": 190257408, "step": 156455 }, { "epoch": 19.604059641648917, "grad_norm": 9.787232398986816, "learning_rate": 1.1941425357673842e-08, "loss": 0.3875, "num_input_tokens_seen": 190263360, "step": 156460 }, { "epoch": 19.6046861295577, "grad_norm": 6.556876182556152, "learning_rate": 1.190369285445081e-08, "loss": 0.4846, "num_input_tokens_seen": 190269632, "step": 156465 }, { "epoch": 19.60531261746648, "grad_norm": 16.4685115814209, "learning_rate": 1.1866019988112654e-08, "loss": 0.4345, "num_input_tokens_seen": 190275840, "step": 156470 }, { "epoch": 19.605939105375267, "grad_norm": 4.56020450592041, "learning_rate": 1.1828406759110122e-08, "loss": 0.4589, "num_input_tokens_seen": 190281984, "step": 156475 }, { "epoch": 19.60656559328405, "grad_norm": 18.107406616210938, "learning_rate": 1.1790853167892858e-08, "loss": 0.4786, "num_input_tokens_seen": 190287840, "step": 156480 }, { "epoch": 19.60719208119283, "grad_norm": 8.005188941955566, "learning_rate": 1.1753359214909388e-08, "loss": 0.4351, "num_input_tokens_seen": 190293984, "step": 156485 }, { "epoch": 19.607818569101617, "grad_norm": 18.79005241394043, "learning_rate": 1.17159249006088e-08, "loss": 0.4621, "num_input_tokens_seen": 190300224, "step": 156490 }, { "epoch": 19.6084450570104, "grad_norm": 9.843677520751953, "learning_rate": 1.1678550225437957e-08, "loss": 0.4368, "num_input_tokens_seen": 190306400, "step": 156495 }, { "epoch": 19.609071544919182, "grad_norm": 4.301514625549316, "learning_rate": 1.164123518984428e-08, "loss": 0.3987, "num_input_tokens_seen": 190312672, "step": 156500 }, { "epoch": 19.609698032827968, "grad_norm": 25.247922897338867, "learning_rate": 1.1603979794273524e-08, "loss": 0.4456, "num_input_tokens_seen": 190318720, "step": 156505 }, { "epoch": 19.61032452073675, "grad_norm": 6.869872093200684, "learning_rate": 1.1566784039171442e-08, "loss": 0.4899, "num_input_tokens_seen": 190324704, "step": 156510 }, { "epoch": 19.610951008645532, "grad_norm": 17.063791275024414, "learning_rate": 1.1529647924982123e-08, "loss": 0.4137, "num_input_tokens_seen": 190330304, "step": 156515 }, { "epoch": 19.611577496554318, "grad_norm": 5.335415840148926, "learning_rate": 1.149257145214966e-08, "loss": 0.3994, "num_input_tokens_seen": 190336384, "step": 156520 }, { "epoch": 19.6122039844631, "grad_norm": 6.638374328613281, "learning_rate": 1.1455554621118137e-08, "loss": 0.4178, "num_input_tokens_seen": 190342560, "step": 156525 }, { "epoch": 19.612830472371883, "grad_norm": 10.405830383300781, "learning_rate": 1.1418597432329426e-08, "loss": 0.4227, "num_input_tokens_seen": 190348736, "step": 156530 }, { "epoch": 19.613456960280665, "grad_norm": 4.291383266448975, "learning_rate": 1.138169988622595e-08, "loss": 0.4493, "num_input_tokens_seen": 190354272, "step": 156535 }, { "epoch": 19.61408344818945, "grad_norm": 41.55030822753906, "learning_rate": 1.1344861983247912e-08, "loss": 0.5478, "num_input_tokens_seen": 190360416, "step": 156540 }, { "epoch": 19.614709936098233, "grad_norm": 4.861684799194336, "learning_rate": 1.1308083723836627e-08, "loss": 0.4233, "num_input_tokens_seen": 190366240, "step": 156545 }, { "epoch": 19.615336424007015, "grad_norm": 24.617507934570312, "learning_rate": 1.1271365108431187e-08, "loss": 0.4421, "num_input_tokens_seen": 190372128, "step": 156550 }, { "epoch": 19.6159629119158, "grad_norm": 4.537538528442383, "learning_rate": 1.123470613747124e-08, "loss": 0.4219, "num_input_tokens_seen": 190377984, "step": 156555 }, { "epoch": 19.616589399824583, "grad_norm": 7.2406487464904785, "learning_rate": 1.1198106811394216e-08, "loss": 0.5063, "num_input_tokens_seen": 190384224, "step": 156560 }, { "epoch": 19.617215887733366, "grad_norm": 23.172237396240234, "learning_rate": 1.1161567130638095e-08, "loss": 0.4666, "num_input_tokens_seen": 190390368, "step": 156565 }, { "epoch": 19.61784237564215, "grad_norm": 4.833317279815674, "learning_rate": 1.1125087095640308e-08, "loss": 0.5225, "num_input_tokens_seen": 190396288, "step": 156570 }, { "epoch": 19.618468863550934, "grad_norm": 8.543435096740723, "learning_rate": 1.1088666706836615e-08, "loss": 0.4401, "num_input_tokens_seen": 190402208, "step": 156575 }, { "epoch": 19.619095351459716, "grad_norm": 18.248010635375977, "learning_rate": 1.1052305964662224e-08, "loss": 0.4416, "num_input_tokens_seen": 190408416, "step": 156580 }, { "epoch": 19.619721839368502, "grad_norm": 5.440680503845215, "learning_rate": 1.1016004869551788e-08, "loss": 0.464, "num_input_tokens_seen": 190414400, "step": 156585 }, { "epoch": 19.620348327277284, "grad_norm": 9.88288688659668, "learning_rate": 1.0979763421939404e-08, "loss": 0.4288, "num_input_tokens_seen": 190420544, "step": 156590 }, { "epoch": 19.620974815186067, "grad_norm": 5.478298187255859, "learning_rate": 1.0943581622258614e-08, "loss": 0.4272, "num_input_tokens_seen": 190426976, "step": 156595 }, { "epoch": 19.62160130309485, "grad_norm": 7.6235880851745605, "learning_rate": 1.090745947094185e-08, "loss": 0.3975, "num_input_tokens_seen": 190433408, "step": 156600 }, { "epoch": 19.622227791003635, "grad_norm": 25.151094436645508, "learning_rate": 1.0871396968420988e-08, "loss": 0.4589, "num_input_tokens_seen": 190439488, "step": 156605 }, { "epoch": 19.622854278912417, "grad_norm": 6.282964706420898, "learning_rate": 1.083539411512735e-08, "loss": 0.4281, "num_input_tokens_seen": 190445664, "step": 156610 }, { "epoch": 19.6234807668212, "grad_norm": 7.537769794464111, "learning_rate": 1.079945091149115e-08, "loss": 0.5165, "num_input_tokens_seen": 190451488, "step": 156615 }, { "epoch": 19.624107254729985, "grad_norm": 14.276000022888184, "learning_rate": 1.0763567357942595e-08, "loss": 0.4593, "num_input_tokens_seen": 190457504, "step": 156620 }, { "epoch": 19.624733742638767, "grad_norm": 9.617938995361328, "learning_rate": 1.0727743454909678e-08, "loss": 0.5005, "num_input_tokens_seen": 190463840, "step": 156625 }, { "epoch": 19.62536023054755, "grad_norm": 6.178868770599365, "learning_rate": 1.06919792028215e-08, "loss": 0.4947, "num_input_tokens_seen": 190470144, "step": 156630 }, { "epoch": 19.625986718456335, "grad_norm": 7.413523197174072, "learning_rate": 1.0656274602106054e-08, "loss": 0.4092, "num_input_tokens_seen": 190475744, "step": 156635 }, { "epoch": 19.626613206365118, "grad_norm": 18.732830047607422, "learning_rate": 1.0620629653189107e-08, "loss": 0.4815, "num_input_tokens_seen": 190482016, "step": 156640 }, { "epoch": 19.6272396942739, "grad_norm": 5.204065799713135, "learning_rate": 1.0585044356497542e-08, "loss": 0.4099, "num_input_tokens_seen": 190488096, "step": 156645 }, { "epoch": 19.627866182182682, "grad_norm": 17.68103790283203, "learning_rate": 1.0549518712456019e-08, "loss": 0.4386, "num_input_tokens_seen": 190494272, "step": 156650 }, { "epoch": 19.628492670091468, "grad_norm": 7.495068550109863, "learning_rate": 1.0514052721490865e-08, "loss": 0.4725, "num_input_tokens_seen": 190500064, "step": 156655 }, { "epoch": 19.62911915800025, "grad_norm": 7.008694171905518, "learning_rate": 1.0478646384024516e-08, "loss": 0.4039, "num_input_tokens_seen": 190506368, "step": 156660 }, { "epoch": 19.629745645909033, "grad_norm": 4.030392169952393, "learning_rate": 1.0443299700481079e-08, "loss": 0.5276, "num_input_tokens_seen": 190512768, "step": 156665 }, { "epoch": 19.63037213381782, "grad_norm": 10.435715675354004, "learning_rate": 1.0408012671282996e-08, "loss": 0.4335, "num_input_tokens_seen": 190518976, "step": 156670 }, { "epoch": 19.6309986217266, "grad_norm": 12.275649070739746, "learning_rate": 1.0372785296852706e-08, "loss": 0.4698, "num_input_tokens_seen": 190524512, "step": 156675 }, { "epoch": 19.631625109635383, "grad_norm": 2.761693000793457, "learning_rate": 1.033761757760987e-08, "loss": 0.4411, "num_input_tokens_seen": 190530176, "step": 156680 }, { "epoch": 19.63225159754417, "grad_norm": 4.811076641082764, "learning_rate": 1.0302509513976378e-08, "loss": 0.4127, "num_input_tokens_seen": 190536224, "step": 156685 }, { "epoch": 19.63287808545295, "grad_norm": 13.393705368041992, "learning_rate": 1.0267461106371335e-08, "loss": 0.4136, "num_input_tokens_seen": 190542592, "step": 156690 }, { "epoch": 19.633504573361733, "grad_norm": 29.239927291870117, "learning_rate": 1.0232472355213851e-08, "loss": 0.4442, "num_input_tokens_seen": 190548960, "step": 156695 }, { "epoch": 19.634131061270516, "grad_norm": 8.831403732299805, "learning_rate": 1.0197543260922482e-08, "loss": 0.3966, "num_input_tokens_seen": 190555008, "step": 156700 }, { "epoch": 19.6347575491793, "grad_norm": 8.112895965576172, "learning_rate": 1.016267382391467e-08, "loss": 0.3773, "num_input_tokens_seen": 190560192, "step": 156705 }, { "epoch": 19.635384037088084, "grad_norm": 5.260482311248779, "learning_rate": 1.012786404460786e-08, "loss": 0.4722, "num_input_tokens_seen": 190566208, "step": 156710 }, { "epoch": 19.636010524996866, "grad_norm": 5.151381969451904, "learning_rate": 1.009311392341672e-08, "loss": 0.4524, "num_input_tokens_seen": 190572128, "step": 156715 }, { "epoch": 19.636637012905652, "grad_norm": 5.41762113571167, "learning_rate": 1.0058423460758137e-08, "loss": 0.4481, "num_input_tokens_seen": 190578240, "step": 156720 }, { "epoch": 19.637263500814434, "grad_norm": 3.927870750427246, "learning_rate": 1.0023792657046227e-08, "loss": 0.3949, "num_input_tokens_seen": 190584160, "step": 156725 }, { "epoch": 19.637889988723217, "grad_norm": 9.794718742370605, "learning_rate": 9.989221512695658e-09, "loss": 0.5022, "num_input_tokens_seen": 190590144, "step": 156730 }, { "epoch": 19.638516476632002, "grad_norm": 6.63735818862915, "learning_rate": 9.954710028118874e-09, "loss": 0.4525, "num_input_tokens_seen": 190595648, "step": 156735 }, { "epoch": 19.639142964540785, "grad_norm": 15.447099685668945, "learning_rate": 9.920258203729438e-09, "loss": 0.4295, "num_input_tokens_seen": 190601664, "step": 156740 }, { "epoch": 19.639769452449567, "grad_norm": 17.01462173461914, "learning_rate": 9.88586603993813e-09, "loss": 0.4157, "num_input_tokens_seen": 190607872, "step": 156745 }, { "epoch": 19.640395940358353, "grad_norm": 6.540477752685547, "learning_rate": 9.851533537156844e-09, "loss": 0.4105, "num_input_tokens_seen": 190613504, "step": 156750 }, { "epoch": 19.641022428267135, "grad_norm": 2.8667187690734863, "learning_rate": 9.81726069579636e-09, "loss": 0.4179, "num_input_tokens_seen": 190619552, "step": 156755 }, { "epoch": 19.641648916175917, "grad_norm": 7.6757121086120605, "learning_rate": 9.783047516265798e-09, "loss": 0.4217, "num_input_tokens_seen": 190625472, "step": 156760 }, { "epoch": 19.6422754040847, "grad_norm": 5.812093734741211, "learning_rate": 9.748893998974828e-09, "loss": 0.4272, "num_input_tokens_seen": 190631584, "step": 156765 }, { "epoch": 19.642901891993485, "grad_norm": 5.930614471435547, "learning_rate": 9.714800144330904e-09, "loss": 0.441, "num_input_tokens_seen": 190637344, "step": 156770 }, { "epoch": 19.643528379902268, "grad_norm": 6.028167247772217, "learning_rate": 9.680765952742032e-09, "loss": 0.4247, "num_input_tokens_seen": 190643392, "step": 156775 }, { "epoch": 19.64415486781105, "grad_norm": 4.004083633422852, "learning_rate": 9.646791424615664e-09, "loss": 0.4648, "num_input_tokens_seen": 190649728, "step": 156780 }, { "epoch": 19.644781355719836, "grad_norm": 5.4200897216796875, "learning_rate": 9.612876560357588e-09, "loss": 0.4174, "num_input_tokens_seen": 190655968, "step": 156785 }, { "epoch": 19.645407843628618, "grad_norm": 6.353776931762695, "learning_rate": 9.57902136037303e-09, "loss": 0.3923, "num_input_tokens_seen": 190662080, "step": 156790 }, { "epoch": 19.6460343315374, "grad_norm": 15.499614715576172, "learning_rate": 9.545225825066672e-09, "loss": 0.4167, "num_input_tokens_seen": 190668128, "step": 156795 }, { "epoch": 19.646660819446186, "grad_norm": 5.830363750457764, "learning_rate": 9.511489954843189e-09, "loss": 0.4173, "num_input_tokens_seen": 190674144, "step": 156800 }, { "epoch": 19.64728730735497, "grad_norm": 5.057585716247559, "learning_rate": 9.477813750106147e-09, "loss": 0.4176, "num_input_tokens_seen": 190680288, "step": 156805 }, { "epoch": 19.64791379526375, "grad_norm": 5.305849552154541, "learning_rate": 9.44419721125689e-09, "loss": 0.4044, "num_input_tokens_seen": 190686656, "step": 156810 }, { "epoch": 19.648540283172533, "grad_norm": 11.27826976776123, "learning_rate": 9.41064033869843e-09, "loss": 0.447, "num_input_tokens_seen": 190692736, "step": 156815 }, { "epoch": 19.64916677108132, "grad_norm": 12.119172096252441, "learning_rate": 9.377143132831557e-09, "loss": 0.4814, "num_input_tokens_seen": 190698624, "step": 156820 }, { "epoch": 19.6497932589901, "grad_norm": 3.3739264011383057, "learning_rate": 9.343705594056507e-09, "loss": 0.4514, "num_input_tokens_seen": 190704864, "step": 156825 }, { "epoch": 19.650419746898883, "grad_norm": 6.94520378112793, "learning_rate": 9.310327722773515e-09, "loss": 0.4114, "num_input_tokens_seen": 190710976, "step": 156830 }, { "epoch": 19.65104623480767, "grad_norm": 7.231636047363281, "learning_rate": 9.277009519381708e-09, "loss": 0.4312, "num_input_tokens_seen": 190717184, "step": 156835 }, { "epoch": 19.65167272271645, "grad_norm": 18.232297897338867, "learning_rate": 9.243750984279098e-09, "loss": 0.457, "num_input_tokens_seen": 190723264, "step": 156840 }, { "epoch": 19.652299210625234, "grad_norm": 11.414685249328613, "learning_rate": 9.210552117863703e-09, "loss": 0.4329, "num_input_tokens_seen": 190729120, "step": 156845 }, { "epoch": 19.65292569853402, "grad_norm": 6.855981826782227, "learning_rate": 9.17741292053187e-09, "loss": 0.4773, "num_input_tokens_seen": 190735392, "step": 156850 }, { "epoch": 19.653552186442802, "grad_norm": 5.534736633300781, "learning_rate": 9.144333392680505e-09, "loss": 0.492, "num_input_tokens_seen": 190741056, "step": 156855 }, { "epoch": 19.654178674351584, "grad_norm": 50.22950744628906, "learning_rate": 9.111313534704292e-09, "loss": 0.5094, "num_input_tokens_seen": 190747136, "step": 156860 }, { "epoch": 19.654805162260367, "grad_norm": 5.444449424743652, "learning_rate": 9.07835334699847e-09, "loss": 0.4437, "num_input_tokens_seen": 190753088, "step": 156865 }, { "epoch": 19.655431650169152, "grad_norm": 9.80694580078125, "learning_rate": 9.04545282995717e-09, "loss": 0.3914, "num_input_tokens_seen": 190758976, "step": 156870 }, { "epoch": 19.656058138077935, "grad_norm": 4.810057163238525, "learning_rate": 9.012611983973963e-09, "loss": 0.4331, "num_input_tokens_seen": 190765248, "step": 156875 }, { "epoch": 19.656684625986717, "grad_norm": 4.842149257659912, "learning_rate": 8.979830809440759e-09, "loss": 0.4192, "num_input_tokens_seen": 190771456, "step": 156880 }, { "epoch": 19.657311113895503, "grad_norm": 7.50358772277832, "learning_rate": 8.947109306750023e-09, "loss": 0.4548, "num_input_tokens_seen": 190777472, "step": 156885 }, { "epoch": 19.657937601804285, "grad_norm": 7.860629081726074, "learning_rate": 8.914447476292554e-09, "loss": 0.4391, "num_input_tokens_seen": 190783808, "step": 156890 }, { "epoch": 19.658564089713067, "grad_norm": 21.98395538330078, "learning_rate": 8.881845318459703e-09, "loss": 0.4943, "num_input_tokens_seen": 190790176, "step": 156895 }, { "epoch": 19.659190577621853, "grad_norm": 5.439471244812012, "learning_rate": 8.84930283364005e-09, "loss": 0.3896, "num_input_tokens_seen": 190796320, "step": 156900 }, { "epoch": 19.659817065530635, "grad_norm": 7.19773006439209, "learning_rate": 8.816820022223838e-09, "loss": 0.3733, "num_input_tokens_seen": 190802144, "step": 156905 }, { "epoch": 19.660443553439418, "grad_norm": 3.7828972339630127, "learning_rate": 8.784396884598534e-09, "loss": 0.4637, "num_input_tokens_seen": 190808224, "step": 156910 }, { "epoch": 19.661070041348204, "grad_norm": 9.056641578674316, "learning_rate": 8.752033421152717e-09, "loss": 0.4507, "num_input_tokens_seen": 190814336, "step": 156915 }, { "epoch": 19.661696529256986, "grad_norm": 7.185752868652344, "learning_rate": 8.71972963227219e-09, "loss": 0.53, "num_input_tokens_seen": 190820256, "step": 156920 }, { "epoch": 19.662323017165768, "grad_norm": 6.736793518066406, "learning_rate": 8.687485518343863e-09, "loss": 0.48, "num_input_tokens_seen": 190826432, "step": 156925 }, { "epoch": 19.66294950507455, "grad_norm": 9.737590789794922, "learning_rate": 8.655301079752986e-09, "loss": 0.4096, "num_input_tokens_seen": 190832512, "step": 156930 }, { "epoch": 19.663575992983336, "grad_norm": 6.13426399230957, "learning_rate": 8.623176316884807e-09, "loss": 0.4402, "num_input_tokens_seen": 190838592, "step": 156935 }, { "epoch": 19.66420248089212, "grad_norm": 14.279072761535645, "learning_rate": 8.591111230122907e-09, "loss": 0.4481, "num_input_tokens_seen": 190844768, "step": 156940 }, { "epoch": 19.6648289688009, "grad_norm": 4.512223720550537, "learning_rate": 8.559105819850865e-09, "loss": 0.445, "num_input_tokens_seen": 190851104, "step": 156945 }, { "epoch": 19.665455456709687, "grad_norm": 33.73993682861328, "learning_rate": 8.527160086451158e-09, "loss": 0.5573, "num_input_tokens_seen": 190857184, "step": 156950 }, { "epoch": 19.66608194461847, "grad_norm": 16.0081787109375, "learning_rate": 8.4952740303057e-09, "loss": 0.4567, "num_input_tokens_seen": 190862624, "step": 156955 }, { "epoch": 19.66670843252725, "grad_norm": 20.98069190979004, "learning_rate": 8.463447651796409e-09, "loss": 0.4788, "num_input_tokens_seen": 190868544, "step": 156960 }, { "epoch": 19.667334920436037, "grad_norm": 5.166304111480713, "learning_rate": 8.431680951302979e-09, "loss": 0.3956, "num_input_tokens_seen": 190874816, "step": 156965 }, { "epoch": 19.66796140834482, "grad_norm": 6.179839134216309, "learning_rate": 8.399973929205108e-09, "loss": 0.4724, "num_input_tokens_seen": 190880960, "step": 156970 }, { "epoch": 19.6685878962536, "grad_norm": 16.550642013549805, "learning_rate": 8.368326585882491e-09, "loss": 0.4349, "num_input_tokens_seen": 190887360, "step": 156975 }, { "epoch": 19.669214384162387, "grad_norm": 5.319772720336914, "learning_rate": 8.336738921713716e-09, "loss": 0.4147, "num_input_tokens_seen": 190893504, "step": 156980 }, { "epoch": 19.66984087207117, "grad_norm": 5.683206558227539, "learning_rate": 8.305210937075702e-09, "loss": 0.4026, "num_input_tokens_seen": 190899744, "step": 156985 }, { "epoch": 19.670467359979952, "grad_norm": 6.391060829162598, "learning_rate": 8.273742632345372e-09, "loss": 0.416, "num_input_tokens_seen": 190905664, "step": 156990 }, { "epoch": 19.671093847888734, "grad_norm": 7.35654878616333, "learning_rate": 8.242334007899644e-09, "loss": 0.4175, "num_input_tokens_seen": 190911712, "step": 156995 }, { "epoch": 19.67172033579752, "grad_norm": 15.748710632324219, "learning_rate": 8.21098506411322e-09, "loss": 0.4046, "num_input_tokens_seen": 190917792, "step": 157000 }, { "epoch": 19.672346823706302, "grad_norm": 28.115314483642578, "learning_rate": 8.179695801361908e-09, "loss": 0.4849, "num_input_tokens_seen": 190923936, "step": 157005 }, { "epoch": 19.672973311615085, "grad_norm": 20.243661880493164, "learning_rate": 8.148466220018746e-09, "loss": 0.471, "num_input_tokens_seen": 190930176, "step": 157010 }, { "epoch": 19.67359979952387, "grad_norm": 10.822495460510254, "learning_rate": 8.11729632045788e-09, "loss": 0.4553, "num_input_tokens_seen": 190936416, "step": 157015 }, { "epoch": 19.674226287432653, "grad_norm": 9.85583209991455, "learning_rate": 8.086186103051785e-09, "loss": 0.4513, "num_input_tokens_seen": 190942624, "step": 157020 }, { "epoch": 19.674852775341435, "grad_norm": 10.174442291259766, "learning_rate": 8.055135568171834e-09, "loss": 0.3978, "num_input_tokens_seen": 190948832, "step": 157025 }, { "epoch": 19.67547926325022, "grad_norm": 8.269575119018555, "learning_rate": 8.024144716189952e-09, "loss": 0.4866, "num_input_tokens_seen": 190955072, "step": 157030 }, { "epoch": 19.676105751159003, "grad_norm": 21.033254623413086, "learning_rate": 7.993213547476398e-09, "loss": 0.499, "num_input_tokens_seen": 190960768, "step": 157035 }, { "epoch": 19.676732239067785, "grad_norm": 7.759933948516846, "learning_rate": 7.962342062401429e-09, "loss": 0.4416, "num_input_tokens_seen": 190966944, "step": 157040 }, { "epoch": 19.677358726976568, "grad_norm": 37.149131774902344, "learning_rate": 7.931530261333087e-09, "loss": 0.4889, "num_input_tokens_seen": 190973440, "step": 157045 }, { "epoch": 19.677985214885354, "grad_norm": 28.662046432495117, "learning_rate": 7.900778144641074e-09, "loss": 0.5346, "num_input_tokens_seen": 190979552, "step": 157050 }, { "epoch": 19.678611702794136, "grad_norm": 5.992755889892578, "learning_rate": 7.870085712691766e-09, "loss": 0.4095, "num_input_tokens_seen": 190985568, "step": 157055 }, { "epoch": 19.679238190702918, "grad_norm": 4.454666614532471, "learning_rate": 7.8394529658532e-09, "loss": 0.3911, "num_input_tokens_seen": 190991712, "step": 157060 }, { "epoch": 19.679864678611704, "grad_norm": 7.805208683013916, "learning_rate": 7.80887990449064e-09, "loss": 0.4051, "num_input_tokens_seen": 190997792, "step": 157065 }, { "epoch": 19.680491166520486, "grad_norm": 10.123405456542969, "learning_rate": 7.778366528971015e-09, "loss": 0.5265, "num_input_tokens_seen": 191003968, "step": 157070 }, { "epoch": 19.68111765442927, "grad_norm": 5.399227142333984, "learning_rate": 7.747912839657368e-09, "loss": 0.4018, "num_input_tokens_seen": 191010112, "step": 157075 }, { "epoch": 19.681744142338054, "grad_norm": 5.884436130523682, "learning_rate": 7.717518836914962e-09, "loss": 0.4488, "num_input_tokens_seen": 191016000, "step": 157080 }, { "epoch": 19.682370630246837, "grad_norm": 5.568284034729004, "learning_rate": 7.687184521106839e-09, "loss": 0.4379, "num_input_tokens_seen": 191022304, "step": 157085 }, { "epoch": 19.68299711815562, "grad_norm": 16.461767196655273, "learning_rate": 7.656909892596043e-09, "loss": 0.386, "num_input_tokens_seen": 191028448, "step": 157090 }, { "epoch": 19.6836236060644, "grad_norm": 6.069138050079346, "learning_rate": 7.626694951743952e-09, "loss": 0.4532, "num_input_tokens_seen": 191034816, "step": 157095 }, { "epoch": 19.684250093973187, "grad_norm": 11.99577808380127, "learning_rate": 7.596539698912498e-09, "loss": 0.4135, "num_input_tokens_seen": 191040832, "step": 157100 }, { "epoch": 19.68487658188197, "grad_norm": 15.115488052368164, "learning_rate": 7.566444134461392e-09, "loss": 0.5083, "num_input_tokens_seen": 191047040, "step": 157105 }, { "epoch": 19.68550306979075, "grad_norm": 5.68221378326416, "learning_rate": 7.536408258750905e-09, "loss": 0.4809, "num_input_tokens_seen": 191053280, "step": 157110 }, { "epoch": 19.686129557699537, "grad_norm": 5.829649925231934, "learning_rate": 7.506432072139635e-09, "loss": 0.5103, "num_input_tokens_seen": 191059488, "step": 157115 }, { "epoch": 19.68675604560832, "grad_norm": 5.249724864959717, "learning_rate": 7.476515574987298e-09, "loss": 0.3732, "num_input_tokens_seen": 191064896, "step": 157120 }, { "epoch": 19.687382533517102, "grad_norm": 4.277283191680908, "learning_rate": 7.446658767650272e-09, "loss": 0.4428, "num_input_tokens_seen": 191071200, "step": 157125 }, { "epoch": 19.688009021425888, "grad_norm": 12.4789457321167, "learning_rate": 7.416861650485496e-09, "loss": 0.4749, "num_input_tokens_seen": 191077088, "step": 157130 }, { "epoch": 19.68863550933467, "grad_norm": 22.750612258911133, "learning_rate": 7.387124223850462e-09, "loss": 0.459, "num_input_tokens_seen": 191083168, "step": 157135 }, { "epoch": 19.689261997243452, "grad_norm": 4.6708149909973145, "learning_rate": 7.357446488099329e-09, "loss": 0.4015, "num_input_tokens_seen": 191089280, "step": 157140 }, { "epoch": 19.689888485152238, "grad_norm": 7.682675838470459, "learning_rate": 7.327828443587925e-09, "loss": 0.5005, "num_input_tokens_seen": 191095360, "step": 157145 }, { "epoch": 19.69051497306102, "grad_norm": 4.283957004547119, "learning_rate": 7.298270090669301e-09, "loss": 0.3931, "num_input_tokens_seen": 191101472, "step": 157150 }, { "epoch": 19.691141460969803, "grad_norm": 19.36551284790039, "learning_rate": 7.268771429698174e-09, "loss": 0.4747, "num_input_tokens_seen": 191107520, "step": 157155 }, { "epoch": 19.691767948878585, "grad_norm": 20.63991928100586, "learning_rate": 7.239332461026483e-09, "loss": 0.3973, "num_input_tokens_seen": 191113728, "step": 157160 }, { "epoch": 19.69239443678737, "grad_norm": 3.991508960723877, "learning_rate": 7.209953185006169e-09, "loss": 0.421, "num_input_tokens_seen": 191119520, "step": 157165 }, { "epoch": 19.693020924696153, "grad_norm": 16.387544631958008, "learning_rate": 7.180633601988063e-09, "loss": 0.4663, "num_input_tokens_seen": 191125664, "step": 157170 }, { "epoch": 19.693647412604935, "grad_norm": 11.650679588317871, "learning_rate": 7.151373712324106e-09, "loss": 0.4523, "num_input_tokens_seen": 191131584, "step": 157175 }, { "epoch": 19.69427390051372, "grad_norm": 5.565473556518555, "learning_rate": 7.122173516362352e-09, "loss": 0.4219, "num_input_tokens_seen": 191137984, "step": 157180 }, { "epoch": 19.694900388422504, "grad_norm": 6.985321998596191, "learning_rate": 7.093033014453632e-09, "loss": 0.4953, "num_input_tokens_seen": 191144096, "step": 157185 }, { "epoch": 19.695526876331286, "grad_norm": 25.245101928710938, "learning_rate": 7.063952206944891e-09, "loss": 0.4869, "num_input_tokens_seen": 191150176, "step": 157190 }, { "epoch": 19.69615336424007, "grad_norm": 16.94140625, "learning_rate": 7.0349310941847385e-09, "loss": 0.5224, "num_input_tokens_seen": 191156288, "step": 157195 }, { "epoch": 19.696779852148854, "grad_norm": 13.06338119506836, "learning_rate": 7.005969676519564e-09, "loss": 0.4183, "num_input_tokens_seen": 191161920, "step": 157200 }, { "epoch": 19.697406340057636, "grad_norm": 7.862444877624512, "learning_rate": 6.977067954296313e-09, "loss": 0.4297, "num_input_tokens_seen": 191168320, "step": 157205 }, { "epoch": 19.698032827966422, "grad_norm": 3.734398603439331, "learning_rate": 6.948225927859709e-09, "loss": 0.466, "num_input_tokens_seen": 191173984, "step": 157210 }, { "epoch": 19.698659315875204, "grad_norm": 5.670324325561523, "learning_rate": 6.919443597555031e-09, "loss": 0.4354, "num_input_tokens_seen": 191180352, "step": 157215 }, { "epoch": 19.699285803783987, "grad_norm": 4.98099422454834, "learning_rate": 6.890720963726449e-09, "loss": 0.4026, "num_input_tokens_seen": 191186816, "step": 157220 }, { "epoch": 19.69991229169277, "grad_norm": 3.848149299621582, "learning_rate": 6.862058026717577e-09, "loss": 0.503, "num_input_tokens_seen": 191192928, "step": 157225 }, { "epoch": 19.700538779601555, "grad_norm": 5.925124168395996, "learning_rate": 6.833454786870364e-09, "loss": 0.4409, "num_input_tokens_seen": 191198368, "step": 157230 }, { "epoch": 19.701165267510337, "grad_norm": 14.371583938598633, "learning_rate": 6.804911244527312e-09, "loss": 0.446, "num_input_tokens_seen": 191204384, "step": 157235 }, { "epoch": 19.70179175541912, "grad_norm": 21.452051162719727, "learning_rate": 6.776427400029817e-09, "loss": 0.4751, "num_input_tokens_seen": 191210432, "step": 157240 }, { "epoch": 19.702418243327905, "grad_norm": 3.523873805999756, "learning_rate": 6.748003253718161e-09, "loss": 0.4501, "num_input_tokens_seen": 191216256, "step": 157245 }, { "epoch": 19.703044731236687, "grad_norm": 19.558494567871094, "learning_rate": 6.719638805932072e-09, "loss": 0.4957, "num_input_tokens_seen": 191222528, "step": 157250 }, { "epoch": 19.70367121914547, "grad_norm": 18.95383644104004, "learning_rate": 6.691334057010723e-09, "loss": 0.448, "num_input_tokens_seen": 191228640, "step": 157255 }, { "epoch": 19.704297707054256, "grad_norm": 4.63892126083374, "learning_rate": 6.663089007292734e-09, "loss": 0.399, "num_input_tokens_seen": 191234720, "step": 157260 }, { "epoch": 19.704924194963038, "grad_norm": 6.313037395477295, "learning_rate": 6.63490365711561e-09, "loss": 0.4246, "num_input_tokens_seen": 191240960, "step": 157265 }, { "epoch": 19.70555068287182, "grad_norm": 3.1532437801361084, "learning_rate": 6.606778006816305e-09, "loss": 0.4608, "num_input_tokens_seen": 191247136, "step": 157270 }, { "epoch": 19.706177170780602, "grad_norm": 10.131660461425781, "learning_rate": 6.5787120567317734e-09, "loss": 0.4226, "num_input_tokens_seen": 191253184, "step": 157275 }, { "epoch": 19.706803658689388, "grad_norm": 4.1009907722473145, "learning_rate": 6.550705807196189e-09, "loss": 0.4004, "num_input_tokens_seen": 191259712, "step": 157280 }, { "epoch": 19.70743014659817, "grad_norm": 22.076501846313477, "learning_rate": 6.522759258545952e-09, "loss": 0.4487, "num_input_tokens_seen": 191265696, "step": 157285 }, { "epoch": 19.708056634506953, "grad_norm": 13.390368461608887, "learning_rate": 6.494872411114128e-09, "loss": 0.4569, "num_input_tokens_seen": 191271904, "step": 157290 }, { "epoch": 19.70868312241574, "grad_norm": 21.420745849609375, "learning_rate": 6.46704526523434e-09, "loss": 0.5232, "num_input_tokens_seen": 191277600, "step": 157295 }, { "epoch": 19.70930961032452, "grad_norm": 4.341527938842773, "learning_rate": 6.439277821239098e-09, "loss": 0.4018, "num_input_tokens_seen": 191283936, "step": 157300 }, { "epoch": 19.709936098233303, "grad_norm": 21.898611068725586, "learning_rate": 6.411570079460916e-09, "loss": 0.4581, "num_input_tokens_seen": 191290048, "step": 157305 }, { "epoch": 19.71056258614209, "grad_norm": 25.2795352935791, "learning_rate": 6.383922040230639e-09, "loss": 0.5552, "num_input_tokens_seen": 191296064, "step": 157310 }, { "epoch": 19.71118907405087, "grad_norm": 7.003259658813477, "learning_rate": 6.3563337038791136e-09, "loss": 0.454, "num_input_tokens_seen": 191302496, "step": 157315 }, { "epoch": 19.711815561959654, "grad_norm": 5.874181270599365, "learning_rate": 6.328805070736077e-09, "loss": 0.4268, "num_input_tokens_seen": 191308672, "step": 157320 }, { "epoch": 19.712442049868436, "grad_norm": 19.375198364257812, "learning_rate": 6.30133614113071e-09, "loss": 0.5254, "num_input_tokens_seen": 191314688, "step": 157325 }, { "epoch": 19.71306853777722, "grad_norm": 16.930219650268555, "learning_rate": 6.273926915391082e-09, "loss": 0.4387, "num_input_tokens_seen": 191320544, "step": 157330 }, { "epoch": 19.713695025686004, "grad_norm": 10.404399871826172, "learning_rate": 6.246577393845266e-09, "loss": 0.4044, "num_input_tokens_seen": 191326976, "step": 157335 }, { "epoch": 19.714321513594786, "grad_norm": 10.648113250732422, "learning_rate": 6.219287576820221e-09, "loss": 0.481, "num_input_tokens_seen": 191333312, "step": 157340 }, { "epoch": 19.714948001503572, "grad_norm": 6.146290302276611, "learning_rate": 6.1920574646417985e-09, "loss": 0.4365, "num_input_tokens_seen": 191339552, "step": 157345 }, { "epoch": 19.715574489412354, "grad_norm": 12.995725631713867, "learning_rate": 6.164887057636404e-09, "loss": 0.3862, "num_input_tokens_seen": 191345632, "step": 157350 }, { "epoch": 19.716200977321137, "grad_norm": 6.99832820892334, "learning_rate": 6.137776356127667e-09, "loss": 0.4275, "num_input_tokens_seen": 191351904, "step": 157355 }, { "epoch": 19.716827465229922, "grad_norm": 25.19797706604004, "learning_rate": 6.110725360440883e-09, "loss": 0.4338, "num_input_tokens_seen": 191358112, "step": 157360 }, { "epoch": 19.717453953138705, "grad_norm": 5.930258274078369, "learning_rate": 6.083734070899127e-09, "loss": 0.4224, "num_input_tokens_seen": 191364096, "step": 157365 }, { "epoch": 19.718080441047487, "grad_norm": 7.272455215454102, "learning_rate": 6.05680248782492e-09, "loss": 0.4467, "num_input_tokens_seen": 191369984, "step": 157370 }, { "epoch": 19.718706928956273, "grad_norm": 6.136992931365967, "learning_rate": 6.0299306115402245e-09, "loss": 0.4119, "num_input_tokens_seen": 191376192, "step": 157375 }, { "epoch": 19.719333416865055, "grad_norm": 22.08074378967285, "learning_rate": 6.003118442366451e-09, "loss": 0.5197, "num_input_tokens_seen": 191382400, "step": 157380 }, { "epoch": 19.719959904773837, "grad_norm": 7.029231548309326, "learning_rate": 5.9763659806239e-09, "loss": 0.4176, "num_input_tokens_seen": 191388736, "step": 157385 }, { "epoch": 19.72058639268262, "grad_norm": 5.065176486968994, "learning_rate": 5.9496732266328686e-09, "loss": 0.416, "num_input_tokens_seen": 191394848, "step": 157390 }, { "epoch": 19.721212880591406, "grad_norm": 4.806799411773682, "learning_rate": 5.923040180712547e-09, "loss": 0.4221, "num_input_tokens_seen": 191400960, "step": 157395 }, { "epoch": 19.721839368500188, "grad_norm": 33.00434112548828, "learning_rate": 5.896466843180459e-09, "loss": 0.5664, "num_input_tokens_seen": 191407520, "step": 157400 }, { "epoch": 19.72246585640897, "grad_norm": 6.635572910308838, "learning_rate": 5.8699532143552395e-09, "loss": 0.4, "num_input_tokens_seen": 191413792, "step": 157405 }, { "epoch": 19.723092344317756, "grad_norm": 13.872138023376465, "learning_rate": 5.843499294553301e-09, "loss": 0.498, "num_input_tokens_seen": 191420128, "step": 157410 }, { "epoch": 19.723718832226538, "grad_norm": 14.2233247756958, "learning_rate": 5.8171050840916125e-09, "loss": 0.4628, "num_input_tokens_seen": 191426368, "step": 157415 }, { "epoch": 19.72434532013532, "grad_norm": 6.307510852813721, "learning_rate": 5.7907705832849215e-09, "loss": 0.42, "num_input_tokens_seen": 191432320, "step": 157420 }, { "epoch": 19.724971808044106, "grad_norm": 18.532020568847656, "learning_rate": 5.764495792449087e-09, "loss": 0.4703, "num_input_tokens_seen": 191438400, "step": 157425 }, { "epoch": 19.72559829595289, "grad_norm": 11.167131423950195, "learning_rate": 5.738280711897193e-09, "loss": 0.4193, "num_input_tokens_seen": 191444288, "step": 157430 }, { "epoch": 19.72622478386167, "grad_norm": 4.9549760818481445, "learning_rate": 5.712125341942876e-09, "loss": 0.4425, "num_input_tokens_seen": 191450272, "step": 157435 }, { "epoch": 19.726851271770453, "grad_norm": 7.553970813751221, "learning_rate": 5.686029682899219e-09, "loss": 0.4727, "num_input_tokens_seen": 191456256, "step": 157440 }, { "epoch": 19.72747775967924, "grad_norm": 5.256423473358154, "learning_rate": 5.659993735078196e-09, "loss": 0.5134, "num_input_tokens_seen": 191462368, "step": 157445 }, { "epoch": 19.72810424758802, "grad_norm": 5.598389148712158, "learning_rate": 5.634017498790667e-09, "loss": 0.3931, "num_input_tokens_seen": 191468256, "step": 157450 }, { "epoch": 19.728730735496804, "grad_norm": 13.875947952270508, "learning_rate": 5.608100974348052e-09, "loss": 0.3923, "num_input_tokens_seen": 191474368, "step": 157455 }, { "epoch": 19.72935722340559, "grad_norm": 4.8007941246032715, "learning_rate": 5.582244162058991e-09, "loss": 0.4657, "num_input_tokens_seen": 191480416, "step": 157460 }, { "epoch": 19.72998371131437, "grad_norm": 8.873653411865234, "learning_rate": 5.556447062233794e-09, "loss": 0.4324, "num_input_tokens_seen": 191486784, "step": 157465 }, { "epoch": 19.730610199223154, "grad_norm": 19.687875747680664, "learning_rate": 5.53070967517999e-09, "loss": 0.4524, "num_input_tokens_seen": 191492896, "step": 157470 }, { "epoch": 19.73123668713194, "grad_norm": 5.781172752380371, "learning_rate": 5.505032001205668e-09, "loss": 0.386, "num_input_tokens_seen": 191498752, "step": 157475 }, { "epoch": 19.731863175040722, "grad_norm": 12.919170379638672, "learning_rate": 5.4794140406183585e-09, "loss": 0.4702, "num_input_tokens_seen": 191504064, "step": 157480 }, { "epoch": 19.732489662949504, "grad_norm": 13.016063690185547, "learning_rate": 5.4538557937233725e-09, "loss": 0.4841, "num_input_tokens_seen": 191510208, "step": 157485 }, { "epoch": 19.733116150858287, "grad_norm": 4.426594257354736, "learning_rate": 5.428357260827133e-09, "loss": 0.45, "num_input_tokens_seen": 191515872, "step": 157490 }, { "epoch": 19.733742638767072, "grad_norm": 3.6790244579315186, "learning_rate": 5.4029184422332845e-09, "loss": 0.4731, "num_input_tokens_seen": 191521824, "step": 157495 }, { "epoch": 19.734369126675855, "grad_norm": 29.22484016418457, "learning_rate": 5.377539338247695e-09, "loss": 0.5188, "num_input_tokens_seen": 191527616, "step": 157500 }, { "epoch": 19.734995614584637, "grad_norm": 14.132722854614258, "learning_rate": 5.352219949172343e-09, "loss": 0.4816, "num_input_tokens_seen": 191533952, "step": 157505 }, { "epoch": 19.735622102493423, "grad_norm": 8.906713485717773, "learning_rate": 5.326960275310322e-09, "loss": 0.5421, "num_input_tokens_seen": 191540416, "step": 157510 }, { "epoch": 19.736248590402205, "grad_norm": 7.988497734069824, "learning_rate": 5.301760316964166e-09, "loss": 0.4145, "num_input_tokens_seen": 191546432, "step": 157515 }, { "epoch": 19.736875078310987, "grad_norm": 10.107386589050293, "learning_rate": 5.276620074434746e-09, "loss": 0.4385, "num_input_tokens_seen": 191552736, "step": 157520 }, { "epoch": 19.737501566219773, "grad_norm": 9.692686080932617, "learning_rate": 5.2515395480223775e-09, "loss": 0.518, "num_input_tokens_seen": 191558944, "step": 157525 }, { "epoch": 19.738128054128556, "grad_norm": 26.135242462158203, "learning_rate": 5.226518738027375e-09, "loss": 0.4476, "num_input_tokens_seen": 191564320, "step": 157530 }, { "epoch": 19.738754542037338, "grad_norm": 5.770515441894531, "learning_rate": 5.201557644748945e-09, "loss": 0.4047, "num_input_tokens_seen": 191570688, "step": 157535 }, { "epoch": 19.739381029946124, "grad_norm": 4.855020999908447, "learning_rate": 5.176656268485181e-09, "loss": 0.4207, "num_input_tokens_seen": 191576896, "step": 157540 }, { "epoch": 19.740007517854906, "grad_norm": 16.088979721069336, "learning_rate": 5.151814609533623e-09, "loss": 0.5375, "num_input_tokens_seen": 191583232, "step": 157545 }, { "epoch": 19.740634005763688, "grad_norm": 8.443523406982422, "learning_rate": 5.127032668191812e-09, "loss": 0.5103, "num_input_tokens_seen": 191588864, "step": 157550 }, { "epoch": 19.74126049367247, "grad_norm": 9.66881275177002, "learning_rate": 5.102310444755621e-09, "loss": 0.4287, "num_input_tokens_seen": 191595264, "step": 157555 }, { "epoch": 19.741886981581256, "grad_norm": 4.848872661590576, "learning_rate": 5.077647939520925e-09, "loss": 0.443, "num_input_tokens_seen": 191601792, "step": 157560 }, { "epoch": 19.74251346949004, "grad_norm": 10.40286636352539, "learning_rate": 5.053045152781933e-09, "loss": 0.516, "num_input_tokens_seen": 191607648, "step": 157565 }, { "epoch": 19.74313995739882, "grad_norm": 8.342877388000488, "learning_rate": 5.028502084833964e-09, "loss": 0.3605, "num_input_tokens_seen": 191613088, "step": 157570 }, { "epoch": 19.743766445307607, "grad_norm": 16.456554412841797, "learning_rate": 5.0040187359695625e-09, "loss": 0.4419, "num_input_tokens_seen": 191619008, "step": 157575 }, { "epoch": 19.74439293321639, "grad_norm": 9.37822437286377, "learning_rate": 4.979595106481827e-09, "loss": 0.4536, "num_input_tokens_seen": 191625312, "step": 157580 }, { "epoch": 19.74501942112517, "grad_norm": 9.615460395812988, "learning_rate": 4.955231196662191e-09, "loss": 0.4694, "num_input_tokens_seen": 191631296, "step": 157585 }, { "epoch": 19.745645909033957, "grad_norm": 25.109439849853516, "learning_rate": 4.9309270068031986e-09, "loss": 0.487, "num_input_tokens_seen": 191637216, "step": 157590 }, { "epoch": 19.74627239694274, "grad_norm": 15.579168319702148, "learning_rate": 4.906682537193508e-09, "loss": 0.4213, "num_input_tokens_seen": 191642976, "step": 157595 }, { "epoch": 19.74689888485152, "grad_norm": 5.21521520614624, "learning_rate": 4.882497788125107e-09, "loss": 0.3761, "num_input_tokens_seen": 191649216, "step": 157600 }, { "epoch": 19.747525372760308, "grad_norm": 5.183368682861328, "learning_rate": 4.858372759885544e-09, "loss": 0.4107, "num_input_tokens_seen": 191655808, "step": 157605 }, { "epoch": 19.74815186066909, "grad_norm": 9.919403076171875, "learning_rate": 4.834307452764031e-09, "loss": 0.5633, "num_input_tokens_seen": 191661824, "step": 157610 }, { "epoch": 19.748778348577872, "grad_norm": 5.637771129608154, "learning_rate": 4.810301867047562e-09, "loss": 0.4334, "num_input_tokens_seen": 191667904, "step": 157615 }, { "epoch": 19.749404836486654, "grad_norm": 10.307437896728516, "learning_rate": 4.786356003024239e-09, "loss": 0.4249, "num_input_tokens_seen": 191673984, "step": 157620 }, { "epoch": 19.75003132439544, "grad_norm": 6.200657367706299, "learning_rate": 4.762469860979391e-09, "loss": 0.4634, "num_input_tokens_seen": 191680288, "step": 157625 }, { "epoch": 19.750657812304222, "grad_norm": 5.188592433929443, "learning_rate": 4.738643441199453e-09, "loss": 0.4137, "num_input_tokens_seen": 191685824, "step": 157630 }, { "epoch": 19.751284300213005, "grad_norm": 5.505903720855713, "learning_rate": 4.714876743968644e-09, "loss": 0.4309, "num_input_tokens_seen": 191692128, "step": 157635 }, { "epoch": 19.75191078812179, "grad_norm": 34.70428466796875, "learning_rate": 4.69116976957118e-09, "loss": 0.5286, "num_input_tokens_seen": 191698112, "step": 157640 }, { "epoch": 19.752537276030573, "grad_norm": 3.562505006790161, "learning_rate": 4.6675225182901685e-09, "loss": 0.413, "num_input_tokens_seen": 191704320, "step": 157645 }, { "epoch": 19.753163763939355, "grad_norm": 10.685908317565918, "learning_rate": 4.643934990409271e-09, "loss": 0.4527, "num_input_tokens_seen": 191710336, "step": 157650 }, { "epoch": 19.75379025184814, "grad_norm": 19.9614315032959, "learning_rate": 4.62040718620993e-09, "loss": 0.4446, "num_input_tokens_seen": 191716320, "step": 157655 }, { "epoch": 19.754416739756923, "grad_norm": 3.1250972747802734, "learning_rate": 4.596939105973586e-09, "loss": 0.4501, "num_input_tokens_seen": 191722656, "step": 157660 }, { "epoch": 19.755043227665706, "grad_norm": 6.962682723999023, "learning_rate": 4.573530749980571e-09, "loss": 0.4032, "num_input_tokens_seen": 191728544, "step": 157665 }, { "epoch": 19.755669715574488, "grad_norm": 3.6105175018310547, "learning_rate": 4.550182118511215e-09, "loss": 0.4463, "num_input_tokens_seen": 191734272, "step": 157670 }, { "epoch": 19.756296203483274, "grad_norm": 9.250802040100098, "learning_rate": 4.5268932118441854e-09, "loss": 0.4378, "num_input_tokens_seen": 191740896, "step": 157675 }, { "epoch": 19.756922691392056, "grad_norm": 11.734950065612793, "learning_rate": 4.503664030258148e-09, "loss": 0.3845, "num_input_tokens_seen": 191747488, "step": 157680 }, { "epoch": 19.757549179300838, "grad_norm": 7.70175838470459, "learning_rate": 4.480494574030658e-09, "loss": 0.4083, "num_input_tokens_seen": 191753792, "step": 157685 }, { "epoch": 19.758175667209624, "grad_norm": 2.7732315063476562, "learning_rate": 4.457384843438717e-09, "loss": 0.4269, "num_input_tokens_seen": 191760192, "step": 157690 }, { "epoch": 19.758802155118406, "grad_norm": 9.181032180786133, "learning_rate": 4.434334838759325e-09, "loss": 0.4152, "num_input_tokens_seen": 191766208, "step": 157695 }, { "epoch": 19.75942864302719, "grad_norm": 5.082907676696777, "learning_rate": 4.4113445602672615e-09, "loss": 0.4108, "num_input_tokens_seen": 191772320, "step": 157700 }, { "epoch": 19.760055130935974, "grad_norm": 18.850894927978516, "learning_rate": 4.3884140082373075e-09, "loss": 0.4752, "num_input_tokens_seen": 191778464, "step": 157705 }, { "epoch": 19.760681618844757, "grad_norm": 7.180081844329834, "learning_rate": 4.365543182944243e-09, "loss": 0.4026, "num_input_tokens_seen": 191784640, "step": 157710 }, { "epoch": 19.76130810675354, "grad_norm": 6.249981880187988, "learning_rate": 4.342732084661183e-09, "loss": 0.5394, "num_input_tokens_seen": 191791008, "step": 157715 }, { "epoch": 19.76193459466232, "grad_norm": 3.4296653270721436, "learning_rate": 4.319980713660687e-09, "loss": 0.471, "num_input_tokens_seen": 191797312, "step": 157720 }, { "epoch": 19.762561082571107, "grad_norm": 7.947160720825195, "learning_rate": 4.2972890702153155e-09, "loss": 0.4343, "num_input_tokens_seen": 191803264, "step": 157725 }, { "epoch": 19.76318757047989, "grad_norm": 4.227210521697998, "learning_rate": 4.274657154595408e-09, "loss": 0.4159, "num_input_tokens_seen": 191809184, "step": 157730 }, { "epoch": 19.76381405838867, "grad_norm": 11.766816139221191, "learning_rate": 4.252084967072967e-09, "loss": 0.4517, "num_input_tokens_seen": 191815392, "step": 157735 }, { "epoch": 19.764440546297458, "grad_norm": 9.399618148803711, "learning_rate": 4.229572507916113e-09, "loss": 0.4096, "num_input_tokens_seen": 191821504, "step": 157740 }, { "epoch": 19.76506703420624, "grad_norm": 5.590144634246826, "learning_rate": 4.207119777395741e-09, "loss": 0.4443, "num_input_tokens_seen": 191826336, "step": 157745 }, { "epoch": 19.765693522115022, "grad_norm": 4.684503078460693, "learning_rate": 4.184726775778858e-09, "loss": 0.4557, "num_input_tokens_seen": 191832576, "step": 157750 }, { "epoch": 19.766320010023808, "grad_norm": 3.315640687942505, "learning_rate": 4.162393503334139e-09, "loss": 0.4138, "num_input_tokens_seen": 191838912, "step": 157755 }, { "epoch": 19.76694649793259, "grad_norm": 4.795749664306641, "learning_rate": 4.140119960328593e-09, "loss": 0.4813, "num_input_tokens_seen": 191845088, "step": 157760 }, { "epoch": 19.767572985841372, "grad_norm": 6.746072769165039, "learning_rate": 4.117906147027562e-09, "loss": 0.4885, "num_input_tokens_seen": 191851040, "step": 157765 }, { "epoch": 19.76819947375016, "grad_norm": 3.5602056980133057, "learning_rate": 4.0957520636975e-09, "loss": 0.3809, "num_input_tokens_seen": 191857184, "step": 157770 }, { "epoch": 19.76882596165894, "grad_norm": 26.049230575561523, "learning_rate": 4.073657710603196e-09, "loss": 0.4348, "num_input_tokens_seen": 191863296, "step": 157775 }, { "epoch": 19.769452449567723, "grad_norm": 3.6610734462738037, "learning_rate": 4.051623088008882e-09, "loss": 0.4046, "num_input_tokens_seen": 191869632, "step": 157780 }, { "epoch": 19.770078937476505, "grad_norm": 5.642786026000977, "learning_rate": 4.029648196177128e-09, "loss": 0.46, "num_input_tokens_seen": 191875616, "step": 157785 }, { "epoch": 19.77070542538529, "grad_norm": 28.53772735595703, "learning_rate": 4.007733035372163e-09, "loss": 0.5249, "num_input_tokens_seen": 191881632, "step": 157790 }, { "epoch": 19.771331913294073, "grad_norm": 5.706791877746582, "learning_rate": 3.985877605854338e-09, "loss": 0.4699, "num_input_tokens_seen": 191886528, "step": 157795 }, { "epoch": 19.771958401202856, "grad_norm": 34.397491455078125, "learning_rate": 3.964081907886219e-09, "loss": 0.4554, "num_input_tokens_seen": 191892768, "step": 157800 }, { "epoch": 19.77258488911164, "grad_norm": 5.809835433959961, "learning_rate": 3.942345941727599e-09, "loss": 0.3891, "num_input_tokens_seen": 191898976, "step": 157805 }, { "epoch": 19.773211377020424, "grad_norm": 6.318004131317139, "learning_rate": 3.920669707638824e-09, "loss": 0.4212, "num_input_tokens_seen": 191905024, "step": 157810 }, { "epoch": 19.773837864929206, "grad_norm": 5.726510047912598, "learning_rate": 3.899053205879133e-09, "loss": 0.4476, "num_input_tokens_seen": 191911040, "step": 157815 }, { "epoch": 19.77446435283799, "grad_norm": 12.999403953552246, "learning_rate": 3.877496436706652e-09, "loss": 0.4732, "num_input_tokens_seen": 191917120, "step": 157820 }, { "epoch": 19.775090840746774, "grad_norm": 26.202241897583008, "learning_rate": 3.855999400378951e-09, "loss": 0.4753, "num_input_tokens_seen": 191922656, "step": 157825 }, { "epoch": 19.775717328655556, "grad_norm": 3.1920409202575684, "learning_rate": 3.834562097153605e-09, "loss": 0.4836, "num_input_tokens_seen": 191928000, "step": 157830 }, { "epoch": 19.77634381656434, "grad_norm": 3.9952094554901123, "learning_rate": 3.813184527286518e-09, "loss": 0.3996, "num_input_tokens_seen": 191933408, "step": 157835 }, { "epoch": 19.776970304473124, "grad_norm": 26.289295196533203, "learning_rate": 3.791866691033597e-09, "loss": 0.5414, "num_input_tokens_seen": 191939712, "step": 157840 }, { "epoch": 19.777596792381907, "grad_norm": 6.244580268859863, "learning_rate": 3.770608588649083e-09, "loss": 0.4396, "num_input_tokens_seen": 191945664, "step": 157845 }, { "epoch": 19.77822328029069, "grad_norm": 18.192962646484375, "learning_rate": 3.749410220387772e-09, "loss": 0.4564, "num_input_tokens_seen": 191951936, "step": 157850 }, { "epoch": 19.778849768199475, "grad_norm": 5.609785556793213, "learning_rate": 3.7282715865027964e-09, "loss": 0.4227, "num_input_tokens_seen": 191958208, "step": 157855 }, { "epoch": 19.779476256108257, "grad_norm": 23.692413330078125, "learning_rate": 3.7071926872467302e-09, "loss": 0.4799, "num_input_tokens_seen": 191964544, "step": 157860 }, { "epoch": 19.78010274401704, "grad_norm": 18.287145614624023, "learning_rate": 3.6861735228721496e-09, "loss": 0.4397, "num_input_tokens_seen": 191970752, "step": 157865 }, { "epoch": 19.780729231925825, "grad_norm": 14.008317947387695, "learning_rate": 3.6652140936299653e-09, "loss": 0.4074, "num_input_tokens_seen": 191976736, "step": 157870 }, { "epoch": 19.781355719834607, "grad_norm": 5.498620510101318, "learning_rate": 3.6443143997710872e-09, "loss": 0.3997, "num_input_tokens_seen": 191982976, "step": 157875 }, { "epoch": 19.78198220774339, "grad_norm": 7.856369972229004, "learning_rate": 3.623474441544761e-09, "loss": 0.4578, "num_input_tokens_seen": 191988864, "step": 157880 }, { "epoch": 19.782608695652176, "grad_norm": 7.753016471862793, "learning_rate": 3.6026942192007862e-09, "loss": 0.4258, "num_input_tokens_seen": 191994848, "step": 157885 }, { "epoch": 19.783235183560958, "grad_norm": 6.845991134643555, "learning_rate": 3.5819737329872983e-09, "loss": 0.4117, "num_input_tokens_seen": 192001408, "step": 157890 }, { "epoch": 19.78386167146974, "grad_norm": 2.7322421073913574, "learning_rate": 3.5613129831518766e-09, "loss": 0.4414, "num_input_tokens_seen": 192007520, "step": 157895 }, { "epoch": 19.784488159378522, "grad_norm": 5.88471794128418, "learning_rate": 3.5407119699415458e-09, "loss": 0.4261, "num_input_tokens_seen": 192013568, "step": 157900 }, { "epoch": 19.78511464728731, "grad_norm": 5.276689529418945, "learning_rate": 3.520170693603331e-09, "loss": 0.4612, "num_input_tokens_seen": 192019872, "step": 157905 }, { "epoch": 19.78574113519609, "grad_norm": 18.534589767456055, "learning_rate": 3.4996891543820357e-09, "loss": 0.4702, "num_input_tokens_seen": 192025568, "step": 157910 }, { "epoch": 19.786367623104873, "grad_norm": 10.993000984191895, "learning_rate": 3.4792673525230193e-09, "loss": 0.4148, "num_input_tokens_seen": 192032160, "step": 157915 }, { "epoch": 19.78699411101366, "grad_norm": 4.729680061340332, "learning_rate": 3.4589052882699756e-09, "loss": 0.3999, "num_input_tokens_seen": 192038336, "step": 157920 }, { "epoch": 19.78762059892244, "grad_norm": 4.363672733306885, "learning_rate": 3.4386029618665993e-09, "loss": 0.4681, "num_input_tokens_seen": 192044608, "step": 157925 }, { "epoch": 19.788247086831223, "grad_norm": 4.874757289886475, "learning_rate": 3.4183603735554737e-09, "loss": 0.4191, "num_input_tokens_seen": 192050656, "step": 157930 }, { "epoch": 19.78887357474001, "grad_norm": 6.526323318481445, "learning_rate": 3.3981775235786274e-09, "loss": 0.4518, "num_input_tokens_seen": 192056608, "step": 157935 }, { "epoch": 19.78950006264879, "grad_norm": 5.681138515472412, "learning_rate": 3.378054412177534e-09, "loss": 0.509, "num_input_tokens_seen": 192063168, "step": 157940 }, { "epoch": 19.790126550557574, "grad_norm": 18.433155059814453, "learning_rate": 3.3579910395931115e-09, "loss": 0.5343, "num_input_tokens_seen": 192069280, "step": 157945 }, { "epoch": 19.790753038466356, "grad_norm": 7.3523850440979, "learning_rate": 3.337987406064613e-09, "loss": 0.4514, "num_input_tokens_seen": 192074944, "step": 157950 }, { "epoch": 19.79137952637514, "grad_norm": 13.203385353088379, "learning_rate": 3.3180435118312926e-09, "loss": 0.4321, "num_input_tokens_seen": 192080448, "step": 157955 }, { "epoch": 19.792006014283924, "grad_norm": 6.754484176635742, "learning_rate": 3.298159357131847e-09, "loss": 0.5435, "num_input_tokens_seen": 192086272, "step": 157960 }, { "epoch": 19.792632502192706, "grad_norm": 8.597606658935547, "learning_rate": 3.2783349422044197e-09, "loss": 0.3928, "num_input_tokens_seen": 192092512, "step": 157965 }, { "epoch": 19.793258990101492, "grad_norm": 4.769662857055664, "learning_rate": 3.2585702672849327e-09, "loss": 0.4203, "num_input_tokens_seen": 192098720, "step": 157970 }, { "epoch": 19.793885478010274, "grad_norm": 7.894480228424072, "learning_rate": 3.238865332610419e-09, "loss": 0.4402, "num_input_tokens_seen": 192104928, "step": 157975 }, { "epoch": 19.794511965919057, "grad_norm": 3.960017442703247, "learning_rate": 3.219220138416246e-09, "loss": 0.4396, "num_input_tokens_seen": 192110880, "step": 157980 }, { "epoch": 19.795138453827843, "grad_norm": 21.71977424621582, "learning_rate": 3.199634684937225e-09, "loss": 0.4378, "num_input_tokens_seen": 192116608, "step": 157985 }, { "epoch": 19.795764941736625, "grad_norm": 6.599799156188965, "learning_rate": 3.180108972407614e-09, "loss": 0.5004, "num_input_tokens_seen": 192122464, "step": 157990 }, { "epoch": 19.796391429645407, "grad_norm": 5.321771144866943, "learning_rate": 3.1606430010616694e-09, "loss": 0.3861, "num_input_tokens_seen": 192128896, "step": 157995 }, { "epoch": 19.797017917554193, "grad_norm": 15.934615135192871, "learning_rate": 3.1412367711303184e-09, "loss": 0.4363, "num_input_tokens_seen": 192134464, "step": 158000 }, { "epoch": 19.797644405462975, "grad_norm": 15.164849281311035, "learning_rate": 3.121890282847262e-09, "loss": 0.5211, "num_input_tokens_seen": 192140544, "step": 158005 }, { "epoch": 19.798270893371757, "grad_norm": 2.4680724143981934, "learning_rate": 3.102603536442872e-09, "loss": 0.4174, "num_input_tokens_seen": 192146944, "step": 158010 }, { "epoch": 19.79889738128054, "grad_norm": 11.032858848571777, "learning_rate": 3.0833765321480745e-09, "loss": 0.4354, "num_input_tokens_seen": 192153184, "step": 158015 }, { "epoch": 19.799523869189326, "grad_norm": 8.99177074432373, "learning_rate": 3.0642092701926863e-09, "loss": 0.4413, "num_input_tokens_seen": 192158880, "step": 158020 }, { "epoch": 19.800150357098108, "grad_norm": 11.41114616394043, "learning_rate": 3.045101750805968e-09, "loss": 0.4133, "num_input_tokens_seen": 192165312, "step": 158025 }, { "epoch": 19.80077684500689, "grad_norm": 5.946745872497559, "learning_rate": 3.02605397421607e-09, "loss": 0.591, "num_input_tokens_seen": 192171360, "step": 158030 }, { "epoch": 19.801403332915676, "grad_norm": 6.263338565826416, "learning_rate": 3.007065940651144e-09, "loss": 0.4367, "num_input_tokens_seen": 192177312, "step": 158035 }, { "epoch": 19.80202982082446, "grad_norm": 6.71479606628418, "learning_rate": 2.9881376503382297e-09, "loss": 0.4797, "num_input_tokens_seen": 192183360, "step": 158040 }, { "epoch": 19.80265630873324, "grad_norm": 2.312013626098633, "learning_rate": 2.969269103502703e-09, "loss": 0.4314, "num_input_tokens_seen": 192189472, "step": 158045 }, { "epoch": 19.803282796642026, "grad_norm": 24.732791900634766, "learning_rate": 2.950460300371605e-09, "loss": 0.5094, "num_input_tokens_seen": 192195264, "step": 158050 }, { "epoch": 19.80390928455081, "grad_norm": 7.125919818878174, "learning_rate": 2.9317112411686442e-09, "loss": 0.4264, "num_input_tokens_seen": 192201248, "step": 158055 }, { "epoch": 19.80453577245959, "grad_norm": 9.919289588928223, "learning_rate": 2.913021926118087e-09, "loss": 0.3821, "num_input_tokens_seen": 192207328, "step": 158060 }, { "epoch": 19.805162260368373, "grad_norm": 11.619349479675293, "learning_rate": 2.894392355444198e-09, "loss": 0.419, "num_input_tokens_seen": 192213472, "step": 158065 }, { "epoch": 19.80578874827716, "grad_norm": 16.23316192626953, "learning_rate": 2.8758225293684662e-09, "loss": 0.4796, "num_input_tokens_seen": 192219680, "step": 158070 }, { "epoch": 19.80641523618594, "grad_norm": 8.663431167602539, "learning_rate": 2.8573124481140467e-09, "loss": 0.4734, "num_input_tokens_seen": 192225600, "step": 158075 }, { "epoch": 19.807041724094724, "grad_norm": 4.295156002044678, "learning_rate": 2.8388621119018743e-09, "loss": 0.4065, "num_input_tokens_seen": 192231616, "step": 158080 }, { "epoch": 19.80766821200351, "grad_norm": 5.222618579864502, "learning_rate": 2.820471520952883e-09, "loss": 0.4341, "num_input_tokens_seen": 192238048, "step": 158085 }, { "epoch": 19.80829469991229, "grad_norm": 6.1258368492126465, "learning_rate": 2.8021406754857873e-09, "loss": 0.4934, "num_input_tokens_seen": 192244256, "step": 158090 }, { "epoch": 19.808921187821074, "grad_norm": 29.198810577392578, "learning_rate": 2.7838695757209653e-09, "loss": 0.4941, "num_input_tokens_seen": 192250272, "step": 158095 }, { "epoch": 19.80954767572986, "grad_norm": 3.1603477001190186, "learning_rate": 2.765658221876577e-09, "loss": 0.4275, "num_input_tokens_seen": 192256320, "step": 158100 }, { "epoch": 19.810174163638642, "grad_norm": 31.300405502319336, "learning_rate": 2.7475066141696704e-09, "loss": 0.4507, "num_input_tokens_seen": 192262368, "step": 158105 }, { "epoch": 19.810800651547424, "grad_norm": 8.714630126953125, "learning_rate": 2.7294147528184044e-09, "loss": 0.4984, "num_input_tokens_seen": 192268384, "step": 158110 }, { "epoch": 19.811427139456207, "grad_norm": 8.377781867980957, "learning_rate": 2.7113826380376074e-09, "loss": 0.4725, "num_input_tokens_seen": 192274784, "step": 158115 }, { "epoch": 19.812053627364993, "grad_norm": 7.129191875457764, "learning_rate": 2.6934102700443276e-09, "loss": 0.4249, "num_input_tokens_seen": 192280512, "step": 158120 }, { "epoch": 19.812680115273775, "grad_norm": 8.257816314697266, "learning_rate": 2.6754976490528383e-09, "loss": 0.4891, "num_input_tokens_seen": 192286208, "step": 158125 }, { "epoch": 19.813306603182557, "grad_norm": 7.682085990905762, "learning_rate": 2.6576447752774126e-09, "loss": 0.4477, "num_input_tokens_seen": 192292480, "step": 158130 }, { "epoch": 19.813933091091343, "grad_norm": 4.95541524887085, "learning_rate": 2.639851648930658e-09, "loss": 0.3937, "num_input_tokens_seen": 192298624, "step": 158135 }, { "epoch": 19.814559579000125, "grad_norm": 4.734592914581299, "learning_rate": 2.6221182702268477e-09, "loss": 0.4761, "num_input_tokens_seen": 192305120, "step": 158140 }, { "epoch": 19.815186066908907, "grad_norm": 4.893431186676025, "learning_rate": 2.604444639376924e-09, "loss": 0.4455, "num_input_tokens_seen": 192311424, "step": 158145 }, { "epoch": 19.815812554817693, "grad_norm": 5.335693359375, "learning_rate": 2.5868307565923843e-09, "loss": 0.4164, "num_input_tokens_seen": 192317632, "step": 158150 }, { "epoch": 19.816439042726476, "grad_norm": 5.334841251373291, "learning_rate": 2.569276622083616e-09, "loss": 0.5792, "num_input_tokens_seen": 192323712, "step": 158155 }, { "epoch": 19.817065530635258, "grad_norm": 5.489436149597168, "learning_rate": 2.5517822360610066e-09, "loss": 0.4267, "num_input_tokens_seen": 192329152, "step": 158160 }, { "epoch": 19.817692018544044, "grad_norm": 17.720232009887695, "learning_rate": 2.5343475987332776e-09, "loss": 0.4152, "num_input_tokens_seen": 192335296, "step": 158165 }, { "epoch": 19.818318506452826, "grad_norm": 5.970742225646973, "learning_rate": 2.5169727103091514e-09, "loss": 0.5062, "num_input_tokens_seen": 192341664, "step": 158170 }, { "epoch": 19.81894499436161, "grad_norm": 5.522906303405762, "learning_rate": 2.499657570996239e-09, "loss": 0.4378, "num_input_tokens_seen": 192348224, "step": 158175 }, { "epoch": 19.81957148227039, "grad_norm": 21.73505973815918, "learning_rate": 2.4824021810021527e-09, "loss": 0.5147, "num_input_tokens_seen": 192354272, "step": 158180 }, { "epoch": 19.820197970179176, "grad_norm": 7.0778679847717285, "learning_rate": 2.465206540531728e-09, "loss": 0.4269, "num_input_tokens_seen": 192360480, "step": 158185 }, { "epoch": 19.82082445808796, "grad_norm": 24.9912052154541, "learning_rate": 2.4480706497920224e-09, "loss": 0.4607, "num_input_tokens_seen": 192366464, "step": 158190 }, { "epoch": 19.82145094599674, "grad_norm": 6.336816787719727, "learning_rate": 2.4309945089873164e-09, "loss": 0.4222, "num_input_tokens_seen": 192372352, "step": 158195 }, { "epoch": 19.822077433905527, "grad_norm": 5.583441257476807, "learning_rate": 2.4139781183218915e-09, "loss": 0.524, "num_input_tokens_seen": 192378336, "step": 158200 }, { "epoch": 19.82270392181431, "grad_norm": 4.807180881500244, "learning_rate": 2.3970214779989178e-09, "loss": 0.3552, "num_input_tokens_seen": 192384480, "step": 158205 }, { "epoch": 19.82333040972309, "grad_norm": 6.812119483947754, "learning_rate": 2.380124588221011e-09, "loss": 0.488, "num_input_tokens_seen": 192390336, "step": 158210 }, { "epoch": 19.823956897631877, "grad_norm": 10.69944953918457, "learning_rate": 2.3632874491907874e-09, "loss": 0.5144, "num_input_tokens_seen": 192396480, "step": 158215 }, { "epoch": 19.82458338554066, "grad_norm": 24.17329978942871, "learning_rate": 2.3465100611091974e-09, "loss": 0.4289, "num_input_tokens_seen": 192402816, "step": 158220 }, { "epoch": 19.82520987344944, "grad_norm": 4.4290266036987305, "learning_rate": 2.3297924241771906e-09, "loss": 0.4602, "num_input_tokens_seen": 192409024, "step": 158225 }, { "epoch": 19.825836361358228, "grad_norm": 3.4510176181793213, "learning_rate": 2.313134538594053e-09, "loss": 0.3897, "num_input_tokens_seen": 192415104, "step": 158230 }, { "epoch": 19.82646284926701, "grad_norm": 6.451815605163574, "learning_rate": 2.296536404559069e-09, "loss": 0.3827, "num_input_tokens_seen": 192421152, "step": 158235 }, { "epoch": 19.827089337175792, "grad_norm": 4.8913469314575195, "learning_rate": 2.2799980222709684e-09, "loss": 0.4176, "num_input_tokens_seen": 192427584, "step": 158240 }, { "epoch": 19.827715825084574, "grad_norm": 23.758533477783203, "learning_rate": 2.2635193919273714e-09, "loss": 0.4441, "num_input_tokens_seen": 192433792, "step": 158245 }, { "epoch": 19.82834231299336, "grad_norm": 11.262433052062988, "learning_rate": 2.247100513724787e-09, "loss": 0.4654, "num_input_tokens_seen": 192439872, "step": 158250 }, { "epoch": 19.828968800902143, "grad_norm": 6.076200008392334, "learning_rate": 2.230741387860835e-09, "loss": 0.427, "num_input_tokens_seen": 192445920, "step": 158255 }, { "epoch": 19.829595288810925, "grad_norm": 6.45424747467041, "learning_rate": 2.214442014529805e-09, "loss": 0.5329, "num_input_tokens_seen": 192451968, "step": 158260 }, { "epoch": 19.83022177671971, "grad_norm": 4.209254741668701, "learning_rate": 2.1982023939270956e-09, "loss": 0.3877, "num_input_tokens_seen": 192457760, "step": 158265 }, { "epoch": 19.830848264628493, "grad_norm": 21.03875732421875, "learning_rate": 2.182022526246441e-09, "loss": 0.4313, "num_input_tokens_seen": 192464064, "step": 158270 }, { "epoch": 19.831474752537275, "grad_norm": 5.656696319580078, "learning_rate": 2.1659024116821304e-09, "loss": 0.52, "num_input_tokens_seen": 192470208, "step": 158275 }, { "epoch": 19.83210124044606, "grad_norm": 10.965105056762695, "learning_rate": 2.149842050426232e-09, "loss": 0.4331, "num_input_tokens_seen": 192476288, "step": 158280 }, { "epoch": 19.832727728354843, "grad_norm": 27.053394317626953, "learning_rate": 2.1338414426708144e-09, "loss": 0.4773, "num_input_tokens_seen": 192482368, "step": 158285 }, { "epoch": 19.833354216263626, "grad_norm": 33.97170639038086, "learning_rate": 2.1179005886073912e-09, "loss": 0.4276, "num_input_tokens_seen": 192488736, "step": 158290 }, { "epoch": 19.833980704172408, "grad_norm": 9.311650276184082, "learning_rate": 2.1020194884269206e-09, "loss": 0.3462, "num_input_tokens_seen": 192495104, "step": 158295 }, { "epoch": 19.834607192081194, "grad_norm": 9.36680793762207, "learning_rate": 2.086198142318141e-09, "loss": 0.4345, "num_input_tokens_seen": 192501408, "step": 158300 }, { "epoch": 19.835233679989976, "grad_norm": 8.065895080566406, "learning_rate": 2.0704365504709e-09, "loss": 0.5188, "num_input_tokens_seen": 192507648, "step": 158305 }, { "epoch": 19.83586016789876, "grad_norm": 8.159979820251465, "learning_rate": 2.054734713073936e-09, "loss": 0.4069, "num_input_tokens_seen": 192513920, "step": 158310 }, { "epoch": 19.836486655807544, "grad_norm": 10.319567680358887, "learning_rate": 2.039092630314876e-09, "loss": 0.4173, "num_input_tokens_seen": 192520000, "step": 158315 }, { "epoch": 19.837113143716326, "grad_norm": 12.642577171325684, "learning_rate": 2.023510302380238e-09, "loss": 0.4483, "num_input_tokens_seen": 192526016, "step": 158320 }, { "epoch": 19.83773963162511, "grad_norm": 6.330848217010498, "learning_rate": 2.0079877294565397e-09, "loss": 0.4531, "num_input_tokens_seen": 192532000, "step": 158325 }, { "epoch": 19.838366119533895, "grad_norm": 7.208119869232178, "learning_rate": 1.9925249117291878e-09, "loss": 0.4137, "num_input_tokens_seen": 192538016, "step": 158330 }, { "epoch": 19.838992607442677, "grad_norm": 7.418776988983154, "learning_rate": 1.9771218493835896e-09, "loss": 0.5361, "num_input_tokens_seen": 192544128, "step": 158335 }, { "epoch": 19.83961909535146, "grad_norm": 5.672791957855225, "learning_rate": 1.9617785426034876e-09, "loss": 0.4019, "num_input_tokens_seen": 192550464, "step": 158340 }, { "epoch": 19.84024558326024, "grad_norm": 12.880337715148926, "learning_rate": 1.9464949915726226e-09, "loss": 0.418, "num_input_tokens_seen": 192556768, "step": 158345 }, { "epoch": 19.840872071169027, "grad_norm": 7.614747524261475, "learning_rate": 1.9312711964736274e-09, "loss": 0.4584, "num_input_tokens_seen": 192562848, "step": 158350 }, { "epoch": 19.84149855907781, "grad_norm": 6.039300441741943, "learning_rate": 1.916107157488023e-09, "loss": 0.4703, "num_input_tokens_seen": 192568800, "step": 158355 }, { "epoch": 19.84212504698659, "grad_norm": 12.341655731201172, "learning_rate": 1.901002874797886e-09, "loss": 0.4391, "num_input_tokens_seen": 192574784, "step": 158360 }, { "epoch": 19.842751534895378, "grad_norm": 5.0332350730896, "learning_rate": 1.8859583485830726e-09, "loss": 0.4346, "num_input_tokens_seen": 192580704, "step": 158365 }, { "epoch": 19.84337802280416, "grad_norm": 7.937411785125732, "learning_rate": 1.8709735790245485e-09, "loss": 0.3978, "num_input_tokens_seen": 192586848, "step": 158370 }, { "epoch": 19.844004510712942, "grad_norm": 5.36670446395874, "learning_rate": 1.8560485662999507e-09, "loss": 0.4651, "num_input_tokens_seen": 192592896, "step": 158375 }, { "epoch": 19.844630998621728, "grad_norm": 6.43657922744751, "learning_rate": 1.841183310588579e-09, "loss": 0.4225, "num_input_tokens_seen": 192599072, "step": 158380 }, { "epoch": 19.84525748653051, "grad_norm": 25.771522521972656, "learning_rate": 1.8263778120686249e-09, "loss": 0.4656, "num_input_tokens_seen": 192605376, "step": 158385 }, { "epoch": 19.845883974439293, "grad_norm": 10.369465827941895, "learning_rate": 1.8116320709160585e-09, "loss": 0.4783, "num_input_tokens_seen": 192610720, "step": 158390 }, { "epoch": 19.84651046234808, "grad_norm": 5.784312725067139, "learning_rate": 1.7969460873079602e-09, "loss": 0.4989, "num_input_tokens_seen": 192616800, "step": 158395 }, { "epoch": 19.84713695025686, "grad_norm": 18.824068069458008, "learning_rate": 1.7823198614191906e-09, "loss": 0.439, "num_input_tokens_seen": 192622816, "step": 158400 }, { "epoch": 19.847763438165643, "grad_norm": 5.439582347869873, "learning_rate": 1.7677533934257195e-09, "loss": 0.4101, "num_input_tokens_seen": 192628992, "step": 158405 }, { "epoch": 19.848389926074425, "grad_norm": 12.134119033813477, "learning_rate": 1.753246683500187e-09, "loss": 0.4028, "num_input_tokens_seen": 192635104, "step": 158410 }, { "epoch": 19.84901641398321, "grad_norm": 8.853924751281738, "learning_rate": 1.7387997318174532e-09, "loss": 0.4361, "num_input_tokens_seen": 192641248, "step": 158415 }, { "epoch": 19.849642901891993, "grad_norm": 9.157885551452637, "learning_rate": 1.7244125385496025e-09, "loss": 0.5111, "num_input_tokens_seen": 192647072, "step": 158420 }, { "epoch": 19.850269389800776, "grad_norm": 12.046908378601074, "learning_rate": 1.7100851038687193e-09, "loss": 0.5339, "num_input_tokens_seen": 192652960, "step": 158425 }, { "epoch": 19.85089587770956, "grad_norm": 8.886137962341309, "learning_rate": 1.6958174279463334e-09, "loss": 0.4989, "num_input_tokens_seen": 192659008, "step": 158430 }, { "epoch": 19.851522365618344, "grad_norm": 6.848809242248535, "learning_rate": 1.6816095109523089e-09, "loss": 0.4256, "num_input_tokens_seen": 192665024, "step": 158435 }, { "epoch": 19.852148853527126, "grad_norm": 18.76795196533203, "learning_rate": 1.66746135305762e-09, "loss": 0.448, "num_input_tokens_seen": 192671008, "step": 158440 }, { "epoch": 19.852775341435912, "grad_norm": 25.071826934814453, "learning_rate": 1.6533729544299105e-09, "loss": 0.5008, "num_input_tokens_seen": 192677024, "step": 158445 }, { "epoch": 19.853401829344694, "grad_norm": 4.567586898803711, "learning_rate": 1.6393443152390442e-09, "loss": 0.3905, "num_input_tokens_seen": 192683328, "step": 158450 }, { "epoch": 19.854028317253476, "grad_norm": 4.931792736053467, "learning_rate": 1.6253754356521102e-09, "loss": 0.4486, "num_input_tokens_seen": 192689472, "step": 158455 }, { "epoch": 19.85465480516226, "grad_norm": 5.880378246307373, "learning_rate": 1.6114663158367516e-09, "loss": 0.4125, "num_input_tokens_seen": 192695520, "step": 158460 }, { "epoch": 19.855281293071045, "grad_norm": 4.879729270935059, "learning_rate": 1.597616955957837e-09, "loss": 0.4404, "num_input_tokens_seen": 192701696, "step": 158465 }, { "epoch": 19.855907780979827, "grad_norm": 22.14920425415039, "learning_rate": 1.5838273561824547e-09, "loss": 0.4336, "num_input_tokens_seen": 192707712, "step": 158470 }, { "epoch": 19.85653426888861, "grad_norm": 9.875364303588867, "learning_rate": 1.5700975166749177e-09, "loss": 0.4533, "num_input_tokens_seen": 192713920, "step": 158475 }, { "epoch": 19.857160756797395, "grad_norm": 6.162812232971191, "learning_rate": 1.5564274375989841e-09, "loss": 0.4712, "num_input_tokens_seen": 192720032, "step": 158480 }, { "epoch": 19.857787244706177, "grad_norm": 17.0330753326416, "learning_rate": 1.5428171191184116e-09, "loss": 0.4679, "num_input_tokens_seen": 192726208, "step": 158485 }, { "epoch": 19.85841373261496, "grad_norm": 8.149197578430176, "learning_rate": 1.5292665613958479e-09, "loss": 0.3727, "num_input_tokens_seen": 192732640, "step": 158490 }, { "epoch": 19.859040220523745, "grad_norm": 7.195271968841553, "learning_rate": 1.5157757645933857e-09, "loss": 0.4205, "num_input_tokens_seen": 192739136, "step": 158495 }, { "epoch": 19.859666708432528, "grad_norm": 16.698373794555664, "learning_rate": 1.5023447288725623e-09, "loss": 0.4348, "num_input_tokens_seen": 192745248, "step": 158500 }, { "epoch": 19.86029319634131, "grad_norm": 6.725940704345703, "learning_rate": 1.488973454393805e-09, "loss": 0.392, "num_input_tokens_seen": 192751584, "step": 158505 }, { "epoch": 19.860919684250096, "grad_norm": 11.212739944458008, "learning_rate": 1.4756619413164309e-09, "loss": 0.4394, "num_input_tokens_seen": 192757824, "step": 158510 }, { "epoch": 19.861546172158878, "grad_norm": 4.571922302246094, "learning_rate": 1.4624101898003117e-09, "loss": 0.3962, "num_input_tokens_seen": 192763552, "step": 158515 }, { "epoch": 19.86217266006766, "grad_norm": 15.512025833129883, "learning_rate": 1.4492182000036548e-09, "loss": 0.4774, "num_input_tokens_seen": 192769696, "step": 158520 }, { "epoch": 19.862799147976443, "grad_norm": 20.837879180908203, "learning_rate": 1.4360859720841113e-09, "loss": 0.4473, "num_input_tokens_seen": 192775552, "step": 158525 }, { "epoch": 19.86342563588523, "grad_norm": 4.158692836761475, "learning_rate": 1.4230135061987783e-09, "loss": 0.4491, "num_input_tokens_seen": 192781696, "step": 158530 }, { "epoch": 19.86405212379401, "grad_norm": 9.016956329345703, "learning_rate": 1.4100008025041966e-09, "loss": 0.5433, "num_input_tokens_seen": 192788288, "step": 158535 }, { "epoch": 19.864678611702793, "grad_norm": 11.790961265563965, "learning_rate": 1.397047861155243e-09, "loss": 0.403, "num_input_tokens_seen": 192794144, "step": 158540 }, { "epoch": 19.86530509961158, "grad_norm": 6.121182441711426, "learning_rate": 1.384154682306793e-09, "loss": 0.399, "num_input_tokens_seen": 192800384, "step": 158545 }, { "epoch": 19.86593158752036, "grad_norm": 15.597559928894043, "learning_rate": 1.3713212661142784e-09, "loss": 0.4159, "num_input_tokens_seen": 192807008, "step": 158550 }, { "epoch": 19.866558075429143, "grad_norm": 17.05472755432129, "learning_rate": 1.3585476127297991e-09, "loss": 0.4557, "num_input_tokens_seen": 192813408, "step": 158555 }, { "epoch": 19.86718456333793, "grad_norm": 4.487260341644287, "learning_rate": 1.3458337223060113e-09, "loss": 0.3938, "num_input_tokens_seen": 192819392, "step": 158560 }, { "epoch": 19.86781105124671, "grad_norm": 5.929135322570801, "learning_rate": 1.3331795949961258e-09, "loss": 0.4464, "num_input_tokens_seen": 192825568, "step": 158565 }, { "epoch": 19.868437539155494, "grad_norm": 4.200812339782715, "learning_rate": 1.3205852309505773e-09, "loss": 0.427, "num_input_tokens_seen": 192831968, "step": 158570 }, { "epoch": 19.869064027064276, "grad_norm": 7.7408366203308105, "learning_rate": 1.3080506303198014e-09, "loss": 0.431, "num_input_tokens_seen": 192837024, "step": 158575 }, { "epoch": 19.869690514973062, "grad_norm": 7.713889122009277, "learning_rate": 1.2955757932542334e-09, "loss": 0.4376, "num_input_tokens_seen": 192843264, "step": 158580 }, { "epoch": 19.870317002881844, "grad_norm": 6.330906867980957, "learning_rate": 1.283160719902643e-09, "loss": 0.4382, "num_input_tokens_seen": 192849408, "step": 158585 }, { "epoch": 19.870943490790626, "grad_norm": 6.545017719268799, "learning_rate": 1.270805410413245e-09, "loss": 0.4111, "num_input_tokens_seen": 192854944, "step": 158590 }, { "epoch": 19.871569978699412, "grad_norm": 10.343372344970703, "learning_rate": 1.2585098649348092e-09, "loss": 0.4341, "num_input_tokens_seen": 192861120, "step": 158595 }, { "epoch": 19.872196466608194, "grad_norm": 7.679712772369385, "learning_rate": 1.246274083612775e-09, "loss": 0.3988, "num_input_tokens_seen": 192867104, "step": 158600 }, { "epoch": 19.872822954516977, "grad_norm": 7.946930885314941, "learning_rate": 1.2340980665948022e-09, "loss": 0.4452, "num_input_tokens_seen": 192873184, "step": 158605 }, { "epoch": 19.873449442425763, "grad_norm": 5.130829811096191, "learning_rate": 1.2219818140257745e-09, "loss": 0.4201, "num_input_tokens_seen": 192879456, "step": 158610 }, { "epoch": 19.874075930334545, "grad_norm": 10.524106979370117, "learning_rate": 1.2099253260505762e-09, "loss": 0.4281, "num_input_tokens_seen": 192885664, "step": 158615 }, { "epoch": 19.874702418243327, "grad_norm": 5.77435302734375, "learning_rate": 1.1979286028135361e-09, "loss": 0.4468, "num_input_tokens_seen": 192891840, "step": 158620 }, { "epoch": 19.875328906152113, "grad_norm": 4.072988033294678, "learning_rate": 1.1859916444584285e-09, "loss": 0.4513, "num_input_tokens_seen": 192898176, "step": 158625 }, { "epoch": 19.875955394060895, "grad_norm": 7.553695201873779, "learning_rate": 1.1741144511268065e-09, "loss": 0.4807, "num_input_tokens_seen": 192904416, "step": 158630 }, { "epoch": 19.876581881969678, "grad_norm": 6.633865833282471, "learning_rate": 1.162297022961889e-09, "loss": 0.3961, "num_input_tokens_seen": 192910432, "step": 158635 }, { "epoch": 19.87720836987846, "grad_norm": 24.9422664642334, "learning_rate": 1.1505393601046744e-09, "loss": 0.492, "num_input_tokens_seen": 192916192, "step": 158640 }, { "epoch": 19.877834857787246, "grad_norm": 4.206384181976318, "learning_rate": 1.1388414626950507e-09, "loss": 0.4292, "num_input_tokens_seen": 192922144, "step": 158645 }, { "epoch": 19.878461345696028, "grad_norm": 5.710330009460449, "learning_rate": 1.127203330874016e-09, "loss": 0.4695, "num_input_tokens_seen": 192928288, "step": 158650 }, { "epoch": 19.87908783360481, "grad_norm": 7.070052623748779, "learning_rate": 1.1156249647797934e-09, "loss": 0.4068, "num_input_tokens_seen": 192933984, "step": 158655 }, { "epoch": 19.879714321513596, "grad_norm": 14.189857482910156, "learning_rate": 1.1041063645506057e-09, "loss": 0.4645, "num_input_tokens_seen": 192940352, "step": 158660 }, { "epoch": 19.88034080942238, "grad_norm": 6.0567121505737305, "learning_rate": 1.0926475303252304e-09, "loss": 0.4767, "num_input_tokens_seen": 192946528, "step": 158665 }, { "epoch": 19.88096729733116, "grad_norm": 14.672270774841309, "learning_rate": 1.0812484622402253e-09, "loss": 0.4276, "num_input_tokens_seen": 192952544, "step": 158670 }, { "epoch": 19.881593785239946, "grad_norm": 12.677456855773926, "learning_rate": 1.0699091604310375e-09, "loss": 0.4902, "num_input_tokens_seen": 192958624, "step": 158675 }, { "epoch": 19.88222027314873, "grad_norm": 18.1143741607666, "learning_rate": 1.0586296250347794e-09, "loss": 0.587, "num_input_tokens_seen": 192964928, "step": 158680 }, { "epoch": 19.88284676105751, "grad_norm": 13.732945442199707, "learning_rate": 1.047409856185233e-09, "loss": 0.4587, "num_input_tokens_seen": 192970944, "step": 158685 }, { "epoch": 19.883473248966293, "grad_norm": 19.350339889526367, "learning_rate": 1.0362498540161802e-09, "loss": 0.4642, "num_input_tokens_seen": 192977152, "step": 158690 }, { "epoch": 19.88409973687508, "grad_norm": 24.602500915527344, "learning_rate": 1.0251496186625131e-09, "loss": 0.506, "num_input_tokens_seen": 192983264, "step": 158695 }, { "epoch": 19.88472622478386, "grad_norm": 6.910403728485107, "learning_rate": 1.014109150255793e-09, "loss": 0.4359, "num_input_tokens_seen": 192989408, "step": 158700 }, { "epoch": 19.885352712692644, "grad_norm": 6.482200622558594, "learning_rate": 1.0031284489281368e-09, "loss": 0.4095, "num_input_tokens_seen": 192995520, "step": 158705 }, { "epoch": 19.88597920060143, "grad_norm": 4.590009689331055, "learning_rate": 9.922075148111055e-10, "loss": 0.4376, "num_input_tokens_seen": 193001632, "step": 158710 }, { "epoch": 19.886605688510212, "grad_norm": 4.781497955322266, "learning_rate": 9.813463480351503e-10, "loss": 0.3875, "num_input_tokens_seen": 193007360, "step": 158715 }, { "epoch": 19.887232176418994, "grad_norm": 5.276088237762451, "learning_rate": 9.705449487301677e-10, "loss": 0.4666, "num_input_tokens_seen": 193013792, "step": 158720 }, { "epoch": 19.88785866432778, "grad_norm": 7.170138835906982, "learning_rate": 9.598033170254983e-10, "loss": 0.416, "num_input_tokens_seen": 193020096, "step": 158725 }, { "epoch": 19.888485152236562, "grad_norm": 22.61029052734375, "learning_rate": 9.49121453048818e-10, "loss": 0.3983, "num_input_tokens_seen": 193026272, "step": 158730 }, { "epoch": 19.889111640145344, "grad_norm": 22.345996856689453, "learning_rate": 9.384993569289125e-10, "loss": 0.415, "num_input_tokens_seen": 193032320, "step": 158735 }, { "epoch": 19.889738128054127, "grad_norm": 12.007274627685547, "learning_rate": 9.279370287917922e-10, "loss": 0.4525, "num_input_tokens_seen": 193038400, "step": 158740 }, { "epoch": 19.890364615962913, "grad_norm": 5.7820329666137695, "learning_rate": 9.174344687645775e-10, "loss": 0.4563, "num_input_tokens_seen": 193044416, "step": 158745 }, { "epoch": 19.890991103871695, "grad_norm": 5.452436447143555, "learning_rate": 9.069916769721687e-10, "loss": 0.4162, "num_input_tokens_seen": 193050304, "step": 158750 }, { "epoch": 19.891617591780477, "grad_norm": 3.98352313041687, "learning_rate": 8.966086535394658e-10, "loss": 0.412, "num_input_tokens_seen": 193056288, "step": 158755 }, { "epoch": 19.892244079689263, "grad_norm": 24.84955406188965, "learning_rate": 8.862853985913688e-10, "loss": 0.4916, "num_input_tokens_seen": 193062144, "step": 158760 }, { "epoch": 19.892870567598045, "grad_norm": 8.411011695861816, "learning_rate": 8.760219122505575e-10, "loss": 0.4114, "num_input_tokens_seen": 193068224, "step": 158765 }, { "epoch": 19.893497055506828, "grad_norm": 10.00527572631836, "learning_rate": 8.658181946397115e-10, "loss": 0.4244, "num_input_tokens_seen": 193074624, "step": 158770 }, { "epoch": 19.894123543415613, "grad_norm": 7.502540588378906, "learning_rate": 8.556742458809553e-10, "loss": 0.4324, "num_input_tokens_seen": 193080608, "step": 158775 }, { "epoch": 19.894750031324396, "grad_norm": 11.673087120056152, "learning_rate": 8.455900660958582e-10, "loss": 0.4119, "num_input_tokens_seen": 193086688, "step": 158780 }, { "epoch": 19.895376519233178, "grad_norm": 6.146397590637207, "learning_rate": 8.355656554048797e-10, "loss": 0.4427, "num_input_tokens_seen": 193092992, "step": 158785 }, { "epoch": 19.896003007141964, "grad_norm": 11.479029655456543, "learning_rate": 8.256010139279236e-10, "loss": 0.4675, "num_input_tokens_seen": 193099040, "step": 158790 }, { "epoch": 19.896629495050746, "grad_norm": 10.495777130126953, "learning_rate": 8.156961417837838e-10, "loss": 0.4102, "num_input_tokens_seen": 193105056, "step": 158795 }, { "epoch": 19.89725598295953, "grad_norm": 24.742563247680664, "learning_rate": 8.058510390912544e-10, "loss": 0.4986, "num_input_tokens_seen": 193111424, "step": 158800 }, { "epoch": 19.89788247086831, "grad_norm": 14.07705307006836, "learning_rate": 7.960657059674636e-10, "loss": 0.4518, "num_input_tokens_seen": 193117568, "step": 158805 }, { "epoch": 19.898508958777096, "grad_norm": 11.006091117858887, "learning_rate": 7.863401425300954e-10, "loss": 0.4873, "num_input_tokens_seen": 193123552, "step": 158810 }, { "epoch": 19.89913544668588, "grad_norm": 5.315341472625732, "learning_rate": 7.766743488951677e-10, "loss": 0.4476, "num_input_tokens_seen": 193128736, "step": 158815 }, { "epoch": 19.89976193459466, "grad_norm": 9.771980285644531, "learning_rate": 7.670683251781441e-10, "loss": 0.4302, "num_input_tokens_seen": 193134784, "step": 158820 }, { "epoch": 19.900388422503447, "grad_norm": 6.77654504776001, "learning_rate": 7.575220714939324e-10, "loss": 0.414, "num_input_tokens_seen": 193141184, "step": 158825 }, { "epoch": 19.90101491041223, "grad_norm": 12.147390365600586, "learning_rate": 7.480355879563306e-10, "loss": 0.4367, "num_input_tokens_seen": 193147456, "step": 158830 }, { "epoch": 19.90164139832101, "grad_norm": 8.534059524536133, "learning_rate": 7.386088746796915e-10, "loss": 0.4583, "num_input_tokens_seen": 193153280, "step": 158835 }, { "epoch": 19.902267886229797, "grad_norm": 5.755137920379639, "learning_rate": 7.292419317755928e-10, "loss": 0.4425, "num_input_tokens_seen": 193159456, "step": 158840 }, { "epoch": 19.90289437413858, "grad_norm": 10.440753936767578, "learning_rate": 7.19934759357277e-10, "loss": 0.4248, "num_input_tokens_seen": 193165472, "step": 158845 }, { "epoch": 19.903520862047362, "grad_norm": 8.085794448852539, "learning_rate": 7.106873575346562e-10, "loss": 0.3777, "num_input_tokens_seen": 193171328, "step": 158850 }, { "epoch": 19.904147349956148, "grad_norm": 5.359994411468506, "learning_rate": 7.01499726419308e-10, "loss": 0.4043, "num_input_tokens_seen": 193177248, "step": 158855 }, { "epoch": 19.90477383786493, "grad_norm": 3.631608724594116, "learning_rate": 6.923718661205891e-10, "loss": 0.4232, "num_input_tokens_seen": 193183200, "step": 158860 }, { "epoch": 19.905400325773712, "grad_norm": 21.509340286254883, "learning_rate": 6.833037767473017e-10, "loss": 0.418, "num_input_tokens_seen": 193189152, "step": 158865 }, { "epoch": 19.906026813682494, "grad_norm": 4.892312526702881, "learning_rate": 6.742954584088024e-10, "loss": 0.398, "num_input_tokens_seen": 193195360, "step": 158870 }, { "epoch": 19.90665330159128, "grad_norm": 4.444088459014893, "learning_rate": 6.65346911212228e-10, "loss": 0.4259, "num_input_tokens_seen": 193201664, "step": 158875 }, { "epoch": 19.907279789500063, "grad_norm": 5.835570335388184, "learning_rate": 6.564581352647148e-10, "loss": 0.3965, "num_input_tokens_seen": 193207584, "step": 158880 }, { "epoch": 19.907906277408845, "grad_norm": 6.33492374420166, "learning_rate": 6.476291306722893e-10, "loss": 0.4129, "num_input_tokens_seen": 193213184, "step": 158885 }, { "epoch": 19.90853276531763, "grad_norm": 32.22099685668945, "learning_rate": 6.388598975409776e-10, "loss": 0.4515, "num_input_tokens_seen": 193219328, "step": 158890 }, { "epoch": 19.909159253226413, "grad_norm": 15.854153633117676, "learning_rate": 6.301504359751409e-10, "loss": 0.3837, "num_input_tokens_seen": 193224864, "step": 158895 }, { "epoch": 19.909785741135195, "grad_norm": 6.617839813232422, "learning_rate": 6.2150074607914e-10, "loss": 0.4866, "num_input_tokens_seen": 193231328, "step": 158900 }, { "epoch": 19.91041222904398, "grad_norm": 20.609912872314453, "learning_rate": 6.129108279562257e-10, "loss": 0.547, "num_input_tokens_seen": 193237504, "step": 158905 }, { "epoch": 19.911038716952763, "grad_norm": 7.14011812210083, "learning_rate": 6.043806817090936e-10, "loss": 0.496, "num_input_tokens_seen": 193244064, "step": 158910 }, { "epoch": 19.911665204861546, "grad_norm": 4.798267841339111, "learning_rate": 5.959103074404393e-10, "loss": 0.4354, "num_input_tokens_seen": 193250464, "step": 158915 }, { "epoch": 19.912291692770328, "grad_norm": 17.5197696685791, "learning_rate": 5.87499705250183e-10, "loss": 0.4882, "num_input_tokens_seen": 193256800, "step": 158920 }, { "epoch": 19.912918180679114, "grad_norm": 18.940065383911133, "learning_rate": 5.791488752399099e-10, "loss": 0.4891, "num_input_tokens_seen": 193262784, "step": 158925 }, { "epoch": 19.913544668587896, "grad_norm": 7.256877422332764, "learning_rate": 5.708578175095403e-10, "loss": 0.5285, "num_input_tokens_seen": 193268960, "step": 158930 }, { "epoch": 19.91417115649668, "grad_norm": 5.5839643478393555, "learning_rate": 5.626265321578838e-10, "loss": 0.4035, "num_input_tokens_seen": 193274432, "step": 158935 }, { "epoch": 19.914797644405464, "grad_norm": 6.2761921882629395, "learning_rate": 5.544550192826403e-10, "loss": 0.5229, "num_input_tokens_seen": 193280352, "step": 158940 }, { "epoch": 19.915424132314246, "grad_norm": 25.891002655029297, "learning_rate": 5.463432789826195e-10, "loss": 0.4969, "num_input_tokens_seen": 193286688, "step": 158945 }, { "epoch": 19.91605062022303, "grad_norm": 5.435074329376221, "learning_rate": 5.382913113544108e-10, "loss": 0.4557, "num_input_tokens_seen": 193292832, "step": 158950 }, { "epoch": 19.916677108131815, "grad_norm": 8.931855201721191, "learning_rate": 5.302991164940486e-10, "loss": 0.4514, "num_input_tokens_seen": 193299008, "step": 158955 }, { "epoch": 19.917303596040597, "grad_norm": 4.162511348724365, "learning_rate": 5.22366694497567e-10, "loss": 0.3636, "num_input_tokens_seen": 193305344, "step": 158960 }, { "epoch": 19.91793008394938, "grad_norm": 6.874430179595947, "learning_rate": 5.14494045459335e-10, "loss": 0.4233, "num_input_tokens_seen": 193311456, "step": 158965 }, { "epoch": 19.91855657185816, "grad_norm": 4.461735725402832, "learning_rate": 5.066811694737217e-10, "loss": 0.4011, "num_input_tokens_seen": 193317792, "step": 158970 }, { "epoch": 19.919183059766947, "grad_norm": 6.531605243682861, "learning_rate": 4.989280666339857e-10, "loss": 0.5513, "num_input_tokens_seen": 193323936, "step": 158975 }, { "epoch": 19.91980954767573, "grad_norm": 3.8161911964416504, "learning_rate": 4.912347370328308e-10, "loss": 0.4537, "num_input_tokens_seen": 193329216, "step": 158980 }, { "epoch": 19.920436035584512, "grad_norm": 8.504416465759277, "learning_rate": 4.836011807629603e-10, "loss": 0.3972, "num_input_tokens_seen": 193335200, "step": 158985 }, { "epoch": 19.921062523493298, "grad_norm": 7.262471675872803, "learning_rate": 4.760273979143026e-10, "loss": 0.4431, "num_input_tokens_seen": 193341504, "step": 158990 }, { "epoch": 19.92168901140208, "grad_norm": 4.927009105682373, "learning_rate": 4.68513388578451e-10, "loss": 0.3945, "num_input_tokens_seen": 193347616, "step": 158995 }, { "epoch": 19.922315499310862, "grad_norm": 12.932615280151367, "learning_rate": 4.610591528447783e-10, "loss": 0.4399, "num_input_tokens_seen": 193353696, "step": 159000 }, { "epoch": 19.922941987219648, "grad_norm": 7.5887675285339355, "learning_rate": 4.5366469080210253e-10, "loss": 0.4252, "num_input_tokens_seen": 193359744, "step": 159005 }, { "epoch": 19.92356847512843, "grad_norm": 4.070168972015381, "learning_rate": 4.4633000253979655e-10, "loss": 0.5518, "num_input_tokens_seen": 193365664, "step": 159010 }, { "epoch": 19.924194963037213, "grad_norm": 21.97953987121582, "learning_rate": 4.3905508814501287e-10, "loss": 0.5188, "num_input_tokens_seen": 193371392, "step": 159015 }, { "epoch": 19.924821450946, "grad_norm": 3.9928345680236816, "learning_rate": 4.3183994770434887e-10, "loss": 0.4215, "num_input_tokens_seen": 193377440, "step": 159020 }, { "epoch": 19.92544793885478, "grad_norm": 8.596457481384277, "learning_rate": 4.2468458130495714e-10, "loss": 0.4571, "num_input_tokens_seen": 193383456, "step": 159025 }, { "epoch": 19.926074426763563, "grad_norm": 7.58639669418335, "learning_rate": 4.175889890312146e-10, "loss": 0.3821, "num_input_tokens_seen": 193389632, "step": 159030 }, { "epoch": 19.926700914672345, "grad_norm": 9.30758285522461, "learning_rate": 4.105531709691635e-10, "loss": 0.397, "num_input_tokens_seen": 193395296, "step": 159035 }, { "epoch": 19.92732740258113, "grad_norm": 11.712705612182617, "learning_rate": 4.035771272020705e-10, "loss": 0.434, "num_input_tokens_seen": 193401504, "step": 159040 }, { "epoch": 19.927953890489913, "grad_norm": 4.635260581970215, "learning_rate": 3.966608578137576e-10, "loss": 0.5906, "num_input_tokens_seen": 193407712, "step": 159045 }, { "epoch": 19.928580378398696, "grad_norm": 6.176573753356934, "learning_rate": 3.8980436288693636e-10, "loss": 0.3943, "num_input_tokens_seen": 193413664, "step": 159050 }, { "epoch": 19.92920686630748, "grad_norm": 7.313301086425781, "learning_rate": 3.830076425032081e-10, "loss": 0.4304, "num_input_tokens_seen": 193419488, "step": 159055 }, { "epoch": 19.929833354216264, "grad_norm": 3.875901222229004, "learning_rate": 3.7627069674417425e-10, "loss": 0.4562, "num_input_tokens_seen": 193424896, "step": 159060 }, { "epoch": 19.930459842125046, "grad_norm": 6.585014343261719, "learning_rate": 3.69593525690326e-10, "loss": 0.4423, "num_input_tokens_seen": 193430944, "step": 159065 }, { "epoch": 19.931086330033832, "grad_norm": 3.8639907836914062, "learning_rate": 3.6297612942159944e-10, "loss": 0.4492, "num_input_tokens_seen": 193436512, "step": 159070 }, { "epoch": 19.931712817942614, "grad_norm": 5.648687839508057, "learning_rate": 3.5641850801682034e-10, "loss": 0.3902, "num_input_tokens_seen": 193442816, "step": 159075 }, { "epoch": 19.932339305851396, "grad_norm": 4.4690656661987305, "learning_rate": 3.499206615542594e-10, "loss": 0.3951, "num_input_tokens_seen": 193448928, "step": 159080 }, { "epoch": 19.93296579376018, "grad_norm": 11.595396041870117, "learning_rate": 3.4348259011218745e-10, "loss": 0.4677, "num_input_tokens_seen": 193455584, "step": 159085 }, { "epoch": 19.933592281668965, "grad_norm": 4.500514507293701, "learning_rate": 3.371042937672098e-10, "loss": 0.3975, "num_input_tokens_seen": 193461152, "step": 159090 }, { "epoch": 19.934218769577747, "grad_norm": 8.891849517822266, "learning_rate": 3.307857725953767e-10, "loss": 0.5496, "num_input_tokens_seen": 193467168, "step": 159095 }, { "epoch": 19.93484525748653, "grad_norm": 4.872588157653809, "learning_rate": 3.245270266727385e-10, "loss": 0.4537, "num_input_tokens_seen": 193473184, "step": 159100 }, { "epoch": 19.935471745395315, "grad_norm": 4.8722710609436035, "learning_rate": 3.183280560736801e-10, "loss": 0.3945, "num_input_tokens_seen": 193479296, "step": 159105 }, { "epoch": 19.936098233304097, "grad_norm": 5.73853063583374, "learning_rate": 3.121888608731416e-10, "loss": 0.444, "num_input_tokens_seen": 193485408, "step": 159110 }, { "epoch": 19.93672472121288, "grad_norm": 4.951519012451172, "learning_rate": 3.0610944114328745e-10, "loss": 0.4027, "num_input_tokens_seen": 193491776, "step": 159115 }, { "epoch": 19.937351209121665, "grad_norm": 6.062158584594727, "learning_rate": 3.000897969573924e-10, "loss": 0.4512, "num_input_tokens_seen": 193497856, "step": 159120 }, { "epoch": 19.937977697030448, "grad_norm": 14.408843040466309, "learning_rate": 2.941299283876209e-10, "loss": 0.4395, "num_input_tokens_seen": 193504096, "step": 159125 }, { "epoch": 19.93860418493923, "grad_norm": 5.475984573364258, "learning_rate": 2.8822983550502725e-10, "loss": 0.4318, "num_input_tokens_seen": 193510336, "step": 159130 }, { "epoch": 19.939230672848012, "grad_norm": 5.351689338684082, "learning_rate": 2.823895183801106e-10, "loss": 0.3794, "num_input_tokens_seen": 193516480, "step": 159135 }, { "epoch": 19.939857160756798, "grad_norm": 5.219244956970215, "learning_rate": 2.766089770828151e-10, "loss": 0.425, "num_input_tokens_seen": 193522592, "step": 159140 }, { "epoch": 19.94048364866558, "grad_norm": 7.23232889175415, "learning_rate": 2.7088821168197445e-10, "loss": 0.4208, "num_input_tokens_seen": 193528704, "step": 159145 }, { "epoch": 19.941110136574363, "grad_norm": 5.424529552459717, "learning_rate": 2.652272222464225e-10, "loss": 0.4192, "num_input_tokens_seen": 193534560, "step": 159150 }, { "epoch": 19.94173662448315, "grad_norm": 6.385147571563721, "learning_rate": 2.5962600884388287e-10, "loss": 0.4167, "num_input_tokens_seen": 193540128, "step": 159155 }, { "epoch": 19.94236311239193, "grad_norm": 5.9655070304870605, "learning_rate": 2.540845715404139e-10, "loss": 0.4162, "num_input_tokens_seen": 193546528, "step": 159160 }, { "epoch": 19.942989600300713, "grad_norm": 5.991840362548828, "learning_rate": 2.486029104037391e-10, "loss": 0.3899, "num_input_tokens_seen": 193552800, "step": 159165 }, { "epoch": 19.9436160882095, "grad_norm": 7.555107116699219, "learning_rate": 2.4318102549769627e-10, "loss": 0.4026, "num_input_tokens_seen": 193558720, "step": 159170 }, { "epoch": 19.94424257611828, "grad_norm": 20.100543975830078, "learning_rate": 2.378189168888989e-10, "loss": 0.4165, "num_input_tokens_seen": 193564416, "step": 159175 }, { "epoch": 19.944869064027063, "grad_norm": 25.0531005859375, "learning_rate": 2.3251658464007454e-10, "loss": 0.5613, "num_input_tokens_seen": 193570784, "step": 159180 }, { "epoch": 19.94549555193585, "grad_norm": 11.359522819519043, "learning_rate": 2.27274028815061e-10, "loss": 0.3918, "num_input_tokens_seen": 193576864, "step": 159185 }, { "epoch": 19.94612203984463, "grad_norm": 14.328568458557129, "learning_rate": 2.220912494765859e-10, "loss": 0.4151, "num_input_tokens_seen": 193582432, "step": 159190 }, { "epoch": 19.946748527753414, "grad_norm": 7.00479793548584, "learning_rate": 2.169682466862666e-10, "loss": 0.4146, "num_input_tokens_seen": 193588480, "step": 159195 }, { "epoch": 19.947375015662196, "grad_norm": 23.50058937072754, "learning_rate": 2.119050205062756e-10, "loss": 0.4704, "num_input_tokens_seen": 193594368, "step": 159200 }, { "epoch": 19.948001503570982, "grad_norm": 7.158909320831299, "learning_rate": 2.0690157099600982e-10, "loss": 0.4087, "num_input_tokens_seen": 193600288, "step": 159205 }, { "epoch": 19.948627991479764, "grad_norm": 15.300264358520508, "learning_rate": 2.0195789821597645e-10, "loss": 0.4629, "num_input_tokens_seen": 193606176, "step": 159210 }, { "epoch": 19.949254479388546, "grad_norm": 5.274759292602539, "learning_rate": 1.9707400222557238e-10, "loss": 0.4922, "num_input_tokens_seen": 193612288, "step": 159215 }, { "epoch": 19.949880967297332, "grad_norm": 5.73344087600708, "learning_rate": 1.9224988308252923e-10, "loss": 0.4442, "num_input_tokens_seen": 193618048, "step": 159220 }, { "epoch": 19.950507455206115, "grad_norm": 5.663008689880371, "learning_rate": 1.874855408445786e-10, "loss": 0.4108, "num_input_tokens_seen": 193623776, "step": 159225 }, { "epoch": 19.951133943114897, "grad_norm": 9.920937538146973, "learning_rate": 1.8278097556889694e-10, "loss": 0.4503, "num_input_tokens_seen": 193629952, "step": 159230 }, { "epoch": 19.951760431023683, "grad_norm": 6.342840671539307, "learning_rate": 1.7813618731155057e-10, "loss": 0.4159, "num_input_tokens_seen": 193635968, "step": 159235 }, { "epoch": 19.952386918932465, "grad_norm": 32.54093933105469, "learning_rate": 1.7355117612860572e-10, "loss": 0.4748, "num_input_tokens_seen": 193642048, "step": 159240 }, { "epoch": 19.953013406841247, "grad_norm": 4.611863613128662, "learning_rate": 1.690259420739082e-10, "loss": 0.4491, "num_input_tokens_seen": 193648480, "step": 159245 }, { "epoch": 19.953639894750033, "grad_norm": 4.2611823081970215, "learning_rate": 1.6456048520241408e-10, "loss": 0.4166, "num_input_tokens_seen": 193654336, "step": 159250 }, { "epoch": 19.954266382658815, "grad_norm": 6.2255120277404785, "learning_rate": 1.6015480556741404e-10, "loss": 0.451, "num_input_tokens_seen": 193660448, "step": 159255 }, { "epoch": 19.954892870567598, "grad_norm": 5.444931507110596, "learning_rate": 1.5580890322164366e-10, "loss": 0.4179, "num_input_tokens_seen": 193666784, "step": 159260 }, { "epoch": 19.95551935847638, "grad_norm": 13.745845794677734, "learning_rate": 1.5152277821617323e-10, "loss": 0.4259, "num_input_tokens_seen": 193673024, "step": 159265 }, { "epoch": 19.956145846385166, "grad_norm": 3.9025325775146484, "learning_rate": 1.4729643060318322e-10, "loss": 0.4109, "num_input_tokens_seen": 193679200, "step": 159270 }, { "epoch": 19.956772334293948, "grad_norm": 7.153829574584961, "learning_rate": 1.4312986043263366e-10, "loss": 0.4568, "num_input_tokens_seen": 193685568, "step": 159275 }, { "epoch": 19.95739882220273, "grad_norm": 8.722552299499512, "learning_rate": 1.390230677550397e-10, "loss": 0.4778, "num_input_tokens_seen": 193692032, "step": 159280 }, { "epoch": 19.958025310111516, "grad_norm": 5.971336364746094, "learning_rate": 1.3497605261869605e-10, "loss": 0.4273, "num_input_tokens_seen": 193698112, "step": 159285 }, { "epoch": 19.9586517980203, "grad_norm": 13.833405494689941, "learning_rate": 1.3098881507300766e-10, "loss": 0.4331, "num_input_tokens_seen": 193704416, "step": 159290 }, { "epoch": 19.95927828592908, "grad_norm": 6.4109015464782715, "learning_rate": 1.2706135516460384e-10, "loss": 0.5024, "num_input_tokens_seen": 193710240, "step": 159295 }, { "epoch": 19.959904773837867, "grad_norm": 16.073156356811523, "learning_rate": 1.2319367294066908e-10, "loss": 0.4274, "num_input_tokens_seen": 193716224, "step": 159300 }, { "epoch": 19.96053126174665, "grad_norm": 9.336320877075195, "learning_rate": 1.193857684478328e-10, "loss": 0.5217, "num_input_tokens_seen": 193722400, "step": 159305 }, { "epoch": 19.96115774965543, "grad_norm": 8.761655807495117, "learning_rate": 1.1563764173105896e-10, "loss": 0.5217, "num_input_tokens_seen": 193728608, "step": 159310 }, { "epoch": 19.961784237564213, "grad_norm": 13.547100067138672, "learning_rate": 1.1194929283586675e-10, "loss": 0.4255, "num_input_tokens_seen": 193734016, "step": 159315 }, { "epoch": 19.962410725473, "grad_norm": 3.786087989807129, "learning_rate": 1.0832072180610997e-10, "loss": 0.4219, "num_input_tokens_seen": 193740096, "step": 159320 }, { "epoch": 19.96303721338178, "grad_norm": 6.774170875549316, "learning_rate": 1.047519286845322e-10, "loss": 0.364, "num_input_tokens_seen": 193745888, "step": 159325 }, { "epoch": 19.963663701290564, "grad_norm": 5.458036422729492, "learning_rate": 1.0124291351498727e-10, "loss": 0.4818, "num_input_tokens_seen": 193752032, "step": 159330 }, { "epoch": 19.96429018919935, "grad_norm": 5.448161602020264, "learning_rate": 9.77936763379983e-11, "loss": 0.3926, "num_input_tokens_seen": 193758080, "step": 159335 }, { "epoch": 19.964916677108132, "grad_norm": 17.73199462890625, "learning_rate": 9.440421719630888e-11, "loss": 0.455, "num_input_tokens_seen": 193764128, "step": 159340 }, { "epoch": 19.965543165016914, "grad_norm": 6.118503093719482, "learning_rate": 9.107453612933192e-11, "loss": 0.4442, "num_input_tokens_seen": 193770368, "step": 159345 }, { "epoch": 19.9661696529257, "grad_norm": 10.87368106842041, "learning_rate": 8.780463317703547e-11, "loss": 0.446, "num_input_tokens_seen": 193776832, "step": 159350 }, { "epoch": 19.966796140834482, "grad_norm": 6.182219505310059, "learning_rate": 8.459450837938754e-11, "loss": 0.4661, "num_input_tokens_seen": 193782976, "step": 159355 }, { "epoch": 19.967422628743265, "grad_norm": 29.71358299255371, "learning_rate": 8.144416177358061e-11, "loss": 0.5511, "num_input_tokens_seen": 193788928, "step": 159360 }, { "epoch": 19.968049116652047, "grad_norm": 6.969724178314209, "learning_rate": 7.835359339791737e-11, "loss": 0.4295, "num_input_tokens_seen": 193794976, "step": 159365 }, { "epoch": 19.968675604560833, "grad_norm": 5.952914714813232, "learning_rate": 7.53228032895903e-11, "loss": 0.5285, "num_input_tokens_seen": 193801216, "step": 159370 }, { "epoch": 19.969302092469615, "grad_norm": 8.41026782989502, "learning_rate": 7.235179148412652e-11, "loss": 0.4587, "num_input_tokens_seen": 193807328, "step": 159375 }, { "epoch": 19.969928580378397, "grad_norm": 7.707407474517822, "learning_rate": 6.944055801705319e-11, "loss": 0.3876, "num_input_tokens_seen": 193813568, "step": 159380 }, { "epoch": 19.970555068287183, "grad_norm": 5.675724029541016, "learning_rate": 6.658910292445253e-11, "loss": 0.4339, "num_input_tokens_seen": 193819840, "step": 159385 }, { "epoch": 19.971181556195965, "grad_norm": 5.742363929748535, "learning_rate": 6.379742623852103e-11, "loss": 0.4581, "num_input_tokens_seen": 193825824, "step": 159390 }, { "epoch": 19.971808044104748, "grad_norm": 7.755104064941406, "learning_rate": 6.10655279942307e-11, "loss": 0.4128, "num_input_tokens_seen": 193832224, "step": 159395 }, { "epoch": 19.972434532013533, "grad_norm": 23.37204360961914, "learning_rate": 5.839340822322293e-11, "loss": 0.4385, "num_input_tokens_seen": 193838464, "step": 159400 }, { "epoch": 19.973061019922316, "grad_norm": 4.47263765335083, "learning_rate": 5.578106695824925e-11, "loss": 0.4213, "num_input_tokens_seen": 193843808, "step": 159405 }, { "epoch": 19.973687507831098, "grad_norm": 8.610904693603516, "learning_rate": 5.322850422984083e-11, "loss": 0.4071, "num_input_tokens_seen": 193849856, "step": 159410 }, { "epoch": 19.974313995739884, "grad_norm": 20.062145233154297, "learning_rate": 5.073572006908389e-11, "loss": 0.5067, "num_input_tokens_seen": 193856064, "step": 159415 }, { "epoch": 19.974940483648666, "grad_norm": 6.073309421539307, "learning_rate": 4.830271450539936e-11, "loss": 0.5219, "num_input_tokens_seen": 193862176, "step": 159420 }, { "epoch": 19.97556697155745, "grad_norm": 4.197552680969238, "learning_rate": 4.592948756765303e-11, "loss": 0.4167, "num_input_tokens_seen": 193868352, "step": 159425 }, { "epoch": 19.97619345946623, "grad_norm": 6.811000347137451, "learning_rate": 4.361603928526581e-11, "loss": 0.4318, "num_input_tokens_seen": 193874368, "step": 159430 }, { "epoch": 19.976819947375017, "grad_norm": 18.653268814086914, "learning_rate": 4.136236968432794e-11, "loss": 0.461, "num_input_tokens_seen": 193880256, "step": 159435 }, { "epoch": 19.9774464352838, "grad_norm": 6.658127784729004, "learning_rate": 3.916847879315011e-11, "loss": 0.4015, "num_input_tokens_seen": 193886528, "step": 159440 }, { "epoch": 19.97807292319258, "grad_norm": 15.073786735534668, "learning_rate": 3.703436663726745e-11, "loss": 0.4017, "num_input_tokens_seen": 193892800, "step": 159445 }, { "epoch": 19.978699411101367, "grad_norm": 7.940292835235596, "learning_rate": 3.4960033242770195e-11, "loss": 0.4289, "num_input_tokens_seen": 193898816, "step": 159450 }, { "epoch": 19.97932589901015, "grad_norm": 3.8062191009521484, "learning_rate": 3.294547863352815e-11, "loss": 0.4081, "num_input_tokens_seen": 193904544, "step": 159455 }, { "epoch": 19.97995238691893, "grad_norm": 3.918994903564453, "learning_rate": 3.099070283452132e-11, "loss": 0.3993, "num_input_tokens_seen": 193910656, "step": 159460 }, { "epoch": 19.980578874827717, "grad_norm": 3.4300029277801514, "learning_rate": 2.9095705868509293e-11, "loss": 0.4745, "num_input_tokens_seen": 193916800, "step": 159465 }, { "epoch": 19.9812053627365, "grad_norm": 8.896933555603027, "learning_rate": 2.7260487758806743e-11, "loss": 0.4748, "num_input_tokens_seen": 193922912, "step": 159470 }, { "epoch": 19.981831850645282, "grad_norm": 5.26328706741333, "learning_rate": 2.5485048526507906e-11, "loss": 0.414, "num_input_tokens_seen": 193928768, "step": 159475 }, { "epoch": 19.982458338554068, "grad_norm": 21.73641586303711, "learning_rate": 2.3769388193262134e-11, "loss": 0.5012, "num_input_tokens_seen": 193935040, "step": 159480 }, { "epoch": 19.98308482646285, "grad_norm": 26.905797958374023, "learning_rate": 2.211350677960855e-11, "loss": 0.4599, "num_input_tokens_seen": 193940736, "step": 159485 }, { "epoch": 19.983711314371632, "grad_norm": 24.294931411743164, "learning_rate": 2.0517404305531174e-11, "loss": 0.4544, "num_input_tokens_seen": 193946976, "step": 159490 }, { "epoch": 19.984337802280415, "grad_norm": 18.01900863647461, "learning_rate": 1.8981080789903794e-11, "loss": 0.439, "num_input_tokens_seen": 193952992, "step": 159495 }, { "epoch": 19.9849642901892, "grad_norm": 4.472496032714844, "learning_rate": 1.75045362516002e-11, "loss": 0.5266, "num_input_tokens_seen": 193958944, "step": 159500 }, { "epoch": 19.985590778097983, "grad_norm": 20.9588623046875, "learning_rate": 1.6087770707273743e-11, "loss": 0.4507, "num_input_tokens_seen": 193965280, "step": 159505 }, { "epoch": 19.986217266006765, "grad_norm": 6.272324085235596, "learning_rate": 1.4730784174132874e-11, "loss": 0.4451, "num_input_tokens_seen": 193971872, "step": 159510 }, { "epoch": 19.98684375391555, "grad_norm": 5.808809280395508, "learning_rate": 1.3433576668830939e-11, "loss": 0.4202, "num_input_tokens_seen": 193978144, "step": 159515 }, { "epoch": 19.987470241824333, "grad_norm": 8.014453887939453, "learning_rate": 1.2196148206911063e-11, "loss": 0.3962, "num_input_tokens_seen": 193984544, "step": 159520 }, { "epoch": 19.988096729733115, "grad_norm": 4.923720359802246, "learning_rate": 1.1018498802806143e-11, "loss": 0.4463, "num_input_tokens_seen": 193990784, "step": 159525 }, { "epoch": 19.9887232176419, "grad_norm": 4.3776631355285645, "learning_rate": 9.900628470393969e-12, "loss": 0.4124, "num_input_tokens_seen": 193996864, "step": 159530 }, { "epoch": 19.989349705550683, "grad_norm": 19.081954956054688, "learning_rate": 8.842537224107439e-12, "loss": 0.4444, "num_input_tokens_seen": 194003200, "step": 159535 }, { "epoch": 19.989976193459466, "grad_norm": 14.728439331054688, "learning_rate": 7.844225075048783e-12, "loss": 0.4261, "num_input_tokens_seen": 194009280, "step": 159540 }, { "epoch": 19.990602681368248, "grad_norm": 22.10728645324707, "learning_rate": 6.905692036540679e-12, "loss": 0.4537, "num_input_tokens_seen": 194015232, "step": 159545 }, { "epoch": 19.991229169277034, "grad_norm": 4.791342258453369, "learning_rate": 6.026938119130243e-12, "loss": 0.4906, "num_input_tokens_seen": 194021600, "step": 159550 }, { "epoch": 19.991855657185816, "grad_norm": 3.6100192070007324, "learning_rate": 5.207963333364596e-12, "loss": 0.4508, "num_input_tokens_seen": 194027552, "step": 159555 }, { "epoch": 19.9924821450946, "grad_norm": 9.933659553527832, "learning_rate": 4.4487676892357445e-12, "loss": 0.3964, "num_input_tokens_seen": 194033632, "step": 159560 }, { "epoch": 19.993108633003384, "grad_norm": 4.3185296058654785, "learning_rate": 3.7493511956254726e-12, "loss": 0.4179, "num_input_tokens_seen": 194040224, "step": 159565 }, { "epoch": 19.993735120912167, "grad_norm": 5.401950359344482, "learning_rate": 3.1097138608604528e-12, "loss": 0.4105, "num_input_tokens_seen": 194046272, "step": 159570 }, { "epoch": 19.99436160882095, "grad_norm": 20.489717483520508, "learning_rate": 2.5298556927122462e-12, "loss": 0.4737, "num_input_tokens_seen": 194052032, "step": 159575 }, { "epoch": 19.994988096729735, "grad_norm": 3.9944748878479004, "learning_rate": 2.0097766978421916e-12, "loss": 0.3844, "num_input_tokens_seen": 194058176, "step": 159580 }, { "epoch": 19.995614584638517, "grad_norm": 31.934825897216797, "learning_rate": 1.5494768834667384e-12, "loss": 0.4755, "num_input_tokens_seen": 194064320, "step": 159585 }, { "epoch": 19.9962410725473, "grad_norm": 12.61680793762207, "learning_rate": 1.148956253471667e-12, "loss": 0.4458, "num_input_tokens_seen": 194070336, "step": 159590 }, { "epoch": 19.99686756045608, "grad_norm": 4.564164638519287, "learning_rate": 8.08214813963204e-13, "loss": 0.3989, "num_input_tokens_seen": 194076256, "step": 159595 }, { "epoch": 19.997494048364867, "grad_norm": 5.648801326751709, "learning_rate": 5.272525682720186e-13, "loss": 0.4252, "num_input_tokens_seen": 194082496, "step": 159600 }, { "epoch": 19.99812053627365, "grad_norm": 14.03060531616211, "learning_rate": 3.060695202838915e-13, "loss": 0.434, "num_input_tokens_seen": 194088864, "step": 159605 }, { "epoch": 19.998747024182432, "grad_norm": 5.677600860595703, "learning_rate": 1.4466567277438004e-13, "loss": 0.4424, "num_input_tokens_seen": 194095008, "step": 159610 }, { "epoch": 19.999373512091218, "grad_norm": 37.083770751953125, "learning_rate": 4.304102685370737e-14, "loss": 0.4978, "num_input_tokens_seen": 194100992, "step": 159615 }, { "epoch": 20.0, "grad_norm": 5.208285808563232, "learning_rate": 1.1955841872079988e-15, "loss": 0.4003, "num_input_tokens_seen": 194107168, "step": 159620 }, { "epoch": 20.0, "eval_loss": 0.5257867574691772, "eval_runtime": 224.0578, "eval_samples_per_second": 35.62, "eval_steps_per_second": 8.908, "num_input_tokens_seen": 194107168, "step": 159620 }, { "epoch": 20.0, "num_input_tokens_seen": 194107168, "step": 159620, "total_flos": 8.740558044811493e+18, "train_loss": 0.4781737984966894, "train_runtime": 42319.7188, "train_samples_per_second": 15.087, "train_steps_per_second": 3.772 } ], "logging_steps": 5, "max_steps": 159620, "num_input_tokens_seen": 194107168, "num_train_epochs": 20, "save_steps": 15962, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8.740558044811493e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }