{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 4.0, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.008, "grad_norm": 15.009456634521484, "learning_rate": 0.0, "loss": 4.4226, "mean_token_accuracy": 0.35082489252090454, "num_tokens": 5369.0, "step": 1 }, { "epoch": 0.016, "grad_norm": 16.488649368286133, "learning_rate": 1.0526315789473684e-05, "loss": 4.4839, "mean_token_accuracy": 0.3302236273884773, "num_tokens": 11554.0, "step": 2 }, { "epoch": 0.024, "grad_norm": 16.9512882232666, "learning_rate": 2.105263157894737e-05, "loss": 4.3643, "mean_token_accuracy": 0.3471578359603882, "num_tokens": 18923.0, "step": 3 }, { "epoch": 0.032, "grad_norm": 17.006921768188477, "learning_rate": 3.157894736842105e-05, "loss": 4.3385, "mean_token_accuracy": 0.3382377028465271, "num_tokens": 25712.0, "step": 4 }, { "epoch": 0.04, "grad_norm": 16.2688045501709, "learning_rate": 4.210526315789474e-05, "loss": 4.1501, "mean_token_accuracy": 0.36794763803482056, "num_tokens": 32193.0, "step": 5 }, { "epoch": 0.048, "grad_norm": 15.383474349975586, "learning_rate": 5.2631578947368424e-05, "loss": 4.1129, "mean_token_accuracy": 0.359960176050663, "num_tokens": 38679.0, "step": 6 }, { "epoch": 0.056, "grad_norm": 13.229964256286621, "learning_rate": 6.31578947368421e-05, "loss": 3.8384, "mean_token_accuracy": 0.40359261631965637, "num_tokens": 45031.0, "step": 7 }, { "epoch": 0.064, "grad_norm": 11.782334327697754, "learning_rate": 7.368421052631579e-05, "loss": 3.6457, "mean_token_accuracy": 0.4300616532564163, "num_tokens": 50876.0, "step": 8 }, { "epoch": 0.072, "grad_norm": 11.256607055664062, "learning_rate": 8.421052631578948e-05, "loss": 3.4825, "mean_token_accuracy": 0.45046091079711914, "num_tokens": 57785.0, "step": 9 }, { "epoch": 0.08, "grad_norm": 9.428688049316406, "learning_rate": 9.473684210526316e-05, "loss": 2.9709, "mean_token_accuracy": 0.5108220353722572, "num_tokens": 65483.0, "step": 10 }, { "epoch": 0.088, "grad_norm": 6.1326470375061035, "learning_rate": 0.00010526315789473685, "loss": 2.6532, "mean_token_accuracy": 0.564396545290947, "num_tokens": 71812.0, "step": 11 }, { "epoch": 0.096, "grad_norm": 5.3833465576171875, "learning_rate": 0.00011578947368421053, "loss": 2.4163, "mean_token_accuracy": 0.5896340012550354, "num_tokens": 79158.0, "step": 12 }, { "epoch": 0.104, "grad_norm": 4.847289562225342, "learning_rate": 0.0001263157894736842, "loss": 2.2105, "mean_token_accuracy": 0.6154301911592484, "num_tokens": 85836.0, "step": 13 }, { "epoch": 0.112, "grad_norm": 3.3179337978363037, "learning_rate": 0.0001368421052631579, "loss": 2.0785, "mean_token_accuracy": 0.6191774904727936, "num_tokens": 92025.0, "step": 14 }, { "epoch": 0.12, "grad_norm": 2.9202802181243896, "learning_rate": 0.00014736842105263158, "loss": 1.9833, "mean_token_accuracy": 0.6256388574838638, "num_tokens": 97864.0, "step": 15 }, { "epoch": 0.128, "grad_norm": 2.6852591037750244, "learning_rate": 0.00015789473684210527, "loss": 1.73, "mean_token_accuracy": 0.651897981762886, "num_tokens": 104289.0, "step": 16 }, { "epoch": 0.136, "grad_norm": 2.526517391204834, "learning_rate": 0.00016842105263157895, "loss": 1.6624, "mean_token_accuracy": 0.6490602791309357, "num_tokens": 110637.0, "step": 17 }, { "epoch": 0.144, "grad_norm": 2.5168607234954834, "learning_rate": 0.00017894736842105264, "loss": 1.5195, "mean_token_accuracy": 0.6619705110788345, "num_tokens": 116530.0, "step": 18 }, { "epoch": 0.152, "grad_norm": 2.120267868041992, "learning_rate": 0.00018947368421052632, "loss": 1.5078, "mean_token_accuracy": 0.6700999438762665, "num_tokens": 123873.0, "step": 19 }, { "epoch": 0.16, "grad_norm": 1.9365595579147339, "learning_rate": 0.0002, "loss": 1.2945, "mean_token_accuracy": 0.7037462592124939, "num_tokens": 130978.0, "step": 20 }, { "epoch": 0.168, "grad_norm": 2.063713788986206, "learning_rate": 0.00019999879061093312, "loss": 1.261, "mean_token_accuracy": 0.7024079412221909, "num_tokens": 136844.0, "step": 21 }, { "epoch": 0.176, "grad_norm": 1.333803653717041, "learning_rate": 0.0001999951624762352, "loss": 1.2327, "mean_token_accuracy": 0.7176639288663864, "num_tokens": 144549.0, "step": 22 }, { "epoch": 0.184, "grad_norm": 1.3329464197158813, "learning_rate": 0.00019998911569341348, "loss": 1.0802, "mean_token_accuracy": 0.7422485947608948, "num_tokens": 152113.0, "step": 23 }, { "epoch": 0.192, "grad_norm": 1.442068338394165, "learning_rate": 0.0001999806504249771, "loss": 0.9722, "mean_token_accuracy": 0.7662369161844254, "num_tokens": 158617.0, "step": 24 }, { "epoch": 0.2, "grad_norm": 1.1058902740478516, "learning_rate": 0.00019996976689843287, "loss": 0.9377, "mean_token_accuracy": 0.7860363870859146, "num_tokens": 166178.0, "step": 25 }, { "epoch": 0.208, "grad_norm": 0.9864588975906372, "learning_rate": 0.0001999564654062789, "loss": 0.8775, "mean_token_accuracy": 0.8102598041296005, "num_tokens": 173443.0, "step": 26 }, { "epoch": 0.216, "grad_norm": 0.8829644322395325, "learning_rate": 0.00019994074630599705, "loss": 0.7958, "mean_token_accuracy": 0.8219314068555832, "num_tokens": 179516.0, "step": 27 }, { "epoch": 0.224, "grad_norm": 0.8294792771339417, "learning_rate": 0.00019992261002004294, "loss": 0.7202, "mean_token_accuracy": 0.8282240331172943, "num_tokens": 185900.0, "step": 28 }, { "epoch": 0.232, "grad_norm": 0.7125921845436096, "learning_rate": 0.00019990205703583497, "loss": 0.7831, "mean_token_accuracy": 0.8209341168403625, "num_tokens": 191944.0, "step": 29 }, { "epoch": 0.24, "grad_norm": 0.5321927070617676, "learning_rate": 0.00019987908790574104, "loss": 0.7557, "mean_token_accuracy": 0.824893981218338, "num_tokens": 199797.0, "step": 30 }, { "epoch": 0.248, "grad_norm": 0.5975250005722046, "learning_rate": 0.00019985370324706366, "loss": 0.7598, "mean_token_accuracy": 0.8194688409566879, "num_tokens": 206866.0, "step": 31 }, { "epoch": 0.256, "grad_norm": 0.4956493079662323, "learning_rate": 0.0001998259037420235, "loss": 0.762, "mean_token_accuracy": 0.8325388431549072, "num_tokens": 213177.0, "step": 32 }, { "epoch": 0.264, "grad_norm": 0.42859816551208496, "learning_rate": 0.00019979569013774093, "loss": 0.7826, "mean_token_accuracy": 0.8148715794086456, "num_tokens": 220005.0, "step": 33 }, { "epoch": 0.272, "grad_norm": 0.37347087264060974, "learning_rate": 0.000199763063246216, "loss": 0.7126, "mean_token_accuracy": 0.8346025496721268, "num_tokens": 227056.0, "step": 34 }, { "epoch": 0.28, "grad_norm": 0.3506178557872772, "learning_rate": 0.00019972802394430664, "loss": 0.6698, "mean_token_accuracy": 0.8443764001131058, "num_tokens": 234069.0, "step": 35 }, { "epoch": 0.288, "grad_norm": 0.41187265515327454, "learning_rate": 0.00019969057317370504, "loss": 0.8063, "mean_token_accuracy": 0.8158510625362396, "num_tokens": 241437.0, "step": 36 }, { "epoch": 0.296, "grad_norm": 0.37258651852607727, "learning_rate": 0.00019965071194091237, "loss": 0.749, "mean_token_accuracy": 0.8245450109243393, "num_tokens": 248270.0, "step": 37 }, { "epoch": 0.304, "grad_norm": 0.416504442691803, "learning_rate": 0.00019960844131721171, "loss": 0.7077, "mean_token_accuracy": 0.8351310938596725, "num_tokens": 255170.0, "step": 38 }, { "epoch": 0.312, "grad_norm": 0.3824760615825653, "learning_rate": 0.00019956376243863926, "loss": 0.6536, "mean_token_accuracy": 0.8547748029232025, "num_tokens": 262614.0, "step": 39 }, { "epoch": 0.32, "grad_norm": 0.3614872992038727, "learning_rate": 0.00019951667650595388, "loss": 0.7179, "mean_token_accuracy": 0.8378085196018219, "num_tokens": 269100.0, "step": 40 }, { "epoch": 0.328, "grad_norm": 0.29622912406921387, "learning_rate": 0.00019946718478460474, "loss": 0.5958, "mean_token_accuracy": 0.8569624125957489, "num_tokens": 275562.0, "step": 41 }, { "epoch": 0.336, "grad_norm": 0.3648947477340698, "learning_rate": 0.0001994152886046973, "loss": 0.674, "mean_token_accuracy": 0.8447534888982773, "num_tokens": 281982.0, "step": 42 }, { "epoch": 0.344, "grad_norm": 0.349088579416275, "learning_rate": 0.00019936098936095765, "loss": 0.698, "mean_token_accuracy": 0.8474298566579819, "num_tokens": 289174.0, "step": 43 }, { "epoch": 0.352, "grad_norm": 0.32133880257606506, "learning_rate": 0.00019930428851269488, "loss": 0.624, "mean_token_accuracy": 0.853282168507576, "num_tokens": 294854.0, "step": 44 }, { "epoch": 0.36, "grad_norm": 0.30549532175064087, "learning_rate": 0.00019924518758376208, "loss": 0.6579, "mean_token_accuracy": 0.8428997695446014, "num_tokens": 301295.0, "step": 45 }, { "epoch": 0.368, "grad_norm": 0.3237819969654083, "learning_rate": 0.00019918368816251514, "loss": 0.6883, "mean_token_accuracy": 0.8418869972229004, "num_tokens": 307638.0, "step": 46 }, { "epoch": 0.376, "grad_norm": 0.2714066207408905, "learning_rate": 0.00019911979190177028, "loss": 0.6979, "mean_token_accuracy": 0.8324394673109055, "num_tokens": 315370.0, "step": 47 }, { "epoch": 0.384, "grad_norm": 0.3279217779636383, "learning_rate": 0.0001990535005187594, "loss": 0.583, "mean_token_accuracy": 0.8681101649999619, "num_tokens": 320859.0, "step": 48 }, { "epoch": 0.392, "grad_norm": 0.26718366146087646, "learning_rate": 0.00019898481579508421, "loss": 0.68, "mean_token_accuracy": 0.8338876813650131, "num_tokens": 328079.0, "step": 49 }, { "epoch": 0.4, "grad_norm": 0.3171955347061157, "learning_rate": 0.0001989137395766681, "loss": 0.6446, "mean_token_accuracy": 0.8496119379997253, "num_tokens": 334446.0, "step": 50 }, { "epoch": 0.408, "grad_norm": 0.30903536081314087, "learning_rate": 0.00019884027377370668, "loss": 0.6508, "mean_token_accuracy": 0.8466218858957291, "num_tokens": 341387.0, "step": 51 }, { "epoch": 0.416, "grad_norm": 0.30890390276908875, "learning_rate": 0.0001987644203606164, "loss": 0.7142, "mean_token_accuracy": 0.8338942974805832, "num_tokens": 348546.0, "step": 52 }, { "epoch": 0.424, "grad_norm": 0.27779823541641235, "learning_rate": 0.00019868618137598132, "loss": 0.5887, "mean_token_accuracy": 0.8586048036813736, "num_tokens": 354727.0, "step": 53 }, { "epoch": 0.432, "grad_norm": 0.3139258921146393, "learning_rate": 0.00019860555892249875, "loss": 0.7511, "mean_token_accuracy": 0.8265256285667419, "num_tokens": 361770.0, "step": 54 }, { "epoch": 0.44, "grad_norm": 0.3174983263015747, "learning_rate": 0.00019852255516692225, "loss": 0.6337, "mean_token_accuracy": 0.853050097823143, "num_tokens": 368538.0, "step": 55 }, { "epoch": 0.448, "grad_norm": 0.3364870250225067, "learning_rate": 0.00019843717234000374, "loss": 0.7523, "mean_token_accuracy": 0.8259449601173401, "num_tokens": 375368.0, "step": 56 }, { "epoch": 0.456, "grad_norm": 0.37718722224235535, "learning_rate": 0.00019834941273643336, "loss": 0.7322, "mean_token_accuracy": 0.8312457799911499, "num_tokens": 381878.0, "step": 57 }, { "epoch": 0.464, "grad_norm": 0.33022674918174744, "learning_rate": 0.0001982592787147779, "loss": 0.6747, "mean_token_accuracy": 0.8385090231895447, "num_tokens": 389449.0, "step": 58 }, { "epoch": 0.472, "grad_norm": 0.43705055117607117, "learning_rate": 0.00019816677269741733, "loss": 0.6655, "mean_token_accuracy": 0.8439074158668518, "num_tokens": 396114.0, "step": 59 }, { "epoch": 0.48, "grad_norm": 0.3090667128562927, "learning_rate": 0.00019807189717047986, "loss": 0.6475, "mean_token_accuracy": 0.8441413044929504, "num_tokens": 403645.0, "step": 60 }, { "epoch": 0.488, "grad_norm": 0.2958497107028961, "learning_rate": 0.0001979746546837749, "loss": 0.7174, "mean_token_accuracy": 0.8332030922174454, "num_tokens": 411503.0, "step": 61 }, { "epoch": 0.496, "grad_norm": 0.31799939274787903, "learning_rate": 0.00019787504785072463, "loss": 0.6776, "mean_token_accuracy": 0.845548689365387, "num_tokens": 418460.0, "step": 62 }, { "epoch": 0.504, "grad_norm": 0.30380430817604065, "learning_rate": 0.0001977730793482939, "loss": 0.5266, "mean_token_accuracy": 0.8771664351224899, "num_tokens": 424931.0, "step": 63 }, { "epoch": 0.512, "grad_norm": 0.3135907053947449, "learning_rate": 0.00019766875191691802, "loss": 0.6084, "mean_token_accuracy": 0.8567227721214294, "num_tokens": 431025.0, "step": 64 }, { "epoch": 0.52, "grad_norm": 0.34858691692352295, "learning_rate": 0.00019756206836042938, "loss": 0.5781, "mean_token_accuracy": 0.8666173219680786, "num_tokens": 436638.0, "step": 65 }, { "epoch": 0.528, "grad_norm": 0.28917914628982544, "learning_rate": 0.00019745303154598186, "loss": 0.4806, "mean_token_accuracy": 0.8829390704631805, "num_tokens": 442397.0, "step": 66 }, { "epoch": 0.536, "grad_norm": 0.457060307264328, "learning_rate": 0.00019734164440397397, "loss": 0.7192, "mean_token_accuracy": 0.8350439369678497, "num_tokens": 449304.0, "step": 67 }, { "epoch": 0.544, "grad_norm": 0.32380974292755127, "learning_rate": 0.00019722790992796995, "loss": 0.6501, "mean_token_accuracy": 0.8544299602508545, "num_tokens": 455910.0, "step": 68 }, { "epoch": 0.552, "grad_norm": 0.2932593524456024, "learning_rate": 0.00019711183117461942, "loss": 0.5909, "mean_token_accuracy": 0.8556720912456512, "num_tokens": 462281.0, "step": 69 }, { "epoch": 0.56, "grad_norm": 0.3595532774925232, "learning_rate": 0.00019699341126357513, "loss": 0.6598, "mean_token_accuracy": 0.8415083587169647, "num_tokens": 469266.0, "step": 70 }, { "epoch": 0.568, "grad_norm": 0.30987825989723206, "learning_rate": 0.0001968726533774092, "loss": 0.682, "mean_token_accuracy": 0.8381143063306808, "num_tokens": 476358.0, "step": 71 }, { "epoch": 0.576, "grad_norm": 0.3039684593677521, "learning_rate": 0.00019674956076152762, "loss": 0.5798, "mean_token_accuracy": 0.8601821660995483, "num_tokens": 483042.0, "step": 72 }, { "epoch": 0.584, "grad_norm": 0.3100774884223938, "learning_rate": 0.00019662413672408288, "loss": 0.5976, "mean_token_accuracy": 0.8558310866355896, "num_tokens": 490230.0, "step": 73 }, { "epoch": 0.592, "grad_norm": 0.31717392802238464, "learning_rate": 0.00019649638463588523, "loss": 0.4965, "mean_token_accuracy": 0.8768026232719421, "num_tokens": 496836.0, "step": 74 }, { "epoch": 0.6, "grad_norm": 0.30086132884025574, "learning_rate": 0.00019636630793031193, "loss": 0.6397, "mean_token_accuracy": 0.8408895879983902, "num_tokens": 504481.0, "step": 75 }, { "epoch": 0.608, "grad_norm": 0.2968375086784363, "learning_rate": 0.0001962339101032151, "loss": 0.6198, "mean_token_accuracy": 0.8515170067548752, "num_tokens": 511137.0, "step": 76 }, { "epoch": 0.616, "grad_norm": 0.36928027868270874, "learning_rate": 0.0001960991947128278, "loss": 0.5716, "mean_token_accuracy": 0.8672203570604324, "num_tokens": 516720.0, "step": 77 }, { "epoch": 0.624, "grad_norm": 0.30351924896240234, "learning_rate": 0.00019596216537966818, "loss": 0.4152, "mean_token_accuracy": 0.8895243555307388, "num_tokens": 522800.0, "step": 78 }, { "epoch": 0.632, "grad_norm": 0.2897997200489044, "learning_rate": 0.00019582282578644244, "loss": 0.5727, "mean_token_accuracy": 0.8639173209667206, "num_tokens": 529162.0, "step": 79 }, { "epoch": 0.64, "grad_norm": 0.3304134011268616, "learning_rate": 0.0001956811796779457, "loss": 0.6134, "mean_token_accuracy": 0.8554870337247849, "num_tokens": 535050.0, "step": 80 }, { "epoch": 0.648, "grad_norm": 0.3313748240470886, "learning_rate": 0.00019553723086096142, "loss": 0.5661, "mean_token_accuracy": 0.8692257404327393, "num_tokens": 540982.0, "step": 81 }, { "epoch": 0.656, "grad_norm": 0.35512128472328186, "learning_rate": 0.00019539098320415902, "loss": 0.5537, "mean_token_accuracy": 0.864826962351799, "num_tokens": 546529.0, "step": 82 }, { "epoch": 0.664, "grad_norm": 0.7572323679924011, "learning_rate": 0.00019524244063799003, "loss": 0.6078, "mean_token_accuracy": 0.8581458032131195, "num_tokens": 553172.0, "step": 83 }, { "epoch": 0.672, "grad_norm": 0.3112167716026306, "learning_rate": 0.00019509160715458233, "loss": 0.5698, "mean_token_accuracy": 0.8646929711103439, "num_tokens": 560322.0, "step": 84 }, { "epoch": 0.68, "grad_norm": 0.30758005380630493, "learning_rate": 0.0001949384868076329, "loss": 0.5658, "mean_token_accuracy": 0.863633781671524, "num_tokens": 566291.0, "step": 85 }, { "epoch": 0.688, "grad_norm": 0.32986459136009216, "learning_rate": 0.000194783083712299, "loss": 0.5461, "mean_token_accuracy": 0.8771772384643555, "num_tokens": 572945.0, "step": 86 }, { "epoch": 0.696, "grad_norm": 0.3336279094219208, "learning_rate": 0.00019462540204508738, "loss": 0.5355, "mean_token_accuracy": 0.8690856248140335, "num_tokens": 579902.0, "step": 87 }, { "epoch": 0.704, "grad_norm": 0.3255175054073334, "learning_rate": 0.00019446544604374215, "loss": 0.5888, "mean_token_accuracy": 0.8579987734556198, "num_tokens": 586616.0, "step": 88 }, { "epoch": 0.712, "grad_norm": 0.323868989944458, "learning_rate": 0.00019430322000713076, "loss": 0.492, "mean_token_accuracy": 0.8785415291786194, "num_tokens": 592728.0, "step": 89 }, { "epoch": 0.72, "grad_norm": 0.3210422992706299, "learning_rate": 0.00019413872829512874, "loss": 0.5838, "mean_token_accuracy": 0.8545652478933334, "num_tokens": 600500.0, "step": 90 }, { "epoch": 0.728, "grad_norm": 0.40252962708473206, "learning_rate": 0.00019397197532850224, "loss": 0.5363, "mean_token_accuracy": 0.8665568381547928, "num_tokens": 605789.0, "step": 91 }, { "epoch": 0.736, "grad_norm": 0.3653203248977661, "learning_rate": 0.0001938029655887894, "loss": 0.5342, "mean_token_accuracy": 0.8687711954116821, "num_tokens": 611788.0, "step": 92 }, { "epoch": 0.744, "grad_norm": 0.33851131796836853, "learning_rate": 0.00019363170361817971, "loss": 0.5601, "mean_token_accuracy": 0.8600920587778091, "num_tokens": 618091.0, "step": 93 }, { "epoch": 0.752, "grad_norm": 0.3361421525478363, "learning_rate": 0.00019345819401939227, "loss": 0.6039, "mean_token_accuracy": 0.8590127527713776, "num_tokens": 625307.0, "step": 94 }, { "epoch": 0.76, "grad_norm": 0.34586572647094727, "learning_rate": 0.00019328244145555177, "loss": 0.5299, "mean_token_accuracy": 0.8717793226242065, "num_tokens": 632237.0, "step": 95 }, { "epoch": 0.768, "grad_norm": 0.3349948823451996, "learning_rate": 0.00019310445065006323, "loss": 0.6081, "mean_token_accuracy": 0.8523767739534378, "num_tokens": 638815.0, "step": 96 }, { "epoch": 0.776, "grad_norm": 0.33434224128723145, "learning_rate": 0.00019292422638648527, "loss": 0.5909, "mean_token_accuracy": 0.8491196036338806, "num_tokens": 646629.0, "step": 97 }, { "epoch": 0.784, "grad_norm": 0.316795289516449, "learning_rate": 0.00019274177350840125, "loss": 0.5766, "mean_token_accuracy": 0.8626135885715485, "num_tokens": 653868.0, "step": 98 }, { "epoch": 0.792, "grad_norm": 0.38980644941329956, "learning_rate": 0.0001925570969192894, "loss": 0.5984, "mean_token_accuracy": 0.8572886437177658, "num_tokens": 660762.0, "step": 99 }, { "epoch": 0.8, "grad_norm": 0.34287554025650024, "learning_rate": 0.00019237020158239065, "loss": 0.5727, "mean_token_accuracy": 0.8676161915063858, "num_tokens": 668093.0, "step": 100 }, { "epoch": 0.808, "grad_norm": 0.34517669677734375, "learning_rate": 0.0001921810925205757, "loss": 0.5259, "mean_token_accuracy": 0.8704000562429428, "num_tokens": 673920.0, "step": 101 }, { "epoch": 0.816, "grad_norm": 0.3233802020549774, "learning_rate": 0.00019198977481620967, "loss": 0.6379, "mean_token_accuracy": 0.8521578162908554, "num_tokens": 681545.0, "step": 102 }, { "epoch": 0.824, "grad_norm": 0.3119167387485504, "learning_rate": 0.00019179625361101564, "loss": 0.4972, "mean_token_accuracy": 0.8785851448774338, "num_tokens": 687911.0, "step": 103 }, { "epoch": 0.832, "grad_norm": 0.34621503949165344, "learning_rate": 0.00019160053410593653, "loss": 0.5737, "mean_token_accuracy": 0.8579032868146896, "num_tokens": 694681.0, "step": 104 }, { "epoch": 0.84, "grad_norm": 0.31242305040359497, "learning_rate": 0.0001914026215609952, "loss": 0.4728, "mean_token_accuracy": 0.8880013078451157, "num_tokens": 702940.0, "step": 105 }, { "epoch": 0.848, "grad_norm": 0.35367879271507263, "learning_rate": 0.00019120252129515322, "loss": 0.6257, "mean_token_accuracy": 0.85282202064991, "num_tokens": 710021.0, "step": 106 }, { "epoch": 0.856, "grad_norm": 0.37111493945121765, "learning_rate": 0.0001910002386861677, "loss": 0.624, "mean_token_accuracy": 0.8507054150104523, "num_tokens": 717287.0, "step": 107 }, { "epoch": 0.864, "grad_norm": 0.36353397369384766, "learning_rate": 0.00019079577917044705, "loss": 0.5918, "mean_token_accuracy": 0.8595898598432541, "num_tokens": 724046.0, "step": 108 }, { "epoch": 0.872, "grad_norm": 0.3648279011249542, "learning_rate": 0.00019058914824290465, "loss": 0.5022, "mean_token_accuracy": 0.8799950033426285, "num_tokens": 730456.0, "step": 109 }, { "epoch": 0.88, "grad_norm": 0.39622411131858826, "learning_rate": 0.00019038035145681125, "loss": 0.6507, "mean_token_accuracy": 0.8495734184980392, "num_tokens": 735957.0, "step": 110 }, { "epoch": 0.888, "grad_norm": 0.3640548586845398, "learning_rate": 0.00019016939442364578, "loss": 0.6925, "mean_token_accuracy": 0.8380750119686127, "num_tokens": 743123.0, "step": 111 }, { "epoch": 0.896, "grad_norm": 0.3780752122402191, "learning_rate": 0.00018995628281294442, "loss": 0.5735, "mean_token_accuracy": 0.8675370812416077, "num_tokens": 749522.0, "step": 112 }, { "epoch": 0.904, "grad_norm": 0.36225274205207825, "learning_rate": 0.00018974102235214834, "loss": 0.6425, "mean_token_accuracy": 0.8510697484016418, "num_tokens": 756971.0, "step": 113 }, { "epoch": 0.912, "grad_norm": 0.33881205320358276, "learning_rate": 0.0001895236188264497, "loss": 0.583, "mean_token_accuracy": 0.8637432605028152, "num_tokens": 763593.0, "step": 114 }, { "epoch": 0.92, "grad_norm": 0.36713939905166626, "learning_rate": 0.00018930407807863628, "loss": 0.6164, "mean_token_accuracy": 0.8609535098075867, "num_tokens": 770805.0, "step": 115 }, { "epoch": 0.928, "grad_norm": 0.41083401441574097, "learning_rate": 0.00018908240600893419, "loss": 0.5228, "mean_token_accuracy": 0.8732753098011017, "num_tokens": 777335.0, "step": 116 }, { "epoch": 0.936, "grad_norm": 0.35693636536598206, "learning_rate": 0.00018885860857484972, "loss": 0.5245, "mean_token_accuracy": 0.873714879155159, "num_tokens": 783820.0, "step": 117 }, { "epoch": 0.944, "grad_norm": 0.3496348261833191, "learning_rate": 0.00018863269179100875, "loss": 0.4445, "mean_token_accuracy": 0.8872530460357666, "num_tokens": 790353.0, "step": 118 }, { "epoch": 0.952, "grad_norm": 0.4185085594654083, "learning_rate": 0.00018840466172899553, "loss": 0.4811, "mean_token_accuracy": 0.8783089220523834, "num_tokens": 796136.0, "step": 119 }, { "epoch": 0.96, "grad_norm": 0.3815785050392151, "learning_rate": 0.00018817452451718928, "loss": 0.5099, "mean_token_accuracy": 0.8799064457416534, "num_tokens": 803368.0, "step": 120 }, { "epoch": 0.968, "grad_norm": 0.4028662145137787, "learning_rate": 0.0001879422863405995, "loss": 0.5993, "mean_token_accuracy": 0.8580205887556076, "num_tokens": 809209.0, "step": 121 }, { "epoch": 0.976, "grad_norm": 0.412908136844635, "learning_rate": 0.00018770795344069972, "loss": 0.576, "mean_token_accuracy": 0.8594372272491455, "num_tokens": 815004.0, "step": 122 }, { "epoch": 0.984, "grad_norm": 0.3622879981994629, "learning_rate": 0.00018747153211525996, "loss": 0.6293, "mean_token_accuracy": 0.8497031927108765, "num_tokens": 822058.0, "step": 123 }, { "epoch": 0.992, "grad_norm": 0.3823924660682678, "learning_rate": 0.00018723302871817717, "loss": 0.6086, "mean_token_accuracy": 0.8630327731370926, "num_tokens": 828164.0, "step": 124 }, { "epoch": 1.0, "grad_norm": 0.3807222247123718, "learning_rate": 0.00018699244965930475, "loss": 0.5867, "mean_token_accuracy": 0.8589149713516235, "num_tokens": 833384.0, "step": 125 }, { "epoch": 1.008, "grad_norm": 0.34858500957489014, "learning_rate": 0.00018674980140428012, "loss": 0.5984, "mean_token_accuracy": 0.8534691333770752, "num_tokens": 840189.0, "step": 126 }, { "epoch": 1.016, "grad_norm": 0.39528483152389526, "learning_rate": 0.00018650509047435094, "loss": 0.5477, "mean_token_accuracy": 0.8631355613470078, "num_tokens": 846488.0, "step": 127 }, { "epoch": 1.024, "grad_norm": 0.38638606667518616, "learning_rate": 0.0001862583234462, "loss": 0.4704, "mean_token_accuracy": 0.8793387711048126, "num_tokens": 854057.0, "step": 128 }, { "epoch": 1.032, "grad_norm": 0.3776165843009949, "learning_rate": 0.00018600950695176827, "loss": 0.4073, "mean_token_accuracy": 0.8982667475938797, "num_tokens": 859782.0, "step": 129 }, { "epoch": 1.04, "grad_norm": 0.3408069908618927, "learning_rate": 0.00018575864767807684, "loss": 0.4311, "mean_token_accuracy": 0.8969773948192596, "num_tokens": 865811.0, "step": 130 }, { "epoch": 1.048, "grad_norm": 0.3915804624557495, "learning_rate": 0.00018550575236704712, "loss": 0.6324, "mean_token_accuracy": 0.8482974618673325, "num_tokens": 872507.0, "step": 131 }, { "epoch": 1.056, "grad_norm": 0.40758293867111206, "learning_rate": 0.00018525082781531963, "loss": 0.6765, "mean_token_accuracy": 0.8475436121225357, "num_tokens": 879296.0, "step": 132 }, { "epoch": 1.064, "grad_norm": 0.42398837208747864, "learning_rate": 0.00018499388087407138, "loss": 0.4452, "mean_token_accuracy": 0.8881448060274124, "num_tokens": 884862.0, "step": 133 }, { "epoch": 1.072, "grad_norm": 0.38132086396217346, "learning_rate": 0.00018473491844883178, "loss": 0.3368, "mean_token_accuracy": 0.9102375656366348, "num_tokens": 891182.0, "step": 134 }, { "epoch": 1.08, "grad_norm": 0.40963220596313477, "learning_rate": 0.00018447394749929692, "loss": 0.5017, "mean_token_accuracy": 0.8820759654045105, "num_tokens": 897508.0, "step": 135 }, { "epoch": 1.088, "grad_norm": 0.36850762367248535, "learning_rate": 0.00018421097503914266, "loss": 0.4136, "mean_token_accuracy": 0.8976060748100281, "num_tokens": 903854.0, "step": 136 }, { "epoch": 1.096, "grad_norm": 0.3924892842769623, "learning_rate": 0.00018394600813583607, "loss": 0.4128, "mean_token_accuracy": 0.8969376981258392, "num_tokens": 909818.0, "step": 137 }, { "epoch": 1.104, "grad_norm": 0.39313405752182007, "learning_rate": 0.00018367905391044549, "loss": 0.4955, "mean_token_accuracy": 0.8764273226261139, "num_tokens": 916019.0, "step": 138 }, { "epoch": 1.112, "grad_norm": 0.4524909555912018, "learning_rate": 0.00018341011953744923, "loss": 0.5858, "mean_token_accuracy": 0.8611360937356949, "num_tokens": 921924.0, "step": 139 }, { "epoch": 1.12, "grad_norm": 0.45517846941947937, "learning_rate": 0.00018313921224454252, "loss": 0.6413, "mean_token_accuracy": 0.8465032875537872, "num_tokens": 930105.0, "step": 140 }, { "epoch": 1.1280000000000001, "grad_norm": 0.3854213356971741, "learning_rate": 0.00018286633931244366, "loss": 0.4344, "mean_token_accuracy": 0.8929370790719986, "num_tokens": 936763.0, "step": 141 }, { "epoch": 1.1360000000000001, "grad_norm": 0.4203244745731354, "learning_rate": 0.00018259150807469797, "loss": 0.4439, "mean_token_accuracy": 0.8955252319574356, "num_tokens": 942086.0, "step": 142 }, { "epoch": 1.144, "grad_norm": 0.44454455375671387, "learning_rate": 0.00018231472591748082, "loss": 0.5156, "mean_token_accuracy": 0.8750511556863785, "num_tokens": 948273.0, "step": 143 }, { "epoch": 1.152, "grad_norm": 0.4443396031856537, "learning_rate": 0.00018203600027939926, "loss": 0.4615, "mean_token_accuracy": 0.8853558897972107, "num_tokens": 955513.0, "step": 144 }, { "epoch": 1.16, "grad_norm": 0.4034740626811981, "learning_rate": 0.00018175533865129193, "loss": 0.5616, "mean_token_accuracy": 0.8626160174608231, "num_tokens": 962724.0, "step": 145 }, { "epoch": 1.168, "grad_norm": 0.4190956950187683, "learning_rate": 0.0001814727485760278, "loss": 0.4393, "mean_token_accuracy": 0.8891415745019913, "num_tokens": 968790.0, "step": 146 }, { "epoch": 1.176, "grad_norm": 0.4239446520805359, "learning_rate": 0.00018118823764830356, "loss": 0.4912, "mean_token_accuracy": 0.8838336318731308, "num_tokens": 975377.0, "step": 147 }, { "epoch": 1.184, "grad_norm": 0.5161798000335693, "learning_rate": 0.0001809018135144392, "loss": 0.5945, "mean_token_accuracy": 0.859768271446228, "num_tokens": 981360.0, "step": 148 }, { "epoch": 1.192, "grad_norm": 0.4592292904853821, "learning_rate": 0.00018061348387217295, "loss": 0.5519, "mean_token_accuracy": 0.8640515506267548, "num_tokens": 987797.0, "step": 149 }, { "epoch": 1.2, "grad_norm": 0.42503586411476135, "learning_rate": 0.00018032325647045403, "loss": 0.4913, "mean_token_accuracy": 0.8824547529220581, "num_tokens": 994410.0, "step": 150 }, { "epoch": 1.208, "grad_norm": 0.4126821458339691, "learning_rate": 0.00018003113910923461, "loss": 0.5632, "mean_token_accuracy": 0.8633860200643539, "num_tokens": 1002029.0, "step": 151 }, { "epoch": 1.216, "grad_norm": 0.3857492208480835, "learning_rate": 0.00017973713963926008, "loss": 0.4983, "mean_token_accuracy": 0.8792683780193329, "num_tokens": 1008823.0, "step": 152 }, { "epoch": 1.224, "grad_norm": 0.45037853717803955, "learning_rate": 0.0001794412659618581, "loss": 0.5341, "mean_token_accuracy": 0.8727520704269409, "num_tokens": 1015633.0, "step": 153 }, { "epoch": 1.232, "grad_norm": 0.43658384680747986, "learning_rate": 0.00017914352602872626, "loss": 0.5169, "mean_token_accuracy": 0.8772079050540924, "num_tokens": 1022217.0, "step": 154 }, { "epoch": 1.24, "grad_norm": 0.4409887492656708, "learning_rate": 0.00017884392784171831, "loss": 0.4755, "mean_token_accuracy": 0.882952868938446, "num_tokens": 1028061.0, "step": 155 }, { "epoch": 1.248, "grad_norm": 0.40505707263946533, "learning_rate": 0.00017854247945262917, "loss": 0.496, "mean_token_accuracy": 0.8811916708946228, "num_tokens": 1035143.0, "step": 156 }, { "epoch": 1.256, "grad_norm": 0.42024296522140503, "learning_rate": 0.00017823918896297852, "loss": 0.4886, "mean_token_accuracy": 0.8800901770591736, "num_tokens": 1041913.0, "step": 157 }, { "epoch": 1.264, "grad_norm": 0.3689241111278534, "learning_rate": 0.00017793406452379314, "loss": 0.4005, "mean_token_accuracy": 0.8970769792795181, "num_tokens": 1048282.0, "step": 158 }, { "epoch": 1.272, "grad_norm": 0.4727207124233246, "learning_rate": 0.00017762711433538765, "loss": 0.5948, "mean_token_accuracy": 0.8544750362634659, "num_tokens": 1056066.0, "step": 159 }, { "epoch": 1.28, "grad_norm": 0.46616116166114807, "learning_rate": 0.00017731834664714438, "loss": 0.4897, "mean_token_accuracy": 0.8830447345972061, "num_tokens": 1063335.0, "step": 160 }, { "epoch": 1.288, "grad_norm": 0.46555575728416443, "learning_rate": 0.00017700776975729138, "loss": 0.4189, "mean_token_accuracy": 0.8959919512271881, "num_tokens": 1069390.0, "step": 161 }, { "epoch": 1.296, "grad_norm": 0.5011019110679626, "learning_rate": 0.00017669539201267974, "loss": 0.5108, "mean_token_accuracy": 0.8755719512701035, "num_tokens": 1076382.0, "step": 162 }, { "epoch": 1.304, "grad_norm": 0.4347600042819977, "learning_rate": 0.0001763812218085589, "loss": 0.4993, "mean_token_accuracy": 0.8790145367383957, "num_tokens": 1083154.0, "step": 163 }, { "epoch": 1.312, "grad_norm": 0.4214591085910797, "learning_rate": 0.00017606526758835145, "loss": 0.4609, "mean_token_accuracy": 0.8841957598924637, "num_tokens": 1088943.0, "step": 164 }, { "epoch": 1.32, "grad_norm": 0.4219564199447632, "learning_rate": 0.00017574753784342566, "loss": 0.4472, "mean_token_accuracy": 0.8914137780666351, "num_tokens": 1096102.0, "step": 165 }, { "epoch": 1.328, "grad_norm": 0.4628419280052185, "learning_rate": 0.00017542804111286785, "loss": 0.6057, "mean_token_accuracy": 0.8631048053503036, "num_tokens": 1103356.0, "step": 166 }, { "epoch": 1.336, "grad_norm": 0.4196925163269043, "learning_rate": 0.00017510678598325248, "loss": 0.5833, "mean_token_accuracy": 0.8595189452171326, "num_tokens": 1109666.0, "step": 167 }, { "epoch": 1.3439999999999999, "grad_norm": 0.43105360865592957, "learning_rate": 0.0001747837810884116, "loss": 0.4007, "mean_token_accuracy": 0.8960121870040894, "num_tokens": 1116734.0, "step": 168 }, { "epoch": 1.3519999999999999, "grad_norm": 0.40199485421180725, "learning_rate": 0.00017445903510920278, "loss": 0.4559, "mean_token_accuracy": 0.8918093591928482, "num_tokens": 1122689.0, "step": 169 }, { "epoch": 1.3599999999999999, "grad_norm": 0.45968693494796753, "learning_rate": 0.00017413255677327564, "loss": 0.5566, "mean_token_accuracy": 0.8674175888299942, "num_tokens": 1129295.0, "step": 170 }, { "epoch": 1.3679999999999999, "grad_norm": 0.4297579228878021, "learning_rate": 0.00017380435485483755, "loss": 0.4797, "mean_token_accuracy": 0.8841190785169601, "num_tokens": 1135480.0, "step": 171 }, { "epoch": 1.376, "grad_norm": 0.4660268723964691, "learning_rate": 0.0001734744381744177, "loss": 0.4784, "mean_token_accuracy": 0.88442263007164, "num_tokens": 1141600.0, "step": 172 }, { "epoch": 1.384, "grad_norm": 0.45515453815460205, "learning_rate": 0.0001731428155986299, "loss": 0.5086, "mean_token_accuracy": 0.8794610053300858, "num_tokens": 1147550.0, "step": 173 }, { "epoch": 1.392, "grad_norm": 0.44932329654693604, "learning_rate": 0.00017280949603993466, "loss": 0.468, "mean_token_accuracy": 0.8865546584129333, "num_tokens": 1154625.0, "step": 174 }, { "epoch": 1.4, "grad_norm": 0.4518669843673706, "learning_rate": 0.00017247448845639926, "loss": 0.4911, "mean_token_accuracy": 0.8744615465402603, "num_tokens": 1161689.0, "step": 175 }, { "epoch": 1.408, "grad_norm": 0.38170140981674194, "learning_rate": 0.00017213780185145722, "loss": 0.4485, "mean_token_accuracy": 0.8911368399858475, "num_tokens": 1169600.0, "step": 176 }, { "epoch": 1.416, "grad_norm": 0.4645857512950897, "learning_rate": 0.00017179944527366634, "loss": 0.5235, "mean_token_accuracy": 0.8698697537183762, "num_tokens": 1176813.0, "step": 177 }, { "epoch": 1.424, "grad_norm": 0.42486289143562317, "learning_rate": 0.00017145942781646533, "loss": 0.4946, "mean_token_accuracy": 0.8845863193273544, "num_tokens": 1183841.0, "step": 178 }, { "epoch": 1.432, "grad_norm": 0.43411773443222046, "learning_rate": 0.00017111775861792977, "loss": 0.4414, "mean_token_accuracy": 0.8896961510181427, "num_tokens": 1191482.0, "step": 179 }, { "epoch": 1.44, "grad_norm": 0.4529632329940796, "learning_rate": 0.00017077444686052607, "loss": 0.3722, "mean_token_accuracy": 0.906092494726181, "num_tokens": 1197380.0, "step": 180 }, { "epoch": 1.448, "grad_norm": 0.4266018867492676, "learning_rate": 0.00017042950177086503, "loss": 0.3488, "mean_token_accuracy": 0.9110171794891357, "num_tokens": 1204168.0, "step": 181 }, { "epoch": 1.456, "grad_norm": 0.5139121413230896, "learning_rate": 0.00017008293261945383, "loss": 0.4221, "mean_token_accuracy": 0.8969552367925644, "num_tokens": 1209548.0, "step": 182 }, { "epoch": 1.464, "grad_norm": 0.4341898560523987, "learning_rate": 0.00016973474872044672, "loss": 0.549, "mean_token_accuracy": 0.8663805574178696, "num_tokens": 1216887.0, "step": 183 }, { "epoch": 1.472, "grad_norm": 0.4988938570022583, "learning_rate": 0.0001693849594313948, "loss": 0.6465, "mean_token_accuracy": 0.8550772070884705, "num_tokens": 1225369.0, "step": 184 }, { "epoch": 1.48, "grad_norm": 0.563200056552887, "learning_rate": 0.00016903357415299462, "loss": 0.5472, "mean_token_accuracy": 0.8734919279813766, "num_tokens": 1231649.0, "step": 185 }, { "epoch": 1.488, "grad_norm": 0.4169449210166931, "learning_rate": 0.00016868060232883532, "loss": 0.5015, "mean_token_accuracy": 0.8798184394836426, "num_tokens": 1239148.0, "step": 186 }, { "epoch": 1.496, "grad_norm": 0.3919561207294464, "learning_rate": 0.000168326053445145, "loss": 0.5079, "mean_token_accuracy": 0.8830395936965942, "num_tokens": 1246836.0, "step": 187 }, { "epoch": 1.504, "grad_norm": 0.4209696352481842, "learning_rate": 0.0001679699370305358, "loss": 0.5637, "mean_token_accuracy": 0.8605077713727951, "num_tokens": 1254199.0, "step": 188 }, { "epoch": 1.512, "grad_norm": 0.44462457299232483, "learning_rate": 0.00016761226265574766, "loss": 0.5853, "mean_token_accuracy": 0.853949248790741, "num_tokens": 1261390.0, "step": 189 }, { "epoch": 1.52, "grad_norm": 0.4027554392814636, "learning_rate": 0.00016725303993339122, "loss": 0.4526, "mean_token_accuracy": 0.8871663361787796, "num_tokens": 1267736.0, "step": 190 }, { "epoch": 1.528, "grad_norm": 0.4461030662059784, "learning_rate": 0.00016689227851768938, "loss": 0.4235, "mean_token_accuracy": 0.8971422165632248, "num_tokens": 1274894.0, "step": 191 }, { "epoch": 1.536, "grad_norm": 0.44130492210388184, "learning_rate": 0.00016652998810421805, "loss": 0.4614, "mean_token_accuracy": 0.883885383605957, "num_tokens": 1280992.0, "step": 192 }, { "epoch": 1.544, "grad_norm": 0.43946003913879395, "learning_rate": 0.00016616617842964523, "loss": 0.4715, "mean_token_accuracy": 0.8811623752117157, "num_tokens": 1288143.0, "step": 193 }, { "epoch": 1.552, "grad_norm": 0.4609353244304657, "learning_rate": 0.00016580085927146978, "loss": 0.5212, "mean_token_accuracy": 0.8754368126392365, "num_tokens": 1294790.0, "step": 194 }, { "epoch": 1.56, "grad_norm": 0.5108767747879028, "learning_rate": 0.00016543404044775815, "loss": 0.5406, "mean_token_accuracy": 0.8638564199209213, "num_tokens": 1302223.0, "step": 195 }, { "epoch": 1.568, "grad_norm": 0.4694247841835022, "learning_rate": 0.000165065731816881, "loss": 0.4348, "mean_token_accuracy": 0.9047677963972092, "num_tokens": 1308004.0, "step": 196 }, { "epoch": 1.576, "grad_norm": 0.4923543632030487, "learning_rate": 0.00016469594327724786, "loss": 0.4903, "mean_token_accuracy": 0.8743230998516083, "num_tokens": 1316242.0, "step": 197 }, { "epoch": 1.584, "grad_norm": 0.45112791657447815, "learning_rate": 0.00016432468476704134, "loss": 0.5671, "mean_token_accuracy": 0.85928013920784, "num_tokens": 1323666.0, "step": 198 }, { "epoch": 1.592, "grad_norm": 0.37915629148483276, "learning_rate": 0.00016395196626394995, "loss": 0.3492, "mean_token_accuracy": 0.9059862494468689, "num_tokens": 1330469.0, "step": 199 }, { "epoch": 1.6, "grad_norm": 0.4130063056945801, "learning_rate": 0.00016357779778489995, "loss": 0.5328, "mean_token_accuracy": 0.8865040391683578, "num_tokens": 1337305.0, "step": 200 }, { "epoch": 1.608, "grad_norm": 0.4619772732257843, "learning_rate": 0.00016320218938578623, "loss": 0.5687, "mean_token_accuracy": 0.8649569749832153, "num_tokens": 1344011.0, "step": 201 }, { "epoch": 1.616, "grad_norm": 0.4249042570590973, "learning_rate": 0.00016282515116120187, "loss": 0.4843, "mean_token_accuracy": 0.890045240521431, "num_tokens": 1350290.0, "step": 202 }, { "epoch": 1.624, "grad_norm": 0.42717766761779785, "learning_rate": 0.0001624466932441671, "loss": 0.4798, "mean_token_accuracy": 0.8857530951499939, "num_tokens": 1356903.0, "step": 203 }, { "epoch": 1.6320000000000001, "grad_norm": 0.5048653483390808, "learning_rate": 0.00016206682580585668, "loss": 0.5756, "mean_token_accuracy": 0.8639950603246689, "num_tokens": 1364033.0, "step": 204 }, { "epoch": 1.6400000000000001, "grad_norm": 0.4343317747116089, "learning_rate": 0.00016168555905532675, "loss": 0.5485, "mean_token_accuracy": 0.874378427863121, "num_tokens": 1370255.0, "step": 205 }, { "epoch": 1.6480000000000001, "grad_norm": 0.43780046701431274, "learning_rate": 0.00016130290323924045, "loss": 0.3855, "mean_token_accuracy": 0.9005103707313538, "num_tokens": 1376349.0, "step": 206 }, { "epoch": 1.6560000000000001, "grad_norm": 0.4596022665500641, "learning_rate": 0.00016091886864159246, "loss": 0.4991, "mean_token_accuracy": 0.8819662034511566, "num_tokens": 1383133.0, "step": 207 }, { "epoch": 1.6640000000000001, "grad_norm": 0.5207439064979553, "learning_rate": 0.0001605334655834326, "loss": 0.5077, "mean_token_accuracy": 0.8812099099159241, "num_tokens": 1389673.0, "step": 208 }, { "epoch": 1.6720000000000002, "grad_norm": 0.5418673157691956, "learning_rate": 0.00016014670442258857, "loss": 0.4828, "mean_token_accuracy": 0.8806623220443726, "num_tokens": 1396674.0, "step": 209 }, { "epoch": 1.6800000000000002, "grad_norm": 0.42274704575538635, "learning_rate": 0.00015975859555338745, "loss": 0.3791, "mean_token_accuracy": 0.9050882756710052, "num_tokens": 1403522.0, "step": 210 }, { "epoch": 1.688, "grad_norm": 0.4714980721473694, "learning_rate": 0.00015936914940637643, "loss": 0.4824, "mean_token_accuracy": 0.8794721812009811, "num_tokens": 1411235.0, "step": 211 }, { "epoch": 1.696, "grad_norm": 0.49704262614250183, "learning_rate": 0.00015897837644804247, "loss": 0.4881, "mean_token_accuracy": 0.8812518864870071, "num_tokens": 1417883.0, "step": 212 }, { "epoch": 1.704, "grad_norm": 0.5860664248466492, "learning_rate": 0.00015858628718053093, "loss": 0.4613, "mean_token_accuracy": 0.8794592171907425, "num_tokens": 1424340.0, "step": 213 }, { "epoch": 1.712, "grad_norm": 0.4694487750530243, "learning_rate": 0.0001581928921413635, "loss": 0.5753, "mean_token_accuracy": 0.8628325164318085, "num_tokens": 1431053.0, "step": 214 }, { "epoch": 1.72, "grad_norm": 0.43483293056488037, "learning_rate": 0.00015779820190315477, "loss": 0.4874, "mean_token_accuracy": 0.879583090543747, "num_tokens": 1437772.0, "step": 215 }, { "epoch": 1.728, "grad_norm": 0.4641876220703125, "learning_rate": 0.0001574022270733283, "loss": 0.4962, "mean_token_accuracy": 0.8789726048707962, "num_tokens": 1444795.0, "step": 216 }, { "epoch": 1.736, "grad_norm": 0.43445879220962524, "learning_rate": 0.00015700497829383145, "loss": 0.3795, "mean_token_accuracy": 0.9033605754375458, "num_tokens": 1451282.0, "step": 217 }, { "epoch": 1.744, "grad_norm": 0.40109649300575256, "learning_rate": 0.00015660646624084928, "loss": 0.4766, "mean_token_accuracy": 0.8847470134496689, "num_tokens": 1459481.0, "step": 218 }, { "epoch": 1.752, "grad_norm": 0.4893306791782379, "learning_rate": 0.00015620670162451786, "loss": 0.4273, "mean_token_accuracy": 0.8936987966299057, "num_tokens": 1465561.0, "step": 219 }, { "epoch": 1.76, "grad_norm": 0.45475998520851135, "learning_rate": 0.0001558056951886362, "loss": 0.5181, "mean_token_accuracy": 0.8771376013755798, "num_tokens": 1472055.0, "step": 220 }, { "epoch": 1.768, "grad_norm": 0.5032304525375366, "learning_rate": 0.00015540345771037758, "loss": 0.4443, "mean_token_accuracy": 0.8893763720989227, "num_tokens": 1477875.0, "step": 221 }, { "epoch": 1.776, "grad_norm": 0.4644501209259033, "learning_rate": 0.000155, "loss": 0.4399, "mean_token_accuracy": 0.9007304608821869, "num_tokens": 1484566.0, "step": 222 }, { "epoch": 1.784, "grad_norm": 0.47066959738731384, "learning_rate": 0.00015459533290055556, "loss": 0.4494, "mean_token_accuracy": 0.8861970603466034, "num_tokens": 1491299.0, "step": 223 }, { "epoch": 1.792, "grad_norm": 0.5704615116119385, "learning_rate": 0.000154189467287599, "loss": 0.4026, "mean_token_accuracy": 0.8966558575630188, "num_tokens": 1497249.0, "step": 224 }, { "epoch": 1.8, "grad_norm": 0.5205431580543518, "learning_rate": 0.00015378241406889558, "loss": 0.5171, "mean_token_accuracy": 0.8707104325294495, "num_tokens": 1504734.0, "step": 225 }, { "epoch": 1.808, "grad_norm": 0.5295842289924622, "learning_rate": 0.00015337418418412784, "loss": 0.5959, "mean_token_accuracy": 0.8571827560663223, "num_tokens": 1511298.0, "step": 226 }, { "epoch": 1.8159999999999998, "grad_norm": 0.4663291871547699, "learning_rate": 0.00015296478860460144, "loss": 0.4224, "mean_token_accuracy": 0.901121199131012, "num_tokens": 1516565.0, "step": 227 }, { "epoch": 1.8239999999999998, "grad_norm": 0.47146064043045044, "learning_rate": 0.00015255423833295063, "loss": 0.4022, "mean_token_accuracy": 0.8953949958086014, "num_tokens": 1522610.0, "step": 228 }, { "epoch": 1.8319999999999999, "grad_norm": 0.41930270195007324, "learning_rate": 0.0001521425444028423, "loss": 0.3736, "mean_token_accuracy": 0.9028169065713882, "num_tokens": 1529406.0, "step": 229 }, { "epoch": 1.8399999999999999, "grad_norm": 0.5190208554267883, "learning_rate": 0.00015172971787867946, "loss": 0.4997, "mean_token_accuracy": 0.8748685717582703, "num_tokens": 1535144.0, "step": 230 }, { "epoch": 1.8479999999999999, "grad_norm": 0.4800821840763092, "learning_rate": 0.00015131576985530406, "loss": 0.5484, "mean_token_accuracy": 0.8690352439880371, "num_tokens": 1542419.0, "step": 231 }, { "epoch": 1.8559999999999999, "grad_norm": 0.4962865114212036, "learning_rate": 0.00015090071145769856, "loss": 0.5338, "mean_token_accuracy": 0.8726425766944885, "num_tokens": 1549165.0, "step": 232 }, { "epoch": 1.8639999999999999, "grad_norm": 0.4471769332885742, "learning_rate": 0.00015048455384068725, "loss": 0.4043, "mean_token_accuracy": 0.9041309505701065, "num_tokens": 1555530.0, "step": 233 }, { "epoch": 1.8719999999999999, "grad_norm": 0.501523494720459, "learning_rate": 0.00015006730818863603, "loss": 0.4924, "mean_token_accuracy": 0.8754971027374268, "num_tokens": 1561743.0, "step": 234 }, { "epoch": 1.88, "grad_norm": 0.48072171211242676, "learning_rate": 0.00014964898571515235, "loss": 0.3008, "mean_token_accuracy": 0.9269110709428787, "num_tokens": 1567141.0, "step": 235 }, { "epoch": 1.888, "grad_norm": 0.49401360750198364, "learning_rate": 0.0001492295976627834, "loss": 0.4267, "mean_token_accuracy": 0.8951213508844376, "num_tokens": 1573149.0, "step": 236 }, { "epoch": 1.896, "grad_norm": 0.5359240174293518, "learning_rate": 0.00014880915530271417, "loss": 0.5032, "mean_token_accuracy": 0.8789175897836685, "num_tokens": 1579593.0, "step": 237 }, { "epoch": 1.904, "grad_norm": 0.4994426667690277, "learning_rate": 0.0001483876699344646, "loss": 0.459, "mean_token_accuracy": 0.89103102684021, "num_tokens": 1586112.0, "step": 238 }, { "epoch": 1.912, "grad_norm": 0.4566788375377655, "learning_rate": 0.0001479651528855856, "loss": 0.4739, "mean_token_accuracy": 0.8811845034360886, "num_tokens": 1592229.0, "step": 239 }, { "epoch": 1.92, "grad_norm": 0.4718650281429291, "learning_rate": 0.00014754161551135505, "loss": 0.5682, "mean_token_accuracy": 0.873079851269722, "num_tokens": 1597631.0, "step": 240 }, { "epoch": 1.928, "grad_norm": 0.44872692227363586, "learning_rate": 0.00014711706919447217, "loss": 0.4131, "mean_token_accuracy": 0.8959172517061234, "num_tokens": 1604840.0, "step": 241 }, { "epoch": 1.936, "grad_norm": 0.46837520599365234, "learning_rate": 0.000146691525344752, "loss": 0.54, "mean_token_accuracy": 0.873401090502739, "num_tokens": 1611227.0, "step": 242 }, { "epoch": 1.944, "grad_norm": 0.5113170742988586, "learning_rate": 0.00014626499539881853, "loss": 0.4188, "mean_token_accuracy": 0.9023873209953308, "num_tokens": 1617312.0, "step": 243 }, { "epoch": 1.952, "grad_norm": 0.5134310126304626, "learning_rate": 0.00014583749081979738, "loss": 0.451, "mean_token_accuracy": 0.8924902826547623, "num_tokens": 1625141.0, "step": 244 }, { "epoch": 1.96, "grad_norm": 0.49019724130630493, "learning_rate": 0.00014540902309700772, "loss": 0.5054, "mean_token_accuracy": 0.8831940442323685, "num_tokens": 1631493.0, "step": 245 }, { "epoch": 1.968, "grad_norm": 0.4522954225540161, "learning_rate": 0.0001449796037456536, "loss": 0.4661, "mean_token_accuracy": 0.882496640086174, "num_tokens": 1638605.0, "step": 246 }, { "epoch": 1.976, "grad_norm": 0.5024929642677307, "learning_rate": 0.00014454924430651423, "loss": 0.5102, "mean_token_accuracy": 0.8735022842884064, "num_tokens": 1646734.0, "step": 247 }, { "epoch": 1.984, "grad_norm": 0.5305299758911133, "learning_rate": 0.00014411795634563417, "loss": 0.4766, "mean_token_accuracy": 0.8881524056196213, "num_tokens": 1654448.0, "step": 248 }, { "epoch": 1.992, "grad_norm": 0.47463810443878174, "learning_rate": 0.00014368575145401208, "loss": 0.4619, "mean_token_accuracy": 0.8911669850349426, "num_tokens": 1661117.0, "step": 249 }, { "epoch": 2.0, "grad_norm": 0.5264838337898254, "learning_rate": 0.00014325264124728966, "loss": 0.5779, "mean_token_accuracy": 0.8617005199193954, "num_tokens": 1666768.0, "step": 250 }, { "epoch": 2.008, "grad_norm": 0.45901045203208923, "learning_rate": 0.00014281863736543898, "loss": 0.3967, "mean_token_accuracy": 0.898947224020958, "num_tokens": 1674982.0, "step": 251 }, { "epoch": 2.016, "grad_norm": 0.5122538805007935, "learning_rate": 0.0001423837514724501, "loss": 0.5583, "mean_token_accuracy": 0.865815594792366, "num_tokens": 1682038.0, "step": 252 }, { "epoch": 2.024, "grad_norm": 0.4520164728164673, "learning_rate": 0.0001419479952560173, "loss": 0.3392, "mean_token_accuracy": 0.9159495681524277, "num_tokens": 1687654.0, "step": 253 }, { "epoch": 2.032, "grad_norm": 0.49054303765296936, "learning_rate": 0.00014151138042722516, "loss": 0.4199, "mean_token_accuracy": 0.8956295102834702, "num_tokens": 1694926.0, "step": 254 }, { "epoch": 2.04, "grad_norm": 0.49620193243026733, "learning_rate": 0.00014107391872023367, "loss": 0.4329, "mean_token_accuracy": 0.8901129364967346, "num_tokens": 1701828.0, "step": 255 }, { "epoch": 2.048, "grad_norm": 0.6763495206832886, "learning_rate": 0.00014063562189196296, "loss": 0.4563, "mean_token_accuracy": 0.888703241944313, "num_tokens": 1708625.0, "step": 256 }, { "epoch": 2.056, "grad_norm": 0.6635557413101196, "learning_rate": 0.00014019650172177725, "loss": 0.4928, "mean_token_accuracy": 0.8792758584022522, "num_tokens": 1715480.0, "step": 257 }, { "epoch": 2.064, "grad_norm": 0.6684744954109192, "learning_rate": 0.00013975657001116844, "loss": 0.5202, "mean_token_accuracy": 0.8767696470022202, "num_tokens": 1721583.0, "step": 258 }, { "epoch": 2.072, "grad_norm": 0.6005105972290039, "learning_rate": 0.00013931583858343876, "loss": 0.4285, "mean_token_accuracy": 0.8916260153055191, "num_tokens": 1728632.0, "step": 259 }, { "epoch": 2.08, "grad_norm": 0.5100651383399963, "learning_rate": 0.00013887431928338308, "loss": 0.4078, "mean_token_accuracy": 0.8953111171722412, "num_tokens": 1736118.0, "step": 260 }, { "epoch": 2.088, "grad_norm": 0.5434725880622864, "learning_rate": 0.00013843202397697066, "loss": 0.441, "mean_token_accuracy": 0.8892927020788193, "num_tokens": 1743653.0, "step": 261 }, { "epoch": 2.096, "grad_norm": 0.4428845942020416, "learning_rate": 0.00013798896455102607, "loss": 0.3123, "mean_token_accuracy": 0.9208179265260696, "num_tokens": 1750148.0, "step": 262 }, { "epoch": 2.104, "grad_norm": 0.5046375393867493, "learning_rate": 0.00013754515291290989, "loss": 0.5289, "mean_token_accuracy": 0.8782770782709122, "num_tokens": 1756404.0, "step": 263 }, { "epoch": 2.112, "grad_norm": 0.6358498334884644, "learning_rate": 0.0001371006009901986, "loss": 0.4881, "mean_token_accuracy": 0.8802718222141266, "num_tokens": 1762576.0, "step": 264 }, { "epoch": 2.12, "grad_norm": 0.546139121055603, "learning_rate": 0.00013665532073036415, "loss": 0.3966, "mean_token_accuracy": 0.9026769399642944, "num_tokens": 1768956.0, "step": 265 }, { "epoch": 2.128, "grad_norm": 0.5296169519424438, "learning_rate": 0.0001362093241004527, "loss": 0.4855, "mean_token_accuracy": 0.8773520290851593, "num_tokens": 1775034.0, "step": 266 }, { "epoch": 2.136, "grad_norm": 0.484761506319046, "learning_rate": 0.00013576262308676309, "loss": 0.3552, "mean_token_accuracy": 0.9086227267980576, "num_tokens": 1781219.0, "step": 267 }, { "epoch": 2.144, "grad_norm": 0.5612555146217346, "learning_rate": 0.00013531522969452466, "loss": 0.4677, "mean_token_accuracy": 0.8840114921331406, "num_tokens": 1787902.0, "step": 268 }, { "epoch": 2.152, "grad_norm": 0.5658952593803406, "learning_rate": 0.00013486715594757473, "loss": 0.4403, "mean_token_accuracy": 0.8926974385976791, "num_tokens": 1793826.0, "step": 269 }, { "epoch": 2.16, "grad_norm": 0.4885717034339905, "learning_rate": 0.0001344184138880353, "loss": 0.3406, "mean_token_accuracy": 0.9074417948722839, "num_tokens": 1800620.0, "step": 270 }, { "epoch": 2.168, "grad_norm": 0.5280024409294128, "learning_rate": 0.0001339690155759895, "loss": 0.4463, "mean_token_accuracy": 0.8845728188753128, "num_tokens": 1808231.0, "step": 271 }, { "epoch": 2.176, "grad_norm": 0.4429165720939636, "learning_rate": 0.00013351897308915746, "loss": 0.2424, "mean_token_accuracy": 0.9361128211021423, "num_tokens": 1814413.0, "step": 272 }, { "epoch": 2.184, "grad_norm": 0.5282138586044312, "learning_rate": 0.00013306829852257167, "loss": 0.3729, "mean_token_accuracy": 0.9080943763256073, "num_tokens": 1820533.0, "step": 273 }, { "epoch": 2.192, "grad_norm": 0.4441615641117096, "learning_rate": 0.000132617003988252, "loss": 0.3139, "mean_token_accuracy": 0.9184619784355164, "num_tokens": 1828083.0, "step": 274 }, { "epoch": 2.2, "grad_norm": 0.5623084902763367, "learning_rate": 0.00013216510161488014, "loss": 0.4806, "mean_token_accuracy": 0.8825304508209229, "num_tokens": 1835447.0, "step": 275 }, { "epoch": 2.208, "grad_norm": 0.4704199731349945, "learning_rate": 0.00013171260354747358, "loss": 0.3998, "mean_token_accuracy": 0.8931941241025925, "num_tokens": 1843554.0, "step": 276 }, { "epoch": 2.216, "grad_norm": 0.5189396142959595, "learning_rate": 0.00013125952194705944, "loss": 0.4548, "mean_token_accuracy": 0.8932492285966873, "num_tokens": 1850140.0, "step": 277 }, { "epoch": 2.224, "grad_norm": 0.5081222653388977, "learning_rate": 0.0001308058689903473, "loss": 0.3671, "mean_token_accuracy": 0.9117176234722137, "num_tokens": 1856066.0, "step": 278 }, { "epoch": 2.232, "grad_norm": 0.5323463678359985, "learning_rate": 0.00013035165686940212, "loss": 0.3371, "mean_token_accuracy": 0.9107578992843628, "num_tokens": 1861731.0, "step": 279 }, { "epoch": 2.24, "grad_norm": 0.5867413282394409, "learning_rate": 0.00012989689779131677, "loss": 0.4363, "mean_token_accuracy": 0.8919873833656311, "num_tokens": 1867789.0, "step": 280 }, { "epoch": 2.248, "grad_norm": 0.6243230700492859, "learning_rate": 0.00012944160397788354, "loss": 0.3911, "mean_token_accuracy": 0.9052166640758514, "num_tokens": 1874541.0, "step": 281 }, { "epoch": 2.2560000000000002, "grad_norm": 0.5407280921936035, "learning_rate": 0.000128985787665266, "loss": 0.2974, "mean_token_accuracy": 0.9200872629880905, "num_tokens": 1880128.0, "step": 282 }, { "epoch": 2.2640000000000002, "grad_norm": 0.5016720294952393, "learning_rate": 0.00012852946110367006, "loss": 0.3137, "mean_token_accuracy": 0.9187629073858261, "num_tokens": 1886134.0, "step": 283 }, { "epoch": 2.2720000000000002, "grad_norm": 0.7218824028968811, "learning_rate": 0.00012807263655701466, "loss": 0.4399, "mean_token_accuracy": 0.892376720905304, "num_tokens": 1893212.0, "step": 284 }, { "epoch": 2.2800000000000002, "grad_norm": 0.5728095769882202, "learning_rate": 0.00012761532630260237, "loss": 0.4434, "mean_token_accuracy": 0.8913872987031937, "num_tokens": 1899748.0, "step": 285 }, { "epoch": 2.288, "grad_norm": 0.6006686687469482, "learning_rate": 0.0001271575426307892, "loss": 0.4862, "mean_token_accuracy": 0.8813899457454681, "num_tokens": 1906082.0, "step": 286 }, { "epoch": 2.296, "grad_norm": 0.5110010504722595, "learning_rate": 0.00012669929784465443, "loss": 0.3358, "mean_token_accuracy": 0.9156067073345184, "num_tokens": 1912968.0, "step": 287 }, { "epoch": 2.304, "grad_norm": 0.7630503177642822, "learning_rate": 0.00012624060425966985, "loss": 0.5382, "mean_token_accuracy": 0.8766893446445465, "num_tokens": 1919896.0, "step": 288 }, { "epoch": 2.312, "grad_norm": 0.4548809826374054, "learning_rate": 0.0001257814742033691, "loss": 0.2691, "mean_token_accuracy": 0.9295577853918076, "num_tokens": 1926759.0, "step": 289 }, { "epoch": 2.32, "grad_norm": 0.5400310158729553, "learning_rate": 0.00012532192001501587, "loss": 0.4073, "mean_token_accuracy": 0.9064356535673141, "num_tokens": 1933488.0, "step": 290 }, { "epoch": 2.328, "grad_norm": 0.6224051713943481, "learning_rate": 0.00012486195404527264, "loss": 0.5118, "mean_token_accuracy": 0.87581767141819, "num_tokens": 1940023.0, "step": 291 }, { "epoch": 2.336, "grad_norm": 0.5552983283996582, "learning_rate": 0.00012440158865586868, "loss": 0.3916, "mean_token_accuracy": 0.8996559679508209, "num_tokens": 1946461.0, "step": 292 }, { "epoch": 2.344, "grad_norm": 0.5545081496238708, "learning_rate": 0.00012394083621926764, "loss": 0.3797, "mean_token_accuracy": 0.9041395634412766, "num_tokens": 1952478.0, "step": 293 }, { "epoch": 2.352, "grad_norm": 0.5815087556838989, "learning_rate": 0.00012347970911833536, "loss": 0.4378, "mean_token_accuracy": 0.8938320130109787, "num_tokens": 1959797.0, "step": 294 }, { "epoch": 2.36, "grad_norm": 0.5759497880935669, "learning_rate": 0.00012301821974600678, "loss": 0.5294, "mean_token_accuracy": 0.8737110495567322, "num_tokens": 1967856.0, "step": 295 }, { "epoch": 2.368, "grad_norm": 0.5934637188911438, "learning_rate": 0.00012255638050495308, "loss": 0.3177, "mean_token_accuracy": 0.9201480895280838, "num_tokens": 1974273.0, "step": 296 }, { "epoch": 2.376, "grad_norm": 0.6188419461250305, "learning_rate": 0.00012209420380724823, "loss": 0.4129, "mean_token_accuracy": 0.8965235501527786, "num_tokens": 1980548.0, "step": 297 }, { "epoch": 2.384, "grad_norm": 0.5763571858406067, "learning_rate": 0.0001216317020740354, "loss": 0.3132, "mean_token_accuracy": 0.9215654134750366, "num_tokens": 1987746.0, "step": 298 }, { "epoch": 2.392, "grad_norm": 0.751468300819397, "learning_rate": 0.00012116888773519334, "loss": 0.4326, "mean_token_accuracy": 0.895213857293129, "num_tokens": 1994745.0, "step": 299 }, { "epoch": 2.4, "grad_norm": 0.7481938600540161, "learning_rate": 0.00012070577322900203, "loss": 0.572, "mean_token_accuracy": 0.8740457892417908, "num_tokens": 2001516.0, "step": 300 }, { "epoch": 2.408, "grad_norm": 0.5347183346748352, "learning_rate": 0.0001202423710018086, "loss": 0.359, "mean_token_accuracy": 0.9106192588806152, "num_tokens": 2007284.0, "step": 301 }, { "epoch": 2.416, "grad_norm": 0.5099695920944214, "learning_rate": 0.00011977869350769271, "loss": 0.3371, "mean_token_accuracy": 0.9177048951387405, "num_tokens": 2013281.0, "step": 302 }, { "epoch": 2.424, "grad_norm": 0.6337592005729675, "learning_rate": 0.00011931475320813203, "loss": 0.5071, "mean_token_accuracy": 0.8825759738683701, "num_tokens": 2019547.0, "step": 303 }, { "epoch": 2.432, "grad_norm": 0.5821767449378967, "learning_rate": 0.00011885056257166714, "loss": 0.3934, "mean_token_accuracy": 0.8972548693418503, "num_tokens": 2026113.0, "step": 304 }, { "epoch": 2.44, "grad_norm": 0.5851284861564636, "learning_rate": 0.00011838613407356647, "loss": 0.4484, "mean_token_accuracy": 0.8894606381654739, "num_tokens": 2032809.0, "step": 305 }, { "epoch": 2.448, "grad_norm": 0.5118408203125, "learning_rate": 0.00011792148019549108, "loss": 0.392, "mean_token_accuracy": 0.8994560539722443, "num_tokens": 2039543.0, "step": 306 }, { "epoch": 2.456, "grad_norm": 0.5648627281188965, "learning_rate": 0.00011745661342515917, "loss": 0.4037, "mean_token_accuracy": 0.8992389291524887, "num_tokens": 2045498.0, "step": 307 }, { "epoch": 2.464, "grad_norm": 0.6364490985870361, "learning_rate": 0.00011699154625601059, "loss": 0.4773, "mean_token_accuracy": 0.8821236789226532, "num_tokens": 2051540.0, "step": 308 }, { "epoch": 2.472, "grad_norm": 0.6649516820907593, "learning_rate": 0.00011652629118687081, "loss": 0.4277, "mean_token_accuracy": 0.8933283537626266, "num_tokens": 2057708.0, "step": 309 }, { "epoch": 2.48, "grad_norm": 0.5486933588981628, "learning_rate": 0.00011606086072161529, "loss": 0.3309, "mean_token_accuracy": 0.9130555689334869, "num_tokens": 2065035.0, "step": 310 }, { "epoch": 2.488, "grad_norm": 0.6395343542098999, "learning_rate": 0.00011559526736883326, "loss": 0.3964, "mean_token_accuracy": 0.9014268219470978, "num_tokens": 2071559.0, "step": 311 }, { "epoch": 2.496, "grad_norm": 0.5258601307868958, "learning_rate": 0.00011512952364149163, "loss": 0.3307, "mean_token_accuracy": 0.9159080535173416, "num_tokens": 2077961.0, "step": 312 }, { "epoch": 2.504, "grad_norm": 0.5728172063827515, "learning_rate": 0.00011466364205659868, "loss": 0.3504, "mean_token_accuracy": 0.9113341420888901, "num_tokens": 2084338.0, "step": 313 }, { "epoch": 2.512, "grad_norm": 0.4729403853416443, "learning_rate": 0.00011419763513486758, "loss": 0.2482, "mean_token_accuracy": 0.9292099177837372, "num_tokens": 2090257.0, "step": 314 }, { "epoch": 2.52, "grad_norm": 0.5840467810630798, "learning_rate": 0.0001137315154003801, "loss": 0.4887, "mean_token_accuracy": 0.873644083738327, "num_tokens": 2096764.0, "step": 315 }, { "epoch": 2.528, "grad_norm": 0.5904984474182129, "learning_rate": 0.00011326529538024973, "loss": 0.3732, "mean_token_accuracy": 0.9035074561834335, "num_tokens": 2102793.0, "step": 316 }, { "epoch": 2.536, "grad_norm": 0.5417636632919312, "learning_rate": 0.00011279898760428534, "loss": 0.4495, "mean_token_accuracy": 0.8885599672794342, "num_tokens": 2110989.0, "step": 317 }, { "epoch": 2.544, "grad_norm": 0.5569573044776917, "learning_rate": 0.0001123326046046541, "loss": 0.3128, "mean_token_accuracy": 0.9152603000402451, "num_tokens": 2116687.0, "step": 318 }, { "epoch": 2.552, "grad_norm": 0.5581322908401489, "learning_rate": 0.00011186615891554498, "loss": 0.3806, "mean_token_accuracy": 0.9020133912563324, "num_tokens": 2123586.0, "step": 319 }, { "epoch": 2.56, "grad_norm": 0.5262582898139954, "learning_rate": 0.00011139966307283161, "loss": 0.3444, "mean_token_accuracy": 0.9165927618741989, "num_tokens": 2129931.0, "step": 320 }, { "epoch": 2.568, "grad_norm": 0.6239840388298035, "learning_rate": 0.00011093312961373561, "loss": 0.3606, "mean_token_accuracy": 0.90853650867939, "num_tokens": 2135060.0, "step": 321 }, { "epoch": 2.576, "grad_norm": 0.676482081413269, "learning_rate": 0.00011046657107648959, "loss": 0.3903, "mean_token_accuracy": 0.8996864557266235, "num_tokens": 2143165.0, "step": 322 }, { "epoch": 2.584, "grad_norm": 0.5831571221351624, "learning_rate": 0.00011000000000000002, "loss": 0.2968, "mean_token_accuracy": 0.9183152318000793, "num_tokens": 2148841.0, "step": 323 }, { "epoch": 2.592, "grad_norm": 0.5738188028335571, "learning_rate": 0.00010953342892351046, "loss": 0.2955, "mean_token_accuracy": 0.9245442301034927, "num_tokens": 2155034.0, "step": 324 }, { "epoch": 2.6, "grad_norm": 0.6479945182800293, "learning_rate": 0.0001090668703862644, "loss": 0.3616, "mean_token_accuracy": 0.9066135734319687, "num_tokens": 2162595.0, "step": 325 }, { "epoch": 2.608, "grad_norm": 0.5514622330665588, "learning_rate": 0.00010860033692716843, "loss": 0.4308, "mean_token_accuracy": 0.8917038440704346, "num_tokens": 2170216.0, "step": 326 }, { "epoch": 2.616, "grad_norm": 0.54876309633255, "learning_rate": 0.0001081338410844551, "loss": 0.3296, "mean_token_accuracy": 0.9163542240858078, "num_tokens": 2176010.0, "step": 327 }, { "epoch": 2.624, "grad_norm": 0.5075457692146301, "learning_rate": 0.00010766739539534591, "loss": 0.4033, "mean_token_accuracy": 0.9006696194410324, "num_tokens": 2183355.0, "step": 328 }, { "epoch": 2.632, "grad_norm": 0.5311897993087769, "learning_rate": 0.0001072010123957147, "loss": 0.443, "mean_token_accuracy": 0.8914453685283661, "num_tokens": 2191374.0, "step": 329 }, { "epoch": 2.64, "grad_norm": 0.6150345802307129, "learning_rate": 0.00010673470461975028, "loss": 0.4326, "mean_token_accuracy": 0.8918435126543045, "num_tokens": 2198126.0, "step": 330 }, { "epoch": 2.648, "grad_norm": 0.6158291101455688, "learning_rate": 0.00010626848459961993, "loss": 0.3973, "mean_token_accuracy": 0.9031812250614166, "num_tokens": 2205329.0, "step": 331 }, { "epoch": 2.656, "grad_norm": 0.584537923336029, "learning_rate": 0.00010580236486513244, "loss": 0.5038, "mean_token_accuracy": 0.8791113644838333, "num_tokens": 2210989.0, "step": 332 }, { "epoch": 2.664, "grad_norm": 0.48980802297592163, "learning_rate": 0.00010533635794340136, "loss": 0.3534, "mean_token_accuracy": 0.9118558913469315, "num_tokens": 2218251.0, "step": 333 }, { "epoch": 2.672, "grad_norm": 0.6641169190406799, "learning_rate": 0.00010487047635850837, "loss": 0.4302, "mean_token_accuracy": 0.9002415388822556, "num_tokens": 2225009.0, "step": 334 }, { "epoch": 2.68, "grad_norm": 0.6065587997436523, "learning_rate": 0.00010440473263116676, "loss": 0.426, "mean_token_accuracy": 0.8951088786125183, "num_tokens": 2230736.0, "step": 335 }, { "epoch": 2.6879999999999997, "grad_norm": 0.5646011233329773, "learning_rate": 0.00010393913927838475, "loss": 0.3362, "mean_token_accuracy": 0.9133762270212173, "num_tokens": 2237088.0, "step": 336 }, { "epoch": 2.6959999999999997, "grad_norm": 0.4926559031009674, "learning_rate": 0.00010347370881312926, "loss": 0.3314, "mean_token_accuracy": 0.9141460210084915, "num_tokens": 2243281.0, "step": 337 }, { "epoch": 2.7039999999999997, "grad_norm": 0.6366980671882629, "learning_rate": 0.00010300845374398944, "loss": 0.3993, "mean_token_accuracy": 0.8970449268817902, "num_tokens": 2249268.0, "step": 338 }, { "epoch": 2.7119999999999997, "grad_norm": 0.6825828552246094, "learning_rate": 0.00010254338657484086, "loss": 0.4136, "mean_token_accuracy": 0.9005266577005386, "num_tokens": 2255840.0, "step": 339 }, { "epoch": 2.7199999999999998, "grad_norm": 0.6325793862342834, "learning_rate": 0.00010207851980450897, "loss": 0.4124, "mean_token_accuracy": 0.8939056694507599, "num_tokens": 2263126.0, "step": 340 }, { "epoch": 2.7279999999999998, "grad_norm": 0.5315702557563782, "learning_rate": 0.00010161386592643356, "loss": 0.3984, "mean_token_accuracy": 0.898924395442009, "num_tokens": 2270071.0, "step": 341 }, { "epoch": 2.7359999999999998, "grad_norm": 0.48808005452156067, "learning_rate": 0.0001011494374283329, "loss": 0.3207, "mean_token_accuracy": 0.9150982201099396, "num_tokens": 2277444.0, "step": 342 }, { "epoch": 2.7439999999999998, "grad_norm": 0.6010448932647705, "learning_rate": 0.00010068524679186799, "loss": 0.4597, "mean_token_accuracy": 0.8895941376686096, "num_tokens": 2284738.0, "step": 343 }, { "epoch": 2.752, "grad_norm": 0.5889164805412292, "learning_rate": 0.0001002213064923073, "loss": 0.4473, "mean_token_accuracy": 0.8923371285200119, "num_tokens": 2291831.0, "step": 344 }, { "epoch": 2.76, "grad_norm": 0.6014875769615173, "learning_rate": 9.975762899819144e-05, "loss": 0.466, "mean_token_accuracy": 0.8965367525815964, "num_tokens": 2297765.0, "step": 345 }, { "epoch": 2.768, "grad_norm": 0.6199116706848145, "learning_rate": 9.929422677099802e-05, "loss": 0.5187, "mean_token_accuracy": 0.8750995695590973, "num_tokens": 2305641.0, "step": 346 }, { "epoch": 2.776, "grad_norm": 0.5978701710700989, "learning_rate": 9.883111226480665e-05, "loss": 0.432, "mean_token_accuracy": 0.8937551379203796, "num_tokens": 2312150.0, "step": 347 }, { "epoch": 2.784, "grad_norm": 0.6672642827033997, "learning_rate": 9.83682979259646e-05, "loss": 0.5309, "mean_token_accuracy": 0.880384549498558, "num_tokens": 2319869.0, "step": 348 }, { "epoch": 2.792, "grad_norm": 0.5927156209945679, "learning_rate": 9.790579619275182e-05, "loss": 0.3449, "mean_token_accuracy": 0.9099213778972626, "num_tokens": 2325735.0, "step": 349 }, { "epoch": 2.8, "grad_norm": 0.5710451602935791, "learning_rate": 9.744361949504694e-05, "loss": 0.4232, "mean_token_accuracy": 0.8960713893175125, "num_tokens": 2332447.0, "step": 350 }, { "epoch": 2.808, "grad_norm": 0.5022631287574768, "learning_rate": 9.698178025399325e-05, "loss": 0.3159, "mean_token_accuracy": 0.9238518178462982, "num_tokens": 2338404.0, "step": 351 }, { "epoch": 2.816, "grad_norm": 0.5941007137298584, "learning_rate": 9.652029088166468e-05, "loss": 0.4233, "mean_token_accuracy": 0.8963815271854401, "num_tokens": 2344633.0, "step": 352 }, { "epoch": 2.824, "grad_norm": 0.44153064489364624, "learning_rate": 9.605916378073238e-05, "loss": 0.2426, "mean_token_accuracy": 0.9335204064846039, "num_tokens": 2350349.0, "step": 353 }, { "epoch": 2.832, "grad_norm": 0.51079261302948, "learning_rate": 9.559841134413137e-05, "loss": 0.3345, "mean_token_accuracy": 0.9156464785337448, "num_tokens": 2355901.0, "step": 354 }, { "epoch": 2.84, "grad_norm": 0.5738810896873474, "learning_rate": 9.513804595472739e-05, "loss": 0.4553, "mean_token_accuracy": 0.8870168775320053, "num_tokens": 2363882.0, "step": 355 }, { "epoch": 2.848, "grad_norm": 0.5877161622047424, "learning_rate": 9.467807998498412e-05, "loss": 0.3729, "mean_token_accuracy": 0.9043427407741547, "num_tokens": 2371416.0, "step": 356 }, { "epoch": 2.856, "grad_norm": 0.5334792137145996, "learning_rate": 9.421852579663091e-05, "loss": 0.4161, "mean_token_accuracy": 0.8991669416427612, "num_tokens": 2380136.0, "step": 357 }, { "epoch": 2.864, "grad_norm": 0.6109706163406372, "learning_rate": 9.375939574033015e-05, "loss": 0.3871, "mean_token_accuracy": 0.9027614295482635, "num_tokens": 2385717.0, "step": 358 }, { "epoch": 2.872, "grad_norm": 0.5849063396453857, "learning_rate": 9.33007021553456e-05, "loss": 0.4116, "mean_token_accuracy": 0.8949293345212936, "num_tokens": 2392903.0, "step": 359 }, { "epoch": 2.88, "grad_norm": 0.567379355430603, "learning_rate": 9.284245736921084e-05, "loss": 0.4413, "mean_token_accuracy": 0.8969996869564056, "num_tokens": 2399565.0, "step": 360 }, { "epoch": 2.888, "grad_norm": 0.5341576337814331, "learning_rate": 9.238467369739765e-05, "loss": 0.3026, "mean_token_accuracy": 0.9232186675071716, "num_tokens": 2406742.0, "step": 361 }, { "epoch": 2.896, "grad_norm": 0.6203696727752686, "learning_rate": 9.192736344298536e-05, "loss": 0.4832, "mean_token_accuracy": 0.8850630819797516, "num_tokens": 2413366.0, "step": 362 }, { "epoch": 2.904, "grad_norm": 0.5966675877571106, "learning_rate": 9.147053889632998e-05, "loss": 0.475, "mean_token_accuracy": 0.8814896196126938, "num_tokens": 2420370.0, "step": 363 }, { "epoch": 2.912, "grad_norm": 0.7352378964424133, "learning_rate": 9.101421233473404e-05, "loss": 0.4751, "mean_token_accuracy": 0.8790076822042465, "num_tokens": 2425957.0, "step": 364 }, { "epoch": 2.92, "grad_norm": 0.649573564529419, "learning_rate": 9.05583960221165e-05, "loss": 0.483, "mean_token_accuracy": 0.8800929635763168, "num_tokens": 2432192.0, "step": 365 }, { "epoch": 2.928, "grad_norm": 0.6641559600830078, "learning_rate": 9.010310220868325e-05, "loss": 0.4624, "mean_token_accuracy": 0.8848322480916977, "num_tokens": 2439418.0, "step": 366 }, { "epoch": 2.936, "grad_norm": 0.6114389896392822, "learning_rate": 8.964834313059789e-05, "loss": 0.4182, "mean_token_accuracy": 0.8939742594957352, "num_tokens": 2446386.0, "step": 367 }, { "epoch": 2.944, "grad_norm": 0.49147117137908936, "learning_rate": 8.919413100965277e-05, "loss": 0.3372, "mean_token_accuracy": 0.9202758967876434, "num_tokens": 2454243.0, "step": 368 }, { "epoch": 2.952, "grad_norm": 0.6345622539520264, "learning_rate": 8.874047805294058e-05, "loss": 0.4655, "mean_token_accuracy": 0.8817432820796967, "num_tokens": 2461766.0, "step": 369 }, { "epoch": 2.96, "grad_norm": 0.6763973236083984, "learning_rate": 8.828739645252641e-05, "loss": 0.4068, "mean_token_accuracy": 0.9020190238952637, "num_tokens": 2469062.0, "step": 370 }, { "epoch": 2.968, "grad_norm": 0.642564594745636, "learning_rate": 8.783489838511989e-05, "loss": 0.3712, "mean_token_accuracy": 0.9081651717424393, "num_tokens": 2475360.0, "step": 371 }, { "epoch": 2.976, "grad_norm": 0.6450640559196472, "learning_rate": 8.738299601174802e-05, "loss": 0.4187, "mean_token_accuracy": 0.893525555729866, "num_tokens": 2481518.0, "step": 372 }, { "epoch": 2.984, "grad_norm": 0.6248624920845032, "learning_rate": 8.693170147742835e-05, "loss": 0.403, "mean_token_accuracy": 0.8992810100317001, "num_tokens": 2488088.0, "step": 373 }, { "epoch": 2.992, "grad_norm": 0.6563145518302917, "learning_rate": 8.648102691084256e-05, "loss": 0.4302, "mean_token_accuracy": 0.8928894251585007, "num_tokens": 2495134.0, "step": 374 }, { "epoch": 3.0, "grad_norm": 0.5383606553077698, "learning_rate": 8.603098442401049e-05, "loss": 0.2435, "mean_token_accuracy": 0.9337793737649918, "num_tokens": 2500152.0, "step": 375 }, { "epoch": 3.008, "grad_norm": 0.5827193856239319, "learning_rate": 8.558158611196471e-05, "loss": 0.391, "mean_token_accuracy": 0.9050875455141068, "num_tokens": 2505997.0, "step": 376 }, { "epoch": 3.016, "grad_norm": 0.6390306353569031, "learning_rate": 8.51328440524253e-05, "loss": 0.3528, "mean_token_accuracy": 0.9101446568965912, "num_tokens": 2512840.0, "step": 377 }, { "epoch": 3.024, "grad_norm": 0.5810700058937073, "learning_rate": 8.468477030547538e-05, "loss": 0.3012, "mean_token_accuracy": 0.9227552264928818, "num_tokens": 2520524.0, "step": 378 }, { "epoch": 3.032, "grad_norm": 0.551525890827179, "learning_rate": 8.423737691323696e-05, "loss": 0.3089, "mean_token_accuracy": 0.9186848849058151, "num_tokens": 2527060.0, "step": 379 }, { "epoch": 3.04, "grad_norm": 0.5348106622695923, "learning_rate": 8.379067589954735e-05, "loss": 0.325, "mean_token_accuracy": 0.9210143238306046, "num_tokens": 2533882.0, "step": 380 }, { "epoch": 3.048, "grad_norm": 0.6358270049095154, "learning_rate": 8.334467926963585e-05, "loss": 0.3007, "mean_token_accuracy": 0.918445810675621, "num_tokens": 2540847.0, "step": 381 }, { "epoch": 3.056, "grad_norm": 0.6169764995574951, "learning_rate": 8.289939900980142e-05, "loss": 0.2598, "mean_token_accuracy": 0.9337571561336517, "num_tokens": 2546625.0, "step": 382 }, { "epoch": 3.064, "grad_norm": 0.9182451367378235, "learning_rate": 8.245484708709015e-05, "loss": 0.375, "mean_token_accuracy": 0.903697207570076, "num_tokens": 2553964.0, "step": 383 }, { "epoch": 3.072, "grad_norm": 1.041986107826233, "learning_rate": 8.201103544897395e-05, "loss": 0.351, "mean_token_accuracy": 0.9103524535894394, "num_tokens": 2560322.0, "step": 384 }, { "epoch": 3.08, "grad_norm": 0.7462720274925232, "learning_rate": 8.156797602302935e-05, "loss": 0.3519, "mean_token_accuracy": 0.9099837243556976, "num_tokens": 2566454.0, "step": 385 }, { "epoch": 3.088, "grad_norm": 0.5972948670387268, "learning_rate": 8.112568071661692e-05, "loss": 0.3209, "mean_token_accuracy": 0.9207572788000107, "num_tokens": 2572530.0, "step": 386 }, { "epoch": 3.096, "grad_norm": 0.5934342741966248, "learning_rate": 8.068416141656127e-05, "loss": 0.3264, "mean_token_accuracy": 0.9189362674951553, "num_tokens": 2580032.0, "step": 387 }, { "epoch": 3.104, "grad_norm": 0.6861932277679443, "learning_rate": 8.024342998883157e-05, "loss": 0.3841, "mean_token_accuracy": 0.904761329293251, "num_tokens": 2586057.0, "step": 388 }, { "epoch": 3.112, "grad_norm": 0.6972135901451111, "learning_rate": 7.980349827822275e-05, "loss": 0.3524, "mean_token_accuracy": 0.9070709049701691, "num_tokens": 2592574.0, "step": 389 }, { "epoch": 3.12, "grad_norm": 0.6740308403968811, "learning_rate": 7.936437810803708e-05, "loss": 0.4022, "mean_token_accuracy": 0.9070966988801956, "num_tokens": 2598908.0, "step": 390 }, { "epoch": 3.128, "grad_norm": 0.6802906394004822, "learning_rate": 7.892608127976633e-05, "loss": 0.4084, "mean_token_accuracy": 0.8942376673221588, "num_tokens": 2604925.0, "step": 391 }, { "epoch": 3.136, "grad_norm": 0.6520804166793823, "learning_rate": 7.848861957277485e-05, "loss": 0.3844, "mean_token_accuracy": 0.9039040952920914, "num_tokens": 2611739.0, "step": 392 }, { "epoch": 3.144, "grad_norm": 0.6760826706886292, "learning_rate": 7.805200474398273e-05, "loss": 0.4538, "mean_token_accuracy": 0.885806143283844, "num_tokens": 2620140.0, "step": 393 }, { "epoch": 3.152, "grad_norm": 0.6366507411003113, "learning_rate": 7.761624852754992e-05, "loss": 0.2999, "mean_token_accuracy": 0.9276435822248459, "num_tokens": 2627295.0, "step": 394 }, { "epoch": 3.16, "grad_norm": 0.6806789636611938, "learning_rate": 7.718136263456106e-05, "loss": 0.2875, "mean_token_accuracy": 0.9271189719438553, "num_tokens": 2632835.0, "step": 395 }, { "epoch": 3.168, "grad_norm": 0.7685542106628418, "learning_rate": 7.67473587527104e-05, "loss": 0.4355, "mean_token_accuracy": 0.8999910950660706, "num_tokens": 2639959.0, "step": 396 }, { "epoch": 3.176, "grad_norm": 0.5358136892318726, "learning_rate": 7.631424854598792e-05, "loss": 0.2073, "mean_token_accuracy": 0.9396394193172455, "num_tokens": 2646328.0, "step": 397 }, { "epoch": 3.184, "grad_norm": 0.8773845434188843, "learning_rate": 7.588204365436589e-05, "loss": 0.4198, "mean_token_accuracy": 0.892386183142662, "num_tokens": 2652980.0, "step": 398 }, { "epoch": 3.192, "grad_norm": 0.607337474822998, "learning_rate": 7.545075569348579e-05, "loss": 0.2734, "mean_token_accuracy": 0.9268697798252106, "num_tokens": 2659278.0, "step": 399 }, { "epoch": 3.2, "grad_norm": 0.6373876333236694, "learning_rate": 7.502039625434644e-05, "loss": 0.3147, "mean_token_accuracy": 0.9217724651098251, "num_tokens": 2665794.0, "step": 400 }, { "epoch": 3.208, "grad_norm": 0.7114989757537842, "learning_rate": 7.45909769029923e-05, "loss": 0.3283, "mean_token_accuracy": 0.9154657870531082, "num_tokens": 2672199.0, "step": 401 }, { "epoch": 3.216, "grad_norm": 0.6082598567008972, "learning_rate": 7.41625091802027e-05, "loss": 0.3857, "mean_token_accuracy": 0.9070005118846893, "num_tokens": 2678647.0, "step": 402 }, { "epoch": 3.224, "grad_norm": 0.6768674254417419, "learning_rate": 7.373500460118148e-05, "loss": 0.3701, "mean_token_accuracy": 0.905963197350502, "num_tokens": 2685520.0, "step": 403 }, { "epoch": 3.232, "grad_norm": 0.5871829390525818, "learning_rate": 7.330847465524802e-05, "loss": 0.2989, "mean_token_accuracy": 0.9218492060899734, "num_tokens": 2692870.0, "step": 404 }, { "epoch": 3.24, "grad_norm": 0.620406985282898, "learning_rate": 7.288293080552785e-05, "loss": 0.3053, "mean_token_accuracy": 0.9199936836957932, "num_tokens": 2699686.0, "step": 405 }, { "epoch": 3.248, "grad_norm": 0.5530748963356018, "learning_rate": 7.245838448864497e-05, "loss": 0.2875, "mean_token_accuracy": 0.9256064593791962, "num_tokens": 2706879.0, "step": 406 }, { "epoch": 3.2560000000000002, "grad_norm": 0.6250858306884766, "learning_rate": 7.203484711441442e-05, "loss": 0.3277, "mean_token_accuracy": 0.9196437895298004, "num_tokens": 2713994.0, "step": 407 }, { "epoch": 3.2640000000000002, "grad_norm": 0.7308060526847839, "learning_rate": 7.161233006553545e-05, "loss": 0.3969, "mean_token_accuracy": 0.9017348885536194, "num_tokens": 2720293.0, "step": 408 }, { "epoch": 3.2720000000000002, "grad_norm": 0.7297453284263611, "learning_rate": 7.119084469728578e-05, "loss": 0.2733, "mean_token_accuracy": 0.9292089939117432, "num_tokens": 2726634.0, "step": 409 }, { "epoch": 3.2800000000000002, "grad_norm": 0.8090037107467651, "learning_rate": 7.077040233721662e-05, "loss": 0.3056, "mean_token_accuracy": 0.9220250397920609, "num_tokens": 2732336.0, "step": 410 }, { "epoch": 3.288, "grad_norm": 0.6746524572372437, "learning_rate": 7.035101428484767e-05, "loss": 0.3195, "mean_token_accuracy": 0.9225981682538986, "num_tokens": 2737931.0, "step": 411 }, { "epoch": 3.296, "grad_norm": 0.6420883536338806, "learning_rate": 6.993269181136397e-05, "loss": 0.3578, "mean_token_accuracy": 0.9099164307117462, "num_tokens": 2745451.0, "step": 412 }, { "epoch": 3.304, "grad_norm": 0.6245267987251282, "learning_rate": 6.951544615931278e-05, "loss": 0.3419, "mean_token_accuracy": 0.9129006862640381, "num_tokens": 2752572.0, "step": 413 }, { "epoch": 3.312, "grad_norm": 0.6458064317703247, "learning_rate": 6.909928854230146e-05, "loss": 0.3474, "mean_token_accuracy": 0.9182617217302322, "num_tokens": 2758870.0, "step": 414 }, { "epoch": 3.32, "grad_norm": 0.5720428824424744, "learning_rate": 6.868423014469597e-05, "loss": 0.2484, "mean_token_accuracy": 0.9298986792564392, "num_tokens": 2764555.0, "step": 415 }, { "epoch": 3.328, "grad_norm": 0.6797909140586853, "learning_rate": 6.827028212132052e-05, "loss": 0.2924, "mean_token_accuracy": 0.9219144433736801, "num_tokens": 2770455.0, "step": 416 }, { "epoch": 3.336, "grad_norm": 0.6589520573616028, "learning_rate": 6.785745559715774e-05, "loss": 0.2689, "mean_token_accuracy": 0.9259062707424164, "num_tokens": 2776352.0, "step": 417 }, { "epoch": 3.344, "grad_norm": 0.5898686051368713, "learning_rate": 6.744576166704941e-05, "loss": 0.2914, "mean_token_accuracy": 0.9280878603458405, "num_tokens": 2783499.0, "step": 418 }, { "epoch": 3.352, "grad_norm": 0.6554582715034485, "learning_rate": 6.703521139539855e-05, "loss": 0.3587, "mean_token_accuracy": 0.9137773662805557, "num_tokens": 2790376.0, "step": 419 }, { "epoch": 3.36, "grad_norm": 0.709206223487854, "learning_rate": 6.66258158158722e-05, "loss": 0.3575, "mean_token_accuracy": 0.9057103246450424, "num_tokens": 2796980.0, "step": 420 }, { "epoch": 3.368, "grad_norm": 0.7458584904670715, "learning_rate": 6.621758593110444e-05, "loss": 0.3876, "mean_token_accuracy": 0.9051748067140579, "num_tokens": 2803574.0, "step": 421 }, { "epoch": 3.376, "grad_norm": 0.8126646876335144, "learning_rate": 6.581053271240101e-05, "loss": 0.3947, "mean_token_accuracy": 0.9007379114627838, "num_tokens": 2810515.0, "step": 422 }, { "epoch": 3.384, "grad_norm": 0.7045720219612122, "learning_rate": 6.540466709944446e-05, "loss": 0.3656, "mean_token_accuracy": 0.9115022867918015, "num_tokens": 2817789.0, "step": 423 }, { "epoch": 3.392, "grad_norm": 0.8205469846725464, "learning_rate": 6.500000000000002e-05, "loss": 0.3827, "mean_token_accuracy": 0.900556743144989, "num_tokens": 2823484.0, "step": 424 }, { "epoch": 3.4, "grad_norm": 0.657362699508667, "learning_rate": 6.459654228962244e-05, "loss": 0.2977, "mean_token_accuracy": 0.9223733395338058, "num_tokens": 2829626.0, "step": 425 }, { "epoch": 3.408, "grad_norm": 0.6915479302406311, "learning_rate": 6.419430481136381e-05, "loss": 0.3139, "mean_token_accuracy": 0.9167509377002716, "num_tokens": 2836640.0, "step": 426 }, { "epoch": 3.416, "grad_norm": 0.6523987650871277, "learning_rate": 6.379329837548216e-05, "loss": 0.3915, "mean_token_accuracy": 0.9017972648143768, "num_tokens": 2844323.0, "step": 427 }, { "epoch": 3.424, "grad_norm": 0.624142587184906, "learning_rate": 6.339353375915071e-05, "loss": 0.2825, "mean_token_accuracy": 0.9284717440605164, "num_tokens": 2850061.0, "step": 428 }, { "epoch": 3.432, "grad_norm": 0.6127302050590515, "learning_rate": 6.29950217061686e-05, "loss": 0.2936, "mean_token_accuracy": 0.9226386398077011, "num_tokens": 2856118.0, "step": 429 }, { "epoch": 3.44, "grad_norm": 0.6790028214454651, "learning_rate": 6.259777292667172e-05, "loss": 0.403, "mean_token_accuracy": 0.9005888849496841, "num_tokens": 2862733.0, "step": 430 }, { "epoch": 3.448, "grad_norm": 0.6278355121612549, "learning_rate": 6.220179809684524e-05, "loss": 0.2773, "mean_token_accuracy": 0.9252165108919144, "num_tokens": 2868945.0, "step": 431 }, { "epoch": 3.456, "grad_norm": 0.6876513361930847, "learning_rate": 6.180710785863655e-05, "loss": 0.3281, "mean_token_accuracy": 0.9138449281454086, "num_tokens": 2876038.0, "step": 432 }, { "epoch": 3.464, "grad_norm": 0.6686937808990479, "learning_rate": 6.141371281946908e-05, "loss": 0.3122, "mean_token_accuracy": 0.9213560372591019, "num_tokens": 2883508.0, "step": 433 }, { "epoch": 3.472, "grad_norm": 0.745513379573822, "learning_rate": 6.102162355195753e-05, "loss": 0.3948, "mean_token_accuracy": 0.8994506299495697, "num_tokens": 2890314.0, "step": 434 }, { "epoch": 3.48, "grad_norm": 0.7086876630783081, "learning_rate": 6.063085059362358e-05, "loss": 0.3515, "mean_token_accuracy": 0.9120168387889862, "num_tokens": 2897111.0, "step": 435 }, { "epoch": 3.488, "grad_norm": 0.6623766422271729, "learning_rate": 6.024140444661258e-05, "loss": 0.304, "mean_token_accuracy": 0.9256622493267059, "num_tokens": 2903349.0, "step": 436 }, { "epoch": 3.496, "grad_norm": 0.7545041441917419, "learning_rate": 5.985329557741146e-05, "loss": 0.418, "mean_token_accuracy": 0.9027806520462036, "num_tokens": 2909800.0, "step": 437 }, { "epoch": 3.504, "grad_norm": 0.8531441688537598, "learning_rate": 5.946653441656741e-05, "loss": 0.4767, "mean_token_accuracy": 0.8789743334054947, "num_tokens": 2915937.0, "step": 438 }, { "epoch": 3.512, "grad_norm": 0.7345475554466248, "learning_rate": 5.908113135840758e-05, "loss": 0.3281, "mean_token_accuracy": 0.9175153225660324, "num_tokens": 2922189.0, "step": 439 }, { "epoch": 3.52, "grad_norm": 0.7147907018661499, "learning_rate": 5.8697096760759565e-05, "loss": 0.32, "mean_token_accuracy": 0.9190562069416046, "num_tokens": 2929265.0, "step": 440 }, { "epoch": 3.528, "grad_norm": 0.6168371438980103, "learning_rate": 5.831444094467326e-05, "loss": 0.2761, "mean_token_accuracy": 0.9286189675331116, "num_tokens": 2936181.0, "step": 441 }, { "epoch": 3.536, "grad_norm": 0.8118520975112915, "learning_rate": 5.793317419414337e-05, "loss": 0.4774, "mean_token_accuracy": 0.8767688423395157, "num_tokens": 2942451.0, "step": 442 }, { "epoch": 3.544, "grad_norm": 0.7241776585578918, "learning_rate": 5.755330675583292e-05, "loss": 0.3554, "mean_token_accuracy": 0.9102961868047714, "num_tokens": 2949461.0, "step": 443 }, { "epoch": 3.552, "grad_norm": 0.7084028720855713, "learning_rate": 5.717484883879811e-05, "loss": 0.2856, "mean_token_accuracy": 0.9266216903924942, "num_tokens": 2955279.0, "step": 444 }, { "epoch": 3.56, "grad_norm": 0.7020514607429504, "learning_rate": 5.6797810614213806e-05, "loss": 0.3466, "mean_token_accuracy": 0.9103652834892273, "num_tokens": 2961545.0, "step": 445 }, { "epoch": 3.568, "grad_norm": 0.6163370013237, "learning_rate": 5.642220221510008e-05, "loss": 0.2283, "mean_token_accuracy": 0.9415542483329773, "num_tokens": 2967608.0, "step": 446 }, { "epoch": 3.576, "grad_norm": 0.6595510840415955, "learning_rate": 5.604803373605006e-05, "loss": 0.3214, "mean_token_accuracy": 0.9144968539476395, "num_tokens": 2974757.0, "step": 447 }, { "epoch": 3.584, "grad_norm": 0.7288110852241516, "learning_rate": 5.567531523295868e-05, "loss": 0.4025, "mean_token_accuracy": 0.8981879353523254, "num_tokens": 2981455.0, "step": 448 }, { "epoch": 3.592, "grad_norm": 0.68780916929245, "learning_rate": 5.5304056722752185e-05, "loss": 0.3366, "mean_token_accuracy": 0.9154670089483261, "num_tokens": 2987301.0, "step": 449 }, { "epoch": 3.6, "grad_norm": 0.879482626914978, "learning_rate": 5.493426818311902e-05, "loss": 0.3783, "mean_token_accuracy": 0.9188111573457718, "num_tokens": 2993179.0, "step": 450 }, { "epoch": 3.608, "grad_norm": 0.6251075863838196, "learning_rate": 5.456595955224184e-05, "loss": 0.2806, "mean_token_accuracy": 0.927557647228241, "num_tokens": 3000022.0, "step": 451 }, { "epoch": 3.616, "grad_norm": 0.7020049691200256, "learning_rate": 5.419914072853025e-05, "loss": 0.3618, "mean_token_accuracy": 0.9032842069864273, "num_tokens": 3007237.0, "step": 452 }, { "epoch": 3.624, "grad_norm": 0.5821812748908997, "learning_rate": 5.383382157035477e-05, "loss": 0.2331, "mean_token_accuracy": 0.9408837407827377, "num_tokens": 3013463.0, "step": 453 }, { "epoch": 3.632, "grad_norm": 0.6966884732246399, "learning_rate": 5.347001189578198e-05, "loss": 0.3973, "mean_token_accuracy": 0.9004788398742676, "num_tokens": 3020153.0, "step": 454 }, { "epoch": 3.64, "grad_norm": 0.6050215363502502, "learning_rate": 5.3107721482310634e-05, "loss": 0.3171, "mean_token_accuracy": 0.922451063990593, "num_tokens": 3027354.0, "step": 455 }, { "epoch": 3.648, "grad_norm": 0.7329418063163757, "learning_rate": 5.27469600666088e-05, "loss": 0.2924, "mean_token_accuracy": 0.9290041327476501, "num_tokens": 3033845.0, "step": 456 }, { "epoch": 3.656, "grad_norm": 0.6909907460212708, "learning_rate": 5.2387737344252365e-05, "loss": 0.2902, "mean_token_accuracy": 0.9246671795845032, "num_tokens": 3040052.0, "step": 457 }, { "epoch": 3.664, "grad_norm": 0.7824153304100037, "learning_rate": 5.203006296946421e-05, "loss": 0.47, "mean_token_accuracy": 0.8981824964284897, "num_tokens": 3047655.0, "step": 458 }, { "epoch": 3.672, "grad_norm": 0.7358590960502625, "learning_rate": 5.1673946554855e-05, "loss": 0.2781, "mean_token_accuracy": 0.9266254603862762, "num_tokens": 3052496.0, "step": 459 }, { "epoch": 3.68, "grad_norm": 0.6887803077697754, "learning_rate": 5.131939767116472e-05, "loss": 0.339, "mean_token_accuracy": 0.9165306240320206, "num_tokens": 3058822.0, "step": 460 }, { "epoch": 3.6879999999999997, "grad_norm": 0.7517057657241821, "learning_rate": 5.096642584700542e-05, "loss": 0.4364, "mean_token_accuracy": 0.897202730178833, "num_tokens": 3065038.0, "step": 461 }, { "epoch": 3.6959999999999997, "grad_norm": 0.6500712037086487, "learning_rate": 5.061504056860522e-05, "loss": 0.3102, "mean_token_accuracy": 0.9211316555738449, "num_tokens": 3073091.0, "step": 462 }, { "epoch": 3.7039999999999997, "grad_norm": 0.6732147932052612, "learning_rate": 5.0265251279553304e-05, "loss": 0.2944, "mean_token_accuracy": 0.9234922379255295, "num_tokens": 3079702.0, "step": 463 }, { "epoch": 3.7119999999999997, "grad_norm": 0.694962203502655, "learning_rate": 4.991706738054618e-05, "loss": 0.3778, "mean_token_accuracy": 0.9041986465454102, "num_tokens": 3086844.0, "step": 464 }, { "epoch": 3.7199999999999998, "grad_norm": 0.6793336868286133, "learning_rate": 4.9570498229134986e-05, "loss": 0.3438, "mean_token_accuracy": 0.9137970954179764, "num_tokens": 3094103.0, "step": 465 }, { "epoch": 3.7279999999999998, "grad_norm": 0.6886551380157471, "learning_rate": 4.922555313947397e-05, "loss": 0.3581, "mean_token_accuracy": 0.9094865620136261, "num_tokens": 3100418.0, "step": 466 }, { "epoch": 3.7359999999999998, "grad_norm": 0.7724429368972778, "learning_rate": 4.888224138207029e-05, "loss": 0.3566, "mean_token_accuracy": 0.9097979664802551, "num_tokens": 3106630.0, "step": 467 }, { "epoch": 3.7439999999999998, "grad_norm": 0.6701758503913879, "learning_rate": 4.8540572183534676e-05, "loss": 0.3529, "mean_token_accuracy": 0.9114904552698135, "num_tokens": 3113833.0, "step": 468 }, { "epoch": 3.752, "grad_norm": 0.7943323850631714, "learning_rate": 4.8200554726333695e-05, "loss": 0.3224, "mean_token_accuracy": 0.9194429665803909, "num_tokens": 3120525.0, "step": 469 }, { "epoch": 3.76, "grad_norm": 0.8230472803115845, "learning_rate": 4.7862198148542804e-05, "loss": 0.3263, "mean_token_accuracy": 0.9154174476861954, "num_tokens": 3126456.0, "step": 470 }, { "epoch": 3.768, "grad_norm": 0.702497124671936, "learning_rate": 4.752551154360077e-05, "loss": 0.2665, "mean_token_accuracy": 0.9310731291770935, "num_tokens": 3133581.0, "step": 471 }, { "epoch": 3.776, "grad_norm": 0.6627335548400879, "learning_rate": 4.719050396006535e-05, "loss": 0.2903, "mean_token_accuracy": 0.9243995696306229, "num_tokens": 3139926.0, "step": 472 }, { "epoch": 3.784, "grad_norm": 0.733246386051178, "learning_rate": 4.685718440137011e-05, "loss": 0.3214, "mean_token_accuracy": 0.9217909723520279, "num_tokens": 3146541.0, "step": 473 }, { "epoch": 3.792, "grad_norm": 0.7050088047981262, "learning_rate": 4.652556182558237e-05, "loss": 0.4576, "mean_token_accuracy": 0.8917857855558395, "num_tokens": 3154227.0, "step": 474 }, { "epoch": 3.8, "grad_norm": 0.7012367844581604, "learning_rate": 4.619564514516245e-05, "loss": 0.3395, "mean_token_accuracy": 0.9158650636672974, "num_tokens": 3161062.0, "step": 475 }, { "epoch": 3.808, "grad_norm": 0.6993271112442017, "learning_rate": 4.5867443226724386e-05, "loss": 0.2893, "mean_token_accuracy": 0.9277119338512421, "num_tokens": 3166777.0, "step": 476 }, { "epoch": 3.816, "grad_norm": 0.6473844051361084, "learning_rate": 4.554096489079727e-05, "loss": 0.3589, "mean_token_accuracy": 0.9181333482265472, "num_tokens": 3174604.0, "step": 477 }, { "epoch": 3.824, "grad_norm": 0.7299215793609619, "learning_rate": 4.5216218911588396e-05, "loss": 0.3559, "mean_token_accuracy": 0.9077121019363403, "num_tokens": 3180898.0, "step": 478 }, { "epoch": 3.832, "grad_norm": 0.7105717062950134, "learning_rate": 4.489321401674753e-05, "loss": 0.3604, "mean_token_accuracy": 0.907567098736763, "num_tokens": 3188305.0, "step": 479 }, { "epoch": 3.84, "grad_norm": 0.7652660608291626, "learning_rate": 4.457195888713219e-05, "loss": 0.4361, "mean_token_accuracy": 0.9033184349536896, "num_tokens": 3195214.0, "step": 480 }, { "epoch": 3.848, "grad_norm": 0.668923020362854, "learning_rate": 4.425246215657436e-05, "loss": 0.3239, "mean_token_accuracy": 0.9226205497980118, "num_tokens": 3201799.0, "step": 481 }, { "epoch": 3.856, "grad_norm": 0.6668715476989746, "learning_rate": 4.3934732411648585e-05, "loss": 0.2954, "mean_token_accuracy": 0.9225424528121948, "num_tokens": 3208368.0, "step": 482 }, { "epoch": 3.864, "grad_norm": 0.7104055881500244, "learning_rate": 4.36187781914411e-05, "loss": 0.3945, "mean_token_accuracy": 0.9023055136203766, "num_tokens": 3215720.0, "step": 483 }, { "epoch": 3.872, "grad_norm": 0.6516867280006409, "learning_rate": 4.33046079873203e-05, "loss": 0.2958, "mean_token_accuracy": 0.9258823990821838, "num_tokens": 3222336.0, "step": 484 }, { "epoch": 3.88, "grad_norm": 0.6678684949874878, "learning_rate": 4.2992230242708645e-05, "loss": 0.3172, "mean_token_accuracy": 0.9214581400156021, "num_tokens": 3228529.0, "step": 485 }, { "epoch": 3.888, "grad_norm": 0.6983867287635803, "learning_rate": 4.268165335285566e-05, "loss": 0.3673, "mean_token_accuracy": 0.9060652107000351, "num_tokens": 3236563.0, "step": 486 }, { "epoch": 3.896, "grad_norm": 0.7120466828346252, "learning_rate": 4.237288566461235e-05, "loss": 0.3655, "mean_token_accuracy": 0.9145108759403229, "num_tokens": 3243835.0, "step": 487 }, { "epoch": 3.904, "grad_norm": 0.9121913909912109, "learning_rate": 4.2065935476206885e-05, "loss": 0.3793, "mean_token_accuracy": 0.9070864021778107, "num_tokens": 3250720.0, "step": 488 }, { "epoch": 3.912, "grad_norm": 0.6981674432754517, "learning_rate": 4.1760811037021484e-05, "loss": 0.3597, "mean_token_accuracy": 0.9061014503240585, "num_tokens": 3258490.0, "step": 489 }, { "epoch": 3.92, "grad_norm": 0.7587934732437134, "learning_rate": 4.145752054737087e-05, "loss": 0.4563, "mean_token_accuracy": 0.8915009945631027, "num_tokens": 3265051.0, "step": 490 }, { "epoch": 3.928, "grad_norm": 0.7366196513175964, "learning_rate": 4.115607215828172e-05, "loss": 0.3995, "mean_token_accuracy": 0.9052659422159195, "num_tokens": 3271916.0, "step": 491 }, { "epoch": 3.936, "grad_norm": 0.8313382863998413, "learning_rate": 4.085647397127376e-05, "loss": 0.4489, "mean_token_accuracy": 0.8994324654340744, "num_tokens": 3278383.0, "step": 492 }, { "epoch": 3.944, "grad_norm": 0.8330784440040588, "learning_rate": 4.055873403814191e-05, "loss": 0.386, "mean_token_accuracy": 0.9015507251024246, "num_tokens": 3285298.0, "step": 493 }, { "epoch": 3.952, "grad_norm": 0.6721290946006775, "learning_rate": 4.0262860360739915e-05, "loss": 0.275, "mean_token_accuracy": 0.9298962950706482, "num_tokens": 3291643.0, "step": 494 }, { "epoch": 3.96, "grad_norm": 0.7431320548057556, "learning_rate": 3.996886089076541e-05, "loss": 0.4362, "mean_token_accuracy": 0.8939598053693771, "num_tokens": 3299207.0, "step": 495 }, { "epoch": 3.968, "grad_norm": 0.6020960807800293, "learning_rate": 3.967674352954599e-05, "loss": 0.2915, "mean_token_accuracy": 0.9262511730194092, "num_tokens": 3306725.0, "step": 496 }, { "epoch": 3.976, "grad_norm": 0.6336486339569092, "learning_rate": 3.938651612782707e-05, "loss": 0.2658, "mean_token_accuracy": 0.9318113178014755, "num_tokens": 3311940.0, "step": 497 }, { "epoch": 3.984, "grad_norm": 0.7199985384941101, "learning_rate": 3.909818648556082e-05, "loss": 0.3755, "mean_token_accuracy": 0.9044479876756668, "num_tokens": 3319740.0, "step": 498 }, { "epoch": 3.992, "grad_norm": 0.7833542823791504, "learning_rate": 3.881176235169648e-05, "loss": 0.3921, "mean_token_accuracy": 0.8967743813991547, "num_tokens": 3325646.0, "step": 499 }, { "epoch": 4.0, "grad_norm": 0.6990426182746887, "learning_rate": 3.852725142397219e-05, "loss": 0.3151, "mean_token_accuracy": 0.9170209616422653, "num_tokens": 3333536.0, "step": 500 } ], "logging_steps": 1, "max_steps": 625, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 5.827222227488164e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }