{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.31108230719377833, "eval_steps": 300, "global_step": 1200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0012961762799740765, "grad_norm": 8.518570899963379, "learning_rate": 1.724137931034483e-06, "loss": 6.9757, "step": 5 }, { "epoch": 0.002592352559948153, "grad_norm": 10.28520393371582, "learning_rate": 3.8793103448275865e-06, "loss": 6.929, "step": 10 }, { "epoch": 0.0038885288399222295, "grad_norm": 8.170079231262207, "learning_rate": 6.03448275862069e-06, "loss": 6.7623, "step": 15 }, { "epoch": 0.005184705119896306, "grad_norm": 8.278770446777344, "learning_rate": 8.189655172413793e-06, "loss": 6.9025, "step": 20 }, { "epoch": 0.0064808813998703824, "grad_norm": 8.211565017700195, "learning_rate": 1.0344827586206897e-05, "loss": 6.7733, "step": 25 }, { "epoch": 0.007777057679844459, "grad_norm": 8.835349082946777, "learning_rate": 1.25e-05, "loss": 6.8982, "step": 30 }, { "epoch": 0.009073233959818535, "grad_norm": 8.735467910766602, "learning_rate": 1.4655172413793103e-05, "loss": 7.1852, "step": 35 }, { "epoch": 0.010369410239792612, "grad_norm": 8.508360862731934, "learning_rate": 1.6810344827586207e-05, "loss": 7.148, "step": 40 }, { "epoch": 0.011665586519766688, "grad_norm": 7.931098461151123, "learning_rate": 1.896551724137931e-05, "loss": 7.0417, "step": 45 }, { "epoch": 0.012961762799740765, "grad_norm": 9.208560943603516, "learning_rate": 2.1120689655172415e-05, "loss": 7.1105, "step": 50 }, { "epoch": 0.014257939079714841, "grad_norm": 8.593817710876465, "learning_rate": 2.327586206896552e-05, "loss": 6.9046, "step": 55 }, { "epoch": 0.015554115359688918, "grad_norm": 8.273022651672363, "learning_rate": 2.543103448275862e-05, "loss": 6.8974, "step": 60 }, { "epoch": 0.016850291639662993, "grad_norm": 8.649996757507324, "learning_rate": 2.7586206896551727e-05, "loss": 7.0945, "step": 65 }, { "epoch": 0.01814646791963707, "grad_norm": 8.942249298095703, "learning_rate": 2.974137931034483e-05, "loss": 6.5558, "step": 70 }, { "epoch": 0.019442644199611146, "grad_norm": 8.8996000289917, "learning_rate": 3.1896551724137935e-05, "loss": 7.2669, "step": 75 }, { "epoch": 0.020738820479585224, "grad_norm": 8.445710182189941, "learning_rate": 3.405172413793103e-05, "loss": 7.9577, "step": 80 }, { "epoch": 0.0220349967595593, "grad_norm": 8.136696815490723, "learning_rate": 3.620689655172414e-05, "loss": 7.3408, "step": 85 }, { "epoch": 0.023331173039533377, "grad_norm": 8.659695625305176, "learning_rate": 3.8362068965517246e-05, "loss": 7.0263, "step": 90 }, { "epoch": 0.02462734931950745, "grad_norm": 8.280396461486816, "learning_rate": 4.0517241379310344e-05, "loss": 7.285, "step": 95 }, { "epoch": 0.02592352559948153, "grad_norm": 8.545528411865234, "learning_rate": 4.267241379310345e-05, "loss": 7.0919, "step": 100 }, { "epoch": 0.027219701879455604, "grad_norm": 8.349964141845703, "learning_rate": 4.482758620689655e-05, "loss": 7.0525, "step": 105 }, { "epoch": 0.028515878159429683, "grad_norm": 9.21499252319336, "learning_rate": 4.698275862068966e-05, "loss": 6.8216, "step": 110 }, { "epoch": 0.029812054439403757, "grad_norm": 8.90820026397705, "learning_rate": 4.913793103448276e-05, "loss": 6.7591, "step": 115 }, { "epoch": 0.031108230719377836, "grad_norm": 8.280736923217773, "learning_rate": 5.129310344827587e-05, "loss": 6.7079, "step": 120 }, { "epoch": 0.03240440699935191, "grad_norm": 8.287160873413086, "learning_rate": 5.344827586206896e-05, "loss": 6.6935, "step": 125 }, { "epoch": 0.033700583279325985, "grad_norm": 8.187925338745117, "learning_rate": 5.560344827586207e-05, "loss": 6.5272, "step": 130 }, { "epoch": 0.03499675955930007, "grad_norm": 8.437505722045898, "learning_rate": 5.7758620689655175e-05, "loss": 7.2082, "step": 135 }, { "epoch": 0.03629293583927414, "grad_norm": 8.31921100616455, "learning_rate": 5.991379310344828e-05, "loss": 7.1288, "step": 140 }, { "epoch": 0.037589112119248216, "grad_norm": 8.23615837097168, "learning_rate": 6.206896551724138e-05, "loss": 6.929, "step": 145 }, { "epoch": 0.03888528839922229, "grad_norm": 9.158825874328613, "learning_rate": 6.422413793103449e-05, "loss": 6.9356, "step": 150 }, { "epoch": 0.04018146467919637, "grad_norm": 8.400908470153809, "learning_rate": 6.637931034482759e-05, "loss": 6.8375, "step": 155 }, { "epoch": 0.04147764095917045, "grad_norm": 8.317960739135742, "learning_rate": 6.85344827586207e-05, "loss": 6.8323, "step": 160 }, { "epoch": 0.04277381723914452, "grad_norm": 8.557168006896973, "learning_rate": 7.06896551724138e-05, "loss": 7.2039, "step": 165 }, { "epoch": 0.0440699935191186, "grad_norm": 7.868274688720703, "learning_rate": 7.28448275862069e-05, "loss": 6.9444, "step": 170 }, { "epoch": 0.04536616979909268, "grad_norm": 7.6865081787109375, "learning_rate": 7.500000000000001e-05, "loss": 6.9228, "step": 175 }, { "epoch": 0.046662346079066754, "grad_norm": 8.205500602722168, "learning_rate": 7.715517241379311e-05, "loss": 7.2978, "step": 180 }, { "epoch": 0.04795852235904083, "grad_norm": 8.222076416015625, "learning_rate": 7.931034482758621e-05, "loss": 7.3827, "step": 185 }, { "epoch": 0.0492546986390149, "grad_norm": 8.975173950195312, "learning_rate": 8.146551724137932e-05, "loss": 6.9009, "step": 190 }, { "epoch": 0.050550874918988985, "grad_norm": 7.8934197425842285, "learning_rate": 8.362068965517241e-05, "loss": 7.4355, "step": 195 }, { "epoch": 0.05184705119896306, "grad_norm": 10.264134407043457, "learning_rate": 8.577586206896551e-05, "loss": 7.535, "step": 200 }, { "epoch": 0.053143227478937134, "grad_norm": 7.964872360229492, "learning_rate": 8.793103448275862e-05, "loss": 7.3533, "step": 205 }, { "epoch": 0.05443940375891121, "grad_norm": 8.749735832214355, "learning_rate": 9.008620689655173e-05, "loss": 6.8626, "step": 210 }, { "epoch": 0.05573558003888529, "grad_norm": 8.586889266967773, "learning_rate": 9.224137931034484e-05, "loss": 7.6321, "step": 215 }, { "epoch": 0.057031756318859365, "grad_norm": 8.07584285736084, "learning_rate": 9.439655172413794e-05, "loss": 6.9314, "step": 220 }, { "epoch": 0.05832793259883344, "grad_norm": 8.720308303833008, "learning_rate": 9.655172413793105e-05, "loss": 7.0986, "step": 225 }, { "epoch": 0.059624108878807515, "grad_norm": 7.970818996429443, "learning_rate": 9.870689655172414e-05, "loss": 6.9804, "step": 230 }, { "epoch": 0.0609202851587816, "grad_norm": 7.308590412139893, "learning_rate": 9.999992493386817e-05, "loss": 7.1467, "step": 235 }, { "epoch": 0.06221646143875567, "grad_norm": 7.499377250671387, "learning_rate": 9.999908044247358e-05, "loss": 7.0036, "step": 240 }, { "epoch": 0.06351263771872975, "grad_norm": 9.098915100097656, "learning_rate": 9.999729764292059e-05, "loss": 7.2308, "step": 245 }, { "epoch": 0.06480881399870382, "grad_norm": 8.107562065124512, "learning_rate": 9.999457656866613e-05, "loss": 6.6914, "step": 250 }, { "epoch": 0.0661049902786779, "grad_norm": 10.761311531066895, "learning_rate": 9.999091727077524e-05, "loss": 7.1625, "step": 255 }, { "epoch": 0.06740116655865197, "grad_norm": 7.404232025146484, "learning_rate": 9.99863198179202e-05, "loss": 7.5691, "step": 260 }, { "epoch": 0.06869734283862605, "grad_norm": 8.12511157989502, "learning_rate": 9.99807842963791e-05, "loss": 7.5045, "step": 265 }, { "epoch": 0.06999351911860013, "grad_norm": 7.8374505043029785, "learning_rate": 9.99743108100344e-05, "loss": 7.1271, "step": 270 }, { "epoch": 0.0712896953985742, "grad_norm": 7.930497169494629, "learning_rate": 9.99668994803708e-05, "loss": 7.2667, "step": 275 }, { "epoch": 0.07258587167854828, "grad_norm": 7.750555515289307, "learning_rate": 9.99585504464731e-05, "loss": 7.1903, "step": 280 }, { "epoch": 0.07388204795852236, "grad_norm": 8.279412269592285, "learning_rate": 9.99492638650235e-05, "loss": 7.1222, "step": 285 }, { "epoch": 0.07517822423849643, "grad_norm": 11.771440505981445, "learning_rate": 9.993903991029873e-05, "loss": 6.9378, "step": 290 }, { "epoch": 0.07647440051847051, "grad_norm": 6.922826766967773, "learning_rate": 9.99278787741667e-05, "loss": 7.6272, "step": 295 }, { "epoch": 0.07777057679844458, "grad_norm": 7.709715843200684, "learning_rate": 9.991578066608296e-05, "loss": 7.0954, "step": 300 }, { "epoch": 0.07777057679844458, "eval_loss": 1.8095265626907349, "eval_runtime": 352.4956, "eval_samples_per_second": 9.623, "eval_steps_per_second": 1.203, "step": 300 }, { "epoch": 0.07906675307841866, "grad_norm": 8.642642974853516, "learning_rate": 9.990274581308676e-05, "loss": 7.1369, "step": 305 }, { "epoch": 0.08036292935839275, "grad_norm": 7.364828109741211, "learning_rate": 9.98887744597968e-05, "loss": 7.3196, "step": 310 }, { "epoch": 0.08165910563836681, "grad_norm": 8.421533584594727, "learning_rate": 9.987386686840658e-05, "loss": 7.3091, "step": 315 }, { "epoch": 0.0829552819183409, "grad_norm": 7.364363193511963, "learning_rate": 9.985802331867953e-05, "loss": 7.1727, "step": 320 }, { "epoch": 0.08425145819831498, "grad_norm": 7.456173896789551, "learning_rate": 9.984124410794376e-05, "loss": 7.1307, "step": 325 }, { "epoch": 0.08554763447828904, "grad_norm": 10.0582857131958, "learning_rate": 9.982352955108648e-05, "loss": 7.5445, "step": 330 }, { "epoch": 0.08684381075826313, "grad_norm": 7.809337615966797, "learning_rate": 9.980487998054806e-05, "loss": 7.4881, "step": 335 }, { "epoch": 0.0881399870382372, "grad_norm": 7.382376194000244, "learning_rate": 9.978529574631583e-05, "loss": 6.9715, "step": 340 }, { "epoch": 0.08943616331821128, "grad_norm": 7.3704447746276855, "learning_rate": 9.976477721591745e-05, "loss": 7.5716, "step": 345 }, { "epoch": 0.09073233959818536, "grad_norm": 7.512279987335205, "learning_rate": 9.974332477441415e-05, "loss": 7.2183, "step": 350 }, { "epoch": 0.09202851587815943, "grad_norm": 7.8721923828125, "learning_rate": 9.972093882439331e-05, "loss": 7.5339, "step": 355 }, { "epoch": 0.09332469215813351, "grad_norm": 7.310612678527832, "learning_rate": 9.969761978596104e-05, "loss": 7.6793, "step": 360 }, { "epoch": 0.09462086843810759, "grad_norm": 6.280037879943848, "learning_rate": 9.96733680967343e-05, "loss": 7.1348, "step": 365 }, { "epoch": 0.09591704471808166, "grad_norm": 7.298839569091797, "learning_rate": 9.96481842118326e-05, "loss": 7.3563, "step": 370 }, { "epoch": 0.09721322099805574, "grad_norm": 8.150246620178223, "learning_rate": 9.962206860386952e-05, "loss": 7.443, "step": 375 }, { "epoch": 0.0985093972780298, "grad_norm": 7.3592681884765625, "learning_rate": 9.959502176294383e-05, "loss": 6.9807, "step": 380 }, { "epoch": 0.09980557355800389, "grad_norm": 7.6831746101379395, "learning_rate": 9.956704419663034e-05, "loss": 7.565, "step": 385 }, { "epoch": 0.10110174983797797, "grad_norm": 7.569375991821289, "learning_rate": 9.953813642997023e-05, "loss": 7.2685, "step": 390 }, { "epoch": 0.10239792611795204, "grad_norm": 7.802665710449219, "learning_rate": 9.950829900546135e-05, "loss": 7.021, "step": 395 }, { "epoch": 0.10369410239792612, "grad_norm": 7.149716854095459, "learning_rate": 9.947753248304798e-05, "loss": 7.0914, "step": 400 }, { "epoch": 0.1049902786779002, "grad_norm": 6.885632514953613, "learning_rate": 9.944583744011035e-05, "loss": 6.9159, "step": 405 }, { "epoch": 0.10628645495787427, "grad_norm": 6.948114395141602, "learning_rate": 9.941321447145369e-05, "loss": 7.0835, "step": 410 }, { "epoch": 0.10758263123784835, "grad_norm": 7.526493549346924, "learning_rate": 9.937966418929726e-05, "loss": 7.1776, "step": 415 }, { "epoch": 0.10887880751782242, "grad_norm": 7.591228008270264, "learning_rate": 9.934518722326268e-05, "loss": 7.553, "step": 420 }, { "epoch": 0.1101749837977965, "grad_norm": 7.224096775054932, "learning_rate": 9.930978422036224e-05, "loss": 7.3159, "step": 425 }, { "epoch": 0.11147116007777058, "grad_norm": 18.597820281982422, "learning_rate": 9.927345584498666e-05, "loss": 7.066, "step": 430 }, { "epoch": 0.11276733635774465, "grad_norm": 7.270762920379639, "learning_rate": 9.923620277889271e-05, "loss": 7.2213, "step": 435 }, { "epoch": 0.11406351263771873, "grad_norm": 7.843218803405762, "learning_rate": 9.91980257211904e-05, "loss": 7.4688, "step": 440 }, { "epoch": 0.11535968891769281, "grad_norm": 8.111706733703613, "learning_rate": 9.915892538832975e-05, "loss": 7.7934, "step": 445 }, { "epoch": 0.11665586519766688, "grad_norm": 8.022403717041016, "learning_rate": 9.911890251408751e-05, "loss": 7.115, "step": 450 }, { "epoch": 0.11795204147764096, "grad_norm": 7.161884307861328, "learning_rate": 9.907795784955327e-05, "loss": 6.9718, "step": 455 }, { "epoch": 0.11924821775761503, "grad_norm": 6.838676452636719, "learning_rate": 9.903609216311543e-05, "loss": 6.8441, "step": 460 }, { "epoch": 0.12054439403758911, "grad_norm": 6.686822414398193, "learning_rate": 9.899330624044672e-05, "loss": 7.088, "step": 465 }, { "epoch": 0.1218405703175632, "grad_norm": 8.322293281555176, "learning_rate": 9.894960088448952e-05, "loss": 6.8471, "step": 470 }, { "epoch": 0.12313674659753726, "grad_norm": 7.765487194061279, "learning_rate": 9.890497691544078e-05, "loss": 7.4387, "step": 475 }, { "epoch": 0.12443292287751134, "grad_norm": 7.90057897567749, "learning_rate": 9.885943517073656e-05, "loss": 7.4196, "step": 480 }, { "epoch": 0.12572909915748542, "grad_norm": 7.194269180297852, "learning_rate": 9.881297650503641e-05, "loss": 7.6986, "step": 485 }, { "epoch": 0.1270252754374595, "grad_norm": 7.885738372802734, "learning_rate": 9.876560179020723e-05, "loss": 6.7973, "step": 490 }, { "epoch": 0.12832145171743356, "grad_norm": 6.831270694732666, "learning_rate": 9.871731191530703e-05, "loss": 6.9532, "step": 495 }, { "epoch": 0.12961762799740764, "grad_norm": 7.411842346191406, "learning_rate": 9.866810778656815e-05, "loss": 7.2362, "step": 500 }, { "epoch": 0.13091380427738172, "grad_norm": 6.42346715927124, "learning_rate": 9.861799032738026e-05, "loss": 7.2695, "step": 505 }, { "epoch": 0.1322099805573558, "grad_norm": 7.161039352416992, "learning_rate": 9.856696047827309e-05, "loss": 7.5941, "step": 510 }, { "epoch": 0.1335061568373299, "grad_norm": 7.035979270935059, "learning_rate": 9.851501919689872e-05, "loss": 8.0644, "step": 515 }, { "epoch": 0.13480233311730394, "grad_norm": 6.812289237976074, "learning_rate": 9.846216745801365e-05, "loss": 7.1331, "step": 520 }, { "epoch": 0.13609850939727802, "grad_norm": 8.006174087524414, "learning_rate": 9.840840625346046e-05, "loss": 7.3211, "step": 525 }, { "epoch": 0.1373946856772521, "grad_norm": 7.678815841674805, "learning_rate": 9.835373659214925e-05, "loss": 7.4027, "step": 530 }, { "epoch": 0.13869086195722619, "grad_norm": 6.411709785461426, "learning_rate": 9.829815950003869e-05, "loss": 6.821, "step": 535 }, { "epoch": 0.13998703823720027, "grad_norm": 7.886822700500488, "learning_rate": 9.824167602011671e-05, "loss": 7.9045, "step": 540 }, { "epoch": 0.14128321451717435, "grad_norm": 12.82714557647705, "learning_rate": 9.818428721238101e-05, "loss": 7.116, "step": 545 }, { "epoch": 0.1425793907971484, "grad_norm": 8.018953323364258, "learning_rate": 9.812599415381916e-05, "loss": 7.509, "step": 550 }, { "epoch": 0.14387556707712248, "grad_norm": 7.40985631942749, "learning_rate": 9.806679793838829e-05, "loss": 7.2557, "step": 555 }, { "epoch": 0.14517174335709657, "grad_norm": 9.974076271057129, "learning_rate": 9.800669967699467e-05, "loss": 7.6975, "step": 560 }, { "epoch": 0.14646791963707065, "grad_norm": 7.4648213386535645, "learning_rate": 9.794570049747285e-05, "loss": 7.0384, "step": 565 }, { "epoch": 0.14776409591704473, "grad_norm": 6.717657566070557, "learning_rate": 9.788380154456443e-05, "loss": 7.2182, "step": 570 }, { "epoch": 0.14906027219701878, "grad_norm": 7.273350238800049, "learning_rate": 9.78210039798966e-05, "loss": 7.0696, "step": 575 }, { "epoch": 0.15035644847699287, "grad_norm": 7.2245073318481445, "learning_rate": 9.775730898196038e-05, "loss": 7.1045, "step": 580 }, { "epoch": 0.15165262475696695, "grad_norm": 7.395965576171875, "learning_rate": 9.769271774608853e-05, "loss": 7.6671, "step": 585 }, { "epoch": 0.15294880103694103, "grad_norm": 6.770759105682373, "learning_rate": 9.762723148443296e-05, "loss": 7.1223, "step": 590 }, { "epoch": 0.1542449773169151, "grad_norm": 6.783857822418213, "learning_rate": 9.756085142594215e-05, "loss": 7.3923, "step": 595 }, { "epoch": 0.15554115359688916, "grad_norm": 7.1208672523498535, "learning_rate": 9.749357881633805e-05, "loss": 7.4655, "step": 600 }, { "epoch": 0.15554115359688916, "eval_loss": 1.7980588674545288, "eval_runtime": 352.3146, "eval_samples_per_second": 9.628, "eval_steps_per_second": 1.203, "step": 600 }, { "epoch": 0.15683732987686325, "grad_norm": 7.201072692871094, "learning_rate": 9.742541491809261e-05, "loss": 7.1481, "step": 605 }, { "epoch": 0.15813350615683733, "grad_norm": 7.581305503845215, "learning_rate": 9.735636101040422e-05, "loss": 7.2911, "step": 610 }, { "epoch": 0.1594296824368114, "grad_norm": 7.294591426849365, "learning_rate": 9.72864183891736e-05, "loss": 7.0068, "step": 615 }, { "epoch": 0.1607258587167855, "grad_norm": 6.293265342712402, "learning_rate": 9.721558836697952e-05, "loss": 7.0355, "step": 620 }, { "epoch": 0.16202203499675957, "grad_norm": 7.049729824066162, "learning_rate": 9.714387227305422e-05, "loss": 7.3987, "step": 625 }, { "epoch": 0.16331821127673363, "grad_norm": 6.990370750427246, "learning_rate": 9.707127145325833e-05, "loss": 7.2358, "step": 630 }, { "epoch": 0.1646143875567077, "grad_norm": 6.324002742767334, "learning_rate": 9.699778727005575e-05, "loss": 6.7834, "step": 635 }, { "epoch": 0.1659105638366818, "grad_norm": 6.683818817138672, "learning_rate": 9.692342110248802e-05, "loss": 7.0681, "step": 640 }, { "epoch": 0.16720674011665587, "grad_norm": 7.697535991668701, "learning_rate": 9.684817434614844e-05, "loss": 7.1317, "step": 645 }, { "epoch": 0.16850291639662995, "grad_norm": 7.235431671142578, "learning_rate": 9.67720484131559e-05, "loss": 6.982, "step": 650 }, { "epoch": 0.169799092676604, "grad_norm": 6.669722080230713, "learning_rate": 9.669504473212834e-05, "loss": 7.7691, "step": 655 }, { "epoch": 0.1710952689565781, "grad_norm": 6.637908935546875, "learning_rate": 9.661716474815597e-05, "loss": 6.7372, "step": 660 }, { "epoch": 0.17239144523655217, "grad_norm": 7.085820198059082, "learning_rate": 9.653840992277417e-05, "loss": 7.1073, "step": 665 }, { "epoch": 0.17368762151652625, "grad_norm": 6.983147621154785, "learning_rate": 9.645878173393601e-05, "loss": 7.392, "step": 670 }, { "epoch": 0.17498379779650033, "grad_norm": 7.165342807769775, "learning_rate": 9.637828167598457e-05, "loss": 7.7153, "step": 675 }, { "epoch": 0.1762799740764744, "grad_norm": 6.5106329917907715, "learning_rate": 9.629691125962487e-05, "loss": 7.461, "step": 680 }, { "epoch": 0.17757615035644847, "grad_norm": 7.141298770904541, "learning_rate": 9.62146720118955e-05, "loss": 7.7309, "step": 685 }, { "epoch": 0.17887232663642255, "grad_norm": 6.7177300453186035, "learning_rate": 9.613156547613994e-05, "loss": 7.3416, "step": 690 }, { "epoch": 0.18016850291639663, "grad_norm": 6.568918704986572, "learning_rate": 9.604759321197773e-05, "loss": 6.9979, "step": 695 }, { "epoch": 0.18146467919637072, "grad_norm": 6.893847942352295, "learning_rate": 9.596275679527506e-05, "loss": 7.1531, "step": 700 }, { "epoch": 0.18276085547634477, "grad_norm": 6.103288173675537, "learning_rate": 9.587705781811524e-05, "loss": 7.3786, "step": 705 }, { "epoch": 0.18405703175631885, "grad_norm": 6.956258296966553, "learning_rate": 9.579049788876883e-05, "loss": 6.9257, "step": 710 }, { "epoch": 0.18535320803629293, "grad_norm": 6.511674404144287, "learning_rate": 9.570307863166347e-05, "loss": 7.3252, "step": 715 }, { "epoch": 0.18664938431626701, "grad_norm": 6.869830131530762, "learning_rate": 9.561480168735337e-05, "loss": 7.5487, "step": 720 }, { "epoch": 0.1879455605962411, "grad_norm": 6.807217597961426, "learning_rate": 9.552566871248854e-05, "loss": 7.5518, "step": 725 }, { "epoch": 0.18924173687621518, "grad_norm": 8.128490447998047, "learning_rate": 9.543568137978372e-05, "loss": 7.4233, "step": 730 }, { "epoch": 0.19053791315618923, "grad_norm": 6.761119842529297, "learning_rate": 9.53448413779869e-05, "loss": 7.4946, "step": 735 }, { "epoch": 0.1918340894361633, "grad_norm": 7.9971537590026855, "learning_rate": 9.525315041184772e-05, "loss": 7.1483, "step": 740 }, { "epoch": 0.1931302657161374, "grad_norm": 6.637436389923096, "learning_rate": 9.516061020208549e-05, "loss": 6.878, "step": 745 }, { "epoch": 0.19442644199611148, "grad_norm": 7.1380534172058105, "learning_rate": 9.506722248535683e-05, "loss": 6.9908, "step": 750 }, { "epoch": 0.19572261827608556, "grad_norm": 7.076066017150879, "learning_rate": 9.497298901422307e-05, "loss": 7.3078, "step": 755 }, { "epoch": 0.1970187945560596, "grad_norm": 7.04055643081665, "learning_rate": 9.487791155711745e-05, "loss": 6.9018, "step": 760 }, { "epoch": 0.1983149708360337, "grad_norm": 6.835010051727295, "learning_rate": 9.478199189831183e-05, "loss": 7.5114, "step": 765 }, { "epoch": 0.19961114711600778, "grad_norm": 6.450118541717529, "learning_rate": 9.468523183788333e-05, "loss": 7.399, "step": 770 }, { "epoch": 0.20090732339598186, "grad_norm": 6.935841083526611, "learning_rate": 9.45876331916804e-05, "loss": 7.8322, "step": 775 }, { "epoch": 0.20220349967595594, "grad_norm": 6.710315227508545, "learning_rate": 9.448919779128884e-05, "loss": 6.8819, "step": 780 }, { "epoch": 0.20349967595593, "grad_norm": 6.616578102111816, "learning_rate": 9.438992748399742e-05, "loss": 7.2868, "step": 785 }, { "epoch": 0.20479585223590407, "grad_norm": 7.159726619720459, "learning_rate": 9.428982413276318e-05, "loss": 6.7942, "step": 790 }, { "epoch": 0.20609202851587816, "grad_norm": 22.04395294189453, "learning_rate": 9.41888896161765e-05, "loss": 7.1399, "step": 795 }, { "epoch": 0.20738820479585224, "grad_norm": 7.15266227722168, "learning_rate": 9.408712582842583e-05, "loss": 7.1105, "step": 800 }, { "epoch": 0.20868438107582632, "grad_norm": 6.6807332038879395, "learning_rate": 9.39845346792621e-05, "loss": 7.2239, "step": 805 }, { "epoch": 0.2099805573558004, "grad_norm": 6.612126350402832, "learning_rate": 9.3881118093963e-05, "loss": 7.113, "step": 810 }, { "epoch": 0.21127673363577446, "grad_norm": 6.797806739807129, "learning_rate": 9.377687801329674e-05, "loss": 7.5356, "step": 815 }, { "epoch": 0.21257290991574854, "grad_norm": 6.90980339050293, "learning_rate": 9.367181639348564e-05, "loss": 7.6236, "step": 820 }, { "epoch": 0.21386908619572262, "grad_norm": 7.254889965057373, "learning_rate": 9.356593520616948e-05, "loss": 7.7441, "step": 825 }, { "epoch": 0.2151652624756967, "grad_norm": 6.685713291168213, "learning_rate": 9.34592364383684e-05, "loss": 7.0598, "step": 830 }, { "epoch": 0.21646143875567078, "grad_norm": 6.613460540771484, "learning_rate": 9.335172209244575e-05, "loss": 7.7122, "step": 835 }, { "epoch": 0.21775761503564484, "grad_norm": 7.714509963989258, "learning_rate": 9.324339418607041e-05, "loss": 6.8361, "step": 840 }, { "epoch": 0.21905379131561892, "grad_norm": 6.777724266052246, "learning_rate": 9.31342547521789e-05, "loss": 7.1172, "step": 845 }, { "epoch": 0.220349967595593, "grad_norm": 6.394195079803467, "learning_rate": 9.302430583893731e-05, "loss": 7.1197, "step": 850 }, { "epoch": 0.22164614387556708, "grad_norm": 6.490446090698242, "learning_rate": 9.291354950970286e-05, "loss": 6.9573, "step": 855 }, { "epoch": 0.22294232015554116, "grad_norm": 6.3486223220825195, "learning_rate": 9.28019878429851e-05, "loss": 6.9237, "step": 860 }, { "epoch": 0.22423849643551522, "grad_norm": 6.645473480224609, "learning_rate": 9.268962293240701e-05, "loss": 7.1893, "step": 865 }, { "epoch": 0.2255346727154893, "grad_norm": 6.67482852935791, "learning_rate": 9.257645688666556e-05, "loss": 6.9246, "step": 870 }, { "epoch": 0.22683084899546338, "grad_norm": 6.759736061096191, "learning_rate": 9.246249182949233e-05, "loss": 7.2272, "step": 875 }, { "epoch": 0.22812702527543746, "grad_norm": 6.765931606292725, "learning_rate": 9.234772989961352e-05, "loss": 6.9023, "step": 880 }, { "epoch": 0.22942320155541154, "grad_norm": 6.720470428466797, "learning_rate": 9.22321732507098e-05, "loss": 7.1597, "step": 885 }, { "epoch": 0.23071937783538563, "grad_norm": 7.019164085388184, "learning_rate": 9.211582405137603e-05, "loss": 6.9342, "step": 890 }, { "epoch": 0.23201555411535968, "grad_norm": 8.791522026062012, "learning_rate": 9.199868448508037e-05, "loss": 7.2624, "step": 895 }, { "epoch": 0.23331173039533376, "grad_norm": 6.641139030456543, "learning_rate": 9.188075675012351e-05, "loss": 6.7989, "step": 900 }, { "epoch": 0.23331173039533376, "eval_loss": 1.7671825885772705, "eval_runtime": 352.6662, "eval_samples_per_second": 9.618, "eval_steps_per_second": 1.202, "step": 900 }, { "epoch": 0.23460790667530784, "grad_norm": 7.722927570343018, "learning_rate": 9.176204305959726e-05, "loss": 7.2626, "step": 905 }, { "epoch": 0.23590408295528192, "grad_norm": 6.987146377563477, "learning_rate": 9.164254564134305e-05, "loss": 7.5034, "step": 910 }, { "epoch": 0.237200259235256, "grad_norm": 6.865291118621826, "learning_rate": 9.15222667379102e-05, "loss": 7.3419, "step": 915 }, { "epoch": 0.23849643551523006, "grad_norm": 8.063140869140625, "learning_rate": 9.140120860651374e-05, "loss": 7.3256, "step": 920 }, { "epoch": 0.23979261179520414, "grad_norm": 6.718273162841797, "learning_rate": 9.127937351899211e-05, "loss": 7.0404, "step": 925 }, { "epoch": 0.24108878807517822, "grad_norm": 18.202068328857422, "learning_rate": 9.115676376176448e-05, "loss": 7.1696, "step": 930 }, { "epoch": 0.2423849643551523, "grad_norm": 7.2906599044799805, "learning_rate": 9.103338163578787e-05, "loss": 7.8567, "step": 935 }, { "epoch": 0.2436811406351264, "grad_norm": 6.45253849029541, "learning_rate": 9.090922945651399e-05, "loss": 6.741, "step": 940 }, { "epoch": 0.24497731691510044, "grad_norm": 6.1617841720581055, "learning_rate": 9.078430955384572e-05, "loss": 7.2458, "step": 945 }, { "epoch": 0.24627349319507452, "grad_norm": 7.127076625823975, "learning_rate": 9.065862427209349e-05, "loss": 7.1697, "step": 950 }, { "epoch": 0.2475696694750486, "grad_norm": 6.861258506774902, "learning_rate": 9.053217596993114e-05, "loss": 6.8588, "step": 955 }, { "epoch": 0.24886584575502269, "grad_norm": 8.234814643859863, "learning_rate": 9.040496702035181e-05, "loss": 7.6273, "step": 960 }, { "epoch": 0.25016202203499677, "grad_norm": 7.15592622756958, "learning_rate": 9.027699981062332e-05, "loss": 7.1422, "step": 965 }, { "epoch": 0.25145819831497085, "grad_norm": 7.064651966094971, "learning_rate": 9.014827674224333e-05, "loss": 6.9701, "step": 970 }, { "epoch": 0.25275437459494493, "grad_norm": 7.261960506439209, "learning_rate": 9.001880023089441e-05, "loss": 6.7714, "step": 975 }, { "epoch": 0.254050550874919, "grad_norm": 6.393186569213867, "learning_rate": 8.988857270639857e-05, "loss": 7.1064, "step": 980 }, { "epoch": 0.25534672715489304, "grad_norm": 6.456246852874756, "learning_rate": 8.975759661267173e-05, "loss": 7.2823, "step": 985 }, { "epoch": 0.2566429034348671, "grad_norm": 7.068678379058838, "learning_rate": 8.962587440767787e-05, "loss": 6.9554, "step": 990 }, { "epoch": 0.2579390797148412, "grad_norm": 6.7908034324646, "learning_rate": 8.94934085633828e-05, "loss": 7.1949, "step": 995 }, { "epoch": 0.2592352559948153, "grad_norm": 7.025973320007324, "learning_rate": 8.93602015657079e-05, "loss": 7.2298, "step": 1000 }, { "epoch": 0.26053143227478937, "grad_norm": 6.781772136688232, "learning_rate": 8.922625591448341e-05, "loss": 7.0498, "step": 1005 }, { "epoch": 0.26182760855476345, "grad_norm": 7.880243301391602, "learning_rate": 8.90915741234015e-05, "loss": 6.8956, "step": 1010 }, { "epoch": 0.26312378483473753, "grad_norm": 6.7520294189453125, "learning_rate": 8.895615871996911e-05, "loss": 7.1497, "step": 1015 }, { "epoch": 0.2644199611147116, "grad_norm": 7.201786518096924, "learning_rate": 8.882001224546057e-05, "loss": 6.754, "step": 1020 }, { "epoch": 0.2657161373946857, "grad_norm": 7.413877964019775, "learning_rate": 8.868313725486979e-05, "loss": 6.6366, "step": 1025 }, { "epoch": 0.2670123136746598, "grad_norm": 6.395321846008301, "learning_rate": 8.854553631686241e-05, "loss": 6.9973, "step": 1030 }, { "epoch": 0.26830848995463386, "grad_norm": 7.424454689025879, "learning_rate": 8.84072120137276e-05, "loss": 7.5214, "step": 1035 }, { "epoch": 0.2696046662346079, "grad_norm": 6.671915531158447, "learning_rate": 8.826816694132955e-05, "loss": 7.4411, "step": 1040 }, { "epoch": 0.27090084251458196, "grad_norm": 6.754222393035889, "learning_rate": 8.812840370905873e-05, "loss": 7.0095, "step": 1045 }, { "epoch": 0.27219701879455604, "grad_norm": 6.683716773986816, "learning_rate": 8.798792493978305e-05, "loss": 7.5134, "step": 1050 }, { "epoch": 0.2734931950745301, "grad_norm": 7.748030662536621, "learning_rate": 8.784673326979844e-05, "loss": 7.4997, "step": 1055 }, { "epoch": 0.2747893713545042, "grad_norm": 6.669410705566406, "learning_rate": 8.77048313487796e-05, "loss": 7.5968, "step": 1060 }, { "epoch": 0.2760855476344783, "grad_norm": 6.345719814300537, "learning_rate": 8.756222183973008e-05, "loss": 6.9597, "step": 1065 }, { "epoch": 0.27738172391445237, "grad_norm": 7.13865852355957, "learning_rate": 8.741890741893244e-05, "loss": 6.4808, "step": 1070 }, { "epoch": 0.27867790019442645, "grad_norm": 6.253044128417969, "learning_rate": 8.727489077589793e-05, "loss": 6.7209, "step": 1075 }, { "epoch": 0.27997407647440054, "grad_norm": 6.536415100097656, "learning_rate": 8.713017461331608e-05, "loss": 7.5359, "step": 1080 }, { "epoch": 0.2812702527543746, "grad_norm": 6.480521202087402, "learning_rate": 8.698476164700395e-05, "loss": 7.0728, "step": 1085 }, { "epoch": 0.2825664290343487, "grad_norm": 6.623178958892822, "learning_rate": 8.683865460585518e-05, "loss": 7.1734, "step": 1090 }, { "epoch": 0.2838626053143227, "grad_norm": 6.348566055297852, "learning_rate": 8.669185623178879e-05, "loss": 7.0204, "step": 1095 }, { "epoch": 0.2851587815942968, "grad_norm": 23.549373626708984, "learning_rate": 8.654436927969767e-05, "loss": 7.0628, "step": 1100 }, { "epoch": 0.2864549578742709, "grad_norm": 8.011816024780273, "learning_rate": 8.639619651739694e-05, "loss": 6.8218, "step": 1105 }, { "epoch": 0.28775113415424497, "grad_norm": 18.145414352416992, "learning_rate": 8.624734072557199e-05, "loss": 7.5796, "step": 1110 }, { "epoch": 0.28904731043421905, "grad_norm": 6.338537216186523, "learning_rate": 8.609780469772623e-05, "loss": 6.9521, "step": 1115 }, { "epoch": 0.29034348671419313, "grad_norm": 6.549278736114502, "learning_rate": 8.59475912401288e-05, "loss": 7.099, "step": 1120 }, { "epoch": 0.2916396629941672, "grad_norm": 7.3748040199279785, "learning_rate": 8.579670317176179e-05, "loss": 7.4141, "step": 1125 }, { "epoch": 0.2929358392741413, "grad_norm": 6.072135925292969, "learning_rate": 8.564514332426741e-05, "loss": 7.214, "step": 1130 }, { "epoch": 0.2942320155541154, "grad_norm": 6.670959949493408, "learning_rate": 8.549291454189477e-05, "loss": 6.8303, "step": 1135 }, { "epoch": 0.29552819183408946, "grad_norm": 7.920526504516602, "learning_rate": 8.534001968144656e-05, "loss": 7.2804, "step": 1140 }, { "epoch": 0.2968243681140635, "grad_norm": 6.646933078765869, "learning_rate": 8.51864616122255e-05, "loss": 6.7573, "step": 1145 }, { "epoch": 0.29812054439403757, "grad_norm": 6.354043006896973, "learning_rate": 8.503224321598035e-05, "loss": 7.3581, "step": 1150 }, { "epoch": 0.29941672067401165, "grad_norm": 6.825526237487793, "learning_rate": 8.48773673868519e-05, "loss": 7.1982, "step": 1155 }, { "epoch": 0.30071289695398573, "grad_norm": 6.163865566253662, "learning_rate": 8.472183703131873e-05, "loss": 7.2421, "step": 1160 }, { "epoch": 0.3020090732339598, "grad_norm": 8.405832290649414, "learning_rate": 8.456565506814251e-05, "loss": 7.0333, "step": 1165 }, { "epoch": 0.3033052495139339, "grad_norm": 6.760126113891602, "learning_rate": 8.440882442831336e-05, "loss": 6.8702, "step": 1170 }, { "epoch": 0.304601425793908, "grad_norm": 6.234746932983398, "learning_rate": 8.42513480549948e-05, "loss": 7.1833, "step": 1175 }, { "epoch": 0.30589760207388206, "grad_norm": 6.794267177581787, "learning_rate": 8.409322890346847e-05, "loss": 7.2797, "step": 1180 }, { "epoch": 0.30719377835385614, "grad_norm": 6.828617095947266, "learning_rate": 8.393446994107877e-05, "loss": 7.1012, "step": 1185 }, { "epoch": 0.3084899546338302, "grad_norm": 6.547544002532959, "learning_rate": 8.377507414717706e-05, "loss": 6.6778, "step": 1190 }, { "epoch": 0.3097861309138043, "grad_norm": 6.884530067443848, "learning_rate": 8.361504451306585e-05, "loss": 7.0536, "step": 1195 }, { "epoch": 0.31108230719377833, "grad_norm": 6.597454071044922, "learning_rate": 8.345438404194259e-05, "loss": 6.9122, "step": 1200 }, { "epoch": 0.31108230719377833, "eval_loss": 1.737050175666809, "eval_runtime": 352.2912, "eval_samples_per_second": 9.628, "eval_steps_per_second": 1.204, "step": 1200 } ], "logging_steps": 5, "max_steps": 3858, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 300, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.760214742433792e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }