{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 6.0, "eval_steps": 500, "global_step": 2490, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.060350030175015085, "grad_norm": 0.20140838623046875, "learning_rate": 0.00013936363636363633, "loss": 1.527, "mean_token_accuracy": 0.69229452252388, "num_tokens": 204800.0, "step": 25 }, { "epoch": 0.12070006035003017, "grad_norm": 0.18473632633686066, "learning_rate": 0.0002845340909090909, "loss": 0.6554, "mean_token_accuracy": 0.8319960880279541, "num_tokens": 409600.0, "step": 50 }, { "epoch": 0.18105009052504525, "grad_norm": 0.17837993800640106, "learning_rate": 0.00042970454545454545, "loss": 0.4641, "mean_token_accuracy": 0.8707974565029144, "num_tokens": 614400.0, "step": 75 }, { "epoch": 0.24140012070006034, "grad_norm": 0.167810320854187, "learning_rate": 0.0005109807749762905, "loss": 0.3479, "mean_token_accuracy": 0.9018591004610061, "num_tokens": 819200.0, "step": 100 }, { "epoch": 0.30175015087507545, "grad_norm": 0.15574496984481812, "learning_rate": 0.00051079411077194, "loss": 0.2156, "mean_token_accuracy": 0.9376174163818359, "num_tokens": 1024000.0, "step": 125 }, { "epoch": 0.3621001810500905, "grad_norm": 0.1890965849161148, "learning_rate": 0.0005104090113588009, "loss": 0.19, "mean_token_accuracy": 0.9459148728847504, "num_tokens": 1228800.0, "step": 150 }, { "epoch": 0.4224502112251056, "grad_norm": 0.14420419931411743, "learning_rate": 0.0005098257760672504, "loss": 0.1464, "mean_token_accuracy": 0.9564285731315613, "num_tokens": 1433600.0, "step": 175 }, { "epoch": 0.4828002414001207, "grad_norm": 0.11044761538505554, "learning_rate": 0.0005090448582348783, "loss": 0.1126, "mean_token_accuracy": 0.9676614457368851, "num_tokens": 1638400.0, "step": 200 }, { "epoch": 0.5431502715751357, "grad_norm": 0.15006929636001587, "learning_rate": 0.0005080668648541163, "loss": 0.0852, "mean_token_accuracy": 0.9751027381420135, "num_tokens": 1843200.0, "step": 225 }, { "epoch": 0.6035003017501509, "grad_norm": 0.11620533466339111, "learning_rate": 0.0005068925561004347, "loss": 0.0823, "mean_token_accuracy": 0.9755919778347015, "num_tokens": 2048000.0, "step": 250 }, { "epoch": 0.663850331925166, "grad_norm": 0.1243673637509346, "learning_rate": 0.0005055228447414724, "loss": 0.0558, "mean_token_accuracy": 0.9836203479766845, "num_tokens": 2252800.0, "step": 275 }, { "epoch": 0.724200362100181, "grad_norm": 0.06930796056985855, "learning_rate": 0.0005039587954275602, "loss": 0.0461, "mean_token_accuracy": 0.9866242682933808, "num_tokens": 2457600.0, "step": 300 }, { "epoch": 0.7845503922751962, "grad_norm": 0.10527455806732178, "learning_rate": 0.0005022016238641887, "loss": 0.0405, "mean_token_accuracy": 0.9886105668544769, "num_tokens": 2662400.0, "step": 325 }, { "epoch": 0.8449004224502112, "grad_norm": 0.14308834075927734, "learning_rate": 0.0005002526958670635, "loss": 0.0518, "mean_token_accuracy": 0.9848287659883499, "num_tokens": 2867200.0, "step": 350 }, { "epoch": 0.9052504526252263, "grad_norm": 0.0677972286939621, "learning_rate": 0.000498113526300483, "loss": 0.0347, "mean_token_accuracy": 0.9900684940814972, "num_tokens": 3072000.0, "step": 375 }, { "epoch": 0.9656004828002414, "grad_norm": 0.104254812002182, "learning_rate": 0.0004957857778998638, "loss": 0.0346, "mean_token_accuracy": 0.9899070507287979, "num_tokens": 3276800.0, "step": 400 }, { "epoch": 1.0, "eval_loss": 0.03505053371191025, "eval_mean_token_accuracy": 0.989744543062674, "eval_num_tokens": 3392512.0, "eval_runtime": 34.2879, "eval_samples_per_second": 10.762, "eval_steps_per_second": 5.395, "step": 415 }, { "epoch": 1.024140012070006, "grad_norm": 0.12752845883369446, "learning_rate": 0.0004932712599793277, "loss": 0.0298, "mean_token_accuracy": 0.9917888967032286, "num_tokens": 3474432.0, "step": 425 }, { "epoch": 1.0844900422450212, "grad_norm": 0.07175713777542114, "learning_rate": 0.0004905719270253573, "loss": 0.0279, "mean_token_accuracy": 0.9915557825565338, "num_tokens": 3679232.0, "step": 450 }, { "epoch": 1.1448400724200363, "grad_norm": 0.07020337134599686, "learning_rate": 0.0004876898771776108, "loss": 0.0283, "mean_token_accuracy": 0.9918444293737412, "num_tokens": 3884032.0, "step": 475 }, { "epoch": 1.2051901025950513, "grad_norm": 0.11085273325443268, "learning_rate": 0.00048462735059807835, "loss": 0.0249, "mean_token_accuracy": 0.9930137068033218, "num_tokens": 4088832.0, "step": 500 }, { "epoch": 1.2655401327700664, "grad_norm": 0.039446864277124405, "learning_rate": 0.00048138672772984735, "loss": 0.0236, "mean_token_accuracy": 0.9932827889919281, "num_tokens": 4293632.0, "step": 525 }, { "epoch": 1.3258901629450814, "grad_norm": 0.09495913982391357, "learning_rate": 0.00047797052744682957, "loss": 0.0287, "mean_token_accuracy": 0.9913454109430313, "num_tokens": 4498432.0, "step": 550 }, { "epoch": 1.3862401931200965, "grad_norm": 0.07164479792118073, "learning_rate": 0.0004743814050958891, "loss": 0.0229, "mean_token_accuracy": 0.993644825220108, "num_tokens": 4703232.0, "step": 575 }, { "epoch": 1.4465902232951118, "grad_norm": 0.07847117632627487, "learning_rate": 0.00047062215043289175, "loss": 0.0203, "mean_token_accuracy": 0.9942955124378204, "num_tokens": 4908032.0, "step": 600 }, { "epoch": 1.5069402534701268, "grad_norm": 0.06564558297395706, "learning_rate": 0.00046669568545428187, "loss": 0.0219, "mean_token_accuracy": 0.9936595034599304, "num_tokens": 5112832.0, "step": 625 }, { "epoch": 1.567290283645142, "grad_norm": 0.07522366940975189, "learning_rate": 0.00046260506212587063, "loss": 0.0174, "mean_token_accuracy": 0.9950636166334152, "num_tokens": 5317632.0, "step": 650 }, { "epoch": 1.627640313820157, "grad_norm": 0.15518558025360107, "learning_rate": 0.00045835346001060117, "loss": 0.0211, "mean_token_accuracy": 0.9940068638324737, "num_tokens": 5522432.0, "step": 675 }, { "epoch": 1.687990343995172, "grad_norm": 0.043186988681554794, "learning_rate": 0.0004539441837971359, "loss": 0.0165, "mean_token_accuracy": 0.99529845058918, "num_tokens": 5727232.0, "step": 700 }, { "epoch": 1.748340374170187, "grad_norm": 0.05615503713488579, "learning_rate": 0.00044938066073118524, "loss": 0.0201, "mean_token_accuracy": 0.9943493288755417, "num_tokens": 5932032.0, "step": 725 }, { "epoch": 1.8086904043452021, "grad_norm": 0.05391787365078926, "learning_rate": 0.00044466643795157515, "loss": 0.0194, "mean_token_accuracy": 0.9942416971921921, "num_tokens": 6136832.0, "step": 750 }, { "epoch": 1.8690404345202172, "grad_norm": 0.028450896963477135, "learning_rate": 0.00043980517973312485, "loss": 0.0188, "mean_token_accuracy": 0.9946917951107025, "num_tokens": 6341632.0, "step": 775 }, { "epoch": 1.9293904646952322, "grad_norm": 0.04688919708132744, "learning_rate": 0.00043480066463847576, "loss": 0.0152, "mean_token_accuracy": 0.9956115609407425, "num_tokens": 6546432.0, "step": 800 }, { "epoch": 1.9897404948702473, "grad_norm": 0.040764886885881424, "learning_rate": 0.0004296567825810876, "loss": 0.0154, "mean_token_accuracy": 0.9957632231712341, "num_tokens": 6751232.0, "step": 825 }, { "epoch": 2.0, "eval_loss": 0.016818219795823097, "eval_mean_token_accuracy": 0.9951816922909504, "eval_num_tokens": 6785024.0, "eval_runtime": 34.2431, "eval_samples_per_second": 10.776, "eval_steps_per_second": 5.403, "step": 830 }, { "epoch": 2.048280024140012, "grad_norm": 0.04474746063351631, "learning_rate": 0.00042437753180168345, "loss": 0.0142, "mean_token_accuracy": 0.9958288927668149, "num_tokens": 6948864.0, "step": 850 }, { "epoch": 2.1086300543150274, "grad_norm": 0.04984824359416962, "learning_rate": 0.00041896701576049384, "loss": 0.0134, "mean_token_accuracy": 0.9960176265239715, "num_tokens": 7153664.0, "step": 875 }, { "epoch": 2.1689800844900424, "grad_norm": 0.03551790118217468, "learning_rate": 0.00041342943994771616, "loss": 0.0127, "mean_token_accuracy": 0.9962573528289795, "num_tokens": 7358464.0, "step": 900 }, { "epoch": 2.2293301146650575, "grad_norm": 0.019452068954706192, "learning_rate": 0.0004077691086146677, "loss": 0.013, "mean_token_accuracy": 0.9960812264680863, "num_tokens": 7563264.0, "step": 925 }, { "epoch": 2.2896801448400725, "grad_norm": 0.046463266015052795, "learning_rate": 0.0004019904214281739, "loss": 0.014, "mean_token_accuracy": 0.9956604844331741, "num_tokens": 7768064.0, "step": 950 }, { "epoch": 2.3500301750150876, "grad_norm": 0.0360218770802021, "learning_rate": 0.00039609787005079176, "loss": 0.012, "mean_token_accuracy": 0.9964090156555175, "num_tokens": 7972864.0, "step": 975 }, { "epoch": 2.4103802051901027, "grad_norm": 0.04305625334382057, "learning_rate": 0.0003900960346495268, "loss": 0.0121, "mean_token_accuracy": 0.9963551986217499, "num_tokens": 8177664.0, "step": 1000 }, { "epoch": 2.4707302353651177, "grad_norm": 0.02894781529903412, "learning_rate": 0.0003839895803357572, "loss": 0.0119, "mean_token_accuracy": 0.9963845533132553, "num_tokens": 8382464.0, "step": 1025 }, { "epoch": 2.5310802655401328, "grad_norm": 0.03192359209060669, "learning_rate": 0.0003777832535391326, "loss": 0.0131, "mean_token_accuracy": 0.996208428144455, "num_tokens": 8587264.0, "step": 1050 }, { "epoch": 2.591430295715148, "grad_norm": 0.031214363873004913, "learning_rate": 0.000371481878318265, "loss": 0.012, "mean_token_accuracy": 0.9964139074087143, "num_tokens": 8792064.0, "step": 1075 }, { "epoch": 2.651780325890163, "grad_norm": 0.023792628198862076, "learning_rate": 0.0003650903526110812, "loss": 0.0119, "mean_token_accuracy": 0.9963405227661133, "num_tokens": 8996864.0, "step": 1100 }, { "epoch": 2.712130356065178, "grad_norm": 0.019435923546552658, "learning_rate": 0.00035861364442774926, "loss": 0.0111, "mean_token_accuracy": 0.9965313243865966, "num_tokens": 9201664.0, "step": 1125 }, { "epoch": 2.772480386240193, "grad_norm": 0.011904253624379635, "learning_rate": 0.00035205678798914004, "loss": 0.0108, "mean_token_accuracy": 0.9966389560699462, "num_tokens": 9406464.0, "step": 1150 }, { "epoch": 2.832830416415208, "grad_norm": 0.022762220352888107, "learning_rate": 0.0003454248798138234, "loss": 0.0106, "mean_token_accuracy": 0.99662917137146, "num_tokens": 9611264.0, "step": 1175 }, { "epoch": 2.8931804465902236, "grad_norm": 0.02752029150724411, "learning_rate": 0.0003387230747566431, "loss": 0.0106, "mean_token_accuracy": 0.9966829872131348, "num_tokens": 9816064.0, "step": 1200 }, { "epoch": 2.9535304767652386, "grad_norm": 0.03439803048968315, "learning_rate": 0.0003319565820019463, "loss": 0.0108, "mean_token_accuracy": 0.9966242790222168, "num_tokens": 10020864.0, "step": 1225 }, { "epoch": 3.0, "eval_loss": 0.011013730429112911, "eval_mean_token_accuracy": 0.9966784915408573, "eval_num_tokens": 10177536.0, "eval_runtime": 34.2376, "eval_samples_per_second": 10.778, "eval_steps_per_second": 5.403, "step": 1245 }, { "epoch": 3.012070006035003, "grad_norm": 0.019530098885297775, "learning_rate": 0.00032513066101458505, "loss": 0.0106, "mean_token_accuracy": 0.996635879437948, "num_tokens": 10218496.0, "step": 1250 }, { "epoch": 3.0724200362100182, "grad_norm": 0.014008881524205208, "learning_rate": 0.0003182506174518353, "loss": 0.0106, "mean_token_accuracy": 0.9967661571502685, "num_tokens": 10423296.0, "step": 1275 }, { "epoch": 3.1327700663850333, "grad_norm": 0.023790989071130753, "learning_rate": 0.0003113217990394111, "loss": 0.0116, "mean_token_accuracy": 0.9965166473388671, "num_tokens": 10628096.0, "step": 1300 }, { "epoch": 3.1931200965600484, "grad_norm": 0.019647866487503052, "learning_rate": 0.0003043495914147805, "loss": 0.0104, "mean_token_accuracy": 0.9967123413085938, "num_tokens": 10832896.0, "step": 1325 }, { "epoch": 3.2534701267350634, "grad_norm": 0.031535252928733826, "learning_rate": 0.0002973394139410128, "loss": 0.0105, "mean_token_accuracy": 0.9966634178161621, "num_tokens": 11037696.0, "step": 1350 }, { "epoch": 3.3138201569100785, "grad_norm": 0.035145845264196396, "learning_rate": 0.00029029671549441144, "loss": 0.0103, "mean_token_accuracy": 0.9967025566101074, "num_tokens": 11242496.0, "step": 1375 }, { "epoch": 3.3741701870850935, "grad_norm": 0.011679012328386307, "learning_rate": 0.0002832269702292078, "loss": 0.01, "mean_token_accuracy": 0.9966927719116211, "num_tokens": 11447296.0, "step": 1400 }, { "epoch": 3.4345202172601086, "grad_norm": 0.009404915384948254, "learning_rate": 0.0002761356733226058, "loss": 0.01, "mean_token_accuracy": 0.9966976642608643, "num_tokens": 11652096.0, "step": 1425 }, { "epoch": 3.4948702474351236, "grad_norm": 0.01060599833726883, "learning_rate": 0.00026902833670348577, "loss": 0.0098, "mean_token_accuracy": 0.9968199729919434, "num_tokens": 11856896.0, "step": 1450 }, { "epoch": 3.5552202776101387, "grad_norm": 0.011005139909684658, "learning_rate": 0.00026191048476808935, "loss": 0.0101, "mean_token_accuracy": 0.9967221260070801, "num_tokens": 12061696.0, "step": 1475 }, { "epoch": 3.6155703077851538, "grad_norm": 0.00999362114816904, "learning_rate": 0.0002547876500860108, "loss": 0.0098, "mean_token_accuracy": 0.9968640041351319, "num_tokens": 12266496.0, "step": 1500 }, { "epoch": 3.675920337960169, "grad_norm": 0.014084997586905956, "learning_rate": 0.0002476653690998382, "loss": 0.0098, "mean_token_accuracy": 0.9968199729919434, "num_tokens": 12471296.0, "step": 1525 }, { "epoch": 3.736270368135184, "grad_norm": 0.010335746221244335, "learning_rate": 0.00024054917782178188, "loss": 0.0099, "mean_token_accuracy": 0.9967123413085938, "num_tokens": 12676096.0, "step": 1550 }, { "epoch": 3.796620398310199, "grad_norm": 0.009000259451568127, "learning_rate": 0.0002334446075306396, "loss": 0.0097, "mean_token_accuracy": 0.9969129276275634, "num_tokens": 12880896.0, "step": 1575 }, { "epoch": 3.856970428485214, "grad_norm": 0.009936418384313583, "learning_rate": 0.00022635718047243897, "loss": 0.0098, "mean_token_accuracy": 0.9967465877532959, "num_tokens": 13085696.0, "step": 1600 }, { "epoch": 3.9173204586602295, "grad_norm": 0.009142445400357246, "learning_rate": 0.00021929240556810162, "loss": 0.0094, "mean_token_accuracy": 0.9967563724517823, "num_tokens": 13290496.0, "step": 1625 }, { "epoch": 3.9776704888352445, "grad_norm": 0.011989991180598736, "learning_rate": 0.00021225577413146564, "loss": 0.0096, "mean_token_accuracy": 0.9968640041351319, "num_tokens": 13495296.0, "step": 1650 }, { "epoch": 4.0, "eval_loss": 0.009996045380830765, "eval_mean_token_accuracy": 0.9968979861285235, "eval_num_tokens": 13570048.0, "eval_runtime": 34.2398, "eval_samples_per_second": 10.777, "eval_steps_per_second": 5.403, "step": 1660 }, { "epoch": 4.036210018105009, "grad_norm": 0.012058747932314873, "learning_rate": 0.0002052527556009917, "loss": 0.0094, "mean_token_accuracy": 0.9968678877525723, "num_tokens": 13692928.0, "step": 1675 }, { "epoch": 4.096560048280024, "grad_norm": 0.009371360763907433, "learning_rate": 0.00019828879328847448, "loss": 0.0093, "mean_token_accuracy": 0.9968884658813476, "num_tokens": 13897728.0, "step": 1700 }, { "epoch": 4.15691007845504, "grad_norm": 0.008277242071926594, "learning_rate": 0.00019136930014806037, "loss": 0.0092, "mean_token_accuracy": 0.9968346500396729, "num_tokens": 14102528.0, "step": 1725 }, { "epoch": 4.217260108630055, "grad_norm": 0.008042911067605019, "learning_rate": 0.00018449965456886262, "loss": 0.009, "mean_token_accuracy": 0.9969080352783203, "num_tokens": 14307328.0, "step": 1750 }, { "epoch": 4.27761013880507, "grad_norm": 0.010238353163003922, "learning_rate": 0.0001776851961944424, "loss": 0.0092, "mean_token_accuracy": 0.9969129276275634, "num_tokens": 14512128.0, "step": 1775 }, { "epoch": 4.337960168980085, "grad_norm": 0.009031562134623528, "learning_rate": 0.00017093122177240684, "loss": 0.0093, "mean_token_accuracy": 0.9969667434692383, "num_tokens": 14716928.0, "step": 1800 }, { "epoch": 4.3983101991551, "grad_norm": 0.0108866598457098, "learning_rate": 0.00016424298103734827, "loss": 0.0091, "mean_token_accuracy": 0.9969080352783203, "num_tokens": 14921728.0, "step": 1825 }, { "epoch": 4.458660229330115, "grad_norm": 0.00900843646377325, "learning_rate": 0.0001576256726303272, "loss": 0.0093, "mean_token_accuracy": 0.9969324970245361, "num_tokens": 15126528.0, "step": 1850 }, { "epoch": 4.51901025950513, "grad_norm": 0.008820630609989166, "learning_rate": 0.00015108444005806725, "loss": 0.0093, "mean_token_accuracy": 0.9968835735321044, "num_tokens": 15331328.0, "step": 1875 }, { "epoch": 4.579360289680145, "grad_norm": 0.009221971966326237, "learning_rate": 0.00014462436769500773, "loss": 0.0091, "mean_token_accuracy": 0.9968933582305908, "num_tokens": 15536128.0, "step": 1900 }, { "epoch": 4.63971031985516, "grad_norm": 0.010153044946491718, "learning_rate": 0.0001382504768313155, "loss": 0.0092, "mean_token_accuracy": 0.9969031429290771, "num_tokens": 15740928.0, "step": 1925 }, { "epoch": 4.700060350030175, "grad_norm": 0.009132993407547474, "learning_rate": 0.00013196772176993376, "loss": 0.0093, "mean_token_accuracy": 0.997020559310913, "num_tokens": 15945728.0, "step": 1950 }, { "epoch": 4.76041038020519, "grad_norm": 0.009844532236456871, "learning_rate": 0.00012578098597569682, "loss": 0.0093, "mean_token_accuracy": 0.9968884658813476, "num_tokens": 16150528.0, "step": 1975 }, { "epoch": 4.820760410380205, "grad_norm": 0.010243461467325687, "learning_rate": 0.00011969507827950694, "loss": 0.0094, "mean_token_accuracy": 0.9969080352783203, "num_tokens": 16355328.0, "step": 2000 }, { "epoch": 4.88111044055522, "grad_norm": 0.010576976463198662, "learning_rate": 0.00011371472914052234, "loss": 0.0091, "mean_token_accuracy": 0.9969520664215088, "num_tokens": 16560128.0, "step": 2025 }, { "epoch": 4.941460470730235, "grad_norm": 0.008221893571317196, "learning_rate": 0.00010784458696926253, "loss": 0.0092, "mean_token_accuracy": 0.9968737888336182, "num_tokens": 16764928.0, "step": 2050 }, { "epoch": 5.0, "grad_norm": 0.02519950643181801, "learning_rate": 0.0001020892145144872, "loss": 0.009, "mean_token_accuracy": 0.996883018729613, "num_tokens": 16962560.0, "step": 2075 }, { "epoch": 5.0, "eval_loss": 0.009800967760384083, "eval_mean_token_accuracy": 0.9968027836567647, "eval_num_tokens": 16962560.0, "eval_runtime": 34.241, "eval_samples_per_second": 10.777, "eval_steps_per_second": 5.403, "step": 2075 }, { "epoch": 5.060350030175015, "grad_norm": 0.009994215331971645, "learning_rate": 9.645308531666052e-05, "loss": 0.0089, "mean_token_accuracy": 0.9969227123260498, "num_tokens": 17167360.0, "step": 2100 }, { "epoch": 5.12070006035003, "grad_norm": 0.00914147961884737, "learning_rate": 9.094058023075292e-05, "loss": 0.0087, "mean_token_accuracy": 0.9971526527404785, "num_tokens": 17372160.0, "step": 2125 }, { "epoch": 5.181050090525045, "grad_norm": 0.008889489807188511, "learning_rate": 8.555598402108725e-05, "loss": 0.009, "mean_token_accuracy": 0.9970107746124267, "num_tokens": 17576960.0, "step": 2150 }, { "epoch": 5.24140012070006, "grad_norm": 0.009806985966861248, "learning_rate": 8.030348203087353e-05, "loss": 0.009, "mean_token_accuracy": 0.9970596981048584, "num_tokens": 17781760.0, "step": 2175 }, { "epoch": 5.301750150875075, "grad_norm": 0.009788556955754757, "learning_rate": 7.518715692902254e-05, "loss": 0.009, "mean_token_accuracy": 0.9970352363586426, "num_tokens": 17986560.0, "step": 2200 }, { "epoch": 5.36210018105009, "grad_norm": 0.010473587550222874, "learning_rate": 7.021098553676493e-05, "loss": 0.0089, "mean_token_accuracy": 0.996927604675293, "num_tokens": 18191360.0, "step": 2225 }, { "epoch": 5.422450211225105, "grad_norm": 0.011041617952287197, "learning_rate": 6.537883573654501e-05, "loss": 0.0089, "mean_token_accuracy": 0.9969618511199951, "num_tokens": 18396160.0, "step": 2250 }, { "epoch": 5.4828002414001205, "grad_norm": 0.008172512985765934, "learning_rate": 6.069446346558982e-05, "loss": 0.0088, "mean_token_accuracy": 0.9970352363586426, "num_tokens": 18600960.0, "step": 2275 }, { "epoch": 5.5431502715751355, "grad_norm": 0.009114045649766922, "learning_rate": 5.616150979649147e-05, "loss": 0.0089, "mean_token_accuracy": 0.9970548057556152, "num_tokens": 18805760.0, "step": 2300 }, { "epoch": 5.603500301750151, "grad_norm": 0.008835142478346825, "learning_rate": 5.178349810707104e-05, "loss": 0.0088, "mean_token_accuracy": 0.9969912052154541, "num_tokens": 19010560.0, "step": 2325 }, { "epoch": 5.663850331925166, "grad_norm": 0.008637483231723309, "learning_rate": 4.756383134172478e-05, "loss": 0.0088, "mean_token_accuracy": 0.9970939445495606, "num_tokens": 19215360.0, "step": 2350 }, { "epoch": 5.724200362100181, "grad_norm": 0.008629810996353626, "learning_rate": 4.3505789366381116e-05, "loss": 0.0088, "mean_token_accuracy": 0.9970254516601562, "num_tokens": 19420160.0, "step": 2375 }, { "epoch": 5.784550392275197, "grad_norm": 0.009468801319599152, "learning_rate": 3.9612526419123156e-05, "loss": 0.0088, "mean_token_accuracy": 0.9969618511199951, "num_tokens": 19624960.0, "step": 2400 }, { "epoch": 5.844900422450211, "grad_norm": 0.009678692556917667, "learning_rate": 3.5887068658460824e-05, "loss": 0.0088, "mean_token_accuracy": 0.9969667434692383, "num_tokens": 19829760.0, "step": 2425 }, { "epoch": 5.905250452625227, "grad_norm": 0.010459608398377895, "learning_rate": 3.2332311811155415e-05, "loss": 0.0088, "mean_token_accuracy": 0.9969618511199951, "num_tokens": 20034560.0, "step": 2450 }, { "epoch": 5.965600482800241, "grad_norm": 0.011055945418775082, "learning_rate": 2.8951018921426954e-05, "loss": 0.0088, "mean_token_accuracy": 0.9969667434692383, "num_tokens": 20239360.0, "step": 2475 }, { "epoch": 6.0, "eval_loss": 0.009622456505894661, "eval_mean_token_accuracy": 0.9968900525892103, "eval_num_tokens": 20355072.0, "eval_runtime": 34.2252, "eval_samples_per_second": 10.782, "eval_steps_per_second": 5.405, "step": 2490 } ], "logging_steps": 25, "max_steps": 2905, "num_input_tokens_seen": 0, "num_train_epochs": 7, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 8.832594777263309e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }