{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 822, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.012165450121654502, "grad_norm": 35.136810704281835, "learning_rate": 6.024096385542168e-08, "loss": 2.8872, "step": 5 }, { "epoch": 0.024330900243309004, "grad_norm": 37.11644257882846, "learning_rate": 1.2048192771084337e-07, "loss": 2.9308, "step": 10 }, { "epoch": 0.0364963503649635, "grad_norm": 35.50815851457726, "learning_rate": 1.8072289156626505e-07, "loss": 2.9404, "step": 15 }, { "epoch": 0.04866180048661801, "grad_norm": 33.82179290022993, "learning_rate": 2.4096385542168674e-07, "loss": 2.9574, "step": 20 }, { "epoch": 0.06082725060827251, "grad_norm": 29.345471207635402, "learning_rate": 3.0120481927710845e-07, "loss": 2.7601, "step": 25 }, { "epoch": 0.072992700729927, "grad_norm": 18.103899475986932, "learning_rate": 3.614457831325301e-07, "loss": 2.5879, "step": 30 }, { "epoch": 0.0851581508515815, "grad_norm": 17.30630038808378, "learning_rate": 4.216867469879518e-07, "loss": 2.5094, "step": 35 }, { "epoch": 0.09732360097323602, "grad_norm": 14.377152532537034, "learning_rate": 4.819277108433735e-07, "loss": 2.2496, "step": 40 }, { "epoch": 0.10948905109489052, "grad_norm": 12.691958118803983, "learning_rate": 5.421686746987951e-07, "loss": 2.0637, "step": 45 }, { "epoch": 0.12165450121654502, "grad_norm": 14.246104759649635, "learning_rate": 6.024096385542169e-07, "loss": 1.8083, "step": 50 }, { "epoch": 0.13381995133819952, "grad_norm": 10.324050556387458, "learning_rate": 6.626506024096386e-07, "loss": 1.3742, "step": 55 }, { "epoch": 0.145985401459854, "grad_norm": 7.984984851081965, "learning_rate": 7.228915662650602e-07, "loss": 1.0691, "step": 60 }, { "epoch": 0.15815085158150852, "grad_norm": 6.140514634003949, "learning_rate": 7.831325301204819e-07, "loss": 0.9222, "step": 65 }, { "epoch": 0.170316301703163, "grad_norm": 5.29747208490695, "learning_rate": 8.433734939759036e-07, "loss": 0.8104, "step": 70 }, { "epoch": 0.18248175182481752, "grad_norm": 4.016133507996615, "learning_rate": 9.036144578313253e-07, "loss": 0.8247, "step": 75 }, { "epoch": 0.19464720194647203, "grad_norm": 4.098253893348748, "learning_rate": 9.63855421686747e-07, "loss": 0.7495, "step": 80 }, { "epoch": 0.20681265206812652, "grad_norm": 4.091518467980595, "learning_rate": 9.999819279153408e-07, "loss": 0.6703, "step": 85 }, { "epoch": 0.21897810218978103, "grad_norm": 3.522646738471009, "learning_rate": 9.997786319658268e-07, "loss": 0.7445, "step": 90 }, { "epoch": 0.23114355231143552, "grad_norm": 3.837941967089161, "learning_rate": 9.993495421137991e-07, "loss": 0.6855, "step": 95 }, { "epoch": 0.24330900243309003, "grad_norm": 3.5850838668962104, "learning_rate": 9.986948522168299e-07, "loss": 0.6734, "step": 100 }, { "epoch": 0.25547445255474455, "grad_norm": 3.6056795668460855, "learning_rate": 9.97814858055846e-07, "loss": 0.6749, "step": 105 }, { "epoch": 0.26763990267639903, "grad_norm": 3.1954421939622586, "learning_rate": 9.967099572014976e-07, "loss": 0.6319, "step": 110 }, { "epoch": 0.2798053527980535, "grad_norm": 3.5304794337438654, "learning_rate": 9.953806488345415e-07, "loss": 0.6369, "step": 115 }, { "epoch": 0.291970802919708, "grad_norm": 3.607980331043868, "learning_rate": 9.938275335203175e-07, "loss": 0.7232, "step": 120 }, { "epoch": 0.30413625304136255, "grad_norm": 3.420469696607331, "learning_rate": 9.920513129374196e-07, "loss": 0.6547, "step": 125 }, { "epoch": 0.31630170316301703, "grad_norm": 3.6337951695338084, "learning_rate": 9.900527895606867e-07, "loss": 0.6567, "step": 130 }, { "epoch": 0.3284671532846715, "grad_norm": 3.9119710893490014, "learning_rate": 9.87832866298654e-07, "loss": 0.5806, "step": 135 }, { "epoch": 0.340632603406326, "grad_norm": 3.412844406157836, "learning_rate": 9.8539254608563e-07, "loss": 0.723, "step": 140 }, { "epoch": 0.35279805352798055, "grad_norm": 3.0959507468766247, "learning_rate": 9.827329314285824e-07, "loss": 0.6161, "step": 145 }, { "epoch": 0.36496350364963503, "grad_norm": 3.1119106656410396, "learning_rate": 9.798552239090402e-07, "loss": 0.7561, "step": 150 }, { "epoch": 0.3771289537712895, "grad_norm": 3.410091070862306, "learning_rate": 9.767607236402329e-07, "loss": 0.6255, "step": 155 }, { "epoch": 0.38929440389294406, "grad_norm": 3.4882813480618315, "learning_rate": 9.734508286797147e-07, "loss": 0.6027, "step": 160 }, { "epoch": 0.40145985401459855, "grad_norm": 3.0710632886841474, "learning_rate": 9.699270343977402e-07, "loss": 0.6269, "step": 165 }, { "epoch": 0.41362530413625304, "grad_norm": 3.1675830915600898, "learning_rate": 9.661909328016738e-07, "loss": 0.612, "step": 170 }, { "epoch": 0.4257907542579075, "grad_norm": 3.1668114264127185, "learning_rate": 9.622442118167395e-07, "loss": 0.6186, "step": 175 }, { "epoch": 0.43795620437956206, "grad_norm": 3.6127782638940222, "learning_rate": 9.580886545234385e-07, "loss": 0.6156, "step": 180 }, { "epoch": 0.45012165450121655, "grad_norm": 3.4173672748523134, "learning_rate": 9.537261383519736e-07, "loss": 0.7154, "step": 185 }, { "epoch": 0.46228710462287104, "grad_norm": 3.4873902399343626, "learning_rate": 9.49158634234049e-07, "loss": 0.6056, "step": 190 }, { "epoch": 0.4744525547445255, "grad_norm": 2.7169584259345667, "learning_rate": 9.443882057124292e-07, "loss": 0.57, "step": 195 }, { "epoch": 0.48661800486618007, "grad_norm": 3.511025737698735, "learning_rate": 9.394170080086536e-07, "loss": 0.6309, "step": 200 }, { "epoch": 0.49878345498783455, "grad_norm": 3.173467784674973, "learning_rate": 9.342472870493341e-07, "loss": 0.5871, "step": 205 }, { "epoch": 0.5109489051094891, "grad_norm": 2.988370332734116, "learning_rate": 9.288813784514732e-07, "loss": 0.6521, "step": 210 }, { "epoch": 0.5231143552311436, "grad_norm": 2.9332212832588582, "learning_rate": 9.233217064672606e-07, "loss": 0.5808, "step": 215 }, { "epoch": 0.5352798053527981, "grad_norm": 3.200346979680469, "learning_rate": 9.175707828888252e-07, "loss": 0.6191, "step": 220 }, { "epoch": 0.5474452554744526, "grad_norm": 2.5736941320250506, "learning_rate": 9.116312059134384e-07, "loss": 0.5634, "step": 225 }, { "epoch": 0.559610705596107, "grad_norm": 2.839715219548779, "learning_rate": 9.055056589696799e-07, "loss": 0.559, "step": 230 }, { "epoch": 0.5717761557177615, "grad_norm": 3.21935500339033, "learning_rate": 8.991969095050974e-07, "loss": 0.6039, "step": 235 }, { "epoch": 0.583941605839416, "grad_norm": 2.940998241726388, "learning_rate": 8.927078077359076e-07, "loss": 0.6824, "step": 240 }, { "epoch": 0.5961070559610706, "grad_norm": 3.1839756022296997, "learning_rate": 8.860412853593032e-07, "loss": 0.6896, "step": 245 }, { "epoch": 0.6082725060827251, "grad_norm": 3.308864865524755, "learning_rate": 8.792003542289477e-07, "loss": 0.5722, "step": 250 }, { "epoch": 0.6204379562043796, "grad_norm": 3.031859128373944, "learning_rate": 8.721881049942564e-07, "loss": 0.6617, "step": 255 }, { "epoch": 0.6326034063260341, "grad_norm": 2.7610589310808353, "learning_rate": 8.650077057040792e-07, "loss": 0.7348, "step": 260 }, { "epoch": 0.6447688564476886, "grad_norm": 3.2001529743714197, "learning_rate": 8.576624003754139e-07, "loss": 0.5885, "step": 265 }, { "epoch": 0.656934306569343, "grad_norm": 3.051911162592852, "learning_rate": 8.501555075277996e-07, "loss": 0.5982, "step": 270 }, { "epoch": 0.6690997566909975, "grad_norm": 3.5209648449131397, "learning_rate": 8.424904186840494e-07, "loss": 0.6854, "step": 275 }, { "epoch": 0.681265206812652, "grad_norm": 2.923839380139526, "learning_rate": 8.346705968380014e-07, "loss": 0.5715, "step": 280 }, { "epoch": 0.6934306569343066, "grad_norm": 3.000228221538061, "learning_rate": 8.266995748899819e-07, "loss": 0.6109, "step": 285 }, { "epoch": 0.7055961070559611, "grad_norm": 3.2170282244929007, "learning_rate": 8.185809540506816e-07, "loss": 0.6123, "step": 290 }, { "epoch": 0.7177615571776156, "grad_norm": 2.8928521749153937, "learning_rate": 8.103184022141744e-07, "loss": 0.6431, "step": 295 }, { "epoch": 0.7299270072992701, "grad_norm": 2.80504169955644, "learning_rate": 8.019156523008064e-07, "loss": 0.6185, "step": 300 }, { "epoch": 0.7420924574209246, "grad_norm": 3.0197651841442514, "learning_rate": 7.933765005707084e-07, "loss": 0.6198, "step": 305 }, { "epoch": 0.754257907542579, "grad_norm": 3.079524154161924, "learning_rate": 7.847048049086919e-07, "loss": 0.587, "step": 310 }, { "epoch": 0.7664233576642335, "grad_norm": 2.8303257564626603, "learning_rate": 7.759044830813035e-07, "loss": 0.5723, "step": 315 }, { "epoch": 0.7785888077858881, "grad_norm": 3.1558349614988694, "learning_rate": 7.669795109668249e-07, "loss": 0.6104, "step": 320 }, { "epoch": 0.7907542579075426, "grad_norm": 2.693364126549454, "learning_rate": 7.579339207590216e-07, "loss": 0.4889, "step": 325 }, { "epoch": 0.8029197080291971, "grad_norm": 2.644662771230438, "learning_rate": 7.487717991454439e-07, "loss": 0.5816, "step": 330 }, { "epoch": 0.8150851581508516, "grad_norm": 3.2235135777624415, "learning_rate": 7.394972854611141e-07, "loss": 0.592, "step": 335 }, { "epoch": 0.8272506082725061, "grad_norm": 2.918869528455333, "learning_rate": 7.301145698184233e-07, "loss": 0.5272, "step": 340 }, { "epoch": 0.8394160583941606, "grad_norm": 2.5627359830835306, "learning_rate": 7.206278912140906e-07, "loss": 0.5297, "step": 345 }, { "epoch": 0.851581508515815, "grad_norm": 2.7061548346220556, "learning_rate": 7.110415356140355e-07, "loss": 0.5031, "step": 350 }, { "epoch": 0.8637469586374696, "grad_norm": 3.274089996717074, "learning_rate": 7.013598340170311e-07, "loss": 0.5281, "step": 355 }, { "epoch": 0.8759124087591241, "grad_norm": 2.8139170443601356, "learning_rate": 6.915871604980113e-07, "loss": 0.6147, "step": 360 }, { "epoch": 0.8880778588807786, "grad_norm": 2.9215610496189335, "learning_rate": 6.817279302319169e-07, "loss": 0.5886, "step": 365 }, { "epoch": 0.9002433090024331, "grad_norm": 3.154431486065749, "learning_rate": 6.717865974989738e-07, "loss": 0.6734, "step": 370 }, { "epoch": 0.9124087591240876, "grad_norm": 2.9822680687846983, "learning_rate": 6.617676536723023e-07, "loss": 0.6426, "step": 375 }, { "epoch": 0.9245742092457421, "grad_norm": 3.158101801696937, "learning_rate": 6.51675625188771e-07, "loss": 0.6362, "step": 380 }, { "epoch": 0.9367396593673966, "grad_norm": 2.9948949271554577, "learning_rate": 6.415150715040065e-07, "loss": 0.6471, "step": 385 }, { "epoch": 0.948905109489051, "grad_norm": 2.8943314652531336, "learning_rate": 6.31290583032487e-07, "loss": 0.6074, "step": 390 }, { "epoch": 0.9610705596107056, "grad_norm": 2.4244041105109324, "learning_rate": 6.210067790736495e-07, "loss": 0.5467, "step": 395 }, { "epoch": 0.9732360097323601, "grad_norm": 3.3180380615601495, "learning_rate": 6.10668305724946e-07, "loss": 0.6026, "step": 400 }, { "epoch": 0.9854014598540146, "grad_norm": 2.7318174172636143, "learning_rate": 6.002798337827934e-07, "loss": 0.5002, "step": 405 }, { "epoch": 0.9975669099756691, "grad_norm": 2.9984040241309917, "learning_rate": 5.898460566323649e-07, "loss": 0.5698, "step": 410 }, { "epoch": 1.0, "eval_loss": 0.6054084897041321, "eval_runtime": 126.3397, "eval_samples_per_second": 1.069, "eval_steps_per_second": 0.135, "step": 411 }, { "epoch": 1.0097323600973236, "grad_norm": 2.742008958066821, "learning_rate": 5.793716881271742e-07, "loss": 0.5714, "step": 415 }, { "epoch": 1.0218978102189782, "grad_norm": 2.5207243812109406, "learning_rate": 5.688614604594164e-07, "loss": 0.5354, "step": 420 }, { "epoch": 1.0340632603406326, "grad_norm": 3.225694116361688, "learning_rate": 5.583201220220188e-07, "loss": 0.5201, "step": 425 }, { "epoch": 1.0462287104622872, "grad_norm": 2.7176526570864254, "learning_rate": 5.477524352633763e-07, "loss": 0.4815, "step": 430 }, { "epoch": 1.0583941605839415, "grad_norm": 3.10669907629376, "learning_rate": 5.371631745357343e-07, "loss": 0.4908, "step": 435 }, { "epoch": 1.0705596107055961, "grad_norm": 2.872517103973882, "learning_rate": 5.265571239381949e-07, "loss": 0.5275, "step": 440 }, { "epoch": 1.0827250608272505, "grad_norm": 2.6380545882641586, "learning_rate": 5.159390751553191e-07, "loss": 0.4484, "step": 445 }, { "epoch": 1.094890510948905, "grad_norm": 2.680997236943566, "learning_rate": 5.053138252923018e-07, "loss": 0.4832, "step": 450 }, { "epoch": 1.1070559610705597, "grad_norm": 2.74320860287435, "learning_rate": 4.946861747076983e-07, "loss": 0.5279, "step": 455 }, { "epoch": 1.119221411192214, "grad_norm": 2.9557900220447246, "learning_rate": 4.840609248446809e-07, "loss": 0.4687, "step": 460 }, { "epoch": 1.1313868613138687, "grad_norm": 2.483562093833652, "learning_rate": 4.73442876061805e-07, "loss": 0.4787, "step": 465 }, { "epoch": 1.143552311435523, "grad_norm": 3.101013440130931, "learning_rate": 4.628368254642656e-07, "loss": 0.5122, "step": 470 }, { "epoch": 1.1557177615571776, "grad_norm": 2.9506636729265194, "learning_rate": 4.522475647366236e-07, "loss": 0.5566, "step": 475 }, { "epoch": 1.167883211678832, "grad_norm": 2.8644230748110733, "learning_rate": 4.41679877977981e-07, "loss": 0.5287, "step": 480 }, { "epoch": 1.1800486618004866, "grad_norm": 2.762559136492079, "learning_rate": 4.3113853954058376e-07, "loss": 0.526, "step": 485 }, { "epoch": 1.1922141119221412, "grad_norm": 3.1131205915754, "learning_rate": 4.2062831187282583e-07, "loss": 0.4984, "step": 490 }, { "epoch": 1.2043795620437956, "grad_norm": 2.7730024747709217, "learning_rate": 4.101539433676353e-07, "loss": 0.4484, "step": 495 }, { "epoch": 1.2165450121654502, "grad_norm": 2.6668215483647044, "learning_rate": 3.997201662172065e-07, "loss": 0.494, "step": 500 }, { "epoch": 1.2287104622871046, "grad_norm": 2.7390572412081133, "learning_rate": 3.8933169427505395e-07, "loss": 0.5154, "step": 505 }, { "epoch": 1.2408759124087592, "grad_norm": 2.748963440690659, "learning_rate": 3.7899322092635056e-07, "loss": 0.4893, "step": 510 }, { "epoch": 1.2530413625304138, "grad_norm": 3.0754378199533274, "learning_rate": 3.6870941696751303e-07, "loss": 0.5126, "step": 515 }, { "epoch": 1.2652068126520681, "grad_norm": 2.926138455962271, "learning_rate": 3.5848492849599354e-07, "loss": 0.4934, "step": 520 }, { "epoch": 1.2773722627737225, "grad_norm": 2.726637741713793, "learning_rate": 3.4832437481122894e-07, "loss": 0.4648, "step": 525 }, { "epoch": 1.289537712895377, "grad_norm": 2.418458602462162, "learning_rate": 3.3823234632769764e-07, "loss": 0.5296, "step": 530 }, { "epoch": 1.3017031630170317, "grad_norm": 2.7352379445472073, "learning_rate": 3.2821340250102624e-07, "loss": 0.61, "step": 535 }, { "epoch": 1.313868613138686, "grad_norm": 3.217541189678243, "learning_rate": 3.1827206976808306e-07, "loss": 0.5492, "step": 540 }, { "epoch": 1.3260340632603407, "grad_norm": 2.864795877647711, "learning_rate": 3.084128395019887e-07, "loss": 0.4614, "step": 545 }, { "epoch": 1.338199513381995, "grad_norm": 2.6038058437141136, "learning_rate": 2.986401659829689e-07, "loss": 0.4135, "step": 550 }, { "epoch": 1.3503649635036497, "grad_norm": 2.8613012595114995, "learning_rate": 2.889584643859646e-07, "loss": 0.5568, "step": 555 }, { "epoch": 1.3625304136253042, "grad_norm": 2.616053505003722, "learning_rate": 2.793721087859094e-07, "loss": 0.4706, "step": 560 }, { "epoch": 1.3746958637469586, "grad_norm": 2.561835897607896, "learning_rate": 2.6988543018157664e-07, "loss": 0.501, "step": 565 }, { "epoch": 1.3868613138686132, "grad_norm": 2.745598821300744, "learning_rate": 2.6050271453888594e-07, "loss": 0.4494, "step": 570 }, { "epoch": 1.3990267639902676, "grad_norm": 2.970551131416165, "learning_rate": 2.51228200854556e-07, "loss": 0.5094, "step": 575 }, { "epoch": 1.4111922141119222, "grad_norm": 2.5928507359956945, "learning_rate": 2.4206607924097856e-07, "loss": 0.5446, "step": 580 }, { "epoch": 1.4233576642335766, "grad_norm": 2.691281584447722, "learning_rate": 2.3302048903317495e-07, "loss": 0.5546, "step": 585 }, { "epoch": 1.4355231143552312, "grad_norm": 2.2861873121535456, "learning_rate": 2.2409551691869645e-07, "loss": 0.4912, "step": 590 }, { "epoch": 1.4476885644768855, "grad_norm": 2.491856786980672, "learning_rate": 2.1529519509130794e-07, "loss": 0.512, "step": 595 }, { "epoch": 1.4598540145985401, "grad_norm": 2.7964995639618935, "learning_rate": 2.066234994292916e-07, "loss": 0.5381, "step": 600 }, { "epoch": 1.4720194647201947, "grad_norm": 2.4974473604446623, "learning_rate": 1.9808434769919357e-07, "loss": 0.501, "step": 605 }, { "epoch": 1.4841849148418491, "grad_norm": 3.1505249810609848, "learning_rate": 1.896815977858257e-07, "loss": 0.5276, "step": 610 }, { "epoch": 1.4963503649635037, "grad_norm": 2.250797335242495, "learning_rate": 1.8141904594931834e-07, "loss": 0.4827, "step": 615 }, { "epoch": 1.508515815085158, "grad_norm": 2.5168578342923666, "learning_rate": 1.733004251100182e-07, "loss": 0.5274, "step": 620 }, { "epoch": 1.5206812652068127, "grad_norm": 2.66648533015604, "learning_rate": 1.653294031619985e-07, "loss": 0.5588, "step": 625 }, { "epoch": 1.5328467153284673, "grad_norm": 2.5199189060372507, "learning_rate": 1.575095813159507e-07, "loss": 0.5205, "step": 630 }, { "epoch": 1.5450121654501217, "grad_norm": 2.6092732892738613, "learning_rate": 1.4984449247220045e-07, "loss": 0.5239, "step": 635 }, { "epoch": 1.557177615571776, "grad_norm": 3.020339709734662, "learning_rate": 1.4233759962458602e-07, "loss": 0.4658, "step": 640 }, { "epoch": 1.5693430656934306, "grad_norm": 2.8137695267450096, "learning_rate": 1.3499229429592086e-07, "loss": 0.4572, "step": 645 }, { "epoch": 1.5815085158150852, "grad_norm": 2.6935275017311433, "learning_rate": 1.2781189500574351e-07, "loss": 0.4371, "step": 650 }, { "epoch": 1.5936739659367398, "grad_norm": 2.854986897085518, "learning_rate": 1.207996457710524e-07, "loss": 0.5956, "step": 655 }, { "epoch": 1.6058394160583942, "grad_norm": 2.2754521205487297, "learning_rate": 1.139587146406969e-07, "loss": 0.4162, "step": 660 }, { "epoch": 1.6180048661800486, "grad_norm": 2.593006197263767, "learning_rate": 1.0729219226409242e-07, "loss": 0.5706, "step": 665 }, { "epoch": 1.6301703163017032, "grad_norm": 2.525394495995131, "learning_rate": 1.008030904949026e-07, "loss": 0.4597, "step": 670 }, { "epoch": 1.6423357664233578, "grad_norm": 3.214640448617004, "learning_rate": 9.449434103032017e-08, "loss": 0.4982, "step": 675 }, { "epoch": 1.6545012165450121, "grad_norm": 2.6717273679286033, "learning_rate": 8.836879408656156e-08, "loss": 0.5335, "step": 680 }, { "epoch": 1.6666666666666665, "grad_norm": 2.817442563468328, "learning_rate": 8.242921711117467e-08, "loss": 0.5899, "step": 685 }, { "epoch": 1.6788321167883211, "grad_norm": 2.7046959833989987, "learning_rate": 7.667829353273941e-08, "loss": 0.4601, "step": 690 }, { "epoch": 1.6909975669099757, "grad_norm": 2.5686693044434055, "learning_rate": 7.111862154852671e-08, "loss": 0.4387, "step": 695 }, { "epoch": 1.7031630170316303, "grad_norm": 2.7549573918269377, "learning_rate": 6.575271295066593e-08, "loss": 0.4407, "step": 700 }, { "epoch": 1.7153284671532847, "grad_norm": 3.0386226251872763, "learning_rate": 6.058299199134637e-08, "loss": 0.4081, "step": 705 }, { "epoch": 1.727493917274939, "grad_norm": 3.065898386308577, "learning_rate": 5.5611794287570626e-08, "loss": 0.5519, "step": 710 }, { "epoch": 1.7396593673965937, "grad_norm": 2.796246594132091, "learning_rate": 5.0841365765950995e-08, "loss": 0.4406, "step": 715 }, { "epoch": 1.7518248175182483, "grad_norm": 2.6459204292065057, "learning_rate": 4.62738616480266e-08, "loss": 0.4446, "step": 720 }, { "epoch": 1.7639902676399026, "grad_norm": 2.638183132515643, "learning_rate": 4.191134547656145e-08, "loss": 0.4898, "step": 725 }, { "epoch": 1.7761557177615572, "grad_norm": 2.66741157267872, "learning_rate": 3.775578818326047e-08, "loss": 0.4401, "step": 730 }, { "epoch": 1.7883211678832116, "grad_norm": 2.7007012738881095, "learning_rate": 3.3809067198326266e-08, "loss": 0.5357, "step": 735 }, { "epoch": 1.8004866180048662, "grad_norm": 2.6246826305404265, "learning_rate": 3.007296560225975e-08, "loss": 0.468, "step": 740 }, { "epoch": 1.8126520681265208, "grad_norm": 2.801434059165694, "learning_rate": 2.6549171320285223e-08, "loss": 0.5324, "step": 745 }, { "epoch": 1.8248175182481752, "grad_norm": 2.675356520295435, "learning_rate": 2.3239276359767023e-08, "loss": 0.5333, "step": 750 }, { "epoch": 1.8369829683698295, "grad_norm": 2.7346081415961674, "learning_rate": 2.0144776090959716e-08, "loss": 0.5328, "step": 755 }, { "epoch": 1.8491484184914841, "grad_norm": 2.584180502082461, "learning_rate": 1.726706857141763e-08, "loss": 0.5026, "step": 760 }, { "epoch": 1.8613138686131387, "grad_norm": 2.7992025496113313, "learning_rate": 1.4607453914370182e-08, "loss": 0.4324, "step": 765 }, { "epoch": 1.8734793187347933, "grad_norm": 2.747712922707614, "learning_rate": 1.2167133701345977e-08, "loss": 0.4999, "step": 770 }, { "epoch": 1.8856447688564477, "grad_norm": 2.686543695188907, "learning_rate": 9.947210439313237e-09, "loss": 0.4602, "step": 775 }, { "epoch": 1.897810218978102, "grad_norm": 2.934346492252209, "learning_rate": 7.94868706258034e-09, "loss": 0.3804, "step": 780 }, { "epoch": 1.9099756690997567, "grad_norm": 2.6332167602444847, "learning_rate": 6.172466479682448e-09, "loss": 0.4494, "step": 785 }, { "epoch": 1.9221411192214113, "grad_norm": 2.7971963966287574, "learning_rate": 4.6193511654584184e-09, "loss": 0.5266, "step": 790 }, { "epoch": 1.9343065693430657, "grad_norm": 2.7935382704503877, "learning_rate": 3.290042798502424e-09, "loss": 0.4238, "step": 795 }, { "epoch": 1.94647201946472, "grad_norm": 2.526189372509031, "learning_rate": 2.1851419441539787e-09, "loss": 0.5444, "step": 800 }, { "epoch": 1.9586374695863746, "grad_norm": 2.674765086811712, "learning_rate": 1.3051477831699797e-09, "loss": 0.5028, "step": 805 }, { "epoch": 1.9708029197080292, "grad_norm": 2.515503451570525, "learning_rate": 6.504578862009391e-10, "loss": 0.5325, "step": 810 }, { "epoch": 1.9829683698296838, "grad_norm": 2.3964208464118606, "learning_rate": 2.2136803417321937e-10, "loss": 0.5472, "step": 815 }, { "epoch": 1.9951338199513382, "grad_norm": 2.530312218929402, "learning_rate": 1.8072084659093156e-11, "loss": 0.4639, "step": 820 }, { "epoch": 2.0, "eval_loss": 0.5793226361274719, "eval_runtime": 125.5722, "eval_samples_per_second": 1.075, "eval_steps_per_second": 0.135, "step": 822 }, { "epoch": 2.0, "step": 822, "total_flos": 17716450041856.0, "train_loss": 0.5335111168469245, "train_runtime": 39410.5295, "train_samples_per_second": 0.333, "train_steps_per_second": 0.021 } ], "logging_steps": 5, "max_steps": 822, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 17716450041856.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }