{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.30038022813688214, "eval_steps": 500, "global_step": 474, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0006337135614702154, "grad_norm": 0.22353313863277435, "learning_rate": 2e-05, "loss": 0.795, "step": 1 }, { "epoch": 0.0012674271229404308, "grad_norm": 0.270685613155365, "learning_rate": 4e-05, "loss": 0.9841, "step": 2 }, { "epoch": 0.0019011406844106464, "grad_norm": 0.13555319607257843, "learning_rate": 6e-05, "loss": 0.8728, "step": 3 }, { "epoch": 0.0025348542458808617, "grad_norm": 0.1665652096271515, "learning_rate": 8e-05, "loss": 0.8625, "step": 4 }, { "epoch": 0.0031685678073510772, "grad_norm": 0.13588839769363403, "learning_rate": 0.0001, "loss": 0.6776, "step": 5 }, { "epoch": 0.0038022813688212928, "grad_norm": 0.2811749279499054, "learning_rate": 0.00012, "loss": 0.8813, "step": 6 }, { "epoch": 0.004435994930291508, "grad_norm": 0.327694833278656, "learning_rate": 0.00014, "loss": 0.9009, "step": 7 }, { "epoch": 0.005069708491761723, "grad_norm": 0.24555213749408722, "learning_rate": 0.00016, "loss": 0.7054, "step": 8 }, { "epoch": 0.005703422053231939, "grad_norm": 0.14921338856220245, "learning_rate": 0.00018, "loss": 0.697, "step": 9 }, { "epoch": 0.0063371356147021544, "grad_norm": 0.13169103860855103, "learning_rate": 0.0002, "loss": 0.6007, "step": 10 }, { "epoch": 0.00697084917617237, "grad_norm": 0.06807047873735428, "learning_rate": 0.00019999979928608238, "loss": 0.6155, "step": 11 }, { "epoch": 0.0076045627376425855, "grad_norm": 0.08288167417049408, "learning_rate": 0.00019999919714513528, "loss": 0.5641, "step": 12 }, { "epoch": 0.008238276299112801, "grad_norm": 0.12285872548818588, "learning_rate": 0.00019999819357957582, "loss": 0.7526, "step": 13 }, { "epoch": 0.008871989860583017, "grad_norm": 0.15566691756248474, "learning_rate": 0.00019999678859343263, "loss": 0.4519, "step": 14 }, { "epoch": 0.009505703422053232, "grad_norm": 0.1301712989807129, "learning_rate": 0.00019999498219234568, "loss": 0.486, "step": 15 }, { "epoch": 0.010139416983523447, "grad_norm": 0.14493511617183685, "learning_rate": 0.00019999277438356638, "loss": 0.7146, "step": 16 }, { "epoch": 0.010773130544993664, "grad_norm": 0.1372271478176117, "learning_rate": 0.00019999016517595753, "loss": 0.5933, "step": 17 }, { "epoch": 0.011406844106463879, "grad_norm": 0.09944190829992294, "learning_rate": 0.00019998715457999314, "loss": 0.8399, "step": 18 }, { "epoch": 0.012040557667934094, "grad_norm": 0.057923465967178345, "learning_rate": 0.0001999837426077586, "loss": 0.5613, "step": 19 }, { "epoch": 0.012674271229404309, "grad_norm": 0.06214901804924011, "learning_rate": 0.00019997992927295059, "loss": 0.5374, "step": 20 }, { "epoch": 0.013307984790874524, "grad_norm": 0.04898112639784813, "learning_rate": 0.0001999757145908768, "loss": 0.5451, "step": 21 }, { "epoch": 0.01394169835234474, "grad_norm": 0.07026948034763336, "learning_rate": 0.0001999710985784562, "loss": 0.5635, "step": 22 }, { "epoch": 0.014575411913814956, "grad_norm": 0.0672365352511406, "learning_rate": 0.00019996608125421873, "loss": 0.5996, "step": 23 }, { "epoch": 0.015209125475285171, "grad_norm": 0.06477885693311691, "learning_rate": 0.00019996066263830531, "loss": 0.4707, "step": 24 }, { "epoch": 0.015842839036755388, "grad_norm": 0.07720793038606644, "learning_rate": 0.0001999548427524678, "loss": 0.5891, "step": 25 }, { "epoch": 0.016476552598225603, "grad_norm": 0.06699500977993011, "learning_rate": 0.0001999486216200688, "loss": 0.5316, "step": 26 }, { "epoch": 0.017110266159695818, "grad_norm": 0.07539479434490204, "learning_rate": 0.00019994199926608172, "loss": 0.5854, "step": 27 }, { "epoch": 0.017743979721166033, "grad_norm": 4.677523136138916, "learning_rate": 0.00019993497571709048, "loss": 0.5019, "step": 28 }, { "epoch": 0.018377693282636248, "grad_norm": 0.07100815325975418, "learning_rate": 0.00019992755100128962, "loss": 0.4729, "step": 29 }, { "epoch": 0.019011406844106463, "grad_norm": 0.06506210565567017, "learning_rate": 0.000199919725148484, "loss": 0.5597, "step": 30 }, { "epoch": 0.01964512040557668, "grad_norm": 0.04945315420627594, "learning_rate": 0.0001999114981900887, "loss": 0.5044, "step": 31 }, { "epoch": 0.020278833967046894, "grad_norm": 0.05103156715631485, "learning_rate": 0.0001999028701591291, "loss": 0.3637, "step": 32 }, { "epoch": 0.02091254752851711, "grad_norm": 0.05288761481642723, "learning_rate": 0.00019989384109024048, "loss": 0.4345, "step": 33 }, { "epoch": 0.021546261089987327, "grad_norm": 0.05457635968923569, "learning_rate": 0.0001998844110196681, "loss": 0.4714, "step": 34 }, { "epoch": 0.022179974651457542, "grad_norm": 0.055830612778663635, "learning_rate": 0.0001998745799852668, "loss": 0.5285, "step": 35 }, { "epoch": 0.022813688212927757, "grad_norm": 0.05858856439590454, "learning_rate": 0.00019986434802650113, "loss": 0.5106, "step": 36 }, { "epoch": 0.023447401774397972, "grad_norm": 0.05847540497779846, "learning_rate": 0.00019985371518444503, "loss": 0.4394, "step": 37 }, { "epoch": 0.024081115335868188, "grad_norm": 0.1140831857919693, "learning_rate": 0.00019984268150178167, "loss": 0.4782, "step": 38 }, { "epoch": 0.024714828897338403, "grad_norm": 0.06483329832553864, "learning_rate": 0.00019983124702280334, "loss": 0.396, "step": 39 }, { "epoch": 0.025348542458808618, "grad_norm": 0.07212468981742859, "learning_rate": 0.00019981941179341117, "loss": 0.5173, "step": 40 }, { "epoch": 0.025982256020278833, "grad_norm": 0.1697537750005722, "learning_rate": 0.00019980717586111512, "loss": 0.6164, "step": 41 }, { "epoch": 0.026615969581749048, "grad_norm": 0.05975339934229851, "learning_rate": 0.00019979453927503364, "loss": 0.4981, "step": 42 }, { "epoch": 0.027249683143219267, "grad_norm": 0.0607403926551342, "learning_rate": 0.00019978150208589348, "loss": 0.533, "step": 43 }, { "epoch": 0.02788339670468948, "grad_norm": 0.07225210964679718, "learning_rate": 0.00019976806434602952, "loss": 0.5055, "step": 44 }, { "epoch": 0.028517110266159697, "grad_norm": 0.07008686661720276, "learning_rate": 0.00019975422610938462, "loss": 0.6274, "step": 45 }, { "epoch": 0.029150823827629912, "grad_norm": 0.07289402186870575, "learning_rate": 0.0001997399874315093, "loss": 0.5247, "step": 46 }, { "epoch": 0.029784537389100127, "grad_norm": 0.10037431120872498, "learning_rate": 0.0001997253483695616, "loss": 0.647, "step": 47 }, { "epoch": 0.030418250950570342, "grad_norm": 0.06468270719051361, "learning_rate": 0.00019971030898230672, "loss": 0.5719, "step": 48 }, { "epoch": 0.031051964512040557, "grad_norm": 0.0472278967499733, "learning_rate": 0.00019969486933011705, "loss": 0.5565, "step": 49 }, { "epoch": 0.031685678073510776, "grad_norm": 0.0584145151078701, "learning_rate": 0.00019967902947497156, "loss": 0.5432, "step": 50 }, { "epoch": 0.03231939163498099, "grad_norm": 0.08962458372116089, "learning_rate": 0.00019966278948045592, "loss": 0.6432, "step": 51 }, { "epoch": 0.032953105196451206, "grad_norm": 0.08193643391132355, "learning_rate": 0.00019964614941176195, "loss": 0.5341, "step": 52 }, { "epoch": 0.03358681875792142, "grad_norm": 0.07166769355535507, "learning_rate": 0.00019962910933568747, "loss": 0.5481, "step": 53 }, { "epoch": 0.034220532319391636, "grad_norm": 0.10422351956367493, "learning_rate": 0.00019961166932063614, "loss": 0.6145, "step": 54 }, { "epoch": 0.03485424588086185, "grad_norm": 0.06273826211690903, "learning_rate": 0.00019959382943661704, "loss": 0.4969, "step": 55 }, { "epoch": 0.035487959442332066, "grad_norm": 0.06504670530557632, "learning_rate": 0.0001995755897552444, "loss": 0.6093, "step": 56 }, { "epoch": 0.03612167300380228, "grad_norm": 0.05045778304338455, "learning_rate": 0.00019955695034973742, "loss": 0.4191, "step": 57 }, { "epoch": 0.036755386565272496, "grad_norm": 0.06495866179466248, "learning_rate": 0.00019953791129491983, "loss": 0.4762, "step": 58 }, { "epoch": 0.037389100126742715, "grad_norm": 0.0814126655459404, "learning_rate": 0.0001995184726672197, "loss": 0.5599, "step": 59 }, { "epoch": 0.03802281368821293, "grad_norm": 0.052061304450035095, "learning_rate": 0.00019949863454466908, "loss": 0.4822, "step": 60 }, { "epoch": 0.038656527249683145, "grad_norm": 0.05419475957751274, "learning_rate": 0.00019947839700690375, "loss": 0.5625, "step": 61 }, { "epoch": 0.03929024081115336, "grad_norm": 0.06495067477226257, "learning_rate": 0.0001994577601351628, "loss": 0.5863, "step": 62 }, { "epoch": 0.039923954372623575, "grad_norm": 0.055791907012462616, "learning_rate": 0.00019943672401228837, "loss": 0.4588, "step": 63 }, { "epoch": 0.04055766793409379, "grad_norm": 0.03923908621072769, "learning_rate": 0.00019941528872272532, "loss": 0.3841, "step": 64 }, { "epoch": 0.041191381495564006, "grad_norm": 0.08200399577617645, "learning_rate": 0.00019939345435252088, "loss": 0.6163, "step": 65 }, { "epoch": 0.04182509505703422, "grad_norm": 0.05708305537700653, "learning_rate": 0.00019937122098932428, "loss": 0.6363, "step": 66 }, { "epoch": 0.042458808618504436, "grad_norm": 0.053468603640794754, "learning_rate": 0.0001993485887223864, "loss": 0.4777, "step": 67 }, { "epoch": 0.043092522179974654, "grad_norm": 0.08539824187755585, "learning_rate": 0.00019932555764255952, "loss": 0.4922, "step": 68 }, { "epoch": 0.043726235741444866, "grad_norm": 0.07483454793691635, "learning_rate": 0.00019930212784229675, "loss": 0.6337, "step": 69 }, { "epoch": 0.044359949302915085, "grad_norm": 0.06771700084209442, "learning_rate": 0.00019927829941565186, "loss": 0.4559, "step": 70 }, { "epoch": 0.044993662864385296, "grad_norm": 0.05689261853694916, "learning_rate": 0.0001992540724582788, "loss": 0.5489, "step": 71 }, { "epoch": 0.045627376425855515, "grad_norm": 0.05044565722346306, "learning_rate": 0.00019922944706743127, "loss": 0.4472, "step": 72 }, { "epoch": 0.046261089987325726, "grad_norm": 0.07331253588199615, "learning_rate": 0.00019920442334196248, "loss": 0.4752, "step": 73 }, { "epoch": 0.046894803548795945, "grad_norm": 0.057449884712696075, "learning_rate": 0.0001991790013823246, "loss": 0.4525, "step": 74 }, { "epoch": 0.04752851711026616, "grad_norm": 0.08357278257608414, "learning_rate": 0.00019915318129056853, "loss": 0.5813, "step": 75 }, { "epoch": 0.048162230671736375, "grad_norm": 0.051311176270246506, "learning_rate": 0.00019912696317034322, "loss": 0.4593, "step": 76 }, { "epoch": 0.048795944233206594, "grad_norm": 0.06535078585147858, "learning_rate": 0.00019910034712689552, "loss": 0.5339, "step": 77 }, { "epoch": 0.049429657794676805, "grad_norm": 0.13796891272068024, "learning_rate": 0.00019907333326706967, "loss": 0.5438, "step": 78 }, { "epoch": 0.050063371356147024, "grad_norm": 0.05667581036686897, "learning_rate": 0.0001990459216993068, "loss": 0.6295, "step": 79 }, { "epoch": 0.050697084917617236, "grad_norm": 0.05243121087551117, "learning_rate": 0.00019901811253364456, "loss": 0.4782, "step": 80 }, { "epoch": 0.051330798479087454, "grad_norm": 0.0769771933555603, "learning_rate": 0.0001989899058817167, "loss": 0.5692, "step": 81 }, { "epoch": 0.051964512040557666, "grad_norm": 0.07334766536951065, "learning_rate": 0.00019896130185675261, "loss": 0.569, "step": 82 }, { "epoch": 0.052598225602027884, "grad_norm": 0.07953603565692902, "learning_rate": 0.00019893230057357671, "loss": 0.4059, "step": 83 }, { "epoch": 0.053231939163498096, "grad_norm": 0.05282806232571602, "learning_rate": 0.00019890290214860833, "loss": 0.5186, "step": 84 }, { "epoch": 0.053865652724968315, "grad_norm": 0.06661225110292435, "learning_rate": 0.00019887310669986085, "loss": 0.6404, "step": 85 }, { "epoch": 0.05449936628643853, "grad_norm": 0.07150626182556152, "learning_rate": 0.00019884291434694152, "loss": 0.5865, "step": 86 }, { "epoch": 0.055133079847908745, "grad_norm": 0.054674554616212845, "learning_rate": 0.00019881232521105089, "loss": 0.5429, "step": 87 }, { "epoch": 0.05576679340937896, "grad_norm": 0.057950377464294434, "learning_rate": 0.00019878133941498224, "loss": 0.6705, "step": 88 }, { "epoch": 0.056400506970849175, "grad_norm": 0.07045155763626099, "learning_rate": 0.0001987499570831211, "loss": 0.5393, "step": 89 }, { "epoch": 0.057034220532319393, "grad_norm": 0.055960092693567276, "learning_rate": 0.00019871817834144504, "loss": 0.4481, "step": 90 }, { "epoch": 0.057667934093789605, "grad_norm": 0.05631652846932411, "learning_rate": 0.00019868600331752264, "loss": 0.5963, "step": 91 }, { "epoch": 0.058301647655259824, "grad_norm": 0.05120407044887543, "learning_rate": 0.00019865343214051347, "loss": 0.486, "step": 92 }, { "epoch": 0.058935361216730035, "grad_norm": 0.05507562682032585, "learning_rate": 0.0001986204649411673, "loss": 0.5514, "step": 93 }, { "epoch": 0.059569074778200254, "grad_norm": 0.057690516114234924, "learning_rate": 0.0001985871018518236, "loss": 0.4969, "step": 94 }, { "epoch": 0.060202788339670466, "grad_norm": 0.05942325294017792, "learning_rate": 0.00019855334300641114, "loss": 0.51, "step": 95 }, { "epoch": 0.060836501901140684, "grad_norm": 0.05777527391910553, "learning_rate": 0.0001985191885404473, "loss": 0.5401, "step": 96 }, { "epoch": 0.0614702154626109, "grad_norm": 0.07077159732580185, "learning_rate": 0.00019848463859103763, "loss": 0.5568, "step": 97 }, { "epoch": 0.062103929024081114, "grad_norm": 0.050649482756853104, "learning_rate": 0.00019844969329687527, "loss": 0.5418, "step": 98 }, { "epoch": 0.06273764258555133, "grad_norm": 0.059522844851017, "learning_rate": 0.00019841435279824028, "loss": 0.4679, "step": 99 }, { "epoch": 0.06337135614702155, "grad_norm": 0.061260003596544266, "learning_rate": 0.0001983786172369993, "loss": 0.557, "step": 100 }, { "epoch": 0.06400506970849176, "grad_norm": 0.0513591468334198, "learning_rate": 0.00019834248675660486, "loss": 0.5849, "step": 101 }, { "epoch": 0.06463878326996197, "grad_norm": 0.06722971051931381, "learning_rate": 0.0001983059615020947, "loss": 0.4003, "step": 102 }, { "epoch": 0.06527249683143219, "grad_norm": 0.0629379004240036, "learning_rate": 0.0001982690416200914, "loss": 0.5322, "step": 103 }, { "epoch": 0.06590621039290241, "grad_norm": 0.05402471870183945, "learning_rate": 0.00019823172725880165, "loss": 0.5634, "step": 104 }, { "epoch": 0.06653992395437262, "grad_norm": 0.15680162608623505, "learning_rate": 0.0001981940185680156, "loss": 0.5361, "step": 105 }, { "epoch": 0.06717363751584284, "grad_norm": 0.06348865479230881, "learning_rate": 0.00019815591569910654, "loss": 0.5322, "step": 106 }, { "epoch": 0.06780735107731306, "grad_norm": 0.05004284158349037, "learning_rate": 0.00019811741880502995, "loss": 0.5524, "step": 107 }, { "epoch": 0.06844106463878327, "grad_norm": 0.06271985173225403, "learning_rate": 0.00019807852804032305, "loss": 0.4347, "step": 108 }, { "epoch": 0.06907477820025348, "grad_norm": 0.1546468287706375, "learning_rate": 0.00019803924356110423, "loss": 0.4294, "step": 109 }, { "epoch": 0.0697084917617237, "grad_norm": 0.06472460180521011, "learning_rate": 0.00019799956552507233, "loss": 0.5693, "step": 110 }, { "epoch": 0.07034220532319392, "grad_norm": 0.06021984666585922, "learning_rate": 0.00019795949409150598, "loss": 0.6554, "step": 111 }, { "epoch": 0.07097591888466413, "grad_norm": 0.04533032327890396, "learning_rate": 0.00019791902942126313, "loss": 0.4425, "step": 112 }, { "epoch": 0.07160963244613434, "grad_norm": 0.0662391185760498, "learning_rate": 0.0001978781716767802, "loss": 0.5258, "step": 113 }, { "epoch": 0.07224334600760456, "grad_norm": 0.06131117045879364, "learning_rate": 0.00019783692102207155, "loss": 0.4556, "step": 114 }, { "epoch": 0.07287705956907478, "grad_norm": 0.07924918830394745, "learning_rate": 0.00019779527762272877, "loss": 0.5137, "step": 115 }, { "epoch": 0.07351077313054499, "grad_norm": 0.07061261683702469, "learning_rate": 0.0001977532416459201, "loss": 0.4554, "step": 116 }, { "epoch": 0.0741444866920152, "grad_norm": 0.04919254407286644, "learning_rate": 0.00019771081326038962, "loss": 0.5213, "step": 117 }, { "epoch": 0.07477820025348543, "grad_norm": 0.053799472749233246, "learning_rate": 0.00019766799263645673, "loss": 0.5648, "step": 118 }, { "epoch": 0.07541191381495564, "grad_norm": 0.06857369095087051, "learning_rate": 0.00019762477994601522, "loss": 0.6841, "step": 119 }, { "epoch": 0.07604562737642585, "grad_norm": 0.0719090923666954, "learning_rate": 0.000197581175362533, "loss": 0.4154, "step": 120 }, { "epoch": 0.07667934093789606, "grad_norm": 0.10528447479009628, "learning_rate": 0.00019753717906105092, "loss": 0.5674, "step": 121 }, { "epoch": 0.07731305449936629, "grad_norm": 0.05879104137420654, "learning_rate": 0.00019749279121818235, "loss": 0.5282, "step": 122 }, { "epoch": 0.0779467680608365, "grad_norm": 0.050949644297361374, "learning_rate": 0.00019744801201211255, "loss": 0.4398, "step": 123 }, { "epoch": 0.07858048162230671, "grad_norm": 0.061247747391462326, "learning_rate": 0.00019740284162259765, "loss": 0.4269, "step": 124 }, { "epoch": 0.07921419518377694, "grad_norm": 0.09446462988853455, "learning_rate": 0.0001973572802309642, "loss": 0.6362, "step": 125 }, { "epoch": 0.07984790874524715, "grad_norm": 0.06124195456504822, "learning_rate": 0.0001973113280201082, "loss": 0.435, "step": 126 }, { "epoch": 0.08048162230671736, "grad_norm": 0.05198049172759056, "learning_rate": 0.0001972649851744948, "loss": 0.4617, "step": 127 }, { "epoch": 0.08111533586818757, "grad_norm": 0.05457935482263565, "learning_rate": 0.00019721825188015693, "loss": 0.548, "step": 128 }, { "epoch": 0.0817490494296578, "grad_norm": 0.054542481899261475, "learning_rate": 0.0001971711283246951, "loss": 0.4449, "step": 129 }, { "epoch": 0.08238276299112801, "grad_norm": 0.0528152696788311, "learning_rate": 0.0001971236146972764, "loss": 0.5868, "step": 130 }, { "epoch": 0.08301647655259822, "grad_norm": 0.049837883561849594, "learning_rate": 0.0001970757111886337, "loss": 0.4426, "step": 131 }, { "epoch": 0.08365019011406843, "grad_norm": 0.04912682995200157, "learning_rate": 0.00019702741799106508, "loss": 0.5328, "step": 132 }, { "epoch": 0.08428390367553866, "grad_norm": 0.06654444336891174, "learning_rate": 0.00019697873529843282, "loss": 0.6239, "step": 133 }, { "epoch": 0.08491761723700887, "grad_norm": 0.1822642683982849, "learning_rate": 0.00019692966330616283, "loss": 0.6482, "step": 134 }, { "epoch": 0.08555133079847908, "grad_norm": 0.07404999434947968, "learning_rate": 0.00019688020221124376, "loss": 0.5473, "step": 135 }, { "epoch": 0.08618504435994931, "grad_norm": 0.08534666895866394, "learning_rate": 0.00019683035221222618, "loss": 0.4794, "step": 136 }, { "epoch": 0.08681875792141952, "grad_norm": 0.05804799869656563, "learning_rate": 0.00019678011350922185, "loss": 0.5749, "step": 137 }, { "epoch": 0.08745247148288973, "grad_norm": 0.0600556954741478, "learning_rate": 0.00019672948630390294, "loss": 0.4929, "step": 138 }, { "epoch": 0.08808618504435994, "grad_norm": 0.07564158737659454, "learning_rate": 0.00019667847079950118, "loss": 0.5806, "step": 139 }, { "epoch": 0.08871989860583017, "grad_norm": 0.06359097361564636, "learning_rate": 0.00019662706720080693, "loss": 0.5427, "step": 140 }, { "epoch": 0.08935361216730038, "grad_norm": 0.05452190712094307, "learning_rate": 0.00019657527571416856, "loss": 0.4845, "step": 141 }, { "epoch": 0.08998732572877059, "grad_norm": 0.05258841812610626, "learning_rate": 0.00019652309654749156, "loss": 0.5255, "step": 142 }, { "epoch": 0.09062103929024082, "grad_norm": 0.06789179146289825, "learning_rate": 0.0001964705299102376, "loss": 0.6002, "step": 143 }, { "epoch": 0.09125475285171103, "grad_norm": 0.05940316617488861, "learning_rate": 0.00019641757601342378, "loss": 0.6178, "step": 144 }, { "epoch": 0.09188846641318124, "grad_norm": 0.08051005005836487, "learning_rate": 0.00019636423506962181, "loss": 0.4728, "step": 145 }, { "epoch": 0.09252217997465145, "grad_norm": 0.06979210674762726, "learning_rate": 0.00019631050729295707, "loss": 0.5166, "step": 146 }, { "epoch": 0.09315589353612168, "grad_norm": 0.04284743592143059, "learning_rate": 0.00019625639289910777, "loss": 0.3685, "step": 147 }, { "epoch": 0.09378960709759189, "grad_norm": 0.05410388484597206, "learning_rate": 0.00019620189210530425, "loss": 0.582, "step": 148 }, { "epoch": 0.0944233206590621, "grad_norm": 0.08875017613172531, "learning_rate": 0.00019614700513032775, "loss": 0.6757, "step": 149 }, { "epoch": 0.09505703422053231, "grad_norm": 0.06792068481445312, "learning_rate": 0.00019609173219450998, "loss": 0.5236, "step": 150 }, { "epoch": 0.09569074778200254, "grad_norm": 0.060000237077474594, "learning_rate": 0.0001960360735197318, "loss": 0.4813, "step": 151 }, { "epoch": 0.09632446134347275, "grad_norm": 0.052172888070344925, "learning_rate": 0.00019598002932942266, "loss": 0.5792, "step": 152 }, { "epoch": 0.09695817490494296, "grad_norm": 0.04992865398526192, "learning_rate": 0.00019592359984855952, "loss": 0.4652, "step": 153 }, { "epoch": 0.09759188846641319, "grad_norm": 0.05908304825425148, "learning_rate": 0.00019586678530366606, "loss": 0.4968, "step": 154 }, { "epoch": 0.0982256020278834, "grad_norm": 0.16080443561077118, "learning_rate": 0.00019580958592281167, "loss": 0.4804, "step": 155 }, { "epoch": 0.09885931558935361, "grad_norm": 0.05863935872912407, "learning_rate": 0.00019575200193561057, "loss": 0.5313, "step": 156 }, { "epoch": 0.09949302915082382, "grad_norm": 0.047341488301754, "learning_rate": 0.0001956940335732209, "loss": 0.4939, "step": 157 }, { "epoch": 0.10012674271229405, "grad_norm": 0.059797484427690506, "learning_rate": 0.00019563568106834383, "loss": 0.4806, "step": 158 }, { "epoch": 0.10076045627376426, "grad_norm": 0.08543235808610916, "learning_rate": 0.00019557694465522255, "loss": 0.5691, "step": 159 }, { "epoch": 0.10139416983523447, "grad_norm": 0.0614972747862339, "learning_rate": 0.00019551782456964136, "loss": 0.5143, "step": 160 }, { "epoch": 0.10202788339670468, "grad_norm": 0.12742456793785095, "learning_rate": 0.00019545832104892475, "loss": 0.4987, "step": 161 }, { "epoch": 0.10266159695817491, "grad_norm": 0.06898955255746841, "learning_rate": 0.00019539843433193639, "loss": 0.5504, "step": 162 }, { "epoch": 0.10329531051964512, "grad_norm": 0.11239788681268692, "learning_rate": 0.0001953381646590783, "loss": 0.3448, "step": 163 }, { "epoch": 0.10392902408111533, "grad_norm": 0.24028901755809784, "learning_rate": 0.00019527751227228963, "loss": 0.5294, "step": 164 }, { "epoch": 0.10456273764258556, "grad_norm": 0.0903674066066742, "learning_rate": 0.00019521647741504604, "loss": 0.514, "step": 165 }, { "epoch": 0.10519645120405577, "grad_norm": 0.051598865538835526, "learning_rate": 0.00019515506033235833, "loss": 0.4771, "step": 166 }, { "epoch": 0.10583016476552598, "grad_norm": 0.05018608644604683, "learning_rate": 0.0001950932612707719, "loss": 0.4492, "step": 167 }, { "epoch": 0.10646387832699619, "grad_norm": 0.07150580734014511, "learning_rate": 0.00019503108047836523, "loss": 0.5806, "step": 168 }, { "epoch": 0.10709759188846642, "grad_norm": 0.05979820713400841, "learning_rate": 0.00019496851820474944, "loss": 0.6138, "step": 169 }, { "epoch": 0.10773130544993663, "grad_norm": 0.05117090418934822, "learning_rate": 0.00019490557470106686, "loss": 0.5138, "step": 170 }, { "epoch": 0.10836501901140684, "grad_norm": 0.049405183643102646, "learning_rate": 0.0001948422502199903, "loss": 0.4974, "step": 171 }, { "epoch": 0.10899873257287707, "grad_norm": 0.060524292290210724, "learning_rate": 0.00019477854501572176, "loss": 0.5448, "step": 172 }, { "epoch": 0.10963244613434728, "grad_norm": 0.05022512748837471, "learning_rate": 0.0001947144593439917, "loss": 0.5295, "step": 173 }, { "epoch": 0.11026615969581749, "grad_norm": 0.05024838447570801, "learning_rate": 0.0001946499934620579, "loss": 0.4842, "step": 174 }, { "epoch": 0.1108998732572877, "grad_norm": 0.05859989672899246, "learning_rate": 0.00019458514762870426, "loss": 0.5105, "step": 175 }, { "epoch": 0.11153358681875793, "grad_norm": 0.05963319167494774, "learning_rate": 0.00019451992210424006, "loss": 0.4833, "step": 176 }, { "epoch": 0.11216730038022814, "grad_norm": 0.05941782146692276, "learning_rate": 0.0001944543171504987, "loss": 0.4743, "step": 177 }, { "epoch": 0.11280101394169835, "grad_norm": 0.07598856091499329, "learning_rate": 0.00019438833303083678, "loss": 0.483, "step": 178 }, { "epoch": 0.11343472750316856, "grad_norm": 0.05751622095704079, "learning_rate": 0.0001943219700101328, "loss": 0.563, "step": 179 }, { "epoch": 0.11406844106463879, "grad_norm": 0.08273158222436905, "learning_rate": 0.0001942552283547865, "loss": 0.5514, "step": 180 }, { "epoch": 0.114702154626109, "grad_norm": 0.04589926823973656, "learning_rate": 0.00019418810833271745, "loss": 0.4353, "step": 181 }, { "epoch": 0.11533586818757921, "grad_norm": 0.04818568378686905, "learning_rate": 0.00019412061021336404, "loss": 0.4653, "step": 182 }, { "epoch": 0.11596958174904944, "grad_norm": 0.062292054295539856, "learning_rate": 0.0001940527342676826, "loss": 0.5451, "step": 183 }, { "epoch": 0.11660329531051965, "grad_norm": 0.05161510780453682, "learning_rate": 0.000193984480768146, "loss": 0.5174, "step": 184 }, { "epoch": 0.11723700887198986, "grad_norm": 0.0669926106929779, "learning_rate": 0.0001939158499887428, "loss": 0.5074, "step": 185 }, { "epoch": 0.11787072243346007, "grad_norm": 0.04856441915035248, "learning_rate": 0.00019384684220497605, "loss": 0.3898, "step": 186 }, { "epoch": 0.1185044359949303, "grad_norm": 0.05841194465756416, "learning_rate": 0.0001937774576938622, "loss": 0.5437, "step": 187 }, { "epoch": 0.11913814955640051, "grad_norm": 0.05253444239497185, "learning_rate": 0.00019370769673393007, "loss": 0.5669, "step": 188 }, { "epoch": 0.11977186311787072, "grad_norm": 0.05771539360284805, "learning_rate": 0.00019363755960521943, "loss": 0.4965, "step": 189 }, { "epoch": 0.12040557667934093, "grad_norm": 0.07135152071714401, "learning_rate": 0.00019356704658928035, "loss": 0.4089, "step": 190 }, { "epoch": 0.12103929024081116, "grad_norm": 0.05927246809005737, "learning_rate": 0.00019349615796917163, "loss": 0.465, "step": 191 }, { "epoch": 0.12167300380228137, "grad_norm": 0.06522128731012344, "learning_rate": 0.00019342489402945998, "loss": 0.3797, "step": 192 }, { "epoch": 0.12230671736375158, "grad_norm": 0.05745214596390724, "learning_rate": 0.0001933532550562187, "loss": 0.56, "step": 193 }, { "epoch": 0.1229404309252218, "grad_norm": 0.05626146122813225, "learning_rate": 0.0001932812413370265, "loss": 0.5439, "step": 194 }, { "epoch": 0.12357414448669202, "grad_norm": 0.07615689933300018, "learning_rate": 0.00019320885316096654, "loss": 0.5187, "step": 195 }, { "epoch": 0.12420785804816223, "grad_norm": 0.19566097855567932, "learning_rate": 0.00019313609081862508, "loss": 0.5535, "step": 196 }, { "epoch": 0.12484157160963244, "grad_norm": 0.052284326404333115, "learning_rate": 0.00019306295460209044, "loss": 0.4056, "step": 197 }, { "epoch": 0.12547528517110265, "grad_norm": 0.050081610679626465, "learning_rate": 0.00019298944480495176, "loss": 0.451, "step": 198 }, { "epoch": 0.12610899873257286, "grad_norm": 0.07420384138822556, "learning_rate": 0.00019291556172229785, "loss": 0.5485, "step": 199 }, { "epoch": 0.1267427122940431, "grad_norm": 0.046289846301078796, "learning_rate": 0.00019284130565071588, "loss": 0.4944, "step": 200 }, { "epoch": 0.12737642585551331, "grad_norm": 0.041031207889318466, "learning_rate": 0.00019276667688829043, "loss": 0.4507, "step": 201 }, { "epoch": 0.12801013941698353, "grad_norm": 0.07089229673147202, "learning_rate": 0.0001926916757346022, "loss": 0.513, "step": 202 }, { "epoch": 0.12864385297845374, "grad_norm": 0.04405022785067558, "learning_rate": 0.00019261630249072659, "loss": 0.3709, "step": 203 }, { "epoch": 0.12927756653992395, "grad_norm": 0.059661708772182465, "learning_rate": 0.00019254055745923285, "loss": 0.4813, "step": 204 }, { "epoch": 0.12991128010139416, "grad_norm": 0.07400868833065033, "learning_rate": 0.00019246444094418255, "loss": 0.5346, "step": 205 }, { "epoch": 0.13054499366286437, "grad_norm": 0.05862591415643692, "learning_rate": 0.0001923879532511287, "loss": 0.4856, "step": 206 }, { "epoch": 0.1311787072243346, "grad_norm": 0.05793355405330658, "learning_rate": 0.00019231109468711405, "loss": 0.5129, "step": 207 }, { "epoch": 0.13181242078580482, "grad_norm": 0.043961625546216965, "learning_rate": 0.00019223386556067033, "loss": 0.4803, "step": 208 }, { "epoch": 0.13244613434727504, "grad_norm": 0.07102088630199432, "learning_rate": 0.00019215626618181676, "loss": 0.5078, "step": 209 }, { "epoch": 0.13307984790874525, "grad_norm": 0.07707204669713974, "learning_rate": 0.00019207829686205882, "loss": 0.5465, "step": 210 }, { "epoch": 0.13371356147021546, "grad_norm": 0.06010926514863968, "learning_rate": 0.0001919999579143871, "loss": 0.5532, "step": 211 }, { "epoch": 0.13434727503168567, "grad_norm": 0.0627330020070076, "learning_rate": 0.0001919212496532759, "loss": 0.4055, "step": 212 }, { "epoch": 0.13498098859315588, "grad_norm": 0.04347623884677887, "learning_rate": 0.00019184217239468212, "loss": 0.4581, "step": 213 }, { "epoch": 0.13561470215462612, "grad_norm": 0.05672100558876991, "learning_rate": 0.00019176272645604386, "loss": 0.5335, "step": 214 }, { "epoch": 0.13624841571609633, "grad_norm": 0.05062992498278618, "learning_rate": 0.00019168291215627926, "loss": 0.4801, "step": 215 }, { "epoch": 0.13688212927756654, "grad_norm": 8.16939640045166, "learning_rate": 0.00019160272981578512, "loss": 0.5814, "step": 216 }, { "epoch": 0.13751584283903676, "grad_norm": 0.058165278285741806, "learning_rate": 0.00019152217975643566, "loss": 0.5163, "step": 217 }, { "epoch": 0.13814955640050697, "grad_norm": 0.06994735449552536, "learning_rate": 0.00019144126230158127, "loss": 0.5558, "step": 218 }, { "epoch": 0.13878326996197718, "grad_norm": 0.05495104938745499, "learning_rate": 0.0001913599777760471, "loss": 0.5298, "step": 219 }, { "epoch": 0.1394169835234474, "grad_norm": 0.060677338391542435, "learning_rate": 0.00019127832650613189, "loss": 0.5614, "step": 220 }, { "epoch": 0.14005069708491763, "grad_norm": 0.060457441955804825, "learning_rate": 0.00019119630881960658, "loss": 0.5139, "step": 221 }, { "epoch": 0.14068441064638784, "grad_norm": 0.0608784481883049, "learning_rate": 0.00019111392504571296, "loss": 0.4711, "step": 222 }, { "epoch": 0.14131812420785805, "grad_norm": 0.07560902833938599, "learning_rate": 0.00019103117551516244, "loss": 0.486, "step": 223 }, { "epoch": 0.14195183776932827, "grad_norm": 0.0847187414765358, "learning_rate": 0.00019094806056013468, "loss": 0.5934, "step": 224 }, { "epoch": 0.14258555133079848, "grad_norm": 0.06016870215535164, "learning_rate": 0.00019086458051427622, "loss": 0.4529, "step": 225 }, { "epoch": 0.1432192648922687, "grad_norm": 0.17245864868164062, "learning_rate": 0.00019078073571269922, "loss": 0.5307, "step": 226 }, { "epoch": 0.1438529784537389, "grad_norm": 0.0647033080458641, "learning_rate": 0.00019069652649198005, "loss": 0.569, "step": 227 }, { "epoch": 0.1444866920152091, "grad_norm": 0.07447489351034164, "learning_rate": 0.00019061195319015797, "loss": 0.547, "step": 228 }, { "epoch": 0.14512040557667935, "grad_norm": 0.05335066467523575, "learning_rate": 0.00019052701614673373, "loss": 0.5363, "step": 229 }, { "epoch": 0.14575411913814956, "grad_norm": 0.04057115688920021, "learning_rate": 0.0001904417157026683, "loss": 0.4354, "step": 230 }, { "epoch": 0.14638783269961977, "grad_norm": 0.05564083158969879, "learning_rate": 0.00019035605220038137, "loss": 0.5674, "step": 231 }, { "epoch": 0.14702154626108999, "grad_norm": 0.1210884302854538, "learning_rate": 0.00019027002598375012, "loss": 0.5645, "step": 232 }, { "epoch": 0.1476552598225602, "grad_norm": 0.05494518578052521, "learning_rate": 0.00019018363739810767, "loss": 0.6239, "step": 233 }, { "epoch": 0.1482889733840304, "grad_norm": 0.04633218050003052, "learning_rate": 0.0001900968867902419, "loss": 0.4787, "step": 234 }, { "epoch": 0.14892268694550062, "grad_norm": 0.06846950203180313, "learning_rate": 0.00019000977450839393, "loss": 0.5607, "step": 235 }, { "epoch": 0.14955640050697086, "grad_norm": 0.0618814192712307, "learning_rate": 0.0001899223009022566, "loss": 0.631, "step": 236 }, { "epoch": 0.15019011406844107, "grad_norm": 0.06061235070228577, "learning_rate": 0.00018983446632297343, "loss": 0.5989, "step": 237 }, { "epoch": 0.15082382762991128, "grad_norm": 0.06494279205799103, "learning_rate": 0.00018974627112313677, "loss": 0.5816, "step": 238 }, { "epoch": 0.1514575411913815, "grad_norm": 0.04907020181417465, "learning_rate": 0.0001896577156567868, "loss": 0.5097, "step": 239 }, { "epoch": 0.1520912547528517, "grad_norm": 0.04682941362261772, "learning_rate": 0.00018956880027940967, "loss": 0.5828, "step": 240 }, { "epoch": 0.15272496831432192, "grad_norm": 0.05498978868126869, "learning_rate": 0.00018947952534793661, "loss": 0.5257, "step": 241 }, { "epoch": 0.15335868187579213, "grad_norm": 0.04309950768947601, "learning_rate": 0.00018938989122074197, "loss": 0.3662, "step": 242 }, { "epoch": 0.15399239543726237, "grad_norm": 0.06519515067338943, "learning_rate": 0.00018929989825764207, "loss": 0.4058, "step": 243 }, { "epoch": 0.15462610899873258, "grad_norm": 0.046929214149713516, "learning_rate": 0.00018920954681989378, "loss": 0.4916, "step": 244 }, { "epoch": 0.1552598225602028, "grad_norm": 0.05388319492340088, "learning_rate": 0.00018911883727019285, "loss": 0.4143, "step": 245 }, { "epoch": 0.155893536121673, "grad_norm": 0.05619863048195839, "learning_rate": 0.00018902776997267268, "loss": 0.5107, "step": 246 }, { "epoch": 0.15652724968314322, "grad_norm": 0.053882747888565063, "learning_rate": 0.00018893634529290279, "loss": 0.5559, "step": 247 }, { "epoch": 0.15716096324461343, "grad_norm": 0.05231885239481926, "learning_rate": 0.00018884456359788724, "loss": 0.5076, "step": 248 }, { "epoch": 0.15779467680608364, "grad_norm": 0.07149146497249603, "learning_rate": 0.00018875242525606334, "loss": 0.558, "step": 249 }, { "epoch": 0.15842839036755388, "grad_norm": 0.04615316912531853, "learning_rate": 0.00018865993063730004, "loss": 0.4971, "step": 250 }, { "epoch": 0.1590621039290241, "grad_norm": 0.05331886187195778, "learning_rate": 0.00018856708011289643, "loss": 0.5506, "step": 251 }, { "epoch": 0.1596958174904943, "grad_norm": 0.05348580330610275, "learning_rate": 0.00018847387405558045, "loss": 0.4515, "step": 252 }, { "epoch": 0.1603295310519645, "grad_norm": 0.0438147634267807, "learning_rate": 0.00018838031283950705, "loss": 0.3818, "step": 253 }, { "epoch": 0.16096324461343473, "grad_norm": 0.0473354198038578, "learning_rate": 0.0001882863968402571, "loss": 0.4458, "step": 254 }, { "epoch": 0.16159695817490494, "grad_norm": 0.05930502712726593, "learning_rate": 0.0001881921264348355, "loss": 0.6228, "step": 255 }, { "epoch": 0.16223067173637515, "grad_norm": 0.04982107877731323, "learning_rate": 0.00018809750200166994, "loss": 0.5916, "step": 256 }, { "epoch": 0.1628643852978454, "grad_norm": 0.09739918261766434, "learning_rate": 0.0001880025239206092, "loss": 0.651, "step": 257 }, { "epoch": 0.1634980988593156, "grad_norm": 0.09072676301002502, "learning_rate": 0.00018790719257292174, "loss": 0.5564, "step": 258 }, { "epoch": 0.1641318124207858, "grad_norm": 0.0638791099190712, "learning_rate": 0.00018781150834129413, "loss": 0.4545, "step": 259 }, { "epoch": 0.16476552598225602, "grad_norm": 0.05755198001861572, "learning_rate": 0.0001877154716098295, "loss": 0.4457, "step": 260 }, { "epoch": 0.16539923954372623, "grad_norm": 0.2049247920513153, "learning_rate": 0.00018761908276404603, "loss": 0.5447, "step": 261 }, { "epoch": 0.16603295310519645, "grad_norm": 0.06760350614786148, "learning_rate": 0.00018752234219087538, "loss": 0.4743, "step": 262 }, { "epoch": 0.16666666666666666, "grad_norm": 0.061410121619701385, "learning_rate": 0.00018742525027866115, "loss": 0.547, "step": 263 }, { "epoch": 0.16730038022813687, "grad_norm": 0.04981521889567375, "learning_rate": 0.00018732780741715724, "loss": 0.4924, "step": 264 }, { "epoch": 0.1679340937896071, "grad_norm": 0.06636273115873337, "learning_rate": 0.00018723001399752653, "loss": 0.591, "step": 265 }, { "epoch": 0.16856780735107732, "grad_norm": 0.0517747662961483, "learning_rate": 0.00018713187041233896, "loss": 0.5294, "step": 266 }, { "epoch": 0.16920152091254753, "grad_norm": 0.11798780411481857, "learning_rate": 0.00018703337705557017, "loss": 0.4953, "step": 267 }, { "epoch": 0.16983523447401774, "grad_norm": 0.1441587656736374, "learning_rate": 0.00018693453432259998, "loss": 0.4898, "step": 268 }, { "epoch": 0.17046894803548795, "grad_norm": 0.06387986242771149, "learning_rate": 0.00018683534261021057, "loss": 0.4663, "step": 269 }, { "epoch": 0.17110266159695817, "grad_norm": 0.05943833664059639, "learning_rate": 0.0001867358023165851, "loss": 0.5607, "step": 270 }, { "epoch": 0.17173637515842838, "grad_norm": 0.05011943355202675, "learning_rate": 0.00018663591384130606, "loss": 0.5297, "step": 271 }, { "epoch": 0.17237008871989862, "grad_norm": 0.059131983667612076, "learning_rate": 0.00018653567758535354, "loss": 0.4896, "step": 272 }, { "epoch": 0.17300380228136883, "grad_norm": 0.06053609773516655, "learning_rate": 0.0001864350939511038, "loss": 0.5446, "step": 273 }, { "epoch": 0.17363751584283904, "grad_norm": 0.05496980994939804, "learning_rate": 0.00018633416334232753, "loss": 0.5427, "step": 274 }, { "epoch": 0.17427122940430925, "grad_norm": 0.05304751545190811, "learning_rate": 0.0001862328861641883, "loss": 0.4189, "step": 275 }, { "epoch": 0.17490494296577946, "grad_norm": 0.04881710559129715, "learning_rate": 0.00018613126282324092, "loss": 0.4555, "step": 276 }, { "epoch": 0.17553865652724968, "grad_norm": 0.051984284073114395, "learning_rate": 0.0001860292937274297, "loss": 0.5282, "step": 277 }, { "epoch": 0.1761723700887199, "grad_norm": 0.05241424962878227, "learning_rate": 0.00018592697928608703, "loss": 0.4924, "step": 278 }, { "epoch": 0.17680608365019013, "grad_norm": 0.04947778955101967, "learning_rate": 0.00018582431990993151, "loss": 0.4867, "step": 279 }, { "epoch": 0.17743979721166034, "grad_norm": 0.04952229931950569, "learning_rate": 0.00018572131601106654, "loss": 0.4362, "step": 280 }, { "epoch": 0.17807351077313055, "grad_norm": 0.061900023370981216, "learning_rate": 0.00018561796800297832, "loss": 0.6342, "step": 281 }, { "epoch": 0.17870722433460076, "grad_norm": 0.04405650496482849, "learning_rate": 0.00018551427630053463, "loss": 0.4612, "step": 282 }, { "epoch": 0.17934093789607097, "grad_norm": 0.5723605155944824, "learning_rate": 0.00018541024131998274, "loss": 0.4917, "step": 283 }, { "epoch": 0.17997465145754118, "grad_norm": 0.07066962867975235, "learning_rate": 0.0001853058634789481, "loss": 0.5386, "step": 284 }, { "epoch": 0.1806083650190114, "grad_norm": 0.041575830429792404, "learning_rate": 0.00018520114319643235, "loss": 0.4894, "step": 285 }, { "epoch": 0.18124207858048164, "grad_norm": 0.07731833308935165, "learning_rate": 0.0001850960808928119, "loss": 0.5382, "step": 286 }, { "epoch": 0.18187579214195185, "grad_norm": 0.05468999221920967, "learning_rate": 0.00018499067698983605, "loss": 0.4514, "step": 287 }, { "epoch": 0.18250950570342206, "grad_norm": 0.04942842200398445, "learning_rate": 0.00018488493191062542, "loss": 0.4329, "step": 288 }, { "epoch": 0.18314321926489227, "grad_norm": 0.053615666925907135, "learning_rate": 0.0001847788460796702, "loss": 0.5182, "step": 289 }, { "epoch": 0.18377693282636248, "grad_norm": 0.04232574254274368, "learning_rate": 0.00018467241992282843, "loss": 0.3108, "step": 290 }, { "epoch": 0.1844106463878327, "grad_norm": 0.04795556515455246, "learning_rate": 0.00018456565386732433, "loss": 0.383, "step": 291 }, { "epoch": 0.1850443599493029, "grad_norm": 0.053252723067998886, "learning_rate": 0.00018445854834174655, "loss": 0.4597, "step": 292 }, { "epoch": 0.18567807351077312, "grad_norm": 0.044747479259967804, "learning_rate": 0.00018435110377604654, "loss": 0.5066, "step": 293 }, { "epoch": 0.18631178707224336, "grad_norm": 0.0473531037569046, "learning_rate": 0.00018424332060153664, "loss": 0.4258, "step": 294 }, { "epoch": 0.18694550063371357, "grad_norm": 0.05739828571677208, "learning_rate": 0.0001841351992508885, "loss": 0.4498, "step": 295 }, { "epoch": 0.18757921419518378, "grad_norm": 0.0635855570435524, "learning_rate": 0.0001840267401581314, "loss": 0.5368, "step": 296 }, { "epoch": 0.188212927756654, "grad_norm": 0.05470935255289078, "learning_rate": 0.00018391794375865024, "loss": 0.5367, "step": 297 }, { "epoch": 0.1888466413181242, "grad_norm": 0.04850434139370918, "learning_rate": 0.00018380881048918405, "loss": 0.5369, "step": 298 }, { "epoch": 0.18948035487959441, "grad_norm": 0.1420743763446808, "learning_rate": 0.00018369934078782426, "loss": 0.5101, "step": 299 }, { "epoch": 0.19011406844106463, "grad_norm": 0.0749795064330101, "learning_rate": 0.00018358953509401262, "loss": 0.5756, "step": 300 }, { "epoch": 0.19074778200253487, "grad_norm": 0.05331069603562355, "learning_rate": 0.00018347939384853978, "loss": 0.5759, "step": 301 }, { "epoch": 0.19138149556400508, "grad_norm": 0.05981903895735741, "learning_rate": 0.00018336891749354335, "loss": 0.6036, "step": 302 }, { "epoch": 0.1920152091254753, "grad_norm": 0.08048289269208908, "learning_rate": 0.00018325810647250616, "loss": 0.4424, "step": 303 }, { "epoch": 0.1926489226869455, "grad_norm": 0.07861804962158203, "learning_rate": 0.00018314696123025454, "loss": 0.5725, "step": 304 }, { "epoch": 0.1932826362484157, "grad_norm": 0.14672251045703888, "learning_rate": 0.0001830354822129564, "loss": 0.5068, "step": 305 }, { "epoch": 0.19391634980988592, "grad_norm": 0.06640765070915222, "learning_rate": 0.0001829236698681195, "loss": 0.585, "step": 306 }, { "epoch": 0.19455006337135614, "grad_norm": 0.0588274821639061, "learning_rate": 0.0001828115246445898, "loss": 0.5779, "step": 307 }, { "epoch": 0.19518377693282637, "grad_norm": 0.05600736290216446, "learning_rate": 0.0001826990469925494, "loss": 0.5216, "step": 308 }, { "epoch": 0.1958174904942966, "grad_norm": 0.052844930440187454, "learning_rate": 0.0001825862373635149, "loss": 0.5482, "step": 309 }, { "epoch": 0.1964512040557668, "grad_norm": 0.04969317838549614, "learning_rate": 0.0001824730962103356, "loss": 0.5928, "step": 310 }, { "epoch": 0.197084917617237, "grad_norm": 0.06168043613433838, "learning_rate": 0.00018235962398719147, "loss": 0.5185, "step": 311 }, { "epoch": 0.19771863117870722, "grad_norm": 0.051151130348443985, "learning_rate": 0.00018224582114959172, "loss": 0.4677, "step": 312 }, { "epoch": 0.19835234474017743, "grad_norm": 0.060467127710580826, "learning_rate": 0.00018213168815437255, "loss": 0.5566, "step": 313 }, { "epoch": 0.19898605830164764, "grad_norm": 0.043170325458049774, "learning_rate": 0.0001820172254596956, "loss": 0.489, "step": 314 }, { "epoch": 0.19961977186311788, "grad_norm": 0.06550537794828415, "learning_rate": 0.00018190243352504597, "loss": 0.5809, "step": 315 }, { "epoch": 0.2002534854245881, "grad_norm": 0.04956373944878578, "learning_rate": 0.00018178731281123044, "loss": 0.462, "step": 316 }, { "epoch": 0.2008871989860583, "grad_norm": 0.05908495932817459, "learning_rate": 0.00018167186378037563, "loss": 0.4611, "step": 317 }, { "epoch": 0.20152091254752852, "grad_norm": 0.047168437391519547, "learning_rate": 0.00018155608689592604, "loss": 0.5283, "step": 318 }, { "epoch": 0.20215462610899873, "grad_norm": 0.04968830570578575, "learning_rate": 0.00018143998262264233, "loss": 0.4982, "step": 319 }, { "epoch": 0.20278833967046894, "grad_norm": 0.06764087826013565, "learning_rate": 0.00018132355142659937, "loss": 0.5244, "step": 320 }, { "epoch": 0.20342205323193915, "grad_norm": 0.06344570964574814, "learning_rate": 0.0001812067937751844, "loss": 0.606, "step": 321 }, { "epoch": 0.20405576679340937, "grad_norm": 0.06029113009572029, "learning_rate": 0.0001810897101370951, "loss": 0.5407, "step": 322 }, { "epoch": 0.2046894803548796, "grad_norm": 0.08346560597419739, "learning_rate": 0.00018097230098233785, "loss": 0.4814, "step": 323 }, { "epoch": 0.20532319391634982, "grad_norm": 0.04595065116882324, "learning_rate": 0.00018085456678222558, "loss": 0.471, "step": 324 }, { "epoch": 0.20595690747782003, "grad_norm": 0.4050588309764862, "learning_rate": 0.00018073650800937624, "loss": 0.4586, "step": 325 }, { "epoch": 0.20659062103929024, "grad_norm": 0.055679477751255035, "learning_rate": 0.00018061812513771053, "loss": 0.516, "step": 326 }, { "epoch": 0.20722433460076045, "grad_norm": 0.05209626257419586, "learning_rate": 0.00018049941864245033, "loss": 0.4528, "step": 327 }, { "epoch": 0.20785804816223066, "grad_norm": 0.05503727123141289, "learning_rate": 0.00018038038900011652, "loss": 0.4297, "step": 328 }, { "epoch": 0.20849176172370087, "grad_norm": 0.05453247204422951, "learning_rate": 0.0001802610366885271, "loss": 0.4731, "step": 329 }, { "epoch": 0.20912547528517111, "grad_norm": 0.05371938645839691, "learning_rate": 0.00018014136218679567, "loss": 0.569, "step": 330 }, { "epoch": 0.20975918884664133, "grad_norm": 0.05164814740419388, "learning_rate": 0.0001800213659753289, "loss": 0.4883, "step": 331 }, { "epoch": 0.21039290240811154, "grad_norm": 0.06455442309379578, "learning_rate": 0.00017990104853582493, "loss": 0.4829, "step": 332 }, { "epoch": 0.21102661596958175, "grad_norm": 0.04764432832598686, "learning_rate": 0.0001797804103512715, "loss": 0.5525, "step": 333 }, { "epoch": 0.21166032953105196, "grad_norm": 0.0578368604183197, "learning_rate": 0.00017965945190594388, "loss": 0.4824, "step": 334 }, { "epoch": 0.21229404309252217, "grad_norm": 0.05196613445878029, "learning_rate": 0.00017953817368540292, "loss": 0.5036, "step": 335 }, { "epoch": 0.21292775665399238, "grad_norm": 0.044868264347314835, "learning_rate": 0.00017941657617649316, "loss": 0.36, "step": 336 }, { "epoch": 0.21356147021546262, "grad_norm": 0.0686643123626709, "learning_rate": 0.00017929465986734084, "loss": 0.6069, "step": 337 }, { "epoch": 0.21419518377693283, "grad_norm": 0.08286602050065994, "learning_rate": 0.000179172425247352, "loss": 0.5635, "step": 338 }, { "epoch": 0.21482889733840305, "grad_norm": 0.5979371070861816, "learning_rate": 0.00017904987280721035, "loss": 0.3994, "step": 339 }, { "epoch": 0.21546261089987326, "grad_norm": 0.05577315390110016, "learning_rate": 0.00017892700303887558, "loss": 0.5699, "step": 340 }, { "epoch": 0.21609632446134347, "grad_norm": 0.06650438159704208, "learning_rate": 0.0001788038164355811, "loss": 0.5557, "step": 341 }, { "epoch": 0.21673003802281368, "grad_norm": 0.06644187867641449, "learning_rate": 0.00017868031349183217, "loss": 0.5593, "step": 342 }, { "epoch": 0.2173637515842839, "grad_norm": 0.05286836251616478, "learning_rate": 0.00017855649470340413, "loss": 0.4902, "step": 343 }, { "epoch": 0.21799746514575413, "grad_norm": 0.05314694344997406, "learning_rate": 0.00017843236056733992, "loss": 0.5036, "step": 344 }, { "epoch": 0.21863117870722434, "grad_norm": 0.0668027251958847, "learning_rate": 0.0001783079115819486, "loss": 0.6198, "step": 345 }, { "epoch": 0.21926489226869456, "grad_norm": 0.04909252002835274, "learning_rate": 0.000178183148246803, "loss": 0.4273, "step": 346 }, { "epoch": 0.21989860583016477, "grad_norm": 0.053546786308288574, "learning_rate": 0.00017805807106273787, "loss": 0.5077, "step": 347 }, { "epoch": 0.22053231939163498, "grad_norm": 0.0647466629743576, "learning_rate": 0.00017793268053184786, "loss": 0.5262, "step": 348 }, { "epoch": 0.2211660329531052, "grad_norm": 0.05518212169408798, "learning_rate": 0.00017780697715748546, "loss": 0.5621, "step": 349 }, { "epoch": 0.2217997465145754, "grad_norm": 0.0661974772810936, "learning_rate": 0.00017768096144425902, "loss": 0.5727, "step": 350 }, { "epoch": 0.2224334600760456, "grad_norm": 0.09333747625350952, "learning_rate": 0.00017755463389803065, "loss": 0.4891, "step": 351 }, { "epoch": 0.22306717363751585, "grad_norm": 0.04791216179728508, "learning_rate": 0.0001774279950259143, "loss": 0.5569, "step": 352 }, { "epoch": 0.22370088719898606, "grad_norm": 0.05712969973683357, "learning_rate": 0.0001773010453362737, "loss": 0.5433, "step": 353 }, { "epoch": 0.22433460076045628, "grad_norm": 0.05735623091459274, "learning_rate": 0.00017717378533872017, "loss": 0.5702, "step": 354 }, { "epoch": 0.2249683143219265, "grad_norm": 0.05040268227458, "learning_rate": 0.00017704621554411084, "loss": 0.4964, "step": 355 }, { "epoch": 0.2256020278833967, "grad_norm": 0.04687810316681862, "learning_rate": 0.00017691833646454628, "loss": 0.5242, "step": 356 }, { "epoch": 0.2262357414448669, "grad_norm": 0.051406193524599075, "learning_rate": 0.00017679014861336878, "loss": 0.5146, "step": 357 }, { "epoch": 0.22686945500633712, "grad_norm": 0.04884679988026619, "learning_rate": 0.00017666165250516006, "loss": 0.4825, "step": 358 }, { "epoch": 0.22750316856780736, "grad_norm": 0.053725842386484146, "learning_rate": 0.0001765328486557392, "loss": 0.4932, "step": 359 }, { "epoch": 0.22813688212927757, "grad_norm": 0.06212908402085304, "learning_rate": 0.00017640373758216077, "loss": 0.506, "step": 360 }, { "epoch": 0.22877059569074779, "grad_norm": 0.05059286579489708, "learning_rate": 0.0001762743198027125, "loss": 0.4719, "step": 361 }, { "epoch": 0.229404309252218, "grad_norm": 0.04520050436258316, "learning_rate": 0.00017614459583691346, "loss": 0.4553, "step": 362 }, { "epoch": 0.2300380228136882, "grad_norm": 0.05503036454319954, "learning_rate": 0.0001760145662055117, "loss": 0.4706, "step": 363 }, { "epoch": 0.23067173637515842, "grad_norm": 0.046107854694128036, "learning_rate": 0.00017588423143048235, "loss": 0.4177, "step": 364 }, { "epoch": 0.23130544993662863, "grad_norm": 0.12301266193389893, "learning_rate": 0.0001757535920350255, "loss": 0.5922, "step": 365 }, { "epoch": 0.23193916349809887, "grad_norm": 1.179470419883728, "learning_rate": 0.00017562264854356405, "loss": 0.5123, "step": 366 }, { "epoch": 0.23257287705956908, "grad_norm": 0.11167129874229431, "learning_rate": 0.0001754914014817416, "loss": 0.3884, "step": 367 }, { "epoch": 0.2332065906210393, "grad_norm": 0.055067550390958786, "learning_rate": 0.00017535985137642044, "loss": 0.4544, "step": 368 }, { "epoch": 0.2338403041825095, "grad_norm": 0.07947530597448349, "learning_rate": 0.0001752279987556792, "loss": 0.6575, "step": 369 }, { "epoch": 0.23447401774397972, "grad_norm": 0.10236025601625443, "learning_rate": 0.00017509584414881113, "loss": 0.5334, "step": 370 }, { "epoch": 0.23510773130544993, "grad_norm": 0.12996040284633636, "learning_rate": 0.00017496338808632155, "loss": 0.3897, "step": 371 }, { "epoch": 0.23574144486692014, "grad_norm": 0.07005209475755692, "learning_rate": 0.00017483063109992596, "loss": 0.5077, "step": 372 }, { "epoch": 0.23637515842839038, "grad_norm": 0.04446430131793022, "learning_rate": 0.00017469757372254785, "loss": 0.4467, "step": 373 }, { "epoch": 0.2370088719898606, "grad_norm": 6.105027198791504, "learning_rate": 0.00017456421648831655, "loss": 1.722, "step": 374 }, { "epoch": 0.2376425855513308, "grad_norm": 0.07488813251256943, "learning_rate": 0.0001744305599325652, "loss": 0.7018, "step": 375 }, { "epoch": 0.23827629911280102, "grad_norm": 0.05676595866680145, "learning_rate": 0.00017429660459182834, "loss": 0.4865, "step": 376 }, { "epoch": 0.23891001267427123, "grad_norm": 0.058106616139411926, "learning_rate": 0.00017416235100384007, "loss": 0.4453, "step": 377 }, { "epoch": 0.23954372623574144, "grad_norm": 0.4252207577228546, "learning_rate": 0.00017402779970753155, "loss": 3.008, "step": 378 }, { "epoch": 0.24017743979721165, "grad_norm": 0.24036817252635956, "learning_rate": 0.00017389295124302923, "loss": 0.7246, "step": 379 }, { "epoch": 0.24081115335868186, "grad_norm": 4.316144943237305, "learning_rate": 0.00017375780615165235, "loss": 0.664, "step": 380 }, { "epoch": 0.2414448669201521, "grad_norm": 6.4877166748046875, "learning_rate": 0.00017362236497591094, "loss": 0.487, "step": 381 }, { "epoch": 0.2420785804816223, "grad_norm": 0.12358918786048889, "learning_rate": 0.00017348662825950357, "loss": 0.4839, "step": 382 }, { "epoch": 0.24271229404309252, "grad_norm": 0.7211472988128662, "learning_rate": 0.0001733505965473152, "loss": 0.6351, "step": 383 }, { "epoch": 0.24334600760456274, "grad_norm": 0.10177785158157349, "learning_rate": 0.00017321427038541494, "loss": 0.6043, "step": 384 }, { "epoch": 0.24397972116603295, "grad_norm": 0.054658226668834686, "learning_rate": 0.00017307765032105406, "loss": 0.473, "step": 385 }, { "epoch": 0.24461343472750316, "grad_norm": 0.10075858235359192, "learning_rate": 0.00017294073690266344, "loss": 0.4892, "step": 386 }, { "epoch": 0.24524714828897337, "grad_norm": 0.06497970223426819, "learning_rate": 0.00017280353067985167, "loss": 0.4986, "step": 387 }, { "epoch": 0.2458808618504436, "grad_norm": 0.7542481422424316, "learning_rate": 0.0001726660322034027, "loss": 0.5513, "step": 388 }, { "epoch": 0.24651457541191382, "grad_norm": 0.08190987259149551, "learning_rate": 0.00017252824202527376, "loss": 0.5077, "step": 389 }, { "epoch": 0.24714828897338403, "grad_norm": 0.08874624967575073, "learning_rate": 0.0001723901606985929, "loss": 0.3973, "step": 390 }, { "epoch": 0.24778200253485425, "grad_norm": 0.32968223094940186, "learning_rate": 0.00017225178877765704, "loss": 0.4411, "step": 391 }, { "epoch": 0.24841571609632446, "grad_norm": 0.39434677362442017, "learning_rate": 0.00017211312681792958, "loss": 0.5201, "step": 392 }, { "epoch": 0.24904942965779467, "grad_norm": 0.11154969036579132, "learning_rate": 0.00017197417537603827, "loss": 0.6205, "step": 393 }, { "epoch": 0.24968314321926488, "grad_norm": 0.07316391915082932, "learning_rate": 0.00017183493500977278, "loss": 0.5129, "step": 394 }, { "epoch": 0.2503168567807351, "grad_norm": 0.08883780986070633, "learning_rate": 0.00017169540627808274, "loss": 0.5036, "step": 395 }, { "epoch": 0.2509505703422053, "grad_norm": 0.07377318292856216, "learning_rate": 0.00017155558974107536, "loss": 0.591, "step": 396 }, { "epoch": 0.25158428390367554, "grad_norm": 0.064984992146492, "learning_rate": 0.00017141548596001305, "loss": 0.645, "step": 397 }, { "epoch": 0.2522179974651457, "grad_norm": 0.07279626280069351, "learning_rate": 0.00017127509549731148, "loss": 0.5108, "step": 398 }, { "epoch": 0.25285171102661597, "grad_norm": 0.06948740035295486, "learning_rate": 0.000171134418916537, "loss": 0.4959, "step": 399 }, { "epoch": 0.2534854245880862, "grad_norm": 1.0025055408477783, "learning_rate": 0.00017099345678240452, "loss": 0.5248, "step": 400 }, { "epoch": 0.2541191381495564, "grad_norm": 0.34188470244407654, "learning_rate": 0.00017085220966077538, "loss": 0.5588, "step": 401 }, { "epoch": 0.25475285171102663, "grad_norm": 0.04984923452138901, "learning_rate": 0.00017071067811865476, "loss": 0.4033, "step": 402 }, { "epoch": 0.2553865652724968, "grad_norm": 0.05613204464316368, "learning_rate": 0.0001705688627241897, "loss": 0.5774, "step": 403 }, { "epoch": 0.25602027883396705, "grad_norm": 0.058507829904556274, "learning_rate": 0.0001704267640466667, "loss": 0.52, "step": 404 }, { "epoch": 0.25665399239543724, "grad_norm": 0.23744581639766693, "learning_rate": 0.00017028438265650933, "loss": 0.6028, "step": 405 }, { "epoch": 0.2572877059569075, "grad_norm": 0.11817914992570877, "learning_rate": 0.00017014171912527616, "loss": 0.5416, "step": 406 }, { "epoch": 0.2579214195183777, "grad_norm": 0.29011303186416626, "learning_rate": 0.00016999877402565833, "loss": 0.4381, "step": 407 }, { "epoch": 0.2585551330798479, "grad_norm": 0.06895189732313156, "learning_rate": 0.00016985554793147727, "loss": 0.5046, "step": 408 }, { "epoch": 0.25918884664131814, "grad_norm": 0.059166181832551956, "learning_rate": 0.00016971204141768233, "loss": 0.582, "step": 409 }, { "epoch": 0.2598225602027883, "grad_norm": 0.09994165599346161, "learning_rate": 0.00016956825506034867, "loss": 0.6042, "step": 410 }, { "epoch": 0.26045627376425856, "grad_norm": 0.09195294976234436, "learning_rate": 0.00016942418943667468, "loss": 0.577, "step": 411 }, { "epoch": 0.26108998732572875, "grad_norm": 0.08966407924890518, "learning_rate": 0.00016927984512497992, "loss": 0.5795, "step": 412 }, { "epoch": 0.261723700887199, "grad_norm": 0.08420640975236893, "learning_rate": 0.00016913522270470263, "loss": 0.4446, "step": 413 }, { "epoch": 0.2623574144486692, "grad_norm": 0.05902143940329552, "learning_rate": 0.0001689903227563975, "loss": 0.4458, "step": 414 }, { "epoch": 0.2629911280101394, "grad_norm": 0.046236153692007065, "learning_rate": 0.0001688451458617332, "loss": 0.3762, "step": 415 }, { "epoch": 0.26362484157160965, "grad_norm": 0.10383841395378113, "learning_rate": 0.00016869969260349018, "loss": 0.6076, "step": 416 }, { "epoch": 0.26425855513307983, "grad_norm": 0.059753723442554474, "learning_rate": 0.00016855396356555834, "loss": 0.4116, "step": 417 }, { "epoch": 0.26489226869455007, "grad_norm": 0.05825261399149895, "learning_rate": 0.00016840795933293463, "loss": 0.5377, "step": 418 }, { "epoch": 0.26552598225602025, "grad_norm": 0.07149126380681992, "learning_rate": 0.00016826168049172062, "loss": 0.5946, "step": 419 }, { "epoch": 0.2661596958174905, "grad_norm": 0.0636037141084671, "learning_rate": 0.00016811512762912034, "loss": 0.4232, "step": 420 }, { "epoch": 0.26679340937896073, "grad_norm": 0.06662221997976303, "learning_rate": 0.00016796830133343775, "loss": 0.5406, "step": 421 }, { "epoch": 0.2674271229404309, "grad_norm": 0.058340173214673996, "learning_rate": 0.00016782120219407452, "loss": 0.5402, "step": 422 }, { "epoch": 0.26806083650190116, "grad_norm": 0.054275717586278915, "learning_rate": 0.00016767383080152742, "loss": 0.5215, "step": 423 }, { "epoch": 0.26869455006337134, "grad_norm": 0.055525969713926315, "learning_rate": 0.00016752618774738639, "loss": 0.5743, "step": 424 }, { "epoch": 0.2693282636248416, "grad_norm": 0.05762525647878647, "learning_rate": 0.00016737827362433164, "loss": 0.5806, "step": 425 }, { "epoch": 0.26996197718631176, "grad_norm": 0.059116896241903305, "learning_rate": 0.0001672300890261317, "loss": 0.4828, "step": 426 }, { "epoch": 0.270595690747782, "grad_norm": 0.046420734375715256, "learning_rate": 0.00016708163454764075, "loss": 0.4509, "step": 427 }, { "epoch": 0.27122940430925224, "grad_norm": 0.11202160269021988, "learning_rate": 0.00016693291078479638, "loss": 0.5139, "step": 428 }, { "epoch": 0.2718631178707224, "grad_norm": 0.08383259177207947, "learning_rate": 0.00016678391833461722, "loss": 0.7026, "step": 429 }, { "epoch": 0.27249683143219267, "grad_norm": 0.058648403733968735, "learning_rate": 0.0001666346577952004, "loss": 0.4704, "step": 430 }, { "epoch": 0.27313054499366285, "grad_norm": 0.08609268069267273, "learning_rate": 0.0001664851297657193, "loss": 0.5186, "step": 431 }, { "epoch": 0.2737642585551331, "grad_norm": 0.10570003092288971, "learning_rate": 0.00016633533484642103, "loss": 0.4615, "step": 432 }, { "epoch": 0.2743979721166033, "grad_norm": 0.09764793515205383, "learning_rate": 0.00016618527363862408, "loss": 0.4519, "step": 433 }, { "epoch": 0.2750316856780735, "grad_norm": 0.08797989040613174, "learning_rate": 0.00016603494674471593, "loss": 0.6139, "step": 434 }, { "epoch": 0.27566539923954375, "grad_norm": 0.0714520812034607, "learning_rate": 0.0001658843547681506, "loss": 0.5027, "step": 435 }, { "epoch": 0.27629911280101394, "grad_norm": 0.08733757585287094, "learning_rate": 0.00016573349831344616, "loss": 0.4582, "step": 436 }, { "epoch": 0.2769328263624842, "grad_norm": 0.0712830200791359, "learning_rate": 0.00016558237798618245, "loss": 0.4336, "step": 437 }, { "epoch": 0.27756653992395436, "grad_norm": 0.06345337629318237, "learning_rate": 0.00016543099439299844, "loss": 0.4587, "step": 438 }, { "epoch": 0.2782002534854246, "grad_norm": 0.06224706023931503, "learning_rate": 0.0001652793481415901, "loss": 0.5171, "step": 439 }, { "epoch": 0.2788339670468948, "grad_norm": 0.0549205057322979, "learning_rate": 0.00016512743984070769, "loss": 0.5189, "step": 440 }, { "epoch": 0.279467680608365, "grad_norm": 0.07211892306804657, "learning_rate": 0.00016497527010015336, "loss": 0.6118, "step": 441 }, { "epoch": 0.28010139416983526, "grad_norm": 0.05902037024497986, "learning_rate": 0.00016482283953077887, "loss": 0.5376, "step": 442 }, { "epoch": 0.28073510773130544, "grad_norm": 0.04935478791594505, "learning_rate": 0.00016467014874448288, "loss": 0.5468, "step": 443 }, { "epoch": 0.2813688212927757, "grad_norm": 0.08219460397958755, "learning_rate": 0.00016451719835420877, "loss": 0.5723, "step": 444 }, { "epoch": 0.28200253485424587, "grad_norm": 0.08607888221740723, "learning_rate": 0.000164363988973942, "loss": 0.4821, "step": 445 }, { "epoch": 0.2826362484157161, "grad_norm": 0.05368666350841522, "learning_rate": 0.00016421052121870755, "loss": 0.4759, "step": 446 }, { "epoch": 0.2832699619771863, "grad_norm": 0.09421613812446594, "learning_rate": 0.00016405679570456782, "loss": 0.4634, "step": 447 }, { "epoch": 0.28390367553865653, "grad_norm": 0.06585177779197693, "learning_rate": 0.0001639028130486198, "loss": 0.5049, "step": 448 }, { "epoch": 0.28453738910012677, "grad_norm": 0.07445032149553299, "learning_rate": 0.00016374857386899268, "loss": 0.6255, "step": 449 }, { "epoch": 0.28517110266159695, "grad_norm": 0.05892190709710121, "learning_rate": 0.00016359407878484552, "loss": 0.5035, "step": 450 }, { "epoch": 0.2858048162230672, "grad_norm": 0.08238600939512253, "learning_rate": 0.00016343932841636456, "loss": 0.4818, "step": 451 }, { "epoch": 0.2864385297845374, "grad_norm": 0.0664915144443512, "learning_rate": 0.00016328432338476084, "loss": 0.4375, "step": 452 }, { "epoch": 0.2870722433460076, "grad_norm": 0.04862099885940552, "learning_rate": 0.00016312906431226773, "loss": 0.4138, "step": 453 }, { "epoch": 0.2877059569074778, "grad_norm": 0.04187007248401642, "learning_rate": 0.00016297355182213837, "loss": 0.3836, "step": 454 }, { "epoch": 0.28833967046894804, "grad_norm": 0.05451095104217529, "learning_rate": 0.00016281778653864316, "loss": 0.4451, "step": 455 }, { "epoch": 0.2889733840304182, "grad_norm": 0.061764512211084366, "learning_rate": 0.0001626617690870673, "loss": 0.6315, "step": 456 }, { "epoch": 0.28960709759188846, "grad_norm": 0.05365981534123421, "learning_rate": 0.0001625055000937083, "loss": 0.4399, "step": 457 }, { "epoch": 0.2902408111533587, "grad_norm": 0.10771326720714569, "learning_rate": 0.00016234898018587337, "loss": 0.5229, "step": 458 }, { "epoch": 0.2908745247148289, "grad_norm": 0.05859148129820824, "learning_rate": 0.000162192209991877, "loss": 0.4254, "step": 459 }, { "epoch": 0.2915082382762991, "grad_norm": 0.08183909952640533, "learning_rate": 0.00016203519014103837, "loss": 0.3658, "step": 460 }, { "epoch": 0.2921419518377693, "grad_norm": 0.04404648020863533, "learning_rate": 0.00016187792126367886, "loss": 0.4138, "step": 461 }, { "epoch": 0.29277566539923955, "grad_norm": 0.056379418820142746, "learning_rate": 0.00016172040399111957, "loss": 0.4781, "step": 462 }, { "epoch": 0.29340937896070973, "grad_norm": 0.0440094955265522, "learning_rate": 0.00016156263895567867, "loss": 0.4623, "step": 463 }, { "epoch": 0.29404309252217997, "grad_norm": 0.055651161819696426, "learning_rate": 0.00016140462679066885, "loss": 0.5002, "step": 464 }, { "epoch": 0.2946768060836502, "grad_norm": 0.09338720887899399, "learning_rate": 0.00016124636813039502, "loss": 0.5199, "step": 465 }, { "epoch": 0.2953105196451204, "grad_norm": 0.07024485617876053, "learning_rate": 0.00016108786361015143, "loss": 0.5378, "step": 466 }, { "epoch": 0.29594423320659063, "grad_norm": 0.05211356282234192, "learning_rate": 0.00016092911386621938, "loss": 0.5895, "step": 467 }, { "epoch": 0.2965779467680608, "grad_norm": 0.05571569502353668, "learning_rate": 0.00016077011953586452, "loss": 0.4952, "step": 468 }, { "epoch": 0.29721166032953106, "grad_norm": 0.07663686573505402, "learning_rate": 0.00016061088125733433, "loss": 0.5341, "step": 469 }, { "epoch": 0.29784537389100124, "grad_norm": 0.04910871386528015, "learning_rate": 0.0001604513996698556, "loss": 0.445, "step": 470 }, { "epoch": 0.2984790874524715, "grad_norm": 0.07365076243877411, "learning_rate": 0.0001602916754136318, "loss": 0.5364, "step": 471 }, { "epoch": 0.2991128010139417, "grad_norm": 0.08367875218391418, "learning_rate": 0.00016013170912984058, "loss": 0.5709, "step": 472 }, { "epoch": 0.2997465145754119, "grad_norm": 0.06659605354070663, "learning_rate": 0.00015997150146063115, "loss": 0.5351, "step": 473 }, { "epoch": 0.30038022813688214, "grad_norm": 0.05647695064544678, "learning_rate": 0.00015981105304912162, "loss": 0.4103, "step": 474 } ], "logging_steps": 1, "max_steps": 1578, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 158, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 6.515739834294731e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }