{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 563148, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0026635982015384943, "grad_norm": 0.2271278351545334, "learning_rate": 0.0001996, "loss": 8.7148, "step": 500 }, { "epoch": 0.005327196403076989, "grad_norm": 0.448383092880249, "learning_rate": 0.0003996, "loss": 7.4094, "step": 1000 }, { "epoch": 0.007990794604615483, "grad_norm": 0.46370673179626465, "learning_rate": 0.0005996, "loss": 7.1049, "step": 1500 }, { "epoch": 0.010654392806153977, "grad_norm": 0.7845134735107422, "learning_rate": 0.0007996, "loss": 6.8619, "step": 2000 }, { "epoch": 0.013317991007692471, "grad_norm": 0.7677924036979675, "learning_rate": 0.0009996, "loss": 6.7206, "step": 2500 }, { "epoch": 0.015981589209230967, "grad_norm": 0.7272828817367554, "learning_rate": 0.0009991099584766199, "loss": 6.6171, "step": 3000 }, { "epoch": 0.01864518741076946, "grad_norm": 0.7266383171081543, "learning_rate": 0.0009982181333028923, "loss": 6.4961, "step": 3500 }, { "epoch": 0.021308785612307955, "grad_norm": 0.8149316310882568, "learning_rate": 0.0009973263081291647, "loss": 6.3995, "step": 4000 }, { "epoch": 0.02397238381384645, "grad_norm": 0.8527867794036865, "learning_rate": 0.0009964344829554372, "loss": 6.3342, "step": 4500 }, { "epoch": 0.026635982015384942, "grad_norm": 1.2359241247177124, "learning_rate": 0.0009955444414320573, "loss": 6.2305, "step": 5000 }, { "epoch": 0.029299580216923436, "grad_norm": 1.1131370067596436, "learning_rate": 0.0009946526162583297, "loss": 6.0731, "step": 5500 }, { "epoch": 0.031963178418461934, "grad_norm": 1.185133457183838, "learning_rate": 0.0009937607910846021, "loss": 5.9349, "step": 6000 }, { "epoch": 0.034626776620000424, "grad_norm": 1.201166033744812, "learning_rate": 0.0009928689659108746, "loss": 5.7587, "step": 6500 }, { "epoch": 0.03729037482153892, "grad_norm": 1.2446848154067993, "learning_rate": 0.0009919789243874944, "loss": 5.6453, "step": 7000 }, { "epoch": 0.03995397302307741, "grad_norm": 1.2813904285430908, "learning_rate": 0.0009910870992137668, "loss": 5.5547, "step": 7500 }, { "epoch": 0.04261757122461591, "grad_norm": 0.9883731007575989, "learning_rate": 0.0009901952740400395, "loss": 5.3078, "step": 8000 }, { "epoch": 0.045281169426154406, "grad_norm": 0.9527985453605652, "learning_rate": 0.000989303448866312, "loss": 5.1301, "step": 8500 }, { "epoch": 0.0479447676276929, "grad_norm": 0.9772309064865112, "learning_rate": 0.0009884134073429318, "loss": 5.0381, "step": 9000 }, { "epoch": 0.050608365829231394, "grad_norm": 1.0352524518966675, "learning_rate": 0.0009875215821692042, "loss": 4.9814, "step": 9500 }, { "epoch": 0.053271964030769885, "grad_norm": 0.8517736196517944, "learning_rate": 0.0009866297569954767, "loss": 4.9238, "step": 10000 }, { "epoch": 0.05593556223230838, "grad_norm": 0.9034407138824463, "learning_rate": 0.000985737931821749, "loss": 4.8745, "step": 10500 }, { "epoch": 0.05859916043384687, "grad_norm": 0.8332895636558533, "learning_rate": 0.0009848461066480215, "loss": 4.845, "step": 11000 }, { "epoch": 0.06126275863538537, "grad_norm": 0.8637209534645081, "learning_rate": 0.0009839560651246416, "loss": 4.8014, "step": 11500 }, { "epoch": 0.06392635683692387, "grad_norm": 0.8696839213371277, "learning_rate": 0.000983064239950914, "loss": 4.7803, "step": 12000 }, { "epoch": 0.06658995503846236, "grad_norm": 0.8878291249275208, "learning_rate": 0.0009821724147771865, "loss": 4.7629, "step": 12500 }, { "epoch": 0.06925355324000085, "grad_norm": 0.8268778324127197, "learning_rate": 0.000981280589603459, "loss": 4.7312, "step": 13000 }, { "epoch": 0.07191715144153935, "grad_norm": 0.884635329246521, "learning_rate": 0.0009803887644297313, "loss": 4.7146, "step": 13500 }, { "epoch": 0.07458074964307784, "grad_norm": 0.7639057636260986, "learning_rate": 0.0009794969392560038, "loss": 4.6961, "step": 14000 }, { "epoch": 0.07724434784461634, "grad_norm": 0.8192263245582581, "learning_rate": 0.0009786051140822762, "loss": 4.6766, "step": 14500 }, { "epoch": 0.07990794604615482, "grad_norm": 0.8075643181800842, "learning_rate": 0.0009777132889085486, "loss": 4.6582, "step": 15000 }, { "epoch": 0.08257154424769332, "grad_norm": 0.7193809151649475, "learning_rate": 0.0009768232473851685, "loss": 4.655, "step": 15500 }, { "epoch": 0.08523514244923182, "grad_norm": 0.8761749267578125, "learning_rate": 0.000975931422211441, "loss": 4.6378, "step": 16000 }, { "epoch": 0.08789874065077032, "grad_norm": 0.8616175055503845, "learning_rate": 0.0009750395970377135, "loss": 4.6265, "step": 16500 }, { "epoch": 0.09056233885230881, "grad_norm": 0.8099841475486755, "learning_rate": 0.000974147771863986, "loss": 4.6079, "step": 17000 }, { "epoch": 0.0932259370538473, "grad_norm": 0.811244010925293, "learning_rate": 0.000973257730340606, "loss": 4.5949, "step": 17500 }, { "epoch": 0.0958895352553858, "grad_norm": 0.8826119303703308, "learning_rate": 0.0009723659051668784, "loss": 4.589, "step": 18000 }, { "epoch": 0.09855313345692429, "grad_norm": 0.8135235905647278, "learning_rate": 0.0009714740799931508, "loss": 4.5715, "step": 18500 }, { "epoch": 0.10121673165846279, "grad_norm": 0.8390595316886902, "learning_rate": 0.0009705822548194233, "loss": 4.5581, "step": 19000 }, { "epoch": 0.10388032986000127, "grad_norm": 0.7602077126502991, "learning_rate": 0.0009696922132960431, "loss": 4.5527, "step": 19500 }, { "epoch": 0.10654392806153977, "grad_norm": 0.8945237994194031, "learning_rate": 0.0009688003881223157, "loss": 4.5301, "step": 20000 }, { "epoch": 0.10920752626307827, "grad_norm": 0.6963039040565491, "learning_rate": 0.0009679085629485881, "loss": 4.5186, "step": 20500 }, { "epoch": 0.11187112446461676, "grad_norm": 0.7871098518371582, "learning_rate": 0.0009670167377748605, "loss": 4.5069, "step": 21000 }, { "epoch": 0.11453472266615526, "grad_norm": 0.7853402495384216, "learning_rate": 0.000966124912601133, "loss": 4.4966, "step": 21500 }, { "epoch": 0.11719832086769374, "grad_norm": 0.7557271718978882, "learning_rate": 0.0009652348710777528, "loss": 4.4857, "step": 22000 }, { "epoch": 0.11986191906923224, "grad_norm": 0.7256771326065063, "learning_rate": 0.0009643430459040254, "loss": 4.4756, "step": 22500 }, { "epoch": 0.12252551727077074, "grad_norm": 0.7980550527572632, "learning_rate": 0.0009634512207302978, "loss": 4.4726, "step": 23000 }, { "epoch": 0.12518911547230924, "grad_norm": 0.7480477690696716, "learning_rate": 0.0009625593955565702, "loss": 4.4558, "step": 23500 }, { "epoch": 0.12785271367384773, "grad_norm": 0.7309882044792175, "learning_rate": 0.0009616675703828427, "loss": 4.4546, "step": 24000 }, { "epoch": 0.13051631187538623, "grad_norm": 0.8072414398193359, "learning_rate": 0.0009607775288594626, "loss": 4.4408, "step": 24500 }, { "epoch": 0.13317991007692473, "grad_norm": 0.7929727435112, "learning_rate": 0.0009598857036857352, "loss": 4.4436, "step": 25000 }, { "epoch": 0.1358435082784632, "grad_norm": 0.7073729038238525, "learning_rate": 0.0009589938785120076, "loss": 4.4261, "step": 25500 }, { "epoch": 0.1385071064800017, "grad_norm": 0.7210267782211304, "learning_rate": 0.00095810205333828, "loss": 4.425, "step": 26000 }, { "epoch": 0.1411707046815402, "grad_norm": 0.6783360838890076, "learning_rate": 0.0009572102281645525, "loss": 4.4123, "step": 26500 }, { "epoch": 0.1438343028830787, "grad_norm": 0.7039027214050293, "learning_rate": 0.0009563184029908249, "loss": 4.414, "step": 27000 }, { "epoch": 0.1464979010846172, "grad_norm": 0.7899590730667114, "learning_rate": 0.0009554265778170974, "loss": 4.3951, "step": 27500 }, { "epoch": 0.14916149928615569, "grad_norm": 0.7651330828666687, "learning_rate": 0.0009545347526433699, "loss": 4.3997, "step": 28000 }, { "epoch": 0.15182509748769418, "grad_norm": 0.8091022372245789, "learning_rate": 0.0009536447111199897, "loss": 4.3865, "step": 28500 }, { "epoch": 0.15448869568923268, "grad_norm": 0.7238765954971313, "learning_rate": 0.0009527528859462622, "loss": 4.3845, "step": 29000 }, { "epoch": 0.15715229389077118, "grad_norm": 0.7803590893745422, "learning_rate": 0.0009518610607725346, "loss": 4.3805, "step": 29500 }, { "epoch": 0.15981589209230965, "grad_norm": 0.778491735458374, "learning_rate": 0.0009509692355988071, "loss": 4.3794, "step": 30000 }, { "epoch": 0.16247949029384814, "grad_norm": 0.7399048209190369, "learning_rate": 0.000950079194075427, "loss": 4.3795, "step": 30500 }, { "epoch": 0.16514308849538664, "grad_norm": 0.7823745012283325, "learning_rate": 0.0009491873689016994, "loss": 4.3782, "step": 31000 }, { "epoch": 0.16780668669692514, "grad_norm": 0.7693122029304504, "learning_rate": 0.0009482955437279719, "loss": 4.3612, "step": 31500 }, { "epoch": 0.17047028489846364, "grad_norm": 0.7326549887657166, "learning_rate": 0.0009474037185542443, "loss": 4.3658, "step": 32000 }, { "epoch": 0.17313388310000213, "grad_norm": 0.6827363967895508, "learning_rate": 0.0009465136770308644, "loss": 4.3621, "step": 32500 }, { "epoch": 0.17579748130154063, "grad_norm": 0.7000982761383057, "learning_rate": 0.0009456218518571368, "loss": 4.3566, "step": 33000 }, { "epoch": 0.17846107950307913, "grad_norm": 0.7949216365814209, "learning_rate": 0.0009447300266834092, "loss": 4.349, "step": 33500 }, { "epoch": 0.18112467770461763, "grad_norm": 0.7766338586807251, "learning_rate": 0.0009438382015096817, "loss": 4.3564, "step": 34000 }, { "epoch": 0.1837882759061561, "grad_norm": 0.7235038876533508, "learning_rate": 0.0009429481599863015, "loss": 4.3434, "step": 34500 }, { "epoch": 0.1864518741076946, "grad_norm": 0.7254591584205627, "learning_rate": 0.0009420563348125741, "loss": 4.3352, "step": 35000 }, { "epoch": 0.1891154723092331, "grad_norm": 0.6868504285812378, "learning_rate": 0.0009411645096388465, "loss": 4.34, "step": 35500 }, { "epoch": 0.1917790705107716, "grad_norm": 0.7674193978309631, "learning_rate": 0.0009402726844651189, "loss": 4.3333, "step": 36000 }, { "epoch": 0.19444266871231008, "grad_norm": 0.778035581111908, "learning_rate": 0.0009393826429417389, "loss": 4.3314, "step": 36500 }, { "epoch": 0.19710626691384858, "grad_norm": 0.7400960922241211, "learning_rate": 0.0009384908177680113, "loss": 4.3319, "step": 37000 }, { "epoch": 0.19976986511538708, "grad_norm": 0.7500663995742798, "learning_rate": 0.0009375989925942838, "loss": 4.328, "step": 37500 }, { "epoch": 0.20243346331692558, "grad_norm": 0.683749794960022, "learning_rate": 0.0009367071674205563, "loss": 4.3268, "step": 38000 }, { "epoch": 0.20509706151846407, "grad_norm": 0.7642583250999451, "learning_rate": 0.0009358171258971762, "loss": 4.3269, "step": 38500 }, { "epoch": 0.20776065972000254, "grad_norm": 0.6992856860160828, "learning_rate": 0.0009349253007234486, "loss": 4.3218, "step": 39000 }, { "epoch": 0.21042425792154104, "grad_norm": 0.7553698420524597, "learning_rate": 0.000934033475549721, "loss": 4.3209, "step": 39500 }, { "epoch": 0.21308785612307954, "grad_norm": 0.6873403787612915, "learning_rate": 0.0009331416503759935, "loss": 4.3157, "step": 40000 }, { "epoch": 0.21575145432461804, "grad_norm": 0.7638967633247375, "learning_rate": 0.0009322516088526134, "loss": 4.3163, "step": 40500 }, { "epoch": 0.21841505252615653, "grad_norm": 0.6896612048149109, "learning_rate": 0.0009313597836788859, "loss": 4.3123, "step": 41000 }, { "epoch": 0.22107865072769503, "grad_norm": 0.7294336557388306, "learning_rate": 0.0009304679585051583, "loss": 4.3142, "step": 41500 }, { "epoch": 0.22374224892923353, "grad_norm": 0.7498676776885986, "learning_rate": 0.0009295761333314307, "loss": 4.3038, "step": 42000 }, { "epoch": 0.22640584713077203, "grad_norm": 0.7050178647041321, "learning_rate": 0.0009286860918080507, "loss": 4.2978, "step": 42500 }, { "epoch": 0.22906944533231052, "grad_norm": 0.7527032494544983, "learning_rate": 0.0009277942666343233, "loss": 4.3067, "step": 43000 }, { "epoch": 0.231733043533849, "grad_norm": 0.6919755935668945, "learning_rate": 0.0009269024414605957, "loss": 4.295, "step": 43500 }, { "epoch": 0.2343966417353875, "grad_norm": 0.7255104184150696, "learning_rate": 0.0009260106162868681, "loss": 4.2946, "step": 44000 }, { "epoch": 0.237060239936926, "grad_norm": 0.6978445649147034, "learning_rate": 0.000925120574763488, "loss": 4.2937, "step": 44500 }, { "epoch": 0.23972383813846448, "grad_norm": 0.7008663415908813, "learning_rate": 0.0009242287495897604, "loss": 4.2974, "step": 45000 }, { "epoch": 0.24238743634000298, "grad_norm": 0.704937756061554, "learning_rate": 0.000923336924416033, "loss": 4.2857, "step": 45500 }, { "epoch": 0.24505103454154148, "grad_norm": 0.7343337535858154, "learning_rate": 0.0009224450992423054, "loss": 4.2891, "step": 46000 }, { "epoch": 0.24771463274307998, "grad_norm": 0.7263538241386414, "learning_rate": 0.0009215550577189252, "loss": 4.2895, "step": 46500 }, { "epoch": 0.2503782309446185, "grad_norm": 0.7095937728881836, "learning_rate": 0.0009206632325451977, "loss": 4.2853, "step": 47000 }, { "epoch": 0.25304182914615697, "grad_norm": 0.7221779823303223, "learning_rate": 0.0009197714073714701, "loss": 4.2858, "step": 47500 }, { "epoch": 0.25570542734769547, "grad_norm": 0.7522983551025391, "learning_rate": 0.0009188795821977425, "loss": 4.2795, "step": 48000 }, { "epoch": 0.25836902554923397, "grad_norm": 0.7212731838226318, "learning_rate": 0.0009179895406743626, "loss": 4.2749, "step": 48500 }, { "epoch": 0.26103262375077246, "grad_norm": 0.75824373960495, "learning_rate": 0.000917097715500635, "loss": 4.2738, "step": 49000 }, { "epoch": 0.26369622195231096, "grad_norm": 0.7861409783363342, "learning_rate": 0.0009162058903269075, "loss": 4.2781, "step": 49500 }, { "epoch": 0.26635982015384946, "grad_norm": 0.7585176229476929, "learning_rate": 0.0009153140651531799, "loss": 4.2742, "step": 50000 }, { "epoch": 0.2690234183553879, "grad_norm": 0.7468889951705933, "learning_rate": 0.0009144240236297998, "loss": 4.2779, "step": 50500 }, { "epoch": 0.2716870165569264, "grad_norm": 0.7378383278846741, "learning_rate": 0.0009135321984560723, "loss": 4.2724, "step": 51000 }, { "epoch": 0.2743506147584649, "grad_norm": 0.6867294907569885, "learning_rate": 0.0009126403732823447, "loss": 4.2753, "step": 51500 }, { "epoch": 0.2770142129600034, "grad_norm": 0.6850928068161011, "learning_rate": 0.0009117485481086172, "loss": 4.2718, "step": 52000 }, { "epoch": 0.2796778111615419, "grad_norm": 0.7450153827667236, "learning_rate": 0.000910858506585237, "loss": 4.2711, "step": 52500 }, { "epoch": 0.2823414093630804, "grad_norm": 0.7175604104995728, "learning_rate": 0.0009099666814115095, "loss": 4.2636, "step": 53000 }, { "epoch": 0.2850050075646189, "grad_norm": 0.7004239559173584, "learning_rate": 0.000909074856237782, "loss": 4.273, "step": 53500 }, { "epoch": 0.2876686057661574, "grad_norm": 0.7755109667778015, "learning_rate": 0.0009081830310640544, "loss": 4.262, "step": 54000 }, { "epoch": 0.2903322039676959, "grad_norm": 0.7420957684516907, "learning_rate": 0.0009072929895406744, "loss": 4.2703, "step": 54500 }, { "epoch": 0.2929958021692344, "grad_norm": 0.7163523435592651, "learning_rate": 0.0009064011643669468, "loss": 4.265, "step": 55000 }, { "epoch": 0.2956594003707729, "grad_norm": 0.7003483176231384, "learning_rate": 0.0009055093391932193, "loss": 4.2529, "step": 55500 }, { "epoch": 0.29832299857231137, "grad_norm": 0.7118489742279053, "learning_rate": 0.0009046175140194918, "loss": 4.2556, "step": 56000 }, { "epoch": 0.30098659677384987, "grad_norm": 0.7034066319465637, "learning_rate": 0.0009037274724961117, "loss": 4.2547, "step": 56500 }, { "epoch": 0.30365019497538837, "grad_norm": 0.6700213551521301, "learning_rate": 0.0009028356473223841, "loss": 4.2561, "step": 57000 }, { "epoch": 0.30631379317692686, "grad_norm": 0.738164484500885, "learning_rate": 0.0009019438221486565, "loss": 4.26, "step": 57500 }, { "epoch": 0.30897739137846536, "grad_norm": 0.7396353483200073, "learning_rate": 0.000901051996974929, "loss": 4.2562, "step": 58000 }, { "epoch": 0.31164098958000386, "grad_norm": 0.7478146553039551, "learning_rate": 0.0009001619554515488, "loss": 4.25, "step": 58500 }, { "epoch": 0.31430458778154235, "grad_norm": 0.7298335433006287, "learning_rate": 0.0008992701302778215, "loss": 4.2562, "step": 59000 }, { "epoch": 0.3169681859830808, "grad_norm": 0.7685016989707947, "learning_rate": 0.0008983783051040939, "loss": 4.2551, "step": 59500 }, { "epoch": 0.3196317841846193, "grad_norm": 0.8017458915710449, "learning_rate": 0.0008974864799303664, "loss": 4.2481, "step": 60000 }, { "epoch": 0.3222953823861578, "grad_norm": 0.7588088512420654, "learning_rate": 0.0008965964384069862, "loss": 4.2537, "step": 60500 }, { "epoch": 0.3249589805876963, "grad_norm": 0.7897168397903442, "learning_rate": 0.0008957046132332586, "loss": 4.2427, "step": 61000 }, { "epoch": 0.3276225787892348, "grad_norm": 0.7311574220657349, "learning_rate": 0.0008948127880595312, "loss": 4.2518, "step": 61500 }, { "epoch": 0.3302861769907733, "grad_norm": 0.7892371416091919, "learning_rate": 0.0008939209628858036, "loss": 4.234, "step": 62000 }, { "epoch": 0.3329497751923118, "grad_norm": 0.6944438815116882, "learning_rate": 0.0008930309213624235, "loss": 4.2382, "step": 62500 }, { "epoch": 0.3356133733938503, "grad_norm": 0.7701837420463562, "learning_rate": 0.0008921390961886959, "loss": 4.2474, "step": 63000 }, { "epoch": 0.3382769715953888, "grad_norm": 0.7789635062217712, "learning_rate": 0.0008912472710149683, "loss": 4.2379, "step": 63500 }, { "epoch": 0.3409405697969273, "grad_norm": 0.7212055921554565, "learning_rate": 0.0008903554458412409, "loss": 4.2407, "step": 64000 }, { "epoch": 0.34360416799846577, "grad_norm": 0.7439520359039307, "learning_rate": 0.0008894654043178609, "loss": 4.2386, "step": 64500 }, { "epoch": 0.34626776620000427, "grad_norm": 0.6747229695320129, "learning_rate": 0.0008885735791441333, "loss": 4.2391, "step": 65000 }, { "epoch": 0.34893136440154277, "grad_norm": 0.7761566638946533, "learning_rate": 0.0008876817539704057, "loss": 4.2337, "step": 65500 }, { "epoch": 0.35159496260308126, "grad_norm": 0.7024859189987183, "learning_rate": 0.0008867899287966782, "loss": 4.2299, "step": 66000 }, { "epoch": 0.35425856080461976, "grad_norm": 0.7179946303367615, "learning_rate": 0.000885899887273298, "loss": 4.2379, "step": 66500 }, { "epoch": 0.35692215900615826, "grad_norm": 0.699834942817688, "learning_rate": 0.0008850080620995706, "loss": 4.2321, "step": 67000 }, { "epoch": 0.35958575720769675, "grad_norm": 0.6902332901954651, "learning_rate": 0.000884116236925843, "loss": 4.2376, "step": 67500 }, { "epoch": 0.36224935540923525, "grad_norm": 0.7003384232521057, "learning_rate": 0.0008832244117521154, "loss": 4.2261, "step": 68000 }, { "epoch": 0.36491295361077375, "grad_norm": 0.7879477739334106, "learning_rate": 0.0008823343702287353, "loss": 4.2292, "step": 68500 }, { "epoch": 0.3675765518123122, "grad_norm": 0.6793246269226074, "learning_rate": 0.0008814425450550077, "loss": 4.2342, "step": 69000 }, { "epoch": 0.3702401500138507, "grad_norm": 0.7284209728240967, "learning_rate": 0.0008805507198812803, "loss": 4.2276, "step": 69500 }, { "epoch": 0.3729037482153892, "grad_norm": 0.7192456722259521, "learning_rate": 0.0008796588947075527, "loss": 4.2248, "step": 70000 }, { "epoch": 0.3755673464169277, "grad_norm": 0.7695698738098145, "learning_rate": 0.0008787688531841727, "loss": 4.2276, "step": 70500 }, { "epoch": 0.3782309446184662, "grad_norm": 0.740368664264679, "learning_rate": 0.0008778770280104451, "loss": 4.2286, "step": 71000 }, { "epoch": 0.3808945428200047, "grad_norm": 0.7393242716789246, "learning_rate": 0.0008769852028367175, "loss": 4.2239, "step": 71500 }, { "epoch": 0.3835581410215432, "grad_norm": 0.7269551157951355, "learning_rate": 0.0008760933776629901, "loss": 4.2196, "step": 72000 }, { "epoch": 0.3862217392230817, "grad_norm": 0.6773830056190491, "learning_rate": 0.0008752033361396099, "loss": 4.2283, "step": 72500 }, { "epoch": 0.38888533742462017, "grad_norm": 0.7091046571731567, "learning_rate": 0.0008743115109658824, "loss": 4.2252, "step": 73000 }, { "epoch": 0.39154893562615867, "grad_norm": 0.7202826738357544, "learning_rate": 0.0008734196857921548, "loss": 4.2102, "step": 73500 }, { "epoch": 0.39421253382769716, "grad_norm": 0.6965381503105164, "learning_rate": 0.0008725278606184272, "loss": 4.222, "step": 74000 }, { "epoch": 0.39687613202923566, "grad_norm": 0.7711541652679443, "learning_rate": 0.0008716378190950471, "loss": 4.2138, "step": 74500 }, { "epoch": 0.39953973023077416, "grad_norm": 0.6982942223548889, "learning_rate": 0.0008707459939213196, "loss": 4.2209, "step": 75000 }, { "epoch": 0.40220332843231266, "grad_norm": 0.700356662273407, "learning_rate": 0.0008698541687475921, "loss": 4.2153, "step": 75500 }, { "epoch": 0.40486692663385115, "grad_norm": 0.7417271137237549, "learning_rate": 0.0008689623435738645, "loss": 4.216, "step": 76000 }, { "epoch": 0.40753052483538965, "grad_norm": 0.7237849235534668, "learning_rate": 0.0008680723020504845, "loss": 4.2172, "step": 76500 }, { "epoch": 0.41019412303692815, "grad_norm": 0.7940893769264221, "learning_rate": 0.0008671804768767569, "loss": 4.2224, "step": 77000 }, { "epoch": 0.41285772123846665, "grad_norm": 0.7201411724090576, "learning_rate": 0.0008662886517030294, "loss": 4.2203, "step": 77500 }, { "epoch": 0.4155213194400051, "grad_norm": 0.7360599637031555, "learning_rate": 0.0008653968265293019, "loss": 4.2208, "step": 78000 }, { "epoch": 0.4181849176415436, "grad_norm": 0.7827675938606262, "learning_rate": 0.0008645067850059217, "loss": 4.2095, "step": 78500 }, { "epoch": 0.4208485158430821, "grad_norm": 0.7322735786437988, "learning_rate": 0.0008636149598321942, "loss": 4.2085, "step": 79000 }, { "epoch": 0.4235121140446206, "grad_norm": 0.6896507740020752, "learning_rate": 0.0008627231346584666, "loss": 4.2045, "step": 79500 }, { "epoch": 0.4261757122461591, "grad_norm": 0.780642569065094, "learning_rate": 0.0008618313094847391, "loss": 4.2157, "step": 80000 }, { "epoch": 0.4288393104476976, "grad_norm": 0.717087984085083, "learning_rate": 0.000860941267961359, "loss": 4.208, "step": 80500 }, { "epoch": 0.43150290864923607, "grad_norm": 0.7145330309867859, "learning_rate": 0.0008600494427876314, "loss": 4.2128, "step": 81000 }, { "epoch": 0.43416650685077457, "grad_norm": 0.7336823344230652, "learning_rate": 0.0008591576176139039, "loss": 4.2124, "step": 81500 }, { "epoch": 0.43683010505231307, "grad_norm": 0.6869795322418213, "learning_rate": 0.0008582657924401764, "loss": 4.2103, "step": 82000 }, { "epoch": 0.43949370325385156, "grad_norm": 0.7188379168510437, "learning_rate": 0.0008573757509167964, "loss": 4.2084, "step": 82500 }, { "epoch": 0.44215730145539006, "grad_norm": 0.7271597981452942, "learning_rate": 0.0008564839257430688, "loss": 4.2087, "step": 83000 }, { "epoch": 0.44482089965692856, "grad_norm": 0.7935476303100586, "learning_rate": 0.0008555921005693412, "loss": 4.199, "step": 83500 }, { "epoch": 0.44748449785846706, "grad_norm": 0.732509195804596, "learning_rate": 0.0008547002753956137, "loss": 4.2014, "step": 84000 }, { "epoch": 0.45014809606000555, "grad_norm": 0.7381872534751892, "learning_rate": 0.0008538102338722335, "loss": 4.2078, "step": 84500 }, { "epoch": 0.45281169426154405, "grad_norm": 0.697894811630249, "learning_rate": 0.0008529184086985061, "loss": 4.1978, "step": 85000 }, { "epoch": 0.45547529246308255, "grad_norm": 0.715933084487915, "learning_rate": 0.0008520265835247785, "loss": 4.205, "step": 85500 }, { "epoch": 0.45813889066462105, "grad_norm": 0.7199248671531677, "learning_rate": 0.0008511347583510509, "loss": 4.201, "step": 86000 }, { "epoch": 0.46080248886615954, "grad_norm": 0.7358156442642212, "learning_rate": 0.0008502447168276709, "loss": 4.2025, "step": 86500 }, { "epoch": 0.463466087067698, "grad_norm": 0.8218105435371399, "learning_rate": 0.0008493528916539433, "loss": 4.2017, "step": 87000 }, { "epoch": 0.4661296852692365, "grad_norm": 0.77776700258255, "learning_rate": 0.0008484610664802158, "loss": 4.1905, "step": 87500 }, { "epoch": 0.468793283470775, "grad_norm": 0.6795767545700073, "learning_rate": 0.0008475692413064883, "loss": 4.1913, "step": 88000 }, { "epoch": 0.4714568816723135, "grad_norm": 0.7476922869682312, "learning_rate": 0.0008466791997831082, "loss": 4.1935, "step": 88500 }, { "epoch": 0.474120479873852, "grad_norm": 0.7420318722724915, "learning_rate": 0.0008457873746093806, "loss": 4.1989, "step": 89000 }, { "epoch": 0.47678407807539047, "grad_norm": 0.677543044090271, "learning_rate": 0.000844895549435653, "loss": 4.1921, "step": 89500 }, { "epoch": 0.47944767627692897, "grad_norm": 0.7159215211868286, "learning_rate": 0.0008440037242619255, "loss": 4.1935, "step": 90000 }, { "epoch": 0.48211127447846747, "grad_norm": 0.7259414792060852, "learning_rate": 0.0008431136827385454, "loss": 4.2041, "step": 90500 }, { "epoch": 0.48477487268000596, "grad_norm": 0.6838536262512207, "learning_rate": 0.0008422218575648179, "loss": 4.1954, "step": 91000 }, { "epoch": 0.48743847088154446, "grad_norm": 0.6978190541267395, "learning_rate": 0.0008413300323910903, "loss": 4.1944, "step": 91500 }, { "epoch": 0.49010206908308296, "grad_norm": 0.7434132695198059, "learning_rate": 0.0008404382072173627, "loss": 4.1932, "step": 92000 }, { "epoch": 0.49276566728462146, "grad_norm": 0.6992717981338501, "learning_rate": 0.0008395481656939827, "loss": 4.1963, "step": 92500 }, { "epoch": 0.49542926548615995, "grad_norm": 0.7276673316955566, "learning_rate": 0.0008386563405202552, "loss": 4.1967, "step": 93000 }, { "epoch": 0.49809286368769845, "grad_norm": 0.7243706583976746, "learning_rate": 0.0008377645153465277, "loss": 4.1938, "step": 93500 }, { "epoch": 0.500756461889237, "grad_norm": 0.7238306999206543, "learning_rate": 0.0008368726901728001, "loss": 4.1944, "step": 94000 }, { "epoch": 0.5034200600907754, "grad_norm": 0.7251293063163757, "learning_rate": 0.00083598264864942, "loss": 4.187, "step": 94500 }, { "epoch": 0.5060836582923139, "grad_norm": 0.6981387734413147, "learning_rate": 0.0008350908234756924, "loss": 4.1942, "step": 95000 }, { "epoch": 0.5087472564938524, "grad_norm": 0.7512865662574768, "learning_rate": 0.0008341989983019649, "loss": 4.1896, "step": 95500 }, { "epoch": 0.5114108546953909, "grad_norm": 0.76689213514328, "learning_rate": 0.0008333071731282374, "loss": 4.1895, "step": 96000 }, { "epoch": 0.5140744528969294, "grad_norm": 0.7794478535652161, "learning_rate": 0.0008324171316048572, "loss": 4.1877, "step": 96500 }, { "epoch": 0.5167380510984679, "grad_norm": 0.7624120712280273, "learning_rate": 0.0008315253064311297, "loss": 4.1905, "step": 97000 }, { "epoch": 0.5194016493000064, "grad_norm": 0.812703549861908, "learning_rate": 0.0008306334812574021, "loss": 4.1918, "step": 97500 }, { "epoch": 0.5220652475015449, "grad_norm": 0.7445054054260254, "learning_rate": 0.0008297416560836745, "loss": 4.1932, "step": 98000 }, { "epoch": 0.5247288457030834, "grad_norm": 0.6916468143463135, "learning_rate": 0.0008288498309099471, "loss": 4.1927, "step": 98500 }, { "epoch": 0.5273924439046219, "grad_norm": 0.7391178011894226, "learning_rate": 0.000827959789386567, "loss": 4.1822, "step": 99000 }, { "epoch": 0.5300560421061604, "grad_norm": 0.7245861887931824, "learning_rate": 0.0008270679642128395, "loss": 4.1897, "step": 99500 }, { "epoch": 0.5327196403076989, "grad_norm": 0.7156808376312256, "learning_rate": 0.0008261761390391119, "loss": 4.186, "step": 100000 }, { "epoch": 0.5353832385092374, "grad_norm": 0.7185246348381042, "learning_rate": 0.0008252843138653843, "loss": 4.182, "step": 100500 }, { "epoch": 0.5380468367107758, "grad_norm": 0.7230123281478882, "learning_rate": 0.0008243942723420043, "loss": 4.1888, "step": 101000 }, { "epoch": 0.5407104349123143, "grad_norm": 0.6807687282562256, "learning_rate": 0.0008235024471682767, "loss": 4.1757, "step": 101500 }, { "epoch": 0.5433740331138528, "grad_norm": 0.6942833065986633, "learning_rate": 0.0008226106219945492, "loss": 4.1818, "step": 102000 }, { "epoch": 0.5460376313153913, "grad_norm": 0.7553761601448059, "learning_rate": 0.0008217187968208216, "loss": 4.1876, "step": 102500 }, { "epoch": 0.5487012295169298, "grad_norm": 0.8295273184776306, "learning_rate": 0.0008208287552974415, "loss": 4.1763, "step": 103000 }, { "epoch": 0.5513648277184683, "grad_norm": 0.7182528972625732, "learning_rate": 0.000819936930123714, "loss": 4.1867, "step": 103500 }, { "epoch": 0.5540284259200068, "grad_norm": 0.7191228270530701, "learning_rate": 0.0008190451049499864, "loss": 4.1822, "step": 104000 }, { "epoch": 0.5566920241215453, "grad_norm": 0.7880285382270813, "learning_rate": 0.0008181532797762589, "loss": 4.178, "step": 104500 }, { "epoch": 0.5593556223230838, "grad_norm": 0.7537713050842285, "learning_rate": 0.0008172632382528788, "loss": 4.1865, "step": 105000 }, { "epoch": 0.5620192205246223, "grad_norm": 0.7707012891769409, "learning_rate": 0.0008163714130791513, "loss": 4.1847, "step": 105500 }, { "epoch": 0.5646828187261608, "grad_norm": 0.7433204054832458, "learning_rate": 0.0008154795879054238, "loss": 4.1778, "step": 106000 }, { "epoch": 0.5673464169276993, "grad_norm": 0.760553240776062, "learning_rate": 0.0008145877627316962, "loss": 4.1804, "step": 106500 }, { "epoch": 0.5700100151292378, "grad_norm": 0.744844913482666, "learning_rate": 0.0008136977212083161, "loss": 4.1809, "step": 107000 }, { "epoch": 0.5726736133307763, "grad_norm": 0.7252081036567688, "learning_rate": 0.0008128058960345885, "loss": 4.1731, "step": 107500 }, { "epoch": 0.5753372115323148, "grad_norm": 0.6822036504745483, "learning_rate": 0.000811914070860861, "loss": 4.1799, "step": 108000 }, { "epoch": 0.5780008097338533, "grad_norm": 0.7590454816818237, "learning_rate": 0.0008110222456871334, "loss": 4.1771, "step": 108500 }, { "epoch": 0.5806644079353918, "grad_norm": 0.7851970791816711, "learning_rate": 0.0008101322041637535, "loss": 4.1762, "step": 109000 }, { "epoch": 0.5833280061369303, "grad_norm": 0.7638763785362244, "learning_rate": 0.0008092403789900259, "loss": 4.1699, "step": 109500 }, { "epoch": 0.5859916043384688, "grad_norm": 0.7190741896629333, "learning_rate": 0.0008083485538162983, "loss": 4.181, "step": 110000 }, { "epoch": 0.5886552025400072, "grad_norm": 0.8082555532455444, "learning_rate": 0.0008074567286425708, "loss": 4.1711, "step": 110500 }, { "epoch": 0.5913188007415457, "grad_norm": 0.7326035499572754, "learning_rate": 0.0008065666871191906, "loss": 4.1743, "step": 111000 }, { "epoch": 0.5939823989430842, "grad_norm": 0.7412554621696472, "learning_rate": 0.0008056748619454632, "loss": 4.1761, "step": 111500 }, { "epoch": 0.5966459971446227, "grad_norm": 0.6986061930656433, "learning_rate": 0.0008047830367717356, "loss": 4.1788, "step": 112000 }, { "epoch": 0.5993095953461612, "grad_norm": 0.8155457973480225, "learning_rate": 0.000803891211598008, "loss": 4.1801, "step": 112500 }, { "epoch": 0.6019731935476997, "grad_norm": 0.7332949042320251, "learning_rate": 0.0008030011700746279, "loss": 4.1678, "step": 113000 }, { "epoch": 0.6046367917492382, "grad_norm": 0.8117866516113281, "learning_rate": 0.0008021093449009003, "loss": 4.1781, "step": 113500 }, { "epoch": 0.6073003899507767, "grad_norm": 0.7188646197319031, "learning_rate": 0.0008012175197271729, "loss": 4.1702, "step": 114000 }, { "epoch": 0.6099639881523152, "grad_norm": 0.7319905757904053, "learning_rate": 0.0008003256945534453, "loss": 4.1709, "step": 114500 }, { "epoch": 0.6126275863538537, "grad_norm": 0.7118169069290161, "learning_rate": 0.0007994356530300653, "loss": 4.1709, "step": 115000 }, { "epoch": 0.6152911845553922, "grad_norm": 0.7694860696792603, "learning_rate": 0.0007985438278563377, "loss": 4.1723, "step": 115500 }, { "epoch": 0.6179547827569307, "grad_norm": 0.7366968989372253, "learning_rate": 0.0007976520026826101, "loss": 4.1676, "step": 116000 }, { "epoch": 0.6206183809584692, "grad_norm": 0.7481387257575989, "learning_rate": 0.0007967601775088827, "loss": 4.1729, "step": 116500 }, { "epoch": 0.6232819791600077, "grad_norm": 0.7446570992469788, "learning_rate": 0.0007958701359855025, "loss": 4.1657, "step": 117000 }, { "epoch": 0.6259455773615462, "grad_norm": 0.7612956166267395, "learning_rate": 0.000794978310811775, "loss": 4.1685, "step": 117500 }, { "epoch": 0.6286091755630847, "grad_norm": 0.7427545189857483, "learning_rate": 0.0007940864856380474, "loss": 4.1685, "step": 118000 }, { "epoch": 0.6312727737646232, "grad_norm": 0.7789895534515381, "learning_rate": 0.0007931946604643198, "loss": 4.1726, "step": 118500 }, { "epoch": 0.6339363719661616, "grad_norm": 0.751118540763855, "learning_rate": 0.0007923046189409397, "loss": 4.1693, "step": 119000 }, { "epoch": 0.6365999701677001, "grad_norm": 0.8121469616889954, "learning_rate": 0.0007914127937672122, "loss": 4.1667, "step": 119500 }, { "epoch": 0.6392635683692386, "grad_norm": 0.7127716541290283, "learning_rate": 0.0007905209685934847, "loss": 4.1604, "step": 120000 }, { "epoch": 0.6419271665707771, "grad_norm": 0.7496224045753479, "learning_rate": 0.0007896291434197571, "loss": 4.1655, "step": 120500 }, { "epoch": 0.6445907647723156, "grad_norm": 0.7957298755645752, "learning_rate": 0.0007887391018963771, "loss": 4.1685, "step": 121000 }, { "epoch": 0.6472543629738541, "grad_norm": 0.708066463470459, "learning_rate": 0.0007878472767226495, "loss": 4.1684, "step": 121500 }, { "epoch": 0.6499179611753926, "grad_norm": 0.8204523324966431, "learning_rate": 0.000786955451548922, "loss": 4.1685, "step": 122000 }, { "epoch": 0.6525815593769311, "grad_norm": 0.7236646413803101, "learning_rate": 0.0007860636263751945, "loss": 4.1692, "step": 122500 }, { "epoch": 0.6552451575784696, "grad_norm": 0.7952857613563538, "learning_rate": 0.0007851735848518143, "loss": 4.1623, "step": 123000 }, { "epoch": 0.6579087557800081, "grad_norm": 0.7337407469749451, "learning_rate": 0.0007842817596780868, "loss": 4.1675, "step": 123500 }, { "epoch": 0.6605723539815466, "grad_norm": 0.740993082523346, "learning_rate": 0.0007833899345043592, "loss": 4.1643, "step": 124000 }, { "epoch": 0.6632359521830851, "grad_norm": 0.7212578654289246, "learning_rate": 0.0007824981093306317, "loss": 4.1656, "step": 124500 }, { "epoch": 0.6658995503846236, "grad_norm": 0.7532219886779785, "learning_rate": 0.0007816080678072516, "loss": 4.1682, "step": 125000 }, { "epoch": 0.6685631485861621, "grad_norm": 0.759222686290741, "learning_rate": 0.000780716242633524, "loss": 4.165, "step": 125500 }, { "epoch": 0.6712267467877006, "grad_norm": 0.7389349937438965, "learning_rate": 0.0007798244174597965, "loss": 4.1623, "step": 126000 }, { "epoch": 0.673890344989239, "grad_norm": 0.7558398246765137, "learning_rate": 0.0007789325922860689, "loss": 4.165, "step": 126500 }, { "epoch": 0.6765539431907776, "grad_norm": 0.778786838054657, "learning_rate": 0.0007780425507626889, "loss": 4.1636, "step": 127000 }, { "epoch": 0.679217541392316, "grad_norm": 0.7308077812194824, "learning_rate": 0.0007771507255889614, "loss": 4.1609, "step": 127500 }, { "epoch": 0.6818811395938545, "grad_norm": 0.7642717361450195, "learning_rate": 0.0007762589004152338, "loss": 4.1623, "step": 128000 }, { "epoch": 0.684544737795393, "grad_norm": 0.7278922200202942, "learning_rate": 0.0007753670752415063, "loss": 4.1636, "step": 128500 }, { "epoch": 0.6872083359969315, "grad_norm": 0.7422888278961182, "learning_rate": 0.0007744770337181261, "loss": 4.1542, "step": 129000 }, { "epoch": 0.68987193419847, "grad_norm": 0.7136949896812439, "learning_rate": 0.0007735852085443986, "loss": 4.1579, "step": 129500 }, { "epoch": 0.6925355324000085, "grad_norm": 0.7696181535720825, "learning_rate": 0.0007726933833706711, "loss": 4.1615, "step": 130000 }, { "epoch": 0.695199130601547, "grad_norm": 0.7375788688659668, "learning_rate": 0.0007718015581969435, "loss": 4.1625, "step": 130500 }, { "epoch": 0.6978627288030855, "grad_norm": 0.7175765037536621, "learning_rate": 0.0007709115166735635, "loss": 4.1562, "step": 131000 }, { "epoch": 0.700526327004624, "grad_norm": 0.7179591655731201, "learning_rate": 0.000770019691499836, "loss": 4.1604, "step": 131500 }, { "epoch": 0.7031899252061625, "grad_norm": 0.7693660259246826, "learning_rate": 0.0007691278663261084, "loss": 4.1623, "step": 132000 }, { "epoch": 0.705853523407701, "grad_norm": 0.7547662854194641, "learning_rate": 0.0007682360411523809, "loss": 4.1604, "step": 132500 }, { "epoch": 0.7085171216092395, "grad_norm": 0.7436234951019287, "learning_rate": 0.0007673459996290008, "loss": 4.159, "step": 133000 }, { "epoch": 0.711180719810778, "grad_norm": 0.7248745560646057, "learning_rate": 0.0007664541744552732, "loss": 4.155, "step": 133500 }, { "epoch": 0.7138443180123165, "grad_norm": 0.7338257431983948, "learning_rate": 0.0007655623492815456, "loss": 4.1573, "step": 134000 }, { "epoch": 0.716507916213855, "grad_norm": 0.7636457085609436, "learning_rate": 0.0007646705241078181, "loss": 4.1568, "step": 134500 }, { "epoch": 0.7191715144153935, "grad_norm": 0.7198740243911743, "learning_rate": 0.000763780482584438, "loss": 4.1597, "step": 135000 }, { "epoch": 0.721835112616932, "grad_norm": 0.7390605807304382, "learning_rate": 0.0007628886574107105, "loss": 4.1471, "step": 135500 }, { "epoch": 0.7244987108184705, "grad_norm": 0.7730891108512878, "learning_rate": 0.0007619968322369829, "loss": 4.1518, "step": 136000 }, { "epoch": 0.727162309020009, "grad_norm": 0.7512543797492981, "learning_rate": 0.0007611050070632553, "loss": 4.1602, "step": 136500 }, { "epoch": 0.7298259072215475, "grad_norm": 0.7366748452186584, "learning_rate": 0.0007602149655398753, "loss": 4.1583, "step": 137000 }, { "epoch": 0.7324895054230859, "grad_norm": 0.7468605041503906, "learning_rate": 0.0007593231403661477, "loss": 4.1535, "step": 137500 }, { "epoch": 0.7351531036246244, "grad_norm": 0.7176985144615173, "learning_rate": 0.0007584313151924203, "loss": 4.1525, "step": 138000 }, { "epoch": 0.7378167018261629, "grad_norm": 0.7422710657119751, "learning_rate": 0.0007575394900186927, "loss": 4.1507, "step": 138500 }, { "epoch": 0.7404803000277014, "grad_norm": 0.7459094524383545, "learning_rate": 0.0007566494484953126, "loss": 4.1541, "step": 139000 }, { "epoch": 0.7431438982292399, "grad_norm": 0.7306596636772156, "learning_rate": 0.000755757623321585, "loss": 4.1502, "step": 139500 }, { "epoch": 0.7458074964307784, "grad_norm": 0.7191296219825745, "learning_rate": 0.0007548657981478574, "loss": 4.1483, "step": 140000 }, { "epoch": 0.7484710946323169, "grad_norm": 0.7819980382919312, "learning_rate": 0.00075397397297413, "loss": 4.1589, "step": 140500 }, { "epoch": 0.7511346928338554, "grad_norm": 0.7624921202659607, "learning_rate": 0.0007530839314507498, "loss": 4.1531, "step": 141000 }, { "epoch": 0.7537982910353939, "grad_norm": 0.7341359257698059, "learning_rate": 0.0007521921062770223, "loss": 4.1514, "step": 141500 }, { "epoch": 0.7564618892369324, "grad_norm": 0.7539492249488831, "learning_rate": 0.0007513002811032947, "loss": 4.153, "step": 142000 }, { "epoch": 0.7591254874384709, "grad_norm": 0.7897160053253174, "learning_rate": 0.0007504084559295671, "loss": 4.1462, "step": 142500 }, { "epoch": 0.7617890856400094, "grad_norm": 0.7714428901672363, "learning_rate": 0.0007495184144061872, "loss": 4.1436, "step": 143000 }, { "epoch": 0.7644526838415479, "grad_norm": 0.8038801550865173, "learning_rate": 0.0007486265892324597, "loss": 4.1506, "step": 143500 }, { "epoch": 0.7671162820430864, "grad_norm": 0.7296925187110901, "learning_rate": 0.0007477347640587321, "loss": 4.1493, "step": 144000 }, { "epoch": 0.7697798802446248, "grad_norm": 0.7423230409622192, "learning_rate": 0.0007468429388850045, "loss": 4.1464, "step": 144500 }, { "epoch": 0.7724434784461633, "grad_norm": 0.7713762521743774, "learning_rate": 0.0007459528973616244, "loss": 4.151, "step": 145000 }, { "epoch": 0.7751070766477018, "grad_norm": 0.7986962199211121, "learning_rate": 0.0007450610721878969, "loss": 4.1448, "step": 145500 }, { "epoch": 0.7777706748492403, "grad_norm": 0.794867217540741, "learning_rate": 0.0007441692470141694, "loss": 4.1523, "step": 146000 }, { "epoch": 0.7804342730507788, "grad_norm": 0.7599649429321289, "learning_rate": 0.0007432774218404418, "loss": 4.1454, "step": 146500 }, { "epoch": 0.7830978712523173, "grad_norm": 0.7340590357780457, "learning_rate": 0.0007423873803170616, "loss": 4.144, "step": 147000 }, { "epoch": 0.7857614694538558, "grad_norm": 0.7674250602722168, "learning_rate": 0.0007414955551433341, "loss": 4.1502, "step": 147500 }, { "epoch": 0.7884250676553943, "grad_norm": 0.7552058696746826, "learning_rate": 0.0007406037299696065, "loss": 4.1453, "step": 148000 }, { "epoch": 0.7910886658569328, "grad_norm": 0.7295849323272705, "learning_rate": 0.0007397119047958791, "loss": 4.1506, "step": 148500 }, { "epoch": 0.7937522640584713, "grad_norm": 0.754206120967865, "learning_rate": 0.000738821863272499, "loss": 4.1452, "step": 149000 }, { "epoch": 0.7964158622600098, "grad_norm": 0.8196142911911011, "learning_rate": 0.0007379300380987715, "loss": 4.153, "step": 149500 }, { "epoch": 0.7990794604615483, "grad_norm": 0.7535151243209839, "learning_rate": 0.0007370382129250439, "loss": 4.1493, "step": 150000 }, { "epoch": 0.8017430586630868, "grad_norm": 0.8634600043296814, "learning_rate": 0.0007361463877513163, "loss": 4.1483, "step": 150500 }, { "epoch": 0.8044066568646253, "grad_norm": 0.7539383769035339, "learning_rate": 0.0007352563462279363, "loss": 4.1511, "step": 151000 }, { "epoch": 0.8070702550661638, "grad_norm": 0.7170119881629944, "learning_rate": 0.0007343645210542087, "loss": 4.1504, "step": 151500 }, { "epoch": 0.8097338532677023, "grad_norm": 0.7679442763328552, "learning_rate": 0.0007334726958804812, "loss": 4.1455, "step": 152000 }, { "epoch": 0.8123974514692408, "grad_norm": 0.7368362545967102, "learning_rate": 0.0007325808707067536, "loss": 4.1481, "step": 152500 }, { "epoch": 0.8150610496707793, "grad_norm": 0.7174336910247803, "learning_rate": 0.000731689045533026, "loss": 4.1451, "step": 153000 }, { "epoch": 0.8177246478723178, "grad_norm": 0.7762460708618164, "learning_rate": 0.0007307990040096461, "loss": 4.1437, "step": 153500 }, { "epoch": 0.8203882460738563, "grad_norm": 0.6886820197105408, "learning_rate": 0.0007299071788359185, "loss": 4.1429, "step": 154000 }, { "epoch": 0.8230518442753948, "grad_norm": 0.7819857597351074, "learning_rate": 0.000729015353662191, "loss": 4.1408, "step": 154500 }, { "epoch": 0.8257154424769333, "grad_norm": 0.78780198097229, "learning_rate": 0.0007281235284884634, "loss": 4.147, "step": 155000 }, { "epoch": 0.8283790406784717, "grad_norm": 0.7623980045318604, "learning_rate": 0.0007272334869650833, "loss": 4.1449, "step": 155500 }, { "epoch": 0.8310426388800102, "grad_norm": 0.7452903389930725, "learning_rate": 0.0007263416617913558, "loss": 4.1444, "step": 156000 }, { "epoch": 0.8337062370815487, "grad_norm": 0.7188674807548523, "learning_rate": 0.0007254498366176282, "loss": 4.1378, "step": 156500 }, { "epoch": 0.8363698352830872, "grad_norm": 0.7653003931045532, "learning_rate": 0.0007245580114439007, "loss": 4.1454, "step": 157000 }, { "epoch": 0.8390334334846257, "grad_norm": 0.7343904376029968, "learning_rate": 0.0007236679699205205, "loss": 4.1479, "step": 157500 }, { "epoch": 0.8416970316861642, "grad_norm": 0.7688188552856445, "learning_rate": 0.000722776144746793, "loss": 4.1353, "step": 158000 }, { "epoch": 0.8443606298877027, "grad_norm": 0.7669944167137146, "learning_rate": 0.0007218843195730654, "loss": 4.1369, "step": 158500 }, { "epoch": 0.8470242280892412, "grad_norm": 0.7605074048042297, "learning_rate": 0.0007209924943993379, "loss": 4.1446, "step": 159000 }, { "epoch": 0.8496878262907797, "grad_norm": 0.7343530058860779, "learning_rate": 0.0007201024528759579, "loss": 4.1409, "step": 159500 }, { "epoch": 0.8523514244923182, "grad_norm": 0.7942246198654175, "learning_rate": 0.0007192106277022303, "loss": 4.144, "step": 160000 }, { "epoch": 0.8550150226938567, "grad_norm": 0.7736623287200928, "learning_rate": 0.0007183188025285028, "loss": 4.141, "step": 160500 }, { "epoch": 0.8576786208953951, "grad_norm": 0.7663691639900208, "learning_rate": 0.0007174269773547752, "loss": 4.1434, "step": 161000 }, { "epoch": 0.8603422190969336, "grad_norm": 0.7635341286659241, "learning_rate": 0.0007165369358313952, "loss": 4.1439, "step": 161500 }, { "epoch": 0.8630058172984721, "grad_norm": 0.797211766242981, "learning_rate": 0.0007156451106576676, "loss": 4.1331, "step": 162000 }, { "epoch": 0.8656694155000106, "grad_norm": 0.7563562393188477, "learning_rate": 0.00071475328548394, "loss": 4.1429, "step": 162500 }, { "epoch": 0.8683330137015491, "grad_norm": 0.7162951827049255, "learning_rate": 0.0007138614603102125, "loss": 4.1389, "step": 163000 }, { "epoch": 0.8709966119030876, "grad_norm": 0.7123258709907532, "learning_rate": 0.0007129714187868323, "loss": 4.136, "step": 163500 }, { "epoch": 0.8736602101046261, "grad_norm": 0.728543221950531, "learning_rate": 0.0007120795936131049, "loss": 4.1325, "step": 164000 }, { "epoch": 0.8763238083061646, "grad_norm": 0.7728511691093445, "learning_rate": 0.0007111877684393773, "loss": 4.1348, "step": 164500 }, { "epoch": 0.8789874065077031, "grad_norm": 0.7468729019165039, "learning_rate": 0.0007102959432656497, "loss": 4.1361, "step": 165000 }, { "epoch": 0.8816510047092416, "grad_norm": 0.7346534132957458, "learning_rate": 0.0007094059017422697, "loss": 4.1396, "step": 165500 }, { "epoch": 0.8843146029107801, "grad_norm": 0.7773277759552002, "learning_rate": 0.0007085140765685421, "loss": 4.1401, "step": 166000 }, { "epoch": 0.8869782011123186, "grad_norm": 0.709701657295227, "learning_rate": 0.0007076222513948147, "loss": 4.1317, "step": 166500 }, { "epoch": 0.8896417993138571, "grad_norm": 0.7487180233001709, "learning_rate": 0.0007067304262210871, "loss": 4.13, "step": 167000 }, { "epoch": 0.8923053975153956, "grad_norm": 0.7227104306221008, "learning_rate": 0.000705840384697707, "loss": 4.1367, "step": 167500 }, { "epoch": 0.8949689957169341, "grad_norm": 0.7912375330924988, "learning_rate": 0.0007049485595239794, "loss": 4.1294, "step": 168000 }, { "epoch": 0.8976325939184726, "grad_norm": 0.8671672344207764, "learning_rate": 0.0007040567343502518, "loss": 4.129, "step": 168500 }, { "epoch": 0.9002961921200111, "grad_norm": 0.7554329633712769, "learning_rate": 0.0007031649091765244, "loss": 4.1381, "step": 169000 }, { "epoch": 0.9029597903215496, "grad_norm": 0.7798919081687927, "learning_rate": 0.0007022748676531442, "loss": 4.1297, "step": 169500 }, { "epoch": 0.9056233885230881, "grad_norm": 0.7176423668861389, "learning_rate": 0.0007013830424794167, "loss": 4.132, "step": 170000 }, { "epoch": 0.9082869867246266, "grad_norm": 0.7016908526420593, "learning_rate": 0.0007004912173056891, "loss": 4.132, "step": 170500 }, { "epoch": 0.9109505849261651, "grad_norm": 0.7394859790802002, "learning_rate": 0.0006995993921319615, "loss": 4.1337, "step": 171000 }, { "epoch": 0.9136141831277036, "grad_norm": 0.745543897151947, "learning_rate": 0.0006987093506085815, "loss": 4.1316, "step": 171500 }, { "epoch": 0.9162777813292421, "grad_norm": 0.7842167019844055, "learning_rate": 0.000697817525434854, "loss": 4.1314, "step": 172000 }, { "epoch": 0.9189413795307806, "grad_norm": 0.7487747073173523, "learning_rate": 0.0006969257002611265, "loss": 4.1281, "step": 172500 }, { "epoch": 0.9216049777323191, "grad_norm": 0.737399160861969, "learning_rate": 0.0006960338750873989, "loss": 4.1325, "step": 173000 }, { "epoch": 0.9242685759338576, "grad_norm": 0.7666307687759399, "learning_rate": 0.0006951438335640188, "loss": 4.1333, "step": 173500 }, { "epoch": 0.926932174135396, "grad_norm": 0.7485344409942627, "learning_rate": 0.0006942520083902912, "loss": 4.1317, "step": 174000 }, { "epoch": 0.9295957723369345, "grad_norm": 0.7282237410545349, "learning_rate": 0.0006933601832165637, "loss": 4.1326, "step": 174500 }, { "epoch": 0.932259370538473, "grad_norm": 0.7747819423675537, "learning_rate": 0.0006924701416931836, "loss": 4.1362, "step": 175000 }, { "epoch": 0.9349229687400115, "grad_norm": 0.7578604817390442, "learning_rate": 0.000691578316519456, "loss": 4.1383, "step": 175500 }, { "epoch": 0.93758656694155, "grad_norm": 0.7957220673561096, "learning_rate": 0.0006906864913457285, "loss": 4.128, "step": 176000 }, { "epoch": 0.9402501651430885, "grad_norm": 0.7936584949493408, "learning_rate": 0.000689794666172001, "loss": 4.122, "step": 176500 }, { "epoch": 0.942913763344627, "grad_norm": 0.8081178069114685, "learning_rate": 0.0006889028409982735, "loss": 4.1298, "step": 177000 }, { "epoch": 0.9455773615461655, "grad_norm": 0.7892795205116272, "learning_rate": 0.000688011015824546, "loss": 4.1267, "step": 177500 }, { "epoch": 0.948240959747704, "grad_norm": 0.7274259328842163, "learning_rate": 0.0006871191906508184, "loss": 4.1232, "step": 178000 }, { "epoch": 0.9509045579492424, "grad_norm": 0.7544950246810913, "learning_rate": 0.0006862291491274383, "loss": 4.1267, "step": 178500 }, { "epoch": 0.9535681561507809, "grad_norm": 0.798841655254364, "learning_rate": 0.0006853373239537107, "loss": 4.1328, "step": 179000 }, { "epoch": 0.9562317543523194, "grad_norm": 0.7239564657211304, "learning_rate": 0.0006844454987799832, "loss": 4.1336, "step": 179500 }, { "epoch": 0.9588953525538579, "grad_norm": 0.8423783779144287, "learning_rate": 0.0006835536736062557, "loss": 4.1286, "step": 180000 }, { "epoch": 0.9615589507553964, "grad_norm": 0.7887551784515381, "learning_rate": 0.0006826618484325281, "loss": 4.1199, "step": 180500 }, { "epoch": 0.9642225489569349, "grad_norm": 0.7365000247955322, "learning_rate": 0.0006817700232588005, "loss": 4.1321, "step": 181000 }, { "epoch": 0.9668861471584734, "grad_norm": 0.7989848256111145, "learning_rate": 0.0006808799817354204, "loss": 4.1327, "step": 181500 }, { "epoch": 0.9695497453600119, "grad_norm": 0.7484691143035889, "learning_rate": 0.0006799881565616928, "loss": 4.1239, "step": 182000 }, { "epoch": 0.9722133435615504, "grad_norm": 0.8183499574661255, "learning_rate": 0.0006790963313879654, "loss": 4.1253, "step": 182500 }, { "epoch": 0.9748769417630889, "grad_norm": 0.7121425271034241, "learning_rate": 0.0006782045062142378, "loss": 4.1342, "step": 183000 }, { "epoch": 0.9775405399646274, "grad_norm": 0.7777406573295593, "learning_rate": 0.0006773144646908578, "loss": 4.1286, "step": 183500 }, { "epoch": 0.9802041381661659, "grad_norm": 0.7477155327796936, "learning_rate": 0.0006764226395171302, "loss": 4.1278, "step": 184000 }, { "epoch": 0.9828677363677044, "grad_norm": 0.8153510093688965, "learning_rate": 0.0006755308143434026, "loss": 4.1232, "step": 184500 }, { "epoch": 0.9855313345692429, "grad_norm": 0.7904220819473267, "learning_rate": 0.0006746389891696752, "loss": 4.1283, "step": 185000 }, { "epoch": 0.9881949327707814, "grad_norm": 0.8383620977401733, "learning_rate": 0.0006737471639959476, "loss": 4.1334, "step": 185500 }, { "epoch": 0.9908585309723199, "grad_norm": 0.7521381378173828, "learning_rate": 0.0006728571224725675, "loss": 4.1339, "step": 186000 }, { "epoch": 0.9935221291738584, "grad_norm": 0.7851571440696716, "learning_rate": 0.0006719652972988399, "loss": 4.1289, "step": 186500 }, { "epoch": 0.9961857273753969, "grad_norm": 0.7758961319923401, "learning_rate": 0.0006710734721251123, "loss": 4.1294, "step": 187000 }, { "epoch": 0.9988493255769354, "grad_norm": 0.7806641459465027, "learning_rate": 0.0006701816469513849, "loss": 4.1285, "step": 187500 }, { "epoch": 1.001512923778474, "grad_norm": 0.7453823685646057, "learning_rate": 0.0006692916054280047, "loss": 4.1283, "step": 188000 }, { "epoch": 1.0041765219800123, "grad_norm": 0.7377151846885681, "learning_rate": 0.0006683997802542772, "loss": 4.1297, "step": 188500 }, { "epoch": 1.006840120181551, "grad_norm": 0.7941287755966187, "learning_rate": 0.0006675079550805496, "loss": 4.1212, "step": 189000 }, { "epoch": 1.0095037183830893, "grad_norm": 0.767425000667572, "learning_rate": 0.000666616129906822, "loss": 4.1229, "step": 189500 }, { "epoch": 1.0121673165846279, "grad_norm": 0.7483153343200684, "learning_rate": 0.0006657243047330946, "loss": 4.1242, "step": 190000 }, { "epoch": 1.0148309147861663, "grad_norm": 0.7890580892562866, "learning_rate": 0.0006648342632097145, "loss": 4.1306, "step": 190500 }, { "epoch": 1.0174945129877049, "grad_norm": 0.7415242791175842, "learning_rate": 0.000663942438035987, "loss": 4.1285, "step": 191000 }, { "epoch": 1.0201581111892433, "grad_norm": 0.7596645951271057, "learning_rate": 0.0006630506128622594, "loss": 4.1258, "step": 191500 }, { "epoch": 1.0228217093907819, "grad_norm": 0.8304431438446045, "learning_rate": 0.0006621587876885318, "loss": 4.1232, "step": 192000 }, { "epoch": 1.0254853075923203, "grad_norm": 0.77840656042099, "learning_rate": 0.0006612687461651517, "loss": 4.1195, "step": 192500 }, { "epoch": 1.0281489057938589, "grad_norm": 0.7862575650215149, "learning_rate": 0.0006603769209914242, "loss": 4.1258, "step": 193000 }, { "epoch": 1.0308125039953973, "grad_norm": 0.7667100429534912, "learning_rate": 0.0006594850958176967, "loss": 4.1185, "step": 193500 }, { "epoch": 1.0334761021969359, "grad_norm": 0.7835633754730225, "learning_rate": 0.0006585932706439691, "loss": 4.1224, "step": 194000 }, { "epoch": 1.0361397003984743, "grad_norm": 0.7486304640769958, "learning_rate": 0.000657703229120589, "loss": 4.124, "step": 194500 }, { "epoch": 1.0388032986000129, "grad_norm": 0.7897284030914307, "learning_rate": 0.0006568114039468614, "loss": 4.1203, "step": 195000 }, { "epoch": 1.0414668968015512, "grad_norm": 0.7997919321060181, "learning_rate": 0.0006559195787731339, "loss": 4.1202, "step": 195500 }, { "epoch": 1.0441304950030899, "grad_norm": 0.7987415194511414, "learning_rate": 0.0006550277535994064, "loss": 4.1231, "step": 196000 }, { "epoch": 1.0467940932046282, "grad_norm": 0.7434735894203186, "learning_rate": 0.0006541377120760263, "loss": 4.1196, "step": 196500 }, { "epoch": 1.0494576914061668, "grad_norm": 0.806969404220581, "learning_rate": 0.0006532458869022988, "loss": 4.1185, "step": 197000 }, { "epoch": 1.0521212896077052, "grad_norm": 0.8006301522254944, "learning_rate": 0.0006523540617285712, "loss": 4.1209, "step": 197500 }, { "epoch": 1.0547848878092438, "grad_norm": 0.759758472442627, "learning_rate": 0.0006514622365548438, "loss": 4.1194, "step": 198000 }, { "epoch": 1.0574484860107822, "grad_norm": 0.8778506517410278, "learning_rate": 0.0006505704113811162, "loss": 4.1293, "step": 198500 }, { "epoch": 1.0601120842123208, "grad_norm": 0.7795832753181458, "learning_rate": 0.000649680369857736, "loss": 4.1152, "step": 199000 }, { "epoch": 1.0627756824138592, "grad_norm": 0.7928754687309265, "learning_rate": 0.0006487885446840085, "loss": 4.1177, "step": 199500 }, { "epoch": 1.0654392806153978, "grad_norm": 0.8119847774505615, "learning_rate": 0.0006478967195102809, "loss": 4.1205, "step": 200000 }, { "epoch": 1.0681028788169362, "grad_norm": 0.739378035068512, "learning_rate": 0.0006470048943365535, "loss": 4.1111, "step": 200500 }, { "epoch": 1.0707664770184748, "grad_norm": 0.7906088829040527, "learning_rate": 0.0006461148528131734, "loss": 4.1186, "step": 201000 }, { "epoch": 1.0734300752200132, "grad_norm": 0.7810208797454834, "learning_rate": 0.0006452230276394459, "loss": 4.1204, "step": 201500 }, { "epoch": 1.0760936734215516, "grad_norm": 0.741383969783783, "learning_rate": 0.0006443312024657183, "loss": 4.1222, "step": 202000 }, { "epoch": 1.0787572716230902, "grad_norm": 0.7824720740318298, "learning_rate": 0.0006434393772919907, "loss": 4.1174, "step": 202500 }, { "epoch": 1.0814208698246286, "grad_norm": 0.7920011281967163, "learning_rate": 0.0006425493357686106, "loss": 4.1196, "step": 203000 }, { "epoch": 1.0840844680261672, "grad_norm": 0.792914628982544, "learning_rate": 0.0006416575105948831, "loss": 4.1153, "step": 203500 }, { "epoch": 1.0867480662277056, "grad_norm": 0.7724523544311523, "learning_rate": 0.0006407656854211556, "loss": 4.1105, "step": 204000 }, { "epoch": 1.0894116644292442, "grad_norm": 0.7834595441818237, "learning_rate": 0.000639873860247428, "loss": 4.1179, "step": 204500 }, { "epoch": 1.0920752626307826, "grad_norm": 0.8056479096412659, "learning_rate": 0.0006389838187240478, "loss": 4.1126, "step": 205000 }, { "epoch": 1.0947388608323212, "grad_norm": 0.7697902321815491, "learning_rate": 0.0006380919935503203, "loss": 4.1193, "step": 205500 }, { "epoch": 1.0974024590338596, "grad_norm": 0.7807758450508118, "learning_rate": 0.0006372001683765928, "loss": 4.1192, "step": 206000 }, { "epoch": 1.1000660572353982, "grad_norm": 0.7408417463302612, "learning_rate": 0.0006363083432028652, "loss": 4.1119, "step": 206500 }, { "epoch": 1.1027296554369366, "grad_norm": 0.9000714421272278, "learning_rate": 0.0006354165180291377, "loss": 4.1185, "step": 207000 }, { "epoch": 1.1053932536384752, "grad_norm": 0.8088692426681519, "learning_rate": 0.0006345264765057577, "loss": 4.1177, "step": 207500 }, { "epoch": 1.1080568518400136, "grad_norm": 0.778122067451477, "learning_rate": 0.0006336346513320301, "loss": 4.1143, "step": 208000 }, { "epoch": 1.1107204500415522, "grad_norm": 0.8222107291221619, "learning_rate": 0.0006327428261583026, "loss": 4.1136, "step": 208500 }, { "epoch": 1.1133840482430906, "grad_norm": 0.7356205582618713, "learning_rate": 0.0006318510009845751, "loss": 4.1187, "step": 209000 }, { "epoch": 1.1160476464446292, "grad_norm": 0.7457647919654846, "learning_rate": 0.0006309609594611949, "loss": 4.1123, "step": 209500 }, { "epoch": 1.1187112446461676, "grad_norm": 0.789622962474823, "learning_rate": 0.0006300691342874674, "loss": 4.1175, "step": 210000 }, { "epoch": 1.1213748428477062, "grad_norm": 0.8369338512420654, "learning_rate": 0.0006291773091137398, "loss": 4.1147, "step": 210500 }, { "epoch": 1.1240384410492446, "grad_norm": 0.8210717439651489, "learning_rate": 0.0006282854839400123, "loss": 4.1142, "step": 211000 }, { "epoch": 1.1267020392507832, "grad_norm": 0.7775838375091553, "learning_rate": 0.0006273954424166322, "loss": 4.1203, "step": 211500 }, { "epoch": 1.1293656374523215, "grad_norm": 0.7949962019920349, "learning_rate": 0.0006265036172429046, "loss": 4.1139, "step": 212000 }, { "epoch": 1.1320292356538602, "grad_norm": 0.7534223794937134, "learning_rate": 0.000625611792069177, "loss": 4.1177, "step": 212500 }, { "epoch": 1.1346928338553985, "grad_norm": 0.8075549602508545, "learning_rate": 0.0006247199668954495, "loss": 4.1147, "step": 213000 }, { "epoch": 1.1373564320569371, "grad_norm": 0.7999294400215149, "learning_rate": 0.0006238299253720696, "loss": 4.116, "step": 213500 }, { "epoch": 1.1400200302584755, "grad_norm": 0.7690563797950745, "learning_rate": 0.000622938100198342, "loss": 4.1108, "step": 214000 }, { "epoch": 1.1426836284600141, "grad_norm": 0.7599471211433411, "learning_rate": 0.0006220462750246144, "loss": 4.1155, "step": 214500 }, { "epoch": 1.1453472266615525, "grad_norm": 0.7433050274848938, "learning_rate": 0.0006211544498508869, "loss": 4.1172, "step": 215000 }, { "epoch": 1.1480108248630911, "grad_norm": 0.781114935874939, "learning_rate": 0.0006202644083275067, "loss": 4.1084, "step": 215500 }, { "epoch": 1.1506744230646295, "grad_norm": 0.7194410562515259, "learning_rate": 0.0006193725831537791, "loss": 4.1127, "step": 216000 }, { "epoch": 1.1533380212661681, "grad_norm": 0.8126916289329529, "learning_rate": 0.0006184807579800517, "loss": 4.1126, "step": 216500 }, { "epoch": 1.1560016194677065, "grad_norm": 0.8229861855506897, "learning_rate": 0.0006175889328063241, "loss": 4.1121, "step": 217000 }, { "epoch": 1.158665217669245, "grad_norm": 0.8246269226074219, "learning_rate": 0.000616698891282944, "loss": 4.1092, "step": 217500 }, { "epoch": 1.1613288158707835, "grad_norm": 0.8146107196807861, "learning_rate": 0.0006158070661092164, "loss": 4.1091, "step": 218000 }, { "epoch": 1.1639924140723221, "grad_norm": 0.7878261208534241, "learning_rate": 0.0006149152409354888, "loss": 4.1161, "step": 218500 }, { "epoch": 1.1666560122738605, "grad_norm": 0.7780360579490662, "learning_rate": 0.0006140234157617614, "loss": 4.1079, "step": 219000 }, { "epoch": 1.169319610475399, "grad_norm": 0.7969585657119751, "learning_rate": 0.0006131333742383814, "loss": 4.1134, "step": 219500 }, { "epoch": 1.1719832086769375, "grad_norm": 0.8402618765830994, "learning_rate": 0.0006122415490646538, "loss": 4.1143, "step": 220000 }, { "epoch": 1.1746468068784761, "grad_norm": 0.7946035861968994, "learning_rate": 0.0006113497238909262, "loss": 4.114, "step": 220500 }, { "epoch": 1.1773104050800145, "grad_norm": 0.7864482402801514, "learning_rate": 0.0006104578987171987, "loss": 4.1126, "step": 221000 }, { "epoch": 1.1799740032815529, "grad_norm": 0.8313577771186829, "learning_rate": 0.0006095678571938186, "loss": 4.106, "step": 221500 }, { "epoch": 1.1826376014830915, "grad_norm": 0.8574484586715698, "learning_rate": 0.0006086760320200911, "loss": 4.1085, "step": 222000 }, { "epoch": 1.1853011996846299, "grad_norm": 0.7599306702613831, "learning_rate": 0.0006077842068463635, "loss": 4.1071, "step": 222500 }, { "epoch": 1.1879647978861685, "grad_norm": 0.7732433676719666, "learning_rate": 0.0006068923816726359, "loss": 4.1185, "step": 223000 }, { "epoch": 1.1906283960877069, "grad_norm": 0.8210047483444214, "learning_rate": 0.0006060023401492559, "loss": 4.1099, "step": 223500 }, { "epoch": 1.1932919942892455, "grad_norm": 0.8054102063179016, "learning_rate": 0.0006051105149755284, "loss": 4.1181, "step": 224000 }, { "epoch": 1.1959555924907839, "grad_norm": 0.7870852947235107, "learning_rate": 0.0006042186898018009, "loss": 4.1016, "step": 224500 }, { "epoch": 1.1986191906923225, "grad_norm": 0.8508167266845703, "learning_rate": 0.0006033268646280733, "loss": 4.1202, "step": 225000 }, { "epoch": 1.2012827888938609, "grad_norm": 0.7744969129562378, "learning_rate": 0.0006024368231046932, "loss": 4.1094, "step": 225500 }, { "epoch": 1.2039463870953995, "grad_norm": 0.7836142778396606, "learning_rate": 0.0006015449979309656, "loss": 4.1079, "step": 226000 }, { "epoch": 1.2066099852969379, "grad_norm": 0.7741486430168152, "learning_rate": 0.000600653172757238, "loss": 4.1088, "step": 226500 }, { "epoch": 1.2092735834984765, "grad_norm": 0.77290940284729, "learning_rate": 0.0005997613475835106, "loss": 4.1025, "step": 227000 }, { "epoch": 1.2119371817000149, "grad_norm": 0.8240610361099243, "learning_rate": 0.0005988713060601304, "loss": 4.104, "step": 227500 }, { "epoch": 1.2146007799015535, "grad_norm": 0.7438703775405884, "learning_rate": 0.0005979794808864029, "loss": 4.1084, "step": 228000 }, { "epoch": 1.2172643781030918, "grad_norm": 0.837753415107727, "learning_rate": 0.0005970876557126753, "loss": 4.1017, "step": 228500 }, { "epoch": 1.2199279763046305, "grad_norm": 0.7918710112571716, "learning_rate": 0.0005961958305389477, "loss": 4.1094, "step": 229000 }, { "epoch": 1.2225915745061688, "grad_norm": 0.8078004121780396, "learning_rate": 0.0005953040053652203, "loss": 4.1043, "step": 229500 }, { "epoch": 1.2252551727077075, "grad_norm": 0.8458930253982544, "learning_rate": 0.0005944139638418402, "loss": 4.1069, "step": 230000 }, { "epoch": 1.2279187709092458, "grad_norm": 0.7811508178710938, "learning_rate": 0.0005935221386681127, "loss": 4.1071, "step": 230500 }, { "epoch": 1.2305823691107844, "grad_norm": 0.8446598649024963, "learning_rate": 0.0005926303134943851, "loss": 4.1063, "step": 231000 }, { "epoch": 1.2332459673123228, "grad_norm": 0.8074429035186768, "learning_rate": 0.0005917384883206575, "loss": 4.109, "step": 231500 }, { "epoch": 1.2359095655138614, "grad_norm": 0.8163787722587585, "learning_rate": 0.0005908484467972775, "loss": 4.1028, "step": 232000 }, { "epoch": 1.2385731637153998, "grad_norm": 0.7774120569229126, "learning_rate": 0.0005899566216235499, "loss": 4.1084, "step": 232500 }, { "epoch": 1.2412367619169384, "grad_norm": 0.7910379767417908, "learning_rate": 0.0005890647964498224, "loss": 4.1002, "step": 233000 }, { "epoch": 1.2439003601184768, "grad_norm": 0.8428027629852295, "learning_rate": 0.0005881729712760948, "loss": 4.1127, "step": 233500 }, { "epoch": 1.2465639583200154, "grad_norm": 0.7961114645004272, "learning_rate": 0.0005872829297527147, "loss": 4.1046, "step": 234000 }, { "epoch": 1.2492275565215538, "grad_norm": 0.8194419145584106, "learning_rate": 0.0005863911045789872, "loss": 4.1088, "step": 234500 }, { "epoch": 1.2518911547230922, "grad_norm": 0.783875584602356, "learning_rate": 0.0005854992794052596, "loss": 4.1086, "step": 235000 }, { "epoch": 1.2545547529246308, "grad_norm": 0.7610777020454407, "learning_rate": 0.0005846074542315321, "loss": 4.1024, "step": 235500 }, { "epoch": 1.2572183511261694, "grad_norm": 0.7696565389633179, "learning_rate": 0.000583717412708152, "loss": 4.1016, "step": 236000 }, { "epoch": 1.2598819493277078, "grad_norm": 0.82817542552948, "learning_rate": 0.0005828255875344245, "loss": 4.0958, "step": 236500 }, { "epoch": 1.2625455475292462, "grad_norm": 0.8974746465682983, "learning_rate": 0.0005819337623606969, "loss": 4.1077, "step": 237000 }, { "epoch": 1.2652091457307848, "grad_norm": 0.7882625460624695, "learning_rate": 0.0005810419371869694, "loss": 4.1027, "step": 237500 }, { "epoch": 1.2678727439323234, "grad_norm": 0.7710665464401245, "learning_rate": 0.0005801518956635893, "loss": 4.1071, "step": 238000 }, { "epoch": 1.2705363421338618, "grad_norm": 0.8462359309196472, "learning_rate": 0.0005792600704898617, "loss": 4.0993, "step": 238500 }, { "epoch": 1.2731999403354002, "grad_norm": 0.7785073518753052, "learning_rate": 0.0005783682453161342, "loss": 4.1051, "step": 239000 }, { "epoch": 1.2758635385369388, "grad_norm": 0.7724746465682983, "learning_rate": 0.0005774764201424066, "loss": 4.1082, "step": 239500 }, { "epoch": 1.2785271367384774, "grad_norm": 0.8276979923248291, "learning_rate": 0.0005765863786190266, "loss": 4.095, "step": 240000 }, { "epoch": 1.2811907349400158, "grad_norm": 0.7959253191947937, "learning_rate": 0.000575694553445299, "loss": 4.1026, "step": 240500 }, { "epoch": 1.2838543331415542, "grad_norm": 0.806239664554596, "learning_rate": 0.0005748027282715714, "loss": 4.1019, "step": 241000 }, { "epoch": 1.2865179313430928, "grad_norm": 0.9089943170547485, "learning_rate": 0.0005739109030978439, "loss": 4.0955, "step": 241500 }, { "epoch": 1.2891815295446314, "grad_norm": 0.8239426612854004, "learning_rate": 0.0005730208615744638, "loss": 4.1033, "step": 242000 }, { "epoch": 1.2918451277461698, "grad_norm": 0.8066053986549377, "learning_rate": 0.0005721290364007364, "loss": 4.1068, "step": 242500 }, { "epoch": 1.2945087259477082, "grad_norm": 0.7600257396697998, "learning_rate": 0.0005712372112270088, "loss": 4.1006, "step": 243000 }, { "epoch": 1.2971723241492468, "grad_norm": 0.7940685749053955, "learning_rate": 0.0005703471697036287, "loss": 4.1004, "step": 243500 }, { "epoch": 1.2998359223507852, "grad_norm": 0.7310413718223572, "learning_rate": 0.0005694553445299011, "loss": 4.1028, "step": 244000 }, { "epoch": 1.3024995205523238, "grad_norm": 0.8132951855659485, "learning_rate": 0.0005685635193561735, "loss": 4.1104, "step": 244500 }, { "epoch": 1.3051631187538622, "grad_norm": 0.8280708193778992, "learning_rate": 0.0005676716941824461, "loss": 4.1029, "step": 245000 }, { "epoch": 1.3078267169554008, "grad_norm": 0.7521162629127502, "learning_rate": 0.0005667798690087185, "loss": 4.0991, "step": 245500 }, { "epoch": 1.3104903151569391, "grad_norm": 0.8909037709236145, "learning_rate": 0.0005658880438349909, "loss": 4.1005, "step": 246000 }, { "epoch": 1.3131539133584778, "grad_norm": 0.8605440855026245, "learning_rate": 0.0005649962186612634, "loss": 4.0999, "step": 246500 }, { "epoch": 1.3158175115600161, "grad_norm": 0.9294172525405884, "learning_rate": 0.0005641043934875358, "loss": 4.0978, "step": 247000 }, { "epoch": 1.3184811097615547, "grad_norm": 0.8271783590316772, "learning_rate": 0.0005632143519641559, "loss": 4.1005, "step": 247500 }, { "epoch": 1.3211447079630931, "grad_norm": 0.7716344594955444, "learning_rate": 0.0005623225267904283, "loss": 4.0972, "step": 248000 }, { "epoch": 1.3238083061646317, "grad_norm": 0.7663143873214722, "learning_rate": 0.0005614307016167007, "loss": 4.1068, "step": 248500 }, { "epoch": 1.3264719043661701, "grad_norm": 0.8361650705337524, "learning_rate": 0.0005605388764429732, "loss": 4.0955, "step": 249000 }, { "epoch": 1.3291355025677087, "grad_norm": 0.8032039403915405, "learning_rate": 0.000559648834919593, "loss": 4.0981, "step": 249500 }, { "epoch": 1.3317991007692471, "grad_norm": 0.7755228281021118, "learning_rate": 0.0005587570097458655, "loss": 4.0985, "step": 250000 }, { "epoch": 1.3344626989707857, "grad_norm": 0.8239076733589172, "learning_rate": 0.000557865184572138, "loss": 4.102, "step": 250500 }, { "epoch": 1.3371262971723241, "grad_norm": 0.849665105342865, "learning_rate": 0.0005569733593984104, "loss": 4.1022, "step": 251000 }, { "epoch": 1.3397898953738627, "grad_norm": 0.7836341857910156, "learning_rate": 0.0005560833178750303, "loss": 4.0985, "step": 251500 }, { "epoch": 1.3424534935754011, "grad_norm": 0.7993196845054626, "learning_rate": 0.0005551914927013027, "loss": 4.0959, "step": 252000 }, { "epoch": 1.3451170917769395, "grad_norm": 0.8100605010986328, "learning_rate": 0.0005542996675275752, "loss": 4.0938, "step": 252500 }, { "epoch": 1.347780689978478, "grad_norm": 0.8267188668251038, "learning_rate": 0.0005534078423538477, "loss": 4.0975, "step": 253000 }, { "epoch": 1.3504442881800167, "grad_norm": 0.7876518964767456, "learning_rate": 0.0005525178008304677, "loss": 4.0966, "step": 253500 }, { "epoch": 1.353107886381555, "grad_norm": 0.8013073801994324, "learning_rate": 0.0005516259756567401, "loss": 4.0993, "step": 254000 }, { "epoch": 1.3557714845830935, "grad_norm": 0.7732263207435608, "learning_rate": 0.0005507341504830125, "loss": 4.0955, "step": 254500 }, { "epoch": 1.358435082784632, "grad_norm": 0.8235819935798645, "learning_rate": 0.000549842325309285, "loss": 4.0997, "step": 255000 }, { "epoch": 1.3610986809861707, "grad_norm": 0.7818782329559326, "learning_rate": 0.0005489505001355575, "loss": 4.1026, "step": 255500 }, { "epoch": 1.363762279187709, "grad_norm": 0.8184423446655273, "learning_rate": 0.0005480604586121774, "loss": 4.092, "step": 256000 }, { "epoch": 1.3664258773892475, "grad_norm": 0.7807801365852356, "learning_rate": 0.0005471686334384498, "loss": 4.0938, "step": 256500 }, { "epoch": 1.369089475590786, "grad_norm": 0.8043480515480042, "learning_rate": 0.0005462768082647222, "loss": 4.0964, "step": 257000 }, { "epoch": 1.3717530737923247, "grad_norm": 0.8113440871238708, "learning_rate": 0.0005453849830909947, "loss": 4.092, "step": 257500 }, { "epoch": 1.374416671993863, "grad_norm": 0.776531994342804, "learning_rate": 0.0005444949415676145, "loss": 4.1043, "step": 258000 }, { "epoch": 1.3770802701954015, "grad_norm": 0.9090542197227478, "learning_rate": 0.0005436031163938871, "loss": 4.1026, "step": 258500 }, { "epoch": 1.37974386839694, "grad_norm": 0.8724551796913147, "learning_rate": 0.0005427112912201595, "loss": 4.0983, "step": 259000 }, { "epoch": 1.3824074665984787, "grad_norm": 0.7889623045921326, "learning_rate": 0.0005418194660464319, "loss": 4.1027, "step": 259500 }, { "epoch": 1.385071064800017, "grad_norm": 0.7813825011253357, "learning_rate": 0.0005409294245230519, "loss": 4.092, "step": 260000 }, { "epoch": 1.3877346630015555, "grad_norm": 0.8187386989593506, "learning_rate": 0.0005400393829996718, "loss": 4.0955, "step": 260500 }, { "epoch": 1.390398261203094, "grad_norm": 0.8593798279762268, "learning_rate": 0.0005391475578259443, "loss": 4.094, "step": 261000 }, { "epoch": 1.3930618594046325, "grad_norm": 0.8074827194213867, "learning_rate": 0.0005382557326522167, "loss": 4.095, "step": 261500 }, { "epoch": 1.395725457606171, "grad_norm": 0.8229965567588806, "learning_rate": 0.0005373639074784892, "loss": 4.0909, "step": 262000 }, { "epoch": 1.3983890558077094, "grad_norm": 0.7867224216461182, "learning_rate": 0.0005364720823047616, "loss": 4.0934, "step": 262500 }, { "epoch": 1.401052654009248, "grad_norm": 0.9083333611488342, "learning_rate": 0.000535580257131034, "loss": 4.0982, "step": 263000 }, { "epoch": 1.4037162522107864, "grad_norm": 0.8077040314674377, "learning_rate": 0.0005346884319573066, "loss": 4.0949, "step": 263500 }, { "epoch": 1.406379850412325, "grad_norm": 0.871181070804596, "learning_rate": 0.000533796606783579, "loss": 4.096, "step": 264000 }, { "epoch": 1.4090434486138634, "grad_norm": 0.8004094958305359, "learning_rate": 0.0005329065652601989, "loss": 4.0969, "step": 264500 }, { "epoch": 1.411707046815402, "grad_norm": 0.8624884486198425, "learning_rate": 0.0005320147400864713, "loss": 4.0964, "step": 265000 }, { "epoch": 1.4143706450169404, "grad_norm": 0.7955045104026794, "learning_rate": 0.0005311229149127437, "loss": 4.0944, "step": 265500 }, { "epoch": 1.417034243218479, "grad_norm": 0.7732199430465698, "learning_rate": 0.0005302310897390163, "loss": 4.0906, "step": 266000 }, { "epoch": 1.4196978414200174, "grad_norm": 0.8164415955543518, "learning_rate": 0.0005293410482156362, "loss": 4.0887, "step": 266500 }, { "epoch": 1.422361439621556, "grad_norm": 0.8961130380630493, "learning_rate": 0.0005284492230419087, "loss": 4.1001, "step": 267000 }, { "epoch": 1.4250250378230944, "grad_norm": 0.8140637874603271, "learning_rate": 0.0005275573978681811, "loss": 4.0898, "step": 267500 }, { "epoch": 1.427688636024633, "grad_norm": 0.8230092525482178, "learning_rate": 0.0005266655726944535, "loss": 4.0994, "step": 268000 }, { "epoch": 1.4303522342261714, "grad_norm": 0.800144612789154, "learning_rate": 0.0005257755311710735, "loss": 4.0914, "step": 268500 }, { "epoch": 1.43301583242771, "grad_norm": 0.8252524733543396, "learning_rate": 0.000524883705997346, "loss": 4.0944, "step": 269000 }, { "epoch": 1.4356794306292484, "grad_norm": 0.7676013708114624, "learning_rate": 0.0005239918808236184, "loss": 4.092, "step": 269500 }, { "epoch": 1.4383430288307868, "grad_norm": 0.8423929810523987, "learning_rate": 0.0005231000556498908, "loss": 4.0871, "step": 270000 }, { "epoch": 1.4410066270323254, "grad_norm": 0.7545808553695679, "learning_rate": 0.0005222100141265108, "loss": 4.0923, "step": 270500 }, { "epoch": 1.443670225233864, "grad_norm": 0.820381224155426, "learning_rate": 0.0005213181889527832, "loss": 4.0827, "step": 271000 }, { "epoch": 1.4463338234354024, "grad_norm": 0.8105764985084534, "learning_rate": 0.0005204263637790558, "loss": 4.0943, "step": 271500 }, { "epoch": 1.4489974216369408, "grad_norm": 0.7974145412445068, "learning_rate": 0.0005195345386053282, "loss": 4.0852, "step": 272000 }, { "epoch": 1.4516610198384794, "grad_norm": 0.7740100026130676, "learning_rate": 0.000518644497081948, "loss": 4.0943, "step": 272500 }, { "epoch": 1.454324618040018, "grad_norm": 0.8262558579444885, "learning_rate": 0.0005177526719082205, "loss": 4.0889, "step": 273000 }, { "epoch": 1.4569882162415564, "grad_norm": 0.8640192747116089, "learning_rate": 0.0005168608467344929, "loss": 4.0844, "step": 273500 }, { "epoch": 1.4596518144430948, "grad_norm": 0.8319873809814453, "learning_rate": 0.0005159690215607655, "loss": 4.0936, "step": 274000 }, { "epoch": 1.4623154126446334, "grad_norm": 0.876741886138916, "learning_rate": 0.0005150789800373853, "loss": 4.0855, "step": 274500 }, { "epoch": 1.464979010846172, "grad_norm": 0.8290923833847046, "learning_rate": 0.0005141871548636577, "loss": 4.0949, "step": 275000 }, { "epoch": 1.4676426090477104, "grad_norm": 0.7827680110931396, "learning_rate": 0.0005132953296899302, "loss": 4.0821, "step": 275500 }, { "epoch": 1.4703062072492488, "grad_norm": 0.8360860347747803, "learning_rate": 0.0005124035045162026, "loss": 4.0921, "step": 276000 }, { "epoch": 1.4729698054507874, "grad_norm": 0.7869288325309753, "learning_rate": 0.0005115134629928227, "loss": 4.0795, "step": 276500 }, { "epoch": 1.475633403652326, "grad_norm": 0.8743867874145508, "learning_rate": 0.0005106216378190951, "loss": 4.0867, "step": 277000 }, { "epoch": 1.4782970018538644, "grad_norm": 0.8454434871673584, "learning_rate": 0.0005097298126453676, "loss": 4.083, "step": 277500 }, { "epoch": 1.4809606000554028, "grad_norm": 0.8108798265457153, "learning_rate": 0.00050883798747164, "loss": 4.086, "step": 278000 }, { "epoch": 1.4836241982569414, "grad_norm": 0.8548552989959717, "learning_rate": 0.0005079479459482598, "loss": 4.0853, "step": 278500 }, { "epoch": 1.4862877964584797, "grad_norm": 0.8752163052558899, "learning_rate": 0.0005070561207745324, "loss": 4.0891, "step": 279000 }, { "epoch": 1.4889513946600184, "grad_norm": 0.9157357811927795, "learning_rate": 0.0005061642956008048, "loss": 4.0872, "step": 279500 }, { "epoch": 1.4916149928615567, "grad_norm": 0.8573022484779358, "learning_rate": 0.0005052724704270773, "loss": 4.0854, "step": 280000 }, { "epoch": 1.4942785910630954, "grad_norm": 0.8331462740898132, "learning_rate": 0.0005043806452533497, "loss": 4.0887, "step": 280500 }, { "epoch": 1.4969421892646337, "grad_norm": 0.7753505110740662, "learning_rate": 0.0005034888200796221, "loss": 4.0901, "step": 281000 }, { "epoch": 1.4996057874661723, "grad_norm": 0.781449556350708, "learning_rate": 0.0005025969949058947, "loss": 4.0844, "step": 281500 }, { "epoch": 1.5022693856677107, "grad_norm": 0.9343318343162537, "learning_rate": 0.0005017051697321671, "loss": 4.0906, "step": 282000 }, { "epoch": 1.5049329838692493, "grad_norm": 0.8867080807685852, "learning_rate": 0.000500815128208787, "loss": 4.08, "step": 282500 }, { "epoch": 1.507596582070788, "grad_norm": 0.8553933501243591, "learning_rate": 0.0004999233030350595, "loss": 4.0898, "step": 283000 }, { "epoch": 1.5102601802723261, "grad_norm": 0.849162757396698, "learning_rate": 0.0004990314778613319, "loss": 4.0894, "step": 283500 }, { "epoch": 1.5129237784738647, "grad_norm": 0.787109375, "learning_rate": 0.0004981396526876044, "loss": 4.085, "step": 284000 }, { "epoch": 1.5155873766754033, "grad_norm": 0.8072954416275024, "learning_rate": 0.0004972496111642243, "loss": 4.0842, "step": 284500 }, { "epoch": 1.5182509748769417, "grad_norm": 0.8034284114837646, "learning_rate": 0.0004963595696408442, "loss": 4.0866, "step": 285000 }, { "epoch": 1.52091457307848, "grad_norm": 0.8554684519767761, "learning_rate": 0.0004954677444671166, "loss": 4.0851, "step": 285500 }, { "epoch": 1.5235781712800187, "grad_norm": 0.8422802686691284, "learning_rate": 0.000494575919293389, "loss": 4.0869, "step": 286000 }, { "epoch": 1.5262417694815573, "grad_norm": 0.7712003588676453, "learning_rate": 0.0004936840941196615, "loss": 4.0808, "step": 286500 }, { "epoch": 1.5289053676830957, "grad_norm": 0.8626993894577026, "learning_rate": 0.000492792268945934, "loss": 4.0805, "step": 287000 }, { "epoch": 1.531568965884634, "grad_norm": 0.8277269601821899, "learning_rate": 0.0004919022274225539, "loss": 4.0906, "step": 287500 }, { "epoch": 1.5342325640861727, "grad_norm": 0.8013060688972473, "learning_rate": 0.0004910104022488263, "loss": 4.0836, "step": 288000 }, { "epoch": 1.5368961622877113, "grad_norm": 0.7702099084854126, "learning_rate": 0.0004901185770750989, "loss": 4.0777, "step": 288500 }, { "epoch": 1.5395597604892497, "grad_norm": 0.8085469603538513, "learning_rate": 0.0004892267519013713, "loss": 4.0898, "step": 289000 }, { "epoch": 1.542223358690788, "grad_norm": 0.7977801561355591, "learning_rate": 0.0004883349267276437, "loss": 4.0955, "step": 289500 }, { "epoch": 1.5448869568923267, "grad_norm": 0.8373309969902039, "learning_rate": 0.0004874431015539162, "loss": 4.0783, "step": 290000 }, { "epoch": 1.5475505550938653, "grad_norm": 0.7764778733253479, "learning_rate": 0.0004865530600305361, "loss": 4.0861, "step": 290500 }, { "epoch": 1.5502141532954037, "grad_norm": 0.8451995849609375, "learning_rate": 0.00048566123485680856, "loss": 4.0817, "step": 291000 }, { "epoch": 1.552877751496942, "grad_norm": 0.8463019728660583, "learning_rate": 0.00048476940968308105, "loss": 4.0822, "step": 291500 }, { "epoch": 1.5555413496984807, "grad_norm": 0.8065968155860901, "learning_rate": 0.0004838775845093535, "loss": 4.089, "step": 292000 }, { "epoch": 1.5582049479000193, "grad_norm": 0.8490435481071472, "learning_rate": 0.00048298754298597334, "loss": 4.0765, "step": 292500 }, { "epoch": 1.5608685461015577, "grad_norm": 0.8057785630226135, "learning_rate": 0.0004820957178122458, "loss": 4.0809, "step": 293000 }, { "epoch": 1.563532144303096, "grad_norm": 0.9338017702102661, "learning_rate": 0.00048120389263851826, "loss": 4.0787, "step": 293500 }, { "epoch": 1.5661957425046347, "grad_norm": 0.9003413915634155, "learning_rate": 0.00048031206746479074, "loss": 4.0756, "step": 294000 }, { "epoch": 1.5688593407061733, "grad_norm": 0.779014527797699, "learning_rate": 0.00047942024229106323, "loss": 4.0832, "step": 294500 }, { "epoch": 1.5715229389077117, "grad_norm": 0.8321064114570618, "learning_rate": 0.0004785302007676831, "loss": 4.0885, "step": 295000 }, { "epoch": 1.57418653710925, "grad_norm": 0.8152427077293396, "learning_rate": 0.0004776383755939556, "loss": 4.0847, "step": 295500 }, { "epoch": 1.5768501353107887, "grad_norm": 0.8888664245605469, "learning_rate": 0.000476746550420228, "loss": 4.0777, "step": 296000 }, { "epoch": 1.5795137335123273, "grad_norm": 0.8546236157417297, "learning_rate": 0.0004758547252465005, "loss": 4.0898, "step": 296500 }, { "epoch": 1.5821773317138657, "grad_norm": 0.7983977794647217, "learning_rate": 0.00047496290007277293, "loss": 4.0869, "step": 297000 }, { "epoch": 1.584840929915404, "grad_norm": 0.9709325432777405, "learning_rate": 0.00047407107489904536, "loss": 4.0864, "step": 297500 }, { "epoch": 1.5875045281169426, "grad_norm": 0.8570044040679932, "learning_rate": 0.00047317924972531785, "loss": 4.0886, "step": 298000 }, { "epoch": 1.5901681263184813, "grad_norm": 0.8361437320709229, "learning_rate": 0.00047228920820193776, "loss": 4.0794, "step": 298500 }, { "epoch": 1.5928317245200196, "grad_norm": 0.8911067247390747, "learning_rate": 0.00047139738302821025, "loss": 4.0836, "step": 299000 }, { "epoch": 1.595495322721558, "grad_norm": 0.8150638341903687, "learning_rate": 0.0004705055578544827, "loss": 4.0806, "step": 299500 }, { "epoch": 1.5981589209230966, "grad_norm": 0.8484770059585571, "learning_rate": 0.0004696137326807551, "loss": 4.0796, "step": 300000 }, { "epoch": 1.6008225191246352, "grad_norm": 0.8199454545974731, "learning_rate": 0.0004687219075070276, "loss": 4.0789, "step": 300500 }, { "epoch": 1.6034861173261736, "grad_norm": 0.8845428824424744, "learning_rate": 0.0004678318659836475, "loss": 4.073, "step": 301000 }, { "epoch": 1.606149715527712, "grad_norm": 0.8244544267654419, "learning_rate": 0.00046694004080991995, "loss": 4.0753, "step": 301500 }, { "epoch": 1.6088133137292506, "grad_norm": 0.8862385153770447, "learning_rate": 0.00046604821563619244, "loss": 4.0784, "step": 302000 }, { "epoch": 1.611476911930789, "grad_norm": 0.8142257928848267, "learning_rate": 0.00046515639046246487, "loss": 4.0806, "step": 302500 }, { "epoch": 1.6141405101323274, "grad_norm": 0.850913941860199, "learning_rate": 0.00046426456528873735, "loss": 4.0821, "step": 303000 }, { "epoch": 1.616804108333866, "grad_norm": 0.7964518666267395, "learning_rate": 0.0004633727401150098, "loss": 4.0802, "step": 303500 }, { "epoch": 1.6194677065354046, "grad_norm": 0.8475667834281921, "learning_rate": 0.0004624809149412823, "loss": 4.0825, "step": 304000 }, { "epoch": 1.622131304736943, "grad_norm": 0.8427020311355591, "learning_rate": 0.0004615890897675547, "loss": 4.0746, "step": 304500 }, { "epoch": 1.6247949029384814, "grad_norm": 0.8353922367095947, "learning_rate": 0.0004606990482441746, "loss": 4.0785, "step": 305000 }, { "epoch": 1.62745850114002, "grad_norm": 0.8765130043029785, "learning_rate": 0.0004598072230704471, "loss": 4.0827, "step": 305500 }, { "epoch": 1.6301220993415586, "grad_norm": 0.7863726615905762, "learning_rate": 0.00045891718154706697, "loss": 4.0782, "step": 306000 }, { "epoch": 1.632785697543097, "grad_norm": 0.7965743541717529, "learning_rate": 0.0004580253563733394, "loss": 4.0751, "step": 306500 }, { "epoch": 1.6354492957446354, "grad_norm": 0.7712193131446838, "learning_rate": 0.0004571335311996119, "loss": 4.0775, "step": 307000 }, { "epoch": 1.638112893946174, "grad_norm": 0.8547102212905884, "learning_rate": 0.0004562417060258843, "loss": 4.0687, "step": 307500 }, { "epoch": 1.6407764921477126, "grad_norm": 0.794670581817627, "learning_rate": 0.00045535166450250423, "loss": 4.0809, "step": 308000 }, { "epoch": 1.643440090349251, "grad_norm": 0.8939191102981567, "learning_rate": 0.0004544598393287767, "loss": 4.0755, "step": 308500 }, { "epoch": 1.6461036885507894, "grad_norm": 0.830675482749939, "learning_rate": 0.00045356801415504915, "loss": 4.0849, "step": 309000 }, { "epoch": 1.648767286752328, "grad_norm": 0.8708091378211975, "learning_rate": 0.00045267618898132164, "loss": 4.0664, "step": 309500 }, { "epoch": 1.6514308849538666, "grad_norm": 0.7933617830276489, "learning_rate": 0.00045178436380759407, "loss": 4.0802, "step": 310000 }, { "epoch": 1.654094483155405, "grad_norm": 0.8032438158988953, "learning_rate": 0.000450894322284214, "loss": 4.0783, "step": 310500 }, { "epoch": 1.6567580813569434, "grad_norm": 0.8478823304176331, "learning_rate": 0.0004500024971104865, "loss": 4.0831, "step": 311000 }, { "epoch": 1.659421679558482, "grad_norm": 0.8288933634757996, "learning_rate": 0.0004491106719367589, "loss": 4.0801, "step": 311500 }, { "epoch": 1.6620852777600206, "grad_norm": 0.8561184406280518, "learning_rate": 0.0004482188467630314, "loss": 4.0788, "step": 312000 }, { "epoch": 1.664748875961559, "grad_norm": 0.9229483008384705, "learning_rate": 0.0004473270215893038, "loss": 4.0813, "step": 312500 }, { "epoch": 1.6674124741630973, "grad_norm": 0.8853760361671448, "learning_rate": 0.0004464369800659237, "loss": 4.0728, "step": 313000 }, { "epoch": 1.670076072364636, "grad_norm": 0.8472786545753479, "learning_rate": 0.0004455451548921962, "loss": 4.076, "step": 313500 }, { "epoch": 1.6727396705661746, "grad_norm": 0.834415853023529, "learning_rate": 0.0004446533297184686, "loss": 4.0776, "step": 314000 }, { "epoch": 1.675403268767713, "grad_norm": 0.8151890635490417, "learning_rate": 0.0004437615045447411, "loss": 4.0712, "step": 314500 }, { "epoch": 1.6780668669692513, "grad_norm": 0.8340436816215515, "learning_rate": 0.0004428696793710135, "loss": 4.0773, "step": 315000 }, { "epoch": 1.68073046517079, "grad_norm": 0.7873215079307556, "learning_rate": 0.00044197963784763344, "loss": 4.0796, "step": 315500 }, { "epoch": 1.6833940633723286, "grad_norm": 0.7956321835517883, "learning_rate": 0.0004410878126739059, "loss": 4.0738, "step": 316000 }, { "epoch": 1.686057661573867, "grad_norm": 0.8906182646751404, "learning_rate": 0.00044019598750017836, "loss": 4.0776, "step": 316500 }, { "epoch": 1.6887212597754053, "grad_norm": 0.8356565833091736, "learning_rate": 0.0004393041623264508, "loss": 4.0686, "step": 317000 }, { "epoch": 1.691384857976944, "grad_norm": 0.8309632539749146, "learning_rate": 0.0004384123371527233, "loss": 4.0786, "step": 317500 }, { "epoch": 1.6940484561784825, "grad_norm": 0.8648601770401001, "learning_rate": 0.0004375205119789957, "loss": 4.076, "step": 318000 }, { "epoch": 1.696712054380021, "grad_norm": 0.799662172794342, "learning_rate": 0.0004366304704556157, "loss": 4.0769, "step": 318500 }, { "epoch": 1.6993756525815593, "grad_norm": 0.884032130241394, "learning_rate": 0.0004357386452818881, "loss": 4.0742, "step": 319000 }, { "epoch": 1.702039250783098, "grad_norm": 0.8695617914199829, "learning_rate": 0.00043484682010816054, "loss": 4.0721, "step": 319500 }, { "epoch": 1.7047028489846365, "grad_norm": 0.801929235458374, "learning_rate": 0.00043395499493443303, "loss": 4.0722, "step": 320000 }, { "epoch": 1.7073664471861747, "grad_norm": 0.7920409440994263, "learning_rate": 0.00043306495341105295, "loss": 4.076, "step": 320500 }, { "epoch": 1.7100300453877133, "grad_norm": 0.821932852268219, "learning_rate": 0.00043217312823732543, "loss": 4.076, "step": 321000 }, { "epoch": 1.712693643589252, "grad_norm": 0.8553212881088257, "learning_rate": 0.00043128130306359786, "loss": 4.0748, "step": 321500 }, { "epoch": 1.7153572417907903, "grad_norm": 0.911418080329895, "learning_rate": 0.0004303894778898703, "loss": 4.0794, "step": 322000 }, { "epoch": 1.7180208399923287, "grad_norm": 0.8463834524154663, "learning_rate": 0.0004294976527161428, "loss": 4.0676, "step": 322500 }, { "epoch": 1.7206844381938673, "grad_norm": 0.8559086322784424, "learning_rate": 0.0004286058275424152, "loss": 4.0771, "step": 323000 }, { "epoch": 1.723348036395406, "grad_norm": 0.8981167674064636, "learning_rate": 0.0004277140023686877, "loss": 4.0688, "step": 323500 }, { "epoch": 1.7260116345969443, "grad_norm": 0.8651977181434631, "learning_rate": 0.00042682396084530756, "loss": 4.0728, "step": 324000 }, { "epoch": 1.7286752327984827, "grad_norm": 0.9066988229751587, "learning_rate": 0.00042593213567158, "loss": 4.072, "step": 324500 }, { "epoch": 1.7313388310000213, "grad_norm": 0.8543113470077515, "learning_rate": 0.0004250403104978525, "loss": 4.0727, "step": 325000 }, { "epoch": 1.73400242920156, "grad_norm": 0.8599368333816528, "learning_rate": 0.00042414848532412497, "loss": 4.0665, "step": 325500 }, { "epoch": 1.7366660274030983, "grad_norm": 0.8290531039237976, "learning_rate": 0.00042325666015039746, "loss": 4.0739, "step": 326000 }, { "epoch": 1.7393296256046367, "grad_norm": 0.8055272102355957, "learning_rate": 0.0004223666186270173, "loss": 4.0735, "step": 326500 }, { "epoch": 1.7419932238061753, "grad_norm": 0.8045780658721924, "learning_rate": 0.00042147479345328975, "loss": 4.071, "step": 327000 }, { "epoch": 1.7446568220077139, "grad_norm": 0.8758577108383179, "learning_rate": 0.00042058296827956224, "loss": 4.0735, "step": 327500 }, { "epoch": 1.7473204202092523, "grad_norm": 0.8138041496276855, "learning_rate": 0.00041969114310583467, "loss": 4.0686, "step": 328000 }, { "epoch": 1.7499840184107907, "grad_norm": 0.8927600979804993, "learning_rate": 0.0004188011015824546, "loss": 4.0749, "step": 328500 }, { "epoch": 1.7526476166123293, "grad_norm": 0.8370145559310913, "learning_rate": 0.00041790927640872707, "loss": 4.0723, "step": 329000 }, { "epoch": 1.7553112148138679, "grad_norm": 0.8793504238128662, "learning_rate": 0.0004170174512349995, "loss": 4.0674, "step": 329500 }, { "epoch": 1.7579748130154063, "grad_norm": 0.8913201689720154, "learning_rate": 0.000416125626061272, "loss": 4.0699, "step": 330000 }, { "epoch": 1.7606384112169446, "grad_norm": 0.8198757767677307, "learning_rate": 0.0004152338008875444, "loss": 4.0738, "step": 330500 }, { "epoch": 1.7633020094184833, "grad_norm": 0.8716715574264526, "learning_rate": 0.00041434375936416434, "loss": 4.0762, "step": 331000 }, { "epoch": 1.7659656076200219, "grad_norm": 0.8413424491882324, "learning_rate": 0.0004134519341904368, "loss": 4.0635, "step": 331500 }, { "epoch": 1.7686292058215602, "grad_norm": 0.838036060333252, "learning_rate": 0.00041256010901670926, "loss": 4.0731, "step": 332000 }, { "epoch": 1.7712928040230986, "grad_norm": 0.8625719547271729, "learning_rate": 0.00041166828384298174, "loss": 4.0765, "step": 332500 }, { "epoch": 1.7739564022246372, "grad_norm": 0.8333448171615601, "learning_rate": 0.0004107782423196016, "loss": 4.0691, "step": 333000 }, { "epoch": 1.7766200004261758, "grad_norm": 0.8514916300773621, "learning_rate": 0.00040988641714587403, "loss": 4.0682, "step": 333500 }, { "epoch": 1.7792835986277142, "grad_norm": 0.8220165371894836, "learning_rate": 0.0004089945919721465, "loss": 4.0796, "step": 334000 }, { "epoch": 1.7819471968292526, "grad_norm": 0.838065505027771, "learning_rate": 0.00040810276679841895, "loss": 4.0672, "step": 334500 }, { "epoch": 1.7846107950307912, "grad_norm": 0.8731646537780762, "learning_rate": 0.00040721272527503887, "loss": 4.0667, "step": 335000 }, { "epoch": 1.7872743932323298, "grad_norm": 0.8466665148735046, "learning_rate": 0.00040632090010131136, "loss": 4.0733, "step": 335500 }, { "epoch": 1.7899379914338682, "grad_norm": 0.9406811594963074, "learning_rate": 0.0004054290749275838, "loss": 4.0708, "step": 336000 }, { "epoch": 1.7926015896354066, "grad_norm": 0.8663309812545776, "learning_rate": 0.0004045372497538563, "loss": 4.0688, "step": 336500 }, { "epoch": 1.7952651878369452, "grad_norm": 0.8506413698196411, "learning_rate": 0.0004036454245801287, "loss": 4.0795, "step": 337000 }, { "epoch": 1.7979287860384838, "grad_norm": 0.8088420033454895, "learning_rate": 0.0004027553830567486, "loss": 4.0724, "step": 337500 }, { "epoch": 1.8005923842400222, "grad_norm": 0.8378006815910339, "learning_rate": 0.0004018635578830211, "loss": 4.0668, "step": 338000 }, { "epoch": 1.8032559824415606, "grad_norm": 0.8574025630950928, "learning_rate": 0.00040097173270929354, "loss": 4.0678, "step": 338500 }, { "epoch": 1.8059195806430992, "grad_norm": 0.8278779983520508, "learning_rate": 0.00040007990753556603, "loss": 4.0695, "step": 339000 }, { "epoch": 1.8085831788446376, "grad_norm": 0.9120043516159058, "learning_rate": 0.00039918986601218594, "loss": 4.0629, "step": 339500 }, { "epoch": 1.811246777046176, "grad_norm": 0.822943925857544, "learning_rate": 0.0003982980408384584, "loss": 4.0674, "step": 340000 }, { "epoch": 1.8139103752477146, "grad_norm": 0.8420679569244385, "learning_rate": 0.00039740621566473086, "loss": 4.0683, "step": 340500 }, { "epoch": 1.8165739734492532, "grad_norm": 0.8428717851638794, "learning_rate": 0.0003965143904910033, "loss": 4.0672, "step": 341000 }, { "epoch": 1.8192375716507916, "grad_norm": 0.8921811580657959, "learning_rate": 0.0003956225653172757, "loss": 4.0655, "step": 341500 }, { "epoch": 1.82190116985233, "grad_norm": 0.8687016367912292, "learning_rate": 0.0003947307401435482, "loss": 4.0712, "step": 342000 }, { "epoch": 1.8245647680538686, "grad_norm": 0.8464400172233582, "learning_rate": 0.0003938406986201681, "loss": 4.0687, "step": 342500 }, { "epoch": 1.8272283662554072, "grad_norm": 0.8673765063285828, "learning_rate": 0.00039294887344644056, "loss": 4.0628, "step": 343000 }, { "epoch": 1.8298919644569456, "grad_norm": 0.9040893316268921, "learning_rate": 0.000392057048272713, "loss": 4.0633, "step": 343500 }, { "epoch": 1.832555562658484, "grad_norm": 0.8810034394264221, "learning_rate": 0.0003911652230989854, "loss": 4.0637, "step": 344000 }, { "epoch": 1.8352191608600226, "grad_norm": 0.8870866894721985, "learning_rate": 0.0003902733979252579, "loss": 4.0712, "step": 344500 }, { "epoch": 1.8378827590615612, "grad_norm": 0.8724194169044495, "learning_rate": 0.0003893833564018778, "loss": 4.0761, "step": 345000 }, { "epoch": 1.8405463572630996, "grad_norm": 1.1327623128890991, "learning_rate": 0.00038849153122815026, "loss": 4.0656, "step": 345500 }, { "epoch": 1.843209955464638, "grad_norm": 0.8693875670433044, "learning_rate": 0.00038759970605442275, "loss": 4.0692, "step": 346000 }, { "epoch": 1.8458735536661766, "grad_norm": 0.9146456122398376, "learning_rate": 0.0003867078808806952, "loss": 4.0663, "step": 346500 }, { "epoch": 1.8485371518677152, "grad_norm": 0.8626604676246643, "learning_rate": 0.00038581605570696766, "loss": 4.0618, "step": 347000 }, { "epoch": 1.8512007500692536, "grad_norm": 1.0062013864517212, "learning_rate": 0.0003849242305332401, "loss": 4.0678, "step": 347500 }, { "epoch": 1.853864348270792, "grad_norm": 0.842510461807251, "learning_rate": 0.00038403418900986, "loss": 4.065, "step": 348000 }, { "epoch": 1.8565279464723305, "grad_norm": 0.8646286129951477, "learning_rate": 0.0003831423638361325, "loss": 4.0629, "step": 348500 }, { "epoch": 1.8591915446738692, "grad_norm": 0.8638767004013062, "learning_rate": 0.00038225053866240493, "loss": 4.0656, "step": 349000 }, { "epoch": 1.8618551428754075, "grad_norm": 0.8934078216552734, "learning_rate": 0.0003813587134886774, "loss": 4.0714, "step": 349500 }, { "epoch": 1.864518741076946, "grad_norm": 0.8266724944114685, "learning_rate": 0.00038046688831494985, "loss": 4.0645, "step": 350000 }, { "epoch": 1.8671823392784845, "grad_norm": 0.8602758646011353, "learning_rate": 0.00037957684679156977, "loss": 4.0642, "step": 350500 }, { "epoch": 1.8698459374800231, "grad_norm": 0.8677871823310852, "learning_rate": 0.00037868502161784225, "loss": 4.0685, "step": 351000 }, { "epoch": 1.8725095356815615, "grad_norm": 0.870879590511322, "learning_rate": 0.0003777931964441147, "loss": 4.0747, "step": 351500 }, { "epoch": 1.8751731338831, "grad_norm": 0.8714147806167603, "learning_rate": 0.00037690137127038717, "loss": 4.061, "step": 352000 }, { "epoch": 1.8778367320846385, "grad_norm": 0.8625131249427795, "learning_rate": 0.00037601132974700703, "loss": 4.06, "step": 352500 }, { "epoch": 1.8805003302861771, "grad_norm": 0.9685169458389282, "learning_rate": 0.00037511950457327946, "loss": 4.071, "step": 353000 }, { "epoch": 1.8831639284877155, "grad_norm": 0.9301902055740356, "learning_rate": 0.00037422767939955195, "loss": 4.0663, "step": 353500 }, { "epoch": 1.885827526689254, "grad_norm": 0.8485379219055176, "learning_rate": 0.0003733358542258244, "loss": 4.0709, "step": 354000 }, { "epoch": 1.8884911248907925, "grad_norm": 0.833081841468811, "learning_rate": 0.00037244402905209687, "loss": 4.0596, "step": 354500 }, { "epoch": 1.8911547230923311, "grad_norm": 0.8548697829246521, "learning_rate": 0.0003715539875287168, "loss": 4.0701, "step": 355000 }, { "epoch": 1.8938183212938695, "grad_norm": 0.8501580357551575, "learning_rate": 0.0003706621623549892, "loss": 4.0567, "step": 355500 }, { "epoch": 1.896481919495408, "grad_norm": 0.8642673492431641, "learning_rate": 0.0003697703371812617, "loss": 4.0621, "step": 356000 }, { "epoch": 1.8991455176969465, "grad_norm": 0.8171157240867615, "learning_rate": 0.00036887851200753414, "loss": 4.0542, "step": 356500 }, { "epoch": 1.901809115898485, "grad_norm": 0.873189389705658, "learning_rate": 0.00036798668683380657, "loss": 4.06, "step": 357000 }, { "epoch": 1.9044727141000233, "grad_norm": 0.8762955665588379, "learning_rate": 0.00036709664531042654, "loss": 4.063, "step": 357500 }, { "epoch": 1.9071363123015619, "grad_norm": 0.8550353050231934, "learning_rate": 0.00036620482013669897, "loss": 4.0597, "step": 358000 }, { "epoch": 1.9097999105031005, "grad_norm": 0.8709129691123962, "learning_rate": 0.00036531299496297146, "loss": 4.0578, "step": 358500 }, { "epoch": 1.9124635087046389, "grad_norm": 0.9054292440414429, "learning_rate": 0.0003644211697892439, "loss": 4.0589, "step": 359000 }, { "epoch": 1.9151271069061773, "grad_norm": 0.8816952705383301, "learning_rate": 0.0003635293446155163, "loss": 4.0563, "step": 359500 }, { "epoch": 1.9177907051077159, "grad_norm": 0.8601788282394409, "learning_rate": 0.0003626393030921363, "loss": 4.057, "step": 360000 }, { "epoch": 1.9204543033092545, "grad_norm": 0.933283269405365, "learning_rate": 0.0003617474779184087, "loss": 4.0688, "step": 360500 }, { "epoch": 1.9231179015107929, "grad_norm": 0.9095755815505981, "learning_rate": 0.0003608556527446812, "loss": 4.0531, "step": 361000 }, { "epoch": 1.9257814997123313, "grad_norm": 0.8889813423156738, "learning_rate": 0.00035996382757095364, "loss": 4.0638, "step": 361500 }, { "epoch": 1.9284450979138699, "grad_norm": 0.8663842678070068, "learning_rate": 0.0003590737860475735, "loss": 4.062, "step": 362000 }, { "epoch": 1.9311086961154085, "grad_norm": 0.8386211395263672, "learning_rate": 0.000358181960873846, "loss": 4.0561, "step": 362500 }, { "epoch": 1.9337722943169469, "grad_norm": 0.8373234868049622, "learning_rate": 0.0003572901357001184, "loss": 4.0666, "step": 363000 }, { "epoch": 1.9364358925184852, "grad_norm": 0.8931795954704285, "learning_rate": 0.00035639831052639085, "loss": 4.0554, "step": 363500 }, { "epoch": 1.9390994907200239, "grad_norm": 0.8433584570884705, "learning_rate": 0.0003555082690030108, "loss": 4.0583, "step": 364000 }, { "epoch": 1.9417630889215625, "grad_norm": 0.8926225900650024, "learning_rate": 0.00035461644382928326, "loss": 4.0585, "step": 364500 }, { "epoch": 1.9444266871231008, "grad_norm": 0.865616500377655, "learning_rate": 0.00035372461865555574, "loss": 4.0633, "step": 365000 }, { "epoch": 1.9470902853246392, "grad_norm": 0.8474301099777222, "learning_rate": 0.0003528327934818282, "loss": 4.0602, "step": 365500 }, { "epoch": 1.9497538835261778, "grad_norm": 0.8580695986747742, "learning_rate": 0.0003519427519584481, "loss": 4.0544, "step": 366000 }, { "epoch": 1.9524174817277165, "grad_norm": 0.8627407550811768, "learning_rate": 0.0003510509267847206, "loss": 4.0481, "step": 366500 }, { "epoch": 1.9550810799292548, "grad_norm": 0.8328742384910583, "learning_rate": 0.000350159101610993, "loss": 4.0581, "step": 367000 }, { "epoch": 1.9577446781307932, "grad_norm": 0.8515557050704956, "learning_rate": 0.0003492672764372655, "loss": 4.06, "step": 367500 }, { "epoch": 1.9604082763323318, "grad_norm": 0.9069979786872864, "learning_rate": 0.00034837545126353793, "loss": 4.0602, "step": 368000 }, { "epoch": 1.9630718745338704, "grad_norm": 0.8612348437309265, "learning_rate": 0.0003474854097401578, "loss": 4.0565, "step": 368500 }, { "epoch": 1.9657354727354088, "grad_norm": 0.9286240339279175, "learning_rate": 0.0003465935845664303, "loss": 4.0605, "step": 369000 }, { "epoch": 1.9683990709369472, "grad_norm": 0.8804614543914795, "learning_rate": 0.00034570175939270276, "loss": 4.0575, "step": 369500 }, { "epoch": 1.9710626691384858, "grad_norm": 0.8332533836364746, "learning_rate": 0.0003448099342189752, "loss": 4.0587, "step": 370000 }, { "epoch": 1.9737262673400244, "grad_norm": 0.8402279615402222, "learning_rate": 0.0003439198926955951, "loss": 4.0569, "step": 370500 }, { "epoch": 1.9763898655415628, "grad_norm": 0.8684757351875305, "learning_rate": 0.00034302806752186754, "loss": 4.0668, "step": 371000 }, { "epoch": 1.9790534637431012, "grad_norm": 0.880416750907898, "learning_rate": 0.00034213624234814003, "loss": 4.0612, "step": 371500 }, { "epoch": 1.9817170619446398, "grad_norm": 0.9281913042068481, "learning_rate": 0.00034124441717441246, "loss": 4.0583, "step": 372000 }, { "epoch": 1.9843806601461784, "grad_norm": 0.8712506294250488, "learning_rate": 0.0003403525920006849, "loss": 4.0539, "step": 372500 }, { "epoch": 1.9870442583477168, "grad_norm": 0.8760526180267334, "learning_rate": 0.00033946255047730486, "loss": 4.0502, "step": 373000 }, { "epoch": 1.9897078565492552, "grad_norm": 0.8705692291259766, "learning_rate": 0.0003385707253035773, "loss": 4.0592, "step": 373500 }, { "epoch": 1.9923714547507938, "grad_norm": 0.8519155383110046, "learning_rate": 0.00033767890012984973, "loss": 4.0607, "step": 374000 }, { "epoch": 1.9950350529523324, "grad_norm": 0.879636287689209, "learning_rate": 0.0003367870749561222, "loss": 4.0566, "step": 374500 }, { "epoch": 1.9976986511538706, "grad_norm": 0.8572770357131958, "learning_rate": 0.00033589703343274213, "loss": 4.0504, "step": 375000 }, { "epoch": 2.000362249355409, "grad_norm": 0.8497179746627808, "learning_rate": 0.0003350052082590146, "loss": 4.0603, "step": 375500 }, { "epoch": 2.003025847556948, "grad_norm": 0.8854038715362549, "learning_rate": 0.00033411338308528705, "loss": 4.055, "step": 376000 }, { "epoch": 2.0056894457584864, "grad_norm": 0.9853951334953308, "learning_rate": 0.0003332215579115595, "loss": 4.057, "step": 376500 }, { "epoch": 2.0083530439600246, "grad_norm": 0.9749231934547424, "learning_rate": 0.0003323315163881794, "loss": 4.0497, "step": 377000 }, { "epoch": 2.011016642161563, "grad_norm": 0.9801936745643616, "learning_rate": 0.00033143969121445183, "loss": 4.0609, "step": 377500 }, { "epoch": 2.013680240363102, "grad_norm": 0.9140198826789856, "learning_rate": 0.0003305478660407243, "loss": 4.0491, "step": 378000 }, { "epoch": 2.0163438385646404, "grad_norm": 0.9118580222129822, "learning_rate": 0.00032965604086699675, "loss": 4.0484, "step": 378500 }, { "epoch": 2.0190074367661786, "grad_norm": 1.0234750509262085, "learning_rate": 0.0003287642156932692, "loss": 4.0466, "step": 379000 }, { "epoch": 2.021671034967717, "grad_norm": 0.8892688751220703, "learning_rate": 0.00032787239051954167, "loss": 4.0569, "step": 379500 }, { "epoch": 2.0243346331692558, "grad_norm": 0.860365092754364, "learning_rate": 0.0003269823489961616, "loss": 4.0592, "step": 380000 }, { "epoch": 2.0269982313707944, "grad_norm": 0.8938810229301453, "learning_rate": 0.000326090523822434, "loss": 4.0523, "step": 380500 }, { "epoch": 2.0296618295723325, "grad_norm": 0.885435163974762, "learning_rate": 0.0003251986986487065, "loss": 4.0574, "step": 381000 }, { "epoch": 2.032325427773871, "grad_norm": 0.9123975038528442, "learning_rate": 0.00032430687347497893, "loss": 4.046, "step": 381500 }, { "epoch": 2.0349890259754098, "grad_norm": 0.9096443057060242, "learning_rate": 0.0003234168319515989, "loss": 4.0551, "step": 382000 }, { "epoch": 2.0376526241769484, "grad_norm": 0.8680484890937805, "learning_rate": 0.00032252500677787133, "loss": 4.0532, "step": 382500 }, { "epoch": 2.0403162223784865, "grad_norm": 0.8725469708442688, "learning_rate": 0.00032163318160414377, "loss": 4.0563, "step": 383000 }, { "epoch": 2.042979820580025, "grad_norm": 0.9647555947303772, "learning_rate": 0.00032074135643041625, "loss": 4.0536, "step": 383500 }, { "epoch": 2.0456434187815637, "grad_norm": 0.8826559782028198, "learning_rate": 0.0003198495312566887, "loss": 4.0527, "step": 384000 }, { "epoch": 2.0483070169831024, "grad_norm": 0.9342438578605652, "learning_rate": 0.0003189594897333086, "loss": 4.0607, "step": 384500 }, { "epoch": 2.0509706151846405, "grad_norm": 0.9360005855560303, "learning_rate": 0.0003180676645595811, "loss": 4.0472, "step": 385000 }, { "epoch": 2.053634213386179, "grad_norm": 0.9147686958312988, "learning_rate": 0.0003171758393858535, "loss": 4.0485, "step": 385500 }, { "epoch": 2.0562978115877177, "grad_norm": 0.8479260206222534, "learning_rate": 0.000316284014212126, "loss": 4.0504, "step": 386000 }, { "epoch": 2.058961409789256, "grad_norm": 0.8525492548942566, "learning_rate": 0.00031539218903839844, "loss": 4.0496, "step": 386500 }, { "epoch": 2.0616250079907945, "grad_norm": 0.8503657579421997, "learning_rate": 0.0003145021475150183, "loss": 4.0571, "step": 387000 }, { "epoch": 2.064288606192333, "grad_norm": 0.8873237371444702, "learning_rate": 0.0003136103223412908, "loss": 4.0511, "step": 387500 }, { "epoch": 2.0669522043938717, "grad_norm": 0.9111925959587097, "learning_rate": 0.0003127184971675632, "loss": 4.0477, "step": 388000 }, { "epoch": 2.06961580259541, "grad_norm": 0.864146888256073, "learning_rate": 0.0003118266719938357, "loss": 4.0526, "step": 388500 }, { "epoch": 2.0722794007969485, "grad_norm": 0.8477506637573242, "learning_rate": 0.00031093484682010814, "loss": 4.054, "step": 389000 }, { "epoch": 2.074942998998487, "grad_norm": 0.9023974537849426, "learning_rate": 0.00031004480529672805, "loss": 4.0579, "step": 389500 }, { "epoch": 2.0776065972000257, "grad_norm": 0.8909152150154114, "learning_rate": 0.00030915298012300054, "loss": 4.0521, "step": 390000 }, { "epoch": 2.080270195401564, "grad_norm": 0.9014437794685364, "learning_rate": 0.00030826115494927297, "loss": 4.0553, "step": 390500 }, { "epoch": 2.0829337936031025, "grad_norm": 0.8972243666648865, "learning_rate": 0.00030736932977554546, "loss": 4.0507, "step": 391000 }, { "epoch": 2.085597391804641, "grad_norm": 0.8825047016143799, "learning_rate": 0.0003064792882521654, "loss": 4.0526, "step": 391500 }, { "epoch": 2.0882609900061797, "grad_norm": 0.924751341342926, "learning_rate": 0.0003055874630784378, "loss": 4.0521, "step": 392000 }, { "epoch": 2.090924588207718, "grad_norm": 0.8999988436698914, "learning_rate": 0.0003046956379047103, "loss": 4.0524, "step": 392500 }, { "epoch": 2.0935881864092565, "grad_norm": 0.8595131635665894, "learning_rate": 0.0003038038127309827, "loss": 4.0519, "step": 393000 }, { "epoch": 2.096251784610795, "grad_norm": 0.9281662106513977, "learning_rate": 0.00030291377120760264, "loss": 4.0489, "step": 393500 }, { "epoch": 2.0989153828123337, "grad_norm": 0.8841512799263, "learning_rate": 0.0003020219460338751, "loss": 4.0504, "step": 394000 }, { "epoch": 2.101578981013872, "grad_norm": 0.8970746994018555, "learning_rate": 0.00030113012086014756, "loss": 4.0453, "step": 394500 }, { "epoch": 2.1042425792154105, "grad_norm": 0.946937084197998, "learning_rate": 0.00030023829568642005, "loss": 4.0443, "step": 395000 }, { "epoch": 2.106906177416949, "grad_norm": 1.066956877708435, "learning_rate": 0.0002993482541630399, "loss": 4.0591, "step": 395500 }, { "epoch": 2.1095697756184877, "grad_norm": 0.8527683615684509, "learning_rate": 0.00029845642898931234, "loss": 4.0498, "step": 396000 }, { "epoch": 2.112233373820026, "grad_norm": 0.9100342988967896, "learning_rate": 0.0002975646038155848, "loss": 4.0463, "step": 396500 }, { "epoch": 2.1148969720215645, "grad_norm": 0.9486255645751953, "learning_rate": 0.00029667277864185726, "loss": 4.0541, "step": 397000 }, { "epoch": 2.117560570223103, "grad_norm": 0.9460600018501282, "learning_rate": 0.00029578273711847717, "loss": 4.0481, "step": 397500 }, { "epoch": 2.1202241684246417, "grad_norm": 0.9710919857025146, "learning_rate": 0.00029489091194474966, "loss": 4.0486, "step": 398000 }, { "epoch": 2.12288776662618, "grad_norm": 0.9194395542144775, "learning_rate": 0.0002939990867710221, "loss": 4.0458, "step": 398500 }, { "epoch": 2.1255513648277184, "grad_norm": 0.8708109855651855, "learning_rate": 0.0002931072615972946, "loss": 4.0465, "step": 399000 }, { "epoch": 2.128214963029257, "grad_norm": 0.8814635276794434, "learning_rate": 0.0002922172200739145, "loss": 4.0441, "step": 399500 }, { "epoch": 2.1308785612307957, "grad_norm": 0.9306267499923706, "learning_rate": 0.0002913253949001869, "loss": 4.0417, "step": 400000 }, { "epoch": 2.133542159432334, "grad_norm": 0.9086319208145142, "learning_rate": 0.0002904335697264594, "loss": 4.0485, "step": 400500 }, { "epoch": 2.1362057576338724, "grad_norm": 0.9667945504188538, "learning_rate": 0.00028954174455273184, "loss": 4.0387, "step": 401000 }, { "epoch": 2.138869355835411, "grad_norm": 0.9225121736526489, "learning_rate": 0.00028864991937900433, "loss": 4.0424, "step": 401500 }, { "epoch": 2.1415329540369497, "grad_norm": 0.891379714012146, "learning_rate": 0.0002877598778556242, "loss": 4.046, "step": 402000 }, { "epoch": 2.144196552238488, "grad_norm": 0.9507352709770203, "learning_rate": 0.0002868680526818966, "loss": 4.0477, "step": 402500 }, { "epoch": 2.1468601504400264, "grad_norm": 0.9602506756782532, "learning_rate": 0.00028597622750816917, "loss": 4.0498, "step": 403000 }, { "epoch": 2.149523748641565, "grad_norm": 0.9250164031982422, "learning_rate": 0.0002850844023344416, "loss": 4.0404, "step": 403500 }, { "epoch": 2.152187346843103, "grad_norm": 0.917396605014801, "learning_rate": 0.00028419436081106146, "loss": 4.0488, "step": 404000 }, { "epoch": 2.154850945044642, "grad_norm": 0.8889843821525574, "learning_rate": 0.00028330253563733395, "loss": 4.0412, "step": 404500 }, { "epoch": 2.1575145432461804, "grad_norm": 0.9360488653182983, "learning_rate": 0.0002824107104636064, "loss": 4.0407, "step": 405000 }, { "epoch": 2.160178141447719, "grad_norm": 0.9107580184936523, "learning_rate": 0.00028151888528987886, "loss": 4.0439, "step": 405500 }, { "epoch": 2.162841739649257, "grad_norm": 0.9053534865379333, "learning_rate": 0.0002806270601161513, "loss": 4.042, "step": 406000 }, { "epoch": 2.165505337850796, "grad_norm": 0.8875529766082764, "learning_rate": 0.0002797370185927712, "loss": 4.0429, "step": 406500 }, { "epoch": 2.1681689360523344, "grad_norm": 0.9056974053382874, "learning_rate": 0.0002788451934190437, "loss": 4.0461, "step": 407000 }, { "epoch": 2.170832534253873, "grad_norm": 0.8870306015014648, "learning_rate": 0.00027795336824531613, "loss": 4.0473, "step": 407500 }, { "epoch": 2.173496132455411, "grad_norm": 0.9122534394264221, "learning_rate": 0.0002770615430715886, "loss": 4.0423, "step": 408000 }, { "epoch": 2.17615973065695, "grad_norm": 0.8884118795394897, "learning_rate": 0.00027617150154820853, "loss": 4.0455, "step": 408500 }, { "epoch": 2.1788233288584884, "grad_norm": 0.8788624405860901, "learning_rate": 0.00027527967637448096, "loss": 4.0396, "step": 409000 }, { "epoch": 2.181486927060027, "grad_norm": 0.9050582647323608, "learning_rate": 0.00027438785120075345, "loss": 4.0364, "step": 409500 }, { "epoch": 2.184150525261565, "grad_norm": 0.9116672277450562, "learning_rate": 0.0002734960260270259, "loss": 4.0479, "step": 410000 }, { "epoch": 2.1868141234631038, "grad_norm": 0.8476006984710693, "learning_rate": 0.00027260420085329837, "loss": 4.0407, "step": 410500 }, { "epoch": 2.1894777216646424, "grad_norm": 0.9175940752029419, "learning_rate": 0.00027171415932991823, "loss": 4.0469, "step": 411000 }, { "epoch": 2.192141319866181, "grad_norm": 0.9391987919807434, "learning_rate": 0.00027082233415619066, "loss": 4.0477, "step": 411500 }, { "epoch": 2.194804918067719, "grad_norm": 0.880539059638977, "learning_rate": 0.00026993050898246315, "loss": 4.0483, "step": 412000 }, { "epoch": 2.1974685162692578, "grad_norm": 0.9159991145133972, "learning_rate": 0.0002690386838087356, "loss": 4.0439, "step": 412500 }, { "epoch": 2.2001321144707964, "grad_norm": 0.846324622631073, "learning_rate": 0.0002681486422853555, "loss": 4.0491, "step": 413000 }, { "epoch": 2.202795712672335, "grad_norm": 0.9291318655014038, "learning_rate": 0.000267256817111628, "loss": 4.0433, "step": 413500 }, { "epoch": 2.205459310873873, "grad_norm": 0.9299983978271484, "learning_rate": 0.0002663649919379004, "loss": 4.039, "step": 414000 }, { "epoch": 2.2081229090754118, "grad_norm": 0.9034929275512695, "learning_rate": 0.0002654731667641729, "loss": 4.0426, "step": 414500 }, { "epoch": 2.2107865072769504, "grad_norm": 0.8487489223480225, "learning_rate": 0.0002645831252407928, "loss": 4.0382, "step": 415000 }, { "epoch": 2.213450105478489, "grad_norm": 0.9376189112663269, "learning_rate": 0.00026369130006706525, "loss": 4.0478, "step": 415500 }, { "epoch": 2.216113703680027, "grad_norm": 0.9032031297683716, "learning_rate": 0.00026279947489333774, "loss": 4.0446, "step": 416000 }, { "epoch": 2.2187773018815657, "grad_norm": 0.873349666595459, "learning_rate": 0.00026190764971961017, "loss": 4.0419, "step": 416500 }, { "epoch": 2.2214409000831044, "grad_norm": 0.9227972626686096, "learning_rate": 0.0002610176081962301, "loss": 4.0415, "step": 417000 }, { "epoch": 2.224104498284643, "grad_norm": 0.9360315203666687, "learning_rate": 0.00026012578302250257, "loss": 4.0391, "step": 417500 }, { "epoch": 2.226768096486181, "grad_norm": 1.0437467098236084, "learning_rate": 0.000259233957848775, "loss": 4.0425, "step": 418000 }, { "epoch": 2.2294316946877197, "grad_norm": 0.9248673319816589, "learning_rate": 0.0002583421326750475, "loss": 4.0413, "step": 418500 }, { "epoch": 2.2320952928892583, "grad_norm": 0.8973048329353333, "learning_rate": 0.00025745209115166735, "loss": 4.0411, "step": 419000 }, { "epoch": 2.234758891090797, "grad_norm": 0.9082027077674866, "learning_rate": 0.0002565602659779398, "loss": 4.0424, "step": 419500 }, { "epoch": 2.237422489292335, "grad_norm": 0.8980434536933899, "learning_rate": 0.00025566844080421227, "loss": 4.0389, "step": 420000 }, { "epoch": 2.2400860874938737, "grad_norm": 0.8749063014984131, "learning_rate": 0.0002547766156304847, "loss": 4.0283, "step": 420500 }, { "epoch": 2.2427496856954123, "grad_norm": 0.9931572675704956, "learning_rate": 0.0002538865741071046, "loss": 4.0411, "step": 421000 }, { "epoch": 2.2454132838969505, "grad_norm": 1.0000332593917847, "learning_rate": 0.0002529947489333771, "loss": 4.0426, "step": 421500 }, { "epoch": 2.248076882098489, "grad_norm": 0.8988611698150635, "learning_rate": 0.00025210292375964954, "loss": 4.0401, "step": 422000 }, { "epoch": 2.2507404803000277, "grad_norm": 0.9371945261955261, "learning_rate": 0.000251211098585922, "loss": 4.0367, "step": 422500 }, { "epoch": 2.2534040785015663, "grad_norm": 0.9270386099815369, "learning_rate": 0.00025031927341219446, "loss": 4.0481, "step": 423000 }, { "epoch": 2.256067676703105, "grad_norm": 0.964900553226471, "learning_rate": 0.00024942923188881437, "loss": 4.0381, "step": 423500 }, { "epoch": 2.258731274904643, "grad_norm": 0.8744553923606873, "learning_rate": 0.00024853740671508686, "loss": 4.0375, "step": 424000 }, { "epoch": 2.2613948731061817, "grad_norm": 0.9299191236495972, "learning_rate": 0.0002476455815413593, "loss": 4.036, "step": 424500 }, { "epoch": 2.2640584713077203, "grad_norm": 0.9264661073684692, "learning_rate": 0.0002467537563676318, "loss": 4.04, "step": 425000 }, { "epoch": 2.2667220695092585, "grad_norm": 0.9486096501350403, "learning_rate": 0.00024586371484425164, "loss": 4.0362, "step": 425500 }, { "epoch": 2.269385667710797, "grad_norm": 0.9084232449531555, "learning_rate": 0.0002449718896705241, "loss": 4.0442, "step": 426000 }, { "epoch": 2.2720492659123357, "grad_norm": 0.898169755935669, "learning_rate": 0.00024408006449679656, "loss": 4.04, "step": 426500 }, { "epoch": 2.2747128641138743, "grad_norm": 0.9344006180763245, "learning_rate": 0.00024318823932306902, "loss": 4.0393, "step": 427000 }, { "epoch": 2.2773764623154125, "grad_norm": 0.9698314666748047, "learning_rate": 0.00024229641414934147, "loss": 4.0293, "step": 427500 }, { "epoch": 2.280040060516951, "grad_norm": 0.9501084685325623, "learning_rate": 0.0002414063726259614, "loss": 4.038, "step": 428000 }, { "epoch": 2.2827036587184897, "grad_norm": 0.8912844061851501, "learning_rate": 0.00024051454745223385, "loss": 4.0374, "step": 428500 }, { "epoch": 2.2853672569200283, "grad_norm": 0.9317381978034973, "learning_rate": 0.0002396227222785063, "loss": 4.0353, "step": 429000 }, { "epoch": 2.2880308551215665, "grad_norm": 0.9316912889480591, "learning_rate": 0.00023873089710477877, "loss": 4.0383, "step": 429500 }, { "epoch": 2.290694453323105, "grad_norm": 0.9433039426803589, "learning_rate": 0.00023784085558139868, "loss": 4.0332, "step": 430000 }, { "epoch": 2.2933580515246437, "grad_norm": 0.9455925226211548, "learning_rate": 0.00023694903040767112, "loss": 4.0326, "step": 430500 }, { "epoch": 2.2960216497261823, "grad_norm": 0.9149669408798218, "learning_rate": 0.00023605720523394358, "loss": 4.0442, "step": 431000 }, { "epoch": 2.2986852479277204, "grad_norm": 0.9723134636878967, "learning_rate": 0.00023516538006021603, "loss": 4.0313, "step": 431500 }, { "epoch": 2.301348846129259, "grad_norm": 0.9359349012374878, "learning_rate": 0.00023427533853683595, "loss": 4.0369, "step": 432000 }, { "epoch": 2.3040124443307977, "grad_norm": 0.9478726983070374, "learning_rate": 0.0002333835133631084, "loss": 4.0386, "step": 432500 }, { "epoch": 2.3066760425323363, "grad_norm": 0.9433446526527405, "learning_rate": 0.00023249168818938084, "loss": 4.0334, "step": 433000 }, { "epoch": 2.3093396407338744, "grad_norm": 0.9548355340957642, "learning_rate": 0.00023159986301565333, "loss": 4.0404, "step": 433500 }, { "epoch": 2.312003238935413, "grad_norm": 1.014600157737732, "learning_rate": 0.0002307080378419258, "loss": 4.0337, "step": 434000 }, { "epoch": 2.3146668371369516, "grad_norm": 0.8967020511627197, "learning_rate": 0.0002298179963185457, "loss": 4.0343, "step": 434500 }, { "epoch": 2.31733043533849, "grad_norm": 1.0393925905227661, "learning_rate": 0.00022892617114481814, "loss": 4.0354, "step": 435000 }, { "epoch": 2.3199940335400284, "grad_norm": 0.9963262677192688, "learning_rate": 0.0002280343459710906, "loss": 4.0358, "step": 435500 }, { "epoch": 2.322657631741567, "grad_norm": 0.9155731797218323, "learning_rate": 0.00022714252079736305, "loss": 4.0372, "step": 436000 }, { "epoch": 2.3253212299431056, "grad_norm": 0.9272859692573547, "learning_rate": 0.00022625247927398297, "loss": 4.04, "step": 436500 }, { "epoch": 2.3279848281446442, "grad_norm": 0.9763675928115845, "learning_rate": 0.0002253606541002554, "loss": 4.0312, "step": 437000 }, { "epoch": 2.3306484263461824, "grad_norm": 0.9596668481826782, "learning_rate": 0.00022446882892652786, "loss": 4.0337, "step": 437500 }, { "epoch": 2.333312024547721, "grad_norm": 0.9284877777099609, "learning_rate": 0.00022357700375280032, "loss": 4.0386, "step": 438000 }, { "epoch": 2.3359756227492596, "grad_norm": 0.9726400971412659, "learning_rate": 0.00022268696222942026, "loss": 4.0354, "step": 438500 }, { "epoch": 2.338639220950798, "grad_norm": 0.9305101037025452, "learning_rate": 0.0002217951370556927, "loss": 4.0213, "step": 439000 }, { "epoch": 2.3413028191523364, "grad_norm": 0.9207624793052673, "learning_rate": 0.00022090331188196515, "loss": 4.0388, "step": 439500 }, { "epoch": 2.343966417353875, "grad_norm": 0.940703809261322, "learning_rate": 0.00022001148670823761, "loss": 4.0303, "step": 440000 }, { "epoch": 2.3466300155554136, "grad_norm": 1.0912624597549438, "learning_rate": 0.00021912144518485753, "loss": 4.0319, "step": 440500 }, { "epoch": 2.3492936137569522, "grad_norm": 0.9056357145309448, "learning_rate": 0.00021822962001113, "loss": 4.0326, "step": 441000 }, { "epoch": 2.3519572119584904, "grad_norm": 0.891265332698822, "learning_rate": 0.00021733779483740242, "loss": 4.0398, "step": 441500 }, { "epoch": 2.354620810160029, "grad_norm": 0.9790766835212708, "learning_rate": 0.00021644596966367488, "loss": 4.0352, "step": 442000 }, { "epoch": 2.3572844083615676, "grad_norm": 0.9584769010543823, "learning_rate": 0.00021555414448994734, "loss": 4.0393, "step": 442500 }, { "epoch": 2.3599480065631058, "grad_norm": 0.9171414971351624, "learning_rate": 0.00021466410296656728, "loss": 4.0384, "step": 443000 }, { "epoch": 2.3626116047646444, "grad_norm": 0.9353621006011963, "learning_rate": 0.00021377227779283972, "loss": 4.0247, "step": 443500 }, { "epoch": 2.365275202966183, "grad_norm": 1.1184170246124268, "learning_rate": 0.00021288045261911217, "loss": 4.0374, "step": 444000 }, { "epoch": 2.3679388011677216, "grad_norm": 0.9417023062705994, "learning_rate": 0.00021198862744538463, "loss": 4.0279, "step": 444500 }, { "epoch": 2.3706023993692598, "grad_norm": 1.0378462076187134, "learning_rate": 0.00021109858592200455, "loss": 4.0357, "step": 445000 }, { "epoch": 2.3732659975707984, "grad_norm": 0.9642356634140015, "learning_rate": 0.00021020676074827698, "loss": 4.0334, "step": 445500 }, { "epoch": 2.375929595772337, "grad_norm": 0.970891535282135, "learning_rate": 0.00020931493557454944, "loss": 4.025, "step": 446000 }, { "epoch": 2.3785931939738756, "grad_norm": 0.9346612691879272, "learning_rate": 0.0002084231104008219, "loss": 4.0255, "step": 446500 }, { "epoch": 2.3812567921754138, "grad_norm": 0.9348496794700623, "learning_rate": 0.00020753128522709436, "loss": 4.0305, "step": 447000 }, { "epoch": 2.3839203903769524, "grad_norm": 0.9465219974517822, "learning_rate": 0.00020664124370371428, "loss": 4.0279, "step": 447500 }, { "epoch": 2.386583988578491, "grad_norm": 0.9686950445175171, "learning_rate": 0.00020574941852998673, "loss": 4.038, "step": 448000 }, { "epoch": 2.3892475867800296, "grad_norm": 0.8983688354492188, "learning_rate": 0.0002048575933562592, "loss": 4.0302, "step": 448500 }, { "epoch": 2.3919111849815677, "grad_norm": 0.9491548538208008, "learning_rate": 0.00020396576818253165, "loss": 4.0302, "step": 449000 }, { "epoch": 2.3945747831831063, "grad_norm": 0.9248127341270447, "learning_rate": 0.00020307572665915154, "loss": 4.0338, "step": 449500 }, { "epoch": 2.397238381384645, "grad_norm": 0.9573125243186951, "learning_rate": 0.000202183901485424, "loss": 4.0337, "step": 450000 }, { "epoch": 2.3999019795861836, "grad_norm": 0.9655391573905945, "learning_rate": 0.00020129207631169646, "loss": 4.0338, "step": 450500 }, { "epoch": 2.4025655777877217, "grad_norm": 0.9134914875030518, "learning_rate": 0.00020040025113796892, "loss": 4.0241, "step": 451000 }, { "epoch": 2.4052291759892603, "grad_norm": 0.9635368585586548, "learning_rate": 0.00019951020961458886, "loss": 4.0357, "step": 451500 }, { "epoch": 2.407892774190799, "grad_norm": 0.9742798805236816, "learning_rate": 0.0001986183844408613, "loss": 4.0242, "step": 452000 }, { "epoch": 2.4105563723923376, "grad_norm": 0.9775349497795105, "learning_rate": 0.00019772655926713375, "loss": 4.0279, "step": 452500 }, { "epoch": 2.4132199705938757, "grad_norm": 0.9313619136810303, "learning_rate": 0.0001968347340934062, "loss": 4.03, "step": 453000 }, { "epoch": 2.4158835687954143, "grad_norm": 0.9796269536018372, "learning_rate": 0.00019594469257002613, "loss": 4.0254, "step": 453500 }, { "epoch": 2.418547166996953, "grad_norm": 0.9695695042610168, "learning_rate": 0.00019505286739629856, "loss": 4.0353, "step": 454000 }, { "epoch": 2.4212107651984915, "grad_norm": 0.9753876328468323, "learning_rate": 0.00019416104222257102, "loss": 4.0269, "step": 454500 }, { "epoch": 2.4238743634000297, "grad_norm": 0.9220411777496338, "learning_rate": 0.00019326921704884348, "loss": 4.0289, "step": 455000 }, { "epoch": 2.4265379616015683, "grad_norm": 0.9355341196060181, "learning_rate": 0.0001923791755254634, "loss": 4.0297, "step": 455500 }, { "epoch": 2.429201559803107, "grad_norm": 1.0068522691726685, "learning_rate": 0.00019148735035173583, "loss": 4.0332, "step": 456000 }, { "epoch": 2.431865158004645, "grad_norm": 0.9809306263923645, "learning_rate": 0.00019059552517800831, "loss": 4.025, "step": 456500 }, { "epoch": 2.4345287562061837, "grad_norm": 0.9140877723693848, "learning_rate": 0.00018970370000428077, "loss": 4.0237, "step": 457000 }, { "epoch": 2.4371923544077223, "grad_norm": 0.942362368106842, "learning_rate": 0.00018881187483055323, "loss": 4.0299, "step": 457500 }, { "epoch": 2.439855952609261, "grad_norm": 1.0030492544174194, "learning_rate": 0.00018792183330717312, "loss": 4.0241, "step": 458000 }, { "epoch": 2.4425195508107995, "grad_norm": 0.9555344581604004, "learning_rate": 0.00018703000813344558, "loss": 4.0269, "step": 458500 }, { "epoch": 2.4451831490123377, "grad_norm": 0.9068697690963745, "learning_rate": 0.00018613818295971804, "loss": 4.0273, "step": 459000 }, { "epoch": 2.4478467472138763, "grad_norm": 1.026928186416626, "learning_rate": 0.0001852463577859905, "loss": 4.0271, "step": 459500 }, { "epoch": 2.450510345415415, "grad_norm": 1.0138953924179077, "learning_rate": 0.00018435631626261041, "loss": 4.0273, "step": 460000 }, { "epoch": 2.453173943616953, "grad_norm": 0.9750286936759949, "learning_rate": 0.00018346449108888285, "loss": 4.0304, "step": 460500 }, { "epoch": 2.4558375418184917, "grad_norm": 0.9891506433486938, "learning_rate": 0.0001825726659151553, "loss": 4.028, "step": 461000 }, { "epoch": 2.4585011400200303, "grad_norm": 0.9331740140914917, "learning_rate": 0.00018168084074142777, "loss": 4.0259, "step": 461500 }, { "epoch": 2.461164738221569, "grad_norm": 0.9839907288551331, "learning_rate": 0.00018078901556770025, "loss": 4.0299, "step": 462000 }, { "epoch": 2.463828336423107, "grad_norm": 1.092699408531189, "learning_rate": 0.00017989897404432014, "loss": 4.0279, "step": 462500 }, { "epoch": 2.4664919346246457, "grad_norm": 0.9484713673591614, "learning_rate": 0.0001790071488705926, "loss": 4.0141, "step": 463000 }, { "epoch": 2.4691555328261843, "grad_norm": 0.9671944975852966, "learning_rate": 0.00017811532369686506, "loss": 4.0262, "step": 463500 }, { "epoch": 2.471819131027723, "grad_norm": 0.9488347172737122, "learning_rate": 0.00017722349852313752, "loss": 4.0197, "step": 464000 }, { "epoch": 2.474482729229261, "grad_norm": 0.9663012623786926, "learning_rate": 0.0001763334569997574, "loss": 4.0238, "step": 464500 }, { "epoch": 2.4771463274307997, "grad_norm": 0.9515085220336914, "learning_rate": 0.00017544163182602987, "loss": 4.0248, "step": 465000 }, { "epoch": 2.4798099256323383, "grad_norm": 0.969129204750061, "learning_rate": 0.00017454980665230233, "loss": 4.027, "step": 465500 }, { "epoch": 2.482473523833877, "grad_norm": 0.9723744988441467, "learning_rate": 0.00017365798147857479, "loss": 4.0223, "step": 466000 }, { "epoch": 2.485137122035415, "grad_norm": 0.9454832673072815, "learning_rate": 0.0001727679399551947, "loss": 4.0257, "step": 466500 }, { "epoch": 2.4878007202369536, "grad_norm": 0.9404035210609436, "learning_rate": 0.00017187611478146716, "loss": 4.0292, "step": 467000 }, { "epoch": 2.4904643184384923, "grad_norm": 0.9745790362358093, "learning_rate": 0.00017098428960773962, "loss": 4.027, "step": 467500 }, { "epoch": 2.493127916640031, "grad_norm": 0.952643871307373, "learning_rate": 0.00017009246443401208, "loss": 4.0259, "step": 468000 }, { "epoch": 2.495791514841569, "grad_norm": 1.0002975463867188, "learning_rate": 0.000169202422910632, "loss": 4.0286, "step": 468500 }, { "epoch": 2.4984551130431076, "grad_norm": 0.9904667139053345, "learning_rate": 0.00016831059773690443, "loss": 4.0233, "step": 469000 }, { "epoch": 2.5011187112446462, "grad_norm": 0.9523800015449524, "learning_rate": 0.00016741877256317689, "loss": 4.0205, "step": 469500 }, { "epoch": 2.5037823094461844, "grad_norm": 1.111253023147583, "learning_rate": 0.00016652694738944935, "loss": 4.0211, "step": 470000 }, { "epoch": 2.506445907647723, "grad_norm": 0.9411515593528748, "learning_rate": 0.0001656369058660693, "loss": 4.0276, "step": 470500 }, { "epoch": 2.5091095058492616, "grad_norm": 0.9541642665863037, "learning_rate": 0.00016474508069234172, "loss": 4.0248, "step": 471000 }, { "epoch": 2.5117731040508002, "grad_norm": 1.016478180885315, "learning_rate": 0.00016385325551861418, "loss": 4.0253, "step": 471500 }, { "epoch": 2.514436702252339, "grad_norm": 0.9605896472930908, "learning_rate": 0.00016296143034488664, "loss": 4.0201, "step": 472000 }, { "epoch": 2.517100300453877, "grad_norm": 0.9732680916786194, "learning_rate": 0.00016207138882150655, "loss": 4.02, "step": 472500 }, { "epoch": 2.5197638986554156, "grad_norm": 0.9240507483482361, "learning_rate": 0.000161179563647779, "loss": 4.0156, "step": 473000 }, { "epoch": 2.522427496856954, "grad_norm": 1.063936471939087, "learning_rate": 0.00016028773847405145, "loss": 4.0252, "step": 473500 }, { "epoch": 2.5250910950584924, "grad_norm": 0.9789932370185852, "learning_rate": 0.0001593959133003239, "loss": 4.0243, "step": 474000 }, { "epoch": 2.527754693260031, "grad_norm": 0.9427129030227661, "learning_rate": 0.00015850587177694385, "loss": 4.0193, "step": 474500 }, { "epoch": 2.5304182914615696, "grad_norm": 1.0714107751846313, "learning_rate": 0.00015761404660321628, "loss": 4.0165, "step": 475000 }, { "epoch": 2.533081889663108, "grad_norm": 0.9931527376174927, "learning_rate": 0.00015672222142948874, "loss": 4.0236, "step": 475500 }, { "epoch": 2.535745487864647, "grad_norm": 0.9835180640220642, "learning_rate": 0.0001558303962557612, "loss": 4.0227, "step": 476000 }, { "epoch": 2.538409086066185, "grad_norm": 1.021427869796753, "learning_rate": 0.00015493857108203366, "loss": 4.0233, "step": 476500 }, { "epoch": 2.5410726842677236, "grad_norm": 1.2135415077209473, "learning_rate": 0.00015404852955865357, "loss": 4.0206, "step": 477000 }, { "epoch": 2.543736282469262, "grad_norm": 1.0140650272369385, "learning_rate": 0.000153156704384926, "loss": 4.0232, "step": 477500 }, { "epoch": 2.5463998806708004, "grad_norm": 1.0078463554382324, "learning_rate": 0.00015226487921119847, "loss": 4.0182, "step": 478000 }, { "epoch": 2.549063478872339, "grad_norm": 1.0854226350784302, "learning_rate": 0.00015137305403747092, "loss": 4.019, "step": 478500 }, { "epoch": 2.5517270770738776, "grad_norm": 0.9886216521263123, "learning_rate": 0.00015048301251409084, "loss": 4.0224, "step": 479000 }, { "epoch": 2.554390675275416, "grad_norm": 1.0139665603637695, "learning_rate": 0.0001495911873403633, "loss": 4.0129, "step": 479500 }, { "epoch": 2.557054273476955, "grad_norm": 0.9683591723442078, "learning_rate": 0.00014869936216663576, "loss": 4.017, "step": 480000 }, { "epoch": 2.559717871678493, "grad_norm": 1.039494276046753, "learning_rate": 0.00014780753699290822, "loss": 4.0145, "step": 480500 }, { "epoch": 2.5623814698800316, "grad_norm": 1.0008569955825806, "learning_rate": 0.00014691749546952813, "loss": 4.0191, "step": 481000 }, { "epoch": 2.56504506808157, "grad_norm": 0.9593690037727356, "learning_rate": 0.00014602567029580057, "loss": 4.0247, "step": 481500 }, { "epoch": 2.5677086662831083, "grad_norm": 0.9470319747924805, "learning_rate": 0.00014513384512207303, "loss": 4.0227, "step": 482000 }, { "epoch": 2.570372264484647, "grad_norm": 1.0550135374069214, "learning_rate": 0.00014424201994834549, "loss": 4.0201, "step": 482500 }, { "epoch": 2.5730358626861856, "grad_norm": 1.0270289182662964, "learning_rate": 0.0001433519784249654, "loss": 4.0155, "step": 483000 }, { "epoch": 2.575699460887724, "grad_norm": 1.0669533014297485, "learning_rate": 0.00014246015325123783, "loss": 4.0256, "step": 483500 }, { "epoch": 2.5783630590892628, "grad_norm": 0.9935122132301331, "learning_rate": 0.0001415683280775103, "loss": 4.0131, "step": 484000 }, { "epoch": 2.581026657290801, "grad_norm": 1.0519307851791382, "learning_rate": 0.00014067650290378275, "loss": 4.0225, "step": 484500 }, { "epoch": 2.5836902554923395, "grad_norm": 0.9848348498344421, "learning_rate": 0.0001397864613804027, "loss": 4.0173, "step": 485000 }, { "epoch": 2.586353853693878, "grad_norm": 0.9730287194252014, "learning_rate": 0.00013889463620667515, "loss": 4.0184, "step": 485500 }, { "epoch": 2.5890174518954163, "grad_norm": 1.023484706878662, "learning_rate": 0.00013800281103294759, "loss": 4.0183, "step": 486000 }, { "epoch": 2.591681050096955, "grad_norm": 0.9631215929985046, "learning_rate": 0.00013711098585922005, "loss": 4.0186, "step": 486500 }, { "epoch": 2.5943446482984935, "grad_norm": 0.9774326682090759, "learning_rate": 0.00013622094433583996, "loss": 4.0212, "step": 487000 }, { "epoch": 2.5970082465000317, "grad_norm": 1.052068829536438, "learning_rate": 0.00013532911916211242, "loss": 4.0183, "step": 487500 }, { "epoch": 2.5996718447015703, "grad_norm": 0.9873191714286804, "learning_rate": 0.00013443729398838485, "loss": 4.0241, "step": 488000 }, { "epoch": 2.602335442903109, "grad_norm": 1.1005477905273438, "learning_rate": 0.0001335454688146573, "loss": 4.017, "step": 488500 }, { "epoch": 2.6049990411046475, "grad_norm": 0.9617475271224976, "learning_rate": 0.00013265542729127725, "loss": 4.0207, "step": 489000 }, { "epoch": 2.607662639306186, "grad_norm": 0.9862669706344604, "learning_rate": 0.0001317636021175497, "loss": 4.0168, "step": 489500 }, { "epoch": 2.6103262375077243, "grad_norm": 0.9720093011856079, "learning_rate": 0.00013087177694382215, "loss": 4.0058, "step": 490000 }, { "epoch": 2.612989835709263, "grad_norm": 0.9520342350006104, "learning_rate": 0.0001299799517700946, "loss": 4.0146, "step": 490500 }, { "epoch": 2.6156534339108015, "grad_norm": 1.054432988166809, "learning_rate": 0.00012908991024671452, "loss": 4.0105, "step": 491000 }, { "epoch": 2.6183170321123397, "grad_norm": 0.9796612858772278, "learning_rate": 0.00012819808507298698, "loss": 4.0114, "step": 491500 }, { "epoch": 2.6209806303138783, "grad_norm": 1.0970081090927124, "learning_rate": 0.0001273062598992594, "loss": 4.0232, "step": 492000 }, { "epoch": 2.623644228515417, "grad_norm": 0.9749308228492737, "learning_rate": 0.00012641443472553187, "loss": 4.009, "step": 492500 }, { "epoch": 2.6263078267169555, "grad_norm": 1.0011272430419922, "learning_rate": 0.00012552439320215181, "loss": 4.0182, "step": 493000 }, { "epoch": 2.628971424918494, "grad_norm": 0.9727855920791626, "learning_rate": 0.00012463256802842425, "loss": 4.0142, "step": 493500 }, { "epoch": 2.6316350231200323, "grad_norm": 1.054745078086853, "learning_rate": 0.0001237407428546967, "loss": 4.0153, "step": 494000 }, { "epoch": 2.634298621321571, "grad_norm": 0.9852134585380554, "learning_rate": 0.00012284891768096917, "loss": 4.0202, "step": 494500 }, { "epoch": 2.6369622195231095, "grad_norm": 1.0056986808776855, "learning_rate": 0.00012195887615758908, "loss": 4.0187, "step": 495000 }, { "epoch": 2.6396258177246477, "grad_norm": 0.9925665259361267, "learning_rate": 0.00012106705098386153, "loss": 4.0102, "step": 495500 }, { "epoch": 2.6422894159261863, "grad_norm": 0.9884349703788757, "learning_rate": 0.00012017522581013399, "loss": 4.0161, "step": 496000 }, { "epoch": 2.644953014127725, "grad_norm": 0.9753773808479309, "learning_rate": 0.00011928340063640645, "loss": 4.0122, "step": 496500 }, { "epoch": 2.6476166123292635, "grad_norm": 1.0602976083755493, "learning_rate": 0.00011839157546267889, "loss": 4.0148, "step": 497000 }, { "epoch": 2.650280210530802, "grad_norm": 1.024678349494934, "learning_rate": 0.00011750153393929882, "loss": 4.0148, "step": 497500 }, { "epoch": 2.6529438087323403, "grad_norm": 1.0422247648239136, "learning_rate": 0.00011660970876557127, "loss": 4.0139, "step": 498000 }, { "epoch": 2.655607406933879, "grad_norm": 0.9945011734962463, "learning_rate": 0.00011571788359184373, "loss": 4.0098, "step": 498500 }, { "epoch": 2.6582710051354175, "grad_norm": 0.9866018891334534, "learning_rate": 0.00011482605841811617, "loss": 4.0151, "step": 499000 }, { "epoch": 2.6609346033369556, "grad_norm": 1.071170449256897, "learning_rate": 0.0001139360168947361, "loss": 4.016, "step": 499500 }, { "epoch": 2.6635982015384942, "grad_norm": 1.120274543762207, "learning_rate": 0.00011304419172100855, "loss": 4.0115, "step": 500000 }, { "epoch": 2.666261799740033, "grad_norm": 1.0567705631256104, "learning_rate": 0.000112152366547281, "loss": 4.012, "step": 500500 }, { "epoch": 2.6689253979415715, "grad_norm": 0.9878965020179749, "learning_rate": 0.00011126054137355346, "loss": 4.0176, "step": 501000 }, { "epoch": 2.67158899614311, "grad_norm": 1.064886212348938, "learning_rate": 0.00011037049985017338, "loss": 4.0103, "step": 501500 }, { "epoch": 2.6742525943446482, "grad_norm": 1.0028510093688965, "learning_rate": 0.00010947867467644583, "loss": 4.0122, "step": 502000 }, { "epoch": 2.676916192546187, "grad_norm": 1.0561763048171997, "learning_rate": 0.00010858684950271829, "loss": 4.0078, "step": 502500 }, { "epoch": 2.6795797907477255, "grad_norm": 0.9861183166503906, "learning_rate": 0.00010769502432899074, "loss": 4.0162, "step": 503000 }, { "epoch": 2.6822433889492636, "grad_norm": 1.0413438081741333, "learning_rate": 0.00010680498280561066, "loss": 4.0205, "step": 503500 }, { "epoch": 2.6849069871508022, "grad_norm": 0.9923077821731567, "learning_rate": 0.0001059131576318831, "loss": 4.0078, "step": 504000 }, { "epoch": 2.687570585352341, "grad_norm": 0.9952608346939087, "learning_rate": 0.00010502133245815557, "loss": 4.0078, "step": 504500 }, { "epoch": 2.690234183553879, "grad_norm": 1.0345313549041748, "learning_rate": 0.00010412950728442802, "loss": 4.0118, "step": 505000 }, { "epoch": 2.6928977817554176, "grad_norm": 0.9837112426757812, "learning_rate": 0.00010323946576104794, "loss": 4.0108, "step": 505500 }, { "epoch": 2.695561379956956, "grad_norm": 1.0294288396835327, "learning_rate": 0.00010234764058732039, "loss": 4.0074, "step": 506000 }, { "epoch": 2.698224978158495, "grad_norm": 1.0430691242218018, "learning_rate": 0.00010145581541359285, "loss": 4.008, "step": 506500 }, { "epoch": 2.7008885763600334, "grad_norm": 1.006121039390564, "learning_rate": 0.0001005639902398653, "loss": 4.0022, "step": 507000 }, { "epoch": 2.7035521745615716, "grad_norm": 1.0028232336044312, "learning_rate": 9.967216506613775e-05, "loss": 4.0164, "step": 507500 }, { "epoch": 2.70621577276311, "grad_norm": 0.9883862733840942, "learning_rate": 9.878212354275768e-05, "loss": 4.0104, "step": 508000 }, { "epoch": 2.708879370964649, "grad_norm": 1.087190866470337, "learning_rate": 9.789029836903013e-05, "loss": 4.0132, "step": 508500 }, { "epoch": 2.711542969166187, "grad_norm": 1.0679038763046265, "learning_rate": 9.699847319530258e-05, "loss": 4.0105, "step": 509000 }, { "epoch": 2.7142065673677256, "grad_norm": 0.9755781888961792, "learning_rate": 9.610664802157504e-05, "loss": 4.0141, "step": 509500 }, { "epoch": 2.716870165569264, "grad_norm": 1.09120512008667, "learning_rate": 9.521660649819495e-05, "loss": 4.0138, "step": 510000 }, { "epoch": 2.719533763770803, "grad_norm": 1.0885505676269531, "learning_rate": 9.43247813244674e-05, "loss": 4.0065, "step": 510500 }, { "epoch": 2.7221973619723414, "grad_norm": 0.9858110547065735, "learning_rate": 9.343295615073986e-05, "loss": 4.0082, "step": 511000 }, { "epoch": 2.7248609601738796, "grad_norm": 1.0929360389709473, "learning_rate": 9.254113097701232e-05, "loss": 4.0107, "step": 511500 }, { "epoch": 2.727524558375418, "grad_norm": 1.139798641204834, "learning_rate": 9.165108945363223e-05, "loss": 4.0113, "step": 512000 }, { "epoch": 2.730188156576957, "grad_norm": 1.009216070175171, "learning_rate": 9.075926427990467e-05, "loss": 4.0065, "step": 512500 }, { "epoch": 2.732851754778495, "grad_norm": 1.047379732131958, "learning_rate": 8.986743910617714e-05, "loss": 4.0164, "step": 513000 }, { "epoch": 2.7355153529800336, "grad_norm": 0.9918530583381653, "learning_rate": 8.89756139324496e-05, "loss": 4.0016, "step": 513500 }, { "epoch": 2.738178951181572, "grad_norm": 1.0664864778518677, "learning_rate": 8.80855724090695e-05, "loss": 4.0112, "step": 514000 }, { "epoch": 2.740842549383111, "grad_norm": 1.0139024257659912, "learning_rate": 8.719374723534195e-05, "loss": 4.014, "step": 514500 }, { "epoch": 2.7435061475846494, "grad_norm": 1.0350786447525024, "learning_rate": 8.630192206161441e-05, "loss": 4.0062, "step": 515000 }, { "epoch": 2.7461697457861876, "grad_norm": 1.1327440738677979, "learning_rate": 8.541009688788688e-05, "loss": 4.0072, "step": 515500 }, { "epoch": 2.748833343987726, "grad_norm": 1.0807819366455078, "learning_rate": 8.452005536450679e-05, "loss": 4.0037, "step": 516000 }, { "epoch": 2.7514969421892648, "grad_norm": 0.9618473649024963, "learning_rate": 8.362823019077925e-05, "loss": 4.0069, "step": 516500 }, { "epoch": 2.754160540390803, "grad_norm": 1.0459738969802856, "learning_rate": 8.273640501705169e-05, "loss": 4.0066, "step": 517000 }, { "epoch": 2.7568241385923415, "grad_norm": 0.9917722940444946, "learning_rate": 8.184457984332415e-05, "loss": 3.9992, "step": 517500 }, { "epoch": 2.75948773679388, "grad_norm": 1.0388100147247314, "learning_rate": 8.095453831994407e-05, "loss": 4.0052, "step": 518000 }, { "epoch": 2.7621513349954188, "grad_norm": 1.041391372680664, "learning_rate": 8.006271314621653e-05, "loss": 4.0032, "step": 518500 }, { "epoch": 2.7648149331969574, "grad_norm": 1.06915283203125, "learning_rate": 7.917088797248897e-05, "loss": 4.0031, "step": 519000 }, { "epoch": 2.7674785313984955, "grad_norm": 1.0097078084945679, "learning_rate": 7.827906279876143e-05, "loss": 4.0074, "step": 519500 }, { "epoch": 2.770142129600034, "grad_norm": 1.0231430530548096, "learning_rate": 7.738902127538135e-05, "loss": 4.0133, "step": 520000 }, { "epoch": 2.7728057278015728, "grad_norm": 1.1709152460098267, "learning_rate": 7.64971961016538e-05, "loss": 4.0105, "step": 520500 }, { "epoch": 2.775469326003111, "grad_norm": 1.0553919076919556, "learning_rate": 7.560537092792625e-05, "loss": 4.0005, "step": 521000 }, { "epoch": 2.7781329242046495, "grad_norm": 1.0332099199295044, "learning_rate": 7.471354575419871e-05, "loss": 4.0137, "step": 521500 }, { "epoch": 2.780796522406188, "grad_norm": 1.0436155796051025, "learning_rate": 7.382350423081863e-05, "loss": 4.0046, "step": 522000 }, { "epoch": 2.7834601206077263, "grad_norm": 1.0391409397125244, "learning_rate": 7.293167905709109e-05, "loss": 4.0041, "step": 522500 }, { "epoch": 2.786123718809265, "grad_norm": 1.1365002393722534, "learning_rate": 7.203985388336353e-05, "loss": 4.0052, "step": 523000 }, { "epoch": 2.7887873170108035, "grad_norm": 1.0857511758804321, "learning_rate": 7.114802870963599e-05, "loss": 4.0059, "step": 523500 }, { "epoch": 2.791450915212342, "grad_norm": 0.9912382364273071, "learning_rate": 7.02579871862559e-05, "loss": 3.9987, "step": 524000 }, { "epoch": 2.7941145134138807, "grad_norm": 1.032727599143982, "learning_rate": 6.936616201252837e-05, "loss": 4.0058, "step": 524500 }, { "epoch": 2.796778111615419, "grad_norm": 1.0187702178955078, "learning_rate": 6.847433683880082e-05, "loss": 4.0103, "step": 525000 }, { "epoch": 2.7994417098169575, "grad_norm": 0.981054425239563, "learning_rate": 6.758251166507327e-05, "loss": 4.0111, "step": 525500 }, { "epoch": 2.802105308018496, "grad_norm": 1.1054233312606812, "learning_rate": 6.669068649134573e-05, "loss": 4.0051, "step": 526000 }, { "epoch": 2.8047689062200343, "grad_norm": 1.060707449913025, "learning_rate": 6.580064496796565e-05, "loss": 4.0112, "step": 526500 }, { "epoch": 2.807432504421573, "grad_norm": 0.9906247854232788, "learning_rate": 6.49088197942381e-05, "loss": 4.0067, "step": 527000 }, { "epoch": 2.8100961026231115, "grad_norm": 1.0259308815002441, "learning_rate": 6.401699462051055e-05, "loss": 3.9976, "step": 527500 }, { "epoch": 2.81275970082465, "grad_norm": 1.0347638130187988, "learning_rate": 6.312516944678301e-05, "loss": 4.0036, "step": 528000 }, { "epoch": 2.8154232990261887, "grad_norm": 1.0310813188552856, "learning_rate": 6.223512792340293e-05, "loss": 3.9994, "step": 528500 }, { "epoch": 2.818086897227727, "grad_norm": 1.085179090499878, "learning_rate": 6.134330274967537e-05, "loss": 4.0085, "step": 529000 }, { "epoch": 2.8207504954292655, "grad_norm": 1.0044561624526978, "learning_rate": 6.045147757594784e-05, "loss": 4.0058, "step": 529500 }, { "epoch": 2.823414093630804, "grad_norm": 1.0580705404281616, "learning_rate": 5.955965240222029e-05, "loss": 3.9968, "step": 530000 }, { "epoch": 2.8260776918323423, "grad_norm": 1.1205203533172607, "learning_rate": 5.86696108788402e-05, "loss": 3.9991, "step": 530500 }, { "epoch": 2.828741290033881, "grad_norm": 1.0346322059631348, "learning_rate": 5.777778570511266e-05, "loss": 4.0044, "step": 531000 }, { "epoch": 2.8314048882354195, "grad_norm": 1.078075647354126, "learning_rate": 5.688596053138511e-05, "loss": 3.9978, "step": 531500 }, { "epoch": 2.834068486436958, "grad_norm": 1.0365418195724487, "learning_rate": 5.599413535765757e-05, "loss": 4.0039, "step": 532000 }, { "epoch": 2.8367320846384967, "grad_norm": 1.0657716989517212, "learning_rate": 5.510409383427748e-05, "loss": 4.004, "step": 532500 }, { "epoch": 2.839395682840035, "grad_norm": 1.1193735599517822, "learning_rate": 5.421226866054994e-05, "loss": 3.9981, "step": 533000 }, { "epoch": 2.8420592810415735, "grad_norm": 1.0354912281036377, "learning_rate": 5.332044348682239e-05, "loss": 4.004, "step": 533500 }, { "epoch": 2.844722879243112, "grad_norm": 1.0501588582992554, "learning_rate": 5.2428618313094844e-05, "loss": 4.0008, "step": 534000 }, { "epoch": 2.8473864774446502, "grad_norm": 1.0080904960632324, "learning_rate": 5.1538576789714766e-05, "loss": 4.002, "step": 534500 }, { "epoch": 2.850050075646189, "grad_norm": 1.0569877624511719, "learning_rate": 5.064675161598722e-05, "loss": 4.0042, "step": 535000 }, { "epoch": 2.8527136738477274, "grad_norm": 1.0170665979385376, "learning_rate": 4.975492644225967e-05, "loss": 4.0016, "step": 535500 }, { "epoch": 2.855377272049266, "grad_norm": 1.0019437074661255, "learning_rate": 4.886310126853213e-05, "loss": 3.9992, "step": 536000 }, { "epoch": 2.8580408702508047, "grad_norm": 1.059810757637024, "learning_rate": 4.797305974515204e-05, "loss": 4.0066, "step": 536500 }, { "epoch": 2.860704468452343, "grad_norm": 1.0938292741775513, "learning_rate": 4.70812345714245e-05, "loss": 4.0008, "step": 537000 }, { "epoch": 2.8633680666538814, "grad_norm": 1.0392727851867676, "learning_rate": 4.618940939769695e-05, "loss": 4.0009, "step": 537500 }, { "epoch": 2.86603166485542, "grad_norm": 1.041225790977478, "learning_rate": 4.529758422396941e-05, "loss": 4.0025, "step": 538000 }, { "epoch": 2.868695263056958, "grad_norm": 1.0904215574264526, "learning_rate": 4.440754270058932e-05, "loss": 3.9982, "step": 538500 }, { "epoch": 2.871358861258497, "grad_norm": 1.0225439071655273, "learning_rate": 4.351571752686177e-05, "loss": 3.9986, "step": 539000 }, { "epoch": 2.8740224594600354, "grad_norm": 1.0368945598602295, "learning_rate": 4.262389235313424e-05, "loss": 3.9998, "step": 539500 }, { "epoch": 2.8766860576615736, "grad_norm": 1.0657331943511963, "learning_rate": 4.173206717940669e-05, "loss": 3.996, "step": 540000 }, { "epoch": 2.879349655863112, "grad_norm": 1.0275654792785645, "learning_rate": 4.084024200567914e-05, "loss": 3.9983, "step": 540500 }, { "epoch": 2.882013254064651, "grad_norm": 1.107050895690918, "learning_rate": 3.995020048229905e-05, "loss": 4.0028, "step": 541000 }, { "epoch": 2.8846768522661894, "grad_norm": 1.001038908958435, "learning_rate": 3.905837530857151e-05, "loss": 3.9941, "step": 541500 }, { "epoch": 2.887340450467728, "grad_norm": 1.0545873641967773, "learning_rate": 3.8166550134843964e-05, "loss": 3.9987, "step": 542000 }, { "epoch": 2.890004048669266, "grad_norm": 1.0375920534133911, "learning_rate": 3.727472496111642e-05, "loss": 3.995, "step": 542500 }, { "epoch": 2.892667646870805, "grad_norm": 1.0322425365447998, "learning_rate": 3.638468343773634e-05, "loss": 3.994, "step": 543000 }, { "epoch": 2.8953312450723434, "grad_norm": 1.0789730548858643, "learning_rate": 3.549285826400879e-05, "loss": 3.9958, "step": 543500 }, { "epoch": 2.8979948432738816, "grad_norm": 1.1932363510131836, "learning_rate": 3.4601033090281244e-05, "loss": 4.005, "step": 544000 }, { "epoch": 2.90065844147542, "grad_norm": 1.1194884777069092, "learning_rate": 3.3709207916553696e-05, "loss": 3.9965, "step": 544500 }, { "epoch": 2.903322039676959, "grad_norm": 1.03001868724823, "learning_rate": 3.281916639317362e-05, "loss": 4.0013, "step": 545000 }, { "epoch": 2.9059856378784974, "grad_norm": 0.986453115940094, "learning_rate": 3.192734121944607e-05, "loss": 3.9935, "step": 545500 }, { "epoch": 2.908649236080036, "grad_norm": 1.0338671207427979, "learning_rate": 3.1035516045718524e-05, "loss": 4.0017, "step": 546000 }, { "epoch": 2.911312834281574, "grad_norm": 1.0669965744018555, "learning_rate": 3.014369087199098e-05, "loss": 3.9954, "step": 546500 }, { "epoch": 2.9139764324831128, "grad_norm": 1.024873971939087, "learning_rate": 2.9253649348610895e-05, "loss": 3.9967, "step": 547000 }, { "epoch": 2.9166400306846514, "grad_norm": 1.0891566276550293, "learning_rate": 2.8361824174883348e-05, "loss": 4.0024, "step": 547500 }, { "epoch": 2.9193036288861895, "grad_norm": 0.9691978096961975, "learning_rate": 2.7469999001155807e-05, "loss": 3.9982, "step": 548000 }, { "epoch": 2.921967227087728, "grad_norm": 1.0564926862716675, "learning_rate": 2.6578173827428263e-05, "loss": 4.0025, "step": 548500 }, { "epoch": 2.9246308252892668, "grad_norm": 0.997660756111145, "learning_rate": 2.5688132304048175e-05, "loss": 3.9959, "step": 549000 }, { "epoch": 2.9272944234908054, "grad_norm": 1.0368565320968628, "learning_rate": 2.479630713032063e-05, "loss": 3.9977, "step": 549500 }, { "epoch": 2.929958021692344, "grad_norm": 1.069231629371643, "learning_rate": 2.3904481956593084e-05, "loss": 3.9915, "step": 550000 }, { "epoch": 2.932621619893882, "grad_norm": 1.0751917362213135, "learning_rate": 2.3012656782865543e-05, "loss": 3.997, "step": 550500 }, { "epoch": 2.9352852180954208, "grad_norm": 1.0397218465805054, "learning_rate": 2.212261525948545e-05, "loss": 3.9997, "step": 551000 }, { "epoch": 2.9379488162969594, "grad_norm": 1.086714506149292, "learning_rate": 2.1230790085757908e-05, "loss": 3.9943, "step": 551500 }, { "epoch": 2.9406124144984975, "grad_norm": 1.141553521156311, "learning_rate": 2.0338964912030367e-05, "loss": 3.9987, "step": 552000 }, { "epoch": 2.943276012700036, "grad_norm": 1.005601406097412, "learning_rate": 1.944713973830282e-05, "loss": 3.9904, "step": 552500 }, { "epoch": 2.9459396109015747, "grad_norm": 1.010642647743225, "learning_rate": 1.8557098214922735e-05, "loss": 3.9881, "step": 553000 }, { "epoch": 2.9486032091031134, "grad_norm": 1.104560375213623, "learning_rate": 1.7665273041195188e-05, "loss": 3.9918, "step": 553500 }, { "epoch": 2.951266807304652, "grad_norm": 1.0412003993988037, "learning_rate": 1.6773447867467644e-05, "loss": 3.9997, "step": 554000 }, { "epoch": 2.95393040550619, "grad_norm": 1.0635658502578735, "learning_rate": 1.5881622693740103e-05, "loss": 3.994, "step": 554500 }, { "epoch": 2.9565940037077287, "grad_norm": 1.0909868478775024, "learning_rate": 1.4991581170360012e-05, "loss": 3.9942, "step": 555000 }, { "epoch": 2.9592576019092673, "grad_norm": 1.052293062210083, "learning_rate": 1.4099755996632468e-05, "loss": 3.9975, "step": 555500 }, { "epoch": 2.9619212001108055, "grad_norm": 1.068088412284851, "learning_rate": 1.3207930822904926e-05, "loss": 3.9942, "step": 556000 }, { "epoch": 2.964584798312344, "grad_norm": 1.1510958671569824, "learning_rate": 1.2316105649177382e-05, "loss": 3.9951, "step": 556500 }, { "epoch": 2.9672483965138827, "grad_norm": 1.048006534576416, "learning_rate": 1.1426064125797293e-05, "loss": 3.9971, "step": 557000 }, { "epoch": 2.9699119947154213, "grad_norm": 1.0319584608078003, "learning_rate": 1.0534238952069748e-05, "loss": 3.9934, "step": 557500 }, { "epoch": 2.9725755929169595, "grad_norm": 1.0391571521759033, "learning_rate": 9.642413778342204e-06, "loss": 3.9943, "step": 558000 }, { "epoch": 2.975239191118498, "grad_norm": 1.0609184503555298, "learning_rate": 8.75058860461466e-06, "loss": 3.9923, "step": 558500 }, { "epoch": 2.9779027893200367, "grad_norm": 1.0420206785202026, "learning_rate": 7.860547081234572e-06, "loss": 3.9939, "step": 559000 }, { "epoch": 2.9805663875215753, "grad_norm": 1.0162791013717651, "learning_rate": 6.968721907507028e-06, "loss": 3.9993, "step": 559500 }, { "epoch": 2.9832299857231135, "grad_norm": 1.1188008785247803, "learning_rate": 6.076896733779484e-06, "loss": 3.9952, "step": 560000 }, { "epoch": 2.985893583924652, "grad_norm": 1.1251684427261353, "learning_rate": 5.18507156005194e-06, "loss": 3.9936, "step": 560500 }, { "epoch": 2.9885571821261907, "grad_norm": 1.072590947151184, "learning_rate": 4.295030036671852e-06, "loss": 3.9891, "step": 561000 }, { "epoch": 2.991220780327729, "grad_norm": 1.0949697494506836, "learning_rate": 3.403204862944307e-06, "loss": 3.9909, "step": 561500 }, { "epoch": 2.9938843785292675, "grad_norm": 1.0467427968978882, "learning_rate": 2.5113796892167635e-06, "loss": 4.0004, "step": 562000 }, { "epoch": 2.996547976730806, "grad_norm": 1.0436049699783325, "learning_rate": 1.6195545154892197e-06, "loss": 3.9896, "step": 562500 }, { "epoch": 2.9992115749323447, "grad_norm": 1.1010395288467407, "learning_rate": 7.295129921091309e-07, "loss": 3.9912, "step": 563000 }, { "epoch": 3.0, "step": 563148, "total_flos": 4.819699538212516e+17, "train_loss": 4.150129232981245, "train_runtime": 39834.0737, "train_samples_per_second": 904.789, "train_steps_per_second": 14.137 } ], "logging_steps": 500, "max_steps": 563148, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 5000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.819699538212516e+17, "train_batch_size": 64, "trial_name": null, "trial_params": null }