yuzhounie's picture
End of training
0faf77c verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9989909182643795,
"eval_steps": 500,
"global_step": 330,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0030272452068617556,
"grad_norm": 3.582401336916988,
"learning_rate": 3.0303030303030305e-07,
"loss": 0.9292,
"step": 1
},
{
"epoch": 0.006054490413723511,
"grad_norm": 3.4205519831406135,
"learning_rate": 6.060606060606061e-07,
"loss": 0.9108,
"step": 2
},
{
"epoch": 0.009081735620585268,
"grad_norm": 3.16713551515301,
"learning_rate": 9.090909090909091e-07,
"loss": 0.8814,
"step": 3
},
{
"epoch": 0.012108980827447022,
"grad_norm": 3.0763308435640826,
"learning_rate": 1.2121212121212122e-06,
"loss": 0.8916,
"step": 4
},
{
"epoch": 0.015136226034308779,
"grad_norm": 3.0041658248884637,
"learning_rate": 1.5151515151515152e-06,
"loss": 0.8653,
"step": 5
},
{
"epoch": 0.018163471241170535,
"grad_norm": 3.1701740364238784,
"learning_rate": 1.8181818181818183e-06,
"loss": 0.8708,
"step": 6
},
{
"epoch": 0.02119071644803229,
"grad_norm": 2.763230048221692,
"learning_rate": 2.1212121212121216e-06,
"loss": 0.8537,
"step": 7
},
{
"epoch": 0.024217961654894045,
"grad_norm": 2.733298638430716,
"learning_rate": 2.4242424242424244e-06,
"loss": 0.8467,
"step": 8
},
{
"epoch": 0.027245206861755803,
"grad_norm": 2.4137333198062363,
"learning_rate": 2.7272727272727272e-06,
"loss": 0.8259,
"step": 9
},
{
"epoch": 0.030272452068617558,
"grad_norm": 2.3531427364521806,
"learning_rate": 3.0303030303030305e-06,
"loss": 0.8647,
"step": 10
},
{
"epoch": 0.033299697275479316,
"grad_norm": 2.257535799815012,
"learning_rate": 3.3333333333333333e-06,
"loss": 0.8477,
"step": 11
},
{
"epoch": 0.03632694248234107,
"grad_norm": 1.870438795004336,
"learning_rate": 3.6363636363636366e-06,
"loss": 0.8058,
"step": 12
},
{
"epoch": 0.039354187689202826,
"grad_norm": 1.8529024545949584,
"learning_rate": 3.93939393939394e-06,
"loss": 0.8188,
"step": 13
},
{
"epoch": 0.04238143289606458,
"grad_norm": 1.5995269205103906,
"learning_rate": 4.242424242424243e-06,
"loss": 0.7798,
"step": 14
},
{
"epoch": 0.045408678102926335,
"grad_norm": 1.4849837969263027,
"learning_rate": 4.5454545454545455e-06,
"loss": 0.7775,
"step": 15
},
{
"epoch": 0.04843592330978809,
"grad_norm": 1.247687880640269,
"learning_rate": 4.848484848484849e-06,
"loss": 0.7377,
"step": 16
},
{
"epoch": 0.05146316851664985,
"grad_norm": 1.174811425240601,
"learning_rate": 5.151515151515152e-06,
"loss": 0.7337,
"step": 17
},
{
"epoch": 0.054490413723511606,
"grad_norm": 1.2904171645942244,
"learning_rate": 5.4545454545454545e-06,
"loss": 0.7436,
"step": 18
},
{
"epoch": 0.05751765893037336,
"grad_norm": 1.3055922878261663,
"learning_rate": 5.7575757575757586e-06,
"loss": 0.7376,
"step": 19
},
{
"epoch": 0.060544904137235116,
"grad_norm": 1.1674975811661823,
"learning_rate": 6.060606060606061e-06,
"loss": 0.7287,
"step": 20
},
{
"epoch": 0.06357214934409687,
"grad_norm": 1.261703113260723,
"learning_rate": 6.363636363636364e-06,
"loss": 0.7316,
"step": 21
},
{
"epoch": 0.06659939455095863,
"grad_norm": 0.9402917439962628,
"learning_rate": 6.666666666666667e-06,
"loss": 0.7161,
"step": 22
},
{
"epoch": 0.06962663975782038,
"grad_norm": 0.8837659672949174,
"learning_rate": 6.969696969696971e-06,
"loss": 0.6917,
"step": 23
},
{
"epoch": 0.07265388496468214,
"grad_norm": 0.9021995470350834,
"learning_rate": 7.272727272727273e-06,
"loss": 0.6939,
"step": 24
},
{
"epoch": 0.07568113017154389,
"grad_norm": 0.8607643590670297,
"learning_rate": 7.5757575757575764e-06,
"loss": 0.6748,
"step": 25
},
{
"epoch": 0.07870837537840565,
"grad_norm": 0.8819188540322948,
"learning_rate": 7.87878787878788e-06,
"loss": 0.7082,
"step": 26
},
{
"epoch": 0.08173562058526741,
"grad_norm": 0.8549815982958876,
"learning_rate": 8.181818181818183e-06,
"loss": 0.6723,
"step": 27
},
{
"epoch": 0.08476286579212916,
"grad_norm": 0.7850811101611005,
"learning_rate": 8.484848484848486e-06,
"loss": 0.6931,
"step": 28
},
{
"epoch": 0.08779011099899092,
"grad_norm": 0.8110489386808306,
"learning_rate": 8.787878787878788e-06,
"loss": 0.69,
"step": 29
},
{
"epoch": 0.09081735620585267,
"grad_norm": 0.7839440321984453,
"learning_rate": 9.090909090909091e-06,
"loss": 0.6847,
"step": 30
},
{
"epoch": 0.09384460141271443,
"grad_norm": 0.7260095058587323,
"learning_rate": 9.393939393939396e-06,
"loss": 0.6629,
"step": 31
},
{
"epoch": 0.09687184661957618,
"grad_norm": 0.7200347256119544,
"learning_rate": 9.696969696969698e-06,
"loss": 0.6736,
"step": 32
},
{
"epoch": 0.09989909182643794,
"grad_norm": 0.8740438482028542,
"learning_rate": 1e-05,
"loss": 0.6873,
"step": 33
},
{
"epoch": 0.1029263370332997,
"grad_norm": 0.6895041893850171,
"learning_rate": 9.999720280459576e-06,
"loss": 0.6672,
"step": 34
},
{
"epoch": 0.10595358224016145,
"grad_norm": 0.7458848790013037,
"learning_rate": 9.99888115313551e-06,
"loss": 0.661,
"step": 35
},
{
"epoch": 0.10898082744702321,
"grad_norm": 0.6925182714653222,
"learning_rate": 9.997482711915926e-06,
"loss": 0.6646,
"step": 36
},
{
"epoch": 0.11200807265388496,
"grad_norm": 0.6529697619098291,
"learning_rate": 9.99552511326936e-06,
"loss": 0.636,
"step": 37
},
{
"epoch": 0.11503531786074672,
"grad_norm": 0.6932749658928793,
"learning_rate": 9.993008576227248e-06,
"loss": 0.644,
"step": 38
},
{
"epoch": 0.11806256306760847,
"grad_norm": 0.6777649978595066,
"learning_rate": 9.989933382359423e-06,
"loss": 0.6527,
"step": 39
},
{
"epoch": 0.12108980827447023,
"grad_norm": 0.7421775067951599,
"learning_rate": 9.986299875742612e-06,
"loss": 0.6507,
"step": 40
},
{
"epoch": 0.124117053481332,
"grad_norm": 0.6274186532387682,
"learning_rate": 9.982108462921938e-06,
"loss": 0.6511,
"step": 41
},
{
"epoch": 0.12714429868819374,
"grad_norm": 0.7278450286683587,
"learning_rate": 9.977359612865424e-06,
"loss": 0.657,
"step": 42
},
{
"epoch": 0.1301715438950555,
"grad_norm": 0.703037580866815,
"learning_rate": 9.972053856911534e-06,
"loss": 0.6506,
"step": 43
},
{
"epoch": 0.13319878910191726,
"grad_norm": 0.7013283182614253,
"learning_rate": 9.966191788709716e-06,
"loss": 0.614,
"step": 44
},
{
"epoch": 0.136226034308779,
"grad_norm": 0.6546233293808662,
"learning_rate": 9.959774064153977e-06,
"loss": 0.6438,
"step": 45
},
{
"epoch": 0.13925327951564076,
"grad_norm": 0.7610412130219326,
"learning_rate": 9.952801401309504e-06,
"loss": 0.6359,
"step": 46
},
{
"epoch": 0.14228052472250252,
"grad_norm": 0.6919514514098687,
"learning_rate": 9.945274580332316e-06,
"loss": 0.6491,
"step": 47
},
{
"epoch": 0.14530776992936428,
"grad_norm": 0.6174270202036972,
"learning_rate": 9.937194443381972e-06,
"loss": 0.644,
"step": 48
},
{
"epoch": 0.14833501513622604,
"grad_norm": 0.6733177882785241,
"learning_rate": 9.928561894527354e-06,
"loss": 0.6599,
"step": 49
},
{
"epoch": 0.15136226034308778,
"grad_norm": 0.6571392240672871,
"learning_rate": 9.919377899645497e-06,
"loss": 0.6183,
"step": 50
},
{
"epoch": 0.15438950554994954,
"grad_norm": 0.6264066516136073,
"learning_rate": 9.909643486313533e-06,
"loss": 0.6475,
"step": 51
},
{
"epoch": 0.1574167507568113,
"grad_norm": 0.7318268238817622,
"learning_rate": 9.899359743693715e-06,
"loss": 0.6544,
"step": 52
},
{
"epoch": 0.16044399596367306,
"grad_norm": 0.6239161912485861,
"learning_rate": 9.888527822411543e-06,
"loss": 0.6461,
"step": 53
},
{
"epoch": 0.16347124117053483,
"grad_norm": 0.6757651283713,
"learning_rate": 9.877148934427037e-06,
"loss": 0.6304,
"step": 54
},
{
"epoch": 0.16649848637739656,
"grad_norm": 0.6434789578226523,
"learning_rate": 9.86522435289912e-06,
"loss": 0.6437,
"step": 55
},
{
"epoch": 0.16952573158425832,
"grad_norm": 0.6165046999823262,
"learning_rate": 9.85275541204318e-06,
"loss": 0.6225,
"step": 56
},
{
"epoch": 0.17255297679112008,
"grad_norm": 0.5715725754168417,
"learning_rate": 9.839743506981783e-06,
"loss": 0.6168,
"step": 57
},
{
"epoch": 0.17558022199798184,
"grad_norm": 0.6938321883682744,
"learning_rate": 9.826190093588564e-06,
"loss": 0.6417,
"step": 58
},
{
"epoch": 0.17860746720484358,
"grad_norm": 0.6504435632338657,
"learning_rate": 9.812096688325354e-06,
"loss": 0.6335,
"step": 59
},
{
"epoch": 0.18163471241170534,
"grad_norm": 0.7595444009911593,
"learning_rate": 9.797464868072489e-06,
"loss": 0.6584,
"step": 60
},
{
"epoch": 0.1846619576185671,
"grad_norm": 0.6836461405960763,
"learning_rate": 9.78229626995238e-06,
"loss": 0.6272,
"step": 61
},
{
"epoch": 0.18768920282542886,
"grad_norm": 0.5994860735808994,
"learning_rate": 9.766592591146353e-06,
"loss": 0.6473,
"step": 62
},
{
"epoch": 0.19071644803229063,
"grad_norm": 0.6281792185287286,
"learning_rate": 9.750355588704728e-06,
"loss": 0.6254,
"step": 63
},
{
"epoch": 0.19374369323915236,
"grad_norm": 0.624285070765526,
"learning_rate": 9.733587079350254e-06,
"loss": 0.6394,
"step": 64
},
{
"epoch": 0.19677093844601412,
"grad_norm": 0.6348141931505765,
"learning_rate": 9.716288939274818e-06,
"loss": 0.6403,
"step": 65
},
{
"epoch": 0.19979818365287588,
"grad_norm": 0.6809451524519542,
"learning_rate": 9.698463103929542e-06,
"loss": 0.6419,
"step": 66
},
{
"epoch": 0.20282542885973764,
"grad_norm": 0.5997797125302732,
"learning_rate": 9.680111567808212e-06,
"loss": 0.6163,
"step": 67
},
{
"epoch": 0.2058526740665994,
"grad_norm": 0.6687392407345998,
"learning_rate": 9.66123638422413e-06,
"loss": 0.601,
"step": 68
},
{
"epoch": 0.20887991927346114,
"grad_norm": 0.6789891063922996,
"learning_rate": 9.641839665080363e-06,
"loss": 0.6393,
"step": 69
},
{
"epoch": 0.2119071644803229,
"grad_norm": 0.7128749064772688,
"learning_rate": 9.621923580633462e-06,
"loss": 0.6406,
"step": 70
},
{
"epoch": 0.21493440968718466,
"grad_norm": 0.6599744626744597,
"learning_rate": 9.601490359250616e-06,
"loss": 0.6141,
"step": 71
},
{
"epoch": 0.21796165489404642,
"grad_norm": 0.6154817817864259,
"learning_rate": 9.580542287160348e-06,
"loss": 0.6545,
"step": 72
},
{
"epoch": 0.2209889001009082,
"grad_norm": 0.6095075575760279,
"learning_rate": 9.559081708196696e-06,
"loss": 0.6556,
"step": 73
},
{
"epoch": 0.22401614530776992,
"grad_norm": 0.7092185103024052,
"learning_rate": 9.537111023536973e-06,
"loss": 0.637,
"step": 74
},
{
"epoch": 0.22704339051463168,
"grad_norm": 0.5557331258417827,
"learning_rate": 9.514632691433108e-06,
"loss": 0.6114,
"step": 75
},
{
"epoch": 0.23007063572149344,
"grad_norm": 0.6572542761590052,
"learning_rate": 9.491649226936586e-06,
"loss": 0.6127,
"step": 76
},
{
"epoch": 0.2330978809283552,
"grad_norm": 0.5708314510562087,
"learning_rate": 9.468163201617063e-06,
"loss": 0.6447,
"step": 77
},
{
"epoch": 0.23612512613521694,
"grad_norm": 0.6479609544942071,
"learning_rate": 9.444177243274619e-06,
"loss": 0.6351,
"step": 78
},
{
"epoch": 0.2391523713420787,
"grad_norm": 0.6210548369272807,
"learning_rate": 9.419694035645753e-06,
"loss": 0.6512,
"step": 79
},
{
"epoch": 0.24217961654894046,
"grad_norm": 0.6203709937711855,
"learning_rate": 9.394716318103098e-06,
"loss": 0.6364,
"step": 80
},
{
"epoch": 0.24520686175580222,
"grad_norm": 0.5641516890803803,
"learning_rate": 9.369246885348926e-06,
"loss": 0.607,
"step": 81
},
{
"epoch": 0.248234106962664,
"grad_norm": 0.6057720805942678,
"learning_rate": 9.343288587102444e-06,
"loss": 0.6084,
"step": 82
},
{
"epoch": 0.2512613521695257,
"grad_norm": 0.5996039859430649,
"learning_rate": 9.316844327780955e-06,
"loss": 0.6229,
"step": 83
},
{
"epoch": 0.2542885973763875,
"grad_norm": 0.7233799092359594,
"learning_rate": 9.289917066174887e-06,
"loss": 0.6553,
"step": 84
},
{
"epoch": 0.25731584258324924,
"grad_norm": 0.6381091800881794,
"learning_rate": 9.262509815116732e-06,
"loss": 0.6202,
"step": 85
},
{
"epoch": 0.260343087790111,
"grad_norm": 0.571873602858053,
"learning_rate": 9.234625641143962e-06,
"loss": 0.6272,
"step": 86
},
{
"epoch": 0.26337033299697277,
"grad_norm": 0.6477272311046639,
"learning_rate": 9.206267664155906e-06,
"loss": 0.6282,
"step": 87
},
{
"epoch": 0.26639757820383453,
"grad_norm": 0.6204243186542612,
"learning_rate": 9.177439057064684e-06,
"loss": 0.6027,
"step": 88
},
{
"epoch": 0.2694248234106963,
"grad_norm": 0.574163585781343,
"learning_rate": 9.148143045440181e-06,
"loss": 0.615,
"step": 89
},
{
"epoch": 0.272452068617558,
"grad_norm": 0.547214229348299,
"learning_rate": 9.118382907149164e-06,
"loss": 0.6086,
"step": 90
},
{
"epoch": 0.27547931382441976,
"grad_norm": 0.5634033876553848,
"learning_rate": 9.088161971988517e-06,
"loss": 0.6176,
"step": 91
},
{
"epoch": 0.2785065590312815,
"grad_norm": 0.5767515707064599,
"learning_rate": 9.057483621312671e-06,
"loss": 0.6133,
"step": 92
},
{
"epoch": 0.2815338042381433,
"grad_norm": 0.7238818760752277,
"learning_rate": 9.026351287655294e-06,
"loss": 0.6192,
"step": 93
},
{
"epoch": 0.28456104944500504,
"grad_norm": 0.6424760662074501,
"learning_rate": 8.994768454345207e-06,
"loss": 0.6429,
"step": 94
},
{
"epoch": 0.2875882946518668,
"grad_norm": 0.6121209460846522,
"learning_rate": 8.96273865511666e-06,
"loss": 0.6038,
"step": 95
},
{
"epoch": 0.29061553985872857,
"grad_norm": 0.5730328780876065,
"learning_rate": 8.930265473713939e-06,
"loss": 0.6384,
"step": 96
},
{
"epoch": 0.29364278506559033,
"grad_norm": 0.658452195329774,
"learning_rate": 8.897352543490396e-06,
"loss": 0.6208,
"step": 97
},
{
"epoch": 0.2966700302724521,
"grad_norm": 0.5750961216550038,
"learning_rate": 8.864003547001916e-06,
"loss": 0.6037,
"step": 98
},
{
"epoch": 0.29969727547931385,
"grad_norm": 0.6170410932574037,
"learning_rate": 8.83022221559489e-06,
"loss": 0.6518,
"step": 99
},
{
"epoch": 0.30272452068617556,
"grad_norm": 0.6099736047813001,
"learning_rate": 8.796012328988716e-06,
"loss": 0.6093,
"step": 100
},
{
"epoch": 0.3057517658930373,
"grad_norm": 0.6296693936221233,
"learning_rate": 8.7613777148529e-06,
"loss": 0.6303,
"step": 101
},
{
"epoch": 0.3087790110998991,
"grad_norm": 0.6387896700049089,
"learning_rate": 8.726322248378775e-06,
"loss": 0.6303,
"step": 102
},
{
"epoch": 0.31180625630676084,
"grad_norm": 0.5569352997591892,
"learning_rate": 8.690849851845933e-06,
"loss": 0.5851,
"step": 103
},
{
"epoch": 0.3148335015136226,
"grad_norm": 0.5822606667893254,
"learning_rate": 8.65496449418336e-06,
"loss": 0.6221,
"step": 104
},
{
"epoch": 0.31786074672048437,
"grad_norm": 0.6445396388188233,
"learning_rate": 8.61867019052535e-06,
"loss": 0.6189,
"step": 105
},
{
"epoch": 0.32088799192734613,
"grad_norm": 0.6972243818017967,
"learning_rate": 8.581971001762287e-06,
"loss": 0.6268,
"step": 106
},
{
"epoch": 0.3239152371342079,
"grad_norm": 0.64485578186628,
"learning_rate": 8.54487103408625e-06,
"loss": 0.6093,
"step": 107
},
{
"epoch": 0.32694248234106965,
"grad_norm": 0.6201740651760959,
"learning_rate": 8.507374438531606e-06,
"loss": 0.6023,
"step": 108
},
{
"epoch": 0.32996972754793136,
"grad_norm": 0.6447976605200675,
"learning_rate": 8.469485410510545e-06,
"loss": 0.6206,
"step": 109
},
{
"epoch": 0.3329969727547931,
"grad_norm": 0.6337500419970387,
"learning_rate": 8.43120818934367e-06,
"loss": 0.6116,
"step": 110
},
{
"epoch": 0.3360242179616549,
"grad_norm": 0.6182818527204792,
"learning_rate": 8.392547057785662e-06,
"loss": 0.6046,
"step": 111
},
{
"epoch": 0.33905146316851664,
"grad_norm": 0.5881772688459851,
"learning_rate": 8.353506341546106e-06,
"loss": 0.6239,
"step": 112
},
{
"epoch": 0.3420787083753784,
"grad_norm": 0.6402078672936957,
"learning_rate": 8.314090408805481e-06,
"loss": 0.6055,
"step": 113
},
{
"epoch": 0.34510595358224017,
"grad_norm": 0.5757997826606626,
"learning_rate": 8.274303669726427e-06,
"loss": 0.6092,
"step": 114
},
{
"epoch": 0.3481331987891019,
"grad_norm": 0.6514206878455265,
"learning_rate": 8.234150575960288e-06,
"loss": 0.6061,
"step": 115
},
{
"epoch": 0.3511604439959637,
"grad_norm": 0.5895203298799747,
"learning_rate": 8.193635620149041e-06,
"loss": 0.5989,
"step": 116
},
{
"epoch": 0.35418768920282545,
"grad_norm": 0.6112454742897356,
"learning_rate": 8.152763335422612e-06,
"loss": 0.6356,
"step": 117
},
{
"epoch": 0.35721493440968716,
"grad_norm": 0.6832380995022461,
"learning_rate": 8.111538294891684e-06,
"loss": 0.6017,
"step": 118
},
{
"epoch": 0.3602421796165489,
"grad_norm": 0.5860546413310351,
"learning_rate": 8.06996511113601e-06,
"loss": 0.6096,
"step": 119
},
{
"epoch": 0.3632694248234107,
"grad_norm": 0.5923137153339123,
"learning_rate": 8.028048435688333e-06,
"loss": 0.6036,
"step": 120
},
{
"epoch": 0.36629667003027244,
"grad_norm": 0.6240874840997842,
"learning_rate": 7.985792958513932e-06,
"loss": 0.6319,
"step": 121
},
{
"epoch": 0.3693239152371342,
"grad_norm": 0.6372964158349076,
"learning_rate": 7.943203407485864e-06,
"loss": 0.5944,
"step": 122
},
{
"epoch": 0.37235116044399597,
"grad_norm": 0.6037065207548263,
"learning_rate": 7.900284547855992e-06,
"loss": 0.6318,
"step": 123
},
{
"epoch": 0.3753784056508577,
"grad_norm": 0.668304599315594,
"learning_rate": 7.857041181721788e-06,
"loss": 0.618,
"step": 124
},
{
"epoch": 0.3784056508577195,
"grad_norm": 0.5794387781565752,
"learning_rate": 7.813478147489052e-06,
"loss": 0.6356,
"step": 125
},
{
"epoch": 0.38143289606458125,
"grad_norm": 0.556403184172075,
"learning_rate": 7.769600319330553e-06,
"loss": 0.5761,
"step": 126
},
{
"epoch": 0.384460141271443,
"grad_norm": 0.5475469668687941,
"learning_rate": 7.725412606640658e-06,
"loss": 0.6121,
"step": 127
},
{
"epoch": 0.3874873864783047,
"grad_norm": 0.5235670194488732,
"learning_rate": 7.680919953486047e-06,
"loss": 0.609,
"step": 128
},
{
"epoch": 0.3905146316851665,
"grad_norm": 0.5714837826451689,
"learning_rate": 7.636127338052513e-06,
"loss": 0.6133,
"step": 129
},
{
"epoch": 0.39354187689202824,
"grad_norm": 0.6085188884666131,
"learning_rate": 7.5910397720879785e-06,
"loss": 0.6242,
"step": 130
},
{
"epoch": 0.39656912209889,
"grad_norm": 0.6377617480748068,
"learning_rate": 7.545662300341736e-06,
"loss": 0.6377,
"step": 131
},
{
"epoch": 0.39959636730575177,
"grad_norm": 0.6317691711137606,
"learning_rate": 7.500000000000001e-06,
"loss": 0.6314,
"step": 132
},
{
"epoch": 0.4026236125126135,
"grad_norm": 0.6164919654580555,
"learning_rate": 7.454057980117842e-06,
"loss": 0.6151,
"step": 133
},
{
"epoch": 0.4056508577194753,
"grad_norm": 0.5689202036733442,
"learning_rate": 7.407841381047533e-06,
"loss": 0.5685,
"step": 134
},
{
"epoch": 0.40867810292633705,
"grad_norm": 0.6243912370782239,
"learning_rate": 7.361355373863415e-06,
"loss": 0.6351,
"step": 135
},
{
"epoch": 0.4117053481331988,
"grad_norm": 0.5881391685573231,
"learning_rate": 7.314605159783313e-06,
"loss": 0.5888,
"step": 136
},
{
"epoch": 0.4147325933400605,
"grad_norm": 0.594297458196102,
"learning_rate": 7.2675959695865896e-06,
"loss": 0.6048,
"step": 137
},
{
"epoch": 0.4177598385469223,
"grad_norm": 0.5484200902957905,
"learning_rate": 7.2203330630288714e-06,
"loss": 0.6015,
"step": 138
},
{
"epoch": 0.42078708375378404,
"grad_norm": 0.6707849048805817,
"learning_rate": 7.172821728253563e-06,
"loss": 0.5926,
"step": 139
},
{
"epoch": 0.4238143289606458,
"grad_norm": 0.5626840706033667,
"learning_rate": 7.1250672812001505e-06,
"loss": 0.5732,
"step": 140
},
{
"epoch": 0.42684157416750756,
"grad_norm": 0.5595741793241668,
"learning_rate": 7.0770750650094335e-06,
"loss": 0.6052,
"step": 141
},
{
"epoch": 0.4298688193743693,
"grad_norm": 0.5812370273753067,
"learning_rate": 7.02885044942567e-06,
"loss": 0.5921,
"step": 142
},
{
"epoch": 0.4328960645812311,
"grad_norm": 0.5880927887973462,
"learning_rate": 6.980398830195785e-06,
"loss": 0.5736,
"step": 143
},
{
"epoch": 0.43592330978809285,
"grad_norm": 0.5598146523471503,
"learning_rate": 6.931725628465643e-06,
"loss": 0.628,
"step": 144
},
{
"epoch": 0.4389505549949546,
"grad_norm": 0.5816815202355324,
"learning_rate": 6.882836290173493e-06,
"loss": 0.6311,
"step": 145
},
{
"epoch": 0.4419778002018164,
"grad_norm": 0.6236050247492952,
"learning_rate": 6.833736285440632e-06,
"loss": 0.5876,
"step": 146
},
{
"epoch": 0.4450050454086781,
"grad_norm": 0.6005470880702801,
"learning_rate": 6.78443110795936e-06,
"loss": 0.6388,
"step": 147
},
{
"epoch": 0.44803229061553984,
"grad_norm": 0.5929402616779416,
"learning_rate": 6.734926274378313e-06,
"loss": 0.5818,
"step": 148
},
{
"epoch": 0.4510595358224016,
"grad_norm": 0.5428046344760048,
"learning_rate": 6.685227323685209e-06,
"loss": 0.5816,
"step": 149
},
{
"epoch": 0.45408678102926336,
"grad_norm": 0.5722993865645706,
"learning_rate": 6.635339816587109e-06,
"loss": 0.5927,
"step": 150
},
{
"epoch": 0.4571140262361251,
"grad_norm": 0.5837598326989475,
"learning_rate": 6.5852693348882345e-06,
"loss": 0.6134,
"step": 151
},
{
"epoch": 0.4601412714429869,
"grad_norm": 0.5705109632327479,
"learning_rate": 6.535021480865439e-06,
"loss": 0.617,
"step": 152
},
{
"epoch": 0.46316851664984865,
"grad_norm": 0.587486644149134,
"learning_rate": 6.484601876641375e-06,
"loss": 0.5776,
"step": 153
},
{
"epoch": 0.4661957618567104,
"grad_norm": 0.5303833642684306,
"learning_rate": 6.434016163555452e-06,
"loss": 0.6013,
"step": 154
},
{
"epoch": 0.4692230070635722,
"grad_norm": 0.6425271635903188,
"learning_rate": 6.383270001532636e-06,
"loss": 0.6262,
"step": 155
},
{
"epoch": 0.4722502522704339,
"grad_norm": 0.6443860117939154,
"learning_rate": 6.332369068450175e-06,
"loss": 0.6062,
"step": 156
},
{
"epoch": 0.47527749747729564,
"grad_norm": 0.5962963325451314,
"learning_rate": 6.2813190595023135e-06,
"loss": 0.5933,
"step": 157
},
{
"epoch": 0.4783047426841574,
"grad_norm": 0.6125248028036528,
"learning_rate": 6.230125686563068e-06,
"loss": 0.581,
"step": 158
},
{
"epoch": 0.48133198789101916,
"grad_norm": 0.5902030130175919,
"learning_rate": 6.178794677547138e-06,
"loss": 0.5916,
"step": 159
},
{
"epoch": 0.4843592330978809,
"grad_norm": 0.6070118556197786,
"learning_rate": 6.127331775769023e-06,
"loss": 0.5959,
"step": 160
},
{
"epoch": 0.4873864783047427,
"grad_norm": 0.6139246526560278,
"learning_rate": 6.07574273930042e-06,
"loss": 0.6042,
"step": 161
},
{
"epoch": 0.49041372351160445,
"grad_norm": 0.5853691829642305,
"learning_rate": 6.024033340325954e-06,
"loss": 0.6005,
"step": 162
},
{
"epoch": 0.4934409687184662,
"grad_norm": 0.6217992136346727,
"learning_rate": 5.972209364497355e-06,
"loss": 0.6148,
"step": 163
},
{
"epoch": 0.496468213925328,
"grad_norm": 0.5153620135411245,
"learning_rate": 5.920276610286102e-06,
"loss": 0.5946,
"step": 164
},
{
"epoch": 0.49949545913218973,
"grad_norm": 0.579045299846703,
"learning_rate": 5.8682408883346535e-06,
"loss": 0.5968,
"step": 165
},
{
"epoch": 0.5025227043390514,
"grad_norm": 0.582272360892149,
"learning_rate": 5.816108020806297e-06,
"loss": 0.5924,
"step": 166
},
{
"epoch": 0.5055499495459133,
"grad_norm": 0.6731693269444652,
"learning_rate": 5.763883840733736e-06,
"loss": 0.6099,
"step": 167
},
{
"epoch": 0.508577194752775,
"grad_norm": 0.5987564495145384,
"learning_rate": 5.711574191366427e-06,
"loss": 0.5819,
"step": 168
},
{
"epoch": 0.5116044399596368,
"grad_norm": 0.6314446692251019,
"learning_rate": 5.659184925516802e-06,
"loss": 0.5704,
"step": 169
},
{
"epoch": 0.5146316851664985,
"grad_norm": 0.5620837639422965,
"learning_rate": 5.60672190490541e-06,
"loss": 0.5775,
"step": 170
},
{
"epoch": 0.5176589303733602,
"grad_norm": 0.6316977870227769,
"learning_rate": 5.5541909995050554e-06,
"loss": 0.5969,
"step": 171
},
{
"epoch": 0.520686175580222,
"grad_norm": 0.5610320762634523,
"learning_rate": 5.5015980868840254e-06,
"loss": 0.5775,
"step": 172
},
{
"epoch": 0.5237134207870837,
"grad_norm": 0.5954818956974259,
"learning_rate": 5.448949051548459e-06,
"loss": 0.6194,
"step": 173
},
{
"epoch": 0.5267406659939455,
"grad_norm": 0.5907233979522453,
"learning_rate": 5.396249784283943e-06,
"loss": 0.619,
"step": 174
},
{
"epoch": 0.5297679112008072,
"grad_norm": 0.5878556689209177,
"learning_rate": 5.343506181496405e-06,
"loss": 0.635,
"step": 175
},
{
"epoch": 0.5327951564076691,
"grad_norm": 0.5424993453242938,
"learning_rate": 5.290724144552379e-06,
"loss": 0.5665,
"step": 176
},
{
"epoch": 0.5358224016145308,
"grad_norm": 0.56413748582039,
"learning_rate": 5.237909579118713e-06,
"loss": 0.5873,
"step": 177
},
{
"epoch": 0.5388496468213926,
"grad_norm": 0.5862074559171186,
"learning_rate": 5.185068394501791e-06,
"loss": 0.5922,
"step": 178
},
{
"epoch": 0.5418768920282543,
"grad_norm": 0.5863229615787047,
"learning_rate": 5.132206502986368e-06,
"loss": 0.5957,
"step": 179
},
{
"epoch": 0.544904137235116,
"grad_norm": 0.6115888080302968,
"learning_rate": 5.07932981917404e-06,
"loss": 0.6244,
"step": 180
},
{
"epoch": 0.5479313824419778,
"grad_norm": 0.5967064630776174,
"learning_rate": 5.026444259321489e-06,
"loss": 0.5918,
"step": 181
},
{
"epoch": 0.5509586276488395,
"grad_norm": 0.598155707542942,
"learning_rate": 4.973555740678512e-06,
"loss": 0.6068,
"step": 182
},
{
"epoch": 0.5539858728557013,
"grad_norm": 0.5963013201559106,
"learning_rate": 4.9206701808259605e-06,
"loss": 0.5908,
"step": 183
},
{
"epoch": 0.557013118062563,
"grad_norm": 0.5743174011892115,
"learning_rate": 4.867793497013634e-06,
"loss": 0.594,
"step": 184
},
{
"epoch": 0.5600403632694249,
"grad_norm": 0.6047934217795865,
"learning_rate": 4.81493160549821e-06,
"loss": 0.6276,
"step": 185
},
{
"epoch": 0.5630676084762866,
"grad_norm": 0.5644641937652061,
"learning_rate": 4.762090420881289e-06,
"loss": 0.6261,
"step": 186
},
{
"epoch": 0.5660948536831484,
"grad_norm": 0.6017459921374998,
"learning_rate": 4.7092758554476215e-06,
"loss": 0.5784,
"step": 187
},
{
"epoch": 0.5691220988900101,
"grad_norm": 0.5255259760004068,
"learning_rate": 4.6564938185035954e-06,
"loss": 0.578,
"step": 188
},
{
"epoch": 0.5721493440968718,
"grad_norm": 0.5412024608564904,
"learning_rate": 4.603750215716057e-06,
"loss": 0.5864,
"step": 189
},
{
"epoch": 0.5751765893037336,
"grad_norm": 0.5672691986658981,
"learning_rate": 4.551050948451542e-06,
"loss": 0.5941,
"step": 190
},
{
"epoch": 0.5782038345105953,
"grad_norm": 0.5343510102399306,
"learning_rate": 4.498401913115975e-06,
"loss": 0.5923,
"step": 191
},
{
"epoch": 0.5812310797174571,
"grad_norm": 0.6321550272676739,
"learning_rate": 4.445809000494945e-06,
"loss": 0.5915,
"step": 192
},
{
"epoch": 0.5842583249243188,
"grad_norm": 0.7435167051194291,
"learning_rate": 4.393278095094591e-06,
"loss": 0.6,
"step": 193
},
{
"epoch": 0.5872855701311807,
"grad_norm": 0.6450789080234447,
"learning_rate": 4.340815074483199e-06,
"loss": 0.6159,
"step": 194
},
{
"epoch": 0.5903128153380424,
"grad_norm": 0.6930560431540468,
"learning_rate": 4.2884258086335755e-06,
"loss": 0.5963,
"step": 195
},
{
"epoch": 0.5933400605449042,
"grad_norm": 0.6255483765313397,
"learning_rate": 4.2361161592662655e-06,
"loss": 0.6052,
"step": 196
},
{
"epoch": 0.5963673057517659,
"grad_norm": 0.5639810267312384,
"learning_rate": 4.183891979193703e-06,
"loss": 0.6015,
"step": 197
},
{
"epoch": 0.5993945509586277,
"grad_norm": 0.548527928065822,
"learning_rate": 4.131759111665349e-06,
"loss": 0.5703,
"step": 198
},
{
"epoch": 0.6024217961654894,
"grad_norm": 0.7324829472314374,
"learning_rate": 4.079723389713899e-06,
"loss": 0.5807,
"step": 199
},
{
"epoch": 0.6054490413723511,
"grad_norm": 0.5518608852991715,
"learning_rate": 4.027790635502646e-06,
"loss": 0.5882,
"step": 200
},
{
"epoch": 0.6084762865792129,
"grad_norm": 0.45905744984518376,
"learning_rate": 3.975966659674048e-06,
"loss": 0.5838,
"step": 201
},
{
"epoch": 0.6115035317860746,
"grad_norm": 0.5282546348577034,
"learning_rate": 3.924257260699583e-06,
"loss": 0.6181,
"step": 202
},
{
"epoch": 0.6145307769929365,
"grad_norm": 0.5946914919978067,
"learning_rate": 3.872668224230979e-06,
"loss": 0.5998,
"step": 203
},
{
"epoch": 0.6175580221997982,
"grad_norm": 0.6580516849807101,
"learning_rate": 3.821205322452863e-06,
"loss": 0.5827,
"step": 204
},
{
"epoch": 0.62058526740666,
"grad_norm": 0.5690309734419176,
"learning_rate": 3.769874313436933e-06,
"loss": 0.5907,
"step": 205
},
{
"epoch": 0.6236125126135217,
"grad_norm": 0.49747583948855606,
"learning_rate": 3.7186809404976877e-06,
"loss": 0.5743,
"step": 206
},
{
"epoch": 0.6266397578203835,
"grad_norm": 0.523896476102964,
"learning_rate": 3.667630931549826e-06,
"loss": 0.5897,
"step": 207
},
{
"epoch": 0.6296670030272452,
"grad_norm": 0.5963534390732177,
"learning_rate": 3.6167299984673655e-06,
"loss": 0.5809,
"step": 208
},
{
"epoch": 0.6326942482341069,
"grad_norm": 0.5768474107934146,
"learning_rate": 3.5659838364445505e-06,
"loss": 0.6127,
"step": 209
},
{
"epoch": 0.6357214934409687,
"grad_norm": 0.5498181194865619,
"learning_rate": 3.5153981233586277e-06,
"loss": 0.597,
"step": 210
},
{
"epoch": 0.6387487386478304,
"grad_norm": 0.4841409122780873,
"learning_rate": 3.4649785191345613e-06,
"loss": 0.5908,
"step": 211
},
{
"epoch": 0.6417759838546923,
"grad_norm": 0.5952197224082245,
"learning_rate": 3.4147306651117663e-06,
"loss": 0.5886,
"step": 212
},
{
"epoch": 0.644803229061554,
"grad_norm": 0.5759467283025765,
"learning_rate": 3.3646601834128924e-06,
"loss": 0.5813,
"step": 213
},
{
"epoch": 0.6478304742684158,
"grad_norm": 0.504084506350078,
"learning_rate": 3.3147726763147913e-06,
"loss": 0.5567,
"step": 214
},
{
"epoch": 0.6508577194752775,
"grad_norm": 0.6179436505824255,
"learning_rate": 3.2650737256216885e-06,
"loss": 0.5736,
"step": 215
},
{
"epoch": 0.6538849646821393,
"grad_norm": 0.5858944695639594,
"learning_rate": 3.2155688920406415e-06,
"loss": 0.561,
"step": 216
},
{
"epoch": 0.656912209889001,
"grad_norm": 0.5696652278554001,
"learning_rate": 3.16626371455937e-06,
"loss": 0.5977,
"step": 217
},
{
"epoch": 0.6599394550958627,
"grad_norm": 0.6108743561571359,
"learning_rate": 3.1171637098265063e-06,
"loss": 0.5868,
"step": 218
},
{
"epoch": 0.6629667003027245,
"grad_norm": 0.5740706419823538,
"learning_rate": 3.0682743715343565e-06,
"loss": 0.5919,
"step": 219
},
{
"epoch": 0.6659939455095862,
"grad_norm": 0.568317252040956,
"learning_rate": 3.019601169804216e-06,
"loss": 0.6065,
"step": 220
},
{
"epoch": 0.669021190716448,
"grad_norm": 0.5769966351779006,
"learning_rate": 2.9711495505743317e-06,
"loss": 0.5608,
"step": 221
},
{
"epoch": 0.6720484359233098,
"grad_norm": 0.5599812939135156,
"learning_rate": 2.9229249349905686e-06,
"loss": 0.5842,
"step": 222
},
{
"epoch": 0.6750756811301716,
"grad_norm": 0.5132372279255009,
"learning_rate": 2.8749327187998516e-06,
"loss": 0.5846,
"step": 223
},
{
"epoch": 0.6781029263370333,
"grad_norm": 0.6135819674081163,
"learning_rate": 2.8271782717464413e-06,
"loss": 0.6023,
"step": 224
},
{
"epoch": 0.6811301715438951,
"grad_norm": 0.49680909922668626,
"learning_rate": 2.7796669369711294e-06,
"loss": 0.5744,
"step": 225
},
{
"epoch": 0.6841574167507568,
"grad_norm": 0.5583835598384879,
"learning_rate": 2.7324040304134125e-06,
"loss": 0.5978,
"step": 226
},
{
"epoch": 0.6871846619576185,
"grad_norm": 0.5766383124023369,
"learning_rate": 2.685394840216688e-06,
"loss": 0.5922,
"step": 227
},
{
"epoch": 0.6902119071644803,
"grad_norm": 0.5643109717278529,
"learning_rate": 2.6386446261365874e-06,
"loss": 0.5705,
"step": 228
},
{
"epoch": 0.693239152371342,
"grad_norm": 0.5454688534879062,
"learning_rate": 2.5921586189524694e-06,
"loss": 0.5844,
"step": 229
},
{
"epoch": 0.6962663975782039,
"grad_norm": 0.623740507871138,
"learning_rate": 2.5459420198821604e-06,
"loss": 0.619,
"step": 230
},
{
"epoch": 0.6992936427850656,
"grad_norm": 0.5831337080836875,
"learning_rate": 2.5000000000000015e-06,
"loss": 0.6039,
"step": 231
},
{
"epoch": 0.7023208879919274,
"grad_norm": 0.5389878318645905,
"learning_rate": 2.454337699658267e-06,
"loss": 0.6089,
"step": 232
},
{
"epoch": 0.7053481331987891,
"grad_norm": 0.5690652768890263,
"learning_rate": 2.4089602279120224e-06,
"loss": 0.5837,
"step": 233
},
{
"epoch": 0.7083753784056509,
"grad_norm": 0.5733144963226237,
"learning_rate": 2.363872661947488e-06,
"loss": 0.5867,
"step": 234
},
{
"epoch": 0.7114026236125126,
"grad_norm": 0.5425153422196943,
"learning_rate": 2.319080046513954e-06,
"loss": 0.5877,
"step": 235
},
{
"epoch": 0.7144298688193743,
"grad_norm": 0.5612145710224937,
"learning_rate": 2.274587393359342e-06,
"loss": 0.5708,
"step": 236
},
{
"epoch": 0.7174571140262361,
"grad_norm": 0.535857077864264,
"learning_rate": 2.230399680669449e-06,
"loss": 0.5675,
"step": 237
},
{
"epoch": 0.7204843592330978,
"grad_norm": 0.5273349601397522,
"learning_rate": 2.1865218525109496e-06,
"loss": 0.5874,
"step": 238
},
{
"epoch": 0.7235116044399597,
"grad_norm": 0.5796363429064205,
"learning_rate": 2.1429588182782147e-06,
"loss": 0.5946,
"step": 239
},
{
"epoch": 0.7265388496468214,
"grad_norm": 0.5328272940580978,
"learning_rate": 2.09971545214401e-06,
"loss": 0.589,
"step": 240
},
{
"epoch": 0.7295660948536832,
"grad_norm": 0.5344685316026941,
"learning_rate": 2.0567965925141366e-06,
"loss": 0.5945,
"step": 241
},
{
"epoch": 0.7325933400605449,
"grad_norm": 0.565859281080142,
"learning_rate": 2.0142070414860704e-06,
"loss": 0.5788,
"step": 242
},
{
"epoch": 0.7356205852674067,
"grad_norm": 0.5153266259521138,
"learning_rate": 1.971951564311668e-06,
"loss": 0.5973,
"step": 243
},
{
"epoch": 0.7386478304742684,
"grad_norm": 0.5240446772325741,
"learning_rate": 1.9300348888639915e-06,
"loss": 0.5431,
"step": 244
},
{
"epoch": 0.7416750756811302,
"grad_norm": 0.534953964759512,
"learning_rate": 1.8884617051083183e-06,
"loss": 0.5832,
"step": 245
},
{
"epoch": 0.7447023208879919,
"grad_norm": 0.4979606670452873,
"learning_rate": 1.8472366645773892e-06,
"loss": 0.571,
"step": 246
},
{
"epoch": 0.7477295660948536,
"grad_norm": 0.5130245378945086,
"learning_rate": 1.8063643798509594e-06,
"loss": 0.5646,
"step": 247
},
{
"epoch": 0.7507568113017155,
"grad_norm": 0.5323596668309327,
"learning_rate": 1.7658494240397127e-06,
"loss": 0.5812,
"step": 248
},
{
"epoch": 0.7537840565085772,
"grad_norm": 0.4861912769618461,
"learning_rate": 1.7256963302735752e-06,
"loss": 0.5751,
"step": 249
},
{
"epoch": 0.756811301715439,
"grad_norm": 0.5364551553684321,
"learning_rate": 1.68590959119452e-06,
"loss": 0.5558,
"step": 250
},
{
"epoch": 0.7598385469223007,
"grad_norm": 0.5478652783214457,
"learning_rate": 1.646493658453896e-06,
"loss": 0.5751,
"step": 251
},
{
"epoch": 0.7628657921291625,
"grad_norm": 0.5077435584491996,
"learning_rate": 1.6074529422143398e-06,
"loss": 0.5661,
"step": 252
},
{
"epoch": 0.7658930373360242,
"grad_norm": 0.5357597171272159,
"learning_rate": 1.5687918106563326e-06,
"loss": 0.5859,
"step": 253
},
{
"epoch": 0.768920282542886,
"grad_norm": 0.47868175675345265,
"learning_rate": 1.5305145894894547e-06,
"loss": 0.5524,
"step": 254
},
{
"epoch": 0.7719475277497477,
"grad_norm": 0.6035483312601989,
"learning_rate": 1.4926255614683931e-06,
"loss": 0.5823,
"step": 255
},
{
"epoch": 0.7749747729566094,
"grad_norm": 0.5511280570901731,
"learning_rate": 1.4551289659137497e-06,
"loss": 0.6083,
"step": 256
},
{
"epoch": 0.7780020181634713,
"grad_norm": 0.5108934043144912,
"learning_rate": 1.4180289982377138e-06,
"loss": 0.5828,
"step": 257
},
{
"epoch": 0.781029263370333,
"grad_norm": 0.6121486544198794,
"learning_rate": 1.3813298094746491e-06,
"loss": 0.5916,
"step": 258
},
{
"epoch": 0.7840565085771948,
"grad_norm": 0.5264008489117372,
"learning_rate": 1.345035505816642e-06,
"loss": 0.5924,
"step": 259
},
{
"epoch": 0.7870837537840565,
"grad_norm": 0.5104023168267343,
"learning_rate": 1.3091501481540676e-06,
"loss": 0.5773,
"step": 260
},
{
"epoch": 0.7901109989909183,
"grad_norm": 0.5631499535157282,
"learning_rate": 1.2736777516212267e-06,
"loss": 0.5861,
"step": 261
},
{
"epoch": 0.79313824419778,
"grad_norm": 0.4886744980677968,
"learning_rate": 1.238622285147103e-06,
"loss": 0.576,
"step": 262
},
{
"epoch": 0.7961654894046418,
"grad_norm": 0.5770670045506326,
"learning_rate": 1.2039876710112847e-06,
"loss": 0.5982,
"step": 263
},
{
"epoch": 0.7991927346115035,
"grad_norm": 0.6244884325170447,
"learning_rate": 1.1697777844051105e-06,
"loss": 0.5965,
"step": 264
},
{
"epoch": 0.8022199798183652,
"grad_norm": 0.520243748614553,
"learning_rate": 1.135996452998085e-06,
"loss": 0.5842,
"step": 265
},
{
"epoch": 0.805247225025227,
"grad_norm": 0.5416963581354513,
"learning_rate": 1.1026474565096068e-06,
"loss": 0.5718,
"step": 266
},
{
"epoch": 0.8082744702320888,
"grad_norm": 0.6148117033884745,
"learning_rate": 1.0697345262860638e-06,
"loss": 0.6177,
"step": 267
},
{
"epoch": 0.8113017154389506,
"grad_norm": 0.5214972151161207,
"learning_rate": 1.0372613448833429e-06,
"loss": 0.5591,
"step": 268
},
{
"epoch": 0.8143289606458123,
"grad_norm": 0.6122031262529768,
"learning_rate": 1.0052315456547934e-06,
"loss": 0.6042,
"step": 269
},
{
"epoch": 0.8173562058526741,
"grad_norm": 0.5758477746938059,
"learning_rate": 9.73648712344707e-07,
"loss": 0.6115,
"step": 270
},
{
"epoch": 0.8203834510595358,
"grad_norm": 0.570112512816937,
"learning_rate": 9.425163786873292e-07,
"loss": 0.5866,
"step": 271
},
{
"epoch": 0.8234106962663976,
"grad_norm": 0.5820516198175567,
"learning_rate": 9.118380280114858e-07,
"loss": 0.5947,
"step": 272
},
{
"epoch": 0.8264379414732593,
"grad_norm": 0.5580731212292597,
"learning_rate": 8.816170928508367e-07,
"loss": 0.557,
"step": 273
},
{
"epoch": 0.829465186680121,
"grad_norm": 0.6320445569049652,
"learning_rate": 8.518569545598198e-07,
"loss": 0.5678,
"step": 274
},
{
"epoch": 0.8324924318869829,
"grad_norm": 0.5175089657506726,
"learning_rate": 8.225609429353187e-07,
"loss": 0.5768,
"step": 275
},
{
"epoch": 0.8355196770938446,
"grad_norm": 0.5267385437616795,
"learning_rate": 7.937323358440935e-07,
"loss": 0.5771,
"step": 276
},
{
"epoch": 0.8385469223007064,
"grad_norm": 0.5577148900883375,
"learning_rate": 7.653743588560387e-07,
"loss": 0.5997,
"step": 277
},
{
"epoch": 0.8415741675075681,
"grad_norm": 0.5849415807243107,
"learning_rate": 7.374901848832683e-07,
"loss": 0.6034,
"step": 278
},
{
"epoch": 0.8446014127144299,
"grad_norm": 0.5516315304697584,
"learning_rate": 7.100829338251147e-07,
"loss": 0.5825,
"step": 279
},
{
"epoch": 0.8476286579212916,
"grad_norm": 0.6504410341118083,
"learning_rate": 6.831556722190453e-07,
"loss": 0.5754,
"step": 280
},
{
"epoch": 0.8506559031281534,
"grad_norm": 0.508666674698585,
"learning_rate": 6.567114128975571e-07,
"loss": 0.5448,
"step": 281
},
{
"epoch": 0.8536831483350151,
"grad_norm": 0.5219473110967775,
"learning_rate": 6.307531146510754e-07,
"loss": 0.5877,
"step": 282
},
{
"epoch": 0.856710393541877,
"grad_norm": 0.5729786897858258,
"learning_rate": 6.052836818969027e-07,
"loss": 0.5605,
"step": 283
},
{
"epoch": 0.8597376387487387,
"grad_norm": 0.5258807165804881,
"learning_rate": 5.803059643542491e-07,
"loss": 0.5809,
"step": 284
},
{
"epoch": 0.8627648839556004,
"grad_norm": 0.537495933532566,
"learning_rate": 5.558227567253832e-07,
"loss": 0.5837,
"step": 285
},
{
"epoch": 0.8657921291624622,
"grad_norm": 0.515813825148231,
"learning_rate": 5.318367983829393e-07,
"loss": 0.6036,
"step": 286
},
{
"epoch": 0.8688193743693239,
"grad_norm": 0.499231906681902,
"learning_rate": 5.083507730634152e-07,
"loss": 0.5745,
"step": 287
},
{
"epoch": 0.8718466195761857,
"grad_norm": 0.5908751687131693,
"learning_rate": 4.853673085668947e-07,
"loss": 0.6125,
"step": 288
},
{
"epoch": 0.8748738647830474,
"grad_norm": 0.47941442655226785,
"learning_rate": 4.628889764630279e-07,
"loss": 0.5664,
"step": 289
},
{
"epoch": 0.8779011099899092,
"grad_norm": 0.46797616578568446,
"learning_rate": 4.4091829180330503e-07,
"loss": 0.5982,
"step": 290
},
{
"epoch": 0.8809283551967709,
"grad_norm": 0.5277287671978815,
"learning_rate": 4.194577128396521e-07,
"loss": 0.576,
"step": 291
},
{
"epoch": 0.8839556004036327,
"grad_norm": 0.5235787121811849,
"learning_rate": 3.985096407493838e-07,
"loss": 0.5954,
"step": 292
},
{
"epoch": 0.8869828456104945,
"grad_norm": 0.47994840593868204,
"learning_rate": 3.7807641936653984e-07,
"loss": 0.5762,
"step": 293
},
{
"epoch": 0.8900100908173562,
"grad_norm": 0.7091280394514846,
"learning_rate": 3.581603349196372e-07,
"loss": 0.5759,
"step": 294
},
{
"epoch": 0.893037336024218,
"grad_norm": 0.5116029984310859,
"learning_rate": 3.3876361577587115e-07,
"loss": 0.5605,
"step": 295
},
{
"epoch": 0.8960645812310797,
"grad_norm": 0.5589353184753382,
"learning_rate": 3.1988843219178776e-07,
"loss": 0.59,
"step": 296
},
{
"epoch": 0.8990918264379415,
"grad_norm": 0.5658147075025127,
"learning_rate": 3.015368960704584e-07,
"loss": 0.5762,
"step": 297
},
{
"epoch": 0.9021190716448032,
"grad_norm": 0.5882503585990159,
"learning_rate": 2.8371106072518194e-07,
"loss": 0.5671,
"step": 298
},
{
"epoch": 0.905146316851665,
"grad_norm": 0.5755873180319001,
"learning_rate": 2.664129206497479e-07,
"loss": 0.5929,
"step": 299
},
{
"epoch": 0.9081735620585267,
"grad_norm": 0.5209213812443522,
"learning_rate": 2.4964441129527337e-07,
"loss": 0.5803,
"step": 300
},
{
"epoch": 0.9112008072653885,
"grad_norm": 0.5965283286772103,
"learning_rate": 2.3340740885364922e-07,
"loss": 0.5801,
"step": 301
},
{
"epoch": 0.9142280524722503,
"grad_norm": 0.592260031430601,
"learning_rate": 2.1770373004762035e-07,
"loss": 0.5588,
"step": 302
},
{
"epoch": 0.917255297679112,
"grad_norm": 0.5055643569791889,
"learning_rate": 2.0253513192751374e-07,
"loss": 0.563,
"step": 303
},
{
"epoch": 0.9202825428859738,
"grad_norm": 0.4958132490271683,
"learning_rate": 1.8790331167464758e-07,
"loss": 0.5918,
"step": 304
},
{
"epoch": 0.9233097880928355,
"grad_norm": 0.5415503531670961,
"learning_rate": 1.738099064114368e-07,
"loss": 0.5621,
"step": 305
},
{
"epoch": 0.9263370332996973,
"grad_norm": 0.4702073705919661,
"learning_rate": 1.6025649301821877e-07,
"loss": 0.5937,
"step": 306
},
{
"epoch": 0.929364278506559,
"grad_norm": 0.4879121640901005,
"learning_rate": 1.4724458795681962e-07,
"loss": 0.5864,
"step": 307
},
{
"epoch": 0.9323915237134208,
"grad_norm": 0.5245732459296225,
"learning_rate": 1.3477564710088097e-07,
"loss": 0.5502,
"step": 308
},
{
"epoch": 0.9354187689202825,
"grad_norm": 0.5278052522669653,
"learning_rate": 1.2285106557296479e-07,
"loss": 0.5937,
"step": 309
},
{
"epoch": 0.9384460141271443,
"grad_norm": 0.6237856474515564,
"learning_rate": 1.1147217758845752e-07,
"loss": 0.6039,
"step": 310
},
{
"epoch": 0.941473259334006,
"grad_norm": 0.5242772954775254,
"learning_rate": 1.0064025630628583e-07,
"loss": 0.5893,
"step": 311
},
{
"epoch": 0.9445005045408678,
"grad_norm": 0.5968530957059317,
"learning_rate": 9.035651368646647e-08,
"loss": 0.5754,
"step": 312
},
{
"epoch": 0.9475277497477296,
"grad_norm": 0.5767016532486906,
"learning_rate": 8.06221003545038e-08,
"loss": 0.5744,
"step": 313
},
{
"epoch": 0.9505549949545913,
"grad_norm": 0.4886610578988748,
"learning_rate": 7.143810547264762e-08,
"loss": 0.5838,
"step": 314
},
{
"epoch": 0.9535822401614531,
"grad_norm": 0.4954700924731553,
"learning_rate": 6.280555661802857e-08,
"loss": 0.5738,
"step": 315
},
{
"epoch": 0.9566094853683148,
"grad_norm": 0.5021413532010757,
"learning_rate": 5.472541966768552e-08,
"loss": 0.5547,
"step": 316
},
{
"epoch": 0.9596367305751766,
"grad_norm": 0.489196020943252,
"learning_rate": 4.719859869049659e-08,
"loss": 0.5927,
"step": 317
},
{
"epoch": 0.9626639757820383,
"grad_norm": 0.6080792649031748,
"learning_rate": 4.02259358460233e-08,
"loss": 0.604,
"step": 318
},
{
"epoch": 0.9656912209889001,
"grad_norm": 0.5165669751885817,
"learning_rate": 3.3808211290284886e-08,
"loss": 0.5545,
"step": 319
},
{
"epoch": 0.9687184661957619,
"grad_norm": 0.5480571861823927,
"learning_rate": 2.7946143088466437e-08,
"loss": 0.5681,
"step": 320
},
{
"epoch": 0.9717457114026236,
"grad_norm": 0.5445727157430111,
"learning_rate": 2.264038713457706e-08,
"loss": 0.5935,
"step": 321
},
{
"epoch": 0.9747729566094854,
"grad_norm": 0.49634607045068185,
"learning_rate": 1.789153707806357e-08,
"loss": 0.5815,
"step": 322
},
{
"epoch": 0.9778002018163471,
"grad_norm": 0.5190636033293253,
"learning_rate": 1.3700124257388092e-08,
"loss": 0.568,
"step": 323
},
{
"epoch": 0.9808274470232089,
"grad_norm": 0.4828384456231745,
"learning_rate": 1.006661764057837e-08,
"loss": 0.5889,
"step": 324
},
{
"epoch": 0.9838546922300706,
"grad_norm": 0.5214611299568966,
"learning_rate": 6.991423772753636e-09,
"loss": 0.5892,
"step": 325
},
{
"epoch": 0.9868819374369324,
"grad_norm": 0.5311765550115639,
"learning_rate": 4.474886730641004e-09,
"loss": 0.5557,
"step": 326
},
{
"epoch": 0.9899091826437941,
"grad_norm": 0.5581341670017683,
"learning_rate": 2.5172880840745873e-09,
"loss": 0.5947,
"step": 327
},
{
"epoch": 0.992936427850656,
"grad_norm": 0.5483417476042656,
"learning_rate": 1.118846864490708e-09,
"loss": 0.5799,
"step": 328
},
{
"epoch": 0.9959636730575177,
"grad_norm": 0.5108713336774127,
"learning_rate": 2.797195404247166e-10,
"loss": 0.6077,
"step": 329
},
{
"epoch": 0.9989909182643795,
"grad_norm": 0.5197265314296852,
"learning_rate": 0.0,
"loss": 0.5758,
"step": 330
},
{
"epoch": 0.9989909182643795,
"step": 330,
"total_flos": 2.128428444764078e+17,
"train_loss": 0.6183218762730107,
"train_runtime": 10772.3595,
"train_samples_per_second": 1.472,
"train_steps_per_second": 0.031
}
],
"logging_steps": 1,
"max_steps": 330,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 400,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.128428444764078e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}