{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.6007604562737643,
"eval_steps": 500,
"global_step": 948,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0006337135614702154,
"grad_norm": 0.22353313863277435,
"learning_rate": 2e-05,
"loss": 0.795,
"step": 1
},
{
"epoch": 0.0012674271229404308,
"grad_norm": 0.270685613155365,
"learning_rate": 4e-05,
"loss": 0.9841,
"step": 2
},
{
"epoch": 0.0019011406844106464,
"grad_norm": 0.13555319607257843,
"learning_rate": 6e-05,
"loss": 0.8728,
"step": 3
},
{
"epoch": 0.0025348542458808617,
"grad_norm": 0.1665652096271515,
"learning_rate": 8e-05,
"loss": 0.8625,
"step": 4
},
{
"epoch": 0.0031685678073510772,
"grad_norm": 0.13588839769363403,
"learning_rate": 0.0001,
"loss": 0.6776,
"step": 5
},
{
"epoch": 0.0038022813688212928,
"grad_norm": 0.2811749279499054,
"learning_rate": 0.00012,
"loss": 0.8813,
"step": 6
},
{
"epoch": 0.004435994930291508,
"grad_norm": 0.327694833278656,
"learning_rate": 0.00014,
"loss": 0.9009,
"step": 7
},
{
"epoch": 0.005069708491761723,
"grad_norm": 0.24555213749408722,
"learning_rate": 0.00016,
"loss": 0.7054,
"step": 8
},
{
"epoch": 0.005703422053231939,
"grad_norm": 0.14921338856220245,
"learning_rate": 0.00018,
"loss": 0.697,
"step": 9
},
{
"epoch": 0.0063371356147021544,
"grad_norm": 0.13169103860855103,
"learning_rate": 0.0002,
"loss": 0.6007,
"step": 10
},
{
"epoch": 0.00697084917617237,
"grad_norm": 0.06807047873735428,
"learning_rate": 0.00019999979928608238,
"loss": 0.6155,
"step": 11
},
{
"epoch": 0.0076045627376425855,
"grad_norm": 0.08288167417049408,
"learning_rate": 0.00019999919714513528,
"loss": 0.5641,
"step": 12
},
{
"epoch": 0.008238276299112801,
"grad_norm": 0.12285872548818588,
"learning_rate": 0.00019999819357957582,
"loss": 0.7526,
"step": 13
},
{
"epoch": 0.008871989860583017,
"grad_norm": 0.15566691756248474,
"learning_rate": 0.00019999678859343263,
"loss": 0.4519,
"step": 14
},
{
"epoch": 0.009505703422053232,
"grad_norm": 0.1301712989807129,
"learning_rate": 0.00019999498219234568,
"loss": 0.486,
"step": 15
},
{
"epoch": 0.010139416983523447,
"grad_norm": 0.14493511617183685,
"learning_rate": 0.00019999277438356638,
"loss": 0.7146,
"step": 16
},
{
"epoch": 0.010773130544993664,
"grad_norm": 0.1372271478176117,
"learning_rate": 0.00019999016517595753,
"loss": 0.5933,
"step": 17
},
{
"epoch": 0.011406844106463879,
"grad_norm": 0.09944190829992294,
"learning_rate": 0.00019998715457999314,
"loss": 0.8399,
"step": 18
},
{
"epoch": 0.012040557667934094,
"grad_norm": 0.057923465967178345,
"learning_rate": 0.0001999837426077586,
"loss": 0.5613,
"step": 19
},
{
"epoch": 0.012674271229404309,
"grad_norm": 0.06214901804924011,
"learning_rate": 0.00019997992927295059,
"loss": 0.5374,
"step": 20
},
{
"epoch": 0.013307984790874524,
"grad_norm": 0.04898112639784813,
"learning_rate": 0.0001999757145908768,
"loss": 0.5451,
"step": 21
},
{
"epoch": 0.01394169835234474,
"grad_norm": 0.07026948034763336,
"learning_rate": 0.0001999710985784562,
"loss": 0.5635,
"step": 22
},
{
"epoch": 0.014575411913814956,
"grad_norm": 0.0672365352511406,
"learning_rate": 0.00019996608125421873,
"loss": 0.5996,
"step": 23
},
{
"epoch": 0.015209125475285171,
"grad_norm": 0.06477885693311691,
"learning_rate": 0.00019996066263830531,
"loss": 0.4707,
"step": 24
},
{
"epoch": 0.015842839036755388,
"grad_norm": 0.07720793038606644,
"learning_rate": 0.0001999548427524678,
"loss": 0.5891,
"step": 25
},
{
"epoch": 0.016476552598225603,
"grad_norm": 0.06699500977993011,
"learning_rate": 0.0001999486216200688,
"loss": 0.5316,
"step": 26
},
{
"epoch": 0.017110266159695818,
"grad_norm": 0.07539479434490204,
"learning_rate": 0.00019994199926608172,
"loss": 0.5854,
"step": 27
},
{
"epoch": 0.017743979721166033,
"grad_norm": 4.677523136138916,
"learning_rate": 0.00019993497571709048,
"loss": 0.5019,
"step": 28
},
{
"epoch": 0.018377693282636248,
"grad_norm": 0.07100815325975418,
"learning_rate": 0.00019992755100128962,
"loss": 0.4729,
"step": 29
},
{
"epoch": 0.019011406844106463,
"grad_norm": 0.06506210565567017,
"learning_rate": 0.000199919725148484,
"loss": 0.5597,
"step": 30
},
{
"epoch": 0.01964512040557668,
"grad_norm": 0.04945315420627594,
"learning_rate": 0.0001999114981900887,
"loss": 0.5044,
"step": 31
},
{
"epoch": 0.020278833967046894,
"grad_norm": 0.05103156715631485,
"learning_rate": 0.0001999028701591291,
"loss": 0.3637,
"step": 32
},
{
"epoch": 0.02091254752851711,
"grad_norm": 0.05288761481642723,
"learning_rate": 0.00019989384109024048,
"loss": 0.4345,
"step": 33
},
{
"epoch": 0.021546261089987327,
"grad_norm": 0.05457635968923569,
"learning_rate": 0.0001998844110196681,
"loss": 0.4714,
"step": 34
},
{
"epoch": 0.022179974651457542,
"grad_norm": 0.055830612778663635,
"learning_rate": 0.0001998745799852668,
"loss": 0.5285,
"step": 35
},
{
"epoch": 0.022813688212927757,
"grad_norm": 0.05858856439590454,
"learning_rate": 0.00019986434802650113,
"loss": 0.5106,
"step": 36
},
{
"epoch": 0.023447401774397972,
"grad_norm": 0.05847540497779846,
"learning_rate": 0.00019985371518444503,
"loss": 0.4394,
"step": 37
},
{
"epoch": 0.024081115335868188,
"grad_norm": 0.1140831857919693,
"learning_rate": 0.00019984268150178167,
"loss": 0.4782,
"step": 38
},
{
"epoch": 0.024714828897338403,
"grad_norm": 0.06483329832553864,
"learning_rate": 0.00019983124702280334,
"loss": 0.396,
"step": 39
},
{
"epoch": 0.025348542458808618,
"grad_norm": 0.07212468981742859,
"learning_rate": 0.00019981941179341117,
"loss": 0.5173,
"step": 40
},
{
"epoch": 0.025982256020278833,
"grad_norm": 0.1697537750005722,
"learning_rate": 0.00019980717586111512,
"loss": 0.6164,
"step": 41
},
{
"epoch": 0.026615969581749048,
"grad_norm": 0.05975339934229851,
"learning_rate": 0.00019979453927503364,
"loss": 0.4981,
"step": 42
},
{
"epoch": 0.027249683143219267,
"grad_norm": 0.0607403926551342,
"learning_rate": 0.00019978150208589348,
"loss": 0.533,
"step": 43
},
{
"epoch": 0.02788339670468948,
"grad_norm": 0.07225210964679718,
"learning_rate": 0.00019976806434602952,
"loss": 0.5055,
"step": 44
},
{
"epoch": 0.028517110266159697,
"grad_norm": 0.07008686661720276,
"learning_rate": 0.00019975422610938462,
"loss": 0.6274,
"step": 45
},
{
"epoch": 0.029150823827629912,
"grad_norm": 0.07289402186870575,
"learning_rate": 0.0001997399874315093,
"loss": 0.5247,
"step": 46
},
{
"epoch": 0.029784537389100127,
"grad_norm": 0.10037431120872498,
"learning_rate": 0.0001997253483695616,
"loss": 0.647,
"step": 47
},
{
"epoch": 0.030418250950570342,
"grad_norm": 0.06468270719051361,
"learning_rate": 0.00019971030898230672,
"loss": 0.5719,
"step": 48
},
{
"epoch": 0.031051964512040557,
"grad_norm": 0.0472278967499733,
"learning_rate": 0.00019969486933011705,
"loss": 0.5565,
"step": 49
},
{
"epoch": 0.031685678073510776,
"grad_norm": 0.0584145151078701,
"learning_rate": 0.00019967902947497156,
"loss": 0.5432,
"step": 50
},
{
"epoch": 0.03231939163498099,
"grad_norm": 0.08962458372116089,
"learning_rate": 0.00019966278948045592,
"loss": 0.6432,
"step": 51
},
{
"epoch": 0.032953105196451206,
"grad_norm": 0.08193643391132355,
"learning_rate": 0.00019964614941176195,
"loss": 0.5341,
"step": 52
},
{
"epoch": 0.03358681875792142,
"grad_norm": 0.07166769355535507,
"learning_rate": 0.00019962910933568747,
"loss": 0.5481,
"step": 53
},
{
"epoch": 0.034220532319391636,
"grad_norm": 0.10422351956367493,
"learning_rate": 0.00019961166932063614,
"loss": 0.6145,
"step": 54
},
{
"epoch": 0.03485424588086185,
"grad_norm": 0.06273826211690903,
"learning_rate": 0.00019959382943661704,
"loss": 0.4969,
"step": 55
},
{
"epoch": 0.035487959442332066,
"grad_norm": 0.06504670530557632,
"learning_rate": 0.0001995755897552444,
"loss": 0.6093,
"step": 56
},
{
"epoch": 0.03612167300380228,
"grad_norm": 0.05045778304338455,
"learning_rate": 0.00019955695034973742,
"loss": 0.4191,
"step": 57
},
{
"epoch": 0.036755386565272496,
"grad_norm": 0.06495866179466248,
"learning_rate": 0.00019953791129491983,
"loss": 0.4762,
"step": 58
},
{
"epoch": 0.037389100126742715,
"grad_norm": 0.0814126655459404,
"learning_rate": 0.0001995184726672197,
"loss": 0.5599,
"step": 59
},
{
"epoch": 0.03802281368821293,
"grad_norm": 0.052061304450035095,
"learning_rate": 0.00019949863454466908,
"loss": 0.4822,
"step": 60
},
{
"epoch": 0.038656527249683145,
"grad_norm": 0.05419475957751274,
"learning_rate": 0.00019947839700690375,
"loss": 0.5625,
"step": 61
},
{
"epoch": 0.03929024081115336,
"grad_norm": 0.06495067477226257,
"learning_rate": 0.0001994577601351628,
"loss": 0.5863,
"step": 62
},
{
"epoch": 0.039923954372623575,
"grad_norm": 0.055791907012462616,
"learning_rate": 0.00019943672401228837,
"loss": 0.4588,
"step": 63
},
{
"epoch": 0.04055766793409379,
"grad_norm": 0.03923908621072769,
"learning_rate": 0.00019941528872272532,
"loss": 0.3841,
"step": 64
},
{
"epoch": 0.041191381495564006,
"grad_norm": 0.08200399577617645,
"learning_rate": 0.00019939345435252088,
"loss": 0.6163,
"step": 65
},
{
"epoch": 0.04182509505703422,
"grad_norm": 0.05708305537700653,
"learning_rate": 0.00019937122098932428,
"loss": 0.6363,
"step": 66
},
{
"epoch": 0.042458808618504436,
"grad_norm": 0.053468603640794754,
"learning_rate": 0.0001993485887223864,
"loss": 0.4777,
"step": 67
},
{
"epoch": 0.043092522179974654,
"grad_norm": 0.08539824187755585,
"learning_rate": 0.00019932555764255952,
"loss": 0.4922,
"step": 68
},
{
"epoch": 0.043726235741444866,
"grad_norm": 0.07483454793691635,
"learning_rate": 0.00019930212784229675,
"loss": 0.6337,
"step": 69
},
{
"epoch": 0.044359949302915085,
"grad_norm": 0.06771700084209442,
"learning_rate": 0.00019927829941565186,
"loss": 0.4559,
"step": 70
},
{
"epoch": 0.044993662864385296,
"grad_norm": 0.05689261853694916,
"learning_rate": 0.0001992540724582788,
"loss": 0.5489,
"step": 71
},
{
"epoch": 0.045627376425855515,
"grad_norm": 0.05044565722346306,
"learning_rate": 0.00019922944706743127,
"loss": 0.4472,
"step": 72
},
{
"epoch": 0.046261089987325726,
"grad_norm": 0.07331253588199615,
"learning_rate": 0.00019920442334196248,
"loss": 0.4752,
"step": 73
},
{
"epoch": 0.046894803548795945,
"grad_norm": 0.057449884712696075,
"learning_rate": 0.0001991790013823246,
"loss": 0.4525,
"step": 74
},
{
"epoch": 0.04752851711026616,
"grad_norm": 0.08357278257608414,
"learning_rate": 0.00019915318129056853,
"loss": 0.5813,
"step": 75
},
{
"epoch": 0.048162230671736375,
"grad_norm": 0.051311176270246506,
"learning_rate": 0.00019912696317034322,
"loss": 0.4593,
"step": 76
},
{
"epoch": 0.048795944233206594,
"grad_norm": 0.06535078585147858,
"learning_rate": 0.00019910034712689552,
"loss": 0.5339,
"step": 77
},
{
"epoch": 0.049429657794676805,
"grad_norm": 0.13796891272068024,
"learning_rate": 0.00019907333326706967,
"loss": 0.5438,
"step": 78
},
{
"epoch": 0.050063371356147024,
"grad_norm": 0.05667581036686897,
"learning_rate": 0.0001990459216993068,
"loss": 0.6295,
"step": 79
},
{
"epoch": 0.050697084917617236,
"grad_norm": 0.05243121087551117,
"learning_rate": 0.00019901811253364456,
"loss": 0.4782,
"step": 80
},
{
"epoch": 0.051330798479087454,
"grad_norm": 0.0769771933555603,
"learning_rate": 0.0001989899058817167,
"loss": 0.5692,
"step": 81
},
{
"epoch": 0.051964512040557666,
"grad_norm": 0.07334766536951065,
"learning_rate": 0.00019896130185675261,
"loss": 0.569,
"step": 82
},
{
"epoch": 0.052598225602027884,
"grad_norm": 0.07953603565692902,
"learning_rate": 0.00019893230057357671,
"loss": 0.4059,
"step": 83
},
{
"epoch": 0.053231939163498096,
"grad_norm": 0.05282806232571602,
"learning_rate": 0.00019890290214860833,
"loss": 0.5186,
"step": 84
},
{
"epoch": 0.053865652724968315,
"grad_norm": 0.06661225110292435,
"learning_rate": 0.00019887310669986085,
"loss": 0.6404,
"step": 85
},
{
"epoch": 0.05449936628643853,
"grad_norm": 0.07150626182556152,
"learning_rate": 0.00019884291434694152,
"loss": 0.5865,
"step": 86
},
{
"epoch": 0.055133079847908745,
"grad_norm": 0.054674554616212845,
"learning_rate": 0.00019881232521105089,
"loss": 0.5429,
"step": 87
},
{
"epoch": 0.05576679340937896,
"grad_norm": 0.057950377464294434,
"learning_rate": 0.00019878133941498224,
"loss": 0.6705,
"step": 88
},
{
"epoch": 0.056400506970849175,
"grad_norm": 0.07045155763626099,
"learning_rate": 0.0001987499570831211,
"loss": 0.5393,
"step": 89
},
{
"epoch": 0.057034220532319393,
"grad_norm": 0.055960092693567276,
"learning_rate": 0.00019871817834144504,
"loss": 0.4481,
"step": 90
},
{
"epoch": 0.057667934093789605,
"grad_norm": 0.05631652846932411,
"learning_rate": 0.00019868600331752264,
"loss": 0.5963,
"step": 91
},
{
"epoch": 0.058301647655259824,
"grad_norm": 0.05120407044887543,
"learning_rate": 0.00019865343214051347,
"loss": 0.486,
"step": 92
},
{
"epoch": 0.058935361216730035,
"grad_norm": 0.05507562682032585,
"learning_rate": 0.0001986204649411673,
"loss": 0.5514,
"step": 93
},
{
"epoch": 0.059569074778200254,
"grad_norm": 0.057690516114234924,
"learning_rate": 0.0001985871018518236,
"loss": 0.4969,
"step": 94
},
{
"epoch": 0.060202788339670466,
"grad_norm": 0.05942325294017792,
"learning_rate": 0.00019855334300641114,
"loss": 0.51,
"step": 95
},
{
"epoch": 0.060836501901140684,
"grad_norm": 0.05777527391910553,
"learning_rate": 0.0001985191885404473,
"loss": 0.5401,
"step": 96
},
{
"epoch": 0.0614702154626109,
"grad_norm": 0.07077159732580185,
"learning_rate": 0.00019848463859103763,
"loss": 0.5568,
"step": 97
},
{
"epoch": 0.062103929024081114,
"grad_norm": 0.050649482756853104,
"learning_rate": 0.00019844969329687527,
"loss": 0.5418,
"step": 98
},
{
"epoch": 0.06273764258555133,
"grad_norm": 0.059522844851017,
"learning_rate": 0.00019841435279824028,
"loss": 0.4679,
"step": 99
},
{
"epoch": 0.06337135614702155,
"grad_norm": 0.061260003596544266,
"learning_rate": 0.0001983786172369993,
"loss": 0.557,
"step": 100
},
{
"epoch": 0.06400506970849176,
"grad_norm": 0.0513591468334198,
"learning_rate": 0.00019834248675660486,
"loss": 0.5849,
"step": 101
},
{
"epoch": 0.06463878326996197,
"grad_norm": 0.06722971051931381,
"learning_rate": 0.0001983059615020947,
"loss": 0.4003,
"step": 102
},
{
"epoch": 0.06527249683143219,
"grad_norm": 0.0629379004240036,
"learning_rate": 0.0001982690416200914,
"loss": 0.5322,
"step": 103
},
{
"epoch": 0.06590621039290241,
"grad_norm": 0.05402471870183945,
"learning_rate": 0.00019823172725880165,
"loss": 0.5634,
"step": 104
},
{
"epoch": 0.06653992395437262,
"grad_norm": 0.15680162608623505,
"learning_rate": 0.0001981940185680156,
"loss": 0.5361,
"step": 105
},
{
"epoch": 0.06717363751584284,
"grad_norm": 0.06348865479230881,
"learning_rate": 0.00019815591569910654,
"loss": 0.5322,
"step": 106
},
{
"epoch": 0.06780735107731306,
"grad_norm": 0.05004284158349037,
"learning_rate": 0.00019811741880502995,
"loss": 0.5524,
"step": 107
},
{
"epoch": 0.06844106463878327,
"grad_norm": 0.06271985173225403,
"learning_rate": 0.00019807852804032305,
"loss": 0.4347,
"step": 108
},
{
"epoch": 0.06907477820025348,
"grad_norm": 0.1546468287706375,
"learning_rate": 0.00019803924356110423,
"loss": 0.4294,
"step": 109
},
{
"epoch": 0.0697084917617237,
"grad_norm": 0.06472460180521011,
"learning_rate": 0.00019799956552507233,
"loss": 0.5693,
"step": 110
},
{
"epoch": 0.07034220532319392,
"grad_norm": 0.06021984666585922,
"learning_rate": 0.00019795949409150598,
"loss": 0.6554,
"step": 111
},
{
"epoch": 0.07097591888466413,
"grad_norm": 0.04533032327890396,
"learning_rate": 0.00019791902942126313,
"loss": 0.4425,
"step": 112
},
{
"epoch": 0.07160963244613434,
"grad_norm": 0.0662391185760498,
"learning_rate": 0.0001978781716767802,
"loss": 0.5258,
"step": 113
},
{
"epoch": 0.07224334600760456,
"grad_norm": 0.06131117045879364,
"learning_rate": 0.00019783692102207155,
"loss": 0.4556,
"step": 114
},
{
"epoch": 0.07287705956907478,
"grad_norm": 0.07924918830394745,
"learning_rate": 0.00019779527762272877,
"loss": 0.5137,
"step": 115
},
{
"epoch": 0.07351077313054499,
"grad_norm": 0.07061261683702469,
"learning_rate": 0.0001977532416459201,
"loss": 0.4554,
"step": 116
},
{
"epoch": 0.0741444866920152,
"grad_norm": 0.04919254407286644,
"learning_rate": 0.00019771081326038962,
"loss": 0.5213,
"step": 117
},
{
"epoch": 0.07477820025348543,
"grad_norm": 0.053799472749233246,
"learning_rate": 0.00019766799263645673,
"loss": 0.5648,
"step": 118
},
{
"epoch": 0.07541191381495564,
"grad_norm": 0.06857369095087051,
"learning_rate": 0.00019762477994601522,
"loss": 0.6841,
"step": 119
},
{
"epoch": 0.07604562737642585,
"grad_norm": 0.0719090923666954,
"learning_rate": 0.000197581175362533,
"loss": 0.4154,
"step": 120
},
{
"epoch": 0.07667934093789606,
"grad_norm": 0.10528447479009628,
"learning_rate": 0.00019753717906105092,
"loss": 0.5674,
"step": 121
},
{
"epoch": 0.07731305449936629,
"grad_norm": 0.05879104137420654,
"learning_rate": 0.00019749279121818235,
"loss": 0.5282,
"step": 122
},
{
"epoch": 0.0779467680608365,
"grad_norm": 0.050949644297361374,
"learning_rate": 0.00019744801201211255,
"loss": 0.4398,
"step": 123
},
{
"epoch": 0.07858048162230671,
"grad_norm": 0.061247747391462326,
"learning_rate": 0.00019740284162259765,
"loss": 0.4269,
"step": 124
},
{
"epoch": 0.07921419518377694,
"grad_norm": 0.09446462988853455,
"learning_rate": 0.0001973572802309642,
"loss": 0.6362,
"step": 125
},
{
"epoch": 0.07984790874524715,
"grad_norm": 0.06124195456504822,
"learning_rate": 0.0001973113280201082,
"loss": 0.435,
"step": 126
},
{
"epoch": 0.08048162230671736,
"grad_norm": 0.05198049172759056,
"learning_rate": 0.0001972649851744948,
"loss": 0.4617,
"step": 127
},
{
"epoch": 0.08111533586818757,
"grad_norm": 0.05457935482263565,
"learning_rate": 0.00019721825188015693,
"loss": 0.548,
"step": 128
},
{
"epoch": 0.0817490494296578,
"grad_norm": 0.054542481899261475,
"learning_rate": 0.0001971711283246951,
"loss": 0.4449,
"step": 129
},
{
"epoch": 0.08238276299112801,
"grad_norm": 0.0528152696788311,
"learning_rate": 0.0001971236146972764,
"loss": 0.5868,
"step": 130
},
{
"epoch": 0.08301647655259822,
"grad_norm": 0.049837883561849594,
"learning_rate": 0.0001970757111886337,
"loss": 0.4426,
"step": 131
},
{
"epoch": 0.08365019011406843,
"grad_norm": 0.04912682995200157,
"learning_rate": 0.00019702741799106508,
"loss": 0.5328,
"step": 132
},
{
"epoch": 0.08428390367553866,
"grad_norm": 0.06654444336891174,
"learning_rate": 0.00019697873529843282,
"loss": 0.6239,
"step": 133
},
{
"epoch": 0.08491761723700887,
"grad_norm": 0.1822642683982849,
"learning_rate": 0.00019692966330616283,
"loss": 0.6482,
"step": 134
},
{
"epoch": 0.08555133079847908,
"grad_norm": 0.07404999434947968,
"learning_rate": 0.00019688020221124376,
"loss": 0.5473,
"step": 135
},
{
"epoch": 0.08618504435994931,
"grad_norm": 0.08534666895866394,
"learning_rate": 0.00019683035221222618,
"loss": 0.4794,
"step": 136
},
{
"epoch": 0.08681875792141952,
"grad_norm": 0.05804799869656563,
"learning_rate": 0.00019678011350922185,
"loss": 0.5749,
"step": 137
},
{
"epoch": 0.08745247148288973,
"grad_norm": 0.0600556954741478,
"learning_rate": 0.00019672948630390294,
"loss": 0.4929,
"step": 138
},
{
"epoch": 0.08808618504435994,
"grad_norm": 0.07564158737659454,
"learning_rate": 0.00019667847079950118,
"loss": 0.5806,
"step": 139
},
{
"epoch": 0.08871989860583017,
"grad_norm": 0.06359097361564636,
"learning_rate": 0.00019662706720080693,
"loss": 0.5427,
"step": 140
},
{
"epoch": 0.08935361216730038,
"grad_norm": 0.05452190712094307,
"learning_rate": 0.00019657527571416856,
"loss": 0.4845,
"step": 141
},
{
"epoch": 0.08998732572877059,
"grad_norm": 0.05258841812610626,
"learning_rate": 0.00019652309654749156,
"loss": 0.5255,
"step": 142
},
{
"epoch": 0.09062103929024082,
"grad_norm": 0.06789179146289825,
"learning_rate": 0.0001964705299102376,
"loss": 0.6002,
"step": 143
},
{
"epoch": 0.09125475285171103,
"grad_norm": 0.05940316617488861,
"learning_rate": 0.00019641757601342378,
"loss": 0.6178,
"step": 144
},
{
"epoch": 0.09188846641318124,
"grad_norm": 0.08051005005836487,
"learning_rate": 0.00019636423506962181,
"loss": 0.4728,
"step": 145
},
{
"epoch": 0.09252217997465145,
"grad_norm": 0.06979210674762726,
"learning_rate": 0.00019631050729295707,
"loss": 0.5166,
"step": 146
},
{
"epoch": 0.09315589353612168,
"grad_norm": 0.04284743592143059,
"learning_rate": 0.00019625639289910777,
"loss": 0.3685,
"step": 147
},
{
"epoch": 0.09378960709759189,
"grad_norm": 0.05410388484597206,
"learning_rate": 0.00019620189210530425,
"loss": 0.582,
"step": 148
},
{
"epoch": 0.0944233206590621,
"grad_norm": 0.08875017613172531,
"learning_rate": 0.00019614700513032775,
"loss": 0.6757,
"step": 149
},
{
"epoch": 0.09505703422053231,
"grad_norm": 0.06792068481445312,
"learning_rate": 0.00019609173219450998,
"loss": 0.5236,
"step": 150
},
{
"epoch": 0.09569074778200254,
"grad_norm": 0.060000237077474594,
"learning_rate": 0.0001960360735197318,
"loss": 0.4813,
"step": 151
},
{
"epoch": 0.09632446134347275,
"grad_norm": 0.052172888070344925,
"learning_rate": 0.00019598002932942266,
"loss": 0.5792,
"step": 152
},
{
"epoch": 0.09695817490494296,
"grad_norm": 0.04992865398526192,
"learning_rate": 0.00019592359984855952,
"loss": 0.4652,
"step": 153
},
{
"epoch": 0.09759188846641319,
"grad_norm": 0.05908304825425148,
"learning_rate": 0.00019586678530366606,
"loss": 0.4968,
"step": 154
},
{
"epoch": 0.0982256020278834,
"grad_norm": 0.16080443561077118,
"learning_rate": 0.00019580958592281167,
"loss": 0.4804,
"step": 155
},
{
"epoch": 0.09885931558935361,
"grad_norm": 0.05863935872912407,
"learning_rate": 0.00019575200193561057,
"loss": 0.5313,
"step": 156
},
{
"epoch": 0.09949302915082382,
"grad_norm": 0.047341488301754,
"learning_rate": 0.0001956940335732209,
"loss": 0.4939,
"step": 157
},
{
"epoch": 0.10012674271229405,
"grad_norm": 0.059797484427690506,
"learning_rate": 0.00019563568106834383,
"loss": 0.4806,
"step": 158
},
{
"epoch": 0.10076045627376426,
"grad_norm": 0.08543235808610916,
"learning_rate": 0.00019557694465522255,
"loss": 0.5691,
"step": 159
},
{
"epoch": 0.10139416983523447,
"grad_norm": 0.0614972747862339,
"learning_rate": 0.00019551782456964136,
"loss": 0.5143,
"step": 160
},
{
"epoch": 0.10202788339670468,
"grad_norm": 0.12742456793785095,
"learning_rate": 0.00019545832104892475,
"loss": 0.4987,
"step": 161
},
{
"epoch": 0.10266159695817491,
"grad_norm": 0.06898955255746841,
"learning_rate": 0.00019539843433193639,
"loss": 0.5504,
"step": 162
},
{
"epoch": 0.10329531051964512,
"grad_norm": 0.11239788681268692,
"learning_rate": 0.0001953381646590783,
"loss": 0.3448,
"step": 163
},
{
"epoch": 0.10392902408111533,
"grad_norm": 0.24028901755809784,
"learning_rate": 0.00019527751227228963,
"loss": 0.5294,
"step": 164
},
{
"epoch": 0.10456273764258556,
"grad_norm": 0.0903674066066742,
"learning_rate": 0.00019521647741504604,
"loss": 0.514,
"step": 165
},
{
"epoch": 0.10519645120405577,
"grad_norm": 0.051598865538835526,
"learning_rate": 0.00019515506033235833,
"loss": 0.4771,
"step": 166
},
{
"epoch": 0.10583016476552598,
"grad_norm": 0.05018608644604683,
"learning_rate": 0.0001950932612707719,
"loss": 0.4492,
"step": 167
},
{
"epoch": 0.10646387832699619,
"grad_norm": 0.07150580734014511,
"learning_rate": 0.00019503108047836523,
"loss": 0.5806,
"step": 168
},
{
"epoch": 0.10709759188846642,
"grad_norm": 0.05979820713400841,
"learning_rate": 0.00019496851820474944,
"loss": 0.6138,
"step": 169
},
{
"epoch": 0.10773130544993663,
"grad_norm": 0.05117090418934822,
"learning_rate": 0.00019490557470106686,
"loss": 0.5138,
"step": 170
},
{
"epoch": 0.10836501901140684,
"grad_norm": 0.049405183643102646,
"learning_rate": 0.0001948422502199903,
"loss": 0.4974,
"step": 171
},
{
"epoch": 0.10899873257287707,
"grad_norm": 0.060524292290210724,
"learning_rate": 0.00019477854501572176,
"loss": 0.5448,
"step": 172
},
{
"epoch": 0.10963244613434728,
"grad_norm": 0.05022512748837471,
"learning_rate": 0.0001947144593439917,
"loss": 0.5295,
"step": 173
},
{
"epoch": 0.11026615969581749,
"grad_norm": 0.05024838447570801,
"learning_rate": 0.0001946499934620579,
"loss": 0.4842,
"step": 174
},
{
"epoch": 0.1108998732572877,
"grad_norm": 0.05859989672899246,
"learning_rate": 0.00019458514762870426,
"loss": 0.5105,
"step": 175
},
{
"epoch": 0.11153358681875793,
"grad_norm": 0.05963319167494774,
"learning_rate": 0.00019451992210424006,
"loss": 0.4833,
"step": 176
},
{
"epoch": 0.11216730038022814,
"grad_norm": 0.05941782146692276,
"learning_rate": 0.0001944543171504987,
"loss": 0.4743,
"step": 177
},
{
"epoch": 0.11280101394169835,
"grad_norm": 0.07598856091499329,
"learning_rate": 0.00019438833303083678,
"loss": 0.483,
"step": 178
},
{
"epoch": 0.11343472750316856,
"grad_norm": 0.05751622095704079,
"learning_rate": 0.0001943219700101328,
"loss": 0.563,
"step": 179
},
{
"epoch": 0.11406844106463879,
"grad_norm": 0.08273158222436905,
"learning_rate": 0.0001942552283547865,
"loss": 0.5514,
"step": 180
},
{
"epoch": 0.114702154626109,
"grad_norm": 0.04589926823973656,
"learning_rate": 0.00019418810833271745,
"loss": 0.4353,
"step": 181
},
{
"epoch": 0.11533586818757921,
"grad_norm": 0.04818568378686905,
"learning_rate": 0.00019412061021336404,
"loss": 0.4653,
"step": 182
},
{
"epoch": 0.11596958174904944,
"grad_norm": 0.062292054295539856,
"learning_rate": 0.0001940527342676826,
"loss": 0.5451,
"step": 183
},
{
"epoch": 0.11660329531051965,
"grad_norm": 0.05161510780453682,
"learning_rate": 0.000193984480768146,
"loss": 0.5174,
"step": 184
},
{
"epoch": 0.11723700887198986,
"grad_norm": 0.0669926106929779,
"learning_rate": 0.0001939158499887428,
"loss": 0.5074,
"step": 185
},
{
"epoch": 0.11787072243346007,
"grad_norm": 0.04856441915035248,
"learning_rate": 0.00019384684220497605,
"loss": 0.3898,
"step": 186
},
{
"epoch": 0.1185044359949303,
"grad_norm": 0.05841194465756416,
"learning_rate": 0.0001937774576938622,
"loss": 0.5437,
"step": 187
},
{
"epoch": 0.11913814955640051,
"grad_norm": 0.05253444239497185,
"learning_rate": 0.00019370769673393007,
"loss": 0.5669,
"step": 188
},
{
"epoch": 0.11977186311787072,
"grad_norm": 0.05771539360284805,
"learning_rate": 0.00019363755960521943,
"loss": 0.4965,
"step": 189
},
{
"epoch": 0.12040557667934093,
"grad_norm": 0.07135152071714401,
"learning_rate": 0.00019356704658928035,
"loss": 0.4089,
"step": 190
},
{
"epoch": 0.12103929024081116,
"grad_norm": 0.05927246809005737,
"learning_rate": 0.00019349615796917163,
"loss": 0.465,
"step": 191
},
{
"epoch": 0.12167300380228137,
"grad_norm": 0.06522128731012344,
"learning_rate": 0.00019342489402945998,
"loss": 0.3797,
"step": 192
},
{
"epoch": 0.12230671736375158,
"grad_norm": 0.05745214596390724,
"learning_rate": 0.0001933532550562187,
"loss": 0.56,
"step": 193
},
{
"epoch": 0.1229404309252218,
"grad_norm": 0.05626146122813225,
"learning_rate": 0.0001932812413370265,
"loss": 0.5439,
"step": 194
},
{
"epoch": 0.12357414448669202,
"grad_norm": 0.07615689933300018,
"learning_rate": 0.00019320885316096654,
"loss": 0.5187,
"step": 195
},
{
"epoch": 0.12420785804816223,
"grad_norm": 0.19566097855567932,
"learning_rate": 0.00019313609081862508,
"loss": 0.5535,
"step": 196
},
{
"epoch": 0.12484157160963244,
"grad_norm": 0.052284326404333115,
"learning_rate": 0.00019306295460209044,
"loss": 0.4056,
"step": 197
},
{
"epoch": 0.12547528517110265,
"grad_norm": 0.050081610679626465,
"learning_rate": 0.00019298944480495176,
"loss": 0.451,
"step": 198
},
{
"epoch": 0.12610899873257286,
"grad_norm": 0.07420384138822556,
"learning_rate": 0.00019291556172229785,
"loss": 0.5485,
"step": 199
},
{
"epoch": 0.1267427122940431,
"grad_norm": 0.046289846301078796,
"learning_rate": 0.00019284130565071588,
"loss": 0.4944,
"step": 200
},
{
"epoch": 0.12737642585551331,
"grad_norm": 0.041031207889318466,
"learning_rate": 0.00019276667688829043,
"loss": 0.4507,
"step": 201
},
{
"epoch": 0.12801013941698353,
"grad_norm": 0.07089229673147202,
"learning_rate": 0.0001926916757346022,
"loss": 0.513,
"step": 202
},
{
"epoch": 0.12864385297845374,
"grad_norm": 0.04405022785067558,
"learning_rate": 0.00019261630249072659,
"loss": 0.3709,
"step": 203
},
{
"epoch": 0.12927756653992395,
"grad_norm": 0.059661708772182465,
"learning_rate": 0.00019254055745923285,
"loss": 0.4813,
"step": 204
},
{
"epoch": 0.12991128010139416,
"grad_norm": 0.07400868833065033,
"learning_rate": 0.00019246444094418255,
"loss": 0.5346,
"step": 205
},
{
"epoch": 0.13054499366286437,
"grad_norm": 0.05862591415643692,
"learning_rate": 0.0001923879532511287,
"loss": 0.4856,
"step": 206
},
{
"epoch": 0.1311787072243346,
"grad_norm": 0.05793355405330658,
"learning_rate": 0.00019231109468711405,
"loss": 0.5129,
"step": 207
},
{
"epoch": 0.13181242078580482,
"grad_norm": 0.043961625546216965,
"learning_rate": 0.00019223386556067033,
"loss": 0.4803,
"step": 208
},
{
"epoch": 0.13244613434727504,
"grad_norm": 0.07102088630199432,
"learning_rate": 0.00019215626618181676,
"loss": 0.5078,
"step": 209
},
{
"epoch": 0.13307984790874525,
"grad_norm": 0.07707204669713974,
"learning_rate": 0.00019207829686205882,
"loss": 0.5465,
"step": 210
},
{
"epoch": 0.13371356147021546,
"grad_norm": 0.06010926514863968,
"learning_rate": 0.0001919999579143871,
"loss": 0.5532,
"step": 211
},
{
"epoch": 0.13434727503168567,
"grad_norm": 0.0627330020070076,
"learning_rate": 0.0001919212496532759,
"loss": 0.4055,
"step": 212
},
{
"epoch": 0.13498098859315588,
"grad_norm": 0.04347623884677887,
"learning_rate": 0.00019184217239468212,
"loss": 0.4581,
"step": 213
},
{
"epoch": 0.13561470215462612,
"grad_norm": 0.05672100558876991,
"learning_rate": 0.00019176272645604386,
"loss": 0.5335,
"step": 214
},
{
"epoch": 0.13624841571609633,
"grad_norm": 0.05062992498278618,
"learning_rate": 0.00019168291215627926,
"loss": 0.4801,
"step": 215
},
{
"epoch": 0.13688212927756654,
"grad_norm": 8.16939640045166,
"learning_rate": 0.00019160272981578512,
"loss": 0.5814,
"step": 216
},
{
"epoch": 0.13751584283903676,
"grad_norm": 0.058165278285741806,
"learning_rate": 0.00019152217975643566,
"loss": 0.5163,
"step": 217
},
{
"epoch": 0.13814955640050697,
"grad_norm": 0.06994735449552536,
"learning_rate": 0.00019144126230158127,
"loss": 0.5558,
"step": 218
},
{
"epoch": 0.13878326996197718,
"grad_norm": 0.05495104938745499,
"learning_rate": 0.0001913599777760471,
"loss": 0.5298,
"step": 219
},
{
"epoch": 0.1394169835234474,
"grad_norm": 0.060677338391542435,
"learning_rate": 0.00019127832650613189,
"loss": 0.5614,
"step": 220
},
{
"epoch": 0.14005069708491763,
"grad_norm": 0.060457441955804825,
"learning_rate": 0.00019119630881960658,
"loss": 0.5139,
"step": 221
},
{
"epoch": 0.14068441064638784,
"grad_norm": 0.0608784481883049,
"learning_rate": 0.00019111392504571296,
"loss": 0.4711,
"step": 222
},
{
"epoch": 0.14131812420785805,
"grad_norm": 0.07560902833938599,
"learning_rate": 0.00019103117551516244,
"loss": 0.486,
"step": 223
},
{
"epoch": 0.14195183776932827,
"grad_norm": 0.0847187414765358,
"learning_rate": 0.00019094806056013468,
"loss": 0.5934,
"step": 224
},
{
"epoch": 0.14258555133079848,
"grad_norm": 0.06016870215535164,
"learning_rate": 0.00019086458051427622,
"loss": 0.4529,
"step": 225
},
{
"epoch": 0.1432192648922687,
"grad_norm": 0.17245864868164062,
"learning_rate": 0.00019078073571269922,
"loss": 0.5307,
"step": 226
},
{
"epoch": 0.1438529784537389,
"grad_norm": 0.0647033080458641,
"learning_rate": 0.00019069652649198005,
"loss": 0.569,
"step": 227
},
{
"epoch": 0.1444866920152091,
"grad_norm": 0.07447489351034164,
"learning_rate": 0.00019061195319015797,
"loss": 0.547,
"step": 228
},
{
"epoch": 0.14512040557667935,
"grad_norm": 0.05335066467523575,
"learning_rate": 0.00019052701614673373,
"loss": 0.5363,
"step": 229
},
{
"epoch": 0.14575411913814956,
"grad_norm": 0.04057115688920021,
"learning_rate": 0.0001904417157026683,
"loss": 0.4354,
"step": 230
},
{
"epoch": 0.14638783269961977,
"grad_norm": 0.05564083158969879,
"learning_rate": 0.00019035605220038137,
"loss": 0.5674,
"step": 231
},
{
"epoch": 0.14702154626108999,
"grad_norm": 0.1210884302854538,
"learning_rate": 0.00019027002598375012,
"loss": 0.5645,
"step": 232
},
{
"epoch": 0.1476552598225602,
"grad_norm": 0.05494518578052521,
"learning_rate": 0.00019018363739810767,
"loss": 0.6239,
"step": 233
},
{
"epoch": 0.1482889733840304,
"grad_norm": 0.04633218050003052,
"learning_rate": 0.0001900968867902419,
"loss": 0.4787,
"step": 234
},
{
"epoch": 0.14892268694550062,
"grad_norm": 0.06846950203180313,
"learning_rate": 0.00019000977450839393,
"loss": 0.5607,
"step": 235
},
{
"epoch": 0.14955640050697086,
"grad_norm": 0.0618814192712307,
"learning_rate": 0.0001899223009022566,
"loss": 0.631,
"step": 236
},
{
"epoch": 0.15019011406844107,
"grad_norm": 0.06061235070228577,
"learning_rate": 0.00018983446632297343,
"loss": 0.5989,
"step": 237
},
{
"epoch": 0.15082382762991128,
"grad_norm": 0.06494279205799103,
"learning_rate": 0.00018974627112313677,
"loss": 0.5816,
"step": 238
},
{
"epoch": 0.1514575411913815,
"grad_norm": 0.04907020181417465,
"learning_rate": 0.0001896577156567868,
"loss": 0.5097,
"step": 239
},
{
"epoch": 0.1520912547528517,
"grad_norm": 0.04682941362261772,
"learning_rate": 0.00018956880027940967,
"loss": 0.5828,
"step": 240
},
{
"epoch": 0.15272496831432192,
"grad_norm": 0.05498978868126869,
"learning_rate": 0.00018947952534793661,
"loss": 0.5257,
"step": 241
},
{
"epoch": 0.15335868187579213,
"grad_norm": 0.04309950768947601,
"learning_rate": 0.00018938989122074197,
"loss": 0.3662,
"step": 242
},
{
"epoch": 0.15399239543726237,
"grad_norm": 0.06519515067338943,
"learning_rate": 0.00018929989825764207,
"loss": 0.4058,
"step": 243
},
{
"epoch": 0.15462610899873258,
"grad_norm": 0.046929214149713516,
"learning_rate": 0.00018920954681989378,
"loss": 0.4916,
"step": 244
},
{
"epoch": 0.1552598225602028,
"grad_norm": 0.05388319492340088,
"learning_rate": 0.00018911883727019285,
"loss": 0.4143,
"step": 245
},
{
"epoch": 0.155893536121673,
"grad_norm": 0.05619863048195839,
"learning_rate": 0.00018902776997267268,
"loss": 0.5107,
"step": 246
},
{
"epoch": 0.15652724968314322,
"grad_norm": 0.053882747888565063,
"learning_rate": 0.00018893634529290279,
"loss": 0.5559,
"step": 247
},
{
"epoch": 0.15716096324461343,
"grad_norm": 0.05231885239481926,
"learning_rate": 0.00018884456359788724,
"loss": 0.5076,
"step": 248
},
{
"epoch": 0.15779467680608364,
"grad_norm": 0.07149146497249603,
"learning_rate": 0.00018875242525606334,
"loss": 0.558,
"step": 249
},
{
"epoch": 0.15842839036755388,
"grad_norm": 0.04615316912531853,
"learning_rate": 0.00018865993063730004,
"loss": 0.4971,
"step": 250
},
{
"epoch": 0.1590621039290241,
"grad_norm": 0.05331886187195778,
"learning_rate": 0.00018856708011289643,
"loss": 0.5506,
"step": 251
},
{
"epoch": 0.1596958174904943,
"grad_norm": 0.05348580330610275,
"learning_rate": 0.00018847387405558045,
"loss": 0.4515,
"step": 252
},
{
"epoch": 0.1603295310519645,
"grad_norm": 0.0438147634267807,
"learning_rate": 0.00018838031283950705,
"loss": 0.3818,
"step": 253
},
{
"epoch": 0.16096324461343473,
"grad_norm": 0.0473354198038578,
"learning_rate": 0.0001882863968402571,
"loss": 0.4458,
"step": 254
},
{
"epoch": 0.16159695817490494,
"grad_norm": 0.05930502712726593,
"learning_rate": 0.0001881921264348355,
"loss": 0.6228,
"step": 255
},
{
"epoch": 0.16223067173637515,
"grad_norm": 0.04982107877731323,
"learning_rate": 0.00018809750200166994,
"loss": 0.5916,
"step": 256
},
{
"epoch": 0.1628643852978454,
"grad_norm": 0.09739918261766434,
"learning_rate": 0.0001880025239206092,
"loss": 0.651,
"step": 257
},
{
"epoch": 0.1634980988593156,
"grad_norm": 0.09072676301002502,
"learning_rate": 0.00018790719257292174,
"loss": 0.5564,
"step": 258
},
{
"epoch": 0.1641318124207858,
"grad_norm": 0.0638791099190712,
"learning_rate": 0.00018781150834129413,
"loss": 0.4545,
"step": 259
},
{
"epoch": 0.16476552598225602,
"grad_norm": 0.05755198001861572,
"learning_rate": 0.0001877154716098295,
"loss": 0.4457,
"step": 260
},
{
"epoch": 0.16539923954372623,
"grad_norm": 0.2049247920513153,
"learning_rate": 0.00018761908276404603,
"loss": 0.5447,
"step": 261
},
{
"epoch": 0.16603295310519645,
"grad_norm": 0.06760350614786148,
"learning_rate": 0.00018752234219087538,
"loss": 0.4743,
"step": 262
},
{
"epoch": 0.16666666666666666,
"grad_norm": 0.061410121619701385,
"learning_rate": 0.00018742525027866115,
"loss": 0.547,
"step": 263
},
{
"epoch": 0.16730038022813687,
"grad_norm": 0.04981521889567375,
"learning_rate": 0.00018732780741715724,
"loss": 0.4924,
"step": 264
},
{
"epoch": 0.1679340937896071,
"grad_norm": 0.06636273115873337,
"learning_rate": 0.00018723001399752653,
"loss": 0.591,
"step": 265
},
{
"epoch": 0.16856780735107732,
"grad_norm": 0.0517747662961483,
"learning_rate": 0.00018713187041233896,
"loss": 0.5294,
"step": 266
},
{
"epoch": 0.16920152091254753,
"grad_norm": 0.11798780411481857,
"learning_rate": 0.00018703337705557017,
"loss": 0.4953,
"step": 267
},
{
"epoch": 0.16983523447401774,
"grad_norm": 0.1441587656736374,
"learning_rate": 0.00018693453432259998,
"loss": 0.4898,
"step": 268
},
{
"epoch": 0.17046894803548795,
"grad_norm": 0.06387986242771149,
"learning_rate": 0.00018683534261021057,
"loss": 0.4663,
"step": 269
},
{
"epoch": 0.17110266159695817,
"grad_norm": 0.05943833664059639,
"learning_rate": 0.0001867358023165851,
"loss": 0.5607,
"step": 270
},
{
"epoch": 0.17173637515842838,
"grad_norm": 0.05011943355202675,
"learning_rate": 0.00018663591384130606,
"loss": 0.5297,
"step": 271
},
{
"epoch": 0.17237008871989862,
"grad_norm": 0.059131983667612076,
"learning_rate": 0.00018653567758535354,
"loss": 0.4896,
"step": 272
},
{
"epoch": 0.17300380228136883,
"grad_norm": 0.06053609773516655,
"learning_rate": 0.0001864350939511038,
"loss": 0.5446,
"step": 273
},
{
"epoch": 0.17363751584283904,
"grad_norm": 0.05496980994939804,
"learning_rate": 0.00018633416334232753,
"loss": 0.5427,
"step": 274
},
{
"epoch": 0.17427122940430925,
"grad_norm": 0.05304751545190811,
"learning_rate": 0.0001862328861641883,
"loss": 0.4189,
"step": 275
},
{
"epoch": 0.17490494296577946,
"grad_norm": 0.04881710559129715,
"learning_rate": 0.00018613126282324092,
"loss": 0.4555,
"step": 276
},
{
"epoch": 0.17553865652724968,
"grad_norm": 0.051984284073114395,
"learning_rate": 0.0001860292937274297,
"loss": 0.5282,
"step": 277
},
{
"epoch": 0.1761723700887199,
"grad_norm": 0.05241424962878227,
"learning_rate": 0.00018592697928608703,
"loss": 0.4924,
"step": 278
},
{
"epoch": 0.17680608365019013,
"grad_norm": 0.04947778955101967,
"learning_rate": 0.00018582431990993151,
"loss": 0.4867,
"step": 279
},
{
"epoch": 0.17743979721166034,
"grad_norm": 0.04952229931950569,
"learning_rate": 0.00018572131601106654,
"loss": 0.4362,
"step": 280
},
{
"epoch": 0.17807351077313055,
"grad_norm": 0.061900023370981216,
"learning_rate": 0.00018561796800297832,
"loss": 0.6342,
"step": 281
},
{
"epoch": 0.17870722433460076,
"grad_norm": 0.04405650496482849,
"learning_rate": 0.00018551427630053463,
"loss": 0.4612,
"step": 282
},
{
"epoch": 0.17934093789607097,
"grad_norm": 0.5723605155944824,
"learning_rate": 0.00018541024131998274,
"loss": 0.4917,
"step": 283
},
{
"epoch": 0.17997465145754118,
"grad_norm": 0.07066962867975235,
"learning_rate": 0.0001853058634789481,
"loss": 0.5386,
"step": 284
},
{
"epoch": 0.1806083650190114,
"grad_norm": 0.041575830429792404,
"learning_rate": 0.00018520114319643235,
"loss": 0.4894,
"step": 285
},
{
"epoch": 0.18124207858048164,
"grad_norm": 0.07731833308935165,
"learning_rate": 0.0001850960808928119,
"loss": 0.5382,
"step": 286
},
{
"epoch": 0.18187579214195185,
"grad_norm": 0.05468999221920967,
"learning_rate": 0.00018499067698983605,
"loss": 0.4514,
"step": 287
},
{
"epoch": 0.18250950570342206,
"grad_norm": 0.04942842200398445,
"learning_rate": 0.00018488493191062542,
"loss": 0.4329,
"step": 288
},
{
"epoch": 0.18314321926489227,
"grad_norm": 0.053615666925907135,
"learning_rate": 0.0001847788460796702,
"loss": 0.5182,
"step": 289
},
{
"epoch": 0.18377693282636248,
"grad_norm": 0.04232574254274368,
"learning_rate": 0.00018467241992282843,
"loss": 0.3108,
"step": 290
},
{
"epoch": 0.1844106463878327,
"grad_norm": 0.04795556515455246,
"learning_rate": 0.00018456565386732433,
"loss": 0.383,
"step": 291
},
{
"epoch": 0.1850443599493029,
"grad_norm": 0.053252723067998886,
"learning_rate": 0.00018445854834174655,
"loss": 0.4597,
"step": 292
},
{
"epoch": 0.18567807351077312,
"grad_norm": 0.044747479259967804,
"learning_rate": 0.00018435110377604654,
"loss": 0.5066,
"step": 293
},
{
"epoch": 0.18631178707224336,
"grad_norm": 0.0473531037569046,
"learning_rate": 0.00018424332060153664,
"loss": 0.4258,
"step": 294
},
{
"epoch": 0.18694550063371357,
"grad_norm": 0.05739828571677208,
"learning_rate": 0.0001841351992508885,
"loss": 0.4498,
"step": 295
},
{
"epoch": 0.18757921419518378,
"grad_norm": 0.0635855570435524,
"learning_rate": 0.0001840267401581314,
"loss": 0.5368,
"step": 296
},
{
"epoch": 0.188212927756654,
"grad_norm": 0.05470935255289078,
"learning_rate": 0.00018391794375865024,
"loss": 0.5367,
"step": 297
},
{
"epoch": 0.1888466413181242,
"grad_norm": 0.04850434139370918,
"learning_rate": 0.00018380881048918405,
"loss": 0.5369,
"step": 298
},
{
"epoch": 0.18948035487959441,
"grad_norm": 0.1420743763446808,
"learning_rate": 0.00018369934078782426,
"loss": 0.5101,
"step": 299
},
{
"epoch": 0.19011406844106463,
"grad_norm": 0.0749795064330101,
"learning_rate": 0.00018358953509401262,
"loss": 0.5756,
"step": 300
},
{
"epoch": 0.19074778200253487,
"grad_norm": 0.05331069603562355,
"learning_rate": 0.00018347939384853978,
"loss": 0.5759,
"step": 301
},
{
"epoch": 0.19138149556400508,
"grad_norm": 0.05981903895735741,
"learning_rate": 0.00018336891749354335,
"loss": 0.6036,
"step": 302
},
{
"epoch": 0.1920152091254753,
"grad_norm": 0.08048289269208908,
"learning_rate": 0.00018325810647250616,
"loss": 0.4424,
"step": 303
},
{
"epoch": 0.1926489226869455,
"grad_norm": 0.07861804962158203,
"learning_rate": 0.00018314696123025454,
"loss": 0.5725,
"step": 304
},
{
"epoch": 0.1932826362484157,
"grad_norm": 0.14672251045703888,
"learning_rate": 0.0001830354822129564,
"loss": 0.5068,
"step": 305
},
{
"epoch": 0.19391634980988592,
"grad_norm": 0.06640765070915222,
"learning_rate": 0.0001829236698681195,
"loss": 0.585,
"step": 306
},
{
"epoch": 0.19455006337135614,
"grad_norm": 0.0588274821639061,
"learning_rate": 0.0001828115246445898,
"loss": 0.5779,
"step": 307
},
{
"epoch": 0.19518377693282637,
"grad_norm": 0.05600736290216446,
"learning_rate": 0.0001826990469925494,
"loss": 0.5216,
"step": 308
},
{
"epoch": 0.1958174904942966,
"grad_norm": 0.052844930440187454,
"learning_rate": 0.0001825862373635149,
"loss": 0.5482,
"step": 309
},
{
"epoch": 0.1964512040557668,
"grad_norm": 0.04969317838549614,
"learning_rate": 0.0001824730962103356,
"loss": 0.5928,
"step": 310
},
{
"epoch": 0.197084917617237,
"grad_norm": 0.06168043613433838,
"learning_rate": 0.00018235962398719147,
"loss": 0.5185,
"step": 311
},
{
"epoch": 0.19771863117870722,
"grad_norm": 0.051151130348443985,
"learning_rate": 0.00018224582114959172,
"loss": 0.4677,
"step": 312
},
{
"epoch": 0.19835234474017743,
"grad_norm": 0.060467127710580826,
"learning_rate": 0.00018213168815437255,
"loss": 0.5566,
"step": 313
},
{
"epoch": 0.19898605830164764,
"grad_norm": 0.043170325458049774,
"learning_rate": 0.0001820172254596956,
"loss": 0.489,
"step": 314
},
{
"epoch": 0.19961977186311788,
"grad_norm": 0.06550537794828415,
"learning_rate": 0.00018190243352504597,
"loss": 0.5809,
"step": 315
},
{
"epoch": 0.2002534854245881,
"grad_norm": 0.04956373944878578,
"learning_rate": 0.00018178731281123044,
"loss": 0.462,
"step": 316
},
{
"epoch": 0.2008871989860583,
"grad_norm": 0.05908495932817459,
"learning_rate": 0.00018167186378037563,
"loss": 0.4611,
"step": 317
},
{
"epoch": 0.20152091254752852,
"grad_norm": 0.047168437391519547,
"learning_rate": 0.00018155608689592604,
"loss": 0.5283,
"step": 318
},
{
"epoch": 0.20215462610899873,
"grad_norm": 0.04968830570578575,
"learning_rate": 0.00018143998262264233,
"loss": 0.4982,
"step": 319
},
{
"epoch": 0.20278833967046894,
"grad_norm": 0.06764087826013565,
"learning_rate": 0.00018132355142659937,
"loss": 0.5244,
"step": 320
},
{
"epoch": 0.20342205323193915,
"grad_norm": 0.06344570964574814,
"learning_rate": 0.0001812067937751844,
"loss": 0.606,
"step": 321
},
{
"epoch": 0.20405576679340937,
"grad_norm": 0.06029113009572029,
"learning_rate": 0.0001810897101370951,
"loss": 0.5407,
"step": 322
},
{
"epoch": 0.2046894803548796,
"grad_norm": 0.08346560597419739,
"learning_rate": 0.00018097230098233785,
"loss": 0.4814,
"step": 323
},
{
"epoch": 0.20532319391634982,
"grad_norm": 0.04595065116882324,
"learning_rate": 0.00018085456678222558,
"loss": 0.471,
"step": 324
},
{
"epoch": 0.20595690747782003,
"grad_norm": 0.4050588309764862,
"learning_rate": 0.00018073650800937624,
"loss": 0.4586,
"step": 325
},
{
"epoch": 0.20659062103929024,
"grad_norm": 0.055679477751255035,
"learning_rate": 0.00018061812513771053,
"loss": 0.516,
"step": 326
},
{
"epoch": 0.20722433460076045,
"grad_norm": 0.05209626257419586,
"learning_rate": 0.00018049941864245033,
"loss": 0.4528,
"step": 327
},
{
"epoch": 0.20785804816223066,
"grad_norm": 0.05503727123141289,
"learning_rate": 0.00018038038900011652,
"loss": 0.4297,
"step": 328
},
{
"epoch": 0.20849176172370087,
"grad_norm": 0.05453247204422951,
"learning_rate": 0.0001802610366885271,
"loss": 0.4731,
"step": 329
},
{
"epoch": 0.20912547528517111,
"grad_norm": 0.05371938645839691,
"learning_rate": 0.00018014136218679567,
"loss": 0.569,
"step": 330
},
{
"epoch": 0.20975918884664133,
"grad_norm": 0.05164814740419388,
"learning_rate": 0.0001800213659753289,
"loss": 0.4883,
"step": 331
},
{
"epoch": 0.21039290240811154,
"grad_norm": 0.06455442309379578,
"learning_rate": 0.00017990104853582493,
"loss": 0.4829,
"step": 332
},
{
"epoch": 0.21102661596958175,
"grad_norm": 0.04764432832598686,
"learning_rate": 0.0001797804103512715,
"loss": 0.5525,
"step": 333
},
{
"epoch": 0.21166032953105196,
"grad_norm": 0.0578368604183197,
"learning_rate": 0.00017965945190594388,
"loss": 0.4824,
"step": 334
},
{
"epoch": 0.21229404309252217,
"grad_norm": 0.05196613445878029,
"learning_rate": 0.00017953817368540292,
"loss": 0.5036,
"step": 335
},
{
"epoch": 0.21292775665399238,
"grad_norm": 0.044868264347314835,
"learning_rate": 0.00017941657617649316,
"loss": 0.36,
"step": 336
},
{
"epoch": 0.21356147021546262,
"grad_norm": 0.0686643123626709,
"learning_rate": 0.00017929465986734084,
"loss": 0.6069,
"step": 337
},
{
"epoch": 0.21419518377693283,
"grad_norm": 0.08286602050065994,
"learning_rate": 0.000179172425247352,
"loss": 0.5635,
"step": 338
},
{
"epoch": 0.21482889733840305,
"grad_norm": 0.5979371070861816,
"learning_rate": 0.00017904987280721035,
"loss": 0.3994,
"step": 339
},
{
"epoch": 0.21546261089987326,
"grad_norm": 0.05577315390110016,
"learning_rate": 0.00017892700303887558,
"loss": 0.5699,
"step": 340
},
{
"epoch": 0.21609632446134347,
"grad_norm": 0.06650438159704208,
"learning_rate": 0.0001788038164355811,
"loss": 0.5557,
"step": 341
},
{
"epoch": 0.21673003802281368,
"grad_norm": 0.06644187867641449,
"learning_rate": 0.00017868031349183217,
"loss": 0.5593,
"step": 342
},
{
"epoch": 0.2173637515842839,
"grad_norm": 0.05286836251616478,
"learning_rate": 0.00017855649470340413,
"loss": 0.4902,
"step": 343
},
{
"epoch": 0.21799746514575413,
"grad_norm": 0.05314694344997406,
"learning_rate": 0.00017843236056733992,
"loss": 0.5036,
"step": 344
},
{
"epoch": 0.21863117870722434,
"grad_norm": 0.0668027251958847,
"learning_rate": 0.0001783079115819486,
"loss": 0.6198,
"step": 345
},
{
"epoch": 0.21926489226869456,
"grad_norm": 0.04909252002835274,
"learning_rate": 0.000178183148246803,
"loss": 0.4273,
"step": 346
},
{
"epoch": 0.21989860583016477,
"grad_norm": 0.053546786308288574,
"learning_rate": 0.00017805807106273787,
"loss": 0.5077,
"step": 347
},
{
"epoch": 0.22053231939163498,
"grad_norm": 0.0647466629743576,
"learning_rate": 0.00017793268053184786,
"loss": 0.5262,
"step": 348
},
{
"epoch": 0.2211660329531052,
"grad_norm": 0.05518212169408798,
"learning_rate": 0.00017780697715748546,
"loss": 0.5621,
"step": 349
},
{
"epoch": 0.2217997465145754,
"grad_norm": 0.0661974772810936,
"learning_rate": 0.00017768096144425902,
"loss": 0.5727,
"step": 350
},
{
"epoch": 0.2224334600760456,
"grad_norm": 0.09333747625350952,
"learning_rate": 0.00017755463389803065,
"loss": 0.4891,
"step": 351
},
{
"epoch": 0.22306717363751585,
"grad_norm": 0.04791216179728508,
"learning_rate": 0.0001774279950259143,
"loss": 0.5569,
"step": 352
},
{
"epoch": 0.22370088719898606,
"grad_norm": 0.05712969973683357,
"learning_rate": 0.0001773010453362737,
"loss": 0.5433,
"step": 353
},
{
"epoch": 0.22433460076045628,
"grad_norm": 0.05735623091459274,
"learning_rate": 0.00017717378533872017,
"loss": 0.5702,
"step": 354
},
{
"epoch": 0.2249683143219265,
"grad_norm": 0.05040268227458,
"learning_rate": 0.00017704621554411084,
"loss": 0.4964,
"step": 355
},
{
"epoch": 0.2256020278833967,
"grad_norm": 0.04687810316681862,
"learning_rate": 0.00017691833646454628,
"loss": 0.5242,
"step": 356
},
{
"epoch": 0.2262357414448669,
"grad_norm": 0.051406193524599075,
"learning_rate": 0.00017679014861336878,
"loss": 0.5146,
"step": 357
},
{
"epoch": 0.22686945500633712,
"grad_norm": 0.04884679988026619,
"learning_rate": 0.00017666165250516006,
"loss": 0.4825,
"step": 358
},
{
"epoch": 0.22750316856780736,
"grad_norm": 0.053725842386484146,
"learning_rate": 0.0001765328486557392,
"loss": 0.4932,
"step": 359
},
{
"epoch": 0.22813688212927757,
"grad_norm": 0.06212908402085304,
"learning_rate": 0.00017640373758216077,
"loss": 0.506,
"step": 360
},
{
"epoch": 0.22877059569074779,
"grad_norm": 0.05059286579489708,
"learning_rate": 0.0001762743198027125,
"loss": 0.4719,
"step": 361
},
{
"epoch": 0.229404309252218,
"grad_norm": 0.04520050436258316,
"learning_rate": 0.00017614459583691346,
"loss": 0.4553,
"step": 362
},
{
"epoch": 0.2300380228136882,
"grad_norm": 0.05503036454319954,
"learning_rate": 0.0001760145662055117,
"loss": 0.4706,
"step": 363
},
{
"epoch": 0.23067173637515842,
"grad_norm": 0.046107854694128036,
"learning_rate": 0.00017588423143048235,
"loss": 0.4177,
"step": 364
},
{
"epoch": 0.23130544993662863,
"grad_norm": 0.12301266193389893,
"learning_rate": 0.0001757535920350255,
"loss": 0.5922,
"step": 365
},
{
"epoch": 0.23193916349809887,
"grad_norm": 1.179470419883728,
"learning_rate": 0.00017562264854356405,
"loss": 0.5123,
"step": 366
},
{
"epoch": 0.23257287705956908,
"grad_norm": 0.11167129874229431,
"learning_rate": 0.0001754914014817416,
"loss": 0.3884,
"step": 367
},
{
"epoch": 0.2332065906210393,
"grad_norm": 0.055067550390958786,
"learning_rate": 0.00017535985137642044,
"loss": 0.4544,
"step": 368
},
{
"epoch": 0.2338403041825095,
"grad_norm": 0.07947530597448349,
"learning_rate": 0.0001752279987556792,
"loss": 0.6575,
"step": 369
},
{
"epoch": 0.23447401774397972,
"grad_norm": 0.10236025601625443,
"learning_rate": 0.00017509584414881113,
"loss": 0.5334,
"step": 370
},
{
"epoch": 0.23510773130544993,
"grad_norm": 0.12996040284633636,
"learning_rate": 0.00017496338808632155,
"loss": 0.3897,
"step": 371
},
{
"epoch": 0.23574144486692014,
"grad_norm": 0.07005209475755692,
"learning_rate": 0.00017483063109992596,
"loss": 0.5077,
"step": 372
},
{
"epoch": 0.23637515842839038,
"grad_norm": 0.04446430131793022,
"learning_rate": 0.00017469757372254785,
"loss": 0.4467,
"step": 373
},
{
"epoch": 0.2370088719898606,
"grad_norm": 6.105027198791504,
"learning_rate": 0.00017456421648831655,
"loss": 1.722,
"step": 374
},
{
"epoch": 0.2376425855513308,
"grad_norm": 0.07488813251256943,
"learning_rate": 0.0001744305599325652,
"loss": 0.7018,
"step": 375
},
{
"epoch": 0.23827629911280102,
"grad_norm": 0.05676595866680145,
"learning_rate": 0.00017429660459182834,
"loss": 0.4865,
"step": 376
},
{
"epoch": 0.23891001267427123,
"grad_norm": 0.058106616139411926,
"learning_rate": 0.00017416235100384007,
"loss": 0.4453,
"step": 377
},
{
"epoch": 0.23954372623574144,
"grad_norm": 0.4252207577228546,
"learning_rate": 0.00017402779970753155,
"loss": 3.008,
"step": 378
},
{
"epoch": 0.24017743979721165,
"grad_norm": 0.24036817252635956,
"learning_rate": 0.00017389295124302923,
"loss": 0.7246,
"step": 379
},
{
"epoch": 0.24081115335868186,
"grad_norm": 4.316144943237305,
"learning_rate": 0.00017375780615165235,
"loss": 0.664,
"step": 380
},
{
"epoch": 0.2414448669201521,
"grad_norm": 6.4877166748046875,
"learning_rate": 0.00017362236497591094,
"loss": 0.487,
"step": 381
},
{
"epoch": 0.2420785804816223,
"grad_norm": 0.12358918786048889,
"learning_rate": 0.00017348662825950357,
"loss": 0.4839,
"step": 382
},
{
"epoch": 0.24271229404309252,
"grad_norm": 0.7211472988128662,
"learning_rate": 0.0001733505965473152,
"loss": 0.6351,
"step": 383
},
{
"epoch": 0.24334600760456274,
"grad_norm": 0.10177785158157349,
"learning_rate": 0.00017321427038541494,
"loss": 0.6043,
"step": 384
},
{
"epoch": 0.24397972116603295,
"grad_norm": 0.054658226668834686,
"learning_rate": 0.00017307765032105406,
"loss": 0.473,
"step": 385
},
{
"epoch": 0.24461343472750316,
"grad_norm": 0.10075858235359192,
"learning_rate": 0.00017294073690266344,
"loss": 0.4892,
"step": 386
},
{
"epoch": 0.24524714828897337,
"grad_norm": 0.06497970223426819,
"learning_rate": 0.00017280353067985167,
"loss": 0.4986,
"step": 387
},
{
"epoch": 0.2458808618504436,
"grad_norm": 0.7542481422424316,
"learning_rate": 0.0001726660322034027,
"loss": 0.5513,
"step": 388
},
{
"epoch": 0.24651457541191382,
"grad_norm": 0.08190987259149551,
"learning_rate": 0.00017252824202527376,
"loss": 0.5077,
"step": 389
},
{
"epoch": 0.24714828897338403,
"grad_norm": 0.08874624967575073,
"learning_rate": 0.0001723901606985929,
"loss": 0.3973,
"step": 390
},
{
"epoch": 0.24778200253485425,
"grad_norm": 0.32968223094940186,
"learning_rate": 0.00017225178877765704,
"loss": 0.4411,
"step": 391
},
{
"epoch": 0.24841571609632446,
"grad_norm": 0.39434677362442017,
"learning_rate": 0.00017211312681792958,
"loss": 0.5201,
"step": 392
},
{
"epoch": 0.24904942965779467,
"grad_norm": 0.11154969036579132,
"learning_rate": 0.00017197417537603827,
"loss": 0.6205,
"step": 393
},
{
"epoch": 0.24968314321926488,
"grad_norm": 0.07316391915082932,
"learning_rate": 0.00017183493500977278,
"loss": 0.5129,
"step": 394
},
{
"epoch": 0.2503168567807351,
"grad_norm": 0.08883780986070633,
"learning_rate": 0.00017169540627808274,
"loss": 0.5036,
"step": 395
},
{
"epoch": 0.2509505703422053,
"grad_norm": 0.07377318292856216,
"learning_rate": 0.00017155558974107536,
"loss": 0.591,
"step": 396
},
{
"epoch": 0.25158428390367554,
"grad_norm": 0.064984992146492,
"learning_rate": 0.00017141548596001305,
"loss": 0.645,
"step": 397
},
{
"epoch": 0.2522179974651457,
"grad_norm": 0.07279626280069351,
"learning_rate": 0.00017127509549731148,
"loss": 0.5108,
"step": 398
},
{
"epoch": 0.25285171102661597,
"grad_norm": 0.06948740035295486,
"learning_rate": 0.000171134418916537,
"loss": 0.4959,
"step": 399
},
{
"epoch": 0.2534854245880862,
"grad_norm": 1.0025055408477783,
"learning_rate": 0.00017099345678240452,
"loss": 0.5248,
"step": 400
},
{
"epoch": 0.2541191381495564,
"grad_norm": 0.34188470244407654,
"learning_rate": 0.00017085220966077538,
"loss": 0.5588,
"step": 401
},
{
"epoch": 0.25475285171102663,
"grad_norm": 0.04984923452138901,
"learning_rate": 0.00017071067811865476,
"loss": 0.4033,
"step": 402
},
{
"epoch": 0.2553865652724968,
"grad_norm": 0.05613204464316368,
"learning_rate": 0.0001705688627241897,
"loss": 0.5774,
"step": 403
},
{
"epoch": 0.25602027883396705,
"grad_norm": 0.058507829904556274,
"learning_rate": 0.0001704267640466667,
"loss": 0.52,
"step": 404
},
{
"epoch": 0.25665399239543724,
"grad_norm": 0.23744581639766693,
"learning_rate": 0.00017028438265650933,
"loss": 0.6028,
"step": 405
},
{
"epoch": 0.2572877059569075,
"grad_norm": 0.11817914992570877,
"learning_rate": 0.00017014171912527616,
"loss": 0.5416,
"step": 406
},
{
"epoch": 0.2579214195183777,
"grad_norm": 0.29011303186416626,
"learning_rate": 0.00016999877402565833,
"loss": 0.4381,
"step": 407
},
{
"epoch": 0.2585551330798479,
"grad_norm": 0.06895189732313156,
"learning_rate": 0.00016985554793147727,
"loss": 0.5046,
"step": 408
},
{
"epoch": 0.25918884664131814,
"grad_norm": 0.059166181832551956,
"learning_rate": 0.00016971204141768233,
"loss": 0.582,
"step": 409
},
{
"epoch": 0.2598225602027883,
"grad_norm": 0.09994165599346161,
"learning_rate": 0.00016956825506034867,
"loss": 0.6042,
"step": 410
},
{
"epoch": 0.26045627376425856,
"grad_norm": 0.09195294976234436,
"learning_rate": 0.00016942418943667468,
"loss": 0.577,
"step": 411
},
{
"epoch": 0.26108998732572875,
"grad_norm": 0.08966407924890518,
"learning_rate": 0.00016927984512497992,
"loss": 0.5795,
"step": 412
},
{
"epoch": 0.261723700887199,
"grad_norm": 0.08420640975236893,
"learning_rate": 0.00016913522270470263,
"loss": 0.4446,
"step": 413
},
{
"epoch": 0.2623574144486692,
"grad_norm": 0.05902143940329552,
"learning_rate": 0.0001689903227563975,
"loss": 0.4458,
"step": 414
},
{
"epoch": 0.2629911280101394,
"grad_norm": 0.046236153692007065,
"learning_rate": 0.0001688451458617332,
"loss": 0.3762,
"step": 415
},
{
"epoch": 0.26362484157160965,
"grad_norm": 0.10383841395378113,
"learning_rate": 0.00016869969260349018,
"loss": 0.6076,
"step": 416
},
{
"epoch": 0.26425855513307983,
"grad_norm": 0.059753723442554474,
"learning_rate": 0.00016855396356555834,
"loss": 0.4116,
"step": 417
},
{
"epoch": 0.26489226869455007,
"grad_norm": 0.05825261399149895,
"learning_rate": 0.00016840795933293463,
"loss": 0.5377,
"step": 418
},
{
"epoch": 0.26552598225602025,
"grad_norm": 0.07149126380681992,
"learning_rate": 0.00016826168049172062,
"loss": 0.5946,
"step": 419
},
{
"epoch": 0.2661596958174905,
"grad_norm": 0.0636037141084671,
"learning_rate": 0.00016811512762912034,
"loss": 0.4232,
"step": 420
},
{
"epoch": 0.26679340937896073,
"grad_norm": 0.06662221997976303,
"learning_rate": 0.00016796830133343775,
"loss": 0.5406,
"step": 421
},
{
"epoch": 0.2674271229404309,
"grad_norm": 0.058340173214673996,
"learning_rate": 0.00016782120219407452,
"loss": 0.5402,
"step": 422
},
{
"epoch": 0.26806083650190116,
"grad_norm": 0.054275717586278915,
"learning_rate": 0.00016767383080152742,
"loss": 0.5215,
"step": 423
},
{
"epoch": 0.26869455006337134,
"grad_norm": 0.055525969713926315,
"learning_rate": 0.00016752618774738639,
"loss": 0.5743,
"step": 424
},
{
"epoch": 0.2693282636248416,
"grad_norm": 0.05762525647878647,
"learning_rate": 0.00016737827362433164,
"loss": 0.5806,
"step": 425
},
{
"epoch": 0.26996197718631176,
"grad_norm": 0.059116896241903305,
"learning_rate": 0.0001672300890261317,
"loss": 0.4828,
"step": 426
},
{
"epoch": 0.270595690747782,
"grad_norm": 0.046420734375715256,
"learning_rate": 0.00016708163454764075,
"loss": 0.4509,
"step": 427
},
{
"epoch": 0.27122940430925224,
"grad_norm": 0.11202160269021988,
"learning_rate": 0.00016693291078479638,
"loss": 0.5139,
"step": 428
},
{
"epoch": 0.2718631178707224,
"grad_norm": 0.08383259177207947,
"learning_rate": 0.00016678391833461722,
"loss": 0.7026,
"step": 429
},
{
"epoch": 0.27249683143219267,
"grad_norm": 0.058648403733968735,
"learning_rate": 0.0001666346577952004,
"loss": 0.4704,
"step": 430
},
{
"epoch": 0.27313054499366285,
"grad_norm": 0.08609268069267273,
"learning_rate": 0.0001664851297657193,
"loss": 0.5186,
"step": 431
},
{
"epoch": 0.2737642585551331,
"grad_norm": 0.10570003092288971,
"learning_rate": 0.00016633533484642103,
"loss": 0.4615,
"step": 432
},
{
"epoch": 0.2743979721166033,
"grad_norm": 0.09764793515205383,
"learning_rate": 0.00016618527363862408,
"loss": 0.4519,
"step": 433
},
{
"epoch": 0.2750316856780735,
"grad_norm": 0.08797989040613174,
"learning_rate": 0.00016603494674471593,
"loss": 0.6139,
"step": 434
},
{
"epoch": 0.27566539923954375,
"grad_norm": 0.0714520812034607,
"learning_rate": 0.0001658843547681506,
"loss": 0.5027,
"step": 435
},
{
"epoch": 0.27629911280101394,
"grad_norm": 0.08733757585287094,
"learning_rate": 0.00016573349831344616,
"loss": 0.4582,
"step": 436
},
{
"epoch": 0.2769328263624842,
"grad_norm": 0.0712830200791359,
"learning_rate": 0.00016558237798618245,
"loss": 0.4336,
"step": 437
},
{
"epoch": 0.27756653992395436,
"grad_norm": 0.06345337629318237,
"learning_rate": 0.00016543099439299844,
"loss": 0.4587,
"step": 438
},
{
"epoch": 0.2782002534854246,
"grad_norm": 0.06224706023931503,
"learning_rate": 0.0001652793481415901,
"loss": 0.5171,
"step": 439
},
{
"epoch": 0.2788339670468948,
"grad_norm": 0.0549205057322979,
"learning_rate": 0.00016512743984070769,
"loss": 0.5189,
"step": 440
},
{
"epoch": 0.279467680608365,
"grad_norm": 0.07211892306804657,
"learning_rate": 0.00016497527010015336,
"loss": 0.6118,
"step": 441
},
{
"epoch": 0.28010139416983526,
"grad_norm": 0.05902037024497986,
"learning_rate": 0.00016482283953077887,
"loss": 0.5376,
"step": 442
},
{
"epoch": 0.28073510773130544,
"grad_norm": 0.04935478791594505,
"learning_rate": 0.00016467014874448288,
"loss": 0.5468,
"step": 443
},
{
"epoch": 0.2813688212927757,
"grad_norm": 0.08219460397958755,
"learning_rate": 0.00016451719835420877,
"loss": 0.5723,
"step": 444
},
{
"epoch": 0.28200253485424587,
"grad_norm": 0.08607888221740723,
"learning_rate": 0.000164363988973942,
"loss": 0.4821,
"step": 445
},
{
"epoch": 0.2826362484157161,
"grad_norm": 0.05368666350841522,
"learning_rate": 0.00016421052121870755,
"loss": 0.4759,
"step": 446
},
{
"epoch": 0.2832699619771863,
"grad_norm": 0.09421613812446594,
"learning_rate": 0.00016405679570456782,
"loss": 0.4634,
"step": 447
},
{
"epoch": 0.28390367553865653,
"grad_norm": 0.06585177779197693,
"learning_rate": 0.0001639028130486198,
"loss": 0.5049,
"step": 448
},
{
"epoch": 0.28453738910012677,
"grad_norm": 0.07445032149553299,
"learning_rate": 0.00016374857386899268,
"loss": 0.6255,
"step": 449
},
{
"epoch": 0.28517110266159695,
"grad_norm": 0.05892190709710121,
"learning_rate": 0.00016359407878484552,
"loss": 0.5035,
"step": 450
},
{
"epoch": 0.2858048162230672,
"grad_norm": 0.08238600939512253,
"learning_rate": 0.00016343932841636456,
"loss": 0.4818,
"step": 451
},
{
"epoch": 0.2864385297845374,
"grad_norm": 0.0664915144443512,
"learning_rate": 0.00016328432338476084,
"loss": 0.4375,
"step": 452
},
{
"epoch": 0.2870722433460076,
"grad_norm": 0.04862099885940552,
"learning_rate": 0.00016312906431226773,
"loss": 0.4138,
"step": 453
},
{
"epoch": 0.2877059569074778,
"grad_norm": 0.04187007248401642,
"learning_rate": 0.00016297355182213837,
"loss": 0.3836,
"step": 454
},
{
"epoch": 0.28833967046894804,
"grad_norm": 0.05451095104217529,
"learning_rate": 0.00016281778653864316,
"loss": 0.4451,
"step": 455
},
{
"epoch": 0.2889733840304182,
"grad_norm": 0.061764512211084366,
"learning_rate": 0.0001626617690870673,
"loss": 0.6315,
"step": 456
},
{
"epoch": 0.28960709759188846,
"grad_norm": 0.05365981534123421,
"learning_rate": 0.0001625055000937083,
"loss": 0.4399,
"step": 457
},
{
"epoch": 0.2902408111533587,
"grad_norm": 0.10771326720714569,
"learning_rate": 0.00016234898018587337,
"loss": 0.5229,
"step": 458
},
{
"epoch": 0.2908745247148289,
"grad_norm": 0.05859148129820824,
"learning_rate": 0.000162192209991877,
"loss": 0.4254,
"step": 459
},
{
"epoch": 0.2915082382762991,
"grad_norm": 0.08183909952640533,
"learning_rate": 0.00016203519014103837,
"loss": 0.3658,
"step": 460
},
{
"epoch": 0.2921419518377693,
"grad_norm": 0.04404648020863533,
"learning_rate": 0.00016187792126367886,
"loss": 0.4138,
"step": 461
},
{
"epoch": 0.29277566539923955,
"grad_norm": 0.056379418820142746,
"learning_rate": 0.00016172040399111957,
"loss": 0.4781,
"step": 462
},
{
"epoch": 0.29340937896070973,
"grad_norm": 0.0440094955265522,
"learning_rate": 0.00016156263895567867,
"loss": 0.4623,
"step": 463
},
{
"epoch": 0.29404309252217997,
"grad_norm": 0.055651161819696426,
"learning_rate": 0.00016140462679066885,
"loss": 0.5002,
"step": 464
},
{
"epoch": 0.2946768060836502,
"grad_norm": 0.09338720887899399,
"learning_rate": 0.00016124636813039502,
"loss": 0.5199,
"step": 465
},
{
"epoch": 0.2953105196451204,
"grad_norm": 0.07024485617876053,
"learning_rate": 0.00016108786361015143,
"loss": 0.5378,
"step": 466
},
{
"epoch": 0.29594423320659063,
"grad_norm": 0.05211356282234192,
"learning_rate": 0.00016092911386621938,
"loss": 0.5895,
"step": 467
},
{
"epoch": 0.2965779467680608,
"grad_norm": 0.05571569502353668,
"learning_rate": 0.00016077011953586452,
"loss": 0.4952,
"step": 468
},
{
"epoch": 0.29721166032953106,
"grad_norm": 0.07663686573505402,
"learning_rate": 0.00016061088125733433,
"loss": 0.5341,
"step": 469
},
{
"epoch": 0.29784537389100124,
"grad_norm": 0.04910871386528015,
"learning_rate": 0.0001604513996698556,
"loss": 0.445,
"step": 470
},
{
"epoch": 0.2984790874524715,
"grad_norm": 0.07365076243877411,
"learning_rate": 0.0001602916754136318,
"loss": 0.5364,
"step": 471
},
{
"epoch": 0.2991128010139417,
"grad_norm": 0.08367875218391418,
"learning_rate": 0.00016013170912984058,
"loss": 0.5709,
"step": 472
},
{
"epoch": 0.2997465145754119,
"grad_norm": 0.06659605354070663,
"learning_rate": 0.00015997150146063115,
"loss": 0.5351,
"step": 473
},
{
"epoch": 0.30038022813688214,
"grad_norm": 0.05647695064544678,
"learning_rate": 0.00015981105304912162,
"loss": 0.4103,
"step": 474
},
{
"epoch": 0.3010139416983523,
"grad_norm": 0.05512802302837372,
"learning_rate": 0.0001596503645393966,
"loss": 0.4919,
"step": 475
},
{
"epoch": 0.30164765525982257,
"grad_norm": 0.07482268661260605,
"learning_rate": 0.0001594894365765045,
"loss": 0.5266,
"step": 476
},
{
"epoch": 0.30228136882129275,
"grad_norm": 0.08068813383579254,
"learning_rate": 0.000159328269806455,
"loss": 0.6268,
"step": 477
},
{
"epoch": 0.302915082382763,
"grad_norm": 0.05029362812638283,
"learning_rate": 0.00015916686487621635,
"loss": 0.4999,
"step": 478
},
{
"epoch": 0.30354879594423323,
"grad_norm": 0.0705760046839714,
"learning_rate": 0.00015900522243371282,
"loss": 0.5182,
"step": 479
},
{
"epoch": 0.3041825095057034,
"grad_norm": 0.20289281010627747,
"learning_rate": 0.00015884334312782223,
"loss": 0.6609,
"step": 480
},
{
"epoch": 0.30481622306717365,
"grad_norm": 0.05456344410777092,
"learning_rate": 0.00015868122760837313,
"loss": 0.4575,
"step": 481
},
{
"epoch": 0.30544993662864384,
"grad_norm": 0.06280402094125748,
"learning_rate": 0.00015851887652614237,
"loss": 0.4186,
"step": 482
},
{
"epoch": 0.3060836501901141,
"grad_norm": 0.06588494777679443,
"learning_rate": 0.0001583562905328524,
"loss": 0.5235,
"step": 483
},
{
"epoch": 0.30671736375158426,
"grad_norm": 0.14238761365413666,
"learning_rate": 0.00015819347028116858,
"loss": 0.5727,
"step": 484
},
{
"epoch": 0.3073510773130545,
"grad_norm": 0.0709756463766098,
"learning_rate": 0.0001580304164246968,
"loss": 0.4003,
"step": 485
},
{
"epoch": 0.30798479087452474,
"grad_norm": 0.3064410388469696,
"learning_rate": 0.0001578671296179806,
"loss": 0.524,
"step": 486
},
{
"epoch": 0.3086185044359949,
"grad_norm": 0.04714261740446091,
"learning_rate": 0.00015770361051649863,
"loss": 0.3965,
"step": 487
},
{
"epoch": 0.30925221799746516,
"grad_norm": 0.05930585786700249,
"learning_rate": 0.00015753985977666213,
"loss": 0.4562,
"step": 488
},
{
"epoch": 0.30988593155893535,
"grad_norm": 0.07817406952381134,
"learning_rate": 0.00015737587805581219,
"loss": 0.5846,
"step": 489
},
{
"epoch": 0.3105196451204056,
"grad_norm": 0.05352717638015747,
"learning_rate": 0.00015721166601221698,
"loss": 0.5899,
"step": 490
},
{
"epoch": 0.31115335868187577,
"grad_norm": 0.05995578318834305,
"learning_rate": 0.00015704722430506942,
"loss": 0.5521,
"step": 491
},
{
"epoch": 0.311787072243346,
"grad_norm": 0.15946877002716064,
"learning_rate": 0.00015688255359448428,
"loss": 0.6366,
"step": 492
},
{
"epoch": 0.31242078580481625,
"grad_norm": 0.06116756424307823,
"learning_rate": 0.00015671765454149559,
"loss": 0.4436,
"step": 493
},
{
"epoch": 0.31305449936628643,
"grad_norm": 0.272954523563385,
"learning_rate": 0.00015655252780805414,
"loss": 0.6512,
"step": 494
},
{
"epoch": 0.31368821292775667,
"grad_norm": 0.0462493859231472,
"learning_rate": 0.0001563871740570245,
"loss": 0.4075,
"step": 495
},
{
"epoch": 0.31432192648922685,
"grad_norm": 0.08116989582777023,
"learning_rate": 0.00015622159395218272,
"loss": 0.6353,
"step": 496
},
{
"epoch": 0.3149556400506971,
"grad_norm": 0.07837241142988205,
"learning_rate": 0.0001560557881582134,
"loss": 0.5087,
"step": 497
},
{
"epoch": 0.3155893536121673,
"grad_norm": 0.07096578180789948,
"learning_rate": 0.00015588975734070717,
"loss": 0.617,
"step": 498
},
{
"epoch": 0.3162230671736375,
"grad_norm": 0.07047011703252792,
"learning_rate": 0.0001557235021661579,
"loss": 0.6406,
"step": 499
},
{
"epoch": 0.31685678073510776,
"grad_norm": 0.06322109699249268,
"learning_rate": 0.00015555702330196023,
"loss": 0.5973,
"step": 500
},
{
"epoch": 0.31749049429657794,
"grad_norm": 0.1788979321718216,
"learning_rate": 0.00015539032141640658,
"loss": 0.6022,
"step": 501
},
{
"epoch": 0.3181242078580482,
"grad_norm": 0.05936092510819435,
"learning_rate": 0.00015522339717868476,
"loss": 0.4314,
"step": 502
},
{
"epoch": 0.31875792141951836,
"grad_norm": 0.05811009183526039,
"learning_rate": 0.00015505625125887508,
"loss": 0.5641,
"step": 503
},
{
"epoch": 0.3193916349809886,
"grad_norm": 0.11950580030679703,
"learning_rate": 0.00015488888432794784,
"loss": 0.5796,
"step": 504
},
{
"epoch": 0.3200253485424588,
"grad_norm": 0.04393857717514038,
"learning_rate": 0.00015472129705776047,
"loss": 0.3637,
"step": 505
},
{
"epoch": 0.320659062103929,
"grad_norm": 0.11919873207807541,
"learning_rate": 0.00015455349012105486,
"loss": 0.4967,
"step": 506
},
{
"epoch": 0.32129277566539927,
"grad_norm": 0.055687014013528824,
"learning_rate": 0.00015438546419145488,
"loss": 0.4932,
"step": 507
},
{
"epoch": 0.32192648922686945,
"grad_norm": 0.058437906205654144,
"learning_rate": 0.00015421721994346327,
"loss": 0.5351,
"step": 508
},
{
"epoch": 0.3225602027883397,
"grad_norm": 0.04726817458868027,
"learning_rate": 0.00015404875805245935,
"loss": 0.433,
"step": 509
},
{
"epoch": 0.3231939163498099,
"grad_norm": 0.04807078838348389,
"learning_rate": 0.00015388007919469603,
"loss": 0.4534,
"step": 510
},
{
"epoch": 0.3238276299112801,
"grad_norm": 0.07437839359045029,
"learning_rate": 0.00015371118404729716,
"loss": 0.584,
"step": 511
},
{
"epoch": 0.3244613434727503,
"grad_norm": 0.050413914024829865,
"learning_rate": 0.00015354207328825491,
"loss": 0.3788,
"step": 512
},
{
"epoch": 0.32509505703422054,
"grad_norm": 0.07370271533727646,
"learning_rate": 0.0001533727475964269,
"loss": 0.4768,
"step": 513
},
{
"epoch": 0.3257287705956908,
"grad_norm": 0.06317605078220367,
"learning_rate": 0.00015320320765153367,
"loss": 0.5665,
"step": 514
},
{
"epoch": 0.32636248415716096,
"grad_norm": 0.061747610569000244,
"learning_rate": 0.00015303345413415564,
"loss": 0.6061,
"step": 515
},
{
"epoch": 0.3269961977186312,
"grad_norm": 0.07719457149505615,
"learning_rate": 0.00015286348772573075,
"loss": 0.4041,
"step": 516
},
{
"epoch": 0.3276299112801014,
"grad_norm": 0.048449669033288956,
"learning_rate": 0.0001526933091085515,
"loss": 0.4865,
"step": 517
},
{
"epoch": 0.3282636248415716,
"grad_norm": 0.06786296516656876,
"learning_rate": 0.00015252291896576214,
"loss": 0.5036,
"step": 518
},
{
"epoch": 0.3288973384030418,
"grad_norm": 0.056538064032793045,
"learning_rate": 0.0001523523179813562,
"loss": 0.5077,
"step": 519
},
{
"epoch": 0.32953105196451205,
"grad_norm": 0.06674568355083466,
"learning_rate": 0.00015218150684017347,
"loss": 0.701,
"step": 520
},
{
"epoch": 0.33016476552598223,
"grad_norm": 0.07875782251358032,
"learning_rate": 0.00015201048622789747,
"loss": 0.5375,
"step": 521
},
{
"epoch": 0.33079847908745247,
"grad_norm": 0.06530767679214478,
"learning_rate": 0.00015183925683105254,
"loss": 0.5136,
"step": 522
},
{
"epoch": 0.3314321926489227,
"grad_norm": 0.06704816222190857,
"learning_rate": 0.00015166781933700105,
"loss": 0.6015,
"step": 523
},
{
"epoch": 0.3320659062103929,
"grad_norm": 0.061236705631017685,
"learning_rate": 0.00015149617443394094,
"loss": 0.5323,
"step": 524
},
{
"epoch": 0.33269961977186313,
"grad_norm": 0.11219301074743271,
"learning_rate": 0.00015132432281090256,
"loss": 0.6076,
"step": 525
},
{
"epoch": 0.3333333333333333,
"grad_norm": 0.04857495799660683,
"learning_rate": 0.00015115226515774618,
"loss": 0.4208,
"step": 526
},
{
"epoch": 0.33396704689480355,
"grad_norm": 0.04918389767408371,
"learning_rate": 0.0001509800021651591,
"loss": 0.5069,
"step": 527
},
{
"epoch": 0.33460076045627374,
"grad_norm": 0.06613993644714355,
"learning_rate": 0.00015080753452465296,
"loss": 0.5443,
"step": 528
},
{
"epoch": 0.335234474017744,
"grad_norm": 0.05695560947060585,
"learning_rate": 0.00015063486292856082,
"loss": 0.5632,
"step": 529
},
{
"epoch": 0.3358681875792142,
"grad_norm": 0.05377941578626633,
"learning_rate": 0.0001504619880700346,
"loss": 0.3954,
"step": 530
},
{
"epoch": 0.3365019011406844,
"grad_norm": 0.06934024393558502,
"learning_rate": 0.000150288910643042,
"loss": 0.5669,
"step": 531
},
{
"epoch": 0.33713561470215464,
"grad_norm": 0.10134469717741013,
"learning_rate": 0.00015011563134236408,
"loss": 0.5248,
"step": 532
},
{
"epoch": 0.3377693282636248,
"grad_norm": 0.11486341804265976,
"learning_rate": 0.00014994215086359212,
"loss": 0.6074,
"step": 533
},
{
"epoch": 0.33840304182509506,
"grad_norm": 0.07518647611141205,
"learning_rate": 0.00014976846990312514,
"loss": 0.5196,
"step": 534
},
{
"epoch": 0.33903675538656525,
"grad_norm": 0.06767034530639648,
"learning_rate": 0.0001495945891581668,
"loss": 0.4821,
"step": 535
},
{
"epoch": 0.3396704689480355,
"grad_norm": 0.047710105776786804,
"learning_rate": 0.00014942050932672277,
"loss": 0.4468,
"step": 536
},
{
"epoch": 0.3403041825095057,
"grad_norm": 0.10735978931188583,
"learning_rate": 0.000149246231107598,
"loss": 0.4851,
"step": 537
},
{
"epoch": 0.3409378960709759,
"grad_norm": 0.0501636303961277,
"learning_rate": 0.0001490717552003938,
"loss": 0.4831,
"step": 538
},
{
"epoch": 0.34157160963244615,
"grad_norm": 0.052001163363456726,
"learning_rate": 0.00014889708230550496,
"loss": 0.5206,
"step": 539
},
{
"epoch": 0.34220532319391633,
"grad_norm": 0.06634392589330673,
"learning_rate": 0.00014872221312411718,
"loss": 0.5051,
"step": 540
},
{
"epoch": 0.3428390367553866,
"grad_norm": 0.053568046540021896,
"learning_rate": 0.00014854714835820394,
"loss": 0.5257,
"step": 541
},
{
"epoch": 0.34347275031685676,
"grad_norm": 0.05587064474821091,
"learning_rate": 0.000148371888710524,
"loss": 0.5924,
"step": 542
},
{
"epoch": 0.344106463878327,
"grad_norm": 0.055588286370038986,
"learning_rate": 0.00014819643488461835,
"loss": 0.4242,
"step": 543
},
{
"epoch": 0.34474017743979724,
"grad_norm": 0.07102327048778534,
"learning_rate": 0.00014802078758480747,
"loss": 0.5229,
"step": 544
},
{
"epoch": 0.3453738910012674,
"grad_norm": 0.06629911810159683,
"learning_rate": 0.00014784494751618853,
"loss": 0.435,
"step": 545
},
{
"epoch": 0.34600760456273766,
"grad_norm": 0.054953474551439285,
"learning_rate": 0.00014766891538463254,
"loss": 0.5796,
"step": 546
},
{
"epoch": 0.34664131812420784,
"grad_norm": 0.05943427234888077,
"learning_rate": 0.00014749269189678142,
"loss": 0.427,
"step": 547
},
{
"epoch": 0.3472750316856781,
"grad_norm": 0.05509248375892639,
"learning_rate": 0.00014731627776004536,
"loss": 0.5456,
"step": 548
},
{
"epoch": 0.34790874524714827,
"grad_norm": 0.0867772102355957,
"learning_rate": 0.0001471396736825998,
"loss": 0.5649,
"step": 549
},
{
"epoch": 0.3485424588086185,
"grad_norm": 0.08892481029033661,
"learning_rate": 0.00014696288037338256,
"loss": 0.5489,
"step": 550
},
{
"epoch": 0.34917617237008874,
"grad_norm": 0.07534697651863098,
"learning_rate": 0.00014678589854209134,
"loss": 0.4728,
"step": 551
},
{
"epoch": 0.34980988593155893,
"grad_norm": 0.03929729387164116,
"learning_rate": 0.00014660872889918044,
"loss": 0.3527,
"step": 552
},
{
"epoch": 0.35044359949302917,
"grad_norm": 0.06847205758094788,
"learning_rate": 0.00014643137215585806,
"loss": 0.4204,
"step": 553
},
{
"epoch": 0.35107731305449935,
"grad_norm": 0.06959280371665955,
"learning_rate": 0.00014625382902408356,
"loss": 0.5043,
"step": 554
},
{
"epoch": 0.3517110266159696,
"grad_norm": 0.057750072330236435,
"learning_rate": 0.0001460761002165645,
"loss": 0.5717,
"step": 555
},
{
"epoch": 0.3523447401774398,
"grad_norm": 0.0640597864985466,
"learning_rate": 0.00014589818644675378,
"loss": 0.5691,
"step": 556
},
{
"epoch": 0.35297845373891,
"grad_norm": 0.05334803834557533,
"learning_rate": 0.0001457200884288468,
"loss": 0.4438,
"step": 557
},
{
"epoch": 0.35361216730038025,
"grad_norm": 0.050739504396915436,
"learning_rate": 0.0001455418068777786,
"loss": 0.4418,
"step": 558
},
{
"epoch": 0.35424588086185044,
"grad_norm": 0.04636020213365555,
"learning_rate": 0.00014536334250922093,
"loss": 0.3724,
"step": 559
},
{
"epoch": 0.3548795944233207,
"grad_norm": 0.04343942552804947,
"learning_rate": 0.00014518469603957943,
"loss": 0.3218,
"step": 560
},
{
"epoch": 0.35551330798479086,
"grad_norm": 0.06655412167310715,
"learning_rate": 0.00014500586818599076,
"loss": 0.5158,
"step": 561
},
{
"epoch": 0.3561470215462611,
"grad_norm": 0.06236552819609642,
"learning_rate": 0.0001448268596663197,
"loss": 0.5348,
"step": 562
},
{
"epoch": 0.3567807351077313,
"grad_norm": 0.0551675446331501,
"learning_rate": 0.00014464767119915629,
"loss": 0.5191,
"step": 563
},
{
"epoch": 0.3574144486692015,
"grad_norm": 0.0711677148938179,
"learning_rate": 0.00014446830350381293,
"loss": 0.5787,
"step": 564
},
{
"epoch": 0.35804816223067176,
"grad_norm": 0.05513966456055641,
"learning_rate": 0.00014428875730032145,
"loss": 0.4056,
"step": 565
},
{
"epoch": 0.35868187579214195,
"grad_norm": 0.07472972571849823,
"learning_rate": 0.00014410903330943029,
"loss": 0.4217,
"step": 566
},
{
"epoch": 0.3593155893536122,
"grad_norm": 0.05436578020453453,
"learning_rate": 0.00014392913225260153,
"loss": 0.5195,
"step": 567
},
{
"epoch": 0.35994930291508237,
"grad_norm": 0.14983688294887543,
"learning_rate": 0.00014374905485200817,
"loss": 0.6106,
"step": 568
},
{
"epoch": 0.3605830164765526,
"grad_norm": 0.09657621383666992,
"learning_rate": 0.00014356880183053104,
"loss": 0.5487,
"step": 569
},
{
"epoch": 0.3612167300380228,
"grad_norm": 0.06128871440887451,
"learning_rate": 0.00014338837391175582,
"loss": 0.2958,
"step": 570
},
{
"epoch": 0.36185044359949303,
"grad_norm": 0.3691087067127228,
"learning_rate": 0.00014320777181997052,
"loss": 0.4846,
"step": 571
},
{
"epoch": 0.36248415716096327,
"grad_norm": 0.07217471301555634,
"learning_rate": 0.00014302699628016208,
"loss": 0.4256,
"step": 572
},
{
"epoch": 0.36311787072243346,
"grad_norm": 0.05521377548575401,
"learning_rate": 0.00014284604801801396,
"loss": 0.48,
"step": 573
},
{
"epoch": 0.3637515842839037,
"grad_norm": 0.04929598793387413,
"learning_rate": 0.0001426649277599028,
"loss": 0.5303,
"step": 574
},
{
"epoch": 0.3643852978453739,
"grad_norm": 0.050052460283041,
"learning_rate": 0.00014248363623289574,
"loss": 0.4863,
"step": 575
},
{
"epoch": 0.3650190114068441,
"grad_norm": 0.04534770920872688,
"learning_rate": 0.0001423021741647474,
"loss": 0.5239,
"step": 576
},
{
"epoch": 0.3656527249683143,
"grad_norm": 0.07982175797224045,
"learning_rate": 0.0001421205422838971,
"loss": 0.5924,
"step": 577
},
{
"epoch": 0.36628643852978454,
"grad_norm": 0.04665097966790199,
"learning_rate": 0.0001419387413194657,
"loss": 0.4579,
"step": 578
},
{
"epoch": 0.3669201520912547,
"grad_norm": 0.0721178650856018,
"learning_rate": 0.0001417567720012529,
"loss": 0.5235,
"step": 579
},
{
"epoch": 0.36755386565272496,
"grad_norm": 0.04838218167424202,
"learning_rate": 0.00014157463505973418,
"loss": 0.4138,
"step": 580
},
{
"epoch": 0.3681875792141952,
"grad_norm": 0.07050075381994247,
"learning_rate": 0.00014139233122605798,
"loss": 0.5749,
"step": 581
},
{
"epoch": 0.3688212927756654,
"grad_norm": 0.07718097418546677,
"learning_rate": 0.00014120986123204257,
"loss": 0.5399,
"step": 582
},
{
"epoch": 0.3694550063371356,
"grad_norm": 0.08041960000991821,
"learning_rate": 0.00014102722581017332,
"loss": 0.4264,
"step": 583
},
{
"epoch": 0.3700887198986058,
"grad_norm": 0.08530323952436447,
"learning_rate": 0.00014084442569359964,
"loss": 0.4534,
"step": 584
},
{
"epoch": 0.37072243346007605,
"grad_norm": 0.0639512911438942,
"learning_rate": 0.00014066146161613208,
"loss": 0.4295,
"step": 585
},
{
"epoch": 0.37135614702154623,
"grad_norm": 0.06618323922157288,
"learning_rate": 0.00014047833431223938,
"loss": 0.6437,
"step": 586
},
{
"epoch": 0.3719898605830165,
"grad_norm": 0.057782579213380814,
"learning_rate": 0.00014029504451704557,
"loss": 0.4855,
"step": 587
},
{
"epoch": 0.3726235741444867,
"grad_norm": 0.04774455726146698,
"learning_rate": 0.00014011159296632678,
"loss": 0.3035,
"step": 588
},
{
"epoch": 0.3732572877059569,
"grad_norm": 0.05420040711760521,
"learning_rate": 0.00013992798039650872,
"loss": 0.4444,
"step": 589
},
{
"epoch": 0.37389100126742714,
"grad_norm": 0.06096061319112778,
"learning_rate": 0.00013974420754466328,
"loss": 0.5743,
"step": 590
},
{
"epoch": 0.3745247148288973,
"grad_norm": 0.055694580078125,
"learning_rate": 0.0001395602751485059,
"loss": 0.5652,
"step": 591
},
{
"epoch": 0.37515842839036756,
"grad_norm": 0.0731462761759758,
"learning_rate": 0.00013937618394639235,
"loss": 0.4977,
"step": 592
},
{
"epoch": 0.37579214195183774,
"grad_norm": 0.05172240361571312,
"learning_rate": 0.000139191934677316,
"loss": 0.524,
"step": 593
},
{
"epoch": 0.376425855513308,
"grad_norm": 0.05123208463191986,
"learning_rate": 0.00013900752808090468,
"loss": 0.5355,
"step": 594
},
{
"epoch": 0.3770595690747782,
"grad_norm": 0.056850165128707886,
"learning_rate": 0.00013882296489741783,
"loss": 0.4908,
"step": 595
},
{
"epoch": 0.3776932826362484,
"grad_norm": 0.06634749472141266,
"learning_rate": 0.00013863824586774344,
"loss": 0.4283,
"step": 596
},
{
"epoch": 0.37832699619771865,
"grad_norm": 0.04840132221579552,
"learning_rate": 0.00013845337173339507,
"loss": 0.4897,
"step": 597
},
{
"epoch": 0.37896070975918883,
"grad_norm": 0.0695575699210167,
"learning_rate": 0.000138268343236509,
"loss": 0.583,
"step": 598
},
{
"epoch": 0.37959442332065907,
"grad_norm": 0.048906922340393066,
"learning_rate": 0.00013808316111984107,
"loss": 0.4496,
"step": 599
},
{
"epoch": 0.38022813688212925,
"grad_norm": 0.05677906423807144,
"learning_rate": 0.0001378978261267639,
"loss": 0.4717,
"step": 600
},
{
"epoch": 0.3808618504435995,
"grad_norm": 0.10213559865951538,
"learning_rate": 0.0001377123390012637,
"loss": 0.6238,
"step": 601
},
{
"epoch": 0.38149556400506973,
"grad_norm": 0.050033628940582275,
"learning_rate": 0.00013752670048793744,
"loss": 0.4001,
"step": 602
},
{
"epoch": 0.3821292775665399,
"grad_norm": 0.07862118631601334,
"learning_rate": 0.00013734091133198975,
"loss": 0.5346,
"step": 603
},
{
"epoch": 0.38276299112801015,
"grad_norm": 0.053442683070898056,
"learning_rate": 0.00013715497227923006,
"loss": 0.4903,
"step": 604
},
{
"epoch": 0.38339670468948034,
"grad_norm": 0.06940152496099472,
"learning_rate": 0.00013696888407606952,
"loss": 0.568,
"step": 605
},
{
"epoch": 0.3840304182509506,
"grad_norm": 0.048307280987501144,
"learning_rate": 0.00013678264746951787,
"loss": 0.5245,
"step": 606
},
{
"epoch": 0.38466413181242076,
"grad_norm": 0.04498027265071869,
"learning_rate": 0.00013659626320718077,
"loss": 0.3682,
"step": 607
},
{
"epoch": 0.385297845373891,
"grad_norm": 0.05874482914805412,
"learning_rate": 0.0001364097320372565,
"loss": 0.5148,
"step": 608
},
{
"epoch": 0.38593155893536124,
"grad_norm": 0.04996568709611893,
"learning_rate": 0.00013622305470853313,
"loss": 0.4756,
"step": 609
},
{
"epoch": 0.3865652724968314,
"grad_norm": 0.07363967597484589,
"learning_rate": 0.00013603623197038536,
"loss": 0.5053,
"step": 610
},
{
"epoch": 0.38719898605830166,
"grad_norm": 0.0668586939573288,
"learning_rate": 0.00013584926457277168,
"loss": 0.5362,
"step": 611
},
{
"epoch": 0.38783269961977185,
"grad_norm": 0.06371022760868073,
"learning_rate": 0.0001356621532662313,
"loss": 0.5457,
"step": 612
},
{
"epoch": 0.3884664131812421,
"grad_norm": 0.07108695805072784,
"learning_rate": 0.00013547489880188108,
"loss": 0.5238,
"step": 613
},
{
"epoch": 0.38910012674271227,
"grad_norm": 0.05326547846198082,
"learning_rate": 0.00013528750193141255,
"loss": 0.4505,
"step": 614
},
{
"epoch": 0.3897338403041825,
"grad_norm": 0.08405181765556335,
"learning_rate": 0.0001350999634070889,
"loss": 0.6235,
"step": 615
},
{
"epoch": 0.39036755386565275,
"grad_norm": 0.05981157347559929,
"learning_rate": 0.000134912283981742,
"loss": 0.5175,
"step": 616
},
{
"epoch": 0.39100126742712293,
"grad_norm": 0.05275322496891022,
"learning_rate": 0.00013472446440876927,
"loss": 0.5536,
"step": 617
},
{
"epoch": 0.3916349809885932,
"grad_norm": 0.053324826061725616,
"learning_rate": 0.00013453650544213076,
"loss": 0.5926,
"step": 618
},
{
"epoch": 0.39226869455006336,
"grad_norm": 0.056955184787511826,
"learning_rate": 0.0001343484078363461,
"loss": 0.4393,
"step": 619
},
{
"epoch": 0.3929024081115336,
"grad_norm": 0.05232278257608414,
"learning_rate": 0.00013416017234649146,
"loss": 0.5163,
"step": 620
},
{
"epoch": 0.3935361216730038,
"grad_norm": 0.06405606865882874,
"learning_rate": 0.00013397179972819643,
"loss": 0.575,
"step": 621
},
{
"epoch": 0.394169835234474,
"grad_norm": 0.058417316526174545,
"learning_rate": 0.00013378329073764119,
"loss": 0.542,
"step": 622
},
{
"epoch": 0.39480354879594426,
"grad_norm": 0.05610906332731247,
"learning_rate": 0.00013359464613155325,
"loss": 0.4576,
"step": 623
},
{
"epoch": 0.39543726235741444,
"grad_norm": 0.06383884698152542,
"learning_rate": 0.00013340586666720457,
"loss": 0.5938,
"step": 624
},
{
"epoch": 0.3960709759188847,
"grad_norm": 0.05517081543803215,
"learning_rate": 0.0001332169531024085,
"loss": 0.4492,
"step": 625
},
{
"epoch": 0.39670468948035487,
"grad_norm": 0.07210738211870193,
"learning_rate": 0.00013302790619551674,
"loss": 0.6145,
"step": 626
},
{
"epoch": 0.3973384030418251,
"grad_norm": 0.06636934727430344,
"learning_rate": 0.00013283872670541604,
"loss": 0.4242,
"step": 627
},
{
"epoch": 0.3979721166032953,
"grad_norm": 0.07977598905563354,
"learning_rate": 0.00013264941539152566,
"loss": 0.5553,
"step": 628
},
{
"epoch": 0.39860583016476553,
"grad_norm": 0.056893352419137955,
"learning_rate": 0.00013245997301379383,
"loss": 0.4311,
"step": 629
},
{
"epoch": 0.39923954372623577,
"grad_norm": 1.9656810760498047,
"learning_rate": 0.000132270400332695,
"loss": 0.5208,
"step": 630
},
{
"epoch": 0.39987325728770595,
"grad_norm": 0.05810742825269699,
"learning_rate": 0.00013208069810922673,
"loss": 0.56,
"step": 631
},
{
"epoch": 0.4005069708491762,
"grad_norm": 0.08527707308530807,
"learning_rate": 0.00013189086710490647,
"loss": 0.5094,
"step": 632
},
{
"epoch": 0.4011406844106464,
"grad_norm": 0.07540644705295563,
"learning_rate": 0.00013170090808176883,
"loss": 0.5193,
"step": 633
},
{
"epoch": 0.4017743979721166,
"grad_norm": 0.08294139802455902,
"learning_rate": 0.0001315108218023621,
"loss": 0.5538,
"step": 634
},
{
"epoch": 0.4024081115335868,
"grad_norm": 0.07025711983442307,
"learning_rate": 0.00013132060902974554,
"loss": 0.5451,
"step": 635
},
{
"epoch": 0.40304182509505704,
"grad_norm": 0.05808630213141441,
"learning_rate": 0.00013113027052748615,
"loss": 0.5342,
"step": 636
},
{
"epoch": 0.4036755386565273,
"grad_norm": 0.040730297565460205,
"learning_rate": 0.0001309398070596557,
"loss": 0.4434,
"step": 637
},
{
"epoch": 0.40430925221799746,
"grad_norm": 0.06423351913690567,
"learning_rate": 0.00013074921939082757,
"loss": 0.5463,
"step": 638
},
{
"epoch": 0.4049429657794677,
"grad_norm": 0.07848164439201355,
"learning_rate": 0.00013055850828607368,
"loss": 0.651,
"step": 639
},
{
"epoch": 0.4055766793409379,
"grad_norm": 0.08495569974184036,
"learning_rate": 0.00013036767451096148,
"loss": 0.4675,
"step": 640
},
{
"epoch": 0.4062103929024081,
"grad_norm": 0.06640883535146713,
"learning_rate": 0.0001301767188315509,
"loss": 0.5261,
"step": 641
},
{
"epoch": 0.4068441064638783,
"grad_norm": 0.04708843678236008,
"learning_rate": 0.00012998564201439116,
"loss": 0.3417,
"step": 642
},
{
"epoch": 0.40747782002534855,
"grad_norm": 0.09854655712842941,
"learning_rate": 0.00012979444482651782,
"loss": 0.6236,
"step": 643
},
{
"epoch": 0.40811153358681873,
"grad_norm": 0.11556591838598251,
"learning_rate": 0.00012960312803544962,
"loss": 0.6022,
"step": 644
},
{
"epoch": 0.40874524714828897,
"grad_norm": 0.922315776348114,
"learning_rate": 0.00012941169240918534,
"loss": 0.4034,
"step": 645
},
{
"epoch": 0.4093789607097592,
"grad_norm": 0.08266003429889679,
"learning_rate": 0.00012922013871620095,
"loss": 0.5455,
"step": 646
},
{
"epoch": 0.4100126742712294,
"grad_norm": 0.05183318257331848,
"learning_rate": 0.00012902846772544624,
"loss": 0.437,
"step": 647
},
{
"epoch": 0.41064638783269963,
"grad_norm": 0.10581205785274506,
"learning_rate": 0.00012883668020634195,
"loss": 0.5762,
"step": 648
},
{
"epoch": 0.4112801013941698,
"grad_norm": 0.06646697223186493,
"learning_rate": 0.00012864477692877657,
"loss": 0.5462,
"step": 649
},
{
"epoch": 0.41191381495564006,
"grad_norm": 0.10537492483854294,
"learning_rate": 0.00012845275866310324,
"loss": 0.5098,
"step": 650
},
{
"epoch": 0.41254752851711024,
"grad_norm": 0.07540510594844818,
"learning_rate": 0.0001282606261801368,
"loss": 0.6208,
"step": 651
},
{
"epoch": 0.4131812420785805,
"grad_norm": 0.06597273051738739,
"learning_rate": 0.0001280683802511504,
"loss": 0.5896,
"step": 652
},
{
"epoch": 0.4138149556400507,
"grad_norm": 0.060704171657562256,
"learning_rate": 0.0001278760216478728,
"loss": 0.4844,
"step": 653
},
{
"epoch": 0.4144486692015209,
"grad_norm": 0.07420588284730911,
"learning_rate": 0.00012768355114248494,
"loss": 0.5673,
"step": 654
},
{
"epoch": 0.41508238276299114,
"grad_norm": 0.06360962241888046,
"learning_rate": 0.00012749096950761702,
"loss": 0.5322,
"step": 655
},
{
"epoch": 0.4157160963244613,
"grad_norm": 0.0631156638264656,
"learning_rate": 0.00012729827751634533,
"loss": 0.4863,
"step": 656
},
{
"epoch": 0.41634980988593157,
"grad_norm": 0.06497811526060104,
"learning_rate": 0.00012710547594218917,
"loss": 0.5775,
"step": 657
},
{
"epoch": 0.41698352344740175,
"grad_norm": 0.07515639066696167,
"learning_rate": 0.00012691256555910768,
"loss": 0.5207,
"step": 658
},
{
"epoch": 0.417617237008872,
"grad_norm": 0.073845274746418,
"learning_rate": 0.0001267195471414969,
"loss": 0.5306,
"step": 659
},
{
"epoch": 0.41825095057034223,
"grad_norm": 0.0654008612036705,
"learning_rate": 0.0001265264214641864,
"loss": 0.4677,
"step": 660
},
{
"epoch": 0.4188846641318124,
"grad_norm": 0.043669626116752625,
"learning_rate": 0.00012633318930243648,
"loss": 0.4221,
"step": 661
},
{
"epoch": 0.41951837769328265,
"grad_norm": 0.047917358577251434,
"learning_rate": 0.00012613985143193482,
"loss": 0.3635,
"step": 662
},
{
"epoch": 0.42015209125475284,
"grad_norm": 0.06635928153991699,
"learning_rate": 0.0001259464086287934,
"loss": 0.5453,
"step": 663
},
{
"epoch": 0.4207858048162231,
"grad_norm": 0.05781178921461105,
"learning_rate": 0.0001257528616695455,
"loss": 0.5,
"step": 664
},
{
"epoch": 0.42141951837769326,
"grad_norm": 0.0605790875852108,
"learning_rate": 0.00012555921133114247,
"loss": 0.5034,
"step": 665
},
{
"epoch": 0.4220532319391635,
"grad_norm": 0.04980487376451492,
"learning_rate": 0.00012536545839095074,
"loss": 0.4347,
"step": 666
},
{
"epoch": 0.42268694550063374,
"grad_norm": 0.06540601700544357,
"learning_rate": 0.00012517160362674848,
"loss": 0.5351,
"step": 667
},
{
"epoch": 0.4233206590621039,
"grad_norm": 0.049716752022504807,
"learning_rate": 0.0001249776478167227,
"loss": 0.4476,
"step": 668
},
{
"epoch": 0.42395437262357416,
"grad_norm": 0.10267884284257889,
"learning_rate": 0.00012478359173946602,
"loss": 0.5616,
"step": 669
},
{
"epoch": 0.42458808618504434,
"grad_norm": 0.05907197296619415,
"learning_rate": 0.00012458943617397344,
"loss": 0.4403,
"step": 670
},
{
"epoch": 0.4252217997465146,
"grad_norm": 0.09869077801704407,
"learning_rate": 0.0001243951818996396,
"loss": 0.6336,
"step": 671
},
{
"epoch": 0.42585551330798477,
"grad_norm": 0.07539843767881393,
"learning_rate": 0.00012420082969625518,
"loss": 0.6676,
"step": 672
},
{
"epoch": 0.426489226869455,
"grad_norm": 0.09385417401790619,
"learning_rate": 0.00012400638034400395,
"loss": 0.5714,
"step": 673
},
{
"epoch": 0.42712294043092525,
"grad_norm": 0.06782330572605133,
"learning_rate": 0.00012381183462345982,
"loss": 0.4956,
"step": 674
},
{
"epoch": 0.42775665399239543,
"grad_norm": 0.06100660189986229,
"learning_rate": 0.00012361719331558345,
"loss": 0.4217,
"step": 675
},
{
"epoch": 0.42839036755386567,
"grad_norm": 0.09908254444599152,
"learning_rate": 0.00012342245720171918,
"loss": 0.5405,
"step": 676
},
{
"epoch": 0.42902408111533585,
"grad_norm": 0.05237731710076332,
"learning_rate": 0.00012322762706359203,
"loss": 0.5044,
"step": 677
},
{
"epoch": 0.4296577946768061,
"grad_norm": 0.04910963028669357,
"learning_rate": 0.00012303270368330439,
"loss": 0.5073,
"step": 678
},
{
"epoch": 0.4302915082382763,
"grad_norm": 0.06268120557069778,
"learning_rate": 0.00012283768784333293,
"loss": 0.5736,
"step": 679
},
{
"epoch": 0.4309252217997465,
"grad_norm": 0.05207136273384094,
"learning_rate": 0.00012264258032652559,
"loss": 0.5319,
"step": 680
},
{
"epoch": 0.43155893536121676,
"grad_norm": 0.09583932906389236,
"learning_rate": 0.00012244738191609814,
"loss": 0.5891,
"step": 681
},
{
"epoch": 0.43219264892268694,
"grad_norm": 0.06307169795036316,
"learning_rate": 0.00012225209339563145,
"loss": 0.556,
"step": 682
},
{
"epoch": 0.4328263624841572,
"grad_norm": 0.062134500592947006,
"learning_rate": 0.00012205671554906794,
"loss": 0.5607,
"step": 683
},
{
"epoch": 0.43346007604562736,
"grad_norm": 0.04890581965446472,
"learning_rate": 0.00012186124916070867,
"loss": 0.4789,
"step": 684
},
{
"epoch": 0.4340937896070976,
"grad_norm": 0.04669584706425667,
"learning_rate": 0.00012166569501521017,
"loss": 0.4784,
"step": 685
},
{
"epoch": 0.4347275031685678,
"grad_norm": 0.05782284587621689,
"learning_rate": 0.00012147005389758117,
"loss": 0.5761,
"step": 686
},
{
"epoch": 0.435361216730038,
"grad_norm": 0.07015878707170486,
"learning_rate": 0.00012127432659317956,
"loss": 0.5462,
"step": 687
},
{
"epoch": 0.43599493029150826,
"grad_norm": 0.05989618971943855,
"learning_rate": 0.00012107851388770928,
"loss": 0.4671,
"step": 688
},
{
"epoch": 0.43662864385297845,
"grad_norm": 0.05732743442058563,
"learning_rate": 0.000120882616567217,
"loss": 0.4952,
"step": 689
},
{
"epoch": 0.4372623574144487,
"grad_norm": 0.06397297978401184,
"learning_rate": 0.00012068663541808909,
"loss": 0.5001,
"step": 690
},
{
"epoch": 0.43789607097591887,
"grad_norm": 0.05474892258644104,
"learning_rate": 0.00012049057122704846,
"loss": 0.4371,
"step": 691
},
{
"epoch": 0.4385297845373891,
"grad_norm": 0.0542195625603199,
"learning_rate": 0.00012029442478115129,
"loss": 0.4027,
"step": 692
},
{
"epoch": 0.4391634980988593,
"grad_norm": 0.0857028216123581,
"learning_rate": 0.00012009819686778408,
"loss": 0.5752,
"step": 693
},
{
"epoch": 0.43979721166032953,
"grad_norm": 0.07950462400913239,
"learning_rate": 0.00011990188827466025,
"loss": 0.4821,
"step": 694
},
{
"epoch": 0.4404309252217998,
"grad_norm": 0.13862280547618866,
"learning_rate": 0.00011970549978981715,
"loss": 0.5725,
"step": 695
},
{
"epoch": 0.44106463878326996,
"grad_norm": 0.06896214932203293,
"learning_rate": 0.00011950903220161285,
"loss": 0.5461,
"step": 696
},
{
"epoch": 0.4416983523447402,
"grad_norm": 0.05688636004924774,
"learning_rate": 0.00011931248629872287,
"loss": 0.6257,
"step": 697
},
{
"epoch": 0.4423320659062104,
"grad_norm": 0.07330068945884705,
"learning_rate": 0.00011911586287013725,
"loss": 0.4781,
"step": 698
},
{
"epoch": 0.4429657794676806,
"grad_norm": 0.057357531040906906,
"learning_rate": 0.0001189191627051571,
"loss": 0.3767,
"step": 699
},
{
"epoch": 0.4435994930291508,
"grad_norm": 0.05856744199991226,
"learning_rate": 0.00011872238659339168,
"loss": 0.5233,
"step": 700
},
{
"epoch": 0.44423320659062104,
"grad_norm": 0.04932614043354988,
"learning_rate": 0.00011852553532475503,
"loss": 0.5493,
"step": 701
},
{
"epoch": 0.4448669201520912,
"grad_norm": 0.10165086388587952,
"learning_rate": 0.00011832860968946297,
"loss": 0.626,
"step": 702
},
{
"epoch": 0.44550063371356147,
"grad_norm": 0.059510741382837296,
"learning_rate": 0.00011813161047802985,
"loss": 0.4979,
"step": 703
},
{
"epoch": 0.4461343472750317,
"grad_norm": 0.059596769511699677,
"learning_rate": 0.00011793453848126526,
"loss": 0.5903,
"step": 704
},
{
"epoch": 0.4467680608365019,
"grad_norm": 0.043714553117752075,
"learning_rate": 0.00011773739449027108,
"loss": 0.4347,
"step": 705
},
{
"epoch": 0.44740177439797213,
"grad_norm": 0.06549560278654099,
"learning_rate": 0.00011754017929643817,
"loss": 0.3608,
"step": 706
},
{
"epoch": 0.4480354879594423,
"grad_norm": 0.07389537245035172,
"learning_rate": 0.00011734289369144323,
"loss": 0.6457,
"step": 707
},
{
"epoch": 0.44866920152091255,
"grad_norm": 0.0611582025885582,
"learning_rate": 0.00011714553846724558,
"loss": 0.4182,
"step": 708
},
{
"epoch": 0.44930291508238274,
"grad_norm": 0.06682246923446655,
"learning_rate": 0.00011694811441608402,
"loss": 0.4601,
"step": 709
},
{
"epoch": 0.449936628643853,
"grad_norm": 0.05429236590862274,
"learning_rate": 0.00011675062233047364,
"loss": 0.5933,
"step": 710
},
{
"epoch": 0.4505703422053232,
"grad_norm": 0.07824891060590744,
"learning_rate": 0.00011655306300320268,
"loss": 0.6553,
"step": 711
},
{
"epoch": 0.4512040557667934,
"grad_norm": 0.0523335300385952,
"learning_rate": 0.0001163554372273292,
"loss": 0.382,
"step": 712
},
{
"epoch": 0.45183776932826364,
"grad_norm": 0.0779106542468071,
"learning_rate": 0.00011615774579617817,
"loss": 0.5208,
"step": 713
},
{
"epoch": 0.4524714828897338,
"grad_norm": 0.05331442877650261,
"learning_rate": 0.00011595998950333793,
"loss": 0.4668,
"step": 714
},
{
"epoch": 0.45310519645120406,
"grad_norm": 0.077408067882061,
"learning_rate": 0.00011576216914265734,
"loss": 0.4491,
"step": 715
},
{
"epoch": 0.45373891001267425,
"grad_norm": 0.2051779180765152,
"learning_rate": 0.00011556428550824237,
"loss": 0.5396,
"step": 716
},
{
"epoch": 0.4543726235741445,
"grad_norm": 0.052188027650117874,
"learning_rate": 0.000115366339394453,
"loss": 0.5815,
"step": 717
},
{
"epoch": 0.4550063371356147,
"grad_norm": 0.060880374163389206,
"learning_rate": 0.0001151683315959001,
"loss": 0.5019,
"step": 718
},
{
"epoch": 0.4556400506970849,
"grad_norm": 0.10370609164237976,
"learning_rate": 0.000114970262907442,
"loss": 0.5166,
"step": 719
},
{
"epoch": 0.45627376425855515,
"grad_norm": 0.059755194932222366,
"learning_rate": 0.00011477213412418157,
"loss": 0.5363,
"step": 720
},
{
"epoch": 0.45690747782002533,
"grad_norm": 0.05834079161286354,
"learning_rate": 0.00011457394604146294,
"loss": 0.487,
"step": 721
},
{
"epoch": 0.45754119138149557,
"grad_norm": 0.07119245082139969,
"learning_rate": 0.00011437569945486819,
"loss": 0.5711,
"step": 722
},
{
"epoch": 0.45817490494296575,
"grad_norm": 0.06131361797451973,
"learning_rate": 0.00011417739516021428,
"loss": 0.5226,
"step": 723
},
{
"epoch": 0.458808618504436,
"grad_norm": 0.04943651333451271,
"learning_rate": 0.00011397903395354996,
"loss": 0.4307,
"step": 724
},
{
"epoch": 0.45944233206590623,
"grad_norm": 0.046283356845378876,
"learning_rate": 0.00011378061663115222,
"loss": 0.3834,
"step": 725
},
{
"epoch": 0.4600760456273764,
"grad_norm": 0.0585121251642704,
"learning_rate": 0.00011358214398952347,
"loss": 0.6028,
"step": 726
},
{
"epoch": 0.46070975918884666,
"grad_norm": 0.08686511963605881,
"learning_rate": 0.00011338361682538811,
"loss": 0.4879,
"step": 727
},
{
"epoch": 0.46134347275031684,
"grad_norm": 0.07081152498722076,
"learning_rate": 0.00011318503593568948,
"loss": 0.6132,
"step": 728
},
{
"epoch": 0.4619771863117871,
"grad_norm": 0.05887436121702194,
"learning_rate": 0.00011298640211758648,
"loss": 0.5707,
"step": 729
},
{
"epoch": 0.46261089987325726,
"grad_norm": 0.06929212808609009,
"learning_rate": 0.00011278771616845061,
"loss": 0.449,
"step": 730
},
{
"epoch": 0.4632446134347275,
"grad_norm": 0.04306876286864281,
"learning_rate": 0.00011258897888586255,
"loss": 0.486,
"step": 731
},
{
"epoch": 0.46387832699619774,
"grad_norm": 0.05465447157621384,
"learning_rate": 0.00011239019106760908,
"loss": 0.4704,
"step": 732
},
{
"epoch": 0.4645120405576679,
"grad_norm": 0.058161042630672455,
"learning_rate": 0.00011219135351167979,
"loss": 0.5467,
"step": 733
},
{
"epoch": 0.46514575411913817,
"grad_norm": 0.06773436069488525,
"learning_rate": 0.00011199246701626405,
"loss": 0.5329,
"step": 734
},
{
"epoch": 0.46577946768060835,
"grad_norm": 0.04506424069404602,
"learning_rate": 0.00011179353237974756,
"loss": 0.4359,
"step": 735
},
{
"epoch": 0.4664131812420786,
"grad_norm": 0.05979963019490242,
"learning_rate": 0.00011159455040070936,
"loss": 0.5445,
"step": 736
},
{
"epoch": 0.4670468948035488,
"grad_norm": 0.0482424721121788,
"learning_rate": 0.00011139552187791848,
"loss": 0.4957,
"step": 737
},
{
"epoch": 0.467680608365019,
"grad_norm": 0.05097084492444992,
"learning_rate": 0.00011119644761033078,
"loss": 0.4642,
"step": 738
},
{
"epoch": 0.46831432192648925,
"grad_norm": 0.05539529025554657,
"learning_rate": 0.00011099732839708586,
"loss": 0.4227,
"step": 739
},
{
"epoch": 0.46894803548795944,
"grad_norm": 0.06280332803726196,
"learning_rate": 0.0001107981650375036,
"loss": 0.5842,
"step": 740
},
{
"epoch": 0.4695817490494297,
"grad_norm": 0.05138114467263222,
"learning_rate": 0.00011059895833108119,
"loss": 0.5681,
"step": 741
},
{
"epoch": 0.47021546261089986,
"grad_norm": 0.058239031583070755,
"learning_rate": 0.0001103997090774898,
"loss": 0.5582,
"step": 742
},
{
"epoch": 0.4708491761723701,
"grad_norm": 0.06877847760915756,
"learning_rate": 0.00011020041807657138,
"loss": 0.5912,
"step": 743
},
{
"epoch": 0.4714828897338403,
"grad_norm": 0.05639166757464409,
"learning_rate": 0.00011000108612833551,
"loss": 0.5888,
"step": 744
},
{
"epoch": 0.4721166032953105,
"grad_norm": 0.05756942555308342,
"learning_rate": 0.0001098017140329561,
"loss": 0.5451,
"step": 745
},
{
"epoch": 0.47275031685678076,
"grad_norm": 0.057658858597278595,
"learning_rate": 0.00010960230259076818,
"loss": 0.4939,
"step": 746
},
{
"epoch": 0.47338403041825095,
"grad_norm": 0.05436946451663971,
"learning_rate": 0.00010940285260226488,
"loss": 0.5084,
"step": 747
},
{
"epoch": 0.4740177439797212,
"grad_norm": 0.06349501758813858,
"learning_rate": 0.00010920336486809393,
"loss": 0.6588,
"step": 748
},
{
"epoch": 0.47465145754119137,
"grad_norm": 0.06300094723701477,
"learning_rate": 0.00010900384018905463,
"loss": 0.5655,
"step": 749
},
{
"epoch": 0.4752851711026616,
"grad_norm": 0.06454197317361832,
"learning_rate": 0.00010880427936609455,
"loss": 0.5455,
"step": 750
},
{
"epoch": 0.4759188846641318,
"grad_norm": 0.06663431227207184,
"learning_rate": 0.0001086046832003064,
"loss": 0.5263,
"step": 751
},
{
"epoch": 0.47655259822560203,
"grad_norm": 0.06523749232292175,
"learning_rate": 0.00010840505249292476,
"loss": 0.4109,
"step": 752
},
{
"epoch": 0.47718631178707227,
"grad_norm": 0.066495381295681,
"learning_rate": 0.00010820538804532286,
"loss": 0.5395,
"step": 753
},
{
"epoch": 0.47782002534854245,
"grad_norm": 0.07330245524644852,
"learning_rate": 0.00010800569065900933,
"loss": 0.5392,
"step": 754
},
{
"epoch": 0.4784537389100127,
"grad_norm": 0.05793917551636696,
"learning_rate": 0.00010780596113562514,
"loss": 0.5323,
"step": 755
},
{
"epoch": 0.4790874524714829,
"grad_norm": 0.05146726965904236,
"learning_rate": 0.0001076062002769401,
"loss": 0.4334,
"step": 756
},
{
"epoch": 0.4797211660329531,
"grad_norm": 0.06809573620557785,
"learning_rate": 0.00010740640888484996,
"loss": 0.5635,
"step": 757
},
{
"epoch": 0.4803548795944233,
"grad_norm": 0.05846872553229332,
"learning_rate": 0.00010720658776137298,
"loss": 0.5631,
"step": 758
},
{
"epoch": 0.48098859315589354,
"grad_norm": 0.06662282347679138,
"learning_rate": 0.00010700673770864673,
"loss": 0.3119,
"step": 759
},
{
"epoch": 0.4816223067173637,
"grad_norm": 0.05133543908596039,
"learning_rate": 0.00010680685952892502,
"loss": 0.5222,
"step": 760
},
{
"epoch": 0.48225602027883396,
"grad_norm": 0.06625013798475266,
"learning_rate": 0.00010660695402457442,
"loss": 0.4834,
"step": 761
},
{
"epoch": 0.4828897338403042,
"grad_norm": 0.07142903655767441,
"learning_rate": 0.0001064070219980713,
"loss": 0.551,
"step": 762
},
{
"epoch": 0.4835234474017744,
"grad_norm": 0.06273732334375381,
"learning_rate": 0.00010620706425199849,
"loss": 0.6681,
"step": 763
},
{
"epoch": 0.4841571609632446,
"grad_norm": 0.05467168986797333,
"learning_rate": 0.000106007081589042,
"loss": 0.5253,
"step": 764
},
{
"epoch": 0.4847908745247148,
"grad_norm": 0.05966407433152199,
"learning_rate": 0.00010580707481198796,
"loss": 0.516,
"step": 765
},
{
"epoch": 0.48542458808618505,
"grad_norm": 0.0470612607896328,
"learning_rate": 0.00010560704472371919,
"loss": 0.4632,
"step": 766
},
{
"epoch": 0.48605830164765523,
"grad_norm": 0.0659315288066864,
"learning_rate": 0.00010540699212721219,
"loss": 0.5164,
"step": 767
},
{
"epoch": 0.4866920152091255,
"grad_norm": 0.061314892023801804,
"learning_rate": 0.0001052069178255337,
"loss": 0.5968,
"step": 768
},
{
"epoch": 0.4873257287705957,
"grad_norm": 0.05175092816352844,
"learning_rate": 0.00010500682262183772,
"loss": 0.4665,
"step": 769
},
{
"epoch": 0.4879594423320659,
"grad_norm": 0.04965231940150261,
"learning_rate": 0.00010480670731936208,
"loss": 0.5068,
"step": 770
},
{
"epoch": 0.48859315589353614,
"grad_norm": 0.06218743324279785,
"learning_rate": 0.0001046065727214253,
"loss": 0.4043,
"step": 771
},
{
"epoch": 0.4892268694550063,
"grad_norm": 0.05969774350523949,
"learning_rate": 0.00010440641963142336,
"loss": 0.4471,
"step": 772
},
{
"epoch": 0.48986058301647656,
"grad_norm": 0.04538511112332344,
"learning_rate": 0.00010420624885282653,
"loss": 0.4891,
"step": 773
},
{
"epoch": 0.49049429657794674,
"grad_norm": 0.06056825444102287,
"learning_rate": 0.00010400606118917593,
"loss": 0.452,
"step": 774
},
{
"epoch": 0.491128010139417,
"grad_norm": 0.04322752729058266,
"learning_rate": 0.00010380585744408065,
"loss": 0.4044,
"step": 775
},
{
"epoch": 0.4917617237008872,
"grad_norm": 0.05485018342733383,
"learning_rate": 0.0001036056384212142,
"loss": 0.4913,
"step": 776
},
{
"epoch": 0.4923954372623574,
"grad_norm": 0.045921441167593,
"learning_rate": 0.0001034054049243115,
"loss": 0.4713,
"step": 777
},
{
"epoch": 0.49302915082382764,
"grad_norm": 0.05987657979130745,
"learning_rate": 0.00010320515775716555,
"loss": 0.4339,
"step": 778
},
{
"epoch": 0.49366286438529783,
"grad_norm": 0.06263814866542816,
"learning_rate": 0.00010300489772362416,
"loss": 0.5853,
"step": 779
},
{
"epoch": 0.49429657794676807,
"grad_norm": 0.07110540568828583,
"learning_rate": 0.0001028046256275869,
"loss": 0.5899,
"step": 780
},
{
"epoch": 0.49493029150823825,
"grad_norm": 0.05008992552757263,
"learning_rate": 0.00010260434227300171,
"loss": 0.5061,
"step": 781
},
{
"epoch": 0.4955640050697085,
"grad_norm": 0.05329698696732521,
"learning_rate": 0.00010240404846386168,
"loss": 0.5073,
"step": 782
},
{
"epoch": 0.49619771863117873,
"grad_norm": 0.060529615730047226,
"learning_rate": 0.000102203745004202,
"loss": 0.5194,
"step": 783
},
{
"epoch": 0.4968314321926489,
"grad_norm": 0.05783366411924362,
"learning_rate": 0.00010200343269809642,
"loss": 0.5393,
"step": 784
},
{
"epoch": 0.49746514575411915,
"grad_norm": 0.05209111049771309,
"learning_rate": 0.00010180311234965433,
"loss": 0.4858,
"step": 785
},
{
"epoch": 0.49809885931558934,
"grad_norm": 0.05122411996126175,
"learning_rate": 0.0001016027847630174,
"loss": 0.4476,
"step": 786
},
{
"epoch": 0.4987325728770596,
"grad_norm": 0.06304119527339935,
"learning_rate": 0.00010140245074235624,
"loss": 0.5741,
"step": 787
},
{
"epoch": 0.49936628643852976,
"grad_norm": 0.09011054039001465,
"learning_rate": 0.00010120211109186747,
"loss": 0.3418,
"step": 788
},
{
"epoch": 0.5,
"grad_norm": 0.06214231252670288,
"learning_rate": 0.00010100176661577015,
"loss": 0.5186,
"step": 789
},
{
"epoch": 0.5006337135614702,
"grad_norm": 0.19616113603115082,
"learning_rate": 0.00010080141811830277,
"loss": 0.5121,
"step": 790
},
{
"epoch": 0.5012674271229405,
"grad_norm": 0.05623235926032066,
"learning_rate": 0.00010060106640372,
"loss": 0.4457,
"step": 791
},
{
"epoch": 0.5019011406844106,
"grad_norm": 0.06097716465592384,
"learning_rate": 0.00010040071227628938,
"loss": 0.4578,
"step": 792
},
{
"epoch": 0.5025348542458808,
"grad_norm": 0.042372945696115494,
"learning_rate": 0.00010020035654028816,
"loss": 0.3896,
"step": 793
},
{
"epoch": 0.5031685678073511,
"grad_norm": 0.05927233397960663,
"learning_rate": 0.0001,
"loss": 0.6026,
"step": 794
},
{
"epoch": 0.5038022813688213,
"grad_norm": 0.06227416917681694,
"learning_rate": 9.979964345971188e-05,
"loss": 0.4366,
"step": 795
},
{
"epoch": 0.5044359949302915,
"grad_norm": 0.055778343230485916,
"learning_rate": 9.959928772371061e-05,
"loss": 0.4425,
"step": 796
},
{
"epoch": 0.5050697084917617,
"grad_norm": 0.04457565397024155,
"learning_rate": 9.939893359628001e-05,
"loss": 0.5326,
"step": 797
},
{
"epoch": 0.5057034220532319,
"grad_norm": 0.05732344835996628,
"learning_rate": 9.919858188169724e-05,
"loss": 0.5296,
"step": 798
},
{
"epoch": 0.5063371356147022,
"grad_norm": 0.04832519590854645,
"learning_rate": 9.899823338422986e-05,
"loss": 0.3992,
"step": 799
},
{
"epoch": 0.5069708491761724,
"grad_norm": 0.06504333764314651,
"learning_rate": 9.879788890813255e-05,
"loss": 0.3772,
"step": 800
},
{
"epoch": 0.5076045627376425,
"grad_norm": 0.05304650217294693,
"learning_rate": 9.859754925764378e-05,
"loss": 0.5455,
"step": 801
},
{
"epoch": 0.5082382762991128,
"grad_norm": 0.04738354682922363,
"learning_rate": 9.839721523698264e-05,
"loss": 0.4221,
"step": 802
},
{
"epoch": 0.508871989860583,
"grad_norm": 0.061429157853126526,
"learning_rate": 9.819688765034568e-05,
"loss": 0.5197,
"step": 803
},
{
"epoch": 0.5095057034220533,
"grad_norm": 0.04687187448143959,
"learning_rate": 9.79965673019036e-05,
"loss": 0.417,
"step": 804
},
{
"epoch": 0.5101394169835235,
"grad_norm": 0.05944183096289635,
"learning_rate": 9.779625499579805e-05,
"loss": 0.6043,
"step": 805
},
{
"epoch": 0.5107731305449936,
"grad_norm": 0.05007549747824669,
"learning_rate": 9.75959515361383e-05,
"loss": 0.5161,
"step": 806
},
{
"epoch": 0.5114068441064639,
"grad_norm": 0.0616040863096714,
"learning_rate": 9.739565772699831e-05,
"loss": 0.6219,
"step": 807
},
{
"epoch": 0.5120405576679341,
"grad_norm": 0.23154355585575104,
"learning_rate": 9.719537437241312e-05,
"loss": 0.4653,
"step": 808
},
{
"epoch": 0.5126742712294043,
"grad_norm": 0.08757317066192627,
"learning_rate": 9.699510227637586e-05,
"loss": 0.7004,
"step": 809
},
{
"epoch": 0.5133079847908745,
"grad_norm": 0.053165238350629807,
"learning_rate": 9.679484224283449e-05,
"loss": 0.5367,
"step": 810
},
{
"epoch": 0.5139416983523447,
"grad_norm": 0.05361173674464226,
"learning_rate": 9.659459507568853e-05,
"loss": 0.5044,
"step": 811
},
{
"epoch": 0.514575411913815,
"grad_norm": 0.0656973198056221,
"learning_rate": 9.63943615787858e-05,
"loss": 0.5785,
"step": 812
},
{
"epoch": 0.5152091254752852,
"grad_norm": 0.056508004665374756,
"learning_rate": 9.619414255591937e-05,
"loss": 0.505,
"step": 813
},
{
"epoch": 0.5158428390367554,
"grad_norm": 0.061718232929706573,
"learning_rate": 9.599393881082408e-05,
"loss": 0.5194,
"step": 814
},
{
"epoch": 0.5164765525982256,
"grad_norm": 0.055572785437107086,
"learning_rate": 9.579375114717351e-05,
"loss": 0.4633,
"step": 815
},
{
"epoch": 0.5171102661596958,
"grad_norm": 0.0603361539542675,
"learning_rate": 9.559358036857663e-05,
"loss": 0.4628,
"step": 816
},
{
"epoch": 0.517743979721166,
"grad_norm": 0.08223170042037964,
"learning_rate": 9.53934272785747e-05,
"loss": 0.4932,
"step": 817
},
{
"epoch": 0.5183776932826363,
"grad_norm": 0.05056726187467575,
"learning_rate": 9.519329268063795e-05,
"loss": 0.5267,
"step": 818
},
{
"epoch": 0.5190114068441065,
"grad_norm": 0.0726744681596756,
"learning_rate": 9.499317737816229e-05,
"loss": 0.5233,
"step": 819
},
{
"epoch": 0.5196451204055766,
"grad_norm": 0.06118292361497879,
"learning_rate": 9.479308217446633e-05,
"loss": 0.5627,
"step": 820
},
{
"epoch": 0.5202788339670469,
"grad_norm": 0.05231308937072754,
"learning_rate": 9.459300787278785e-05,
"loss": 0.5238,
"step": 821
},
{
"epoch": 0.5209125475285171,
"grad_norm": 0.0555204376578331,
"learning_rate": 9.439295527628081e-05,
"loss": 0.5648,
"step": 822
},
{
"epoch": 0.5215462610899874,
"grad_norm": 0.056751273572444916,
"learning_rate": 9.419292518801205e-05,
"loss": 0.6158,
"step": 823
},
{
"epoch": 0.5221799746514575,
"grad_norm": 0.055247753858566284,
"learning_rate": 9.399291841095802e-05,
"loss": 0.5938,
"step": 824
},
{
"epoch": 0.5228136882129277,
"grad_norm": 0.05264151841402054,
"learning_rate": 9.379293574800154e-05,
"loss": 0.4908,
"step": 825
},
{
"epoch": 0.523447401774398,
"grad_norm": 0.06633622944355011,
"learning_rate": 9.359297800192872e-05,
"loss": 0.4516,
"step": 826
},
{
"epoch": 0.5240811153358682,
"grad_norm": 0.06326263397932053,
"learning_rate": 9.33930459754256e-05,
"loss": 0.4583,
"step": 827
},
{
"epoch": 0.5247148288973384,
"grad_norm": 0.061470355838537216,
"learning_rate": 9.319314047107504e-05,
"loss": 0.5209,
"step": 828
},
{
"epoch": 0.5253485424588086,
"grad_norm": 0.048166628926992416,
"learning_rate": 9.299326229135326e-05,
"loss": 0.5184,
"step": 829
},
{
"epoch": 0.5259822560202788,
"grad_norm": 0.09853006154298782,
"learning_rate": 9.279341223862705e-05,
"loss": 0.5219,
"step": 830
},
{
"epoch": 0.526615969581749,
"grad_norm": 0.5687222480773926,
"learning_rate": 9.259359111515006e-05,
"loss": 0.4086,
"step": 831
},
{
"epoch": 0.5272496831432193,
"grad_norm": 0.05580870062112808,
"learning_rate": 9.239379972305992e-05,
"loss": 0.492,
"step": 832
},
{
"epoch": 0.5278833967046895,
"grad_norm": 0.05025511607527733,
"learning_rate": 9.219403886437489e-05,
"loss": 0.5146,
"step": 833
},
{
"epoch": 0.5285171102661597,
"grad_norm": 0.05787106603384018,
"learning_rate": 9.199430934099068e-05,
"loss": 0.5356,
"step": 834
},
{
"epoch": 0.5291508238276299,
"grad_norm": 0.06410747766494751,
"learning_rate": 9.179461195467714e-05,
"loss": 0.6312,
"step": 835
},
{
"epoch": 0.5297845373891001,
"grad_norm": 0.053113870322704315,
"learning_rate": 9.159494750707526e-05,
"loss": 0.4838,
"step": 836
},
{
"epoch": 0.5304182509505704,
"grad_norm": 0.06018316373229027,
"learning_rate": 9.139531679969362e-05,
"loss": 0.4631,
"step": 837
},
{
"epoch": 0.5310519645120405,
"grad_norm": 0.05416072905063629,
"learning_rate": 9.119572063390549e-05,
"loss": 0.4439,
"step": 838
},
{
"epoch": 0.5316856780735107,
"grad_norm": 0.08766517043113708,
"learning_rate": 9.09961598109454e-05,
"loss": 0.5445,
"step": 839
},
{
"epoch": 0.532319391634981,
"grad_norm": 0.0619327537715435,
"learning_rate": 9.079663513190611e-05,
"loss": 0.5428,
"step": 840
},
{
"epoch": 0.5329531051964512,
"grad_norm": 0.059881288558244705,
"learning_rate": 9.059714739773516e-05,
"loss": 0.513,
"step": 841
},
{
"epoch": 0.5335868187579215,
"grad_norm": 0.06464383006095886,
"learning_rate": 9.039769740923183e-05,
"loss": 0.4746,
"step": 842
},
{
"epoch": 0.5342205323193916,
"grad_norm": 0.054081957787275314,
"learning_rate": 9.019828596704394e-05,
"loss": 0.391,
"step": 843
},
{
"epoch": 0.5348542458808618,
"grad_norm": 0.07097287476062775,
"learning_rate": 8.999891387166453e-05,
"loss": 0.5668,
"step": 844
},
{
"epoch": 0.5354879594423321,
"grad_norm": 0.050909094512462616,
"learning_rate": 8.979958192342862e-05,
"loss": 0.5574,
"step": 845
},
{
"epoch": 0.5361216730038023,
"grad_norm": 0.0605645477771759,
"learning_rate": 8.960029092251023e-05,
"loss": 0.5608,
"step": 846
},
{
"epoch": 0.5367553865652726,
"grad_norm": 0.05807255208492279,
"learning_rate": 8.940104166891885e-05,
"loss": 0.5057,
"step": 847
},
{
"epoch": 0.5373891001267427,
"grad_norm": 0.05229676514863968,
"learning_rate": 8.920183496249642e-05,
"loss": 0.4968,
"step": 848
},
{
"epoch": 0.5380228136882129,
"grad_norm": 0.05831581726670265,
"learning_rate": 8.900267160291416e-05,
"loss": 0.421,
"step": 849
},
{
"epoch": 0.5386565272496832,
"grad_norm": 0.04102315753698349,
"learning_rate": 8.880355238966923e-05,
"loss": 0.4176,
"step": 850
},
{
"epoch": 0.5392902408111534,
"grad_norm": 0.04635517671704292,
"learning_rate": 8.860447812208157e-05,
"loss": 0.4623,
"step": 851
},
{
"epoch": 0.5399239543726235,
"grad_norm": 0.08849713206291199,
"learning_rate": 8.840544959929065e-05,
"loss": 0.6421,
"step": 852
},
{
"epoch": 0.5405576679340938,
"grad_norm": 0.07401357591152191,
"learning_rate": 8.820646762025246e-05,
"loss": 0.4958,
"step": 853
},
{
"epoch": 0.541191381495564,
"grad_norm": 0.07079368084669113,
"learning_rate": 8.800753298373596e-05,
"loss": 0.4828,
"step": 854
},
{
"epoch": 0.5418250950570342,
"grad_norm": 0.06453298032283783,
"learning_rate": 8.780864648832022e-05,
"loss": 0.6269,
"step": 855
},
{
"epoch": 0.5424588086185045,
"grad_norm": 0.05445917323231697,
"learning_rate": 8.760980893239094e-05,
"loss": 0.5873,
"step": 856
},
{
"epoch": 0.5430925221799746,
"grad_norm": 0.047000445425510406,
"learning_rate": 8.741102111413748e-05,
"loss": 0.4938,
"step": 857
},
{
"epoch": 0.5437262357414449,
"grad_norm": 0.06307143718004227,
"learning_rate": 8.721228383154939e-05,
"loss": 0.602,
"step": 858
},
{
"epoch": 0.5443599493029151,
"grad_norm": 0.046326130628585815,
"learning_rate": 8.701359788241354e-05,
"loss": 0.453,
"step": 859
},
{
"epoch": 0.5449936628643853,
"grad_norm": 0.05878138169646263,
"learning_rate": 8.681496406431056e-05,
"loss": 0.5619,
"step": 860
},
{
"epoch": 0.5456273764258555,
"grad_norm": 0.06828006356954575,
"learning_rate": 8.66163831746119e-05,
"loss": 0.4723,
"step": 861
},
{
"epoch": 0.5462610899873257,
"grad_norm": 0.062354519963264465,
"learning_rate": 8.641785601047654e-05,
"loss": 0.5345,
"step": 862
},
{
"epoch": 0.5468948035487959,
"grad_norm": 0.052326980978250504,
"learning_rate": 8.621938336884781e-05,
"loss": 0.5096,
"step": 863
},
{
"epoch": 0.5475285171102662,
"grad_norm": 0.09620847553014755,
"learning_rate": 8.602096604645009e-05,
"loss": 0.6523,
"step": 864
},
{
"epoch": 0.5481622306717364,
"grad_norm": 0.07187427580356598,
"learning_rate": 8.58226048397857e-05,
"loss": 0.5051,
"step": 865
},
{
"epoch": 0.5487959442332065,
"grad_norm": 0.058141518384218216,
"learning_rate": 8.562430054513184e-05,
"loss": 0.501,
"step": 866
},
{
"epoch": 0.5494296577946768,
"grad_norm": 0.037818700075149536,
"learning_rate": 8.54260539585371e-05,
"loss": 0.2518,
"step": 867
},
{
"epoch": 0.550063371356147,
"grad_norm": 0.04658188298344612,
"learning_rate": 8.522786587581844e-05,
"loss": 0.4531,
"step": 868
},
{
"epoch": 0.5506970849176173,
"grad_norm": 0.04527122154831886,
"learning_rate": 8.502973709255804e-05,
"loss": 0.4592,
"step": 869
},
{
"epoch": 0.5513307984790875,
"grad_norm": 0.05705267935991287,
"learning_rate": 8.483166840409995e-05,
"loss": 0.4575,
"step": 870
},
{
"epoch": 0.5519645120405576,
"grad_norm": 0.08155850321054459,
"learning_rate": 8.463366060554698e-05,
"loss": 0.5167,
"step": 871
},
{
"epoch": 0.5525982256020279,
"grad_norm": 0.07388201355934143,
"learning_rate": 8.443571449175766e-05,
"loss": 0.6817,
"step": 872
},
{
"epoch": 0.5532319391634981,
"grad_norm": 0.06419550627470016,
"learning_rate": 8.423783085734268e-05,
"loss": 0.5468,
"step": 873
},
{
"epoch": 0.5538656527249683,
"grad_norm": 0.05985475331544876,
"learning_rate": 8.404001049666211e-05,
"loss": 0.5247,
"step": 874
},
{
"epoch": 0.5544993662864385,
"grad_norm": 0.05610859394073486,
"learning_rate": 8.384225420382185e-05,
"loss": 0.5088,
"step": 875
},
{
"epoch": 0.5551330798479087,
"grad_norm": 0.5789166688919067,
"learning_rate": 8.36445627726708e-05,
"loss": 0.5744,
"step": 876
},
{
"epoch": 0.555766793409379,
"grad_norm": 0.05248624086380005,
"learning_rate": 8.344693699679736e-05,
"loss": 0.4797,
"step": 877
},
{
"epoch": 0.5564005069708492,
"grad_norm": 0.06693774461746216,
"learning_rate": 8.324937766952638e-05,
"loss": 0.5354,
"step": 878
},
{
"epoch": 0.5570342205323194,
"grad_norm": 0.058544524013996124,
"learning_rate": 8.305188558391599e-05,
"loss": 0.602,
"step": 879
},
{
"epoch": 0.5576679340937896,
"grad_norm": 0.05111921206116676,
"learning_rate": 8.285446153275445e-05,
"loss": 0.4541,
"step": 880
},
{
"epoch": 0.5583016476552598,
"grad_norm": 0.0569741316139698,
"learning_rate": 8.265710630855677e-05,
"loss": 0.5306,
"step": 881
},
{
"epoch": 0.55893536121673,
"grad_norm": 0.13403062522411346,
"learning_rate": 8.245982070356185e-05,
"loss": 0.56,
"step": 882
},
{
"epoch": 0.5595690747782003,
"grad_norm": 0.07512082904577255,
"learning_rate": 8.226260550972895e-05,
"loss": 0.5951,
"step": 883
},
{
"epoch": 0.5602027883396705,
"grad_norm": 0.046271927654743195,
"learning_rate": 8.206546151873478e-05,
"loss": 0.436,
"step": 884
},
{
"epoch": 0.5608365019011406,
"grad_norm": 0.05913880839943886,
"learning_rate": 8.186838952197018e-05,
"loss": 0.5116,
"step": 885
},
{
"epoch": 0.5614702154626109,
"grad_norm": 0.05060280114412308,
"learning_rate": 8.167139031053705e-05,
"loss": 0.5245,
"step": 886
},
{
"epoch": 0.5621039290240811,
"grad_norm": 0.0638653039932251,
"learning_rate": 8.1474464675245e-05,
"loss": 0.5099,
"step": 887
},
{
"epoch": 0.5627376425855514,
"grad_norm": 0.04928203299641609,
"learning_rate": 8.127761340660835e-05,
"loss": 0.3581,
"step": 888
},
{
"epoch": 0.5633713561470215,
"grad_norm": 0.04772525653243065,
"learning_rate": 8.108083729484292e-05,
"loss": 0.4432,
"step": 889
},
{
"epoch": 0.5640050697084917,
"grad_norm": 0.0834617018699646,
"learning_rate": 8.08841371298628e-05,
"loss": 0.6493,
"step": 890
},
{
"epoch": 0.564638783269962,
"grad_norm": 0.06321214139461517,
"learning_rate": 8.068751370127712e-05,
"loss": 0.4376,
"step": 891
},
{
"epoch": 0.5652724968314322,
"grad_norm": 0.07898563891649246,
"learning_rate": 8.049096779838719e-05,
"loss": 0.3803,
"step": 892
},
{
"epoch": 0.5659062103929025,
"grad_norm": 0.061078350991010666,
"learning_rate": 8.029450021018287e-05,
"loss": 0.4417,
"step": 893
},
{
"epoch": 0.5665399239543726,
"grad_norm": 0.05912580341100693,
"learning_rate": 8.009811172533976e-05,
"loss": 0.4558,
"step": 894
},
{
"epoch": 0.5671736375158428,
"grad_norm": 0.06853251159191132,
"learning_rate": 7.990180313221596e-05,
"loss": 0.4647,
"step": 895
},
{
"epoch": 0.5678073510773131,
"grad_norm": 0.13536880910396576,
"learning_rate": 7.970557521884873e-05,
"loss": 0.4849,
"step": 896
},
{
"epoch": 0.5684410646387833,
"grad_norm": 0.051422230899333954,
"learning_rate": 7.950942877295155e-05,
"loss": 0.5153,
"step": 897
},
{
"epoch": 0.5690747782002535,
"grad_norm": 0.05563550814986229,
"learning_rate": 7.931336458191092e-05,
"loss": 0.4608,
"step": 898
},
{
"epoch": 0.5697084917617237,
"grad_norm": 0.05387943610548973,
"learning_rate": 7.911738343278304e-05,
"loss": 0.308,
"step": 899
},
{
"epoch": 0.5703422053231939,
"grad_norm": 0.05549965053796768,
"learning_rate": 7.892148611229075e-05,
"loss": 0.477,
"step": 900
},
{
"epoch": 0.5709759188846641,
"grad_norm": 0.06661087274551392,
"learning_rate": 7.872567340682045e-05,
"loss": 0.5179,
"step": 901
},
{
"epoch": 0.5716096324461344,
"grad_norm": 0.06925564259290695,
"learning_rate": 7.852994610241885e-05,
"loss": 0.4785,
"step": 902
},
{
"epoch": 0.5722433460076045,
"grad_norm": 0.05441868305206299,
"learning_rate": 7.833430498478988e-05,
"loss": 0.5596,
"step": 903
},
{
"epoch": 0.5728770595690748,
"grad_norm": 0.04862716421484947,
"learning_rate": 7.813875083929132e-05,
"loss": 0.4659,
"step": 904
},
{
"epoch": 0.573510773130545,
"grad_norm": 0.07547637820243835,
"learning_rate": 7.794328445093208e-05,
"loss": 0.4485,
"step": 905
},
{
"epoch": 0.5741444866920152,
"grad_norm": 0.08132816851139069,
"learning_rate": 7.774790660436858e-05,
"loss": 0.6294,
"step": 906
},
{
"epoch": 0.5747782002534855,
"grad_norm": 0.06841199100017548,
"learning_rate": 7.755261808390187e-05,
"loss": 0.4667,
"step": 907
},
{
"epoch": 0.5754119138149556,
"grad_norm": 0.05556390807032585,
"learning_rate": 7.735741967347445e-05,
"loss": 0.5166,
"step": 908
},
{
"epoch": 0.5760456273764258,
"grad_norm": 0.07941378653049469,
"learning_rate": 7.716231215666711e-05,
"loss": 0.4368,
"step": 909
},
{
"epoch": 0.5766793409378961,
"grad_norm": 0.08058507740497589,
"learning_rate": 7.696729631669564e-05,
"loss": 0.6772,
"step": 910
},
{
"epoch": 0.5773130544993663,
"grad_norm": 0.06999081373214722,
"learning_rate": 7.6772372936408e-05,
"loss": 0.6374,
"step": 911
},
{
"epoch": 0.5779467680608364,
"grad_norm": 0.05269391089677811,
"learning_rate": 7.657754279828083e-05,
"loss": 0.3222,
"step": 912
},
{
"epoch": 0.5785804816223067,
"grad_norm": 0.059798724949359894,
"learning_rate": 7.63828066844166e-05,
"loss": 0.5354,
"step": 913
},
{
"epoch": 0.5792141951837769,
"grad_norm": 0.05695294961333275,
"learning_rate": 7.618816537654018e-05,
"loss": 0.4552,
"step": 914
},
{
"epoch": 0.5798479087452472,
"grad_norm": 0.07460351288318634,
"learning_rate": 7.599361965599606e-05,
"loss": 0.581,
"step": 915
},
{
"epoch": 0.5804816223067174,
"grad_norm": 0.04292193427681923,
"learning_rate": 7.579917030374489e-05,
"loss": 0.435,
"step": 916
},
{
"epoch": 0.5811153358681875,
"grad_norm": 0.05156205967068672,
"learning_rate": 7.56048181003604e-05,
"loss": 0.5231,
"step": 917
},
{
"epoch": 0.5817490494296578,
"grad_norm": 0.05971655622124672,
"learning_rate": 7.541056382602657e-05,
"loss": 0.5196,
"step": 918
},
{
"epoch": 0.582382762991128,
"grad_norm": 0.06214692071080208,
"learning_rate": 7.521640826053404e-05,
"loss": 0.5237,
"step": 919
},
{
"epoch": 0.5830164765525983,
"grad_norm": 0.05921977758407593,
"learning_rate": 7.502235218327731e-05,
"loss": 0.5444,
"step": 920
},
{
"epoch": 0.5836501901140685,
"grad_norm": 0.05885602533817291,
"learning_rate": 7.482839637325153e-05,
"loss": 0.4045,
"step": 921
},
{
"epoch": 0.5842839036755386,
"grad_norm": 0.05014495924115181,
"learning_rate": 7.463454160904928e-05,
"loss": 0.4261,
"step": 922
},
{
"epoch": 0.5849176172370089,
"grad_norm": 0.07014278322458267,
"learning_rate": 7.444078866885753e-05,
"loss": 0.5934,
"step": 923
},
{
"epoch": 0.5855513307984791,
"grad_norm": 0.04919711500406265,
"learning_rate": 7.424713833045452e-05,
"loss": 0.4819,
"step": 924
},
{
"epoch": 0.5861850443599493,
"grad_norm": 0.05253986269235611,
"learning_rate": 7.405359137120662e-05,
"loss": 0.5067,
"step": 925
},
{
"epoch": 0.5868187579214195,
"grad_norm": 0.05310770869255066,
"learning_rate": 7.386014856806523e-05,
"loss": 0.4878,
"step": 926
},
{
"epoch": 0.5874524714828897,
"grad_norm": 0.0604504756629467,
"learning_rate": 7.366681069756352e-05,
"loss": 0.3944,
"step": 927
},
{
"epoch": 0.5880861850443599,
"grad_norm": 0.042067963629961014,
"learning_rate": 7.347357853581361e-05,
"loss": 0.412,
"step": 928
},
{
"epoch": 0.5887198986058302,
"grad_norm": 0.04595714807510376,
"learning_rate": 7.328045285850313e-05,
"loss": 0.4234,
"step": 929
},
{
"epoch": 0.5893536121673004,
"grad_norm": 0.05038761347532272,
"learning_rate": 7.308743444089232e-05,
"loss": 0.5915,
"step": 930
},
{
"epoch": 0.5899873257287706,
"grad_norm": 0.061250437051057816,
"learning_rate": 7.289452405781084e-05,
"loss": 0.6433,
"step": 931
},
{
"epoch": 0.5906210392902408,
"grad_norm": 0.07605701684951782,
"learning_rate": 7.270172248365468e-05,
"loss": 0.6252,
"step": 932
},
{
"epoch": 0.591254752851711,
"grad_norm": 0.05717351287603378,
"learning_rate": 7.250903049238297e-05,
"loss": 0.4693,
"step": 933
},
{
"epoch": 0.5918884664131813,
"grad_norm": 0.05955088511109352,
"learning_rate": 7.231644885751507e-05,
"loss": 0.5883,
"step": 934
},
{
"epoch": 0.5925221799746515,
"grad_norm": 0.06226349249482155,
"learning_rate": 7.212397835212722e-05,
"loss": 0.4226,
"step": 935
},
{
"epoch": 0.5931558935361216,
"grad_norm": 0.062126316130161285,
"learning_rate": 7.193161974884964e-05,
"loss": 0.568,
"step": 936
},
{
"epoch": 0.5937896070975919,
"grad_norm": 0.08957802504301071,
"learning_rate": 7.173937381986323e-05,
"loss": 0.5132,
"step": 937
},
{
"epoch": 0.5944233206590621,
"grad_norm": 0.06909901648759842,
"learning_rate": 7.154724133689677e-05,
"loss": 0.5055,
"step": 938
},
{
"epoch": 0.5950570342205324,
"grad_norm": 0.0510685071349144,
"learning_rate": 7.135522307122346e-05,
"loss": 0.5349,
"step": 939
},
{
"epoch": 0.5956907477820025,
"grad_norm": 0.05713349208235741,
"learning_rate": 7.116331979365805e-05,
"loss": 0.4435,
"step": 940
},
{
"epoch": 0.5963244613434727,
"grad_norm": 0.05836547538638115,
"learning_rate": 7.097153227455379e-05,
"loss": 0.4525,
"step": 941
},
{
"epoch": 0.596958174904943,
"grad_norm": 0.058628011494874954,
"learning_rate": 7.077986128379908e-05,
"loss": 0.3689,
"step": 942
},
{
"epoch": 0.5975918884664132,
"grad_norm": 0.05638744682073593,
"learning_rate": 7.058830759081464e-05,
"loss": 0.4296,
"step": 943
},
{
"epoch": 0.5982256020278834,
"grad_norm": 0.04396173730492592,
"learning_rate": 7.039687196455042e-05,
"loss": 0.4846,
"step": 944
},
{
"epoch": 0.5988593155893536,
"grad_norm": 0.051896654069423676,
"learning_rate": 7.02055551734822e-05,
"loss": 0.5216,
"step": 945
},
{
"epoch": 0.5994930291508238,
"grad_norm": 0.07102696597576141,
"learning_rate": 7.001435798560883e-05,
"loss": 0.5707,
"step": 946
},
{
"epoch": 0.600126742712294,
"grad_norm": 0.06377355009317398,
"learning_rate": 6.982328116844912e-05,
"loss": 0.4078,
"step": 947
},
{
"epoch": 0.6007604562737643,
"grad_norm": 0.05575268715620041,
"learning_rate": 6.963232548903853e-05,
"loss": 0.5136,
"step": 948
}
],
"logging_steps": 1,
"max_steps": 1578,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 158,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.3031479668589462e+19,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}