nicoboss's picture
Upload folder using huggingface_hub
25b3f76 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9011406844106464,
"eval_steps": 500,
"global_step": 1422,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0006337135614702154,
"grad_norm": 0.22353313863277435,
"learning_rate": 2e-05,
"loss": 0.795,
"step": 1
},
{
"epoch": 0.0012674271229404308,
"grad_norm": 0.270685613155365,
"learning_rate": 4e-05,
"loss": 0.9841,
"step": 2
},
{
"epoch": 0.0019011406844106464,
"grad_norm": 0.13555319607257843,
"learning_rate": 6e-05,
"loss": 0.8728,
"step": 3
},
{
"epoch": 0.0025348542458808617,
"grad_norm": 0.1665652096271515,
"learning_rate": 8e-05,
"loss": 0.8625,
"step": 4
},
{
"epoch": 0.0031685678073510772,
"grad_norm": 0.13588839769363403,
"learning_rate": 0.0001,
"loss": 0.6776,
"step": 5
},
{
"epoch": 0.0038022813688212928,
"grad_norm": 0.2811749279499054,
"learning_rate": 0.00012,
"loss": 0.8813,
"step": 6
},
{
"epoch": 0.004435994930291508,
"grad_norm": 0.327694833278656,
"learning_rate": 0.00014,
"loss": 0.9009,
"step": 7
},
{
"epoch": 0.005069708491761723,
"grad_norm": 0.24555213749408722,
"learning_rate": 0.00016,
"loss": 0.7054,
"step": 8
},
{
"epoch": 0.005703422053231939,
"grad_norm": 0.14921338856220245,
"learning_rate": 0.00018,
"loss": 0.697,
"step": 9
},
{
"epoch": 0.0063371356147021544,
"grad_norm": 0.13169103860855103,
"learning_rate": 0.0002,
"loss": 0.6007,
"step": 10
},
{
"epoch": 0.00697084917617237,
"grad_norm": 0.06807047873735428,
"learning_rate": 0.00019999979928608238,
"loss": 0.6155,
"step": 11
},
{
"epoch": 0.0076045627376425855,
"grad_norm": 0.08288167417049408,
"learning_rate": 0.00019999919714513528,
"loss": 0.5641,
"step": 12
},
{
"epoch": 0.008238276299112801,
"grad_norm": 0.12285872548818588,
"learning_rate": 0.00019999819357957582,
"loss": 0.7526,
"step": 13
},
{
"epoch": 0.008871989860583017,
"grad_norm": 0.15566691756248474,
"learning_rate": 0.00019999678859343263,
"loss": 0.4519,
"step": 14
},
{
"epoch": 0.009505703422053232,
"grad_norm": 0.1301712989807129,
"learning_rate": 0.00019999498219234568,
"loss": 0.486,
"step": 15
},
{
"epoch": 0.010139416983523447,
"grad_norm": 0.14493511617183685,
"learning_rate": 0.00019999277438356638,
"loss": 0.7146,
"step": 16
},
{
"epoch": 0.010773130544993664,
"grad_norm": 0.1372271478176117,
"learning_rate": 0.00019999016517595753,
"loss": 0.5933,
"step": 17
},
{
"epoch": 0.011406844106463879,
"grad_norm": 0.09944190829992294,
"learning_rate": 0.00019998715457999314,
"loss": 0.8399,
"step": 18
},
{
"epoch": 0.012040557667934094,
"grad_norm": 0.057923465967178345,
"learning_rate": 0.0001999837426077586,
"loss": 0.5613,
"step": 19
},
{
"epoch": 0.012674271229404309,
"grad_norm": 0.06214901804924011,
"learning_rate": 0.00019997992927295059,
"loss": 0.5374,
"step": 20
},
{
"epoch": 0.013307984790874524,
"grad_norm": 0.04898112639784813,
"learning_rate": 0.0001999757145908768,
"loss": 0.5451,
"step": 21
},
{
"epoch": 0.01394169835234474,
"grad_norm": 0.07026948034763336,
"learning_rate": 0.0001999710985784562,
"loss": 0.5635,
"step": 22
},
{
"epoch": 0.014575411913814956,
"grad_norm": 0.0672365352511406,
"learning_rate": 0.00019996608125421873,
"loss": 0.5996,
"step": 23
},
{
"epoch": 0.015209125475285171,
"grad_norm": 0.06477885693311691,
"learning_rate": 0.00019996066263830531,
"loss": 0.4707,
"step": 24
},
{
"epoch": 0.015842839036755388,
"grad_norm": 0.07720793038606644,
"learning_rate": 0.0001999548427524678,
"loss": 0.5891,
"step": 25
},
{
"epoch": 0.016476552598225603,
"grad_norm": 0.06699500977993011,
"learning_rate": 0.0001999486216200688,
"loss": 0.5316,
"step": 26
},
{
"epoch": 0.017110266159695818,
"grad_norm": 0.07539479434490204,
"learning_rate": 0.00019994199926608172,
"loss": 0.5854,
"step": 27
},
{
"epoch": 0.017743979721166033,
"grad_norm": 4.677523136138916,
"learning_rate": 0.00019993497571709048,
"loss": 0.5019,
"step": 28
},
{
"epoch": 0.018377693282636248,
"grad_norm": 0.07100815325975418,
"learning_rate": 0.00019992755100128962,
"loss": 0.4729,
"step": 29
},
{
"epoch": 0.019011406844106463,
"grad_norm": 0.06506210565567017,
"learning_rate": 0.000199919725148484,
"loss": 0.5597,
"step": 30
},
{
"epoch": 0.01964512040557668,
"grad_norm": 0.04945315420627594,
"learning_rate": 0.0001999114981900887,
"loss": 0.5044,
"step": 31
},
{
"epoch": 0.020278833967046894,
"grad_norm": 0.05103156715631485,
"learning_rate": 0.0001999028701591291,
"loss": 0.3637,
"step": 32
},
{
"epoch": 0.02091254752851711,
"grad_norm": 0.05288761481642723,
"learning_rate": 0.00019989384109024048,
"loss": 0.4345,
"step": 33
},
{
"epoch": 0.021546261089987327,
"grad_norm": 0.05457635968923569,
"learning_rate": 0.0001998844110196681,
"loss": 0.4714,
"step": 34
},
{
"epoch": 0.022179974651457542,
"grad_norm": 0.055830612778663635,
"learning_rate": 0.0001998745799852668,
"loss": 0.5285,
"step": 35
},
{
"epoch": 0.022813688212927757,
"grad_norm": 0.05858856439590454,
"learning_rate": 0.00019986434802650113,
"loss": 0.5106,
"step": 36
},
{
"epoch": 0.023447401774397972,
"grad_norm": 0.05847540497779846,
"learning_rate": 0.00019985371518444503,
"loss": 0.4394,
"step": 37
},
{
"epoch": 0.024081115335868188,
"grad_norm": 0.1140831857919693,
"learning_rate": 0.00019984268150178167,
"loss": 0.4782,
"step": 38
},
{
"epoch": 0.024714828897338403,
"grad_norm": 0.06483329832553864,
"learning_rate": 0.00019983124702280334,
"loss": 0.396,
"step": 39
},
{
"epoch": 0.025348542458808618,
"grad_norm": 0.07212468981742859,
"learning_rate": 0.00019981941179341117,
"loss": 0.5173,
"step": 40
},
{
"epoch": 0.025982256020278833,
"grad_norm": 0.1697537750005722,
"learning_rate": 0.00019980717586111512,
"loss": 0.6164,
"step": 41
},
{
"epoch": 0.026615969581749048,
"grad_norm": 0.05975339934229851,
"learning_rate": 0.00019979453927503364,
"loss": 0.4981,
"step": 42
},
{
"epoch": 0.027249683143219267,
"grad_norm": 0.0607403926551342,
"learning_rate": 0.00019978150208589348,
"loss": 0.533,
"step": 43
},
{
"epoch": 0.02788339670468948,
"grad_norm": 0.07225210964679718,
"learning_rate": 0.00019976806434602952,
"loss": 0.5055,
"step": 44
},
{
"epoch": 0.028517110266159697,
"grad_norm": 0.07008686661720276,
"learning_rate": 0.00019975422610938462,
"loss": 0.6274,
"step": 45
},
{
"epoch": 0.029150823827629912,
"grad_norm": 0.07289402186870575,
"learning_rate": 0.0001997399874315093,
"loss": 0.5247,
"step": 46
},
{
"epoch": 0.029784537389100127,
"grad_norm": 0.10037431120872498,
"learning_rate": 0.0001997253483695616,
"loss": 0.647,
"step": 47
},
{
"epoch": 0.030418250950570342,
"grad_norm": 0.06468270719051361,
"learning_rate": 0.00019971030898230672,
"loss": 0.5719,
"step": 48
},
{
"epoch": 0.031051964512040557,
"grad_norm": 0.0472278967499733,
"learning_rate": 0.00019969486933011705,
"loss": 0.5565,
"step": 49
},
{
"epoch": 0.031685678073510776,
"grad_norm": 0.0584145151078701,
"learning_rate": 0.00019967902947497156,
"loss": 0.5432,
"step": 50
},
{
"epoch": 0.03231939163498099,
"grad_norm": 0.08962458372116089,
"learning_rate": 0.00019966278948045592,
"loss": 0.6432,
"step": 51
},
{
"epoch": 0.032953105196451206,
"grad_norm": 0.08193643391132355,
"learning_rate": 0.00019964614941176195,
"loss": 0.5341,
"step": 52
},
{
"epoch": 0.03358681875792142,
"grad_norm": 0.07166769355535507,
"learning_rate": 0.00019962910933568747,
"loss": 0.5481,
"step": 53
},
{
"epoch": 0.034220532319391636,
"grad_norm": 0.10422351956367493,
"learning_rate": 0.00019961166932063614,
"loss": 0.6145,
"step": 54
},
{
"epoch": 0.03485424588086185,
"grad_norm": 0.06273826211690903,
"learning_rate": 0.00019959382943661704,
"loss": 0.4969,
"step": 55
},
{
"epoch": 0.035487959442332066,
"grad_norm": 0.06504670530557632,
"learning_rate": 0.0001995755897552444,
"loss": 0.6093,
"step": 56
},
{
"epoch": 0.03612167300380228,
"grad_norm": 0.05045778304338455,
"learning_rate": 0.00019955695034973742,
"loss": 0.4191,
"step": 57
},
{
"epoch": 0.036755386565272496,
"grad_norm": 0.06495866179466248,
"learning_rate": 0.00019953791129491983,
"loss": 0.4762,
"step": 58
},
{
"epoch": 0.037389100126742715,
"grad_norm": 0.0814126655459404,
"learning_rate": 0.0001995184726672197,
"loss": 0.5599,
"step": 59
},
{
"epoch": 0.03802281368821293,
"grad_norm": 0.052061304450035095,
"learning_rate": 0.00019949863454466908,
"loss": 0.4822,
"step": 60
},
{
"epoch": 0.038656527249683145,
"grad_norm": 0.05419475957751274,
"learning_rate": 0.00019947839700690375,
"loss": 0.5625,
"step": 61
},
{
"epoch": 0.03929024081115336,
"grad_norm": 0.06495067477226257,
"learning_rate": 0.0001994577601351628,
"loss": 0.5863,
"step": 62
},
{
"epoch": 0.039923954372623575,
"grad_norm": 0.055791907012462616,
"learning_rate": 0.00019943672401228837,
"loss": 0.4588,
"step": 63
},
{
"epoch": 0.04055766793409379,
"grad_norm": 0.03923908621072769,
"learning_rate": 0.00019941528872272532,
"loss": 0.3841,
"step": 64
},
{
"epoch": 0.041191381495564006,
"grad_norm": 0.08200399577617645,
"learning_rate": 0.00019939345435252088,
"loss": 0.6163,
"step": 65
},
{
"epoch": 0.04182509505703422,
"grad_norm": 0.05708305537700653,
"learning_rate": 0.00019937122098932428,
"loss": 0.6363,
"step": 66
},
{
"epoch": 0.042458808618504436,
"grad_norm": 0.053468603640794754,
"learning_rate": 0.0001993485887223864,
"loss": 0.4777,
"step": 67
},
{
"epoch": 0.043092522179974654,
"grad_norm": 0.08539824187755585,
"learning_rate": 0.00019932555764255952,
"loss": 0.4922,
"step": 68
},
{
"epoch": 0.043726235741444866,
"grad_norm": 0.07483454793691635,
"learning_rate": 0.00019930212784229675,
"loss": 0.6337,
"step": 69
},
{
"epoch": 0.044359949302915085,
"grad_norm": 0.06771700084209442,
"learning_rate": 0.00019927829941565186,
"loss": 0.4559,
"step": 70
},
{
"epoch": 0.044993662864385296,
"grad_norm": 0.05689261853694916,
"learning_rate": 0.0001992540724582788,
"loss": 0.5489,
"step": 71
},
{
"epoch": 0.045627376425855515,
"grad_norm": 0.05044565722346306,
"learning_rate": 0.00019922944706743127,
"loss": 0.4472,
"step": 72
},
{
"epoch": 0.046261089987325726,
"grad_norm": 0.07331253588199615,
"learning_rate": 0.00019920442334196248,
"loss": 0.4752,
"step": 73
},
{
"epoch": 0.046894803548795945,
"grad_norm": 0.057449884712696075,
"learning_rate": 0.0001991790013823246,
"loss": 0.4525,
"step": 74
},
{
"epoch": 0.04752851711026616,
"grad_norm": 0.08357278257608414,
"learning_rate": 0.00019915318129056853,
"loss": 0.5813,
"step": 75
},
{
"epoch": 0.048162230671736375,
"grad_norm": 0.051311176270246506,
"learning_rate": 0.00019912696317034322,
"loss": 0.4593,
"step": 76
},
{
"epoch": 0.048795944233206594,
"grad_norm": 0.06535078585147858,
"learning_rate": 0.00019910034712689552,
"loss": 0.5339,
"step": 77
},
{
"epoch": 0.049429657794676805,
"grad_norm": 0.13796891272068024,
"learning_rate": 0.00019907333326706967,
"loss": 0.5438,
"step": 78
},
{
"epoch": 0.050063371356147024,
"grad_norm": 0.05667581036686897,
"learning_rate": 0.0001990459216993068,
"loss": 0.6295,
"step": 79
},
{
"epoch": 0.050697084917617236,
"grad_norm": 0.05243121087551117,
"learning_rate": 0.00019901811253364456,
"loss": 0.4782,
"step": 80
},
{
"epoch": 0.051330798479087454,
"grad_norm": 0.0769771933555603,
"learning_rate": 0.0001989899058817167,
"loss": 0.5692,
"step": 81
},
{
"epoch": 0.051964512040557666,
"grad_norm": 0.07334766536951065,
"learning_rate": 0.00019896130185675261,
"loss": 0.569,
"step": 82
},
{
"epoch": 0.052598225602027884,
"grad_norm": 0.07953603565692902,
"learning_rate": 0.00019893230057357671,
"loss": 0.4059,
"step": 83
},
{
"epoch": 0.053231939163498096,
"grad_norm": 0.05282806232571602,
"learning_rate": 0.00019890290214860833,
"loss": 0.5186,
"step": 84
},
{
"epoch": 0.053865652724968315,
"grad_norm": 0.06661225110292435,
"learning_rate": 0.00019887310669986085,
"loss": 0.6404,
"step": 85
},
{
"epoch": 0.05449936628643853,
"grad_norm": 0.07150626182556152,
"learning_rate": 0.00019884291434694152,
"loss": 0.5865,
"step": 86
},
{
"epoch": 0.055133079847908745,
"grad_norm": 0.054674554616212845,
"learning_rate": 0.00019881232521105089,
"loss": 0.5429,
"step": 87
},
{
"epoch": 0.05576679340937896,
"grad_norm": 0.057950377464294434,
"learning_rate": 0.00019878133941498224,
"loss": 0.6705,
"step": 88
},
{
"epoch": 0.056400506970849175,
"grad_norm": 0.07045155763626099,
"learning_rate": 0.0001987499570831211,
"loss": 0.5393,
"step": 89
},
{
"epoch": 0.057034220532319393,
"grad_norm": 0.055960092693567276,
"learning_rate": 0.00019871817834144504,
"loss": 0.4481,
"step": 90
},
{
"epoch": 0.057667934093789605,
"grad_norm": 0.05631652846932411,
"learning_rate": 0.00019868600331752264,
"loss": 0.5963,
"step": 91
},
{
"epoch": 0.058301647655259824,
"grad_norm": 0.05120407044887543,
"learning_rate": 0.00019865343214051347,
"loss": 0.486,
"step": 92
},
{
"epoch": 0.058935361216730035,
"grad_norm": 0.05507562682032585,
"learning_rate": 0.0001986204649411673,
"loss": 0.5514,
"step": 93
},
{
"epoch": 0.059569074778200254,
"grad_norm": 0.057690516114234924,
"learning_rate": 0.0001985871018518236,
"loss": 0.4969,
"step": 94
},
{
"epoch": 0.060202788339670466,
"grad_norm": 0.05942325294017792,
"learning_rate": 0.00019855334300641114,
"loss": 0.51,
"step": 95
},
{
"epoch": 0.060836501901140684,
"grad_norm": 0.05777527391910553,
"learning_rate": 0.0001985191885404473,
"loss": 0.5401,
"step": 96
},
{
"epoch": 0.0614702154626109,
"grad_norm": 0.07077159732580185,
"learning_rate": 0.00019848463859103763,
"loss": 0.5568,
"step": 97
},
{
"epoch": 0.062103929024081114,
"grad_norm": 0.050649482756853104,
"learning_rate": 0.00019844969329687527,
"loss": 0.5418,
"step": 98
},
{
"epoch": 0.06273764258555133,
"grad_norm": 0.059522844851017,
"learning_rate": 0.00019841435279824028,
"loss": 0.4679,
"step": 99
},
{
"epoch": 0.06337135614702155,
"grad_norm": 0.061260003596544266,
"learning_rate": 0.0001983786172369993,
"loss": 0.557,
"step": 100
},
{
"epoch": 0.06400506970849176,
"grad_norm": 0.0513591468334198,
"learning_rate": 0.00019834248675660486,
"loss": 0.5849,
"step": 101
},
{
"epoch": 0.06463878326996197,
"grad_norm": 0.06722971051931381,
"learning_rate": 0.0001983059615020947,
"loss": 0.4003,
"step": 102
},
{
"epoch": 0.06527249683143219,
"grad_norm": 0.0629379004240036,
"learning_rate": 0.0001982690416200914,
"loss": 0.5322,
"step": 103
},
{
"epoch": 0.06590621039290241,
"grad_norm": 0.05402471870183945,
"learning_rate": 0.00019823172725880165,
"loss": 0.5634,
"step": 104
},
{
"epoch": 0.06653992395437262,
"grad_norm": 0.15680162608623505,
"learning_rate": 0.0001981940185680156,
"loss": 0.5361,
"step": 105
},
{
"epoch": 0.06717363751584284,
"grad_norm": 0.06348865479230881,
"learning_rate": 0.00019815591569910654,
"loss": 0.5322,
"step": 106
},
{
"epoch": 0.06780735107731306,
"grad_norm": 0.05004284158349037,
"learning_rate": 0.00019811741880502995,
"loss": 0.5524,
"step": 107
},
{
"epoch": 0.06844106463878327,
"grad_norm": 0.06271985173225403,
"learning_rate": 0.00019807852804032305,
"loss": 0.4347,
"step": 108
},
{
"epoch": 0.06907477820025348,
"grad_norm": 0.1546468287706375,
"learning_rate": 0.00019803924356110423,
"loss": 0.4294,
"step": 109
},
{
"epoch": 0.0697084917617237,
"grad_norm": 0.06472460180521011,
"learning_rate": 0.00019799956552507233,
"loss": 0.5693,
"step": 110
},
{
"epoch": 0.07034220532319392,
"grad_norm": 0.06021984666585922,
"learning_rate": 0.00019795949409150598,
"loss": 0.6554,
"step": 111
},
{
"epoch": 0.07097591888466413,
"grad_norm": 0.04533032327890396,
"learning_rate": 0.00019791902942126313,
"loss": 0.4425,
"step": 112
},
{
"epoch": 0.07160963244613434,
"grad_norm": 0.0662391185760498,
"learning_rate": 0.0001978781716767802,
"loss": 0.5258,
"step": 113
},
{
"epoch": 0.07224334600760456,
"grad_norm": 0.06131117045879364,
"learning_rate": 0.00019783692102207155,
"loss": 0.4556,
"step": 114
},
{
"epoch": 0.07287705956907478,
"grad_norm": 0.07924918830394745,
"learning_rate": 0.00019779527762272877,
"loss": 0.5137,
"step": 115
},
{
"epoch": 0.07351077313054499,
"grad_norm": 0.07061261683702469,
"learning_rate": 0.0001977532416459201,
"loss": 0.4554,
"step": 116
},
{
"epoch": 0.0741444866920152,
"grad_norm": 0.04919254407286644,
"learning_rate": 0.00019771081326038962,
"loss": 0.5213,
"step": 117
},
{
"epoch": 0.07477820025348543,
"grad_norm": 0.053799472749233246,
"learning_rate": 0.00019766799263645673,
"loss": 0.5648,
"step": 118
},
{
"epoch": 0.07541191381495564,
"grad_norm": 0.06857369095087051,
"learning_rate": 0.00019762477994601522,
"loss": 0.6841,
"step": 119
},
{
"epoch": 0.07604562737642585,
"grad_norm": 0.0719090923666954,
"learning_rate": 0.000197581175362533,
"loss": 0.4154,
"step": 120
},
{
"epoch": 0.07667934093789606,
"grad_norm": 0.10528447479009628,
"learning_rate": 0.00019753717906105092,
"loss": 0.5674,
"step": 121
},
{
"epoch": 0.07731305449936629,
"grad_norm": 0.05879104137420654,
"learning_rate": 0.00019749279121818235,
"loss": 0.5282,
"step": 122
},
{
"epoch": 0.0779467680608365,
"grad_norm": 0.050949644297361374,
"learning_rate": 0.00019744801201211255,
"loss": 0.4398,
"step": 123
},
{
"epoch": 0.07858048162230671,
"grad_norm": 0.061247747391462326,
"learning_rate": 0.00019740284162259765,
"loss": 0.4269,
"step": 124
},
{
"epoch": 0.07921419518377694,
"grad_norm": 0.09446462988853455,
"learning_rate": 0.0001973572802309642,
"loss": 0.6362,
"step": 125
},
{
"epoch": 0.07984790874524715,
"grad_norm": 0.06124195456504822,
"learning_rate": 0.0001973113280201082,
"loss": 0.435,
"step": 126
},
{
"epoch": 0.08048162230671736,
"grad_norm": 0.05198049172759056,
"learning_rate": 0.0001972649851744948,
"loss": 0.4617,
"step": 127
},
{
"epoch": 0.08111533586818757,
"grad_norm": 0.05457935482263565,
"learning_rate": 0.00019721825188015693,
"loss": 0.548,
"step": 128
},
{
"epoch": 0.0817490494296578,
"grad_norm": 0.054542481899261475,
"learning_rate": 0.0001971711283246951,
"loss": 0.4449,
"step": 129
},
{
"epoch": 0.08238276299112801,
"grad_norm": 0.0528152696788311,
"learning_rate": 0.0001971236146972764,
"loss": 0.5868,
"step": 130
},
{
"epoch": 0.08301647655259822,
"grad_norm": 0.049837883561849594,
"learning_rate": 0.0001970757111886337,
"loss": 0.4426,
"step": 131
},
{
"epoch": 0.08365019011406843,
"grad_norm": 0.04912682995200157,
"learning_rate": 0.00019702741799106508,
"loss": 0.5328,
"step": 132
},
{
"epoch": 0.08428390367553866,
"grad_norm": 0.06654444336891174,
"learning_rate": 0.00019697873529843282,
"loss": 0.6239,
"step": 133
},
{
"epoch": 0.08491761723700887,
"grad_norm": 0.1822642683982849,
"learning_rate": 0.00019692966330616283,
"loss": 0.6482,
"step": 134
},
{
"epoch": 0.08555133079847908,
"grad_norm": 0.07404999434947968,
"learning_rate": 0.00019688020221124376,
"loss": 0.5473,
"step": 135
},
{
"epoch": 0.08618504435994931,
"grad_norm": 0.08534666895866394,
"learning_rate": 0.00019683035221222618,
"loss": 0.4794,
"step": 136
},
{
"epoch": 0.08681875792141952,
"grad_norm": 0.05804799869656563,
"learning_rate": 0.00019678011350922185,
"loss": 0.5749,
"step": 137
},
{
"epoch": 0.08745247148288973,
"grad_norm": 0.0600556954741478,
"learning_rate": 0.00019672948630390294,
"loss": 0.4929,
"step": 138
},
{
"epoch": 0.08808618504435994,
"grad_norm": 0.07564158737659454,
"learning_rate": 0.00019667847079950118,
"loss": 0.5806,
"step": 139
},
{
"epoch": 0.08871989860583017,
"grad_norm": 0.06359097361564636,
"learning_rate": 0.00019662706720080693,
"loss": 0.5427,
"step": 140
},
{
"epoch": 0.08935361216730038,
"grad_norm": 0.05452190712094307,
"learning_rate": 0.00019657527571416856,
"loss": 0.4845,
"step": 141
},
{
"epoch": 0.08998732572877059,
"grad_norm": 0.05258841812610626,
"learning_rate": 0.00019652309654749156,
"loss": 0.5255,
"step": 142
},
{
"epoch": 0.09062103929024082,
"grad_norm": 0.06789179146289825,
"learning_rate": 0.0001964705299102376,
"loss": 0.6002,
"step": 143
},
{
"epoch": 0.09125475285171103,
"grad_norm": 0.05940316617488861,
"learning_rate": 0.00019641757601342378,
"loss": 0.6178,
"step": 144
},
{
"epoch": 0.09188846641318124,
"grad_norm": 0.08051005005836487,
"learning_rate": 0.00019636423506962181,
"loss": 0.4728,
"step": 145
},
{
"epoch": 0.09252217997465145,
"grad_norm": 0.06979210674762726,
"learning_rate": 0.00019631050729295707,
"loss": 0.5166,
"step": 146
},
{
"epoch": 0.09315589353612168,
"grad_norm": 0.04284743592143059,
"learning_rate": 0.00019625639289910777,
"loss": 0.3685,
"step": 147
},
{
"epoch": 0.09378960709759189,
"grad_norm": 0.05410388484597206,
"learning_rate": 0.00019620189210530425,
"loss": 0.582,
"step": 148
},
{
"epoch": 0.0944233206590621,
"grad_norm": 0.08875017613172531,
"learning_rate": 0.00019614700513032775,
"loss": 0.6757,
"step": 149
},
{
"epoch": 0.09505703422053231,
"grad_norm": 0.06792068481445312,
"learning_rate": 0.00019609173219450998,
"loss": 0.5236,
"step": 150
},
{
"epoch": 0.09569074778200254,
"grad_norm": 0.060000237077474594,
"learning_rate": 0.0001960360735197318,
"loss": 0.4813,
"step": 151
},
{
"epoch": 0.09632446134347275,
"grad_norm": 0.052172888070344925,
"learning_rate": 0.00019598002932942266,
"loss": 0.5792,
"step": 152
},
{
"epoch": 0.09695817490494296,
"grad_norm": 0.04992865398526192,
"learning_rate": 0.00019592359984855952,
"loss": 0.4652,
"step": 153
},
{
"epoch": 0.09759188846641319,
"grad_norm": 0.05908304825425148,
"learning_rate": 0.00019586678530366606,
"loss": 0.4968,
"step": 154
},
{
"epoch": 0.0982256020278834,
"grad_norm": 0.16080443561077118,
"learning_rate": 0.00019580958592281167,
"loss": 0.4804,
"step": 155
},
{
"epoch": 0.09885931558935361,
"grad_norm": 0.05863935872912407,
"learning_rate": 0.00019575200193561057,
"loss": 0.5313,
"step": 156
},
{
"epoch": 0.09949302915082382,
"grad_norm": 0.047341488301754,
"learning_rate": 0.0001956940335732209,
"loss": 0.4939,
"step": 157
},
{
"epoch": 0.10012674271229405,
"grad_norm": 0.059797484427690506,
"learning_rate": 0.00019563568106834383,
"loss": 0.4806,
"step": 158
},
{
"epoch": 0.10076045627376426,
"grad_norm": 0.08543235808610916,
"learning_rate": 0.00019557694465522255,
"loss": 0.5691,
"step": 159
},
{
"epoch": 0.10139416983523447,
"grad_norm": 0.0614972747862339,
"learning_rate": 0.00019551782456964136,
"loss": 0.5143,
"step": 160
},
{
"epoch": 0.10202788339670468,
"grad_norm": 0.12742456793785095,
"learning_rate": 0.00019545832104892475,
"loss": 0.4987,
"step": 161
},
{
"epoch": 0.10266159695817491,
"grad_norm": 0.06898955255746841,
"learning_rate": 0.00019539843433193639,
"loss": 0.5504,
"step": 162
},
{
"epoch": 0.10329531051964512,
"grad_norm": 0.11239788681268692,
"learning_rate": 0.0001953381646590783,
"loss": 0.3448,
"step": 163
},
{
"epoch": 0.10392902408111533,
"grad_norm": 0.24028901755809784,
"learning_rate": 0.00019527751227228963,
"loss": 0.5294,
"step": 164
},
{
"epoch": 0.10456273764258556,
"grad_norm": 0.0903674066066742,
"learning_rate": 0.00019521647741504604,
"loss": 0.514,
"step": 165
},
{
"epoch": 0.10519645120405577,
"grad_norm": 0.051598865538835526,
"learning_rate": 0.00019515506033235833,
"loss": 0.4771,
"step": 166
},
{
"epoch": 0.10583016476552598,
"grad_norm": 0.05018608644604683,
"learning_rate": 0.0001950932612707719,
"loss": 0.4492,
"step": 167
},
{
"epoch": 0.10646387832699619,
"grad_norm": 0.07150580734014511,
"learning_rate": 0.00019503108047836523,
"loss": 0.5806,
"step": 168
},
{
"epoch": 0.10709759188846642,
"grad_norm": 0.05979820713400841,
"learning_rate": 0.00019496851820474944,
"loss": 0.6138,
"step": 169
},
{
"epoch": 0.10773130544993663,
"grad_norm": 0.05117090418934822,
"learning_rate": 0.00019490557470106686,
"loss": 0.5138,
"step": 170
},
{
"epoch": 0.10836501901140684,
"grad_norm": 0.049405183643102646,
"learning_rate": 0.0001948422502199903,
"loss": 0.4974,
"step": 171
},
{
"epoch": 0.10899873257287707,
"grad_norm": 0.060524292290210724,
"learning_rate": 0.00019477854501572176,
"loss": 0.5448,
"step": 172
},
{
"epoch": 0.10963244613434728,
"grad_norm": 0.05022512748837471,
"learning_rate": 0.0001947144593439917,
"loss": 0.5295,
"step": 173
},
{
"epoch": 0.11026615969581749,
"grad_norm": 0.05024838447570801,
"learning_rate": 0.0001946499934620579,
"loss": 0.4842,
"step": 174
},
{
"epoch": 0.1108998732572877,
"grad_norm": 0.05859989672899246,
"learning_rate": 0.00019458514762870426,
"loss": 0.5105,
"step": 175
},
{
"epoch": 0.11153358681875793,
"grad_norm": 0.05963319167494774,
"learning_rate": 0.00019451992210424006,
"loss": 0.4833,
"step": 176
},
{
"epoch": 0.11216730038022814,
"grad_norm": 0.05941782146692276,
"learning_rate": 0.0001944543171504987,
"loss": 0.4743,
"step": 177
},
{
"epoch": 0.11280101394169835,
"grad_norm": 0.07598856091499329,
"learning_rate": 0.00019438833303083678,
"loss": 0.483,
"step": 178
},
{
"epoch": 0.11343472750316856,
"grad_norm": 0.05751622095704079,
"learning_rate": 0.0001943219700101328,
"loss": 0.563,
"step": 179
},
{
"epoch": 0.11406844106463879,
"grad_norm": 0.08273158222436905,
"learning_rate": 0.0001942552283547865,
"loss": 0.5514,
"step": 180
},
{
"epoch": 0.114702154626109,
"grad_norm": 0.04589926823973656,
"learning_rate": 0.00019418810833271745,
"loss": 0.4353,
"step": 181
},
{
"epoch": 0.11533586818757921,
"grad_norm": 0.04818568378686905,
"learning_rate": 0.00019412061021336404,
"loss": 0.4653,
"step": 182
},
{
"epoch": 0.11596958174904944,
"grad_norm": 0.062292054295539856,
"learning_rate": 0.0001940527342676826,
"loss": 0.5451,
"step": 183
},
{
"epoch": 0.11660329531051965,
"grad_norm": 0.05161510780453682,
"learning_rate": 0.000193984480768146,
"loss": 0.5174,
"step": 184
},
{
"epoch": 0.11723700887198986,
"grad_norm": 0.0669926106929779,
"learning_rate": 0.0001939158499887428,
"loss": 0.5074,
"step": 185
},
{
"epoch": 0.11787072243346007,
"grad_norm": 0.04856441915035248,
"learning_rate": 0.00019384684220497605,
"loss": 0.3898,
"step": 186
},
{
"epoch": 0.1185044359949303,
"grad_norm": 0.05841194465756416,
"learning_rate": 0.0001937774576938622,
"loss": 0.5437,
"step": 187
},
{
"epoch": 0.11913814955640051,
"grad_norm": 0.05253444239497185,
"learning_rate": 0.00019370769673393007,
"loss": 0.5669,
"step": 188
},
{
"epoch": 0.11977186311787072,
"grad_norm": 0.05771539360284805,
"learning_rate": 0.00019363755960521943,
"loss": 0.4965,
"step": 189
},
{
"epoch": 0.12040557667934093,
"grad_norm": 0.07135152071714401,
"learning_rate": 0.00019356704658928035,
"loss": 0.4089,
"step": 190
},
{
"epoch": 0.12103929024081116,
"grad_norm": 0.05927246809005737,
"learning_rate": 0.00019349615796917163,
"loss": 0.465,
"step": 191
},
{
"epoch": 0.12167300380228137,
"grad_norm": 0.06522128731012344,
"learning_rate": 0.00019342489402945998,
"loss": 0.3797,
"step": 192
},
{
"epoch": 0.12230671736375158,
"grad_norm": 0.05745214596390724,
"learning_rate": 0.0001933532550562187,
"loss": 0.56,
"step": 193
},
{
"epoch": 0.1229404309252218,
"grad_norm": 0.05626146122813225,
"learning_rate": 0.0001932812413370265,
"loss": 0.5439,
"step": 194
},
{
"epoch": 0.12357414448669202,
"grad_norm": 0.07615689933300018,
"learning_rate": 0.00019320885316096654,
"loss": 0.5187,
"step": 195
},
{
"epoch": 0.12420785804816223,
"grad_norm": 0.19566097855567932,
"learning_rate": 0.00019313609081862508,
"loss": 0.5535,
"step": 196
},
{
"epoch": 0.12484157160963244,
"grad_norm": 0.052284326404333115,
"learning_rate": 0.00019306295460209044,
"loss": 0.4056,
"step": 197
},
{
"epoch": 0.12547528517110265,
"grad_norm": 0.050081610679626465,
"learning_rate": 0.00019298944480495176,
"loss": 0.451,
"step": 198
},
{
"epoch": 0.12610899873257286,
"grad_norm": 0.07420384138822556,
"learning_rate": 0.00019291556172229785,
"loss": 0.5485,
"step": 199
},
{
"epoch": 0.1267427122940431,
"grad_norm": 0.046289846301078796,
"learning_rate": 0.00019284130565071588,
"loss": 0.4944,
"step": 200
},
{
"epoch": 0.12737642585551331,
"grad_norm": 0.041031207889318466,
"learning_rate": 0.00019276667688829043,
"loss": 0.4507,
"step": 201
},
{
"epoch": 0.12801013941698353,
"grad_norm": 0.07089229673147202,
"learning_rate": 0.0001926916757346022,
"loss": 0.513,
"step": 202
},
{
"epoch": 0.12864385297845374,
"grad_norm": 0.04405022785067558,
"learning_rate": 0.00019261630249072659,
"loss": 0.3709,
"step": 203
},
{
"epoch": 0.12927756653992395,
"grad_norm": 0.059661708772182465,
"learning_rate": 0.00019254055745923285,
"loss": 0.4813,
"step": 204
},
{
"epoch": 0.12991128010139416,
"grad_norm": 0.07400868833065033,
"learning_rate": 0.00019246444094418255,
"loss": 0.5346,
"step": 205
},
{
"epoch": 0.13054499366286437,
"grad_norm": 0.05862591415643692,
"learning_rate": 0.0001923879532511287,
"loss": 0.4856,
"step": 206
},
{
"epoch": 0.1311787072243346,
"grad_norm": 0.05793355405330658,
"learning_rate": 0.00019231109468711405,
"loss": 0.5129,
"step": 207
},
{
"epoch": 0.13181242078580482,
"grad_norm": 0.043961625546216965,
"learning_rate": 0.00019223386556067033,
"loss": 0.4803,
"step": 208
},
{
"epoch": 0.13244613434727504,
"grad_norm": 0.07102088630199432,
"learning_rate": 0.00019215626618181676,
"loss": 0.5078,
"step": 209
},
{
"epoch": 0.13307984790874525,
"grad_norm": 0.07707204669713974,
"learning_rate": 0.00019207829686205882,
"loss": 0.5465,
"step": 210
},
{
"epoch": 0.13371356147021546,
"grad_norm": 0.06010926514863968,
"learning_rate": 0.0001919999579143871,
"loss": 0.5532,
"step": 211
},
{
"epoch": 0.13434727503168567,
"grad_norm": 0.0627330020070076,
"learning_rate": 0.0001919212496532759,
"loss": 0.4055,
"step": 212
},
{
"epoch": 0.13498098859315588,
"grad_norm": 0.04347623884677887,
"learning_rate": 0.00019184217239468212,
"loss": 0.4581,
"step": 213
},
{
"epoch": 0.13561470215462612,
"grad_norm": 0.05672100558876991,
"learning_rate": 0.00019176272645604386,
"loss": 0.5335,
"step": 214
},
{
"epoch": 0.13624841571609633,
"grad_norm": 0.05062992498278618,
"learning_rate": 0.00019168291215627926,
"loss": 0.4801,
"step": 215
},
{
"epoch": 0.13688212927756654,
"grad_norm": 8.16939640045166,
"learning_rate": 0.00019160272981578512,
"loss": 0.5814,
"step": 216
},
{
"epoch": 0.13751584283903676,
"grad_norm": 0.058165278285741806,
"learning_rate": 0.00019152217975643566,
"loss": 0.5163,
"step": 217
},
{
"epoch": 0.13814955640050697,
"grad_norm": 0.06994735449552536,
"learning_rate": 0.00019144126230158127,
"loss": 0.5558,
"step": 218
},
{
"epoch": 0.13878326996197718,
"grad_norm": 0.05495104938745499,
"learning_rate": 0.0001913599777760471,
"loss": 0.5298,
"step": 219
},
{
"epoch": 0.1394169835234474,
"grad_norm": 0.060677338391542435,
"learning_rate": 0.00019127832650613189,
"loss": 0.5614,
"step": 220
},
{
"epoch": 0.14005069708491763,
"grad_norm": 0.060457441955804825,
"learning_rate": 0.00019119630881960658,
"loss": 0.5139,
"step": 221
},
{
"epoch": 0.14068441064638784,
"grad_norm": 0.0608784481883049,
"learning_rate": 0.00019111392504571296,
"loss": 0.4711,
"step": 222
},
{
"epoch": 0.14131812420785805,
"grad_norm": 0.07560902833938599,
"learning_rate": 0.00019103117551516244,
"loss": 0.486,
"step": 223
},
{
"epoch": 0.14195183776932827,
"grad_norm": 0.0847187414765358,
"learning_rate": 0.00019094806056013468,
"loss": 0.5934,
"step": 224
},
{
"epoch": 0.14258555133079848,
"grad_norm": 0.06016870215535164,
"learning_rate": 0.00019086458051427622,
"loss": 0.4529,
"step": 225
},
{
"epoch": 0.1432192648922687,
"grad_norm": 0.17245864868164062,
"learning_rate": 0.00019078073571269922,
"loss": 0.5307,
"step": 226
},
{
"epoch": 0.1438529784537389,
"grad_norm": 0.0647033080458641,
"learning_rate": 0.00019069652649198005,
"loss": 0.569,
"step": 227
},
{
"epoch": 0.1444866920152091,
"grad_norm": 0.07447489351034164,
"learning_rate": 0.00019061195319015797,
"loss": 0.547,
"step": 228
},
{
"epoch": 0.14512040557667935,
"grad_norm": 0.05335066467523575,
"learning_rate": 0.00019052701614673373,
"loss": 0.5363,
"step": 229
},
{
"epoch": 0.14575411913814956,
"grad_norm": 0.04057115688920021,
"learning_rate": 0.0001904417157026683,
"loss": 0.4354,
"step": 230
},
{
"epoch": 0.14638783269961977,
"grad_norm": 0.05564083158969879,
"learning_rate": 0.00019035605220038137,
"loss": 0.5674,
"step": 231
},
{
"epoch": 0.14702154626108999,
"grad_norm": 0.1210884302854538,
"learning_rate": 0.00019027002598375012,
"loss": 0.5645,
"step": 232
},
{
"epoch": 0.1476552598225602,
"grad_norm": 0.05494518578052521,
"learning_rate": 0.00019018363739810767,
"loss": 0.6239,
"step": 233
},
{
"epoch": 0.1482889733840304,
"grad_norm": 0.04633218050003052,
"learning_rate": 0.0001900968867902419,
"loss": 0.4787,
"step": 234
},
{
"epoch": 0.14892268694550062,
"grad_norm": 0.06846950203180313,
"learning_rate": 0.00019000977450839393,
"loss": 0.5607,
"step": 235
},
{
"epoch": 0.14955640050697086,
"grad_norm": 0.0618814192712307,
"learning_rate": 0.0001899223009022566,
"loss": 0.631,
"step": 236
},
{
"epoch": 0.15019011406844107,
"grad_norm": 0.06061235070228577,
"learning_rate": 0.00018983446632297343,
"loss": 0.5989,
"step": 237
},
{
"epoch": 0.15082382762991128,
"grad_norm": 0.06494279205799103,
"learning_rate": 0.00018974627112313677,
"loss": 0.5816,
"step": 238
},
{
"epoch": 0.1514575411913815,
"grad_norm": 0.04907020181417465,
"learning_rate": 0.0001896577156567868,
"loss": 0.5097,
"step": 239
},
{
"epoch": 0.1520912547528517,
"grad_norm": 0.04682941362261772,
"learning_rate": 0.00018956880027940967,
"loss": 0.5828,
"step": 240
},
{
"epoch": 0.15272496831432192,
"grad_norm": 0.05498978868126869,
"learning_rate": 0.00018947952534793661,
"loss": 0.5257,
"step": 241
},
{
"epoch": 0.15335868187579213,
"grad_norm": 0.04309950768947601,
"learning_rate": 0.00018938989122074197,
"loss": 0.3662,
"step": 242
},
{
"epoch": 0.15399239543726237,
"grad_norm": 0.06519515067338943,
"learning_rate": 0.00018929989825764207,
"loss": 0.4058,
"step": 243
},
{
"epoch": 0.15462610899873258,
"grad_norm": 0.046929214149713516,
"learning_rate": 0.00018920954681989378,
"loss": 0.4916,
"step": 244
},
{
"epoch": 0.1552598225602028,
"grad_norm": 0.05388319492340088,
"learning_rate": 0.00018911883727019285,
"loss": 0.4143,
"step": 245
},
{
"epoch": 0.155893536121673,
"grad_norm": 0.05619863048195839,
"learning_rate": 0.00018902776997267268,
"loss": 0.5107,
"step": 246
},
{
"epoch": 0.15652724968314322,
"grad_norm": 0.053882747888565063,
"learning_rate": 0.00018893634529290279,
"loss": 0.5559,
"step": 247
},
{
"epoch": 0.15716096324461343,
"grad_norm": 0.05231885239481926,
"learning_rate": 0.00018884456359788724,
"loss": 0.5076,
"step": 248
},
{
"epoch": 0.15779467680608364,
"grad_norm": 0.07149146497249603,
"learning_rate": 0.00018875242525606334,
"loss": 0.558,
"step": 249
},
{
"epoch": 0.15842839036755388,
"grad_norm": 0.04615316912531853,
"learning_rate": 0.00018865993063730004,
"loss": 0.4971,
"step": 250
},
{
"epoch": 0.1590621039290241,
"grad_norm": 0.05331886187195778,
"learning_rate": 0.00018856708011289643,
"loss": 0.5506,
"step": 251
},
{
"epoch": 0.1596958174904943,
"grad_norm": 0.05348580330610275,
"learning_rate": 0.00018847387405558045,
"loss": 0.4515,
"step": 252
},
{
"epoch": 0.1603295310519645,
"grad_norm": 0.0438147634267807,
"learning_rate": 0.00018838031283950705,
"loss": 0.3818,
"step": 253
},
{
"epoch": 0.16096324461343473,
"grad_norm": 0.0473354198038578,
"learning_rate": 0.0001882863968402571,
"loss": 0.4458,
"step": 254
},
{
"epoch": 0.16159695817490494,
"grad_norm": 0.05930502712726593,
"learning_rate": 0.0001881921264348355,
"loss": 0.6228,
"step": 255
},
{
"epoch": 0.16223067173637515,
"grad_norm": 0.04982107877731323,
"learning_rate": 0.00018809750200166994,
"loss": 0.5916,
"step": 256
},
{
"epoch": 0.1628643852978454,
"grad_norm": 0.09739918261766434,
"learning_rate": 0.0001880025239206092,
"loss": 0.651,
"step": 257
},
{
"epoch": 0.1634980988593156,
"grad_norm": 0.09072676301002502,
"learning_rate": 0.00018790719257292174,
"loss": 0.5564,
"step": 258
},
{
"epoch": 0.1641318124207858,
"grad_norm": 0.0638791099190712,
"learning_rate": 0.00018781150834129413,
"loss": 0.4545,
"step": 259
},
{
"epoch": 0.16476552598225602,
"grad_norm": 0.05755198001861572,
"learning_rate": 0.0001877154716098295,
"loss": 0.4457,
"step": 260
},
{
"epoch": 0.16539923954372623,
"grad_norm": 0.2049247920513153,
"learning_rate": 0.00018761908276404603,
"loss": 0.5447,
"step": 261
},
{
"epoch": 0.16603295310519645,
"grad_norm": 0.06760350614786148,
"learning_rate": 0.00018752234219087538,
"loss": 0.4743,
"step": 262
},
{
"epoch": 0.16666666666666666,
"grad_norm": 0.061410121619701385,
"learning_rate": 0.00018742525027866115,
"loss": 0.547,
"step": 263
},
{
"epoch": 0.16730038022813687,
"grad_norm": 0.04981521889567375,
"learning_rate": 0.00018732780741715724,
"loss": 0.4924,
"step": 264
},
{
"epoch": 0.1679340937896071,
"grad_norm": 0.06636273115873337,
"learning_rate": 0.00018723001399752653,
"loss": 0.591,
"step": 265
},
{
"epoch": 0.16856780735107732,
"grad_norm": 0.0517747662961483,
"learning_rate": 0.00018713187041233896,
"loss": 0.5294,
"step": 266
},
{
"epoch": 0.16920152091254753,
"grad_norm": 0.11798780411481857,
"learning_rate": 0.00018703337705557017,
"loss": 0.4953,
"step": 267
},
{
"epoch": 0.16983523447401774,
"grad_norm": 0.1441587656736374,
"learning_rate": 0.00018693453432259998,
"loss": 0.4898,
"step": 268
},
{
"epoch": 0.17046894803548795,
"grad_norm": 0.06387986242771149,
"learning_rate": 0.00018683534261021057,
"loss": 0.4663,
"step": 269
},
{
"epoch": 0.17110266159695817,
"grad_norm": 0.05943833664059639,
"learning_rate": 0.0001867358023165851,
"loss": 0.5607,
"step": 270
},
{
"epoch": 0.17173637515842838,
"grad_norm": 0.05011943355202675,
"learning_rate": 0.00018663591384130606,
"loss": 0.5297,
"step": 271
},
{
"epoch": 0.17237008871989862,
"grad_norm": 0.059131983667612076,
"learning_rate": 0.00018653567758535354,
"loss": 0.4896,
"step": 272
},
{
"epoch": 0.17300380228136883,
"grad_norm": 0.06053609773516655,
"learning_rate": 0.0001864350939511038,
"loss": 0.5446,
"step": 273
},
{
"epoch": 0.17363751584283904,
"grad_norm": 0.05496980994939804,
"learning_rate": 0.00018633416334232753,
"loss": 0.5427,
"step": 274
},
{
"epoch": 0.17427122940430925,
"grad_norm": 0.05304751545190811,
"learning_rate": 0.0001862328861641883,
"loss": 0.4189,
"step": 275
},
{
"epoch": 0.17490494296577946,
"grad_norm": 0.04881710559129715,
"learning_rate": 0.00018613126282324092,
"loss": 0.4555,
"step": 276
},
{
"epoch": 0.17553865652724968,
"grad_norm": 0.051984284073114395,
"learning_rate": 0.0001860292937274297,
"loss": 0.5282,
"step": 277
},
{
"epoch": 0.1761723700887199,
"grad_norm": 0.05241424962878227,
"learning_rate": 0.00018592697928608703,
"loss": 0.4924,
"step": 278
},
{
"epoch": 0.17680608365019013,
"grad_norm": 0.04947778955101967,
"learning_rate": 0.00018582431990993151,
"loss": 0.4867,
"step": 279
},
{
"epoch": 0.17743979721166034,
"grad_norm": 0.04952229931950569,
"learning_rate": 0.00018572131601106654,
"loss": 0.4362,
"step": 280
},
{
"epoch": 0.17807351077313055,
"grad_norm": 0.061900023370981216,
"learning_rate": 0.00018561796800297832,
"loss": 0.6342,
"step": 281
},
{
"epoch": 0.17870722433460076,
"grad_norm": 0.04405650496482849,
"learning_rate": 0.00018551427630053463,
"loss": 0.4612,
"step": 282
},
{
"epoch": 0.17934093789607097,
"grad_norm": 0.5723605155944824,
"learning_rate": 0.00018541024131998274,
"loss": 0.4917,
"step": 283
},
{
"epoch": 0.17997465145754118,
"grad_norm": 0.07066962867975235,
"learning_rate": 0.0001853058634789481,
"loss": 0.5386,
"step": 284
},
{
"epoch": 0.1806083650190114,
"grad_norm": 0.041575830429792404,
"learning_rate": 0.00018520114319643235,
"loss": 0.4894,
"step": 285
},
{
"epoch": 0.18124207858048164,
"grad_norm": 0.07731833308935165,
"learning_rate": 0.0001850960808928119,
"loss": 0.5382,
"step": 286
},
{
"epoch": 0.18187579214195185,
"grad_norm": 0.05468999221920967,
"learning_rate": 0.00018499067698983605,
"loss": 0.4514,
"step": 287
},
{
"epoch": 0.18250950570342206,
"grad_norm": 0.04942842200398445,
"learning_rate": 0.00018488493191062542,
"loss": 0.4329,
"step": 288
},
{
"epoch": 0.18314321926489227,
"grad_norm": 0.053615666925907135,
"learning_rate": 0.0001847788460796702,
"loss": 0.5182,
"step": 289
},
{
"epoch": 0.18377693282636248,
"grad_norm": 0.04232574254274368,
"learning_rate": 0.00018467241992282843,
"loss": 0.3108,
"step": 290
},
{
"epoch": 0.1844106463878327,
"grad_norm": 0.04795556515455246,
"learning_rate": 0.00018456565386732433,
"loss": 0.383,
"step": 291
},
{
"epoch": 0.1850443599493029,
"grad_norm": 0.053252723067998886,
"learning_rate": 0.00018445854834174655,
"loss": 0.4597,
"step": 292
},
{
"epoch": 0.18567807351077312,
"grad_norm": 0.044747479259967804,
"learning_rate": 0.00018435110377604654,
"loss": 0.5066,
"step": 293
},
{
"epoch": 0.18631178707224336,
"grad_norm": 0.0473531037569046,
"learning_rate": 0.00018424332060153664,
"loss": 0.4258,
"step": 294
},
{
"epoch": 0.18694550063371357,
"grad_norm": 0.05739828571677208,
"learning_rate": 0.0001841351992508885,
"loss": 0.4498,
"step": 295
},
{
"epoch": 0.18757921419518378,
"grad_norm": 0.0635855570435524,
"learning_rate": 0.0001840267401581314,
"loss": 0.5368,
"step": 296
},
{
"epoch": 0.188212927756654,
"grad_norm": 0.05470935255289078,
"learning_rate": 0.00018391794375865024,
"loss": 0.5367,
"step": 297
},
{
"epoch": 0.1888466413181242,
"grad_norm": 0.04850434139370918,
"learning_rate": 0.00018380881048918405,
"loss": 0.5369,
"step": 298
},
{
"epoch": 0.18948035487959441,
"grad_norm": 0.1420743763446808,
"learning_rate": 0.00018369934078782426,
"loss": 0.5101,
"step": 299
},
{
"epoch": 0.19011406844106463,
"grad_norm": 0.0749795064330101,
"learning_rate": 0.00018358953509401262,
"loss": 0.5756,
"step": 300
},
{
"epoch": 0.19074778200253487,
"grad_norm": 0.05331069603562355,
"learning_rate": 0.00018347939384853978,
"loss": 0.5759,
"step": 301
},
{
"epoch": 0.19138149556400508,
"grad_norm": 0.05981903895735741,
"learning_rate": 0.00018336891749354335,
"loss": 0.6036,
"step": 302
},
{
"epoch": 0.1920152091254753,
"grad_norm": 0.08048289269208908,
"learning_rate": 0.00018325810647250616,
"loss": 0.4424,
"step": 303
},
{
"epoch": 0.1926489226869455,
"grad_norm": 0.07861804962158203,
"learning_rate": 0.00018314696123025454,
"loss": 0.5725,
"step": 304
},
{
"epoch": 0.1932826362484157,
"grad_norm": 0.14672251045703888,
"learning_rate": 0.0001830354822129564,
"loss": 0.5068,
"step": 305
},
{
"epoch": 0.19391634980988592,
"grad_norm": 0.06640765070915222,
"learning_rate": 0.0001829236698681195,
"loss": 0.585,
"step": 306
},
{
"epoch": 0.19455006337135614,
"grad_norm": 0.0588274821639061,
"learning_rate": 0.0001828115246445898,
"loss": 0.5779,
"step": 307
},
{
"epoch": 0.19518377693282637,
"grad_norm": 0.05600736290216446,
"learning_rate": 0.0001826990469925494,
"loss": 0.5216,
"step": 308
},
{
"epoch": 0.1958174904942966,
"grad_norm": 0.052844930440187454,
"learning_rate": 0.0001825862373635149,
"loss": 0.5482,
"step": 309
},
{
"epoch": 0.1964512040557668,
"grad_norm": 0.04969317838549614,
"learning_rate": 0.0001824730962103356,
"loss": 0.5928,
"step": 310
},
{
"epoch": 0.197084917617237,
"grad_norm": 0.06168043613433838,
"learning_rate": 0.00018235962398719147,
"loss": 0.5185,
"step": 311
},
{
"epoch": 0.19771863117870722,
"grad_norm": 0.051151130348443985,
"learning_rate": 0.00018224582114959172,
"loss": 0.4677,
"step": 312
},
{
"epoch": 0.19835234474017743,
"grad_norm": 0.060467127710580826,
"learning_rate": 0.00018213168815437255,
"loss": 0.5566,
"step": 313
},
{
"epoch": 0.19898605830164764,
"grad_norm": 0.043170325458049774,
"learning_rate": 0.0001820172254596956,
"loss": 0.489,
"step": 314
},
{
"epoch": 0.19961977186311788,
"grad_norm": 0.06550537794828415,
"learning_rate": 0.00018190243352504597,
"loss": 0.5809,
"step": 315
},
{
"epoch": 0.2002534854245881,
"grad_norm": 0.04956373944878578,
"learning_rate": 0.00018178731281123044,
"loss": 0.462,
"step": 316
},
{
"epoch": 0.2008871989860583,
"grad_norm": 0.05908495932817459,
"learning_rate": 0.00018167186378037563,
"loss": 0.4611,
"step": 317
},
{
"epoch": 0.20152091254752852,
"grad_norm": 0.047168437391519547,
"learning_rate": 0.00018155608689592604,
"loss": 0.5283,
"step": 318
},
{
"epoch": 0.20215462610899873,
"grad_norm": 0.04968830570578575,
"learning_rate": 0.00018143998262264233,
"loss": 0.4982,
"step": 319
},
{
"epoch": 0.20278833967046894,
"grad_norm": 0.06764087826013565,
"learning_rate": 0.00018132355142659937,
"loss": 0.5244,
"step": 320
},
{
"epoch": 0.20342205323193915,
"grad_norm": 0.06344570964574814,
"learning_rate": 0.0001812067937751844,
"loss": 0.606,
"step": 321
},
{
"epoch": 0.20405576679340937,
"grad_norm": 0.06029113009572029,
"learning_rate": 0.0001810897101370951,
"loss": 0.5407,
"step": 322
},
{
"epoch": 0.2046894803548796,
"grad_norm": 0.08346560597419739,
"learning_rate": 0.00018097230098233785,
"loss": 0.4814,
"step": 323
},
{
"epoch": 0.20532319391634982,
"grad_norm": 0.04595065116882324,
"learning_rate": 0.00018085456678222558,
"loss": 0.471,
"step": 324
},
{
"epoch": 0.20595690747782003,
"grad_norm": 0.4050588309764862,
"learning_rate": 0.00018073650800937624,
"loss": 0.4586,
"step": 325
},
{
"epoch": 0.20659062103929024,
"grad_norm": 0.055679477751255035,
"learning_rate": 0.00018061812513771053,
"loss": 0.516,
"step": 326
},
{
"epoch": 0.20722433460076045,
"grad_norm": 0.05209626257419586,
"learning_rate": 0.00018049941864245033,
"loss": 0.4528,
"step": 327
},
{
"epoch": 0.20785804816223066,
"grad_norm": 0.05503727123141289,
"learning_rate": 0.00018038038900011652,
"loss": 0.4297,
"step": 328
},
{
"epoch": 0.20849176172370087,
"grad_norm": 0.05453247204422951,
"learning_rate": 0.0001802610366885271,
"loss": 0.4731,
"step": 329
},
{
"epoch": 0.20912547528517111,
"grad_norm": 0.05371938645839691,
"learning_rate": 0.00018014136218679567,
"loss": 0.569,
"step": 330
},
{
"epoch": 0.20975918884664133,
"grad_norm": 0.05164814740419388,
"learning_rate": 0.0001800213659753289,
"loss": 0.4883,
"step": 331
},
{
"epoch": 0.21039290240811154,
"grad_norm": 0.06455442309379578,
"learning_rate": 0.00017990104853582493,
"loss": 0.4829,
"step": 332
},
{
"epoch": 0.21102661596958175,
"grad_norm": 0.04764432832598686,
"learning_rate": 0.0001797804103512715,
"loss": 0.5525,
"step": 333
},
{
"epoch": 0.21166032953105196,
"grad_norm": 0.0578368604183197,
"learning_rate": 0.00017965945190594388,
"loss": 0.4824,
"step": 334
},
{
"epoch": 0.21229404309252217,
"grad_norm": 0.05196613445878029,
"learning_rate": 0.00017953817368540292,
"loss": 0.5036,
"step": 335
},
{
"epoch": 0.21292775665399238,
"grad_norm": 0.044868264347314835,
"learning_rate": 0.00017941657617649316,
"loss": 0.36,
"step": 336
},
{
"epoch": 0.21356147021546262,
"grad_norm": 0.0686643123626709,
"learning_rate": 0.00017929465986734084,
"loss": 0.6069,
"step": 337
},
{
"epoch": 0.21419518377693283,
"grad_norm": 0.08286602050065994,
"learning_rate": 0.000179172425247352,
"loss": 0.5635,
"step": 338
},
{
"epoch": 0.21482889733840305,
"grad_norm": 0.5979371070861816,
"learning_rate": 0.00017904987280721035,
"loss": 0.3994,
"step": 339
},
{
"epoch": 0.21546261089987326,
"grad_norm": 0.05577315390110016,
"learning_rate": 0.00017892700303887558,
"loss": 0.5699,
"step": 340
},
{
"epoch": 0.21609632446134347,
"grad_norm": 0.06650438159704208,
"learning_rate": 0.0001788038164355811,
"loss": 0.5557,
"step": 341
},
{
"epoch": 0.21673003802281368,
"grad_norm": 0.06644187867641449,
"learning_rate": 0.00017868031349183217,
"loss": 0.5593,
"step": 342
},
{
"epoch": 0.2173637515842839,
"grad_norm": 0.05286836251616478,
"learning_rate": 0.00017855649470340413,
"loss": 0.4902,
"step": 343
},
{
"epoch": 0.21799746514575413,
"grad_norm": 0.05314694344997406,
"learning_rate": 0.00017843236056733992,
"loss": 0.5036,
"step": 344
},
{
"epoch": 0.21863117870722434,
"grad_norm": 0.0668027251958847,
"learning_rate": 0.0001783079115819486,
"loss": 0.6198,
"step": 345
},
{
"epoch": 0.21926489226869456,
"grad_norm": 0.04909252002835274,
"learning_rate": 0.000178183148246803,
"loss": 0.4273,
"step": 346
},
{
"epoch": 0.21989860583016477,
"grad_norm": 0.053546786308288574,
"learning_rate": 0.00017805807106273787,
"loss": 0.5077,
"step": 347
},
{
"epoch": 0.22053231939163498,
"grad_norm": 0.0647466629743576,
"learning_rate": 0.00017793268053184786,
"loss": 0.5262,
"step": 348
},
{
"epoch": 0.2211660329531052,
"grad_norm": 0.05518212169408798,
"learning_rate": 0.00017780697715748546,
"loss": 0.5621,
"step": 349
},
{
"epoch": 0.2217997465145754,
"grad_norm": 0.0661974772810936,
"learning_rate": 0.00017768096144425902,
"loss": 0.5727,
"step": 350
},
{
"epoch": 0.2224334600760456,
"grad_norm": 0.09333747625350952,
"learning_rate": 0.00017755463389803065,
"loss": 0.4891,
"step": 351
},
{
"epoch": 0.22306717363751585,
"grad_norm": 0.04791216179728508,
"learning_rate": 0.0001774279950259143,
"loss": 0.5569,
"step": 352
},
{
"epoch": 0.22370088719898606,
"grad_norm": 0.05712969973683357,
"learning_rate": 0.0001773010453362737,
"loss": 0.5433,
"step": 353
},
{
"epoch": 0.22433460076045628,
"grad_norm": 0.05735623091459274,
"learning_rate": 0.00017717378533872017,
"loss": 0.5702,
"step": 354
},
{
"epoch": 0.2249683143219265,
"grad_norm": 0.05040268227458,
"learning_rate": 0.00017704621554411084,
"loss": 0.4964,
"step": 355
},
{
"epoch": 0.2256020278833967,
"grad_norm": 0.04687810316681862,
"learning_rate": 0.00017691833646454628,
"loss": 0.5242,
"step": 356
},
{
"epoch": 0.2262357414448669,
"grad_norm": 0.051406193524599075,
"learning_rate": 0.00017679014861336878,
"loss": 0.5146,
"step": 357
},
{
"epoch": 0.22686945500633712,
"grad_norm": 0.04884679988026619,
"learning_rate": 0.00017666165250516006,
"loss": 0.4825,
"step": 358
},
{
"epoch": 0.22750316856780736,
"grad_norm": 0.053725842386484146,
"learning_rate": 0.0001765328486557392,
"loss": 0.4932,
"step": 359
},
{
"epoch": 0.22813688212927757,
"grad_norm": 0.06212908402085304,
"learning_rate": 0.00017640373758216077,
"loss": 0.506,
"step": 360
},
{
"epoch": 0.22877059569074779,
"grad_norm": 0.05059286579489708,
"learning_rate": 0.0001762743198027125,
"loss": 0.4719,
"step": 361
},
{
"epoch": 0.229404309252218,
"grad_norm": 0.04520050436258316,
"learning_rate": 0.00017614459583691346,
"loss": 0.4553,
"step": 362
},
{
"epoch": 0.2300380228136882,
"grad_norm": 0.05503036454319954,
"learning_rate": 0.0001760145662055117,
"loss": 0.4706,
"step": 363
},
{
"epoch": 0.23067173637515842,
"grad_norm": 0.046107854694128036,
"learning_rate": 0.00017588423143048235,
"loss": 0.4177,
"step": 364
},
{
"epoch": 0.23130544993662863,
"grad_norm": 0.12301266193389893,
"learning_rate": 0.0001757535920350255,
"loss": 0.5922,
"step": 365
},
{
"epoch": 0.23193916349809887,
"grad_norm": 1.179470419883728,
"learning_rate": 0.00017562264854356405,
"loss": 0.5123,
"step": 366
},
{
"epoch": 0.23257287705956908,
"grad_norm": 0.11167129874229431,
"learning_rate": 0.0001754914014817416,
"loss": 0.3884,
"step": 367
},
{
"epoch": 0.2332065906210393,
"grad_norm": 0.055067550390958786,
"learning_rate": 0.00017535985137642044,
"loss": 0.4544,
"step": 368
},
{
"epoch": 0.2338403041825095,
"grad_norm": 0.07947530597448349,
"learning_rate": 0.0001752279987556792,
"loss": 0.6575,
"step": 369
},
{
"epoch": 0.23447401774397972,
"grad_norm": 0.10236025601625443,
"learning_rate": 0.00017509584414881113,
"loss": 0.5334,
"step": 370
},
{
"epoch": 0.23510773130544993,
"grad_norm": 0.12996040284633636,
"learning_rate": 0.00017496338808632155,
"loss": 0.3897,
"step": 371
},
{
"epoch": 0.23574144486692014,
"grad_norm": 0.07005209475755692,
"learning_rate": 0.00017483063109992596,
"loss": 0.5077,
"step": 372
},
{
"epoch": 0.23637515842839038,
"grad_norm": 0.04446430131793022,
"learning_rate": 0.00017469757372254785,
"loss": 0.4467,
"step": 373
},
{
"epoch": 0.2370088719898606,
"grad_norm": 6.105027198791504,
"learning_rate": 0.00017456421648831655,
"loss": 1.722,
"step": 374
},
{
"epoch": 0.2376425855513308,
"grad_norm": 0.07488813251256943,
"learning_rate": 0.0001744305599325652,
"loss": 0.7018,
"step": 375
},
{
"epoch": 0.23827629911280102,
"grad_norm": 0.05676595866680145,
"learning_rate": 0.00017429660459182834,
"loss": 0.4865,
"step": 376
},
{
"epoch": 0.23891001267427123,
"grad_norm": 0.058106616139411926,
"learning_rate": 0.00017416235100384007,
"loss": 0.4453,
"step": 377
},
{
"epoch": 0.23954372623574144,
"grad_norm": 0.4252207577228546,
"learning_rate": 0.00017402779970753155,
"loss": 3.008,
"step": 378
},
{
"epoch": 0.24017743979721165,
"grad_norm": 0.24036817252635956,
"learning_rate": 0.00017389295124302923,
"loss": 0.7246,
"step": 379
},
{
"epoch": 0.24081115335868186,
"grad_norm": 4.316144943237305,
"learning_rate": 0.00017375780615165235,
"loss": 0.664,
"step": 380
},
{
"epoch": 0.2414448669201521,
"grad_norm": 6.4877166748046875,
"learning_rate": 0.00017362236497591094,
"loss": 0.487,
"step": 381
},
{
"epoch": 0.2420785804816223,
"grad_norm": 0.12358918786048889,
"learning_rate": 0.00017348662825950357,
"loss": 0.4839,
"step": 382
},
{
"epoch": 0.24271229404309252,
"grad_norm": 0.7211472988128662,
"learning_rate": 0.0001733505965473152,
"loss": 0.6351,
"step": 383
},
{
"epoch": 0.24334600760456274,
"grad_norm": 0.10177785158157349,
"learning_rate": 0.00017321427038541494,
"loss": 0.6043,
"step": 384
},
{
"epoch": 0.24397972116603295,
"grad_norm": 0.054658226668834686,
"learning_rate": 0.00017307765032105406,
"loss": 0.473,
"step": 385
},
{
"epoch": 0.24461343472750316,
"grad_norm": 0.10075858235359192,
"learning_rate": 0.00017294073690266344,
"loss": 0.4892,
"step": 386
},
{
"epoch": 0.24524714828897337,
"grad_norm": 0.06497970223426819,
"learning_rate": 0.00017280353067985167,
"loss": 0.4986,
"step": 387
},
{
"epoch": 0.2458808618504436,
"grad_norm": 0.7542481422424316,
"learning_rate": 0.0001726660322034027,
"loss": 0.5513,
"step": 388
},
{
"epoch": 0.24651457541191382,
"grad_norm": 0.08190987259149551,
"learning_rate": 0.00017252824202527376,
"loss": 0.5077,
"step": 389
},
{
"epoch": 0.24714828897338403,
"grad_norm": 0.08874624967575073,
"learning_rate": 0.0001723901606985929,
"loss": 0.3973,
"step": 390
},
{
"epoch": 0.24778200253485425,
"grad_norm": 0.32968223094940186,
"learning_rate": 0.00017225178877765704,
"loss": 0.4411,
"step": 391
},
{
"epoch": 0.24841571609632446,
"grad_norm": 0.39434677362442017,
"learning_rate": 0.00017211312681792958,
"loss": 0.5201,
"step": 392
},
{
"epoch": 0.24904942965779467,
"grad_norm": 0.11154969036579132,
"learning_rate": 0.00017197417537603827,
"loss": 0.6205,
"step": 393
},
{
"epoch": 0.24968314321926488,
"grad_norm": 0.07316391915082932,
"learning_rate": 0.00017183493500977278,
"loss": 0.5129,
"step": 394
},
{
"epoch": 0.2503168567807351,
"grad_norm": 0.08883780986070633,
"learning_rate": 0.00017169540627808274,
"loss": 0.5036,
"step": 395
},
{
"epoch": 0.2509505703422053,
"grad_norm": 0.07377318292856216,
"learning_rate": 0.00017155558974107536,
"loss": 0.591,
"step": 396
},
{
"epoch": 0.25158428390367554,
"grad_norm": 0.064984992146492,
"learning_rate": 0.00017141548596001305,
"loss": 0.645,
"step": 397
},
{
"epoch": 0.2522179974651457,
"grad_norm": 0.07279626280069351,
"learning_rate": 0.00017127509549731148,
"loss": 0.5108,
"step": 398
},
{
"epoch": 0.25285171102661597,
"grad_norm": 0.06948740035295486,
"learning_rate": 0.000171134418916537,
"loss": 0.4959,
"step": 399
},
{
"epoch": 0.2534854245880862,
"grad_norm": 1.0025055408477783,
"learning_rate": 0.00017099345678240452,
"loss": 0.5248,
"step": 400
},
{
"epoch": 0.2541191381495564,
"grad_norm": 0.34188470244407654,
"learning_rate": 0.00017085220966077538,
"loss": 0.5588,
"step": 401
},
{
"epoch": 0.25475285171102663,
"grad_norm": 0.04984923452138901,
"learning_rate": 0.00017071067811865476,
"loss": 0.4033,
"step": 402
},
{
"epoch": 0.2553865652724968,
"grad_norm": 0.05613204464316368,
"learning_rate": 0.0001705688627241897,
"loss": 0.5774,
"step": 403
},
{
"epoch": 0.25602027883396705,
"grad_norm": 0.058507829904556274,
"learning_rate": 0.0001704267640466667,
"loss": 0.52,
"step": 404
},
{
"epoch": 0.25665399239543724,
"grad_norm": 0.23744581639766693,
"learning_rate": 0.00017028438265650933,
"loss": 0.6028,
"step": 405
},
{
"epoch": 0.2572877059569075,
"grad_norm": 0.11817914992570877,
"learning_rate": 0.00017014171912527616,
"loss": 0.5416,
"step": 406
},
{
"epoch": 0.2579214195183777,
"grad_norm": 0.29011303186416626,
"learning_rate": 0.00016999877402565833,
"loss": 0.4381,
"step": 407
},
{
"epoch": 0.2585551330798479,
"grad_norm": 0.06895189732313156,
"learning_rate": 0.00016985554793147727,
"loss": 0.5046,
"step": 408
},
{
"epoch": 0.25918884664131814,
"grad_norm": 0.059166181832551956,
"learning_rate": 0.00016971204141768233,
"loss": 0.582,
"step": 409
},
{
"epoch": 0.2598225602027883,
"grad_norm": 0.09994165599346161,
"learning_rate": 0.00016956825506034867,
"loss": 0.6042,
"step": 410
},
{
"epoch": 0.26045627376425856,
"grad_norm": 0.09195294976234436,
"learning_rate": 0.00016942418943667468,
"loss": 0.577,
"step": 411
},
{
"epoch": 0.26108998732572875,
"grad_norm": 0.08966407924890518,
"learning_rate": 0.00016927984512497992,
"loss": 0.5795,
"step": 412
},
{
"epoch": 0.261723700887199,
"grad_norm": 0.08420640975236893,
"learning_rate": 0.00016913522270470263,
"loss": 0.4446,
"step": 413
},
{
"epoch": 0.2623574144486692,
"grad_norm": 0.05902143940329552,
"learning_rate": 0.0001689903227563975,
"loss": 0.4458,
"step": 414
},
{
"epoch": 0.2629911280101394,
"grad_norm": 0.046236153692007065,
"learning_rate": 0.0001688451458617332,
"loss": 0.3762,
"step": 415
},
{
"epoch": 0.26362484157160965,
"grad_norm": 0.10383841395378113,
"learning_rate": 0.00016869969260349018,
"loss": 0.6076,
"step": 416
},
{
"epoch": 0.26425855513307983,
"grad_norm": 0.059753723442554474,
"learning_rate": 0.00016855396356555834,
"loss": 0.4116,
"step": 417
},
{
"epoch": 0.26489226869455007,
"grad_norm": 0.05825261399149895,
"learning_rate": 0.00016840795933293463,
"loss": 0.5377,
"step": 418
},
{
"epoch": 0.26552598225602025,
"grad_norm": 0.07149126380681992,
"learning_rate": 0.00016826168049172062,
"loss": 0.5946,
"step": 419
},
{
"epoch": 0.2661596958174905,
"grad_norm": 0.0636037141084671,
"learning_rate": 0.00016811512762912034,
"loss": 0.4232,
"step": 420
},
{
"epoch": 0.26679340937896073,
"grad_norm": 0.06662221997976303,
"learning_rate": 0.00016796830133343775,
"loss": 0.5406,
"step": 421
},
{
"epoch": 0.2674271229404309,
"grad_norm": 0.058340173214673996,
"learning_rate": 0.00016782120219407452,
"loss": 0.5402,
"step": 422
},
{
"epoch": 0.26806083650190116,
"grad_norm": 0.054275717586278915,
"learning_rate": 0.00016767383080152742,
"loss": 0.5215,
"step": 423
},
{
"epoch": 0.26869455006337134,
"grad_norm": 0.055525969713926315,
"learning_rate": 0.00016752618774738639,
"loss": 0.5743,
"step": 424
},
{
"epoch": 0.2693282636248416,
"grad_norm": 0.05762525647878647,
"learning_rate": 0.00016737827362433164,
"loss": 0.5806,
"step": 425
},
{
"epoch": 0.26996197718631176,
"grad_norm": 0.059116896241903305,
"learning_rate": 0.0001672300890261317,
"loss": 0.4828,
"step": 426
},
{
"epoch": 0.270595690747782,
"grad_norm": 0.046420734375715256,
"learning_rate": 0.00016708163454764075,
"loss": 0.4509,
"step": 427
},
{
"epoch": 0.27122940430925224,
"grad_norm": 0.11202160269021988,
"learning_rate": 0.00016693291078479638,
"loss": 0.5139,
"step": 428
},
{
"epoch": 0.2718631178707224,
"grad_norm": 0.08383259177207947,
"learning_rate": 0.00016678391833461722,
"loss": 0.7026,
"step": 429
},
{
"epoch": 0.27249683143219267,
"grad_norm": 0.058648403733968735,
"learning_rate": 0.0001666346577952004,
"loss": 0.4704,
"step": 430
},
{
"epoch": 0.27313054499366285,
"grad_norm": 0.08609268069267273,
"learning_rate": 0.0001664851297657193,
"loss": 0.5186,
"step": 431
},
{
"epoch": 0.2737642585551331,
"grad_norm": 0.10570003092288971,
"learning_rate": 0.00016633533484642103,
"loss": 0.4615,
"step": 432
},
{
"epoch": 0.2743979721166033,
"grad_norm": 0.09764793515205383,
"learning_rate": 0.00016618527363862408,
"loss": 0.4519,
"step": 433
},
{
"epoch": 0.2750316856780735,
"grad_norm": 0.08797989040613174,
"learning_rate": 0.00016603494674471593,
"loss": 0.6139,
"step": 434
},
{
"epoch": 0.27566539923954375,
"grad_norm": 0.0714520812034607,
"learning_rate": 0.0001658843547681506,
"loss": 0.5027,
"step": 435
},
{
"epoch": 0.27629911280101394,
"grad_norm": 0.08733757585287094,
"learning_rate": 0.00016573349831344616,
"loss": 0.4582,
"step": 436
},
{
"epoch": 0.2769328263624842,
"grad_norm": 0.0712830200791359,
"learning_rate": 0.00016558237798618245,
"loss": 0.4336,
"step": 437
},
{
"epoch": 0.27756653992395436,
"grad_norm": 0.06345337629318237,
"learning_rate": 0.00016543099439299844,
"loss": 0.4587,
"step": 438
},
{
"epoch": 0.2782002534854246,
"grad_norm": 0.06224706023931503,
"learning_rate": 0.0001652793481415901,
"loss": 0.5171,
"step": 439
},
{
"epoch": 0.2788339670468948,
"grad_norm": 0.0549205057322979,
"learning_rate": 0.00016512743984070769,
"loss": 0.5189,
"step": 440
},
{
"epoch": 0.279467680608365,
"grad_norm": 0.07211892306804657,
"learning_rate": 0.00016497527010015336,
"loss": 0.6118,
"step": 441
},
{
"epoch": 0.28010139416983526,
"grad_norm": 0.05902037024497986,
"learning_rate": 0.00016482283953077887,
"loss": 0.5376,
"step": 442
},
{
"epoch": 0.28073510773130544,
"grad_norm": 0.04935478791594505,
"learning_rate": 0.00016467014874448288,
"loss": 0.5468,
"step": 443
},
{
"epoch": 0.2813688212927757,
"grad_norm": 0.08219460397958755,
"learning_rate": 0.00016451719835420877,
"loss": 0.5723,
"step": 444
},
{
"epoch": 0.28200253485424587,
"grad_norm": 0.08607888221740723,
"learning_rate": 0.000164363988973942,
"loss": 0.4821,
"step": 445
},
{
"epoch": 0.2826362484157161,
"grad_norm": 0.05368666350841522,
"learning_rate": 0.00016421052121870755,
"loss": 0.4759,
"step": 446
},
{
"epoch": 0.2832699619771863,
"grad_norm": 0.09421613812446594,
"learning_rate": 0.00016405679570456782,
"loss": 0.4634,
"step": 447
},
{
"epoch": 0.28390367553865653,
"grad_norm": 0.06585177779197693,
"learning_rate": 0.0001639028130486198,
"loss": 0.5049,
"step": 448
},
{
"epoch": 0.28453738910012677,
"grad_norm": 0.07445032149553299,
"learning_rate": 0.00016374857386899268,
"loss": 0.6255,
"step": 449
},
{
"epoch": 0.28517110266159695,
"grad_norm": 0.05892190709710121,
"learning_rate": 0.00016359407878484552,
"loss": 0.5035,
"step": 450
},
{
"epoch": 0.2858048162230672,
"grad_norm": 0.08238600939512253,
"learning_rate": 0.00016343932841636456,
"loss": 0.4818,
"step": 451
},
{
"epoch": 0.2864385297845374,
"grad_norm": 0.0664915144443512,
"learning_rate": 0.00016328432338476084,
"loss": 0.4375,
"step": 452
},
{
"epoch": 0.2870722433460076,
"grad_norm": 0.04862099885940552,
"learning_rate": 0.00016312906431226773,
"loss": 0.4138,
"step": 453
},
{
"epoch": 0.2877059569074778,
"grad_norm": 0.04187007248401642,
"learning_rate": 0.00016297355182213837,
"loss": 0.3836,
"step": 454
},
{
"epoch": 0.28833967046894804,
"grad_norm": 0.05451095104217529,
"learning_rate": 0.00016281778653864316,
"loss": 0.4451,
"step": 455
},
{
"epoch": 0.2889733840304182,
"grad_norm": 0.061764512211084366,
"learning_rate": 0.0001626617690870673,
"loss": 0.6315,
"step": 456
},
{
"epoch": 0.28960709759188846,
"grad_norm": 0.05365981534123421,
"learning_rate": 0.0001625055000937083,
"loss": 0.4399,
"step": 457
},
{
"epoch": 0.2902408111533587,
"grad_norm": 0.10771326720714569,
"learning_rate": 0.00016234898018587337,
"loss": 0.5229,
"step": 458
},
{
"epoch": 0.2908745247148289,
"grad_norm": 0.05859148129820824,
"learning_rate": 0.000162192209991877,
"loss": 0.4254,
"step": 459
},
{
"epoch": 0.2915082382762991,
"grad_norm": 0.08183909952640533,
"learning_rate": 0.00016203519014103837,
"loss": 0.3658,
"step": 460
},
{
"epoch": 0.2921419518377693,
"grad_norm": 0.04404648020863533,
"learning_rate": 0.00016187792126367886,
"loss": 0.4138,
"step": 461
},
{
"epoch": 0.29277566539923955,
"grad_norm": 0.056379418820142746,
"learning_rate": 0.00016172040399111957,
"loss": 0.4781,
"step": 462
},
{
"epoch": 0.29340937896070973,
"grad_norm": 0.0440094955265522,
"learning_rate": 0.00016156263895567867,
"loss": 0.4623,
"step": 463
},
{
"epoch": 0.29404309252217997,
"grad_norm": 0.055651161819696426,
"learning_rate": 0.00016140462679066885,
"loss": 0.5002,
"step": 464
},
{
"epoch": 0.2946768060836502,
"grad_norm": 0.09338720887899399,
"learning_rate": 0.00016124636813039502,
"loss": 0.5199,
"step": 465
},
{
"epoch": 0.2953105196451204,
"grad_norm": 0.07024485617876053,
"learning_rate": 0.00016108786361015143,
"loss": 0.5378,
"step": 466
},
{
"epoch": 0.29594423320659063,
"grad_norm": 0.05211356282234192,
"learning_rate": 0.00016092911386621938,
"loss": 0.5895,
"step": 467
},
{
"epoch": 0.2965779467680608,
"grad_norm": 0.05571569502353668,
"learning_rate": 0.00016077011953586452,
"loss": 0.4952,
"step": 468
},
{
"epoch": 0.29721166032953106,
"grad_norm": 0.07663686573505402,
"learning_rate": 0.00016061088125733433,
"loss": 0.5341,
"step": 469
},
{
"epoch": 0.29784537389100124,
"grad_norm": 0.04910871386528015,
"learning_rate": 0.0001604513996698556,
"loss": 0.445,
"step": 470
},
{
"epoch": 0.2984790874524715,
"grad_norm": 0.07365076243877411,
"learning_rate": 0.0001602916754136318,
"loss": 0.5364,
"step": 471
},
{
"epoch": 0.2991128010139417,
"grad_norm": 0.08367875218391418,
"learning_rate": 0.00016013170912984058,
"loss": 0.5709,
"step": 472
},
{
"epoch": 0.2997465145754119,
"grad_norm": 0.06659605354070663,
"learning_rate": 0.00015997150146063115,
"loss": 0.5351,
"step": 473
},
{
"epoch": 0.30038022813688214,
"grad_norm": 0.05647695064544678,
"learning_rate": 0.00015981105304912162,
"loss": 0.4103,
"step": 474
},
{
"epoch": 0.3010139416983523,
"grad_norm": 0.05512802302837372,
"learning_rate": 0.0001596503645393966,
"loss": 0.4919,
"step": 475
},
{
"epoch": 0.30164765525982257,
"grad_norm": 0.07482268661260605,
"learning_rate": 0.0001594894365765045,
"loss": 0.5266,
"step": 476
},
{
"epoch": 0.30228136882129275,
"grad_norm": 0.08068813383579254,
"learning_rate": 0.000159328269806455,
"loss": 0.6268,
"step": 477
},
{
"epoch": 0.302915082382763,
"grad_norm": 0.05029362812638283,
"learning_rate": 0.00015916686487621635,
"loss": 0.4999,
"step": 478
},
{
"epoch": 0.30354879594423323,
"grad_norm": 0.0705760046839714,
"learning_rate": 0.00015900522243371282,
"loss": 0.5182,
"step": 479
},
{
"epoch": 0.3041825095057034,
"grad_norm": 0.20289281010627747,
"learning_rate": 0.00015884334312782223,
"loss": 0.6609,
"step": 480
},
{
"epoch": 0.30481622306717365,
"grad_norm": 0.05456344410777092,
"learning_rate": 0.00015868122760837313,
"loss": 0.4575,
"step": 481
},
{
"epoch": 0.30544993662864384,
"grad_norm": 0.06280402094125748,
"learning_rate": 0.00015851887652614237,
"loss": 0.4186,
"step": 482
},
{
"epoch": 0.3060836501901141,
"grad_norm": 0.06588494777679443,
"learning_rate": 0.0001583562905328524,
"loss": 0.5235,
"step": 483
},
{
"epoch": 0.30671736375158426,
"grad_norm": 0.14238761365413666,
"learning_rate": 0.00015819347028116858,
"loss": 0.5727,
"step": 484
},
{
"epoch": 0.3073510773130545,
"grad_norm": 0.0709756463766098,
"learning_rate": 0.0001580304164246968,
"loss": 0.4003,
"step": 485
},
{
"epoch": 0.30798479087452474,
"grad_norm": 0.3064410388469696,
"learning_rate": 0.0001578671296179806,
"loss": 0.524,
"step": 486
},
{
"epoch": 0.3086185044359949,
"grad_norm": 0.04714261740446091,
"learning_rate": 0.00015770361051649863,
"loss": 0.3965,
"step": 487
},
{
"epoch": 0.30925221799746516,
"grad_norm": 0.05930585786700249,
"learning_rate": 0.00015753985977666213,
"loss": 0.4562,
"step": 488
},
{
"epoch": 0.30988593155893535,
"grad_norm": 0.07817406952381134,
"learning_rate": 0.00015737587805581219,
"loss": 0.5846,
"step": 489
},
{
"epoch": 0.3105196451204056,
"grad_norm": 0.05352717638015747,
"learning_rate": 0.00015721166601221698,
"loss": 0.5899,
"step": 490
},
{
"epoch": 0.31115335868187577,
"grad_norm": 0.05995578318834305,
"learning_rate": 0.00015704722430506942,
"loss": 0.5521,
"step": 491
},
{
"epoch": 0.311787072243346,
"grad_norm": 0.15946877002716064,
"learning_rate": 0.00015688255359448428,
"loss": 0.6366,
"step": 492
},
{
"epoch": 0.31242078580481625,
"grad_norm": 0.06116756424307823,
"learning_rate": 0.00015671765454149559,
"loss": 0.4436,
"step": 493
},
{
"epoch": 0.31305449936628643,
"grad_norm": 0.272954523563385,
"learning_rate": 0.00015655252780805414,
"loss": 0.6512,
"step": 494
},
{
"epoch": 0.31368821292775667,
"grad_norm": 0.0462493859231472,
"learning_rate": 0.0001563871740570245,
"loss": 0.4075,
"step": 495
},
{
"epoch": 0.31432192648922685,
"grad_norm": 0.08116989582777023,
"learning_rate": 0.00015622159395218272,
"loss": 0.6353,
"step": 496
},
{
"epoch": 0.3149556400506971,
"grad_norm": 0.07837241142988205,
"learning_rate": 0.0001560557881582134,
"loss": 0.5087,
"step": 497
},
{
"epoch": 0.3155893536121673,
"grad_norm": 0.07096578180789948,
"learning_rate": 0.00015588975734070717,
"loss": 0.617,
"step": 498
},
{
"epoch": 0.3162230671736375,
"grad_norm": 0.07047011703252792,
"learning_rate": 0.0001557235021661579,
"loss": 0.6406,
"step": 499
},
{
"epoch": 0.31685678073510776,
"grad_norm": 0.06322109699249268,
"learning_rate": 0.00015555702330196023,
"loss": 0.5973,
"step": 500
},
{
"epoch": 0.31749049429657794,
"grad_norm": 0.1788979321718216,
"learning_rate": 0.00015539032141640658,
"loss": 0.6022,
"step": 501
},
{
"epoch": 0.3181242078580482,
"grad_norm": 0.05936092510819435,
"learning_rate": 0.00015522339717868476,
"loss": 0.4314,
"step": 502
},
{
"epoch": 0.31875792141951836,
"grad_norm": 0.05811009183526039,
"learning_rate": 0.00015505625125887508,
"loss": 0.5641,
"step": 503
},
{
"epoch": 0.3193916349809886,
"grad_norm": 0.11950580030679703,
"learning_rate": 0.00015488888432794784,
"loss": 0.5796,
"step": 504
},
{
"epoch": 0.3200253485424588,
"grad_norm": 0.04393857717514038,
"learning_rate": 0.00015472129705776047,
"loss": 0.3637,
"step": 505
},
{
"epoch": 0.320659062103929,
"grad_norm": 0.11919873207807541,
"learning_rate": 0.00015455349012105486,
"loss": 0.4967,
"step": 506
},
{
"epoch": 0.32129277566539927,
"grad_norm": 0.055687014013528824,
"learning_rate": 0.00015438546419145488,
"loss": 0.4932,
"step": 507
},
{
"epoch": 0.32192648922686945,
"grad_norm": 0.058437906205654144,
"learning_rate": 0.00015421721994346327,
"loss": 0.5351,
"step": 508
},
{
"epoch": 0.3225602027883397,
"grad_norm": 0.04726817458868027,
"learning_rate": 0.00015404875805245935,
"loss": 0.433,
"step": 509
},
{
"epoch": 0.3231939163498099,
"grad_norm": 0.04807078838348389,
"learning_rate": 0.00015388007919469603,
"loss": 0.4534,
"step": 510
},
{
"epoch": 0.3238276299112801,
"grad_norm": 0.07437839359045029,
"learning_rate": 0.00015371118404729716,
"loss": 0.584,
"step": 511
},
{
"epoch": 0.3244613434727503,
"grad_norm": 0.050413914024829865,
"learning_rate": 0.00015354207328825491,
"loss": 0.3788,
"step": 512
},
{
"epoch": 0.32509505703422054,
"grad_norm": 0.07370271533727646,
"learning_rate": 0.0001533727475964269,
"loss": 0.4768,
"step": 513
},
{
"epoch": 0.3257287705956908,
"grad_norm": 0.06317605078220367,
"learning_rate": 0.00015320320765153367,
"loss": 0.5665,
"step": 514
},
{
"epoch": 0.32636248415716096,
"grad_norm": 0.061747610569000244,
"learning_rate": 0.00015303345413415564,
"loss": 0.6061,
"step": 515
},
{
"epoch": 0.3269961977186312,
"grad_norm": 0.07719457149505615,
"learning_rate": 0.00015286348772573075,
"loss": 0.4041,
"step": 516
},
{
"epoch": 0.3276299112801014,
"grad_norm": 0.048449669033288956,
"learning_rate": 0.0001526933091085515,
"loss": 0.4865,
"step": 517
},
{
"epoch": 0.3282636248415716,
"grad_norm": 0.06786296516656876,
"learning_rate": 0.00015252291896576214,
"loss": 0.5036,
"step": 518
},
{
"epoch": 0.3288973384030418,
"grad_norm": 0.056538064032793045,
"learning_rate": 0.0001523523179813562,
"loss": 0.5077,
"step": 519
},
{
"epoch": 0.32953105196451205,
"grad_norm": 0.06674568355083466,
"learning_rate": 0.00015218150684017347,
"loss": 0.701,
"step": 520
},
{
"epoch": 0.33016476552598223,
"grad_norm": 0.07875782251358032,
"learning_rate": 0.00015201048622789747,
"loss": 0.5375,
"step": 521
},
{
"epoch": 0.33079847908745247,
"grad_norm": 0.06530767679214478,
"learning_rate": 0.00015183925683105254,
"loss": 0.5136,
"step": 522
},
{
"epoch": 0.3314321926489227,
"grad_norm": 0.06704816222190857,
"learning_rate": 0.00015166781933700105,
"loss": 0.6015,
"step": 523
},
{
"epoch": 0.3320659062103929,
"grad_norm": 0.061236705631017685,
"learning_rate": 0.00015149617443394094,
"loss": 0.5323,
"step": 524
},
{
"epoch": 0.33269961977186313,
"grad_norm": 0.11219301074743271,
"learning_rate": 0.00015132432281090256,
"loss": 0.6076,
"step": 525
},
{
"epoch": 0.3333333333333333,
"grad_norm": 0.04857495799660683,
"learning_rate": 0.00015115226515774618,
"loss": 0.4208,
"step": 526
},
{
"epoch": 0.33396704689480355,
"grad_norm": 0.04918389767408371,
"learning_rate": 0.0001509800021651591,
"loss": 0.5069,
"step": 527
},
{
"epoch": 0.33460076045627374,
"grad_norm": 0.06613993644714355,
"learning_rate": 0.00015080753452465296,
"loss": 0.5443,
"step": 528
},
{
"epoch": 0.335234474017744,
"grad_norm": 0.05695560947060585,
"learning_rate": 0.00015063486292856082,
"loss": 0.5632,
"step": 529
},
{
"epoch": 0.3358681875792142,
"grad_norm": 0.05377941578626633,
"learning_rate": 0.0001504619880700346,
"loss": 0.3954,
"step": 530
},
{
"epoch": 0.3365019011406844,
"grad_norm": 0.06934024393558502,
"learning_rate": 0.000150288910643042,
"loss": 0.5669,
"step": 531
},
{
"epoch": 0.33713561470215464,
"grad_norm": 0.10134469717741013,
"learning_rate": 0.00015011563134236408,
"loss": 0.5248,
"step": 532
},
{
"epoch": 0.3377693282636248,
"grad_norm": 0.11486341804265976,
"learning_rate": 0.00014994215086359212,
"loss": 0.6074,
"step": 533
},
{
"epoch": 0.33840304182509506,
"grad_norm": 0.07518647611141205,
"learning_rate": 0.00014976846990312514,
"loss": 0.5196,
"step": 534
},
{
"epoch": 0.33903675538656525,
"grad_norm": 0.06767034530639648,
"learning_rate": 0.0001495945891581668,
"loss": 0.4821,
"step": 535
},
{
"epoch": 0.3396704689480355,
"grad_norm": 0.047710105776786804,
"learning_rate": 0.00014942050932672277,
"loss": 0.4468,
"step": 536
},
{
"epoch": 0.3403041825095057,
"grad_norm": 0.10735978931188583,
"learning_rate": 0.000149246231107598,
"loss": 0.4851,
"step": 537
},
{
"epoch": 0.3409378960709759,
"grad_norm": 0.0501636303961277,
"learning_rate": 0.0001490717552003938,
"loss": 0.4831,
"step": 538
},
{
"epoch": 0.34157160963244615,
"grad_norm": 0.052001163363456726,
"learning_rate": 0.00014889708230550496,
"loss": 0.5206,
"step": 539
},
{
"epoch": 0.34220532319391633,
"grad_norm": 0.06634392589330673,
"learning_rate": 0.00014872221312411718,
"loss": 0.5051,
"step": 540
},
{
"epoch": 0.3428390367553866,
"grad_norm": 0.053568046540021896,
"learning_rate": 0.00014854714835820394,
"loss": 0.5257,
"step": 541
},
{
"epoch": 0.34347275031685676,
"grad_norm": 0.05587064474821091,
"learning_rate": 0.000148371888710524,
"loss": 0.5924,
"step": 542
},
{
"epoch": 0.344106463878327,
"grad_norm": 0.055588286370038986,
"learning_rate": 0.00014819643488461835,
"loss": 0.4242,
"step": 543
},
{
"epoch": 0.34474017743979724,
"grad_norm": 0.07102327048778534,
"learning_rate": 0.00014802078758480747,
"loss": 0.5229,
"step": 544
},
{
"epoch": 0.3453738910012674,
"grad_norm": 0.06629911810159683,
"learning_rate": 0.00014784494751618853,
"loss": 0.435,
"step": 545
},
{
"epoch": 0.34600760456273766,
"grad_norm": 0.054953474551439285,
"learning_rate": 0.00014766891538463254,
"loss": 0.5796,
"step": 546
},
{
"epoch": 0.34664131812420784,
"grad_norm": 0.05943427234888077,
"learning_rate": 0.00014749269189678142,
"loss": 0.427,
"step": 547
},
{
"epoch": 0.3472750316856781,
"grad_norm": 0.05509248375892639,
"learning_rate": 0.00014731627776004536,
"loss": 0.5456,
"step": 548
},
{
"epoch": 0.34790874524714827,
"grad_norm": 0.0867772102355957,
"learning_rate": 0.0001471396736825998,
"loss": 0.5649,
"step": 549
},
{
"epoch": 0.3485424588086185,
"grad_norm": 0.08892481029033661,
"learning_rate": 0.00014696288037338256,
"loss": 0.5489,
"step": 550
},
{
"epoch": 0.34917617237008874,
"grad_norm": 0.07534697651863098,
"learning_rate": 0.00014678589854209134,
"loss": 0.4728,
"step": 551
},
{
"epoch": 0.34980988593155893,
"grad_norm": 0.03929729387164116,
"learning_rate": 0.00014660872889918044,
"loss": 0.3527,
"step": 552
},
{
"epoch": 0.35044359949302917,
"grad_norm": 0.06847205758094788,
"learning_rate": 0.00014643137215585806,
"loss": 0.4204,
"step": 553
},
{
"epoch": 0.35107731305449935,
"grad_norm": 0.06959280371665955,
"learning_rate": 0.00014625382902408356,
"loss": 0.5043,
"step": 554
},
{
"epoch": 0.3517110266159696,
"grad_norm": 0.057750072330236435,
"learning_rate": 0.0001460761002165645,
"loss": 0.5717,
"step": 555
},
{
"epoch": 0.3523447401774398,
"grad_norm": 0.0640597864985466,
"learning_rate": 0.00014589818644675378,
"loss": 0.5691,
"step": 556
},
{
"epoch": 0.35297845373891,
"grad_norm": 0.05334803834557533,
"learning_rate": 0.0001457200884288468,
"loss": 0.4438,
"step": 557
},
{
"epoch": 0.35361216730038025,
"grad_norm": 0.050739504396915436,
"learning_rate": 0.0001455418068777786,
"loss": 0.4418,
"step": 558
},
{
"epoch": 0.35424588086185044,
"grad_norm": 0.04636020213365555,
"learning_rate": 0.00014536334250922093,
"loss": 0.3724,
"step": 559
},
{
"epoch": 0.3548795944233207,
"grad_norm": 0.04343942552804947,
"learning_rate": 0.00014518469603957943,
"loss": 0.3218,
"step": 560
},
{
"epoch": 0.35551330798479086,
"grad_norm": 0.06655412167310715,
"learning_rate": 0.00014500586818599076,
"loss": 0.5158,
"step": 561
},
{
"epoch": 0.3561470215462611,
"grad_norm": 0.06236552819609642,
"learning_rate": 0.0001448268596663197,
"loss": 0.5348,
"step": 562
},
{
"epoch": 0.3567807351077313,
"grad_norm": 0.0551675446331501,
"learning_rate": 0.00014464767119915629,
"loss": 0.5191,
"step": 563
},
{
"epoch": 0.3574144486692015,
"grad_norm": 0.0711677148938179,
"learning_rate": 0.00014446830350381293,
"loss": 0.5787,
"step": 564
},
{
"epoch": 0.35804816223067176,
"grad_norm": 0.05513966456055641,
"learning_rate": 0.00014428875730032145,
"loss": 0.4056,
"step": 565
},
{
"epoch": 0.35868187579214195,
"grad_norm": 0.07472972571849823,
"learning_rate": 0.00014410903330943029,
"loss": 0.4217,
"step": 566
},
{
"epoch": 0.3593155893536122,
"grad_norm": 0.05436578020453453,
"learning_rate": 0.00014392913225260153,
"loss": 0.5195,
"step": 567
},
{
"epoch": 0.35994930291508237,
"grad_norm": 0.14983688294887543,
"learning_rate": 0.00014374905485200817,
"loss": 0.6106,
"step": 568
},
{
"epoch": 0.3605830164765526,
"grad_norm": 0.09657621383666992,
"learning_rate": 0.00014356880183053104,
"loss": 0.5487,
"step": 569
},
{
"epoch": 0.3612167300380228,
"grad_norm": 0.06128871440887451,
"learning_rate": 0.00014338837391175582,
"loss": 0.2958,
"step": 570
},
{
"epoch": 0.36185044359949303,
"grad_norm": 0.3691087067127228,
"learning_rate": 0.00014320777181997052,
"loss": 0.4846,
"step": 571
},
{
"epoch": 0.36248415716096327,
"grad_norm": 0.07217471301555634,
"learning_rate": 0.00014302699628016208,
"loss": 0.4256,
"step": 572
},
{
"epoch": 0.36311787072243346,
"grad_norm": 0.05521377548575401,
"learning_rate": 0.00014284604801801396,
"loss": 0.48,
"step": 573
},
{
"epoch": 0.3637515842839037,
"grad_norm": 0.04929598793387413,
"learning_rate": 0.0001426649277599028,
"loss": 0.5303,
"step": 574
},
{
"epoch": 0.3643852978453739,
"grad_norm": 0.050052460283041,
"learning_rate": 0.00014248363623289574,
"loss": 0.4863,
"step": 575
},
{
"epoch": 0.3650190114068441,
"grad_norm": 0.04534770920872688,
"learning_rate": 0.0001423021741647474,
"loss": 0.5239,
"step": 576
},
{
"epoch": 0.3656527249683143,
"grad_norm": 0.07982175797224045,
"learning_rate": 0.0001421205422838971,
"loss": 0.5924,
"step": 577
},
{
"epoch": 0.36628643852978454,
"grad_norm": 0.04665097966790199,
"learning_rate": 0.0001419387413194657,
"loss": 0.4579,
"step": 578
},
{
"epoch": 0.3669201520912547,
"grad_norm": 0.0721178650856018,
"learning_rate": 0.0001417567720012529,
"loss": 0.5235,
"step": 579
},
{
"epoch": 0.36755386565272496,
"grad_norm": 0.04838218167424202,
"learning_rate": 0.00014157463505973418,
"loss": 0.4138,
"step": 580
},
{
"epoch": 0.3681875792141952,
"grad_norm": 0.07050075381994247,
"learning_rate": 0.00014139233122605798,
"loss": 0.5749,
"step": 581
},
{
"epoch": 0.3688212927756654,
"grad_norm": 0.07718097418546677,
"learning_rate": 0.00014120986123204257,
"loss": 0.5399,
"step": 582
},
{
"epoch": 0.3694550063371356,
"grad_norm": 0.08041960000991821,
"learning_rate": 0.00014102722581017332,
"loss": 0.4264,
"step": 583
},
{
"epoch": 0.3700887198986058,
"grad_norm": 0.08530323952436447,
"learning_rate": 0.00014084442569359964,
"loss": 0.4534,
"step": 584
},
{
"epoch": 0.37072243346007605,
"grad_norm": 0.0639512911438942,
"learning_rate": 0.00014066146161613208,
"loss": 0.4295,
"step": 585
},
{
"epoch": 0.37135614702154623,
"grad_norm": 0.06618323922157288,
"learning_rate": 0.00014047833431223938,
"loss": 0.6437,
"step": 586
},
{
"epoch": 0.3719898605830165,
"grad_norm": 0.057782579213380814,
"learning_rate": 0.00014029504451704557,
"loss": 0.4855,
"step": 587
},
{
"epoch": 0.3726235741444867,
"grad_norm": 0.04774455726146698,
"learning_rate": 0.00014011159296632678,
"loss": 0.3035,
"step": 588
},
{
"epoch": 0.3732572877059569,
"grad_norm": 0.05420040711760521,
"learning_rate": 0.00013992798039650872,
"loss": 0.4444,
"step": 589
},
{
"epoch": 0.37389100126742714,
"grad_norm": 0.06096061319112778,
"learning_rate": 0.00013974420754466328,
"loss": 0.5743,
"step": 590
},
{
"epoch": 0.3745247148288973,
"grad_norm": 0.055694580078125,
"learning_rate": 0.0001395602751485059,
"loss": 0.5652,
"step": 591
},
{
"epoch": 0.37515842839036756,
"grad_norm": 0.0731462761759758,
"learning_rate": 0.00013937618394639235,
"loss": 0.4977,
"step": 592
},
{
"epoch": 0.37579214195183774,
"grad_norm": 0.05172240361571312,
"learning_rate": 0.000139191934677316,
"loss": 0.524,
"step": 593
},
{
"epoch": 0.376425855513308,
"grad_norm": 0.05123208463191986,
"learning_rate": 0.00013900752808090468,
"loss": 0.5355,
"step": 594
},
{
"epoch": 0.3770595690747782,
"grad_norm": 0.056850165128707886,
"learning_rate": 0.00013882296489741783,
"loss": 0.4908,
"step": 595
},
{
"epoch": 0.3776932826362484,
"grad_norm": 0.06634749472141266,
"learning_rate": 0.00013863824586774344,
"loss": 0.4283,
"step": 596
},
{
"epoch": 0.37832699619771865,
"grad_norm": 0.04840132221579552,
"learning_rate": 0.00013845337173339507,
"loss": 0.4897,
"step": 597
},
{
"epoch": 0.37896070975918883,
"grad_norm": 0.0695575699210167,
"learning_rate": 0.000138268343236509,
"loss": 0.583,
"step": 598
},
{
"epoch": 0.37959442332065907,
"grad_norm": 0.048906922340393066,
"learning_rate": 0.00013808316111984107,
"loss": 0.4496,
"step": 599
},
{
"epoch": 0.38022813688212925,
"grad_norm": 0.05677906423807144,
"learning_rate": 0.0001378978261267639,
"loss": 0.4717,
"step": 600
},
{
"epoch": 0.3808618504435995,
"grad_norm": 0.10213559865951538,
"learning_rate": 0.0001377123390012637,
"loss": 0.6238,
"step": 601
},
{
"epoch": 0.38149556400506973,
"grad_norm": 0.050033628940582275,
"learning_rate": 0.00013752670048793744,
"loss": 0.4001,
"step": 602
},
{
"epoch": 0.3821292775665399,
"grad_norm": 0.07862118631601334,
"learning_rate": 0.00013734091133198975,
"loss": 0.5346,
"step": 603
},
{
"epoch": 0.38276299112801015,
"grad_norm": 0.053442683070898056,
"learning_rate": 0.00013715497227923006,
"loss": 0.4903,
"step": 604
},
{
"epoch": 0.38339670468948034,
"grad_norm": 0.06940152496099472,
"learning_rate": 0.00013696888407606952,
"loss": 0.568,
"step": 605
},
{
"epoch": 0.3840304182509506,
"grad_norm": 0.048307280987501144,
"learning_rate": 0.00013678264746951787,
"loss": 0.5245,
"step": 606
},
{
"epoch": 0.38466413181242076,
"grad_norm": 0.04498027265071869,
"learning_rate": 0.00013659626320718077,
"loss": 0.3682,
"step": 607
},
{
"epoch": 0.385297845373891,
"grad_norm": 0.05874482914805412,
"learning_rate": 0.0001364097320372565,
"loss": 0.5148,
"step": 608
},
{
"epoch": 0.38593155893536124,
"grad_norm": 0.04996568709611893,
"learning_rate": 0.00013622305470853313,
"loss": 0.4756,
"step": 609
},
{
"epoch": 0.3865652724968314,
"grad_norm": 0.07363967597484589,
"learning_rate": 0.00013603623197038536,
"loss": 0.5053,
"step": 610
},
{
"epoch": 0.38719898605830166,
"grad_norm": 0.0668586939573288,
"learning_rate": 0.00013584926457277168,
"loss": 0.5362,
"step": 611
},
{
"epoch": 0.38783269961977185,
"grad_norm": 0.06371022760868073,
"learning_rate": 0.0001356621532662313,
"loss": 0.5457,
"step": 612
},
{
"epoch": 0.3884664131812421,
"grad_norm": 0.07108695805072784,
"learning_rate": 0.00013547489880188108,
"loss": 0.5238,
"step": 613
},
{
"epoch": 0.38910012674271227,
"grad_norm": 0.05326547846198082,
"learning_rate": 0.00013528750193141255,
"loss": 0.4505,
"step": 614
},
{
"epoch": 0.3897338403041825,
"grad_norm": 0.08405181765556335,
"learning_rate": 0.0001350999634070889,
"loss": 0.6235,
"step": 615
},
{
"epoch": 0.39036755386565275,
"grad_norm": 0.05981157347559929,
"learning_rate": 0.000134912283981742,
"loss": 0.5175,
"step": 616
},
{
"epoch": 0.39100126742712293,
"grad_norm": 0.05275322496891022,
"learning_rate": 0.00013472446440876927,
"loss": 0.5536,
"step": 617
},
{
"epoch": 0.3916349809885932,
"grad_norm": 0.053324826061725616,
"learning_rate": 0.00013453650544213076,
"loss": 0.5926,
"step": 618
},
{
"epoch": 0.39226869455006336,
"grad_norm": 0.056955184787511826,
"learning_rate": 0.0001343484078363461,
"loss": 0.4393,
"step": 619
},
{
"epoch": 0.3929024081115336,
"grad_norm": 0.05232278257608414,
"learning_rate": 0.00013416017234649146,
"loss": 0.5163,
"step": 620
},
{
"epoch": 0.3935361216730038,
"grad_norm": 0.06405606865882874,
"learning_rate": 0.00013397179972819643,
"loss": 0.575,
"step": 621
},
{
"epoch": 0.394169835234474,
"grad_norm": 0.058417316526174545,
"learning_rate": 0.00013378329073764119,
"loss": 0.542,
"step": 622
},
{
"epoch": 0.39480354879594426,
"grad_norm": 0.05610906332731247,
"learning_rate": 0.00013359464613155325,
"loss": 0.4576,
"step": 623
},
{
"epoch": 0.39543726235741444,
"grad_norm": 0.06383884698152542,
"learning_rate": 0.00013340586666720457,
"loss": 0.5938,
"step": 624
},
{
"epoch": 0.3960709759188847,
"grad_norm": 0.05517081543803215,
"learning_rate": 0.0001332169531024085,
"loss": 0.4492,
"step": 625
},
{
"epoch": 0.39670468948035487,
"grad_norm": 0.07210738211870193,
"learning_rate": 0.00013302790619551674,
"loss": 0.6145,
"step": 626
},
{
"epoch": 0.3973384030418251,
"grad_norm": 0.06636934727430344,
"learning_rate": 0.00013283872670541604,
"loss": 0.4242,
"step": 627
},
{
"epoch": 0.3979721166032953,
"grad_norm": 0.07977598905563354,
"learning_rate": 0.00013264941539152566,
"loss": 0.5553,
"step": 628
},
{
"epoch": 0.39860583016476553,
"grad_norm": 0.056893352419137955,
"learning_rate": 0.00013245997301379383,
"loss": 0.4311,
"step": 629
},
{
"epoch": 0.39923954372623577,
"grad_norm": 1.9656810760498047,
"learning_rate": 0.000132270400332695,
"loss": 0.5208,
"step": 630
},
{
"epoch": 0.39987325728770595,
"grad_norm": 0.05810742825269699,
"learning_rate": 0.00013208069810922673,
"loss": 0.56,
"step": 631
},
{
"epoch": 0.4005069708491762,
"grad_norm": 0.08527707308530807,
"learning_rate": 0.00013189086710490647,
"loss": 0.5094,
"step": 632
},
{
"epoch": 0.4011406844106464,
"grad_norm": 0.07540644705295563,
"learning_rate": 0.00013170090808176883,
"loss": 0.5193,
"step": 633
},
{
"epoch": 0.4017743979721166,
"grad_norm": 0.08294139802455902,
"learning_rate": 0.0001315108218023621,
"loss": 0.5538,
"step": 634
},
{
"epoch": 0.4024081115335868,
"grad_norm": 0.07025711983442307,
"learning_rate": 0.00013132060902974554,
"loss": 0.5451,
"step": 635
},
{
"epoch": 0.40304182509505704,
"grad_norm": 0.05808630213141441,
"learning_rate": 0.00013113027052748615,
"loss": 0.5342,
"step": 636
},
{
"epoch": 0.4036755386565273,
"grad_norm": 0.040730297565460205,
"learning_rate": 0.0001309398070596557,
"loss": 0.4434,
"step": 637
},
{
"epoch": 0.40430925221799746,
"grad_norm": 0.06423351913690567,
"learning_rate": 0.00013074921939082757,
"loss": 0.5463,
"step": 638
},
{
"epoch": 0.4049429657794677,
"grad_norm": 0.07848164439201355,
"learning_rate": 0.00013055850828607368,
"loss": 0.651,
"step": 639
},
{
"epoch": 0.4055766793409379,
"grad_norm": 0.08495569974184036,
"learning_rate": 0.00013036767451096148,
"loss": 0.4675,
"step": 640
},
{
"epoch": 0.4062103929024081,
"grad_norm": 0.06640883535146713,
"learning_rate": 0.0001301767188315509,
"loss": 0.5261,
"step": 641
},
{
"epoch": 0.4068441064638783,
"grad_norm": 0.04708843678236008,
"learning_rate": 0.00012998564201439116,
"loss": 0.3417,
"step": 642
},
{
"epoch": 0.40747782002534855,
"grad_norm": 0.09854655712842941,
"learning_rate": 0.00012979444482651782,
"loss": 0.6236,
"step": 643
},
{
"epoch": 0.40811153358681873,
"grad_norm": 0.11556591838598251,
"learning_rate": 0.00012960312803544962,
"loss": 0.6022,
"step": 644
},
{
"epoch": 0.40874524714828897,
"grad_norm": 0.922315776348114,
"learning_rate": 0.00012941169240918534,
"loss": 0.4034,
"step": 645
},
{
"epoch": 0.4093789607097592,
"grad_norm": 0.08266003429889679,
"learning_rate": 0.00012922013871620095,
"loss": 0.5455,
"step": 646
},
{
"epoch": 0.4100126742712294,
"grad_norm": 0.05183318257331848,
"learning_rate": 0.00012902846772544624,
"loss": 0.437,
"step": 647
},
{
"epoch": 0.41064638783269963,
"grad_norm": 0.10581205785274506,
"learning_rate": 0.00012883668020634195,
"loss": 0.5762,
"step": 648
},
{
"epoch": 0.4112801013941698,
"grad_norm": 0.06646697223186493,
"learning_rate": 0.00012864477692877657,
"loss": 0.5462,
"step": 649
},
{
"epoch": 0.41191381495564006,
"grad_norm": 0.10537492483854294,
"learning_rate": 0.00012845275866310324,
"loss": 0.5098,
"step": 650
},
{
"epoch": 0.41254752851711024,
"grad_norm": 0.07540510594844818,
"learning_rate": 0.0001282606261801368,
"loss": 0.6208,
"step": 651
},
{
"epoch": 0.4131812420785805,
"grad_norm": 0.06597273051738739,
"learning_rate": 0.0001280683802511504,
"loss": 0.5896,
"step": 652
},
{
"epoch": 0.4138149556400507,
"grad_norm": 0.060704171657562256,
"learning_rate": 0.0001278760216478728,
"loss": 0.4844,
"step": 653
},
{
"epoch": 0.4144486692015209,
"grad_norm": 0.07420588284730911,
"learning_rate": 0.00012768355114248494,
"loss": 0.5673,
"step": 654
},
{
"epoch": 0.41508238276299114,
"grad_norm": 0.06360962241888046,
"learning_rate": 0.00012749096950761702,
"loss": 0.5322,
"step": 655
},
{
"epoch": 0.4157160963244613,
"grad_norm": 0.0631156638264656,
"learning_rate": 0.00012729827751634533,
"loss": 0.4863,
"step": 656
},
{
"epoch": 0.41634980988593157,
"grad_norm": 0.06497811526060104,
"learning_rate": 0.00012710547594218917,
"loss": 0.5775,
"step": 657
},
{
"epoch": 0.41698352344740175,
"grad_norm": 0.07515639066696167,
"learning_rate": 0.00012691256555910768,
"loss": 0.5207,
"step": 658
},
{
"epoch": 0.417617237008872,
"grad_norm": 0.073845274746418,
"learning_rate": 0.0001267195471414969,
"loss": 0.5306,
"step": 659
},
{
"epoch": 0.41825095057034223,
"grad_norm": 0.0654008612036705,
"learning_rate": 0.0001265264214641864,
"loss": 0.4677,
"step": 660
},
{
"epoch": 0.4188846641318124,
"grad_norm": 0.043669626116752625,
"learning_rate": 0.00012633318930243648,
"loss": 0.4221,
"step": 661
},
{
"epoch": 0.41951837769328265,
"grad_norm": 0.047917358577251434,
"learning_rate": 0.00012613985143193482,
"loss": 0.3635,
"step": 662
},
{
"epoch": 0.42015209125475284,
"grad_norm": 0.06635928153991699,
"learning_rate": 0.0001259464086287934,
"loss": 0.5453,
"step": 663
},
{
"epoch": 0.4207858048162231,
"grad_norm": 0.05781178921461105,
"learning_rate": 0.0001257528616695455,
"loss": 0.5,
"step": 664
},
{
"epoch": 0.42141951837769326,
"grad_norm": 0.0605790875852108,
"learning_rate": 0.00012555921133114247,
"loss": 0.5034,
"step": 665
},
{
"epoch": 0.4220532319391635,
"grad_norm": 0.04980487376451492,
"learning_rate": 0.00012536545839095074,
"loss": 0.4347,
"step": 666
},
{
"epoch": 0.42268694550063374,
"grad_norm": 0.06540601700544357,
"learning_rate": 0.00012517160362674848,
"loss": 0.5351,
"step": 667
},
{
"epoch": 0.4233206590621039,
"grad_norm": 0.049716752022504807,
"learning_rate": 0.0001249776478167227,
"loss": 0.4476,
"step": 668
},
{
"epoch": 0.42395437262357416,
"grad_norm": 0.10267884284257889,
"learning_rate": 0.00012478359173946602,
"loss": 0.5616,
"step": 669
},
{
"epoch": 0.42458808618504434,
"grad_norm": 0.05907197296619415,
"learning_rate": 0.00012458943617397344,
"loss": 0.4403,
"step": 670
},
{
"epoch": 0.4252217997465146,
"grad_norm": 0.09869077801704407,
"learning_rate": 0.0001243951818996396,
"loss": 0.6336,
"step": 671
},
{
"epoch": 0.42585551330798477,
"grad_norm": 0.07539843767881393,
"learning_rate": 0.00012420082969625518,
"loss": 0.6676,
"step": 672
},
{
"epoch": 0.426489226869455,
"grad_norm": 0.09385417401790619,
"learning_rate": 0.00012400638034400395,
"loss": 0.5714,
"step": 673
},
{
"epoch": 0.42712294043092525,
"grad_norm": 0.06782330572605133,
"learning_rate": 0.00012381183462345982,
"loss": 0.4956,
"step": 674
},
{
"epoch": 0.42775665399239543,
"grad_norm": 0.06100660189986229,
"learning_rate": 0.00012361719331558345,
"loss": 0.4217,
"step": 675
},
{
"epoch": 0.42839036755386567,
"grad_norm": 0.09908254444599152,
"learning_rate": 0.00012342245720171918,
"loss": 0.5405,
"step": 676
},
{
"epoch": 0.42902408111533585,
"grad_norm": 0.05237731710076332,
"learning_rate": 0.00012322762706359203,
"loss": 0.5044,
"step": 677
},
{
"epoch": 0.4296577946768061,
"grad_norm": 0.04910963028669357,
"learning_rate": 0.00012303270368330439,
"loss": 0.5073,
"step": 678
},
{
"epoch": 0.4302915082382763,
"grad_norm": 0.06268120557069778,
"learning_rate": 0.00012283768784333293,
"loss": 0.5736,
"step": 679
},
{
"epoch": 0.4309252217997465,
"grad_norm": 0.05207136273384094,
"learning_rate": 0.00012264258032652559,
"loss": 0.5319,
"step": 680
},
{
"epoch": 0.43155893536121676,
"grad_norm": 0.09583932906389236,
"learning_rate": 0.00012244738191609814,
"loss": 0.5891,
"step": 681
},
{
"epoch": 0.43219264892268694,
"grad_norm": 0.06307169795036316,
"learning_rate": 0.00012225209339563145,
"loss": 0.556,
"step": 682
},
{
"epoch": 0.4328263624841572,
"grad_norm": 0.062134500592947006,
"learning_rate": 0.00012205671554906794,
"loss": 0.5607,
"step": 683
},
{
"epoch": 0.43346007604562736,
"grad_norm": 0.04890581965446472,
"learning_rate": 0.00012186124916070867,
"loss": 0.4789,
"step": 684
},
{
"epoch": 0.4340937896070976,
"grad_norm": 0.04669584706425667,
"learning_rate": 0.00012166569501521017,
"loss": 0.4784,
"step": 685
},
{
"epoch": 0.4347275031685678,
"grad_norm": 0.05782284587621689,
"learning_rate": 0.00012147005389758117,
"loss": 0.5761,
"step": 686
},
{
"epoch": 0.435361216730038,
"grad_norm": 0.07015878707170486,
"learning_rate": 0.00012127432659317956,
"loss": 0.5462,
"step": 687
},
{
"epoch": 0.43599493029150826,
"grad_norm": 0.05989618971943855,
"learning_rate": 0.00012107851388770928,
"loss": 0.4671,
"step": 688
},
{
"epoch": 0.43662864385297845,
"grad_norm": 0.05732743442058563,
"learning_rate": 0.000120882616567217,
"loss": 0.4952,
"step": 689
},
{
"epoch": 0.4372623574144487,
"grad_norm": 0.06397297978401184,
"learning_rate": 0.00012068663541808909,
"loss": 0.5001,
"step": 690
},
{
"epoch": 0.43789607097591887,
"grad_norm": 0.05474892258644104,
"learning_rate": 0.00012049057122704846,
"loss": 0.4371,
"step": 691
},
{
"epoch": 0.4385297845373891,
"grad_norm": 0.0542195625603199,
"learning_rate": 0.00012029442478115129,
"loss": 0.4027,
"step": 692
},
{
"epoch": 0.4391634980988593,
"grad_norm": 0.0857028216123581,
"learning_rate": 0.00012009819686778408,
"loss": 0.5752,
"step": 693
},
{
"epoch": 0.43979721166032953,
"grad_norm": 0.07950462400913239,
"learning_rate": 0.00011990188827466025,
"loss": 0.4821,
"step": 694
},
{
"epoch": 0.4404309252217998,
"grad_norm": 0.13862280547618866,
"learning_rate": 0.00011970549978981715,
"loss": 0.5725,
"step": 695
},
{
"epoch": 0.44106463878326996,
"grad_norm": 0.06896214932203293,
"learning_rate": 0.00011950903220161285,
"loss": 0.5461,
"step": 696
},
{
"epoch": 0.4416983523447402,
"grad_norm": 0.05688636004924774,
"learning_rate": 0.00011931248629872287,
"loss": 0.6257,
"step": 697
},
{
"epoch": 0.4423320659062104,
"grad_norm": 0.07330068945884705,
"learning_rate": 0.00011911586287013725,
"loss": 0.4781,
"step": 698
},
{
"epoch": 0.4429657794676806,
"grad_norm": 0.057357531040906906,
"learning_rate": 0.0001189191627051571,
"loss": 0.3767,
"step": 699
},
{
"epoch": 0.4435994930291508,
"grad_norm": 0.05856744199991226,
"learning_rate": 0.00011872238659339168,
"loss": 0.5233,
"step": 700
},
{
"epoch": 0.44423320659062104,
"grad_norm": 0.04932614043354988,
"learning_rate": 0.00011852553532475503,
"loss": 0.5493,
"step": 701
},
{
"epoch": 0.4448669201520912,
"grad_norm": 0.10165086388587952,
"learning_rate": 0.00011832860968946297,
"loss": 0.626,
"step": 702
},
{
"epoch": 0.44550063371356147,
"grad_norm": 0.059510741382837296,
"learning_rate": 0.00011813161047802985,
"loss": 0.4979,
"step": 703
},
{
"epoch": 0.4461343472750317,
"grad_norm": 0.059596769511699677,
"learning_rate": 0.00011793453848126526,
"loss": 0.5903,
"step": 704
},
{
"epoch": 0.4467680608365019,
"grad_norm": 0.043714553117752075,
"learning_rate": 0.00011773739449027108,
"loss": 0.4347,
"step": 705
},
{
"epoch": 0.44740177439797213,
"grad_norm": 0.06549560278654099,
"learning_rate": 0.00011754017929643817,
"loss": 0.3608,
"step": 706
},
{
"epoch": 0.4480354879594423,
"grad_norm": 0.07389537245035172,
"learning_rate": 0.00011734289369144323,
"loss": 0.6457,
"step": 707
},
{
"epoch": 0.44866920152091255,
"grad_norm": 0.0611582025885582,
"learning_rate": 0.00011714553846724558,
"loss": 0.4182,
"step": 708
},
{
"epoch": 0.44930291508238274,
"grad_norm": 0.06682246923446655,
"learning_rate": 0.00011694811441608402,
"loss": 0.4601,
"step": 709
},
{
"epoch": 0.449936628643853,
"grad_norm": 0.05429236590862274,
"learning_rate": 0.00011675062233047364,
"loss": 0.5933,
"step": 710
},
{
"epoch": 0.4505703422053232,
"grad_norm": 0.07824891060590744,
"learning_rate": 0.00011655306300320268,
"loss": 0.6553,
"step": 711
},
{
"epoch": 0.4512040557667934,
"grad_norm": 0.0523335300385952,
"learning_rate": 0.0001163554372273292,
"loss": 0.382,
"step": 712
},
{
"epoch": 0.45183776932826364,
"grad_norm": 0.0779106542468071,
"learning_rate": 0.00011615774579617817,
"loss": 0.5208,
"step": 713
},
{
"epoch": 0.4524714828897338,
"grad_norm": 0.05331442877650261,
"learning_rate": 0.00011595998950333793,
"loss": 0.4668,
"step": 714
},
{
"epoch": 0.45310519645120406,
"grad_norm": 0.077408067882061,
"learning_rate": 0.00011576216914265734,
"loss": 0.4491,
"step": 715
},
{
"epoch": 0.45373891001267425,
"grad_norm": 0.2051779180765152,
"learning_rate": 0.00011556428550824237,
"loss": 0.5396,
"step": 716
},
{
"epoch": 0.4543726235741445,
"grad_norm": 0.052188027650117874,
"learning_rate": 0.000115366339394453,
"loss": 0.5815,
"step": 717
},
{
"epoch": 0.4550063371356147,
"grad_norm": 0.060880374163389206,
"learning_rate": 0.0001151683315959001,
"loss": 0.5019,
"step": 718
},
{
"epoch": 0.4556400506970849,
"grad_norm": 0.10370609164237976,
"learning_rate": 0.000114970262907442,
"loss": 0.5166,
"step": 719
},
{
"epoch": 0.45627376425855515,
"grad_norm": 0.059755194932222366,
"learning_rate": 0.00011477213412418157,
"loss": 0.5363,
"step": 720
},
{
"epoch": 0.45690747782002533,
"grad_norm": 0.05834079161286354,
"learning_rate": 0.00011457394604146294,
"loss": 0.487,
"step": 721
},
{
"epoch": 0.45754119138149557,
"grad_norm": 0.07119245082139969,
"learning_rate": 0.00011437569945486819,
"loss": 0.5711,
"step": 722
},
{
"epoch": 0.45817490494296575,
"grad_norm": 0.06131361797451973,
"learning_rate": 0.00011417739516021428,
"loss": 0.5226,
"step": 723
},
{
"epoch": 0.458808618504436,
"grad_norm": 0.04943651333451271,
"learning_rate": 0.00011397903395354996,
"loss": 0.4307,
"step": 724
},
{
"epoch": 0.45944233206590623,
"grad_norm": 0.046283356845378876,
"learning_rate": 0.00011378061663115222,
"loss": 0.3834,
"step": 725
},
{
"epoch": 0.4600760456273764,
"grad_norm": 0.0585121251642704,
"learning_rate": 0.00011358214398952347,
"loss": 0.6028,
"step": 726
},
{
"epoch": 0.46070975918884666,
"grad_norm": 0.08686511963605881,
"learning_rate": 0.00011338361682538811,
"loss": 0.4879,
"step": 727
},
{
"epoch": 0.46134347275031684,
"grad_norm": 0.07081152498722076,
"learning_rate": 0.00011318503593568948,
"loss": 0.6132,
"step": 728
},
{
"epoch": 0.4619771863117871,
"grad_norm": 0.05887436121702194,
"learning_rate": 0.00011298640211758648,
"loss": 0.5707,
"step": 729
},
{
"epoch": 0.46261089987325726,
"grad_norm": 0.06929212808609009,
"learning_rate": 0.00011278771616845061,
"loss": 0.449,
"step": 730
},
{
"epoch": 0.4632446134347275,
"grad_norm": 0.04306876286864281,
"learning_rate": 0.00011258897888586255,
"loss": 0.486,
"step": 731
},
{
"epoch": 0.46387832699619774,
"grad_norm": 0.05465447157621384,
"learning_rate": 0.00011239019106760908,
"loss": 0.4704,
"step": 732
},
{
"epoch": 0.4645120405576679,
"grad_norm": 0.058161042630672455,
"learning_rate": 0.00011219135351167979,
"loss": 0.5467,
"step": 733
},
{
"epoch": 0.46514575411913817,
"grad_norm": 0.06773436069488525,
"learning_rate": 0.00011199246701626405,
"loss": 0.5329,
"step": 734
},
{
"epoch": 0.46577946768060835,
"grad_norm": 0.04506424069404602,
"learning_rate": 0.00011179353237974756,
"loss": 0.4359,
"step": 735
},
{
"epoch": 0.4664131812420786,
"grad_norm": 0.05979963019490242,
"learning_rate": 0.00011159455040070936,
"loss": 0.5445,
"step": 736
},
{
"epoch": 0.4670468948035488,
"grad_norm": 0.0482424721121788,
"learning_rate": 0.00011139552187791848,
"loss": 0.4957,
"step": 737
},
{
"epoch": 0.467680608365019,
"grad_norm": 0.05097084492444992,
"learning_rate": 0.00011119644761033078,
"loss": 0.4642,
"step": 738
},
{
"epoch": 0.46831432192648925,
"grad_norm": 0.05539529025554657,
"learning_rate": 0.00011099732839708586,
"loss": 0.4227,
"step": 739
},
{
"epoch": 0.46894803548795944,
"grad_norm": 0.06280332803726196,
"learning_rate": 0.0001107981650375036,
"loss": 0.5842,
"step": 740
},
{
"epoch": 0.4695817490494297,
"grad_norm": 0.05138114467263222,
"learning_rate": 0.00011059895833108119,
"loss": 0.5681,
"step": 741
},
{
"epoch": 0.47021546261089986,
"grad_norm": 0.058239031583070755,
"learning_rate": 0.0001103997090774898,
"loss": 0.5582,
"step": 742
},
{
"epoch": 0.4708491761723701,
"grad_norm": 0.06877847760915756,
"learning_rate": 0.00011020041807657138,
"loss": 0.5912,
"step": 743
},
{
"epoch": 0.4714828897338403,
"grad_norm": 0.05639166757464409,
"learning_rate": 0.00011000108612833551,
"loss": 0.5888,
"step": 744
},
{
"epoch": 0.4721166032953105,
"grad_norm": 0.05756942555308342,
"learning_rate": 0.0001098017140329561,
"loss": 0.5451,
"step": 745
},
{
"epoch": 0.47275031685678076,
"grad_norm": 0.057658858597278595,
"learning_rate": 0.00010960230259076818,
"loss": 0.4939,
"step": 746
},
{
"epoch": 0.47338403041825095,
"grad_norm": 0.05436946451663971,
"learning_rate": 0.00010940285260226488,
"loss": 0.5084,
"step": 747
},
{
"epoch": 0.4740177439797212,
"grad_norm": 0.06349501758813858,
"learning_rate": 0.00010920336486809393,
"loss": 0.6588,
"step": 748
},
{
"epoch": 0.47465145754119137,
"grad_norm": 0.06300094723701477,
"learning_rate": 0.00010900384018905463,
"loss": 0.5655,
"step": 749
},
{
"epoch": 0.4752851711026616,
"grad_norm": 0.06454197317361832,
"learning_rate": 0.00010880427936609455,
"loss": 0.5455,
"step": 750
},
{
"epoch": 0.4759188846641318,
"grad_norm": 0.06663431227207184,
"learning_rate": 0.0001086046832003064,
"loss": 0.5263,
"step": 751
},
{
"epoch": 0.47655259822560203,
"grad_norm": 0.06523749232292175,
"learning_rate": 0.00010840505249292476,
"loss": 0.4109,
"step": 752
},
{
"epoch": 0.47718631178707227,
"grad_norm": 0.066495381295681,
"learning_rate": 0.00010820538804532286,
"loss": 0.5395,
"step": 753
},
{
"epoch": 0.47782002534854245,
"grad_norm": 0.07330245524644852,
"learning_rate": 0.00010800569065900933,
"loss": 0.5392,
"step": 754
},
{
"epoch": 0.4784537389100127,
"grad_norm": 0.05793917551636696,
"learning_rate": 0.00010780596113562514,
"loss": 0.5323,
"step": 755
},
{
"epoch": 0.4790874524714829,
"grad_norm": 0.05146726965904236,
"learning_rate": 0.0001076062002769401,
"loss": 0.4334,
"step": 756
},
{
"epoch": 0.4797211660329531,
"grad_norm": 0.06809573620557785,
"learning_rate": 0.00010740640888484996,
"loss": 0.5635,
"step": 757
},
{
"epoch": 0.4803548795944233,
"grad_norm": 0.05846872553229332,
"learning_rate": 0.00010720658776137298,
"loss": 0.5631,
"step": 758
},
{
"epoch": 0.48098859315589354,
"grad_norm": 0.06662282347679138,
"learning_rate": 0.00010700673770864673,
"loss": 0.3119,
"step": 759
},
{
"epoch": 0.4816223067173637,
"grad_norm": 0.05133543908596039,
"learning_rate": 0.00010680685952892502,
"loss": 0.5222,
"step": 760
},
{
"epoch": 0.48225602027883396,
"grad_norm": 0.06625013798475266,
"learning_rate": 0.00010660695402457442,
"loss": 0.4834,
"step": 761
},
{
"epoch": 0.4828897338403042,
"grad_norm": 0.07142903655767441,
"learning_rate": 0.0001064070219980713,
"loss": 0.551,
"step": 762
},
{
"epoch": 0.4835234474017744,
"grad_norm": 0.06273732334375381,
"learning_rate": 0.00010620706425199849,
"loss": 0.6681,
"step": 763
},
{
"epoch": 0.4841571609632446,
"grad_norm": 0.05467168986797333,
"learning_rate": 0.000106007081589042,
"loss": 0.5253,
"step": 764
},
{
"epoch": 0.4847908745247148,
"grad_norm": 0.05966407433152199,
"learning_rate": 0.00010580707481198796,
"loss": 0.516,
"step": 765
},
{
"epoch": 0.48542458808618505,
"grad_norm": 0.0470612607896328,
"learning_rate": 0.00010560704472371919,
"loss": 0.4632,
"step": 766
},
{
"epoch": 0.48605830164765523,
"grad_norm": 0.0659315288066864,
"learning_rate": 0.00010540699212721219,
"loss": 0.5164,
"step": 767
},
{
"epoch": 0.4866920152091255,
"grad_norm": 0.061314892023801804,
"learning_rate": 0.0001052069178255337,
"loss": 0.5968,
"step": 768
},
{
"epoch": 0.4873257287705957,
"grad_norm": 0.05175092816352844,
"learning_rate": 0.00010500682262183772,
"loss": 0.4665,
"step": 769
},
{
"epoch": 0.4879594423320659,
"grad_norm": 0.04965231940150261,
"learning_rate": 0.00010480670731936208,
"loss": 0.5068,
"step": 770
},
{
"epoch": 0.48859315589353614,
"grad_norm": 0.06218743324279785,
"learning_rate": 0.0001046065727214253,
"loss": 0.4043,
"step": 771
},
{
"epoch": 0.4892268694550063,
"grad_norm": 0.05969774350523949,
"learning_rate": 0.00010440641963142336,
"loss": 0.4471,
"step": 772
},
{
"epoch": 0.48986058301647656,
"grad_norm": 0.04538511112332344,
"learning_rate": 0.00010420624885282653,
"loss": 0.4891,
"step": 773
},
{
"epoch": 0.49049429657794674,
"grad_norm": 0.06056825444102287,
"learning_rate": 0.00010400606118917593,
"loss": 0.452,
"step": 774
},
{
"epoch": 0.491128010139417,
"grad_norm": 0.04322752729058266,
"learning_rate": 0.00010380585744408065,
"loss": 0.4044,
"step": 775
},
{
"epoch": 0.4917617237008872,
"grad_norm": 0.05485018342733383,
"learning_rate": 0.0001036056384212142,
"loss": 0.4913,
"step": 776
},
{
"epoch": 0.4923954372623574,
"grad_norm": 0.045921441167593,
"learning_rate": 0.0001034054049243115,
"loss": 0.4713,
"step": 777
},
{
"epoch": 0.49302915082382764,
"grad_norm": 0.05987657979130745,
"learning_rate": 0.00010320515775716555,
"loss": 0.4339,
"step": 778
},
{
"epoch": 0.49366286438529783,
"grad_norm": 0.06263814866542816,
"learning_rate": 0.00010300489772362416,
"loss": 0.5853,
"step": 779
},
{
"epoch": 0.49429657794676807,
"grad_norm": 0.07110540568828583,
"learning_rate": 0.0001028046256275869,
"loss": 0.5899,
"step": 780
},
{
"epoch": 0.49493029150823825,
"grad_norm": 0.05008992552757263,
"learning_rate": 0.00010260434227300171,
"loss": 0.5061,
"step": 781
},
{
"epoch": 0.4955640050697085,
"grad_norm": 0.05329698696732521,
"learning_rate": 0.00010240404846386168,
"loss": 0.5073,
"step": 782
},
{
"epoch": 0.49619771863117873,
"grad_norm": 0.060529615730047226,
"learning_rate": 0.000102203745004202,
"loss": 0.5194,
"step": 783
},
{
"epoch": 0.4968314321926489,
"grad_norm": 0.05783366411924362,
"learning_rate": 0.00010200343269809642,
"loss": 0.5393,
"step": 784
},
{
"epoch": 0.49746514575411915,
"grad_norm": 0.05209111049771309,
"learning_rate": 0.00010180311234965433,
"loss": 0.4858,
"step": 785
},
{
"epoch": 0.49809885931558934,
"grad_norm": 0.05122411996126175,
"learning_rate": 0.0001016027847630174,
"loss": 0.4476,
"step": 786
},
{
"epoch": 0.4987325728770596,
"grad_norm": 0.06304119527339935,
"learning_rate": 0.00010140245074235624,
"loss": 0.5741,
"step": 787
},
{
"epoch": 0.49936628643852976,
"grad_norm": 0.09011054039001465,
"learning_rate": 0.00010120211109186747,
"loss": 0.3418,
"step": 788
},
{
"epoch": 0.5,
"grad_norm": 0.06214231252670288,
"learning_rate": 0.00010100176661577015,
"loss": 0.5186,
"step": 789
},
{
"epoch": 0.5006337135614702,
"grad_norm": 0.19616113603115082,
"learning_rate": 0.00010080141811830277,
"loss": 0.5121,
"step": 790
},
{
"epoch": 0.5012674271229405,
"grad_norm": 0.05623235926032066,
"learning_rate": 0.00010060106640372,
"loss": 0.4457,
"step": 791
},
{
"epoch": 0.5019011406844106,
"grad_norm": 0.06097716465592384,
"learning_rate": 0.00010040071227628938,
"loss": 0.4578,
"step": 792
},
{
"epoch": 0.5025348542458808,
"grad_norm": 0.042372945696115494,
"learning_rate": 0.00010020035654028816,
"loss": 0.3896,
"step": 793
},
{
"epoch": 0.5031685678073511,
"grad_norm": 0.05927233397960663,
"learning_rate": 0.0001,
"loss": 0.6026,
"step": 794
},
{
"epoch": 0.5038022813688213,
"grad_norm": 0.06227416917681694,
"learning_rate": 9.979964345971188e-05,
"loss": 0.4366,
"step": 795
},
{
"epoch": 0.5044359949302915,
"grad_norm": 0.055778343230485916,
"learning_rate": 9.959928772371061e-05,
"loss": 0.4425,
"step": 796
},
{
"epoch": 0.5050697084917617,
"grad_norm": 0.04457565397024155,
"learning_rate": 9.939893359628001e-05,
"loss": 0.5326,
"step": 797
},
{
"epoch": 0.5057034220532319,
"grad_norm": 0.05732344835996628,
"learning_rate": 9.919858188169724e-05,
"loss": 0.5296,
"step": 798
},
{
"epoch": 0.5063371356147022,
"grad_norm": 0.04832519590854645,
"learning_rate": 9.899823338422986e-05,
"loss": 0.3992,
"step": 799
},
{
"epoch": 0.5069708491761724,
"grad_norm": 0.06504333764314651,
"learning_rate": 9.879788890813255e-05,
"loss": 0.3772,
"step": 800
},
{
"epoch": 0.5076045627376425,
"grad_norm": 0.05304650217294693,
"learning_rate": 9.859754925764378e-05,
"loss": 0.5455,
"step": 801
},
{
"epoch": 0.5082382762991128,
"grad_norm": 0.04738354682922363,
"learning_rate": 9.839721523698264e-05,
"loss": 0.4221,
"step": 802
},
{
"epoch": 0.508871989860583,
"grad_norm": 0.061429157853126526,
"learning_rate": 9.819688765034568e-05,
"loss": 0.5197,
"step": 803
},
{
"epoch": 0.5095057034220533,
"grad_norm": 0.04687187448143959,
"learning_rate": 9.79965673019036e-05,
"loss": 0.417,
"step": 804
},
{
"epoch": 0.5101394169835235,
"grad_norm": 0.05944183096289635,
"learning_rate": 9.779625499579805e-05,
"loss": 0.6043,
"step": 805
},
{
"epoch": 0.5107731305449936,
"grad_norm": 0.05007549747824669,
"learning_rate": 9.75959515361383e-05,
"loss": 0.5161,
"step": 806
},
{
"epoch": 0.5114068441064639,
"grad_norm": 0.0616040863096714,
"learning_rate": 9.739565772699831e-05,
"loss": 0.6219,
"step": 807
},
{
"epoch": 0.5120405576679341,
"grad_norm": 0.23154355585575104,
"learning_rate": 9.719537437241312e-05,
"loss": 0.4653,
"step": 808
},
{
"epoch": 0.5126742712294043,
"grad_norm": 0.08757317066192627,
"learning_rate": 9.699510227637586e-05,
"loss": 0.7004,
"step": 809
},
{
"epoch": 0.5133079847908745,
"grad_norm": 0.053165238350629807,
"learning_rate": 9.679484224283449e-05,
"loss": 0.5367,
"step": 810
},
{
"epoch": 0.5139416983523447,
"grad_norm": 0.05361173674464226,
"learning_rate": 9.659459507568853e-05,
"loss": 0.5044,
"step": 811
},
{
"epoch": 0.514575411913815,
"grad_norm": 0.0656973198056221,
"learning_rate": 9.63943615787858e-05,
"loss": 0.5785,
"step": 812
},
{
"epoch": 0.5152091254752852,
"grad_norm": 0.056508004665374756,
"learning_rate": 9.619414255591937e-05,
"loss": 0.505,
"step": 813
},
{
"epoch": 0.5158428390367554,
"grad_norm": 0.061718232929706573,
"learning_rate": 9.599393881082408e-05,
"loss": 0.5194,
"step": 814
},
{
"epoch": 0.5164765525982256,
"grad_norm": 0.055572785437107086,
"learning_rate": 9.579375114717351e-05,
"loss": 0.4633,
"step": 815
},
{
"epoch": 0.5171102661596958,
"grad_norm": 0.0603361539542675,
"learning_rate": 9.559358036857663e-05,
"loss": 0.4628,
"step": 816
},
{
"epoch": 0.517743979721166,
"grad_norm": 0.08223170042037964,
"learning_rate": 9.53934272785747e-05,
"loss": 0.4932,
"step": 817
},
{
"epoch": 0.5183776932826363,
"grad_norm": 0.05056726187467575,
"learning_rate": 9.519329268063795e-05,
"loss": 0.5267,
"step": 818
},
{
"epoch": 0.5190114068441065,
"grad_norm": 0.0726744681596756,
"learning_rate": 9.499317737816229e-05,
"loss": 0.5233,
"step": 819
},
{
"epoch": 0.5196451204055766,
"grad_norm": 0.06118292361497879,
"learning_rate": 9.479308217446633e-05,
"loss": 0.5627,
"step": 820
},
{
"epoch": 0.5202788339670469,
"grad_norm": 0.05231308937072754,
"learning_rate": 9.459300787278785e-05,
"loss": 0.5238,
"step": 821
},
{
"epoch": 0.5209125475285171,
"grad_norm": 0.0555204376578331,
"learning_rate": 9.439295527628081e-05,
"loss": 0.5648,
"step": 822
},
{
"epoch": 0.5215462610899874,
"grad_norm": 0.056751273572444916,
"learning_rate": 9.419292518801205e-05,
"loss": 0.6158,
"step": 823
},
{
"epoch": 0.5221799746514575,
"grad_norm": 0.055247753858566284,
"learning_rate": 9.399291841095802e-05,
"loss": 0.5938,
"step": 824
},
{
"epoch": 0.5228136882129277,
"grad_norm": 0.05264151841402054,
"learning_rate": 9.379293574800154e-05,
"loss": 0.4908,
"step": 825
},
{
"epoch": 0.523447401774398,
"grad_norm": 0.06633622944355011,
"learning_rate": 9.359297800192872e-05,
"loss": 0.4516,
"step": 826
},
{
"epoch": 0.5240811153358682,
"grad_norm": 0.06326263397932053,
"learning_rate": 9.33930459754256e-05,
"loss": 0.4583,
"step": 827
},
{
"epoch": 0.5247148288973384,
"grad_norm": 0.061470355838537216,
"learning_rate": 9.319314047107504e-05,
"loss": 0.5209,
"step": 828
},
{
"epoch": 0.5253485424588086,
"grad_norm": 0.048166628926992416,
"learning_rate": 9.299326229135326e-05,
"loss": 0.5184,
"step": 829
},
{
"epoch": 0.5259822560202788,
"grad_norm": 0.09853006154298782,
"learning_rate": 9.279341223862705e-05,
"loss": 0.5219,
"step": 830
},
{
"epoch": 0.526615969581749,
"grad_norm": 0.5687222480773926,
"learning_rate": 9.259359111515006e-05,
"loss": 0.4086,
"step": 831
},
{
"epoch": 0.5272496831432193,
"grad_norm": 0.05580870062112808,
"learning_rate": 9.239379972305992e-05,
"loss": 0.492,
"step": 832
},
{
"epoch": 0.5278833967046895,
"grad_norm": 0.05025511607527733,
"learning_rate": 9.219403886437489e-05,
"loss": 0.5146,
"step": 833
},
{
"epoch": 0.5285171102661597,
"grad_norm": 0.05787106603384018,
"learning_rate": 9.199430934099068e-05,
"loss": 0.5356,
"step": 834
},
{
"epoch": 0.5291508238276299,
"grad_norm": 0.06410747766494751,
"learning_rate": 9.179461195467714e-05,
"loss": 0.6312,
"step": 835
},
{
"epoch": 0.5297845373891001,
"grad_norm": 0.053113870322704315,
"learning_rate": 9.159494750707526e-05,
"loss": 0.4838,
"step": 836
},
{
"epoch": 0.5304182509505704,
"grad_norm": 0.06018316373229027,
"learning_rate": 9.139531679969362e-05,
"loss": 0.4631,
"step": 837
},
{
"epoch": 0.5310519645120405,
"grad_norm": 0.05416072905063629,
"learning_rate": 9.119572063390549e-05,
"loss": 0.4439,
"step": 838
},
{
"epoch": 0.5316856780735107,
"grad_norm": 0.08766517043113708,
"learning_rate": 9.09961598109454e-05,
"loss": 0.5445,
"step": 839
},
{
"epoch": 0.532319391634981,
"grad_norm": 0.0619327537715435,
"learning_rate": 9.079663513190611e-05,
"loss": 0.5428,
"step": 840
},
{
"epoch": 0.5329531051964512,
"grad_norm": 0.059881288558244705,
"learning_rate": 9.059714739773516e-05,
"loss": 0.513,
"step": 841
},
{
"epoch": 0.5335868187579215,
"grad_norm": 0.06464383006095886,
"learning_rate": 9.039769740923183e-05,
"loss": 0.4746,
"step": 842
},
{
"epoch": 0.5342205323193916,
"grad_norm": 0.054081957787275314,
"learning_rate": 9.019828596704394e-05,
"loss": 0.391,
"step": 843
},
{
"epoch": 0.5348542458808618,
"grad_norm": 0.07097287476062775,
"learning_rate": 8.999891387166453e-05,
"loss": 0.5668,
"step": 844
},
{
"epoch": 0.5354879594423321,
"grad_norm": 0.050909094512462616,
"learning_rate": 8.979958192342862e-05,
"loss": 0.5574,
"step": 845
},
{
"epoch": 0.5361216730038023,
"grad_norm": 0.0605645477771759,
"learning_rate": 8.960029092251023e-05,
"loss": 0.5608,
"step": 846
},
{
"epoch": 0.5367553865652726,
"grad_norm": 0.05807255208492279,
"learning_rate": 8.940104166891885e-05,
"loss": 0.5057,
"step": 847
},
{
"epoch": 0.5373891001267427,
"grad_norm": 0.05229676514863968,
"learning_rate": 8.920183496249642e-05,
"loss": 0.4968,
"step": 848
},
{
"epoch": 0.5380228136882129,
"grad_norm": 0.05831581726670265,
"learning_rate": 8.900267160291416e-05,
"loss": 0.421,
"step": 849
},
{
"epoch": 0.5386565272496832,
"grad_norm": 0.04102315753698349,
"learning_rate": 8.880355238966923e-05,
"loss": 0.4176,
"step": 850
},
{
"epoch": 0.5392902408111534,
"grad_norm": 0.04635517671704292,
"learning_rate": 8.860447812208157e-05,
"loss": 0.4623,
"step": 851
},
{
"epoch": 0.5399239543726235,
"grad_norm": 0.08849713206291199,
"learning_rate": 8.840544959929065e-05,
"loss": 0.6421,
"step": 852
},
{
"epoch": 0.5405576679340938,
"grad_norm": 0.07401357591152191,
"learning_rate": 8.820646762025246e-05,
"loss": 0.4958,
"step": 853
},
{
"epoch": 0.541191381495564,
"grad_norm": 0.07079368084669113,
"learning_rate": 8.800753298373596e-05,
"loss": 0.4828,
"step": 854
},
{
"epoch": 0.5418250950570342,
"grad_norm": 0.06453298032283783,
"learning_rate": 8.780864648832022e-05,
"loss": 0.6269,
"step": 855
},
{
"epoch": 0.5424588086185045,
"grad_norm": 0.05445917323231697,
"learning_rate": 8.760980893239094e-05,
"loss": 0.5873,
"step": 856
},
{
"epoch": 0.5430925221799746,
"grad_norm": 0.047000445425510406,
"learning_rate": 8.741102111413748e-05,
"loss": 0.4938,
"step": 857
},
{
"epoch": 0.5437262357414449,
"grad_norm": 0.06307143718004227,
"learning_rate": 8.721228383154939e-05,
"loss": 0.602,
"step": 858
},
{
"epoch": 0.5443599493029151,
"grad_norm": 0.046326130628585815,
"learning_rate": 8.701359788241354e-05,
"loss": 0.453,
"step": 859
},
{
"epoch": 0.5449936628643853,
"grad_norm": 0.05878138169646263,
"learning_rate": 8.681496406431056e-05,
"loss": 0.5619,
"step": 860
},
{
"epoch": 0.5456273764258555,
"grad_norm": 0.06828006356954575,
"learning_rate": 8.66163831746119e-05,
"loss": 0.4723,
"step": 861
},
{
"epoch": 0.5462610899873257,
"grad_norm": 0.062354519963264465,
"learning_rate": 8.641785601047654e-05,
"loss": 0.5345,
"step": 862
},
{
"epoch": 0.5468948035487959,
"grad_norm": 0.052326980978250504,
"learning_rate": 8.621938336884781e-05,
"loss": 0.5096,
"step": 863
},
{
"epoch": 0.5475285171102662,
"grad_norm": 0.09620847553014755,
"learning_rate": 8.602096604645009e-05,
"loss": 0.6523,
"step": 864
},
{
"epoch": 0.5481622306717364,
"grad_norm": 0.07187427580356598,
"learning_rate": 8.58226048397857e-05,
"loss": 0.5051,
"step": 865
},
{
"epoch": 0.5487959442332065,
"grad_norm": 0.058141518384218216,
"learning_rate": 8.562430054513184e-05,
"loss": 0.501,
"step": 866
},
{
"epoch": 0.5494296577946768,
"grad_norm": 0.037818700075149536,
"learning_rate": 8.54260539585371e-05,
"loss": 0.2518,
"step": 867
},
{
"epoch": 0.550063371356147,
"grad_norm": 0.04658188298344612,
"learning_rate": 8.522786587581844e-05,
"loss": 0.4531,
"step": 868
},
{
"epoch": 0.5506970849176173,
"grad_norm": 0.04527122154831886,
"learning_rate": 8.502973709255804e-05,
"loss": 0.4592,
"step": 869
},
{
"epoch": 0.5513307984790875,
"grad_norm": 0.05705267935991287,
"learning_rate": 8.483166840409995e-05,
"loss": 0.4575,
"step": 870
},
{
"epoch": 0.5519645120405576,
"grad_norm": 0.08155850321054459,
"learning_rate": 8.463366060554698e-05,
"loss": 0.5167,
"step": 871
},
{
"epoch": 0.5525982256020279,
"grad_norm": 0.07388201355934143,
"learning_rate": 8.443571449175766e-05,
"loss": 0.6817,
"step": 872
},
{
"epoch": 0.5532319391634981,
"grad_norm": 0.06419550627470016,
"learning_rate": 8.423783085734268e-05,
"loss": 0.5468,
"step": 873
},
{
"epoch": 0.5538656527249683,
"grad_norm": 0.05985475331544876,
"learning_rate": 8.404001049666211e-05,
"loss": 0.5247,
"step": 874
},
{
"epoch": 0.5544993662864385,
"grad_norm": 0.05610859394073486,
"learning_rate": 8.384225420382185e-05,
"loss": 0.5088,
"step": 875
},
{
"epoch": 0.5551330798479087,
"grad_norm": 0.5789166688919067,
"learning_rate": 8.36445627726708e-05,
"loss": 0.5744,
"step": 876
},
{
"epoch": 0.555766793409379,
"grad_norm": 0.05248624086380005,
"learning_rate": 8.344693699679736e-05,
"loss": 0.4797,
"step": 877
},
{
"epoch": 0.5564005069708492,
"grad_norm": 0.06693774461746216,
"learning_rate": 8.324937766952638e-05,
"loss": 0.5354,
"step": 878
},
{
"epoch": 0.5570342205323194,
"grad_norm": 0.058544524013996124,
"learning_rate": 8.305188558391599e-05,
"loss": 0.602,
"step": 879
},
{
"epoch": 0.5576679340937896,
"grad_norm": 0.05111921206116676,
"learning_rate": 8.285446153275445e-05,
"loss": 0.4541,
"step": 880
},
{
"epoch": 0.5583016476552598,
"grad_norm": 0.0569741316139698,
"learning_rate": 8.265710630855677e-05,
"loss": 0.5306,
"step": 881
},
{
"epoch": 0.55893536121673,
"grad_norm": 0.13403062522411346,
"learning_rate": 8.245982070356185e-05,
"loss": 0.56,
"step": 882
},
{
"epoch": 0.5595690747782003,
"grad_norm": 0.07512082904577255,
"learning_rate": 8.226260550972895e-05,
"loss": 0.5951,
"step": 883
},
{
"epoch": 0.5602027883396705,
"grad_norm": 0.046271927654743195,
"learning_rate": 8.206546151873478e-05,
"loss": 0.436,
"step": 884
},
{
"epoch": 0.5608365019011406,
"grad_norm": 0.05913880839943886,
"learning_rate": 8.186838952197018e-05,
"loss": 0.5116,
"step": 885
},
{
"epoch": 0.5614702154626109,
"grad_norm": 0.05060280114412308,
"learning_rate": 8.167139031053705e-05,
"loss": 0.5245,
"step": 886
},
{
"epoch": 0.5621039290240811,
"grad_norm": 0.0638653039932251,
"learning_rate": 8.1474464675245e-05,
"loss": 0.5099,
"step": 887
},
{
"epoch": 0.5627376425855514,
"grad_norm": 0.04928203299641609,
"learning_rate": 8.127761340660835e-05,
"loss": 0.3581,
"step": 888
},
{
"epoch": 0.5633713561470215,
"grad_norm": 0.04772525653243065,
"learning_rate": 8.108083729484292e-05,
"loss": 0.4432,
"step": 889
},
{
"epoch": 0.5640050697084917,
"grad_norm": 0.0834617018699646,
"learning_rate": 8.08841371298628e-05,
"loss": 0.6493,
"step": 890
},
{
"epoch": 0.564638783269962,
"grad_norm": 0.06321214139461517,
"learning_rate": 8.068751370127712e-05,
"loss": 0.4376,
"step": 891
},
{
"epoch": 0.5652724968314322,
"grad_norm": 0.07898563891649246,
"learning_rate": 8.049096779838719e-05,
"loss": 0.3803,
"step": 892
},
{
"epoch": 0.5659062103929025,
"grad_norm": 0.061078350991010666,
"learning_rate": 8.029450021018287e-05,
"loss": 0.4417,
"step": 893
},
{
"epoch": 0.5665399239543726,
"grad_norm": 0.05912580341100693,
"learning_rate": 8.009811172533976e-05,
"loss": 0.4558,
"step": 894
},
{
"epoch": 0.5671736375158428,
"grad_norm": 0.06853251159191132,
"learning_rate": 7.990180313221596e-05,
"loss": 0.4647,
"step": 895
},
{
"epoch": 0.5678073510773131,
"grad_norm": 0.13536880910396576,
"learning_rate": 7.970557521884873e-05,
"loss": 0.4849,
"step": 896
},
{
"epoch": 0.5684410646387833,
"grad_norm": 0.051422230899333954,
"learning_rate": 7.950942877295155e-05,
"loss": 0.5153,
"step": 897
},
{
"epoch": 0.5690747782002535,
"grad_norm": 0.05563550814986229,
"learning_rate": 7.931336458191092e-05,
"loss": 0.4608,
"step": 898
},
{
"epoch": 0.5697084917617237,
"grad_norm": 0.05387943610548973,
"learning_rate": 7.911738343278304e-05,
"loss": 0.308,
"step": 899
},
{
"epoch": 0.5703422053231939,
"grad_norm": 0.05549965053796768,
"learning_rate": 7.892148611229075e-05,
"loss": 0.477,
"step": 900
},
{
"epoch": 0.5709759188846641,
"grad_norm": 0.06661087274551392,
"learning_rate": 7.872567340682045e-05,
"loss": 0.5179,
"step": 901
},
{
"epoch": 0.5716096324461344,
"grad_norm": 0.06925564259290695,
"learning_rate": 7.852994610241885e-05,
"loss": 0.4785,
"step": 902
},
{
"epoch": 0.5722433460076045,
"grad_norm": 0.05441868305206299,
"learning_rate": 7.833430498478988e-05,
"loss": 0.5596,
"step": 903
},
{
"epoch": 0.5728770595690748,
"grad_norm": 0.04862716421484947,
"learning_rate": 7.813875083929132e-05,
"loss": 0.4659,
"step": 904
},
{
"epoch": 0.573510773130545,
"grad_norm": 0.07547637820243835,
"learning_rate": 7.794328445093208e-05,
"loss": 0.4485,
"step": 905
},
{
"epoch": 0.5741444866920152,
"grad_norm": 0.08132816851139069,
"learning_rate": 7.774790660436858e-05,
"loss": 0.6294,
"step": 906
},
{
"epoch": 0.5747782002534855,
"grad_norm": 0.06841199100017548,
"learning_rate": 7.755261808390187e-05,
"loss": 0.4667,
"step": 907
},
{
"epoch": 0.5754119138149556,
"grad_norm": 0.05556390807032585,
"learning_rate": 7.735741967347445e-05,
"loss": 0.5166,
"step": 908
},
{
"epoch": 0.5760456273764258,
"grad_norm": 0.07941378653049469,
"learning_rate": 7.716231215666711e-05,
"loss": 0.4368,
"step": 909
},
{
"epoch": 0.5766793409378961,
"grad_norm": 0.08058507740497589,
"learning_rate": 7.696729631669564e-05,
"loss": 0.6772,
"step": 910
},
{
"epoch": 0.5773130544993663,
"grad_norm": 0.06999081373214722,
"learning_rate": 7.6772372936408e-05,
"loss": 0.6374,
"step": 911
},
{
"epoch": 0.5779467680608364,
"grad_norm": 0.05269391089677811,
"learning_rate": 7.657754279828083e-05,
"loss": 0.3222,
"step": 912
},
{
"epoch": 0.5785804816223067,
"grad_norm": 0.059798724949359894,
"learning_rate": 7.63828066844166e-05,
"loss": 0.5354,
"step": 913
},
{
"epoch": 0.5792141951837769,
"grad_norm": 0.05695294961333275,
"learning_rate": 7.618816537654018e-05,
"loss": 0.4552,
"step": 914
},
{
"epoch": 0.5798479087452472,
"grad_norm": 0.07460351288318634,
"learning_rate": 7.599361965599606e-05,
"loss": 0.581,
"step": 915
},
{
"epoch": 0.5804816223067174,
"grad_norm": 0.04292193427681923,
"learning_rate": 7.579917030374489e-05,
"loss": 0.435,
"step": 916
},
{
"epoch": 0.5811153358681875,
"grad_norm": 0.05156205967068672,
"learning_rate": 7.56048181003604e-05,
"loss": 0.5231,
"step": 917
},
{
"epoch": 0.5817490494296578,
"grad_norm": 0.05971655622124672,
"learning_rate": 7.541056382602657e-05,
"loss": 0.5196,
"step": 918
},
{
"epoch": 0.582382762991128,
"grad_norm": 0.06214692071080208,
"learning_rate": 7.521640826053404e-05,
"loss": 0.5237,
"step": 919
},
{
"epoch": 0.5830164765525983,
"grad_norm": 0.05921977758407593,
"learning_rate": 7.502235218327731e-05,
"loss": 0.5444,
"step": 920
},
{
"epoch": 0.5836501901140685,
"grad_norm": 0.05885602533817291,
"learning_rate": 7.482839637325153e-05,
"loss": 0.4045,
"step": 921
},
{
"epoch": 0.5842839036755386,
"grad_norm": 0.05014495924115181,
"learning_rate": 7.463454160904928e-05,
"loss": 0.4261,
"step": 922
},
{
"epoch": 0.5849176172370089,
"grad_norm": 0.07014278322458267,
"learning_rate": 7.444078866885753e-05,
"loss": 0.5934,
"step": 923
},
{
"epoch": 0.5855513307984791,
"grad_norm": 0.04919711500406265,
"learning_rate": 7.424713833045452e-05,
"loss": 0.4819,
"step": 924
},
{
"epoch": 0.5861850443599493,
"grad_norm": 0.05253986269235611,
"learning_rate": 7.405359137120662e-05,
"loss": 0.5067,
"step": 925
},
{
"epoch": 0.5868187579214195,
"grad_norm": 0.05310770869255066,
"learning_rate": 7.386014856806523e-05,
"loss": 0.4878,
"step": 926
},
{
"epoch": 0.5874524714828897,
"grad_norm": 0.0604504756629467,
"learning_rate": 7.366681069756352e-05,
"loss": 0.3944,
"step": 927
},
{
"epoch": 0.5880861850443599,
"grad_norm": 0.042067963629961014,
"learning_rate": 7.347357853581361e-05,
"loss": 0.412,
"step": 928
},
{
"epoch": 0.5887198986058302,
"grad_norm": 0.04595714807510376,
"learning_rate": 7.328045285850313e-05,
"loss": 0.4234,
"step": 929
},
{
"epoch": 0.5893536121673004,
"grad_norm": 0.05038761347532272,
"learning_rate": 7.308743444089232e-05,
"loss": 0.5915,
"step": 930
},
{
"epoch": 0.5899873257287706,
"grad_norm": 0.061250437051057816,
"learning_rate": 7.289452405781084e-05,
"loss": 0.6433,
"step": 931
},
{
"epoch": 0.5906210392902408,
"grad_norm": 0.07605701684951782,
"learning_rate": 7.270172248365468e-05,
"loss": 0.6252,
"step": 932
},
{
"epoch": 0.591254752851711,
"grad_norm": 0.05717351287603378,
"learning_rate": 7.250903049238297e-05,
"loss": 0.4693,
"step": 933
},
{
"epoch": 0.5918884664131813,
"grad_norm": 0.05955088511109352,
"learning_rate": 7.231644885751507e-05,
"loss": 0.5883,
"step": 934
},
{
"epoch": 0.5925221799746515,
"grad_norm": 0.06226349249482155,
"learning_rate": 7.212397835212722e-05,
"loss": 0.4226,
"step": 935
},
{
"epoch": 0.5931558935361216,
"grad_norm": 0.062126316130161285,
"learning_rate": 7.193161974884964e-05,
"loss": 0.568,
"step": 936
},
{
"epoch": 0.5937896070975919,
"grad_norm": 0.08957802504301071,
"learning_rate": 7.173937381986323e-05,
"loss": 0.5132,
"step": 937
},
{
"epoch": 0.5944233206590621,
"grad_norm": 0.06909901648759842,
"learning_rate": 7.154724133689677e-05,
"loss": 0.5055,
"step": 938
},
{
"epoch": 0.5950570342205324,
"grad_norm": 0.0510685071349144,
"learning_rate": 7.135522307122346e-05,
"loss": 0.5349,
"step": 939
},
{
"epoch": 0.5956907477820025,
"grad_norm": 0.05713349208235741,
"learning_rate": 7.116331979365805e-05,
"loss": 0.4435,
"step": 940
},
{
"epoch": 0.5963244613434727,
"grad_norm": 0.05836547538638115,
"learning_rate": 7.097153227455379e-05,
"loss": 0.4525,
"step": 941
},
{
"epoch": 0.596958174904943,
"grad_norm": 0.058628011494874954,
"learning_rate": 7.077986128379908e-05,
"loss": 0.3689,
"step": 942
},
{
"epoch": 0.5975918884664132,
"grad_norm": 0.05638744682073593,
"learning_rate": 7.058830759081464e-05,
"loss": 0.4296,
"step": 943
},
{
"epoch": 0.5982256020278834,
"grad_norm": 0.04396173730492592,
"learning_rate": 7.039687196455042e-05,
"loss": 0.4846,
"step": 944
},
{
"epoch": 0.5988593155893536,
"grad_norm": 0.051896654069423676,
"learning_rate": 7.02055551734822e-05,
"loss": 0.5216,
"step": 945
},
{
"epoch": 0.5994930291508238,
"grad_norm": 0.07102696597576141,
"learning_rate": 7.001435798560883e-05,
"loss": 0.5707,
"step": 946
},
{
"epoch": 0.600126742712294,
"grad_norm": 0.06377355009317398,
"learning_rate": 6.982328116844912e-05,
"loss": 0.4078,
"step": 947
},
{
"epoch": 0.6007604562737643,
"grad_norm": 0.05575268715620041,
"learning_rate": 6.963232548903853e-05,
"loss": 0.5136,
"step": 948
},
{
"epoch": 0.6013941698352345,
"grad_norm": 0.0727148950099945,
"learning_rate": 6.944149171392637e-05,
"loss": 0.463,
"step": 949
},
{
"epoch": 0.6020278833967047,
"grad_norm": 0.06672396510839462,
"learning_rate": 6.925078060917245e-05,
"loss": 0.5424,
"step": 950
},
{
"epoch": 0.6026615969581749,
"grad_norm": 0.05297897756099701,
"learning_rate": 6.906019294034432e-05,
"loss": 0.3956,
"step": 951
},
{
"epoch": 0.6032953105196451,
"grad_norm": 0.09691467881202698,
"learning_rate": 6.886972947251387e-05,
"loss": 0.4652,
"step": 952
},
{
"epoch": 0.6039290240811154,
"grad_norm": 0.05336275324225426,
"learning_rate": 6.86793909702545e-05,
"loss": 0.5057,
"step": 953
},
{
"epoch": 0.6045627376425855,
"grad_norm": 0.05873895063996315,
"learning_rate": 6.848917819763793e-05,
"loss": 0.4741,
"step": 954
},
{
"epoch": 0.6051964512040557,
"grad_norm": 0.05541690066456795,
"learning_rate": 6.829909191823121e-05,
"loss": 0.5333,
"step": 955
},
{
"epoch": 0.605830164765526,
"grad_norm": 0.05053303390741348,
"learning_rate": 6.810913289509351e-05,
"loss": 0.5369,
"step": 956
},
{
"epoch": 0.6064638783269962,
"grad_norm": 0.04831360653042793,
"learning_rate": 6.79193018907733e-05,
"loss": 0.4221,
"step": 957
},
{
"epoch": 0.6070975918884665,
"grad_norm": 0.06704218685626984,
"learning_rate": 6.7729599667305e-05,
"loss": 0.6387,
"step": 958
},
{
"epoch": 0.6077313054499366,
"grad_norm": 0.05862371623516083,
"learning_rate": 6.75400269862062e-05,
"loss": 0.4535,
"step": 959
},
{
"epoch": 0.6083650190114068,
"grad_norm": 0.04538964852690697,
"learning_rate": 6.735058460847437e-05,
"loss": 0.4628,
"step": 960
},
{
"epoch": 0.6089987325728771,
"grad_norm": 0.06569251418113708,
"learning_rate": 6.716127329458399e-05,
"loss": 0.6058,
"step": 961
},
{
"epoch": 0.6096324461343473,
"grad_norm": 0.0685662105679512,
"learning_rate": 6.697209380448333e-05,
"loss": 0.662,
"step": 962
},
{
"epoch": 0.6102661596958175,
"grad_norm": 0.04988419637084007,
"learning_rate": 6.678304689759147e-05,
"loss": 0.3806,
"step": 963
},
{
"epoch": 0.6108998732572877,
"grad_norm": 0.08099476248025894,
"learning_rate": 6.659413333279543e-05,
"loss": 0.5481,
"step": 964
},
{
"epoch": 0.6115335868187579,
"grad_norm": 0.09865555912256241,
"learning_rate": 6.640535386844679e-05,
"loss": 0.5004,
"step": 965
},
{
"epoch": 0.6121673003802282,
"grad_norm": 0.07820238918066025,
"learning_rate": 6.621670926235884e-05,
"loss": 0.5069,
"step": 966
},
{
"epoch": 0.6128010139416984,
"grad_norm": 0.06334922462701797,
"learning_rate": 6.602820027180359e-05,
"loss": 0.5714,
"step": 967
},
{
"epoch": 0.6134347275031685,
"grad_norm": 0.05438544973731041,
"learning_rate": 6.583982765350859e-05,
"loss": 0.5256,
"step": 968
},
{
"epoch": 0.6140684410646388,
"grad_norm": 0.06950180977582932,
"learning_rate": 6.565159216365389e-05,
"loss": 0.5267,
"step": 969
},
{
"epoch": 0.614702154626109,
"grad_norm": 0.052648235112428665,
"learning_rate": 6.546349455786926e-05,
"loss": 0.4368,
"step": 970
},
{
"epoch": 0.6153358681875792,
"grad_norm": 0.059094175696372986,
"learning_rate": 6.527553559123075e-05,
"loss": 0.4375,
"step": 971
},
{
"epoch": 0.6159695817490495,
"grad_norm": 0.0576835498213768,
"learning_rate": 6.508771601825805e-05,
"loss": 0.5336,
"step": 972
},
{
"epoch": 0.6166032953105196,
"grad_norm": 0.06954148411750793,
"learning_rate": 6.490003659291111e-05,
"loss": 0.5409,
"step": 973
},
{
"epoch": 0.6172370088719898,
"grad_norm": 0.04728791490197182,
"learning_rate": 6.471249806858749e-05,
"loss": 0.3401,
"step": 974
},
{
"epoch": 0.6178707224334601,
"grad_norm": 0.0617443211376667,
"learning_rate": 6.452510119811895e-05,
"loss": 0.4768,
"step": 975
},
{
"epoch": 0.6185044359949303,
"grad_norm": 0.0580204576253891,
"learning_rate": 6.43378467337687e-05,
"loss": 0.4066,
"step": 976
},
{
"epoch": 0.6191381495564005,
"grad_norm": 0.08840842545032501,
"learning_rate": 6.415073542722833e-05,
"loss": 0.5404,
"step": 977
},
{
"epoch": 0.6197718631178707,
"grad_norm": 0.05179251730442047,
"learning_rate": 6.396376802961468e-05,
"loss": 0.4582,
"step": 978
},
{
"epoch": 0.6204055766793409,
"grad_norm": 0.052679892629384995,
"learning_rate": 6.37769452914669e-05,
"loss": 0.4511,
"step": 979
},
{
"epoch": 0.6210392902408112,
"grad_norm": 0.053780052810907364,
"learning_rate": 6.359026796274353e-05,
"loss": 0.4421,
"step": 980
},
{
"epoch": 0.6216730038022814,
"grad_norm": 0.05369406193494797,
"learning_rate": 6.340373679281925e-05,
"loss": 0.5214,
"step": 981
},
{
"epoch": 0.6223067173637515,
"grad_norm": 0.05973728746175766,
"learning_rate": 6.321735253048213e-05,
"loss": 0.4608,
"step": 982
},
{
"epoch": 0.6229404309252218,
"grad_norm": 0.09265648573637009,
"learning_rate": 6.30311159239305e-05,
"loss": 0.5326,
"step": 983
},
{
"epoch": 0.623574144486692,
"grad_norm": 0.059120386838912964,
"learning_rate": 6.284502772076995e-05,
"loss": 0.5446,
"step": 984
},
{
"epoch": 0.6242078580481623,
"grad_norm": 0.06080583855509758,
"learning_rate": 6.26590886680103e-05,
"loss": 0.4881,
"step": 985
},
{
"epoch": 0.6248415716096325,
"grad_norm": 0.07576774805784225,
"learning_rate": 6.24732995120626e-05,
"loss": 0.5936,
"step": 986
},
{
"epoch": 0.6254752851711026,
"grad_norm": 0.064827099442482,
"learning_rate": 6.228766099873633e-05,
"loss": 0.459,
"step": 987
},
{
"epoch": 0.6261089987325729,
"grad_norm": 0.054790303111076355,
"learning_rate": 6.210217387323613e-05,
"loss": 0.5417,
"step": 988
},
{
"epoch": 0.6267427122940431,
"grad_norm": 0.04305886849761009,
"learning_rate": 6.191683888015894e-05,
"loss": 0.3993,
"step": 989
},
{
"epoch": 0.6273764258555133,
"grad_norm": 0.06198723614215851,
"learning_rate": 6.173165676349103e-05,
"loss": 0.4749,
"step": 990
},
{
"epoch": 0.6280101394169835,
"grad_norm": 0.06888420134782791,
"learning_rate": 6.154662826660497e-05,
"loss": 0.6456,
"step": 991
},
{
"epoch": 0.6286438529784537,
"grad_norm": 0.048178933560848236,
"learning_rate": 6.136175413225657e-05,
"loss": 0.4695,
"step": 992
},
{
"epoch": 0.629277566539924,
"grad_norm": 0.051914725452661514,
"learning_rate": 6.117703510258218e-05,
"loss": 0.4169,
"step": 993
},
{
"epoch": 0.6299112801013942,
"grad_norm": 0.045315682888031006,
"learning_rate": 6.0992471919095315e-05,
"loss": 0.4811,
"step": 994
},
{
"epoch": 0.6305449936628644,
"grad_norm": 0.057482652366161346,
"learning_rate": 6.0808065322683993e-05,
"loss": 0.4632,
"step": 995
},
{
"epoch": 0.6311787072243346,
"grad_norm": 0.04770313948392868,
"learning_rate": 6.062381605360766e-05,
"loss": 0.3449,
"step": 996
},
{
"epoch": 0.6318124207858048,
"grad_norm": 0.09517858922481537,
"learning_rate": 6.043972485149414e-05,
"loss": 0.5172,
"step": 997
},
{
"epoch": 0.632446134347275,
"grad_norm": 0.05451178550720215,
"learning_rate": 6.0255792455336735e-05,
"loss": 0.5564,
"step": 998
},
{
"epoch": 0.6330798479087453,
"grad_norm": 0.052426449954509735,
"learning_rate": 6.0072019603491306e-05,
"loss": 0.5569,
"step": 999
},
{
"epoch": 0.6337135614702155,
"grad_norm": 0.07426486909389496,
"learning_rate": 5.988840703367322e-05,
"loss": 0.5943,
"step": 1000
},
{
"epoch": 0.6343472750316856,
"grad_norm": 0.09531984478235245,
"learning_rate": 5.970495548295449e-05,
"loss": 0.6126,
"step": 1001
},
{
"epoch": 0.6349809885931559,
"grad_norm": 0.057190559804439545,
"learning_rate": 5.952166568776062e-05,
"loss": 0.5025,
"step": 1002
},
{
"epoch": 0.6356147021546261,
"grad_norm": 0.43628084659576416,
"learning_rate": 5.933853838386795e-05,
"loss": 0.5126,
"step": 1003
},
{
"epoch": 0.6362484157160964,
"grad_norm": 0.05435523763298988,
"learning_rate": 5.9155574306400395e-05,
"loss": 0.5081,
"step": 1004
},
{
"epoch": 0.6368821292775665,
"grad_norm": 0.048096027225255966,
"learning_rate": 5.897277418982672e-05,
"loss": 0.4067,
"step": 1005
},
{
"epoch": 0.6375158428390367,
"grad_norm": 0.049206674098968506,
"learning_rate": 5.879013876795745e-05,
"loss": 0.4213,
"step": 1006
},
{
"epoch": 0.638149556400507,
"grad_norm": 0.05596129968762398,
"learning_rate": 5.860766877394206e-05,
"loss": 0.4981,
"step": 1007
},
{
"epoch": 0.6387832699619772,
"grad_norm": 0.08870602399110794,
"learning_rate": 5.84253649402658e-05,
"loss": 0.6355,
"step": 1008
},
{
"epoch": 0.6394169835234474,
"grad_norm": 0.061666570603847504,
"learning_rate": 5.824322799874713e-05,
"loss": 0.5669,
"step": 1009
},
{
"epoch": 0.6400506970849176,
"grad_norm": 0.061136480420827866,
"learning_rate": 5.806125868053433e-05,
"loss": 0.3989,
"step": 1010
},
{
"epoch": 0.6406844106463878,
"grad_norm": 0.05063464865088463,
"learning_rate": 5.787945771610296e-05,
"loss": 0.5577,
"step": 1011
},
{
"epoch": 0.641318124207858,
"grad_norm": 0.04787430539727211,
"learning_rate": 5.7697825835252586e-05,
"loss": 0.4721,
"step": 1012
},
{
"epoch": 0.6419518377693283,
"grad_norm": 0.05791177973151207,
"learning_rate": 5.7516363767104265e-05,
"loss": 0.4997,
"step": 1013
},
{
"epoch": 0.6425855513307985,
"grad_norm": 0.049527108669281006,
"learning_rate": 5.733507224009723e-05,
"loss": 0.5186,
"step": 1014
},
{
"epoch": 0.6432192648922687,
"grad_norm": 0.12338205426931381,
"learning_rate": 5.715395198198603e-05,
"loss": 0.4634,
"step": 1015
},
{
"epoch": 0.6438529784537389,
"grad_norm": 0.06410299986600876,
"learning_rate": 5.697300371983794e-05,
"loss": 0.5332,
"step": 1016
},
{
"epoch": 0.6444866920152091,
"grad_norm": 0.05564810708165169,
"learning_rate": 5.679222818002954e-05,
"loss": 0.5017,
"step": 1017
},
{
"epoch": 0.6451204055766794,
"grad_norm": 0.05951263755559921,
"learning_rate": 5.6611626088244194e-05,
"loss": 0.4736,
"step": 1018
},
{
"epoch": 0.6457541191381495,
"grad_norm": 0.07646768540143967,
"learning_rate": 5.6431198169469e-05,
"loss": 0.4648,
"step": 1019
},
{
"epoch": 0.6463878326996197,
"grad_norm": 0.06121028959751129,
"learning_rate": 5.6250945147991804e-05,
"loss": 0.5012,
"step": 1020
},
{
"epoch": 0.64702154626109,
"grad_norm": 0.05163773521780968,
"learning_rate": 5.607086774739849e-05,
"loss": 0.4415,
"step": 1021
},
{
"epoch": 0.6476552598225602,
"grad_norm": 0.06718850135803223,
"learning_rate": 5.5890966690569724e-05,
"loss": 0.4203,
"step": 1022
},
{
"epoch": 0.6482889733840305,
"grad_norm": 0.0512927770614624,
"learning_rate": 5.57112426996786e-05,
"loss": 0.5041,
"step": 1023
},
{
"epoch": 0.6489226869455006,
"grad_norm": 0.06612823903560638,
"learning_rate": 5.55316964961871e-05,
"loss": 0.5321,
"step": 1024
},
{
"epoch": 0.6495564005069708,
"grad_norm": 0.04569435119628906,
"learning_rate": 5.5352328800843724e-05,
"loss": 0.3867,
"step": 1025
},
{
"epoch": 0.6501901140684411,
"grad_norm": 0.05401879921555519,
"learning_rate": 5.5173140333680306e-05,
"loss": 0.4198,
"step": 1026
},
{
"epoch": 0.6508238276299113,
"grad_norm": 0.06780200451612473,
"learning_rate": 5.499413181400926e-05,
"loss": 0.5243,
"step": 1027
},
{
"epoch": 0.6514575411913816,
"grad_norm": 0.039808813482522964,
"learning_rate": 5.481530396042059e-05,
"loss": 0.413,
"step": 1028
},
{
"epoch": 0.6520912547528517,
"grad_norm": 0.05415809899568558,
"learning_rate": 5.463665749077909e-05,
"loss": 0.4654,
"step": 1029
},
{
"epoch": 0.6527249683143219,
"grad_norm": 0.07211584597826004,
"learning_rate": 5.44581931222214e-05,
"loss": 0.6116,
"step": 1030
},
{
"epoch": 0.6533586818757922,
"grad_norm": 0.05783382058143616,
"learning_rate": 5.42799115711532e-05,
"loss": 0.5143,
"step": 1031
},
{
"epoch": 0.6539923954372624,
"grad_norm": 0.04725528135895729,
"learning_rate": 5.410181355324622e-05,
"loss": 0.3522,
"step": 1032
},
{
"epoch": 0.6546261089987325,
"grad_norm": 0.10452211648225784,
"learning_rate": 5.392389978343555e-05,
"loss": 0.5993,
"step": 1033
},
{
"epoch": 0.6552598225602028,
"grad_norm": 0.051397427916526794,
"learning_rate": 5.37461709759165e-05,
"loss": 0.4828,
"step": 1034
},
{
"epoch": 0.655893536121673,
"grad_norm": 0.055356886237859726,
"learning_rate": 5.356862784414199e-05,
"loss": 0.5121,
"step": 1035
},
{
"epoch": 0.6565272496831432,
"grad_norm": 0.05265672877430916,
"learning_rate": 5.3391271100819607e-05,
"loss": 0.5696,
"step": 1036
},
{
"epoch": 0.6571609632446135,
"grad_norm": 0.06235937401652336,
"learning_rate": 5.321410145790866e-05,
"loss": 0.4243,
"step": 1037
},
{
"epoch": 0.6577946768060836,
"grad_norm": 0.049563296139240265,
"learning_rate": 5.303711962661744e-05,
"loss": 0.4406,
"step": 1038
},
{
"epoch": 0.6584283903675539,
"grad_norm": 0.04914897307753563,
"learning_rate": 5.286032631740023e-05,
"loss": 0.4779,
"step": 1039
},
{
"epoch": 0.6590621039290241,
"grad_norm": 0.05867873132228851,
"learning_rate": 5.268372223995468e-05,
"loss": 0.3903,
"step": 1040
},
{
"epoch": 0.6596958174904943,
"grad_norm": 0.057386353611946106,
"learning_rate": 5.2507308103218554e-05,
"loss": 0.532,
"step": 1041
},
{
"epoch": 0.6603295310519645,
"grad_norm": 0.07655716687440872,
"learning_rate": 5.2331084615367485e-05,
"loss": 0.545,
"step": 1042
},
{
"epoch": 0.6609632446134347,
"grad_norm": 0.04618160054087639,
"learning_rate": 5.2155052483811484e-05,
"loss": 0.3573,
"step": 1043
},
{
"epoch": 0.6615969581749049,
"grad_norm": 0.05024750530719757,
"learning_rate": 5.197921241519252e-05,
"loss": 0.4974,
"step": 1044
},
{
"epoch": 0.6622306717363752,
"grad_norm": 0.051412664353847504,
"learning_rate": 5.1803565115381694e-05,
"loss": 0.5165,
"step": 1045
},
{
"epoch": 0.6628643852978454,
"grad_norm": 0.05353249981999397,
"learning_rate": 5.162811128947602e-05,
"loss": 0.4662,
"step": 1046
},
{
"epoch": 0.6634980988593155,
"grad_norm": 0.05121266096830368,
"learning_rate": 5.1452851641796074e-05,
"loss": 0.4855,
"step": 1047
},
{
"epoch": 0.6641318124207858,
"grad_norm": 0.0604366697371006,
"learning_rate": 5.127778687588285e-05,
"loss": 0.4704,
"step": 1048
},
{
"epoch": 0.664765525982256,
"grad_norm": 0.09897324442863464,
"learning_rate": 5.1102917694495034e-05,
"loss": 0.599,
"step": 1049
},
{
"epoch": 0.6653992395437263,
"grad_norm": 0.0662703886628151,
"learning_rate": 5.092824479960625e-05,
"loss": 0.5022,
"step": 1050
},
{
"epoch": 0.6660329531051965,
"grad_norm": 0.05987107753753662,
"learning_rate": 5.075376889240198e-05,
"loss": 0.4783,
"step": 1051
},
{
"epoch": 0.6666666666666666,
"grad_norm": 0.05671470984816551,
"learning_rate": 5.057949067327726e-05,
"loss": 0.4619,
"step": 1052
},
{
"epoch": 0.6673003802281369,
"grad_norm": 0.06275074928998947,
"learning_rate": 5.0405410841833253e-05,
"loss": 0.5151,
"step": 1053
},
{
"epoch": 0.6679340937896071,
"grad_norm": 0.08451675623655319,
"learning_rate": 5.023153009687489e-05,
"loss": 0.458,
"step": 1054
},
{
"epoch": 0.6685678073510773,
"grad_norm": 0.05530532822012901,
"learning_rate": 5.0057849136407874e-05,
"loss": 0.3859,
"step": 1055
},
{
"epoch": 0.6692015209125475,
"grad_norm": 0.0674528256058693,
"learning_rate": 4.988436865763594e-05,
"loss": 0.6544,
"step": 1056
},
{
"epoch": 0.6698352344740177,
"grad_norm": 0.052506398409605026,
"learning_rate": 4.971108935695801e-05,
"loss": 0.4777,
"step": 1057
},
{
"epoch": 0.670468948035488,
"grad_norm": 0.05678095668554306,
"learning_rate": 4.953801192996543e-05,
"loss": 0.5026,
"step": 1058
},
{
"epoch": 0.6711026615969582,
"grad_norm": 0.061344366520643234,
"learning_rate": 4.936513707143918e-05,
"loss": 0.4754,
"step": 1059
},
{
"epoch": 0.6717363751584284,
"grad_norm": 0.07330626994371414,
"learning_rate": 4.919246547534708e-05,
"loss": 0.5075,
"step": 1060
},
{
"epoch": 0.6723700887198986,
"grad_norm": 0.04772542044520378,
"learning_rate": 4.9019997834840884e-05,
"loss": 0.4995,
"step": 1061
},
{
"epoch": 0.6730038022813688,
"grad_norm": 0.05167115479707718,
"learning_rate": 4.884773484225385e-05,
"loss": 0.4022,
"step": 1062
},
{
"epoch": 0.673637515842839,
"grad_norm": 0.0586593933403492,
"learning_rate": 4.8675677189097465e-05,
"loss": 0.6427,
"step": 1063
},
{
"epoch": 0.6742712294043093,
"grad_norm": 0.05439648777246475,
"learning_rate": 4.850382556605908e-05,
"loss": 0.5481,
"step": 1064
},
{
"epoch": 0.6749049429657795,
"grad_norm": 0.0691414475440979,
"learning_rate": 4.833218066299896e-05,
"loss": 0.6269,
"step": 1065
},
{
"epoch": 0.6755386565272496,
"grad_norm": 0.06065572425723076,
"learning_rate": 4.8160743168947496e-05,
"loss": 0.4692,
"step": 1066
},
{
"epoch": 0.6761723700887199,
"grad_norm": 0.06905770301818848,
"learning_rate": 4.7989513772102537e-05,
"loss": 0.5701,
"step": 1067
},
{
"epoch": 0.6768060836501901,
"grad_norm": 0.05062158778309822,
"learning_rate": 4.781849315982653e-05,
"loss": 0.4795,
"step": 1068
},
{
"epoch": 0.6774397972116604,
"grad_norm": 0.06706640124320984,
"learning_rate": 4.7647682018643844e-05,
"loss": 0.5052,
"step": 1069
},
{
"epoch": 0.6780735107731305,
"grad_norm": 0.047475773841142654,
"learning_rate": 4.74770810342379e-05,
"loss": 0.4566,
"step": 1070
},
{
"epoch": 0.6787072243346007,
"grad_norm": 0.04724888876080513,
"learning_rate": 4.730669089144855e-05,
"loss": 0.5391,
"step": 1071
},
{
"epoch": 0.679340937896071,
"grad_norm": 0.051380082964897156,
"learning_rate": 4.713651227426926e-05,
"loss": 0.6071,
"step": 1072
},
{
"epoch": 0.6799746514575412,
"grad_norm": 0.06109333038330078,
"learning_rate": 4.696654586584437e-05,
"loss": 0.4275,
"step": 1073
},
{
"epoch": 0.6806083650190115,
"grad_norm": 0.052917756140232086,
"learning_rate": 4.6796792348466356e-05,
"loss": 0.5064,
"step": 1074
},
{
"epoch": 0.6812420785804816,
"grad_norm": 0.053081825375556946,
"learning_rate": 4.6627252403573085e-05,
"loss": 0.4093,
"step": 1075
},
{
"epoch": 0.6818757921419518,
"grad_norm": 0.054008882492780685,
"learning_rate": 4.6457926711745095e-05,
"loss": 0.4456,
"step": 1076
},
{
"epoch": 0.6825095057034221,
"grad_norm": 0.13207583129405975,
"learning_rate": 4.6288815952702826e-05,
"loss": 0.4382,
"step": 1077
},
{
"epoch": 0.6831432192648923,
"grad_norm": 0.058852966874837875,
"learning_rate": 4.6119920805303964e-05,
"loss": 0.472,
"step": 1078
},
{
"epoch": 0.6837769328263625,
"grad_norm": 0.060738544911146164,
"learning_rate": 4.595124194754066e-05,
"loss": 0.4584,
"step": 1079
},
{
"epoch": 0.6844106463878327,
"grad_norm": 0.06450516730546951,
"learning_rate": 4.57827800565367e-05,
"loss": 0.5646,
"step": 1080
},
{
"epoch": 0.6850443599493029,
"grad_norm": 0.05439593642950058,
"learning_rate": 4.561453580854516e-05,
"loss": 0.4885,
"step": 1081
},
{
"epoch": 0.6856780735107731,
"grad_norm": 0.07528258860111237,
"learning_rate": 4.544650987894514e-05,
"loss": 0.6053,
"step": 1082
},
{
"epoch": 0.6863117870722434,
"grad_norm": 0.05821401625871658,
"learning_rate": 4.527870294223957e-05,
"loss": 0.4686,
"step": 1083
},
{
"epoch": 0.6869455006337135,
"grad_norm": 0.056713685393333435,
"learning_rate": 4.5111115672052187e-05,
"loss": 0.556,
"step": 1084
},
{
"epoch": 0.6875792141951838,
"grad_norm": 0.06937955319881439,
"learning_rate": 4.4943748741124934e-05,
"loss": 0.5904,
"step": 1085
},
{
"epoch": 0.688212927756654,
"grad_norm": 0.05129878595471382,
"learning_rate": 4.477660282131529e-05,
"loss": 0.4215,
"step": 1086
},
{
"epoch": 0.6888466413181242,
"grad_norm": 0.06477949768304825,
"learning_rate": 4.4609678583593416e-05,
"loss": 0.5295,
"step": 1087
},
{
"epoch": 0.6894803548795945,
"grad_norm": 0.06432091444730759,
"learning_rate": 4.444297669803981e-05,
"loss": 0.603,
"step": 1088
},
{
"epoch": 0.6901140684410646,
"grad_norm": 0.07068527489900589,
"learning_rate": 4.427649783384211e-05,
"loss": 0.7618,
"step": 1089
},
{
"epoch": 0.6907477820025348,
"grad_norm": 0.04375835880637169,
"learning_rate": 4.4110242659292836e-05,
"loss": 0.4594,
"step": 1090
},
{
"epoch": 0.6913814955640051,
"grad_norm": 0.04846682399511337,
"learning_rate": 4.394421184178663e-05,
"loss": 0.4619,
"step": 1091
},
{
"epoch": 0.6920152091254753,
"grad_norm": 0.05694019794464111,
"learning_rate": 4.377840604781731e-05,
"loss": 0.5545,
"step": 1092
},
{
"epoch": 0.6926489226869454,
"grad_norm": 0.056636743247509,
"learning_rate": 4.361282594297552e-05,
"loss": 0.4916,
"step": 1093
},
{
"epoch": 0.6932826362484157,
"grad_norm": 0.05540623515844345,
"learning_rate": 4.3447472191945896e-05,
"loss": 0.4707,
"step": 1094
},
{
"epoch": 0.6939163498098859,
"grad_norm": 0.05961360037326813,
"learning_rate": 4.328234545850442e-05,
"loss": 0.5428,
"step": 1095
},
{
"epoch": 0.6945500633713562,
"grad_norm": 0.07297226786613464,
"learning_rate": 4.3117446405515784e-05,
"loss": 0.5159,
"step": 1096
},
{
"epoch": 0.6951837769328264,
"grad_norm": 0.05866815522313118,
"learning_rate": 4.295277569493059e-05,
"loss": 0.3985,
"step": 1097
},
{
"epoch": 0.6958174904942965,
"grad_norm": 0.05763423442840576,
"learning_rate": 4.278833398778306e-05,
"loss": 0.3906,
"step": 1098
},
{
"epoch": 0.6964512040557668,
"grad_norm": 0.057510554790496826,
"learning_rate": 4.262412194418786e-05,
"loss": 0.4979,
"step": 1099
},
{
"epoch": 0.697084917617237,
"grad_norm": 0.058582838624715805,
"learning_rate": 4.2460140223337875e-05,
"loss": 0.4242,
"step": 1100
},
{
"epoch": 0.6977186311787072,
"grad_norm": 0.0564759224653244,
"learning_rate": 4.229638948350139e-05,
"loss": 0.5166,
"step": 1101
},
{
"epoch": 0.6983523447401775,
"grad_norm": 0.11224331706762314,
"learning_rate": 4.213287038201943e-05,
"loss": 0.6246,
"step": 1102
},
{
"epoch": 0.6989860583016476,
"grad_norm": 0.05778801068663597,
"learning_rate": 4.196958357530322e-05,
"loss": 0.5049,
"step": 1103
},
{
"epoch": 0.6996197718631179,
"grad_norm": 0.05989357456564903,
"learning_rate": 4.180652971883142e-05,
"loss": 0.5431,
"step": 1104
},
{
"epoch": 0.7002534854245881,
"grad_norm": 0.07203585654497147,
"learning_rate": 4.1643709467147615e-05,
"loss": 0.5388,
"step": 1105
},
{
"epoch": 0.7008871989860583,
"grad_norm": 0.05250242352485657,
"learning_rate": 4.148112347385762e-05,
"loss": 0.4402,
"step": 1106
},
{
"epoch": 0.7015209125475285,
"grad_norm": 0.07697035372257233,
"learning_rate": 4.131877239162686e-05,
"loss": 0.4832,
"step": 1107
},
{
"epoch": 0.7021546261089987,
"grad_norm": 0.05670148506760597,
"learning_rate": 4.11566568721778e-05,
"loss": 0.3192,
"step": 1108
},
{
"epoch": 0.7027883396704689,
"grad_norm": 0.0636947974562645,
"learning_rate": 4.0994777566287204e-05,
"loss": 0.4571,
"step": 1109
},
{
"epoch": 0.7034220532319392,
"grad_norm": 0.06214752793312073,
"learning_rate": 4.0833135123783683e-05,
"loss": 0.5918,
"step": 1110
},
{
"epoch": 0.7040557667934094,
"grad_norm": 0.055310748517513275,
"learning_rate": 4.067173019354501e-05,
"loss": 0.56,
"step": 1111
},
{
"epoch": 0.7046894803548795,
"grad_norm": 0.09247829020023346,
"learning_rate": 4.05105634234955e-05,
"loss": 0.4502,
"step": 1112
},
{
"epoch": 0.7053231939163498,
"grad_norm": 0.0581621415913105,
"learning_rate": 4.0349635460603404e-05,
"loss": 0.4554,
"step": 1113
},
{
"epoch": 0.70595690747782,
"grad_norm": 0.057738032191991806,
"learning_rate": 4.0188946950878404e-05,
"loss": 0.459,
"step": 1114
},
{
"epoch": 0.7065906210392903,
"grad_norm": 0.05233234167098999,
"learning_rate": 4.002849853936891e-05,
"loss": 0.5418,
"step": 1115
},
{
"epoch": 0.7072243346007605,
"grad_norm": 0.058156002312898636,
"learning_rate": 3.9868290870159405e-05,
"loss": 0.4297,
"step": 1116
},
{
"epoch": 0.7078580481622306,
"grad_norm": 0.06034991517663002,
"learning_rate": 3.970832458636823e-05,
"loss": 0.5115,
"step": 1117
},
{
"epoch": 0.7084917617237009,
"grad_norm": 0.060476917773485184,
"learning_rate": 3.9548600330144436e-05,
"loss": 0.4905,
"step": 1118
},
{
"epoch": 0.7091254752851711,
"grad_norm": 0.06165899708867073,
"learning_rate": 3.9389118742665696e-05,
"loss": 0.4906,
"step": 1119
},
{
"epoch": 0.7097591888466414,
"grad_norm": 0.05476471036672592,
"learning_rate": 3.922988046413551e-05,
"loss": 0.5357,
"step": 1120
},
{
"epoch": 0.7103929024081115,
"grad_norm": 0.0555727519094944,
"learning_rate": 3.9070886133780635e-05,
"loss": 0.4257,
"step": 1121
},
{
"epoch": 0.7110266159695817,
"grad_norm": 0.06239394098520279,
"learning_rate": 3.8912136389848576e-05,
"loss": 0.5522,
"step": 1122
},
{
"epoch": 0.711660329531052,
"grad_norm": 0.0808614119887352,
"learning_rate": 3.875363186960499e-05,
"loss": 0.585,
"step": 1123
},
{
"epoch": 0.7122940430925222,
"grad_norm": 0.5757790207862854,
"learning_rate": 3.859537320933114e-05,
"loss": 0.418,
"step": 1124
},
{
"epoch": 0.7129277566539924,
"grad_norm": 0.060083821415901184,
"learning_rate": 3.843736104432137e-05,
"loss": 0.562,
"step": 1125
},
{
"epoch": 0.7135614702154626,
"grad_norm": 0.07535073906183243,
"learning_rate": 3.8279596008880416e-05,
"loss": 0.5658,
"step": 1126
},
{
"epoch": 0.7141951837769328,
"grad_norm": 0.061645809561014175,
"learning_rate": 3.812207873632115e-05,
"loss": 0.5278,
"step": 1127
},
{
"epoch": 0.714828897338403,
"grad_norm": 0.05202870815992355,
"learning_rate": 3.7964809858961655e-05,
"loss": 0.5626,
"step": 1128
},
{
"epoch": 0.7154626108998733,
"grad_norm": 0.07412015646696091,
"learning_rate": 3.780779000812303e-05,
"loss": 0.5756,
"step": 1129
},
{
"epoch": 0.7160963244613435,
"grad_norm": 0.06074469909071922,
"learning_rate": 3.7651019814126654e-05,
"loss": 0.4934,
"step": 1130
},
{
"epoch": 0.7167300380228137,
"grad_norm": 0.055158831179142,
"learning_rate": 3.749449990629173e-05,
"loss": 0.3941,
"step": 1131
},
{
"epoch": 0.7173637515842839,
"grad_norm": 0.046296197921037674,
"learning_rate": 3.733823091293274e-05,
"loss": 0.5466,
"step": 1132
},
{
"epoch": 0.7179974651457541,
"grad_norm": 0.0542571134865284,
"learning_rate": 3.718221346135685e-05,
"loss": 0.4754,
"step": 1133
},
{
"epoch": 0.7186311787072244,
"grad_norm": 0.056511152535676956,
"learning_rate": 3.7026448177861625e-05,
"loss": 0.453,
"step": 1134
},
{
"epoch": 0.7192648922686945,
"grad_norm": 0.0572255440056324,
"learning_rate": 3.687093568773229e-05,
"loss": 0.5494,
"step": 1135
},
{
"epoch": 0.7198986058301647,
"grad_norm": 0.05171896889805794,
"learning_rate": 3.671567661523915e-05,
"loss": 0.5434,
"step": 1136
},
{
"epoch": 0.720532319391635,
"grad_norm": 0.062315478920936584,
"learning_rate": 3.6560671583635467e-05,
"loss": 0.5841,
"step": 1137
},
{
"epoch": 0.7211660329531052,
"grad_norm": 0.06699743866920471,
"learning_rate": 3.6405921215154494e-05,
"loss": 0.5331,
"step": 1138
},
{
"epoch": 0.7217997465145755,
"grad_norm": 0.05675177648663521,
"learning_rate": 3.625142613100733e-05,
"loss": 0.5775,
"step": 1139
},
{
"epoch": 0.7224334600760456,
"grad_norm": 0.05293945595622063,
"learning_rate": 3.609718695138022e-05,
"loss": 0.4729,
"step": 1140
},
{
"epoch": 0.7230671736375158,
"grad_norm": 0.054869700223207474,
"learning_rate": 3.5943204295432186e-05,
"loss": 0.4514,
"step": 1141
},
{
"epoch": 0.7237008871989861,
"grad_norm": 0.07087720930576324,
"learning_rate": 3.578947878129245e-05,
"loss": 0.4286,
"step": 1142
},
{
"epoch": 0.7243346007604563,
"grad_norm": 0.05950114503502846,
"learning_rate": 3.563601102605804e-05,
"loss": 0.5829,
"step": 1143
},
{
"epoch": 0.7249683143219265,
"grad_norm": 0.05715707316994667,
"learning_rate": 3.548280164579126e-05,
"loss": 0.4924,
"step": 1144
},
{
"epoch": 0.7256020278833967,
"grad_norm": 0.049786679446697235,
"learning_rate": 3.532985125551715e-05,
"loss": 0.4821,
"step": 1145
},
{
"epoch": 0.7262357414448669,
"grad_norm": 0.05315759778022766,
"learning_rate": 3.517716046922118e-05,
"loss": 0.5044,
"step": 1146
},
{
"epoch": 0.7268694550063372,
"grad_norm": 0.05537761375308037,
"learning_rate": 3.502472989984667e-05,
"loss": 0.4959,
"step": 1147
},
{
"epoch": 0.7275031685678074,
"grad_norm": 0.04215134680271149,
"learning_rate": 3.4872560159292345e-05,
"loss": 0.3173,
"step": 1148
},
{
"epoch": 0.7281368821292775,
"grad_norm": 0.05657806992530823,
"learning_rate": 3.4720651858409915e-05,
"loss": 0.5109,
"step": 1149
},
{
"epoch": 0.7287705956907478,
"grad_norm": 0.05998120456933975,
"learning_rate": 3.456900560700158e-05,
"loss": 0.5254,
"step": 1150
},
{
"epoch": 0.729404309252218,
"grad_norm": 0.04827038198709488,
"learning_rate": 3.4417622013817595e-05,
"loss": 0.422,
"step": 1151
},
{
"epoch": 0.7300380228136882,
"grad_norm": 0.06192559376358986,
"learning_rate": 3.426650168655385e-05,
"loss": 0.5273,
"step": 1152
},
{
"epoch": 0.7306717363751585,
"grad_norm": 0.04682118073105812,
"learning_rate": 3.41156452318494e-05,
"loss": 0.4335,
"step": 1153
},
{
"epoch": 0.7313054499366286,
"grad_norm": 0.056407637894153595,
"learning_rate": 3.3965053255284084e-05,
"loss": 0.5223,
"step": 1154
},
{
"epoch": 0.7319391634980988,
"grad_norm": 0.045136161148548126,
"learning_rate": 3.381472636137591e-05,
"loss": 0.4298,
"step": 1155
},
{
"epoch": 0.7325728770595691,
"grad_norm": 0.054349806159734726,
"learning_rate": 3.3664665153579e-05,
"loss": 0.4769,
"step": 1156
},
{
"epoch": 0.7332065906210393,
"grad_norm": 0.054503411054611206,
"learning_rate": 3.3514870234280726e-05,
"loss": 0.5231,
"step": 1157
},
{
"epoch": 0.7338403041825095,
"grad_norm": 0.055708032101392746,
"learning_rate": 3.336534220479961e-05,
"loss": 0.4488,
"step": 1158
},
{
"epoch": 0.7344740177439797,
"grad_norm": 0.05808594450354576,
"learning_rate": 3.321608166538279e-05,
"loss": 0.5226,
"step": 1159
},
{
"epoch": 0.7351077313054499,
"grad_norm": 0.05214309319853783,
"learning_rate": 3.3067089215203625e-05,
"loss": 0.4613,
"step": 1160
},
{
"epoch": 0.7357414448669202,
"grad_norm": 0.08583290129899979,
"learning_rate": 3.29183654523593e-05,
"loss": 0.666,
"step": 1161
},
{
"epoch": 0.7363751584283904,
"grad_norm": 0.0662471279501915,
"learning_rate": 3.276991097386831e-05,
"loss": 0.5606,
"step": 1162
},
{
"epoch": 0.7370088719898605,
"grad_norm": 0.05517906695604324,
"learning_rate": 3.262172637566838e-05,
"loss": 0.4599,
"step": 1163
},
{
"epoch": 0.7376425855513308,
"grad_norm": 0.04590049758553505,
"learning_rate": 3.2473812252613645e-05,
"loss": 0.357,
"step": 1164
},
{
"epoch": 0.738276299112801,
"grad_norm": 0.055093150585889816,
"learning_rate": 3.2326169198472556e-05,
"loss": 0.4695,
"step": 1165
},
{
"epoch": 0.7389100126742713,
"grad_norm": 0.06565197557210922,
"learning_rate": 3.217879780592553e-05,
"loss": 0.6063,
"step": 1166
},
{
"epoch": 0.7395437262357415,
"grad_norm": 0.061285920441150665,
"learning_rate": 3.203169866656226e-05,
"loss": 0.3462,
"step": 1167
},
{
"epoch": 0.7401774397972116,
"grad_norm": 0.0389142669737339,
"learning_rate": 3.188487237087968e-05,
"loss": 0.419,
"step": 1168
},
{
"epoch": 0.7408111533586819,
"grad_norm": 0.04540220648050308,
"learning_rate": 3.173831950827939e-05,
"loss": 0.541,
"step": 1169
},
{
"epoch": 0.7414448669201521,
"grad_norm": 0.05841519311070442,
"learning_rate": 3.159204066706539e-05,
"loss": 0.553,
"step": 1170
},
{
"epoch": 0.7420785804816223,
"grad_norm": 0.06287078559398651,
"learning_rate": 3.1446036434441696e-05,
"loss": 0.465,
"step": 1171
},
{
"epoch": 0.7427122940430925,
"grad_norm": 0.045517582446336746,
"learning_rate": 3.130030739650983e-05,
"loss": 0.4424,
"step": 1172
},
{
"epoch": 0.7433460076045627,
"grad_norm": 0.057926177978515625,
"learning_rate": 3.1154854138266856e-05,
"loss": 0.5547,
"step": 1173
},
{
"epoch": 0.743979721166033,
"grad_norm": 0.04859447479248047,
"learning_rate": 3.100967724360254e-05,
"loss": 0.4152,
"step": 1174
},
{
"epoch": 0.7446134347275032,
"grad_norm": 0.047411590814590454,
"learning_rate": 3.0864777295297376e-05,
"loss": 0.4933,
"step": 1175
},
{
"epoch": 0.7452471482889734,
"grad_norm": 0.060108788311481476,
"learning_rate": 3.0720154875020093e-05,
"loss": 0.5968,
"step": 1176
},
{
"epoch": 0.7458808618504436,
"grad_norm": 0.055359989404678345,
"learning_rate": 3.057581056332533e-05,
"loss": 0.4146,
"step": 1177
},
{
"epoch": 0.7465145754119138,
"grad_norm": 0.5088172554969788,
"learning_rate": 3.0431744939651364e-05,
"loss": 0.4797,
"step": 1178
},
{
"epoch": 0.747148288973384,
"grad_norm": 0.04731796681880951,
"learning_rate": 3.0287958582317676e-05,
"loss": 0.4751,
"step": 1179
},
{
"epoch": 0.7477820025348543,
"grad_norm": 0.05965707078576088,
"learning_rate": 3.0144452068522744e-05,
"loss": 0.579,
"step": 1180
},
{
"epoch": 0.7484157160963245,
"grad_norm": 0.07227399200201035,
"learning_rate": 3.0001225974341696e-05,
"loss": 0.5075,
"step": 1181
},
{
"epoch": 0.7490494296577946,
"grad_norm": 0.058375824242830276,
"learning_rate": 2.985828087472383e-05,
"loss": 0.5896,
"step": 1182
},
{
"epoch": 0.7496831432192649,
"grad_norm": 0.07856516540050507,
"learning_rate": 2.97156173434907e-05,
"loss": 0.4944,
"step": 1183
},
{
"epoch": 0.7503168567807351,
"grad_norm": 0.06058354303240776,
"learning_rate": 2.9573235953333345e-05,
"loss": 0.4903,
"step": 1184
},
{
"epoch": 0.7509505703422054,
"grad_norm": 0.04492352157831192,
"learning_rate": 2.9431137275810317e-05,
"loss": 0.4895,
"step": 1185
},
{
"epoch": 0.7515842839036755,
"grad_norm": 0.051967304199934006,
"learning_rate": 2.9289321881345254e-05,
"loss": 0.4462,
"step": 1186
},
{
"epoch": 0.7522179974651457,
"grad_norm": 0.05165516957640648,
"learning_rate": 2.9147790339224645e-05,
"loss": 0.4734,
"step": 1187
},
{
"epoch": 0.752851711026616,
"grad_norm": 0.056040648370981216,
"learning_rate": 2.9006543217595485e-05,
"loss": 0.4842,
"step": 1188
},
{
"epoch": 0.7534854245880862,
"grad_norm": 0.04782859981060028,
"learning_rate": 2.8865581083463033e-05,
"loss": 0.5286,
"step": 1189
},
{
"epoch": 0.7541191381495564,
"grad_norm": 0.06290265917778015,
"learning_rate": 2.8724904502688566e-05,
"loss": 0.5157,
"step": 1190
},
{
"epoch": 0.7547528517110266,
"grad_norm": 0.06040764972567558,
"learning_rate": 2.8584514039986944e-05,
"loss": 0.6259,
"step": 1191
},
{
"epoch": 0.7553865652724968,
"grad_norm": 0.05384962260723114,
"learning_rate": 2.8444410258924693e-05,
"loss": 0.4999,
"step": 1192
},
{
"epoch": 0.756020278833967,
"grad_norm": 0.048367083072662354,
"learning_rate": 2.8304593721917285e-05,
"loss": 0.5449,
"step": 1193
},
{
"epoch": 0.7566539923954373,
"grad_norm": 0.056033551692962646,
"learning_rate": 2.8165064990227252e-05,
"loss": 0.5213,
"step": 1194
},
{
"epoch": 0.7572877059569075,
"grad_norm": 0.05267353728413582,
"learning_rate": 2.8025824623961773e-05,
"loss": 0.5148,
"step": 1195
},
{
"epoch": 0.7579214195183777,
"grad_norm": 0.048320189118385315,
"learning_rate": 2.7886873182070418e-05,
"loss": 0.446,
"step": 1196
},
{
"epoch": 0.7585551330798479,
"grad_norm": 0.0505547821521759,
"learning_rate": 2.7748211222342957e-05,
"loss": 0.4803,
"step": 1197
},
{
"epoch": 0.7591888466413181,
"grad_norm": 0.0495329424738884,
"learning_rate": 2.7609839301407104e-05,
"loss": 0.4113,
"step": 1198
},
{
"epoch": 0.7598225602027884,
"grad_norm": 0.05694635957479477,
"learning_rate": 2.7471757974726253e-05,
"loss": 0.5203,
"step": 1199
},
{
"epoch": 0.7604562737642585,
"grad_norm": 0.05900391563773155,
"learning_rate": 2.7333967796597315e-05,
"loss": 0.5719,
"step": 1200
},
{
"epoch": 0.7610899873257287,
"grad_norm": 0.04960976913571358,
"learning_rate": 2.7196469320148342e-05,
"loss": 0.3774,
"step": 1201
},
{
"epoch": 0.761723700887199,
"grad_norm": 0.06350208073854446,
"learning_rate": 2.7059263097336597e-05,
"loss": 0.4629,
"step": 1202
},
{
"epoch": 0.7623574144486692,
"grad_norm": 0.1048828512430191,
"learning_rate": 2.692234967894597e-05,
"loss": 0.4025,
"step": 1203
},
{
"epoch": 0.7629911280101395,
"grad_norm": 0.0617394745349884,
"learning_rate": 2.6785729614585066e-05,
"loss": 0.4818,
"step": 1204
},
{
"epoch": 0.7636248415716096,
"grad_norm": 0.04619552195072174,
"learning_rate": 2.664940345268483e-05,
"loss": 0.4707,
"step": 1205
},
{
"epoch": 0.7642585551330798,
"grad_norm": 0.05683249235153198,
"learning_rate": 2.651337174049645e-05,
"loss": 0.6169,
"step": 1206
},
{
"epoch": 0.7648922686945501,
"grad_norm": 0.055084507912397385,
"learning_rate": 2.6377635024089087e-05,
"loss": 0.3774,
"step": 1207
},
{
"epoch": 0.7655259822560203,
"grad_norm": 0.062256794422864914,
"learning_rate": 2.624219384834764e-05,
"loss": 0.5087,
"step": 1208
},
{
"epoch": 0.7661596958174905,
"grad_norm": 0.09854335337877274,
"learning_rate": 2.6107048756970764e-05,
"loss": 0.5207,
"step": 1209
},
{
"epoch": 0.7667934093789607,
"grad_norm": 0.047757312655448914,
"learning_rate": 2.5972200292468464e-05,
"loss": 0.4854,
"step": 1210
},
{
"epoch": 0.7674271229404309,
"grad_norm": 0.0559842512011528,
"learning_rate": 2.5837648996159948e-05,
"loss": 0.5389,
"step": 1211
},
{
"epoch": 0.7680608365019012,
"grad_norm": 0.07216788083314896,
"learning_rate": 2.570339540817167e-05,
"loss": 0.6026,
"step": 1212
},
{
"epoch": 0.7686945500633714,
"grad_norm": 0.058914799243211746,
"learning_rate": 2.5569440067434813e-05,
"loss": 0.4706,
"step": 1213
},
{
"epoch": 0.7693282636248415,
"grad_norm": 0.05721895024180412,
"learning_rate": 2.5435783511683443e-05,
"loss": 0.483,
"step": 1214
},
{
"epoch": 0.7699619771863118,
"grad_norm": 0.11345556378364563,
"learning_rate": 2.5302426277452172e-05,
"loss": 0.5928,
"step": 1215
},
{
"epoch": 0.770595690747782,
"grad_norm": 0.054349955171346664,
"learning_rate": 2.5169368900074065e-05,
"loss": 0.4172,
"step": 1216
},
{
"epoch": 0.7712294043092522,
"grad_norm": 0.06529269367456436,
"learning_rate": 2.50366119136785e-05,
"loss": 0.6143,
"step": 1217
},
{
"epoch": 0.7718631178707225,
"grad_norm": 0.04771547392010689,
"learning_rate": 2.4904155851188872e-05,
"loss": 0.5159,
"step": 1218
},
{
"epoch": 0.7724968314321926,
"grad_norm": 0.05920165777206421,
"learning_rate": 2.4772001244320808e-05,
"loss": 0.5099,
"step": 1219
},
{
"epoch": 0.7731305449936628,
"grad_norm": 0.18929556012153625,
"learning_rate": 2.4640148623579607e-05,
"loss": 0.5481,
"step": 1220
},
{
"epoch": 0.7737642585551331,
"grad_norm": 0.05958317965269089,
"learning_rate": 2.450859851825842e-05,
"loss": 0.6733,
"step": 1221
},
{
"epoch": 0.7743979721166033,
"grad_norm": 0.07901210337877274,
"learning_rate": 2.437735145643597e-05,
"loss": 0.5206,
"step": 1222
},
{
"epoch": 0.7750316856780735,
"grad_norm": 0.06391927599906921,
"learning_rate": 2.4246407964974514e-05,
"loss": 0.5176,
"step": 1223
},
{
"epoch": 0.7756653992395437,
"grad_norm": 0.06345279514789581,
"learning_rate": 2.4115768569517662e-05,
"loss": 0.5595,
"step": 1224
},
{
"epoch": 0.7762991128010139,
"grad_norm": 0.0551944300532341,
"learning_rate": 2.398543379448832e-05,
"loss": 0.4748,
"step": 1225
},
{
"epoch": 0.7769328263624842,
"grad_norm": 0.09742176532745361,
"learning_rate": 2.3855404163086558e-05,
"loss": 0.5032,
"step": 1226
},
{
"epoch": 0.7775665399239544,
"grad_norm": 0.08844289928674698,
"learning_rate": 2.3725680197287493e-05,
"loss": 0.6148,
"step": 1227
},
{
"epoch": 0.7782002534854245,
"grad_norm": 0.081541508436203,
"learning_rate": 2.3596262417839255e-05,
"loss": 0.7268,
"step": 1228
},
{
"epoch": 0.7788339670468948,
"grad_norm": 0.04980150982737541,
"learning_rate": 2.346715134426084e-05,
"loss": 0.4272,
"step": 1229
},
{
"epoch": 0.779467680608365,
"grad_norm": 0.10399996489286423,
"learning_rate": 2.3338347494839997e-05,
"loss": 0.4799,
"step": 1230
},
{
"epoch": 0.7801013941698353,
"grad_norm": 0.059049125760793686,
"learning_rate": 2.3209851386631244e-05,
"loss": 0.4476,
"step": 1231
},
{
"epoch": 0.7807351077313055,
"grad_norm": 0.05135215446352959,
"learning_rate": 2.3081663535453736e-05,
"loss": 0.5203,
"step": 1232
},
{
"epoch": 0.7813688212927756,
"grad_norm": 0.06840016692876816,
"learning_rate": 2.2953784455889192e-05,
"loss": 0.5327,
"step": 1233
},
{
"epoch": 0.7820025348542459,
"grad_norm": 0.05446001887321472,
"learning_rate": 2.282621466127982e-05,
"loss": 0.5004,
"step": 1234
},
{
"epoch": 0.7826362484157161,
"grad_norm": 0.07529130578041077,
"learning_rate": 2.26989546637263e-05,
"loss": 0.547,
"step": 1235
},
{
"epoch": 0.7832699619771863,
"grad_norm": 0.05020952597260475,
"learning_rate": 2.2572004974085715e-05,
"loss": 0.5472,
"step": 1236
},
{
"epoch": 0.7839036755386565,
"grad_norm": 0.11373034119606018,
"learning_rate": 2.2445366101969344e-05,
"loss": 0.6839,
"step": 1237
},
{
"epoch": 0.7845373891001267,
"grad_norm": 0.05025709047913551,
"learning_rate": 2.2319038555741012e-05,
"loss": 0.5883,
"step": 1238
},
{
"epoch": 0.785171102661597,
"grad_norm": 0.0527602918446064,
"learning_rate": 2.2193022842514554e-05,
"loss": 0.4959,
"step": 1239
},
{
"epoch": 0.7858048162230672,
"grad_norm": 0.04213232547044754,
"learning_rate": 2.2067319468152135e-05,
"loss": 0.4668,
"step": 1240
},
{
"epoch": 0.7864385297845374,
"grad_norm": 0.0538921095430851,
"learning_rate": 2.1941928937262147e-05,
"loss": 0.3504,
"step": 1241
},
{
"epoch": 0.7870722433460076,
"grad_norm": 0.06466099619865417,
"learning_rate": 2.181685175319702e-05,
"loss": 0.4315,
"step": 1242
},
{
"epoch": 0.7877059569074778,
"grad_norm": 0.06612730026245117,
"learning_rate": 2.1692088418051416e-05,
"loss": 0.5484,
"step": 1243
},
{
"epoch": 0.788339670468948,
"grad_norm": 0.056087784469127655,
"learning_rate": 2.156763943266008e-05,
"loss": 0.5191,
"step": 1244
},
{
"epoch": 0.7889733840304183,
"grad_norm": 0.05299568176269531,
"learning_rate": 2.144350529659589e-05,
"loss": 0.4347,
"step": 1245
},
{
"epoch": 0.7896070975918885,
"grad_norm": 0.05531737953424454,
"learning_rate": 2.1319686508167835e-05,
"loss": 0.4698,
"step": 1246
},
{
"epoch": 0.7902408111533586,
"grad_norm": 0.05265354737639427,
"learning_rate": 2.1196183564418916e-05,
"loss": 0.5946,
"step": 1247
},
{
"epoch": 0.7908745247148289,
"grad_norm": 0.05842369794845581,
"learning_rate": 2.107299696112445e-05,
"loss": 0.4225,
"step": 1248
},
{
"epoch": 0.7915082382762991,
"grad_norm": 0.07972507178783417,
"learning_rate": 2.095012719278966e-05,
"loss": 0.4368,
"step": 1249
},
{
"epoch": 0.7921419518377694,
"grad_norm": 0.07557940483093262,
"learning_rate": 2.0827574752648038e-05,
"loss": 0.5221,
"step": 1250
},
{
"epoch": 0.7927756653992395,
"grad_norm": 0.07189042866230011,
"learning_rate": 2.070534013265917e-05,
"loss": 0.6327,
"step": 1251
},
{
"epoch": 0.7934093789607097,
"grad_norm": 0.05134062096476555,
"learning_rate": 2.0583423823506854e-05,
"loss": 0.4055,
"step": 1252
},
{
"epoch": 0.79404309252218,
"grad_norm": 0.061564311385154724,
"learning_rate": 2.046182631459709e-05,
"loss": 0.5737,
"step": 1253
},
{
"epoch": 0.7946768060836502,
"grad_norm": 0.06396010518074036,
"learning_rate": 2.034054809405613e-05,
"loss": 0.5532,
"step": 1254
},
{
"epoch": 0.7953105196451205,
"grad_norm": 0.05783439427614212,
"learning_rate": 2.021958964872851e-05,
"loss": 0.4583,
"step": 1255
},
{
"epoch": 0.7959442332065906,
"grad_norm": 0.047606490552425385,
"learning_rate": 2.009895146417512e-05,
"loss": 0.438,
"step": 1256
},
{
"epoch": 0.7965779467680608,
"grad_norm": 0.04959520697593689,
"learning_rate": 1.9978634024671127e-05,
"loss": 0.5007,
"step": 1257
},
{
"epoch": 0.7972116603295311,
"grad_norm": 0.0747847780585289,
"learning_rate": 1.985863781320435e-05,
"loss": 0.5268,
"step": 1258
},
{
"epoch": 0.7978453738910013,
"grad_norm": 0.05679310858249664,
"learning_rate": 1.973896331147288e-05,
"loss": 0.5326,
"step": 1259
},
{
"epoch": 0.7984790874524715,
"grad_norm": 0.06326697021722794,
"learning_rate": 1.961961099988353e-05,
"loss": 0.4857,
"step": 1260
},
{
"epoch": 0.7991128010139417,
"grad_norm": 0.052933454513549805,
"learning_rate": 1.9500581357549675e-05,
"loss": 0.4833,
"step": 1261
},
{
"epoch": 0.7997465145754119,
"grad_norm": 0.06346312910318375,
"learning_rate": 1.938187486228945e-05,
"loss": 0.6137,
"step": 1262
},
{
"epoch": 0.8003802281368821,
"grad_norm": 0.04987538978457451,
"learning_rate": 1.926349199062376e-05,
"loss": 0.4839,
"step": 1263
},
{
"epoch": 0.8010139416983524,
"grad_norm": 0.08084560185670853,
"learning_rate": 1.9145433217774412e-05,
"loss": 0.5255,
"step": 1264
},
{
"epoch": 0.8016476552598225,
"grad_norm": 0.04327573999762535,
"learning_rate": 1.9027699017662194e-05,
"loss": 0.4094,
"step": 1265
},
{
"epoch": 0.8022813688212928,
"grad_norm": 0.045093148946762085,
"learning_rate": 1.891028986290492e-05,
"loss": 0.3749,
"step": 1266
},
{
"epoch": 0.802915082382763,
"grad_norm": 0.05421237647533417,
"learning_rate": 1.879320622481564e-05,
"loss": 0.5481,
"step": 1267
},
{
"epoch": 0.8035487959442332,
"grad_norm": 0.05031122267246246,
"learning_rate": 1.8676448573400662e-05,
"loss": 0.5616,
"step": 1268
},
{
"epoch": 0.8041825095057035,
"grad_norm": 0.061348918825387955,
"learning_rate": 1.8560017377357696e-05,
"loss": 0.4456,
"step": 1269
},
{
"epoch": 0.8048162230671736,
"grad_norm": 0.05528967082500458,
"learning_rate": 1.8443913104073983e-05,
"loss": 0.4684,
"step": 1270
},
{
"epoch": 0.8054499366286438,
"grad_norm": 0.061211880296468735,
"learning_rate": 1.832813621962439e-05,
"loss": 0.5477,
"step": 1271
},
{
"epoch": 0.8060836501901141,
"grad_norm": 0.053657419979572296,
"learning_rate": 1.8212687188769563e-05,
"loss": 0.4267,
"step": 1272
},
{
"epoch": 0.8067173637515843,
"grad_norm": 0.057141125202178955,
"learning_rate": 1.809756647495404e-05,
"loss": 0.5345,
"step": 1273
},
{
"epoch": 0.8073510773130546,
"grad_norm": 0.05224443972110748,
"learning_rate": 1.7982774540304403e-05,
"loss": 0.5199,
"step": 1274
},
{
"epoch": 0.8079847908745247,
"grad_norm": 0.06389784067869186,
"learning_rate": 1.7868311845627472e-05,
"loss": 0.554,
"step": 1275
},
{
"epoch": 0.8086185044359949,
"grad_norm": 0.045400507748126984,
"learning_rate": 1.7754178850408275e-05,
"loss": 0.488,
"step": 1276
},
{
"epoch": 0.8092522179974652,
"grad_norm": 0.06506984680891037,
"learning_rate": 1.7640376012808536e-05,
"loss": 0.537,
"step": 1277
},
{
"epoch": 0.8098859315589354,
"grad_norm": 0.07384153455495834,
"learning_rate": 1.752690378966444e-05,
"loss": 0.5557,
"step": 1278
},
{
"epoch": 0.8105196451204055,
"grad_norm": 0.06372448056936264,
"learning_rate": 1.741376263648511e-05,
"loss": 0.5745,
"step": 1279
},
{
"epoch": 0.8111533586818758,
"grad_norm": 0.06869064271450043,
"learning_rate": 1.7300953007450604e-05,
"loss": 0.6153,
"step": 1280
},
{
"epoch": 0.811787072243346,
"grad_norm": 0.057191621512174606,
"learning_rate": 1.7188475355410205e-05,
"loss": 0.5183,
"step": 1281
},
{
"epoch": 0.8124207858048162,
"grad_norm": 0.06072848662734032,
"learning_rate": 1.7076330131880526e-05,
"loss": 0.445,
"step": 1282
},
{
"epoch": 0.8130544993662865,
"grad_norm": 0.05268765240907669,
"learning_rate": 1.696451778704362e-05,
"loss": 0.3636,
"step": 1283
},
{
"epoch": 0.8136882129277566,
"grad_norm": 0.052101630717515945,
"learning_rate": 1.6853038769745467e-05,
"loss": 0.5545,
"step": 1284
},
{
"epoch": 0.8143219264892269,
"grad_norm": 0.05689757317304611,
"learning_rate": 1.6741893527493858e-05,
"loss": 0.4906,
"step": 1285
},
{
"epoch": 0.8149556400506971,
"grad_norm": 0.06319184601306915,
"learning_rate": 1.6631082506456664e-05,
"loss": 0.5482,
"step": 1286
},
{
"epoch": 0.8155893536121673,
"grad_norm": 0.04711335524916649,
"learning_rate": 1.652060615146026e-05,
"loss": 0.4909,
"step": 1287
},
{
"epoch": 0.8162230671736375,
"grad_norm": 0.057239387184381485,
"learning_rate": 1.641046490598741e-05,
"loss": 0.6353,
"step": 1288
},
{
"epoch": 0.8168567807351077,
"grad_norm": 0.05296272784471512,
"learning_rate": 1.6300659212175762e-05,
"loss": 0.4684,
"step": 1289
},
{
"epoch": 0.8174904942965779,
"grad_norm": 0.06298789381980896,
"learning_rate": 1.619118951081594e-05,
"loss": 0.5711,
"step": 1290
},
{
"epoch": 0.8181242078580482,
"grad_norm": 0.09905237704515457,
"learning_rate": 1.6082056241349786e-05,
"loss": 0.4142,
"step": 1291
},
{
"epoch": 0.8187579214195184,
"grad_norm": 0.05665900185704231,
"learning_rate": 1.5973259841868648e-05,
"loss": 0.3731,
"step": 1292
},
{
"epoch": 0.8193916349809885,
"grad_norm": 0.05161561071872711,
"learning_rate": 1.5864800749111498e-05,
"loss": 0.6025,
"step": 1293
},
{
"epoch": 0.8200253485424588,
"grad_norm": 0.05798448994755745,
"learning_rate": 1.5756679398463404e-05,
"loss": 0.5028,
"step": 1294
},
{
"epoch": 0.820659062103929,
"grad_norm": 0.058822982013225555,
"learning_rate": 1.564889622395349e-05,
"loss": 0.5285,
"step": 1295
},
{
"epoch": 0.8212927756653993,
"grad_norm": 0.061565153300762177,
"learning_rate": 1.554145165825346e-05,
"loss": 0.3851,
"step": 1296
},
{
"epoch": 0.8219264892268695,
"grad_norm": 0.06362520158290863,
"learning_rate": 1.543434613267569e-05,
"loss": 0.5463,
"step": 1297
},
{
"epoch": 0.8225602027883396,
"grad_norm": 0.05907528102397919,
"learning_rate": 1.5327580077171587e-05,
"loss": 0.5345,
"step": 1298
},
{
"epoch": 0.8231939163498099,
"grad_norm": 0.07837852835655212,
"learning_rate": 1.522115392032981e-05,
"loss": 0.5596,
"step": 1299
},
{
"epoch": 0.8238276299112801,
"grad_norm": 0.05258476361632347,
"learning_rate": 1.5115068089374584e-05,
"loss": 0.5727,
"step": 1300
},
{
"epoch": 0.8244613434727504,
"grad_norm": 0.04834875091910362,
"learning_rate": 1.5009323010163957e-05,
"loss": 0.4594,
"step": 1301
},
{
"epoch": 0.8250950570342205,
"grad_norm": 0.049648720771074295,
"learning_rate": 1.4903919107188103e-05,
"loss": 0.5206,
"step": 1302
},
{
"epoch": 0.8257287705956907,
"grad_norm": 0.05695943161845207,
"learning_rate": 1.479885680356764e-05,
"loss": 0.4571,
"step": 1303
},
{
"epoch": 0.826362484157161,
"grad_norm": 0.06374506652355194,
"learning_rate": 1.4694136521051927e-05,
"loss": 0.6129,
"step": 1304
},
{
"epoch": 0.8269961977186312,
"grad_norm": 0.05414360389113426,
"learning_rate": 1.4589758680017263e-05,
"loss": 0.5685,
"step": 1305
},
{
"epoch": 0.8276299112801014,
"grad_norm": 0.057321734726428986,
"learning_rate": 1.4485723699465392e-05,
"loss": 0.4432,
"step": 1306
},
{
"epoch": 0.8282636248415716,
"grad_norm": 0.07635460048913956,
"learning_rate": 1.4382031997021683e-05,
"loss": 0.6343,
"step": 1307
},
{
"epoch": 0.8288973384030418,
"grad_norm": 0.05573682114481926,
"learning_rate": 1.4278683988933483e-05,
"loss": 0.4767,
"step": 1308
},
{
"epoch": 0.829531051964512,
"grad_norm": 0.061004284769296646,
"learning_rate": 1.4175680090068477e-05,
"loss": 0.481,
"step": 1309
},
{
"epoch": 0.8301647655259823,
"grad_norm": 0.05386270582675934,
"learning_rate": 1.4073020713912987e-05,
"loss": 0.4173,
"step": 1310
},
{
"epoch": 0.8307984790874525,
"grad_norm": 0.056015755981206894,
"learning_rate": 1.3970706272570333e-05,
"loss": 0.5044,
"step": 1311
},
{
"epoch": 0.8314321926489227,
"grad_norm": 0.055295176804065704,
"learning_rate": 1.3868737176759106e-05,
"loss": 0.5162,
"step": 1312
},
{
"epoch": 0.8320659062103929,
"grad_norm": 0.05815298855304718,
"learning_rate": 1.3767113835811719e-05,
"loss": 0.503,
"step": 1313
},
{
"epoch": 0.8326996197718631,
"grad_norm": 0.05925634503364563,
"learning_rate": 1.3665836657672493e-05,
"loss": 0.376,
"step": 1314
},
{
"epoch": 0.8333333333333334,
"grad_norm": 0.07117018848657608,
"learning_rate": 1.356490604889622e-05,
"loss": 0.5373,
"step": 1315
},
{
"epoch": 0.8339670468948035,
"grad_norm": 0.05111690238118172,
"learning_rate": 1.346432241464648e-05,
"loss": 0.4692,
"step": 1316
},
{
"epoch": 0.8346007604562737,
"grad_norm": 0.04756741225719452,
"learning_rate": 1.3364086158693967e-05,
"loss": 0.494,
"step": 1317
},
{
"epoch": 0.835234474017744,
"grad_norm": 0.07146294414997101,
"learning_rate": 1.3264197683414914e-05,
"loss": 0.4826,
"step": 1318
},
{
"epoch": 0.8358681875792142,
"grad_norm": 0.053026240319013596,
"learning_rate": 1.3164657389789458e-05,
"loss": 0.5435,
"step": 1319
},
{
"epoch": 0.8365019011406845,
"grad_norm": 0.05339264124631882,
"learning_rate": 1.3065465677400046e-05,
"loss": 0.5336,
"step": 1320
},
{
"epoch": 0.8371356147021546,
"grad_norm": 0.0469050295650959,
"learning_rate": 1.2966622944429863e-05,
"loss": 0.5496,
"step": 1321
},
{
"epoch": 0.8377693282636248,
"grad_norm": 0.05157692730426788,
"learning_rate": 1.286812958766106e-05,
"loss": 0.4415,
"step": 1322
},
{
"epoch": 0.8384030418250951,
"grad_norm": 0.04584040492773056,
"learning_rate": 1.2769986002473488e-05,
"loss": 0.3915,
"step": 1323
},
{
"epoch": 0.8390367553865653,
"grad_norm": 0.05028408765792847,
"learning_rate": 1.2672192582842756e-05,
"loss": 0.4896,
"step": 1324
},
{
"epoch": 0.8396704689480355,
"grad_norm": 0.05721574276685715,
"learning_rate": 1.2574749721338874e-05,
"loss": 0.468,
"step": 1325
},
{
"epoch": 0.8403041825095057,
"grad_norm": 0.05546222999691963,
"learning_rate": 1.2477657809124631e-05,
"loss": 0.5043,
"step": 1326
},
{
"epoch": 0.8409378960709759,
"grad_norm": 0.06228777393698692,
"learning_rate": 1.2380917235953992e-05,
"loss": 0.4705,
"step": 1327
},
{
"epoch": 0.8415716096324461,
"grad_norm": 0.06848033517599106,
"learning_rate": 1.2284528390170547e-05,
"loss": 0.551,
"step": 1328
},
{
"epoch": 0.8422053231939164,
"grad_norm": 0.06439417600631714,
"learning_rate": 1.2188491658705892e-05,
"loss": 0.6371,
"step": 1329
},
{
"epoch": 0.8428390367553865,
"grad_norm": 0.04973322153091431,
"learning_rate": 1.2092807427078279e-05,
"loss": 0.4066,
"step": 1330
},
{
"epoch": 0.8434727503168568,
"grad_norm": 0.051313966512680054,
"learning_rate": 1.1997476079390835e-05,
"loss": 0.5193,
"step": 1331
},
{
"epoch": 0.844106463878327,
"grad_norm": 0.061408158391714096,
"learning_rate": 1.1902497998330064e-05,
"loss": 0.468,
"step": 1332
},
{
"epoch": 0.8447401774397972,
"grad_norm": 0.08167242258787155,
"learning_rate": 1.1807873565164506e-05,
"loss": 0.5863,
"step": 1333
},
{
"epoch": 0.8453738910012675,
"grad_norm": 0.05669960379600525,
"learning_rate": 1.1713603159742915e-05,
"loss": 0.4906,
"step": 1334
},
{
"epoch": 0.8460076045627376,
"grad_norm": 0.055099859833717346,
"learning_rate": 1.1619687160492953e-05,
"loss": 0.4962,
"step": 1335
},
{
"epoch": 0.8466413181242078,
"grad_norm": 0.048016201704740524,
"learning_rate": 1.1526125944419586e-05,
"loss": 0.4489,
"step": 1336
},
{
"epoch": 0.8472750316856781,
"grad_norm": 0.05006731301546097,
"learning_rate": 1.1432919887103578e-05,
"loss": 0.5157,
"step": 1337
},
{
"epoch": 0.8479087452471483,
"grad_norm": 0.06883764266967773,
"learning_rate": 1.134006936269999e-05,
"loss": 0.4872,
"step": 1338
},
{
"epoch": 0.8485424588086184,
"grad_norm": 0.06572287529706955,
"learning_rate": 1.1247574743936674e-05,
"loss": 0.5116,
"step": 1339
},
{
"epoch": 0.8491761723700887,
"grad_norm": 0.04616443067789078,
"learning_rate": 1.1155436402112785e-05,
"loss": 0.4337,
"step": 1340
},
{
"epoch": 0.8498098859315589,
"grad_norm": 0.052716389298439026,
"learning_rate": 1.1063654707097237e-05,
"loss": 0.4865,
"step": 1341
},
{
"epoch": 0.8504435994930292,
"grad_norm": 0.0780235081911087,
"learning_rate": 1.0972230027327335e-05,
"loss": 0.5133,
"step": 1342
},
{
"epoch": 0.8510773130544994,
"grad_norm": 0.0498424768447876,
"learning_rate": 1.0881162729807182e-05,
"loss": 0.5187,
"step": 1343
},
{
"epoch": 0.8517110266159695,
"grad_norm": 0.05532195791602135,
"learning_rate": 1.0790453180106253e-05,
"loss": 0.5798,
"step": 1344
},
{
"epoch": 0.8523447401774398,
"grad_norm": 0.07864464819431305,
"learning_rate": 1.0700101742357926e-05,
"loss": 0.5761,
"step": 1345
},
{
"epoch": 0.85297845373891,
"grad_norm": 0.09476902335882187,
"learning_rate": 1.0610108779258044e-05,
"loss": 0.6139,
"step": 1346
},
{
"epoch": 0.8536121673003803,
"grad_norm": 0.04403742030262947,
"learning_rate": 1.0520474652063394e-05,
"loss": 0.4819,
"step": 1347
},
{
"epoch": 0.8542458808618505,
"grad_norm": 0.06702962517738342,
"learning_rate": 1.0431199720590324e-05,
"loss": 0.4967,
"step": 1348
},
{
"epoch": 0.8548795944233206,
"grad_norm": 0.05125468224287033,
"learning_rate": 1.0342284343213238e-05,
"loss": 0.5489,
"step": 1349
},
{
"epoch": 0.8555133079847909,
"grad_norm": 0.11487980931997299,
"learning_rate": 1.0253728876863255e-05,
"loss": 0.639,
"step": 1350
},
{
"epoch": 0.8561470215462611,
"grad_norm": 0.06662024557590485,
"learning_rate": 1.0165533677026584e-05,
"loss": 0.4577,
"step": 1351
},
{
"epoch": 0.8567807351077313,
"grad_norm": 0.053168293088674545,
"learning_rate": 1.007769909774341e-05,
"loss": 0.4738,
"step": 1352
},
{
"epoch": 0.8574144486692015,
"grad_norm": 0.0470830462872982,
"learning_rate": 9.990225491606098e-06,
"loss": 0.455,
"step": 1353
},
{
"epoch": 0.8580481622306717,
"grad_norm": 0.08435950428247452,
"learning_rate": 9.903113209758096e-06,
"loss": 0.5088,
"step": 1354
},
{
"epoch": 0.858681875792142,
"grad_norm": 0.06445365399122238,
"learning_rate": 9.816362601892326e-06,
"loss": 0.5801,
"step": 1355
},
{
"epoch": 0.8593155893536122,
"grad_norm": 0.05347883328795433,
"learning_rate": 9.729974016249899e-06,
"loss": 0.3887,
"step": 1356
},
{
"epoch": 0.8599493029150824,
"grad_norm": 0.055654123425483704,
"learning_rate": 9.643947799618658e-06,
"loss": 0.3983,
"step": 1357
},
{
"epoch": 0.8605830164765526,
"grad_norm": 0.07010248303413391,
"learning_rate": 9.55828429733171e-06,
"loss": 0.4775,
"step": 1358
},
{
"epoch": 0.8612167300380228,
"grad_norm": 0.05431290343403816,
"learning_rate": 9.472983853266282e-06,
"loss": 0.5438,
"step": 1359
},
{
"epoch": 0.861850443599493,
"grad_norm": 0.10483434051275253,
"learning_rate": 9.388046809842055e-06,
"loss": 0.5196,
"step": 1360
},
{
"epoch": 0.8624841571609633,
"grad_norm": 0.05333583801984787,
"learning_rate": 9.303473508019944e-06,
"loss": 0.5467,
"step": 1361
},
{
"epoch": 0.8631178707224335,
"grad_norm": 0.05904083698987961,
"learning_rate": 9.219264287300799e-06,
"loss": 0.5264,
"step": 1362
},
{
"epoch": 0.8637515842839036,
"grad_norm": 0.0580391101539135,
"learning_rate": 9.135419485723796e-06,
"loss": 0.473,
"step": 1363
},
{
"epoch": 0.8643852978453739,
"grad_norm": 0.06643393635749817,
"learning_rate": 9.051939439865342e-06,
"loss": 0.5126,
"step": 1364
},
{
"epoch": 0.8650190114068441,
"grad_norm": 0.04457048326730728,
"learning_rate": 8.968824484837578e-06,
"loss": 0.3846,
"step": 1365
},
{
"epoch": 0.8656527249683144,
"grad_norm": 0.05300934240221977,
"learning_rate": 8.88607495428705e-06,
"loss": 0.4201,
"step": 1366
},
{
"epoch": 0.8662864385297845,
"grad_norm": 0.0492468886077404,
"learning_rate": 8.803691180393448e-06,
"loss": 0.5123,
"step": 1367
},
{
"epoch": 0.8669201520912547,
"grad_norm": 0.05493564158678055,
"learning_rate": 8.72167349386811e-06,
"loss": 0.5126,
"step": 1368
},
{
"epoch": 0.867553865652725,
"grad_norm": 0.05255114659667015,
"learning_rate": 8.640022223952915e-06,
"loss": 0.574,
"step": 1369
},
{
"epoch": 0.8681875792141952,
"grad_norm": 0.06494517624378204,
"learning_rate": 8.558737698418761e-06,
"loss": 0.4352,
"step": 1370
},
{
"epoch": 0.8688212927756654,
"grad_norm": 0.06190333142876625,
"learning_rate": 8.477820243564361e-06,
"loss": 0.3904,
"step": 1371
},
{
"epoch": 0.8694550063371356,
"grad_norm": 0.0645797923207283,
"learning_rate": 8.397270184214912e-06,
"loss": 0.5635,
"step": 1372
},
{
"epoch": 0.8700887198986058,
"grad_norm": 0.20880210399627686,
"learning_rate": 8.317087843720762e-06,
"loss": 0.5335,
"step": 1373
},
{
"epoch": 0.870722433460076,
"grad_norm": 0.05390581861138344,
"learning_rate": 8.237273543956147e-06,
"loss": 0.5685,
"step": 1374
},
{
"epoch": 0.8713561470215463,
"grad_norm": 0.054211899638175964,
"learning_rate": 8.157827605317892e-06,
"loss": 0.396,
"step": 1375
},
{
"epoch": 0.8719898605830165,
"grad_norm": 0.049035023897886276,
"learning_rate": 8.078750346724107e-06,
"loss": 0.4993,
"step": 1376
},
{
"epoch": 0.8726235741444867,
"grad_norm": 0.06823498010635376,
"learning_rate": 8.000042085612925e-06,
"loss": 0.6172,
"step": 1377
},
{
"epoch": 0.8732572877059569,
"grad_norm": 0.0540115050971508,
"learning_rate": 7.921703137941173e-06,
"loss": 0.5578,
"step": 1378
},
{
"epoch": 0.8738910012674271,
"grad_norm": 0.06128177419304848,
"learning_rate": 7.843733818183252e-06,
"loss": 0.5836,
"step": 1379
},
{
"epoch": 0.8745247148288974,
"grad_norm": 0.057263512164354324,
"learning_rate": 7.766134439329676e-06,
"loss": 0.416,
"step": 1380
},
{
"epoch": 0.8751584283903675,
"grad_norm": 0.21897205710411072,
"learning_rate": 7.688905312885963e-06,
"loss": 0.5334,
"step": 1381
},
{
"epoch": 0.8757921419518377,
"grad_norm": 0.0429847426712513,
"learning_rate": 7.612046748871327e-06,
"loss": 0.4916,
"step": 1382
},
{
"epoch": 0.876425855513308,
"grad_norm": 0.07513666898012161,
"learning_rate": 7.535559055817431e-06,
"loss": 0.4666,
"step": 1383
},
{
"epoch": 0.8770595690747782,
"grad_norm": 0.05872650444507599,
"learning_rate": 7.4594425407671694e-06,
"loss": 0.5264,
"step": 1384
},
{
"epoch": 0.8776932826362485,
"grad_norm": 0.060816336423158646,
"learning_rate": 7.383697509273424e-06,
"loss": 0.4328,
"step": 1385
},
{
"epoch": 0.8783269961977186,
"grad_norm": 0.04453590139746666,
"learning_rate": 7.308324265397836e-06,
"loss": 0.3405,
"step": 1386
},
{
"epoch": 0.8789607097591888,
"grad_norm": 0.05910937488079071,
"learning_rate": 7.233323111709556e-06,
"loss": 0.3857,
"step": 1387
},
{
"epoch": 0.8795944233206591,
"grad_norm": 0.0769592672586441,
"learning_rate": 7.158694349284145e-06,
"loss": 0.6534,
"step": 1388
},
{
"epoch": 0.8802281368821293,
"grad_norm": 0.07179361581802368,
"learning_rate": 7.084438277702188e-06,
"loss": 0.6393,
"step": 1389
},
{
"epoch": 0.8808618504435995,
"grad_norm": 0.05403920263051987,
"learning_rate": 7.010555195048241e-06,
"loss": 0.4918,
"step": 1390
},
{
"epoch": 0.8814955640050697,
"grad_norm": 0.04676292836666107,
"learning_rate": 6.9370453979095584e-06,
"loss": 0.4358,
"step": 1391
},
{
"epoch": 0.8821292775665399,
"grad_norm": 0.05920941010117531,
"learning_rate": 6.863909181374928e-06,
"loss": 0.5852,
"step": 1392
},
{
"epoch": 0.8827629911280102,
"grad_norm": 0.059993911534547806,
"learning_rate": 6.79114683903348e-06,
"loss": 0.5292,
"step": 1393
},
{
"epoch": 0.8833967046894804,
"grad_norm": 0.06140689179301262,
"learning_rate": 6.718758662973523e-06,
"loss": 0.5274,
"step": 1394
},
{
"epoch": 0.8840304182509505,
"grad_norm": 0.07671885192394257,
"learning_rate": 6.646744943781325e-06,
"loss": 0.6461,
"step": 1395
},
{
"epoch": 0.8846641318124208,
"grad_norm": 0.05516969412565231,
"learning_rate": 6.5751059705400295e-06,
"loss": 0.4195,
"step": 1396
},
{
"epoch": 0.885297845373891,
"grad_norm": 0.06641022861003876,
"learning_rate": 6.5038420308283555e-06,
"loss": 0.5851,
"step": 1397
},
{
"epoch": 0.8859315589353612,
"grad_norm": 0.5566907525062561,
"learning_rate": 6.4329534107196776e-06,
"loss": 0.655,
"step": 1398
},
{
"epoch": 0.8865652724968315,
"grad_norm": 0.04419538006186485,
"learning_rate": 6.362440394780577e-06,
"loss": 0.3974,
"step": 1399
},
{
"epoch": 0.8871989860583016,
"grad_norm": 0.057742148637771606,
"learning_rate": 6.292303266069965e-06,
"loss": 0.6743,
"step": 1400
},
{
"epoch": 0.8878326996197718,
"grad_norm": 0.04702699929475784,
"learning_rate": 6.222542306137791e-06,
"loss": 0.4404,
"step": 1401
},
{
"epoch": 0.8884664131812421,
"grad_norm": 0.04635150358080864,
"learning_rate": 6.153157795023956e-06,
"loss": 0.4961,
"step": 1402
},
{
"epoch": 0.8891001267427123,
"grad_norm": 0.06208278238773346,
"learning_rate": 6.084150011257239e-06,
"loss": 0.5394,
"step": 1403
},
{
"epoch": 0.8897338403041825,
"grad_norm": 0.06848407536745071,
"learning_rate": 6.015519231854017e-06,
"loss": 0.5841,
"step": 1404
},
{
"epoch": 0.8903675538656527,
"grad_norm": 0.06765418499708176,
"learning_rate": 5.947265732317408e-06,
"loss": 0.5375,
"step": 1405
},
{
"epoch": 0.8910012674271229,
"grad_norm": 0.05794088542461395,
"learning_rate": 5.879389786635958e-06,
"loss": 0.5654,
"step": 1406
},
{
"epoch": 0.8916349809885932,
"grad_norm": 0.1468249410390854,
"learning_rate": 5.811891667282554e-06,
"loss": 0.6949,
"step": 1407
},
{
"epoch": 0.8922686945500634,
"grad_norm": 0.05471213161945343,
"learning_rate": 5.744771645213498e-06,
"loss": 0.4148,
"step": 1408
},
{
"epoch": 0.8929024081115335,
"grad_norm": 0.0546412393450737,
"learning_rate": 5.678029989867195e-06,
"loss": 0.4388,
"step": 1409
},
{
"epoch": 0.8935361216730038,
"grad_norm": 0.05917483568191528,
"learning_rate": 5.611666969163243e-06,
"loss": 0.463,
"step": 1410
},
{
"epoch": 0.894169835234474,
"grad_norm": 0.0529630072414875,
"learning_rate": 5.545682849501288e-06,
"loss": 0.469,
"step": 1411
},
{
"epoch": 0.8948035487959443,
"grad_norm": 0.07593543082475662,
"learning_rate": 5.480077895759939e-06,
"loss": 0.5916,
"step": 1412
},
{
"epoch": 0.8954372623574145,
"grad_norm": 0.04886200651526451,
"learning_rate": 5.414852371295753e-06,
"loss": 0.5026,
"step": 1413
},
{
"epoch": 0.8960709759188846,
"grad_norm": 0.05194302648305893,
"learning_rate": 5.350006537942121e-06,
"loss": 0.4711,
"step": 1414
},
{
"epoch": 0.8967046894803549,
"grad_norm": 0.05789684131741524,
"learning_rate": 5.285540656008303e-06,
"loss": 0.569,
"step": 1415
},
{
"epoch": 0.8973384030418251,
"grad_norm": 0.04720381274819374,
"learning_rate": 5.221454984278262e-06,
"loss": 0.4992,
"step": 1416
},
{
"epoch": 0.8979721166032953,
"grad_norm": 0.08316215127706528,
"learning_rate": 5.157749780009735e-06,
"loss": 0.6154,
"step": 1417
},
{
"epoch": 0.8986058301647655,
"grad_norm": 0.07569985836744308,
"learning_rate": 5.094425298933136e-06,
"loss": 0.6809,
"step": 1418
},
{
"epoch": 0.8992395437262357,
"grad_norm": 0.05404726415872574,
"learning_rate": 5.03148179525057e-06,
"loss": 0.532,
"step": 1419
},
{
"epoch": 0.899873257287706,
"grad_norm": 0.051688052713871,
"learning_rate": 4.968919521634785e-06,
"loss": 0.4306,
"step": 1420
},
{
"epoch": 0.9005069708491762,
"grad_norm": 0.05708514526486397,
"learning_rate": 4.906738729228144e-06,
"loss": 0.3961,
"step": 1421
},
{
"epoch": 0.9011406844106464,
"grad_norm": 0.08331619948148727,
"learning_rate": 4.844939667641668e-06,
"loss": 0.5025,
"step": 1422
}
],
"logging_steps": 1,
"max_steps": 1578,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 158,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.9547219502884192e+19,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}