{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.7036059806508356,
"eval_steps": 500,
"global_step": 200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.003518029903254178,
"grad_norm": 1.4669007062911987,
"learning_rate": 0.0001,
"loss": 2.9718,
"step": 1
},
{
"epoch": 0.007036059806508356,
"grad_norm": 1.5240416526794434,
"learning_rate": 9.949748743718594e-05,
"loss": 3.0249,
"step": 2
},
{
"epoch": 0.010554089709762533,
"grad_norm": 1.3310328722000122,
"learning_rate": 9.899497487437186e-05,
"loss": 2.7545,
"step": 3
},
{
"epoch": 0.014072119613016711,
"grad_norm": 1.4892698526382446,
"learning_rate": 9.84924623115578e-05,
"loss": 2.6703,
"step": 4
},
{
"epoch": 0.01759014951627089,
"grad_norm": 1.4727792739868164,
"learning_rate": 9.798994974874372e-05,
"loss": 2.4731,
"step": 5
},
{
"epoch": 0.021108179419525065,
"grad_norm": 1.4451979398727417,
"learning_rate": 9.748743718592965e-05,
"loss": 2.2243,
"step": 6
},
{
"epoch": 0.024626209322779244,
"grad_norm": 1.3103245496749878,
"learning_rate": 9.698492462311559e-05,
"loss": 2.0194,
"step": 7
},
{
"epoch": 0.028144239226033423,
"grad_norm": 1.4852089881896973,
"learning_rate": 9.64824120603015e-05,
"loss": 1.9349,
"step": 8
},
{
"epoch": 0.0316622691292876,
"grad_norm": 1.5170249938964844,
"learning_rate": 9.597989949748745e-05,
"loss": 1.7582,
"step": 9
},
{
"epoch": 0.03518029903254178,
"grad_norm": 1.3428442478179932,
"learning_rate": 9.547738693467337e-05,
"loss": 1.6313,
"step": 10
},
{
"epoch": 0.03869832893579595,
"grad_norm": 1.0400348901748657,
"learning_rate": 9.49748743718593e-05,
"loss": 1.4358,
"step": 11
},
{
"epoch": 0.04221635883905013,
"grad_norm": 0.9891974329948425,
"learning_rate": 9.447236180904523e-05,
"loss": 1.3738,
"step": 12
},
{
"epoch": 0.04573438874230431,
"grad_norm": 0.6980912685394287,
"learning_rate": 9.396984924623115e-05,
"loss": 1.425,
"step": 13
},
{
"epoch": 0.04925241864555849,
"grad_norm": 0.6836680769920349,
"learning_rate": 9.34673366834171e-05,
"loss": 1.4575,
"step": 14
},
{
"epoch": 0.052770448548812667,
"grad_norm": 0.9314870238304138,
"learning_rate": 9.296482412060302e-05,
"loss": 1.3206,
"step": 15
},
{
"epoch": 0.056288478452066845,
"grad_norm": 0.6797922253608704,
"learning_rate": 9.246231155778895e-05,
"loss": 1.3724,
"step": 16
},
{
"epoch": 0.05980650835532102,
"grad_norm": 0.6958814263343811,
"learning_rate": 9.195979899497488e-05,
"loss": 1.3661,
"step": 17
},
{
"epoch": 0.0633245382585752,
"grad_norm": 0.7188398241996765,
"learning_rate": 9.14572864321608e-05,
"loss": 1.3242,
"step": 18
},
{
"epoch": 0.06684256816182937,
"grad_norm": 0.8997742533683777,
"learning_rate": 9.095477386934675e-05,
"loss": 1.4049,
"step": 19
},
{
"epoch": 0.07036059806508356,
"grad_norm": 0.8283623456954956,
"learning_rate": 9.045226130653267e-05,
"loss": 1.3121,
"step": 20
},
{
"epoch": 0.07387862796833773,
"grad_norm": 0.8064684867858887,
"learning_rate": 8.99497487437186e-05,
"loss": 1.3451,
"step": 21
},
{
"epoch": 0.0773966578715919,
"grad_norm": 0.8180544972419739,
"learning_rate": 8.944723618090453e-05,
"loss": 1.2111,
"step": 22
},
{
"epoch": 0.08091468777484609,
"grad_norm": 0.8000004887580872,
"learning_rate": 8.894472361809045e-05,
"loss": 1.2933,
"step": 23
},
{
"epoch": 0.08443271767810026,
"grad_norm": 0.8804137706756592,
"learning_rate": 8.84422110552764e-05,
"loss": 1.3173,
"step": 24
},
{
"epoch": 0.08795074758135445,
"grad_norm": 0.8556327819824219,
"learning_rate": 8.793969849246232e-05,
"loss": 1.321,
"step": 25
},
{
"epoch": 0.09146877748460862,
"grad_norm": 0.827410876750946,
"learning_rate": 8.743718592964825e-05,
"loss": 1.2195,
"step": 26
},
{
"epoch": 0.09498680738786279,
"grad_norm": 0.9081262946128845,
"learning_rate": 8.693467336683418e-05,
"loss": 1.2451,
"step": 27
},
{
"epoch": 0.09850483729111698,
"grad_norm": 0.9331269860267639,
"learning_rate": 8.64321608040201e-05,
"loss": 1.2204,
"step": 28
},
{
"epoch": 0.10202286719437115,
"grad_norm": 1.0290558338165283,
"learning_rate": 8.592964824120603e-05,
"loss": 1.2379,
"step": 29
},
{
"epoch": 0.10554089709762533,
"grad_norm": 1.1296031475067139,
"learning_rate": 8.542713567839196e-05,
"loss": 1.2412,
"step": 30
},
{
"epoch": 0.1090589270008795,
"grad_norm": 1.1690081357955933,
"learning_rate": 8.49246231155779e-05,
"loss": 1.1888,
"step": 31
},
{
"epoch": 0.11257695690413369,
"grad_norm": 1.1313647031784058,
"learning_rate": 8.442211055276383e-05,
"loss": 1.2961,
"step": 32
},
{
"epoch": 0.11609498680738786,
"grad_norm": 1.1976656913757324,
"learning_rate": 8.391959798994975e-05,
"loss": 1.2387,
"step": 33
},
{
"epoch": 0.11961301671064203,
"grad_norm": 1.20232355594635,
"learning_rate": 8.341708542713568e-05,
"loss": 1.3125,
"step": 34
},
{
"epoch": 0.12313104661389622,
"grad_norm": 1.2482579946517944,
"learning_rate": 8.291457286432161e-05,
"loss": 1.322,
"step": 35
},
{
"epoch": 0.1266490765171504,
"grad_norm": 1.0197736024856567,
"learning_rate": 8.241206030150754e-05,
"loss": 1.1192,
"step": 36
},
{
"epoch": 0.13016710642040458,
"grad_norm": 0.9190375208854675,
"learning_rate": 8.190954773869348e-05,
"loss": 1.2522,
"step": 37
},
{
"epoch": 0.13368513632365875,
"grad_norm": 0.7511453032493591,
"learning_rate": 8.14070351758794e-05,
"loss": 1.0525,
"step": 38
},
{
"epoch": 0.13720316622691292,
"grad_norm": 0.7151877880096436,
"learning_rate": 8.090452261306533e-05,
"loss": 1.1839,
"step": 39
},
{
"epoch": 0.14072119613016712,
"grad_norm": 0.6375951766967773,
"learning_rate": 8.040201005025126e-05,
"loss": 1.2203,
"step": 40
},
{
"epoch": 0.1442392260334213,
"grad_norm": 0.6267354488372803,
"learning_rate": 7.989949748743719e-05,
"loss": 1.1996,
"step": 41
},
{
"epoch": 0.14775725593667546,
"grad_norm": 0.5620112419128418,
"learning_rate": 7.939698492462313e-05,
"loss": 1.1745,
"step": 42
},
{
"epoch": 0.15127528583992964,
"grad_norm": 0.6898969411849976,
"learning_rate": 7.889447236180904e-05,
"loss": 1.2377,
"step": 43
},
{
"epoch": 0.1547933157431838,
"grad_norm": 0.5548388957977295,
"learning_rate": 7.839195979899498e-05,
"loss": 1.1654,
"step": 44
},
{
"epoch": 0.158311345646438,
"grad_norm": 0.5869529843330383,
"learning_rate": 7.788944723618091e-05,
"loss": 1.1669,
"step": 45
},
{
"epoch": 0.16182937554969218,
"grad_norm": 0.6272417902946472,
"learning_rate": 7.738693467336684e-05,
"loss": 1.132,
"step": 46
},
{
"epoch": 0.16534740545294635,
"grad_norm": 0.6158267855644226,
"learning_rate": 7.688442211055277e-05,
"loss": 1.0767,
"step": 47
},
{
"epoch": 0.16886543535620052,
"grad_norm": 0.661561906337738,
"learning_rate": 7.638190954773869e-05,
"loss": 1.1867,
"step": 48
},
{
"epoch": 0.1723834652594547,
"grad_norm": 0.5605206489562988,
"learning_rate": 7.587939698492463e-05,
"loss": 1.1243,
"step": 49
},
{
"epoch": 0.1759014951627089,
"grad_norm": 0.6338799595832825,
"learning_rate": 7.537688442211056e-05,
"loss": 1.1635,
"step": 50
},
{
"epoch": 0.17941952506596306,
"grad_norm": 0.7251884937286377,
"learning_rate": 7.487437185929649e-05,
"loss": 1.1462,
"step": 51
},
{
"epoch": 0.18293755496921724,
"grad_norm": 0.5688169598579407,
"learning_rate": 7.437185929648241e-05,
"loss": 1.1351,
"step": 52
},
{
"epoch": 0.1864555848724714,
"grad_norm": 0.6056070923805237,
"learning_rate": 7.386934673366834e-05,
"loss": 1.1352,
"step": 53
},
{
"epoch": 0.18997361477572558,
"grad_norm": 0.8283679485321045,
"learning_rate": 7.336683417085427e-05,
"loss": 1.2222,
"step": 54
},
{
"epoch": 0.19349164467897978,
"grad_norm": 0.6316900253295898,
"learning_rate": 7.28643216080402e-05,
"loss": 1.2023,
"step": 55
},
{
"epoch": 0.19700967458223395,
"grad_norm": 0.6092143058776855,
"learning_rate": 7.236180904522614e-05,
"loss": 1.0762,
"step": 56
},
{
"epoch": 0.20052770448548812,
"grad_norm": 0.5600019097328186,
"learning_rate": 7.185929648241206e-05,
"loss": 1.0127,
"step": 57
},
{
"epoch": 0.2040457343887423,
"grad_norm": 0.6157863736152649,
"learning_rate": 7.135678391959799e-05,
"loss": 1.1016,
"step": 58
},
{
"epoch": 0.2075637642919965,
"grad_norm": 0.6391822099685669,
"learning_rate": 7.085427135678392e-05,
"loss": 1.2009,
"step": 59
},
{
"epoch": 0.21108179419525067,
"grad_norm": 0.5637600421905518,
"learning_rate": 7.035175879396985e-05,
"loss": 1.1419,
"step": 60
},
{
"epoch": 0.21459982409850484,
"grad_norm": 0.6826542019844055,
"learning_rate": 6.984924623115579e-05,
"loss": 1.1084,
"step": 61
},
{
"epoch": 0.218117854001759,
"grad_norm": 0.6475107073783875,
"learning_rate": 6.93467336683417e-05,
"loss": 1.2033,
"step": 62
},
{
"epoch": 0.22163588390501318,
"grad_norm": 0.5701493620872498,
"learning_rate": 6.884422110552764e-05,
"loss": 1.1425,
"step": 63
},
{
"epoch": 0.22515391380826738,
"grad_norm": 0.5416231155395508,
"learning_rate": 6.834170854271357e-05,
"loss": 1.0869,
"step": 64
},
{
"epoch": 0.22867194371152155,
"grad_norm": 0.611254870891571,
"learning_rate": 6.78391959798995e-05,
"loss": 1.1344,
"step": 65
},
{
"epoch": 0.23218997361477572,
"grad_norm": 0.5644116401672363,
"learning_rate": 6.733668341708544e-05,
"loss": 1.0655,
"step": 66
},
{
"epoch": 0.2357080035180299,
"grad_norm": 0.5953249931335449,
"learning_rate": 6.683417085427135e-05,
"loss": 1.1267,
"step": 67
},
{
"epoch": 0.23922603342128407,
"grad_norm": 0.5902895331382751,
"learning_rate": 6.633165829145729e-05,
"loss": 1.1207,
"step": 68
},
{
"epoch": 0.24274406332453827,
"grad_norm": 0.571882426738739,
"learning_rate": 6.582914572864322e-05,
"loss": 1.0945,
"step": 69
},
{
"epoch": 0.24626209322779244,
"grad_norm": 0.6372458934783936,
"learning_rate": 6.532663316582915e-05,
"loss": 1.1933,
"step": 70
},
{
"epoch": 0.2497801231310466,
"grad_norm": 0.6739147901535034,
"learning_rate": 6.482412060301508e-05,
"loss": 1.1202,
"step": 71
},
{
"epoch": 0.2532981530343008,
"grad_norm": 0.6515147686004639,
"learning_rate": 6.4321608040201e-05,
"loss": 1.1685,
"step": 72
},
{
"epoch": 0.256816182937555,
"grad_norm": 0.5706716775894165,
"learning_rate": 6.381909547738694e-05,
"loss": 1.1084,
"step": 73
},
{
"epoch": 0.26033421284080915,
"grad_norm": 0.595585286617279,
"learning_rate": 6.331658291457287e-05,
"loss": 1.1218,
"step": 74
},
{
"epoch": 0.2638522427440633,
"grad_norm": 0.6020475625991821,
"learning_rate": 6.28140703517588e-05,
"loss": 1.1282,
"step": 75
},
{
"epoch": 0.2673702726473175,
"grad_norm": 0.628376305103302,
"learning_rate": 6.231155778894473e-05,
"loss": 1.1067,
"step": 76
},
{
"epoch": 0.27088830255057167,
"grad_norm": 0.6371076107025146,
"learning_rate": 6.180904522613065e-05,
"loss": 1.1466,
"step": 77
},
{
"epoch": 0.27440633245382584,
"grad_norm": 0.6206318140029907,
"learning_rate": 6.130653266331658e-05,
"loss": 1.0801,
"step": 78
},
{
"epoch": 0.27792436235708,
"grad_norm": 0.6293841600418091,
"learning_rate": 6.080402010050251e-05,
"loss": 1.1644,
"step": 79
},
{
"epoch": 0.28144239226033424,
"grad_norm": 0.6434080600738525,
"learning_rate": 6.030150753768844e-05,
"loss": 1.0589,
"step": 80
},
{
"epoch": 0.2849604221635884,
"grad_norm": 0.5857638120651245,
"learning_rate": 5.979899497487438e-05,
"loss": 1.1711,
"step": 81
},
{
"epoch": 0.2884784520668426,
"grad_norm": 0.6163449883460999,
"learning_rate": 5.929648241206031e-05,
"loss": 1.1627,
"step": 82
},
{
"epoch": 0.29199648197009676,
"grad_norm": 0.6543634533882141,
"learning_rate": 5.879396984924623e-05,
"loss": 1.0909,
"step": 83
},
{
"epoch": 0.2955145118733509,
"grad_norm": 0.6609559059143066,
"learning_rate": 5.829145728643216e-05,
"loss": 1.1505,
"step": 84
},
{
"epoch": 0.2990325417766051,
"grad_norm": 0.5798302292823792,
"learning_rate": 5.778894472361809e-05,
"loss": 1.0834,
"step": 85
},
{
"epoch": 0.30255057167985927,
"grad_norm": 0.6974066495895386,
"learning_rate": 5.728643216080403e-05,
"loss": 1.0965,
"step": 86
},
{
"epoch": 0.30606860158311344,
"grad_norm": 0.67149817943573,
"learning_rate": 5.6783919597989955e-05,
"loss": 1.09,
"step": 87
},
{
"epoch": 0.3095866314863676,
"grad_norm": 0.5761735439300537,
"learning_rate": 5.628140703517588e-05,
"loss": 1.1436,
"step": 88
},
{
"epoch": 0.3131046613896218,
"grad_norm": 0.6142584681510925,
"learning_rate": 5.577889447236181e-05,
"loss": 1.0489,
"step": 89
},
{
"epoch": 0.316622691292876,
"grad_norm": 0.6407614946365356,
"learning_rate": 5.527638190954774e-05,
"loss": 1.1449,
"step": 90
},
{
"epoch": 0.3201407211961302,
"grad_norm": 0.6835021376609802,
"learning_rate": 5.477386934673368e-05,
"loss": 1.1332,
"step": 91
},
{
"epoch": 0.32365875109938436,
"grad_norm": 0.5755856037139893,
"learning_rate": 5.4271356783919604e-05,
"loss": 1.1195,
"step": 92
},
{
"epoch": 0.32717678100263853,
"grad_norm": 0.6232398748397827,
"learning_rate": 5.376884422110553e-05,
"loss": 1.1696,
"step": 93
},
{
"epoch": 0.3306948109058927,
"grad_norm": 0.6193405389785767,
"learning_rate": 5.3266331658291455e-05,
"loss": 1.1106,
"step": 94
},
{
"epoch": 0.33421284080914687,
"grad_norm": 0.6834057569503784,
"learning_rate": 5.276381909547739e-05,
"loss": 1.1349,
"step": 95
},
{
"epoch": 0.33773087071240104,
"grad_norm": 0.7168384790420532,
"learning_rate": 5.226130653266332e-05,
"loss": 1.2054,
"step": 96
},
{
"epoch": 0.3412489006156552,
"grad_norm": 0.6553971767425537,
"learning_rate": 5.175879396984925e-05,
"loss": 1.0975,
"step": 97
},
{
"epoch": 0.3447669305189094,
"grad_norm": 0.6329600811004639,
"learning_rate": 5.125628140703518e-05,
"loss": 1.1212,
"step": 98
},
{
"epoch": 0.3482849604221636,
"grad_norm": 0.6656339764595032,
"learning_rate": 5.0753768844221104e-05,
"loss": 1.1451,
"step": 99
},
{
"epoch": 0.3518029903254178,
"grad_norm": 0.6817747950553894,
"learning_rate": 5.0251256281407036e-05,
"loss": 1.084,
"step": 100
},
{
"epoch": 0.35532102022867196,
"grad_norm": 0.6384849548339844,
"learning_rate": 4.974874371859297e-05,
"loss": 1.047,
"step": 101
},
{
"epoch": 0.35883905013192613,
"grad_norm": 0.6342082023620605,
"learning_rate": 4.92462311557789e-05,
"loss": 1.1122,
"step": 102
},
{
"epoch": 0.3623570800351803,
"grad_norm": 0.6114000082015991,
"learning_rate": 4.874371859296483e-05,
"loss": 1.1094,
"step": 103
},
{
"epoch": 0.3658751099384345,
"grad_norm": 0.6310352683067322,
"learning_rate": 4.824120603015075e-05,
"loss": 1.1508,
"step": 104
},
{
"epoch": 0.36939313984168864,
"grad_norm": 0.6773234605789185,
"learning_rate": 4.7738693467336685e-05,
"loss": 1.0511,
"step": 105
},
{
"epoch": 0.3729111697449428,
"grad_norm": 0.6625077724456787,
"learning_rate": 4.723618090452262e-05,
"loss": 1.1422,
"step": 106
},
{
"epoch": 0.376429199648197,
"grad_norm": 0.6125949025154114,
"learning_rate": 4.673366834170855e-05,
"loss": 1.1189,
"step": 107
},
{
"epoch": 0.37994722955145116,
"grad_norm": 0.684280514717102,
"learning_rate": 4.6231155778894475e-05,
"loss": 1.2249,
"step": 108
},
{
"epoch": 0.3834652594547054,
"grad_norm": 0.8305927515029907,
"learning_rate": 4.57286432160804e-05,
"loss": 1.1758,
"step": 109
},
{
"epoch": 0.38698328935795956,
"grad_norm": 0.6081312894821167,
"learning_rate": 4.522613065326633e-05,
"loss": 1.0853,
"step": 110
},
{
"epoch": 0.39050131926121373,
"grad_norm": 0.716929018497467,
"learning_rate": 4.4723618090452266e-05,
"loss": 1.1903,
"step": 111
},
{
"epoch": 0.3940193491644679,
"grad_norm": 0.5968315005302429,
"learning_rate": 4.42211055276382e-05,
"loss": 1.0717,
"step": 112
},
{
"epoch": 0.3975373790677221,
"grad_norm": 0.6502510905265808,
"learning_rate": 4.3718592964824124e-05,
"loss": 1.0629,
"step": 113
},
{
"epoch": 0.40105540897097625,
"grad_norm": 0.6408775448799133,
"learning_rate": 4.321608040201005e-05,
"loss": 1.0937,
"step": 114
},
{
"epoch": 0.4045734388742304,
"grad_norm": 0.6137213110923767,
"learning_rate": 4.271356783919598e-05,
"loss": 1.0853,
"step": 115
},
{
"epoch": 0.4080914687774846,
"grad_norm": 0.6401947736740112,
"learning_rate": 4.2211055276381914e-05,
"loss": 1.1542,
"step": 116
},
{
"epoch": 0.41160949868073876,
"grad_norm": 0.6332412362098694,
"learning_rate": 4.170854271356784e-05,
"loss": 1.0731,
"step": 117
},
{
"epoch": 0.415127528583993,
"grad_norm": 0.6274076700210571,
"learning_rate": 4.120603015075377e-05,
"loss": 1.0707,
"step": 118
},
{
"epoch": 0.41864555848724716,
"grad_norm": 0.632633626461029,
"learning_rate": 4.07035175879397e-05,
"loss": 1.108,
"step": 119
},
{
"epoch": 0.42216358839050133,
"grad_norm": 0.6979479193687439,
"learning_rate": 4.020100502512563e-05,
"loss": 1.1483,
"step": 120
},
{
"epoch": 0.4256816182937555,
"grad_norm": 0.7355033755302429,
"learning_rate": 3.969849246231156e-05,
"loss": 1.1358,
"step": 121
},
{
"epoch": 0.4291996481970097,
"grad_norm": 0.6254828572273254,
"learning_rate": 3.919597989949749e-05,
"loss": 1.1753,
"step": 122
},
{
"epoch": 0.43271767810026385,
"grad_norm": 0.6851824522018433,
"learning_rate": 3.869346733668342e-05,
"loss": 1.0128,
"step": 123
},
{
"epoch": 0.436235708003518,
"grad_norm": 0.6097928285598755,
"learning_rate": 3.8190954773869346e-05,
"loss": 1.1235,
"step": 124
},
{
"epoch": 0.4397537379067722,
"grad_norm": 0.6748325824737549,
"learning_rate": 3.768844221105528e-05,
"loss": 1.0452,
"step": 125
},
{
"epoch": 0.44327176781002636,
"grad_norm": 0.6666128039360046,
"learning_rate": 3.7185929648241204e-05,
"loss": 1.1075,
"step": 126
},
{
"epoch": 0.4467897977132806,
"grad_norm": 0.7474984526634216,
"learning_rate": 3.668341708542714e-05,
"loss": 1.0695,
"step": 127
},
{
"epoch": 0.45030782761653476,
"grad_norm": 0.6925339698791504,
"learning_rate": 3.618090452261307e-05,
"loss": 1.1024,
"step": 128
},
{
"epoch": 0.45382585751978893,
"grad_norm": 0.6140123009681702,
"learning_rate": 3.5678391959798995e-05,
"loss": 1.0788,
"step": 129
},
{
"epoch": 0.4573438874230431,
"grad_norm": 0.6771907806396484,
"learning_rate": 3.517587939698493e-05,
"loss": 1.0913,
"step": 130
},
{
"epoch": 0.4608619173262973,
"grad_norm": 0.6700430512428284,
"learning_rate": 3.467336683417085e-05,
"loss": 1.0566,
"step": 131
},
{
"epoch": 0.46437994722955145,
"grad_norm": 0.6931480169296265,
"learning_rate": 3.4170854271356785e-05,
"loss": 1.059,
"step": 132
},
{
"epoch": 0.4678979771328056,
"grad_norm": 0.6608771085739136,
"learning_rate": 3.366834170854272e-05,
"loss": 1.119,
"step": 133
},
{
"epoch": 0.4714160070360598,
"grad_norm": 0.6470663547515869,
"learning_rate": 3.3165829145728643e-05,
"loss": 1.0662,
"step": 134
},
{
"epoch": 0.47493403693931396,
"grad_norm": 0.5729122757911682,
"learning_rate": 3.2663316582914576e-05,
"loss": 0.9999,
"step": 135
},
{
"epoch": 0.47845206684256814,
"grad_norm": 0.6993862390518188,
"learning_rate": 3.21608040201005e-05,
"loss": 1.1819,
"step": 136
},
{
"epoch": 0.48197009674582236,
"grad_norm": 0.6929494738578796,
"learning_rate": 3.1658291457286434e-05,
"loss": 1.1719,
"step": 137
},
{
"epoch": 0.48548812664907653,
"grad_norm": 0.6951282620429993,
"learning_rate": 3.1155778894472366e-05,
"loss": 1.0716,
"step": 138
},
{
"epoch": 0.4890061565523307,
"grad_norm": 0.6766693592071533,
"learning_rate": 3.065326633165829e-05,
"loss": 1.1589,
"step": 139
},
{
"epoch": 0.4925241864555849,
"grad_norm": 0.6500269174575806,
"learning_rate": 3.015075376884422e-05,
"loss": 1.1122,
"step": 140
},
{
"epoch": 0.49604221635883905,
"grad_norm": 0.7741857171058655,
"learning_rate": 2.9648241206030153e-05,
"loss": 1.1594,
"step": 141
},
{
"epoch": 0.4995602462620932,
"grad_norm": 0.6630749106407166,
"learning_rate": 2.914572864321608e-05,
"loss": 1.0615,
"step": 142
},
{
"epoch": 0.5030782761653474,
"grad_norm": 0.7230671048164368,
"learning_rate": 2.8643216080402015e-05,
"loss": 1.1521,
"step": 143
},
{
"epoch": 0.5065963060686016,
"grad_norm": 0.6624138355255127,
"learning_rate": 2.814070351758794e-05,
"loss": 1.0347,
"step": 144
},
{
"epoch": 0.5101143359718557,
"grad_norm": 0.6560067534446716,
"learning_rate": 2.763819095477387e-05,
"loss": 1.1214,
"step": 145
},
{
"epoch": 0.51363236587511,
"grad_norm": 0.6742956638336182,
"learning_rate": 2.7135678391959802e-05,
"loss": 1.0956,
"step": 146
},
{
"epoch": 0.5171503957783641,
"grad_norm": 0.706284761428833,
"learning_rate": 2.6633165829145728e-05,
"loss": 1.1058,
"step": 147
},
{
"epoch": 0.5206684256816183,
"grad_norm": 0.6924006938934326,
"learning_rate": 2.613065326633166e-05,
"loss": 1.186,
"step": 148
},
{
"epoch": 0.5241864555848724,
"grad_norm": 0.6287305951118469,
"learning_rate": 2.562814070351759e-05,
"loss": 1.0422,
"step": 149
},
{
"epoch": 0.5277044854881267,
"grad_norm": 0.6957104206085205,
"learning_rate": 2.5125628140703518e-05,
"loss": 1.0896,
"step": 150
},
{
"epoch": 0.5312225153913809,
"grad_norm": 0.7039506435394287,
"learning_rate": 2.462311557788945e-05,
"loss": 1.0818,
"step": 151
},
{
"epoch": 0.534740545294635,
"grad_norm": 0.6502148509025574,
"learning_rate": 2.4120603015075376e-05,
"loss": 1.112,
"step": 152
},
{
"epoch": 0.5382585751978892,
"grad_norm": 0.6823992133140564,
"learning_rate": 2.361809045226131e-05,
"loss": 1.0298,
"step": 153
},
{
"epoch": 0.5417766051011433,
"grad_norm": 0.7539629936218262,
"learning_rate": 2.3115577889447238e-05,
"loss": 1.0618,
"step": 154
},
{
"epoch": 0.5452946350043976,
"grad_norm": 0.6974697113037109,
"learning_rate": 2.2613065326633167e-05,
"loss": 1.1702,
"step": 155
},
{
"epoch": 0.5488126649076517,
"grad_norm": 0.7035180330276489,
"learning_rate": 2.21105527638191e-05,
"loss": 1.0714,
"step": 156
},
{
"epoch": 0.5523306948109059,
"grad_norm": 0.9007865786552429,
"learning_rate": 2.1608040201005025e-05,
"loss": 1.0565,
"step": 157
},
{
"epoch": 0.55584872471416,
"grad_norm": 0.7083996534347534,
"learning_rate": 2.1105527638190957e-05,
"loss": 1.1425,
"step": 158
},
{
"epoch": 0.5593667546174143,
"grad_norm": 0.7241733074188232,
"learning_rate": 2.0603015075376886e-05,
"loss": 1.1211,
"step": 159
},
{
"epoch": 0.5628847845206685,
"grad_norm": 0.7474963068962097,
"learning_rate": 2.0100502512562815e-05,
"loss": 1.0546,
"step": 160
},
{
"epoch": 0.5664028144239226,
"grad_norm": 0.7051181793212891,
"learning_rate": 1.9597989949748744e-05,
"loss": 0.9878,
"step": 161
},
{
"epoch": 0.5699208443271768,
"grad_norm": 0.7359694242477417,
"learning_rate": 1.9095477386934673e-05,
"loss": 1.1283,
"step": 162
},
{
"epoch": 0.5734388742304309,
"grad_norm": 0.6908060908317566,
"learning_rate": 1.8592964824120602e-05,
"loss": 1.1287,
"step": 163
},
{
"epoch": 0.5769569041336852,
"grad_norm": 0.7220682501792908,
"learning_rate": 1.8090452261306535e-05,
"loss": 1.0424,
"step": 164
},
{
"epoch": 0.5804749340369393,
"grad_norm": 0.7415404319763184,
"learning_rate": 1.7587939698492464e-05,
"loss": 1.0749,
"step": 165
},
{
"epoch": 0.5839929639401935,
"grad_norm": 0.7168678641319275,
"learning_rate": 1.7085427135678393e-05,
"loss": 1.1308,
"step": 166
},
{
"epoch": 0.5875109938434476,
"grad_norm": 0.653301477432251,
"learning_rate": 1.6582914572864322e-05,
"loss": 1.0777,
"step": 167
},
{
"epoch": 0.5910290237467019,
"grad_norm": 0.7567819952964783,
"learning_rate": 1.608040201005025e-05,
"loss": 1.1476,
"step": 168
},
{
"epoch": 0.594547053649956,
"grad_norm": 0.7353144288063049,
"learning_rate": 1.5577889447236183e-05,
"loss": 1.0961,
"step": 169
},
{
"epoch": 0.5980650835532102,
"grad_norm": 0.6990388035774231,
"learning_rate": 1.507537688442211e-05,
"loss": 1.1619,
"step": 170
},
{
"epoch": 0.6015831134564644,
"grad_norm": 0.7032533288002014,
"learning_rate": 1.457286432160804e-05,
"loss": 1.0619,
"step": 171
},
{
"epoch": 0.6051011433597185,
"grad_norm": 0.6197975873947144,
"learning_rate": 1.407035175879397e-05,
"loss": 1.0953,
"step": 172
},
{
"epoch": 0.6086191732629728,
"grad_norm": 0.746258556842804,
"learning_rate": 1.3567839195979901e-05,
"loss": 1.1201,
"step": 173
},
{
"epoch": 0.6121372031662269,
"grad_norm": 0.6444905996322632,
"learning_rate": 1.306532663316583e-05,
"loss": 1.0241,
"step": 174
},
{
"epoch": 0.6156552330694811,
"grad_norm": 0.7037890553474426,
"learning_rate": 1.2562814070351759e-05,
"loss": 1.0739,
"step": 175
},
{
"epoch": 0.6191732629727352,
"grad_norm": 0.7138697504997253,
"learning_rate": 1.2060301507537688e-05,
"loss": 1.1102,
"step": 176
},
{
"epoch": 0.6226912928759895,
"grad_norm": 0.7358911037445068,
"learning_rate": 1.1557788944723619e-05,
"loss": 1.1945,
"step": 177
},
{
"epoch": 0.6262093227792436,
"grad_norm": 0.7306352853775024,
"learning_rate": 1.105527638190955e-05,
"loss": 1.0887,
"step": 178
},
{
"epoch": 0.6297273526824978,
"grad_norm": 0.7626399993896484,
"learning_rate": 1.0552763819095479e-05,
"loss": 1.0918,
"step": 179
},
{
"epoch": 0.633245382585752,
"grad_norm": 0.7157562375068665,
"learning_rate": 1.0050251256281408e-05,
"loss": 1.0794,
"step": 180
},
{
"epoch": 0.6367634124890061,
"grad_norm": 0.674655556678772,
"learning_rate": 9.547738693467337e-06,
"loss": 1.1632,
"step": 181
},
{
"epoch": 0.6402814423922604,
"grad_norm": 0.7276845574378967,
"learning_rate": 9.045226130653267e-06,
"loss": 1.0664,
"step": 182
},
{
"epoch": 0.6437994722955145,
"grad_norm": 0.7614260315895081,
"learning_rate": 8.542713567839196e-06,
"loss": 1.1185,
"step": 183
},
{
"epoch": 0.6473175021987687,
"grad_norm": 0.691209614276886,
"learning_rate": 8.040201005025125e-06,
"loss": 1.0648,
"step": 184
},
{
"epoch": 0.6508355321020228,
"grad_norm": 0.6736161708831787,
"learning_rate": 7.537688442211055e-06,
"loss": 1.11,
"step": 185
},
{
"epoch": 0.6543535620052771,
"grad_norm": 0.6875973343849182,
"learning_rate": 7.035175879396985e-06,
"loss": 1.1085,
"step": 186
},
{
"epoch": 0.6578715919085312,
"grad_norm": 0.6715053915977478,
"learning_rate": 6.532663316582915e-06,
"loss": 1.1391,
"step": 187
},
{
"epoch": 0.6613896218117854,
"grad_norm": 0.7241913080215454,
"learning_rate": 6.030150753768844e-06,
"loss": 1.193,
"step": 188
},
{
"epoch": 0.6649076517150396,
"grad_norm": 0.722939133644104,
"learning_rate": 5.527638190954775e-06,
"loss": 1.1218,
"step": 189
},
{
"epoch": 0.6684256816182937,
"grad_norm": 0.7348630428314209,
"learning_rate": 5.025125628140704e-06,
"loss": 1.0771,
"step": 190
},
{
"epoch": 0.671943711521548,
"grad_norm": 0.72852623462677,
"learning_rate": 4.522613065326634e-06,
"loss": 1.1196,
"step": 191
},
{
"epoch": 0.6754617414248021,
"grad_norm": 0.7617117762565613,
"learning_rate": 4.020100502512563e-06,
"loss": 1.1313,
"step": 192
},
{
"epoch": 0.6789797713280563,
"grad_norm": 0.8029654622077942,
"learning_rate": 3.5175879396984926e-06,
"loss": 1.1405,
"step": 193
},
{
"epoch": 0.6824978012313104,
"grad_norm": 0.6885625123977661,
"learning_rate": 3.015075376884422e-06,
"loss": 1.0565,
"step": 194
},
{
"epoch": 0.6860158311345647,
"grad_norm": 0.7057883143424988,
"learning_rate": 2.512562814070352e-06,
"loss": 1.1625,
"step": 195
},
{
"epoch": 0.6895338610378188,
"grad_norm": 0.7429342269897461,
"learning_rate": 2.0100502512562813e-06,
"loss": 1.044,
"step": 196
},
{
"epoch": 0.693051890941073,
"grad_norm": 0.7036694884300232,
"learning_rate": 1.507537688442211e-06,
"loss": 1.0991,
"step": 197
},
{
"epoch": 0.6965699208443272,
"grad_norm": 0.6950182318687439,
"learning_rate": 1.0050251256281407e-06,
"loss": 1.1014,
"step": 198
},
{
"epoch": 0.7000879507475813,
"grad_norm": 0.7009806632995605,
"learning_rate": 5.025125628140703e-07,
"loss": 1.1108,
"step": 199
},
{
"epoch": 0.7036059806508356,
"grad_norm": 0.6382765769958496,
"learning_rate": 0.0,
"loss": 1.0479,
"step": 200
}
],
"logging_steps": 1,
"max_steps": 200,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.5191482454605824e+16,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}