richardprobe's picture
Upload PEFT LoRA adapter
d5fb74c verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.405572755417957,
"eval_steps": 500,
"global_step": 1100,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.030959752321981424,
"grad_norm": 9.773098945617676,
"learning_rate": 2.991640866873065e-05,
"loss": 9.245,
"mean_token_accuracy": 0.2123243510723114,
"num_tokens": 5327.0,
"step": 10
},
{
"epoch": 0.06191950464396285,
"grad_norm": 3.128091335296631,
"learning_rate": 2.9823529411764707e-05,
"loss": 6.1602,
"mean_token_accuracy": 0.25044268518686297,
"num_tokens": 10795.0,
"step": 20
},
{
"epoch": 0.09287925696594428,
"grad_norm": 4.203906536102295,
"learning_rate": 2.973065015479876e-05,
"loss": 5.6948,
"mean_token_accuracy": 0.262071692943573,
"num_tokens": 16240.0,
"step": 30
},
{
"epoch": 0.1238390092879257,
"grad_norm": 4.233511924743652,
"learning_rate": 2.9637770897832817e-05,
"loss": 5.2733,
"mean_token_accuracy": 0.27527774721384046,
"num_tokens": 21582.0,
"step": 40
},
{
"epoch": 0.15479876160990713,
"grad_norm": 7.080459117889404,
"learning_rate": 2.9544891640866874e-05,
"loss": 4.9304,
"mean_token_accuracy": 0.2877007365226746,
"num_tokens": 27142.0,
"step": 50
},
{
"epoch": 0.18575851393188855,
"grad_norm": 7.273204326629639,
"learning_rate": 2.945201238390093e-05,
"loss": 4.689,
"mean_token_accuracy": 0.28890604972839357,
"num_tokens": 32801.0,
"step": 60
},
{
"epoch": 0.21671826625386997,
"grad_norm": 2.2185206413269043,
"learning_rate": 2.9359133126934984e-05,
"loss": 4.3965,
"mean_token_accuracy": 0.28117197155952456,
"num_tokens": 38472.0,
"step": 70
},
{
"epoch": 0.2476780185758514,
"grad_norm": 2.0464794635772705,
"learning_rate": 2.926625386996904e-05,
"loss": 4.062,
"mean_token_accuracy": 0.299494668841362,
"num_tokens": 43743.0,
"step": 80
},
{
"epoch": 0.2786377708978328,
"grad_norm": 1.633155345916748,
"learning_rate": 2.9173374613003097e-05,
"loss": 4.0378,
"mean_token_accuracy": 0.30819864571094513,
"num_tokens": 49087.0,
"step": 90
},
{
"epoch": 0.30959752321981426,
"grad_norm": 1.4594247341156006,
"learning_rate": 2.908049535603715e-05,
"loss": 3.8513,
"mean_token_accuracy": 0.3258361428976059,
"num_tokens": 54433.0,
"step": 100
},
{
"epoch": 0.34055727554179566,
"grad_norm": 1.5312635898590088,
"learning_rate": 2.898761609907121e-05,
"loss": 3.9162,
"mean_token_accuracy": 0.32140363454818727,
"num_tokens": 59629.0,
"step": 110
},
{
"epoch": 0.3715170278637771,
"grad_norm": 1.3190491199493408,
"learning_rate": 2.8894736842105263e-05,
"loss": 3.902,
"mean_token_accuracy": 0.3103078156709671,
"num_tokens": 65326.0,
"step": 120
},
{
"epoch": 0.4024767801857585,
"grad_norm": 1.6095689535140991,
"learning_rate": 2.880185758513932e-05,
"loss": 3.7107,
"mean_token_accuracy": 0.3353793561458588,
"num_tokens": 70440.0,
"step": 130
},
{
"epoch": 0.43343653250773995,
"grad_norm": 1.6634972095489502,
"learning_rate": 2.8708978328173377e-05,
"loss": 3.7747,
"mean_token_accuracy": 0.3298566401004791,
"num_tokens": 75712.0,
"step": 140
},
{
"epoch": 0.46439628482972134,
"grad_norm": 1.3906605243682861,
"learning_rate": 2.861609907120743e-05,
"loss": 3.7344,
"mean_token_accuracy": 0.34043932259082793,
"num_tokens": 81272.0,
"step": 150
},
{
"epoch": 0.4953560371517028,
"grad_norm": 1.6273926496505737,
"learning_rate": 2.8523219814241487e-05,
"loss": 3.6722,
"mean_token_accuracy": 0.33802524507045745,
"num_tokens": 86836.0,
"step": 160
},
{
"epoch": 0.5263157894736842,
"grad_norm": 1.595566987991333,
"learning_rate": 2.8430340557275543e-05,
"loss": 3.5486,
"mean_token_accuracy": 0.36929037272930143,
"num_tokens": 91622.0,
"step": 170
},
{
"epoch": 0.5572755417956656,
"grad_norm": 1.9571454524993896,
"learning_rate": 2.83374613003096e-05,
"loss": 3.6849,
"mean_token_accuracy": 0.3387055486440659,
"num_tokens": 97019.0,
"step": 180
},
{
"epoch": 0.5882352941176471,
"grad_norm": 1.6203333139419556,
"learning_rate": 2.8244582043343653e-05,
"loss": 3.5592,
"mean_token_accuracy": 0.36260710954666137,
"num_tokens": 102273.0,
"step": 190
},
{
"epoch": 0.6191950464396285,
"grad_norm": 1.8625439405441284,
"learning_rate": 2.815170278637771e-05,
"loss": 3.4542,
"mean_token_accuracy": 0.3554231733083725,
"num_tokens": 107847.0,
"step": 200
},
{
"epoch": 0.6501547987616099,
"grad_norm": 1.5171610116958618,
"learning_rate": 2.8058823529411766e-05,
"loss": 3.6914,
"mean_token_accuracy": 0.3506886214017868,
"num_tokens": 113499.0,
"step": 210
},
{
"epoch": 0.6811145510835913,
"grad_norm": 1.465408205986023,
"learning_rate": 2.796594427244582e-05,
"loss": 3.6008,
"mean_token_accuracy": 0.3558589071035385,
"num_tokens": 119014.0,
"step": 220
},
{
"epoch": 0.7120743034055728,
"grad_norm": 1.5382874011993408,
"learning_rate": 2.787306501547988e-05,
"loss": 3.5375,
"mean_token_accuracy": 0.3548148155212402,
"num_tokens": 124170.0,
"step": 230
},
{
"epoch": 0.7430340557275542,
"grad_norm": 1.773881196975708,
"learning_rate": 2.7780185758513933e-05,
"loss": 3.573,
"mean_token_accuracy": 0.3465736091136932,
"num_tokens": 129487.0,
"step": 240
},
{
"epoch": 0.7739938080495357,
"grad_norm": 1.7652744054794312,
"learning_rate": 2.7687306501547986e-05,
"loss": 3.6811,
"mean_token_accuracy": 0.33623204231262205,
"num_tokens": 135007.0,
"step": 250
},
{
"epoch": 0.804953560371517,
"grad_norm": 1.7662419080734253,
"learning_rate": 2.7594427244582046e-05,
"loss": 3.505,
"mean_token_accuracy": 0.3567329585552216,
"num_tokens": 140143.0,
"step": 260
},
{
"epoch": 0.8359133126934984,
"grad_norm": 1.9441474676132202,
"learning_rate": 2.75015479876161e-05,
"loss": 3.4804,
"mean_token_accuracy": 0.36218210160732267,
"num_tokens": 145363.0,
"step": 270
},
{
"epoch": 0.8668730650154799,
"grad_norm": 1.745896816253662,
"learning_rate": 2.7408668730650156e-05,
"loss": 3.6519,
"mean_token_accuracy": 0.34941086173057556,
"num_tokens": 150840.0,
"step": 280
},
{
"epoch": 0.8978328173374613,
"grad_norm": 1.928284764289856,
"learning_rate": 2.7315789473684213e-05,
"loss": 3.6138,
"mean_token_accuracy": 0.34826839864254,
"num_tokens": 156077.0,
"step": 290
},
{
"epoch": 0.9287925696594427,
"grad_norm": 2.177100896835327,
"learning_rate": 2.722291021671827e-05,
"loss": 3.4666,
"mean_token_accuracy": 0.36537405848503113,
"num_tokens": 160953.0,
"step": 300
},
{
"epoch": 0.9597523219814241,
"grad_norm": 2.203282594680786,
"learning_rate": 2.7130030959752322e-05,
"loss": 3.5842,
"mean_token_accuracy": 0.34776660799980164,
"num_tokens": 166286.0,
"step": 310
},
{
"epoch": 0.9907120743034056,
"grad_norm": 1.724373459815979,
"learning_rate": 2.7037151702786376e-05,
"loss": 3.4961,
"mean_token_accuracy": 0.35577190220355986,
"num_tokens": 171629.0,
"step": 320
},
{
"epoch": 1.021671826625387,
"grad_norm": 1.7433867454528809,
"learning_rate": 2.6944272445820436e-05,
"loss": 3.4088,
"mean_token_accuracy": 0.36975419521331787,
"num_tokens": 176968.0,
"step": 330
},
{
"epoch": 1.0526315789473684,
"grad_norm": 2.0577471256256104,
"learning_rate": 2.685139318885449e-05,
"loss": 3.5364,
"mean_token_accuracy": 0.35527182221412656,
"num_tokens": 182390.0,
"step": 340
},
{
"epoch": 1.08359133126935,
"grad_norm": 1.7357635498046875,
"learning_rate": 2.6758513931888546e-05,
"loss": 3.4495,
"mean_token_accuracy": 0.3565235286951065,
"num_tokens": 188181.0,
"step": 350
},
{
"epoch": 1.1145510835913313,
"grad_norm": 2.024507761001587,
"learning_rate": 2.6665634674922602e-05,
"loss": 3.3766,
"mean_token_accuracy": 0.3696540713310242,
"num_tokens": 193427.0,
"step": 360
},
{
"epoch": 1.1455108359133126,
"grad_norm": 2.1170592308044434,
"learning_rate": 2.6572755417956655e-05,
"loss": 3.4759,
"mean_token_accuracy": 0.35803447663784027,
"num_tokens": 199046.0,
"step": 370
},
{
"epoch": 1.1764705882352942,
"grad_norm": 2.118878126144409,
"learning_rate": 2.6479876160990712e-05,
"loss": 3.4685,
"mean_token_accuracy": 0.3570233076810837,
"num_tokens": 204089.0,
"step": 380
},
{
"epoch": 1.2074303405572755,
"grad_norm": 2.2280914783477783,
"learning_rate": 2.638699690402477e-05,
"loss": 3.4822,
"mean_token_accuracy": 0.36153341829776764,
"num_tokens": 209656.0,
"step": 390
},
{
"epoch": 1.238390092879257,
"grad_norm": 2.444979667663574,
"learning_rate": 2.6294117647058825e-05,
"loss": 3.3666,
"mean_token_accuracy": 0.3748770415782928,
"num_tokens": 214465.0,
"step": 400
},
{
"epoch": 1.2693498452012384,
"grad_norm": 1.9609659910202026,
"learning_rate": 2.620123839009288e-05,
"loss": 3.4161,
"mean_token_accuracy": 0.3594685852527618,
"num_tokens": 219728.0,
"step": 410
},
{
"epoch": 1.3003095975232197,
"grad_norm": 1.9759095907211304,
"learning_rate": 2.6108359133126935e-05,
"loss": 3.405,
"mean_token_accuracy": 0.3668099522590637,
"num_tokens": 224992.0,
"step": 420
},
{
"epoch": 1.3312693498452013,
"grad_norm": 2.1737940311431885,
"learning_rate": 2.6015479876160992e-05,
"loss": 3.3809,
"mean_token_accuracy": 0.37462269365787504,
"num_tokens": 230431.0,
"step": 430
},
{
"epoch": 1.3622291021671826,
"grad_norm": 2.475351333618164,
"learning_rate": 2.5922600619195045e-05,
"loss": 3.3454,
"mean_token_accuracy": 0.36902076900005343,
"num_tokens": 235868.0,
"step": 440
},
{
"epoch": 1.3931888544891642,
"grad_norm": 2.1027772426605225,
"learning_rate": 2.5829721362229105e-05,
"loss": 3.3967,
"mean_token_accuracy": 0.3779816538095474,
"num_tokens": 241149.0,
"step": 450
},
{
"epoch": 1.4241486068111455,
"grad_norm": 2.613186836242676,
"learning_rate": 2.5736842105263158e-05,
"loss": 3.3383,
"mean_token_accuracy": 0.37455591559410095,
"num_tokens": 246200.0,
"step": 460
},
{
"epoch": 1.4551083591331269,
"grad_norm": 2.1689629554748535,
"learning_rate": 2.5643962848297215e-05,
"loss": 3.4927,
"mean_token_accuracy": 0.3641968876123428,
"num_tokens": 251354.0,
"step": 470
},
{
"epoch": 1.4860681114551084,
"grad_norm": 1.9075849056243896,
"learning_rate": 2.555108359133127e-05,
"loss": 3.4211,
"mean_token_accuracy": 0.3670921057462692,
"num_tokens": 257201.0,
"step": 480
},
{
"epoch": 1.5170278637770898,
"grad_norm": 2.128737211227417,
"learning_rate": 2.5458204334365325e-05,
"loss": 3.3306,
"mean_token_accuracy": 0.3747966349124908,
"num_tokens": 262640.0,
"step": 490
},
{
"epoch": 1.5479876160990713,
"grad_norm": 1.9061874151229858,
"learning_rate": 2.536532507739938e-05,
"loss": 3.3874,
"mean_token_accuracy": 0.3706284284591675,
"num_tokens": 268129.0,
"step": 500
},
{
"epoch": 1.5789473684210527,
"grad_norm": 1.8868329524993896,
"learning_rate": 2.5272445820433438e-05,
"loss": 3.3185,
"mean_token_accuracy": 0.37699449956417086,
"num_tokens": 273489.0,
"step": 510
},
{
"epoch": 1.609907120743034,
"grad_norm": 1.8507658243179321,
"learning_rate": 2.5179566563467495e-05,
"loss": 3.2901,
"mean_token_accuracy": 0.3797824054956436,
"num_tokens": 279142.0,
"step": 520
},
{
"epoch": 1.6408668730650153,
"grad_norm": 2.396951198577881,
"learning_rate": 2.5086687306501548e-05,
"loss": 3.3503,
"mean_token_accuracy": 0.37934728860855105,
"num_tokens": 284049.0,
"step": 530
},
{
"epoch": 1.671826625386997,
"grad_norm": 2.256753921508789,
"learning_rate": 2.4993808049535605e-05,
"loss": 3.3706,
"mean_token_accuracy": 0.38219387233257296,
"num_tokens": 289514.0,
"step": 540
},
{
"epoch": 1.7027863777089784,
"grad_norm": 1.9369879961013794,
"learning_rate": 2.490092879256966e-05,
"loss": 3.3615,
"mean_token_accuracy": 0.37476457953453063,
"num_tokens": 295091.0,
"step": 550
},
{
"epoch": 1.7337461300309598,
"grad_norm": 2.6628825664520264,
"learning_rate": 2.4808049535603714e-05,
"loss": 3.3121,
"mean_token_accuracy": 0.3830788493156433,
"num_tokens": 300102.0,
"step": 560
},
{
"epoch": 1.7647058823529411,
"grad_norm": 2.6188621520996094,
"learning_rate": 2.4715170278637774e-05,
"loss": 3.4005,
"mean_token_accuracy": 0.3631168991327286,
"num_tokens": 305652.0,
"step": 570
},
{
"epoch": 1.7956656346749225,
"grad_norm": 2.093585968017578,
"learning_rate": 2.4622291021671828e-05,
"loss": 3.4483,
"mean_token_accuracy": 0.3639900177717209,
"num_tokens": 311333.0,
"step": 580
},
{
"epoch": 1.826625386996904,
"grad_norm": 2.355714797973633,
"learning_rate": 2.452941176470588e-05,
"loss": 3.2946,
"mean_token_accuracy": 0.3784641414880753,
"num_tokens": 316699.0,
"step": 590
},
{
"epoch": 1.8575851393188856,
"grad_norm": 2.5051403045654297,
"learning_rate": 2.4436532507739938e-05,
"loss": 3.2612,
"mean_token_accuracy": 0.3874821364879608,
"num_tokens": 321718.0,
"step": 600
},
{
"epoch": 1.888544891640867,
"grad_norm": 2.4884731769561768,
"learning_rate": 2.4343653250773994e-05,
"loss": 3.3375,
"mean_token_accuracy": 0.37331779301166534,
"num_tokens": 327347.0,
"step": 610
},
{
"epoch": 1.9195046439628483,
"grad_norm": 2.7246131896972656,
"learning_rate": 2.425077399380805e-05,
"loss": 3.3235,
"mean_token_accuracy": 0.38473727405071256,
"num_tokens": 332563.0,
"step": 620
},
{
"epoch": 1.9504643962848296,
"grad_norm": 2.420604705810547,
"learning_rate": 2.4157894736842104e-05,
"loss": 3.4027,
"mean_token_accuracy": 0.36371313631534574,
"num_tokens": 337880.0,
"step": 630
},
{
"epoch": 1.9814241486068112,
"grad_norm": 2.1465680599212646,
"learning_rate": 2.4065015479876164e-05,
"loss": 3.3508,
"mean_token_accuracy": 0.3747645616531372,
"num_tokens": 343193.0,
"step": 640
},
{
"epoch": 2.0123839009287927,
"grad_norm": 2.0795834064483643,
"learning_rate": 2.3972136222910217e-05,
"loss": 3.3529,
"mean_token_accuracy": 0.3744541972875595,
"num_tokens": 348460.0,
"step": 650
},
{
"epoch": 2.043343653250774,
"grad_norm": 2.4051778316497803,
"learning_rate": 2.387925696594427e-05,
"loss": 3.2404,
"mean_token_accuracy": 0.38319246768951415,
"num_tokens": 353701.0,
"step": 660
},
{
"epoch": 2.0743034055727554,
"grad_norm": 2.401045322418213,
"learning_rate": 2.378637770897833e-05,
"loss": 3.2801,
"mean_token_accuracy": 0.3773155301809311,
"num_tokens": 359147.0,
"step": 670
},
{
"epoch": 2.1052631578947367,
"grad_norm": 2.554138422012329,
"learning_rate": 2.3693498452012384e-05,
"loss": 3.2227,
"mean_token_accuracy": 0.37754152715206146,
"num_tokens": 364696.0,
"step": 680
},
{
"epoch": 2.136222910216718,
"grad_norm": 2.4874625205993652,
"learning_rate": 2.360061919504644e-05,
"loss": 3.2534,
"mean_token_accuracy": 0.3920708328485489,
"num_tokens": 370207.0,
"step": 690
},
{
"epoch": 2.1671826625387,
"grad_norm": 2.638068675994873,
"learning_rate": 2.3507739938080497e-05,
"loss": 3.2635,
"mean_token_accuracy": 0.3835586577653885,
"num_tokens": 375977.0,
"step": 700
},
{
"epoch": 2.198142414860681,
"grad_norm": 2.5179686546325684,
"learning_rate": 2.341486068111455e-05,
"loss": 3.1599,
"mean_token_accuracy": 0.39867666363716125,
"num_tokens": 381398.0,
"step": 710
},
{
"epoch": 2.2291021671826625,
"grad_norm": 2.9102959632873535,
"learning_rate": 2.3321981424148607e-05,
"loss": 3.2686,
"mean_token_accuracy": 0.3878729552030563,
"num_tokens": 386460.0,
"step": 720
},
{
"epoch": 2.260061919504644,
"grad_norm": 2.641160726547241,
"learning_rate": 2.3229102167182663e-05,
"loss": 3.2218,
"mean_token_accuracy": 0.38672482669353486,
"num_tokens": 391538.0,
"step": 730
},
{
"epoch": 2.291021671826625,
"grad_norm": 2.947000026702881,
"learning_rate": 2.313622291021672e-05,
"loss": 3.2215,
"mean_token_accuracy": 0.38650966584682467,
"num_tokens": 396881.0,
"step": 740
},
{
"epoch": 2.321981424148607,
"grad_norm": 2.3614673614501953,
"learning_rate": 2.3043343653250773e-05,
"loss": 3.2233,
"mean_token_accuracy": 0.388895845413208,
"num_tokens": 402418.0,
"step": 750
},
{
"epoch": 2.3529411764705883,
"grad_norm": 2.492814302444458,
"learning_rate": 2.295046439628483e-05,
"loss": 3.2005,
"mean_token_accuracy": 0.382270821928978,
"num_tokens": 407733.0,
"step": 760
},
{
"epoch": 2.3839009287925697,
"grad_norm": 2.646655321121216,
"learning_rate": 2.2857585139318887e-05,
"loss": 3.1642,
"mean_token_accuracy": 0.388735693693161,
"num_tokens": 413101.0,
"step": 770
},
{
"epoch": 2.414860681114551,
"grad_norm": 2.782440662384033,
"learning_rate": 2.276470588235294e-05,
"loss": 3.2513,
"mean_token_accuracy": 0.38574750125408175,
"num_tokens": 418754.0,
"step": 780
},
{
"epoch": 2.4458204334365323,
"grad_norm": 2.7094547748565674,
"learning_rate": 2.2671826625387e-05,
"loss": 3.3007,
"mean_token_accuracy": 0.38453402519226076,
"num_tokens": 424232.0,
"step": 790
},
{
"epoch": 2.476780185758514,
"grad_norm": 2.697098970413208,
"learning_rate": 2.2578947368421053e-05,
"loss": 3.254,
"mean_token_accuracy": 0.3893850326538086,
"num_tokens": 429369.0,
"step": 800
},
{
"epoch": 2.5077399380804954,
"grad_norm": 3.2908523082733154,
"learning_rate": 2.248606811145511e-05,
"loss": 3.1944,
"mean_token_accuracy": 0.39619002044200896,
"num_tokens": 434497.0,
"step": 810
},
{
"epoch": 2.538699690402477,
"grad_norm": 3.068455696105957,
"learning_rate": 2.2393188854489166e-05,
"loss": 3.2034,
"mean_token_accuracy": 0.3942800432443619,
"num_tokens": 439892.0,
"step": 820
},
{
"epoch": 2.569659442724458,
"grad_norm": 2.7893826961517334,
"learning_rate": 2.230030959752322e-05,
"loss": 3.0723,
"mean_token_accuracy": 0.39791189730167387,
"num_tokens": 445189.0,
"step": 830
},
{
"epoch": 2.6006191950464395,
"grad_norm": 2.7569446563720703,
"learning_rate": 2.2207430340557276e-05,
"loss": 3.187,
"mean_token_accuracy": 0.3884226083755493,
"num_tokens": 450338.0,
"step": 840
},
{
"epoch": 2.6315789473684212,
"grad_norm": 3.16340708732605,
"learning_rate": 2.2114551083591333e-05,
"loss": 3.1942,
"mean_token_accuracy": 0.3921455442905426,
"num_tokens": 455583.0,
"step": 850
},
{
"epoch": 2.6625386996904026,
"grad_norm": 2.549273729324341,
"learning_rate": 2.202167182662539e-05,
"loss": 3.278,
"mean_token_accuracy": 0.3782683253288269,
"num_tokens": 461089.0,
"step": 860
},
{
"epoch": 2.693498452012384,
"grad_norm": 3.216149091720581,
"learning_rate": 2.1928792569659443e-05,
"loss": 3.2023,
"mean_token_accuracy": 0.3940991997718811,
"num_tokens": 466239.0,
"step": 870
},
{
"epoch": 2.7244582043343653,
"grad_norm": 2.4680261611938477,
"learning_rate": 2.18359133126935e-05,
"loss": 3.2017,
"mean_token_accuracy": 0.3878098428249359,
"num_tokens": 471660.0,
"step": 880
},
{
"epoch": 2.7554179566563466,
"grad_norm": 3.6166999340057373,
"learning_rate": 2.1743034055727556e-05,
"loss": 3.2068,
"mean_token_accuracy": 0.3789886265993118,
"num_tokens": 476713.0,
"step": 890
},
{
"epoch": 2.7863777089783284,
"grad_norm": 2.0997393131256104,
"learning_rate": 2.165015479876161e-05,
"loss": 3.2216,
"mean_token_accuracy": 0.3913588523864746,
"num_tokens": 482569.0,
"step": 900
},
{
"epoch": 2.8173374613003097,
"grad_norm": 3.0189061164855957,
"learning_rate": 2.1557275541795666e-05,
"loss": 3.1346,
"mean_token_accuracy": 0.3967192888259888,
"num_tokens": 487629.0,
"step": 910
},
{
"epoch": 2.848297213622291,
"grad_norm": 2.8610568046569824,
"learning_rate": 2.1464396284829722e-05,
"loss": 3.2061,
"mean_token_accuracy": 0.3921816825866699,
"num_tokens": 493227.0,
"step": 920
},
{
"epoch": 2.8792569659442724,
"grad_norm": 3.023012638092041,
"learning_rate": 2.1371517027863776e-05,
"loss": 3.2208,
"mean_token_accuracy": 0.3786366432905197,
"num_tokens": 498394.0,
"step": 930
},
{
"epoch": 2.9102167182662537,
"grad_norm": 2.908886194229126,
"learning_rate": 2.1278637770897832e-05,
"loss": 3.1648,
"mean_token_accuracy": 0.3914026439189911,
"num_tokens": 503752.0,
"step": 940
},
{
"epoch": 2.9411764705882355,
"grad_norm": 3.078397750854492,
"learning_rate": 2.118575851393189e-05,
"loss": 3.1891,
"mean_token_accuracy": 0.39542897045612335,
"num_tokens": 509218.0,
"step": 950
},
{
"epoch": 2.972136222910217,
"grad_norm": 2.740389823913574,
"learning_rate": 2.1092879256965946e-05,
"loss": 3.1953,
"mean_token_accuracy": 0.3934174537658691,
"num_tokens": 514716.0,
"step": 960
},
{
"epoch": 3.003095975232198,
"grad_norm": 2.6426961421966553,
"learning_rate": 2.1e-05,
"loss": 3.1084,
"mean_token_accuracy": 0.39845702350139617,
"num_tokens": 520000.0,
"step": 970
},
{
"epoch": 3.0340557275541795,
"grad_norm": 2.8173840045928955,
"learning_rate": 2.090712074303406e-05,
"loss": 3.1574,
"mean_token_accuracy": 0.39481441378593446,
"num_tokens": 525561.0,
"step": 980
},
{
"epoch": 3.065015479876161,
"grad_norm": 3.29856014251709,
"learning_rate": 2.0814241486068112e-05,
"loss": 3.0613,
"mean_token_accuracy": 0.4054213762283325,
"num_tokens": 531021.0,
"step": 990
},
{
"epoch": 3.0959752321981426,
"grad_norm": 3.463890314102173,
"learning_rate": 2.0721362229102165e-05,
"loss": 3.0282,
"mean_token_accuracy": 0.4025467813014984,
"num_tokens": 536260.0,
"step": 1000
},
{
"epoch": 3.126934984520124,
"grad_norm": 3.134387731552124,
"learning_rate": 2.0628482972136225e-05,
"loss": 3.1147,
"mean_token_accuracy": 0.3955592781305313,
"num_tokens": 541721.0,
"step": 1010
},
{
"epoch": 3.1578947368421053,
"grad_norm": 3.237518072128296,
"learning_rate": 2.053560371517028e-05,
"loss": 3.0828,
"mean_token_accuracy": 0.40579850077629087,
"num_tokens": 547334.0,
"step": 1020
},
{
"epoch": 3.1888544891640866,
"grad_norm": 3.2742724418640137,
"learning_rate": 2.0442724458204335e-05,
"loss": 3.0907,
"mean_token_accuracy": 0.40091423988342284,
"num_tokens": 552904.0,
"step": 1030
},
{
"epoch": 3.219814241486068,
"grad_norm": 3.0646955966949463,
"learning_rate": 2.0349845201238392e-05,
"loss": 3.0758,
"mean_token_accuracy": 0.40378672182559966,
"num_tokens": 557743.0,
"step": 1040
},
{
"epoch": 3.2507739938080498,
"grad_norm": 3.2615506649017334,
"learning_rate": 2.0256965944272445e-05,
"loss": 3.0844,
"mean_token_accuracy": 0.41148235499858854,
"num_tokens": 563604.0,
"step": 1050
},
{
"epoch": 3.281733746130031,
"grad_norm": 3.001723527908325,
"learning_rate": 2.0164086687306502e-05,
"loss": 2.9993,
"mean_token_accuracy": 0.41538413166999816,
"num_tokens": 568980.0,
"step": 1060
},
{
"epoch": 3.3126934984520124,
"grad_norm": 3.545367956161499,
"learning_rate": 2.007120743034056e-05,
"loss": 3.0765,
"mean_token_accuracy": 0.40937572419643403,
"num_tokens": 573896.0,
"step": 1070
},
{
"epoch": 3.343653250773994,
"grad_norm": 3.4989709854125977,
"learning_rate": 1.9978328173374615e-05,
"loss": 3.1538,
"mean_token_accuracy": 0.3883706986904144,
"num_tokens": 579254.0,
"step": 1080
},
{
"epoch": 3.374613003095975,
"grad_norm": 3.964334487915039,
"learning_rate": 1.9885448916408668e-05,
"loss": 3.1399,
"mean_token_accuracy": 0.3968294531106949,
"num_tokens": 584603.0,
"step": 1090
},
{
"epoch": 3.405572755417957,
"grad_norm": 4.132855415344238,
"learning_rate": 1.9792569659442725e-05,
"loss": 3.1428,
"mean_token_accuracy": 0.3817721128463745,
"num_tokens": 590267.0,
"step": 1100
}
],
"logging_steps": 10,
"max_steps": 3230,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2.778658848011059e+16,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}