{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.9989717223650385,
"eval_steps": 500,
"global_step": 972,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.02056555269922879,
"grad_norm": 9.233743238441296,
"learning_rate": 1.9994839090452616e-05,
"loss": 1.5713,
"step": 10
},
{
"epoch": 0.04113110539845758,
"grad_norm": 13.69020758087521,
"learning_rate": 1.9979236966675828e-05,
"loss": 1.3565,
"step": 20
},
{
"epoch": 0.061696658097686374,
"grad_norm": 9.2629201307749,
"learning_rate": 1.9953209739827946e-05,
"loss": 1.3132,
"step": 30
},
{
"epoch": 0.08226221079691516,
"grad_norm": 5.836967088114798,
"learning_rate": 1.9916784600016132e-05,
"loss": 1.2708,
"step": 40
},
{
"epoch": 0.10282776349614396,
"grad_norm": 9.00423601885076,
"learning_rate": 1.9869999599832804e-05,
"loss": 1.2658,
"step": 50
},
{
"epoch": 0.12339331619537275,
"grad_norm": 13.166982152661006,
"learning_rate": 1.981290361460287e-05,
"loss": 1.2531,
"step": 60
},
{
"epoch": 0.14395886889460155,
"grad_norm": 10.728661259090346,
"learning_rate": 1.974555629132469e-05,
"loss": 1.2339,
"step": 70
},
{
"epoch": 0.16452442159383032,
"grad_norm": 9.294779587967374,
"learning_rate": 1.9668027986358082e-05,
"loss": 1.2372,
"step": 80
},
{
"epoch": 0.18508997429305912,
"grad_norm": 6.388693353299931,
"learning_rate": 1.9580399691924484e-05,
"loss": 1.2298,
"step": 90
},
{
"epoch": 0.20565552699228792,
"grad_norm": 5.685294719278735,
"learning_rate": 1.9482762951496056e-05,
"loss": 1.2312,
"step": 100
},
{
"epoch": 0.2262210796915167,
"grad_norm": 6.685866232367591,
"learning_rate": 1.9375219764162096e-05,
"loss": 1.2316,
"step": 110
},
{
"epoch": 0.2467866323907455,
"grad_norm": 7.688018717119547,
"learning_rate": 1.92578824780727e-05,
"loss": 1.2337,
"step": 120
},
{
"epoch": 0.26735218508997427,
"grad_norm": 3.884064710067138,
"learning_rate": 1.913087367307095e-05,
"loss": 1.2136,
"step": 130
},
{
"epoch": 0.2879177377892031,
"grad_norm": 9.444686410104936,
"learning_rate": 1.8994326032636318e-05,
"loss": 1.2072,
"step": 140
},
{
"epoch": 0.30848329048843187,
"grad_norm": 4.864822081558493,
"learning_rate": 1.8848382205272924e-05,
"loss": 1.2017,
"step": 150
},
{
"epoch": 0.32904884318766064,
"grad_norm": 5.9852253238798605,
"learning_rate": 1.869319465548762e-05,
"loss": 1.208,
"step": 160
},
{
"epoch": 0.3496143958868895,
"grad_norm": 6.2970547396781225,
"learning_rate": 1.852892550451345e-05,
"loss": 1.2012,
"step": 170
},
{
"epoch": 0.37017994858611825,
"grad_norm": 7.846752995277368,
"learning_rate": 1.835574636094494e-05,
"loss": 1.2035,
"step": 180
},
{
"epoch": 0.390745501285347,
"grad_norm": 9.795450293111358,
"learning_rate": 1.8173838141462145e-05,
"loss": 1.2147,
"step": 190
},
{
"epoch": 0.41131105398457585,
"grad_norm": 7.782933189444834,
"learning_rate": 1.798339088183071e-05,
"loss": 1.2126,
"step": 200
},
{
"epoch": 0.4318766066838046,
"grad_norm": 5.494144367822917,
"learning_rate": 1.7784603538375453e-05,
"loss": 1.2089,
"step": 210
},
{
"epoch": 0.4524421593830334,
"grad_norm": 6.609017931196138,
"learning_rate": 1.7577683780134756e-05,
"loss": 1.1879,
"step": 220
},
{
"epoch": 0.4730077120822622,
"grad_norm": 8.906035242084766,
"learning_rate": 1.7362847771913035e-05,
"loss": 1.2097,
"step": 230
},
{
"epoch": 0.493573264781491,
"grad_norm": 8.648775336727859,
"learning_rate": 1.714031994845782e-05,
"loss": 1.187,
"step": 240
},
{
"epoch": 0.5141388174807198,
"grad_norm": 5.843126331449411,
"learning_rate": 1.6910332779997378e-05,
"loss": 1.1835,
"step": 250
},
{
"epoch": 0.5347043701799485,
"grad_norm": 7.57208189767232,
"learning_rate": 1.6673126529383905e-05,
"loss": 1.1906,
"step": 260
},
{
"epoch": 0.5552699228791774,
"grad_norm": 5.872037189234157,
"learning_rate": 1.642894900109584e-05,
"loss": 1.1736,
"step": 270
},
{
"epoch": 0.5758354755784062,
"grad_norm": 5.59549822016061,
"learning_rate": 1.6178055282361642e-05,
"loss": 1.19,
"step": 280
},
{
"epoch": 0.596401028277635,
"grad_norm": 7.1037529678120785,
"learning_rate": 1.5920707476675446e-05,
"loss": 1.1851,
"step": 290
},
{
"epoch": 0.6169665809768637,
"grad_norm": 4.7250598751432875,
"learning_rate": 1.565717442998292e-05,
"loss": 1.1824,
"step": 300
},
{
"epoch": 0.6375321336760925,
"grad_norm": 7.603461950437508,
"learning_rate": 1.5387731449823474e-05,
"loss": 1.1543,
"step": 310
},
{
"epoch": 0.6580976863753213,
"grad_norm": 7.812378658902085,
"learning_rate": 1.5112660017722122e-05,
"loss": 1.1683,
"step": 320
},
{
"epoch": 0.6786632390745502,
"grad_norm": 6.866611481508661,
"learning_rate": 1.4832247495131566e-05,
"loss": 1.1643,
"step": 330
},
{
"epoch": 0.699228791773779,
"grad_norm": 8.447272563997627,
"learning_rate": 1.45467868232316e-05,
"loss": 1.1679,
"step": 340
},
{
"epoch": 0.7197943444730077,
"grad_norm": 8.29660722849121,
"learning_rate": 1.4256576216899494e-05,
"loss": 1.1605,
"step": 350
},
{
"epoch": 0.7403598971722365,
"grad_norm": 6.408869669950844,
"learning_rate": 1.3961918853171073e-05,
"loss": 1.1681,
"step": 360
},
{
"epoch": 0.7609254498714653,
"grad_norm": 7.257507857539118,
"learning_rate": 1.3663122554517917e-05,
"loss": 1.1545,
"step": 370
},
{
"epoch": 0.781491002570694,
"grad_norm": 6.745599564457566,
"learning_rate": 1.3360499467271552e-05,
"loss": 1.167,
"step": 380
},
{
"epoch": 0.8020565552699229,
"grad_norm": 4.372254672475978,
"learning_rate": 1.3054365735530666e-05,
"loss": 1.1706,
"step": 390
},
{
"epoch": 0.8226221079691517,
"grad_norm": 5.9583586806734585,
"learning_rate": 1.2745041170891827e-05,
"loss": 1.1512,
"step": 400
},
{
"epoch": 0.8431876606683805,
"grad_norm": 8.218488729311929,
"learning_rate": 1.243284891834894e-05,
"loss": 1.161,
"step": 410
},
{
"epoch": 0.8637532133676092,
"grad_norm": 9.05020466841952,
"learning_rate": 1.211811511871033e-05,
"loss": 1.1499,
"step": 420
},
{
"epoch": 0.884318766066838,
"grad_norm": 7.133782169539759,
"learning_rate": 1.1801168567886159e-05,
"loss": 1.1428,
"step": 430
},
{
"epoch": 0.9048843187660668,
"grad_norm": 10.061681353175718,
"learning_rate": 1.1482340373402128e-05,
"loss": 1.1548,
"step": 440
},
{
"epoch": 0.9254498714652957,
"grad_norm": 5.964595532826752,
"learning_rate": 1.1161963608498254e-05,
"loss": 1.1375,
"step": 450
},
{
"epoch": 0.9460154241645244,
"grad_norm": 7.682863713931749,
"learning_rate": 1.0840372964174148e-05,
"loss": 1.1441,
"step": 460
},
{
"epoch": 0.9665809768637532,
"grad_norm": 6.093064442303913,
"learning_rate": 1.051790439954422e-05,
"loss": 1.1374,
"step": 470
},
{
"epoch": 0.987146529562982,
"grad_norm": 2.859419774137469,
"learning_rate": 1.0194894790868113e-05,
"loss": 1.135,
"step": 480
},
{
"epoch": 1.0077120822622108,
"grad_norm": 5.504925974002832,
"learning_rate": 9.871681579623028e-06,
"loss": 1.1268,
"step": 490
},
{
"epoch": 1.0282776349614395,
"grad_norm": 6.006410935514135,
"learning_rate": 9.548602419985584e-06,
"loss": 1.0911,
"step": 500
},
{
"epoch": 1.0488431876606683,
"grad_norm": 3.2675854341672608,
"learning_rate": 9.225994826091431e-06,
"loss": 1.0816,
"step": 510
},
{
"epoch": 1.069408740359897,
"grad_norm": 6.801925989140796,
"learning_rate": 8.904195819441222e-06,
"loss": 1.0833,
"step": 520
},
{
"epoch": 1.089974293059126,
"grad_norm": 3.6643836420434663,
"learning_rate": 8.583541576821191e-06,
"loss": 1.081,
"step": 530
},
{
"epoch": 1.1105398457583548,
"grad_norm": 6.747377788645208,
"learning_rate": 8.264367079106194e-06,
"loss": 1.0793,
"step": 540
},
{
"epoch": 1.1311053984575836,
"grad_norm": 6.372537375904327,
"learning_rate": 7.947005761312097e-06,
"loss": 1.0979,
"step": 550
},
{
"epoch": 1.1516709511568124,
"grad_norm": 6.925714108622525,
"learning_rate": 7.6317891642631e-06,
"loss": 1.0868,
"step": 560
},
{
"epoch": 1.1722365038560412,
"grad_norm": 7.332135497496814,
"learning_rate": 7.319046588237864e-06,
"loss": 1.0613,
"step": 570
},
{
"epoch": 1.19280205655527,
"grad_norm": 3.4981756709446454,
"learning_rate": 7.009104748956304e-06,
"loss": 1.0801,
"step": 580
},
{
"epoch": 1.2133676092544987,
"grad_norm": 5.6660043695937,
"learning_rate": 6.7022874362664155e-06,
"loss": 1.0838,
"step": 590
},
{
"epoch": 1.2339331619537275,
"grad_norm": 6.6393615146162634,
"learning_rate": 6.398915175887698e-06,
"loss": 1.0692,
"step": 600
},
{
"epoch": 1.2544987146529563,
"grad_norm": 9.01722626725686,
"learning_rate": 6.099304894564544e-06,
"loss": 1.093,
"step": 610
},
{
"epoch": 1.275064267352185,
"grad_norm": 7.465032128492785,
"learning_rate": 5.8037695889794e-06,
"loss": 1.0781,
"step": 620
},
{
"epoch": 1.2956298200514138,
"grad_norm": 8.243642393853053,
"learning_rate": 5.512617998771598e-06,
"loss": 1.0833,
"step": 630
},
{
"epoch": 1.3161953727506428,
"grad_norm": 3.1779811319825577,
"learning_rate": 5.226154284003411e-06,
"loss": 1.0715,
"step": 640
},
{
"epoch": 1.3367609254498714,
"grad_norm": 4.120350391737107,
"learning_rate": 4.944677707410315e-06,
"loss": 1.0829,
"step": 650
},
{
"epoch": 1.3573264781491003,
"grad_norm": 7.105813120974067,
"learning_rate": 4.668482321767371e-06,
"loss": 1.0865,
"step": 660
},
{
"epoch": 1.3778920308483291,
"grad_norm": 8.035377582845337,
"learning_rate": 4.397856662698368e-06,
"loss": 1.0533,
"step": 670
},
{
"epoch": 1.398457583547558,
"grad_norm": 8.049531770624485,
"learning_rate": 4.133083447248599e-06,
"loss": 1.0745,
"step": 680
},
{
"epoch": 1.4190231362467867,
"grad_norm": 7.225962565044245,
"learning_rate": 3.874439278536187e-06,
"loss": 1.0899,
"step": 690
},
{
"epoch": 1.4395886889460154,
"grad_norm": 7.270761540137814,
"learning_rate": 3.6221943567905283e-06,
"loss": 1.0784,
"step": 700
},
{
"epoch": 1.4601542416452442,
"grad_norm": 8.705361491327107,
"learning_rate": 3.3766121970796716e-06,
"loss": 1.0819,
"step": 710
},
{
"epoch": 1.480719794344473,
"grad_norm": 4.183940499981808,
"learning_rate": 3.1379493540215677e-06,
"loss": 1.069,
"step": 720
},
{
"epoch": 1.5012853470437018,
"grad_norm": 6.398726679627311,
"learning_rate": 2.906455153766744e-06,
"loss": 1.0785,
"step": 730
},
{
"epoch": 1.5218508997429305,
"grad_norm": 13.601116552168905,
"learning_rate": 2.6823714335324237e-06,
"loss": 1.057,
"step": 740
},
{
"epoch": 1.5424164524421595,
"grad_norm": 5.5895207138975,
"learning_rate": 2.46593228896017e-06,
"loss": 1.0553,
"step": 750
},
{
"epoch": 1.562982005141388,
"grad_norm": 6.267057233279022,
"learning_rate": 2.257363829560986e-06,
"loss": 1.0542,
"step": 760
},
{
"epoch": 1.583547557840617,
"grad_norm": 4.378216644983502,
"learning_rate": 2.0568839425033906e-06,
"loss": 1.0799,
"step": 770
},
{
"epoch": 1.6041131105398456,
"grad_norm": 7.255472699627192,
"learning_rate": 1.864702064991173e-06,
"loss": 1.0571,
"step": 780
},
{
"epoch": 1.6246786632390746,
"grad_norm": 6.289474192540149,
"learning_rate": 1.6810189654686715e-06,
"loss": 1.0472,
"step": 790
},
{
"epoch": 1.6452442159383034,
"grad_norm": 6.589217658649451,
"learning_rate": 1.5060265338821123e-06,
"loss": 1.0703,
"step": 800
},
{
"epoch": 1.6658097686375322,
"grad_norm": 4.865701816340032,
"learning_rate": 1.3399075812161488e-06,
"loss": 1.055,
"step": 810
},
{
"epoch": 1.686375321336761,
"grad_norm": 8.65763359081179,
"learning_rate": 1.1828356485149927e-06,
"loss": 1.0622,
"step": 820
},
{
"epoch": 1.7069408740359897,
"grad_norm": 6.174972638755989,
"learning_rate": 1.0349748255876536e-06,
"loss": 1.0526,
"step": 830
},
{
"epoch": 1.7275064267352185,
"grad_norm": 4.986705584850829,
"learning_rate": 8.964795795867176e-07,
"loss": 1.051,
"step": 840
},
{
"epoch": 1.7480719794344473,
"grad_norm": 9.202122011281354,
"learning_rate": 7.67494593639686e-07,
"loss": 1.0574,
"step": 850
},
{
"epoch": 1.7686375321336762,
"grad_norm": 8.058395431378568,
"learning_rate": 6.481546157014996e-07,
"loss": 1.0589,
"step": 860
},
{
"epoch": 1.7892030848329048,
"grad_norm": 5.3354582297297135,
"learning_rate": 5.385843177861261e-07,
"loss": 1.0578,
"step": 870
},
{
"epoch": 1.8097686375321338,
"grad_norm": 3.8239017444437415,
"learning_rate": 4.388981657242819e-07,
"loss": 1.0663,
"step": 880
},
{
"epoch": 1.8303341902313623,
"grad_norm": 10.032769787716335,
"learning_rate": 3.4920029958333656e-07,
"loss": 1.0671,
"step": 890
},
{
"epoch": 1.8508997429305913,
"grad_norm": 4.449303124886201,
"learning_rate": 2.695844248743318e-07,
"loss": 1.0573,
"step": 900
},
{
"epoch": 1.87146529562982,
"grad_norm": 6.5875403409786735,
"learning_rate": 2.0013371465976816e-07,
"loss": 1.063,
"step": 910
},
{
"epoch": 1.8920308483290489,
"grad_norm": 6.1087849946753545,
"learning_rate": 1.409207226644227e-07,
"loss": 1.0703,
"step": 920
},
{
"epoch": 1.9125964010282777,
"grad_norm": 9.357948853303002,
"learning_rate": 9.200730747996211e-08,
"loss": 1.0615,
"step": 930
},
{
"epoch": 1.9331619537275064,
"grad_norm": 6.746455746388351,
"learning_rate": 5.344456794255881e-08,
"loss": 1.0591,
"step": 940
},
{
"epoch": 1.9537275064267352,
"grad_norm": 4.307020198331092,
"learning_rate": 2.5272789750980797e-08,
"loss": 1.0591,
"step": 950
},
{
"epoch": 1.974293059125964,
"grad_norm": 2.368072060390636,
"learning_rate": 7.521403380956748e-09,
"loss": 1.0602,
"step": 960
},
{
"epoch": 1.9948586118251928,
"grad_norm": 6.97337278897093,
"learning_rate": 2.089533397653387e-10,
"loss": 1.0495,
"step": 970
},
{
"epoch": 1.9989717223650385,
"step": 972,
"total_flos": 9.034546879177687e+18,
"train_loss": 1.1365475546675945,
"train_runtime": 15552.1986,
"train_samples_per_second": 32.015,
"train_steps_per_second": 0.062
}
],
"logging_steps": 10,
"max_steps": 972,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 500,
"total_flos": 9.034546879177687e+18,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}