gemma-2b-sciq / trainer_state.json
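
The JSON below is the Hugging Face Trainer state for this run: `log_history` holds a training entry every 10 steps (epoch, grad_norm, learning_rate, loss) and an evaluation entry every 500 steps (eval_loss, eval_runtime, throughput). As a convenience, here is a minimal sketch, assuming the file is saved locally as `trainer_state.json`, that reads those entries back out; the key names mirror the structure visible in the dump, while the file path and variable names are purely illustrative.

```python
# Minimal sketch (not part of the original file): load this trainer_state.json
# and pull the training/eval loss curves out of "log_history".
# Keys used below ("log_history", "step", "loss", "eval_loss") match the
# structure shown in the dump; the filename is an assumption.
import json

with open("trainer_state.json") as f:
    state = json.load(f)

# Each log_history entry is either a training log (has "loss")
# or an evaluation log (has "eval_loss"); both carry "step".
train_points = [(e["step"], e["loss"]) for e in state["log_history"] if "loss" in e]
eval_points = [(e["step"], e["eval_loss"]) for e in state["log_history"] if "eval_loss" in e]

print(f"training points: {len(train_points)}, eval points: {len(eval_points)}")
print("last eval:", eval_points[-1] if eval_points else None)
```
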
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.8837229214830038,
"eval_steps": 500,
"global_step": 5500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"grad_norm": 0.5656424760818481,
"learning_rate": 0.00019972583961617548,
"loss": 2.6115,
"step": 10
},
{
"epoch": 0.01,
"grad_norm": 0.9568225145339966,
"learning_rate": 0.0001993831391363948,
"loss": 2.3674,
"step": 20
},
{
"epoch": 0.01,
"grad_norm": 0.8508038520812988,
"learning_rate": 0.00019904043865661412,
"loss": 2.1487,
"step": 30
},
{
"epoch": 0.01,
"grad_norm": 1.0186710357666016,
"learning_rate": 0.00019869773817683345,
"loss": 2.1421,
"step": 40
},
{
"epoch": 0.02,
"grad_norm": 1.7234398126602173,
"learning_rate": 0.0001983550376970528,
"loss": 2.0021,
"step": 50
},
{
"epoch": 0.02,
"grad_norm": 1.197520136833191,
"learning_rate": 0.0001980123372172721,
"loss": 2.0381,
"step": 60
},
{
"epoch": 0.02,
"grad_norm": 1.0739991664886475,
"learning_rate": 0.00019766963673749144,
"loss": 1.8883,
"step": 70
},
{
"epoch": 0.03,
"grad_norm": 1.0150132179260254,
"learning_rate": 0.00019732693625771076,
"loss": 1.8494,
"step": 80
},
{
"epoch": 0.03,
"grad_norm": 1.236234426498413,
"learning_rate": 0.0001969842357779301,
"loss": 1.9171,
"step": 90
},
{
"epoch": 0.03,
"grad_norm": 1.0886958837509155,
"learning_rate": 0.00019664153529814942,
"loss": 1.9387,
"step": 100
},
{
"epoch": 0.04,
"grad_norm": 1.1191097497940063,
"learning_rate": 0.00019629883481836875,
"loss": 1.976,
"step": 110
},
{
"epoch": 0.04,
"grad_norm": 1.0738675594329834,
"learning_rate": 0.00019595613433858808,
"loss": 1.8378,
"step": 120
},
{
"epoch": 0.04,
"grad_norm": 0.648668646812439,
"learning_rate": 0.0001956134338588074,
"loss": 1.768,
"step": 130
},
{
"epoch": 0.05,
"grad_norm": 0.9386289119720459,
"learning_rate": 0.00019527073337902674,
"loss": 2.0067,
"step": 140
},
{
"epoch": 0.05,
"grad_norm": 1.1613832712173462,
"learning_rate": 0.00019492803289924607,
"loss": 1.9355,
"step": 150
},
{
"epoch": 0.05,
"grad_norm": 0.7319044470787048,
"learning_rate": 0.0001945853324194654,
"loss": 1.9042,
"step": 160
},
{
"epoch": 0.06,
"grad_norm": 0.9041644930839539,
"learning_rate": 0.00019424263193968473,
"loss": 1.7163,
"step": 170
},
{
"epoch": 0.06,
"grad_norm": 0.9293299317359924,
"learning_rate": 0.00019389993145990406,
"loss": 1.807,
"step": 180
},
{
"epoch": 0.07,
"grad_norm": 0.9214122295379639,
"learning_rate": 0.00019355723098012336,
"loss": 1.9056,
"step": 190
},
{
"epoch": 0.07,
"grad_norm": 0.7177646160125732,
"learning_rate": 0.0001932145305003427,
"loss": 1.9574,
"step": 200
},
{
"epoch": 0.07,
"grad_norm": 0.813965916633606,
"learning_rate": 0.00019287183002056205,
"loss": 1.7995,
"step": 210
},
{
"epoch": 0.08,
"grad_norm": 1.0333760976791382,
"learning_rate": 0.00019252912954078138,
"loss": 1.814,
"step": 220
},
{
"epoch": 0.08,
"grad_norm": 0.6691217422485352,
"learning_rate": 0.0001921864290610007,
"loss": 1.8261,
"step": 230
},
{
"epoch": 0.08,
"grad_norm": 1.1737751960754395,
"learning_rate": 0.00019184372858122,
"loss": 1.9473,
"step": 240
},
{
"epoch": 0.09,
"grad_norm": 1.1508344411849976,
"learning_rate": 0.00019150102810143934,
"loss": 1.9176,
"step": 250
},
{
"epoch": 0.09,
"grad_norm": 0.6660133600234985,
"learning_rate": 0.00019115832762165867,
"loss": 1.835,
"step": 260
},
{
"epoch": 0.09,
"grad_norm": 0.6423531174659729,
"learning_rate": 0.00019081562714187803,
"loss": 1.7194,
"step": 270
},
{
"epoch": 0.1,
"grad_norm": 0.8241636157035828,
"learning_rate": 0.00019047292666209733,
"loss": 1.8679,
"step": 280
},
{
"epoch": 0.1,
"grad_norm": 0.7184795141220093,
"learning_rate": 0.00019013022618231666,
"loss": 1.8129,
"step": 290
},
{
"epoch": 0.1,
"grad_norm": 0.8253782391548157,
"learning_rate": 0.000189787525702536,
"loss": 1.8567,
"step": 300
},
{
"epoch": 0.11,
"grad_norm": 1.417243242263794,
"learning_rate": 0.00018944482522275532,
"loss": 1.7741,
"step": 310
},
{
"epoch": 0.11,
"grad_norm": 0.9040454626083374,
"learning_rate": 0.00018910212474297465,
"loss": 1.8458,
"step": 320
},
{
"epoch": 0.11,
"grad_norm": 0.6580069065093994,
"learning_rate": 0.00018875942426319398,
"loss": 1.7982,
"step": 330
},
{
"epoch": 0.12,
"grad_norm": 0.8849833011627197,
"learning_rate": 0.0001884167237834133,
"loss": 1.8622,
"step": 340
},
{
"epoch": 0.12,
"grad_norm": 1.0523239374160767,
"learning_rate": 0.00018807402330363264,
"loss": 1.8608,
"step": 350
},
{
"epoch": 0.12,
"grad_norm": 1.0496423244476318,
"learning_rate": 0.00018773132282385194,
"loss": 1.8245,
"step": 360
},
{
"epoch": 0.13,
"grad_norm": 0.9488272070884705,
"learning_rate": 0.00018738862234407127,
"loss": 1.8933,
"step": 370
},
{
"epoch": 0.13,
"grad_norm": 0.9461072087287903,
"learning_rate": 0.00018704592186429063,
"loss": 1.7277,
"step": 380
},
{
"epoch": 0.13,
"grad_norm": 0.6415026187896729,
"learning_rate": 0.00018670322138450996,
"loss": 1.7843,
"step": 390
},
{
"epoch": 0.14,
"grad_norm": 1.0457078218460083,
"learning_rate": 0.0001863605209047293,
"loss": 1.8874,
"step": 400
},
{
"epoch": 0.14,
"grad_norm": 1.0890721082687378,
"learning_rate": 0.0001860178204249486,
"loss": 1.8536,
"step": 410
},
{
"epoch": 0.14,
"grad_norm": 0.8896569013595581,
"learning_rate": 0.00018567511994516792,
"loss": 1.8297,
"step": 420
},
{
"epoch": 0.15,
"grad_norm": 0.9457584023475647,
"learning_rate": 0.00018533241946538728,
"loss": 1.8061,
"step": 430
},
{
"epoch": 0.15,
"grad_norm": 0.8208130598068237,
"learning_rate": 0.0001849897189856066,
"loss": 1.8238,
"step": 440
},
{
"epoch": 0.15,
"grad_norm": 0.7884149551391602,
"learning_rate": 0.0001846470185058259,
"loss": 1.7419,
"step": 450
},
{
"epoch": 0.16,
"grad_norm": 1.5733205080032349,
"learning_rate": 0.00018430431802604524,
"loss": 1.8829,
"step": 460
},
{
"epoch": 0.16,
"grad_norm": 0.963455319404602,
"learning_rate": 0.00018396161754626457,
"loss": 1.822,
"step": 470
},
{
"epoch": 0.16,
"grad_norm": 0.616909384727478,
"learning_rate": 0.0001836189170664839,
"loss": 1.7923,
"step": 480
},
{
"epoch": 0.17,
"grad_norm": 0.5382218360900879,
"learning_rate": 0.00018327621658670323,
"loss": 1.719,
"step": 490
},
{
"epoch": 0.17,
"grad_norm": 1.171004056930542,
"learning_rate": 0.00018293351610692256,
"loss": 1.8522,
"step": 500
},
{
"epoch": 0.17,
"eval_loss": 1.9394277334213257,
"eval_runtime": 33.4276,
"eval_samples_per_second": 29.915,
"eval_steps_per_second": 3.739,
"step": 500
},
{
"epoch": 0.17,
"grad_norm": 0.7731293439865112,
"learning_rate": 0.0001825908156271419,
"loss": 1.9151,
"step": 510
},
{
"epoch": 0.18,
"grad_norm": 0.8664043545722961,
"learning_rate": 0.00018224811514736122,
"loss": 1.6679,
"step": 520
},
{
"epoch": 0.18,
"grad_norm": 1.3886076211929321,
"learning_rate": 0.00018190541466758055,
"loss": 1.8509,
"step": 530
},
{
"epoch": 0.18,
"grad_norm": 0.7000617384910583,
"learning_rate": 0.00018156271418779988,
"loss": 1.8046,
"step": 540
},
{
"epoch": 0.19,
"grad_norm": 0.8490706086158752,
"learning_rate": 0.0001812200137080192,
"loss": 1.748,
"step": 550
},
{
"epoch": 0.19,
"grad_norm": 1.4293190240859985,
"learning_rate": 0.00018087731322823854,
"loss": 1.9725,
"step": 560
},
{
"epoch": 0.2,
"grad_norm": 0.7126957178115845,
"learning_rate": 0.00018053461274845787,
"loss": 1.6888,
"step": 570
},
{
"epoch": 0.2,
"grad_norm": 0.9974524974822998,
"learning_rate": 0.00018019191226867717,
"loss": 1.8405,
"step": 580
},
{
"epoch": 0.2,
"grad_norm": 0.9911081790924072,
"learning_rate": 0.0001798492117888965,
"loss": 1.7753,
"step": 590
},
{
"epoch": 0.21,
"grad_norm": 1.3659840822219849,
"learning_rate": 0.00017950651130911585,
"loss": 1.7435,
"step": 600
},
{
"epoch": 0.21,
"grad_norm": 0.4976978302001953,
"learning_rate": 0.00017916381082933518,
"loss": 1.759,
"step": 610
},
{
"epoch": 0.21,
"grad_norm": 0.7868736982345581,
"learning_rate": 0.0001788211103495545,
"loss": 1.7654,
"step": 620
},
{
"epoch": 0.22,
"grad_norm": 1.006628155708313,
"learning_rate": 0.00017847840986977382,
"loss": 1.7862,
"step": 630
},
{
"epoch": 0.22,
"grad_norm": 0.8664697408676147,
"learning_rate": 0.00017813570938999315,
"loss": 1.8815,
"step": 640
},
{
"epoch": 0.22,
"grad_norm": 0.44789645075798035,
"learning_rate": 0.00017779300891021248,
"loss": 1.779,
"step": 650
},
{
"epoch": 0.23,
"grad_norm": 0.9740760326385498,
"learning_rate": 0.00017745030843043183,
"loss": 1.7026,
"step": 660
},
{
"epoch": 0.23,
"grad_norm": 0.9802984595298767,
"learning_rate": 0.00017710760795065114,
"loss": 1.8359,
"step": 670
},
{
"epoch": 0.23,
"grad_norm": 1.0521053075790405,
"learning_rate": 0.00017676490747087047,
"loss": 1.7777,
"step": 680
},
{
"epoch": 0.24,
"grad_norm": 0.6399825215339661,
"learning_rate": 0.0001764222069910898,
"loss": 1.8129,
"step": 690
},
{
"epoch": 0.24,
"grad_norm": 1.1847810745239258,
"learning_rate": 0.00017607950651130912,
"loss": 1.8775,
"step": 700
},
{
"epoch": 0.24,
"grad_norm": 0.7050787806510925,
"learning_rate": 0.00017573680603152845,
"loss": 1.8454,
"step": 710
},
{
"epoch": 0.25,
"grad_norm": 0.8241177797317505,
"learning_rate": 0.00017539410555174778,
"loss": 1.7047,
"step": 720
},
{
"epoch": 0.25,
"grad_norm": 1.743680477142334,
"learning_rate": 0.00017505140507196711,
"loss": 1.8251,
"step": 730
},
{
"epoch": 0.25,
"grad_norm": 0.776196300983429,
"learning_rate": 0.00017470870459218644,
"loss": 1.8341,
"step": 740
},
{
"epoch": 0.26,
"grad_norm": 0.6896054744720459,
"learning_rate": 0.00017436600411240575,
"loss": 1.7569,
"step": 750
},
{
"epoch": 0.26,
"grad_norm": 0.703697919845581,
"learning_rate": 0.0001740233036326251,
"loss": 1.7696,
"step": 760
},
{
"epoch": 0.26,
"grad_norm": 0.6734452247619629,
"learning_rate": 0.00017368060315284443,
"loss": 1.6639,
"step": 770
},
{
"epoch": 0.27,
"grad_norm": 0.6856238842010498,
"learning_rate": 0.00017333790267306376,
"loss": 1.8419,
"step": 780
},
{
"epoch": 0.27,
"grad_norm": 1.1194758415222168,
"learning_rate": 0.00017299520219328306,
"loss": 1.7916,
"step": 790
},
{
"epoch": 0.27,
"grad_norm": 1.455841064453125,
"learning_rate": 0.0001726525017135024,
"loss": 1.7368,
"step": 800
},
{
"epoch": 0.28,
"grad_norm": 0.5988683700561523,
"learning_rate": 0.00017230980123372172,
"loss": 1.8434,
"step": 810
},
{
"epoch": 0.28,
"grad_norm": 0.9031710028648376,
"learning_rate": 0.00017196710075394108,
"loss": 1.7447,
"step": 820
},
{
"epoch": 0.28,
"grad_norm": 1.2125264406204224,
"learning_rate": 0.0001716244002741604,
"loss": 1.9449,
"step": 830
},
{
"epoch": 0.29,
"grad_norm": 0.9563066959381104,
"learning_rate": 0.0001712816997943797,
"loss": 1.7063,
"step": 840
},
{
"epoch": 0.29,
"grad_norm": 0.8778769969940186,
"learning_rate": 0.00017093899931459904,
"loss": 1.802,
"step": 850
},
{
"epoch": 0.29,
"grad_norm": 1.0570799112319946,
"learning_rate": 0.00017059629883481837,
"loss": 1.7331,
"step": 860
},
{
"epoch": 0.3,
"grad_norm": 0.8234407305717468,
"learning_rate": 0.0001702535983550377,
"loss": 1.7943,
"step": 870
},
{
"epoch": 0.3,
"grad_norm": 0.968658983707428,
"learning_rate": 0.00016991089787525703,
"loss": 1.8527,
"step": 880
},
{
"epoch": 0.3,
"grad_norm": 0.6607180237770081,
"learning_rate": 0.00016956819739547636,
"loss": 1.8521,
"step": 890
},
{
"epoch": 0.31,
"grad_norm": 0.8055354952812195,
"learning_rate": 0.0001692254969156957,
"loss": 1.6901,
"step": 900
},
{
"epoch": 0.31,
"grad_norm": 0.8606925010681152,
"learning_rate": 0.00016888279643591502,
"loss": 1.7248,
"step": 910
},
{
"epoch": 0.32,
"grad_norm": 0.9894892573356628,
"learning_rate": 0.00016854009595613432,
"loss": 1.7541,
"step": 920
},
{
"epoch": 0.32,
"grad_norm": 0.8559629321098328,
"learning_rate": 0.00016819739547635368,
"loss": 1.7803,
"step": 930
},
{
"epoch": 0.32,
"grad_norm": 0.8917673826217651,
"learning_rate": 0.000167854694996573,
"loss": 1.8224,
"step": 940
},
{
"epoch": 0.33,
"grad_norm": 1.2621186971664429,
"learning_rate": 0.00016751199451679234,
"loss": 1.8253,
"step": 950
},
{
"epoch": 0.33,
"grad_norm": 1.1135177612304688,
"learning_rate": 0.00016716929403701167,
"loss": 1.6519,
"step": 960
},
{
"epoch": 0.33,
"grad_norm": 0.7034028172492981,
"learning_rate": 0.00016682659355723097,
"loss": 1.7079,
"step": 970
},
{
"epoch": 0.34,
"grad_norm": 0.7942814826965332,
"learning_rate": 0.0001664838930774503,
"loss": 1.828,
"step": 980
},
{
"epoch": 0.34,
"grad_norm": 0.9687950611114502,
"learning_rate": 0.00016614119259766966,
"loss": 1.7203,
"step": 990
},
{
"epoch": 0.34,
"grad_norm": 1.1074302196502686,
"learning_rate": 0.000165798492117889,
"loss": 1.7146,
"step": 1000
},
{
"epoch": 0.34,
"eval_loss": 1.9078810214996338,
"eval_runtime": 33.2486,
"eval_samples_per_second": 30.076,
"eval_steps_per_second": 3.76,
"step": 1000
},
{
"epoch": 0.35,
"grad_norm": 0.9533829689025879,
"learning_rate": 0.0001654557916381083,
"loss": 1.7596,
"step": 1010
},
{
"epoch": 0.35,
"grad_norm": 1.0547090768814087,
"learning_rate": 0.00016511309115832762,
"loss": 1.9113,
"step": 1020
},
{
"epoch": 0.35,
"grad_norm": 1.0186220407485962,
"learning_rate": 0.00016477039067854695,
"loss": 1.7845,
"step": 1030
},
{
"epoch": 0.36,
"grad_norm": 0.9044001698493958,
"learning_rate": 0.00016442769019876628,
"loss": 1.8174,
"step": 1040
},
{
"epoch": 0.36,
"grad_norm": 0.6433171033859253,
"learning_rate": 0.0001640849897189856,
"loss": 1.7702,
"step": 1050
},
{
"epoch": 0.36,
"grad_norm": 1.2511520385742188,
"learning_rate": 0.00016374228923920494,
"loss": 1.9304,
"step": 1060
},
{
"epoch": 0.37,
"grad_norm": 0.7901211977005005,
"learning_rate": 0.00016339958875942427,
"loss": 1.8432,
"step": 1070
},
{
"epoch": 0.37,
"grad_norm": 1.515535831451416,
"learning_rate": 0.0001630568882796436,
"loss": 1.8818,
"step": 1080
},
{
"epoch": 0.37,
"grad_norm": 0.9449120759963989,
"learning_rate": 0.00016271418779986293,
"loss": 1.8594,
"step": 1090
},
{
"epoch": 0.38,
"grad_norm": 0.7776308059692383,
"learning_rate": 0.00016237148732008226,
"loss": 1.8896,
"step": 1100
},
{
"epoch": 0.38,
"grad_norm": 1.3541969060897827,
"learning_rate": 0.0001620287868403016,
"loss": 1.8208,
"step": 1110
},
{
"epoch": 0.38,
"grad_norm": 0.7614444494247437,
"learning_rate": 0.00016168608636052092,
"loss": 1.759,
"step": 1120
},
{
"epoch": 0.39,
"grad_norm": 1.170345425605774,
"learning_rate": 0.00016134338588074025,
"loss": 1.6713,
"step": 1130
},
{
"epoch": 0.39,
"grad_norm": 0.8094021081924438,
"learning_rate": 0.00016100068540095955,
"loss": 1.7394,
"step": 1140
},
{
"epoch": 0.39,
"grad_norm": 1.169124722480774,
"learning_rate": 0.0001606579849211789,
"loss": 1.7609,
"step": 1150
},
{
"epoch": 0.4,
"grad_norm": 0.6766496300697327,
"learning_rate": 0.00016031528444139824,
"loss": 1.7812,
"step": 1160
},
{
"epoch": 0.4,
"grad_norm": 1.0808138847351074,
"learning_rate": 0.00015997258396161757,
"loss": 1.7777,
"step": 1170
},
{
"epoch": 0.4,
"grad_norm": 0.6450923681259155,
"learning_rate": 0.00015962988348183687,
"loss": 1.8539,
"step": 1180
},
{
"epoch": 0.41,
"grad_norm": 1.0518946647644043,
"learning_rate": 0.0001592871830020562,
"loss": 1.7799,
"step": 1190
},
{
"epoch": 0.41,
"grad_norm": 0.7807414531707764,
"learning_rate": 0.00015894448252227553,
"loss": 1.774,
"step": 1200
},
{
"epoch": 0.41,
"grad_norm": 1.4259986877441406,
"learning_rate": 0.00015860178204249488,
"loss": 1.8153,
"step": 1210
},
{
"epoch": 0.42,
"grad_norm": 0.9342586994171143,
"learning_rate": 0.0001582590815627142,
"loss": 1.7495,
"step": 1220
},
{
"epoch": 0.42,
"grad_norm": 0.7621099948883057,
"learning_rate": 0.00015791638108293352,
"loss": 1.7964,
"step": 1230
},
{
"epoch": 0.42,
"grad_norm": 0.8253260254859924,
"learning_rate": 0.00015757368060315285,
"loss": 1.7669,
"step": 1240
},
{
"epoch": 0.43,
"grad_norm": 0.6914420127868652,
"learning_rate": 0.00015723098012337218,
"loss": 1.803,
"step": 1250
},
{
"epoch": 0.43,
"grad_norm": 0.7147281765937805,
"learning_rate": 0.0001568882796435915,
"loss": 1.8226,
"step": 1260
},
{
"epoch": 0.43,
"grad_norm": 2.0851213932037354,
"learning_rate": 0.00015654557916381084,
"loss": 1.6957,
"step": 1270
},
{
"epoch": 0.44,
"grad_norm": 0.6254770159721375,
"learning_rate": 0.00015620287868403017,
"loss": 1.75,
"step": 1280
},
{
"epoch": 0.44,
"grad_norm": 1.0984652042388916,
"learning_rate": 0.0001558601782042495,
"loss": 1.8425,
"step": 1290
},
{
"epoch": 0.45,
"grad_norm": 1.0353467464447021,
"learning_rate": 0.00015551747772446882,
"loss": 1.7995,
"step": 1300
},
{
"epoch": 0.45,
"grad_norm": 0.6647160053253174,
"learning_rate": 0.00015517477724468813,
"loss": 1.866,
"step": 1310
},
{
"epoch": 0.45,
"grad_norm": 0.6671775579452515,
"learning_rate": 0.00015483207676490748,
"loss": 1.6871,
"step": 1320
},
{
"epoch": 0.46,
"grad_norm": 1.0024131536483765,
"learning_rate": 0.00015448937628512681,
"loss": 1.7424,
"step": 1330
},
{
"epoch": 0.46,
"grad_norm": 1.0090551376342773,
"learning_rate": 0.00015414667580534614,
"loss": 1.7001,
"step": 1340
},
{
"epoch": 0.46,
"grad_norm": 0.9725455045700073,
"learning_rate": 0.00015380397532556545,
"loss": 1.7114,
"step": 1350
},
{
"epoch": 0.47,
"grad_norm": 0.6556392312049866,
"learning_rate": 0.00015346127484578478,
"loss": 1.5969,
"step": 1360
},
{
"epoch": 0.47,
"grad_norm": 1.156596302986145,
"learning_rate": 0.00015311857436600413,
"loss": 1.7334,
"step": 1370
},
{
"epoch": 0.47,
"grad_norm": 0.9172496199607849,
"learning_rate": 0.00015277587388622346,
"loss": 1.7373,
"step": 1380
},
{
"epoch": 0.48,
"grad_norm": 0.9010474681854248,
"learning_rate": 0.0001524331734064428,
"loss": 1.8032,
"step": 1390
},
{
"epoch": 0.48,
"grad_norm": 0.9486579298973083,
"learning_rate": 0.0001520904729266621,
"loss": 1.6388,
"step": 1400
},
{
"epoch": 0.48,
"grad_norm": 0.8411978483200073,
"learning_rate": 0.00015174777244688142,
"loss": 1.7671,
"step": 1410
},
{
"epoch": 0.49,
"grad_norm": 0.9575003385543823,
"learning_rate": 0.00015140507196710075,
"loss": 1.6523,
"step": 1420
},
{
"epoch": 0.49,
"grad_norm": 0.7651090025901794,
"learning_rate": 0.0001510623714873201,
"loss": 1.812,
"step": 1430
},
{
"epoch": 0.49,
"grad_norm": 0.8477165699005127,
"learning_rate": 0.0001507196710075394,
"loss": 1.7125,
"step": 1440
},
{
"epoch": 0.5,
"grad_norm": 0.9737070202827454,
"learning_rate": 0.00015037697052775874,
"loss": 1.7506,
"step": 1450
},
{
"epoch": 0.5,
"grad_norm": 1.0645496845245361,
"learning_rate": 0.00015003427004797807,
"loss": 1.7335,
"step": 1460
},
{
"epoch": 0.5,
"grad_norm": 0.9303259253501892,
"learning_rate": 0.0001496915695681974,
"loss": 1.8838,
"step": 1470
},
{
"epoch": 0.51,
"grad_norm": 0.6571500897407532,
"learning_rate": 0.00014934886908841673,
"loss": 1.8093,
"step": 1480
},
{
"epoch": 0.51,
"grad_norm": 0.7994106411933899,
"learning_rate": 0.00014900616860863606,
"loss": 1.6691,
"step": 1490
},
{
"epoch": 0.51,
"grad_norm": 0.8453437685966492,
"learning_rate": 0.0001486634681288554,
"loss": 1.6731,
"step": 1500
},
{
"epoch": 0.51,
"eval_loss": 1.8940061330795288,
"eval_runtime": 33.2126,
"eval_samples_per_second": 30.109,
"eval_steps_per_second": 3.764,
"step": 1500
},
{
"epoch": 0.52,
"grad_norm": 1.0370814800262451,
"learning_rate": 0.00014832076764907472,
"loss": 1.7869,
"step": 1510
},
{
"epoch": 0.52,
"grad_norm": 1.0886887311935425,
"learning_rate": 0.00014797806716929405,
"loss": 1.7887,
"step": 1520
},
{
"epoch": 0.52,
"grad_norm": 0.9058669209480286,
"learning_rate": 0.00014763536668951335,
"loss": 1.6781,
"step": 1530
},
{
"epoch": 0.53,
"grad_norm": 0.46401920914649963,
"learning_rate": 0.0001472926662097327,
"loss": 1.6465,
"step": 1540
},
{
"epoch": 0.53,
"grad_norm": 0.6265978813171387,
"learning_rate": 0.00014694996572995204,
"loss": 1.8399,
"step": 1550
},
{
"epoch": 0.53,
"grad_norm": 0.7882290482521057,
"learning_rate": 0.00014660726525017137,
"loss": 1.7707,
"step": 1560
},
{
"epoch": 0.54,
"grad_norm": 0.7576068043708801,
"learning_rate": 0.00014626456477039067,
"loss": 1.8781,
"step": 1570
},
{
"epoch": 0.54,
"grad_norm": 0.8988894820213318,
"learning_rate": 0.00014592186429061,
"loss": 1.7109,
"step": 1580
},
{
"epoch": 0.54,
"grad_norm": 0.7934654951095581,
"learning_rate": 0.00014557916381082933,
"loss": 1.8261,
"step": 1590
},
{
"epoch": 0.55,
"grad_norm": 0.9526162147521973,
"learning_rate": 0.0001452364633310487,
"loss": 1.7286,
"step": 1600
},
{
"epoch": 0.55,
"grad_norm": 0.8650903701782227,
"learning_rate": 0.000144893762851268,
"loss": 1.8075,
"step": 1610
},
{
"epoch": 0.55,
"grad_norm": 0.8737215399742126,
"learning_rate": 0.00014455106237148732,
"loss": 1.7683,
"step": 1620
},
{
"epoch": 0.56,
"grad_norm": 1.0927869081497192,
"learning_rate": 0.00014420836189170665,
"loss": 1.8238,
"step": 1630
},
{
"epoch": 0.56,
"grad_norm": 0.7490981817245483,
"learning_rate": 0.00014386566141192598,
"loss": 1.7528,
"step": 1640
},
{
"epoch": 0.57,
"grad_norm": 0.6721557974815369,
"learning_rate": 0.0001435229609321453,
"loss": 1.7212,
"step": 1650
},
{
"epoch": 0.57,
"grad_norm": 0.8125373125076294,
"learning_rate": 0.00014318026045236464,
"loss": 1.8369,
"step": 1660
},
{
"epoch": 0.57,
"grad_norm": 0.598507821559906,
"learning_rate": 0.00014283755997258397,
"loss": 1.8455,
"step": 1670
},
{
"epoch": 0.58,
"grad_norm": 1.2567535638809204,
"learning_rate": 0.0001424948594928033,
"loss": 1.7656,
"step": 1680
},
{
"epoch": 0.58,
"grad_norm": 1.5279853343963623,
"learning_rate": 0.00014215215901302263,
"loss": 1.8297,
"step": 1690
},
{
"epoch": 0.58,
"grad_norm": 1.1410638093948364,
"learning_rate": 0.00014180945853324196,
"loss": 1.7489,
"step": 1700
},
{
"epoch": 0.59,
"grad_norm": 0.9007987976074219,
"learning_rate": 0.0001414667580534613,
"loss": 1.7473,
"step": 1710
},
{
"epoch": 0.59,
"grad_norm": 0.5736974477767944,
"learning_rate": 0.00014112405757368062,
"loss": 1.8022,
"step": 1720
},
{
"epoch": 0.59,
"grad_norm": 0.6310347318649292,
"learning_rate": 0.00014078135709389995,
"loss": 1.7676,
"step": 1730
},
{
"epoch": 0.6,
"grad_norm": 0.9788106679916382,
"learning_rate": 0.00014043865661411925,
"loss": 1.7303,
"step": 1740
},
{
"epoch": 0.6,
"grad_norm": 0.6612042784690857,
"learning_rate": 0.00014009595613433858,
"loss": 1.675,
"step": 1750
},
{
"epoch": 0.6,
"grad_norm": 0.8740193247795105,
"learning_rate": 0.00013975325565455794,
"loss": 1.7945,
"step": 1760
},
{
"epoch": 0.61,
"grad_norm": 0.9548364877700806,
"learning_rate": 0.00013941055517477727,
"loss": 1.7485,
"step": 1770
},
{
"epoch": 0.61,
"grad_norm": 0.6676565408706665,
"learning_rate": 0.00013906785469499657,
"loss": 1.7479,
"step": 1780
},
{
"epoch": 0.61,
"grad_norm": 0.6287640333175659,
"learning_rate": 0.0001387251542152159,
"loss": 1.7007,
"step": 1790
},
{
"epoch": 0.62,
"grad_norm": 1.5443295240402222,
"learning_rate": 0.00013838245373543523,
"loss": 1.8916,
"step": 1800
},
{
"epoch": 0.62,
"grad_norm": 0.9970656037330627,
"learning_rate": 0.00013803975325565456,
"loss": 1.6733,
"step": 1810
},
{
"epoch": 0.62,
"grad_norm": 0.9320075511932373,
"learning_rate": 0.00013769705277587391,
"loss": 1.8622,
"step": 1820
},
{
"epoch": 0.63,
"grad_norm": 0.8384440541267395,
"learning_rate": 0.00013735435229609322,
"loss": 1.6825,
"step": 1830
},
{
"epoch": 0.63,
"grad_norm": 1.1807342767715454,
"learning_rate": 0.00013701165181631255,
"loss": 1.6548,
"step": 1840
},
{
"epoch": 0.63,
"grad_norm": 0.7640541195869446,
"learning_rate": 0.00013666895133653188,
"loss": 1.8134,
"step": 1850
},
{
"epoch": 0.64,
"grad_norm": 0.9137887358665466,
"learning_rate": 0.0001363262508567512,
"loss": 1.7685,
"step": 1860
},
{
"epoch": 0.64,
"grad_norm": 0.8986667394638062,
"learning_rate": 0.00013598355037697054,
"loss": 1.7455,
"step": 1870
},
{
"epoch": 0.64,
"grad_norm": 0.96836918592453,
"learning_rate": 0.00013564084989718987,
"loss": 1.8705,
"step": 1880
},
{
"epoch": 0.65,
"grad_norm": 1.381028175354004,
"learning_rate": 0.0001352981494174092,
"loss": 1.7644,
"step": 1890
},
{
"epoch": 0.65,
"grad_norm": 0.617438018321991,
"learning_rate": 0.00013495544893762853,
"loss": 1.6194,
"step": 1900
},
{
"epoch": 0.65,
"grad_norm": 0.8686628937721252,
"learning_rate": 0.00013461274845784783,
"loss": 1.7171,
"step": 1910
},
{
"epoch": 0.66,
"grad_norm": 0.7735409140586853,
"learning_rate": 0.00013427004797806716,
"loss": 1.725,
"step": 1920
},
{
"epoch": 0.66,
"grad_norm": 1.0692516565322876,
"learning_rate": 0.00013392734749828651,
"loss": 1.762,
"step": 1930
},
{
"epoch": 0.66,
"grad_norm": 0.763136625289917,
"learning_rate": 0.00013358464701850584,
"loss": 1.6546,
"step": 1940
},
{
"epoch": 0.67,
"grad_norm": 0.9908429980278015,
"learning_rate": 0.00013324194653872517,
"loss": 1.6499,
"step": 1950
},
{
"epoch": 0.67,
"grad_norm": 0.9493003487586975,
"learning_rate": 0.00013289924605894448,
"loss": 1.5616,
"step": 1960
},
{
"epoch": 0.67,
"grad_norm": 0.8336248993873596,
"learning_rate": 0.0001325565455791638,
"loss": 1.7914,
"step": 1970
},
{
"epoch": 0.68,
"grad_norm": 0.8938840627670288,
"learning_rate": 0.00013221384509938314,
"loss": 1.7274,
"step": 1980
},
{
"epoch": 0.68,
"grad_norm": 1.0243479013442993,
"learning_rate": 0.0001318711446196025,
"loss": 1.6643,
"step": 1990
},
{
"epoch": 0.68,
"grad_norm": 1.0226181745529175,
"learning_rate": 0.0001315284441398218,
"loss": 1.7626,
"step": 2000
},
{
"epoch": 0.68,
"eval_loss": 1.8913378715515137,
"eval_runtime": 33.1473,
"eval_samples_per_second": 30.168,
"eval_steps_per_second": 3.771,
"step": 2000
},
{
"epoch": 0.69,
"grad_norm": 1.1059471368789673,
"learning_rate": 0.00013118574366004112,
"loss": 1.6362,
"step": 2010
},
{
"epoch": 0.69,
"grad_norm": 1.3754314184188843,
"learning_rate": 0.00013084304318026045,
"loss": 1.8308,
"step": 2020
},
{
"epoch": 0.7,
"grad_norm": 1.3899627923965454,
"learning_rate": 0.00013050034270047978,
"loss": 1.6982,
"step": 2030
},
{
"epoch": 0.7,
"grad_norm": 0.8804599046707153,
"learning_rate": 0.00013015764222069911,
"loss": 1.8138,
"step": 2040
},
{
"epoch": 0.7,
"grad_norm": 0.6578095555305481,
"learning_rate": 0.00012981494174091844,
"loss": 1.7211,
"step": 2050
},
{
"epoch": 0.71,
"grad_norm": 1.5725558996200562,
"learning_rate": 0.00012947224126113777,
"loss": 1.8684,
"step": 2060
},
{
"epoch": 0.71,
"grad_norm": 1.097717523574829,
"learning_rate": 0.0001291295407813571,
"loss": 1.7705,
"step": 2070
},
{
"epoch": 0.71,
"grad_norm": 0.7564202547073364,
"learning_rate": 0.00012878684030157643,
"loss": 1.5935,
"step": 2080
},
{
"epoch": 0.72,
"grad_norm": 0.732243537902832,
"learning_rate": 0.00012844413982179576,
"loss": 1.7694,
"step": 2090
},
{
"epoch": 0.72,
"grad_norm": 0.6464608907699585,
"learning_rate": 0.0001281014393420151,
"loss": 1.8418,
"step": 2100
},
{
"epoch": 0.72,
"grad_norm": 0.7090341448783875,
"learning_rate": 0.00012775873886223442,
"loss": 1.8122,
"step": 2110
},
{
"epoch": 0.73,
"grad_norm": 1.1480237245559692,
"learning_rate": 0.00012741603838245375,
"loss": 1.7766,
"step": 2120
},
{
"epoch": 0.73,
"grad_norm": 0.6737000346183777,
"learning_rate": 0.00012707333790267305,
"loss": 1.7876,
"step": 2130
},
{
"epoch": 0.73,
"grad_norm": 0.7794924378395081,
"learning_rate": 0.00012673063742289238,
"loss": 1.8529,
"step": 2140
},
{
"epoch": 0.74,
"grad_norm": 1.3136320114135742,
"learning_rate": 0.00012638793694311174,
"loss": 1.6699,
"step": 2150
},
{
"epoch": 0.74,
"grad_norm": 0.884027361869812,
"learning_rate": 0.00012604523646333107,
"loss": 1.7689,
"step": 2160
},
{
"epoch": 0.74,
"grad_norm": 1.103605031967163,
"learning_rate": 0.00012570253598355037,
"loss": 1.8594,
"step": 2170
},
{
"epoch": 0.75,
"grad_norm": 1.3322539329528809,
"learning_rate": 0.0001253598355037697,
"loss": 1.6765,
"step": 2180
},
{
"epoch": 0.75,
"grad_norm": 0.7840645909309387,
"learning_rate": 0.00012501713502398903,
"loss": 1.65,
"step": 2190
},
{
"epoch": 0.75,
"grad_norm": 0.9259356260299683,
"learning_rate": 0.00012467443454420836,
"loss": 1.7805,
"step": 2200
},
{
"epoch": 0.76,
"grad_norm": 1.3709288835525513,
"learning_rate": 0.0001243317340644277,
"loss": 1.7086,
"step": 2210
},
{
"epoch": 0.76,
"grad_norm": 0.6325123310089111,
"learning_rate": 0.00012398903358464702,
"loss": 1.7124,
"step": 2220
},
{
"epoch": 0.76,
"grad_norm": 0.854541003704071,
"learning_rate": 0.00012364633310486635,
"loss": 1.7089,
"step": 2230
},
{
"epoch": 0.77,
"grad_norm": 0.8861531019210815,
"learning_rate": 0.00012330363262508568,
"loss": 1.8369,
"step": 2240
},
{
"epoch": 0.77,
"grad_norm": 1.269750714302063,
"learning_rate": 0.000122960932145305,
"loss": 1.7598,
"step": 2250
},
{
"epoch": 0.77,
"grad_norm": 0.999598503112793,
"learning_rate": 0.00012261823166552434,
"loss": 1.8376,
"step": 2260
},
{
"epoch": 0.78,
"grad_norm": 0.7654330134391785,
"learning_rate": 0.00012227553118574367,
"loss": 1.7236,
"step": 2270
},
{
"epoch": 0.78,
"grad_norm": 1.11728835105896,
"learning_rate": 0.000121932830705963,
"loss": 1.7375,
"step": 2280
},
{
"epoch": 0.78,
"grad_norm": 0.7219797968864441,
"learning_rate": 0.00012159013022618233,
"loss": 1.7786,
"step": 2290
},
{
"epoch": 0.79,
"grad_norm": 1.0127757787704468,
"learning_rate": 0.00012124742974640165,
"loss": 1.7003,
"step": 2300
},
{
"epoch": 0.79,
"grad_norm": 1.0450137853622437,
"learning_rate": 0.00012090472926662097,
"loss": 1.7425,
"step": 2310
},
{
"epoch": 0.79,
"grad_norm": 0.9303760528564453,
"learning_rate": 0.0001205620287868403,
"loss": 1.632,
"step": 2320
},
{
"epoch": 0.8,
"grad_norm": 0.7303478717803955,
"learning_rate": 0.00012021932830705965,
"loss": 1.6918,
"step": 2330
},
{
"epoch": 0.8,
"grad_norm": 0.6323578953742981,
"learning_rate": 0.00011987662782727895,
"loss": 1.672,
"step": 2340
},
{
"epoch": 0.8,
"grad_norm": 0.715811014175415,
"learning_rate": 0.00011953392734749828,
"loss": 1.7613,
"step": 2350
},
{
"epoch": 0.81,
"grad_norm": 0.7297527194023132,
"learning_rate": 0.00011919122686771762,
"loss": 1.7277,
"step": 2360
},
{
"epoch": 0.81,
"grad_norm": 1.0844471454620361,
"learning_rate": 0.00011884852638793695,
"loss": 1.8143,
"step": 2370
},
{
"epoch": 0.82,
"grad_norm": 0.9260643720626831,
"learning_rate": 0.00011850582590815628,
"loss": 1.7228,
"step": 2380
},
{
"epoch": 0.82,
"grad_norm": 0.9541537761688232,
"learning_rate": 0.0001181631254283756,
"loss": 1.7143,
"step": 2390
},
{
"epoch": 0.82,
"grad_norm": 1.0506033897399902,
"learning_rate": 0.00011782042494859493,
"loss": 1.7659,
"step": 2400
},
{
"epoch": 0.83,
"grad_norm": 0.7201717495918274,
"learning_rate": 0.00011747772446881427,
"loss": 1.7257,
"step": 2410
},
{
"epoch": 0.83,
"grad_norm": 0.8612362742424011,
"learning_rate": 0.0001171350239890336,
"loss": 1.7009,
"step": 2420
},
{
"epoch": 0.83,
"grad_norm": 0.8745547533035278,
"learning_rate": 0.0001167923235092529,
"loss": 1.733,
"step": 2430
},
{
"epoch": 0.84,
"grad_norm": 0.5927043557167053,
"learning_rate": 0.00011644962302947225,
"loss": 1.7724,
"step": 2440
},
{
"epoch": 0.84,
"grad_norm": 0.6471837162971497,
"learning_rate": 0.00011610692254969158,
"loss": 1.7103,
"step": 2450
},
{
"epoch": 0.84,
"grad_norm": 1.1340347528457642,
"learning_rate": 0.0001157642220699109,
"loss": 1.7053,
"step": 2460
},
{
"epoch": 0.85,
"grad_norm": 0.8819349408149719,
"learning_rate": 0.00011542152159013022,
"loss": 1.7552,
"step": 2470
},
{
"epoch": 0.85,
"grad_norm": 0.6587919592857361,
"learning_rate": 0.00011507882111034955,
"loss": 1.6482,
"step": 2480
},
{
"epoch": 0.85,
"grad_norm": 1.0057884454727173,
"learning_rate": 0.00011473612063056888,
"loss": 1.7711,
"step": 2490
},
{
"epoch": 0.86,
"grad_norm": 0.6465263962745667,
"learning_rate": 0.00011439342015078823,
"loss": 1.7565,
"step": 2500
},
{
"epoch": 0.86,
"eval_loss": 1.8792312145233154,
"eval_runtime": 33.1087,
"eval_samples_per_second": 30.204,
"eval_steps_per_second": 3.775,
"step": 2500
},
{
"epoch": 0.86,
"grad_norm": 0.5970360040664673,
"learning_rate": 0.00011405071967100756,
"loss": 1.7179,
"step": 2510
},
{
"epoch": 0.86,
"grad_norm": 1.3015583753585815,
"learning_rate": 0.00011370801919122687,
"loss": 1.7225,
"step": 2520
},
{
"epoch": 0.87,
"grad_norm": 0.9235218167304993,
"learning_rate": 0.0001133653187114462,
"loss": 1.7657,
"step": 2530
},
{
"epoch": 0.87,
"grad_norm": 1.025038480758667,
"learning_rate": 0.00011302261823166553,
"loss": 1.7755,
"step": 2540
},
{
"epoch": 0.87,
"grad_norm": 0.8988834619522095,
"learning_rate": 0.00011267991775188486,
"loss": 1.8187,
"step": 2550
},
{
"epoch": 0.88,
"grad_norm": 0.7810622453689575,
"learning_rate": 0.00011233721727210418,
"loss": 1.6565,
"step": 2560
},
{
"epoch": 0.88,
"grad_norm": 1.6817054748535156,
"learning_rate": 0.0001119945167923235,
"loss": 1.7764,
"step": 2570
},
{
"epoch": 0.88,
"grad_norm": 0.9688411355018616,
"learning_rate": 0.00011165181631254285,
"loss": 1.6599,
"step": 2580
},
{
"epoch": 0.89,
"grad_norm": 0.742932915687561,
"learning_rate": 0.00011130911583276218,
"loss": 1.7552,
"step": 2590
},
{
"epoch": 0.89,
"grad_norm": 0.5261206030845642,
"learning_rate": 0.0001109664153529815,
"loss": 1.6432,
"step": 2600
},
{
"epoch": 0.89,
"grad_norm": 0.8997339606285095,
"learning_rate": 0.00011062371487320082,
"loss": 1.8438,
"step": 2610
},
{
"epoch": 0.9,
"grad_norm": 0.8077126741409302,
"learning_rate": 0.00011028101439342015,
"loss": 1.8144,
"step": 2620
},
{
"epoch": 0.9,
"grad_norm": 0.9872453212738037,
"learning_rate": 0.00010993831391363948,
"loss": 1.7427,
"step": 2630
},
{
"epoch": 0.9,
"grad_norm": 1.1201390027999878,
"learning_rate": 0.00010959561343385883,
"loss": 1.7696,
"step": 2640
},
{
"epoch": 0.91,
"grad_norm": 1.1584488153457642,
"learning_rate": 0.00010925291295407813,
"loss": 1.6236,
"step": 2650
},
{
"epoch": 0.91,
"grad_norm": 0.8254250884056091,
"learning_rate": 0.00010891021247429747,
"loss": 1.6214,
"step": 2660
},
{
"epoch": 0.91,
"grad_norm": 0.9825947284698486,
"learning_rate": 0.0001085675119945168,
"loss": 1.7889,
"step": 2670
},
{
"epoch": 0.92,
"grad_norm": 1.0265246629714966,
"learning_rate": 0.00010822481151473613,
"loss": 1.7283,
"step": 2680
},
{
"epoch": 0.92,
"grad_norm": 0.891777515411377,
"learning_rate": 0.00010788211103495545,
"loss": 1.8176,
"step": 2690
},
{
"epoch": 0.92,
"grad_norm": 0.8920706510543823,
"learning_rate": 0.00010753941055517478,
"loss": 1.7676,
"step": 2700
},
{
"epoch": 0.93,
"grad_norm": 1.072204828262329,
"learning_rate": 0.00010719671007539411,
"loss": 1.5836,
"step": 2710
},
{
"epoch": 0.93,
"grad_norm": 0.9175311923027039,
"learning_rate": 0.00010685400959561345,
"loss": 1.8073,
"step": 2720
},
{
"epoch": 0.94,
"grad_norm": 0.6199253797531128,
"learning_rate": 0.00010651130911583275,
"loss": 1.828,
"step": 2730
},
{
"epoch": 0.94,
"grad_norm": 0.653229296207428,
"learning_rate": 0.0001061686086360521,
"loss": 1.7308,
"step": 2740
},
{
"epoch": 0.94,
"grad_norm": 0.790413498878479,
"learning_rate": 0.00010582590815627143,
"loss": 1.8169,
"step": 2750
},
{
"epoch": 0.95,
"grad_norm": 0.8657679557800293,
"learning_rate": 0.00010548320767649076,
"loss": 1.7453,
"step": 2760
},
{
"epoch": 0.95,
"grad_norm": 0.6758552193641663,
"learning_rate": 0.00010514050719671007,
"loss": 1.7171,
"step": 2770
},
{
"epoch": 0.95,
"grad_norm": 1.0935484170913696,
"learning_rate": 0.0001047978067169294,
"loss": 1.6754,
"step": 2780
},
{
"epoch": 0.96,
"grad_norm": 0.8095535635948181,
"learning_rate": 0.00010445510623714873,
"loss": 1.8387,
"step": 2790
},
{
"epoch": 0.96,
"grad_norm": 0.8804395198822021,
"learning_rate": 0.00010411240575736808,
"loss": 1.7839,
"step": 2800
},
{
"epoch": 0.96,
"grad_norm": 0.945090115070343,
"learning_rate": 0.0001037697052775874,
"loss": 1.7196,
"step": 2810
},
{
"epoch": 0.97,
"grad_norm": 0.6158414483070374,
"learning_rate": 0.00010342700479780672,
"loss": 1.8011,
"step": 2820
},
{
"epoch": 0.97,
"grad_norm": 0.7917384505271912,
"learning_rate": 0.00010308430431802605,
"loss": 1.744,
"step": 2830
},
{
"epoch": 0.97,
"grad_norm": 0.6415919065475464,
"learning_rate": 0.00010274160383824538,
"loss": 1.6379,
"step": 2840
},
{
"epoch": 0.98,
"grad_norm": 0.6077090501785278,
"learning_rate": 0.00010239890335846471,
"loss": 1.657,
"step": 2850
},
{
"epoch": 0.98,
"grad_norm": 1.036901593208313,
"learning_rate": 0.00010205620287868403,
"loss": 1.7059,
"step": 2860
},
{
"epoch": 0.98,
"grad_norm": 0.7633301019668579,
"learning_rate": 0.00010171350239890336,
"loss": 1.8085,
"step": 2870
},
{
"epoch": 0.99,
"grad_norm": 1.04219651222229,
"learning_rate": 0.0001013708019191227,
"loss": 1.6641,
"step": 2880
},
{
"epoch": 0.99,
"grad_norm": 0.9899976849555969,
"learning_rate": 0.00010102810143934203,
"loss": 1.6819,
"step": 2890
},
{
"epoch": 0.99,
"grad_norm": 0.755636990070343,
"learning_rate": 0.00010068540095956133,
"loss": 1.7573,
"step": 2900
},
{
"epoch": 1.0,
"grad_norm": 1.1326630115509033,
"learning_rate": 0.00010034270047978068,
"loss": 1.6942,
"step": 2910
},
{
"epoch": 1.0,
"grad_norm": 0.7579949498176575,
"learning_rate": 0.0001,
"loss": 1.6903,
"step": 2920
},
{
"epoch": 1.0,
"grad_norm": 0.7203909754753113,
"learning_rate": 9.965729952021933e-05,
"loss": 1.5946,
"step": 2930
},
{
"epoch": 1.01,
"grad_norm": 0.8731165528297424,
"learning_rate": 9.931459904043866e-05,
"loss": 1.6224,
"step": 2940
},
{
"epoch": 1.01,
"grad_norm": 0.6287246942520142,
"learning_rate": 9.8971898560658e-05,
"loss": 1.6279,
"step": 2950
},
{
"epoch": 1.01,
"grad_norm": 0.8794381618499756,
"learning_rate": 9.862919808087731e-05,
"loss": 1.7103,
"step": 2960
},
{
"epoch": 1.02,
"grad_norm": 1.1305402517318726,
"learning_rate": 9.828649760109665e-05,
"loss": 1.5876,
"step": 2970
},
{
"epoch": 1.02,
"grad_norm": 1.349693775177002,
"learning_rate": 9.794379712131597e-05,
"loss": 1.549,
"step": 2980
},
{
"epoch": 1.02,
"grad_norm": 1.1124284267425537,
"learning_rate": 9.76010966415353e-05,
"loss": 1.5688,
"step": 2990
},
{
"epoch": 1.03,
"grad_norm": 0.5864982604980469,
"learning_rate": 9.725839616175463e-05,
"loss": 1.6437,
"step": 3000
},
{
"epoch": 1.03,
"eval_loss": 1.8886157274246216,
"eval_runtime": 33.1481,
"eval_samples_per_second": 30.168,
"eval_steps_per_second": 3.771,
"step": 3000
},
{
"epoch": 1.03,
"grad_norm": 0.8807237148284912,
"learning_rate": 9.691569568197396e-05,
"loss": 1.5888,
"step": 3010
},
{
"epoch": 1.03,
"grad_norm": 0.8454139232635498,
"learning_rate": 9.657299520219329e-05,
"loss": 1.5414,
"step": 3020
},
{
"epoch": 1.04,
"grad_norm": 0.9541159272193909,
"learning_rate": 9.623029472241262e-05,
"loss": 1.7525,
"step": 3030
},
{
"epoch": 1.04,
"grad_norm": 1.38509202003479,
"learning_rate": 9.588759424263193e-05,
"loss": 1.5302,
"step": 3040
},
{
"epoch": 1.04,
"grad_norm": 1.242966651916504,
"learning_rate": 9.554489376285128e-05,
"loss": 1.6085,
"step": 3050
},
{
"epoch": 1.05,
"grad_norm": 1.1269468069076538,
"learning_rate": 9.52021932830706e-05,
"loss": 1.603,
"step": 3060
},
{
"epoch": 1.05,
"grad_norm": 1.1521382331848145,
"learning_rate": 9.485949280328992e-05,
"loss": 1.6984,
"step": 3070
},
{
"epoch": 1.05,
"grad_norm": 1.3359086513519287,
"learning_rate": 9.451679232350927e-05,
"loss": 1.4839,
"step": 3080
},
{
"epoch": 1.06,
"grad_norm": 1.057581901550293,
"learning_rate": 9.417409184372858e-05,
"loss": 1.5541,
"step": 3090
},
{
"epoch": 1.06,
"grad_norm": 1.090909719467163,
"learning_rate": 9.383139136394791e-05,
"loss": 1.5811,
"step": 3100
},
{
"epoch": 1.07,
"grad_norm": 1.3244885206222534,
"learning_rate": 9.348869088416724e-05,
"loss": 1.6006,
"step": 3110
},
{
"epoch": 1.07,
"grad_norm": 0.8855965733528137,
"learning_rate": 9.314599040438657e-05,
"loss": 1.5577,
"step": 3120
},
{
"epoch": 1.07,
"grad_norm": 0.9480008482933044,
"learning_rate": 9.28032899246059e-05,
"loss": 1.6064,
"step": 3130
},
{
"epoch": 1.08,
"grad_norm": 1.397888422012329,
"learning_rate": 9.246058944482523e-05,
"loss": 1.5708,
"step": 3140
},
{
"epoch": 1.08,
"grad_norm": 0.8178092241287231,
"learning_rate": 9.211788896504455e-05,
"loss": 1.4722,
"step": 3150
},
{
"epoch": 1.08,
"grad_norm": 1.3776417970657349,
"learning_rate": 9.177518848526389e-05,
"loss": 1.6941,
"step": 3160
},
{
"epoch": 1.09,
"grad_norm": 1.3224530220031738,
"learning_rate": 9.14324880054832e-05,
"loss": 1.5414,
"step": 3170
},
{
"epoch": 1.09,
"grad_norm": 1.3367009162902832,
"learning_rate": 9.108978752570254e-05,
"loss": 1.6275,
"step": 3180
},
{
"epoch": 1.09,
"grad_norm": 1.0063951015472412,
"learning_rate": 9.074708704592187e-05,
"loss": 1.6761,
"step": 3190
},
{
"epoch": 1.1,
"grad_norm": 1.320760726928711,
"learning_rate": 9.04043865661412e-05,
"loss": 1.5896,
"step": 3200
},
{
"epoch": 1.1,
"grad_norm": 1.0159672498703003,
"learning_rate": 9.006168608636053e-05,
"loss": 1.5728,
"step": 3210
},
{
"epoch": 1.1,
"grad_norm": 1.095314383506775,
"learning_rate": 8.971898560657985e-05,
"loss": 1.5329,
"step": 3220
},
{
"epoch": 1.11,
"grad_norm": 1.212713360786438,
"learning_rate": 8.937628512679918e-05,
"loss": 1.5746,
"step": 3230
},
{
"epoch": 1.11,
"grad_norm": 0.8203460574150085,
"learning_rate": 8.903358464701851e-05,
"loss": 1.6119,
"step": 3240
},
{
"epoch": 1.11,
"grad_norm": 0.9643343091011047,
"learning_rate": 8.869088416723784e-05,
"loss": 1.5893,
"step": 3250
},
{
"epoch": 1.12,
"grad_norm": 1.2415894269943237,
"learning_rate": 8.834818368745716e-05,
"loss": 1.6291,
"step": 3260
},
{
"epoch": 1.12,
"grad_norm": 1.826658844947815,
"learning_rate": 8.80054832076765e-05,
"loss": 1.6394,
"step": 3270
},
{
"epoch": 1.12,
"grad_norm": 1.3455665111541748,
"learning_rate": 8.766278272789582e-05,
"loss": 1.568,
"step": 3280
},
{
"epoch": 1.13,
"grad_norm": 1.8909701108932495,
"learning_rate": 8.732008224811515e-05,
"loss": 1.5733,
"step": 3290
},
{
"epoch": 1.13,
"grad_norm": 1.4277849197387695,
"learning_rate": 8.697738176833448e-05,
"loss": 1.6339,
"step": 3300
},
{
"epoch": 1.13,
"grad_norm": 0.9563093185424805,
"learning_rate": 8.663468128855381e-05,
"loss": 1.5775,
"step": 3310
},
{
"epoch": 1.14,
"grad_norm": 0.8461637496948242,
"learning_rate": 8.629198080877314e-05,
"loss": 1.653,
"step": 3320
},
{
"epoch": 1.14,
"grad_norm": 1.0858458280563354,
"learning_rate": 8.594928032899247e-05,
"loss": 1.4778,
"step": 3330
},
{
"epoch": 1.14,
"grad_norm": 1.1627178192138672,
"learning_rate": 8.560657984921178e-05,
"loss": 1.5374,
"step": 3340
},
{
"epoch": 1.15,
"grad_norm": 1.196664571762085,
"learning_rate": 8.526387936943113e-05,
"loss": 1.6483,
"step": 3350
},
{
"epoch": 1.15,
"grad_norm": 1.1990993022918701,
"learning_rate": 8.492117888965046e-05,
"loss": 1.5993,
"step": 3360
},
{
"epoch": 1.15,
"grad_norm": 1.0623687505722046,
"learning_rate": 8.457847840986977e-05,
"loss": 1.5743,
"step": 3370
},
{
"epoch": 1.16,
"grad_norm": 1.1684637069702148,
"learning_rate": 8.423577793008912e-05,
"loss": 1.5546,
"step": 3380
},
{
"epoch": 1.16,
"grad_norm": 1.2448011636734009,
"learning_rate": 8.389307745030843e-05,
"loss": 1.496,
"step": 3390
},
{
"epoch": 1.16,
"grad_norm": 0.9411953091621399,
"learning_rate": 8.355037697052776e-05,
"loss": 1.5966,
"step": 3400
},
{
"epoch": 1.17,
"grad_norm": 1.0667563676834106,
"learning_rate": 8.320767649074709e-05,
"loss": 1.5128,
"step": 3410
},
{
"epoch": 1.17,
"grad_norm": 1.50753653049469,
"learning_rate": 8.286497601096642e-05,
"loss": 1.5772,
"step": 3420
},
{
"epoch": 1.17,
"grad_norm": 0.9346134662628174,
"learning_rate": 8.252227553118574e-05,
"loss": 1.6321,
"step": 3430
},
{
"epoch": 1.18,
"grad_norm": 1.304190754890442,
"learning_rate": 8.217957505140508e-05,
"loss": 1.5656,
"step": 3440
},
{
"epoch": 1.18,
"grad_norm": 1.058018684387207,
"learning_rate": 8.18368745716244e-05,
"loss": 1.5413,
"step": 3450
},
{
"epoch": 1.19,
"grad_norm": 1.15809166431427,
"learning_rate": 8.149417409184373e-05,
"loss": 1.5673,
"step": 3460
},
{
"epoch": 1.19,
"grad_norm": 1.092393159866333,
"learning_rate": 8.115147361206306e-05,
"loss": 1.5962,
"step": 3470
},
{
"epoch": 1.19,
"grad_norm": 0.9390305876731873,
"learning_rate": 8.080877313228239e-05,
"loss": 1.5565,
"step": 3480
},
{
"epoch": 1.2,
"grad_norm": 1.002120852470398,
"learning_rate": 8.046607265250173e-05,
"loss": 1.5803,
"step": 3490
},
{
"epoch": 1.2,
"grad_norm": 1.0857172012329102,
"learning_rate": 8.012337217272105e-05,
"loss": 1.6345,
"step": 3500
},
{
"epoch": 1.2,
"eval_loss": 1.8998303413391113,
"eval_runtime": 33.1629,
"eval_samples_per_second": 30.154,
"eval_steps_per_second": 3.769,
"step": 3500
},
{
"epoch": 1.2,
"grad_norm": 0.9931670427322388,
"learning_rate": 7.978067169294038e-05,
"loss": 1.605,
"step": 3510
},
{
"epoch": 1.21,
"grad_norm": 1.3759890794754028,
"learning_rate": 7.94379712131597e-05,
"loss": 1.5059,
"step": 3520
},
{
"epoch": 1.21,
"grad_norm": 1.2301968336105347,
"learning_rate": 7.909527073337903e-05,
"loss": 1.582,
"step": 3530
},
{
"epoch": 1.21,
"grad_norm": 1.1518924236297607,
"learning_rate": 7.875257025359835e-05,
"loss": 1.5839,
"step": 3540
},
{
"epoch": 1.22,
"grad_norm": 0.9161165952682495,
"learning_rate": 7.84098697738177e-05,
"loss": 1.5494,
"step": 3550
},
{
"epoch": 1.22,
"grad_norm": 1.250705599784851,
"learning_rate": 7.806716929403701e-05,
"loss": 1.5178,
"step": 3560
},
{
"epoch": 1.22,
"grad_norm": 0.7702249884605408,
"learning_rate": 7.772446881425634e-05,
"loss": 1.644,
"step": 3570
},
{
"epoch": 1.23,
"grad_norm": 1.4425973892211914,
"learning_rate": 7.738176833447567e-05,
"loss": 1.5009,
"step": 3580
},
{
"epoch": 1.23,
"grad_norm": 1.2036337852478027,
"learning_rate": 7.7039067854695e-05,
"loss": 1.5456,
"step": 3590
},
{
"epoch": 1.23,
"grad_norm": 1.4006402492523193,
"learning_rate": 7.669636737491433e-05,
"loss": 1.5511,
"step": 3600
},
{
"epoch": 1.24,
"grad_norm": 1.1983481645584106,
"learning_rate": 7.635366689513366e-05,
"loss": 1.5645,
"step": 3610
},
{
"epoch": 1.24,
"grad_norm": 1.2755049467086792,
"learning_rate": 7.601096641535297e-05,
"loss": 1.6512,
"step": 3620
},
{
"epoch": 1.24,
"grad_norm": 1.3783161640167236,
"learning_rate": 7.566826593557232e-05,
"loss": 1.6747,
"step": 3630
},
{
"epoch": 1.25,
"grad_norm": 1.1947081089019775,
"learning_rate": 7.532556545579165e-05,
"loss": 1.6605,
"step": 3640
},
{
"epoch": 1.25,
"grad_norm": 1.2230151891708374,
"learning_rate": 7.498286497601096e-05,
"loss": 1.6187,
"step": 3650
},
{
"epoch": 1.25,
"grad_norm": 1.372226595878601,
"learning_rate": 7.464016449623031e-05,
"loss": 1.6354,
"step": 3660
},
{
"epoch": 1.26,
"grad_norm": 1.2375085353851318,
"learning_rate": 7.429746401644962e-05,
"loss": 1.656,
"step": 3670
},
{
"epoch": 1.26,
"grad_norm": 0.9703730940818787,
"learning_rate": 7.395476353666895e-05,
"loss": 1.5571,
"step": 3680
},
{
"epoch": 1.26,
"grad_norm": 1.3475947380065918,
"learning_rate": 7.361206305688828e-05,
"loss": 1.5487,
"step": 3690
},
{
"epoch": 1.27,
"grad_norm": 1.3879302740097046,
"learning_rate": 7.326936257710761e-05,
"loss": 1.6702,
"step": 3700
},
{
"epoch": 1.27,
"grad_norm": 1.4043548107147217,
"learning_rate": 7.292666209732694e-05,
"loss": 1.5555,
"step": 3710
},
{
"epoch": 1.27,
"grad_norm": 1.2937321662902832,
"learning_rate": 7.258396161754627e-05,
"loss": 1.5959,
"step": 3720
},
{
"epoch": 1.28,
"grad_norm": 1.4525338411331177,
"learning_rate": 7.224126113776559e-05,
"loss": 1.6252,
"step": 3730
},
{
"epoch": 1.28,
"grad_norm": 1.1089144945144653,
"learning_rate": 7.189856065798493e-05,
"loss": 1.5027,
"step": 3740
},
{
"epoch": 1.28,
"grad_norm": 1.2625998258590698,
"learning_rate": 7.155586017820425e-05,
"loss": 1.5907,
"step": 3750
},
{
"epoch": 1.29,
"grad_norm": 1.2458665370941162,
"learning_rate": 7.121315969842358e-05,
"loss": 1.54,
"step": 3760
},
{
"epoch": 1.29,
"grad_norm": 1.2830859422683716,
"learning_rate": 7.087045921864292e-05,
"loss": 1.5867,
"step": 3770
},
{
"epoch": 1.29,
"grad_norm": 1.0032719373703003,
"learning_rate": 7.052775873886224e-05,
"loss": 1.5374,
"step": 3780
},
{
"epoch": 1.3,
"grad_norm": 0.9105421304702759,
"learning_rate": 7.018505825908157e-05,
"loss": 1.528,
"step": 3790
},
{
"epoch": 1.3,
"grad_norm": 1.3588030338287354,
"learning_rate": 6.98423577793009e-05,
"loss": 1.6368,
"step": 3800
},
{
"epoch": 1.3,
"grad_norm": 1.4903500080108643,
"learning_rate": 6.949965729952023e-05,
"loss": 1.675,
"step": 3810
},
{
"epoch": 1.31,
"grad_norm": 1.229722261428833,
"learning_rate": 6.915695681973956e-05,
"loss": 1.555,
"step": 3820
},
{
"epoch": 1.31,
"grad_norm": 0.9523776769638062,
"learning_rate": 6.881425633995888e-05,
"loss": 1.6608,
"step": 3830
},
{
"epoch": 1.32,
"grad_norm": 1.986708164215088,
"learning_rate": 6.84715558601782e-05,
"loss": 1.7199,
"step": 3840
},
{
"epoch": 1.32,
"grad_norm": 0.79183429479599,
"learning_rate": 6.812885538039754e-05,
"loss": 1.5034,
"step": 3850
},
{
"epoch": 1.32,
"grad_norm": 1.1760715246200562,
"learning_rate": 6.778615490061686e-05,
"loss": 1.6812,
"step": 3860
},
{
"epoch": 1.33,
"grad_norm": 1.7899055480957031,
"learning_rate": 6.744345442083619e-05,
"loss": 1.7389,
"step": 3870
},
{
"epoch": 1.33,
"grad_norm": 1.2628593444824219,
"learning_rate": 6.710075394105552e-05,
"loss": 1.5317,
"step": 3880
},
{
"epoch": 1.33,
"grad_norm": 1.037351131439209,
"learning_rate": 6.675805346127485e-05,
"loss": 1.5858,
"step": 3890
},
{
"epoch": 1.34,
"grad_norm": 1.2006704807281494,
"learning_rate": 6.641535298149417e-05,
"loss": 1.4587,
"step": 3900
},
{
"epoch": 1.34,
"grad_norm": 1.0877715349197388,
"learning_rate": 6.607265250171351e-05,
"loss": 1.5306,
"step": 3910
},
{
"epoch": 1.34,
"grad_norm": 1.4047476053237915,
"learning_rate": 6.572995202193284e-05,
"loss": 1.5603,
"step": 3920
},
{
"epoch": 1.35,
"grad_norm": 1.2444441318511963,
"learning_rate": 6.538725154215215e-05,
"loss": 1.5809,
"step": 3930
},
{
"epoch": 1.35,
"grad_norm": 1.5738134384155273,
"learning_rate": 6.50445510623715e-05,
"loss": 1.5606,
"step": 3940
},
{
"epoch": 1.35,
"grad_norm": 1.4850690364837646,
"learning_rate": 6.470185058259081e-05,
"loss": 1.4945,
"step": 3950
},
{
"epoch": 1.36,
"grad_norm": 1.3746342658996582,
"learning_rate": 6.435915010281016e-05,
"loss": 1.5152,
"step": 3960
},
{
"epoch": 1.36,
"grad_norm": 1.139249324798584,
"learning_rate": 6.401644962302947e-05,
"loss": 1.6004,
"step": 3970
},
{
"epoch": 1.36,
"grad_norm": 1.3590480089187622,
"learning_rate": 6.36737491432488e-05,
"loss": 1.4926,
"step": 3980
},
{
"epoch": 1.37,
"grad_norm": 1.6366995573043823,
"learning_rate": 6.333104866346813e-05,
"loss": 1.6734,
"step": 3990
},
{
"epoch": 1.37,
"grad_norm": 1.1154892444610596,
"learning_rate": 6.298834818368746e-05,
"loss": 1.5628,
"step": 4000
},
{
"epoch": 1.37,
"eval_loss": 1.9075069427490234,
"eval_runtime": 33.1019,
"eval_samples_per_second": 30.21,
"eval_steps_per_second": 3.776,
"step": 4000
},
{
"epoch": 1.37,
"grad_norm": 1.123923897743225,
"learning_rate": 6.264564770390678e-05,
"loss": 1.6206,
"step": 4010
},
{
"epoch": 1.38,
"grad_norm": 1.3015213012695312,
"learning_rate": 6.230294722412612e-05,
"loss": 1.6292,
"step": 4020
},
{
"epoch": 1.38,
"grad_norm": 1.8867294788360596,
"learning_rate": 6.196024674434544e-05,
"loss": 1.6625,
"step": 4030
},
{
"epoch": 1.38,
"grad_norm": 1.5840169191360474,
"learning_rate": 6.161754626456477e-05,
"loss": 1.6224,
"step": 4040
},
{
"epoch": 1.39,
"grad_norm": 0.9141889810562134,
"learning_rate": 6.12748457847841e-05,
"loss": 1.5051,
"step": 4050
},
{
"epoch": 1.39,
"grad_norm": 1.5261061191558838,
"learning_rate": 6.093214530500343e-05,
"loss": 1.4289,
"step": 4060
},
{
"epoch": 1.39,
"grad_norm": 1.2253016233444214,
"learning_rate": 6.0589444825222764e-05,
"loss": 1.6065,
"step": 4070
},
{
"epoch": 1.4,
"grad_norm": 1.7163646221160889,
"learning_rate": 6.0246744345442087e-05,
"loss": 1.5978,
"step": 4080
},
{
"epoch": 1.4,
"grad_norm": 1.0204969644546509,
"learning_rate": 5.9904043865661416e-05,
"loss": 1.6267,
"step": 4090
},
{
"epoch": 1.4,
"grad_norm": 1.9314994812011719,
"learning_rate": 5.956134338588074e-05,
"loss": 1.6486,
"step": 4100
},
{
"epoch": 1.41,
"grad_norm": 1.1685149669647217,
"learning_rate": 5.9218642906100076e-05,
"loss": 1.6397,
"step": 4110
},
{
"epoch": 1.41,
"grad_norm": 1.422166347503662,
"learning_rate": 5.88759424263194e-05,
"loss": 1.6419,
"step": 4120
},
{
"epoch": 1.41,
"grad_norm": 1.3074285984039307,
"learning_rate": 5.853324194653873e-05,
"loss": 1.565,
"step": 4130
},
{
"epoch": 1.42,
"grad_norm": 0.965584933757782,
"learning_rate": 5.819054146675805e-05,
"loss": 1.5841,
"step": 4140
},
{
"epoch": 1.42,
"grad_norm": 0.9101732969284058,
"learning_rate": 5.784784098697739e-05,
"loss": 1.6144,
"step": 4150
},
{
"epoch": 1.42,
"grad_norm": 1.183640718460083,
"learning_rate": 5.750514050719671e-05,
"loss": 1.5998,
"step": 4160
},
{
"epoch": 1.43,
"grad_norm": 1.1072790622711182,
"learning_rate": 5.716244002741604e-05,
"loss": 1.4634,
"step": 4170
},
{
"epoch": 1.43,
"grad_norm": 1.608017086982727,
"learning_rate": 5.681973954763536e-05,
"loss": 1.5629,
"step": 4180
},
{
"epoch": 1.44,
"grad_norm": 1.4969751834869385,
"learning_rate": 5.64770390678547e-05,
"loss": 1.5966,
"step": 4190
},
{
"epoch": 1.44,
"grad_norm": 1.727695107460022,
"learning_rate": 5.613433858807403e-05,
"loss": 1.5456,
"step": 4200
},
{
"epoch": 1.44,
"grad_norm": 1.4587767124176025,
"learning_rate": 5.579163810829335e-05,
"loss": 1.5238,
"step": 4210
},
{
"epoch": 1.45,
"grad_norm": 1.5338579416275024,
"learning_rate": 5.544893762851269e-05,
"loss": 1.5485,
"step": 4220
},
{
"epoch": 1.45,
"grad_norm": 0.8002244234085083,
"learning_rate": 5.510623714873201e-05,
"loss": 1.634,
"step": 4230
},
{
"epoch": 1.45,
"grad_norm": 1.281417727470398,
"learning_rate": 5.476353666895134e-05,
"loss": 1.589,
"step": 4240
},
{
"epoch": 1.46,
"grad_norm": 0.906808078289032,
"learning_rate": 5.4420836189170664e-05,
"loss": 1.57,
"step": 4250
},
{
"epoch": 1.46,
"grad_norm": 1.799028992652893,
"learning_rate": 5.407813570939e-05,
"loss": 1.623,
"step": 4260
},
{
"epoch": 1.46,
"grad_norm": 1.2560220956802368,
"learning_rate": 5.3735435229609324e-05,
"loss": 1.4231,
"step": 4270
},
{
"epoch": 1.47,
"grad_norm": 1.315132737159729,
"learning_rate": 5.339273474982865e-05,
"loss": 1.553,
"step": 4280
},
{
"epoch": 1.47,
"grad_norm": 1.1687719821929932,
"learning_rate": 5.3050034270047976e-05,
"loss": 1.691,
"step": 4290
},
{
"epoch": 1.47,
"grad_norm": 1.182626724243164,
"learning_rate": 5.270733379026731e-05,
"loss": 1.58,
"step": 4300
},
{
"epoch": 1.48,
"grad_norm": 0.819560170173645,
"learning_rate": 5.2364633310486636e-05,
"loss": 1.574,
"step": 4310
},
{
"epoch": 1.48,
"grad_norm": 1.4093881845474243,
"learning_rate": 5.2021932830705965e-05,
"loss": 1.5805,
"step": 4320
},
{
"epoch": 1.48,
"grad_norm": 2.079927921295166,
"learning_rate": 5.167923235092529e-05,
"loss": 1.6296,
"step": 4330
},
{
"epoch": 1.49,
"grad_norm": 1.1056098937988281,
"learning_rate": 5.1336531871144625e-05,
"loss": 1.4964,
"step": 4340
},
{
"epoch": 1.49,
"grad_norm": 1.924827218055725,
"learning_rate": 5.0993831391363954e-05,
"loss": 1.5223,
"step": 4350
},
{
"epoch": 1.49,
"grad_norm": 1.461719274520874,
"learning_rate": 5.065113091158328e-05,
"loss": 1.5323,
"step": 4360
},
{
"epoch": 1.5,
"grad_norm": 1.6647108793258667,
"learning_rate": 5.0308430431802614e-05,
"loss": 1.6025,
"step": 4370
},
{
"epoch": 1.5,
"grad_norm": 1.33492910861969,
"learning_rate": 4.996572995202194e-05,
"loss": 1.5554,
"step": 4380
},
{
"epoch": 1.5,
"grad_norm": 1.134582757949829,
"learning_rate": 4.962302947224126e-05,
"loss": 1.621,
"step": 4390
},
{
"epoch": 1.51,
"grad_norm": 1.315508246421814,
"learning_rate": 4.928032899246059e-05,
"loss": 1.5828,
"step": 4400
},
{
"epoch": 1.51,
"grad_norm": 1.3290214538574219,
"learning_rate": 4.8937628512679926e-05,
"loss": 1.578,
"step": 4410
},
{
"epoch": 1.51,
"grad_norm": 1.2616337537765503,
"learning_rate": 4.859492803289925e-05,
"loss": 1.6177,
"step": 4420
},
{
"epoch": 1.52,
"grad_norm": 1.4099230766296387,
"learning_rate": 4.825222755311858e-05,
"loss": 1.4926,
"step": 4430
},
{
"epoch": 1.52,
"grad_norm": 0.9720429182052612,
"learning_rate": 4.790952707333791e-05,
"loss": 1.6552,
"step": 4440
},
{
"epoch": 1.52,
"grad_norm": 1.1491189002990723,
"learning_rate": 4.756682659355723e-05,
"loss": 1.7001,
"step": 4450
},
{
"epoch": 1.53,
"grad_norm": 1.1790263652801514,
"learning_rate": 4.722412611377656e-05,
"loss": 1.544,
"step": 4460
},
{
"epoch": 1.53,
"grad_norm": 1.1880890130996704,
"learning_rate": 4.688142563399589e-05,
"loss": 1.6053,
"step": 4470
},
{
"epoch": 1.53,
"grad_norm": 1.0895016193389893,
"learning_rate": 4.653872515421522e-05,
"loss": 1.455,
"step": 4480
},
{
"epoch": 1.54,
"grad_norm": 1.230600118637085,
"learning_rate": 4.619602467443454e-05,
"loss": 1.5752,
"step": 4490
},
{
"epoch": 1.54,
"grad_norm": 1.4027129411697388,
"learning_rate": 4.585332419465387e-05,
"loss": 1.5461,
"step": 4500
},
{
"epoch": 1.54,
"eval_loss": 1.9048413038253784,
"eval_runtime": 33.129,
"eval_samples_per_second": 30.185,
"eval_steps_per_second": 3.773,
"step": 4500
},
{
"epoch": 1.54,
"grad_norm": 0.8590341806411743,
"learning_rate": 4.55106237148732e-05,
"loss": 1.6303,
"step": 4510
},
{
"epoch": 1.55,
"grad_norm": 1.0827935934066772,
"learning_rate": 4.516792323509253e-05,
"loss": 1.5712,
"step": 4520
},
{
"epoch": 1.55,
"grad_norm": 0.8795824646949768,
"learning_rate": 4.4825222755311855e-05,
"loss": 1.4882,
"step": 4530
},
{
"epoch": 1.55,
"grad_norm": 1.509653091430664,
"learning_rate": 4.4482522275531185e-05,
"loss": 1.5534,
"step": 4540
},
{
"epoch": 1.56,
"grad_norm": 1.0400638580322266,
"learning_rate": 4.413982179575052e-05,
"loss": 1.5681,
"step": 4550
},
{
"epoch": 1.56,
"grad_norm": 1.1006004810333252,
"learning_rate": 4.3797121315969844e-05,
"loss": 1.5715,
"step": 4560
},
{
"epoch": 1.57,
"grad_norm": 1.1621884107589722,
"learning_rate": 4.3454420836189174e-05,
"loss": 1.6373,
"step": 4570
},
{
"epoch": 1.57,
"grad_norm": 1.0296626091003418,
"learning_rate": 4.3111720356408503e-05,
"loss": 1.6076,
"step": 4580
},
{
"epoch": 1.57,
"grad_norm": 1.6784312725067139,
"learning_rate": 4.276901987662783e-05,
"loss": 1.6046,
"step": 4590
},
{
"epoch": 1.58,
"grad_norm": 1.0730016231536865,
"learning_rate": 4.2426319396847156e-05,
"loss": 1.6317,
"step": 4600
},
{
"epoch": 1.58,
"grad_norm": 1.0070710182189941,
"learning_rate": 4.2083618917066486e-05,
"loss": 1.5472,
"step": 4610
},
{
"epoch": 1.58,
"grad_norm": 1.143546462059021,
"learning_rate": 4.1740918437285815e-05,
"loss": 1.4993,
"step": 4620
},
{
"epoch": 1.59,
"grad_norm": 1.8565304279327393,
"learning_rate": 4.1398217957505145e-05,
"loss": 1.6021,
"step": 4630
},
{
"epoch": 1.59,
"grad_norm": 1.1914728879928589,
"learning_rate": 4.105551747772447e-05,
"loss": 1.7231,
"step": 4640
},
{
"epoch": 1.59,
"grad_norm": 1.6387224197387695,
"learning_rate": 4.07128169979438e-05,
"loss": 1.5804,
"step": 4650
},
{
"epoch": 1.6,
"grad_norm": 1.65473210811615,
"learning_rate": 4.037011651816313e-05,
"loss": 1.6404,
"step": 4660
},
{
"epoch": 1.6,
"grad_norm": 1.6097077131271362,
"learning_rate": 4.002741603838245e-05,
"loss": 1.4651,
"step": 4670
},
{
"epoch": 1.6,
"grad_norm": 1.4290515184402466,
"learning_rate": 3.968471555860178e-05,
"loss": 1.5668,
"step": 4680
},
{
"epoch": 1.61,
"grad_norm": 1.047481894493103,
"learning_rate": 3.934201507882111e-05,
"loss": 1.5275,
"step": 4690
},
{
"epoch": 1.61,
"grad_norm": 1.3638914823532104,
"learning_rate": 3.8999314599040446e-05,
"loss": 1.6588,
"step": 4700
},
{
"epoch": 1.61,
"grad_norm": 1.7712153196334839,
"learning_rate": 3.865661411925977e-05,
"loss": 1.6079,
"step": 4710
},
{
"epoch": 1.62,
"grad_norm": 1.0898468494415283,
"learning_rate": 3.83139136394791e-05,
"loss": 1.574,
"step": 4720
},
{
"epoch": 1.62,
"grad_norm": 1.4913599491119385,
"learning_rate": 3.797121315969843e-05,
"loss": 1.5376,
"step": 4730
},
{
"epoch": 1.62,
"grad_norm": 1.225707769393921,
"learning_rate": 3.762851267991775e-05,
"loss": 1.5925,
"step": 4740
},
{
"epoch": 1.63,
"grad_norm": 1.5699125528335571,
"learning_rate": 3.728581220013708e-05,
"loss": 1.593,
"step": 4750
},
{
"epoch": 1.63,
"grad_norm": 1.318574070930481,
"learning_rate": 3.694311172035641e-05,
"loss": 1.5525,
"step": 4760
},
{
"epoch": 1.63,
"grad_norm": 1.4544116258621216,
"learning_rate": 3.660041124057574e-05,
"loss": 1.5678,
"step": 4770
},
{
"epoch": 1.64,
"grad_norm": 1.7460687160491943,
"learning_rate": 3.6257710760795063e-05,
"loss": 1.6081,
"step": 4780
},
{
"epoch": 1.64,
"grad_norm": 1.4106998443603516,
"learning_rate": 3.591501028101439e-05,
"loss": 1.5687,
"step": 4790
},
{
"epoch": 1.64,
"grad_norm": 1.0583499670028687,
"learning_rate": 3.557230980123372e-05,
"loss": 1.5467,
"step": 4800
},
{
"epoch": 1.65,
"grad_norm": 1.2292665243148804,
"learning_rate": 3.522960932145305e-05,
"loss": 1.5491,
"step": 4810
},
{
"epoch": 1.65,
"grad_norm": 1.3556251525878906,
"learning_rate": 3.4886908841672375e-05,
"loss": 1.5568,
"step": 4820
},
{
"epoch": 1.65,
"grad_norm": 1.6374377012252808,
"learning_rate": 3.4544208361891705e-05,
"loss": 1.6016,
"step": 4830
},
{
"epoch": 1.66,
"grad_norm": 1.0343750715255737,
"learning_rate": 3.420150788211104e-05,
"loss": 1.4693,
"step": 4840
},
{
"epoch": 1.66,
"grad_norm": 1.378056526184082,
"learning_rate": 3.3858807402330365e-05,
"loss": 1.6081,
"step": 4850
},
{
"epoch": 1.66,
"grad_norm": 1.370970368385315,
"learning_rate": 3.3516106922549694e-05,
"loss": 1.515,
"step": 4860
},
{
"epoch": 1.67,
"grad_norm": 1.3780639171600342,
"learning_rate": 3.3173406442769024e-05,
"loss": 1.5644,
"step": 4870
},
{
"epoch": 1.67,
"grad_norm": 1.0907922983169556,
"learning_rate": 3.2830705962988354e-05,
"loss": 1.5701,
"step": 4880
},
{
"epoch": 1.67,
"grad_norm": 1.4807682037353516,
"learning_rate": 3.2488005483207677e-05,
"loss": 1.5535,
"step": 4890
},
{
"epoch": 1.68,
"grad_norm": 1.7207825183868408,
"learning_rate": 3.2145305003427006e-05,
"loss": 1.6049,
"step": 4900
},
{
"epoch": 1.68,
"grad_norm": 0.8784241676330566,
"learning_rate": 3.1802604523646336e-05,
"loss": 1.5213,
"step": 4910
},
{
"epoch": 1.69,
"grad_norm": 1.6283917427062988,
"learning_rate": 3.1459904043865666e-05,
"loss": 1.4902,
"step": 4920
},
{
"epoch": 1.69,
"grad_norm": 1.0017669200897217,
"learning_rate": 3.111720356408499e-05,
"loss": 1.5147,
"step": 4930
},
{
"epoch": 1.69,
"grad_norm": 1.4256327152252197,
"learning_rate": 3.077450308430432e-05,
"loss": 1.518,
"step": 4940
},
{
"epoch": 1.7,
"grad_norm": 1.4298090934753418,
"learning_rate": 3.0431802604523645e-05,
"loss": 1.5843,
"step": 4950
},
{
"epoch": 1.7,
"grad_norm": 1.0894560813903809,
"learning_rate": 3.0089102124742974e-05,
"loss": 1.5931,
"step": 4960
},
{
"epoch": 1.7,
"grad_norm": 1.8101505041122437,
"learning_rate": 2.97464016449623e-05,
"loss": 1.5629,
"step": 4970
},
{
"epoch": 1.71,
"grad_norm": 0.966204047203064,
"learning_rate": 2.9403701165181634e-05,
"loss": 1.5048,
"step": 4980
},
{
"epoch": 1.71,
"grad_norm": 1.2718944549560547,
"learning_rate": 2.9061000685400963e-05,
"loss": 1.6685,
"step": 4990
},
{
"epoch": 1.71,
"grad_norm": 0.9012284874916077,
"learning_rate": 2.871830020562029e-05,
"loss": 1.5769,
"step": 5000
},
{
"epoch": 1.71,
"eval_loss": 1.9053254127502441,
"eval_runtime": 33.1389,
"eval_samples_per_second": 30.176,
"eval_steps_per_second": 3.772,
"step": 5000
},
{
"epoch": 1.72,
"grad_norm": 1.4876313209533691,
"learning_rate": 2.837559972583962e-05,
"loss": 1.4846,
"step": 5010
},
{
"epoch": 1.72,
"grad_norm": 0.9953039288520813,
"learning_rate": 2.8032899246058946e-05,
"loss": 1.6145,
"step": 5020
},
{
"epoch": 1.72,
"grad_norm": 1.4575115442276,
"learning_rate": 2.7690198766278275e-05,
"loss": 1.5442,
"step": 5030
},
{
"epoch": 1.73,
"grad_norm": 1.3410977125167847,
"learning_rate": 2.73474982864976e-05,
"loss": 1.5617,
"step": 5040
},
{
"epoch": 1.73,
"grad_norm": 1.5489014387130737,
"learning_rate": 2.700479780671693e-05,
"loss": 1.6061,
"step": 5050
},
{
"epoch": 1.73,
"grad_norm": 2.2693567276000977,
"learning_rate": 2.6662097326936258e-05,
"loss": 1.576,
"step": 5060
},
{
"epoch": 1.74,
"grad_norm": 1.776106595993042,
"learning_rate": 2.6319396847155587e-05,
"loss": 1.57,
"step": 5070
},
{
"epoch": 1.74,
"grad_norm": 1.0588148832321167,
"learning_rate": 2.5976696367374914e-05,
"loss": 1.476,
"step": 5080
},
{
"epoch": 1.74,
"grad_norm": 1.133484125137329,
"learning_rate": 2.5633995887594243e-05,
"loss": 1.5103,
"step": 5090
},
{
"epoch": 1.75,
"grad_norm": 1.3961825370788574,
"learning_rate": 2.529129540781357e-05,
"loss": 1.59,
"step": 5100
},
{
"epoch": 1.75,
"grad_norm": 1.7427486181259155,
"learning_rate": 2.49485949280329e-05,
"loss": 1.5608,
"step": 5110
},
{
"epoch": 1.75,
"grad_norm": 1.961029291152954,
"learning_rate": 2.460589444825223e-05,
"loss": 1.6278,
"step": 5120
},
{
"epoch": 1.76,
"grad_norm": 1.2870323657989502,
"learning_rate": 2.4263193968471555e-05,
"loss": 1.5877,
"step": 5130
},
{
"epoch": 1.76,
"grad_norm": 1.204353928565979,
"learning_rate": 2.3920493488690885e-05,
"loss": 1.5,
"step": 5140
},
{
"epoch": 1.76,
"grad_norm": 0.9764713644981384,
"learning_rate": 2.357779300891021e-05,
"loss": 1.6451,
"step": 5150
},
{
"epoch": 1.77,
"grad_norm": 1.2140144109725952,
"learning_rate": 2.3235092529129544e-05,
"loss": 1.4958,
"step": 5160
},
{
"epoch": 1.77,
"grad_norm": 1.9167425632476807,
"learning_rate": 2.289239204934887e-05,
"loss": 1.569,
"step": 5170
},
{
"epoch": 1.77,
"grad_norm": 1.864986538887024,
"learning_rate": 2.25496915695682e-05,
"loss": 1.5232,
"step": 5180
},
{
"epoch": 1.78,
"grad_norm": 1.2421759366989136,
"learning_rate": 2.2206991089787527e-05,
"loss": 1.5894,
"step": 5190
},
{
"epoch": 1.78,
"grad_norm": 1.412864089012146,
"learning_rate": 2.1898560657984922e-05,
"loss": 1.5737,
"step": 5200
},
{
"epoch": 1.78,
"grad_norm": 1.105542778968811,
"learning_rate": 2.1555860178204252e-05,
"loss": 1.5747,
"step": 5210
},
{
"epoch": 1.79,
"grad_norm": 1.7511961460113525,
"learning_rate": 2.1213159698423578e-05,
"loss": 1.5455,
"step": 5220
},
{
"epoch": 1.79,
"grad_norm": 1.4287422895431519,
"learning_rate": 2.0870459218642908e-05,
"loss": 1.4871,
"step": 5230
},
{
"epoch": 1.79,
"grad_norm": 0.835995614528656,
"learning_rate": 2.0527758738862234e-05,
"loss": 1.6128,
"step": 5240
},
{
"epoch": 1.8,
"grad_norm": 1.8323885202407837,
"learning_rate": 2.0185058259081564e-05,
"loss": 1.6333,
"step": 5250
},
{
"epoch": 1.8,
"grad_norm": 1.5953247547149658,
"learning_rate": 1.984235777930089e-05,
"loss": 1.5631,
"step": 5260
},
{
"epoch": 1.8,
"grad_norm": 1.4622983932495117,
"learning_rate": 1.9499657299520223e-05,
"loss": 1.5065,
"step": 5270
},
{
"epoch": 1.81,
"grad_norm": 1.6321667432785034,
"learning_rate": 1.915695681973955e-05,
"loss": 1.59,
"step": 5280
},
{
"epoch": 1.81,
"grad_norm": 1.3693170547485352,
"learning_rate": 1.8814256339958876e-05,
"loss": 1.5847,
"step": 5290
},
{
"epoch": 1.82,
"grad_norm": 1.5187667608261108,
"learning_rate": 1.8471555860178205e-05,
"loss": 1.541,
"step": 5300
},
{
"epoch": 1.82,
"grad_norm": 1.1000255346298218,
"learning_rate": 1.8128855380397532e-05,
"loss": 1.4775,
"step": 5310
},
{
"epoch": 1.82,
"grad_norm": 1.4071645736694336,
"learning_rate": 1.778615490061686e-05,
"loss": 1.6336,
"step": 5320
},
{
"epoch": 1.83,
"grad_norm": 1.5703157186508179,
"learning_rate": 1.7443454420836188e-05,
"loss": 1.6725,
"step": 5330
},
{
"epoch": 1.83,
"grad_norm": 1.0555702447891235,
"learning_rate": 1.710075394105552e-05,
"loss": 1.4712,
"step": 5340
},
{
"epoch": 1.83,
"grad_norm": 1.4873102903366089,
"learning_rate": 1.6758053461274847e-05,
"loss": 1.5741,
"step": 5350
},
{
"epoch": 1.84,
"grad_norm": 1.1715468168258667,
"learning_rate": 1.6415352981494177e-05,
"loss": 1.4884,
"step": 5360
},
{
"epoch": 1.84,
"grad_norm": 1.82741379737854,
"learning_rate": 1.6072652501713503e-05,
"loss": 1.5778,
"step": 5370
},
{
"epoch": 1.84,
"grad_norm": 1.6479945182800293,
"learning_rate": 1.5729952021932833e-05,
"loss": 1.6802,
"step": 5380
},
{
"epoch": 1.85,
"grad_norm": 1.0871607065200806,
"learning_rate": 1.538725154215216e-05,
"loss": 1.5509,
"step": 5390
},
{
"epoch": 1.85,
"grad_norm": 1.7326961755752563,
"learning_rate": 1.5044551062371487e-05,
"loss": 1.5746,
"step": 5400
},
{
"epoch": 1.85,
"grad_norm": 1.3573588132858276,
"learning_rate": 1.4701850582590817e-05,
"loss": 1.6147,
"step": 5410
},
{
"epoch": 1.86,
"grad_norm": 1.807897925376892,
"learning_rate": 1.4359150102810145e-05,
"loss": 1.6446,
"step": 5420
},
{
"epoch": 1.86,
"grad_norm": 1.0243467092514038,
"learning_rate": 1.4016449623029473e-05,
"loss": 1.5844,
"step": 5430
},
{
"epoch": 1.86,
"grad_norm": 1.709069013595581,
"learning_rate": 1.36737491432488e-05,
"loss": 1.5774,
"step": 5440
},
{
"epoch": 1.87,
"grad_norm": 1.717564582824707,
"learning_rate": 1.3331048663468129e-05,
"loss": 1.5898,
"step": 5450
},
{
"epoch": 1.87,
"grad_norm": 1.1066781282424927,
"learning_rate": 1.2988348183687457e-05,
"loss": 1.5828,
"step": 5460
},
{
"epoch": 1.87,
"grad_norm": 1.119360089302063,
"learning_rate": 1.2645647703906785e-05,
"loss": 1.5321,
"step": 5470
},
{
"epoch": 1.88,
"grad_norm": 1.0519651174545288,
"learning_rate": 1.2302947224126114e-05,
"loss": 1.5691,
"step": 5480
},
{
"epoch": 1.88,
"grad_norm": 1.7377208471298218,
"learning_rate": 1.1960246744345442e-05,
"loss": 1.5568,
"step": 5490
},
{
"epoch": 1.88,
"grad_norm": 1.4080170392990112,
"learning_rate": 1.1617546264564772e-05,
"loss": 1.6109,
"step": 5500
},
{
"epoch": 1.88,
"eval_loss": 1.9016900062561035,
"eval_runtime": 33.1979,
"eval_samples_per_second": 30.122,
"eval_steps_per_second": 3.765,
"step": 5500
}
],
"logging_steps": 10,
"max_steps": 5838,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 500,
"total_flos": 2.980420245786624e+16,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}