{
"best_global_step": 750,
"best_metric": 0.48672306537628174,
"best_model_checkpoint": "./qwen2.5-vl-finetune-checkpoints/checkpoint-750",
"epoch": 6.998518518518519,
"eval_steps": 50,
"global_step": 2364,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.022222222222222223,
"grad_norm": 5.334873676300049,
"learning_rate": 2e-05,
"loss": 2.417,
"step": 10
},
{
"epoch": 0.044444444444444446,
"grad_norm": 6.264868259429932,
"learning_rate": 4.222222222222222e-05,
"loss": 1.7873,
"step": 20
},
{
"epoch": 0.06666666666666667,
"grad_norm": 9.44924259185791,
"learning_rate": 6.444444444444446e-05,
"loss": 1.2221,
"step": 30
},
{
"epoch": 0.08888888888888889,
"grad_norm": 24.675010681152344,
"learning_rate": 8.666666666666667e-05,
"loss": 0.8489,
"step": 40
},
{
"epoch": 0.1111111111111111,
"grad_norm": 7.224123954772949,
"learning_rate": 9.997593339404756e-05,
"loss": 0.9777,
"step": 50
},
{
"epoch": 0.1111111111111111,
"eval_loss": 1.0407381057739258,
"eval_runtime": 1366.0737,
"eval_samples_per_second": 0.293,
"eval_steps_per_second": 0.073,
"step": 50
},
{
"epoch": 0.13333333333333333,
"grad_norm": 3.4162638187408447,
"learning_rate": 9.970545007734807e-05,
"loss": 0.867,
"step": 60
},
{
"epoch": 0.15555555555555556,
"grad_norm": 8.475242614746094,
"learning_rate": 9.913603233532067e-05,
"loss": 0.8114,
"step": 70
},
{
"epoch": 0.17777777777777778,
"grad_norm": 4.519033908843994,
"learning_rate": 9.82711047132661e-05,
"loss": 1.0957,
"step": 80
},
{
"epoch": 0.2,
"grad_norm": 9.79876708984375,
"learning_rate": 9.711586898767462e-05,
"loss": 0.9682,
"step": 90
},
{
"epoch": 0.2222222222222222,
"grad_norm": 4.599425315856934,
"learning_rate": 9.567727288213005e-05,
"loss": 0.8787,
"step": 100
},
{
"epoch": 0.2222222222222222,
"eval_loss": 0.8106439113616943,
"eval_runtime": 1315.3239,
"eval_samples_per_second": 0.304,
"eval_steps_per_second": 0.076,
"step": 100
},
{
"epoch": 0.24444444444444444,
"grad_norm": 8.550354957580566,
"learning_rate": 9.396396828288272e-05,
"loss": 0.7664,
"step": 110
},
{
"epoch": 0.26666666666666666,
"grad_norm": 16.60089874267578,
"learning_rate": 9.19862592053875e-05,
"loss": 0.8729,
"step": 120
},
{
"epoch": 0.28888888888888886,
"grad_norm": 7.3388495445251465,
"learning_rate": 8.97560398247424e-05,
"loss": 0.6473,
"step": 130
},
{
"epoch": 0.3111111111111111,
"grad_norm": 4.2113847732543945,
"learning_rate": 8.728672294272008e-05,
"loss": 0.7883,
"step": 140
},
{
"epoch": 0.3333333333333333,
"grad_norm": 9.767938613891602,
"learning_rate": 8.459315932159979e-05,
"loss": 0.9219,
"step": 150
},
{
"epoch": 0.3333333333333333,
"eval_loss": 0.7609114050865173,
"eval_runtime": 1300.5726,
"eval_samples_per_second": 0.308,
"eval_steps_per_second": 0.077,
"step": 150
},
{
"epoch": 0.35555555555555557,
"grad_norm": 4.818809986114502,
"learning_rate": 8.169154836993551e-05,
"loss": 0.6935,
"step": 160
},
{
"epoch": 0.37777777777777777,
"grad_norm": 9.263418197631836,
"learning_rate": 7.859934071740692e-05,
"loss": 0.6154,
"step": 170
},
{
"epoch": 0.4,
"grad_norm": 5.390202522277832,
"learning_rate": 7.533513326467911e-05,
"loss": 0.7336,
"step": 180
},
{
"epoch": 0.4222222222222222,
"grad_norm": 6.1149373054504395,
"learning_rate": 7.191855733945387e-05,
"loss": 0.6445,
"step": 190
},
{
"epoch": 0.4444444444444444,
"grad_norm": 28.822879791259766,
"learning_rate": 6.837016063135491e-05,
"loss": 0.6949,
"step": 200
},
{
"epoch": 0.4444444444444444,
"eval_loss": 0.7009051442146301,
"eval_runtime": 1304.5098,
"eval_samples_per_second": 0.307,
"eval_steps_per_second": 0.077,
"step": 200
},
{
"epoch": 0.4666666666666667,
"grad_norm": 4.288070201873779,
"learning_rate": 6.471128361570476e-05,
"loss": 0.6743,
"step": 210
},
{
"epoch": 0.4888888888888889,
"grad_norm": 2.8349318504333496,
"learning_rate": 6.096393120939516e-05,
"loss": 0.7858,
"step": 220
},
{
"epoch": 0.5111111111111111,
"grad_norm": 4.94554328918457,
"learning_rate": 5.715064043072771e-05,
"loss": 0.5722,
"step": 230
},
{
"epoch": 0.5333333333333333,
"grad_norm": 4.504173278808594,
"learning_rate": 5.329434485913393e-05,
"loss": 0.8959,
"step": 240
},
{
"epoch": 0.5555555555555556,
"grad_norm": 6.013023376464844,
"learning_rate": 4.941823670993016e-05,
"loss": 0.7088,
"step": 250
},
{
"epoch": 0.5555555555555556,
"eval_loss": 0.6455658674240112,
"eval_runtime": 1299.4246,
"eval_samples_per_second": 0.308,
"eval_steps_per_second": 0.077,
"step": 250
},
{
"epoch": 0.5777777777777777,
"grad_norm": 4.269768238067627,
"learning_rate": 4.55456273536057e-05,
"loss": 0.7,
"step": 260
},
{
"epoch": 0.6,
"grad_norm": 3.1131203174591064,
"learning_rate": 4.169980711849781e-05,
"loss": 0.5789,
"step": 270
},
{
"epoch": 0.6222222222222222,
"grad_norm": 3.8976268768310547,
"learning_rate": 3.790390522001662e-05,
"loss": 0.5789,
"step": 280
},
{
"epoch": 0.6444444444444445,
"grad_norm": 3.225245714187622,
"learning_rate": 3.418075065882217e-05,
"loss": 0.5838,
"step": 290
},
{
"epoch": 0.6666666666666666,
"grad_norm": 4.711687088012695,
"learning_rate": 3.0552734924528306e-05,
"loss": 0.6903,
"step": 300
},
{
"epoch": 0.6666666666666666,
"eval_loss": 0.5962206125259399,
"eval_runtime": 1288.7617,
"eval_samples_per_second": 0.31,
"eval_steps_per_second": 0.078,
"step": 300
},
{
"epoch": 0.6888888888888889,
"grad_norm": 4.992547512054443,
"learning_rate": 2.7041677330649407e-05,
"loss": 0.6936,
"step": 310
},
{
"epoch": 0.7111111111111111,
"grad_norm": 9.838484764099121,
"learning_rate": 2.3668693790681634e-05,
"loss": 0.7052,
"step": 320
},
{
"epoch": 0.7333333333333333,
"grad_norm": 8.834742546081543,
"learning_rate": 2.0454069824514444e-05,
"loss": 0.7329,
"step": 330
},
{
"epoch": 0.7555555555555555,
"grad_norm": 5.83026123046875,
"learning_rate": 1.7417138558927244e-05,
"loss": 0.5383,
"step": 340
},
{
"epoch": 0.7777777777777778,
"grad_norm": 7.011873722076416,
"learning_rate": 1.4576164455890013e-05,
"loss": 0.5669,
"step": 350
},
{
"epoch": 0.7777777777777778,
"eval_loss": 0.5696190595626831,
"eval_runtime": 1284.3616,
"eval_samples_per_second": 0.311,
"eval_steps_per_second": 0.078,
"step": 350
},
{
"epoch": 0.8,
"grad_norm": 4.988184452056885,
"learning_rate": 1.194823346793998e-05,
"loss": 0.523,
"step": 360
},
{
"epoch": 0.8222222222222222,
"grad_norm": 6.482738971710205,
"learning_rate": 9.549150281252633e-06,
"loss": 0.6132,
"step": 370
},
{
"epoch": 0.8444444444444444,
"grad_norm": 7.583404541015625,
"learning_rate": 7.393343264399438e-06,
"loss": 0.5602,
"step": 380
},
{
"epoch": 0.8666666666666667,
"grad_norm": 3.9021246433258057,
"learning_rate": 5.493777694441521e-06,
"loss": 0.7415,
"step": 390
},
{
"epoch": 0.8888888888888888,
"grad_norm": 4.355510711669922,
"learning_rate": 3.861877782227885e-06,
"loss": 0.6577,
"step": 400
},
{
"epoch": 0.8888888888888888,
"eval_loss": 0.5607297420501709,
"eval_runtime": 1282.6163,
"eval_samples_per_second": 0.312,
"eval_steps_per_second": 0.078,
"step": 400
},
{
"epoch": 0.9111111111111111,
"grad_norm": 6.445341110229492,
"learning_rate": 2.5074579658471266e-06,
"loss": 0.7159,
"step": 410
},
{
"epoch": 0.9333333333333333,
"grad_norm": 4.908728122711182,
"learning_rate": 1.438663885441982e-06,
"loss": 0.5581,
"step": 420
},
{
"epoch": 0.9555555555555556,
"grad_norm": 3.629876136779785,
"learning_rate": 6.61923394371039e-07,
"loss": 0.5394,
"step": 430
},
{
"epoch": 0.9777777777777777,
"grad_norm": 4.596356391906738,
"learning_rate": 1.819079013423153e-07,
"loss": 0.5405,
"step": 440
},
{
"epoch": 1.0,
"grad_norm": 3.878979206085205,
"learning_rate": 1.5042760116212861e-09,
"loss": 0.4788,
"step": 450
},
{
"epoch": 1.0,
"eval_loss": 0.5549002289772034,
"eval_runtime": 1306.3502,
"eval_samples_per_second": 0.306,
"eval_steps_per_second": 0.077,
"step": 450
},
{
"epoch": 1.0,
"step": 450,
"total_flos": 2.2909672762220544e+16,
"train_loss": 0.7819234402974446,
"train_runtime": 36382.2657,
"train_samples_per_second": 0.099,
"train_steps_per_second": 0.012
},
{
"epoch": 1.0,
"step": 450,
"total_flos": 2.2909672762220544e+16,
"train_loss": 0.0,
"train_runtime": 0.2352,
"train_samples_per_second": 11479.607,
"train_steps_per_second": 1432.825
},
{
"epoch": 1.0,
"step": 450,
"total_flos": 2.2909672762220544e+16,
"train_loss": 0.0,
"train_runtime": 0.2432,
"train_samples_per_second": 11101.818,
"train_steps_per_second": 1385.671
},
{
"epoch": 1.3644444444444446,
"grad_norm": 6.853477954864502,
"learning_rate": 4.4362464041245384e-05,
"loss": 0.4727,
"step": 460
},
{
"epoch": 1.394074074074074,
"grad_norm": 5.060462951660156,
"learning_rate": 4.21673357748979e-05,
"loss": 0.7008,
"step": 470
},
{
"epoch": 1.4237037037037037,
"grad_norm": 3.702819585800171,
"learning_rate": 3.998758359194028e-05,
"loss": 0.4912,
"step": 480
},
{
"epoch": 1.4533333333333334,
"grad_norm": 37.993465423583984,
"learning_rate": 3.7827486502728574e-05,
"loss": 0.6657,
"step": 490
},
{
"epoch": 1.482962962962963,
"grad_norm": 11.906915664672852,
"learning_rate": 3.5691284933255654e-05,
"loss": 0.4878,
"step": 500
},
{
"epoch": 1.482962962962963,
"eval_loss": 0.5222220420837402,
"eval_runtime": 931.5337,
"eval_samples_per_second": 0.322,
"eval_steps_per_second": 0.081,
"step": 500
},
{
"epoch": 1.5125925925925925,
"grad_norm": 5.83925724029541,
"learning_rate": 3.358317240089008e-05,
"loss": 0.4312,
"step": 510
},
{
"epoch": 1.5422222222222222,
"grad_norm": 3.2574503421783447,
"learning_rate": 3.150728728219966e-05,
"loss": 0.5544,
"step": 520
},
{
"epoch": 1.5718518518518518,
"grad_norm": 6.785427093505859,
"learning_rate": 2.946770468902064e-05,
"loss": 0.5278,
"step": 530
},
{
"epoch": 1.6014814814814815,
"grad_norm": 48.51929473876953,
"learning_rate": 2.7468428468719877e-05,
"loss": 0.6246,
"step": 540
},
{
"epoch": 1.6311111111111112,
"grad_norm": 2.2513532638549805,
"learning_rate": 2.5513383344354467e-05,
"loss": 0.5973,
"step": 550
},
{
"epoch": 1.6311111111111112,
"eval_loss": 0.5203197002410889,
"eval_runtime": 910.2398,
"eval_samples_per_second": 0.33,
"eval_steps_per_second": 0.082,
"step": 550
},
{
"epoch": 1.6607407407407409,
"grad_norm": 5.722535133361816,
"learning_rate": 2.3606407210158006e-05,
"loss": 0.6404,
"step": 560
},
{
"epoch": 1.6903703703703705,
"grad_norm": 6.939681053161621,
"learning_rate": 2.175124359747806e-05,
"loss": 0.5434,
"step": 570
},
{
"epoch": 1.72,
"grad_norm": 6.644214630126953,
"learning_rate": 1.9951534325954914e-05,
"loss": 0.6884,
"step": 580
},
{
"epoch": 1.7496296296296296,
"grad_norm": 4.961182117462158,
"learning_rate": 1.82108123543675e-05,
"loss": 0.459,
"step": 590
},
{
"epoch": 1.779259259259259,
"grad_norm": 6.396653652191162,
"learning_rate": 1.6532494845181155e-05,
"loss": 0.5712,
"step": 600
},
{
"epoch": 1.779259259259259,
"eval_loss": 0.5014171600341797,
"eval_runtime": 902.4118,
"eval_samples_per_second": 0.332,
"eval_steps_per_second": 0.083,
"step": 600
},
{
"epoch": 1.8088888888888888,
"grad_norm": 4.0901360511779785,
"learning_rate": 1.4919876456411874e-05,
"loss": 0.5857,
"step": 610
},
{
"epoch": 1.8385185185185184,
"grad_norm": 5.865538597106934,
"learning_rate": 1.3376122873975616e-05,
"loss": 0.5479,
"step": 620
},
{
"epoch": 1.8681481481481481,
"grad_norm": 4.197350025177002,
"learning_rate": 1.1904264597219077e-05,
"loss": 0.5741,
"step": 630
},
{
"epoch": 1.8977777777777778,
"grad_norm": 12.017341613769531,
"learning_rate": 1.0507190989831412e-05,
"loss": 0.5084,
"step": 640
},
{
"epoch": 1.9274074074074075,
"grad_norm": 3.7081458568573,
"learning_rate": 9.187644607815498e-06,
"loss": 0.6477,
"step": 650
},
{
"epoch": 1.9274074074074075,
"eval_loss": 0.4961656630039215,
"eval_runtime": 888.3568,
"eval_samples_per_second": 0.338,
"eval_steps_per_second": 0.084,
"step": 650
},
{
"epoch": 1.9570370370370371,
"grad_norm": 2.705108165740967,
"learning_rate": 7.948215815653149e-06,
"loss": 0.4434,
"step": 660
},
{
"epoch": 1.9866666666666668,
"grad_norm": 4.442301273345947,
"learning_rate": 6.7913377012332694e-06,
"loss": 0.6607,
"step": 670
},
{
"epoch": 2.017777777777778,
"grad_norm": 3.787601947784424,
"learning_rate": 5.719281299525331e-06,
"loss": 0.5332,
"step": 680
},
{
"epoch": 2.0474074074074076,
"grad_norm": 12.064045906066895,
"learning_rate": 4.734151134374304e-06,
"loss": 0.4921,
"step": 690
},
{
"epoch": 2.0770370370370372,
"grad_norm": 3.7398955821990967,
"learning_rate": 3.837881087168932e-06,
"loss": 0.5258,
"step": 700
},
{
"epoch": 2.0770370370370372,
"eval_loss": 0.4903838038444519,
"eval_runtime": 946.841,
"eval_samples_per_second": 0.317,
"eval_steps_per_second": 0.079,
"step": 700
},
{
"epoch": 2.1066666666666665,
"grad_norm": 5.209836006164551,
"learning_rate": 3.0322306004934462e-06,
"loss": 0.5301,
"step": 710
},
{
"epoch": 2.136296296296296,
"grad_norm": 7.934685230255127,
"learning_rate": 2.3187812242151995e-06,
"loss": 0.3265,
"step": 720
},
{
"epoch": 2.165925925925926,
"grad_norm": 7.0827789306640625,
"learning_rate": 1.6989335107884862e-06,
"loss": 0.5313,
"step": 730
},
{
"epoch": 2.1955555555555555,
"grad_norm": 4.234689712524414,
"learning_rate": 1.1739042658693078e-06,
"loss": 0.4536,
"step": 740
},
{
"epoch": 2.225185185185185,
"grad_norm": 2.486720561981201,
"learning_rate": 7.447241596383381e-07,
"loss": 0.4678,
"step": 750
},
{
"epoch": 2.225185185185185,
"eval_loss": 0.48672306537628174,
"eval_runtime": 907.8778,
"eval_samples_per_second": 0.33,
"eval_steps_per_second": 0.083,
"step": 750
},
{
"epoch": 2.254814814814815,
"grad_norm": 3.3360986709594727,
"learning_rate": 4.122357035211855e-07,
"loss": 0.4285,
"step": 760
},
{
"epoch": 2.2844444444444445,
"grad_norm": 5.732705116271973,
"learning_rate": 1.7709159627787853e-07,
"loss": 0.422,
"step": 770
},
{
"epoch": 2.314074074074074,
"grad_norm": 3.294212579727173,
"learning_rate": 3.975344270823467e-08,
"loss": 0.4541,
"step": 780
},
{
"epoch": 2.3377777777777777,
"step": 788,
"total_flos": 4.05169174996992e+16,
"train_loss": 0.22821334594397374,
"train_runtime": 22612.2743,
"train_samples_per_second": 0.279,
"train_steps_per_second": 0.035
},
{
"epoch": 1.7555555555555555,
"grad_norm": 7.2509870529174805,
"learning_rate": 3.500580638048163e-05,
"loss": 0.8465,
"step": 790
},
{
"epoch": 1.7777777777777777,
"grad_norm": 12.120731353759766,
"learning_rate": 3.366679270419626e-05,
"loss": 0.9164,
"step": 800
},
{
"epoch": 1.7777777777777777,
"eval_loss": 0.542812168598175,
"eval_runtime": 1320.8378,
"eval_samples_per_second": 0.303,
"eval_steps_per_second": 0.076,
"step": 800
},
{
"epoch": 1.8,
"grad_norm": 13.869917869567871,
"learning_rate": 3.2340767918386884e-05,
"loss": 0.9066,
"step": 810
},
{
"epoch": 1.8222222222222222,
"grad_norm": 18.34079933166504,
"learning_rate": 3.102878653674449e-05,
"loss": 1.1401,
"step": 820
},
{
"epoch": 1.8444444444444446,
"grad_norm": 12.816385269165039,
"learning_rate": 2.973189190502259e-05,
"loss": 1.0268,
"step": 830
},
{
"epoch": 1.8666666666666667,
"grad_norm": 12.571537017822266,
"learning_rate": 2.84511153713223e-05,
"loss": 1.3972,
"step": 840
},
{
"epoch": 1.8888888888888888,
"grad_norm": 31.82439613342285,
"learning_rate": 2.7187475465918765e-05,
"loss": 1.2414,
"step": 850
},
{
"epoch": 1.8888888888888888,
"eval_loss": 0.5595240592956543,
"eval_runtime": 1193.2134,
"eval_samples_per_second": 0.335,
"eval_steps_per_second": 0.084,
"step": 850
},
{
"epoch": 1.911111111111111,
"grad_norm": 13.308499336242676,
"learning_rate": 2.594197709128061e-05,
"loss": 1.3218,
"step": 860
},
{
"epoch": 1.9333333333333333,
"grad_norm": 6.059114456176758,
"learning_rate": 2.471561072292703e-05,
"loss": 1.0453,
"step": 870
},
{
"epoch": 1.9555555555555557,
"grad_norm": 13.917656898498535,
"learning_rate": 2.3509351621757692e-05,
"loss": 1.0651,
"step": 880
},
{
"epoch": 1.9777777777777779,
"grad_norm": 9.584994316101074,
"learning_rate": 2.2324159058482085e-05,
"loss": 1.1164,
"step": 890
},
{
"epoch": 2.0,
"grad_norm": 12.185356140136719,
"learning_rate": 2.11609755507649e-05,
"loss": 0.9255,
"step": 900
},
{
"epoch": 2.0,
"eval_loss": 0.5336335897445679,
"eval_runtime": 1209.6116,
"eval_samples_per_second": 0.331,
"eval_steps_per_second": 0.083,
"step": 900
},
{
"epoch": 2.022222222222222,
"grad_norm": 9.4601411819458,
"learning_rate": 2.0020726113694204e-05,
"loss": 0.6626,
"step": 910
},
{
"epoch": 2.0444444444444443,
"grad_norm": 7.833770751953125,
"learning_rate": 1.8904317524168458e-05,
"loss": 0.8076,
"step": 920
},
{
"epoch": 2.066666666666667,
"grad_norm": 9.944056510925293,
"learning_rate": 1.7812637599787297e-05,
"loss": 0.826,
"step": 930
},
{
"epoch": 2.088888888888889,
"grad_norm": 50.9095344543457,
"learning_rate": 1.674655449281964e-05,
"loss": 0.8201,
"step": 940
},
{
"epoch": 2.111111111111111,
"grad_norm": 11.151677131652832,
"learning_rate": 1.570691599981053e-05,
"loss": 0.7905,
"step": 950
},
{
"epoch": 2.111111111111111,
"eval_loss": 0.5313804149627686,
"eval_runtime": 1190.18,
"eval_samples_per_second": 0.336,
"eval_steps_per_second": 0.084,
"step": 950
},
{
"epoch": 2.1333333333333333,
"grad_norm": 19.227052688598633,
"learning_rate": 1.4694548887375708e-05,
"loss": 0.7963,
"step": 960
},
{
"epoch": 2.1555555555555554,
"grad_norm": 3.875586748123169,
"learning_rate": 1.3710258234720192e-05,
"loss": 0.7642,
"step": 970
},
{
"epoch": 2.1777777777777776,
"grad_norm": 15.085531234741211,
"learning_rate": 1.2754826793403562e-05,
"loss": 0.9614,
"step": 980
},
{
"epoch": 2.2,
"grad_norm": 8.809530258178711,
"learning_rate": 1.1829014364861251e-05,
"loss": 0.8547,
"step": 990
},
{
"epoch": 2.2222222222222223,
"grad_norm": 9.07629680633545,
"learning_rate": 1.093355719617678e-05,
"loss": 0.7664,
"step": 1000
},
{
"epoch": 2.2222222222222223,
"eval_loss": 0.5206774473190308,
"eval_runtime": 1181.1098,
"eval_samples_per_second": 0.339,
"eval_steps_per_second": 0.085,
"step": 1000
},
{
"epoch": 2.2444444444444445,
"grad_norm": 6.77101993560791,
"learning_rate": 1.006916739458535e-05,
"loss": 0.693,
"step": 1010
},
{
"epoch": 2.2666666666666666,
"grad_norm": 5.627523899078369,
"learning_rate": 9.236532361174726e-06,
"loss": 0.9059,
"step": 1020
},
{
"epoch": 2.2888888888888888,
"grad_norm": 6.075342655181885,
"learning_rate": 8.43631424423334e-06,
"loss": 0.7724,
"step": 1030
},
{
"epoch": 2.311111111111111,
"grad_norm": 12.213552474975586,
"learning_rate": 7.669149412680605e-06,
"loss": 0.9813,
"step": 1040
},
{
"epoch": 2.3333333333333335,
"grad_norm": 4.757562637329102,
"learning_rate": 6.93564794999823e-06,
"loss": 1.2031,
"step": 1050
},
{
"epoch": 2.3333333333333335,
"eval_loss": 0.5155333876609802,
"eval_runtime": 1186.5765,
"eval_samples_per_second": 0.337,
"eval_steps_per_second": 0.084,
"step": 1050
},
{
"epoch": 2.3555555555555556,
"grad_norm": 11.984197616577148,
"learning_rate": 6.2363931690647195e-06,
"loss": 1.0289,
"step": 1060
},
{
"epoch": 2.3777777777777778,
"grad_norm": 6.141831874847412,
"learning_rate": 5.571941148279081e-06,
"loss": 0.9776,
"step": 1070
},
{
"epoch": 2.4,
"grad_norm": 6.076663017272949,
"learning_rate": 4.942820289342759e-06,
"loss": 0.5992,
"step": 1080
},
{
"epoch": 2.422222222222222,
"grad_norm": 113.96343994140625,
"learning_rate": 4.349530897051047e-06,
"loss": 0.875,
"step": 1090
},
{
"epoch": 2.4444444444444446,
"grad_norm": 12.476838111877441,
"learning_rate": 3.7925447814286087e-06,
"loss": 0.848,
"step": 1100
},
{
"epoch": 2.4444444444444446,
"eval_loss": 0.5155972838401794,
"eval_runtime": 1256.8424,
"eval_samples_per_second": 0.318,
"eval_steps_per_second": 0.08,
"step": 1100
},
{
"epoch": 2.466666666666667,
"grad_norm": 6.342575550079346,
"learning_rate": 3.2723048825252177e-06,
"loss": 0.876,
"step": 1110
},
{
"epoch": 2.488888888888889,
"grad_norm": 5.154838562011719,
"learning_rate": 2.7892249181701802e-06,
"loss": 0.6484,
"step": 1120
},
{
"epoch": 2.511111111111111,
"grad_norm": 14.073723793029785,
"learning_rate": 2.343689054965592e-06,
"loss": 0.7078,
"step": 1130
},
{
"epoch": 2.533333333333333,
"grad_norm": 11.655536651611328,
"learning_rate": 1.936051602780026e-06,
"loss": 0.8106,
"step": 1140
},
{
"epoch": 2.5555555555555554,
"grad_norm": 12.040849685668945,
"learning_rate": 1.5666367329856046e-06,
"loss": 0.736,
"step": 1150
},
{
"epoch": 2.5555555555555554,
"eval_loss": 0.5151739716529846,
"eval_runtime": 1180.2607,
"eval_samples_per_second": 0.339,
"eval_steps_per_second": 0.085,
"step": 1150
},
{
"epoch": 2.5777777777777775,
"grad_norm": 4.713861465454102,
"learning_rate": 1.2357382206625801e-06,
"loss": 0.9703,
"step": 1160
},
{
"epoch": 2.6,
"grad_norm": 16.40427589416504,
"learning_rate": 9.436192109763376e-07,
"loss": 0.8726,
"step": 1170
},
{
"epoch": 2.6222222222222222,
"grad_norm": 26.21332359313965,
"learning_rate": 6.90512009912725e-07,
"loss": 0.7543,
"step": 1180
},
{
"epoch": 2.6444444444444444,
"grad_norm": 10.207490921020508,
"learning_rate": 4.766178995379955e-07,
"loss": 1.0878,
"step": 1190
},
{
"epoch": 2.6666666666666665,
"grad_norm": 17.372835159301758,
"learning_rate": 3.0210697793044975e-07,
"loss": 0.6169,
"step": 1200
},
{
"epoch": 2.6666666666666665,
"eval_loss": 0.5134466290473938,
"eval_runtime": 1169.2654,
"eval_samples_per_second": 0.342,
"eval_steps_per_second": 0.086,
"step": 1200
},
{
"epoch": 2.688888888888889,
"grad_norm": 78.36367797851562,
"learning_rate": 1.671180239108172e-07,
"loss": 0.9518,
"step": 1210
},
{
"epoch": 2.7111111111111112,
"grad_norm": 7.34134578704834,
"learning_rate": 7.175838667927148e-08,
"loss": 0.6888,
"step": 1220
},
{
"epoch": 2.7333333333333334,
"grad_norm": 7.477999687194824,
"learning_rate": 1.6103900446534648e-08,
"loss": 0.7221,
"step": 1230
},
{
"epoch": 2.7511111111111113,
"step": 1238,
"total_flos": 6.388579291468186e+16,
"train_loss": 0.3261277918284082,
"train_runtime": 34829.2458,
"train_samples_per_second": 0.284,
"train_steps_per_second": 0.036
},
{
"epoch": 3.66962962962963,
"grad_norm": 14.275522232055664,
"learning_rate": 1.3300797847207797e-05,
"loss": 3.5621,
"step": 1240
},
{
"epoch": 3.699259259259259,
"grad_norm": 27.858943939208984,
"learning_rate": 1.2557515699430094e-05,
"loss": 4.3815,
"step": 1250
},
{
"epoch": 3.699259259259259,
"eval_loss": 2.2658419609069824,
"eval_runtime": 995.7526,
"eval_samples_per_second": 0.301,
"eval_steps_per_second": 0.075,
"step": 1250
},
{
"epoch": 3.728888888888889,
"grad_norm": 30.557031631469727,
"learning_rate": 1.1832611379355878e-05,
"loss": 3.2056,
"step": 1260
},
{
"epoch": 3.7585185185185184,
"grad_norm": 34.28306579589844,
"learning_rate": 1.1126440690477996e-05,
"loss": 2.8957,
"step": 1270
},
{
"epoch": 3.788148148148148,
"grad_norm": 29.017297744750977,
"learning_rate": 1.0439350241294566e-05,
"loss": 2.5225,
"step": 1280
},
{
"epoch": 3.8177777777777777,
"grad_norm": 23.32266616821289,
"learning_rate": 9.771677275183744e-06,
"loss": 2.6028,
"step": 1290
},
{
"epoch": 3.8474074074074074,
"grad_norm": 32.830848693847656,
"learning_rate": 9.123749504875135e-06,
"loss": 2.7177,
"step": 1300
},
{
"epoch": 3.8474074074074074,
"eval_loss": 1.3522464036941528,
"eval_runtime": 985.7859,
"eval_samples_per_second": 0.304,
"eval_steps_per_second": 0.076,
"step": 1300
},
{
"epoch": 3.877037037037037,
"grad_norm": 6.538234233856201,
"learning_rate": 8.495884951599142e-06,
"loss": 2.2624,
"step": 1310
},
{
"epoch": 3.9066666666666667,
"grad_norm": 19.523771286010742,
"learning_rate": 7.888391788993216e-06,
"loss": 2.6275,
"step": 1320
},
{
"epoch": 3.9362962962962964,
"grad_norm": 11.971488952636719,
"learning_rate": 7.301568191841457e-06,
"loss": 2.1496,
"step": 1330
},
{
"epoch": 3.965925925925926,
"grad_norm": 34.24433898925781,
"learning_rate": 6.735702189722115e-06,
"loss": 2.0774,
"step": 1340
},
{
"epoch": 3.9955555555555557,
"grad_norm": 12.619851112365723,
"learning_rate": 6.191071525634456e-06,
"loss": 2.0749,
"step": 1350
},
{
"epoch": 3.9955555555555557,
"eval_loss": 1.2665727138519287,
"eval_runtime": 972.1433,
"eval_samples_per_second": 0.309,
"eval_steps_per_second": 0.077,
"step": 1350
},
{
"epoch": 4.026666666666666,
"grad_norm": 21.63642692565918,
"learning_rate": 5.667943519674723e-06,
"loss": 2.2795,
"step": 1360
},
{
"epoch": 4.0562962962962965,
"grad_norm": 5.838581562042236,
"learning_rate": 5.166574937827867e-06,
"loss": 2.6146,
"step": 1370
},
{
"epoch": 4.085925925925926,
"grad_norm": 11.008721351623535,
"learning_rate": 4.687211865939539e-06,
"loss": 2.3045,
"step": 1380
},
{
"epoch": 4.115555555555556,
"grad_norm": 6.246650218963623,
"learning_rate": 4.2300895889302805e-06,
"loss": 1.823,
"step": 1390
},
{
"epoch": 4.145185185185185,
"grad_norm": 13.782442092895508,
"learning_rate": 3.7954324753109673e-06,
"loss": 2.2982,
"step": 1400
},
{
"epoch": 4.145185185185185,
"eval_loss": 1.2098972797393799,
"eval_runtime": 998.8662,
"eval_samples_per_second": 0.3,
"eval_steps_per_second": 0.075,
"step": 1400
},
{
"epoch": 4.174814814814815,
"grad_norm": 11.179134368896484,
"learning_rate": 3.383453867056452e-06,
"loss": 2.5618,
"step": 1410
},
{
"epoch": 4.204444444444444,
"grad_norm": 73.97550201416016,
"learning_rate": 2.9943559748912996e-06,
"loss": 1.8831,
"step": 1420
},
{
"epoch": 4.234074074074074,
"grad_norm": 17.907745361328125,
"learning_rate": 2.628329779039057e-06,
"loss": 2.2352,
"step": 1430
},
{
"epoch": 4.263703703703704,
"grad_norm": 81.71790313720703,
"learning_rate": 2.2855549354837912e-06,
"loss": 2.1651,
"step": 1440
},
{
"epoch": 4.293333333333333,
"grad_norm": 10.33467960357666,
"learning_rate": 1.9661996877898105e-06,
"loss": 1.7595,
"step": 1450
},
{
"epoch": 4.293333333333333,
"eval_loss": 1.1622637510299683,
"eval_runtime": 993.3397,
"eval_samples_per_second": 0.302,
"eval_steps_per_second": 0.076,
"step": 1450
},
{
"epoch": 4.322962962962963,
"grad_norm": 40.43919372558594,
"learning_rate": 1.6704207845230358e-06,
"loss": 1.9304,
"step": 1460
},
{
"epoch": 4.352592592592592,
"grad_norm": 10.497286796569824,
"learning_rate": 1.3983634023143511e-06,
"loss": 2.098,
"step": 1470
},
{
"epoch": 4.3822222222222225,
"grad_norm": 9.101359367370605,
"learning_rate": 1.1501610746028124e-06,
"loss": 1.8441,
"step": 1480
},
{
"epoch": 4.411851851851852,
"grad_norm": 20.517807006835938,
"learning_rate": 9.25935626093688e-07,
"loss": 2.3551,
"step": 1490
},
{
"epoch": 4.441481481481482,
"grad_norm": 7.981099605560303,
"learning_rate": 7.257971129634389e-07,
"loss": 1.6124,
"step": 1500
},
{
"epoch": 4.441481481481482,
"eval_loss": 1.1480356454849243,
"eval_runtime": 970.9195,
"eval_samples_per_second": 0.309,
"eval_steps_per_second": 0.077,
"step": 1500
},
{
"epoch": 4.471111111111111,
"grad_norm": 51.19599533081055,
"learning_rate": 5.498437688410463e-07,
"loss": 2.0946,
"step": 1510
},
{
"epoch": 4.50074074074074,
"grad_norm": 7.847194671630859,
"learning_rate": 3.981619565921968e-07,
"loss": 1.8896,
"step": 1520
},
{
"epoch": 4.53037037037037,
"grad_norm": 12.63452434539795,
"learning_rate": 2.708261259299072e-07,
"loss": 2.1132,
"step": 1530
},
{
"epoch": 4.5600000000000005,
"grad_norm": 8.711173057556152,
"learning_rate": 1.6789877687254928e-07,
"loss": 1.9074,
"step": 1540
},
{
"epoch": 4.58962962962963,
"grad_norm": 14.014768600463867,
"learning_rate": 8.943042906705001e-08,
"loss": 2.4591,
"step": 1550
},
{
"epoch": 4.58962962962963,
"eval_loss": 1.1526756286621094,
"eval_runtime": 1013.0536,
"eval_samples_per_second": 0.296,
"eval_steps_per_second": 0.074,
"step": 1550
},
{
"epoch": 4.619259259259259,
"grad_norm": 241.5323486328125,
"learning_rate": 3.545959699243207e-08,
"loss": 1.9968,
"step": 1560
},
{
"epoch": 4.648888888888889,
"grad_norm": 41.02328109741211,
"learning_rate": 6.0127710558133265e-09,
"loss": 1.9328,
"step": 1570
},
{
"epoch": 4.666666666666667,
"step": 1576,
"total_flos": 8.15036810717184e+16,
"train_loss": 0.49436442077462445,
"train_runtime": 26325.3193,
"train_samples_per_second": 0.479,
"train_steps_per_second": 0.06
},
{
"epoch": 3.511111111111111,
"grad_norm": 11.290818214416504,
"learning_rate": 1.4115578944331131e-05,
"loss": 4.0951,
"step": 1580
},
{
"epoch": 3.533333333333333,
"grad_norm": 29.64479637145996,
"learning_rate": 1.3520911423383454e-05,
"loss": 5.0902,
"step": 1590
},
{
"epoch": 3.5555555555555554,
"grad_norm": 12.257214546203613,
"learning_rate": 1.2937077174225081e-05,
"loss": 3.8541,
"step": 1600
},
{
"epoch": 3.5555555555555554,
"eval_loss": 1.5511490106582642,
"eval_runtime": 1129.8462,
"eval_samples_per_second": 0.354,
"eval_steps_per_second": 0.089,
"step": 1600
},
{
"epoch": 3.5777777777777775,
"grad_norm": 7.66765832901001,
"learning_rate": 1.2364249579342985e-05,
"loss": 2.9561,
"step": 1610
},
{
"epoch": 3.6,
"grad_norm": 5.419583797454834,
"learning_rate": 1.1802598752554878e-05,
"loss": 2.4979,
"step": 1620
},
{
"epoch": 3.6222222222222222,
"grad_norm": 10.220062255859375,
"learning_rate": 1.125229148849008e-05,
"loss": 2.1119,
"step": 1630
},
{
"epoch": 3.6444444444444444,
"grad_norm": 10.050875663757324,
"learning_rate": 1.071349121305622e-05,
"loss": 2.2186,
"step": 1640
},
{
"epoch": 3.6666666666666665,
"grad_norm": 13.71487045288086,
"learning_rate": 1.018635793490621e-05,
"loss": 2.6291,
"step": 1650
},
{
"epoch": 3.6666666666666665,
"eval_loss": 1.2097514867782593,
"eval_runtime": 1135.3798,
"eval_samples_per_second": 0.352,
"eval_steps_per_second": 0.088,
"step": 1650
},
{
"epoch": 3.688888888888889,
"grad_norm": 14.815037727355957,
"learning_rate": 9.671048197920247e-06,
"loss": 2.5383,
"step": 1660
},
{
"epoch": 3.7111111111111112,
"grad_norm": 13.783255577087402,
"learning_rate": 9.167715034716606e-06,
"loss": 2.6482,
"step": 1670
},
{
"epoch": 3.7333333333333334,
"grad_norm": 9.30642032623291,
"learning_rate": 8.676507921205162e-06,
"loss": 2.5038,
"step": 1680
},
{
"epoch": 3.7555555555555555,
"grad_norm": 7.022140026092529,
"learning_rate": 8.197572732197322e-06,
"loss": 2.1227,
"step": 1690
},
{
"epoch": 3.7777777777777777,
"grad_norm": 16.499279022216797,
"learning_rate": 7.731051698085162e-06,
"loss": 2.144,
"step": 1700
},
{
"epoch": 3.7777777777777777,
"eval_loss": 1.1110306978225708,
"eval_runtime": 1124.1297,
"eval_samples_per_second": 0.356,
"eval_steps_per_second": 0.089,
"step": 1700
},
{
"epoch": 3.8,
"grad_norm": 8.783102989196777,
"learning_rate": 7.277083362603099e-06,
"loss": 2.2054,
"step": 1710
},
{
"epoch": 3.822222222222222,
"grad_norm": 17.862638473510742,
"learning_rate": 6.835802541684117e-06,
"loss": 2.6757,
"step": 1720
},
{
"epoch": 3.8444444444444446,
"grad_norm": 10.636580467224121,
"learning_rate": 6.407340283423324e-06,
"loss": 2.4811,
"step": 1730
},
{
"epoch": 3.8666666666666667,
"grad_norm": 16.785629272460938,
"learning_rate": 5.9918238291602145e-06,
"loss": 3.0367,
"step": 1740
},
{
"epoch": 3.888888888888889,
"grad_norm": 18.506656646728516,
"learning_rate": 5.589376575691652e-06,
"loss": 2.6143,
"step": 1750
},
{
"epoch": 3.888888888888889,
"eval_loss": 1.0668652057647705,
"eval_runtime": 1135.4155,
"eval_samples_per_second": 0.352,
"eval_steps_per_second": 0.088,
"step": 1750
},
{
"epoch": 3.911111111111111,
"grad_norm": 33.2259635925293,
"learning_rate": 5.200118038626389e-06,
"loss": 2.0509,
"step": 1760
},
{
"epoch": 3.9333333333333336,
"grad_norm": 13.904667854309082,
"learning_rate": 4.824163816892241e-06,
"loss": 2.0153,
"step": 1770
},
{
"epoch": 3.9555555555555557,
"grad_norm": 16.34776496887207,
"learning_rate": 4.46162555840653e-06,
"loss": 2.1135,
"step": 1780
},
{
"epoch": 3.977777777777778,
"grad_norm": 9.64548397064209,
"learning_rate": 4.112610926919663e-06,
"loss": 1.8388,
"step": 1790
},
{
"epoch": 4.0,
"grad_norm": 14.741687774658203,
"learning_rate": 3.777223570042082e-06,
"loss": 1.8233,
"step": 1800
},
{
"epoch": 4.0,
"eval_loss": 1.015744686126709,
"eval_runtime": 1124.2011,
"eval_samples_per_second": 0.356,
"eval_steps_per_second": 0.089,
"step": 1800
},
{
"epoch": 4.022222222222222,
"grad_norm": 17.629962921142578,
"learning_rate": 3.455563088463737e-06,
"loss": 1.8195,
"step": 1810
},
{
"epoch": 4.044444444444444,
"grad_norm": 10.74545669555664,
"learning_rate": 3.1477250063755403e-06,
"loss": 1.7446,
"step": 1820
},
{
"epoch": 4.066666666666666,
"grad_norm": 14.61141586303711,
"learning_rate": 2.853800743101265e-06,
"loss": 1.7361,
"step": 1830
},
{
"epoch": 4.088888888888889,
"grad_norm": 17.42447853088379,
"learning_rate": 2.573877585948642e-06,
"loss": 1.7683,
"step": 1840
},
{
"epoch": 4.111111111111111,
"grad_norm": 12.99378776550293,
"learning_rate": 2.308038664287371e-06,
"loss": 1.8956,
"step": 1850
},
{
"epoch": 4.111111111111111,
"eval_loss": 0.9937378168106079,
"eval_runtime": 1121.1964,
"eval_samples_per_second": 0.357,
"eval_steps_per_second": 0.089,
"step": 1850
},
{
"epoch": 4.133333333333334,
"grad_norm": 6.575343608856201,
"learning_rate": 2.056362924862121e-06,
"loss": 1.7616,
"step": 1860
},
{
"epoch": 4.155555555555556,
"grad_norm": 7.720376491546631,
"learning_rate": 1.8189251083474469e-06,
"loss": 1.9345,
"step": 1870
},
{
"epoch": 4.177777777777778,
"grad_norm": 16.67999267578125,
"learning_rate": 1.5957957271519553e-06,
"loss": 1.7756,
"step": 1880
},
{
"epoch": 4.2,
"grad_norm": 25.952878952026367,
"learning_rate": 1.3870410444780824e-06,
"loss": 1.9684,
"step": 1890
},
{
"epoch": 4.222222222222222,
"grad_norm": 13.947036743164062,
"learning_rate": 1.1927230546437406e-06,
"loss": 2.4727,
"step": 1900
},
{
"epoch": 4.222222222222222,
"eval_loss": 0.9845434427261353,
"eval_runtime": 1121.7839,
"eval_samples_per_second": 0.357,
"eval_steps_per_second": 0.089,
"step": 1900
},
{
"epoch": 4.2444444444444445,
"grad_norm": 7.728816509246826,
"learning_rate": 1.0128994646717683e-06,
"loss": 2.5575,
"step": 1910
},
{
"epoch": 4.266666666666667,
"grad_norm": 7.709161758422852,
"learning_rate": 8.476236771525259e-07,
"loss": 2.0874,
"step": 1920
},
{
"epoch": 4.288888888888889,
"grad_norm": 12.649147033691406,
"learning_rate": 6.969447743848501e-07,
"loss": 2.1343,
"step": 1930
},
{
"epoch": 4.311111111111111,
"grad_norm": 14.208868980407715,
"learning_rate": 5.6090750379994e-07,
"loss": 1.6593,
"step": 1940
},
{
"epoch": 4.333333333333333,
"grad_norm": 113.40324401855469,
"learning_rate": 4.395522646726491e-07,
"loss": 1.877,
"step": 1950
},
{
"epoch": 4.333333333333333,
"eval_loss": 0.9760661125183105,
"eval_runtime": 1128.6817,
"eval_samples_per_second": 0.354,
"eval_steps_per_second": 0.089,
"step": 1950
},
{
"epoch": 4.355555555555555,
"grad_norm": 8.278970718383789,
"learning_rate": 3.329150961240146e-07,
"loss": 2.4419,
"step": 1960
},
{
"epoch": 4.377777777777778,
"grad_norm": 9.493860244750977,
"learning_rate": 2.410276664186473e-07,
"loss": 1.9977,
"step": 1970
},
{
"epoch": 4.4,
"grad_norm": 12.72214412689209,
"learning_rate": 1.6391726356013158e-07,
"loss": 1.996,
"step": 1980
},
{
"epoch": 4.4222222222222225,
"grad_norm": 16.340652465820312,
"learning_rate": 1.0160678718726945e-07,
"loss": 1.8244,
"step": 1990
},
{
"epoch": 4.444444444444445,
"grad_norm": 5.685306549072266,
"learning_rate": 5.411474177349218e-08,
"loss": 1.3904,
"step": 2000
},
{
"epoch": 4.444444444444445,
"eval_loss": 0.9806169271469116,
"eval_runtime": 1118.8009,
"eval_samples_per_second": 0.358,
"eval_steps_per_second": 0.089,
"step": 2000
},
{
"epoch": 4.466666666666667,
"grad_norm": 16.570466995239258,
"learning_rate": 2.145523113160075e-08,
"loss": 1.6432,
"step": 2010
},
{
"epoch": 4.488888888888889,
"grad_norm": 12.792643547058105,
"learning_rate": 3.637954225266249e-09,
"loss": 1.7129,
"step": 2020
},
{
"epoch": 4.502222222222223,
"step": 2026,
"total_flos": 1.0486716012367872e+17,
"train_loss": 0.5005418875375128,
"train_runtime": 32217.358,
"train_samples_per_second": 0.503,
"train_steps_per_second": 0.063
},
{
"epoch": 6.0088888888888885,
"grad_norm": 14.330232620239258,
"learning_rate": 5.99674741500138e-06,
"loss": 4.366,
"step": 2030
},
{
"epoch": 6.038518518518519,
"grad_norm": 9.551030158996582,
"learning_rate": 5.6508792702119225e-06,
"loss": 4.9312,
"step": 2040
},
{
"epoch": 6.068148148148148,
"grad_norm": 11.4534273147583,
"learning_rate": 5.314685922932666e-06,
"loss": 4.5235,
"step": 2050
},
{
"epoch": 6.068148148148148,
"eval_loss": 2.373319625854492,
"eval_runtime": 894.95,
"eval_samples_per_second": 0.335,
"eval_steps_per_second": 0.084,
"step": 2050
},
{
"epoch": 6.097777777777778,
"grad_norm": 11.584074020385742,
"learning_rate": 4.988240714021464e-06,
"loss": 3.6575,
"step": 2060
},
{
"epoch": 6.127407407407407,
"grad_norm": 12.21071720123291,
"learning_rate": 4.671614857771684e-06,
"loss": 4.4115,
"step": 2070
},
{
"epoch": 6.157037037037037,
"grad_norm": 7.9541168212890625,
"learning_rate": 4.364877426376762e-06,
"loss": 3.4241,
"step": 2080
},
{
"epoch": 6.1866666666666665,
"grad_norm": 15.817623138427734,
"learning_rate": 4.068095334862038e-06,
"loss": 2.7395,
"step": 2090
},
{
"epoch": 6.216296296296297,
"grad_norm": 11.574224472045898,
"learning_rate": 3.781333326487202e-06,
"loss": 2.9356,
"step": 2100
},
{
"epoch": 6.216296296296297,
"eval_loss": 1.5084153413772583,
"eval_runtime": 901.7323,
"eval_samples_per_second": 0.333,
"eval_steps_per_second": 0.083,
"step": 2100
},
{
"epoch": 6.245925925925926,
"grad_norm": 22.12990951538086,
"learning_rate": 3.504653958622456e-06,
"loss": 3.6526,
"step": 2110
},
{
"epoch": 6.275555555555556,
"grad_norm": 36.937110900878906,
"learning_rate": 3.238117589101658e-06,
"loss": 2.135,
"step": 2120
},
{
"epoch": 6.305185185185185,
"grad_norm": 14.819047927856445,
"learning_rate": 2.981782363055108e-06,
"loss": 2.9432,
"step": 2130
},
{
"epoch": 6.3348148148148145,
"grad_norm": 19.707664489746094,
"learning_rate": 2.7357042002251976e-06,
"loss": 2.6471,
"step": 2140
},
{
"epoch": 6.364444444444445,
"grad_norm": 15.465389251708984,
"learning_rate": 2.4999367827674756e-06,
"loss": 2.3315,
"step": 2150
},
{
"epoch": 6.364444444444445,
"eval_loss": 1.438122272491455,
"eval_runtime": 901.0131,
"eval_samples_per_second": 0.333,
"eval_steps_per_second": 0.083,
"step": 2150
},
{
"epoch": 6.394074074074074,
"grad_norm": 28.68686294555664,
"learning_rate": 2.274531543539815e-06,
"loss": 2.586,
"step": 2160
},
{
"epoch": 6.423703703703704,
"grad_norm": 22.25936508178711,
"learning_rate": 2.0595376548823097e-06,
"loss": 3.1009,
"step": 2170
},
{
"epoch": 6.453333333333333,
"grad_norm": 12.523676872253418,
"learning_rate": 1.8550020178902727e-06,
"loss": 1.9499,
"step": 2180
},
{
"epoch": 6.482962962962963,
"grad_norm": 146.981201171875,
"learning_rate": 1.6609692521827424e-06,
"loss": 2.565,
"step": 2190
},
{
"epoch": 6.5125925925925925,
"grad_norm": 17.379535675048828,
"learning_rate": 1.4774816861686636e-06,
"loss": 2.7072,
"step": 2200
},
{
"epoch": 6.5125925925925925,
"eval_loss": 1.3992533683776855,
"eval_runtime": 915.4557,
"eval_samples_per_second": 0.328,
"eval_steps_per_second": 0.082,
"step": 2200
},
{
"epoch": 6.542222222222223,
"grad_norm": 9.735651969909668,
"learning_rate": 1.304579347812912e-06,
"loss": 2.3235,
"step": 2210
},
{
"epoch": 6.571851851851852,
"grad_norm": 9.481500625610352,
"learning_rate": 1.1422999559041581e-06,
"loss": 2.4865,
"step": 2220
},
{
"epoch": 6.601481481481482,
"grad_norm": 7.604280948638916,
"learning_rate": 9.90678911826487e-07,
"loss": 2.3393,
"step": 2230
},
{
"epoch": 6.631111111111111,
"grad_norm": 24.429338455200195,
"learning_rate": 8.497492918365602e-07,
"loss": 2.2012,
"step": 2240
},
{
"epoch": 6.66074074074074,
"grad_norm": 15.720512390136719,
"learning_rate": 7.195418398479925e-07,
"loss": 2.6741,
"step": 2250
},
{
"epoch": 6.66074074074074,
"eval_loss": 1.3970364332199097,
"eval_runtime": 932.9152,
"eval_samples_per_second": 0.322,
"eval_steps_per_second": 0.08,
"step": 2250
},
{
"epoch": 6.6903703703703705,
"grad_norm": 10.252752304077148,
"learning_rate": 6.00084960724534e-07,
"loss": 2.4081,
"step": 2260
},
{
"epoch": 6.72,
"grad_norm": 17.19991111755371,
"learning_rate": 4.914047140835653e-07,
"loss": 2.4516,
"step": 2270
},
{
"epoch": 6.74962962962963,
"grad_norm": 23.26289176940918,
"learning_rate": 3.935248086111176e-07,
"loss": 2.5946,
"step": 2280
},
{
"epoch": 6.779259259259259,
"grad_norm": 12.669425964355469,
"learning_rate": 3.064665968898428e-07,
"loss": 2.804,
"step": 2290
},
{
"epoch": 6.808888888888889,
"grad_norm": 22.176555633544922,
"learning_rate": 2.3024907074091772e-07,
"loss": 2.6944,
"step": 2300
},
{
"epoch": 6.808888888888889,
"eval_loss": 1.3712869882583618,
"eval_runtime": 889.4139,
"eval_samples_per_second": 0.337,
"eval_steps_per_second": 0.084,
"step": 2300
},
{
"epoch": 6.838518518518518,
"grad_norm": 22.525259017944336,
"learning_rate": 1.6488885708094705e-07,
"loss": 2.3312,
"step": 2310
},
{
"epoch": 6.868148148148148,
"grad_norm": 9.424576759338379,
"learning_rate": 1.1040021429480907e-07,
"loss": 2.1776,
"step": 2320
},
{
"epoch": 6.897777777777778,
"grad_norm": 33.19387435913086,
"learning_rate": 6.679502912517732e-08,
"loss": 2.3443,
"step": 2330
},
{
"epoch": 6.927407407407408,
"grad_norm": 27.716110229492188,
"learning_rate": 3.408281407939473e-08,
"loss": 2.2612,
"step": 2340
},
{
"epoch": 6.957037037037037,
"grad_norm": 17.70338249206543,
"learning_rate": 1.2270705354333612e-08,
"loss": 2.4162,
"step": 2350
},
{
"epoch": 6.957037037037037,
"eval_loss": 1.3736646175384521,
"eval_runtime": 876.3782,
"eval_samples_per_second": 0.342,
"eval_steps_per_second": 0.086,
"step": 2350
},
{
"epoch": 6.986666666666666,
"grad_norm": 18.5972957611084,
"learning_rate": 1.3634612796298295e-09,
"loss": 2.4499,
"step": 2360
},
{
"epoch": 6.998518518518519,
"step": 2364,
"total_flos": 1.225036049440727e+17,
"train_loss": 0.4030211762526717,
"train_runtime": 24477.7962,
"train_samples_per_second": 0.773,
"train_steps_per_second": 0.097
}
],
"logging_steps": 10,
"max_steps": 2364,
"num_input_tokens_seen": 0,
"num_train_epochs": 7,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.225036049440727e+17,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}