{ "best_global_step": 750, "best_metric": 0.48672306537628174, "best_model_checkpoint": "./qwen2.5-vl-finetune-checkpoints/checkpoint-750", "epoch": 6.998518518518519, "eval_steps": 50, "global_step": 2364, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.022222222222222223, "grad_norm": 5.334873676300049, "learning_rate": 2e-05, "loss": 2.417, "step": 10 }, { "epoch": 0.044444444444444446, "grad_norm": 6.264868259429932, "learning_rate": 4.222222222222222e-05, "loss": 1.7873, "step": 20 }, { "epoch": 0.06666666666666667, "grad_norm": 9.44924259185791, "learning_rate": 6.444444444444446e-05, "loss": 1.2221, "step": 30 }, { "epoch": 0.08888888888888889, "grad_norm": 24.675010681152344, "learning_rate": 8.666666666666667e-05, "loss": 0.8489, "step": 40 }, { "epoch": 0.1111111111111111, "grad_norm": 7.224123954772949, "learning_rate": 9.997593339404756e-05, "loss": 0.9777, "step": 50 }, { "epoch": 0.1111111111111111, "eval_loss": 1.0407381057739258, "eval_runtime": 1366.0737, "eval_samples_per_second": 0.293, "eval_steps_per_second": 0.073, "step": 50 }, { "epoch": 0.13333333333333333, "grad_norm": 3.4162638187408447, "learning_rate": 9.970545007734807e-05, "loss": 0.867, "step": 60 }, { "epoch": 0.15555555555555556, "grad_norm": 8.475242614746094, "learning_rate": 9.913603233532067e-05, "loss": 0.8114, "step": 70 }, { "epoch": 0.17777777777777778, "grad_norm": 4.519033908843994, "learning_rate": 9.82711047132661e-05, "loss": 1.0957, "step": 80 }, { "epoch": 0.2, "grad_norm": 9.79876708984375, "learning_rate": 9.711586898767462e-05, "loss": 0.9682, "step": 90 }, { "epoch": 0.2222222222222222, "grad_norm": 4.599425315856934, "learning_rate": 9.567727288213005e-05, "loss": 0.8787, "step": 100 }, { "epoch": 0.2222222222222222, "eval_loss": 0.8106439113616943, "eval_runtime": 1315.3239, "eval_samples_per_second": 0.304, "eval_steps_per_second": 0.076, "step": 100 }, { "epoch": 0.24444444444444444, "grad_norm": 8.550354957580566, "learning_rate": 9.396396828288272e-05, "loss": 0.7664, "step": 110 }, { "epoch": 0.26666666666666666, "grad_norm": 16.60089874267578, "learning_rate": 9.19862592053875e-05, "loss": 0.8729, "step": 120 }, { "epoch": 0.28888888888888886, "grad_norm": 7.3388495445251465, "learning_rate": 8.97560398247424e-05, "loss": 0.6473, "step": 130 }, { "epoch": 0.3111111111111111, "grad_norm": 4.2113847732543945, "learning_rate": 8.728672294272008e-05, "loss": 0.7883, "step": 140 }, { "epoch": 0.3333333333333333, "grad_norm": 9.767938613891602, "learning_rate": 8.459315932159979e-05, "loss": 0.9219, "step": 150 }, { "epoch": 0.3333333333333333, "eval_loss": 0.7609114050865173, "eval_runtime": 1300.5726, "eval_samples_per_second": 0.308, "eval_steps_per_second": 0.077, "step": 150 }, { "epoch": 0.35555555555555557, "grad_norm": 4.818809986114502, "learning_rate": 8.169154836993551e-05, "loss": 0.6935, "step": 160 }, { "epoch": 0.37777777777777777, "grad_norm": 9.263418197631836, "learning_rate": 7.859934071740692e-05, "loss": 0.6154, "step": 170 }, { "epoch": 0.4, "grad_norm": 5.390202522277832, "learning_rate": 7.533513326467911e-05, "loss": 0.7336, "step": 180 }, { "epoch": 0.4222222222222222, "grad_norm": 6.1149373054504395, "learning_rate": 7.191855733945387e-05, "loss": 0.6445, "step": 190 }, { "epoch": 0.4444444444444444, "grad_norm": 28.822879791259766, "learning_rate": 6.837016063135491e-05, "loss": 0.6949, "step": 200 }, { "epoch": 0.4444444444444444, "eval_loss": 0.7009051442146301, "eval_runtime": 1304.5098, "eval_samples_per_second": 0.307, "eval_steps_per_second": 0.077, "step": 200 }, { "epoch": 0.4666666666666667, "grad_norm": 4.288070201873779, "learning_rate": 6.471128361570476e-05, "loss": 0.6743, "step": 210 }, { "epoch": 0.4888888888888889, "grad_norm": 2.8349318504333496, "learning_rate": 6.096393120939516e-05, "loss": 0.7858, "step": 220 }, { "epoch": 0.5111111111111111, "grad_norm": 4.94554328918457, "learning_rate": 5.715064043072771e-05, "loss": 0.5722, "step": 230 }, { "epoch": 0.5333333333333333, "grad_norm": 4.504173278808594, "learning_rate": 5.329434485913393e-05, "loss": 0.8959, "step": 240 }, { "epoch": 0.5555555555555556, "grad_norm": 6.013023376464844, "learning_rate": 4.941823670993016e-05, "loss": 0.7088, "step": 250 }, { "epoch": 0.5555555555555556, "eval_loss": 0.6455658674240112, "eval_runtime": 1299.4246, "eval_samples_per_second": 0.308, "eval_steps_per_second": 0.077, "step": 250 }, { "epoch": 0.5777777777777777, "grad_norm": 4.269768238067627, "learning_rate": 4.55456273536057e-05, "loss": 0.7, "step": 260 }, { "epoch": 0.6, "grad_norm": 3.1131203174591064, "learning_rate": 4.169980711849781e-05, "loss": 0.5789, "step": 270 }, { "epoch": 0.6222222222222222, "grad_norm": 3.8976268768310547, "learning_rate": 3.790390522001662e-05, "loss": 0.5789, "step": 280 }, { "epoch": 0.6444444444444445, "grad_norm": 3.225245714187622, "learning_rate": 3.418075065882217e-05, "loss": 0.5838, "step": 290 }, { "epoch": 0.6666666666666666, "grad_norm": 4.711687088012695, "learning_rate": 3.0552734924528306e-05, "loss": 0.6903, "step": 300 }, { "epoch": 0.6666666666666666, "eval_loss": 0.5962206125259399, "eval_runtime": 1288.7617, "eval_samples_per_second": 0.31, "eval_steps_per_second": 0.078, "step": 300 }, { "epoch": 0.6888888888888889, "grad_norm": 4.992547512054443, "learning_rate": 2.7041677330649407e-05, "loss": 0.6936, "step": 310 }, { "epoch": 0.7111111111111111, "grad_norm": 9.838484764099121, "learning_rate": 2.3668693790681634e-05, "loss": 0.7052, "step": 320 }, { "epoch": 0.7333333333333333, "grad_norm": 8.834742546081543, "learning_rate": 2.0454069824514444e-05, "loss": 0.7329, "step": 330 }, { "epoch": 0.7555555555555555, "grad_norm": 5.83026123046875, "learning_rate": 1.7417138558927244e-05, "loss": 0.5383, "step": 340 }, { "epoch": 0.7777777777777778, "grad_norm": 7.011873722076416, "learning_rate": 1.4576164455890013e-05, "loss": 0.5669, "step": 350 }, { "epoch": 0.7777777777777778, "eval_loss": 0.5696190595626831, "eval_runtime": 1284.3616, "eval_samples_per_second": 0.311, "eval_steps_per_second": 0.078, "step": 350 }, { "epoch": 0.8, "grad_norm": 4.988184452056885, "learning_rate": 1.194823346793998e-05, "loss": 0.523, "step": 360 }, { "epoch": 0.8222222222222222, "grad_norm": 6.482738971710205, "learning_rate": 9.549150281252633e-06, "loss": 0.6132, "step": 370 }, { "epoch": 0.8444444444444444, "grad_norm": 7.583404541015625, "learning_rate": 7.393343264399438e-06, "loss": 0.5602, "step": 380 }, { "epoch": 0.8666666666666667, "grad_norm": 3.9021246433258057, "learning_rate": 5.493777694441521e-06, "loss": 0.7415, "step": 390 }, { "epoch": 0.8888888888888888, "grad_norm": 4.355510711669922, "learning_rate": 3.861877782227885e-06, "loss": 0.6577, "step": 400 }, { "epoch": 0.8888888888888888, "eval_loss": 0.5607297420501709, "eval_runtime": 1282.6163, "eval_samples_per_second": 0.312, "eval_steps_per_second": 0.078, "step": 400 }, { "epoch": 0.9111111111111111, "grad_norm": 6.445341110229492, "learning_rate": 2.5074579658471266e-06, "loss": 0.7159, "step": 410 }, { "epoch": 0.9333333333333333, "grad_norm": 4.908728122711182, "learning_rate": 1.438663885441982e-06, "loss": 0.5581, "step": 420 }, { "epoch": 0.9555555555555556, "grad_norm": 3.629876136779785, "learning_rate": 6.61923394371039e-07, "loss": 0.5394, "step": 430 }, { "epoch": 0.9777777777777777, "grad_norm": 4.596356391906738, "learning_rate": 1.819079013423153e-07, "loss": 0.5405, "step": 440 }, { "epoch": 1.0, "grad_norm": 3.878979206085205, "learning_rate": 1.5042760116212861e-09, "loss": 0.4788, "step": 450 }, { "epoch": 1.0, "eval_loss": 0.5549002289772034, "eval_runtime": 1306.3502, "eval_samples_per_second": 0.306, "eval_steps_per_second": 0.077, "step": 450 }, { "epoch": 1.0, "step": 450, "total_flos": 2.2909672762220544e+16, "train_loss": 0.7819234402974446, "train_runtime": 36382.2657, "train_samples_per_second": 0.099, "train_steps_per_second": 0.012 }, { "epoch": 1.0, "step": 450, "total_flos": 2.2909672762220544e+16, "train_loss": 0.0, "train_runtime": 0.2352, "train_samples_per_second": 11479.607, "train_steps_per_second": 1432.825 }, { "epoch": 1.0, "step": 450, "total_flos": 2.2909672762220544e+16, "train_loss": 0.0, "train_runtime": 0.2432, "train_samples_per_second": 11101.818, "train_steps_per_second": 1385.671 }, { "epoch": 1.3644444444444446, "grad_norm": 6.853477954864502, "learning_rate": 4.4362464041245384e-05, "loss": 0.4727, "step": 460 }, { "epoch": 1.394074074074074, "grad_norm": 5.060462951660156, "learning_rate": 4.21673357748979e-05, "loss": 0.7008, "step": 470 }, { "epoch": 1.4237037037037037, "grad_norm": 3.702819585800171, "learning_rate": 3.998758359194028e-05, "loss": 0.4912, "step": 480 }, { "epoch": 1.4533333333333334, "grad_norm": 37.993465423583984, "learning_rate": 3.7827486502728574e-05, "loss": 0.6657, "step": 490 }, { "epoch": 1.482962962962963, "grad_norm": 11.906915664672852, "learning_rate": 3.5691284933255654e-05, "loss": 0.4878, "step": 500 }, { "epoch": 1.482962962962963, "eval_loss": 0.5222220420837402, "eval_runtime": 931.5337, "eval_samples_per_second": 0.322, "eval_steps_per_second": 0.081, "step": 500 }, { "epoch": 1.5125925925925925, "grad_norm": 5.83925724029541, "learning_rate": 3.358317240089008e-05, "loss": 0.4312, "step": 510 }, { "epoch": 1.5422222222222222, "grad_norm": 3.2574503421783447, "learning_rate": 3.150728728219966e-05, "loss": 0.5544, "step": 520 }, { "epoch": 1.5718518518518518, "grad_norm": 6.785427093505859, "learning_rate": 2.946770468902064e-05, "loss": 0.5278, "step": 530 }, { "epoch": 1.6014814814814815, "grad_norm": 48.51929473876953, "learning_rate": 2.7468428468719877e-05, "loss": 0.6246, "step": 540 }, { "epoch": 1.6311111111111112, "grad_norm": 2.2513532638549805, "learning_rate": 2.5513383344354467e-05, "loss": 0.5973, "step": 550 }, { "epoch": 1.6311111111111112, "eval_loss": 0.5203197002410889, "eval_runtime": 910.2398, "eval_samples_per_second": 0.33, "eval_steps_per_second": 0.082, "step": 550 }, { "epoch": 1.6607407407407409, "grad_norm": 5.722535133361816, "learning_rate": 2.3606407210158006e-05, "loss": 0.6404, "step": 560 }, { "epoch": 1.6903703703703705, "grad_norm": 6.939681053161621, "learning_rate": 2.175124359747806e-05, "loss": 0.5434, "step": 570 }, { "epoch": 1.72, "grad_norm": 6.644214630126953, "learning_rate": 1.9951534325954914e-05, "loss": 0.6884, "step": 580 }, { "epoch": 1.7496296296296296, "grad_norm": 4.961182117462158, "learning_rate": 1.82108123543675e-05, "loss": 0.459, "step": 590 }, { "epoch": 1.779259259259259, "grad_norm": 6.396653652191162, "learning_rate": 1.6532494845181155e-05, "loss": 0.5712, "step": 600 }, { "epoch": 1.779259259259259, "eval_loss": 0.5014171600341797, "eval_runtime": 902.4118, "eval_samples_per_second": 0.332, "eval_steps_per_second": 0.083, "step": 600 }, { "epoch": 1.8088888888888888, "grad_norm": 4.0901360511779785, "learning_rate": 1.4919876456411874e-05, "loss": 0.5857, "step": 610 }, { "epoch": 1.8385185185185184, "grad_norm": 5.865538597106934, "learning_rate": 1.3376122873975616e-05, "loss": 0.5479, "step": 620 }, { "epoch": 1.8681481481481481, "grad_norm": 4.197350025177002, "learning_rate": 1.1904264597219077e-05, "loss": 0.5741, "step": 630 }, { "epoch": 1.8977777777777778, "grad_norm": 12.017341613769531, "learning_rate": 1.0507190989831412e-05, "loss": 0.5084, "step": 640 }, { "epoch": 1.9274074074074075, "grad_norm": 3.7081458568573, "learning_rate": 9.187644607815498e-06, "loss": 0.6477, "step": 650 }, { "epoch": 1.9274074074074075, "eval_loss": 0.4961656630039215, "eval_runtime": 888.3568, "eval_samples_per_second": 0.338, "eval_steps_per_second": 0.084, "step": 650 }, { "epoch": 1.9570370370370371, "grad_norm": 2.705108165740967, "learning_rate": 7.948215815653149e-06, "loss": 0.4434, "step": 660 }, { "epoch": 1.9866666666666668, "grad_norm": 4.442301273345947, "learning_rate": 6.7913377012332694e-06, "loss": 0.6607, "step": 670 }, { "epoch": 2.017777777777778, "grad_norm": 3.787601947784424, "learning_rate": 5.719281299525331e-06, "loss": 0.5332, "step": 680 }, { "epoch": 2.0474074074074076, "grad_norm": 12.064045906066895, "learning_rate": 4.734151134374304e-06, "loss": 0.4921, "step": 690 }, { "epoch": 2.0770370370370372, "grad_norm": 3.7398955821990967, "learning_rate": 3.837881087168932e-06, "loss": 0.5258, "step": 700 }, { "epoch": 2.0770370370370372, "eval_loss": 0.4903838038444519, "eval_runtime": 946.841, "eval_samples_per_second": 0.317, "eval_steps_per_second": 0.079, "step": 700 }, { "epoch": 2.1066666666666665, "grad_norm": 5.209836006164551, "learning_rate": 3.0322306004934462e-06, "loss": 0.5301, "step": 710 }, { "epoch": 2.136296296296296, "grad_norm": 7.934685230255127, "learning_rate": 2.3187812242151995e-06, "loss": 0.3265, "step": 720 }, { "epoch": 2.165925925925926, "grad_norm": 7.0827789306640625, "learning_rate": 1.6989335107884862e-06, "loss": 0.5313, "step": 730 }, { "epoch": 2.1955555555555555, "grad_norm": 4.234689712524414, "learning_rate": 1.1739042658693078e-06, "loss": 0.4536, "step": 740 }, { "epoch": 2.225185185185185, "grad_norm": 2.486720561981201, "learning_rate": 7.447241596383381e-07, "loss": 0.4678, "step": 750 }, { "epoch": 2.225185185185185, "eval_loss": 0.48672306537628174, "eval_runtime": 907.8778, "eval_samples_per_second": 0.33, "eval_steps_per_second": 0.083, "step": 750 }, { "epoch": 2.254814814814815, "grad_norm": 3.3360986709594727, "learning_rate": 4.122357035211855e-07, "loss": 0.4285, "step": 760 }, { "epoch": 2.2844444444444445, "grad_norm": 5.732705116271973, "learning_rate": 1.7709159627787853e-07, "loss": 0.422, "step": 770 }, { "epoch": 2.314074074074074, "grad_norm": 3.294212579727173, "learning_rate": 3.975344270823467e-08, "loss": 0.4541, "step": 780 }, { "epoch": 2.3377777777777777, "step": 788, "total_flos": 4.05169174996992e+16, "train_loss": 0.22821334594397374, "train_runtime": 22612.2743, "train_samples_per_second": 0.279, "train_steps_per_second": 0.035 }, { "epoch": 1.7555555555555555, "grad_norm": 7.2509870529174805, "learning_rate": 3.500580638048163e-05, "loss": 0.8465, "step": 790 }, { "epoch": 1.7777777777777777, "grad_norm": 12.120731353759766, "learning_rate": 3.366679270419626e-05, "loss": 0.9164, "step": 800 }, { "epoch": 1.7777777777777777, "eval_loss": 0.542812168598175, "eval_runtime": 1320.8378, "eval_samples_per_second": 0.303, "eval_steps_per_second": 0.076, "step": 800 }, { "epoch": 1.8, "grad_norm": 13.869917869567871, "learning_rate": 3.2340767918386884e-05, "loss": 0.9066, "step": 810 }, { "epoch": 1.8222222222222222, "grad_norm": 18.34079933166504, "learning_rate": 3.102878653674449e-05, "loss": 1.1401, "step": 820 }, { "epoch": 1.8444444444444446, "grad_norm": 12.816385269165039, "learning_rate": 2.973189190502259e-05, "loss": 1.0268, "step": 830 }, { "epoch": 1.8666666666666667, "grad_norm": 12.571537017822266, "learning_rate": 2.84511153713223e-05, "loss": 1.3972, "step": 840 }, { "epoch": 1.8888888888888888, "grad_norm": 31.82439613342285, "learning_rate": 2.7187475465918765e-05, "loss": 1.2414, "step": 850 }, { "epoch": 1.8888888888888888, "eval_loss": 0.5595240592956543, "eval_runtime": 1193.2134, "eval_samples_per_second": 0.335, "eval_steps_per_second": 0.084, "step": 850 }, { "epoch": 1.911111111111111, "grad_norm": 13.308499336242676, "learning_rate": 2.594197709128061e-05, "loss": 1.3218, "step": 860 }, { "epoch": 1.9333333333333333, "grad_norm": 6.059114456176758, "learning_rate": 2.471561072292703e-05, "loss": 1.0453, "step": 870 }, { "epoch": 1.9555555555555557, "grad_norm": 13.917656898498535, "learning_rate": 2.3509351621757692e-05, "loss": 1.0651, "step": 880 }, { "epoch": 1.9777777777777779, "grad_norm": 9.584994316101074, "learning_rate": 2.2324159058482085e-05, "loss": 1.1164, "step": 890 }, { "epoch": 2.0, "grad_norm": 12.185356140136719, "learning_rate": 2.11609755507649e-05, "loss": 0.9255, "step": 900 }, { "epoch": 2.0, "eval_loss": 0.5336335897445679, "eval_runtime": 1209.6116, "eval_samples_per_second": 0.331, "eval_steps_per_second": 0.083, "step": 900 }, { "epoch": 2.022222222222222, "grad_norm": 9.4601411819458, "learning_rate": 2.0020726113694204e-05, "loss": 0.6626, "step": 910 }, { "epoch": 2.0444444444444443, "grad_norm": 7.833770751953125, "learning_rate": 1.8904317524168458e-05, "loss": 0.8076, "step": 920 }, { "epoch": 2.066666666666667, "grad_norm": 9.944056510925293, "learning_rate": 1.7812637599787297e-05, "loss": 0.826, "step": 930 }, { "epoch": 2.088888888888889, "grad_norm": 50.9095344543457, "learning_rate": 1.674655449281964e-05, "loss": 0.8201, "step": 940 }, { "epoch": 2.111111111111111, "grad_norm": 11.151677131652832, "learning_rate": 1.570691599981053e-05, "loss": 0.7905, "step": 950 }, { "epoch": 2.111111111111111, "eval_loss": 0.5313804149627686, "eval_runtime": 1190.18, "eval_samples_per_second": 0.336, "eval_steps_per_second": 0.084, "step": 950 }, { "epoch": 2.1333333333333333, "grad_norm": 19.227052688598633, "learning_rate": 1.4694548887375708e-05, "loss": 0.7963, "step": 960 }, { "epoch": 2.1555555555555554, "grad_norm": 3.875586748123169, "learning_rate": 1.3710258234720192e-05, "loss": 0.7642, "step": 970 }, { "epoch": 2.1777777777777776, "grad_norm": 15.085531234741211, "learning_rate": 1.2754826793403562e-05, "loss": 0.9614, "step": 980 }, { "epoch": 2.2, "grad_norm": 8.809530258178711, "learning_rate": 1.1829014364861251e-05, "loss": 0.8547, "step": 990 }, { "epoch": 2.2222222222222223, "grad_norm": 9.07629680633545, "learning_rate": 1.093355719617678e-05, "loss": 0.7664, "step": 1000 }, { "epoch": 2.2222222222222223, "eval_loss": 0.5206774473190308, "eval_runtime": 1181.1098, "eval_samples_per_second": 0.339, "eval_steps_per_second": 0.085, "step": 1000 }, { "epoch": 2.2444444444444445, "grad_norm": 6.77101993560791, "learning_rate": 1.006916739458535e-05, "loss": 0.693, "step": 1010 }, { "epoch": 2.2666666666666666, "grad_norm": 5.627523899078369, "learning_rate": 9.236532361174726e-06, "loss": 0.9059, "step": 1020 }, { "epoch": 2.2888888888888888, "grad_norm": 6.075342655181885, "learning_rate": 8.43631424423334e-06, "loss": 0.7724, "step": 1030 }, { "epoch": 2.311111111111111, "grad_norm": 12.213552474975586, "learning_rate": 7.669149412680605e-06, "loss": 0.9813, "step": 1040 }, { "epoch": 2.3333333333333335, "grad_norm": 4.757562637329102, "learning_rate": 6.93564794999823e-06, "loss": 1.2031, "step": 1050 }, { "epoch": 2.3333333333333335, "eval_loss": 0.5155333876609802, "eval_runtime": 1186.5765, "eval_samples_per_second": 0.337, "eval_steps_per_second": 0.084, "step": 1050 }, { "epoch": 2.3555555555555556, "grad_norm": 11.984197616577148, "learning_rate": 6.2363931690647195e-06, "loss": 1.0289, "step": 1060 }, { "epoch": 2.3777777777777778, "grad_norm": 6.141831874847412, "learning_rate": 5.571941148279081e-06, "loss": 0.9776, "step": 1070 }, { "epoch": 2.4, "grad_norm": 6.076663017272949, "learning_rate": 4.942820289342759e-06, "loss": 0.5992, "step": 1080 }, { "epoch": 2.422222222222222, "grad_norm": 113.96343994140625, "learning_rate": 4.349530897051047e-06, "loss": 0.875, "step": 1090 }, { "epoch": 2.4444444444444446, "grad_norm": 12.476838111877441, "learning_rate": 3.7925447814286087e-06, "loss": 0.848, "step": 1100 }, { "epoch": 2.4444444444444446, "eval_loss": 0.5155972838401794, "eval_runtime": 1256.8424, "eval_samples_per_second": 0.318, "eval_steps_per_second": 0.08, "step": 1100 }, { "epoch": 2.466666666666667, "grad_norm": 6.342575550079346, "learning_rate": 3.2723048825252177e-06, "loss": 0.876, "step": 1110 }, { "epoch": 2.488888888888889, "grad_norm": 5.154838562011719, "learning_rate": 2.7892249181701802e-06, "loss": 0.6484, "step": 1120 }, { "epoch": 2.511111111111111, "grad_norm": 14.073723793029785, "learning_rate": 2.343689054965592e-06, "loss": 0.7078, "step": 1130 }, { "epoch": 2.533333333333333, "grad_norm": 11.655536651611328, "learning_rate": 1.936051602780026e-06, "loss": 0.8106, "step": 1140 }, { "epoch": 2.5555555555555554, "grad_norm": 12.040849685668945, "learning_rate": 1.5666367329856046e-06, "loss": 0.736, "step": 1150 }, { "epoch": 2.5555555555555554, "eval_loss": 0.5151739716529846, "eval_runtime": 1180.2607, "eval_samples_per_second": 0.339, "eval_steps_per_second": 0.085, "step": 1150 }, { "epoch": 2.5777777777777775, "grad_norm": 4.713861465454102, "learning_rate": 1.2357382206625801e-06, "loss": 0.9703, "step": 1160 }, { "epoch": 2.6, "grad_norm": 16.40427589416504, "learning_rate": 9.436192109763376e-07, "loss": 0.8726, "step": 1170 }, { "epoch": 2.6222222222222222, "grad_norm": 26.21332359313965, "learning_rate": 6.90512009912725e-07, "loss": 0.7543, "step": 1180 }, { "epoch": 2.6444444444444444, "grad_norm": 10.207490921020508, "learning_rate": 4.766178995379955e-07, "loss": 1.0878, "step": 1190 }, { "epoch": 2.6666666666666665, "grad_norm": 17.372835159301758, "learning_rate": 3.0210697793044975e-07, "loss": 0.6169, "step": 1200 }, { "epoch": 2.6666666666666665, "eval_loss": 0.5134466290473938, "eval_runtime": 1169.2654, "eval_samples_per_second": 0.342, "eval_steps_per_second": 0.086, "step": 1200 }, { "epoch": 2.688888888888889, "grad_norm": 78.36367797851562, "learning_rate": 1.671180239108172e-07, "loss": 0.9518, "step": 1210 }, { "epoch": 2.7111111111111112, "grad_norm": 7.34134578704834, "learning_rate": 7.175838667927148e-08, "loss": 0.6888, "step": 1220 }, { "epoch": 2.7333333333333334, "grad_norm": 7.477999687194824, "learning_rate": 1.6103900446534648e-08, "loss": 0.7221, "step": 1230 }, { "epoch": 2.7511111111111113, "step": 1238, "total_flos": 6.388579291468186e+16, "train_loss": 0.3261277918284082, "train_runtime": 34829.2458, "train_samples_per_second": 0.284, "train_steps_per_second": 0.036 }, { "epoch": 3.66962962962963, "grad_norm": 14.275522232055664, "learning_rate": 1.3300797847207797e-05, "loss": 3.5621, "step": 1240 }, { "epoch": 3.699259259259259, "grad_norm": 27.858943939208984, "learning_rate": 1.2557515699430094e-05, "loss": 4.3815, "step": 1250 }, { "epoch": 3.699259259259259, "eval_loss": 2.2658419609069824, "eval_runtime": 995.7526, "eval_samples_per_second": 0.301, "eval_steps_per_second": 0.075, "step": 1250 }, { "epoch": 3.728888888888889, "grad_norm": 30.557031631469727, "learning_rate": 1.1832611379355878e-05, "loss": 3.2056, "step": 1260 }, { "epoch": 3.7585185185185184, "grad_norm": 34.28306579589844, "learning_rate": 1.1126440690477996e-05, "loss": 2.8957, "step": 1270 }, { "epoch": 3.788148148148148, "grad_norm": 29.017297744750977, "learning_rate": 1.0439350241294566e-05, "loss": 2.5225, "step": 1280 }, { "epoch": 3.8177777777777777, "grad_norm": 23.32266616821289, "learning_rate": 9.771677275183744e-06, "loss": 2.6028, "step": 1290 }, { "epoch": 3.8474074074074074, "grad_norm": 32.830848693847656, "learning_rate": 9.123749504875135e-06, "loss": 2.7177, "step": 1300 }, { "epoch": 3.8474074074074074, "eval_loss": 1.3522464036941528, "eval_runtime": 985.7859, "eval_samples_per_second": 0.304, "eval_steps_per_second": 0.076, "step": 1300 }, { "epoch": 3.877037037037037, "grad_norm": 6.538234233856201, "learning_rate": 8.495884951599142e-06, "loss": 2.2624, "step": 1310 }, { "epoch": 3.9066666666666667, "grad_norm": 19.523771286010742, "learning_rate": 7.888391788993216e-06, "loss": 2.6275, "step": 1320 }, { "epoch": 3.9362962962962964, "grad_norm": 11.971488952636719, "learning_rate": 7.301568191841457e-06, "loss": 2.1496, "step": 1330 }, { "epoch": 3.965925925925926, "grad_norm": 34.24433898925781, "learning_rate": 6.735702189722115e-06, "loss": 2.0774, "step": 1340 }, { "epoch": 3.9955555555555557, "grad_norm": 12.619851112365723, "learning_rate": 6.191071525634456e-06, "loss": 2.0749, "step": 1350 }, { "epoch": 3.9955555555555557, "eval_loss": 1.2665727138519287, "eval_runtime": 972.1433, "eval_samples_per_second": 0.309, "eval_steps_per_second": 0.077, "step": 1350 }, { "epoch": 4.026666666666666, "grad_norm": 21.63642692565918, "learning_rate": 5.667943519674723e-06, "loss": 2.2795, "step": 1360 }, { "epoch": 4.0562962962962965, "grad_norm": 5.838581562042236, "learning_rate": 5.166574937827867e-06, "loss": 2.6146, "step": 1370 }, { "epoch": 4.085925925925926, "grad_norm": 11.008721351623535, "learning_rate": 4.687211865939539e-06, "loss": 2.3045, "step": 1380 }, { "epoch": 4.115555555555556, "grad_norm": 6.246650218963623, "learning_rate": 4.2300895889302805e-06, "loss": 1.823, "step": 1390 }, { "epoch": 4.145185185185185, "grad_norm": 13.782442092895508, "learning_rate": 3.7954324753109673e-06, "loss": 2.2982, "step": 1400 }, { "epoch": 4.145185185185185, "eval_loss": 1.2098972797393799, "eval_runtime": 998.8662, "eval_samples_per_second": 0.3, "eval_steps_per_second": 0.075, "step": 1400 }, { "epoch": 4.174814814814815, "grad_norm": 11.179134368896484, "learning_rate": 3.383453867056452e-06, "loss": 2.5618, "step": 1410 }, { "epoch": 4.204444444444444, "grad_norm": 73.97550201416016, "learning_rate": 2.9943559748912996e-06, "loss": 1.8831, "step": 1420 }, { "epoch": 4.234074074074074, "grad_norm": 17.907745361328125, "learning_rate": 2.628329779039057e-06, "loss": 2.2352, "step": 1430 }, { "epoch": 4.263703703703704, "grad_norm": 81.71790313720703, "learning_rate": 2.2855549354837912e-06, "loss": 2.1651, "step": 1440 }, { "epoch": 4.293333333333333, "grad_norm": 10.33467960357666, "learning_rate": 1.9661996877898105e-06, "loss": 1.7595, "step": 1450 }, { "epoch": 4.293333333333333, "eval_loss": 1.1622637510299683, "eval_runtime": 993.3397, "eval_samples_per_second": 0.302, "eval_steps_per_second": 0.076, "step": 1450 }, { "epoch": 4.322962962962963, "grad_norm": 40.43919372558594, "learning_rate": 1.6704207845230358e-06, "loss": 1.9304, "step": 1460 }, { "epoch": 4.352592592592592, "grad_norm": 10.497286796569824, "learning_rate": 1.3983634023143511e-06, "loss": 2.098, "step": 1470 }, { "epoch": 4.3822222222222225, "grad_norm": 9.101359367370605, "learning_rate": 1.1501610746028124e-06, "loss": 1.8441, "step": 1480 }, { "epoch": 4.411851851851852, "grad_norm": 20.517807006835938, "learning_rate": 9.25935626093688e-07, "loss": 2.3551, "step": 1490 }, { "epoch": 4.441481481481482, "grad_norm": 7.981099605560303, "learning_rate": 7.257971129634389e-07, "loss": 1.6124, "step": 1500 }, { "epoch": 4.441481481481482, "eval_loss": 1.1480356454849243, "eval_runtime": 970.9195, "eval_samples_per_second": 0.309, "eval_steps_per_second": 0.077, "step": 1500 }, { "epoch": 4.471111111111111, "grad_norm": 51.19599533081055, "learning_rate": 5.498437688410463e-07, "loss": 2.0946, "step": 1510 }, { "epoch": 4.50074074074074, "grad_norm": 7.847194671630859, "learning_rate": 3.981619565921968e-07, "loss": 1.8896, "step": 1520 }, { "epoch": 4.53037037037037, "grad_norm": 12.63452434539795, "learning_rate": 2.708261259299072e-07, "loss": 2.1132, "step": 1530 }, { "epoch": 4.5600000000000005, "grad_norm": 8.711173057556152, "learning_rate": 1.6789877687254928e-07, "loss": 1.9074, "step": 1540 }, { "epoch": 4.58962962962963, "grad_norm": 14.014768600463867, "learning_rate": 8.943042906705001e-08, "loss": 2.4591, "step": 1550 }, { "epoch": 4.58962962962963, "eval_loss": 1.1526756286621094, "eval_runtime": 1013.0536, "eval_samples_per_second": 0.296, "eval_steps_per_second": 0.074, "step": 1550 }, { "epoch": 4.619259259259259, "grad_norm": 241.5323486328125, "learning_rate": 3.545959699243207e-08, "loss": 1.9968, "step": 1560 }, { "epoch": 4.648888888888889, "grad_norm": 41.02328109741211, "learning_rate": 6.0127710558133265e-09, "loss": 1.9328, "step": 1570 }, { "epoch": 4.666666666666667, "step": 1576, "total_flos": 8.15036810717184e+16, "train_loss": 0.49436442077462445, "train_runtime": 26325.3193, "train_samples_per_second": 0.479, "train_steps_per_second": 0.06 }, { "epoch": 3.511111111111111, "grad_norm": 11.290818214416504, "learning_rate": 1.4115578944331131e-05, "loss": 4.0951, "step": 1580 }, { "epoch": 3.533333333333333, "grad_norm": 29.64479637145996, "learning_rate": 1.3520911423383454e-05, "loss": 5.0902, "step": 1590 }, { "epoch": 3.5555555555555554, "grad_norm": 12.257214546203613, "learning_rate": 1.2937077174225081e-05, "loss": 3.8541, "step": 1600 }, { "epoch": 3.5555555555555554, "eval_loss": 1.5511490106582642, "eval_runtime": 1129.8462, "eval_samples_per_second": 0.354, "eval_steps_per_second": 0.089, "step": 1600 }, { "epoch": 3.5777777777777775, "grad_norm": 7.66765832901001, "learning_rate": 1.2364249579342985e-05, "loss": 2.9561, "step": 1610 }, { "epoch": 3.6, "grad_norm": 5.419583797454834, "learning_rate": 1.1802598752554878e-05, "loss": 2.4979, "step": 1620 }, { "epoch": 3.6222222222222222, "grad_norm": 10.220062255859375, "learning_rate": 1.125229148849008e-05, "loss": 2.1119, "step": 1630 }, { "epoch": 3.6444444444444444, "grad_norm": 10.050875663757324, "learning_rate": 1.071349121305622e-05, "loss": 2.2186, "step": 1640 }, { "epoch": 3.6666666666666665, "grad_norm": 13.71487045288086, "learning_rate": 1.018635793490621e-05, "loss": 2.6291, "step": 1650 }, { "epoch": 3.6666666666666665, "eval_loss": 1.2097514867782593, "eval_runtime": 1135.3798, "eval_samples_per_second": 0.352, "eval_steps_per_second": 0.088, "step": 1650 }, { "epoch": 3.688888888888889, "grad_norm": 14.815037727355957, "learning_rate": 9.671048197920247e-06, "loss": 2.5383, "step": 1660 }, { "epoch": 3.7111111111111112, "grad_norm": 13.783255577087402, "learning_rate": 9.167715034716606e-06, "loss": 2.6482, "step": 1670 }, { "epoch": 3.7333333333333334, "grad_norm": 9.30642032623291, "learning_rate": 8.676507921205162e-06, "loss": 2.5038, "step": 1680 }, { "epoch": 3.7555555555555555, "grad_norm": 7.022140026092529, "learning_rate": 8.197572732197322e-06, "loss": 2.1227, "step": 1690 }, { "epoch": 3.7777777777777777, "grad_norm": 16.499279022216797, "learning_rate": 7.731051698085162e-06, "loss": 2.144, "step": 1700 }, { "epoch": 3.7777777777777777, "eval_loss": 1.1110306978225708, "eval_runtime": 1124.1297, "eval_samples_per_second": 0.356, "eval_steps_per_second": 0.089, "step": 1700 }, { "epoch": 3.8, "grad_norm": 8.783102989196777, "learning_rate": 7.277083362603099e-06, "loss": 2.2054, "step": 1710 }, { "epoch": 3.822222222222222, "grad_norm": 17.862638473510742, "learning_rate": 6.835802541684117e-06, "loss": 2.6757, "step": 1720 }, { "epoch": 3.8444444444444446, "grad_norm": 10.636580467224121, "learning_rate": 6.407340283423324e-06, "loss": 2.4811, "step": 1730 }, { "epoch": 3.8666666666666667, "grad_norm": 16.785629272460938, "learning_rate": 5.9918238291602145e-06, "loss": 3.0367, "step": 1740 }, { "epoch": 3.888888888888889, "grad_norm": 18.506656646728516, "learning_rate": 5.589376575691652e-06, "loss": 2.6143, "step": 1750 }, { "epoch": 3.888888888888889, "eval_loss": 1.0668652057647705, "eval_runtime": 1135.4155, "eval_samples_per_second": 0.352, "eval_steps_per_second": 0.088, "step": 1750 }, { "epoch": 3.911111111111111, "grad_norm": 33.2259635925293, "learning_rate": 5.200118038626389e-06, "loss": 2.0509, "step": 1760 }, { "epoch": 3.9333333333333336, "grad_norm": 13.904667854309082, "learning_rate": 4.824163816892241e-06, "loss": 2.0153, "step": 1770 }, { "epoch": 3.9555555555555557, "grad_norm": 16.34776496887207, "learning_rate": 4.46162555840653e-06, "loss": 2.1135, "step": 1780 }, { "epoch": 3.977777777777778, "grad_norm": 9.64548397064209, "learning_rate": 4.112610926919663e-06, "loss": 1.8388, "step": 1790 }, { "epoch": 4.0, "grad_norm": 14.741687774658203, "learning_rate": 3.777223570042082e-06, "loss": 1.8233, "step": 1800 }, { "epoch": 4.0, "eval_loss": 1.015744686126709, "eval_runtime": 1124.2011, "eval_samples_per_second": 0.356, "eval_steps_per_second": 0.089, "step": 1800 }, { "epoch": 4.022222222222222, "grad_norm": 17.629962921142578, "learning_rate": 3.455563088463737e-06, "loss": 1.8195, "step": 1810 }, { "epoch": 4.044444444444444, "grad_norm": 10.74545669555664, "learning_rate": 3.1477250063755403e-06, "loss": 1.7446, "step": 1820 }, { "epoch": 4.066666666666666, "grad_norm": 14.61141586303711, "learning_rate": 2.853800743101265e-06, "loss": 1.7361, "step": 1830 }, { "epoch": 4.088888888888889, "grad_norm": 17.42447853088379, "learning_rate": 2.573877585948642e-06, "loss": 1.7683, "step": 1840 }, { "epoch": 4.111111111111111, "grad_norm": 12.99378776550293, "learning_rate": 2.308038664287371e-06, "loss": 1.8956, "step": 1850 }, { "epoch": 4.111111111111111, "eval_loss": 0.9937378168106079, "eval_runtime": 1121.1964, "eval_samples_per_second": 0.357, "eval_steps_per_second": 0.089, "step": 1850 }, { "epoch": 4.133333333333334, "grad_norm": 6.575343608856201, "learning_rate": 2.056362924862121e-06, "loss": 1.7616, "step": 1860 }, { "epoch": 4.155555555555556, "grad_norm": 7.720376491546631, "learning_rate": 1.8189251083474469e-06, "loss": 1.9345, "step": 1870 }, { "epoch": 4.177777777777778, "grad_norm": 16.67999267578125, "learning_rate": 1.5957957271519553e-06, "loss": 1.7756, "step": 1880 }, { "epoch": 4.2, "grad_norm": 25.952878952026367, "learning_rate": 1.3870410444780824e-06, "loss": 1.9684, "step": 1890 }, { "epoch": 4.222222222222222, "grad_norm": 13.947036743164062, "learning_rate": 1.1927230546437406e-06, "loss": 2.4727, "step": 1900 }, { "epoch": 4.222222222222222, "eval_loss": 0.9845434427261353, "eval_runtime": 1121.7839, "eval_samples_per_second": 0.357, "eval_steps_per_second": 0.089, "step": 1900 }, { "epoch": 4.2444444444444445, "grad_norm": 7.728816509246826, "learning_rate": 1.0128994646717683e-06, "loss": 2.5575, "step": 1910 }, { "epoch": 4.266666666666667, "grad_norm": 7.709161758422852, "learning_rate": 8.476236771525259e-07, "loss": 2.0874, "step": 1920 }, { "epoch": 4.288888888888889, "grad_norm": 12.649147033691406, "learning_rate": 6.969447743848501e-07, "loss": 2.1343, "step": 1930 }, { "epoch": 4.311111111111111, "grad_norm": 14.208868980407715, "learning_rate": 5.6090750379994e-07, "loss": 1.6593, "step": 1940 }, { "epoch": 4.333333333333333, "grad_norm": 113.40324401855469, "learning_rate": 4.395522646726491e-07, "loss": 1.877, "step": 1950 }, { "epoch": 4.333333333333333, "eval_loss": 0.9760661125183105, "eval_runtime": 1128.6817, "eval_samples_per_second": 0.354, "eval_steps_per_second": 0.089, "step": 1950 }, { "epoch": 4.355555555555555, "grad_norm": 8.278970718383789, "learning_rate": 3.329150961240146e-07, "loss": 2.4419, "step": 1960 }, { "epoch": 4.377777777777778, "grad_norm": 9.493860244750977, "learning_rate": 2.410276664186473e-07, "loss": 1.9977, "step": 1970 }, { "epoch": 4.4, "grad_norm": 12.72214412689209, "learning_rate": 1.6391726356013158e-07, "loss": 1.996, "step": 1980 }, { "epoch": 4.4222222222222225, "grad_norm": 16.340652465820312, "learning_rate": 1.0160678718726945e-07, "loss": 1.8244, "step": 1990 }, { "epoch": 4.444444444444445, "grad_norm": 5.685306549072266, "learning_rate": 5.411474177349218e-08, "loss": 1.3904, "step": 2000 }, { "epoch": 4.444444444444445, "eval_loss": 0.9806169271469116, "eval_runtime": 1118.8009, "eval_samples_per_second": 0.358, "eval_steps_per_second": 0.089, "step": 2000 }, { "epoch": 4.466666666666667, "grad_norm": 16.570466995239258, "learning_rate": 2.145523113160075e-08, "loss": 1.6432, "step": 2010 }, { "epoch": 4.488888888888889, "grad_norm": 12.792643547058105, "learning_rate": 3.637954225266249e-09, "loss": 1.7129, "step": 2020 }, { "epoch": 4.502222222222223, "step": 2026, "total_flos": 1.0486716012367872e+17, "train_loss": 0.5005418875375128, "train_runtime": 32217.358, "train_samples_per_second": 0.503, "train_steps_per_second": 0.063 }, { "epoch": 6.0088888888888885, "grad_norm": 14.330232620239258, "learning_rate": 5.99674741500138e-06, "loss": 4.366, "step": 2030 }, { "epoch": 6.038518518518519, "grad_norm": 9.551030158996582, "learning_rate": 5.6508792702119225e-06, "loss": 4.9312, "step": 2040 }, { "epoch": 6.068148148148148, "grad_norm": 11.4534273147583, "learning_rate": 5.314685922932666e-06, "loss": 4.5235, "step": 2050 }, { "epoch": 6.068148148148148, "eval_loss": 2.373319625854492, "eval_runtime": 894.95, "eval_samples_per_second": 0.335, "eval_steps_per_second": 0.084, "step": 2050 }, { "epoch": 6.097777777777778, "grad_norm": 11.584074020385742, "learning_rate": 4.988240714021464e-06, "loss": 3.6575, "step": 2060 }, { "epoch": 6.127407407407407, "grad_norm": 12.21071720123291, "learning_rate": 4.671614857771684e-06, "loss": 4.4115, "step": 2070 }, { "epoch": 6.157037037037037, "grad_norm": 7.9541168212890625, "learning_rate": 4.364877426376762e-06, "loss": 3.4241, "step": 2080 }, { "epoch": 6.1866666666666665, "grad_norm": 15.817623138427734, "learning_rate": 4.068095334862038e-06, "loss": 2.7395, "step": 2090 }, { "epoch": 6.216296296296297, "grad_norm": 11.574224472045898, "learning_rate": 3.781333326487202e-06, "loss": 2.9356, "step": 2100 }, { "epoch": 6.216296296296297, "eval_loss": 1.5084153413772583, "eval_runtime": 901.7323, "eval_samples_per_second": 0.333, "eval_steps_per_second": 0.083, "step": 2100 }, { "epoch": 6.245925925925926, "grad_norm": 22.12990951538086, "learning_rate": 3.504653958622456e-06, "loss": 3.6526, "step": 2110 }, { "epoch": 6.275555555555556, "grad_norm": 36.937110900878906, "learning_rate": 3.238117589101658e-06, "loss": 2.135, "step": 2120 }, { "epoch": 6.305185185185185, "grad_norm": 14.819047927856445, "learning_rate": 2.981782363055108e-06, "loss": 2.9432, "step": 2130 }, { "epoch": 6.3348148148148145, "grad_norm": 19.707664489746094, "learning_rate": 2.7357042002251976e-06, "loss": 2.6471, "step": 2140 }, { "epoch": 6.364444444444445, "grad_norm": 15.465389251708984, "learning_rate": 2.4999367827674756e-06, "loss": 2.3315, "step": 2150 }, { "epoch": 6.364444444444445, "eval_loss": 1.438122272491455, "eval_runtime": 901.0131, "eval_samples_per_second": 0.333, "eval_steps_per_second": 0.083, "step": 2150 }, { "epoch": 6.394074074074074, "grad_norm": 28.68686294555664, "learning_rate": 2.274531543539815e-06, "loss": 2.586, "step": 2160 }, { "epoch": 6.423703703703704, "grad_norm": 22.25936508178711, "learning_rate": 2.0595376548823097e-06, "loss": 3.1009, "step": 2170 }, { "epoch": 6.453333333333333, "grad_norm": 12.523676872253418, "learning_rate": 1.8550020178902727e-06, "loss": 1.9499, "step": 2180 }, { "epoch": 6.482962962962963, "grad_norm": 146.981201171875, "learning_rate": 1.6609692521827424e-06, "loss": 2.565, "step": 2190 }, { "epoch": 6.5125925925925925, "grad_norm": 17.379535675048828, "learning_rate": 1.4774816861686636e-06, "loss": 2.7072, "step": 2200 }, { "epoch": 6.5125925925925925, "eval_loss": 1.3992533683776855, "eval_runtime": 915.4557, "eval_samples_per_second": 0.328, "eval_steps_per_second": 0.082, "step": 2200 }, { "epoch": 6.542222222222223, "grad_norm": 9.735651969909668, "learning_rate": 1.304579347812912e-06, "loss": 2.3235, "step": 2210 }, { "epoch": 6.571851851851852, "grad_norm": 9.481500625610352, "learning_rate": 1.1422999559041581e-06, "loss": 2.4865, "step": 2220 }, { "epoch": 6.601481481481482, "grad_norm": 7.604280948638916, "learning_rate": 9.90678911826487e-07, "loss": 2.3393, "step": 2230 }, { "epoch": 6.631111111111111, "grad_norm": 24.429338455200195, "learning_rate": 8.497492918365602e-07, "loss": 2.2012, "step": 2240 }, { "epoch": 6.66074074074074, "grad_norm": 15.720512390136719, "learning_rate": 7.195418398479925e-07, "loss": 2.6741, "step": 2250 }, { "epoch": 6.66074074074074, "eval_loss": 1.3970364332199097, "eval_runtime": 932.9152, "eval_samples_per_second": 0.322, "eval_steps_per_second": 0.08, "step": 2250 }, { "epoch": 6.6903703703703705, "grad_norm": 10.252752304077148, "learning_rate": 6.00084960724534e-07, "loss": 2.4081, "step": 2260 }, { "epoch": 6.72, "grad_norm": 17.19991111755371, "learning_rate": 4.914047140835653e-07, "loss": 2.4516, "step": 2270 }, { "epoch": 6.74962962962963, "grad_norm": 23.26289176940918, "learning_rate": 3.935248086111176e-07, "loss": 2.5946, "step": 2280 }, { "epoch": 6.779259259259259, "grad_norm": 12.669425964355469, "learning_rate": 3.064665968898428e-07, "loss": 2.804, "step": 2290 }, { "epoch": 6.808888888888889, "grad_norm": 22.176555633544922, "learning_rate": 2.3024907074091772e-07, "loss": 2.6944, "step": 2300 }, { "epoch": 6.808888888888889, "eval_loss": 1.3712869882583618, "eval_runtime": 889.4139, "eval_samples_per_second": 0.337, "eval_steps_per_second": 0.084, "step": 2300 }, { "epoch": 6.838518518518518, "grad_norm": 22.525259017944336, "learning_rate": 1.6488885708094705e-07, "loss": 2.3312, "step": 2310 }, { "epoch": 6.868148148148148, "grad_norm": 9.424576759338379, "learning_rate": 1.1040021429480907e-07, "loss": 2.1776, "step": 2320 }, { "epoch": 6.897777777777778, "grad_norm": 33.19387435913086, "learning_rate": 6.679502912517732e-08, "loss": 2.3443, "step": 2330 }, { "epoch": 6.927407407407408, "grad_norm": 27.716110229492188, "learning_rate": 3.408281407939473e-08, "loss": 2.2612, "step": 2340 }, { "epoch": 6.957037037037037, "grad_norm": 17.70338249206543, "learning_rate": 1.2270705354333612e-08, "loss": 2.4162, "step": 2350 }, { "epoch": 6.957037037037037, "eval_loss": 1.3736646175384521, "eval_runtime": 876.3782, "eval_samples_per_second": 0.342, "eval_steps_per_second": 0.086, "step": 2350 }, { "epoch": 6.986666666666666, "grad_norm": 18.5972957611084, "learning_rate": 1.3634612796298295e-09, "loss": 2.4499, "step": 2360 } ], "logging_steps": 10, "max_steps": 2364, "num_input_tokens_seen": 0, "num_train_epochs": 7, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.225036049440727e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }