{
  "best_global_step": 750,
  "best_metric": 0.48672306537628174,
  "best_model_checkpoint": "./qwen2.5-vl-finetune-checkpoints/checkpoint-750",
  "epoch": 6.998518518518519,
  "eval_steps": 50,
  "global_step": 2364,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.022222222222222223,
      "grad_norm": 5.334873676300049,
      "learning_rate": 2e-05,
      "loss": 2.417,
      "step": 10
    },
    {
      "epoch": 0.044444444444444446,
      "grad_norm": 6.264868259429932,
      "learning_rate": 4.222222222222222e-05,
      "loss": 1.7873,
      "step": 20
    },
    {
      "epoch": 0.06666666666666667,
      "grad_norm": 9.44924259185791,
      "learning_rate": 6.444444444444446e-05,
      "loss": 1.2221,
      "step": 30
    },
    {
      "epoch": 0.08888888888888889,
      "grad_norm": 24.675010681152344,
      "learning_rate": 8.666666666666667e-05,
      "loss": 0.8489,
      "step": 40
    },
    {
      "epoch": 0.1111111111111111,
      "grad_norm": 7.224123954772949,
      "learning_rate": 9.997593339404756e-05,
      "loss": 0.9777,
      "step": 50
    },
    {
      "epoch": 0.1111111111111111,
      "eval_loss": 1.0407381057739258,
      "eval_runtime": 1366.0737,
      "eval_samples_per_second": 0.293,
      "eval_steps_per_second": 0.073,
      "step": 50
    },
    {
      "epoch": 0.13333333333333333,
      "grad_norm": 3.4162638187408447,
      "learning_rate": 9.970545007734807e-05,
      "loss": 0.867,
      "step": 60
    },
    {
      "epoch": 0.15555555555555556,
      "grad_norm": 8.475242614746094,
      "learning_rate": 9.913603233532067e-05,
      "loss": 0.8114,
      "step": 70
    },
    {
      "epoch": 0.17777777777777778,
      "grad_norm": 4.519033908843994,
      "learning_rate": 9.82711047132661e-05,
      "loss": 1.0957,
      "step": 80
    },
    {
      "epoch": 0.2,
      "grad_norm": 9.79876708984375,
      "learning_rate": 9.711586898767462e-05,
      "loss": 0.9682,
      "step": 90
    },
    {
      "epoch": 0.2222222222222222,
      "grad_norm": 4.599425315856934,
      "learning_rate": 9.567727288213005e-05,
      "loss": 0.8787,
      "step": 100
    },
    {
      "epoch": 0.2222222222222222,
      "eval_loss": 0.8106439113616943,
      "eval_runtime": 1315.3239,
      "eval_samples_per_second": 0.304,
      "eval_steps_per_second": 0.076,
      "step": 100
    },
    {
      "epoch": 0.24444444444444444,
      "grad_norm": 8.550354957580566,
      "learning_rate": 9.396396828288272e-05,
      "loss": 0.7664,
      "step": 110
    },
    {
      "epoch": 0.26666666666666666,
      "grad_norm": 16.60089874267578,
      "learning_rate": 9.19862592053875e-05,
      "loss": 0.8729,
      "step": 120
    },
    {
      "epoch": 0.28888888888888886,
      "grad_norm": 7.3388495445251465,
      "learning_rate": 8.97560398247424e-05,
      "loss": 0.6473,
      "step": 130
    },
    {
      "epoch": 0.3111111111111111,
      "grad_norm": 4.2113847732543945,
      "learning_rate": 8.728672294272008e-05,
      "loss": 0.7883,
      "step": 140
    },
    {
      "epoch": 0.3333333333333333,
      "grad_norm": 9.767938613891602,
      "learning_rate": 8.459315932159979e-05,
      "loss": 0.9219,
      "step": 150
    },
    {
      "epoch": 0.3333333333333333,
      "eval_loss": 0.7609114050865173,
      "eval_runtime": 1300.5726,
      "eval_samples_per_second": 0.308,
      "eval_steps_per_second": 0.077,
      "step": 150
    },
    {
      "epoch": 0.35555555555555557,
      "grad_norm": 4.818809986114502,
      "learning_rate": 8.169154836993551e-05,
      "loss": 0.6935,
      "step": 160
    },
    {
      "epoch": 0.37777777777777777,
      "grad_norm": 9.263418197631836,
      "learning_rate": 7.859934071740692e-05,
      "loss": 0.6154,
      "step": 170
    },
    {
      "epoch": 0.4,
      "grad_norm": 5.390202522277832,
      "learning_rate": 7.533513326467911e-05,
      "loss": 0.7336,
      "step": 180
    },
    {
      "epoch": 0.4222222222222222,
      "grad_norm": 6.1149373054504395,
      "learning_rate": 7.191855733945387e-05,
      "loss": 0.6445,
      "step": 190
    },
    {
      "epoch": 0.4444444444444444,
      "grad_norm": 28.822879791259766,
      "learning_rate": 6.837016063135491e-05,
      "loss": 0.6949,
      "step": 200
    },
    {
      "epoch": 0.4444444444444444,
      "eval_loss": 0.7009051442146301,
      "eval_runtime": 1304.5098,
      "eval_samples_per_second": 0.307,
      "eval_steps_per_second": 0.077,
      "step": 200
    },
    {
      "epoch": 0.4666666666666667,
      "grad_norm": 4.288070201873779,
      "learning_rate": 6.471128361570476e-05,
      "loss": 0.6743,
      "step": 210
    },
    {
      "epoch": 0.4888888888888889,
      "grad_norm": 2.8349318504333496,
      "learning_rate": 6.096393120939516e-05,
      "loss": 0.7858,
      "step": 220
    },
    {
      "epoch": 0.5111111111111111,
      "grad_norm": 4.94554328918457,
      "learning_rate": 5.715064043072771e-05,
      "loss": 0.5722,
      "step": 230
    },
    {
      "epoch": 0.5333333333333333,
      "grad_norm": 4.504173278808594,
      "learning_rate": 5.329434485913393e-05,
      "loss": 0.8959,
      "step": 240
    },
    {
      "epoch": 0.5555555555555556,
      "grad_norm": 6.013023376464844,
      "learning_rate": 4.941823670993016e-05,
      "loss": 0.7088,
      "step": 250
    },
    {
      "epoch": 0.5555555555555556,
      "eval_loss": 0.6455658674240112,
      "eval_runtime": 1299.4246,
      "eval_samples_per_second": 0.308,
      "eval_steps_per_second": 0.077,
      "step": 250
    },
    {
      "epoch": 0.5777777777777777,
      "grad_norm": 4.269768238067627,
      "learning_rate": 4.55456273536057e-05,
      "loss": 0.7,
      "step": 260
    },
    {
      "epoch": 0.6,
      "grad_norm": 3.1131203174591064,
      "learning_rate": 4.169980711849781e-05,
      "loss": 0.5789,
      "step": 270
    },
    {
      "epoch": 0.6222222222222222,
      "grad_norm": 3.8976268768310547,
      "learning_rate": 3.790390522001662e-05,
      "loss": 0.5789,
      "step": 280
    },
    {
      "epoch": 0.6444444444444445,
      "grad_norm": 3.225245714187622,
      "learning_rate": 3.418075065882217e-05,
      "loss": 0.5838,
      "step": 290
    },
    {
      "epoch": 0.6666666666666666,
      "grad_norm": 4.711687088012695,
      "learning_rate": 3.0552734924528306e-05,
      "loss": 0.6903,
      "step": 300
    },
    {
      "epoch": 0.6666666666666666,
      "eval_loss": 0.5962206125259399,
      "eval_runtime": 1288.7617,
      "eval_samples_per_second": 0.31,
      "eval_steps_per_second": 0.078,
      "step": 300
    },
    {
      "epoch": 0.6888888888888889,
      "grad_norm": 4.992547512054443,
      "learning_rate": 2.7041677330649407e-05,
      "loss": 0.6936,
      "step": 310
    },
    {
      "epoch": 0.7111111111111111,
      "grad_norm": 9.838484764099121,
      "learning_rate": 2.3668693790681634e-05,
      "loss": 0.7052,
      "step": 320
    },
    {
      "epoch": 0.7333333333333333,
      "grad_norm": 8.834742546081543,
      "learning_rate": 2.0454069824514444e-05,
      "loss": 0.7329,
      "step": 330
    },
    {
      "epoch": 0.7555555555555555,
      "grad_norm": 5.83026123046875,
      "learning_rate": 1.7417138558927244e-05,
      "loss": 0.5383,
      "step": 340
    },
    {
      "epoch": 0.7777777777777778,
      "grad_norm": 7.011873722076416,
      "learning_rate": 1.4576164455890013e-05,
      "loss": 0.5669,
      "step": 350
    },
    {
      "epoch": 0.7777777777777778,
      "eval_loss": 0.5696190595626831,
      "eval_runtime": 1284.3616,
      "eval_samples_per_second": 0.311,
      "eval_steps_per_second": 0.078,
      "step": 350
    },
    {
      "epoch": 0.8,
      "grad_norm": 4.988184452056885,
      "learning_rate": 1.194823346793998e-05,
      "loss": 0.523,
      "step": 360
    },
    {
      "epoch": 0.8222222222222222,
      "grad_norm": 6.482738971710205,
      "learning_rate": 9.549150281252633e-06,
      "loss": 0.6132,
      "step": 370
    },
    {
      "epoch": 0.8444444444444444,
      "grad_norm": 7.583404541015625,
      "learning_rate": 7.393343264399438e-06,
      "loss": 0.5602,
      "step": 380
    },
    {
      "epoch": 0.8666666666666667,
      "grad_norm": 3.9021246433258057,
      "learning_rate": 5.493777694441521e-06,
      "loss": 0.7415,
      "step": 390
    },
    {
      "epoch": 0.8888888888888888,
      "grad_norm": 4.355510711669922,
      "learning_rate": 3.861877782227885e-06,
      "loss": 0.6577,
      "step": 400
    },
    {
      "epoch": 0.8888888888888888,
      "eval_loss": 0.5607297420501709,
      "eval_runtime": 1282.6163,
      "eval_samples_per_second": 0.312,
      "eval_steps_per_second": 0.078,
      "step": 400
    },
    {
      "epoch": 0.9111111111111111,
      "grad_norm": 6.445341110229492,
      "learning_rate": 2.5074579658471266e-06,
      "loss": 0.7159,
      "step": 410
    },
    {
      "epoch": 0.9333333333333333,
      "grad_norm": 4.908728122711182,
      "learning_rate": 1.438663885441982e-06,
      "loss": 0.5581,
      "step": 420
    },
    {
      "epoch": 0.9555555555555556,
      "grad_norm": 3.629876136779785,
      "learning_rate": 6.61923394371039e-07,
      "loss": 0.5394,
      "step": 430
    },
    {
      "epoch": 0.9777777777777777,
      "grad_norm": 4.596356391906738,
      "learning_rate": 1.819079013423153e-07,
      "loss": 0.5405,
      "step": 440
    },
    {
      "epoch": 1.0,
      "grad_norm": 3.878979206085205,
      "learning_rate": 1.5042760116212861e-09,
      "loss": 0.4788,
      "step": 450
    },
    {
      "epoch": 1.0,
      "eval_loss": 0.5549002289772034,
      "eval_runtime": 1306.3502,
      "eval_samples_per_second": 0.306,
      "eval_steps_per_second": 0.077,
      "step": 450
    },
    {
      "epoch": 1.0,
      "step": 450,
      "total_flos": 2.2909672762220544e+16,
      "train_loss": 0.7819234402974446,
      "train_runtime": 36382.2657,
      "train_samples_per_second": 0.099,
      "train_steps_per_second": 0.012
    },
    {
      "epoch": 1.0,
      "step": 450,
      "total_flos": 2.2909672762220544e+16,
      "train_loss": 0.0,
      "train_runtime": 0.2352,
      "train_samples_per_second": 11479.607,
      "train_steps_per_second": 1432.825
    },
    {
      "epoch": 1.0,
      "step": 450,
      "total_flos": 2.2909672762220544e+16,
      "train_loss": 0.0,
      "train_runtime": 0.2432,
      "train_samples_per_second": 11101.818,
      "train_steps_per_second": 1385.671
    },
    {
      "epoch": 1.3644444444444446,
      "grad_norm": 6.853477954864502,
      "learning_rate": 4.4362464041245384e-05,
      "loss": 0.4727,
      "step": 460
    },
    {
      "epoch": 1.394074074074074,
      "grad_norm": 5.060462951660156,
      "learning_rate": 4.21673357748979e-05,
      "loss": 0.7008,
      "step": 470
    },
    {
      "epoch": 1.4237037037037037,
      "grad_norm": 3.702819585800171,
      "learning_rate": 3.998758359194028e-05,
      "loss": 0.4912,
      "step": 480
    },
    {
      "epoch": 1.4533333333333334,
      "grad_norm": 37.993465423583984,
      "learning_rate": 3.7827486502728574e-05,
      "loss": 0.6657,
      "step": 490
    },
    {
      "epoch": 1.482962962962963,
      "grad_norm": 11.906915664672852,
      "learning_rate": 3.5691284933255654e-05,
      "loss": 0.4878,
      "step": 500
    },
    {
      "epoch": 1.482962962962963,
      "eval_loss": 0.5222220420837402,
      "eval_runtime": 931.5337,
      "eval_samples_per_second": 0.322,
      "eval_steps_per_second": 0.081,
      "step": 500
    },
    {
      "epoch": 1.5125925925925925,
      "grad_norm": 5.83925724029541,
      "learning_rate": 3.358317240089008e-05,
      "loss": 0.4312,
      "step": 510
    },
    {
      "epoch": 1.5422222222222222,
      "grad_norm": 3.2574503421783447,
      "learning_rate": 3.150728728219966e-05,
      "loss": 0.5544,
      "step": 520
    },
    {
      "epoch": 1.5718518518518518,
      "grad_norm": 6.785427093505859,
      "learning_rate": 2.946770468902064e-05,
      "loss": 0.5278,
      "step": 530
    },
    {
      "epoch": 1.6014814814814815,
      "grad_norm": 48.51929473876953,
      "learning_rate": 2.7468428468719877e-05,
      "loss": 0.6246,
      "step": 540
    },
    {
      "epoch": 1.6311111111111112,
      "grad_norm": 2.2513532638549805,
      "learning_rate": 2.5513383344354467e-05,
      "loss": 0.5973,
      "step": 550
    },
    {
      "epoch": 1.6311111111111112,
      "eval_loss": 0.5203197002410889,
      "eval_runtime": 910.2398,
      "eval_samples_per_second": 0.33,
      "eval_steps_per_second": 0.082,
      "step": 550
    },
    {
      "epoch": 1.6607407407407409,
      "grad_norm": 5.722535133361816,
      "learning_rate": 2.3606407210158006e-05,
      "loss": 0.6404,
      "step": 560
    },
    {
      "epoch": 1.6903703703703705,
      "grad_norm": 6.939681053161621,
      "learning_rate": 2.175124359747806e-05,
      "loss": 0.5434,
      "step": 570
    },
    {
      "epoch": 1.72,
      "grad_norm": 6.644214630126953,
      "learning_rate": 1.9951534325954914e-05,
      "loss": 0.6884,
      "step": 580
    },
    {
      "epoch": 1.7496296296296296,
      "grad_norm": 4.961182117462158,
      "learning_rate": 1.82108123543675e-05,
      "loss": 0.459,
      "step": 590
    },
    {
      "epoch": 1.779259259259259,
      "grad_norm": 6.396653652191162,
      "learning_rate": 1.6532494845181155e-05,
      "loss": 0.5712,
      "step": 600
    },
    {
      "epoch": 1.779259259259259,
      "eval_loss": 0.5014171600341797,
      "eval_runtime": 902.4118,
      "eval_samples_per_second": 0.332,
      "eval_steps_per_second": 0.083,
      "step": 600
    },
    {
      "epoch": 1.8088888888888888,
      "grad_norm": 4.0901360511779785,
      "learning_rate": 1.4919876456411874e-05,
      "loss": 0.5857,
      "step": 610
    },
    {
      "epoch": 1.8385185185185184,
      "grad_norm": 5.865538597106934,
      "learning_rate": 1.3376122873975616e-05,
      "loss": 0.5479,
      "step": 620
    },
    {
      "epoch": 1.8681481481481481,
      "grad_norm": 4.197350025177002,
      "learning_rate": 1.1904264597219077e-05,
      "loss": 0.5741,
      "step": 630
    },
    {
      "epoch": 1.8977777777777778,
      "grad_norm": 12.017341613769531,
      "learning_rate": 1.0507190989831412e-05,
      "loss": 0.5084,
      "step": 640
    },
    {
      "epoch": 1.9274074074074075,
      "grad_norm": 3.7081458568573,
      "learning_rate": 9.187644607815498e-06,
      "loss": 0.6477,
      "step": 650
    },
    {
      "epoch": 1.9274074074074075,
      "eval_loss": 0.4961656630039215,
      "eval_runtime": 888.3568,
      "eval_samples_per_second": 0.338,
      "eval_steps_per_second": 0.084,
      "step": 650
    },
    {
      "epoch": 1.9570370370370371,
      "grad_norm": 2.705108165740967,
      "learning_rate": 7.948215815653149e-06,
      "loss": 0.4434,
      "step": 660
    },
    {
      "epoch": 1.9866666666666668,
      "grad_norm": 4.442301273345947,
      "learning_rate": 6.7913377012332694e-06,
      "loss": 0.6607,
      "step": 670
    },
    {
      "epoch": 2.017777777777778,
      "grad_norm": 3.787601947784424,
      "learning_rate": 5.719281299525331e-06,
      "loss": 0.5332,
      "step": 680
    },
    {
      "epoch": 2.0474074074074076,
      "grad_norm": 12.064045906066895,
      "learning_rate": 4.734151134374304e-06,
      "loss": 0.4921,
      "step": 690
    },
    {
      "epoch": 2.0770370370370372,
      "grad_norm": 3.7398955821990967,
      "learning_rate": 3.837881087168932e-06,
      "loss": 0.5258,
      "step": 700
    },
    {
      "epoch": 2.0770370370370372,
      "eval_loss": 0.4903838038444519,
      "eval_runtime": 946.841,
      "eval_samples_per_second": 0.317,
      "eval_steps_per_second": 0.079,
      "step": 700
    },
    {
      "epoch": 2.1066666666666665,
      "grad_norm": 5.209836006164551,
      "learning_rate": 3.0322306004934462e-06,
      "loss": 0.5301,
      "step": 710
    },
    {
      "epoch": 2.136296296296296,
      "grad_norm": 7.934685230255127,
      "learning_rate": 2.3187812242151995e-06,
      "loss": 0.3265,
      "step": 720
    },
    {
      "epoch": 2.165925925925926,
      "grad_norm": 7.0827789306640625,
      "learning_rate": 1.6989335107884862e-06,
      "loss": 0.5313,
      "step": 730
    },
    {
      "epoch": 2.1955555555555555,
      "grad_norm": 4.234689712524414,
      "learning_rate": 1.1739042658693078e-06,
      "loss": 0.4536,
      "step": 740
    },
    {
      "epoch": 2.225185185185185,
      "grad_norm": 2.486720561981201,
      "learning_rate": 7.447241596383381e-07,
      "loss": 0.4678,
      "step": 750
    },
    {
      "epoch": 2.225185185185185,
      "eval_loss": 0.48672306537628174,
      "eval_runtime": 907.8778,
      "eval_samples_per_second": 0.33,
      "eval_steps_per_second": 0.083,
      "step": 750
    },
    {
      "epoch": 2.254814814814815,
      "grad_norm": 3.3360986709594727,
      "learning_rate": 4.122357035211855e-07,
      "loss": 0.4285,
      "step": 760
    },
    {
      "epoch": 2.2844444444444445,
      "grad_norm": 5.732705116271973,
      "learning_rate": 1.7709159627787853e-07,
      "loss": 0.422,
      "step": 770
    },
    {
      "epoch": 2.314074074074074,
      "grad_norm": 3.294212579727173,
      "learning_rate": 3.975344270823467e-08,
      "loss": 0.4541,
      "step": 780
    },
    {
      "epoch": 2.3377777777777777,
      "step": 788,
      "total_flos": 4.05169174996992e+16,
      "train_loss": 0.22821334594397374,
      "train_runtime": 22612.2743,
      "train_samples_per_second": 0.279,
      "train_steps_per_second": 0.035
    },
    {
      "epoch": 1.7555555555555555,
      "grad_norm": 7.2509870529174805,
      "learning_rate": 3.500580638048163e-05,
      "loss": 0.8465,
      "step": 790
    },
    {
      "epoch": 1.7777777777777777,
      "grad_norm": 12.120731353759766,
      "learning_rate": 3.366679270419626e-05,
      "loss": 0.9164,
      "step": 800
    },
    {
      "epoch": 1.7777777777777777,
      "eval_loss": 0.542812168598175,
      "eval_runtime": 1320.8378,
      "eval_samples_per_second": 0.303,
      "eval_steps_per_second": 0.076,
      "step": 800
    },
    {
      "epoch": 1.8,
      "grad_norm": 13.869917869567871,
      "learning_rate": 3.2340767918386884e-05,
      "loss": 0.9066,
      "step": 810
    },
    {
      "epoch": 1.8222222222222222,
      "grad_norm": 18.34079933166504,
      "learning_rate": 3.102878653674449e-05,
      "loss": 1.1401,
      "step": 820
    },
    {
      "epoch": 1.8444444444444446,
      "grad_norm": 12.816385269165039,
      "learning_rate": 2.973189190502259e-05,
      "loss": 1.0268,
      "step": 830
    },
    {
      "epoch": 1.8666666666666667,
      "grad_norm": 12.571537017822266,
      "learning_rate": 2.84511153713223e-05,
      "loss": 1.3972,
      "step": 840
    },
    {
      "epoch": 1.8888888888888888,
      "grad_norm": 31.82439613342285,
      "learning_rate": 2.7187475465918765e-05,
      "loss": 1.2414,
      "step": 850
    },
    {
      "epoch": 1.8888888888888888,
      "eval_loss": 0.5595240592956543,
      "eval_runtime": 1193.2134,
      "eval_samples_per_second": 0.335,
      "eval_steps_per_second": 0.084,
      "step": 850
    },
    {
      "epoch": 1.911111111111111,
      "grad_norm": 13.308499336242676,
      "learning_rate": 2.594197709128061e-05,
      "loss": 1.3218,
      "step": 860
    },
    {
      "epoch": 1.9333333333333333,
      "grad_norm": 6.059114456176758,
      "learning_rate": 2.471561072292703e-05,
      "loss": 1.0453,
      "step": 870
    },
    {
      "epoch": 1.9555555555555557,
      "grad_norm": 13.917656898498535,
      "learning_rate": 2.3509351621757692e-05,
      "loss": 1.0651,
      "step": 880
    },
    {
      "epoch": 1.9777777777777779,
      "grad_norm": 9.584994316101074,
      "learning_rate": 2.2324159058482085e-05,
      "loss": 1.1164,
      "step": 890
    },
    {
      "epoch": 2.0,
      "grad_norm": 12.185356140136719,
      "learning_rate": 2.11609755507649e-05,
      "loss": 0.9255,
      "step": 900
    },
    {
      "epoch": 2.0,
      "eval_loss": 0.5336335897445679,
      "eval_runtime": 1209.6116,
      "eval_samples_per_second": 0.331,
      "eval_steps_per_second": 0.083,
      "step": 900
    },
    {
      "epoch": 2.022222222222222,
      "grad_norm": 9.4601411819458,
      "learning_rate": 2.0020726113694204e-05,
      "loss": 0.6626,
      "step": 910
    },
    {
      "epoch": 2.0444444444444443,
      "grad_norm": 7.833770751953125,
      "learning_rate": 1.8904317524168458e-05,
      "loss": 0.8076,
      "step": 920
    },
    {
      "epoch": 2.066666666666667,
      "grad_norm": 9.944056510925293,
      "learning_rate": 1.7812637599787297e-05,
      "loss": 0.826,
      "step": 930
    },
    {
      "epoch": 2.088888888888889,
      "grad_norm": 50.9095344543457,
      "learning_rate": 1.674655449281964e-05,
      "loss": 0.8201,
      "step": 940
    },
    {
      "epoch": 2.111111111111111,
      "grad_norm": 11.151677131652832,
      "learning_rate": 1.570691599981053e-05,
      "loss": 0.7905,
      "step": 950
    },
    {
      "epoch": 2.111111111111111,
      "eval_loss": 0.5313804149627686,
      "eval_runtime": 1190.18,
      "eval_samples_per_second": 0.336,
      "eval_steps_per_second": 0.084,
      "step": 950
    },
    {
      "epoch": 2.1333333333333333,
      "grad_norm": 19.227052688598633,
      "learning_rate": 1.4694548887375708e-05,
      "loss": 0.7963,
      "step": 960
    },
    {
      "epoch": 2.1555555555555554,
      "grad_norm": 3.875586748123169,
      "learning_rate": 1.3710258234720192e-05,
      "loss": 0.7642,
      "step": 970
    },
    {
      "epoch": 2.1777777777777776,
      "grad_norm": 15.085531234741211,
      "learning_rate": 1.2754826793403562e-05,
      "loss": 0.9614,
      "step": 980
    },
    {
      "epoch": 2.2,
      "grad_norm": 8.809530258178711,
      "learning_rate": 1.1829014364861251e-05,
      "loss": 0.8547,
      "step": 990
    },
    {
      "epoch": 2.2222222222222223,
      "grad_norm": 9.07629680633545,
      "learning_rate": 1.093355719617678e-05,
      "loss": 0.7664,
      "step": 1000
    },
    {
      "epoch": 2.2222222222222223,
      "eval_loss": 0.5206774473190308,
      "eval_runtime": 1181.1098,
      "eval_samples_per_second": 0.339,
      "eval_steps_per_second": 0.085,
      "step": 1000
    },
    {
      "epoch": 2.2444444444444445,
      "grad_norm": 6.77101993560791,
      "learning_rate": 1.006916739458535e-05,
      "loss": 0.693,
      "step": 1010
    },
    {
      "epoch": 2.2666666666666666,
      "grad_norm": 5.627523899078369,
      "learning_rate": 9.236532361174726e-06,
      "loss": 0.9059,
      "step": 1020
    },
    {
      "epoch": 2.2888888888888888,
      "grad_norm": 6.075342655181885,
      "learning_rate": 8.43631424423334e-06,
      "loss": 0.7724,
      "step": 1030
    },
    {
      "epoch": 2.311111111111111,
      "grad_norm": 12.213552474975586,
      "learning_rate": 7.669149412680605e-06,
      "loss": 0.9813,
      "step": 1040
    },
    {
      "epoch": 2.3333333333333335,
      "grad_norm": 4.757562637329102,
      "learning_rate": 6.93564794999823e-06,
      "loss": 1.2031,
      "step": 1050
    },
    {
      "epoch": 2.3333333333333335,
      "eval_loss": 0.5155333876609802,
      "eval_runtime": 1186.5765,
      "eval_samples_per_second": 0.337,
      "eval_steps_per_second": 0.084,
      "step": 1050
    },
    {
      "epoch": 2.3555555555555556,
      "grad_norm": 11.984197616577148,
      "learning_rate": 6.2363931690647195e-06,
      "loss": 1.0289,
      "step": 1060
    },
    {
      "epoch": 2.3777777777777778,
      "grad_norm": 6.141831874847412,
      "learning_rate": 5.571941148279081e-06,
      "loss": 0.9776,
      "step": 1070
    },
    {
      "epoch": 2.4,
      "grad_norm": 6.076663017272949,
      "learning_rate": 4.942820289342759e-06,
      "loss": 0.5992,
      "step": 1080
    },
    {
      "epoch": 2.422222222222222,
      "grad_norm": 113.96343994140625,
      "learning_rate": 4.349530897051047e-06,
      "loss": 0.875,
      "step": 1090
    },
    {
      "epoch": 2.4444444444444446,
      "grad_norm": 12.476838111877441,
      "learning_rate": 3.7925447814286087e-06,
      "loss": 0.848,
      "step": 1100
    },
    {
      "epoch": 2.4444444444444446,
      "eval_loss": 0.5155972838401794,
      "eval_runtime": 1256.8424,
      "eval_samples_per_second": 0.318,
      "eval_steps_per_second": 0.08,
      "step": 1100
    },
    {
      "epoch": 2.466666666666667,
      "grad_norm": 6.342575550079346,
      "learning_rate": 3.2723048825252177e-06,
      "loss": 0.876,
      "step": 1110
    },
    {
      "epoch": 2.488888888888889,
      "grad_norm": 5.154838562011719,
      "learning_rate": 2.7892249181701802e-06,
      "loss": 0.6484,
      "step": 1120
    },
    {
      "epoch": 2.511111111111111,
      "grad_norm": 14.073723793029785,
      "learning_rate": 2.343689054965592e-06,
      "loss": 0.7078,
      "step": 1130
    },
    {
      "epoch": 2.533333333333333,
      "grad_norm": 11.655536651611328,
      "learning_rate": 1.936051602780026e-06,
      "loss": 0.8106,
      "step": 1140
    },
    {
      "epoch": 2.5555555555555554,
      "grad_norm": 12.040849685668945,
      "learning_rate": 1.5666367329856046e-06,
      "loss": 0.736,
      "step": 1150
    },
    {
      "epoch": 2.5555555555555554,
      "eval_loss": 0.5151739716529846,
      "eval_runtime": 1180.2607,
      "eval_samples_per_second": 0.339,
      "eval_steps_per_second": 0.085,
      "step": 1150
    },
    {
      "epoch": 2.5777777777777775,
      "grad_norm": 4.713861465454102,
      "learning_rate": 1.2357382206625801e-06,
      "loss": 0.9703,
      "step": 1160
    },
    {
      "epoch": 2.6,
      "grad_norm": 16.40427589416504,
      "learning_rate": 9.436192109763376e-07,
      "loss": 0.8726,
      "step": 1170
    },
    {
      "epoch": 2.6222222222222222,
      "grad_norm": 26.21332359313965,
      "learning_rate": 6.90512009912725e-07,
      "loss": 0.7543,
      "step": 1180
    },
    {
      "epoch": 2.6444444444444444,
      "grad_norm": 10.207490921020508,
      "learning_rate": 4.766178995379955e-07,
      "loss": 1.0878,
      "step": 1190
    },
    {
      "epoch": 2.6666666666666665,
      "grad_norm": 17.372835159301758,
      "learning_rate": 3.0210697793044975e-07,
      "loss": 0.6169,
      "step": 1200
    },
    {
      "epoch": 2.6666666666666665,
      "eval_loss": 0.5134466290473938,
      "eval_runtime": 1169.2654,
      "eval_samples_per_second": 0.342,
      "eval_steps_per_second": 0.086,
      "step": 1200
    },
    {
      "epoch": 2.688888888888889,
      "grad_norm": 78.36367797851562,
      "learning_rate": 1.671180239108172e-07,
      "loss": 0.9518,
      "step": 1210
    },
    {
      "epoch": 2.7111111111111112,
      "grad_norm": 7.34134578704834,
      "learning_rate": 7.175838667927148e-08,
      "loss": 0.6888,
      "step": 1220
    },
    {
      "epoch": 2.7333333333333334,
      "grad_norm": 7.477999687194824,
      "learning_rate": 1.6103900446534648e-08,
      "loss": 0.7221,
      "step": 1230
    },
    {
      "epoch": 2.7511111111111113,
      "step": 1238,
      "total_flos": 6.388579291468186e+16,
      "train_loss": 0.3261277918284082,
      "train_runtime": 34829.2458,
      "train_samples_per_second": 0.284,
      "train_steps_per_second": 0.036
    },
    {
      "epoch": 3.66962962962963,
      "grad_norm": 14.275522232055664,
      "learning_rate": 1.3300797847207797e-05,
      "loss": 3.5621,
      "step": 1240
    },
    {
      "epoch": 3.699259259259259,
      "grad_norm": 27.858943939208984,
      "learning_rate": 1.2557515699430094e-05,
      "loss": 4.3815,
      "step": 1250
    },
    {
      "epoch": 3.699259259259259,
      "eval_loss": 2.2658419609069824,
      "eval_runtime": 995.7526,
      "eval_samples_per_second": 0.301,
      "eval_steps_per_second": 0.075,
      "step": 1250
    },
    {
      "epoch": 3.728888888888889,
      "grad_norm": 30.557031631469727,
      "learning_rate": 1.1832611379355878e-05,
      "loss": 3.2056,
      "step": 1260
    },
    {
      "epoch": 3.7585185185185184,
      "grad_norm": 34.28306579589844,
      "learning_rate": 1.1126440690477996e-05,
      "loss": 2.8957,
      "step": 1270
    },
    {
      "epoch": 3.788148148148148,
      "grad_norm": 29.017297744750977,
      "learning_rate": 1.0439350241294566e-05,
      "loss": 2.5225,
      "step": 1280
    },
    {
      "epoch": 3.8177777777777777,
      "grad_norm": 23.32266616821289,
      "learning_rate": 9.771677275183744e-06,
      "loss": 2.6028,
      "step": 1290
    },
    {
      "epoch": 3.8474074074074074,
      "grad_norm": 32.830848693847656,
      "learning_rate": 9.123749504875135e-06,
      "loss": 2.7177,
      "step": 1300
    },
    {
      "epoch": 3.8474074074074074,
      "eval_loss": 1.3522464036941528,
      "eval_runtime": 985.7859,
      "eval_samples_per_second": 0.304,
      "eval_steps_per_second": 0.076,
      "step": 1300
    },
    {
      "epoch": 3.877037037037037,
      "grad_norm": 6.538234233856201,
      "learning_rate": 8.495884951599142e-06,
      "loss": 2.2624,
      "step": 1310
    },
    {
      "epoch": 3.9066666666666667,
      "grad_norm": 19.523771286010742,
      "learning_rate": 7.888391788993216e-06,
      "loss": 2.6275,
      "step": 1320
    },
    {
      "epoch": 3.9362962962962964,
      "grad_norm": 11.971488952636719,
      "learning_rate": 7.301568191841457e-06,
      "loss": 2.1496,
      "step": 1330
    },
    {
      "epoch": 3.965925925925926,
      "grad_norm": 34.24433898925781,
      "learning_rate": 6.735702189722115e-06,
      "loss": 2.0774,
      "step": 1340
    },
    {
      "epoch": 3.9955555555555557,
      "grad_norm": 12.619851112365723,
      "learning_rate": 6.191071525634456e-06,
      "loss": 2.0749,
      "step": 1350
    },
    {
      "epoch": 3.9955555555555557,
      "eval_loss": 1.2665727138519287,
      "eval_runtime": 972.1433,
      "eval_samples_per_second": 0.309,
      "eval_steps_per_second": 0.077,
      "step": 1350
    },
    {
      "epoch": 4.026666666666666,
      "grad_norm": 21.63642692565918,
      "learning_rate": 5.667943519674723e-06,
      "loss": 2.2795,
      "step": 1360
    },
    {
      "epoch": 4.0562962962962965,
      "grad_norm": 5.838581562042236,
      "learning_rate": 5.166574937827867e-06,
      "loss": 2.6146,
      "step": 1370
    },
    {
      "epoch": 4.085925925925926,
      "grad_norm": 11.008721351623535,
      "learning_rate": 4.687211865939539e-06,
      "loss": 2.3045,
      "step": 1380
    },
    {
      "epoch": 4.115555555555556,
      "grad_norm": 6.246650218963623,
      "learning_rate": 4.2300895889302805e-06,
      "loss": 1.823,
      "step": 1390
    },
    {
      "epoch": 4.145185185185185,
      "grad_norm": 13.782442092895508,
      "learning_rate": 3.7954324753109673e-06,
      "loss": 2.2982,
      "step": 1400
    },
    {
      "epoch": 4.145185185185185,
      "eval_loss": 1.2098972797393799,
      "eval_runtime": 998.8662,
      "eval_samples_per_second": 0.3,
      "eval_steps_per_second": 0.075,
      "step": 1400
    },
    {
      "epoch": 4.174814814814815,
      "grad_norm": 11.179134368896484,
      "learning_rate": 3.383453867056452e-06,
      "loss": 2.5618,
      "step": 1410
    },
    {
      "epoch": 4.204444444444444,
      "grad_norm": 73.97550201416016,
      "learning_rate": 2.9943559748912996e-06,
      "loss": 1.8831,
      "step": 1420
    },
    {
      "epoch": 4.234074074074074,
      "grad_norm": 17.907745361328125,
      "learning_rate": 2.628329779039057e-06,
      "loss": 2.2352,
      "step": 1430
    },
    {
      "epoch": 4.263703703703704,
      "grad_norm": 81.71790313720703,
      "learning_rate": 2.2855549354837912e-06,
      "loss": 2.1651,
      "step": 1440
    },
    {
      "epoch": 4.293333333333333,
      "grad_norm": 10.33467960357666,
      "learning_rate": 1.9661996877898105e-06,
      "loss": 1.7595,
      "step": 1450
    },
    {
      "epoch": 4.293333333333333,
      "eval_loss": 1.1622637510299683,
      "eval_runtime": 993.3397,
      "eval_samples_per_second": 0.302,
      "eval_steps_per_second": 0.076,
      "step": 1450
    },
    {
      "epoch": 4.322962962962963,
      "grad_norm": 40.43919372558594,
      "learning_rate": 1.6704207845230358e-06,
      "loss": 1.9304,
      "step": 1460
    },
    {
      "epoch": 4.352592592592592,
      "grad_norm": 10.497286796569824,
      "learning_rate": 1.3983634023143511e-06,
      "loss": 2.098,
      "step": 1470
    },
    {
      "epoch": 4.3822222222222225,
      "grad_norm": 9.101359367370605,
      "learning_rate": 1.1501610746028124e-06,
      "loss": 1.8441,
      "step": 1480
    },
    {
      "epoch": 4.411851851851852,
      "grad_norm": 20.517807006835938,
      "learning_rate": 9.25935626093688e-07,
      "loss": 2.3551,
      "step": 1490
    },
    {
      "epoch": 4.441481481481482,
      "grad_norm": 7.981099605560303,
      "learning_rate": 7.257971129634389e-07,
      "loss": 1.6124,
      "step": 1500
    },
    {
      "epoch": 4.441481481481482,
      "eval_loss": 1.1480356454849243,
      "eval_runtime": 970.9195,
      "eval_samples_per_second": 0.309,
      "eval_steps_per_second": 0.077,
      "step": 1500
    },
    {
      "epoch": 4.471111111111111,
      "grad_norm": 51.19599533081055,
      "learning_rate": 5.498437688410463e-07,
      "loss": 2.0946,
      "step": 1510
    },
    {
      "epoch": 4.50074074074074,
      "grad_norm": 7.847194671630859,
      "learning_rate": 3.981619565921968e-07,
      "loss": 1.8896,
      "step": 1520
    },
    {
      "epoch": 4.53037037037037,
      "grad_norm": 12.63452434539795,
      "learning_rate": 2.708261259299072e-07,
      "loss": 2.1132,
      "step": 1530
    },
    {
      "epoch": 4.5600000000000005,
      "grad_norm": 8.711173057556152,
      "learning_rate": 1.6789877687254928e-07,
      "loss": 1.9074,
      "step": 1540
    },
    {
      "epoch": 4.58962962962963,
      "grad_norm": 14.014768600463867,
      "learning_rate": 8.943042906705001e-08,
      "loss": 2.4591,
      "step": 1550
    },
    {
      "epoch": 4.58962962962963,
      "eval_loss": 1.1526756286621094,
      "eval_runtime": 1013.0536,
      "eval_samples_per_second": 0.296,
      "eval_steps_per_second": 0.074,
      "step": 1550
    },
    {
      "epoch": 4.619259259259259,
      "grad_norm": 241.5323486328125,
      "learning_rate": 3.545959699243207e-08,
      "loss": 1.9968,
      "step": 1560
    },
    {
      "epoch": 4.648888888888889,
      "grad_norm": 41.02328109741211,
      "learning_rate": 6.0127710558133265e-09,
      "loss": 1.9328,
      "step": 1570
    },
    {
      "epoch": 4.666666666666667,
      "step": 1576,
      "total_flos": 8.15036810717184e+16,
      "train_loss": 0.49436442077462445,
      "train_runtime": 26325.3193,
      "train_samples_per_second": 0.479,
      "train_steps_per_second": 0.06
    },
    {
      "epoch": 3.511111111111111,
      "grad_norm": 11.290818214416504,
      "learning_rate": 1.4115578944331131e-05,
      "loss": 4.0951,
      "step": 1580
    },
    {
      "epoch": 3.533333333333333,
      "grad_norm": 29.64479637145996,
      "learning_rate": 1.3520911423383454e-05,
      "loss": 5.0902,
      "step": 1590
    },
    {
      "epoch": 3.5555555555555554,
      "grad_norm": 12.257214546203613,
      "learning_rate": 1.2937077174225081e-05,
      "loss": 3.8541,
      "step": 1600
    },
    {
      "epoch": 3.5555555555555554,
      "eval_loss": 1.5511490106582642,
      "eval_runtime": 1129.8462,
      "eval_samples_per_second": 0.354,
      "eval_steps_per_second": 0.089,
      "step": 1600
    },
    {
      "epoch": 3.5777777777777775,
      "grad_norm": 7.66765832901001,
      "learning_rate": 1.2364249579342985e-05,
      "loss": 2.9561,
      "step": 1610
    },
    {
      "epoch": 3.6,
      "grad_norm": 5.419583797454834,
      "learning_rate": 1.1802598752554878e-05,
      "loss": 2.4979,
      "step": 1620
    },
    {
      "epoch": 3.6222222222222222,
      "grad_norm": 10.220062255859375,
      "learning_rate": 1.125229148849008e-05,
      "loss": 2.1119,
      "step": 1630
    },
    {
      "epoch": 3.6444444444444444,
      "grad_norm": 10.050875663757324,
      "learning_rate": 1.071349121305622e-05,
      "loss": 2.2186,
      "step": 1640
    },
    {
      "epoch": 3.6666666666666665,
      "grad_norm": 13.71487045288086,
      "learning_rate": 1.018635793490621e-05,
      "loss": 2.6291,
      "step": 1650
    },
    {
      "epoch": 3.6666666666666665,
      "eval_loss": 1.2097514867782593,
      "eval_runtime": 1135.3798,
      "eval_samples_per_second": 0.352,
      "eval_steps_per_second": 0.088,
      "step": 1650
    },
    {
      "epoch": 3.688888888888889,
      "grad_norm": 14.815037727355957,
      "learning_rate": 9.671048197920247e-06,
      "loss": 2.5383,
      "step": 1660
    },
    {
      "epoch": 3.7111111111111112,
      "grad_norm": 13.783255577087402,
      "learning_rate": 9.167715034716606e-06,
      "loss": 2.6482,
      "step": 1670
    },
    {
      "epoch": 3.7333333333333334,
      "grad_norm": 9.30642032623291,
      "learning_rate": 8.676507921205162e-06,
      "loss": 2.5038,
      "step": 1680
    },
    {
      "epoch": 3.7555555555555555,
      "grad_norm": 7.022140026092529,
      "learning_rate": 8.197572732197322e-06,
      "loss": 2.1227,
      "step": 1690
    },
    {
      "epoch": 3.7777777777777777,
      "grad_norm": 16.499279022216797,
      "learning_rate": 7.731051698085162e-06,
      "loss": 2.144,
      "step": 1700
    },
    {
      "epoch": 3.7777777777777777,
      "eval_loss": 1.1110306978225708,
      "eval_runtime": 1124.1297,
      "eval_samples_per_second": 0.356,
      "eval_steps_per_second": 0.089,
      "step": 1700
    },
    {
      "epoch": 3.8,
      "grad_norm": 8.783102989196777,
      "learning_rate": 7.277083362603099e-06,
      "loss": 2.2054,
      "step": 1710
    },
    {
      "epoch": 3.822222222222222,
      "grad_norm": 17.862638473510742,
      "learning_rate": 6.835802541684117e-06,
      "loss": 2.6757,
      "step": 1720
    },
    {
      "epoch": 3.8444444444444446,
      "grad_norm": 10.636580467224121,
      "learning_rate": 6.407340283423324e-06,
      "loss": 2.4811,
      "step": 1730
    },
    {
      "epoch": 3.8666666666666667,
      "grad_norm": 16.785629272460938,
      "learning_rate": 5.9918238291602145e-06,
      "loss": 3.0367,
      "step": 1740
    },
    {
      "epoch": 3.888888888888889,
      "grad_norm": 18.506656646728516,
      "learning_rate": 5.589376575691652e-06,
      "loss": 2.6143,
      "step": 1750
    },
    {
      "epoch": 3.888888888888889,
      "eval_loss": 1.0668652057647705,
      "eval_runtime": 1135.4155,
      "eval_samples_per_second": 0.352,
      "eval_steps_per_second": 0.088,
      "step": 1750
    },
    {
      "epoch": 3.911111111111111,
      "grad_norm": 33.2259635925293,
      "learning_rate": 5.200118038626389e-06,
      "loss": 2.0509,
      "step": 1760
    },
    {
      "epoch": 3.9333333333333336,
      "grad_norm": 13.904667854309082,
      "learning_rate": 4.824163816892241e-06,
      "loss": 2.0153,
      "step": 1770
    },
    {
      "epoch": 3.9555555555555557,
      "grad_norm": 16.34776496887207,
      "learning_rate": 4.46162555840653e-06,
      "loss": 2.1135,
      "step": 1780
    },
    {
      "epoch": 3.977777777777778,
      "grad_norm": 9.64548397064209,
      "learning_rate": 4.112610926919663e-06,
      "loss": 1.8388,
      "step": 1790
    },
    {
      "epoch": 4.0,
      "grad_norm": 14.741687774658203,
      "learning_rate": 3.777223570042082e-06,
      "loss": 1.8233,
      "step": 1800
    },
    {
      "epoch": 4.0,
      "eval_loss": 1.015744686126709,
      "eval_runtime": 1124.2011,
      "eval_samples_per_second": 0.356,
      "eval_steps_per_second": 0.089,
      "step": 1800
    },
    {
      "epoch": 4.022222222222222,
      "grad_norm": 17.629962921142578,
      "learning_rate": 3.455563088463737e-06,
      "loss": 1.8195,
      "step": 1810
    },
    {
      "epoch": 4.044444444444444,
      "grad_norm": 10.74545669555664,
      "learning_rate": 3.1477250063755403e-06,
      "loss": 1.7446,
      "step": 1820
    },
    {
      "epoch": 4.066666666666666,
      "grad_norm": 14.61141586303711,
      "learning_rate": 2.853800743101265e-06,
      "loss": 1.7361,
      "step": 1830
    },
    {
      "epoch": 4.088888888888889,
      "grad_norm": 17.42447853088379,
      "learning_rate": 2.573877585948642e-06,
      "loss": 1.7683,
      "step": 1840
    },
    {
      "epoch": 4.111111111111111,
      "grad_norm": 12.99378776550293,
      "learning_rate": 2.308038664287371e-06,
      "loss": 1.8956,
      "step": 1850
    },
    {
      "epoch": 4.111111111111111,
      "eval_loss": 0.9937378168106079,
      "eval_runtime": 1121.1964,
      "eval_samples_per_second": 0.357,
      "eval_steps_per_second": 0.089,
      "step": 1850
    },
    {
      "epoch": 4.133333333333334,
      "grad_norm": 6.575343608856201,
      "learning_rate": 2.056362924862121e-06,
      "loss": 1.7616,
      "step": 1860
    },
    {
      "epoch": 4.155555555555556,
      "grad_norm": 7.720376491546631,
      "learning_rate": 1.8189251083474469e-06,
      "loss": 1.9345,
      "step": 1870
    },
    {
      "epoch": 4.177777777777778,
      "grad_norm": 16.67999267578125,
      "learning_rate": 1.5957957271519553e-06,
      "loss": 1.7756,
      "step": 1880
    },
    {
      "epoch": 4.2,
      "grad_norm": 25.952878952026367,
      "learning_rate": 1.3870410444780824e-06,
      "loss": 1.9684,
      "step": 1890
    },
    {
      "epoch": 4.222222222222222,
      "grad_norm": 13.947036743164062,
      "learning_rate": 1.1927230546437406e-06,
      "loss": 2.4727,
      "step": 1900
    },
    {
      "epoch": 4.222222222222222,
      "eval_loss": 0.9845434427261353,
      "eval_runtime": 1121.7839,
      "eval_samples_per_second": 0.357,
      "eval_steps_per_second": 0.089,
      "step": 1900
    },
    {
      "epoch": 4.2444444444444445,
      "grad_norm": 7.728816509246826,
      "learning_rate": 1.0128994646717683e-06,
      "loss": 2.5575,
      "step": 1910
    },
    {
      "epoch": 4.266666666666667,
      "grad_norm": 7.709161758422852,
      "learning_rate": 8.476236771525259e-07,
      "loss": 2.0874,
      "step": 1920
    },
    {
      "epoch": 4.288888888888889,
      "grad_norm": 12.649147033691406,
      "learning_rate": 6.969447743848501e-07,
      "loss": 2.1343,
      "step": 1930
    },
    {
      "epoch": 4.311111111111111,
      "grad_norm": 14.208868980407715,
      "learning_rate": 5.6090750379994e-07,
      "loss": 1.6593,
      "step": 1940
    },
    {
      "epoch": 4.333333333333333,
      "grad_norm": 113.40324401855469,
      "learning_rate": 4.395522646726491e-07,
      "loss": 1.877,
      "step": 1950
    },
    {
      "epoch": 4.333333333333333,
      "eval_loss": 0.9760661125183105,
      "eval_runtime": 1128.6817,
      "eval_samples_per_second": 0.354,
      "eval_steps_per_second": 0.089,
      "step": 1950
    },
    {
      "epoch": 4.355555555555555,
      "grad_norm": 8.278970718383789,
      "learning_rate": 3.329150961240146e-07,
      "loss": 2.4419,
      "step": 1960
    },
    {
      "epoch": 4.377777777777778,
      "grad_norm": 9.493860244750977,
      "learning_rate": 2.410276664186473e-07,
      "loss": 1.9977,
      "step": 1970
    },
    {
      "epoch": 4.4,
      "grad_norm": 12.72214412689209,
      "learning_rate": 1.6391726356013158e-07,
      "loss": 1.996,
      "step": 1980
    },
    {
      "epoch": 4.4222222222222225,
      "grad_norm": 16.340652465820312,
      "learning_rate": 1.0160678718726945e-07,
      "loss": 1.8244,
      "step": 1990
    },
    {
      "epoch": 4.444444444444445,
      "grad_norm": 5.685306549072266,
      "learning_rate": 5.411474177349218e-08,
      "loss": 1.3904,
      "step": 2000
    },
    {
      "epoch": 4.444444444444445,
      "eval_loss": 0.9806169271469116,
      "eval_runtime": 1118.8009,
      "eval_samples_per_second": 0.358,
      "eval_steps_per_second": 0.089,
      "step": 2000
    },
    {
      "epoch": 4.466666666666667,
      "grad_norm": 16.570466995239258,
      "learning_rate": 2.145523113160075e-08,
      "loss": 1.6432,
      "step": 2010
    },
    {
      "epoch": 4.488888888888889,
      "grad_norm": 12.792643547058105,
      "learning_rate": 3.637954225266249e-09,
      "loss": 1.7129,
      "step": 2020
    },
    {
      "epoch": 4.502222222222223,
      "step": 2026,
      "total_flos": 1.0486716012367872e+17,
      "train_loss": 0.5005418875375128,
      "train_runtime": 32217.358,
      "train_samples_per_second": 0.503,
      "train_steps_per_second": 0.063
    },
    {
      "epoch": 6.0088888888888885,
      "grad_norm": 14.330232620239258,
      "learning_rate": 5.99674741500138e-06,
      "loss": 4.366,
      "step": 2030
    },
    {
      "epoch": 6.038518518518519,
      "grad_norm": 9.551030158996582,
      "learning_rate": 5.6508792702119225e-06,
      "loss": 4.9312,
      "step": 2040
    },
    {
      "epoch": 6.068148148148148,
      "grad_norm": 11.4534273147583,
      "learning_rate": 5.314685922932666e-06,
      "loss": 4.5235,
      "step": 2050
    },
    {
      "epoch": 6.068148148148148,
      "eval_loss": 2.373319625854492,
      "eval_runtime": 894.95,
      "eval_samples_per_second": 0.335,
      "eval_steps_per_second": 0.084,
      "step": 2050
    },
    {
      "epoch": 6.097777777777778,
      "grad_norm": 11.584074020385742,
      "learning_rate": 4.988240714021464e-06,
      "loss": 3.6575,
      "step": 2060
    },
    {
      "epoch": 6.127407407407407,
      "grad_norm": 12.21071720123291,
      "learning_rate": 4.671614857771684e-06,
      "loss": 4.4115,
      "step": 2070
    },
    {
      "epoch": 6.157037037037037,
      "grad_norm": 7.9541168212890625,
      "learning_rate": 4.364877426376762e-06,
      "loss": 3.4241,
      "step": 2080
    },
    {
      "epoch": 6.1866666666666665,
      "grad_norm": 15.817623138427734,
      "learning_rate": 4.068095334862038e-06,
      "loss": 2.7395,
      "step": 2090
    },
    {
      "epoch": 6.216296296296297,
      "grad_norm": 11.574224472045898,
      "learning_rate": 3.781333326487202e-06,
      "loss": 2.9356,
      "step": 2100
    },
    {
      "epoch": 6.216296296296297,
      "eval_loss": 1.5084153413772583,
      "eval_runtime": 901.7323,
      "eval_samples_per_second": 0.333,
      "eval_steps_per_second": 0.083,
      "step": 2100
    },
    {
      "epoch": 6.245925925925926,
      "grad_norm": 22.12990951538086,
      "learning_rate": 3.504653958622456e-06,
      "loss": 3.6526,
      "step": 2110
    },
    {
      "epoch": 6.275555555555556,
      "grad_norm": 36.937110900878906,
      "learning_rate": 3.238117589101658e-06,
      "loss": 2.135,
      "step": 2120
    },
    {
      "epoch": 6.305185185185185,
      "grad_norm": 14.819047927856445,
      "learning_rate": 2.981782363055108e-06,
      "loss": 2.9432,
      "step": 2130
    },
    {
      "epoch": 6.3348148148148145,
      "grad_norm": 19.707664489746094,
      "learning_rate": 2.7357042002251976e-06,
      "loss": 2.6471,
      "step": 2140
    },
    {
      "epoch": 6.364444444444445,
      "grad_norm": 15.465389251708984,
      "learning_rate": 2.4999367827674756e-06,
      "loss": 2.3315,
      "step": 2150
    },
    {
      "epoch": 6.364444444444445,
      "eval_loss": 1.438122272491455,
      "eval_runtime": 901.0131,
      "eval_samples_per_second": 0.333,
      "eval_steps_per_second": 0.083,
      "step": 2150
    },
    {
      "epoch": 6.394074074074074,
      "grad_norm": 28.68686294555664,
      "learning_rate": 2.274531543539815e-06,
      "loss": 2.586,
      "step": 2160
    },
    {
      "epoch": 6.423703703703704,
      "grad_norm": 22.25936508178711,
      "learning_rate": 2.0595376548823097e-06,
      "loss": 3.1009,
      "step": 2170
    },
    {
      "epoch": 6.453333333333333,
      "grad_norm": 12.523676872253418,
      "learning_rate": 1.8550020178902727e-06,
      "loss": 1.9499,
      "step": 2180
    },
    {
      "epoch": 6.482962962962963,
      "grad_norm": 146.981201171875,
      "learning_rate": 1.6609692521827424e-06,
      "loss": 2.565,
      "step": 2190
    },
    {
      "epoch": 6.5125925925925925,
      "grad_norm": 17.379535675048828,
      "learning_rate": 1.4774816861686636e-06,
      "loss": 2.7072,
      "step": 2200
    },
    {
      "epoch": 6.5125925925925925,
      "eval_loss": 1.3992533683776855,
      "eval_runtime": 915.4557,
      "eval_samples_per_second": 0.328,
      "eval_steps_per_second": 0.082,
      "step": 2200
    },
    {
      "epoch": 6.542222222222223,
      "grad_norm": 9.735651969909668,
      "learning_rate": 1.304579347812912e-06,
      "loss": 2.3235,
      "step": 2210
    },
    {
      "epoch": 6.571851851851852,
      "grad_norm": 9.481500625610352,
      "learning_rate": 1.1422999559041581e-06,
      "loss": 2.4865,
      "step": 2220
    },
    {
      "epoch": 6.601481481481482,
      "grad_norm": 7.604280948638916,
      "learning_rate": 9.90678911826487e-07,
      "loss": 2.3393,
      "step": 2230
    },
    {
      "epoch": 6.631111111111111,
      "grad_norm": 24.429338455200195,
      "learning_rate": 8.497492918365602e-07,
      "loss": 2.2012,
      "step": 2240
    },
    {
      "epoch": 6.66074074074074,
      "grad_norm": 15.720512390136719,
      "learning_rate": 7.195418398479925e-07,
      "loss": 2.6741,
      "step": 2250
    },
    {
      "epoch": 6.66074074074074,
      "eval_loss": 1.3970364332199097,
      "eval_runtime": 932.9152,
      "eval_samples_per_second": 0.322,
      "eval_steps_per_second": 0.08,
      "step": 2250
    },
    {
      "epoch": 6.6903703703703705,
      "grad_norm": 10.252752304077148,
      "learning_rate": 6.00084960724534e-07,
      "loss": 2.4081,
      "step": 2260
    },
    {
      "epoch": 6.72,
      "grad_norm": 17.19991111755371,
      "learning_rate": 4.914047140835653e-07,
      "loss": 2.4516,
      "step": 2270
    },
    {
      "epoch": 6.74962962962963,
      "grad_norm": 23.26289176940918,
      "learning_rate": 3.935248086111176e-07,
      "loss": 2.5946,
      "step": 2280
    },
    {
      "epoch": 6.779259259259259,
      "grad_norm": 12.669425964355469,
      "learning_rate": 3.064665968898428e-07,
      "loss": 2.804,
      "step": 2290
    },
    {
      "epoch": 6.808888888888889,
      "grad_norm": 22.176555633544922,
      "learning_rate": 2.3024907074091772e-07,
      "loss": 2.6944,
      "step": 2300
    },
    {
      "epoch": 6.808888888888889,
      "eval_loss": 1.3712869882583618,
      "eval_runtime": 889.4139,
      "eval_samples_per_second": 0.337,
      "eval_steps_per_second": 0.084,
      "step": 2300
    },
    {
      "epoch": 6.838518518518518,
      "grad_norm": 22.525259017944336,
      "learning_rate": 1.6488885708094705e-07,
      "loss": 2.3312,
      "step": 2310
    },
    {
      "epoch": 6.868148148148148,
      "grad_norm": 9.424576759338379,
      "learning_rate": 1.1040021429480907e-07,
      "loss": 2.1776,
      "step": 2320
    },
    {
      "epoch": 6.897777777777778,
      "grad_norm": 33.19387435913086,
      "learning_rate": 6.679502912517732e-08,
      "loss": 2.3443,
      "step": 2330
    },
    {
      "epoch": 6.927407407407408,
      "grad_norm": 27.716110229492188,
      "learning_rate": 3.408281407939473e-08,
      "loss": 2.2612,
      "step": 2340
    },
    {
      "epoch": 6.957037037037037,
      "grad_norm": 17.70338249206543,
      "learning_rate": 1.2270705354333612e-08,
      "loss": 2.4162,
      "step": 2350
    },
    {
      "epoch": 6.957037037037037,
      "eval_loss": 1.3736646175384521,
      "eval_runtime": 876.3782,
      "eval_samples_per_second": 0.342,
      "eval_steps_per_second": 0.086,
      "step": 2350
    },
    {
      "epoch": 6.986666666666666,
      "grad_norm": 18.5972957611084,
      "learning_rate": 1.3634612796298295e-09,
      "loss": 2.4499,
      "step": 2360
    },
    {
      "epoch": 6.998518518518519,
      "step": 2364,
      "total_flos": 1.225036049440727e+17,
      "train_loss": 0.4030211762526717,
      "train_runtime": 24477.7962,
      "train_samples_per_second": 0.773,
      "train_steps_per_second": 0.097
    }
  ],
  "logging_steps": 10,
  "max_steps": 2364,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 7,
  "save_steps": 50,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.225036049440727e+17,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}