{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9989717223650385, "eval_steps": 500, "global_step": 972, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02056555269922879, "grad_norm": 9.233743238441296, "learning_rate": 1.9994839090452616e-05, "loss": 1.5713, "step": 10 }, { "epoch": 0.04113110539845758, "grad_norm": 13.69020758087521, "learning_rate": 1.9979236966675828e-05, "loss": 1.3565, "step": 20 }, { "epoch": 0.061696658097686374, "grad_norm": 9.2629201307749, "learning_rate": 1.9953209739827946e-05, "loss": 1.3132, "step": 30 }, { "epoch": 0.08226221079691516, "grad_norm": 5.836967088114798, "learning_rate": 1.9916784600016132e-05, "loss": 1.2708, "step": 40 }, { "epoch": 0.10282776349614396, "grad_norm": 9.00423601885076, "learning_rate": 1.9869999599832804e-05, "loss": 1.2658, "step": 50 }, { "epoch": 0.12339331619537275, "grad_norm": 13.166982152661006, "learning_rate": 1.981290361460287e-05, "loss": 1.2531, "step": 60 }, { "epoch": 0.14395886889460155, "grad_norm": 10.728661259090346, "learning_rate": 1.974555629132469e-05, "loss": 1.2339, "step": 70 }, { "epoch": 0.16452442159383032, "grad_norm": 9.294779587967374, "learning_rate": 1.9668027986358082e-05, "loss": 1.2372, "step": 80 }, { "epoch": 0.18508997429305912, "grad_norm": 6.388693353299931, "learning_rate": 1.9580399691924484e-05, "loss": 1.2298, "step": 90 }, { "epoch": 0.20565552699228792, "grad_norm": 5.685294719278735, "learning_rate": 1.9482762951496056e-05, "loss": 1.2312, "step": 100 }, { "epoch": 0.2262210796915167, "grad_norm": 6.685866232367591, "learning_rate": 1.9375219764162096e-05, "loss": 1.2316, "step": 110 }, { "epoch": 0.2467866323907455, "grad_norm": 7.688018717119547, "learning_rate": 1.92578824780727e-05, "loss": 1.2337, "step": 120 }, { "epoch": 0.26735218508997427, "grad_norm": 3.884064710067138, "learning_rate": 1.913087367307095e-05, "loss": 1.2136, "step": 130 }, { "epoch": 0.2879177377892031, "grad_norm": 9.444686410104936, "learning_rate": 1.8994326032636318e-05, "loss": 1.2072, "step": 140 }, { "epoch": 0.30848329048843187, "grad_norm": 4.864822081558493, "learning_rate": 1.8848382205272924e-05, "loss": 1.2017, "step": 150 }, { "epoch": 0.32904884318766064, "grad_norm": 5.9852253238798605, "learning_rate": 1.869319465548762e-05, "loss": 1.208, "step": 160 }, { "epoch": 0.3496143958868895, "grad_norm": 6.2970547396781225, "learning_rate": 1.852892550451345e-05, "loss": 1.2012, "step": 170 }, { "epoch": 0.37017994858611825, "grad_norm": 7.846752995277368, "learning_rate": 1.835574636094494e-05, "loss": 1.2035, "step": 180 }, { "epoch": 0.390745501285347, "grad_norm": 9.795450293111358, "learning_rate": 1.8173838141462145e-05, "loss": 1.2147, "step": 190 }, { "epoch": 0.41131105398457585, "grad_norm": 7.782933189444834, "learning_rate": 1.798339088183071e-05, "loss": 1.2126, "step": 200 }, { "epoch": 0.4318766066838046, "grad_norm": 5.494144367822917, "learning_rate": 1.7784603538375453e-05, "loss": 1.2089, "step": 210 }, { "epoch": 0.4524421593830334, "grad_norm": 6.609017931196138, "learning_rate": 1.7577683780134756e-05, "loss": 1.1879, "step": 220 }, { "epoch": 0.4730077120822622, "grad_norm": 8.906035242084766, "learning_rate": 1.7362847771913035e-05, "loss": 1.2097, "step": 230 }, { "epoch": 0.493573264781491, "grad_norm": 8.648775336727859, "learning_rate": 1.714031994845782e-05, "loss": 1.187, "step": 240 }, { "epoch": 0.5141388174807198, "grad_norm": 5.843126331449411, "learning_rate": 1.6910332779997378e-05, "loss": 1.1835, "step": 250 }, { "epoch": 0.5347043701799485, "grad_norm": 7.57208189767232, "learning_rate": 1.6673126529383905e-05, "loss": 1.1906, "step": 260 }, { "epoch": 0.5552699228791774, "grad_norm": 5.872037189234157, "learning_rate": 1.642894900109584e-05, "loss": 1.1736, "step": 270 }, { "epoch": 0.5758354755784062, "grad_norm": 5.59549822016061, "learning_rate": 1.6178055282361642e-05, "loss": 1.19, "step": 280 }, { "epoch": 0.596401028277635, "grad_norm": 7.1037529678120785, "learning_rate": 1.5920707476675446e-05, "loss": 1.1851, "step": 290 }, { "epoch": 0.6169665809768637, "grad_norm": 4.7250598751432875, "learning_rate": 1.565717442998292e-05, "loss": 1.1824, "step": 300 }, { "epoch": 0.6375321336760925, "grad_norm": 7.603461950437508, "learning_rate": 1.5387731449823474e-05, "loss": 1.1543, "step": 310 }, { "epoch": 0.6580976863753213, "grad_norm": 7.812378658902085, "learning_rate": 1.5112660017722122e-05, "loss": 1.1683, "step": 320 }, { "epoch": 0.6786632390745502, "grad_norm": 6.866611481508661, "learning_rate": 1.4832247495131566e-05, "loss": 1.1643, "step": 330 }, { "epoch": 0.699228791773779, "grad_norm": 8.447272563997627, "learning_rate": 1.45467868232316e-05, "loss": 1.1679, "step": 340 }, { "epoch": 0.7197943444730077, "grad_norm": 8.29660722849121, "learning_rate": 1.4256576216899494e-05, "loss": 1.1605, "step": 350 }, { "epoch": 0.7403598971722365, "grad_norm": 6.408869669950844, "learning_rate": 1.3961918853171073e-05, "loss": 1.1681, "step": 360 }, { "epoch": 0.7609254498714653, "grad_norm": 7.257507857539118, "learning_rate": 1.3663122554517917e-05, "loss": 1.1545, "step": 370 }, { "epoch": 0.781491002570694, "grad_norm": 6.745599564457566, "learning_rate": 1.3360499467271552e-05, "loss": 1.167, "step": 380 }, { "epoch": 0.8020565552699229, "grad_norm": 4.372254672475978, "learning_rate": 1.3054365735530666e-05, "loss": 1.1706, "step": 390 }, { "epoch": 0.8226221079691517, "grad_norm": 5.9583586806734585, "learning_rate": 1.2745041170891827e-05, "loss": 1.1512, "step": 400 }, { "epoch": 0.8431876606683805, "grad_norm": 8.218488729311929, "learning_rate": 1.243284891834894e-05, "loss": 1.161, "step": 410 }, { "epoch": 0.8637532133676092, "grad_norm": 9.05020466841952, "learning_rate": 1.211811511871033e-05, "loss": 1.1499, "step": 420 }, { "epoch": 0.884318766066838, "grad_norm": 7.133782169539759, "learning_rate": 1.1801168567886159e-05, "loss": 1.1428, "step": 430 }, { "epoch": 0.9048843187660668, "grad_norm": 10.061681353175718, "learning_rate": 1.1482340373402128e-05, "loss": 1.1548, "step": 440 }, { "epoch": 0.9254498714652957, "grad_norm": 5.964595532826752, "learning_rate": 1.1161963608498254e-05, "loss": 1.1375, "step": 450 }, { "epoch": 0.9460154241645244, "grad_norm": 7.682863713931749, "learning_rate": 1.0840372964174148e-05, "loss": 1.1441, "step": 460 }, { "epoch": 0.9665809768637532, "grad_norm": 6.093064442303913, "learning_rate": 1.051790439954422e-05, "loss": 1.1374, "step": 470 }, { "epoch": 0.987146529562982, "grad_norm": 2.859419774137469, "learning_rate": 1.0194894790868113e-05, "loss": 1.135, "step": 480 }, { "epoch": 1.0077120822622108, "grad_norm": 5.504925974002832, "learning_rate": 9.871681579623028e-06, "loss": 1.1268, "step": 490 }, { "epoch": 1.0282776349614395, "grad_norm": 6.006410935514135, "learning_rate": 9.548602419985584e-06, "loss": 1.0911, "step": 500 }, { "epoch": 1.0488431876606683, "grad_norm": 3.2675854341672608, "learning_rate": 9.225994826091431e-06, "loss": 1.0816, "step": 510 }, { "epoch": 1.069408740359897, "grad_norm": 6.801925989140796, "learning_rate": 8.904195819441222e-06, "loss": 1.0833, "step": 520 }, { "epoch": 1.089974293059126, "grad_norm": 3.6643836420434663, "learning_rate": 8.583541576821191e-06, "loss": 1.081, "step": 530 }, { "epoch": 1.1105398457583548, "grad_norm": 6.747377788645208, "learning_rate": 8.264367079106194e-06, "loss": 1.0793, "step": 540 }, { "epoch": 1.1311053984575836, "grad_norm": 6.372537375904327, "learning_rate": 7.947005761312097e-06, "loss": 1.0979, "step": 550 }, { "epoch": 1.1516709511568124, "grad_norm": 6.925714108622525, "learning_rate": 7.6317891642631e-06, "loss": 1.0868, "step": 560 }, { "epoch": 1.1722365038560412, "grad_norm": 7.332135497496814, "learning_rate": 7.319046588237864e-06, "loss": 1.0613, "step": 570 }, { "epoch": 1.19280205655527, "grad_norm": 3.4981756709446454, "learning_rate": 7.009104748956304e-06, "loss": 1.0801, "step": 580 }, { "epoch": 1.2133676092544987, "grad_norm": 5.6660043695937, "learning_rate": 6.7022874362664155e-06, "loss": 1.0838, "step": 590 }, { "epoch": 1.2339331619537275, "grad_norm": 6.6393615146162634, "learning_rate": 6.398915175887698e-06, "loss": 1.0692, "step": 600 }, { "epoch": 1.2544987146529563, "grad_norm": 9.01722626725686, "learning_rate": 6.099304894564544e-06, "loss": 1.093, "step": 610 }, { "epoch": 1.275064267352185, "grad_norm": 7.465032128492785, "learning_rate": 5.8037695889794e-06, "loss": 1.0781, "step": 620 }, { "epoch": 1.2956298200514138, "grad_norm": 8.243642393853053, "learning_rate": 5.512617998771598e-06, "loss": 1.0833, "step": 630 }, { "epoch": 1.3161953727506428, "grad_norm": 3.1779811319825577, "learning_rate": 5.226154284003411e-06, "loss": 1.0715, "step": 640 }, { "epoch": 1.3367609254498714, "grad_norm": 4.120350391737107, "learning_rate": 4.944677707410315e-06, "loss": 1.0829, "step": 650 }, { "epoch": 1.3573264781491003, "grad_norm": 7.105813120974067, "learning_rate": 4.668482321767371e-06, "loss": 1.0865, "step": 660 }, { "epoch": 1.3778920308483291, "grad_norm": 8.035377582845337, "learning_rate": 4.397856662698368e-06, "loss": 1.0533, "step": 670 }, { "epoch": 1.398457583547558, "grad_norm": 8.049531770624485, "learning_rate": 4.133083447248599e-06, "loss": 1.0745, "step": 680 }, { "epoch": 1.4190231362467867, "grad_norm": 7.225962565044245, "learning_rate": 3.874439278536187e-06, "loss": 1.0899, "step": 690 }, { "epoch": 1.4395886889460154, "grad_norm": 7.270761540137814, "learning_rate": 3.6221943567905283e-06, "loss": 1.0784, "step": 700 }, { "epoch": 1.4601542416452442, "grad_norm": 8.705361491327107, "learning_rate": 3.3766121970796716e-06, "loss": 1.0819, "step": 710 }, { "epoch": 1.480719794344473, "grad_norm": 4.183940499981808, "learning_rate": 3.1379493540215677e-06, "loss": 1.069, "step": 720 }, { "epoch": 1.5012853470437018, "grad_norm": 6.398726679627311, "learning_rate": 2.906455153766744e-06, "loss": 1.0785, "step": 730 }, { "epoch": 1.5218508997429305, "grad_norm": 13.601116552168905, "learning_rate": 2.6823714335324237e-06, "loss": 1.057, "step": 740 }, { "epoch": 1.5424164524421595, "grad_norm": 5.5895207138975, "learning_rate": 2.46593228896017e-06, "loss": 1.0553, "step": 750 }, { "epoch": 1.562982005141388, "grad_norm": 6.267057233279022, "learning_rate": 2.257363829560986e-06, "loss": 1.0542, "step": 760 }, { "epoch": 1.583547557840617, "grad_norm": 4.378216644983502, "learning_rate": 2.0568839425033906e-06, "loss": 1.0799, "step": 770 }, { "epoch": 1.6041131105398456, "grad_norm": 7.255472699627192, "learning_rate": 1.864702064991173e-06, "loss": 1.0571, "step": 780 }, { "epoch": 1.6246786632390746, "grad_norm": 6.289474192540149, "learning_rate": 1.6810189654686715e-06, "loss": 1.0472, "step": 790 }, { "epoch": 1.6452442159383034, "grad_norm": 6.589217658649451, "learning_rate": 1.5060265338821123e-06, "loss": 1.0703, "step": 800 }, { "epoch": 1.6658097686375322, "grad_norm": 4.865701816340032, "learning_rate": 1.3399075812161488e-06, "loss": 1.055, "step": 810 }, { "epoch": 1.686375321336761, "grad_norm": 8.65763359081179, "learning_rate": 1.1828356485149927e-06, "loss": 1.0622, "step": 820 }, { "epoch": 1.7069408740359897, "grad_norm": 6.174972638755989, "learning_rate": 1.0349748255876536e-06, "loss": 1.0526, "step": 830 }, { "epoch": 1.7275064267352185, "grad_norm": 4.986705584850829, "learning_rate": 8.964795795867176e-07, "loss": 1.051, "step": 840 }, { "epoch": 1.7480719794344473, "grad_norm": 9.202122011281354, "learning_rate": 7.67494593639686e-07, "loss": 1.0574, "step": 850 }, { "epoch": 1.7686375321336762, "grad_norm": 8.058395431378568, "learning_rate": 6.481546157014996e-07, "loss": 1.0589, "step": 860 }, { "epoch": 1.7892030848329048, "grad_norm": 5.3354582297297135, "learning_rate": 5.385843177861261e-07, "loss": 1.0578, "step": 870 }, { "epoch": 1.8097686375321338, "grad_norm": 3.8239017444437415, "learning_rate": 4.388981657242819e-07, "loss": 1.0663, "step": 880 }, { "epoch": 1.8303341902313623, "grad_norm": 10.032769787716335, "learning_rate": 3.4920029958333656e-07, "loss": 1.0671, "step": 890 }, { "epoch": 1.8508997429305913, "grad_norm": 4.449303124886201, "learning_rate": 2.695844248743318e-07, "loss": 1.0573, "step": 900 }, { "epoch": 1.87146529562982, "grad_norm": 6.5875403409786735, "learning_rate": 2.0013371465976816e-07, "loss": 1.063, "step": 910 }, { "epoch": 1.8920308483290489, "grad_norm": 6.1087849946753545, "learning_rate": 1.409207226644227e-07, "loss": 1.0703, "step": 920 }, { "epoch": 1.9125964010282777, "grad_norm": 9.357948853303002, "learning_rate": 9.200730747996211e-08, "loss": 1.0615, "step": 930 }, { "epoch": 1.9331619537275064, "grad_norm": 6.746455746388351, "learning_rate": 5.344456794255881e-08, "loss": 1.0591, "step": 940 }, { "epoch": 1.9537275064267352, "grad_norm": 4.307020198331092, "learning_rate": 2.5272789750980797e-08, "loss": 1.0591, "step": 950 }, { "epoch": 1.974293059125964, "grad_norm": 2.368072060390636, "learning_rate": 7.521403380956748e-09, "loss": 1.0602, "step": 960 }, { "epoch": 1.9948586118251928, "grad_norm": 6.97337278897093, "learning_rate": 2.089533397653387e-10, "loss": 1.0495, "step": 970 }, { "epoch": 1.9989717223650385, "step": 972, "total_flos": 9.034546879177687e+18, "train_loss": 1.1365475546675945, "train_runtime": 15552.1986, "train_samples_per_second": 32.015, "train_steps_per_second": 0.062 } ], "logging_steps": 10, "max_steps": 972, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "total_flos": 9.034546879177687e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }