{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.9989717223650385,
  "eval_steps": 500,
  "global_step": 972,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.02056555269922879,
      "grad_norm": 9.233743238441296,
      "learning_rate": 1.9994839090452616e-05,
      "loss": 1.5713,
      "step": 10
    },
    {
      "epoch": 0.04113110539845758,
      "grad_norm": 13.69020758087521,
      "learning_rate": 1.9979236966675828e-05,
      "loss": 1.3565,
      "step": 20
    },
    {
      "epoch": 0.061696658097686374,
      "grad_norm": 9.2629201307749,
      "learning_rate": 1.9953209739827946e-05,
      "loss": 1.3132,
      "step": 30
    },
    {
      "epoch": 0.08226221079691516,
      "grad_norm": 5.836967088114798,
      "learning_rate": 1.9916784600016132e-05,
      "loss": 1.2708,
      "step": 40
    },
    {
      "epoch": 0.10282776349614396,
      "grad_norm": 9.00423601885076,
      "learning_rate": 1.9869999599832804e-05,
      "loss": 1.2658,
      "step": 50
    },
    {
      "epoch": 0.12339331619537275,
      "grad_norm": 13.166982152661006,
      "learning_rate": 1.981290361460287e-05,
      "loss": 1.2531,
      "step": 60
    },
    {
      "epoch": 0.14395886889460155,
      "grad_norm": 10.728661259090346,
      "learning_rate": 1.974555629132469e-05,
      "loss": 1.2339,
      "step": 70
    },
    {
      "epoch": 0.16452442159383032,
      "grad_norm": 9.294779587967374,
      "learning_rate": 1.9668027986358082e-05,
      "loss": 1.2372,
      "step": 80
    },
    {
      "epoch": 0.18508997429305912,
      "grad_norm": 6.388693353299931,
      "learning_rate": 1.9580399691924484e-05,
      "loss": 1.2298,
      "step": 90
    },
    {
      "epoch": 0.20565552699228792,
      "grad_norm": 5.685294719278735,
      "learning_rate": 1.9482762951496056e-05,
      "loss": 1.2312,
      "step": 100
    },
    {
      "epoch": 0.2262210796915167,
      "grad_norm": 6.685866232367591,
      "learning_rate": 1.9375219764162096e-05,
      "loss": 1.2316,
      "step": 110
    },
    {
      "epoch": 0.2467866323907455,
      "grad_norm": 7.688018717119547,
      "learning_rate": 1.92578824780727e-05,
      "loss": 1.2337,
      "step": 120
    },
    {
      "epoch": 0.26735218508997427,
      "grad_norm": 3.884064710067138,
      "learning_rate": 1.913087367307095e-05,
      "loss": 1.2136,
      "step": 130
    },
    {
      "epoch": 0.2879177377892031,
      "grad_norm": 9.444686410104936,
      "learning_rate": 1.8994326032636318e-05,
      "loss": 1.2072,
      "step": 140
    },
    {
      "epoch": 0.30848329048843187,
      "grad_norm": 4.864822081558493,
      "learning_rate": 1.8848382205272924e-05,
      "loss": 1.2017,
      "step": 150
    },
    {
      "epoch": 0.32904884318766064,
      "grad_norm": 5.9852253238798605,
      "learning_rate": 1.869319465548762e-05,
      "loss": 1.208,
      "step": 160
    },
    {
      "epoch": 0.3496143958868895,
      "grad_norm": 6.2970547396781225,
      "learning_rate": 1.852892550451345e-05,
      "loss": 1.2012,
      "step": 170
    },
    {
      "epoch": 0.37017994858611825,
      "grad_norm": 7.846752995277368,
      "learning_rate": 1.835574636094494e-05,
      "loss": 1.2035,
      "step": 180
    },
    {
      "epoch": 0.390745501285347,
      "grad_norm": 9.795450293111358,
      "learning_rate": 1.8173838141462145e-05,
      "loss": 1.2147,
      "step": 190
    },
    {
      "epoch": 0.41131105398457585,
      "grad_norm": 7.782933189444834,
      "learning_rate": 1.798339088183071e-05,
      "loss": 1.2126,
      "step": 200
    },
    {
      "epoch": 0.4318766066838046,
      "grad_norm": 5.494144367822917,
      "learning_rate": 1.7784603538375453e-05,
      "loss": 1.2089,
      "step": 210
    },
    {
      "epoch": 0.4524421593830334,
      "grad_norm": 6.609017931196138,
      "learning_rate": 1.7577683780134756e-05,
      "loss": 1.1879,
      "step": 220
    },
    {
      "epoch": 0.4730077120822622,
      "grad_norm": 8.906035242084766,
      "learning_rate": 1.7362847771913035e-05,
      "loss": 1.2097,
      "step": 230
    },
    {
      "epoch": 0.493573264781491,
      "grad_norm": 8.648775336727859,
      "learning_rate": 1.714031994845782e-05,
      "loss": 1.187,
      "step": 240
    },
    {
      "epoch": 0.5141388174807198,
      "grad_norm": 5.843126331449411,
      "learning_rate": 1.6910332779997378e-05,
      "loss": 1.1835,
      "step": 250
    },
    {
      "epoch": 0.5347043701799485,
      "grad_norm": 7.57208189767232,
      "learning_rate": 1.6673126529383905e-05,
      "loss": 1.1906,
      "step": 260
    },
    {
      "epoch": 0.5552699228791774,
      "grad_norm": 5.872037189234157,
      "learning_rate": 1.642894900109584e-05,
      "loss": 1.1736,
      "step": 270
    },
    {
      "epoch": 0.5758354755784062,
      "grad_norm": 5.59549822016061,
      "learning_rate": 1.6178055282361642e-05,
      "loss": 1.19,
      "step": 280
    },
    {
      "epoch": 0.596401028277635,
      "grad_norm": 7.1037529678120785,
      "learning_rate": 1.5920707476675446e-05,
      "loss": 1.1851,
      "step": 290
    },
    {
      "epoch": 0.6169665809768637,
      "grad_norm": 4.7250598751432875,
      "learning_rate": 1.565717442998292e-05,
      "loss": 1.1824,
      "step": 300
    },
    {
      "epoch": 0.6375321336760925,
      "grad_norm": 7.603461950437508,
      "learning_rate": 1.5387731449823474e-05,
      "loss": 1.1543,
      "step": 310
    },
    {
      "epoch": 0.6580976863753213,
      "grad_norm": 7.812378658902085,
      "learning_rate": 1.5112660017722122e-05,
      "loss": 1.1683,
      "step": 320
    },
    {
      "epoch": 0.6786632390745502,
      "grad_norm": 6.866611481508661,
      "learning_rate": 1.4832247495131566e-05,
      "loss": 1.1643,
      "step": 330
    },
    {
      "epoch": 0.699228791773779,
      "grad_norm": 8.447272563997627,
      "learning_rate": 1.45467868232316e-05,
      "loss": 1.1679,
      "step": 340
    },
    {
      "epoch": 0.7197943444730077,
      "grad_norm": 8.29660722849121,
      "learning_rate": 1.4256576216899494e-05,
      "loss": 1.1605,
      "step": 350
    },
    {
      "epoch": 0.7403598971722365,
      "grad_norm": 6.408869669950844,
      "learning_rate": 1.3961918853171073e-05,
      "loss": 1.1681,
      "step": 360
    },
    {
      "epoch": 0.7609254498714653,
      "grad_norm": 7.257507857539118,
      "learning_rate": 1.3663122554517917e-05,
      "loss": 1.1545,
      "step": 370
    },
    {
      "epoch": 0.781491002570694,
      "grad_norm": 6.745599564457566,
      "learning_rate": 1.3360499467271552e-05,
      "loss": 1.167,
      "step": 380
    },
    {
      "epoch": 0.8020565552699229,
      "grad_norm": 4.372254672475978,
      "learning_rate": 1.3054365735530666e-05,
      "loss": 1.1706,
      "step": 390
    },
    {
      "epoch": 0.8226221079691517,
      "grad_norm": 5.9583586806734585,
      "learning_rate": 1.2745041170891827e-05,
      "loss": 1.1512,
      "step": 400
    },
    {
      "epoch": 0.8431876606683805,
      "grad_norm": 8.218488729311929,
      "learning_rate": 1.243284891834894e-05,
      "loss": 1.161,
      "step": 410
    },
    {
      "epoch": 0.8637532133676092,
      "grad_norm": 9.05020466841952,
      "learning_rate": 1.211811511871033e-05,
      "loss": 1.1499,
      "step": 420
    },
    {
      "epoch": 0.884318766066838,
      "grad_norm": 7.133782169539759,
      "learning_rate": 1.1801168567886159e-05,
      "loss": 1.1428,
      "step": 430
    },
    {
      "epoch": 0.9048843187660668,
      "grad_norm": 10.061681353175718,
      "learning_rate": 1.1482340373402128e-05,
      "loss": 1.1548,
      "step": 440
    },
    {
      "epoch": 0.9254498714652957,
      "grad_norm": 5.964595532826752,
      "learning_rate": 1.1161963608498254e-05,
      "loss": 1.1375,
      "step": 450
    },
    {
      "epoch": 0.9460154241645244,
      "grad_norm": 7.682863713931749,
      "learning_rate": 1.0840372964174148e-05,
      "loss": 1.1441,
      "step": 460
    },
    {
      "epoch": 0.9665809768637532,
      "grad_norm": 6.093064442303913,
      "learning_rate": 1.051790439954422e-05,
      "loss": 1.1374,
      "step": 470
    },
    {
      "epoch": 0.987146529562982,
      "grad_norm": 2.859419774137469,
      "learning_rate": 1.0194894790868113e-05,
      "loss": 1.135,
      "step": 480
    },
    {
      "epoch": 1.0077120822622108,
      "grad_norm": 5.504925974002832,
      "learning_rate": 9.871681579623028e-06,
      "loss": 1.1268,
      "step": 490
    },
    {
      "epoch": 1.0282776349614395,
      "grad_norm": 6.006410935514135,
      "learning_rate": 9.548602419985584e-06,
      "loss": 1.0911,
      "step": 500
    },
    {
      "epoch": 1.0488431876606683,
      "grad_norm": 3.2675854341672608,
      "learning_rate": 9.225994826091431e-06,
      "loss": 1.0816,
      "step": 510
    },
    {
      "epoch": 1.069408740359897,
      "grad_norm": 6.801925989140796,
      "learning_rate": 8.904195819441222e-06,
      "loss": 1.0833,
      "step": 520
    },
    {
      "epoch": 1.089974293059126,
      "grad_norm": 3.6643836420434663,
      "learning_rate": 8.583541576821191e-06,
      "loss": 1.081,
      "step": 530
    },
    {
      "epoch": 1.1105398457583548,
      "grad_norm": 6.747377788645208,
      "learning_rate": 8.264367079106194e-06,
      "loss": 1.0793,
      "step": 540
    },
    {
      "epoch": 1.1311053984575836,
      "grad_norm": 6.372537375904327,
      "learning_rate": 7.947005761312097e-06,
      "loss": 1.0979,
      "step": 550
    },
    {
      "epoch": 1.1516709511568124,
      "grad_norm": 6.925714108622525,
      "learning_rate": 7.6317891642631e-06,
      "loss": 1.0868,
      "step": 560
    },
    {
      "epoch": 1.1722365038560412,
      "grad_norm": 7.332135497496814,
      "learning_rate": 7.319046588237864e-06,
      "loss": 1.0613,
      "step": 570
    },
    {
      "epoch": 1.19280205655527,
      "grad_norm": 3.4981756709446454,
      "learning_rate": 7.009104748956304e-06,
      "loss": 1.0801,
      "step": 580
    },
    {
      "epoch": 1.2133676092544987,
      "grad_norm": 5.6660043695937,
      "learning_rate": 6.7022874362664155e-06,
      "loss": 1.0838,
      "step": 590
    },
    {
      "epoch": 1.2339331619537275,
      "grad_norm": 6.6393615146162634,
      "learning_rate": 6.398915175887698e-06,
      "loss": 1.0692,
      "step": 600
    },
    {
      "epoch": 1.2544987146529563,
      "grad_norm": 9.01722626725686,
      "learning_rate": 6.099304894564544e-06,
      "loss": 1.093,
      "step": 610
    },
    {
      "epoch": 1.275064267352185,
      "grad_norm": 7.465032128492785,
      "learning_rate": 5.8037695889794e-06,
      "loss": 1.0781,
      "step": 620
    },
    {
      "epoch": 1.2956298200514138,
      "grad_norm": 8.243642393853053,
      "learning_rate": 5.512617998771598e-06,
      "loss": 1.0833,
      "step": 630
    },
    {
      "epoch": 1.3161953727506428,
      "grad_norm": 3.1779811319825577,
      "learning_rate": 5.226154284003411e-06,
      "loss": 1.0715,
      "step": 640
    },
    {
      "epoch": 1.3367609254498714,
      "grad_norm": 4.120350391737107,
      "learning_rate": 4.944677707410315e-06,
      "loss": 1.0829,
      "step": 650
    },
    {
      "epoch": 1.3573264781491003,
      "grad_norm": 7.105813120974067,
      "learning_rate": 4.668482321767371e-06,
      "loss": 1.0865,
      "step": 660
    },
    {
      "epoch": 1.3778920308483291,
      "grad_norm": 8.035377582845337,
      "learning_rate": 4.397856662698368e-06,
      "loss": 1.0533,
      "step": 670
    },
    {
      "epoch": 1.398457583547558,
      "grad_norm": 8.049531770624485,
      "learning_rate": 4.133083447248599e-06,
      "loss": 1.0745,
      "step": 680
    },
    {
      "epoch": 1.4190231362467867,
      "grad_norm": 7.225962565044245,
      "learning_rate": 3.874439278536187e-06,
      "loss": 1.0899,
      "step": 690
    },
    {
      "epoch": 1.4395886889460154,
      "grad_norm": 7.270761540137814,
      "learning_rate": 3.6221943567905283e-06,
      "loss": 1.0784,
      "step": 700
    },
    {
      "epoch": 1.4601542416452442,
      "grad_norm": 8.705361491327107,
      "learning_rate": 3.3766121970796716e-06,
      "loss": 1.0819,
      "step": 710
    },
    {
      "epoch": 1.480719794344473,
      "grad_norm": 4.183940499981808,
      "learning_rate": 3.1379493540215677e-06,
      "loss": 1.069,
      "step": 720
    },
    {
      "epoch": 1.5012853470437018,
      "grad_norm": 6.398726679627311,
      "learning_rate": 2.906455153766744e-06,
      "loss": 1.0785,
      "step": 730
    },
    {
      "epoch": 1.5218508997429305,
      "grad_norm": 13.601116552168905,
      "learning_rate": 2.6823714335324237e-06,
      "loss": 1.057,
      "step": 740
    },
    {
      "epoch": 1.5424164524421595,
      "grad_norm": 5.5895207138975,
      "learning_rate": 2.46593228896017e-06,
      "loss": 1.0553,
      "step": 750
    },
    {
      "epoch": 1.562982005141388,
      "grad_norm": 6.267057233279022,
      "learning_rate": 2.257363829560986e-06,
      "loss": 1.0542,
      "step": 760
    },
    {
      "epoch": 1.583547557840617,
      "grad_norm": 4.378216644983502,
      "learning_rate": 2.0568839425033906e-06,
      "loss": 1.0799,
      "step": 770
    },
    {
      "epoch": 1.6041131105398456,
      "grad_norm": 7.255472699627192,
      "learning_rate": 1.864702064991173e-06,
      "loss": 1.0571,
      "step": 780
    },
    {
      "epoch": 1.6246786632390746,
      "grad_norm": 6.289474192540149,
      "learning_rate": 1.6810189654686715e-06,
      "loss": 1.0472,
      "step": 790
    },
    {
      "epoch": 1.6452442159383034,
      "grad_norm": 6.589217658649451,
      "learning_rate": 1.5060265338821123e-06,
      "loss": 1.0703,
      "step": 800
    },
    {
      "epoch": 1.6658097686375322,
      "grad_norm": 4.865701816340032,
      "learning_rate": 1.3399075812161488e-06,
      "loss": 1.055,
      "step": 810
    },
    {
      "epoch": 1.686375321336761,
      "grad_norm": 8.65763359081179,
      "learning_rate": 1.1828356485149927e-06,
      "loss": 1.0622,
      "step": 820
    },
    {
      "epoch": 1.7069408740359897,
      "grad_norm": 6.174972638755989,
      "learning_rate": 1.0349748255876536e-06,
      "loss": 1.0526,
      "step": 830
    },
    {
      "epoch": 1.7275064267352185,
      "grad_norm": 4.986705584850829,
      "learning_rate": 8.964795795867176e-07,
      "loss": 1.051,
      "step": 840
    },
    {
      "epoch": 1.7480719794344473,
      "grad_norm": 9.202122011281354,
      "learning_rate": 7.67494593639686e-07,
      "loss": 1.0574,
      "step": 850
    },
    {
      "epoch": 1.7686375321336762,
      "grad_norm": 8.058395431378568,
      "learning_rate": 6.481546157014996e-07,
      "loss": 1.0589,
      "step": 860
    },
    {
      "epoch": 1.7892030848329048,
      "grad_norm": 5.3354582297297135,
      "learning_rate": 5.385843177861261e-07,
      "loss": 1.0578,
      "step": 870
    },
    {
      "epoch": 1.8097686375321338,
      "grad_norm": 3.8239017444437415,
      "learning_rate": 4.388981657242819e-07,
      "loss": 1.0663,
      "step": 880
    },
    {
      "epoch": 1.8303341902313623,
      "grad_norm": 10.032769787716335,
      "learning_rate": 3.4920029958333656e-07,
      "loss": 1.0671,
      "step": 890
    },
    {
      "epoch": 1.8508997429305913,
      "grad_norm": 4.449303124886201,
      "learning_rate": 2.695844248743318e-07,
      "loss": 1.0573,
      "step": 900
    },
    {
      "epoch": 1.87146529562982,
      "grad_norm": 6.5875403409786735,
      "learning_rate": 2.0013371465976816e-07,
      "loss": 1.063,
      "step": 910
    },
    {
      "epoch": 1.8920308483290489,
      "grad_norm": 6.1087849946753545,
      "learning_rate": 1.409207226644227e-07,
      "loss": 1.0703,
      "step": 920
    },
    {
      "epoch": 1.9125964010282777,
      "grad_norm": 9.357948853303002,
      "learning_rate": 9.200730747996211e-08,
      "loss": 1.0615,
      "step": 930
    },
    {
      "epoch": 1.9331619537275064,
      "grad_norm": 6.746455746388351,
      "learning_rate": 5.344456794255881e-08,
      "loss": 1.0591,
      "step": 940
    },
    {
      "epoch": 1.9537275064267352,
      "grad_norm": 4.307020198331092,
      "learning_rate": 2.5272789750980797e-08,
      "loss": 1.0591,
      "step": 950
    },
    {
      "epoch": 1.974293059125964,
      "grad_norm": 2.368072060390636,
      "learning_rate": 7.521403380956748e-09,
      "loss": 1.0602,
      "step": 960
    },
    {
      "epoch": 1.9948586118251928,
      "grad_norm": 6.97337278897093,
      "learning_rate": 2.089533397653387e-10,
      "loss": 1.0495,
      "step": 970
    },
    {
      "epoch": 1.9989717223650385,
      "step": 972,
      "total_flos": 9.034546879177687e+18,
      "train_loss": 1.1365475546675945,
      "train_runtime": 15552.1986,
      "train_samples_per_second": 32.015,
      "train_steps_per_second": 0.062
    }
  ],
  "logging_steps": 10,
  "max_steps": 972,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 500,
  "total_flos": 9.034546879177687e+18,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}