{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 12500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002, "grad_norm": 9.999999046325684, "learning_rate": 4.000000000000001e-06, "loss": 18.7387, "step": 25 }, { "epoch": 0.004, "grad_norm": 10.0, "learning_rate": 8.000000000000001e-06, "loss": 22.1438, "step": 50 }, { "epoch": 0.006, "grad_norm": 10.0, "learning_rate": 1.2e-05, "loss": 23.3287, "step": 75 }, { "epoch": 0.008, "grad_norm": 9.999999046325684, "learning_rate": 1.6000000000000003e-05, "loss": 21.9887, "step": 100 }, { "epoch": 0.01, "grad_norm": 10.0, "learning_rate": 2e-05, "loss": 18.4825, "step": 125 }, { "epoch": 0.012, "grad_norm": 10.000000953674316, "learning_rate": 2.4e-05, "loss": 16.1713, "step": 150 }, { "epoch": 0.014, "grad_norm": 10.0, "learning_rate": 2.8000000000000003e-05, "loss": 15.2963, "step": 175 }, { "epoch": 0.016, "grad_norm": 9.999999046325684, "learning_rate": 3.2000000000000005e-05, "loss": 13.4162, "step": 200 }, { "epoch": 0.018, "grad_norm": 10.0, "learning_rate": 3.6e-05, "loss": 9.6975, "step": 225 }, { "epoch": 0.02, "grad_norm": 10.0, "learning_rate": 4e-05, "loss": 7.6338, "step": 250 }, { "epoch": 0.022, "grad_norm": 10.0, "learning_rate": 4.4000000000000006e-05, "loss": 6.175, "step": 275 }, { "epoch": 0.024, "grad_norm": 9.999998092651367, "learning_rate": 4.8e-05, "loss": 4.6375, "step": 300 }, { "epoch": 0.026, "grad_norm": 9.999998092651367, "learning_rate": 5.2000000000000004e-05, "loss": 3.1523, "step": 325 }, { "epoch": 0.028, "grad_norm": 9.999998092651367, "learning_rate": 5.6000000000000006e-05, "loss": 2.9066, "step": 350 }, { "epoch": 0.03, "grad_norm": 10.0, "learning_rate": 6e-05, "loss": 1.7845, "step": 375 }, { "epoch": 0.032, "grad_norm": 9.999999046325684, "learning_rate": 6.400000000000001e-05, "loss": 1.2612, "step": 400 }, { "epoch": 0.034, "grad_norm": 9.999999046325684, "learning_rate": 6.800000000000001e-05, "loss": 1.1808, "step": 425 }, { "epoch": 0.036, "grad_norm": 9.34283447265625, "learning_rate": 7.2e-05, "loss": 1.0462, "step": 450 }, { "epoch": 0.038, "grad_norm": 8.628987312316895, "learning_rate": 7.6e-05, "loss": 0.847, "step": 475 }, { "epoch": 0.04, "grad_norm": 7.913747787475586, "learning_rate": 8e-05, "loss": 0.7295, "step": 500 }, { "epoch": 0.042, "grad_norm": 10.0, "learning_rate": 8.4e-05, "loss": 0.7154, "step": 525 }, { "epoch": 0.044, "grad_norm": 5.550658702850342, "learning_rate": 8.800000000000001e-05, "loss": 0.5505, "step": 550 }, { "epoch": 0.046, "grad_norm": 2.6023919582366943, "learning_rate": 9.200000000000001e-05, "loss": 0.6063, "step": 575 }, { "epoch": 0.048, "grad_norm": 6.640954971313477, "learning_rate": 9.6e-05, "loss": 0.6737, "step": 600 }, { "epoch": 0.05, "grad_norm": 6.298529624938965, "learning_rate": 0.0001, "loss": 0.6029, "step": 625 }, { "epoch": 0.052, "grad_norm": 4.304025173187256, "learning_rate": 9.978947368421054e-05, "loss": 0.5277, "step": 650 }, { "epoch": 0.054, "grad_norm": 4.449888229370117, "learning_rate": 9.957894736842106e-05, "loss": 0.4861, "step": 675 }, { "epoch": 0.056, "grad_norm": 2.9031755924224854, "learning_rate": 9.936842105263159e-05, "loss": 0.4413, "step": 700 }, { "epoch": 0.058, "grad_norm": 4.989285469055176, "learning_rate": 9.915789473684211e-05, "loss": 0.4143, "step": 725 }, { "epoch": 0.06, "grad_norm": 5.094226360321045, "learning_rate": 9.894736842105263e-05, "loss": 0.4132, "step": 750 }, { "epoch": 0.062, "grad_norm": 6.01052713394165, "learning_rate": 9.873684210526316e-05, "loss": 0.3645, "step": 775 }, { "epoch": 0.064, "grad_norm": 3.638715982437134, "learning_rate": 9.852631578947368e-05, "loss": 0.3468, "step": 800 }, { "epoch": 0.066, "grad_norm": 5.820480823516846, "learning_rate": 9.831578947368422e-05, "loss": 0.3396, "step": 825 }, { "epoch": 0.068, "grad_norm": 4.334630966186523, "learning_rate": 9.810526315789475e-05, "loss": 0.3035, "step": 850 }, { "epoch": 0.07, "grad_norm": 4.271489143371582, "learning_rate": 9.789473684210527e-05, "loss": 0.3495, "step": 875 }, { "epoch": 0.072, "grad_norm": 4.406955718994141, "learning_rate": 9.768421052631579e-05, "loss": 0.3061, "step": 900 }, { "epoch": 0.074, "grad_norm": 3.8621153831481934, "learning_rate": 9.747368421052632e-05, "loss": 0.2804, "step": 925 }, { "epoch": 0.076, "grad_norm": 3.116670846939087, "learning_rate": 9.726315789473684e-05, "loss": 0.2664, "step": 950 }, { "epoch": 0.078, "grad_norm": 2.6488585472106934, "learning_rate": 9.705263157894738e-05, "loss": 0.2616, "step": 975 }, { "epoch": 0.08, "grad_norm": 3.343226194381714, "learning_rate": 9.68421052631579e-05, "loss": 0.2389, "step": 1000 }, { "epoch": 0.082, "grad_norm": 2.8532614707946777, "learning_rate": 9.663157894736843e-05, "loss": 0.2532, "step": 1025 }, { "epoch": 0.084, "grad_norm": 2.423327922821045, "learning_rate": 9.642105263157896e-05, "loss": 0.2479, "step": 1050 }, { "epoch": 0.086, "grad_norm": 2.269280195236206, "learning_rate": 9.621052631578947e-05, "loss": 0.2208, "step": 1075 }, { "epoch": 0.088, "grad_norm": 2.4925410747528076, "learning_rate": 9.6e-05, "loss": 0.2013, "step": 1100 }, { "epoch": 0.09, "grad_norm": 4.284687042236328, "learning_rate": 9.578947368421052e-05, "loss": 0.1964, "step": 1125 }, { "epoch": 0.092, "grad_norm": 2.524617910385132, "learning_rate": 9.557894736842106e-05, "loss": 0.2117, "step": 1150 }, { "epoch": 0.094, "grad_norm": 3.9951181411743164, "learning_rate": 9.536842105263159e-05, "loss": 0.1844, "step": 1175 }, { "epoch": 0.096, "grad_norm": 7.369756698608398, "learning_rate": 9.515789473684211e-05, "loss": 0.1838, "step": 1200 }, { "epoch": 0.098, "grad_norm": 3.532125949859619, "learning_rate": 9.494736842105264e-05, "loss": 0.1766, "step": 1225 }, { "epoch": 0.1, "grad_norm": 2.8758790493011475, "learning_rate": 9.473684210526316e-05, "loss": 0.2019, "step": 1250 }, { "epoch": 0.102, "grad_norm": 2.530642032623291, "learning_rate": 9.452631578947368e-05, "loss": 0.1539, "step": 1275 }, { "epoch": 0.104, "grad_norm": 3.9389524459838867, "learning_rate": 9.431578947368421e-05, "loss": 0.1865, "step": 1300 }, { "epoch": 0.106, "grad_norm": 2.398953437805176, "learning_rate": 9.410526315789473e-05, "loss": 0.1365, "step": 1325 }, { "epoch": 0.108, "grad_norm": 2.8597166538238525, "learning_rate": 9.389473684210527e-05, "loss": 0.1653, "step": 1350 }, { "epoch": 0.11, "grad_norm": 3.6551222801208496, "learning_rate": 9.36842105263158e-05, "loss": 0.1531, "step": 1375 }, { "epoch": 0.112, "grad_norm": 2.472316026687622, "learning_rate": 9.347368421052632e-05, "loss": 0.1281, "step": 1400 }, { "epoch": 0.114, "grad_norm": 2.308349847793579, "learning_rate": 9.326315789473684e-05, "loss": 0.1554, "step": 1425 }, { "epoch": 0.116, "grad_norm": 3.2640528678894043, "learning_rate": 9.305263157894737e-05, "loss": 0.1693, "step": 1450 }, { "epoch": 0.118, "grad_norm": 3.1926233768463135, "learning_rate": 9.28421052631579e-05, "loss": 0.2025, "step": 1475 }, { "epoch": 0.12, "grad_norm": 6.277203559875488, "learning_rate": 9.263157894736843e-05, "loss": 0.2018, "step": 1500 }, { "epoch": 0.122, "grad_norm": 5.876811981201172, "learning_rate": 9.242105263157895e-05, "loss": 0.2255, "step": 1525 }, { "epoch": 0.124, "grad_norm": 3.3951916694641113, "learning_rate": 9.221052631578948e-05, "loss": 0.1818, "step": 1550 }, { "epoch": 0.126, "grad_norm": 2.0903260707855225, "learning_rate": 9.200000000000001e-05, "loss": 0.1694, "step": 1575 }, { "epoch": 0.128, "grad_norm": 3.4474785327911377, "learning_rate": 9.178947368421052e-05, "loss": 0.1522, "step": 1600 }, { "epoch": 0.13, "grad_norm": 3.1567347049713135, "learning_rate": 9.157894736842105e-05, "loss": 0.1635, "step": 1625 }, { "epoch": 0.132, "grad_norm": 3.8927764892578125, "learning_rate": 9.136842105263159e-05, "loss": 0.1639, "step": 1650 }, { "epoch": 0.134, "grad_norm": 2.9919180870056152, "learning_rate": 9.11578947368421e-05, "loss": 0.1537, "step": 1675 }, { "epoch": 0.136, "grad_norm": 3.9411697387695312, "learning_rate": 9.094736842105264e-05, "loss": 0.1543, "step": 1700 }, { "epoch": 0.138, "grad_norm": 3.9650681018829346, "learning_rate": 9.073684210526316e-05, "loss": 0.1662, "step": 1725 }, { "epoch": 0.14, "grad_norm": 3.127971649169922, "learning_rate": 9.052631578947369e-05, "loss": 0.157, "step": 1750 }, { "epoch": 0.142, "grad_norm": 2.24344539642334, "learning_rate": 9.031578947368423e-05, "loss": 0.1414, "step": 1775 }, { "epoch": 0.144, "grad_norm": 2.141857385635376, "learning_rate": 9.010526315789473e-05, "loss": 0.1265, "step": 1800 }, { "epoch": 0.146, "grad_norm": 4.17902946472168, "learning_rate": 8.989473684210527e-05, "loss": 0.1626, "step": 1825 }, { "epoch": 0.148, "grad_norm": 2.6167194843292236, "learning_rate": 8.96842105263158e-05, "loss": 0.1488, "step": 1850 }, { "epoch": 0.15, "grad_norm": 1.361194372177124, "learning_rate": 8.947368421052632e-05, "loss": 0.1134, "step": 1875 }, { "epoch": 0.152, "grad_norm": 3.2407174110412598, "learning_rate": 8.926315789473685e-05, "loss": 0.1277, "step": 1900 }, { "epoch": 0.154, "grad_norm": 2.556647539138794, "learning_rate": 8.905263157894737e-05, "loss": 0.1205, "step": 1925 }, { "epoch": 0.156, "grad_norm": 1.8379806280136108, "learning_rate": 8.88421052631579e-05, "loss": 0.1511, "step": 1950 }, { "epoch": 0.158, "grad_norm": 1.8115049600601196, "learning_rate": 8.863157894736842e-05, "loss": 0.0999, "step": 1975 }, { "epoch": 0.16, "grad_norm": 2.2672791481018066, "learning_rate": 8.842105263157894e-05, "loss": 0.1025, "step": 2000 }, { "epoch": 0.162, "grad_norm": 3.3364503383636475, "learning_rate": 8.821052631578948e-05, "loss": 0.1158, "step": 2025 }, { "epoch": 0.164, "grad_norm": 2.527956247329712, "learning_rate": 8.800000000000001e-05, "loss": 0.1248, "step": 2050 }, { "epoch": 0.166, "grad_norm": 3.559095859527588, "learning_rate": 8.778947368421053e-05, "loss": 0.1139, "step": 2075 }, { "epoch": 0.168, "grad_norm": 3.2123589515686035, "learning_rate": 8.757894736842106e-05, "loss": 0.1646, "step": 2100 }, { "epoch": 0.17, "grad_norm": 1.7022438049316406, "learning_rate": 8.736842105263158e-05, "loss": 0.1502, "step": 2125 }, { "epoch": 0.172, "grad_norm": 3.6390633583068848, "learning_rate": 8.71578947368421e-05, "loss": 0.1943, "step": 2150 }, { "epoch": 0.174, "grad_norm": 2.4349706172943115, "learning_rate": 8.694736842105264e-05, "loss": 0.1619, "step": 2175 }, { "epoch": 0.176, "grad_norm": 6.100596904754639, "learning_rate": 8.673684210526316e-05, "loss": 0.1419, "step": 2200 }, { "epoch": 0.178, "grad_norm": 2.8502817153930664, "learning_rate": 8.652631578947369e-05, "loss": 0.1194, "step": 2225 }, { "epoch": 0.18, "grad_norm": 3.1560254096984863, "learning_rate": 8.631578947368421e-05, "loss": 0.1253, "step": 2250 }, { "epoch": 0.182, "grad_norm": 3.613964080810547, "learning_rate": 8.610526315789474e-05, "loss": 0.1161, "step": 2275 }, { "epoch": 0.184, "grad_norm": 2.2053143978118896, "learning_rate": 8.589473684210528e-05, "loss": 0.1303, "step": 2300 }, { "epoch": 0.186, "grad_norm": 6.676514148712158, "learning_rate": 8.568421052631578e-05, "loss": 0.2192, "step": 2325 }, { "epoch": 0.188, "grad_norm": 1.9195454120635986, "learning_rate": 8.547368421052632e-05, "loss": 0.1837, "step": 2350 }, { "epoch": 0.19, "grad_norm": 3.2146458625793457, "learning_rate": 8.526315789473685e-05, "loss": 0.1669, "step": 2375 }, { "epoch": 0.192, "grad_norm": 2.7265331745147705, "learning_rate": 8.505263157894737e-05, "loss": 0.1997, "step": 2400 }, { "epoch": 0.194, "grad_norm": 5.918653964996338, "learning_rate": 8.48421052631579e-05, "loss": 0.1934, "step": 2425 }, { "epoch": 0.196, "grad_norm": 2.6463940143585205, "learning_rate": 8.463157894736842e-05, "loss": 0.1706, "step": 2450 }, { "epoch": 0.198, "grad_norm": 2.56866455078125, "learning_rate": 8.442105263157896e-05, "loss": 0.1581, "step": 2475 }, { "epoch": 0.2, "grad_norm": 2.507850170135498, "learning_rate": 8.421052631578948e-05, "loss": 0.1593, "step": 2500 }, { "epoch": 0.202, "grad_norm": 2.009465456008911, "learning_rate": 8.4e-05, "loss": 0.1474, "step": 2525 }, { "epoch": 0.204, "grad_norm": 4.39176082611084, "learning_rate": 8.378947368421053e-05, "loss": 0.1629, "step": 2550 }, { "epoch": 0.206, "grad_norm": 3.6246728897094727, "learning_rate": 8.357894736842106e-05, "loss": 0.1717, "step": 2575 }, { "epoch": 0.208, "grad_norm": 2.2975339889526367, "learning_rate": 8.336842105263158e-05, "loss": 0.1584, "step": 2600 }, { "epoch": 0.21, "grad_norm": 1.475097894668579, "learning_rate": 8.315789473684212e-05, "loss": 0.1474, "step": 2625 }, { "epoch": 0.212, "grad_norm": 2.2008955478668213, "learning_rate": 8.294736842105263e-05, "loss": 0.1887, "step": 2650 }, { "epoch": 0.214, "grad_norm": 3.3320555686950684, "learning_rate": 8.273684210526315e-05, "loss": 0.2136, "step": 2675 }, { "epoch": 0.216, "grad_norm": 4.016301155090332, "learning_rate": 8.252631578947369e-05, "loss": 0.1777, "step": 2700 }, { "epoch": 0.218, "grad_norm": 2.3852882385253906, "learning_rate": 8.231578947368421e-05, "loss": 0.2038, "step": 2725 }, { "epoch": 0.22, "grad_norm": 4.516410827636719, "learning_rate": 8.210526315789474e-05, "loss": 0.2191, "step": 2750 }, { "epoch": 0.222, "grad_norm": 2.775944948196411, "learning_rate": 8.189473684210527e-05, "loss": 0.239, "step": 2775 }, { "epoch": 0.224, "grad_norm": 3.0027830600738525, "learning_rate": 8.16842105263158e-05, "loss": 0.2271, "step": 2800 }, { "epoch": 0.226, "grad_norm": 3.5212559700012207, "learning_rate": 8.147368421052633e-05, "loss": 0.2025, "step": 2825 }, { "epoch": 0.228, "grad_norm": 4.692851543426514, "learning_rate": 8.126315789473685e-05, "loss": 0.1637, "step": 2850 }, { "epoch": 0.23, "grad_norm": 3.9487805366516113, "learning_rate": 8.105263157894737e-05, "loss": 0.1594, "step": 2875 }, { "epoch": 0.232, "grad_norm": 3.8476414680480957, "learning_rate": 8.08421052631579e-05, "loss": 0.2109, "step": 2900 }, { "epoch": 0.234, "grad_norm": 2.902121067047119, "learning_rate": 8.063157894736842e-05, "loss": 0.1735, "step": 2925 }, { "epoch": 0.236, "grad_norm": 3.348264694213867, "learning_rate": 8.042105263157895e-05, "loss": 0.1995, "step": 2950 }, { "epoch": 0.238, "grad_norm": 3.9797656536102295, "learning_rate": 8.021052631578949e-05, "loss": 0.1916, "step": 2975 }, { "epoch": 0.24, "grad_norm": 2.963280200958252, "learning_rate": 8e-05, "loss": 0.1941, "step": 3000 }, { "epoch": 0.242, "grad_norm": 3.173311948776245, "learning_rate": 7.978947368421053e-05, "loss": 0.1578, "step": 3025 }, { "epoch": 0.244, "grad_norm": 2.4859843254089355, "learning_rate": 7.957894736842106e-05, "loss": 0.1913, "step": 3050 }, { "epoch": 0.246, "grad_norm": 2.6782002449035645, "learning_rate": 7.936842105263158e-05, "loss": 0.1526, "step": 3075 }, { "epoch": 0.248, "grad_norm": 6.26693058013916, "learning_rate": 7.915789473684211e-05, "loss": 0.1602, "step": 3100 }, { "epoch": 0.25, "grad_norm": 2.1816000938415527, "learning_rate": 7.894736842105263e-05, "loss": 0.1652, "step": 3125 }, { "epoch": 0.252, "grad_norm": 2.8874270915985107, "learning_rate": 7.873684210526317e-05, "loss": 0.1888, "step": 3150 }, { "epoch": 0.254, "grad_norm": 2.7631444931030273, "learning_rate": 7.852631578947369e-05, "loss": 0.1484, "step": 3175 }, { "epoch": 0.256, "grad_norm": 2.1312832832336426, "learning_rate": 7.83157894736842e-05, "loss": 0.2022, "step": 3200 }, { "epoch": 0.258, "grad_norm": 3.328763723373413, "learning_rate": 7.810526315789474e-05, "loss": 0.1938, "step": 3225 }, { "epoch": 0.26, "grad_norm": 3.270240306854248, "learning_rate": 7.789473684210526e-05, "loss": 0.1671, "step": 3250 }, { "epoch": 0.262, "grad_norm": 1.2207618951797485, "learning_rate": 7.768421052631579e-05, "loss": 0.1434, "step": 3275 }, { "epoch": 0.264, "grad_norm": 2.9835286140441895, "learning_rate": 7.747368421052633e-05, "loss": 0.1609, "step": 3300 }, { "epoch": 0.266, "grad_norm": 3.5259110927581787, "learning_rate": 7.726315789473684e-05, "loss": 0.145, "step": 3325 }, { "epoch": 0.268, "grad_norm": 3.514284372329712, "learning_rate": 7.705263157894738e-05, "loss": 0.1796, "step": 3350 }, { "epoch": 0.27, "grad_norm": 3.7748453617095947, "learning_rate": 7.68421052631579e-05, "loss": 0.167, "step": 3375 }, { "epoch": 0.272, "grad_norm": 4.1441779136657715, "learning_rate": 7.663157894736842e-05, "loss": 0.2027, "step": 3400 }, { "epoch": 0.274, "grad_norm": 4.4492878913879395, "learning_rate": 7.642105263157895e-05, "loss": 0.2903, "step": 3425 }, { "epoch": 0.276, "grad_norm": 4.148222923278809, "learning_rate": 7.621052631578947e-05, "loss": 0.3515, "step": 3450 }, { "epoch": 0.278, "grad_norm": 3.1954922676086426, "learning_rate": 7.6e-05, "loss": 0.2598, "step": 3475 }, { "epoch": 0.28, "grad_norm": 2.256054401397705, "learning_rate": 7.578947368421054e-05, "loss": 0.2331, "step": 3500 }, { "epoch": 0.282, "grad_norm": 1.3954627513885498, "learning_rate": 7.557894736842106e-05, "loss": 0.2147, "step": 3525 }, { "epoch": 0.284, "grad_norm": 4.935629844665527, "learning_rate": 7.536842105263158e-05, "loss": 0.3906, "step": 3550 }, { "epoch": 0.286, "grad_norm": 3.462646245956421, "learning_rate": 7.515789473684211e-05, "loss": 0.4341, "step": 3575 }, { "epoch": 0.288, "grad_norm": 1.512658953666687, "learning_rate": 7.494736842105263e-05, "loss": 0.4142, "step": 3600 }, { "epoch": 0.29, "grad_norm": 4.282130241394043, "learning_rate": 7.473684210526316e-05, "loss": 0.3227, "step": 3625 }, { "epoch": 0.292, "grad_norm": 6.799490928649902, "learning_rate": 7.452631578947368e-05, "loss": 0.2913, "step": 3650 }, { "epoch": 0.294, "grad_norm": 2.9833290576934814, "learning_rate": 7.431578947368422e-05, "loss": 0.195, "step": 3675 }, { "epoch": 0.296, "grad_norm": 3.2424564361572266, "learning_rate": 7.410526315789475e-05, "loss": 0.2175, "step": 3700 }, { "epoch": 0.298, "grad_norm": 3.1551601886749268, "learning_rate": 7.389473684210527e-05, "loss": 0.2779, "step": 3725 }, { "epoch": 0.3, "grad_norm": 5.371734142303467, "learning_rate": 7.368421052631579e-05, "loss": 0.2768, "step": 3750 }, { "epoch": 0.302, "grad_norm": 4.353482723236084, "learning_rate": 7.347368421052632e-05, "loss": 0.227, "step": 3775 }, { "epoch": 0.304, "grad_norm": 1.761165738105774, "learning_rate": 7.326315789473684e-05, "loss": 0.2184, "step": 3800 }, { "epoch": 0.306, "grad_norm": 1.952088713645935, "learning_rate": 7.305263157894738e-05, "loss": 0.279, "step": 3825 }, { "epoch": 0.308, "grad_norm": 2.8295345306396484, "learning_rate": 7.28421052631579e-05, "loss": 0.2077, "step": 3850 }, { "epoch": 0.31, "grad_norm": 3.803010940551758, "learning_rate": 7.263157894736843e-05, "loss": 0.2618, "step": 3875 }, { "epoch": 0.312, "grad_norm": 2.9954023361206055, "learning_rate": 7.242105263157896e-05, "loss": 0.278, "step": 3900 }, { "epoch": 0.314, "grad_norm": 3.2522518634796143, "learning_rate": 7.221052631578947e-05, "loss": 0.1834, "step": 3925 }, { "epoch": 0.316, "grad_norm": 2.8170676231384277, "learning_rate": 7.2e-05, "loss": 0.2125, "step": 3950 }, { "epoch": 0.318, "grad_norm": 1.8490381240844727, "learning_rate": 7.178947368421054e-05, "loss": 0.1691, "step": 3975 }, { "epoch": 0.32, "grad_norm": 5.5511579513549805, "learning_rate": 7.157894736842105e-05, "loss": 0.1718, "step": 4000 }, { "epoch": 0.322, "grad_norm": 2.619178295135498, "learning_rate": 7.136842105263159e-05, "loss": 0.1799, "step": 4025 }, { "epoch": 0.324, "grad_norm": 3.8128204345703125, "learning_rate": 7.115789473684211e-05, "loss": 0.1771, "step": 4050 }, { "epoch": 0.326, "grad_norm": 3.535412311553955, "learning_rate": 7.094736842105264e-05, "loss": 0.1798, "step": 4075 }, { "epoch": 0.328, "grad_norm": 1.8671201467514038, "learning_rate": 7.073684210526316e-05, "loss": 0.1576, "step": 4100 }, { "epoch": 0.33, "grad_norm": 2.651156187057495, "learning_rate": 7.052631578947368e-05, "loss": 0.1653, "step": 4125 }, { "epoch": 0.332, "grad_norm": 2.2884953022003174, "learning_rate": 7.031578947368421e-05, "loss": 0.155, "step": 4150 }, { "epoch": 0.334, "grad_norm": 2.7888643741607666, "learning_rate": 7.010526315789473e-05, "loss": 0.1702, "step": 4175 }, { "epoch": 0.336, "grad_norm": 2.2567873001098633, "learning_rate": 6.989473684210527e-05, "loss": 0.1466, "step": 4200 }, { "epoch": 0.338, "grad_norm": 3.6883699893951416, "learning_rate": 6.96842105263158e-05, "loss": 0.1465, "step": 4225 }, { "epoch": 0.34, "grad_norm": 2.7656190395355225, "learning_rate": 6.947368421052632e-05, "loss": 0.1496, "step": 4250 }, { "epoch": 0.342, "grad_norm": 4.047220230102539, "learning_rate": 6.926315789473684e-05, "loss": 0.1485, "step": 4275 }, { "epoch": 0.344, "grad_norm": 2.5935311317443848, "learning_rate": 6.905263157894737e-05, "loss": 0.1641, "step": 4300 }, { "epoch": 0.346, "grad_norm": 3.29296875, "learning_rate": 6.88421052631579e-05, "loss": 0.1691, "step": 4325 }, { "epoch": 0.348, "grad_norm": 1.8748677968978882, "learning_rate": 6.863157894736843e-05, "loss": 0.1564, "step": 4350 }, { "epoch": 0.35, "grad_norm": 2.8620316982269287, "learning_rate": 6.842105263157895e-05, "loss": 0.1977, "step": 4375 }, { "epoch": 0.352, "grad_norm": 3.427701711654663, "learning_rate": 6.821052631578948e-05, "loss": 0.1699, "step": 4400 }, { "epoch": 0.354, "grad_norm": 2.8630993366241455, "learning_rate": 6.800000000000001e-05, "loss": 0.1841, "step": 4425 }, { "epoch": 0.356, "grad_norm": 1.8674124479293823, "learning_rate": 6.778947368421052e-05, "loss": 0.1545, "step": 4450 }, { "epoch": 0.358, "grad_norm": 2.2906653881073, "learning_rate": 6.757894736842105e-05, "loss": 0.155, "step": 4475 }, { "epoch": 0.36, "grad_norm": 2.981401205062866, "learning_rate": 6.736842105263159e-05, "loss": 0.1604, "step": 4500 }, { "epoch": 0.362, "grad_norm": 3.040834903717041, "learning_rate": 6.71578947368421e-05, "loss": 0.153, "step": 4525 }, { "epoch": 0.364, "grad_norm": 2.600574493408203, "learning_rate": 6.694736842105264e-05, "loss": 0.1469, "step": 4550 }, { "epoch": 0.366, "grad_norm": 2.2893636226654053, "learning_rate": 6.673684210526316e-05, "loss": 0.1501, "step": 4575 }, { "epoch": 0.368, "grad_norm": 6.196401119232178, "learning_rate": 6.652631578947369e-05, "loss": 0.1561, "step": 4600 }, { "epoch": 0.37, "grad_norm": 3.3594653606414795, "learning_rate": 6.631578947368421e-05, "loss": 0.1757, "step": 4625 }, { "epoch": 0.372, "grad_norm": 2.437387704849243, "learning_rate": 6.610526315789473e-05, "loss": 0.1365, "step": 4650 }, { "epoch": 0.374, "grad_norm": 3.190577507019043, "learning_rate": 6.589473684210526e-05, "loss": 0.168, "step": 4675 }, { "epoch": 0.376, "grad_norm": 5.2853102684021, "learning_rate": 6.56842105263158e-05, "loss": 0.1422, "step": 4700 }, { "epoch": 0.378, "grad_norm": 4.163969039916992, "learning_rate": 6.547368421052632e-05, "loss": 0.1391, "step": 4725 }, { "epoch": 0.38, "grad_norm": 3.255009889602661, "learning_rate": 6.526315789473685e-05, "loss": 0.1851, "step": 4750 }, { "epoch": 0.382, "grad_norm": 1.9225974082946777, "learning_rate": 6.505263157894737e-05, "loss": 0.1688, "step": 4775 }, { "epoch": 0.384, "grad_norm": 2.5862815380096436, "learning_rate": 6.484210526315789e-05, "loss": 0.1883, "step": 4800 }, { "epoch": 0.386, "grad_norm": 2.3600857257843018, "learning_rate": 6.463157894736842e-05, "loss": 0.1584, "step": 4825 }, { "epoch": 0.388, "grad_norm": 4.694042682647705, "learning_rate": 6.442105263157894e-05, "loss": 0.1621, "step": 4850 }, { "epoch": 0.39, "grad_norm": 1.5520076751708984, "learning_rate": 6.421052631578948e-05, "loss": 0.1859, "step": 4875 }, { "epoch": 0.392, "grad_norm": 2.522648572921753, "learning_rate": 6.400000000000001e-05, "loss": 0.1549, "step": 4900 }, { "epoch": 0.394, "grad_norm": 2.736837863922119, "learning_rate": 6.378947368421053e-05, "loss": 0.1444, "step": 4925 }, { "epoch": 0.396, "grad_norm": 2.710667371749878, "learning_rate": 6.357894736842106e-05, "loss": 0.1589, "step": 4950 }, { "epoch": 0.398, "grad_norm": 2.561551570892334, "learning_rate": 6.336842105263158e-05, "loss": 0.1603, "step": 4975 }, { "epoch": 0.4, "grad_norm": 2.2533857822418213, "learning_rate": 6.31578947368421e-05, "loss": 0.1547, "step": 5000 }, { "epoch": 0.402, "grad_norm": 4.293575286865234, "learning_rate": 6.294736842105264e-05, "loss": 0.1649, "step": 5025 }, { "epoch": 0.404, "grad_norm": 2.0180907249450684, "learning_rate": 6.273684210526316e-05, "loss": 0.1542, "step": 5050 }, { "epoch": 0.406, "grad_norm": 4.162642955780029, "learning_rate": 6.252631578947369e-05, "loss": 0.1474, "step": 5075 }, { "epoch": 0.408, "grad_norm": 2.2592241764068604, "learning_rate": 6.231578947368422e-05, "loss": 0.1424, "step": 5100 }, { "epoch": 0.41, "grad_norm": 2.9657979011535645, "learning_rate": 6.210526315789474e-05, "loss": 0.141, "step": 5125 }, { "epoch": 0.412, "grad_norm": 4.562716960906982, "learning_rate": 6.189473684210526e-05, "loss": 0.1207, "step": 5150 }, { "epoch": 0.414, "grad_norm": 3.5740582942962646, "learning_rate": 6.168421052631578e-05, "loss": 0.1348, "step": 5175 }, { "epoch": 0.416, "grad_norm": 2.933711290359497, "learning_rate": 6.147368421052632e-05, "loss": 0.1262, "step": 5200 }, { "epoch": 0.418, "grad_norm": 2.674757480621338, "learning_rate": 6.126315789473685e-05, "loss": 0.1639, "step": 5225 }, { "epoch": 0.42, "grad_norm": 3.187509775161743, "learning_rate": 6.105263157894737e-05, "loss": 0.1486, "step": 5250 }, { "epoch": 0.422, "grad_norm": 2.763331174850464, "learning_rate": 6.08421052631579e-05, "loss": 0.1487, "step": 5275 }, { "epoch": 0.424, "grad_norm": 2.597900867462158, "learning_rate": 6.063157894736843e-05, "loss": 0.1482, "step": 5300 }, { "epoch": 0.426, "grad_norm": 2.390775203704834, "learning_rate": 6.042105263157894e-05, "loss": 0.1405, "step": 5325 }, { "epoch": 0.428, "grad_norm": 4.166103363037109, "learning_rate": 6.0210526315789475e-05, "loss": 0.1822, "step": 5350 }, { "epoch": 0.43, "grad_norm": 2.066412925720215, "learning_rate": 6e-05, "loss": 0.1272, "step": 5375 }, { "epoch": 0.432, "grad_norm": 3.385010242462158, "learning_rate": 5.978947368421053e-05, "loss": 0.1576, "step": 5400 }, { "epoch": 0.434, "grad_norm": 3.9283525943756104, "learning_rate": 5.9578947368421055e-05, "loss": 0.1804, "step": 5425 }, { "epoch": 0.436, "grad_norm": 1.7260828018188477, "learning_rate": 5.936842105263158e-05, "loss": 0.1547, "step": 5450 }, { "epoch": 0.438, "grad_norm": 1.3694740533828735, "learning_rate": 5.9157894736842114e-05, "loss": 0.1684, "step": 5475 }, { "epoch": 0.44, "grad_norm": 3.547086000442505, "learning_rate": 5.894736842105263e-05, "loss": 0.1731, "step": 5500 }, { "epoch": 0.442, "grad_norm": 4.220787048339844, "learning_rate": 5.8736842105263154e-05, "loss": 0.1502, "step": 5525 }, { "epoch": 0.444, "grad_norm": 5.9439239501953125, "learning_rate": 5.852631578947369e-05, "loss": 0.1775, "step": 5550 }, { "epoch": 0.446, "grad_norm": 2.953634738922119, "learning_rate": 5.8315789473684214e-05, "loss": 0.1577, "step": 5575 }, { "epoch": 0.448, "grad_norm": 3.257310628890991, "learning_rate": 5.810526315789474e-05, "loss": 0.2024, "step": 5600 }, { "epoch": 0.45, "grad_norm": 2.815178394317627, "learning_rate": 5.789473684210527e-05, "loss": 0.1858, "step": 5625 }, { "epoch": 0.452, "grad_norm": 4.573605537414551, "learning_rate": 5.7684210526315794e-05, "loss": 0.2153, "step": 5650 }, { "epoch": 0.454, "grad_norm": 1.737714409828186, "learning_rate": 5.747368421052633e-05, "loss": 0.1809, "step": 5675 }, { "epoch": 0.456, "grad_norm": 2.6098313331604004, "learning_rate": 5.726315789473684e-05, "loss": 0.166, "step": 5700 }, { "epoch": 0.458, "grad_norm": 1.715842843055725, "learning_rate": 5.7052631578947366e-05, "loss": 0.1662, "step": 5725 }, { "epoch": 0.46, "grad_norm": 1.6159685850143433, "learning_rate": 5.68421052631579e-05, "loss": 0.1443, "step": 5750 }, { "epoch": 0.462, "grad_norm": 2.3512282371520996, "learning_rate": 5.6631578947368426e-05, "loss": 0.1672, "step": 5775 }, { "epoch": 0.464, "grad_norm": 2.50100040435791, "learning_rate": 5.642105263157895e-05, "loss": 0.1596, "step": 5800 }, { "epoch": 0.466, "grad_norm": 3.3251962661743164, "learning_rate": 5.621052631578948e-05, "loss": 0.1914, "step": 5825 }, { "epoch": 0.468, "grad_norm": 2.4758493900299072, "learning_rate": 5.6000000000000006e-05, "loss": 0.1763, "step": 5850 }, { "epoch": 0.47, "grad_norm": 2.60750150680542, "learning_rate": 5.5789473684210526e-05, "loss": 0.163, "step": 5875 }, { "epoch": 0.472, "grad_norm": 2.7172982692718506, "learning_rate": 5.557894736842105e-05, "loss": 0.2162, "step": 5900 }, { "epoch": 0.474, "grad_norm": 2.4564871788024902, "learning_rate": 5.536842105263158e-05, "loss": 0.2331, "step": 5925 }, { "epoch": 0.476, "grad_norm": 4.060207843780518, "learning_rate": 5.5157894736842105e-05, "loss": 0.1686, "step": 5950 }, { "epoch": 0.478, "grad_norm": 4.7515764236450195, "learning_rate": 5.494736842105264e-05, "loss": 0.2329, "step": 5975 }, { "epoch": 0.48, "grad_norm": 4.869494915008545, "learning_rate": 5.4736842105263165e-05, "loss": 0.2139, "step": 6000 }, { "epoch": 0.482, "grad_norm": 4.088851451873779, "learning_rate": 5.452631578947369e-05, "loss": 0.2784, "step": 6025 }, { "epoch": 0.484, "grad_norm": 3.6407222747802734, "learning_rate": 5.431578947368421e-05, "loss": 0.2477, "step": 6050 }, { "epoch": 0.486, "grad_norm": 3.3714334964752197, "learning_rate": 5.410526315789474e-05, "loss": 0.2195, "step": 6075 }, { "epoch": 0.488, "grad_norm": 3.871067523956299, "learning_rate": 5.3894736842105265e-05, "loss": 0.2696, "step": 6100 }, { "epoch": 0.49, "grad_norm": 1.533532738685608, "learning_rate": 5.368421052631579e-05, "loss": 0.2131, "step": 6125 }, { "epoch": 0.492, "grad_norm": 4.925436973571777, "learning_rate": 5.347368421052632e-05, "loss": 0.2229, "step": 6150 }, { "epoch": 0.494, "grad_norm": 3.861382484436035, "learning_rate": 5.326315789473685e-05, "loss": 0.1841, "step": 6175 }, { "epoch": 0.496, "grad_norm": 5.30626106262207, "learning_rate": 5.305263157894738e-05, "loss": 0.2253, "step": 6200 }, { "epoch": 0.498, "grad_norm": 1.9890210628509521, "learning_rate": 5.284210526315789e-05, "loss": 0.2291, "step": 6225 }, { "epoch": 0.5, "grad_norm": 1.197603464126587, "learning_rate": 5.2631578947368424e-05, "loss": 0.1751, "step": 6250 }, { "epoch": 0.502, "grad_norm": 2.809992551803589, "learning_rate": 5.242105263157895e-05, "loss": 0.246, "step": 6275 }, { "epoch": 0.504, "grad_norm": 3.991882801055908, "learning_rate": 5.221052631578948e-05, "loss": 0.1961, "step": 6300 }, { "epoch": 0.506, "grad_norm": 4.238216876983643, "learning_rate": 5.2000000000000004e-05, "loss": 0.1843, "step": 6325 }, { "epoch": 0.508, "grad_norm": 1.8953478336334229, "learning_rate": 5.178947368421053e-05, "loss": 0.2101, "step": 6350 }, { "epoch": 0.51, "grad_norm": 2.997884750366211, "learning_rate": 5.157894736842106e-05, "loss": 0.2066, "step": 6375 }, { "epoch": 0.512, "grad_norm": 3.8943450450897217, "learning_rate": 5.1368421052631576e-05, "loss": 0.2258, "step": 6400 }, { "epoch": 0.514, "grad_norm": 3.5741677284240723, "learning_rate": 5.11578947368421e-05, "loss": 0.2189, "step": 6425 }, { "epoch": 0.516, "grad_norm": 1.6271083354949951, "learning_rate": 5.094736842105263e-05, "loss": 0.1745, "step": 6450 }, { "epoch": 0.518, "grad_norm": 1.796264886856079, "learning_rate": 5.073684210526316e-05, "loss": 0.1925, "step": 6475 }, { "epoch": 0.52, "grad_norm": 3.7582969665527344, "learning_rate": 5.052631578947369e-05, "loss": 0.2079, "step": 6500 }, { "epoch": 0.522, "grad_norm": 1.788479208946228, "learning_rate": 5.0315789473684216e-05, "loss": 0.1729, "step": 6525 }, { "epoch": 0.524, "grad_norm": 1.6168099641799927, "learning_rate": 5.010526315789474e-05, "loss": 0.1717, "step": 6550 }, { "epoch": 0.526, "grad_norm": 2.9334585666656494, "learning_rate": 4.989473684210527e-05, "loss": 0.1446, "step": 6575 }, { "epoch": 0.528, "grad_norm": 2.1925525665283203, "learning_rate": 4.9684210526315796e-05, "loss": 0.1478, "step": 6600 }, { "epoch": 0.53, "grad_norm": 2.6039555072784424, "learning_rate": 4.9473684210526315e-05, "loss": 0.1736, "step": 6625 }, { "epoch": 0.532, "grad_norm": 2.0845723152160645, "learning_rate": 4.926315789473684e-05, "loss": 0.1874, "step": 6650 }, { "epoch": 0.534, "grad_norm": 3.34181547164917, "learning_rate": 4.9052631578947375e-05, "loss": 0.1644, "step": 6675 }, { "epoch": 0.536, "grad_norm": 2.063713550567627, "learning_rate": 4.8842105263157895e-05, "loss": 0.1764, "step": 6700 }, { "epoch": 0.538, "grad_norm": 2.799060583114624, "learning_rate": 4.863157894736842e-05, "loss": 0.1868, "step": 6725 }, { "epoch": 0.54, "grad_norm": 2.2900466918945312, "learning_rate": 4.842105263157895e-05, "loss": 0.2066, "step": 6750 }, { "epoch": 0.542, "grad_norm": 3.7162959575653076, "learning_rate": 4.821052631578948e-05, "loss": 0.1997, "step": 6775 }, { "epoch": 0.544, "grad_norm": 3.615978956222534, "learning_rate": 4.8e-05, "loss": 0.2407, "step": 6800 }, { "epoch": 0.546, "grad_norm": 2.926126003265381, "learning_rate": 4.778947368421053e-05, "loss": 0.1995, "step": 6825 }, { "epoch": 0.548, "grad_norm": 3.1077892780303955, "learning_rate": 4.7578947368421054e-05, "loss": 0.1562, "step": 6850 }, { "epoch": 0.55, "grad_norm": 2.667742967605591, "learning_rate": 4.736842105263158e-05, "loss": 0.195, "step": 6875 }, { "epoch": 0.552, "grad_norm": 3.075545310974121, "learning_rate": 4.715789473684211e-05, "loss": 0.1747, "step": 6900 }, { "epoch": 0.554, "grad_norm": 5.871755123138428, "learning_rate": 4.6947368421052634e-05, "loss": 0.2339, "step": 6925 }, { "epoch": 0.556, "grad_norm": 1.96011221408844, "learning_rate": 4.673684210526316e-05, "loss": 0.1731, "step": 6950 }, { "epoch": 0.558, "grad_norm": 2.2573795318603516, "learning_rate": 4.652631578947369e-05, "loss": 0.1752, "step": 6975 }, { "epoch": 0.56, "grad_norm": 4.410872936248779, "learning_rate": 4.6315789473684214e-05, "loss": 0.1833, "step": 7000 }, { "epoch": 0.562, "grad_norm": 2.1633858680725098, "learning_rate": 4.610526315789474e-05, "loss": 0.1678, "step": 7025 }, { "epoch": 0.564, "grad_norm": 2.8176636695861816, "learning_rate": 4.589473684210526e-05, "loss": 0.2261, "step": 7050 }, { "epoch": 0.566, "grad_norm": 4.914159297943115, "learning_rate": 4.568421052631579e-05, "loss": 0.2307, "step": 7075 }, { "epoch": 0.568, "grad_norm": 3.0818915367126465, "learning_rate": 4.547368421052632e-05, "loss": 0.3216, "step": 7100 }, { "epoch": 0.57, "grad_norm": 2.862865447998047, "learning_rate": 4.5263157894736846e-05, "loss": 0.2771, "step": 7125 }, { "epoch": 0.572, "grad_norm": 3.212938070297241, "learning_rate": 4.5052631578947366e-05, "loss": 0.2252, "step": 7150 }, { "epoch": 0.574, "grad_norm": 5.601623058319092, "learning_rate": 4.48421052631579e-05, "loss": 0.2069, "step": 7175 }, { "epoch": 0.576, "grad_norm": 2.006655693054199, "learning_rate": 4.4631578947368426e-05, "loss": 0.2076, "step": 7200 }, { "epoch": 0.578, "grad_norm": 5.423392295837402, "learning_rate": 4.442105263157895e-05, "loss": 0.1807, "step": 7225 }, { "epoch": 0.58, "grad_norm": 2.1953580379486084, "learning_rate": 4.421052631578947e-05, "loss": 0.1854, "step": 7250 }, { "epoch": 0.582, "grad_norm": 2.8728268146514893, "learning_rate": 4.4000000000000006e-05, "loss": 0.2083, "step": 7275 }, { "epoch": 0.584, "grad_norm": 1.9604209661483765, "learning_rate": 4.378947368421053e-05, "loss": 0.1715, "step": 7300 }, { "epoch": 0.586, "grad_norm": 2.668545961380005, "learning_rate": 4.357894736842105e-05, "loss": 0.1471, "step": 7325 }, { "epoch": 0.588, "grad_norm": 1.3440651893615723, "learning_rate": 4.336842105263158e-05, "loss": 0.1729, "step": 7350 }, { "epoch": 0.59, "grad_norm": 2.6942169666290283, "learning_rate": 4.3157894736842105e-05, "loss": 0.2032, "step": 7375 }, { "epoch": 0.592, "grad_norm": 2.4484572410583496, "learning_rate": 4.294736842105264e-05, "loss": 0.2064, "step": 7400 }, { "epoch": 0.594, "grad_norm": 3.1453819274902344, "learning_rate": 4.273684210526316e-05, "loss": 0.2224, "step": 7425 }, { "epoch": 0.596, "grad_norm": 2.246695041656494, "learning_rate": 4.2526315789473685e-05, "loss": 0.1716, "step": 7450 }, { "epoch": 0.598, "grad_norm": 3.6021037101745605, "learning_rate": 4.231578947368421e-05, "loss": 0.1618, "step": 7475 }, { "epoch": 0.6, "grad_norm": 2.7792000770568848, "learning_rate": 4.210526315789474e-05, "loss": 0.1398, "step": 7500 }, { "epoch": 0.602, "grad_norm": 3.792266845703125, "learning_rate": 4.1894736842105264e-05, "loss": 0.1491, "step": 7525 }, { "epoch": 0.604, "grad_norm": 3.832718849182129, "learning_rate": 4.168421052631579e-05, "loss": 0.1734, "step": 7550 }, { "epoch": 0.606, "grad_norm": 5.681286334991455, "learning_rate": 4.147368421052632e-05, "loss": 0.2279, "step": 7575 }, { "epoch": 0.608, "grad_norm": 3.5109989643096924, "learning_rate": 4.1263157894736844e-05, "loss": 0.2148, "step": 7600 }, { "epoch": 0.61, "grad_norm": 2.3420827388763428, "learning_rate": 4.105263157894737e-05, "loss": 0.2353, "step": 7625 }, { "epoch": 0.612, "grad_norm": 4.0047407150268555, "learning_rate": 4.08421052631579e-05, "loss": 0.2745, "step": 7650 }, { "epoch": 0.614, "grad_norm": 2.9700982570648193, "learning_rate": 4.0631578947368424e-05, "loss": 0.2061, "step": 7675 }, { "epoch": 0.616, "grad_norm": 1.1286704540252686, "learning_rate": 4.042105263157895e-05, "loss": 0.2002, "step": 7700 }, { "epoch": 0.618, "grad_norm": 3.2874717712402344, "learning_rate": 4.021052631578948e-05, "loss": 0.1782, "step": 7725 }, { "epoch": 0.62, "grad_norm": 2.102379560470581, "learning_rate": 4e-05, "loss": 0.1683, "step": 7750 }, { "epoch": 0.622, "grad_norm": 2.282027244567871, "learning_rate": 3.978947368421053e-05, "loss": 0.16, "step": 7775 }, { "epoch": 0.624, "grad_norm": 3.09382963180542, "learning_rate": 3.9578947368421056e-05, "loss": 0.183, "step": 7800 }, { "epoch": 0.626, "grad_norm": 2.2616071701049805, "learning_rate": 3.936842105263158e-05, "loss": 0.1696, "step": 7825 }, { "epoch": 0.628, "grad_norm": 2.725660562515259, "learning_rate": 3.91578947368421e-05, "loss": 0.1798, "step": 7850 }, { "epoch": 0.63, "grad_norm": 3.027191400527954, "learning_rate": 3.894736842105263e-05, "loss": 0.1923, "step": 7875 }, { "epoch": 0.632, "grad_norm": 1.982648253440857, "learning_rate": 3.873684210526316e-05, "loss": 0.2375, "step": 7900 }, { "epoch": 0.634, "grad_norm": 1.9510787725448608, "learning_rate": 3.852631578947369e-05, "loss": 0.2464, "step": 7925 }, { "epoch": 0.636, "grad_norm": 2.4144234657287598, "learning_rate": 3.831578947368421e-05, "loss": 0.2646, "step": 7950 }, { "epoch": 0.638, "grad_norm": 2.5756216049194336, "learning_rate": 3.8105263157894735e-05, "loss": 0.2361, "step": 7975 }, { "epoch": 0.64, "grad_norm": 2.4998459815979004, "learning_rate": 3.789473684210527e-05, "loss": 0.2325, "step": 8000 }, { "epoch": 0.642, "grad_norm": 2.4527933597564697, "learning_rate": 3.768421052631579e-05, "loss": 0.2023, "step": 8025 }, { "epoch": 0.644, "grad_norm": 3.2145981788635254, "learning_rate": 3.7473684210526315e-05, "loss": 0.1794, "step": 8050 }, { "epoch": 0.646, "grad_norm": 3.4155004024505615, "learning_rate": 3.726315789473684e-05, "loss": 0.1793, "step": 8075 }, { "epoch": 0.648, "grad_norm": 1.3707998991012573, "learning_rate": 3.7052631578947375e-05, "loss": 0.145, "step": 8100 }, { "epoch": 0.65, "grad_norm": 1.991523265838623, "learning_rate": 3.6842105263157895e-05, "loss": 0.1993, "step": 8125 }, { "epoch": 0.652, "grad_norm": 1.9854768514633179, "learning_rate": 3.663157894736842e-05, "loss": 0.1834, "step": 8150 }, { "epoch": 0.654, "grad_norm": 2.564042806625366, "learning_rate": 3.642105263157895e-05, "loss": 0.1703, "step": 8175 }, { "epoch": 0.656, "grad_norm": 2.2101786136627197, "learning_rate": 3.621052631578948e-05, "loss": 0.148, "step": 8200 }, { "epoch": 0.658, "grad_norm": 4.551767826080322, "learning_rate": 3.6e-05, "loss": 0.1797, "step": 8225 }, { "epoch": 0.66, "grad_norm": 2.3506393432617188, "learning_rate": 3.578947368421053e-05, "loss": 0.2005, "step": 8250 }, { "epoch": 0.662, "grad_norm": 2.509979486465454, "learning_rate": 3.5578947368421054e-05, "loss": 0.1593, "step": 8275 }, { "epoch": 0.664, "grad_norm": 2.1514201164245605, "learning_rate": 3.536842105263158e-05, "loss": 0.1605, "step": 8300 }, { "epoch": 0.666, "grad_norm": 1.8447412252426147, "learning_rate": 3.515789473684211e-05, "loss": 0.1631, "step": 8325 }, { "epoch": 0.668, "grad_norm": 2.2108616828918457, "learning_rate": 3.4947368421052634e-05, "loss": 0.1812, "step": 8350 }, { "epoch": 0.67, "grad_norm": 3.138340473175049, "learning_rate": 3.473684210526316e-05, "loss": 0.1535, "step": 8375 }, { "epoch": 0.672, "grad_norm": 2.089402437210083, "learning_rate": 3.452631578947369e-05, "loss": 0.1882, "step": 8400 }, { "epoch": 0.674, "grad_norm": 2.022799253463745, "learning_rate": 3.431578947368421e-05, "loss": 0.1534, "step": 8425 }, { "epoch": 0.676, "grad_norm": 4.402505874633789, "learning_rate": 3.410526315789474e-05, "loss": 0.211, "step": 8450 }, { "epoch": 0.678, "grad_norm": 2.2280025482177734, "learning_rate": 3.389473684210526e-05, "loss": 0.2052, "step": 8475 }, { "epoch": 0.68, "grad_norm": 1.9944862127304077, "learning_rate": 3.368421052631579e-05, "loss": 0.1431, "step": 8500 }, { "epoch": 0.682, "grad_norm": 3.152775526046753, "learning_rate": 3.347368421052632e-05, "loss": 0.1697, "step": 8525 }, { "epoch": 0.684, "grad_norm": 4.06758975982666, "learning_rate": 3.3263157894736846e-05, "loss": 0.1894, "step": 8550 }, { "epoch": 0.686, "grad_norm": 4.115979194641113, "learning_rate": 3.3052631578947366e-05, "loss": 0.1746, "step": 8575 }, { "epoch": 0.688, "grad_norm": 1.844355821609497, "learning_rate": 3.28421052631579e-05, "loss": 0.1786, "step": 8600 }, { "epoch": 0.69, "grad_norm": 2.8369998931884766, "learning_rate": 3.2631578947368426e-05, "loss": 0.1718, "step": 8625 }, { "epoch": 0.692, "grad_norm": 4.434097766876221, "learning_rate": 3.2421052631578945e-05, "loss": 0.1707, "step": 8650 }, { "epoch": 0.694, "grad_norm": 3.360795497894287, "learning_rate": 3.221052631578947e-05, "loss": 0.1865, "step": 8675 }, { "epoch": 0.696, "grad_norm": 2.3571205139160156, "learning_rate": 3.2000000000000005e-05, "loss": 0.1922, "step": 8700 }, { "epoch": 0.698, "grad_norm": 2.478071451187134, "learning_rate": 3.178947368421053e-05, "loss": 0.2102, "step": 8725 }, { "epoch": 0.7, "grad_norm": 5.430673599243164, "learning_rate": 3.157894736842105e-05, "loss": 0.2257, "step": 8750 }, { "epoch": 0.702, "grad_norm": 3.069105625152588, "learning_rate": 3.136842105263158e-05, "loss": 0.2041, "step": 8775 }, { "epoch": 0.704, "grad_norm": 5.354055404663086, "learning_rate": 3.115789473684211e-05, "loss": 0.3051, "step": 8800 }, { "epoch": 0.706, "grad_norm": 2.466470956802368, "learning_rate": 3.094736842105263e-05, "loss": 0.3592, "step": 8825 }, { "epoch": 0.708, "grad_norm": 3.638484477996826, "learning_rate": 3.073684210526316e-05, "loss": 0.321, "step": 8850 }, { "epoch": 0.71, "grad_norm": 4.691796779632568, "learning_rate": 3.0526315789473684e-05, "loss": 0.3434, "step": 8875 }, { "epoch": 0.712, "grad_norm": 2.808640718460083, "learning_rate": 3.0315789473684214e-05, "loss": 0.2302, "step": 8900 }, { "epoch": 0.714, "grad_norm": 2.936375617980957, "learning_rate": 3.0105263157894737e-05, "loss": 0.2645, "step": 8925 }, { "epoch": 0.716, "grad_norm": 4.677466869354248, "learning_rate": 2.9894736842105264e-05, "loss": 0.3248, "step": 8950 }, { "epoch": 0.718, "grad_norm": 7.0504584312438965, "learning_rate": 2.968421052631579e-05, "loss": 0.3511, "step": 8975 }, { "epoch": 0.72, "grad_norm": 3.50658917427063, "learning_rate": 2.9473684210526314e-05, "loss": 0.3516, "step": 9000 }, { "epoch": 0.722, "grad_norm": 3.8559470176696777, "learning_rate": 2.9263157894736844e-05, "loss": 0.3557, "step": 9025 }, { "epoch": 0.724, "grad_norm": 9.091622352600098, "learning_rate": 2.905263157894737e-05, "loss": 0.3588, "step": 9050 }, { "epoch": 0.726, "grad_norm": 4.84503173828125, "learning_rate": 2.8842105263157897e-05, "loss": 0.3264, "step": 9075 }, { "epoch": 0.728, "grad_norm": 2.985128164291382, "learning_rate": 2.863157894736842e-05, "loss": 0.3308, "step": 9100 }, { "epoch": 0.73, "grad_norm": 1.7857965230941772, "learning_rate": 2.842105263157895e-05, "loss": 0.3014, "step": 9125 }, { "epoch": 0.732, "grad_norm": 3.808424472808838, "learning_rate": 2.8210526315789476e-05, "loss": 0.3639, "step": 9150 }, { "epoch": 0.734, "grad_norm": 5.9339070320129395, "learning_rate": 2.8000000000000003e-05, "loss": 0.3825, "step": 9175 }, { "epoch": 0.736, "grad_norm": 3.0576109886169434, "learning_rate": 2.7789473684210526e-05, "loss": 0.3629, "step": 9200 }, { "epoch": 0.738, "grad_norm": 3.745070219039917, "learning_rate": 2.7578947368421053e-05, "loss": 0.343, "step": 9225 }, { "epoch": 0.74, "grad_norm": 4.298427104949951, "learning_rate": 2.7368421052631583e-05, "loss": 0.3701, "step": 9250 }, { "epoch": 0.742, "grad_norm": 4.324175834655762, "learning_rate": 2.7157894736842106e-05, "loss": 0.3727, "step": 9275 }, { "epoch": 0.744, "grad_norm": 3.3111040592193604, "learning_rate": 2.6947368421052632e-05, "loss": 0.4209, "step": 9300 }, { "epoch": 0.746, "grad_norm": 5.95839786529541, "learning_rate": 2.673684210526316e-05, "loss": 0.3929, "step": 9325 }, { "epoch": 0.748, "grad_norm": 3.3262205123901367, "learning_rate": 2.652631578947369e-05, "loss": 0.3867, "step": 9350 }, { "epoch": 0.75, "grad_norm": 7.750339508056641, "learning_rate": 2.6315789473684212e-05, "loss": 0.4084, "step": 9375 }, { "epoch": 0.752, "grad_norm": 6.184704303741455, "learning_rate": 2.610526315789474e-05, "loss": 0.3948, "step": 9400 }, { "epoch": 0.754, "grad_norm": 4.458641052246094, "learning_rate": 2.5894736842105265e-05, "loss": 0.3993, "step": 9425 }, { "epoch": 0.756, "grad_norm": 2.420055389404297, "learning_rate": 2.5684210526315788e-05, "loss": 0.4213, "step": 9450 }, { "epoch": 0.758, "grad_norm": 3.379028081893921, "learning_rate": 2.5473684210526315e-05, "loss": 0.4065, "step": 9475 }, { "epoch": 0.76, "grad_norm": 3.8290674686431885, "learning_rate": 2.5263157894736845e-05, "loss": 0.3892, "step": 9500 }, { "epoch": 0.762, "grad_norm": 2.3934147357940674, "learning_rate": 2.505263157894737e-05, "loss": 0.363, "step": 9525 }, { "epoch": 0.764, "grad_norm": 2.6929566860198975, "learning_rate": 2.4842105263157898e-05, "loss": 0.4124, "step": 9550 }, { "epoch": 0.766, "grad_norm": 4.540988445281982, "learning_rate": 2.463157894736842e-05, "loss": 0.372, "step": 9575 }, { "epoch": 0.768, "grad_norm": 7.274412631988525, "learning_rate": 2.4421052631578948e-05, "loss": 0.3798, "step": 9600 }, { "epoch": 0.77, "grad_norm": 2.838831663131714, "learning_rate": 2.4210526315789474e-05, "loss": 0.4108, "step": 9625 }, { "epoch": 0.772, "grad_norm": 3.1290194988250732, "learning_rate": 2.4e-05, "loss": 0.3777, "step": 9650 }, { "epoch": 0.774, "grad_norm": 3.401431083679199, "learning_rate": 2.3789473684210527e-05, "loss": 0.3466, "step": 9675 }, { "epoch": 0.776, "grad_norm": 3.971301555633545, "learning_rate": 2.3578947368421054e-05, "loss": 0.3284, "step": 9700 }, { "epoch": 0.778, "grad_norm": 3.7027299404144287, "learning_rate": 2.336842105263158e-05, "loss": 0.3268, "step": 9725 }, { "epoch": 0.78, "grad_norm": 3.9830739498138428, "learning_rate": 2.3157894736842107e-05, "loss": 0.3523, "step": 9750 }, { "epoch": 0.782, "grad_norm": 3.219536304473877, "learning_rate": 2.294736842105263e-05, "loss": 0.3613, "step": 9775 }, { "epoch": 0.784, "grad_norm": 5.3083672523498535, "learning_rate": 2.273684210526316e-05, "loss": 0.3975, "step": 9800 }, { "epoch": 0.786, "grad_norm": 7.210485935211182, "learning_rate": 2.2526315789473683e-05, "loss": 0.3661, "step": 9825 }, { "epoch": 0.788, "grad_norm": 8.692153930664062, "learning_rate": 2.2315789473684213e-05, "loss": 0.3267, "step": 9850 }, { "epoch": 0.79, "grad_norm": 3.8115341663360596, "learning_rate": 2.2105263157894736e-05, "loss": 0.3664, "step": 9875 }, { "epoch": 0.792, "grad_norm": 3.92474627494812, "learning_rate": 2.1894736842105266e-05, "loss": 0.3677, "step": 9900 }, { "epoch": 0.794, "grad_norm": 3.509307622909546, "learning_rate": 2.168421052631579e-05, "loss": 0.4078, "step": 9925 }, { "epoch": 0.796, "grad_norm": 5.251331329345703, "learning_rate": 2.147368421052632e-05, "loss": 0.3243, "step": 9950 }, { "epoch": 0.798, "grad_norm": 5.842615127563477, "learning_rate": 2.1263157894736842e-05, "loss": 0.3681, "step": 9975 }, { "epoch": 0.8, "grad_norm": 2.8434536457061768, "learning_rate": 2.105263157894737e-05, "loss": 0.3559, "step": 10000 }, { "epoch": 0.802, "grad_norm": 6.5307793617248535, "learning_rate": 2.0842105263157895e-05, "loss": 0.3839, "step": 10025 }, { "epoch": 0.804, "grad_norm": 3.4104504585266113, "learning_rate": 2.0631578947368422e-05, "loss": 0.3887, "step": 10050 }, { "epoch": 0.806, "grad_norm": 3.9366180896759033, "learning_rate": 2.042105263157895e-05, "loss": 0.3695, "step": 10075 }, { "epoch": 0.808, "grad_norm": 3.8400652408599854, "learning_rate": 2.0210526315789475e-05, "loss": 0.3821, "step": 10100 }, { "epoch": 0.81, "grad_norm": 4.9908447265625, "learning_rate": 2e-05, "loss": 0.3639, "step": 10125 }, { "epoch": 0.812, "grad_norm": 4.293838024139404, "learning_rate": 1.9789473684210528e-05, "loss": 0.371, "step": 10150 }, { "epoch": 0.814, "grad_norm": 4.710839748382568, "learning_rate": 1.957894736842105e-05, "loss": 0.3739, "step": 10175 }, { "epoch": 0.816, "grad_norm": 4.468384265899658, "learning_rate": 1.936842105263158e-05, "loss": 0.3414, "step": 10200 }, { "epoch": 0.818, "grad_norm": 4.2299957275390625, "learning_rate": 1.9157894736842104e-05, "loss": 0.3293, "step": 10225 }, { "epoch": 0.82, "grad_norm": 2.4495487213134766, "learning_rate": 1.8947368421052634e-05, "loss": 0.414, "step": 10250 }, { "epoch": 0.822, "grad_norm": 4.561651706695557, "learning_rate": 1.8736842105263158e-05, "loss": 0.4099, "step": 10275 }, { "epoch": 0.824, "grad_norm": 2.8751111030578613, "learning_rate": 1.8526315789473687e-05, "loss": 0.3289, "step": 10300 }, { "epoch": 0.826, "grad_norm": 2.919625997543335, "learning_rate": 1.831578947368421e-05, "loss": 0.3553, "step": 10325 }, { "epoch": 0.828, "grad_norm": 1.8310528993606567, "learning_rate": 1.810526315789474e-05, "loss": 0.3746, "step": 10350 }, { "epoch": 0.83, "grad_norm": 2.163372755050659, "learning_rate": 1.7894736842105264e-05, "loss": 0.4245, "step": 10375 }, { "epoch": 0.832, "grad_norm": 4.151187419891357, "learning_rate": 1.768421052631579e-05, "loss": 0.3887, "step": 10400 }, { "epoch": 0.834, "grad_norm": 4.552877426147461, "learning_rate": 1.7473684210526317e-05, "loss": 0.3552, "step": 10425 }, { "epoch": 0.836, "grad_norm": 4.916106224060059, "learning_rate": 1.7263157894736843e-05, "loss": 0.4124, "step": 10450 }, { "epoch": 0.838, "grad_norm": 4.415599822998047, "learning_rate": 1.705263157894737e-05, "loss": 0.4095, "step": 10475 }, { "epoch": 0.84, "grad_norm": 5.485673904418945, "learning_rate": 1.6842105263157896e-05, "loss": 0.3779, "step": 10500 }, { "epoch": 0.842, "grad_norm": 3.694333791732788, "learning_rate": 1.6631578947368423e-05, "loss": 0.3618, "step": 10525 }, { "epoch": 0.844, "grad_norm": 3.96889066696167, "learning_rate": 1.642105263157895e-05, "loss": 0.3842, "step": 10550 }, { "epoch": 0.846, "grad_norm": 6.318436145782471, "learning_rate": 1.6210526315789473e-05, "loss": 0.3783, "step": 10575 }, { "epoch": 0.848, "grad_norm": 4.1109232902526855, "learning_rate": 1.6000000000000003e-05, "loss": 0.3276, "step": 10600 }, { "epoch": 0.85, "grad_norm": 4.43826961517334, "learning_rate": 1.5789473684210526e-05, "loss": 0.4221, "step": 10625 }, { "epoch": 0.852, "grad_norm": 3.70735239982605, "learning_rate": 1.5578947368421056e-05, "loss": 0.3948, "step": 10650 }, { "epoch": 0.854, "grad_norm": 3.354746103286743, "learning_rate": 1.536842105263158e-05, "loss": 0.4079, "step": 10675 }, { "epoch": 0.856, "grad_norm": 5.025430679321289, "learning_rate": 1.5157894736842107e-05, "loss": 0.3651, "step": 10700 }, { "epoch": 0.858, "grad_norm": 3.7044286727905273, "learning_rate": 1.4947368421052632e-05, "loss": 0.4036, "step": 10725 }, { "epoch": 0.86, "grad_norm": 5.7737884521484375, "learning_rate": 1.4736842105263157e-05, "loss": 0.426, "step": 10750 }, { "epoch": 0.862, "grad_norm": 4.619137763977051, "learning_rate": 1.4526315789473685e-05, "loss": 0.394, "step": 10775 }, { "epoch": 0.864, "grad_norm": 3.5880117416381836, "learning_rate": 1.431578947368421e-05, "loss": 0.3663, "step": 10800 }, { "epoch": 0.866, "grad_norm": 5.826897144317627, "learning_rate": 1.4105263157894738e-05, "loss": 0.329, "step": 10825 }, { "epoch": 0.868, "grad_norm": 5.386528968811035, "learning_rate": 1.3894736842105263e-05, "loss": 0.3184, "step": 10850 }, { "epoch": 0.87, "grad_norm": 5.732900142669678, "learning_rate": 1.3684210526315791e-05, "loss": 0.3611, "step": 10875 }, { "epoch": 0.872, "grad_norm": 3.784217596054077, "learning_rate": 1.3473684210526316e-05, "loss": 0.383, "step": 10900 }, { "epoch": 0.874, "grad_norm": 4.212706089019775, "learning_rate": 1.3263157894736844e-05, "loss": 0.3435, "step": 10925 }, { "epoch": 0.876, "grad_norm": 6.505759239196777, "learning_rate": 1.305263157894737e-05, "loss": 0.3759, "step": 10950 }, { "epoch": 0.878, "grad_norm": 3.2388670444488525, "learning_rate": 1.2842105263157894e-05, "loss": 0.3323, "step": 10975 }, { "epoch": 0.88, "grad_norm": 2.824962615966797, "learning_rate": 1.2631578947368422e-05, "loss": 0.3221, "step": 11000 }, { "epoch": 0.882, "grad_norm": 2.974475622177124, "learning_rate": 1.2421052631578949e-05, "loss": 0.2973, "step": 11025 }, { "epoch": 0.884, "grad_norm": 4.150785446166992, "learning_rate": 1.2210526315789474e-05, "loss": 0.3371, "step": 11050 }, { "epoch": 0.886, "grad_norm": 4.3901591300964355, "learning_rate": 1.2e-05, "loss": 0.2587, "step": 11075 }, { "epoch": 0.888, "grad_norm": 2.3966376781463623, "learning_rate": 1.1789473684210527e-05, "loss": 0.2976, "step": 11100 }, { "epoch": 0.89, "grad_norm": 3.4761784076690674, "learning_rate": 1.1578947368421053e-05, "loss": 0.307, "step": 11125 }, { "epoch": 0.892, "grad_norm": 2.4897382259368896, "learning_rate": 1.136842105263158e-05, "loss": 0.3121, "step": 11150 }, { "epoch": 0.894, "grad_norm": 2.342910051345825, "learning_rate": 1.1157894736842106e-05, "loss": 0.2486, "step": 11175 }, { "epoch": 0.896, "grad_norm": 3.4446234703063965, "learning_rate": 1.0947368421052633e-05, "loss": 0.3367, "step": 11200 }, { "epoch": 0.898, "grad_norm": 6.869594097137451, "learning_rate": 1.073684210526316e-05, "loss": 0.2779, "step": 11225 }, { "epoch": 0.9, "grad_norm": 5.409977912902832, "learning_rate": 1.0526315789473684e-05, "loss": 0.3466, "step": 11250 }, { "epoch": 0.902, "grad_norm": 3.0744941234588623, "learning_rate": 1.0315789473684211e-05, "loss": 0.3442, "step": 11275 }, { "epoch": 0.904, "grad_norm": 6.333566188812256, "learning_rate": 1.0105263157894738e-05, "loss": 0.2866, "step": 11300 }, { "epoch": 0.906, "grad_norm": 2.865696907043457, "learning_rate": 9.894736842105264e-06, "loss": 0.2836, "step": 11325 }, { "epoch": 0.908, "grad_norm": 10.0, "learning_rate": 9.68421052631579e-06, "loss": 0.2817, "step": 11350 }, { "epoch": 0.91, "grad_norm": 4.6826605796813965, "learning_rate": 9.473684210526317e-06, "loss": 0.3117, "step": 11375 }, { "epoch": 0.912, "grad_norm": 9.460478782653809, "learning_rate": 9.263157894736844e-06, "loss": 0.265, "step": 11400 }, { "epoch": 0.914, "grad_norm": 8.582769393920898, "learning_rate": 9.05263157894737e-06, "loss": 0.2552, "step": 11425 }, { "epoch": 0.916, "grad_norm": 3.5668044090270996, "learning_rate": 8.842105263157895e-06, "loss": 0.2983, "step": 11450 }, { "epoch": 0.918, "grad_norm": 4.328312873840332, "learning_rate": 8.631578947368422e-06, "loss": 0.2573, "step": 11475 }, { "epoch": 0.92, "grad_norm": 3.874371290206909, "learning_rate": 8.421052631578948e-06, "loss": 0.2822, "step": 11500 }, { "epoch": 0.922, "grad_norm": 3.0442512035369873, "learning_rate": 8.210526315789475e-06, "loss": 0.2974, "step": 11525 }, { "epoch": 0.924, "grad_norm": 4.014752388000488, "learning_rate": 8.000000000000001e-06, "loss": 0.2627, "step": 11550 }, { "epoch": 0.926, "grad_norm": 3.2828469276428223, "learning_rate": 7.789473684210528e-06, "loss": 0.302, "step": 11575 }, { "epoch": 0.928, "grad_norm": 5.778573989868164, "learning_rate": 7.578947368421054e-06, "loss": 0.2732, "step": 11600 }, { "epoch": 0.93, "grad_norm": 4.556640625, "learning_rate": 7.3684210526315784e-06, "loss": 0.2592, "step": 11625 }, { "epoch": 0.932, "grad_norm": 2.93839693069458, "learning_rate": 7.157894736842105e-06, "loss": 0.2058, "step": 11650 }, { "epoch": 0.934, "grad_norm": 2.7247869968414307, "learning_rate": 6.9473684210526315e-06, "loss": 0.2098, "step": 11675 }, { "epoch": 0.936, "grad_norm": 2.1764931678771973, "learning_rate": 6.736842105263158e-06, "loss": 0.279, "step": 11700 }, { "epoch": 0.938, "grad_norm": 1.9565917253494263, "learning_rate": 6.526315789473685e-06, "loss": 0.2394, "step": 11725 }, { "epoch": 0.94, "grad_norm": 2.1610960960388184, "learning_rate": 6.315789473684211e-06, "loss": 0.2181, "step": 11750 }, { "epoch": 0.942, "grad_norm": 4.215695381164551, "learning_rate": 6.105263157894737e-06, "loss": 0.237, "step": 11775 }, { "epoch": 0.944, "grad_norm": 2.702558994293213, "learning_rate": 5.8947368421052634e-06, "loss": 0.2401, "step": 11800 }, { "epoch": 0.946, "grad_norm": 3.1164896488189697, "learning_rate": 5.68421052631579e-06, "loss": 0.2844, "step": 11825 }, { "epoch": 0.948, "grad_norm": 3.159911632537842, "learning_rate": 5.4736842105263165e-06, "loss": 0.2637, "step": 11850 }, { "epoch": 0.95, "grad_norm": 2.3097712993621826, "learning_rate": 5.263157894736842e-06, "loss": 0.2123, "step": 11875 }, { "epoch": 0.952, "grad_norm": 2.6236393451690674, "learning_rate": 5.052631578947369e-06, "loss": 0.2124, "step": 11900 }, { "epoch": 0.954, "grad_norm": 2.240006923675537, "learning_rate": 4.842105263157895e-06, "loss": 0.2488, "step": 11925 }, { "epoch": 0.956, "grad_norm": 2.2833738327026367, "learning_rate": 4.631578947368422e-06, "loss": 0.2393, "step": 11950 }, { "epoch": 0.958, "grad_norm": 2.0853137969970703, "learning_rate": 4.4210526315789476e-06, "loss": 0.1891, "step": 11975 }, { "epoch": 0.96, "grad_norm": 5.040877342224121, "learning_rate": 4.210526315789474e-06, "loss": 0.2594, "step": 12000 }, { "epoch": 0.962, "grad_norm": 3.583343744277954, "learning_rate": 4.000000000000001e-06, "loss": 0.2463, "step": 12025 }, { "epoch": 0.964, "grad_norm": 3.5989181995391846, "learning_rate": 3.789473684210527e-06, "loss": 0.2231, "step": 12050 }, { "epoch": 0.966, "grad_norm": 3.2131879329681396, "learning_rate": 3.5789473684210525e-06, "loss": 0.2302, "step": 12075 }, { "epoch": 0.968, "grad_norm": 2.3687148094177246, "learning_rate": 3.368421052631579e-06, "loss": 0.2426, "step": 12100 }, { "epoch": 0.97, "grad_norm": 2.4739716053009033, "learning_rate": 3.1578947368421056e-06, "loss": 0.233, "step": 12125 }, { "epoch": 0.972, "grad_norm": 4.973099231719971, "learning_rate": 2.9473684210526317e-06, "loss": 0.2395, "step": 12150 }, { "epoch": 0.974, "grad_norm": 1.5512763261795044, "learning_rate": 2.7368421052631583e-06, "loss": 0.2437, "step": 12175 }, { "epoch": 0.976, "grad_norm": 3.5859758853912354, "learning_rate": 2.5263157894736844e-06, "loss": 0.2519, "step": 12200 }, { "epoch": 0.978, "grad_norm": 2.6558282375335693, "learning_rate": 2.315789473684211e-06, "loss": 0.2029, "step": 12225 }, { "epoch": 0.98, "grad_norm": 6.469668388366699, "learning_rate": 2.105263157894737e-06, "loss": 0.2182, "step": 12250 }, { "epoch": 0.982, "grad_norm": 4.018293857574463, "learning_rate": 1.8947368421052634e-06, "loss": 0.1946, "step": 12275 }, { "epoch": 0.984, "grad_norm": 2.706835985183716, "learning_rate": 1.6842105263157895e-06, "loss": 0.2391, "step": 12300 }, { "epoch": 0.986, "grad_norm": 5.652164936065674, "learning_rate": 1.4736842105263159e-06, "loss": 0.2884, "step": 12325 }, { "epoch": 0.988, "grad_norm": 2.2838077545166016, "learning_rate": 1.2631578947368422e-06, "loss": 0.2257, "step": 12350 }, { "epoch": 0.99, "grad_norm": 4.01392936706543, "learning_rate": 1.0526315789473685e-06, "loss": 0.2852, "step": 12375 }, { "epoch": 0.992, "grad_norm": 5.119837760925293, "learning_rate": 8.421052631578948e-07, "loss": 0.2021, "step": 12400 }, { "epoch": 0.994, "grad_norm": 3.6381170749664307, "learning_rate": 6.315789473684211e-07, "loss": 0.2518, "step": 12425 }, { "epoch": 0.996, "grad_norm": 3.1016297340393066, "learning_rate": 4.210526315789474e-07, "loss": 0.2371, "step": 12450 }, { "epoch": 0.998, "grad_norm": 4.031352519989014, "learning_rate": 2.105263157894737e-07, "loss": 0.2753, "step": 12475 }, { "epoch": 1.0, "grad_norm": 2.030440330505371, "learning_rate": 0.0, "loss": 0.2449, "step": 12500 } ], "logging_steps": 25, "max_steps": 12500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }