{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 14.884210526315789,
  "eval_steps": 500,
  "global_step": 119,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.12631578947368421,
      "grad_norm": 1.9382071495056152,
      "learning_rate": 8.333333333333333e-07,
      "loss": 0.5018,
      "step": 1
    },
    {
      "epoch": 0.25263157894736843,
      "grad_norm": 1.9020211696624756,
      "learning_rate": 1.6666666666666667e-06,
      "loss": 0.4715,
      "step": 2
    },
    {
      "epoch": 0.37894736842105264,
      "grad_norm": 2.1491496562957764,
      "learning_rate": 2.5e-06,
      "loss": 0.5222,
      "step": 3
    },
    {
      "epoch": 0.5052631578947369,
      "grad_norm": 1.9012116193771362,
      "learning_rate": 3.3333333333333333e-06,
      "loss": 0.4848,
      "step": 4
    },
    {
      "epoch": 0.631578947368421,
      "grad_norm": 1.9676955938339233,
      "learning_rate": 4.166666666666667e-06,
      "loss": 0.4981,
      "step": 5
    },
    {
      "epoch": 0.7578947368421053,
      "grad_norm": 1.5441391468048096,
      "learning_rate": 5e-06,
      "loss": 0.468,
      "step": 6
    },
    {
      "epoch": 0.8842105263157894,
      "grad_norm": 1.542313575744629,
      "learning_rate": 5.833333333333334e-06,
      "loss": 0.4635,
      "step": 7
    },
    {
      "epoch": 1.0,
      "grad_norm": 1.542313575744629,
      "learning_rate": 6.666666666666667e-06,
      "loss": 0.4412,
      "step": 8
    },
    {
      "epoch": 1.1263157894736842,
      "grad_norm": 1.0980747938156128,
      "learning_rate": 7.500000000000001e-06,
      "loss": 0.4205,
      "step": 9
    },
    {
      "epoch": 1.2526315789473683,
      "grad_norm": 0.8775039911270142,
      "learning_rate": 8.333333333333334e-06,
      "loss": 0.4191,
      "step": 10
    },
    {
      "epoch": 1.3789473684210527,
      "grad_norm": 0.4762185215950012,
      "learning_rate": 9.166666666666666e-06,
      "loss": 0.4331,
      "step": 11
    },
    {
      "epoch": 1.5052631578947369,
      "grad_norm": 1.0445164442062378,
      "learning_rate": 1e-05,
      "loss": 0.376,
      "step": 12
    },
    {
      "epoch": 1.631578947368421,
      "grad_norm": 1.0010490417480469,
      "learning_rate": 9.997845031134992e-06,
      "loss": 0.4098,
      "step": 13
    },
    {
      "epoch": 1.7578947368421054,
      "grad_norm": 0.9467893838882446,
      "learning_rate": 9.991381982096293e-06,
      "loss": 0.3882,
      "step": 14
    },
    {
      "epoch": 1.8842105263157896,
      "grad_norm": 0.7490498423576355,
      "learning_rate": 9.98061642395168e-06,
      "loss": 0.408,
      "step": 15
    },
    {
      "epoch": 2.0,
      "grad_norm": 0.8292229175567627,
      "learning_rate": 9.965557636478203e-06,
      "loss": 0.3526,
      "step": 16
    },
    {
      "epoch": 2.126315789473684,
      "grad_norm": 0.6992197036743164,
      "learning_rate": 9.94621860016312e-06,
      "loss": 0.3457,
      "step": 17
    },
    {
      "epoch": 2.2526315789473683,
      "grad_norm": 0.4710032045841217,
      "learning_rate": 9.922615985014887e-06,
      "loss": 0.3445,
      "step": 18
    },
    {
      "epoch": 2.3789473684210525,
      "grad_norm": 0.3899461030960083,
      "learning_rate": 9.894770136193814e-06,
      "loss": 0.3548,
      "step": 19
    },
    {
      "epoch": 2.5052631578947366,
      "grad_norm": 0.40362292528152466,
      "learning_rate": 9.862705056474795e-06,
      "loss": 0.3454,
      "step": 20
    },
    {
      "epoch": 2.6315789473684212,
      "grad_norm": 0.36370548605918884,
      "learning_rate": 9.826448385557208e-06,
      "loss": 0.3503,
      "step": 21
    },
    {
      "epoch": 2.7578947368421054,
      "grad_norm": 0.35196444392204285,
      "learning_rate": 9.786031376239842e-06,
      "loss": 0.3332,
      "step": 22
    },
    {
      "epoch": 2.8842105263157896,
      "grad_norm": 0.3312518894672394,
      "learning_rate": 9.741488867481377e-06,
      "loss": 0.3266,
      "step": 23
    },
    {
      "epoch": 3.0,
      "grad_norm": 0.3002198338508606,
      "learning_rate": 9.692859254369631e-06,
      "loss": 0.3333,
      "step": 24
    },
    {
      "epoch": 3.126315789473684,
      "grad_norm": 0.31604424118995667,
      "learning_rate": 9.640184455025472e-06,
      "loss": 0.3105,
      "step": 25
    },
    {
      "epoch": 3.2526315789473683,
      "grad_norm": 0.2710190713405609,
      "learning_rate": 9.583509874469924e-06,
      "loss": 0.303,
      "step": 26
    },
    {
      "epoch": 3.3789473684210525,
      "grad_norm": 0.2576391100883484,
      "learning_rate": 9.522884365485599e-06,
      "loss": 0.3219,
      "step": 27
    },
    {
      "epoch": 3.5052631578947366,
      "grad_norm": 0.23554575443267822,
      "learning_rate": 9.458360186506212e-06,
      "loss": 0.292,
      "step": 28
    },
    {
      "epoch": 3.6315789473684212,
      "grad_norm": 0.22661598026752472,
      "learning_rate": 9.389992956570463e-06,
      "loss": 0.3079,
      "step": 29
    },
    {
      "epoch": 3.7578947368421054,
      "grad_norm": 0.238206148147583,
      "learning_rate": 9.317841607379106e-06,
      "loss": 0.3033,
      "step": 30
    },
    {
      "epoch": 3.8842105263157896,
      "grad_norm": 0.2077244073152542,
      "learning_rate": 9.241968332496576e-06,
      "loss": 0.2923,
      "step": 31
    },
    {
      "epoch": 4.0,
      "grad_norm": 0.21579864621162415,
      "learning_rate": 9.162438533740891e-06,
      "loss": 0.2884,
      "step": 32
    },
    {
      "epoch": 4.126315789473685,
      "grad_norm": 0.21424686908721924,
      "learning_rate": 9.07932076480812e-06,
      "loss": 0.2787,
      "step": 33
    },
    {
      "epoch": 4.252631578947368,
      "grad_norm": 0.2355010062456131,
      "learning_rate": 8.99268667217993e-06,
      "loss": 0.2795,
      "step": 34
    },
    {
      "epoch": 4.378947368421053,
      "grad_norm": 0.19621430337429047,
      "learning_rate": 8.90261093336523e-06,
      "loss": 0.2726,
      "step": 35
    },
    {
      "epoch": 4.505263157894737,
      "grad_norm": 0.2064390927553177,
      "learning_rate": 8.809171192529074e-06,
      "loss": 0.283,
      "step": 36
    },
    {
      "epoch": 4.631578947368421,
      "grad_norm": 0.22728653252124786,
      "learning_rate": 8.712447993564362e-06,
      "loss": 0.2521,
      "step": 37
    },
    {
      "epoch": 4.757894736842105,
      "grad_norm": 0.19322557747364044,
      "learning_rate": 8.612524710664012e-06,
      "loss": 0.247,
      "step": 38
    },
    {
      "epoch": 4.88421052631579,
      "grad_norm": 0.1852494776248932,
      "learning_rate": 8.509487476453442e-06,
      "loss": 0.2585,
      "step": 39
    },
    {
      "epoch": 5.0,
      "grad_norm": 0.22497323155403137,
      "learning_rate": 8.403425107745315e-06,
      "loss": 0.2697,
      "step": 40
    },
    {
      "epoch": 5.126315789473685,
      "grad_norm": 0.23915345966815948,
      "learning_rate": 8.294429028980555e-06,
      "loss": 0.2322,
      "step": 41
    },
    {
      "epoch": 5.252631578947368,
      "grad_norm": 0.1770770251750946,
      "learning_rate": 8.182593193421625e-06,
      "loss": 0.23,
      "step": 42
    },
    {
      "epoch": 5.378947368421053,
      "grad_norm": 0.1872120201587677,
      "learning_rate": 8.06801400216597e-06,
      "loss": 0.232,
      "step": 43
    },
    {
      "epoch": 5.505263157894737,
      "grad_norm": 0.2505616247653961,
      "learning_rate": 7.950790221049485e-06,
      "loss": 0.2359,
      "step": 44
    },
    {
      "epoch": 5.631578947368421,
      "grad_norm": 0.19844704866409302,
      "learning_rate": 7.831022895511586e-06,
      "loss": 0.2451,
      "step": 45
    },
    {
      "epoch": 5.757894736842105,
      "grad_norm": 0.1715887188911438,
      "learning_rate": 7.708815263495307e-06,
      "loss": 0.2282,
      "step": 46
    },
    {
      "epoch": 5.88421052631579,
      "grad_norm": 0.2623206079006195,
      "learning_rate": 7.584272666457471e-06,
      "loss": 0.224,
      "step": 47
    },
    {
      "epoch": 6.0,
      "grad_norm": 0.23196087777614594,
      "learning_rate": 7.457502458565673e-06,
      "loss": 0.2259,
      "step": 48
    },
    {
      "epoch": 6.126315789473685,
      "grad_norm": 0.2110043466091156,
      "learning_rate": 7.328613914160319e-06,
      "loss": 0.2162,
      "step": 49
    },
    {
      "epoch": 6.252631578947368,
      "grad_norm": 0.19842451810836792,
      "learning_rate": 7.1977181335615085e-06,
      "loss": 0.2009,
      "step": 50
    },
    {
      "epoch": 6.378947368421053,
      "grad_norm": 0.20280607044696808,
      "learning_rate": 7.064927947301942e-06,
      "loss": 0.2042,
      "step": 51
    },
    {
      "epoch": 6.505263157894737,
      "grad_norm": 0.19076986610889435,
      "learning_rate": 6.9303578188684085e-06,
      "loss": 0.1802,
      "step": 52
    },
    {
      "epoch": 6.631578947368421,
      "grad_norm": 0.17408154904842377,
      "learning_rate": 6.79412374603568e-06,
      "loss": 0.1857,
      "step": 53
    },
    {
      "epoch": 6.757894736842105,
      "grad_norm": 0.22106344997882843,
      "learning_rate": 6.65634316087788e-06,
      "loss": 0.197,
      "step": 54
    },
    {
      "epoch": 6.88421052631579,
      "grad_norm": 0.20985056459903717,
      "learning_rate": 6.5171348285434965e-06,
      "loss": 0.1894,
      "step": 55
    },
    {
      "epoch": 7.0,
      "grad_norm": 0.1847132444381714,
      "learning_rate": 6.3766187448813e-06,
      "loss": 0.1941,
      "step": 56
    },
    {
      "epoch": 7.126315789473685,
      "grad_norm": 0.17527417838573456,
      "learning_rate": 6.234916033005421e-06,
      "loss": 0.1592,
      "step": 57
    },
    {
      "epoch": 7.252631578947368,
      "grad_norm": 0.18749035894870758,
      "learning_rate": 6.0921488388887315e-06,
      "loss": 0.1629,
      "step": 58
    },
    {
      "epoch": 7.378947368421053,
      "grad_norm": 0.19064860045909882,
      "learning_rate": 5.948440226074539e-06,
      "loss": 0.1691,
      "step": 59
    },
    {
      "epoch": 7.505263157894737,
      "grad_norm": 0.1939338892698288,
      "learning_rate": 5.803914069597342e-06,
      "loss": 0.1469,
      "step": 60
    },
    {
      "epoch": 7.631578947368421,
      "grad_norm": 0.19160349667072296,
      "learning_rate": 5.658694949204094e-06,
      "loss": 0.1536,
      "step": 61
    },
    {
      "epoch": 7.757894736842105,
      "grad_norm": 0.19728262722492218,
      "learning_rate": 5.512908041968018e-06,
      "loss": 0.1572,
      "step": 62
    },
    {
      "epoch": 7.88421052631579,
      "grad_norm": 0.4989687502384186,
      "learning_rate": 5.36667901438752e-06,
      "loss": 0.1571,
      "step": 63
    },
    {
      "epoch": 8.0,
      "grad_norm": 0.1953885853290558,
      "learning_rate": 5.220133914063239e-06,
      "loss": 0.1567,
      "step": 64
    },
    {
      "epoch": 8.126315789473685,
      "grad_norm": 0.2087051123380661,
      "learning_rate": 5.073399061046584e-06,
      "loss": 0.1345,
      "step": 65
    },
    {
      "epoch": 8.25263157894737,
      "grad_norm": 0.21436747908592224,
      "learning_rate": 4.926600938953418e-06,
      "loss": 0.1302,
      "step": 66
    },
    {
      "epoch": 8.378947368421052,
      "grad_norm": 0.20336535573005676,
      "learning_rate": 4.779866085936762e-06,
      "loss": 0.128,
      "step": 67
    },
    {
      "epoch": 8.505263157894737,
      "grad_norm": 0.2767126262187958,
      "learning_rate": 4.6333209856124814e-06,
      "loss": 0.121,
      "step": 68
    },
    {
      "epoch": 8.631578947368421,
      "grad_norm": 0.18408919870853424,
      "learning_rate": 4.487091958031984e-06,
      "loss": 0.1195,
      "step": 69
    },
    {
      "epoch": 8.757894736842106,
      "grad_norm": 0.2123204469680786,
      "learning_rate": 4.341305050795907e-06,
      "loss": 0.128,
      "step": 70
    },
    {
      "epoch": 8.884210526315789,
      "grad_norm": 0.2557663023471832,
      "learning_rate": 4.19608593040266e-06,
      "loss": 0.12,
      "step": 71
    },
    {
      "epoch": 9.0,
      "grad_norm": 0.23377615213394165,
      "learning_rate": 4.051559773925462e-06,
      "loss": 0.1289,
      "step": 72
    },
    {
      "epoch": 9.126315789473685,
      "grad_norm": 0.26173460483551025,
      "learning_rate": 3.907851161111269e-06,
      "loss": 0.1108,
      "step": 73
    },
    {
      "epoch": 9.25263157894737,
      "grad_norm": 0.1853621006011963,
      "learning_rate": 3.7650839669945804e-06,
      "loss": 0.0998,
      "step": 74
    },
    {
      "epoch": 9.378947368421052,
      "grad_norm": 0.25105205178260803,
      "learning_rate": 3.623381255118702e-06,
      "loss": 0.1048,
      "step": 75
    },
    {
      "epoch": 9.505263157894737,
      "grad_norm": 0.3097490072250366,
      "learning_rate": 3.4828651714565056e-06,
      "loss": 0.1017,
      "step": 76
    },
    {
      "epoch": 9.631578947368421,
      "grad_norm": 0.18099191784858704,
      "learning_rate": 3.3436568391221215e-06,
      "loss": 0.0921,
      "step": 77
    },
    {
      "epoch": 9.757894736842106,
      "grad_norm": 0.17075076699256897,
      "learning_rate": 3.2058762539643214e-06,
      "loss": 0.0917,
      "step": 78
    },
    {
      "epoch": 9.884210526315789,
      "grad_norm": 0.26677173376083374,
      "learning_rate": 3.0696421811315923e-06,
      "loss": 0.0915,
      "step": 79
    },
    {
      "epoch": 10.0,
      "grad_norm": 0.24106067419052124,
      "learning_rate": 2.9350720526980592e-06,
      "loss": 0.0872,
      "step": 80
    },
    {
      "epoch": 10.126315789473685,
      "grad_norm": 0.24568013846874237,
      "learning_rate": 2.8022818664384945e-06,
      "loss": 0.0834,
      "step": 81
    },
    {
      "epoch": 10.25263157894737,
      "grad_norm": 0.16468282043933868,
      "learning_rate": 2.671386085839682e-06,
      "loss": 0.0797,
      "step": 82
    },
    {
      "epoch": 10.378947368421052,
      "grad_norm": 0.18845616281032562,
      "learning_rate": 2.542497541434329e-06,
      "loss": 0.0752,
      "step": 83
    },
    {
      "epoch": 10.505263157894737,
      "grad_norm": 0.277291476726532,
      "learning_rate": 2.4157273335425296e-06,
      "loss": 0.076,
      "step": 84
    },
    {
      "epoch": 10.631578947368421,
      "grad_norm": 0.20707395672798157,
      "learning_rate": 2.291184736504695e-06,
      "loss": 0.0725,
      "step": 85
    },
    {
      "epoch": 10.757894736842106,
      "grad_norm": 0.16026121377944946,
      "learning_rate": 2.168977104488415e-06,
      "loss": 0.0725,
      "step": 86
    },
    {
      "epoch": 10.884210526315789,
      "grad_norm": 0.18469811975955963,
      "learning_rate": 2.049209778950518e-06,
      "loss": 0.0753,
      "step": 87
    },
    {
      "epoch": 11.0,
      "grad_norm": 0.240610271692276,
      "learning_rate": 1.9319859978340312e-06,
      "loss": 0.0745,
      "step": 88
    },
    {
      "epoch": 11.126315789473685,
      "grad_norm": 0.2793143689632416,
      "learning_rate": 1.8174068065783768e-06,
      "loss": 0.0637,
      "step": 89
    },
    {
      "epoch": 11.25263157894737,
      "grad_norm": 0.21911287307739258,
      "learning_rate": 1.7055709710194452e-06,
      "loss": 0.0673,
      "step": 90
    },
    {
      "epoch": 11.378947368421052,
      "grad_norm": 0.15809978544712067,
      "learning_rate": 1.5965748922546876e-06,
      "loss": 0.0604,
      "step": 91
    },
    {
      "epoch": 11.505263157894737,
      "grad_norm": 0.18286827206611633,
      "learning_rate": 1.490512523546559e-06,
      "loss": 0.057,
      "step": 92
    },
    {
      "epoch": 11.631578947368421,
      "grad_norm": 0.308363676071167,
      "learning_rate": 1.38747528933599e-06,
      "loss": 0.0639,
      "step": 93
    },
    {
      "epoch": 11.757894736842106,
      "grad_norm": 0.28524667024612427,
      "learning_rate": 1.28755200643564e-06,
      "loss": 0.0613,
      "step": 94
    },
    {
      "epoch": 11.884210526315789,
      "grad_norm": 0.18594607710838318,
      "learning_rate": 1.190828807470929e-06,
      "loss": 0.0597,
      "step": 95
    },
    {
      "epoch": 12.0,
      "grad_norm": 0.1585705280303955,
      "learning_rate": 1.0973890666347703e-06,
      "loss": 0.0542,
      "step": 96
    },
    {
      "epoch": 12.126315789473685,
      "grad_norm": 0.2194358855485916,
      "learning_rate": 1.0073133278200702e-06,
      "loss": 0.0534,
      "step": 97
    },
    {
      "epoch": 12.25263157894737,
      "grad_norm": 0.2553711235523224,
      "learning_rate": 9.206792351918809e-07,
      "loss": 0.0555,
      "step": 98
    },
    {
      "epoch": 12.378947368421052,
      "grad_norm": 0.23654919862747192,
      "learning_rate": 8.375614662591097e-07,
      "loss": 0.0509,
      "step": 99
    },
    {
      "epoch": 12.505263157894737,
      "grad_norm": 0.22062428295612335,
      "learning_rate": 7.580316675034255e-07,
      "loss": 0.0547,
      "step": 100
    },
    {
      "epoch": 12.631578947368421,
      "grad_norm": 0.17180566489696503,
      "learning_rate": 6.821583926208947e-07,
      "loss": 0.0533,
      "step": 101
    },
    {
      "epoch": 12.757894736842106,
      "grad_norm": 0.11739594489336014,
      "learning_rate": 6.100070434295379e-07,
      "loss": 0.0507,
      "step": 102
    },
    {
      "epoch": 12.884210526315789,
      "grad_norm": 0.14421358704566956,
      "learning_rate": 5.416398134937878e-07,
      "loss": 0.0469,
      "step": 103
    },
    {
      "epoch": 13.0,
      "grad_norm": 0.14421358704566956,
      "learning_rate": 4.771156345144018e-07,
      "loss": 0.0498,
      "step": 104
    },
    {
      "epoch": 13.126315789473685,
      "grad_norm": 0.38636109232902527,
      "learning_rate": 4.1649012553007795e-07,
      "loss": 0.049,
      "step": 105
    },
    {
      "epoch": 13.25263157894737,
      "grad_norm": 0.15430928766727448,
      "learning_rate": 3.5981554497452886e-07,
      "loss": 0.0467,
      "step": 106
    },
    {
      "epoch": 13.378947368421052,
      "grad_norm": 0.16978979110717773,
      "learning_rate": 3.0714074563037043e-07,
      "loss": 0.0432,
      "step": 107
    },
    {
      "epoch": 13.505263157894737,
      "grad_norm": 0.15888580679893494,
      "learning_rate": 2.585111325186235e-07,
      "loss": 0.0483,
      "step": 108
    },
    {
      "epoch": 13.631578947368421,
      "grad_norm": 0.16612868010997772,
      "learning_rate": 2.1396862376015904e-07,
      "loss": 0.0465,
      "step": 109
    },
    {
      "epoch": 13.757894736842106,
      "grad_norm": 0.12936703860759735,
      "learning_rate": 1.7355161444279346e-07,
      "loss": 0.0486,
      "step": 110
    },
    {
      "epoch": 13.884210526315789,
      "grad_norm": 0.12057878822088242,
      "learning_rate": 1.372949435252058e-07,
      "loss": 0.0465,
      "step": 111
    },
    {
      "epoch": 14.0,
      "grad_norm": 0.12300478667020798,
      "learning_rate": 1.0522986380618606e-07,
      "loss": 0.0491,
      "step": 112
    },
    {
      "epoch": 14.126315789473685,
      "grad_norm": 0.1282297968864441,
      "learning_rate": 7.738401498511406e-08,
      "loss": 0.0466,
      "step": 113
    },
    {
      "epoch": 14.25263157894737,
      "grad_norm": 0.12143048644065857,
      "learning_rate": 5.378139983688135e-08,
      "loss": 0.0509,
      "step": 114
    },
    {
      "epoch": 14.378947368421052,
      "grad_norm": 0.11874306946992874,
      "learning_rate": 3.444236352179831e-08,
      "loss": 0.0479,
      "step": 115
    },
    {
      "epoch": 14.505263157894737,
      "grad_norm": 0.12550322711467743,
      "learning_rate": 1.9383576048320752e-08,
      "loss": 0.0445,
      "step": 116
    },
    {
      "epoch": 14.631578947368421,
      "grad_norm": 0.11551317572593689,
      "learning_rate": 8.618017903708198e-09,
      "loss": 0.0489,
      "step": 117
    },
    {
      "epoch": 14.757894736842106,
      "grad_norm": 0.12495766580104828,
      "learning_rate": 2.154968865007989e-09,
      "loss": 0.0439,
      "step": 118
    },
    {
      "epoch": 14.884210526315789,
      "grad_norm": 0.10572399199008942,
      "learning_rate": 0.0,
      "loss": 0.0444,
      "step": 119
    },
    {
      "epoch": 14.884210526315789,
      "step": 119,
      "total_flos": 381420994822144.0,
      "train_loss": 0.19363727301609615,
      "train_runtime": 21617.7907,
      "train_samples_per_second": 0.595,
      "train_steps_per_second": 0.006
    }
  ],
  "logging_steps": 1,
  "max_steps": 119,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 17,
  "save_steps": 32,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 381420994822144.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}