{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.996845425867508, "eval_steps": 500, "global_step": 2376, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0012618296529968455, "grad_norm": 54.94166350549407, "learning_rate": 2.1008403361344538e-07, "loss": 10.9975, "step": 1 }, { "epoch": 0.002523659305993691, "grad_norm": 56.60731240436532, "learning_rate": 4.2016806722689076e-07, "loss": 11.041, "step": 2 }, { "epoch": 0.0037854889589905363, "grad_norm": 54.1033147892274, "learning_rate": 6.302521008403362e-07, "loss": 11.1604, "step": 3 }, { "epoch": 0.005047318611987382, "grad_norm": 55.143994615610836, "learning_rate": 8.403361344537815e-07, "loss": 11.0666, "step": 4 }, { "epoch": 0.006309148264984227, "grad_norm": 55.18721646033169, "learning_rate": 1.0504201680672271e-06, "loss": 11.0652, "step": 5 }, { "epoch": 0.007570977917981073, "grad_norm": 58.43286215962764, "learning_rate": 1.2605042016806724e-06, "loss": 10.939, "step": 6 }, { "epoch": 0.008832807570977918, "grad_norm": 57.03233045866982, "learning_rate": 1.4705882352941177e-06, "loss": 10.9208, "step": 7 }, { "epoch": 0.010094637223974764, "grad_norm": 60.04898713418555, "learning_rate": 1.680672268907563e-06, "loss": 10.6894, "step": 8 }, { "epoch": 0.011356466876971609, "grad_norm": 59.858171778864154, "learning_rate": 1.8907563025210083e-06, "loss": 10.8423, "step": 9 }, { "epoch": 0.012618296529968454, "grad_norm": 66.18668736152034, "learning_rate": 2.1008403361344543e-06, "loss": 10.611, "step": 10 }, { "epoch": 0.0138801261829653, "grad_norm": 82.64596479161493, "learning_rate": 2.3109243697478996e-06, "loss": 9.503, "step": 11 }, { "epoch": 0.015141955835962145, "grad_norm": 95.96109664523145, "learning_rate": 2.521008403361345e-06, "loss": 9.0823, "step": 12 }, { "epoch": 0.016403785488958992, "grad_norm": 102.7068567874174, "learning_rate": 2.73109243697479e-06, "loss": 8.8712, "step": 13 }, { "epoch": 0.017665615141955835, "grad_norm": 68.46446806397606, "learning_rate": 2.9411764705882355e-06, "loss": 3.8286, "step": 14 }, { "epoch": 0.01892744479495268, "grad_norm": 58.86267227950065, "learning_rate": 3.1512605042016808e-06, "loss": 3.4334, "step": 15 }, { "epoch": 0.02018927444794953, "grad_norm": 53.001280097257975, "learning_rate": 3.361344537815126e-06, "loss": 3.1439, "step": 16 }, { "epoch": 0.02145110410094637, "grad_norm": 38.30725393774374, "learning_rate": 3.5714285714285714e-06, "loss": 2.7053, "step": 17 }, { "epoch": 0.022712933753943218, "grad_norm": 29.84088583004441, "learning_rate": 3.7815126050420167e-06, "loss": 2.3237, "step": 18 }, { "epoch": 0.023974763406940065, "grad_norm": 6.775637501837265, "learning_rate": 3.991596638655462e-06, "loss": 1.4353, "step": 19 }, { "epoch": 0.025236593059936908, "grad_norm": 5.055359444793141, "learning_rate": 4.2016806722689085e-06, "loss": 1.3068, "step": 20 }, { "epoch": 0.026498422712933754, "grad_norm": 4.474531192691532, "learning_rate": 4.411764705882353e-06, "loss": 1.3256, "step": 21 }, { "epoch": 0.0277602523659306, "grad_norm": 3.392890426338277, "learning_rate": 4.621848739495799e-06, "loss": 1.1999, "step": 22 }, { "epoch": 0.029022082018927444, "grad_norm": 2.6076305552588765, "learning_rate": 4.831932773109244e-06, "loss": 1.1466, "step": 23 }, { "epoch": 0.03028391167192429, "grad_norm": 2.1135640930397166, "learning_rate": 5.04201680672269e-06, "loss": 1.0897, "step": 24 }, { "epoch": 0.031545741324921134, "grad_norm": 1.6748941172331713, "learning_rate": 5.252100840336135e-06, "loss": 1.0, "step": 25 }, { "epoch": 0.032807570977917984, "grad_norm": 2.2740424704024087, "learning_rate": 5.46218487394958e-06, "loss": 0.9593, "step": 26 }, { "epoch": 0.03406940063091483, "grad_norm": 1.9393860772329445, "learning_rate": 5.672268907563025e-06, "loss": 0.9423, "step": 27 }, { "epoch": 0.03533123028391167, "grad_norm": 2.336883568540269, "learning_rate": 5.882352941176471e-06, "loss": 0.8847, "step": 28 }, { "epoch": 0.03659305993690852, "grad_norm": 0.9880620973146782, "learning_rate": 6.092436974789916e-06, "loss": 0.8869, "step": 29 }, { "epoch": 0.03785488958990536, "grad_norm": 0.8586262555935786, "learning_rate": 6.3025210084033615e-06, "loss": 0.8288, "step": 30 }, { "epoch": 0.039116719242902206, "grad_norm": 0.8970263467299561, "learning_rate": 6.512605042016807e-06, "loss": 0.8547, "step": 31 }, { "epoch": 0.04037854889589906, "grad_norm": 0.7035554853515015, "learning_rate": 6.722689075630252e-06, "loss": 0.7804, "step": 32 }, { "epoch": 0.0416403785488959, "grad_norm": 0.7579672131245586, "learning_rate": 6.932773109243698e-06, "loss": 0.7698, "step": 33 }, { "epoch": 0.04290220820189274, "grad_norm": 0.70956501048036, "learning_rate": 7.142857142857143e-06, "loss": 0.7874, "step": 34 }, { "epoch": 0.04416403785488959, "grad_norm": 0.6376030658996721, "learning_rate": 7.3529411764705884e-06, "loss": 0.7609, "step": 35 }, { "epoch": 0.045425867507886436, "grad_norm": 0.6990802975694539, "learning_rate": 7.563025210084033e-06, "loss": 0.7388, "step": 36 }, { "epoch": 0.04668769716088328, "grad_norm": 0.6570014476882662, "learning_rate": 7.773109243697479e-06, "loss": 0.7476, "step": 37 }, { "epoch": 0.04794952681388013, "grad_norm": 0.5837933375764158, "learning_rate": 7.983193277310924e-06, "loss": 0.7151, "step": 38 }, { "epoch": 0.04921135646687697, "grad_norm": 0.5168415150271652, "learning_rate": 8.19327731092437e-06, "loss": 0.6841, "step": 39 }, { "epoch": 0.050473186119873815, "grad_norm": 0.4452331212043853, "learning_rate": 8.403361344537817e-06, "loss": 0.6845, "step": 40 }, { "epoch": 0.051735015772870666, "grad_norm": 0.4861251196797448, "learning_rate": 8.61344537815126e-06, "loss": 0.6657, "step": 41 }, { "epoch": 0.05299684542586751, "grad_norm": 0.4730000643691013, "learning_rate": 8.823529411764707e-06, "loss": 0.6561, "step": 42 }, { "epoch": 0.05425867507886435, "grad_norm": 0.49223245534256665, "learning_rate": 9.033613445378152e-06, "loss": 0.6669, "step": 43 }, { "epoch": 0.0555205047318612, "grad_norm": 0.5274891728509885, "learning_rate": 9.243697478991598e-06, "loss": 0.6479, "step": 44 }, { "epoch": 0.056782334384858045, "grad_norm": 0.425203347952182, "learning_rate": 9.453781512605041e-06, "loss": 0.6299, "step": 45 }, { "epoch": 0.05804416403785489, "grad_norm": 0.3860692715356024, "learning_rate": 9.663865546218488e-06, "loss": 0.6574, "step": 46 }, { "epoch": 0.05930599369085174, "grad_norm": 0.4318868647602984, "learning_rate": 9.873949579831933e-06, "loss": 0.6429, "step": 47 }, { "epoch": 0.06056782334384858, "grad_norm": 0.4761263034076977, "learning_rate": 1.008403361344538e-05, "loss": 0.6415, "step": 48 }, { "epoch": 0.061829652996845424, "grad_norm": 0.4086278976020363, "learning_rate": 1.0294117647058824e-05, "loss": 0.6046, "step": 49 }, { "epoch": 0.06309148264984227, "grad_norm": 0.38678788361462685, "learning_rate": 1.050420168067227e-05, "loss": 0.6568, "step": 50 }, { "epoch": 0.06435331230283911, "grad_norm": 0.3740055984424438, "learning_rate": 1.0714285714285714e-05, "loss": 0.6408, "step": 51 }, { "epoch": 0.06561514195583597, "grad_norm": 0.37187783362935944, "learning_rate": 1.092436974789916e-05, "loss": 0.5913, "step": 52 }, { "epoch": 0.06687697160883281, "grad_norm": 0.3952203634652536, "learning_rate": 1.1134453781512606e-05, "loss": 0.6101, "step": 53 }, { "epoch": 0.06813880126182965, "grad_norm": 0.347039000658668, "learning_rate": 1.134453781512605e-05, "loss": 0.6078, "step": 54 }, { "epoch": 0.0694006309148265, "grad_norm": 0.2920842022495731, "learning_rate": 1.1554621848739497e-05, "loss": 0.599, "step": 55 }, { "epoch": 0.07066246056782334, "grad_norm": 0.3332584475010501, "learning_rate": 1.1764705882352942e-05, "loss": 0.6038, "step": 56 }, { "epoch": 0.07192429022082018, "grad_norm": 0.35172264312534435, "learning_rate": 1.1974789915966388e-05, "loss": 0.6127, "step": 57 }, { "epoch": 0.07318611987381704, "grad_norm": 0.32527906398133477, "learning_rate": 1.2184873949579832e-05, "loss": 0.5758, "step": 58 }, { "epoch": 0.07444794952681388, "grad_norm": 0.32909176364410186, "learning_rate": 1.2394957983193278e-05, "loss": 0.6242, "step": 59 }, { "epoch": 0.07570977917981073, "grad_norm": 0.2790068039976006, "learning_rate": 1.2605042016806723e-05, "loss": 0.5841, "step": 60 }, { "epoch": 0.07697160883280757, "grad_norm": 0.3700525791955191, "learning_rate": 1.2815126050420168e-05, "loss": 0.6128, "step": 61 }, { "epoch": 0.07823343848580441, "grad_norm": 0.350598337063842, "learning_rate": 1.3025210084033614e-05, "loss": 0.5907, "step": 62 }, { "epoch": 0.07949526813880126, "grad_norm": 0.3154276910064243, "learning_rate": 1.323529411764706e-05, "loss": 0.6029, "step": 63 }, { "epoch": 0.08075709779179811, "grad_norm": 0.28486887556204943, "learning_rate": 1.3445378151260504e-05, "loss": 0.5852, "step": 64 }, { "epoch": 0.08201892744479496, "grad_norm": 0.3082095887071099, "learning_rate": 1.3655462184873949e-05, "loss": 0.5883, "step": 65 }, { "epoch": 0.0832807570977918, "grad_norm": 0.33317247540835776, "learning_rate": 1.3865546218487396e-05, "loss": 0.6098, "step": 66 }, { "epoch": 0.08454258675078864, "grad_norm": 0.2828354497206161, "learning_rate": 1.407563025210084e-05, "loss": 0.5652, "step": 67 }, { "epoch": 0.08580441640378549, "grad_norm": 0.27577067721974824, "learning_rate": 1.4285714285714285e-05, "loss": 0.5669, "step": 68 }, { "epoch": 0.08706624605678233, "grad_norm": 0.2750868510399575, "learning_rate": 1.4495798319327734e-05, "loss": 0.5604, "step": 69 }, { "epoch": 0.08832807570977919, "grad_norm": 0.3068167000355196, "learning_rate": 1.4705882352941177e-05, "loss": 0.5629, "step": 70 }, { "epoch": 0.08958990536277603, "grad_norm": 0.27160908387317056, "learning_rate": 1.4915966386554622e-05, "loss": 0.5706, "step": 71 }, { "epoch": 0.09085173501577287, "grad_norm": 0.27885579356215573, "learning_rate": 1.5126050420168067e-05, "loss": 0.5359, "step": 72 }, { "epoch": 0.09211356466876972, "grad_norm": 0.29527489163947795, "learning_rate": 1.5336134453781513e-05, "loss": 0.5736, "step": 73 }, { "epoch": 0.09337539432176656, "grad_norm": 0.26642005844223404, "learning_rate": 1.5546218487394958e-05, "loss": 0.5652, "step": 74 }, { "epoch": 0.0946372239747634, "grad_norm": 0.2740931025475317, "learning_rate": 1.5756302521008403e-05, "loss": 0.5648, "step": 75 }, { "epoch": 0.09589905362776026, "grad_norm": 0.26032241669273953, "learning_rate": 1.5966386554621848e-05, "loss": 0.572, "step": 76 }, { "epoch": 0.0971608832807571, "grad_norm": 0.24482870527307818, "learning_rate": 1.6176470588235296e-05, "loss": 0.5522, "step": 77 }, { "epoch": 0.09842271293375394, "grad_norm": 0.30352218019672267, "learning_rate": 1.638655462184874e-05, "loss": 0.5538, "step": 78 }, { "epoch": 0.09968454258675079, "grad_norm": 0.2827648549321419, "learning_rate": 1.6596638655462186e-05, "loss": 0.5458, "step": 79 }, { "epoch": 0.10094637223974763, "grad_norm": 0.2792942318638443, "learning_rate": 1.6806722689075634e-05, "loss": 0.5511, "step": 80 }, { "epoch": 0.10220820189274447, "grad_norm": 0.31476573157168214, "learning_rate": 1.7016806722689076e-05, "loss": 0.5656, "step": 81 }, { "epoch": 0.10347003154574133, "grad_norm": 0.29647870051771646, "learning_rate": 1.722689075630252e-05, "loss": 0.5403, "step": 82 }, { "epoch": 0.10473186119873817, "grad_norm": 0.2760137941037949, "learning_rate": 1.7436974789915965e-05, "loss": 0.5601, "step": 83 }, { "epoch": 0.10599369085173502, "grad_norm": 0.2552583210541467, "learning_rate": 1.7647058823529414e-05, "loss": 0.5502, "step": 84 }, { "epoch": 0.10725552050473186, "grad_norm": 0.30513187547702436, "learning_rate": 1.785714285714286e-05, "loss": 0.5438, "step": 85 }, { "epoch": 0.1085173501577287, "grad_norm": 0.30702287506582054, "learning_rate": 1.8067226890756303e-05, "loss": 0.5282, "step": 86 }, { "epoch": 0.10977917981072555, "grad_norm": 0.27808597264441465, "learning_rate": 1.8277310924369748e-05, "loss": 0.573, "step": 87 }, { "epoch": 0.1110410094637224, "grad_norm": 0.27815961515261595, "learning_rate": 1.8487394957983196e-05, "loss": 0.5571, "step": 88 }, { "epoch": 0.11230283911671925, "grad_norm": 0.2582541838836259, "learning_rate": 1.869747899159664e-05, "loss": 0.5349, "step": 89 }, { "epoch": 0.11356466876971609, "grad_norm": 0.26965011172742337, "learning_rate": 1.8907563025210083e-05, "loss": 0.5268, "step": 90 }, { "epoch": 0.11482649842271293, "grad_norm": 0.3048475899439761, "learning_rate": 1.9117647058823528e-05, "loss": 0.5358, "step": 91 }, { "epoch": 0.11608832807570978, "grad_norm": 0.31430489250596766, "learning_rate": 1.9327731092436976e-05, "loss": 0.543, "step": 92 }, { "epoch": 0.11735015772870662, "grad_norm": 0.2932599370130044, "learning_rate": 1.953781512605042e-05, "loss": 0.5081, "step": 93 }, { "epoch": 0.11861198738170348, "grad_norm": 0.4235622951173263, "learning_rate": 1.9747899159663866e-05, "loss": 0.5626, "step": 94 }, { "epoch": 0.11987381703470032, "grad_norm": 0.32385536249308045, "learning_rate": 1.9957983193277314e-05, "loss": 0.5379, "step": 95 }, { "epoch": 0.12113564668769716, "grad_norm": 0.321387290204399, "learning_rate": 2.016806722689076e-05, "loss": 0.5399, "step": 96 }, { "epoch": 0.122397476340694, "grad_norm": 0.3350234990290176, "learning_rate": 2.0378151260504204e-05, "loss": 0.5256, "step": 97 }, { "epoch": 0.12365930599369085, "grad_norm": 0.30409882859064313, "learning_rate": 2.058823529411765e-05, "loss": 0.5272, "step": 98 }, { "epoch": 0.12492113564668769, "grad_norm": 0.4130587610093255, "learning_rate": 2.0798319327731094e-05, "loss": 0.571, "step": 99 }, { "epoch": 0.12618296529968454, "grad_norm": 0.31124119498387315, "learning_rate": 2.100840336134454e-05, "loss": 0.5259, "step": 100 }, { "epoch": 0.12744479495268138, "grad_norm": 0.3556439151298821, "learning_rate": 2.1218487394957983e-05, "loss": 0.5237, "step": 101 }, { "epoch": 0.12870662460567822, "grad_norm": 0.4110458699539421, "learning_rate": 2.1428571428571428e-05, "loss": 0.5497, "step": 102 }, { "epoch": 0.1299684542586751, "grad_norm": 0.35224214680585336, "learning_rate": 2.1638655462184876e-05, "loss": 0.5353, "step": 103 }, { "epoch": 0.13123028391167194, "grad_norm": 0.3114794618305847, "learning_rate": 2.184873949579832e-05, "loss": 0.5128, "step": 104 }, { "epoch": 0.13249211356466878, "grad_norm": 0.33536452390817567, "learning_rate": 2.2058823529411766e-05, "loss": 0.5397, "step": 105 }, { "epoch": 0.13375394321766562, "grad_norm": 0.3665079062303759, "learning_rate": 2.226890756302521e-05, "loss": 0.5605, "step": 106 }, { "epoch": 0.13501577287066246, "grad_norm": 0.33212757248277675, "learning_rate": 2.2478991596638656e-05, "loss": 0.522, "step": 107 }, { "epoch": 0.1362776025236593, "grad_norm": 0.32402310473754675, "learning_rate": 2.26890756302521e-05, "loss": 0.5545, "step": 108 }, { "epoch": 0.13753943217665615, "grad_norm": 0.3629756199724407, "learning_rate": 2.2899159663865546e-05, "loss": 0.5432, "step": 109 }, { "epoch": 0.138801261829653, "grad_norm": 0.30548960203332803, "learning_rate": 2.3109243697478994e-05, "loss": 0.5199, "step": 110 }, { "epoch": 0.14006309148264984, "grad_norm": 0.3205345662615093, "learning_rate": 2.331932773109244e-05, "loss": 0.5398, "step": 111 }, { "epoch": 0.14132492113564668, "grad_norm": 0.29167519349249416, "learning_rate": 2.3529411764705884e-05, "loss": 0.5209, "step": 112 }, { "epoch": 0.14258675078864352, "grad_norm": 0.3305808245937376, "learning_rate": 2.373949579831933e-05, "loss": 0.5059, "step": 113 }, { "epoch": 0.14384858044164037, "grad_norm": 0.34568182836462524, "learning_rate": 2.3949579831932777e-05, "loss": 0.5417, "step": 114 }, { "epoch": 0.14511041009463724, "grad_norm": 0.3746280511839108, "learning_rate": 2.415966386554622e-05, "loss": 0.5159, "step": 115 }, { "epoch": 0.14637223974763408, "grad_norm": 0.3313811877234831, "learning_rate": 2.4369747899159663e-05, "loss": 0.5146, "step": 116 }, { "epoch": 0.14763406940063092, "grad_norm": 0.3460112349433482, "learning_rate": 2.4579831932773108e-05, "loss": 0.5533, "step": 117 }, { "epoch": 0.14889589905362777, "grad_norm": 0.3238937668410351, "learning_rate": 2.4789915966386556e-05, "loss": 0.509, "step": 118 }, { "epoch": 0.1501577287066246, "grad_norm": 0.3064736125492876, "learning_rate": 2.5e-05, "loss": 0.5035, "step": 119 }, { "epoch": 0.15141955835962145, "grad_norm": 0.3382684299939999, "learning_rate": 2.5210084033613446e-05, "loss": 0.5272, "step": 120 }, { "epoch": 0.1526813880126183, "grad_norm": 0.39442018063268197, "learning_rate": 2.542016806722689e-05, "loss": 0.523, "step": 121 }, { "epoch": 0.15394321766561514, "grad_norm": 0.34398848185148684, "learning_rate": 2.5630252100840336e-05, "loss": 0.4922, "step": 122 }, { "epoch": 0.15520504731861198, "grad_norm": 0.3090674125556556, "learning_rate": 2.5840336134453784e-05, "loss": 0.4813, "step": 123 }, { "epoch": 0.15646687697160883, "grad_norm": 0.33155222131369666, "learning_rate": 2.605042016806723e-05, "loss": 0.5077, "step": 124 }, { "epoch": 0.15772870662460567, "grad_norm": 0.3505350324717118, "learning_rate": 2.6260504201680674e-05, "loss": 0.5133, "step": 125 }, { "epoch": 0.1589905362776025, "grad_norm": 0.3346032709647677, "learning_rate": 2.647058823529412e-05, "loss": 0.5046, "step": 126 }, { "epoch": 0.16025236593059936, "grad_norm": 0.37920838560612524, "learning_rate": 2.6680672268907564e-05, "loss": 0.5216, "step": 127 }, { "epoch": 0.16151419558359623, "grad_norm": 0.41961627089305814, "learning_rate": 2.689075630252101e-05, "loss": 0.5139, "step": 128 }, { "epoch": 0.16277602523659307, "grad_norm": 0.38010515750016904, "learning_rate": 2.7100840336134453e-05, "loss": 0.5312, "step": 129 }, { "epoch": 0.1640378548895899, "grad_norm": 0.34167562003506424, "learning_rate": 2.7310924369747898e-05, "loss": 0.4935, "step": 130 }, { "epoch": 0.16529968454258676, "grad_norm": 0.4361998520569672, "learning_rate": 2.7521008403361346e-05, "loss": 0.4925, "step": 131 }, { "epoch": 0.1665615141955836, "grad_norm": 0.4089429739421969, "learning_rate": 2.773109243697479e-05, "loss": 0.5236, "step": 132 }, { "epoch": 0.16782334384858044, "grad_norm": 0.37509413800250313, "learning_rate": 2.7941176470588236e-05, "loss": 0.5083, "step": 133 }, { "epoch": 0.16908517350157728, "grad_norm": 0.37481023501307886, "learning_rate": 2.815126050420168e-05, "loss": 0.5045, "step": 134 }, { "epoch": 0.17034700315457413, "grad_norm": 0.37821805249087437, "learning_rate": 2.8361344537815126e-05, "loss": 0.519, "step": 135 }, { "epoch": 0.17160883280757097, "grad_norm": 0.3063822144046949, "learning_rate": 2.857142857142857e-05, "loss": 0.5246, "step": 136 }, { "epoch": 0.17287066246056781, "grad_norm": 0.42515442030154577, "learning_rate": 2.8781512605042016e-05, "loss": 0.5235, "step": 137 }, { "epoch": 0.17413249211356466, "grad_norm": 0.37732799371330916, "learning_rate": 2.8991596638655467e-05, "loss": 0.5267, "step": 138 }, { "epoch": 0.1753943217665615, "grad_norm": 0.4393605950725276, "learning_rate": 2.9201680672268912e-05, "loss": 0.4895, "step": 139 }, { "epoch": 0.17665615141955837, "grad_norm": 0.36071858994282435, "learning_rate": 2.9411764705882354e-05, "loss": 0.4791, "step": 140 }, { "epoch": 0.17791798107255521, "grad_norm": 0.4162086676174523, "learning_rate": 2.96218487394958e-05, "loss": 0.5338, "step": 141 }, { "epoch": 0.17917981072555206, "grad_norm": 0.3967297919062861, "learning_rate": 2.9831932773109244e-05, "loss": 0.519, "step": 142 }, { "epoch": 0.1804416403785489, "grad_norm": 0.4016640715397709, "learning_rate": 3.004201680672269e-05, "loss": 0.4874, "step": 143 }, { "epoch": 0.18170347003154574, "grad_norm": 0.43650460260842355, "learning_rate": 3.0252100840336133e-05, "loss": 0.5053, "step": 144 }, { "epoch": 0.1829652996845426, "grad_norm": 0.4638050848126171, "learning_rate": 3.0462184873949578e-05, "loss": 0.5299, "step": 145 }, { "epoch": 0.18422712933753943, "grad_norm": 0.38020452980860897, "learning_rate": 3.0672268907563026e-05, "loss": 0.499, "step": 146 }, { "epoch": 0.18548895899053627, "grad_norm": 0.42292387501234724, "learning_rate": 3.0882352941176475e-05, "loss": 0.5093, "step": 147 }, { "epoch": 0.18675078864353312, "grad_norm": 0.38639300918770614, "learning_rate": 3.1092436974789916e-05, "loss": 0.4946, "step": 148 }, { "epoch": 0.18801261829652996, "grad_norm": 0.3565487159724229, "learning_rate": 3.1302521008403364e-05, "loss": 0.4795, "step": 149 }, { "epoch": 0.1892744479495268, "grad_norm": 0.33690762760440995, "learning_rate": 3.1512605042016806e-05, "loss": 0.511, "step": 150 }, { "epoch": 0.19053627760252365, "grad_norm": 0.4331447755425415, "learning_rate": 3.1722689075630254e-05, "loss": 0.5039, "step": 151 }, { "epoch": 0.19179810725552052, "grad_norm": 0.3909890057902332, "learning_rate": 3.1932773109243696e-05, "loss": 0.485, "step": 152 }, { "epoch": 0.19305993690851736, "grad_norm": 0.406328385874364, "learning_rate": 3.2142857142857144e-05, "loss": 0.5057, "step": 153 }, { "epoch": 0.1943217665615142, "grad_norm": 0.46174906906639734, "learning_rate": 3.235294117647059e-05, "loss": 0.4875, "step": 154 }, { "epoch": 0.19558359621451105, "grad_norm": 0.6382752348088091, "learning_rate": 3.2563025210084034e-05, "loss": 0.5283, "step": 155 }, { "epoch": 0.1968454258675079, "grad_norm": 0.5036273756166013, "learning_rate": 3.277310924369748e-05, "loss": 0.5022, "step": 156 }, { "epoch": 0.19810725552050473, "grad_norm": 0.5576175194498294, "learning_rate": 3.2983193277310923e-05, "loss": 0.4971, "step": 157 }, { "epoch": 0.19936908517350158, "grad_norm": 0.7976899392876795, "learning_rate": 3.319327731092437e-05, "loss": 0.5, "step": 158 }, { "epoch": 0.20063091482649842, "grad_norm": 0.5736339634435993, "learning_rate": 3.340336134453781e-05, "loss": 0.4884, "step": 159 }, { "epoch": 0.20189274447949526, "grad_norm": 0.741294528553764, "learning_rate": 3.361344537815127e-05, "loss": 0.4993, "step": 160 }, { "epoch": 0.2031545741324921, "grad_norm": 0.8817741467018153, "learning_rate": 3.382352941176471e-05, "loss": 0.4928, "step": 161 }, { "epoch": 0.20441640378548895, "grad_norm": 0.6033537615914164, "learning_rate": 3.403361344537815e-05, "loss": 0.5009, "step": 162 }, { "epoch": 0.2056782334384858, "grad_norm": 0.9866256541082249, "learning_rate": 3.42436974789916e-05, "loss": 0.5096, "step": 163 }, { "epoch": 0.20694006309148266, "grad_norm": 0.7520371729956136, "learning_rate": 3.445378151260504e-05, "loss": 0.4822, "step": 164 }, { "epoch": 0.2082018927444795, "grad_norm": 0.6558231966073458, "learning_rate": 3.466386554621849e-05, "loss": 0.5052, "step": 165 }, { "epoch": 0.20946372239747635, "grad_norm": 0.8840329620898723, "learning_rate": 3.487394957983193e-05, "loss": 0.5113, "step": 166 }, { "epoch": 0.2107255520504732, "grad_norm": 0.48746870509872403, "learning_rate": 3.508403361344538e-05, "loss": 0.5146, "step": 167 }, { "epoch": 0.21198738170347003, "grad_norm": 0.7503058627086597, "learning_rate": 3.529411764705883e-05, "loss": 0.5011, "step": 168 }, { "epoch": 0.21324921135646688, "grad_norm": 0.4707975709064651, "learning_rate": 3.5504201680672275e-05, "loss": 0.5054, "step": 169 }, { "epoch": 0.21451104100946372, "grad_norm": 0.8440662677308276, "learning_rate": 3.571428571428572e-05, "loss": 0.4846, "step": 170 }, { "epoch": 0.21577287066246056, "grad_norm": 0.5801491626780395, "learning_rate": 3.592436974789916e-05, "loss": 0.5109, "step": 171 }, { "epoch": 0.2170347003154574, "grad_norm": 0.6978161734678204, "learning_rate": 3.613445378151261e-05, "loss": 0.5059, "step": 172 }, { "epoch": 0.21829652996845425, "grad_norm": 0.6359376428213778, "learning_rate": 3.634453781512605e-05, "loss": 0.494, "step": 173 }, { "epoch": 0.2195583596214511, "grad_norm": 0.47773471078816565, "learning_rate": 3.6554621848739496e-05, "loss": 0.484, "step": 174 }, { "epoch": 0.22082018927444794, "grad_norm": 0.5246654458269691, "learning_rate": 3.6764705882352945e-05, "loss": 0.4778, "step": 175 }, { "epoch": 0.2220820189274448, "grad_norm": 0.47746892095635224, "learning_rate": 3.697478991596639e-05, "loss": 0.4992, "step": 176 }, { "epoch": 0.22334384858044165, "grad_norm": 0.5006331692404467, "learning_rate": 3.7184873949579834e-05, "loss": 0.5025, "step": 177 }, { "epoch": 0.2246056782334385, "grad_norm": 0.445078627048204, "learning_rate": 3.739495798319328e-05, "loss": 0.4859, "step": 178 }, { "epoch": 0.22586750788643534, "grad_norm": 0.516671259903624, "learning_rate": 3.7605042016806724e-05, "loss": 0.4717, "step": 179 }, { "epoch": 0.22712933753943218, "grad_norm": 0.43286468333284506, "learning_rate": 3.7815126050420166e-05, "loss": 0.4716, "step": 180 }, { "epoch": 0.22839116719242902, "grad_norm": 0.5337813204396196, "learning_rate": 3.8025210084033614e-05, "loss": 0.4888, "step": 181 }, { "epoch": 0.22965299684542587, "grad_norm": 0.5216760789734665, "learning_rate": 3.8235294117647055e-05, "loss": 0.5165, "step": 182 }, { "epoch": 0.2309148264984227, "grad_norm": 0.4584541849308208, "learning_rate": 3.844537815126051e-05, "loss": 0.476, "step": 183 }, { "epoch": 0.23217665615141955, "grad_norm": 0.5749705581657071, "learning_rate": 3.865546218487395e-05, "loss": 0.4939, "step": 184 }, { "epoch": 0.2334384858044164, "grad_norm": 0.5849286901779913, "learning_rate": 3.88655462184874e-05, "loss": 0.4871, "step": 185 }, { "epoch": 0.23470031545741324, "grad_norm": 0.4194503411512829, "learning_rate": 3.907563025210084e-05, "loss": 0.4805, "step": 186 }, { "epoch": 0.23596214511041008, "grad_norm": 0.3756922446037219, "learning_rate": 3.928571428571429e-05, "loss": 0.4891, "step": 187 }, { "epoch": 0.23722397476340695, "grad_norm": 0.43662274526519534, "learning_rate": 3.949579831932773e-05, "loss": 0.5003, "step": 188 }, { "epoch": 0.2384858044164038, "grad_norm": 0.361772780829675, "learning_rate": 3.970588235294117e-05, "loss": 0.4749, "step": 189 }, { "epoch": 0.23974763406940064, "grad_norm": 0.4283658703754586, "learning_rate": 3.991596638655463e-05, "loss": 0.5008, "step": 190 }, { "epoch": 0.24100946372239748, "grad_norm": 0.4168869736652105, "learning_rate": 4.012605042016807e-05, "loss": 0.4811, "step": 191 }, { "epoch": 0.24227129337539433, "grad_norm": 0.331179422547324, "learning_rate": 4.033613445378152e-05, "loss": 0.4604, "step": 192 }, { "epoch": 0.24353312302839117, "grad_norm": 0.4588793898151273, "learning_rate": 4.054621848739496e-05, "loss": 0.4758, "step": 193 }, { "epoch": 0.244794952681388, "grad_norm": 0.32614175493399766, "learning_rate": 4.075630252100841e-05, "loss": 0.4726, "step": 194 }, { "epoch": 0.24605678233438485, "grad_norm": 0.5021642332066499, "learning_rate": 4.096638655462185e-05, "loss": 0.4895, "step": 195 }, { "epoch": 0.2473186119873817, "grad_norm": 0.37568805164019003, "learning_rate": 4.11764705882353e-05, "loss": 0.5278, "step": 196 }, { "epoch": 0.24858044164037854, "grad_norm": 0.4311924879020063, "learning_rate": 4.138655462184874e-05, "loss": 0.4954, "step": 197 }, { "epoch": 0.24984227129337538, "grad_norm": 0.3731712051705941, "learning_rate": 4.159663865546219e-05, "loss": 0.4794, "step": 198 }, { "epoch": 0.25110410094637226, "grad_norm": 0.3723269299136955, "learning_rate": 4.1806722689075635e-05, "loss": 0.4924, "step": 199 }, { "epoch": 0.25236593059936907, "grad_norm": 0.43139864740727285, "learning_rate": 4.201680672268908e-05, "loss": 0.465, "step": 200 }, { "epoch": 0.25362776025236594, "grad_norm": 0.3859753020150958, "learning_rate": 4.2226890756302525e-05, "loss": 0.4656, "step": 201 }, { "epoch": 0.25488958990536276, "grad_norm": 0.4167528542764003, "learning_rate": 4.2436974789915967e-05, "loss": 0.4925, "step": 202 }, { "epoch": 0.2561514195583596, "grad_norm": 0.44697330532984636, "learning_rate": 4.2647058823529415e-05, "loss": 0.489, "step": 203 }, { "epoch": 0.25741324921135644, "grad_norm": 0.39941836927283736, "learning_rate": 4.2857142857142856e-05, "loss": 0.4696, "step": 204 }, { "epoch": 0.2586750788643533, "grad_norm": 0.45957678872691504, "learning_rate": 4.3067226890756305e-05, "loss": 0.4639, "step": 205 }, { "epoch": 0.2599369085173502, "grad_norm": 0.4620133977062028, "learning_rate": 4.327731092436975e-05, "loss": 0.4941, "step": 206 }, { "epoch": 0.261198738170347, "grad_norm": 0.4500295688931085, "learning_rate": 4.3487394957983194e-05, "loss": 0.4812, "step": 207 }, { "epoch": 0.26246056782334387, "grad_norm": 0.47700706716337143, "learning_rate": 4.369747899159664e-05, "loss": 0.4818, "step": 208 }, { "epoch": 0.2637223974763407, "grad_norm": 0.3853915175395267, "learning_rate": 4.3907563025210084e-05, "loss": 0.4851, "step": 209 }, { "epoch": 0.26498422712933756, "grad_norm": 0.4919558284221479, "learning_rate": 4.411764705882353e-05, "loss": 0.4728, "step": 210 }, { "epoch": 0.2662460567823344, "grad_norm": 0.44120927304385155, "learning_rate": 4.4327731092436974e-05, "loss": 0.4726, "step": 211 }, { "epoch": 0.26750788643533124, "grad_norm": 0.37786907297420214, "learning_rate": 4.453781512605042e-05, "loss": 0.4919, "step": 212 }, { "epoch": 0.26876971608832806, "grad_norm": 0.4209980057444624, "learning_rate": 4.474789915966387e-05, "loss": 0.4554, "step": 213 }, { "epoch": 0.27003154574132493, "grad_norm": 0.38273325420571014, "learning_rate": 4.495798319327731e-05, "loss": 0.5081, "step": 214 }, { "epoch": 0.27129337539432175, "grad_norm": 0.4133022369325695, "learning_rate": 4.516806722689076e-05, "loss": 0.4775, "step": 215 }, { "epoch": 0.2725552050473186, "grad_norm": 0.3863781697046898, "learning_rate": 4.53781512605042e-05, "loss": 0.4858, "step": 216 }, { "epoch": 0.27381703470031543, "grad_norm": 0.4687866218380144, "learning_rate": 4.558823529411765e-05, "loss": 0.4709, "step": 217 }, { "epoch": 0.2750788643533123, "grad_norm": 0.4154176278393551, "learning_rate": 4.579831932773109e-05, "loss": 0.4799, "step": 218 }, { "epoch": 0.2763406940063092, "grad_norm": 0.43218973264245886, "learning_rate": 4.600840336134454e-05, "loss": 0.4821, "step": 219 }, { "epoch": 0.277602523659306, "grad_norm": 0.42214622610550595, "learning_rate": 4.621848739495799e-05, "loss": 0.4797, "step": 220 }, { "epoch": 0.27886435331230286, "grad_norm": 0.47649311337215333, "learning_rate": 4.642857142857143e-05, "loss": 0.4854, "step": 221 }, { "epoch": 0.2801261829652997, "grad_norm": 0.44451462570593153, "learning_rate": 4.663865546218488e-05, "loss": 0.4956, "step": 222 }, { "epoch": 0.28138801261829655, "grad_norm": 0.5708991312510483, "learning_rate": 4.684873949579832e-05, "loss": 0.4983, "step": 223 }, { "epoch": 0.28264984227129336, "grad_norm": 0.45088638257604985, "learning_rate": 4.705882352941177e-05, "loss": 0.4708, "step": 224 }, { "epoch": 0.28391167192429023, "grad_norm": 0.6251428168309907, "learning_rate": 4.726890756302521e-05, "loss": 0.4666, "step": 225 }, { "epoch": 0.28517350157728705, "grad_norm": 0.5434464861395523, "learning_rate": 4.747899159663866e-05, "loss": 0.456, "step": 226 }, { "epoch": 0.2864353312302839, "grad_norm": 0.5276584179187869, "learning_rate": 4.7689075630252105e-05, "loss": 0.4581, "step": 227 }, { "epoch": 0.28769716088328073, "grad_norm": 0.5906060316204074, "learning_rate": 4.7899159663865554e-05, "loss": 0.4554, "step": 228 }, { "epoch": 0.2889589905362776, "grad_norm": 0.47818604466336245, "learning_rate": 4.8109243697478995e-05, "loss": 0.4695, "step": 229 }, { "epoch": 0.2902208201892745, "grad_norm": 0.6068349290073912, "learning_rate": 4.831932773109244e-05, "loss": 0.5065, "step": 230 }, { "epoch": 0.2914826498422713, "grad_norm": 0.39163075924286167, "learning_rate": 4.8529411764705885e-05, "loss": 0.4579, "step": 231 }, { "epoch": 0.29274447949526816, "grad_norm": 0.5976454925024771, "learning_rate": 4.8739495798319326e-05, "loss": 0.4829, "step": 232 }, { "epoch": 0.294006309148265, "grad_norm": 0.3890997782385902, "learning_rate": 4.8949579831932775e-05, "loss": 0.4824, "step": 233 }, { "epoch": 0.29526813880126185, "grad_norm": 0.5669981890711457, "learning_rate": 4.9159663865546216e-05, "loss": 0.5165, "step": 234 }, { "epoch": 0.29652996845425866, "grad_norm": 0.45771262751762865, "learning_rate": 4.936974789915967e-05, "loss": 0.458, "step": 235 }, { "epoch": 0.29779179810725553, "grad_norm": 0.5027927889573373, "learning_rate": 4.957983193277311e-05, "loss": 0.4879, "step": 236 }, { "epoch": 0.29905362776025235, "grad_norm": 0.4891184360327861, "learning_rate": 4.978991596638656e-05, "loss": 0.477, "step": 237 }, { "epoch": 0.3003154574132492, "grad_norm": 0.4915270913214873, "learning_rate": 5e-05, "loss": 0.5111, "step": 238 }, { "epoch": 0.30157728706624604, "grad_norm": 0.4161381606441241, "learning_rate": 4.9976613657623947e-05, "loss": 0.4855, "step": 239 }, { "epoch": 0.3028391167192429, "grad_norm": 0.49716446388210295, "learning_rate": 4.99532273152479e-05, "loss": 0.4737, "step": 240 }, { "epoch": 0.3041009463722397, "grad_norm": 0.40124562798304336, "learning_rate": 4.992984097287184e-05, "loss": 0.4602, "step": 241 }, { "epoch": 0.3053627760252366, "grad_norm": 0.3932101282949015, "learning_rate": 4.990645463049579e-05, "loss": 0.4788, "step": 242 }, { "epoch": 0.30662460567823346, "grad_norm": 0.5061427182780114, "learning_rate": 4.988306828811974e-05, "loss": 0.4908, "step": 243 }, { "epoch": 0.3078864353312303, "grad_norm": 0.5110487456271154, "learning_rate": 4.985968194574369e-05, "loss": 0.4572, "step": 244 }, { "epoch": 0.30914826498422715, "grad_norm": 0.5391248176864377, "learning_rate": 4.983629560336763e-05, "loss": 0.4784, "step": 245 }, { "epoch": 0.31041009463722397, "grad_norm": 0.46831280464469904, "learning_rate": 4.981290926099158e-05, "loss": 0.4615, "step": 246 }, { "epoch": 0.31167192429022084, "grad_norm": 0.5714414616925262, "learning_rate": 4.9789522918615534e-05, "loss": 0.4656, "step": 247 }, { "epoch": 0.31293375394321765, "grad_norm": 0.5270925926817066, "learning_rate": 4.976613657623948e-05, "loss": 0.4951, "step": 248 }, { "epoch": 0.3141955835962145, "grad_norm": 0.5066940017018592, "learning_rate": 4.974275023386343e-05, "loss": 0.4906, "step": 249 }, { "epoch": 0.31545741324921134, "grad_norm": 0.5667162735313502, "learning_rate": 4.971936389148737e-05, "loss": 0.4792, "step": 250 }, { "epoch": 0.3167192429022082, "grad_norm": 0.39529258877739953, "learning_rate": 4.9695977549111324e-05, "loss": 0.4784, "step": 251 }, { "epoch": 0.317981072555205, "grad_norm": 0.586308547486958, "learning_rate": 4.967259120673527e-05, "loss": 0.4689, "step": 252 }, { "epoch": 0.3192429022082019, "grad_norm": 0.49464069232422525, "learning_rate": 4.964920486435922e-05, "loss": 0.4643, "step": 253 }, { "epoch": 0.3205047318611987, "grad_norm": 0.4204109141797044, "learning_rate": 4.9625818521983164e-05, "loss": 0.4688, "step": 254 }, { "epoch": 0.3217665615141956, "grad_norm": 0.6000557450317632, "learning_rate": 4.960243217960711e-05, "loss": 0.4517, "step": 255 }, { "epoch": 0.32302839116719245, "grad_norm": 0.5136322209535605, "learning_rate": 4.957904583723106e-05, "loss": 0.4882, "step": 256 }, { "epoch": 0.32429022082018927, "grad_norm": 0.4679835129254575, "learning_rate": 4.9555659494855e-05, "loss": 0.4765, "step": 257 }, { "epoch": 0.32555205047318614, "grad_norm": 0.5411225651734517, "learning_rate": 4.9532273152478954e-05, "loss": 0.4679, "step": 258 }, { "epoch": 0.32681388012618295, "grad_norm": 0.5233183391623946, "learning_rate": 4.95088868101029e-05, "loss": 0.4564, "step": 259 }, { "epoch": 0.3280757097791798, "grad_norm": 0.4539929449133687, "learning_rate": 4.948550046772685e-05, "loss": 0.4496, "step": 260 }, { "epoch": 0.32933753943217664, "grad_norm": 0.5207912095134354, "learning_rate": 4.94621141253508e-05, "loss": 0.4664, "step": 261 }, { "epoch": 0.3305993690851735, "grad_norm": 0.4408098065628989, "learning_rate": 4.9438727782974744e-05, "loss": 0.4654, "step": 262 }, { "epoch": 0.3318611987381703, "grad_norm": 0.38455668521686615, "learning_rate": 4.9415341440598695e-05, "loss": 0.4776, "step": 263 }, { "epoch": 0.3331230283911672, "grad_norm": 0.49416175674816626, "learning_rate": 4.939195509822264e-05, "loss": 0.465, "step": 264 }, { "epoch": 0.334384858044164, "grad_norm": 0.5825882954769809, "learning_rate": 4.936856875584659e-05, "loss": 0.469, "step": 265 }, { "epoch": 0.3356466876971609, "grad_norm": 0.522056970251061, "learning_rate": 4.9345182413470535e-05, "loss": 0.48, "step": 266 }, { "epoch": 0.33690851735015775, "grad_norm": 0.47963891008867976, "learning_rate": 4.9321796071094486e-05, "loss": 0.457, "step": 267 }, { "epoch": 0.33817034700315457, "grad_norm": 0.6426384190930887, "learning_rate": 4.929840972871843e-05, "loss": 0.4758, "step": 268 }, { "epoch": 0.33943217665615144, "grad_norm": 0.39529813228684374, "learning_rate": 4.927502338634238e-05, "loss": 0.481, "step": 269 }, { "epoch": 0.34069400630914826, "grad_norm": 0.659474729121837, "learning_rate": 4.9251637043966325e-05, "loss": 0.4746, "step": 270 }, { "epoch": 0.3419558359621451, "grad_norm": 0.5939220892201033, "learning_rate": 4.9228250701590276e-05, "loss": 0.46, "step": 271 }, { "epoch": 0.34321766561514194, "grad_norm": 0.4090077101650982, "learning_rate": 4.920486435921422e-05, "loss": 0.4629, "step": 272 }, { "epoch": 0.3444794952681388, "grad_norm": 0.5776082511185947, "learning_rate": 4.9181478016838164e-05, "loss": 0.4407, "step": 273 }, { "epoch": 0.34574132492113563, "grad_norm": 0.400166152290198, "learning_rate": 4.9158091674462115e-05, "loss": 0.4585, "step": 274 }, { "epoch": 0.3470031545741325, "grad_norm": 0.4874860312799066, "learning_rate": 4.913470533208606e-05, "loss": 0.4453, "step": 275 }, { "epoch": 0.3482649842271293, "grad_norm": 0.5720537796990165, "learning_rate": 4.911131898971001e-05, "loss": 0.4752, "step": 276 }, { "epoch": 0.3495268138801262, "grad_norm": 0.4609019305426553, "learning_rate": 4.908793264733396e-05, "loss": 0.4721, "step": 277 }, { "epoch": 0.350788643533123, "grad_norm": 0.41004049363845413, "learning_rate": 4.9064546304957906e-05, "loss": 0.4646, "step": 278 }, { "epoch": 0.35205047318611987, "grad_norm": 0.6439606159949646, "learning_rate": 4.904115996258186e-05, "loss": 0.4876, "step": 279 }, { "epoch": 0.35331230283911674, "grad_norm": 0.5040115380070034, "learning_rate": 4.90177736202058e-05, "loss": 0.4751, "step": 280 }, { "epoch": 0.35457413249211356, "grad_norm": 0.4625699302509201, "learning_rate": 4.899438727782975e-05, "loss": 0.4835, "step": 281 }, { "epoch": 0.35583596214511043, "grad_norm": 0.48491296744798923, "learning_rate": 4.8971000935453696e-05, "loss": 0.4653, "step": 282 }, { "epoch": 0.35709779179810724, "grad_norm": 0.4778421131621386, "learning_rate": 4.894761459307765e-05, "loss": 0.4735, "step": 283 }, { "epoch": 0.3583596214511041, "grad_norm": 0.44084863503610827, "learning_rate": 4.892422825070159e-05, "loss": 0.4879, "step": 284 }, { "epoch": 0.35962145110410093, "grad_norm": 0.4939442329911528, "learning_rate": 4.890084190832554e-05, "loss": 0.4567, "step": 285 }, { "epoch": 0.3608832807570978, "grad_norm": 0.5127207060428786, "learning_rate": 4.8877455565949486e-05, "loss": 0.4579, "step": 286 }, { "epoch": 0.3621451104100946, "grad_norm": 0.44099593712398627, "learning_rate": 4.885406922357344e-05, "loss": 0.4595, "step": 287 }, { "epoch": 0.3634069400630915, "grad_norm": 0.4916846447415859, "learning_rate": 4.883068288119738e-05, "loss": 0.4604, "step": 288 }, { "epoch": 0.3646687697160883, "grad_norm": 0.49379894704505045, "learning_rate": 4.8807296538821326e-05, "loss": 0.4573, "step": 289 }, { "epoch": 0.3659305993690852, "grad_norm": 0.44828074790159683, "learning_rate": 4.878391019644528e-05, "loss": 0.4523, "step": 290 }, { "epoch": 0.36719242902208205, "grad_norm": 0.5132984029243507, "learning_rate": 4.876052385406922e-05, "loss": 0.4686, "step": 291 }, { "epoch": 0.36845425867507886, "grad_norm": 0.4185231776535968, "learning_rate": 4.873713751169318e-05, "loss": 0.4629, "step": 292 }, { "epoch": 0.36971608832807573, "grad_norm": 0.4929913722628461, "learning_rate": 4.871375116931712e-05, "loss": 0.4825, "step": 293 }, { "epoch": 0.37097791798107255, "grad_norm": 0.5167459515464593, "learning_rate": 4.8690364826941074e-05, "loss": 0.47, "step": 294 }, { "epoch": 0.3722397476340694, "grad_norm": 0.3840852318611212, "learning_rate": 4.866697848456502e-05, "loss": 0.4619, "step": 295 }, { "epoch": 0.37350157728706623, "grad_norm": 0.5415334801068984, "learning_rate": 4.864359214218896e-05, "loss": 0.4694, "step": 296 }, { "epoch": 0.3747634069400631, "grad_norm": 0.4142879267135591, "learning_rate": 4.862020579981291e-05, "loss": 0.4946, "step": 297 }, { "epoch": 0.3760252365930599, "grad_norm": 0.493194664608404, "learning_rate": 4.859681945743686e-05, "loss": 0.4666, "step": 298 }, { "epoch": 0.3772870662460568, "grad_norm": 0.38752424763933907, "learning_rate": 4.857343311506081e-05, "loss": 0.4493, "step": 299 }, { "epoch": 0.3785488958990536, "grad_norm": 0.4925261758573114, "learning_rate": 4.855004677268475e-05, "loss": 0.4731, "step": 300 }, { "epoch": 0.3798107255520505, "grad_norm": 0.48396850138179803, "learning_rate": 4.8526660430308703e-05, "loss": 0.4877, "step": 301 }, { "epoch": 0.3810725552050473, "grad_norm": 0.46640453180602337, "learning_rate": 4.850327408793265e-05, "loss": 0.4669, "step": 302 }, { "epoch": 0.38233438485804416, "grad_norm": 0.4728683109226525, "learning_rate": 4.84798877455566e-05, "loss": 0.4753, "step": 303 }, { "epoch": 0.38359621451104103, "grad_norm": 0.6380526911458401, "learning_rate": 4.845650140318054e-05, "loss": 0.4634, "step": 304 }, { "epoch": 0.38485804416403785, "grad_norm": 0.3680422215188687, "learning_rate": 4.843311506080449e-05, "loss": 0.4746, "step": 305 }, { "epoch": 0.3861198738170347, "grad_norm": 0.4515392549060841, "learning_rate": 4.840972871842844e-05, "loss": 0.4466, "step": 306 }, { "epoch": 0.38738170347003154, "grad_norm": 0.3763152473230672, "learning_rate": 4.838634237605239e-05, "loss": 0.4513, "step": 307 }, { "epoch": 0.3886435331230284, "grad_norm": 0.443547972030612, "learning_rate": 4.836295603367634e-05, "loss": 0.4659, "step": 308 }, { "epoch": 0.3899053627760252, "grad_norm": 0.4290302741734105, "learning_rate": 4.8339569691300284e-05, "loss": 0.4749, "step": 309 }, { "epoch": 0.3911671924290221, "grad_norm": 0.4523215089611189, "learning_rate": 4.8316183348924235e-05, "loss": 0.4972, "step": 310 }, { "epoch": 0.3924290220820189, "grad_norm": 0.49091888283784574, "learning_rate": 4.829279700654818e-05, "loss": 0.4578, "step": 311 }, { "epoch": 0.3936908517350158, "grad_norm": 0.42117912681186664, "learning_rate": 4.8269410664172123e-05, "loss": 0.4529, "step": 312 }, { "epoch": 0.3949526813880126, "grad_norm": 0.5471189823065845, "learning_rate": 4.8246024321796074e-05, "loss": 0.465, "step": 313 }, { "epoch": 0.39621451104100947, "grad_norm": 0.5083826323277113, "learning_rate": 4.822263797942002e-05, "loss": 0.4489, "step": 314 }, { "epoch": 0.39747634069400634, "grad_norm": 0.5272598536698047, "learning_rate": 4.819925163704397e-05, "loss": 0.4709, "step": 315 }, { "epoch": 0.39873817034700315, "grad_norm": 0.5575494693188537, "learning_rate": 4.8175865294667914e-05, "loss": 0.4444, "step": 316 }, { "epoch": 0.4, "grad_norm": 0.44671245136551607, "learning_rate": 4.8152478952291865e-05, "loss": 0.4583, "step": 317 }, { "epoch": 0.40126182965299684, "grad_norm": 0.5188627383109754, "learning_rate": 4.812909260991581e-05, "loss": 0.4942, "step": 318 }, { "epoch": 0.4025236593059937, "grad_norm": 0.4537553405855915, "learning_rate": 4.810570626753976e-05, "loss": 0.4554, "step": 319 }, { "epoch": 0.4037854889589905, "grad_norm": 0.49982428209252605, "learning_rate": 4.8082319925163704e-05, "loss": 0.4632, "step": 320 }, { "epoch": 0.4050473186119874, "grad_norm": 0.4601209281476904, "learning_rate": 4.8058933582787655e-05, "loss": 0.4795, "step": 321 }, { "epoch": 0.4063091482649842, "grad_norm": 0.4655445948011024, "learning_rate": 4.8035547240411606e-05, "loss": 0.4565, "step": 322 }, { "epoch": 0.4075709779179811, "grad_norm": 0.41208934563662414, "learning_rate": 4.801216089803555e-05, "loss": 0.4569, "step": 323 }, { "epoch": 0.4088328075709779, "grad_norm": 0.41886925081097964, "learning_rate": 4.79887745556595e-05, "loss": 0.4796, "step": 324 }, { "epoch": 0.41009463722397477, "grad_norm": 0.3742013897407593, "learning_rate": 4.7965388213283445e-05, "loss": 0.4412, "step": 325 }, { "epoch": 0.4113564668769716, "grad_norm": 0.39768681397725686, "learning_rate": 4.7942001870907396e-05, "loss": 0.4613, "step": 326 }, { "epoch": 0.41261829652996845, "grad_norm": 0.4047119795980251, "learning_rate": 4.791861552853134e-05, "loss": 0.4475, "step": 327 }, { "epoch": 0.4138801261829653, "grad_norm": 0.5169358086853238, "learning_rate": 4.789522918615529e-05, "loss": 0.4711, "step": 328 }, { "epoch": 0.41514195583596214, "grad_norm": 0.4198309021201204, "learning_rate": 4.7871842843779236e-05, "loss": 0.4768, "step": 329 }, { "epoch": 0.416403785488959, "grad_norm": 0.4370975840327619, "learning_rate": 4.784845650140318e-05, "loss": 0.4675, "step": 330 }, { "epoch": 0.4176656151419558, "grad_norm": 0.3697582956328349, "learning_rate": 4.782507015902713e-05, "loss": 0.4647, "step": 331 }, { "epoch": 0.4189274447949527, "grad_norm": 0.47073602055773195, "learning_rate": 4.7801683816651075e-05, "loss": 0.4458, "step": 332 }, { "epoch": 0.4201892744479495, "grad_norm": 0.3588110480213258, "learning_rate": 4.7778297474275026e-05, "loss": 0.4291, "step": 333 }, { "epoch": 0.4214511041009464, "grad_norm": 0.5033061935141917, "learning_rate": 4.775491113189897e-05, "loss": 0.4652, "step": 334 }, { "epoch": 0.4227129337539432, "grad_norm": 0.3826123176521751, "learning_rate": 4.773152478952292e-05, "loss": 0.4681, "step": 335 }, { "epoch": 0.42397476340694007, "grad_norm": 0.47585917769117253, "learning_rate": 4.7708138447146865e-05, "loss": 0.4632, "step": 336 }, { "epoch": 0.4252365930599369, "grad_norm": 0.35413754007634785, "learning_rate": 4.7684752104770816e-05, "loss": 0.4477, "step": 337 }, { "epoch": 0.42649842271293376, "grad_norm": 0.45536405074245795, "learning_rate": 4.766136576239477e-05, "loss": 0.4375, "step": 338 }, { "epoch": 0.4277602523659306, "grad_norm": 0.3308619497863041, "learning_rate": 4.763797942001871e-05, "loss": 0.4559, "step": 339 }, { "epoch": 0.42902208201892744, "grad_norm": 0.4577577480859207, "learning_rate": 4.761459307764266e-05, "loss": 0.4673, "step": 340 }, { "epoch": 0.4302839116719243, "grad_norm": 0.39229231440952533, "learning_rate": 4.759120673526661e-05, "loss": 0.4507, "step": 341 }, { "epoch": 0.43154574132492113, "grad_norm": 0.5156562615125537, "learning_rate": 4.756782039289056e-05, "loss": 0.4593, "step": 342 }, { "epoch": 0.432807570977918, "grad_norm": 0.4619979906862443, "learning_rate": 4.75444340505145e-05, "loss": 0.451, "step": 343 }, { "epoch": 0.4340694006309148, "grad_norm": 0.5142194886999226, "learning_rate": 4.752104770813845e-05, "loss": 0.4514, "step": 344 }, { "epoch": 0.4353312302839117, "grad_norm": 0.4219094450245634, "learning_rate": 4.74976613657624e-05, "loss": 0.4344, "step": 345 }, { "epoch": 0.4365930599369085, "grad_norm": 0.4410618456123614, "learning_rate": 4.747427502338634e-05, "loss": 0.4316, "step": 346 }, { "epoch": 0.43785488958990537, "grad_norm": 0.3985518605865327, "learning_rate": 4.745088868101029e-05, "loss": 0.4475, "step": 347 }, { "epoch": 0.4391167192429022, "grad_norm": 0.5011713419216915, "learning_rate": 4.7427502338634236e-05, "loss": 0.4532, "step": 348 }, { "epoch": 0.44037854889589906, "grad_norm": 0.4075925734006734, "learning_rate": 4.740411599625819e-05, "loss": 0.459, "step": 349 }, { "epoch": 0.4416403785488959, "grad_norm": 0.409201091988876, "learning_rate": 4.738072965388213e-05, "loss": 0.4491, "step": 350 }, { "epoch": 0.44290220820189274, "grad_norm": 0.47376481811729565, "learning_rate": 4.735734331150608e-05, "loss": 0.4565, "step": 351 }, { "epoch": 0.4441640378548896, "grad_norm": 0.47408919975511793, "learning_rate": 4.733395696913003e-05, "loss": 0.4619, "step": 352 }, { "epoch": 0.44542586750788643, "grad_norm": 0.3738872462088946, "learning_rate": 4.731057062675398e-05, "loss": 0.4641, "step": 353 }, { "epoch": 0.4466876971608833, "grad_norm": 0.43867494716106564, "learning_rate": 4.728718428437793e-05, "loss": 0.4643, "step": 354 }, { "epoch": 0.4479495268138801, "grad_norm": 0.32777880151918504, "learning_rate": 4.726379794200187e-05, "loss": 0.4551, "step": 355 }, { "epoch": 0.449211356466877, "grad_norm": 0.4338847471806085, "learning_rate": 4.7240411599625824e-05, "loss": 0.442, "step": 356 }, { "epoch": 0.4504731861198738, "grad_norm": 0.40255687917067834, "learning_rate": 4.721702525724977e-05, "loss": 0.459, "step": 357 }, { "epoch": 0.4517350157728707, "grad_norm": 0.35952462682865066, "learning_rate": 4.719363891487372e-05, "loss": 0.4749, "step": 358 }, { "epoch": 0.4529968454258675, "grad_norm": 0.41112673143875256, "learning_rate": 4.717025257249766e-05, "loss": 0.4391, "step": 359 }, { "epoch": 0.45425867507886436, "grad_norm": 0.3831050384509767, "learning_rate": 4.7146866230121614e-05, "loss": 0.4447, "step": 360 }, { "epoch": 0.4555205047318612, "grad_norm": 0.4219557681565918, "learning_rate": 4.712347988774556e-05, "loss": 0.4617, "step": 361 }, { "epoch": 0.45678233438485805, "grad_norm": 0.38214209826940065, "learning_rate": 4.71000935453695e-05, "loss": 0.4505, "step": 362 }, { "epoch": 0.4580441640378549, "grad_norm": 0.44336017641229486, "learning_rate": 4.7076707202993454e-05, "loss": 0.4535, "step": 363 }, { "epoch": 0.45930599369085173, "grad_norm": 0.35342012545308704, "learning_rate": 4.70533208606174e-05, "loss": 0.4527, "step": 364 }, { "epoch": 0.4605678233438486, "grad_norm": 0.4997555991592146, "learning_rate": 4.702993451824135e-05, "loss": 0.4642, "step": 365 }, { "epoch": 0.4618296529968454, "grad_norm": 0.3391604683352496, "learning_rate": 4.700654817586529e-05, "loss": 0.4453, "step": 366 }, { "epoch": 0.4630914826498423, "grad_norm": 0.4089370497351172, "learning_rate": 4.6983161833489244e-05, "loss": 0.472, "step": 367 }, { "epoch": 0.4643533123028391, "grad_norm": 0.3886812117405029, "learning_rate": 4.6959775491113195e-05, "loss": 0.4546, "step": 368 }, { "epoch": 0.465615141955836, "grad_norm": 0.40308013604631604, "learning_rate": 4.693638914873714e-05, "loss": 0.4698, "step": 369 }, { "epoch": 0.4668769716088328, "grad_norm": 0.39908796303256144, "learning_rate": 4.691300280636109e-05, "loss": 0.4398, "step": 370 }, { "epoch": 0.46813880126182966, "grad_norm": 0.34375780071366263, "learning_rate": 4.6889616463985034e-05, "loss": 0.4276, "step": 371 }, { "epoch": 0.4694006309148265, "grad_norm": 0.44623807334941695, "learning_rate": 4.6866230121608985e-05, "loss": 0.4638, "step": 372 }, { "epoch": 0.47066246056782335, "grad_norm": 0.3397043238219459, "learning_rate": 4.684284377923293e-05, "loss": 0.4552, "step": 373 }, { "epoch": 0.47192429022082016, "grad_norm": 0.36055746616859574, "learning_rate": 4.681945743685688e-05, "loss": 0.4525, "step": 374 }, { "epoch": 0.47318611987381703, "grad_norm": 0.43002163842901864, "learning_rate": 4.6796071094480824e-05, "loss": 0.4797, "step": 375 }, { "epoch": 0.4744479495268139, "grad_norm": 0.36152968291578996, "learning_rate": 4.6772684752104775e-05, "loss": 0.449, "step": 376 }, { "epoch": 0.4757097791798107, "grad_norm": 0.40467653872733833, "learning_rate": 4.674929840972872e-05, "loss": 0.4312, "step": 377 }, { "epoch": 0.4769716088328076, "grad_norm": 0.5438597217577603, "learning_rate": 4.672591206735267e-05, "loss": 0.4589, "step": 378 }, { "epoch": 0.4782334384858044, "grad_norm": 0.43271093549461065, "learning_rate": 4.6702525724976615e-05, "loss": 0.4578, "step": 379 }, { "epoch": 0.4794952681388013, "grad_norm": 0.464124976112048, "learning_rate": 4.667913938260056e-05, "loss": 0.4469, "step": 380 }, { "epoch": 0.4807570977917981, "grad_norm": 0.4627781431443684, "learning_rate": 4.665575304022451e-05, "loss": 0.4592, "step": 381 }, { "epoch": 0.48201892744479496, "grad_norm": 0.4588951184323405, "learning_rate": 4.6632366697848454e-05, "loss": 0.4524, "step": 382 }, { "epoch": 0.4832807570977918, "grad_norm": 0.48292965933347815, "learning_rate": 4.6608980355472405e-05, "loss": 0.4454, "step": 383 }, { "epoch": 0.48454258675078865, "grad_norm": 0.49294377602956396, "learning_rate": 4.6585594013096356e-05, "loss": 0.448, "step": 384 }, { "epoch": 0.48580441640378547, "grad_norm": 0.4580589597219713, "learning_rate": 4.65622076707203e-05, "loss": 0.4492, "step": 385 }, { "epoch": 0.48706624605678234, "grad_norm": 0.5172702994763925, "learning_rate": 4.653882132834425e-05, "loss": 0.4621, "step": 386 }, { "epoch": 0.48832807570977915, "grad_norm": 0.35646902219910753, "learning_rate": 4.6515434985968195e-05, "loss": 0.442, "step": 387 }, { "epoch": 0.489589905362776, "grad_norm": 0.5420809853413071, "learning_rate": 4.6492048643592146e-05, "loss": 0.4457, "step": 388 }, { "epoch": 0.4908517350157729, "grad_norm": 0.36608312227152007, "learning_rate": 4.646866230121609e-05, "loss": 0.4826, "step": 389 }, { "epoch": 0.4921135646687697, "grad_norm": 0.39685416366625836, "learning_rate": 4.644527595884004e-05, "loss": 0.4456, "step": 390 }, { "epoch": 0.4933753943217666, "grad_norm": 0.5009594552323486, "learning_rate": 4.6421889616463986e-05, "loss": 0.4571, "step": 391 }, { "epoch": 0.4946372239747634, "grad_norm": 0.3844241950709571, "learning_rate": 4.639850327408794e-05, "loss": 0.4661, "step": 392 }, { "epoch": 0.49589905362776027, "grad_norm": 0.5343035942187719, "learning_rate": 4.637511693171188e-05, "loss": 0.4463, "step": 393 }, { "epoch": 0.4971608832807571, "grad_norm": 0.5151379566813599, "learning_rate": 4.635173058933583e-05, "loss": 0.4394, "step": 394 }, { "epoch": 0.49842271293375395, "grad_norm": 0.40384501035139153, "learning_rate": 4.6328344246959776e-05, "loss": 0.476, "step": 395 }, { "epoch": 0.49968454258675077, "grad_norm": 0.5794631352070053, "learning_rate": 4.630495790458372e-05, "loss": 0.4473, "step": 396 }, { "epoch": 0.5009463722397476, "grad_norm": 0.36660061615456596, "learning_rate": 4.628157156220767e-05, "loss": 0.4452, "step": 397 }, { "epoch": 0.5022082018927445, "grad_norm": 0.6367803970841863, "learning_rate": 4.6258185219831615e-05, "loss": 0.4456, "step": 398 }, { "epoch": 0.5034700315457413, "grad_norm": 0.512988569548103, "learning_rate": 4.623479887745557e-05, "loss": 0.4573, "step": 399 }, { "epoch": 0.5047318611987381, "grad_norm": 0.47613479219136734, "learning_rate": 4.621141253507952e-05, "loss": 0.4423, "step": 400 }, { "epoch": 0.5059936908517351, "grad_norm": 0.49192304399998366, "learning_rate": 4.618802619270347e-05, "loss": 0.4574, "step": 401 }, { "epoch": 0.5072555205047319, "grad_norm": 0.4747627929690796, "learning_rate": 4.616463985032741e-05, "loss": 0.4469, "step": 402 }, { "epoch": 0.5085173501577287, "grad_norm": 0.41990698318103187, "learning_rate": 4.614125350795136e-05, "loss": 0.4467, "step": 403 }, { "epoch": 0.5097791798107255, "grad_norm": 0.5273516031327016, "learning_rate": 4.611786716557531e-05, "loss": 0.4402, "step": 404 }, { "epoch": 0.5110410094637224, "grad_norm": 0.40727201857606293, "learning_rate": 4.609448082319925e-05, "loss": 0.4599, "step": 405 }, { "epoch": 0.5123028391167193, "grad_norm": 0.461908880887009, "learning_rate": 4.60710944808232e-05, "loss": 0.4413, "step": 406 }, { "epoch": 0.5135646687697161, "grad_norm": 0.4243461107321498, "learning_rate": 4.604770813844715e-05, "loss": 0.4394, "step": 407 }, { "epoch": 0.5148264984227129, "grad_norm": 0.3872515382695516, "learning_rate": 4.60243217960711e-05, "loss": 0.4267, "step": 408 }, { "epoch": 0.5160883280757098, "grad_norm": 0.36061500648363504, "learning_rate": 4.600093545369504e-05, "loss": 0.432, "step": 409 }, { "epoch": 0.5173501577287066, "grad_norm": 0.4744359929627581, "learning_rate": 4.597754911131899e-05, "loss": 0.4988, "step": 410 }, { "epoch": 0.5186119873817034, "grad_norm": 0.3860345902513679, "learning_rate": 4.595416276894294e-05, "loss": 0.4568, "step": 411 }, { "epoch": 0.5198738170347004, "grad_norm": 0.43464359868558416, "learning_rate": 4.593077642656688e-05, "loss": 0.4357, "step": 412 }, { "epoch": 0.5211356466876972, "grad_norm": 0.3403523323781767, "learning_rate": 4.590739008419083e-05, "loss": 0.4311, "step": 413 }, { "epoch": 0.522397476340694, "grad_norm": 0.4352456529874452, "learning_rate": 4.5884003741814784e-05, "loss": 0.4238, "step": 414 }, { "epoch": 0.5236593059936908, "grad_norm": 0.3791719697260861, "learning_rate": 4.5860617399438735e-05, "loss": 0.4403, "step": 415 }, { "epoch": 0.5249211356466877, "grad_norm": 0.4261303667759798, "learning_rate": 4.583723105706268e-05, "loss": 0.4432, "step": 416 }, { "epoch": 0.5261829652996846, "grad_norm": 0.46155439469997595, "learning_rate": 4.581384471468663e-05, "loss": 0.4469, "step": 417 }, { "epoch": 0.5274447949526814, "grad_norm": 0.45075846419435117, "learning_rate": 4.5790458372310574e-05, "loss": 0.4614, "step": 418 }, { "epoch": 0.5287066246056782, "grad_norm": 0.4521871310129386, "learning_rate": 4.576707202993452e-05, "loss": 0.441, "step": 419 }, { "epoch": 0.5299684542586751, "grad_norm": 0.44849653739282547, "learning_rate": 4.574368568755847e-05, "loss": 0.4624, "step": 420 }, { "epoch": 0.5312302839116719, "grad_norm": 0.62648474263937, "learning_rate": 4.572029934518241e-05, "loss": 0.465, "step": 421 }, { "epoch": 0.5324921135646687, "grad_norm": 0.4602193860964116, "learning_rate": 4.5696913002806364e-05, "loss": 0.442, "step": 422 }, { "epoch": 0.5337539432176656, "grad_norm": 0.5235469743091229, "learning_rate": 4.567352666043031e-05, "loss": 0.4506, "step": 423 }, { "epoch": 0.5350157728706625, "grad_norm": 0.4382067869998778, "learning_rate": 4.565014031805426e-05, "loss": 0.4529, "step": 424 }, { "epoch": 0.5362776025236593, "grad_norm": 0.414864897686053, "learning_rate": 4.5626753975678204e-05, "loss": 0.4568, "step": 425 }, { "epoch": 0.5375394321766561, "grad_norm": 0.4507882322728874, "learning_rate": 4.5603367633302155e-05, "loss": 0.4364, "step": 426 }, { "epoch": 0.538801261829653, "grad_norm": 0.4000665781994082, "learning_rate": 4.55799812909261e-05, "loss": 0.4286, "step": 427 }, { "epoch": 0.5400630914826499, "grad_norm": 0.4546113052797933, "learning_rate": 4.555659494855005e-05, "loss": 0.4452, "step": 428 }, { "epoch": 0.5413249211356467, "grad_norm": 0.39829474191802555, "learning_rate": 4.5533208606174e-05, "loss": 0.4537, "step": 429 }, { "epoch": 0.5425867507886435, "grad_norm": 0.39432372698482904, "learning_rate": 4.5509822263797945e-05, "loss": 0.4494, "step": 430 }, { "epoch": 0.5438485804416404, "grad_norm": 0.36159573919792026, "learning_rate": 4.5486435921421896e-05, "loss": 0.4363, "step": 431 }, { "epoch": 0.5451104100946372, "grad_norm": 0.4032938876243454, "learning_rate": 4.546304957904584e-05, "loss": 0.4385, "step": 432 }, { "epoch": 0.546372239747634, "grad_norm": 0.36482570906111855, "learning_rate": 4.543966323666979e-05, "loss": 0.4505, "step": 433 }, { "epoch": 0.5476340694006309, "grad_norm": 0.41605458178639126, "learning_rate": 4.5416276894293735e-05, "loss": 0.452, "step": 434 }, { "epoch": 0.5488958990536278, "grad_norm": 0.4212068007485103, "learning_rate": 4.5392890551917686e-05, "loss": 0.4583, "step": 435 }, { "epoch": 0.5501577287066246, "grad_norm": 0.5733244410217441, "learning_rate": 4.536950420954163e-05, "loss": 0.4426, "step": 436 }, { "epoch": 0.5514195583596214, "grad_norm": 0.37574451124929725, "learning_rate": 4.5346117867165575e-05, "loss": 0.4384, "step": 437 }, { "epoch": 0.5526813880126183, "grad_norm": 0.5052434376414195, "learning_rate": 4.5322731524789526e-05, "loss": 0.4252, "step": 438 }, { "epoch": 0.5539432176656152, "grad_norm": 0.3933195232509149, "learning_rate": 4.529934518241347e-05, "loss": 0.4832, "step": 439 }, { "epoch": 0.555205047318612, "grad_norm": 0.5538009637456877, "learning_rate": 4.527595884003742e-05, "loss": 0.4335, "step": 440 }, { "epoch": 0.5564668769716088, "grad_norm": 0.49516292287868197, "learning_rate": 4.5252572497661365e-05, "loss": 0.4624, "step": 441 }, { "epoch": 0.5577287066246057, "grad_norm": 0.5236913259832761, "learning_rate": 4.5229186155285316e-05, "loss": 0.4438, "step": 442 }, { "epoch": 0.5589905362776025, "grad_norm": 0.43482698454836066, "learning_rate": 4.520579981290926e-05, "loss": 0.4476, "step": 443 }, { "epoch": 0.5602523659305993, "grad_norm": 0.4314161051688246, "learning_rate": 4.518241347053321e-05, "loss": 0.4396, "step": 444 }, { "epoch": 0.5615141955835962, "grad_norm": 1.4310400083479748, "learning_rate": 4.515902712815716e-05, "loss": 0.4713, "step": 445 }, { "epoch": 0.5627760252365931, "grad_norm": 0.48208951612444373, "learning_rate": 4.5135640785781106e-05, "loss": 0.4601, "step": 446 }, { "epoch": 0.5640378548895899, "grad_norm": 0.5212109512214435, "learning_rate": 4.511225444340506e-05, "loss": 0.4156, "step": 447 }, { "epoch": 0.5652996845425867, "grad_norm": 0.47265326286218023, "learning_rate": 4.5088868101029e-05, "loss": 0.4626, "step": 448 }, { "epoch": 0.5665615141955836, "grad_norm": 0.500236104998197, "learning_rate": 4.506548175865295e-05, "loss": 0.4521, "step": 449 }, { "epoch": 0.5678233438485805, "grad_norm": 0.4367088962465068, "learning_rate": 4.5042095416276897e-05, "loss": 0.4395, "step": 450 }, { "epoch": 0.5690851735015773, "grad_norm": 0.5171682585541689, "learning_rate": 4.501870907390085e-05, "loss": 0.4607, "step": 451 }, { "epoch": 0.5703470031545741, "grad_norm": 0.5548410123293586, "learning_rate": 4.499532273152479e-05, "loss": 0.4744, "step": 452 }, { "epoch": 0.571608832807571, "grad_norm": 0.38943305937832134, "learning_rate": 4.4971936389148736e-05, "loss": 0.437, "step": 453 }, { "epoch": 0.5728706624605678, "grad_norm": 0.5660035713261121, "learning_rate": 4.494855004677269e-05, "loss": 0.4585, "step": 454 }, { "epoch": 0.5741324921135647, "grad_norm": 0.4908950590551449, "learning_rate": 4.492516370439663e-05, "loss": 0.4519, "step": 455 }, { "epoch": 0.5753943217665615, "grad_norm": 0.5918357502817085, "learning_rate": 4.490177736202058e-05, "loss": 0.4604, "step": 456 }, { "epoch": 0.5766561514195584, "grad_norm": 0.6298990838019757, "learning_rate": 4.4878391019644526e-05, "loss": 0.4688, "step": 457 }, { "epoch": 0.5779179810725552, "grad_norm": 0.5265882674861123, "learning_rate": 4.485500467726848e-05, "loss": 0.4367, "step": 458 }, { "epoch": 0.579179810725552, "grad_norm": 0.6136564446676166, "learning_rate": 4.483161833489242e-05, "loss": 0.4395, "step": 459 }, { "epoch": 0.580441640378549, "grad_norm": 0.533885865194882, "learning_rate": 4.480823199251637e-05, "loss": 0.4687, "step": 460 }, { "epoch": 0.5817034700315458, "grad_norm": 0.4884388644168447, "learning_rate": 4.478484565014032e-05, "loss": 0.4387, "step": 461 }, { "epoch": 0.5829652996845426, "grad_norm": 0.5177705767250105, "learning_rate": 4.476145930776427e-05, "loss": 0.4259, "step": 462 }, { "epoch": 0.5842271293375394, "grad_norm": 0.4958171163098016, "learning_rate": 4.473807296538822e-05, "loss": 0.45, "step": 463 }, { "epoch": 0.5854889589905363, "grad_norm": 0.5565193268203671, "learning_rate": 4.471468662301216e-05, "loss": 0.4485, "step": 464 }, { "epoch": 0.5867507886435331, "grad_norm": 0.5466597685641945, "learning_rate": 4.4691300280636114e-05, "loss": 0.43, "step": 465 }, { "epoch": 0.58801261829653, "grad_norm": 0.5108803879247027, "learning_rate": 4.466791393826006e-05, "loss": 0.4277, "step": 466 }, { "epoch": 0.5892744479495268, "grad_norm": 0.5414242849004682, "learning_rate": 4.464452759588401e-05, "loss": 0.4457, "step": 467 }, { "epoch": 0.5905362776025237, "grad_norm": 0.537633417586369, "learning_rate": 4.462114125350795e-05, "loss": 0.4457, "step": 468 }, { "epoch": 0.5917981072555205, "grad_norm": 0.4809701661471773, "learning_rate": 4.45977549111319e-05, "loss": 0.4614, "step": 469 }, { "epoch": 0.5930599369085173, "grad_norm": 0.5752715974261441, "learning_rate": 4.457436856875585e-05, "loss": 0.4494, "step": 470 }, { "epoch": 0.5943217665615141, "grad_norm": 0.35254063311464173, "learning_rate": 4.455098222637979e-05, "loss": 0.4416, "step": 471 }, { "epoch": 0.5955835962145111, "grad_norm": 0.4678559360971878, "learning_rate": 4.452759588400374e-05, "loss": 0.4395, "step": 472 }, { "epoch": 0.5968454258675079, "grad_norm": 0.49903841850200437, "learning_rate": 4.450420954162769e-05, "loss": 0.4646, "step": 473 }, { "epoch": 0.5981072555205047, "grad_norm": 0.37919397077391304, "learning_rate": 4.448082319925164e-05, "loss": 0.4255, "step": 474 }, { "epoch": 0.5993690851735016, "grad_norm": 0.44350916914628585, "learning_rate": 4.445743685687559e-05, "loss": 0.4491, "step": 475 }, { "epoch": 0.6006309148264984, "grad_norm": 0.37380512416460365, "learning_rate": 4.4434050514499534e-05, "loss": 0.4483, "step": 476 }, { "epoch": 0.6018927444794953, "grad_norm": 0.3527867439069043, "learning_rate": 4.4410664172123485e-05, "loss": 0.4643, "step": 477 }, { "epoch": 0.6031545741324921, "grad_norm": 0.39205752085901513, "learning_rate": 4.438727782974743e-05, "loss": 0.4386, "step": 478 }, { "epoch": 0.604416403785489, "grad_norm": 0.37769650803756677, "learning_rate": 4.436389148737138e-05, "loss": 0.4573, "step": 479 }, { "epoch": 0.6056782334384858, "grad_norm": 0.3933853086256073, "learning_rate": 4.4340505144995324e-05, "loss": 0.4559, "step": 480 }, { "epoch": 0.6069400630914826, "grad_norm": 0.41079166905661296, "learning_rate": 4.4317118802619275e-05, "loss": 0.4357, "step": 481 }, { "epoch": 0.6082018927444794, "grad_norm": 0.3402089432094098, "learning_rate": 4.429373246024322e-05, "loss": 0.4178, "step": 482 }, { "epoch": 0.6094637223974764, "grad_norm": 0.5659535836966931, "learning_rate": 4.427034611786717e-05, "loss": 0.4522, "step": 483 }, { "epoch": 0.6107255520504732, "grad_norm": 0.5014611796244743, "learning_rate": 4.4246959775491114e-05, "loss": 0.4323, "step": 484 }, { "epoch": 0.61198738170347, "grad_norm": 0.39421421292308173, "learning_rate": 4.4223573433115065e-05, "loss": 0.4341, "step": 485 }, { "epoch": 0.6132492113564669, "grad_norm": 0.4604564321774722, "learning_rate": 4.420018709073901e-05, "loss": 0.4601, "step": 486 }, { "epoch": 0.6145110410094637, "grad_norm": 0.38100729417753104, "learning_rate": 4.4176800748362954e-05, "loss": 0.4504, "step": 487 }, { "epoch": 0.6157728706624606, "grad_norm": 0.41904162607244655, "learning_rate": 4.4153414405986905e-05, "loss": 0.4352, "step": 488 }, { "epoch": 0.6170347003154574, "grad_norm": 0.39281632664977195, "learning_rate": 4.413002806361085e-05, "loss": 0.4323, "step": 489 }, { "epoch": 0.6182965299684543, "grad_norm": 0.4289185072317031, "learning_rate": 4.4106641721234807e-05, "loss": 0.4526, "step": 490 }, { "epoch": 0.6195583596214511, "grad_norm": 0.4255383334408064, "learning_rate": 4.408325537885875e-05, "loss": 0.4379, "step": 491 }, { "epoch": 0.6208201892744479, "grad_norm": 0.3402924606186915, "learning_rate": 4.4059869036482695e-05, "loss": 0.4682, "step": 492 }, { "epoch": 0.6220820189274447, "grad_norm": 0.3266978694655513, "learning_rate": 4.4036482694106646e-05, "loss": 0.4457, "step": 493 }, { "epoch": 0.6233438485804417, "grad_norm": 0.4145002235832889, "learning_rate": 4.401309635173059e-05, "loss": 0.4473, "step": 494 }, { "epoch": 0.6246056782334385, "grad_norm": 0.3434336485276844, "learning_rate": 4.398971000935454e-05, "loss": 0.4339, "step": 495 }, { "epoch": 0.6258675078864353, "grad_norm": 0.3042779624306036, "learning_rate": 4.3966323666978485e-05, "loss": 0.4287, "step": 496 }, { "epoch": 0.6271293375394322, "grad_norm": 0.42055557269560967, "learning_rate": 4.3942937324602436e-05, "loss": 0.4344, "step": 497 }, { "epoch": 0.628391167192429, "grad_norm": 0.31034733572139783, "learning_rate": 4.391955098222638e-05, "loss": 0.4387, "step": 498 }, { "epoch": 0.6296529968454259, "grad_norm": 0.3746455966659741, "learning_rate": 4.389616463985033e-05, "loss": 0.4796, "step": 499 }, { "epoch": 0.6309148264984227, "grad_norm": 0.3415594280525405, "learning_rate": 4.3872778297474276e-05, "loss": 0.4405, "step": 500 }, { "epoch": 0.6321766561514196, "grad_norm": 0.3586890977451799, "learning_rate": 4.3849391955098227e-05, "loss": 0.4569, "step": 501 }, { "epoch": 0.6334384858044164, "grad_norm": 0.3588073969129441, "learning_rate": 4.382600561272217e-05, "loss": 0.4302, "step": 502 }, { "epoch": 0.6347003154574132, "grad_norm": 0.3559168406042167, "learning_rate": 4.3802619270346115e-05, "loss": 0.4438, "step": 503 }, { "epoch": 0.63596214511041, "grad_norm": 0.38787711879803927, "learning_rate": 4.3779232927970066e-05, "loss": 0.4474, "step": 504 }, { "epoch": 0.637223974763407, "grad_norm": 0.4208108308258923, "learning_rate": 4.375584658559401e-05, "loss": 0.4404, "step": 505 }, { "epoch": 0.6384858044164038, "grad_norm": 0.38881532606325203, "learning_rate": 4.373246024321797e-05, "loss": 0.4671, "step": 506 }, { "epoch": 0.6397476340694006, "grad_norm": 0.38519354418634505, "learning_rate": 4.370907390084191e-05, "loss": 0.4453, "step": 507 }, { "epoch": 0.6410094637223974, "grad_norm": 0.42308155184965573, "learning_rate": 4.368568755846586e-05, "loss": 0.4093, "step": 508 }, { "epoch": 0.6422712933753943, "grad_norm": 0.4924584258146572, "learning_rate": 4.366230121608981e-05, "loss": 0.4726, "step": 509 }, { "epoch": 0.6435331230283912, "grad_norm": 0.42605069864995976, "learning_rate": 4.363891487371375e-05, "loss": 0.4081, "step": 510 }, { "epoch": 0.644794952681388, "grad_norm": 0.47597406318302116, "learning_rate": 4.36155285313377e-05, "loss": 0.4503, "step": 511 }, { "epoch": 0.6460567823343849, "grad_norm": 0.3598675508236477, "learning_rate": 4.3592142188961647e-05, "loss": 0.422, "step": 512 }, { "epoch": 0.6473186119873817, "grad_norm": 0.523485837905319, "learning_rate": 4.35687558465856e-05, "loss": 0.4554, "step": 513 }, { "epoch": 0.6485804416403785, "grad_norm": 0.43920842128212106, "learning_rate": 4.354536950420954e-05, "loss": 0.4315, "step": 514 }, { "epoch": 0.6498422712933754, "grad_norm": 0.40306229403604654, "learning_rate": 4.352198316183349e-05, "loss": 0.4436, "step": 515 }, { "epoch": 0.6511041009463723, "grad_norm": 0.40412442346334654, "learning_rate": 4.349859681945744e-05, "loss": 0.4291, "step": 516 }, { "epoch": 0.6523659305993691, "grad_norm": 0.43010412495372025, "learning_rate": 4.347521047708139e-05, "loss": 0.4535, "step": 517 }, { "epoch": 0.6536277602523659, "grad_norm": 0.3614851580477356, "learning_rate": 4.345182413470533e-05, "loss": 0.4227, "step": 518 }, { "epoch": 0.6548895899053627, "grad_norm": 0.42587552864715206, "learning_rate": 4.3428437792329276e-05, "loss": 0.4365, "step": 519 }, { "epoch": 0.6561514195583596, "grad_norm": 0.44185130752308804, "learning_rate": 4.340505144995323e-05, "loss": 0.4494, "step": 520 }, { "epoch": 0.6574132492113565, "grad_norm": 0.41993103029888085, "learning_rate": 4.338166510757718e-05, "loss": 0.4566, "step": 521 }, { "epoch": 0.6586750788643533, "grad_norm": 0.3838786594364853, "learning_rate": 4.335827876520113e-05, "loss": 0.4326, "step": 522 }, { "epoch": 0.6599369085173502, "grad_norm": 0.40449788443058576, "learning_rate": 4.333489242282507e-05, "loss": 0.4268, "step": 523 }, { "epoch": 0.661198738170347, "grad_norm": 0.34921359724374645, "learning_rate": 4.3311506080449024e-05, "loss": 0.4117, "step": 524 }, { "epoch": 0.6624605678233438, "grad_norm": 0.3419179803221498, "learning_rate": 4.328811973807297e-05, "loss": 0.4354, "step": 525 }, { "epoch": 0.6637223974763407, "grad_norm": 0.3564593050333705, "learning_rate": 4.326473339569691e-05, "loss": 0.4456, "step": 526 }, { "epoch": 0.6649842271293376, "grad_norm": 0.36059467580577137, "learning_rate": 4.3241347053320864e-05, "loss": 0.4557, "step": 527 }, { "epoch": 0.6662460567823344, "grad_norm": 0.3761183394213682, "learning_rate": 4.321796071094481e-05, "loss": 0.4384, "step": 528 }, { "epoch": 0.6675078864353312, "grad_norm": 0.35207827384062645, "learning_rate": 4.319457436856876e-05, "loss": 0.4332, "step": 529 }, { "epoch": 0.668769716088328, "grad_norm": 0.35703310210724104, "learning_rate": 4.31711880261927e-05, "loss": 0.4498, "step": 530 }, { "epoch": 0.670031545741325, "grad_norm": 0.38726354542349806, "learning_rate": 4.3147801683816654e-05, "loss": 0.4314, "step": 531 }, { "epoch": 0.6712933753943218, "grad_norm": 0.3547036042328963, "learning_rate": 4.31244153414406e-05, "loss": 0.4235, "step": 532 }, { "epoch": 0.6725552050473186, "grad_norm": 0.3724855914645451, "learning_rate": 4.310102899906455e-05, "loss": 0.4301, "step": 533 }, { "epoch": 0.6738170347003155, "grad_norm": 0.3869769715215993, "learning_rate": 4.307764265668849e-05, "loss": 0.4523, "step": 534 }, { "epoch": 0.6750788643533123, "grad_norm": 0.3593843880453283, "learning_rate": 4.3054256314312444e-05, "loss": 0.4387, "step": 535 }, { "epoch": 0.6763406940063091, "grad_norm": 0.379059649813867, "learning_rate": 4.3030869971936395e-05, "loss": 0.4564, "step": 536 }, { "epoch": 0.677602523659306, "grad_norm": 0.43304822134643833, "learning_rate": 4.300748362956034e-05, "loss": 0.4356, "step": 537 }, { "epoch": 0.6788643533123029, "grad_norm": 0.3844847616625165, "learning_rate": 4.298409728718429e-05, "loss": 0.4232, "step": 538 }, { "epoch": 0.6801261829652997, "grad_norm": 0.45153562106148243, "learning_rate": 4.2960710944808235e-05, "loss": 0.4519, "step": 539 }, { "epoch": 0.6813880126182965, "grad_norm": 0.4199754018680437, "learning_rate": 4.2937324602432186e-05, "loss": 0.4585, "step": 540 }, { "epoch": 0.6826498422712933, "grad_norm": 0.3969515375114868, "learning_rate": 4.291393826005613e-05, "loss": 0.4566, "step": 541 }, { "epoch": 0.6839116719242903, "grad_norm": 0.4452861191029979, "learning_rate": 4.2890551917680074e-05, "loss": 0.4496, "step": 542 }, { "epoch": 0.6851735015772871, "grad_norm": 0.36884101162376315, "learning_rate": 4.2867165575304025e-05, "loss": 0.4589, "step": 543 }, { "epoch": 0.6864353312302839, "grad_norm": 0.4811324488132717, "learning_rate": 4.284377923292797e-05, "loss": 0.4397, "step": 544 }, { "epoch": 0.6876971608832808, "grad_norm": 0.34902169678353634, "learning_rate": 4.282039289055192e-05, "loss": 0.429, "step": 545 }, { "epoch": 0.6889589905362776, "grad_norm": 0.41364622956686403, "learning_rate": 4.2797006548175864e-05, "loss": 0.4305, "step": 546 }, { "epoch": 0.6902208201892744, "grad_norm": 0.37585486809378627, "learning_rate": 4.2773620205799815e-05, "loss": 0.4254, "step": 547 }, { "epoch": 0.6914826498422713, "grad_norm": 0.522569497977384, "learning_rate": 4.275023386342376e-05, "loss": 0.425, "step": 548 }, { "epoch": 0.6927444794952682, "grad_norm": 0.41479885249373005, "learning_rate": 4.272684752104771e-05, "loss": 0.4141, "step": 549 }, { "epoch": 0.694006309148265, "grad_norm": 0.48593431976577434, "learning_rate": 4.2703461178671655e-05, "loss": 0.4237, "step": 550 }, { "epoch": 0.6952681388012618, "grad_norm": 0.4679065097821076, "learning_rate": 4.2680074836295606e-05, "loss": 0.458, "step": 551 }, { "epoch": 0.6965299684542586, "grad_norm": 0.42036767865552627, "learning_rate": 4.265668849391956e-05, "loss": 0.4396, "step": 552 }, { "epoch": 0.6977917981072556, "grad_norm": 0.42051608735391366, "learning_rate": 4.26333021515435e-05, "loss": 0.4411, "step": 553 }, { "epoch": 0.6990536277602524, "grad_norm": 0.41470308022006025, "learning_rate": 4.260991580916745e-05, "loss": 0.4384, "step": 554 }, { "epoch": 0.7003154574132492, "grad_norm": 0.429070516691048, "learning_rate": 4.2586529466791396e-05, "loss": 0.4459, "step": 555 }, { "epoch": 0.701577287066246, "grad_norm": 0.48302394761020123, "learning_rate": 4.256314312441535e-05, "loss": 0.4354, "step": 556 }, { "epoch": 0.7028391167192429, "grad_norm": 0.4466385727865267, "learning_rate": 4.253975678203929e-05, "loss": 0.4328, "step": 557 }, { "epoch": 0.7041009463722397, "grad_norm": 0.4450659480490801, "learning_rate": 4.251637043966324e-05, "loss": 0.443, "step": 558 }, { "epoch": 0.7053627760252366, "grad_norm": 0.39155298402556954, "learning_rate": 4.2492984097287186e-05, "loss": 0.4545, "step": 559 }, { "epoch": 0.7066246056782335, "grad_norm": 0.4597596783475071, "learning_rate": 4.246959775491113e-05, "loss": 0.4257, "step": 560 }, { "epoch": 0.7078864353312303, "grad_norm": 0.3425353174772746, "learning_rate": 4.244621141253508e-05, "loss": 0.4391, "step": 561 }, { "epoch": 0.7091482649842271, "grad_norm": 0.5175267153959492, "learning_rate": 4.2422825070159026e-05, "loss": 0.4332, "step": 562 }, { "epoch": 0.7104100946372239, "grad_norm": 0.4483017193572462, "learning_rate": 4.239943872778298e-05, "loss": 0.4584, "step": 563 }, { "epoch": 0.7116719242902209, "grad_norm": 0.4001744872333345, "learning_rate": 4.237605238540692e-05, "loss": 0.4393, "step": 564 }, { "epoch": 0.7129337539432177, "grad_norm": 0.41395793315059093, "learning_rate": 4.235266604303087e-05, "loss": 0.4472, "step": 565 }, { "epoch": 0.7141955835962145, "grad_norm": 0.42294426450651845, "learning_rate": 4.2329279700654816e-05, "loss": 0.4201, "step": 566 }, { "epoch": 0.7154574132492113, "grad_norm": 0.49649175387204103, "learning_rate": 4.230589335827877e-05, "loss": 0.4113, "step": 567 }, { "epoch": 0.7167192429022082, "grad_norm": 0.38952391376570294, "learning_rate": 4.228250701590272e-05, "loss": 0.4359, "step": 568 }, { "epoch": 0.717981072555205, "grad_norm": 0.4964560679883741, "learning_rate": 4.225912067352666e-05, "loss": 0.4455, "step": 569 }, { "epoch": 0.7192429022082019, "grad_norm": 0.4213835817524196, "learning_rate": 4.223573433115061e-05, "loss": 0.4187, "step": 570 }, { "epoch": 0.7205047318611988, "grad_norm": 0.5379060798823736, "learning_rate": 4.221234798877456e-05, "loss": 0.4412, "step": 571 }, { "epoch": 0.7217665615141956, "grad_norm": 0.39446096355209653, "learning_rate": 4.218896164639851e-05, "loss": 0.4419, "step": 572 }, { "epoch": 0.7230283911671924, "grad_norm": 0.522986913047166, "learning_rate": 4.216557530402245e-05, "loss": 0.4464, "step": 573 }, { "epoch": 0.7242902208201892, "grad_norm": 0.3709785396250102, "learning_rate": 4.2142188961646403e-05, "loss": 0.4101, "step": 574 }, { "epoch": 0.7255520504731862, "grad_norm": 0.4118606924003121, "learning_rate": 4.211880261927035e-05, "loss": 0.4531, "step": 575 }, { "epoch": 0.726813880126183, "grad_norm": 0.4343964274203349, "learning_rate": 4.209541627689429e-05, "loss": 0.4503, "step": 576 }, { "epoch": 0.7280757097791798, "grad_norm": 0.3691033092508308, "learning_rate": 4.207202993451824e-05, "loss": 0.408, "step": 577 }, { "epoch": 0.7293375394321766, "grad_norm": 0.40747757859492156, "learning_rate": 4.204864359214219e-05, "loss": 0.446, "step": 578 }, { "epoch": 0.7305993690851735, "grad_norm": 0.3405719235032822, "learning_rate": 4.202525724976614e-05, "loss": 0.4229, "step": 579 }, { "epoch": 0.7318611987381703, "grad_norm": 0.3849604679849114, "learning_rate": 4.200187090739008e-05, "loss": 0.4356, "step": 580 }, { "epoch": 0.7331230283911672, "grad_norm": 0.3233642564247576, "learning_rate": 4.197848456501403e-05, "loss": 0.4522, "step": 581 }, { "epoch": 0.7343848580441641, "grad_norm": 0.3782863557344273, "learning_rate": 4.1955098222637984e-05, "loss": 0.4496, "step": 582 }, { "epoch": 0.7356466876971609, "grad_norm": 0.3560346834624068, "learning_rate": 4.193171188026193e-05, "loss": 0.4348, "step": 583 }, { "epoch": 0.7369085173501577, "grad_norm": 0.41857471926678486, "learning_rate": 4.190832553788588e-05, "loss": 0.4525, "step": 584 }, { "epoch": 0.7381703470031545, "grad_norm": 0.3644775712919491, "learning_rate": 4.1884939195509823e-05, "loss": 0.4209, "step": 585 }, { "epoch": 0.7394321766561515, "grad_norm": 0.3487334142444358, "learning_rate": 4.1861552853133774e-05, "loss": 0.446, "step": 586 }, { "epoch": 0.7406940063091483, "grad_norm": 0.33068556375135855, "learning_rate": 4.183816651075772e-05, "loss": 0.4396, "step": 587 }, { "epoch": 0.7419558359621451, "grad_norm": 0.33664799038627985, "learning_rate": 4.181478016838167e-05, "loss": 0.4311, "step": 588 }, { "epoch": 0.7432176656151419, "grad_norm": 0.33478759650015183, "learning_rate": 4.1791393826005614e-05, "loss": 0.423, "step": 589 }, { "epoch": 0.7444794952681388, "grad_norm": 0.38269942299742815, "learning_rate": 4.1768007483629565e-05, "loss": 0.4087, "step": 590 }, { "epoch": 0.7457413249211357, "grad_norm": 0.3402332828411788, "learning_rate": 4.174462114125351e-05, "loss": 0.437, "step": 591 }, { "epoch": 0.7470031545741325, "grad_norm": 0.35146161384058405, "learning_rate": 4.172123479887746e-05, "loss": 0.4168, "step": 592 }, { "epoch": 0.7482649842271294, "grad_norm": 0.3409038959715398, "learning_rate": 4.1697848456501404e-05, "loss": 0.4216, "step": 593 }, { "epoch": 0.7495268138801262, "grad_norm": 0.3785813350627158, "learning_rate": 4.167446211412535e-05, "loss": 0.4074, "step": 594 }, { "epoch": 0.750788643533123, "grad_norm": 0.2889999555548425, "learning_rate": 4.16510757717493e-05, "loss": 0.4148, "step": 595 }, { "epoch": 0.7520504731861198, "grad_norm": 0.4048080609739844, "learning_rate": 4.1627689429373243e-05, "loss": 0.4195, "step": 596 }, { "epoch": 0.7533123028391168, "grad_norm": 0.45719955503067466, "learning_rate": 4.16043030869972e-05, "loss": 0.4341, "step": 597 }, { "epoch": 0.7545741324921136, "grad_norm": 0.33964766138830355, "learning_rate": 4.1580916744621145e-05, "loss": 0.446, "step": 598 }, { "epoch": 0.7558359621451104, "grad_norm": 0.48799483688977285, "learning_rate": 4.155753040224509e-05, "loss": 0.4492, "step": 599 }, { "epoch": 0.7570977917981072, "grad_norm": 0.3316884984254001, "learning_rate": 4.153414405986904e-05, "loss": 0.4315, "step": 600 }, { "epoch": 0.7583596214511041, "grad_norm": 0.5534668714450056, "learning_rate": 4.1510757717492985e-05, "loss": 0.4435, "step": 601 }, { "epoch": 0.759621451104101, "grad_norm": 0.3721645118015671, "learning_rate": 4.1487371375116936e-05, "loss": 0.4663, "step": 602 }, { "epoch": 0.7608832807570978, "grad_norm": 0.43882825519682533, "learning_rate": 4.146398503274088e-05, "loss": 0.4363, "step": 603 }, { "epoch": 0.7621451104100946, "grad_norm": 0.3587151296469655, "learning_rate": 4.144059869036483e-05, "loss": 0.4279, "step": 604 }, { "epoch": 0.7634069400630915, "grad_norm": 0.4459565959117182, "learning_rate": 4.1417212347988775e-05, "loss": 0.4169, "step": 605 }, { "epoch": 0.7646687697160883, "grad_norm": 0.48157271475517727, "learning_rate": 4.1393826005612726e-05, "loss": 0.4559, "step": 606 }, { "epoch": 0.7659305993690851, "grad_norm": 0.4587900432463369, "learning_rate": 4.137043966323667e-05, "loss": 0.4333, "step": 607 }, { "epoch": 0.7671924290220821, "grad_norm": 0.3739353862855734, "learning_rate": 4.134705332086062e-05, "loss": 0.4508, "step": 608 }, { "epoch": 0.7684542586750789, "grad_norm": 0.4406226796377741, "learning_rate": 4.1323666978484565e-05, "loss": 0.4586, "step": 609 }, { "epoch": 0.7697160883280757, "grad_norm": 0.3485440836423936, "learning_rate": 4.130028063610851e-05, "loss": 0.4388, "step": 610 }, { "epoch": 0.7709779179810725, "grad_norm": 0.4733545501812665, "learning_rate": 4.127689429373246e-05, "loss": 0.4263, "step": 611 }, { "epoch": 0.7722397476340694, "grad_norm": 0.4373879708420027, "learning_rate": 4.1253507951356405e-05, "loss": 0.4538, "step": 612 }, { "epoch": 0.7735015772870663, "grad_norm": 0.3738367095175759, "learning_rate": 4.123012160898036e-05, "loss": 0.4362, "step": 613 }, { "epoch": 0.7747634069400631, "grad_norm": 0.45105267197037524, "learning_rate": 4.120673526660431e-05, "loss": 0.4544, "step": 614 }, { "epoch": 0.7760252365930599, "grad_norm": 0.34647758609620666, "learning_rate": 4.118334892422826e-05, "loss": 0.438, "step": 615 }, { "epoch": 0.7772870662460568, "grad_norm": 0.3951301839562351, "learning_rate": 4.11599625818522e-05, "loss": 0.431, "step": 616 }, { "epoch": 0.7785488958990536, "grad_norm": 0.3780399674517972, "learning_rate": 4.1136576239476146e-05, "loss": 0.4132, "step": 617 }, { "epoch": 0.7798107255520504, "grad_norm": 0.3743086078346953, "learning_rate": 4.11131898971001e-05, "loss": 0.4151, "step": 618 }, { "epoch": 0.7810725552050474, "grad_norm": 0.40802782001960364, "learning_rate": 4.108980355472404e-05, "loss": 0.4176, "step": 619 }, { "epoch": 0.7823343848580442, "grad_norm": 0.4398133641402211, "learning_rate": 4.106641721234799e-05, "loss": 0.4352, "step": 620 }, { "epoch": 0.783596214511041, "grad_norm": 0.4012703376390044, "learning_rate": 4.1043030869971936e-05, "loss": 0.4231, "step": 621 }, { "epoch": 0.7848580441640378, "grad_norm": 0.35751372200716297, "learning_rate": 4.101964452759589e-05, "loss": 0.4201, "step": 622 }, { "epoch": 0.7861198738170347, "grad_norm": 0.38091023587351064, "learning_rate": 4.099625818521983e-05, "loss": 0.4378, "step": 623 }, { "epoch": 0.7873817034700316, "grad_norm": 0.34740562386370577, "learning_rate": 4.097287184284378e-05, "loss": 0.4324, "step": 624 }, { "epoch": 0.7886435331230284, "grad_norm": 0.34221155069398806, "learning_rate": 4.094948550046773e-05, "loss": 0.4242, "step": 625 }, { "epoch": 0.7899053627760252, "grad_norm": 0.33866803742388984, "learning_rate": 4.092609915809167e-05, "loss": 0.43, "step": 626 }, { "epoch": 0.7911671924290221, "grad_norm": 0.3716580313248445, "learning_rate": 4.090271281571562e-05, "loss": 0.4455, "step": 627 }, { "epoch": 0.7924290220820189, "grad_norm": 0.3365624459277081, "learning_rate": 4.087932647333957e-05, "loss": 0.4482, "step": 628 }, { "epoch": 0.7936908517350157, "grad_norm": 0.42573442463238836, "learning_rate": 4.0855940130963524e-05, "loss": 0.4253, "step": 629 }, { "epoch": 0.7949526813880127, "grad_norm": 0.32745926855864316, "learning_rate": 4.083255378858747e-05, "loss": 0.4317, "step": 630 }, { "epoch": 0.7962145110410095, "grad_norm": 0.458454430627373, "learning_rate": 4.080916744621142e-05, "loss": 0.44, "step": 631 }, { "epoch": 0.7974763406940063, "grad_norm": 0.4216762474799929, "learning_rate": 4.078578110383536e-05, "loss": 0.461, "step": 632 }, { "epoch": 0.7987381703470031, "grad_norm": 0.37127653820845147, "learning_rate": 4.076239476145931e-05, "loss": 0.4474, "step": 633 }, { "epoch": 0.8, "grad_norm": 0.43518808079278437, "learning_rate": 4.073900841908326e-05, "loss": 0.4503, "step": 634 }, { "epoch": 0.8012618296529969, "grad_norm": 0.329390844413726, "learning_rate": 4.07156220767072e-05, "loss": 0.4279, "step": 635 }, { "epoch": 0.8025236593059937, "grad_norm": 0.37076228521139815, "learning_rate": 4.0692235734331153e-05, "loss": 0.4263, "step": 636 }, { "epoch": 0.8037854889589905, "grad_norm": 0.32706523945269755, "learning_rate": 4.06688493919551e-05, "loss": 0.4153, "step": 637 }, { "epoch": 0.8050473186119874, "grad_norm": 0.3702715015248414, "learning_rate": 4.064546304957905e-05, "loss": 0.4354, "step": 638 }, { "epoch": 0.8063091482649842, "grad_norm": 0.329317073550233, "learning_rate": 4.062207670720299e-05, "loss": 0.4324, "step": 639 }, { "epoch": 0.807570977917981, "grad_norm": 0.3151690546448997, "learning_rate": 4.0598690364826944e-05, "loss": 0.4328, "step": 640 }, { "epoch": 0.8088328075709779, "grad_norm": 0.35359704655421614, "learning_rate": 4.057530402245089e-05, "loss": 0.4383, "step": 641 }, { "epoch": 0.8100946372239748, "grad_norm": 0.30483259175713534, "learning_rate": 4.055191768007484e-05, "loss": 0.4474, "step": 642 }, { "epoch": 0.8113564668769716, "grad_norm": 0.4608876851757235, "learning_rate": 4.052853133769879e-05, "loss": 0.4244, "step": 643 }, { "epoch": 0.8126182965299684, "grad_norm": 0.2901076465312291, "learning_rate": 4.0505144995322734e-05, "loss": 0.4166, "step": 644 }, { "epoch": 0.8138801261829653, "grad_norm": 0.396062100348901, "learning_rate": 4.0481758652946685e-05, "loss": 0.4496, "step": 645 }, { "epoch": 0.8151419558359622, "grad_norm": 0.38983274963117703, "learning_rate": 4.045837231057063e-05, "loss": 0.4506, "step": 646 }, { "epoch": 0.816403785488959, "grad_norm": 0.3801597581707347, "learning_rate": 4.043498596819458e-05, "loss": 0.428, "step": 647 }, { "epoch": 0.8176656151419558, "grad_norm": 0.4048840355050778, "learning_rate": 4.0411599625818524e-05, "loss": 0.434, "step": 648 }, { "epoch": 0.8189274447949527, "grad_norm": 0.3342960803281639, "learning_rate": 4.038821328344247e-05, "loss": 0.4273, "step": 649 }, { "epoch": 0.8201892744479495, "grad_norm": 0.3942047350763883, "learning_rate": 4.036482694106642e-05, "loss": 0.4259, "step": 650 }, { "epoch": 0.8214511041009463, "grad_norm": 0.3196064219566821, "learning_rate": 4.0341440598690364e-05, "loss": 0.4473, "step": 651 }, { "epoch": 0.8227129337539432, "grad_norm": 0.3657854410486474, "learning_rate": 4.0318054256314315e-05, "loss": 0.4321, "step": 652 }, { "epoch": 0.8239747634069401, "grad_norm": 0.35041573123894304, "learning_rate": 4.029466791393826e-05, "loss": 0.4493, "step": 653 }, { "epoch": 0.8252365930599369, "grad_norm": 0.3577573987387102, "learning_rate": 4.027128157156221e-05, "loss": 0.4213, "step": 654 }, { "epoch": 0.8264984227129337, "grad_norm": 0.4038871704336403, "learning_rate": 4.0247895229186154e-05, "loss": 0.4661, "step": 655 }, { "epoch": 0.8277602523659306, "grad_norm": 0.3296905922993033, "learning_rate": 4.0224508886810105e-05, "loss": 0.441, "step": 656 }, { "epoch": 0.8290220820189275, "grad_norm": 0.3943564256663512, "learning_rate": 4.020112254443405e-05, "loss": 0.4249, "step": 657 }, { "epoch": 0.8302839116719243, "grad_norm": 0.36135976064488134, "learning_rate": 4.0177736202058e-05, "loss": 0.4447, "step": 658 }, { "epoch": 0.8315457413249211, "grad_norm": 0.37665871536125545, "learning_rate": 4.015434985968195e-05, "loss": 0.4204, "step": 659 }, { "epoch": 0.832807570977918, "grad_norm": 0.3726474645330793, "learning_rate": 4.0130963517305895e-05, "loss": 0.4415, "step": 660 }, { "epoch": 0.8340694006309148, "grad_norm": 0.35224746883941765, "learning_rate": 4.0107577174929846e-05, "loss": 0.4281, "step": 661 }, { "epoch": 0.8353312302839117, "grad_norm": 0.38223589989494433, "learning_rate": 4.008419083255379e-05, "loss": 0.4308, "step": 662 }, { "epoch": 0.8365930599369085, "grad_norm": 0.3087611178112773, "learning_rate": 4.006080449017774e-05, "loss": 0.4186, "step": 663 }, { "epoch": 0.8378548895899054, "grad_norm": 0.34670139753450624, "learning_rate": 4.0037418147801686e-05, "loss": 0.4177, "step": 664 }, { "epoch": 0.8391167192429022, "grad_norm": 0.34652085841403685, "learning_rate": 4.001403180542564e-05, "loss": 0.4364, "step": 665 }, { "epoch": 0.840378548895899, "grad_norm": 0.361502623011677, "learning_rate": 3.999064546304958e-05, "loss": 0.4201, "step": 666 }, { "epoch": 0.841640378548896, "grad_norm": 0.3494094693242219, "learning_rate": 3.9967259120673525e-05, "loss": 0.4174, "step": 667 }, { "epoch": 0.8429022082018928, "grad_norm": 0.33472910657187593, "learning_rate": 3.9943872778297476e-05, "loss": 0.4331, "step": 668 }, { "epoch": 0.8441640378548896, "grad_norm": 0.3713530741655262, "learning_rate": 3.992048643592142e-05, "loss": 0.4238, "step": 669 }, { "epoch": 0.8454258675078864, "grad_norm": 0.33985585439988236, "learning_rate": 3.989710009354537e-05, "loss": 0.4374, "step": 670 }, { "epoch": 0.8466876971608833, "grad_norm": 0.35907811220701313, "learning_rate": 3.9873713751169315e-05, "loss": 0.4413, "step": 671 }, { "epoch": 0.8479495268138801, "grad_norm": 0.3201499511866596, "learning_rate": 3.9850327408793266e-05, "loss": 0.4438, "step": 672 }, { "epoch": 0.849211356466877, "grad_norm": 0.31837050565507263, "learning_rate": 3.982694106641721e-05, "loss": 0.4147, "step": 673 }, { "epoch": 0.8504731861198738, "grad_norm": 0.33260479730505754, "learning_rate": 3.980355472404116e-05, "loss": 0.425, "step": 674 }, { "epoch": 0.8517350157728707, "grad_norm": 0.3154252162314694, "learning_rate": 3.978016838166511e-05, "loss": 0.4274, "step": 675 }, { "epoch": 0.8529968454258675, "grad_norm": 0.3509639674448653, "learning_rate": 3.975678203928906e-05, "loss": 0.4083, "step": 676 }, { "epoch": 0.8542586750788643, "grad_norm": 0.3227407268094023, "learning_rate": 3.973339569691301e-05, "loss": 0.4368, "step": 677 }, { "epoch": 0.8555205047318613, "grad_norm": 0.36375318224587605, "learning_rate": 3.971000935453695e-05, "loss": 0.4417, "step": 678 }, { "epoch": 0.8567823343848581, "grad_norm": 0.3097239051504593, "learning_rate": 3.96866230121609e-05, "loss": 0.4248, "step": 679 }, { "epoch": 0.8580441640378549, "grad_norm": 0.35944189037903757, "learning_rate": 3.966323666978485e-05, "loss": 0.4228, "step": 680 }, { "epoch": 0.8593059936908517, "grad_norm": 0.36916882490285263, "learning_rate": 3.96398503274088e-05, "loss": 0.4429, "step": 681 }, { "epoch": 0.8605678233438486, "grad_norm": 0.33044867521378807, "learning_rate": 3.961646398503274e-05, "loss": 0.4171, "step": 682 }, { "epoch": 0.8618296529968454, "grad_norm": 0.3811560022267869, "learning_rate": 3.9593077642656686e-05, "loss": 0.444, "step": 683 }, { "epoch": 0.8630914826498423, "grad_norm": 0.2985371946666912, "learning_rate": 3.956969130028064e-05, "loss": 0.4315, "step": 684 }, { "epoch": 0.8643533123028391, "grad_norm": 0.3738810297922705, "learning_rate": 3.954630495790458e-05, "loss": 0.4406, "step": 685 }, { "epoch": 0.865615141955836, "grad_norm": 0.32869366470135347, "learning_rate": 3.952291861552853e-05, "loss": 0.4133, "step": 686 }, { "epoch": 0.8668769716088328, "grad_norm": 0.3399004506033569, "learning_rate": 3.949953227315248e-05, "loss": 0.4107, "step": 687 }, { "epoch": 0.8681388012618296, "grad_norm": 0.3602210999530381, "learning_rate": 3.947614593077643e-05, "loss": 0.4288, "step": 688 }, { "epoch": 0.8694006309148264, "grad_norm": 0.41848151143511536, "learning_rate": 3.945275958840038e-05, "loss": 0.4378, "step": 689 }, { "epoch": 0.8706624605678234, "grad_norm": 0.37010331967694726, "learning_rate": 3.942937324602432e-05, "loss": 0.4406, "step": 690 }, { "epoch": 0.8719242902208202, "grad_norm": 0.3317195521513011, "learning_rate": 3.9405986903648274e-05, "loss": 0.4307, "step": 691 }, { "epoch": 0.873186119873817, "grad_norm": 0.4550277809958294, "learning_rate": 3.938260056127222e-05, "loss": 0.4386, "step": 692 }, { "epoch": 0.8744479495268139, "grad_norm": 0.3725578082898371, "learning_rate": 3.935921421889617e-05, "loss": 0.4396, "step": 693 }, { "epoch": 0.8757097791798107, "grad_norm": 0.3785568199852292, "learning_rate": 3.933582787652011e-05, "loss": 0.4433, "step": 694 }, { "epoch": 0.8769716088328076, "grad_norm": 0.427373960153116, "learning_rate": 3.9312441534144064e-05, "loss": 0.4264, "step": 695 }, { "epoch": 0.8782334384858044, "grad_norm": 0.3428147880999008, "learning_rate": 3.928905519176801e-05, "loss": 0.4261, "step": 696 }, { "epoch": 0.8794952681388013, "grad_norm": 0.43067209245311117, "learning_rate": 3.926566884939196e-05, "loss": 0.4379, "step": 697 }, { "epoch": 0.8807570977917981, "grad_norm": 0.3411135206290369, "learning_rate": 3.9242282507015904e-05, "loss": 0.4291, "step": 698 }, { "epoch": 0.8820189274447949, "grad_norm": 0.36546551690340495, "learning_rate": 3.9218896164639855e-05, "loss": 0.4117, "step": 699 }, { "epoch": 0.8832807570977917, "grad_norm": 0.42636538194860646, "learning_rate": 3.91955098222638e-05, "loss": 0.4256, "step": 700 }, { "epoch": 0.8845425867507887, "grad_norm": 0.3093446184352861, "learning_rate": 3.917212347988774e-05, "loss": 0.4132, "step": 701 }, { "epoch": 0.8858044164037855, "grad_norm": 0.34384218972681596, "learning_rate": 3.9148737137511694e-05, "loss": 0.4032, "step": 702 }, { "epoch": 0.8870662460567823, "grad_norm": 0.41802502590520235, "learning_rate": 3.912535079513564e-05, "loss": 0.417, "step": 703 }, { "epoch": 0.8883280757097792, "grad_norm": 0.33712433482397586, "learning_rate": 3.9101964452759596e-05, "loss": 0.4246, "step": 704 }, { "epoch": 0.889589905362776, "grad_norm": 0.5014304821600325, "learning_rate": 3.907857811038354e-05, "loss": 0.423, "step": 705 }, { "epoch": 0.8908517350157729, "grad_norm": 0.33107870773551673, "learning_rate": 3.9055191768007484e-05, "loss": 0.4335, "step": 706 }, { "epoch": 0.8921135646687697, "grad_norm": 0.5372119630059076, "learning_rate": 3.9031805425631435e-05, "loss": 0.4234, "step": 707 }, { "epoch": 0.8933753943217666, "grad_norm": 0.3848874000308558, "learning_rate": 3.900841908325538e-05, "loss": 0.424, "step": 708 }, { "epoch": 0.8946372239747634, "grad_norm": 0.4965158148336378, "learning_rate": 3.898503274087933e-05, "loss": 0.437, "step": 709 }, { "epoch": 0.8958990536277602, "grad_norm": 0.442323511349848, "learning_rate": 3.8961646398503275e-05, "loss": 0.4271, "step": 710 }, { "epoch": 0.897160883280757, "grad_norm": 0.38881198641508535, "learning_rate": 3.8938260056127226e-05, "loss": 0.4229, "step": 711 }, { "epoch": 0.898422712933754, "grad_norm": 0.5067316035284152, "learning_rate": 3.891487371375117e-05, "loss": 0.4557, "step": 712 }, { "epoch": 0.8996845425867508, "grad_norm": 0.3679983868482137, "learning_rate": 3.889148737137512e-05, "loss": 0.4393, "step": 713 }, { "epoch": 0.9009463722397476, "grad_norm": 0.431149920911744, "learning_rate": 3.8868101028999065e-05, "loss": 0.4326, "step": 714 }, { "epoch": 0.9022082018927445, "grad_norm": 0.425059732874692, "learning_rate": 3.8844714686623016e-05, "loss": 0.454, "step": 715 }, { "epoch": 0.9034700315457413, "grad_norm": 0.30816490540210917, "learning_rate": 3.882132834424696e-05, "loss": 0.4284, "step": 716 }, { "epoch": 0.9047318611987382, "grad_norm": 0.45545948903403044, "learning_rate": 3.8797942001870904e-05, "loss": 0.4169, "step": 717 }, { "epoch": 0.905993690851735, "grad_norm": 0.3744606227306325, "learning_rate": 3.8774555659494855e-05, "loss": 0.4169, "step": 718 }, { "epoch": 0.9072555205047319, "grad_norm": 0.4027162001620634, "learning_rate": 3.87511693171188e-05, "loss": 0.4445, "step": 719 }, { "epoch": 0.9085173501577287, "grad_norm": 0.41926678688719066, "learning_rate": 3.872778297474276e-05, "loss": 0.4266, "step": 720 }, { "epoch": 0.9097791798107255, "grad_norm": 0.33955020130891794, "learning_rate": 3.87043966323667e-05, "loss": 0.426, "step": 721 }, { "epoch": 0.9110410094637224, "grad_norm": 0.3199952036237926, "learning_rate": 3.868101028999065e-05, "loss": 0.4281, "step": 722 }, { "epoch": 0.9123028391167193, "grad_norm": 0.3501330094018627, "learning_rate": 3.8657623947614596e-05, "loss": 0.4368, "step": 723 }, { "epoch": 0.9135646687697161, "grad_norm": 0.3242288691048073, "learning_rate": 3.863423760523854e-05, "loss": 0.4102, "step": 724 }, { "epoch": 0.9148264984227129, "grad_norm": 0.33250536516965684, "learning_rate": 3.861085126286249e-05, "loss": 0.4379, "step": 725 }, { "epoch": 0.9160883280757098, "grad_norm": 0.4129102967968599, "learning_rate": 3.8587464920486436e-05, "loss": 0.4246, "step": 726 }, { "epoch": 0.9173501577287066, "grad_norm": 0.3546053820873773, "learning_rate": 3.856407857811039e-05, "loss": 0.413, "step": 727 }, { "epoch": 0.9186119873817035, "grad_norm": 0.4212123841734, "learning_rate": 3.854069223573433e-05, "loss": 0.432, "step": 728 }, { "epoch": 0.9198738170347003, "grad_norm": 0.3602236416975185, "learning_rate": 3.851730589335828e-05, "loss": 0.4279, "step": 729 }, { "epoch": 0.9211356466876972, "grad_norm": 0.3784497727963806, "learning_rate": 3.8493919550982226e-05, "loss": 0.4449, "step": 730 }, { "epoch": 0.922397476340694, "grad_norm": 0.4300714612849566, "learning_rate": 3.847053320860618e-05, "loss": 0.4327, "step": 731 }, { "epoch": 0.9236593059936908, "grad_norm": 0.32871622058071787, "learning_rate": 3.844714686623012e-05, "loss": 0.4238, "step": 732 }, { "epoch": 0.9249211356466877, "grad_norm": 0.40126201147744406, "learning_rate": 3.8423760523854066e-05, "loss": 0.4342, "step": 733 }, { "epoch": 0.9261829652996846, "grad_norm": 0.3318255828145147, "learning_rate": 3.8400374181478017e-05, "loss": 0.4215, "step": 734 }, { "epoch": 0.9274447949526814, "grad_norm": 0.41245061294788965, "learning_rate": 3.837698783910197e-05, "loss": 0.4254, "step": 735 }, { "epoch": 0.9287066246056782, "grad_norm": 0.4253418564389487, "learning_rate": 3.835360149672592e-05, "loss": 0.4488, "step": 736 }, { "epoch": 0.929968454258675, "grad_norm": 0.4177608332588415, "learning_rate": 3.833021515434986e-05, "loss": 0.4276, "step": 737 }, { "epoch": 0.931230283911672, "grad_norm": 0.36030029749688336, "learning_rate": 3.8306828811973814e-05, "loss": 0.4325, "step": 738 }, { "epoch": 0.9324921135646688, "grad_norm": 0.436317603599469, "learning_rate": 3.828344246959776e-05, "loss": 0.4023, "step": 739 }, { "epoch": 0.9337539432176656, "grad_norm": 0.3109759270310245, "learning_rate": 3.82600561272217e-05, "loss": 0.4024, "step": 740 }, { "epoch": 0.9350157728706625, "grad_norm": 1.5048816503085398, "learning_rate": 3.823666978484565e-05, "loss": 0.4636, "step": 741 }, { "epoch": 0.9362776025236593, "grad_norm": 0.42173395496409033, "learning_rate": 3.82132834424696e-05, "loss": 0.4327, "step": 742 }, { "epoch": 0.9375394321766561, "grad_norm": 0.40275731751419164, "learning_rate": 3.818989710009355e-05, "loss": 0.4184, "step": 743 }, { "epoch": 0.938801261829653, "grad_norm": 0.35282932967434705, "learning_rate": 3.816651075771749e-05, "loss": 0.4192, "step": 744 }, { "epoch": 0.9400630914826499, "grad_norm": 0.4722826618442621, "learning_rate": 3.814312441534144e-05, "loss": 0.4421, "step": 745 }, { "epoch": 0.9413249211356467, "grad_norm": 0.32865010598873784, "learning_rate": 3.811973807296539e-05, "loss": 0.4083, "step": 746 }, { "epoch": 0.9425867507886435, "grad_norm": 0.46884013885868625, "learning_rate": 3.809635173058934e-05, "loss": 0.4219, "step": 747 }, { "epoch": 0.9438485804416403, "grad_norm": 0.35985337826047537, "learning_rate": 3.807296538821328e-05, "loss": 0.4299, "step": 748 }, { "epoch": 0.9451104100946373, "grad_norm": 0.36707009944234137, "learning_rate": 3.8049579045837234e-05, "loss": 0.4263, "step": 749 }, { "epoch": 0.9463722397476341, "grad_norm": 0.37269078522221316, "learning_rate": 3.8026192703461185e-05, "loss": 0.4461, "step": 750 }, { "epoch": 0.9476340694006309, "grad_norm": 0.456708083016668, "learning_rate": 3.800280636108513e-05, "loss": 0.4198, "step": 751 }, { "epoch": 0.9488958990536278, "grad_norm": 0.4178876254091584, "learning_rate": 3.797942001870908e-05, "loss": 0.415, "step": 752 }, { "epoch": 0.9501577287066246, "grad_norm": 0.4708117283045452, "learning_rate": 3.7956033676333024e-05, "loss": 0.4447, "step": 753 }, { "epoch": 0.9514195583596214, "grad_norm": 0.49357790798383305, "learning_rate": 3.7932647333956975e-05, "loss": 0.4079, "step": 754 }, { "epoch": 0.9526813880126183, "grad_norm": 0.4303658574966331, "learning_rate": 3.790926099158092e-05, "loss": 0.4176, "step": 755 }, { "epoch": 0.9539432176656152, "grad_norm": 0.5405666235425333, "learning_rate": 3.788587464920486e-05, "loss": 0.4338, "step": 756 }, { "epoch": 0.955205047318612, "grad_norm": 0.3935426658753803, "learning_rate": 3.7862488306828814e-05, "loss": 0.4344, "step": 757 }, { "epoch": 0.9564668769716088, "grad_norm": 0.5057203556678131, "learning_rate": 3.783910196445276e-05, "loss": 0.4215, "step": 758 }, { "epoch": 0.9577287066246056, "grad_norm": 0.37222918971966207, "learning_rate": 3.781571562207671e-05, "loss": 0.4329, "step": 759 }, { "epoch": 0.9589905362776026, "grad_norm": 0.5007186602660089, "learning_rate": 3.7792329279700654e-05, "loss": 0.4102, "step": 760 }, { "epoch": 0.9602523659305994, "grad_norm": 0.32409067206846875, "learning_rate": 3.7768942937324605e-05, "loss": 0.4214, "step": 761 }, { "epoch": 0.9615141955835962, "grad_norm": 0.3659401766631626, "learning_rate": 3.774555659494855e-05, "loss": 0.4322, "step": 762 }, { "epoch": 0.9627760252365931, "grad_norm": 0.34413998554490943, "learning_rate": 3.77221702525725e-05, "loss": 0.4106, "step": 763 }, { "epoch": 0.9640378548895899, "grad_norm": 0.36846896990638295, "learning_rate": 3.7698783910196444e-05, "loss": 0.4327, "step": 764 }, { "epoch": 0.9652996845425867, "grad_norm": 0.3922870667012599, "learning_rate": 3.7675397567820395e-05, "loss": 0.4299, "step": 765 }, { "epoch": 0.9665615141955836, "grad_norm": 0.3807632497796044, "learning_rate": 3.7652011225444346e-05, "loss": 0.4211, "step": 766 }, { "epoch": 0.9678233438485805, "grad_norm": 0.3482774062572583, "learning_rate": 3.762862488306829e-05, "loss": 0.4219, "step": 767 }, { "epoch": 0.9690851735015773, "grad_norm": 0.34241672211909924, "learning_rate": 3.760523854069224e-05, "loss": 0.4244, "step": 768 }, { "epoch": 0.9703470031545741, "grad_norm": 0.4019196278065781, "learning_rate": 3.7581852198316185e-05, "loss": 0.4329, "step": 769 }, { "epoch": 0.9716088328075709, "grad_norm": 0.29807091527913865, "learning_rate": 3.7558465855940136e-05, "loss": 0.4296, "step": 770 }, { "epoch": 0.9728706624605679, "grad_norm": 0.40011924257778314, "learning_rate": 3.753507951356408e-05, "loss": 0.4133, "step": 771 }, { "epoch": 0.9741324921135647, "grad_norm": 0.2925392860337392, "learning_rate": 3.751169317118803e-05, "loss": 0.4276, "step": 772 }, { "epoch": 0.9753943217665615, "grad_norm": 0.37308903579201885, "learning_rate": 3.7488306828811976e-05, "loss": 0.4374, "step": 773 }, { "epoch": 0.9766561514195583, "grad_norm": 0.3133799714800022, "learning_rate": 3.746492048643592e-05, "loss": 0.4217, "step": 774 }, { "epoch": 0.9779179810725552, "grad_norm": 0.3538493354661285, "learning_rate": 3.744153414405987e-05, "loss": 0.426, "step": 775 }, { "epoch": 0.979179810725552, "grad_norm": 0.328021005514308, "learning_rate": 3.7418147801683815e-05, "loss": 0.4034, "step": 776 }, { "epoch": 0.9804416403785489, "grad_norm": 0.3141541760139432, "learning_rate": 3.7394761459307766e-05, "loss": 0.4212, "step": 777 }, { "epoch": 0.9817034700315458, "grad_norm": 0.3479670234738308, "learning_rate": 3.737137511693171e-05, "loss": 0.417, "step": 778 }, { "epoch": 0.9829652996845426, "grad_norm": 0.32072962633939733, "learning_rate": 3.734798877455566e-05, "loss": 0.4299, "step": 779 }, { "epoch": 0.9842271293375394, "grad_norm": 0.33326960847079234, "learning_rate": 3.7324602432179605e-05, "loss": 0.4402, "step": 780 }, { "epoch": 0.9854889589905362, "grad_norm": 0.34308846695356726, "learning_rate": 3.7301216089803556e-05, "loss": 0.4107, "step": 781 }, { "epoch": 0.9867507886435332, "grad_norm": 0.3260433333482398, "learning_rate": 3.727782974742751e-05, "loss": 0.4203, "step": 782 }, { "epoch": 0.98801261829653, "grad_norm": 0.3517607889110412, "learning_rate": 3.725444340505145e-05, "loss": 0.4196, "step": 783 }, { "epoch": 0.9892744479495268, "grad_norm": 0.3564649077379764, "learning_rate": 3.72310570626754e-05, "loss": 0.423, "step": 784 }, { "epoch": 0.9905362776025236, "grad_norm": 0.29034785445451033, "learning_rate": 3.7207670720299347e-05, "loss": 0.414, "step": 785 }, { "epoch": 0.9917981072555205, "grad_norm": 0.3509019361462701, "learning_rate": 3.71842843779233e-05, "loss": 0.4357, "step": 786 }, { "epoch": 0.9930599369085173, "grad_norm": 0.3279529606177725, "learning_rate": 3.716089803554724e-05, "loss": 0.4243, "step": 787 }, { "epoch": 0.9943217665615142, "grad_norm": 0.3650044042459212, "learning_rate": 3.713751169317119e-05, "loss": 0.4206, "step": 788 }, { "epoch": 0.9955835962145111, "grad_norm": 0.31446820895713357, "learning_rate": 3.711412535079514e-05, "loss": 0.4366, "step": 789 }, { "epoch": 0.9968454258675079, "grad_norm": 0.36622695322206794, "learning_rate": 3.709073900841908e-05, "loss": 0.4212, "step": 790 }, { "epoch": 0.9981072555205047, "grad_norm": 0.38761706135296253, "learning_rate": 3.706735266604303e-05, "loss": 0.4169, "step": 791 }, { "epoch": 0.9993690851735015, "grad_norm": 0.3089432492526613, "learning_rate": 3.7043966323666976e-05, "loss": 0.4147, "step": 792 }, { "epoch": 1.0, "grad_norm": 0.3089432492526613, "learning_rate": 3.702057998129093e-05, "loss": 0.4085, "step": 793 }, { "epoch": 1.001261829652997, "grad_norm": 0.5569564758433563, "learning_rate": 3.699719363891487e-05, "loss": 0.3685, "step": 794 }, { "epoch": 1.0025236593059936, "grad_norm": 0.37579372581890236, "learning_rate": 3.697380729653882e-05, "loss": 0.369, "step": 795 }, { "epoch": 1.0037854889589906, "grad_norm": 0.32288290925767904, "learning_rate": 3.695042095416277e-05, "loss": 0.3852, "step": 796 }, { "epoch": 1.0050473186119875, "grad_norm": 0.3412866092456408, "learning_rate": 3.692703461178672e-05, "loss": 0.3592, "step": 797 }, { "epoch": 1.0063091482649842, "grad_norm": 0.35045228278813284, "learning_rate": 3.690364826941067e-05, "loss": 0.3619, "step": 798 }, { "epoch": 1.0075709779179811, "grad_norm": 0.44283339115902054, "learning_rate": 3.688026192703461e-05, "loss": 0.3752, "step": 799 }, { "epoch": 1.0088328075709778, "grad_norm": 0.37837032334730214, "learning_rate": 3.6856875584658564e-05, "loss": 0.3664, "step": 800 }, { "epoch": 1.0100946372239747, "grad_norm": 0.35566840538672306, "learning_rate": 3.683348924228251e-05, "loss": 0.3824, "step": 801 }, { "epoch": 1.0113564668769717, "grad_norm": 0.4522966651934033, "learning_rate": 3.681010289990646e-05, "loss": 0.3607, "step": 802 }, { "epoch": 1.0126182965299684, "grad_norm": 0.36040618274363045, "learning_rate": 3.67867165575304e-05, "loss": 0.3516, "step": 803 }, { "epoch": 1.0138801261829653, "grad_norm": 0.36771958735568594, "learning_rate": 3.6763330215154354e-05, "loss": 0.3666, "step": 804 }, { "epoch": 1.0151419558359622, "grad_norm": 0.7407933345567267, "learning_rate": 3.67399438727783e-05, "loss": 0.3901, "step": 805 }, { "epoch": 1.016403785488959, "grad_norm": 0.3646736558335337, "learning_rate": 3.671655753040224e-05, "loss": 0.3693, "step": 806 }, { "epoch": 1.0176656151419559, "grad_norm": 0.37495674973987525, "learning_rate": 3.669317118802619e-05, "loss": 0.3643, "step": 807 }, { "epoch": 1.0189274447949528, "grad_norm": 0.3427077427931033, "learning_rate": 3.666978484565014e-05, "loss": 0.3445, "step": 808 }, { "epoch": 1.0201892744479495, "grad_norm": 0.36271956683984635, "learning_rate": 3.664639850327409e-05, "loss": 0.3702, "step": 809 }, { "epoch": 1.0214511041009464, "grad_norm": 0.391981698736177, "learning_rate": 3.662301216089803e-05, "loss": 0.395, "step": 810 }, { "epoch": 1.0227129337539431, "grad_norm": 0.36402823933521916, "learning_rate": 3.659962581852199e-05, "loss": 0.3774, "step": 811 }, { "epoch": 1.02397476340694, "grad_norm": 0.37164900118816835, "learning_rate": 3.6576239476145935e-05, "loss": 0.3498, "step": 812 }, { "epoch": 1.025236593059937, "grad_norm": 0.3483965066050902, "learning_rate": 3.655285313376988e-05, "loss": 0.3651, "step": 813 }, { "epoch": 1.0264984227129337, "grad_norm": 0.32228863562071147, "learning_rate": 3.652946679139383e-05, "loss": 0.352, "step": 814 }, { "epoch": 1.0277602523659306, "grad_norm": 0.3372114295634021, "learning_rate": 3.6506080449017774e-05, "loss": 0.3585, "step": 815 }, { "epoch": 1.0290220820189275, "grad_norm": 0.32346590086227295, "learning_rate": 3.6482694106641725e-05, "loss": 0.3454, "step": 816 }, { "epoch": 1.0302839116719242, "grad_norm": 0.35672023195436686, "learning_rate": 3.645930776426567e-05, "loss": 0.3441, "step": 817 }, { "epoch": 1.0315457413249212, "grad_norm": 0.26989862291147454, "learning_rate": 3.643592142188962e-05, "loss": 0.3273, "step": 818 }, { "epoch": 1.032807570977918, "grad_norm": 0.39991789665685157, "learning_rate": 3.6412535079513564e-05, "loss": 0.3659, "step": 819 }, { "epoch": 1.0340694006309148, "grad_norm": 0.2950459831252526, "learning_rate": 3.6389148737137515e-05, "loss": 0.3632, "step": 820 }, { "epoch": 1.0353312302839117, "grad_norm": 0.3276746994499684, "learning_rate": 3.636576239476146e-05, "loss": 0.3835, "step": 821 }, { "epoch": 1.0365930599369084, "grad_norm": 0.37393795590586715, "learning_rate": 3.634237605238541e-05, "loss": 0.3445, "step": 822 }, { "epoch": 1.0378548895899053, "grad_norm": 0.2965539328194294, "learning_rate": 3.6318989710009355e-05, "loss": 0.3592, "step": 823 }, { "epoch": 1.0391167192429023, "grad_norm": 0.40310433415090235, "learning_rate": 3.62956033676333e-05, "loss": 0.3768, "step": 824 }, { "epoch": 1.040378548895899, "grad_norm": 0.3163362558085887, "learning_rate": 3.627221702525725e-05, "loss": 0.3725, "step": 825 }, { "epoch": 1.041640378548896, "grad_norm": 0.35991544621108007, "learning_rate": 3.62488306828812e-05, "loss": 0.39, "step": 826 }, { "epoch": 1.0429022082018928, "grad_norm": 0.3073002192337909, "learning_rate": 3.622544434050515e-05, "loss": 0.3542, "step": 827 }, { "epoch": 1.0441640378548895, "grad_norm": 0.31783661439308364, "learning_rate": 3.6202057998129096e-05, "loss": 0.362, "step": 828 }, { "epoch": 1.0454258675078865, "grad_norm": 0.31260697240530433, "learning_rate": 3.617867165575305e-05, "loss": 0.3646, "step": 829 }, { "epoch": 1.0466876971608832, "grad_norm": 0.3041181143978463, "learning_rate": 3.615528531337699e-05, "loss": 0.3722, "step": 830 }, { "epoch": 1.04794952681388, "grad_norm": 0.31088030929554356, "learning_rate": 3.6131898971000935e-05, "loss": 0.3634, "step": 831 }, { "epoch": 1.049211356466877, "grad_norm": 0.30956730663168586, "learning_rate": 3.6108512628624886e-05, "loss": 0.3813, "step": 832 }, { "epoch": 1.0504731861198737, "grad_norm": 0.3523089176712155, "learning_rate": 3.608512628624883e-05, "loss": 0.3766, "step": 833 }, { "epoch": 1.0517350157728707, "grad_norm": 0.33569475132209836, "learning_rate": 3.606173994387278e-05, "loss": 0.3572, "step": 834 }, { "epoch": 1.0529968454258676, "grad_norm": 0.3566433870615443, "learning_rate": 3.6038353601496726e-05, "loss": 0.3524, "step": 835 }, { "epoch": 1.0542586750788643, "grad_norm": 0.33099117271829465, "learning_rate": 3.601496725912068e-05, "loss": 0.368, "step": 836 }, { "epoch": 1.0555205047318612, "grad_norm": 0.2964800547840401, "learning_rate": 3.599158091674462e-05, "loss": 0.3617, "step": 837 }, { "epoch": 1.0567823343848581, "grad_norm": 0.3916491875374125, "learning_rate": 3.596819457436857e-05, "loss": 0.3525, "step": 838 }, { "epoch": 1.0580441640378548, "grad_norm": 0.35954018853689146, "learning_rate": 3.5944808231992516e-05, "loss": 0.356, "step": 839 }, { "epoch": 1.0593059936908518, "grad_norm": 0.40304832800457435, "learning_rate": 3.592142188961646e-05, "loss": 0.371, "step": 840 }, { "epoch": 1.0605678233438485, "grad_norm": 0.4177682542972928, "learning_rate": 3.589803554724041e-05, "loss": 0.3547, "step": 841 }, { "epoch": 1.0618296529968454, "grad_norm": 0.3377114932611128, "learning_rate": 3.587464920486436e-05, "loss": 0.3436, "step": 842 }, { "epoch": 1.0630914826498423, "grad_norm": 0.3549597118908824, "learning_rate": 3.585126286248831e-05, "loss": 0.3696, "step": 843 }, { "epoch": 1.064353312302839, "grad_norm": 0.39901942863383516, "learning_rate": 3.582787652011226e-05, "loss": 0.3659, "step": 844 }, { "epoch": 1.065615141955836, "grad_norm": 0.30063072620528386, "learning_rate": 3.580449017773621e-05, "loss": 0.3699, "step": 845 }, { "epoch": 1.0668769716088329, "grad_norm": 0.38609449216432634, "learning_rate": 3.578110383536015e-05, "loss": 0.357, "step": 846 }, { "epoch": 1.0681388012618296, "grad_norm": 0.36593693498226726, "learning_rate": 3.57577174929841e-05, "loss": 0.3495, "step": 847 }, { "epoch": 1.0694006309148265, "grad_norm": 0.32965839327134283, "learning_rate": 3.573433115060805e-05, "loss": 0.3462, "step": 848 }, { "epoch": 1.0706624605678234, "grad_norm": 0.3647457633007007, "learning_rate": 3.571094480823199e-05, "loss": 0.3455, "step": 849 }, { "epoch": 1.0719242902208201, "grad_norm": 0.3450500415314435, "learning_rate": 3.568755846585594e-05, "loss": 0.3578, "step": 850 }, { "epoch": 1.073186119873817, "grad_norm": 0.4213766682573884, "learning_rate": 3.566417212347989e-05, "loss": 0.3745, "step": 851 }, { "epoch": 1.0744479495268138, "grad_norm": 0.33573570925790347, "learning_rate": 3.564078578110384e-05, "loss": 0.3449, "step": 852 }, { "epoch": 1.0757097791798107, "grad_norm": 0.30558998639975143, "learning_rate": 3.561739943872778e-05, "loss": 0.3742, "step": 853 }, { "epoch": 1.0769716088328076, "grad_norm": 0.40558321596780655, "learning_rate": 3.559401309635173e-05, "loss": 0.3769, "step": 854 }, { "epoch": 1.0782334384858043, "grad_norm": 0.38037173875232605, "learning_rate": 3.557062675397568e-05, "loss": 0.3911, "step": 855 }, { "epoch": 1.0794952681388013, "grad_norm": 0.46624945047641275, "learning_rate": 3.554724041159963e-05, "loss": 0.3337, "step": 856 }, { "epoch": 1.0807570977917982, "grad_norm": 0.4348056674452088, "learning_rate": 3.552385406922358e-05, "loss": 0.3677, "step": 857 }, { "epoch": 1.0820189274447949, "grad_norm": 0.359878782694853, "learning_rate": 3.5500467726847523e-05, "loss": 0.371, "step": 858 }, { "epoch": 1.0832807570977918, "grad_norm": 0.4797221661037856, "learning_rate": 3.5477081384471474e-05, "loss": 0.3666, "step": 859 }, { "epoch": 1.0845425867507887, "grad_norm": 0.3185015563702556, "learning_rate": 3.545369504209542e-05, "loss": 0.3621, "step": 860 }, { "epoch": 1.0858044164037854, "grad_norm": 0.33549514149994636, "learning_rate": 3.543030869971937e-05, "loss": 0.3728, "step": 861 }, { "epoch": 1.0870662460567824, "grad_norm": 0.38027798217848857, "learning_rate": 3.5406922357343314e-05, "loss": 0.3586, "step": 862 }, { "epoch": 1.088328075709779, "grad_norm": 0.2850769699638008, "learning_rate": 3.538353601496726e-05, "loss": 0.3794, "step": 863 }, { "epoch": 1.089589905362776, "grad_norm": 0.37638573014992627, "learning_rate": 3.536014967259121e-05, "loss": 0.3701, "step": 864 }, { "epoch": 1.090851735015773, "grad_norm": 0.3377880581768803, "learning_rate": 3.533676333021515e-05, "loss": 0.3727, "step": 865 }, { "epoch": 1.0921135646687696, "grad_norm": 0.3476326694325818, "learning_rate": 3.5313376987839104e-05, "loss": 0.3638, "step": 866 }, { "epoch": 1.0933753943217666, "grad_norm": 0.33673714581676384, "learning_rate": 3.528999064546305e-05, "loss": 0.3653, "step": 867 }, { "epoch": 1.0946372239747635, "grad_norm": 0.3014918956066604, "learning_rate": 3.5266604303087e-05, "loss": 0.3487, "step": 868 }, { "epoch": 1.0958990536277602, "grad_norm": 0.2953143941109084, "learning_rate": 3.5243217960710943e-05, "loss": 0.3567, "step": 869 }, { "epoch": 1.0971608832807571, "grad_norm": 0.3702853609881141, "learning_rate": 3.5219831618334894e-05, "loss": 0.3499, "step": 870 }, { "epoch": 1.098422712933754, "grad_norm": 0.34745017206541473, "learning_rate": 3.519644527595884e-05, "loss": 0.3726, "step": 871 }, { "epoch": 1.0996845425867507, "grad_norm": 0.3075634868412922, "learning_rate": 3.517305893358279e-05, "loss": 0.3725, "step": 872 }, { "epoch": 1.1009463722397477, "grad_norm": 0.36664289339052186, "learning_rate": 3.514967259120674e-05, "loss": 0.3615, "step": 873 }, { "epoch": 1.1022082018927444, "grad_norm": 0.3148022643555196, "learning_rate": 3.5126286248830685e-05, "loss": 0.3725, "step": 874 }, { "epoch": 1.1034700315457413, "grad_norm": 0.3524866662172614, "learning_rate": 3.5102899906454636e-05, "loss": 0.3826, "step": 875 }, { "epoch": 1.1047318611987382, "grad_norm": 0.3307724408871424, "learning_rate": 3.507951356407858e-05, "loss": 0.3537, "step": 876 }, { "epoch": 1.105993690851735, "grad_norm": 0.311024554807942, "learning_rate": 3.505612722170253e-05, "loss": 0.3733, "step": 877 }, { "epoch": 1.1072555205047319, "grad_norm": 0.32758980846561925, "learning_rate": 3.5032740879326475e-05, "loss": 0.3771, "step": 878 }, { "epoch": 1.1085173501577288, "grad_norm": 0.3655454595500149, "learning_rate": 3.5009354536950426e-05, "loss": 0.3662, "step": 879 }, { "epoch": 1.1097791798107255, "grad_norm": 0.30925101466177146, "learning_rate": 3.498596819457437e-05, "loss": 0.3578, "step": 880 }, { "epoch": 1.1110410094637224, "grad_norm": 0.30132915918476566, "learning_rate": 3.4962581852198314e-05, "loss": 0.3601, "step": 881 }, { "epoch": 1.1123028391167193, "grad_norm": 0.31319607424011453, "learning_rate": 3.4939195509822265e-05, "loss": 0.3722, "step": 882 }, { "epoch": 1.113564668769716, "grad_norm": 0.2942876358515799, "learning_rate": 3.491580916744621e-05, "loss": 0.3554, "step": 883 }, { "epoch": 1.114826498422713, "grad_norm": 0.33024042211059834, "learning_rate": 3.489242282507016e-05, "loss": 0.3647, "step": 884 }, { "epoch": 1.1160883280757097, "grad_norm": 0.32446374597289207, "learning_rate": 3.4869036482694105e-05, "loss": 0.3798, "step": 885 }, { "epoch": 1.1173501577287066, "grad_norm": 0.28201179343292415, "learning_rate": 3.4845650140318056e-05, "loss": 0.3631, "step": 886 }, { "epoch": 1.1186119873817035, "grad_norm": 0.3513399773434184, "learning_rate": 3.4822263797942e-05, "loss": 0.3721, "step": 887 }, { "epoch": 1.1198738170347002, "grad_norm": 0.32077857386739494, "learning_rate": 3.479887745556595e-05, "loss": 0.3668, "step": 888 }, { "epoch": 1.1211356466876972, "grad_norm": 0.3474210941557822, "learning_rate": 3.47754911131899e-05, "loss": 0.4062, "step": 889 }, { "epoch": 1.122397476340694, "grad_norm": 1.8541777908593127, "learning_rate": 3.4752104770813846e-05, "loss": 0.3594, "step": 890 }, { "epoch": 1.1236593059936908, "grad_norm": 0.3660864690049694, "learning_rate": 3.47287184284378e-05, "loss": 0.3652, "step": 891 }, { "epoch": 1.1249211356466877, "grad_norm": 0.29949471225300955, "learning_rate": 3.470533208606174e-05, "loss": 0.3385, "step": 892 }, { "epoch": 1.1261829652996846, "grad_norm": 0.42079574468596925, "learning_rate": 3.468194574368569e-05, "loss": 0.3629, "step": 893 }, { "epoch": 1.1274447949526814, "grad_norm": 0.34609017643869866, "learning_rate": 3.4658559401309636e-05, "loss": 0.3934, "step": 894 }, { "epoch": 1.1287066246056783, "grad_norm": 0.3631151241130398, "learning_rate": 3.463517305893359e-05, "loss": 0.373, "step": 895 }, { "epoch": 1.129968454258675, "grad_norm": 0.3805396993585003, "learning_rate": 3.461178671655753e-05, "loss": 0.367, "step": 896 }, { "epoch": 1.131230283911672, "grad_norm": 0.31981560816238774, "learning_rate": 3.4588400374181476e-05, "loss": 0.3644, "step": 897 }, { "epoch": 1.1324921135646688, "grad_norm": 0.29393444804024654, "learning_rate": 3.456501403180543e-05, "loss": 0.3663, "step": 898 }, { "epoch": 1.1337539432176655, "grad_norm": 0.35697755550117793, "learning_rate": 3.454162768942937e-05, "loss": 0.3728, "step": 899 }, { "epoch": 1.1350157728706625, "grad_norm": 0.3141776522756188, "learning_rate": 3.451824134705332e-05, "loss": 0.3616, "step": 900 }, { "epoch": 1.1362776025236594, "grad_norm": 0.3259765273826799, "learning_rate": 3.4494855004677266e-05, "loss": 0.3832, "step": 901 }, { "epoch": 1.137539432176656, "grad_norm": 0.3603640628351931, "learning_rate": 3.447146866230122e-05, "loss": 0.3606, "step": 902 }, { "epoch": 1.138801261829653, "grad_norm": 0.37316749211617806, "learning_rate": 3.444808231992517e-05, "loss": 0.3738, "step": 903 }, { "epoch": 1.14006309148265, "grad_norm": 0.32781155557707164, "learning_rate": 3.442469597754911e-05, "loss": 0.3467, "step": 904 }, { "epoch": 1.1413249211356467, "grad_norm": 0.3515867382191848, "learning_rate": 3.440130963517306e-05, "loss": 0.3811, "step": 905 }, { "epoch": 1.1425867507886436, "grad_norm": 0.31431035257475826, "learning_rate": 3.437792329279701e-05, "loss": 0.3606, "step": 906 }, { "epoch": 1.1438485804416403, "grad_norm": 0.3026431626052215, "learning_rate": 3.435453695042096e-05, "loss": 0.3639, "step": 907 }, { "epoch": 1.1451104100946372, "grad_norm": 0.3395087117055265, "learning_rate": 3.43311506080449e-05, "loss": 0.3692, "step": 908 }, { "epoch": 1.1463722397476341, "grad_norm": 0.32457285448880985, "learning_rate": 3.4307764265668853e-05, "loss": 0.3492, "step": 909 }, { "epoch": 1.1476340694006308, "grad_norm": 0.3510734709469971, "learning_rate": 3.42843779232928e-05, "loss": 0.346, "step": 910 }, { "epoch": 1.1488958990536278, "grad_norm": 0.32586993716138596, "learning_rate": 3.426099158091675e-05, "loss": 0.3683, "step": 911 }, { "epoch": 1.1501577287066247, "grad_norm": 0.41432328174261646, "learning_rate": 3.423760523854069e-05, "loss": 0.3661, "step": 912 }, { "epoch": 1.1514195583596214, "grad_norm": 0.32509654109048136, "learning_rate": 3.421421889616464e-05, "loss": 0.3621, "step": 913 }, { "epoch": 1.1526813880126183, "grad_norm": 0.38557785722618027, "learning_rate": 3.419083255378859e-05, "loss": 0.3597, "step": 914 }, { "epoch": 1.1539432176656153, "grad_norm": 0.51649569912852, "learning_rate": 3.416744621141253e-05, "loss": 0.371, "step": 915 }, { "epoch": 1.155205047318612, "grad_norm": 0.35315146361843824, "learning_rate": 3.414405986903648e-05, "loss": 0.3551, "step": 916 }, { "epoch": 1.1564668769716089, "grad_norm": 0.3092357179962581, "learning_rate": 3.412067352666043e-05, "loss": 0.3553, "step": 917 }, { "epoch": 1.1577287066246056, "grad_norm": 0.36623771009783723, "learning_rate": 3.4097287184284385e-05, "loss": 0.3643, "step": 918 }, { "epoch": 1.1589905362776025, "grad_norm": 0.3062812837662381, "learning_rate": 3.407390084190833e-05, "loss": 0.3473, "step": 919 }, { "epoch": 1.1602523659305994, "grad_norm": 0.30291380059978323, "learning_rate": 3.4050514499532273e-05, "loss": 0.3642, "step": 920 }, { "epoch": 1.1615141955835961, "grad_norm": 0.39575580285218726, "learning_rate": 3.4027128157156224e-05, "loss": 0.3597, "step": 921 }, { "epoch": 1.162776025236593, "grad_norm": 0.3357291932115059, "learning_rate": 3.400374181478017e-05, "loss": 0.3761, "step": 922 }, { "epoch": 1.16403785488959, "grad_norm": 0.33342478778270246, "learning_rate": 3.398035547240412e-05, "loss": 0.3803, "step": 923 }, { "epoch": 1.1652996845425867, "grad_norm": 0.35524562272923954, "learning_rate": 3.3956969130028064e-05, "loss": 0.3781, "step": 924 }, { "epoch": 1.1665615141955836, "grad_norm": 0.3368716355387689, "learning_rate": 3.3933582787652015e-05, "loss": 0.3564, "step": 925 }, { "epoch": 1.1678233438485806, "grad_norm": 0.48182267174463406, "learning_rate": 3.391019644527596e-05, "loss": 0.3535, "step": 926 }, { "epoch": 1.1690851735015773, "grad_norm": 0.3218384815833558, "learning_rate": 3.388681010289991e-05, "loss": 0.3733, "step": 927 }, { "epoch": 1.1703470031545742, "grad_norm": 0.37356555535203617, "learning_rate": 3.3863423760523854e-05, "loss": 0.3753, "step": 928 }, { "epoch": 1.1716088328075709, "grad_norm": 0.4409403277068033, "learning_rate": 3.3840037418147805e-05, "loss": 0.379, "step": 929 }, { "epoch": 1.1728706624605678, "grad_norm": 0.36166654083048466, "learning_rate": 3.381665107577175e-05, "loss": 0.3776, "step": 930 }, { "epoch": 1.1741324921135647, "grad_norm": 0.3871717693232225, "learning_rate": 3.3793264733395694e-05, "loss": 0.355, "step": 931 }, { "epoch": 1.1753943217665614, "grad_norm": 0.3896402539426354, "learning_rate": 3.3769878391019644e-05, "loss": 0.3598, "step": 932 }, { "epoch": 1.1766561514195584, "grad_norm": 0.2860478431318752, "learning_rate": 3.3746492048643595e-05, "loss": 0.3786, "step": 933 }, { "epoch": 1.1779179810725553, "grad_norm": 0.3734981449201439, "learning_rate": 3.3723105706267546e-05, "loss": 0.3693, "step": 934 }, { "epoch": 1.179179810725552, "grad_norm": 0.38516927146685104, "learning_rate": 3.369971936389149e-05, "loss": 0.3734, "step": 935 }, { "epoch": 1.180441640378549, "grad_norm": 0.3131325405962398, "learning_rate": 3.367633302151544e-05, "loss": 0.3846, "step": 936 }, { "epoch": 1.1817034700315459, "grad_norm": 0.4154102210076708, "learning_rate": 3.3652946679139386e-05, "loss": 0.3428, "step": 937 }, { "epoch": 1.1829652996845426, "grad_norm": 0.3540158055287355, "learning_rate": 3.362956033676333e-05, "loss": 0.3517, "step": 938 }, { "epoch": 1.1842271293375395, "grad_norm": 0.3614369198941091, "learning_rate": 3.360617399438728e-05, "loss": 0.3629, "step": 939 }, { "epoch": 1.1854889589905362, "grad_norm": 0.32517255742199475, "learning_rate": 3.3582787652011225e-05, "loss": 0.3544, "step": 940 }, { "epoch": 1.1867507886435331, "grad_norm": 0.31098449339627543, "learning_rate": 3.3559401309635176e-05, "loss": 0.3557, "step": 941 }, { "epoch": 1.18801261829653, "grad_norm": 0.3278737998096723, "learning_rate": 3.353601496725912e-05, "loss": 0.3792, "step": 942 }, { "epoch": 1.1892744479495267, "grad_norm": 0.4009548394131181, "learning_rate": 3.351262862488307e-05, "loss": 0.3651, "step": 943 }, { "epoch": 1.1905362776025237, "grad_norm": 0.3009497573165383, "learning_rate": 3.3489242282507015e-05, "loss": 0.3718, "step": 944 }, { "epoch": 1.1917981072555206, "grad_norm": 0.37489023584391284, "learning_rate": 3.3465855940130966e-05, "loss": 0.359, "step": 945 }, { "epoch": 1.1930599369085173, "grad_norm": 0.2973067777444671, "learning_rate": 3.344246959775491e-05, "loss": 0.3616, "step": 946 }, { "epoch": 1.1943217665615142, "grad_norm": 0.28643667668582207, "learning_rate": 3.3419083255378855e-05, "loss": 0.3566, "step": 947 }, { "epoch": 1.1955835962145112, "grad_norm": 0.33663919478505594, "learning_rate": 3.3395696913002806e-05, "loss": 0.3553, "step": 948 }, { "epoch": 1.1968454258675079, "grad_norm": 0.30797818584305064, "learning_rate": 3.337231057062676e-05, "loss": 0.3741, "step": 949 }, { "epoch": 1.1981072555205048, "grad_norm": 0.3555649990421895, "learning_rate": 3.334892422825071e-05, "loss": 0.3703, "step": 950 }, { "epoch": 1.1993690851735015, "grad_norm": 2.187694593063976, "learning_rate": 3.332553788587465e-05, "loss": 0.3899, "step": 951 }, { "epoch": 1.2006309148264984, "grad_norm": 0.3590742565484184, "learning_rate": 3.33021515434986e-05, "loss": 0.3657, "step": 952 }, { "epoch": 1.2018927444794953, "grad_norm": 0.3375611690029136, "learning_rate": 3.327876520112255e-05, "loss": 0.3527, "step": 953 }, { "epoch": 1.203154574132492, "grad_norm": 0.3321901078859852, "learning_rate": 3.325537885874649e-05, "loss": 0.356, "step": 954 }, { "epoch": 1.204416403785489, "grad_norm": 0.3087459902683394, "learning_rate": 3.323199251637044e-05, "loss": 0.3774, "step": 955 }, { "epoch": 1.2056782334384857, "grad_norm": 0.35540170210860733, "learning_rate": 3.3208606173994386e-05, "loss": 0.3633, "step": 956 }, { "epoch": 1.2069400630914826, "grad_norm": 0.3358444758839276, "learning_rate": 3.318521983161834e-05, "loss": 0.3527, "step": 957 }, { "epoch": 1.2082018927444795, "grad_norm": 0.2825674413995706, "learning_rate": 3.316183348924228e-05, "loss": 0.3703, "step": 958 }, { "epoch": 1.2094637223974765, "grad_norm": 0.2912339037684726, "learning_rate": 3.313844714686623e-05, "loss": 0.3665, "step": 959 }, { "epoch": 1.2107255520504732, "grad_norm": 0.4199431997997769, "learning_rate": 3.311506080449018e-05, "loss": 0.3597, "step": 960 }, { "epoch": 1.21198738170347, "grad_norm": 0.2745504185896179, "learning_rate": 3.309167446211413e-05, "loss": 0.3625, "step": 961 }, { "epoch": 1.2132492113564668, "grad_norm": 0.39448444462662174, "learning_rate": 3.306828811973807e-05, "loss": 0.3641, "step": 962 }, { "epoch": 1.2145110410094637, "grad_norm": 0.34923963110873574, "learning_rate": 3.304490177736202e-05, "loss": 0.368, "step": 963 }, { "epoch": 1.2157728706624606, "grad_norm": 0.30256729584442693, "learning_rate": 3.3021515434985974e-05, "loss": 0.3573, "step": 964 }, { "epoch": 1.2170347003154574, "grad_norm": 0.4044926742164532, "learning_rate": 3.299812909260992e-05, "loss": 0.3708, "step": 965 }, { "epoch": 1.2182965299684543, "grad_norm": 0.27387001263817096, "learning_rate": 3.297474275023387e-05, "loss": 0.3606, "step": 966 }, { "epoch": 1.219558359621451, "grad_norm": 0.33551827337368906, "learning_rate": 3.295135640785781e-05, "loss": 0.3603, "step": 967 }, { "epoch": 1.220820189274448, "grad_norm": 0.32901614085613085, "learning_rate": 3.2927970065481764e-05, "loss": 0.3593, "step": 968 }, { "epoch": 1.2220820189274448, "grad_norm": 0.3011604487945531, "learning_rate": 3.290458372310571e-05, "loss": 0.3696, "step": 969 }, { "epoch": 1.2233438485804418, "grad_norm": 0.3233575773121538, "learning_rate": 3.288119738072965e-05, "loss": 0.3791, "step": 970 }, { "epoch": 1.2246056782334385, "grad_norm": 0.2992972589429785, "learning_rate": 3.2857811038353604e-05, "loss": 0.3332, "step": 971 }, { "epoch": 1.2258675078864354, "grad_norm": 0.3019562546172355, "learning_rate": 3.283442469597755e-05, "loss": 0.383, "step": 972 }, { "epoch": 1.227129337539432, "grad_norm": 0.29590186408894975, "learning_rate": 3.28110383536015e-05, "loss": 0.3712, "step": 973 }, { "epoch": 1.228391167192429, "grad_norm": 0.2836804161250389, "learning_rate": 3.278765201122544e-05, "loss": 0.3739, "step": 974 }, { "epoch": 1.229652996845426, "grad_norm": 0.34967239085028046, "learning_rate": 3.2764265668849394e-05, "loss": 0.3617, "step": 975 }, { "epoch": 1.2309148264984227, "grad_norm": 0.2919250742451188, "learning_rate": 3.274087932647334e-05, "loss": 0.372, "step": 976 }, { "epoch": 1.2321766561514196, "grad_norm": 0.30251553229032746, "learning_rate": 3.271749298409729e-05, "loss": 0.3662, "step": 977 }, { "epoch": 1.2334384858044163, "grad_norm": 0.34447085647905135, "learning_rate": 3.269410664172123e-05, "loss": 0.379, "step": 978 }, { "epoch": 1.2347003154574132, "grad_norm": 0.31373916129511714, "learning_rate": 3.2670720299345184e-05, "loss": 0.3583, "step": 979 }, { "epoch": 1.2359621451104101, "grad_norm": 0.28533479483459706, "learning_rate": 3.2647333956969135e-05, "loss": 0.3747, "step": 980 }, { "epoch": 1.237223974763407, "grad_norm": 0.3029238637474579, "learning_rate": 3.262394761459308e-05, "loss": 0.3651, "step": 981 }, { "epoch": 1.2384858044164038, "grad_norm": 0.295207223531178, "learning_rate": 3.260056127221703e-05, "loss": 0.3622, "step": 982 }, { "epoch": 1.2397476340694007, "grad_norm": 0.2932283364359268, "learning_rate": 3.2577174929840975e-05, "loss": 0.3625, "step": 983 }, { "epoch": 1.2410094637223974, "grad_norm": 0.2960186230212048, "learning_rate": 3.2553788587464926e-05, "loss": 0.3368, "step": 984 }, { "epoch": 1.2422712933753943, "grad_norm": 0.3002091104021548, "learning_rate": 3.253040224508887e-05, "loss": 0.3756, "step": 985 }, { "epoch": 1.2435331230283913, "grad_norm": 0.32731024812764037, "learning_rate": 3.250701590271282e-05, "loss": 0.3525, "step": 986 }, { "epoch": 1.244794952681388, "grad_norm": 0.2751504553438174, "learning_rate": 3.2483629560336765e-05, "loss": 0.3558, "step": 987 }, { "epoch": 1.2460567823343849, "grad_norm": 0.29952846939631317, "learning_rate": 3.246024321796071e-05, "loss": 0.3611, "step": 988 }, { "epoch": 1.2473186119873816, "grad_norm": 0.2881567885225936, "learning_rate": 3.243685687558466e-05, "loss": 0.3841, "step": 989 }, { "epoch": 1.2485804416403785, "grad_norm": 0.2892468291920591, "learning_rate": 3.2413470533208604e-05, "loss": 0.3645, "step": 990 }, { "epoch": 1.2498422712933754, "grad_norm": 0.2598506610406127, "learning_rate": 3.2390084190832555e-05, "loss": 0.3552, "step": 991 }, { "epoch": 1.2511041009463724, "grad_norm": 0.3138395821055879, "learning_rate": 3.23666978484565e-05, "loss": 0.3783, "step": 992 }, { "epoch": 1.252365930599369, "grad_norm": 0.27053739547939387, "learning_rate": 3.234331150608045e-05, "loss": 0.3638, "step": 993 }, { "epoch": 1.253627760252366, "grad_norm": 0.2783052427913572, "learning_rate": 3.23199251637044e-05, "loss": 0.3518, "step": 994 }, { "epoch": 1.2548895899053627, "grad_norm": 0.28430374566620353, "learning_rate": 3.2296538821328346e-05, "loss": 0.3658, "step": 995 }, { "epoch": 1.2561514195583596, "grad_norm": 0.2745527500471857, "learning_rate": 3.2273152478952296e-05, "loss": 0.3562, "step": 996 }, { "epoch": 1.2574132492113566, "grad_norm": 0.29963277243865377, "learning_rate": 3.224976613657624e-05, "loss": 0.3903, "step": 997 }, { "epoch": 1.2586750788643533, "grad_norm": 0.2899191548159274, "learning_rate": 3.222637979420019e-05, "loss": 0.3741, "step": 998 }, { "epoch": 1.2599369085173502, "grad_norm": 0.3461836854805556, "learning_rate": 3.2202993451824136e-05, "loss": 0.3648, "step": 999 }, { "epoch": 1.261198738170347, "grad_norm": 0.3056635512447289, "learning_rate": 3.217960710944809e-05, "loss": 0.3764, "step": 1000 }, { "epoch": 1.2624605678233438, "grad_norm": 0.3137616392189068, "learning_rate": 3.215622076707203e-05, "loss": 0.3716, "step": 1001 }, { "epoch": 1.2637223974763407, "grad_norm": 0.2668763229659044, "learning_rate": 3.213283442469598e-05, "loss": 0.3457, "step": 1002 }, { "epoch": 1.2649842271293377, "grad_norm": 0.30734263471387696, "learning_rate": 3.2109448082319926e-05, "loss": 0.3513, "step": 1003 }, { "epoch": 1.2662460567823344, "grad_norm": 0.2918996196571878, "learning_rate": 3.208606173994387e-05, "loss": 0.369, "step": 1004 }, { "epoch": 1.2675078864353313, "grad_norm": 0.31291876575393346, "learning_rate": 3.206267539756782e-05, "loss": 0.3544, "step": 1005 }, { "epoch": 1.268769716088328, "grad_norm": 0.3626213014744176, "learning_rate": 3.2039289055191766e-05, "loss": 0.3684, "step": 1006 }, { "epoch": 1.270031545741325, "grad_norm": 0.3245441955977353, "learning_rate": 3.2015902712815717e-05, "loss": 0.3654, "step": 1007 }, { "epoch": 1.2712933753943219, "grad_norm": 0.3055459298530345, "learning_rate": 3.199251637043966e-05, "loss": 0.3621, "step": 1008 }, { "epoch": 1.2725552050473186, "grad_norm": 0.2908782172460048, "learning_rate": 3.196913002806361e-05, "loss": 0.3711, "step": 1009 }, { "epoch": 1.2738170347003155, "grad_norm": 0.2946947732536184, "learning_rate": 3.194574368568756e-05, "loss": 0.3458, "step": 1010 }, { "epoch": 1.2750788643533122, "grad_norm": 0.30720037561311025, "learning_rate": 3.192235734331151e-05, "loss": 0.3638, "step": 1011 }, { "epoch": 1.2763406940063091, "grad_norm": 0.29900085057598336, "learning_rate": 3.189897100093546e-05, "loss": 0.364, "step": 1012 }, { "epoch": 1.277602523659306, "grad_norm": 0.2958676347612124, "learning_rate": 3.18755846585594e-05, "loss": 0.3634, "step": 1013 }, { "epoch": 1.278864353312303, "grad_norm": 0.3340883920303775, "learning_rate": 3.185219831618335e-05, "loss": 0.3678, "step": 1014 }, { "epoch": 1.2801261829652997, "grad_norm": 0.28337964076425926, "learning_rate": 3.18288119738073e-05, "loss": 0.3558, "step": 1015 }, { "epoch": 1.2813880126182966, "grad_norm": 0.3355611834274182, "learning_rate": 3.180542563143125e-05, "loss": 0.3611, "step": 1016 }, { "epoch": 1.2826498422712933, "grad_norm": 0.3208150914547165, "learning_rate": 3.178203928905519e-05, "loss": 0.3504, "step": 1017 }, { "epoch": 1.2839116719242902, "grad_norm": 0.34191775016164594, "learning_rate": 3.175865294667914e-05, "loss": 0.3447, "step": 1018 }, { "epoch": 1.2851735015772872, "grad_norm": 0.2936928075638436, "learning_rate": 3.173526660430309e-05, "loss": 0.3642, "step": 1019 }, { "epoch": 1.2864353312302839, "grad_norm": 0.3535746394079441, "learning_rate": 3.171188026192703e-05, "loss": 0.3726, "step": 1020 }, { "epoch": 1.2876971608832808, "grad_norm": 0.3247099025443107, "learning_rate": 3.168849391955098e-05, "loss": 0.3635, "step": 1021 }, { "epoch": 1.2889589905362775, "grad_norm": 0.3310295471867825, "learning_rate": 3.166510757717493e-05, "loss": 0.3571, "step": 1022 }, { "epoch": 1.2902208201892744, "grad_norm": 0.3065539287460266, "learning_rate": 3.164172123479888e-05, "loss": 0.3715, "step": 1023 }, { "epoch": 1.2914826498422713, "grad_norm": 0.3214671536839739, "learning_rate": 3.161833489242282e-05, "loss": 0.3561, "step": 1024 }, { "epoch": 1.2927444794952683, "grad_norm": 0.33620104359447295, "learning_rate": 3.159494855004678e-05, "loss": 0.3774, "step": 1025 }, { "epoch": 1.294006309148265, "grad_norm": 0.30679728920023364, "learning_rate": 3.1571562207670724e-05, "loss": 0.3675, "step": 1026 }, { "epoch": 1.295268138801262, "grad_norm": 0.3212312029208238, "learning_rate": 3.154817586529467e-05, "loss": 0.3665, "step": 1027 }, { "epoch": 1.2965299684542586, "grad_norm": 0.27775534879583463, "learning_rate": 3.152478952291862e-05, "loss": 0.3696, "step": 1028 }, { "epoch": 1.2977917981072555, "grad_norm": 0.3210280873238131, "learning_rate": 3.150140318054256e-05, "loss": 0.3523, "step": 1029 }, { "epoch": 1.2990536277602525, "grad_norm": 0.2862465122420245, "learning_rate": 3.1478016838166514e-05, "loss": 0.3621, "step": 1030 }, { "epoch": 1.3003154574132492, "grad_norm": 0.2983705510033839, "learning_rate": 3.145463049579046e-05, "loss": 0.357, "step": 1031 }, { "epoch": 1.301577287066246, "grad_norm": 0.293685146179101, "learning_rate": 3.143124415341441e-05, "loss": 0.3711, "step": 1032 }, { "epoch": 1.3028391167192428, "grad_norm": 0.30229778873880386, "learning_rate": 3.1407857811038354e-05, "loss": 0.3689, "step": 1033 }, { "epoch": 1.3041009463722397, "grad_norm": 0.2874439169729176, "learning_rate": 3.1384471468662305e-05, "loss": 0.3675, "step": 1034 }, { "epoch": 1.3053627760252366, "grad_norm": 0.2925566047869318, "learning_rate": 3.136108512628625e-05, "loss": 0.3502, "step": 1035 }, { "epoch": 1.3066246056782336, "grad_norm": 0.3111465295183572, "learning_rate": 3.13376987839102e-05, "loss": 0.3571, "step": 1036 }, { "epoch": 1.3078864353312303, "grad_norm": 0.37080858286239354, "learning_rate": 3.1314312441534144e-05, "loss": 0.3693, "step": 1037 }, { "epoch": 1.3091482649842272, "grad_norm": 0.3018782416377932, "learning_rate": 3.129092609915809e-05, "loss": 0.3583, "step": 1038 }, { "epoch": 1.310410094637224, "grad_norm": 0.32819453622454814, "learning_rate": 3.126753975678204e-05, "loss": 0.3596, "step": 1039 }, { "epoch": 1.3116719242902208, "grad_norm": 0.3325258111243139, "learning_rate": 3.124415341440599e-05, "loss": 0.3652, "step": 1040 }, { "epoch": 1.3129337539432178, "grad_norm": 0.28681673412446024, "learning_rate": 3.122076707202994e-05, "loss": 0.3736, "step": 1041 }, { "epoch": 1.3141955835962145, "grad_norm": 0.31667018226365473, "learning_rate": 3.1197380729653885e-05, "loss": 0.3553, "step": 1042 }, { "epoch": 1.3154574132492114, "grad_norm": 0.27550312072676847, "learning_rate": 3.1173994387277836e-05, "loss": 0.3488, "step": 1043 }, { "epoch": 1.316719242902208, "grad_norm": 0.2711597105883976, "learning_rate": 3.115060804490178e-05, "loss": 0.358, "step": 1044 }, { "epoch": 1.317981072555205, "grad_norm": 0.2906777007068102, "learning_rate": 3.1127221702525725e-05, "loss": 0.374, "step": 1045 }, { "epoch": 1.319242902208202, "grad_norm": 0.30947810173080686, "learning_rate": 3.1103835360149676e-05, "loss": 0.3596, "step": 1046 }, { "epoch": 1.3205047318611987, "grad_norm": 0.29948942445264787, "learning_rate": 3.108044901777362e-05, "loss": 0.3472, "step": 1047 }, { "epoch": 1.3217665615141956, "grad_norm": 0.31246631295073674, "learning_rate": 3.105706267539757e-05, "loss": 0.3819, "step": 1048 }, { "epoch": 1.3230283911671925, "grad_norm": 0.2920220667629263, "learning_rate": 3.1033676333021515e-05, "loss": 0.3663, "step": 1049 }, { "epoch": 1.3242902208201892, "grad_norm": 0.3430978007003764, "learning_rate": 3.1010289990645466e-05, "loss": 0.357, "step": 1050 }, { "epoch": 1.3255520504731861, "grad_norm": 0.3171877069932031, "learning_rate": 3.098690364826941e-05, "loss": 0.3457, "step": 1051 }, { "epoch": 1.326813880126183, "grad_norm": 0.2798714452671392, "learning_rate": 3.096351730589336e-05, "loss": 0.3481, "step": 1052 }, { "epoch": 1.3280757097791798, "grad_norm": 0.33598029535147134, "learning_rate": 3.0940130963517305e-05, "loss": 0.3552, "step": 1053 }, { "epoch": 1.3293375394321767, "grad_norm": 0.2991632915771032, "learning_rate": 3.091674462114125e-05, "loss": 0.3832, "step": 1054 }, { "epoch": 1.3305993690851734, "grad_norm": 0.3291406665520808, "learning_rate": 3.08933582787652e-05, "loss": 0.363, "step": 1055 }, { "epoch": 1.3318611987381703, "grad_norm": 0.3608002491608247, "learning_rate": 3.086997193638915e-05, "loss": 0.3654, "step": 1056 }, { "epoch": 1.3331230283911673, "grad_norm": 0.30973747585672445, "learning_rate": 3.08465855940131e-05, "loss": 0.358, "step": 1057 }, { "epoch": 1.334384858044164, "grad_norm": 0.30346787316852775, "learning_rate": 3.0823199251637047e-05, "loss": 0.3648, "step": 1058 }, { "epoch": 1.3356466876971609, "grad_norm": 0.34795143781043075, "learning_rate": 3.0799812909261e-05, "loss": 0.3413, "step": 1059 }, { "epoch": 1.3369085173501578, "grad_norm": 0.2804804015281431, "learning_rate": 3.077642656688494e-05, "loss": 0.3484, "step": 1060 }, { "epoch": 1.3381703470031545, "grad_norm": 0.31966401978588904, "learning_rate": 3.0753040224508886e-05, "loss": 0.3656, "step": 1061 }, { "epoch": 1.3394321766561514, "grad_norm": 0.2640536950320693, "learning_rate": 3.072965388213284e-05, "loss": 0.3816, "step": 1062 }, { "epoch": 1.3406940063091484, "grad_norm": 0.32916676920488763, "learning_rate": 3.070626753975678e-05, "loss": 0.3602, "step": 1063 }, { "epoch": 1.341955835962145, "grad_norm": 0.28741083544429963, "learning_rate": 3.068288119738073e-05, "loss": 0.351, "step": 1064 }, { "epoch": 1.343217665615142, "grad_norm": 0.3286721643367131, "learning_rate": 3.0659494855004676e-05, "loss": 0.3592, "step": 1065 }, { "epoch": 1.3444794952681387, "grad_norm": 0.29273998729373424, "learning_rate": 3.063610851262863e-05, "loss": 0.3488, "step": 1066 }, { "epoch": 1.3457413249211356, "grad_norm": 0.38085086391210654, "learning_rate": 3.061272217025257e-05, "loss": 0.363, "step": 1067 }, { "epoch": 1.3470031545741326, "grad_norm": 0.2767156793932315, "learning_rate": 3.058933582787652e-05, "loss": 0.3611, "step": 1068 }, { "epoch": 1.3482649842271293, "grad_norm": 0.3029475027439941, "learning_rate": 3.0565949485500467e-05, "loss": 0.3706, "step": 1069 }, { "epoch": 1.3495268138801262, "grad_norm": 0.32382083369735265, "learning_rate": 3.054256314312441e-05, "loss": 0.3695, "step": 1070 }, { "epoch": 1.350788643533123, "grad_norm": 0.30697548723718276, "learning_rate": 3.051917680074837e-05, "loss": 0.3706, "step": 1071 }, { "epoch": 1.3520504731861198, "grad_norm": 0.30376449471403766, "learning_rate": 3.0495790458372313e-05, "loss": 0.3464, "step": 1072 }, { "epoch": 1.3533123028391167, "grad_norm": 0.29088055683504327, "learning_rate": 3.047240411599626e-05, "loss": 0.3756, "step": 1073 }, { "epoch": 1.3545741324921137, "grad_norm": 0.29052682059234775, "learning_rate": 3.0449017773620208e-05, "loss": 0.3491, "step": 1074 }, { "epoch": 1.3558359621451104, "grad_norm": 0.2795790036267974, "learning_rate": 3.0425631431244155e-05, "loss": 0.3636, "step": 1075 }, { "epoch": 1.3570977917981073, "grad_norm": 0.32083341793404613, "learning_rate": 3.0402245088868103e-05, "loss": 0.3363, "step": 1076 }, { "epoch": 1.358359621451104, "grad_norm": 0.3181030328757953, "learning_rate": 3.037885874649205e-05, "loss": 0.3769, "step": 1077 }, { "epoch": 1.359621451104101, "grad_norm": 0.2883480754635085, "learning_rate": 3.0355472404115998e-05, "loss": 0.3512, "step": 1078 }, { "epoch": 1.3608832807570979, "grad_norm": 0.3093349451033097, "learning_rate": 3.0332086061739946e-05, "loss": 0.3556, "step": 1079 }, { "epoch": 1.3621451104100946, "grad_norm": 0.30944798150574215, "learning_rate": 3.0308699719363893e-05, "loss": 0.3654, "step": 1080 }, { "epoch": 1.3634069400630915, "grad_norm": 0.3157450203975307, "learning_rate": 3.028531337698784e-05, "loss": 0.3646, "step": 1081 }, { "epoch": 1.3646687697160882, "grad_norm": 0.28614855297206776, "learning_rate": 3.0261927034611785e-05, "loss": 0.3613, "step": 1082 }, { "epoch": 1.3659305993690851, "grad_norm": 0.2627217920547506, "learning_rate": 3.0238540692235733e-05, "loss": 0.3591, "step": 1083 }, { "epoch": 1.367192429022082, "grad_norm": 0.32563577280372175, "learning_rate": 3.021515434985968e-05, "loss": 0.3429, "step": 1084 }, { "epoch": 1.368454258675079, "grad_norm": 0.27351928131832975, "learning_rate": 3.0191768007483628e-05, "loss": 0.3598, "step": 1085 }, { "epoch": 1.3697160883280757, "grad_norm": 0.30306401152478124, "learning_rate": 3.0168381665107582e-05, "loss": 0.3489, "step": 1086 }, { "epoch": 1.3709779179810726, "grad_norm": 0.28646240771229237, "learning_rate": 3.014499532273153e-05, "loss": 0.3576, "step": 1087 }, { "epoch": 1.3722397476340693, "grad_norm": 0.3364414929425308, "learning_rate": 3.0121608980355477e-05, "loss": 0.3787, "step": 1088 }, { "epoch": 1.3735015772870662, "grad_norm": 0.2884372279085607, "learning_rate": 3.009822263797942e-05, "loss": 0.3414, "step": 1089 }, { "epoch": 1.3747634069400632, "grad_norm": 0.3497884893533693, "learning_rate": 3.007483629560337e-05, "loss": 0.3713, "step": 1090 }, { "epoch": 1.3760252365930599, "grad_norm": 0.2970839032954431, "learning_rate": 3.0051449953227317e-05, "loss": 0.3809, "step": 1091 }, { "epoch": 1.3772870662460568, "grad_norm": 0.3241920872168854, "learning_rate": 3.0028063610851264e-05, "loss": 0.3563, "step": 1092 }, { "epoch": 1.3785488958990535, "grad_norm": 0.3316185399076849, "learning_rate": 3.0004677268475212e-05, "loss": 0.3619, "step": 1093 }, { "epoch": 1.3798107255520504, "grad_norm": 0.29570044478226803, "learning_rate": 2.998129092609916e-05, "loss": 0.3472, "step": 1094 }, { "epoch": 1.3810725552050473, "grad_norm": 0.3090174348345738, "learning_rate": 2.9957904583723107e-05, "loss": 0.3731, "step": 1095 }, { "epoch": 1.3823343848580443, "grad_norm": 0.3426377122726833, "learning_rate": 2.9934518241347055e-05, "loss": 0.3448, "step": 1096 }, { "epoch": 1.383596214511041, "grad_norm": 0.29167307300140566, "learning_rate": 2.9911131898971002e-05, "loss": 0.3448, "step": 1097 }, { "epoch": 1.384858044164038, "grad_norm": 0.3161444628217017, "learning_rate": 2.988774555659495e-05, "loss": 0.358, "step": 1098 }, { "epoch": 1.3861198738170346, "grad_norm": 0.3239279044485258, "learning_rate": 2.9864359214218894e-05, "loss": 0.376, "step": 1099 }, { "epoch": 1.3873817034700315, "grad_norm": 0.3165771225629975, "learning_rate": 2.984097287184284e-05, "loss": 0.3573, "step": 1100 }, { "epoch": 1.3886435331230285, "grad_norm": 0.32554982020712286, "learning_rate": 2.9817586529466796e-05, "loss": 0.3432, "step": 1101 }, { "epoch": 1.3899053627760252, "grad_norm": 0.39179324631706614, "learning_rate": 2.9794200187090744e-05, "loss": 0.3588, "step": 1102 }, { "epoch": 1.391167192429022, "grad_norm": 0.2872996037634169, "learning_rate": 2.977081384471469e-05, "loss": 0.3664, "step": 1103 }, { "epoch": 1.3924290220820188, "grad_norm": 0.307491585046153, "learning_rate": 2.974742750233864e-05, "loss": 0.3665, "step": 1104 }, { "epoch": 1.3936908517350157, "grad_norm": 0.3058922904036904, "learning_rate": 2.9724041159962583e-05, "loss": 0.3758, "step": 1105 }, { "epoch": 1.3949526813880126, "grad_norm": 0.31350873108470506, "learning_rate": 2.970065481758653e-05, "loss": 0.3786, "step": 1106 }, { "epoch": 1.3962145110410096, "grad_norm": 0.34221872092575095, "learning_rate": 2.9677268475210478e-05, "loss": 0.3586, "step": 1107 }, { "epoch": 1.3974763406940063, "grad_norm": 0.32982127494181596, "learning_rate": 2.9653882132834426e-05, "loss": 0.3838, "step": 1108 }, { "epoch": 1.3987381703470032, "grad_norm": 0.3505821733520848, "learning_rate": 2.9630495790458373e-05, "loss": 0.3718, "step": 1109 }, { "epoch": 1.4, "grad_norm": 0.3546058260366438, "learning_rate": 2.960710944808232e-05, "loss": 0.3887, "step": 1110 }, { "epoch": 1.4012618296529968, "grad_norm": 0.3685249557042249, "learning_rate": 2.958372310570627e-05, "loss": 0.3608, "step": 1111 }, { "epoch": 1.4025236593059938, "grad_norm": 0.3061506523440452, "learning_rate": 2.9560336763330216e-05, "loss": 0.3649, "step": 1112 }, { "epoch": 1.4037854889589905, "grad_norm": 0.3514946936177404, "learning_rate": 2.9536950420954164e-05, "loss": 0.365, "step": 1113 }, { "epoch": 1.4050473186119874, "grad_norm": 0.3296206538489641, "learning_rate": 2.951356407857811e-05, "loss": 0.3809, "step": 1114 }, { "epoch": 1.406309148264984, "grad_norm": 0.4191526656545158, "learning_rate": 2.949017773620206e-05, "loss": 0.3542, "step": 1115 }, { "epoch": 1.407570977917981, "grad_norm": 0.3060496798915595, "learning_rate": 2.9466791393826003e-05, "loss": 0.3639, "step": 1116 }, { "epoch": 1.408832807570978, "grad_norm": 0.41191325138455026, "learning_rate": 2.9443405051449957e-05, "loss": 0.3496, "step": 1117 }, { "epoch": 1.4100946372239749, "grad_norm": 0.2863389040725298, "learning_rate": 2.9420018709073905e-05, "loss": 0.3656, "step": 1118 }, { "epoch": 1.4113564668769716, "grad_norm": 0.45747154454209393, "learning_rate": 2.9396632366697852e-05, "loss": 0.3616, "step": 1119 }, { "epoch": 1.4126182965299685, "grad_norm": 0.3169266118783908, "learning_rate": 2.93732460243218e-05, "loss": 0.3523, "step": 1120 }, { "epoch": 1.4138801261829652, "grad_norm": 0.3568476511813498, "learning_rate": 2.9349859681945748e-05, "loss": 0.3601, "step": 1121 }, { "epoch": 1.4151419558359621, "grad_norm": 0.32685730038928634, "learning_rate": 2.9326473339569692e-05, "loss": 0.3583, "step": 1122 }, { "epoch": 1.416403785488959, "grad_norm": 0.3548086790715519, "learning_rate": 2.930308699719364e-05, "loss": 0.3629, "step": 1123 }, { "epoch": 1.4176656151419558, "grad_norm": 0.29676157406655745, "learning_rate": 2.9279700654817587e-05, "loss": 0.3546, "step": 1124 }, { "epoch": 1.4189274447949527, "grad_norm": 0.4035680203890022, "learning_rate": 2.9256314312441535e-05, "loss": 0.3763, "step": 1125 }, { "epoch": 1.4201892744479494, "grad_norm": 0.3140424536933844, "learning_rate": 2.9232927970065482e-05, "loss": 0.3439, "step": 1126 }, { "epoch": 1.4214511041009463, "grad_norm": 0.33193614061209314, "learning_rate": 2.920954162768943e-05, "loss": 0.3784, "step": 1127 }, { "epoch": 1.4227129337539433, "grad_norm": 0.3853755652266198, "learning_rate": 2.9186155285313377e-05, "loss": 0.3709, "step": 1128 }, { "epoch": 1.4239747634069402, "grad_norm": 0.31336876760551424, "learning_rate": 2.9162768942937325e-05, "loss": 0.3701, "step": 1129 }, { "epoch": 1.4252365930599369, "grad_norm": 0.37143092317169596, "learning_rate": 2.9139382600561272e-05, "loss": 0.3832, "step": 1130 }, { "epoch": 1.4264984227129338, "grad_norm": 0.45699732867783094, "learning_rate": 2.911599625818522e-05, "loss": 0.3761, "step": 1131 }, { "epoch": 1.4277602523659305, "grad_norm": 0.29002911419691846, "learning_rate": 2.909260991580917e-05, "loss": 0.3487, "step": 1132 }, { "epoch": 1.4290220820189274, "grad_norm": 0.3432780777040525, "learning_rate": 2.906922357343312e-05, "loss": 0.3548, "step": 1133 }, { "epoch": 1.4302839116719244, "grad_norm": 0.32733234984737347, "learning_rate": 2.9045837231057066e-05, "loss": 0.3706, "step": 1134 }, { "epoch": 1.431545741324921, "grad_norm": 0.30657819815272996, "learning_rate": 2.9022450888681014e-05, "loss": 0.3595, "step": 1135 }, { "epoch": 1.432807570977918, "grad_norm": 0.3287784907096455, "learning_rate": 2.899906454630496e-05, "loss": 0.3755, "step": 1136 }, { "epoch": 1.4340694006309147, "grad_norm": 0.3130239643345491, "learning_rate": 2.897567820392891e-05, "loss": 0.3684, "step": 1137 }, { "epoch": 1.4353312302839116, "grad_norm": 0.31021585496890947, "learning_rate": 2.8952291861552856e-05, "loss": 0.3616, "step": 1138 }, { "epoch": 1.4365930599369086, "grad_norm": 0.32126481212197633, "learning_rate": 2.89289055191768e-05, "loss": 0.3655, "step": 1139 }, { "epoch": 1.4378548895899055, "grad_norm": 0.3262959611439638, "learning_rate": 2.8905519176800748e-05, "loss": 0.3839, "step": 1140 }, { "epoch": 1.4391167192429022, "grad_norm": 0.28010481784348706, "learning_rate": 2.8882132834424696e-05, "loss": 0.3698, "step": 1141 }, { "epoch": 1.4403785488958991, "grad_norm": 0.32752161151766396, "learning_rate": 2.8858746492048643e-05, "loss": 0.3738, "step": 1142 }, { "epoch": 1.4416403785488958, "grad_norm": 0.289375819794751, "learning_rate": 2.883536014967259e-05, "loss": 0.356, "step": 1143 }, { "epoch": 1.4429022082018927, "grad_norm": 0.517529595107631, "learning_rate": 2.881197380729654e-05, "loss": 0.3744, "step": 1144 }, { "epoch": 1.4441640378548897, "grad_norm": 0.2838779792784436, "learning_rate": 2.8788587464920486e-05, "loss": 0.372, "step": 1145 }, { "epoch": 1.4454258675078864, "grad_norm": 0.37723696346276236, "learning_rate": 2.8765201122544434e-05, "loss": 0.3661, "step": 1146 }, { "epoch": 1.4466876971608833, "grad_norm": 0.28364029194676593, "learning_rate": 2.8741814780168385e-05, "loss": 0.3619, "step": 1147 }, { "epoch": 1.44794952681388, "grad_norm": 0.3159689077808555, "learning_rate": 2.8718428437792332e-05, "loss": 0.3425, "step": 1148 }, { "epoch": 1.449211356466877, "grad_norm": 0.3162768651990943, "learning_rate": 2.869504209541628e-05, "loss": 0.3635, "step": 1149 }, { "epoch": 1.4504731861198739, "grad_norm": 0.2716816066554923, "learning_rate": 2.8671655753040227e-05, "loss": 0.3361, "step": 1150 }, { "epoch": 1.4517350157728708, "grad_norm": 0.30179045976353513, "learning_rate": 2.8648269410664175e-05, "loss": 0.3595, "step": 1151 }, { "epoch": 1.4529968454258675, "grad_norm": 0.290615956589824, "learning_rate": 2.8624883068288123e-05, "loss": 0.3411, "step": 1152 }, { "epoch": 1.4542586750788644, "grad_norm": 0.29135125719080374, "learning_rate": 2.860149672591207e-05, "loss": 0.3473, "step": 1153 }, { "epoch": 1.4555205047318611, "grad_norm": 0.2975543861381207, "learning_rate": 2.8578110383536018e-05, "loss": 0.3477, "step": 1154 }, { "epoch": 1.456782334384858, "grad_norm": 0.31244256240212914, "learning_rate": 2.8554724041159965e-05, "loss": 0.3837, "step": 1155 }, { "epoch": 1.458044164037855, "grad_norm": 0.30987113458690574, "learning_rate": 2.853133769878391e-05, "loss": 0.3631, "step": 1156 }, { "epoch": 1.4593059936908517, "grad_norm": 0.337104742616513, "learning_rate": 2.8507951356407857e-05, "loss": 0.3595, "step": 1157 }, { "epoch": 1.4605678233438486, "grad_norm": 0.26539622793332707, "learning_rate": 2.8484565014031805e-05, "loss": 0.3544, "step": 1158 }, { "epoch": 1.4618296529968453, "grad_norm": 0.3522900170842775, "learning_rate": 2.8461178671655752e-05, "loss": 0.3449, "step": 1159 }, { "epoch": 1.4630914826498422, "grad_norm": 0.30025496096290527, "learning_rate": 2.84377923292797e-05, "loss": 0.3613, "step": 1160 }, { "epoch": 1.4643533123028392, "grad_norm": 0.31417264419094604, "learning_rate": 2.8414405986903647e-05, "loss": 0.3437, "step": 1161 }, { "epoch": 1.465615141955836, "grad_norm": 0.33980509807743575, "learning_rate": 2.83910196445276e-05, "loss": 0.3825, "step": 1162 }, { "epoch": 1.4668769716088328, "grad_norm": 0.35840809366298876, "learning_rate": 2.8367633302151546e-05, "loss": 0.3426, "step": 1163 }, { "epoch": 1.4681388012618297, "grad_norm": 0.32841647554216624, "learning_rate": 2.8344246959775494e-05, "loss": 0.3693, "step": 1164 }, { "epoch": 1.4694006309148264, "grad_norm": 0.4359291404688486, "learning_rate": 2.832086061739944e-05, "loss": 0.3535, "step": 1165 }, { "epoch": 1.4706624605678233, "grad_norm": 0.32851186736924004, "learning_rate": 2.829747427502339e-05, "loss": 0.3514, "step": 1166 }, { "epoch": 1.4719242902208203, "grad_norm": 0.359715263305686, "learning_rate": 2.8274087932647336e-05, "loss": 0.3775, "step": 1167 }, { "epoch": 1.473186119873817, "grad_norm": 0.39613559242320406, "learning_rate": 2.8250701590271284e-05, "loss": 0.3597, "step": 1168 }, { "epoch": 1.474447949526814, "grad_norm": 0.3137232638728872, "learning_rate": 2.822731524789523e-05, "loss": 0.3712, "step": 1169 }, { "epoch": 1.4757097791798106, "grad_norm": 0.45266796759246086, "learning_rate": 2.820392890551918e-05, "loss": 0.363, "step": 1170 }, { "epoch": 1.4769716088328075, "grad_norm": 0.3713203645728838, "learning_rate": 2.8180542563143127e-05, "loss": 0.3712, "step": 1171 }, { "epoch": 1.4782334384858045, "grad_norm": 0.32388927755869557, "learning_rate": 2.815715622076707e-05, "loss": 0.3395, "step": 1172 }, { "epoch": 1.4794952681388014, "grad_norm": 0.37546874499661237, "learning_rate": 2.813376987839102e-05, "loss": 0.3616, "step": 1173 }, { "epoch": 1.480757097791798, "grad_norm": 0.3085019633669626, "learning_rate": 2.8110383536014966e-05, "loss": 0.3639, "step": 1174 }, { "epoch": 1.482018927444795, "grad_norm": 0.30004815044224753, "learning_rate": 2.8086997193638914e-05, "loss": 0.3802, "step": 1175 }, { "epoch": 1.4832807570977917, "grad_norm": 0.3314031962288213, "learning_rate": 2.806361085126286e-05, "loss": 0.3562, "step": 1176 }, { "epoch": 1.4845425867507887, "grad_norm": 0.31970848660911055, "learning_rate": 2.804022450888681e-05, "loss": 0.3659, "step": 1177 }, { "epoch": 1.4858044164037856, "grad_norm": 0.2815966850035257, "learning_rate": 2.8016838166510763e-05, "loss": 0.3657, "step": 1178 }, { "epoch": 1.4870662460567823, "grad_norm": 0.35372976368888154, "learning_rate": 2.7993451824134707e-05, "loss": 0.3751, "step": 1179 }, { "epoch": 1.4883280757097792, "grad_norm": 0.3067526544955514, "learning_rate": 2.7970065481758655e-05, "loss": 0.3562, "step": 1180 }, { "epoch": 1.489589905362776, "grad_norm": 0.35603066713978176, "learning_rate": 2.7946679139382603e-05, "loss": 0.3632, "step": 1181 }, { "epoch": 1.4908517350157728, "grad_norm": 0.30284631797332234, "learning_rate": 2.792329279700655e-05, "loss": 0.3779, "step": 1182 }, { "epoch": 1.4921135646687698, "grad_norm": 0.30706471286249365, "learning_rate": 2.7899906454630498e-05, "loss": 0.3617, "step": 1183 }, { "epoch": 1.4933753943217667, "grad_norm": 0.34586161286462175, "learning_rate": 2.7876520112254445e-05, "loss": 0.3622, "step": 1184 }, { "epoch": 1.4946372239747634, "grad_norm": 0.2890112789292974, "learning_rate": 2.7853133769878393e-05, "loss": 0.3661, "step": 1185 }, { "epoch": 1.4958990536277603, "grad_norm": 0.34662953691800674, "learning_rate": 2.782974742750234e-05, "loss": 0.3606, "step": 1186 }, { "epoch": 1.497160883280757, "grad_norm": 0.2438773252553068, "learning_rate": 2.7806361085126288e-05, "loss": 0.3509, "step": 1187 }, { "epoch": 1.498422712933754, "grad_norm": 0.31670436632868315, "learning_rate": 2.7782974742750236e-05, "loss": 0.3626, "step": 1188 }, { "epoch": 1.4996845425867509, "grad_norm": 0.32938285102895004, "learning_rate": 2.775958840037418e-05, "loss": 0.3539, "step": 1189 }, { "epoch": 1.5009463722397476, "grad_norm": 0.33465245972402785, "learning_rate": 2.7736202057998127e-05, "loss": 0.3818, "step": 1190 }, { "epoch": 1.5022082018927445, "grad_norm": 0.31395443760309666, "learning_rate": 2.7712815715622075e-05, "loss": 0.3587, "step": 1191 }, { "epoch": 1.5034700315457412, "grad_norm": 0.3315951951957389, "learning_rate": 2.7689429373246023e-05, "loss": 0.3651, "step": 1192 }, { "epoch": 1.5047318611987381, "grad_norm": 0.35638302664599625, "learning_rate": 2.7666043030869977e-05, "loss": 0.37, "step": 1193 }, { "epoch": 1.505993690851735, "grad_norm": 0.30245071336765567, "learning_rate": 2.7642656688493924e-05, "loss": 0.378, "step": 1194 }, { "epoch": 1.507255520504732, "grad_norm": 0.302478209160533, "learning_rate": 2.761927034611787e-05, "loss": 0.3601, "step": 1195 }, { "epoch": 1.5085173501577287, "grad_norm": 0.31824577306783924, "learning_rate": 2.7595884003741816e-05, "loss": 0.3589, "step": 1196 }, { "epoch": 1.5097791798107254, "grad_norm": 0.26673745886605477, "learning_rate": 2.7572497661365764e-05, "loss": 0.3655, "step": 1197 }, { "epoch": 1.5110410094637223, "grad_norm": 0.3122730139124133, "learning_rate": 2.754911131898971e-05, "loss": 0.3322, "step": 1198 }, { "epoch": 1.5123028391167193, "grad_norm": 0.33241341267142793, "learning_rate": 2.752572497661366e-05, "loss": 0.3451, "step": 1199 }, { "epoch": 1.5135646687697162, "grad_norm": 0.29110000957223664, "learning_rate": 2.7502338634237607e-05, "loss": 0.3581, "step": 1200 }, { "epoch": 1.5148264984227129, "grad_norm": 0.3051698820926539, "learning_rate": 2.7478952291861554e-05, "loss": 0.3608, "step": 1201 }, { "epoch": 1.5160883280757098, "grad_norm": 0.3257473233717455, "learning_rate": 2.7455565949485502e-05, "loss": 0.3886, "step": 1202 }, { "epoch": 1.5173501577287065, "grad_norm": 0.2709303524195602, "learning_rate": 2.743217960710945e-05, "loss": 0.3569, "step": 1203 }, { "epoch": 1.5186119873817034, "grad_norm": 0.3560894021656622, "learning_rate": 2.7408793264733397e-05, "loss": 0.3591, "step": 1204 }, { "epoch": 1.5198738170347004, "grad_norm": 0.30864004280297364, "learning_rate": 2.7385406922357344e-05, "loss": 0.3729, "step": 1205 }, { "epoch": 1.5211356466876973, "grad_norm": 0.33109373375286905, "learning_rate": 2.736202057998129e-05, "loss": 0.3702, "step": 1206 }, { "epoch": 1.522397476340694, "grad_norm": 0.3492024113144231, "learning_rate": 2.7338634237605236e-05, "loss": 0.3615, "step": 1207 }, { "epoch": 1.5236593059936907, "grad_norm": 0.2912821314064088, "learning_rate": 2.731524789522919e-05, "loss": 0.3532, "step": 1208 }, { "epoch": 1.5249211356466876, "grad_norm": 0.27367922120525057, "learning_rate": 2.7291861552853138e-05, "loss": 0.3762, "step": 1209 }, { "epoch": 1.5261829652996846, "grad_norm": 0.3005974963105602, "learning_rate": 2.7268475210477086e-05, "loss": 0.3649, "step": 1210 }, { "epoch": 1.5274447949526815, "grad_norm": 0.3399169508703363, "learning_rate": 2.7245088868101033e-05, "loss": 0.3624, "step": 1211 }, { "epoch": 1.5287066246056782, "grad_norm": 0.3264430303603992, "learning_rate": 2.7221702525724978e-05, "loss": 0.3657, "step": 1212 }, { "epoch": 1.5299684542586751, "grad_norm": 0.29967909922395375, "learning_rate": 2.7198316183348925e-05, "loss": 0.3574, "step": 1213 }, { "epoch": 1.5312302839116718, "grad_norm": 0.2686586236839994, "learning_rate": 2.7174929840972873e-05, "loss": 0.3877, "step": 1214 }, { "epoch": 1.5324921135646687, "grad_norm": 0.30342557188072006, "learning_rate": 2.715154349859682e-05, "loss": 0.367, "step": 1215 }, { "epoch": 1.5337539432176657, "grad_norm": 0.26133973512651476, "learning_rate": 2.7128157156220768e-05, "loss": 0.3625, "step": 1216 }, { "epoch": 1.5350157728706626, "grad_norm": 0.2511854611573312, "learning_rate": 2.7104770813844715e-05, "loss": 0.3755, "step": 1217 }, { "epoch": 1.5362776025236593, "grad_norm": 0.28259409413492764, "learning_rate": 2.7081384471468663e-05, "loss": 0.353, "step": 1218 }, { "epoch": 1.537539432176656, "grad_norm": 0.2622712543319768, "learning_rate": 2.705799812909261e-05, "loss": 0.3664, "step": 1219 }, { "epoch": 1.538801261829653, "grad_norm": 0.26789195967688173, "learning_rate": 2.7034611786716558e-05, "loss": 0.3398, "step": 1220 }, { "epoch": 1.5400630914826499, "grad_norm": 0.29637199591810115, "learning_rate": 2.7011225444340506e-05, "loss": 0.369, "step": 1221 }, { "epoch": 1.5413249211356468, "grad_norm": 0.29228987407750157, "learning_rate": 2.698783910196445e-05, "loss": 0.3508, "step": 1222 }, { "epoch": 1.5425867507886435, "grad_norm": 0.255634932686046, "learning_rate": 2.6964452759588398e-05, "loss": 0.3554, "step": 1223 }, { "epoch": 1.5438485804416404, "grad_norm": 0.2841930031845203, "learning_rate": 2.6941066417212352e-05, "loss": 0.3673, "step": 1224 }, { "epoch": 1.5451104100946371, "grad_norm": 0.2618426257390454, "learning_rate": 2.69176800748363e-05, "loss": 0.3552, "step": 1225 }, { "epoch": 1.546372239747634, "grad_norm": 0.26448650353446457, "learning_rate": 2.6894293732460247e-05, "loss": 0.3743, "step": 1226 }, { "epoch": 1.547634069400631, "grad_norm": 0.25997373976727656, "learning_rate": 2.6870907390084195e-05, "loss": 0.3623, "step": 1227 }, { "epoch": 1.548895899053628, "grad_norm": 0.28615432398140045, "learning_rate": 2.6847521047708142e-05, "loss": 0.3457, "step": 1228 }, { "epoch": 1.5501577287066246, "grad_norm": 0.24210283384987347, "learning_rate": 2.6824134705332086e-05, "loss": 0.3726, "step": 1229 }, { "epoch": 1.5514195583596213, "grad_norm": 0.3358949625836188, "learning_rate": 2.6800748362956034e-05, "loss": 0.3489, "step": 1230 }, { "epoch": 1.5526813880126182, "grad_norm": 0.3337493766744186, "learning_rate": 2.677736202057998e-05, "loss": 0.361, "step": 1231 }, { "epoch": 1.5539432176656152, "grad_norm": 0.36111742895746163, "learning_rate": 2.675397567820393e-05, "loss": 0.3659, "step": 1232 }, { "epoch": 1.555205047318612, "grad_norm": 0.2796115621359888, "learning_rate": 2.6730589335827877e-05, "loss": 0.3704, "step": 1233 }, { "epoch": 1.5564668769716088, "grad_norm": 0.3726480790898827, "learning_rate": 2.6707202993451824e-05, "loss": 0.357, "step": 1234 }, { "epoch": 1.5577287066246057, "grad_norm": 0.30746588230605904, "learning_rate": 2.6683816651075772e-05, "loss": 0.3586, "step": 1235 }, { "epoch": 1.5589905362776024, "grad_norm": 0.2834453853156219, "learning_rate": 2.666043030869972e-05, "loss": 0.3321, "step": 1236 }, { "epoch": 1.5602523659305993, "grad_norm": 0.3052222433279445, "learning_rate": 2.6637043966323667e-05, "loss": 0.3598, "step": 1237 }, { "epoch": 1.5615141955835963, "grad_norm": 0.3084744116333268, "learning_rate": 2.6613657623947615e-05, "loss": 0.3647, "step": 1238 }, { "epoch": 1.5627760252365932, "grad_norm": 0.2719108655778965, "learning_rate": 2.6590271281571566e-05, "loss": 0.3397, "step": 1239 }, { "epoch": 1.56403785488959, "grad_norm": 0.336989714909457, "learning_rate": 2.6566884939195513e-05, "loss": 0.3465, "step": 1240 }, { "epoch": 1.5652996845425866, "grad_norm": 0.26389327571732074, "learning_rate": 2.654349859681946e-05, "loss": 0.3532, "step": 1241 }, { "epoch": 1.5665615141955835, "grad_norm": 0.29011537062200626, "learning_rate": 2.652011225444341e-05, "loss": 0.3602, "step": 1242 }, { "epoch": 1.5678233438485805, "grad_norm": 0.36672044152789685, "learning_rate": 2.6496725912067356e-05, "loss": 0.3672, "step": 1243 }, { "epoch": 1.5690851735015774, "grad_norm": 0.29302612360603764, "learning_rate": 2.6473339569691304e-05, "loss": 0.3576, "step": 1244 }, { "epoch": 1.570347003154574, "grad_norm": 0.35497999669500063, "learning_rate": 2.644995322731525e-05, "loss": 0.3439, "step": 1245 }, { "epoch": 1.571608832807571, "grad_norm": 0.30924276135552803, "learning_rate": 2.6426566884939195e-05, "loss": 0.3788, "step": 1246 }, { "epoch": 1.5728706624605677, "grad_norm": 0.3369523303549387, "learning_rate": 2.6403180542563143e-05, "loss": 0.3605, "step": 1247 }, { "epoch": 1.5741324921135647, "grad_norm": 0.2920115794766075, "learning_rate": 2.637979420018709e-05, "loss": 0.3407, "step": 1248 }, { "epoch": 1.5753943217665616, "grad_norm": 0.30830994708060516, "learning_rate": 2.6356407857811038e-05, "loss": 0.3676, "step": 1249 }, { "epoch": 1.5766561514195585, "grad_norm": 0.34950260041322967, "learning_rate": 2.6333021515434986e-05, "loss": 0.3681, "step": 1250 }, { "epoch": 1.5779179810725552, "grad_norm": 0.2832799929904863, "learning_rate": 2.6309635173058933e-05, "loss": 0.3369, "step": 1251 }, { "epoch": 1.579179810725552, "grad_norm": 0.33732868847579206, "learning_rate": 2.628624883068288e-05, "loss": 0.3552, "step": 1252 }, { "epoch": 1.5804416403785488, "grad_norm": 0.31609830174119175, "learning_rate": 2.626286248830683e-05, "loss": 0.3705, "step": 1253 }, { "epoch": 1.5817034700315458, "grad_norm": 0.369709333634519, "learning_rate": 2.623947614593078e-05, "loss": 0.3511, "step": 1254 }, { "epoch": 1.5829652996845427, "grad_norm": 0.3298040044397129, "learning_rate": 2.6216089803554727e-05, "loss": 0.3792, "step": 1255 }, { "epoch": 1.5842271293375394, "grad_norm": 0.3534033697039225, "learning_rate": 2.6192703461178675e-05, "loss": 0.3707, "step": 1256 }, { "epoch": 1.5854889589905363, "grad_norm": 0.30167040004295115, "learning_rate": 2.6169317118802622e-05, "loss": 0.3696, "step": 1257 }, { "epoch": 1.586750788643533, "grad_norm": 0.3061897784108467, "learning_rate": 2.614593077642657e-05, "loss": 0.3379, "step": 1258 }, { "epoch": 1.58801261829653, "grad_norm": 0.3159325675011769, "learning_rate": 2.6122544434050517e-05, "loss": 0.353, "step": 1259 }, { "epoch": 1.5892744479495269, "grad_norm": 0.27967017976800274, "learning_rate": 2.6099158091674465e-05, "loss": 0.3573, "step": 1260 }, { "epoch": 1.5905362776025238, "grad_norm": 0.3496182664618417, "learning_rate": 2.6075771749298412e-05, "loss": 0.3564, "step": 1261 }, { "epoch": 1.5917981072555205, "grad_norm": 0.2827188238776627, "learning_rate": 2.6052385406922357e-05, "loss": 0.3744, "step": 1262 }, { "epoch": 1.5930599369085172, "grad_norm": 0.40029359226870365, "learning_rate": 2.6028999064546304e-05, "loss": 0.3391, "step": 1263 }, { "epoch": 1.5943217665615141, "grad_norm": 0.30361398578700144, "learning_rate": 2.6005612722170252e-05, "loss": 0.3712, "step": 1264 }, { "epoch": 1.595583596214511, "grad_norm": 0.35489705421778106, "learning_rate": 2.59822263797942e-05, "loss": 0.367, "step": 1265 }, { "epoch": 1.596845425867508, "grad_norm": 0.3579004712227079, "learning_rate": 2.5958840037418147e-05, "loss": 0.3569, "step": 1266 }, { "epoch": 1.5981072555205047, "grad_norm": 0.27497244455769554, "learning_rate": 2.5935453695042095e-05, "loss": 0.356, "step": 1267 }, { "epoch": 1.5993690851735016, "grad_norm": 0.3718511801934102, "learning_rate": 2.5912067352666042e-05, "loss": 0.3445, "step": 1268 }, { "epoch": 1.6006309148264983, "grad_norm": 0.2930035129365693, "learning_rate": 2.5888681010289993e-05, "loss": 0.3255, "step": 1269 }, { "epoch": 1.6018927444794953, "grad_norm": 0.3246906033973324, "learning_rate": 2.586529466791394e-05, "loss": 0.3842, "step": 1270 }, { "epoch": 1.6031545741324922, "grad_norm": 0.2848824459713829, "learning_rate": 2.5841908325537888e-05, "loss": 0.3547, "step": 1271 }, { "epoch": 1.604416403785489, "grad_norm": 0.26369273302669116, "learning_rate": 2.5818521983161836e-05, "loss": 0.3565, "step": 1272 }, { "epoch": 1.6056782334384858, "grad_norm": 0.3082332135562457, "learning_rate": 2.5795135640785783e-05, "loss": 0.3452, "step": 1273 }, { "epoch": 1.6069400630914825, "grad_norm": 0.28768558700640523, "learning_rate": 2.577174929840973e-05, "loss": 0.3875, "step": 1274 }, { "epoch": 1.6082018927444794, "grad_norm": 0.31602325631294675, "learning_rate": 2.574836295603368e-05, "loss": 0.3647, "step": 1275 }, { "epoch": 1.6094637223974764, "grad_norm": 0.27704677553906937, "learning_rate": 2.5724976613657626e-05, "loss": 0.3499, "step": 1276 }, { "epoch": 1.6107255520504733, "grad_norm": 0.2866901236911578, "learning_rate": 2.5701590271281574e-05, "loss": 0.3462, "step": 1277 }, { "epoch": 1.61198738170347, "grad_norm": 0.2729626063128669, "learning_rate": 2.567820392890552e-05, "loss": 0.3535, "step": 1278 }, { "epoch": 1.613249211356467, "grad_norm": 0.2602387721602046, "learning_rate": 2.5654817586529466e-05, "loss": 0.3544, "step": 1279 }, { "epoch": 1.6145110410094636, "grad_norm": 0.2714359697733415, "learning_rate": 2.5631431244153413e-05, "loss": 0.3463, "step": 1280 }, { "epoch": 1.6157728706624606, "grad_norm": 0.263541531558955, "learning_rate": 2.560804490177736e-05, "loss": 0.3427, "step": 1281 }, { "epoch": 1.6170347003154575, "grad_norm": 0.27108830730713473, "learning_rate": 2.5584658559401308e-05, "loss": 0.3615, "step": 1282 }, { "epoch": 1.6182965299684544, "grad_norm": 0.3088889142025173, "learning_rate": 2.5561272217025256e-05, "loss": 0.3853, "step": 1283 }, { "epoch": 1.6195583596214511, "grad_norm": 0.25566022631550894, "learning_rate": 2.5537885874649203e-05, "loss": 0.363, "step": 1284 }, { "epoch": 1.6208201892744478, "grad_norm": 0.2833266908593562, "learning_rate": 2.5514499532273158e-05, "loss": 0.3476, "step": 1285 }, { "epoch": 1.6220820189274447, "grad_norm": 0.26819356490539203, "learning_rate": 2.5491113189897102e-05, "loss": 0.3788, "step": 1286 }, { "epoch": 1.6233438485804417, "grad_norm": 0.3008309585314281, "learning_rate": 2.546772684752105e-05, "loss": 0.3535, "step": 1287 }, { "epoch": 1.6246056782334386, "grad_norm": 0.2906929692939445, "learning_rate": 2.5444340505144997e-05, "loss": 0.3418, "step": 1288 }, { "epoch": 1.6258675078864353, "grad_norm": 0.26935186334205524, "learning_rate": 2.5420954162768945e-05, "loss": 0.356, "step": 1289 }, { "epoch": 1.6271293375394322, "grad_norm": 0.2930379763320379, "learning_rate": 2.5397567820392892e-05, "loss": 0.363, "step": 1290 }, { "epoch": 1.628391167192429, "grad_norm": 0.276207437229723, "learning_rate": 2.537418147801684e-05, "loss": 0.3366, "step": 1291 }, { "epoch": 1.6296529968454259, "grad_norm": 0.25865623114476904, "learning_rate": 2.5350795135640787e-05, "loss": 0.3334, "step": 1292 }, { "epoch": 1.6309148264984228, "grad_norm": 0.2690957105428344, "learning_rate": 2.5327408793264735e-05, "loss": 0.3952, "step": 1293 }, { "epoch": 1.6321766561514197, "grad_norm": 0.28067900628977127, "learning_rate": 2.5304022450888683e-05, "loss": 0.371, "step": 1294 }, { "epoch": 1.6334384858044164, "grad_norm": 0.24623613762668783, "learning_rate": 2.528063610851263e-05, "loss": 0.3571, "step": 1295 }, { "epoch": 1.6347003154574131, "grad_norm": 0.2790415327911077, "learning_rate": 2.5257249766136574e-05, "loss": 0.3398, "step": 1296 }, { "epoch": 1.63596214511041, "grad_norm": 0.2588483204262472, "learning_rate": 2.5233863423760522e-05, "loss": 0.3541, "step": 1297 }, { "epoch": 1.637223974763407, "grad_norm": 0.25908895511630897, "learning_rate": 2.521047708138447e-05, "loss": 0.3572, "step": 1298 }, { "epoch": 1.638485804416404, "grad_norm": 0.2748481043043573, "learning_rate": 2.5187090739008417e-05, "loss": 0.3564, "step": 1299 }, { "epoch": 1.6397476340694006, "grad_norm": 0.26228911225808554, "learning_rate": 2.516370439663237e-05, "loss": 0.3428, "step": 1300 }, { "epoch": 1.6410094637223973, "grad_norm": 0.27588219223189253, "learning_rate": 2.514031805425632e-05, "loss": 0.3397, "step": 1301 }, { "epoch": 1.6422712933753942, "grad_norm": 0.3150661560824328, "learning_rate": 2.5116931711880263e-05, "loss": 0.3579, "step": 1302 }, { "epoch": 1.6435331230283912, "grad_norm": 0.35223216662573303, "learning_rate": 2.509354536950421e-05, "loss": 0.3702, "step": 1303 }, { "epoch": 1.644794952681388, "grad_norm": 0.2986340399594472, "learning_rate": 2.507015902712816e-05, "loss": 0.3504, "step": 1304 }, { "epoch": 1.646056782334385, "grad_norm": 0.3029941111869371, "learning_rate": 2.5046772684752106e-05, "loss": 0.3535, "step": 1305 }, { "epoch": 1.6473186119873817, "grad_norm": 0.3321894375094783, "learning_rate": 2.5023386342376054e-05, "loss": 0.3753, "step": 1306 }, { "epoch": 1.6485804416403784, "grad_norm": 0.2948464740068144, "learning_rate": 2.5e-05, "loss": 0.3689, "step": 1307 }, { "epoch": 1.6498422712933754, "grad_norm": 0.6747728809255068, "learning_rate": 2.497661365762395e-05, "loss": 0.3715, "step": 1308 }, { "epoch": 1.6511041009463723, "grad_norm": 0.3112755013348083, "learning_rate": 2.4953227315247896e-05, "loss": 0.3412, "step": 1309 }, { "epoch": 1.6523659305993692, "grad_norm": 0.3282882476765895, "learning_rate": 2.4929840972871844e-05, "loss": 0.3818, "step": 1310 }, { "epoch": 1.653627760252366, "grad_norm": 0.37160835607656173, "learning_rate": 2.490645463049579e-05, "loss": 0.3595, "step": 1311 }, { "epoch": 1.6548895899053626, "grad_norm": 0.3095299442594574, "learning_rate": 2.488306828811974e-05, "loss": 0.3695, "step": 1312 }, { "epoch": 1.6561514195583595, "grad_norm": 0.3467622086013079, "learning_rate": 2.4859681945743687e-05, "loss": 0.3617, "step": 1313 }, { "epoch": 1.6574132492113565, "grad_norm": 0.30126426751205854, "learning_rate": 2.4836295603367634e-05, "loss": 0.3456, "step": 1314 }, { "epoch": 1.6586750788643534, "grad_norm": 0.29259633930681145, "learning_rate": 2.4812909260991582e-05, "loss": 0.3674, "step": 1315 }, { "epoch": 1.6599369085173503, "grad_norm": 0.27239406415956247, "learning_rate": 2.478952291861553e-05, "loss": 0.3681, "step": 1316 }, { "epoch": 1.661198738170347, "grad_norm": 0.3094417609709377, "learning_rate": 2.4766136576239477e-05, "loss": 0.3519, "step": 1317 }, { "epoch": 1.6624605678233437, "grad_norm": 0.3163458710787224, "learning_rate": 2.4742750233863425e-05, "loss": 0.3538, "step": 1318 }, { "epoch": 1.6637223974763407, "grad_norm": 0.3084331873567931, "learning_rate": 2.4719363891487372e-05, "loss": 0.3514, "step": 1319 }, { "epoch": 1.6649842271293376, "grad_norm": 0.7587135705035095, "learning_rate": 2.469597754911132e-05, "loss": 0.3443, "step": 1320 }, { "epoch": 1.6662460567823345, "grad_norm": 0.2839215129324076, "learning_rate": 2.4672591206735267e-05, "loss": 0.3708, "step": 1321 }, { "epoch": 1.6675078864353312, "grad_norm": 0.29224501464816993, "learning_rate": 2.4649204864359215e-05, "loss": 0.3693, "step": 1322 }, { "epoch": 1.668769716088328, "grad_norm": 0.31662930466817607, "learning_rate": 2.4625818521983163e-05, "loss": 0.3544, "step": 1323 }, { "epoch": 1.6700315457413248, "grad_norm": 0.2672988855837333, "learning_rate": 2.460243217960711e-05, "loss": 0.3493, "step": 1324 }, { "epoch": 1.6712933753943218, "grad_norm": 0.33155092026297267, "learning_rate": 2.4579045837231058e-05, "loss": 0.3642, "step": 1325 }, { "epoch": 1.6725552050473187, "grad_norm": 0.2826277154566981, "learning_rate": 2.4555659494855005e-05, "loss": 0.3499, "step": 1326 }, { "epoch": 1.6738170347003156, "grad_norm": 0.2832387677467714, "learning_rate": 2.4532273152478953e-05, "loss": 0.3636, "step": 1327 }, { "epoch": 1.6750788643533123, "grad_norm": 0.2987788768627046, "learning_rate": 2.45088868101029e-05, "loss": 0.3653, "step": 1328 }, { "epoch": 1.676340694006309, "grad_norm": 0.2923197127537628, "learning_rate": 2.4485500467726848e-05, "loss": 0.3875, "step": 1329 }, { "epoch": 1.677602523659306, "grad_norm": 0.29932809803528704, "learning_rate": 2.4462114125350796e-05, "loss": 0.3648, "step": 1330 }, { "epoch": 1.6788643533123029, "grad_norm": 0.31985155680954896, "learning_rate": 2.4438727782974743e-05, "loss": 0.3588, "step": 1331 }, { "epoch": 1.6801261829652998, "grad_norm": 0.3211891241202969, "learning_rate": 2.441534144059869e-05, "loss": 0.3705, "step": 1332 }, { "epoch": 1.6813880126182965, "grad_norm": 0.3095959594835941, "learning_rate": 2.439195509822264e-05, "loss": 0.3524, "step": 1333 }, { "epoch": 1.6826498422712932, "grad_norm": 0.3106629121466089, "learning_rate": 2.436856875584659e-05, "loss": 0.3526, "step": 1334 }, { "epoch": 1.6839116719242901, "grad_norm": 0.28489062571578955, "learning_rate": 2.4345182413470537e-05, "loss": 0.3592, "step": 1335 }, { "epoch": 1.685173501577287, "grad_norm": 0.26351303047226365, "learning_rate": 2.432179607109448e-05, "loss": 0.337, "step": 1336 }, { "epoch": 1.686435331230284, "grad_norm": 0.3286340787081081, "learning_rate": 2.429840972871843e-05, "loss": 0.3526, "step": 1337 }, { "epoch": 1.687697160883281, "grad_norm": 0.27600595504437875, "learning_rate": 2.4275023386342376e-05, "loss": 0.3494, "step": 1338 }, { "epoch": 1.6889589905362776, "grad_norm": 0.3548448482023506, "learning_rate": 2.4251637043966324e-05, "loss": 0.3407, "step": 1339 }, { "epoch": 1.6902208201892743, "grad_norm": 0.2745066881252786, "learning_rate": 2.422825070159027e-05, "loss": 0.3559, "step": 1340 }, { "epoch": 1.6914826498422713, "grad_norm": 0.3148903736010573, "learning_rate": 2.420486435921422e-05, "loss": 0.3675, "step": 1341 }, { "epoch": 1.6927444794952682, "grad_norm": 0.37118124031218797, "learning_rate": 2.418147801683817e-05, "loss": 0.3733, "step": 1342 }, { "epoch": 1.694006309148265, "grad_norm": 0.3327046547061281, "learning_rate": 2.4158091674462118e-05, "loss": 0.3506, "step": 1343 }, { "epoch": 1.6952681388012618, "grad_norm": 0.314526436288781, "learning_rate": 2.4134705332086062e-05, "loss": 0.362, "step": 1344 }, { "epoch": 1.6965299684542585, "grad_norm": 0.34687276009494006, "learning_rate": 2.411131898971001e-05, "loss": 0.361, "step": 1345 }, { "epoch": 1.6977917981072554, "grad_norm": 0.35182024866433487, "learning_rate": 2.4087932647333957e-05, "loss": 0.3586, "step": 1346 }, { "epoch": 1.6990536277602524, "grad_norm": 0.2999899735573623, "learning_rate": 2.4064546304957904e-05, "loss": 0.346, "step": 1347 }, { "epoch": 1.7003154574132493, "grad_norm": 0.2812546140739151, "learning_rate": 2.4041159962581852e-05, "loss": 0.3559, "step": 1348 }, { "epoch": 1.701577287066246, "grad_norm": 0.3120655574024914, "learning_rate": 2.4017773620205803e-05, "loss": 0.3391, "step": 1349 }, { "epoch": 1.702839116719243, "grad_norm": 0.31740128593242, "learning_rate": 2.399438727782975e-05, "loss": 0.3658, "step": 1350 }, { "epoch": 1.7041009463722396, "grad_norm": 0.31395218025155525, "learning_rate": 2.3971000935453698e-05, "loss": 0.3439, "step": 1351 }, { "epoch": 1.7053627760252366, "grad_norm": 0.2928357690942594, "learning_rate": 2.3947614593077646e-05, "loss": 0.3555, "step": 1352 }, { "epoch": 1.7066246056782335, "grad_norm": 0.2639857337373238, "learning_rate": 2.392422825070159e-05, "loss": 0.3551, "step": 1353 }, { "epoch": 1.7078864353312304, "grad_norm": 0.3001506176658843, "learning_rate": 2.3900841908325538e-05, "loss": 0.3485, "step": 1354 }, { "epoch": 1.7091482649842271, "grad_norm": 0.2596141530180018, "learning_rate": 2.3877455565949485e-05, "loss": 0.3607, "step": 1355 }, { "epoch": 1.7104100946372238, "grad_norm": 0.28737709843988224, "learning_rate": 2.3854069223573433e-05, "loss": 0.3656, "step": 1356 }, { "epoch": 1.7116719242902207, "grad_norm": 0.31365178607933497, "learning_rate": 2.3830682881197384e-05, "loss": 0.3428, "step": 1357 }, { "epoch": 1.7129337539432177, "grad_norm": 0.24613130267869984, "learning_rate": 2.380729653882133e-05, "loss": 0.3562, "step": 1358 }, { "epoch": 1.7141955835962146, "grad_norm": 0.27452855362267753, "learning_rate": 2.378391019644528e-05, "loss": 0.3563, "step": 1359 }, { "epoch": 1.7154574132492113, "grad_norm": 0.31531327394679337, "learning_rate": 2.3760523854069226e-05, "loss": 0.3668, "step": 1360 }, { "epoch": 1.7167192429022082, "grad_norm": 0.2851819728696128, "learning_rate": 2.373713751169317e-05, "loss": 0.3556, "step": 1361 }, { "epoch": 1.717981072555205, "grad_norm": 0.305081323882725, "learning_rate": 2.3713751169317118e-05, "loss": 0.3727, "step": 1362 }, { "epoch": 1.7192429022082019, "grad_norm": 0.3608770996715523, "learning_rate": 2.3690364826941066e-05, "loss": 0.3755, "step": 1363 }, { "epoch": 1.7205047318611988, "grad_norm": 0.3251973699355815, "learning_rate": 2.3666978484565013e-05, "loss": 0.3476, "step": 1364 }, { "epoch": 1.7217665615141957, "grad_norm": 0.2942022819145365, "learning_rate": 2.3643592142188964e-05, "loss": 0.3607, "step": 1365 }, { "epoch": 1.7230283911671924, "grad_norm": 0.3142294965332406, "learning_rate": 2.3620205799812912e-05, "loss": 0.3645, "step": 1366 }, { "epoch": 1.7242902208201891, "grad_norm": 0.31704473674504857, "learning_rate": 2.359681945743686e-05, "loss": 0.344, "step": 1367 }, { "epoch": 1.725552050473186, "grad_norm": 0.29365432063316843, "learning_rate": 2.3573433115060807e-05, "loss": 0.3535, "step": 1368 }, { "epoch": 1.726813880126183, "grad_norm": 0.27016400397630014, "learning_rate": 2.355004677268475e-05, "loss": 0.3685, "step": 1369 }, { "epoch": 1.72807570977918, "grad_norm": 0.28866530707132876, "learning_rate": 2.35266604303087e-05, "loss": 0.3634, "step": 1370 }, { "epoch": 1.7293375394321766, "grad_norm": 0.3101277151873725, "learning_rate": 2.3503274087932646e-05, "loss": 0.3842, "step": 1371 }, { "epoch": 1.7305993690851735, "grad_norm": 0.35167827021314113, "learning_rate": 2.3479887745556597e-05, "loss": 0.3539, "step": 1372 }, { "epoch": 1.7318611987381702, "grad_norm": 0.2707147910311022, "learning_rate": 2.3456501403180545e-05, "loss": 0.3584, "step": 1373 }, { "epoch": 1.7331230283911672, "grad_norm": 0.3540596055442269, "learning_rate": 2.3433115060804493e-05, "loss": 0.3474, "step": 1374 }, { "epoch": 1.734384858044164, "grad_norm": 0.37733572079576605, "learning_rate": 2.340972871842844e-05, "loss": 0.3475, "step": 1375 }, { "epoch": 1.735646687697161, "grad_norm": 0.330010750770734, "learning_rate": 2.3386342376052388e-05, "loss": 0.3727, "step": 1376 }, { "epoch": 1.7369085173501577, "grad_norm": 0.35427627665973266, "learning_rate": 2.3362956033676335e-05, "loss": 0.3594, "step": 1377 }, { "epoch": 1.7381703470031544, "grad_norm": 0.4306752706974659, "learning_rate": 2.333956969130028e-05, "loss": 0.3651, "step": 1378 }, { "epoch": 1.7394321766561514, "grad_norm": 0.37213838241015074, "learning_rate": 2.3316183348924227e-05, "loss": 0.3607, "step": 1379 }, { "epoch": 1.7406940063091483, "grad_norm": 0.3514793928157547, "learning_rate": 2.3292797006548178e-05, "loss": 0.3771, "step": 1380 }, { "epoch": 1.7419558359621452, "grad_norm": 0.457120905665283, "learning_rate": 2.3269410664172126e-05, "loss": 0.3377, "step": 1381 }, { "epoch": 1.743217665615142, "grad_norm": 0.2915961262717494, "learning_rate": 2.3246024321796073e-05, "loss": 0.3491, "step": 1382 }, { "epoch": 1.7444794952681388, "grad_norm": 0.3717244226028987, "learning_rate": 2.322263797942002e-05, "loss": 0.3624, "step": 1383 }, { "epoch": 1.7457413249211355, "grad_norm": 0.3563463059045249, "learning_rate": 2.319925163704397e-05, "loss": 0.3618, "step": 1384 }, { "epoch": 1.7470031545741325, "grad_norm": 0.30910787637747883, "learning_rate": 2.3175865294667916e-05, "loss": 0.3543, "step": 1385 }, { "epoch": 1.7482649842271294, "grad_norm": 0.328277942347006, "learning_rate": 2.315247895229186e-05, "loss": 0.3986, "step": 1386 }, { "epoch": 1.7495268138801263, "grad_norm": 0.34554531546149375, "learning_rate": 2.3129092609915808e-05, "loss": 0.3709, "step": 1387 }, { "epoch": 1.750788643533123, "grad_norm": 0.3184537127204872, "learning_rate": 2.310570626753976e-05, "loss": 0.354, "step": 1388 }, { "epoch": 1.7520504731861197, "grad_norm": 0.28201286117425484, "learning_rate": 2.3082319925163706e-05, "loss": 0.3406, "step": 1389 }, { "epoch": 1.7533123028391167, "grad_norm": 0.32543782983327124, "learning_rate": 2.3058933582787654e-05, "loss": 0.3409, "step": 1390 }, { "epoch": 1.7545741324921136, "grad_norm": 0.3614518474846075, "learning_rate": 2.30355472404116e-05, "loss": 0.3728, "step": 1391 }, { "epoch": 1.7558359621451105, "grad_norm": 0.281782804326344, "learning_rate": 2.301216089803555e-05, "loss": 0.3453, "step": 1392 }, { "epoch": 1.7570977917981072, "grad_norm": 0.331241254753504, "learning_rate": 2.2988774555659497e-05, "loss": 0.3688, "step": 1393 }, { "epoch": 1.7583596214511041, "grad_norm": 0.3668857562607763, "learning_rate": 2.296538821328344e-05, "loss": 0.3714, "step": 1394 }, { "epoch": 1.7596214511041008, "grad_norm": 0.27072275373722177, "learning_rate": 2.2942001870907392e-05, "loss": 0.3442, "step": 1395 }, { "epoch": 1.7608832807570978, "grad_norm": 0.31109767750300144, "learning_rate": 2.291861552853134e-05, "loss": 0.3706, "step": 1396 }, { "epoch": 1.7621451104100947, "grad_norm": 0.28837170134076845, "learning_rate": 2.2895229186155287e-05, "loss": 0.3613, "step": 1397 }, { "epoch": 1.7634069400630916, "grad_norm": 0.329129444906132, "learning_rate": 2.2871842843779235e-05, "loss": 0.3554, "step": 1398 }, { "epoch": 1.7646687697160883, "grad_norm": 0.3142417422820172, "learning_rate": 2.2848456501403182e-05, "loss": 0.3524, "step": 1399 }, { "epoch": 1.765930599369085, "grad_norm": 0.40289753268252554, "learning_rate": 2.282507015902713e-05, "loss": 0.354, "step": 1400 }, { "epoch": 1.767192429022082, "grad_norm": 0.3294979161689948, "learning_rate": 2.2801683816651077e-05, "loss": 0.3262, "step": 1401 }, { "epoch": 1.7684542586750789, "grad_norm": 0.2847670482024436, "learning_rate": 2.2778297474275025e-05, "loss": 0.3616, "step": 1402 }, { "epoch": 1.7697160883280758, "grad_norm": 0.31756642324441625, "learning_rate": 2.2754911131898972e-05, "loss": 0.3581, "step": 1403 }, { "epoch": 1.7709779179810725, "grad_norm": 0.33202144478255036, "learning_rate": 2.273152478952292e-05, "loss": 0.3693, "step": 1404 }, { "epoch": 1.7722397476340694, "grad_norm": 0.2969393824061135, "learning_rate": 2.2708138447146868e-05, "loss": 0.3685, "step": 1405 }, { "epoch": 1.7735015772870661, "grad_norm": 0.34119474884822065, "learning_rate": 2.2684752104770815e-05, "loss": 0.3924, "step": 1406 }, { "epoch": 1.774763406940063, "grad_norm": 0.3087359636669285, "learning_rate": 2.2661365762394763e-05, "loss": 0.3596, "step": 1407 }, { "epoch": 1.77602523659306, "grad_norm": 0.32945705267383535, "learning_rate": 2.263797942001871e-05, "loss": 0.3879, "step": 1408 }, { "epoch": 1.777287066246057, "grad_norm": 0.33191922941009727, "learning_rate": 2.2614593077642658e-05, "loss": 0.3652, "step": 1409 }, { "epoch": 1.7785488958990536, "grad_norm": 0.2929071573388395, "learning_rate": 2.2591206735266606e-05, "loss": 0.3701, "step": 1410 }, { "epoch": 1.7798107255520503, "grad_norm": 0.29883828384766475, "learning_rate": 2.2567820392890553e-05, "loss": 0.3613, "step": 1411 }, { "epoch": 1.7810725552050473, "grad_norm": 0.30350877907339124, "learning_rate": 2.25444340505145e-05, "loss": 0.3794, "step": 1412 }, { "epoch": 1.7823343848580442, "grad_norm": 0.25845663138738806, "learning_rate": 2.2521047708138448e-05, "loss": 0.3614, "step": 1413 }, { "epoch": 1.7835962145110411, "grad_norm": 0.2846851666192883, "learning_rate": 2.2497661365762396e-05, "loss": 0.3498, "step": 1414 }, { "epoch": 1.7848580441640378, "grad_norm": 0.3413252679212488, "learning_rate": 2.2474275023386343e-05, "loss": 0.3719, "step": 1415 }, { "epoch": 1.7861198738170347, "grad_norm": 0.2956630458426754, "learning_rate": 2.245088868101029e-05, "loss": 0.3588, "step": 1416 }, { "epoch": 1.7873817034700314, "grad_norm": 0.29294431519393666, "learning_rate": 2.242750233863424e-05, "loss": 0.3356, "step": 1417 }, { "epoch": 1.7886435331230284, "grad_norm": 0.29266884083217853, "learning_rate": 2.2404115996258186e-05, "loss": 0.3607, "step": 1418 }, { "epoch": 1.7899053627760253, "grad_norm": 0.29325651299257227, "learning_rate": 2.2380729653882134e-05, "loss": 0.3736, "step": 1419 }, { "epoch": 1.7911671924290222, "grad_norm": 0.2923638093042888, "learning_rate": 2.235734331150608e-05, "loss": 0.3438, "step": 1420 }, { "epoch": 1.792429022082019, "grad_norm": 0.31521194878451425, "learning_rate": 2.233395696913003e-05, "loss": 0.3501, "step": 1421 }, { "epoch": 1.7936908517350156, "grad_norm": 0.2865864304974183, "learning_rate": 2.2310570626753976e-05, "loss": 0.3601, "step": 1422 }, { "epoch": 1.7949526813880126, "grad_norm": 0.32624268257655875, "learning_rate": 2.2287184284377924e-05, "loss": 0.3581, "step": 1423 }, { "epoch": 1.7962145110410095, "grad_norm": 0.2963585190608157, "learning_rate": 2.226379794200187e-05, "loss": 0.3284, "step": 1424 }, { "epoch": 1.7974763406940064, "grad_norm": 0.28421124145383053, "learning_rate": 2.224041159962582e-05, "loss": 0.3652, "step": 1425 }, { "epoch": 1.7987381703470031, "grad_norm": 0.3088979667337815, "learning_rate": 2.2217025257249767e-05, "loss": 0.3552, "step": 1426 }, { "epoch": 1.8, "grad_norm": 0.3232249809867413, "learning_rate": 2.2193638914873714e-05, "loss": 0.3581, "step": 1427 }, { "epoch": 1.8012618296529967, "grad_norm": 0.30731415369996246, "learning_rate": 2.2170252572497662e-05, "loss": 0.3714, "step": 1428 }, { "epoch": 1.8025236593059937, "grad_norm": 0.3018110821503075, "learning_rate": 2.214686623012161e-05, "loss": 0.3835, "step": 1429 }, { "epoch": 1.8037854889589906, "grad_norm": 0.3240679469318283, "learning_rate": 2.2123479887745557e-05, "loss": 0.3675, "step": 1430 }, { "epoch": 1.8050473186119875, "grad_norm": 0.26614323260132206, "learning_rate": 2.2100093545369505e-05, "loss": 0.3555, "step": 1431 }, { "epoch": 1.8063091482649842, "grad_norm": 0.2969238559609344, "learning_rate": 2.2076707202993452e-05, "loss": 0.365, "step": 1432 }, { "epoch": 1.807570977917981, "grad_norm": 0.30518280489427324, "learning_rate": 2.2053320860617403e-05, "loss": 0.363, "step": 1433 }, { "epoch": 1.8088328075709779, "grad_norm": 0.32627417892983895, "learning_rate": 2.2029934518241347e-05, "loss": 0.3691, "step": 1434 }, { "epoch": 1.8100946372239748, "grad_norm": 0.26954813542366396, "learning_rate": 2.2006548175865295e-05, "loss": 0.3575, "step": 1435 }, { "epoch": 1.8113564668769717, "grad_norm": 0.27328860268340327, "learning_rate": 2.1983161833489243e-05, "loss": 0.3483, "step": 1436 }, { "epoch": 1.8126182965299684, "grad_norm": 0.33096406085721636, "learning_rate": 2.195977549111319e-05, "loss": 0.3557, "step": 1437 }, { "epoch": 1.8138801261829653, "grad_norm": 0.25888691418842336, "learning_rate": 2.1936389148737138e-05, "loss": 0.362, "step": 1438 }, { "epoch": 1.815141955835962, "grad_norm": 0.3401442519248401, "learning_rate": 2.1913002806361085e-05, "loss": 0.3501, "step": 1439 }, { "epoch": 1.816403785488959, "grad_norm": 0.35765412877680985, "learning_rate": 2.1889616463985033e-05, "loss": 0.3547, "step": 1440 }, { "epoch": 1.817665615141956, "grad_norm": 0.2615584212734026, "learning_rate": 2.1866230121608984e-05, "loss": 0.3352, "step": 1441 }, { "epoch": 1.8189274447949528, "grad_norm": 0.2671290071350582, "learning_rate": 2.184284377923293e-05, "loss": 0.3666, "step": 1442 }, { "epoch": 1.8201892744479495, "grad_norm": 0.37924640227923373, "learning_rate": 2.1819457436856876e-05, "loss": 0.3447, "step": 1443 }, { "epoch": 1.8214511041009462, "grad_norm": 0.26373334902812096, "learning_rate": 2.1796071094480823e-05, "loss": 0.3584, "step": 1444 }, { "epoch": 1.8227129337539432, "grad_norm": 0.30967857342343413, "learning_rate": 2.177268475210477e-05, "loss": 0.3466, "step": 1445 }, { "epoch": 1.82397476340694, "grad_norm": 0.3358999745444896, "learning_rate": 2.174929840972872e-05, "loss": 0.3618, "step": 1446 }, { "epoch": 1.825236593059937, "grad_norm": 0.23902860426152167, "learning_rate": 2.1725912067352666e-05, "loss": 0.3741, "step": 1447 }, { "epoch": 1.8264984227129337, "grad_norm": 0.29129500924319973, "learning_rate": 2.1702525724976614e-05, "loss": 0.3723, "step": 1448 }, { "epoch": 1.8277602523659306, "grad_norm": 0.30732468633603677, "learning_rate": 2.1679139382600565e-05, "loss": 0.3577, "step": 1449 }, { "epoch": 1.8290220820189274, "grad_norm": 0.28028745092619173, "learning_rate": 2.1655753040224512e-05, "loss": 0.3385, "step": 1450 }, { "epoch": 1.8302839116719243, "grad_norm": 0.26758496231341244, "learning_rate": 2.1632366697848456e-05, "loss": 0.3587, "step": 1451 }, { "epoch": 1.8315457413249212, "grad_norm": 0.3415561731709248, "learning_rate": 2.1608980355472404e-05, "loss": 0.3701, "step": 1452 }, { "epoch": 1.8328075709779181, "grad_norm": 0.2779332403678166, "learning_rate": 2.158559401309635e-05, "loss": 0.3879, "step": 1453 }, { "epoch": 1.8340694006309148, "grad_norm": 0.33675249791259687, "learning_rate": 2.15622076707203e-05, "loss": 0.3609, "step": 1454 }, { "epoch": 1.8353312302839115, "grad_norm": 0.308448771088418, "learning_rate": 2.1538821328344247e-05, "loss": 0.3611, "step": 1455 }, { "epoch": 1.8365930599369085, "grad_norm": 0.2949029935783161, "learning_rate": 2.1515434985968198e-05, "loss": 0.3699, "step": 1456 }, { "epoch": 1.8378548895899054, "grad_norm": 0.28618767002064627, "learning_rate": 2.1492048643592145e-05, "loss": 0.3446, "step": 1457 }, { "epoch": 1.8391167192429023, "grad_norm": 0.28525898574836633, "learning_rate": 2.1468662301216093e-05, "loss": 0.3734, "step": 1458 }, { "epoch": 1.840378548895899, "grad_norm": 0.27370130430511774, "learning_rate": 2.1445275958840037e-05, "loss": 0.3483, "step": 1459 }, { "epoch": 1.841640378548896, "grad_norm": 0.25781859010854313, "learning_rate": 2.1421889616463985e-05, "loss": 0.3457, "step": 1460 }, { "epoch": 1.8429022082018927, "grad_norm": 0.2561525457528687, "learning_rate": 2.1398503274087932e-05, "loss": 0.3615, "step": 1461 }, { "epoch": 1.8441640378548896, "grad_norm": 0.33395216256168975, "learning_rate": 2.137511693171188e-05, "loss": 0.3679, "step": 1462 }, { "epoch": 1.8454258675078865, "grad_norm": 0.30418259063863623, "learning_rate": 2.1351730589335827e-05, "loss": 0.3549, "step": 1463 }, { "epoch": 1.8466876971608834, "grad_norm": 0.24548377197873006, "learning_rate": 2.132834424695978e-05, "loss": 0.3569, "step": 1464 }, { "epoch": 1.8479495268138801, "grad_norm": 0.27986604618347505, "learning_rate": 2.1304957904583726e-05, "loss": 0.3626, "step": 1465 }, { "epoch": 1.8492113564668768, "grad_norm": 0.319830516983045, "learning_rate": 2.1281571562207673e-05, "loss": 0.381, "step": 1466 }, { "epoch": 1.8504731861198738, "grad_norm": 0.2729248868531213, "learning_rate": 2.125818521983162e-05, "loss": 0.3757, "step": 1467 }, { "epoch": 1.8517350157728707, "grad_norm": 0.278072735221967, "learning_rate": 2.1234798877455565e-05, "loss": 0.3616, "step": 1468 }, { "epoch": 1.8529968454258676, "grad_norm": 0.32971884809216545, "learning_rate": 2.1211412535079513e-05, "loss": 0.3789, "step": 1469 }, { "epoch": 1.8542586750788643, "grad_norm": 0.2992461142302705, "learning_rate": 2.118802619270346e-05, "loss": 0.3455, "step": 1470 }, { "epoch": 1.8555205047318613, "grad_norm": 0.257542546226757, "learning_rate": 2.1164639850327408e-05, "loss": 0.366, "step": 1471 }, { "epoch": 1.856782334384858, "grad_norm": 0.3516025437913823, "learning_rate": 2.114125350795136e-05, "loss": 0.3527, "step": 1472 }, { "epoch": 1.8580441640378549, "grad_norm": 0.29616065082672755, "learning_rate": 2.1117867165575307e-05, "loss": 0.3386, "step": 1473 }, { "epoch": 1.8593059936908518, "grad_norm": 0.25925890566439974, "learning_rate": 2.1094480823199254e-05, "loss": 0.3656, "step": 1474 }, { "epoch": 1.8605678233438487, "grad_norm": 0.31627691974816835, "learning_rate": 2.1071094480823202e-05, "loss": 0.3575, "step": 1475 }, { "epoch": 1.8618296529968454, "grad_norm": 0.2950594361343604, "learning_rate": 2.1047708138447146e-05, "loss": 0.3743, "step": 1476 }, { "epoch": 1.8630914826498421, "grad_norm": 0.28467457443742195, "learning_rate": 2.1024321796071093e-05, "loss": 0.3655, "step": 1477 }, { "epoch": 1.864353312302839, "grad_norm": 0.2913592324448357, "learning_rate": 2.100093545369504e-05, "loss": 0.3545, "step": 1478 }, { "epoch": 1.865615141955836, "grad_norm": 0.26799750622827906, "learning_rate": 2.0977549111318992e-05, "loss": 0.3519, "step": 1479 }, { "epoch": 1.866876971608833, "grad_norm": 0.2762577753806236, "learning_rate": 2.095416276894294e-05, "loss": 0.3583, "step": 1480 }, { "epoch": 1.8681388012618296, "grad_norm": 0.27418698324216056, "learning_rate": 2.0930776426566887e-05, "loss": 0.3491, "step": 1481 }, { "epoch": 1.8694006309148263, "grad_norm": 0.3375592209229033, "learning_rate": 2.0907390084190835e-05, "loss": 0.3744, "step": 1482 }, { "epoch": 1.8706624605678233, "grad_norm": 0.27664990952963386, "learning_rate": 2.0884003741814782e-05, "loss": 0.3602, "step": 1483 }, { "epoch": 1.8719242902208202, "grad_norm": 0.31185162831114477, "learning_rate": 2.086061739943873e-05, "loss": 0.345, "step": 1484 }, { "epoch": 1.8731861198738171, "grad_norm": 0.30656324209644187, "learning_rate": 2.0837231057062674e-05, "loss": 0.3727, "step": 1485 }, { "epoch": 1.874447949526814, "grad_norm": 0.34622049717482645, "learning_rate": 2.0813844714686622e-05, "loss": 0.3637, "step": 1486 }, { "epoch": 1.8757097791798107, "grad_norm": 0.2715974816908054, "learning_rate": 2.0790458372310573e-05, "loss": 0.3676, "step": 1487 }, { "epoch": 1.8769716088328074, "grad_norm": 0.3256876276463631, "learning_rate": 2.076707202993452e-05, "loss": 0.376, "step": 1488 }, { "epoch": 1.8782334384858044, "grad_norm": 0.2922465428979275, "learning_rate": 2.0743685687558468e-05, "loss": 0.3686, "step": 1489 }, { "epoch": 1.8794952681388013, "grad_norm": 0.31903914980170706, "learning_rate": 2.0720299345182415e-05, "loss": 0.3698, "step": 1490 }, { "epoch": 1.8807570977917982, "grad_norm": 0.30700735052843636, "learning_rate": 2.0696913002806363e-05, "loss": 0.3695, "step": 1491 }, { "epoch": 1.882018927444795, "grad_norm": 0.2606514027046226, "learning_rate": 2.067352666043031e-05, "loss": 0.3818, "step": 1492 }, { "epoch": 1.8832807570977916, "grad_norm": 0.29246614158327433, "learning_rate": 2.0650140318054255e-05, "loss": 0.3636, "step": 1493 }, { "epoch": 1.8845425867507886, "grad_norm": 0.29418959903623654, "learning_rate": 2.0626753975678202e-05, "loss": 0.3368, "step": 1494 }, { "epoch": 1.8858044164037855, "grad_norm": 0.2664669200064647, "learning_rate": 2.0603367633302153e-05, "loss": 0.3745, "step": 1495 }, { "epoch": 1.8870662460567824, "grad_norm": 0.30409666068120644, "learning_rate": 2.05799812909261e-05, "loss": 0.356, "step": 1496 }, { "epoch": 1.8883280757097793, "grad_norm": 0.2630244301434001, "learning_rate": 2.055659494855005e-05, "loss": 0.3583, "step": 1497 }, { "epoch": 1.889589905362776, "grad_norm": 0.25996481042098435, "learning_rate": 2.0533208606173996e-05, "loss": 0.3566, "step": 1498 }, { "epoch": 1.8908517350157727, "grad_norm": 0.23940060509491318, "learning_rate": 2.0509822263797944e-05, "loss": 0.3483, "step": 1499 }, { "epoch": 1.8921135646687697, "grad_norm": 0.29862403941512455, "learning_rate": 2.048643592142189e-05, "loss": 0.3528, "step": 1500 }, { "epoch": 1.8933753943217666, "grad_norm": 0.2657512590384999, "learning_rate": 2.0463049579045835e-05, "loss": 0.3523, "step": 1501 }, { "epoch": 1.8946372239747635, "grad_norm": 0.27557383250051304, "learning_rate": 2.0439663236669786e-05, "loss": 0.3559, "step": 1502 }, { "epoch": 1.8958990536277602, "grad_norm": 0.2860782664941291, "learning_rate": 2.0416276894293734e-05, "loss": 0.3806, "step": 1503 }, { "epoch": 1.897160883280757, "grad_norm": 0.3199158536642191, "learning_rate": 2.039289055191768e-05, "loss": 0.3615, "step": 1504 }, { "epoch": 1.8984227129337539, "grad_norm": 0.30938673740440703, "learning_rate": 2.036950420954163e-05, "loss": 0.3511, "step": 1505 }, { "epoch": 1.8996845425867508, "grad_norm": 0.2607179693361668, "learning_rate": 2.0346117867165577e-05, "loss": 0.367, "step": 1506 }, { "epoch": 1.9009463722397477, "grad_norm": 0.33409092324723993, "learning_rate": 2.0322731524789524e-05, "loss": 0.3628, "step": 1507 }, { "epoch": 1.9022082018927446, "grad_norm": 0.2986567995494392, "learning_rate": 2.0299345182413472e-05, "loss": 0.3343, "step": 1508 }, { "epoch": 1.9034700315457413, "grad_norm": 0.27511290400972177, "learning_rate": 2.027595884003742e-05, "loss": 0.3668, "step": 1509 }, { "epoch": 1.904731861198738, "grad_norm": 0.34518107535362214, "learning_rate": 2.0252572497661367e-05, "loss": 0.3503, "step": 1510 }, { "epoch": 1.905993690851735, "grad_norm": 0.2587458269976321, "learning_rate": 2.0229186155285315e-05, "loss": 0.3687, "step": 1511 }, { "epoch": 1.907255520504732, "grad_norm": 0.27665312197222697, "learning_rate": 2.0205799812909262e-05, "loss": 0.3392, "step": 1512 }, { "epoch": 1.9085173501577288, "grad_norm": 0.3279964713469947, "learning_rate": 2.018241347053321e-05, "loss": 0.3593, "step": 1513 }, { "epoch": 1.9097791798107255, "grad_norm": 0.3299746478740882, "learning_rate": 2.0159027128157157e-05, "loss": 0.3721, "step": 1514 }, { "epoch": 1.9110410094637222, "grad_norm": 0.32837453963094176, "learning_rate": 2.0135640785781105e-05, "loss": 0.3449, "step": 1515 }, { "epoch": 1.9123028391167192, "grad_norm": 0.2591394128172102, "learning_rate": 2.0112254443405053e-05, "loss": 0.369, "step": 1516 }, { "epoch": 1.913564668769716, "grad_norm": 0.27907683103088093, "learning_rate": 2.0088868101029e-05, "loss": 0.364, "step": 1517 }, { "epoch": 1.914826498422713, "grad_norm": 0.29712698885576627, "learning_rate": 2.0065481758652948e-05, "loss": 0.3698, "step": 1518 }, { "epoch": 1.91608832807571, "grad_norm": 0.2724104562815062, "learning_rate": 2.0042095416276895e-05, "loss": 0.3539, "step": 1519 }, { "epoch": 1.9173501577287066, "grad_norm": 0.2452285965048993, "learning_rate": 2.0018709073900843e-05, "loss": 0.3505, "step": 1520 }, { "epoch": 1.9186119873817034, "grad_norm": 0.280883400440155, "learning_rate": 1.999532273152479e-05, "loss": 0.3692, "step": 1521 }, { "epoch": 1.9198738170347003, "grad_norm": 0.2742429349465196, "learning_rate": 1.9971936389148738e-05, "loss": 0.3508, "step": 1522 }, { "epoch": 1.9211356466876972, "grad_norm": 0.31893633636161595, "learning_rate": 1.9948550046772686e-05, "loss": 0.357, "step": 1523 }, { "epoch": 1.9223974763406941, "grad_norm": 0.2515058115113471, "learning_rate": 1.9925163704396633e-05, "loss": 0.3523, "step": 1524 }, { "epoch": 1.9236593059936908, "grad_norm": 0.28322161289366193, "learning_rate": 1.990177736202058e-05, "loss": 0.373, "step": 1525 }, { "epoch": 1.9249211356466875, "grad_norm": 0.30576364961496194, "learning_rate": 1.987839101964453e-05, "loss": 0.3581, "step": 1526 }, { "epoch": 1.9261829652996845, "grad_norm": 0.29964971247964906, "learning_rate": 1.9855004677268476e-05, "loss": 0.3611, "step": 1527 }, { "epoch": 1.9274447949526814, "grad_norm": 0.28544689605706997, "learning_rate": 1.9831618334892424e-05, "loss": 0.3541, "step": 1528 }, { "epoch": 1.9287066246056783, "grad_norm": 0.2638912999690251, "learning_rate": 1.980823199251637e-05, "loss": 0.375, "step": 1529 }, { "epoch": 1.929968454258675, "grad_norm": 0.3207983653125962, "learning_rate": 1.978484565014032e-05, "loss": 0.3762, "step": 1530 }, { "epoch": 1.931230283911672, "grad_norm": 0.2879139566354284, "learning_rate": 1.9761459307764266e-05, "loss": 0.3463, "step": 1531 }, { "epoch": 1.9324921135646687, "grad_norm": 0.25165699961135474, "learning_rate": 1.9738072965388214e-05, "loss": 0.3481, "step": 1532 }, { "epoch": 1.9337539432176656, "grad_norm": 0.28721629727084547, "learning_rate": 1.971468662301216e-05, "loss": 0.3437, "step": 1533 }, { "epoch": 1.9350157728706625, "grad_norm": 0.2904105325696281, "learning_rate": 1.969130028063611e-05, "loss": 0.3527, "step": 1534 }, { "epoch": 1.9362776025236594, "grad_norm": 0.2754121143979274, "learning_rate": 1.9667913938260057e-05, "loss": 0.3466, "step": 1535 }, { "epoch": 1.9375394321766561, "grad_norm": 0.2966296355795464, "learning_rate": 1.9644527595884004e-05, "loss": 0.3764, "step": 1536 }, { "epoch": 1.9388012618296528, "grad_norm": 0.32801857453553385, "learning_rate": 1.9621141253507952e-05, "loss": 0.3528, "step": 1537 }, { "epoch": 1.9400630914826498, "grad_norm": 0.4676335265813418, "learning_rate": 1.95977549111319e-05, "loss": 0.3893, "step": 1538 }, { "epoch": 1.9413249211356467, "grad_norm": 0.3976644061912845, "learning_rate": 1.9574368568755847e-05, "loss": 0.3839, "step": 1539 }, { "epoch": 1.9425867507886436, "grad_norm": 0.35022932417411873, "learning_rate": 1.9550982226379798e-05, "loss": 0.3506, "step": 1540 }, { "epoch": 1.9438485804416403, "grad_norm": 0.28821484103098854, "learning_rate": 1.9527595884003742e-05, "loss": 0.3841, "step": 1541 }, { "epoch": 1.9451104100946373, "grad_norm": 0.31050251996155986, "learning_rate": 1.950420954162769e-05, "loss": 0.3549, "step": 1542 }, { "epoch": 1.946372239747634, "grad_norm": 0.3025421568784994, "learning_rate": 1.9480823199251637e-05, "loss": 0.3565, "step": 1543 }, { "epoch": 1.9476340694006309, "grad_norm": 0.3178137078650446, "learning_rate": 1.9457436856875585e-05, "loss": 0.361, "step": 1544 }, { "epoch": 1.9488958990536278, "grad_norm": 0.2898960318084051, "learning_rate": 1.9434050514499532e-05, "loss": 0.364, "step": 1545 }, { "epoch": 1.9501577287066247, "grad_norm": 0.2639020704853177, "learning_rate": 1.941066417212348e-05, "loss": 0.3585, "step": 1546 }, { "epoch": 1.9514195583596214, "grad_norm": 0.31218686835044734, "learning_rate": 1.9387277829747428e-05, "loss": 0.3488, "step": 1547 }, { "epoch": 1.9526813880126181, "grad_norm": 0.28787828011915295, "learning_rate": 1.936389148737138e-05, "loss": 0.3369, "step": 1548 }, { "epoch": 1.953943217665615, "grad_norm": 0.25803765390129735, "learning_rate": 1.9340505144995326e-05, "loss": 0.3538, "step": 1549 }, { "epoch": 1.955205047318612, "grad_norm": 0.3327349189823211, "learning_rate": 1.931711880261927e-05, "loss": 0.3785, "step": 1550 }, { "epoch": 1.956466876971609, "grad_norm": 0.263164925348699, "learning_rate": 1.9293732460243218e-05, "loss": 0.3633, "step": 1551 }, { "epoch": 1.9577287066246056, "grad_norm": 0.28465465551167796, "learning_rate": 1.9270346117867166e-05, "loss": 0.355, "step": 1552 }, { "epoch": 1.9589905362776026, "grad_norm": 0.26688659526439945, "learning_rate": 1.9246959775491113e-05, "loss": 0.3595, "step": 1553 }, { "epoch": 1.9602523659305993, "grad_norm": 0.2915036199854609, "learning_rate": 1.922357343311506e-05, "loss": 0.3473, "step": 1554 }, { "epoch": 1.9615141955835962, "grad_norm": 0.29754111428949115, "learning_rate": 1.9200187090739008e-05, "loss": 0.3456, "step": 1555 }, { "epoch": 1.9627760252365931, "grad_norm": 0.3288414369231714, "learning_rate": 1.917680074836296e-05, "loss": 0.3777, "step": 1556 }, { "epoch": 1.96403785488959, "grad_norm": 0.284466574727442, "learning_rate": 1.9153414405986907e-05, "loss": 0.358, "step": 1557 }, { "epoch": 1.9652996845425867, "grad_norm": 0.2868146663334334, "learning_rate": 1.913002806361085e-05, "loss": 0.347, "step": 1558 }, { "epoch": 1.9665615141955834, "grad_norm": 0.30641231848968653, "learning_rate": 1.91066417212348e-05, "loss": 0.358, "step": 1559 }, { "epoch": 1.9678233438485804, "grad_norm": 0.27422915092097966, "learning_rate": 1.9083255378858746e-05, "loss": 0.3527, "step": 1560 }, { "epoch": 1.9690851735015773, "grad_norm": 0.2801226079550089, "learning_rate": 1.9059869036482694e-05, "loss": 0.3534, "step": 1561 }, { "epoch": 1.9703470031545742, "grad_norm": 0.2866414024612034, "learning_rate": 1.903648269410664e-05, "loss": 0.3625, "step": 1562 }, { "epoch": 1.971608832807571, "grad_norm": 0.26022729705124165, "learning_rate": 1.9013096351730592e-05, "loss": 0.3584, "step": 1563 }, { "epoch": 1.9728706624605679, "grad_norm": 0.29727477759668897, "learning_rate": 1.898971000935454e-05, "loss": 0.3605, "step": 1564 }, { "epoch": 1.9741324921135646, "grad_norm": 0.26125778276688916, "learning_rate": 1.8966323666978487e-05, "loss": 0.3412, "step": 1565 }, { "epoch": 1.9753943217665615, "grad_norm": 0.26061731606836985, "learning_rate": 1.894293732460243e-05, "loss": 0.3533, "step": 1566 }, { "epoch": 1.9766561514195584, "grad_norm": 0.2904736256928185, "learning_rate": 1.891955098222638e-05, "loss": 0.3686, "step": 1567 }, { "epoch": 1.9779179810725553, "grad_norm": 0.26546340427122683, "learning_rate": 1.8896164639850327e-05, "loss": 0.3213, "step": 1568 }, { "epoch": 1.979179810725552, "grad_norm": 0.2641867136285689, "learning_rate": 1.8872778297474274e-05, "loss": 0.365, "step": 1569 }, { "epoch": 1.9804416403785488, "grad_norm": 0.30101564687534915, "learning_rate": 1.8849391955098222e-05, "loss": 0.377, "step": 1570 }, { "epoch": 1.9817034700315457, "grad_norm": 0.27261163270156885, "learning_rate": 1.8826005612722173e-05, "loss": 0.3654, "step": 1571 }, { "epoch": 1.9829652996845426, "grad_norm": 0.2664263826190153, "learning_rate": 1.880261927034612e-05, "loss": 0.3465, "step": 1572 }, { "epoch": 1.9842271293375395, "grad_norm": 0.29356049793860944, "learning_rate": 1.8779232927970068e-05, "loss": 0.3706, "step": 1573 }, { "epoch": 1.9854889589905362, "grad_norm": 0.2734326098794183, "learning_rate": 1.8755846585594016e-05, "loss": 0.3431, "step": 1574 }, { "epoch": 1.9867507886435332, "grad_norm": 0.2677064800604729, "learning_rate": 1.873246024321796e-05, "loss": 0.3634, "step": 1575 }, { "epoch": 1.9880126182965299, "grad_norm": 0.3090476317276071, "learning_rate": 1.8709073900841907e-05, "loss": 0.371, "step": 1576 }, { "epoch": 1.9892744479495268, "grad_norm": 0.2800169189426272, "learning_rate": 1.8685687558465855e-05, "loss": 0.3485, "step": 1577 }, { "epoch": 1.9905362776025237, "grad_norm": 0.2887154685519294, "learning_rate": 1.8662301216089803e-05, "loss": 0.3686, "step": 1578 }, { "epoch": 1.9917981072555206, "grad_norm": 0.3044630062096668, "learning_rate": 1.8638914873713754e-05, "loss": 0.3536, "step": 1579 }, { "epoch": 1.9930599369085173, "grad_norm": 0.22830052437795442, "learning_rate": 1.86155285313377e-05, "loss": 0.3499, "step": 1580 }, { "epoch": 1.994321766561514, "grad_norm": 0.29630264527408745, "learning_rate": 1.859214218896165e-05, "loss": 0.364, "step": 1581 }, { "epoch": 1.995583596214511, "grad_norm": 0.3113093179547255, "learning_rate": 1.8568755846585596e-05, "loss": 0.3603, "step": 1582 }, { "epoch": 1.996845425867508, "grad_norm": 0.3083316654628617, "learning_rate": 1.854536950420954e-05, "loss": 0.3653, "step": 1583 }, { "epoch": 1.9981072555205048, "grad_norm": 0.3443627273870572, "learning_rate": 1.8521983161833488e-05, "loss": 0.3496, "step": 1584 }, { "epoch": 1.9993690851735015, "grad_norm": 0.2784030221010476, "learning_rate": 1.8498596819457436e-05, "loss": 0.3467, "step": 1585 }, { "epoch": 2.0, "grad_norm": 0.450195442582419, "learning_rate": 1.8475210477081387e-05, "loss": 0.32, "step": 1586 }, { "epoch": 2.001261829652997, "grad_norm": 0.31049943820504905, "learning_rate": 1.8451824134705334e-05, "loss": 0.2978, "step": 1587 }, { "epoch": 2.002523659305994, "grad_norm": 0.3046028456544412, "learning_rate": 1.8428437792329282e-05, "loss": 0.3017, "step": 1588 }, { "epoch": 2.0037854889589903, "grad_norm": 0.343545465720014, "learning_rate": 1.840505144995323e-05, "loss": 0.314, "step": 1589 }, { "epoch": 2.0050473186119873, "grad_norm": 0.24694773498429506, "learning_rate": 1.8381665107577177e-05, "loss": 0.2885, "step": 1590 }, { "epoch": 2.006309148264984, "grad_norm": 0.3656973449046354, "learning_rate": 1.835827876520112e-05, "loss": 0.2838, "step": 1591 }, { "epoch": 2.007570977917981, "grad_norm": 0.30418931917496284, "learning_rate": 1.833489242282507e-05, "loss": 0.2852, "step": 1592 }, { "epoch": 2.008832807570978, "grad_norm": 0.2717230635790713, "learning_rate": 1.8311506080449016e-05, "loss": 0.3024, "step": 1593 }, { "epoch": 2.010094637223975, "grad_norm": 0.27779065278371123, "learning_rate": 1.8288119738072967e-05, "loss": 0.281, "step": 1594 }, { "epoch": 2.0113564668769714, "grad_norm": 0.3191103313648123, "learning_rate": 1.8264733395696915e-05, "loss": 0.2977, "step": 1595 }, { "epoch": 2.0126182965299684, "grad_norm": 0.27873582182945966, "learning_rate": 1.8241347053320862e-05, "loss": 0.2919, "step": 1596 }, { "epoch": 2.0138801261829653, "grad_norm": 0.26851934098313457, "learning_rate": 1.821796071094481e-05, "loss": 0.2899, "step": 1597 }, { "epoch": 2.0151419558359622, "grad_norm": 0.27443550551086704, "learning_rate": 1.8194574368568758e-05, "loss": 0.2748, "step": 1598 }, { "epoch": 2.016403785488959, "grad_norm": 0.30851783532608235, "learning_rate": 1.8171188026192705e-05, "loss": 0.2828, "step": 1599 }, { "epoch": 2.0176656151419556, "grad_norm": 0.2582983655087739, "learning_rate": 1.814780168381665e-05, "loss": 0.3028, "step": 1600 }, { "epoch": 2.0189274447949526, "grad_norm": 0.32597439707377, "learning_rate": 1.81244153414406e-05, "loss": 0.3015, "step": 1601 }, { "epoch": 2.0201892744479495, "grad_norm": 0.3049982563181317, "learning_rate": 1.8101028999064548e-05, "loss": 0.282, "step": 1602 }, { "epoch": 2.0214511041009464, "grad_norm": 0.23145901139140004, "learning_rate": 1.8077642656688496e-05, "loss": 0.2882, "step": 1603 }, { "epoch": 2.0227129337539433, "grad_norm": 0.29075170653365306, "learning_rate": 1.8054256314312443e-05, "loss": 0.3009, "step": 1604 }, { "epoch": 2.0239747634069403, "grad_norm": 0.27078152430963426, "learning_rate": 1.803086997193639e-05, "loss": 0.2921, "step": 1605 }, { "epoch": 2.0252365930599368, "grad_norm": 0.24078328896392048, "learning_rate": 1.800748362956034e-05, "loss": 0.2833, "step": 1606 }, { "epoch": 2.0264984227129337, "grad_norm": 0.27413859238131855, "learning_rate": 1.7984097287184286e-05, "loss": 0.2824, "step": 1607 }, { "epoch": 2.0277602523659306, "grad_norm": 0.2846622173120609, "learning_rate": 1.796071094480823e-05, "loss": 0.2835, "step": 1608 }, { "epoch": 2.0290220820189275, "grad_norm": 0.24693679886115028, "learning_rate": 1.793732460243218e-05, "loss": 0.2888, "step": 1609 }, { "epoch": 2.0302839116719245, "grad_norm": 0.26081473180908754, "learning_rate": 1.791393826005613e-05, "loss": 0.2882, "step": 1610 }, { "epoch": 2.031545741324921, "grad_norm": 0.2781650027137441, "learning_rate": 1.7890551917680076e-05, "loss": 0.2867, "step": 1611 }, { "epoch": 2.032807570977918, "grad_norm": 0.27136706454221854, "learning_rate": 1.7867165575304024e-05, "loss": 0.2854, "step": 1612 }, { "epoch": 2.034069400630915, "grad_norm": 0.2687015166227358, "learning_rate": 1.784377923292797e-05, "loss": 0.2908, "step": 1613 }, { "epoch": 2.0353312302839117, "grad_norm": 0.265759688175269, "learning_rate": 1.782039289055192e-05, "loss": 0.289, "step": 1614 }, { "epoch": 2.0365930599369086, "grad_norm": 0.2529757166111857, "learning_rate": 1.7797006548175867e-05, "loss": 0.2765, "step": 1615 }, { "epoch": 2.0378548895899056, "grad_norm": 0.21692379340765086, "learning_rate": 1.7773620205799814e-05, "loss": 0.2616, "step": 1616 }, { "epoch": 2.039116719242902, "grad_norm": 0.24928121655352437, "learning_rate": 1.7750233863423762e-05, "loss": 0.2762, "step": 1617 }, { "epoch": 2.040378548895899, "grad_norm": 0.2562727242100018, "learning_rate": 1.772684752104771e-05, "loss": 0.2769, "step": 1618 }, { "epoch": 2.041640378548896, "grad_norm": 0.27952160769930684, "learning_rate": 1.7703461178671657e-05, "loss": 0.3005, "step": 1619 }, { "epoch": 2.042902208201893, "grad_norm": 0.22960394793806022, "learning_rate": 1.7680074836295604e-05, "loss": 0.2792, "step": 1620 }, { "epoch": 2.0441640378548898, "grad_norm": 0.22737648947032832, "learning_rate": 1.7656688493919552e-05, "loss": 0.2767, "step": 1621 }, { "epoch": 2.0454258675078862, "grad_norm": 0.29046469253042756, "learning_rate": 1.76333021515435e-05, "loss": 0.3004, "step": 1622 }, { "epoch": 2.046687697160883, "grad_norm": 0.2506507603420223, "learning_rate": 1.7609915809167447e-05, "loss": 0.2774, "step": 1623 }, { "epoch": 2.04794952681388, "grad_norm": 0.22368399217533913, "learning_rate": 1.7586529466791395e-05, "loss": 0.2893, "step": 1624 }, { "epoch": 2.049211356466877, "grad_norm": 0.2880248919708022, "learning_rate": 1.7563143124415342e-05, "loss": 0.2852, "step": 1625 }, { "epoch": 2.050473186119874, "grad_norm": 0.26634672444625523, "learning_rate": 1.753975678203929e-05, "loss": 0.2876, "step": 1626 }, { "epoch": 2.051735015772871, "grad_norm": 0.22859121539954402, "learning_rate": 1.7516370439663238e-05, "loss": 0.2673, "step": 1627 }, { "epoch": 2.0529968454258674, "grad_norm": 0.27777084644355965, "learning_rate": 1.7492984097287185e-05, "loss": 0.2839, "step": 1628 }, { "epoch": 2.0542586750788643, "grad_norm": 0.23885636565558743, "learning_rate": 1.7469597754911133e-05, "loss": 0.2821, "step": 1629 }, { "epoch": 2.055520504731861, "grad_norm": 0.23174859597705588, "learning_rate": 1.744621141253508e-05, "loss": 0.2694, "step": 1630 }, { "epoch": 2.056782334384858, "grad_norm": 0.23385829737609184, "learning_rate": 1.7422825070159028e-05, "loss": 0.2837, "step": 1631 }, { "epoch": 2.058044164037855, "grad_norm": 0.2545738679114364, "learning_rate": 1.7399438727782975e-05, "loss": 0.2791, "step": 1632 }, { "epoch": 2.0593059936908515, "grad_norm": 0.24415713464484304, "learning_rate": 1.7376052385406923e-05, "loss": 0.279, "step": 1633 }, { "epoch": 2.0605678233438485, "grad_norm": 0.2519190290907613, "learning_rate": 1.735266604303087e-05, "loss": 0.2825, "step": 1634 }, { "epoch": 2.0618296529968454, "grad_norm": 0.2304016909452736, "learning_rate": 1.7329279700654818e-05, "loss": 0.2823, "step": 1635 }, { "epoch": 2.0630914826498423, "grad_norm": 0.3250125165384863, "learning_rate": 1.7305893358278766e-05, "loss": 0.3057, "step": 1636 }, { "epoch": 2.0643533123028392, "grad_norm": 0.24757008949309234, "learning_rate": 1.7282507015902713e-05, "loss": 0.2839, "step": 1637 }, { "epoch": 2.065615141955836, "grad_norm": 0.24467706557427427, "learning_rate": 1.725912067352666e-05, "loss": 0.2984, "step": 1638 }, { "epoch": 2.0668769716088327, "grad_norm": 0.2623288050550035, "learning_rate": 1.723573433115061e-05, "loss": 0.2778, "step": 1639 }, { "epoch": 2.0681388012618296, "grad_norm": 0.27391348933822635, "learning_rate": 1.7212347988774556e-05, "loss": 0.2752, "step": 1640 }, { "epoch": 2.0694006309148265, "grad_norm": 0.24311769442849693, "learning_rate": 1.7188961646398504e-05, "loss": 0.2806, "step": 1641 }, { "epoch": 2.0706624605678234, "grad_norm": 0.28278538573034023, "learning_rate": 1.716557530402245e-05, "loss": 0.2906, "step": 1642 }, { "epoch": 2.0719242902208204, "grad_norm": 0.2905600597255147, "learning_rate": 1.71421889616464e-05, "loss": 0.3046, "step": 1643 }, { "epoch": 2.073186119873817, "grad_norm": 0.22696716401980566, "learning_rate": 1.7118802619270346e-05, "loss": 0.2908, "step": 1644 }, { "epoch": 2.0744479495268138, "grad_norm": 0.2542531463996861, "learning_rate": 1.7095416276894294e-05, "loss": 0.2777, "step": 1645 }, { "epoch": 2.0757097791798107, "grad_norm": 0.3030580025408155, "learning_rate": 1.707202993451824e-05, "loss": 0.2837, "step": 1646 }, { "epoch": 2.0769716088328076, "grad_norm": 0.2514121568751174, "learning_rate": 1.7048643592142193e-05, "loss": 0.2949, "step": 1647 }, { "epoch": 2.0782334384858046, "grad_norm": 0.24495037464278693, "learning_rate": 1.7025257249766137e-05, "loss": 0.2813, "step": 1648 }, { "epoch": 2.0794952681388015, "grad_norm": 0.26838043455067406, "learning_rate": 1.7001870907390084e-05, "loss": 0.2945, "step": 1649 }, { "epoch": 2.080757097791798, "grad_norm": 0.23885808457344018, "learning_rate": 1.6978484565014032e-05, "loss": 0.2799, "step": 1650 }, { "epoch": 2.082018927444795, "grad_norm": 0.28570761883960444, "learning_rate": 1.695509822263798e-05, "loss": 0.2958, "step": 1651 }, { "epoch": 2.083280757097792, "grad_norm": 0.24156705495307637, "learning_rate": 1.6931711880261927e-05, "loss": 0.2882, "step": 1652 }, { "epoch": 2.0845425867507887, "grad_norm": 0.2380933093821709, "learning_rate": 1.6908325537885875e-05, "loss": 0.2767, "step": 1653 }, { "epoch": 2.0858044164037857, "grad_norm": 0.2813791602630977, "learning_rate": 1.6884939195509822e-05, "loss": 0.2751, "step": 1654 }, { "epoch": 2.087066246056782, "grad_norm": 0.24715599463326127, "learning_rate": 1.6861552853133773e-05, "loss": 0.2986, "step": 1655 }, { "epoch": 2.088328075709779, "grad_norm": 0.25879275759410325, "learning_rate": 1.683816651075772e-05, "loss": 0.2852, "step": 1656 }, { "epoch": 2.089589905362776, "grad_norm": 0.2768513199254686, "learning_rate": 1.6814780168381665e-05, "loss": 0.2933, "step": 1657 }, { "epoch": 2.090851735015773, "grad_norm": 0.23417226891872164, "learning_rate": 1.6791393826005613e-05, "loss": 0.2936, "step": 1658 }, { "epoch": 2.09211356466877, "grad_norm": 0.25664632125404274, "learning_rate": 1.676800748362956e-05, "loss": 0.2923, "step": 1659 }, { "epoch": 2.0933753943217663, "grad_norm": 0.27715279108701957, "learning_rate": 1.6744621141253508e-05, "loss": 0.298, "step": 1660 }, { "epoch": 2.0946372239747633, "grad_norm": 0.26039593115174053, "learning_rate": 1.6721234798877455e-05, "loss": 0.293, "step": 1661 }, { "epoch": 2.09589905362776, "grad_norm": 0.2607914114057626, "learning_rate": 1.6697848456501403e-05, "loss": 0.2907, "step": 1662 }, { "epoch": 2.097160883280757, "grad_norm": 0.3111316456319916, "learning_rate": 1.6674462114125354e-05, "loss": 0.2901, "step": 1663 }, { "epoch": 2.098422712933754, "grad_norm": 0.2647094234384495, "learning_rate": 1.66510757717493e-05, "loss": 0.2772, "step": 1664 }, { "epoch": 2.099684542586751, "grad_norm": 0.27904153416460514, "learning_rate": 1.6627689429373246e-05, "loss": 0.2937, "step": 1665 }, { "epoch": 2.1009463722397475, "grad_norm": 0.2838166188530622, "learning_rate": 1.6604303086997193e-05, "loss": 0.2682, "step": 1666 }, { "epoch": 2.1022082018927444, "grad_norm": 0.3082423550460401, "learning_rate": 1.658091674462114e-05, "loss": 0.2873, "step": 1667 }, { "epoch": 2.1034700315457413, "grad_norm": 0.2666499176921043, "learning_rate": 1.655753040224509e-05, "loss": 0.2972, "step": 1668 }, { "epoch": 2.1047318611987382, "grad_norm": 0.23697343909393603, "learning_rate": 1.6534144059869036e-05, "loss": 0.2924, "step": 1669 }, { "epoch": 2.105993690851735, "grad_norm": 0.25402764313281084, "learning_rate": 1.6510757717492987e-05, "loss": 0.2949, "step": 1670 }, { "epoch": 2.107255520504732, "grad_norm": 0.26866764952832084, "learning_rate": 1.6487371375116935e-05, "loss": 0.2911, "step": 1671 }, { "epoch": 2.1085173501577286, "grad_norm": 0.23426877189391293, "learning_rate": 1.6463985032740882e-05, "loss": 0.2827, "step": 1672 }, { "epoch": 2.1097791798107255, "grad_norm": 0.23465214717616142, "learning_rate": 1.6440598690364826e-05, "loss": 0.278, "step": 1673 }, { "epoch": 2.1110410094637224, "grad_norm": 0.2479746054234925, "learning_rate": 1.6417212347988774e-05, "loss": 0.2793, "step": 1674 }, { "epoch": 2.1123028391167193, "grad_norm": 0.24613710612685163, "learning_rate": 1.639382600561272e-05, "loss": 0.2904, "step": 1675 }, { "epoch": 2.1135646687697163, "grad_norm": 0.25661188039472427, "learning_rate": 1.637043966323667e-05, "loss": 0.3092, "step": 1676 }, { "epoch": 2.1148264984227128, "grad_norm": 0.24456114018484754, "learning_rate": 1.6347053320860617e-05, "loss": 0.2974, "step": 1677 }, { "epoch": 2.1160883280757097, "grad_norm": 0.2549242999673381, "learning_rate": 1.6323666978484568e-05, "loss": 0.2901, "step": 1678 }, { "epoch": 2.1173501577287066, "grad_norm": 0.23766998330408862, "learning_rate": 1.6300280636108515e-05, "loss": 0.2855, "step": 1679 }, { "epoch": 2.1186119873817035, "grad_norm": 0.27070133549856196, "learning_rate": 1.6276894293732463e-05, "loss": 0.3007, "step": 1680 }, { "epoch": 2.1198738170347005, "grad_norm": 0.24837810336141297, "learning_rate": 1.625350795135641e-05, "loss": 0.2746, "step": 1681 }, { "epoch": 2.121135646687697, "grad_norm": 0.23783432779515798, "learning_rate": 1.6230121608980355e-05, "loss": 0.2896, "step": 1682 }, { "epoch": 2.122397476340694, "grad_norm": 0.24036300408831485, "learning_rate": 1.6206735266604302e-05, "loss": 0.2834, "step": 1683 }, { "epoch": 2.123659305993691, "grad_norm": 0.2840835896418943, "learning_rate": 1.618334892422825e-05, "loss": 0.2797, "step": 1684 }, { "epoch": 2.1249211356466877, "grad_norm": 0.2389347078076983, "learning_rate": 1.61599625818522e-05, "loss": 0.2713, "step": 1685 }, { "epoch": 2.1261829652996846, "grad_norm": 0.26379555947388966, "learning_rate": 1.6136576239476148e-05, "loss": 0.2827, "step": 1686 }, { "epoch": 2.1274447949526816, "grad_norm": 0.2455158006959945, "learning_rate": 1.6113189897100096e-05, "loss": 0.2799, "step": 1687 }, { "epoch": 2.128706624605678, "grad_norm": 0.26515315362204356, "learning_rate": 1.6089803554724043e-05, "loss": 0.2879, "step": 1688 }, { "epoch": 2.129968454258675, "grad_norm": 0.2596113300885751, "learning_rate": 1.606641721234799e-05, "loss": 0.2725, "step": 1689 }, { "epoch": 2.131230283911672, "grad_norm": 0.23505159567934306, "learning_rate": 1.6043030869971935e-05, "loss": 0.2914, "step": 1690 }, { "epoch": 2.132492113564669, "grad_norm": 0.22930420193431644, "learning_rate": 1.6019644527595883e-05, "loss": 0.2839, "step": 1691 }, { "epoch": 2.1337539432176658, "grad_norm": 0.24905028644267627, "learning_rate": 1.599625818521983e-05, "loss": 0.2712, "step": 1692 }, { "epoch": 2.1350157728706627, "grad_norm": 0.23288804272393926, "learning_rate": 1.597287184284378e-05, "loss": 0.2828, "step": 1693 }, { "epoch": 2.136277602523659, "grad_norm": 0.23968523270399325, "learning_rate": 1.594948550046773e-05, "loss": 0.2888, "step": 1694 }, { "epoch": 2.137539432176656, "grad_norm": 0.23417385979896838, "learning_rate": 1.5926099158091676e-05, "loss": 0.297, "step": 1695 }, { "epoch": 2.138801261829653, "grad_norm": 0.2548919481633086, "learning_rate": 1.5902712815715624e-05, "loss": 0.2872, "step": 1696 }, { "epoch": 2.14006309148265, "grad_norm": 0.236431148458568, "learning_rate": 1.587932647333957e-05, "loss": 0.2884, "step": 1697 }, { "epoch": 2.141324921135647, "grad_norm": 0.2564196608327053, "learning_rate": 1.5855940130963516e-05, "loss": 0.2794, "step": 1698 }, { "epoch": 2.1425867507886434, "grad_norm": 0.23647060959836197, "learning_rate": 1.5832553788587463e-05, "loss": 0.2819, "step": 1699 }, { "epoch": 2.1438485804416403, "grad_norm": 0.22356993259775346, "learning_rate": 1.580916744621141e-05, "loss": 0.3054, "step": 1700 }, { "epoch": 2.145110410094637, "grad_norm": 0.24154541113013664, "learning_rate": 1.5785781103835362e-05, "loss": 0.2816, "step": 1701 }, { "epoch": 2.146372239747634, "grad_norm": 0.24094978288452046, "learning_rate": 1.576239476145931e-05, "loss": 0.2836, "step": 1702 }, { "epoch": 2.147634069400631, "grad_norm": 0.2651080091850239, "learning_rate": 1.5739008419083257e-05, "loss": 0.3006, "step": 1703 }, { "epoch": 2.1488958990536275, "grad_norm": 0.2169587156852895, "learning_rate": 1.5715622076707205e-05, "loss": 0.2865, "step": 1704 }, { "epoch": 2.1501577287066245, "grad_norm": 0.23093461867266496, "learning_rate": 1.5692235734331152e-05, "loss": 0.2877, "step": 1705 }, { "epoch": 2.1514195583596214, "grad_norm": 0.22357962731150458, "learning_rate": 1.56688493919551e-05, "loss": 0.2809, "step": 1706 }, { "epoch": 2.1526813880126183, "grad_norm": 0.215834851340263, "learning_rate": 1.5645463049579044e-05, "loss": 0.2849, "step": 1707 }, { "epoch": 2.1539432176656153, "grad_norm": 0.2174404569186092, "learning_rate": 1.5622076707202995e-05, "loss": 0.2855, "step": 1708 }, { "epoch": 2.155205047318612, "grad_norm": 0.20838007188358737, "learning_rate": 1.5598690364826943e-05, "loss": 0.2687, "step": 1709 }, { "epoch": 2.1564668769716087, "grad_norm": 0.22703634692571986, "learning_rate": 1.557530402245089e-05, "loss": 0.3011, "step": 1710 }, { "epoch": 2.1577287066246056, "grad_norm": 0.22412872977916404, "learning_rate": 1.5551917680074838e-05, "loss": 0.2758, "step": 1711 }, { "epoch": 2.1589905362776025, "grad_norm": 0.22192084531816805, "learning_rate": 1.5528531337698785e-05, "loss": 0.2828, "step": 1712 }, { "epoch": 2.1602523659305994, "grad_norm": 0.22901386229752668, "learning_rate": 1.5505144995322733e-05, "loss": 0.2913, "step": 1713 }, { "epoch": 2.1615141955835964, "grad_norm": 0.2384891508522908, "learning_rate": 1.548175865294668e-05, "loss": 0.2756, "step": 1714 }, { "epoch": 2.1627760252365933, "grad_norm": 0.219981445616734, "learning_rate": 1.5458372310570625e-05, "loss": 0.2776, "step": 1715 }, { "epoch": 2.1640378548895898, "grad_norm": 0.24412316521119487, "learning_rate": 1.5434985968194576e-05, "loss": 0.2978, "step": 1716 }, { "epoch": 2.1652996845425867, "grad_norm": 0.22164213610486647, "learning_rate": 1.5411599625818523e-05, "loss": 0.2886, "step": 1717 }, { "epoch": 2.1665615141955836, "grad_norm": 0.22887734522825542, "learning_rate": 1.538821328344247e-05, "loss": 0.288, "step": 1718 }, { "epoch": 2.1678233438485806, "grad_norm": 0.23177305726844233, "learning_rate": 1.536482694106642e-05, "loss": 0.2966, "step": 1719 }, { "epoch": 2.1690851735015775, "grad_norm": 0.23808274654925804, "learning_rate": 1.5341440598690366e-05, "loss": 0.2992, "step": 1720 }, { "epoch": 2.170347003154574, "grad_norm": 0.21515469619851316, "learning_rate": 1.5318054256314314e-05, "loss": 0.2759, "step": 1721 }, { "epoch": 2.171608832807571, "grad_norm": 0.2397717719020172, "learning_rate": 1.529466791393826e-05, "loss": 0.2906, "step": 1722 }, { "epoch": 2.172870662460568, "grad_norm": 0.2199784680289094, "learning_rate": 1.5271281571562205e-05, "loss": 0.2877, "step": 1723 }, { "epoch": 2.1741324921135647, "grad_norm": 0.23566365120119398, "learning_rate": 1.5247895229186156e-05, "loss": 0.2861, "step": 1724 }, { "epoch": 2.1753943217665617, "grad_norm": 0.5295138831514458, "learning_rate": 1.5224508886810104e-05, "loss": 0.2826, "step": 1725 }, { "epoch": 2.176656151419558, "grad_norm": 0.23739868839256106, "learning_rate": 1.5201122544434052e-05, "loss": 0.2963, "step": 1726 }, { "epoch": 2.177917981072555, "grad_norm": 0.23157688210799146, "learning_rate": 1.5177736202057999e-05, "loss": 0.2818, "step": 1727 }, { "epoch": 2.179179810725552, "grad_norm": 0.24324505289773862, "learning_rate": 1.5154349859681947e-05, "loss": 0.2932, "step": 1728 }, { "epoch": 2.180441640378549, "grad_norm": 0.27169865269030474, "learning_rate": 1.5130963517305893e-05, "loss": 0.2925, "step": 1729 }, { "epoch": 2.181703470031546, "grad_norm": 0.23374243450500334, "learning_rate": 1.510757717492984e-05, "loss": 0.2813, "step": 1730 }, { "epoch": 2.182965299684543, "grad_norm": 0.2181682856705886, "learning_rate": 1.5084190832553791e-05, "loss": 0.2818, "step": 1731 }, { "epoch": 2.1842271293375393, "grad_norm": 0.23487382511596905, "learning_rate": 1.5060804490177739e-05, "loss": 0.2903, "step": 1732 }, { "epoch": 2.185488958990536, "grad_norm": 0.24093962108443437, "learning_rate": 1.5037418147801685e-05, "loss": 0.2886, "step": 1733 }, { "epoch": 2.186750788643533, "grad_norm": 0.21912914495286104, "learning_rate": 1.5014031805425632e-05, "loss": 0.287, "step": 1734 }, { "epoch": 2.18801261829653, "grad_norm": 0.22733451038616723, "learning_rate": 1.499064546304958e-05, "loss": 0.2926, "step": 1735 }, { "epoch": 2.189274447949527, "grad_norm": 0.23367953122283877, "learning_rate": 1.4967259120673527e-05, "loss": 0.2787, "step": 1736 }, { "epoch": 2.1905362776025235, "grad_norm": 0.22458929969708627, "learning_rate": 1.4943872778297475e-05, "loss": 0.288, "step": 1737 }, { "epoch": 2.1917981072555204, "grad_norm": 0.23885173807682125, "learning_rate": 1.492048643592142e-05, "loss": 0.2928, "step": 1738 }, { "epoch": 2.1930599369085173, "grad_norm": 0.24580873696544198, "learning_rate": 1.4897100093545372e-05, "loss": 0.3069, "step": 1739 }, { "epoch": 2.1943217665615142, "grad_norm": 0.21680942100649753, "learning_rate": 1.487371375116932e-05, "loss": 0.2801, "step": 1740 }, { "epoch": 2.195583596214511, "grad_norm": 0.23440776812673542, "learning_rate": 1.4850327408793265e-05, "loss": 0.2698, "step": 1741 }, { "epoch": 2.196845425867508, "grad_norm": 0.2511526599079675, "learning_rate": 1.4826941066417213e-05, "loss": 0.2909, "step": 1742 }, { "epoch": 2.1981072555205046, "grad_norm": 0.2213338043620079, "learning_rate": 1.480355472404116e-05, "loss": 0.2883, "step": 1743 }, { "epoch": 2.1993690851735015, "grad_norm": 0.22385569800006008, "learning_rate": 1.4780168381665108e-05, "loss": 0.2743, "step": 1744 }, { "epoch": 2.2006309148264984, "grad_norm": 0.2509942436214832, "learning_rate": 1.4756782039289056e-05, "loss": 0.2867, "step": 1745 }, { "epoch": 2.2018927444794953, "grad_norm": 0.23470144483693378, "learning_rate": 1.4733395696913001e-05, "loss": 0.282, "step": 1746 }, { "epoch": 2.2031545741324923, "grad_norm": 0.2518435758236852, "learning_rate": 1.4710009354536952e-05, "loss": 0.2766, "step": 1747 }, { "epoch": 2.2044164037854888, "grad_norm": 0.24883868828481068, "learning_rate": 1.46866230121609e-05, "loss": 0.2886, "step": 1748 }, { "epoch": 2.2056782334384857, "grad_norm": 0.22799947453736316, "learning_rate": 1.4663236669784846e-05, "loss": 0.2891, "step": 1749 }, { "epoch": 2.2069400630914826, "grad_norm": 0.2324002390188588, "learning_rate": 1.4639850327408793e-05, "loss": 0.2817, "step": 1750 }, { "epoch": 2.2082018927444795, "grad_norm": 0.2349950977340715, "learning_rate": 1.4616463985032741e-05, "loss": 0.2876, "step": 1751 }, { "epoch": 2.2094637223974765, "grad_norm": 0.28894369112959173, "learning_rate": 1.4593077642656689e-05, "loss": 0.2772, "step": 1752 }, { "epoch": 2.2107255520504734, "grad_norm": 0.22501549214774905, "learning_rate": 1.4569691300280636e-05, "loss": 0.2806, "step": 1753 }, { "epoch": 2.21198738170347, "grad_norm": 0.2278605972328639, "learning_rate": 1.4546304957904585e-05, "loss": 0.287, "step": 1754 }, { "epoch": 2.213249211356467, "grad_norm": 0.2162836054252456, "learning_rate": 1.4522918615528533e-05, "loss": 0.2733, "step": 1755 }, { "epoch": 2.2145110410094637, "grad_norm": 0.2686978189141461, "learning_rate": 1.449953227315248e-05, "loss": 0.2896, "step": 1756 }, { "epoch": 2.2157728706624606, "grad_norm": 0.21522488189147093, "learning_rate": 1.4476145930776428e-05, "loss": 0.2782, "step": 1757 }, { "epoch": 2.2170347003154576, "grad_norm": 0.24681313190559023, "learning_rate": 1.4452759588400374e-05, "loss": 0.2892, "step": 1758 }, { "epoch": 2.218296529968454, "grad_norm": 0.23088369634856445, "learning_rate": 1.4429373246024322e-05, "loss": 0.2787, "step": 1759 }, { "epoch": 2.219558359621451, "grad_norm": 0.2169094501133013, "learning_rate": 1.440598690364827e-05, "loss": 0.2729, "step": 1760 }, { "epoch": 2.220820189274448, "grad_norm": 0.22274307156209172, "learning_rate": 1.4382600561272217e-05, "loss": 0.2794, "step": 1761 }, { "epoch": 2.222082018927445, "grad_norm": 0.2505226728409246, "learning_rate": 1.4359214218896166e-05, "loss": 0.2947, "step": 1762 }, { "epoch": 2.2233438485804418, "grad_norm": 0.2323182292403782, "learning_rate": 1.4335827876520114e-05, "loss": 0.2894, "step": 1763 }, { "epoch": 2.2246056782334387, "grad_norm": 0.24419577104830334, "learning_rate": 1.4312441534144061e-05, "loss": 0.2757, "step": 1764 }, { "epoch": 2.225867507886435, "grad_norm": 0.23633322880733076, "learning_rate": 1.4289055191768009e-05, "loss": 0.2829, "step": 1765 }, { "epoch": 2.227129337539432, "grad_norm": 0.23391710436546667, "learning_rate": 1.4265668849391955e-05, "loss": 0.299, "step": 1766 }, { "epoch": 2.228391167192429, "grad_norm": 0.22600977247559562, "learning_rate": 1.4242282507015902e-05, "loss": 0.2831, "step": 1767 }, { "epoch": 2.229652996845426, "grad_norm": 0.24418809214054563, "learning_rate": 1.421889616463985e-05, "loss": 0.2942, "step": 1768 }, { "epoch": 2.230914826498423, "grad_norm": 0.2305509942669111, "learning_rate": 1.41955098222638e-05, "loss": 0.2829, "step": 1769 }, { "epoch": 2.2321766561514194, "grad_norm": 0.22647651424924883, "learning_rate": 1.4172123479887747e-05, "loss": 0.2909, "step": 1770 }, { "epoch": 2.2334384858044163, "grad_norm": 0.2384850851298168, "learning_rate": 1.4148737137511694e-05, "loss": 0.2748, "step": 1771 }, { "epoch": 2.234700315457413, "grad_norm": 0.215610362525108, "learning_rate": 1.4125350795135642e-05, "loss": 0.2926, "step": 1772 }, { "epoch": 2.23596214511041, "grad_norm": 0.2208794506016768, "learning_rate": 1.410196445275959e-05, "loss": 0.2852, "step": 1773 }, { "epoch": 2.237223974763407, "grad_norm": 0.24975008433166326, "learning_rate": 1.4078578110383535e-05, "loss": 0.2924, "step": 1774 }, { "epoch": 2.238485804416404, "grad_norm": 0.26867875779356726, "learning_rate": 1.4055191768007483e-05, "loss": 0.3064, "step": 1775 }, { "epoch": 2.2397476340694005, "grad_norm": 0.2328262402437395, "learning_rate": 1.403180542563143e-05, "loss": 0.2915, "step": 1776 }, { "epoch": 2.2410094637223974, "grad_norm": 0.23622235738383812, "learning_rate": 1.4008419083255382e-05, "loss": 0.2926, "step": 1777 }, { "epoch": 2.2422712933753943, "grad_norm": 0.21028437629005647, "learning_rate": 1.3985032740879327e-05, "loss": 0.2834, "step": 1778 }, { "epoch": 2.2435331230283913, "grad_norm": 0.21422788180717942, "learning_rate": 1.3961646398503275e-05, "loss": 0.2903, "step": 1779 }, { "epoch": 2.244794952681388, "grad_norm": 0.23712786439544586, "learning_rate": 1.3938260056127223e-05, "loss": 0.2855, "step": 1780 }, { "epoch": 2.2460567823343847, "grad_norm": 0.24038371069272446, "learning_rate": 1.391487371375117e-05, "loss": 0.296, "step": 1781 }, { "epoch": 2.2473186119873816, "grad_norm": 0.22853023911391687, "learning_rate": 1.3891487371375118e-05, "loss": 0.288, "step": 1782 }, { "epoch": 2.2485804416403785, "grad_norm": 0.22603796850769456, "learning_rate": 1.3868101028999064e-05, "loss": 0.2854, "step": 1783 }, { "epoch": 2.2498422712933754, "grad_norm": 0.23416060989897966, "learning_rate": 1.3844714686623011e-05, "loss": 0.2936, "step": 1784 }, { "epoch": 2.2511041009463724, "grad_norm": 0.21900333922300438, "learning_rate": 1.3821328344246962e-05, "loss": 0.2841, "step": 1785 }, { "epoch": 2.2523659305993693, "grad_norm": 0.24201567146197334, "learning_rate": 1.3797942001870908e-05, "loss": 0.2984, "step": 1786 }, { "epoch": 2.2536277602523658, "grad_norm": 0.45345717464133944, "learning_rate": 1.3774555659494856e-05, "loss": 0.3009, "step": 1787 }, { "epoch": 2.2548895899053627, "grad_norm": 0.22356996262881842, "learning_rate": 1.3751169317118803e-05, "loss": 0.2793, "step": 1788 }, { "epoch": 2.2561514195583596, "grad_norm": 0.23663351570461844, "learning_rate": 1.3727782974742751e-05, "loss": 0.2719, "step": 1789 }, { "epoch": 2.2574132492113566, "grad_norm": 0.24230475249043962, "learning_rate": 1.3704396632366698e-05, "loss": 0.2753, "step": 1790 }, { "epoch": 2.2586750788643535, "grad_norm": 0.22778926286236506, "learning_rate": 1.3681010289990644e-05, "loss": 0.2799, "step": 1791 }, { "epoch": 2.25993690851735, "grad_norm": 0.24811829498712742, "learning_rate": 1.3657623947614595e-05, "loss": 0.2869, "step": 1792 }, { "epoch": 2.261198738170347, "grad_norm": 0.227210213424066, "learning_rate": 1.3634237605238543e-05, "loss": 0.2666, "step": 1793 }, { "epoch": 2.262460567823344, "grad_norm": 0.23733361889312943, "learning_rate": 1.3610851262862489e-05, "loss": 0.3054, "step": 1794 }, { "epoch": 2.2637223974763407, "grad_norm": 0.23656438464078347, "learning_rate": 1.3587464920486436e-05, "loss": 0.2992, "step": 1795 }, { "epoch": 2.2649842271293377, "grad_norm": 0.22914290672710044, "learning_rate": 1.3564078578110384e-05, "loss": 0.2952, "step": 1796 }, { "epoch": 2.266246056782334, "grad_norm": 0.21613280728981624, "learning_rate": 1.3540692235734332e-05, "loss": 0.2828, "step": 1797 }, { "epoch": 2.267507886435331, "grad_norm": 0.2037407139463749, "learning_rate": 1.3517305893358279e-05, "loss": 0.275, "step": 1798 }, { "epoch": 2.268769716088328, "grad_norm": 0.24438514851162185, "learning_rate": 1.3493919550982225e-05, "loss": 0.29, "step": 1799 }, { "epoch": 2.270031545741325, "grad_norm": 0.24523129939754293, "learning_rate": 1.3470533208606176e-05, "loss": 0.3026, "step": 1800 }, { "epoch": 2.271293375394322, "grad_norm": 0.21734865659625638, "learning_rate": 1.3447146866230124e-05, "loss": 0.2797, "step": 1801 }, { "epoch": 2.272555205047319, "grad_norm": 0.21733886302367109, "learning_rate": 1.3423760523854071e-05, "loss": 0.2723, "step": 1802 }, { "epoch": 2.2738170347003153, "grad_norm": 0.26141224671500596, "learning_rate": 1.3400374181478017e-05, "loss": 0.2887, "step": 1803 }, { "epoch": 2.275078864353312, "grad_norm": 0.2494628462918937, "learning_rate": 1.3376987839101965e-05, "loss": 0.2981, "step": 1804 }, { "epoch": 2.276340694006309, "grad_norm": 0.2206060739659763, "learning_rate": 1.3353601496725912e-05, "loss": 0.2918, "step": 1805 }, { "epoch": 2.277602523659306, "grad_norm": 0.2385522532169825, "learning_rate": 1.333021515434986e-05, "loss": 0.2903, "step": 1806 }, { "epoch": 2.278864353312303, "grad_norm": 0.24894734608847136, "learning_rate": 1.3306828811973807e-05, "loss": 0.2751, "step": 1807 }, { "epoch": 2.2801261829653, "grad_norm": 0.2490949635033043, "learning_rate": 1.3283442469597757e-05, "loss": 0.2817, "step": 1808 }, { "epoch": 2.2813880126182964, "grad_norm": 0.21651365019744753, "learning_rate": 1.3260056127221704e-05, "loss": 0.2837, "step": 1809 }, { "epoch": 2.2826498422712933, "grad_norm": 0.23694929961602004, "learning_rate": 1.3236669784845652e-05, "loss": 0.2955, "step": 1810 }, { "epoch": 2.2839116719242902, "grad_norm": 0.23580957328429747, "learning_rate": 1.3213283442469598e-05, "loss": 0.2857, "step": 1811 }, { "epoch": 2.285173501577287, "grad_norm": 0.21500776480188705, "learning_rate": 1.3189897100093545e-05, "loss": 0.297, "step": 1812 }, { "epoch": 2.286435331230284, "grad_norm": 0.21282208759136736, "learning_rate": 1.3166510757717493e-05, "loss": 0.2862, "step": 1813 }, { "epoch": 2.2876971608832806, "grad_norm": 0.22205980174951293, "learning_rate": 1.314312441534144e-05, "loss": 0.2842, "step": 1814 }, { "epoch": 2.2889589905362775, "grad_norm": 0.2236585060637774, "learning_rate": 1.311973807296539e-05, "loss": 0.2895, "step": 1815 }, { "epoch": 2.2902208201892744, "grad_norm": 0.22894365791170015, "learning_rate": 1.3096351730589337e-05, "loss": 0.2776, "step": 1816 }, { "epoch": 2.2914826498422713, "grad_norm": 0.24836213951691777, "learning_rate": 1.3072965388213285e-05, "loss": 0.2698, "step": 1817 }, { "epoch": 2.2927444794952683, "grad_norm": 0.2108337638996842, "learning_rate": 1.3049579045837232e-05, "loss": 0.2847, "step": 1818 }, { "epoch": 2.2940063091482648, "grad_norm": 0.22254833323886558, "learning_rate": 1.3026192703461178e-05, "loss": 0.2762, "step": 1819 }, { "epoch": 2.2952681388012617, "grad_norm": 0.22597037630195763, "learning_rate": 1.3002806361085126e-05, "loss": 0.2854, "step": 1820 }, { "epoch": 2.2965299684542586, "grad_norm": 0.235640254730497, "learning_rate": 1.2979420018709073e-05, "loss": 0.2752, "step": 1821 }, { "epoch": 2.2977917981072555, "grad_norm": 0.23409950066781968, "learning_rate": 1.2956033676333021e-05, "loss": 0.3008, "step": 1822 }, { "epoch": 2.2990536277602525, "grad_norm": 0.22669181740518923, "learning_rate": 1.293264733395697e-05, "loss": 0.2948, "step": 1823 }, { "epoch": 2.3003154574132494, "grad_norm": 0.2242799796741699, "learning_rate": 1.2909260991580918e-05, "loss": 0.2883, "step": 1824 }, { "epoch": 2.301577287066246, "grad_norm": 0.2184262345432056, "learning_rate": 1.2885874649204865e-05, "loss": 0.2911, "step": 1825 }, { "epoch": 2.302839116719243, "grad_norm": 0.23357126216839652, "learning_rate": 1.2862488306828813e-05, "loss": 0.2867, "step": 1826 }, { "epoch": 2.3041009463722397, "grad_norm": 0.24216763920250872, "learning_rate": 1.283910196445276e-05, "loss": 0.2931, "step": 1827 }, { "epoch": 2.3053627760252366, "grad_norm": 0.2306512626548041, "learning_rate": 1.2815715622076707e-05, "loss": 0.2832, "step": 1828 }, { "epoch": 2.3066246056782336, "grad_norm": 0.2406121392319697, "learning_rate": 1.2792329279700654e-05, "loss": 0.2883, "step": 1829 }, { "epoch": 2.3078864353312305, "grad_norm": 0.24534414925224726, "learning_rate": 1.2768942937324602e-05, "loss": 0.3063, "step": 1830 }, { "epoch": 2.309148264984227, "grad_norm": 0.24275127412906566, "learning_rate": 1.2745556594948551e-05, "loss": 0.2836, "step": 1831 }, { "epoch": 2.310410094637224, "grad_norm": 0.24816791121432594, "learning_rate": 1.2722170252572499e-05, "loss": 0.2822, "step": 1832 }, { "epoch": 2.311671924290221, "grad_norm": 0.22756233596478775, "learning_rate": 1.2698783910196446e-05, "loss": 0.2757, "step": 1833 }, { "epoch": 2.3129337539432178, "grad_norm": 0.23603958675100475, "learning_rate": 1.2675397567820394e-05, "loss": 0.2891, "step": 1834 }, { "epoch": 2.3141955835962147, "grad_norm": 0.25487277772135275, "learning_rate": 1.2652011225444341e-05, "loss": 0.2975, "step": 1835 }, { "epoch": 2.315457413249211, "grad_norm": 0.24100927208264167, "learning_rate": 1.2628624883068287e-05, "loss": 0.2792, "step": 1836 }, { "epoch": 2.316719242902208, "grad_norm": 0.2472307770236215, "learning_rate": 1.2605238540692235e-05, "loss": 0.2985, "step": 1837 }, { "epoch": 2.317981072555205, "grad_norm": 0.21872927014373503, "learning_rate": 1.2581852198316186e-05, "loss": 0.275, "step": 1838 }, { "epoch": 2.319242902208202, "grad_norm": 0.2296191374870959, "learning_rate": 1.2558465855940132e-05, "loss": 0.297, "step": 1839 }, { "epoch": 2.320504731861199, "grad_norm": 0.2366365832554931, "learning_rate": 1.253507951356408e-05, "loss": 0.283, "step": 1840 }, { "epoch": 2.3217665615141954, "grad_norm": 0.2637642882643801, "learning_rate": 1.2511693171188027e-05, "loss": 0.2893, "step": 1841 }, { "epoch": 2.3230283911671923, "grad_norm": 0.23726665511774123, "learning_rate": 1.2488306828811974e-05, "loss": 0.2988, "step": 1842 }, { "epoch": 2.324290220820189, "grad_norm": 0.22698926361352406, "learning_rate": 1.2464920486435922e-05, "loss": 0.2886, "step": 1843 }, { "epoch": 2.325552050473186, "grad_norm": 0.24356951204261362, "learning_rate": 1.244153414405987e-05, "loss": 0.2783, "step": 1844 }, { "epoch": 2.326813880126183, "grad_norm": 0.2592871836721219, "learning_rate": 1.2418147801683817e-05, "loss": 0.2925, "step": 1845 }, { "epoch": 2.32807570977918, "grad_norm": 0.21770332008469645, "learning_rate": 1.2394761459307765e-05, "loss": 0.2686, "step": 1846 }, { "epoch": 2.3293375394321765, "grad_norm": 0.24066678611647885, "learning_rate": 1.2371375116931712e-05, "loss": 0.2784, "step": 1847 }, { "epoch": 2.3305993690851734, "grad_norm": 0.23988851968748487, "learning_rate": 1.234798877455566e-05, "loss": 0.2845, "step": 1848 }, { "epoch": 2.3318611987381703, "grad_norm": 0.23381637081659734, "learning_rate": 1.2324602432179607e-05, "loss": 0.2935, "step": 1849 }, { "epoch": 2.3331230283911673, "grad_norm": 0.21522812018151616, "learning_rate": 1.2301216089803555e-05, "loss": 0.2755, "step": 1850 }, { "epoch": 2.334384858044164, "grad_norm": 0.2339594338930035, "learning_rate": 1.2277829747427503e-05, "loss": 0.3026, "step": 1851 }, { "epoch": 2.335646687697161, "grad_norm": 0.2273839215861767, "learning_rate": 1.225444340505145e-05, "loss": 0.2917, "step": 1852 }, { "epoch": 2.3369085173501576, "grad_norm": 0.20444038803314282, "learning_rate": 1.2231057062675398e-05, "loss": 0.2728, "step": 1853 }, { "epoch": 2.3381703470031545, "grad_norm": 0.2231367607412206, "learning_rate": 1.2207670720299345e-05, "loss": 0.2933, "step": 1854 }, { "epoch": 2.3394321766561514, "grad_norm": 0.21936111594800117, "learning_rate": 1.2184284377923295e-05, "loss": 0.284, "step": 1855 }, { "epoch": 2.3406940063091484, "grad_norm": 0.25348455881629184, "learning_rate": 1.216089803554724e-05, "loss": 0.3111, "step": 1856 }, { "epoch": 2.3419558359621453, "grad_norm": 0.20976397381887635, "learning_rate": 1.2137511693171188e-05, "loss": 0.2735, "step": 1857 }, { "epoch": 2.3432176656151418, "grad_norm": 0.260123784256303, "learning_rate": 1.2114125350795136e-05, "loss": 0.294, "step": 1858 }, { "epoch": 2.3444794952681387, "grad_norm": 0.22938480165137018, "learning_rate": 1.2090739008419085e-05, "loss": 0.2769, "step": 1859 }, { "epoch": 2.3457413249211356, "grad_norm": 0.22185651059023676, "learning_rate": 1.2067352666043031e-05, "loss": 0.2711, "step": 1860 }, { "epoch": 2.3470031545741326, "grad_norm": 0.21041387289274832, "learning_rate": 1.2043966323666978e-05, "loss": 0.2784, "step": 1861 }, { "epoch": 2.3482649842271295, "grad_norm": 0.2196260215025504, "learning_rate": 1.2020579981290926e-05, "loss": 0.2891, "step": 1862 }, { "epoch": 2.349526813880126, "grad_norm": 0.22366498960263567, "learning_rate": 1.1997193638914875e-05, "loss": 0.2968, "step": 1863 }, { "epoch": 2.350788643533123, "grad_norm": 0.2230734989094266, "learning_rate": 1.1973807296538823e-05, "loss": 0.2819, "step": 1864 }, { "epoch": 2.35205047318612, "grad_norm": 0.2536829410262053, "learning_rate": 1.1950420954162769e-05, "loss": 0.2798, "step": 1865 }, { "epoch": 2.3533123028391167, "grad_norm": 0.2747782839155942, "learning_rate": 1.1927034611786716e-05, "loss": 0.2792, "step": 1866 }, { "epoch": 2.3545741324921137, "grad_norm": 0.21232552979291866, "learning_rate": 1.1903648269410666e-05, "loss": 0.2754, "step": 1867 }, { "epoch": 2.3558359621451106, "grad_norm": 0.221996862526714, "learning_rate": 1.1880261927034613e-05, "loss": 0.3008, "step": 1868 }, { "epoch": 2.357097791798107, "grad_norm": 0.2287962522492788, "learning_rate": 1.1856875584658559e-05, "loss": 0.2932, "step": 1869 }, { "epoch": 2.358359621451104, "grad_norm": 0.20891197789898555, "learning_rate": 1.1833489242282507e-05, "loss": 0.2696, "step": 1870 }, { "epoch": 2.359621451104101, "grad_norm": 0.22980172327856732, "learning_rate": 1.1810102899906456e-05, "loss": 0.2707, "step": 1871 }, { "epoch": 2.360883280757098, "grad_norm": 0.229138574935501, "learning_rate": 1.1786716557530404e-05, "loss": 0.3012, "step": 1872 }, { "epoch": 2.362145110410095, "grad_norm": 0.22721944803609134, "learning_rate": 1.176333021515435e-05, "loss": 0.2902, "step": 1873 }, { "epoch": 2.3634069400630917, "grad_norm": 0.23020221263374988, "learning_rate": 1.1739943872778299e-05, "loss": 0.2969, "step": 1874 }, { "epoch": 2.364668769716088, "grad_norm": 0.22285114746690868, "learning_rate": 1.1716557530402246e-05, "loss": 0.2883, "step": 1875 }, { "epoch": 2.365930599369085, "grad_norm": 0.21762056781429867, "learning_rate": 1.1693171188026194e-05, "loss": 0.2876, "step": 1876 }, { "epoch": 2.367192429022082, "grad_norm": 0.22744903222011975, "learning_rate": 1.166978484565014e-05, "loss": 0.2755, "step": 1877 }, { "epoch": 2.368454258675079, "grad_norm": 0.20999859869056842, "learning_rate": 1.1646398503274089e-05, "loss": 0.2839, "step": 1878 }, { "epoch": 2.369716088328076, "grad_norm": 0.23958579741409528, "learning_rate": 1.1623012160898037e-05, "loss": 0.2896, "step": 1879 }, { "epoch": 2.3709779179810724, "grad_norm": 0.2342985772004226, "learning_rate": 1.1599625818521984e-05, "loss": 0.2928, "step": 1880 }, { "epoch": 2.3722397476340693, "grad_norm": 0.2296389797840133, "learning_rate": 1.157623947614593e-05, "loss": 0.2841, "step": 1881 }, { "epoch": 2.3735015772870662, "grad_norm": 0.22575487195345628, "learning_rate": 1.155285313376988e-05, "loss": 0.3031, "step": 1882 }, { "epoch": 2.374763406940063, "grad_norm": 0.23509530061806663, "learning_rate": 1.1529466791393827e-05, "loss": 0.306, "step": 1883 }, { "epoch": 2.37602523659306, "grad_norm": 0.22102056928273153, "learning_rate": 1.1506080449017775e-05, "loss": 0.2826, "step": 1884 }, { "epoch": 2.3772870662460566, "grad_norm": 0.21964818624051669, "learning_rate": 1.148269410664172e-05, "loss": 0.2845, "step": 1885 }, { "epoch": 2.3785488958990535, "grad_norm": 0.20297545932407593, "learning_rate": 1.145930776426567e-05, "loss": 0.3027, "step": 1886 }, { "epoch": 2.3798107255520504, "grad_norm": 0.21165760548130874, "learning_rate": 1.1435921421889617e-05, "loss": 0.2873, "step": 1887 }, { "epoch": 2.3810725552050473, "grad_norm": 0.23431943210360423, "learning_rate": 1.1412535079513565e-05, "loss": 0.2921, "step": 1888 }, { "epoch": 2.3823343848580443, "grad_norm": 0.24649221574869196, "learning_rate": 1.1389148737137512e-05, "loss": 0.2799, "step": 1889 }, { "epoch": 2.383596214511041, "grad_norm": 0.2246482156498452, "learning_rate": 1.136576239476146e-05, "loss": 0.2885, "step": 1890 }, { "epoch": 2.3848580441640377, "grad_norm": 0.21394502448111627, "learning_rate": 1.1342376052385408e-05, "loss": 0.2862, "step": 1891 }, { "epoch": 2.3861198738170346, "grad_norm": 0.21291348796121945, "learning_rate": 1.1318989710009355e-05, "loss": 0.2854, "step": 1892 }, { "epoch": 2.3873817034700315, "grad_norm": 0.24278980255586996, "learning_rate": 1.1295603367633303e-05, "loss": 0.2747, "step": 1893 }, { "epoch": 2.3886435331230285, "grad_norm": 0.22510066691753364, "learning_rate": 1.127221702525725e-05, "loss": 0.2921, "step": 1894 }, { "epoch": 2.3899053627760254, "grad_norm": 0.22574129394648365, "learning_rate": 1.1248830682881198e-05, "loss": 0.2883, "step": 1895 }, { "epoch": 2.3911671924290223, "grad_norm": 0.23276737977953013, "learning_rate": 1.1225444340505145e-05, "loss": 0.3014, "step": 1896 }, { "epoch": 2.392429022082019, "grad_norm": 0.21926354692697628, "learning_rate": 1.1202057998129093e-05, "loss": 0.2755, "step": 1897 }, { "epoch": 2.3936908517350157, "grad_norm": 0.2434788889398615, "learning_rate": 1.117867165575304e-05, "loss": 0.2879, "step": 1898 }, { "epoch": 2.3949526813880126, "grad_norm": 0.21118803280156162, "learning_rate": 1.1155285313376988e-05, "loss": 0.2846, "step": 1899 }, { "epoch": 2.3962145110410096, "grad_norm": 0.21846668592496868, "learning_rate": 1.1131898971000936e-05, "loss": 0.2976, "step": 1900 }, { "epoch": 2.3974763406940065, "grad_norm": 0.2293104143226929, "learning_rate": 1.1108512628624883e-05, "loss": 0.2873, "step": 1901 }, { "epoch": 2.398738170347003, "grad_norm": 0.2304530636662853, "learning_rate": 1.1085126286248831e-05, "loss": 0.2878, "step": 1902 }, { "epoch": 2.4, "grad_norm": 0.2394006715028658, "learning_rate": 1.1061739943872779e-05, "loss": 0.2954, "step": 1903 }, { "epoch": 2.401261829652997, "grad_norm": 0.2185020877229977, "learning_rate": 1.1038353601496726e-05, "loss": 0.2827, "step": 1904 }, { "epoch": 2.4025236593059938, "grad_norm": 0.2258880784226633, "learning_rate": 1.1014967259120674e-05, "loss": 0.2773, "step": 1905 }, { "epoch": 2.4037854889589907, "grad_norm": 0.25180435913683635, "learning_rate": 1.0991580916744621e-05, "loss": 0.2956, "step": 1906 }, { "epoch": 2.405047318611987, "grad_norm": 0.2293581539987734, "learning_rate": 1.0968194574368569e-05, "loss": 0.2754, "step": 1907 }, { "epoch": 2.406309148264984, "grad_norm": 0.2685872647725737, "learning_rate": 1.0944808231992516e-05, "loss": 0.2911, "step": 1908 }, { "epoch": 2.407570977917981, "grad_norm": 0.2359511501723514, "learning_rate": 1.0921421889616466e-05, "loss": 0.2877, "step": 1909 }, { "epoch": 2.408832807570978, "grad_norm": 0.22987296826466425, "learning_rate": 1.0898035547240412e-05, "loss": 0.2807, "step": 1910 }, { "epoch": 2.410094637223975, "grad_norm": 0.23602883729386448, "learning_rate": 1.087464920486436e-05, "loss": 0.2879, "step": 1911 }, { "epoch": 2.4113564668769714, "grad_norm": 0.22487738174547262, "learning_rate": 1.0851262862488307e-05, "loss": 0.2795, "step": 1912 }, { "epoch": 2.4126182965299683, "grad_norm": 0.23639008176761167, "learning_rate": 1.0827876520112256e-05, "loss": 0.2743, "step": 1913 }, { "epoch": 2.413880126182965, "grad_norm": 0.24550690208292522, "learning_rate": 1.0804490177736202e-05, "loss": 0.2885, "step": 1914 }, { "epoch": 2.415141955835962, "grad_norm": 0.22500896993136235, "learning_rate": 1.078110383536015e-05, "loss": 0.2973, "step": 1915 }, { "epoch": 2.416403785488959, "grad_norm": 0.21049085812980362, "learning_rate": 1.0757717492984099e-05, "loss": 0.2766, "step": 1916 }, { "epoch": 2.417665615141956, "grad_norm": 0.2316902305362484, "learning_rate": 1.0734331150608046e-05, "loss": 0.2841, "step": 1917 }, { "epoch": 2.418927444794953, "grad_norm": 0.2200407564047701, "learning_rate": 1.0710944808231992e-05, "loss": 0.2858, "step": 1918 }, { "epoch": 2.4201892744479494, "grad_norm": 0.23891794886928835, "learning_rate": 1.068755846585594e-05, "loss": 0.2853, "step": 1919 }, { "epoch": 2.4214511041009463, "grad_norm": 0.23327576812419773, "learning_rate": 1.066417212347989e-05, "loss": 0.2892, "step": 1920 }, { "epoch": 2.4227129337539433, "grad_norm": 0.21302970352640221, "learning_rate": 1.0640785781103837e-05, "loss": 0.2921, "step": 1921 }, { "epoch": 2.42397476340694, "grad_norm": 0.21331625371959353, "learning_rate": 1.0617399438727783e-05, "loss": 0.2907, "step": 1922 }, { "epoch": 2.425236593059937, "grad_norm": 0.21963606582973724, "learning_rate": 1.059401309635173e-05, "loss": 0.2843, "step": 1923 }, { "epoch": 2.4264984227129336, "grad_norm": 0.22962681201556973, "learning_rate": 1.057062675397568e-05, "loss": 0.2673, "step": 1924 }, { "epoch": 2.4277602523659305, "grad_norm": 0.22062040419515005, "learning_rate": 1.0547240411599627e-05, "loss": 0.2883, "step": 1925 }, { "epoch": 2.4290220820189274, "grad_norm": 0.21102322921583713, "learning_rate": 1.0523854069223573e-05, "loss": 0.2763, "step": 1926 }, { "epoch": 2.4302839116719244, "grad_norm": 0.23153915145581838, "learning_rate": 1.050046772684752e-05, "loss": 0.2865, "step": 1927 }, { "epoch": 2.4315457413249213, "grad_norm": 0.24354971878849876, "learning_rate": 1.047708138447147e-05, "loss": 0.2717, "step": 1928 }, { "epoch": 2.4328075709779178, "grad_norm": 0.23806685241642878, "learning_rate": 1.0453695042095417e-05, "loss": 0.2959, "step": 1929 }, { "epoch": 2.4340694006309147, "grad_norm": 0.20923442783841958, "learning_rate": 1.0430308699719365e-05, "loss": 0.2796, "step": 1930 }, { "epoch": 2.4353312302839116, "grad_norm": 0.2295848049913635, "learning_rate": 1.0406922357343311e-05, "loss": 0.2879, "step": 1931 }, { "epoch": 2.4365930599369086, "grad_norm": 0.24870569582083446, "learning_rate": 1.038353601496726e-05, "loss": 0.2793, "step": 1932 }, { "epoch": 2.4378548895899055, "grad_norm": 0.22546451467799072, "learning_rate": 1.0360149672591208e-05, "loss": 0.3138, "step": 1933 }, { "epoch": 2.439116719242902, "grad_norm": 0.21612658229782536, "learning_rate": 1.0336763330215155e-05, "loss": 0.2877, "step": 1934 }, { "epoch": 2.440378548895899, "grad_norm": 0.22391207735576601, "learning_rate": 1.0313376987839101e-05, "loss": 0.2732, "step": 1935 }, { "epoch": 2.441640378548896, "grad_norm": 0.2115195045039257, "learning_rate": 1.028999064546305e-05, "loss": 0.2875, "step": 1936 }, { "epoch": 2.4429022082018927, "grad_norm": 0.21704377042254724, "learning_rate": 1.0266604303086998e-05, "loss": 0.2805, "step": 1937 }, { "epoch": 2.4441640378548897, "grad_norm": 0.23131415141124606, "learning_rate": 1.0243217960710946e-05, "loss": 0.2894, "step": 1938 }, { "epoch": 2.4454258675078866, "grad_norm": 0.21333040844398857, "learning_rate": 1.0219831618334893e-05, "loss": 0.2806, "step": 1939 }, { "epoch": 2.4466876971608835, "grad_norm": 0.2242713693797131, "learning_rate": 1.019644527595884e-05, "loss": 0.2927, "step": 1940 }, { "epoch": 2.44794952681388, "grad_norm": 0.2255543466453063, "learning_rate": 1.0173058933582788e-05, "loss": 0.2795, "step": 1941 }, { "epoch": 2.449211356466877, "grad_norm": 0.2175969658524527, "learning_rate": 1.0149672591206736e-05, "loss": 0.2723, "step": 1942 }, { "epoch": 2.450473186119874, "grad_norm": 0.2147342776254662, "learning_rate": 1.0126286248830684e-05, "loss": 0.2733, "step": 1943 }, { "epoch": 2.451735015772871, "grad_norm": 0.2404846004169907, "learning_rate": 1.0102899906454631e-05, "loss": 0.286, "step": 1944 }, { "epoch": 2.4529968454258677, "grad_norm": 0.2310119586707202, "learning_rate": 1.0079513564078579e-05, "loss": 0.2845, "step": 1945 }, { "epoch": 2.454258675078864, "grad_norm": 0.2178739908475884, "learning_rate": 1.0056127221702526e-05, "loss": 0.2909, "step": 1946 }, { "epoch": 2.455520504731861, "grad_norm": 0.2000984170075236, "learning_rate": 1.0032740879326474e-05, "loss": 0.2721, "step": 1947 }, { "epoch": 2.456782334384858, "grad_norm": 0.2143241530580217, "learning_rate": 1.0009354536950421e-05, "loss": 0.2825, "step": 1948 }, { "epoch": 2.458044164037855, "grad_norm": 0.23497280393154074, "learning_rate": 9.985968194574369e-06, "loss": 0.2912, "step": 1949 }, { "epoch": 2.459305993690852, "grad_norm": 0.20237121564020802, "learning_rate": 9.962581852198317e-06, "loss": 0.2774, "step": 1950 }, { "epoch": 2.4605678233438484, "grad_norm": 0.21439587477385422, "learning_rate": 9.939195509822264e-06, "loss": 0.2977, "step": 1951 }, { "epoch": 2.4618296529968453, "grad_norm": 0.21386643831951713, "learning_rate": 9.915809167446212e-06, "loss": 0.2758, "step": 1952 }, { "epoch": 2.4630914826498422, "grad_norm": 0.2080996973265681, "learning_rate": 9.89242282507016e-06, "loss": 0.2847, "step": 1953 }, { "epoch": 2.464353312302839, "grad_norm": 0.21359510762313536, "learning_rate": 9.869036482694107e-06, "loss": 0.2925, "step": 1954 }, { "epoch": 2.465615141955836, "grad_norm": 0.2141016622741463, "learning_rate": 9.845650140318055e-06, "loss": 0.2792, "step": 1955 }, { "epoch": 2.4668769716088326, "grad_norm": 0.21546387611170156, "learning_rate": 9.822263797942002e-06, "loss": 0.2931, "step": 1956 }, { "epoch": 2.4681388012618295, "grad_norm": 0.2080532129809853, "learning_rate": 9.79887745556595e-06, "loss": 0.2752, "step": 1957 }, { "epoch": 2.4694006309148264, "grad_norm": 0.2166485324519516, "learning_rate": 9.775491113189899e-06, "loss": 0.2898, "step": 1958 }, { "epoch": 2.4706624605678233, "grad_norm": 0.20448038389487105, "learning_rate": 9.752104770813845e-06, "loss": 0.2897, "step": 1959 }, { "epoch": 2.4719242902208203, "grad_norm": 0.2276560983332787, "learning_rate": 9.728718428437792e-06, "loss": 0.2902, "step": 1960 }, { "epoch": 2.473186119873817, "grad_norm": 0.22196226375811037, "learning_rate": 9.70533208606174e-06, "loss": 0.283, "step": 1961 }, { "epoch": 2.474447949526814, "grad_norm": 0.22385492755673414, "learning_rate": 9.68194574368569e-06, "loss": 0.29, "step": 1962 }, { "epoch": 2.4757097791798106, "grad_norm": 0.20888939461205283, "learning_rate": 9.658559401309635e-06, "loss": 0.2726, "step": 1963 }, { "epoch": 2.4769716088328075, "grad_norm": 0.2660793456761265, "learning_rate": 9.635173058933583e-06, "loss": 0.2753, "step": 1964 }, { "epoch": 2.4782334384858045, "grad_norm": 0.22985295882041673, "learning_rate": 9.61178671655753e-06, "loss": 0.2899, "step": 1965 }, { "epoch": 2.4794952681388014, "grad_norm": 0.21251622724498717, "learning_rate": 9.58840037418148e-06, "loss": 0.2866, "step": 1966 }, { "epoch": 2.4807570977917983, "grad_norm": 0.20498861263747567, "learning_rate": 9.565014031805426e-06, "loss": 0.2839, "step": 1967 }, { "epoch": 2.482018927444795, "grad_norm": 0.20391553527461231, "learning_rate": 9.541627689429373e-06, "loss": 0.2733, "step": 1968 }, { "epoch": 2.4832807570977917, "grad_norm": 0.233404525368607, "learning_rate": 9.51824134705332e-06, "loss": 0.2847, "step": 1969 }, { "epoch": 2.4845425867507887, "grad_norm": 0.21005727372355637, "learning_rate": 9.49485500467727e-06, "loss": 0.2807, "step": 1970 }, { "epoch": 2.4858044164037856, "grad_norm": 0.2156459979432238, "learning_rate": 9.471468662301216e-06, "loss": 0.2823, "step": 1971 }, { "epoch": 2.4870662460567825, "grad_norm": 0.23573168718825163, "learning_rate": 9.448082319925163e-06, "loss": 0.2866, "step": 1972 }, { "epoch": 2.488328075709779, "grad_norm": 0.22195383124651802, "learning_rate": 9.424695977549111e-06, "loss": 0.2886, "step": 1973 }, { "epoch": 2.489589905362776, "grad_norm": 0.22682484110190196, "learning_rate": 9.40130963517306e-06, "loss": 0.284, "step": 1974 }, { "epoch": 2.490851735015773, "grad_norm": 0.2211342197522265, "learning_rate": 9.377923292797008e-06, "loss": 0.2929, "step": 1975 }, { "epoch": 2.4921135646687698, "grad_norm": 0.22880129336208913, "learning_rate": 9.354536950420954e-06, "loss": 0.2945, "step": 1976 }, { "epoch": 2.4933753943217667, "grad_norm": 0.20541998804367742, "learning_rate": 9.331150608044901e-06, "loss": 0.2789, "step": 1977 }, { "epoch": 2.494637223974763, "grad_norm": 0.20983040217386012, "learning_rate": 9.30776426566885e-06, "loss": 0.2858, "step": 1978 }, { "epoch": 2.49589905362776, "grad_norm": 0.21520131709707496, "learning_rate": 9.284377923292798e-06, "loss": 0.3033, "step": 1979 }, { "epoch": 2.497160883280757, "grad_norm": 0.21711488638682072, "learning_rate": 9.260991580916744e-06, "loss": 0.2745, "step": 1980 }, { "epoch": 2.498422712933754, "grad_norm": 0.22302606806584735, "learning_rate": 9.237605238540693e-06, "loss": 0.2919, "step": 1981 }, { "epoch": 2.499684542586751, "grad_norm": 0.2271124259477682, "learning_rate": 9.214218896164641e-06, "loss": 0.2826, "step": 1982 }, { "epoch": 2.5009463722397474, "grad_norm": 0.22814224382848144, "learning_rate": 9.190832553788589e-06, "loss": 0.2957, "step": 1983 }, { "epoch": 2.5022082018927447, "grad_norm": 0.23908497890461367, "learning_rate": 9.167446211412534e-06, "loss": 0.2684, "step": 1984 }, { "epoch": 2.503470031545741, "grad_norm": 0.20744638607952337, "learning_rate": 9.144059869036484e-06, "loss": 0.2731, "step": 1985 }, { "epoch": 2.504731861198738, "grad_norm": 0.2152169035523792, "learning_rate": 9.120673526660431e-06, "loss": 0.2861, "step": 1986 }, { "epoch": 2.505993690851735, "grad_norm": 0.23327957191655213, "learning_rate": 9.097287184284379e-06, "loss": 0.295, "step": 1987 }, { "epoch": 2.507255520504732, "grad_norm": 0.23185363569587944, "learning_rate": 9.073900841908325e-06, "loss": 0.2846, "step": 1988 }, { "epoch": 2.508517350157729, "grad_norm": 0.23104883166385376, "learning_rate": 9.050514499532274e-06, "loss": 0.2951, "step": 1989 }, { "epoch": 2.5097791798107254, "grad_norm": 0.2231585749029464, "learning_rate": 9.027128157156222e-06, "loss": 0.2812, "step": 1990 }, { "epoch": 2.5110410094637223, "grad_norm": 0.24674135159854857, "learning_rate": 9.00374181478017e-06, "loss": 0.29, "step": 1991 }, { "epoch": 2.5123028391167193, "grad_norm": 0.22016176304081442, "learning_rate": 8.980355472404115e-06, "loss": 0.2745, "step": 1992 }, { "epoch": 2.513564668769716, "grad_norm": 0.26504643527216293, "learning_rate": 8.956969130028064e-06, "loss": 0.2985, "step": 1993 }, { "epoch": 2.514826498422713, "grad_norm": 0.22219338476837522, "learning_rate": 8.933582787652012e-06, "loss": 0.2802, "step": 1994 }, { "epoch": 2.5160883280757096, "grad_norm": 0.21363476884113708, "learning_rate": 8.91019644527596e-06, "loss": 0.2712, "step": 1995 }, { "epoch": 2.5173501577287065, "grad_norm": 0.24128527350780204, "learning_rate": 8.886810102899907e-06, "loss": 0.2809, "step": 1996 }, { "epoch": 2.5186119873817034, "grad_norm": 0.2309299696431924, "learning_rate": 8.863423760523855e-06, "loss": 0.2864, "step": 1997 }, { "epoch": 2.5198738170347004, "grad_norm": 0.21348326240091495, "learning_rate": 8.840037418147802e-06, "loss": 0.2805, "step": 1998 }, { "epoch": 2.5211356466876973, "grad_norm": 0.2130101288975422, "learning_rate": 8.81665107577175e-06, "loss": 0.2802, "step": 1999 }, { "epoch": 2.522397476340694, "grad_norm": 0.22436292586758683, "learning_rate": 8.793264733395697e-06, "loss": 0.3033, "step": 2000 }, { "epoch": 2.5236593059936907, "grad_norm": 0.20925054546554653, "learning_rate": 8.769878391019645e-06, "loss": 0.2785, "step": 2001 }, { "epoch": 2.5249211356466876, "grad_norm": 0.22135965206802508, "learning_rate": 8.746492048643593e-06, "loss": 0.2917, "step": 2002 }, { "epoch": 2.5261829652996846, "grad_norm": 0.21028897046455833, "learning_rate": 8.72310570626754e-06, "loss": 0.2828, "step": 2003 }, { "epoch": 2.5274447949526815, "grad_norm": 0.2054113485940043, "learning_rate": 8.699719363891488e-06, "loss": 0.2764, "step": 2004 }, { "epoch": 2.528706624605678, "grad_norm": 0.25423172662551735, "learning_rate": 8.676333021515435e-06, "loss": 0.297, "step": 2005 }, { "epoch": 2.5299684542586753, "grad_norm": 0.23886229360150138, "learning_rate": 8.652946679139383e-06, "loss": 0.2712, "step": 2006 }, { "epoch": 2.531230283911672, "grad_norm": 0.2111270266595698, "learning_rate": 8.62956033676333e-06, "loss": 0.2769, "step": 2007 }, { "epoch": 2.5324921135646687, "grad_norm": 0.2067082331061543, "learning_rate": 8.606173994387278e-06, "loss": 0.2928, "step": 2008 }, { "epoch": 2.5337539432176657, "grad_norm": 0.2732800884486134, "learning_rate": 8.582787652011226e-06, "loss": 0.2996, "step": 2009 }, { "epoch": 2.5350157728706626, "grad_norm": 0.21267676829780321, "learning_rate": 8.559401309635173e-06, "loss": 0.2845, "step": 2010 }, { "epoch": 2.5362776025236595, "grad_norm": 0.22931756013988736, "learning_rate": 8.53601496725912e-06, "loss": 0.2903, "step": 2011 }, { "epoch": 2.537539432176656, "grad_norm": 0.21955863268410078, "learning_rate": 8.512628624883068e-06, "loss": 0.294, "step": 2012 }, { "epoch": 2.538801261829653, "grad_norm": 0.2190421923409746, "learning_rate": 8.489242282507016e-06, "loss": 0.275, "step": 2013 }, { "epoch": 2.54006309148265, "grad_norm": 0.23508649561377837, "learning_rate": 8.465855940130964e-06, "loss": 0.2855, "step": 2014 }, { "epoch": 2.541324921135647, "grad_norm": 0.23009284516177753, "learning_rate": 8.442469597754911e-06, "loss": 0.2888, "step": 2015 }, { "epoch": 2.5425867507886437, "grad_norm": 0.2277911038065868, "learning_rate": 8.41908325537886e-06, "loss": 0.2804, "step": 2016 }, { "epoch": 2.54384858044164, "grad_norm": 0.22151951443838802, "learning_rate": 8.395696913002806e-06, "loss": 0.2887, "step": 2017 }, { "epoch": 2.545110410094637, "grad_norm": 0.24201696444576518, "learning_rate": 8.372310570626754e-06, "loss": 0.295, "step": 2018 }, { "epoch": 2.546372239747634, "grad_norm": 0.22594145075067362, "learning_rate": 8.348924228250701e-06, "loss": 0.2807, "step": 2019 }, { "epoch": 2.547634069400631, "grad_norm": 0.23897526629626806, "learning_rate": 8.32553788587465e-06, "loss": 0.2997, "step": 2020 }, { "epoch": 2.548895899053628, "grad_norm": 0.24744797613312153, "learning_rate": 8.302151543498597e-06, "loss": 0.2784, "step": 2021 }, { "epoch": 2.5501577287066244, "grad_norm": 0.8666698125912051, "learning_rate": 8.278765201122544e-06, "loss": 0.3134, "step": 2022 }, { "epoch": 2.5514195583596213, "grad_norm": 0.21092861987152395, "learning_rate": 8.255378858746493e-06, "loss": 0.2828, "step": 2023 }, { "epoch": 2.5526813880126182, "grad_norm": 0.2167829213723099, "learning_rate": 8.231992516370441e-06, "loss": 0.2835, "step": 2024 }, { "epoch": 2.553943217665615, "grad_norm": 0.2128299402983715, "learning_rate": 8.208606173994387e-06, "loss": 0.2889, "step": 2025 }, { "epoch": 2.555205047318612, "grad_norm": 0.20836285645458633, "learning_rate": 8.185219831618335e-06, "loss": 0.2805, "step": 2026 }, { "epoch": 2.5564668769716086, "grad_norm": 0.23523630427523604, "learning_rate": 8.161833489242284e-06, "loss": 0.296, "step": 2027 }, { "epoch": 2.557728706624606, "grad_norm": 0.21234475682392623, "learning_rate": 8.138447146866231e-06, "loss": 0.2825, "step": 2028 }, { "epoch": 2.5589905362776024, "grad_norm": 0.21969491671375213, "learning_rate": 8.115060804490177e-06, "loss": 0.3036, "step": 2029 }, { "epoch": 2.5602523659305993, "grad_norm": 0.23466190222026603, "learning_rate": 8.091674462114125e-06, "loss": 0.2918, "step": 2030 }, { "epoch": 2.5615141955835963, "grad_norm": 0.2207450454560613, "learning_rate": 8.068288119738074e-06, "loss": 0.2773, "step": 2031 }, { "epoch": 2.562776025236593, "grad_norm": 0.22111152860953134, "learning_rate": 8.044901777362022e-06, "loss": 0.2731, "step": 2032 }, { "epoch": 2.56403785488959, "grad_norm": 0.20878491563329402, "learning_rate": 8.021515434985968e-06, "loss": 0.2769, "step": 2033 }, { "epoch": 2.5652996845425866, "grad_norm": 0.2089594490841091, "learning_rate": 7.998129092609915e-06, "loss": 0.2898, "step": 2034 }, { "epoch": 2.5665615141955835, "grad_norm": 0.2069613213391077, "learning_rate": 7.974742750233864e-06, "loss": 0.2864, "step": 2035 }, { "epoch": 2.5678233438485805, "grad_norm": 0.21995653172772364, "learning_rate": 7.951356407857812e-06, "loss": 0.2839, "step": 2036 }, { "epoch": 2.5690851735015774, "grad_norm": 0.23412896452838447, "learning_rate": 7.927970065481758e-06, "loss": 0.2989, "step": 2037 }, { "epoch": 2.5703470031545743, "grad_norm": 0.23688233624615201, "learning_rate": 7.904583723105706e-06, "loss": 0.2928, "step": 2038 }, { "epoch": 2.571608832807571, "grad_norm": 0.22726286328658604, "learning_rate": 7.881197380729655e-06, "loss": 0.3005, "step": 2039 }, { "epoch": 2.5728706624605677, "grad_norm": 0.22686818727620103, "learning_rate": 7.857811038353602e-06, "loss": 0.2916, "step": 2040 }, { "epoch": 2.5741324921135647, "grad_norm": 0.21459763148007124, "learning_rate": 7.83442469597755e-06, "loss": 0.2923, "step": 2041 }, { "epoch": 2.5753943217665616, "grad_norm": 0.2023080803126151, "learning_rate": 7.811038353601498e-06, "loss": 0.2788, "step": 2042 }, { "epoch": 2.5766561514195585, "grad_norm": 0.22334790656071368, "learning_rate": 7.787652011225445e-06, "loss": 0.2767, "step": 2043 }, { "epoch": 2.577917981072555, "grad_norm": 0.21699755076392244, "learning_rate": 7.764265668849393e-06, "loss": 0.2692, "step": 2044 }, { "epoch": 2.579179810725552, "grad_norm": 0.21068579010157437, "learning_rate": 7.74087932647334e-06, "loss": 0.2744, "step": 2045 }, { "epoch": 2.580441640378549, "grad_norm": 0.21710032276335256, "learning_rate": 7.717492984097288e-06, "loss": 0.2726, "step": 2046 }, { "epoch": 2.5817034700315458, "grad_norm": 0.2190211992291235, "learning_rate": 7.694106641721235e-06, "loss": 0.2863, "step": 2047 }, { "epoch": 2.5829652996845427, "grad_norm": 0.21706921039442753, "learning_rate": 7.670720299345183e-06, "loss": 0.2815, "step": 2048 }, { "epoch": 2.584227129337539, "grad_norm": 0.22808382649694434, "learning_rate": 7.64733395696913e-06, "loss": 0.2924, "step": 2049 }, { "epoch": 2.5854889589905365, "grad_norm": 0.21119310609732014, "learning_rate": 7.623947614593078e-06, "loss": 0.2868, "step": 2050 }, { "epoch": 2.586750788643533, "grad_norm": 0.20352407151824378, "learning_rate": 7.600561272217026e-06, "loss": 0.2892, "step": 2051 }, { "epoch": 2.58801261829653, "grad_norm": 0.22076699349847495, "learning_rate": 7.577174929840973e-06, "loss": 0.2936, "step": 2052 }, { "epoch": 2.589274447949527, "grad_norm": 0.21601398066156008, "learning_rate": 7.55378858746492e-06, "loss": 0.2711, "step": 2053 }, { "epoch": 2.590536277602524, "grad_norm": 0.216180889656219, "learning_rate": 7.530402245088869e-06, "loss": 0.281, "step": 2054 }, { "epoch": 2.5917981072555207, "grad_norm": 0.21363773427170604, "learning_rate": 7.507015902712816e-06, "loss": 0.285, "step": 2055 }, { "epoch": 2.593059936908517, "grad_norm": 0.22743607175797703, "learning_rate": 7.483629560336764e-06, "loss": 0.3054, "step": 2056 }, { "epoch": 2.594321766561514, "grad_norm": 0.20938634847003487, "learning_rate": 7.46024321796071e-06, "loss": 0.298, "step": 2057 }, { "epoch": 2.595583596214511, "grad_norm": 0.21119844862559287, "learning_rate": 7.43685687558466e-06, "loss": 0.277, "step": 2058 }, { "epoch": 2.596845425867508, "grad_norm": 0.22042834424449884, "learning_rate": 7.413470533208606e-06, "loss": 0.2936, "step": 2059 }, { "epoch": 2.598107255520505, "grad_norm": 0.22195888071573786, "learning_rate": 7.390084190832554e-06, "loss": 0.2907, "step": 2060 }, { "epoch": 2.5993690851735014, "grad_norm": 0.2261940944997366, "learning_rate": 7.366697848456501e-06, "loss": 0.2978, "step": 2061 }, { "epoch": 2.6006309148264983, "grad_norm": 0.21517212044091585, "learning_rate": 7.34331150608045e-06, "loss": 0.2943, "step": 2062 }, { "epoch": 2.6018927444794953, "grad_norm": 0.2207076057041569, "learning_rate": 7.319925163704397e-06, "loss": 0.291, "step": 2063 }, { "epoch": 2.603154574132492, "grad_norm": 0.2038943868661106, "learning_rate": 7.296538821328344e-06, "loss": 0.2765, "step": 2064 }, { "epoch": 2.604416403785489, "grad_norm": 0.22820672975585743, "learning_rate": 7.273152478952293e-06, "loss": 0.2722, "step": 2065 }, { "epoch": 2.6056782334384856, "grad_norm": 0.19847668638301877, "learning_rate": 7.24976613657624e-06, "loss": 0.2815, "step": 2066 }, { "epoch": 2.6069400630914825, "grad_norm": 0.20792536681445514, "learning_rate": 7.226379794200187e-06, "loss": 0.2786, "step": 2067 }, { "epoch": 2.6082018927444794, "grad_norm": 0.23137799876385698, "learning_rate": 7.202993451824135e-06, "loss": 0.2975, "step": 2068 }, { "epoch": 2.6094637223974764, "grad_norm": 0.22039382059604898, "learning_rate": 7.179607109448083e-06, "loss": 0.2885, "step": 2069 }, { "epoch": 2.6107255520504733, "grad_norm": 0.21097114195477035, "learning_rate": 7.156220767072031e-06, "loss": 0.2687, "step": 2070 }, { "epoch": 2.61198738170347, "grad_norm": 0.21575516925364702, "learning_rate": 7.132834424695977e-06, "loss": 0.2927, "step": 2071 }, { "epoch": 2.613249211356467, "grad_norm": 0.23398610488740937, "learning_rate": 7.109448082319925e-06, "loss": 0.2697, "step": 2072 }, { "epoch": 2.6145110410094636, "grad_norm": 0.23001448274277755, "learning_rate": 7.086061739943873e-06, "loss": 0.29, "step": 2073 }, { "epoch": 2.6157728706624606, "grad_norm": 0.21126467665463916, "learning_rate": 7.062675397567821e-06, "loss": 0.2795, "step": 2074 }, { "epoch": 2.6170347003154575, "grad_norm": 0.20716355013481919, "learning_rate": 7.039289055191768e-06, "loss": 0.2807, "step": 2075 }, { "epoch": 2.6182965299684544, "grad_norm": 0.2109665033449527, "learning_rate": 7.015902712815715e-06, "loss": 0.2905, "step": 2076 }, { "epoch": 2.6195583596214513, "grad_norm": 0.21599678753395815, "learning_rate": 6.992516370439664e-06, "loss": 0.2772, "step": 2077 }, { "epoch": 2.620820189274448, "grad_norm": 0.2433432450345048, "learning_rate": 6.969130028063611e-06, "loss": 0.2837, "step": 2078 }, { "epoch": 2.6220820189274447, "grad_norm": 0.21133118953720323, "learning_rate": 6.945743685687559e-06, "loss": 0.3026, "step": 2079 }, { "epoch": 2.6233438485804417, "grad_norm": 0.22466334572774702, "learning_rate": 6.922357343311506e-06, "loss": 0.2816, "step": 2080 }, { "epoch": 2.6246056782334386, "grad_norm": 0.22062201009319468, "learning_rate": 6.898971000935454e-06, "loss": 0.2923, "step": 2081 }, { "epoch": 2.6258675078864355, "grad_norm": 0.23809766896574827, "learning_rate": 6.875584658559402e-06, "loss": 0.2885, "step": 2082 }, { "epoch": 2.627129337539432, "grad_norm": 0.2082503424015974, "learning_rate": 6.852198316183349e-06, "loss": 0.2863, "step": 2083 }, { "epoch": 2.628391167192429, "grad_norm": 0.19983892780199822, "learning_rate": 6.828811973807298e-06, "loss": 0.2916, "step": 2084 }, { "epoch": 2.629652996845426, "grad_norm": 0.21293510751172295, "learning_rate": 6.805425631431244e-06, "loss": 0.3078, "step": 2085 }, { "epoch": 2.630914826498423, "grad_norm": 0.22281039929615937, "learning_rate": 6.782039289055192e-06, "loss": 0.2835, "step": 2086 }, { "epoch": 2.6321766561514197, "grad_norm": 0.2228402936264736, "learning_rate": 6.7586529466791395e-06, "loss": 0.2911, "step": 2087 }, { "epoch": 2.633438485804416, "grad_norm": 0.22286170486845566, "learning_rate": 6.735266604303088e-06, "loss": 0.2807, "step": 2088 }, { "epoch": 2.634700315457413, "grad_norm": 0.20700553861568297, "learning_rate": 6.7118802619270356e-06, "loss": 0.2863, "step": 2089 }, { "epoch": 2.63596214511041, "grad_norm": 0.21467793539175758, "learning_rate": 6.688493919550982e-06, "loss": 0.2852, "step": 2090 }, { "epoch": 2.637223974763407, "grad_norm": 0.20513160375483477, "learning_rate": 6.66510757717493e-06, "loss": 0.2811, "step": 2091 }, { "epoch": 2.638485804416404, "grad_norm": 0.21060523033796483, "learning_rate": 6.641721234798878e-06, "loss": 0.2895, "step": 2092 }, { "epoch": 2.6397476340694004, "grad_norm": 0.2090332995273368, "learning_rate": 6.618334892422826e-06, "loss": 0.2852, "step": 2093 }, { "epoch": 2.6410094637223973, "grad_norm": 0.2335603507034544, "learning_rate": 6.594948550046773e-06, "loss": 0.2838, "step": 2094 }, { "epoch": 2.6422712933753942, "grad_norm": 0.21481631978562665, "learning_rate": 6.57156220767072e-06, "loss": 0.2918, "step": 2095 }, { "epoch": 2.643533123028391, "grad_norm": 0.20764488822871163, "learning_rate": 6.548175865294669e-06, "loss": 0.299, "step": 2096 }, { "epoch": 2.644794952681388, "grad_norm": 0.22505817959300234, "learning_rate": 6.524789522918616e-06, "loss": 0.2909, "step": 2097 }, { "epoch": 2.646056782334385, "grad_norm": 0.20644995967001809, "learning_rate": 6.501403180542563e-06, "loss": 0.2823, "step": 2098 }, { "epoch": 2.647318611987382, "grad_norm": 0.2102039371915212, "learning_rate": 6.4780168381665105e-06, "loss": 0.2868, "step": 2099 }, { "epoch": 2.6485804416403784, "grad_norm": 0.1991608443189748, "learning_rate": 6.454630495790459e-06, "loss": 0.302, "step": 2100 }, { "epoch": 2.6498422712933754, "grad_norm": 0.23402748220099398, "learning_rate": 6.4312441534144065e-06, "loss": 0.2872, "step": 2101 }, { "epoch": 2.6511041009463723, "grad_norm": 0.2046980593884065, "learning_rate": 6.407857811038353e-06, "loss": 0.2949, "step": 2102 }, { "epoch": 2.652365930599369, "grad_norm": 0.20329492278120515, "learning_rate": 6.384471468662301e-06, "loss": 0.2738, "step": 2103 }, { "epoch": 2.653627760252366, "grad_norm": 0.21349105766229734, "learning_rate": 6.361085126286249e-06, "loss": 0.2946, "step": 2104 }, { "epoch": 2.6548895899053626, "grad_norm": 0.22249158467251662, "learning_rate": 6.337698783910197e-06, "loss": 0.2974, "step": 2105 }, { "epoch": 2.6561514195583595, "grad_norm": 0.24362617509539874, "learning_rate": 6.314312441534144e-06, "loss": 0.2854, "step": 2106 }, { "epoch": 2.6574132492113565, "grad_norm": 0.2135314905262438, "learning_rate": 6.290926099158093e-06, "loss": 0.2968, "step": 2107 }, { "epoch": 2.6586750788643534, "grad_norm": 0.21654294581897646, "learning_rate": 6.26753975678204e-06, "loss": 0.2904, "step": 2108 }, { "epoch": 2.6599369085173503, "grad_norm": 0.21472897725978107, "learning_rate": 6.244153414405987e-06, "loss": 0.2888, "step": 2109 }, { "epoch": 2.661198738170347, "grad_norm": 0.2183411320238664, "learning_rate": 6.220767072029935e-06, "loss": 0.2765, "step": 2110 }, { "epoch": 2.6624605678233437, "grad_norm": 0.2369935582473363, "learning_rate": 6.197380729653882e-06, "loss": 0.3088, "step": 2111 }, { "epoch": 2.6637223974763407, "grad_norm": 0.2251894503391724, "learning_rate": 6.17399438727783e-06, "loss": 0.2967, "step": 2112 }, { "epoch": 2.6649842271293376, "grad_norm": 0.19995323214182917, "learning_rate": 6.1506080449017775e-06, "loss": 0.2858, "step": 2113 }, { "epoch": 2.6662460567823345, "grad_norm": 0.21829014624295476, "learning_rate": 6.127221702525725e-06, "loss": 0.2838, "step": 2114 }, { "epoch": 2.667507886435331, "grad_norm": 0.21821987540986815, "learning_rate": 6.103835360149673e-06, "loss": 0.2838, "step": 2115 }, { "epoch": 2.668769716088328, "grad_norm": 0.21552530356367208, "learning_rate": 6.08044901777362e-06, "loss": 0.2894, "step": 2116 }, { "epoch": 2.670031545741325, "grad_norm": 0.20678821234974995, "learning_rate": 6.057062675397568e-06, "loss": 0.2784, "step": 2117 }, { "epoch": 2.6712933753943218, "grad_norm": 0.19816182368359408, "learning_rate": 6.0336763330215154e-06, "loss": 0.2739, "step": 2118 }, { "epoch": 2.6725552050473187, "grad_norm": 0.20403908667598566, "learning_rate": 6.010289990645463e-06, "loss": 0.2969, "step": 2119 }, { "epoch": 2.6738170347003156, "grad_norm": 0.20464159172713622, "learning_rate": 5.9869036482694114e-06, "loss": 0.2743, "step": 2120 }, { "epoch": 2.6750788643533125, "grad_norm": 0.22012495671850127, "learning_rate": 5.963517305893358e-06, "loss": 0.2837, "step": 2121 }, { "epoch": 2.676340694006309, "grad_norm": 0.2388477816369541, "learning_rate": 5.940130963517307e-06, "loss": 0.3132, "step": 2122 }, { "epoch": 2.677602523659306, "grad_norm": 0.21960304853727253, "learning_rate": 5.916744621141253e-06, "loss": 0.2735, "step": 2123 }, { "epoch": 2.678864353312303, "grad_norm": 0.21109655919860723, "learning_rate": 5.893358278765202e-06, "loss": 0.2923, "step": 2124 }, { "epoch": 2.6801261829653, "grad_norm": 0.22415479755361706, "learning_rate": 5.869971936389149e-06, "loss": 0.2969, "step": 2125 }, { "epoch": 2.6813880126182967, "grad_norm": 0.2245845407896934, "learning_rate": 5.846585594013097e-06, "loss": 0.2906, "step": 2126 }, { "epoch": 2.682649842271293, "grad_norm": 0.20206460619403604, "learning_rate": 5.8231992516370445e-06, "loss": 0.282, "step": 2127 }, { "epoch": 2.68391167192429, "grad_norm": 0.1981693798512142, "learning_rate": 5.799812909260992e-06, "loss": 0.2679, "step": 2128 }, { "epoch": 2.685173501577287, "grad_norm": 0.22395158920575026, "learning_rate": 5.77642656688494e-06, "loss": 0.2873, "step": 2129 }, { "epoch": 2.686435331230284, "grad_norm": 0.2093135379495245, "learning_rate": 5.753040224508887e-06, "loss": 0.2978, "step": 2130 }, { "epoch": 2.687697160883281, "grad_norm": 0.20432712787097923, "learning_rate": 5.729653882132835e-06, "loss": 0.2843, "step": 2131 }, { "epoch": 2.6889589905362774, "grad_norm": 0.24177221857334766, "learning_rate": 5.706267539756782e-06, "loss": 0.2906, "step": 2132 }, { "epoch": 2.6902208201892743, "grad_norm": 0.20144344787039362, "learning_rate": 5.68288119738073e-06, "loss": 0.2876, "step": 2133 }, { "epoch": 2.6914826498422713, "grad_norm": 0.2127324957742735, "learning_rate": 5.659494855004678e-06, "loss": 0.2845, "step": 2134 }, { "epoch": 2.692744479495268, "grad_norm": 0.21197637926189114, "learning_rate": 5.636108512628625e-06, "loss": 0.2803, "step": 2135 }, { "epoch": 2.694006309148265, "grad_norm": 0.20881398200164286, "learning_rate": 5.612722170252573e-06, "loss": 0.3022, "step": 2136 }, { "epoch": 2.6952681388012616, "grad_norm": 0.22189166031443974, "learning_rate": 5.58933582787652e-06, "loss": 0.2891, "step": 2137 }, { "epoch": 2.6965299684542585, "grad_norm": 0.20571413232259855, "learning_rate": 5.565949485500468e-06, "loss": 0.3034, "step": 2138 }, { "epoch": 2.6977917981072554, "grad_norm": 0.19400229037737457, "learning_rate": 5.5425631431244155e-06, "loss": 0.2802, "step": 2139 }, { "epoch": 2.6990536277602524, "grad_norm": 0.19812187021616018, "learning_rate": 5.519176800748363e-06, "loss": 0.2739, "step": 2140 }, { "epoch": 2.7003154574132493, "grad_norm": 0.194033804987788, "learning_rate": 5.495790458372311e-06, "loss": 0.2833, "step": 2141 }, { "epoch": 2.701577287066246, "grad_norm": 0.19906280993761083, "learning_rate": 5.472404115996258e-06, "loss": 0.2964, "step": 2142 }, { "epoch": 2.702839116719243, "grad_norm": 0.2127133620044932, "learning_rate": 5.449017773620206e-06, "loss": 0.283, "step": 2143 }, { "epoch": 2.7041009463722396, "grad_norm": 0.20483397820420218, "learning_rate": 5.425631431244153e-06, "loss": 0.2877, "step": 2144 }, { "epoch": 2.7053627760252366, "grad_norm": 0.1980159114579772, "learning_rate": 5.402245088868101e-06, "loss": 0.2843, "step": 2145 }, { "epoch": 2.7066246056782335, "grad_norm": 0.2010479690661036, "learning_rate": 5.378858746492049e-06, "loss": 0.2733, "step": 2146 }, { "epoch": 2.7078864353312304, "grad_norm": 0.2235196599704895, "learning_rate": 5.355472404115996e-06, "loss": 0.2848, "step": 2147 }, { "epoch": 2.7091482649842273, "grad_norm": 0.19624295709195397, "learning_rate": 5.332086061739945e-06, "loss": 0.2756, "step": 2148 }, { "epoch": 2.710410094637224, "grad_norm": 0.2201832825604379, "learning_rate": 5.308699719363891e-06, "loss": 0.2848, "step": 2149 }, { "epoch": 2.7116719242902207, "grad_norm": 0.20850507268024696, "learning_rate": 5.28531337698784e-06, "loss": 0.2629, "step": 2150 }, { "epoch": 2.7129337539432177, "grad_norm": 0.20123939713984965, "learning_rate": 5.2619270346117865e-06, "loss": 0.2772, "step": 2151 }, { "epoch": 2.7141955835962146, "grad_norm": 0.20102737228718595, "learning_rate": 5.238540692235735e-06, "loss": 0.2656, "step": 2152 }, { "epoch": 2.7154574132492115, "grad_norm": 0.21705055599262124, "learning_rate": 5.2151543498596825e-06, "loss": 0.3096, "step": 2153 }, { "epoch": 2.716719242902208, "grad_norm": 0.2286611736362425, "learning_rate": 5.19176800748363e-06, "loss": 0.2959, "step": 2154 }, { "epoch": 2.717981072555205, "grad_norm": 0.19616824229372512, "learning_rate": 5.168381665107578e-06, "loss": 0.2754, "step": 2155 }, { "epoch": 2.719242902208202, "grad_norm": 0.2208561541604093, "learning_rate": 5.144995322731525e-06, "loss": 0.2956, "step": 2156 }, { "epoch": 2.720504731861199, "grad_norm": 0.19536668921271164, "learning_rate": 5.121608980355473e-06, "loss": 0.2739, "step": 2157 }, { "epoch": 2.7217665615141957, "grad_norm": 0.20903247673528522, "learning_rate": 5.09822263797942e-06, "loss": 0.2985, "step": 2158 }, { "epoch": 2.723028391167192, "grad_norm": 0.2057724051647836, "learning_rate": 5.074836295603368e-06, "loss": 0.2796, "step": 2159 }, { "epoch": 2.724290220820189, "grad_norm": 0.20604449688257512, "learning_rate": 5.0514499532273156e-06, "loss": 0.3031, "step": 2160 }, { "epoch": 2.725552050473186, "grad_norm": 0.20121115069939052, "learning_rate": 5.028063610851263e-06, "loss": 0.2878, "step": 2161 }, { "epoch": 2.726813880126183, "grad_norm": 0.18960464347409378, "learning_rate": 5.004677268475211e-06, "loss": 0.2797, "step": 2162 }, { "epoch": 2.72807570977918, "grad_norm": 0.2151671013302071, "learning_rate": 4.981290926099158e-06, "loss": 0.2888, "step": 2163 }, { "epoch": 2.7293375394321764, "grad_norm": 0.20079743670955869, "learning_rate": 4.957904583723106e-06, "loss": 0.2757, "step": 2164 }, { "epoch": 2.7305993690851738, "grad_norm": 0.1955994634711192, "learning_rate": 4.9345182413470535e-06, "loss": 0.2825, "step": 2165 }, { "epoch": 2.7318611987381702, "grad_norm": 0.18933281665707588, "learning_rate": 4.911131898971001e-06, "loss": 0.2846, "step": 2166 }, { "epoch": 2.733123028391167, "grad_norm": 0.2071456886627136, "learning_rate": 4.8877455565949495e-06, "loss": 0.2765, "step": 2167 }, { "epoch": 2.734384858044164, "grad_norm": 0.20290055328763829, "learning_rate": 4.864359214218896e-06, "loss": 0.3016, "step": 2168 }, { "epoch": 2.735646687697161, "grad_norm": 0.21505827400534588, "learning_rate": 4.840972871842845e-06, "loss": 0.2909, "step": 2169 }, { "epoch": 2.736908517350158, "grad_norm": 0.20733193170372205, "learning_rate": 4.817586529466791e-06, "loss": 0.2806, "step": 2170 }, { "epoch": 2.7381703470031544, "grad_norm": 0.2075234613295031, "learning_rate": 4.79420018709074e-06, "loss": 0.2906, "step": 2171 }, { "epoch": 2.7394321766561514, "grad_norm": 0.2000301175658234, "learning_rate": 4.7708138447146865e-06, "loss": 0.2786, "step": 2172 }, { "epoch": 2.7406940063091483, "grad_norm": 0.2093924958186629, "learning_rate": 4.747427502338635e-06, "loss": 0.2948, "step": 2173 }, { "epoch": 2.741955835962145, "grad_norm": 0.20302768092111034, "learning_rate": 4.724041159962582e-06, "loss": 0.2918, "step": 2174 }, { "epoch": 2.743217665615142, "grad_norm": 0.22142694383322092, "learning_rate": 4.70065481758653e-06, "loss": 0.2844, "step": 2175 }, { "epoch": 2.7444794952681386, "grad_norm": 0.20749307643446482, "learning_rate": 4.677268475210477e-06, "loss": 0.2796, "step": 2176 }, { "epoch": 2.7457413249211355, "grad_norm": 0.19561671297010316, "learning_rate": 4.653882132834425e-06, "loss": 0.2708, "step": 2177 }, { "epoch": 2.7470031545741325, "grad_norm": 0.20389687421425917, "learning_rate": 4.630495790458372e-06, "loss": 0.2775, "step": 2178 }, { "epoch": 2.7482649842271294, "grad_norm": 0.18784907902735257, "learning_rate": 4.6071094480823205e-06, "loss": 0.2865, "step": 2179 }, { "epoch": 2.7495268138801263, "grad_norm": 0.19856378594784133, "learning_rate": 4.583723105706267e-06, "loss": 0.2701, "step": 2180 }, { "epoch": 2.750788643533123, "grad_norm": 0.2161205474556389, "learning_rate": 4.560336763330216e-06, "loss": 0.3012, "step": 2181 }, { "epoch": 2.7520504731861197, "grad_norm": 0.20694449878630097, "learning_rate": 4.536950420954162e-06, "loss": 0.2943, "step": 2182 }, { "epoch": 2.7533123028391167, "grad_norm": 0.20310341097174966, "learning_rate": 4.513564078578111e-06, "loss": 0.265, "step": 2183 }, { "epoch": 2.7545741324921136, "grad_norm": 0.2093324841344802, "learning_rate": 4.4901777362020575e-06, "loss": 0.299, "step": 2184 }, { "epoch": 2.7558359621451105, "grad_norm": 0.21485090556601577, "learning_rate": 4.466791393826006e-06, "loss": 0.2824, "step": 2185 }, { "epoch": 2.757097791798107, "grad_norm": 0.19759233498142578, "learning_rate": 4.4434050514499535e-06, "loss": 0.2924, "step": 2186 }, { "epoch": 2.7583596214511044, "grad_norm": 0.21565851105654388, "learning_rate": 4.420018709073901e-06, "loss": 0.2888, "step": 2187 }, { "epoch": 2.759621451104101, "grad_norm": 0.20209645537339924, "learning_rate": 4.396632366697849e-06, "loss": 0.2843, "step": 2188 }, { "epoch": 2.7608832807570978, "grad_norm": 0.21527589762414717, "learning_rate": 4.373246024321796e-06, "loss": 0.2867, "step": 2189 }, { "epoch": 2.7621451104100947, "grad_norm": 0.2022143651190669, "learning_rate": 4.349859681945744e-06, "loss": 0.2842, "step": 2190 }, { "epoch": 2.7634069400630916, "grad_norm": 0.2168741929750416, "learning_rate": 4.3264733395696914e-06, "loss": 0.3021, "step": 2191 }, { "epoch": 2.7646687697160885, "grad_norm": 0.2377571736664258, "learning_rate": 4.303086997193639e-06, "loss": 0.2805, "step": 2192 }, { "epoch": 2.765930599369085, "grad_norm": 0.23020895242343706, "learning_rate": 4.279700654817587e-06, "loss": 0.2829, "step": 2193 }, { "epoch": 2.767192429022082, "grad_norm": 0.23474691914122817, "learning_rate": 4.256314312441534e-06, "loss": 0.2974, "step": 2194 }, { "epoch": 2.768454258675079, "grad_norm": 0.22762565523542813, "learning_rate": 4.232927970065482e-06, "loss": 0.2901, "step": 2195 }, { "epoch": 2.769716088328076, "grad_norm": 0.21899965819342965, "learning_rate": 4.20954162768943e-06, "loss": 0.2898, "step": 2196 }, { "epoch": 2.7709779179810727, "grad_norm": 0.2118060744045913, "learning_rate": 4.186155285313377e-06, "loss": 0.2878, "step": 2197 }, { "epoch": 2.772239747634069, "grad_norm": 0.19553901734839926, "learning_rate": 4.162768942937325e-06, "loss": 0.2866, "step": 2198 }, { "epoch": 2.773501577287066, "grad_norm": 0.20768116092676994, "learning_rate": 4.139382600561272e-06, "loss": 0.2798, "step": 2199 }, { "epoch": 2.774763406940063, "grad_norm": 0.21397556959229616, "learning_rate": 4.1159962581852205e-06, "loss": 0.2901, "step": 2200 }, { "epoch": 2.77602523659306, "grad_norm": 0.20471254487497498, "learning_rate": 4.092609915809167e-06, "loss": 0.2957, "step": 2201 }, { "epoch": 2.777287066246057, "grad_norm": 0.20647465333188092, "learning_rate": 4.069223573433116e-06, "loss": 0.2674, "step": 2202 }, { "epoch": 2.7785488958990534, "grad_norm": 0.2226743850241834, "learning_rate": 4.045837231057062e-06, "loss": 0.2915, "step": 2203 }, { "epoch": 2.7798107255520503, "grad_norm": 0.21341339748615332, "learning_rate": 4.022450888681011e-06, "loss": 0.286, "step": 2204 }, { "epoch": 2.7810725552050473, "grad_norm": 0.9598838909859161, "learning_rate": 3.999064546304958e-06, "loss": 0.3032, "step": 2205 }, { "epoch": 2.782334384858044, "grad_norm": 0.19742363174152608, "learning_rate": 3.975678203928906e-06, "loss": 0.2968, "step": 2206 }, { "epoch": 2.783596214511041, "grad_norm": 0.20335754383427834, "learning_rate": 3.952291861552853e-06, "loss": 0.2757, "step": 2207 }, { "epoch": 2.7848580441640376, "grad_norm": 0.19516168585376054, "learning_rate": 3.928905519176801e-06, "loss": 0.288, "step": 2208 }, { "epoch": 2.786119873817035, "grad_norm": 0.20417953692041577, "learning_rate": 3.905519176800749e-06, "loss": 0.284, "step": 2209 }, { "epoch": 2.7873817034700314, "grad_norm": 0.20784100107518538, "learning_rate": 3.882132834424696e-06, "loss": 0.288, "step": 2210 }, { "epoch": 2.7886435331230284, "grad_norm": 0.21490391813560325, "learning_rate": 3.858746492048644e-06, "loss": 0.2835, "step": 2211 }, { "epoch": 2.7899053627760253, "grad_norm": 0.20375189589287132, "learning_rate": 3.8353601496725915e-06, "loss": 0.2986, "step": 2212 }, { "epoch": 2.7911671924290222, "grad_norm": 0.2088710028598101, "learning_rate": 3.811973807296539e-06, "loss": 0.288, "step": 2213 }, { "epoch": 2.792429022082019, "grad_norm": 0.19366173529517336, "learning_rate": 3.7885874649204867e-06, "loss": 0.2743, "step": 2214 }, { "epoch": 2.7936908517350156, "grad_norm": 0.2041125652010544, "learning_rate": 3.7652011225444347e-06, "loss": 0.275, "step": 2215 }, { "epoch": 2.7949526813880126, "grad_norm": 0.20358383687643214, "learning_rate": 3.741814780168382e-06, "loss": 0.2621, "step": 2216 }, { "epoch": 2.7962145110410095, "grad_norm": 0.20375636798178726, "learning_rate": 3.71842843779233e-06, "loss": 0.2891, "step": 2217 }, { "epoch": 2.7974763406940064, "grad_norm": 0.2084533358741522, "learning_rate": 3.695042095416277e-06, "loss": 0.2919, "step": 2218 }, { "epoch": 2.7987381703470033, "grad_norm": 0.21392893466660687, "learning_rate": 3.671655753040225e-06, "loss": 0.2944, "step": 2219 }, { "epoch": 2.8, "grad_norm": 0.20662358278683163, "learning_rate": 3.648269410664172e-06, "loss": 0.2987, "step": 2220 }, { "epoch": 2.8012618296529967, "grad_norm": 0.2006443491477381, "learning_rate": 3.62488306828812e-06, "loss": 0.2824, "step": 2221 }, { "epoch": 2.8025236593059937, "grad_norm": 0.19293545202256024, "learning_rate": 3.6014967259120673e-06, "loss": 0.291, "step": 2222 }, { "epoch": 2.8037854889589906, "grad_norm": 0.19274480111913353, "learning_rate": 3.5781103835360153e-06, "loss": 0.2763, "step": 2223 }, { "epoch": 2.8050473186119875, "grad_norm": 0.19673774336326832, "learning_rate": 3.5547240411599625e-06, "loss": 0.2924, "step": 2224 }, { "epoch": 2.806309148264984, "grad_norm": 0.20314693128516886, "learning_rate": 3.5313376987839105e-06, "loss": 0.2834, "step": 2225 }, { "epoch": 2.807570977917981, "grad_norm": 0.1961819111850614, "learning_rate": 3.5079513564078577e-06, "loss": 0.3013, "step": 2226 }, { "epoch": 2.808832807570978, "grad_norm": 0.2033266884588355, "learning_rate": 3.4845650140318057e-06, "loss": 0.2911, "step": 2227 }, { "epoch": 2.810094637223975, "grad_norm": 0.22588281696263116, "learning_rate": 3.461178671655753e-06, "loss": 0.3021, "step": 2228 }, { "epoch": 2.8113564668769717, "grad_norm": 0.19832349152405943, "learning_rate": 3.437792329279701e-06, "loss": 0.2837, "step": 2229 }, { "epoch": 2.812618296529968, "grad_norm": 0.2053094656514767, "learning_rate": 3.414405986903649e-06, "loss": 0.3053, "step": 2230 }, { "epoch": 2.8138801261829656, "grad_norm": 0.19544736330751247, "learning_rate": 3.391019644527596e-06, "loss": 0.2679, "step": 2231 }, { "epoch": 2.815141955835962, "grad_norm": 0.22147493015505706, "learning_rate": 3.367633302151544e-06, "loss": 0.3101, "step": 2232 }, { "epoch": 2.816403785488959, "grad_norm": 0.20130635320368762, "learning_rate": 3.344246959775491e-06, "loss": 0.2811, "step": 2233 }, { "epoch": 2.817665615141956, "grad_norm": 0.1914487459126035, "learning_rate": 3.320860617399439e-06, "loss": 0.2864, "step": 2234 }, { "epoch": 2.818927444794953, "grad_norm": 0.20804223623178442, "learning_rate": 3.2974742750233863e-06, "loss": 0.2862, "step": 2235 }, { "epoch": 2.8201892744479498, "grad_norm": 0.20130561926132473, "learning_rate": 3.2740879326473343e-06, "loss": 0.2926, "step": 2236 }, { "epoch": 2.8214511041009462, "grad_norm": 0.21018199934844362, "learning_rate": 3.2507015902712815e-06, "loss": 0.276, "step": 2237 }, { "epoch": 2.822712933753943, "grad_norm": 0.2019980499998044, "learning_rate": 3.2273152478952295e-06, "loss": 0.2893, "step": 2238 }, { "epoch": 2.82397476340694, "grad_norm": 0.21166113310318213, "learning_rate": 3.2039289055191766e-06, "loss": 0.2886, "step": 2239 }, { "epoch": 2.825236593059937, "grad_norm": 0.20106740737550646, "learning_rate": 3.1805425631431246e-06, "loss": 0.2894, "step": 2240 }, { "epoch": 2.826498422712934, "grad_norm": 0.19824988754671297, "learning_rate": 3.157156220767072e-06, "loss": 0.2796, "step": 2241 }, { "epoch": 2.8277602523659304, "grad_norm": 0.19662473951945347, "learning_rate": 3.13376987839102e-06, "loss": 0.2731, "step": 2242 }, { "epoch": 2.8290220820189274, "grad_norm": 0.19949001205660272, "learning_rate": 3.1103835360149674e-06, "loss": 0.2793, "step": 2243 }, { "epoch": 2.8302839116719243, "grad_norm": 0.19217246228434443, "learning_rate": 3.086997193638915e-06, "loss": 0.3004, "step": 2244 }, { "epoch": 2.831545741324921, "grad_norm": 0.20019506735049664, "learning_rate": 3.0636108512628626e-06, "loss": 0.2869, "step": 2245 }, { "epoch": 2.832807570977918, "grad_norm": 0.19812153229655402, "learning_rate": 3.04022450888681e-06, "loss": 0.2857, "step": 2246 }, { "epoch": 2.8340694006309146, "grad_norm": 0.1931467886078895, "learning_rate": 3.0168381665107577e-06, "loss": 0.2748, "step": 2247 }, { "epoch": 2.8353312302839115, "grad_norm": 0.1974884551031706, "learning_rate": 2.9934518241347057e-06, "loss": 0.2894, "step": 2248 }, { "epoch": 2.8365930599369085, "grad_norm": 0.19443328241212252, "learning_rate": 2.9700654817586533e-06, "loss": 0.2824, "step": 2249 }, { "epoch": 2.8378548895899054, "grad_norm": 0.19195460135249937, "learning_rate": 2.946679139382601e-06, "loss": 0.2803, "step": 2250 }, { "epoch": 2.8391167192429023, "grad_norm": 0.19081138242521598, "learning_rate": 2.9232927970065485e-06, "loss": 0.2882, "step": 2251 }, { "epoch": 2.840378548895899, "grad_norm": 0.18847843860387237, "learning_rate": 2.899906454630496e-06, "loss": 0.2736, "step": 2252 }, { "epoch": 2.841640378548896, "grad_norm": 0.20148904227901962, "learning_rate": 2.8765201122544436e-06, "loss": 0.273, "step": 2253 }, { "epoch": 2.8429022082018927, "grad_norm": 0.20092443378433492, "learning_rate": 2.853133769878391e-06, "loss": 0.2834, "step": 2254 }, { "epoch": 2.8441640378548896, "grad_norm": 0.20318657282183325, "learning_rate": 2.829747427502339e-06, "loss": 0.2959, "step": 2255 }, { "epoch": 2.8454258675078865, "grad_norm": 0.19336070529678423, "learning_rate": 2.8063610851262864e-06, "loss": 0.2952, "step": 2256 }, { "epoch": 2.8466876971608834, "grad_norm": 0.19090344800147846, "learning_rate": 2.782974742750234e-06, "loss": 0.2774, "step": 2257 }, { "epoch": 2.8479495268138804, "grad_norm": 0.19630467083218575, "learning_rate": 2.7595884003741815e-06, "loss": 0.2786, "step": 2258 }, { "epoch": 2.849211356466877, "grad_norm": 0.1962225563438849, "learning_rate": 2.736202057998129e-06, "loss": 0.2858, "step": 2259 }, { "epoch": 2.8504731861198738, "grad_norm": 0.2011625485299181, "learning_rate": 2.7128157156220767e-06, "loss": 0.2821, "step": 2260 }, { "epoch": 2.8517350157728707, "grad_norm": 0.2182669317682995, "learning_rate": 2.6894293732460247e-06, "loss": 0.2965, "step": 2261 }, { "epoch": 2.8529968454258676, "grad_norm": 0.1995706060968766, "learning_rate": 2.6660430308699723e-06, "loss": 0.2786, "step": 2262 }, { "epoch": 2.8542586750788645, "grad_norm": 0.20996422519042104, "learning_rate": 2.64265668849392e-06, "loss": 0.2813, "step": 2263 }, { "epoch": 2.855520504731861, "grad_norm": 0.1983283078456748, "learning_rate": 2.6192703461178675e-06, "loss": 0.2836, "step": 2264 }, { "epoch": 2.856782334384858, "grad_norm": 0.20782454138154796, "learning_rate": 2.595884003741815e-06, "loss": 0.2911, "step": 2265 }, { "epoch": 2.858044164037855, "grad_norm": 0.18304444440576798, "learning_rate": 2.5724976613657626e-06, "loss": 0.2686, "step": 2266 }, { "epoch": 2.859305993690852, "grad_norm": 0.23068747072768778, "learning_rate": 2.54911131898971e-06, "loss": 0.2835, "step": 2267 }, { "epoch": 2.8605678233438487, "grad_norm": 0.1965579289100782, "learning_rate": 2.5257249766136578e-06, "loss": 0.2792, "step": 2268 }, { "epoch": 2.861829652996845, "grad_norm": 0.1985253843561604, "learning_rate": 2.5023386342376054e-06, "loss": 0.285, "step": 2269 }, { "epoch": 2.863091482649842, "grad_norm": 0.18842285859958186, "learning_rate": 2.478952291861553e-06, "loss": 0.286, "step": 2270 }, { "epoch": 2.864353312302839, "grad_norm": 0.19033714449571237, "learning_rate": 2.4555659494855005e-06, "loss": 0.279, "step": 2271 }, { "epoch": 2.865615141955836, "grad_norm": 0.1842062163046925, "learning_rate": 2.432179607109448e-06, "loss": 0.2695, "step": 2272 }, { "epoch": 2.866876971608833, "grad_norm": 0.18908594677291424, "learning_rate": 2.4087932647333957e-06, "loss": 0.2745, "step": 2273 }, { "epoch": 2.8681388012618294, "grad_norm": 0.20229175340825428, "learning_rate": 2.3854069223573433e-06, "loss": 0.2834, "step": 2274 }, { "epoch": 2.8694006309148263, "grad_norm": 0.20323338765129734, "learning_rate": 2.362020579981291e-06, "loss": 0.3119, "step": 2275 }, { "epoch": 2.8706624605678233, "grad_norm": 0.21579379381440456, "learning_rate": 2.3386342376052384e-06, "loss": 0.2957, "step": 2276 }, { "epoch": 2.87192429022082, "grad_norm": 0.19207377859939698, "learning_rate": 2.315247895229186e-06, "loss": 0.289, "step": 2277 }, { "epoch": 2.873186119873817, "grad_norm": 0.19641142749999166, "learning_rate": 2.2918615528531336e-06, "loss": 0.2892, "step": 2278 }, { "epoch": 2.874447949526814, "grad_norm": 0.19874583154750702, "learning_rate": 2.268475210477081e-06, "loss": 0.2906, "step": 2279 }, { "epoch": 2.875709779179811, "grad_norm": 0.20047868415989067, "learning_rate": 2.2450888681010288e-06, "loss": 0.2934, "step": 2280 }, { "epoch": 2.8769716088328074, "grad_norm": 0.20486859711622593, "learning_rate": 2.2217025257249768e-06, "loss": 0.2916, "step": 2281 }, { "epoch": 2.8782334384858044, "grad_norm": 0.23950238868054524, "learning_rate": 2.1983161833489243e-06, "loss": 0.3089, "step": 2282 }, { "epoch": 2.8794952681388013, "grad_norm": 0.19574073325145003, "learning_rate": 2.174929840972872e-06, "loss": 0.2861, "step": 2283 }, { "epoch": 2.8807570977917982, "grad_norm": 0.18912656976485773, "learning_rate": 2.1515434985968195e-06, "loss": 0.2805, "step": 2284 }, { "epoch": 2.882018927444795, "grad_norm": 0.19503086679760348, "learning_rate": 2.128157156220767e-06, "loss": 0.2756, "step": 2285 }, { "epoch": 2.8832807570977916, "grad_norm": 2.0013164300864994, "learning_rate": 2.104770813844715e-06, "loss": 0.3457, "step": 2286 }, { "epoch": 2.8845425867507886, "grad_norm": 0.1985582219729309, "learning_rate": 2.0813844714686627e-06, "loss": 0.2775, "step": 2287 }, { "epoch": 2.8858044164037855, "grad_norm": 0.18431899004402189, "learning_rate": 2.0579981290926103e-06, "loss": 0.2701, "step": 2288 }, { "epoch": 2.8870662460567824, "grad_norm": 0.1909912834868144, "learning_rate": 2.034611786716558e-06, "loss": 0.2781, "step": 2289 }, { "epoch": 2.8883280757097793, "grad_norm": 0.1980032665162759, "learning_rate": 2.0112254443405054e-06, "loss": 0.2863, "step": 2290 }, { "epoch": 2.889589905362776, "grad_norm": 0.20067190700649626, "learning_rate": 1.987839101964453e-06, "loss": 0.2796, "step": 2291 }, { "epoch": 2.8908517350157727, "grad_norm": 0.21043172206714206, "learning_rate": 1.9644527595884006e-06, "loss": 0.2878, "step": 2292 }, { "epoch": 2.8921135646687697, "grad_norm": 0.19208073717102897, "learning_rate": 1.941066417212348e-06, "loss": 0.2814, "step": 2293 }, { "epoch": 2.8933753943217666, "grad_norm": 0.19400656533782487, "learning_rate": 1.9176800748362958e-06, "loss": 0.2707, "step": 2294 }, { "epoch": 2.8946372239747635, "grad_norm": 0.1977956617147864, "learning_rate": 1.8942937324602433e-06, "loss": 0.2761, "step": 2295 }, { "epoch": 2.89589905362776, "grad_norm": 0.1888083962572518, "learning_rate": 1.870907390084191e-06, "loss": 0.2835, "step": 2296 }, { "epoch": 2.897160883280757, "grad_norm": 0.1879969963459746, "learning_rate": 1.8475210477081385e-06, "loss": 0.2778, "step": 2297 }, { "epoch": 2.898422712933754, "grad_norm": 0.20071985421500943, "learning_rate": 1.824134705332086e-06, "loss": 0.3009, "step": 2298 }, { "epoch": 2.899684542586751, "grad_norm": 0.20682611904973305, "learning_rate": 1.8007483629560337e-06, "loss": 0.2724, "step": 2299 }, { "epoch": 2.9009463722397477, "grad_norm": 0.18816744665630306, "learning_rate": 1.7773620205799812e-06, "loss": 0.2831, "step": 2300 }, { "epoch": 2.9022082018927446, "grad_norm": 0.21308572591586908, "learning_rate": 1.7539756782039288e-06, "loss": 0.2948, "step": 2301 }, { "epoch": 2.9034700315457416, "grad_norm": 0.18784492779906062, "learning_rate": 1.7305893358278764e-06, "loss": 0.2818, "step": 2302 }, { "epoch": 2.904731861198738, "grad_norm": 0.1936531336842357, "learning_rate": 1.7072029934518244e-06, "loss": 0.2886, "step": 2303 }, { "epoch": 2.905993690851735, "grad_norm": 0.1908370252012221, "learning_rate": 1.683816651075772e-06, "loss": 0.2742, "step": 2304 }, { "epoch": 2.907255520504732, "grad_norm": 0.2003056639517309, "learning_rate": 1.6604303086997196e-06, "loss": 0.2839, "step": 2305 }, { "epoch": 2.908517350157729, "grad_norm": 0.1923536048039185, "learning_rate": 1.6370439663236672e-06, "loss": 0.2963, "step": 2306 }, { "epoch": 2.9097791798107258, "grad_norm": 0.20491632803448598, "learning_rate": 1.6136576239476147e-06, "loss": 0.2798, "step": 2307 }, { "epoch": 2.9110410094637222, "grad_norm": 0.18504676650715382, "learning_rate": 1.5902712815715623e-06, "loss": 0.2812, "step": 2308 }, { "epoch": 2.912302839116719, "grad_norm": 0.19102632181844081, "learning_rate": 1.56688493919551e-06, "loss": 0.2811, "step": 2309 }, { "epoch": 2.913564668769716, "grad_norm": 0.19413933696613703, "learning_rate": 1.5434985968194575e-06, "loss": 0.2793, "step": 2310 }, { "epoch": 2.914826498422713, "grad_norm": 0.19679780860330293, "learning_rate": 1.520112254443405e-06, "loss": 0.2865, "step": 2311 }, { "epoch": 2.91608832807571, "grad_norm": 0.1934003659338284, "learning_rate": 1.4967259120673529e-06, "loss": 0.2931, "step": 2312 }, { "epoch": 2.9173501577287064, "grad_norm": 0.20001712349794445, "learning_rate": 1.4733395696913004e-06, "loss": 0.2815, "step": 2313 }, { "epoch": 2.9186119873817034, "grad_norm": 0.18821667045530066, "learning_rate": 1.449953227315248e-06, "loss": 0.2694, "step": 2314 }, { "epoch": 2.9198738170347003, "grad_norm": 0.20504397288343496, "learning_rate": 1.4265668849391956e-06, "loss": 0.2952, "step": 2315 }, { "epoch": 2.921135646687697, "grad_norm": 0.19619857579815655, "learning_rate": 1.4031805425631432e-06, "loss": 0.295, "step": 2316 }, { "epoch": 2.922397476340694, "grad_norm": 0.20599325583762387, "learning_rate": 1.3797942001870908e-06, "loss": 0.2774, "step": 2317 }, { "epoch": 2.9236593059936906, "grad_norm": 0.1912756528026883, "learning_rate": 1.3564078578110384e-06, "loss": 0.2891, "step": 2318 }, { "epoch": 2.9249211356466875, "grad_norm": 0.19145253062694034, "learning_rate": 1.3330215154349861e-06, "loss": 0.2745, "step": 2319 }, { "epoch": 2.9261829652996845, "grad_norm": 0.18951950354744823, "learning_rate": 1.3096351730589337e-06, "loss": 0.2824, "step": 2320 }, { "epoch": 2.9274447949526814, "grad_norm": 0.20380722212258057, "learning_rate": 1.2862488306828813e-06, "loss": 0.2788, "step": 2321 }, { "epoch": 2.9287066246056783, "grad_norm": 0.194461438187136, "learning_rate": 1.2628624883068289e-06, "loss": 0.2816, "step": 2322 }, { "epoch": 2.929968454258675, "grad_norm": 0.199257828244804, "learning_rate": 1.2394761459307765e-06, "loss": 0.2846, "step": 2323 }, { "epoch": 2.931230283911672, "grad_norm": 0.20650130571443134, "learning_rate": 1.216089803554724e-06, "loss": 0.2849, "step": 2324 }, { "epoch": 2.9324921135646687, "grad_norm": 0.19680776354631474, "learning_rate": 1.1927034611786716e-06, "loss": 0.2906, "step": 2325 }, { "epoch": 2.9337539432176656, "grad_norm": 0.1926432282526223, "learning_rate": 1.1693171188026192e-06, "loss": 0.2897, "step": 2326 }, { "epoch": 2.9350157728706625, "grad_norm": 0.18883968799917913, "learning_rate": 1.1459307764265668e-06, "loss": 0.2921, "step": 2327 }, { "epoch": 2.9362776025236594, "grad_norm": 0.1812846994898678, "learning_rate": 1.1225444340505144e-06, "loss": 0.2637, "step": 2328 }, { "epoch": 2.9375394321766564, "grad_norm": 0.21151334532423927, "learning_rate": 1.0991580916744622e-06, "loss": 0.3027, "step": 2329 }, { "epoch": 2.938801261829653, "grad_norm": 0.1878166045366657, "learning_rate": 1.0757717492984098e-06, "loss": 0.2803, "step": 2330 }, { "epoch": 2.9400630914826498, "grad_norm": 0.20644181560416403, "learning_rate": 1.0523854069223575e-06, "loss": 0.2909, "step": 2331 }, { "epoch": 2.9413249211356467, "grad_norm": 0.18737208618850798, "learning_rate": 1.0289990645463051e-06, "loss": 0.2907, "step": 2332 }, { "epoch": 2.9425867507886436, "grad_norm": 0.192309181017649, "learning_rate": 1.0056127221702527e-06, "loss": 0.279, "step": 2333 }, { "epoch": 2.9438485804416406, "grad_norm": 0.19217958015678424, "learning_rate": 9.822263797942003e-07, "loss": 0.2919, "step": 2334 }, { "epoch": 2.945110410094637, "grad_norm": 0.19082441861596325, "learning_rate": 9.588400374181479e-07, "loss": 0.2763, "step": 2335 }, { "epoch": 2.946372239747634, "grad_norm": 0.20939467993024974, "learning_rate": 9.354536950420955e-07, "loss": 0.2907, "step": 2336 }, { "epoch": 2.947634069400631, "grad_norm": 0.1889105071802824, "learning_rate": 9.12067352666043e-07, "loss": 0.2854, "step": 2337 }, { "epoch": 2.948895899053628, "grad_norm": 0.1874791298460694, "learning_rate": 8.886810102899906e-07, "loss": 0.2745, "step": 2338 }, { "epoch": 2.9501577287066247, "grad_norm": 0.19605645626800905, "learning_rate": 8.652946679139382e-07, "loss": 0.2898, "step": 2339 }, { "epoch": 2.951419558359621, "grad_norm": 0.2195662425185686, "learning_rate": 8.41908325537886e-07, "loss": 0.288, "step": 2340 }, { "epoch": 2.952681388012618, "grad_norm": 0.18870615902846954, "learning_rate": 8.185219831618336e-07, "loss": 0.2865, "step": 2341 }, { "epoch": 2.953943217665615, "grad_norm": 0.19593632796541363, "learning_rate": 7.951356407857812e-07, "loss": 0.2728, "step": 2342 }, { "epoch": 2.955205047318612, "grad_norm": 0.19141767347545055, "learning_rate": 7.717492984097287e-07, "loss": 0.2856, "step": 2343 }, { "epoch": 2.956466876971609, "grad_norm": 0.19514924067879094, "learning_rate": 7.483629560336764e-07, "loss": 0.2851, "step": 2344 }, { "epoch": 2.9577287066246054, "grad_norm": 0.20337801611622602, "learning_rate": 7.24976613657624e-07, "loss": 0.3036, "step": 2345 }, { "epoch": 2.958990536277603, "grad_norm": 0.20049159073426834, "learning_rate": 7.015902712815716e-07, "loss": 0.2943, "step": 2346 }, { "epoch": 2.9602523659305993, "grad_norm": 0.196808634291867, "learning_rate": 6.782039289055192e-07, "loss": 0.2822, "step": 2347 }, { "epoch": 2.961514195583596, "grad_norm": 0.19695694688948262, "learning_rate": 6.548175865294669e-07, "loss": 0.2753, "step": 2348 }, { "epoch": 2.962776025236593, "grad_norm": 0.24195152962051172, "learning_rate": 6.314312441534144e-07, "loss": 0.3034, "step": 2349 }, { "epoch": 2.96403785488959, "grad_norm": 0.18829062390499068, "learning_rate": 6.08044901777362e-07, "loss": 0.291, "step": 2350 }, { "epoch": 2.965299684542587, "grad_norm": 0.18963488597797623, "learning_rate": 5.846585594013096e-07, "loss": 0.2957, "step": 2351 }, { "epoch": 2.9665615141955834, "grad_norm": 0.1928929340282869, "learning_rate": 5.612722170252572e-07, "loss": 0.2758, "step": 2352 }, { "epoch": 2.9678233438485804, "grad_norm": 0.18569050671565643, "learning_rate": 5.378858746492049e-07, "loss": 0.2853, "step": 2353 }, { "epoch": 2.9690851735015773, "grad_norm": 0.19576679843047806, "learning_rate": 5.144995322731526e-07, "loss": 0.2792, "step": 2354 }, { "epoch": 2.9703470031545742, "grad_norm": 0.18974057509405512, "learning_rate": 4.911131898971001e-07, "loss": 0.2923, "step": 2355 }, { "epoch": 2.971608832807571, "grad_norm": 0.20386657980568576, "learning_rate": 4.6772684752104773e-07, "loss": 0.2924, "step": 2356 }, { "epoch": 2.9728706624605676, "grad_norm": 0.19643395765538257, "learning_rate": 4.443405051449953e-07, "loss": 0.2972, "step": 2357 }, { "epoch": 2.9741324921135646, "grad_norm": 0.1778850129796762, "learning_rate": 4.20954162768943e-07, "loss": 0.2766, "step": 2358 }, { "epoch": 2.9753943217665615, "grad_norm": 0.18526183296388332, "learning_rate": 3.975678203928906e-07, "loss": 0.2725, "step": 2359 }, { "epoch": 2.9766561514195584, "grad_norm": 0.1861762404020392, "learning_rate": 3.741814780168382e-07, "loss": 0.2787, "step": 2360 }, { "epoch": 2.9779179810725553, "grad_norm": 0.19769846658891385, "learning_rate": 3.507951356407858e-07, "loss": 0.2931, "step": 2361 }, { "epoch": 2.979179810725552, "grad_norm": 0.19540161051526625, "learning_rate": 3.2740879326473343e-07, "loss": 0.2796, "step": 2362 }, { "epoch": 2.9804416403785488, "grad_norm": 0.2070632333621523, "learning_rate": 3.04022450888681e-07, "loss": 0.2864, "step": 2363 }, { "epoch": 2.9817034700315457, "grad_norm": 0.17805627623497985, "learning_rate": 2.806361085126286e-07, "loss": 0.274, "step": 2364 }, { "epoch": 2.9829652996845426, "grad_norm": 0.19562480680643513, "learning_rate": 2.572497661365763e-07, "loss": 0.2785, "step": 2365 }, { "epoch": 2.9842271293375395, "grad_norm": 0.18823783728448376, "learning_rate": 2.3386342376052386e-07, "loss": 0.2838, "step": 2366 }, { "epoch": 2.985488958990536, "grad_norm": 0.18604457858261886, "learning_rate": 2.104770813844715e-07, "loss": 0.2873, "step": 2367 }, { "epoch": 2.9867507886435334, "grad_norm": 0.19727070938193514, "learning_rate": 1.870907390084191e-07, "loss": 0.2816, "step": 2368 }, { "epoch": 2.98801261829653, "grad_norm": 0.18552916093261657, "learning_rate": 1.6370439663236672e-07, "loss": 0.2725, "step": 2369 }, { "epoch": 2.989274447949527, "grad_norm": 0.22331272922934614, "learning_rate": 1.403180542563143e-07, "loss": 0.2989, "step": 2370 }, { "epoch": 2.9905362776025237, "grad_norm": 0.18426741517716266, "learning_rate": 1.1693171188026193e-07, "loss": 0.2825, "step": 2371 }, { "epoch": 2.9917981072555206, "grad_norm": 0.1817779379276072, "learning_rate": 9.354536950420955e-08, "loss": 0.277, "step": 2372 }, { "epoch": 2.9930599369085176, "grad_norm": 0.20179757428313053, "learning_rate": 7.015902712815715e-08, "loss": 0.2866, "step": 2373 }, { "epoch": 2.994321766561514, "grad_norm": 0.1915817848134601, "learning_rate": 4.677268475210478e-08, "loss": 0.2904, "step": 2374 }, { "epoch": 2.995583596214511, "grad_norm": 0.19727899982642197, "learning_rate": 2.338634237605239e-08, "loss": 0.281, "step": 2375 }, { "epoch": 2.996845425867508, "grad_norm": 0.18640986238054733, "learning_rate": 0.0, "loss": 0.2743, "step": 2376 }, { "epoch": 2.996845425867508, "step": 2376, "total_flos": 2.0304218331986002e+18, "train_loss": 0.4356661826648094, "train_runtime": 137728.6279, "train_samples_per_second": 0.276, "train_steps_per_second": 0.017 } ], "logging_steps": 1, "max_steps": 2376, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.0304218331986002e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }