{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.1664, "eval_steps": 500, "global_step": 5000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0002, "grad_norm": 127.02848815917969, "learning_rate": 5.000000000000001e-07, "loss": 8.9894, "step": 1 }, { "epoch": 0.0004, "grad_norm": 597.7093505859375, "learning_rate": 1.0000000000000002e-06, "loss": 11.8861, "step": 2 }, { "epoch": 0.0006, "grad_norm": 672.9093627929688, "learning_rate": 1.5e-06, "loss": 22.2202, "step": 3 }, { "epoch": 0.0008, "grad_norm": 203.70701599121094, "learning_rate": 2.0000000000000003e-06, "loss": 9.8539, "step": 4 }, { "epoch": 0.001, "grad_norm": 621.4810180664062, "learning_rate": 2.5e-06, "loss": 14.1359, "step": 5 }, { "epoch": 0.0012, "grad_norm": 97.86430358886719, "learning_rate": 3e-06, "loss": 4.5683, "step": 6 }, { "epoch": 0.0014, "grad_norm": 134.84072875976562, "learning_rate": 3.5000000000000004e-06, "loss": 9.4148, "step": 7 }, { "epoch": 0.0016, "grad_norm": 221.8789825439453, "learning_rate": 4.000000000000001e-06, "loss": 8.9489, "step": 8 }, { "epoch": 0.0018, "grad_norm": 574.8094482421875, "learning_rate": 4.5e-06, "loss": 10.2934, "step": 9 }, { "epoch": 0.002, "grad_norm": 211.35472106933594, "learning_rate": 5e-06, "loss": 10.3731, "step": 10 }, { "epoch": 0.0022, "grad_norm": 235.362060546875, "learning_rate": 5.500000000000001e-06, "loss": 9.9291, "step": 11 }, { "epoch": 0.0024, "grad_norm": 94.09746551513672, "learning_rate": 6e-06, "loss": 4.4554, "step": 12 }, { "epoch": 0.0026, "grad_norm": 83.25800323486328, "learning_rate": 6.5000000000000004e-06, "loss": 7.3829, "step": 13 }, { "epoch": 0.0028, "grad_norm": 491.2541809082031, "learning_rate": 7.000000000000001e-06, "loss": 22.5135, "step": 14 }, { "epoch": 0.003, "grad_norm": 180.1479949951172, "learning_rate": 7.5e-06, "loss": 9.7904, "step": 15 }, { "epoch": 0.0032, "grad_norm": 1482.721435546875, "learning_rate": 8.000000000000001e-06, "loss": 15.6138, "step": 16 }, { "epoch": 0.0034, "grad_norm": 141.54798889160156, "learning_rate": 8.500000000000002e-06, "loss": 8.4214, "step": 17 }, { "epoch": 0.0036, "grad_norm": 265.2538757324219, "learning_rate": 9e-06, "loss": 8.1851, "step": 18 }, { "epoch": 0.0038, "grad_norm": 506.762451171875, "learning_rate": 9.5e-06, "loss": 18.3503, "step": 19 }, { "epoch": 0.004, "grad_norm": 533.5099487304688, "learning_rate": 1e-05, "loss": 9.0975, "step": 20 }, { "epoch": 0.0042, "grad_norm": 84.2190933227539, "learning_rate": 1.05e-05, "loss": 8.2995, "step": 21 }, { "epoch": 0.0044, "grad_norm": 1646.957275390625, "learning_rate": 1.1000000000000001e-05, "loss": 16.4511, "step": 22 }, { "epoch": 0.0046, "grad_norm": 73.73599243164062, "learning_rate": 1.1500000000000002e-05, "loss": 3.7298, "step": 23 }, { "epoch": 0.0048, "grad_norm": 487.625244140625, "learning_rate": 1.2e-05, "loss": 17.5225, "step": 24 }, { "epoch": 0.005, "grad_norm": 59.372535705566406, "learning_rate": 1.25e-05, "loss": 7.8804, "step": 25 }, { "epoch": 0.0052, "grad_norm": 97.49443054199219, "learning_rate": 1.3000000000000001e-05, "loss": 7.7473, "step": 26 }, { "epoch": 0.0054, "grad_norm": 100.4667739868164, "learning_rate": 1.3500000000000001e-05, "loss": 8.4293, "step": 27 }, { "epoch": 0.0056, "grad_norm": 44.721805572509766, "learning_rate": 1.4000000000000001e-05, "loss": 7.0762, "step": 28 }, { "epoch": 0.0058, "grad_norm": 920.8779907226562, "learning_rate": 1.45e-05, "loss": 11.9611, "step": 29 }, { "epoch": 0.006, "grad_norm": 270.84088134765625, "learning_rate": 1.5e-05, "loss": 8.3698, "step": 30 }, { "epoch": 0.0062, "grad_norm": 55.78865432739258, "learning_rate": 1.55e-05, "loss": 7.5168, "step": 31 }, { "epoch": 0.0064, "grad_norm": 54.78589630126953, "learning_rate": 1.6000000000000003e-05, "loss": 3.2809, "step": 32 }, { "epoch": 0.0066, "grad_norm": 99.09492492675781, "learning_rate": 1.65e-05, "loss": 7.8991, "step": 33 }, { "epoch": 0.0068, "grad_norm": 102.16423034667969, "learning_rate": 1.7000000000000003e-05, "loss": 7.4389, "step": 34 }, { "epoch": 0.007, "grad_norm": 53.18633270263672, "learning_rate": 1.75e-05, "loss": 7.7285, "step": 35 }, { "epoch": 0.0072, "grad_norm": 2191.484130859375, "learning_rate": 1.8e-05, "loss": 11.3803, "step": 36 }, { "epoch": 0.0074, "grad_norm": 48.38761901855469, "learning_rate": 1.85e-05, "loss": 6.9352, "step": 37 }, { "epoch": 0.0076, "grad_norm": 37.02681350708008, "learning_rate": 1.9e-05, "loss": 7.463, "step": 38 }, { "epoch": 0.0078, "grad_norm": 804.8701782226562, "learning_rate": 1.9500000000000003e-05, "loss": 10.1294, "step": 39 }, { "epoch": 0.008, "grad_norm": 33.863460540771484, "learning_rate": 2e-05, "loss": 6.993, "step": 40 }, { "epoch": 0.0082, "grad_norm": 122.3696060180664, "learning_rate": 2.05e-05, "loss": 7.8665, "step": 41 }, { "epoch": 0.0084, "grad_norm": 237.02120971679688, "learning_rate": 2.1e-05, "loss": 10.9977, "step": 42 }, { "epoch": 0.0086, "grad_norm": 114.43867492675781, "learning_rate": 2.15e-05, "loss": 6.6069, "step": 43 }, { "epoch": 0.0088, "grad_norm": 36.79835891723633, "learning_rate": 2.2000000000000003e-05, "loss": 7.1009, "step": 44 }, { "epoch": 0.009, "grad_norm": 205.56387329101562, "learning_rate": 2.25e-05, "loss": 10.1884, "step": 45 }, { "epoch": 0.0092, "grad_norm": 43.2474479675293, "learning_rate": 2.3000000000000003e-05, "loss": 6.7291, "step": 46 }, { "epoch": 0.0094, "grad_norm": 47.37004089355469, "learning_rate": 2.35e-05, "loss": 6.4736, "step": 47 }, { "epoch": 0.0096, "grad_norm": 542.9366455078125, "learning_rate": 2.4e-05, "loss": 10.2724, "step": 48 }, { "epoch": 0.0098, "grad_norm": 99.73185729980469, "learning_rate": 2.45e-05, "loss": 5.6246, "step": 49 }, { "epoch": 0.01, "grad_norm": 38.3469123840332, "learning_rate": 2.5e-05, "loss": 6.3691, "step": 50 }, { "epoch": 0.0102, "grad_norm": 76.59410095214844, "learning_rate": 2.5500000000000003e-05, "loss": 7.3026, "step": 51 }, { "epoch": 0.0104, "grad_norm": 234.27630615234375, "learning_rate": 2.6000000000000002e-05, "loss": 8.6945, "step": 52 }, { "epoch": 0.0106, "grad_norm": 44.871097564697266, "learning_rate": 2.6500000000000004e-05, "loss": 6.2657, "step": 53 }, { "epoch": 0.0108, "grad_norm": 60.8604850769043, "learning_rate": 2.7000000000000002e-05, "loss": 6.6361, "step": 54 }, { "epoch": 0.011, "grad_norm": 62.86309051513672, "learning_rate": 2.7500000000000004e-05, "loss": 4.5686, "step": 55 }, { "epoch": 0.0112, "grad_norm": 56.52326583862305, "learning_rate": 2.8000000000000003e-05, "loss": 5.9374, "step": 56 }, { "epoch": 0.0114, "grad_norm": 29.06716537475586, "learning_rate": 2.8499999999999998e-05, "loss": 6.9688, "step": 57 }, { "epoch": 0.0116, "grad_norm": 44.0078125, "learning_rate": 2.9e-05, "loss": 4.9339, "step": 58 }, { "epoch": 0.0118, "grad_norm": 42.00264358520508, "learning_rate": 2.95e-05, "loss": 5.7515, "step": 59 }, { "epoch": 0.012, "grad_norm": 140.99119567871094, "learning_rate": 3e-05, "loss": 6.9327, "step": 60 }, { "epoch": 0.0122, "grad_norm": 39.872074127197266, "learning_rate": 3.05e-05, "loss": 5.49, "step": 61 }, { "epoch": 0.0124, "grad_norm": 29.44024658203125, "learning_rate": 3.1e-05, "loss": 6.6595, "step": 62 }, { "epoch": 0.0126, "grad_norm": 29.17344856262207, "learning_rate": 3.15e-05, "loss": 4.0179, "step": 63 }, { "epoch": 0.0128, "grad_norm": 118.6063003540039, "learning_rate": 3.2000000000000005e-05, "loss": 4.1218, "step": 64 }, { "epoch": 0.013, "grad_norm": 33.67353820800781, "learning_rate": 3.2500000000000004e-05, "loss": 4.605, "step": 65 }, { "epoch": 0.0132, "grad_norm": 31.440397262573242, "learning_rate": 3.3e-05, "loss": 4.8518, "step": 66 }, { "epoch": 0.0134, "grad_norm": 17.99005699157715, "learning_rate": 3.35e-05, "loss": 3.2932, "step": 67 }, { "epoch": 0.0136, "grad_norm": 26.642446517944336, "learning_rate": 3.4000000000000007e-05, "loss": 6.224, "step": 68 }, { "epoch": 0.0138, "grad_norm": 311.91741943359375, "learning_rate": 3.45e-05, "loss": 7.6243, "step": 69 }, { "epoch": 0.014, "grad_norm": 82.06395721435547, "learning_rate": 3.5e-05, "loss": 7.1435, "step": 70 }, { "epoch": 0.0142, "grad_norm": 262.7768249511719, "learning_rate": 3.55e-05, "loss": 4.886, "step": 71 }, { "epoch": 0.0144, "grad_norm": 37.81349563598633, "learning_rate": 3.6e-05, "loss": 6.8874, "step": 72 }, { "epoch": 0.0146, "grad_norm": 74.8050765991211, "learning_rate": 3.65e-05, "loss": 2.0786, "step": 73 }, { "epoch": 0.0148, "grad_norm": 90.38041687011719, "learning_rate": 3.7e-05, "loss": 3.6252, "step": 74 }, { "epoch": 0.015, "grad_norm": 60.22389602661133, "learning_rate": 3.7500000000000003e-05, "loss": 4.9044, "step": 75 }, { "epoch": 0.0152, "grad_norm": 340.64166259765625, "learning_rate": 3.8e-05, "loss": 8.3815, "step": 76 }, { "epoch": 0.0154, "grad_norm": 27.283594131469727, "learning_rate": 3.85e-05, "loss": 5.8438, "step": 77 }, { "epoch": 0.0156, "grad_norm": 14.617252349853516, "learning_rate": 3.9000000000000006e-05, "loss": 3.4718, "step": 78 }, { "epoch": 0.0158, "grad_norm": 20.60955810546875, "learning_rate": 3.9500000000000005e-05, "loss": 4.0038, "step": 79 }, { "epoch": 0.016, "grad_norm": 24.20742416381836, "learning_rate": 4e-05, "loss": 1.6708, "step": 80 }, { "epoch": 0.0162, "grad_norm": 9.425049781799316, "learning_rate": 4.05e-05, "loss": 3.2246, "step": 81 }, { "epoch": 0.0164, "grad_norm": 133.04257202148438, "learning_rate": 4.1e-05, "loss": 6.8735, "step": 82 }, { "epoch": 0.0166, "grad_norm": 34.120025634765625, "learning_rate": 4.15e-05, "loss": 1.512, "step": 83 }, { "epoch": 0.0168, "grad_norm": 95.99542999267578, "learning_rate": 4.2e-05, "loss": 4.0492, "step": 84 }, { "epoch": 0.017, "grad_norm": 130.52586364746094, "learning_rate": 4.25e-05, "loss": 7.1804, "step": 85 }, { "epoch": 0.0172, "grad_norm": 50.44832229614258, "learning_rate": 4.3e-05, "loss": 5.8466, "step": 86 }, { "epoch": 0.0174, "grad_norm": 13.494194984436035, "learning_rate": 4.35e-05, "loss": 1.1162, "step": 87 }, { "epoch": 0.0176, "grad_norm": 32.111392974853516, "learning_rate": 4.4000000000000006e-05, "loss": 5.315, "step": 88 }, { "epoch": 0.0178, "grad_norm": 54.81203842163086, "learning_rate": 4.4500000000000004e-05, "loss": 3.1099, "step": 89 }, { "epoch": 0.018, "grad_norm": 19.034406661987305, "learning_rate": 4.5e-05, "loss": 2.7665, "step": 90 }, { "epoch": 0.0182, "grad_norm": 31.54140281677246, "learning_rate": 4.55e-05, "loss": 2.5888, "step": 91 }, { "epoch": 0.0184, "grad_norm": 11.20257568359375, "learning_rate": 4.600000000000001e-05, "loss": 1.1534, "step": 92 }, { "epoch": 0.0186, "grad_norm": 32.47492218017578, "learning_rate": 4.6500000000000005e-05, "loss": 4.7617, "step": 93 }, { "epoch": 0.0188, "grad_norm": 34.09675216674805, "learning_rate": 4.7e-05, "loss": 2.0035, "step": 94 }, { "epoch": 0.019, "grad_norm": 37.695125579833984, "learning_rate": 4.75e-05, "loss": 5.3329, "step": 95 }, { "epoch": 0.0192, "grad_norm": 20.07520294189453, "learning_rate": 4.8e-05, "loss": 2.5175, "step": 96 }, { "epoch": 0.0194, "grad_norm": 415.35711669921875, "learning_rate": 4.85e-05, "loss": 10.5392, "step": 97 }, { "epoch": 0.0196, "grad_norm": 41.11747741699219, "learning_rate": 4.9e-05, "loss": 4.6094, "step": 98 }, { "epoch": 0.0198, "grad_norm": 42.81120300292969, "learning_rate": 4.9500000000000004e-05, "loss": 6.263, "step": 99 }, { "epoch": 0.02, "grad_norm": 14.508249282836914, "learning_rate": 5e-05, "loss": 0.8818, "step": 100 }, { "epoch": 0.0202, "grad_norm": 10.983814239501953, "learning_rate": 4.998979591836735e-05, "loss": 0.7669, "step": 101 }, { "epoch": 0.0204, "grad_norm": 9.199824333190918, "learning_rate": 4.9979591836734694e-05, "loss": 1.0418, "step": 102 }, { "epoch": 0.0206, "grad_norm": 66.83099365234375, "learning_rate": 4.996938775510204e-05, "loss": 1.6225, "step": 103 }, { "epoch": 0.0208, "grad_norm": 7.638471603393555, "learning_rate": 4.995918367346939e-05, "loss": 0.4936, "step": 104 }, { "epoch": 0.021, "grad_norm": 10.665667533874512, "learning_rate": 4.9948979591836735e-05, "loss": 0.8371, "step": 105 }, { "epoch": 0.0212, "grad_norm": 10.239150047302246, "learning_rate": 4.9938775510204084e-05, "loss": 0.8516, "step": 106 }, { "epoch": 0.0214, "grad_norm": 34.41289138793945, "learning_rate": 4.992857142857143e-05, "loss": 2.7268, "step": 107 }, { "epoch": 0.0216, "grad_norm": 107.72695922851562, "learning_rate": 4.9918367346938776e-05, "loss": 6.4053, "step": 108 }, { "epoch": 0.0218, "grad_norm": 64.1345443725586, "learning_rate": 4.9908163265306125e-05, "loss": 3.2117, "step": 109 }, { "epoch": 0.022, "grad_norm": 11.564123153686523, "learning_rate": 4.9897959183673474e-05, "loss": 0.8914, "step": 110 }, { "epoch": 0.0222, "grad_norm": 23.555273056030273, "learning_rate": 4.9887755102040816e-05, "loss": 2.3173, "step": 111 }, { "epoch": 0.0224, "grad_norm": 26.109325408935547, "learning_rate": 4.9877551020408165e-05, "loss": 2.4154, "step": 112 }, { "epoch": 0.0226, "grad_norm": 37.607215881347656, "learning_rate": 4.986734693877551e-05, "loss": 4.3035, "step": 113 }, { "epoch": 0.0228, "grad_norm": 8.989594459533691, "learning_rate": 4.985714285714286e-05, "loss": 2.5046, "step": 114 }, { "epoch": 0.023, "grad_norm": 27.475799560546875, "learning_rate": 4.9846938775510206e-05, "loss": 2.4623, "step": 115 }, { "epoch": 0.0232, "grad_norm": 5.941082954406738, "learning_rate": 4.983673469387755e-05, "loss": 0.7323, "step": 116 }, { "epoch": 0.0234, "grad_norm": 6.239040374755859, "learning_rate": 4.98265306122449e-05, "loss": 0.7388, "step": 117 }, { "epoch": 0.0236, "grad_norm": 5.513604640960693, "learning_rate": 4.981632653061225e-05, "loss": 0.6584, "step": 118 }, { "epoch": 0.0238, "grad_norm": 43.922508239746094, "learning_rate": 4.980612244897959e-05, "loss": 5.6676, "step": 119 }, { "epoch": 0.024, "grad_norm": 8.907512664794922, "learning_rate": 4.979591836734694e-05, "loss": 2.0489, "step": 120 }, { "epoch": 0.0242, "grad_norm": 6.571248531341553, "learning_rate": 4.978571428571429e-05, "loss": 2.4868, "step": 121 }, { "epoch": 0.0244, "grad_norm": 9.025981903076172, "learning_rate": 4.977551020408163e-05, "loss": 3.183, "step": 122 }, { "epoch": 0.0246, "grad_norm": 82.9317398071289, "learning_rate": 4.9765306122448986e-05, "loss": 6.5242, "step": 123 }, { "epoch": 0.0248, "grad_norm": 43.22480010986328, "learning_rate": 4.975510204081633e-05, "loss": 5.2471, "step": 124 }, { "epoch": 0.025, "grad_norm": 5.8927154541015625, "learning_rate": 4.974489795918368e-05, "loss": 6.9892, "step": 125 }, { "epoch": 0.0252, "grad_norm": 12.013327598571777, "learning_rate": 4.973469387755103e-05, "loss": 1.9226, "step": 126 }, { "epoch": 0.0254, "grad_norm": 15.207947731018066, "learning_rate": 4.972448979591837e-05, "loss": 6.7809, "step": 127 }, { "epoch": 0.0256, "grad_norm": 17.63144302368164, "learning_rate": 4.971428571428572e-05, "loss": 1.7614, "step": 128 }, { "epoch": 0.0258, "grad_norm": 28.31899642944336, "learning_rate": 4.970408163265307e-05, "loss": 2.4694, "step": 129 }, { "epoch": 0.026, "grad_norm": 5.781543254852295, "learning_rate": 4.969387755102041e-05, "loss": 2.5552, "step": 130 }, { "epoch": 0.0262, "grad_norm": 80.65481567382812, "learning_rate": 4.968367346938776e-05, "loss": 3.0708, "step": 131 }, { "epoch": 0.0264, "grad_norm": 4.071949005126953, "learning_rate": 4.967346938775511e-05, "loss": 0.2991, "step": 132 }, { "epoch": 0.0266, "grad_norm": 4.599740982055664, "learning_rate": 4.966326530612245e-05, "loss": 2.3899, "step": 133 }, { "epoch": 0.0268, "grad_norm": 20.408233642578125, "learning_rate": 4.96530612244898e-05, "loss": 6.7909, "step": 134 }, { "epoch": 0.027, "grad_norm": 4.217963218688965, "learning_rate": 4.964285714285715e-05, "loss": 2.3584, "step": 135 }, { "epoch": 0.0272, "grad_norm": 120.89624786376953, "learning_rate": 4.963265306122449e-05, "loss": 7.0599, "step": 136 }, { "epoch": 0.0274, "grad_norm": 4.268696308135986, "learning_rate": 4.962244897959184e-05, "loss": 0.3368, "step": 137 }, { "epoch": 0.0276, "grad_norm": 8.89269733428955, "learning_rate": 4.961224489795919e-05, "loss": 1.674, "step": 138 }, { "epoch": 0.0278, "grad_norm": 4.60618257522583, "learning_rate": 4.960204081632653e-05, "loss": 0.5549, "step": 139 }, { "epoch": 0.028, "grad_norm": 37.98936462402344, "learning_rate": 4.959183673469388e-05, "loss": 5.3535, "step": 140 }, { "epoch": 0.0282, "grad_norm": 8.033515930175781, "learning_rate": 4.958163265306123e-05, "loss": 1.5668, "step": 141 }, { "epoch": 0.0284, "grad_norm": 7.632925033569336, "learning_rate": 4.957142857142857e-05, "loss": 1.5609, "step": 142 }, { "epoch": 0.0286, "grad_norm": 3.267777681350708, "learning_rate": 4.956122448979592e-05, "loss": 0.2475, "step": 143 }, { "epoch": 0.0288, "grad_norm": 4.545289039611816, "learning_rate": 4.9551020408163265e-05, "loss": 2.3144, "step": 144 }, { "epoch": 0.029, "grad_norm": 19.45534324645996, "learning_rate": 4.9540816326530614e-05, "loss": 1.4105, "step": 145 }, { "epoch": 0.0292, "grad_norm": 12.32528018951416, "learning_rate": 4.953061224489796e-05, "loss": 0.6523, "step": 146 }, { "epoch": 0.0294, "grad_norm": 115.36129760742188, "learning_rate": 4.9520408163265305e-05, "loss": 6.924, "step": 147 }, { "epoch": 0.0296, "grad_norm": 16.953712463378906, "learning_rate": 4.9510204081632655e-05, "loss": 3.345, "step": 148 }, { "epoch": 0.0298, "grad_norm": 50.63832473754883, "learning_rate": 4.9500000000000004e-05, "loss": 4.9704, "step": 149 }, { "epoch": 0.03, "grad_norm": 32.5130615234375, "learning_rate": 4.9489795918367346e-05, "loss": 6.6223, "step": 150 }, { "epoch": 0.0302, "grad_norm": 4.348920822143555, "learning_rate": 4.9479591836734695e-05, "loss": 0.5413, "step": 151 }, { "epoch": 0.0304, "grad_norm": 3.998990774154663, "learning_rate": 4.9469387755102045e-05, "loss": 0.4476, "step": 152 }, { "epoch": 0.0306, "grad_norm": 23.424474716186523, "learning_rate": 4.945918367346939e-05, "loss": 3.7908, "step": 153 }, { "epoch": 0.0308, "grad_norm": 22.823986053466797, "learning_rate": 4.9448979591836736e-05, "loss": 3.9092, "step": 154 }, { "epoch": 0.031, "grad_norm": 7.079718589782715, "learning_rate": 4.9438775510204085e-05, "loss": 2.9751, "step": 155 }, { "epoch": 0.0312, "grad_norm": 52.237789154052734, "learning_rate": 4.942857142857143e-05, "loss": 5.3007, "step": 156 }, { "epoch": 0.0314, "grad_norm": 6.196706295013428, "learning_rate": 4.941836734693878e-05, "loss": 2.8203, "step": 157 }, { "epoch": 0.0316, "grad_norm": 4.051186561584473, "learning_rate": 4.9408163265306126e-05, "loss": 0.5189, "step": 158 }, { "epoch": 0.0318, "grad_norm": 3.7578818798065186, "learning_rate": 4.939795918367347e-05, "loss": 0.4583, "step": 159 }, { "epoch": 0.032, "grad_norm": 22.257274627685547, "learning_rate": 4.938775510204082e-05, "loss": 3.8465, "step": 160 }, { "epoch": 0.0322, "grad_norm": 3.206167221069336, "learning_rate": 4.937755102040817e-05, "loss": 0.2594, "step": 161 }, { "epoch": 0.0324, "grad_norm": 9.934144973754883, "learning_rate": 4.936734693877551e-05, "loss": 1.4664, "step": 162 }, { "epoch": 0.0326, "grad_norm": 71.399169921875, "learning_rate": 4.935714285714286e-05, "loss": 6.7523, "step": 163 }, { "epoch": 0.0328, "grad_norm": 12.2349214553833, "learning_rate": 4.93469387755102e-05, "loss": 0.4401, "step": 164 }, { "epoch": 0.033, "grad_norm": 3.647381067276001, "learning_rate": 4.933673469387755e-05, "loss": 0.4522, "step": 165 }, { "epoch": 0.0332, "grad_norm": 2.9874470233917236, "learning_rate": 4.93265306122449e-05, "loss": 0.4223, "step": 166 }, { "epoch": 0.0334, "grad_norm": 15.757845878601074, "learning_rate": 4.931632653061224e-05, "loss": 3.0744, "step": 167 }, { "epoch": 0.0336, "grad_norm": 2.7883803844451904, "learning_rate": 4.930612244897959e-05, "loss": 0.1997, "step": 168 }, { "epoch": 0.0338, "grad_norm": 14.830641746520996, "learning_rate": 4.929591836734695e-05, "loss": 2.8235, "step": 169 }, { "epoch": 0.034, "grad_norm": 8.174346923828125, "learning_rate": 4.928571428571429e-05, "loss": 1.4324, "step": 170 }, { "epoch": 0.0342, "grad_norm": 9.390400886535645, "learning_rate": 4.927551020408164e-05, "loss": 1.1507, "step": 171 }, { "epoch": 0.0344, "grad_norm": 3.5765488147735596, "learning_rate": 4.926530612244899e-05, "loss": 0.2527, "step": 172 }, { "epoch": 0.0346, "grad_norm": 16.015581130981445, "learning_rate": 4.925510204081633e-05, "loss": 2.0474, "step": 173 }, { "epoch": 0.0348, "grad_norm": 13.759570121765137, "learning_rate": 4.924489795918368e-05, "loss": 7.0157, "step": 174 }, { "epoch": 0.035, "grad_norm": 45.530189514160156, "learning_rate": 4.923469387755102e-05, "loss": 4.6881, "step": 175 }, { "epoch": 0.0352, "grad_norm": 3.162175178527832, "learning_rate": 4.922448979591837e-05, "loss": 0.2335, "step": 176 }, { "epoch": 0.0354, "grad_norm": 2.960479497909546, "learning_rate": 4.921428571428572e-05, "loss": 0.4245, "step": 177 }, { "epoch": 0.0356, "grad_norm": 3.4160397052764893, "learning_rate": 4.920408163265306e-05, "loss": 0.4166, "step": 178 }, { "epoch": 0.0358, "grad_norm": 2.5386452674865723, "learning_rate": 4.919387755102041e-05, "loss": 0.3643, "step": 179 }, { "epoch": 0.036, "grad_norm": 2.4958510398864746, "learning_rate": 4.918367346938776e-05, "loss": 0.3616, "step": 180 }, { "epoch": 0.0362, "grad_norm": 7.223433017730713, "learning_rate": 4.91734693877551e-05, "loss": 1.4523, "step": 181 }, { "epoch": 0.0364, "grad_norm": 16.580181121826172, "learning_rate": 4.916326530612245e-05, "loss": 2.6113, "step": 182 }, { "epoch": 0.0366, "grad_norm": 3.077042818069458, "learning_rate": 4.91530612244898e-05, "loss": 0.4027, "step": 183 }, { "epoch": 0.0368, "grad_norm": 7.351260185241699, "learning_rate": 4.9142857142857144e-05, "loss": 2.8672, "step": 184 }, { "epoch": 0.037, "grad_norm": 5.3265886306762695, "learning_rate": 4.913265306122449e-05, "loss": 0.2311, "step": 185 }, { "epoch": 0.0372, "grad_norm": 25.74429702758789, "learning_rate": 4.912244897959184e-05, "loss": 1.7426, "step": 186 }, { "epoch": 0.0374, "grad_norm": 2.879579782485962, "learning_rate": 4.9112244897959185e-05, "loss": 0.3941, "step": 187 }, { "epoch": 0.0376, "grad_norm": 13.189726829528809, "learning_rate": 4.9102040816326534e-05, "loss": 0.5893, "step": 188 }, { "epoch": 0.0378, "grad_norm": 3.6334128379821777, "learning_rate": 4.909183673469388e-05, "loss": 0.2001, "step": 189 }, { "epoch": 0.038, "grad_norm": 14.311576843261719, "learning_rate": 4.9081632653061225e-05, "loss": 1.6499, "step": 190 }, { "epoch": 0.0382, "grad_norm": 24.885698318481445, "learning_rate": 4.9071428571428574e-05, "loss": 3.9402, "step": 191 }, { "epoch": 0.0384, "grad_norm": 2.6309475898742676, "learning_rate": 4.9061224489795924e-05, "loss": 0.3384, "step": 192 }, { "epoch": 0.0386, "grad_norm": 10.061433792114258, "learning_rate": 4.9051020408163266e-05, "loss": 1.5038, "step": 193 }, { "epoch": 0.0388, "grad_norm": 20.588462829589844, "learning_rate": 4.9040816326530615e-05, "loss": 2.2994, "step": 194 }, { "epoch": 0.039, "grad_norm": 8.214776992797852, "learning_rate": 4.903061224489796e-05, "loss": 1.6767, "step": 195 }, { "epoch": 0.0392, "grad_norm": 2.4332640171051025, "learning_rate": 4.902040816326531e-05, "loss": 0.3212, "step": 196 }, { "epoch": 0.0394, "grad_norm": 15.954568862915039, "learning_rate": 4.9010204081632656e-05, "loss": 2.0944, "step": 197 }, { "epoch": 0.0396, "grad_norm": 3.0381927490234375, "learning_rate": 4.9e-05, "loss": 0.4318, "step": 198 }, { "epoch": 0.0398, "grad_norm": 2.6343955993652344, "learning_rate": 4.898979591836735e-05, "loss": 0.1892, "step": 199 }, { "epoch": 0.04, "grad_norm": 6.825660705566406, "learning_rate": 4.89795918367347e-05, "loss": 1.2455, "step": 200 }, { "epoch": 0.0402, "grad_norm": 15.805678367614746, "learning_rate": 4.896938775510204e-05, "loss": 3.0766, "step": 201 }, { "epoch": 0.0404, "grad_norm": 22.658878326416016, "learning_rate": 4.895918367346939e-05, "loss": 2.6953, "step": 202 }, { "epoch": 0.0406, "grad_norm": 10.367082595825195, "learning_rate": 4.894897959183674e-05, "loss": 1.3568, "step": 203 }, { "epoch": 0.0408, "grad_norm": 14.985063552856445, "learning_rate": 4.893877551020408e-05, "loss": 3.11, "step": 204 }, { "epoch": 0.041, "grad_norm": 4.432097434997559, "learning_rate": 4.892857142857143e-05, "loss": 0.6922, "step": 205 }, { "epoch": 0.0412, "grad_norm": 74.39356994628906, "learning_rate": 4.891836734693878e-05, "loss": 4.0891, "step": 206 }, { "epoch": 0.0414, "grad_norm": 5.9803996086120605, "learning_rate": 4.890816326530612e-05, "loss": 1.1988, "step": 207 }, { "epoch": 0.0416, "grad_norm": 5.758410930633545, "learning_rate": 4.889795918367347e-05, "loss": 2.229, "step": 208 }, { "epoch": 0.0418, "grad_norm": 11.344877243041992, "learning_rate": 4.888775510204082e-05, "loss": 1.8269, "step": 209 }, { "epoch": 0.042, "grad_norm": 3.3845725059509277, "learning_rate": 4.887755102040816e-05, "loss": 0.4241, "step": 210 }, { "epoch": 0.0422, "grad_norm": 3.4041435718536377, "learning_rate": 4.886734693877551e-05, "loss": 0.4123, "step": 211 }, { "epoch": 0.0424, "grad_norm": 3.4436275959014893, "learning_rate": 4.885714285714286e-05, "loss": 0.4752, "step": 212 }, { "epoch": 0.0426, "grad_norm": 11.167293548583984, "learning_rate": 4.88469387755102e-05, "loss": 2.8301, "step": 213 }, { "epoch": 0.0428, "grad_norm": 25.37110137939453, "learning_rate": 4.883673469387756e-05, "loss": 3.3613, "step": 214 }, { "epoch": 0.043, "grad_norm": 2.6816182136535645, "learning_rate": 4.88265306122449e-05, "loss": 0.1673, "step": 215 }, { "epoch": 0.0432, "grad_norm": 5.956937789916992, "learning_rate": 4.881632653061225e-05, "loss": 0.2408, "step": 216 }, { "epoch": 0.0434, "grad_norm": 8.719182014465332, "learning_rate": 4.88061224489796e-05, "loss": 1.2296, "step": 217 }, { "epoch": 0.0436, "grad_norm": 7.700140953063965, "learning_rate": 4.879591836734694e-05, "loss": 2.7421, "step": 218 }, { "epoch": 0.0438, "grad_norm": 1.9446169137954712, "learning_rate": 4.878571428571429e-05, "loss": 0.1474, "step": 219 }, { "epoch": 0.044, "grad_norm": 87.47311401367188, "learning_rate": 4.877551020408164e-05, "loss": 4.962, "step": 220 }, { "epoch": 0.0442, "grad_norm": 2.5825252532958984, "learning_rate": 4.876530612244898e-05, "loss": 0.178, "step": 221 }, { "epoch": 0.0444, "grad_norm": 54.101078033447266, "learning_rate": 4.875510204081633e-05, "loss": 6.6264, "step": 222 }, { "epoch": 0.0446, "grad_norm": 6.605436325073242, "learning_rate": 4.874489795918368e-05, "loss": 1.5507, "step": 223 }, { "epoch": 0.0448, "grad_norm": 14.785659790039062, "learning_rate": 4.873469387755102e-05, "loss": 3.2945, "step": 224 }, { "epoch": 0.045, "grad_norm": 9.988936424255371, "learning_rate": 4.872448979591837e-05, "loss": 2.7765, "step": 225 }, { "epoch": 0.0452, "grad_norm": 40.442596435546875, "learning_rate": 4.8714285714285714e-05, "loss": 4.506, "step": 226 }, { "epoch": 0.0454, "grad_norm": 2.5142595767974854, "learning_rate": 4.8704081632653064e-05, "loss": 0.1866, "step": 227 }, { "epoch": 0.0456, "grad_norm": 4.363778591156006, "learning_rate": 4.869387755102041e-05, "loss": 2.0775, "step": 228 }, { "epoch": 0.0458, "grad_norm": 2.496397018432617, "learning_rate": 4.8683673469387755e-05, "loss": 0.1748, "step": 229 }, { "epoch": 0.046, "grad_norm": 3.2190587520599365, "learning_rate": 4.8673469387755104e-05, "loss": 0.4083, "step": 230 }, { "epoch": 0.0462, "grad_norm": 13.004916191101074, "learning_rate": 4.8663265306122454e-05, "loss": 0.826, "step": 231 }, { "epoch": 0.0464, "grad_norm": 3.257899045944214, "learning_rate": 4.8653061224489796e-05, "loss": 0.4073, "step": 232 }, { "epoch": 0.0466, "grad_norm": 22.85171890258789, "learning_rate": 4.8642857142857145e-05, "loss": 2.5843, "step": 233 }, { "epoch": 0.0468, "grad_norm": 23.22861671447754, "learning_rate": 4.8632653061224494e-05, "loss": 3.0827, "step": 234 }, { "epoch": 0.047, "grad_norm": 2.492215633392334, "learning_rate": 4.862244897959184e-05, "loss": 0.315, "step": 235 }, { "epoch": 0.0472, "grad_norm": 2.859203338623047, "learning_rate": 4.8612244897959186e-05, "loss": 0.4144, "step": 236 }, { "epoch": 0.0474, "grad_norm": 10.443507194519043, "learning_rate": 4.8602040816326535e-05, "loss": 1.4981, "step": 237 }, { "epoch": 0.0476, "grad_norm": 5.941455364227295, "learning_rate": 4.859183673469388e-05, "loss": 1.4664, "step": 238 }, { "epoch": 0.0478, "grad_norm": 2.6655406951904297, "learning_rate": 4.858163265306123e-05, "loss": 0.327, "step": 239 }, { "epoch": 0.048, "grad_norm": 5.939995288848877, "learning_rate": 4.8571428571428576e-05, "loss": 1.3496, "step": 240 }, { "epoch": 0.0482, "grad_norm": 36.7309684753418, "learning_rate": 4.856122448979592e-05, "loss": 4.0816, "step": 241 }, { "epoch": 0.0484, "grad_norm": 5.255819797515869, "learning_rate": 4.855102040816327e-05, "loss": 0.2191, "step": 242 }, { "epoch": 0.0486, "grad_norm": 24.00787925720215, "learning_rate": 4.854081632653062e-05, "loss": 3.8993, "step": 243 }, { "epoch": 0.0488, "grad_norm": 6.310253143310547, "learning_rate": 4.853061224489796e-05, "loss": 2.7461, "step": 244 }, { "epoch": 0.049, "grad_norm": 2.4759926795959473, "learning_rate": 4.852040816326531e-05, "loss": 0.312, "step": 245 }, { "epoch": 0.0492, "grad_norm": 10.137598037719727, "learning_rate": 4.851020408163266e-05, "loss": 0.6682, "step": 246 }, { "epoch": 0.0494, "grad_norm": 7.681159019470215, "learning_rate": 4.85e-05, "loss": 1.4522, "step": 247 }, { "epoch": 0.0496, "grad_norm": 7.122126579284668, "learning_rate": 4.848979591836735e-05, "loss": 1.34, "step": 248 }, { "epoch": 0.0498, "grad_norm": 3.2464118003845215, "learning_rate": 4.847959183673469e-05, "loss": 0.4424, "step": 249 }, { "epoch": 0.05, "grad_norm": 2.2065978050231934, "learning_rate": 4.846938775510204e-05, "loss": 0.3443, "step": 250 }, { "epoch": 0.0502, "grad_norm": 12.5082426071167, "learning_rate": 4.845918367346939e-05, "loss": 1.12, "step": 251 }, { "epoch": 0.0504, "grad_norm": 18.6636962890625, "learning_rate": 4.844897959183673e-05, "loss": 4.2972, "step": 252 }, { "epoch": 0.0506, "grad_norm": 4.454346656799316, "learning_rate": 4.843877551020408e-05, "loss": 2.1657, "step": 253 }, { "epoch": 0.0508, "grad_norm": 9.119332313537598, "learning_rate": 4.842857142857143e-05, "loss": 1.2732, "step": 254 }, { "epoch": 0.051, "grad_norm": 2.056427240371704, "learning_rate": 4.841836734693877e-05, "loss": 0.3077, "step": 255 }, { "epoch": 0.0512, "grad_norm": 11.680416107177734, "learning_rate": 4.840816326530612e-05, "loss": 2.7134, "step": 256 }, { "epoch": 0.0514, "grad_norm": 3.6624484062194824, "learning_rate": 4.839795918367347e-05, "loss": 0.1659, "step": 257 }, { "epoch": 0.0516, "grad_norm": 31.827489852905273, "learning_rate": 4.8387755102040814e-05, "loss": 2.0273, "step": 258 }, { "epoch": 0.0518, "grad_norm": 115.13412475585938, "learning_rate": 4.837755102040817e-05, "loss": 5.7663, "step": 259 }, { "epoch": 0.052, "grad_norm": 6.4298810958862305, "learning_rate": 4.836734693877551e-05, "loss": 1.1484, "step": 260 }, { "epoch": 0.0522, "grad_norm": 28.639739990234375, "learning_rate": 4.835714285714286e-05, "loss": 2.9524, "step": 261 }, { "epoch": 0.0524, "grad_norm": 22.621858596801758, "learning_rate": 4.834693877551021e-05, "loss": 6.4986, "step": 262 }, { "epoch": 0.0526, "grad_norm": 12.89719009399414, "learning_rate": 4.833673469387755e-05, "loss": 6.5437, "step": 263 }, { "epoch": 0.0528, "grad_norm": 14.225550651550293, "learning_rate": 4.83265306122449e-05, "loss": 1.9275, "step": 264 }, { "epoch": 0.053, "grad_norm": 25.970439910888672, "learning_rate": 4.831632653061225e-05, "loss": 6.3776, "step": 265 }, { "epoch": 0.0532, "grad_norm": 4.544527053833008, "learning_rate": 4.8306122448979594e-05, "loss": 2.1623, "step": 266 }, { "epoch": 0.0534, "grad_norm": 5.257410049438477, "learning_rate": 4.829591836734694e-05, "loss": 0.2056, "step": 267 }, { "epoch": 0.0536, "grad_norm": 6.435934066772461, "learning_rate": 4.828571428571429e-05, "loss": 1.2436, "step": 268 }, { "epoch": 0.0538, "grad_norm": 2.09723162651062, "learning_rate": 4.8275510204081634e-05, "loss": 0.281, "step": 269 }, { "epoch": 0.054, "grad_norm": 2.2552835941314697, "learning_rate": 4.8265306122448984e-05, "loss": 0.3421, "step": 270 }, { "epoch": 0.0542, "grad_norm": 23.66434669494629, "learning_rate": 4.825510204081633e-05, "loss": 7.0027, "step": 271 }, { "epoch": 0.0544, "grad_norm": 9.996148109436035, "learning_rate": 4.8244897959183675e-05, "loss": 1.566, "step": 272 }, { "epoch": 0.0546, "grad_norm": 15.409833908081055, "learning_rate": 4.8234693877551024e-05, "loss": 1.5482, "step": 273 }, { "epoch": 0.0548, "grad_norm": 11.011585235595703, "learning_rate": 4.8224489795918373e-05, "loss": 1.4062, "step": 274 }, { "epoch": 0.055, "grad_norm": 4.346341609954834, "learning_rate": 4.8214285714285716e-05, "loss": 2.0908, "step": 275 }, { "epoch": 0.0552, "grad_norm": 30.071130752563477, "learning_rate": 4.8204081632653065e-05, "loss": 6.2823, "step": 276 }, { "epoch": 0.0554, "grad_norm": 4.043562889099121, "learning_rate": 4.8193877551020414e-05, "loss": 0.1602, "step": 277 }, { "epoch": 0.0556, "grad_norm": 14.959501266479492, "learning_rate": 4.818367346938776e-05, "loss": 1.2077, "step": 278 }, { "epoch": 0.0558, "grad_norm": 2.124439239501953, "learning_rate": 4.8173469387755106e-05, "loss": 0.2668, "step": 279 }, { "epoch": 0.056, "grad_norm": 33.473724365234375, "learning_rate": 4.816326530612245e-05, "loss": 6.1955, "step": 280 }, { "epoch": 0.0562, "grad_norm": 2.401884078979492, "learning_rate": 4.81530612244898e-05, "loss": 0.1679, "step": 281 }, { "epoch": 0.0564, "grad_norm": 14.710826873779297, "learning_rate": 4.8142857142857147e-05, "loss": 1.228, "step": 282 }, { "epoch": 0.0566, "grad_norm": 2.006013870239258, "learning_rate": 4.813265306122449e-05, "loss": 0.2674, "step": 283 }, { "epoch": 0.0568, "grad_norm": 2.3197853565216064, "learning_rate": 4.812244897959184e-05, "loss": 0.3204, "step": 284 }, { "epoch": 0.057, "grad_norm": 2.53511118888855, "learning_rate": 4.811224489795919e-05, "loss": 0.1583, "step": 285 }, { "epoch": 0.0572, "grad_norm": 2.7353696823120117, "learning_rate": 4.810204081632653e-05, "loss": 0.4002, "step": 286 }, { "epoch": 0.0574, "grad_norm": 2.9320266246795654, "learning_rate": 4.809183673469388e-05, "loss": 0.3796, "step": 287 }, { "epoch": 0.0576, "grad_norm": 6.147301197052002, "learning_rate": 4.808163265306123e-05, "loss": 2.6707, "step": 288 }, { "epoch": 0.0578, "grad_norm": 2.694920063018799, "learning_rate": 4.807142857142857e-05, "loss": 0.384, "step": 289 }, { "epoch": 0.058, "grad_norm": 12.652451515197754, "learning_rate": 4.806122448979592e-05, "loss": 2.7884, "step": 290 }, { "epoch": 0.0582, "grad_norm": 5.94713020324707, "learning_rate": 4.805102040816327e-05, "loss": 1.1616, "step": 291 }, { "epoch": 0.0584, "grad_norm": 2.474952459335327, "learning_rate": 4.804081632653061e-05, "loss": 0.3208, "step": 292 }, { "epoch": 0.0586, "grad_norm": 6.267833232879639, "learning_rate": 4.803061224489796e-05, "loss": 0.5163, "step": 293 }, { "epoch": 0.0588, "grad_norm": 4.102102279663086, "learning_rate": 4.802040816326531e-05, "loss": 0.1298, "step": 294 }, { "epoch": 0.059, "grad_norm": 4.8158135414123535, "learning_rate": 4.801020408163265e-05, "loss": 2.5453, "step": 295 }, { "epoch": 0.0592, "grad_norm": 4.911422252655029, "learning_rate": 4.8e-05, "loss": 0.4622, "step": 296 }, { "epoch": 0.0594, "grad_norm": 10.098528861999512, "learning_rate": 4.798979591836735e-05, "loss": 2.6648, "step": 297 }, { "epoch": 0.0596, "grad_norm": 6.3386640548706055, "learning_rate": 4.797959183673469e-05, "loss": 0.4538, "step": 298 }, { "epoch": 0.0598, "grad_norm": 6.769515514373779, "learning_rate": 4.796938775510204e-05, "loss": 1.3707, "step": 299 }, { "epoch": 0.06, "grad_norm": 3.17781400680542, "learning_rate": 4.795918367346939e-05, "loss": 0.1102, "step": 300 }, { "epoch": 0.0602, "grad_norm": 3.4535491466522217, "learning_rate": 4.7948979591836734e-05, "loss": 0.1158, "step": 301 }, { "epoch": 0.0604, "grad_norm": 25.51836585998535, "learning_rate": 4.793877551020408e-05, "loss": 6.5621, "step": 302 }, { "epoch": 0.0606, "grad_norm": 3.729757308959961, "learning_rate": 4.7928571428571425e-05, "loss": 0.1819, "step": 303 }, { "epoch": 0.0608, "grad_norm": 2.9387378692626953, "learning_rate": 4.7918367346938774e-05, "loss": 0.2026, "step": 304 }, { "epoch": 0.061, "grad_norm": 2.278538703918457, "learning_rate": 4.790816326530613e-05, "loss": 0.1594, "step": 305 }, { "epoch": 0.0612, "grad_norm": 6.259002685546875, "learning_rate": 4.789795918367347e-05, "loss": 1.0925, "step": 306 }, { "epoch": 0.0614, "grad_norm": 13.990778923034668, "learning_rate": 4.788775510204082e-05, "loss": 6.5765, "step": 307 }, { "epoch": 0.0616, "grad_norm": 2.2181642055511475, "learning_rate": 4.787755102040817e-05, "loss": 0.3021, "step": 308 }, { "epoch": 0.0618, "grad_norm": 5.557339191436768, "learning_rate": 4.7867346938775513e-05, "loss": 2.5857, "step": 309 }, { "epoch": 0.062, "grad_norm": 3.1741716861724854, "learning_rate": 4.785714285714286e-05, "loss": 0.3761, "step": 310 }, { "epoch": 0.0622, "grad_norm": 7.62800407409668, "learning_rate": 4.7846938775510205e-05, "loss": 1.4748, "step": 311 }, { "epoch": 0.0624, "grad_norm": 52.55562210083008, "learning_rate": 4.7836734693877554e-05, "loss": 5.4467, "step": 312 }, { "epoch": 0.0626, "grad_norm": 2.609224557876587, "learning_rate": 4.7826530612244903e-05, "loss": 0.2835, "step": 313 }, { "epoch": 0.0628, "grad_norm": 13.866734504699707, "learning_rate": 4.7816326530612246e-05, "loss": 1.8371, "step": 314 }, { "epoch": 0.063, "grad_norm": 1.6872336864471436, "learning_rate": 4.7806122448979595e-05, "loss": 0.2887, "step": 315 }, { "epoch": 0.0632, "grad_norm": 9.54610824584961, "learning_rate": 4.7795918367346944e-05, "loss": 2.5558, "step": 316 }, { "epoch": 0.0634, "grad_norm": 5.848460674285889, "learning_rate": 4.7785714285714287e-05, "loss": 1.2751, "step": 317 }, { "epoch": 0.0636, "grad_norm": 11.767894744873047, "learning_rate": 4.7775510204081636e-05, "loss": 1.2123, "step": 318 }, { "epoch": 0.0638, "grad_norm": 2.920628786087036, "learning_rate": 4.7765306122448985e-05, "loss": 0.3535, "step": 319 }, { "epoch": 0.064, "grad_norm": 26.431793212890625, "learning_rate": 4.775510204081633e-05, "loss": 4.3793, "step": 320 }, { "epoch": 0.0642, "grad_norm": 3.030142307281494, "learning_rate": 4.7744897959183677e-05, "loss": 0.1938, "step": 321 }, { "epoch": 0.0644, "grad_norm": 7.9036126136779785, "learning_rate": 4.7734693877551026e-05, "loss": 2.4549, "step": 322 }, { "epoch": 0.0646, "grad_norm": 2.295531988143921, "learning_rate": 4.772448979591837e-05, "loss": 0.1455, "step": 323 }, { "epoch": 0.0648, "grad_norm": 2.797222137451172, "learning_rate": 4.771428571428572e-05, "loss": 0.3548, "step": 324 }, { "epoch": 0.065, "grad_norm": 4.948246002197266, "learning_rate": 4.7704081632653066e-05, "loss": 2.1088, "step": 325 }, { "epoch": 0.0652, "grad_norm": 3.0309126377105713, "learning_rate": 4.769387755102041e-05, "loss": 0.3759, "step": 326 }, { "epoch": 0.0654, "grad_norm": 2.7186086177825928, "learning_rate": 4.768367346938776e-05, "loss": 0.3967, "step": 327 }, { "epoch": 0.0656, "grad_norm": 2.1337838172912598, "learning_rate": 4.767346938775511e-05, "loss": 0.1388, "step": 328 }, { "epoch": 0.0658, "grad_norm": 7.373128890991211, "learning_rate": 4.766326530612245e-05, "loss": 1.0983, "step": 329 }, { "epoch": 0.066, "grad_norm": 4.437475204467773, "learning_rate": 4.76530612244898e-05, "loss": 2.0878, "step": 330 }, { "epoch": 0.0662, "grad_norm": 21.72403907775879, "learning_rate": 4.764285714285715e-05, "loss": 1.1818, "step": 331 }, { "epoch": 0.0664, "grad_norm": 5.763108730316162, "learning_rate": 4.763265306122449e-05, "loss": 1.274, "step": 332 }, { "epoch": 0.0666, "grad_norm": 13.025044441223145, "learning_rate": 4.762244897959184e-05, "loss": 1.723, "step": 333 }, { "epoch": 0.0668, "grad_norm": 3.968433380126953, "learning_rate": 4.761224489795918e-05, "loss": 2.1153, "step": 334 }, { "epoch": 0.067, "grad_norm": 7.378103733062744, "learning_rate": 4.760204081632653e-05, "loss": 0.4202, "step": 335 }, { "epoch": 0.0672, "grad_norm": 2.830109119415283, "learning_rate": 4.759183673469388e-05, "loss": 0.3293, "step": 336 }, { "epoch": 0.0674, "grad_norm": 3.178539514541626, "learning_rate": 4.758163265306122e-05, "loss": 0.3486, "step": 337 }, { "epoch": 0.0676, "grad_norm": 7.1868181228637695, "learning_rate": 4.757142857142857e-05, "loss": 1.1632, "step": 338 }, { "epoch": 0.0678, "grad_norm": 2.4396426677703857, "learning_rate": 4.756122448979592e-05, "loss": 0.1441, "step": 339 }, { "epoch": 0.068, "grad_norm": 15.88539981842041, "learning_rate": 4.7551020408163263e-05, "loss": 2.5294, "step": 340 }, { "epoch": 0.0682, "grad_norm": 16.781665802001953, "learning_rate": 4.754081632653061e-05, "loss": 2.8364, "step": 341 }, { "epoch": 0.0684, "grad_norm": 10.077573776245117, "learning_rate": 4.753061224489796e-05, "loss": 0.8778, "step": 342 }, { "epoch": 0.0686, "grad_norm": 5.763261318206787, "learning_rate": 4.7520408163265304e-05, "loss": 2.6801, "step": 343 }, { "epoch": 0.0688, "grad_norm": 2.591111660003662, "learning_rate": 4.7510204081632653e-05, "loss": 0.341, "step": 344 }, { "epoch": 0.069, "grad_norm": 5.386274337768555, "learning_rate": 4.75e-05, "loss": 2.5503, "step": 345 }, { "epoch": 0.0692, "grad_norm": 24.671098709106445, "learning_rate": 4.7489795918367345e-05, "loss": 6.5451, "step": 346 }, { "epoch": 0.0694, "grad_norm": 1.994725227355957, "learning_rate": 4.7479591836734694e-05, "loss": 0.2798, "step": 347 }, { "epoch": 0.0696, "grad_norm": 2.557914972305298, "learning_rate": 4.746938775510204e-05, "loss": 0.3585, "step": 348 }, { "epoch": 0.0698, "grad_norm": 34.20411682128906, "learning_rate": 4.7459183673469386e-05, "loss": 5.0002, "step": 349 }, { "epoch": 0.07, "grad_norm": 17.687604904174805, "learning_rate": 4.744897959183674e-05, "loss": 6.5777, "step": 350 }, { "epoch": 0.0702, "grad_norm": 7.226384162902832, "learning_rate": 4.7438775510204084e-05, "loss": 1.1322, "step": 351 }, { "epoch": 0.0704, "grad_norm": 5.94661283493042, "learning_rate": 4.742857142857143e-05, "loss": 1.0266, "step": 352 }, { "epoch": 0.0706, "grad_norm": 2.209641218185425, "learning_rate": 4.741836734693878e-05, "loss": 0.1366, "step": 353 }, { "epoch": 0.0708, "grad_norm": 3.1439688205718994, "learning_rate": 4.7408163265306125e-05, "loss": 0.1135, "step": 354 }, { "epoch": 0.071, "grad_norm": 2.3960354328155518, "learning_rate": 4.7397959183673474e-05, "loss": 0.2808, "step": 355 }, { "epoch": 0.0712, "grad_norm": 2.367637872695923, "learning_rate": 4.738775510204082e-05, "loss": 0.1627, "step": 356 }, { "epoch": 0.0714, "grad_norm": 2.480921745300293, "learning_rate": 4.7377551020408166e-05, "loss": 0.3506, "step": 357 }, { "epoch": 0.0716, "grad_norm": 8.608354568481445, "learning_rate": 4.7367346938775515e-05, "loss": 0.4904, "step": 358 }, { "epoch": 0.0718, "grad_norm": 2.8946304321289062, "learning_rate": 4.7357142857142864e-05, "loss": 0.1005, "step": 359 }, { "epoch": 0.072, "grad_norm": 2.059964656829834, "learning_rate": 4.7346938775510206e-05, "loss": 0.1328, "step": 360 }, { "epoch": 0.0722, "grad_norm": 2.6717636585235596, "learning_rate": 4.7336734693877556e-05, "loss": 0.3791, "step": 361 }, { "epoch": 0.0724, "grad_norm": 2.0760722160339355, "learning_rate": 4.7326530612244905e-05, "loss": 0.1078, "step": 362 }, { "epoch": 0.0726, "grad_norm": 53.7850341796875, "learning_rate": 4.731632653061225e-05, "loss": 2.9198, "step": 363 }, { "epoch": 0.0728, "grad_norm": 2.0827722549438477, "learning_rate": 4.7306122448979596e-05, "loss": 0.2764, "step": 364 }, { "epoch": 0.073, "grad_norm": 9.983550071716309, "learning_rate": 4.729591836734694e-05, "loss": 2.7176, "step": 365 }, { "epoch": 0.0732, "grad_norm": 13.311184883117676, "learning_rate": 4.728571428571429e-05, "loss": 2.3279, "step": 366 }, { "epoch": 0.0734, "grad_norm": 26.359949111938477, "learning_rate": 4.727551020408164e-05, "loss": 2.3328, "step": 367 }, { "epoch": 0.0736, "grad_norm": 5.651286602020264, "learning_rate": 4.726530612244898e-05, "loss": 0.3716, "step": 368 }, { "epoch": 0.0738, "grad_norm": 1.93024480342865, "learning_rate": 4.725510204081633e-05, "loss": 0.2244, "step": 369 }, { "epoch": 0.074, "grad_norm": 6.500854969024658, "learning_rate": 4.724489795918368e-05, "loss": 0.3689, "step": 370 }, { "epoch": 0.0742, "grad_norm": 6.808442115783691, "learning_rate": 4.723469387755102e-05, "loss": 0.3618, "step": 371 }, { "epoch": 0.0744, "grad_norm": 2.819575071334839, "learning_rate": 4.722448979591837e-05, "loss": 0.1492, "step": 372 }, { "epoch": 0.0746, "grad_norm": 2.6610302925109863, "learning_rate": 4.721428571428572e-05, "loss": 0.2873, "step": 373 }, { "epoch": 0.0748, "grad_norm": 27.070655822753906, "learning_rate": 4.720408163265306e-05, "loss": 3.0837, "step": 374 }, { "epoch": 0.075, "grad_norm": 2.9257850646972656, "learning_rate": 4.719387755102041e-05, "loss": 0.2015, "step": 375 }, { "epoch": 0.0752, "grad_norm": 1.9699163436889648, "learning_rate": 4.718367346938776e-05, "loss": 0.2495, "step": 376 }, { "epoch": 0.0754, "grad_norm": 31.752384185791016, "learning_rate": 4.71734693877551e-05, "loss": 4.9408, "step": 377 }, { "epoch": 0.0756, "grad_norm": 1.8258370161056519, "learning_rate": 4.716326530612245e-05, "loss": 0.254, "step": 378 }, { "epoch": 0.0758, "grad_norm": 18.534879684448242, "learning_rate": 4.71530612244898e-05, "loss": 3.016, "step": 379 }, { "epoch": 0.076, "grad_norm": 1.9348527193069458, "learning_rate": 4.714285714285714e-05, "loss": 0.1144, "step": 380 }, { "epoch": 0.0762, "grad_norm": 6.82661247253418, "learning_rate": 4.713265306122449e-05, "loss": 2.5575, "step": 381 }, { "epoch": 0.0764, "grad_norm": 6.1823883056640625, "learning_rate": 4.712244897959184e-05, "loss": 2.626, "step": 382 }, { "epoch": 0.0766, "grad_norm": 11.660492897033691, "learning_rate": 4.711224489795918e-05, "loss": 0.9745, "step": 383 }, { "epoch": 0.0768, "grad_norm": 2.0270681381225586, "learning_rate": 4.710204081632653e-05, "loss": 0.2937, "step": 384 }, { "epoch": 0.077, "grad_norm": 56.424224853515625, "learning_rate": 4.7091836734693875e-05, "loss": 6.583, "step": 385 }, { "epoch": 0.0772, "grad_norm": 8.324662208557129, "learning_rate": 4.7081632653061224e-05, "loss": 1.3856, "step": 386 }, { "epoch": 0.0774, "grad_norm": 8.9666748046875, "learning_rate": 4.707142857142857e-05, "loss": 2.572, "step": 387 }, { "epoch": 0.0776, "grad_norm": 2.187340259552002, "learning_rate": 4.7061224489795916e-05, "loss": 0.2333, "step": 388 }, { "epoch": 0.0778, "grad_norm": 13.370061874389648, "learning_rate": 4.7051020408163265e-05, "loss": 1.9124, "step": 389 }, { "epoch": 0.078, "grad_norm": 2.4363479614257812, "learning_rate": 4.7040816326530614e-05, "loss": 0.1063, "step": 390 }, { "epoch": 0.0782, "grad_norm": 9.3132963180542, "learning_rate": 4.7030612244897956e-05, "loss": 2.6552, "step": 391 }, { "epoch": 0.0784, "grad_norm": 55.959781646728516, "learning_rate": 4.7020408163265306e-05, "loss": 6.6248, "step": 392 }, { "epoch": 0.0786, "grad_norm": 18.18315315246582, "learning_rate": 4.7010204081632655e-05, "loss": 2.1259, "step": 393 }, { "epoch": 0.0788, "grad_norm": 25.41035270690918, "learning_rate": 4.7e-05, "loss": 6.2054, "step": 394 }, { "epoch": 0.079, "grad_norm": 2.9915242195129395, "learning_rate": 4.698979591836735e-05, "loss": 0.3402, "step": 395 }, { "epoch": 0.0792, "grad_norm": 5.823123455047607, "learning_rate": 4.6979591836734696e-05, "loss": 1.2686, "step": 396 }, { "epoch": 0.0794, "grad_norm": 2.028001308441162, "learning_rate": 4.6969387755102045e-05, "loss": 0.2905, "step": 397 }, { "epoch": 0.0796, "grad_norm": 42.19456100463867, "learning_rate": 4.6959183673469394e-05, "loss": 6.2213, "step": 398 }, { "epoch": 0.0798, "grad_norm": 2.543256998062134, "learning_rate": 4.6948979591836736e-05, "loss": 0.3686, "step": 399 }, { "epoch": 0.08, "grad_norm": 2.2356905937194824, "learning_rate": 4.6938775510204086e-05, "loss": 0.1324, "step": 400 }, { "epoch": 0.0802, "grad_norm": 2.372619867324829, "learning_rate": 4.6928571428571435e-05, "loss": 0.3644, "step": 401 }, { "epoch": 0.0804, "grad_norm": 1.9613386392593384, "learning_rate": 4.691836734693878e-05, "loss": 0.241, "step": 402 }, { "epoch": 0.0806, "grad_norm": 8.247029304504395, "learning_rate": 4.6908163265306126e-05, "loss": 6.9809, "step": 403 }, { "epoch": 0.0808, "grad_norm": 8.159348487854004, "learning_rate": 4.6897959183673475e-05, "loss": 1.5399, "step": 404 }, { "epoch": 0.081, "grad_norm": 10.0965576171875, "learning_rate": 4.688775510204082e-05, "loss": 2.3527, "step": 405 }, { "epoch": 0.0812, "grad_norm": 5.929891109466553, "learning_rate": 4.687755102040817e-05, "loss": 0.9952, "step": 406 }, { "epoch": 0.0814, "grad_norm": 2.4128918647766113, "learning_rate": 4.6867346938775516e-05, "loss": 0.3466, "step": 407 }, { "epoch": 0.0816, "grad_norm": 1.925405502319336, "learning_rate": 4.685714285714286e-05, "loss": 0.2287, "step": 408 }, { "epoch": 0.0818, "grad_norm": 7.814586162567139, "learning_rate": 4.684693877551021e-05, "loss": 1.0657, "step": 409 }, { "epoch": 0.082, "grad_norm": 2.403630495071411, "learning_rate": 4.683673469387756e-05, "loss": 0.3243, "step": 410 }, { "epoch": 0.0822, "grad_norm": 9.4711275100708, "learning_rate": 4.68265306122449e-05, "loss": 1.6438, "step": 411 }, { "epoch": 0.0824, "grad_norm": 48.67058181762695, "learning_rate": 4.681632653061225e-05, "loss": 5.4859, "step": 412 }, { "epoch": 0.0826, "grad_norm": 2.594776153564453, "learning_rate": 4.68061224489796e-05, "loss": 0.1547, "step": 413 }, { "epoch": 0.0828, "grad_norm": 49.50006103515625, "learning_rate": 4.679591836734694e-05, "loss": 6.3499, "step": 414 }, { "epoch": 0.083, "grad_norm": 43.77144241333008, "learning_rate": 4.678571428571429e-05, "loss": 7.4436, "step": 415 }, { "epoch": 0.0832, "grad_norm": 1.756616234779358, "learning_rate": 4.677551020408163e-05, "loss": 0.2482, "step": 416 }, { "epoch": 0.0834, "grad_norm": 10.484140396118164, "learning_rate": 4.676530612244898e-05, "loss": 1.5785, "step": 417 }, { "epoch": 0.0836, "grad_norm": 9.244158744812012, "learning_rate": 4.675510204081633e-05, "loss": 0.8072, "step": 418 }, { "epoch": 0.0838, "grad_norm": 8.533461570739746, "learning_rate": 4.674489795918367e-05, "loss": 1.5188, "step": 419 }, { "epoch": 0.084, "grad_norm": 2.3646106719970703, "learning_rate": 4.673469387755102e-05, "loss": 0.1298, "step": 420 }, { "epoch": 0.0842, "grad_norm": 5.251696586608887, "learning_rate": 4.672448979591837e-05, "loss": 0.3743, "step": 421 }, { "epoch": 0.0844, "grad_norm": 2.401188611984253, "learning_rate": 4.671428571428571e-05, "loss": 0.3042, "step": 422 }, { "epoch": 0.0846, "grad_norm": 4.875900745391846, "learning_rate": 4.670408163265306e-05, "loss": 2.55, "step": 423 }, { "epoch": 0.0848, "grad_norm": 6.620153903961182, "learning_rate": 4.669387755102041e-05, "loss": 1.0621, "step": 424 }, { "epoch": 0.085, "grad_norm": 2.050922155380249, "learning_rate": 4.6683673469387754e-05, "loss": 0.1177, "step": 425 }, { "epoch": 0.0852, "grad_norm": 4.2134222984313965, "learning_rate": 4.66734693877551e-05, "loss": 2.5296, "step": 426 }, { "epoch": 0.0854, "grad_norm": 2.0711166858673096, "learning_rate": 4.666326530612245e-05, "loss": 0.1249, "step": 427 }, { "epoch": 0.0856, "grad_norm": 6.235434055328369, "learning_rate": 4.6653061224489795e-05, "loss": 1.0692, "step": 428 }, { "epoch": 0.0858, "grad_norm": 6.365238189697266, "learning_rate": 4.6642857142857144e-05, "loss": 1.1687, "step": 429 }, { "epoch": 0.086, "grad_norm": 9.341511726379395, "learning_rate": 4.663265306122449e-05, "loss": 0.3599, "step": 430 }, { "epoch": 0.0862, "grad_norm": 11.925809860229492, "learning_rate": 4.6622448979591836e-05, "loss": 2.4904, "step": 431 }, { "epoch": 0.0864, "grad_norm": 10.085716247558594, "learning_rate": 4.6612244897959185e-05, "loss": 2.4821, "step": 432 }, { "epoch": 0.0866, "grad_norm": 39.38417434692383, "learning_rate": 4.6602040816326534e-05, "loss": 5.7205, "step": 433 }, { "epoch": 0.0868, "grad_norm": 5.331692695617676, "learning_rate": 4.6591836734693876e-05, "loss": 2.591, "step": 434 }, { "epoch": 0.087, "grad_norm": 7.388650417327881, "learning_rate": 4.6581632653061226e-05, "loss": 1.1156, "step": 435 }, { "epoch": 0.0872, "grad_norm": 21.16617774963379, "learning_rate": 4.6571428571428575e-05, "loss": 6.2198, "step": 436 }, { "epoch": 0.0874, "grad_norm": 4.8475213050842285, "learning_rate": 4.656122448979592e-05, "loss": 0.2768, "step": 437 }, { "epoch": 0.0876, "grad_norm": 4.950862407684326, "learning_rate": 4.6551020408163266e-05, "loss": 1.0098, "step": 438 }, { "epoch": 0.0878, "grad_norm": 4.628479957580566, "learning_rate": 4.654081632653061e-05, "loss": 2.423, "step": 439 }, { "epoch": 0.088, "grad_norm": 4.182015419006348, "learning_rate": 4.653061224489796e-05, "loss": 2.3441, "step": 440 }, { "epoch": 0.0882, "grad_norm": 2.807716131210327, "learning_rate": 4.6520408163265314e-05, "loss": 0.5297, "step": 441 }, { "epoch": 0.0884, "grad_norm": 5.62799072265625, "learning_rate": 4.6510204081632656e-05, "loss": 1.1379, "step": 442 }, { "epoch": 0.0886, "grad_norm": 2.8704583644866943, "learning_rate": 4.6500000000000005e-05, "loss": 0.3582, "step": 443 }, { "epoch": 0.0888, "grad_norm": 1.8881853818893433, "learning_rate": 4.6489795918367355e-05, "loss": 0.2726, "step": 444 }, { "epoch": 0.089, "grad_norm": 1.7147490978240967, "learning_rate": 4.64795918367347e-05, "loss": 0.0869, "step": 445 }, { "epoch": 0.0892, "grad_norm": 5.187413692474365, "learning_rate": 4.6469387755102046e-05, "loss": 0.9787, "step": 446 }, { "epoch": 0.0894, "grad_norm": 2.709937334060669, "learning_rate": 4.645918367346939e-05, "loss": 0.341, "step": 447 }, { "epoch": 0.0896, "grad_norm": 1.8514126539230347, "learning_rate": 4.644897959183674e-05, "loss": 0.2127, "step": 448 }, { "epoch": 0.0898, "grad_norm": 5.530944347381592, "learning_rate": 4.643877551020409e-05, "loss": 1.0078, "step": 449 }, { "epoch": 0.09, "grad_norm": 11.764533996582031, "learning_rate": 4.642857142857143e-05, "loss": 2.3106, "step": 450 }, { "epoch": 0.0902, "grad_norm": 16.673175811767578, "learning_rate": 4.641836734693878e-05, "loss": 3.5456, "step": 451 }, { "epoch": 0.0904, "grad_norm": 2.0734784603118896, "learning_rate": 4.640816326530613e-05, "loss": 0.2862, "step": 452 }, { "epoch": 0.0906, "grad_norm": 21.004438400268555, "learning_rate": 4.639795918367347e-05, "loss": 5.2117, "step": 453 }, { "epoch": 0.0908, "grad_norm": 8.92627239227295, "learning_rate": 4.638775510204082e-05, "loss": 1.619, "step": 454 }, { "epoch": 0.091, "grad_norm": 2.5494918823242188, "learning_rate": 4.637755102040817e-05, "loss": 0.3257, "step": 455 }, { "epoch": 0.0912, "grad_norm": 19.945465087890625, "learning_rate": 4.636734693877551e-05, "loss": 2.0669, "step": 456 }, { "epoch": 0.0914, "grad_norm": 1.8492319583892822, "learning_rate": 4.635714285714286e-05, "loss": 0.3081, "step": 457 }, { "epoch": 0.0916, "grad_norm": 1.979044795036316, "learning_rate": 4.634693877551021e-05, "loss": 0.2047, "step": 458 }, { "epoch": 0.0918, "grad_norm": 2.6501336097717285, "learning_rate": 4.633673469387755e-05, "loss": 0.1064, "step": 459 }, { "epoch": 0.092, "grad_norm": 6.860795021057129, "learning_rate": 4.63265306122449e-05, "loss": 0.9122, "step": 460 }, { "epoch": 0.0922, "grad_norm": 12.75924301147461, "learning_rate": 4.631632653061225e-05, "loss": 2.1535, "step": 461 }, { "epoch": 0.0924, "grad_norm": 2.1344172954559326, "learning_rate": 4.630612244897959e-05, "loss": 0.1228, "step": 462 }, { "epoch": 0.0926, "grad_norm": 2.5231246948242188, "learning_rate": 4.629591836734694e-05, "loss": 0.1514, "step": 463 }, { "epoch": 0.0928, "grad_norm": 2.375951051712036, "learning_rate": 4.628571428571429e-05, "loss": 0.3321, "step": 464 }, { "epoch": 0.093, "grad_norm": 3.2040934562683105, "learning_rate": 4.627551020408163e-05, "loss": 0.1209, "step": 465 }, { "epoch": 0.0932, "grad_norm": 6.255486011505127, "learning_rate": 4.626530612244898e-05, "loss": 2.4753, "step": 466 }, { "epoch": 0.0934, "grad_norm": 12.446625709533691, "learning_rate": 4.625510204081633e-05, "loss": 1.9803, "step": 467 }, { "epoch": 0.0936, "grad_norm": 10.153313636779785, "learning_rate": 4.6244897959183674e-05, "loss": 1.6769, "step": 468 }, { "epoch": 0.0938, "grad_norm": 17.7565975189209, "learning_rate": 4.623469387755102e-05, "loss": 1.7344, "step": 469 }, { "epoch": 0.094, "grad_norm": 13.362611770629883, "learning_rate": 4.6224489795918366e-05, "loss": 1.4584, "step": 470 }, { "epoch": 0.0942, "grad_norm": 1.8224835395812988, "learning_rate": 4.6214285714285715e-05, "loss": 0.1938, "step": 471 }, { "epoch": 0.0944, "grad_norm": 2.838259696960449, "learning_rate": 4.6204081632653064e-05, "loss": 0.1336, "step": 472 }, { "epoch": 0.0946, "grad_norm": 2.060246706008911, "learning_rate": 4.6193877551020406e-05, "loss": 0.2559, "step": 473 }, { "epoch": 0.0948, "grad_norm": 2.429246664047241, "learning_rate": 4.6183673469387755e-05, "loss": 0.14, "step": 474 }, { "epoch": 0.095, "grad_norm": 10.238293647766113, "learning_rate": 4.6173469387755105e-05, "loss": 2.5295, "step": 475 }, { "epoch": 0.0952, "grad_norm": 5.1625285148620605, "learning_rate": 4.616326530612245e-05, "loss": 0.3549, "step": 476 }, { "epoch": 0.0954, "grad_norm": 2.8266282081604004, "learning_rate": 4.6153061224489796e-05, "loss": 0.0985, "step": 477 }, { "epoch": 0.0956, "grad_norm": 14.649356842041016, "learning_rate": 4.6142857142857145e-05, "loss": 4.0913, "step": 478 }, { "epoch": 0.0958, "grad_norm": 2.2226009368896484, "learning_rate": 4.613265306122449e-05, "loss": 0.2541, "step": 479 }, { "epoch": 0.096, "grad_norm": 4.22063684463501, "learning_rate": 4.612244897959184e-05, "loss": 0.4131, "step": 480 }, { "epoch": 0.0962, "grad_norm": 5.823008060455322, "learning_rate": 4.6112244897959186e-05, "loss": 1.0252, "step": 481 }, { "epoch": 0.0964, "grad_norm": 10.060904502868652, "learning_rate": 4.610204081632653e-05, "loss": 1.5716, "step": 482 }, { "epoch": 0.0966, "grad_norm": 2.1902894973754883, "learning_rate": 4.609183673469388e-05, "loss": 0.1208, "step": 483 }, { "epoch": 0.0968, "grad_norm": 15.719635963439941, "learning_rate": 4.608163265306123e-05, "loss": 7.0908, "step": 484 }, { "epoch": 0.097, "grad_norm": 8.193558692932129, "learning_rate": 4.607142857142857e-05, "loss": 1.26, "step": 485 }, { "epoch": 0.0972, "grad_norm": 6.4959306716918945, "learning_rate": 4.6061224489795925e-05, "loss": 0.9872, "step": 486 }, { "epoch": 0.0974, "grad_norm": 6.141908168792725, "learning_rate": 4.605102040816327e-05, "loss": 1.0136, "step": 487 }, { "epoch": 0.0976, "grad_norm": 32.68397903442383, "learning_rate": 4.604081632653062e-05, "loss": 4.1119, "step": 488 }, { "epoch": 0.0978, "grad_norm": 5.456779956817627, "learning_rate": 4.6030612244897966e-05, "loss": 1.3421, "step": 489 }, { "epoch": 0.098, "grad_norm": 1.829673409461975, "learning_rate": 4.602040816326531e-05, "loss": 0.277, "step": 490 }, { "epoch": 0.0982, "grad_norm": 4.504250526428223, "learning_rate": 4.601020408163266e-05, "loss": 2.4248, "step": 491 }, { "epoch": 0.0984, "grad_norm": 4.908005714416504, "learning_rate": 4.600000000000001e-05, "loss": 2.3024, "step": 492 }, { "epoch": 0.0986, "grad_norm": 4.181675434112549, "learning_rate": 4.598979591836735e-05, "loss": 2.4278, "step": 493 }, { "epoch": 0.0988, "grad_norm": 2.944359064102173, "learning_rate": 4.59795918367347e-05, "loss": 0.4215, "step": 494 }, { "epoch": 0.099, "grad_norm": 34.008079528808594, "learning_rate": 4.596938775510205e-05, "loss": 4.4627, "step": 495 }, { "epoch": 0.0992, "grad_norm": 2.167393922805786, "learning_rate": 4.595918367346939e-05, "loss": 0.0745, "step": 496 }, { "epoch": 0.0994, "grad_norm": 2.0184450149536133, "learning_rate": 4.594897959183674e-05, "loss": 0.3072, "step": 497 }, { "epoch": 0.0996, "grad_norm": 7.414123058319092, "learning_rate": 4.593877551020409e-05, "loss": 0.985, "step": 498 }, { "epoch": 0.0998, "grad_norm": 2.57053804397583, "learning_rate": 4.592857142857143e-05, "loss": 0.3009, "step": 499 }, { "epoch": 0.1, "grad_norm": 1.8467835187911987, "learning_rate": 4.591836734693878e-05, "loss": 0.2618, "step": 500 }, { "epoch": 0.1002, "grad_norm": 6.0907111167907715, "learning_rate": 4.590816326530612e-05, "loss": 0.9765, "step": 501 }, { "epoch": 0.1004, "grad_norm": 2.033963441848755, "learning_rate": 4.589795918367347e-05, "loss": 0.1242, "step": 502 }, { "epoch": 0.1006, "grad_norm": 1.8364888429641724, "learning_rate": 4.588775510204082e-05, "loss": 0.2615, "step": 503 }, { "epoch": 0.1008, "grad_norm": 5.7680864334106445, "learning_rate": 4.587755102040816e-05, "loss": 0.9865, "step": 504 }, { "epoch": 0.101, "grad_norm": 7.0686116218566895, "learning_rate": 4.586734693877551e-05, "loss": 0.7085, "step": 505 }, { "epoch": 0.1012, "grad_norm": 2.136955499649048, "learning_rate": 4.585714285714286e-05, "loss": 0.0715, "step": 506 }, { "epoch": 0.1014, "grad_norm": 1.9805129766464233, "learning_rate": 4.5846938775510204e-05, "loss": 0.1, "step": 507 }, { "epoch": 0.1016, "grad_norm": 8.698697090148926, "learning_rate": 4.583673469387755e-05, "loss": 1.4424, "step": 508 }, { "epoch": 0.1018, "grad_norm": 7.782326698303223, "learning_rate": 4.58265306122449e-05, "loss": 1.4647, "step": 509 }, { "epoch": 0.102, "grad_norm": 41.67585754394531, "learning_rate": 4.5816326530612245e-05, "loss": 6.3981, "step": 510 }, { "epoch": 0.1022, "grad_norm": 1.8698790073394775, "learning_rate": 4.5806122448979594e-05, "loss": 0.1129, "step": 511 }, { "epoch": 0.1024, "grad_norm": 4.760601043701172, "learning_rate": 4.579591836734694e-05, "loss": 0.3221, "step": 512 }, { "epoch": 0.1026, "grad_norm": 33.31980895996094, "learning_rate": 4.5785714285714285e-05, "loss": 1.7785, "step": 513 }, { "epoch": 0.1028, "grad_norm": 17.92771339416504, "learning_rate": 4.5775510204081635e-05, "loss": 5.4287, "step": 514 }, { "epoch": 0.103, "grad_norm": 2.300755023956299, "learning_rate": 4.5765306122448984e-05, "loss": 0.123, "step": 515 }, { "epoch": 0.1032, "grad_norm": 4.174624443054199, "learning_rate": 4.5755102040816326e-05, "loss": 0.3054, "step": 516 }, { "epoch": 0.1034, "grad_norm": 2.479468822479248, "learning_rate": 4.5744897959183675e-05, "loss": 0.3015, "step": 517 }, { "epoch": 0.1036, "grad_norm": 2.323305606842041, "learning_rate": 4.5734693877551025e-05, "loss": 0.2464, "step": 518 }, { "epoch": 0.1038, "grad_norm": 2.432375192642212, "learning_rate": 4.572448979591837e-05, "loss": 0.2387, "step": 519 }, { "epoch": 0.104, "grad_norm": 2.466305732727051, "learning_rate": 4.5714285714285716e-05, "loss": 0.1291, "step": 520 }, { "epoch": 0.1042, "grad_norm": 2.67108416557312, "learning_rate": 4.570408163265306e-05, "loss": 0.3753, "step": 521 }, { "epoch": 0.1044, "grad_norm": 2.484483242034912, "learning_rate": 4.569387755102041e-05, "loss": 0.4598, "step": 522 }, { "epoch": 0.1046, "grad_norm": 16.740785598754883, "learning_rate": 4.568367346938776e-05, "loss": 2.6504, "step": 523 }, { "epoch": 0.1048, "grad_norm": 2.645599842071533, "learning_rate": 4.56734693877551e-05, "loss": 0.3468, "step": 524 }, { "epoch": 0.105, "grad_norm": 9.68822956085205, "learning_rate": 4.566326530612245e-05, "loss": 2.6367, "step": 525 }, { "epoch": 0.1052, "grad_norm": 2.621138095855713, "learning_rate": 4.56530612244898e-05, "loss": 0.1783, "step": 526 }, { "epoch": 0.1054, "grad_norm": 2.0223898887634277, "learning_rate": 4.564285714285714e-05, "loss": 0.1192, "step": 527 }, { "epoch": 0.1056, "grad_norm": 22.834697723388672, "learning_rate": 4.563265306122449e-05, "loss": 5.2872, "step": 528 }, { "epoch": 0.1058, "grad_norm": 2.804661989212036, "learning_rate": 4.562244897959184e-05, "loss": 0.3319, "step": 529 }, { "epoch": 0.106, "grad_norm": 2.925588846206665, "learning_rate": 4.561224489795918e-05, "loss": 0.3356, "step": 530 }, { "epoch": 0.1062, "grad_norm": 10.556026458740234, "learning_rate": 4.560204081632653e-05, "loss": 2.0116, "step": 531 }, { "epoch": 0.1064, "grad_norm": 70.50692749023438, "learning_rate": 4.559183673469388e-05, "loss": 7.9615, "step": 532 }, { "epoch": 0.1066, "grad_norm": 2.5424160957336426, "learning_rate": 4.558163265306123e-05, "loss": 0.3561, "step": 533 }, { "epoch": 0.1068, "grad_norm": 8.875993728637695, "learning_rate": 4.557142857142858e-05, "loss": 1.0426, "step": 534 }, { "epoch": 0.107, "grad_norm": 2.2023794651031494, "learning_rate": 4.556122448979592e-05, "loss": 0.1573, "step": 535 }, { "epoch": 0.1072, "grad_norm": 4.783979415893555, "learning_rate": 4.555102040816327e-05, "loss": 0.2794, "step": 536 }, { "epoch": 0.1074, "grad_norm": 2.1055617332458496, "learning_rate": 4.554081632653062e-05, "loss": 0.1174, "step": 537 }, { "epoch": 0.1076, "grad_norm": 2.1910979747772217, "learning_rate": 4.553061224489796e-05, "loss": 0.249, "step": 538 }, { "epoch": 0.1078, "grad_norm": 13.680757522583008, "learning_rate": 4.552040816326531e-05, "loss": 3.8584, "step": 539 }, { "epoch": 0.108, "grad_norm": 2.199683666229248, "learning_rate": 4.551020408163266e-05, "loss": 0.2536, "step": 540 }, { "epoch": 0.1082, "grad_norm": 1.8234713077545166, "learning_rate": 4.55e-05, "loss": 0.1149, "step": 541 }, { "epoch": 0.1084, "grad_norm": 2.0882201194763184, "learning_rate": 4.548979591836735e-05, "loss": 0.2728, "step": 542 }, { "epoch": 0.1086, "grad_norm": 72.3479995727539, "learning_rate": 4.54795918367347e-05, "loss": 6.6544, "step": 543 }, { "epoch": 0.1088, "grad_norm": 2.8216774463653564, "learning_rate": 4.546938775510204e-05, "loss": 0.1134, "step": 544 }, { "epoch": 0.109, "grad_norm": 26.27626609802246, "learning_rate": 4.545918367346939e-05, "loss": 2.9438, "step": 545 }, { "epoch": 0.1092, "grad_norm": 2.255739688873291, "learning_rate": 4.544897959183674e-05, "loss": 0.2993, "step": 546 }, { "epoch": 0.1094, "grad_norm": 2.0638649463653564, "learning_rate": 4.543877551020408e-05, "loss": 0.2525, "step": 547 }, { "epoch": 0.1096, "grad_norm": 1.93026602268219, "learning_rate": 4.542857142857143e-05, "loss": 0.2609, "step": 548 }, { "epoch": 0.1098, "grad_norm": 2.0406620502471924, "learning_rate": 4.541836734693878e-05, "loss": 0.2668, "step": 549 }, { "epoch": 0.11, "grad_norm": 2.5131540298461914, "learning_rate": 4.5408163265306124e-05, "loss": 0.3013, "step": 550 }, { "epoch": 0.1102, "grad_norm": 5.5197858810424805, "learning_rate": 4.539795918367347e-05, "loss": 1.2905, "step": 551 }, { "epoch": 0.1104, "grad_norm": 1.8763465881347656, "learning_rate": 4.538775510204082e-05, "loss": 0.2596, "step": 552 }, { "epoch": 0.1106, "grad_norm": 11.363729476928711, "learning_rate": 4.5377551020408164e-05, "loss": 1.8338, "step": 553 }, { "epoch": 0.1108, "grad_norm": 20.515701293945312, "learning_rate": 4.5367346938775514e-05, "loss": 6.1151, "step": 554 }, { "epoch": 0.111, "grad_norm": 2.955251455307007, "learning_rate": 4.5357142857142856e-05, "loss": 0.0994, "step": 555 }, { "epoch": 0.1112, "grad_norm": 2.0138936042785645, "learning_rate": 4.5346938775510205e-05, "loss": 0.2722, "step": 556 }, { "epoch": 0.1114, "grad_norm": 5.070435523986816, "learning_rate": 4.5336734693877554e-05, "loss": 2.5984, "step": 557 }, { "epoch": 0.1116, "grad_norm": 5.005916118621826, "learning_rate": 4.53265306122449e-05, "loss": 1.2611, "step": 558 }, { "epoch": 0.1118, "grad_norm": 1.90200674533844, "learning_rate": 4.5316326530612246e-05, "loss": 0.1851, "step": 559 }, { "epoch": 0.112, "grad_norm": 2.2471718788146973, "learning_rate": 4.5306122448979595e-05, "loss": 0.3086, "step": 560 }, { "epoch": 0.1122, "grad_norm": 2.431797504425049, "learning_rate": 4.529591836734694e-05, "loss": 0.0901, "step": 561 }, { "epoch": 0.1124, "grad_norm": 17.897838592529297, "learning_rate": 4.528571428571429e-05, "loss": 4.6469, "step": 562 }, { "epoch": 0.1126, "grad_norm": 2.6925182342529297, "learning_rate": 4.5275510204081636e-05, "loss": 0.2705, "step": 563 }, { "epoch": 0.1128, "grad_norm": 7.428674697875977, "learning_rate": 4.526530612244898e-05, "loss": 1.2483, "step": 564 }, { "epoch": 0.113, "grad_norm": 33.47559356689453, "learning_rate": 4.525510204081633e-05, "loss": 4.067, "step": 565 }, { "epoch": 0.1132, "grad_norm": 2.309847354888916, "learning_rate": 4.524489795918368e-05, "loss": 0.3087, "step": 566 }, { "epoch": 0.1134, "grad_norm": 2.301074504852295, "learning_rate": 4.523469387755102e-05, "loss": 0.262, "step": 567 }, { "epoch": 0.1136, "grad_norm": 2.009781837463379, "learning_rate": 4.522448979591837e-05, "loss": 0.0959, "step": 568 }, { "epoch": 0.1138, "grad_norm": 9.266844749450684, "learning_rate": 4.521428571428572e-05, "loss": 2.4829, "step": 569 }, { "epoch": 0.114, "grad_norm": 2.2796411514282227, "learning_rate": 4.520408163265306e-05, "loss": 0.1412, "step": 570 }, { "epoch": 0.1142, "grad_norm": 2.4037399291992188, "learning_rate": 4.519387755102041e-05, "loss": 0.1177, "step": 571 }, { "epoch": 0.1144, "grad_norm": 1.8711189031600952, "learning_rate": 4.518367346938776e-05, "loss": 0.1035, "step": 572 }, { "epoch": 0.1146, "grad_norm": 6.985625743865967, "learning_rate": 4.51734693877551e-05, "loss": 2.3783, "step": 573 }, { "epoch": 0.1148, "grad_norm": 15.73539924621582, "learning_rate": 4.516326530612245e-05, "loss": 4.3117, "step": 574 }, { "epoch": 0.115, "grad_norm": 8.829004287719727, "learning_rate": 4.515306122448979e-05, "loss": 1.3898, "step": 575 }, { "epoch": 0.1152, "grad_norm": 8.513986587524414, "learning_rate": 4.514285714285714e-05, "loss": 1.6092, "step": 576 }, { "epoch": 0.1154, "grad_norm": 4.182492256164551, "learning_rate": 4.51326530612245e-05, "loss": 0.2672, "step": 577 }, { "epoch": 0.1156, "grad_norm": 1.9439010620117188, "learning_rate": 4.512244897959184e-05, "loss": 0.1175, "step": 578 }, { "epoch": 0.1158, "grad_norm": 2.382523775100708, "learning_rate": 4.511224489795919e-05, "loss": 0.309, "step": 579 }, { "epoch": 0.116, "grad_norm": 15.121125221252441, "learning_rate": 4.510204081632654e-05, "loss": 4.528, "step": 580 }, { "epoch": 0.1162, "grad_norm": 1.8058350086212158, "learning_rate": 4.509183673469388e-05, "loss": 0.2308, "step": 581 }, { "epoch": 0.1164, "grad_norm": 5.423115253448486, "learning_rate": 4.508163265306123e-05, "loss": 1.2338, "step": 582 }, { "epoch": 0.1166, "grad_norm": 2.032620429992676, "learning_rate": 4.507142857142858e-05, "loss": 0.1202, "step": 583 }, { "epoch": 0.1168, "grad_norm": 2.276237964630127, "learning_rate": 4.506122448979592e-05, "loss": 0.3036, "step": 584 }, { "epoch": 0.117, "grad_norm": 5.8232526779174805, "learning_rate": 4.505102040816327e-05, "loss": 2.4301, "step": 585 }, { "epoch": 0.1172, "grad_norm": 4.476560592651367, "learning_rate": 4.504081632653061e-05, "loss": 1.0937, "step": 586 }, { "epoch": 0.1174, "grad_norm": 5.9133992195129395, "learning_rate": 4.503061224489796e-05, "loss": 0.9071, "step": 587 }, { "epoch": 0.1176, "grad_norm": 1.6806764602661133, "learning_rate": 4.502040816326531e-05, "loss": 0.1801, "step": 588 }, { "epoch": 0.1178, "grad_norm": 32.35234069824219, "learning_rate": 4.5010204081632654e-05, "loss": 4.8664, "step": 589 }, { "epoch": 0.118, "grad_norm": 2.2517266273498535, "learning_rate": 4.5e-05, "loss": 0.2928, "step": 590 }, { "epoch": 0.1182, "grad_norm": 2.939851760864258, "learning_rate": 4.498979591836735e-05, "loss": 0.4259, "step": 591 }, { "epoch": 0.1184, "grad_norm": 5.593673229217529, "learning_rate": 4.4979591836734694e-05, "loss": 0.3221, "step": 592 }, { "epoch": 0.1186, "grad_norm": 2.246220350265503, "learning_rate": 4.4969387755102044e-05, "loss": 0.0967, "step": 593 }, { "epoch": 0.1188, "grad_norm": 1.910014033317566, "learning_rate": 4.495918367346939e-05, "loss": 0.3013, "step": 594 }, { "epoch": 0.119, "grad_norm": 1.9520102739334106, "learning_rate": 4.4948979591836735e-05, "loss": 0.2674, "step": 595 }, { "epoch": 0.1192, "grad_norm": 2.66032338142395, "learning_rate": 4.4938775510204084e-05, "loss": 0.3248, "step": 596 }, { "epoch": 0.1194, "grad_norm": 9.406121253967285, "learning_rate": 4.4928571428571434e-05, "loss": 1.4535, "step": 597 }, { "epoch": 0.1196, "grad_norm": 8.665762901306152, "learning_rate": 4.4918367346938776e-05, "loss": 0.7553, "step": 598 }, { "epoch": 0.1198, "grad_norm": 4.330859184265137, "learning_rate": 4.4908163265306125e-05, "loss": 2.3006, "step": 599 }, { "epoch": 0.12, "grad_norm": 1.626834750175476, "learning_rate": 4.4897959183673474e-05, "loss": 0.098, "step": 600 }, { "epoch": 0.1202, "grad_norm": 12.482990264892578, "learning_rate": 4.488775510204082e-05, "loss": 6.0159, "step": 601 }, { "epoch": 0.1204, "grad_norm": 1.843930959701538, "learning_rate": 4.4877551020408166e-05, "loss": 0.1119, "step": 602 }, { "epoch": 0.1206, "grad_norm": 6.070778846740723, "learning_rate": 4.4867346938775515e-05, "loss": 0.6427, "step": 603 }, { "epoch": 0.1208, "grad_norm": 6.043375015258789, "learning_rate": 4.485714285714286e-05, "loss": 1.1888, "step": 604 }, { "epoch": 0.121, "grad_norm": 38.47969436645508, "learning_rate": 4.484693877551021e-05, "loss": 4.228, "step": 605 }, { "epoch": 0.1212, "grad_norm": 5.892314434051514, "learning_rate": 4.483673469387755e-05, "loss": 0.33, "step": 606 }, { "epoch": 0.1214, "grad_norm": 15.078666687011719, "learning_rate": 4.48265306122449e-05, "loss": 2.3711, "step": 607 }, { "epoch": 0.1216, "grad_norm": 3.097940444946289, "learning_rate": 4.481632653061225e-05, "loss": 0.1112, "step": 608 }, { "epoch": 0.1218, "grad_norm": 12.760353088378906, "learning_rate": 4.480612244897959e-05, "loss": 4.0837, "step": 609 }, { "epoch": 0.122, "grad_norm": 11.902802467346191, "learning_rate": 4.479591836734694e-05, "loss": 1.5209, "step": 610 }, { "epoch": 0.1222, "grad_norm": 2.2661423683166504, "learning_rate": 4.478571428571429e-05, "loss": 0.076, "step": 611 }, { "epoch": 0.1224, "grad_norm": 8.932852745056152, "learning_rate": 4.477551020408163e-05, "loss": 1.6964, "step": 612 }, { "epoch": 0.1226, "grad_norm": 13.332756996154785, "learning_rate": 4.476530612244898e-05, "loss": 2.6943, "step": 613 }, { "epoch": 0.1228, "grad_norm": 1.782543659210205, "learning_rate": 4.475510204081633e-05, "loss": 0.2552, "step": 614 }, { "epoch": 0.123, "grad_norm": 9.798274040222168, "learning_rate": 4.474489795918367e-05, "loss": 1.7567, "step": 615 }, { "epoch": 0.1232, "grad_norm": 2.503276824951172, "learning_rate": 4.473469387755102e-05, "loss": 0.3502, "step": 616 }, { "epoch": 0.1234, "grad_norm": 1.6842453479766846, "learning_rate": 4.472448979591837e-05, "loss": 0.2415, "step": 617 }, { "epoch": 0.1236, "grad_norm": 4.441090106964111, "learning_rate": 4.471428571428571e-05, "loss": 0.4642, "step": 618 }, { "epoch": 0.1238, "grad_norm": 4.461360454559326, "learning_rate": 4.470408163265306e-05, "loss": 1.1069, "step": 619 }, { "epoch": 0.124, "grad_norm": 4.818694591522217, "learning_rate": 4.469387755102041e-05, "loss": 0.2668, "step": 620 }, { "epoch": 0.1242, "grad_norm": 14.786673545837402, "learning_rate": 4.468367346938775e-05, "loss": 1.2223, "step": 621 }, { "epoch": 0.1244, "grad_norm": 2.0519139766693115, "learning_rate": 4.467346938775511e-05, "loss": 0.1196, "step": 622 }, { "epoch": 0.1246, "grad_norm": 5.332647323608398, "learning_rate": 4.466326530612245e-05, "loss": 0.2951, "step": 623 }, { "epoch": 0.1248, "grad_norm": 2.252056360244751, "learning_rate": 4.46530612244898e-05, "loss": 0.325, "step": 624 }, { "epoch": 0.125, "grad_norm": 5.384517669677734, "learning_rate": 4.464285714285715e-05, "loss": 0.8607, "step": 625 }, { "epoch": 0.1252, "grad_norm": 10.462895393371582, "learning_rate": 4.463265306122449e-05, "loss": 2.4684, "step": 626 }, { "epoch": 0.1254, "grad_norm": 1.8529956340789795, "learning_rate": 4.462244897959184e-05, "loss": 0.1103, "step": 627 }, { "epoch": 0.1256, "grad_norm": 9.0777006149292, "learning_rate": 4.461224489795919e-05, "loss": 1.6302, "step": 628 }, { "epoch": 0.1258, "grad_norm": 7.788472652435303, "learning_rate": 4.460204081632653e-05, "loss": 1.4777, "step": 629 }, { "epoch": 0.126, "grad_norm": 11.898658752441406, "learning_rate": 4.459183673469388e-05, "loss": 6.0391, "step": 630 }, { "epoch": 0.1262, "grad_norm": 13.46484661102295, "learning_rate": 4.458163265306123e-05, "loss": 2.6615, "step": 631 }, { "epoch": 0.1264, "grad_norm": 4.574545860290527, "learning_rate": 4.4571428571428574e-05, "loss": 1.0586, "step": 632 }, { "epoch": 0.1266, "grad_norm": 1.9689745903015137, "learning_rate": 4.456122448979592e-05, "loss": 0.2847, "step": 633 }, { "epoch": 0.1268, "grad_norm": 13.859519958496094, "learning_rate": 4.455102040816327e-05, "loss": 1.6259, "step": 634 }, { "epoch": 0.127, "grad_norm": 1.6551806926727295, "learning_rate": 4.4540816326530614e-05, "loss": 0.1856, "step": 635 }, { "epoch": 0.1272, "grad_norm": 5.734321594238281, "learning_rate": 4.4530612244897963e-05, "loss": 0.9242, "step": 636 }, { "epoch": 0.1274, "grad_norm": 2.1179840564727783, "learning_rate": 4.4520408163265306e-05, "loss": 0.2987, "step": 637 }, { "epoch": 0.1276, "grad_norm": 1.6108742952346802, "learning_rate": 4.4510204081632655e-05, "loss": 0.165, "step": 638 }, { "epoch": 0.1278, "grad_norm": 5.832553386688232, "learning_rate": 4.4500000000000004e-05, "loss": 1.0757, "step": 639 }, { "epoch": 0.128, "grad_norm": 2.1176013946533203, "learning_rate": 4.448979591836735e-05, "loss": 0.1239, "step": 640 }, { "epoch": 0.1282, "grad_norm": 5.174328804016113, "learning_rate": 4.4479591836734696e-05, "loss": 0.3201, "step": 641 }, { "epoch": 0.1284, "grad_norm": 2.0192956924438477, "learning_rate": 4.4469387755102045e-05, "loss": 0.134, "step": 642 }, { "epoch": 0.1286, "grad_norm": 2.0334508419036865, "learning_rate": 4.445918367346939e-05, "loss": 0.2273, "step": 643 }, { "epoch": 0.1288, "grad_norm": 2.3532967567443848, "learning_rate": 4.4448979591836737e-05, "loss": 0.3139, "step": 644 }, { "epoch": 0.129, "grad_norm": 5.952183723449707, "learning_rate": 4.4438775510204086e-05, "loss": 1.0241, "step": 645 }, { "epoch": 0.1292, "grad_norm": 56.45768356323242, "learning_rate": 4.442857142857143e-05, "loss": 4.0514, "step": 646 }, { "epoch": 0.1294, "grad_norm": 15.219496726989746, "learning_rate": 4.441836734693878e-05, "loss": 6.0333, "step": 647 }, { "epoch": 0.1296, "grad_norm": 11.211031913757324, "learning_rate": 4.4408163265306127e-05, "loss": 1.6167, "step": 648 }, { "epoch": 0.1298, "grad_norm": 31.7654972076416, "learning_rate": 4.439795918367347e-05, "loss": 1.9553, "step": 649 }, { "epoch": 0.13, "grad_norm": 2.0561766624450684, "learning_rate": 4.438775510204082e-05, "loss": 0.2417, "step": 650 }, { "epoch": 0.1302, "grad_norm": 2.283371686935425, "learning_rate": 4.437755102040817e-05, "loss": 0.2687, "step": 651 }, { "epoch": 0.1304, "grad_norm": 1.6434128284454346, "learning_rate": 4.436734693877551e-05, "loss": 0.1885, "step": 652 }, { "epoch": 0.1306, "grad_norm": 7.3232340812683105, "learning_rate": 4.435714285714286e-05, "loss": 2.4361, "step": 653 }, { "epoch": 0.1308, "grad_norm": 7.41826868057251, "learning_rate": 4.434693877551021e-05, "loss": 2.3235, "step": 654 }, { "epoch": 0.131, "grad_norm": 6.322724342346191, "learning_rate": 4.433673469387755e-05, "loss": 1.433, "step": 655 }, { "epoch": 0.1312, "grad_norm": 2.0319886207580566, "learning_rate": 4.43265306122449e-05, "loss": 0.1222, "step": 656 }, { "epoch": 0.1314, "grad_norm": 5.4758524894714355, "learning_rate": 4.431632653061225e-05, "loss": 1.5181, "step": 657 }, { "epoch": 0.1316, "grad_norm": 5.770097732543945, "learning_rate": 4.430612244897959e-05, "loss": 1.5341, "step": 658 }, { "epoch": 0.1318, "grad_norm": 1.8182827234268188, "learning_rate": 4.429591836734694e-05, "loss": 0.1104, "step": 659 }, { "epoch": 0.132, "grad_norm": 8.128026962280273, "learning_rate": 4.428571428571428e-05, "loss": 2.4333, "step": 660 }, { "epoch": 0.1322, "grad_norm": 1.7597488164901733, "learning_rate": 4.427551020408163e-05, "loss": 0.1998, "step": 661 }, { "epoch": 0.1324, "grad_norm": 1.8226596117019653, "learning_rate": 4.426530612244898e-05, "loss": 0.2336, "step": 662 }, { "epoch": 0.1326, "grad_norm": 18.413177490234375, "learning_rate": 4.4255102040816324e-05, "loss": 4.6057, "step": 663 }, { "epoch": 0.1328, "grad_norm": 5.388815879821777, "learning_rate": 4.424489795918367e-05, "loss": 2.4815, "step": 664 }, { "epoch": 0.133, "grad_norm": 1.821333408355713, "learning_rate": 4.423469387755102e-05, "loss": 0.2417, "step": 665 }, { "epoch": 0.1332, "grad_norm": 8.400188446044922, "learning_rate": 4.4224489795918364e-05, "loss": 1.4933, "step": 666 }, { "epoch": 0.1334, "grad_norm": 4.5805983543396, "learning_rate": 4.4214285714285714e-05, "loss": 1.1053, "step": 667 }, { "epoch": 0.1336, "grad_norm": 1.690073013305664, "learning_rate": 4.420408163265306e-05, "loss": 0.1809, "step": 668 }, { "epoch": 0.1338, "grad_norm": 5.437297344207764, "learning_rate": 4.419387755102041e-05, "loss": 0.9489, "step": 669 }, { "epoch": 0.134, "grad_norm": 2.3655149936676025, "learning_rate": 4.418367346938776e-05, "loss": 0.1009, "step": 670 }, { "epoch": 0.1342, "grad_norm": 2.305649518966675, "learning_rate": 4.4173469387755103e-05, "loss": 0.2685, "step": 671 }, { "epoch": 0.1344, "grad_norm": 2.1379716396331787, "learning_rate": 4.416326530612245e-05, "loss": 0.2611, "step": 672 }, { "epoch": 0.1346, "grad_norm": 1.7722504138946533, "learning_rate": 4.41530612244898e-05, "loss": 0.1145, "step": 673 }, { "epoch": 0.1348, "grad_norm": 9.311881065368652, "learning_rate": 4.4142857142857144e-05, "loss": 1.4772, "step": 674 }, { "epoch": 0.135, "grad_norm": 2.3688912391662598, "learning_rate": 4.4132653061224493e-05, "loss": 0.3648, "step": 675 }, { "epoch": 0.1352, "grad_norm": 7.750619411468506, "learning_rate": 4.412244897959184e-05, "loss": 0.6366, "step": 676 }, { "epoch": 0.1354, "grad_norm": 1.9949312210083008, "learning_rate": 4.4112244897959185e-05, "loss": 0.2289, "step": 677 }, { "epoch": 0.1356, "grad_norm": 2.4722976684570312, "learning_rate": 4.4102040816326534e-05, "loss": 0.0601, "step": 678 }, { "epoch": 0.1358, "grad_norm": 5.0097761154174805, "learning_rate": 4.409183673469388e-05, "loss": 0.8654, "step": 679 }, { "epoch": 0.136, "grad_norm": 1.8627954721450806, "learning_rate": 4.4081632653061226e-05, "loss": 0.0965, "step": 680 }, { "epoch": 0.1362, "grad_norm": 7.933848857879639, "learning_rate": 4.4071428571428575e-05, "loss": 1.6723, "step": 681 }, { "epoch": 0.1364, "grad_norm": 10.013572692871094, "learning_rate": 4.4061224489795924e-05, "loss": 1.65, "step": 682 }, { "epoch": 0.1366, "grad_norm": 1.8995143175125122, "learning_rate": 4.4051020408163267e-05, "loss": 0.2396, "step": 683 }, { "epoch": 0.1368, "grad_norm": 1.689717411994934, "learning_rate": 4.4040816326530616e-05, "loss": 0.0887, "step": 684 }, { "epoch": 0.137, "grad_norm": 4.838736057281494, "learning_rate": 4.4030612244897965e-05, "loss": 1.0061, "step": 685 }, { "epoch": 0.1372, "grad_norm": 5.647104263305664, "learning_rate": 4.402040816326531e-05, "loss": 0.3167, "step": 686 }, { "epoch": 0.1374, "grad_norm": 2.3787808418273926, "learning_rate": 4.4010204081632656e-05, "loss": 0.2948, "step": 687 }, { "epoch": 0.1376, "grad_norm": 1.9067801237106323, "learning_rate": 4.4000000000000006e-05, "loss": 0.2273, "step": 688 }, { "epoch": 0.1378, "grad_norm": 2.5516650676727295, "learning_rate": 4.398979591836735e-05, "loss": 0.0622, "step": 689 }, { "epoch": 0.138, "grad_norm": 2.4554383754730225, "learning_rate": 4.39795918367347e-05, "loss": 0.3739, "step": 690 }, { "epoch": 0.1382, "grad_norm": 2.3099822998046875, "learning_rate": 4.396938775510204e-05, "loss": 0.2943, "step": 691 }, { "epoch": 0.1384, "grad_norm": 2.306360960006714, "learning_rate": 4.395918367346939e-05, "loss": 0.34, "step": 692 }, { "epoch": 0.1386, "grad_norm": 8.243119239807129, "learning_rate": 4.394897959183674e-05, "loss": 0.6953, "step": 693 }, { "epoch": 0.1388, "grad_norm": 1.6192740201950073, "learning_rate": 4.393877551020408e-05, "loss": 0.0911, "step": 694 }, { "epoch": 0.139, "grad_norm": 1.8218029737472534, "learning_rate": 4.392857142857143e-05, "loss": 0.2832, "step": 695 }, { "epoch": 0.1392, "grad_norm": 2.514047622680664, "learning_rate": 4.391836734693878e-05, "loss": 0.4452, "step": 696 }, { "epoch": 0.1394, "grad_norm": 12.909881591796875, "learning_rate": 4.390816326530612e-05, "loss": 0.7189, "step": 697 }, { "epoch": 0.1396, "grad_norm": 6.974311351776123, "learning_rate": 4.389795918367347e-05, "loss": 1.5038, "step": 698 }, { "epoch": 0.1398, "grad_norm": 2.2936313152313232, "learning_rate": 4.388775510204082e-05, "loss": 0.2825, "step": 699 }, { "epoch": 0.14, "grad_norm": 11.503886222839355, "learning_rate": 4.387755102040816e-05, "loss": 5.795, "step": 700 }, { "epoch": 0.1402, "grad_norm": 1.8265719413757324, "learning_rate": 4.386734693877551e-05, "loss": 0.1091, "step": 701 }, { "epoch": 0.1404, "grad_norm": 2.218329906463623, "learning_rate": 4.385714285714286e-05, "loss": 0.0533, "step": 702 }, { "epoch": 0.1406, "grad_norm": 2.1319103240966797, "learning_rate": 4.38469387755102e-05, "loss": 0.2347, "step": 703 }, { "epoch": 0.1408, "grad_norm": 2.2720017433166504, "learning_rate": 4.383673469387755e-05, "loss": 0.2567, "step": 704 }, { "epoch": 0.141, "grad_norm": 2.6647064685821533, "learning_rate": 4.38265306122449e-05, "loss": 0.0915, "step": 705 }, { "epoch": 0.1412, "grad_norm": 1.7370988130569458, "learning_rate": 4.3816326530612243e-05, "loss": 0.1033, "step": 706 }, { "epoch": 0.1414, "grad_norm": 7.157041072845459, "learning_rate": 4.380612244897959e-05, "loss": 0.2841, "step": 707 }, { "epoch": 0.1416, "grad_norm": 6.928675651550293, "learning_rate": 4.379591836734694e-05, "loss": 1.4424, "step": 708 }, { "epoch": 0.1418, "grad_norm": 2.09725022315979, "learning_rate": 4.3785714285714284e-05, "loss": 0.0728, "step": 709 }, { "epoch": 0.142, "grad_norm": 1.9853891134262085, "learning_rate": 4.377551020408163e-05, "loss": 0.0603, "step": 710 }, { "epoch": 0.1422, "grad_norm": 9.301671981811523, "learning_rate": 4.3765306122448976e-05, "loss": 2.4837, "step": 711 }, { "epoch": 0.1424, "grad_norm": 5.64785099029541, "learning_rate": 4.3755102040816325e-05, "loss": 0.8947, "step": 712 }, { "epoch": 0.1426, "grad_norm": 1.9208048582077026, "learning_rate": 4.374489795918368e-05, "loss": 0.1086, "step": 713 }, { "epoch": 0.1428, "grad_norm": 4.717841625213623, "learning_rate": 4.373469387755102e-05, "loss": 1.1693, "step": 714 }, { "epoch": 0.143, "grad_norm": 9.996125221252441, "learning_rate": 4.372448979591837e-05, "loss": 0.9532, "step": 715 }, { "epoch": 0.1432, "grad_norm": 2.3046953678131104, "learning_rate": 4.371428571428572e-05, "loss": 0.331, "step": 716 }, { "epoch": 0.1434, "grad_norm": 1.901801586151123, "learning_rate": 4.3704081632653064e-05, "loss": 0.272, "step": 717 }, { "epoch": 0.1436, "grad_norm": 24.127500534057617, "learning_rate": 4.369387755102041e-05, "loss": 4.3717, "step": 718 }, { "epoch": 0.1438, "grad_norm": 6.888923645019531, "learning_rate": 4.368367346938776e-05, "loss": 1.3725, "step": 719 }, { "epoch": 0.144, "grad_norm": 6.642779350280762, "learning_rate": 4.3673469387755105e-05, "loss": 1.2854, "step": 720 }, { "epoch": 0.1442, "grad_norm": 4.80898904800415, "learning_rate": 4.3663265306122454e-05, "loss": 0.2967, "step": 721 }, { "epoch": 0.1444, "grad_norm": 4.025364398956299, "learning_rate": 4.3653061224489796e-05, "loss": 0.9348, "step": 722 }, { "epoch": 0.1446, "grad_norm": 1.866579532623291, "learning_rate": 4.3642857142857146e-05, "loss": 0.1253, "step": 723 }, { "epoch": 0.1448, "grad_norm": 5.358083724975586, "learning_rate": 4.3632653061224495e-05, "loss": 2.4192, "step": 724 }, { "epoch": 0.145, "grad_norm": 12.656335830688477, "learning_rate": 4.362244897959184e-05, "loss": 2.5382, "step": 725 }, { "epoch": 0.1452, "grad_norm": 2.922243356704712, "learning_rate": 4.3612244897959186e-05, "loss": 0.8893, "step": 726 }, { "epoch": 0.1454, "grad_norm": 2.257392406463623, "learning_rate": 4.3602040816326536e-05, "loss": 0.2155, "step": 727 }, { "epoch": 0.1456, "grad_norm": 1.99644935131073, "learning_rate": 4.359183673469388e-05, "loss": 0.2558, "step": 728 }, { "epoch": 0.1458, "grad_norm": 1.754607915878296, "learning_rate": 4.358163265306123e-05, "loss": 0.2429, "step": 729 }, { "epoch": 0.146, "grad_norm": 1.7153891324996948, "learning_rate": 4.3571428571428576e-05, "loss": 0.1047, "step": 730 }, { "epoch": 0.1462, "grad_norm": 1.9489779472351074, "learning_rate": 4.356122448979592e-05, "loss": 0.1325, "step": 731 }, { "epoch": 0.1464, "grad_norm": 1.736901879310608, "learning_rate": 4.355102040816327e-05, "loss": 0.1266, "step": 732 }, { "epoch": 0.1466, "grad_norm": 25.8922176361084, "learning_rate": 4.354081632653062e-05, "loss": 3.2462, "step": 733 }, { "epoch": 0.1468, "grad_norm": 8.070469856262207, "learning_rate": 4.353061224489796e-05, "loss": 2.528, "step": 734 }, { "epoch": 0.147, "grad_norm": 6.357401371002197, "learning_rate": 4.352040816326531e-05, "loss": 0.6996, "step": 735 }, { "epoch": 0.1472, "grad_norm": 1.9338181018829346, "learning_rate": 4.351020408163266e-05, "loss": 0.2494, "step": 736 }, { "epoch": 0.1474, "grad_norm": 7.206000804901123, "learning_rate": 4.35e-05, "loss": 1.5507, "step": 737 }, { "epoch": 0.1476, "grad_norm": 5.9976019859313965, "learning_rate": 4.348979591836735e-05, "loss": 0.9014, "step": 738 }, { "epoch": 0.1478, "grad_norm": 4.735384464263916, "learning_rate": 4.34795918367347e-05, "loss": 0.816, "step": 739 }, { "epoch": 0.148, "grad_norm": 1.800541639328003, "learning_rate": 4.346938775510204e-05, "loss": 0.1901, "step": 740 }, { "epoch": 0.1482, "grad_norm": 2.0132083892822266, "learning_rate": 4.345918367346939e-05, "loss": 0.2509, "step": 741 }, { "epoch": 0.1484, "grad_norm": 1.8128314018249512, "learning_rate": 4.344897959183673e-05, "loss": 0.051, "step": 742 }, { "epoch": 0.1486, "grad_norm": 2.1387152671813965, "learning_rate": 4.343877551020408e-05, "loss": 0.2722, "step": 743 }, { "epoch": 0.1488, "grad_norm": 2.3190178871154785, "learning_rate": 4.342857142857143e-05, "loss": 0.2977, "step": 744 }, { "epoch": 0.149, "grad_norm": 7.440555572509766, "learning_rate": 4.341836734693877e-05, "loss": 1.4205, "step": 745 }, { "epoch": 0.1492, "grad_norm": 1.9111486673355103, "learning_rate": 4.340816326530612e-05, "loss": 0.1122, "step": 746 }, { "epoch": 0.1494, "grad_norm": 13.870625495910645, "learning_rate": 4.339795918367347e-05, "loss": 2.2386, "step": 747 }, { "epoch": 0.1496, "grad_norm": 1.77839994430542, "learning_rate": 4.3387755102040814e-05, "loss": 0.1634, "step": 748 }, { "epoch": 0.1498, "grad_norm": 7.970048427581787, "learning_rate": 4.337755102040816e-05, "loss": 0.7243, "step": 749 }, { "epoch": 0.15, "grad_norm": 4.571451187133789, "learning_rate": 4.336734693877551e-05, "loss": 0.3174, "step": 750 }, { "epoch": 0.1502, "grad_norm": 1.6975948810577393, "learning_rate": 4.3357142857142855e-05, "loss": 0.1841, "step": 751 }, { "epoch": 0.1504, "grad_norm": 7.2954816818237305, "learning_rate": 4.3346938775510204e-05, "loss": 1.3127, "step": 752 }, { "epoch": 0.1506, "grad_norm": 4.603607177734375, "learning_rate": 4.333673469387755e-05, "loss": 1.1817, "step": 753 }, { "epoch": 0.1508, "grad_norm": 1.546514630317688, "learning_rate": 4.3326530612244896e-05, "loss": 0.2038, "step": 754 }, { "epoch": 0.151, "grad_norm": 1.9785537719726562, "learning_rate": 4.3316326530612245e-05, "loss": 0.2896, "step": 755 }, { "epoch": 0.1512, "grad_norm": 4.313318252563477, "learning_rate": 4.3306122448979594e-05, "loss": 1.1541, "step": 756 }, { "epoch": 0.1514, "grad_norm": 4.420673370361328, "learning_rate": 4.3295918367346936e-05, "loss": 1.0872, "step": 757 }, { "epoch": 0.1516, "grad_norm": 4.9215288162231445, "learning_rate": 4.328571428571429e-05, "loss": 0.8623, "step": 758 }, { "epoch": 0.1518, "grad_norm": 1.6973063945770264, "learning_rate": 4.3275510204081635e-05, "loss": 0.0893, "step": 759 }, { "epoch": 0.152, "grad_norm": 1.7262623310089111, "learning_rate": 4.3265306122448984e-05, "loss": 0.2229, "step": 760 }, { "epoch": 0.1522, "grad_norm": 2.2670018672943115, "learning_rate": 4.325510204081633e-05, "loss": 0.2578, "step": 761 }, { "epoch": 0.1524, "grad_norm": 8.593221664428711, "learning_rate": 4.3244897959183676e-05, "loss": 2.3867, "step": 762 }, { "epoch": 0.1526, "grad_norm": 9.047572135925293, "learning_rate": 4.3234693877551025e-05, "loss": 2.3965, "step": 763 }, { "epoch": 0.1528, "grad_norm": 1.6796197891235352, "learning_rate": 4.3224489795918374e-05, "loss": 0.1077, "step": 764 }, { "epoch": 0.153, "grad_norm": 1.9067870378494263, "learning_rate": 4.3214285714285716e-05, "loss": 0.1235, "step": 765 }, { "epoch": 0.1532, "grad_norm": 6.224389553070068, "learning_rate": 4.3204081632653065e-05, "loss": 1.4195, "step": 766 }, { "epoch": 0.1534, "grad_norm": 6.695178508758545, "learning_rate": 4.3193877551020415e-05, "loss": 1.2996, "step": 767 }, { "epoch": 0.1536, "grad_norm": 3.7735090255737305, "learning_rate": 4.318367346938776e-05, "loss": 0.8999, "step": 768 }, { "epoch": 0.1538, "grad_norm": 18.430770874023438, "learning_rate": 4.3173469387755106e-05, "loss": 5.8124, "step": 769 }, { "epoch": 0.154, "grad_norm": 13.440892219543457, "learning_rate": 4.3163265306122455e-05, "loss": 2.2063, "step": 770 }, { "epoch": 0.1542, "grad_norm": 1.6782939434051514, "learning_rate": 4.31530612244898e-05, "loss": 0.1103, "step": 771 }, { "epoch": 0.1544, "grad_norm": 2.325686454772949, "learning_rate": 4.314285714285715e-05, "loss": 0.0827, "step": 772 }, { "epoch": 0.1546, "grad_norm": 7.990361213684082, "learning_rate": 4.313265306122449e-05, "loss": 1.2616, "step": 773 }, { "epoch": 0.1548, "grad_norm": 1.9885655641555786, "learning_rate": 4.312244897959184e-05, "loss": 0.1165, "step": 774 }, { "epoch": 0.155, "grad_norm": 2.0096423625946045, "learning_rate": 4.311224489795919e-05, "loss": 0.3091, "step": 775 }, { "epoch": 0.1552, "grad_norm": 2.642979621887207, "learning_rate": 4.310204081632653e-05, "loss": 0.3795, "step": 776 }, { "epoch": 0.1554, "grad_norm": 5.770168781280518, "learning_rate": 4.309183673469388e-05, "loss": 0.8668, "step": 777 }, { "epoch": 0.1556, "grad_norm": 1.6523147821426392, "learning_rate": 4.308163265306123e-05, "loss": 0.1001, "step": 778 }, { "epoch": 0.1558, "grad_norm": 2.008592128753662, "learning_rate": 4.307142857142857e-05, "loss": 0.1983, "step": 779 }, { "epoch": 0.156, "grad_norm": 2.2496085166931152, "learning_rate": 4.306122448979592e-05, "loss": 0.2928, "step": 780 }, { "epoch": 0.1562, "grad_norm": 2.215184211730957, "learning_rate": 4.305102040816327e-05, "loss": 0.2607, "step": 781 }, { "epoch": 0.1564, "grad_norm": 4.5969953536987305, "learning_rate": 4.304081632653061e-05, "loss": 1.0656, "step": 782 }, { "epoch": 0.1566, "grad_norm": 2.250063896179199, "learning_rate": 4.303061224489796e-05, "loss": 0.1636, "step": 783 }, { "epoch": 0.1568, "grad_norm": 5.640593528747559, "learning_rate": 4.302040816326531e-05, "loss": 2.3175, "step": 784 }, { "epoch": 0.157, "grad_norm": 15.217977523803711, "learning_rate": 4.301020408163265e-05, "loss": 1.4736, "step": 785 }, { "epoch": 0.1572, "grad_norm": 1.8735414743423462, "learning_rate": 4.3e-05, "loss": 0.1066, "step": 786 }, { "epoch": 0.1574, "grad_norm": 24.6962890625, "learning_rate": 4.298979591836735e-05, "loss": 4.4351, "step": 787 }, { "epoch": 0.1576, "grad_norm": 19.73663330078125, "learning_rate": 4.297959183673469e-05, "loss": 4.5139, "step": 788 }, { "epoch": 0.1578, "grad_norm": 12.441691398620605, "learning_rate": 4.296938775510204e-05, "loss": 2.562, "step": 789 }, { "epoch": 0.158, "grad_norm": 4.711108684539795, "learning_rate": 4.295918367346939e-05, "loss": 0.8065, "step": 790 }, { "epoch": 0.1582, "grad_norm": 2.1329903602600098, "learning_rate": 4.2948979591836734e-05, "loss": 0.0514, "step": 791 }, { "epoch": 0.1584, "grad_norm": 1.7468369007110596, "learning_rate": 4.293877551020408e-05, "loss": 0.1048, "step": 792 }, { "epoch": 0.1586, "grad_norm": 2.452786445617676, "learning_rate": 4.292857142857143e-05, "loss": 0.2923, "step": 793 }, { "epoch": 0.1588, "grad_norm": 3.929147243499756, "learning_rate": 4.2918367346938775e-05, "loss": 2.3275, "step": 794 }, { "epoch": 0.159, "grad_norm": 1.9393938779830933, "learning_rate": 4.2908163265306124e-05, "loss": 0.1292, "step": 795 }, { "epoch": 0.1592, "grad_norm": 7.826672554016113, "learning_rate": 4.2897959183673466e-05, "loss": 1.2891, "step": 796 }, { "epoch": 0.1594, "grad_norm": 1.6239943504333496, "learning_rate": 4.2887755102040816e-05, "loss": 0.1988, "step": 797 }, { "epoch": 0.1596, "grad_norm": 2.3447299003601074, "learning_rate": 4.2877551020408165e-05, "loss": 0.3015, "step": 798 }, { "epoch": 0.1598, "grad_norm": 2.075815439224243, "learning_rate": 4.286734693877551e-05, "loss": 0.2568, "step": 799 }, { "epoch": 0.16, "grad_norm": 1.7597588300704956, "learning_rate": 4.2857142857142856e-05, "loss": 0.104, "step": 800 }, { "epoch": 0.1602, "grad_norm": 4.3136725425720215, "learning_rate": 4.2846938775510205e-05, "loss": 2.4686, "step": 801 }, { "epoch": 0.1604, "grad_norm": 1.7578059434890747, "learning_rate": 4.283673469387755e-05, "loss": 0.0796, "step": 802 }, { "epoch": 0.1606, "grad_norm": 2.104520797729492, "learning_rate": 4.28265306122449e-05, "loss": 0.0624, "step": 803 }, { "epoch": 0.1608, "grad_norm": 1.956167459487915, "learning_rate": 4.281632653061225e-05, "loss": 0.2198, "step": 804 }, { "epoch": 0.161, "grad_norm": 2.076270341873169, "learning_rate": 4.2806122448979595e-05, "loss": 0.0752, "step": 805 }, { "epoch": 0.1612, "grad_norm": 1.9125944375991821, "learning_rate": 4.2795918367346945e-05, "loss": 0.2213, "step": 806 }, { "epoch": 0.1614, "grad_norm": 1.775360345840454, "learning_rate": 4.278571428571429e-05, "loss": 0.109, "step": 807 }, { "epoch": 0.1616, "grad_norm": 4.13633394241333, "learning_rate": 4.2775510204081636e-05, "loss": 2.4488, "step": 808 }, { "epoch": 0.1618, "grad_norm": 8.054276466369629, "learning_rate": 4.2765306122448985e-05, "loss": 2.2969, "step": 809 }, { "epoch": 0.162, "grad_norm": 6.543287754058838, "learning_rate": 4.275510204081633e-05, "loss": 0.7256, "step": 810 }, { "epoch": 0.1622, "grad_norm": 3.911162853240967, "learning_rate": 4.274489795918368e-05, "loss": 2.3648, "step": 811 }, { "epoch": 0.1624, "grad_norm": 3.8917007446289062, "learning_rate": 4.2734693877551026e-05, "loss": 2.3915, "step": 812 }, { "epoch": 0.1626, "grad_norm": 3.657830238342285, "learning_rate": 4.272448979591837e-05, "loss": 2.2125, "step": 813 }, { "epoch": 0.1628, "grad_norm": 46.86573791503906, "learning_rate": 4.271428571428572e-05, "loss": 6.0395, "step": 814 }, { "epoch": 0.163, "grad_norm": 7.47802734375, "learning_rate": 4.270408163265307e-05, "loss": 1.3507, "step": 815 }, { "epoch": 0.1632, "grad_norm": 1.9284485578536987, "learning_rate": 4.269387755102041e-05, "loss": 0.158, "step": 816 }, { "epoch": 0.1634, "grad_norm": 2.250680446624756, "learning_rate": 4.268367346938776e-05, "loss": 0.3714, "step": 817 }, { "epoch": 0.1636, "grad_norm": 6.096149921417236, "learning_rate": 4.267346938775511e-05, "loss": 1.1214, "step": 818 }, { "epoch": 0.1638, "grad_norm": 4.140794277191162, "learning_rate": 4.266326530612245e-05, "loss": 2.4074, "step": 819 }, { "epoch": 0.164, "grad_norm": 7.302412509918213, "learning_rate": 4.26530612244898e-05, "loss": 2.3707, "step": 820 }, { "epoch": 0.1642, "grad_norm": 22.319246292114258, "learning_rate": 4.264285714285715e-05, "loss": 4.4901, "step": 821 }, { "epoch": 0.1644, "grad_norm": 6.420026779174805, "learning_rate": 4.263265306122449e-05, "loss": 2.4446, "step": 822 }, { "epoch": 0.1646, "grad_norm": 15.629339218139648, "learning_rate": 4.262244897959184e-05, "loss": 4.5816, "step": 823 }, { "epoch": 0.1648, "grad_norm": 2.6500132083892822, "learning_rate": 4.261224489795919e-05, "loss": 0.1216, "step": 824 }, { "epoch": 0.165, "grad_norm": 6.980380535125732, "learning_rate": 4.260204081632653e-05, "loss": 1.2799, "step": 825 }, { "epoch": 0.1652, "grad_norm": 1.9646185636520386, "learning_rate": 4.259183673469388e-05, "loss": 0.1346, "step": 826 }, { "epoch": 0.1654, "grad_norm": 3.1223630905151367, "learning_rate": 4.258163265306122e-05, "loss": 0.8811, "step": 827 }, { "epoch": 0.1656, "grad_norm": 2.060901165008545, "learning_rate": 4.257142857142857e-05, "loss": 0.1289, "step": 828 }, { "epoch": 0.1658, "grad_norm": 3.2158594131469727, "learning_rate": 4.256122448979592e-05, "loss": 0.0913, "step": 829 }, { "epoch": 0.166, "grad_norm": 50.38131332397461, "learning_rate": 4.2551020408163264e-05, "loss": 5.8206, "step": 830 }, { "epoch": 0.1662, "grad_norm": 3.1545493602752686, "learning_rate": 4.254081632653061e-05, "loss": 0.0734, "step": 831 }, { "epoch": 0.1664, "grad_norm": 1.6991525888442993, "learning_rate": 4.253061224489796e-05, "loss": 0.2189, "step": 832 }, { "epoch": 0.1666, "grad_norm": 2.1904635429382324, "learning_rate": 4.2520408163265305e-05, "loss": 0.1192, "step": 833 }, { "epoch": 0.1668, "grad_norm": 1.8767436742782593, "learning_rate": 4.2510204081632654e-05, "loss": 0.188, "step": 834 }, { "epoch": 0.167, "grad_norm": 5.210190773010254, "learning_rate": 4.25e-05, "loss": 1.0994, "step": 835 }, { "epoch": 0.1672, "grad_norm": 2.0396203994750977, "learning_rate": 4.2489795918367345e-05, "loss": 0.1005, "step": 836 }, { "epoch": 0.1674, "grad_norm": 1.7658389806747437, "learning_rate": 4.2479591836734695e-05, "loss": 0.0898, "step": 837 }, { "epoch": 0.1676, "grad_norm": 1.6944715976715088, "learning_rate": 4.2469387755102044e-05, "loss": 0.2445, "step": 838 }, { "epoch": 0.1678, "grad_norm": 1.6534080505371094, "learning_rate": 4.2459183673469386e-05, "loss": 0.0917, "step": 839 }, { "epoch": 0.168, "grad_norm": 1.8207001686096191, "learning_rate": 4.2448979591836735e-05, "loss": 0.1746, "step": 840 }, { "epoch": 0.1682, "grad_norm": 1.6604434251785278, "learning_rate": 4.2438775510204085e-05, "loss": 0.1816, "step": 841 }, { "epoch": 0.1684, "grad_norm": 1.7828645706176758, "learning_rate": 4.242857142857143e-05, "loss": 0.2107, "step": 842 }, { "epoch": 0.1686, "grad_norm": 4.550353527069092, "learning_rate": 4.2418367346938776e-05, "loss": 1.1876, "step": 843 }, { "epoch": 0.1688, "grad_norm": 4.328961372375488, "learning_rate": 4.2408163265306125e-05, "loss": 2.4182, "step": 844 }, { "epoch": 0.169, "grad_norm": 2.939178943634033, "learning_rate": 4.239795918367347e-05, "loss": 0.8846, "step": 845 }, { "epoch": 0.1692, "grad_norm": 4.374268054962158, "learning_rate": 4.238775510204082e-05, "loss": 0.962, "step": 846 }, { "epoch": 0.1694, "grad_norm": 1.9265333414077759, "learning_rate": 4.237755102040816e-05, "loss": 0.162, "step": 847 }, { "epoch": 0.1696, "grad_norm": 1.5972001552581787, "learning_rate": 4.236734693877551e-05, "loss": 0.1947, "step": 848 }, { "epoch": 0.1698, "grad_norm": 5.813956260681152, "learning_rate": 4.2357142857142864e-05, "loss": 1.3463, "step": 849 }, { "epoch": 0.17, "grad_norm": 11.302892684936523, "learning_rate": 4.234693877551021e-05, "loss": 1.6315, "step": 850 }, { "epoch": 0.1702, "grad_norm": 1.6612507104873657, "learning_rate": 4.2336734693877556e-05, "loss": 0.1486, "step": 851 }, { "epoch": 0.1704, "grad_norm": 1.7442834377288818, "learning_rate": 4.2326530612244905e-05, "loss": 0.2076, "step": 852 }, { "epoch": 0.1706, "grad_norm": 8.239985466003418, "learning_rate": 4.231632653061225e-05, "loss": 2.4089, "step": 853 }, { "epoch": 0.1708, "grad_norm": 1.635135531425476, "learning_rate": 4.23061224489796e-05, "loss": 0.1433, "step": 854 }, { "epoch": 0.171, "grad_norm": 11.539763450622559, "learning_rate": 4.2295918367346946e-05, "loss": 2.2066, "step": 855 }, { "epoch": 0.1712, "grad_norm": 2.6980319023132324, "learning_rate": 4.228571428571429e-05, "loss": 0.082, "step": 856 }, { "epoch": 0.1714, "grad_norm": 2.544710397720337, "learning_rate": 4.227551020408164e-05, "loss": 0.1421, "step": 857 }, { "epoch": 0.1716, "grad_norm": 6.985103130340576, "learning_rate": 4.226530612244898e-05, "loss": 1.4134, "step": 858 }, { "epoch": 0.1718, "grad_norm": 3.9861574172973633, "learning_rate": 4.225510204081633e-05, "loss": 0.7055, "step": 859 }, { "epoch": 0.172, "grad_norm": 1.906098484992981, "learning_rate": 4.224489795918368e-05, "loss": 0.111, "step": 860 }, { "epoch": 0.1722, "grad_norm": 2.332689046859741, "learning_rate": 4.223469387755102e-05, "loss": 0.1167, "step": 861 }, { "epoch": 0.1724, "grad_norm": 15.099422454833984, "learning_rate": 4.222448979591837e-05, "loss": 3.3177, "step": 862 }, { "epoch": 0.1726, "grad_norm": 6.7551984786987305, "learning_rate": 4.221428571428572e-05, "loss": 2.307, "step": 863 }, { "epoch": 0.1728, "grad_norm": 2.085442066192627, "learning_rate": 4.220408163265306e-05, "loss": 0.1908, "step": 864 }, { "epoch": 0.173, "grad_norm": 38.784244537353516, "learning_rate": 4.219387755102041e-05, "loss": 5.5188, "step": 865 }, { "epoch": 0.1732, "grad_norm": 2.1101486682891846, "learning_rate": 4.218367346938776e-05, "loss": 0.2555, "step": 866 }, { "epoch": 0.1734, "grad_norm": 4.37405252456665, "learning_rate": 4.21734693877551e-05, "loss": 2.2608, "step": 867 }, { "epoch": 0.1736, "grad_norm": 12.383613586425781, "learning_rate": 4.216326530612245e-05, "loss": 2.5355, "step": 868 }, { "epoch": 0.1738, "grad_norm": 1.730635166168213, "learning_rate": 4.21530612244898e-05, "loss": 0.2009, "step": 869 }, { "epoch": 0.174, "grad_norm": 8.86620807647705, "learning_rate": 4.214285714285714e-05, "loss": 1.3526, "step": 870 }, { "epoch": 0.1742, "grad_norm": 4.039647102355957, "learning_rate": 4.213265306122449e-05, "loss": 2.3077, "step": 871 }, { "epoch": 0.1744, "grad_norm": 2.9431746006011963, "learning_rate": 4.212244897959184e-05, "loss": 0.1349, "step": 872 }, { "epoch": 0.1746, "grad_norm": 1.9896126985549927, "learning_rate": 4.2112244897959184e-05, "loss": 0.0804, "step": 873 }, { "epoch": 0.1748, "grad_norm": 6.88995361328125, "learning_rate": 4.210204081632653e-05, "loss": 0.6664, "step": 874 }, { "epoch": 0.175, "grad_norm": 2.000009298324585, "learning_rate": 4.209183673469388e-05, "loss": 0.2, "step": 875 }, { "epoch": 0.1752, "grad_norm": 1.6612294912338257, "learning_rate": 4.2081632653061225e-05, "loss": 0.1047, "step": 876 }, { "epoch": 0.1754, "grad_norm": 4.192002296447754, "learning_rate": 4.2071428571428574e-05, "loss": 2.411, "step": 877 }, { "epoch": 0.1756, "grad_norm": 1.8390638828277588, "learning_rate": 4.206122448979592e-05, "loss": 0.214, "step": 878 }, { "epoch": 0.1758, "grad_norm": 1.7344233989715576, "learning_rate": 4.2051020408163265e-05, "loss": 0.2782, "step": 879 }, { "epoch": 0.176, "grad_norm": 1.545967698097229, "learning_rate": 4.2040816326530615e-05, "loss": 0.16, "step": 880 }, { "epoch": 0.1762, "grad_norm": 1.775578498840332, "learning_rate": 4.203061224489796e-05, "loss": 0.2152, "step": 881 }, { "epoch": 0.1764, "grad_norm": 1.6764472723007202, "learning_rate": 4.2020408163265306e-05, "loss": 0.1382, "step": 882 }, { "epoch": 0.1766, "grad_norm": 1.90451979637146, "learning_rate": 4.2010204081632655e-05, "loss": 0.2481, "step": 883 }, { "epoch": 0.1768, "grad_norm": 7.125704765319824, "learning_rate": 4.2e-05, "loss": 2.3166, "step": 884 }, { "epoch": 0.177, "grad_norm": 1.615654706954956, "learning_rate": 4.198979591836735e-05, "loss": 0.1979, "step": 885 }, { "epoch": 0.1772, "grad_norm": 6.812422752380371, "learning_rate": 4.1979591836734696e-05, "loss": 0.543, "step": 886 }, { "epoch": 0.1774, "grad_norm": 4.927462577819824, "learning_rate": 4.196938775510204e-05, "loss": 0.8311, "step": 887 }, { "epoch": 0.1776, "grad_norm": 32.130638122558594, "learning_rate": 4.195918367346939e-05, "loss": 3.5849, "step": 888 }, { "epoch": 0.1778, "grad_norm": 9.701715469360352, "learning_rate": 4.194897959183674e-05, "loss": 1.5411, "step": 889 }, { "epoch": 0.178, "grad_norm": 1.8125908374786377, "learning_rate": 4.193877551020408e-05, "loss": 0.1623, "step": 890 }, { "epoch": 0.1782, "grad_norm": 27.293272018432617, "learning_rate": 4.192857142857143e-05, "loss": 5.7133, "step": 891 }, { "epoch": 0.1784, "grad_norm": 4.58138370513916, "learning_rate": 4.191836734693878e-05, "loss": 2.2323, "step": 892 }, { "epoch": 0.1786, "grad_norm": 1.850812554359436, "learning_rate": 4.190816326530612e-05, "loss": 0.209, "step": 893 }, { "epoch": 0.1788, "grad_norm": 13.71081256866455, "learning_rate": 4.1897959183673476e-05, "loss": 1.3616, "step": 894 }, { "epoch": 0.179, "grad_norm": 4.414346218109131, "learning_rate": 4.188775510204082e-05, "loss": 0.765, "step": 895 }, { "epoch": 0.1792, "grad_norm": 1.5059181451797485, "learning_rate": 4.187755102040817e-05, "loss": 0.2032, "step": 896 }, { "epoch": 0.1794, "grad_norm": 1.4246588945388794, "learning_rate": 4.186734693877552e-05, "loss": 0.1675, "step": 897 }, { "epoch": 0.1796, "grad_norm": 1.410630464553833, "learning_rate": 4.185714285714286e-05, "loss": 0.1632, "step": 898 }, { "epoch": 0.1798, "grad_norm": 9.267348289489746, "learning_rate": 4.184693877551021e-05, "loss": 0.6752, "step": 899 }, { "epoch": 0.18, "grad_norm": 2.1020305156707764, "learning_rate": 4.183673469387756e-05, "loss": 0.0524, "step": 900 }, { "epoch": 0.1802, "grad_norm": 1.746822714805603, "learning_rate": 4.18265306122449e-05, "loss": 0.1791, "step": 901 }, { "epoch": 0.1804, "grad_norm": 3.0003411769866943, "learning_rate": 4.181632653061225e-05, "loss": 0.8223, "step": 902 }, { "epoch": 0.1806, "grad_norm": 1.722442626953125, "learning_rate": 4.18061224489796e-05, "loss": 0.1501, "step": 903 }, { "epoch": 0.1808, "grad_norm": 1.9100525379180908, "learning_rate": 4.179591836734694e-05, "loss": 0.1688, "step": 904 }, { "epoch": 0.181, "grad_norm": 2.5086095333099365, "learning_rate": 4.178571428571429e-05, "loss": 0.0947, "step": 905 }, { "epoch": 0.1812, "grad_norm": 4.063119411468506, "learning_rate": 4.177551020408164e-05, "loss": 0.9714, "step": 906 }, { "epoch": 0.1814, "grad_norm": 6.684601783752441, "learning_rate": 4.176530612244898e-05, "loss": 1.3965, "step": 907 }, { "epoch": 0.1816, "grad_norm": 1.689247727394104, "learning_rate": 4.175510204081633e-05, "loss": 0.1731, "step": 908 }, { "epoch": 0.1818, "grad_norm": 14.05957317352295, "learning_rate": 4.174489795918368e-05, "loss": 2.3015, "step": 909 }, { "epoch": 0.182, "grad_norm": 18.073965072631836, "learning_rate": 4.173469387755102e-05, "loss": 3.9085, "step": 910 }, { "epoch": 0.1822, "grad_norm": 5.947948932647705, "learning_rate": 4.172448979591837e-05, "loss": 1.154, "step": 911 }, { "epoch": 0.1824, "grad_norm": 6.424638748168945, "learning_rate": 4.1714285714285714e-05, "loss": 1.2806, "step": 912 }, { "epoch": 0.1826, "grad_norm": 1.6589370965957642, "learning_rate": 4.170408163265306e-05, "loss": 0.12, "step": 913 }, { "epoch": 0.1828, "grad_norm": 2.2325034141540527, "learning_rate": 4.169387755102041e-05, "loss": 0.1171, "step": 914 }, { "epoch": 0.183, "grad_norm": 6.6812849044799805, "learning_rate": 4.1683673469387754e-05, "loss": 2.3485, "step": 915 }, { "epoch": 0.1832, "grad_norm": 4.563431739807129, "learning_rate": 4.1673469387755104e-05, "loss": 2.3878, "step": 916 }, { "epoch": 0.1834, "grad_norm": 1.316593885421753, "learning_rate": 4.166326530612245e-05, "loss": 0.1204, "step": 917 }, { "epoch": 0.1836, "grad_norm": 2.0545766353607178, "learning_rate": 4.1653061224489795e-05, "loss": 0.1217, "step": 918 }, { "epoch": 0.1838, "grad_norm": 1.4564555883407593, "learning_rate": 4.1642857142857144e-05, "loss": 0.1337, "step": 919 }, { "epoch": 0.184, "grad_norm": 5.850950717926025, "learning_rate": 4.1632653061224494e-05, "loss": 1.317, "step": 920 }, { "epoch": 0.1842, "grad_norm": 5.424844741821289, "learning_rate": 4.1622448979591836e-05, "loss": 1.0097, "step": 921 }, { "epoch": 0.1844, "grad_norm": 4.220058441162109, "learning_rate": 4.1612244897959185e-05, "loss": 2.2679, "step": 922 }, { "epoch": 0.1846, "grad_norm": 1.9250975847244263, "learning_rate": 4.1602040816326534e-05, "loss": 0.1915, "step": 923 }, { "epoch": 0.1848, "grad_norm": 1.4541889429092407, "learning_rate": 4.159183673469388e-05, "loss": 0.1724, "step": 924 }, { "epoch": 0.185, "grad_norm": 81.74874877929688, "learning_rate": 4.1581632653061226e-05, "loss": 6.8632, "step": 925 }, { "epoch": 0.1852, "grad_norm": 18.632627487182617, "learning_rate": 4.1571428571428575e-05, "loss": 0.3878, "step": 926 }, { "epoch": 0.1854, "grad_norm": 1.8790223598480225, "learning_rate": 4.156122448979592e-05, "loss": 0.2882, "step": 927 }, { "epoch": 0.1856, "grad_norm": 1.3396214246749878, "learning_rate": 4.155102040816327e-05, "loss": 0.1173, "step": 928 }, { "epoch": 0.1858, "grad_norm": 12.873229026794434, "learning_rate": 4.1540816326530616e-05, "loss": 4.3862, "step": 929 }, { "epoch": 0.186, "grad_norm": 17.393413543701172, "learning_rate": 4.153061224489796e-05, "loss": 4.3354, "step": 930 }, { "epoch": 0.1862, "grad_norm": 12.224554061889648, "learning_rate": 4.152040816326531e-05, "loss": 1.6314, "step": 931 }, { "epoch": 0.1864, "grad_norm": 4.5745439529418945, "learning_rate": 4.151020408163265e-05, "loss": 1.005, "step": 932 }, { "epoch": 0.1866, "grad_norm": 1.3737479448318481, "learning_rate": 4.15e-05, "loss": 0.1172, "step": 933 }, { "epoch": 0.1868, "grad_norm": 4.484574794769287, "learning_rate": 4.148979591836735e-05, "loss": 2.1438, "step": 934 }, { "epoch": 0.187, "grad_norm": 13.013484001159668, "learning_rate": 4.147959183673469e-05, "loss": 1.3852, "step": 935 }, { "epoch": 0.1872, "grad_norm": 1.4868980646133423, "learning_rate": 4.146938775510204e-05, "loss": 0.129, "step": 936 }, { "epoch": 0.1874, "grad_norm": 2.100748300552368, "learning_rate": 4.145918367346939e-05, "loss": 0.1455, "step": 937 }, { "epoch": 0.1876, "grad_norm": 17.68741798400879, "learning_rate": 4.144897959183673e-05, "loss": 4.359, "step": 938 }, { "epoch": 0.1878, "grad_norm": 4.203042030334473, "learning_rate": 4.143877551020408e-05, "loss": 1.1126, "step": 939 }, { "epoch": 0.188, "grad_norm": 7.120700836181641, "learning_rate": 4.1428571428571437e-05, "loss": 1.2638, "step": 940 }, { "epoch": 0.1882, "grad_norm": 3.8899881839752197, "learning_rate": 4.141836734693878e-05, "loss": 0.9725, "step": 941 }, { "epoch": 0.1884, "grad_norm": 9.656298637390137, "learning_rate": 4.140816326530613e-05, "loss": 0.6918, "step": 942 }, { "epoch": 0.1886, "grad_norm": 6.113008499145508, "learning_rate": 4.139795918367347e-05, "loss": 1.315, "step": 943 }, { "epoch": 0.1888, "grad_norm": 8.569879531860352, "learning_rate": 4.138775510204082e-05, "loss": 1.3466, "step": 944 }, { "epoch": 0.189, "grad_norm": 7.1254353523254395, "learning_rate": 4.137755102040817e-05, "loss": 2.4282, "step": 945 }, { "epoch": 0.1892, "grad_norm": 1.4450511932373047, "learning_rate": 4.136734693877551e-05, "loss": 0.1443, "step": 946 }, { "epoch": 0.1894, "grad_norm": 3.5518486499786377, "learning_rate": 4.135714285714286e-05, "loss": 0.3862, "step": 947 }, { "epoch": 0.1896, "grad_norm": 6.623416423797607, "learning_rate": 4.134693877551021e-05, "loss": 1.2714, "step": 948 }, { "epoch": 0.1898, "grad_norm": 1.5826369524002075, "learning_rate": 4.133673469387755e-05, "loss": 0.1646, "step": 949 }, { "epoch": 0.19, "grad_norm": 1.6537444591522217, "learning_rate": 4.13265306122449e-05, "loss": 0.0755, "step": 950 }, { "epoch": 0.1902, "grad_norm": 33.40952682495117, "learning_rate": 4.131632653061225e-05, "loss": 0.794, "step": 951 }, { "epoch": 0.1904, "grad_norm": 18.153114318847656, "learning_rate": 4.130612244897959e-05, "loss": 2.4001, "step": 952 }, { "epoch": 0.1906, "grad_norm": 11.340116500854492, "learning_rate": 4.129591836734694e-05, "loss": 2.5849, "step": 953 }, { "epoch": 0.1908, "grad_norm": 8.346242904663086, "learning_rate": 4.128571428571429e-05, "loss": 2.2724, "step": 954 }, { "epoch": 0.191, "grad_norm": 1.4004164934158325, "learning_rate": 4.1275510204081634e-05, "loss": 0.1659, "step": 955 }, { "epoch": 0.1912, "grad_norm": 2.0775654315948486, "learning_rate": 4.126530612244898e-05, "loss": 0.2222, "step": 956 }, { "epoch": 0.1914, "grad_norm": 1.3837953805923462, "learning_rate": 4.125510204081633e-05, "loss": 0.1013, "step": 957 }, { "epoch": 0.1916, "grad_norm": 1.9300739765167236, "learning_rate": 4.1244897959183674e-05, "loss": 0.0995, "step": 958 }, { "epoch": 0.1918, "grad_norm": 1.8797160387039185, "learning_rate": 4.1234693877551024e-05, "loss": 0.097, "step": 959 }, { "epoch": 0.192, "grad_norm": 1.974260926246643, "learning_rate": 4.122448979591837e-05, "loss": 0.1699, "step": 960 }, { "epoch": 0.1922, "grad_norm": 9.13991928100586, "learning_rate": 4.1214285714285715e-05, "loss": 1.3085, "step": 961 }, { "epoch": 0.1924, "grad_norm": 1.9515283107757568, "learning_rate": 4.1204081632653064e-05, "loss": 0.1082, "step": 962 }, { "epoch": 0.1926, "grad_norm": 1.9016188383102417, "learning_rate": 4.119387755102041e-05, "loss": 0.1154, "step": 963 }, { "epoch": 0.1928, "grad_norm": 7.120417594909668, "learning_rate": 4.1183673469387756e-05, "loss": 2.2656, "step": 964 }, { "epoch": 0.193, "grad_norm": 8.615710258483887, "learning_rate": 4.1173469387755105e-05, "loss": 1.2942, "step": 965 }, { "epoch": 0.1932, "grad_norm": 10.473421096801758, "learning_rate": 4.116326530612245e-05, "loss": 0.8338, "step": 966 }, { "epoch": 0.1934, "grad_norm": 1.2581517696380615, "learning_rate": 4.11530612244898e-05, "loss": 0.1018, "step": 967 }, { "epoch": 0.1936, "grad_norm": 1.8771737813949585, "learning_rate": 4.1142857142857146e-05, "loss": 0.1048, "step": 968 }, { "epoch": 0.1938, "grad_norm": 1.536227822303772, "learning_rate": 4.113265306122449e-05, "loss": 0.0699, "step": 969 }, { "epoch": 0.194, "grad_norm": 5.4825520515441895, "learning_rate": 4.112244897959184e-05, "loss": 2.3103, "step": 970 }, { "epoch": 0.1942, "grad_norm": 6.8128790855407715, "learning_rate": 4.1112244897959187e-05, "loss": 0.7666, "step": 971 }, { "epoch": 0.1944, "grad_norm": 5.419755935668945, "learning_rate": 4.110204081632653e-05, "loss": 1.3184, "step": 972 }, { "epoch": 0.1946, "grad_norm": 1.8374887704849243, "learning_rate": 4.109183673469388e-05, "loss": 0.1427, "step": 973 }, { "epoch": 0.1948, "grad_norm": 4.784745693206787, "learning_rate": 4.108163265306123e-05, "loss": 1.1199, "step": 974 }, { "epoch": 0.195, "grad_norm": 1.6024447679519653, "learning_rate": 4.107142857142857e-05, "loss": 0.0935, "step": 975 }, { "epoch": 0.1952, "grad_norm": 2.622997760772705, "learning_rate": 4.106122448979592e-05, "loss": 0.8585, "step": 976 }, { "epoch": 0.1954, "grad_norm": 1.4445765018463135, "learning_rate": 4.105102040816327e-05, "loss": 0.1321, "step": 977 }, { "epoch": 0.1956, "grad_norm": 1.9010988473892212, "learning_rate": 4.104081632653061e-05, "loss": 0.1113, "step": 978 }, { "epoch": 0.1958, "grad_norm": 1.8826258182525635, "learning_rate": 4.103061224489796e-05, "loss": 0.1027, "step": 979 }, { "epoch": 0.196, "grad_norm": 1.7285670042037964, "learning_rate": 4.102040816326531e-05, "loss": 0.096, "step": 980 }, { "epoch": 0.1962, "grad_norm": 24.584623336791992, "learning_rate": 4.101020408163265e-05, "loss": 1.6457, "step": 981 }, { "epoch": 0.1964, "grad_norm": 33.21518325805664, "learning_rate": 4.1e-05, "loss": 5.8246, "step": 982 }, { "epoch": 0.1966, "grad_norm": 2.0982282161712646, "learning_rate": 4.098979591836735e-05, "loss": 0.1859, "step": 983 }, { "epoch": 0.1968, "grad_norm": 1.2820332050323486, "learning_rate": 4.097959183673469e-05, "loss": 0.0985, "step": 984 }, { "epoch": 0.197, "grad_norm": 8.416690826416016, "learning_rate": 4.096938775510205e-05, "loss": 1.5457, "step": 985 }, { "epoch": 0.1972, "grad_norm": 1.9515358209609985, "learning_rate": 4.095918367346939e-05, "loss": 0.1537, "step": 986 }, { "epoch": 0.1974, "grad_norm": 1.6056196689605713, "learning_rate": 4.094897959183674e-05, "loss": 0.1138, "step": 987 }, { "epoch": 0.1976, "grad_norm": 10.302643775939941, "learning_rate": 4.093877551020409e-05, "loss": 3.2709, "step": 988 }, { "epoch": 0.1978, "grad_norm": 5.601901531219482, "learning_rate": 4.092857142857143e-05, "loss": 0.774, "step": 989 }, { "epoch": 0.198, "grad_norm": 1.9346834421157837, "learning_rate": 4.091836734693878e-05, "loss": 0.1553, "step": 990 }, { "epoch": 0.1982, "grad_norm": 2.078249454498291, "learning_rate": 4.090816326530613e-05, "loss": 0.1278, "step": 991 }, { "epoch": 0.1984, "grad_norm": 1.7769577503204346, "learning_rate": 4.089795918367347e-05, "loss": 0.0883, "step": 992 }, { "epoch": 0.1986, "grad_norm": 1.2814565896987915, "learning_rate": 4.088775510204082e-05, "loss": 0.1313, "step": 993 }, { "epoch": 0.1988, "grad_norm": 6.84991455078125, "learning_rate": 4.0877551020408164e-05, "loss": 1.2505, "step": 994 }, { "epoch": 0.199, "grad_norm": 1.601853609085083, "learning_rate": 4.086734693877551e-05, "loss": 0.157, "step": 995 }, { "epoch": 0.1992, "grad_norm": 8.058499336242676, "learning_rate": 4.085714285714286e-05, "loss": 0.6271, "step": 996 }, { "epoch": 0.1994, "grad_norm": 1.5115468502044678, "learning_rate": 4.0846938775510204e-05, "loss": 0.2509, "step": 997 }, { "epoch": 0.1996, "grad_norm": 1.6834666728973389, "learning_rate": 4.0836734693877553e-05, "loss": 0.0936, "step": 998 }, { "epoch": 0.1998, "grad_norm": 9.186635971069336, "learning_rate": 4.08265306122449e-05, "loss": 2.1009, "step": 999 }, { "epoch": 0.2, "grad_norm": 1.8956700563430786, "learning_rate": 4.0816326530612245e-05, "loss": 0.1993, "step": 1000 }, { "epoch": 0.2002, "grad_norm": 1.4122716188430786, "learning_rate": 4.0806122448979594e-05, "loss": 0.1491, "step": 1001 }, { "epoch": 0.2004, "grad_norm": 11.156471252441406, "learning_rate": 4.0795918367346943e-05, "loss": 3.8965, "step": 1002 }, { "epoch": 0.2006, "grad_norm": 1.5105379819869995, "learning_rate": 4.0785714285714286e-05, "loss": 0.1107, "step": 1003 }, { "epoch": 0.2008, "grad_norm": 70.52470397949219, "learning_rate": 4.0775510204081635e-05, "loss": 6.2614, "step": 1004 }, { "epoch": 0.201, "grad_norm": 4.5841498374938965, "learning_rate": 4.0765306122448984e-05, "loss": 0.8138, "step": 1005 }, { "epoch": 0.2012, "grad_norm": 3.8832404613494873, "learning_rate": 4.0755102040816327e-05, "loss": 0.9416, "step": 1006 }, { "epoch": 0.2014, "grad_norm": 7.785485744476318, "learning_rate": 4.0744897959183676e-05, "loss": 2.2909, "step": 1007 }, { "epoch": 0.2016, "grad_norm": 6.2355780601501465, "learning_rate": 4.0734693877551025e-05, "loss": 1.1862, "step": 1008 }, { "epoch": 0.2018, "grad_norm": 7.021670818328857, "learning_rate": 4.072448979591837e-05, "loss": 0.5645, "step": 1009 }, { "epoch": 0.202, "grad_norm": 1.8375966548919678, "learning_rate": 4.0714285714285717e-05, "loss": 0.0982, "step": 1010 }, { "epoch": 0.2022, "grad_norm": 1.345582365989685, "learning_rate": 4.0704081632653066e-05, "loss": 0.0887, "step": 1011 }, { "epoch": 0.2024, "grad_norm": 1.925706148147583, "learning_rate": 4.069387755102041e-05, "loss": 0.1029, "step": 1012 }, { "epoch": 0.2026, "grad_norm": 21.210050582885742, "learning_rate": 4.068367346938776e-05, "loss": 3.965, "step": 1013 }, { "epoch": 0.2028, "grad_norm": 1.2436354160308838, "learning_rate": 4.0673469387755106e-05, "loss": 0.0701, "step": 1014 }, { "epoch": 0.203, "grad_norm": 11.792455673217773, "learning_rate": 4.066326530612245e-05, "loss": 5.489, "step": 1015 }, { "epoch": 0.2032, "grad_norm": 1.7196590900421143, "learning_rate": 4.06530612244898e-05, "loss": 0.1133, "step": 1016 }, { "epoch": 0.2034, "grad_norm": 1.7486034631729126, "learning_rate": 4.064285714285714e-05, "loss": 0.0889, "step": 1017 }, { "epoch": 0.2036, "grad_norm": 1.5281075239181519, "learning_rate": 4.063265306122449e-05, "loss": 0.1184, "step": 1018 }, { "epoch": 0.2038, "grad_norm": 1.5642445087432861, "learning_rate": 4.062244897959184e-05, "loss": 0.0774, "step": 1019 }, { "epoch": 0.204, "grad_norm": 1.566006064414978, "learning_rate": 4.061224489795918e-05, "loss": 0.1203, "step": 1020 }, { "epoch": 0.2042, "grad_norm": 9.783367156982422, "learning_rate": 4.060204081632653e-05, "loss": 2.1491, "step": 1021 }, { "epoch": 0.2044, "grad_norm": 16.899147033691406, "learning_rate": 4.059183673469388e-05, "loss": 4.2063, "step": 1022 }, { "epoch": 0.2046, "grad_norm": 1.4680163860321045, "learning_rate": 4.058163265306122e-05, "loss": 0.0917, "step": 1023 }, { "epoch": 0.2048, "grad_norm": 4.537769317626953, "learning_rate": 4.057142857142857e-05, "loss": 0.7457, "step": 1024 }, { "epoch": 0.205, "grad_norm": 1.6647769212722778, "learning_rate": 4.056122448979592e-05, "loss": 0.0705, "step": 1025 }, { "epoch": 0.2052, "grad_norm": 8.374734878540039, "learning_rate": 4.055102040816326e-05, "loss": 4.0424, "step": 1026 }, { "epoch": 0.2054, "grad_norm": 1.743696689605713, "learning_rate": 4.054081632653061e-05, "loss": 0.1013, "step": 1027 }, { "epoch": 0.2056, "grad_norm": 6.450601577758789, "learning_rate": 4.053061224489796e-05, "loss": 1.207, "step": 1028 }, { "epoch": 0.2058, "grad_norm": 14.592948913574219, "learning_rate": 4.0520408163265304e-05, "loss": 4.1025, "step": 1029 }, { "epoch": 0.206, "grad_norm": 1.6705548763275146, "learning_rate": 4.051020408163265e-05, "loss": 0.084, "step": 1030 }, { "epoch": 0.2062, "grad_norm": 9.311860084533691, "learning_rate": 4.05e-05, "loss": 0.6941, "step": 1031 }, { "epoch": 0.2064, "grad_norm": 1.5666495561599731, "learning_rate": 4.048979591836735e-05, "loss": 0.1536, "step": 1032 }, { "epoch": 0.2066, "grad_norm": 1.5592368841171265, "learning_rate": 4.04795918367347e-05, "loss": 0.1401, "step": 1033 }, { "epoch": 0.2068, "grad_norm": 2.0136380195617676, "learning_rate": 4.046938775510204e-05, "loss": 0.114, "step": 1034 }, { "epoch": 0.207, "grad_norm": 1.42483651638031, "learning_rate": 4.045918367346939e-05, "loss": 0.1033, "step": 1035 }, { "epoch": 0.2072, "grad_norm": 1.5242152214050293, "learning_rate": 4.044897959183674e-05, "loss": 0.074, "step": 1036 }, { "epoch": 0.2074, "grad_norm": 7.547857761383057, "learning_rate": 4.0438775510204083e-05, "loss": 2.3123, "step": 1037 }, { "epoch": 0.2076, "grad_norm": 1.7124192714691162, "learning_rate": 4.042857142857143e-05, "loss": 0.1537, "step": 1038 }, { "epoch": 0.2078, "grad_norm": 1.7321916818618774, "learning_rate": 4.041836734693878e-05, "loss": 0.1755, "step": 1039 }, { "epoch": 0.208, "grad_norm": 1.7035274505615234, "learning_rate": 4.0408163265306124e-05, "loss": 0.1048, "step": 1040 }, { "epoch": 0.2082, "grad_norm": 7.646542549133301, "learning_rate": 4.039795918367347e-05, "loss": 0.5286, "step": 1041 }, { "epoch": 0.2084, "grad_norm": 30.4531307220459, "learning_rate": 4.038775510204082e-05, "loss": 3.739, "step": 1042 }, { "epoch": 0.2086, "grad_norm": 1.4997987747192383, "learning_rate": 4.0377551020408165e-05, "loss": 0.1062, "step": 1043 }, { "epoch": 0.2088, "grad_norm": 1.8914159536361694, "learning_rate": 4.0367346938775514e-05, "loss": 0.0973, "step": 1044 }, { "epoch": 0.209, "grad_norm": 60.22584533691406, "learning_rate": 4.035714285714286e-05, "loss": 5.7281, "step": 1045 }, { "epoch": 0.2092, "grad_norm": 4.276406764984131, "learning_rate": 4.0346938775510206e-05, "loss": 0.7713, "step": 1046 }, { "epoch": 0.2094, "grad_norm": 10.648032188415527, "learning_rate": 4.0336734693877555e-05, "loss": 1.5118, "step": 1047 }, { "epoch": 0.2096, "grad_norm": 1.9461679458618164, "learning_rate": 4.03265306122449e-05, "loss": 0.1174, "step": 1048 }, { "epoch": 0.2098, "grad_norm": 4.108247756958008, "learning_rate": 4.0316326530612246e-05, "loss": 1.0173, "step": 1049 }, { "epoch": 0.21, "grad_norm": 2.454129934310913, "learning_rate": 4.0306122448979596e-05, "loss": 0.1232, "step": 1050 }, { "epoch": 0.2102, "grad_norm": 1.7871676683425903, "learning_rate": 4.029591836734694e-05, "loss": 0.0944, "step": 1051 }, { "epoch": 0.2104, "grad_norm": 5.2860307693481445, "learning_rate": 4.028571428571429e-05, "loss": 1.1604, "step": 1052 }, { "epoch": 0.2106, "grad_norm": 4.047928810119629, "learning_rate": 4.0275510204081636e-05, "loss": 0.7197, "step": 1053 }, { "epoch": 0.2108, "grad_norm": 28.927228927612305, "learning_rate": 4.026530612244898e-05, "loss": 4.3561, "step": 1054 }, { "epoch": 0.211, "grad_norm": 29.201982498168945, "learning_rate": 4.025510204081633e-05, "loss": 1.1396, "step": 1055 }, { "epoch": 0.2112, "grad_norm": 1.7634601593017578, "learning_rate": 4.024489795918368e-05, "loss": 0.1091, "step": 1056 }, { "epoch": 0.2114, "grad_norm": 6.9372124671936035, "learning_rate": 4.023469387755102e-05, "loss": 1.2503, "step": 1057 }, { "epoch": 0.2116, "grad_norm": 2.028412103652954, "learning_rate": 4.022448979591837e-05, "loss": 0.1896, "step": 1058 }, { "epoch": 0.2118, "grad_norm": 1.4823191165924072, "learning_rate": 4.021428571428572e-05, "loss": 0.2048, "step": 1059 }, { "epoch": 0.212, "grad_norm": 1.8830736875534058, "learning_rate": 4.020408163265306e-05, "loss": 0.1155, "step": 1060 }, { "epoch": 0.2122, "grad_norm": 8.2816162109375, "learning_rate": 4.019387755102041e-05, "loss": 0.5329, "step": 1061 }, { "epoch": 0.2124, "grad_norm": 3.7619011402130127, "learning_rate": 4.018367346938776e-05, "loss": 0.8836, "step": 1062 }, { "epoch": 0.2126, "grad_norm": 1.6038355827331543, "learning_rate": 4.01734693877551e-05, "loss": 0.0724, "step": 1063 }, { "epoch": 0.2128, "grad_norm": 2.4276578426361084, "learning_rate": 4.016326530612245e-05, "loss": 0.2589, "step": 1064 }, { "epoch": 0.213, "grad_norm": 6.890028953552246, "learning_rate": 4.01530612244898e-05, "loss": 1.2599, "step": 1065 }, { "epoch": 0.2132, "grad_norm": 3.703716993331909, "learning_rate": 4.014285714285714e-05, "loss": 0.8813, "step": 1066 }, { "epoch": 0.2134, "grad_norm": 1.9085053205490112, "learning_rate": 4.013265306122449e-05, "loss": 0.1695, "step": 1067 }, { "epoch": 0.2136, "grad_norm": 4.963956356048584, "learning_rate": 4.0122448979591833e-05, "loss": 1.195, "step": 1068 }, { "epoch": 0.2138, "grad_norm": 1.4116706848144531, "learning_rate": 4.011224489795918e-05, "loss": 0.0697, "step": 1069 }, { "epoch": 0.214, "grad_norm": 6.491616725921631, "learning_rate": 4.010204081632653e-05, "loss": 1.1058, "step": 1070 }, { "epoch": 0.2142, "grad_norm": 17.118547439575195, "learning_rate": 4.0091836734693874e-05, "loss": 2.6657, "step": 1071 }, { "epoch": 0.2144, "grad_norm": 1.7242052555084229, "learning_rate": 4.008163265306122e-05, "loss": 0.0941, "step": 1072 }, { "epoch": 0.2146, "grad_norm": 22.678550720214844, "learning_rate": 4.007142857142857e-05, "loss": 4.1993, "step": 1073 }, { "epoch": 0.2148, "grad_norm": 3.679158926010132, "learning_rate": 4.0061224489795915e-05, "loss": 0.9252, "step": 1074 }, { "epoch": 0.215, "grad_norm": 1.7487565279006958, "learning_rate": 4.0051020408163264e-05, "loss": 0.0862, "step": 1075 }, { "epoch": 0.2152, "grad_norm": 1.4549202919006348, "learning_rate": 4.004081632653062e-05, "loss": 0.1285, "step": 1076 }, { "epoch": 0.2154, "grad_norm": 21.872774124145508, "learning_rate": 4.003061224489796e-05, "loss": 4.259, "step": 1077 }, { "epoch": 0.2156, "grad_norm": 1.3242027759552002, "learning_rate": 4.002040816326531e-05, "loss": 0.1155, "step": 1078 }, { "epoch": 0.2158, "grad_norm": 1.9196478128433228, "learning_rate": 4.0010204081632654e-05, "loss": 0.1505, "step": 1079 }, { "epoch": 0.216, "grad_norm": 1.3824145793914795, "learning_rate": 4e-05, "loss": 0.1283, "step": 1080 }, { "epoch": 0.2162, "grad_norm": 1.5698976516723633, "learning_rate": 3.998979591836735e-05, "loss": 0.0639, "step": 1081 }, { "epoch": 0.2164, "grad_norm": 1.4868955612182617, "learning_rate": 3.9979591836734695e-05, "loss": 0.125, "step": 1082 }, { "epoch": 0.2166, "grad_norm": 1.6327617168426514, "learning_rate": 3.9969387755102044e-05, "loss": 0.1406, "step": 1083 }, { "epoch": 0.2168, "grad_norm": 3.6507341861724854, "learning_rate": 3.995918367346939e-05, "loss": 0.9076, "step": 1084 }, { "epoch": 0.217, "grad_norm": 1.826192021369934, "learning_rate": 3.9948979591836736e-05, "loss": 0.1481, "step": 1085 }, { "epoch": 0.2172, "grad_norm": 7.602888584136963, "learning_rate": 3.9938775510204085e-05, "loss": 2.2922, "step": 1086 }, { "epoch": 0.2174, "grad_norm": 8.769946098327637, "learning_rate": 3.9928571428571434e-05, "loss": 1.3738, "step": 1087 }, { "epoch": 0.2176, "grad_norm": 1.91317880153656, "learning_rate": 3.9918367346938776e-05, "loss": 0.1733, "step": 1088 }, { "epoch": 0.2178, "grad_norm": 1.6258636713027954, "learning_rate": 3.9908163265306126e-05, "loss": 0.145, "step": 1089 }, { "epoch": 0.218, "grad_norm": 22.450666427612305, "learning_rate": 3.9897959183673475e-05, "loss": 4.5677, "step": 1090 }, { "epoch": 0.2182, "grad_norm": 20.471054077148438, "learning_rate": 3.988775510204082e-05, "loss": 2.4693, "step": 1091 }, { "epoch": 0.2184, "grad_norm": 1.944197177886963, "learning_rate": 3.9877551020408166e-05, "loss": 0.0889, "step": 1092 }, { "epoch": 0.2186, "grad_norm": 5.251967906951904, "learning_rate": 3.9867346938775516e-05, "loss": 0.6365, "step": 1093 }, { "epoch": 0.2188, "grad_norm": 6.404879093170166, "learning_rate": 3.985714285714286e-05, "loss": 1.2409, "step": 1094 }, { "epoch": 0.219, "grad_norm": 1.7063329219818115, "learning_rate": 3.984693877551021e-05, "loss": 0.1512, "step": 1095 }, { "epoch": 0.2192, "grad_norm": 1.841525912284851, "learning_rate": 3.9836734693877556e-05, "loss": 0.1593, "step": 1096 }, { "epoch": 0.2194, "grad_norm": 7.1279826164245605, "learning_rate": 3.98265306122449e-05, "loss": 1.1394, "step": 1097 }, { "epoch": 0.2196, "grad_norm": 4.164473533630371, "learning_rate": 3.981632653061225e-05, "loss": 0.752, "step": 1098 }, { "epoch": 0.2198, "grad_norm": 9.854219436645508, "learning_rate": 3.980612244897959e-05, "loss": 3.5586, "step": 1099 }, { "epoch": 0.22, "grad_norm": 6.977601528167725, "learning_rate": 3.979591836734694e-05, "loss": 2.3759, "step": 1100 }, { "epoch": 0.2202, "grad_norm": 31.07599449157715, "learning_rate": 3.978571428571429e-05, "loss": 3.3308, "step": 1101 }, { "epoch": 0.2204, "grad_norm": 1.951879620552063, "learning_rate": 3.977551020408163e-05, "loss": 0.1847, "step": 1102 }, { "epoch": 0.2206, "grad_norm": 2.1651017665863037, "learning_rate": 3.976530612244898e-05, "loss": 0.197, "step": 1103 }, { "epoch": 0.2208, "grad_norm": 1.6172817945480347, "learning_rate": 3.975510204081633e-05, "loss": 0.152, "step": 1104 }, { "epoch": 0.221, "grad_norm": 1.4493610858917236, "learning_rate": 3.974489795918367e-05, "loss": 0.1325, "step": 1105 }, { "epoch": 0.2212, "grad_norm": 4.297125339508057, "learning_rate": 3.973469387755102e-05, "loss": 1.1716, "step": 1106 }, { "epoch": 0.2214, "grad_norm": 5.791296482086182, "learning_rate": 3.972448979591837e-05, "loss": 1.1987, "step": 1107 }, { "epoch": 0.2216, "grad_norm": 4.080999374389648, "learning_rate": 3.971428571428571e-05, "loss": 1.0897, "step": 1108 }, { "epoch": 0.2218, "grad_norm": 1.6441861391067505, "learning_rate": 3.970408163265306e-05, "loss": 0.1664, "step": 1109 }, { "epoch": 0.222, "grad_norm": 9.482796669006348, "learning_rate": 3.969387755102041e-05, "loss": 3.391, "step": 1110 }, { "epoch": 0.2222, "grad_norm": 6.508853435516357, "learning_rate": 3.968367346938775e-05, "loss": 2.2502, "step": 1111 }, { "epoch": 0.2224, "grad_norm": 4.284785747528076, "learning_rate": 3.96734693877551e-05, "loss": 1.0736, "step": 1112 }, { "epoch": 0.2226, "grad_norm": 1.7649943828582764, "learning_rate": 3.966326530612245e-05, "loss": 0.1619, "step": 1113 }, { "epoch": 0.2228, "grad_norm": 2.027397394180298, "learning_rate": 3.9653061224489794e-05, "loss": 0.1351, "step": 1114 }, { "epoch": 0.223, "grad_norm": 1.4270291328430176, "learning_rate": 3.964285714285714e-05, "loss": 0.1335, "step": 1115 }, { "epoch": 0.2232, "grad_norm": 1.7953479290008545, "learning_rate": 3.963265306122449e-05, "loss": 0.1167, "step": 1116 }, { "epoch": 0.2234, "grad_norm": 1.2949645519256592, "learning_rate": 3.9622448979591835e-05, "loss": 0.1138, "step": 1117 }, { "epoch": 0.2236, "grad_norm": 1.8343255519866943, "learning_rate": 3.9612244897959184e-05, "loss": 0.1137, "step": 1118 }, { "epoch": 0.2238, "grad_norm": 2.346017360687256, "learning_rate": 3.960204081632653e-05, "loss": 0.2721, "step": 1119 }, { "epoch": 0.224, "grad_norm": 34.164283752441406, "learning_rate": 3.9591836734693876e-05, "loss": 3.905, "step": 1120 }, { "epoch": 0.2242, "grad_norm": 5.671075820922852, "learning_rate": 3.958163265306123e-05, "loss": 1.4206, "step": 1121 }, { "epoch": 0.2244, "grad_norm": 1.5829075574874878, "learning_rate": 3.9571428571428574e-05, "loss": 0.2156, "step": 1122 }, { "epoch": 0.2246, "grad_norm": 16.01406478881836, "learning_rate": 3.956122448979592e-05, "loss": 2.3045, "step": 1123 }, { "epoch": 0.2248, "grad_norm": 4.0922746658325195, "learning_rate": 3.955102040816327e-05, "loss": 0.9245, "step": 1124 }, { "epoch": 0.225, "grad_norm": 1.7736269235610962, "learning_rate": 3.9540816326530615e-05, "loss": 0.1719, "step": 1125 }, { "epoch": 0.2252, "grad_norm": 1.5497605800628662, "learning_rate": 3.9530612244897964e-05, "loss": 0.139, "step": 1126 }, { "epoch": 0.2254, "grad_norm": 1.831518292427063, "learning_rate": 3.952040816326531e-05, "loss": 0.1242, "step": 1127 }, { "epoch": 0.2256, "grad_norm": 1.4932825565338135, "learning_rate": 3.9510204081632655e-05, "loss": 0.1355, "step": 1128 }, { "epoch": 0.2258, "grad_norm": 1.522917628288269, "learning_rate": 3.9500000000000005e-05, "loss": 0.163, "step": 1129 }, { "epoch": 0.226, "grad_norm": 3.278972625732422, "learning_rate": 3.9489795918367354e-05, "loss": 0.3247, "step": 1130 }, { "epoch": 0.2262, "grad_norm": 1.7440088987350464, "learning_rate": 3.9479591836734696e-05, "loss": 0.0979, "step": 1131 }, { "epoch": 0.2264, "grad_norm": 4.800971031188965, "learning_rate": 3.9469387755102045e-05, "loss": 1.2838, "step": 1132 }, { "epoch": 0.2266, "grad_norm": 8.162755012512207, "learning_rate": 3.945918367346939e-05, "loss": 1.3834, "step": 1133 }, { "epoch": 0.2268, "grad_norm": 36.66396713256836, "learning_rate": 3.944897959183674e-05, "loss": 4.7365, "step": 1134 }, { "epoch": 0.227, "grad_norm": 2.0371525287628174, "learning_rate": 3.9438775510204086e-05, "loss": 0.1467, "step": 1135 }, { "epoch": 0.2272, "grad_norm": 1.2436259984970093, "learning_rate": 3.942857142857143e-05, "loss": 0.1262, "step": 1136 }, { "epoch": 0.2274, "grad_norm": 6.373444080352783, "learning_rate": 3.941836734693878e-05, "loss": 1.164, "step": 1137 }, { "epoch": 0.2276, "grad_norm": 6.302332878112793, "learning_rate": 3.940816326530613e-05, "loss": 1.3267, "step": 1138 }, { "epoch": 0.2278, "grad_norm": 1.712341070175171, "learning_rate": 3.939795918367347e-05, "loss": 0.2145, "step": 1139 }, { "epoch": 0.228, "grad_norm": 3.593083620071411, "learning_rate": 3.938775510204082e-05, "loss": 0.3416, "step": 1140 }, { "epoch": 0.2282, "grad_norm": 1.5694092512130737, "learning_rate": 3.937755102040817e-05, "loss": 0.0872, "step": 1141 }, { "epoch": 0.2284, "grad_norm": 10.168475151062012, "learning_rate": 3.936734693877551e-05, "loss": 2.4081, "step": 1142 }, { "epoch": 0.2286, "grad_norm": 1.5657432079315186, "learning_rate": 3.935714285714286e-05, "loss": 0.1414, "step": 1143 }, { "epoch": 0.2288, "grad_norm": 4.660126686096191, "learning_rate": 3.934693877551021e-05, "loss": 1.1826, "step": 1144 }, { "epoch": 0.229, "grad_norm": 11.854291915893555, "learning_rate": 3.933673469387755e-05, "loss": 3.3537, "step": 1145 }, { "epoch": 0.2292, "grad_norm": 1.510237216949463, "learning_rate": 3.93265306122449e-05, "loss": 0.0829, "step": 1146 }, { "epoch": 0.2294, "grad_norm": 4.058131217956543, "learning_rate": 3.931632653061225e-05, "loss": 0.3575, "step": 1147 }, { "epoch": 0.2296, "grad_norm": 2.597877264022827, "learning_rate": 3.930612244897959e-05, "loss": 0.8784, "step": 1148 }, { "epoch": 0.2298, "grad_norm": 1.6106457710266113, "learning_rate": 3.929591836734694e-05, "loss": 0.1452, "step": 1149 }, { "epoch": 0.23, "grad_norm": 1.7510958909988403, "learning_rate": 3.928571428571429e-05, "loss": 0.1684, "step": 1150 }, { "epoch": 0.2302, "grad_norm": 1.4825830459594727, "learning_rate": 3.927551020408163e-05, "loss": 0.1473, "step": 1151 }, { "epoch": 0.2304, "grad_norm": 4.590761184692383, "learning_rate": 3.926530612244898e-05, "loss": 1.0117, "step": 1152 }, { "epoch": 0.2306, "grad_norm": 9.946000099182129, "learning_rate": 3.9255102040816324e-05, "loss": 1.5746, "step": 1153 }, { "epoch": 0.2308, "grad_norm": 1.388819932937622, "learning_rate": 3.924489795918367e-05, "loss": 0.0816, "step": 1154 }, { "epoch": 0.231, "grad_norm": 18.854167938232422, "learning_rate": 3.923469387755102e-05, "loss": 1.3711, "step": 1155 }, { "epoch": 0.2312, "grad_norm": 1.3573522567749023, "learning_rate": 3.9224489795918365e-05, "loss": 0.1349, "step": 1156 }, { "epoch": 0.2314, "grad_norm": 8.461785316467285, "learning_rate": 3.9214285714285714e-05, "loss": 0.6804, "step": 1157 }, { "epoch": 0.2316, "grad_norm": 1.6372792720794678, "learning_rate": 3.920408163265306e-05, "loss": 0.1442, "step": 1158 }, { "epoch": 0.2318, "grad_norm": 6.600191116333008, "learning_rate": 3.9193877551020406e-05, "loss": 1.1584, "step": 1159 }, { "epoch": 0.232, "grad_norm": 7.225102424621582, "learning_rate": 3.9183673469387755e-05, "loss": 1.2314, "step": 1160 }, { "epoch": 0.2322, "grad_norm": 1.274695634841919, "learning_rate": 3.9173469387755104e-05, "loss": 0.1083, "step": 1161 }, { "epoch": 0.2324, "grad_norm": 1.7186917066574097, "learning_rate": 3.9163265306122446e-05, "loss": 0.0973, "step": 1162 }, { "epoch": 0.2326, "grad_norm": 1.218078851699829, "learning_rate": 3.9153061224489795e-05, "loss": 0.0565, "step": 1163 }, { "epoch": 0.2328, "grad_norm": 3.951833724975586, "learning_rate": 3.9142857142857145e-05, "loss": 0.9821, "step": 1164 }, { "epoch": 0.233, "grad_norm": 1.9798835515975952, "learning_rate": 3.913265306122449e-05, "loss": 0.2112, "step": 1165 }, { "epoch": 0.2332, "grad_norm": 1.6355785131454468, "learning_rate": 3.9122448979591836e-05, "loss": 0.1587, "step": 1166 }, { "epoch": 0.2334, "grad_norm": 5.939909934997559, "learning_rate": 3.9112244897959185e-05, "loss": 0.6297, "step": 1167 }, { "epoch": 0.2336, "grad_norm": 7.891049385070801, "learning_rate": 3.9102040816326535e-05, "loss": 1.3208, "step": 1168 }, { "epoch": 0.2338, "grad_norm": 5.886658668518066, "learning_rate": 3.9091836734693884e-05, "loss": 1.0827, "step": 1169 }, { "epoch": 0.234, "grad_norm": 1.9486569166183472, "learning_rate": 3.9081632653061226e-05, "loss": 0.1653, "step": 1170 }, { "epoch": 0.2342, "grad_norm": 1.8028507232666016, "learning_rate": 3.9071428571428575e-05, "loss": 0.1124, "step": 1171 }, { "epoch": 0.2344, "grad_norm": 1.5691555738449097, "learning_rate": 3.9061224489795925e-05, "loss": 0.1285, "step": 1172 }, { "epoch": 0.2346, "grad_norm": 3.233590841293335, "learning_rate": 3.905102040816327e-05, "loss": 0.3467, "step": 1173 }, { "epoch": 0.2348, "grad_norm": 13.754486083984375, "learning_rate": 3.9040816326530616e-05, "loss": 3.2339, "step": 1174 }, { "epoch": 0.235, "grad_norm": 2.6587777137756348, "learning_rate": 3.9030612244897965e-05, "loss": 0.82, "step": 1175 }, { "epoch": 0.2352, "grad_norm": 6.628676891326904, "learning_rate": 3.902040816326531e-05, "loss": 2.2773, "step": 1176 }, { "epoch": 0.2354, "grad_norm": 1.5130740404129028, "learning_rate": 3.901020408163266e-05, "loss": 0.062, "step": 1177 }, { "epoch": 0.2356, "grad_norm": 3.9275505542755127, "learning_rate": 3.9000000000000006e-05, "loss": 0.9092, "step": 1178 }, { "epoch": 0.2358, "grad_norm": 1.7313053607940674, "learning_rate": 3.898979591836735e-05, "loss": 0.1316, "step": 1179 }, { "epoch": 0.236, "grad_norm": 24.386125564575195, "learning_rate": 3.89795918367347e-05, "loss": 5.702, "step": 1180 }, { "epoch": 0.2362, "grad_norm": 4.170772075653076, "learning_rate": 3.896938775510205e-05, "loss": 0.775, "step": 1181 }, { "epoch": 0.2364, "grad_norm": 1.594424843788147, "learning_rate": 3.895918367346939e-05, "loss": 0.1186, "step": 1182 }, { "epoch": 0.2366, "grad_norm": 6.332061767578125, "learning_rate": 3.894897959183674e-05, "loss": 1.2124, "step": 1183 }, { "epoch": 0.2368, "grad_norm": 1.5418387651443481, "learning_rate": 3.893877551020408e-05, "loss": 0.0678, "step": 1184 }, { "epoch": 0.237, "grad_norm": 1.4276838302612305, "learning_rate": 3.892857142857143e-05, "loss": 0.138, "step": 1185 }, { "epoch": 0.2372, "grad_norm": 1.4515182971954346, "learning_rate": 3.891836734693878e-05, "loss": 0.1521, "step": 1186 }, { "epoch": 0.2374, "grad_norm": 3.8713340759277344, "learning_rate": 3.890816326530612e-05, "loss": 0.9481, "step": 1187 }, { "epoch": 0.2376, "grad_norm": 1.266196846961975, "learning_rate": 3.889795918367347e-05, "loss": 0.1171, "step": 1188 }, { "epoch": 0.2378, "grad_norm": 1.7428760528564453, "learning_rate": 3.888775510204082e-05, "loss": 0.1543, "step": 1189 }, { "epoch": 0.238, "grad_norm": 6.290396690368652, "learning_rate": 3.887755102040816e-05, "loss": 2.354, "step": 1190 }, { "epoch": 0.2382, "grad_norm": 1.7652369737625122, "learning_rate": 3.886734693877551e-05, "loss": 0.1633, "step": 1191 }, { "epoch": 0.2384, "grad_norm": 1.5250771045684814, "learning_rate": 3.885714285714286e-05, "loss": 0.0776, "step": 1192 }, { "epoch": 0.2386, "grad_norm": 1.7794101238250732, "learning_rate": 3.88469387755102e-05, "loss": 0.1795, "step": 1193 }, { "epoch": 0.2388, "grad_norm": 1.3927475214004517, "learning_rate": 3.883673469387755e-05, "loss": 0.1239, "step": 1194 }, { "epoch": 0.239, "grad_norm": 1.7098913192749023, "learning_rate": 3.88265306122449e-05, "loss": 0.1516, "step": 1195 }, { "epoch": 0.2392, "grad_norm": 1.2571477890014648, "learning_rate": 3.8816326530612244e-05, "loss": 0.1057, "step": 1196 }, { "epoch": 0.2394, "grad_norm": 1.652297019958496, "learning_rate": 3.880612244897959e-05, "loss": 0.1436, "step": 1197 }, { "epoch": 0.2396, "grad_norm": 13.419703483581543, "learning_rate": 3.879591836734694e-05, "loss": 3.5707, "step": 1198 }, { "epoch": 0.2398, "grad_norm": 9.41876220703125, "learning_rate": 3.8785714285714285e-05, "loss": 2.1655, "step": 1199 }, { "epoch": 0.24, "grad_norm": 4.557036876678467, "learning_rate": 3.8775510204081634e-05, "loss": 1.1251, "step": 1200 }, { "epoch": 0.2402, "grad_norm": 1.5032589435577393, "learning_rate": 3.876530612244898e-05, "loss": 0.1279, "step": 1201 }, { "epoch": 0.2404, "grad_norm": 1.44595468044281, "learning_rate": 3.8755102040816325e-05, "loss": 0.069, "step": 1202 }, { "epoch": 0.2406, "grad_norm": 4.187497138977051, "learning_rate": 3.8744897959183675e-05, "loss": 1.0626, "step": 1203 }, { "epoch": 0.2408, "grad_norm": 1.6708344221115112, "learning_rate": 3.8734693877551024e-05, "loss": 0.1072, "step": 1204 }, { "epoch": 0.241, "grad_norm": 6.50513219833374, "learning_rate": 3.8724489795918366e-05, "loss": 1.1696, "step": 1205 }, { "epoch": 0.2412, "grad_norm": 1.7861454486846924, "learning_rate": 3.8714285714285715e-05, "loss": 0.1797, "step": 1206 }, { "epoch": 0.2414, "grad_norm": 1.772229552268982, "learning_rate": 3.870408163265306e-05, "loss": 0.0919, "step": 1207 }, { "epoch": 0.2416, "grad_norm": 1.9311400651931763, "learning_rate": 3.869387755102041e-05, "loss": 0.1622, "step": 1208 }, { "epoch": 0.2418, "grad_norm": 1.8080766201019287, "learning_rate": 3.8683673469387756e-05, "loss": 0.1087, "step": 1209 }, { "epoch": 0.242, "grad_norm": 1.8979612588882446, "learning_rate": 3.86734693877551e-05, "loss": 0.0955, "step": 1210 }, { "epoch": 0.2422, "grad_norm": 1.825605869293213, "learning_rate": 3.866326530612245e-05, "loss": 0.1084, "step": 1211 }, { "epoch": 0.2424, "grad_norm": 1.3832318782806396, "learning_rate": 3.8653061224489804e-05, "loss": 0.0901, "step": 1212 }, { "epoch": 0.2426, "grad_norm": 16.540746688842773, "learning_rate": 3.8642857142857146e-05, "loss": 3.5837, "step": 1213 }, { "epoch": 0.2428, "grad_norm": 1.9575875997543335, "learning_rate": 3.8632653061224495e-05, "loss": 0.1763, "step": 1214 }, { "epoch": 0.243, "grad_norm": 1.6776007413864136, "learning_rate": 3.862244897959184e-05, "loss": 0.1061, "step": 1215 }, { "epoch": 0.2432, "grad_norm": 1.3324201107025146, "learning_rate": 3.861224489795919e-05, "loss": 0.1453, "step": 1216 }, { "epoch": 0.2434, "grad_norm": 1.635075330734253, "learning_rate": 3.8602040816326536e-05, "loss": 0.1365, "step": 1217 }, { "epoch": 0.2436, "grad_norm": 1.4818600416183472, "learning_rate": 3.859183673469388e-05, "loss": 0.1351, "step": 1218 }, { "epoch": 0.2438, "grad_norm": 48.662357330322266, "learning_rate": 3.858163265306123e-05, "loss": 5.8292, "step": 1219 }, { "epoch": 0.244, "grad_norm": 3.794050455093384, "learning_rate": 3.857142857142858e-05, "loss": 0.9183, "step": 1220 }, { "epoch": 0.2442, "grad_norm": 1.3571511507034302, "learning_rate": 3.856122448979592e-05, "loss": 0.1196, "step": 1221 }, { "epoch": 0.2444, "grad_norm": 1.4742392301559448, "learning_rate": 3.855102040816327e-05, "loss": 0.1399, "step": 1222 }, { "epoch": 0.2446, "grad_norm": 18.71346664428711, "learning_rate": 3.854081632653062e-05, "loss": 5.4186, "step": 1223 }, { "epoch": 0.2448, "grad_norm": 1.8707962036132812, "learning_rate": 3.853061224489796e-05, "loss": 0.1217, "step": 1224 }, { "epoch": 0.245, "grad_norm": 1.4898655414581299, "learning_rate": 3.852040816326531e-05, "loss": 0.141, "step": 1225 }, { "epoch": 0.2452, "grad_norm": 3.1395351886749268, "learning_rate": 3.851020408163266e-05, "loss": 0.8529, "step": 1226 }, { "epoch": 0.2454, "grad_norm": 1.0336296558380127, "learning_rate": 3.85e-05, "loss": 0.0862, "step": 1227 }, { "epoch": 0.2456, "grad_norm": 1.612135648727417, "learning_rate": 3.848979591836735e-05, "loss": 0.1439, "step": 1228 }, { "epoch": 0.2458, "grad_norm": 1.3654370307922363, "learning_rate": 3.84795918367347e-05, "loss": 0.1077, "step": 1229 }, { "epoch": 0.246, "grad_norm": 3.510406255722046, "learning_rate": 3.846938775510204e-05, "loss": 0.8932, "step": 1230 }, { "epoch": 0.2462, "grad_norm": 1.8323819637298584, "learning_rate": 3.845918367346939e-05, "loss": 0.1688, "step": 1231 }, { "epoch": 0.2464, "grad_norm": 3.9554245471954346, "learning_rate": 3.844897959183674e-05, "loss": 1.0654, "step": 1232 }, { "epoch": 0.2466, "grad_norm": 3.3375043869018555, "learning_rate": 3.843877551020408e-05, "loss": 0.865, "step": 1233 }, { "epoch": 0.2468, "grad_norm": 63.8090705871582, "learning_rate": 3.842857142857143e-05, "loss": 6.3251, "step": 1234 }, { "epoch": 0.247, "grad_norm": 1.4541479349136353, "learning_rate": 3.841836734693878e-05, "loss": 0.0772, "step": 1235 }, { "epoch": 0.2472, "grad_norm": 1.5204209089279175, "learning_rate": 3.840816326530612e-05, "loss": 0.1428, "step": 1236 }, { "epoch": 0.2474, "grad_norm": 6.136209011077881, "learning_rate": 3.839795918367347e-05, "loss": 1.1625, "step": 1237 }, { "epoch": 0.2476, "grad_norm": 1.3140615224838257, "learning_rate": 3.8387755102040815e-05, "loss": 0.1221, "step": 1238 }, { "epoch": 0.2478, "grad_norm": 1.2308504581451416, "learning_rate": 3.8377551020408164e-05, "loss": 0.1003, "step": 1239 }, { "epoch": 0.248, "grad_norm": 6.357258319854736, "learning_rate": 3.836734693877551e-05, "loss": 2.259, "step": 1240 }, { "epoch": 0.2482, "grad_norm": 1.3247007131576538, "learning_rate": 3.8357142857142855e-05, "loss": 0.0658, "step": 1241 }, { "epoch": 0.2484, "grad_norm": 1.5625752210617065, "learning_rate": 3.8346938775510205e-05, "loss": 0.1311, "step": 1242 }, { "epoch": 0.2486, "grad_norm": 1.7948263883590698, "learning_rate": 3.8336734693877554e-05, "loss": 0.1043, "step": 1243 }, { "epoch": 0.2488, "grad_norm": 4.615420818328857, "learning_rate": 3.8326530612244896e-05, "loss": 1.2485, "step": 1244 }, { "epoch": 0.249, "grad_norm": 3.948204517364502, "learning_rate": 3.8316326530612245e-05, "loss": 0.971, "step": 1245 }, { "epoch": 0.2492, "grad_norm": 10.756133079528809, "learning_rate": 3.8306122448979594e-05, "loss": 2.3515, "step": 1246 }, { "epoch": 0.2494, "grad_norm": 19.966793060302734, "learning_rate": 3.829591836734694e-05, "loss": 3.2037, "step": 1247 }, { "epoch": 0.2496, "grad_norm": 1.7232310771942139, "learning_rate": 3.8285714285714286e-05, "loss": 0.086, "step": 1248 }, { "epoch": 0.2498, "grad_norm": 15.161467552185059, "learning_rate": 3.8275510204081635e-05, "loss": 3.2497, "step": 1249 }, { "epoch": 0.25, "grad_norm": 5.99517297744751, "learning_rate": 3.826530612244898e-05, "loss": 2.1919, "step": 1250 }, { "epoch": 0.2502, "grad_norm": 1.4167498350143433, "learning_rate": 3.825510204081633e-05, "loss": 0.1137, "step": 1251 }, { "epoch": 0.2504, "grad_norm": 2.161652088165283, "learning_rate": 3.8244897959183676e-05, "loss": 0.215, "step": 1252 }, { "epoch": 0.2506, "grad_norm": 1.7977770566940308, "learning_rate": 3.823469387755102e-05, "loss": 0.1669, "step": 1253 }, { "epoch": 0.2508, "grad_norm": 2.0701310634613037, "learning_rate": 3.822448979591837e-05, "loss": 0.2376, "step": 1254 }, { "epoch": 0.251, "grad_norm": 4.391334533691406, "learning_rate": 3.821428571428572e-05, "loss": 1.3211, "step": 1255 }, { "epoch": 0.2512, "grad_norm": 1.8493633270263672, "learning_rate": 3.820408163265306e-05, "loss": 0.1635, "step": 1256 }, { "epoch": 0.2514, "grad_norm": 3.1207923889160156, "learning_rate": 3.8193877551020415e-05, "loss": 0.3237, "step": 1257 }, { "epoch": 0.2516, "grad_norm": 4.7217326164245605, "learning_rate": 3.818367346938776e-05, "loss": 0.7855, "step": 1258 }, { "epoch": 0.2518, "grad_norm": 4.3752827644348145, "learning_rate": 3.817346938775511e-05, "loss": 1.0889, "step": 1259 }, { "epoch": 0.252, "grad_norm": 1.588794231414795, "learning_rate": 3.8163265306122456e-05, "loss": 0.1937, "step": 1260 }, { "epoch": 0.2522, "grad_norm": 1.5312442779541016, "learning_rate": 3.81530612244898e-05, "loss": 0.1339, "step": 1261 }, { "epoch": 0.2524, "grad_norm": 54.07605743408203, "learning_rate": 3.814285714285715e-05, "loss": 4.1722, "step": 1262 }, { "epoch": 0.2526, "grad_norm": 1.6507775783538818, "learning_rate": 3.81326530612245e-05, "loss": 0.178, "step": 1263 }, { "epoch": 0.2528, "grad_norm": 3.3244357109069824, "learning_rate": 3.812244897959184e-05, "loss": 0.8281, "step": 1264 }, { "epoch": 0.253, "grad_norm": 2.177236795425415, "learning_rate": 3.811224489795919e-05, "loss": 0.124, "step": 1265 }, { "epoch": 0.2532, "grad_norm": 12.845377922058105, "learning_rate": 3.810204081632654e-05, "loss": 2.2881, "step": 1266 }, { "epoch": 0.2534, "grad_norm": 38.4703369140625, "learning_rate": 3.809183673469388e-05, "loss": 3.1562, "step": 1267 }, { "epoch": 0.2536, "grad_norm": 3.0963709354400635, "learning_rate": 3.808163265306123e-05, "loss": 0.8849, "step": 1268 }, { "epoch": 0.2538, "grad_norm": 1.7664254903793335, "learning_rate": 3.807142857142857e-05, "loss": 0.152, "step": 1269 }, { "epoch": 0.254, "grad_norm": 4.133462429046631, "learning_rate": 3.806122448979592e-05, "loss": 1.0243, "step": 1270 }, { "epoch": 0.2542, "grad_norm": 1.334006428718567, "learning_rate": 3.805102040816327e-05, "loss": 0.1479, "step": 1271 }, { "epoch": 0.2544, "grad_norm": 1.698577642440796, "learning_rate": 3.804081632653061e-05, "loss": 0.1708, "step": 1272 }, { "epoch": 0.2546, "grad_norm": 2.968167781829834, "learning_rate": 3.803061224489796e-05, "loss": 0.1021, "step": 1273 }, { "epoch": 0.2548, "grad_norm": 1.6637413501739502, "learning_rate": 3.802040816326531e-05, "loss": 0.1823, "step": 1274 }, { "epoch": 0.255, "grad_norm": 1.5605759620666504, "learning_rate": 3.801020408163265e-05, "loss": 0.0836, "step": 1275 }, { "epoch": 0.2552, "grad_norm": 1.4904472827911377, "learning_rate": 3.8e-05, "loss": 0.0866, "step": 1276 }, { "epoch": 0.2554, "grad_norm": 6.826018810272217, "learning_rate": 3.798979591836735e-05, "loss": 1.1779, "step": 1277 }, { "epoch": 0.2556, "grad_norm": 3.469442844390869, "learning_rate": 3.7979591836734694e-05, "loss": 0.866, "step": 1278 }, { "epoch": 0.2558, "grad_norm": 15.142460823059082, "learning_rate": 3.796938775510204e-05, "loss": 1.5539, "step": 1279 }, { "epoch": 0.256, "grad_norm": 1.6873462200164795, "learning_rate": 3.795918367346939e-05, "loss": 0.1636, "step": 1280 }, { "epoch": 0.2562, "grad_norm": 1.8198899030685425, "learning_rate": 3.7948979591836734e-05, "loss": 0.1531, "step": 1281 }, { "epoch": 0.2564, "grad_norm": 35.50392532348633, "learning_rate": 3.7938775510204084e-05, "loss": 5.2858, "step": 1282 }, { "epoch": 0.2566, "grad_norm": 4.1826491355896, "learning_rate": 3.792857142857143e-05, "loss": 0.7081, "step": 1283 }, { "epoch": 0.2568, "grad_norm": 1.536327838897705, "learning_rate": 3.7918367346938775e-05, "loss": 0.0921, "step": 1284 }, { "epoch": 0.257, "grad_norm": 7.594724655151367, "learning_rate": 3.7908163265306124e-05, "loss": 1.3357, "step": 1285 }, { "epoch": 0.2572, "grad_norm": 1.5905827283859253, "learning_rate": 3.7897959183673474e-05, "loss": 0.1269, "step": 1286 }, { "epoch": 0.2574, "grad_norm": 3.3080477714538574, "learning_rate": 3.7887755102040816e-05, "loss": 0.8568, "step": 1287 }, { "epoch": 0.2576, "grad_norm": 1.4824048280715942, "learning_rate": 3.7877551020408165e-05, "loss": 0.0757, "step": 1288 }, { "epoch": 0.2578, "grad_norm": 2.0729236602783203, "learning_rate": 3.786734693877551e-05, "loss": 0.1869, "step": 1289 }, { "epoch": 0.258, "grad_norm": 1.4475592374801636, "learning_rate": 3.785714285714286e-05, "loss": 0.1157, "step": 1290 }, { "epoch": 0.2582, "grad_norm": 2.0196914672851562, "learning_rate": 3.7846938775510206e-05, "loss": 0.0811, "step": 1291 }, { "epoch": 0.2584, "grad_norm": 3.5122108459472656, "learning_rate": 3.783673469387755e-05, "loss": 0.8649, "step": 1292 }, { "epoch": 0.2586, "grad_norm": 31.961008071899414, "learning_rate": 3.78265306122449e-05, "loss": 3.9754, "step": 1293 }, { "epoch": 0.2588, "grad_norm": 3.7600467205047607, "learning_rate": 3.781632653061225e-05, "loss": 0.287, "step": 1294 }, { "epoch": 0.259, "grad_norm": 1.7909352779388428, "learning_rate": 3.780612244897959e-05, "loss": 0.1029, "step": 1295 }, { "epoch": 0.2592, "grad_norm": 1.7947596311569214, "learning_rate": 3.779591836734694e-05, "loss": 0.1077, "step": 1296 }, { "epoch": 0.2594, "grad_norm": 1.495173692703247, "learning_rate": 3.778571428571429e-05, "loss": 0.1247, "step": 1297 }, { "epoch": 0.2596, "grad_norm": 1.5115487575531006, "learning_rate": 3.777551020408163e-05, "loss": 0.0777, "step": 1298 }, { "epoch": 0.2598, "grad_norm": 8.24157428741455, "learning_rate": 3.776530612244898e-05, "loss": 1.4188, "step": 1299 }, { "epoch": 0.26, "grad_norm": 1.461794376373291, "learning_rate": 3.775510204081633e-05, "loss": 0.1331, "step": 1300 }, { "epoch": 0.2602, "grad_norm": 1.6198186874389648, "learning_rate": 3.774489795918367e-05, "loss": 0.0813, "step": 1301 }, { "epoch": 0.2604, "grad_norm": 1.730018138885498, "learning_rate": 3.773469387755102e-05, "loss": 0.0997, "step": 1302 }, { "epoch": 0.2606, "grad_norm": 1.579390287399292, "learning_rate": 3.772448979591837e-05, "loss": 0.085, "step": 1303 }, { "epoch": 0.2608, "grad_norm": 1.8548482656478882, "learning_rate": 3.771428571428572e-05, "loss": 0.1627, "step": 1304 }, { "epoch": 0.261, "grad_norm": 3.0801405906677246, "learning_rate": 3.770408163265307e-05, "loss": 0.3232, "step": 1305 }, { "epoch": 0.2612, "grad_norm": 2.0121631622314453, "learning_rate": 3.769387755102041e-05, "loss": 0.1992, "step": 1306 }, { "epoch": 0.2614, "grad_norm": 7.033249855041504, "learning_rate": 3.768367346938776e-05, "loss": 2.3145, "step": 1307 }, { "epoch": 0.2616, "grad_norm": 6.946961402893066, "learning_rate": 3.767346938775511e-05, "loss": 1.1799, "step": 1308 }, { "epoch": 0.2618, "grad_norm": 1.6728273630142212, "learning_rate": 3.766326530612245e-05, "loss": 0.1331, "step": 1309 }, { "epoch": 0.262, "grad_norm": 3.6165990829467773, "learning_rate": 3.76530612244898e-05, "loss": 0.8752, "step": 1310 }, { "epoch": 0.2622, "grad_norm": 2.263660430908203, "learning_rate": 3.764285714285715e-05, "loss": 0.2218, "step": 1311 }, { "epoch": 0.2624, "grad_norm": 23.657312393188477, "learning_rate": 3.763265306122449e-05, "loss": 5.4844, "step": 1312 }, { "epoch": 0.2626, "grad_norm": 6.865463733673096, "learning_rate": 3.762244897959184e-05, "loss": 1.2535, "step": 1313 }, { "epoch": 0.2628, "grad_norm": 18.283000946044922, "learning_rate": 3.761224489795919e-05, "loss": 4.1451, "step": 1314 }, { "epoch": 0.263, "grad_norm": 18.092365264892578, "learning_rate": 3.760204081632653e-05, "loss": 4.2177, "step": 1315 }, { "epoch": 0.2632, "grad_norm": 14.155285835266113, "learning_rate": 3.759183673469388e-05, "loss": 5.4509, "step": 1316 }, { "epoch": 0.2634, "grad_norm": 1.3538949489593506, "learning_rate": 3.758163265306123e-05, "loss": 0.1153, "step": 1317 }, { "epoch": 0.2636, "grad_norm": 4.133082389831543, "learning_rate": 3.757142857142857e-05, "loss": 0.738, "step": 1318 }, { "epoch": 0.2638, "grad_norm": 1.8121343851089478, "learning_rate": 3.756122448979592e-05, "loss": 0.1799, "step": 1319 }, { "epoch": 0.264, "grad_norm": 1.3098820447921753, "learning_rate": 3.7551020408163264e-05, "loss": 0.0949, "step": 1320 }, { "epoch": 0.2642, "grad_norm": 7.844435691833496, "learning_rate": 3.7540816326530614e-05, "loss": 1.4964, "step": 1321 }, { "epoch": 0.2644, "grad_norm": 5.921436786651611, "learning_rate": 3.753061224489796e-05, "loss": 1.2696, "step": 1322 }, { "epoch": 0.2646, "grad_norm": 1.8828604221343994, "learning_rate": 3.7520408163265305e-05, "loss": 0.1063, "step": 1323 }, { "epoch": 0.2648, "grad_norm": 4.386382102966309, "learning_rate": 3.7510204081632654e-05, "loss": 0.7707, "step": 1324 }, { "epoch": 0.265, "grad_norm": 6.917270660400391, "learning_rate": 3.7500000000000003e-05, "loss": 1.2423, "step": 1325 }, { "epoch": 0.2652, "grad_norm": 1.765566110610962, "learning_rate": 3.7489795918367346e-05, "loss": 0.1591, "step": 1326 }, { "epoch": 0.2654, "grad_norm": 1.6731854677200317, "learning_rate": 3.7479591836734695e-05, "loss": 0.0748, "step": 1327 }, { "epoch": 0.2656, "grad_norm": 4.075490951538086, "learning_rate": 3.7469387755102044e-05, "loss": 0.9095, "step": 1328 }, { "epoch": 0.2658, "grad_norm": 11.133137702941895, "learning_rate": 3.745918367346939e-05, "loss": 1.4753, "step": 1329 }, { "epoch": 0.266, "grad_norm": 1.6695935726165771, "learning_rate": 3.7448979591836736e-05, "loss": 0.1598, "step": 1330 }, { "epoch": 0.2662, "grad_norm": 1.5798397064208984, "learning_rate": 3.7438775510204085e-05, "loss": 0.1487, "step": 1331 }, { "epoch": 0.2664, "grad_norm": 1.7983241081237793, "learning_rate": 3.742857142857143e-05, "loss": 0.1521, "step": 1332 }, { "epoch": 0.2666, "grad_norm": 10.795758247375488, "learning_rate": 3.7418367346938777e-05, "loss": 1.1869, "step": 1333 }, { "epoch": 0.2668, "grad_norm": 3.6086666584014893, "learning_rate": 3.7408163265306126e-05, "loss": 0.8384, "step": 1334 }, { "epoch": 0.267, "grad_norm": 5.579383373260498, "learning_rate": 3.739795918367347e-05, "loss": 1.0828, "step": 1335 }, { "epoch": 0.2672, "grad_norm": 1.3698099851608276, "learning_rate": 3.738775510204082e-05, "loss": 0.1315, "step": 1336 }, { "epoch": 0.2674, "grad_norm": 1.4228986501693726, "learning_rate": 3.7377551020408167e-05, "loss": 0.0627, "step": 1337 }, { "epoch": 0.2676, "grad_norm": 5.64273738861084, "learning_rate": 3.736734693877551e-05, "loss": 1.0982, "step": 1338 }, { "epoch": 0.2678, "grad_norm": 1.7403396368026733, "learning_rate": 3.735714285714286e-05, "loss": 0.0856, "step": 1339 }, { "epoch": 0.268, "grad_norm": 6.413857460021973, "learning_rate": 3.734693877551021e-05, "loss": 1.4826, "step": 1340 }, { "epoch": 0.2682, "grad_norm": 1.5933765172958374, "learning_rate": 3.733673469387755e-05, "loss": 0.0893, "step": 1341 }, { "epoch": 0.2684, "grad_norm": 1.4069416522979736, "learning_rate": 3.73265306122449e-05, "loss": 0.0718, "step": 1342 }, { "epoch": 0.2686, "grad_norm": 5.735523223876953, "learning_rate": 3.731632653061224e-05, "loss": 1.4975, "step": 1343 }, { "epoch": 0.2688, "grad_norm": 1.6116992235183716, "learning_rate": 3.730612244897959e-05, "loss": 0.1336, "step": 1344 }, { "epoch": 0.269, "grad_norm": 1.9418095350265503, "learning_rate": 3.729591836734694e-05, "loss": 0.0917, "step": 1345 }, { "epoch": 0.2692, "grad_norm": 1.7028329372406006, "learning_rate": 3.728571428571428e-05, "loss": 0.097, "step": 1346 }, { "epoch": 0.2694, "grad_norm": 1.7631217241287231, "learning_rate": 3.727551020408163e-05, "loss": 0.0936, "step": 1347 }, { "epoch": 0.2696, "grad_norm": 22.473411560058594, "learning_rate": 3.726530612244899e-05, "loss": 3.687, "step": 1348 }, { "epoch": 0.2698, "grad_norm": 1.4212217330932617, "learning_rate": 3.725510204081633e-05, "loss": 0.1405, "step": 1349 }, { "epoch": 0.27, "grad_norm": 3.952842950820923, "learning_rate": 3.724489795918368e-05, "loss": 0.6278, "step": 1350 }, { "epoch": 0.2702, "grad_norm": 1.5269299745559692, "learning_rate": 3.723469387755102e-05, "loss": 0.1122, "step": 1351 }, { "epoch": 0.2704, "grad_norm": 1.2545517683029175, "learning_rate": 3.722448979591837e-05, "loss": 0.101, "step": 1352 }, { "epoch": 0.2706, "grad_norm": 1.3637840747833252, "learning_rate": 3.721428571428572e-05, "loss": 0.1407, "step": 1353 }, { "epoch": 0.2708, "grad_norm": 1.5960081815719604, "learning_rate": 3.720408163265306e-05, "loss": 0.0814, "step": 1354 }, { "epoch": 0.271, "grad_norm": 1.6952459812164307, "learning_rate": 3.719387755102041e-05, "loss": 0.1448, "step": 1355 }, { "epoch": 0.2712, "grad_norm": 6.112464904785156, "learning_rate": 3.718367346938776e-05, "loss": 1.0341, "step": 1356 }, { "epoch": 0.2714, "grad_norm": 3.779942512512207, "learning_rate": 3.71734693877551e-05, "loss": 0.8778, "step": 1357 }, { "epoch": 0.2716, "grad_norm": 1.5804752111434937, "learning_rate": 3.716326530612245e-05, "loss": 0.0789, "step": 1358 }, { "epoch": 0.2718, "grad_norm": 1.850809931755066, "learning_rate": 3.71530612244898e-05, "loss": 0.2246, "step": 1359 }, { "epoch": 0.272, "grad_norm": 3.5643880367279053, "learning_rate": 3.7142857142857143e-05, "loss": 0.8722, "step": 1360 }, { "epoch": 0.2722, "grad_norm": 1.2772294282913208, "learning_rate": 3.713265306122449e-05, "loss": 0.1378, "step": 1361 }, { "epoch": 0.2724, "grad_norm": 1.5688132047653198, "learning_rate": 3.712244897959184e-05, "loss": 0.1225, "step": 1362 }, { "epoch": 0.2726, "grad_norm": 4.985512733459473, "learning_rate": 3.7112244897959184e-05, "loss": 1.0646, "step": 1363 }, { "epoch": 0.2728, "grad_norm": 1.7303472757339478, "learning_rate": 3.7102040816326533e-05, "loss": 0.0765, "step": 1364 }, { "epoch": 0.273, "grad_norm": 1.4362291097640991, "learning_rate": 3.709183673469388e-05, "loss": 0.0705, "step": 1365 }, { "epoch": 0.2732, "grad_norm": 4.591298580169678, "learning_rate": 3.7081632653061225e-05, "loss": 0.7663, "step": 1366 }, { "epoch": 0.2734, "grad_norm": 1.4391281604766846, "learning_rate": 3.7071428571428574e-05, "loss": 0.1558, "step": 1367 }, { "epoch": 0.2736, "grad_norm": 22.59450912475586, "learning_rate": 3.706122448979592e-05, "loss": 4.1362, "step": 1368 }, { "epoch": 0.2738, "grad_norm": 1.5219882726669312, "learning_rate": 3.7051020408163266e-05, "loss": 0.1387, "step": 1369 }, { "epoch": 0.274, "grad_norm": 1.5087683200836182, "learning_rate": 3.7040816326530615e-05, "loss": 0.1129, "step": 1370 }, { "epoch": 0.2742, "grad_norm": 1.8149199485778809, "learning_rate": 3.7030612244897964e-05, "loss": 0.1189, "step": 1371 }, { "epoch": 0.2744, "grad_norm": 9.546114921569824, "learning_rate": 3.7020408163265307e-05, "loss": 2.0722, "step": 1372 }, { "epoch": 0.2746, "grad_norm": 1.3203532695770264, "learning_rate": 3.7010204081632656e-05, "loss": 0.1022, "step": 1373 }, { "epoch": 0.2748, "grad_norm": 8.20767593383789, "learning_rate": 3.7e-05, "loss": 2.306, "step": 1374 }, { "epoch": 0.275, "grad_norm": 4.272182464599609, "learning_rate": 3.698979591836735e-05, "loss": 0.6594, "step": 1375 }, { "epoch": 0.2752, "grad_norm": 12.107483863830566, "learning_rate": 3.6979591836734696e-05, "loss": 4.1481, "step": 1376 }, { "epoch": 0.2754, "grad_norm": 6.796584606170654, "learning_rate": 3.696938775510204e-05, "loss": 2.4405, "step": 1377 }, { "epoch": 0.2756, "grad_norm": 10.60947036743164, "learning_rate": 3.695918367346939e-05, "loss": 4.4079, "step": 1378 }, { "epoch": 0.2758, "grad_norm": 74.99689483642578, "learning_rate": 3.694897959183674e-05, "loss": 5.6219, "step": 1379 }, { "epoch": 0.276, "grad_norm": 14.767340660095215, "learning_rate": 3.693877551020408e-05, "loss": 2.0869, "step": 1380 }, { "epoch": 0.2762, "grad_norm": 19.619022369384766, "learning_rate": 3.692857142857143e-05, "loss": 1.5368, "step": 1381 }, { "epoch": 0.2764, "grad_norm": 4.810040473937988, "learning_rate": 3.691836734693878e-05, "loss": 1.0624, "step": 1382 }, { "epoch": 0.2766, "grad_norm": 1.496702790260315, "learning_rate": 3.690816326530612e-05, "loss": 0.0921, "step": 1383 }, { "epoch": 0.2768, "grad_norm": 61.29378890991211, "learning_rate": 3.689795918367347e-05, "loss": 5.6475, "step": 1384 }, { "epoch": 0.277, "grad_norm": 1.4522367715835571, "learning_rate": 3.688775510204082e-05, "loss": 0.1096, "step": 1385 }, { "epoch": 0.2772, "grad_norm": 1.9087193012237549, "learning_rate": 3.687755102040816e-05, "loss": 0.0828, "step": 1386 }, { "epoch": 0.2774, "grad_norm": 35.92570495605469, "learning_rate": 3.686734693877551e-05, "loss": 5.3491, "step": 1387 }, { "epoch": 0.2776, "grad_norm": 2.087874412536621, "learning_rate": 3.685714285714286e-05, "loss": 0.2269, "step": 1388 }, { "epoch": 0.2778, "grad_norm": 21.29840850830078, "learning_rate": 3.68469387755102e-05, "loss": 4.7003, "step": 1389 }, { "epoch": 0.278, "grad_norm": 1.5787270069122314, "learning_rate": 3.683673469387755e-05, "loss": 0.1242, "step": 1390 }, { "epoch": 0.2782, "grad_norm": 2.447349786758423, "learning_rate": 3.68265306122449e-05, "loss": 0.0794, "step": 1391 }, { "epoch": 0.2784, "grad_norm": 1.8658487796783447, "learning_rate": 3.681632653061224e-05, "loss": 0.0941, "step": 1392 }, { "epoch": 0.2786, "grad_norm": 4.186209201812744, "learning_rate": 3.680612244897959e-05, "loss": 0.7502, "step": 1393 }, { "epoch": 0.2788, "grad_norm": 3.0826706886291504, "learning_rate": 3.679591836734694e-05, "loss": 0.0862, "step": 1394 }, { "epoch": 0.279, "grad_norm": 17.367568969726562, "learning_rate": 3.678571428571429e-05, "loss": 1.3791, "step": 1395 }, { "epoch": 0.2792, "grad_norm": 1.1378870010375977, "learning_rate": 3.677551020408164e-05, "loss": 0.0919, "step": 1396 }, { "epoch": 0.2794, "grad_norm": 1.4609671831130981, "learning_rate": 3.676530612244898e-05, "loss": 0.0793, "step": 1397 }, { "epoch": 0.2796, "grad_norm": 6.895241737365723, "learning_rate": 3.675510204081633e-05, "loss": 2.2601, "step": 1398 }, { "epoch": 0.2798, "grad_norm": 1.5115087032318115, "learning_rate": 3.674489795918368e-05, "loss": 0.0784, "step": 1399 }, { "epoch": 0.28, "grad_norm": 3.2029130458831787, "learning_rate": 3.673469387755102e-05, "loss": 0.3369, "step": 1400 }, { "epoch": 0.2802, "grad_norm": 1.7710728645324707, "learning_rate": 3.672448979591837e-05, "loss": 0.1016, "step": 1401 }, { "epoch": 0.2804, "grad_norm": 1.6437673568725586, "learning_rate": 3.671428571428572e-05, "loss": 0.0811, "step": 1402 }, { "epoch": 0.2806, "grad_norm": 1.339395523071289, "learning_rate": 3.670408163265306e-05, "loss": 0.1016, "step": 1403 }, { "epoch": 0.2808, "grad_norm": 12.196369171142578, "learning_rate": 3.669387755102041e-05, "loss": 4.7879, "step": 1404 }, { "epoch": 0.281, "grad_norm": 1.3435919284820557, "learning_rate": 3.6683673469387755e-05, "loss": 0.1578, "step": 1405 }, { "epoch": 0.2812, "grad_norm": 1.9169238805770874, "learning_rate": 3.6673469387755104e-05, "loss": 0.099, "step": 1406 }, { "epoch": 0.2814, "grad_norm": 6.315728187561035, "learning_rate": 3.666326530612245e-05, "loss": 1.0508, "step": 1407 }, { "epoch": 0.2816, "grad_norm": 23.689496994018555, "learning_rate": 3.6653061224489796e-05, "loss": 5.2598, "step": 1408 }, { "epoch": 0.2818, "grad_norm": 17.41273307800293, "learning_rate": 3.6642857142857145e-05, "loss": 1.1995, "step": 1409 }, { "epoch": 0.282, "grad_norm": 5.75798225402832, "learning_rate": 3.6632653061224494e-05, "loss": 1.074, "step": 1410 }, { "epoch": 0.2822, "grad_norm": 1.9065535068511963, "learning_rate": 3.6622448979591836e-05, "loss": 0.2362, "step": 1411 }, { "epoch": 0.2824, "grad_norm": 1.2875878810882568, "learning_rate": 3.6612244897959186e-05, "loss": 0.1163, "step": 1412 }, { "epoch": 0.2826, "grad_norm": 3.6460695266723633, "learning_rate": 3.6602040816326535e-05, "loss": 0.9125, "step": 1413 }, { "epoch": 0.2828, "grad_norm": 1.1291654109954834, "learning_rate": 3.659183673469388e-05, "loss": 0.0723, "step": 1414 }, { "epoch": 0.283, "grad_norm": 8.621419906616211, "learning_rate": 3.6581632653061226e-05, "loss": 1.9511, "step": 1415 }, { "epoch": 0.2832, "grad_norm": 4.533262252807617, "learning_rate": 3.6571428571428576e-05, "loss": 1.0461, "step": 1416 }, { "epoch": 0.2834, "grad_norm": 1.8045939207077026, "learning_rate": 3.656122448979592e-05, "loss": 0.1642, "step": 1417 }, { "epoch": 0.2836, "grad_norm": 5.960536956787109, "learning_rate": 3.655102040816327e-05, "loss": 2.2314, "step": 1418 }, { "epoch": 0.2838, "grad_norm": 1.9197490215301514, "learning_rate": 3.6540816326530616e-05, "loss": 0.1698, "step": 1419 }, { "epoch": 0.284, "grad_norm": 1.6662993431091309, "learning_rate": 3.653061224489796e-05, "loss": 0.075, "step": 1420 }, { "epoch": 0.2842, "grad_norm": 1.8599644899368286, "learning_rate": 3.652040816326531e-05, "loss": 0.2059, "step": 1421 }, { "epoch": 0.2844, "grad_norm": 1.4788411855697632, "learning_rate": 3.651020408163266e-05, "loss": 0.0738, "step": 1422 }, { "epoch": 0.2846, "grad_norm": 4.117801666259766, "learning_rate": 3.65e-05, "loss": 0.7203, "step": 1423 }, { "epoch": 0.2848, "grad_norm": 6.469876766204834, "learning_rate": 3.648979591836735e-05, "loss": 1.1329, "step": 1424 }, { "epoch": 0.285, "grad_norm": 5.63407039642334, "learning_rate": 3.64795918367347e-05, "loss": 1.4169, "step": 1425 }, { "epoch": 0.2852, "grad_norm": 1.5634034872055054, "learning_rate": 3.646938775510204e-05, "loss": 0.0892, "step": 1426 }, { "epoch": 0.2854, "grad_norm": 13.888527870178223, "learning_rate": 3.645918367346939e-05, "loss": 2.2912, "step": 1427 }, { "epoch": 0.2856, "grad_norm": 4.702810764312744, "learning_rate": 3.644897959183673e-05, "loss": 1.0284, "step": 1428 }, { "epoch": 0.2858, "grad_norm": 18.649410247802734, "learning_rate": 3.643877551020408e-05, "loss": 4.8366, "step": 1429 }, { "epoch": 0.286, "grad_norm": 36.403133392333984, "learning_rate": 3.642857142857143e-05, "loss": 5.4918, "step": 1430 }, { "epoch": 0.2862, "grad_norm": 10.971161842346191, "learning_rate": 3.641836734693877e-05, "loss": 4.3258, "step": 1431 }, { "epoch": 0.2864, "grad_norm": 8.168219566345215, "learning_rate": 3.640816326530612e-05, "loss": 1.0876, "step": 1432 }, { "epoch": 0.2866, "grad_norm": 39.15559768676758, "learning_rate": 3.639795918367347e-05, "loss": 5.2519, "step": 1433 }, { "epoch": 0.2868, "grad_norm": 5.92279052734375, "learning_rate": 3.638775510204081e-05, "loss": 1.0407, "step": 1434 }, { "epoch": 0.287, "grad_norm": 1.6077378988265991, "learning_rate": 3.637755102040816e-05, "loss": 0.1247, "step": 1435 }, { "epoch": 0.2872, "grad_norm": 1.6817654371261597, "learning_rate": 3.636734693877551e-05, "loss": 0.1037, "step": 1436 }, { "epoch": 0.2874, "grad_norm": 1.44280207157135, "learning_rate": 3.6357142857142854e-05, "loss": 0.0773, "step": 1437 }, { "epoch": 0.2876, "grad_norm": 1.7035020589828491, "learning_rate": 3.63469387755102e-05, "loss": 0.1836, "step": 1438 }, { "epoch": 0.2878, "grad_norm": 1.8268977403640747, "learning_rate": 3.633673469387755e-05, "loss": 0.164, "step": 1439 }, { "epoch": 0.288, "grad_norm": 1.641530990600586, "learning_rate": 3.63265306122449e-05, "loss": 0.1704, "step": 1440 }, { "epoch": 0.2882, "grad_norm": 1.2546417713165283, "learning_rate": 3.631632653061225e-05, "loss": 0.0956, "step": 1441 }, { "epoch": 0.2884, "grad_norm": 1.6430646181106567, "learning_rate": 3.630612244897959e-05, "loss": 0.1431, "step": 1442 }, { "epoch": 0.2886, "grad_norm": 1.5561422109603882, "learning_rate": 3.629591836734694e-05, "loss": 0.1346, "step": 1443 }, { "epoch": 0.2888, "grad_norm": 1.5593615770339966, "learning_rate": 3.628571428571429e-05, "loss": 0.0824, "step": 1444 }, { "epoch": 0.289, "grad_norm": 1.296095848083496, "learning_rate": 3.6275510204081634e-05, "loss": 0.0708, "step": 1445 }, { "epoch": 0.2892, "grad_norm": 1.6430326700210571, "learning_rate": 3.626530612244898e-05, "loss": 0.181, "step": 1446 }, { "epoch": 0.2894, "grad_norm": 75.7375259399414, "learning_rate": 3.625510204081633e-05, "loss": 5.6181, "step": 1447 }, { "epoch": 0.2896, "grad_norm": 1.7995988130569458, "learning_rate": 3.6244897959183675e-05, "loss": 0.1908, "step": 1448 }, { "epoch": 0.2898, "grad_norm": 1.6368839740753174, "learning_rate": 3.6234693877551024e-05, "loss": 0.1367, "step": 1449 }, { "epoch": 0.29, "grad_norm": 1.4290481805801392, "learning_rate": 3.622448979591837e-05, "loss": 0.1843, "step": 1450 }, { "epoch": 0.2902, "grad_norm": 30.49867057800293, "learning_rate": 3.6214285714285716e-05, "loss": 4.8434, "step": 1451 }, { "epoch": 0.2904, "grad_norm": 1.2859183549880981, "learning_rate": 3.6204081632653065e-05, "loss": 0.1097, "step": 1452 }, { "epoch": 0.2906, "grad_norm": 20.768646240234375, "learning_rate": 3.6193877551020414e-05, "loss": 4.2763, "step": 1453 }, { "epoch": 0.2908, "grad_norm": 1.3919540643692017, "learning_rate": 3.6183673469387756e-05, "loss": 0.1183, "step": 1454 }, { "epoch": 0.291, "grad_norm": 4.011405944824219, "learning_rate": 3.6173469387755106e-05, "loss": 0.6685, "step": 1455 }, { "epoch": 0.2912, "grad_norm": 1.4617115259170532, "learning_rate": 3.6163265306122455e-05, "loss": 0.1463, "step": 1456 }, { "epoch": 0.2914, "grad_norm": 1.5064582824707031, "learning_rate": 3.61530612244898e-05, "loss": 0.0668, "step": 1457 }, { "epoch": 0.2916, "grad_norm": 1.6540746688842773, "learning_rate": 3.6142857142857146e-05, "loss": 0.1695, "step": 1458 }, { "epoch": 0.2918, "grad_norm": 1.683559536933899, "learning_rate": 3.613265306122449e-05, "loss": 0.0959, "step": 1459 }, { "epoch": 0.292, "grad_norm": 1.4306719303131104, "learning_rate": 3.612244897959184e-05, "loss": 0.0992, "step": 1460 }, { "epoch": 0.2922, "grad_norm": 6.173213481903076, "learning_rate": 3.611224489795919e-05, "loss": 2.266, "step": 1461 }, { "epoch": 0.2924, "grad_norm": 5.689418315887451, "learning_rate": 3.610204081632653e-05, "loss": 1.2446, "step": 1462 }, { "epoch": 0.2926, "grad_norm": 1.2365597486495972, "learning_rate": 3.609183673469388e-05, "loss": 0.1461, "step": 1463 }, { "epoch": 0.2928, "grad_norm": 1.5029525756835938, "learning_rate": 3.608163265306123e-05, "loss": 0.0894, "step": 1464 }, { "epoch": 0.293, "grad_norm": 12.302665710449219, "learning_rate": 3.607142857142857e-05, "loss": 2.1756, "step": 1465 }, { "epoch": 0.2932, "grad_norm": 1.846310019493103, "learning_rate": 3.606122448979592e-05, "loss": 0.166, "step": 1466 }, { "epoch": 0.2934, "grad_norm": 29.496362686157227, "learning_rate": 3.605102040816327e-05, "loss": 4.3579, "step": 1467 }, { "epoch": 0.2936, "grad_norm": 1.7736221551895142, "learning_rate": 3.604081632653061e-05, "loss": 0.1465, "step": 1468 }, { "epoch": 0.2938, "grad_norm": 8.128005981445312, "learning_rate": 3.603061224489796e-05, "loss": 2.0517, "step": 1469 }, { "epoch": 0.294, "grad_norm": 1.6332521438598633, "learning_rate": 3.602040816326531e-05, "loss": 0.0818, "step": 1470 }, { "epoch": 0.2942, "grad_norm": 1.4304476976394653, "learning_rate": 3.601020408163265e-05, "loss": 0.1144, "step": 1471 }, { "epoch": 0.2944, "grad_norm": 6.116512298583984, "learning_rate": 3.6e-05, "loss": 1.1423, "step": 1472 }, { "epoch": 0.2946, "grad_norm": 1.713259220123291, "learning_rate": 3.598979591836735e-05, "loss": 0.2148, "step": 1473 }, { "epoch": 0.2948, "grad_norm": 4.142031669616699, "learning_rate": 3.597959183673469e-05, "loss": 0.6924, "step": 1474 }, { "epoch": 0.295, "grad_norm": 1.4502673149108887, "learning_rate": 3.596938775510204e-05, "loss": 0.0688, "step": 1475 }, { "epoch": 0.2952, "grad_norm": 1.564724326133728, "learning_rate": 3.595918367346939e-05, "loss": 0.1236, "step": 1476 }, { "epoch": 0.2954, "grad_norm": 1.7544804811477661, "learning_rate": 3.594897959183673e-05, "loss": 0.083, "step": 1477 }, { "epoch": 0.2956, "grad_norm": 13.240604400634766, "learning_rate": 3.593877551020408e-05, "loss": 2.213, "step": 1478 }, { "epoch": 0.2958, "grad_norm": 1.6230263710021973, "learning_rate": 3.5928571428571425e-05, "loss": 0.0949, "step": 1479 }, { "epoch": 0.296, "grad_norm": 1.812746524810791, "learning_rate": 3.5918367346938774e-05, "loss": 0.1133, "step": 1480 }, { "epoch": 0.2962, "grad_norm": 5.771973609924316, "learning_rate": 3.590816326530612e-05, "loss": 1.136, "step": 1481 }, { "epoch": 0.2964, "grad_norm": 1.594550609588623, "learning_rate": 3.5897959183673466e-05, "loss": 0.1483, "step": 1482 }, { "epoch": 0.2966, "grad_norm": 17.456066131591797, "learning_rate": 3.5887755102040815e-05, "loss": 4.0837, "step": 1483 }, { "epoch": 0.2968, "grad_norm": 1.3515254259109497, "learning_rate": 3.587755102040817e-05, "loss": 0.06, "step": 1484 }, { "epoch": 0.297, "grad_norm": 1.5891889333724976, "learning_rate": 3.586734693877551e-05, "loss": 0.1569, "step": 1485 }, { "epoch": 0.2972, "grad_norm": 1.8010159730911255, "learning_rate": 3.585714285714286e-05, "loss": 0.1797, "step": 1486 }, { "epoch": 0.2974, "grad_norm": 6.173261642456055, "learning_rate": 3.584693877551021e-05, "loss": 2.3401, "step": 1487 }, { "epoch": 0.2976, "grad_norm": 6.937226295471191, "learning_rate": 3.5836734693877554e-05, "loss": 1.1559, "step": 1488 }, { "epoch": 0.2978, "grad_norm": 1.6864588260650635, "learning_rate": 3.58265306122449e-05, "loss": 0.1606, "step": 1489 }, { "epoch": 0.298, "grad_norm": 2.268903970718384, "learning_rate": 3.5816326530612245e-05, "loss": 0.0849, "step": 1490 }, { "epoch": 0.2982, "grad_norm": 2.1504127979278564, "learning_rate": 3.5806122448979595e-05, "loss": 0.1037, "step": 1491 }, { "epoch": 0.2984, "grad_norm": 1.5428582429885864, "learning_rate": 3.5795918367346944e-05, "loss": 0.1281, "step": 1492 }, { "epoch": 0.2986, "grad_norm": 1.5299909114837646, "learning_rate": 3.5785714285714286e-05, "loss": 0.0599, "step": 1493 }, { "epoch": 0.2988, "grad_norm": 6.747359752655029, "learning_rate": 3.5775510204081635e-05, "loss": 2.2304, "step": 1494 }, { "epoch": 0.299, "grad_norm": 1.5359011888504028, "learning_rate": 3.5765306122448985e-05, "loss": 0.1195, "step": 1495 }, { "epoch": 0.2992, "grad_norm": 1.6429435014724731, "learning_rate": 3.575510204081633e-05, "loss": 0.1849, "step": 1496 }, { "epoch": 0.2994, "grad_norm": 1.3016010522842407, "learning_rate": 3.5744897959183676e-05, "loss": 0.1018, "step": 1497 }, { "epoch": 0.2996, "grad_norm": 1.7025047540664673, "learning_rate": 3.5734693877551025e-05, "loss": 0.1944, "step": 1498 }, { "epoch": 0.2998, "grad_norm": 1.5438400506973267, "learning_rate": 3.572448979591837e-05, "loss": 0.1396, "step": 1499 }, { "epoch": 0.3, "grad_norm": 1.3732315301895142, "learning_rate": 3.571428571428572e-05, "loss": 0.1188, "step": 1500 }, { "epoch": 0.3002, "grad_norm": 22.419750213623047, "learning_rate": 3.5704081632653066e-05, "loss": 5.4139, "step": 1501 }, { "epoch": 0.3004, "grad_norm": 4.706147193908691, "learning_rate": 3.569387755102041e-05, "loss": 0.8861, "step": 1502 }, { "epoch": 0.3006, "grad_norm": 1.3744498491287231, "learning_rate": 3.568367346938776e-05, "loss": 0.18, "step": 1503 }, { "epoch": 0.3008, "grad_norm": 6.645823001861572, "learning_rate": 3.567346938775511e-05, "loss": 1.2648, "step": 1504 }, { "epoch": 0.301, "grad_norm": 6.157020092010498, "learning_rate": 3.566326530612245e-05, "loss": 1.0222, "step": 1505 }, { "epoch": 0.3012, "grad_norm": 1.47323739528656, "learning_rate": 3.56530612244898e-05, "loss": 0.1314, "step": 1506 }, { "epoch": 0.3014, "grad_norm": 1.8938440084457397, "learning_rate": 3.564285714285715e-05, "loss": 0.1721, "step": 1507 }, { "epoch": 0.3016, "grad_norm": 6.2161030769348145, "learning_rate": 3.563265306122449e-05, "loss": 1.1228, "step": 1508 }, { "epoch": 0.3018, "grad_norm": 2.3820409774780273, "learning_rate": 3.562244897959184e-05, "loss": 0.1067, "step": 1509 }, { "epoch": 0.302, "grad_norm": 1.7744817733764648, "learning_rate": 3.561224489795918e-05, "loss": 0.2073, "step": 1510 }, { "epoch": 0.3022, "grad_norm": 6.334085464477539, "learning_rate": 3.560204081632653e-05, "loss": 1.4715, "step": 1511 }, { "epoch": 0.3024, "grad_norm": 1.767067551612854, "learning_rate": 3.559183673469388e-05, "loss": 0.1587, "step": 1512 }, { "epoch": 0.3026, "grad_norm": 2.8065779209136963, "learning_rate": 3.558163265306122e-05, "loss": 0.1068, "step": 1513 }, { "epoch": 0.3028, "grad_norm": 1.4385590553283691, "learning_rate": 3.557142857142857e-05, "loss": 0.1007, "step": 1514 }, { "epoch": 0.303, "grad_norm": 1.512893557548523, "learning_rate": 3.556122448979592e-05, "loss": 0.1425, "step": 1515 }, { "epoch": 0.3032, "grad_norm": 1.9610958099365234, "learning_rate": 3.555102040816326e-05, "loss": 0.1041, "step": 1516 }, { "epoch": 0.3034, "grad_norm": 18.42789649963379, "learning_rate": 3.554081632653061e-05, "loss": 3.6185, "step": 1517 }, { "epoch": 0.3036, "grad_norm": 6.517113208770752, "learning_rate": 3.553061224489796e-05, "loss": 1.0958, "step": 1518 }, { "epoch": 0.3038, "grad_norm": 20.108379364013672, "learning_rate": 3.5520408163265304e-05, "loss": 4.195, "step": 1519 }, { "epoch": 0.304, "grad_norm": 5.463354587554932, "learning_rate": 3.551020408163265e-05, "loss": 1.0336, "step": 1520 }, { "epoch": 0.3042, "grad_norm": 1.7834296226501465, "learning_rate": 3.55e-05, "loss": 0.1257, "step": 1521 }, { "epoch": 0.3044, "grad_norm": 30.76213836669922, "learning_rate": 3.5489795918367345e-05, "loss": 5.2445, "step": 1522 }, { "epoch": 0.3046, "grad_norm": 17.685440063476562, "learning_rate": 3.5479591836734694e-05, "loss": 3.5709, "step": 1523 }, { "epoch": 0.3048, "grad_norm": 7.154433250427246, "learning_rate": 3.546938775510204e-05, "loss": 1.2881, "step": 1524 }, { "epoch": 0.305, "grad_norm": 6.08471155166626, "learning_rate": 3.5459183673469385e-05, "loss": 1.1282, "step": 1525 }, { "epoch": 0.3052, "grad_norm": 1.3227505683898926, "learning_rate": 3.5448979591836735e-05, "loss": 0.1083, "step": 1526 }, { "epoch": 0.3054, "grad_norm": 5.789578914642334, "learning_rate": 3.5438775510204084e-05, "loss": 2.3024, "step": 1527 }, { "epoch": 0.3056, "grad_norm": 1.3606492280960083, "learning_rate": 3.5428571428571426e-05, "loss": 0.0869, "step": 1528 }, { "epoch": 0.3058, "grad_norm": 7.820291996002197, "learning_rate": 3.5418367346938775e-05, "loss": 1.2844, "step": 1529 }, { "epoch": 0.306, "grad_norm": 1.8580995798110962, "learning_rate": 3.5408163265306125e-05, "loss": 0.1626, "step": 1530 }, { "epoch": 0.3062, "grad_norm": 9.6140718460083, "learning_rate": 3.5397959183673474e-05, "loss": 3.9964, "step": 1531 }, { "epoch": 0.3064, "grad_norm": 1.9866973161697388, "learning_rate": 3.538775510204082e-05, "loss": 0.2138, "step": 1532 }, { "epoch": 0.3066, "grad_norm": 1.411624789237976, "learning_rate": 3.5377551020408165e-05, "loss": 0.1443, "step": 1533 }, { "epoch": 0.3068, "grad_norm": 1.9197992086410522, "learning_rate": 3.5367346938775515e-05, "loss": 0.1875, "step": 1534 }, { "epoch": 0.307, "grad_norm": 40.50486373901367, "learning_rate": 3.5357142857142864e-05, "loss": 5.3346, "step": 1535 }, { "epoch": 0.3072, "grad_norm": 1.443135380744934, "learning_rate": 3.5346938775510206e-05, "loss": 0.1313, "step": 1536 }, { "epoch": 0.3074, "grad_norm": 4.470092296600342, "learning_rate": 3.5336734693877555e-05, "loss": 1.0869, "step": 1537 }, { "epoch": 0.3076, "grad_norm": 1.6345386505126953, "learning_rate": 3.5326530612244904e-05, "loss": 0.1208, "step": 1538 }, { "epoch": 0.3078, "grad_norm": 4.804007530212402, "learning_rate": 3.531632653061225e-05, "loss": 0.8006, "step": 1539 }, { "epoch": 0.308, "grad_norm": 1.4564540386199951, "learning_rate": 3.5306122448979596e-05, "loss": 0.1651, "step": 1540 }, { "epoch": 0.3082, "grad_norm": 1.2920421361923218, "learning_rate": 3.529591836734694e-05, "loss": 0.1073, "step": 1541 }, { "epoch": 0.3084, "grad_norm": 4.00438928604126, "learning_rate": 3.528571428571429e-05, "loss": 0.7153, "step": 1542 }, { "epoch": 0.3086, "grad_norm": 1.3874545097351074, "learning_rate": 3.527551020408164e-05, "loss": 0.0839, "step": 1543 }, { "epoch": 0.3088, "grad_norm": 6.14603853225708, "learning_rate": 3.526530612244898e-05, "loss": 2.2278, "step": 1544 }, { "epoch": 0.309, "grad_norm": 1.3765803575515747, "learning_rate": 3.525510204081633e-05, "loss": 0.0968, "step": 1545 }, { "epoch": 0.3092, "grad_norm": 4.640695571899414, "learning_rate": 3.524489795918368e-05, "loss": 1.1675, "step": 1546 }, { "epoch": 0.3094, "grad_norm": 4.243666172027588, "learning_rate": 3.523469387755102e-05, "loss": 0.7011, "step": 1547 }, { "epoch": 0.3096, "grad_norm": 5.763128280639648, "learning_rate": 3.522448979591837e-05, "loss": 1.3464, "step": 1548 }, { "epoch": 0.3098, "grad_norm": 1.5605864524841309, "learning_rate": 3.521428571428572e-05, "loss": 0.1439, "step": 1549 }, { "epoch": 0.31, "grad_norm": 1.6594971418380737, "learning_rate": 3.520408163265306e-05, "loss": 0.1253, "step": 1550 }, { "epoch": 0.3102, "grad_norm": 1.8500962257385254, "learning_rate": 3.519387755102041e-05, "loss": 0.1942, "step": 1551 }, { "epoch": 0.3104, "grad_norm": 5.679625511169434, "learning_rate": 3.518367346938776e-05, "loss": 1.3988, "step": 1552 }, { "epoch": 0.3106, "grad_norm": 7.082156181335449, "learning_rate": 3.51734693877551e-05, "loss": 1.1313, "step": 1553 }, { "epoch": 0.3108, "grad_norm": 10.332091331481934, "learning_rate": 3.516326530612245e-05, "loss": 1.809, "step": 1554 }, { "epoch": 0.311, "grad_norm": 1.4994076490402222, "learning_rate": 3.51530612244898e-05, "loss": 0.1259, "step": 1555 }, { "epoch": 0.3112, "grad_norm": 1.9050859212875366, "learning_rate": 3.514285714285714e-05, "loss": 0.1662, "step": 1556 }, { "epoch": 0.3114, "grad_norm": 1.4891732931137085, "learning_rate": 3.513265306122449e-05, "loss": 0.117, "step": 1557 }, { "epoch": 0.3116, "grad_norm": 1.6584659814834595, "learning_rate": 3.512244897959184e-05, "loss": 0.1396, "step": 1558 }, { "epoch": 0.3118, "grad_norm": 4.019477844238281, "learning_rate": 3.511224489795918e-05, "loss": 1.0247, "step": 1559 }, { "epoch": 0.312, "grad_norm": 4.059731483459473, "learning_rate": 3.510204081632653e-05, "loss": 0.9791, "step": 1560 }, { "epoch": 0.3122, "grad_norm": 9.344042778015137, "learning_rate": 3.509183673469388e-05, "loss": 1.0842, "step": 1561 }, { "epoch": 0.3124, "grad_norm": 2.1318531036376953, "learning_rate": 3.5081632653061224e-05, "loss": 0.2142, "step": 1562 }, { "epoch": 0.3126, "grad_norm": 1.78006911277771, "learning_rate": 3.507142857142857e-05, "loss": 0.1534, "step": 1563 }, { "epoch": 0.3128, "grad_norm": 1.7451825141906738, "learning_rate": 3.5061224489795915e-05, "loss": 0.1632, "step": 1564 }, { "epoch": 0.313, "grad_norm": 1.4965324401855469, "learning_rate": 3.5051020408163265e-05, "loss": 0.1359, "step": 1565 }, { "epoch": 0.3132, "grad_norm": 1.3011749982833862, "learning_rate": 3.5040816326530614e-05, "loss": 0.0872, "step": 1566 }, { "epoch": 0.3134, "grad_norm": 1.8529915809631348, "learning_rate": 3.5030612244897956e-05, "loss": 0.1895, "step": 1567 }, { "epoch": 0.3136, "grad_norm": 1.5333659648895264, "learning_rate": 3.5020408163265305e-05, "loss": 0.1305, "step": 1568 }, { "epoch": 0.3138, "grad_norm": 4.501252174377441, "learning_rate": 3.5010204081632655e-05, "loss": 0.7046, "step": 1569 }, { "epoch": 0.314, "grad_norm": 5.666863441467285, "learning_rate": 3.5e-05, "loss": 1.0643, "step": 1570 }, { "epoch": 0.3142, "grad_norm": 1.5210033655166626, "learning_rate": 3.4989795918367346e-05, "loss": 0.1249, "step": 1571 }, { "epoch": 0.3144, "grad_norm": 1.483896255493164, "learning_rate": 3.4979591836734695e-05, "loss": 0.117, "step": 1572 }, { "epoch": 0.3146, "grad_norm": 14.870137214660645, "learning_rate": 3.496938775510204e-05, "loss": 4.297, "step": 1573 }, { "epoch": 0.3148, "grad_norm": 1.2303497791290283, "learning_rate": 3.495918367346939e-05, "loss": 0.0996, "step": 1574 }, { "epoch": 0.315, "grad_norm": 3.4915120601654053, "learning_rate": 3.4948979591836736e-05, "loss": 0.8037, "step": 1575 }, { "epoch": 0.3152, "grad_norm": 1.8560577630996704, "learning_rate": 3.4938775510204085e-05, "loss": 0.1712, "step": 1576 }, { "epoch": 0.3154, "grad_norm": 1.430997610092163, "learning_rate": 3.4928571428571434e-05, "loss": 0.0985, "step": 1577 }, { "epoch": 0.3156, "grad_norm": 1.5378358364105225, "learning_rate": 3.491836734693878e-05, "loss": 0.163, "step": 1578 }, { "epoch": 0.3158, "grad_norm": 1.7097241878509521, "learning_rate": 3.4908163265306126e-05, "loss": 0.1624, "step": 1579 }, { "epoch": 0.316, "grad_norm": 7.346837043762207, "learning_rate": 3.4897959183673475e-05, "loss": 2.2195, "step": 1580 }, { "epoch": 0.3162, "grad_norm": 5.800686359405518, "learning_rate": 3.488775510204082e-05, "loss": 1.1779, "step": 1581 }, { "epoch": 0.3164, "grad_norm": 3.9609789848327637, "learning_rate": 3.487755102040817e-05, "loss": 1.0466, "step": 1582 }, { "epoch": 0.3166, "grad_norm": 1.1616785526275635, "learning_rate": 3.4867346938775516e-05, "loss": 0.098, "step": 1583 }, { "epoch": 0.3168, "grad_norm": 1.6771498918533325, "learning_rate": 3.485714285714286e-05, "loss": 0.1413, "step": 1584 }, { "epoch": 0.317, "grad_norm": 1.4764608144760132, "learning_rate": 3.484693877551021e-05, "loss": 0.1243, "step": 1585 }, { "epoch": 0.3172, "grad_norm": 1.3374871015548706, "learning_rate": 3.483673469387756e-05, "loss": 0.1146, "step": 1586 }, { "epoch": 0.3174, "grad_norm": 3.5561840534210205, "learning_rate": 3.48265306122449e-05, "loss": 0.8987, "step": 1587 }, { "epoch": 0.3176, "grad_norm": 1.4515995979309082, "learning_rate": 3.481632653061225e-05, "loss": 0.125, "step": 1588 }, { "epoch": 0.3178, "grad_norm": 25.665393829345703, "learning_rate": 3.48061224489796e-05, "loss": 5.1746, "step": 1589 }, { "epoch": 0.318, "grad_norm": 1.3759853839874268, "learning_rate": 3.479591836734694e-05, "loss": 0.16, "step": 1590 }, { "epoch": 0.3182, "grad_norm": 7.616851806640625, "learning_rate": 3.478571428571429e-05, "loss": 3.5554, "step": 1591 }, { "epoch": 0.3184, "grad_norm": 3.7456676959991455, "learning_rate": 3.477551020408164e-05, "loss": 0.9717, "step": 1592 }, { "epoch": 0.3186, "grad_norm": 1.3557777404785156, "learning_rate": 3.476530612244898e-05, "loss": 0.1145, "step": 1593 }, { "epoch": 0.3188, "grad_norm": 4.116989612579346, "learning_rate": 3.475510204081633e-05, "loss": 0.6787, "step": 1594 }, { "epoch": 0.319, "grad_norm": 3.567126750946045, "learning_rate": 3.474489795918367e-05, "loss": 0.8604, "step": 1595 }, { "epoch": 0.3192, "grad_norm": 3.4982337951660156, "learning_rate": 3.473469387755102e-05, "loss": 0.9317, "step": 1596 }, { "epoch": 0.3194, "grad_norm": 3.7683043479919434, "learning_rate": 3.472448979591837e-05, "loss": 0.6925, "step": 1597 }, { "epoch": 0.3196, "grad_norm": 4.050169944763184, "learning_rate": 3.471428571428571e-05, "loss": 0.7471, "step": 1598 }, { "epoch": 0.3198, "grad_norm": 1.2799166440963745, "learning_rate": 3.470408163265306e-05, "loss": 0.1135, "step": 1599 }, { "epoch": 0.32, "grad_norm": 1.3863531351089478, "learning_rate": 3.469387755102041e-05, "loss": 0.1476, "step": 1600 }, { "epoch": 0.3202, "grad_norm": 8.264937400817871, "learning_rate": 3.4683673469387754e-05, "loss": 1.4777, "step": 1601 }, { "epoch": 0.3204, "grad_norm": 1.742661714553833, "learning_rate": 3.46734693877551e-05, "loss": 0.1704, "step": 1602 }, { "epoch": 0.3206, "grad_norm": 4.8802266120910645, "learning_rate": 3.466326530612245e-05, "loss": 1.318, "step": 1603 }, { "epoch": 0.3208, "grad_norm": 4.035736083984375, "learning_rate": 3.4653061224489795e-05, "loss": 1.0556, "step": 1604 }, { "epoch": 0.321, "grad_norm": 6.3922343254089355, "learning_rate": 3.4642857142857144e-05, "loss": 1.1756, "step": 1605 }, { "epoch": 0.3212, "grad_norm": 8.551189422607422, "learning_rate": 3.463265306122449e-05, "loss": 3.5888, "step": 1606 }, { "epoch": 0.3214, "grad_norm": 54.71131896972656, "learning_rate": 3.4622448979591835e-05, "loss": 5.4349, "step": 1607 }, { "epoch": 0.3216, "grad_norm": 37.75558090209961, "learning_rate": 3.4612244897959184e-05, "loss": 5.3459, "step": 1608 }, { "epoch": 0.3218, "grad_norm": 1.561134696006775, "learning_rate": 3.4602040816326534e-05, "loss": 0.1272, "step": 1609 }, { "epoch": 0.322, "grad_norm": 6.835804462432861, "learning_rate": 3.4591836734693876e-05, "loss": 1.4204, "step": 1610 }, { "epoch": 0.3222, "grad_norm": 1.407302975654602, "learning_rate": 3.4581632653061225e-05, "loss": 0.1035, "step": 1611 }, { "epoch": 0.3224, "grad_norm": 1.1173968315124512, "learning_rate": 3.4571428571428574e-05, "loss": 0.0943, "step": 1612 }, { "epoch": 0.3226, "grad_norm": 1.4095560312271118, "learning_rate": 3.456122448979592e-05, "loss": 0.1168, "step": 1613 }, { "epoch": 0.3228, "grad_norm": 28.86614990234375, "learning_rate": 3.4551020408163266e-05, "loss": 5.2445, "step": 1614 }, { "epoch": 0.323, "grad_norm": 1.3415619134902954, "learning_rate": 3.454081632653061e-05, "loss": 0.0933, "step": 1615 }, { "epoch": 0.3232, "grad_norm": 10.243077278137207, "learning_rate": 3.453061224489796e-05, "loss": 1.2456, "step": 1616 }, { "epoch": 0.3234, "grad_norm": 6.853682994842529, "learning_rate": 3.452040816326531e-05, "loss": 1.2804, "step": 1617 }, { "epoch": 0.3236, "grad_norm": 1.4520295858383179, "learning_rate": 3.451020408163265e-05, "loss": 0.1343, "step": 1618 }, { "epoch": 0.3238, "grad_norm": 1.510238528251648, "learning_rate": 3.45e-05, "loss": 0.1346, "step": 1619 }, { "epoch": 0.324, "grad_norm": 1.762850046157837, "learning_rate": 3.4489795918367354e-05, "loss": 0.1664, "step": 1620 }, { "epoch": 0.3242, "grad_norm": 5.683584690093994, "learning_rate": 3.44795918367347e-05, "loss": 2.1875, "step": 1621 }, { "epoch": 0.3244, "grad_norm": 3.9202487468719482, "learning_rate": 3.4469387755102046e-05, "loss": 0.986, "step": 1622 }, { "epoch": 0.3246, "grad_norm": 1.4274541139602661, "learning_rate": 3.4459183673469395e-05, "loss": 0.1336, "step": 1623 }, { "epoch": 0.3248, "grad_norm": 1.455907940864563, "learning_rate": 3.444897959183674e-05, "loss": 0.1164, "step": 1624 }, { "epoch": 0.325, "grad_norm": 1.8156874179840088, "learning_rate": 3.443877551020409e-05, "loss": 0.1452, "step": 1625 }, { "epoch": 0.3252, "grad_norm": 7.767525672912598, "learning_rate": 3.442857142857143e-05, "loss": 3.2278, "step": 1626 }, { "epoch": 0.3254, "grad_norm": 6.3254170417785645, "learning_rate": 3.441836734693878e-05, "loss": 1.2977, "step": 1627 }, { "epoch": 0.3256, "grad_norm": 1.2721400260925293, "learning_rate": 3.440816326530613e-05, "loss": 0.1012, "step": 1628 }, { "epoch": 0.3258, "grad_norm": 1.3631278276443481, "learning_rate": 3.439795918367347e-05, "loss": 0.1276, "step": 1629 }, { "epoch": 0.326, "grad_norm": 1.3635259866714478, "learning_rate": 3.438775510204082e-05, "loss": 0.0953, "step": 1630 }, { "epoch": 0.3262, "grad_norm": 1.0951149463653564, "learning_rate": 3.437755102040817e-05, "loss": 0.0772, "step": 1631 }, { "epoch": 0.3264, "grad_norm": 5.393881320953369, "learning_rate": 3.436734693877551e-05, "loss": 0.8863, "step": 1632 }, { "epoch": 0.3266, "grad_norm": 3.9565374851226807, "learning_rate": 3.435714285714286e-05, "loss": 1.0638, "step": 1633 }, { "epoch": 0.3268, "grad_norm": 1.1917364597320557, "learning_rate": 3.434693877551021e-05, "loss": 0.1068, "step": 1634 }, { "epoch": 0.327, "grad_norm": 6.110203266143799, "learning_rate": 3.433673469387755e-05, "loss": 1.1439, "step": 1635 }, { "epoch": 0.3272, "grad_norm": 1.4958648681640625, "learning_rate": 3.43265306122449e-05, "loss": 0.1234, "step": 1636 }, { "epoch": 0.3274, "grad_norm": 5.4393534660339355, "learning_rate": 3.431632653061225e-05, "loss": 1.1319, "step": 1637 }, { "epoch": 0.3276, "grad_norm": 1.2154313325881958, "learning_rate": 3.430612244897959e-05, "loss": 0.0805, "step": 1638 }, { "epoch": 0.3278, "grad_norm": 1.614425539970398, "learning_rate": 3.429591836734694e-05, "loss": 0.1391, "step": 1639 }, { "epoch": 0.328, "grad_norm": 1.4102503061294556, "learning_rate": 3.428571428571429e-05, "loss": 0.1195, "step": 1640 }, { "epoch": 0.3282, "grad_norm": 1.0450716018676758, "learning_rate": 3.427551020408163e-05, "loss": 0.0539, "step": 1641 }, { "epoch": 0.3284, "grad_norm": 1.5549017190933228, "learning_rate": 3.426530612244898e-05, "loss": 0.1209, "step": 1642 }, { "epoch": 0.3286, "grad_norm": 1.6046826839447021, "learning_rate": 3.425510204081633e-05, "loss": 0.1655, "step": 1643 }, { "epoch": 0.3288, "grad_norm": 1.2701417207717896, "learning_rate": 3.4244897959183674e-05, "loss": 0.1058, "step": 1644 }, { "epoch": 0.329, "grad_norm": 1.348042368888855, "learning_rate": 3.423469387755102e-05, "loss": 0.0948, "step": 1645 }, { "epoch": 0.3292, "grad_norm": 1.8712621927261353, "learning_rate": 3.4224489795918365e-05, "loss": 0.1133, "step": 1646 }, { "epoch": 0.3294, "grad_norm": 3.555974006652832, "learning_rate": 3.4214285714285714e-05, "loss": 0.8684, "step": 1647 }, { "epoch": 0.3296, "grad_norm": 1.5502355098724365, "learning_rate": 3.4204081632653064e-05, "loss": 0.1568, "step": 1648 }, { "epoch": 0.3298, "grad_norm": 3.769658088684082, "learning_rate": 3.4193877551020406e-05, "loss": 0.9487, "step": 1649 }, { "epoch": 0.33, "grad_norm": 1.6740633249282837, "learning_rate": 3.4183673469387755e-05, "loss": 0.1423, "step": 1650 }, { "epoch": 0.3302, "grad_norm": 1.3294124603271484, "learning_rate": 3.4173469387755104e-05, "loss": 0.0909, "step": 1651 }, { "epoch": 0.3304, "grad_norm": 1.5210949182510376, "learning_rate": 3.416326530612245e-05, "loss": 0.1255, "step": 1652 }, { "epoch": 0.3306, "grad_norm": 1.7085466384887695, "learning_rate": 3.4153061224489796e-05, "loss": 0.1542, "step": 1653 }, { "epoch": 0.3308, "grad_norm": 1.6548869609832764, "learning_rate": 3.4142857142857145e-05, "loss": 0.144, "step": 1654 }, { "epoch": 0.331, "grad_norm": 1.3550704717636108, "learning_rate": 3.413265306122449e-05, "loss": 0.129, "step": 1655 }, { "epoch": 0.3312, "grad_norm": 4.087093353271484, "learning_rate": 3.412244897959184e-05, "loss": 0.7072, "step": 1656 }, { "epoch": 0.3314, "grad_norm": 10.05302906036377, "learning_rate": 3.4112244897959186e-05, "loss": 3.2497, "step": 1657 }, { "epoch": 0.3316, "grad_norm": 1.5289970636367798, "learning_rate": 3.410204081632653e-05, "loss": 0.1442, "step": 1658 }, { "epoch": 0.3318, "grad_norm": 3.952613115310669, "learning_rate": 3.409183673469388e-05, "loss": 1.0591, "step": 1659 }, { "epoch": 0.332, "grad_norm": 1.4845707416534424, "learning_rate": 3.408163265306123e-05, "loss": 0.1287, "step": 1660 }, { "epoch": 0.3322, "grad_norm": 1.3955657482147217, "learning_rate": 3.407142857142857e-05, "loss": 0.1285, "step": 1661 }, { "epoch": 0.3324, "grad_norm": 1.4724160432815552, "learning_rate": 3.406122448979592e-05, "loss": 0.132, "step": 1662 }, { "epoch": 0.3326, "grad_norm": 1.2959953546524048, "learning_rate": 3.405102040816327e-05, "loss": 0.1044, "step": 1663 }, { "epoch": 0.3328, "grad_norm": 1.2368966341018677, "learning_rate": 3.404081632653061e-05, "loss": 0.085, "step": 1664 }, { "epoch": 0.333, "grad_norm": 1.550514817237854, "learning_rate": 3.403061224489796e-05, "loss": 0.1256, "step": 1665 }, { "epoch": 0.3332, "grad_norm": 3.974156618118286, "learning_rate": 3.402040816326531e-05, "loss": 0.9368, "step": 1666 }, { "epoch": 0.3334, "grad_norm": 3.496253490447998, "learning_rate": 3.401020408163266e-05, "loss": 1.0436, "step": 1667 }, { "epoch": 0.3336, "grad_norm": 3.534266471862793, "learning_rate": 3.4000000000000007e-05, "loss": 1.0133, "step": 1668 }, { "epoch": 0.3338, "grad_norm": 6.372529029846191, "learning_rate": 3.398979591836735e-05, "loss": 2.2158, "step": 1669 }, { "epoch": 0.334, "grad_norm": 1.311484932899475, "learning_rate": 3.39795918367347e-05, "loss": 0.0882, "step": 1670 }, { "epoch": 0.3342, "grad_norm": 3.420142412185669, "learning_rate": 3.396938775510205e-05, "loss": 0.8702, "step": 1671 }, { "epoch": 0.3344, "grad_norm": 8.62233829498291, "learning_rate": 3.395918367346939e-05, "loss": 1.4761, "step": 1672 }, { "epoch": 0.3346, "grad_norm": 1.5052247047424316, "learning_rate": 3.394897959183674e-05, "loss": 0.1379, "step": 1673 }, { "epoch": 0.3348, "grad_norm": 1.2894690036773682, "learning_rate": 3.393877551020409e-05, "loss": 0.0978, "step": 1674 }, { "epoch": 0.335, "grad_norm": 1.4060401916503906, "learning_rate": 3.392857142857143e-05, "loss": 0.0982, "step": 1675 }, { "epoch": 0.3352, "grad_norm": 5.907136917114258, "learning_rate": 3.391836734693878e-05, "loss": 2.2422, "step": 1676 }, { "epoch": 0.3354, "grad_norm": 1.534026026725769, "learning_rate": 3.390816326530613e-05, "loss": 0.1418, "step": 1677 }, { "epoch": 0.3356, "grad_norm": 8.832184791564941, "learning_rate": 3.389795918367347e-05, "loss": 1.2558, "step": 1678 }, { "epoch": 0.3358, "grad_norm": 18.30379867553711, "learning_rate": 3.388775510204082e-05, "loss": 3.42, "step": 1679 }, { "epoch": 0.336, "grad_norm": 7.9011077880859375, "learning_rate": 3.387755102040816e-05, "loss": 1.2031, "step": 1680 }, { "epoch": 0.3362, "grad_norm": 1.344774842262268, "learning_rate": 3.386734693877551e-05, "loss": 0.118, "step": 1681 }, { "epoch": 0.3364, "grad_norm": 7.134794235229492, "learning_rate": 3.385714285714286e-05, "loss": 1.2171, "step": 1682 }, { "epoch": 0.3366, "grad_norm": 1.4707818031311035, "learning_rate": 3.3846938775510204e-05, "loss": 0.1157, "step": 1683 }, { "epoch": 0.3368, "grad_norm": 1.6916249990463257, "learning_rate": 3.383673469387755e-05, "loss": 0.1132, "step": 1684 }, { "epoch": 0.337, "grad_norm": 11.05109977722168, "learning_rate": 3.38265306122449e-05, "loss": 1.1971, "step": 1685 }, { "epoch": 0.3372, "grad_norm": 5.596589088439941, "learning_rate": 3.3816326530612244e-05, "loss": 1.1734, "step": 1686 }, { "epoch": 0.3374, "grad_norm": 3.7325363159179688, "learning_rate": 3.3806122448979593e-05, "loss": 0.8977, "step": 1687 }, { "epoch": 0.3376, "grad_norm": 1.798915147781372, "learning_rate": 3.379591836734694e-05, "loss": 0.1708, "step": 1688 }, { "epoch": 0.3378, "grad_norm": 1.2309712171554565, "learning_rate": 3.3785714285714285e-05, "loss": 0.1188, "step": 1689 }, { "epoch": 0.338, "grad_norm": 1.7836970090866089, "learning_rate": 3.3775510204081634e-05, "loss": 0.1625, "step": 1690 }, { "epoch": 0.3382, "grad_norm": 1.6843546628952026, "learning_rate": 3.3765306122448983e-05, "loss": 0.1446, "step": 1691 }, { "epoch": 0.3384, "grad_norm": 1.4100183248519897, "learning_rate": 3.3755102040816326e-05, "loss": 0.1293, "step": 1692 }, { "epoch": 0.3386, "grad_norm": 1.7047845125198364, "learning_rate": 3.3744897959183675e-05, "loss": 0.2804, "step": 1693 }, { "epoch": 0.3388, "grad_norm": 4.1712260246276855, "learning_rate": 3.3734693877551024e-05, "loss": 0.6373, "step": 1694 }, { "epoch": 0.339, "grad_norm": 1.97983717918396, "learning_rate": 3.3724489795918367e-05, "loss": 0.1642, "step": 1695 }, { "epoch": 0.3392, "grad_norm": 1.362574577331543, "learning_rate": 3.3714285714285716e-05, "loss": 0.1343, "step": 1696 }, { "epoch": 0.3394, "grad_norm": 1.8942652940750122, "learning_rate": 3.3704081632653065e-05, "loss": 0.1933, "step": 1697 }, { "epoch": 0.3396, "grad_norm": 5.709615230560303, "learning_rate": 3.369387755102041e-05, "loss": 2.2058, "step": 1698 }, { "epoch": 0.3398, "grad_norm": 4.429149627685547, "learning_rate": 3.3683673469387757e-05, "loss": 0.962, "step": 1699 }, { "epoch": 0.34, "grad_norm": 1.3776390552520752, "learning_rate": 3.36734693877551e-05, "loss": 0.1654, "step": 1700 }, { "epoch": 0.3402, "grad_norm": 1.5495575666427612, "learning_rate": 3.366326530612245e-05, "loss": 0.1326, "step": 1701 }, { "epoch": 0.3404, "grad_norm": 1.2185381650924683, "learning_rate": 3.36530612244898e-05, "loss": 0.159, "step": 1702 }, { "epoch": 0.3406, "grad_norm": 1.8954464197158813, "learning_rate": 3.364285714285714e-05, "loss": 0.3059, "step": 1703 }, { "epoch": 0.3408, "grad_norm": 4.421078205108643, "learning_rate": 3.363265306122449e-05, "loss": 1.0207, "step": 1704 }, { "epoch": 0.341, "grad_norm": 1.355732798576355, "learning_rate": 3.362244897959184e-05, "loss": 0.1387, "step": 1705 }, { "epoch": 0.3412, "grad_norm": 7.854058742523193, "learning_rate": 3.361224489795918e-05, "loss": 1.0543, "step": 1706 }, { "epoch": 0.3414, "grad_norm": 1.3484553098678589, "learning_rate": 3.360204081632653e-05, "loss": 0.1473, "step": 1707 }, { "epoch": 0.3416, "grad_norm": 22.08734130859375, "learning_rate": 3.359183673469388e-05, "loss": 4.1627, "step": 1708 }, { "epoch": 0.3418, "grad_norm": 18.592674255371094, "learning_rate": 3.358163265306122e-05, "loss": 4.042, "step": 1709 }, { "epoch": 0.342, "grad_norm": 1.5277100801467896, "learning_rate": 3.357142857142857e-05, "loss": 0.166, "step": 1710 }, { "epoch": 0.3422, "grad_norm": 3.34956955909729, "learning_rate": 3.356122448979592e-05, "loss": 0.8024, "step": 1711 }, { "epoch": 0.3424, "grad_norm": 1.134932279586792, "learning_rate": 3.355102040816327e-05, "loss": 0.0938, "step": 1712 }, { "epoch": 0.3426, "grad_norm": 1.764827847480774, "learning_rate": 3.354081632653062e-05, "loss": 0.2044, "step": 1713 }, { "epoch": 0.3428, "grad_norm": 3.3121185302734375, "learning_rate": 3.353061224489796e-05, "loss": 0.8107, "step": 1714 }, { "epoch": 0.343, "grad_norm": 1.1516286134719849, "learning_rate": 3.352040816326531e-05, "loss": 0.0882, "step": 1715 }, { "epoch": 0.3432, "grad_norm": 16.110246658325195, "learning_rate": 3.351020408163266e-05, "loss": 4.0145, "step": 1716 }, { "epoch": 0.3434, "grad_norm": 1.7402606010437012, "learning_rate": 3.35e-05, "loss": 0.1657, "step": 1717 }, { "epoch": 0.3436, "grad_norm": 3.8523192405700684, "learning_rate": 3.348979591836735e-05, "loss": 1.0029, "step": 1718 }, { "epoch": 0.3438, "grad_norm": 6.852540016174316, "learning_rate": 3.34795918367347e-05, "loss": 1.1104, "step": 1719 }, { "epoch": 0.344, "grad_norm": 3.598719358444214, "learning_rate": 3.346938775510204e-05, "loss": 0.9867, "step": 1720 }, { "epoch": 0.3442, "grad_norm": 1.3935763835906982, "learning_rate": 3.345918367346939e-05, "loss": 0.1097, "step": 1721 }, { "epoch": 0.3444, "grad_norm": 1.6227840185165405, "learning_rate": 3.344897959183674e-05, "loss": 0.1254, "step": 1722 }, { "epoch": 0.3446, "grad_norm": 1.2965258359909058, "learning_rate": 3.343877551020408e-05, "loss": 0.1083, "step": 1723 }, { "epoch": 0.3448, "grad_norm": 1.3589463233947754, "learning_rate": 3.342857142857143e-05, "loss": 0.1148, "step": 1724 }, { "epoch": 0.345, "grad_norm": 5.295959949493408, "learning_rate": 3.341836734693878e-05, "loss": 1.0539, "step": 1725 }, { "epoch": 0.3452, "grad_norm": 3.4420156478881836, "learning_rate": 3.3408163265306123e-05, "loss": 0.7745, "step": 1726 }, { "epoch": 0.3454, "grad_norm": 16.860403060913086, "learning_rate": 3.339795918367347e-05, "loss": 3.1186, "step": 1727 }, { "epoch": 0.3456, "grad_norm": 15.784538269042969, "learning_rate": 3.338775510204082e-05, "loss": 1.4251, "step": 1728 }, { "epoch": 0.3458, "grad_norm": 4.1750874519348145, "learning_rate": 3.3377551020408164e-05, "loss": 0.6993, "step": 1729 }, { "epoch": 0.346, "grad_norm": 3.424389362335205, "learning_rate": 3.336734693877551e-05, "loss": 0.8525, "step": 1730 }, { "epoch": 0.3462, "grad_norm": 6.516100883483887, "learning_rate": 3.3357142857142856e-05, "loss": 2.2779, "step": 1731 }, { "epoch": 0.3464, "grad_norm": 4.072001934051514, "learning_rate": 3.3346938775510205e-05, "loss": 0.6746, "step": 1732 }, { "epoch": 0.3466, "grad_norm": 8.418037414550781, "learning_rate": 3.3336734693877554e-05, "loss": 1.1869, "step": 1733 }, { "epoch": 0.3468, "grad_norm": 3.950887441635132, "learning_rate": 3.3326530612244897e-05, "loss": 0.7377, "step": 1734 }, { "epoch": 0.347, "grad_norm": 3.9305362701416016, "learning_rate": 3.3316326530612246e-05, "loss": 0.6518, "step": 1735 }, { "epoch": 0.3472, "grad_norm": 4.105065822601318, "learning_rate": 3.3306122448979595e-05, "loss": 0.7078, "step": 1736 }, { "epoch": 0.3474, "grad_norm": 3.4574952125549316, "learning_rate": 3.329591836734694e-05, "loss": 0.9431, "step": 1737 }, { "epoch": 0.3476, "grad_norm": 1.2512776851654053, "learning_rate": 3.3285714285714286e-05, "loss": 0.1185, "step": 1738 }, { "epoch": 0.3478, "grad_norm": 10.10742473602295, "learning_rate": 3.3275510204081636e-05, "loss": 1.3306, "step": 1739 }, { "epoch": 0.348, "grad_norm": 4.465425968170166, "learning_rate": 3.326530612244898e-05, "loss": 0.6967, "step": 1740 }, { "epoch": 0.3482, "grad_norm": 6.026064872741699, "learning_rate": 3.325510204081633e-05, "loss": 1.0078, "step": 1741 }, { "epoch": 0.3484, "grad_norm": 1.0465497970581055, "learning_rate": 3.3244897959183676e-05, "loss": 0.0829, "step": 1742 }, { "epoch": 0.3486, "grad_norm": 3.51488995552063, "learning_rate": 3.323469387755102e-05, "loss": 1.0144, "step": 1743 }, { "epoch": 0.3488, "grad_norm": 10.781816482543945, "learning_rate": 3.322448979591837e-05, "loss": 1.3457, "step": 1744 }, { "epoch": 0.349, "grad_norm": 3.6929171085357666, "learning_rate": 3.321428571428572e-05, "loss": 0.8511, "step": 1745 }, { "epoch": 0.3492, "grad_norm": 1.1213587522506714, "learning_rate": 3.320408163265306e-05, "loss": 0.0851, "step": 1746 }, { "epoch": 0.3494, "grad_norm": 9.472102165222168, "learning_rate": 3.319387755102041e-05, "loss": 2.6352, "step": 1747 }, { "epoch": 0.3496, "grad_norm": 1.7029060125350952, "learning_rate": 3.318367346938776e-05, "loss": 0.155, "step": 1748 }, { "epoch": 0.3498, "grad_norm": 1.4820170402526855, "learning_rate": 3.31734693877551e-05, "loss": 0.1119, "step": 1749 }, { "epoch": 0.35, "grad_norm": 4.579092502593994, "learning_rate": 3.316326530612245e-05, "loss": 0.7513, "step": 1750 }, { "epoch": 0.3502, "grad_norm": 1.3974214792251587, "learning_rate": 3.31530612244898e-05, "loss": 0.1146, "step": 1751 }, { "epoch": 0.3504, "grad_norm": 4.359090805053711, "learning_rate": 3.314285714285714e-05, "loss": 0.6645, "step": 1752 }, { "epoch": 0.3506, "grad_norm": 4.183372497558594, "learning_rate": 3.313265306122449e-05, "loss": 0.747, "step": 1753 }, { "epoch": 0.3508, "grad_norm": 6.204871654510498, "learning_rate": 3.312244897959183e-05, "loss": 1.3034, "step": 1754 }, { "epoch": 0.351, "grad_norm": 3.895108938217163, "learning_rate": 3.311224489795918e-05, "loss": 0.9852, "step": 1755 }, { "epoch": 0.3512, "grad_norm": 1.3820586204528809, "learning_rate": 3.310204081632653e-05, "loss": 0.1083, "step": 1756 }, { "epoch": 0.3514, "grad_norm": 8.781912803649902, "learning_rate": 3.309183673469388e-05, "loss": 3.3185, "step": 1757 }, { "epoch": 0.3516, "grad_norm": 7.169617176055908, "learning_rate": 3.308163265306123e-05, "loss": 1.1307, "step": 1758 }, { "epoch": 0.3518, "grad_norm": 5.979995250701904, "learning_rate": 3.307142857142858e-05, "loss": 2.2011, "step": 1759 }, { "epoch": 0.352, "grad_norm": 1.4406485557556152, "learning_rate": 3.306122448979592e-05, "loss": 0.1699, "step": 1760 }, { "epoch": 0.3522, "grad_norm": 1.7303863763809204, "learning_rate": 3.305102040816327e-05, "loss": 0.1256, "step": 1761 }, { "epoch": 0.3524, "grad_norm": 1.4316778182983398, "learning_rate": 3.304081632653061e-05, "loss": 0.132, "step": 1762 }, { "epoch": 0.3526, "grad_norm": 6.620378494262695, "learning_rate": 3.303061224489796e-05, "loss": 3.2379, "step": 1763 }, { "epoch": 0.3528, "grad_norm": 3.6853671073913574, "learning_rate": 3.302040816326531e-05, "loss": 1.0066, "step": 1764 }, { "epoch": 0.353, "grad_norm": 5.730157375335693, "learning_rate": 3.301020408163265e-05, "loss": 0.7325, "step": 1765 }, { "epoch": 0.3532, "grad_norm": 1.6502915620803833, "learning_rate": 3.3e-05, "loss": 0.1872, "step": 1766 }, { "epoch": 0.3534, "grad_norm": 1.535474419593811, "learning_rate": 3.298979591836735e-05, "loss": 0.1412, "step": 1767 }, { "epoch": 0.3536, "grad_norm": 7.426815509796143, "learning_rate": 3.2979591836734694e-05, "loss": 1.1631, "step": 1768 }, { "epoch": 0.3538, "grad_norm": 1.3126943111419678, "learning_rate": 3.296938775510204e-05, "loss": 0.1495, "step": 1769 }, { "epoch": 0.354, "grad_norm": 3.619706153869629, "learning_rate": 3.295918367346939e-05, "loss": 1.0407, "step": 1770 }, { "epoch": 0.3542, "grad_norm": 1.4426507949829102, "learning_rate": 3.2948979591836735e-05, "loss": 0.1323, "step": 1771 }, { "epoch": 0.3544, "grad_norm": 1.4123917818069458, "learning_rate": 3.2938775510204084e-05, "loss": 0.1208, "step": 1772 }, { "epoch": 0.3546, "grad_norm": 1.712772250175476, "learning_rate": 3.292857142857143e-05, "loss": 0.1838, "step": 1773 }, { "epoch": 0.3548, "grad_norm": 1.4630318880081177, "learning_rate": 3.2918367346938776e-05, "loss": 0.1842, "step": 1774 }, { "epoch": 0.355, "grad_norm": 1.5501052141189575, "learning_rate": 3.2908163265306125e-05, "loss": 0.1932, "step": 1775 }, { "epoch": 0.3552, "grad_norm": 8.72156810760498, "learning_rate": 3.2897959183673474e-05, "loss": 3.2597, "step": 1776 }, { "epoch": 0.3554, "grad_norm": 3.496630907058716, "learning_rate": 3.2887755102040816e-05, "loss": 0.9562, "step": 1777 }, { "epoch": 0.3556, "grad_norm": 1.2298327684402466, "learning_rate": 3.2877551020408166e-05, "loss": 0.1136, "step": 1778 }, { "epoch": 0.3558, "grad_norm": 5.842172622680664, "learning_rate": 3.2867346938775515e-05, "loss": 2.3213, "step": 1779 }, { "epoch": 0.356, "grad_norm": 1.2177481651306152, "learning_rate": 3.285714285714286e-05, "loss": 0.0954, "step": 1780 }, { "epoch": 0.3562, "grad_norm": 3.6284985542297363, "learning_rate": 3.2846938775510206e-05, "loss": 0.8903, "step": 1781 }, { "epoch": 0.3564, "grad_norm": 6.315433502197266, "learning_rate": 3.2836734693877556e-05, "loss": 1.3574, "step": 1782 }, { "epoch": 0.3566, "grad_norm": 5.381503582000732, "learning_rate": 3.28265306122449e-05, "loss": 1.0625, "step": 1783 }, { "epoch": 0.3568, "grad_norm": 3.6157219409942627, "learning_rate": 3.281632653061225e-05, "loss": 1.0214, "step": 1784 }, { "epoch": 0.357, "grad_norm": 3.836179494857788, "learning_rate": 3.280612244897959e-05, "loss": 0.6615, "step": 1785 }, { "epoch": 0.3572, "grad_norm": 3.564310312271118, "learning_rate": 3.279591836734694e-05, "loss": 0.8603, "step": 1786 }, { "epoch": 0.3574, "grad_norm": 10.237130165100098, "learning_rate": 3.278571428571429e-05, "loss": 3.5878, "step": 1787 }, { "epoch": 0.3576, "grad_norm": 3.611448287963867, "learning_rate": 3.277551020408163e-05, "loss": 0.9539, "step": 1788 }, { "epoch": 0.3578, "grad_norm": 5.697876453399658, "learning_rate": 3.276530612244898e-05, "loss": 1.0622, "step": 1789 }, { "epoch": 0.358, "grad_norm": 6.092684268951416, "learning_rate": 3.275510204081633e-05, "loss": 1.3316, "step": 1790 }, { "epoch": 0.3582, "grad_norm": 3.4067060947418213, "learning_rate": 3.274489795918367e-05, "loss": 0.8333, "step": 1791 }, { "epoch": 0.3584, "grad_norm": 5.643919944763184, "learning_rate": 3.273469387755102e-05, "loss": 1.0826, "step": 1792 }, { "epoch": 0.3586, "grad_norm": 5.771454334259033, "learning_rate": 3.272448979591837e-05, "loss": 1.1095, "step": 1793 }, { "epoch": 0.3588, "grad_norm": 5.640023231506348, "learning_rate": 3.271428571428571e-05, "loss": 1.1269, "step": 1794 }, { "epoch": 0.359, "grad_norm": 4.021417617797852, "learning_rate": 3.270408163265306e-05, "loss": 0.6279, "step": 1795 }, { "epoch": 0.3592, "grad_norm": 10.706412315368652, "learning_rate": 3.269387755102041e-05, "loss": 3.4476, "step": 1796 }, { "epoch": 0.3594, "grad_norm": 9.943094253540039, "learning_rate": 3.268367346938775e-05, "loss": 3.3975, "step": 1797 }, { "epoch": 0.3596, "grad_norm": 3.9147348403930664, "learning_rate": 3.26734693877551e-05, "loss": 0.6569, "step": 1798 }, { "epoch": 0.3598, "grad_norm": 6.527913570404053, "learning_rate": 3.266326530612245e-05, "loss": 0.9437, "step": 1799 }, { "epoch": 0.36, "grad_norm": 6.382730484008789, "learning_rate": 3.265306122448979e-05, "loss": 1.0618, "step": 1800 }, { "epoch": 0.3602, "grad_norm": 5.790674209594727, "learning_rate": 3.264285714285714e-05, "loss": 2.3089, "step": 1801 }, { "epoch": 0.3604, "grad_norm": 5.195087432861328, "learning_rate": 3.263265306122449e-05, "loss": 1.0027, "step": 1802 }, { "epoch": 0.3606, "grad_norm": 4.964000701904297, "learning_rate": 3.262244897959184e-05, "loss": 1.2752, "step": 1803 }, { "epoch": 0.3608, "grad_norm": 5.07620906829834, "learning_rate": 3.261224489795919e-05, "loss": 1.007, "step": 1804 }, { "epoch": 0.361, "grad_norm": 5.06422233581543, "learning_rate": 3.260204081632653e-05, "loss": 2.202, "step": 1805 }, { "epoch": 0.3612, "grad_norm": 5.387059688568115, "learning_rate": 3.259183673469388e-05, "loss": 1.014, "step": 1806 }, { "epoch": 0.3614, "grad_norm": 6.059737205505371, "learning_rate": 3.258163265306123e-05, "loss": 1.2625, "step": 1807 }, { "epoch": 0.3616, "grad_norm": 5.065815448760986, "learning_rate": 3.257142857142857e-05, "loss": 0.9486, "step": 1808 }, { "epoch": 0.3618, "grad_norm": 33.03364181518555, "learning_rate": 3.256122448979592e-05, "loss": 3.7953, "step": 1809 }, { "epoch": 0.362, "grad_norm": 7.243016242980957, "learning_rate": 3.255102040816327e-05, "loss": 1.1437, "step": 1810 }, { "epoch": 0.3622, "grad_norm": 4.155308246612549, "learning_rate": 3.2540816326530614e-05, "loss": 0.5773, "step": 1811 }, { "epoch": 0.3624, "grad_norm": 21.944828033447266, "learning_rate": 3.253061224489796e-05, "loss": 1.3502, "step": 1812 }, { "epoch": 0.3626, "grad_norm": 4.042191505432129, "learning_rate": 3.252040816326531e-05, "loss": 0.6203, "step": 1813 }, { "epoch": 0.3628, "grad_norm": 30.818618774414062, "learning_rate": 3.2510204081632655e-05, "loss": 1.3049, "step": 1814 }, { "epoch": 0.363, "grad_norm": 16.38467025756836, "learning_rate": 3.2500000000000004e-05, "loss": 3.4577, "step": 1815 }, { "epoch": 0.3632, "grad_norm": 11.60643482208252, "learning_rate": 3.2489795918367346e-05, "loss": 3.4165, "step": 1816 }, { "epoch": 0.3634, "grad_norm": 6.943589210510254, "learning_rate": 3.2479591836734696e-05, "loss": 1.152, "step": 1817 }, { "epoch": 0.3636, "grad_norm": 8.237154960632324, "learning_rate": 3.2469387755102045e-05, "loss": 3.092, "step": 1818 }, { "epoch": 0.3638, "grad_norm": 10.214845657348633, "learning_rate": 3.245918367346939e-05, "loss": 3.4323, "step": 1819 }, { "epoch": 0.364, "grad_norm": 5.321285247802734, "learning_rate": 3.2448979591836736e-05, "loss": 0.8715, "step": 1820 }, { "epoch": 0.3642, "grad_norm": 5.701889991760254, "learning_rate": 3.2438775510204085e-05, "loss": 1.0432, "step": 1821 }, { "epoch": 0.3644, "grad_norm": 9.685300827026367, "learning_rate": 3.242857142857143e-05, "loss": 3.4456, "step": 1822 }, { "epoch": 0.3646, "grad_norm": 5.464596271514893, "learning_rate": 3.241836734693878e-05, "loss": 1.0736, "step": 1823 }, { "epoch": 0.3648, "grad_norm": 17.551986694335938, "learning_rate": 3.2408163265306126e-05, "loss": 1.3006, "step": 1824 }, { "epoch": 0.365, "grad_norm": 4.076687812805176, "learning_rate": 3.239795918367347e-05, "loss": 0.894, "step": 1825 }, { "epoch": 0.3652, "grad_norm": 9.390284538269043, "learning_rate": 3.238775510204082e-05, "loss": 3.2777, "step": 1826 }, { "epoch": 0.3654, "grad_norm": 3.8985793590545654, "learning_rate": 3.237755102040817e-05, "loss": 0.9889, "step": 1827 }, { "epoch": 0.3656, "grad_norm": 4.8796234130859375, "learning_rate": 3.236734693877551e-05, "loss": 0.6503, "step": 1828 }, { "epoch": 0.3658, "grad_norm": 4.104645729064941, "learning_rate": 3.235714285714286e-05, "loss": 0.5622, "step": 1829 }, { "epoch": 0.366, "grad_norm": 13.190925598144531, "learning_rate": 3.234693877551021e-05, "loss": 1.3046, "step": 1830 }, { "epoch": 0.3662, "grad_norm": 9.27966022491455, "learning_rate": 3.233673469387755e-05, "loss": 2.9469, "step": 1831 }, { "epoch": 0.3664, "grad_norm": 7.0232319831848145, "learning_rate": 3.23265306122449e-05, "loss": 1.1279, "step": 1832 }, { "epoch": 0.3666, "grad_norm": 4.636045455932617, "learning_rate": 3.231632653061225e-05, "loss": 0.9341, "step": 1833 }, { "epoch": 0.3668, "grad_norm": 4.049642562866211, "learning_rate": 3.230612244897959e-05, "loss": 0.6906, "step": 1834 }, { "epoch": 0.367, "grad_norm": 3.690493583679199, "learning_rate": 3.229591836734694e-05, "loss": 1.0225, "step": 1835 }, { "epoch": 0.3672, "grad_norm": 9.44584846496582, "learning_rate": 3.228571428571428e-05, "loss": 1.2827, "step": 1836 }, { "epoch": 0.3674, "grad_norm": 7.8593668937683105, "learning_rate": 3.227551020408163e-05, "loss": 1.1259, "step": 1837 }, { "epoch": 0.3676, "grad_norm": 12.65119743347168, "learning_rate": 3.226530612244898e-05, "loss": 1.3475, "step": 1838 }, { "epoch": 0.3678, "grad_norm": 14.825552940368652, "learning_rate": 3.225510204081632e-05, "loss": 1.2212, "step": 1839 }, { "epoch": 0.368, "grad_norm": 10.312542915344238, "learning_rate": 3.224489795918367e-05, "loss": 2.8721, "step": 1840 }, { "epoch": 0.3682, "grad_norm": 8.35904598236084, "learning_rate": 3.223469387755102e-05, "loss": 1.1117, "step": 1841 }, { "epoch": 0.3684, "grad_norm": 21.328636169433594, "learning_rate": 3.2224489795918364e-05, "loss": 1.2698, "step": 1842 }, { "epoch": 0.3686, "grad_norm": 11.115662574768066, "learning_rate": 3.221428571428571e-05, "loss": 1.2428, "step": 1843 }, { "epoch": 0.3688, "grad_norm": 8.736370086669922, "learning_rate": 3.220408163265306e-05, "loss": 2.8465, "step": 1844 }, { "epoch": 0.369, "grad_norm": 4.694275379180908, "learning_rate": 3.2193877551020405e-05, "loss": 0.6521, "step": 1845 }, { "epoch": 0.3692, "grad_norm": 20.076202392578125, "learning_rate": 3.2183673469387754e-05, "loss": 1.2942, "step": 1846 }, { "epoch": 0.3694, "grad_norm": 3.7784104347229004, "learning_rate": 3.21734693877551e-05, "loss": 0.8574, "step": 1847 }, { "epoch": 0.3696, "grad_norm": 3.9411227703094482, "learning_rate": 3.216326530612245e-05, "loss": 0.6632, "step": 1848 }, { "epoch": 0.3698, "grad_norm": 3.7799296379089355, "learning_rate": 3.21530612244898e-05, "loss": 0.8616, "step": 1849 }, { "epoch": 0.37, "grad_norm": 5.967972278594971, "learning_rate": 3.2142857142857144e-05, "loss": 0.9687, "step": 1850 }, { "epoch": 0.3702, "grad_norm": 6.866252899169922, "learning_rate": 3.213265306122449e-05, "loss": 2.7619, "step": 1851 }, { "epoch": 0.3704, "grad_norm": 3.4781665802001953, "learning_rate": 3.212244897959184e-05, "loss": 0.8378, "step": 1852 }, { "epoch": 0.3706, "grad_norm": 12.557558059692383, "learning_rate": 3.2112244897959185e-05, "loss": 1.2461, "step": 1853 }, { "epoch": 0.3708, "grad_norm": 3.999380111694336, "learning_rate": 3.2102040816326534e-05, "loss": 0.9952, "step": 1854 }, { "epoch": 0.371, "grad_norm": 6.250697612762451, "learning_rate": 3.209183673469388e-05, "loss": 1.1547, "step": 1855 }, { "epoch": 0.3712, "grad_norm": 3.5486440658569336, "learning_rate": 3.2081632653061225e-05, "loss": 0.8971, "step": 1856 }, { "epoch": 0.3714, "grad_norm": 4.229161262512207, "learning_rate": 3.2071428571428575e-05, "loss": 0.6361, "step": 1857 }, { "epoch": 0.3716, "grad_norm": 8.195026397705078, "learning_rate": 3.2061224489795924e-05, "loss": 3.408, "step": 1858 }, { "epoch": 0.3718, "grad_norm": 8.057538986206055, "learning_rate": 3.2051020408163266e-05, "loss": 1.2027, "step": 1859 }, { "epoch": 0.372, "grad_norm": 3.8987393379211426, "learning_rate": 3.2040816326530615e-05, "loss": 0.9229, "step": 1860 }, { "epoch": 0.3722, "grad_norm": 10.013763427734375, "learning_rate": 3.2030612244897965e-05, "loss": 3.5451, "step": 1861 }, { "epoch": 0.3724, "grad_norm": 6.321132183074951, "learning_rate": 3.202040816326531e-05, "loss": 1.0868, "step": 1862 }, { "epoch": 0.3726, "grad_norm": 3.6911380290985107, "learning_rate": 3.2010204081632656e-05, "loss": 0.9743, "step": 1863 }, { "epoch": 0.3728, "grad_norm": 6.979063510894775, "learning_rate": 3.2000000000000005e-05, "loss": 1.0945, "step": 1864 }, { "epoch": 0.373, "grad_norm": 6.380610942840576, "learning_rate": 3.198979591836735e-05, "loss": 1.0856, "step": 1865 }, { "epoch": 0.3732, "grad_norm": 4.044541835784912, "learning_rate": 3.19795918367347e-05, "loss": 0.6641, "step": 1866 }, { "epoch": 0.3734, "grad_norm": 4.208895206451416, "learning_rate": 3.196938775510204e-05, "loss": 0.8649, "step": 1867 }, { "epoch": 0.3736, "grad_norm": 3.6989431381225586, "learning_rate": 3.195918367346939e-05, "loss": 0.9247, "step": 1868 }, { "epoch": 0.3738, "grad_norm": 6.997406005859375, "learning_rate": 3.194897959183674e-05, "loss": 1.1066, "step": 1869 }, { "epoch": 0.374, "grad_norm": 10.595443725585938, "learning_rate": 3.193877551020408e-05, "loss": 3.605, "step": 1870 }, { "epoch": 0.3742, "grad_norm": 5.664329528808594, "learning_rate": 3.192857142857143e-05, "loss": 1.0672, "step": 1871 }, { "epoch": 0.3744, "grad_norm": 3.9504380226135254, "learning_rate": 3.191836734693878e-05, "loss": 1.0203, "step": 1872 }, { "epoch": 0.3746, "grad_norm": 4.3827996253967285, "learning_rate": 3.190816326530612e-05, "loss": 0.7952, "step": 1873 }, { "epoch": 0.3748, "grad_norm": 3.58048152923584, "learning_rate": 3.189795918367347e-05, "loss": 0.8945, "step": 1874 }, { "epoch": 0.375, "grad_norm": 7.100594520568848, "learning_rate": 3.188775510204082e-05, "loss": 1.0029, "step": 1875 }, { "epoch": 0.3752, "grad_norm": 5.567595481872559, "learning_rate": 3.187755102040816e-05, "loss": 0.9961, "step": 1876 }, { "epoch": 0.3754, "grad_norm": 6.2788405418396, "learning_rate": 3.186734693877551e-05, "loss": 1.2381, "step": 1877 }, { "epoch": 0.3756, "grad_norm": 10.337995529174805, "learning_rate": 3.185714285714286e-05, "loss": 3.6624, "step": 1878 }, { "epoch": 0.3758, "grad_norm": 3.7018885612487793, "learning_rate": 3.18469387755102e-05, "loss": 1.0117, "step": 1879 }, { "epoch": 0.376, "grad_norm": 3.6972994804382324, "learning_rate": 3.183673469387755e-05, "loss": 0.803, "step": 1880 }, { "epoch": 0.3762, "grad_norm": 3.676743268966675, "learning_rate": 3.18265306122449e-05, "loss": 0.8658, "step": 1881 }, { "epoch": 0.3764, "grad_norm": 3.5221686363220215, "learning_rate": 3.181632653061224e-05, "loss": 0.5166, "step": 1882 }, { "epoch": 0.3766, "grad_norm": 9.288930892944336, "learning_rate": 3.180612244897959e-05, "loss": 3.1491, "step": 1883 }, { "epoch": 0.3768, "grad_norm": 9.832517623901367, "learning_rate": 3.179591836734694e-05, "loss": 1.2434, "step": 1884 }, { "epoch": 0.377, "grad_norm": 7.8202223777771, "learning_rate": 3.1785714285714284e-05, "loss": 1.105, "step": 1885 }, { "epoch": 0.3772, "grad_norm": 5.503623962402344, "learning_rate": 3.177551020408163e-05, "loss": 0.9261, "step": 1886 }, { "epoch": 0.3774, "grad_norm": 3.5174360275268555, "learning_rate": 3.176530612244898e-05, "loss": 0.5461, "step": 1887 }, { "epoch": 0.3776, "grad_norm": 9.028364181518555, "learning_rate": 3.1755102040816325e-05, "loss": 3.6403, "step": 1888 }, { "epoch": 0.3778, "grad_norm": 9.071781158447266, "learning_rate": 3.1744897959183674e-05, "loss": 3.6504, "step": 1889 }, { "epoch": 0.378, "grad_norm": 8.815235137939453, "learning_rate": 3.1734693877551016e-05, "loss": 3.2857, "step": 1890 }, { "epoch": 0.3782, "grad_norm": 3.7210514545440674, "learning_rate": 3.1724489795918365e-05, "loss": 0.9614, "step": 1891 }, { "epoch": 0.3784, "grad_norm": 10.739581108093262, "learning_rate": 3.1714285714285715e-05, "loss": 3.3892, "step": 1892 }, { "epoch": 0.3786, "grad_norm": 9.99426555633545, "learning_rate": 3.1704081632653064e-05, "loss": 3.0964, "step": 1893 }, { "epoch": 0.3788, "grad_norm": 3.962157964706421, "learning_rate": 3.169387755102041e-05, "loss": 0.9553, "step": 1894 }, { "epoch": 0.379, "grad_norm": 5.60536003112793, "learning_rate": 3.168367346938776e-05, "loss": 1.024, "step": 1895 }, { "epoch": 0.3792, "grad_norm": 8.856130599975586, "learning_rate": 3.1673469387755105e-05, "loss": 1.236, "step": 1896 }, { "epoch": 0.3794, "grad_norm": 5.325572490692139, "learning_rate": 3.1663265306122454e-05, "loss": 1.1324, "step": 1897 }, { "epoch": 0.3796, "grad_norm": 3.662156343460083, "learning_rate": 3.1653061224489796e-05, "loss": 0.8391, "step": 1898 }, { "epoch": 0.3798, "grad_norm": 6.837795734405518, "learning_rate": 3.1642857142857145e-05, "loss": 1.0977, "step": 1899 }, { "epoch": 0.38, "grad_norm": 6.636470317840576, "learning_rate": 3.1632653061224494e-05, "loss": 1.1679, "step": 1900 }, { "epoch": 0.3802, "grad_norm": 5.536612033843994, "learning_rate": 3.162244897959184e-05, "loss": 0.9724, "step": 1901 }, { "epoch": 0.3804, "grad_norm": 5.437753200531006, "learning_rate": 3.1612244897959186e-05, "loss": 0.9618, "step": 1902 }, { "epoch": 0.3806, "grad_norm": 5.53395938873291, "learning_rate": 3.1602040816326535e-05, "loss": 0.9768, "step": 1903 }, { "epoch": 0.3808, "grad_norm": 26.427989959716797, "learning_rate": 3.159183673469388e-05, "loss": 3.7384, "step": 1904 }, { "epoch": 0.381, "grad_norm": 3.5214948654174805, "learning_rate": 3.158163265306123e-05, "loss": 0.5824, "step": 1905 }, { "epoch": 0.3812, "grad_norm": 4.140063762664795, "learning_rate": 3.1571428571428576e-05, "loss": 0.6823, "step": 1906 }, { "epoch": 0.3814, "grad_norm": 5.837836265563965, "learning_rate": 3.156122448979592e-05, "loss": 1.1179, "step": 1907 }, { "epoch": 0.3816, "grad_norm": 5.40811824798584, "learning_rate": 3.155102040816327e-05, "loss": 1.0316, "step": 1908 }, { "epoch": 0.3818, "grad_norm": 4.175589084625244, "learning_rate": 3.154081632653062e-05, "loss": 0.9813, "step": 1909 }, { "epoch": 0.382, "grad_norm": 5.406588554382324, "learning_rate": 3.153061224489796e-05, "loss": 0.8905, "step": 1910 }, { "epoch": 0.3822, "grad_norm": 3.529309034347534, "learning_rate": 3.152040816326531e-05, "loss": 0.9562, "step": 1911 }, { "epoch": 0.3824, "grad_norm": 11.12568187713623, "learning_rate": 3.151020408163266e-05, "loss": 1.2959, "step": 1912 }, { "epoch": 0.3826, "grad_norm": 7.0006184577941895, "learning_rate": 3.15e-05, "loss": 1.0464, "step": 1913 }, { "epoch": 0.3828, "grad_norm": 3.590198040008545, "learning_rate": 3.148979591836735e-05, "loss": 0.9875, "step": 1914 }, { "epoch": 0.383, "grad_norm": 5.859086513519287, "learning_rate": 3.14795918367347e-05, "loss": 1.0345, "step": 1915 }, { "epoch": 0.3832, "grad_norm": 3.9727370738983154, "learning_rate": 3.146938775510204e-05, "loss": 0.6396, "step": 1916 }, { "epoch": 0.3834, "grad_norm": 3.9990367889404297, "learning_rate": 3.145918367346939e-05, "loss": 0.6313, "step": 1917 }, { "epoch": 0.3836, "grad_norm": 4.18431282043457, "learning_rate": 3.144897959183674e-05, "loss": 0.8224, "step": 1918 }, { "epoch": 0.3838, "grad_norm": 5.706707954406738, "learning_rate": 3.143877551020408e-05, "loss": 1.0675, "step": 1919 }, { "epoch": 0.384, "grad_norm": 3.66581392288208, "learning_rate": 3.142857142857143e-05, "loss": 0.9332, "step": 1920 }, { "epoch": 0.3842, "grad_norm": 3.9625906944274902, "learning_rate": 3.141836734693877e-05, "loss": 0.6328, "step": 1921 }, { "epoch": 0.3844, "grad_norm": 17.030656814575195, "learning_rate": 3.140816326530612e-05, "loss": 3.9366, "step": 1922 }, { "epoch": 0.3846, "grad_norm": 5.616252899169922, "learning_rate": 3.139795918367347e-05, "loss": 0.9011, "step": 1923 }, { "epoch": 0.3848, "grad_norm": 6.17678689956665, "learning_rate": 3.1387755102040814e-05, "loss": 1.0409, "step": 1924 }, { "epoch": 0.385, "grad_norm": 4.09219217300415, "learning_rate": 3.137755102040816e-05, "loss": 0.6887, "step": 1925 }, { "epoch": 0.3852, "grad_norm": 8.822908401489258, "learning_rate": 3.136734693877551e-05, "loss": 3.6639, "step": 1926 }, { "epoch": 0.3854, "grad_norm": 5.703184604644775, "learning_rate": 3.1357142857142855e-05, "loss": 0.9716, "step": 1927 }, { "epoch": 0.3856, "grad_norm": 5.323953151702881, "learning_rate": 3.1346938775510204e-05, "loss": 0.9321, "step": 1928 }, { "epoch": 0.3858, "grad_norm": 3.9743220806121826, "learning_rate": 3.133673469387755e-05, "loss": 0.809, "step": 1929 }, { "epoch": 0.386, "grad_norm": 3.9991021156311035, "learning_rate": 3.1326530612244895e-05, "loss": 0.5908, "step": 1930 }, { "epoch": 0.3862, "grad_norm": 3.7616851329803467, "learning_rate": 3.1316326530612245e-05, "loss": 0.906, "step": 1931 }, { "epoch": 0.3864, "grad_norm": 11.455646514892578, "learning_rate": 3.1306122448979594e-05, "loss": 1.278, "step": 1932 }, { "epoch": 0.3866, "grad_norm": 5.890268802642822, "learning_rate": 3.1295918367346936e-05, "loss": 1.075, "step": 1933 }, { "epoch": 0.3868, "grad_norm": 10.48533821105957, "learning_rate": 3.1285714285714285e-05, "loss": 3.1911, "step": 1934 }, { "epoch": 0.387, "grad_norm": 3.556635856628418, "learning_rate": 3.1275510204081634e-05, "loss": 0.9989, "step": 1935 }, { "epoch": 0.3872, "grad_norm": 9.573206901550293, "learning_rate": 3.126530612244898e-05, "loss": 2.9485, "step": 1936 }, { "epoch": 0.3874, "grad_norm": 5.702581882476807, "learning_rate": 3.1255102040816326e-05, "loss": 1.0445, "step": 1937 }, { "epoch": 0.3876, "grad_norm": 5.90617561340332, "learning_rate": 3.1244897959183675e-05, "loss": 1.0362, "step": 1938 }, { "epoch": 0.3878, "grad_norm": 5.865963459014893, "learning_rate": 3.1234693877551024e-05, "loss": 1.0315, "step": 1939 }, { "epoch": 0.388, "grad_norm": 6.144934177398682, "learning_rate": 3.1224489795918374e-05, "loss": 1.0469, "step": 1940 }, { "epoch": 0.3882, "grad_norm": 5.400904178619385, "learning_rate": 3.1214285714285716e-05, "loss": 0.9472, "step": 1941 }, { "epoch": 0.3884, "grad_norm": 5.631744384765625, "learning_rate": 3.1204081632653065e-05, "loss": 1.0641, "step": 1942 }, { "epoch": 0.3886, "grad_norm": 9.171345710754395, "learning_rate": 3.1193877551020414e-05, "loss": 1.2373, "step": 1943 }, { "epoch": 0.3888, "grad_norm": 4.0366530418396, "learning_rate": 3.118367346938776e-05, "loss": 0.5937, "step": 1944 }, { "epoch": 0.389, "grad_norm": 3.8050081729888916, "learning_rate": 3.1173469387755106e-05, "loss": 0.8319, "step": 1945 }, { "epoch": 0.3892, "grad_norm": 5.817963123321533, "learning_rate": 3.1163265306122455e-05, "loss": 1.0481, "step": 1946 }, { "epoch": 0.3894, "grad_norm": 6.208949089050293, "learning_rate": 3.11530612244898e-05, "loss": 1.0638, "step": 1947 }, { "epoch": 0.3896, "grad_norm": 3.700920343399048, "learning_rate": 3.114285714285715e-05, "loss": 0.6374, "step": 1948 }, { "epoch": 0.3898, "grad_norm": 5.019497871398926, "learning_rate": 3.1132653061224496e-05, "loss": 0.9131, "step": 1949 }, { "epoch": 0.39, "grad_norm": 21.337099075317383, "learning_rate": 3.112244897959184e-05, "loss": 3.1491, "step": 1950 }, { "epoch": 0.3902, "grad_norm": 18.03508758544922, "learning_rate": 3.111224489795919e-05, "loss": 3.3816, "step": 1951 }, { "epoch": 0.3904, "grad_norm": 4.190047740936279, "learning_rate": 3.110204081632653e-05, "loss": 0.7675, "step": 1952 }, { "epoch": 0.3906, "grad_norm": 14.109248161315918, "learning_rate": 3.109183673469388e-05, "loss": 1.2913, "step": 1953 }, { "epoch": 0.3908, "grad_norm": 4.065311431884766, "learning_rate": 3.108163265306123e-05, "loss": 0.6556, "step": 1954 }, { "epoch": 0.391, "grad_norm": 3.806204080581665, "learning_rate": 3.107142857142857e-05, "loss": 0.8599, "step": 1955 }, { "epoch": 0.3912, "grad_norm": 14.704195976257324, "learning_rate": 3.106122448979592e-05, "loss": 1.2845, "step": 1956 }, { "epoch": 0.3914, "grad_norm": 17.34805679321289, "learning_rate": 3.105102040816327e-05, "loss": 1.3075, "step": 1957 }, { "epoch": 0.3916, "grad_norm": 5.221651077270508, "learning_rate": 3.104081632653061e-05, "loss": 1.0788, "step": 1958 }, { "epoch": 0.3918, "grad_norm": 20.793325424194336, "learning_rate": 3.103061224489796e-05, "loss": 1.2658, "step": 1959 }, { "epoch": 0.392, "grad_norm": 3.698373556137085, "learning_rate": 3.102040816326531e-05, "loss": 0.9564, "step": 1960 }, { "epoch": 0.3922, "grad_norm": 6.547873020172119, "learning_rate": 3.101020408163265e-05, "loss": 1.096, "step": 1961 }, { "epoch": 0.3924, "grad_norm": 3.6642978191375732, "learning_rate": 3.1e-05, "loss": 0.9244, "step": 1962 }, { "epoch": 0.3926, "grad_norm": 4.003145694732666, "learning_rate": 3.098979591836735e-05, "loss": 0.6351, "step": 1963 }, { "epoch": 0.3928, "grad_norm": 37.91643524169922, "learning_rate": 3.097959183673469e-05, "loss": 4.2161, "step": 1964 }, { "epoch": 0.393, "grad_norm": 5.784246444702148, "learning_rate": 3.096938775510204e-05, "loss": 0.9189, "step": 1965 }, { "epoch": 0.3932, "grad_norm": 3.4148221015930176, "learning_rate": 3.095918367346939e-05, "loss": 0.7949, "step": 1966 }, { "epoch": 0.3934, "grad_norm": 3.614133834838867, "learning_rate": 3.0948979591836734e-05, "loss": 0.5784, "step": 1967 }, { "epoch": 0.3936, "grad_norm": 8.935428619384766, "learning_rate": 3.093877551020408e-05, "loss": 1.1826, "step": 1968 }, { "epoch": 0.3938, "grad_norm": 5.664531230926514, "learning_rate": 3.092857142857143e-05, "loss": 0.9772, "step": 1969 }, { "epoch": 0.394, "grad_norm": 3.752865791320801, "learning_rate": 3.0918367346938774e-05, "loss": 0.613, "step": 1970 }, { "epoch": 0.3942, "grad_norm": 15.220873832702637, "learning_rate": 3.0908163265306124e-05, "loss": 3.7921, "step": 1971 }, { "epoch": 0.3944, "grad_norm": 3.931366443634033, "learning_rate": 3.0897959183673466e-05, "loss": 0.6618, "step": 1972 }, { "epoch": 0.3946, "grad_norm": 3.9736874103546143, "learning_rate": 3.0887755102040815e-05, "loss": 0.6155, "step": 1973 }, { "epoch": 0.3948, "grad_norm": 6.704489707946777, "learning_rate": 3.0877551020408164e-05, "loss": 1.1973, "step": 1974 }, { "epoch": 0.395, "grad_norm": 8.21762466430664, "learning_rate": 3.086734693877551e-05, "loss": 1.3387, "step": 1975 }, { "epoch": 0.3952, "grad_norm": 5.635308742523193, "learning_rate": 3.0857142857142856e-05, "loss": 1.0915, "step": 1976 }, { "epoch": 0.3954, "grad_norm": 3.429318428039551, "learning_rate": 3.0846938775510205e-05, "loss": 0.8407, "step": 1977 }, { "epoch": 0.3956, "grad_norm": 5.429908275604248, "learning_rate": 3.083673469387755e-05, "loss": 0.9866, "step": 1978 }, { "epoch": 0.3958, "grad_norm": 3.98653507232666, "learning_rate": 3.08265306122449e-05, "loss": 1.0904, "step": 1979 }, { "epoch": 0.396, "grad_norm": 3.7446930408477783, "learning_rate": 3.0816326530612246e-05, "loss": 0.9527, "step": 1980 }, { "epoch": 0.3962, "grad_norm": 3.985771417617798, "learning_rate": 3.080612244897959e-05, "loss": 0.5949, "step": 1981 }, { "epoch": 0.3964, "grad_norm": 5.2019453048706055, "learning_rate": 3.079591836734694e-05, "loss": 0.9761, "step": 1982 }, { "epoch": 0.3966, "grad_norm": 3.969365119934082, "learning_rate": 3.078571428571429e-05, "loss": 0.6185, "step": 1983 }, { "epoch": 0.3968, "grad_norm": 3.3323633670806885, "learning_rate": 3.0775510204081636e-05, "loss": 0.9703, "step": 1984 }, { "epoch": 0.397, "grad_norm": 19.202472686767578, "learning_rate": 3.0765306122448985e-05, "loss": 3.8001, "step": 1985 }, { "epoch": 0.3972, "grad_norm": 3.3859543800354004, "learning_rate": 3.075510204081633e-05, "loss": 0.959, "step": 1986 }, { "epoch": 0.3974, "grad_norm": 3.364506244659424, "learning_rate": 3.074489795918368e-05, "loss": 0.9518, "step": 1987 }, { "epoch": 0.3976, "grad_norm": 11.729973793029785, "learning_rate": 3.0734693877551026e-05, "loss": 4.0912, "step": 1988 }, { "epoch": 0.3978, "grad_norm": 9.226360321044922, "learning_rate": 3.072448979591837e-05, "loss": 4.0417, "step": 1989 }, { "epoch": 0.398, "grad_norm": 8.097306251525879, "learning_rate": 3.071428571428572e-05, "loss": 1.2364, "step": 1990 }, { "epoch": 0.3982, "grad_norm": 5.448502063751221, "learning_rate": 3.0704081632653067e-05, "loss": 1.0877, "step": 1991 }, { "epoch": 0.3984, "grad_norm": 3.436793327331543, "learning_rate": 3.069387755102041e-05, "loss": 0.8769, "step": 1992 }, { "epoch": 0.3986, "grad_norm": 3.4772262573242188, "learning_rate": 3.068367346938776e-05, "loss": 0.9147, "step": 1993 }, { "epoch": 0.3988, "grad_norm": 10.667865753173828, "learning_rate": 3.067346938775511e-05, "loss": 1.2431, "step": 1994 }, { "epoch": 0.399, "grad_norm": 3.464256763458252, "learning_rate": 3.066326530612245e-05, "loss": 0.9025, "step": 1995 }, { "epoch": 0.3992, "grad_norm": 4.23721170425415, "learning_rate": 3.06530612244898e-05, "loss": 0.6652, "step": 1996 }, { "epoch": 0.3994, "grad_norm": 4.40722131729126, "learning_rate": 3.064285714285715e-05, "loss": 0.5962, "step": 1997 }, { "epoch": 0.3996, "grad_norm": 5.087538719177246, "learning_rate": 3.063265306122449e-05, "loss": 1.0236, "step": 1998 }, { "epoch": 0.3998, "grad_norm": 3.472217321395874, "learning_rate": 3.062244897959184e-05, "loss": 0.8398, "step": 1999 }, { "epoch": 0.4, "grad_norm": 5.690279960632324, "learning_rate": 3.061224489795919e-05, "loss": 0.9392, "step": 2000 }, { "epoch": 0.4002, "grad_norm": 6.091668605804443, "learning_rate": 3.060204081632653e-05, "loss": 1.065, "step": 2001 }, { "epoch": 0.4004, "grad_norm": 5.0249924659729, "learning_rate": 3.059183673469388e-05, "loss": 1.0291, "step": 2002 }, { "epoch": 0.4006, "grad_norm": 4.939494609832764, "learning_rate": 3.058163265306123e-05, "loss": 0.8855, "step": 2003 }, { "epoch": 0.4008, "grad_norm": 4.587623119354248, "learning_rate": 3.057142857142857e-05, "loss": 0.9673, "step": 2004 }, { "epoch": 0.401, "grad_norm": 7.077375411987305, "learning_rate": 3.056122448979592e-05, "loss": 0.9796, "step": 2005 }, { "epoch": 0.4012, "grad_norm": 5.092260837554932, "learning_rate": 3.0551020408163264e-05, "loss": 0.9718, "step": 2006 }, { "epoch": 0.4014, "grad_norm": 3.5044186115264893, "learning_rate": 3.054081632653061e-05, "loss": 0.8064, "step": 2007 }, { "epoch": 0.4016, "grad_norm": 6.15052604675293, "learning_rate": 3.053061224489796e-05, "loss": 0.9817, "step": 2008 }, { "epoch": 0.4018, "grad_norm": 9.287921905517578, "learning_rate": 3.0520408163265304e-05, "loss": 1.2837, "step": 2009 }, { "epoch": 0.402, "grad_norm": 3.6782681941986084, "learning_rate": 3.0510204081632654e-05, "loss": 0.859, "step": 2010 }, { "epoch": 0.4022, "grad_norm": 5.276793479919434, "learning_rate": 3.05e-05, "loss": 0.6698, "step": 2011 }, { "epoch": 0.4024, "grad_norm": 4.4981560707092285, "learning_rate": 3.048979591836735e-05, "loss": 0.6247, "step": 2012 }, { "epoch": 0.4026, "grad_norm": 4.182480335235596, "learning_rate": 3.0479591836734694e-05, "loss": 0.9111, "step": 2013 }, { "epoch": 0.4028, "grad_norm": 4.58298397064209, "learning_rate": 3.046938775510204e-05, "loss": 0.9857, "step": 2014 }, { "epoch": 0.403, "grad_norm": 7.412697792053223, "learning_rate": 3.045918367346939e-05, "loss": 1.0283, "step": 2015 }, { "epoch": 0.4032, "grad_norm": 4.1168694496154785, "learning_rate": 3.0448979591836735e-05, "loss": 0.6491, "step": 2016 }, { "epoch": 0.4034, "grad_norm": 3.5667405128479004, "learning_rate": 3.043877551020408e-05, "loss": 0.9392, "step": 2017 }, { "epoch": 0.4036, "grad_norm": 6.736431121826172, "learning_rate": 3.042857142857143e-05, "loss": 1.0647, "step": 2018 }, { "epoch": 0.4038, "grad_norm": 3.888291120529175, "learning_rate": 3.0418367346938776e-05, "loss": 0.8569, "step": 2019 }, { "epoch": 0.404, "grad_norm": 6.160373687744141, "learning_rate": 3.040816326530612e-05, "loss": 1.0167, "step": 2020 }, { "epoch": 0.4042, "grad_norm": 4.123867511749268, "learning_rate": 3.0397959183673467e-05, "loss": 0.8689, "step": 2021 }, { "epoch": 0.4044, "grad_norm": 13.053849220275879, "learning_rate": 3.0387755102040817e-05, "loss": 1.236, "step": 2022 }, { "epoch": 0.4046, "grad_norm": 9.465170860290527, "learning_rate": 3.0377551020408162e-05, "loss": 1.2494, "step": 2023 }, { "epoch": 0.4048, "grad_norm": 7.582245349884033, "learning_rate": 3.0367346938775508e-05, "loss": 1.214, "step": 2024 }, { "epoch": 0.405, "grad_norm": 7.706305980682373, "learning_rate": 3.0357142857142857e-05, "loss": 1.0576, "step": 2025 }, { "epoch": 0.4052, "grad_norm": 14.02505874633789, "learning_rate": 3.0346938775510203e-05, "loss": 1.0928, "step": 2026 }, { "epoch": 0.4054, "grad_norm": 7.376889705657959, "learning_rate": 3.033673469387755e-05, "loss": 1.2182, "step": 2027 }, { "epoch": 0.4056, "grad_norm": 3.411616086959839, "learning_rate": 3.0326530612244898e-05, "loss": 0.8649, "step": 2028 }, { "epoch": 0.4058, "grad_norm": 11.657422065734863, "learning_rate": 3.031632653061225e-05, "loss": 1.2291, "step": 2029 }, { "epoch": 0.406, "grad_norm": 4.260434150695801, "learning_rate": 3.0306122448979597e-05, "loss": 0.5884, "step": 2030 }, { "epoch": 0.4062, "grad_norm": 8.48438835144043, "learning_rate": 3.0295918367346942e-05, "loss": 1.062, "step": 2031 }, { "epoch": 0.4064, "grad_norm": 3.534970998764038, "learning_rate": 3.0285714285714288e-05, "loss": 0.889, "step": 2032 }, { "epoch": 0.4066, "grad_norm": 3.441049337387085, "learning_rate": 3.0275510204081637e-05, "loss": 0.958, "step": 2033 }, { "epoch": 0.4068, "grad_norm": 3.2886886596679688, "learning_rate": 3.0265306122448983e-05, "loss": 0.786, "step": 2034 }, { "epoch": 0.407, "grad_norm": 3.797276020050049, "learning_rate": 3.025510204081633e-05, "loss": 0.5687, "step": 2035 }, { "epoch": 0.4072, "grad_norm": 3.5251455307006836, "learning_rate": 3.0244897959183678e-05, "loss": 0.7998, "step": 2036 }, { "epoch": 0.4074, "grad_norm": 3.2015249729156494, "learning_rate": 3.0234693877551024e-05, "loss": 0.743, "step": 2037 }, { "epoch": 0.4076, "grad_norm": 3.5168328285217285, "learning_rate": 3.022448979591837e-05, "loss": 0.8678, "step": 2038 }, { "epoch": 0.4078, "grad_norm": 3.90354061126709, "learning_rate": 3.021428571428572e-05, "loss": 0.6304, "step": 2039 }, { "epoch": 0.408, "grad_norm": 3.3159353733062744, "learning_rate": 3.0204081632653065e-05, "loss": 0.7902, "step": 2040 }, { "epoch": 0.4082, "grad_norm": 3.9887642860412598, "learning_rate": 3.019387755102041e-05, "loss": 0.545, "step": 2041 }, { "epoch": 0.4084, "grad_norm": 3.527355670928955, "learning_rate": 3.0183673469387756e-05, "loss": 0.7923, "step": 2042 }, { "epoch": 0.4086, "grad_norm": 3.8808462619781494, "learning_rate": 3.0173469387755105e-05, "loss": 0.9774, "step": 2043 }, { "epoch": 0.4088, "grad_norm": 3.7496337890625, "learning_rate": 3.016326530612245e-05, "loss": 0.7932, "step": 2044 }, { "epoch": 0.409, "grad_norm": 3.646125316619873, "learning_rate": 3.0153061224489797e-05, "loss": 0.9224, "step": 2045 }, { "epoch": 0.4092, "grad_norm": 3.7936694622039795, "learning_rate": 3.0142857142857146e-05, "loss": 0.9792, "step": 2046 }, { "epoch": 0.4094, "grad_norm": 4.863035202026367, "learning_rate": 3.0132653061224492e-05, "loss": 0.7245, "step": 2047 }, { "epoch": 0.4096, "grad_norm": 4.080334186553955, "learning_rate": 3.0122448979591838e-05, "loss": 0.5945, "step": 2048 }, { "epoch": 0.4098, "grad_norm": 3.421778440475464, "learning_rate": 3.0112244897959187e-05, "loss": 0.9618, "step": 2049 }, { "epoch": 0.41, "grad_norm": 3.776970624923706, "learning_rate": 3.0102040816326533e-05, "loss": 0.9939, "step": 2050 }, { "epoch": 0.4102, "grad_norm": 11.517478942871094, "learning_rate": 3.009183673469388e-05, "loss": 1.2231, "step": 2051 }, { "epoch": 0.4104, "grad_norm": 3.593852996826172, "learning_rate": 3.0081632653061224e-05, "loss": 0.8188, "step": 2052 }, { "epoch": 0.4106, "grad_norm": 3.9655921459198, "learning_rate": 3.0071428571428573e-05, "loss": 0.5983, "step": 2053 }, { "epoch": 0.4108, "grad_norm": 7.757854461669922, "learning_rate": 3.006122448979592e-05, "loss": 1.2068, "step": 2054 }, { "epoch": 0.411, "grad_norm": 3.8167548179626465, "learning_rate": 3.0051020408163265e-05, "loss": 0.8267, "step": 2055 }, { "epoch": 0.4112, "grad_norm": 3.506699323654175, "learning_rate": 3.0040816326530614e-05, "loss": 0.8534, "step": 2056 }, { "epoch": 0.4114, "grad_norm": 6.759268760681152, "learning_rate": 3.003061224489796e-05, "loss": 1.2464, "step": 2057 }, { "epoch": 0.4116, "grad_norm": 3.3585410118103027, "learning_rate": 3.0020408163265306e-05, "loss": 0.8959, "step": 2058 }, { "epoch": 0.4118, "grad_norm": 3.979999303817749, "learning_rate": 3.0010204081632655e-05, "loss": 0.5908, "step": 2059 }, { "epoch": 0.412, "grad_norm": 3.32076096534729, "learning_rate": 3e-05, "loss": 0.8333, "step": 2060 }, { "epoch": 0.4122, "grad_norm": 3.9741761684417725, "learning_rate": 2.9989795918367347e-05, "loss": 0.603, "step": 2061 }, { "epoch": 0.4124, "grad_norm": 3.654820442199707, "learning_rate": 2.9979591836734692e-05, "loss": 0.5471, "step": 2062 }, { "epoch": 0.4126, "grad_norm": 3.9359383583068848, "learning_rate": 2.996938775510204e-05, "loss": 0.6082, "step": 2063 }, { "epoch": 0.4128, "grad_norm": 3.843722105026245, "learning_rate": 2.9959183673469387e-05, "loss": 0.597, "step": 2064 }, { "epoch": 0.413, "grad_norm": 8.112070083618164, "learning_rate": 2.9948979591836733e-05, "loss": 1.2302, "step": 2065 }, { "epoch": 0.4132, "grad_norm": 7.104030132293701, "learning_rate": 2.9938775510204082e-05, "loss": 1.2015, "step": 2066 }, { "epoch": 0.4134, "grad_norm": 10.215073585510254, "learning_rate": 2.9928571428571428e-05, "loss": 1.1589, "step": 2067 }, { "epoch": 0.4136, "grad_norm": 3.378031015396118, "learning_rate": 2.9918367346938774e-05, "loss": 0.9331, "step": 2068 }, { "epoch": 0.4138, "grad_norm": 7.290038585662842, "learning_rate": 2.9908163265306123e-05, "loss": 1.196, "step": 2069 }, { "epoch": 0.414, "grad_norm": 4.649845600128174, "learning_rate": 2.989795918367347e-05, "loss": 0.6947, "step": 2070 }, { "epoch": 0.4142, "grad_norm": 6.744777679443359, "learning_rate": 2.9887755102040815e-05, "loss": 1.2755, "step": 2071 }, { "epoch": 0.4144, "grad_norm": 4.500767707824707, "learning_rate": 2.987755102040816e-05, "loss": 0.6236, "step": 2072 }, { "epoch": 0.4146, "grad_norm": 4.191307067871094, "learning_rate": 2.986734693877551e-05, "loss": 0.6218, "step": 2073 }, { "epoch": 0.4148, "grad_norm": 4.284450531005859, "learning_rate": 2.9857142857142862e-05, "loss": 0.6073, "step": 2074 }, { "epoch": 0.415, "grad_norm": 3.9721145629882812, "learning_rate": 2.9846938775510208e-05, "loss": 0.6127, "step": 2075 }, { "epoch": 0.4152, "grad_norm": 4.18171501159668, "learning_rate": 2.9836734693877554e-05, "loss": 0.6471, "step": 2076 }, { "epoch": 0.4154, "grad_norm": 3.992933511734009, "learning_rate": 2.9826530612244903e-05, "loss": 0.5906, "step": 2077 }, { "epoch": 0.4156, "grad_norm": 9.78303337097168, "learning_rate": 2.981632653061225e-05, "loss": 1.2285, "step": 2078 }, { "epoch": 0.4158, "grad_norm": 7.465812683105469, "learning_rate": 2.9806122448979595e-05, "loss": 1.2131, "step": 2079 }, { "epoch": 0.416, "grad_norm": 7.9334940910339355, "learning_rate": 2.9795918367346944e-05, "loss": 1.0955, "step": 2080 }, { "epoch": 0.4162, "grad_norm": 8.485240936279297, "learning_rate": 2.978571428571429e-05, "loss": 1.3146, "step": 2081 }, { "epoch": 0.4164, "grad_norm": 6.9409637451171875, "learning_rate": 2.9775510204081635e-05, "loss": 1.143, "step": 2082 }, { "epoch": 0.4166, "grad_norm": 7.945091247558594, "learning_rate": 2.976530612244898e-05, "loss": 1.1643, "step": 2083 }, { "epoch": 0.4168, "grad_norm": 18.780855178833008, "learning_rate": 2.975510204081633e-05, "loss": 0.8423, "step": 2084 }, { "epoch": 1.0002, "grad_norm": 15.242094039916992, "learning_rate": 2.9744897959183676e-05, "loss": 1.1254, "step": 2085 }, { "epoch": 1.0004, "grad_norm": 51.17671585083008, "learning_rate": 2.9734693877551022e-05, "loss": 3.1271, "step": 2086 }, { "epoch": 1.0006, "grad_norm": 2.6297574043273926, "learning_rate": 2.972448979591837e-05, "loss": 0.336, "step": 2087 }, { "epoch": 1.0008, "grad_norm": 8.47588062286377, "learning_rate": 2.9714285714285717e-05, "loss": 0.5913, "step": 2088 }, { "epoch": 1.001, "grad_norm": 7.058892726898193, "learning_rate": 2.9704081632653063e-05, "loss": 0.4798, "step": 2089 }, { "epoch": 1.0012, "grad_norm": 1.6835764646530151, "learning_rate": 2.9693877551020412e-05, "loss": 0.0532, "step": 2090 }, { "epoch": 1.0014, "grad_norm": 2.0876736640930176, "learning_rate": 2.9683673469387758e-05, "loss": 0.26, "step": 2091 }, { "epoch": 1.0016, "grad_norm": 2.948742628097534, "learning_rate": 2.9673469387755103e-05, "loss": 0.5332, "step": 2092 }, { "epoch": 1.0018, "grad_norm": 7.258228302001953, "learning_rate": 2.966326530612245e-05, "loss": 1.5851, "step": 2093 }, { "epoch": 1.002, "grad_norm": 44.64124298095703, "learning_rate": 2.96530612244898e-05, "loss": 0.4057, "step": 2094 }, { "epoch": 1.0022, "grad_norm": 90.02332305908203, "learning_rate": 2.9642857142857144e-05, "loss": 0.2384, "step": 2095 }, { "epoch": 1.0024, "grad_norm": 2.4125702381134033, "learning_rate": 2.963265306122449e-05, "loss": 0.4608, "step": 2096 }, { "epoch": 1.0026, "grad_norm": 3.5664608478546143, "learning_rate": 2.962244897959184e-05, "loss": 1.5708, "step": 2097 }, { "epoch": 1.0028, "grad_norm": 2.176363945007324, "learning_rate": 2.9612244897959185e-05, "loss": 0.232, "step": 2098 }, { "epoch": 1.003, "grad_norm": 4.480269908905029, "learning_rate": 2.960204081632653e-05, "loss": 0.2929, "step": 2099 }, { "epoch": 1.0032, "grad_norm": 4.131799697875977, "learning_rate": 2.959183673469388e-05, "loss": 1.0002, "step": 2100 }, { "epoch": 1.0034, "grad_norm": 15.502717018127441, "learning_rate": 2.9581632653061226e-05, "loss": 3.1208, "step": 2101 }, { "epoch": 1.0036, "grad_norm": 21.303125381469727, "learning_rate": 2.957142857142857e-05, "loss": 3.2585, "step": 2102 }, { "epoch": 1.0038, "grad_norm": 2.70643949508667, "learning_rate": 2.956122448979592e-05, "loss": 0.389, "step": 2103 }, { "epoch": 1.004, "grad_norm": 7.2120466232299805, "learning_rate": 2.9551020408163266e-05, "loss": 0.9416, "step": 2104 }, { "epoch": 1.0042, "grad_norm": 33.94477462768555, "learning_rate": 2.9540816326530612e-05, "loss": 1.8284, "step": 2105 }, { "epoch": 1.0044, "grad_norm": 98.7635498046875, "learning_rate": 2.9530612244897958e-05, "loss": 5.7999, "step": 2106 }, { "epoch": 1.0046, "grad_norm": 1.7753891944885254, "learning_rate": 2.9520408163265307e-05, "loss": 0.0865, "step": 2107 }, { "epoch": 1.0048, "grad_norm": 2.728239059448242, "learning_rate": 2.9510204081632653e-05, "loss": 0.2501, "step": 2108 }, { "epoch": 1.005, "grad_norm": 4.694189548492432, "learning_rate": 2.95e-05, "loss": 0.4039, "step": 2109 }, { "epoch": 1.0052, "grad_norm": 9.07700252532959, "learning_rate": 2.9489795918367348e-05, "loss": 0.85, "step": 2110 }, { "epoch": 1.0054, "grad_norm": 11.796422958374023, "learning_rate": 2.9479591836734694e-05, "loss": 1.6915, "step": 2111 }, { "epoch": 1.0056, "grad_norm": 15.555567741394043, "learning_rate": 2.946938775510204e-05, "loss": 5.2889, "step": 2112 }, { "epoch": 1.0058, "grad_norm": 3.4670357704162598, "learning_rate": 2.945918367346939e-05, "loss": 0.4462, "step": 2113 }, { "epoch": 1.006, "grad_norm": 3.21122407913208, "learning_rate": 2.9448979591836735e-05, "loss": 0.4042, "step": 2114 }, { "epoch": 1.0062, "grad_norm": 2.0091428756713867, "learning_rate": 2.943877551020408e-05, "loss": 0.2797, "step": 2115 }, { "epoch": 1.0064, "grad_norm": 3.6005043983459473, "learning_rate": 2.9428571428571426e-05, "loss": 0.2846, "step": 2116 }, { "epoch": 1.0066, "grad_norm": 8.382363319396973, "learning_rate": 2.9418367346938775e-05, "loss": 1.2789, "step": 2117 }, { "epoch": 1.0068, "grad_norm": 4.641507148742676, "learning_rate": 2.940816326530612e-05, "loss": 0.1561, "step": 2118 }, { "epoch": 1.007, "grad_norm": 4.312943458557129, "learning_rate": 2.9397959183673474e-05, "loss": 0.3616, "step": 2119 }, { "epoch": 1.0072, "grad_norm": 7.527416706085205, "learning_rate": 2.938775510204082e-05, "loss": 1.0976, "step": 2120 }, { "epoch": 1.0074, "grad_norm": 5.3701019287109375, "learning_rate": 2.937755102040817e-05, "loss": 1.6283, "step": 2121 }, { "epoch": 1.0076, "grad_norm": 2.4299004077911377, "learning_rate": 2.9367346938775514e-05, "loss": 0.1998, "step": 2122 }, { "epoch": 1.0078, "grad_norm": 3.094940662384033, "learning_rate": 2.935714285714286e-05, "loss": 0.9545, "step": 2123 }, { "epoch": 1.008, "grad_norm": 7.588383197784424, "learning_rate": 2.9346938775510206e-05, "loss": 2.3132, "step": 2124 }, { "epoch": 1.0082, "grad_norm": 14.091776847839355, "learning_rate": 2.9336734693877555e-05, "loss": 2.5728, "step": 2125 }, { "epoch": 1.0084, "grad_norm": 1.9190328121185303, "learning_rate": 2.93265306122449e-05, "loss": 0.2431, "step": 2126 }, { "epoch": 1.0086, "grad_norm": 5.02833366394043, "learning_rate": 2.9316326530612247e-05, "loss": 0.5762, "step": 2127 }, { "epoch": 1.0088, "grad_norm": 8.859189987182617, "learning_rate": 2.9306122448979596e-05, "loss": 1.2586, "step": 2128 }, { "epoch": 1.009, "grad_norm": 2.1577792167663574, "learning_rate": 2.9295918367346942e-05, "loss": 0.2873, "step": 2129 }, { "epoch": 1.0092, "grad_norm": 1.858289122581482, "learning_rate": 2.9285714285714288e-05, "loss": 0.2176, "step": 2130 }, { "epoch": 1.0094, "grad_norm": 11.388729095458984, "learning_rate": 2.9275510204081637e-05, "loss": 1.218, "step": 2131 }, { "epoch": 1.0096, "grad_norm": 18.279762268066406, "learning_rate": 2.9265306122448982e-05, "loss": 2.053, "step": 2132 }, { "epoch": 1.0098, "grad_norm": 4.1785783767700195, "learning_rate": 2.9255102040816328e-05, "loss": 0.7075, "step": 2133 }, { "epoch": 1.01, "grad_norm": 5.734426975250244, "learning_rate": 2.9244897959183677e-05, "loss": 1.6235, "step": 2134 }, { "epoch": 1.0102, "grad_norm": 7.153301239013672, "learning_rate": 2.9234693877551023e-05, "loss": 0.9697, "step": 2135 }, { "epoch": 1.0104, "grad_norm": 7.6792426109313965, "learning_rate": 2.922448979591837e-05, "loss": 0.8291, "step": 2136 }, { "epoch": 1.0106, "grad_norm": 2.6842920780181885, "learning_rate": 2.9214285714285715e-05, "loss": 0.753, "step": 2137 }, { "epoch": 1.0108, "grad_norm": 3.8635406494140625, "learning_rate": 2.9204081632653064e-05, "loss": 1.7667, "step": 2138 }, { "epoch": 1.011, "grad_norm": 2.733811140060425, "learning_rate": 2.919387755102041e-05, "loss": 0.3324, "step": 2139 }, { "epoch": 1.0112, "grad_norm": 2.376089334487915, "learning_rate": 2.9183673469387756e-05, "loss": 0.2683, "step": 2140 }, { "epoch": 1.0114, "grad_norm": 4.434942722320557, "learning_rate": 2.9173469387755105e-05, "loss": 0.7013, "step": 2141 }, { "epoch": 1.0116, "grad_norm": 4.3742356300354, "learning_rate": 2.916326530612245e-05, "loss": 1.9174, "step": 2142 }, { "epoch": 1.0118, "grad_norm": 9.535046577453613, "learning_rate": 2.9153061224489796e-05, "loss": 2.0534, "step": 2143 }, { "epoch": 1.012, "grad_norm": 12.418166160583496, "learning_rate": 2.9142857142857146e-05, "loss": 1.9329, "step": 2144 }, { "epoch": 1.0122, "grad_norm": 3.672943353652954, "learning_rate": 2.913265306122449e-05, "loss": 1.7427, "step": 2145 }, { "epoch": 1.0124, "grad_norm": 3.344444751739502, "learning_rate": 2.9122448979591837e-05, "loss": 0.6165, "step": 2146 }, { "epoch": 1.0126, "grad_norm": 3.1958985328674316, "learning_rate": 2.9112244897959183e-05, "loss": 1.4177, "step": 2147 }, { "epoch": 1.0128, "grad_norm": 2.36991810798645, "learning_rate": 2.9102040816326532e-05, "loss": 0.6834, "step": 2148 }, { "epoch": 1.013, "grad_norm": 8.034588813781738, "learning_rate": 2.9091836734693878e-05, "loss": 2.3334, "step": 2149 }, { "epoch": 1.0132, "grad_norm": 7.280432224273682, "learning_rate": 2.9081632653061224e-05, "loss": 2.5005, "step": 2150 }, { "epoch": 1.0134, "grad_norm": 5.838300704956055, "learning_rate": 2.9071428571428573e-05, "loss": 1.8218, "step": 2151 }, { "epoch": 1.0136, "grad_norm": 10.983949661254883, "learning_rate": 2.906122448979592e-05, "loss": 2.1662, "step": 2152 }, { "epoch": 1.0138, "grad_norm": 20.635364532470703, "learning_rate": 2.9051020408163264e-05, "loss": 4.3268, "step": 2153 }, { "epoch": 1.014, "grad_norm": 13.782269477844238, "learning_rate": 2.9040816326530614e-05, "loss": 2.6215, "step": 2154 }, { "epoch": 1.0142, "grad_norm": 5.808487415313721, "learning_rate": 2.903061224489796e-05, "loss": 1.1143, "step": 2155 }, { "epoch": 1.0144, "grad_norm": 9.513835906982422, "learning_rate": 2.9020408163265305e-05, "loss": 2.3511, "step": 2156 }, { "epoch": 1.0146, "grad_norm": 1.9956474304199219, "learning_rate": 2.901020408163265e-05, "loss": 0.2929, "step": 2157 }, { "epoch": 1.0148, "grad_norm": 5.999602317810059, "learning_rate": 2.9e-05, "loss": 1.599, "step": 2158 }, { "epoch": 1.015, "grad_norm": 14.991220474243164, "learning_rate": 2.8989795918367346e-05, "loss": 4.129, "step": 2159 }, { "epoch": 1.0152, "grad_norm": 16.006671905517578, "learning_rate": 2.8979591836734692e-05, "loss": 2.3101, "step": 2160 }, { "epoch": 1.0154, "grad_norm": 10.431235313415527, "learning_rate": 2.896938775510204e-05, "loss": 1.7555, "step": 2161 }, { "epoch": 1.0156, "grad_norm": 5.553926467895508, "learning_rate": 2.8959183673469387e-05, "loss": 2.2064, "step": 2162 }, { "epoch": 1.0158, "grad_norm": 6.1060590744018555, "learning_rate": 2.8948979591836733e-05, "loss": 1.9016, "step": 2163 }, { "epoch": 1.016, "grad_norm": 2.589744806289673, "learning_rate": 2.8938775510204082e-05, "loss": 0.679, "step": 2164 }, { "epoch": 1.0162, "grad_norm": 7.968225479125977, "learning_rate": 2.8928571428571434e-05, "loss": 2.2678, "step": 2165 }, { "epoch": 1.0164, "grad_norm": 20.811412811279297, "learning_rate": 2.891836734693878e-05, "loss": 2.487, "step": 2166 }, { "epoch": 1.0166, "grad_norm": 2.4238314628601074, "learning_rate": 2.8908163265306126e-05, "loss": 0.3541, "step": 2167 }, { "epoch": 1.0168, "grad_norm": 16.10127067565918, "learning_rate": 2.889795918367347e-05, "loss": 1.8777, "step": 2168 }, { "epoch": 1.017, "grad_norm": 36.814212799072266, "learning_rate": 2.888775510204082e-05, "loss": 3.9616, "step": 2169 }, { "epoch": 1.0172, "grad_norm": 7.001033306121826, "learning_rate": 2.8877551020408167e-05, "loss": 0.4239, "step": 2170 }, { "epoch": 1.0174, "grad_norm": 2.069876194000244, "learning_rate": 2.8867346938775512e-05, "loss": 0.2434, "step": 2171 }, { "epoch": 1.0176, "grad_norm": 3.3106343746185303, "learning_rate": 2.885714285714286e-05, "loss": 0.2273, "step": 2172 }, { "epoch": 1.0178, "grad_norm": 3.6341891288757324, "learning_rate": 2.8846938775510207e-05, "loss": 0.5256, "step": 2173 }, { "epoch": 1.018, "grad_norm": 5.140353679656982, "learning_rate": 2.8836734693877553e-05, "loss": 0.8109, "step": 2174 }, { "epoch": 1.0182, "grad_norm": 3.3280324935913086, "learning_rate": 2.8826530612244902e-05, "loss": 0.4409, "step": 2175 }, { "epoch": 1.0184, "grad_norm": 3.295837163925171, "learning_rate": 2.8816326530612248e-05, "loss": 0.3681, "step": 2176 }, { "epoch": 1.0186, "grad_norm": 3.2037105560302734, "learning_rate": 2.8806122448979594e-05, "loss": 0.1784, "step": 2177 }, { "epoch": 1.0188, "grad_norm": 4.193380832672119, "learning_rate": 2.879591836734694e-05, "loss": 0.3336, "step": 2178 }, { "epoch": 1.019, "grad_norm": 7.2576985359191895, "learning_rate": 2.878571428571429e-05, "loss": 1.288, "step": 2179 }, { "epoch": 1.0192, "grad_norm": 7.04248571395874, "learning_rate": 2.8775510204081635e-05, "loss": 2.1678, "step": 2180 }, { "epoch": 1.0194, "grad_norm": 16.95452308654785, "learning_rate": 2.876530612244898e-05, "loss": 4.9449, "step": 2181 }, { "epoch": 1.0196, "grad_norm": 6.738562107086182, "learning_rate": 2.875510204081633e-05, "loss": 0.7365, "step": 2182 }, { "epoch": 1.0198, "grad_norm": 7.8314666748046875, "learning_rate": 2.8744897959183675e-05, "loss": 1.8211, "step": 2183 }, { "epoch": 1.02, "grad_norm": 1.4186290502548218, "learning_rate": 2.873469387755102e-05, "loss": 0.0832, "step": 2184 }, { "epoch": 1.0202, "grad_norm": 1.4884039163589478, "learning_rate": 2.872448979591837e-05, "loss": 0.0918, "step": 2185 }, { "epoch": 1.0204, "grad_norm": 4.299226760864258, "learning_rate": 2.8714285714285716e-05, "loss": 0.374, "step": 2186 }, { "epoch": 1.0206, "grad_norm": 2.9364874362945557, "learning_rate": 2.8704081632653062e-05, "loss": 0.2143, "step": 2187 }, { "epoch": 1.0208, "grad_norm": 0.9889697432518005, "learning_rate": 2.8693877551020408e-05, "loss": 0.0406, "step": 2188 }, { "epoch": 1.021, "grad_norm": 1.8314415216445923, "learning_rate": 2.8683673469387757e-05, "loss": 0.232, "step": 2189 }, { "epoch": 1.0212, "grad_norm": 2.237391471862793, "learning_rate": 2.8673469387755103e-05, "loss": 0.2606, "step": 2190 }, { "epoch": 1.0214, "grad_norm": 4.936470985412598, "learning_rate": 2.866326530612245e-05, "loss": 1.1271, "step": 2191 }, { "epoch": 1.0216, "grad_norm": 10.1080961227417, "learning_rate": 2.8653061224489798e-05, "loss": 1.7872, "step": 2192 }, { "epoch": 1.0218, "grad_norm": 8.04566764831543, "learning_rate": 2.8642857142857144e-05, "loss": 0.4089, "step": 2193 }, { "epoch": 1.022, "grad_norm": 2.041060209274292, "learning_rate": 2.863265306122449e-05, "loss": 0.2214, "step": 2194 }, { "epoch": 1.0222, "grad_norm": 4.039699554443359, "learning_rate": 2.862244897959184e-05, "loss": 0.3278, "step": 2195 }, { "epoch": 1.0224, "grad_norm": 7.945012092590332, "learning_rate": 2.8612244897959184e-05, "loss": 1.0167, "step": 2196 }, { "epoch": 1.0226, "grad_norm": 8.669960975646973, "learning_rate": 2.860204081632653e-05, "loss": 1.5223, "step": 2197 }, { "epoch": 1.0228, "grad_norm": 3.9770047664642334, "learning_rate": 2.8591836734693876e-05, "loss": 1.5937, "step": 2198 }, { "epoch": 1.023, "grad_norm": 4.4404144287109375, "learning_rate": 2.8581632653061225e-05, "loss": 0.7484, "step": 2199 }, { "epoch": 1.0232, "grad_norm": 2.1829888820648193, "learning_rate": 2.857142857142857e-05, "loss": 0.2419, "step": 2200 }, { "epoch": 1.0234, "grad_norm": 1.9747893810272217, "learning_rate": 2.8561224489795917e-05, "loss": 0.2527, "step": 2201 }, { "epoch": 1.0236, "grad_norm": 4.661055564880371, "learning_rate": 2.8551020408163266e-05, "loss": 0.7782, "step": 2202 }, { "epoch": 1.0238, "grad_norm": 11.441447257995605, "learning_rate": 2.854081632653061e-05, "loss": 2.4732, "step": 2203 }, { "epoch": 1.024, "grad_norm": 4.55333137512207, "learning_rate": 2.8530612244897957e-05, "loss": 1.3935, "step": 2204 }, { "epoch": 1.0242, "grad_norm": 4.206186771392822, "learning_rate": 2.8520408163265307e-05, "loss": 1.894, "step": 2205 }, { "epoch": 1.0244, "grad_norm": 11.083173751831055, "learning_rate": 2.8510204081632652e-05, "loss": 2.4657, "step": 2206 }, { "epoch": 1.0246, "grad_norm": 23.377429962158203, "learning_rate": 2.8499999999999998e-05, "loss": 2.9622, "step": 2207 }, { "epoch": 1.0248, "grad_norm": 11.613309860229492, "learning_rate": 2.8489795918367347e-05, "loss": 3.649, "step": 2208 }, { "epoch": 1.025, "grad_norm": 15.78592300415039, "learning_rate": 2.8479591836734693e-05, "loss": 5.3943, "step": 2209 }, { "epoch": 1.0252, "grad_norm": 26.071020126342773, "learning_rate": 2.8469387755102046e-05, "loss": 2.2782, "step": 2210 }, { "epoch": 1.0254, "grad_norm": 14.612674713134766, "learning_rate": 2.845918367346939e-05, "loss": 4.8628, "step": 2211 }, { "epoch": 1.0256, "grad_norm": 3.795036554336548, "learning_rate": 2.8448979591836737e-05, "loss": 0.4615, "step": 2212 }, { "epoch": 1.0258, "grad_norm": 4.833165168762207, "learning_rate": 2.8438775510204086e-05, "loss": 1.0494, "step": 2213 }, { "epoch": 1.026, "grad_norm": 3.8589091300964355, "learning_rate": 2.8428571428571432e-05, "loss": 1.6959, "step": 2214 }, { "epoch": 1.0262, "grad_norm": 3.95047664642334, "learning_rate": 2.8418367346938778e-05, "loss": 0.7726, "step": 2215 }, { "epoch": 1.0264, "grad_norm": 1.8891721963882446, "learning_rate": 2.8408163265306127e-05, "loss": 0.5153, "step": 2216 }, { "epoch": 1.0266, "grad_norm": 4.715999603271484, "learning_rate": 2.8397959183673473e-05, "loss": 2.9044, "step": 2217 }, { "epoch": 1.0268, "grad_norm": 7.398796558380127, "learning_rate": 2.838775510204082e-05, "loss": 5.2357, "step": 2218 }, { "epoch": 1.027, "grad_norm": 16.269620895385742, "learning_rate": 2.8377551020408165e-05, "loss": 2.6862, "step": 2219 }, { "epoch": 1.0272, "grad_norm": 69.98822021484375, "learning_rate": 2.8367346938775514e-05, "loss": 4.26, "step": 2220 }, { "epoch": 1.0274, "grad_norm": 2.350796937942505, "learning_rate": 2.835714285714286e-05, "loss": 0.1944, "step": 2221 }, { "epoch": 1.0276, "grad_norm": 4.043328285217285, "learning_rate": 2.8346938775510205e-05, "loss": 0.664, "step": 2222 }, { "epoch": 1.0278, "grad_norm": 9.004681587219238, "learning_rate": 2.8336734693877555e-05, "loss": 0.7774, "step": 2223 }, { "epoch": 1.028, "grad_norm": 16.564464569091797, "learning_rate": 2.83265306122449e-05, "loss": 2.6385, "step": 2224 }, { "epoch": 1.0282, "grad_norm": 5.071820259094238, "learning_rate": 2.8316326530612246e-05, "loss": 0.903, "step": 2225 }, { "epoch": 1.0284, "grad_norm": 3.974750518798828, "learning_rate": 2.8306122448979595e-05, "loss": 0.6421, "step": 2226 }, { "epoch": 1.0286, "grad_norm": 2.1501924991607666, "learning_rate": 2.829591836734694e-05, "loss": 0.52, "step": 2227 }, { "epoch": 1.0288, "grad_norm": 3.4971842765808105, "learning_rate": 2.8285714285714287e-05, "loss": 1.5575, "step": 2228 }, { "epoch": 1.029, "grad_norm": 5.169280052185059, "learning_rate": 2.8275510204081636e-05, "loss": 0.1911, "step": 2229 }, { "epoch": 1.0292, "grad_norm": 5.614638805389404, "learning_rate": 2.8265306122448982e-05, "loss": 1.5052, "step": 2230 }, { "epoch": 1.0294, "grad_norm": 10.963964462280273, "learning_rate": 2.8255102040816328e-05, "loss": 5.0517, "step": 2231 }, { "epoch": 1.0296, "grad_norm": 7.341145038604736, "learning_rate": 2.8244897959183673e-05, "loss": 2.3456, "step": 2232 }, { "epoch": 1.0298, "grad_norm": 10.776498794555664, "learning_rate": 2.8234693877551023e-05, "loss": 3.0459, "step": 2233 }, { "epoch": 1.03, "grad_norm": 12.878778457641602, "learning_rate": 2.822448979591837e-05, "loss": 3.6761, "step": 2234 }, { "epoch": 1.0302, "grad_norm": 1.7765483856201172, "learning_rate": 2.8214285714285714e-05, "loss": 0.2764, "step": 2235 }, { "epoch": 1.0304, "grad_norm": 3.567537784576416, "learning_rate": 2.8204081632653063e-05, "loss": 0.4874, "step": 2236 }, { "epoch": 1.0306, "grad_norm": 11.235349655151367, "learning_rate": 2.819387755102041e-05, "loss": 2.031, "step": 2237 }, { "epoch": 1.0308, "grad_norm": 11.563838005065918, "learning_rate": 2.8183673469387755e-05, "loss": 1.9193, "step": 2238 }, { "epoch": 1.031, "grad_norm": 4.753085136413574, "learning_rate": 2.8173469387755104e-05, "loss": 2.6111, "step": 2239 }, { "epoch": 1.0312, "grad_norm": 12.879108428955078, "learning_rate": 2.816326530612245e-05, "loss": 2.582, "step": 2240 }, { "epoch": 1.0314, "grad_norm": 3.6239147186279297, "learning_rate": 2.8153061224489796e-05, "loss": 1.5615, "step": 2241 }, { "epoch": 1.0316, "grad_norm": 1.8712190389633179, "learning_rate": 2.814285714285714e-05, "loss": 0.2572, "step": 2242 }, { "epoch": 1.0318, "grad_norm": 5.439129829406738, "learning_rate": 2.813265306122449e-05, "loss": 0.6856, "step": 2243 }, { "epoch": 1.032, "grad_norm": 14.700053215026855, "learning_rate": 2.8122448979591837e-05, "loss": 1.5368, "step": 2244 }, { "epoch": 1.0322, "grad_norm": 1.8174935579299927, "learning_rate": 2.8112244897959182e-05, "loss": 0.1421, "step": 2245 }, { "epoch": 1.0324, "grad_norm": 11.371535301208496, "learning_rate": 2.810204081632653e-05, "loss": 1.4882, "step": 2246 }, { "epoch": 1.0326, "grad_norm": 34.32445526123047, "learning_rate": 2.8091836734693877e-05, "loss": 3.6614, "step": 2247 }, { "epoch": 1.0328, "grad_norm": 1.8845922946929932, "learning_rate": 2.8081632653061223e-05, "loss": 0.0578, "step": 2248 }, { "epoch": 1.033, "grad_norm": 1.6151620149612427, "learning_rate": 2.8071428571428572e-05, "loss": 0.1738, "step": 2249 }, { "epoch": 1.0332, "grad_norm": 4.2584452629089355, "learning_rate": 2.8061224489795918e-05, "loss": 0.4365, "step": 2250 }, { "epoch": 1.0334, "grad_norm": 7.917820453643799, "learning_rate": 2.8051020408163264e-05, "loss": 1.2255, "step": 2251 }, { "epoch": 1.0336, "grad_norm": 3.721857786178589, "learning_rate": 2.804081632653061e-05, "loss": 0.2738, "step": 2252 }, { "epoch": 1.0338, "grad_norm": 8.463653564453125, "learning_rate": 2.803061224489796e-05, "loss": 1.2317, "step": 2253 }, { "epoch": 1.034, "grad_norm": 4.422754287719727, "learning_rate": 2.8020408163265305e-05, "loss": 0.7327, "step": 2254 }, { "epoch": 1.0342, "grad_norm": 2.656949758529663, "learning_rate": 2.801020408163265e-05, "loss": 0.5733, "step": 2255 }, { "epoch": 1.0344, "grad_norm": 2.132868766784668, "learning_rate": 2.8000000000000003e-05, "loss": 0.163, "step": 2256 }, { "epoch": 1.0346, "grad_norm": 11.262079238891602, "learning_rate": 2.7989795918367352e-05, "loss": 2.2782, "step": 2257 }, { "epoch": 1.0348, "grad_norm": 26.79599952697754, "learning_rate": 2.7979591836734698e-05, "loss": 5.6807, "step": 2258 }, { "epoch": 1.035, "grad_norm": 9.496004104614258, "learning_rate": 2.7969387755102044e-05, "loss": 2.106, "step": 2259 }, { "epoch": 1.0352, "grad_norm": 1.3949973583221436, "learning_rate": 2.7959183673469393e-05, "loss": 0.0548, "step": 2260 }, { "epoch": 1.0354, "grad_norm": 2.057509183883667, "learning_rate": 2.794897959183674e-05, "loss": 0.2224, "step": 2261 }, { "epoch": 1.0356, "grad_norm": 1.8621536493301392, "learning_rate": 2.7938775510204084e-05, "loss": 0.1989, "step": 2262 }, { "epoch": 1.0358, "grad_norm": 1.6629083156585693, "learning_rate": 2.792857142857143e-05, "loss": 0.1907, "step": 2263 }, { "epoch": 1.036, "grad_norm": 2.4138829708099365, "learning_rate": 2.791836734693878e-05, "loss": 0.3385, "step": 2264 }, { "epoch": 1.0362, "grad_norm": 5.232615947723389, "learning_rate": 2.7908163265306125e-05, "loss": 0.9645, "step": 2265 }, { "epoch": 1.0364, "grad_norm": 7.121108531951904, "learning_rate": 2.789795918367347e-05, "loss": 1.0515, "step": 2266 }, { "epoch": 1.0366, "grad_norm": 2.2986886501312256, "learning_rate": 2.788775510204082e-05, "loss": 0.6423, "step": 2267 }, { "epoch": 1.0368, "grad_norm": 3.717360496520996, "learning_rate": 2.7877551020408166e-05, "loss": 1.6657, "step": 2268 }, { "epoch": 1.037, "grad_norm": 3.4986190795898438, "learning_rate": 2.7867346938775512e-05, "loss": 0.1756, "step": 2269 }, { "epoch": 1.0372, "grad_norm": 6.572997093200684, "learning_rate": 2.785714285714286e-05, "loss": 0.3995, "step": 2270 }, { "epoch": 1.0374, "grad_norm": 1.7881145477294922, "learning_rate": 2.7846938775510207e-05, "loss": 0.215, "step": 2271 }, { "epoch": 1.0376, "grad_norm": 2.556283712387085, "learning_rate": 2.7836734693877553e-05, "loss": 0.2269, "step": 2272 }, { "epoch": 1.0378, "grad_norm": 1.8592932224273682, "learning_rate": 2.78265306122449e-05, "loss": 0.1474, "step": 2273 }, { "epoch": 1.038, "grad_norm": 4.909307956695557, "learning_rate": 2.7816326530612248e-05, "loss": 0.8709, "step": 2274 }, { "epoch": 1.0382, "grad_norm": 10.424930572509766, "learning_rate": 2.7806122448979593e-05, "loss": 2.0107, "step": 2275 }, { "epoch": 1.0384, "grad_norm": 3.212265968322754, "learning_rate": 2.779591836734694e-05, "loss": 0.3066, "step": 2276 }, { "epoch": 1.0386, "grad_norm": 3.5865800380706787, "learning_rate": 2.778571428571429e-05, "loss": 0.5172, "step": 2277 }, { "epoch": 1.0388, "grad_norm": 4.890392780303955, "learning_rate": 2.7775510204081634e-05, "loss": 0.8389, "step": 2278 }, { "epoch": 1.039, "grad_norm": 4.13357400894165, "learning_rate": 2.776530612244898e-05, "loss": 0.8173, "step": 2279 }, { "epoch": 1.0392, "grad_norm": 2.429373264312744, "learning_rate": 2.775510204081633e-05, "loss": 0.2955, "step": 2280 }, { "epoch": 1.0394, "grad_norm": 4.9131622314453125, "learning_rate": 2.7744897959183675e-05, "loss": 0.7203, "step": 2281 }, { "epoch": 1.0396, "grad_norm": 1.8296152353286743, "learning_rate": 2.773469387755102e-05, "loss": 0.2563, "step": 2282 }, { "epoch": 1.0398, "grad_norm": 2.247852087020874, "learning_rate": 2.7724489795918366e-05, "loss": 0.1898, "step": 2283 }, { "epoch": 1.04, "grad_norm": 4.431122303009033, "learning_rate": 2.7714285714285716e-05, "loss": 0.859, "step": 2284 }, { "epoch": 1.0402, "grad_norm": 7.977433204650879, "learning_rate": 2.770408163265306e-05, "loss": 1.6311, "step": 2285 }, { "epoch": 1.0404, "grad_norm": 11.255778312683105, "learning_rate": 2.7693877551020407e-05, "loss": 0.903, "step": 2286 }, { "epoch": 1.0406, "grad_norm": 6.330371379852295, "learning_rate": 2.7683673469387756e-05, "loss": 0.8694, "step": 2287 }, { "epoch": 1.0408, "grad_norm": 5.011168479919434, "learning_rate": 2.7673469387755102e-05, "loss": 1.5897, "step": 2288 }, { "epoch": 1.041, "grad_norm": 4.034277439117432, "learning_rate": 2.7663265306122448e-05, "loss": 0.5642, "step": 2289 }, { "epoch": 1.0412, "grad_norm": 12.039741516113281, "learning_rate": 2.7653061224489797e-05, "loss": 1.9871, "step": 2290 }, { "epoch": 1.0414, "grad_norm": 4.279134273529053, "learning_rate": 2.7642857142857143e-05, "loss": 1.0659, "step": 2291 }, { "epoch": 1.0416, "grad_norm": 3.9430532455444336, "learning_rate": 2.763265306122449e-05, "loss": 1.552, "step": 2292 }, { "epoch": 1.0418, "grad_norm": 4.80662727355957, "learning_rate": 2.7622448979591835e-05, "loss": 0.675, "step": 2293 }, { "epoch": 1.042, "grad_norm": 1.777544379234314, "learning_rate": 2.7612244897959184e-05, "loss": 0.2428, "step": 2294 }, { "epoch": 1.0422, "grad_norm": 1.8538479804992676, "learning_rate": 2.760204081632653e-05, "loss": 0.2281, "step": 2295 }, { "epoch": 1.0424, "grad_norm": 2.8823657035827637, "learning_rate": 2.7591836734693875e-05, "loss": 0.5083, "step": 2296 }, { "epoch": 1.0426, "grad_norm": 6.437721252441406, "learning_rate": 2.7581632653061224e-05, "loss": 1.7292, "step": 2297 }, { "epoch": 1.0428, "grad_norm": 6.733942985534668, "learning_rate": 2.757142857142857e-05, "loss": 1.5915, "step": 2298 }, { "epoch": 1.043, "grad_norm": 1.15110445022583, "learning_rate": 2.7561224489795916e-05, "loss": 0.0507, "step": 2299 }, { "epoch": 1.0432, "grad_norm": 3.2653064727783203, "learning_rate": 2.7551020408163265e-05, "loss": 0.2169, "step": 2300 }, { "epoch": 1.0434, "grad_norm": 5.073391914367676, "learning_rate": 2.7540816326530618e-05, "loss": 1.0899, "step": 2301 }, { "epoch": 1.0436, "grad_norm": 3.506643056869507, "learning_rate": 2.7530612244897964e-05, "loss": 1.5375, "step": 2302 }, { "epoch": 1.0438, "grad_norm": 5.033652305603027, "learning_rate": 2.752040816326531e-05, "loss": 1.0176, "step": 2303 }, { "epoch": 1.044, "grad_norm": 9.596454620361328, "learning_rate": 2.7510204081632655e-05, "loss": 1.6695, "step": 2304 }, { "epoch": 1.0442, "grad_norm": 7.5560736656188965, "learning_rate": 2.7500000000000004e-05, "loss": 0.9778, "step": 2305 }, { "epoch": 1.0444, "grad_norm": 17.96355628967285, "learning_rate": 2.748979591836735e-05, "loss": 3.7537, "step": 2306 }, { "epoch": 1.0446, "grad_norm": 6.530677795410156, "learning_rate": 2.7479591836734696e-05, "loss": 1.1766, "step": 2307 }, { "epoch": 1.0448, "grad_norm": 11.135425567626953, "learning_rate": 2.7469387755102045e-05, "loss": 1.8137, "step": 2308 }, { "epoch": 1.045, "grad_norm": 9.50981616973877, "learning_rate": 2.745918367346939e-05, "loss": 2.141, "step": 2309 }, { "epoch": 1.0452, "grad_norm": 9.981623649597168, "learning_rate": 2.7448979591836737e-05, "loss": 2.3876, "step": 2310 }, { "epoch": 1.0454, "grad_norm": 1.8994221687316895, "learning_rate": 2.7438775510204086e-05, "loss": 0.4994, "step": 2311 }, { "epoch": 1.0456, "grad_norm": 2.903367042541504, "learning_rate": 2.742857142857143e-05, "loss": 1.3922, "step": 2312 }, { "epoch": 1.0458, "grad_norm": 1.6302586793899536, "learning_rate": 2.7418367346938777e-05, "loss": 0.064, "step": 2313 }, { "epoch": 1.046, "grad_norm": 2.146597146987915, "learning_rate": 2.7408163265306123e-05, "loss": 0.2463, "step": 2314 }, { "epoch": 1.0462, "grad_norm": 14.185137748718262, "learning_rate": 2.7397959183673472e-05, "loss": 0.1766, "step": 2315 }, { "epoch": 1.0464, "grad_norm": 5.094812393188477, "learning_rate": 2.7387755102040818e-05, "loss": 0.3866, "step": 2316 }, { "epoch": 1.0466, "grad_norm": 10.57387924194336, "learning_rate": 2.7377551020408164e-05, "loss": 1.2765, "step": 2317 }, { "epoch": 1.0468, "grad_norm": 15.01219654083252, "learning_rate": 2.7367346938775513e-05, "loss": 1.539, "step": 2318 }, { "epoch": 1.047, "grad_norm": 2.043815851211548, "learning_rate": 2.735714285714286e-05, "loss": 0.1832, "step": 2319 }, { "epoch": 1.0472, "grad_norm": 2.302161931991577, "learning_rate": 2.7346938775510205e-05, "loss": 0.3694, "step": 2320 }, { "epoch": 1.0474, "grad_norm": 3.4057183265686035, "learning_rate": 2.7336734693877554e-05, "loss": 0.7633, "step": 2321 }, { "epoch": 1.0476, "grad_norm": 4.351888179779053, "learning_rate": 2.73265306122449e-05, "loss": 0.7517, "step": 2322 }, { "epoch": 1.0478, "grad_norm": 2.020519495010376, "learning_rate": 2.7316326530612246e-05, "loss": 0.2264, "step": 2323 }, { "epoch": 1.048, "grad_norm": 6.416701316833496, "learning_rate": 2.730612244897959e-05, "loss": 1.2137, "step": 2324 }, { "epoch": 1.0482, "grad_norm": 8.749337196350098, "learning_rate": 2.729591836734694e-05, "loss": 2.1079, "step": 2325 }, { "epoch": 1.0484, "grad_norm": 4.0751471519470215, "learning_rate": 2.7285714285714286e-05, "loss": 0.582, "step": 2326 }, { "epoch": 1.0486, "grad_norm": 6.795112133026123, "learning_rate": 2.7275510204081632e-05, "loss": 2.4138, "step": 2327 }, { "epoch": 1.0488, "grad_norm": 3.4426896572113037, "learning_rate": 2.726530612244898e-05, "loss": 1.6688, "step": 2328 }, { "epoch": 1.049, "grad_norm": 1.9708757400512695, "learning_rate": 2.7255102040816327e-05, "loss": 0.2078, "step": 2329 }, { "epoch": 1.0492, "grad_norm": 3.8974573612213135, "learning_rate": 2.7244897959183673e-05, "loss": 0.3048, "step": 2330 }, { "epoch": 1.0493999999999999, "grad_norm": 4.25654411315918, "learning_rate": 2.7234693877551022e-05, "loss": 0.9219, "step": 2331 }, { "epoch": 1.0496, "grad_norm": 2.8432178497314453, "learning_rate": 2.7224489795918368e-05, "loss": 0.6184, "step": 2332 }, { "epoch": 1.0498, "grad_norm": 2.5125699043273926, "learning_rate": 2.7214285714285714e-05, "loss": 0.3014, "step": 2333 }, { "epoch": 1.05, "grad_norm": 5.020925998687744, "learning_rate": 2.7204081632653063e-05, "loss": 0.2826, "step": 2334 }, { "epoch": 1.0502, "grad_norm": 6.0717082023620605, "learning_rate": 2.719387755102041e-05, "loss": 1.4458, "step": 2335 }, { "epoch": 1.0504, "grad_norm": 33.236656188964844, "learning_rate": 2.7183673469387754e-05, "loss": 3.6638, "step": 2336 }, { "epoch": 1.0506, "grad_norm": 3.4322261810302734, "learning_rate": 2.71734693877551e-05, "loss": 1.5605, "step": 2337 }, { "epoch": 1.0508, "grad_norm": 3.385988235473633, "learning_rate": 2.716326530612245e-05, "loss": 0.4192, "step": 2338 }, { "epoch": 1.051, "grad_norm": 3.708045482635498, "learning_rate": 2.7153061224489795e-05, "loss": 0.4939, "step": 2339 }, { "epoch": 1.0512, "grad_norm": 6.477840423583984, "learning_rate": 2.714285714285714e-05, "loss": 1.4888, "step": 2340 }, { "epoch": 1.0514000000000001, "grad_norm": 1.9212522506713867, "learning_rate": 2.713265306122449e-05, "loss": 0.1784, "step": 2341 }, { "epoch": 1.0516, "grad_norm": 5.865115165710449, "learning_rate": 2.7122448979591836e-05, "loss": 1.7386, "step": 2342 }, { "epoch": 1.0518, "grad_norm": 19.44985580444336, "learning_rate": 2.7112244897959182e-05, "loss": 2.1967, "step": 2343 }, { "epoch": 1.052, "grad_norm": 3.969221353530884, "learning_rate": 2.710204081632653e-05, "loss": 0.5162, "step": 2344 }, { "epoch": 1.0522, "grad_norm": 18.67195701599121, "learning_rate": 2.7091836734693877e-05, "loss": 2.5846, "step": 2345 }, { "epoch": 1.0524, "grad_norm": 35.398231506347656, "learning_rate": 2.708163265306123e-05, "loss": 5.0282, "step": 2346 }, { "epoch": 1.0526, "grad_norm": 144.9508056640625, "learning_rate": 2.7071428571428575e-05, "loss": 6.2646, "step": 2347 }, { "epoch": 1.0528, "grad_norm": 63.41022491455078, "learning_rate": 2.706122448979592e-05, "loss": 2.7425, "step": 2348 }, { "epoch": 1.053, "grad_norm": 23.23649787902832, "learning_rate": 2.705102040816327e-05, "loss": 4.6877, "step": 2349 }, { "epoch": 1.0532, "grad_norm": 3.7437732219696045, "learning_rate": 2.7040816326530616e-05, "loss": 1.4819, "step": 2350 }, { "epoch": 1.0534, "grad_norm": 2.6368460655212402, "learning_rate": 2.703061224489796e-05, "loss": 0.1497, "step": 2351 }, { "epoch": 1.0536, "grad_norm": 4.356257438659668, "learning_rate": 2.702040816326531e-05, "loss": 0.6518, "step": 2352 }, { "epoch": 1.0538, "grad_norm": 1.6669964790344238, "learning_rate": 2.7010204081632657e-05, "loss": 0.1701, "step": 2353 }, { "epoch": 1.054, "grad_norm": 34.541290283203125, "learning_rate": 2.7000000000000002e-05, "loss": 1.8341, "step": 2354 }, { "epoch": 1.0542, "grad_norm": 82.706787109375, "learning_rate": 2.698979591836735e-05, "loss": 5.5993, "step": 2355 }, { "epoch": 1.0544, "grad_norm": 6.800225734710693, "learning_rate": 2.6979591836734697e-05, "loss": 0.7273, "step": 2356 }, { "epoch": 1.0546, "grad_norm": 3.865412473678589, "learning_rate": 2.6969387755102043e-05, "loss": 0.9293, "step": 2357 }, { "epoch": 1.0548, "grad_norm": 3.1924800872802734, "learning_rate": 2.695918367346939e-05, "loss": 1.0414, "step": 2358 }, { "epoch": 1.055, "grad_norm": 12.705007553100586, "learning_rate": 2.6948979591836738e-05, "loss": 2.6006, "step": 2359 }, { "epoch": 1.0552, "grad_norm": 30.791608810424805, "learning_rate": 2.6938775510204084e-05, "loss": 4.2604, "step": 2360 }, { "epoch": 1.0554, "grad_norm": 3.631416082382202, "learning_rate": 2.692857142857143e-05, "loss": 0.1501, "step": 2361 }, { "epoch": 1.0556, "grad_norm": 5.945212364196777, "learning_rate": 2.691836734693878e-05, "loss": 0.5055, "step": 2362 }, { "epoch": 1.0558, "grad_norm": 22.41519546508789, "learning_rate": 2.6908163265306125e-05, "loss": 1.3106, "step": 2363 }, { "epoch": 1.056, "grad_norm": 49.526512145996094, "learning_rate": 2.689795918367347e-05, "loss": 4.4305, "step": 2364 }, { "epoch": 1.0562, "grad_norm": 3.019853353500366, "learning_rate": 2.688775510204082e-05, "loss": 0.2182, "step": 2365 }, { "epoch": 1.0564, "grad_norm": 5.977735996246338, "learning_rate": 2.6877551020408165e-05, "loss": 0.3451, "step": 2366 }, { "epoch": 1.0566, "grad_norm": 1.628416895866394, "learning_rate": 2.686734693877551e-05, "loss": 0.1811, "step": 2367 }, { "epoch": 1.0568, "grad_norm": 1.9607353210449219, "learning_rate": 2.6857142857142857e-05, "loss": 0.219, "step": 2368 }, { "epoch": 1.057, "grad_norm": 3.1588733196258545, "learning_rate": 2.6846938775510206e-05, "loss": 0.0474, "step": 2369 }, { "epoch": 1.0572, "grad_norm": 1.7066285610198975, "learning_rate": 2.6836734693877552e-05, "loss": 0.2578, "step": 2370 }, { "epoch": 1.0574, "grad_norm": 2.4532039165496826, "learning_rate": 2.6826530612244898e-05, "loss": 0.6573, "step": 2371 }, { "epoch": 1.0576, "grad_norm": 3.317470073699951, "learning_rate": 2.6816326530612247e-05, "loss": 1.6299, "step": 2372 }, { "epoch": 1.0578, "grad_norm": 4.707334518432617, "learning_rate": 2.6806122448979593e-05, "loss": 0.5495, "step": 2373 }, { "epoch": 1.058, "grad_norm": 10.748777389526367, "learning_rate": 2.679591836734694e-05, "loss": 1.7668, "step": 2374 }, { "epoch": 1.0582, "grad_norm": 4.457859516143799, "learning_rate": 2.6785714285714288e-05, "loss": 0.6171, "step": 2375 }, { "epoch": 1.0584, "grad_norm": 2.8171865940093994, "learning_rate": 2.6775510204081634e-05, "loss": 0.2549, "step": 2376 }, { "epoch": 1.0586, "grad_norm": 4.005404949188232, "learning_rate": 2.676530612244898e-05, "loss": 0.2262, "step": 2377 }, { "epoch": 1.0588, "grad_norm": 2.286299467086792, "learning_rate": 2.6755102040816325e-05, "loss": 0.4013, "step": 2378 }, { "epoch": 1.059, "grad_norm": 3.72717547416687, "learning_rate": 2.6744897959183674e-05, "loss": 1.6463, "step": 2379 }, { "epoch": 1.0592, "grad_norm": 6.668277740478516, "learning_rate": 2.673469387755102e-05, "loss": 0.5391, "step": 2380 }, { "epoch": 1.0594, "grad_norm": 9.107120513916016, "learning_rate": 2.6724489795918366e-05, "loss": 1.5799, "step": 2381 }, { "epoch": 1.0596, "grad_norm": 4.307889938354492, "learning_rate": 2.6714285714285715e-05, "loss": 0.2707, "step": 2382 }, { "epoch": 1.0598, "grad_norm": 3.1701438426971436, "learning_rate": 2.670408163265306e-05, "loss": 0.6578, "step": 2383 }, { "epoch": 1.06, "grad_norm": 1.8328818082809448, "learning_rate": 2.6693877551020407e-05, "loss": 0.0478, "step": 2384 }, { "epoch": 1.0602, "grad_norm": 11.048552513122559, "learning_rate": 2.6683673469387756e-05, "loss": 1.4496, "step": 2385 }, { "epoch": 1.0604, "grad_norm": 17.557401657104492, "learning_rate": 2.66734693877551e-05, "loss": 4.7514, "step": 2386 }, { "epoch": 1.0606, "grad_norm": 2.2788662910461426, "learning_rate": 2.6663265306122447e-05, "loss": 0.0864, "step": 2387 }, { "epoch": 1.0608, "grad_norm": 1.4921162128448486, "learning_rate": 2.6653061224489793e-05, "loss": 0.0922, "step": 2388 }, { "epoch": 1.061, "grad_norm": 2.2919957637786865, "learning_rate": 2.6642857142857142e-05, "loss": 0.161, "step": 2389 }, { "epoch": 1.0612, "grad_norm": 21.927486419677734, "learning_rate": 2.6632653061224488e-05, "loss": 2.0904, "step": 2390 }, { "epoch": 1.0614, "grad_norm": 6.614194393157959, "learning_rate": 2.6622448979591834e-05, "loss": 4.6801, "step": 2391 }, { "epoch": 1.0616, "grad_norm": 2.3331987857818604, "learning_rate": 2.6612244897959187e-05, "loss": 0.5976, "step": 2392 }, { "epoch": 1.0618, "grad_norm": 3.3811841011047363, "learning_rate": 2.6602040816326536e-05, "loss": 1.6392, "step": 2393 }, { "epoch": 1.062, "grad_norm": 2.3435702323913574, "learning_rate": 2.659183673469388e-05, "loss": 0.3477, "step": 2394 }, { "epoch": 1.0622, "grad_norm": 6.553333282470703, "learning_rate": 2.6581632653061227e-05, "loss": 1.6144, "step": 2395 }, { "epoch": 1.0624, "grad_norm": 10.608148574829102, "learning_rate": 2.6571428571428576e-05, "loss": 2.163, "step": 2396 }, { "epoch": 1.0626, "grad_norm": 2.376661539077759, "learning_rate": 2.6561224489795922e-05, "loss": 0.2497, "step": 2397 }, { "epoch": 1.0628, "grad_norm": 5.275718688964844, "learning_rate": 2.6551020408163268e-05, "loss": 0.707, "step": 2398 }, { "epoch": 1.063, "grad_norm": 4.847681999206543, "learning_rate": 2.6540816326530614e-05, "loss": 0.5223, "step": 2399 }, { "epoch": 1.0632, "grad_norm": 5.3941650390625, "learning_rate": 2.6530612244897963e-05, "loss": 1.6102, "step": 2400 }, { "epoch": 1.0634, "grad_norm": 6.420630931854248, "learning_rate": 2.652040816326531e-05, "loss": 0.8017, "step": 2401 }, { "epoch": 1.0636, "grad_norm": 5.283441543579102, "learning_rate": 2.6510204081632655e-05, "loss": 0.4855, "step": 2402 }, { "epoch": 1.0638, "grad_norm": 13.458806037902832, "learning_rate": 2.6500000000000004e-05, "loss": 0.7235, "step": 2403 }, { "epoch": 1.064, "grad_norm": 28.971445083618164, "learning_rate": 2.648979591836735e-05, "loss": 2.4488, "step": 2404 }, { "epoch": 1.0642, "grad_norm": 3.099283456802368, "learning_rate": 2.6479591836734695e-05, "loss": 0.374, "step": 2405 }, { "epoch": 1.0644, "grad_norm": 4.976246356964111, "learning_rate": 2.6469387755102045e-05, "loss": 1.3718, "step": 2406 }, { "epoch": 1.0646, "grad_norm": 0.9986436367034912, "learning_rate": 2.645918367346939e-05, "loss": 0.041, "step": 2407 }, { "epoch": 1.0648, "grad_norm": 2.3621811866760254, "learning_rate": 2.6448979591836736e-05, "loss": 0.6865, "step": 2408 }, { "epoch": 1.065, "grad_norm": 3.2157063484191895, "learning_rate": 2.6438775510204082e-05, "loss": 1.4494, "step": 2409 }, { "epoch": 1.0652, "grad_norm": 1.8791629076004028, "learning_rate": 2.642857142857143e-05, "loss": 0.255, "step": 2410 }, { "epoch": 1.0654, "grad_norm": 1.9395337104797363, "learning_rate": 2.6418367346938777e-05, "loss": 0.2685, "step": 2411 }, { "epoch": 1.0656, "grad_norm": 1.4387263059616089, "learning_rate": 2.6408163265306123e-05, "loss": 0.1137, "step": 2412 }, { "epoch": 1.0658, "grad_norm": 3.260917901992798, "learning_rate": 2.6397959183673472e-05, "loss": 0.9726, "step": 2413 }, { "epoch": 1.066, "grad_norm": 3.520772695541382, "learning_rate": 2.6387755102040818e-05, "loss": 1.5219, "step": 2414 }, { "epoch": 1.0662, "grad_norm": 7.0468316078186035, "learning_rate": 2.6377551020408163e-05, "loss": 0.5509, "step": 2415 }, { "epoch": 1.0664, "grad_norm": 4.3371663093566895, "learning_rate": 2.6367346938775513e-05, "loss": 0.8301, "step": 2416 }, { "epoch": 1.0666, "grad_norm": 4.128448486328125, "learning_rate": 2.635714285714286e-05, "loss": 1.2042, "step": 2417 }, { "epoch": 1.0668, "grad_norm": 3.1050233840942383, "learning_rate": 2.6346938775510204e-05, "loss": 1.4974, "step": 2418 }, { "epoch": 1.067, "grad_norm": 1.9743566513061523, "learning_rate": 2.633673469387755e-05, "loss": 0.138, "step": 2419 }, { "epoch": 1.0672, "grad_norm": 1.6516227722167969, "learning_rate": 2.63265306122449e-05, "loss": 0.1792, "step": 2420 }, { "epoch": 1.0674, "grad_norm": 2.782998561859131, "learning_rate": 2.6316326530612245e-05, "loss": 0.3191, "step": 2421 }, { "epoch": 1.0676, "grad_norm": 3.995413303375244, "learning_rate": 2.630612244897959e-05, "loss": 0.5936, "step": 2422 }, { "epoch": 1.0678, "grad_norm": 3.0147314071655273, "learning_rate": 2.629591836734694e-05, "loss": 0.2437, "step": 2423 }, { "epoch": 1.068, "grad_norm": 9.054986000061035, "learning_rate": 2.6285714285714286e-05, "loss": 1.7769, "step": 2424 }, { "epoch": 1.0682, "grad_norm": 8.601308822631836, "learning_rate": 2.627551020408163e-05, "loss": 1.6046, "step": 2425 }, { "epoch": 1.0684, "grad_norm": 4.747802257537842, "learning_rate": 2.626530612244898e-05, "loss": 0.7729, "step": 2426 }, { "epoch": 1.0686, "grad_norm": 3.94885516166687, "learning_rate": 2.6255102040816326e-05, "loss": 1.7273, "step": 2427 }, { "epoch": 1.0688, "grad_norm": 2.5352084636688232, "learning_rate": 2.6244897959183672e-05, "loss": 0.6568, "step": 2428 }, { "epoch": 1.069, "grad_norm": 6.1487507820129395, "learning_rate": 2.623469387755102e-05, "loss": 2.8582, "step": 2429 }, { "epoch": 1.0692, "grad_norm": 10.868823051452637, "learning_rate": 2.6224489795918367e-05, "loss": 4.6807, "step": 2430 }, { "epoch": 1.0694, "grad_norm": 2.121119260787964, "learning_rate": 2.6214285714285713e-05, "loss": 0.2144, "step": 2431 }, { "epoch": 1.0695999999999999, "grad_norm": 5.630823612213135, "learning_rate": 2.620408163265306e-05, "loss": 1.0378, "step": 2432 }, { "epoch": 1.0698, "grad_norm": 14.371367454528809, "learning_rate": 2.6193877551020408e-05, "loss": 3.6745, "step": 2433 }, { "epoch": 1.07, "grad_norm": 7.156108856201172, "learning_rate": 2.6183673469387754e-05, "loss": 4.7997, "step": 2434 }, { "epoch": 1.0702, "grad_norm": 4.25835657119751, "learning_rate": 2.61734693877551e-05, "loss": 0.5825, "step": 2435 }, { "epoch": 1.0704, "grad_norm": 3.216127872467041, "learning_rate": 2.616326530612245e-05, "loss": 0.366, "step": 2436 }, { "epoch": 1.0706, "grad_norm": 1.3122063875198364, "learning_rate": 2.61530612244898e-05, "loss": 0.0502, "step": 2437 }, { "epoch": 1.0708, "grad_norm": 1.7881473302841187, "learning_rate": 2.6142857142857147e-05, "loss": 0.0359, "step": 2438 }, { "epoch": 1.071, "grad_norm": 1.5982885360717773, "learning_rate": 2.6132653061224493e-05, "loss": 0.1776, "step": 2439 }, { "epoch": 1.0712, "grad_norm": 1.4499366283416748, "learning_rate": 2.612244897959184e-05, "loss": 0.0721, "step": 2440 }, { "epoch": 1.0714, "grad_norm": 2.7761549949645996, "learning_rate": 2.6112244897959188e-05, "loss": 0.2619, "step": 2441 }, { "epoch": 1.0716, "grad_norm": 3.3720993995666504, "learning_rate": 2.6102040816326534e-05, "loss": 0.1568, "step": 2442 }, { "epoch": 1.0718, "grad_norm": 1.3414360284805298, "learning_rate": 2.609183673469388e-05, "loss": 0.0378, "step": 2443 }, { "epoch": 1.072, "grad_norm": 1.1581441164016724, "learning_rate": 2.608163265306123e-05, "loss": 0.0442, "step": 2444 }, { "epoch": 1.0722, "grad_norm": 1.699424386024475, "learning_rate": 2.6071428571428574e-05, "loss": 0.2574, "step": 2445 }, { "epoch": 1.0724, "grad_norm": 3.8419086933135986, "learning_rate": 2.606122448979592e-05, "loss": 0.2357, "step": 2446 }, { "epoch": 1.0726, "grad_norm": 8.273889541625977, "learning_rate": 2.605102040816327e-05, "loss": 1.3546, "step": 2447 }, { "epoch": 1.0728, "grad_norm": 2.8728175163269043, "learning_rate": 2.6040816326530615e-05, "loss": 0.64, "step": 2448 }, { "epoch": 1.073, "grad_norm": 6.3875579833984375, "learning_rate": 2.603061224489796e-05, "loss": 1.9487, "step": 2449 }, { "epoch": 1.0732, "grad_norm": 9.781810760498047, "learning_rate": 2.6020408163265307e-05, "loss": 1.4451, "step": 2450 }, { "epoch": 1.0734, "grad_norm": 13.755586624145508, "learning_rate": 2.6010204081632656e-05, "loss": 0.9554, "step": 2451 }, { "epoch": 1.0735999999999999, "grad_norm": 4.130563259124756, "learning_rate": 2.6000000000000002e-05, "loss": 0.173, "step": 2452 }, { "epoch": 1.0738, "grad_norm": 1.6912357807159424, "learning_rate": 2.5989795918367348e-05, "loss": 0.1261, "step": 2453 }, { "epoch": 1.074, "grad_norm": 4.4955925941467285, "learning_rate": 2.5979591836734697e-05, "loss": 0.2242, "step": 2454 }, { "epoch": 1.0742, "grad_norm": 2.2355611324310303, "learning_rate": 2.5969387755102043e-05, "loss": 0.1369, "step": 2455 }, { "epoch": 1.0744, "grad_norm": 1.3106834888458252, "learning_rate": 2.595918367346939e-05, "loss": 0.0492, "step": 2456 }, { "epoch": 1.0746, "grad_norm": 7.732717990875244, "learning_rate": 2.5948979591836738e-05, "loss": 0.6055, "step": 2457 }, { "epoch": 1.0748, "grad_norm": 13.904815673828125, "learning_rate": 2.5938775510204083e-05, "loss": 1.3689, "step": 2458 }, { "epoch": 1.075, "grad_norm": 1.706587314605713, "learning_rate": 2.592857142857143e-05, "loss": 0.0827, "step": 2459 }, { "epoch": 1.0752, "grad_norm": 7.618509292602539, "learning_rate": 2.5918367346938778e-05, "loss": 1.062, "step": 2460 }, { "epoch": 1.0754, "grad_norm": 14.990194320678711, "learning_rate": 2.5908163265306124e-05, "loss": 2.4455, "step": 2461 }, { "epoch": 1.0756000000000001, "grad_norm": 7.136456489562988, "learning_rate": 2.589795918367347e-05, "loss": 0.6104, "step": 2462 }, { "epoch": 1.0758, "grad_norm": 19.634891510009766, "learning_rate": 2.5887755102040816e-05, "loss": 1.4636, "step": 2463 }, { "epoch": 1.076, "grad_norm": 2.359050750732422, "learning_rate": 2.5877551020408165e-05, "loss": 0.4074, "step": 2464 }, { "epoch": 1.0762, "grad_norm": 5.584263801574707, "learning_rate": 2.586734693877551e-05, "loss": 2.4626, "step": 2465 }, { "epoch": 1.0764, "grad_norm": 4.744572162628174, "learning_rate": 2.5857142857142856e-05, "loss": 1.764, "step": 2466 }, { "epoch": 1.0766, "grad_norm": 4.963972568511963, "learning_rate": 2.5846938775510206e-05, "loss": 0.3838, "step": 2467 }, { "epoch": 1.0768, "grad_norm": 12.521373748779297, "learning_rate": 2.583673469387755e-05, "loss": 1.2165, "step": 2468 }, { "epoch": 1.077, "grad_norm": 33.27798843383789, "learning_rate": 2.5826530612244897e-05, "loss": 4.0389, "step": 2469 }, { "epoch": 1.0772, "grad_norm": 4.73541259765625, "learning_rate": 2.5816326530612246e-05, "loss": 0.9516, "step": 2470 }, { "epoch": 1.0774, "grad_norm": 7.456980228424072, "learning_rate": 2.5806122448979592e-05, "loss": 1.5232, "step": 2471 }, { "epoch": 1.0776, "grad_norm": 3.147059202194214, "learning_rate": 2.5795918367346938e-05, "loss": 0.2818, "step": 2472 }, { "epoch": 1.0778, "grad_norm": 6.3879008293151855, "learning_rate": 2.5785714285714284e-05, "loss": 0.7583, "step": 2473 }, { "epoch": 1.078, "grad_norm": 2.2574191093444824, "learning_rate": 2.5775510204081633e-05, "loss": 0.4802, "step": 2474 }, { "epoch": 1.0782, "grad_norm": 6.5756378173828125, "learning_rate": 2.576530612244898e-05, "loss": 2.5003, "step": 2475 }, { "epoch": 1.0784, "grad_norm": 11.439837455749512, "learning_rate": 2.5755102040816325e-05, "loss": 3.7775, "step": 2476 }, { "epoch": 1.0786, "grad_norm": 23.936866760253906, "learning_rate": 2.5744897959183674e-05, "loss": 2.1445, "step": 2477 }, { "epoch": 1.0788, "grad_norm": 23.30131721496582, "learning_rate": 2.573469387755102e-05, "loss": 3.8271, "step": 2478 }, { "epoch": 1.079, "grad_norm": 2.3670778274536133, "learning_rate": 2.5724489795918365e-05, "loss": 0.3152, "step": 2479 }, { "epoch": 1.0792, "grad_norm": 3.0792884826660156, "learning_rate": 2.5714285714285714e-05, "loss": 0.5886, "step": 2480 }, { "epoch": 1.0794, "grad_norm": 14.284750938415527, "learning_rate": 2.570408163265306e-05, "loss": 1.2113, "step": 2481 }, { "epoch": 1.0796000000000001, "grad_norm": 29.00316047668457, "learning_rate": 2.5693877551020413e-05, "loss": 3.9031, "step": 2482 }, { "epoch": 1.0798, "grad_norm": 2.106374502182007, "learning_rate": 2.568367346938776e-05, "loss": 0.2462, "step": 2483 }, { "epoch": 1.08, "grad_norm": 1.4427016973495483, "learning_rate": 2.5673469387755104e-05, "loss": 0.0533, "step": 2484 }, { "epoch": 1.0802, "grad_norm": 2.1444180011749268, "learning_rate": 2.5663265306122454e-05, "loss": 0.2781, "step": 2485 }, { "epoch": 1.0804, "grad_norm": 6.793755054473877, "learning_rate": 2.56530612244898e-05, "loss": 1.5963, "step": 2486 }, { "epoch": 1.0806, "grad_norm": 15.605535507202148, "learning_rate": 2.5642857142857145e-05, "loss": 5.3253, "step": 2487 }, { "epoch": 1.0808, "grad_norm": 6.925433158874512, "learning_rate": 2.5632653061224494e-05, "loss": 0.827, "step": 2488 }, { "epoch": 1.081, "grad_norm": 7.272058963775635, "learning_rate": 2.562244897959184e-05, "loss": 1.4671, "step": 2489 }, { "epoch": 1.0812, "grad_norm": 3.7872958183288574, "learning_rate": 2.5612244897959186e-05, "loss": 0.5281, "step": 2490 }, { "epoch": 1.0814, "grad_norm": 1.976417899131775, "learning_rate": 2.5602040816326535e-05, "loss": 0.2451, "step": 2491 }, { "epoch": 1.0816, "grad_norm": 2.881429433822632, "learning_rate": 2.559183673469388e-05, "loss": 0.2317, "step": 2492 }, { "epoch": 1.0818, "grad_norm": 3.65335750579834, "learning_rate": 2.5581632653061227e-05, "loss": 0.3894, "step": 2493 }, { "epoch": 1.082, "grad_norm": 2.544227123260498, "learning_rate": 2.5571428571428572e-05, "loss": 0.3121, "step": 2494 }, { "epoch": 1.0822, "grad_norm": 7.100771903991699, "learning_rate": 2.556122448979592e-05, "loss": 1.753, "step": 2495 }, { "epoch": 1.0824, "grad_norm": 12.864798545837402, "learning_rate": 2.5551020408163267e-05, "loss": 2.8146, "step": 2496 }, { "epoch": 1.0826, "grad_norm": 22.558868408203125, "learning_rate": 2.5540816326530613e-05, "loss": 1.1523, "step": 2497 }, { "epoch": 1.0828, "grad_norm": 73.28495025634766, "learning_rate": 2.5530612244897962e-05, "loss": 5.9876, "step": 2498 }, { "epoch": 1.083, "grad_norm": 80.7002182006836, "learning_rate": 2.5520408163265308e-05, "loss": 5.6509, "step": 2499 }, { "epoch": 1.0832, "grad_norm": 2.3751392364501953, "learning_rate": 2.5510204081632654e-05, "loss": 0.3095, "step": 2500 }, { "epoch": 1.0834, "grad_norm": 5.686920642852783, "learning_rate": 2.5500000000000003e-05, "loss": 0.7626, "step": 2501 }, { "epoch": 1.0836, "grad_norm": 4.439648628234863, "learning_rate": 2.548979591836735e-05, "loss": 0.4574, "step": 2502 }, { "epoch": 1.0838, "grad_norm": 4.587183475494385, "learning_rate": 2.5479591836734695e-05, "loss": 0.6623, "step": 2503 }, { "epoch": 1.084, "grad_norm": 2.004228353500366, "learning_rate": 2.546938775510204e-05, "loss": 0.0682, "step": 2504 }, { "epoch": 1.0842, "grad_norm": 4.348333358764648, "learning_rate": 2.545918367346939e-05, "loss": 0.1526, "step": 2505 }, { "epoch": 1.0844, "grad_norm": 2.2945642471313477, "learning_rate": 2.5448979591836736e-05, "loss": 0.6309, "step": 2506 }, { "epoch": 1.0846, "grad_norm": 3.4506020545959473, "learning_rate": 2.543877551020408e-05, "loss": 1.6623, "step": 2507 }, { "epoch": 1.0848, "grad_norm": 3.223053216934204, "learning_rate": 2.542857142857143e-05, "loss": 0.3624, "step": 2508 }, { "epoch": 1.085, "grad_norm": 2.0197436809539795, "learning_rate": 2.5418367346938776e-05, "loss": 0.4646, "step": 2509 }, { "epoch": 1.0852, "grad_norm": 3.190613031387329, "learning_rate": 2.5408163265306122e-05, "loss": 1.6204, "step": 2510 }, { "epoch": 1.0854, "grad_norm": 1.880837082862854, "learning_rate": 2.539795918367347e-05, "loss": 0.1061, "step": 2511 }, { "epoch": 1.0856, "grad_norm": 3.710780143737793, "learning_rate": 2.5387755102040817e-05, "loss": 0.4916, "step": 2512 }, { "epoch": 1.0858, "grad_norm": 4.7563886642456055, "learning_rate": 2.5377551020408163e-05, "loss": 0.6781, "step": 2513 }, { "epoch": 1.086, "grad_norm": 6.385908603668213, "learning_rate": 2.536734693877551e-05, "loss": 0.4621, "step": 2514 }, { "epoch": 1.0862, "grad_norm": 9.943044662475586, "learning_rate": 2.5357142857142858e-05, "loss": 1.7608, "step": 2515 }, { "epoch": 1.0864, "grad_norm": 7.606964588165283, "learning_rate": 2.5346938775510204e-05, "loss": 2.5999, "step": 2516 }, { "epoch": 1.0866, "grad_norm": 8.824433326721191, "learning_rate": 2.533673469387755e-05, "loss": 3.2743, "step": 2517 }, { "epoch": 1.0868, "grad_norm": 3.378680467605591, "learning_rate": 2.53265306122449e-05, "loss": 1.7249, "step": 2518 }, { "epoch": 1.087, "grad_norm": 7.698119163513184, "learning_rate": 2.5316326530612244e-05, "loss": 1.4085, "step": 2519 }, { "epoch": 1.0872, "grad_norm": 16.662708282470703, "learning_rate": 2.530612244897959e-05, "loss": 3.6055, "step": 2520 }, { "epoch": 1.0874, "grad_norm": 2.6420581340789795, "learning_rate": 2.529591836734694e-05, "loss": 0.2005, "step": 2521 }, { "epoch": 1.0876, "grad_norm": 3.360490560531616, "learning_rate": 2.5285714285714285e-05, "loss": 0.862, "step": 2522 }, { "epoch": 1.0878, "grad_norm": 3.689589500427246, "learning_rate": 2.527551020408163e-05, "loss": 2.2184, "step": 2523 }, { "epoch": 1.088, "grad_norm": 3.124295711517334, "learning_rate": 2.526530612244898e-05, "loss": 1.5167, "step": 2524 }, { "epoch": 1.0882, "grad_norm": 2.8372583389282227, "learning_rate": 2.5255102040816326e-05, "loss": 0.5618, "step": 2525 }, { "epoch": 1.0884, "grad_norm": 4.048417091369629, "learning_rate": 2.5244897959183672e-05, "loss": 0.5793, "step": 2526 }, { "epoch": 1.0886, "grad_norm": 2.170830249786377, "learning_rate": 2.5234693877551017e-05, "loss": 0.2466, "step": 2527 }, { "epoch": 1.0888, "grad_norm": 1.8162426948547363, "learning_rate": 2.522448979591837e-05, "loss": 0.1953, "step": 2528 }, { "epoch": 1.089, "grad_norm": 2.095097303390503, "learning_rate": 2.521428571428572e-05, "loss": 0.1488, "step": 2529 }, { "epoch": 1.0892, "grad_norm": 3.6445815563201904, "learning_rate": 2.5204081632653065e-05, "loss": 0.5365, "step": 2530 }, { "epoch": 1.0894, "grad_norm": 2.005758047103882, "learning_rate": 2.519387755102041e-05, "loss": 0.2343, "step": 2531 }, { "epoch": 1.0896, "grad_norm": 2.506690502166748, "learning_rate": 2.518367346938776e-05, "loss": 0.276, "step": 2532 }, { "epoch": 1.0898, "grad_norm": 4.830741882324219, "learning_rate": 2.5173469387755106e-05, "loss": 0.7821, "step": 2533 }, { "epoch": 1.09, "grad_norm": 16.803346633911133, "learning_rate": 2.516326530612245e-05, "loss": 1.7534, "step": 2534 }, { "epoch": 1.0902, "grad_norm": 20.17479705810547, "learning_rate": 2.5153061224489797e-05, "loss": 1.6485, "step": 2535 }, { "epoch": 1.0904, "grad_norm": 6.046016216278076, "learning_rate": 2.5142857142857147e-05, "loss": 1.2223, "step": 2536 }, { "epoch": 1.0906, "grad_norm": 11.118535995483398, "learning_rate": 2.5132653061224492e-05, "loss": 2.9827, "step": 2537 }, { "epoch": 1.0908, "grad_norm": 4.158987045288086, "learning_rate": 2.5122448979591838e-05, "loss": 0.8138, "step": 2538 }, { "epoch": 1.091, "grad_norm": 5.505331516265869, "learning_rate": 2.5112244897959187e-05, "loss": 0.3746, "step": 2539 }, { "epoch": 1.0912, "grad_norm": 17.089290618896484, "learning_rate": 2.5102040816326533e-05, "loss": 0.8031, "step": 2540 }, { "epoch": 1.0914, "grad_norm": 1.8025397062301636, "learning_rate": 2.509183673469388e-05, "loss": 0.2397, "step": 2541 }, { "epoch": 1.0916, "grad_norm": 1.887250542640686, "learning_rate": 2.5081632653061228e-05, "loss": 0.1461, "step": 2542 }, { "epoch": 1.0918, "grad_norm": 1.9648981094360352, "learning_rate": 2.5071428571428574e-05, "loss": 0.0805, "step": 2543 }, { "epoch": 1.092, "grad_norm": 4.035311698913574, "learning_rate": 2.506122448979592e-05, "loss": 0.5857, "step": 2544 }, { "epoch": 1.0922, "grad_norm": 6.021888732910156, "learning_rate": 2.5051020408163265e-05, "loss": 1.2061, "step": 2545 }, { "epoch": 1.0924, "grad_norm": 1.1482367515563965, "learning_rate": 2.5040816326530615e-05, "loss": 0.0516, "step": 2546 }, { "epoch": 1.0926, "grad_norm": 1.3609832525253296, "learning_rate": 2.503061224489796e-05, "loss": 0.0763, "step": 2547 }, { "epoch": 1.0928, "grad_norm": 1.8674981594085693, "learning_rate": 2.5020408163265306e-05, "loss": 0.207, "step": 2548 }, { "epoch": 1.093, "grad_norm": 2.574284791946411, "learning_rate": 2.5010204081632655e-05, "loss": 0.4801, "step": 2549 }, { "epoch": 1.0932, "grad_norm": 5.7880940437316895, "learning_rate": 2.5e-05, "loss": 1.706, "step": 2550 }, { "epoch": 1.0934, "grad_norm": 8.400511741638184, "learning_rate": 2.4989795918367347e-05, "loss": 0.9191, "step": 2551 }, { "epoch": 1.0936, "grad_norm": 6.415307998657227, "learning_rate": 2.4979591836734696e-05, "loss": 0.9075, "step": 2552 }, { "epoch": 1.0937999999999999, "grad_norm": 4.819631099700928, "learning_rate": 2.4969387755102042e-05, "loss": 1.2393, "step": 2553 }, { "epoch": 1.094, "grad_norm": 4.074408531188965, "learning_rate": 2.4959183673469388e-05, "loss": 0.7136, "step": 2554 }, { "epoch": 1.0942, "grad_norm": 1.512075662612915, "learning_rate": 2.4948979591836737e-05, "loss": 0.1217, "step": 2555 }, { "epoch": 1.0944, "grad_norm": 1.1646981239318848, "learning_rate": 2.4938775510204083e-05, "loss": 0.0507, "step": 2556 }, { "epoch": 1.0946, "grad_norm": 1.680336594581604, "learning_rate": 2.492857142857143e-05, "loss": 0.1584, "step": 2557 }, { "epoch": 1.0948, "grad_norm": 2.7351956367492676, "learning_rate": 2.4918367346938774e-05, "loss": 0.37, "step": 2558 }, { "epoch": 1.095, "grad_norm": 6.237983703613281, "learning_rate": 2.4908163265306123e-05, "loss": 1.5575, "step": 2559 }, { "epoch": 1.0952, "grad_norm": 2.0216691493988037, "learning_rate": 2.489795918367347e-05, "loss": 0.1551, "step": 2560 }, { "epoch": 1.0954, "grad_norm": 7.477294921875, "learning_rate": 2.4887755102040815e-05, "loss": 0.5022, "step": 2561 }, { "epoch": 1.0956, "grad_norm": 17.65044403076172, "learning_rate": 2.4877551020408164e-05, "loss": 2.1472, "step": 2562 }, { "epoch": 1.0958, "grad_norm": 1.9280164241790771, "learning_rate": 2.4867346938775513e-05, "loss": 0.1854, "step": 2563 }, { "epoch": 1.096, "grad_norm": 3.616777181625366, "learning_rate": 2.485714285714286e-05, "loss": 0.2811, "step": 2564 }, { "epoch": 1.0962, "grad_norm": 3.648794651031494, "learning_rate": 2.4846938775510205e-05, "loss": 0.496, "step": 2565 }, { "epoch": 1.0964, "grad_norm": 4.593664646148682, "learning_rate": 2.4836734693877554e-05, "loss": 0.6411, "step": 2566 }, { "epoch": 1.0966, "grad_norm": 2.9092133045196533, "learning_rate": 2.48265306122449e-05, "loss": 1.4583, "step": 2567 }, { "epoch": 1.0968, "grad_norm": 5.004642486572266, "learning_rate": 2.4816326530612246e-05, "loss": 5.1528, "step": 2568 }, { "epoch": 1.097, "grad_norm": 3.7020583152770996, "learning_rate": 2.4806122448979595e-05, "loss": 0.7462, "step": 2569 }, { "epoch": 1.0972, "grad_norm": 4.8899736404418945, "learning_rate": 2.479591836734694e-05, "loss": 0.7615, "step": 2570 }, { "epoch": 1.0974, "grad_norm": 12.54307746887207, "learning_rate": 2.4785714285714287e-05, "loss": 1.0085, "step": 2571 }, { "epoch": 1.0976, "grad_norm": 17.654830932617188, "learning_rate": 2.4775510204081632e-05, "loss": 2.3768, "step": 2572 }, { "epoch": 1.0977999999999999, "grad_norm": 3.0789108276367188, "learning_rate": 2.476530612244898e-05, "loss": 0.6546, "step": 2573 }, { "epoch": 1.098, "grad_norm": 2.571528911590576, "learning_rate": 2.4755102040816327e-05, "loss": 0.6165, "step": 2574 }, { "epoch": 1.0982, "grad_norm": 4.755832672119141, "learning_rate": 2.4744897959183673e-05, "loss": 2.2378, "step": 2575 }, { "epoch": 1.0984, "grad_norm": 4.332126617431641, "learning_rate": 2.4734693877551022e-05, "loss": 2.0653, "step": 2576 }, { "epoch": 1.0986, "grad_norm": 3.1858487129211426, "learning_rate": 2.4724489795918368e-05, "loss": 1.5829, "step": 2577 }, { "epoch": 1.0988, "grad_norm": 12.445663452148438, "learning_rate": 2.4714285714285714e-05, "loss": 1.4157, "step": 2578 }, { "epoch": 1.099, "grad_norm": 25.608064651489258, "learning_rate": 2.4704081632653063e-05, "loss": 3.2528, "step": 2579 }, { "epoch": 1.0992, "grad_norm": 1.2519673109054565, "learning_rate": 2.469387755102041e-05, "loss": 0.0214, "step": 2580 }, { "epoch": 1.0994, "grad_norm": 2.4915757179260254, "learning_rate": 2.4683673469387755e-05, "loss": 0.3216, "step": 2581 }, { "epoch": 1.0996, "grad_norm": 3.9431369304656982, "learning_rate": 2.46734693877551e-05, "loss": 0.5245, "step": 2582 }, { "epoch": 1.0998, "grad_norm": 1.9523743391036987, "learning_rate": 2.466326530612245e-05, "loss": 0.2015, "step": 2583 }, { "epoch": 1.1, "grad_norm": 2.3328518867492676, "learning_rate": 2.4653061224489795e-05, "loss": 0.2947, "step": 2584 }, { "epoch": 1.1002, "grad_norm": 4.120367050170898, "learning_rate": 2.4642857142857145e-05, "loss": 0.5547, "step": 2585 }, { "epoch": 1.1004, "grad_norm": 1.4598771333694458, "learning_rate": 2.4632653061224494e-05, "loss": 0.055, "step": 2586 }, { "epoch": 1.1006, "grad_norm": 2.297677755355835, "learning_rate": 2.462244897959184e-05, "loss": 0.262, "step": 2587 }, { "epoch": 1.1008, "grad_norm": 4.969046115875244, "learning_rate": 2.4612244897959185e-05, "loss": 0.6594, "step": 2588 }, { "epoch": 1.101, "grad_norm": 5.883965492248535, "learning_rate": 2.460204081632653e-05, "loss": 0.2786, "step": 2589 }, { "epoch": 1.1012, "grad_norm": 0.9458068609237671, "learning_rate": 2.459183673469388e-05, "loss": 0.02, "step": 2590 }, { "epoch": 1.1014, "grad_norm": 2.026034355163574, "learning_rate": 2.4581632653061226e-05, "loss": 0.1354, "step": 2591 }, { "epoch": 1.1016, "grad_norm": 5.589249134063721, "learning_rate": 2.4571428571428572e-05, "loss": 0.9564, "step": 2592 }, { "epoch": 1.1018, "grad_norm": 14.085370063781738, "learning_rate": 2.456122448979592e-05, "loss": 1.7646, "step": 2593 }, { "epoch": 1.102, "grad_norm": 28.355955123901367, "learning_rate": 2.4551020408163267e-05, "loss": 3.7831, "step": 2594 }, { "epoch": 1.1022, "grad_norm": 4.164278507232666, "learning_rate": 2.4540816326530613e-05, "loss": 0.0793, "step": 2595 }, { "epoch": 1.1024, "grad_norm": 5.802898406982422, "learning_rate": 2.4530612244897962e-05, "loss": 0.2683, "step": 2596 }, { "epoch": 1.1026, "grad_norm": 10.879895210266113, "learning_rate": 2.4520408163265308e-05, "loss": 1.8366, "step": 2597 }, { "epoch": 1.1028, "grad_norm": 10.431557655334473, "learning_rate": 2.4510204081632653e-05, "loss": 3.1627, "step": 2598 }, { "epoch": 1.103, "grad_norm": 1.2934068441390991, "learning_rate": 2.45e-05, "loss": 0.0489, "step": 2599 }, { "epoch": 1.1032, "grad_norm": 2.8279800415039062, "learning_rate": 2.448979591836735e-05, "loss": 0.134, "step": 2600 }, { "epoch": 1.1034, "grad_norm": 1.7171381711959839, "learning_rate": 2.4479591836734694e-05, "loss": 0.1996, "step": 2601 }, { "epoch": 1.1036, "grad_norm": 1.8562647104263306, "learning_rate": 2.446938775510204e-05, "loss": 0.1764, "step": 2602 }, { "epoch": 1.1038000000000001, "grad_norm": 1.6449317932128906, "learning_rate": 2.445918367346939e-05, "loss": 0.1458, "step": 2603 }, { "epoch": 1.104, "grad_norm": 1.1087433099746704, "learning_rate": 2.4448979591836735e-05, "loss": 0.0387, "step": 2604 }, { "epoch": 1.1042, "grad_norm": 1.929565191268921, "learning_rate": 2.443877551020408e-05, "loss": 0.2624, "step": 2605 }, { "epoch": 1.1044, "grad_norm": 5.939025402069092, "learning_rate": 2.442857142857143e-05, "loss": 0.7206, "step": 2606 }, { "epoch": 1.1046, "grad_norm": 14.030128479003906, "learning_rate": 2.441836734693878e-05, "loss": 1.3375, "step": 2607 }, { "epoch": 1.1048, "grad_norm": 2.9569661617279053, "learning_rate": 2.4408163265306125e-05, "loss": 0.6407, "step": 2608 }, { "epoch": 1.105, "grad_norm": 6.541789531707764, "learning_rate": 2.439795918367347e-05, "loss": 1.7242, "step": 2609 }, { "epoch": 1.1052, "grad_norm": 1.908613681793213, "learning_rate": 2.438775510204082e-05, "loss": 0.1024, "step": 2610 }, { "epoch": 1.1054, "grad_norm": 4.430586338043213, "learning_rate": 2.4377551020408166e-05, "loss": 0.9539, "step": 2611 }, { "epoch": 1.1056, "grad_norm": 10.48686408996582, "learning_rate": 2.436734693877551e-05, "loss": 3.0465, "step": 2612 }, { "epoch": 1.1058, "grad_norm": 2.074552297592163, "learning_rate": 2.4357142857142857e-05, "loss": 0.2299, "step": 2613 }, { "epoch": 1.106, "grad_norm": 3.4246790409088135, "learning_rate": 2.4346938775510206e-05, "loss": 0.4109, "step": 2614 }, { "epoch": 1.1062, "grad_norm": 11.392557144165039, "learning_rate": 2.4336734693877552e-05, "loss": 2.621, "step": 2615 }, { "epoch": 1.1064, "grad_norm": 17.0239200592041, "learning_rate": 2.4326530612244898e-05, "loss": 5.0606, "step": 2616 }, { "epoch": 1.1066, "grad_norm": 2.5136921405792236, "learning_rate": 2.4316326530612247e-05, "loss": 0.303, "step": 2617 }, { "epoch": 1.1068, "grad_norm": 4.009968280792236, "learning_rate": 2.4306122448979593e-05, "loss": 0.5713, "step": 2618 }, { "epoch": 1.107, "grad_norm": 2.3244709968566895, "learning_rate": 2.429591836734694e-05, "loss": 0.0982, "step": 2619 }, { "epoch": 1.1072, "grad_norm": 3.1784238815307617, "learning_rate": 2.4285714285714288e-05, "loss": 0.1382, "step": 2620 }, { "epoch": 1.1074, "grad_norm": 1.188054084777832, "learning_rate": 2.4275510204081634e-05, "loss": 0.042, "step": 2621 }, { "epoch": 1.1076, "grad_norm": 6.872524261474609, "learning_rate": 2.426530612244898e-05, "loss": 0.6016, "step": 2622 }, { "epoch": 1.1078000000000001, "grad_norm": 24.797069549560547, "learning_rate": 2.425510204081633e-05, "loss": 2.0936, "step": 2623 }, { "epoch": 1.108, "grad_norm": 1.7459396123886108, "learning_rate": 2.4244897959183674e-05, "loss": 0.1488, "step": 2624 }, { "epoch": 1.1082, "grad_norm": 1.2388994693756104, "learning_rate": 2.423469387755102e-05, "loss": 0.0466, "step": 2625 }, { "epoch": 1.1084, "grad_norm": 7.152106761932373, "learning_rate": 2.4224489795918366e-05, "loss": 1.1114, "step": 2626 }, { "epoch": 1.1086, "grad_norm": 14.875006675720215, "learning_rate": 2.4214285714285715e-05, "loss": 3.5813, "step": 2627 }, { "epoch": 1.1088, "grad_norm": 4.775994300842285, "learning_rate": 2.420408163265306e-05, "loss": 0.3872, "step": 2628 }, { "epoch": 1.109, "grad_norm": 12.121844291687012, "learning_rate": 2.4193877551020407e-05, "loss": 1.6924, "step": 2629 }, { "epoch": 1.1092, "grad_norm": 1.738283395767212, "learning_rate": 2.4183673469387756e-05, "loss": 0.1797, "step": 2630 }, { "epoch": 1.1094, "grad_norm": 1.5898929834365845, "learning_rate": 2.4173469387755105e-05, "loss": 0.1573, "step": 2631 }, { "epoch": 1.1096, "grad_norm": 1.715936541557312, "learning_rate": 2.416326530612245e-05, "loss": 0.1822, "step": 2632 }, { "epoch": 1.1098, "grad_norm": 1.7776638269424438, "learning_rate": 2.4153061224489797e-05, "loss": 0.1873, "step": 2633 }, { "epoch": 1.11, "grad_norm": 2.2790379524230957, "learning_rate": 2.4142857142857146e-05, "loss": 0.3086, "step": 2634 }, { "epoch": 1.1102, "grad_norm": 3.2239675521850586, "learning_rate": 2.4132653061224492e-05, "loss": 0.6393, "step": 2635 }, { "epoch": 1.1104, "grad_norm": 3.7233240604400635, "learning_rate": 2.4122448979591838e-05, "loss": 0.3281, "step": 2636 }, { "epoch": 1.1106, "grad_norm": 11.728107452392578, "learning_rate": 2.4112244897959187e-05, "loss": 1.6738, "step": 2637 }, { "epoch": 1.1108, "grad_norm": 12.237937927246094, "learning_rate": 2.4102040816326533e-05, "loss": 3.5454, "step": 2638 }, { "epoch": 1.111, "grad_norm": 1.3609619140625, "learning_rate": 2.409183673469388e-05, "loss": 0.0409, "step": 2639 }, { "epoch": 1.1112, "grad_norm": 2.414645195007324, "learning_rate": 2.4081632653061224e-05, "loss": 0.5846, "step": 2640 }, { "epoch": 1.1114, "grad_norm": 4.060914039611816, "learning_rate": 2.4071428571428573e-05, "loss": 1.8276, "step": 2641 }, { "epoch": 1.1116, "grad_norm": 3.0793774127960205, "learning_rate": 2.406122448979592e-05, "loss": 0.6324, "step": 2642 }, { "epoch": 1.1118, "grad_norm": 1.6483436822891235, "learning_rate": 2.4051020408163265e-05, "loss": 0.1275, "step": 2643 }, { "epoch": 1.112, "grad_norm": 1.8608285188674927, "learning_rate": 2.4040816326530614e-05, "loss": 0.2238, "step": 2644 }, { "epoch": 1.1122, "grad_norm": 3.8672969341278076, "learning_rate": 2.403061224489796e-05, "loss": 0.8668, "step": 2645 }, { "epoch": 1.1124, "grad_norm": 7.139001369476318, "learning_rate": 2.4020408163265306e-05, "loss": 2.4751, "step": 2646 }, { "epoch": 1.1126, "grad_norm": 2.5560009479522705, "learning_rate": 2.4010204081632655e-05, "loss": 0.2781, "step": 2647 }, { "epoch": 1.1128, "grad_norm": 13.595659255981445, "learning_rate": 2.4e-05, "loss": 1.1682, "step": 2648 }, { "epoch": 1.113, "grad_norm": 16.68004035949707, "learning_rate": 2.3989795918367346e-05, "loss": 2.1596, "step": 2649 }, { "epoch": 1.1132, "grad_norm": 1.8965356349945068, "learning_rate": 2.3979591836734696e-05, "loss": 0.2235, "step": 2650 }, { "epoch": 1.1134, "grad_norm": 1.8255671262741089, "learning_rate": 2.396938775510204e-05, "loss": 0.1737, "step": 2651 }, { "epoch": 1.1136, "grad_norm": 5.417263031005859, "learning_rate": 2.3959183673469387e-05, "loss": 0.3718, "step": 2652 }, { "epoch": 1.1138, "grad_norm": 7.282905578613281, "learning_rate": 2.3948979591836736e-05, "loss": 1.4317, "step": 2653 }, { "epoch": 1.114, "grad_norm": 1.0953459739685059, "learning_rate": 2.3938775510204086e-05, "loss": 0.0445, "step": 2654 }, { "epoch": 1.1142, "grad_norm": 1.5249688625335693, "learning_rate": 2.392857142857143e-05, "loss": 0.0532, "step": 2655 }, { "epoch": 1.1144, "grad_norm": 3.1003592014312744, "learning_rate": 2.3918367346938777e-05, "loss": 0.3299, "step": 2656 }, { "epoch": 1.1146, "grad_norm": 7.614220142364502, "learning_rate": 2.3908163265306123e-05, "loss": 2.4333, "step": 2657 }, { "epoch": 1.1148, "grad_norm": 23.154939651489258, "learning_rate": 2.3897959183673472e-05, "loss": 3.3009, "step": 2658 }, { "epoch": 1.115, "grad_norm": 6.271807670593262, "learning_rate": 2.3887755102040818e-05, "loss": 0.8004, "step": 2659 }, { "epoch": 1.1152, "grad_norm": 8.839851379394531, "learning_rate": 2.3877551020408164e-05, "loss": 0.7816, "step": 2660 }, { "epoch": 1.1154, "grad_norm": 2.4665706157684326, "learning_rate": 2.3867346938775513e-05, "loss": 0.1041, "step": 2661 }, { "epoch": 1.1156, "grad_norm": 1.2891920804977417, "learning_rate": 2.385714285714286e-05, "loss": 0.053, "step": 2662 }, { "epoch": 1.1158, "grad_norm": 5.995043754577637, "learning_rate": 2.3846938775510204e-05, "loss": 1.0304, "step": 2663 }, { "epoch": 1.116, "grad_norm": 9.764463424682617, "learning_rate": 2.3836734693877554e-05, "loss": 2.591, "step": 2664 }, { "epoch": 1.1162, "grad_norm": 1.9951735734939575, "learning_rate": 2.38265306122449e-05, "loss": 0.2677, "step": 2665 }, { "epoch": 1.1164, "grad_norm": 3.084277629852295, "learning_rate": 2.3816326530612245e-05, "loss": 0.6408, "step": 2666 }, { "epoch": 1.1166, "grad_norm": 1.1833219528198242, "learning_rate": 2.380612244897959e-05, "loss": 0.0451, "step": 2667 }, { "epoch": 1.1168, "grad_norm": 2.448613405227661, "learning_rate": 2.379591836734694e-05, "loss": 0.6766, "step": 2668 }, { "epoch": 1.117, "grad_norm": 3.4040493965148926, "learning_rate": 2.3785714285714286e-05, "loss": 1.5933, "step": 2669 }, { "epoch": 1.1172, "grad_norm": 14.958999633789062, "learning_rate": 2.3775510204081632e-05, "loss": 0.8691, "step": 2670 }, { "epoch": 1.1174, "grad_norm": 3.2945313453674316, "learning_rate": 2.376530612244898e-05, "loss": 0.3583, "step": 2671 }, { "epoch": 1.1176, "grad_norm": 5.522057056427002, "learning_rate": 2.3755102040816327e-05, "loss": 1.0304, "step": 2672 }, { "epoch": 1.1178, "grad_norm": 8.773022651672363, "learning_rate": 2.3744897959183673e-05, "loss": 2.6604, "step": 2673 }, { "epoch": 1.1179999999999999, "grad_norm": 1.739574670791626, "learning_rate": 2.373469387755102e-05, "loss": 0.2143, "step": 2674 }, { "epoch": 1.1182, "grad_norm": 2.3119235038757324, "learning_rate": 2.372448979591837e-05, "loss": 0.3143, "step": 2675 }, { "epoch": 1.1184, "grad_norm": 3.644256591796875, "learning_rate": 2.3714285714285717e-05, "loss": 0.1712, "step": 2676 }, { "epoch": 1.1186, "grad_norm": 1.1288031339645386, "learning_rate": 2.3704081632653062e-05, "loss": 0.0345, "step": 2677 }, { "epoch": 1.1188, "grad_norm": 1.857331395149231, "learning_rate": 2.369387755102041e-05, "loss": 0.2174, "step": 2678 }, { "epoch": 1.119, "grad_norm": 2.1009767055511475, "learning_rate": 2.3683673469387757e-05, "loss": 0.2082, "step": 2679 }, { "epoch": 1.1192, "grad_norm": 2.547800302505493, "learning_rate": 2.3673469387755103e-05, "loss": 0.2913, "step": 2680 }, { "epoch": 1.1194, "grad_norm": 4.963531494140625, "learning_rate": 2.3663265306122452e-05, "loss": 0.7662, "step": 2681 }, { "epoch": 1.1196, "grad_norm": 4.640702247619629, "learning_rate": 2.3653061224489798e-05, "loss": 0.77, "step": 2682 }, { "epoch": 1.1198, "grad_norm": 3.045327663421631, "learning_rate": 2.3642857142857144e-05, "loss": 1.4742, "step": 2683 }, { "epoch": 1.12, "grad_norm": 8.779814720153809, "learning_rate": 2.363265306122449e-05, "loss": 0.9135, "step": 2684 }, { "epoch": 1.1202, "grad_norm": 17.087533950805664, "learning_rate": 2.362244897959184e-05, "loss": 3.5725, "step": 2685 }, { "epoch": 1.1204, "grad_norm": 1.8048440217971802, "learning_rate": 2.3612244897959185e-05, "loss": 0.1151, "step": 2686 }, { "epoch": 1.1206, "grad_norm": 4.408398151397705, "learning_rate": 2.360204081632653e-05, "loss": 0.4616, "step": 2687 }, { "epoch": 1.1208, "grad_norm": 4.695296287536621, "learning_rate": 2.359183673469388e-05, "loss": 1.5793, "step": 2688 }, { "epoch": 1.121, "grad_norm": 12.745363235473633, "learning_rate": 2.3581632653061226e-05, "loss": 3.0204, "step": 2689 }, { "epoch": 1.1212, "grad_norm": 10.47659969329834, "learning_rate": 2.357142857142857e-05, "loss": 0.5482, "step": 2690 }, { "epoch": 1.1214, "grad_norm": 9.649550437927246, "learning_rate": 2.356122448979592e-05, "loss": 1.2656, "step": 2691 }, { "epoch": 1.1216, "grad_norm": 2.9947545528411865, "learning_rate": 2.3551020408163266e-05, "loss": 0.9884, "step": 2692 }, { "epoch": 1.1218, "grad_norm": 8.209904670715332, "learning_rate": 2.3540816326530612e-05, "loss": 3.1175, "step": 2693 }, { "epoch": 1.1219999999999999, "grad_norm": 5.573894023895264, "learning_rate": 2.3530612244897958e-05, "loss": 0.7264, "step": 2694 }, { "epoch": 1.1222, "grad_norm": 3.8499183654785156, "learning_rate": 2.3520408163265307e-05, "loss": 0.1304, "step": 2695 }, { "epoch": 1.1224, "grad_norm": 10.379673957824707, "learning_rate": 2.3510204081632653e-05, "loss": 1.1817, "step": 2696 }, { "epoch": 1.1226, "grad_norm": 10.155116081237793, "learning_rate": 2.35e-05, "loss": 1.4249, "step": 2697 }, { "epoch": 1.1228, "grad_norm": 4.0061163902282715, "learning_rate": 2.3489795918367348e-05, "loss": 0.3103, "step": 2698 }, { "epoch": 1.123, "grad_norm": 4.94691801071167, "learning_rate": 2.3479591836734697e-05, "loss": 0.797, "step": 2699 }, { "epoch": 1.1232, "grad_norm": 1.813581943511963, "learning_rate": 2.3469387755102043e-05, "loss": 0.2344, "step": 2700 }, { "epoch": 1.1234, "grad_norm": 1.988603949546814, "learning_rate": 2.345918367346939e-05, "loss": 0.228, "step": 2701 }, { "epoch": 1.1236, "grad_norm": 3.084177255630493, "learning_rate": 2.3448979591836738e-05, "loss": 0.3087, "step": 2702 }, { "epoch": 1.1238, "grad_norm": 3.0632259845733643, "learning_rate": 2.3438775510204084e-05, "loss": 0.5855, "step": 2703 }, { "epoch": 1.124, "grad_norm": 2.8814468383789062, "learning_rate": 2.342857142857143e-05, "loss": 0.2033, "step": 2704 }, { "epoch": 1.1242, "grad_norm": 3.1632778644561768, "learning_rate": 2.341836734693878e-05, "loss": 0.5382, "step": 2705 }, { "epoch": 1.1244, "grad_norm": 1.307254672050476, "learning_rate": 2.3408163265306124e-05, "loss": 0.0582, "step": 2706 }, { "epoch": 1.1246, "grad_norm": 3.011153221130371, "learning_rate": 2.339795918367347e-05, "loss": 0.1373, "step": 2707 }, { "epoch": 1.1248, "grad_norm": 2.2650563716888428, "learning_rate": 2.3387755102040816e-05, "loss": 0.286, "step": 2708 }, { "epoch": 1.125, "grad_norm": 6.4699296951293945, "learning_rate": 2.3377551020408165e-05, "loss": 0.7095, "step": 2709 }, { "epoch": 1.1252, "grad_norm": 7.103350639343262, "learning_rate": 2.336734693877551e-05, "loss": 1.4044, "step": 2710 }, { "epoch": 1.1254, "grad_norm": 2.83284330368042, "learning_rate": 2.3357142857142857e-05, "loss": 0.1647, "step": 2711 }, { "epoch": 1.1256, "grad_norm": 6.117814064025879, "learning_rate": 2.3346938775510206e-05, "loss": 0.9263, "step": 2712 }, { "epoch": 1.1258, "grad_norm": 11.084607124328613, "learning_rate": 2.333673469387755e-05, "loss": 1.686, "step": 2713 }, { "epoch": 1.126, "grad_norm": 27.052371978759766, "learning_rate": 2.3326530612244897e-05, "loss": 4.1773, "step": 2714 }, { "epoch": 1.1262, "grad_norm": 10.01918888092041, "learning_rate": 2.3316326530612247e-05, "loss": 1.4462, "step": 2715 }, { "epoch": 1.1264, "grad_norm": 2.8444294929504395, "learning_rate": 2.3306122448979592e-05, "loss": 0.5133, "step": 2716 }, { "epoch": 1.1266, "grad_norm": 3.0026142597198486, "learning_rate": 2.3295918367346938e-05, "loss": 0.286, "step": 2717 }, { "epoch": 1.1268, "grad_norm": 5.488697528839111, "learning_rate": 2.3285714285714287e-05, "loss": 0.8064, "step": 2718 }, { "epoch": 1.127, "grad_norm": 2.136697292327881, "learning_rate": 2.3275510204081633e-05, "loss": 0.1877, "step": 2719 }, { "epoch": 1.1272, "grad_norm": 3.03145432472229, "learning_rate": 2.326530612244898e-05, "loss": 0.3336, "step": 2720 }, { "epoch": 1.1274, "grad_norm": 1.7051206827163696, "learning_rate": 2.3255102040816328e-05, "loss": 0.22, "step": 2721 }, { "epoch": 1.1276, "grad_norm": 1.7277824878692627, "learning_rate": 2.3244897959183677e-05, "loss": 0.1903, "step": 2722 }, { "epoch": 1.1278, "grad_norm": 2.966791868209839, "learning_rate": 2.3234693877551023e-05, "loss": 0.5597, "step": 2723 }, { "epoch": 1.1280000000000001, "grad_norm": 1.6014727354049683, "learning_rate": 2.322448979591837e-05, "loss": 0.0624, "step": 2724 }, { "epoch": 1.1282, "grad_norm": 2.1757612228393555, "learning_rate": 2.3214285714285715e-05, "loss": 0.1369, "step": 2725 }, { "epoch": 1.1284, "grad_norm": 1.3579509258270264, "learning_rate": 2.3204081632653064e-05, "loss": 0.0576, "step": 2726 }, { "epoch": 1.1286, "grad_norm": 1.526038408279419, "learning_rate": 2.319387755102041e-05, "loss": 0.134, "step": 2727 }, { "epoch": 1.1288, "grad_norm": 1.9145957231521606, "learning_rate": 2.3183673469387755e-05, "loss": 0.2593, "step": 2728 }, { "epoch": 1.129, "grad_norm": 9.260457992553711, "learning_rate": 2.3173469387755105e-05, "loss": 1.0714, "step": 2729 }, { "epoch": 1.1292, "grad_norm": 29.887434005737305, "learning_rate": 2.316326530612245e-05, "loss": 3.3454, "step": 2730 }, { "epoch": 1.1294, "grad_norm": 13.774479866027832, "learning_rate": 2.3153061224489796e-05, "loss": 3.6583, "step": 2731 }, { "epoch": 1.1296, "grad_norm": 6.605763912200928, "learning_rate": 2.3142857142857145e-05, "loss": 0.8855, "step": 2732 }, { "epoch": 1.1298, "grad_norm": 4.741060733795166, "learning_rate": 2.313265306122449e-05, "loss": 0.7902, "step": 2733 }, { "epoch": 1.13, "grad_norm": 1.5933618545532227, "learning_rate": 2.3122448979591837e-05, "loss": 0.1623, "step": 2734 }, { "epoch": 1.1302, "grad_norm": 1.6804769039154053, "learning_rate": 2.3112244897959183e-05, "loss": 0.1537, "step": 2735 }, { "epoch": 1.1304, "grad_norm": 2.6978795528411865, "learning_rate": 2.3102040816326532e-05, "loss": 0.4404, "step": 2736 }, { "epoch": 1.1306, "grad_norm": 5.578824043273926, "learning_rate": 2.3091836734693878e-05, "loss": 2.2101, "step": 2737 }, { "epoch": 1.1308, "grad_norm": 5.527901649475098, "learning_rate": 2.3081632653061224e-05, "loss": 1.5192, "step": 2738 }, { "epoch": 1.131, "grad_norm": 3.8173131942749023, "learning_rate": 2.3071428571428573e-05, "loss": 0.7459, "step": 2739 }, { "epoch": 1.1312, "grad_norm": 5.779356479644775, "learning_rate": 2.306122448979592e-05, "loss": 0.1954, "step": 2740 }, { "epoch": 1.1314, "grad_norm": 7.4463701248168945, "learning_rate": 2.3051020408163264e-05, "loss": 1.1728, "step": 2741 }, { "epoch": 1.1316, "grad_norm": 5.37211275100708, "learning_rate": 2.3040816326530613e-05, "loss": 0.6996, "step": 2742 }, { "epoch": 1.1318, "grad_norm": 2.5998642444610596, "learning_rate": 2.3030612244897963e-05, "loss": 0.3492, "step": 2743 }, { "epoch": 1.1320000000000001, "grad_norm": 5.17749547958374, "learning_rate": 2.302040816326531e-05, "loss": 1.4707, "step": 2744 }, { "epoch": 1.1322, "grad_norm": 1.720008134841919, "learning_rate": 2.3010204081632654e-05, "loss": 0.1492, "step": 2745 }, { "epoch": 1.1324, "grad_norm": 4.889779567718506, "learning_rate": 2.3000000000000003e-05, "loss": 0.9116, "step": 2746 }, { "epoch": 1.1326, "grad_norm": 9.019208908081055, "learning_rate": 2.298979591836735e-05, "loss": 3.042, "step": 2747 }, { "epoch": 1.1328, "grad_norm": 3.348464250564575, "learning_rate": 2.2979591836734695e-05, "loss": 1.6472, "step": 2748 }, { "epoch": 1.133, "grad_norm": 4.644619941711426, "learning_rate": 2.2969387755102044e-05, "loss": 0.3318, "step": 2749 }, { "epoch": 1.1332, "grad_norm": 6.385184288024902, "learning_rate": 2.295918367346939e-05, "loss": 0.817, "step": 2750 }, { "epoch": 1.1334, "grad_norm": 3.095552682876587, "learning_rate": 2.2948979591836736e-05, "loss": 0.5922, "step": 2751 }, { "epoch": 1.1336, "grad_norm": 1.7094886302947998, "learning_rate": 2.293877551020408e-05, "loss": 0.2121, "step": 2752 }, { "epoch": 1.1338, "grad_norm": 3.2444093227386475, "learning_rate": 2.292857142857143e-05, "loss": 0.5553, "step": 2753 }, { "epoch": 1.134, "grad_norm": 1.6598012447357178, "learning_rate": 2.2918367346938777e-05, "loss": 0.0689, "step": 2754 }, { "epoch": 1.1342, "grad_norm": 1.6716395616531372, "learning_rate": 2.2908163265306122e-05, "loss": 0.1675, "step": 2755 }, { "epoch": 1.1344, "grad_norm": 1.6261208057403564, "learning_rate": 2.289795918367347e-05, "loss": 0.1817, "step": 2756 }, { "epoch": 1.1346, "grad_norm": 2.096707582473755, "learning_rate": 2.2887755102040817e-05, "loss": 0.1242, "step": 2757 }, { "epoch": 1.1348, "grad_norm": 5.705234527587891, "learning_rate": 2.2877551020408163e-05, "loss": 0.7205, "step": 2758 }, { "epoch": 1.135, "grad_norm": 2.8146116733551025, "learning_rate": 2.2867346938775512e-05, "loss": 0.3474, "step": 2759 }, { "epoch": 1.1352, "grad_norm": 3.8745510578155518, "learning_rate": 2.2857142857142858e-05, "loss": 0.3032, "step": 2760 }, { "epoch": 1.1354, "grad_norm": 1.6446934938430786, "learning_rate": 2.2846938775510204e-05, "loss": 0.1739, "step": 2761 }, { "epoch": 1.1356, "grad_norm": 1.8880884647369385, "learning_rate": 2.283673469387755e-05, "loss": 0.0904, "step": 2762 }, { "epoch": 1.1358, "grad_norm": 2.831763744354248, "learning_rate": 2.28265306122449e-05, "loss": 0.3695, "step": 2763 }, { "epoch": 1.1360000000000001, "grad_norm": 2.0684707164764404, "learning_rate": 2.2816326530612245e-05, "loss": 0.147, "step": 2764 }, { "epoch": 1.1362, "grad_norm": 20.502220153808594, "learning_rate": 2.280612244897959e-05, "loss": 1.2422, "step": 2765 }, { "epoch": 1.1364, "grad_norm": 5.634237289428711, "learning_rate": 2.279591836734694e-05, "loss": 0.7513, "step": 2766 }, { "epoch": 1.1366, "grad_norm": 1.6980818510055542, "learning_rate": 2.278571428571429e-05, "loss": 0.1942, "step": 2767 }, { "epoch": 1.1368, "grad_norm": 1.4760496616363525, "learning_rate": 2.2775510204081635e-05, "loss": 0.0931, "step": 2768 }, { "epoch": 1.137, "grad_norm": 2.79160475730896, "learning_rate": 2.276530612244898e-05, "loss": 0.5251, "step": 2769 }, { "epoch": 1.1372, "grad_norm": 2.9664740562438965, "learning_rate": 2.275510204081633e-05, "loss": 0.164, "step": 2770 }, { "epoch": 1.1374, "grad_norm": 1.9510027170181274, "learning_rate": 2.2744897959183675e-05, "loss": 0.219, "step": 2771 }, { "epoch": 1.1376, "grad_norm": 1.5220974683761597, "learning_rate": 2.273469387755102e-05, "loss": 0.1522, "step": 2772 }, { "epoch": 1.1378, "grad_norm": 0.9969020485877991, "learning_rate": 2.272448979591837e-05, "loss": 0.0287, "step": 2773 }, { "epoch": 1.138, "grad_norm": 1.9152190685272217, "learning_rate": 2.2714285714285716e-05, "loss": 0.2731, "step": 2774 }, { "epoch": 1.1381999999999999, "grad_norm": 1.6108945608139038, "learning_rate": 2.2704081632653062e-05, "loss": 0.1778, "step": 2775 }, { "epoch": 1.1384, "grad_norm": 2.6869864463806152, "learning_rate": 2.269387755102041e-05, "loss": 0.306, "step": 2776 }, { "epoch": 1.1386, "grad_norm": 5.675705909729004, "learning_rate": 2.2683673469387757e-05, "loss": 0.3161, "step": 2777 }, { "epoch": 1.1388, "grad_norm": 0.9622205495834351, "learning_rate": 2.2673469387755103e-05, "loss": 0.0312, "step": 2778 }, { "epoch": 1.139, "grad_norm": 1.7289884090423584, "learning_rate": 2.266326530612245e-05, "loss": 0.2262, "step": 2779 }, { "epoch": 1.1392, "grad_norm": 2.9984047412872314, "learning_rate": 2.2653061224489798e-05, "loss": 0.4141, "step": 2780 }, { "epoch": 1.1394, "grad_norm": 6.064243793487549, "learning_rate": 2.2642857142857143e-05, "loss": 0.4489, "step": 2781 }, { "epoch": 1.1396, "grad_norm": 6.7196364402771, "learning_rate": 2.263265306122449e-05, "loss": 0.786, "step": 2782 }, { "epoch": 1.1398, "grad_norm": 7.996720790863037, "learning_rate": 2.262244897959184e-05, "loss": 1.0503, "step": 2783 }, { "epoch": 1.1400000000000001, "grad_norm": 20.268320083618164, "learning_rate": 2.2612244897959184e-05, "loss": 3.538, "step": 2784 }, { "epoch": 1.1402, "grad_norm": 1.163985013961792, "learning_rate": 2.260204081632653e-05, "loss": 0.0461, "step": 2785 }, { "epoch": 1.1404, "grad_norm": 0.9983113408088684, "learning_rate": 2.259183673469388e-05, "loss": 0.0153, "step": 2786 }, { "epoch": 1.1406, "grad_norm": 1.6703228950500488, "learning_rate": 2.2581632653061225e-05, "loss": 0.1554, "step": 2787 }, { "epoch": 1.1408, "grad_norm": 1.7943878173828125, "learning_rate": 2.257142857142857e-05, "loss": 0.1643, "step": 2788 }, { "epoch": 1.141, "grad_norm": 1.7003551721572876, "learning_rate": 2.256122448979592e-05, "loss": 0.0332, "step": 2789 }, { "epoch": 1.1412, "grad_norm": 2.893758773803711, "learning_rate": 2.255102040816327e-05, "loss": 0.08, "step": 2790 }, { "epoch": 1.1414, "grad_norm": 10.646772384643555, "learning_rate": 2.2540816326530615e-05, "loss": 0.3478, "step": 2791 }, { "epoch": 1.1416, "grad_norm": 4.976737022399902, "learning_rate": 2.253061224489796e-05, "loss": 0.7227, "step": 2792 }, { "epoch": 1.1418, "grad_norm": 1.5799471139907837, "learning_rate": 2.2520408163265306e-05, "loss": 0.0398, "step": 2793 }, { "epoch": 1.142, "grad_norm": 4.041989803314209, "learning_rate": 2.2510204081632656e-05, "loss": 0.3589, "step": 2794 }, { "epoch": 1.1421999999999999, "grad_norm": 6.398179054260254, "learning_rate": 2.25e-05, "loss": 1.5622, "step": 2795 }, { "epoch": 1.1424, "grad_norm": 3.265444040298462, "learning_rate": 2.2489795918367347e-05, "loss": 0.3875, "step": 2796 }, { "epoch": 1.1426, "grad_norm": 1.83922278881073, "learning_rate": 2.2479591836734696e-05, "loss": 0.1368, "step": 2797 }, { "epoch": 1.1428, "grad_norm": 4.628585338592529, "learning_rate": 2.2469387755102042e-05, "loss": 0.7204, "step": 2798 }, { "epoch": 1.143, "grad_norm": 7.918242454528809, "learning_rate": 2.2459183673469388e-05, "loss": 0.4492, "step": 2799 }, { "epoch": 1.1432, "grad_norm": 1.9701615571975708, "learning_rate": 2.2448979591836737e-05, "loss": 0.2359, "step": 2800 }, { "epoch": 1.1434, "grad_norm": 5.108238220214844, "learning_rate": 2.2438775510204083e-05, "loss": 0.9741, "step": 2801 }, { "epoch": 1.1436, "grad_norm": 7.751604080200195, "learning_rate": 2.242857142857143e-05, "loss": 2.4672, "step": 2802 }, { "epoch": 1.1438, "grad_norm": 6.921794414520264, "learning_rate": 2.2418367346938775e-05, "loss": 0.8374, "step": 2803 }, { "epoch": 1.144, "grad_norm": 4.900245189666748, "learning_rate": 2.2408163265306124e-05, "loss": 0.6811, "step": 2804 }, { "epoch": 1.1442, "grad_norm": 2.111574172973633, "learning_rate": 2.239795918367347e-05, "loss": 0.2138, "step": 2805 }, { "epoch": 1.1444, "grad_norm": 2.1236510276794434, "learning_rate": 2.2387755102040815e-05, "loss": 0.527, "step": 2806 }, { "epoch": 1.1446, "grad_norm": 1.9605672359466553, "learning_rate": 2.2377551020408164e-05, "loss": 0.3985, "step": 2807 }, { "epoch": 1.1448, "grad_norm": 7.262815475463867, "learning_rate": 2.236734693877551e-05, "loss": 2.0073, "step": 2808 }, { "epoch": 1.145, "grad_norm": 10.495835304260254, "learning_rate": 2.2357142857142856e-05, "loss": 1.4147, "step": 2809 }, { "epoch": 1.1452, "grad_norm": 2.1620757579803467, "learning_rate": 2.2346938775510205e-05, "loss": 0.5296, "step": 2810 }, { "epoch": 1.1454, "grad_norm": 1.7605044841766357, "learning_rate": 2.2336734693877554e-05, "loss": 0.1249, "step": 2811 }, { "epoch": 1.1456, "grad_norm": 2.030941963195801, "learning_rate": 2.23265306122449e-05, "loss": 0.1877, "step": 2812 }, { "epoch": 1.1458, "grad_norm": 1.8107985258102417, "learning_rate": 2.2316326530612246e-05, "loss": 0.1866, "step": 2813 }, { "epoch": 1.146, "grad_norm": 2.911255121231079, "learning_rate": 2.2306122448979595e-05, "loss": 0.0739, "step": 2814 }, { "epoch": 1.1461999999999999, "grad_norm": 1.6085364818572998, "learning_rate": 2.229591836734694e-05, "loss": 0.0729, "step": 2815 }, { "epoch": 1.1464, "grad_norm": 7.5536699295043945, "learning_rate": 2.2285714285714287e-05, "loss": 0.4134, "step": 2816 }, { "epoch": 1.1466, "grad_norm": 24.68170928955078, "learning_rate": 2.2275510204081636e-05, "loss": 2.1009, "step": 2817 }, { "epoch": 1.1468, "grad_norm": 15.094805717468262, "learning_rate": 2.2265306122448982e-05, "loss": 1.6492, "step": 2818 }, { "epoch": 1.147, "grad_norm": 4.507059097290039, "learning_rate": 2.2255102040816328e-05, "loss": 0.3622, "step": 2819 }, { "epoch": 1.1472, "grad_norm": 2.311285972595215, "learning_rate": 2.2244897959183673e-05, "loss": 0.3124, "step": 2820 }, { "epoch": 1.1474, "grad_norm": 4.111201286315918, "learning_rate": 2.2234693877551022e-05, "loss": 0.9661, "step": 2821 }, { "epoch": 1.1476, "grad_norm": 2.7623486518859863, "learning_rate": 2.2224489795918368e-05, "loss": 0.5781, "step": 2822 }, { "epoch": 1.1478, "grad_norm": 2.8520424365997314, "learning_rate": 2.2214285714285714e-05, "loss": 0.3171, "step": 2823 }, { "epoch": 1.148, "grad_norm": 1.5777324438095093, "learning_rate": 2.2204081632653063e-05, "loss": 0.1292, "step": 2824 }, { "epoch": 1.1482, "grad_norm": 1.6323140859603882, "learning_rate": 2.219387755102041e-05, "loss": 0.1628, "step": 2825 }, { "epoch": 1.1484, "grad_norm": 0.8256015777587891, "learning_rate": 2.2183673469387755e-05, "loss": 0.0198, "step": 2826 }, { "epoch": 1.1486, "grad_norm": 1.7223306894302368, "learning_rate": 2.2173469387755104e-05, "loss": 0.1819, "step": 2827 }, { "epoch": 1.1488, "grad_norm": 2.447789430618286, "learning_rate": 2.216326530612245e-05, "loss": 0.3057, "step": 2828 }, { "epoch": 1.149, "grad_norm": 5.144397735595703, "learning_rate": 2.2153061224489796e-05, "loss": 0.7765, "step": 2829 }, { "epoch": 1.1492, "grad_norm": 4.504195690155029, "learning_rate": 2.214285714285714e-05, "loss": 0.251, "step": 2830 }, { "epoch": 1.1494, "grad_norm": 7.414604663848877, "learning_rate": 2.213265306122449e-05, "loss": 1.3508, "step": 2831 }, { "epoch": 1.1496, "grad_norm": 5.837427139282227, "learning_rate": 2.2122448979591836e-05, "loss": 0.196, "step": 2832 }, { "epoch": 1.1498, "grad_norm": 7.726154327392578, "learning_rate": 2.2112244897959182e-05, "loss": 0.3343, "step": 2833 }, { "epoch": 1.15, "grad_norm": 2.2711503505706787, "learning_rate": 2.210204081632653e-05, "loss": 0.1605, "step": 2834 }, { "epoch": 1.1502, "grad_norm": 2.346430778503418, "learning_rate": 2.209183673469388e-05, "loss": 0.2132, "step": 2835 }, { "epoch": 1.1504, "grad_norm": 4.680408477783203, "learning_rate": 2.2081632653061226e-05, "loss": 0.7817, "step": 2836 }, { "epoch": 1.1506, "grad_norm": 2.951364755630493, "learning_rate": 2.2071428571428572e-05, "loss": 0.5717, "step": 2837 }, { "epoch": 1.1508, "grad_norm": 1.5978069305419922, "learning_rate": 2.206122448979592e-05, "loss": 0.161, "step": 2838 }, { "epoch": 1.151, "grad_norm": 2.3276004791259766, "learning_rate": 2.2051020408163267e-05, "loss": 0.3314, "step": 2839 }, { "epoch": 1.1512, "grad_norm": 3.6032259464263916, "learning_rate": 2.2040816326530613e-05, "loss": 0.9733, "step": 2840 }, { "epoch": 1.1514, "grad_norm": 6.305304527282715, "learning_rate": 2.2030612244897962e-05, "loss": 0.6498, "step": 2841 }, { "epoch": 1.1516, "grad_norm": 2.920128345489502, "learning_rate": 2.2020408163265308e-05, "loss": 0.3438, "step": 2842 }, { "epoch": 1.1518, "grad_norm": 0.9799144268035889, "learning_rate": 2.2010204081632654e-05, "loss": 0.0364, "step": 2843 }, { "epoch": 1.152, "grad_norm": 1.5188324451446533, "learning_rate": 2.2000000000000003e-05, "loss": 0.1618, "step": 2844 }, { "epoch": 1.1522000000000001, "grad_norm": 3.6000258922576904, "learning_rate": 2.198979591836735e-05, "loss": 0.4784, "step": 2845 }, { "epoch": 1.1524, "grad_norm": 7.604703903198242, "learning_rate": 2.1979591836734694e-05, "loss": 2.1591, "step": 2846 }, { "epoch": 1.1526, "grad_norm": 6.9416728019714355, "learning_rate": 2.196938775510204e-05, "loss": 1.4784, "step": 2847 }, { "epoch": 1.1528, "grad_norm": 1.5903897285461426, "learning_rate": 2.195918367346939e-05, "loss": 0.0872, "step": 2848 }, { "epoch": 1.153, "grad_norm": 1.9738491773605347, "learning_rate": 2.1948979591836735e-05, "loss": 0.1707, "step": 2849 }, { "epoch": 1.1532, "grad_norm": 4.482814788818359, "learning_rate": 2.193877551020408e-05, "loss": 0.8933, "step": 2850 }, { "epoch": 1.1534, "grad_norm": 5.364544868469238, "learning_rate": 2.192857142857143e-05, "loss": 0.7052, "step": 2851 }, { "epoch": 1.1536, "grad_norm": 11.5966157913208, "learning_rate": 2.1918367346938776e-05, "loss": 1.5005, "step": 2852 }, { "epoch": 1.1538, "grad_norm": 34.515357971191406, "learning_rate": 2.1908163265306122e-05, "loss": 4.0731, "step": 2853 }, { "epoch": 1.154, "grad_norm": 11.817828178405762, "learning_rate": 2.189795918367347e-05, "loss": 1.3178, "step": 2854 }, { "epoch": 1.1542, "grad_norm": 1.158492922782898, "learning_rate": 2.1887755102040817e-05, "loss": 0.045, "step": 2855 }, { "epoch": 1.1544, "grad_norm": 2.2836451530456543, "learning_rate": 2.1877551020408162e-05, "loss": 0.1339, "step": 2856 }, { "epoch": 1.1546, "grad_norm": 4.321943759918213, "learning_rate": 2.186734693877551e-05, "loss": 0.6489, "step": 2857 }, { "epoch": 1.1548, "grad_norm": 1.2813799381256104, "learning_rate": 2.185714285714286e-05, "loss": 0.0529, "step": 2858 }, { "epoch": 1.155, "grad_norm": 1.838179588317871, "learning_rate": 2.1846938775510207e-05, "loss": 0.2322, "step": 2859 }, { "epoch": 1.1552, "grad_norm": 2.256678819656372, "learning_rate": 2.1836734693877552e-05, "loss": 0.3033, "step": 2860 }, { "epoch": 1.1554, "grad_norm": 3.089646100997925, "learning_rate": 2.1826530612244898e-05, "loss": 0.3594, "step": 2861 }, { "epoch": 1.1556, "grad_norm": 1.164261817932129, "learning_rate": 2.1816326530612247e-05, "loss": 0.0458, "step": 2862 }, { "epoch": 1.1558, "grad_norm": 1.6352413892745972, "learning_rate": 2.1806122448979593e-05, "loss": 0.1352, "step": 2863 }, { "epoch": 1.156, "grad_norm": 1.7097831964492798, "learning_rate": 2.179591836734694e-05, "loss": 0.2099, "step": 2864 }, { "epoch": 1.1562000000000001, "grad_norm": 1.9434983730316162, "learning_rate": 2.1785714285714288e-05, "loss": 0.2527, "step": 2865 }, { "epoch": 1.1564, "grad_norm": 2.9280757904052734, "learning_rate": 2.1775510204081634e-05, "loss": 0.5403, "step": 2866 }, { "epoch": 1.1566, "grad_norm": 2.071007490158081, "learning_rate": 2.176530612244898e-05, "loss": 0.3943, "step": 2867 }, { "epoch": 1.1568, "grad_norm": 3.7801718711853027, "learning_rate": 2.175510204081633e-05, "loss": 1.6641, "step": 2868 }, { "epoch": 1.157, "grad_norm": 5.573559284210205, "learning_rate": 2.1744897959183675e-05, "loss": 0.6697, "step": 2869 }, { "epoch": 1.1572, "grad_norm": 5.357125759124756, "learning_rate": 2.173469387755102e-05, "loss": 0.9319, "step": 2870 }, { "epoch": 1.1574, "grad_norm": 11.930858612060547, "learning_rate": 2.1724489795918366e-05, "loss": 3.4857, "step": 2871 }, { "epoch": 1.1576, "grad_norm": 21.174842834472656, "learning_rate": 2.1714285714285715e-05, "loss": 3.564, "step": 2872 }, { "epoch": 1.1578, "grad_norm": 6.427459716796875, "learning_rate": 2.170408163265306e-05, "loss": 1.6017, "step": 2873 }, { "epoch": 1.158, "grad_norm": 2.9862544536590576, "learning_rate": 2.1693877551020407e-05, "loss": 0.3427, "step": 2874 }, { "epoch": 1.1582, "grad_norm": 1.7005441188812256, "learning_rate": 2.1683673469387756e-05, "loss": 0.029, "step": 2875 }, { "epoch": 1.1584, "grad_norm": 1.1053560972213745, "learning_rate": 2.1673469387755102e-05, "loss": 0.0429, "step": 2876 }, { "epoch": 1.1586, "grad_norm": 2.360154628753662, "learning_rate": 2.1663265306122448e-05, "loss": 0.5475, "step": 2877 }, { "epoch": 1.1588, "grad_norm": 3.2414939403533936, "learning_rate": 2.1653061224489797e-05, "loss": 1.5426, "step": 2878 }, { "epoch": 1.159, "grad_norm": 2.4206106662750244, "learning_rate": 2.1642857142857146e-05, "loss": 0.1732, "step": 2879 }, { "epoch": 1.1592, "grad_norm": 4.766385555267334, "learning_rate": 2.1632653061224492e-05, "loss": 0.6206, "step": 2880 }, { "epoch": 1.1594, "grad_norm": 1.7571815252304077, "learning_rate": 2.1622448979591838e-05, "loss": 0.1629, "step": 2881 }, { "epoch": 1.1596, "grad_norm": 2.1201984882354736, "learning_rate": 2.1612244897959187e-05, "loss": 0.2275, "step": 2882 }, { "epoch": 1.1598, "grad_norm": 1.9399231672286987, "learning_rate": 2.1602040816326533e-05, "loss": 0.1805, "step": 2883 }, { "epoch": 1.16, "grad_norm": 1.9238301515579224, "learning_rate": 2.159183673469388e-05, "loss": 0.4092, "step": 2884 }, { "epoch": 1.1602000000000001, "grad_norm": 3.6335549354553223, "learning_rate": 2.1581632653061228e-05, "loss": 1.6746, "step": 2885 }, { "epoch": 1.1604, "grad_norm": 1.0903916358947754, "learning_rate": 2.1571428571428574e-05, "loss": 0.0294, "step": 2886 }, { "epoch": 1.1606, "grad_norm": 1.3453727960586548, "learning_rate": 2.156122448979592e-05, "loss": 0.027, "step": 2887 }, { "epoch": 1.1608, "grad_norm": 1.4440014362335205, "learning_rate": 2.1551020408163265e-05, "loss": 0.1507, "step": 2888 }, { "epoch": 1.161, "grad_norm": 1.1431560516357422, "learning_rate": 2.1540816326530614e-05, "loss": 0.0303, "step": 2889 }, { "epoch": 1.1612, "grad_norm": 1.644680142402649, "learning_rate": 2.153061224489796e-05, "loss": 0.1439, "step": 2890 }, { "epoch": 1.1614, "grad_norm": 2.0987346172332764, "learning_rate": 2.1520408163265306e-05, "loss": 0.5029, "step": 2891 }, { "epoch": 1.1616, "grad_norm": 4.1892409324646, "learning_rate": 2.1510204081632655e-05, "loss": 1.8529, "step": 2892 }, { "epoch": 1.1618, "grad_norm": 7.047264575958252, "learning_rate": 2.15e-05, "loss": 1.5588, "step": 2893 }, { "epoch": 1.162, "grad_norm": 4.856655597686768, "learning_rate": 2.1489795918367347e-05, "loss": 0.6655, "step": 2894 }, { "epoch": 1.1622, "grad_norm": 3.8010571002960205, "learning_rate": 2.1479591836734696e-05, "loss": 2.1848, "step": 2895 }, { "epoch": 1.1623999999999999, "grad_norm": 3.943370819091797, "learning_rate": 2.146938775510204e-05, "loss": 2.3074, "step": 2896 }, { "epoch": 1.1626, "grad_norm": 8.42847728729248, "learning_rate": 2.1459183673469387e-05, "loss": 2.3577, "step": 2897 }, { "epoch": 1.1628, "grad_norm": 15.801399230957031, "learning_rate": 2.1448979591836733e-05, "loss": 3.7016, "step": 2898 }, { "epoch": 1.163, "grad_norm": 6.718034267425537, "learning_rate": 2.1438775510204082e-05, "loss": 0.6954, "step": 2899 }, { "epoch": 1.1632, "grad_norm": 1.4650180339813232, "learning_rate": 2.1428571428571428e-05, "loss": 0.0764, "step": 2900 }, { "epoch": 1.1634, "grad_norm": 2.4374194145202637, "learning_rate": 2.1418367346938774e-05, "loss": 0.3817, "step": 2901 }, { "epoch": 1.1636, "grad_norm": 3.6774742603302, "learning_rate": 2.1408163265306127e-05, "loss": 0.9914, "step": 2902 }, { "epoch": 1.1638, "grad_norm": 4.494981288909912, "learning_rate": 2.1397959183673472e-05, "loss": 1.9728, "step": 2903 }, { "epoch": 1.164, "grad_norm": 6.808770179748535, "learning_rate": 2.1387755102040818e-05, "loss": 2.264, "step": 2904 }, { "epoch": 1.1642000000000001, "grad_norm": 10.066740036010742, "learning_rate": 2.1377551020408164e-05, "loss": 2.7497, "step": 2905 }, { "epoch": 1.1644, "grad_norm": 6.652487277984619, "learning_rate": 2.1367346938775513e-05, "loss": 2.5345, "step": 2906 }, { "epoch": 1.1646, "grad_norm": 7.7479472160339355, "learning_rate": 2.135714285714286e-05, "loss": 2.5017, "step": 2907 }, { "epoch": 1.1648, "grad_norm": 2.5272955894470215, "learning_rate": 2.1346938775510205e-05, "loss": 0.1819, "step": 2908 }, { "epoch": 1.165, "grad_norm": 5.05612325668335, "learning_rate": 2.1336734693877554e-05, "loss": 0.6378, "step": 2909 }, { "epoch": 1.1652, "grad_norm": 1.5177295207977295, "learning_rate": 2.13265306122449e-05, "loss": 0.1516, "step": 2910 }, { "epoch": 1.1654, "grad_norm": 1.9821985960006714, "learning_rate": 2.1316326530612245e-05, "loss": 0.4778, "step": 2911 }, { "epoch": 1.1656, "grad_norm": 1.3456918001174927, "learning_rate": 2.1306122448979595e-05, "loss": 0.06, "step": 2912 }, { "epoch": 1.1658, "grad_norm": 10.177454948425293, "learning_rate": 2.129591836734694e-05, "loss": 0.9334, "step": 2913 }, { "epoch": 1.166, "grad_norm": 22.78434181213379, "learning_rate": 2.1285714285714286e-05, "loss": 3.5745, "step": 2914 }, { "epoch": 1.1662, "grad_norm": 1.3361148834228516, "learning_rate": 2.1275510204081632e-05, "loss": 0.0313, "step": 2915 }, { "epoch": 1.1663999999999999, "grad_norm": 1.5880099534988403, "learning_rate": 2.126530612244898e-05, "loss": 0.1572, "step": 2916 }, { "epoch": 1.1666, "grad_norm": 1.2747163772583008, "learning_rate": 2.1255102040816327e-05, "loss": 0.0516, "step": 2917 }, { "epoch": 1.1668, "grad_norm": 2.1888797283172607, "learning_rate": 2.1244897959183673e-05, "loss": 0.2343, "step": 2918 }, { "epoch": 1.167, "grad_norm": 3.0382328033447266, "learning_rate": 2.1234693877551022e-05, "loss": 0.5813, "step": 2919 }, { "epoch": 1.1672, "grad_norm": 1.4096482992172241, "learning_rate": 2.1224489795918368e-05, "loss": 0.0571, "step": 2920 }, { "epoch": 1.1674, "grad_norm": 1.185367226600647, "learning_rate": 2.1214285714285713e-05, "loss": 0.0407, "step": 2921 }, { "epoch": 1.1676, "grad_norm": 1.6025385856628418, "learning_rate": 2.1204081632653063e-05, "loss": 0.1827, "step": 2922 }, { "epoch": 1.1678, "grad_norm": 1.0219346284866333, "learning_rate": 2.119387755102041e-05, "loss": 0.0378, "step": 2923 }, { "epoch": 1.168, "grad_norm": 1.3154290914535522, "learning_rate": 2.1183673469387754e-05, "loss": 0.1056, "step": 2924 }, { "epoch": 1.1682, "grad_norm": 1.2452430725097656, "learning_rate": 2.1173469387755103e-05, "loss": 0.1115, "step": 2925 }, { "epoch": 1.1684, "grad_norm": 1.9842383861541748, "learning_rate": 2.1163265306122453e-05, "loss": 0.2558, "step": 2926 }, { "epoch": 1.1686, "grad_norm": 3.636225938796997, "learning_rate": 2.11530612244898e-05, "loss": 1.0264, "step": 2927 }, { "epoch": 1.1688, "grad_norm": 3.466745138168335, "learning_rate": 2.1142857142857144e-05, "loss": 1.7151, "step": 2928 }, { "epoch": 1.169, "grad_norm": 2.697654962539673, "learning_rate": 2.113265306122449e-05, "loss": 0.6576, "step": 2929 }, { "epoch": 1.1692, "grad_norm": 3.2328388690948486, "learning_rate": 2.112244897959184e-05, "loss": 0.459, "step": 2930 }, { "epoch": 1.1694, "grad_norm": 1.6146492958068848, "learning_rate": 2.1112244897959185e-05, "loss": 0.1081, "step": 2931 }, { "epoch": 1.1696, "grad_norm": 2.285536766052246, "learning_rate": 2.110204081632653e-05, "loss": 0.2701, "step": 2932 }, { "epoch": 1.1698, "grad_norm": 6.2433905601501465, "learning_rate": 2.109183673469388e-05, "loss": 0.8872, "step": 2933 }, { "epoch": 1.17, "grad_norm": 8.182297706604004, "learning_rate": 2.1081632653061226e-05, "loss": 0.8029, "step": 2934 }, { "epoch": 1.1702, "grad_norm": 2.641026020050049, "learning_rate": 2.107142857142857e-05, "loss": 0.1031, "step": 2935 }, { "epoch": 1.1703999999999999, "grad_norm": 3.0824596881866455, "learning_rate": 2.106122448979592e-05, "loss": 0.4613, "step": 2936 }, { "epoch": 1.1706, "grad_norm": 4.887169361114502, "learning_rate": 2.1051020408163266e-05, "loss": 1.4961, "step": 2937 }, { "epoch": 1.1708, "grad_norm": 3.300015449523926, "learning_rate": 2.1040816326530612e-05, "loss": 0.3345, "step": 2938 }, { "epoch": 1.171, "grad_norm": 7.756126403808594, "learning_rate": 2.103061224489796e-05, "loss": 1.3592, "step": 2939 }, { "epoch": 1.1712, "grad_norm": 1.2789355516433716, "learning_rate": 2.1020408163265307e-05, "loss": 0.033, "step": 2940 }, { "epoch": 1.1714, "grad_norm": 2.148930311203003, "learning_rate": 2.1010204081632653e-05, "loss": 0.1399, "step": 2941 }, { "epoch": 1.1716, "grad_norm": 5.470130443572998, "learning_rate": 2.1e-05, "loss": 0.8179, "step": 2942 }, { "epoch": 1.1718, "grad_norm": 2.832042932510376, "learning_rate": 2.0989795918367348e-05, "loss": 0.3213, "step": 2943 }, { "epoch": 1.172, "grad_norm": 1.6176263093948364, "learning_rate": 2.0979591836734694e-05, "loss": 0.08, "step": 2944 }, { "epoch": 1.1722, "grad_norm": 8.061407089233398, "learning_rate": 2.096938775510204e-05, "loss": 0.4205, "step": 2945 }, { "epoch": 1.1724, "grad_norm": 21.434839248657227, "learning_rate": 2.095918367346939e-05, "loss": 2.1196, "step": 2946 }, { "epoch": 1.1726, "grad_norm": 5.325974941253662, "learning_rate": 2.0948979591836738e-05, "loss": 1.4808, "step": 2947 }, { "epoch": 1.1728, "grad_norm": 6.673322677612305, "learning_rate": 2.0938775510204084e-05, "loss": 0.9208, "step": 2948 }, { "epoch": 1.173, "grad_norm": 22.24078941345215, "learning_rate": 2.092857142857143e-05, "loss": 2.9982, "step": 2949 }, { "epoch": 1.1732, "grad_norm": 2.5346529483795166, "learning_rate": 2.091836734693878e-05, "loss": 0.6132, "step": 2950 }, { "epoch": 1.1734, "grad_norm": 6.196047782897949, "learning_rate": 2.0908163265306125e-05, "loss": 1.766, "step": 2951 }, { "epoch": 1.1736, "grad_norm": 9.942660331726074, "learning_rate": 2.089795918367347e-05, "loss": 1.4465, "step": 2952 }, { "epoch": 1.1738, "grad_norm": 2.346346139907837, "learning_rate": 2.088775510204082e-05, "loss": 0.2552, "step": 2953 }, { "epoch": 1.174, "grad_norm": 4.952291011810303, "learning_rate": 2.0877551020408165e-05, "loss": 1.0868, "step": 2954 }, { "epoch": 1.1742, "grad_norm": 3.304058790206909, "learning_rate": 2.086734693877551e-05, "loss": 1.5127, "step": 2955 }, { "epoch": 1.1743999999999999, "grad_norm": 1.2883349657058716, "learning_rate": 2.0857142857142857e-05, "loss": 0.0566, "step": 2956 }, { "epoch": 1.1746, "grad_norm": 3.0064966678619385, "learning_rate": 2.0846938775510206e-05, "loss": 0.1192, "step": 2957 }, { "epoch": 1.1748, "grad_norm": 5.3768768310546875, "learning_rate": 2.0836734693877552e-05, "loss": 0.3003, "step": 2958 }, { "epoch": 1.175, "grad_norm": 1.6847586631774902, "learning_rate": 2.0826530612244898e-05, "loss": 0.1338, "step": 2959 }, { "epoch": 1.1752, "grad_norm": 1.9989336729049683, "learning_rate": 2.0816326530612247e-05, "loss": 0.4576, "step": 2960 }, { "epoch": 1.1754, "grad_norm": 3.390285015106201, "learning_rate": 2.0806122448979593e-05, "loss": 1.6451, "step": 2961 }, { "epoch": 1.1756, "grad_norm": 1.4990715980529785, "learning_rate": 2.079591836734694e-05, "loss": 0.1426, "step": 2962 }, { "epoch": 1.1758, "grad_norm": 1.853268027305603, "learning_rate": 2.0785714285714288e-05, "loss": 0.2308, "step": 2963 }, { "epoch": 1.176, "grad_norm": 1.5178651809692383, "learning_rate": 2.0775510204081633e-05, "loss": 0.1052, "step": 2964 }, { "epoch": 1.1762, "grad_norm": 1.68022882938385, "learning_rate": 2.076530612244898e-05, "loss": 0.1792, "step": 2965 }, { "epoch": 1.1764000000000001, "grad_norm": 1.5579159259796143, "learning_rate": 2.0755102040816325e-05, "loss": 0.0838, "step": 2966 }, { "epoch": 1.1766, "grad_norm": 4.088912010192871, "learning_rate": 2.0744897959183674e-05, "loss": 0.5077, "step": 2967 }, { "epoch": 1.1768, "grad_norm": 7.16737699508667, "learning_rate": 2.073469387755102e-05, "loss": 1.4126, "step": 2968 }, { "epoch": 1.177, "grad_norm": 5.091823101043701, "learning_rate": 2.0724489795918366e-05, "loss": 0.2374, "step": 2969 }, { "epoch": 1.1772, "grad_norm": 6.514739990234375, "learning_rate": 2.0714285714285718e-05, "loss": 0.3321, "step": 2970 }, { "epoch": 1.1774, "grad_norm": 10.223052978515625, "learning_rate": 2.0704081632653064e-05, "loss": 0.7807, "step": 2971 }, { "epoch": 1.1776, "grad_norm": 20.88498878479004, "learning_rate": 2.069387755102041e-05, "loss": 2.1198, "step": 2972 }, { "epoch": 1.1778, "grad_norm": 8.11330795288086, "learning_rate": 2.0683673469387756e-05, "loss": 0.7407, "step": 2973 }, { "epoch": 1.178, "grad_norm": 7.516994476318359, "learning_rate": 2.0673469387755105e-05, "loss": 0.9794, "step": 2974 }, { "epoch": 1.1782, "grad_norm": 16.233720779418945, "learning_rate": 2.066326530612245e-05, "loss": 3.877, "step": 2975 }, { "epoch": 1.1784, "grad_norm": 3.6696817874908447, "learning_rate": 2.0653061224489796e-05, "loss": 1.5323, "step": 2976 }, { "epoch": 1.1786, "grad_norm": 2.3854358196258545, "learning_rate": 2.0642857142857146e-05, "loss": 0.242, "step": 2977 }, { "epoch": 1.1788, "grad_norm": 4.538883686065674, "learning_rate": 2.063265306122449e-05, "loss": 0.781, "step": 2978 }, { "epoch": 1.179, "grad_norm": 2.7626986503601074, "learning_rate": 2.0622448979591837e-05, "loss": 0.3337, "step": 2979 }, { "epoch": 1.1792, "grad_norm": 1.7479041814804077, "learning_rate": 2.0612244897959186e-05, "loss": 0.1616, "step": 2980 }, { "epoch": 1.1794, "grad_norm": 1.5093389749526978, "learning_rate": 2.0602040816326532e-05, "loss": 0.145, "step": 2981 }, { "epoch": 1.1796, "grad_norm": 2.2258784770965576, "learning_rate": 2.0591836734693878e-05, "loss": 0.1724, "step": 2982 }, { "epoch": 1.1798, "grad_norm": 4.365813255310059, "learning_rate": 2.0581632653061224e-05, "loss": 0.3035, "step": 2983 }, { "epoch": 1.18, "grad_norm": 1.0965051651000977, "learning_rate": 2.0571428571428573e-05, "loss": 0.0122, "step": 2984 }, { "epoch": 1.1802, "grad_norm": 1.604235291481018, "learning_rate": 2.056122448979592e-05, "loss": 0.1682, "step": 2985 }, { "epoch": 1.1804000000000001, "grad_norm": 2.189631223678589, "learning_rate": 2.0551020408163265e-05, "loss": 0.4954, "step": 2986 }, { "epoch": 1.1806, "grad_norm": 1.478208065032959, "learning_rate": 2.0540816326530614e-05, "loss": 0.097, "step": 2987 }, { "epoch": 1.1808, "grad_norm": 1.4729284048080444, "learning_rate": 2.053061224489796e-05, "loss": 0.088, "step": 2988 }, { "epoch": 1.181, "grad_norm": 1.6487613916397095, "learning_rate": 2.0520408163265305e-05, "loss": 0.116, "step": 2989 }, { "epoch": 1.1812, "grad_norm": 3.5400404930114746, "learning_rate": 2.0510204081632654e-05, "loss": 0.629, "step": 2990 }, { "epoch": 1.1814, "grad_norm": 6.919580459594727, "learning_rate": 2.05e-05, "loss": 0.7759, "step": 2991 }, { "epoch": 1.1816, "grad_norm": 2.9006593227386475, "learning_rate": 2.0489795918367346e-05, "loss": 0.2965, "step": 2992 }, { "epoch": 1.1818, "grad_norm": 11.443617820739746, "learning_rate": 2.0479591836734695e-05, "loss": 2.0913, "step": 2993 }, { "epoch": 1.182, "grad_norm": 16.701017379760742, "learning_rate": 2.0469387755102044e-05, "loss": 2.2104, "step": 2994 }, { "epoch": 1.1822, "grad_norm": 5.281843185424805, "learning_rate": 2.045918367346939e-05, "loss": 0.8237, "step": 2995 }, { "epoch": 1.1824, "grad_norm": 4.46371603012085, "learning_rate": 2.0448979591836736e-05, "loss": 0.6812, "step": 2996 }, { "epoch": 1.1826, "grad_norm": 1.2886015176773071, "learning_rate": 2.0438775510204082e-05, "loss": 0.0697, "step": 2997 }, { "epoch": 1.1828, "grad_norm": 3.352893114089966, "learning_rate": 2.042857142857143e-05, "loss": 0.3906, "step": 2998 }, { "epoch": 1.183, "grad_norm": 5.506088733673096, "learning_rate": 2.0418367346938777e-05, "loss": 1.8157, "step": 2999 }, { "epoch": 1.1832, "grad_norm": 3.538123846054077, "learning_rate": 2.0408163265306123e-05, "loss": 1.6393, "step": 3000 }, { "epoch": 1.1834, "grad_norm": 0.9296956062316895, "learning_rate": 2.0397959183673472e-05, "loss": 0.0733, "step": 3001 }, { "epoch": 1.1836, "grad_norm": 1.3598219156265259, "learning_rate": 2.0387755102040817e-05, "loss": 0.0486, "step": 3002 }, { "epoch": 1.1838, "grad_norm": 2.0247433185577393, "learning_rate": 2.0377551020408163e-05, "loss": 0.1865, "step": 3003 }, { "epoch": 1.184, "grad_norm": 5.148133277893066, "learning_rate": 2.0367346938775512e-05, "loss": 0.8735, "step": 3004 }, { "epoch": 1.1842, "grad_norm": 3.1829111576080322, "learning_rate": 2.0357142857142858e-05, "loss": 0.8626, "step": 3005 }, { "epoch": 1.1844000000000001, "grad_norm": 3.3566997051239014, "learning_rate": 2.0346938775510204e-05, "loss": 1.5234, "step": 3006 }, { "epoch": 1.1846, "grad_norm": 1.5200315713882446, "learning_rate": 2.0336734693877553e-05, "loss": 0.1196, "step": 3007 }, { "epoch": 1.1848, "grad_norm": 8.360516548156738, "learning_rate": 2.03265306122449e-05, "loss": 1.1216, "step": 3008 }, { "epoch": 1.185, "grad_norm": 16.254709243774414, "learning_rate": 2.0316326530612245e-05, "loss": 3.6941, "step": 3009 }, { "epoch": 1.1852, "grad_norm": 2.454667091369629, "learning_rate": 2.030612244897959e-05, "loss": 0.1772, "step": 3010 }, { "epoch": 1.1854, "grad_norm": 1.7685350179672241, "learning_rate": 2.029591836734694e-05, "loss": 0.2342, "step": 3011 }, { "epoch": 1.1856, "grad_norm": 4.389267444610596, "learning_rate": 2.0285714285714286e-05, "loss": 0.7286, "step": 3012 }, { "epoch": 1.1858, "grad_norm": 13.457568168640137, "learning_rate": 2.027551020408163e-05, "loss": 3.6436, "step": 3013 }, { "epoch": 1.186, "grad_norm": 14.3905611038208, "learning_rate": 2.026530612244898e-05, "loss": 3.3189, "step": 3014 }, { "epoch": 1.1862, "grad_norm": 7.432793140411377, "learning_rate": 2.0255102040816326e-05, "loss": 0.8565, "step": 3015 }, { "epoch": 1.1864, "grad_norm": 2.9249179363250732, "learning_rate": 2.0244897959183676e-05, "loss": 0.5177, "step": 3016 }, { "epoch": 1.1865999999999999, "grad_norm": 2.024263620376587, "learning_rate": 2.023469387755102e-05, "loss": 0.4307, "step": 3017 }, { "epoch": 1.1868, "grad_norm": 3.8027567863464355, "learning_rate": 2.022448979591837e-05, "loss": 1.5573, "step": 3018 }, { "epoch": 1.187, "grad_norm": 3.958740234375, "learning_rate": 2.0214285714285716e-05, "loss": 0.6836, "step": 3019 }, { "epoch": 1.1872, "grad_norm": 1.5115995407104492, "learning_rate": 2.0204081632653062e-05, "loss": 0.1058, "step": 3020 }, { "epoch": 1.1874, "grad_norm": 6.807767868041992, "learning_rate": 2.019387755102041e-05, "loss": 1.069, "step": 3021 }, { "epoch": 1.1876, "grad_norm": 10.496637344360352, "learning_rate": 2.0183673469387757e-05, "loss": 3.1418, "step": 3022 }, { "epoch": 1.1878, "grad_norm": 3.6764719486236572, "learning_rate": 2.0173469387755103e-05, "loss": 0.6877, "step": 3023 }, { "epoch": 1.188, "grad_norm": 4.998138904571533, "learning_rate": 2.016326530612245e-05, "loss": 0.7316, "step": 3024 }, { "epoch": 1.1882, "grad_norm": 3.1024022102355957, "learning_rate": 2.0153061224489798e-05, "loss": 0.5964, "step": 3025 }, { "epoch": 1.1884000000000001, "grad_norm": 5.601747512817383, "learning_rate": 2.0142857142857144e-05, "loss": 0.4643, "step": 3026 }, { "epoch": 1.1886, "grad_norm": 15.740729331970215, "learning_rate": 2.013265306122449e-05, "loss": 0.979, "step": 3027 }, { "epoch": 1.1888, "grad_norm": 6.91904878616333, "learning_rate": 2.012244897959184e-05, "loss": 0.9966, "step": 3028 }, { "epoch": 1.189, "grad_norm": 5.038677215576172, "learning_rate": 2.0112244897959184e-05, "loss": 1.5011, "step": 3029 }, { "epoch": 1.1892, "grad_norm": 1.3942502737045288, "learning_rate": 2.010204081632653e-05, "loss": 0.1064, "step": 3030 }, { "epoch": 1.1894, "grad_norm": 3.739537000656128, "learning_rate": 2.009183673469388e-05, "loss": 0.3063, "step": 3031 }, { "epoch": 1.1896, "grad_norm": 4.991539478302002, "learning_rate": 2.0081632653061225e-05, "loss": 0.6707, "step": 3032 }, { "epoch": 1.1898, "grad_norm": 1.6191712617874146, "learning_rate": 2.007142857142857e-05, "loss": 0.1287, "step": 3033 }, { "epoch": 1.19, "grad_norm": 3.295368194580078, "learning_rate": 2.0061224489795917e-05, "loss": 0.125, "step": 3034 }, { "epoch": 1.1902, "grad_norm": 8.2177734375, "learning_rate": 2.0051020408163266e-05, "loss": 0.7098, "step": 3035 }, { "epoch": 1.1904, "grad_norm": 10.9459228515625, "learning_rate": 2.004081632653061e-05, "loss": 2.0341, "step": 3036 }, { "epoch": 1.1905999999999999, "grad_norm": 10.621405601501465, "learning_rate": 2.0030612244897957e-05, "loss": 1.8083, "step": 3037 }, { "epoch": 1.1908, "grad_norm": 6.680018901824951, "learning_rate": 2.002040816326531e-05, "loss": 1.4806, "step": 3038 }, { "epoch": 1.191, "grad_norm": 1.1898123025894165, "learning_rate": 2.0010204081632656e-05, "loss": 0.1244, "step": 3039 }, { "epoch": 1.1912, "grad_norm": 1.4971431493759155, "learning_rate": 2e-05, "loss": 0.1304, "step": 3040 }, { "epoch": 1.1914, "grad_norm": 1.1757605075836182, "learning_rate": 1.9989795918367347e-05, "loss": 0.0622, "step": 3041 }, { "epoch": 1.1916, "grad_norm": 1.6284112930297852, "learning_rate": 1.9979591836734697e-05, "loss": 0.0736, "step": 3042 }, { "epoch": 1.1918, "grad_norm": 1.1026115417480469, "learning_rate": 1.9969387755102042e-05, "loss": 0.042, "step": 3043 }, { "epoch": 1.192, "grad_norm": 1.9990594387054443, "learning_rate": 1.9959183673469388e-05, "loss": 0.1913, "step": 3044 }, { "epoch": 1.1922, "grad_norm": 3.820249319076538, "learning_rate": 1.9948979591836737e-05, "loss": 0.6869, "step": 3045 }, { "epoch": 1.1924, "grad_norm": 1.741510033607483, "learning_rate": 1.9938775510204083e-05, "loss": 0.0858, "step": 3046 }, { "epoch": 1.1926, "grad_norm": 3.434689998626709, "learning_rate": 1.992857142857143e-05, "loss": 0.3755, "step": 3047 }, { "epoch": 1.1928, "grad_norm": 5.762261390686035, "learning_rate": 1.9918367346938778e-05, "loss": 1.5206, "step": 3048 }, { "epoch": 1.193, "grad_norm": 8.78773021697998, "learning_rate": 1.9908163265306124e-05, "loss": 0.86, "step": 3049 }, { "epoch": 1.1932, "grad_norm": 7.344252109527588, "learning_rate": 1.989795918367347e-05, "loss": 0.3056, "step": 3050 }, { "epoch": 1.1934, "grad_norm": 1.1593364477157593, "learning_rate": 1.9887755102040816e-05, "loss": 0.0734, "step": 3051 }, { "epoch": 1.1936, "grad_norm": 1.4719830751419067, "learning_rate": 1.9877551020408165e-05, "loss": 0.0712, "step": 3052 }, { "epoch": 1.1938, "grad_norm": 2.024540662765503, "learning_rate": 1.986734693877551e-05, "loss": 0.4275, "step": 3053 }, { "epoch": 1.194, "grad_norm": 4.2963056564331055, "learning_rate": 1.9857142857142856e-05, "loss": 1.6131, "step": 3054 }, { "epoch": 1.1942, "grad_norm": 8.733445167541504, "learning_rate": 1.9846938775510205e-05, "loss": 0.4647, "step": 3055 }, { "epoch": 1.1944, "grad_norm": 3.532928466796875, "learning_rate": 1.983673469387755e-05, "loss": 0.7477, "step": 3056 }, { "epoch": 1.1945999999999999, "grad_norm": 1.657246470451355, "learning_rate": 1.9826530612244897e-05, "loss": 0.1621, "step": 3057 }, { "epoch": 1.1948, "grad_norm": 2.976757049560547, "learning_rate": 1.9816326530612246e-05, "loss": 0.6082, "step": 3058 }, { "epoch": 1.195, "grad_norm": 1.20772385597229, "learning_rate": 1.9806122448979592e-05, "loss": 0.1211, "step": 3059 }, { "epoch": 1.1952, "grad_norm": 2.3030781745910645, "learning_rate": 1.9795918367346938e-05, "loss": 0.4913, "step": 3060 }, { "epoch": 1.1954, "grad_norm": 1.7301090955734253, "learning_rate": 1.9785714285714287e-05, "loss": 0.1146, "step": 3061 }, { "epoch": 1.1956, "grad_norm": 1.2910375595092773, "learning_rate": 1.9775510204081636e-05, "loss": 0.0515, "step": 3062 }, { "epoch": 1.1958, "grad_norm": 1.4276667833328247, "learning_rate": 1.9765306122448982e-05, "loss": 0.0557, "step": 3063 }, { "epoch": 1.196, "grad_norm": 5.736569881439209, "learning_rate": 1.9755102040816328e-05, "loss": 0.1938, "step": 3064 }, { "epoch": 1.1962, "grad_norm": 12.777549743652344, "learning_rate": 1.9744897959183677e-05, "loss": 1.8733, "step": 3065 }, { "epoch": 1.1964, "grad_norm": 17.816858291625977, "learning_rate": 1.9734693877551023e-05, "loss": 3.5921, "step": 3066 }, { "epoch": 1.1966, "grad_norm": 1.7031697034835815, "learning_rate": 1.972448979591837e-05, "loss": 0.1164, "step": 3067 }, { "epoch": 1.1968, "grad_norm": 4.166122913360596, "learning_rate": 1.9714285714285714e-05, "loss": 0.1969, "step": 3068 }, { "epoch": 1.197, "grad_norm": 9.597853660583496, "learning_rate": 1.9704081632653063e-05, "loss": 0.8492, "step": 3069 }, { "epoch": 1.1972, "grad_norm": 1.3580381870269775, "learning_rate": 1.969387755102041e-05, "loss": 0.0847, "step": 3070 }, { "epoch": 1.1974, "grad_norm": 6.079249858856201, "learning_rate": 1.9683673469387755e-05, "loss": 0.7113, "step": 3071 }, { "epoch": 1.1976, "grad_norm": 9.949956893920898, "learning_rate": 1.9673469387755104e-05, "loss": 1.831, "step": 3072 }, { "epoch": 1.1978, "grad_norm": 3.1551754474639893, "learning_rate": 1.966326530612245e-05, "loss": 0.3292, "step": 3073 }, { "epoch": 1.198, "grad_norm": 1.6016051769256592, "learning_rate": 1.9653061224489796e-05, "loss": 0.0865, "step": 3074 }, { "epoch": 1.1982, "grad_norm": 1.842907428741455, "learning_rate": 1.9642857142857145e-05, "loss": 0.0748, "step": 3075 }, { "epoch": 1.1984, "grad_norm": 1.17067289352417, "learning_rate": 1.963265306122449e-05, "loss": 0.0344, "step": 3076 }, { "epoch": 1.1985999999999999, "grad_norm": 2.7815639972686768, "learning_rate": 1.9622448979591837e-05, "loss": 0.2074, "step": 3077 }, { "epoch": 1.1988, "grad_norm": 4.956741809844971, "learning_rate": 1.9612244897959182e-05, "loss": 0.6889, "step": 3078 }, { "epoch": 1.199, "grad_norm": 2.6895835399627686, "learning_rate": 1.960204081632653e-05, "loss": 0.1869, "step": 3079 }, { "epoch": 1.1992, "grad_norm": 4.918212890625, "learning_rate": 1.9591836734693877e-05, "loss": 0.2545, "step": 3080 }, { "epoch": 1.1994, "grad_norm": 1.636683702468872, "learning_rate": 1.9581632653061223e-05, "loss": 0.2151, "step": 3081 }, { "epoch": 1.1996, "grad_norm": 4.025855541229248, "learning_rate": 1.9571428571428572e-05, "loss": 0.2909, "step": 3082 }, { "epoch": 1.1998, "grad_norm": 9.657797813415527, "learning_rate": 1.9561224489795918e-05, "loss": 1.3987, "step": 3083 }, { "epoch": 1.2, "grad_norm": 1.4951329231262207, "learning_rate": 1.9551020408163267e-05, "loss": 0.1309, "step": 3084 }, { "epoch": 1.2002, "grad_norm": 5.286871910095215, "learning_rate": 1.9540816326530613e-05, "loss": 0.9136, "step": 3085 }, { "epoch": 1.2004, "grad_norm": 7.972781658172607, "learning_rate": 1.9530612244897962e-05, "loss": 2.049, "step": 3086 }, { "epoch": 1.2006000000000001, "grad_norm": 7.049349308013916, "learning_rate": 1.9520408163265308e-05, "loss": 0.9573, "step": 3087 }, { "epoch": 1.2008, "grad_norm": 20.929615020751953, "learning_rate": 1.9510204081632654e-05, "loss": 3.7053, "step": 3088 }, { "epoch": 1.201, "grad_norm": 4.493311882019043, "learning_rate": 1.9500000000000003e-05, "loss": 0.48, "step": 3089 }, { "epoch": 1.2012, "grad_norm": 4.102753639221191, "learning_rate": 1.948979591836735e-05, "loss": 0.8753, "step": 3090 }, { "epoch": 1.2014, "grad_norm": 5.101694583892822, "learning_rate": 1.9479591836734695e-05, "loss": 1.4985, "step": 3091 }, { "epoch": 1.2016, "grad_norm": 4.904029846191406, "learning_rate": 1.946938775510204e-05, "loss": 0.6939, "step": 3092 }, { "epoch": 1.2018, "grad_norm": 3.5857245922088623, "learning_rate": 1.945918367346939e-05, "loss": 0.2649, "step": 3093 }, { "epoch": 1.202, "grad_norm": 1.2739484310150146, "learning_rate": 1.9448979591836735e-05, "loss": 0.0412, "step": 3094 }, { "epoch": 1.2022, "grad_norm": 1.253515362739563, "learning_rate": 1.943877551020408e-05, "loss": 0.0718, "step": 3095 }, { "epoch": 1.2024, "grad_norm": 7.14802885055542, "learning_rate": 1.942857142857143e-05, "loss": 0.5822, "step": 3096 }, { "epoch": 1.2026, "grad_norm": 15.709341049194336, "learning_rate": 1.9418367346938776e-05, "loss": 2.2194, "step": 3097 }, { "epoch": 1.2028, "grad_norm": 7.323596477508545, "learning_rate": 1.9408163265306122e-05, "loss": 0.9511, "step": 3098 }, { "epoch": 1.203, "grad_norm": 23.892974853515625, "learning_rate": 1.939795918367347e-05, "loss": 3.5908, "step": 3099 }, { "epoch": 1.2032, "grad_norm": 1.673535704612732, "learning_rate": 1.9387755102040817e-05, "loss": 0.0876, "step": 3100 }, { "epoch": 1.2034, "grad_norm": 1.2727751731872559, "learning_rate": 1.9377551020408163e-05, "loss": 0.0544, "step": 3101 }, { "epoch": 1.2036, "grad_norm": 1.1253530979156494, "learning_rate": 1.9367346938775512e-05, "loss": 0.0778, "step": 3102 }, { "epoch": 1.2038, "grad_norm": 1.170177698135376, "learning_rate": 1.9357142857142858e-05, "loss": 0.0469, "step": 3103 }, { "epoch": 1.204, "grad_norm": 3.109325885772705, "learning_rate": 1.9346938775510203e-05, "loss": 0.2601, "step": 3104 }, { "epoch": 1.2042, "grad_norm": 9.423465728759766, "learning_rate": 1.933673469387755e-05, "loss": 2.4327, "step": 3105 }, { "epoch": 1.2044, "grad_norm": 15.970878601074219, "learning_rate": 1.9326530612244902e-05, "loss": 3.0273, "step": 3106 }, { "epoch": 1.2046000000000001, "grad_norm": 2.112394332885742, "learning_rate": 1.9316326530612248e-05, "loss": 0.1451, "step": 3107 }, { "epoch": 1.2048, "grad_norm": 2.8906023502349854, "learning_rate": 1.9306122448979593e-05, "loss": 0.2964, "step": 3108 }, { "epoch": 1.205, "grad_norm": 5.92968225479126, "learning_rate": 1.929591836734694e-05, "loss": 1.036, "step": 3109 }, { "epoch": 1.2052, "grad_norm": 8.283905982971191, "learning_rate": 1.928571428571429e-05, "loss": 3.0009, "step": 3110 }, { "epoch": 1.2054, "grad_norm": 2.000481605529785, "learning_rate": 1.9275510204081634e-05, "loss": 0.139, "step": 3111 }, { "epoch": 1.2056, "grad_norm": 7.078057289123535, "learning_rate": 1.926530612244898e-05, "loss": 1.4782, "step": 3112 }, { "epoch": 1.2058, "grad_norm": 9.67080307006836, "learning_rate": 1.925510204081633e-05, "loss": 2.2358, "step": 3113 }, { "epoch": 1.206, "grad_norm": 4.307836532592773, "learning_rate": 1.9244897959183675e-05, "loss": 0.1114, "step": 3114 }, { "epoch": 1.2062, "grad_norm": 5.063274383544922, "learning_rate": 1.923469387755102e-05, "loss": 0.3036, "step": 3115 }, { "epoch": 1.2064, "grad_norm": 1.4876151084899902, "learning_rate": 1.922448979591837e-05, "loss": 0.1199, "step": 3116 }, { "epoch": 1.2066, "grad_norm": 1.2019920349121094, "learning_rate": 1.9214285714285716e-05, "loss": 0.0794, "step": 3117 }, { "epoch": 1.2068, "grad_norm": 1.368234395980835, "learning_rate": 1.920408163265306e-05, "loss": 0.049, "step": 3118 }, { "epoch": 1.207, "grad_norm": 1.189566731452942, "learning_rate": 1.9193877551020407e-05, "loss": 0.061, "step": 3119 }, { "epoch": 1.2072, "grad_norm": 3.7581393718719482, "learning_rate": 1.9183673469387756e-05, "loss": 0.3694, "step": 3120 }, { "epoch": 1.2074, "grad_norm": 5.4688520431518555, "learning_rate": 1.9173469387755102e-05, "loss": 1.4537, "step": 3121 }, { "epoch": 1.2076, "grad_norm": 1.7890475988388062, "learning_rate": 1.9163265306122448e-05, "loss": 0.1057, "step": 3122 }, { "epoch": 1.2078, "grad_norm": 1.502280354499817, "learning_rate": 1.9153061224489797e-05, "loss": 0.1204, "step": 3123 }, { "epoch": 1.208, "grad_norm": 3.024693012237549, "learning_rate": 1.9142857142857143e-05, "loss": 0.0935, "step": 3124 }, { "epoch": 1.2082, "grad_norm": 10.54287052154541, "learning_rate": 1.913265306122449e-05, "loss": 0.7667, "step": 3125 }, { "epoch": 1.2084, "grad_norm": 14.563943862915039, "learning_rate": 1.9122448979591838e-05, "loss": 2.016, "step": 3126 }, { "epoch": 1.2086000000000001, "grad_norm": 1.2480556964874268, "learning_rate": 1.9112244897959184e-05, "loss": 0.0672, "step": 3127 }, { "epoch": 1.2088, "grad_norm": 6.187782287597656, "learning_rate": 1.910204081632653e-05, "loss": 0.9447, "step": 3128 }, { "epoch": 1.209, "grad_norm": 11.717743873596191, "learning_rate": 1.909183673469388e-05, "loss": 3.6272, "step": 3129 }, { "epoch": 1.2092, "grad_norm": 3.94441294670105, "learning_rate": 1.9081632653061228e-05, "loss": 0.4909, "step": 3130 }, { "epoch": 1.2094, "grad_norm": 5.881809711456299, "learning_rate": 1.9071428571428574e-05, "loss": 0.7599, "step": 3131 }, { "epoch": 1.2096, "grad_norm": 1.982026219367981, "learning_rate": 1.906122448979592e-05, "loss": 0.1455, "step": 3132 }, { "epoch": 1.2098, "grad_norm": 2.9072039127349854, "learning_rate": 1.905102040816327e-05, "loss": 0.5168, "step": 3133 }, { "epoch": 1.21, "grad_norm": 1.7545095682144165, "learning_rate": 1.9040816326530614e-05, "loss": 0.0821, "step": 3134 }, { "epoch": 1.2102, "grad_norm": 1.8028314113616943, "learning_rate": 1.903061224489796e-05, "loss": 0.1375, "step": 3135 }, { "epoch": 1.2104, "grad_norm": 4.452255725860596, "learning_rate": 1.9020408163265306e-05, "loss": 0.6812, "step": 3136 }, { "epoch": 1.2106, "grad_norm": 10.38892936706543, "learning_rate": 1.9010204081632655e-05, "loss": 1.4032, "step": 3137 }, { "epoch": 1.2107999999999999, "grad_norm": 9.387345314025879, "learning_rate": 1.9e-05, "loss": 3.0628, "step": 3138 }, { "epoch": 1.211, "grad_norm": 5.306800365447998, "learning_rate": 1.8989795918367347e-05, "loss": 0.6412, "step": 3139 }, { "epoch": 1.2112, "grad_norm": 3.5087547302246094, "learning_rate": 1.8979591836734696e-05, "loss": 0.2057, "step": 3140 }, { "epoch": 1.2114, "grad_norm": 5.041624546051025, "learning_rate": 1.8969387755102042e-05, "loss": 0.6413, "step": 3141 }, { "epoch": 1.2116, "grad_norm": 1.5640407800674438, "learning_rate": 1.8959183673469388e-05, "loss": 0.1225, "step": 3142 }, { "epoch": 1.2118, "grad_norm": 1.5499358177185059, "learning_rate": 1.8948979591836737e-05, "loss": 0.1756, "step": 3143 }, { "epoch": 1.212, "grad_norm": 3.4542722702026367, "learning_rate": 1.8938775510204083e-05, "loss": 0.0812, "step": 3144 }, { "epoch": 1.2122, "grad_norm": 6.98813009262085, "learning_rate": 1.892857142857143e-05, "loss": 0.3454, "step": 3145 }, { "epoch": 1.2124, "grad_norm": 2.5921764373779297, "learning_rate": 1.8918367346938774e-05, "loss": 0.463, "step": 3146 }, { "epoch": 1.2126000000000001, "grad_norm": 0.8930935859680176, "learning_rate": 1.8908163265306123e-05, "loss": 0.0271, "step": 3147 }, { "epoch": 1.2128, "grad_norm": 2.3683059215545654, "learning_rate": 1.889795918367347e-05, "loss": 0.2736, "step": 3148 }, { "epoch": 1.213, "grad_norm": 5.169450283050537, "learning_rate": 1.8887755102040815e-05, "loss": 0.8165, "step": 3149 }, { "epoch": 1.2132, "grad_norm": 2.7456088066101074, "learning_rate": 1.8877551020408164e-05, "loss": 0.4483, "step": 3150 }, { "epoch": 1.2134, "grad_norm": 2.067122220993042, "learning_rate": 1.886734693877551e-05, "loss": 0.2132, "step": 3151 }, { "epoch": 1.2136, "grad_norm": 3.7471401691436768, "learning_rate": 1.885714285714286e-05, "loss": 0.6301, "step": 3152 }, { "epoch": 1.2138, "grad_norm": 2.1313626766204834, "learning_rate": 1.8846938775510205e-05, "loss": 0.1247, "step": 3153 }, { "epoch": 1.214, "grad_norm": 8.123455047607422, "learning_rate": 1.8836734693877554e-05, "loss": 0.9875, "step": 3154 }, { "epoch": 1.2142, "grad_norm": 11.68111801147461, "learning_rate": 1.88265306122449e-05, "loss": 1.5304, "step": 3155 }, { "epoch": 1.2144, "grad_norm": 7.343622207641602, "learning_rate": 1.8816326530612246e-05, "loss": 1.0192, "step": 3156 }, { "epoch": 1.2146, "grad_norm": 6.457760334014893, "learning_rate": 1.8806122448979595e-05, "loss": 3.1121, "step": 3157 }, { "epoch": 1.2147999999999999, "grad_norm": 2.7293622493743896, "learning_rate": 1.879591836734694e-05, "loss": 0.5017, "step": 3158 }, { "epoch": 1.215, "grad_norm": 1.0072730779647827, "learning_rate": 1.8785714285714286e-05, "loss": 0.0282, "step": 3159 }, { "epoch": 1.2152, "grad_norm": 3.2834362983703613, "learning_rate": 1.8775510204081632e-05, "loss": 1.0516, "step": 3160 }, { "epoch": 1.2154, "grad_norm": 11.370521545410156, "learning_rate": 1.876530612244898e-05, "loss": 3.0999, "step": 3161 }, { "epoch": 1.2156, "grad_norm": 1.2034019231796265, "learning_rate": 1.8755102040816327e-05, "loss": 0.0787, "step": 3162 }, { "epoch": 1.2158, "grad_norm": 1.3203438520431519, "learning_rate": 1.8744897959183673e-05, "loss": 0.0865, "step": 3163 }, { "epoch": 1.216, "grad_norm": 1.1307048797607422, "learning_rate": 1.8734693877551022e-05, "loss": 0.071, "step": 3164 }, { "epoch": 1.2162, "grad_norm": 0.9662017822265625, "learning_rate": 1.8724489795918368e-05, "loss": 0.0276, "step": 3165 }, { "epoch": 1.2164, "grad_norm": 1.1824790239334106, "learning_rate": 1.8714285714285714e-05, "loss": 0.0763, "step": 3166 }, { "epoch": 1.2166, "grad_norm": 1.7388250827789307, "learning_rate": 1.8704081632653063e-05, "loss": 0.1711, "step": 3167 }, { "epoch": 1.2168, "grad_norm": 2.5671846866607666, "learning_rate": 1.869387755102041e-05, "loss": 0.4834, "step": 3168 }, { "epoch": 1.217, "grad_norm": 2.9493188858032227, "learning_rate": 1.8683673469387754e-05, "loss": 0.4221, "step": 3169 }, { "epoch": 1.2172, "grad_norm": 6.83276891708374, "learning_rate": 1.8673469387755104e-05, "loss": 1.5277, "step": 3170 }, { "epoch": 1.2174, "grad_norm": 5.947171688079834, "learning_rate": 1.866326530612245e-05, "loss": 0.7498, "step": 3171 }, { "epoch": 1.2176, "grad_norm": 1.5263842344284058, "learning_rate": 1.8653061224489795e-05, "loss": 0.1066, "step": 3172 }, { "epoch": 1.2178, "grad_norm": 7.348236083984375, "learning_rate": 1.864285714285714e-05, "loss": 1.0489, "step": 3173 }, { "epoch": 1.218, "grad_norm": 12.764039039611816, "learning_rate": 1.8632653061224494e-05, "loss": 2.8689, "step": 3174 }, { "epoch": 1.2182, "grad_norm": 9.676426887512207, "learning_rate": 1.862244897959184e-05, "loss": 1.3229, "step": 3175 }, { "epoch": 1.2184, "grad_norm": 1.6813863515853882, "learning_rate": 1.8612244897959185e-05, "loss": 0.0723, "step": 3176 }, { "epoch": 1.2186, "grad_norm": 3.3537490367889404, "learning_rate": 1.860204081632653e-05, "loss": 0.3726, "step": 3177 }, { "epoch": 1.2187999999999999, "grad_norm": 5.313870429992676, "learning_rate": 1.859183673469388e-05, "loss": 0.7126, "step": 3178 }, { "epoch": 1.219, "grad_norm": 1.637223720550537, "learning_rate": 1.8581632653061226e-05, "loss": 0.0945, "step": 3179 }, { "epoch": 1.2192, "grad_norm": 2.391230821609497, "learning_rate": 1.8571428571428572e-05, "loss": 0.1888, "step": 3180 }, { "epoch": 1.2194, "grad_norm": 4.959512233734131, "learning_rate": 1.856122448979592e-05, "loss": 0.6768, "step": 3181 }, { "epoch": 1.2196, "grad_norm": 4.827409267425537, "learning_rate": 1.8551020408163267e-05, "loss": 1.2034, "step": 3182 }, { "epoch": 1.2198, "grad_norm": 9.361382484436035, "learning_rate": 1.8540816326530613e-05, "loss": 2.1611, "step": 3183 }, { "epoch": 1.22, "grad_norm": 10.543557167053223, "learning_rate": 1.853061224489796e-05, "loss": 2.0855, "step": 3184 }, { "epoch": 1.2202, "grad_norm": 17.42910385131836, "learning_rate": 1.8520408163265307e-05, "loss": 1.6268, "step": 3185 }, { "epoch": 1.2204, "grad_norm": 1.5924044847488403, "learning_rate": 1.8510204081632653e-05, "loss": 0.1284, "step": 3186 }, { "epoch": 1.2206, "grad_norm": 1.471555471420288, "learning_rate": 1.85e-05, "loss": 0.1076, "step": 3187 }, { "epoch": 1.2208, "grad_norm": 1.4289698600769043, "learning_rate": 1.8489795918367348e-05, "loss": 0.105, "step": 3188 }, { "epoch": 1.221, "grad_norm": 1.8644956350326538, "learning_rate": 1.8479591836734694e-05, "loss": 0.1873, "step": 3189 }, { "epoch": 1.2212, "grad_norm": 3.7627358436584473, "learning_rate": 1.846938775510204e-05, "loss": 0.7693, "step": 3190 }, { "epoch": 1.2214, "grad_norm": 5.294459819793701, "learning_rate": 1.845918367346939e-05, "loss": 0.7467, "step": 3191 }, { "epoch": 1.2216, "grad_norm": 3.0752527713775635, "learning_rate": 1.8448979591836735e-05, "loss": 0.5994, "step": 3192 }, { "epoch": 1.2218, "grad_norm": 4.9536967277526855, "learning_rate": 1.843877551020408e-05, "loss": 0.9815, "step": 3193 }, { "epoch": 1.222, "grad_norm": 9.496366500854492, "learning_rate": 1.842857142857143e-05, "loss": 2.0773, "step": 3194 }, { "epoch": 1.2222, "grad_norm": 5.776390552520752, "learning_rate": 1.8418367346938776e-05, "loss": 1.5251, "step": 3195 }, { "epoch": 1.2224, "grad_norm": 3.0072059631347656, "learning_rate": 1.840816326530612e-05, "loss": 0.5866, "step": 3196 }, { "epoch": 1.2226, "grad_norm": 1.4602209329605103, "learning_rate": 1.839795918367347e-05, "loss": 0.1017, "step": 3197 }, { "epoch": 1.2227999999999999, "grad_norm": 1.5261403322219849, "learning_rate": 1.838775510204082e-05, "loss": 0.0664, "step": 3198 }, { "epoch": 1.223, "grad_norm": 1.498910665512085, "learning_rate": 1.8377551020408165e-05, "loss": 0.1027, "step": 3199 }, { "epoch": 1.2232, "grad_norm": 1.4415857791900635, "learning_rate": 1.836734693877551e-05, "loss": 0.0466, "step": 3200 }, { "epoch": 1.2234, "grad_norm": 1.0863139629364014, "learning_rate": 1.835714285714286e-05, "loss": 0.0707, "step": 3201 }, { "epoch": 1.2236, "grad_norm": 1.3481663465499878, "learning_rate": 1.8346938775510206e-05, "loss": 0.053, "step": 3202 }, { "epoch": 1.2238, "grad_norm": 9.405648231506348, "learning_rate": 1.8336734693877552e-05, "loss": 0.6962, "step": 3203 }, { "epoch": 1.224, "grad_norm": 17.991626739501953, "learning_rate": 1.8326530612244898e-05, "loss": 2.3324, "step": 3204 }, { "epoch": 1.2242, "grad_norm": 5.935586452484131, "learning_rate": 1.8316326530612247e-05, "loss": 0.7266, "step": 3205 }, { "epoch": 1.2244, "grad_norm": 6.7307610511779785, "learning_rate": 1.8306122448979593e-05, "loss": 0.4605, "step": 3206 }, { "epoch": 1.2246, "grad_norm": 10.014396667480469, "learning_rate": 1.829591836734694e-05, "loss": 1.3713, "step": 3207 }, { "epoch": 1.2248, "grad_norm": 2.9402997493743896, "learning_rate": 1.8285714285714288e-05, "loss": 0.5045, "step": 3208 }, { "epoch": 1.225, "grad_norm": 1.3361576795578003, "learning_rate": 1.8275510204081634e-05, "loss": 0.102, "step": 3209 }, { "epoch": 1.2252, "grad_norm": 1.4597071409225464, "learning_rate": 1.826530612244898e-05, "loss": 0.0999, "step": 3210 }, { "epoch": 1.2254, "grad_norm": 1.4014943838119507, "learning_rate": 1.825510204081633e-05, "loss": 0.0607, "step": 3211 }, { "epoch": 1.2256, "grad_norm": 1.4707921743392944, "learning_rate": 1.8244897959183674e-05, "loss": 0.085, "step": 3212 }, { "epoch": 1.2258, "grad_norm": 1.6909431219100952, "learning_rate": 1.823469387755102e-05, "loss": 0.1145, "step": 3213 }, { "epoch": 1.226, "grad_norm": 2.412825584411621, "learning_rate": 1.8224489795918366e-05, "loss": 0.1601, "step": 3214 }, { "epoch": 1.2262, "grad_norm": 2.01613187789917, "learning_rate": 1.8214285714285715e-05, "loss": 0.1601, "step": 3215 }, { "epoch": 1.2264, "grad_norm": 4.6927900314331055, "learning_rate": 1.820408163265306e-05, "loss": 0.8213, "step": 3216 }, { "epoch": 1.2266, "grad_norm": 7.865774154663086, "learning_rate": 1.8193877551020407e-05, "loss": 1.802, "step": 3217 }, { "epoch": 1.2268, "grad_norm": 11.120959281921387, "learning_rate": 1.8183673469387756e-05, "loss": 3.1244, "step": 3218 }, { "epoch": 1.227, "grad_norm": 1.3915950059890747, "learning_rate": 1.81734693877551e-05, "loss": 0.0592, "step": 3219 }, { "epoch": 1.2272, "grad_norm": 2.2663471698760986, "learning_rate": 1.816326530612245e-05, "loss": 0.1695, "step": 3220 }, { "epoch": 1.2274, "grad_norm": 5.7890801429748535, "learning_rate": 1.8153061224489797e-05, "loss": 0.9364, "step": 3221 }, { "epoch": 1.2276, "grad_norm": 4.851029872894287, "learning_rate": 1.8142857142857146e-05, "loss": 0.7313, "step": 3222 }, { "epoch": 1.2278, "grad_norm": 1.829401969909668, "learning_rate": 1.813265306122449e-05, "loss": 0.1682, "step": 3223 }, { "epoch": 1.228, "grad_norm": 2.4404027462005615, "learning_rate": 1.8122448979591837e-05, "loss": 0.1551, "step": 3224 }, { "epoch": 1.2282, "grad_norm": 3.285078763961792, "learning_rate": 1.8112244897959187e-05, "loss": 0.4083, "step": 3225 }, { "epoch": 1.2284, "grad_norm": 5.276399612426758, "learning_rate": 1.8102040816326532e-05, "loss": 1.4821, "step": 3226 }, { "epoch": 1.2286, "grad_norm": 2.0326952934265137, "learning_rate": 1.8091836734693878e-05, "loss": 0.2251, "step": 3227 }, { "epoch": 1.2288000000000001, "grad_norm": 5.430430889129639, "learning_rate": 1.8081632653061227e-05, "loss": 1.5465, "step": 3228 }, { "epoch": 1.229, "grad_norm": 10.54371166229248, "learning_rate": 1.8071428571428573e-05, "loss": 1.6414, "step": 3229 }, { "epoch": 1.2292, "grad_norm": 1.2699222564697266, "learning_rate": 1.806122448979592e-05, "loss": 0.0412, "step": 3230 }, { "epoch": 1.2294, "grad_norm": 2.6544322967529297, "learning_rate": 1.8051020408163265e-05, "loss": 0.2643, "step": 3231 }, { "epoch": 1.2296, "grad_norm": 2.2963390350341797, "learning_rate": 1.8040816326530614e-05, "loss": 0.5428, "step": 3232 }, { "epoch": 1.2298, "grad_norm": 1.345909595489502, "learning_rate": 1.803061224489796e-05, "loss": 0.0937, "step": 3233 }, { "epoch": 1.23, "grad_norm": 1.867246150970459, "learning_rate": 1.8020408163265305e-05, "loss": 0.1299, "step": 3234 }, { "epoch": 1.2302, "grad_norm": 1.998002529144287, "learning_rate": 1.8010204081632655e-05, "loss": 0.2057, "step": 3235 }, { "epoch": 1.2304, "grad_norm": 3.9584403038024902, "learning_rate": 1.8e-05, "loss": 0.6425, "step": 3236 }, { "epoch": 1.2306, "grad_norm": 6.284635543823242, "learning_rate": 1.7989795918367346e-05, "loss": 0.7627, "step": 3237 }, { "epoch": 1.2308, "grad_norm": 2.366175651550293, "learning_rate": 1.7979591836734695e-05, "loss": 0.1369, "step": 3238 }, { "epoch": 1.231, "grad_norm": 5.042405128479004, "learning_rate": 1.796938775510204e-05, "loss": 0.6354, "step": 3239 }, { "epoch": 1.2312, "grad_norm": 2.543888568878174, "learning_rate": 1.7959183673469387e-05, "loss": 0.1593, "step": 3240 }, { "epoch": 1.2314, "grad_norm": 4.086974620819092, "learning_rate": 1.7948979591836733e-05, "loss": 0.2952, "step": 3241 }, { "epoch": 1.2316, "grad_norm": 2.588975429534912, "learning_rate": 1.7938775510204085e-05, "loss": 0.1849, "step": 3242 }, { "epoch": 1.2318, "grad_norm": 8.656416893005371, "learning_rate": 1.792857142857143e-05, "loss": 0.7699, "step": 3243 }, { "epoch": 1.232, "grad_norm": 4.926800727844238, "learning_rate": 1.7918367346938777e-05, "loss": 0.6368, "step": 3244 }, { "epoch": 1.2322, "grad_norm": 1.3040525913238525, "learning_rate": 1.7908163265306123e-05, "loss": 0.0759, "step": 3245 }, { "epoch": 1.2324, "grad_norm": 1.3231332302093506, "learning_rate": 1.7897959183673472e-05, "loss": 0.0512, "step": 3246 }, { "epoch": 1.2326, "grad_norm": 1.4585577249526978, "learning_rate": 1.7887755102040818e-05, "loss": 0.0938, "step": 3247 }, { "epoch": 1.2328000000000001, "grad_norm": 2.9919850826263428, "learning_rate": 1.7877551020408164e-05, "loss": 0.5489, "step": 3248 }, { "epoch": 1.233, "grad_norm": 1.8937764167785645, "learning_rate": 1.7867346938775513e-05, "loss": 0.1754, "step": 3249 }, { "epoch": 1.2332, "grad_norm": 2.412397861480713, "learning_rate": 1.785714285714286e-05, "loss": 0.2007, "step": 3250 }, { "epoch": 1.2334, "grad_norm": 4.665653228759766, "learning_rate": 1.7846938775510204e-05, "loss": 0.4089, "step": 3251 }, { "epoch": 1.2336, "grad_norm": 6.121510028839111, "learning_rate": 1.7836734693877553e-05, "loss": 0.786, "step": 3252 }, { "epoch": 1.2338, "grad_norm": 4.21718168258667, "learning_rate": 1.78265306122449e-05, "loss": 0.5882, "step": 3253 }, { "epoch": 1.234, "grad_norm": 1.6234577894210815, "learning_rate": 1.7816326530612245e-05, "loss": 0.1117, "step": 3254 }, { "epoch": 1.2342, "grad_norm": 1.4547427892684937, "learning_rate": 1.780612244897959e-05, "loss": 0.0564, "step": 3255 }, { "epoch": 1.2344, "grad_norm": 1.4127044677734375, "learning_rate": 1.779591836734694e-05, "loss": 0.1022, "step": 3256 }, { "epoch": 1.2346, "grad_norm": 5.04308557510376, "learning_rate": 1.7785714285714286e-05, "loss": 0.8675, "step": 3257 }, { "epoch": 1.2348, "grad_norm": 8.409255981445312, "learning_rate": 1.777551020408163e-05, "loss": 1.7431, "step": 3258 }, { "epoch": 1.2349999999999999, "grad_norm": 3.6114909648895264, "learning_rate": 1.776530612244898e-05, "loss": 0.8014, "step": 3259 }, { "epoch": 1.2352, "grad_norm": 4.705353736877441, "learning_rate": 1.7755102040816327e-05, "loss": 1.4045, "step": 3260 }, { "epoch": 1.2354, "grad_norm": 1.5038385391235352, "learning_rate": 1.7744897959183672e-05, "loss": 0.1013, "step": 3261 }, { "epoch": 1.2356, "grad_norm": 2.7289583683013916, "learning_rate": 1.773469387755102e-05, "loss": 0.479, "step": 3262 }, { "epoch": 1.2358, "grad_norm": 6.702466011047363, "learning_rate": 1.7724489795918367e-05, "loss": 0.9266, "step": 3263 }, { "epoch": 1.236, "grad_norm": 12.450255393981934, "learning_rate": 1.7714285714285713e-05, "loss": 3.5627, "step": 3264 }, { "epoch": 1.2362, "grad_norm": 3.067748785018921, "learning_rate": 1.7704081632653062e-05, "loss": 0.3582, "step": 3265 }, { "epoch": 1.2364, "grad_norm": 2.5850586891174316, "learning_rate": 1.769387755102041e-05, "loss": 0.1721, "step": 3266 }, { "epoch": 1.2366, "grad_norm": 5.197973728179932, "learning_rate": 1.7683673469387757e-05, "loss": 0.6389, "step": 3267 }, { "epoch": 1.2368000000000001, "grad_norm": 1.110595464706421, "learning_rate": 1.7673469387755103e-05, "loss": 0.033, "step": 3268 }, { "epoch": 1.237, "grad_norm": 1.408860445022583, "learning_rate": 1.7663265306122452e-05, "loss": 0.0923, "step": 3269 }, { "epoch": 1.2372, "grad_norm": 1.7110408544540405, "learning_rate": 1.7653061224489798e-05, "loss": 0.1991, "step": 3270 }, { "epoch": 1.2374, "grad_norm": 2.680989980697632, "learning_rate": 1.7642857142857144e-05, "loss": 0.4824, "step": 3271 }, { "epoch": 1.2376, "grad_norm": 1.338800072669983, "learning_rate": 1.763265306122449e-05, "loss": 0.0762, "step": 3272 }, { "epoch": 1.2378, "grad_norm": 3.547950506210327, "learning_rate": 1.762244897959184e-05, "loss": 0.4761, "step": 3273 }, { "epoch": 1.238, "grad_norm": 5.276917934417725, "learning_rate": 1.7612244897959185e-05, "loss": 1.4496, "step": 3274 }, { "epoch": 1.2382, "grad_norm": 1.638073444366455, "learning_rate": 1.760204081632653e-05, "loss": 0.1042, "step": 3275 }, { "epoch": 1.2384, "grad_norm": 0.9599982500076294, "learning_rate": 1.759183673469388e-05, "loss": 0.0289, "step": 3276 }, { "epoch": 1.2386, "grad_norm": 1.8225157260894775, "learning_rate": 1.7581632653061225e-05, "loss": 0.133, "step": 3277 }, { "epoch": 1.2388, "grad_norm": 1.4022389650344849, "learning_rate": 1.757142857142857e-05, "loss": 0.0832, "step": 3278 }, { "epoch": 1.2389999999999999, "grad_norm": 1.6842702627182007, "learning_rate": 1.756122448979592e-05, "loss": 0.1013, "step": 3279 }, { "epoch": 1.2392, "grad_norm": 1.0950801372528076, "learning_rate": 1.7551020408163266e-05, "loss": 0.0562, "step": 3280 }, { "epoch": 1.2394, "grad_norm": 5.457022666931152, "learning_rate": 1.7540816326530612e-05, "loss": 0.8113, "step": 3281 }, { "epoch": 1.2396, "grad_norm": 11.909248352050781, "learning_rate": 1.7530612244897958e-05, "loss": 2.2316, "step": 3282 }, { "epoch": 1.2398, "grad_norm": 7.8928422927856445, "learning_rate": 1.7520408163265307e-05, "loss": 1.4548, "step": 3283 }, { "epoch": 1.24, "grad_norm": 2.9538159370422363, "learning_rate": 1.7510204081632653e-05, "loss": 0.6393, "step": 3284 }, { "epoch": 1.2402, "grad_norm": 1.4128315448760986, "learning_rate": 1.75e-05, "loss": 0.0856, "step": 3285 }, { "epoch": 1.2404, "grad_norm": 1.7415316104888916, "learning_rate": 1.7489795918367348e-05, "loss": 0.1573, "step": 3286 }, { "epoch": 1.2406, "grad_norm": 3.041987419128418, "learning_rate": 1.7479591836734693e-05, "loss": 0.5616, "step": 3287 }, { "epoch": 1.2408, "grad_norm": 2.1510403156280518, "learning_rate": 1.7469387755102043e-05, "loss": 0.1567, "step": 3288 }, { "epoch": 1.241, "grad_norm": 4.927199363708496, "learning_rate": 1.745918367346939e-05, "loss": 0.6416, "step": 3289 }, { "epoch": 1.2412, "grad_norm": 1.535884141921997, "learning_rate": 1.7448979591836738e-05, "loss": 0.1165, "step": 3290 }, { "epoch": 1.2414, "grad_norm": 1.228538155555725, "learning_rate": 1.7438775510204083e-05, "loss": 0.0394, "step": 3291 }, { "epoch": 1.2416, "grad_norm": 1.5323854684829712, "learning_rate": 1.742857142857143e-05, "loss": 0.1044, "step": 3292 }, { "epoch": 1.2418, "grad_norm": 1.7505450248718262, "learning_rate": 1.741836734693878e-05, "loss": 0.0901, "step": 3293 }, { "epoch": 1.242, "grad_norm": 1.385218858718872, "learning_rate": 1.7408163265306124e-05, "loss": 0.0434, "step": 3294 }, { "epoch": 1.2422, "grad_norm": 1.3750401735305786, "learning_rate": 1.739795918367347e-05, "loss": 0.0524, "step": 3295 }, { "epoch": 1.2424, "grad_norm": 4.895515441894531, "learning_rate": 1.738775510204082e-05, "loss": 0.9659, "step": 3296 }, { "epoch": 1.2426, "grad_norm": 7.652602672576904, "learning_rate": 1.7377551020408165e-05, "loss": 1.8379, "step": 3297 }, { "epoch": 1.2428, "grad_norm": 1.8131300210952759, "learning_rate": 1.736734693877551e-05, "loss": 0.1161, "step": 3298 }, { "epoch": 1.2429999999999999, "grad_norm": 1.3964576721191406, "learning_rate": 1.7357142857142856e-05, "loss": 0.0571, "step": 3299 }, { "epoch": 1.2432, "grad_norm": 1.4084566831588745, "learning_rate": 1.7346938775510206e-05, "loss": 0.1014, "step": 3300 }, { "epoch": 1.2434, "grad_norm": 1.425021767616272, "learning_rate": 1.733673469387755e-05, "loss": 0.0705, "step": 3301 }, { "epoch": 1.2436, "grad_norm": 4.891021251678467, "learning_rate": 1.7326530612244897e-05, "loss": 0.9931, "step": 3302 }, { "epoch": 1.2438, "grad_norm": 9.658141136169434, "learning_rate": 1.7316326530612246e-05, "loss": 3.5513, "step": 3303 }, { "epoch": 1.244, "grad_norm": 2.6448793411254883, "learning_rate": 1.7306122448979592e-05, "loss": 0.4798, "step": 3304 }, { "epoch": 1.2442, "grad_norm": 1.2641750574111938, "learning_rate": 1.7295918367346938e-05, "loss": 0.0766, "step": 3305 }, { "epoch": 1.2444, "grad_norm": 5.329100608825684, "learning_rate": 1.7285714285714287e-05, "loss": 0.9358, "step": 3306 }, { "epoch": 1.2446, "grad_norm": 8.251531600952148, "learning_rate": 1.7275510204081633e-05, "loss": 3.4661, "step": 3307 }, { "epoch": 1.2448, "grad_norm": 1.6422741413116455, "learning_rate": 1.726530612244898e-05, "loss": 0.0595, "step": 3308 }, { "epoch": 1.245, "grad_norm": 1.5212047100067139, "learning_rate": 1.7255102040816325e-05, "loss": 0.1585, "step": 3309 }, { "epoch": 1.2452, "grad_norm": 2.3259589672088623, "learning_rate": 1.7244897959183677e-05, "loss": 0.4884, "step": 3310 }, { "epoch": 1.2454, "grad_norm": 1.1222413778305054, "learning_rate": 1.7234693877551023e-05, "loss": 0.0563, "step": 3311 }, { "epoch": 1.2456, "grad_norm": 1.3256467580795288, "learning_rate": 1.722448979591837e-05, "loss": 0.086, "step": 3312 }, { "epoch": 1.2458, "grad_norm": 1.7415765523910522, "learning_rate": 1.7214285714285715e-05, "loss": 0.1463, "step": 3313 }, { "epoch": 1.246, "grad_norm": 2.6794984340667725, "learning_rate": 1.7204081632653064e-05, "loss": 0.4776, "step": 3314 }, { "epoch": 1.2462, "grad_norm": 1.9408143758773804, "learning_rate": 1.719387755102041e-05, "loss": 0.1916, "step": 3315 }, { "epoch": 1.2464, "grad_norm": 9.202816009521484, "learning_rate": 1.7183673469387755e-05, "loss": 0.7523, "step": 3316 }, { "epoch": 1.2466, "grad_norm": 7.594089984893799, "learning_rate": 1.7173469387755104e-05, "loss": 1.3364, "step": 3317 }, { "epoch": 1.2468, "grad_norm": 12.51789379119873, "learning_rate": 1.716326530612245e-05, "loss": 3.4165, "step": 3318 }, { "epoch": 1.2469999999999999, "grad_norm": 1.0752918720245361, "learning_rate": 1.7153061224489796e-05, "loss": 0.0322, "step": 3319 }, { "epoch": 1.2472, "grad_norm": 2.296510934829712, "learning_rate": 1.7142857142857145e-05, "loss": 0.1935, "step": 3320 }, { "epoch": 1.2474, "grad_norm": 5.305103778839111, "learning_rate": 1.713265306122449e-05, "loss": 0.6499, "step": 3321 }, { "epoch": 1.2476, "grad_norm": 1.4535934925079346, "learning_rate": 1.7122448979591837e-05, "loss": 0.0932, "step": 3322 }, { "epoch": 1.2478, "grad_norm": 3.06929612159729, "learning_rate": 1.7112244897959183e-05, "loss": 0.3918, "step": 3323 }, { "epoch": 1.248, "grad_norm": 5.280209541320801, "learning_rate": 1.7102040816326532e-05, "loss": 1.4359, "step": 3324 }, { "epoch": 1.2482, "grad_norm": 1.0554393529891968, "learning_rate": 1.7091836734693878e-05, "loss": 0.0297, "step": 3325 }, { "epoch": 1.2484, "grad_norm": 1.6997267007827759, "learning_rate": 1.7081632653061223e-05, "loss": 0.0987, "step": 3326 }, { "epoch": 1.2486, "grad_norm": 2.0310628414154053, "learning_rate": 1.7071428571428573e-05, "loss": 0.1564, "step": 3327 }, { "epoch": 1.2488, "grad_norm": 4.670139312744141, "learning_rate": 1.706122448979592e-05, "loss": 0.8549, "step": 3328 }, { "epoch": 1.249, "grad_norm": 4.195288181304932, "learning_rate": 1.7051020408163264e-05, "loss": 0.7412, "step": 3329 }, { "epoch": 1.2492, "grad_norm": 11.199814796447754, "learning_rate": 1.7040816326530613e-05, "loss": 1.9341, "step": 3330 }, { "epoch": 1.2494, "grad_norm": 15.108905792236328, "learning_rate": 1.703061224489796e-05, "loss": 1.6621, "step": 3331 }, { "epoch": 1.2496, "grad_norm": 9.078606605529785, "learning_rate": 1.7020408163265305e-05, "loss": 0.401, "step": 3332 }, { "epoch": 1.2498, "grad_norm": 17.929407119750977, "learning_rate": 1.7010204081632654e-05, "loss": 1.9839, "step": 3333 }, { "epoch": 1.25, "grad_norm": 5.080728530883789, "learning_rate": 1.7000000000000003e-05, "loss": 1.3457, "step": 3334 }, { "epoch": 1.2502, "grad_norm": 1.418836236000061, "learning_rate": 1.698979591836735e-05, "loss": 0.0805, "step": 3335 }, { "epoch": 1.2504, "grad_norm": 1.9480489492416382, "learning_rate": 1.6979591836734695e-05, "loss": 0.1381, "step": 3336 }, { "epoch": 1.2506, "grad_norm": 1.7020474672317505, "learning_rate": 1.6969387755102044e-05, "loss": 0.1188, "step": 3337 }, { "epoch": 1.2508, "grad_norm": 2.2416229248046875, "learning_rate": 1.695918367346939e-05, "loss": 0.2585, "step": 3338 }, { "epoch": 1.251, "grad_norm": 3.647240400314331, "learning_rate": 1.6948979591836736e-05, "loss": 0.7583, "step": 3339 }, { "epoch": 1.2511999999999999, "grad_norm": 1.6109038591384888, "learning_rate": 1.693877551020408e-05, "loss": 0.1145, "step": 3340 }, { "epoch": 1.2514, "grad_norm": 2.7248711585998535, "learning_rate": 1.692857142857143e-05, "loss": 0.2185, "step": 3341 }, { "epoch": 1.2516, "grad_norm": 4.435530185699463, "learning_rate": 1.6918367346938776e-05, "loss": 0.4876, "step": 3342 }, { "epoch": 1.2518, "grad_norm": 3.164839744567871, "learning_rate": 1.6908163265306122e-05, "loss": 0.602, "step": 3343 }, { "epoch": 1.252, "grad_norm": 1.6481194496154785, "learning_rate": 1.689795918367347e-05, "loss": 0.1389, "step": 3344 }, { "epoch": 1.2522, "grad_norm": 8.841111183166504, "learning_rate": 1.6887755102040817e-05, "loss": 0.6078, "step": 3345 }, { "epoch": 1.2524, "grad_norm": 13.993014335632324, "learning_rate": 1.6877551020408163e-05, "loss": 1.9227, "step": 3346 }, { "epoch": 1.2526, "grad_norm": 1.9829548597335815, "learning_rate": 1.6867346938775512e-05, "loss": 0.2012, "step": 3347 }, { "epoch": 1.2528000000000001, "grad_norm": 2.797745704650879, "learning_rate": 1.6857142857142858e-05, "loss": 0.4567, "step": 3348 }, { "epoch": 1.2530000000000001, "grad_norm": 3.1786699295043945, "learning_rate": 1.6846938775510204e-05, "loss": 0.3397, "step": 3349 }, { "epoch": 1.2532, "grad_norm": 12.750046730041504, "learning_rate": 1.683673469387755e-05, "loss": 1.9939, "step": 3350 }, { "epoch": 1.2534, "grad_norm": 12.33714485168457, "learning_rate": 1.68265306122449e-05, "loss": 1.5595, "step": 3351 }, { "epoch": 1.2536, "grad_norm": 2.612457275390625, "learning_rate": 1.6816326530612244e-05, "loss": 0.5026, "step": 3352 }, { "epoch": 1.2538, "grad_norm": 2.011706590652466, "learning_rate": 1.680612244897959e-05, "loss": 0.1798, "step": 3353 }, { "epoch": 1.254, "grad_norm": 3.1007089614868164, "learning_rate": 1.679591836734694e-05, "loss": 0.5509, "step": 3354 }, { "epoch": 1.2542, "grad_norm": 1.5206526517868042, "learning_rate": 1.6785714285714285e-05, "loss": 0.1098, "step": 3355 }, { "epoch": 1.2544, "grad_norm": 1.6399301290512085, "learning_rate": 1.6775510204081634e-05, "loss": 0.1127, "step": 3356 }, { "epoch": 1.2546, "grad_norm": 1.327517032623291, "learning_rate": 1.676530612244898e-05, "loss": 0.0481, "step": 3357 }, { "epoch": 1.2548, "grad_norm": 1.5172820091247559, "learning_rate": 1.675510204081633e-05, "loss": 0.1195, "step": 3358 }, { "epoch": 1.255, "grad_norm": 1.360618233680725, "learning_rate": 1.6744897959183675e-05, "loss": 0.0576, "step": 3359 }, { "epoch": 1.2551999999999999, "grad_norm": 2.326371431350708, "learning_rate": 1.673469387755102e-05, "loss": 0.1188, "step": 3360 }, { "epoch": 1.2554, "grad_norm": 5.61547327041626, "learning_rate": 1.672448979591837e-05, "loss": 0.7798, "step": 3361 }, { "epoch": 1.2556, "grad_norm": 5.132718563079834, "learning_rate": 1.6714285714285716e-05, "loss": 0.6264, "step": 3362 }, { "epoch": 1.2558, "grad_norm": 8.089487075805664, "learning_rate": 1.6704081632653062e-05, "loss": 0.7934, "step": 3363 }, { "epoch": 1.256, "grad_norm": 1.711835265159607, "learning_rate": 1.669387755102041e-05, "loss": 0.1151, "step": 3364 }, { "epoch": 1.2562, "grad_norm": 4.24153995513916, "learning_rate": 1.6683673469387757e-05, "loss": 0.9138, "step": 3365 }, { "epoch": 1.2564, "grad_norm": 9.217023849487305, "learning_rate": 1.6673469387755102e-05, "loss": 3.4371, "step": 3366 }, { "epoch": 1.2566, "grad_norm": 3.100184679031372, "learning_rate": 1.6663265306122448e-05, "loss": 0.3243, "step": 3367 }, { "epoch": 1.2568, "grad_norm": 2.0953240394592285, "learning_rate": 1.6653061224489797e-05, "loss": 0.1945, "step": 3368 }, { "epoch": 1.2570000000000001, "grad_norm": 3.7271270751953125, "learning_rate": 1.6642857142857143e-05, "loss": 0.7156, "step": 3369 }, { "epoch": 1.2572, "grad_norm": 2.181183099746704, "learning_rate": 1.663265306122449e-05, "loss": 0.1978, "step": 3370 }, { "epoch": 1.2574, "grad_norm": 2.6793060302734375, "learning_rate": 1.6622448979591838e-05, "loss": 0.4631, "step": 3371 }, { "epoch": 1.2576, "grad_norm": 1.3397785425186157, "learning_rate": 1.6612244897959184e-05, "loss": 0.0393, "step": 3372 }, { "epoch": 1.2578, "grad_norm": 1.6295970678329468, "learning_rate": 1.660204081632653e-05, "loss": 0.1165, "step": 3373 }, { "epoch": 1.258, "grad_norm": 1.4244837760925293, "learning_rate": 1.659183673469388e-05, "loss": 0.0769, "step": 3374 }, { "epoch": 1.2582, "grad_norm": 1.6356065273284912, "learning_rate": 1.6581632653061225e-05, "loss": 0.1071, "step": 3375 }, { "epoch": 1.2584, "grad_norm": 5.48160457611084, "learning_rate": 1.657142857142857e-05, "loss": 1.4024, "step": 3376 }, { "epoch": 1.2586, "grad_norm": 8.133343696594238, "learning_rate": 1.6561224489795916e-05, "loss": 1.9838, "step": 3377 }, { "epoch": 1.2588, "grad_norm": 2.3891677856445312, "learning_rate": 1.6551020408163266e-05, "loss": 0.1329, "step": 3378 }, { "epoch": 1.259, "grad_norm": 1.8075857162475586, "learning_rate": 1.6540816326530615e-05, "loss": 0.069, "step": 3379 }, { "epoch": 1.2591999999999999, "grad_norm": 1.2503466606140137, "learning_rate": 1.653061224489796e-05, "loss": 0.0409, "step": 3380 }, { "epoch": 1.2594, "grad_norm": 1.4084055423736572, "learning_rate": 1.6520408163265306e-05, "loss": 0.082, "step": 3381 }, { "epoch": 1.2596, "grad_norm": 2.6745080947875977, "learning_rate": 1.6510204081632655e-05, "loss": 0.1938, "step": 3382 }, { "epoch": 1.2598, "grad_norm": 5.423195838928223, "learning_rate": 1.65e-05, "loss": 0.7431, "step": 3383 }, { "epoch": 1.26, "grad_norm": 1.263220191001892, "learning_rate": 1.6489795918367347e-05, "loss": 0.0898, "step": 3384 }, { "epoch": 1.2602, "grad_norm": 1.5418835878372192, "learning_rate": 1.6479591836734696e-05, "loss": 0.0626, "step": 3385 }, { "epoch": 1.2604, "grad_norm": 1.559310793876648, "learning_rate": 1.6469387755102042e-05, "loss": 0.0583, "step": 3386 }, { "epoch": 1.2606, "grad_norm": 1.265760898590088, "learning_rate": 1.6459183673469388e-05, "loss": 0.042, "step": 3387 }, { "epoch": 1.2608, "grad_norm": 1.7681432962417603, "learning_rate": 1.6448979591836737e-05, "loss": 0.1167, "step": 3388 }, { "epoch": 1.2610000000000001, "grad_norm": 2.5975985527038574, "learning_rate": 1.6438775510204083e-05, "loss": 0.1504, "step": 3389 }, { "epoch": 1.2612, "grad_norm": 2.9997403621673584, "learning_rate": 1.642857142857143e-05, "loss": 0.4652, "step": 3390 }, { "epoch": 1.2614, "grad_norm": 5.229567050933838, "learning_rate": 1.6418367346938778e-05, "loss": 1.5219, "step": 3391 }, { "epoch": 1.2616, "grad_norm": 4.952725410461426, "learning_rate": 1.6408163265306124e-05, "loss": 0.6158, "step": 3392 }, { "epoch": 1.2618, "grad_norm": 1.8712340593338013, "learning_rate": 1.639795918367347e-05, "loss": 0.1638, "step": 3393 }, { "epoch": 1.262, "grad_norm": 2.623368501663208, "learning_rate": 1.6387755102040815e-05, "loss": 0.4956, "step": 3394 }, { "epoch": 1.2622, "grad_norm": 6.142118453979492, "learning_rate": 1.6377551020408164e-05, "loss": 1.0003, "step": 3395 }, { "epoch": 1.2624, "grad_norm": 16.167905807495117, "learning_rate": 1.636734693877551e-05, "loss": 3.6286, "step": 3396 }, { "epoch": 1.2626, "grad_norm": 6.149655818939209, "learning_rate": 1.6357142857142856e-05, "loss": 1.6503, "step": 3397 }, { "epoch": 1.2628, "grad_norm": 10.63801097869873, "learning_rate": 1.6346938775510205e-05, "loss": 3.1041, "step": 3398 }, { "epoch": 1.263, "grad_norm": 10.49075984954834, "learning_rate": 1.633673469387755e-05, "loss": 3.8939, "step": 3399 }, { "epoch": 1.2631999999999999, "grad_norm": 9.431962966918945, "learning_rate": 1.6326530612244897e-05, "loss": 3.4103, "step": 3400 }, { "epoch": 1.2634, "grad_norm": 1.8127503395080566, "learning_rate": 1.6316326530612246e-05, "loss": 0.144, "step": 3401 }, { "epoch": 1.2636, "grad_norm": 3.003711223602295, "learning_rate": 1.6306122448979595e-05, "loss": 0.354, "step": 3402 }, { "epoch": 1.2638, "grad_norm": 1.5330774784088135, "learning_rate": 1.629591836734694e-05, "loss": 0.1217, "step": 3403 }, { "epoch": 1.264, "grad_norm": 2.6452956199645996, "learning_rate": 1.6285714285714287e-05, "loss": 0.18, "step": 3404 }, { "epoch": 1.2642, "grad_norm": 6.225276947021484, "learning_rate": 1.6275510204081636e-05, "loss": 0.8655, "step": 3405 }, { "epoch": 1.2644, "grad_norm": 4.707012176513672, "learning_rate": 1.626530612244898e-05, "loss": 0.7126, "step": 3406 }, { "epoch": 1.2646, "grad_norm": 1.9787338972091675, "learning_rate": 1.6255102040816327e-05, "loss": 0.1043, "step": 3407 }, { "epoch": 1.2648, "grad_norm": 3.948756694793701, "learning_rate": 1.6244897959183673e-05, "loss": 0.4824, "step": 3408 }, { "epoch": 1.2650000000000001, "grad_norm": 4.800436019897461, "learning_rate": 1.6234693877551022e-05, "loss": 0.6866, "step": 3409 }, { "epoch": 1.2652, "grad_norm": 1.5941983461380005, "learning_rate": 1.6224489795918368e-05, "loss": 0.1029, "step": 3410 }, { "epoch": 1.2654, "grad_norm": 1.7154693603515625, "learning_rate": 1.6214285714285714e-05, "loss": 0.115, "step": 3411 }, { "epoch": 1.2656, "grad_norm": 3.7221055030822754, "learning_rate": 1.6204081632653063e-05, "loss": 0.6513, "step": 3412 }, { "epoch": 1.2658, "grad_norm": 8.22921085357666, "learning_rate": 1.619387755102041e-05, "loss": 0.7431, "step": 3413 }, { "epoch": 1.266, "grad_norm": 1.383772611618042, "learning_rate": 1.6183673469387755e-05, "loss": 0.086, "step": 3414 }, { "epoch": 1.2662, "grad_norm": 1.8097155094146729, "learning_rate": 1.6173469387755104e-05, "loss": 0.1126, "step": 3415 }, { "epoch": 1.2664, "grad_norm": 2.1031274795532227, "learning_rate": 1.616326530612245e-05, "loss": 0.1949, "step": 3416 }, { "epoch": 1.2666, "grad_norm": 7.550622463226318, "learning_rate": 1.6153061224489795e-05, "loss": 0.7638, "step": 3417 }, { "epoch": 1.2668, "grad_norm": 3.7834267616271973, "learning_rate": 1.614285714285714e-05, "loss": 0.6121, "step": 3418 }, { "epoch": 1.267, "grad_norm": 4.540528297424316, "learning_rate": 1.613265306122449e-05, "loss": 0.5646, "step": 3419 }, { "epoch": 1.2671999999999999, "grad_norm": 1.4493329524993896, "learning_rate": 1.6122448979591836e-05, "loss": 0.0882, "step": 3420 }, { "epoch": 1.2674, "grad_norm": 2.163473129272461, "learning_rate": 1.6112244897959182e-05, "loss": 0.1259, "step": 3421 }, { "epoch": 1.2676, "grad_norm": 4.591796398162842, "learning_rate": 1.610204081632653e-05, "loss": 0.5881, "step": 3422 }, { "epoch": 1.2678, "grad_norm": 3.2775096893310547, "learning_rate": 1.6091836734693877e-05, "loss": 0.1809, "step": 3423 }, { "epoch": 1.268, "grad_norm": 6.489138603210449, "learning_rate": 1.6081632653061226e-05, "loss": 0.7169, "step": 3424 }, { "epoch": 1.2682, "grad_norm": 1.3613406419754028, "learning_rate": 1.6071428571428572e-05, "loss": 0.0461, "step": 3425 }, { "epoch": 1.2684, "grad_norm": 2.6900811195373535, "learning_rate": 1.606122448979592e-05, "loss": 0.1681, "step": 3426 }, { "epoch": 1.2686, "grad_norm": 6.287497520446777, "learning_rate": 1.6051020408163267e-05, "loss": 0.7438, "step": 3427 }, { "epoch": 1.2688, "grad_norm": 1.5202996730804443, "learning_rate": 1.6040816326530613e-05, "loss": 0.0881, "step": 3428 }, { "epoch": 1.2690000000000001, "grad_norm": 1.743475079536438, "learning_rate": 1.6030612244897962e-05, "loss": 0.0626, "step": 3429 }, { "epoch": 1.2692, "grad_norm": 1.582897663116455, "learning_rate": 1.6020408163265308e-05, "loss": 0.0645, "step": 3430 }, { "epoch": 1.2694, "grad_norm": 7.358126640319824, "learning_rate": 1.6010204081632653e-05, "loss": 0.5717, "step": 3431 }, { "epoch": 1.2696, "grad_norm": 13.37587833404541, "learning_rate": 1.6000000000000003e-05, "loss": 2.0154, "step": 3432 }, { "epoch": 1.2698, "grad_norm": 2.172822952270508, "learning_rate": 1.598979591836735e-05, "loss": 0.1618, "step": 3433 }, { "epoch": 1.27, "grad_norm": 2.9426889419555664, "learning_rate": 1.5979591836734694e-05, "loss": 0.2733, "step": 3434 }, { "epoch": 1.2702, "grad_norm": 1.3792476654052734, "learning_rate": 1.596938775510204e-05, "loss": 0.0719, "step": 3435 }, { "epoch": 1.2704, "grad_norm": 1.2770227193832397, "learning_rate": 1.595918367346939e-05, "loss": 0.0714, "step": 3436 }, { "epoch": 1.2706, "grad_norm": 1.3073512315750122, "learning_rate": 1.5948979591836735e-05, "loss": 0.1088, "step": 3437 }, { "epoch": 1.2708, "grad_norm": 1.2940077781677246, "learning_rate": 1.593877551020408e-05, "loss": 0.0461, "step": 3438 }, { "epoch": 1.271, "grad_norm": 2.482409715652466, "learning_rate": 1.592857142857143e-05, "loss": 0.1819, "step": 3439 }, { "epoch": 1.2711999999999999, "grad_norm": 4.856838703155518, "learning_rate": 1.5918367346938776e-05, "loss": 0.689, "step": 3440 }, { "epoch": 1.2713999999999999, "grad_norm": 2.8870060443878174, "learning_rate": 1.590816326530612e-05, "loss": 0.4883, "step": 3441 }, { "epoch": 1.2716, "grad_norm": 1.156765103340149, "learning_rate": 1.589795918367347e-05, "loss": 0.0341, "step": 3442 }, { "epoch": 1.2718, "grad_norm": 2.10788893699646, "learning_rate": 1.5887755102040817e-05, "loss": 0.2436, "step": 3443 }, { "epoch": 1.272, "grad_norm": 2.6830666065216064, "learning_rate": 1.5877551020408162e-05, "loss": 0.479, "step": 3444 }, { "epoch": 1.2722, "grad_norm": 1.2923004627227783, "learning_rate": 1.5867346938775508e-05, "loss": 0.1194, "step": 3445 }, { "epoch": 1.2724, "grad_norm": 1.9385242462158203, "learning_rate": 1.5857142857142857e-05, "loss": 0.1861, "step": 3446 }, { "epoch": 1.2726, "grad_norm": 3.1385715007781982, "learning_rate": 1.5846938775510206e-05, "loss": 0.6352, "step": 3447 }, { "epoch": 1.2728, "grad_norm": 1.4715768098831177, "learning_rate": 1.5836734693877552e-05, "loss": 0.0476, "step": 3448 }, { "epoch": 1.2730000000000001, "grad_norm": 1.7465702295303345, "learning_rate": 1.5826530612244898e-05, "loss": 0.0899, "step": 3449 }, { "epoch": 1.2732, "grad_norm": 3.3258235454559326, "learning_rate": 1.5816326530612247e-05, "loss": 0.3732, "step": 3450 }, { "epoch": 1.2734, "grad_norm": 4.332332134246826, "learning_rate": 1.5806122448979593e-05, "loss": 1.1368, "step": 3451 }, { "epoch": 1.2736, "grad_norm": 8.17160701751709, "learning_rate": 1.579591836734694e-05, "loss": 2.0268, "step": 3452 }, { "epoch": 1.2738, "grad_norm": 1.359741449356079, "learning_rate": 1.5785714285714288e-05, "loss": 0.0842, "step": 3453 }, { "epoch": 1.274, "grad_norm": 1.439969778060913, "learning_rate": 1.5775510204081634e-05, "loss": 0.0787, "step": 3454 }, { "epoch": 1.2742, "grad_norm": 3.2931435108184814, "learning_rate": 1.576530612244898e-05, "loss": 0.2335, "step": 3455 }, { "epoch": 1.2744, "grad_norm": 7.263006210327148, "learning_rate": 1.575510204081633e-05, "loss": 1.3831, "step": 3456 }, { "epoch": 1.2746, "grad_norm": 2.6793131828308105, "learning_rate": 1.5744897959183675e-05, "loss": 0.3533, "step": 3457 }, { "epoch": 1.2748, "grad_norm": 6.873786926269531, "learning_rate": 1.573469387755102e-05, "loss": 1.512, "step": 3458 }, { "epoch": 1.275, "grad_norm": 5.337397575378418, "learning_rate": 1.572448979591837e-05, "loss": 1.2574, "step": 3459 }, { "epoch": 1.2752, "grad_norm": 9.182343482971191, "learning_rate": 1.5714285714285715e-05, "loss": 2.6326, "step": 3460 }, { "epoch": 1.2753999999999999, "grad_norm": 6.159662246704102, "learning_rate": 1.570408163265306e-05, "loss": 2.4802, "step": 3461 }, { "epoch": 1.2756, "grad_norm": 17.988075256347656, "learning_rate": 1.5693877551020407e-05, "loss": 3.4773, "step": 3462 }, { "epoch": 1.2758, "grad_norm": 33.38669204711914, "learning_rate": 1.5683673469387756e-05, "loss": 3.9685, "step": 3463 }, { "epoch": 1.276, "grad_norm": 10.34990406036377, "learning_rate": 1.5673469387755102e-05, "loss": 1.2912, "step": 3464 }, { "epoch": 1.2762, "grad_norm": 7.714395046234131, "learning_rate": 1.5663265306122448e-05, "loss": 0.9114, "step": 3465 }, { "epoch": 1.2764, "grad_norm": 3.0588672161102295, "learning_rate": 1.5653061224489797e-05, "loss": 0.5662, "step": 3466 }, { "epoch": 1.2766, "grad_norm": 6.391806125640869, "learning_rate": 1.5642857142857143e-05, "loss": 0.9289, "step": 3467 }, { "epoch": 1.2768, "grad_norm": 16.139328002929688, "learning_rate": 1.563265306122449e-05, "loss": 3.4601, "step": 3468 }, { "epoch": 1.2770000000000001, "grad_norm": 1.3934966325759888, "learning_rate": 1.5622448979591838e-05, "loss": 0.0688, "step": 3469 }, { "epoch": 1.2772000000000001, "grad_norm": 5.1799821853637695, "learning_rate": 1.5612244897959187e-05, "loss": 0.8917, "step": 3470 }, { "epoch": 1.2774, "grad_norm": 8.424983978271484, "learning_rate": 1.5602040816326533e-05, "loss": 3.3538, "step": 3471 }, { "epoch": 1.2776, "grad_norm": 6.700727462768555, "learning_rate": 1.559183673469388e-05, "loss": 1.2781, "step": 3472 }, { "epoch": 1.2778, "grad_norm": 8.653555870056152, "learning_rate": 1.5581632653061228e-05, "loss": 2.5732, "step": 3473 }, { "epoch": 1.278, "grad_norm": 1.482365608215332, "learning_rate": 1.5571428571428573e-05, "loss": 0.0817, "step": 3474 }, { "epoch": 1.2782, "grad_norm": 1.7122341394424438, "learning_rate": 1.556122448979592e-05, "loss": 0.0784, "step": 3475 }, { "epoch": 1.2784, "grad_norm": 2.120478630065918, "learning_rate": 1.5551020408163265e-05, "loss": 0.124, "step": 3476 }, { "epoch": 1.2786, "grad_norm": 2.957033157348633, "learning_rate": 1.5540816326530614e-05, "loss": 0.3313, "step": 3477 }, { "epoch": 1.2788, "grad_norm": 3.401562452316284, "learning_rate": 1.553061224489796e-05, "loss": 0.1388, "step": 3478 }, { "epoch": 1.279, "grad_norm": 7.845393657684326, "learning_rate": 1.5520408163265306e-05, "loss": 0.7818, "step": 3479 }, { "epoch": 1.2792, "grad_norm": 1.0962510108947754, "learning_rate": 1.5510204081632655e-05, "loss": 0.0656, "step": 3480 }, { "epoch": 1.2793999999999999, "grad_norm": 2.6956839561462402, "learning_rate": 1.55e-05, "loss": 0.3444, "step": 3481 }, { "epoch": 1.2796, "grad_norm": 7.444957256317139, "learning_rate": 1.5489795918367346e-05, "loss": 1.4494, "step": 3482 }, { "epoch": 1.2798, "grad_norm": 1.4879704713821411, "learning_rate": 1.5479591836734696e-05, "loss": 0.0571, "step": 3483 }, { "epoch": 1.28, "grad_norm": 2.6962056159973145, "learning_rate": 1.546938775510204e-05, "loss": 0.172, "step": 3484 }, { "epoch": 1.2802, "grad_norm": 1.4376516342163086, "learning_rate": 1.5459183673469387e-05, "loss": 0.0517, "step": 3485 }, { "epoch": 1.2804, "grad_norm": 1.297108769416809, "learning_rate": 1.5448979591836733e-05, "loss": 0.0451, "step": 3486 }, { "epoch": 1.2806, "grad_norm": 4.2883830070495605, "learning_rate": 1.5438775510204082e-05, "loss": 1.1647, "step": 3487 }, { "epoch": 1.2808, "grad_norm": 11.137147903442383, "learning_rate": 1.5428571428571428e-05, "loss": 2.7671, "step": 3488 }, { "epoch": 1.2810000000000001, "grad_norm": 1.6357594728469849, "learning_rate": 1.5418367346938774e-05, "loss": 0.119, "step": 3489 }, { "epoch": 1.2812000000000001, "grad_norm": 2.295539379119873, "learning_rate": 1.5408163265306123e-05, "loss": 0.1267, "step": 3490 }, { "epoch": 1.2814, "grad_norm": 8.366477966308594, "learning_rate": 1.539795918367347e-05, "loss": 1.4248, "step": 3491 }, { "epoch": 1.2816, "grad_norm": 21.025161743164062, "learning_rate": 1.5387755102040818e-05, "loss": 3.7029, "step": 3492 }, { "epoch": 1.2818, "grad_norm": 7.5603532791137695, "learning_rate": 1.5377551020408164e-05, "loss": 0.7515, "step": 3493 }, { "epoch": 1.282, "grad_norm": 4.353247165679932, "learning_rate": 1.5367346938775513e-05, "loss": 0.5647, "step": 3494 }, { "epoch": 1.2822, "grad_norm": 1.7207931280136108, "learning_rate": 1.535714285714286e-05, "loss": 0.1705, "step": 3495 }, { "epoch": 1.2824, "grad_norm": 1.7064158916473389, "learning_rate": 1.5346938775510204e-05, "loss": 0.1937, "step": 3496 }, { "epoch": 1.2826, "grad_norm": 2.7570483684539795, "learning_rate": 1.5336734693877554e-05, "loss": 0.4847, "step": 3497 }, { "epoch": 1.2828, "grad_norm": 3.3982269763946533, "learning_rate": 1.53265306122449e-05, "loss": 0.2327, "step": 3498 }, { "epoch": 1.283, "grad_norm": 7.127781867980957, "learning_rate": 1.5316326530612245e-05, "loss": 1.3224, "step": 3499 }, { "epoch": 1.2832, "grad_norm": 3.0783791542053223, "learning_rate": 1.5306122448979594e-05, "loss": 0.6105, "step": 3500 }, { "epoch": 1.2833999999999999, "grad_norm": 3.3459722995758057, "learning_rate": 1.529591836734694e-05, "loss": 0.4059, "step": 3501 }, { "epoch": 1.2836, "grad_norm": 4.628581523895264, "learning_rate": 1.5285714285714286e-05, "loss": 1.4011, "step": 3502 }, { "epoch": 1.2838, "grad_norm": 1.6847960948944092, "learning_rate": 1.5275510204081632e-05, "loss": 0.1031, "step": 3503 }, { "epoch": 1.284, "grad_norm": 1.219497799873352, "learning_rate": 1.526530612244898e-05, "loss": 0.0378, "step": 3504 }, { "epoch": 1.2842, "grad_norm": 1.7342090606689453, "learning_rate": 1.5255102040816327e-05, "loss": 0.1441, "step": 3505 }, { "epoch": 1.2844, "grad_norm": 1.6367411613464355, "learning_rate": 1.5244897959183674e-05, "loss": 0.0833, "step": 3506 }, { "epoch": 1.2846, "grad_norm": 3.5634076595306396, "learning_rate": 1.523469387755102e-05, "loss": 0.4575, "step": 3507 }, { "epoch": 1.2848, "grad_norm": 5.396846294403076, "learning_rate": 1.5224489795918368e-05, "loss": 0.7511, "step": 3508 }, { "epoch": 1.285, "grad_norm": 5.117428302764893, "learning_rate": 1.5214285714285715e-05, "loss": 0.7507, "step": 3509 }, { "epoch": 1.2852000000000001, "grad_norm": 4.2475666999816895, "learning_rate": 1.520408163265306e-05, "loss": 0.4041, "step": 3510 }, { "epoch": 1.2854, "grad_norm": 10.006048202514648, "learning_rate": 1.5193877551020408e-05, "loss": 1.283, "step": 3511 }, { "epoch": 1.2856, "grad_norm": 6.676290512084961, "learning_rate": 1.5183673469387754e-05, "loss": 1.648, "step": 3512 }, { "epoch": 1.2858, "grad_norm": 10.92365550994873, "learning_rate": 1.5173469387755102e-05, "loss": 3.6243, "step": 3513 }, { "epoch": 1.286, "grad_norm": 11.564417839050293, "learning_rate": 1.5163265306122449e-05, "loss": 4.3346, "step": 3514 }, { "epoch": 1.2862, "grad_norm": 9.778779029846191, "learning_rate": 1.5153061224489798e-05, "loss": 2.6538, "step": 3515 }, { "epoch": 1.2864, "grad_norm": 8.761896133422852, "learning_rate": 1.5142857142857144e-05, "loss": 1.4395, "step": 3516 }, { "epoch": 1.2866, "grad_norm": 11.730488777160645, "learning_rate": 1.5132653061224492e-05, "loss": 3.4833, "step": 3517 }, { "epoch": 1.2868, "grad_norm": 4.998980522155762, "learning_rate": 1.5122448979591839e-05, "loss": 0.5691, "step": 3518 }, { "epoch": 1.287, "grad_norm": 1.5715025663375854, "learning_rate": 1.5112244897959185e-05, "loss": 0.077, "step": 3519 }, { "epoch": 1.2872, "grad_norm": 1.9681670665740967, "learning_rate": 1.5102040816326532e-05, "loss": 0.0896, "step": 3520 }, { "epoch": 1.2873999999999999, "grad_norm": 1.2198389768600464, "learning_rate": 1.5091836734693878e-05, "loss": 0.0338, "step": 3521 }, { "epoch": 1.2876, "grad_norm": 1.6890063285827637, "learning_rate": 1.5081632653061226e-05, "loss": 0.1405, "step": 3522 }, { "epoch": 1.2878, "grad_norm": 1.5838737487792969, "learning_rate": 1.5071428571428573e-05, "loss": 0.1021, "step": 3523 }, { "epoch": 1.288, "grad_norm": 1.7186545133590698, "learning_rate": 1.5061224489795919e-05, "loss": 0.1365, "step": 3524 }, { "epoch": 1.2882, "grad_norm": 1.1837531328201294, "learning_rate": 1.5051020408163266e-05, "loss": 0.0649, "step": 3525 }, { "epoch": 1.2884, "grad_norm": 1.6602630615234375, "learning_rate": 1.5040816326530612e-05, "loss": 0.1081, "step": 3526 }, { "epoch": 1.2886, "grad_norm": 1.467596173286438, "learning_rate": 1.503061224489796e-05, "loss": 0.0973, "step": 3527 }, { "epoch": 1.2888, "grad_norm": 1.367651104927063, "learning_rate": 1.5020408163265307e-05, "loss": 0.0538, "step": 3528 }, { "epoch": 1.289, "grad_norm": 1.1187045574188232, "learning_rate": 1.5010204081632653e-05, "loss": 0.0379, "step": 3529 }, { "epoch": 1.2892000000000001, "grad_norm": 7.7804718017578125, "learning_rate": 1.5e-05, "loss": 0.9649, "step": 3530 }, { "epoch": 1.2894, "grad_norm": 17.334747314453125, "learning_rate": 1.4989795918367346e-05, "loss": 3.3545, "step": 3531 }, { "epoch": 1.2896, "grad_norm": 1.814028263092041, "learning_rate": 1.4979591836734694e-05, "loss": 0.1417, "step": 3532 }, { "epoch": 1.2898, "grad_norm": 1.3732644319534302, "learning_rate": 1.4969387755102041e-05, "loss": 0.0833, "step": 3533 }, { "epoch": 1.29, "grad_norm": 7.133530616760254, "learning_rate": 1.4959183673469387e-05, "loss": 1.2164, "step": 3534 }, { "epoch": 1.2902, "grad_norm": 10.225381851196289, "learning_rate": 1.4948979591836734e-05, "loss": 2.5016, "step": 3535 }, { "epoch": 1.2904, "grad_norm": 4.777650833129883, "learning_rate": 1.493877551020408e-05, "loss": 0.8102, "step": 3536 }, { "epoch": 1.2906, "grad_norm": 11.105450630187988, "learning_rate": 1.4928571428571431e-05, "loss": 2.552, "step": 3537 }, { "epoch": 1.2908, "grad_norm": 1.9035511016845703, "learning_rate": 1.4918367346938777e-05, "loss": 0.132, "step": 3538 }, { "epoch": 1.291, "grad_norm": 3.101081371307373, "learning_rate": 1.4908163265306124e-05, "loss": 0.325, "step": 3539 }, { "epoch": 1.2912, "grad_norm": 1.6706990003585815, "learning_rate": 1.4897959183673472e-05, "loss": 0.1237, "step": 3540 }, { "epoch": 1.2913999999999999, "grad_norm": 0.9322052001953125, "learning_rate": 1.4887755102040818e-05, "loss": 0.0218, "step": 3541 }, { "epoch": 1.2916, "grad_norm": 1.633112907409668, "learning_rate": 1.4877551020408165e-05, "loss": 0.1139, "step": 3542 }, { "epoch": 1.2918, "grad_norm": 1.4652941226959229, "learning_rate": 1.4867346938775511e-05, "loss": 0.0537, "step": 3543 }, { "epoch": 1.292, "grad_norm": 2.806101083755493, "learning_rate": 1.4857142857142858e-05, "loss": 0.375, "step": 3544 }, { "epoch": 1.2922, "grad_norm": 5.178884983062744, "learning_rate": 1.4846938775510206e-05, "loss": 1.5283, "step": 3545 }, { "epoch": 1.2924, "grad_norm": 3.7781739234924316, "learning_rate": 1.4836734693877552e-05, "loss": 0.6923, "step": 3546 }, { "epoch": 1.2926, "grad_norm": 1.3200960159301758, "learning_rate": 1.48265306122449e-05, "loss": 0.1126, "step": 3547 }, { "epoch": 1.2928, "grad_norm": 4.988454341888428, "learning_rate": 1.4816326530612245e-05, "loss": 0.3449, "step": 3548 }, { "epoch": 1.293, "grad_norm": 10.687281608581543, "learning_rate": 1.4806122448979592e-05, "loss": 1.2852, "step": 3549 }, { "epoch": 1.2932000000000001, "grad_norm": 6.6570940017700195, "learning_rate": 1.479591836734694e-05, "loss": 1.1017, "step": 3550 }, { "epoch": 1.2934, "grad_norm": 19.316333770751953, "learning_rate": 1.4785714285714286e-05, "loss": 3.1181, "step": 3551 }, { "epoch": 1.2936, "grad_norm": 3.2980775833129883, "learning_rate": 1.4775510204081633e-05, "loss": 0.2851, "step": 3552 }, { "epoch": 1.2938, "grad_norm": 7.51362943649292, "learning_rate": 1.4765306122448979e-05, "loss": 1.3359, "step": 3553 }, { "epoch": 1.294, "grad_norm": 1.1324206590652466, "learning_rate": 1.4755102040816326e-05, "loss": 0.0294, "step": 3554 }, { "epoch": 1.2942, "grad_norm": 2.5744659900665283, "learning_rate": 1.4744897959183674e-05, "loss": 0.2059, "step": 3555 }, { "epoch": 1.2944, "grad_norm": 4.4462504386901855, "learning_rate": 1.473469387755102e-05, "loss": 0.593, "step": 3556 }, { "epoch": 1.2946, "grad_norm": 2.342153549194336, "learning_rate": 1.4724489795918367e-05, "loss": 0.2257, "step": 3557 }, { "epoch": 1.2948, "grad_norm": 3.1876916885375977, "learning_rate": 1.4714285714285713e-05, "loss": 0.3238, "step": 3558 }, { "epoch": 1.295, "grad_norm": 1.0605589151382446, "learning_rate": 1.470408163265306e-05, "loss": 0.0264, "step": 3559 }, { "epoch": 1.2952, "grad_norm": 1.5809507369995117, "learning_rate": 1.469387755102041e-05, "loss": 0.0874, "step": 3560 }, { "epoch": 1.2953999999999999, "grad_norm": 5.060691833496094, "learning_rate": 1.4683673469387757e-05, "loss": 0.3005, "step": 3561 }, { "epoch": 1.2955999999999999, "grad_norm": 10.105025291442871, "learning_rate": 1.4673469387755103e-05, "loss": 1.2824, "step": 3562 }, { "epoch": 1.2958, "grad_norm": 1.5753602981567383, "learning_rate": 1.466326530612245e-05, "loss": 0.0595, "step": 3563 }, { "epoch": 1.296, "grad_norm": 2.5145034790039062, "learning_rate": 1.4653061224489798e-05, "loss": 0.1782, "step": 3564 }, { "epoch": 1.2962, "grad_norm": 4.732728481292725, "learning_rate": 1.4642857142857144e-05, "loss": 0.6166, "step": 3565 }, { "epoch": 1.2964, "grad_norm": 3.8714406490325928, "learning_rate": 1.4632653061224491e-05, "loss": 1.0394, "step": 3566 }, { "epoch": 1.2966, "grad_norm": 12.83499813079834, "learning_rate": 1.4622448979591839e-05, "loss": 3.0973, "step": 3567 }, { "epoch": 1.2968, "grad_norm": 1.1112841367721558, "learning_rate": 1.4612244897959185e-05, "loss": 0.0258, "step": 3568 }, { "epoch": 1.297, "grad_norm": 1.4227652549743652, "learning_rate": 1.4602040816326532e-05, "loss": 0.1001, "step": 3569 }, { "epoch": 1.2972000000000001, "grad_norm": 2.9268789291381836, "learning_rate": 1.4591836734693878e-05, "loss": 0.4727, "step": 3570 }, { "epoch": 1.2974, "grad_norm": 5.912209987640381, "learning_rate": 1.4581632653061225e-05, "loss": 1.6332, "step": 3571 }, { "epoch": 1.2976, "grad_norm": 5.406517028808594, "learning_rate": 1.4571428571428573e-05, "loss": 0.658, "step": 3572 }, { "epoch": 1.2978, "grad_norm": 1.526465892791748, "learning_rate": 1.4561224489795919e-05, "loss": 0.1091, "step": 3573 }, { "epoch": 1.298, "grad_norm": 1.7140930891036987, "learning_rate": 1.4551020408163266e-05, "loss": 0.0577, "step": 3574 }, { "epoch": 1.2982, "grad_norm": 1.4619375467300415, "learning_rate": 1.4540816326530612e-05, "loss": 0.0507, "step": 3575 }, { "epoch": 1.2984, "grad_norm": 1.4371378421783447, "learning_rate": 1.453061224489796e-05, "loss": 0.0845, "step": 3576 }, { "epoch": 1.2986, "grad_norm": 2.455461025238037, "learning_rate": 1.4520408163265307e-05, "loss": 0.2963, "step": 3577 }, { "epoch": 1.2988, "grad_norm": 5.196033477783203, "learning_rate": 1.4510204081632653e-05, "loss": 1.4591, "step": 3578 }, { "epoch": 1.299, "grad_norm": 1.3846713304519653, "learning_rate": 1.45e-05, "loss": 0.0728, "step": 3579 }, { "epoch": 1.2992, "grad_norm": 1.5920789241790771, "learning_rate": 1.4489795918367346e-05, "loss": 0.1345, "step": 3580 }, { "epoch": 1.2993999999999999, "grad_norm": 1.436944842338562, "learning_rate": 1.4479591836734693e-05, "loss": 0.083, "step": 3581 }, { "epoch": 1.2995999999999999, "grad_norm": 1.6569437980651855, "learning_rate": 1.4469387755102041e-05, "loss": 0.1281, "step": 3582 }, { "epoch": 1.2998, "grad_norm": 1.71175217628479, "learning_rate": 1.445918367346939e-05, "loss": 0.1145, "step": 3583 }, { "epoch": 1.3, "grad_norm": 5.769881248474121, "learning_rate": 1.4448979591836736e-05, "loss": 0.9269, "step": 3584 }, { "epoch": 1.3002, "grad_norm": 11.544594764709473, "learning_rate": 1.4438775510204083e-05, "loss": 3.452, "step": 3585 }, { "epoch": 1.3004, "grad_norm": 3.0036733150482178, "learning_rate": 1.442857142857143e-05, "loss": 0.5211, "step": 3586 }, { "epoch": 1.3006, "grad_norm": 2.4071826934814453, "learning_rate": 1.4418367346938777e-05, "loss": 0.2461, "step": 3587 }, { "epoch": 1.3008, "grad_norm": 6.731344223022461, "learning_rate": 1.4408163265306124e-05, "loss": 0.8418, "step": 3588 }, { "epoch": 1.301, "grad_norm": 4.861983299255371, "learning_rate": 1.439795918367347e-05, "loss": 0.5717, "step": 3589 }, { "epoch": 1.3012000000000001, "grad_norm": 1.4551787376403809, "learning_rate": 1.4387755102040817e-05, "loss": 0.0891, "step": 3590 }, { "epoch": 1.3014000000000001, "grad_norm": 2.8933231830596924, "learning_rate": 1.4377551020408165e-05, "loss": 0.2431, "step": 3591 }, { "epoch": 1.3016, "grad_norm": 4.9407453536987305, "learning_rate": 1.436734693877551e-05, "loss": 0.65, "step": 3592 }, { "epoch": 1.3018, "grad_norm": 1.4356982707977295, "learning_rate": 1.4357142857142858e-05, "loss": 0.0407, "step": 3593 }, { "epoch": 1.302, "grad_norm": 3.079118490219116, "learning_rate": 1.4346938775510204e-05, "loss": 0.2953, "step": 3594 }, { "epoch": 1.3022, "grad_norm": 5.729078769683838, "learning_rate": 1.4336734693877551e-05, "loss": 0.7755, "step": 3595 }, { "epoch": 1.3024, "grad_norm": 1.4708061218261719, "learning_rate": 1.4326530612244899e-05, "loss": 0.0989, "step": 3596 }, { "epoch": 1.3026, "grad_norm": 1.650211215019226, "learning_rate": 1.4316326530612245e-05, "loss": 0.0479, "step": 3597 }, { "epoch": 1.3028, "grad_norm": 1.3778326511383057, "learning_rate": 1.4306122448979592e-05, "loss": 0.0676, "step": 3598 }, { "epoch": 1.303, "grad_norm": 1.4415026903152466, "learning_rate": 1.4295918367346938e-05, "loss": 0.0994, "step": 3599 }, { "epoch": 1.3032, "grad_norm": 9.081709861755371, "learning_rate": 1.4285714285714285e-05, "loss": 0.5877, "step": 3600 }, { "epoch": 1.3034, "grad_norm": 15.51332950592041, "learning_rate": 1.4275510204081633e-05, "loss": 2.0339, "step": 3601 }, { "epoch": 1.3035999999999999, "grad_norm": 9.164117813110352, "learning_rate": 1.4265306122448979e-05, "loss": 1.4634, "step": 3602 }, { "epoch": 1.3038, "grad_norm": 20.999588012695312, "learning_rate": 1.4255102040816326e-05, "loss": 2.6444, "step": 3603 }, { "epoch": 1.304, "grad_norm": 4.09861421585083, "learning_rate": 1.4244897959183674e-05, "loss": 0.599, "step": 3604 }, { "epoch": 1.3042, "grad_norm": 5.565969467163086, "learning_rate": 1.4234693877551023e-05, "loss": 0.9323, "step": 3605 }, { "epoch": 1.3044, "grad_norm": 15.917082786560059, "learning_rate": 1.4224489795918369e-05, "loss": 3.9851, "step": 3606 }, { "epoch": 1.3046, "grad_norm": 16.127853393554688, "learning_rate": 1.4214285714285716e-05, "loss": 2.1158, "step": 3607 }, { "epoch": 1.3048, "grad_norm": 6.162598133087158, "learning_rate": 1.4204081632653064e-05, "loss": 1.0179, "step": 3608 }, { "epoch": 1.305, "grad_norm": 5.599311828613281, "learning_rate": 1.419387755102041e-05, "loss": 0.6646, "step": 3609 }, { "epoch": 1.3052000000000001, "grad_norm": 2.427150011062622, "learning_rate": 1.4183673469387757e-05, "loss": 0.3712, "step": 3610 }, { "epoch": 1.3054000000000001, "grad_norm": 4.940189838409424, "learning_rate": 1.4173469387755103e-05, "loss": 1.4755, "step": 3611 }, { "epoch": 1.3056, "grad_norm": 2.0217015743255615, "learning_rate": 1.416326530612245e-05, "loss": 0.1821, "step": 3612 }, { "epoch": 1.3058, "grad_norm": 3.79567813873291, "learning_rate": 1.4153061224489798e-05, "loss": 0.665, "step": 3613 }, { "epoch": 1.306, "grad_norm": 5.068130970001221, "learning_rate": 1.4142857142857143e-05, "loss": 0.7768, "step": 3614 }, { "epoch": 1.3062, "grad_norm": 7.242030620574951, "learning_rate": 1.4132653061224491e-05, "loss": 2.3842, "step": 3615 }, { "epoch": 1.3064, "grad_norm": 1.7902650833129883, "learning_rate": 1.4122448979591837e-05, "loss": 0.1467, "step": 3616 }, { "epoch": 1.3066, "grad_norm": 1.5323320627212524, "learning_rate": 1.4112244897959184e-05, "loss": 0.1101, "step": 3617 }, { "epoch": 1.3068, "grad_norm": 6.208742141723633, "learning_rate": 1.4102040816326532e-05, "loss": 1.0017, "step": 3618 }, { "epoch": 1.307, "grad_norm": 10.6547212600708, "learning_rate": 1.4091836734693877e-05, "loss": 3.3961, "step": 3619 }, { "epoch": 1.3072, "grad_norm": 1.841823697090149, "learning_rate": 1.4081632653061225e-05, "loss": 0.187, "step": 3620 }, { "epoch": 1.3074, "grad_norm": 3.2964346408843994, "learning_rate": 1.407142857142857e-05, "loss": 0.6089, "step": 3621 }, { "epoch": 1.3075999999999999, "grad_norm": 2.113030433654785, "learning_rate": 1.4061224489795918e-05, "loss": 0.139, "step": 3622 }, { "epoch": 1.3078, "grad_norm": 3.6273980140686035, "learning_rate": 1.4051020408163266e-05, "loss": 0.4046, "step": 3623 }, { "epoch": 1.308, "grad_norm": 1.473108172416687, "learning_rate": 1.4040816326530612e-05, "loss": 0.1369, "step": 3624 }, { "epoch": 1.3082, "grad_norm": 2.0529394149780273, "learning_rate": 1.4030612244897959e-05, "loss": 0.1391, "step": 3625 }, { "epoch": 1.3084, "grad_norm": 3.2491400241851807, "learning_rate": 1.4020408163265305e-05, "loss": 0.3464, "step": 3626 }, { "epoch": 1.3086, "grad_norm": 2.3641955852508545, "learning_rate": 1.4010204081632652e-05, "loss": 0.3468, "step": 3627 }, { "epoch": 1.3088, "grad_norm": 5.182273864746094, "learning_rate": 1.4000000000000001e-05, "loss": 1.41, "step": 3628 }, { "epoch": 1.309, "grad_norm": 1.8585495948791504, "learning_rate": 1.3989795918367349e-05, "loss": 0.1579, "step": 3629 }, { "epoch": 1.3092, "grad_norm": 4.79997444152832, "learning_rate": 1.3979591836734696e-05, "loss": 0.78, "step": 3630 }, { "epoch": 1.3094000000000001, "grad_norm": 7.107030868530273, "learning_rate": 1.3969387755102042e-05, "loss": 0.4945, "step": 3631 }, { "epoch": 1.3096, "grad_norm": 6.470427989959717, "learning_rate": 1.395918367346939e-05, "loss": 0.7194, "step": 3632 }, { "epoch": 1.3098, "grad_norm": 1.762215495109558, "learning_rate": 1.3948979591836736e-05, "loss": 0.1192, "step": 3633 }, { "epoch": 1.31, "grad_norm": 1.6998673677444458, "learning_rate": 1.3938775510204083e-05, "loss": 0.107, "step": 3634 }, { "epoch": 1.3102, "grad_norm": 3.6796700954437256, "learning_rate": 1.392857142857143e-05, "loss": 0.2478, "step": 3635 }, { "epoch": 1.3104, "grad_norm": 5.984239101409912, "learning_rate": 1.3918367346938776e-05, "loss": 0.8121, "step": 3636 }, { "epoch": 1.3106, "grad_norm": 5.5593695640563965, "learning_rate": 1.3908163265306124e-05, "loss": 0.8158, "step": 3637 }, { "epoch": 1.3108, "grad_norm": 8.170197486877441, "learning_rate": 1.389795918367347e-05, "loss": 1.161, "step": 3638 }, { "epoch": 1.311, "grad_norm": 1.3757100105285645, "learning_rate": 1.3887755102040817e-05, "loss": 0.0896, "step": 3639 }, { "epoch": 1.3112, "grad_norm": 1.858641266822815, "learning_rate": 1.3877551020408165e-05, "loss": 0.1156, "step": 3640 }, { "epoch": 1.3114, "grad_norm": 1.5672694444656372, "learning_rate": 1.386734693877551e-05, "loss": 0.0827, "step": 3641 }, { "epoch": 1.3115999999999999, "grad_norm": 1.9095299243927002, "learning_rate": 1.3857142857142858e-05, "loss": 0.1906, "step": 3642 }, { "epoch": 1.3118, "grad_norm": 3.8802802562713623, "learning_rate": 1.3846938775510204e-05, "loss": 0.9082, "step": 3643 }, { "epoch": 1.312, "grad_norm": 3.765718936920166, "learning_rate": 1.3836734693877551e-05, "loss": 0.6387, "step": 3644 }, { "epoch": 1.3122, "grad_norm": 4.606767177581787, "learning_rate": 1.3826530612244899e-05, "loss": 0.5988, "step": 3645 }, { "epoch": 1.3124, "grad_norm": 1.8502494096755981, "learning_rate": 1.3816326530612244e-05, "loss": 0.1423, "step": 3646 }, { "epoch": 1.3126, "grad_norm": 1.8141132593154907, "learning_rate": 1.3806122448979592e-05, "loss": 0.1143, "step": 3647 }, { "epoch": 1.3128, "grad_norm": 1.5949076414108276, "learning_rate": 1.3795918367346938e-05, "loss": 0.111, "step": 3648 }, { "epoch": 1.313, "grad_norm": 1.3847777843475342, "learning_rate": 1.3785714285714285e-05, "loss": 0.0915, "step": 3649 }, { "epoch": 1.3132, "grad_norm": 1.3259258270263672, "learning_rate": 1.3775510204081633e-05, "loss": 0.0673, "step": 3650 }, { "epoch": 1.3134000000000001, "grad_norm": 1.9255751371383667, "learning_rate": 1.3765306122448982e-05, "loss": 0.1472, "step": 3651 }, { "epoch": 1.3136, "grad_norm": 1.9907561540603638, "learning_rate": 1.3755102040816328e-05, "loss": 0.1404, "step": 3652 }, { "epoch": 1.3138, "grad_norm": 3.772474765777588, "learning_rate": 1.3744897959183675e-05, "loss": 0.4566, "step": 3653 }, { "epoch": 1.314, "grad_norm": 4.542640686035156, "learning_rate": 1.3734693877551023e-05, "loss": 0.5977, "step": 3654 }, { "epoch": 1.3142, "grad_norm": 1.2687718868255615, "learning_rate": 1.3724489795918368e-05, "loss": 0.0769, "step": 3655 }, { "epoch": 1.3144, "grad_norm": 7.83428430557251, "learning_rate": 1.3714285714285716e-05, "loss": 1.029, "step": 3656 }, { "epoch": 1.3146, "grad_norm": 10.699220657348633, "learning_rate": 1.3704081632653062e-05, "loss": 2.4053, "step": 3657 }, { "epoch": 1.3148, "grad_norm": 1.6182563304901123, "learning_rate": 1.3693877551020409e-05, "loss": 0.1276, "step": 3658 }, { "epoch": 1.315, "grad_norm": 2.7827768325805664, "learning_rate": 1.3683673469387757e-05, "loss": 0.4677, "step": 3659 }, { "epoch": 1.3152, "grad_norm": 1.7074637413024902, "learning_rate": 1.3673469387755102e-05, "loss": 0.1219, "step": 3660 }, { "epoch": 1.3154, "grad_norm": 1.3121364116668701, "learning_rate": 1.366326530612245e-05, "loss": 0.0634, "step": 3661 }, { "epoch": 1.3155999999999999, "grad_norm": 1.4462305307388306, "learning_rate": 1.3653061224489796e-05, "loss": 0.1197, "step": 3662 }, { "epoch": 1.3158, "grad_norm": 2.5625054836273193, "learning_rate": 1.3642857142857143e-05, "loss": 0.3954, "step": 3663 }, { "epoch": 1.316, "grad_norm": 6.2406816482543945, "learning_rate": 1.363265306122449e-05, "loss": 1.5542, "step": 3664 }, { "epoch": 1.3162, "grad_norm": 5.714224338531494, "learning_rate": 1.3622448979591836e-05, "loss": 0.7596, "step": 3665 }, { "epoch": 1.3164, "grad_norm": 3.18538498878479, "learning_rate": 1.3612244897959184e-05, "loss": 0.6324, "step": 3666 }, { "epoch": 1.3166, "grad_norm": 1.0018253326416016, "learning_rate": 1.3602040816326531e-05, "loss": 0.0642, "step": 3667 }, { "epoch": 1.3168, "grad_norm": 1.6076992750167847, "learning_rate": 1.3591836734693877e-05, "loss": 0.0985, "step": 3668 }, { "epoch": 1.317, "grad_norm": 1.3837776184082031, "learning_rate": 1.3581632653061225e-05, "loss": 0.0813, "step": 3669 }, { "epoch": 1.3172, "grad_norm": 1.8163518905639648, "learning_rate": 1.357142857142857e-05, "loss": 0.147, "step": 3670 }, { "epoch": 1.3174000000000001, "grad_norm": 2.740192413330078, "learning_rate": 1.3561224489795918e-05, "loss": 0.5002, "step": 3671 }, { "epoch": 1.3176, "grad_norm": 7.440036773681641, "learning_rate": 1.3551020408163265e-05, "loss": 0.9479, "step": 3672 }, { "epoch": 1.3178, "grad_norm": 11.16032600402832, "learning_rate": 1.3540816326530615e-05, "loss": 3.4297, "step": 3673 }, { "epoch": 1.318, "grad_norm": 4.578827381134033, "learning_rate": 1.353061224489796e-05, "loss": 0.8416, "step": 3674 }, { "epoch": 1.3182, "grad_norm": 7.189239501953125, "learning_rate": 1.3520408163265308e-05, "loss": 2.1488, "step": 3675 }, { "epoch": 1.3184, "grad_norm": 3.0757930278778076, "learning_rate": 1.3510204081632655e-05, "loss": 0.5967, "step": 3676 }, { "epoch": 1.3186, "grad_norm": 2.4431116580963135, "learning_rate": 1.3500000000000001e-05, "loss": 0.1462, "step": 3677 }, { "epoch": 1.3188, "grad_norm": 5.975914001464844, "learning_rate": 1.3489795918367349e-05, "loss": 0.4564, "step": 3678 }, { "epoch": 1.319, "grad_norm": 3.4433000087738037, "learning_rate": 1.3479591836734694e-05, "loss": 0.8065, "step": 3679 }, { "epoch": 1.3192, "grad_norm": 4.407238960266113, "learning_rate": 1.3469387755102042e-05, "loss": 0.6133, "step": 3680 }, { "epoch": 1.3194, "grad_norm": 4.648280143737793, "learning_rate": 1.345918367346939e-05, "loss": 0.5786, "step": 3681 }, { "epoch": 1.3195999999999999, "grad_norm": 4.969205856323242, "learning_rate": 1.3448979591836735e-05, "loss": 0.3892, "step": 3682 }, { "epoch": 1.3197999999999999, "grad_norm": 1.3952200412750244, "learning_rate": 1.3438775510204083e-05, "loss": 0.0894, "step": 3683 }, { "epoch": 1.32, "grad_norm": 3.4232287406921387, "learning_rate": 1.3428571428571429e-05, "loss": 0.2368, "step": 3684 }, { "epoch": 1.3202, "grad_norm": 5.8791728019714355, "learning_rate": 1.3418367346938776e-05, "loss": 0.851, "step": 3685 }, { "epoch": 1.3204, "grad_norm": 2.5943305492401123, "learning_rate": 1.3408163265306123e-05, "loss": 0.2203, "step": 3686 }, { "epoch": 1.3206, "grad_norm": 5.222429275512695, "learning_rate": 1.339795918367347e-05, "loss": 0.8633, "step": 3687 }, { "epoch": 1.3208, "grad_norm": 3.6457314491271973, "learning_rate": 1.3387755102040817e-05, "loss": 0.7084, "step": 3688 }, { "epoch": 1.321, "grad_norm": 6.079892635345459, "learning_rate": 1.3377551020408163e-05, "loss": 1.2813, "step": 3689 }, { "epoch": 1.3212, "grad_norm": 9.036714553833008, "learning_rate": 1.336734693877551e-05, "loss": 3.1631, "step": 3690 }, { "epoch": 1.3214000000000001, "grad_norm": 19.296865463256836, "learning_rate": 1.3357142857142858e-05, "loss": 4.8567, "step": 3691 }, { "epoch": 1.3216, "grad_norm": 12.835603713989258, "learning_rate": 1.3346938775510203e-05, "loss": 3.3974, "step": 3692 }, { "epoch": 1.3218, "grad_norm": 3.113570213317871, "learning_rate": 1.333673469387755e-05, "loss": 0.1844, "step": 3693 }, { "epoch": 1.322, "grad_norm": 7.056849956512451, "learning_rate": 1.3326530612244897e-05, "loss": 0.7761, "step": 3694 }, { "epoch": 1.3222, "grad_norm": 1.145198106765747, "learning_rate": 1.3316326530612244e-05, "loss": 0.0626, "step": 3695 }, { "epoch": 1.3224, "grad_norm": 1.8467090129852295, "learning_rate": 1.3306122448979593e-05, "loss": 0.0861, "step": 3696 }, { "epoch": 1.3226, "grad_norm": 5.691300392150879, "learning_rate": 1.329591836734694e-05, "loss": 0.8919, "step": 3697 }, { "epoch": 1.3228, "grad_norm": 10.928914070129395, "learning_rate": 1.3285714285714288e-05, "loss": 3.3144, "step": 3698 }, { "epoch": 1.323, "grad_norm": 2.7220211029052734, "learning_rate": 1.3275510204081634e-05, "loss": 0.2012, "step": 3699 }, { "epoch": 1.3232, "grad_norm": 6.318994045257568, "learning_rate": 1.3265306122448982e-05, "loss": 0.797, "step": 3700 }, { "epoch": 1.3234, "grad_norm": 7.228105068206787, "learning_rate": 1.3255102040816327e-05, "loss": 0.7282, "step": 3701 }, { "epoch": 1.3235999999999999, "grad_norm": 1.5779112577438354, "learning_rate": 1.3244897959183675e-05, "loss": 0.0974, "step": 3702 }, { "epoch": 1.3237999999999999, "grad_norm": 1.5987306833267212, "learning_rate": 1.3234693877551022e-05, "loss": 0.0883, "step": 3703 }, { "epoch": 1.324, "grad_norm": 2.9678375720977783, "learning_rate": 1.3224489795918368e-05, "loss": 0.4179, "step": 3704 }, { "epoch": 1.3242, "grad_norm": 5.287540912628174, "learning_rate": 1.3214285714285716e-05, "loss": 1.488, "step": 3705 }, { "epoch": 1.3244, "grad_norm": 2.976473093032837, "learning_rate": 1.3204081632653061e-05, "loss": 0.599, "step": 3706 }, { "epoch": 1.3246, "grad_norm": 1.6741358041763306, "learning_rate": 1.3193877551020409e-05, "loss": 0.1117, "step": 3707 }, { "epoch": 1.3248, "grad_norm": 1.5690118074417114, "learning_rate": 1.3183673469387756e-05, "loss": 0.0883, "step": 3708 }, { "epoch": 1.325, "grad_norm": 4.477802276611328, "learning_rate": 1.3173469387755102e-05, "loss": 0.6296, "step": 3709 }, { "epoch": 1.3252, "grad_norm": 6.388333797454834, "learning_rate": 1.316326530612245e-05, "loss": 2.0726, "step": 3710 }, { "epoch": 1.3254000000000001, "grad_norm": 5.454859733581543, "learning_rate": 1.3153061224489795e-05, "loss": 0.7201, "step": 3711 }, { "epoch": 1.3256000000000001, "grad_norm": 1.328463077545166, "learning_rate": 1.3142857142857143e-05, "loss": 0.0711, "step": 3712 }, { "epoch": 1.3258, "grad_norm": 1.4585798978805542, "learning_rate": 1.313265306122449e-05, "loss": 0.0998, "step": 3713 }, { "epoch": 1.326, "grad_norm": 1.403083086013794, "learning_rate": 1.3122448979591836e-05, "loss": 0.0689, "step": 3714 }, { "epoch": 1.3262, "grad_norm": 1.627443552017212, "learning_rate": 1.3112244897959184e-05, "loss": 0.1369, "step": 3715 }, { "epoch": 1.3264, "grad_norm": 5.76826810836792, "learning_rate": 1.310204081632653e-05, "loss": 0.5985, "step": 3716 }, { "epoch": 1.3266, "grad_norm": 3.087222099304199, "learning_rate": 1.3091836734693877e-05, "loss": 0.5963, "step": 3717 }, { "epoch": 1.3268, "grad_norm": 2.640472888946533, "learning_rate": 1.3081632653061224e-05, "loss": 0.1815, "step": 3718 }, { "epoch": 1.327, "grad_norm": 4.838239669799805, "learning_rate": 1.3071428571428574e-05, "loss": 0.6387, "step": 3719 }, { "epoch": 1.3272, "grad_norm": 2.5957911014556885, "learning_rate": 1.306122448979592e-05, "loss": 0.2261, "step": 3720 }, { "epoch": 1.3274, "grad_norm": 4.602501392364502, "learning_rate": 1.3051020408163267e-05, "loss": 0.6065, "step": 3721 }, { "epoch": 1.3276, "grad_norm": 1.1260366439819336, "learning_rate": 1.3040816326530614e-05, "loss": 0.0524, "step": 3722 }, { "epoch": 1.3277999999999999, "grad_norm": 1.6132937669754028, "learning_rate": 1.303061224489796e-05, "loss": 0.096, "step": 3723 }, { "epoch": 1.328, "grad_norm": 1.4178515672683716, "learning_rate": 1.3020408163265308e-05, "loss": 0.0875, "step": 3724 }, { "epoch": 1.3282, "grad_norm": 0.9025347828865051, "learning_rate": 1.3010204081632653e-05, "loss": 0.0325, "step": 3725 }, { "epoch": 1.3284, "grad_norm": 1.5743216276168823, "learning_rate": 1.3000000000000001e-05, "loss": 0.0891, "step": 3726 }, { "epoch": 1.3286, "grad_norm": 1.3139675855636597, "learning_rate": 1.2989795918367348e-05, "loss": 0.1146, "step": 3727 }, { "epoch": 1.3288, "grad_norm": 1.460642695426941, "learning_rate": 1.2979591836734694e-05, "loss": 0.0889, "step": 3728 }, { "epoch": 1.329, "grad_norm": 1.1795800924301147, "learning_rate": 1.2969387755102042e-05, "loss": 0.0618, "step": 3729 }, { "epoch": 1.3292, "grad_norm": 1.6362366676330566, "learning_rate": 1.2959183673469389e-05, "loss": 0.1454, "step": 3730 }, { "epoch": 1.3294000000000001, "grad_norm": 2.8314414024353027, "learning_rate": 1.2948979591836735e-05, "loss": 0.4987, "step": 3731 }, { "epoch": 1.3296000000000001, "grad_norm": 1.8919548988342285, "learning_rate": 1.2938775510204082e-05, "loss": 0.1928, "step": 3732 }, { "epoch": 1.3298, "grad_norm": 2.7952616214752197, "learning_rate": 1.2928571428571428e-05, "loss": 0.5315, "step": 3733 }, { "epoch": 1.33, "grad_norm": 1.3814488649368286, "learning_rate": 1.2918367346938776e-05, "loss": 0.0935, "step": 3734 }, { "epoch": 1.3302, "grad_norm": 1.2890468835830688, "learning_rate": 1.2908163265306123e-05, "loss": 0.0548, "step": 3735 }, { "epoch": 1.3304, "grad_norm": 1.5378800630569458, "learning_rate": 1.2897959183673469e-05, "loss": 0.0899, "step": 3736 }, { "epoch": 1.3306, "grad_norm": 1.6947510242462158, "learning_rate": 1.2887755102040816e-05, "loss": 0.1284, "step": 3737 }, { "epoch": 1.3308, "grad_norm": 1.4605859518051147, "learning_rate": 1.2877551020408162e-05, "loss": 0.0965, "step": 3738 }, { "epoch": 1.331, "grad_norm": 1.8479279279708862, "learning_rate": 1.286734693877551e-05, "loss": 0.1376, "step": 3739 }, { "epoch": 1.3312, "grad_norm": 6.6838459968566895, "learning_rate": 1.2857142857142857e-05, "loss": 1.0591, "step": 3740 }, { "epoch": 1.3314, "grad_norm": 7.753203868865967, "learning_rate": 1.2846938775510206e-05, "loss": 1.8856, "step": 3741 }, { "epoch": 1.3316, "grad_norm": 1.8943339586257935, "learning_rate": 1.2836734693877552e-05, "loss": 0.1905, "step": 3742 }, { "epoch": 1.3317999999999999, "grad_norm": 3.1973650455474854, "learning_rate": 1.28265306122449e-05, "loss": 0.6096, "step": 3743 }, { "epoch": 1.332, "grad_norm": 1.7720997333526611, "learning_rate": 1.2816326530612247e-05, "loss": 0.106, "step": 3744 }, { "epoch": 1.3322, "grad_norm": 1.6186349391937256, "learning_rate": 1.2806122448979593e-05, "loss": 0.1065, "step": 3745 }, { "epoch": 1.3324, "grad_norm": 1.695291519165039, "learning_rate": 1.279591836734694e-05, "loss": 0.112, "step": 3746 }, { "epoch": 1.3326, "grad_norm": 1.3820916414260864, "learning_rate": 1.2785714285714286e-05, "loss": 0.0755, "step": 3747 }, { "epoch": 1.3328, "grad_norm": 1.4397777318954468, "learning_rate": 1.2775510204081634e-05, "loss": 0.0657, "step": 3748 }, { "epoch": 1.333, "grad_norm": 1.9753934144973755, "learning_rate": 1.2765306122448981e-05, "loss": 0.1784, "step": 3749 }, { "epoch": 1.3332, "grad_norm": 3.92526912689209, "learning_rate": 1.2755102040816327e-05, "loss": 0.6451, "step": 3750 }, { "epoch": 1.3334, "grad_norm": 3.340161085128784, "learning_rate": 1.2744897959183674e-05, "loss": 0.8984, "step": 3751 }, { "epoch": 1.3336000000000001, "grad_norm": 4.063291072845459, "learning_rate": 1.273469387755102e-05, "loss": 0.9231, "step": 3752 }, { "epoch": 1.3338, "grad_norm": 4.840859889984131, "learning_rate": 1.2724489795918368e-05, "loss": 1.4109, "step": 3753 }, { "epoch": 1.334, "grad_norm": 1.761492133140564, "learning_rate": 1.2714285714285715e-05, "loss": 0.1427, "step": 3754 }, { "epoch": 1.3342, "grad_norm": 4.310940742492676, "learning_rate": 1.2704081632653061e-05, "loss": 0.64, "step": 3755 }, { "epoch": 1.3344, "grad_norm": 5.979731559753418, "learning_rate": 1.2693877551020409e-05, "loss": 0.7816, "step": 3756 }, { "epoch": 1.3346, "grad_norm": 1.581800937652588, "learning_rate": 1.2683673469387754e-05, "loss": 0.1127, "step": 3757 }, { "epoch": 1.3348, "grad_norm": 1.3641674518585205, "learning_rate": 1.2673469387755102e-05, "loss": 0.077, "step": 3758 }, { "epoch": 1.335, "grad_norm": 2.465879201889038, "learning_rate": 1.266326530612245e-05, "loss": 0.3563, "step": 3759 }, { "epoch": 1.3352, "grad_norm": 4.839778423309326, "learning_rate": 1.2653061224489795e-05, "loss": 1.443, "step": 3760 }, { "epoch": 1.3354, "grad_norm": 2.8524973392486572, "learning_rate": 1.2642857142857143e-05, "loss": 0.2132, "step": 3761 }, { "epoch": 1.3356, "grad_norm": 7.233521938323975, "learning_rate": 1.263265306122449e-05, "loss": 1.3656, "step": 3762 }, { "epoch": 1.3357999999999999, "grad_norm": 9.550909042358398, "learning_rate": 1.2622448979591836e-05, "loss": 2.0322, "step": 3763 }, { "epoch": 1.336, "grad_norm": 5.07677698135376, "learning_rate": 1.2612244897959185e-05, "loss": 0.665, "step": 3764 }, { "epoch": 1.3362, "grad_norm": 2.5103707313537598, "learning_rate": 1.2602040816326533e-05, "loss": 0.2274, "step": 3765 }, { "epoch": 1.3364, "grad_norm": 4.901525974273682, "learning_rate": 1.259183673469388e-05, "loss": 0.6456, "step": 3766 }, { "epoch": 1.3366, "grad_norm": 1.6755197048187256, "learning_rate": 1.2581632653061226e-05, "loss": 0.1005, "step": 3767 }, { "epoch": 1.3368, "grad_norm": 2.485529899597168, "learning_rate": 1.2571428571428573e-05, "loss": 0.1637, "step": 3768 }, { "epoch": 1.337, "grad_norm": 7.253251552581787, "learning_rate": 1.2561224489795919e-05, "loss": 0.8481, "step": 3769 }, { "epoch": 1.3372, "grad_norm": 5.495018482208252, "learning_rate": 1.2551020408163267e-05, "loss": 0.8316, "step": 3770 }, { "epoch": 1.3374, "grad_norm": 2.9167444705963135, "learning_rate": 1.2540816326530614e-05, "loss": 0.5102, "step": 3771 }, { "epoch": 1.3376000000000001, "grad_norm": 1.8908495903015137, "learning_rate": 1.253061224489796e-05, "loss": 0.1286, "step": 3772 }, { "epoch": 1.3378, "grad_norm": 1.4749886989593506, "learning_rate": 1.2520408163265307e-05, "loss": 0.0961, "step": 3773 }, { "epoch": 1.338, "grad_norm": 1.9437893629074097, "learning_rate": 1.2510204081632653e-05, "loss": 0.1222, "step": 3774 }, { "epoch": 1.3382, "grad_norm": 1.6354379653930664, "learning_rate": 1.25e-05, "loss": 0.107, "step": 3775 }, { "epoch": 1.3384, "grad_norm": 1.515820026397705, "learning_rate": 1.2489795918367348e-05, "loss": 0.1014, "step": 3776 }, { "epoch": 1.3386, "grad_norm": 2.15248441696167, "learning_rate": 1.2479591836734694e-05, "loss": 0.2705, "step": 3777 }, { "epoch": 1.3388, "grad_norm": 3.161968469619751, "learning_rate": 1.2469387755102041e-05, "loss": 0.3206, "step": 3778 }, { "epoch": 1.339, "grad_norm": 1.6812952756881714, "learning_rate": 1.2459183673469387e-05, "loss": 0.1136, "step": 3779 }, { "epoch": 1.3392, "grad_norm": 1.4099876880645752, "learning_rate": 1.2448979591836735e-05, "loss": 0.1028, "step": 3780 }, { "epoch": 1.3394, "grad_norm": 3.105964183807373, "learning_rate": 1.2438775510204082e-05, "loss": 0.4409, "step": 3781 }, { "epoch": 1.3396, "grad_norm": 4.7388596534729, "learning_rate": 1.242857142857143e-05, "loss": 1.4694, "step": 3782 }, { "epoch": 1.3397999999999999, "grad_norm": 3.004696846008301, "learning_rate": 1.2418367346938777e-05, "loss": 0.5646, "step": 3783 }, { "epoch": 1.34, "grad_norm": 1.8444324731826782, "learning_rate": 1.2408163265306123e-05, "loss": 0.1194, "step": 3784 }, { "epoch": 1.3402, "grad_norm": 1.3238780498504639, "learning_rate": 1.239795918367347e-05, "loss": 0.0807, "step": 3785 }, { "epoch": 1.3404, "grad_norm": 1.6439921855926514, "learning_rate": 1.2387755102040816e-05, "loss": 0.1438, "step": 3786 }, { "epoch": 1.3406, "grad_norm": 2.3479502201080322, "learning_rate": 1.2377551020408164e-05, "loss": 0.3354, "step": 3787 }, { "epoch": 1.3408, "grad_norm": 3.0557663440704346, "learning_rate": 1.2367346938775511e-05, "loss": 0.587, "step": 3788 }, { "epoch": 1.341, "grad_norm": 2.2186567783355713, "learning_rate": 1.2357142857142857e-05, "loss": 0.1944, "step": 3789 }, { "epoch": 1.3412, "grad_norm": 4.796391487121582, "learning_rate": 1.2346938775510204e-05, "loss": 0.612, "step": 3790 }, { "epoch": 1.3414, "grad_norm": 4.777772426605225, "learning_rate": 1.233673469387755e-05, "loss": 0.8965, "step": 3791 }, { "epoch": 1.3416000000000001, "grad_norm": 11.811017036437988, "learning_rate": 1.2326530612244898e-05, "loss": 3.2756, "step": 3792 }, { "epoch": 1.3418, "grad_norm": 8.805556297302246, "learning_rate": 1.2316326530612247e-05, "loss": 2.5276, "step": 3793 }, { "epoch": 1.342, "grad_norm": 1.7621744871139526, "learning_rate": 1.2306122448979593e-05, "loss": 0.1785, "step": 3794 }, { "epoch": 1.3422, "grad_norm": 2.9811997413635254, "learning_rate": 1.229591836734694e-05, "loss": 0.4932, "step": 3795 }, { "epoch": 1.3424, "grad_norm": 1.205106258392334, "learning_rate": 1.2285714285714286e-05, "loss": 0.06, "step": 3796 }, { "epoch": 1.3426, "grad_norm": 2.116752862930298, "learning_rate": 1.2275510204081633e-05, "loss": 0.2296, "step": 3797 }, { "epoch": 1.3428, "grad_norm": 2.8943114280700684, "learning_rate": 1.2265306122448981e-05, "loss": 0.5197, "step": 3798 }, { "epoch": 1.343, "grad_norm": 4.272083282470703, "learning_rate": 1.2255102040816327e-05, "loss": 0.6344, "step": 3799 }, { "epoch": 1.3432, "grad_norm": 8.491045951843262, "learning_rate": 1.2244897959183674e-05, "loss": 2.5191, "step": 3800 }, { "epoch": 1.3434, "grad_norm": 1.980488657951355, "learning_rate": 1.223469387755102e-05, "loss": 0.2081, "step": 3801 }, { "epoch": 1.3436, "grad_norm": 3.655423164367676, "learning_rate": 1.2224489795918367e-05, "loss": 0.6737, "step": 3802 }, { "epoch": 1.3437999999999999, "grad_norm": 5.269712448120117, "learning_rate": 1.2214285714285715e-05, "loss": 0.7562, "step": 3803 }, { "epoch": 1.3439999999999999, "grad_norm": 2.799565553665161, "learning_rate": 1.2204081632653062e-05, "loss": 0.5657, "step": 3804 }, { "epoch": 1.3442, "grad_norm": 1.1319365501403809, "learning_rate": 1.219387755102041e-05, "loss": 0.0589, "step": 3805 }, { "epoch": 1.3444, "grad_norm": 1.70322585105896, "learning_rate": 1.2183673469387756e-05, "loss": 0.0815, "step": 3806 }, { "epoch": 1.3446, "grad_norm": 1.3940032720565796, "learning_rate": 1.2173469387755103e-05, "loss": 0.0749, "step": 3807 }, { "epoch": 1.3448, "grad_norm": 1.9430924654006958, "learning_rate": 1.2163265306122449e-05, "loss": 0.1559, "step": 3808 }, { "epoch": 1.345, "grad_norm": 3.467571973800659, "learning_rate": 1.2153061224489796e-05, "loss": 0.6982, "step": 3809 }, { "epoch": 1.3452, "grad_norm": 5.4435224533081055, "learning_rate": 1.2142857142857144e-05, "loss": 1.0138, "step": 3810 }, { "epoch": 1.3454, "grad_norm": 9.767927169799805, "learning_rate": 1.213265306122449e-05, "loss": 1.7398, "step": 3811 }, { "epoch": 1.3456000000000001, "grad_norm": 5.552506923675537, "learning_rate": 1.2122448979591837e-05, "loss": 0.8101, "step": 3812 }, { "epoch": 1.3458, "grad_norm": 3.9132087230682373, "learning_rate": 1.2112244897959183e-05, "loss": 0.478, "step": 3813 }, { "epoch": 1.346, "grad_norm": 3.659240961074829, "learning_rate": 1.210204081632653e-05, "loss": 0.7716, "step": 3814 }, { "epoch": 1.3462, "grad_norm": 5.815901756286621, "learning_rate": 1.2091836734693878e-05, "loss": 1.4835, "step": 3815 }, { "epoch": 1.3464, "grad_norm": 3.8814995288848877, "learning_rate": 1.2081632653061225e-05, "loss": 0.4621, "step": 3816 }, { "epoch": 1.3466, "grad_norm": 5.397446632385254, "learning_rate": 1.2071428571428573e-05, "loss": 0.7511, "step": 3817 }, { "epoch": 1.3468, "grad_norm": 4.171359539031982, "learning_rate": 1.2061224489795919e-05, "loss": 0.5696, "step": 3818 }, { "epoch": 1.347, "grad_norm": 3.8873350620269775, "learning_rate": 1.2051020408163266e-05, "loss": 0.5423, "step": 3819 }, { "epoch": 1.3472, "grad_norm": 3.4660348892211914, "learning_rate": 1.2040816326530612e-05, "loss": 0.4298, "step": 3820 }, { "epoch": 1.3474, "grad_norm": 2.9098331928253174, "learning_rate": 1.203061224489796e-05, "loss": 0.5387, "step": 3821 }, { "epoch": 1.3476, "grad_norm": 2.7750661373138428, "learning_rate": 1.2020408163265307e-05, "loss": 0.2058, "step": 3822 }, { "epoch": 1.3477999999999999, "grad_norm": 5.425154209136963, "learning_rate": 1.2010204081632653e-05, "loss": 0.7798, "step": 3823 }, { "epoch": 1.3479999999999999, "grad_norm": 3.839717149734497, "learning_rate": 1.2e-05, "loss": 0.4634, "step": 3824 }, { "epoch": 1.3482, "grad_norm": 4.481932640075684, "learning_rate": 1.1989795918367348e-05, "loss": 0.5418, "step": 3825 }, { "epoch": 1.3484, "grad_norm": 1.583490014076233, "learning_rate": 1.1979591836734694e-05, "loss": 0.1359, "step": 3826 }, { "epoch": 1.3486, "grad_norm": 3.9904890060424805, "learning_rate": 1.1969387755102043e-05, "loss": 0.7038, "step": 3827 }, { "epoch": 1.3488, "grad_norm": 6.746601104736328, "learning_rate": 1.1959183673469389e-05, "loss": 0.8479, "step": 3828 }, { "epoch": 1.349, "grad_norm": 2.903327703475952, "learning_rate": 1.1948979591836736e-05, "loss": 0.5251, "step": 3829 }, { "epoch": 1.3492, "grad_norm": 3.328550338745117, "learning_rate": 1.1938775510204082e-05, "loss": 0.3319, "step": 3830 }, { "epoch": 1.3494, "grad_norm": 6.920244216918945, "learning_rate": 1.192857142857143e-05, "loss": 1.7583, "step": 3831 }, { "epoch": 1.3496000000000001, "grad_norm": 1.6275689601898193, "learning_rate": 1.1918367346938777e-05, "loss": 0.1088, "step": 3832 }, { "epoch": 1.3498, "grad_norm": 1.926766037940979, "learning_rate": 1.1908163265306123e-05, "loss": 0.1258, "step": 3833 }, { "epoch": 1.35, "grad_norm": 3.261030673980713, "learning_rate": 1.189795918367347e-05, "loss": 0.3952, "step": 3834 }, { "epoch": 1.3502, "grad_norm": 1.8843517303466797, "learning_rate": 1.1887755102040816e-05, "loss": 0.1201, "step": 3835 }, { "epoch": 1.3504, "grad_norm": 4.032954692840576, "learning_rate": 1.1877551020408163e-05, "loss": 0.5584, "step": 3836 }, { "epoch": 1.3506, "grad_norm": 4.915853500366211, "learning_rate": 1.186734693877551e-05, "loss": 0.5074, "step": 3837 }, { "epoch": 1.3508, "grad_norm": 5.292396068572998, "learning_rate": 1.1857142857142858e-05, "loss": 0.8171, "step": 3838 }, { "epoch": 1.351, "grad_norm": 2.990004301071167, "learning_rate": 1.1846938775510206e-05, "loss": 0.5732, "step": 3839 }, { "epoch": 1.3512, "grad_norm": 4.736379146575928, "learning_rate": 1.1836734693877552e-05, "loss": 0.5969, "step": 3840 }, { "epoch": 1.3514, "grad_norm": 8.278603553771973, "learning_rate": 1.1826530612244899e-05, "loss": 2.2096, "step": 3841 }, { "epoch": 1.3516, "grad_norm": 5.408187389373779, "learning_rate": 1.1816326530612245e-05, "loss": 0.9051, "step": 3842 }, { "epoch": 1.3518, "grad_norm": 4.891638278961182, "learning_rate": 1.1806122448979592e-05, "loss": 1.3975, "step": 3843 }, { "epoch": 1.3519999999999999, "grad_norm": 1.6680200099945068, "learning_rate": 1.179591836734694e-05, "loss": 0.1287, "step": 3844 }, { "epoch": 1.3522, "grad_norm": 1.410937786102295, "learning_rate": 1.1785714285714286e-05, "loss": 0.078, "step": 3845 }, { "epoch": 1.3524, "grad_norm": 3.5020029544830322, "learning_rate": 1.1775510204081633e-05, "loss": 0.5996, "step": 3846 }, { "epoch": 1.3526, "grad_norm": 7.307976722717285, "learning_rate": 1.1765306122448979e-05, "loss": 2.084, "step": 3847 }, { "epoch": 1.3528, "grad_norm": 3.692384719848633, "learning_rate": 1.1755102040816326e-05, "loss": 0.6842, "step": 3848 }, { "epoch": 1.353, "grad_norm": 3.2924370765686035, "learning_rate": 1.1744897959183674e-05, "loss": 0.3576, "step": 3849 }, { "epoch": 1.3532, "grad_norm": 1.646892786026001, "learning_rate": 1.1734693877551021e-05, "loss": 0.1366, "step": 3850 }, { "epoch": 1.3534, "grad_norm": 2.786102533340454, "learning_rate": 1.1724489795918369e-05, "loss": 0.2471, "step": 3851 }, { "epoch": 1.3536000000000001, "grad_norm": 5.256365776062012, "learning_rate": 1.1714285714285715e-05, "loss": 0.6327, "step": 3852 }, { "epoch": 1.3538000000000001, "grad_norm": 2.0491883754730225, "learning_rate": 1.1704081632653062e-05, "loss": 0.214, "step": 3853 }, { "epoch": 1.354, "grad_norm": 3.036485433578491, "learning_rate": 1.1693877551020408e-05, "loss": 0.5884, "step": 3854 }, { "epoch": 1.3542, "grad_norm": 1.5038535594940186, "learning_rate": 1.1683673469387755e-05, "loss": 0.1011, "step": 3855 }, { "epoch": 1.3544, "grad_norm": 1.5168235301971436, "learning_rate": 1.1673469387755103e-05, "loss": 0.09, "step": 3856 }, { "epoch": 1.3546, "grad_norm": 1.7048945426940918, "learning_rate": 1.1663265306122449e-05, "loss": 0.143, "step": 3857 }, { "epoch": 1.3548, "grad_norm": 1.708801031112671, "learning_rate": 1.1653061224489796e-05, "loss": 0.1426, "step": 3858 }, { "epoch": 1.355, "grad_norm": 3.5101585388183594, "learning_rate": 1.1642857142857144e-05, "loss": 0.6942, "step": 3859 }, { "epoch": 1.3552, "grad_norm": 7.236852169036865, "learning_rate": 1.163265306122449e-05, "loss": 2.1071, "step": 3860 }, { "epoch": 1.3554, "grad_norm": 2.947705030441284, "learning_rate": 1.1622448979591839e-05, "loss": 0.5327, "step": 3861 }, { "epoch": 1.3556, "grad_norm": 2.4596071243286133, "learning_rate": 1.1612244897959184e-05, "loss": 0.3587, "step": 3862 }, { "epoch": 1.3558, "grad_norm": 4.5340256690979, "learning_rate": 1.1602040816326532e-05, "loss": 1.5091, "step": 3863 }, { "epoch": 1.3559999999999999, "grad_norm": 1.7653383016586304, "learning_rate": 1.1591836734693878e-05, "loss": 0.1534, "step": 3864 }, { "epoch": 1.3562, "grad_norm": 3.754122734069824, "learning_rate": 1.1581632653061225e-05, "loss": 0.6394, "step": 3865 }, { "epoch": 1.3564, "grad_norm": 6.031116962432861, "learning_rate": 1.1571428571428573e-05, "loss": 0.8656, "step": 3866 }, { "epoch": 1.3566, "grad_norm": 4.633213043212891, "learning_rate": 1.1561224489795918e-05, "loss": 0.7653, "step": 3867 }, { "epoch": 1.3568, "grad_norm": 3.4678215980529785, "learning_rate": 1.1551020408163266e-05, "loss": 0.6047, "step": 3868 }, { "epoch": 1.357, "grad_norm": 3.8937063217163086, "learning_rate": 1.1540816326530612e-05, "loss": 0.445, "step": 3869 }, { "epoch": 1.3572, "grad_norm": 4.51537561416626, "learning_rate": 1.153061224489796e-05, "loss": 1.0653, "step": 3870 }, { "epoch": 1.3574, "grad_norm": 8.521768569946289, "learning_rate": 1.1520408163265307e-05, "loss": 2.3745, "step": 3871 }, { "epoch": 1.3576, "grad_norm": 3.4433677196502686, "learning_rate": 1.1510204081632654e-05, "loss": 0.6414, "step": 3872 }, { "epoch": 1.3578000000000001, "grad_norm": 6.296442031860352, "learning_rate": 1.1500000000000002e-05, "loss": 0.7838, "step": 3873 }, { "epoch": 1.358, "grad_norm": 5.0307207107543945, "learning_rate": 1.1489795918367347e-05, "loss": 0.7592, "step": 3874 }, { "epoch": 1.3582, "grad_norm": 3.6523263454437256, "learning_rate": 1.1479591836734695e-05, "loss": 0.63, "step": 3875 }, { "epoch": 1.3584, "grad_norm": 5.82245397567749, "learning_rate": 1.146938775510204e-05, "loss": 0.8227, "step": 3876 }, { "epoch": 1.3586, "grad_norm": 5.670694351196289, "learning_rate": 1.1459183673469388e-05, "loss": 0.7424, "step": 3877 }, { "epoch": 1.3588, "grad_norm": 5.125217437744141, "learning_rate": 1.1448979591836736e-05, "loss": 0.7306, "step": 3878 }, { "epoch": 1.359, "grad_norm": 4.8379435539245605, "learning_rate": 1.1438775510204082e-05, "loss": 0.8655, "step": 3879 }, { "epoch": 1.3592, "grad_norm": 9.271891593933105, "learning_rate": 1.1428571428571429e-05, "loss": 3.0184, "step": 3880 }, { "epoch": 1.3594, "grad_norm": 8.492823600769043, "learning_rate": 1.1418367346938775e-05, "loss": 2.1837, "step": 3881 }, { "epoch": 1.3596, "grad_norm": 3.692840099334717, "learning_rate": 1.1408163265306122e-05, "loss": 0.4024, "step": 3882 }, { "epoch": 1.3598, "grad_norm": 4.732965469360352, "learning_rate": 1.139795918367347e-05, "loss": 0.5713, "step": 3883 }, { "epoch": 1.3599999999999999, "grad_norm": 5.9341583251953125, "learning_rate": 1.1387755102040817e-05, "loss": 0.9768, "step": 3884 }, { "epoch": 1.3602, "grad_norm": 4.89736270904541, "learning_rate": 1.1377551020408165e-05, "loss": 1.623, "step": 3885 }, { "epoch": 1.3604, "grad_norm": 4.786985874176025, "learning_rate": 1.136734693877551e-05, "loss": 0.6604, "step": 3886 }, { "epoch": 1.3606, "grad_norm": 5.9557623863220215, "learning_rate": 1.1357142857142858e-05, "loss": 0.8251, "step": 3887 }, { "epoch": 1.3608, "grad_norm": 4.59980583190918, "learning_rate": 1.1346938775510206e-05, "loss": 0.8647, "step": 3888 }, { "epoch": 1.361, "grad_norm": 4.638906002044678, "learning_rate": 1.1336734693877551e-05, "loss": 1.4674, "step": 3889 }, { "epoch": 1.3612, "grad_norm": 5.079880237579346, "learning_rate": 1.1326530612244899e-05, "loss": 0.7242, "step": 3890 }, { "epoch": 1.3614, "grad_norm": 5.692809104919434, "learning_rate": 1.1316326530612245e-05, "loss": 0.7839, "step": 3891 }, { "epoch": 1.3616, "grad_norm": 6.237496852874756, "learning_rate": 1.1306122448979592e-05, "loss": 1.2235, "step": 3892 }, { "epoch": 1.3618000000000001, "grad_norm": 8.457006454467773, "learning_rate": 1.129591836734694e-05, "loss": 2.2326, "step": 3893 }, { "epoch": 1.362, "grad_norm": 9.149642944335938, "learning_rate": 1.1285714285714285e-05, "loss": 0.6763, "step": 3894 }, { "epoch": 1.3622, "grad_norm": 3.8505139350891113, "learning_rate": 1.1275510204081635e-05, "loss": 0.4128, "step": 3895 }, { "epoch": 1.3624, "grad_norm": 6.831646919250488, "learning_rate": 1.126530612244898e-05, "loss": 0.8069, "step": 3896 }, { "epoch": 1.3626, "grad_norm": 4.0196146965026855, "learning_rate": 1.1255102040816328e-05, "loss": 0.4681, "step": 3897 }, { "epoch": 1.3628, "grad_norm": 7.241461277008057, "learning_rate": 1.1244897959183674e-05, "loss": 1.3956, "step": 3898 }, { "epoch": 1.363, "grad_norm": 9.091708183288574, "learning_rate": 1.1234693877551021e-05, "loss": 2.7268, "step": 3899 }, { "epoch": 1.3632, "grad_norm": 9.10069751739502, "learning_rate": 1.1224489795918369e-05, "loss": 2.2056, "step": 3900 }, { "epoch": 1.3634, "grad_norm": 6.977329730987549, "learning_rate": 1.1214285714285714e-05, "loss": 1.2128, "step": 3901 }, { "epoch": 1.3636, "grad_norm": 10.115520477294922, "learning_rate": 1.1204081632653062e-05, "loss": 2.788, "step": 3902 }, { "epoch": 1.3638, "grad_norm": 8.028214454650879, "learning_rate": 1.1193877551020408e-05, "loss": 2.0324, "step": 3903 }, { "epoch": 1.3639999999999999, "grad_norm": 4.721216678619385, "learning_rate": 1.1183673469387755e-05, "loss": 0.6433, "step": 3904 }, { "epoch": 1.3642, "grad_norm": 7.163980484008789, "learning_rate": 1.1173469387755103e-05, "loss": 1.4074, "step": 3905 }, { "epoch": 1.3644, "grad_norm": 8.084197998046875, "learning_rate": 1.116326530612245e-05, "loss": 2.0196, "step": 3906 }, { "epoch": 1.3646, "grad_norm": 4.783535480499268, "learning_rate": 1.1153061224489798e-05, "loss": 0.7092, "step": 3907 }, { "epoch": 1.3648, "grad_norm": 4.9050092697143555, "learning_rate": 1.1142857142857143e-05, "loss": 0.7534, "step": 3908 }, { "epoch": 1.365, "grad_norm": 7.531850337982178, "learning_rate": 1.1132653061224491e-05, "loss": 1.2272, "step": 3909 }, { "epoch": 1.3652, "grad_norm": 10.261857986450195, "learning_rate": 1.1122448979591837e-05, "loss": 1.9668, "step": 3910 }, { "epoch": 1.3654, "grad_norm": 3.57900333404541, "learning_rate": 1.1112244897959184e-05, "loss": 0.6528, "step": 3911 }, { "epoch": 1.3656, "grad_norm": 3.6782445907592773, "learning_rate": 1.1102040816326532e-05, "loss": 0.4855, "step": 3912 }, { "epoch": 1.3658000000000001, "grad_norm": 4.000214576721191, "learning_rate": 1.1091836734693877e-05, "loss": 0.4317, "step": 3913 }, { "epoch": 1.366, "grad_norm": 7.233711242675781, "learning_rate": 1.1081632653061225e-05, "loss": 1.1248, "step": 3914 }, { "epoch": 1.3662, "grad_norm": 9.311083793640137, "learning_rate": 1.107142857142857e-05, "loss": 2.0811, "step": 3915 }, { "epoch": 1.3664, "grad_norm": 5.855093955993652, "learning_rate": 1.1061224489795918e-05, "loss": 0.7559, "step": 3916 }, { "epoch": 1.3666, "grad_norm": 3.242572546005249, "learning_rate": 1.1051020408163266e-05, "loss": 0.5673, "step": 3917 }, { "epoch": 1.3668, "grad_norm": 3.610198974609375, "learning_rate": 1.1040816326530613e-05, "loss": 0.4493, "step": 3918 }, { "epoch": 1.367, "grad_norm": 3.7920875549316406, "learning_rate": 1.103061224489796e-05, "loss": 0.7715, "step": 3919 }, { "epoch": 1.3672, "grad_norm": 5.547231197357178, "learning_rate": 1.1020408163265306e-05, "loss": 0.8798, "step": 3920 }, { "epoch": 1.3674, "grad_norm": 8.020981788635254, "learning_rate": 1.1010204081632654e-05, "loss": 0.7225, "step": 3921 }, { "epoch": 1.3676, "grad_norm": 10.814255714416504, "learning_rate": 1.1000000000000001e-05, "loss": 1.1109, "step": 3922 }, { "epoch": 1.3678, "grad_norm": 6.38167667388916, "learning_rate": 1.0989795918367347e-05, "loss": 0.945, "step": 3923 }, { "epoch": 1.3679999999999999, "grad_norm": 9.843852996826172, "learning_rate": 1.0979591836734695e-05, "loss": 2.1945, "step": 3924 }, { "epoch": 1.3682, "grad_norm": 6.032860279083252, "learning_rate": 1.096938775510204e-05, "loss": 0.7325, "step": 3925 }, { "epoch": 1.3684, "grad_norm": 8.657896041870117, "learning_rate": 1.0959183673469388e-05, "loss": 1.1041, "step": 3926 }, { "epoch": 1.3686, "grad_norm": 5.5394062995910645, "learning_rate": 1.0948979591836735e-05, "loss": 1.0176, "step": 3927 }, { "epoch": 1.3688, "grad_norm": 9.15536117553711, "learning_rate": 1.0938775510204081e-05, "loss": 1.9866, "step": 3928 }, { "epoch": 1.369, "grad_norm": 3.647921562194824, "learning_rate": 1.092857142857143e-05, "loss": 0.4283, "step": 3929 }, { "epoch": 1.3692, "grad_norm": 5.62546968460083, "learning_rate": 1.0918367346938776e-05, "loss": 0.7953, "step": 3930 }, { "epoch": 1.3694, "grad_norm": 3.3547279834747314, "learning_rate": 1.0908163265306124e-05, "loss": 0.5298, "step": 3931 }, { "epoch": 1.3696, "grad_norm": 3.8890135288238525, "learning_rate": 1.089795918367347e-05, "loss": 0.482, "step": 3932 }, { "epoch": 1.3698000000000001, "grad_norm": 3.9916648864746094, "learning_rate": 1.0887755102040817e-05, "loss": 0.6226, "step": 3933 }, { "epoch": 1.37, "grad_norm": 5.355892181396484, "learning_rate": 1.0877551020408164e-05, "loss": 0.8334, "step": 3934 }, { "epoch": 1.3702, "grad_norm": 7.691184997558594, "learning_rate": 1.086734693877551e-05, "loss": 2.0341, "step": 3935 }, { "epoch": 1.3704, "grad_norm": 3.757251501083374, "learning_rate": 1.0857142857142858e-05, "loss": 0.6091, "step": 3936 }, { "epoch": 1.3706, "grad_norm": 5.115704536437988, "learning_rate": 1.0846938775510204e-05, "loss": 0.7629, "step": 3937 }, { "epoch": 1.3708, "grad_norm": 3.836716413497925, "learning_rate": 1.0836734693877551e-05, "loss": 0.7387, "step": 3938 }, { "epoch": 1.371, "grad_norm": 5.407971382141113, "learning_rate": 1.0826530612244899e-05, "loss": 0.7584, "step": 3939 }, { "epoch": 1.3712, "grad_norm": 3.6939923763275146, "learning_rate": 1.0816326530612246e-05, "loss": 0.6111, "step": 3940 }, { "epoch": 1.3714, "grad_norm": 5.296103477478027, "learning_rate": 1.0806122448979593e-05, "loss": 0.7072, "step": 3941 }, { "epoch": 1.3716, "grad_norm": 9.211636543273926, "learning_rate": 1.079591836734694e-05, "loss": 2.5747, "step": 3942 }, { "epoch": 1.3718, "grad_norm": 5.751867771148682, "learning_rate": 1.0785714285714287e-05, "loss": 0.8185, "step": 3943 }, { "epoch": 1.3719999999999999, "grad_norm": 4.8876237869262695, "learning_rate": 1.0775510204081633e-05, "loss": 0.9352, "step": 3944 }, { "epoch": 1.3721999999999999, "grad_norm": 9.685640335083008, "learning_rate": 1.076530612244898e-05, "loss": 2.645, "step": 3945 }, { "epoch": 1.3724, "grad_norm": 4.994866371154785, "learning_rate": 1.0755102040816328e-05, "loss": 0.6653, "step": 3946 }, { "epoch": 1.3726, "grad_norm": 4.2619805335998535, "learning_rate": 1.0744897959183673e-05, "loss": 0.7159, "step": 3947 }, { "epoch": 1.3728, "grad_norm": 5.770644664764404, "learning_rate": 1.073469387755102e-05, "loss": 0.8857, "step": 3948 }, { "epoch": 1.373, "grad_norm": 5.596279621124268, "learning_rate": 1.0724489795918367e-05, "loss": 0.7091, "step": 3949 }, { "epoch": 1.3732, "grad_norm": 3.7536487579345703, "learning_rate": 1.0714285714285714e-05, "loss": 0.4113, "step": 3950 }, { "epoch": 1.3734, "grad_norm": 4.525453090667725, "learning_rate": 1.0704081632653063e-05, "loss": 0.6371, "step": 3951 }, { "epoch": 1.3736, "grad_norm": 3.5729410648345947, "learning_rate": 1.0693877551020409e-05, "loss": 0.6469, "step": 3952 }, { "epoch": 1.3738000000000001, "grad_norm": 6.834310531616211, "learning_rate": 1.0683673469387757e-05, "loss": 1.0561, "step": 3953 }, { "epoch": 1.374, "grad_norm": 9.00754451751709, "learning_rate": 1.0673469387755102e-05, "loss": 2.6829, "step": 3954 }, { "epoch": 1.3742, "grad_norm": 4.654486179351807, "learning_rate": 1.066326530612245e-05, "loss": 0.7045, "step": 3955 }, { "epoch": 1.3744, "grad_norm": 3.5815670490264893, "learning_rate": 1.0653061224489797e-05, "loss": 0.6919, "step": 3956 }, { "epoch": 1.3746, "grad_norm": 3.902749538421631, "learning_rate": 1.0642857142857143e-05, "loss": 0.4963, "step": 3957 }, { "epoch": 1.3748, "grad_norm": 3.7020041942596436, "learning_rate": 1.063265306122449e-05, "loss": 0.6277, "step": 3958 }, { "epoch": 1.375, "grad_norm": 5.73671293258667, "learning_rate": 1.0622448979591836e-05, "loss": 0.6776, "step": 3959 }, { "epoch": 1.3752, "grad_norm": 5.688353538513184, "learning_rate": 1.0612244897959184e-05, "loss": 0.7026, "step": 3960 }, { "epoch": 1.3754, "grad_norm": 7.400761127471924, "learning_rate": 1.0602040816326531e-05, "loss": 1.2377, "step": 3961 }, { "epoch": 1.3756, "grad_norm": 11.227447509765625, "learning_rate": 1.0591836734693877e-05, "loss": 2.7975, "step": 3962 }, { "epoch": 1.3758, "grad_norm": 3.6369383335113525, "learning_rate": 1.0581632653061226e-05, "loss": 0.6545, "step": 3963 }, { "epoch": 1.376, "grad_norm": 3.8340251445770264, "learning_rate": 1.0571428571428572e-05, "loss": 0.7249, "step": 3964 }, { "epoch": 1.3761999999999999, "grad_norm": 3.273732900619507, "learning_rate": 1.056122448979592e-05, "loss": 0.6021, "step": 3965 }, { "epoch": 1.3764, "grad_norm": 5.729831695556641, "learning_rate": 1.0551020408163265e-05, "loss": 0.7489, "step": 3966 }, { "epoch": 1.3766, "grad_norm": 9.157611846923828, "learning_rate": 1.0540816326530613e-05, "loss": 2.1601, "step": 3967 }, { "epoch": 1.3768, "grad_norm": 5.697342395782471, "learning_rate": 1.053061224489796e-05, "loss": 0.8413, "step": 3968 }, { "epoch": 1.377, "grad_norm": 5.372077941894531, "learning_rate": 1.0520408163265306e-05, "loss": 0.6816, "step": 3969 }, { "epoch": 1.3772, "grad_norm": 4.570939064025879, "learning_rate": 1.0510204081632654e-05, "loss": 0.5851, "step": 3970 }, { "epoch": 1.3774, "grad_norm": 5.907211780548096, "learning_rate": 1.05e-05, "loss": 0.8755, "step": 3971 }, { "epoch": 1.3776, "grad_norm": 11.042017936706543, "learning_rate": 1.0489795918367347e-05, "loss": 2.9378, "step": 3972 }, { "epoch": 1.3778000000000001, "grad_norm": 10.968971252441406, "learning_rate": 1.0479591836734694e-05, "loss": 2.9056, "step": 3973 }, { "epoch": 1.3780000000000001, "grad_norm": 10.674814224243164, "learning_rate": 1.0469387755102042e-05, "loss": 2.6566, "step": 3974 }, { "epoch": 1.3782, "grad_norm": 6.546092510223389, "learning_rate": 1.045918367346939e-05, "loss": 0.9165, "step": 3975 }, { "epoch": 1.3784, "grad_norm": 12.73819351196289, "learning_rate": 1.0448979591836735e-05, "loss": 3.0223, "step": 3976 }, { "epoch": 1.3786, "grad_norm": 10.385857582092285, "learning_rate": 1.0438775510204083e-05, "loss": 2.549, "step": 3977 }, { "epoch": 1.3788, "grad_norm": 3.7381415367126465, "learning_rate": 1.0428571428571428e-05, "loss": 0.7069, "step": 3978 }, { "epoch": 1.379, "grad_norm": 5.126061916351318, "learning_rate": 1.0418367346938776e-05, "loss": 0.6788, "step": 3979 }, { "epoch": 1.3792, "grad_norm": 5.441112995147705, "learning_rate": 1.0408163265306123e-05, "loss": 0.7679, "step": 3980 }, { "epoch": 1.3794, "grad_norm": 5.262506008148193, "learning_rate": 1.039795918367347e-05, "loss": 0.7664, "step": 3981 }, { "epoch": 1.3796, "grad_norm": 3.900252103805542, "learning_rate": 1.0387755102040817e-05, "loss": 0.6374, "step": 3982 }, { "epoch": 1.3798, "grad_norm": 5.284982681274414, "learning_rate": 1.0377551020408162e-05, "loss": 0.7284, "step": 3983 }, { "epoch": 1.38, "grad_norm": 6.346417427062988, "learning_rate": 1.036734693877551e-05, "loss": 0.9852, "step": 3984 }, { "epoch": 1.3801999999999999, "grad_norm": 5.45538854598999, "learning_rate": 1.0357142857142859e-05, "loss": 0.8302, "step": 3985 }, { "epoch": 1.3804, "grad_norm": 5.084419250488281, "learning_rate": 1.0346938775510205e-05, "loss": 0.647, "step": 3986 }, { "epoch": 1.3806, "grad_norm": 6.146941661834717, "learning_rate": 1.0336734693877552e-05, "loss": 0.9438, "step": 3987 }, { "epoch": 1.3808, "grad_norm": 12.002094268798828, "learning_rate": 1.0326530612244898e-05, "loss": 2.7541, "step": 3988 }, { "epoch": 1.381, "grad_norm": 3.95780611038208, "learning_rate": 1.0316326530612246e-05, "loss": 0.5239, "step": 3989 }, { "epoch": 1.3812, "grad_norm": 3.9181201457977295, "learning_rate": 1.0306122448979593e-05, "loss": 0.4687, "step": 3990 }, { "epoch": 1.3814, "grad_norm": 5.811712265014648, "learning_rate": 1.0295918367346939e-05, "loss": 0.9536, "step": 3991 }, { "epoch": 1.3816, "grad_norm": 4.9396562576293945, "learning_rate": 1.0285714285714286e-05, "loss": 0.6593, "step": 3992 }, { "epoch": 1.3818, "grad_norm": 3.514286756515503, "learning_rate": 1.0275510204081632e-05, "loss": 0.7083, "step": 3993 }, { "epoch": 1.3820000000000001, "grad_norm": 3.364076614379883, "learning_rate": 1.026530612244898e-05, "loss": 0.5974, "step": 3994 }, { "epoch": 1.3822, "grad_norm": 3.648085832595825, "learning_rate": 1.0255102040816327e-05, "loss": 0.6451, "step": 3995 }, { "epoch": 1.3824, "grad_norm": 4.9181694984436035, "learning_rate": 1.0244897959183673e-05, "loss": 0.8782, "step": 3996 }, { "epoch": 1.3826, "grad_norm": 4.802428722381592, "learning_rate": 1.0234693877551022e-05, "loss": 0.734, "step": 3997 }, { "epoch": 1.3828, "grad_norm": 3.6626720428466797, "learning_rate": 1.0224489795918368e-05, "loss": 0.7175, "step": 3998 }, { "epoch": 1.383, "grad_norm": 4.834207534790039, "learning_rate": 1.0214285714285715e-05, "loss": 0.6871, "step": 3999 }, { "epoch": 1.3832, "grad_norm": 3.8979711532592773, "learning_rate": 1.0204081632653061e-05, "loss": 0.5119, "step": 4000 }, { "epoch": 1.3834, "grad_norm": 3.582329511642456, "learning_rate": 1.0193877551020409e-05, "loss": 0.3988, "step": 4001 }, { "epoch": 1.3836, "grad_norm": 3.8930623531341553, "learning_rate": 1.0183673469387756e-05, "loss": 0.6107, "step": 4002 }, { "epoch": 1.3838, "grad_norm": 5.010544300079346, "learning_rate": 1.0173469387755102e-05, "loss": 0.732, "step": 4003 }, { "epoch": 1.384, "grad_norm": 3.3792223930358887, "learning_rate": 1.016326530612245e-05, "loss": 0.5825, "step": 4004 }, { "epoch": 1.3841999999999999, "grad_norm": 6.155489444732666, "learning_rate": 1.0153061224489795e-05, "loss": 0.8847, "step": 4005 }, { "epoch": 1.3844, "grad_norm": 9.239078521728516, "learning_rate": 1.0142857142857143e-05, "loss": 2.9732, "step": 4006 }, { "epoch": 1.3846, "grad_norm": 5.7950358390808105, "learning_rate": 1.013265306122449e-05, "loss": 0.8344, "step": 4007 }, { "epoch": 1.3848, "grad_norm": 4.781053066253662, "learning_rate": 1.0122448979591838e-05, "loss": 0.6476, "step": 4008 }, { "epoch": 1.385, "grad_norm": 6.565587997436523, "learning_rate": 1.0112244897959185e-05, "loss": 1.0583, "step": 4009 }, { "epoch": 1.3852, "grad_norm": 9.15428638458252, "learning_rate": 1.0102040816326531e-05, "loss": 2.5636, "step": 4010 }, { "epoch": 1.3854, "grad_norm": 5.543002128601074, "learning_rate": 1.0091836734693879e-05, "loss": 0.8628, "step": 4011 }, { "epoch": 1.3856, "grad_norm": 5.14238977432251, "learning_rate": 1.0081632653061224e-05, "loss": 0.6357, "step": 4012 }, { "epoch": 1.3858, "grad_norm": 3.532233715057373, "learning_rate": 1.0071428571428572e-05, "loss": 0.569, "step": 4013 }, { "epoch": 1.3860000000000001, "grad_norm": 3.352722644805908, "learning_rate": 1.006122448979592e-05, "loss": 0.3895, "step": 4014 }, { "epoch": 1.3862, "grad_norm": 3.551661729812622, "learning_rate": 1.0051020408163265e-05, "loss": 0.6503, "step": 4015 }, { "epoch": 1.3864, "grad_norm": 5.485884666442871, "learning_rate": 1.0040816326530613e-05, "loss": 0.8738, "step": 4016 }, { "epoch": 1.3866, "grad_norm": 7.892735004425049, "learning_rate": 1.0030612244897958e-05, "loss": 1.035, "step": 4017 }, { "epoch": 1.3868, "grad_norm": 9.256954193115234, "learning_rate": 1.0020408163265306e-05, "loss": 2.4708, "step": 4018 }, { "epoch": 1.387, "grad_norm": 4.731393337249756, "learning_rate": 1.0010204081632655e-05, "loss": 0.8429, "step": 4019 }, { "epoch": 1.3872, "grad_norm": 9.638980865478516, "learning_rate": 1e-05, "loss": 2.3211, "step": 4020 }, { "epoch": 1.3874, "grad_norm": 5.631479740142822, "learning_rate": 9.989795918367348e-06, "loss": 0.699, "step": 4021 }, { "epoch": 1.3876, "grad_norm": 6.148746490478516, "learning_rate": 9.979591836734694e-06, "loss": 0.7217, "step": 4022 }, { "epoch": 1.3878, "grad_norm": 6.269064903259277, "learning_rate": 9.969387755102042e-06, "loss": 0.7055, "step": 4023 }, { "epoch": 1.388, "grad_norm": 5.382347106933594, "learning_rate": 9.959183673469389e-06, "loss": 0.7104, "step": 4024 }, { "epoch": 1.3881999999999999, "grad_norm": 5.656306743621826, "learning_rate": 9.948979591836735e-06, "loss": 0.836, "step": 4025 }, { "epoch": 1.3884, "grad_norm": 5.1849870681762695, "learning_rate": 9.938775510204082e-06, "loss": 0.7665, "step": 4026 }, { "epoch": 1.3886, "grad_norm": 5.041769981384277, "learning_rate": 9.928571428571428e-06, "loss": 0.7329, "step": 4027 }, { "epoch": 1.3888, "grad_norm": 3.5364441871643066, "learning_rate": 9.918367346938776e-06, "loss": 0.4226, "step": 4028 }, { "epoch": 1.389, "grad_norm": 3.902554750442505, "learning_rate": 9.908163265306123e-06, "loss": 0.6374, "step": 4029 }, { "epoch": 1.3892, "grad_norm": 4.9168829917907715, "learning_rate": 9.897959183673469e-06, "loss": 0.7221, "step": 4030 }, { "epoch": 1.3894, "grad_norm": 5.48930025100708, "learning_rate": 9.887755102040818e-06, "loss": 0.6971, "step": 4031 }, { "epoch": 1.3896, "grad_norm": 3.7762134075164795, "learning_rate": 9.877551020408164e-06, "loss": 0.4239, "step": 4032 }, { "epoch": 1.3898, "grad_norm": 6.708253383636475, "learning_rate": 9.867346938775511e-06, "loss": 0.9419, "step": 4033 }, { "epoch": 1.3900000000000001, "grad_norm": 11.21085262298584, "learning_rate": 9.857142857142857e-06, "loss": 2.8889, "step": 4034 }, { "epoch": 1.3902, "grad_norm": 9.258325576782227, "learning_rate": 9.846938775510205e-06, "loss": 2.289, "step": 4035 }, { "epoch": 1.3904, "grad_norm": 3.815279722213745, "learning_rate": 9.836734693877552e-06, "loss": 0.5744, "step": 4036 }, { "epoch": 1.3906, "grad_norm": 5.737252712249756, "learning_rate": 9.826530612244898e-06, "loss": 0.7616, "step": 4037 }, { "epoch": 1.3908, "grad_norm": 3.4769980907440186, "learning_rate": 9.816326530612245e-06, "loss": 0.4377, "step": 4038 }, { "epoch": 1.391, "grad_norm": 3.816934823989868, "learning_rate": 9.806122448979591e-06, "loss": 0.6121, "step": 4039 }, { "epoch": 1.3912, "grad_norm": 6.097662448883057, "learning_rate": 9.795918367346939e-06, "loss": 1.1407, "step": 4040 }, { "epoch": 1.3914, "grad_norm": 5.40137243270874, "learning_rate": 9.785714285714286e-06, "loss": 0.8819, "step": 4041 }, { "epoch": 1.3916, "grad_norm": 5.388091564178467, "learning_rate": 9.775510204081634e-06, "loss": 0.8149, "step": 4042 }, { "epoch": 1.3918, "grad_norm": 4.445162296295166, "learning_rate": 9.765306122448981e-06, "loss": 0.8164, "step": 4043 }, { "epoch": 1.392, "grad_norm": 3.53629469871521, "learning_rate": 9.755102040816327e-06, "loss": 0.6779, "step": 4044 }, { "epoch": 1.3921999999999999, "grad_norm": 5.086369514465332, "learning_rate": 9.744897959183674e-06, "loss": 0.7485, "step": 4045 }, { "epoch": 1.3924, "grad_norm": 3.230525493621826, "learning_rate": 9.73469387755102e-06, "loss": 0.6028, "step": 4046 }, { "epoch": 1.3926, "grad_norm": 6.732891082763672, "learning_rate": 9.724489795918368e-06, "loss": 1.0818, "step": 4047 }, { "epoch": 1.3928, "grad_norm": 9.322139739990234, "learning_rate": 9.714285714285715e-06, "loss": 2.5275, "step": 4048 }, { "epoch": 1.393, "grad_norm": 5.124191761016846, "learning_rate": 9.704081632653061e-06, "loss": 0.6462, "step": 4049 }, { "epoch": 1.3932, "grad_norm": 3.3168301582336426, "learning_rate": 9.693877551020408e-06, "loss": 0.5227, "step": 4050 }, { "epoch": 1.3934, "grad_norm": 4.323263168334961, "learning_rate": 9.683673469387756e-06, "loss": 0.4867, "step": 4051 }, { "epoch": 1.3936, "grad_norm": 5.344505786895752, "learning_rate": 9.673469387755102e-06, "loss": 0.797, "step": 4052 }, { "epoch": 1.3938, "grad_norm": 4.83172607421875, "learning_rate": 9.663265306122451e-06, "loss": 0.6209, "step": 4053 }, { "epoch": 1.3940000000000001, "grad_norm": 5.8424391746521, "learning_rate": 9.653061224489797e-06, "loss": 0.8825, "step": 4054 }, { "epoch": 1.3942, "grad_norm": 11.91540813446045, "learning_rate": 9.642857142857144e-06, "loss": 2.654, "step": 4055 }, { "epoch": 1.3944, "grad_norm": 3.9795920848846436, "learning_rate": 9.63265306122449e-06, "loss": 0.538, "step": 4056 }, { "epoch": 1.3946, "grad_norm": 4.208872318267822, "learning_rate": 9.622448979591837e-06, "loss": 0.4365, "step": 4057 }, { "epoch": 1.3948, "grad_norm": 6.616360664367676, "learning_rate": 9.612244897959185e-06, "loss": 1.0681, "step": 4058 }, { "epoch": 1.395, "grad_norm": 6.30941915512085, "learning_rate": 9.60204081632653e-06, "loss": 0.8877, "step": 4059 }, { "epoch": 1.3952, "grad_norm": 5.092617511749268, "learning_rate": 9.591836734693878e-06, "loss": 0.8003, "step": 4060 }, { "epoch": 1.3954, "grad_norm": 3.4397690296173096, "learning_rate": 9.581632653061224e-06, "loss": 0.6257, "step": 4061 }, { "epoch": 1.3956, "grad_norm": 4.654313564300537, "learning_rate": 9.571428571428572e-06, "loss": 0.7335, "step": 4062 }, { "epoch": 1.3958, "grad_norm": 4.091364860534668, "learning_rate": 9.561224489795919e-06, "loss": 0.9497, "step": 4063 }, { "epoch": 1.396, "grad_norm": 3.311901092529297, "learning_rate": 9.551020408163265e-06, "loss": 0.6365, "step": 4064 }, { "epoch": 1.3961999999999999, "grad_norm": 3.5903725624084473, "learning_rate": 9.540816326530614e-06, "loss": 0.4147, "step": 4065 }, { "epoch": 1.3963999999999999, "grad_norm": 4.654419898986816, "learning_rate": 9.53061224489796e-06, "loss": 0.6337, "step": 4066 }, { "epoch": 1.3966, "grad_norm": 3.7038626670837402, "learning_rate": 9.520408163265307e-06, "loss": 0.4614, "step": 4067 }, { "epoch": 1.3968, "grad_norm": 6.082508087158203, "learning_rate": 9.510204081632653e-06, "loss": 1.1482, "step": 4068 }, { "epoch": 1.397, "grad_norm": 9.412351608276367, "learning_rate": 9.5e-06, "loss": 2.536, "step": 4069 }, { "epoch": 1.3972, "grad_norm": 3.590003728866577, "learning_rate": 9.489795918367348e-06, "loss": 0.8897, "step": 4070 }, { "epoch": 1.3974, "grad_norm": 5.914702892303467, "learning_rate": 9.479591836734694e-06, "loss": 1.4219, "step": 4071 }, { "epoch": 1.3976, "grad_norm": 13.116612434387207, "learning_rate": 9.469387755102041e-06, "loss": 3.6732, "step": 4072 }, { "epoch": 1.3978, "grad_norm": 11.515417098999023, "learning_rate": 9.459183673469387e-06, "loss": 2.5414, "step": 4073 }, { "epoch": 1.3980000000000001, "grad_norm": 4.8623223304748535, "learning_rate": 9.448979591836735e-06, "loss": 0.7811, "step": 4074 }, { "epoch": 1.3982, "grad_norm": 5.086510181427002, "learning_rate": 9.438775510204082e-06, "loss": 0.7708, "step": 4075 }, { "epoch": 1.3984, "grad_norm": 3.5381362438201904, "learning_rate": 9.42857142857143e-06, "loss": 0.7936, "step": 4076 }, { "epoch": 1.3986, "grad_norm": 4.599300861358643, "learning_rate": 9.418367346938777e-06, "loss": 0.7034, "step": 4077 }, { "epoch": 1.3988, "grad_norm": 5.645698070526123, "learning_rate": 9.408163265306123e-06, "loss": 0.8146, "step": 4078 }, { "epoch": 1.399, "grad_norm": 3.565887212753296, "learning_rate": 9.39795918367347e-06, "loss": 0.6053, "step": 4079 }, { "epoch": 1.3992, "grad_norm": 4.043582916259766, "learning_rate": 9.387755102040816e-06, "loss": 0.5256, "step": 4080 }, { "epoch": 1.3994, "grad_norm": 3.9750559329986572, "learning_rate": 9.377551020408164e-06, "loss": 0.4568, "step": 4081 }, { "epoch": 1.3996, "grad_norm": 4.978681564331055, "learning_rate": 9.367346938775511e-06, "loss": 0.6702, "step": 4082 }, { "epoch": 1.3998, "grad_norm": 3.766176223754883, "learning_rate": 9.357142857142857e-06, "loss": 0.6297, "step": 4083 }, { "epoch": 1.4, "grad_norm": 6.139642238616943, "learning_rate": 9.346938775510204e-06, "loss": 0.8615, "step": 4084 }, { "epoch": 1.4002, "grad_norm": 5.712095737457275, "learning_rate": 9.336734693877552e-06, "loss": 0.7664, "step": 4085 }, { "epoch": 1.4003999999999999, "grad_norm": 4.974853038787842, "learning_rate": 9.326530612244898e-06, "loss": 0.7525, "step": 4086 }, { "epoch": 1.4006, "grad_norm": 3.8871517181396484, "learning_rate": 9.316326530612247e-06, "loss": 0.7542, "step": 4087 }, { "epoch": 1.4008, "grad_norm": 3.846158027648926, "learning_rate": 9.306122448979593e-06, "loss": 0.6825, "step": 4088 }, { "epoch": 1.401, "grad_norm": 5.497731685638428, "learning_rate": 9.29591836734694e-06, "loss": 0.7653, "step": 4089 }, { "epoch": 1.4012, "grad_norm": 4.662613391876221, "learning_rate": 9.285714285714286e-06, "loss": 0.7129, "step": 4090 }, { "epoch": 1.4014, "grad_norm": 3.373971462249756, "learning_rate": 9.275510204081633e-06, "loss": 0.5762, "step": 4091 }, { "epoch": 1.4016, "grad_norm": 4.859783172607422, "learning_rate": 9.26530612244898e-06, "loss": 0.7057, "step": 4092 }, { "epoch": 1.4018, "grad_norm": 6.266166687011719, "learning_rate": 9.255102040816327e-06, "loss": 0.8399, "step": 4093 }, { "epoch": 1.4020000000000001, "grad_norm": 3.591654062271118, "learning_rate": 9.244897959183674e-06, "loss": 0.6136, "step": 4094 }, { "epoch": 1.4022000000000001, "grad_norm": 3.9025590419769287, "learning_rate": 9.23469387755102e-06, "loss": 0.5225, "step": 4095 }, { "epoch": 1.4024, "grad_norm": 3.5609548091888428, "learning_rate": 9.224489795918367e-06, "loss": 0.4294, "step": 4096 }, { "epoch": 1.4026, "grad_norm": 4.027820587158203, "learning_rate": 9.214285714285715e-06, "loss": 0.8391, "step": 4097 }, { "epoch": 1.4028, "grad_norm": 3.616084337234497, "learning_rate": 9.20408163265306e-06, "loss": 0.7161, "step": 4098 }, { "epoch": 1.403, "grad_norm": 5.0737385749816895, "learning_rate": 9.19387755102041e-06, "loss": 0.7033, "step": 4099 }, { "epoch": 1.4032, "grad_norm": 3.8645384311676025, "learning_rate": 9.183673469387756e-06, "loss": 0.4397, "step": 4100 }, { "epoch": 1.4034, "grad_norm": 4.167079925537109, "learning_rate": 9.173469387755103e-06, "loss": 0.7173, "step": 4101 }, { "epoch": 1.4036, "grad_norm": 5.854258060455322, "learning_rate": 9.163265306122449e-06, "loss": 0.7312, "step": 4102 }, { "epoch": 1.4038, "grad_norm": 3.604153871536255, "learning_rate": 9.153061224489796e-06, "loss": 0.617, "step": 4103 }, { "epoch": 1.404, "grad_norm": 5.196364879608154, "learning_rate": 9.142857142857144e-06, "loss": 0.7194, "step": 4104 }, { "epoch": 1.4042, "grad_norm": 4.474340915679932, "learning_rate": 9.13265306122449e-06, "loss": 0.6409, "step": 4105 }, { "epoch": 1.4043999999999999, "grad_norm": 8.195387840270996, "learning_rate": 9.122448979591837e-06, "loss": 1.1349, "step": 4106 }, { "epoch": 1.4046, "grad_norm": 8.900938034057617, "learning_rate": 9.112244897959183e-06, "loss": 1.1237, "step": 4107 }, { "epoch": 1.4048, "grad_norm": 8.176107406616211, "learning_rate": 9.10204081632653e-06, "loss": 0.8196, "step": 4108 }, { "epoch": 1.405, "grad_norm": 6.472485542297363, "learning_rate": 9.091836734693878e-06, "loss": 0.7335, "step": 4109 }, { "epoch": 1.4052, "grad_norm": 5.660496234893799, "learning_rate": 9.081632653061225e-06, "loss": 0.7534, "step": 4110 }, { "epoch": 1.4054, "grad_norm": 5.223132133483887, "learning_rate": 9.071428571428573e-06, "loss": 0.8232, "step": 4111 }, { "epoch": 1.4056, "grad_norm": 4.634860515594482, "learning_rate": 9.061224489795919e-06, "loss": 0.6467, "step": 4112 }, { "epoch": 1.4058, "grad_norm": 7.191314220428467, "learning_rate": 9.051020408163266e-06, "loss": 0.7423, "step": 4113 }, { "epoch": 1.4060000000000001, "grad_norm": 3.7934906482696533, "learning_rate": 9.040816326530614e-06, "loss": 0.441, "step": 4114 }, { "epoch": 1.4062000000000001, "grad_norm": 5.6176910400390625, "learning_rate": 9.03061224489796e-06, "loss": 0.7425, "step": 4115 }, { "epoch": 1.4064, "grad_norm": 3.6926732063293457, "learning_rate": 9.020408163265307e-06, "loss": 0.8754, "step": 4116 }, { "epoch": 1.4066, "grad_norm": 4.297953128814697, "learning_rate": 9.010204081632653e-06, "loss": 0.6508, "step": 4117 }, { "epoch": 1.4068, "grad_norm": 2.9790539741516113, "learning_rate": 9e-06, "loss": 0.5164, "step": 4118 }, { "epoch": 1.407, "grad_norm": 3.659420967102051, "learning_rate": 8.989795918367348e-06, "loss": 0.4337, "step": 4119 }, { "epoch": 1.4072, "grad_norm": 3.8091375827789307, "learning_rate": 8.979591836734694e-06, "loss": 0.7421, "step": 4120 }, { "epoch": 1.4074, "grad_norm": 3.657268524169922, "learning_rate": 8.969387755102043e-06, "loss": 0.6517, "step": 4121 }, { "epoch": 1.4076, "grad_norm": 3.3698313236236572, "learning_rate": 8.959183673469388e-06, "loss": 0.619, "step": 4122 }, { "epoch": 1.4078, "grad_norm": 3.6699438095092773, "learning_rate": 8.948979591836736e-06, "loss": 0.4399, "step": 4123 }, { "epoch": 1.408, "grad_norm": 3.251619338989258, "learning_rate": 8.938775510204082e-06, "loss": 0.5502, "step": 4124 }, { "epoch": 1.4082, "grad_norm": 3.619903326034546, "learning_rate": 8.92857142857143e-06, "loss": 0.3735, "step": 4125 }, { "epoch": 1.4083999999999999, "grad_norm": 3.4549977779388428, "learning_rate": 8.918367346938777e-06, "loss": 0.5709, "step": 4126 }, { "epoch": 1.4086, "grad_norm": 3.7849767208099365, "learning_rate": 8.908163265306123e-06, "loss": 0.6561, "step": 4127 }, { "epoch": 1.4088, "grad_norm": 3.783862352371216, "learning_rate": 8.89795918367347e-06, "loss": 0.5563, "step": 4128 }, { "epoch": 1.409, "grad_norm": 3.595287799835205, "learning_rate": 8.887755102040816e-06, "loss": 0.9047, "step": 4129 }, { "epoch": 1.4092, "grad_norm": 3.724360466003418, "learning_rate": 8.877551020408163e-06, "loss": 0.6573, "step": 4130 }, { "epoch": 1.4094, "grad_norm": 4.294783115386963, "learning_rate": 8.86734693877551e-06, "loss": 0.6083, "step": 4131 }, { "epoch": 1.4096, "grad_norm": 3.358783006668091, "learning_rate": 8.857142857142857e-06, "loss": 0.3971, "step": 4132 }, { "epoch": 1.4098, "grad_norm": 3.7183899879455566, "learning_rate": 8.846938775510206e-06, "loss": 0.8904, "step": 4133 }, { "epoch": 1.41, "grad_norm": 3.684494972229004, "learning_rate": 8.836734693877552e-06, "loss": 0.7093, "step": 4134 }, { "epoch": 1.4102000000000001, "grad_norm": 5.470197677612305, "learning_rate": 8.826530612244899e-06, "loss": 0.7394, "step": 4135 }, { "epoch": 1.4104, "grad_norm": 3.2948615550994873, "learning_rate": 8.816326530612245e-06, "loss": 0.5397, "step": 4136 }, { "epoch": 1.4106, "grad_norm": 4.346402168273926, "learning_rate": 8.806122448979592e-06, "loss": 0.4657, "step": 4137 }, { "epoch": 1.4108, "grad_norm": 12.534156799316406, "learning_rate": 8.79591836734694e-06, "loss": 0.7853, "step": 4138 }, { "epoch": 1.411, "grad_norm": 3.222975015640259, "learning_rate": 8.785714285714286e-06, "loss": 0.5772, "step": 4139 }, { "epoch": 1.4112, "grad_norm": 3.6416501998901367, "learning_rate": 8.775510204081633e-06, "loss": 0.6124, "step": 4140 }, { "epoch": 1.4114, "grad_norm": 5.341848373413086, "learning_rate": 8.765306122448979e-06, "loss": 0.8143, "step": 4141 }, { "epoch": 1.4116, "grad_norm": 3.1770570278167725, "learning_rate": 8.755102040816326e-06, "loss": 0.5841, "step": 4142 }, { "epoch": 1.4118, "grad_norm": 3.6214945316314697, "learning_rate": 8.744897959183674e-06, "loss": 0.435, "step": 4143 }, { "epoch": 1.412, "grad_norm": 3.3380117416381836, "learning_rate": 8.734693877551021e-06, "loss": 0.5665, "step": 4144 }, { "epoch": 1.4122, "grad_norm": 4.026908874511719, "learning_rate": 8.724489795918369e-06, "loss": 0.5302, "step": 4145 }, { "epoch": 1.4123999999999999, "grad_norm": 4.105876445770264, "learning_rate": 8.714285714285715e-06, "loss": 0.4974, "step": 4146 }, { "epoch": 1.4126, "grad_norm": 4.341339111328125, "learning_rate": 8.704081632653062e-06, "loss": 0.5447, "step": 4147 }, { "epoch": 1.4128, "grad_norm": 6.510222911834717, "learning_rate": 8.69387755102041e-06, "loss": 0.4419, "step": 4148 }, { "epoch": 1.413, "grad_norm": 6.3148579597473145, "learning_rate": 8.683673469387755e-06, "loss": 1.0734, "step": 4149 }, { "epoch": 1.4132, "grad_norm": 6.665963649749756, "learning_rate": 8.673469387755103e-06, "loss": 1.1138, "step": 4150 }, { "epoch": 1.4134, "grad_norm": 5.742279052734375, "learning_rate": 8.663265306122449e-06, "loss": 0.725, "step": 4151 }, { "epoch": 1.4136, "grad_norm": 3.820310354232788, "learning_rate": 8.653061224489796e-06, "loss": 0.6811, "step": 4152 }, { "epoch": 1.4138, "grad_norm": 5.027363300323486, "learning_rate": 8.642857142857144e-06, "loss": 0.7551, "step": 4153 }, { "epoch": 1.414, "grad_norm": 4.043216228485107, "learning_rate": 8.63265306122449e-06, "loss": 0.5471, "step": 4154 }, { "epoch": 1.4142000000000001, "grad_norm": 6.13849401473999, "learning_rate": 8.622448979591839e-06, "loss": 0.7599, "step": 4155 }, { "epoch": 1.4144, "grad_norm": 4.4201812744140625, "learning_rate": 8.612244897959184e-06, "loss": 0.5476, "step": 4156 }, { "epoch": 1.4146, "grad_norm": 4.416066646575928, "learning_rate": 8.602040816326532e-06, "loss": 0.5428, "step": 4157 }, { "epoch": 1.4148, "grad_norm": 4.235730171203613, "learning_rate": 8.591836734693878e-06, "loss": 0.5485, "step": 4158 }, { "epoch": 1.415, "grad_norm": 4.274699687957764, "learning_rate": 8.581632653061225e-06, "loss": 0.5564, "step": 4159 }, { "epoch": 1.4152, "grad_norm": 4.5172224044799805, "learning_rate": 8.571428571428573e-06, "loss": 0.6145, "step": 4160 }, { "epoch": 1.4154, "grad_norm": 4.030407428741455, "learning_rate": 8.561224489795918e-06, "loss": 0.4312, "step": 4161 }, { "epoch": 1.4156, "grad_norm": 8.166952133178711, "learning_rate": 8.551020408163266e-06, "loss": 1.1238, "step": 4162 }, { "epoch": 1.4158, "grad_norm": 5.0052337646484375, "learning_rate": 8.540816326530612e-06, "loss": 1.0528, "step": 4163 }, { "epoch": 1.416, "grad_norm": 11.661174774169922, "learning_rate": 8.53061224489796e-06, "loss": 1.1472, "step": 4164 }, { "epoch": 1.4162, "grad_norm": 5.528961181640625, "learning_rate": 8.520408163265307e-06, "loss": 1.1316, "step": 4165 }, { "epoch": 1.4163999999999999, "grad_norm": 5.657470703125, "learning_rate": 8.510204081632652e-06, "loss": 1.0495, "step": 4166 }, { "epoch": 1.4166, "grad_norm": 4.580628395080566, "learning_rate": 8.500000000000002e-06, "loss": 0.6827, "step": 4167 }, { "epoch": 1.4168, "grad_norm": 7.498333930969238, "learning_rate": 8.489795918367347e-06, "loss": 1.3054, "step": 4168 }, { "epoch": 2.0002, "grad_norm": 2.02801513671875, "learning_rate": 8.479591836734695e-06, "loss": 0.1952, "step": 4169 }, { "epoch": 2.0004, "grad_norm": 11.26914119720459, "learning_rate": 8.46938775510204e-06, "loss": 2.2445, "step": 4170 }, { "epoch": 2.0006, "grad_norm": 2.6974380016326904, "learning_rate": 8.459183673469388e-06, "loss": 0.2363, "step": 4171 }, { "epoch": 2.0008, "grad_norm": 3.35848069190979, "learning_rate": 8.448979591836736e-06, "loss": 0.3612, "step": 4172 }, { "epoch": 2.001, "grad_norm": 4.7232279777526855, "learning_rate": 8.438775510204081e-06, "loss": 0.307, "step": 4173 }, { "epoch": 2.0012, "grad_norm": 0.9839540719985962, "learning_rate": 8.428571428571429e-06, "loss": 0.0254, "step": 4174 }, { "epoch": 2.0014, "grad_norm": 2.924433708190918, "learning_rate": 8.418367346938775e-06, "loss": 0.4624, "step": 4175 }, { "epoch": 2.0016, "grad_norm": 2.3263041973114014, "learning_rate": 8.408163265306122e-06, "loss": 0.2279, "step": 4176 }, { "epoch": 2.0018, "grad_norm": 4.812216281890869, "learning_rate": 8.39795918367347e-06, "loss": 1.4139, "step": 4177 }, { "epoch": 2.002, "grad_norm": 2.2195217609405518, "learning_rate": 8.387755102040817e-06, "loss": 0.1295, "step": 4178 }, { "epoch": 2.0022, "grad_norm": 2.8188059329986572, "learning_rate": 8.377551020408165e-06, "loss": 0.4121, "step": 4179 }, { "epoch": 2.0024, "grad_norm": 1.218191146850586, "learning_rate": 8.36734693877551e-06, "loss": 0.0442, "step": 4180 }, { "epoch": 2.0026, "grad_norm": 3.5840089321136475, "learning_rate": 8.357142857142858e-06, "loss": 1.4749, "step": 4181 }, { "epoch": 2.0028, "grad_norm": 2.198009967803955, "learning_rate": 8.346938775510205e-06, "loss": 0.2235, "step": 4182 }, { "epoch": 2.003, "grad_norm": 4.087214946746826, "learning_rate": 8.336734693877551e-06, "loss": 0.5768, "step": 4183 }, { "epoch": 2.0032, "grad_norm": 6.406209945678711, "learning_rate": 8.326530612244899e-06, "loss": 1.4211, "step": 4184 }, { "epoch": 2.0034, "grad_norm": 7.409274578094482, "learning_rate": 8.316326530612245e-06, "loss": 1.9132, "step": 4185 }, { "epoch": 2.0036, "grad_norm": 9.410537719726562, "learning_rate": 8.306122448979592e-06, "loss": 3.1996, "step": 4186 }, { "epoch": 2.0038, "grad_norm": 1.7160249948501587, "learning_rate": 8.29591836734694e-06, "loss": 0.1781, "step": 4187 }, { "epoch": 2.004, "grad_norm": 8.489348411560059, "learning_rate": 8.285714285714285e-06, "loss": 2.0185, "step": 4188 }, { "epoch": 2.0042, "grad_norm": 2.1543900966644287, "learning_rate": 8.275510204081633e-06, "loss": 0.1903, "step": 4189 }, { "epoch": 2.0044, "grad_norm": 18.228240966796875, "learning_rate": 8.26530612244898e-06, "loss": 4.8913, "step": 4190 }, { "epoch": 2.0046, "grad_norm": 1.31488037109375, "learning_rate": 8.255102040816328e-06, "loss": 0.0419, "step": 4191 }, { "epoch": 2.0048, "grad_norm": 2.7342636585235596, "learning_rate": 8.244897959183674e-06, "loss": 0.2603, "step": 4192 }, { "epoch": 2.005, "grad_norm": 3.703019142150879, "learning_rate": 8.234693877551021e-06, "loss": 0.2315, "step": 4193 }, { "epoch": 2.0052, "grad_norm": 11.856758117675781, "learning_rate": 8.224489795918369e-06, "loss": 2.0342, "step": 4194 }, { "epoch": 2.0054, "grad_norm": 3.2413318157196045, "learning_rate": 8.214285714285714e-06, "loss": 0.2025, "step": 4195 }, { "epoch": 2.0056, "grad_norm": 21.850154876708984, "learning_rate": 8.204081632653062e-06, "loss": 5.154, "step": 4196 }, { "epoch": 2.0058, "grad_norm": 3.440854549407959, "learning_rate": 8.193877551020408e-06, "loss": 0.3227, "step": 4197 }, { "epoch": 2.006, "grad_norm": 3.0768935680389404, "learning_rate": 8.183673469387755e-06, "loss": 0.3272, "step": 4198 }, { "epoch": 2.0062, "grad_norm": 3.454883575439453, "learning_rate": 8.173469387755103e-06, "loss": 0.4007, "step": 4199 }, { "epoch": 2.0064, "grad_norm": 1.5237864255905151, "learning_rate": 8.163265306122448e-06, "loss": 0.0421, "step": 4200 }, { "epoch": 2.0066, "grad_norm": 7.132639408111572, "learning_rate": 8.153061224489798e-06, "loss": 1.1057, "step": 4201 }, { "epoch": 2.0068, "grad_norm": 3.31197452545166, "learning_rate": 8.142857142857143e-06, "loss": 0.2151, "step": 4202 }, { "epoch": 2.007, "grad_norm": 3.2910704612731934, "learning_rate": 8.13265306122449e-06, "loss": 0.4873, "step": 4203 }, { "epoch": 2.0072, "grad_norm": 5.533116340637207, "learning_rate": 8.122448979591837e-06, "loss": 0.6616, "step": 4204 }, { "epoch": 2.0074, "grad_norm": 5.0104265213012695, "learning_rate": 8.112244897959184e-06, "loss": 1.5563, "step": 4205 }, { "epoch": 2.0076, "grad_norm": 2.2984113693237305, "learning_rate": 8.102040816326532e-06, "loss": 0.565, "step": 4206 }, { "epoch": 2.0078, "grad_norm": 5.124585151672363, "learning_rate": 8.091836734693877e-06, "loss": 0.9707, "step": 4207 }, { "epoch": 2.008, "grad_norm": 3.2739717960357666, "learning_rate": 8.081632653061225e-06, "loss": 1.4951, "step": 4208 }, { "epoch": 2.0082, "grad_norm": 9.073833465576172, "learning_rate": 8.07142857142857e-06, "loss": 2.1831, "step": 4209 }, { "epoch": 2.0084, "grad_norm": 3.1194498538970947, "learning_rate": 8.061224489795918e-06, "loss": 0.3266, "step": 4210 }, { "epoch": 2.0086, "grad_norm": 3.2492129802703857, "learning_rate": 8.051020408163266e-06, "loss": 0.2966, "step": 4211 }, { "epoch": 2.0088, "grad_norm": 9.486139297485352, "learning_rate": 8.040816326530613e-06, "loss": 1.0458, "step": 4212 }, { "epoch": 2.009, "grad_norm": 1.5922856330871582, "learning_rate": 8.03061224489796e-06, "loss": 0.2079, "step": 4213 }, { "epoch": 2.0092, "grad_norm": 5.670705795288086, "learning_rate": 8.020408163265306e-06, "loss": 1.0761, "step": 4214 }, { "epoch": 2.0094, "grad_norm": 3.3283936977386475, "learning_rate": 8.010204081632654e-06, "loss": 0.1212, "step": 4215 }, { "epoch": 2.0096, "grad_norm": 11.98408317565918, "learning_rate": 8.000000000000001e-06, "loss": 1.9528, "step": 4216 }, { "epoch": 2.0098, "grad_norm": 3.9587674140930176, "learning_rate": 7.989795918367347e-06, "loss": 0.4188, "step": 4217 }, { "epoch": 2.01, "grad_norm": 5.890332221984863, "learning_rate": 7.979591836734695e-06, "loss": 1.4773, "step": 4218 }, { "epoch": 2.0102, "grad_norm": 4.72938346862793, "learning_rate": 7.96938775510204e-06, "loss": 0.6635, "step": 4219 }, { "epoch": 2.0104, "grad_norm": 6.216917514801025, "learning_rate": 7.959183673469388e-06, "loss": 1.1733, "step": 4220 }, { "epoch": 2.0106, "grad_norm": 2.5533437728881836, "learning_rate": 7.948979591836735e-06, "loss": 0.3137, "step": 4221 }, { "epoch": 2.0108, "grad_norm": 3.2857799530029297, "learning_rate": 7.938775510204081e-06, "loss": 1.569, "step": 4222 }, { "epoch": 2.011, "grad_norm": 2.7982468605041504, "learning_rate": 7.928571428571429e-06, "loss": 0.3094, "step": 4223 }, { "epoch": 2.0112, "grad_norm": 2.6701512336730957, "learning_rate": 7.918367346938776e-06, "loss": 0.6731, "step": 4224 }, { "epoch": 2.0114, "grad_norm": 3.838090181350708, "learning_rate": 7.908163265306124e-06, "loss": 0.5143, "step": 4225 }, { "epoch": 2.0116, "grad_norm": 7.445224285125732, "learning_rate": 7.89795918367347e-06, "loss": 1.9191, "step": 4226 }, { "epoch": 2.0118, "grad_norm": 5.421460151672363, "learning_rate": 7.887755102040817e-06, "loss": 1.8454, "step": 4227 }, { "epoch": 2.012, "grad_norm": 11.494588851928711, "learning_rate": 7.877551020408164e-06, "loss": 1.2474, "step": 4228 }, { "epoch": 2.0122, "grad_norm": 4.028449058532715, "learning_rate": 7.86734693877551e-06, "loss": 2.0761, "step": 4229 }, { "epoch": 2.0124, "grad_norm": 2.2099153995513916, "learning_rate": 7.857142857142858e-06, "loss": 0.1245, "step": 4230 }, { "epoch": 2.0126, "grad_norm": 3.8839001655578613, "learning_rate": 7.846938775510203e-06, "loss": 1.7844, "step": 4231 }, { "epoch": 2.0128, "grad_norm": 3.8286972045898438, "learning_rate": 7.836734693877551e-06, "loss": 0.6683, "step": 4232 }, { "epoch": 2.013, "grad_norm": 3.683375835418701, "learning_rate": 7.826530612244898e-06, "loss": 2.0083, "step": 4233 }, { "epoch": 2.0132, "grad_norm": 6.994391918182373, "learning_rate": 7.816326530612244e-06, "loss": 2.0595, "step": 4234 }, { "epoch": 2.0134, "grad_norm": 8.173694610595703, "learning_rate": 7.806122448979593e-06, "loss": 2.3681, "step": 4235 }, { "epoch": 2.0136, "grad_norm": 9.8168363571167, "learning_rate": 7.79591836734694e-06, "loss": 1.6911, "step": 4236 }, { "epoch": 2.0138, "grad_norm": 9.103837013244629, "learning_rate": 7.785714285714287e-06, "loss": 3.4232, "step": 4237 }, { "epoch": 2.014, "grad_norm": 9.685304641723633, "learning_rate": 7.775510204081632e-06, "loss": 3.0462, "step": 4238 }, { "epoch": 2.0142, "grad_norm": 2.858280897140503, "learning_rate": 7.76530612244898e-06, "loss": 0.4622, "step": 4239 }, { "epoch": 2.0144, "grad_norm": 7.876575946807861, "learning_rate": 7.755102040816327e-06, "loss": 2.1807, "step": 4240 }, { "epoch": 2.0146, "grad_norm": 3.4903438091278076, "learning_rate": 7.744897959183673e-06, "loss": 1.124, "step": 4241 }, { "epoch": 2.0148, "grad_norm": 6.04346227645874, "learning_rate": 7.73469387755102e-06, "loss": 1.4367, "step": 4242 }, { "epoch": 2.015, "grad_norm": 25.000646591186523, "learning_rate": 7.724489795918367e-06, "loss": 3.2368, "step": 4243 }, { "epoch": 2.0152, "grad_norm": 9.479928016662598, "learning_rate": 7.714285714285714e-06, "loss": 2.1588, "step": 4244 }, { "epoch": 2.0154, "grad_norm": 8.344285011291504, "learning_rate": 7.704081632653061e-06, "loss": 1.5787, "step": 4245 }, { "epoch": 2.0156, "grad_norm": 3.25274920463562, "learning_rate": 7.693877551020409e-06, "loss": 1.4998, "step": 4246 }, { "epoch": 2.0158, "grad_norm": 8.191744804382324, "learning_rate": 7.683673469387756e-06, "loss": 2.2567, "step": 4247 }, { "epoch": 2.016, "grad_norm": 4.326716899871826, "learning_rate": 7.673469387755102e-06, "loss": 0.8182, "step": 4248 }, { "epoch": 2.0162, "grad_norm": 3.1718268394470215, "learning_rate": 7.66326530612245e-06, "loss": 1.5348, "step": 4249 }, { "epoch": 2.0164, "grad_norm": 6.938101291656494, "learning_rate": 7.653061224489797e-06, "loss": 2.21, "step": 4250 }, { "epoch": 2.0166, "grad_norm": 5.614826679229736, "learning_rate": 7.642857142857143e-06, "loss": 1.0724, "step": 4251 }, { "epoch": 2.0168, "grad_norm": 4.613611221313477, "learning_rate": 7.63265306122449e-06, "loss": 0.8283, "step": 4252 }, { "epoch": 2.017, "grad_norm": 8.282724380493164, "learning_rate": 7.622448979591837e-06, "loss": 3.3605, "step": 4253 }, { "epoch": 2.0172, "grad_norm": 5.70613431930542, "learning_rate": 7.612244897959184e-06, "loss": 0.3506, "step": 4254 }, { "epoch": 2.0174, "grad_norm": 2.392794609069824, "learning_rate": 7.60204081632653e-06, "loss": 0.2306, "step": 4255 }, { "epoch": 2.0176, "grad_norm": 3.4948360919952393, "learning_rate": 7.591836734693877e-06, "loss": 0.2047, "step": 4256 }, { "epoch": 2.0178, "grad_norm": 4.542838096618652, "learning_rate": 7.5816326530612245e-06, "loss": 0.5617, "step": 4257 }, { "epoch": 2.018, "grad_norm": 4.445930004119873, "learning_rate": 7.571428571428572e-06, "loss": 0.6029, "step": 4258 }, { "epoch": 2.0182, "grad_norm": 3.355067491531372, "learning_rate": 7.5612244897959195e-06, "loss": 0.3332, "step": 4259 }, { "epoch": 2.0184, "grad_norm": 2.9681215286254883, "learning_rate": 7.551020408163266e-06, "loss": 0.2963, "step": 4260 }, { "epoch": 2.0186, "grad_norm": 4.1483988761901855, "learning_rate": 7.540816326530613e-06, "loss": 0.4423, "step": 4261 }, { "epoch": 2.0188, "grad_norm": 3.583317756652832, "learning_rate": 7.5306122448979594e-06, "loss": 0.211, "step": 4262 }, { "epoch": 2.019, "grad_norm": 8.856846809387207, "learning_rate": 7.520408163265306e-06, "loss": 2.3365, "step": 4263 }, { "epoch": 2.0192, "grad_norm": 4.985814571380615, "learning_rate": 7.5102040816326536e-06, "loss": 0.8046, "step": 4264 }, { "epoch": 2.0194, "grad_norm": 10.49881649017334, "learning_rate": 7.5e-06, "loss": 5.0147, "step": 4265 }, { "epoch": 2.0196, "grad_norm": 8.625734329223633, "learning_rate": 7.489795918367347e-06, "loss": 0.2431, "step": 4266 }, { "epoch": 2.0198, "grad_norm": 8.84630298614502, "learning_rate": 7.4795918367346935e-06, "loss": 1.6197, "step": 4267 }, { "epoch": 2.02, "grad_norm": 1.5825251340866089, "learning_rate": 7.46938775510204e-06, "loss": 0.0774, "step": 4268 }, { "epoch": 2.0202, "grad_norm": 1.0772866010665894, "learning_rate": 7.4591836734693884e-06, "loss": 0.0352, "step": 4269 }, { "epoch": 2.0204, "grad_norm": 3.064786911010742, "learning_rate": 7.448979591836736e-06, "loss": 0.3147, "step": 4270 }, { "epoch": 2.0206, "grad_norm": 4.729042053222656, "learning_rate": 7.4387755102040826e-06, "loss": 0.1488, "step": 4271 }, { "epoch": 2.0208, "grad_norm": 0.9412669539451599, "learning_rate": 7.428571428571429e-06, "loss": 0.0267, "step": 4272 }, { "epoch": 2.021, "grad_norm": 2.5195536613464355, "learning_rate": 7.418367346938776e-06, "loss": 0.2004, "step": 4273 }, { "epoch": 2.0212, "grad_norm": 4.93627405166626, "learning_rate": 7.4081632653061225e-06, "loss": 1.0138, "step": 4274 }, { "epoch": 2.0214, "grad_norm": 3.3213465213775635, "learning_rate": 7.39795918367347e-06, "loss": 0.2463, "step": 4275 }, { "epoch": 2.0216, "grad_norm": 9.367770195007324, "learning_rate": 7.387755102040817e-06, "loss": 1.5291, "step": 4276 }, { "epoch": 2.0218, "grad_norm": 5.484495639801025, "learning_rate": 7.377551020408163e-06, "loss": 0.3543, "step": 4277 }, { "epoch": 2.022, "grad_norm": 2.817056894302368, "learning_rate": 7.36734693877551e-06, "loss": 0.2793, "step": 4278 }, { "epoch": 2.0222, "grad_norm": 4.89556884765625, "learning_rate": 7.3571428571428565e-06, "loss": 0.4653, "step": 4279 }, { "epoch": 2.0224, "grad_norm": 5.620337009429932, "learning_rate": 7.346938775510205e-06, "loss": 1.0702, "step": 4280 }, { "epoch": 2.0226, "grad_norm": 8.641432762145996, "learning_rate": 7.3367346938775515e-06, "loss": 1.0402, "step": 4281 }, { "epoch": 2.0228, "grad_norm": 3.3844666481018066, "learning_rate": 7.326530612244899e-06, "loss": 1.4472, "step": 4282 }, { "epoch": 2.023, "grad_norm": 4.433985710144043, "learning_rate": 7.316326530612246e-06, "loss": 0.6459, "step": 4283 }, { "epoch": 2.0232, "grad_norm": 1.9350392818450928, "learning_rate": 7.306122448979592e-06, "loss": 0.1746, "step": 4284 }, { "epoch": 2.0234, "grad_norm": 6.165185451507568, "learning_rate": 7.295918367346939e-06, "loss": 0.6697, "step": 4285 }, { "epoch": 2.0236, "grad_norm": 2.852959156036377, "learning_rate": 7.285714285714286e-06, "loss": 0.3198, "step": 4286 }, { "epoch": 2.0238, "grad_norm": 8.078595161437988, "learning_rate": 7.275510204081633e-06, "loss": 2.5873, "step": 4287 }, { "epoch": 2.024, "grad_norm": 4.600830554962158, "learning_rate": 7.26530612244898e-06, "loss": 1.077, "step": 4288 }, { "epoch": 2.0242, "grad_norm": 10.893990516662598, "learning_rate": 7.255102040816326e-06, "loss": 1.9659, "step": 4289 }, { "epoch": 2.0244, "grad_norm": 5.1831207275390625, "learning_rate": 7.244897959183673e-06, "loss": 2.1732, "step": 4290 }, { "epoch": 2.0246, "grad_norm": 25.182043075561523, "learning_rate": 7.2346938775510204e-06, "loss": 3.6428, "step": 4291 }, { "epoch": 2.0248, "grad_norm": 10.35595703125, "learning_rate": 7.224489795918368e-06, "loss": 2.1839, "step": 4292 }, { "epoch": 2.025, "grad_norm": 32.797996520996094, "learning_rate": 7.214285714285715e-06, "loss": 6.636, "step": 4293 }, { "epoch": 2.0252, "grad_norm": 4.553895473480225, "learning_rate": 7.204081632653062e-06, "loss": 0.5681, "step": 4294 }, { "epoch": 2.0254, "grad_norm": 9.21674633026123, "learning_rate": 7.193877551020409e-06, "loss": 4.7964, "step": 4295 }, { "epoch": 2.0256, "grad_norm": 4.0077714920043945, "learning_rate": 7.183673469387755e-06, "loss": 0.841, "step": 4296 }, { "epoch": 2.0258, "grad_norm": 4.008864402770996, "learning_rate": 7.173469387755102e-06, "loss": 0.5305, "step": 4297 }, { "epoch": 2.026, "grad_norm": 3.588186740875244, "learning_rate": 7.1632653061224494e-06, "loss": 1.5291, "step": 4298 }, { "epoch": 2.0262000000000002, "grad_norm": 4.0893330574035645, "learning_rate": 7.153061224489796e-06, "loss": 1.1, "step": 4299 }, { "epoch": 2.0264, "grad_norm": 3.504427194595337, "learning_rate": 7.142857142857143e-06, "loss": 1.3353, "step": 4300 }, { "epoch": 2.0266, "grad_norm": 3.810716390609741, "learning_rate": 7.132653061224489e-06, "loss": 2.1167, "step": 4301 }, { "epoch": 2.0268, "grad_norm": 12.104032516479492, "learning_rate": 7.122448979591837e-06, "loss": 5.6261, "step": 4302 }, { "epoch": 2.027, "grad_norm": 3.746196985244751, "learning_rate": 7.112244897959184e-06, "loss": 1.511, "step": 4303 }, { "epoch": 2.0272, "grad_norm": 12.053168296813965, "learning_rate": 7.102040816326532e-06, "loss": 3.4344, "step": 4304 }, { "epoch": 2.0274, "grad_norm": 1.4310916662216187, "learning_rate": 7.0918367346938785e-06, "loss": 0.0547, "step": 4305 }, { "epoch": 2.0276, "grad_norm": 7.563154697418213, "learning_rate": 7.081632653061225e-06, "loss": 0.9291, "step": 4306 }, { "epoch": 2.0278, "grad_norm": 2.7030062675476074, "learning_rate": 7.071428571428572e-06, "loss": 0.3102, "step": 4307 }, { "epoch": 2.028, "grad_norm": 16.48634910583496, "learning_rate": 7.061224489795918e-06, "loss": 2.2507, "step": 4308 }, { "epoch": 2.0282, "grad_norm": 3.9196617603302, "learning_rate": 7.051020408163266e-06, "loss": 0.5105, "step": 4309 }, { "epoch": 2.0284, "grad_norm": 4.114338397979736, "learning_rate": 7.0408163265306125e-06, "loss": 0.9735, "step": 4310 }, { "epoch": 2.0286, "grad_norm": 1.3699504137039185, "learning_rate": 7.030612244897959e-06, "loss": 0.0524, "step": 4311 }, { "epoch": 2.0288, "grad_norm": 3.6880979537963867, "learning_rate": 7.020408163265306e-06, "loss": 1.5327, "step": 4312 }, { "epoch": 2.029, "grad_norm": 5.881893157958984, "learning_rate": 7.010204081632652e-06, "loss": 1.4355, "step": 4313 }, { "epoch": 2.0292, "grad_norm": 4.015309810638428, "learning_rate": 7.000000000000001e-06, "loss": 0.6515, "step": 4314 }, { "epoch": 2.0294, "grad_norm": 9.127671241760254, "learning_rate": 6.989795918367348e-06, "loss": 5.0853, "step": 4315 }, { "epoch": 2.0296, "grad_norm": 9.522642135620117, "learning_rate": 6.979591836734695e-06, "loss": 2.508, "step": 4316 }, { "epoch": 2.0298, "grad_norm": 8.305075645446777, "learning_rate": 6.9693877551020415e-06, "loss": 2.0008, "step": 4317 }, { "epoch": 2.03, "grad_norm": 10.065943717956543, "learning_rate": 6.959183673469388e-06, "loss": 3.3854, "step": 4318 }, { "epoch": 2.0302, "grad_norm": 3.0334036350250244, "learning_rate": 6.948979591836735e-06, "loss": 0.51, "step": 4319 }, { "epoch": 2.0304, "grad_norm": 5.859947681427002, "learning_rate": 6.938775510204082e-06, "loss": 0.4441, "step": 4320 }, { "epoch": 2.0306, "grad_norm": 5.833090305328369, "learning_rate": 6.928571428571429e-06, "loss": 1.9727, "step": 4321 }, { "epoch": 2.0308, "grad_norm": 14.231902122497559, "learning_rate": 6.9183673469387755e-06, "loss": 2.0807, "step": 4322 }, { "epoch": 2.031, "grad_norm": 3.9816040992736816, "learning_rate": 6.908163265306122e-06, "loss": 2.2134, "step": 4323 }, { "epoch": 2.0312, "grad_norm": 10.230045318603516, "learning_rate": 6.897959183673469e-06, "loss": 1.8335, "step": 4324 }, { "epoch": 2.0314, "grad_norm": 3.276740789413452, "learning_rate": 6.887755102040816e-06, "loss": 1.4815, "step": 4325 }, { "epoch": 2.0316, "grad_norm": 6.0765790939331055, "learning_rate": 6.877551020408164e-06, "loss": 0.6174, "step": 4326 }, { "epoch": 2.0318, "grad_norm": 1.807294249534607, "learning_rate": 6.867346938775511e-06, "loss": 0.1718, "step": 4327 }, { "epoch": 2.032, "grad_norm": 12.442965507507324, "learning_rate": 6.857142857142858e-06, "loss": 1.442, "step": 4328 }, { "epoch": 2.0322, "grad_norm": 14.195085525512695, "learning_rate": 6.8469387755102046e-06, "loss": 0.8755, "step": 4329 }, { "epoch": 2.0324, "grad_norm": 2.847365617752075, "learning_rate": 6.836734693877551e-06, "loss": 0.4877, "step": 4330 }, { "epoch": 2.0326, "grad_norm": 11.989447593688965, "learning_rate": 6.826530612244898e-06, "loss": 3.3478, "step": 4331 }, { "epoch": 2.0328, "grad_norm": 1.4652307033538818, "learning_rate": 6.816326530612245e-06, "loss": 0.0239, "step": 4332 }, { "epoch": 2.033, "grad_norm": 3.8287265300750732, "learning_rate": 6.806122448979592e-06, "loss": 0.3573, "step": 4333 }, { "epoch": 2.0332, "grad_norm": 2.04620099067688, "learning_rate": 6.795918367346939e-06, "loss": 0.1761, "step": 4334 }, { "epoch": 2.0334, "grad_norm": 10.700292587280273, "learning_rate": 6.785714285714285e-06, "loss": 1.4991, "step": 4335 }, { "epoch": 2.0336, "grad_norm": 2.6859498023986816, "learning_rate": 6.775510204081633e-06, "loss": 0.1894, "step": 4336 }, { "epoch": 2.0338, "grad_norm": 7.146368503570557, "learning_rate": 6.76530612244898e-06, "loss": 1.0313, "step": 4337 }, { "epoch": 2.034, "grad_norm": 4.037665367126465, "learning_rate": 6.755102040816328e-06, "loss": 0.556, "step": 4338 }, { "epoch": 2.0342, "grad_norm": 3.5893335342407227, "learning_rate": 6.744897959183674e-06, "loss": 0.6636, "step": 4339 }, { "epoch": 2.0344, "grad_norm": 3.3621785640716553, "learning_rate": 6.734693877551021e-06, "loss": 1.4062, "step": 4340 }, { "epoch": 2.0346, "grad_norm": 5.736871242523193, "learning_rate": 6.724489795918368e-06, "loss": 1.1887, "step": 4341 }, { "epoch": 2.0348, "grad_norm": 8.566832542419434, "learning_rate": 6.714285714285714e-06, "loss": 5.0112, "step": 4342 }, { "epoch": 2.035, "grad_norm": 7.10464334487915, "learning_rate": 6.704081632653062e-06, "loss": 1.9142, "step": 4343 }, { "epoch": 2.0352, "grad_norm": 1.1951695680618286, "learning_rate": 6.693877551020408e-06, "loss": 0.0413, "step": 4344 }, { "epoch": 2.0354, "grad_norm": 1.8170075416564941, "learning_rate": 6.683673469387755e-06, "loss": 0.1659, "step": 4345 }, { "epoch": 2.0356, "grad_norm": 1.655696988105774, "learning_rate": 6.673469387755102e-06, "loss": 0.1428, "step": 4346 }, { "epoch": 2.0358, "grad_norm": 2.352597236633301, "learning_rate": 6.663265306122448e-06, "loss": 0.2514, "step": 4347 }, { "epoch": 2.036, "grad_norm": 3.6672985553741455, "learning_rate": 6.653061224489797e-06, "loss": 0.3943, "step": 4348 }, { "epoch": 2.0362, "grad_norm": 4.368422985076904, "learning_rate": 6.642857142857144e-06, "loss": 0.6111, "step": 4349 }, { "epoch": 2.0364, "grad_norm": 6.8434624671936035, "learning_rate": 6.632653061224491e-06, "loss": 1.3778, "step": 4350 }, { "epoch": 2.0366, "grad_norm": 1.9027135372161865, "learning_rate": 6.622448979591837e-06, "loss": 0.1582, "step": 4351 }, { "epoch": 2.0368, "grad_norm": 3.6676418781280518, "learning_rate": 6.612244897959184e-06, "loss": 1.6667, "step": 4352 }, { "epoch": 2.037, "grad_norm": 1.3977506160736084, "learning_rate": 6.602040816326531e-06, "loss": 0.0251, "step": 4353 }, { "epoch": 2.0372, "grad_norm": 4.393439769744873, "learning_rate": 6.591836734693878e-06, "loss": 0.2797, "step": 4354 }, { "epoch": 2.0374, "grad_norm": 1.9742120504379272, "learning_rate": 6.581632653061225e-06, "loss": 0.1793, "step": 4355 }, { "epoch": 2.0376, "grad_norm": 3.285804510116577, "learning_rate": 6.5714285714285714e-06, "loss": 0.2586, "step": 4356 }, { "epoch": 2.0378, "grad_norm": 3.293999195098877, "learning_rate": 6.561224489795918e-06, "loss": 0.1964, "step": 4357 }, { "epoch": 2.038, "grad_norm": 3.4198780059814453, "learning_rate": 6.551020408163265e-06, "loss": 0.5805, "step": 4358 }, { "epoch": 2.0382, "grad_norm": 10.4772367477417, "learning_rate": 6.540816326530612e-06, "loss": 1.8666, "step": 4359 }, { "epoch": 2.0384, "grad_norm": 2.4163289070129395, "learning_rate": 6.53061224489796e-06, "loss": 0.2647, "step": 4360 }, { "epoch": 2.0386, "grad_norm": 3.692894220352173, "learning_rate": 6.520408163265307e-06, "loss": 0.4535, "step": 4361 }, { "epoch": 2.0388, "grad_norm": 4.505298614501953, "learning_rate": 6.510204081632654e-06, "loss": 0.5907, "step": 4362 }, { "epoch": 2.039, "grad_norm": 4.578971862792969, "learning_rate": 6.5000000000000004e-06, "loss": 0.7895, "step": 4363 }, { "epoch": 2.0392, "grad_norm": 1.5840128660202026, "learning_rate": 6.489795918367347e-06, "loss": 0.1479, "step": 4364 }, { "epoch": 2.0394, "grad_norm": 4.764891624450684, "learning_rate": 6.4795918367346946e-06, "loss": 0.6096, "step": 4365 }, { "epoch": 2.0396, "grad_norm": 2.5816330909729004, "learning_rate": 6.469387755102041e-06, "loss": 0.3104, "step": 4366 }, { "epoch": 2.0398, "grad_norm": 2.8380391597747803, "learning_rate": 6.459183673469388e-06, "loss": 0.3112, "step": 4367 }, { "epoch": 2.04, "grad_norm": 5.092507839202881, "learning_rate": 6.4489795918367345e-06, "loss": 0.6007, "step": 4368 }, { "epoch": 2.0402, "grad_norm": 5.286302089691162, "learning_rate": 6.438775510204081e-06, "loss": 1.4574, "step": 4369 }, { "epoch": 2.0404, "grad_norm": 11.976883888244629, "learning_rate": 6.428571428571429e-06, "loss": 0.9541, "step": 4370 }, { "epoch": 2.0406, "grad_norm": 3.1366777420043945, "learning_rate": 6.418367346938776e-06, "loss": 0.5663, "step": 4371 }, { "epoch": 2.0408, "grad_norm": 5.570676326751709, "learning_rate": 6.408163265306124e-06, "loss": 1.6063, "step": 4372 }, { "epoch": 2.041, "grad_norm": 3.534306764602661, "learning_rate": 6.39795918367347e-06, "loss": 0.4283, "step": 4373 }, { "epoch": 2.0412, "grad_norm": 8.754057884216309, "learning_rate": 6.387755102040817e-06, "loss": 2.1288, "step": 4374 }, { "epoch": 2.0414, "grad_norm": 4.957884788513184, "learning_rate": 6.3775510204081635e-06, "loss": 0.5723, "step": 4375 }, { "epoch": 2.0416, "grad_norm": 3.3927721977233887, "learning_rate": 6.36734693877551e-06, "loss": 1.4173, "step": 4376 }, { "epoch": 2.0418, "grad_norm": 4.619457244873047, "learning_rate": 6.357142857142858e-06, "loss": 0.5361, "step": 4377 }, { "epoch": 2.042, "grad_norm": 2.0513007640838623, "learning_rate": 6.346938775510204e-06, "loss": 0.2143, "step": 4378 }, { "epoch": 2.0422, "grad_norm": 2.817387580871582, "learning_rate": 6.336734693877551e-06, "loss": 0.3838, "step": 4379 }, { "epoch": 2.0424, "grad_norm": 3.58909273147583, "learning_rate": 6.3265306122448975e-06, "loss": 0.3807, "step": 4380 }, { "epoch": 2.0426, "grad_norm": 4.798856735229492, "learning_rate": 6.316326530612245e-06, "loss": 1.434, "step": 4381 }, { "epoch": 2.0428, "grad_norm": 8.82713794708252, "learning_rate": 6.3061224489795925e-06, "loss": 1.4132, "step": 4382 }, { "epoch": 2.043, "grad_norm": 2.2979350090026855, "learning_rate": 6.29591836734694e-06, "loss": 0.1361, "step": 4383 }, { "epoch": 2.0432, "grad_norm": 2.487318754196167, "learning_rate": 6.285714285714287e-06, "loss": 0.5233, "step": 4384 }, { "epoch": 2.0434, "grad_norm": 4.03679084777832, "learning_rate": 6.275510204081633e-06, "loss": 0.4972, "step": 4385 }, { "epoch": 2.0436, "grad_norm": 5.528019905090332, "learning_rate": 6.26530612244898e-06, "loss": 2.2495, "step": 4386 }, { "epoch": 2.0438, "grad_norm": 1.5046391487121582, "learning_rate": 6.2551020408163266e-06, "loss": 0.0472, "step": 4387 }, { "epoch": 2.044, "grad_norm": 9.946911811828613, "learning_rate": 6.244897959183674e-06, "loss": 2.3159, "step": 4388 }, { "epoch": 2.0442, "grad_norm": 2.3810222148895264, "learning_rate": 6.234693877551021e-06, "loss": 0.183, "step": 4389 }, { "epoch": 2.0444, "grad_norm": 12.703330039978027, "learning_rate": 6.224489795918367e-06, "loss": 3.7456, "step": 4390 }, { "epoch": 2.0446, "grad_norm": 4.766584873199463, "learning_rate": 6.214285714285715e-06, "loss": 0.9412, "step": 4391 }, { "epoch": 2.0448, "grad_norm": 13.101330757141113, "learning_rate": 6.2040816326530614e-06, "loss": 1.9205, "step": 4392 }, { "epoch": 2.045, "grad_norm": 4.829402446746826, "learning_rate": 6.193877551020408e-06, "loss": 1.543, "step": 4393 }, { "epoch": 2.0452, "grad_norm": 7.299891948699951, "learning_rate": 6.1836734693877556e-06, "loss": 2.6336, "step": 4394 }, { "epoch": 2.0454, "grad_norm": 1.5839056968688965, "learning_rate": 6.173469387755102e-06, "loss": 0.0655, "step": 4395 }, { "epoch": 2.0456, "grad_norm": 2.8372104167938232, "learning_rate": 6.163265306122449e-06, "loss": 1.3619, "step": 4396 }, { "epoch": 2.0458, "grad_norm": 1.482968807220459, "learning_rate": 6.153061224489796e-06, "loss": 0.0501, "step": 4397 }, { "epoch": 2.046, "grad_norm": 2.1378843784332275, "learning_rate": 6.142857142857143e-06, "loss": 0.2139, "step": 4398 }, { "epoch": 2.0462, "grad_norm": 4.614658832550049, "learning_rate": 6.1326530612244905e-06, "loss": 0.2673, "step": 4399 }, { "epoch": 2.0464, "grad_norm": 6.131068229675293, "learning_rate": 6.122448979591837e-06, "loss": 0.5613, "step": 4400 }, { "epoch": 2.0466, "grad_norm": 7.097366809844971, "learning_rate": 6.112244897959184e-06, "loss": 0.7178, "step": 4401 }, { "epoch": 2.0468, "grad_norm": 13.37352180480957, "learning_rate": 6.102040816326531e-06, "loss": 1.3431, "step": 4402 }, { "epoch": 2.047, "grad_norm": 2.1020140647888184, "learning_rate": 6.091836734693878e-06, "loss": 0.2304, "step": 4403 }, { "epoch": 2.0472, "grad_norm": 2.519576072692871, "learning_rate": 6.0816326530612245e-06, "loss": 0.3093, "step": 4404 }, { "epoch": 2.0474, "grad_norm": 3.063638925552368, "learning_rate": 6.071428571428572e-06, "loss": 0.5728, "step": 4405 }, { "epoch": 2.0476, "grad_norm": 5.231879234313965, "learning_rate": 6.061224489795919e-06, "loss": 0.9442, "step": 4406 }, { "epoch": 2.0478, "grad_norm": 4.134194850921631, "learning_rate": 6.051020408163265e-06, "loss": 0.5767, "step": 4407 }, { "epoch": 2.048, "grad_norm": 4.078321933746338, "learning_rate": 6.040816326530613e-06, "loss": 0.6297, "step": 4408 }, { "epoch": 2.0482, "grad_norm": 8.251264572143555, "learning_rate": 6.030612244897959e-06, "loss": 2.754, "step": 4409 }, { "epoch": 2.0484, "grad_norm": 2.3369216918945312, "learning_rate": 6.020408163265306e-06, "loss": 0.4342, "step": 4410 }, { "epoch": 2.0486, "grad_norm": 6.78234338760376, "learning_rate": 6.0102040816326535e-06, "loss": 1.9212, "step": 4411 }, { "epoch": 2.0488, "grad_norm": 3.5200674533843994, "learning_rate": 6e-06, "loss": 1.628, "step": 4412 }, { "epoch": 2.049, "grad_norm": 2.2730045318603516, "learning_rate": 5.989795918367347e-06, "loss": 0.2568, "step": 4413 }, { "epoch": 2.0492, "grad_norm": 2.6582682132720947, "learning_rate": 5.979591836734694e-06, "loss": 0.2148, "step": 4414 }, { "epoch": 2.0494, "grad_norm": 5.059128761291504, "learning_rate": 5.969387755102041e-06, "loss": 0.5851, "step": 4415 }, { "epoch": 2.0496, "grad_norm": 2.932605266571045, "learning_rate": 5.959183673469388e-06, "loss": 0.5677, "step": 4416 }, { "epoch": 2.0498, "grad_norm": 2.9726316928863525, "learning_rate": 5.948979591836735e-06, "loss": 0.2871, "step": 4417 }, { "epoch": 2.05, "grad_norm": 3.800670623779297, "learning_rate": 5.938775510204082e-06, "loss": 1.1672, "step": 4418 }, { "epoch": 2.0502, "grad_norm": 7.1238579750061035, "learning_rate": 5.928571428571429e-06, "loss": 0.815, "step": 4419 }, { "epoch": 2.0504, "grad_norm": 15.937649726867676, "learning_rate": 5.918367346938776e-06, "loss": 3.0679, "step": 4420 }, { "epoch": 2.0506, "grad_norm": 3.0840086936950684, "learning_rate": 5.9081632653061224e-06, "loss": 1.4612, "step": 4421 }, { "epoch": 2.0508, "grad_norm": 4.5798187255859375, "learning_rate": 5.89795918367347e-06, "loss": 0.648, "step": 4422 }, { "epoch": 2.051, "grad_norm": 1.7499949932098389, "learning_rate": 5.8877551020408166e-06, "loss": 0.1632, "step": 4423 }, { "epoch": 2.0512, "grad_norm": 6.960896015167236, "learning_rate": 5.877551020408163e-06, "loss": 1.4873, "step": 4424 }, { "epoch": 2.0514, "grad_norm": 4.876622676849365, "learning_rate": 5.867346938775511e-06, "loss": 0.8274, "step": 4425 }, { "epoch": 2.0516, "grad_norm": 4.382575988769531, "learning_rate": 5.857142857142857e-06, "loss": 0.7828, "step": 4426 }, { "epoch": 2.0518, "grad_norm": 8.884764671325684, "learning_rate": 5.846938775510204e-06, "loss": 1.8684, "step": 4427 }, { "epoch": 2.052, "grad_norm": 6.382287502288818, "learning_rate": 5.8367346938775515e-06, "loss": 1.1565, "step": 4428 }, { "epoch": 2.0522, "grad_norm": 10.393208503723145, "learning_rate": 5.826530612244898e-06, "loss": 2.3059, "step": 4429 }, { "epoch": 2.0524, "grad_norm": 9.367992401123047, "learning_rate": 5.816326530612245e-06, "loss": 3.4319, "step": 4430 }, { "epoch": 2.0526, "grad_norm": 24.33891487121582, "learning_rate": 5.806122448979592e-06, "loss": 5.3695, "step": 4431 }, { "epoch": 2.0528, "grad_norm": 4.95985746383667, "learning_rate": 5.795918367346939e-06, "loss": 1.0752, "step": 4432 }, { "epoch": 2.053, "grad_norm": 20.802255630493164, "learning_rate": 5.785714285714286e-06, "loss": 3.8515, "step": 4433 }, { "epoch": 2.0532, "grad_norm": 3.7617321014404297, "learning_rate": 5.775510204081633e-06, "loss": 1.4798, "step": 4434 }, { "epoch": 2.0534, "grad_norm": 1.6784859895706177, "learning_rate": 5.76530612244898e-06, "loss": 0.0366, "step": 4435 }, { "epoch": 2.0536, "grad_norm": 4.394393444061279, "learning_rate": 5.755102040816327e-06, "loss": 0.5598, "step": 4436 }, { "epoch": 2.0538, "grad_norm": 6.262113094329834, "learning_rate": 5.744897959183674e-06, "loss": 1.5221, "step": 4437 }, { "epoch": 2.054, "grad_norm": 2.5081212520599365, "learning_rate": 5.73469387755102e-06, "loss": 0.2736, "step": 4438 }, { "epoch": 2.0542, "grad_norm": 15.906336784362793, "learning_rate": 5.724489795918368e-06, "loss": 5.0865, "step": 4439 }, { "epoch": 2.0544, "grad_norm": 4.739946365356445, "learning_rate": 5.7142857142857145e-06, "loss": 0.5971, "step": 4440 }, { "epoch": 2.0546, "grad_norm": 3.405334234237671, "learning_rate": 5.704081632653061e-06, "loss": 1.0074, "step": 4441 }, { "epoch": 2.0548, "grad_norm": 20.478851318359375, "learning_rate": 5.693877551020409e-06, "loss": 1.5863, "step": 4442 }, { "epoch": 2.055, "grad_norm": 3.57926344871521, "learning_rate": 5.683673469387755e-06, "loss": 1.4315, "step": 4443 }, { "epoch": 2.0552, "grad_norm": 18.06096839904785, "learning_rate": 5.673469387755103e-06, "loss": 3.774, "step": 4444 }, { "epoch": 2.0554, "grad_norm": 1.1555712223052979, "learning_rate": 5.663265306122449e-06, "loss": 0.0234, "step": 4445 }, { "epoch": 2.0556, "grad_norm": 12.743829727172852, "learning_rate": 5.653061224489796e-06, "loss": 1.3324, "step": 4446 }, { "epoch": 2.0558, "grad_norm": 1.9675103425979614, "learning_rate": 5.642857142857143e-06, "loss": 0.1721, "step": 4447 }, { "epoch": 2.056, "grad_norm": 11.829596519470215, "learning_rate": 5.63265306122449e-06, "loss": 3.8803, "step": 4448 }, { "epoch": 2.0562, "grad_norm": 1.3106498718261719, "learning_rate": 5.622448979591837e-06, "loss": 0.0457, "step": 4449 }, { "epoch": 2.0564, "grad_norm": 4.3609938621521, "learning_rate": 5.612244897959184e-06, "loss": 0.2479, "step": 4450 }, { "epoch": 2.0566, "grad_norm": 1.6185758113861084, "learning_rate": 5.602040816326531e-06, "loss": 0.1516, "step": 4451 }, { "epoch": 2.0568, "grad_norm": 2.02146577835083, "learning_rate": 5.5918367346938776e-06, "loss": 0.1894, "step": 4452 }, { "epoch": 2.057, "grad_norm": 1.1370601654052734, "learning_rate": 5.581632653061225e-06, "loss": 0.0356, "step": 4453 }, { "epoch": 2.0572, "grad_norm": 2.578493118286133, "learning_rate": 5.571428571428572e-06, "loss": 0.6012, "step": 4454 }, { "epoch": 2.0574, "grad_norm": 1.9479355812072754, "learning_rate": 5.561224489795918e-06, "loss": 0.2416, "step": 4455 }, { "epoch": 2.0576, "grad_norm": 4.031513690948486, "learning_rate": 5.551020408163266e-06, "loss": 1.8204, "step": 4456 }, { "epoch": 2.0578, "grad_norm": 2.8992726802825928, "learning_rate": 5.5408163265306125e-06, "loss": 0.3126, "step": 4457 }, { "epoch": 2.058, "grad_norm": 5.120978355407715, "learning_rate": 5.530612244897959e-06, "loss": 1.4985, "step": 4458 }, { "epoch": 2.0582, "grad_norm": 4.285439491271973, "learning_rate": 5.520408163265307e-06, "loss": 0.553, "step": 4459 }, { "epoch": 2.0584, "grad_norm": 1.931534767150879, "learning_rate": 5.510204081632653e-06, "loss": 0.1953, "step": 4460 }, { "epoch": 2.0586, "grad_norm": 3.271833658218384, "learning_rate": 5.500000000000001e-06, "loss": 0.5028, "step": 4461 }, { "epoch": 2.0588, "grad_norm": 1.9770781993865967, "learning_rate": 5.489795918367347e-06, "loss": 0.0458, "step": 4462 }, { "epoch": 2.059, "grad_norm": 4.2079997062683105, "learning_rate": 5.479591836734694e-06, "loss": 1.8211, "step": 4463 }, { "epoch": 2.0592, "grad_norm": 2.775291919708252, "learning_rate": 5.469387755102041e-06, "loss": 0.2137, "step": 4464 }, { "epoch": 2.0594, "grad_norm": 6.623566627502441, "learning_rate": 5.459183673469388e-06, "loss": 1.5583, "step": 4465 }, { "epoch": 2.0596, "grad_norm": 3.7688889503479004, "learning_rate": 5.448979591836735e-06, "loss": 0.1495, "step": 4466 }, { "epoch": 2.0598, "grad_norm": 3.026883363723755, "learning_rate": 5.438775510204082e-06, "loss": 0.593, "step": 4467 }, { "epoch": 2.06, "grad_norm": 5.533717155456543, "learning_rate": 5.428571428571429e-06, "loss": 1.3528, "step": 4468 }, { "epoch": 2.0602, "grad_norm": 2.3275177478790283, "learning_rate": 5.4183673469387755e-06, "loss": 0.0615, "step": 4469 }, { "epoch": 2.0604, "grad_norm": 8.552267074584961, "learning_rate": 5.408163265306123e-06, "loss": 4.6218, "step": 4470 }, { "epoch": 2.0606, "grad_norm": 2.0251433849334717, "learning_rate": 5.39795918367347e-06, "loss": 0.0536, "step": 4471 }, { "epoch": 2.0608, "grad_norm": 2.1941115856170654, "learning_rate": 5.387755102040816e-06, "loss": 0.1264, "step": 4472 }, { "epoch": 2.061, "grad_norm": 4.157203197479248, "learning_rate": 5.377551020408164e-06, "loss": 1.3165, "step": 4473 }, { "epoch": 2.0612, "grad_norm": 4.577666759490967, "learning_rate": 5.36734693877551e-06, "loss": 0.568, "step": 4474 }, { "epoch": 2.0614, "grad_norm": 8.491950035095215, "learning_rate": 5.357142857142857e-06, "loss": 4.998, "step": 4475 }, { "epoch": 2.0616, "grad_norm": 1.6711488962173462, "learning_rate": 5.3469387755102045e-06, "loss": 0.1915, "step": 4476 }, { "epoch": 2.0618, "grad_norm": 3.7359209060668945, "learning_rate": 5.336734693877551e-06, "loss": 1.6961, "step": 4477 }, { "epoch": 2.062, "grad_norm": 5.737436294555664, "learning_rate": 5.326530612244899e-06, "loss": 0.8686, "step": 4478 }, { "epoch": 2.0622, "grad_norm": 4.680469512939453, "learning_rate": 5.316326530612245e-06, "loss": 0.7464, "step": 4479 }, { "epoch": 2.0624, "grad_norm": 8.432143211364746, "learning_rate": 5.306122448979592e-06, "loss": 1.9435, "step": 4480 }, { "epoch": 2.0626, "grad_norm": 1.610304594039917, "learning_rate": 5.2959183673469386e-06, "loss": 0.111, "step": 4481 }, { "epoch": 2.0628, "grad_norm": 6.408649921417236, "learning_rate": 5.285714285714286e-06, "loss": 0.8789, "step": 4482 }, { "epoch": 2.063, "grad_norm": 2.3763010501861572, "learning_rate": 5.275510204081633e-06, "loss": 0.3241, "step": 4483 }, { "epoch": 2.0632, "grad_norm": 5.558871746063232, "learning_rate": 5.26530612244898e-06, "loss": 1.4693, "step": 4484 }, { "epoch": 2.0634, "grad_norm": 4.030745506286621, "learning_rate": 5.255102040816327e-06, "loss": 0.6144, "step": 4485 }, { "epoch": 2.0636, "grad_norm": 14.399699211120605, "learning_rate": 5.2448979591836735e-06, "loss": 0.8409, "step": 4486 }, { "epoch": 2.0638, "grad_norm": 2.289337635040283, "learning_rate": 5.234693877551021e-06, "loss": 0.192, "step": 4487 }, { "epoch": 2.064, "grad_norm": 26.556169509887695, "learning_rate": 5.224489795918368e-06, "loss": 2.6163, "step": 4488 }, { "epoch": 2.0642, "grad_norm": 1.5711888074874878, "learning_rate": 5.214285714285714e-06, "loss": 0.0689, "step": 4489 }, { "epoch": 2.0644, "grad_norm": 4.504660606384277, "learning_rate": 5.204081632653062e-06, "loss": 1.2876, "step": 4490 }, { "epoch": 2.0646, "grad_norm": 1.9299784898757935, "learning_rate": 5.193877551020408e-06, "loss": 0.491, "step": 4491 }, { "epoch": 2.0648, "grad_norm": 2.0123250484466553, "learning_rate": 5.183673469387755e-06, "loss": 0.2154, "step": 4492 }, { "epoch": 2.065, "grad_norm": 3.2566661834716797, "learning_rate": 5.1734693877551025e-06, "loss": 1.411, "step": 4493 }, { "epoch": 2.0652, "grad_norm": 1.9258058071136475, "learning_rate": 5.163265306122449e-06, "loss": 0.2116, "step": 4494 }, { "epoch": 2.0654, "grad_norm": 2.1975531578063965, "learning_rate": 5.153061224489797e-06, "loss": 0.2916, "step": 4495 }, { "epoch": 2.0656, "grad_norm": 1.9354606866836548, "learning_rate": 5.142857142857143e-06, "loss": 0.4395, "step": 4496 }, { "epoch": 2.0658, "grad_norm": 3.242257833480835, "learning_rate": 5.13265306122449e-06, "loss": 0.5785, "step": 4497 }, { "epoch": 2.066, "grad_norm": 3.7497684955596924, "learning_rate": 5.1224489795918365e-06, "loss": 1.5256, "step": 4498 }, { "epoch": 2.0662, "grad_norm": 6.796024322509766, "learning_rate": 5.112244897959184e-06, "loss": 0.4456, "step": 4499 }, { "epoch": 2.0664, "grad_norm": 4.2627716064453125, "learning_rate": 5.102040816326531e-06, "loss": 1.1087, "step": 4500 }, { "epoch": 2.0666, "grad_norm": 3.804938316345215, "learning_rate": 5.091836734693878e-06, "loss": 0.6744, "step": 4501 }, { "epoch": 2.0668, "grad_norm": 2.9033427238464355, "learning_rate": 5.081632653061225e-06, "loss": 1.4458, "step": 4502 }, { "epoch": 2.067, "grad_norm": 1.893961787223816, "learning_rate": 5.071428571428571e-06, "loss": 0.1196, "step": 4503 }, { "epoch": 2.0672, "grad_norm": 2.7537388801574707, "learning_rate": 5.061224489795919e-06, "loss": 0.2561, "step": 4504 }, { "epoch": 2.0674, "grad_norm": 2.0070817470550537, "learning_rate": 5.0510204081632655e-06, "loss": 0.1651, "step": 4505 }, { "epoch": 2.0676, "grad_norm": 7.835712909698486, "learning_rate": 5.040816326530612e-06, "loss": 0.67, "step": 4506 }, { "epoch": 2.0678, "grad_norm": 3.468743085861206, "learning_rate": 5.03061224489796e-06, "loss": 0.2437, "step": 4507 }, { "epoch": 2.068, "grad_norm": 8.640397071838379, "learning_rate": 5.020408163265306e-06, "loss": 1.3277, "step": 4508 }, { "epoch": 2.0682, "grad_norm": 9.059866905212402, "learning_rate": 5.010204081632653e-06, "loss": 1.7499, "step": 4509 }, { "epoch": 2.0684, "grad_norm": 4.851001739501953, "learning_rate": 5e-06, "loss": 0.273, "step": 4510 }, { "epoch": 2.0686, "grad_norm": 4.267689228057861, "learning_rate": 4.989795918367347e-06, "loss": 2.3196, "step": 4511 }, { "epoch": 2.0688, "grad_norm": 4.466909885406494, "learning_rate": 4.9795918367346945e-06, "loss": 1.4522, "step": 4512 }, { "epoch": 2.069, "grad_norm": 3.5190610885620117, "learning_rate": 4.969387755102041e-06, "loss": 1.5381, "step": 4513 }, { "epoch": 2.0692, "grad_norm": 7.272828102111816, "learning_rate": 4.959183673469388e-06, "loss": 4.6036, "step": 4514 }, { "epoch": 2.0694, "grad_norm": 4.943753242492676, "learning_rate": 4.9489795918367345e-06, "loss": 0.8271, "step": 4515 }, { "epoch": 2.0696, "grad_norm": 4.36967134475708, "learning_rate": 4.938775510204082e-06, "loss": 1.4336, "step": 4516 }, { "epoch": 2.0698, "grad_norm": 8.297110557556152, "learning_rate": 4.9285714285714286e-06, "loss": 2.122, "step": 4517 }, { "epoch": 2.07, "grad_norm": 7.8115668296813965, "learning_rate": 4.918367346938776e-06, "loss": 4.7454, "step": 4518 }, { "epoch": 2.0702, "grad_norm": 3.2708168029785156, "learning_rate": 4.908163265306123e-06, "loss": 0.311, "step": 4519 }, { "epoch": 2.0704, "grad_norm": 3.0260918140411377, "learning_rate": 4.897959183673469e-06, "loss": 0.2899, "step": 4520 }, { "epoch": 2.0705999999999998, "grad_norm": 1.0097763538360596, "learning_rate": 4.887755102040817e-06, "loss": 0.0318, "step": 4521 }, { "epoch": 2.0708, "grad_norm": 1.5027214288711548, "learning_rate": 4.8775510204081635e-06, "loss": 0.0214, "step": 4522 }, { "epoch": 2.071, "grad_norm": 1.852122187614441, "learning_rate": 4.86734693877551e-06, "loss": 0.1686, "step": 4523 }, { "epoch": 2.0712, "grad_norm": 1.601088047027588, "learning_rate": 4.857142857142858e-06, "loss": 0.0679, "step": 4524 }, { "epoch": 2.0714, "grad_norm": 2.005523681640625, "learning_rate": 4.846938775510204e-06, "loss": 0.1974, "step": 4525 }, { "epoch": 2.0716, "grad_norm": 1.9652719497680664, "learning_rate": 4.836734693877551e-06, "loss": 0.1106, "step": 4526 }, { "epoch": 2.0718, "grad_norm": 0.9379339218139648, "learning_rate": 4.826530612244898e-06, "loss": 0.0182, "step": 4527 }, { "epoch": 2.072, "grad_norm": 1.3320714235305786, "learning_rate": 4.816326530612245e-06, "loss": 0.0359, "step": 4528 }, { "epoch": 2.0722, "grad_norm": 3.7485291957855225, "learning_rate": 4.8061224489795925e-06, "loss": 0.4026, "step": 4529 }, { "epoch": 2.0724, "grad_norm": 0.9966763854026794, "learning_rate": 4.795918367346939e-06, "loss": 0.0292, "step": 4530 }, { "epoch": 2.0726, "grad_norm": 9.170167922973633, "learning_rate": 4.785714285714286e-06, "loss": 1.6943, "step": 4531 }, { "epoch": 2.0728, "grad_norm": 3.7105157375335693, "learning_rate": 4.775510204081632e-06, "loss": 0.3304, "step": 4532 }, { "epoch": 2.073, "grad_norm": 4.878239154815674, "learning_rate": 4.76530612244898e-06, "loss": 1.7712, "step": 4533 }, { "epoch": 2.0732, "grad_norm": 7.267502307891846, "learning_rate": 4.7551020408163265e-06, "loss": 1.1406, "step": 4534 }, { "epoch": 2.0734, "grad_norm": 7.83075475692749, "learning_rate": 4.744897959183674e-06, "loss": 0.7267, "step": 4535 }, { "epoch": 2.0736, "grad_norm": 2.710083484649658, "learning_rate": 4.734693877551021e-06, "loss": 0.2014, "step": 4536 }, { "epoch": 2.0738, "grad_norm": 2.002182960510254, "learning_rate": 4.724489795918367e-06, "loss": 0.1058, "step": 4537 }, { "epoch": 2.074, "grad_norm": 2.4177191257476807, "learning_rate": 4.714285714285715e-06, "loss": 0.131, "step": 4538 }, { "epoch": 2.0742, "grad_norm": 2.5525918006896973, "learning_rate": 4.704081632653061e-06, "loss": 0.115, "step": 4539 }, { "epoch": 2.0744, "grad_norm": 5.234180450439453, "learning_rate": 4.693877551020408e-06, "loss": 0.3899, "step": 4540 }, { "epoch": 2.0746, "grad_norm": 2.046069383621216, "learning_rate": 4.6836734693877555e-06, "loss": 0.1686, "step": 4541 }, { "epoch": 2.0748, "grad_norm": 11.975163459777832, "learning_rate": 4.673469387755102e-06, "loss": 1.2345, "step": 4542 }, { "epoch": 2.075, "grad_norm": 5.11647891998291, "learning_rate": 4.663265306122449e-06, "loss": 0.7455, "step": 4543 }, { "epoch": 2.0752, "grad_norm": 1.9567712545394897, "learning_rate": 4.653061224489796e-06, "loss": 0.1626, "step": 4544 }, { "epoch": 2.0754, "grad_norm": 10.514437675476074, "learning_rate": 4.642857142857143e-06, "loss": 2.4514, "step": 4545 }, { "epoch": 2.0756, "grad_norm": 1.8375921249389648, "learning_rate": 4.63265306122449e-06, "loss": 0.1867, "step": 4546 }, { "epoch": 2.0758, "grad_norm": 10.785018920898438, "learning_rate": 4.622448979591837e-06, "loss": 1.6582, "step": 4547 }, { "epoch": 2.076, "grad_norm": 2.0468430519104004, "learning_rate": 4.612244897959184e-06, "loss": 0.4524, "step": 4548 }, { "epoch": 2.0762, "grad_norm": 4.889520645141602, "learning_rate": 4.60204081632653e-06, "loss": 1.7329, "step": 4549 }, { "epoch": 2.0764, "grad_norm": 3.7147347927093506, "learning_rate": 4.591836734693878e-06, "loss": 1.6102, "step": 4550 }, { "epoch": 2.0766, "grad_norm": 9.234541893005371, "learning_rate": 4.5816326530612245e-06, "loss": 1.1917, "step": 4551 }, { "epoch": 2.0768, "grad_norm": 2.258267641067505, "learning_rate": 4.571428571428572e-06, "loss": 0.2806, "step": 4552 }, { "epoch": 2.077, "grad_norm": 11.382367134094238, "learning_rate": 4.561224489795919e-06, "loss": 3.6602, "step": 4553 }, { "epoch": 2.0772, "grad_norm": 3.3093225955963135, "learning_rate": 4.551020408163265e-06, "loss": 0.5698, "step": 4554 }, { "epoch": 2.0774, "grad_norm": 5.779459476470947, "learning_rate": 4.540816326530613e-06, "loss": 1.5482, "step": 4555 }, { "epoch": 2.0776, "grad_norm": 1.640430212020874, "learning_rate": 4.530612244897959e-06, "loss": 0.1241, "step": 4556 }, { "epoch": 2.0778, "grad_norm": 6.966116428375244, "learning_rate": 4.520408163265307e-06, "loss": 1.0226, "step": 4557 }, { "epoch": 2.078, "grad_norm": 5.381195068359375, "learning_rate": 4.5102040816326535e-06, "loss": 0.8309, "step": 4558 }, { "epoch": 2.0782, "grad_norm": 4.727212429046631, "learning_rate": 4.5e-06, "loss": 1.6594, "step": 4559 }, { "epoch": 2.0784, "grad_norm": 21.576250076293945, "learning_rate": 4.489795918367347e-06, "loss": 4.3874, "step": 4560 }, { "epoch": 2.0786, "grad_norm": 7.968533992767334, "learning_rate": 4.479591836734694e-06, "loss": 0.782, "step": 4561 }, { "epoch": 2.0788, "grad_norm": 10.656570434570312, "learning_rate": 4.469387755102041e-06, "loss": 3.7518, "step": 4562 }, { "epoch": 2.079, "grad_norm": 1.8644603490829468, "learning_rate": 4.459183673469388e-06, "loss": 0.1694, "step": 4563 }, { "epoch": 2.0792, "grad_norm": 11.678762435913086, "learning_rate": 4.448979591836735e-06, "loss": 1.506, "step": 4564 }, { "epoch": 2.0794, "grad_norm": 1.744140386581421, "learning_rate": 4.438775510204082e-06, "loss": 0.1903, "step": 4565 }, { "epoch": 2.0796, "grad_norm": 19.20170021057129, "learning_rate": 4.428571428571428e-06, "loss": 3.6634, "step": 4566 }, { "epoch": 2.0798, "grad_norm": 2.1338698863983154, "learning_rate": 4.418367346938776e-06, "loss": 0.2085, "step": 4567 }, { "epoch": 2.08, "grad_norm": 0.9909071922302246, "learning_rate": 4.408163265306122e-06, "loss": 0.0245, "step": 4568 }, { "epoch": 2.0802, "grad_norm": 5.628470420837402, "learning_rate": 4.39795918367347e-06, "loss": 1.6193, "step": 4569 }, { "epoch": 2.0804, "grad_norm": 3.133054494857788, "learning_rate": 4.3877551020408165e-06, "loss": 0.2746, "step": 4570 }, { "epoch": 2.0806, "grad_norm": 12.851790428161621, "learning_rate": 4.377551020408163e-06, "loss": 5.1795, "step": 4571 }, { "epoch": 2.0808, "grad_norm": 4.939739227294922, "learning_rate": 4.367346938775511e-06, "loss": 0.5878, "step": 4572 }, { "epoch": 2.081, "grad_norm": 8.724700927734375, "learning_rate": 4.357142857142857e-06, "loss": 1.3007, "step": 4573 }, { "epoch": 2.0812, "grad_norm": 3.8831076622009277, "learning_rate": 4.346938775510205e-06, "loss": 0.443, "step": 4574 }, { "epoch": 2.0814, "grad_norm": 2.3261559009552, "learning_rate": 4.336734693877551e-06, "loss": 0.244, "step": 4575 }, { "epoch": 2.0816, "grad_norm": 1.6815943717956543, "learning_rate": 4.326530612244898e-06, "loss": 0.1314, "step": 4576 }, { "epoch": 2.0818, "grad_norm": 3.9276928901672363, "learning_rate": 4.316326530612245e-06, "loss": 0.4005, "step": 4577 }, { "epoch": 2.082, "grad_norm": 5.823033809661865, "learning_rate": 4.306122448979592e-06, "loss": 1.0409, "step": 4578 }, { "epoch": 2.0822, "grad_norm": 5.226304054260254, "learning_rate": 4.295918367346939e-06, "loss": 0.6593, "step": 4579 }, { "epoch": 2.0824, "grad_norm": 53.483036041259766, "learning_rate": 4.285714285714286e-06, "loss": 4.1761, "step": 4580 }, { "epoch": 2.0826000000000002, "grad_norm": 3.3650310039520264, "learning_rate": 4.275510204081633e-06, "loss": 1.4406, "step": 4581 }, { "epoch": 2.0828, "grad_norm": 16.876192092895508, "learning_rate": 4.26530612244898e-06, "loss": 3.6484, "step": 4582 }, { "epoch": 2.083, "grad_norm": 7.014874458312988, "learning_rate": 4.255102040816326e-06, "loss": 5.0853, "step": 4583 }, { "epoch": 2.0832, "grad_norm": 3.952516555786133, "learning_rate": 4.244897959183674e-06, "loss": 0.2555, "step": 4584 }, { "epoch": 2.0834, "grad_norm": 5.884845733642578, "learning_rate": 4.23469387755102e-06, "loss": 0.8335, "step": 4585 }, { "epoch": 2.0836, "grad_norm": 5.378721714019775, "learning_rate": 4.224489795918368e-06, "loss": 0.2679, "step": 4586 }, { "epoch": 2.0838, "grad_norm": 4.5112385749816895, "learning_rate": 4.2142857142857145e-06, "loss": 0.5756, "step": 4587 }, { "epoch": 2.084, "grad_norm": 1.023730754852295, "learning_rate": 4.204081632653061e-06, "loss": 0.0301, "step": 4588 }, { "epoch": 2.0842, "grad_norm": 4.677689552307129, "learning_rate": 4.193877551020409e-06, "loss": 0.5565, "step": 4589 }, { "epoch": 2.0844, "grad_norm": 2.3065829277038574, "learning_rate": 4.183673469387755e-06, "loss": 0.2316, "step": 4590 }, { "epoch": 2.0846, "grad_norm": 3.4042725563049316, "learning_rate": 4.173469387755103e-06, "loss": 1.5457, "step": 4591 }, { "epoch": 2.0848, "grad_norm": 3.8823788166046143, "learning_rate": 4.163265306122449e-06, "loss": 0.7111, "step": 4592 }, { "epoch": 2.085, "grad_norm": 1.4060784578323364, "learning_rate": 4.153061224489796e-06, "loss": 0.0451, "step": 4593 }, { "epoch": 2.0852, "grad_norm": 3.823343515396118, "learning_rate": 4.142857142857143e-06, "loss": 1.6003, "step": 4594 }, { "epoch": 2.0854, "grad_norm": 2.5086982250213623, "learning_rate": 4.13265306122449e-06, "loss": 0.1459, "step": 4595 }, { "epoch": 2.0856, "grad_norm": 3.642223834991455, "learning_rate": 4.122448979591837e-06, "loss": 0.3265, "step": 4596 }, { "epoch": 2.0858, "grad_norm": 6.024643421173096, "learning_rate": 4.112244897959184e-06, "loss": 0.7844, "step": 4597 }, { "epoch": 2.086, "grad_norm": 3.7316811084747314, "learning_rate": 4.102040816326531e-06, "loss": 0.4592, "step": 4598 }, { "epoch": 2.0862, "grad_norm": 10.370810508728027, "learning_rate": 4.0918367346938775e-06, "loss": 2.3321, "step": 4599 }, { "epoch": 2.0864, "grad_norm": 10.592586517333984, "learning_rate": 4.081632653061224e-06, "loss": 1.8294, "step": 4600 }, { "epoch": 2.0866, "grad_norm": 10.37260627746582, "learning_rate": 4.071428571428572e-06, "loss": 2.6426, "step": 4601 }, { "epoch": 2.0868, "grad_norm": 7.045802593231201, "learning_rate": 4.061224489795918e-06, "loss": 2.4168, "step": 4602 }, { "epoch": 2.087, "grad_norm": 2.7917001247406006, "learning_rate": 4.051020408163266e-06, "loss": 0.4655, "step": 4603 }, { "epoch": 2.0872, "grad_norm": 8.666906356811523, "learning_rate": 4.040816326530612e-06, "loss": 3.4238, "step": 4604 }, { "epoch": 2.0874, "grad_norm": 2.353450298309326, "learning_rate": 4.030612244897959e-06, "loss": 0.4478, "step": 4605 }, { "epoch": 2.0876, "grad_norm": 3.345656156539917, "learning_rate": 4.0204081632653065e-06, "loss": 0.801, "step": 4606 }, { "epoch": 2.0878, "grad_norm": 3.443507432937622, "learning_rate": 4.010204081632653e-06, "loss": 1.5316, "step": 4607 }, { "epoch": 2.088, "grad_norm": 4.042265892028809, "learning_rate": 4.000000000000001e-06, "loss": 1.6096, "step": 4608 }, { "epoch": 2.0882, "grad_norm": 2.2342569828033447, "learning_rate": 3.989795918367347e-06, "loss": 0.3755, "step": 4609 }, { "epoch": 2.0884, "grad_norm": 4.226164817810059, "learning_rate": 3.979591836734694e-06, "loss": 0.5237, "step": 4610 }, { "epoch": 2.0886, "grad_norm": 2.139458417892456, "learning_rate": 3.969387755102041e-06, "loss": 0.2093, "step": 4611 }, { "epoch": 2.0888, "grad_norm": 2.6331608295440674, "learning_rate": 3.959183673469388e-06, "loss": 0.2545, "step": 4612 }, { "epoch": 2.089, "grad_norm": 1.0588756799697876, "learning_rate": 3.948979591836735e-06, "loss": 0.0267, "step": 4613 }, { "epoch": 2.0892, "grad_norm": 3.786996603012085, "learning_rate": 3.938775510204082e-06, "loss": 0.4619, "step": 4614 }, { "epoch": 2.0894, "grad_norm": 2.729401111602783, "learning_rate": 3.928571428571429e-06, "loss": 0.2752, "step": 4615 }, { "epoch": 2.0896, "grad_norm": 4.36923885345459, "learning_rate": 3.9183673469387755e-06, "loss": 0.3585, "step": 4616 }, { "epoch": 2.0898, "grad_norm": 8.847342491149902, "learning_rate": 3.908163265306122e-06, "loss": 0.8233, "step": 4617 }, { "epoch": 2.09, "grad_norm": 7.369173526763916, "learning_rate": 3.89795918367347e-06, "loss": 1.1151, "step": 4618 }, { "epoch": 2.0902, "grad_norm": 20.38437843322754, "learning_rate": 3.887755102040816e-06, "loss": 2.2948, "step": 4619 }, { "epoch": 2.0904, "grad_norm": 2.462642192840576, "learning_rate": 3.877551020408164e-06, "loss": 0.2833, "step": 4620 }, { "epoch": 2.0906, "grad_norm": 9.721959114074707, "learning_rate": 3.86734693877551e-06, "loss": 2.6022, "step": 4621 }, { "epoch": 2.0908, "grad_norm": 5.129932880401611, "learning_rate": 3.857142857142857e-06, "loss": 0.8727, "step": 4622 }, { "epoch": 2.091, "grad_norm": 2.118366003036499, "learning_rate": 3.8469387755102045e-06, "loss": 0.1915, "step": 4623 }, { "epoch": 2.0912, "grad_norm": 8.516611099243164, "learning_rate": 3.836734693877551e-06, "loss": 0.7175, "step": 4624 }, { "epoch": 2.0914, "grad_norm": 1.7975201606750488, "learning_rate": 3.826530612244899e-06, "loss": 0.2119, "step": 4625 }, { "epoch": 2.0916, "grad_norm": 2.179056167602539, "learning_rate": 3.816326530612245e-06, "loss": 0.1473, "step": 4626 }, { "epoch": 2.0918, "grad_norm": 3.825058698654175, "learning_rate": 3.806122448979592e-06, "loss": 0.2241, "step": 4627 }, { "epoch": 2.092, "grad_norm": 3.3491287231445312, "learning_rate": 3.7959183673469385e-06, "loss": 0.3319, "step": 4628 }, { "epoch": 2.0922, "grad_norm": 8.091402053833008, "learning_rate": 3.785714285714286e-06, "loss": 1.0905, "step": 4629 }, { "epoch": 2.0924, "grad_norm": 0.9689975380897522, "learning_rate": 3.775510204081633e-06, "loss": 0.0246, "step": 4630 }, { "epoch": 2.0926, "grad_norm": 1.3830915689468384, "learning_rate": 3.7653061224489797e-06, "loss": 0.0428, "step": 4631 }, { "epoch": 2.0928, "grad_norm": 2.716122627258301, "learning_rate": 3.7551020408163268e-06, "loss": 0.5996, "step": 4632 }, { "epoch": 2.093, "grad_norm": 3.4472105503082275, "learning_rate": 3.7448979591836734e-06, "loss": 0.1459, "step": 4633 }, { "epoch": 2.0932, "grad_norm": 4.0314788818359375, "learning_rate": 3.73469387755102e-06, "loss": 1.5885, "step": 4634 }, { "epoch": 2.0934, "grad_norm": 8.527509689331055, "learning_rate": 3.724489795918368e-06, "loss": 0.8418, "step": 4635 }, { "epoch": 2.0936, "grad_norm": 5.430739402770996, "learning_rate": 3.7142857142857146e-06, "loss": 0.7711, "step": 4636 }, { "epoch": 2.0938, "grad_norm": 4.720272064208984, "learning_rate": 3.7040816326530612e-06, "loss": 0.7726, "step": 4637 }, { "epoch": 2.094, "grad_norm": 3.743134021759033, "learning_rate": 3.6938775510204083e-06, "loss": 0.626, "step": 4638 }, { "epoch": 2.0942, "grad_norm": 1.433454155921936, "learning_rate": 3.683673469387755e-06, "loss": 0.105, "step": 4639 }, { "epoch": 2.0944, "grad_norm": 1.5286427736282349, "learning_rate": 3.6734693877551024e-06, "loss": 0.0555, "step": 4640 }, { "epoch": 2.0946, "grad_norm": 2.7598135471343994, "learning_rate": 3.6632653061224495e-06, "loss": 0.4154, "step": 4641 }, { "epoch": 2.0948, "grad_norm": 1.687098741531372, "learning_rate": 3.653061224489796e-06, "loss": 0.0738, "step": 4642 }, { "epoch": 2.095, "grad_norm": 5.211580276489258, "learning_rate": 3.642857142857143e-06, "loss": 1.4158, "step": 4643 }, { "epoch": 2.0952, "grad_norm": 7.764120578765869, "learning_rate": 3.63265306122449e-06, "loss": 0.5652, "step": 4644 }, { "epoch": 2.0954, "grad_norm": 0.9991572499275208, "learning_rate": 3.6224489795918365e-06, "loss": 0.0194, "step": 4645 }, { "epoch": 2.0956, "grad_norm": 15.356642723083496, "learning_rate": 3.612244897959184e-06, "loss": 1.9036, "step": 4646 }, { "epoch": 2.0958, "grad_norm": 2.4500508308410645, "learning_rate": 3.602040816326531e-06, "loss": 0.1955, "step": 4647 }, { "epoch": 2.096, "grad_norm": 3.684885025024414, "learning_rate": 3.5918367346938777e-06, "loss": 0.3032, "step": 4648 }, { "epoch": 2.0962, "grad_norm": 3.018481492996216, "learning_rate": 3.5816326530612247e-06, "loss": 0.3123, "step": 4649 }, { "epoch": 2.0964, "grad_norm": 5.848809719085693, "learning_rate": 3.5714285714285714e-06, "loss": 1.9129, "step": 4650 }, { "epoch": 2.0966, "grad_norm": 1.8316609859466553, "learning_rate": 3.5612244897959184e-06, "loss": 0.1484, "step": 4651 }, { "epoch": 2.0968, "grad_norm": 6.228904724121094, "learning_rate": 3.551020408163266e-06, "loss": 5.0758, "step": 4652 }, { "epoch": 2.097, "grad_norm": 3.67339825630188, "learning_rate": 3.5408163265306125e-06, "loss": 0.6337, "step": 4653 }, { "epoch": 2.0972, "grad_norm": 8.086236953735352, "learning_rate": 3.530612244897959e-06, "loss": 0.8362, "step": 4654 }, { "epoch": 2.0974, "grad_norm": 4.554009437561035, "learning_rate": 3.5204081632653062e-06, "loss": 0.6495, "step": 4655 }, { "epoch": 2.0976, "grad_norm": 18.859203338623047, "learning_rate": 3.510204081632653e-06, "loss": 2.0373, "step": 4656 }, { "epoch": 2.0978, "grad_norm": 3.6999504566192627, "learning_rate": 3.5000000000000004e-06, "loss": 1.0081, "step": 4657 }, { "epoch": 2.098, "grad_norm": 2.6290180683135986, "learning_rate": 3.4897959183673474e-06, "loss": 0.594, "step": 4658 }, { "epoch": 2.0982, "grad_norm": 4.123224258422852, "learning_rate": 3.479591836734694e-06, "loss": 2.1693, "step": 4659 }, { "epoch": 2.0984, "grad_norm": 3.309077024459839, "learning_rate": 3.469387755102041e-06, "loss": 1.3687, "step": 4660 }, { "epoch": 2.0986, "grad_norm": 8.282438278198242, "learning_rate": 3.4591836734693878e-06, "loss": 2.6114, "step": 4661 }, { "epoch": 2.0987999999999998, "grad_norm": 2.382996082305908, "learning_rate": 3.4489795918367344e-06, "loss": 0.2528, "step": 4662 }, { "epoch": 2.099, "grad_norm": 11.994235038757324, "learning_rate": 3.438775510204082e-06, "loss": 3.0576, "step": 4663 }, { "epoch": 2.0992, "grad_norm": 2.2513766288757324, "learning_rate": 3.428571428571429e-06, "loss": 0.0989, "step": 4664 }, { "epoch": 2.0994, "grad_norm": 1.784716248512268, "learning_rate": 3.4183673469387756e-06, "loss": 0.2105, "step": 4665 }, { "epoch": 2.0996, "grad_norm": 3.966416120529175, "learning_rate": 3.4081632653061227e-06, "loss": 0.4555, "step": 4666 }, { "epoch": 2.0998, "grad_norm": 2.5667736530303955, "learning_rate": 3.3979591836734693e-06, "loss": 0.2572, "step": 4667 }, { "epoch": 2.1, "grad_norm": 1.7077491283416748, "learning_rate": 3.3877551020408164e-06, "loss": 0.1606, "step": 4668 }, { "epoch": 2.1002, "grad_norm": 3.992199659347534, "learning_rate": 3.377551020408164e-06, "loss": 0.479, "step": 4669 }, { "epoch": 2.1004, "grad_norm": 2.081674575805664, "learning_rate": 3.3673469387755105e-06, "loss": 0.1013, "step": 4670 }, { "epoch": 2.1006, "grad_norm": 2.2576310634613037, "learning_rate": 3.357142857142857e-06, "loss": 0.2264, "step": 4671 }, { "epoch": 2.1008, "grad_norm": 4.365758895874023, "learning_rate": 3.346938775510204e-06, "loss": 0.5402, "step": 4672 }, { "epoch": 2.101, "grad_norm": 4.182484149932861, "learning_rate": 3.336734693877551e-06, "loss": 0.236, "step": 4673 }, { "epoch": 2.1012, "grad_norm": 1.898068904876709, "learning_rate": 3.3265306122448983e-06, "loss": 0.0834, "step": 4674 }, { "epoch": 2.1014, "grad_norm": 2.2708635330200195, "learning_rate": 3.3163265306122454e-06, "loss": 0.1356, "step": 4675 }, { "epoch": 2.1016, "grad_norm": 6.384464263916016, "learning_rate": 3.306122448979592e-06, "loss": 1.4563, "step": 4676 }, { "epoch": 2.1018, "grad_norm": 4.844964981079102, "learning_rate": 3.295918367346939e-06, "loss": 0.678, "step": 4677 }, { "epoch": 2.102, "grad_norm": 9.251545906066895, "learning_rate": 3.2857142857142857e-06, "loss": 3.3184, "step": 4678 }, { "epoch": 2.1022, "grad_norm": 3.1075327396392822, "learning_rate": 3.2755102040816324e-06, "loss": 0.1363, "step": 4679 }, { "epoch": 2.1024, "grad_norm": 5.791533946990967, "learning_rate": 3.26530612244898e-06, "loss": 1.0356, "step": 4680 }, { "epoch": 2.1026, "grad_norm": 7.99643611907959, "learning_rate": 3.255102040816327e-06, "loss": 0.6833, "step": 4681 }, { "epoch": 2.1028000000000002, "grad_norm": 9.739289283752441, "learning_rate": 3.2448979591836735e-06, "loss": 2.835, "step": 4682 }, { "epoch": 2.103, "grad_norm": 1.0964337587356567, "learning_rate": 3.2346938775510206e-06, "loss": 0.0257, "step": 4683 }, { "epoch": 2.1032, "grad_norm": 1.8878360986709595, "learning_rate": 3.2244897959183672e-06, "loss": 0.1182, "step": 4684 }, { "epoch": 2.1034, "grad_norm": 2.076249837875366, "learning_rate": 3.2142857142857143e-06, "loss": 0.1854, "step": 4685 }, { "epoch": 2.1036, "grad_norm": 1.6759300231933594, "learning_rate": 3.204081632653062e-06, "loss": 0.1459, "step": 4686 }, { "epoch": 2.1038, "grad_norm": 1.6163229942321777, "learning_rate": 3.1938775510204084e-06, "loss": 0.1178, "step": 4687 }, { "epoch": 2.104, "grad_norm": 0.738377571105957, "learning_rate": 3.183673469387755e-06, "loss": 0.0223, "step": 4688 }, { "epoch": 2.1042, "grad_norm": 6.086853981018066, "learning_rate": 3.173469387755102e-06, "loss": 0.5203, "step": 4689 }, { "epoch": 2.1044, "grad_norm": 1.979962706565857, "learning_rate": 3.1632653061224488e-06, "loss": 0.3415, "step": 4690 }, { "epoch": 2.1046, "grad_norm": 9.70959186553955, "learning_rate": 3.1530612244897963e-06, "loss": 1.558, "step": 4691 }, { "epoch": 2.1048, "grad_norm": 2.2602646350860596, "learning_rate": 3.1428571428571433e-06, "loss": 0.2151, "step": 4692 }, { "epoch": 2.105, "grad_norm": 3.5727615356445312, "learning_rate": 3.13265306122449e-06, "loss": 1.6043, "step": 4693 }, { "epoch": 2.1052, "grad_norm": 5.286078929901123, "learning_rate": 3.122448979591837e-06, "loss": 0.828, "step": 4694 }, { "epoch": 2.1054, "grad_norm": 1.0418708324432373, "learning_rate": 3.1122448979591837e-06, "loss": 0.0245, "step": 4695 }, { "epoch": 2.1056, "grad_norm": 10.79141902923584, "learning_rate": 3.1020408163265307e-06, "loss": 2.8219, "step": 4696 }, { "epoch": 2.1058, "grad_norm": 5.624372959136963, "learning_rate": 3.0918367346938778e-06, "loss": 0.3448, "step": 4697 }, { "epoch": 2.106, "grad_norm": 3.7493929862976074, "learning_rate": 3.0816326530612244e-06, "loss": 1.5757, "step": 4698 }, { "epoch": 2.1062, "grad_norm": 7.292983055114746, "learning_rate": 3.0714285714285715e-06, "loss": 1.0822, "step": 4699 }, { "epoch": 2.1064, "grad_norm": 6.196479320526123, "learning_rate": 3.0612244897959185e-06, "loss": 5.0199, "step": 4700 }, { "epoch": 2.1066, "grad_norm": 1.9192054271697998, "learning_rate": 3.0510204081632656e-06, "loss": 0.1834, "step": 4701 }, { "epoch": 2.1068, "grad_norm": 4.074975490570068, "learning_rate": 3.0408163265306122e-06, "loss": 0.5157, "step": 4702 }, { "epoch": 2.107, "grad_norm": 1.673568606376648, "learning_rate": 3.0306122448979593e-06, "loss": 0.0709, "step": 4703 }, { "epoch": 2.1072, "grad_norm": 3.941925048828125, "learning_rate": 3.0204081632653064e-06, "loss": 0.125, "step": 4704 }, { "epoch": 2.1074, "grad_norm": 9.043163299560547, "learning_rate": 3.010204081632653e-06, "loss": 0.4157, "step": 4705 }, { "epoch": 2.1076, "grad_norm": 1.8842334747314453, "learning_rate": 3e-06, "loss": 0.134, "step": 4706 }, { "epoch": 2.1078, "grad_norm": 14.688392639160156, "learning_rate": 2.989795918367347e-06, "loss": 1.8615, "step": 4707 }, { "epoch": 2.108, "grad_norm": 1.6997333765029907, "learning_rate": 2.979591836734694e-06, "loss": 0.115, "step": 4708 }, { "epoch": 2.1082, "grad_norm": 5.09561014175415, "learning_rate": 2.969387755102041e-06, "loss": 0.8584, "step": 4709 }, { "epoch": 2.1084, "grad_norm": 1.824609637260437, "learning_rate": 2.959183673469388e-06, "loss": 0.1815, "step": 4710 }, { "epoch": 2.1086, "grad_norm": 12.072440147399902, "learning_rate": 2.948979591836735e-06, "loss": 3.6057, "step": 4711 }, { "epoch": 2.1088, "grad_norm": 1.943498969078064, "learning_rate": 2.9387755102040816e-06, "loss": 0.0558, "step": 4712 }, { "epoch": 2.109, "grad_norm": 10.319755554199219, "learning_rate": 2.9285714285714287e-06, "loss": 1.573, "step": 4713 }, { "epoch": 2.1092, "grad_norm": 1.8012566566467285, "learning_rate": 2.9183673469387757e-06, "loss": 0.1308, "step": 4714 }, { "epoch": 2.1094, "grad_norm": 1.4766422510147095, "learning_rate": 2.9081632653061224e-06, "loss": 0.1336, "step": 4715 }, { "epoch": 2.1096, "grad_norm": 1.7192660570144653, "learning_rate": 2.8979591836734694e-06, "loss": 0.1554, "step": 4716 }, { "epoch": 2.1098, "grad_norm": 2.284416913986206, "learning_rate": 2.8877551020408165e-06, "loss": 0.2452, "step": 4717 }, { "epoch": 2.11, "grad_norm": 2.098954200744629, "learning_rate": 2.8775510204081636e-06, "loss": 0.1814, "step": 4718 }, { "epoch": 2.1102, "grad_norm": 4.4768967628479, "learning_rate": 2.86734693877551e-06, "loss": 0.6873, "step": 4719 }, { "epoch": 2.1104, "grad_norm": 4.6320109367370605, "learning_rate": 2.8571428571428573e-06, "loss": 0.9844, "step": 4720 }, { "epoch": 2.1106, "grad_norm": 7.1224365234375, "learning_rate": 2.8469387755102043e-06, "loss": 0.7156, "step": 4721 }, { "epoch": 2.1108, "grad_norm": 9.049817085266113, "learning_rate": 2.8367346938775514e-06, "loss": 3.3146, "step": 4722 }, { "epoch": 2.111, "grad_norm": 2.0975863933563232, "learning_rate": 2.826530612244898e-06, "loss": 0.4132, "step": 4723 }, { "epoch": 2.1112, "grad_norm": 2.4252097606658936, "learning_rate": 2.816326530612245e-06, "loss": 0.2678, "step": 4724 }, { "epoch": 2.1114, "grad_norm": 3.6085386276245117, "learning_rate": 2.806122448979592e-06, "loss": 1.6607, "step": 4725 }, { "epoch": 2.1116, "grad_norm": 3.3134396076202393, "learning_rate": 2.7959183673469388e-06, "loss": 0.5871, "step": 4726 }, { "epoch": 2.1118, "grad_norm": 1.4712363481521606, "learning_rate": 2.785714285714286e-06, "loss": 0.0912, "step": 4727 }, { "epoch": 2.112, "grad_norm": 6.200575351715088, "learning_rate": 2.775510204081633e-06, "loss": 0.9258, "step": 4728 }, { "epoch": 2.1122, "grad_norm": 1.3659968376159668, "learning_rate": 2.7653061224489795e-06, "loss": 0.0313, "step": 4729 }, { "epoch": 2.1124, "grad_norm": 10.170721054077148, "learning_rate": 2.7551020408163266e-06, "loss": 2.3293, "step": 4730 }, { "epoch": 2.1126, "grad_norm": 7.839609622955322, "learning_rate": 2.7448979591836737e-06, "loss": 0.6248, "step": 4731 }, { "epoch": 2.1128, "grad_norm": 4.498930931091309, "learning_rate": 2.7346938775510203e-06, "loss": 0.562, "step": 4732 }, { "epoch": 2.113, "grad_norm": 18.936641693115234, "learning_rate": 2.7244897959183674e-06, "loss": 1.9553, "step": 4733 }, { "epoch": 2.1132, "grad_norm": 2.0487070083618164, "learning_rate": 2.7142857142857144e-06, "loss": 0.1837, "step": 4734 }, { "epoch": 2.1134, "grad_norm": 3.12445330619812, "learning_rate": 2.7040816326530615e-06, "loss": 0.4411, "step": 4735 }, { "epoch": 2.1136, "grad_norm": 1.5472902059555054, "learning_rate": 2.693877551020408e-06, "loss": 0.0534, "step": 4736 }, { "epoch": 2.1138, "grad_norm": 4.787519931793213, "learning_rate": 2.683673469387755e-06, "loss": 1.3217, "step": 4737 }, { "epoch": 2.114, "grad_norm": 1.6154824495315552, "learning_rate": 2.6734693877551023e-06, "loss": 0.0597, "step": 4738 }, { "epoch": 2.1142, "grad_norm": 2.8686463832855225, "learning_rate": 2.6632653061224493e-06, "loss": 0.304, "step": 4739 }, { "epoch": 2.1144, "grad_norm": 5.769339084625244, "learning_rate": 2.653061224489796e-06, "loss": 0.9847, "step": 4740 }, { "epoch": 2.1146, "grad_norm": 6.174430847167969, "learning_rate": 2.642857142857143e-06, "loss": 1.4963, "step": 4741 }, { "epoch": 2.1148, "grad_norm": 11.479657173156738, "learning_rate": 2.63265306122449e-06, "loss": 3.2436, "step": 4742 }, { "epoch": 2.115, "grad_norm": 5.433708190917969, "learning_rate": 2.6224489795918367e-06, "loss": 0.6161, "step": 4743 }, { "epoch": 2.1152, "grad_norm": 6.422898292541504, "learning_rate": 2.612244897959184e-06, "loss": 0.6999, "step": 4744 }, { "epoch": 2.1154, "grad_norm": 1.6894084215164185, "learning_rate": 2.602040816326531e-06, "loss": 0.0894, "step": 4745 }, { "epoch": 2.1156, "grad_norm": 5.903936862945557, "learning_rate": 2.5918367346938775e-06, "loss": 0.7027, "step": 4746 }, { "epoch": 2.1158, "grad_norm": 2.2160937786102295, "learning_rate": 2.5816326530612246e-06, "loss": 0.1945, "step": 4747 }, { "epoch": 2.116, "grad_norm": 8.383194923400879, "learning_rate": 2.5714285714285716e-06, "loss": 2.4187, "step": 4748 }, { "epoch": 2.1162, "grad_norm": 1.5930655002593994, "learning_rate": 2.5612244897959183e-06, "loss": 0.1469, "step": 4749 }, { "epoch": 2.1164, "grad_norm": 3.298297643661499, "learning_rate": 2.5510204081632653e-06, "loss": 0.5827, "step": 4750 }, { "epoch": 2.1166, "grad_norm": 2.2217905521392822, "learning_rate": 2.5408163265306124e-06, "loss": 0.4792, "step": 4751 }, { "epoch": 2.1168, "grad_norm": 2.5096020698547363, "learning_rate": 2.5306122448979594e-06, "loss": 0.267, "step": 4752 }, { "epoch": 2.117, "grad_norm": 3.6636083126068115, "learning_rate": 2.520408163265306e-06, "loss": 1.53, "step": 4753 }, { "epoch": 2.1172, "grad_norm": 3.336641788482666, "learning_rate": 2.510204081632653e-06, "loss": 0.5622, "step": 4754 }, { "epoch": 2.1174, "grad_norm": 8.178468704223633, "learning_rate": 2.5e-06, "loss": 1.1144, "step": 4755 }, { "epoch": 2.1176, "grad_norm": 1.8707865476608276, "learning_rate": 2.4897959183673473e-06, "loss": 0.1159, "step": 4756 }, { "epoch": 2.1178, "grad_norm": 10.267629623413086, "learning_rate": 2.479591836734694e-06, "loss": 2.3957, "step": 4757 }, { "epoch": 2.118, "grad_norm": 2.2508950233459473, "learning_rate": 2.469387755102041e-06, "loss": 0.1804, "step": 4758 }, { "epoch": 2.1182, "grad_norm": 2.4401466846466064, "learning_rate": 2.459183673469388e-06, "loss": 0.2665, "step": 4759 }, { "epoch": 2.1184, "grad_norm": 3.918395757675171, "learning_rate": 2.4489795918367347e-06, "loss": 0.1593, "step": 4760 }, { "epoch": 2.1186, "grad_norm": 0.7459062933921814, "learning_rate": 2.4387755102040817e-06, "loss": 0.0159, "step": 4761 }, { "epoch": 2.1188, "grad_norm": 1.8292782306671143, "learning_rate": 2.428571428571429e-06, "loss": 0.1906, "step": 4762 }, { "epoch": 2.1189999999999998, "grad_norm": 2.560241460800171, "learning_rate": 2.4183673469387754e-06, "loss": 0.2444, "step": 4763 }, { "epoch": 2.1192, "grad_norm": 3.5728046894073486, "learning_rate": 2.4081632653061225e-06, "loss": 0.2309, "step": 4764 }, { "epoch": 2.1194, "grad_norm": 5.053526401519775, "learning_rate": 2.3979591836734696e-06, "loss": 0.9814, "step": 4765 }, { "epoch": 2.1196, "grad_norm": 5.086238384246826, "learning_rate": 2.387755102040816e-06, "loss": 0.3073, "step": 4766 }, { "epoch": 2.1198, "grad_norm": 5.67470121383667, "learning_rate": 2.3775510204081633e-06, "loss": 2.2228, "step": 4767 }, { "epoch": 2.12, "grad_norm": 1.1746867895126343, "learning_rate": 2.3673469387755103e-06, "loss": 0.0368, "step": 4768 }, { "epoch": 2.1202, "grad_norm": 9.472219467163086, "learning_rate": 2.3571428571428574e-06, "loss": 3.4311, "step": 4769 }, { "epoch": 2.1204, "grad_norm": 1.9404643774032593, "learning_rate": 2.346938775510204e-06, "loss": 0.1586, "step": 4770 }, { "epoch": 2.1206, "grad_norm": 6.1719255447387695, "learning_rate": 2.336734693877551e-06, "loss": 1.236, "step": 4771 }, { "epoch": 2.1208, "grad_norm": 3.216226816177368, "learning_rate": 2.326530612244898e-06, "loss": 0.577, "step": 4772 }, { "epoch": 2.121, "grad_norm": 12.40564250946045, "learning_rate": 2.316326530612245e-06, "loss": 3.3144, "step": 4773 }, { "epoch": 2.1212, "grad_norm": 2.8573033809661865, "learning_rate": 2.306122448979592e-06, "loss": 0.1612, "step": 4774 }, { "epoch": 2.1214, "grad_norm": 12.39941120147705, "learning_rate": 2.295918367346939e-06, "loss": 2.1376, "step": 4775 }, { "epoch": 2.1216, "grad_norm": 2.2197749614715576, "learning_rate": 2.285714285714286e-06, "loss": 0.1219, "step": 4776 }, { "epoch": 2.1218, "grad_norm": 6.356390953063965, "learning_rate": 2.2755102040816326e-06, "loss": 2.936, "step": 4777 }, { "epoch": 2.122, "grad_norm": 6.300841331481934, "learning_rate": 2.2653061224489797e-06, "loss": 0.7233, "step": 4778 }, { "epoch": 2.1222, "grad_norm": 7.246288299560547, "learning_rate": 2.2551020408163267e-06, "loss": 0.3583, "step": 4779 }, { "epoch": 2.1224, "grad_norm": 8.743884086608887, "learning_rate": 2.2448979591836734e-06, "loss": 0.7548, "step": 4780 }, { "epoch": 2.1226, "grad_norm": 11.135361671447754, "learning_rate": 2.2346938775510204e-06, "loss": 1.4149, "step": 4781 }, { "epoch": 2.1228, "grad_norm": 1.941303014755249, "learning_rate": 2.2244897959183675e-06, "loss": 0.1728, "step": 4782 }, { "epoch": 2.123, "grad_norm": 8.995508193969727, "learning_rate": 2.214285714285714e-06, "loss": 0.7435, "step": 4783 }, { "epoch": 2.1232, "grad_norm": 2.2888870239257812, "learning_rate": 2.204081632653061e-06, "loss": 0.2329, "step": 4784 }, { "epoch": 2.1234, "grad_norm": 2.0693838596343994, "learning_rate": 2.1938775510204083e-06, "loss": 0.2532, "step": 4785 }, { "epoch": 2.1236, "grad_norm": 7.136364936828613, "learning_rate": 2.1836734693877553e-06, "loss": 0.1922, "step": 4786 }, { "epoch": 2.1238, "grad_norm": 5.488953590393066, "learning_rate": 2.1734693877551024e-06, "loss": 0.6459, "step": 4787 }, { "epoch": 2.124, "grad_norm": 2.1585514545440674, "learning_rate": 2.163265306122449e-06, "loss": 0.1159, "step": 4788 }, { "epoch": 2.1242, "grad_norm": 2.8502037525177, "learning_rate": 2.153061224489796e-06, "loss": 0.4822, "step": 4789 }, { "epoch": 2.1244, "grad_norm": 1.2102391719818115, "learning_rate": 2.142857142857143e-06, "loss": 0.0376, "step": 4790 }, { "epoch": 2.1246, "grad_norm": 2.719905376434326, "learning_rate": 2.13265306122449e-06, "loss": 0.1681, "step": 4791 }, { "epoch": 2.1248, "grad_norm": 3.575418472290039, "learning_rate": 2.122448979591837e-06, "loss": 0.5159, "step": 4792 }, { "epoch": 2.125, "grad_norm": 2.9629266262054443, "learning_rate": 2.112244897959184e-06, "loss": 0.2813, "step": 4793 }, { "epoch": 2.1252, "grad_norm": 5.93533182144165, "learning_rate": 2.1020408163265306e-06, "loss": 1.4368, "step": 4794 }, { "epoch": 2.1254, "grad_norm": 2.384918451309204, "learning_rate": 2.0918367346938776e-06, "loss": 0.143, "step": 4795 }, { "epoch": 2.1256, "grad_norm": 8.767003059387207, "learning_rate": 2.0816326530612247e-06, "loss": 1.633, "step": 4796 }, { "epoch": 2.1258, "grad_norm": 7.807423114776611, "learning_rate": 2.0714285714285713e-06, "loss": 1.032, "step": 4797 }, { "epoch": 2.126, "grad_norm": 10.32408618927002, "learning_rate": 2.0612244897959184e-06, "loss": 3.4824, "step": 4798 }, { "epoch": 2.1262, "grad_norm": 12.379114151000977, "learning_rate": 2.0510204081632654e-06, "loss": 1.1809, "step": 4799 }, { "epoch": 2.1264, "grad_norm": 11.27355670928955, "learning_rate": 2.040816326530612e-06, "loss": 0.5557, "step": 4800 }, { "epoch": 2.1266, "grad_norm": 1.92478609085083, "learning_rate": 2.030612244897959e-06, "loss": 0.1746, "step": 4801 }, { "epoch": 2.1268, "grad_norm": 6.682904243469238, "learning_rate": 2.020408163265306e-06, "loss": 0.8163, "step": 4802 }, { "epoch": 2.127, "grad_norm": 1.5593671798706055, "learning_rate": 2.0102040816326533e-06, "loss": 0.0997, "step": 4803 }, { "epoch": 2.1272, "grad_norm": 2.9079713821411133, "learning_rate": 2.0000000000000003e-06, "loss": 0.275, "step": 4804 }, { "epoch": 2.1274, "grad_norm": 2.176316976547241, "learning_rate": 1.989795918367347e-06, "loss": 0.2564, "step": 4805 }, { "epoch": 2.1276, "grad_norm": 1.6099629402160645, "learning_rate": 1.979591836734694e-06, "loss": 0.1026, "step": 4806 }, { "epoch": 2.1278, "grad_norm": 2.784722089767456, "learning_rate": 1.969387755102041e-06, "loss": 0.5327, "step": 4807 }, { "epoch": 2.128, "grad_norm": 1.4035965204238892, "learning_rate": 1.9591836734693877e-06, "loss": 0.0564, "step": 4808 }, { "epoch": 2.1282, "grad_norm": 2.1346659660339355, "learning_rate": 1.948979591836735e-06, "loss": 0.129, "step": 4809 }, { "epoch": 2.1284, "grad_norm": 1.2497564554214478, "learning_rate": 1.938775510204082e-06, "loss": 0.0352, "step": 4810 }, { "epoch": 2.1286, "grad_norm": 1.9146955013275146, "learning_rate": 1.9285714285714285e-06, "loss": 0.1686, "step": 4811 }, { "epoch": 2.1288, "grad_norm": 8.626206398010254, "learning_rate": 1.9183673469387756e-06, "loss": 0.6249, "step": 4812 }, { "epoch": 2.129, "grad_norm": 6.26157808303833, "learning_rate": 1.9081632653061226e-06, "loss": 1.314, "step": 4813 }, { "epoch": 2.1292, "grad_norm": 16.438100814819336, "learning_rate": 1.8979591836734693e-06, "loss": 2.1826, "step": 4814 }, { "epoch": 2.1294, "grad_norm": 9.846821784973145, "learning_rate": 1.8877551020408165e-06, "loss": 3.4003, "step": 4815 }, { "epoch": 2.1296, "grad_norm": 5.37025260925293, "learning_rate": 1.8775510204081634e-06, "loss": 0.7318, "step": 4816 }, { "epoch": 2.1298, "grad_norm": 4.28413724899292, "learning_rate": 1.86734693877551e-06, "loss": 0.7001, "step": 4817 }, { "epoch": 2.13, "grad_norm": 1.656635046005249, "learning_rate": 1.8571428571428573e-06, "loss": 0.1194, "step": 4818 }, { "epoch": 2.1302, "grad_norm": 2.847740411758423, "learning_rate": 1.8469387755102042e-06, "loss": 0.4024, "step": 4819 }, { "epoch": 2.1304, "grad_norm": 2.651296615600586, "learning_rate": 1.8367346938775512e-06, "loss": 0.4087, "step": 4820 }, { "epoch": 2.1306, "grad_norm": 5.253534317016602, "learning_rate": 1.826530612244898e-06, "loss": 1.5563, "step": 4821 }, { "epoch": 2.1308, "grad_norm": 5.369923114776611, "learning_rate": 1.816326530612245e-06, "loss": 1.2941, "step": 4822 }, { "epoch": 2.1310000000000002, "grad_norm": 4.879589080810547, "learning_rate": 1.806122448979592e-06, "loss": 0.801, "step": 4823 }, { "epoch": 2.1312, "grad_norm": 3.045949935913086, "learning_rate": 1.7959183673469388e-06, "loss": 0.1729, "step": 4824 }, { "epoch": 2.1314, "grad_norm": 5.828512668609619, "learning_rate": 1.7857142857142857e-06, "loss": 0.6962, "step": 4825 }, { "epoch": 2.1316, "grad_norm": 6.206547737121582, "learning_rate": 1.775510204081633e-06, "loss": 0.9211, "step": 4826 }, { "epoch": 2.1318, "grad_norm": 1.1685166358947754, "learning_rate": 1.7653061224489796e-06, "loss": 0.0379, "step": 4827 }, { "epoch": 2.132, "grad_norm": 5.419491767883301, "learning_rate": 1.7551020408163264e-06, "loss": 1.3864, "step": 4828 }, { "epoch": 2.1322, "grad_norm": 7.400469779968262, "learning_rate": 1.7448979591836737e-06, "loss": 0.7807, "step": 4829 }, { "epoch": 2.1324, "grad_norm": 2.7783303260803223, "learning_rate": 1.7346938775510206e-06, "loss": 0.5662, "step": 4830 }, { "epoch": 2.1326, "grad_norm": 9.906639099121094, "learning_rate": 1.7244897959183672e-06, "loss": 2.34, "step": 4831 }, { "epoch": 2.1328, "grad_norm": 4.272519111633301, "learning_rate": 1.7142857142857145e-06, "loss": 1.7126, "step": 4832 }, { "epoch": 2.133, "grad_norm": 2.4000024795532227, "learning_rate": 1.7040816326530613e-06, "loss": 0.2753, "step": 4833 }, { "epoch": 2.1332, "grad_norm": 6.222007751464844, "learning_rate": 1.6938775510204082e-06, "loss": 0.6795, "step": 4834 }, { "epoch": 2.1334, "grad_norm": 3.4705138206481934, "learning_rate": 1.6836734693877552e-06, "loss": 0.6273, "step": 4835 }, { "epoch": 2.1336, "grad_norm": 1.5016515254974365, "learning_rate": 1.673469387755102e-06, "loss": 0.1128, "step": 4836 }, { "epoch": 2.1338, "grad_norm": 2.719780921936035, "learning_rate": 1.6632653061224492e-06, "loss": 0.5315, "step": 4837 }, { "epoch": 2.134, "grad_norm": 1.00386381149292, "learning_rate": 1.653061224489796e-06, "loss": 0.0354, "step": 4838 }, { "epoch": 2.1342, "grad_norm": 1.8480079174041748, "learning_rate": 1.6428571428571429e-06, "loss": 0.1274, "step": 4839 }, { "epoch": 2.1344, "grad_norm": 2.8276331424713135, "learning_rate": 1.63265306122449e-06, "loss": 0.2234, "step": 4840 }, { "epoch": 2.1346, "grad_norm": 1.1041818857192993, "learning_rate": 1.6224489795918368e-06, "loss": 0.0323, "step": 4841 }, { "epoch": 2.1348, "grad_norm": 11.41901969909668, "learning_rate": 1.6122448979591836e-06, "loss": 0.7364, "step": 4842 }, { "epoch": 2.135, "grad_norm": 2.2846317291259766, "learning_rate": 1.602040816326531e-06, "loss": 0.2361, "step": 4843 }, { "epoch": 2.1352, "grad_norm": 6.557143688201904, "learning_rate": 1.5918367346938775e-06, "loss": 0.273, "step": 4844 }, { "epoch": 2.1354, "grad_norm": 2.2313144207000732, "learning_rate": 1.5816326530612244e-06, "loss": 0.1999, "step": 4845 }, { "epoch": 2.1356, "grad_norm": 0.8703025579452515, "learning_rate": 1.5714285714285717e-06, "loss": 0.0131, "step": 4846 }, { "epoch": 2.1358, "grad_norm": 3.4729299545288086, "learning_rate": 1.5612244897959185e-06, "loss": 0.3988, "step": 4847 }, { "epoch": 2.136, "grad_norm": 2.4414684772491455, "learning_rate": 1.5510204081632654e-06, "loss": 0.1224, "step": 4848 }, { "epoch": 2.1362, "grad_norm": 5.00660514831543, "learning_rate": 1.5408163265306122e-06, "loss": 0.7455, "step": 4849 }, { "epoch": 2.1364, "grad_norm": 5.4602580070495605, "learning_rate": 1.5306122448979593e-06, "loss": 0.6444, "step": 4850 }, { "epoch": 2.1366, "grad_norm": 1.9978864192962646, "learning_rate": 1.5204081632653061e-06, "loss": 0.2164, "step": 4851 }, { "epoch": 2.1368, "grad_norm": 1.0184167623519897, "learning_rate": 1.5102040816326532e-06, "loss": 0.029, "step": 4852 }, { "epoch": 2.137, "grad_norm": 2.7992334365844727, "learning_rate": 1.5e-06, "loss": 0.4683, "step": 4853 }, { "epoch": 2.1372, "grad_norm": 2.517106533050537, "learning_rate": 1.489795918367347e-06, "loss": 0.1445, "step": 4854 }, { "epoch": 2.1374, "grad_norm": 1.8562113046646118, "learning_rate": 1.479591836734694e-06, "loss": 0.1674, "step": 4855 }, { "epoch": 2.1376, "grad_norm": 1.714816689491272, "learning_rate": 1.4693877551020408e-06, "loss": 0.1415, "step": 4856 }, { "epoch": 2.1378, "grad_norm": 0.8883129954338074, "learning_rate": 1.4591836734693879e-06, "loss": 0.0156, "step": 4857 }, { "epoch": 2.138, "grad_norm": 2.0923383235931396, "learning_rate": 1.4489795918367347e-06, "loss": 0.2406, "step": 4858 }, { "epoch": 2.1382, "grad_norm": 2.276726484298706, "learning_rate": 1.4387755102040818e-06, "loss": 0.2007, "step": 4859 }, { "epoch": 2.1384, "grad_norm": 2.1536126136779785, "learning_rate": 1.4285714285714286e-06, "loss": 0.2267, "step": 4860 }, { "epoch": 2.1386, "grad_norm": 4.831217288970947, "learning_rate": 1.4183673469387757e-06, "loss": 0.264, "step": 4861 }, { "epoch": 2.1388, "grad_norm": 0.8061777949333191, "learning_rate": 1.4081632653061225e-06, "loss": 0.0162, "step": 4862 }, { "epoch": 2.1390000000000002, "grad_norm": 2.4018330574035645, "learning_rate": 1.3979591836734694e-06, "loss": 0.2791, "step": 4863 }, { "epoch": 2.1391999999999998, "grad_norm": 3.1337363719940186, "learning_rate": 1.3877551020408165e-06, "loss": 0.3743, "step": 4864 }, { "epoch": 2.1394, "grad_norm": 6.414299488067627, "learning_rate": 1.3775510204081633e-06, "loss": 0.284, "step": 4865 }, { "epoch": 2.1396, "grad_norm": 7.778178691864014, "learning_rate": 1.3673469387755102e-06, "loss": 1.4856, "step": 4866 }, { "epoch": 2.1398, "grad_norm": 1.8873628377914429, "learning_rate": 1.3571428571428572e-06, "loss": 0.1629, "step": 4867 }, { "epoch": 2.14, "grad_norm": 9.496567726135254, "learning_rate": 1.346938775510204e-06, "loss": 3.3572, "step": 4868 }, { "epoch": 2.1402, "grad_norm": 1.016174077987671, "learning_rate": 1.3367346938775511e-06, "loss": 0.0316, "step": 4869 }, { "epoch": 2.1404, "grad_norm": 0.9162819981575012, "learning_rate": 1.326530612244898e-06, "loss": 0.0095, "step": 4870 }, { "epoch": 2.1406, "grad_norm": 1.6717957258224487, "learning_rate": 1.316326530612245e-06, "loss": 0.1213, "step": 4871 }, { "epoch": 2.1408, "grad_norm": 1.7956033945083618, "learning_rate": 1.306122448979592e-06, "loss": 0.1451, "step": 4872 }, { "epoch": 2.141, "grad_norm": 1.9850627183914185, "learning_rate": 1.2959183673469387e-06, "loss": 0.0518, "step": 4873 }, { "epoch": 2.1412, "grad_norm": 2.4787042140960693, "learning_rate": 1.2857142857142858e-06, "loss": 0.132, "step": 4874 }, { "epoch": 2.1414, "grad_norm": 2.5726118087768555, "learning_rate": 1.2755102040816327e-06, "loss": 0.144, "step": 4875 }, { "epoch": 2.1416, "grad_norm": 4.900381565093994, "learning_rate": 1.2653061224489797e-06, "loss": 0.5928, "step": 4876 }, { "epoch": 2.1418, "grad_norm": 2.545435667037964, "learning_rate": 1.2551020408163266e-06, "loss": 0.3196, "step": 4877 }, { "epoch": 2.142, "grad_norm": 1.777809739112854, "learning_rate": 1.2448979591836736e-06, "loss": 0.0613, "step": 4878 }, { "epoch": 2.1422, "grad_norm": 5.111660003662109, "learning_rate": 1.2346938775510205e-06, "loss": 1.4366, "step": 4879 }, { "epoch": 2.1424, "grad_norm": 3.8646697998046875, "learning_rate": 1.2244897959183673e-06, "loss": 0.4065, "step": 4880 }, { "epoch": 2.1426, "grad_norm": 3.850189447402954, "learning_rate": 1.2142857142857144e-06, "loss": 0.0971, "step": 4881 }, { "epoch": 2.1428, "grad_norm": 3.2342708110809326, "learning_rate": 1.2040816326530612e-06, "loss": 0.5958, "step": 4882 }, { "epoch": 2.143, "grad_norm": 4.939568519592285, "learning_rate": 1.193877551020408e-06, "loss": 0.4098, "step": 4883 }, { "epoch": 2.1432, "grad_norm": 6.049392223358154, "learning_rate": 1.1836734693877552e-06, "loss": 0.8668, "step": 4884 }, { "epoch": 2.1434, "grad_norm": 2.739666700363159, "learning_rate": 1.173469387755102e-06, "loss": 0.2615, "step": 4885 }, { "epoch": 2.1436, "grad_norm": 9.856095314025879, "learning_rate": 1.163265306122449e-06, "loss": 2.2432, "step": 4886 }, { "epoch": 2.1438, "grad_norm": 5.986268043518066, "learning_rate": 1.153061224489796e-06, "loss": 0.6472, "step": 4887 }, { "epoch": 2.144, "grad_norm": 4.728291034698486, "learning_rate": 1.142857142857143e-06, "loss": 0.6578, "step": 4888 }, { "epoch": 2.1442, "grad_norm": 2.241907835006714, "learning_rate": 1.1326530612244898e-06, "loss": 0.1339, "step": 4889 }, { "epoch": 2.1444, "grad_norm": 3.052717447280884, "learning_rate": 1.1224489795918367e-06, "loss": 0.8341, "step": 4890 }, { "epoch": 2.1446, "grad_norm": 5.318640232086182, "learning_rate": 1.1122448979591838e-06, "loss": 0.364, "step": 4891 }, { "epoch": 2.1448, "grad_norm": 3.5043251514434814, "learning_rate": 1.1020408163265306e-06, "loss": 1.6492, "step": 4892 }, { "epoch": 2.145, "grad_norm": 9.889541625976562, "learning_rate": 1.0918367346938777e-06, "loss": 1.1815, "step": 4893 }, { "epoch": 2.1452, "grad_norm": 2.5116753578186035, "learning_rate": 1.0816326530612245e-06, "loss": 0.5012, "step": 4894 }, { "epoch": 2.1454, "grad_norm": 1.443432092666626, "learning_rate": 1.0714285714285716e-06, "loss": 0.0875, "step": 4895 }, { "epoch": 2.1456, "grad_norm": 1.9345141649246216, "learning_rate": 1.0612244897959184e-06, "loss": 0.1529, "step": 4896 }, { "epoch": 2.1458, "grad_norm": 1.706526517868042, "learning_rate": 1.0510204081632653e-06, "loss": 0.1565, "step": 4897 }, { "epoch": 2.146, "grad_norm": 1.3103222846984863, "learning_rate": 1.0408163265306123e-06, "loss": 0.0434, "step": 4898 }, { "epoch": 2.1462, "grad_norm": 6.11220645904541, "learning_rate": 1.0306122448979592e-06, "loss": 0.3047, "step": 4899 }, { "epoch": 2.1464, "grad_norm": 2.7306883335113525, "learning_rate": 1.020408163265306e-06, "loss": 0.341, "step": 4900 }, { "epoch": 2.1466, "grad_norm": 15.686260223388672, "learning_rate": 1.010204081632653e-06, "loss": 1.5644, "step": 4901 }, { "epoch": 2.1468, "grad_norm": 5.284987926483154, "learning_rate": 1.0000000000000002e-06, "loss": 1.4679, "step": 4902 }, { "epoch": 2.147, "grad_norm": 6.0595574378967285, "learning_rate": 9.89795918367347e-07, "loss": 0.4288, "step": 4903 }, { "epoch": 2.1471999999999998, "grad_norm": 2.0993542671203613, "learning_rate": 9.795918367346939e-07, "loss": 0.2359, "step": 4904 }, { "epoch": 2.1474, "grad_norm": 4.383464336395264, "learning_rate": 9.69387755102041e-07, "loss": 0.8041, "step": 4905 }, { "epoch": 2.1476, "grad_norm": 2.5457303524017334, "learning_rate": 9.591836734693878e-07, "loss": 0.5014, "step": 4906 }, { "epoch": 2.1478, "grad_norm": 2.813450336456299, "learning_rate": 9.489795918367346e-07, "loss": 0.2483, "step": 4907 }, { "epoch": 2.148, "grad_norm": 1.4544942378997803, "learning_rate": 9.387755102040817e-07, "loss": 0.1044, "step": 4908 }, { "epoch": 2.1482, "grad_norm": 1.8861054182052612, "learning_rate": 9.285714285714287e-07, "loss": 0.1481, "step": 4909 }, { "epoch": 2.1484, "grad_norm": 0.4566737413406372, "learning_rate": 9.183673469387756e-07, "loss": 0.0085, "step": 4910 }, { "epoch": 2.1486, "grad_norm": 2.440262794494629, "learning_rate": 9.081632653061225e-07, "loss": 0.2129, "step": 4911 }, { "epoch": 2.1488, "grad_norm": 1.998534917831421, "learning_rate": 8.979591836734694e-07, "loss": 0.1949, "step": 4912 }, { "epoch": 2.149, "grad_norm": 7.0681047439575195, "learning_rate": 8.877551020408165e-07, "loss": 0.8244, "step": 4913 }, { "epoch": 2.1492, "grad_norm": 1.2315425872802734, "learning_rate": 8.775510204081632e-07, "loss": 0.0363, "step": 4914 }, { "epoch": 2.1494, "grad_norm": 11.267387390136719, "learning_rate": 8.673469387755103e-07, "loss": 1.4418, "step": 4915 }, { "epoch": 2.1496, "grad_norm": 1.8728132247924805, "learning_rate": 8.571428571428572e-07, "loss": 0.1121, "step": 4916 }, { "epoch": 2.1498, "grad_norm": 9.476338386535645, "learning_rate": 8.469387755102041e-07, "loss": 0.2879, "step": 4917 }, { "epoch": 2.15, "grad_norm": 3.17253041267395, "learning_rate": 8.36734693877551e-07, "loss": 0.2077, "step": 4918 }, { "epoch": 2.1502, "grad_norm": 2.146371603012085, "learning_rate": 8.26530612244898e-07, "loss": 0.2161, "step": 4919 }, { "epoch": 2.1504, "grad_norm": 4.625229835510254, "learning_rate": 8.16326530612245e-07, "loss": 0.5836, "step": 4920 }, { "epoch": 2.1506, "grad_norm": 3.1853513717651367, "learning_rate": 8.061224489795918e-07, "loss": 0.5375, "step": 4921 }, { "epoch": 2.1508, "grad_norm": 1.9648888111114502, "learning_rate": 7.959183673469388e-07, "loss": 0.2261, "step": 4922 }, { "epoch": 2.151, "grad_norm": 2.480511426925659, "learning_rate": 7.857142857142858e-07, "loss": 0.3026, "step": 4923 }, { "epoch": 2.1512000000000002, "grad_norm": 3.7170333862304688, "learning_rate": 7.755102040816327e-07, "loss": 0.6515, "step": 4924 }, { "epoch": 2.1514, "grad_norm": 3.0362749099731445, "learning_rate": 7.653061224489796e-07, "loss": 0.4871, "step": 4925 }, { "epoch": 2.1516, "grad_norm": 3.072406768798828, "learning_rate": 7.551020408163266e-07, "loss": 0.2954, "step": 4926 }, { "epoch": 2.1518, "grad_norm": 1.0148082971572876, "learning_rate": 7.448979591836736e-07, "loss": 0.0305, "step": 4927 }, { "epoch": 2.152, "grad_norm": 2.6846325397491455, "learning_rate": 7.346938775510204e-07, "loss": 0.4181, "step": 4928 }, { "epoch": 2.1522, "grad_norm": 3.4255530834198, "learning_rate": 7.244897959183674e-07, "loss": 0.4283, "step": 4929 }, { "epoch": 2.1524, "grad_norm": 5.002836227416992, "learning_rate": 7.142857142857143e-07, "loss": 1.4199, "step": 4930 }, { "epoch": 2.1526, "grad_norm": 5.339587688446045, "learning_rate": 7.040816326530613e-07, "loss": 1.3885, "step": 4931 }, { "epoch": 2.1528, "grad_norm": 2.046875476837158, "learning_rate": 6.938775510204082e-07, "loss": 0.1375, "step": 4932 }, { "epoch": 2.153, "grad_norm": 2.331223964691162, "learning_rate": 6.836734693877551e-07, "loss": 0.1424, "step": 4933 }, { "epoch": 2.1532, "grad_norm": 4.11829948425293, "learning_rate": 6.73469387755102e-07, "loss": 0.825, "step": 4934 }, { "epoch": 2.1534, "grad_norm": 10.498679161071777, "learning_rate": 6.63265306122449e-07, "loss": 1.3925, "step": 4935 }, { "epoch": 2.1536, "grad_norm": 4.633434772491455, "learning_rate": 6.53061224489796e-07, "loss": 0.7704, "step": 4936 }, { "epoch": 2.1538, "grad_norm": 9.199163436889648, "learning_rate": 6.428571428571429e-07, "loss": 3.3639, "step": 4937 }, { "epoch": 2.154, "grad_norm": 8.057214736938477, "learning_rate": 6.326530612244899e-07, "loss": 1.2202, "step": 4938 }, { "epoch": 2.1542, "grad_norm": 1.9962745904922485, "learning_rate": 6.224489795918368e-07, "loss": 0.1147, "step": 4939 }, { "epoch": 2.1544, "grad_norm": 1.3766710758209229, "learning_rate": 6.122448979591837e-07, "loss": 0.0292, "step": 4940 }, { "epoch": 2.1546, "grad_norm": 4.408767223358154, "learning_rate": 6.020408163265306e-07, "loss": 0.5736, "step": 4941 }, { "epoch": 2.1548, "grad_norm": 0.9604965448379517, "learning_rate": 5.918367346938776e-07, "loss": 0.0322, "step": 4942 }, { "epoch": 2.155, "grad_norm": 2.2919974327087402, "learning_rate": 5.816326530612245e-07, "loss": 0.2447, "step": 4943 }, { "epoch": 2.1552, "grad_norm": 2.1274476051330566, "learning_rate": 5.714285714285715e-07, "loss": 0.2329, "step": 4944 }, { "epoch": 2.1554, "grad_norm": 3.189530849456787, "learning_rate": 5.612244897959183e-07, "loss": 0.3092, "step": 4945 }, { "epoch": 2.1556, "grad_norm": 1.0164053440093994, "learning_rate": 5.510204081632653e-07, "loss": 0.0263, "step": 4946 }, { "epoch": 2.1558, "grad_norm": 1.5905165672302246, "learning_rate": 5.408163265306123e-07, "loss": 0.1089, "step": 4947 }, { "epoch": 2.156, "grad_norm": 2.1196746826171875, "learning_rate": 5.306122448979592e-07, "loss": 0.2421, "step": 4948 }, { "epoch": 2.1562, "grad_norm": 1.8495643138885498, "learning_rate": 5.204081632653062e-07, "loss": 0.1581, "step": 4949 }, { "epoch": 2.1564, "grad_norm": 3.3096840381622314, "learning_rate": 5.10204081632653e-07, "loss": 0.8094, "step": 4950 }, { "epoch": 2.1566, "grad_norm": 2.5738346576690674, "learning_rate": 5.000000000000001e-07, "loss": 0.1388, "step": 4951 }, { "epoch": 2.1568, "grad_norm": 3.5386643409729004, "learning_rate": 4.897959183673469e-07, "loss": 1.5187, "step": 4952 }, { "epoch": 2.157, "grad_norm": 8.421414375305176, "learning_rate": 4.795918367346939e-07, "loss": 1.4135, "step": 4953 }, { "epoch": 2.1572, "grad_norm": 7.643795967102051, "learning_rate": 4.6938775510204085e-07, "loss": 1.1508, "step": 4954 }, { "epoch": 2.1574, "grad_norm": 9.643633842468262, "learning_rate": 4.591836734693878e-07, "loss": 2.5107, "step": 4955 }, { "epoch": 2.1576, "grad_norm": 11.657377243041992, "learning_rate": 4.489795918367347e-07, "loss": 3.1162, "step": 4956 }, { "epoch": 2.1578, "grad_norm": 5.536891460418701, "learning_rate": 4.387755102040816e-07, "loss": 1.4635, "step": 4957 }, { "epoch": 2.158, "grad_norm": 2.9929583072662354, "learning_rate": 4.285714285714286e-07, "loss": 0.2885, "step": 4958 }, { "epoch": 2.1582, "grad_norm": 0.9444872736930847, "learning_rate": 4.183673469387755e-07, "loss": 0.0164, "step": 4959 }, { "epoch": 2.1584, "grad_norm": 1.915464997291565, "learning_rate": 4.081632653061225e-07, "loss": 0.3739, "step": 4960 }, { "epoch": 2.1586, "grad_norm": 1.9991296529769897, "learning_rate": 3.979591836734694e-07, "loss": 0.1583, "step": 4961 }, { "epoch": 2.1588, "grad_norm": 4.060037612915039, "learning_rate": 3.8775510204081634e-07, "loss": 1.5909, "step": 4962 }, { "epoch": 2.159, "grad_norm": 1.268680453300476, "learning_rate": 3.775510204081633e-07, "loss": 0.044, "step": 4963 }, { "epoch": 2.1592000000000002, "grad_norm": 4.501353740692139, "learning_rate": 3.673469387755102e-07, "loss": 0.5581, "step": 4964 }, { "epoch": 2.1593999999999998, "grad_norm": 1.8425730466842651, "learning_rate": 3.5714285714285716e-07, "loss": 0.1389, "step": 4965 }, { "epoch": 2.1596, "grad_norm": 2.0111634731292725, "learning_rate": 3.469387755102041e-07, "loss": 0.1959, "step": 4966 }, { "epoch": 2.1598, "grad_norm": 2.507601737976074, "learning_rate": 3.36734693877551e-07, "loss": 0.494, "step": 4967 }, { "epoch": 2.16, "grad_norm": 1.3330937623977661, "learning_rate": 3.26530612244898e-07, "loss": 0.0433, "step": 4968 }, { "epoch": 2.1602, "grad_norm": 3.633925676345825, "learning_rate": 3.1632653061224493e-07, "loss": 1.6239, "step": 4969 }, { "epoch": 2.1604, "grad_norm": 0.8505246043205261, "learning_rate": 3.0612244897959183e-07, "loss": 0.0187, "step": 4970 }, { "epoch": 2.1606, "grad_norm": 1.1604036092758179, "learning_rate": 2.959183673469388e-07, "loss": 0.0253, "step": 4971 }, { "epoch": 2.1608, "grad_norm": 1.7584632635116577, "learning_rate": 2.8571428571428575e-07, "loss": 0.1514, "step": 4972 }, { "epoch": 2.161, "grad_norm": 1.6417275667190552, "learning_rate": 2.7551020408163265e-07, "loss": 0.025, "step": 4973 }, { "epoch": 2.1612, "grad_norm": 2.554945707321167, "learning_rate": 2.653061224489796e-07, "loss": 0.5624, "step": 4974 }, { "epoch": 2.1614, "grad_norm": 2.8897547721862793, "learning_rate": 2.551020408163265e-07, "loss": 0.3113, "step": 4975 }, { "epoch": 2.1616, "grad_norm": 4.395506381988525, "learning_rate": 2.4489795918367347e-07, "loss": 1.6157, "step": 4976 }, { "epoch": 2.1618, "grad_norm": 5.517244338989258, "learning_rate": 2.3469387755102042e-07, "loss": 1.743, "step": 4977 }, { "epoch": 2.162, "grad_norm": 5.6711015701293945, "learning_rate": 2.2448979591836735e-07, "loss": 0.5785, "step": 4978 }, { "epoch": 2.1622, "grad_norm": 3.8922955989837646, "learning_rate": 2.142857142857143e-07, "loss": 2.177, "step": 4979 }, { "epoch": 2.1624, "grad_norm": 5.796224594116211, "learning_rate": 2.0408163265306124e-07, "loss": 2.4335, "step": 4980 }, { "epoch": 2.1626, "grad_norm": 3.8742527961730957, "learning_rate": 1.9387755102040817e-07, "loss": 1.4992, "step": 4981 }, { "epoch": 2.1628, "grad_norm": 11.639328002929688, "learning_rate": 1.836734693877551e-07, "loss": 3.339, "step": 4982 }, { "epoch": 2.163, "grad_norm": 5.487311840057373, "learning_rate": 1.7346938775510206e-07, "loss": 0.5887, "step": 4983 }, { "epoch": 2.1632, "grad_norm": 1.668665885925293, "learning_rate": 1.63265306122449e-07, "loss": 0.1145, "step": 4984 }, { "epoch": 2.1634, "grad_norm": 2.9366931915283203, "learning_rate": 1.5306122448979592e-07, "loss": 0.6556, "step": 4985 }, { "epoch": 2.1636, "grad_norm": 4.772943496704102, "learning_rate": 1.4285714285714287e-07, "loss": 0.8894, "step": 4986 }, { "epoch": 2.1638, "grad_norm": 5.80991792678833, "learning_rate": 1.326530612244898e-07, "loss": 2.3163, "step": 4987 }, { "epoch": 2.164, "grad_norm": 6.079235553741455, "learning_rate": 1.2244897959183673e-07, "loss": 2.0411, "step": 4988 }, { "epoch": 2.1642, "grad_norm": 11.773283004760742, "learning_rate": 1.1224489795918368e-07, "loss": 3.309, "step": 4989 }, { "epoch": 2.1644, "grad_norm": 5.10943078994751, "learning_rate": 1.0204081632653062e-07, "loss": 1.5065, "step": 4990 }, { "epoch": 2.1646, "grad_norm": 9.724143981933594, "learning_rate": 9.183673469387755e-08, "loss": 2.3739, "step": 4991 }, { "epoch": 2.1648, "grad_norm": 1.881788730621338, "learning_rate": 8.16326530612245e-08, "loss": 0.0728, "step": 4992 }, { "epoch": 2.165, "grad_norm": 5.462317943572998, "learning_rate": 7.142857142857144e-08, "loss": 0.6102, "step": 4993 }, { "epoch": 2.1652, "grad_norm": 1.4766038656234741, "learning_rate": 6.122448979591837e-08, "loss": 0.0714, "step": 4994 }, { "epoch": 2.1654, "grad_norm": 2.4013559818267822, "learning_rate": 5.102040816326531e-08, "loss": 0.4663, "step": 4995 }, { "epoch": 2.1656, "grad_norm": 4.605373859405518, "learning_rate": 4.081632653061225e-08, "loss": 0.8669, "step": 4996 }, { "epoch": 2.1658, "grad_norm": 1.9811826944351196, "learning_rate": 3.0612244897959183e-08, "loss": 0.0347, "step": 4997 }, { "epoch": 2.166, "grad_norm": 9.743058204650879, "learning_rate": 2.0408163265306123e-08, "loss": 3.2979, "step": 4998 }, { "epoch": 2.1662, "grad_norm": 1.0800037384033203, "learning_rate": 1.0204081632653062e-08, "loss": 0.024, "step": 4999 }, { "epoch": 2.1664, "grad_norm": 1.7821890115737915, "learning_rate": 0.0, "loss": 0.1462, "step": 5000 } ], "logging_steps": 1, "max_steps": 5000, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 128, "trial_name": null, "trial_params": null }