{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.996186117467582, "eval_steps": 500, "global_step": 1965, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0015255530129672007, "grad_norm": 55.88000383206808, "learning_rate": 2.5380710659898475e-07, "loss": 11.0765, "step": 1 }, { "epoch": 0.0030511060259344014, "grad_norm": 55.88268860975718, "learning_rate": 5.076142131979695e-07, "loss": 10.9814, "step": 2 }, { "epoch": 0.004576659038901602, "grad_norm": 54.0836855166221, "learning_rate": 7.614213197969544e-07, "loss": 11.0931, "step": 3 }, { "epoch": 0.006102212051868803, "grad_norm": 54.47169557217386, "learning_rate": 1.015228426395939e-06, "loss": 11.0954, "step": 4 }, { "epoch": 0.007627765064836003, "grad_norm": 56.38941947983373, "learning_rate": 1.2690355329949238e-06, "loss": 10.9298, "step": 5 }, { "epoch": 0.009153318077803204, "grad_norm": 54.84980648399342, "learning_rate": 1.5228426395939088e-06, "loss": 10.9394, "step": 6 }, { "epoch": 0.010678871090770405, "grad_norm": 58.74204074302873, "learning_rate": 1.7766497461928936e-06, "loss": 10.9743, "step": 7 }, { "epoch": 0.012204424103737605, "grad_norm": 61.21897739088408, "learning_rate": 2.030456852791878e-06, "loss": 10.7307, "step": 8 }, { "epoch": 0.013729977116704805, "grad_norm": 60.89294870094568, "learning_rate": 2.284263959390863e-06, "loss": 10.7832, "step": 9 }, { "epoch": 0.015255530129672006, "grad_norm": 86.50338649966646, "learning_rate": 2.5380710659898476e-06, "loss": 9.4282, "step": 10 }, { "epoch": 0.016781083142639208, "grad_norm": 96.24927400332886, "learning_rate": 2.7918781725888327e-06, "loss": 9.0517, "step": 11 }, { "epoch": 0.018306636155606407, "grad_norm": 103.01088072790436, "learning_rate": 3.0456852791878177e-06, "loss": 8.7765, "step": 12 }, { "epoch": 0.019832189168573607, "grad_norm": 66.52436218643427, "learning_rate": 3.2994923857868023e-06, "loss": 3.7181, "step": 13 }, { "epoch": 0.02135774218154081, "grad_norm": 55.08812752150733, "learning_rate": 3.5532994923857873e-06, "loss": 3.256, "step": 14 }, { "epoch": 0.02288329519450801, "grad_norm": 43.846496928188095, "learning_rate": 3.807106598984772e-06, "loss": 2.8638, "step": 15 }, { "epoch": 0.02440884820747521, "grad_norm": 32.74730272480893, "learning_rate": 4.060913705583756e-06, "loss": 2.4054, "step": 16 }, { "epoch": 0.02593440122044241, "grad_norm": 9.603334973521518, "learning_rate": 4.3147208121827415e-06, "loss": 1.5618, "step": 17 }, { "epoch": 0.02745995423340961, "grad_norm": 5.53247588860195, "learning_rate": 4.568527918781726e-06, "loss": 1.3143, "step": 18 }, { "epoch": 0.028985507246376812, "grad_norm": 4.105827499289385, "learning_rate": 4.822335025380711e-06, "loss": 1.2216, "step": 19 }, { "epoch": 0.03051106025934401, "grad_norm": 3.4000052200569923, "learning_rate": 5.076142131979695e-06, "loss": 1.2201, "step": 20 }, { "epoch": 0.032036613272311214, "grad_norm": 2.6879998456705905, "learning_rate": 5.329949238578681e-06, "loss": 1.1503, "step": 21 }, { "epoch": 0.033562166285278416, "grad_norm": 2.1072710234995657, "learning_rate": 5.583756345177665e-06, "loss": 1.0781, "step": 22 }, { "epoch": 0.03508771929824561, "grad_norm": 1.7421838938117116, "learning_rate": 5.83756345177665e-06, "loss": 0.9949, "step": 23 }, { "epoch": 0.036613272311212815, "grad_norm": 1.6759811520203085, "learning_rate": 6.091370558375635e-06, "loss": 0.9396, "step": 24 }, { "epoch": 0.03813882532418002, "grad_norm": 2.543416580168318, "learning_rate": 6.345177664974619e-06, "loss": 0.8998, "step": 25 }, { "epoch": 0.03966437833714721, "grad_norm": 1.1722028145828536, "learning_rate": 6.5989847715736045e-06, "loss": 0.8848, "step": 26 }, { "epoch": 0.041189931350114416, "grad_norm": 0.9844726876714781, "learning_rate": 6.852791878172589e-06, "loss": 0.8663, "step": 27 }, { "epoch": 0.04271548436308162, "grad_norm": 0.8452871932199003, "learning_rate": 7.106598984771575e-06, "loss": 0.8277, "step": 28 }, { "epoch": 0.04424103737604882, "grad_norm": 0.8916194748929214, "learning_rate": 7.360406091370558e-06, "loss": 0.8066, "step": 29 }, { "epoch": 0.04576659038901602, "grad_norm": 0.7377757781215405, "learning_rate": 7.614213197969544e-06, "loss": 0.8226, "step": 30 }, { "epoch": 0.04729214340198322, "grad_norm": 0.8856223276762146, "learning_rate": 7.868020304568528e-06, "loss": 0.7893, "step": 31 }, { "epoch": 0.04881769641495042, "grad_norm": 0.6508169390709633, "learning_rate": 8.121827411167512e-06, "loss": 0.7804, "step": 32 }, { "epoch": 0.05034324942791762, "grad_norm": 0.6351910751183879, "learning_rate": 8.375634517766498e-06, "loss": 0.737, "step": 33 }, { "epoch": 0.05186880244088482, "grad_norm": 0.6843074789800184, "learning_rate": 8.629441624365483e-06, "loss": 0.7385, "step": 34 }, { "epoch": 0.05339435545385202, "grad_norm": 0.6620709946674335, "learning_rate": 8.883248730964468e-06, "loss": 0.6753, "step": 35 }, { "epoch": 0.05491990846681922, "grad_norm": 0.5446251237226146, "learning_rate": 9.137055837563452e-06, "loss": 0.6791, "step": 36 }, { "epoch": 0.05644546147978642, "grad_norm": 0.4940245336984263, "learning_rate": 9.390862944162437e-06, "loss": 0.668, "step": 37 }, { "epoch": 0.057971014492753624, "grad_norm": 0.4883431498501216, "learning_rate": 9.644670050761421e-06, "loss": 0.6867, "step": 38 }, { "epoch": 0.059496567505720827, "grad_norm": 0.4672008802639215, "learning_rate": 9.898477157360408e-06, "loss": 0.6844, "step": 39 }, { "epoch": 0.06102212051868802, "grad_norm": 0.48650473895236, "learning_rate": 1.015228426395939e-05, "loss": 0.6472, "step": 40 }, { "epoch": 0.06254767353165523, "grad_norm": 0.4830917557599978, "learning_rate": 1.0406091370558377e-05, "loss": 0.6453, "step": 41 }, { "epoch": 0.06407322654462243, "grad_norm": 0.47417484948798255, "learning_rate": 1.0659898477157361e-05, "loss": 0.6333, "step": 42 }, { "epoch": 0.06559877955758962, "grad_norm": 0.4030923786062654, "learning_rate": 1.0913705583756344e-05, "loss": 0.6383, "step": 43 }, { "epoch": 0.06712433257055683, "grad_norm": 0.4159082973227752, "learning_rate": 1.116751269035533e-05, "loss": 0.6512, "step": 44 }, { "epoch": 0.06864988558352403, "grad_norm": 0.42253143600580884, "learning_rate": 1.1421319796954315e-05, "loss": 0.6308, "step": 45 }, { "epoch": 0.07017543859649122, "grad_norm": 0.47113895923259785, "learning_rate": 1.16751269035533e-05, "loss": 0.6568, "step": 46 }, { "epoch": 0.07170099160945843, "grad_norm": 0.30882796664229845, "learning_rate": 1.1928934010152284e-05, "loss": 0.6066, "step": 47 }, { "epoch": 0.07322654462242563, "grad_norm": 0.36989002092255446, "learning_rate": 1.218274111675127e-05, "loss": 0.6495, "step": 48 }, { "epoch": 0.07475209763539283, "grad_norm": 0.37156553357084415, "learning_rate": 1.2436548223350254e-05, "loss": 0.6175, "step": 49 }, { "epoch": 0.07627765064836003, "grad_norm": 0.8002269797092063, "learning_rate": 1.2690355329949238e-05, "loss": 0.6207, "step": 50 }, { "epoch": 0.07780320366132723, "grad_norm": 0.32214792222590466, "learning_rate": 1.2944162436548224e-05, "loss": 0.5969, "step": 51 }, { "epoch": 0.07932875667429443, "grad_norm": 0.3061568657189902, "learning_rate": 1.3197969543147209e-05, "loss": 0.6399, "step": 52 }, { "epoch": 0.08085430968726164, "grad_norm": 0.32240275255442236, "learning_rate": 1.3451776649746192e-05, "loss": 0.6111, "step": 53 }, { "epoch": 0.08237986270022883, "grad_norm": 0.3291098800636541, "learning_rate": 1.3705583756345178e-05, "loss": 0.5838, "step": 54 }, { "epoch": 0.08390541571319603, "grad_norm": 0.33211334436889267, "learning_rate": 1.3959390862944163e-05, "loss": 0.6166, "step": 55 }, { "epoch": 0.08543096872616324, "grad_norm": 0.2991553042417146, "learning_rate": 1.421319796954315e-05, "loss": 0.5988, "step": 56 }, { "epoch": 0.08695652173913043, "grad_norm": 0.27694691190260423, "learning_rate": 1.4467005076142132e-05, "loss": 0.5823, "step": 57 }, { "epoch": 0.08848207475209764, "grad_norm": 0.3115693417644138, "learning_rate": 1.4720812182741117e-05, "loss": 0.5628, "step": 58 }, { "epoch": 0.09000762776506484, "grad_norm": 0.31670264536471215, "learning_rate": 1.4974619289340103e-05, "loss": 0.6005, "step": 59 }, { "epoch": 0.09153318077803203, "grad_norm": 0.28948747005580666, "learning_rate": 1.5228426395939088e-05, "loss": 0.5894, "step": 60 }, { "epoch": 0.09305873379099924, "grad_norm": 0.2919781215198566, "learning_rate": 1.548223350253807e-05, "loss": 0.6014, "step": 61 }, { "epoch": 0.09458428680396644, "grad_norm": 0.27286206726223033, "learning_rate": 1.5736040609137055e-05, "loss": 0.5662, "step": 62 }, { "epoch": 0.09610983981693363, "grad_norm": 0.268249522887509, "learning_rate": 1.5989847715736043e-05, "loss": 0.5551, "step": 63 }, { "epoch": 0.09763539282990084, "grad_norm": 0.2710539351846991, "learning_rate": 1.6243654822335024e-05, "loss": 0.5522, "step": 64 }, { "epoch": 0.09916094584286804, "grad_norm": 0.2605882231456281, "learning_rate": 1.649746192893401e-05, "loss": 0.5627, "step": 65 }, { "epoch": 0.10068649885583524, "grad_norm": 0.2863241910439653, "learning_rate": 1.6751269035532997e-05, "loss": 0.5706, "step": 66 }, { "epoch": 0.10221205186880244, "grad_norm": 0.26834273946373155, "learning_rate": 1.700507614213198e-05, "loss": 0.5563, "step": 67 }, { "epoch": 0.10373760488176964, "grad_norm": 0.26963191017034194, "learning_rate": 1.7258883248730966e-05, "loss": 0.5657, "step": 68 }, { "epoch": 0.10526315789473684, "grad_norm": 0.2842231027390101, "learning_rate": 1.751269035532995e-05, "loss": 0.5759, "step": 69 }, { "epoch": 0.10678871090770405, "grad_norm": 0.2615342982302276, "learning_rate": 1.7766497461928935e-05, "loss": 0.5667, "step": 70 }, { "epoch": 0.10831426392067124, "grad_norm": 0.2571340491010065, "learning_rate": 1.802030456852792e-05, "loss": 0.576, "step": 71 }, { "epoch": 0.10983981693363844, "grad_norm": 0.2706971456685584, "learning_rate": 1.8274111675126904e-05, "loss": 0.5859, "step": 72 }, { "epoch": 0.11136536994660565, "grad_norm": 0.2471893691410938, "learning_rate": 1.852791878172589e-05, "loss": 0.5491, "step": 73 }, { "epoch": 0.11289092295957284, "grad_norm": 0.27003901385356355, "learning_rate": 1.8781725888324874e-05, "loss": 0.5323, "step": 74 }, { "epoch": 0.11441647597254005, "grad_norm": 0.27371467439294905, "learning_rate": 1.9035532994923858e-05, "loss": 0.5398, "step": 75 }, { "epoch": 0.11594202898550725, "grad_norm": 0.2558176358885089, "learning_rate": 1.9289340101522843e-05, "loss": 0.5011, "step": 76 }, { "epoch": 0.11746758199847444, "grad_norm": 0.2836432263652492, "learning_rate": 1.9543147208121827e-05, "loss": 0.5567, "step": 77 }, { "epoch": 0.11899313501144165, "grad_norm": 0.2510880683600564, "learning_rate": 1.9796954314720815e-05, "loss": 0.5517, "step": 78 }, { "epoch": 0.12051868802440885, "grad_norm": 0.29754874593280645, "learning_rate": 2.0050761421319797e-05, "loss": 0.5425, "step": 79 }, { "epoch": 0.12204424103737604, "grad_norm": 0.2738662366242286, "learning_rate": 2.030456852791878e-05, "loss": 0.5576, "step": 80 }, { "epoch": 0.12356979405034325, "grad_norm": 0.279311498047004, "learning_rate": 2.055837563451777e-05, "loss": 0.5409, "step": 81 }, { "epoch": 0.12509534706331046, "grad_norm": 0.31258524874496857, "learning_rate": 2.0812182741116754e-05, "loss": 0.5536, "step": 82 }, { "epoch": 0.12662090007627766, "grad_norm": 0.3173618131396359, "learning_rate": 2.1065989847715735e-05, "loss": 0.5305, "step": 83 }, { "epoch": 0.12814645308924486, "grad_norm": 0.3307918249285771, "learning_rate": 2.1319796954314723e-05, "loss": 0.5514, "step": 84 }, { "epoch": 0.12967200610221205, "grad_norm": 0.3041878553835707, "learning_rate": 2.1573604060913707e-05, "loss": 0.5424, "step": 85 }, { "epoch": 0.13119755911517925, "grad_norm": 0.33787709221853124, "learning_rate": 2.182741116751269e-05, "loss": 0.5325, "step": 86 }, { "epoch": 0.13272311212814644, "grad_norm": 0.3009701848739935, "learning_rate": 2.2081218274111677e-05, "loss": 0.518, "step": 87 }, { "epoch": 0.13424866514111367, "grad_norm": 0.3431111942607132, "learning_rate": 2.233502538071066e-05, "loss": 0.5523, "step": 88 }, { "epoch": 0.13577421815408086, "grad_norm": 0.2939274934820149, "learning_rate": 2.2588832487309646e-05, "loss": 0.5466, "step": 89 }, { "epoch": 0.13729977116704806, "grad_norm": 0.32001698933885575, "learning_rate": 2.284263959390863e-05, "loss": 0.5302, "step": 90 }, { "epoch": 0.13882532418001525, "grad_norm": 0.31254407085355995, "learning_rate": 2.3096446700507615e-05, "loss": 0.5323, "step": 91 }, { "epoch": 0.14035087719298245, "grad_norm": 0.3203144309469938, "learning_rate": 2.33502538071066e-05, "loss": 0.5336, "step": 92 }, { "epoch": 0.14187643020594964, "grad_norm": 0.2893890738368526, "learning_rate": 2.3604060913705588e-05, "loss": 0.5329, "step": 93 }, { "epoch": 0.14340198321891687, "grad_norm": 0.3189107842660622, "learning_rate": 2.385786802030457e-05, "loss": 0.5348, "step": 94 }, { "epoch": 0.14492753623188406, "grad_norm": 0.34528306216855803, "learning_rate": 2.4111675126903553e-05, "loss": 0.5483, "step": 95 }, { "epoch": 0.14645308924485126, "grad_norm": 0.30760010341553246, "learning_rate": 2.436548223350254e-05, "loss": 0.5483, "step": 96 }, { "epoch": 0.14797864225781845, "grad_norm": 0.31091630783524227, "learning_rate": 2.4619289340101523e-05, "loss": 0.5227, "step": 97 }, { "epoch": 0.14950419527078565, "grad_norm": 0.3443416890112506, "learning_rate": 2.4873096446700507e-05, "loss": 0.5239, "step": 98 }, { "epoch": 0.15102974828375287, "grad_norm": 0.3450504510548033, "learning_rate": 2.5126903553299492e-05, "loss": 0.5357, "step": 99 }, { "epoch": 0.15255530129672007, "grad_norm": 0.30857498415221274, "learning_rate": 2.5380710659898476e-05, "loss": 0.5278, "step": 100 }, { "epoch": 0.15408085430968727, "grad_norm": 0.3248471415386921, "learning_rate": 2.563451776649746e-05, "loss": 0.4947, "step": 101 }, { "epoch": 0.15560640732265446, "grad_norm": 0.29633240413007217, "learning_rate": 2.588832487309645e-05, "loss": 0.4969, "step": 102 }, { "epoch": 0.15713196033562166, "grad_norm": 0.32647275675691345, "learning_rate": 2.6142131979695434e-05, "loss": 0.5211, "step": 103 }, { "epoch": 0.15865751334858885, "grad_norm": 0.2797389340538669, "learning_rate": 2.6395939086294418e-05, "loss": 0.4982, "step": 104 }, { "epoch": 0.16018306636155608, "grad_norm": 0.3306159602092806, "learning_rate": 2.6649746192893406e-05, "loss": 0.5175, "step": 105 }, { "epoch": 0.16170861937452327, "grad_norm": 0.309903569962899, "learning_rate": 2.6903553299492384e-05, "loss": 0.5322, "step": 106 }, { "epoch": 0.16323417238749047, "grad_norm": 0.4210903211687211, "learning_rate": 2.715736040609137e-05, "loss": 0.5285, "step": 107 }, { "epoch": 0.16475972540045766, "grad_norm": 0.2987578394619366, "learning_rate": 2.7411167512690357e-05, "loss": 0.5166, "step": 108 }, { "epoch": 0.16628527841342486, "grad_norm": 0.3692820965774576, "learning_rate": 2.766497461928934e-05, "loss": 0.5268, "step": 109 }, { "epoch": 0.16781083142639205, "grad_norm": 0.36679117529033817, "learning_rate": 2.7918781725888326e-05, "loss": 0.5133, "step": 110 }, { "epoch": 0.16933638443935928, "grad_norm": 0.3653267788003699, "learning_rate": 2.8172588832487314e-05, "loss": 0.5032, "step": 111 }, { "epoch": 0.17086193745232647, "grad_norm": 0.31915637471934893, "learning_rate": 2.84263959390863e-05, "loss": 0.5413, "step": 112 }, { "epoch": 0.17238749046529367, "grad_norm": 0.3679755403709902, "learning_rate": 2.8680203045685283e-05, "loss": 0.5133, "step": 113 }, { "epoch": 0.17391304347826086, "grad_norm": 0.3554894746921558, "learning_rate": 2.8934010152284264e-05, "loss": 0.4975, "step": 114 }, { "epoch": 0.17543859649122806, "grad_norm": 0.33406061120945474, "learning_rate": 2.918781725888325e-05, "loss": 0.5356, "step": 115 }, { "epoch": 0.17696414950419528, "grad_norm": 0.33456153184785203, "learning_rate": 2.9441624365482233e-05, "loss": 0.5066, "step": 116 }, { "epoch": 0.17848970251716248, "grad_norm": 0.39642613038185825, "learning_rate": 2.969543147208122e-05, "loss": 0.5158, "step": 117 }, { "epoch": 0.18001525553012968, "grad_norm": 0.3368112730923934, "learning_rate": 2.9949238578680206e-05, "loss": 0.5533, "step": 118 }, { "epoch": 0.18154080854309687, "grad_norm": 0.39599908942243056, "learning_rate": 3.020304568527919e-05, "loss": 0.5099, "step": 119 }, { "epoch": 0.18306636155606407, "grad_norm": 0.40873178022050505, "learning_rate": 3.0456852791878175e-05, "loss": 0.5082, "step": 120 }, { "epoch": 0.18459191456903126, "grad_norm": 0.37003794750518143, "learning_rate": 3.071065989847716e-05, "loss": 0.5019, "step": 121 }, { "epoch": 0.18611746758199849, "grad_norm": 0.36467300367696853, "learning_rate": 3.096446700507614e-05, "loss": 0.4744, "step": 122 }, { "epoch": 0.18764302059496568, "grad_norm": 0.4385726256188317, "learning_rate": 3.121827411167513e-05, "loss": 0.4914, "step": 123 }, { "epoch": 0.18916857360793288, "grad_norm": 0.38189320901963275, "learning_rate": 3.147208121827411e-05, "loss": 0.4993, "step": 124 }, { "epoch": 0.19069412662090007, "grad_norm": 0.49053280789406456, "learning_rate": 3.17258883248731e-05, "loss": 0.5405, "step": 125 }, { "epoch": 0.19221967963386727, "grad_norm": 0.4769105100767916, "learning_rate": 3.1979695431472086e-05, "loss": 0.5, "step": 126 }, { "epoch": 0.19374523264683446, "grad_norm": 0.42515148472239256, "learning_rate": 3.223350253807107e-05, "loss": 0.5056, "step": 127 }, { "epoch": 0.1952707856598017, "grad_norm": 0.3798566436420005, "learning_rate": 3.248730964467005e-05, "loss": 0.5062, "step": 128 }, { "epoch": 0.19679633867276888, "grad_norm": 0.509095378757817, "learning_rate": 3.2741116751269036e-05, "loss": 0.5126, "step": 129 }, { "epoch": 0.19832189168573608, "grad_norm": 0.5188828118008044, "learning_rate": 3.299492385786802e-05, "loss": 0.5021, "step": 130 }, { "epoch": 0.19984744469870327, "grad_norm": 0.4111552897977725, "learning_rate": 3.3248730964467006e-05, "loss": 0.5168, "step": 131 }, { "epoch": 0.20137299771167047, "grad_norm": 0.5004706437857418, "learning_rate": 3.3502538071065994e-05, "loss": 0.5073, "step": 132 }, { "epoch": 0.2028985507246377, "grad_norm": 0.44272001802480027, "learning_rate": 3.3756345177664975e-05, "loss": 0.5146, "step": 133 }, { "epoch": 0.2044241037376049, "grad_norm": 0.4044855190961641, "learning_rate": 3.401015228426396e-05, "loss": 0.4878, "step": 134 }, { "epoch": 0.20594965675057209, "grad_norm": 0.448067603166869, "learning_rate": 3.4263959390862944e-05, "loss": 0.5027, "step": 135 }, { "epoch": 0.20747520976353928, "grad_norm": 0.5718659703770835, "learning_rate": 3.451776649746193e-05, "loss": 0.4751, "step": 136 }, { "epoch": 0.20900076277650648, "grad_norm": 0.3723611331519563, "learning_rate": 3.477157360406091e-05, "loss": 0.4958, "step": 137 }, { "epoch": 0.21052631578947367, "grad_norm": 0.48362860914936395, "learning_rate": 3.50253807106599e-05, "loss": 0.4823, "step": 138 }, { "epoch": 0.2120518688024409, "grad_norm": 0.5353366890665446, "learning_rate": 3.527918781725888e-05, "loss": 0.4983, "step": 139 }, { "epoch": 0.2135774218154081, "grad_norm": 0.4411572574415237, "learning_rate": 3.553299492385787e-05, "loss": 0.5236, "step": 140 }, { "epoch": 0.2151029748283753, "grad_norm": 0.5693476043264747, "learning_rate": 3.578680203045686e-05, "loss": 0.4684, "step": 141 }, { "epoch": 0.21662852784134248, "grad_norm": 0.6425736599419469, "learning_rate": 3.604060913705584e-05, "loss": 0.4842, "step": 142 }, { "epoch": 0.21815408085430968, "grad_norm": 0.43467579478422785, "learning_rate": 3.629441624365482e-05, "loss": 0.5103, "step": 143 }, { "epoch": 0.21967963386727687, "grad_norm": 0.8449956349942062, "learning_rate": 3.654822335025381e-05, "loss": 0.482, "step": 144 }, { "epoch": 0.2212051868802441, "grad_norm": 0.6517623801926216, "learning_rate": 3.680203045685279e-05, "loss": 0.5038, "step": 145 }, { "epoch": 0.2227307398932113, "grad_norm": 0.5441669121124234, "learning_rate": 3.705583756345178e-05, "loss": 0.4904, "step": 146 }, { "epoch": 0.2242562929061785, "grad_norm": 0.6218334722660793, "learning_rate": 3.7309644670050766e-05, "loss": 0.4996, "step": 147 }, { "epoch": 0.22578184591914569, "grad_norm": 0.622312111306837, "learning_rate": 3.756345177664975e-05, "loss": 0.5197, "step": 148 }, { "epoch": 0.22730739893211288, "grad_norm": 0.5027883306888783, "learning_rate": 3.7817258883248735e-05, "loss": 0.4881, "step": 149 }, { "epoch": 0.2288329519450801, "grad_norm": 0.6526639925691351, "learning_rate": 3.8071065989847716e-05, "loss": 0.4961, "step": 150 }, { "epoch": 0.2303585049580473, "grad_norm": 0.4549092358748704, "learning_rate": 3.83248730964467e-05, "loss": 0.4981, "step": 151 }, { "epoch": 0.2318840579710145, "grad_norm": 0.6274895391580585, "learning_rate": 3.8578680203045685e-05, "loss": 0.4925, "step": 152 }, { "epoch": 0.2334096109839817, "grad_norm": 0.4842229185831382, "learning_rate": 3.8832487309644673e-05, "loss": 0.4743, "step": 153 }, { "epoch": 0.2349351639969489, "grad_norm": 0.5468556570446964, "learning_rate": 3.9086294416243655e-05, "loss": 0.4934, "step": 154 }, { "epoch": 0.23646071700991608, "grad_norm": 0.6443008789123438, "learning_rate": 3.934010152284264e-05, "loss": 0.5296, "step": 155 }, { "epoch": 0.2379862700228833, "grad_norm": 0.444510041361722, "learning_rate": 3.959390862944163e-05, "loss": 0.5012, "step": 156 }, { "epoch": 0.2395118230358505, "grad_norm": 0.7543670344115693, "learning_rate": 3.9847715736040605e-05, "loss": 0.4933, "step": 157 }, { "epoch": 0.2410373760488177, "grad_norm": 0.45282352353842703, "learning_rate": 4.010152284263959e-05, "loss": 0.483, "step": 158 }, { "epoch": 0.2425629290617849, "grad_norm": 0.5339353191922888, "learning_rate": 4.035532994923858e-05, "loss": 0.4933, "step": 159 }, { "epoch": 0.2440884820747521, "grad_norm": 0.45790763663725975, "learning_rate": 4.060913705583756e-05, "loss": 0.4618, "step": 160 }, { "epoch": 0.24561403508771928, "grad_norm": 0.48805058257865885, "learning_rate": 4.086294416243655e-05, "loss": 0.4743, "step": 161 }, { "epoch": 0.2471395881006865, "grad_norm": 0.5780459276913678, "learning_rate": 4.111675126903554e-05, "loss": 0.4572, "step": 162 }, { "epoch": 0.2486651411136537, "grad_norm": 0.44288672561950615, "learning_rate": 4.137055837563452e-05, "loss": 0.4798, "step": 163 }, { "epoch": 0.2501906941266209, "grad_norm": 0.49217871577753636, "learning_rate": 4.162436548223351e-05, "loss": 0.5071, "step": 164 }, { "epoch": 0.2517162471395881, "grad_norm": 0.4404444082718636, "learning_rate": 4.187817258883249e-05, "loss": 0.5177, "step": 165 }, { "epoch": 0.2532418001525553, "grad_norm": 0.5380575952107296, "learning_rate": 4.213197969543147e-05, "loss": 0.4834, "step": 166 }, { "epoch": 0.2547673531655225, "grad_norm": 0.4489323446314217, "learning_rate": 4.238578680203046e-05, "loss": 0.4933, "step": 167 }, { "epoch": 0.2562929061784897, "grad_norm": 0.4473677330159973, "learning_rate": 4.2639593908629446e-05, "loss": 0.4741, "step": 168 }, { "epoch": 0.2578184591914569, "grad_norm": 0.449098025532032, "learning_rate": 4.289340101522843e-05, "loss": 0.4826, "step": 169 }, { "epoch": 0.2593440122044241, "grad_norm": 0.43662049292538996, "learning_rate": 4.3147208121827415e-05, "loss": 0.4868, "step": 170 }, { "epoch": 0.2608695652173913, "grad_norm": 0.3319680098257283, "learning_rate": 4.34010152284264e-05, "loss": 0.4579, "step": 171 }, { "epoch": 0.2623951182303585, "grad_norm": 0.46213502274360335, "learning_rate": 4.365482233502538e-05, "loss": 0.4678, "step": 172 }, { "epoch": 0.2639206712433257, "grad_norm": 0.31262043632684705, "learning_rate": 4.3908629441624365e-05, "loss": 0.4837, "step": 173 }, { "epoch": 0.2654462242562929, "grad_norm": 0.46608634573300534, "learning_rate": 4.416243654822335e-05, "loss": 0.4923, "step": 174 }, { "epoch": 0.2669717772692601, "grad_norm": 0.4211508088357342, "learning_rate": 4.4416243654822335e-05, "loss": 0.4993, "step": 175 }, { "epoch": 0.26849733028222733, "grad_norm": 0.41570394841221237, "learning_rate": 4.467005076142132e-05, "loss": 0.5089, "step": 176 }, { "epoch": 0.2700228832951945, "grad_norm": 0.47720671631353956, "learning_rate": 4.492385786802031e-05, "loss": 0.5169, "step": 177 }, { "epoch": 0.2715484363081617, "grad_norm": 0.3943421712390057, "learning_rate": 4.517766497461929e-05, "loss": 0.486, "step": 178 }, { "epoch": 0.2730739893211289, "grad_norm": 0.3657449910955345, "learning_rate": 4.543147208121827e-05, "loss": 0.468, "step": 179 }, { "epoch": 0.2745995423340961, "grad_norm": 0.33705977330021186, "learning_rate": 4.568527918781726e-05, "loss": 0.4691, "step": 180 }, { "epoch": 0.2761250953470633, "grad_norm": 0.48016632862316694, "learning_rate": 4.593908629441624e-05, "loss": 0.4686, "step": 181 }, { "epoch": 0.2776506483600305, "grad_norm": 0.4007003040017178, "learning_rate": 4.619289340101523e-05, "loss": 0.4913, "step": 182 }, { "epoch": 0.2791762013729977, "grad_norm": 0.49732079427964887, "learning_rate": 4.644670050761422e-05, "loss": 0.4812, "step": 183 }, { "epoch": 0.2807017543859649, "grad_norm": 0.3876840707466106, "learning_rate": 4.67005076142132e-05, "loss": 0.4757, "step": 184 }, { "epoch": 0.2822273073989321, "grad_norm": 0.46031139692605944, "learning_rate": 4.695431472081219e-05, "loss": 0.4746, "step": 185 }, { "epoch": 0.2837528604118993, "grad_norm": 0.5124316062824426, "learning_rate": 4.7208121827411175e-05, "loss": 0.4831, "step": 186 }, { "epoch": 0.28527841342486654, "grad_norm": 0.5417121484222781, "learning_rate": 4.746192893401015e-05, "loss": 0.4962, "step": 187 }, { "epoch": 0.28680396643783374, "grad_norm": 0.5210161657821222, "learning_rate": 4.771573604060914e-05, "loss": 0.4711, "step": 188 }, { "epoch": 0.28832951945080093, "grad_norm": 0.3652367581389051, "learning_rate": 4.7969543147208126e-05, "loss": 0.478, "step": 189 }, { "epoch": 0.2898550724637681, "grad_norm": 0.6269855841429538, "learning_rate": 4.822335025380711e-05, "loss": 0.46, "step": 190 }, { "epoch": 0.2913806254767353, "grad_norm": 0.441568182365455, "learning_rate": 4.8477157360406095e-05, "loss": 0.4867, "step": 191 }, { "epoch": 0.2929061784897025, "grad_norm": 0.4410864596893693, "learning_rate": 4.873096446700508e-05, "loss": 0.4654, "step": 192 }, { "epoch": 0.2944317315026697, "grad_norm": 0.5091064245038812, "learning_rate": 4.8984771573604064e-05, "loss": 0.468, "step": 193 }, { "epoch": 0.2959572845156369, "grad_norm": 0.4441100137243752, "learning_rate": 4.9238578680203045e-05, "loss": 0.4897, "step": 194 }, { "epoch": 0.2974828375286041, "grad_norm": 0.4163739595578847, "learning_rate": 4.949238578680203e-05, "loss": 0.4774, "step": 195 }, { "epoch": 0.2990083905415713, "grad_norm": 0.38947059047218374, "learning_rate": 4.9746192893401014e-05, "loss": 0.4903, "step": 196 }, { "epoch": 0.3005339435545385, "grad_norm": 0.44065669053134326, "learning_rate": 5e-05, "loss": 0.4662, "step": 197 }, { "epoch": 0.30205949656750575, "grad_norm": 0.3445543234859049, "learning_rate": 4.997171945701358e-05, "loss": 0.4724, "step": 198 }, { "epoch": 0.30358504958047294, "grad_norm": 0.45807223386266915, "learning_rate": 4.994343891402715e-05, "loss": 0.4799, "step": 199 }, { "epoch": 0.30511060259344014, "grad_norm": 0.4954707471406609, "learning_rate": 4.991515837104073e-05, "loss": 0.4804, "step": 200 }, { "epoch": 0.30663615560640733, "grad_norm": 0.44335077027616937, "learning_rate": 4.98868778280543e-05, "loss": 0.4624, "step": 201 }, { "epoch": 0.30816170861937453, "grad_norm": 0.4182871013585828, "learning_rate": 4.985859728506788e-05, "loss": 0.4814, "step": 202 }, { "epoch": 0.3096872616323417, "grad_norm": 0.47873990237766945, "learning_rate": 4.983031674208145e-05, "loss": 0.497, "step": 203 }, { "epoch": 0.3112128146453089, "grad_norm": 0.37130826366755704, "learning_rate": 4.980203619909503e-05, "loss": 0.4756, "step": 204 }, { "epoch": 0.3127383676582761, "grad_norm": 0.43381734662593663, "learning_rate": 4.97737556561086e-05, "loss": 0.4592, "step": 205 }, { "epoch": 0.3142639206712433, "grad_norm": 0.37802189902667876, "learning_rate": 4.974547511312218e-05, "loss": 0.4895, "step": 206 }, { "epoch": 0.3157894736842105, "grad_norm": 0.43660330927144847, "learning_rate": 4.971719457013575e-05, "loss": 0.4629, "step": 207 }, { "epoch": 0.3173150266971777, "grad_norm": 0.3699050009743259, "learning_rate": 4.9688914027149327e-05, "loss": 0.4774, "step": 208 }, { "epoch": 0.3188405797101449, "grad_norm": 0.3722325824685486, "learning_rate": 4.96606334841629e-05, "loss": 0.5009, "step": 209 }, { "epoch": 0.32036613272311215, "grad_norm": 0.42801493321280004, "learning_rate": 4.9632352941176476e-05, "loss": 0.4543, "step": 210 }, { "epoch": 0.32189168573607935, "grad_norm": 0.46399544699213296, "learning_rate": 4.960407239819005e-05, "loss": 0.4949, "step": 211 }, { "epoch": 0.32341723874904654, "grad_norm": 0.42121235432689375, "learning_rate": 4.9575791855203626e-05, "loss": 0.475, "step": 212 }, { "epoch": 0.32494279176201374, "grad_norm": 0.4272365929618656, "learning_rate": 4.95475113122172e-05, "loss": 0.4762, "step": 213 }, { "epoch": 0.32646834477498093, "grad_norm": 0.5943722623210292, "learning_rate": 4.9519230769230776e-05, "loss": 0.4645, "step": 214 }, { "epoch": 0.32799389778794813, "grad_norm": 0.4391503380933774, "learning_rate": 4.949095022624435e-05, "loss": 0.4715, "step": 215 }, { "epoch": 0.3295194508009153, "grad_norm": 0.5791927295142608, "learning_rate": 4.9462669683257926e-05, "loss": 0.4402, "step": 216 }, { "epoch": 0.3310450038138825, "grad_norm": 0.5046188493276822, "learning_rate": 4.94343891402715e-05, "loss": 0.501, "step": 217 }, { "epoch": 0.3325705568268497, "grad_norm": 0.4590973221428651, "learning_rate": 4.940610859728507e-05, "loss": 0.4623, "step": 218 }, { "epoch": 0.3340961098398169, "grad_norm": 0.4243812726071251, "learning_rate": 4.9377828054298644e-05, "loss": 0.4908, "step": 219 }, { "epoch": 0.3356216628527841, "grad_norm": 0.527581684322034, "learning_rate": 4.934954751131222e-05, "loss": 0.4681, "step": 220 }, { "epoch": 0.33714721586575136, "grad_norm": 0.41265843549008524, "learning_rate": 4.9321266968325794e-05, "loss": 0.472, "step": 221 }, { "epoch": 0.33867276887871856, "grad_norm": 0.5350130897075509, "learning_rate": 4.929298642533937e-05, "loss": 0.4555, "step": 222 }, { "epoch": 0.34019832189168575, "grad_norm": 0.6993310808240697, "learning_rate": 4.9264705882352944e-05, "loss": 0.4697, "step": 223 }, { "epoch": 0.34172387490465295, "grad_norm": 0.48830000682690955, "learning_rate": 4.923642533936652e-05, "loss": 0.4379, "step": 224 }, { "epoch": 0.34324942791762014, "grad_norm": 0.48018531779541546, "learning_rate": 4.9208144796380093e-05, "loss": 0.4543, "step": 225 }, { "epoch": 0.34477498093058734, "grad_norm": 0.6636436659696373, "learning_rate": 4.917986425339366e-05, "loss": 0.469, "step": 226 }, { "epoch": 0.34630053394355453, "grad_norm": 0.38057238528993864, "learning_rate": 4.9151583710407237e-05, "loss": 0.4603, "step": 227 }, { "epoch": 0.34782608695652173, "grad_norm": 0.4668707769354748, "learning_rate": 4.912330316742081e-05, "loss": 0.4672, "step": 228 }, { "epoch": 0.3493516399694889, "grad_norm": 0.5392289506229951, "learning_rate": 4.9095022624434386e-05, "loss": 0.475, "step": 229 }, { "epoch": 0.3508771929824561, "grad_norm": 0.42052166965841536, "learning_rate": 4.906674208144796e-05, "loss": 0.4789, "step": 230 }, { "epoch": 0.3524027459954233, "grad_norm": 0.47141161249782315, "learning_rate": 4.9038461538461536e-05, "loss": 0.4689, "step": 231 }, { "epoch": 0.35392829900839057, "grad_norm": 0.5970456705204581, "learning_rate": 4.901018099547511e-05, "loss": 0.4696, "step": 232 }, { "epoch": 0.35545385202135776, "grad_norm": 0.37037016416712376, "learning_rate": 4.8981900452488686e-05, "loss": 0.4561, "step": 233 }, { "epoch": 0.35697940503432496, "grad_norm": 0.5314255286256848, "learning_rate": 4.895361990950226e-05, "loss": 0.463, "step": 234 }, { "epoch": 0.35850495804729215, "grad_norm": 0.4290981397873774, "learning_rate": 4.8925339366515836e-05, "loss": 0.4788, "step": 235 }, { "epoch": 0.36003051106025935, "grad_norm": 0.3887824386452619, "learning_rate": 4.889705882352941e-05, "loss": 0.4607, "step": 236 }, { "epoch": 0.36155606407322655, "grad_norm": 0.5641989416146719, "learning_rate": 4.8868778280542986e-05, "loss": 0.4484, "step": 237 }, { "epoch": 0.36308161708619374, "grad_norm": 0.4430449636319575, "learning_rate": 4.884049773755656e-05, "loss": 0.4691, "step": 238 }, { "epoch": 0.36460717009916094, "grad_norm": 0.45903933214920994, "learning_rate": 4.8812217194570136e-05, "loss": 0.4716, "step": 239 }, { "epoch": 0.36613272311212813, "grad_norm": 0.5708405508871432, "learning_rate": 4.878393665158371e-05, "loss": 0.4773, "step": 240 }, { "epoch": 0.36765827612509533, "grad_norm": 0.34952852246125676, "learning_rate": 4.8755656108597285e-05, "loss": 0.4694, "step": 241 }, { "epoch": 0.3691838291380625, "grad_norm": 0.5303600635373378, "learning_rate": 4.872737556561086e-05, "loss": 0.4553, "step": 242 }, { "epoch": 0.3707093821510298, "grad_norm": 0.4589455368395553, "learning_rate": 4.8699095022624435e-05, "loss": 0.4383, "step": 243 }, { "epoch": 0.37223493516399697, "grad_norm": 0.5229900132897564, "learning_rate": 4.867081447963801e-05, "loss": 0.4823, "step": 244 }, { "epoch": 0.37376048817696417, "grad_norm": 0.573062510585785, "learning_rate": 4.8642533936651585e-05, "loss": 0.4671, "step": 245 }, { "epoch": 0.37528604118993136, "grad_norm": 0.4724878456316856, "learning_rate": 4.861425339366516e-05, "loss": 0.49, "step": 246 }, { "epoch": 0.37681159420289856, "grad_norm": 0.612363787799769, "learning_rate": 4.8585972850678735e-05, "loss": 0.4692, "step": 247 }, { "epoch": 0.37833714721586575, "grad_norm": 0.40127773639659037, "learning_rate": 4.855769230769231e-05, "loss": 0.4694, "step": 248 }, { "epoch": 0.37986270022883295, "grad_norm": 0.4522230730583521, "learning_rate": 4.8529411764705885e-05, "loss": 0.4735, "step": 249 }, { "epoch": 0.38138825324180015, "grad_norm": 0.39180505831026313, "learning_rate": 4.850113122171946e-05, "loss": 0.454, "step": 250 }, { "epoch": 0.38291380625476734, "grad_norm": 0.49040625372710184, "learning_rate": 4.8472850678733035e-05, "loss": 0.4628, "step": 251 }, { "epoch": 0.38443935926773454, "grad_norm": 0.45764589901740815, "learning_rate": 4.844457013574661e-05, "loss": 0.4401, "step": 252 }, { "epoch": 0.38596491228070173, "grad_norm": 0.5400997487814866, "learning_rate": 4.8416289592760185e-05, "loss": 0.4584, "step": 253 }, { "epoch": 0.38749046529366893, "grad_norm": 0.5574166645978192, "learning_rate": 4.838800904977376e-05, "loss": 0.4737, "step": 254 }, { "epoch": 0.3890160183066362, "grad_norm": 0.4426528437890868, "learning_rate": 4.8359728506787334e-05, "loss": 0.4671, "step": 255 }, { "epoch": 0.3905415713196034, "grad_norm": 0.5226926307746833, "learning_rate": 4.833144796380091e-05, "loss": 0.4811, "step": 256 }, { "epoch": 0.39206712433257057, "grad_norm": 0.3984257278231069, "learning_rate": 4.8303167420814484e-05, "loss": 0.4601, "step": 257 }, { "epoch": 0.39359267734553777, "grad_norm": 0.46381641467015, "learning_rate": 4.827488687782806e-05, "loss": 0.4644, "step": 258 }, { "epoch": 0.39511823035850496, "grad_norm": 0.46631307350552836, "learning_rate": 4.8246606334841634e-05, "loss": 0.4481, "step": 259 }, { "epoch": 0.39664378337147216, "grad_norm": 0.43044617741441904, "learning_rate": 4.821832579185521e-05, "loss": 0.4698, "step": 260 }, { "epoch": 0.39816933638443935, "grad_norm": 0.3764516882099913, "learning_rate": 4.8190045248868784e-05, "loss": 0.4379, "step": 261 }, { "epoch": 0.39969488939740655, "grad_norm": 0.4622229825454947, "learning_rate": 4.816176470588236e-05, "loss": 0.4762, "step": 262 }, { "epoch": 0.40122044241037375, "grad_norm": 0.5188195015700752, "learning_rate": 4.8133484162895934e-05, "loss": 0.4838, "step": 263 }, { "epoch": 0.40274599542334094, "grad_norm": 0.3710555656609614, "learning_rate": 4.810520361990951e-05, "loss": 0.4792, "step": 264 }, { "epoch": 0.40427154843630814, "grad_norm": 0.5803492910944004, "learning_rate": 4.8076923076923084e-05, "loss": 0.4886, "step": 265 }, { "epoch": 0.4057971014492754, "grad_norm": 0.4137424118355299, "learning_rate": 4.804864253393666e-05, "loss": 0.4493, "step": 266 }, { "epoch": 0.4073226544622426, "grad_norm": 0.49202554291510203, "learning_rate": 4.802036199095023e-05, "loss": 0.4617, "step": 267 }, { "epoch": 0.4088482074752098, "grad_norm": 0.44238954800868036, "learning_rate": 4.79920814479638e-05, "loss": 0.4584, "step": 268 }, { "epoch": 0.410373760488177, "grad_norm": 0.3957991921999439, "learning_rate": 4.7963800904977377e-05, "loss": 0.4268, "step": 269 }, { "epoch": 0.41189931350114417, "grad_norm": 0.5052515169115994, "learning_rate": 4.793552036199095e-05, "loss": 0.4627, "step": 270 }, { "epoch": 0.41342486651411137, "grad_norm": 0.4259854143111027, "learning_rate": 4.7907239819004526e-05, "loss": 0.478, "step": 271 }, { "epoch": 0.41495041952707856, "grad_norm": 0.5312126575837334, "learning_rate": 4.78789592760181e-05, "loss": 0.474, "step": 272 }, { "epoch": 0.41647597254004576, "grad_norm": 0.4388407804050586, "learning_rate": 4.7850678733031676e-05, "loss": 0.4454, "step": 273 }, { "epoch": 0.41800152555301295, "grad_norm": 0.4777489924417406, "learning_rate": 4.782239819004525e-05, "loss": 0.4601, "step": 274 }, { "epoch": 0.41952707856598015, "grad_norm": 0.5382399680071529, "learning_rate": 4.7794117647058826e-05, "loss": 0.4497, "step": 275 }, { "epoch": 0.42105263157894735, "grad_norm": 0.45447188618950124, "learning_rate": 4.77658371040724e-05, "loss": 0.4599, "step": 276 }, { "epoch": 0.4225781845919146, "grad_norm": 0.39885476632615113, "learning_rate": 4.7737556561085976e-05, "loss": 0.4487, "step": 277 }, { "epoch": 0.4241037376048818, "grad_norm": 0.6027227469402054, "learning_rate": 4.770927601809955e-05, "loss": 0.5001, "step": 278 }, { "epoch": 0.425629290617849, "grad_norm": 0.39139220478922765, "learning_rate": 4.7680995475113126e-05, "loss": 0.467, "step": 279 }, { "epoch": 0.4271548436308162, "grad_norm": 0.5390364787754144, "learning_rate": 4.7652714932126694e-05, "loss": 0.4813, "step": 280 }, { "epoch": 0.4286803966437834, "grad_norm": 0.5030366757531028, "learning_rate": 4.762443438914027e-05, "loss": 0.4303, "step": 281 }, { "epoch": 0.4302059496567506, "grad_norm": 0.3753061577694602, "learning_rate": 4.7596153846153844e-05, "loss": 0.4609, "step": 282 }, { "epoch": 0.43173150266971777, "grad_norm": 0.46558604241659296, "learning_rate": 4.756787330316742e-05, "loss": 0.4589, "step": 283 }, { "epoch": 0.43325705568268497, "grad_norm": 0.4479023963548496, "learning_rate": 4.7539592760180994e-05, "loss": 0.4658, "step": 284 }, { "epoch": 0.43478260869565216, "grad_norm": 0.43578672527822704, "learning_rate": 4.751131221719457e-05, "loss": 0.4414, "step": 285 }, { "epoch": 0.43630816170861936, "grad_norm": 0.5062944004433069, "learning_rate": 4.7483031674208144e-05, "loss": 0.4562, "step": 286 }, { "epoch": 0.43783371472158655, "grad_norm": 0.38127652104526, "learning_rate": 4.745475113122172e-05, "loss": 0.4773, "step": 287 }, { "epoch": 0.43935926773455375, "grad_norm": 0.5406646631436469, "learning_rate": 4.742647058823529e-05, "loss": 0.4517, "step": 288 }, { "epoch": 0.440884820747521, "grad_norm": 0.40180264476706384, "learning_rate": 4.739819004524887e-05, "loss": 0.4593, "step": 289 }, { "epoch": 0.4424103737604882, "grad_norm": 0.5405922291658195, "learning_rate": 4.736990950226244e-05, "loss": 0.485, "step": 290 }, { "epoch": 0.4439359267734554, "grad_norm": 0.44017064515275356, "learning_rate": 4.734162895927602e-05, "loss": 0.4707, "step": 291 }, { "epoch": 0.4454614797864226, "grad_norm": 0.4569333152729843, "learning_rate": 4.731334841628959e-05, "loss": 0.4736, "step": 292 }, { "epoch": 0.4469870327993898, "grad_norm": 0.36379872790121087, "learning_rate": 4.728506787330317e-05, "loss": 0.456, "step": 293 }, { "epoch": 0.448512585812357, "grad_norm": 0.46898492997877445, "learning_rate": 4.725678733031674e-05, "loss": 0.4587, "step": 294 }, { "epoch": 0.4500381388253242, "grad_norm": 0.36812568539890567, "learning_rate": 4.722850678733032e-05, "loss": 0.4299, "step": 295 }, { "epoch": 0.45156369183829137, "grad_norm": 0.48451675786412296, "learning_rate": 4.720022624434389e-05, "loss": 0.4533, "step": 296 }, { "epoch": 0.45308924485125857, "grad_norm": 0.4153380771526916, "learning_rate": 4.717194570135747e-05, "loss": 0.4435, "step": 297 }, { "epoch": 0.45461479786422576, "grad_norm": 0.4643780824657794, "learning_rate": 4.714366515837104e-05, "loss": 0.4608, "step": 298 }, { "epoch": 0.45614035087719296, "grad_norm": 0.5783424978567152, "learning_rate": 4.711538461538462e-05, "loss": 0.4851, "step": 299 }, { "epoch": 0.4576659038901602, "grad_norm": 0.4450266615766501, "learning_rate": 4.708710407239819e-05, "loss": 0.4479, "step": 300 }, { "epoch": 0.4591914569031274, "grad_norm": 0.4108395824940765, "learning_rate": 4.705882352941177e-05, "loss": 0.4222, "step": 301 }, { "epoch": 0.4607170099160946, "grad_norm": 0.3963945382194638, "learning_rate": 4.703054298642534e-05, "loss": 0.4577, "step": 302 }, { "epoch": 0.4622425629290618, "grad_norm": 0.4145860861303533, "learning_rate": 4.700226244343892e-05, "loss": 0.4746, "step": 303 }, { "epoch": 0.463768115942029, "grad_norm": 0.5084087326157789, "learning_rate": 4.697398190045249e-05, "loss": 0.4934, "step": 304 }, { "epoch": 0.4652936689549962, "grad_norm": 0.408963604047535, "learning_rate": 4.694570135746607e-05, "loss": 0.413, "step": 305 }, { "epoch": 0.4668192219679634, "grad_norm": 0.42088164908317816, "learning_rate": 4.691742081447964e-05, "loss": 0.4548, "step": 306 }, { "epoch": 0.4683447749809306, "grad_norm": 0.3949943390119748, "learning_rate": 4.688914027149322e-05, "loss": 0.4307, "step": 307 }, { "epoch": 0.4698703279938978, "grad_norm": 0.4160357275133474, "learning_rate": 4.686085972850679e-05, "loss": 0.473, "step": 308 }, { "epoch": 0.47139588100686497, "grad_norm": 0.44133710804642845, "learning_rate": 4.683257918552037e-05, "loss": 0.4665, "step": 309 }, { "epoch": 0.47292143401983217, "grad_norm": 0.514247945375714, "learning_rate": 4.680429864253394e-05, "loss": 0.4571, "step": 310 }, { "epoch": 0.4744469870327994, "grad_norm": 0.3721245782115898, "learning_rate": 4.6776018099547517e-05, "loss": 0.4447, "step": 311 }, { "epoch": 0.4759725400457666, "grad_norm": 0.5516103085652514, "learning_rate": 4.674773755656109e-05, "loss": 0.4571, "step": 312 }, { "epoch": 0.4774980930587338, "grad_norm": 0.3755855929718043, "learning_rate": 4.6719457013574666e-05, "loss": 0.4638, "step": 313 }, { "epoch": 0.479023646071701, "grad_norm": 0.5418692357964363, "learning_rate": 4.669117647058824e-05, "loss": 0.4727, "step": 314 }, { "epoch": 0.4805491990846682, "grad_norm": 0.4737887282902813, "learning_rate": 4.6662895927601816e-05, "loss": 0.4613, "step": 315 }, { "epoch": 0.4820747520976354, "grad_norm": 0.43402162538984035, "learning_rate": 4.6634615384615384e-05, "loss": 0.4586, "step": 316 }, { "epoch": 0.4836003051106026, "grad_norm": 0.52292535058243, "learning_rate": 4.660633484162896e-05, "loss": 0.4266, "step": 317 }, { "epoch": 0.4851258581235698, "grad_norm": 0.4104427371190281, "learning_rate": 4.6578054298642534e-05, "loss": 0.4904, "step": 318 }, { "epoch": 0.486651411136537, "grad_norm": 0.5373092053187365, "learning_rate": 4.654977375565611e-05, "loss": 0.46, "step": 319 }, { "epoch": 0.4881769641495042, "grad_norm": 0.4221583725515177, "learning_rate": 4.6521493212669684e-05, "loss": 0.4716, "step": 320 }, { "epoch": 0.4897025171624714, "grad_norm": 0.5525195319324893, "learning_rate": 4.649321266968326e-05, "loss": 0.4748, "step": 321 }, { "epoch": 0.49122807017543857, "grad_norm": 0.4223207392743745, "learning_rate": 4.6464932126696834e-05, "loss": 0.4405, "step": 322 }, { "epoch": 0.4927536231884058, "grad_norm": 0.4254187550057278, "learning_rate": 4.643665158371041e-05, "loss": 0.4543, "step": 323 }, { "epoch": 0.494279176201373, "grad_norm": 0.4486672802800912, "learning_rate": 4.6408371040723984e-05, "loss": 0.4454, "step": 324 }, { "epoch": 0.4958047292143402, "grad_norm": 0.4013849604330731, "learning_rate": 4.638009049773756e-05, "loss": 0.4595, "step": 325 }, { "epoch": 0.4973302822273074, "grad_norm": 0.4313598936626369, "learning_rate": 4.6351809954751134e-05, "loss": 0.4503, "step": 326 }, { "epoch": 0.4988558352402746, "grad_norm": 0.4084446293141037, "learning_rate": 4.632352941176471e-05, "loss": 0.448, "step": 327 }, { "epoch": 0.5003813882532419, "grad_norm": 0.437417146170015, "learning_rate": 4.6295248868778284e-05, "loss": 0.4674, "step": 328 }, { "epoch": 0.501906941266209, "grad_norm": 0.5076482751322222, "learning_rate": 4.626696832579186e-05, "loss": 0.4313, "step": 329 }, { "epoch": 0.5034324942791762, "grad_norm": 0.42025341760486523, "learning_rate": 4.623868778280543e-05, "loss": 0.4533, "step": 330 }, { "epoch": 0.5049580472921434, "grad_norm": 0.48174843325997196, "learning_rate": 4.621040723981901e-05, "loss": 0.4752, "step": 331 }, { "epoch": 0.5064836003051106, "grad_norm": 0.4207992363558363, "learning_rate": 4.618212669683258e-05, "loss": 0.4438, "step": 332 }, { "epoch": 0.5080091533180778, "grad_norm": 0.42997297643346377, "learning_rate": 4.615384615384616e-05, "loss": 0.4391, "step": 333 }, { "epoch": 0.509534706331045, "grad_norm": 0.46786714681220665, "learning_rate": 4.612556561085973e-05, "loss": 0.4488, "step": 334 }, { "epoch": 0.5110602593440122, "grad_norm": 0.3569989969746086, "learning_rate": 4.609728506787331e-05, "loss": 0.4586, "step": 335 }, { "epoch": 0.5125858123569794, "grad_norm": 0.4702651952137365, "learning_rate": 4.6069004524886876e-05, "loss": 0.4562, "step": 336 }, { "epoch": 0.5141113653699466, "grad_norm": 0.41939865362528245, "learning_rate": 4.604072398190045e-05, "loss": 0.4589, "step": 337 }, { "epoch": 0.5156369183829138, "grad_norm": 0.3608796758838473, "learning_rate": 4.6012443438914026e-05, "loss": 0.4549, "step": 338 }, { "epoch": 0.517162471395881, "grad_norm": 0.37003271075323124, "learning_rate": 4.59841628959276e-05, "loss": 0.4391, "step": 339 }, { "epoch": 0.5186880244088482, "grad_norm": 0.40773310627517967, "learning_rate": 4.5955882352941176e-05, "loss": 0.4234, "step": 340 }, { "epoch": 0.5202135774218154, "grad_norm": 0.4017377848071537, "learning_rate": 4.592760180995475e-05, "loss": 0.4686, "step": 341 }, { "epoch": 0.5217391304347826, "grad_norm": 0.3551772805295089, "learning_rate": 4.5899321266968326e-05, "loss": 0.4347, "step": 342 }, { "epoch": 0.5232646834477498, "grad_norm": 0.30671458085301156, "learning_rate": 4.58710407239819e-05, "loss": 0.4408, "step": 343 }, { "epoch": 0.524790236460717, "grad_norm": 0.351252738580502, "learning_rate": 4.5842760180995476e-05, "loss": 0.4318, "step": 344 }, { "epoch": 0.5263157894736842, "grad_norm": 0.36041326850955696, "learning_rate": 4.581447963800905e-05, "loss": 0.4552, "step": 345 }, { "epoch": 0.5278413424866514, "grad_norm": 0.3892227954291902, "learning_rate": 4.5786199095022625e-05, "loss": 0.4351, "step": 346 }, { "epoch": 0.5293668954996186, "grad_norm": 0.4165864141016753, "learning_rate": 4.57579185520362e-05, "loss": 0.478, "step": 347 }, { "epoch": 0.5308924485125858, "grad_norm": 0.3724361352453816, "learning_rate": 4.5729638009049775e-05, "loss": 0.4655, "step": 348 }, { "epoch": 0.532418001525553, "grad_norm": 0.43983944854017054, "learning_rate": 4.570135746606335e-05, "loss": 0.4386, "step": 349 }, { "epoch": 0.5339435545385202, "grad_norm": 0.39133002920394927, "learning_rate": 4.5673076923076925e-05, "loss": 0.4466, "step": 350 }, { "epoch": 0.5354691075514875, "grad_norm": 0.4253021140149184, "learning_rate": 4.56447963800905e-05, "loss": 0.4532, "step": 351 }, { "epoch": 0.5369946605644547, "grad_norm": 0.4749741254305587, "learning_rate": 4.5616515837104075e-05, "loss": 0.4547, "step": 352 }, { "epoch": 0.5385202135774219, "grad_norm": 0.4332394870804955, "learning_rate": 4.558823529411765e-05, "loss": 0.4293, "step": 353 }, { "epoch": 0.540045766590389, "grad_norm": 0.4978766447980935, "learning_rate": 4.5559954751131225e-05, "loss": 0.4654, "step": 354 }, { "epoch": 0.5415713196033562, "grad_norm": 0.40753989031643445, "learning_rate": 4.55316742081448e-05, "loss": 0.4516, "step": 355 }, { "epoch": 0.5430968726163234, "grad_norm": 0.508818231927752, "learning_rate": 4.5503393665158375e-05, "loss": 0.4345, "step": 356 }, { "epoch": 0.5446224256292906, "grad_norm": 0.34556609934531163, "learning_rate": 4.547511312217195e-05, "loss": 0.4618, "step": 357 }, { "epoch": 0.5461479786422578, "grad_norm": 0.4741067287348198, "learning_rate": 4.5446832579185524e-05, "loss": 0.4468, "step": 358 }, { "epoch": 0.547673531655225, "grad_norm": 0.3933417795108134, "learning_rate": 4.54185520361991e-05, "loss": 0.4599, "step": 359 }, { "epoch": 0.5491990846681922, "grad_norm": 0.36734422664651617, "learning_rate": 4.5390271493212674e-05, "loss": 0.4464, "step": 360 }, { "epoch": 0.5507246376811594, "grad_norm": 0.40738211382014083, "learning_rate": 4.536199095022625e-05, "loss": 0.4638, "step": 361 }, { "epoch": 0.5522501906941266, "grad_norm": 0.5039240904657424, "learning_rate": 4.5333710407239824e-05, "loss": 0.4739, "step": 362 }, { "epoch": 0.5537757437070938, "grad_norm": 0.38530984712855215, "learning_rate": 4.53054298642534e-05, "loss": 0.445, "step": 363 }, { "epoch": 0.555301296720061, "grad_norm": 0.4285572951822777, "learning_rate": 4.5277149321266974e-05, "loss": 0.4331, "step": 364 }, { "epoch": 0.5568268497330282, "grad_norm": 0.4479239776834813, "learning_rate": 4.524886877828054e-05, "loss": 0.4401, "step": 365 }, { "epoch": 0.5583524027459954, "grad_norm": 0.47877508671413205, "learning_rate": 4.522058823529412e-05, "loss": 0.4501, "step": 366 }, { "epoch": 0.5598779557589626, "grad_norm": 0.4387234941868231, "learning_rate": 4.519230769230769e-05, "loss": 0.4528, "step": 367 }, { "epoch": 0.5614035087719298, "grad_norm": 0.4881510268326386, "learning_rate": 4.516402714932127e-05, "loss": 0.4337, "step": 368 }, { "epoch": 0.562929061784897, "grad_norm": 0.4121845631800157, "learning_rate": 4.513574660633484e-05, "loss": 0.4342, "step": 369 }, { "epoch": 0.5644546147978642, "grad_norm": 0.4608500937615472, "learning_rate": 4.510746606334842e-05, "loss": 0.4546, "step": 370 }, { "epoch": 0.5659801678108314, "grad_norm": 0.40396651123895094, "learning_rate": 4.507918552036199e-05, "loss": 0.4612, "step": 371 }, { "epoch": 0.5675057208237986, "grad_norm": 0.3971917042423019, "learning_rate": 4.505090497737557e-05, "loss": 0.4445, "step": 372 }, { "epoch": 0.5690312738367659, "grad_norm": 0.5304750539121851, "learning_rate": 4.502262443438914e-05, "loss": 0.4513, "step": 373 }, { "epoch": 0.5705568268497331, "grad_norm": 0.39851964969480136, "learning_rate": 4.4994343891402716e-05, "loss": 0.4315, "step": 374 }, { "epoch": 0.5720823798627003, "grad_norm": 0.3869803096137064, "learning_rate": 4.496606334841629e-05, "loss": 0.4287, "step": 375 }, { "epoch": 0.5736079328756675, "grad_norm": 0.49339457919017954, "learning_rate": 4.4937782805429866e-05, "loss": 0.4332, "step": 376 }, { "epoch": 0.5751334858886347, "grad_norm": 0.39651697488067017, "learning_rate": 4.490950226244344e-05, "loss": 0.4637, "step": 377 }, { "epoch": 0.5766590389016019, "grad_norm": 0.4528791780276198, "learning_rate": 4.4881221719457016e-05, "loss": 0.4355, "step": 378 }, { "epoch": 0.5781845919145691, "grad_norm": 0.3907554175580125, "learning_rate": 4.485294117647059e-05, "loss": 0.4473, "step": 379 }, { "epoch": 0.5797101449275363, "grad_norm": 0.46628684662952646, "learning_rate": 4.4824660633484166e-05, "loss": 0.4559, "step": 380 }, { "epoch": 0.5812356979405034, "grad_norm": 0.3886374150606958, "learning_rate": 4.479638009049774e-05, "loss": 0.4265, "step": 381 }, { "epoch": 0.5827612509534706, "grad_norm": 0.3915722595472576, "learning_rate": 4.4768099547511316e-05, "loss": 0.4516, "step": 382 }, { "epoch": 0.5842868039664378, "grad_norm": 0.4886911549246048, "learning_rate": 4.473981900452489e-05, "loss": 0.4652, "step": 383 }, { "epoch": 0.585812356979405, "grad_norm": 0.3740510866570307, "learning_rate": 4.4711538461538466e-05, "loss": 0.4715, "step": 384 }, { "epoch": 0.5873379099923722, "grad_norm": 0.6267512732423065, "learning_rate": 4.468325791855204e-05, "loss": 0.4362, "step": 385 }, { "epoch": 0.5888634630053394, "grad_norm": 0.4398614275180945, "learning_rate": 4.4654977375565616e-05, "loss": 0.4304, "step": 386 }, { "epoch": 0.5903890160183066, "grad_norm": 0.49954532745863417, "learning_rate": 4.462669683257919e-05, "loss": 0.4728, "step": 387 }, { "epoch": 0.5919145690312738, "grad_norm": 0.4018086233528892, "learning_rate": 4.4598416289592765e-05, "loss": 0.4525, "step": 388 }, { "epoch": 0.593440122044241, "grad_norm": 0.3888987046714818, "learning_rate": 4.457013574660634e-05, "loss": 0.432, "step": 389 }, { "epoch": 0.5949656750572082, "grad_norm": 0.3277544666565367, "learning_rate": 4.4541855203619915e-05, "loss": 0.4349, "step": 390 }, { "epoch": 0.5964912280701754, "grad_norm": 0.4199617428609042, "learning_rate": 4.4513574660633483e-05, "loss": 0.4363, "step": 391 }, { "epoch": 0.5980167810831426, "grad_norm": 0.3840487744212482, "learning_rate": 4.448529411764706e-05, "loss": 0.4357, "step": 392 }, { "epoch": 0.5995423340961098, "grad_norm": 0.3715895258712878, "learning_rate": 4.445701357466063e-05, "loss": 0.4448, "step": 393 }, { "epoch": 0.601067887109077, "grad_norm": 0.3629274666303493, "learning_rate": 4.442873303167421e-05, "loss": 0.4345, "step": 394 }, { "epoch": 0.6025934401220442, "grad_norm": 0.4139472132630224, "learning_rate": 4.440045248868778e-05, "loss": 0.4355, "step": 395 }, { "epoch": 0.6041189931350115, "grad_norm": 0.31946882882677174, "learning_rate": 4.437217194570136e-05, "loss": 0.4481, "step": 396 }, { "epoch": 0.6056445461479787, "grad_norm": 0.3475146758487184, "learning_rate": 4.434389140271493e-05, "loss": 0.4432, "step": 397 }, { "epoch": 0.6071700991609459, "grad_norm": 0.36474507341371326, "learning_rate": 4.431561085972851e-05, "loss": 0.4429, "step": 398 }, { "epoch": 0.6086956521739131, "grad_norm": 0.3494767192767803, "learning_rate": 4.428733031674208e-05, "loss": 0.4431, "step": 399 }, { "epoch": 0.6102212051868803, "grad_norm": 0.3740072831069121, "learning_rate": 4.425904977375566e-05, "loss": 0.4229, "step": 400 }, { "epoch": 0.6117467581998475, "grad_norm": 0.3260291200457102, "learning_rate": 4.423076923076923e-05, "loss": 0.4451, "step": 401 }, { "epoch": 0.6132723112128147, "grad_norm": 0.44451385028694124, "learning_rate": 4.420248868778281e-05, "loss": 0.4431, "step": 402 }, { "epoch": 0.6147978642257819, "grad_norm": 0.35955221814113475, "learning_rate": 4.417420814479638e-05, "loss": 0.4256, "step": 403 }, { "epoch": 0.6163234172387491, "grad_norm": 0.3570169264624412, "learning_rate": 4.414592760180996e-05, "loss": 0.4332, "step": 404 }, { "epoch": 0.6178489702517163, "grad_norm": 0.3746828703092404, "learning_rate": 4.411764705882353e-05, "loss": 0.4631, "step": 405 }, { "epoch": 0.6193745232646835, "grad_norm": 0.38690933058119664, "learning_rate": 4.408936651583711e-05, "loss": 0.4798, "step": 406 }, { "epoch": 0.6209000762776506, "grad_norm": 0.43996651577396434, "learning_rate": 4.406108597285068e-05, "loss": 0.4621, "step": 407 }, { "epoch": 0.6224256292906178, "grad_norm": 0.4622430850965734, "learning_rate": 4.403280542986426e-05, "loss": 0.4533, "step": 408 }, { "epoch": 0.623951182303585, "grad_norm": 0.39216822212828395, "learning_rate": 4.400452488687783e-05, "loss": 0.4424, "step": 409 }, { "epoch": 0.6254767353165522, "grad_norm": 0.3500335269359861, "learning_rate": 4.397624434389141e-05, "loss": 0.4302, "step": 410 }, { "epoch": 0.6270022883295194, "grad_norm": 0.40727390861961066, "learning_rate": 4.394796380090498e-05, "loss": 0.4386, "step": 411 }, { "epoch": 0.6285278413424866, "grad_norm": 0.39785426469904844, "learning_rate": 4.391968325791856e-05, "loss": 0.4289, "step": 412 }, { "epoch": 0.6300533943554538, "grad_norm": 0.3719090259908795, "learning_rate": 4.3891402714932125e-05, "loss": 0.4228, "step": 413 }, { "epoch": 0.631578947368421, "grad_norm": 0.4299792936992418, "learning_rate": 4.38631221719457e-05, "loss": 0.4581, "step": 414 }, { "epoch": 0.6331045003813882, "grad_norm": 0.4001344390596111, "learning_rate": 4.3834841628959275e-05, "loss": 0.4529, "step": 415 }, { "epoch": 0.6346300533943554, "grad_norm": 0.3726114837290671, "learning_rate": 4.380656108597285e-05, "loss": 0.4187, "step": 416 }, { "epoch": 0.6361556064073226, "grad_norm": 0.38570425645860257, "learning_rate": 4.3778280542986425e-05, "loss": 0.4649, "step": 417 }, { "epoch": 0.6376811594202898, "grad_norm": 0.37979077033508857, "learning_rate": 4.375e-05, "loss": 0.4629, "step": 418 }, { "epoch": 0.6392067124332571, "grad_norm": 0.5013342563326993, "learning_rate": 4.3721719457013574e-05, "loss": 0.4417, "step": 419 }, { "epoch": 0.6407322654462243, "grad_norm": 0.3700722544592611, "learning_rate": 4.369343891402715e-05, "loss": 0.4329, "step": 420 }, { "epoch": 0.6422578184591915, "grad_norm": 0.5507744592110941, "learning_rate": 4.3665158371040724e-05, "loss": 0.4255, "step": 421 }, { "epoch": 0.6437833714721587, "grad_norm": 0.36192157077352644, "learning_rate": 4.36368778280543e-05, "loss": 0.4169, "step": 422 }, { "epoch": 0.6453089244851259, "grad_norm": 0.45180649263539396, "learning_rate": 4.3608597285067874e-05, "loss": 0.4226, "step": 423 }, { "epoch": 0.6468344774980931, "grad_norm": 0.44946004314109494, "learning_rate": 4.358031674208145e-05, "loss": 0.4413, "step": 424 }, { "epoch": 0.6483600305110603, "grad_norm": 0.48391861289697913, "learning_rate": 4.3552036199095024e-05, "loss": 0.4226, "step": 425 }, { "epoch": 0.6498855835240275, "grad_norm": 0.49752850415172106, "learning_rate": 4.35237556561086e-05, "loss": 0.4436, "step": 426 }, { "epoch": 0.6514111365369947, "grad_norm": 0.5057465597931835, "learning_rate": 4.3495475113122174e-05, "loss": 0.4373, "step": 427 }, { "epoch": 0.6529366895499619, "grad_norm": 0.5009939967966213, "learning_rate": 4.346719457013575e-05, "loss": 0.4692, "step": 428 }, { "epoch": 0.6544622425629291, "grad_norm": 0.4324803717388467, "learning_rate": 4.3438914027149324e-05, "loss": 0.4413, "step": 429 }, { "epoch": 0.6559877955758963, "grad_norm": 0.42344675741206317, "learning_rate": 4.34106334841629e-05, "loss": 0.4322, "step": 430 }, { "epoch": 0.6575133485888635, "grad_norm": 0.43426221099686474, "learning_rate": 4.3382352941176474e-05, "loss": 0.4598, "step": 431 }, { "epoch": 0.6590389016018307, "grad_norm": 0.4433636039987283, "learning_rate": 4.335407239819005e-05, "loss": 0.4392, "step": 432 }, { "epoch": 0.6605644546147978, "grad_norm": 0.43896261089690186, "learning_rate": 4.3325791855203623e-05, "loss": 0.4612, "step": 433 }, { "epoch": 0.662090007627765, "grad_norm": 0.4539414479329059, "learning_rate": 4.32975113122172e-05, "loss": 0.4397, "step": 434 }, { "epoch": 0.6636155606407322, "grad_norm": 0.4515301449852285, "learning_rate": 4.326923076923077e-05, "loss": 0.4494, "step": 435 }, { "epoch": 0.6651411136536994, "grad_norm": 0.41705878095032783, "learning_rate": 4.324095022624435e-05, "loss": 0.4358, "step": 436 }, { "epoch": 0.6666666666666666, "grad_norm": 0.41848113451183794, "learning_rate": 4.321266968325792e-05, "loss": 0.4256, "step": 437 }, { "epoch": 0.6681922196796338, "grad_norm": 0.4351679087963125, "learning_rate": 4.31843891402715e-05, "loss": 0.4377, "step": 438 }, { "epoch": 0.669717772692601, "grad_norm": 0.317019730512141, "learning_rate": 4.315610859728507e-05, "loss": 0.4513, "step": 439 }, { "epoch": 0.6712433257055682, "grad_norm": 0.4782465227886277, "learning_rate": 4.312782805429865e-05, "loss": 0.4358, "step": 440 }, { "epoch": 0.6727688787185355, "grad_norm": 0.38101501366479645, "learning_rate": 4.309954751131222e-05, "loss": 0.4339, "step": 441 }, { "epoch": 0.6742944317315027, "grad_norm": 0.4553183961402146, "learning_rate": 4.30712669683258e-05, "loss": 0.4464, "step": 442 }, { "epoch": 0.6758199847444699, "grad_norm": 0.4712316243764032, "learning_rate": 4.304298642533937e-05, "loss": 0.4252, "step": 443 }, { "epoch": 0.6773455377574371, "grad_norm": 0.42700764736440866, "learning_rate": 4.301470588235295e-05, "loss": 0.4544, "step": 444 }, { "epoch": 0.6788710907704043, "grad_norm": 0.5031366259445439, "learning_rate": 4.298642533936652e-05, "loss": 0.4496, "step": 445 }, { "epoch": 0.6803966437833715, "grad_norm": 0.3587930196357391, "learning_rate": 4.295814479638009e-05, "loss": 0.4495, "step": 446 }, { "epoch": 0.6819221967963387, "grad_norm": 0.450787277890409, "learning_rate": 4.2929864253393666e-05, "loss": 0.4469, "step": 447 }, { "epoch": 0.6834477498093059, "grad_norm": 0.538426386356793, "learning_rate": 4.290158371040724e-05, "loss": 0.4632, "step": 448 }, { "epoch": 0.6849733028222731, "grad_norm": 0.30861542975618167, "learning_rate": 4.2873303167420815e-05, "loss": 0.4299, "step": 449 }, { "epoch": 0.6864988558352403, "grad_norm": 0.4918295370160658, "learning_rate": 4.284502262443439e-05, "loss": 0.4293, "step": 450 }, { "epoch": 0.6880244088482075, "grad_norm": 0.32260277442259266, "learning_rate": 4.2816742081447965e-05, "loss": 0.4337, "step": 451 }, { "epoch": 0.6895499618611747, "grad_norm": 0.47929932908100925, "learning_rate": 4.278846153846154e-05, "loss": 0.4397, "step": 452 }, { "epoch": 0.6910755148741419, "grad_norm": 0.32693475308094044, "learning_rate": 4.2760180995475115e-05, "loss": 0.424, "step": 453 }, { "epoch": 0.6926010678871091, "grad_norm": 0.41362554352450925, "learning_rate": 4.273190045248869e-05, "loss": 0.419, "step": 454 }, { "epoch": 0.6941266209000763, "grad_norm": 0.33441120158453636, "learning_rate": 4.2703619909502265e-05, "loss": 0.4286, "step": 455 }, { "epoch": 0.6956521739130435, "grad_norm": 0.4424269061343038, "learning_rate": 4.267533936651584e-05, "loss": 0.4512, "step": 456 }, { "epoch": 0.6971777269260107, "grad_norm": 0.4430518447643823, "learning_rate": 4.2647058823529415e-05, "loss": 0.4453, "step": 457 }, { "epoch": 0.6987032799389779, "grad_norm": 0.34116194011285733, "learning_rate": 4.261877828054299e-05, "loss": 0.4274, "step": 458 }, { "epoch": 0.700228832951945, "grad_norm": 0.4463064137841719, "learning_rate": 4.2590497737556565e-05, "loss": 0.42, "step": 459 }, { "epoch": 0.7017543859649122, "grad_norm": 0.3504183687477405, "learning_rate": 4.256221719457014e-05, "loss": 0.4096, "step": 460 }, { "epoch": 0.7032799389778794, "grad_norm": 0.4390755471801308, "learning_rate": 4.2533936651583714e-05, "loss": 0.423, "step": 461 }, { "epoch": 0.7048054919908466, "grad_norm": 0.3330534069207293, "learning_rate": 4.250565610859728e-05, "loss": 0.4376, "step": 462 }, { "epoch": 0.7063310450038138, "grad_norm": 0.3348321045788987, "learning_rate": 4.247737556561086e-05, "loss": 0.416, "step": 463 }, { "epoch": 0.7078565980167811, "grad_norm": 0.30764974138216766, "learning_rate": 4.244909502262443e-05, "loss": 0.4097, "step": 464 }, { "epoch": 0.7093821510297483, "grad_norm": 0.378790293833348, "learning_rate": 4.242081447963801e-05, "loss": 0.4287, "step": 465 }, { "epoch": 0.7109077040427155, "grad_norm": 0.38181223437660317, "learning_rate": 4.239253393665158e-05, "loss": 0.4479, "step": 466 }, { "epoch": 0.7124332570556827, "grad_norm": 0.3276273923568558, "learning_rate": 4.236425339366516e-05, "loss": 0.4679, "step": 467 }, { "epoch": 0.7139588100686499, "grad_norm": 0.34892975958993516, "learning_rate": 4.233597285067873e-05, "loss": 0.4545, "step": 468 }, { "epoch": 0.7154843630816171, "grad_norm": 0.39671637020274614, "learning_rate": 4.230769230769231e-05, "loss": 0.4519, "step": 469 }, { "epoch": 0.7170099160945843, "grad_norm": 0.3726165575548184, "learning_rate": 4.227941176470588e-05, "loss": 0.4507, "step": 470 }, { "epoch": 0.7185354691075515, "grad_norm": 0.3406000316535968, "learning_rate": 4.225113122171946e-05, "loss": 0.4385, "step": 471 }, { "epoch": 0.7200610221205187, "grad_norm": 0.34814420401527546, "learning_rate": 4.222285067873303e-05, "loss": 0.4355, "step": 472 }, { "epoch": 0.7215865751334859, "grad_norm": 0.360478020043742, "learning_rate": 4.219457013574661e-05, "loss": 0.4473, "step": 473 }, { "epoch": 0.7231121281464531, "grad_norm": 0.4157581179201356, "learning_rate": 4.216628959276018e-05, "loss": 0.4287, "step": 474 }, { "epoch": 0.7246376811594203, "grad_norm": 0.3618166729981742, "learning_rate": 4.213800904977376e-05, "loss": 0.4662, "step": 475 }, { "epoch": 0.7261632341723875, "grad_norm": 0.4341759921378755, "learning_rate": 4.210972850678733e-05, "loss": 0.4386, "step": 476 }, { "epoch": 0.7276887871853547, "grad_norm": 0.37050300703554023, "learning_rate": 4.2081447963800907e-05, "loss": 0.4432, "step": 477 }, { "epoch": 0.7292143401983219, "grad_norm": 0.4223338148195672, "learning_rate": 4.205316742081448e-05, "loss": 0.4252, "step": 478 }, { "epoch": 0.7307398932112891, "grad_norm": 0.46667895398014203, "learning_rate": 4.2024886877828056e-05, "loss": 0.4481, "step": 479 }, { "epoch": 0.7322654462242563, "grad_norm": 0.43008822514649786, "learning_rate": 4.199660633484163e-05, "loss": 0.44, "step": 480 }, { "epoch": 0.7337909992372235, "grad_norm": 0.45983996480601036, "learning_rate": 4.1968325791855206e-05, "loss": 0.451, "step": 481 }, { "epoch": 0.7353165522501907, "grad_norm": 0.4038327324511019, "learning_rate": 4.194004524886878e-05, "loss": 0.443, "step": 482 }, { "epoch": 0.7368421052631579, "grad_norm": 0.3598566269341369, "learning_rate": 4.1911764705882356e-05, "loss": 0.4532, "step": 483 }, { "epoch": 0.738367658276125, "grad_norm": 0.3904631744886152, "learning_rate": 4.188348416289593e-05, "loss": 0.4195, "step": 484 }, { "epoch": 0.7398932112890922, "grad_norm": 0.30183693363386577, "learning_rate": 4.1855203619909506e-05, "loss": 0.4059, "step": 485 }, { "epoch": 0.7414187643020596, "grad_norm": 0.3403185490364003, "learning_rate": 4.182692307692308e-05, "loss": 0.4038, "step": 486 }, { "epoch": 0.7429443173150267, "grad_norm": 0.34892870359971107, "learning_rate": 4.1798642533936656e-05, "loss": 0.44, "step": 487 }, { "epoch": 0.7444698703279939, "grad_norm": 0.40597792095225804, "learning_rate": 4.177036199095023e-05, "loss": 0.4626, "step": 488 }, { "epoch": 0.7459954233409611, "grad_norm": 0.3642441592274843, "learning_rate": 4.1742081447963806e-05, "loss": 0.4371, "step": 489 }, { "epoch": 0.7475209763539283, "grad_norm": 0.40959411720314487, "learning_rate": 4.171380090497738e-05, "loss": 0.4438, "step": 490 }, { "epoch": 0.7490465293668955, "grad_norm": 0.35086291099222305, "learning_rate": 4.1685520361990955e-05, "loss": 0.4388, "step": 491 }, { "epoch": 0.7505720823798627, "grad_norm": 0.35861585314941474, "learning_rate": 4.165723981900453e-05, "loss": 0.4007, "step": 492 }, { "epoch": 0.7520976353928299, "grad_norm": 0.3493380508080291, "learning_rate": 4.1628959276018105e-05, "loss": 0.4294, "step": 493 }, { "epoch": 0.7536231884057971, "grad_norm": 0.3624013624907356, "learning_rate": 4.160067873303168e-05, "loss": 0.4372, "step": 494 }, { "epoch": 0.7551487414187643, "grad_norm": 0.3821852024904889, "learning_rate": 4.1572398190045255e-05, "loss": 0.4182, "step": 495 }, { "epoch": 0.7566742944317315, "grad_norm": 0.3287770549693359, "learning_rate": 4.154411764705883e-05, "loss": 0.4422, "step": 496 }, { "epoch": 0.7581998474446987, "grad_norm": 0.4595509568520524, "learning_rate": 4.1515837104072405e-05, "loss": 0.4302, "step": 497 }, { "epoch": 0.7597254004576659, "grad_norm": 0.3546128657044429, "learning_rate": 4.148755656108598e-05, "loss": 0.4485, "step": 498 }, { "epoch": 0.7612509534706331, "grad_norm": 0.3985144951379393, "learning_rate": 4.1459276018099555e-05, "loss": 0.4516, "step": 499 }, { "epoch": 0.7627765064836003, "grad_norm": 0.38198891262178786, "learning_rate": 4.143099547511313e-05, "loss": 0.4191, "step": 500 }, { "epoch": 0.7643020594965675, "grad_norm": 0.4275693749347524, "learning_rate": 4.14027149321267e-05, "loss": 0.4535, "step": 501 }, { "epoch": 0.7658276125095347, "grad_norm": 0.35570418687296823, "learning_rate": 4.137443438914027e-05, "loss": 0.4389, "step": 502 }, { "epoch": 0.7673531655225019, "grad_norm": 0.39748510168292595, "learning_rate": 4.134615384615385e-05, "loss": 0.4214, "step": 503 }, { "epoch": 0.7688787185354691, "grad_norm": 0.35309050746856163, "learning_rate": 4.131787330316742e-05, "loss": 0.4391, "step": 504 }, { "epoch": 0.7704042715484363, "grad_norm": 0.4138660725458607, "learning_rate": 4.1289592760181e-05, "loss": 0.4522, "step": 505 }, { "epoch": 0.7719298245614035, "grad_norm": 0.37664830291130674, "learning_rate": 4.126131221719457e-05, "loss": 0.4248, "step": 506 }, { "epoch": 0.7734553775743707, "grad_norm": 0.3700675444871862, "learning_rate": 4.123303167420815e-05, "loss": 0.4254, "step": 507 }, { "epoch": 0.7749809305873379, "grad_norm": 0.4179374623964575, "learning_rate": 4.120475113122172e-05, "loss": 0.4354, "step": 508 }, { "epoch": 0.7765064836003052, "grad_norm": 0.44472902570389683, "learning_rate": 4.11764705882353e-05, "loss": 0.4541, "step": 509 }, { "epoch": 0.7780320366132724, "grad_norm": 0.45687814161162316, "learning_rate": 4.114819004524887e-05, "loss": 0.4295, "step": 510 }, { "epoch": 0.7795575896262396, "grad_norm": 0.4522717098648478, "learning_rate": 4.111990950226244e-05, "loss": 0.437, "step": 511 }, { "epoch": 0.7810831426392068, "grad_norm": 0.44609761720685026, "learning_rate": 4.1091628959276015e-05, "loss": 0.4359, "step": 512 }, { "epoch": 0.782608695652174, "grad_norm": 0.399540222786967, "learning_rate": 4.106334841628959e-05, "loss": 0.4519, "step": 513 }, { "epoch": 0.7841342486651411, "grad_norm": 0.418020596051392, "learning_rate": 4.1035067873303165e-05, "loss": 0.4229, "step": 514 }, { "epoch": 0.7856598016781083, "grad_norm": 0.3668951642726817, "learning_rate": 4.100678733031674e-05, "loss": 0.4434, "step": 515 }, { "epoch": 0.7871853546910755, "grad_norm": 0.42359436410322415, "learning_rate": 4.0978506787330315e-05, "loss": 0.4328, "step": 516 }, { "epoch": 0.7887109077040427, "grad_norm": 0.481488514162343, "learning_rate": 4.095022624434389e-05, "loss": 0.4234, "step": 517 }, { "epoch": 0.7902364607170099, "grad_norm": 0.3674899626741532, "learning_rate": 4.0921945701357465e-05, "loss": 0.4255, "step": 518 }, { "epoch": 0.7917620137299771, "grad_norm": 0.5616988974920332, "learning_rate": 4.089366515837104e-05, "loss": 0.417, "step": 519 }, { "epoch": 0.7932875667429443, "grad_norm": 0.3577365107559803, "learning_rate": 4.0865384615384615e-05, "loss": 0.4495, "step": 520 }, { "epoch": 0.7948131197559115, "grad_norm": 0.5459024475776874, "learning_rate": 4.083710407239819e-05, "loss": 0.4428, "step": 521 }, { "epoch": 0.7963386727688787, "grad_norm": 0.525719360020542, "learning_rate": 4.0808823529411765e-05, "loss": 0.4621, "step": 522 }, { "epoch": 0.7978642257818459, "grad_norm": 0.3579200950335217, "learning_rate": 4.078054298642534e-05, "loss": 0.4378, "step": 523 }, { "epoch": 0.7993897787948131, "grad_norm": 0.5197481981029356, "learning_rate": 4.0752262443438914e-05, "loss": 0.4422, "step": 524 }, { "epoch": 0.8009153318077803, "grad_norm": 0.3301245950993897, "learning_rate": 4.072398190045249e-05, "loss": 0.4127, "step": 525 }, { "epoch": 0.8024408848207475, "grad_norm": 0.49164229751799104, "learning_rate": 4.0695701357466064e-05, "loss": 0.4457, "step": 526 }, { "epoch": 0.8039664378337147, "grad_norm": 0.38468109117122024, "learning_rate": 4.066742081447964e-05, "loss": 0.4451, "step": 527 }, { "epoch": 0.8054919908466819, "grad_norm": 0.3671853119128999, "learning_rate": 4.0639140271493214e-05, "loss": 0.4317, "step": 528 }, { "epoch": 0.8070175438596491, "grad_norm": 0.41052414051160324, "learning_rate": 4.061085972850679e-05, "loss": 0.42, "step": 529 }, { "epoch": 0.8085430968726163, "grad_norm": 0.3633248598440435, "learning_rate": 4.0582579185520364e-05, "loss": 0.4381, "step": 530 }, { "epoch": 0.8100686498855835, "grad_norm": 0.3949508051125908, "learning_rate": 4.055429864253394e-05, "loss": 0.43, "step": 531 }, { "epoch": 0.8115942028985508, "grad_norm": 0.37466317723067155, "learning_rate": 4.0526018099547514e-05, "loss": 0.4253, "step": 532 }, { "epoch": 0.813119755911518, "grad_norm": 0.37733762013648203, "learning_rate": 4.049773755656109e-05, "loss": 0.4415, "step": 533 }, { "epoch": 0.8146453089244852, "grad_norm": 0.39627195512828745, "learning_rate": 4.0469457013574664e-05, "loss": 0.4464, "step": 534 }, { "epoch": 0.8161708619374524, "grad_norm": 0.35072925232938257, "learning_rate": 4.044117647058824e-05, "loss": 0.4254, "step": 535 }, { "epoch": 0.8176964149504196, "grad_norm": 0.3673659818945414, "learning_rate": 4.0412895927601813e-05, "loss": 0.4305, "step": 536 }, { "epoch": 0.8192219679633868, "grad_norm": 0.3669274840929282, "learning_rate": 4.038461538461539e-05, "loss": 0.4367, "step": 537 }, { "epoch": 0.820747520976354, "grad_norm": 0.351068844413437, "learning_rate": 4.035633484162896e-05, "loss": 0.4406, "step": 538 }, { "epoch": 0.8222730739893211, "grad_norm": 0.3255485661471821, "learning_rate": 4.032805429864254e-05, "loss": 0.4302, "step": 539 }, { "epoch": 0.8237986270022883, "grad_norm": 0.36404315724650055, "learning_rate": 4.029977375565611e-05, "loss": 0.4359, "step": 540 }, { "epoch": 0.8253241800152555, "grad_norm": 0.3287371505788899, "learning_rate": 4.027149321266969e-05, "loss": 0.429, "step": 541 }, { "epoch": 0.8268497330282227, "grad_norm": 0.4048184554099012, "learning_rate": 4.024321266968326e-05, "loss": 0.4407, "step": 542 }, { "epoch": 0.8283752860411899, "grad_norm": 0.3152081390131719, "learning_rate": 4.021493212669684e-05, "loss": 0.4306, "step": 543 }, { "epoch": 0.8299008390541571, "grad_norm": 0.3808433740270737, "learning_rate": 4.018665158371041e-05, "loss": 0.4329, "step": 544 }, { "epoch": 0.8314263920671243, "grad_norm": 0.3674064844216663, "learning_rate": 4.015837104072399e-05, "loss": 0.4216, "step": 545 }, { "epoch": 0.8329519450800915, "grad_norm": 0.5113868123384665, "learning_rate": 4.013009049773756e-05, "loss": 0.4383, "step": 546 }, { "epoch": 0.8344774980930587, "grad_norm": 0.3942348486079557, "learning_rate": 4.010180995475114e-05, "loss": 0.4267, "step": 547 }, { "epoch": 0.8360030511060259, "grad_norm": 0.3959015132502646, "learning_rate": 4.007352941176471e-05, "loss": 0.4327, "step": 548 }, { "epoch": 0.8375286041189931, "grad_norm": 0.39900614461024647, "learning_rate": 4.004524886877829e-05, "loss": 0.439, "step": 549 }, { "epoch": 0.8390541571319603, "grad_norm": 0.3556342334515889, "learning_rate": 4.001696832579186e-05, "loss": 0.4372, "step": 550 }, { "epoch": 0.8405797101449275, "grad_norm": 0.39573254933057145, "learning_rate": 3.998868778280544e-05, "loss": 0.4259, "step": 551 }, { "epoch": 0.8421052631578947, "grad_norm": 0.3231721458396948, "learning_rate": 3.9960407239819005e-05, "loss": 0.4266, "step": 552 }, { "epoch": 0.8436308161708619, "grad_norm": 0.3887563105395256, "learning_rate": 3.993212669683258e-05, "loss": 0.4304, "step": 553 }, { "epoch": 0.8451563691838292, "grad_norm": 0.4076995572955868, "learning_rate": 3.9903846153846155e-05, "loss": 0.4393, "step": 554 }, { "epoch": 0.8466819221967964, "grad_norm": 0.3600330590893684, "learning_rate": 3.987556561085973e-05, "loss": 0.444, "step": 555 }, { "epoch": 0.8482074752097636, "grad_norm": 0.35809525854323015, "learning_rate": 3.9847285067873305e-05, "loss": 0.426, "step": 556 }, { "epoch": 0.8497330282227308, "grad_norm": 0.35196143482229303, "learning_rate": 3.981900452488688e-05, "loss": 0.4235, "step": 557 }, { "epoch": 0.851258581235698, "grad_norm": 0.4016805998261422, "learning_rate": 3.9790723981900455e-05, "loss": 0.4137, "step": 558 }, { "epoch": 0.8527841342486652, "grad_norm": 0.4134683614283133, "learning_rate": 3.976244343891403e-05, "loss": 0.4301, "step": 559 }, { "epoch": 0.8543096872616324, "grad_norm": 0.3180261970818384, "learning_rate": 3.97341628959276e-05, "loss": 0.4304, "step": 560 }, { "epoch": 0.8558352402745996, "grad_norm": 0.3728191275343172, "learning_rate": 3.970588235294117e-05, "loss": 0.4254, "step": 561 }, { "epoch": 0.8573607932875668, "grad_norm": 0.3046867657318598, "learning_rate": 3.967760180995475e-05, "loss": 0.4384, "step": 562 }, { "epoch": 0.858886346300534, "grad_norm": 0.3937945276938685, "learning_rate": 3.964932126696832e-05, "loss": 0.4124, "step": 563 }, { "epoch": 0.8604118993135011, "grad_norm": 0.3589460244896344, "learning_rate": 3.96210407239819e-05, "loss": 0.4297, "step": 564 }, { "epoch": 0.8619374523264683, "grad_norm": 0.39307788568257573, "learning_rate": 3.959276018099547e-05, "loss": 0.4281, "step": 565 }, { "epoch": 0.8634630053394355, "grad_norm": 0.3802874303282596, "learning_rate": 3.956447963800905e-05, "loss": 0.4132, "step": 566 }, { "epoch": 0.8649885583524027, "grad_norm": 0.3829383601795179, "learning_rate": 3.953619909502262e-05, "loss": 0.4278, "step": 567 }, { "epoch": 0.8665141113653699, "grad_norm": 0.37203283370858603, "learning_rate": 3.95079185520362e-05, "loss": 0.4276, "step": 568 }, { "epoch": 0.8680396643783371, "grad_norm": 0.3643132933081618, "learning_rate": 3.947963800904977e-05, "loss": 0.4257, "step": 569 }, { "epoch": 0.8695652173913043, "grad_norm": 0.387773518630962, "learning_rate": 3.945135746606335e-05, "loss": 0.4421, "step": 570 }, { "epoch": 0.8710907704042715, "grad_norm": 0.29097562010262734, "learning_rate": 3.942307692307692e-05, "loss": 0.4202, "step": 571 }, { "epoch": 0.8726163234172387, "grad_norm": 0.35716151836392673, "learning_rate": 3.93947963800905e-05, "loss": 0.4514, "step": 572 }, { "epoch": 0.8741418764302059, "grad_norm": 0.28290381675932774, "learning_rate": 3.936651583710407e-05, "loss": 0.4145, "step": 573 }, { "epoch": 0.8756674294431731, "grad_norm": 0.34061865793706136, "learning_rate": 3.933823529411765e-05, "loss": 0.4147, "step": 574 }, { "epoch": 0.8771929824561403, "grad_norm": 0.3240896835803248, "learning_rate": 3.930995475113122e-05, "loss": 0.4229, "step": 575 }, { "epoch": 0.8787185354691075, "grad_norm": 0.3421741352319487, "learning_rate": 3.92816742081448e-05, "loss": 0.4182, "step": 576 }, { "epoch": 0.8802440884820748, "grad_norm": 0.3385706833429536, "learning_rate": 3.925339366515837e-05, "loss": 0.4309, "step": 577 }, { "epoch": 0.881769641495042, "grad_norm": 0.36150066991640434, "learning_rate": 3.922511312217195e-05, "loss": 0.4332, "step": 578 }, { "epoch": 0.8832951945080092, "grad_norm": 0.3723214400969838, "learning_rate": 3.919683257918552e-05, "loss": 0.4371, "step": 579 }, { "epoch": 0.8848207475209764, "grad_norm": 0.3321593640435473, "learning_rate": 3.9168552036199097e-05, "loss": 0.4142, "step": 580 }, { "epoch": 0.8863463005339436, "grad_norm": 0.40291801211558004, "learning_rate": 3.914027149321267e-05, "loss": 0.4433, "step": 581 }, { "epoch": 0.8878718535469108, "grad_norm": 0.3094521748910113, "learning_rate": 3.9111990950226246e-05, "loss": 0.4209, "step": 582 }, { "epoch": 0.889397406559878, "grad_norm": 0.32131667811785675, "learning_rate": 3.908371040723982e-05, "loss": 0.4145, "step": 583 }, { "epoch": 0.8909229595728452, "grad_norm": 0.3801646700226601, "learning_rate": 3.9055429864253396e-05, "loss": 0.4563, "step": 584 }, { "epoch": 0.8924485125858124, "grad_norm": 0.35361586288948427, "learning_rate": 3.902714932126697e-05, "loss": 0.4203, "step": 585 }, { "epoch": 0.8939740655987796, "grad_norm": 0.33812889674576013, "learning_rate": 3.8998868778280546e-05, "loss": 0.4111, "step": 586 }, { "epoch": 0.8954996186117468, "grad_norm": 0.327553547956027, "learning_rate": 3.897058823529412e-05, "loss": 0.4167, "step": 587 }, { "epoch": 0.897025171624714, "grad_norm": 0.3762624105435268, "learning_rate": 3.8942307692307696e-05, "loss": 0.441, "step": 588 }, { "epoch": 0.8985507246376812, "grad_norm": 0.35864950445264976, "learning_rate": 3.891402714932127e-05, "loss": 0.4534, "step": 589 }, { "epoch": 0.9000762776506483, "grad_norm": 0.3982333395922051, "learning_rate": 3.8885746606334846e-05, "loss": 0.4337, "step": 590 }, { "epoch": 0.9016018306636155, "grad_norm": 0.4151622053266755, "learning_rate": 3.885746606334842e-05, "loss": 0.4558, "step": 591 }, { "epoch": 0.9031273836765827, "grad_norm": 0.3878550665399113, "learning_rate": 3.8829185520361996e-05, "loss": 0.4481, "step": 592 }, { "epoch": 0.9046529366895499, "grad_norm": 0.3874524200991818, "learning_rate": 3.880090497737557e-05, "loss": 0.4219, "step": 593 }, { "epoch": 0.9061784897025171, "grad_norm": 0.37338618173844984, "learning_rate": 3.8772624434389145e-05, "loss": 0.4168, "step": 594 }, { "epoch": 0.9077040427154843, "grad_norm": 0.3883574401735357, "learning_rate": 3.874434389140272e-05, "loss": 0.4376, "step": 595 }, { "epoch": 0.9092295957284515, "grad_norm": 0.4205847145993874, "learning_rate": 3.8716063348416295e-05, "loss": 0.4186, "step": 596 }, { "epoch": 0.9107551487414187, "grad_norm": 0.3046649892064356, "learning_rate": 3.868778280542987e-05, "loss": 0.3993, "step": 597 }, { "epoch": 0.9122807017543859, "grad_norm": 0.43269500534034583, "learning_rate": 3.8659502262443445e-05, "loss": 0.4219, "step": 598 }, { "epoch": 0.9138062547673532, "grad_norm": 0.32397701183768696, "learning_rate": 3.863122171945702e-05, "loss": 0.4164, "step": 599 }, { "epoch": 0.9153318077803204, "grad_norm": 0.39635007279140116, "learning_rate": 3.8602941176470595e-05, "loss": 0.4281, "step": 600 }, { "epoch": 0.9168573607932876, "grad_norm": 0.39260544284888516, "learning_rate": 3.857466063348416e-05, "loss": 0.439, "step": 601 }, { "epoch": 0.9183829138062548, "grad_norm": 0.3594609294624123, "learning_rate": 3.854638009049774e-05, "loss": 0.4331, "step": 602 }, { "epoch": 0.919908466819222, "grad_norm": 0.3865475522502821, "learning_rate": 3.851809954751131e-05, "loss": 0.4234, "step": 603 }, { "epoch": 0.9214340198321892, "grad_norm": 0.36698102903858737, "learning_rate": 3.848981900452489e-05, "loss": 0.4371, "step": 604 }, { "epoch": 0.9229595728451564, "grad_norm": 0.3855883841942864, "learning_rate": 3.846153846153846e-05, "loss": 0.4492, "step": 605 }, { "epoch": 0.9244851258581236, "grad_norm": 0.3114063353047286, "learning_rate": 3.843325791855204e-05, "loss": 0.4221, "step": 606 }, { "epoch": 0.9260106788710908, "grad_norm": 0.38394090589196245, "learning_rate": 3.840497737556561e-05, "loss": 0.4467, "step": 607 }, { "epoch": 0.927536231884058, "grad_norm": 0.3516999315211215, "learning_rate": 3.837669683257919e-05, "loss": 0.4145, "step": 608 }, { "epoch": 0.9290617848970252, "grad_norm": 0.37926741223194355, "learning_rate": 3.834841628959276e-05, "loss": 0.4605, "step": 609 }, { "epoch": 0.9305873379099924, "grad_norm": 0.3962331825136665, "learning_rate": 3.832013574660634e-05, "loss": 0.433, "step": 610 }, { "epoch": 0.9321128909229596, "grad_norm": 0.37087036572036247, "learning_rate": 3.829185520361991e-05, "loss": 0.4186, "step": 611 }, { "epoch": 0.9336384439359268, "grad_norm": 0.42668785852672586, "learning_rate": 3.826357466063348e-05, "loss": 0.4102, "step": 612 }, { "epoch": 0.935163996948894, "grad_norm": 0.33238157635704535, "learning_rate": 3.8235294117647055e-05, "loss": 0.4065, "step": 613 }, { "epoch": 0.9366895499618612, "grad_norm": 0.3730752599750209, "learning_rate": 3.820701357466063e-05, "loss": 0.4097, "step": 614 }, { "epoch": 0.9382151029748284, "grad_norm": 0.42727288807058933, "learning_rate": 3.8178733031674205e-05, "loss": 0.4091, "step": 615 }, { "epoch": 0.9397406559877955, "grad_norm": 0.34667052186890596, "learning_rate": 3.815045248868778e-05, "loss": 0.4212, "step": 616 }, { "epoch": 0.9412662090007627, "grad_norm": 0.5165472186856029, "learning_rate": 3.8122171945701355e-05, "loss": 0.446, "step": 617 }, { "epoch": 0.9427917620137299, "grad_norm": 0.3650617870503334, "learning_rate": 3.809389140271493e-05, "loss": 0.4297, "step": 618 }, { "epoch": 0.9443173150266971, "grad_norm": 0.3819096126125252, "learning_rate": 3.8065610859728505e-05, "loss": 0.4362, "step": 619 }, { "epoch": 0.9458428680396643, "grad_norm": 0.3977069021103364, "learning_rate": 3.803733031674208e-05, "loss": 0.4298, "step": 620 }, { "epoch": 0.9473684210526315, "grad_norm": 0.4033708900031112, "learning_rate": 3.8009049773755655e-05, "loss": 0.4416, "step": 621 }, { "epoch": 0.9488939740655988, "grad_norm": 0.36624012411797835, "learning_rate": 3.798076923076923e-05, "loss": 0.4121, "step": 622 }, { "epoch": 0.950419527078566, "grad_norm": 0.367804648017954, "learning_rate": 3.7952488687782805e-05, "loss": 0.422, "step": 623 }, { "epoch": 0.9519450800915332, "grad_norm": 0.44296600681436415, "learning_rate": 3.792420814479638e-05, "loss": 0.4351, "step": 624 }, { "epoch": 0.9534706331045004, "grad_norm": 0.41331210589686396, "learning_rate": 3.7895927601809955e-05, "loss": 0.4202, "step": 625 }, { "epoch": 0.9549961861174676, "grad_norm": 0.4319757517276028, "learning_rate": 3.786764705882353e-05, "loss": 0.4432, "step": 626 }, { "epoch": 0.9565217391304348, "grad_norm": 0.3632517323334636, "learning_rate": 3.7839366515837104e-05, "loss": 0.4214, "step": 627 }, { "epoch": 0.958047292143402, "grad_norm": 0.4190389418280002, "learning_rate": 3.781108597285068e-05, "loss": 0.4408, "step": 628 }, { "epoch": 0.9595728451563692, "grad_norm": 0.3516904020891142, "learning_rate": 3.7782805429864254e-05, "loss": 0.4375, "step": 629 }, { "epoch": 0.9610983981693364, "grad_norm": 0.3355963581394157, "learning_rate": 3.775452488687783e-05, "loss": 0.4218, "step": 630 }, { "epoch": 0.9626239511823036, "grad_norm": 0.3660973253529957, "learning_rate": 3.7726244343891404e-05, "loss": 0.4101, "step": 631 }, { "epoch": 0.9641495041952708, "grad_norm": 0.4675055935898027, "learning_rate": 3.769796380090498e-05, "loss": 0.4007, "step": 632 }, { "epoch": 0.965675057208238, "grad_norm": 0.3772842656388094, "learning_rate": 3.7669683257918554e-05, "loss": 0.4129, "step": 633 }, { "epoch": 0.9672006102212052, "grad_norm": 0.4234378504406015, "learning_rate": 3.764140271493213e-05, "loss": 0.4188, "step": 634 }, { "epoch": 0.9687261632341724, "grad_norm": 0.34118463814157296, "learning_rate": 3.7613122171945704e-05, "loss": 0.4196, "step": 635 }, { "epoch": 0.9702517162471396, "grad_norm": 0.35593978548379435, "learning_rate": 3.758484162895928e-05, "loss": 0.4402, "step": 636 }, { "epoch": 0.9717772692601068, "grad_norm": 0.320677244229397, "learning_rate": 3.7556561085972854e-05, "loss": 0.4409, "step": 637 }, { "epoch": 0.973302822273074, "grad_norm": 0.3408243426683139, "learning_rate": 3.752828054298643e-05, "loss": 0.4456, "step": 638 }, { "epoch": 0.9748283752860412, "grad_norm": 0.33792223346336797, "learning_rate": 3.7500000000000003e-05, "loss": 0.428, "step": 639 }, { "epoch": 0.9763539282990084, "grad_norm": 0.33375525746767226, "learning_rate": 3.747171945701358e-05, "loss": 0.4219, "step": 640 }, { "epoch": 0.9778794813119756, "grad_norm": 0.38415107872535165, "learning_rate": 3.744343891402715e-05, "loss": 0.4231, "step": 641 }, { "epoch": 0.9794050343249427, "grad_norm": 0.41832020544301285, "learning_rate": 3.741515837104073e-05, "loss": 0.4364, "step": 642 }, { "epoch": 0.9809305873379099, "grad_norm": 0.3920099411936452, "learning_rate": 3.73868778280543e-05, "loss": 0.4303, "step": 643 }, { "epoch": 0.9824561403508771, "grad_norm": 0.479436939473446, "learning_rate": 3.735859728506788e-05, "loss": 0.4035, "step": 644 }, { "epoch": 0.9839816933638444, "grad_norm": 0.4438216691246816, "learning_rate": 3.733031674208145e-05, "loss": 0.4074, "step": 645 }, { "epoch": 0.9855072463768116, "grad_norm": 0.3523431458601914, "learning_rate": 3.730203619909503e-05, "loss": 0.4263, "step": 646 }, { "epoch": 0.9870327993897788, "grad_norm": 0.39248307054441434, "learning_rate": 3.72737556561086e-05, "loss": 0.4147, "step": 647 }, { "epoch": 0.988558352402746, "grad_norm": 0.402178573146837, "learning_rate": 3.724547511312218e-05, "loss": 0.4229, "step": 648 }, { "epoch": 0.9900839054157132, "grad_norm": 0.34035353487629616, "learning_rate": 3.721719457013575e-05, "loss": 0.4182, "step": 649 }, { "epoch": 0.9916094584286804, "grad_norm": 0.3785491406625458, "learning_rate": 3.718891402714932e-05, "loss": 0.4329, "step": 650 }, { "epoch": 0.9931350114416476, "grad_norm": 0.3621343197082202, "learning_rate": 3.7160633484162896e-05, "loss": 0.4312, "step": 651 }, { "epoch": 0.9946605644546148, "grad_norm": 0.345853669424604, "learning_rate": 3.713235294117647e-05, "loss": 0.445, "step": 652 }, { "epoch": 0.996186117467582, "grad_norm": 0.3586234319820601, "learning_rate": 3.7104072398190046e-05, "loss": 0.4185, "step": 653 }, { "epoch": 0.9977116704805492, "grad_norm": 0.37475756616718636, "learning_rate": 3.707579185520362e-05, "loss": 0.4541, "step": 654 }, { "epoch": 0.9992372234935164, "grad_norm": 0.3521674939508171, "learning_rate": 3.7047511312217195e-05, "loss": 0.4265, "step": 655 }, { "epoch": 1.0, "grad_norm": 0.3521674939508171, "learning_rate": 3.701923076923077e-05, "loss": 0.436, "step": 656 }, { "epoch": 1.0015255530129672, "grad_norm": 0.593251859887867, "learning_rate": 3.6990950226244345e-05, "loss": 0.3562, "step": 657 }, { "epoch": 1.0030511060259344, "grad_norm": 0.45295506137551766, "learning_rate": 3.696266968325792e-05, "loss": 0.3768, "step": 658 }, { "epoch": 1.0045766590389016, "grad_norm": 0.41699131901412023, "learning_rate": 3.6934389140271495e-05, "loss": 0.3938, "step": 659 }, { "epoch": 1.0061022120518688, "grad_norm": 0.40264475612421186, "learning_rate": 3.690610859728507e-05, "loss": 0.3718, "step": 660 }, { "epoch": 1.007627765064836, "grad_norm": 0.395548751330996, "learning_rate": 3.6877828054298645e-05, "loss": 0.387, "step": 661 }, { "epoch": 1.0091533180778032, "grad_norm": 0.43771472768185676, "learning_rate": 3.684954751131222e-05, "loss": 0.3792, "step": 662 }, { "epoch": 1.0106788710907704, "grad_norm": 0.3909352445078242, "learning_rate": 3.6821266968325795e-05, "loss": 0.3475, "step": 663 }, { "epoch": 1.0122044241037376, "grad_norm": 0.43044613807951376, "learning_rate": 3.679298642533937e-05, "loss": 0.3917, "step": 664 }, { "epoch": 1.0137299771167048, "grad_norm": 0.4608552583014176, "learning_rate": 3.6764705882352945e-05, "loss": 0.3554, "step": 665 }, { "epoch": 1.015255530129672, "grad_norm": 0.339278130917991, "learning_rate": 3.673642533936652e-05, "loss": 0.3589, "step": 666 }, { "epoch": 1.0167810831426392, "grad_norm": 0.38477111056992863, "learning_rate": 3.670814479638009e-05, "loss": 0.3636, "step": 667 }, { "epoch": 1.0183066361556063, "grad_norm": 0.4045355841363484, "learning_rate": 3.667986425339366e-05, "loss": 0.3758, "step": 668 }, { "epoch": 1.0198321891685735, "grad_norm": 0.32104366509454324, "learning_rate": 3.665158371040724e-05, "loss": 0.3631, "step": 669 }, { "epoch": 1.0213577421815407, "grad_norm": 0.48805604063014907, "learning_rate": 3.662330316742081e-05, "loss": 0.3667, "step": 670 }, { "epoch": 1.022883295194508, "grad_norm": 0.3600467956634154, "learning_rate": 3.659502262443439e-05, "loss": 0.3515, "step": 671 }, { "epoch": 1.0244088482074751, "grad_norm": 0.35882341315528093, "learning_rate": 3.656674208144796e-05, "loss": 0.3591, "step": 672 }, { "epoch": 1.0259344012204423, "grad_norm": 0.35646313818108427, "learning_rate": 3.653846153846154e-05, "loss": 0.3625, "step": 673 }, { "epoch": 1.0274599542334095, "grad_norm": 0.37608663940975634, "learning_rate": 3.651018099547511e-05, "loss": 0.3695, "step": 674 }, { "epoch": 1.0289855072463767, "grad_norm": 0.3256732027487922, "learning_rate": 3.648190045248869e-05, "loss": 0.3651, "step": 675 }, { "epoch": 1.030511060259344, "grad_norm": 0.37635162459799487, "learning_rate": 3.645361990950226e-05, "loss": 0.3877, "step": 676 }, { "epoch": 1.032036613272311, "grad_norm": 0.39324886077489246, "learning_rate": 3.642533936651584e-05, "loss": 0.3577, "step": 677 }, { "epoch": 1.0335621662852783, "grad_norm": 0.33906652766960804, "learning_rate": 3.639705882352941e-05, "loss": 0.3374, "step": 678 }, { "epoch": 1.0350877192982457, "grad_norm": 0.33943296955720215, "learning_rate": 3.636877828054299e-05, "loss": 0.3561, "step": 679 }, { "epoch": 1.036613272311213, "grad_norm": 0.3742946053703938, "learning_rate": 3.634049773755656e-05, "loss": 0.3595, "step": 680 }, { "epoch": 1.0381388253241801, "grad_norm": 0.3602253735580728, "learning_rate": 3.631221719457014e-05, "loss": 0.3809, "step": 681 }, { "epoch": 1.0396643783371473, "grad_norm": 0.3295057599366689, "learning_rate": 3.628393665158371e-05, "loss": 0.3502, "step": 682 }, { "epoch": 1.0411899313501145, "grad_norm": 0.33282310049439884, "learning_rate": 3.6255656108597287e-05, "loss": 0.3436, "step": 683 }, { "epoch": 1.0427154843630817, "grad_norm": 0.35852623338895606, "learning_rate": 3.622737556561086e-05, "loss": 0.3693, "step": 684 }, { "epoch": 1.044241037376049, "grad_norm": 0.39453119838786, "learning_rate": 3.6199095022624436e-05, "loss": 0.3657, "step": 685 }, { "epoch": 1.045766590389016, "grad_norm": 0.2981036037456941, "learning_rate": 3.617081447963801e-05, "loss": 0.3733, "step": 686 }, { "epoch": 1.0472921434019833, "grad_norm": 0.40005366609570037, "learning_rate": 3.6142533936651586e-05, "loss": 0.3664, "step": 687 }, { "epoch": 1.0488176964149505, "grad_norm": 0.2728572556036001, "learning_rate": 3.611425339366516e-05, "loss": 0.3621, "step": 688 }, { "epoch": 1.0503432494279177, "grad_norm": 0.3344489826494558, "learning_rate": 3.6085972850678736e-05, "loss": 0.3605, "step": 689 }, { "epoch": 1.0518688024408849, "grad_norm": 0.2929538053400578, "learning_rate": 3.605769230769231e-05, "loss": 0.3687, "step": 690 }, { "epoch": 1.053394355453852, "grad_norm": 0.33832438954834515, "learning_rate": 3.6029411764705886e-05, "loss": 0.3404, "step": 691 }, { "epoch": 1.0549199084668193, "grad_norm": 0.3313049792145938, "learning_rate": 3.600113122171946e-05, "loss": 0.3653, "step": 692 }, { "epoch": 1.0564454614797865, "grad_norm": 0.34534594691279014, "learning_rate": 3.5972850678733036e-05, "loss": 0.3547, "step": 693 }, { "epoch": 1.0579710144927537, "grad_norm": 0.3476488883167179, "learning_rate": 3.594457013574661e-05, "loss": 0.3786, "step": 694 }, { "epoch": 1.0594965675057209, "grad_norm": 0.43253455435272686, "learning_rate": 3.5916289592760186e-05, "loss": 0.3865, "step": 695 }, { "epoch": 1.061022120518688, "grad_norm": 0.27251909912767064, "learning_rate": 3.588800904977376e-05, "loss": 0.3672, "step": 696 }, { "epoch": 1.0625476735316552, "grad_norm": 0.31281954450144794, "learning_rate": 3.5859728506787336e-05, "loss": 0.3572, "step": 697 }, { "epoch": 1.0640732265446224, "grad_norm": 0.3438989796955778, "learning_rate": 3.583144796380091e-05, "loss": 0.3589, "step": 698 }, { "epoch": 1.0655987795575896, "grad_norm": 0.29410567056590037, "learning_rate": 3.580316742081448e-05, "loss": 0.3703, "step": 699 }, { "epoch": 1.0671243325705568, "grad_norm": 0.36800312170191835, "learning_rate": 3.5774886877828054e-05, "loss": 0.3693, "step": 700 }, { "epoch": 1.068649885583524, "grad_norm": 0.3109924970551452, "learning_rate": 3.574660633484163e-05, "loss": 0.3585, "step": 701 }, { "epoch": 1.0701754385964912, "grad_norm": 0.3477657984693785, "learning_rate": 3.57183257918552e-05, "loss": 0.3783, "step": 702 }, { "epoch": 1.0717009916094584, "grad_norm": 0.414942500953861, "learning_rate": 3.569004524886878e-05, "loss": 0.3644, "step": 703 }, { "epoch": 1.0732265446224256, "grad_norm": 0.2958371572882132, "learning_rate": 3.566176470588235e-05, "loss": 0.3396, "step": 704 }, { "epoch": 1.0747520976353928, "grad_norm": 0.3563958424771956, "learning_rate": 3.563348416289593e-05, "loss": 0.3624, "step": 705 }, { "epoch": 1.07627765064836, "grad_norm": 0.3290955294962301, "learning_rate": 3.56052036199095e-05, "loss": 0.3645, "step": 706 }, { "epoch": 1.0778032036613272, "grad_norm": 0.3898669293134275, "learning_rate": 3.557692307692308e-05, "loss": 0.3731, "step": 707 }, { "epoch": 1.0793287566742944, "grad_norm": 0.31725886156362204, "learning_rate": 3.554864253393665e-05, "loss": 0.3378, "step": 708 }, { "epoch": 1.0808543096872616, "grad_norm": 0.2839479538778152, "learning_rate": 3.552036199095023e-05, "loss": 0.363, "step": 709 }, { "epoch": 1.0823798627002288, "grad_norm": 0.30390672758361764, "learning_rate": 3.54920814479638e-05, "loss": 0.3555, "step": 710 }, { "epoch": 1.083905415713196, "grad_norm": 0.3028781383188018, "learning_rate": 3.546380090497738e-05, "loss": 0.3469, "step": 711 }, { "epoch": 1.0854309687261632, "grad_norm": 0.34342403618919626, "learning_rate": 3.543552036199095e-05, "loss": 0.3442, "step": 712 }, { "epoch": 1.0869565217391304, "grad_norm": 0.3094651527690766, "learning_rate": 3.540723981900453e-05, "loss": 0.3826, "step": 713 }, { "epoch": 1.0884820747520976, "grad_norm": 0.36078668280636395, "learning_rate": 3.53789592760181e-05, "loss": 0.3539, "step": 714 }, { "epoch": 1.0900076277650648, "grad_norm": 0.30248528644641726, "learning_rate": 3.535067873303168e-05, "loss": 0.3494, "step": 715 }, { "epoch": 1.091533180778032, "grad_norm": 0.2902576522419803, "learning_rate": 3.532239819004525e-05, "loss": 0.3537, "step": 716 }, { "epoch": 1.0930587337909992, "grad_norm": 0.43906433222996644, "learning_rate": 3.529411764705883e-05, "loss": 0.3577, "step": 717 }, { "epoch": 1.0945842868039664, "grad_norm": 0.26836425533880376, "learning_rate": 3.52658371040724e-05, "loss": 0.3648, "step": 718 }, { "epoch": 1.0961098398169336, "grad_norm": 0.31259218142419926, "learning_rate": 3.523755656108598e-05, "loss": 0.3425, "step": 719 }, { "epoch": 1.0976353928299007, "grad_norm": 0.2756847532019879, "learning_rate": 3.520927601809955e-05, "loss": 0.3786, "step": 720 }, { "epoch": 1.099160945842868, "grad_norm": 0.27321166564050575, "learning_rate": 3.518099547511313e-05, "loss": 0.3685, "step": 721 }, { "epoch": 1.1006864988558351, "grad_norm": 0.2936679771245994, "learning_rate": 3.5152714932126695e-05, "loss": 0.3651, "step": 722 }, { "epoch": 1.1022120518688023, "grad_norm": 0.2858409163101666, "learning_rate": 3.512443438914027e-05, "loss": 0.3733, "step": 723 }, { "epoch": 1.1037376048817698, "grad_norm": 0.27616976876792226, "learning_rate": 3.5096153846153845e-05, "loss": 0.361, "step": 724 }, { "epoch": 1.1052631578947367, "grad_norm": 0.29067663695824036, "learning_rate": 3.506787330316742e-05, "loss": 0.3522, "step": 725 }, { "epoch": 1.1067887109077041, "grad_norm": 0.3201434193821312, "learning_rate": 3.5039592760180995e-05, "loss": 0.3683, "step": 726 }, { "epoch": 1.1083142639206713, "grad_norm": 0.29157985483337956, "learning_rate": 3.501131221719457e-05, "loss": 0.3487, "step": 727 }, { "epoch": 1.1098398169336385, "grad_norm": 0.33609676188897386, "learning_rate": 3.4983031674208145e-05, "loss": 0.3476, "step": 728 }, { "epoch": 1.1113653699466057, "grad_norm": 0.30292146925009705, "learning_rate": 3.495475113122172e-05, "loss": 0.3663, "step": 729 }, { "epoch": 1.112890922959573, "grad_norm": 0.3255042447939858, "learning_rate": 3.4926470588235294e-05, "loss": 0.3652, "step": 730 }, { "epoch": 1.1144164759725401, "grad_norm": 0.3049383149494861, "learning_rate": 3.489819004524887e-05, "loss": 0.3664, "step": 731 }, { "epoch": 1.1159420289855073, "grad_norm": 0.33104144007801134, "learning_rate": 3.4869909502262444e-05, "loss": 0.3619, "step": 732 }, { "epoch": 1.1174675819984745, "grad_norm": 0.251786946170529, "learning_rate": 3.484162895927602e-05, "loss": 0.3674, "step": 733 }, { "epoch": 1.1189931350114417, "grad_norm": 0.36899703162789055, "learning_rate": 3.4813348416289594e-05, "loss": 0.3782, "step": 734 }, { "epoch": 1.120518688024409, "grad_norm": 0.34451828076904206, "learning_rate": 3.478506787330317e-05, "loss": 0.3445, "step": 735 }, { "epoch": 1.122044241037376, "grad_norm": 0.3298315249368445, "learning_rate": 3.4756787330316744e-05, "loss": 0.3733, "step": 736 }, { "epoch": 1.1235697940503433, "grad_norm": 0.3688483219771535, "learning_rate": 3.472850678733032e-05, "loss": 0.3502, "step": 737 }, { "epoch": 1.1250953470633105, "grad_norm": 0.28824911315013685, "learning_rate": 3.4700226244343894e-05, "loss": 0.3831, "step": 738 }, { "epoch": 1.1266209000762777, "grad_norm": 0.46392697001500616, "learning_rate": 3.467194570135747e-05, "loss": 0.3745, "step": 739 }, { "epoch": 1.1281464530892449, "grad_norm": 0.3269712015426494, "learning_rate": 3.4643665158371044e-05, "loss": 0.3675, "step": 740 }, { "epoch": 1.129672006102212, "grad_norm": 0.3962652109373114, "learning_rate": 3.461538461538462e-05, "loss": 0.3608, "step": 741 }, { "epoch": 1.1311975591151793, "grad_norm": 0.3271978472160097, "learning_rate": 3.4587104072398194e-05, "loss": 0.351, "step": 742 }, { "epoch": 1.1327231121281465, "grad_norm": 0.36670403765211895, "learning_rate": 3.455882352941177e-05, "loss": 0.3573, "step": 743 }, { "epoch": 1.1342486651411137, "grad_norm": 0.3609745956926606, "learning_rate": 3.453054298642534e-05, "loss": 0.3866, "step": 744 }, { "epoch": 1.1357742181540809, "grad_norm": 0.4204368958366545, "learning_rate": 3.450226244343892e-05, "loss": 0.3409, "step": 745 }, { "epoch": 1.137299771167048, "grad_norm": 0.3399390344839788, "learning_rate": 3.447398190045249e-05, "loss": 0.3582, "step": 746 }, { "epoch": 1.1388253241800153, "grad_norm": 0.38813243860347474, "learning_rate": 3.444570135746606e-05, "loss": 0.3455, "step": 747 }, { "epoch": 1.1403508771929824, "grad_norm": 0.37113583514795034, "learning_rate": 3.4417420814479636e-05, "loss": 0.3436, "step": 748 }, { "epoch": 1.1418764302059496, "grad_norm": 0.32821990645390825, "learning_rate": 3.438914027149321e-05, "loss": 0.352, "step": 749 }, { "epoch": 1.1434019832189168, "grad_norm": 0.38710364712843637, "learning_rate": 3.4360859728506786e-05, "loss": 0.3783, "step": 750 }, { "epoch": 1.144927536231884, "grad_norm": 0.33384938962032085, "learning_rate": 3.433257918552036e-05, "loss": 0.3541, "step": 751 }, { "epoch": 1.1464530892448512, "grad_norm": 0.42206911895681165, "learning_rate": 3.4304298642533936e-05, "loss": 0.3615, "step": 752 }, { "epoch": 1.1479786422578184, "grad_norm": 0.28322433163436533, "learning_rate": 3.427601809954751e-05, "loss": 0.3497, "step": 753 }, { "epoch": 1.1495041952707856, "grad_norm": 0.3766737480485819, "learning_rate": 3.4247737556561086e-05, "loss": 0.3511, "step": 754 }, { "epoch": 1.1510297482837528, "grad_norm": 0.33245088563171293, "learning_rate": 3.421945701357466e-05, "loss": 0.3642, "step": 755 }, { "epoch": 1.15255530129672, "grad_norm": 0.37005608280527524, "learning_rate": 3.4191176470588236e-05, "loss": 0.3685, "step": 756 }, { "epoch": 1.1540808543096872, "grad_norm": 0.309465642053034, "learning_rate": 3.416289592760181e-05, "loss": 0.3643, "step": 757 }, { "epoch": 1.1556064073226544, "grad_norm": 0.3341347657224667, "learning_rate": 3.4134615384615386e-05, "loss": 0.3571, "step": 758 }, { "epoch": 1.1571319603356216, "grad_norm": 0.3188289396995708, "learning_rate": 3.410633484162896e-05, "loss": 0.3654, "step": 759 }, { "epoch": 1.1586575133485888, "grad_norm": 0.38814592564714806, "learning_rate": 3.4078054298642535e-05, "loss": 0.3806, "step": 760 }, { "epoch": 1.160183066361556, "grad_norm": 0.3376329886672136, "learning_rate": 3.404977375565611e-05, "loss": 0.3711, "step": 761 }, { "epoch": 1.1617086193745232, "grad_norm": 0.42797094155591686, "learning_rate": 3.4021493212669685e-05, "loss": 0.3556, "step": 762 }, { "epoch": 1.1632341723874904, "grad_norm": 0.3470097885322982, "learning_rate": 3.399321266968326e-05, "loss": 0.3957, "step": 763 }, { "epoch": 1.1647597254004576, "grad_norm": 1.0880673762393351, "learning_rate": 3.3964932126696835e-05, "loss": 0.3429, "step": 764 }, { "epoch": 1.1662852784134248, "grad_norm": 0.36312265319753506, "learning_rate": 3.393665158371041e-05, "loss": 0.3512, "step": 765 }, { "epoch": 1.167810831426392, "grad_norm": 0.3326104761861983, "learning_rate": 3.3908371040723985e-05, "loss": 0.3562, "step": 766 }, { "epoch": 1.1693363844393594, "grad_norm": 0.33792318038261887, "learning_rate": 3.388009049773756e-05, "loss": 0.3754, "step": 767 }, { "epoch": 1.1708619374523264, "grad_norm": 0.31756994653172216, "learning_rate": 3.3851809954751135e-05, "loss": 0.3817, "step": 768 }, { "epoch": 1.1723874904652938, "grad_norm": 0.3930699392453536, "learning_rate": 3.382352941176471e-05, "loss": 0.3444, "step": 769 }, { "epoch": 1.1739130434782608, "grad_norm": 0.2934629702775079, "learning_rate": 3.3795248868778285e-05, "loss": 0.3812, "step": 770 }, { "epoch": 1.1754385964912282, "grad_norm": 0.4259709066297283, "learning_rate": 3.376696832579186e-05, "loss": 0.3654, "step": 771 }, { "epoch": 1.1769641495041954, "grad_norm": 0.32767482770302686, "learning_rate": 3.3738687782805434e-05, "loss": 0.3705, "step": 772 }, { "epoch": 1.1784897025171626, "grad_norm": 0.3135396750214026, "learning_rate": 3.371040723981901e-05, "loss": 0.3413, "step": 773 }, { "epoch": 1.1800152555301298, "grad_norm": 0.3218790344860688, "learning_rate": 3.3682126696832584e-05, "loss": 0.3368, "step": 774 }, { "epoch": 1.181540808543097, "grad_norm": 0.30041225001107286, "learning_rate": 3.365384615384616e-05, "loss": 0.3563, "step": 775 }, { "epoch": 1.1830663615560641, "grad_norm": 0.31187294692885215, "learning_rate": 3.3625565610859734e-05, "loss": 0.3754, "step": 776 }, { "epoch": 1.1845919145690313, "grad_norm": 0.29331893844901186, "learning_rate": 3.359728506787331e-05, "loss": 0.3869, "step": 777 }, { "epoch": 1.1861174675819985, "grad_norm": 0.32278247680401856, "learning_rate": 3.356900452488688e-05, "loss": 0.3556, "step": 778 }, { "epoch": 1.1876430205949657, "grad_norm": 0.2922180269416971, "learning_rate": 3.354072398190045e-05, "loss": 0.3616, "step": 779 }, { "epoch": 1.189168573607933, "grad_norm": 0.2982417126804559, "learning_rate": 3.351244343891403e-05, "loss": 0.3709, "step": 780 }, { "epoch": 1.1906941266209001, "grad_norm": 0.32322547000051394, "learning_rate": 3.34841628959276e-05, "loss": 0.3655, "step": 781 }, { "epoch": 1.1922196796338673, "grad_norm": 0.2845836850044046, "learning_rate": 3.345588235294118e-05, "loss": 0.3712, "step": 782 }, { "epoch": 1.1937452326468345, "grad_norm": 0.3234529711774025, "learning_rate": 3.342760180995475e-05, "loss": 0.3634, "step": 783 }, { "epoch": 1.1952707856598017, "grad_norm": 0.2968216843603346, "learning_rate": 3.339932126696833e-05, "loss": 0.3585, "step": 784 }, { "epoch": 1.196796338672769, "grad_norm": 0.3258500072240317, "learning_rate": 3.33710407239819e-05, "loss": 0.3419, "step": 785 }, { "epoch": 1.198321891685736, "grad_norm": 0.301632718296814, "learning_rate": 3.334276018099548e-05, "loss": 0.3695, "step": 786 }, { "epoch": 1.1998474446987033, "grad_norm": 0.3437491492447937, "learning_rate": 3.331447963800905e-05, "loss": 0.3964, "step": 787 }, { "epoch": 1.2013729977116705, "grad_norm": 0.2861802147473255, "learning_rate": 3.3286199095022626e-05, "loss": 0.3439, "step": 788 }, { "epoch": 1.2028985507246377, "grad_norm": 0.3006882279466217, "learning_rate": 3.32579185520362e-05, "loss": 0.3739, "step": 789 }, { "epoch": 1.204424103737605, "grad_norm": 0.3098402169351283, "learning_rate": 3.3229638009049776e-05, "loss": 0.3531, "step": 790 }, { "epoch": 1.205949656750572, "grad_norm": 0.32665290569631905, "learning_rate": 3.320135746606335e-05, "loss": 0.3622, "step": 791 }, { "epoch": 1.2074752097635393, "grad_norm": 0.3215764536950064, "learning_rate": 3.3173076923076926e-05, "loss": 0.3488, "step": 792 }, { "epoch": 1.2090007627765065, "grad_norm": 0.31311524716080635, "learning_rate": 3.31447963800905e-05, "loss": 0.355, "step": 793 }, { "epoch": 1.2105263157894737, "grad_norm": 0.3259800523241062, "learning_rate": 3.3116515837104076e-05, "loss": 0.3806, "step": 794 }, { "epoch": 1.2120518688024409, "grad_norm": 0.32780641136102046, "learning_rate": 3.308823529411765e-05, "loss": 0.3577, "step": 795 }, { "epoch": 1.213577421815408, "grad_norm": 0.3921416862158077, "learning_rate": 3.305995475113122e-05, "loss": 0.3802, "step": 796 }, { "epoch": 1.2151029748283753, "grad_norm": 0.3229730291711872, "learning_rate": 3.3031674208144794e-05, "loss": 0.3569, "step": 797 }, { "epoch": 1.2166285278413425, "grad_norm": 0.34701866023125033, "learning_rate": 3.300339366515837e-05, "loss": 0.3581, "step": 798 }, { "epoch": 1.2181540808543097, "grad_norm": 0.3050738810604449, "learning_rate": 3.2975113122171944e-05, "loss": 0.3592, "step": 799 }, { "epoch": 1.2196796338672768, "grad_norm": 0.30131562445525706, "learning_rate": 3.294683257918552e-05, "loss": 0.3511, "step": 800 }, { "epoch": 1.221205186880244, "grad_norm": 0.3218951058374966, "learning_rate": 3.2918552036199094e-05, "loss": 0.354, "step": 801 }, { "epoch": 1.2227307398932112, "grad_norm": 0.3185090220830481, "learning_rate": 3.289027149321267e-05, "loss": 0.3621, "step": 802 }, { "epoch": 1.2242562929061784, "grad_norm": 0.3232118365716808, "learning_rate": 3.2861990950226244e-05, "loss": 0.3619, "step": 803 }, { "epoch": 1.2257818459191456, "grad_norm": 0.29661655709766904, "learning_rate": 3.283371040723982e-05, "loss": 0.3503, "step": 804 }, { "epoch": 1.2273073989321128, "grad_norm": 0.3384133735460187, "learning_rate": 3.2805429864253393e-05, "loss": 0.3834, "step": 805 }, { "epoch": 1.22883295194508, "grad_norm": 0.3147350337645927, "learning_rate": 3.277714932126697e-05, "loss": 0.3592, "step": 806 }, { "epoch": 1.2303585049580472, "grad_norm": 0.3533576529469757, "learning_rate": 3.274886877828054e-05, "loss": 0.3733, "step": 807 }, { "epoch": 1.2318840579710144, "grad_norm": 0.3010179576349225, "learning_rate": 3.272058823529412e-05, "loss": 0.372, "step": 808 }, { "epoch": 1.2334096109839816, "grad_norm": 0.2870592332124842, "learning_rate": 3.269230769230769e-05, "loss": 0.358, "step": 809 }, { "epoch": 1.2349351639969488, "grad_norm": 0.30448331877368384, "learning_rate": 3.266402714932127e-05, "loss": 0.3458, "step": 810 }, { "epoch": 1.236460717009916, "grad_norm": 0.29511033746620324, "learning_rate": 3.263574660633484e-05, "loss": 0.3673, "step": 811 }, { "epoch": 1.2379862700228834, "grad_norm": 0.3181030940016441, "learning_rate": 3.260746606334842e-05, "loss": 0.3388, "step": 812 }, { "epoch": 1.2395118230358504, "grad_norm": 0.3282222502506622, "learning_rate": 3.257918552036199e-05, "loss": 0.3369, "step": 813 }, { "epoch": 1.2410373760488178, "grad_norm": 0.291737865816559, "learning_rate": 3.255090497737557e-05, "loss": 0.3554, "step": 814 }, { "epoch": 1.2425629290617848, "grad_norm": 0.35011657763506077, "learning_rate": 3.252262443438914e-05, "loss": 0.3798, "step": 815 }, { "epoch": 1.2440884820747522, "grad_norm": 0.3372547556613035, "learning_rate": 3.249434389140272e-05, "loss": 0.3618, "step": 816 }, { "epoch": 1.2456140350877192, "grad_norm": 0.28498304374671685, "learning_rate": 3.246606334841629e-05, "loss": 0.3594, "step": 817 }, { "epoch": 1.2471395881006866, "grad_norm": 0.27105058112297437, "learning_rate": 3.243778280542987e-05, "loss": 0.3832, "step": 818 }, { "epoch": 1.2486651411136538, "grad_norm": 0.28098965114993046, "learning_rate": 3.240950226244344e-05, "loss": 0.3556, "step": 819 }, { "epoch": 1.250190694126621, "grad_norm": 0.303814944684156, "learning_rate": 3.238122171945702e-05, "loss": 0.3836, "step": 820 }, { "epoch": 1.2517162471395882, "grad_norm": 0.3006145501890592, "learning_rate": 3.235294117647059e-05, "loss": 0.363, "step": 821 }, { "epoch": 1.2532418001525554, "grad_norm": 0.30079048331556246, "learning_rate": 3.232466063348417e-05, "loss": 0.3694, "step": 822 }, { "epoch": 1.2547673531655226, "grad_norm": 0.331713280169309, "learning_rate": 3.229638009049774e-05, "loss": 0.3637, "step": 823 }, { "epoch": 1.2562929061784898, "grad_norm": 0.2981761051194111, "learning_rate": 3.226809954751132e-05, "loss": 0.3622, "step": 824 }, { "epoch": 1.257818459191457, "grad_norm": 0.29435196472687214, "learning_rate": 3.223981900452489e-05, "loss": 0.3635, "step": 825 }, { "epoch": 1.2593440122044242, "grad_norm": 0.3157883747870468, "learning_rate": 3.221153846153847e-05, "loss": 0.3686, "step": 826 }, { "epoch": 1.2608695652173914, "grad_norm": 0.27949744665578796, "learning_rate": 3.218325791855204e-05, "loss": 0.3541, "step": 827 }, { "epoch": 1.2623951182303585, "grad_norm": 0.33247724089906183, "learning_rate": 3.215497737556562e-05, "loss": 0.3667, "step": 828 }, { "epoch": 1.2639206712433257, "grad_norm": 0.3009010590962123, "learning_rate": 3.212669683257919e-05, "loss": 0.3615, "step": 829 }, { "epoch": 1.265446224256293, "grad_norm": 0.3850122893493692, "learning_rate": 3.2098416289592766e-05, "loss": 0.3489, "step": 830 }, { "epoch": 1.2669717772692601, "grad_norm": 0.2793530437208135, "learning_rate": 3.207013574660634e-05, "loss": 0.3456, "step": 831 }, { "epoch": 1.2684973302822273, "grad_norm": 0.2873691131501744, "learning_rate": 3.2041855203619916e-05, "loss": 0.3482, "step": 832 }, { "epoch": 1.2700228832951945, "grad_norm": 0.33875053389666104, "learning_rate": 3.2013574660633484e-05, "loss": 0.3728, "step": 833 }, { "epoch": 1.2715484363081617, "grad_norm": 0.2770726043750035, "learning_rate": 3.198529411764706e-05, "loss": 0.3639, "step": 834 }, { "epoch": 1.273073989321129, "grad_norm": 0.29817024482451976, "learning_rate": 3.1957013574660634e-05, "loss": 0.343, "step": 835 }, { "epoch": 1.2745995423340961, "grad_norm": 0.3207997743587917, "learning_rate": 3.192873303167421e-05, "loss": 0.3759, "step": 836 }, { "epoch": 1.2761250953470633, "grad_norm": 0.31907516199921254, "learning_rate": 3.1900452488687784e-05, "loss": 0.346, "step": 837 }, { "epoch": 1.2776506483600305, "grad_norm": 0.28184297727430424, "learning_rate": 3.187217194570136e-05, "loss": 0.3616, "step": 838 }, { "epoch": 1.2791762013729977, "grad_norm": 0.34314797427374555, "learning_rate": 3.1843891402714934e-05, "loss": 0.3554, "step": 839 }, { "epoch": 1.280701754385965, "grad_norm": 0.280471308804091, "learning_rate": 3.181561085972851e-05, "loss": 0.3424, "step": 840 }, { "epoch": 1.282227307398932, "grad_norm": 0.2855896850980244, "learning_rate": 3.1787330316742084e-05, "loss": 0.3332, "step": 841 }, { "epoch": 1.2837528604118993, "grad_norm": 0.34692516832225134, "learning_rate": 3.175904977375566e-05, "loss": 0.3628, "step": 842 }, { "epoch": 1.2852784134248665, "grad_norm": 0.30851791088685726, "learning_rate": 3.1730769230769234e-05, "loss": 0.3525, "step": 843 }, { "epoch": 1.2868039664378337, "grad_norm": 0.30286368684786696, "learning_rate": 3.170248868778281e-05, "loss": 0.382, "step": 844 }, { "epoch": 1.2883295194508009, "grad_norm": 0.3970469109191316, "learning_rate": 3.167420814479638e-05, "loss": 0.3714, "step": 845 }, { "epoch": 1.289855072463768, "grad_norm": 0.304624736373359, "learning_rate": 3.164592760180995e-05, "loss": 0.3584, "step": 846 }, { "epoch": 1.2913806254767353, "grad_norm": 0.3835352589804246, "learning_rate": 3.161764705882353e-05, "loss": 0.3686, "step": 847 }, { "epoch": 1.2929061784897025, "grad_norm": 0.35066543868662653, "learning_rate": 3.15893665158371e-05, "loss": 0.3591, "step": 848 }, { "epoch": 1.2944317315026697, "grad_norm": 0.29046790965500097, "learning_rate": 3.1561085972850676e-05, "loss": 0.3719, "step": 849 }, { "epoch": 1.2959572845156369, "grad_norm": 0.3115775674208215, "learning_rate": 3.153280542986425e-05, "loss": 0.3539, "step": 850 }, { "epoch": 1.297482837528604, "grad_norm": 0.34320297000266514, "learning_rate": 3.1504524886877826e-05, "loss": 0.347, "step": 851 }, { "epoch": 1.2990083905415712, "grad_norm": 0.26732549348264156, "learning_rate": 3.14762443438914e-05, "loss": 0.3716, "step": 852 }, { "epoch": 1.3005339435545384, "grad_norm": 0.31372005245468704, "learning_rate": 3.1447963800904976e-05, "loss": 0.3558, "step": 853 }, { "epoch": 1.3020594965675056, "grad_norm": 0.38480946323876164, "learning_rate": 3.141968325791855e-05, "loss": 0.3743, "step": 854 }, { "epoch": 1.303585049580473, "grad_norm": 0.2887008874291651, "learning_rate": 3.1391402714932126e-05, "loss": 0.3591, "step": 855 }, { "epoch": 1.30511060259344, "grad_norm": 0.35057428594124657, "learning_rate": 3.13631221719457e-05, "loss": 0.354, "step": 856 }, { "epoch": 1.3066361556064074, "grad_norm": 0.2913609344492864, "learning_rate": 3.1334841628959276e-05, "loss": 0.3504, "step": 857 }, { "epoch": 1.3081617086193744, "grad_norm": 0.2972879795549247, "learning_rate": 3.130656108597285e-05, "loss": 0.3552, "step": 858 }, { "epoch": 1.3096872616323418, "grad_norm": 0.34992912360534273, "learning_rate": 3.1278280542986426e-05, "loss": 0.3759, "step": 859 }, { "epoch": 1.3112128146453088, "grad_norm": 0.30888982926374814, "learning_rate": 3.125e-05, "loss": 0.3524, "step": 860 }, { "epoch": 1.3127383676582762, "grad_norm": 0.3238906842502274, "learning_rate": 3.1221719457013576e-05, "loss": 0.3698, "step": 861 }, { "epoch": 1.3142639206712432, "grad_norm": 0.34867190911596113, "learning_rate": 3.119343891402715e-05, "loss": 0.3625, "step": 862 }, { "epoch": 1.3157894736842106, "grad_norm": 0.3068787363874944, "learning_rate": 3.1165158371040725e-05, "loss": 0.3537, "step": 863 }, { "epoch": 1.3173150266971776, "grad_norm": 0.3401345580977144, "learning_rate": 3.11368778280543e-05, "loss": 0.3656, "step": 864 }, { "epoch": 1.318840579710145, "grad_norm": 0.29099167720513563, "learning_rate": 3.1108597285067875e-05, "loss": 0.3665, "step": 865 }, { "epoch": 1.3203661327231122, "grad_norm": 0.3106144527995552, "learning_rate": 3.108031674208145e-05, "loss": 0.3593, "step": 866 }, { "epoch": 1.3218916857360794, "grad_norm": 0.30608279067971755, "learning_rate": 3.1052036199095025e-05, "loss": 0.3436, "step": 867 }, { "epoch": 1.3234172387490466, "grad_norm": 0.26822407752522154, "learning_rate": 3.10237556561086e-05, "loss": 0.3744, "step": 868 }, { "epoch": 1.3249427917620138, "grad_norm": 0.31246897061981854, "learning_rate": 3.0995475113122175e-05, "loss": 0.348, "step": 869 }, { "epoch": 1.326468344774981, "grad_norm": 0.3087649820652163, "learning_rate": 3.096719457013575e-05, "loss": 0.3799, "step": 870 }, { "epoch": 1.3279938977879482, "grad_norm": 0.2897816156420314, "learning_rate": 3.0938914027149325e-05, "loss": 0.3632, "step": 871 }, { "epoch": 1.3295194508009154, "grad_norm": 0.2927201059145663, "learning_rate": 3.09106334841629e-05, "loss": 0.3628, "step": 872 }, { "epoch": 1.3310450038138826, "grad_norm": 0.2706527799496889, "learning_rate": 3.0882352941176475e-05, "loss": 0.38, "step": 873 }, { "epoch": 1.3325705568268498, "grad_norm": 0.32951808801290206, "learning_rate": 3.085407239819005e-05, "loss": 0.3631, "step": 874 }, { "epoch": 1.334096109839817, "grad_norm": 0.2640871507606413, "learning_rate": 3.0825791855203625e-05, "loss": 0.3552, "step": 875 }, { "epoch": 1.3356216628527842, "grad_norm": 0.3103873454680314, "learning_rate": 3.07975113122172e-05, "loss": 0.373, "step": 876 }, { "epoch": 1.3371472158657514, "grad_norm": 0.3424160093358527, "learning_rate": 3.0769230769230774e-05, "loss": 0.3807, "step": 877 }, { "epoch": 1.3386727688787186, "grad_norm": 0.25886083224510276, "learning_rate": 3.074095022624435e-05, "loss": 0.362, "step": 878 }, { "epoch": 1.3401983218916858, "grad_norm": 0.3465546229338862, "learning_rate": 3.0712669683257924e-05, "loss": 0.3803, "step": 879 }, { "epoch": 1.341723874904653, "grad_norm": 0.2874590053237379, "learning_rate": 3.06843891402715e-05, "loss": 0.3704, "step": 880 }, { "epoch": 1.3432494279176201, "grad_norm": 0.3183252162801128, "learning_rate": 3.0656108597285074e-05, "loss": 0.3713, "step": 881 }, { "epoch": 1.3447749809305873, "grad_norm": 0.261964244508055, "learning_rate": 3.062782805429865e-05, "loss": 0.3499, "step": 882 }, { "epoch": 1.3463005339435545, "grad_norm": 0.3029794289138747, "learning_rate": 3.0599547511312224e-05, "loss": 0.3451, "step": 883 }, { "epoch": 1.3478260869565217, "grad_norm": 0.25749720551996796, "learning_rate": 3.05712669683258e-05, "loss": 0.3524, "step": 884 }, { "epoch": 1.349351639969489, "grad_norm": 0.3029928284748562, "learning_rate": 3.0542986425339374e-05, "loss": 0.3599, "step": 885 }, { "epoch": 1.3508771929824561, "grad_norm": 0.327119716411293, "learning_rate": 3.0514705882352945e-05, "loss": 0.3528, "step": 886 }, { "epoch": 1.3524027459954233, "grad_norm": 0.2698413556067364, "learning_rate": 3.048642533936652e-05, "loss": 0.375, "step": 887 }, { "epoch": 1.3539282990083905, "grad_norm": 0.3272760606894408, "learning_rate": 3.045814479638009e-05, "loss": 0.3676, "step": 888 }, { "epoch": 1.3554538520213577, "grad_norm": 0.2719296662251019, "learning_rate": 3.0429864253393663e-05, "loss": 0.3639, "step": 889 }, { "epoch": 1.356979405034325, "grad_norm": 0.32475898255424634, "learning_rate": 3.0401583710407238e-05, "loss": 0.3592, "step": 890 }, { "epoch": 1.358504958047292, "grad_norm": 0.3387603857499437, "learning_rate": 3.0373303167420813e-05, "loss": 0.3697, "step": 891 }, { "epoch": 1.3600305110602593, "grad_norm": 0.30945122385144275, "learning_rate": 3.0345022624434388e-05, "loss": 0.3642, "step": 892 }, { "epoch": 1.3615560640732265, "grad_norm": 0.3564246984172524, "learning_rate": 3.0316742081447963e-05, "loss": 0.3543, "step": 893 }, { "epoch": 1.3630816170861937, "grad_norm": 0.3389941946276307, "learning_rate": 3.0288461538461538e-05, "loss": 0.3448, "step": 894 }, { "epoch": 1.3646071700991609, "grad_norm": 0.33589296350570197, "learning_rate": 3.0260180995475113e-05, "loss": 0.393, "step": 895 }, { "epoch": 1.366132723112128, "grad_norm": 0.39411677168629144, "learning_rate": 3.0231900452488688e-05, "loss": 0.3712, "step": 896 }, { "epoch": 1.3676582761250953, "grad_norm": 0.29139252624651535, "learning_rate": 3.0203619909502263e-05, "loss": 0.3568, "step": 897 }, { "epoch": 1.3691838291380625, "grad_norm": 0.32917534344591504, "learning_rate": 3.0175339366515838e-05, "loss": 0.3563, "step": 898 }, { "epoch": 1.3707093821510297, "grad_norm": 0.3586058833182019, "learning_rate": 3.0147058823529413e-05, "loss": 0.3768, "step": 899 }, { "epoch": 1.372234935163997, "grad_norm": 0.32610430330609336, "learning_rate": 3.0118778280542987e-05, "loss": 0.3327, "step": 900 }, { "epoch": 1.373760488176964, "grad_norm": 0.2730698231539029, "learning_rate": 3.0090497737556562e-05, "loss": 0.3705, "step": 901 }, { "epoch": 1.3752860411899315, "grad_norm": 0.38162887629077497, "learning_rate": 3.0062217194570137e-05, "loss": 0.3881, "step": 902 }, { "epoch": 1.3768115942028984, "grad_norm": 0.2918292072527327, "learning_rate": 3.0033936651583712e-05, "loss": 0.3787, "step": 903 }, { "epoch": 1.3783371472158659, "grad_norm": 0.3316174091604833, "learning_rate": 3.0005656108597284e-05, "loss": 0.3721, "step": 904 }, { "epoch": 1.3798627002288328, "grad_norm": 0.29570381112066074, "learning_rate": 2.997737556561086e-05, "loss": 0.3666, "step": 905 }, { "epoch": 1.3813882532418003, "grad_norm": 0.2967088574339795, "learning_rate": 2.9949095022624434e-05, "loss": 0.3554, "step": 906 }, { "epoch": 1.3829138062547672, "grad_norm": 0.27602509705061945, "learning_rate": 2.992081447963801e-05, "loss": 0.3353, "step": 907 }, { "epoch": 1.3844393592677346, "grad_norm": 0.3747530700197461, "learning_rate": 2.9892533936651583e-05, "loss": 0.3575, "step": 908 }, { "epoch": 1.3859649122807016, "grad_norm": 0.2423474039649387, "learning_rate": 2.986425339366516e-05, "loss": 0.3718, "step": 909 }, { "epoch": 1.387490465293669, "grad_norm": 0.3502041463012035, "learning_rate": 2.9835972850678733e-05, "loss": 0.3634, "step": 910 }, { "epoch": 1.3890160183066362, "grad_norm": 0.33474562431608373, "learning_rate": 2.9807692307692308e-05, "loss": 0.3475, "step": 911 }, { "epoch": 1.3905415713196034, "grad_norm": 0.29482081237857927, "learning_rate": 2.9779411764705883e-05, "loss": 0.362, "step": 912 }, { "epoch": 1.3920671243325706, "grad_norm": 0.31676577099408487, "learning_rate": 2.9751131221719458e-05, "loss": 0.3832, "step": 913 }, { "epoch": 1.3935926773455378, "grad_norm": 0.3091926441023363, "learning_rate": 2.9722850678733033e-05, "loss": 0.3602, "step": 914 }, { "epoch": 1.395118230358505, "grad_norm": 0.3423169827212822, "learning_rate": 2.9694570135746608e-05, "loss": 0.373, "step": 915 }, { "epoch": 1.3966437833714722, "grad_norm": 0.3861321988481063, "learning_rate": 2.9666289592760183e-05, "loss": 0.3461, "step": 916 }, { "epoch": 1.3981693363844394, "grad_norm": 0.2477590373288559, "learning_rate": 2.9638009049773758e-05, "loss": 0.3585, "step": 917 }, { "epoch": 1.3996948893974066, "grad_norm": 0.3462734110358965, "learning_rate": 2.9609728506787333e-05, "loss": 0.364, "step": 918 }, { "epoch": 1.4012204424103738, "grad_norm": 0.3549298578553296, "learning_rate": 2.9581447963800908e-05, "loss": 0.3652, "step": 919 }, { "epoch": 1.402745995423341, "grad_norm": 0.28082854659450834, "learning_rate": 2.9553167420814483e-05, "loss": 0.3456, "step": 920 }, { "epoch": 1.4042715484363082, "grad_norm": 0.3651819914413173, "learning_rate": 2.9524886877828057e-05, "loss": 0.3498, "step": 921 }, { "epoch": 1.4057971014492754, "grad_norm": 0.2907251145439764, "learning_rate": 2.9496606334841632e-05, "loss": 0.3515, "step": 922 }, { "epoch": 1.4073226544622426, "grad_norm": 0.31117315198769663, "learning_rate": 2.9468325791855207e-05, "loss": 0.3694, "step": 923 }, { "epoch": 1.4088482074752098, "grad_norm": 0.2777413894739365, "learning_rate": 2.9440045248868782e-05, "loss": 0.3579, "step": 924 }, { "epoch": 1.410373760488177, "grad_norm": 0.2850947734895607, "learning_rate": 2.9411764705882354e-05, "loss": 0.355, "step": 925 }, { "epoch": 1.4118993135011442, "grad_norm": 0.28718584180364953, "learning_rate": 2.938348416289593e-05, "loss": 0.3547, "step": 926 }, { "epoch": 1.4134248665141114, "grad_norm": 0.30177720321811924, "learning_rate": 2.9355203619909504e-05, "loss": 0.3542, "step": 927 }, { "epoch": 1.4149504195270786, "grad_norm": 0.2738434198101729, "learning_rate": 2.932692307692308e-05, "loss": 0.3544, "step": 928 }, { "epoch": 1.4164759725400458, "grad_norm": 0.280709136463144, "learning_rate": 2.9298642533936653e-05, "loss": 0.3834, "step": 929 }, { "epoch": 1.418001525553013, "grad_norm": 0.37334394553392075, "learning_rate": 2.927036199095023e-05, "loss": 0.3542, "step": 930 }, { "epoch": 1.4195270785659801, "grad_norm": 0.26271968527038353, "learning_rate": 2.9242081447963803e-05, "loss": 0.3771, "step": 931 }, { "epoch": 1.4210526315789473, "grad_norm": 0.36309696069780456, "learning_rate": 2.9213800904977378e-05, "loss": 0.331, "step": 932 }, { "epoch": 1.4225781845919145, "grad_norm": 0.3229983171484996, "learning_rate": 2.9185520361990953e-05, "loss": 0.3552, "step": 933 }, { "epoch": 1.4241037376048817, "grad_norm": 0.2867808487269563, "learning_rate": 2.9157239819004528e-05, "loss": 0.3593, "step": 934 }, { "epoch": 1.425629290617849, "grad_norm": 0.3812347579632455, "learning_rate": 2.9128959276018103e-05, "loss": 0.3398, "step": 935 }, { "epoch": 1.4271548436308161, "grad_norm": 0.27337327860944605, "learning_rate": 2.9100678733031678e-05, "loss": 0.3635, "step": 936 }, { "epoch": 1.4286803966437833, "grad_norm": 0.28407447135024466, "learning_rate": 2.9072398190045253e-05, "loss": 0.3452, "step": 937 }, { "epoch": 1.4302059496567505, "grad_norm": 0.3396521981593608, "learning_rate": 2.9044117647058828e-05, "loss": 0.3717, "step": 938 }, { "epoch": 1.4317315026697177, "grad_norm": 0.2960472888600466, "learning_rate": 2.9015837104072403e-05, "loss": 0.3401, "step": 939 }, { "epoch": 1.433257055682685, "grad_norm": 0.27062354579037, "learning_rate": 2.8987556561085978e-05, "loss": 0.3483, "step": 940 }, { "epoch": 1.434782608695652, "grad_norm": 0.328085268350545, "learning_rate": 2.8959276018099553e-05, "loss": 0.3467, "step": 941 }, { "epoch": 1.4363081617086193, "grad_norm": 0.6218991657452897, "learning_rate": 2.8930995475113127e-05, "loss": 0.3473, "step": 942 }, { "epoch": 1.4378337147215865, "grad_norm": 0.28237492563677336, "learning_rate": 2.8902714932126696e-05, "loss": 0.3527, "step": 943 }, { "epoch": 1.4393592677345537, "grad_norm": 0.32398714294619146, "learning_rate": 2.887443438914027e-05, "loss": 0.3416, "step": 944 }, { "epoch": 1.4408848207475211, "grad_norm": 0.2943516945820961, "learning_rate": 2.8846153846153845e-05, "loss": 0.3547, "step": 945 }, { "epoch": 1.442410373760488, "grad_norm": 0.30836170883035285, "learning_rate": 2.881787330316742e-05, "loss": 0.3647, "step": 946 }, { "epoch": 1.4439359267734555, "grad_norm": 0.2577166458071984, "learning_rate": 2.8789592760180995e-05, "loss": 0.3501, "step": 947 }, { "epoch": 1.4454614797864225, "grad_norm": 0.32082272185702887, "learning_rate": 2.876131221719457e-05, "loss": 0.3564, "step": 948 }, { "epoch": 1.44698703279939, "grad_norm": 0.3139662067445532, "learning_rate": 2.8733031674208145e-05, "loss": 0.3443, "step": 949 }, { "epoch": 1.4485125858123569, "grad_norm": 0.2910757011766852, "learning_rate": 2.870475113122172e-05, "loss": 0.3581, "step": 950 }, { "epoch": 1.4500381388253243, "grad_norm": 0.28386361642802616, "learning_rate": 2.8676470588235295e-05, "loss": 0.3338, "step": 951 }, { "epoch": 1.4515636918382913, "grad_norm": 0.3363851866953124, "learning_rate": 2.864819004524887e-05, "loss": 0.3668, "step": 952 }, { "epoch": 1.4530892448512587, "grad_norm": 0.28682700113769033, "learning_rate": 2.861990950226244e-05, "loss": 0.3489, "step": 953 }, { "epoch": 1.4546147978642257, "grad_norm": 0.34250574003412315, "learning_rate": 2.8591628959276016e-05, "loss": 0.3669, "step": 954 }, { "epoch": 1.456140350877193, "grad_norm": 0.2986214535047864, "learning_rate": 2.856334841628959e-05, "loss": 0.3657, "step": 955 }, { "epoch": 1.4576659038901603, "grad_norm": 0.3787608463147748, "learning_rate": 2.8535067873303166e-05, "loss": 0.3597, "step": 956 }, { "epoch": 1.4591914569031275, "grad_norm": 0.3267786485905336, "learning_rate": 2.850678733031674e-05, "loss": 0.3724, "step": 957 }, { "epoch": 1.4607170099160947, "grad_norm": 0.31418159784962285, "learning_rate": 2.8478506787330316e-05, "loss": 0.3484, "step": 958 }, { "epoch": 1.4622425629290619, "grad_norm": 0.2939226815677612, "learning_rate": 2.845022624434389e-05, "loss": 0.3761, "step": 959 }, { "epoch": 1.463768115942029, "grad_norm": 0.3860458724478377, "learning_rate": 2.8421945701357466e-05, "loss": 0.3524, "step": 960 }, { "epoch": 1.4652936689549962, "grad_norm": 0.29834736490111347, "learning_rate": 2.839366515837104e-05, "loss": 0.3412, "step": 961 }, { "epoch": 1.4668192219679634, "grad_norm": 0.31460938674534694, "learning_rate": 2.8365384615384616e-05, "loss": 0.3775, "step": 962 }, { "epoch": 1.4683447749809306, "grad_norm": 0.33998221019464425, "learning_rate": 2.833710407239819e-05, "loss": 0.3531, "step": 963 }, { "epoch": 1.4698703279938978, "grad_norm": 0.3187851407578482, "learning_rate": 2.8308823529411766e-05, "loss": 0.3504, "step": 964 }, { "epoch": 1.471395881006865, "grad_norm": 0.3008582110133443, "learning_rate": 2.828054298642534e-05, "loss": 0.3747, "step": 965 }, { "epoch": 1.4729214340198322, "grad_norm": 0.3379877823643842, "learning_rate": 2.8252262443438915e-05, "loss": 0.3548, "step": 966 }, { "epoch": 1.4744469870327994, "grad_norm": 0.32669622344411525, "learning_rate": 2.822398190045249e-05, "loss": 0.3926, "step": 967 }, { "epoch": 1.4759725400457666, "grad_norm": 0.3359159451283145, "learning_rate": 2.8195701357466065e-05, "loss": 0.3474, "step": 968 }, { "epoch": 1.4774980930587338, "grad_norm": 0.2967670934792688, "learning_rate": 2.816742081447964e-05, "loss": 0.3611, "step": 969 }, { "epoch": 1.479023646071701, "grad_norm": 0.28068318078555354, "learning_rate": 2.8139140271493215e-05, "loss": 0.3707, "step": 970 }, { "epoch": 1.4805491990846682, "grad_norm": 0.300059321953815, "learning_rate": 2.811085972850679e-05, "loss": 0.3586, "step": 971 }, { "epoch": 1.4820747520976354, "grad_norm": 0.2772202293623559, "learning_rate": 2.8082579185520365e-05, "loss": 0.3676, "step": 972 }, { "epoch": 1.4836003051106026, "grad_norm": 0.4347391203661483, "learning_rate": 2.805429864253394e-05, "loss": 0.4054, "step": 973 }, { "epoch": 1.4851258581235698, "grad_norm": 0.2826916995930012, "learning_rate": 2.802601809954751e-05, "loss": 0.3378, "step": 974 }, { "epoch": 1.486651411136537, "grad_norm": 0.2702680798707541, "learning_rate": 2.7997737556561086e-05, "loss": 0.3631, "step": 975 }, { "epoch": 1.4881769641495042, "grad_norm": 0.34094230534669256, "learning_rate": 2.796945701357466e-05, "loss": 0.3637, "step": 976 }, { "epoch": 1.4897025171624714, "grad_norm": 0.25898359192232395, "learning_rate": 2.7941176470588236e-05, "loss": 0.3571, "step": 977 }, { "epoch": 1.4912280701754386, "grad_norm": 0.2643098838726964, "learning_rate": 2.791289592760181e-05, "loss": 0.3537, "step": 978 }, { "epoch": 1.4927536231884058, "grad_norm": 0.2605464985479492, "learning_rate": 2.7884615384615386e-05, "loss": 0.349, "step": 979 }, { "epoch": 1.494279176201373, "grad_norm": 0.27108143053248857, "learning_rate": 2.785633484162896e-05, "loss": 0.3706, "step": 980 }, { "epoch": 1.4958047292143402, "grad_norm": 0.2704817900771619, "learning_rate": 2.7828054298642536e-05, "loss": 0.3461, "step": 981 }, { "epoch": 1.4973302822273074, "grad_norm": 0.2899164926239144, "learning_rate": 2.779977375565611e-05, "loss": 0.3723, "step": 982 }, { "epoch": 1.4988558352402745, "grad_norm": 0.2785684014126151, "learning_rate": 2.7771493212669686e-05, "loss": 0.3521, "step": 983 }, { "epoch": 1.500381388253242, "grad_norm": 0.3033011374253123, "learning_rate": 2.774321266968326e-05, "loss": 0.3672, "step": 984 }, { "epoch": 1.501906941266209, "grad_norm": 0.29001554724576417, "learning_rate": 2.7714932126696836e-05, "loss": 0.3541, "step": 985 }, { "epoch": 1.5034324942791764, "grad_norm": 0.2637769153662645, "learning_rate": 2.768665158371041e-05, "loss": 0.3453, "step": 986 }, { "epoch": 1.5049580472921433, "grad_norm": 0.31859956995944844, "learning_rate": 2.7658371040723985e-05, "loss": 0.3642, "step": 987 }, { "epoch": 1.5064836003051107, "grad_norm": 0.30786511606276296, "learning_rate": 2.763009049773756e-05, "loss": 0.3817, "step": 988 }, { "epoch": 1.5080091533180777, "grad_norm": 0.7871088988023625, "learning_rate": 2.7601809954751135e-05, "loss": 0.3639, "step": 989 }, { "epoch": 1.5095347063310451, "grad_norm": 0.29250827965833587, "learning_rate": 2.757352941176471e-05, "loss": 0.3716, "step": 990 }, { "epoch": 1.5110602593440121, "grad_norm": 0.3071196750628621, "learning_rate": 2.7545248868778285e-05, "loss": 0.365, "step": 991 }, { "epoch": 1.5125858123569795, "grad_norm": 0.297498513422697, "learning_rate": 2.751696832579186e-05, "loss": 0.3644, "step": 992 }, { "epoch": 1.5141113653699465, "grad_norm": 0.26976996748945353, "learning_rate": 2.7488687782805435e-05, "loss": 0.3619, "step": 993 }, { "epoch": 1.515636918382914, "grad_norm": 0.2537669456812244, "learning_rate": 2.7460407239819007e-05, "loss": 0.3386, "step": 994 }, { "epoch": 1.517162471395881, "grad_norm": 0.2607393590337182, "learning_rate": 2.743212669683258e-05, "loss": 0.3767, "step": 995 }, { "epoch": 1.5186880244088483, "grad_norm": 0.3172304093627381, "learning_rate": 2.7403846153846156e-05, "loss": 0.3608, "step": 996 }, { "epoch": 1.5202135774218153, "grad_norm": 0.2826545418764549, "learning_rate": 2.737556561085973e-05, "loss": 0.3426, "step": 997 }, { "epoch": 1.5217391304347827, "grad_norm": 0.28725902284398214, "learning_rate": 2.7347285067873306e-05, "loss": 0.3492, "step": 998 }, { "epoch": 1.5232646834477497, "grad_norm": 0.25912721460393867, "learning_rate": 2.7319004524886878e-05, "loss": 0.3441, "step": 999 }, { "epoch": 1.524790236460717, "grad_norm": 0.2858498636942156, "learning_rate": 2.7290723981900453e-05, "loss": 0.3602, "step": 1000 }, { "epoch": 1.526315789473684, "grad_norm": 0.25553181541395886, "learning_rate": 2.7262443438914028e-05, "loss": 0.3563, "step": 1001 }, { "epoch": 1.5278413424866515, "grad_norm": 0.27702274350997974, "learning_rate": 2.72341628959276e-05, "loss": 0.363, "step": 1002 }, { "epoch": 1.5293668954996185, "grad_norm": 0.2498225516613448, "learning_rate": 2.7205882352941174e-05, "loss": 0.37, "step": 1003 }, { "epoch": 1.5308924485125859, "grad_norm": 0.2722931385581554, "learning_rate": 2.717760180995475e-05, "loss": 0.3435, "step": 1004 }, { "epoch": 1.5324180015255529, "grad_norm": 0.2944093699451797, "learning_rate": 2.7149321266968324e-05, "loss": 0.3672, "step": 1005 }, { "epoch": 1.5339435545385203, "grad_norm": 0.33503518276045624, "learning_rate": 2.71210407239819e-05, "loss": 0.3826, "step": 1006 }, { "epoch": 1.5354691075514875, "grad_norm": 0.3195198109219537, "learning_rate": 2.7092760180995474e-05, "loss": 0.3649, "step": 1007 }, { "epoch": 1.5369946605644547, "grad_norm": 0.3223103696166716, "learning_rate": 2.706447963800905e-05, "loss": 0.3388, "step": 1008 }, { "epoch": 1.5385202135774219, "grad_norm": 0.3178599272266727, "learning_rate": 2.7036199095022624e-05, "loss": 0.3724, "step": 1009 }, { "epoch": 1.540045766590389, "grad_norm": 0.396105305752255, "learning_rate": 2.70079185520362e-05, "loss": 0.3582, "step": 1010 }, { "epoch": 1.5415713196033562, "grad_norm": 0.2865981786362419, "learning_rate": 2.6979638009049773e-05, "loss": 0.3553, "step": 1011 }, { "epoch": 1.5430968726163234, "grad_norm": 0.2741406732971974, "learning_rate": 2.695135746606335e-05, "loss": 0.3699, "step": 1012 }, { "epoch": 1.5446224256292906, "grad_norm": 0.3628274388020018, "learning_rate": 2.6923076923076923e-05, "loss": 0.3557, "step": 1013 }, { "epoch": 1.5461479786422578, "grad_norm": 0.2632174149702895, "learning_rate": 2.6894796380090498e-05, "loss": 0.3493, "step": 1014 }, { "epoch": 1.547673531655225, "grad_norm": 0.3070852405655027, "learning_rate": 2.6866515837104073e-05, "loss": 0.3691, "step": 1015 }, { "epoch": 1.5491990846681922, "grad_norm": 0.36250608817969165, "learning_rate": 2.6838235294117648e-05, "loss": 0.3555, "step": 1016 }, { "epoch": 1.5507246376811594, "grad_norm": 0.3098785499907234, "learning_rate": 2.6809954751131223e-05, "loss": 0.3558, "step": 1017 }, { "epoch": 1.5522501906941266, "grad_norm": 0.27316059247503877, "learning_rate": 2.6781674208144798e-05, "loss": 0.3588, "step": 1018 }, { "epoch": 1.5537757437070938, "grad_norm": 0.2972272061356339, "learning_rate": 2.6753393665158373e-05, "loss": 0.3632, "step": 1019 }, { "epoch": 1.555301296720061, "grad_norm": 0.31753866481409543, "learning_rate": 2.6725113122171948e-05, "loss": 0.353, "step": 1020 }, { "epoch": 1.5568268497330282, "grad_norm": 0.2906366023432111, "learning_rate": 2.6696832579185523e-05, "loss": 0.3605, "step": 1021 }, { "epoch": 1.5583524027459954, "grad_norm": 0.2873637141380983, "learning_rate": 2.6668552036199094e-05, "loss": 0.3526, "step": 1022 }, { "epoch": 1.5598779557589626, "grad_norm": 0.31017891125743946, "learning_rate": 2.664027149321267e-05, "loss": 0.3516, "step": 1023 }, { "epoch": 1.5614035087719298, "grad_norm": 0.26618304543670745, "learning_rate": 2.6611990950226244e-05, "loss": 0.3583, "step": 1024 }, { "epoch": 1.562929061784897, "grad_norm": 0.29218332109763234, "learning_rate": 2.658371040723982e-05, "loss": 0.3664, "step": 1025 }, { "epoch": 1.5644546147978642, "grad_norm": 0.3123753492525783, "learning_rate": 2.6555429864253394e-05, "loss": 0.3618, "step": 1026 }, { "epoch": 1.5659801678108314, "grad_norm": 0.2829971419216817, "learning_rate": 2.652714932126697e-05, "loss": 0.3435, "step": 1027 }, { "epoch": 1.5675057208237986, "grad_norm": 0.3098485377083609, "learning_rate": 2.6498868778280544e-05, "loss": 0.3634, "step": 1028 }, { "epoch": 1.569031273836766, "grad_norm": 0.3397680327078356, "learning_rate": 2.647058823529412e-05, "loss": 0.3388, "step": 1029 }, { "epoch": 1.570556826849733, "grad_norm": 0.2611132315563582, "learning_rate": 2.6442307692307694e-05, "loss": 0.3529, "step": 1030 }, { "epoch": 1.5720823798627004, "grad_norm": 0.3311778608545287, "learning_rate": 2.641402714932127e-05, "loss": 0.3707, "step": 1031 }, { "epoch": 1.5736079328756674, "grad_norm": 0.26732038199467656, "learning_rate": 2.6385746606334843e-05, "loss": 0.3501, "step": 1032 }, { "epoch": 1.5751334858886348, "grad_norm": 0.2609208630137583, "learning_rate": 2.635746606334842e-05, "loss": 0.356, "step": 1033 }, { "epoch": 1.5766590389016018, "grad_norm": 0.3136045635522527, "learning_rate": 2.6329185520361993e-05, "loss": 0.3585, "step": 1034 }, { "epoch": 1.5781845919145692, "grad_norm": 0.3249072050432297, "learning_rate": 2.6300904977375568e-05, "loss": 0.3414, "step": 1035 }, { "epoch": 1.5797101449275361, "grad_norm": 0.26133795101855833, "learning_rate": 2.6272624434389143e-05, "loss": 0.3481, "step": 1036 }, { "epoch": 1.5812356979405036, "grad_norm": 0.31212512391404534, "learning_rate": 2.6244343891402718e-05, "loss": 0.3613, "step": 1037 }, { "epoch": 1.5827612509534705, "grad_norm": 0.35309507964727094, "learning_rate": 2.6216063348416293e-05, "loss": 0.3604, "step": 1038 }, { "epoch": 1.584286803966438, "grad_norm": 0.306190740060231, "learning_rate": 2.6187782805429868e-05, "loss": 0.3537, "step": 1039 }, { "epoch": 1.585812356979405, "grad_norm": 0.3430592253511539, "learning_rate": 2.6159502262443443e-05, "loss": 0.3649, "step": 1040 }, { "epoch": 1.5873379099923723, "grad_norm": 0.25423343265707893, "learning_rate": 2.6131221719457018e-05, "loss": 0.3491, "step": 1041 }, { "epoch": 1.5888634630053393, "grad_norm": 0.31879599813326137, "learning_rate": 2.6102941176470593e-05, "loss": 0.334, "step": 1042 }, { "epoch": 1.5903890160183067, "grad_norm": 0.3526020975181312, "learning_rate": 2.6074660633484164e-05, "loss": 0.3449, "step": 1043 }, { "epoch": 1.5919145690312737, "grad_norm": 0.29467885194744725, "learning_rate": 2.604638009049774e-05, "loss": 0.3619, "step": 1044 }, { "epoch": 1.5934401220442411, "grad_norm": 0.34034480115002824, "learning_rate": 2.6018099547511314e-05, "loss": 0.3535, "step": 1045 }, { "epoch": 1.594965675057208, "grad_norm": 0.3217300078156638, "learning_rate": 2.598981900452489e-05, "loss": 0.3616, "step": 1046 }, { "epoch": 1.5964912280701755, "grad_norm": 0.2984356444433481, "learning_rate": 2.5961538461538464e-05, "loss": 0.343, "step": 1047 }, { "epoch": 1.5980167810831425, "grad_norm": 0.30167058667672475, "learning_rate": 2.593325791855204e-05, "loss": 0.3639, "step": 1048 }, { "epoch": 1.59954233409611, "grad_norm": 0.3447411948403999, "learning_rate": 2.5904977375565614e-05, "loss": 0.3627, "step": 1049 }, { "epoch": 1.6010678871090769, "grad_norm": 0.30545462997764716, "learning_rate": 2.587669683257919e-05, "loss": 0.354, "step": 1050 }, { "epoch": 1.6025934401220443, "grad_norm": 0.36253669498220914, "learning_rate": 2.5848416289592764e-05, "loss": 0.3543, "step": 1051 }, { "epoch": 1.6041189931350115, "grad_norm": 0.2893315894966898, "learning_rate": 2.582013574660634e-05, "loss": 0.3361, "step": 1052 }, { "epoch": 1.6056445461479787, "grad_norm": 0.28338286599488055, "learning_rate": 2.5791855203619913e-05, "loss": 0.3378, "step": 1053 }, { "epoch": 1.6071700991609459, "grad_norm": 0.36856151666537373, "learning_rate": 2.576357466063348e-05, "loss": 0.3711, "step": 1054 }, { "epoch": 1.608695652173913, "grad_norm": 0.2648417266736647, "learning_rate": 2.5735294117647057e-05, "loss": 0.3489, "step": 1055 }, { "epoch": 1.6102212051868803, "grad_norm": 0.33651790693752515, "learning_rate": 2.570701357466063e-05, "loss": 0.3533, "step": 1056 }, { "epoch": 1.6117467581998475, "grad_norm": 0.3577276023242142, "learning_rate": 2.5678733031674206e-05, "loss": 0.355, "step": 1057 }, { "epoch": 1.6132723112128147, "grad_norm": 0.31449301710939187, "learning_rate": 2.565045248868778e-05, "loss": 0.3622, "step": 1058 }, { "epoch": 1.6147978642257819, "grad_norm": 0.3836431681568273, "learning_rate": 2.5622171945701356e-05, "loss": 0.3534, "step": 1059 }, { "epoch": 1.616323417238749, "grad_norm": 0.3074858274015837, "learning_rate": 2.559389140271493e-05, "loss": 0.3692, "step": 1060 }, { "epoch": 1.6178489702517163, "grad_norm": 0.35145464345268557, "learning_rate": 2.5565610859728506e-05, "loss": 0.3819, "step": 1061 }, { "epoch": 1.6193745232646835, "grad_norm": 0.42904792608751186, "learning_rate": 2.553733031674208e-05, "loss": 0.3443, "step": 1062 }, { "epoch": 1.6209000762776506, "grad_norm": 0.3648145864334806, "learning_rate": 2.5509049773755656e-05, "loss": 0.3555, "step": 1063 }, { "epoch": 1.6224256292906178, "grad_norm": 0.3008847053369864, "learning_rate": 2.548076923076923e-05, "loss": 0.3839, "step": 1064 }, { "epoch": 1.623951182303585, "grad_norm": 0.30158142335885474, "learning_rate": 2.5452488687782806e-05, "loss": 0.3491, "step": 1065 }, { "epoch": 1.6254767353165522, "grad_norm": 0.3065027075189174, "learning_rate": 2.542420814479638e-05, "loss": 0.3412, "step": 1066 }, { "epoch": 1.6270022883295194, "grad_norm": 0.26099897859056526, "learning_rate": 2.5395927601809956e-05, "loss": 0.3509, "step": 1067 }, { "epoch": 1.6285278413424866, "grad_norm": 0.34823625202831604, "learning_rate": 2.536764705882353e-05, "loss": 0.3545, "step": 1068 }, { "epoch": 1.6300533943554538, "grad_norm": 0.3177426908959906, "learning_rate": 2.5339366515837106e-05, "loss": 0.3831, "step": 1069 }, { "epoch": 1.631578947368421, "grad_norm": 0.2718245446335484, "learning_rate": 2.531108597285068e-05, "loss": 0.3576, "step": 1070 }, { "epoch": 1.6331045003813882, "grad_norm": 0.3250728387532577, "learning_rate": 2.5282805429864252e-05, "loss": 0.3653, "step": 1071 }, { "epoch": 1.6346300533943554, "grad_norm": 0.29919739298789405, "learning_rate": 2.5254524886877827e-05, "loss": 0.3495, "step": 1072 }, { "epoch": 1.6361556064073226, "grad_norm": 0.27359159022559826, "learning_rate": 2.5226244343891402e-05, "loss": 0.3719, "step": 1073 }, { "epoch": 1.6376811594202898, "grad_norm": 0.307694420960446, "learning_rate": 2.5197963800904977e-05, "loss": 0.3539, "step": 1074 }, { "epoch": 1.639206712433257, "grad_norm": 0.2857275906083576, "learning_rate": 2.516968325791855e-05, "loss": 0.3625, "step": 1075 }, { "epoch": 1.6407322654462244, "grad_norm": 0.3016309769706346, "learning_rate": 2.5141402714932127e-05, "loss": 0.3472, "step": 1076 }, { "epoch": 1.6422578184591914, "grad_norm": 0.2893992539085371, "learning_rate": 2.51131221719457e-05, "loss": 0.3513, "step": 1077 }, { "epoch": 1.6437833714721588, "grad_norm": 0.3015736422719924, "learning_rate": 2.5084841628959276e-05, "loss": 0.3552, "step": 1078 }, { "epoch": 1.6453089244851258, "grad_norm": 0.34115369075418434, "learning_rate": 2.505656108597285e-05, "loss": 0.3512, "step": 1079 }, { "epoch": 1.6468344774980932, "grad_norm": 0.2932116677725194, "learning_rate": 2.5028280542986426e-05, "loss": 0.3719, "step": 1080 }, { "epoch": 1.6483600305110602, "grad_norm": 0.3563846217933727, "learning_rate": 2.5e-05, "loss": 0.3512, "step": 1081 }, { "epoch": 1.6498855835240276, "grad_norm": 0.2989294368662978, "learning_rate": 2.4971719457013576e-05, "loss": 0.3445, "step": 1082 }, { "epoch": 1.6514111365369946, "grad_norm": 0.2692803835475221, "learning_rate": 2.494343891402715e-05, "loss": 0.3432, "step": 1083 }, { "epoch": 1.652936689549962, "grad_norm": 0.33663493350448703, "learning_rate": 2.4915158371040726e-05, "loss": 0.3412, "step": 1084 }, { "epoch": 1.654462242562929, "grad_norm": 0.2651482381967112, "learning_rate": 2.48868778280543e-05, "loss": 0.3377, "step": 1085 }, { "epoch": 1.6559877955758964, "grad_norm": 0.3245624439567372, "learning_rate": 2.4858597285067876e-05, "loss": 0.3618, "step": 1086 }, { "epoch": 1.6575133485888633, "grad_norm": 0.3026236568955183, "learning_rate": 2.483031674208145e-05, "loss": 0.3602, "step": 1087 }, { "epoch": 1.6590389016018308, "grad_norm": 0.26063845935190083, "learning_rate": 2.4802036199095026e-05, "loss": 0.3424, "step": 1088 }, { "epoch": 1.6605644546147977, "grad_norm": 0.3687881115812144, "learning_rate": 2.47737556561086e-05, "loss": 0.3586, "step": 1089 }, { "epoch": 1.6620900076277652, "grad_norm": 0.35089646828708954, "learning_rate": 2.4745475113122176e-05, "loss": 0.3657, "step": 1090 }, { "epoch": 1.6636155606407321, "grad_norm": 0.30721272342831885, "learning_rate": 2.471719457013575e-05, "loss": 0.3656, "step": 1091 }, { "epoch": 1.6651411136536995, "grad_norm": 0.3823653086811772, "learning_rate": 2.4688914027149322e-05, "loss": 0.353, "step": 1092 }, { "epoch": 1.6666666666666665, "grad_norm": 0.25603068665826634, "learning_rate": 2.4660633484162897e-05, "loss": 0.3359, "step": 1093 }, { "epoch": 1.668192219679634, "grad_norm": 0.35714076897788605, "learning_rate": 2.4632352941176472e-05, "loss": 0.3382, "step": 1094 }, { "epoch": 1.669717772692601, "grad_norm": 0.282109252444298, "learning_rate": 2.4604072398190047e-05, "loss": 0.3821, "step": 1095 }, { "epoch": 1.6712433257055683, "grad_norm": 0.3360487667269666, "learning_rate": 2.4575791855203618e-05, "loss": 0.3706, "step": 1096 }, { "epoch": 1.6727688787185355, "grad_norm": 0.30411146212699386, "learning_rate": 2.4547511312217193e-05, "loss": 0.3564, "step": 1097 }, { "epoch": 1.6742944317315027, "grad_norm": 0.3373957271748902, "learning_rate": 2.4519230769230768e-05, "loss": 0.3744, "step": 1098 }, { "epoch": 1.67581998474447, "grad_norm": 0.33478015923073795, "learning_rate": 2.4490950226244343e-05, "loss": 0.3506, "step": 1099 }, { "epoch": 1.677345537757437, "grad_norm": 0.3440455231693872, "learning_rate": 2.4462669683257918e-05, "loss": 0.3436, "step": 1100 }, { "epoch": 1.6788710907704043, "grad_norm": 0.3248836927288912, "learning_rate": 2.4434389140271493e-05, "loss": 0.3485, "step": 1101 }, { "epoch": 1.6803966437833715, "grad_norm": 0.28564949845803367, "learning_rate": 2.4406108597285068e-05, "loss": 0.3495, "step": 1102 }, { "epoch": 1.6819221967963387, "grad_norm": 0.32639817583497654, "learning_rate": 2.4377828054298643e-05, "loss": 0.3641, "step": 1103 }, { "epoch": 1.683447749809306, "grad_norm": 0.4033437098205818, "learning_rate": 2.4349547511312218e-05, "loss": 0.3604, "step": 1104 }, { "epoch": 1.684973302822273, "grad_norm": 0.2952911550727721, "learning_rate": 2.4321266968325793e-05, "loss": 0.3472, "step": 1105 }, { "epoch": 1.6864988558352403, "grad_norm": 0.34607615677025894, "learning_rate": 2.4292986425339368e-05, "loss": 0.3569, "step": 1106 }, { "epoch": 1.6880244088482075, "grad_norm": 0.28303207396141256, "learning_rate": 2.4264705882352942e-05, "loss": 0.3489, "step": 1107 }, { "epoch": 1.6895499618611747, "grad_norm": 0.33488192131918704, "learning_rate": 2.4236425339366517e-05, "loss": 0.3554, "step": 1108 }, { "epoch": 1.6910755148741419, "grad_norm": 0.3170245862036528, "learning_rate": 2.4208144796380092e-05, "loss": 0.3492, "step": 1109 }, { "epoch": 1.692601067887109, "grad_norm": 0.2855150436288049, "learning_rate": 2.4179864253393667e-05, "loss": 0.3625, "step": 1110 }, { "epoch": 1.6941266209000763, "grad_norm": 0.34897097012771044, "learning_rate": 2.4151583710407242e-05, "loss": 0.3552, "step": 1111 }, { "epoch": 1.6956521739130435, "grad_norm": 0.28959190253597444, "learning_rate": 2.4123303167420817e-05, "loss": 0.3627, "step": 1112 }, { "epoch": 1.6971777269260107, "grad_norm": 0.2860963227704419, "learning_rate": 2.4095022624434392e-05, "loss": 0.3296, "step": 1113 }, { "epoch": 1.6987032799389779, "grad_norm": 0.34087803414499385, "learning_rate": 2.4066742081447967e-05, "loss": 0.3435, "step": 1114 }, { "epoch": 1.700228832951945, "grad_norm": 0.3519992720923535, "learning_rate": 2.4038461538461542e-05, "loss": 0.344, "step": 1115 }, { "epoch": 1.7017543859649122, "grad_norm": 0.2894323859254795, "learning_rate": 2.4010180995475113e-05, "loss": 0.3533, "step": 1116 }, { "epoch": 1.7032799389778794, "grad_norm": 0.3656267255355836, "learning_rate": 2.3981900452488688e-05, "loss": 0.3345, "step": 1117 }, { "epoch": 1.7048054919908466, "grad_norm": 0.29966413753556864, "learning_rate": 2.3953619909502263e-05, "loss": 0.3565, "step": 1118 }, { "epoch": 1.7063310450038138, "grad_norm": 0.36104314679277005, "learning_rate": 2.3925339366515838e-05, "loss": 0.3597, "step": 1119 }, { "epoch": 1.707856598016781, "grad_norm": 0.2742093073924787, "learning_rate": 2.3897058823529413e-05, "loss": 0.3713, "step": 1120 }, { "epoch": 1.7093821510297484, "grad_norm": 0.32224466386309575, "learning_rate": 2.3868778280542988e-05, "loss": 0.3505, "step": 1121 }, { "epoch": 1.7109077040427154, "grad_norm": 0.32880806589428874, "learning_rate": 2.3840497737556563e-05, "loss": 0.3753, "step": 1122 }, { "epoch": 1.7124332570556828, "grad_norm": 0.25160018186879773, "learning_rate": 2.3812217194570134e-05, "loss": 0.3636, "step": 1123 }, { "epoch": 1.7139588100686498, "grad_norm": 0.3080208708463818, "learning_rate": 2.378393665158371e-05, "loss": 0.3514, "step": 1124 }, { "epoch": 1.7154843630816172, "grad_norm": 0.3030834544580116, "learning_rate": 2.3755656108597284e-05, "loss": 0.3658, "step": 1125 }, { "epoch": 1.7170099160945842, "grad_norm": 0.29388742983720706, "learning_rate": 2.372737556561086e-05, "loss": 0.3518, "step": 1126 }, { "epoch": 1.7185354691075516, "grad_norm": 0.9636611912758173, "learning_rate": 2.3699095022624434e-05, "loss": 0.3618, "step": 1127 }, { "epoch": 1.7200610221205186, "grad_norm": 0.338599512023856, "learning_rate": 2.367081447963801e-05, "loss": 0.3538, "step": 1128 }, { "epoch": 1.721586575133486, "grad_norm": 0.3288784576792027, "learning_rate": 2.3642533936651584e-05, "loss": 0.3719, "step": 1129 }, { "epoch": 1.723112128146453, "grad_norm": 0.2852840398860864, "learning_rate": 2.361425339366516e-05, "loss": 0.36, "step": 1130 }, { "epoch": 1.7246376811594204, "grad_norm": 0.3754909802612195, "learning_rate": 2.3585972850678734e-05, "loss": 0.3605, "step": 1131 }, { "epoch": 1.7261632341723874, "grad_norm": 0.27183492000016296, "learning_rate": 2.355769230769231e-05, "loss": 0.3335, "step": 1132 }, { "epoch": 1.7276887871853548, "grad_norm": 0.28119605727787905, "learning_rate": 2.3529411764705884e-05, "loss": 0.3518, "step": 1133 }, { "epoch": 1.7292143401983218, "grad_norm": 0.2768918015528303, "learning_rate": 2.350113122171946e-05, "loss": 0.36, "step": 1134 }, { "epoch": 1.7307398932112892, "grad_norm": 0.2751833179458571, "learning_rate": 2.3472850678733034e-05, "loss": 0.3446, "step": 1135 }, { "epoch": 1.7322654462242562, "grad_norm": 0.299726266743985, "learning_rate": 2.344457013574661e-05, "loss": 0.3664, "step": 1136 }, { "epoch": 1.7337909992372236, "grad_norm": 0.3003546320523925, "learning_rate": 2.3416289592760183e-05, "loss": 0.357, "step": 1137 }, { "epoch": 1.7353165522501905, "grad_norm": 0.30354512550291074, "learning_rate": 2.3388009049773758e-05, "loss": 0.3561, "step": 1138 }, { "epoch": 1.736842105263158, "grad_norm": 0.2723767377768168, "learning_rate": 2.3359728506787333e-05, "loss": 0.3638, "step": 1139 }, { "epoch": 1.738367658276125, "grad_norm": 0.29240195812420067, "learning_rate": 2.3331447963800908e-05, "loss": 0.3493, "step": 1140 }, { "epoch": 1.7398932112890924, "grad_norm": 0.2749525705449181, "learning_rate": 2.330316742081448e-05, "loss": 0.3658, "step": 1141 }, { "epoch": 1.7414187643020596, "grad_norm": 0.31520990408900146, "learning_rate": 2.3274886877828055e-05, "loss": 0.3867, "step": 1142 }, { "epoch": 1.7429443173150267, "grad_norm": 0.3509765939879963, "learning_rate": 2.324660633484163e-05, "loss": 0.3682, "step": 1143 }, { "epoch": 1.744469870327994, "grad_norm": 0.26682390669992306, "learning_rate": 2.3218325791855204e-05, "loss": 0.3573, "step": 1144 }, { "epoch": 1.7459954233409611, "grad_norm": 0.28656093015111456, "learning_rate": 2.319004524886878e-05, "loss": 0.3406, "step": 1145 }, { "epoch": 1.7475209763539283, "grad_norm": 0.3017370778130631, "learning_rate": 2.3161764705882354e-05, "loss": 0.3563, "step": 1146 }, { "epoch": 1.7490465293668955, "grad_norm": 0.25604080549432656, "learning_rate": 2.313348416289593e-05, "loss": 0.3692, "step": 1147 }, { "epoch": 1.7505720823798627, "grad_norm": 0.2830981938049564, "learning_rate": 2.3105203619909504e-05, "loss": 0.3459, "step": 1148 }, { "epoch": 1.75209763539283, "grad_norm": 0.27756862190288134, "learning_rate": 2.307692307692308e-05, "loss": 0.3478, "step": 1149 }, { "epoch": 1.7536231884057971, "grad_norm": 0.31597651519177944, "learning_rate": 2.3048642533936654e-05, "loss": 0.3784, "step": 1150 }, { "epoch": 1.7551487414187643, "grad_norm": 0.2941855445050186, "learning_rate": 2.3020361990950226e-05, "loss": 0.3774, "step": 1151 }, { "epoch": 1.7566742944317315, "grad_norm": 0.31793005598756585, "learning_rate": 2.29920814479638e-05, "loss": 0.3339, "step": 1152 }, { "epoch": 1.7581998474446987, "grad_norm": 0.25764496178286145, "learning_rate": 2.2963800904977375e-05, "loss": 0.3792, "step": 1153 }, { "epoch": 1.759725400457666, "grad_norm": 0.27862949442427737, "learning_rate": 2.293552036199095e-05, "loss": 0.3649, "step": 1154 }, { "epoch": 1.761250953470633, "grad_norm": 0.27558415529778985, "learning_rate": 2.2907239819004525e-05, "loss": 0.3462, "step": 1155 }, { "epoch": 1.7627765064836003, "grad_norm": 0.2417778635295319, "learning_rate": 2.28789592760181e-05, "loss": 0.3575, "step": 1156 }, { "epoch": 1.7643020594965675, "grad_norm": 0.314147412500954, "learning_rate": 2.2850678733031675e-05, "loss": 0.3726, "step": 1157 }, { "epoch": 1.7658276125095347, "grad_norm": 0.26170961363554684, "learning_rate": 2.282239819004525e-05, "loss": 0.3686, "step": 1158 }, { "epoch": 1.7673531655225019, "grad_norm": 0.31316755237194105, "learning_rate": 2.2794117647058825e-05, "loss": 0.3665, "step": 1159 }, { "epoch": 1.768878718535469, "grad_norm": 0.2672838967407788, "learning_rate": 2.27658371040724e-05, "loss": 0.3593, "step": 1160 }, { "epoch": 1.7704042715484363, "grad_norm": 0.25882094303567915, "learning_rate": 2.2737556561085975e-05, "loss": 0.358, "step": 1161 }, { "epoch": 1.7719298245614035, "grad_norm": 0.2874240935323855, "learning_rate": 2.270927601809955e-05, "loss": 0.3562, "step": 1162 }, { "epoch": 1.7734553775743707, "grad_norm": 0.2962963322147107, "learning_rate": 2.2680995475113125e-05, "loss": 0.3754, "step": 1163 }, { "epoch": 1.7749809305873379, "grad_norm": 0.3244657653643156, "learning_rate": 2.26527149321267e-05, "loss": 0.3638, "step": 1164 }, { "epoch": 1.776506483600305, "grad_norm": 0.2690506253469047, "learning_rate": 2.262443438914027e-05, "loss": 0.3612, "step": 1165 }, { "epoch": 1.7780320366132725, "grad_norm": 0.32243141924039254, "learning_rate": 2.2596153846153846e-05, "loss": 0.3806, "step": 1166 }, { "epoch": 1.7795575896262394, "grad_norm": 0.28060907754648245, "learning_rate": 2.256787330316742e-05, "loss": 0.3404, "step": 1167 }, { "epoch": 1.7810831426392069, "grad_norm": 0.2895944904964822, "learning_rate": 2.2539592760180996e-05, "loss": 0.3612, "step": 1168 }, { "epoch": 1.7826086956521738, "grad_norm": 0.2740773822439161, "learning_rate": 2.251131221719457e-05, "loss": 0.3408, "step": 1169 }, { "epoch": 1.7841342486651413, "grad_norm": 0.30366776287242553, "learning_rate": 2.2483031674208146e-05, "loss": 0.3446, "step": 1170 }, { "epoch": 1.7856598016781082, "grad_norm": 0.24788327757077455, "learning_rate": 2.245475113122172e-05, "loss": 0.3629, "step": 1171 }, { "epoch": 1.7871853546910756, "grad_norm": 0.2736873149286662, "learning_rate": 2.2426470588235296e-05, "loss": 0.3666, "step": 1172 }, { "epoch": 1.7887109077040426, "grad_norm": 0.2714865423091107, "learning_rate": 2.239819004524887e-05, "loss": 0.375, "step": 1173 }, { "epoch": 1.79023646071701, "grad_norm": 0.2974656609659056, "learning_rate": 2.2369909502262445e-05, "loss": 0.3733, "step": 1174 }, { "epoch": 1.791762013729977, "grad_norm": 0.45617517862514617, "learning_rate": 2.234162895927602e-05, "loss": 0.3681, "step": 1175 }, { "epoch": 1.7932875667429444, "grad_norm": 0.2917189701955795, "learning_rate": 2.2313348416289595e-05, "loss": 0.3515, "step": 1176 }, { "epoch": 1.7948131197559114, "grad_norm": 0.3126494012650245, "learning_rate": 2.228506787330317e-05, "loss": 0.3637, "step": 1177 }, { "epoch": 1.7963386727688788, "grad_norm": 0.3059071172255333, "learning_rate": 2.2256787330316742e-05, "loss": 0.3777, "step": 1178 }, { "epoch": 1.7978642257818458, "grad_norm": 0.29978470333193524, "learning_rate": 2.2228506787330317e-05, "loss": 0.3432, "step": 1179 }, { "epoch": 1.7993897787948132, "grad_norm": 0.24143850274086243, "learning_rate": 2.220022624434389e-05, "loss": 0.368, "step": 1180 }, { "epoch": 1.8009153318077802, "grad_norm": 0.31590470376096214, "learning_rate": 2.2171945701357466e-05, "loss": 0.364, "step": 1181 }, { "epoch": 1.8024408848207476, "grad_norm": 0.32043465866908044, "learning_rate": 2.214366515837104e-05, "loss": 0.3484, "step": 1182 }, { "epoch": 1.8039664378337146, "grad_norm": 0.25431907027439227, "learning_rate": 2.2115384615384616e-05, "loss": 0.3553, "step": 1183 }, { "epoch": 1.805491990846682, "grad_norm": 0.30865727269783083, "learning_rate": 2.208710407239819e-05, "loss": 0.368, "step": 1184 }, { "epoch": 1.807017543859649, "grad_norm": 0.3528112009364805, "learning_rate": 2.2058823529411766e-05, "loss": 0.3519, "step": 1185 }, { "epoch": 1.8085430968726164, "grad_norm": 0.2845488250881281, "learning_rate": 2.203054298642534e-05, "loss": 0.3608, "step": 1186 }, { "epoch": 1.8100686498855834, "grad_norm": 0.31743403326108477, "learning_rate": 2.2002262443438916e-05, "loss": 0.3544, "step": 1187 }, { "epoch": 1.8115942028985508, "grad_norm": 0.3108461452669532, "learning_rate": 2.197398190045249e-05, "loss": 0.3283, "step": 1188 }, { "epoch": 1.813119755911518, "grad_norm": 0.27887640966123817, "learning_rate": 2.1945701357466062e-05, "loss": 0.3633, "step": 1189 }, { "epoch": 1.8146453089244852, "grad_norm": 0.32735087905418914, "learning_rate": 2.1917420814479637e-05, "loss": 0.3519, "step": 1190 }, { "epoch": 1.8161708619374524, "grad_norm": 0.24494882496603412, "learning_rate": 2.1889140271493212e-05, "loss": 0.3513, "step": 1191 }, { "epoch": 1.8176964149504196, "grad_norm": 0.25207347519731366, "learning_rate": 2.1860859728506787e-05, "loss": 0.3508, "step": 1192 }, { "epoch": 1.8192219679633868, "grad_norm": 0.2543573803905881, "learning_rate": 2.1832579185520362e-05, "loss": 0.3562, "step": 1193 }, { "epoch": 1.820747520976354, "grad_norm": 0.23731665031275748, "learning_rate": 2.1804298642533937e-05, "loss": 0.3588, "step": 1194 }, { "epoch": 1.8222730739893211, "grad_norm": 0.24000966480434444, "learning_rate": 2.1776018099547512e-05, "loss": 0.3574, "step": 1195 }, { "epoch": 1.8237986270022883, "grad_norm": 0.2449761977241036, "learning_rate": 2.1747737556561087e-05, "loss": 0.3496, "step": 1196 }, { "epoch": 1.8253241800152555, "grad_norm": 0.27256076057141465, "learning_rate": 2.1719457013574662e-05, "loss": 0.3638, "step": 1197 }, { "epoch": 1.8268497330282227, "grad_norm": 0.278818595654442, "learning_rate": 2.1691176470588237e-05, "loss": 0.3586, "step": 1198 }, { "epoch": 1.82837528604119, "grad_norm": 0.30360956757239316, "learning_rate": 2.1662895927601812e-05, "loss": 0.3651, "step": 1199 }, { "epoch": 1.8299008390541571, "grad_norm": 0.2368119824894942, "learning_rate": 2.1634615384615387e-05, "loss": 0.3585, "step": 1200 }, { "epoch": 1.8314263920671243, "grad_norm": 0.28650338463044334, "learning_rate": 2.160633484162896e-05, "loss": 0.3467, "step": 1201 }, { "epoch": 1.8329519450800915, "grad_norm": 0.285899709636955, "learning_rate": 2.1578054298642536e-05, "loss": 0.3517, "step": 1202 }, { "epoch": 1.8344774980930587, "grad_norm": 0.2642326561911518, "learning_rate": 2.154977375565611e-05, "loss": 0.369, "step": 1203 }, { "epoch": 1.836003051106026, "grad_norm": 0.28952088056412445, "learning_rate": 2.1521493212669686e-05, "loss": 0.3497, "step": 1204 }, { "epoch": 1.837528604118993, "grad_norm": 0.2808018204643139, "learning_rate": 2.149321266968326e-05, "loss": 0.3476, "step": 1205 }, { "epoch": 1.8390541571319603, "grad_norm": 0.2760504954280243, "learning_rate": 2.1464932126696833e-05, "loss": 0.3465, "step": 1206 }, { "epoch": 1.8405797101449275, "grad_norm": 0.30467742982757823, "learning_rate": 2.1436651583710408e-05, "loss": 0.354, "step": 1207 }, { "epoch": 1.8421052631578947, "grad_norm": 0.2641472736693004, "learning_rate": 2.1408371040723983e-05, "loss": 0.3606, "step": 1208 }, { "epoch": 1.8436308161708619, "grad_norm": 0.29004898391000666, "learning_rate": 2.1380090497737558e-05, "loss": 0.345, "step": 1209 }, { "epoch": 1.845156369183829, "grad_norm": 0.29731754363341334, "learning_rate": 2.1351809954751132e-05, "loss": 0.3636, "step": 1210 }, { "epoch": 1.8466819221967965, "grad_norm": 0.2799558545509259, "learning_rate": 2.1323529411764707e-05, "loss": 0.3661, "step": 1211 }, { "epoch": 1.8482074752097635, "grad_norm": 0.299553460683014, "learning_rate": 2.1295248868778282e-05, "loss": 0.3409, "step": 1212 }, { "epoch": 1.849733028222731, "grad_norm": 0.3080267773602016, "learning_rate": 2.1266968325791857e-05, "loss": 0.3607, "step": 1213 }, { "epoch": 1.8512585812356979, "grad_norm": 0.25729843688714477, "learning_rate": 2.123868778280543e-05, "loss": 0.3439, "step": 1214 }, { "epoch": 1.8527841342486653, "grad_norm": 0.22491844340155334, "learning_rate": 2.1210407239819004e-05, "loss": 0.3484, "step": 1215 }, { "epoch": 1.8543096872616323, "grad_norm": 0.28420390711877036, "learning_rate": 2.118212669683258e-05, "loss": 0.3499, "step": 1216 }, { "epoch": 1.8558352402745997, "grad_norm": 0.2620925112545204, "learning_rate": 2.1153846153846154e-05, "loss": 0.3761, "step": 1217 }, { "epoch": 1.8573607932875666, "grad_norm": 0.25654285947143013, "learning_rate": 2.112556561085973e-05, "loss": 0.3617, "step": 1218 }, { "epoch": 1.858886346300534, "grad_norm": 0.2783469779918579, "learning_rate": 2.1097285067873303e-05, "loss": 0.3627, "step": 1219 }, { "epoch": 1.860411899313501, "grad_norm": 0.2547953081314663, "learning_rate": 2.106900452488688e-05, "loss": 0.3659, "step": 1220 }, { "epoch": 1.8619374523264685, "grad_norm": 0.28777473826672295, "learning_rate": 2.1040723981900453e-05, "loss": 0.3494, "step": 1221 }, { "epoch": 1.8634630053394354, "grad_norm": 0.24190712212546592, "learning_rate": 2.1012443438914028e-05, "loss": 0.3348, "step": 1222 }, { "epoch": 1.8649885583524028, "grad_norm": 0.24367122725786947, "learning_rate": 2.0984162895927603e-05, "loss": 0.3487, "step": 1223 }, { "epoch": 1.8665141113653698, "grad_norm": 0.24400820818729213, "learning_rate": 2.0955882352941178e-05, "loss": 0.3464, "step": 1224 }, { "epoch": 1.8680396643783372, "grad_norm": 0.3024200958608852, "learning_rate": 2.0927601809954753e-05, "loss": 0.3701, "step": 1225 }, { "epoch": 1.8695652173913042, "grad_norm": 0.27031056332791265, "learning_rate": 2.0899321266968328e-05, "loss": 0.3393, "step": 1226 }, { "epoch": 1.8710907704042716, "grad_norm": 0.24496511399566678, "learning_rate": 2.0871040723981903e-05, "loss": 0.3723, "step": 1227 }, { "epoch": 1.8726163234172386, "grad_norm": 0.2785477403334692, "learning_rate": 2.0842760180995478e-05, "loss": 0.3565, "step": 1228 }, { "epoch": 1.874141876430206, "grad_norm": 0.26411513253577973, "learning_rate": 2.0814479638009053e-05, "loss": 0.3713, "step": 1229 }, { "epoch": 1.875667429443173, "grad_norm": 0.25218706028325794, "learning_rate": 2.0786199095022628e-05, "loss": 0.3558, "step": 1230 }, { "epoch": 1.8771929824561404, "grad_norm": 0.27385055340827474, "learning_rate": 2.0757918552036202e-05, "loss": 0.3531, "step": 1231 }, { "epoch": 1.8787185354691074, "grad_norm": 0.28192239468050395, "learning_rate": 2.0729638009049777e-05, "loss": 0.3755, "step": 1232 }, { "epoch": 1.8802440884820748, "grad_norm": 0.251334464242557, "learning_rate": 2.070135746606335e-05, "loss": 0.3564, "step": 1233 }, { "epoch": 1.881769641495042, "grad_norm": 0.2540922469942456, "learning_rate": 2.0673076923076924e-05, "loss": 0.3497, "step": 1234 }, { "epoch": 1.8832951945080092, "grad_norm": 0.29500973733016594, "learning_rate": 2.06447963800905e-05, "loss": 0.3552, "step": 1235 }, { "epoch": 1.8848207475209764, "grad_norm": 0.2757381708449834, "learning_rate": 2.0616515837104074e-05, "loss": 0.3555, "step": 1236 }, { "epoch": 1.8863463005339436, "grad_norm": 0.2780794903569844, "learning_rate": 2.058823529411765e-05, "loss": 0.347, "step": 1237 }, { "epoch": 1.8878718535469108, "grad_norm": 0.24145052939743583, "learning_rate": 2.055995475113122e-05, "loss": 0.3394, "step": 1238 }, { "epoch": 1.889397406559878, "grad_norm": 0.25408255291883186, "learning_rate": 2.0531674208144795e-05, "loss": 0.3619, "step": 1239 }, { "epoch": 1.8909229595728452, "grad_norm": 0.2733918856986025, "learning_rate": 2.050339366515837e-05, "loss": 0.3721, "step": 1240 }, { "epoch": 1.8924485125858124, "grad_norm": 0.29717484255512633, "learning_rate": 2.0475113122171945e-05, "loss": 0.3684, "step": 1241 }, { "epoch": 1.8939740655987796, "grad_norm": 0.2549466885013985, "learning_rate": 2.044683257918552e-05, "loss": 0.369, "step": 1242 }, { "epoch": 1.8954996186117468, "grad_norm": 0.30091405677788824, "learning_rate": 2.0418552036199095e-05, "loss": 0.3363, "step": 1243 }, { "epoch": 1.897025171624714, "grad_norm": 0.24302556457985794, "learning_rate": 2.039027149321267e-05, "loss": 0.3546, "step": 1244 }, { "epoch": 1.8985507246376812, "grad_norm": 0.30020390068366254, "learning_rate": 2.0361990950226245e-05, "loss": 0.3778, "step": 1245 }, { "epoch": 1.9000762776506483, "grad_norm": 0.2698639201098616, "learning_rate": 2.033371040723982e-05, "loss": 0.3776, "step": 1246 }, { "epoch": 1.9016018306636155, "grad_norm": 0.3041984466335862, "learning_rate": 2.0305429864253394e-05, "loss": 0.3299, "step": 1247 }, { "epoch": 1.9031273836765827, "grad_norm": 0.30573104890342073, "learning_rate": 2.027714932126697e-05, "loss": 0.3718, "step": 1248 }, { "epoch": 1.90465293668955, "grad_norm": 0.26524347880570226, "learning_rate": 2.0248868778280544e-05, "loss": 0.4147, "step": 1249 }, { "epoch": 1.9061784897025171, "grad_norm": 1.933244451014113, "learning_rate": 2.022058823529412e-05, "loss": 0.3423, "step": 1250 }, { "epoch": 1.9077040427154843, "grad_norm": 0.32839266411441637, "learning_rate": 2.0192307692307694e-05, "loss": 0.369, "step": 1251 }, { "epoch": 1.9092295957284515, "grad_norm": 0.2833496385954936, "learning_rate": 2.016402714932127e-05, "loss": 0.3384, "step": 1252 }, { "epoch": 1.9107551487414187, "grad_norm": 0.2569998571599813, "learning_rate": 2.0135746606334844e-05, "loss": 0.357, "step": 1253 }, { "epoch": 1.912280701754386, "grad_norm": 0.28794857176899535, "learning_rate": 2.010746606334842e-05, "loss": 0.3505, "step": 1254 }, { "epoch": 1.913806254767353, "grad_norm": 0.27739225674528767, "learning_rate": 2.0079185520361994e-05, "loss": 0.355, "step": 1255 }, { "epoch": 1.9153318077803205, "grad_norm": 0.24199587460199384, "learning_rate": 2.005090497737557e-05, "loss": 0.3519, "step": 1256 }, { "epoch": 1.9168573607932875, "grad_norm": 0.27288589243411576, "learning_rate": 2.0022624434389144e-05, "loss": 0.3363, "step": 1257 }, { "epoch": 1.918382913806255, "grad_norm": 0.24392333903140972, "learning_rate": 1.999434389140272e-05, "loss": 0.3425, "step": 1258 }, { "epoch": 1.919908466819222, "grad_norm": 0.2844277905332004, "learning_rate": 1.996606334841629e-05, "loss": 0.3448, "step": 1259 }, { "epoch": 1.9214340198321893, "grad_norm": 0.24110799193319182, "learning_rate": 1.9937782805429865e-05, "loss": 0.3384, "step": 1260 }, { "epoch": 1.9229595728451563, "grad_norm": 0.24452008561441213, "learning_rate": 1.990950226244344e-05, "loss": 0.3415, "step": 1261 }, { "epoch": 1.9244851258581237, "grad_norm": 0.280907477399498, "learning_rate": 1.9881221719457015e-05, "loss": 0.3619, "step": 1262 }, { "epoch": 1.9260106788710907, "grad_norm": 0.2706822544467885, "learning_rate": 1.9852941176470586e-05, "loss": 0.3473, "step": 1263 }, { "epoch": 1.927536231884058, "grad_norm": 0.27923244971851974, "learning_rate": 1.982466063348416e-05, "loss": 0.3701, "step": 1264 }, { "epoch": 1.929061784897025, "grad_norm": 0.26308476284987636, "learning_rate": 1.9796380090497736e-05, "loss": 0.3586, "step": 1265 }, { "epoch": 1.9305873379099925, "grad_norm": 0.27991167200977485, "learning_rate": 1.976809954751131e-05, "loss": 0.3642, "step": 1266 }, { "epoch": 1.9321128909229595, "grad_norm": 0.2800662785766214, "learning_rate": 1.9739819004524886e-05, "loss": 0.3367, "step": 1267 }, { "epoch": 1.9336384439359269, "grad_norm": 0.29539447622039167, "learning_rate": 1.971153846153846e-05, "loss": 0.3657, "step": 1268 }, { "epoch": 1.9351639969488938, "grad_norm": 0.27420176873927277, "learning_rate": 1.9683257918552036e-05, "loss": 0.372, "step": 1269 }, { "epoch": 1.9366895499618613, "grad_norm": 0.2948250593361952, "learning_rate": 1.965497737556561e-05, "loss": 0.3632, "step": 1270 }, { "epoch": 1.9382151029748282, "grad_norm": 0.3078520344261779, "learning_rate": 1.9626696832579186e-05, "loss": 0.3545, "step": 1271 }, { "epoch": 1.9397406559877957, "grad_norm": 0.2647444246293629, "learning_rate": 1.959841628959276e-05, "loss": 0.3498, "step": 1272 }, { "epoch": 1.9412662090007626, "grad_norm": 0.253829055072827, "learning_rate": 1.9570135746606336e-05, "loss": 0.353, "step": 1273 }, { "epoch": 1.94279176201373, "grad_norm": 0.2906454869267361, "learning_rate": 1.954185520361991e-05, "loss": 0.3545, "step": 1274 }, { "epoch": 1.944317315026697, "grad_norm": 0.3059251691157682, "learning_rate": 1.9513574660633486e-05, "loss": 0.3539, "step": 1275 }, { "epoch": 1.9458428680396644, "grad_norm": 0.25014852896692086, "learning_rate": 1.948529411764706e-05, "loss": 0.3448, "step": 1276 }, { "epoch": 1.9473684210526314, "grad_norm": 0.25961074726881317, "learning_rate": 1.9457013574660635e-05, "loss": 0.359, "step": 1277 }, { "epoch": 1.9488939740655988, "grad_norm": 0.3039308859343225, "learning_rate": 1.942873303167421e-05, "loss": 0.3662, "step": 1278 }, { "epoch": 1.950419527078566, "grad_norm": 0.25074627600177735, "learning_rate": 1.9400452488687785e-05, "loss": 0.3431, "step": 1279 }, { "epoch": 1.9519450800915332, "grad_norm": 0.2426721412571897, "learning_rate": 1.937217194570136e-05, "loss": 0.3498, "step": 1280 }, { "epoch": 1.9534706331045004, "grad_norm": 0.2735483313034868, "learning_rate": 1.9343891402714935e-05, "loss": 0.3686, "step": 1281 }, { "epoch": 1.9549961861174676, "grad_norm": 0.3033673890049759, "learning_rate": 1.931561085972851e-05, "loss": 0.3297, "step": 1282 }, { "epoch": 1.9565217391304348, "grad_norm": 0.2863291491145563, "learning_rate": 1.928733031674208e-05, "loss": 0.3419, "step": 1283 }, { "epoch": 1.958047292143402, "grad_norm": 0.2562002824044231, "learning_rate": 1.9259049773755657e-05, "loss": 0.3473, "step": 1284 }, { "epoch": 1.9595728451563692, "grad_norm": 0.2861466705834836, "learning_rate": 1.923076923076923e-05, "loss": 0.3593, "step": 1285 }, { "epoch": 1.9610983981693364, "grad_norm": 0.3164961594462574, "learning_rate": 1.9202488687782806e-05, "loss": 0.3732, "step": 1286 }, { "epoch": 1.9626239511823036, "grad_norm": 0.24337967094393143, "learning_rate": 1.917420814479638e-05, "loss": 0.3512, "step": 1287 }, { "epoch": 1.9641495041952708, "grad_norm": 0.3235838165788084, "learning_rate": 1.9145927601809956e-05, "loss": 0.3431, "step": 1288 }, { "epoch": 1.965675057208238, "grad_norm": 0.2867709587147382, "learning_rate": 1.9117647058823528e-05, "loss": 0.3493, "step": 1289 }, { "epoch": 1.9672006102212052, "grad_norm": 0.25544277379484576, "learning_rate": 1.9089366515837103e-05, "loss": 0.355, "step": 1290 }, { "epoch": 1.9687261632341724, "grad_norm": 0.282521577550364, "learning_rate": 1.9061085972850678e-05, "loss": 0.3614, "step": 1291 }, { "epoch": 1.9702517162471396, "grad_norm": 0.29976212349352155, "learning_rate": 1.9032805429864253e-05, "loss": 0.3716, "step": 1292 }, { "epoch": 1.9717772692601068, "grad_norm": 0.2647181799845142, "learning_rate": 1.9004524886877827e-05, "loss": 0.3297, "step": 1293 }, { "epoch": 1.973302822273074, "grad_norm": 0.26828218905049134, "learning_rate": 1.8976244343891402e-05, "loss": 0.3531, "step": 1294 }, { "epoch": 1.9748283752860412, "grad_norm": 0.24201979440130994, "learning_rate": 1.8947963800904977e-05, "loss": 0.3541, "step": 1295 }, { "epoch": 1.9763539282990084, "grad_norm": 0.26056479583132874, "learning_rate": 1.8919683257918552e-05, "loss": 0.3848, "step": 1296 }, { "epoch": 1.9778794813119756, "grad_norm": 0.30143203027217086, "learning_rate": 1.8891402714932127e-05, "loss": 0.3383, "step": 1297 }, { "epoch": 1.9794050343249427, "grad_norm": 0.23854255417671935, "learning_rate": 1.8863122171945702e-05, "loss": 0.3475, "step": 1298 }, { "epoch": 1.98093058733791, "grad_norm": 0.28519057704369694, "learning_rate": 1.8834841628959277e-05, "loss": 0.3544, "step": 1299 }, { "epoch": 1.9824561403508771, "grad_norm": 0.2532084311296758, "learning_rate": 1.8806561085972852e-05, "loss": 0.3715, "step": 1300 }, { "epoch": 1.9839816933638446, "grad_norm": 0.26825548504935504, "learning_rate": 1.8778280542986427e-05, "loss": 0.3763, "step": 1301 }, { "epoch": 1.9855072463768115, "grad_norm": 0.29468792910789016, "learning_rate": 1.8750000000000002e-05, "loss": 0.3548, "step": 1302 }, { "epoch": 1.987032799389779, "grad_norm": 0.27322839726724224, "learning_rate": 1.8721719457013577e-05, "loss": 0.3482, "step": 1303 }, { "epoch": 1.988558352402746, "grad_norm": 0.24764897359555524, "learning_rate": 1.869343891402715e-05, "loss": 0.3612, "step": 1304 }, { "epoch": 1.9900839054157133, "grad_norm": 0.3200305500827646, "learning_rate": 1.8665158371040727e-05, "loss": 0.3674, "step": 1305 }, { "epoch": 1.9916094584286803, "grad_norm": 0.29665472415776156, "learning_rate": 1.86368778280543e-05, "loss": 0.3558, "step": 1306 }, { "epoch": 1.9931350114416477, "grad_norm": 0.2526045363976545, "learning_rate": 1.8608597285067876e-05, "loss": 0.3363, "step": 1307 }, { "epoch": 1.9946605644546147, "grad_norm": 0.29506172459257307, "learning_rate": 1.8580316742081448e-05, "loss": 0.3429, "step": 1308 }, { "epoch": 1.9961861174675821, "grad_norm": 0.3142477890683925, "learning_rate": 1.8552036199095023e-05, "loss": 0.3543, "step": 1309 }, { "epoch": 1.997711670480549, "grad_norm": 0.2633491114374021, "learning_rate": 1.8523755656108598e-05, "loss": 0.3613, "step": 1310 }, { "epoch": 1.9992372234935165, "grad_norm": 0.27011764358923546, "learning_rate": 1.8495475113122173e-05, "loss": 0.3489, "step": 1311 }, { "epoch": 2.0, "grad_norm": 0.38825409845599057, "learning_rate": 1.8467194570135748e-05, "loss": 0.304, "step": 1312 }, { "epoch": 2.0015255530129674, "grad_norm": 0.304615635404561, "learning_rate": 1.8438914027149323e-05, "loss": 0.2854, "step": 1313 }, { "epoch": 2.0030511060259344, "grad_norm": 0.26055124638185084, "learning_rate": 1.8410633484162897e-05, "loss": 0.2706, "step": 1314 }, { "epoch": 2.004576659038902, "grad_norm": 0.3081520275644501, "learning_rate": 1.8382352941176472e-05, "loss": 0.2873, "step": 1315 }, { "epoch": 2.006102212051869, "grad_norm": 0.29276761733041584, "learning_rate": 1.8354072398190044e-05, "loss": 0.2888, "step": 1316 }, { "epoch": 2.007627765064836, "grad_norm": 0.2612057069054041, "learning_rate": 1.832579185520362e-05, "loss": 0.2855, "step": 1317 }, { "epoch": 2.009153318077803, "grad_norm": 0.34209930261948884, "learning_rate": 1.8297511312217194e-05, "loss": 0.2867, "step": 1318 }, { "epoch": 2.0106788710907706, "grad_norm": 0.30695522707819956, "learning_rate": 1.826923076923077e-05, "loss": 0.2721, "step": 1319 }, { "epoch": 2.0122044241037376, "grad_norm": 0.25789373822912876, "learning_rate": 1.8240950226244344e-05, "loss": 0.2775, "step": 1320 }, { "epoch": 2.013729977116705, "grad_norm": 0.3947910923390881, "learning_rate": 1.821266968325792e-05, "loss": 0.2947, "step": 1321 }, { "epoch": 2.015255530129672, "grad_norm": 0.2810950153185699, "learning_rate": 1.8184389140271493e-05, "loss": 0.2685, "step": 1322 }, { "epoch": 2.0167810831426394, "grad_norm": 0.24852659900252533, "learning_rate": 1.815610859728507e-05, "loss": 0.2824, "step": 1323 }, { "epoch": 2.0183066361556063, "grad_norm": 0.32540128196509527, "learning_rate": 1.8127828054298643e-05, "loss": 0.284, "step": 1324 }, { "epoch": 2.0198321891685738, "grad_norm": 0.26103808153935915, "learning_rate": 1.8099547511312218e-05, "loss": 0.2824, "step": 1325 }, { "epoch": 2.0213577421815407, "grad_norm": 0.2680589022097402, "learning_rate": 1.8071266968325793e-05, "loss": 0.2911, "step": 1326 }, { "epoch": 2.022883295194508, "grad_norm": 0.30950653337166906, "learning_rate": 1.8042986425339368e-05, "loss": 0.2864, "step": 1327 }, { "epoch": 2.024408848207475, "grad_norm": 0.2488798437636545, "learning_rate": 1.8014705882352943e-05, "loss": 0.2802, "step": 1328 }, { "epoch": 2.0259344012204425, "grad_norm": 0.28288422256793144, "learning_rate": 1.7986425339366518e-05, "loss": 0.2761, "step": 1329 }, { "epoch": 2.0274599542334095, "grad_norm": 0.2493367924565217, "learning_rate": 1.7958144796380093e-05, "loss": 0.2796, "step": 1330 }, { "epoch": 2.028985507246377, "grad_norm": 0.24781635378572833, "learning_rate": 1.7929864253393668e-05, "loss": 0.281, "step": 1331 }, { "epoch": 2.030511060259344, "grad_norm": 0.2878126125548768, "learning_rate": 1.790158371040724e-05, "loss": 0.2812, "step": 1332 }, { "epoch": 2.0320366132723113, "grad_norm": 0.2399427866466173, "learning_rate": 1.7873303167420814e-05, "loss": 0.2679, "step": 1333 }, { "epoch": 2.0335621662852783, "grad_norm": 0.27993366407408155, "learning_rate": 1.784502262443439e-05, "loss": 0.2846, "step": 1334 }, { "epoch": 2.0350877192982457, "grad_norm": 0.2359780498818118, "learning_rate": 1.7816742081447964e-05, "loss": 0.2787, "step": 1335 }, { "epoch": 2.0366132723112127, "grad_norm": 0.28043769337285285, "learning_rate": 1.778846153846154e-05, "loss": 0.2897, "step": 1336 }, { "epoch": 2.03813882532418, "grad_norm": 0.2529551260825759, "learning_rate": 1.7760180995475114e-05, "loss": 0.2771, "step": 1337 }, { "epoch": 2.039664378337147, "grad_norm": 0.22688224213373154, "learning_rate": 1.773190045248869e-05, "loss": 0.2562, "step": 1338 }, { "epoch": 2.0411899313501145, "grad_norm": 0.24633296975502658, "learning_rate": 1.7703619909502264e-05, "loss": 0.2724, "step": 1339 }, { "epoch": 2.0427154843630815, "grad_norm": 0.24612603311218906, "learning_rate": 1.767533936651584e-05, "loss": 0.2841, "step": 1340 }, { "epoch": 2.044241037376049, "grad_norm": 0.2556686254484129, "learning_rate": 1.7647058823529414e-05, "loss": 0.2808, "step": 1341 }, { "epoch": 2.045766590389016, "grad_norm": 0.23371590258227298, "learning_rate": 1.761877828054299e-05, "loss": 0.273, "step": 1342 }, { "epoch": 2.0472921434019833, "grad_norm": 0.26599745843651623, "learning_rate": 1.7590497737556563e-05, "loss": 0.2784, "step": 1343 }, { "epoch": 2.0488176964149503, "grad_norm": 0.2344705704654067, "learning_rate": 1.7562217194570135e-05, "loss": 0.2751, "step": 1344 }, { "epoch": 2.0503432494279177, "grad_norm": 0.2288116740692705, "learning_rate": 1.753393665158371e-05, "loss": 0.289, "step": 1345 }, { "epoch": 2.0518688024408847, "grad_norm": 0.2136034735234995, "learning_rate": 1.7505656108597285e-05, "loss": 0.2581, "step": 1346 }, { "epoch": 2.053394355453852, "grad_norm": 0.2632231503910778, "learning_rate": 1.747737556561086e-05, "loss": 0.2861, "step": 1347 }, { "epoch": 2.054919908466819, "grad_norm": 0.22715075790362788, "learning_rate": 1.7449095022624435e-05, "loss": 0.2863, "step": 1348 }, { "epoch": 2.0564454614797865, "grad_norm": 0.21914877638244776, "learning_rate": 1.742081447963801e-05, "loss": 0.2846, "step": 1349 }, { "epoch": 2.0579710144927534, "grad_norm": 0.267446428861218, "learning_rate": 1.7392533936651585e-05, "loss": 0.299, "step": 1350 }, { "epoch": 2.059496567505721, "grad_norm": 0.2343480552391539, "learning_rate": 1.736425339366516e-05, "loss": 0.2744, "step": 1351 }, { "epoch": 2.061022120518688, "grad_norm": 0.20995915190271736, "learning_rate": 1.7335972850678734e-05, "loss": 0.2551, "step": 1352 }, { "epoch": 2.0625476735316552, "grad_norm": 0.2340759915411448, "learning_rate": 1.730769230769231e-05, "loss": 0.2707, "step": 1353 }, { "epoch": 2.064073226544622, "grad_norm": 0.2423726372887553, "learning_rate": 1.7279411764705884e-05, "loss": 0.3014, "step": 1354 }, { "epoch": 2.0655987795575896, "grad_norm": 0.23633719728337452, "learning_rate": 1.725113122171946e-05, "loss": 0.2995, "step": 1355 }, { "epoch": 2.0671243325705566, "grad_norm": 0.2652109270072076, "learning_rate": 1.722285067873303e-05, "loss": 0.2684, "step": 1356 }, { "epoch": 2.068649885583524, "grad_norm": 0.24108581867004353, "learning_rate": 1.7194570135746606e-05, "loss": 0.2902, "step": 1357 }, { "epoch": 2.0701754385964914, "grad_norm": 0.2213627254311107, "learning_rate": 1.716628959276018e-05, "loss": 0.2755, "step": 1358 }, { "epoch": 2.0717009916094584, "grad_norm": 0.24001982779759423, "learning_rate": 1.7138009049773755e-05, "loss": 0.2812, "step": 1359 }, { "epoch": 2.073226544622426, "grad_norm": 0.25845653087228226, "learning_rate": 1.710972850678733e-05, "loss": 0.2679, "step": 1360 }, { "epoch": 2.074752097635393, "grad_norm": 0.2370245946949866, "learning_rate": 1.7081447963800905e-05, "loss": 0.282, "step": 1361 }, { "epoch": 2.0762776506483602, "grad_norm": 0.23470552483793847, "learning_rate": 1.705316742081448e-05, "loss": 0.2851, "step": 1362 }, { "epoch": 2.077803203661327, "grad_norm": 0.24157838427378223, "learning_rate": 1.7024886877828055e-05, "loss": 0.2669, "step": 1363 }, { "epoch": 2.0793287566742946, "grad_norm": 0.21954681213423774, "learning_rate": 1.699660633484163e-05, "loss": 0.2832, "step": 1364 }, { "epoch": 2.0808543096872616, "grad_norm": 0.24916676812143432, "learning_rate": 1.6968325791855205e-05, "loss": 0.283, "step": 1365 }, { "epoch": 2.082379862700229, "grad_norm": 0.22162963618079398, "learning_rate": 1.694004524886878e-05, "loss": 0.2686, "step": 1366 }, { "epoch": 2.083905415713196, "grad_norm": 0.25586459247263665, "learning_rate": 1.6911764705882355e-05, "loss": 0.301, "step": 1367 }, { "epoch": 2.0854309687261634, "grad_norm": 0.2175361199608134, "learning_rate": 1.688348416289593e-05, "loss": 0.2737, "step": 1368 }, { "epoch": 2.0869565217391304, "grad_norm": 0.21341615965645183, "learning_rate": 1.6855203619909505e-05, "loss": 0.2791, "step": 1369 }, { "epoch": 2.088482074752098, "grad_norm": 0.23238754879831144, "learning_rate": 1.682692307692308e-05, "loss": 0.2654, "step": 1370 }, { "epoch": 2.0900076277650648, "grad_norm": 0.2619800196675576, "learning_rate": 1.6798642533936655e-05, "loss": 0.2772, "step": 1371 }, { "epoch": 2.091533180778032, "grad_norm": 0.22622568070031515, "learning_rate": 1.6770361990950226e-05, "loss": 0.2757, "step": 1372 }, { "epoch": 2.093058733790999, "grad_norm": 0.24561083906997622, "learning_rate": 1.67420814479638e-05, "loss": 0.2875, "step": 1373 }, { "epoch": 2.0945842868039666, "grad_norm": 0.2487201734099284, "learning_rate": 1.6713800904977376e-05, "loss": 0.2701, "step": 1374 }, { "epoch": 2.0961098398169336, "grad_norm": 0.2372395757829258, "learning_rate": 1.668552036199095e-05, "loss": 0.2925, "step": 1375 }, { "epoch": 2.097635392829901, "grad_norm": 0.23183716284617864, "learning_rate": 1.6657239819004526e-05, "loss": 0.2839, "step": 1376 }, { "epoch": 2.099160945842868, "grad_norm": 0.2343283873855932, "learning_rate": 1.66289592760181e-05, "loss": 0.2702, "step": 1377 }, { "epoch": 2.1006864988558354, "grad_norm": 0.23563008886669579, "learning_rate": 1.6600678733031676e-05, "loss": 0.2842, "step": 1378 }, { "epoch": 2.1022120518688023, "grad_norm": 0.25564089011136, "learning_rate": 1.657239819004525e-05, "loss": 0.2867, "step": 1379 }, { "epoch": 2.1037376048817698, "grad_norm": 0.2225891161831782, "learning_rate": 1.6544117647058825e-05, "loss": 0.2789, "step": 1380 }, { "epoch": 2.1052631578947367, "grad_norm": 0.23150678564042498, "learning_rate": 1.6515837104072397e-05, "loss": 0.2927, "step": 1381 }, { "epoch": 2.106788710907704, "grad_norm": 0.25266388840826587, "learning_rate": 1.6487556561085972e-05, "loss": 0.2735, "step": 1382 }, { "epoch": 2.108314263920671, "grad_norm": 0.2947148484215003, "learning_rate": 1.6459276018099547e-05, "loss": 0.2763, "step": 1383 }, { "epoch": 2.1098398169336385, "grad_norm": 0.22163334641849206, "learning_rate": 1.6430995475113122e-05, "loss": 0.2779, "step": 1384 }, { "epoch": 2.1113653699466055, "grad_norm": 0.20751955008691697, "learning_rate": 1.6402714932126697e-05, "loss": 0.2645, "step": 1385 }, { "epoch": 2.112890922959573, "grad_norm": 0.2442520898168622, "learning_rate": 1.637443438914027e-05, "loss": 0.2746, "step": 1386 }, { "epoch": 2.11441647597254, "grad_norm": 0.2554049146891923, "learning_rate": 1.6346153846153847e-05, "loss": 0.2639, "step": 1387 }, { "epoch": 2.1159420289855073, "grad_norm": 0.27273712893023627, "learning_rate": 1.631787330316742e-05, "loss": 0.2824, "step": 1388 }, { "epoch": 2.1174675819984743, "grad_norm": 0.23260371423690915, "learning_rate": 1.6289592760180996e-05, "loss": 0.272, "step": 1389 }, { "epoch": 2.1189931350114417, "grad_norm": 0.29046600557313046, "learning_rate": 1.626131221719457e-05, "loss": 0.2947, "step": 1390 }, { "epoch": 2.1205186880244087, "grad_norm": 0.24702449785111197, "learning_rate": 1.6233031674208146e-05, "loss": 0.2815, "step": 1391 }, { "epoch": 2.122044241037376, "grad_norm": 0.24992915124184917, "learning_rate": 1.620475113122172e-05, "loss": 0.2695, "step": 1392 }, { "epoch": 2.123569794050343, "grad_norm": 0.23211149092231315, "learning_rate": 1.6176470588235296e-05, "loss": 0.2865, "step": 1393 }, { "epoch": 2.1250953470633105, "grad_norm": 0.2206466496805776, "learning_rate": 1.614819004524887e-05, "loss": 0.2579, "step": 1394 }, { "epoch": 2.1266209000762775, "grad_norm": 0.2561733558216656, "learning_rate": 1.6119909502262446e-05, "loss": 0.2889, "step": 1395 }, { "epoch": 2.128146453089245, "grad_norm": 0.43253812738620684, "learning_rate": 1.609162895927602e-05, "loss": 0.2991, "step": 1396 }, { "epoch": 2.129672006102212, "grad_norm": 0.26332431605794593, "learning_rate": 1.6063348416289596e-05, "loss": 0.2833, "step": 1397 }, { "epoch": 2.1311975591151793, "grad_norm": 0.23341930571408043, "learning_rate": 1.603506787330317e-05, "loss": 0.2772, "step": 1398 }, { "epoch": 2.1327231121281462, "grad_norm": 0.26061388421980547, "learning_rate": 1.6006787330316742e-05, "loss": 0.2916, "step": 1399 }, { "epoch": 2.1342486651411137, "grad_norm": 0.2407215467001019, "learning_rate": 1.5978506787330317e-05, "loss": 0.2748, "step": 1400 }, { "epoch": 2.135774218154081, "grad_norm": 0.2309274172523503, "learning_rate": 1.5950226244343892e-05, "loss": 0.2893, "step": 1401 }, { "epoch": 2.137299771167048, "grad_norm": 0.2461570644690458, "learning_rate": 1.5921945701357467e-05, "loss": 0.2878, "step": 1402 }, { "epoch": 2.138825324180015, "grad_norm": 0.26046909342790814, "learning_rate": 1.5893665158371042e-05, "loss": 0.2901, "step": 1403 }, { "epoch": 2.1403508771929824, "grad_norm": 0.2637151641306921, "learning_rate": 1.5865384615384617e-05, "loss": 0.2933, "step": 1404 }, { "epoch": 2.14187643020595, "grad_norm": 0.2335136163723376, "learning_rate": 1.583710407239819e-05, "loss": 0.2862, "step": 1405 }, { "epoch": 2.143401983218917, "grad_norm": 0.24285983965417998, "learning_rate": 1.5808823529411763e-05, "loss": 0.2672, "step": 1406 }, { "epoch": 2.1449275362318843, "grad_norm": 0.23663524283999415, "learning_rate": 1.5780542986425338e-05, "loss": 0.2812, "step": 1407 }, { "epoch": 2.1464530892448512, "grad_norm": 0.239682113824786, "learning_rate": 1.5752262443438913e-05, "loss": 0.2673, "step": 1408 }, { "epoch": 2.1479786422578186, "grad_norm": 0.2662907010037106, "learning_rate": 1.5723981900452488e-05, "loss": 0.2827, "step": 1409 }, { "epoch": 2.1495041952707856, "grad_norm": 0.24555249962129158, "learning_rate": 1.5695701357466063e-05, "loss": 0.2782, "step": 1410 }, { "epoch": 2.151029748283753, "grad_norm": 0.22013055427494768, "learning_rate": 1.5667420814479638e-05, "loss": 0.2733, "step": 1411 }, { "epoch": 2.15255530129672, "grad_norm": 0.2469776209130251, "learning_rate": 1.5639140271493213e-05, "loss": 0.2704, "step": 1412 }, { "epoch": 2.1540808543096874, "grad_norm": 0.26850448328703863, "learning_rate": 1.5610859728506788e-05, "loss": 0.2647, "step": 1413 }, { "epoch": 2.1556064073226544, "grad_norm": 0.21155747391210908, "learning_rate": 1.5582579185520363e-05, "loss": 0.2825, "step": 1414 }, { "epoch": 2.157131960335622, "grad_norm": 0.25395681738056664, "learning_rate": 1.5554298642533938e-05, "loss": 0.2806, "step": 1415 }, { "epoch": 2.158657513348589, "grad_norm": 0.25379126870238866, "learning_rate": 1.5526018099547513e-05, "loss": 0.2836, "step": 1416 }, { "epoch": 2.160183066361556, "grad_norm": 0.2304418662734134, "learning_rate": 1.5497737556561087e-05, "loss": 0.2774, "step": 1417 }, { "epoch": 2.161708619374523, "grad_norm": 0.24099975531541323, "learning_rate": 1.5469457013574662e-05, "loss": 0.2843, "step": 1418 }, { "epoch": 2.1632341723874906, "grad_norm": 0.23194337080206534, "learning_rate": 1.5441176470588237e-05, "loss": 0.2923, "step": 1419 }, { "epoch": 2.1647597254004576, "grad_norm": 0.24666666813440696, "learning_rate": 1.5412895927601812e-05, "loss": 0.2861, "step": 1420 }, { "epoch": 2.166285278413425, "grad_norm": 0.23307035770278123, "learning_rate": 1.5384615384615387e-05, "loss": 0.2752, "step": 1421 }, { "epoch": 2.167810831426392, "grad_norm": 0.22966913567099606, "learning_rate": 1.5356334841628962e-05, "loss": 0.2746, "step": 1422 }, { "epoch": 2.1693363844393594, "grad_norm": 0.2293482576891465, "learning_rate": 1.5328054298642537e-05, "loss": 0.2771, "step": 1423 }, { "epoch": 2.1708619374523264, "grad_norm": 0.2509900978401101, "learning_rate": 1.5299773755656112e-05, "loss": 0.2832, "step": 1424 }, { "epoch": 2.172387490465294, "grad_norm": 0.23741070273562181, "learning_rate": 1.5271493212669687e-05, "loss": 0.2912, "step": 1425 }, { "epoch": 2.1739130434782608, "grad_norm": 0.21230256334202344, "learning_rate": 1.524321266968326e-05, "loss": 0.2851, "step": 1426 }, { "epoch": 2.175438596491228, "grad_norm": 0.26902489361210735, "learning_rate": 1.5214932126696832e-05, "loss": 0.2801, "step": 1427 }, { "epoch": 2.176964149504195, "grad_norm": 0.2431128665138929, "learning_rate": 1.5186651583710407e-05, "loss": 0.2739, "step": 1428 }, { "epoch": 2.1784897025171626, "grad_norm": 0.23734987748085742, "learning_rate": 1.5158371040723981e-05, "loss": 0.2834, "step": 1429 }, { "epoch": 2.1800152555301295, "grad_norm": 0.2299580768669421, "learning_rate": 1.5130090497737556e-05, "loss": 0.274, "step": 1430 }, { "epoch": 2.181540808543097, "grad_norm": 0.2378562648235592, "learning_rate": 1.5101809954751131e-05, "loss": 0.2654, "step": 1431 }, { "epoch": 2.183066361556064, "grad_norm": 0.27624637531738905, "learning_rate": 1.5073529411764706e-05, "loss": 0.2832, "step": 1432 }, { "epoch": 2.1845919145690313, "grad_norm": 0.2339916157355755, "learning_rate": 1.5045248868778281e-05, "loss": 0.2801, "step": 1433 }, { "epoch": 2.1861174675819983, "grad_norm": 0.2367482705611227, "learning_rate": 1.5016968325791856e-05, "loss": 0.2801, "step": 1434 }, { "epoch": 2.1876430205949657, "grad_norm": 0.2459268225034855, "learning_rate": 1.498868778280543e-05, "loss": 0.2665, "step": 1435 }, { "epoch": 2.1891685736079327, "grad_norm": 0.24551925760311752, "learning_rate": 1.4960407239819004e-05, "loss": 0.2879, "step": 1436 }, { "epoch": 2.1906941266209, "grad_norm": 0.2264797458150397, "learning_rate": 1.493212669683258e-05, "loss": 0.2715, "step": 1437 }, { "epoch": 2.192219679633867, "grad_norm": 0.23002371053291934, "learning_rate": 1.4903846153846154e-05, "loss": 0.2766, "step": 1438 }, { "epoch": 2.1937452326468345, "grad_norm": 0.24679208635683614, "learning_rate": 1.4875565610859729e-05, "loss": 0.2861, "step": 1439 }, { "epoch": 2.1952707856598015, "grad_norm": 0.23083031094277295, "learning_rate": 1.4847285067873304e-05, "loss": 0.2684, "step": 1440 }, { "epoch": 2.196796338672769, "grad_norm": 0.23274312205464767, "learning_rate": 1.4819004524886879e-05, "loss": 0.264, "step": 1441 }, { "epoch": 2.198321891685736, "grad_norm": 0.27284997452227566, "learning_rate": 1.4790723981900454e-05, "loss": 0.3044, "step": 1442 }, { "epoch": 2.1998474446987033, "grad_norm": 0.2297517119608719, "learning_rate": 1.4762443438914029e-05, "loss": 0.2779, "step": 1443 }, { "epoch": 2.2013729977116703, "grad_norm": 0.22488293871390355, "learning_rate": 1.4734162895927604e-05, "loss": 0.282, "step": 1444 }, { "epoch": 2.2028985507246377, "grad_norm": 0.2508372791740457, "learning_rate": 1.4705882352941177e-05, "loss": 0.2796, "step": 1445 }, { "epoch": 2.2044241037376047, "grad_norm": 0.19627309014704653, "learning_rate": 1.4677601809954752e-05, "loss": 0.2628, "step": 1446 }, { "epoch": 2.205949656750572, "grad_norm": 0.24702748389528628, "learning_rate": 1.4649321266968327e-05, "loss": 0.2746, "step": 1447 }, { "epoch": 2.2074752097635395, "grad_norm": 0.2624391147753211, "learning_rate": 1.4621040723981902e-05, "loss": 0.2857, "step": 1448 }, { "epoch": 2.2090007627765065, "grad_norm": 0.22275397150270965, "learning_rate": 1.4592760180995477e-05, "loss": 0.2738, "step": 1449 }, { "epoch": 2.2105263157894735, "grad_norm": 0.28177551630850595, "learning_rate": 1.4564479638009051e-05, "loss": 0.2875, "step": 1450 }, { "epoch": 2.212051868802441, "grad_norm": 0.23147595752603684, "learning_rate": 1.4536199095022626e-05, "loss": 0.2847, "step": 1451 }, { "epoch": 2.2135774218154083, "grad_norm": 2.034075901163884, "learning_rate": 1.4507918552036201e-05, "loss": 0.326, "step": 1452 }, { "epoch": 2.2151029748283753, "grad_norm": 0.2855537793458811, "learning_rate": 1.4479638009049776e-05, "loss": 0.283, "step": 1453 }, { "epoch": 2.2166285278413427, "grad_norm": 0.22429325441645034, "learning_rate": 1.4451357466063348e-05, "loss": 0.2797, "step": 1454 }, { "epoch": 2.2181540808543097, "grad_norm": 0.223371791401713, "learning_rate": 1.4423076923076923e-05, "loss": 0.2803, "step": 1455 }, { "epoch": 2.219679633867277, "grad_norm": 0.23149150450017109, "learning_rate": 1.4394796380090498e-05, "loss": 0.2837, "step": 1456 }, { "epoch": 2.221205186880244, "grad_norm": 0.2500179508867957, "learning_rate": 1.4366515837104073e-05, "loss": 0.2971, "step": 1457 }, { "epoch": 2.2227307398932115, "grad_norm": 0.24787261255522033, "learning_rate": 1.4338235294117647e-05, "loss": 0.2933, "step": 1458 }, { "epoch": 2.2242562929061784, "grad_norm": 0.22786776417053106, "learning_rate": 1.430995475113122e-05, "loss": 0.2699, "step": 1459 }, { "epoch": 2.225781845919146, "grad_norm": 0.21186549259061815, "learning_rate": 1.4281674208144796e-05, "loss": 0.2663, "step": 1460 }, { "epoch": 2.227307398932113, "grad_norm": 0.24522221508046638, "learning_rate": 1.425339366515837e-05, "loss": 0.2904, "step": 1461 }, { "epoch": 2.2288329519450802, "grad_norm": 0.23376836185329758, "learning_rate": 1.4225113122171945e-05, "loss": 0.2699, "step": 1462 }, { "epoch": 2.230358504958047, "grad_norm": 0.24711674152334157, "learning_rate": 1.419683257918552e-05, "loss": 0.2791, "step": 1463 }, { "epoch": 2.2318840579710146, "grad_norm": 0.2058521370429759, "learning_rate": 1.4168552036199095e-05, "loss": 0.2923, "step": 1464 }, { "epoch": 2.2334096109839816, "grad_norm": 0.24708866258282694, "learning_rate": 1.414027149321267e-05, "loss": 0.2851, "step": 1465 }, { "epoch": 2.234935163996949, "grad_norm": 0.23393830215248956, "learning_rate": 1.4111990950226245e-05, "loss": 0.2869, "step": 1466 }, { "epoch": 2.236460717009916, "grad_norm": 0.21448615163036913, "learning_rate": 1.408371040723982e-05, "loss": 0.2788, "step": 1467 }, { "epoch": 2.2379862700228834, "grad_norm": 0.21405460503858115, "learning_rate": 1.4055429864253395e-05, "loss": 0.2697, "step": 1468 }, { "epoch": 2.2395118230358504, "grad_norm": 0.21963375653529404, "learning_rate": 1.402714932126697e-05, "loss": 0.2815, "step": 1469 }, { "epoch": 2.241037376048818, "grad_norm": 0.20701502093360719, "learning_rate": 1.3998868778280543e-05, "loss": 0.2842, "step": 1470 }, { "epoch": 2.242562929061785, "grad_norm": 0.2156190714791893, "learning_rate": 1.3970588235294118e-05, "loss": 0.2712, "step": 1471 }, { "epoch": 2.244088482074752, "grad_norm": 0.2319488839586452, "learning_rate": 1.3942307692307693e-05, "loss": 0.2957, "step": 1472 }, { "epoch": 2.245614035087719, "grad_norm": 0.21755187035299067, "learning_rate": 1.3914027149321268e-05, "loss": 0.2778, "step": 1473 }, { "epoch": 2.2471395881006866, "grad_norm": 0.223857951719631, "learning_rate": 1.3885746606334843e-05, "loss": 0.284, "step": 1474 }, { "epoch": 2.2486651411136536, "grad_norm": 0.21433117960338816, "learning_rate": 1.3857466063348418e-05, "loss": 0.2695, "step": 1475 }, { "epoch": 2.250190694126621, "grad_norm": 0.2195220235958117, "learning_rate": 1.3829185520361993e-05, "loss": 0.2614, "step": 1476 }, { "epoch": 2.251716247139588, "grad_norm": 0.20832469293608413, "learning_rate": 1.3800904977375568e-05, "loss": 0.2698, "step": 1477 }, { "epoch": 2.2532418001525554, "grad_norm": 0.21031669156788593, "learning_rate": 1.3772624434389143e-05, "loss": 0.2787, "step": 1478 }, { "epoch": 2.2547673531655223, "grad_norm": 0.22581091968026443, "learning_rate": 1.3744343891402718e-05, "loss": 0.2794, "step": 1479 }, { "epoch": 2.2562929061784898, "grad_norm": 0.2269583248700337, "learning_rate": 1.371606334841629e-05, "loss": 0.2953, "step": 1480 }, { "epoch": 2.2578184591914567, "grad_norm": 0.22192680754047617, "learning_rate": 1.3687782805429866e-05, "loss": 0.277, "step": 1481 }, { "epoch": 2.259344012204424, "grad_norm": 0.23095839199430782, "learning_rate": 1.3659502262443439e-05, "loss": 0.2842, "step": 1482 }, { "epoch": 2.260869565217391, "grad_norm": 0.2273635267848613, "learning_rate": 1.3631221719457014e-05, "loss": 0.2752, "step": 1483 }, { "epoch": 2.2623951182303585, "grad_norm": 0.21679122765843373, "learning_rate": 1.3602941176470587e-05, "loss": 0.2658, "step": 1484 }, { "epoch": 2.2639206712433255, "grad_norm": 0.23030086794204124, "learning_rate": 1.3574660633484162e-05, "loss": 0.2809, "step": 1485 }, { "epoch": 2.265446224256293, "grad_norm": 0.7174875854567854, "learning_rate": 1.3546380090497737e-05, "loss": 0.2776, "step": 1486 }, { "epoch": 2.26697177726926, "grad_norm": 0.2343842993500212, "learning_rate": 1.3518099547511312e-05, "loss": 0.2976, "step": 1487 }, { "epoch": 2.2684973302822273, "grad_norm": 0.23253440978581466, "learning_rate": 1.3489819004524887e-05, "loss": 0.2719, "step": 1488 }, { "epoch": 2.2700228832951943, "grad_norm": 0.22213240513173482, "learning_rate": 1.3461538461538462e-05, "loss": 0.2753, "step": 1489 }, { "epoch": 2.2715484363081617, "grad_norm": 0.25992955101161186, "learning_rate": 1.3433257918552037e-05, "loss": 0.2721, "step": 1490 }, { "epoch": 2.273073989321129, "grad_norm": 0.23200448100291576, "learning_rate": 1.3404977375565612e-05, "loss": 0.2779, "step": 1491 }, { "epoch": 2.274599542334096, "grad_norm": 0.25457460841985785, "learning_rate": 1.3376696832579186e-05, "loss": 0.2766, "step": 1492 }, { "epoch": 2.276125095347063, "grad_norm": 0.23020119933682384, "learning_rate": 1.3348416289592761e-05, "loss": 0.2748, "step": 1493 }, { "epoch": 2.2776506483600305, "grad_norm": 0.2310725584726743, "learning_rate": 1.3320135746606335e-05, "loss": 0.287, "step": 1494 }, { "epoch": 2.279176201372998, "grad_norm": 0.22997170414291762, "learning_rate": 1.329185520361991e-05, "loss": 0.292, "step": 1495 }, { "epoch": 2.280701754385965, "grad_norm": 0.23188833147569224, "learning_rate": 1.3263574660633484e-05, "loss": 0.2986, "step": 1496 }, { "epoch": 2.282227307398932, "grad_norm": 0.22628273318772696, "learning_rate": 1.323529411764706e-05, "loss": 0.2637, "step": 1497 }, { "epoch": 2.2837528604118993, "grad_norm": 0.21818427709323163, "learning_rate": 1.3207013574660634e-05, "loss": 0.2628, "step": 1498 }, { "epoch": 2.2852784134248667, "grad_norm": 0.24587460259103364, "learning_rate": 1.317873303167421e-05, "loss": 0.289, "step": 1499 }, { "epoch": 2.2868039664378337, "grad_norm": 0.22603021060472417, "learning_rate": 1.3150452488687784e-05, "loss": 0.2825, "step": 1500 }, { "epoch": 2.288329519450801, "grad_norm": 0.20630947736367092, "learning_rate": 1.3122171945701359e-05, "loss": 0.2567, "step": 1501 }, { "epoch": 2.289855072463768, "grad_norm": 0.21545579084039768, "learning_rate": 1.3093891402714934e-05, "loss": 0.2802, "step": 1502 }, { "epoch": 2.2913806254767355, "grad_norm": 0.23039800897546536, "learning_rate": 1.3065610859728509e-05, "loss": 0.3018, "step": 1503 }, { "epoch": 2.2929061784897025, "grad_norm": 0.23306790221076523, "learning_rate": 1.3037330316742082e-05, "loss": 0.2817, "step": 1504 }, { "epoch": 2.29443173150267, "grad_norm": 0.23047086173860734, "learning_rate": 1.3009049773755657e-05, "loss": 0.2871, "step": 1505 }, { "epoch": 2.295957284515637, "grad_norm": 0.2287648560040637, "learning_rate": 1.2980769230769232e-05, "loss": 0.2901, "step": 1506 }, { "epoch": 2.2974828375286043, "grad_norm": 0.2266286815746821, "learning_rate": 1.2952488687782807e-05, "loss": 0.2941, "step": 1507 }, { "epoch": 2.2990083905415712, "grad_norm": 0.2035436780237056, "learning_rate": 1.2924208144796382e-05, "loss": 0.2606, "step": 1508 }, { "epoch": 2.3005339435545387, "grad_norm": 0.22790277017849925, "learning_rate": 1.2895927601809957e-05, "loss": 0.2682, "step": 1509 }, { "epoch": 2.3020594965675056, "grad_norm": 0.2565359206670967, "learning_rate": 1.2867647058823528e-05, "loss": 0.2783, "step": 1510 }, { "epoch": 2.303585049580473, "grad_norm": 0.23042376651469026, "learning_rate": 1.2839366515837103e-05, "loss": 0.279, "step": 1511 }, { "epoch": 2.30511060259344, "grad_norm": 0.2371001865571473, "learning_rate": 1.2811085972850678e-05, "loss": 0.2855, "step": 1512 }, { "epoch": 2.3066361556064074, "grad_norm": 0.2150119840229643, "learning_rate": 1.2782805429864253e-05, "loss": 0.2795, "step": 1513 }, { "epoch": 2.3081617086193744, "grad_norm": 0.2314958966404437, "learning_rate": 1.2754524886877828e-05, "loss": 0.2669, "step": 1514 }, { "epoch": 2.309687261632342, "grad_norm": 0.22813143582482615, "learning_rate": 1.2726244343891403e-05, "loss": 0.2929, "step": 1515 }, { "epoch": 2.311212814645309, "grad_norm": 0.24102202920644533, "learning_rate": 1.2697963800904978e-05, "loss": 0.2745, "step": 1516 }, { "epoch": 2.3127383676582762, "grad_norm": 0.23091059480613566, "learning_rate": 1.2669683257918553e-05, "loss": 0.2854, "step": 1517 }, { "epoch": 2.314263920671243, "grad_norm": 0.21988571272168464, "learning_rate": 1.2641402714932126e-05, "loss": 0.2661, "step": 1518 }, { "epoch": 2.3157894736842106, "grad_norm": 0.2788152643128347, "learning_rate": 1.2613122171945701e-05, "loss": 0.2766, "step": 1519 }, { "epoch": 2.3173150266971776, "grad_norm": 0.22481152496251075, "learning_rate": 1.2584841628959276e-05, "loss": 0.273, "step": 1520 }, { "epoch": 2.318840579710145, "grad_norm": 0.2712841522600814, "learning_rate": 1.255656108597285e-05, "loss": 0.2793, "step": 1521 }, { "epoch": 2.320366132723112, "grad_norm": 0.23468591592157814, "learning_rate": 1.2528280542986426e-05, "loss": 0.2743, "step": 1522 }, { "epoch": 2.3218916857360794, "grad_norm": 0.23943358873151716, "learning_rate": 1.25e-05, "loss": 0.2898, "step": 1523 }, { "epoch": 2.3234172387490464, "grad_norm": 0.2195127517558412, "learning_rate": 1.2471719457013576e-05, "loss": 0.2778, "step": 1524 }, { "epoch": 2.324942791762014, "grad_norm": 0.22543897487193973, "learning_rate": 1.244343891402715e-05, "loss": 0.2841, "step": 1525 }, { "epoch": 2.3264683447749808, "grad_norm": 0.23200938429865844, "learning_rate": 1.2415158371040725e-05, "loss": 0.2759, "step": 1526 }, { "epoch": 2.327993897787948, "grad_norm": 0.22080851567928816, "learning_rate": 1.23868778280543e-05, "loss": 0.2832, "step": 1527 }, { "epoch": 2.329519450800915, "grad_norm": 0.21022630455630478, "learning_rate": 1.2358597285067875e-05, "loss": 0.2721, "step": 1528 }, { "epoch": 2.3310450038138826, "grad_norm": 0.23780820146984896, "learning_rate": 1.2330316742081448e-05, "loss": 0.2851, "step": 1529 }, { "epoch": 2.3325705568268496, "grad_norm": 0.2317196325895635, "learning_rate": 1.2302036199095023e-05, "loss": 0.2613, "step": 1530 }, { "epoch": 2.334096109839817, "grad_norm": 0.21968869736864108, "learning_rate": 1.2273755656108597e-05, "loss": 0.2958, "step": 1531 }, { "epoch": 2.335621662852784, "grad_norm": 0.2246082843735646, "learning_rate": 1.2245475113122172e-05, "loss": 0.2951, "step": 1532 }, { "epoch": 2.3371472158657514, "grad_norm": 0.24311846588984623, "learning_rate": 1.2217194570135746e-05, "loss": 0.2866, "step": 1533 }, { "epoch": 2.3386727688787188, "grad_norm": 0.22546931098266407, "learning_rate": 1.2188914027149321e-05, "loss": 0.2752, "step": 1534 }, { "epoch": 2.3401983218916858, "grad_norm": 0.2085668357687971, "learning_rate": 1.2160633484162896e-05, "loss": 0.2562, "step": 1535 }, { "epoch": 2.3417238749046527, "grad_norm": 0.19726908315950453, "learning_rate": 1.2132352941176471e-05, "loss": 0.2658, "step": 1536 }, { "epoch": 2.34324942791762, "grad_norm": 0.23174538950608564, "learning_rate": 1.2104072398190046e-05, "loss": 0.2797, "step": 1537 }, { "epoch": 2.3447749809305876, "grad_norm": 0.22880239226975388, "learning_rate": 1.2075791855203621e-05, "loss": 0.2805, "step": 1538 }, { "epoch": 2.3463005339435545, "grad_norm": 0.21316684722484283, "learning_rate": 1.2047511312217196e-05, "loss": 0.274, "step": 1539 }, { "epoch": 2.3478260869565215, "grad_norm": 0.23052518697599234, "learning_rate": 1.2019230769230771e-05, "loss": 0.2746, "step": 1540 }, { "epoch": 2.349351639969489, "grad_norm": 0.23254598468137908, "learning_rate": 1.1990950226244344e-05, "loss": 0.2805, "step": 1541 }, { "epoch": 2.3508771929824563, "grad_norm": 0.23088626129823808, "learning_rate": 1.1962669683257919e-05, "loss": 0.2819, "step": 1542 }, { "epoch": 2.3524027459954233, "grad_norm": 0.21566472225955918, "learning_rate": 1.1934389140271494e-05, "loss": 0.2861, "step": 1543 }, { "epoch": 2.3539282990083907, "grad_norm": 0.2250469810638243, "learning_rate": 1.1906108597285067e-05, "loss": 0.2974, "step": 1544 }, { "epoch": 2.3554538520213577, "grad_norm": 0.21771062743600905, "learning_rate": 1.1877828054298642e-05, "loss": 0.2784, "step": 1545 }, { "epoch": 2.356979405034325, "grad_norm": 0.2240410078013881, "learning_rate": 1.1849547511312217e-05, "loss": 0.2783, "step": 1546 }, { "epoch": 2.358504958047292, "grad_norm": 0.24260482144964807, "learning_rate": 1.1821266968325792e-05, "loss": 0.2873, "step": 1547 }, { "epoch": 2.3600305110602595, "grad_norm": 0.27239810119573704, "learning_rate": 1.1792986425339367e-05, "loss": 0.2978, "step": 1548 }, { "epoch": 2.3615560640732265, "grad_norm": 0.21167713517556228, "learning_rate": 1.1764705882352942e-05, "loss": 0.2835, "step": 1549 }, { "epoch": 2.363081617086194, "grad_norm": 0.2540281850807561, "learning_rate": 1.1736425339366517e-05, "loss": 0.2891, "step": 1550 }, { "epoch": 2.364607170099161, "grad_norm": 0.2725801523911438, "learning_rate": 1.1708144796380092e-05, "loss": 0.273, "step": 1551 }, { "epoch": 2.3661327231121283, "grad_norm": 0.2354257556544511, "learning_rate": 1.1679864253393667e-05, "loss": 0.2763, "step": 1552 }, { "epoch": 2.3676582761250953, "grad_norm": 0.23064933375881416, "learning_rate": 1.165158371040724e-05, "loss": 0.2779, "step": 1553 }, { "epoch": 2.3691838291380627, "grad_norm": 0.25995678988306425, "learning_rate": 1.1623303167420815e-05, "loss": 0.275, "step": 1554 }, { "epoch": 2.3707093821510297, "grad_norm": 0.24245427461968447, "learning_rate": 1.159502262443439e-05, "loss": 0.288, "step": 1555 }, { "epoch": 2.372234935163997, "grad_norm": 0.9054347787678712, "learning_rate": 1.1566742081447965e-05, "loss": 0.2938, "step": 1556 }, { "epoch": 2.373760488176964, "grad_norm": 0.2219899482345533, "learning_rate": 1.153846153846154e-05, "loss": 0.2742, "step": 1557 }, { "epoch": 2.3752860411899315, "grad_norm": 0.23997615580180914, "learning_rate": 1.1510180995475113e-05, "loss": 0.2859, "step": 1558 }, { "epoch": 2.3768115942028984, "grad_norm": 0.2630987310503776, "learning_rate": 1.1481900452488688e-05, "loss": 0.3044, "step": 1559 }, { "epoch": 2.378337147215866, "grad_norm": 0.21536160885536643, "learning_rate": 1.1453619909502263e-05, "loss": 0.2602, "step": 1560 }, { "epoch": 2.379862700228833, "grad_norm": 0.20979202921779957, "learning_rate": 1.1425339366515838e-05, "loss": 0.2734, "step": 1561 }, { "epoch": 2.3813882532418003, "grad_norm": 0.2205194268981048, "learning_rate": 1.1397058823529412e-05, "loss": 0.2915, "step": 1562 }, { "epoch": 2.3829138062547672, "grad_norm": 0.20961233156939493, "learning_rate": 1.1368778280542987e-05, "loss": 0.266, "step": 1563 }, { "epoch": 2.3844393592677346, "grad_norm": 0.21385766865899664, "learning_rate": 1.1340497737556562e-05, "loss": 0.275, "step": 1564 }, { "epoch": 2.3859649122807016, "grad_norm": 0.2261935019536189, "learning_rate": 1.1312217194570136e-05, "loss": 0.2878, "step": 1565 }, { "epoch": 2.387490465293669, "grad_norm": 0.22956233637004675, "learning_rate": 1.128393665158371e-05, "loss": 0.273, "step": 1566 }, { "epoch": 2.389016018306636, "grad_norm": 0.2096052534395747, "learning_rate": 1.1255656108597285e-05, "loss": 0.2707, "step": 1567 }, { "epoch": 2.3905415713196034, "grad_norm": 0.2207256116252762, "learning_rate": 1.122737556561086e-05, "loss": 0.2877, "step": 1568 }, { "epoch": 2.3920671243325704, "grad_norm": 0.22516661890456555, "learning_rate": 1.1199095022624435e-05, "loss": 0.2877, "step": 1569 }, { "epoch": 2.393592677345538, "grad_norm": 0.25797947047085756, "learning_rate": 1.117081447963801e-05, "loss": 0.2779, "step": 1570 }, { "epoch": 2.395118230358505, "grad_norm": 0.23690792705377064, "learning_rate": 1.1142533936651585e-05, "loss": 0.2837, "step": 1571 }, { "epoch": 2.396643783371472, "grad_norm": 0.2325374833929267, "learning_rate": 1.1114253393665158e-05, "loss": 0.2856, "step": 1572 }, { "epoch": 2.398169336384439, "grad_norm": 0.19965605750520127, "learning_rate": 1.1085972850678733e-05, "loss": 0.2731, "step": 1573 }, { "epoch": 2.3996948893974066, "grad_norm": 0.22956518174866034, "learning_rate": 1.1057692307692308e-05, "loss": 0.2859, "step": 1574 }, { "epoch": 2.4012204424103736, "grad_norm": 0.22866012046976975, "learning_rate": 1.1029411764705883e-05, "loss": 0.2933, "step": 1575 }, { "epoch": 2.402745995423341, "grad_norm": 0.23398779926310673, "learning_rate": 1.1001131221719458e-05, "loss": 0.2865, "step": 1576 }, { "epoch": 2.404271548436308, "grad_norm": 0.22748277110674286, "learning_rate": 1.0972850678733031e-05, "loss": 0.2787, "step": 1577 }, { "epoch": 2.4057971014492754, "grad_norm": 0.2412163106931157, "learning_rate": 1.0944570135746606e-05, "loss": 0.2806, "step": 1578 }, { "epoch": 2.4073226544622424, "grad_norm": 0.21074005487406564, "learning_rate": 1.0916289592760181e-05, "loss": 0.2716, "step": 1579 }, { "epoch": 2.40884820747521, "grad_norm": 0.25163818633551793, "learning_rate": 1.0888009049773756e-05, "loss": 0.3027, "step": 1580 }, { "epoch": 2.410373760488177, "grad_norm": 0.24525589853590377, "learning_rate": 1.0859728506787331e-05, "loss": 0.2695, "step": 1581 }, { "epoch": 2.411899313501144, "grad_norm": 0.22323212864506395, "learning_rate": 1.0831447963800906e-05, "loss": 0.2822, "step": 1582 }, { "epoch": 2.413424866514111, "grad_norm": 0.2094506848925692, "learning_rate": 1.080316742081448e-05, "loss": 0.2744, "step": 1583 }, { "epoch": 2.4149504195270786, "grad_norm": 0.22329956173671808, "learning_rate": 1.0774886877828056e-05, "loss": 0.2649, "step": 1584 }, { "epoch": 2.416475972540046, "grad_norm": 0.22397274956891738, "learning_rate": 1.074660633484163e-05, "loss": 0.2776, "step": 1585 }, { "epoch": 2.418001525553013, "grad_norm": 0.22272462209354393, "learning_rate": 1.0718325791855204e-05, "loss": 0.2703, "step": 1586 }, { "epoch": 2.41952707856598, "grad_norm": 0.24365643308425242, "learning_rate": 1.0690045248868779e-05, "loss": 0.2828, "step": 1587 }, { "epoch": 2.4210526315789473, "grad_norm": 0.21105302604427462, "learning_rate": 1.0661764705882354e-05, "loss": 0.2857, "step": 1588 }, { "epoch": 2.4225781845919148, "grad_norm": 0.21485671654699434, "learning_rate": 1.0633484162895929e-05, "loss": 0.2755, "step": 1589 }, { "epoch": 2.4241037376048817, "grad_norm": 0.22545201164922105, "learning_rate": 1.0605203619909502e-05, "loss": 0.2719, "step": 1590 }, { "epoch": 2.425629290617849, "grad_norm": 0.21974496403937652, "learning_rate": 1.0576923076923077e-05, "loss": 0.2771, "step": 1591 }, { "epoch": 2.427154843630816, "grad_norm": 0.2608933103377383, "learning_rate": 1.0548642533936652e-05, "loss": 0.2945, "step": 1592 }, { "epoch": 2.4286803966437835, "grad_norm": 0.2226670214532915, "learning_rate": 1.0520361990950227e-05, "loss": 0.2793, "step": 1593 }, { "epoch": 2.4302059496567505, "grad_norm": 0.2091229419246102, "learning_rate": 1.0492081447963802e-05, "loss": 0.2812, "step": 1594 }, { "epoch": 2.431731502669718, "grad_norm": 0.20893135972935403, "learning_rate": 1.0463800904977376e-05, "loss": 0.2723, "step": 1595 }, { "epoch": 2.433257055682685, "grad_norm": 0.20369933553542627, "learning_rate": 1.0435520361990951e-05, "loss": 0.2734, "step": 1596 }, { "epoch": 2.4347826086956523, "grad_norm": 0.22616991406707043, "learning_rate": 1.0407239819004526e-05, "loss": 0.2754, "step": 1597 }, { "epoch": 2.4363081617086193, "grad_norm": 0.21969105322616525, "learning_rate": 1.0378959276018101e-05, "loss": 0.2825, "step": 1598 }, { "epoch": 2.4378337147215867, "grad_norm": 0.2108241673080586, "learning_rate": 1.0350678733031674e-05, "loss": 0.2657, "step": 1599 }, { "epoch": 2.4393592677345537, "grad_norm": 0.25407085662657747, "learning_rate": 1.032239819004525e-05, "loss": 0.2829, "step": 1600 }, { "epoch": 2.440884820747521, "grad_norm": 0.20512239182614364, "learning_rate": 1.0294117647058824e-05, "loss": 0.269, "step": 1601 }, { "epoch": 2.442410373760488, "grad_norm": 0.20738367551053763, "learning_rate": 1.0265837104072398e-05, "loss": 0.268, "step": 1602 }, { "epoch": 2.4439359267734555, "grad_norm": 0.25344701789653806, "learning_rate": 1.0237556561085972e-05, "loss": 0.2818, "step": 1603 }, { "epoch": 2.4454614797864225, "grad_norm": 0.2121148766638685, "learning_rate": 1.0209276018099547e-05, "loss": 0.2721, "step": 1604 }, { "epoch": 2.44698703279939, "grad_norm": 0.22610766759556997, "learning_rate": 1.0180995475113122e-05, "loss": 0.2777, "step": 1605 }, { "epoch": 2.448512585812357, "grad_norm": 0.2140889198060139, "learning_rate": 1.0152714932126697e-05, "loss": 0.2736, "step": 1606 }, { "epoch": 2.4500381388253243, "grad_norm": 0.21650728028910263, "learning_rate": 1.0124434389140272e-05, "loss": 0.2662, "step": 1607 }, { "epoch": 2.4515636918382913, "grad_norm": 0.22028002848624273, "learning_rate": 1.0096153846153847e-05, "loss": 0.2804, "step": 1608 }, { "epoch": 2.4530892448512587, "grad_norm": 0.22929668203786552, "learning_rate": 1.0067873303167422e-05, "loss": 0.2789, "step": 1609 }, { "epoch": 2.4546147978642257, "grad_norm": 0.25338835823737005, "learning_rate": 1.0039592760180997e-05, "loss": 0.2869, "step": 1610 }, { "epoch": 2.456140350877193, "grad_norm": 0.23569650661617936, "learning_rate": 1.0011312217194572e-05, "loss": 0.2775, "step": 1611 }, { "epoch": 2.45766590389016, "grad_norm": 1.4751962051640848, "learning_rate": 9.983031674208145e-06, "loss": 0.3111, "step": 1612 }, { "epoch": 2.4591914569031275, "grad_norm": 0.23171621264063458, "learning_rate": 9.95475113122172e-06, "loss": 0.2903, "step": 1613 }, { "epoch": 2.4607170099160944, "grad_norm": 0.23310742188734973, "learning_rate": 9.926470588235293e-06, "loss": 0.2849, "step": 1614 }, { "epoch": 2.462242562929062, "grad_norm": 0.22435922122528862, "learning_rate": 9.898190045248868e-06, "loss": 0.281, "step": 1615 }, { "epoch": 2.463768115942029, "grad_norm": 0.20361771186250674, "learning_rate": 9.869909502262443e-06, "loss": 0.283, "step": 1616 }, { "epoch": 2.4652936689549962, "grad_norm": 0.22117455686723328, "learning_rate": 9.841628959276018e-06, "loss": 0.2769, "step": 1617 }, { "epoch": 2.466819221967963, "grad_norm": 0.20824066255874457, "learning_rate": 9.813348416289593e-06, "loss": 0.2793, "step": 1618 }, { "epoch": 2.4683447749809306, "grad_norm": 0.21895104942382138, "learning_rate": 9.785067873303168e-06, "loss": 0.2704, "step": 1619 }, { "epoch": 2.4698703279938976, "grad_norm": 0.23251137545692332, "learning_rate": 9.756787330316743e-06, "loss": 0.2799, "step": 1620 }, { "epoch": 2.471395881006865, "grad_norm": 0.20722757600155256, "learning_rate": 9.728506787330318e-06, "loss": 0.2653, "step": 1621 }, { "epoch": 2.472921434019832, "grad_norm": 0.20090029938133547, "learning_rate": 9.700226244343893e-06, "loss": 0.2801, "step": 1622 }, { "epoch": 2.4744469870327994, "grad_norm": 0.21314887618225944, "learning_rate": 9.671945701357468e-06, "loss": 0.2791, "step": 1623 }, { "epoch": 2.475972540045767, "grad_norm": 0.2168917753661417, "learning_rate": 9.64366515837104e-06, "loss": 0.2794, "step": 1624 }, { "epoch": 2.477498093058734, "grad_norm": 0.2237030834568368, "learning_rate": 9.615384615384616e-06, "loss": 0.2785, "step": 1625 }, { "epoch": 2.479023646071701, "grad_norm": 0.247781773414794, "learning_rate": 9.58710407239819e-06, "loss": 0.2986, "step": 1626 }, { "epoch": 2.480549199084668, "grad_norm": 0.21307162988079187, "learning_rate": 9.558823529411764e-06, "loss": 0.2897, "step": 1627 }, { "epoch": 2.4820747520976356, "grad_norm": 0.20812957726000034, "learning_rate": 9.530542986425339e-06, "loss": 0.2747, "step": 1628 }, { "epoch": 2.4836003051106026, "grad_norm": 0.24222963171829934, "learning_rate": 9.502262443438914e-06, "loss": 0.2868, "step": 1629 }, { "epoch": 2.4851258581235696, "grad_norm": 0.22234485681908805, "learning_rate": 9.473981900452489e-06, "loss": 0.2836, "step": 1630 }, { "epoch": 2.486651411136537, "grad_norm": 0.2275796232607693, "learning_rate": 9.445701357466064e-06, "loss": 0.2671, "step": 1631 }, { "epoch": 2.4881769641495044, "grad_norm": 0.20920067874200737, "learning_rate": 9.417420814479638e-06, "loss": 0.2956, "step": 1632 }, { "epoch": 2.4897025171624714, "grad_norm": 0.22510144361920553, "learning_rate": 9.389140271493213e-06, "loss": 0.2844, "step": 1633 }, { "epoch": 2.4912280701754383, "grad_norm": 0.23125607261356362, "learning_rate": 9.360859728506788e-06, "loss": 0.2737, "step": 1634 }, { "epoch": 2.4927536231884058, "grad_norm": 0.21342024912342322, "learning_rate": 9.332579185520363e-06, "loss": 0.2739, "step": 1635 }, { "epoch": 2.494279176201373, "grad_norm": 0.22085282019714988, "learning_rate": 9.304298642533938e-06, "loss": 0.2961, "step": 1636 }, { "epoch": 2.49580472921434, "grad_norm": 0.20739741374402657, "learning_rate": 9.276018099547511e-06, "loss": 0.2582, "step": 1637 }, { "epoch": 2.4973302822273076, "grad_norm": 0.22918602056747625, "learning_rate": 9.247737556561086e-06, "loss": 0.2747, "step": 1638 }, { "epoch": 2.4988558352402745, "grad_norm": 0.21242933991411847, "learning_rate": 9.219457013574661e-06, "loss": 0.2862, "step": 1639 }, { "epoch": 2.500381388253242, "grad_norm": 0.19739253681137733, "learning_rate": 9.191176470588236e-06, "loss": 0.2707, "step": 1640 }, { "epoch": 2.501906941266209, "grad_norm": 0.21899971639244328, "learning_rate": 9.16289592760181e-06, "loss": 0.2891, "step": 1641 }, { "epoch": 2.5034324942791764, "grad_norm": 0.21091842219889204, "learning_rate": 9.134615384615384e-06, "loss": 0.2691, "step": 1642 }, { "epoch": 2.5049580472921433, "grad_norm": 0.8696874649563884, "learning_rate": 9.10633484162896e-06, "loss": 0.3067, "step": 1643 }, { "epoch": 2.5064836003051107, "grad_norm": 0.2103590288583414, "learning_rate": 9.078054298642534e-06, "loss": 0.2833, "step": 1644 }, { "epoch": 2.5080091533180777, "grad_norm": 0.2152070505222916, "learning_rate": 9.049773755656109e-06, "loss": 0.2854, "step": 1645 }, { "epoch": 2.509534706331045, "grad_norm": 0.2144087457060262, "learning_rate": 9.021493212669684e-06, "loss": 0.2666, "step": 1646 }, { "epoch": 2.511060259344012, "grad_norm": 0.2400116681635908, "learning_rate": 8.993212669683259e-06, "loss": 0.2865, "step": 1647 }, { "epoch": 2.5125858123569795, "grad_norm": 0.20455058135481066, "learning_rate": 8.964932126696834e-06, "loss": 0.2729, "step": 1648 }, { "epoch": 2.5141113653699465, "grad_norm": 0.22406208581175688, "learning_rate": 8.936651583710407e-06, "loss": 0.2795, "step": 1649 }, { "epoch": 2.515636918382914, "grad_norm": 0.21556740450614334, "learning_rate": 8.908371040723982e-06, "loss": 0.2677, "step": 1650 }, { "epoch": 2.517162471395881, "grad_norm": 0.22727091370614894, "learning_rate": 8.880090497737557e-06, "loss": 0.2818, "step": 1651 }, { "epoch": 2.5186880244088483, "grad_norm": 0.2128912330614817, "learning_rate": 8.851809954751132e-06, "loss": 0.2779, "step": 1652 }, { "epoch": 2.5202135774218153, "grad_norm": 0.21023970707835585, "learning_rate": 8.823529411764707e-06, "loss": 0.2809, "step": 1653 }, { "epoch": 2.5217391304347827, "grad_norm": 0.22227854074254327, "learning_rate": 8.795248868778282e-06, "loss": 0.2864, "step": 1654 }, { "epoch": 2.5232646834477497, "grad_norm": 0.21968112742475057, "learning_rate": 8.766968325791855e-06, "loss": 0.2861, "step": 1655 }, { "epoch": 2.524790236460717, "grad_norm": 0.1993922589175035, "learning_rate": 8.73868778280543e-06, "loss": 0.2567, "step": 1656 }, { "epoch": 2.526315789473684, "grad_norm": 0.22437610053250978, "learning_rate": 8.710407239819005e-06, "loss": 0.3005, "step": 1657 }, { "epoch": 2.5278413424866515, "grad_norm": 0.21321978029893912, "learning_rate": 8.68212669683258e-06, "loss": 0.2756, "step": 1658 }, { "epoch": 2.5293668954996185, "grad_norm": 0.22503607913636978, "learning_rate": 8.653846153846155e-06, "loss": 0.3074, "step": 1659 }, { "epoch": 2.530892448512586, "grad_norm": 0.20346364263146385, "learning_rate": 8.62556561085973e-06, "loss": 0.2778, "step": 1660 }, { "epoch": 2.532418001525553, "grad_norm": 0.22532825048713948, "learning_rate": 8.597285067873303e-06, "loss": 0.2936, "step": 1661 }, { "epoch": 2.5339435545385203, "grad_norm": 0.2083581817731538, "learning_rate": 8.569004524886878e-06, "loss": 0.2798, "step": 1662 }, { "epoch": 2.5354691075514877, "grad_norm": 0.2151782503432388, "learning_rate": 8.540723981900453e-06, "loss": 0.2748, "step": 1663 }, { "epoch": 2.5369946605644547, "grad_norm": 0.2147982750728022, "learning_rate": 8.512443438914028e-06, "loss": 0.2656, "step": 1664 }, { "epoch": 2.5385202135774216, "grad_norm": 0.22080275090221005, "learning_rate": 8.484162895927603e-06, "loss": 0.2883, "step": 1665 }, { "epoch": 2.540045766590389, "grad_norm": 0.21010358858867292, "learning_rate": 8.455882352941177e-06, "loss": 0.2632, "step": 1666 }, { "epoch": 2.5415713196033565, "grad_norm": 0.21615071310922732, "learning_rate": 8.427601809954752e-06, "loss": 0.2764, "step": 1667 }, { "epoch": 2.5430968726163234, "grad_norm": 0.22198761072634826, "learning_rate": 8.399321266968327e-06, "loss": 0.2842, "step": 1668 }, { "epoch": 2.5446224256292904, "grad_norm": 0.2407002396674416, "learning_rate": 8.3710407239819e-06, "loss": 0.2926, "step": 1669 }, { "epoch": 2.546147978642258, "grad_norm": 0.23641029748349152, "learning_rate": 8.342760180995475e-06, "loss": 0.2829, "step": 1670 }, { "epoch": 2.5476735316552253, "grad_norm": 0.21273823822196347, "learning_rate": 8.31447963800905e-06, "loss": 0.2839, "step": 1671 }, { "epoch": 2.5491990846681922, "grad_norm": 0.1916673440388943, "learning_rate": 8.286199095022625e-06, "loss": 0.2562, "step": 1672 }, { "epoch": 2.550724637681159, "grad_norm": 0.2353159368540045, "learning_rate": 8.257918552036199e-06, "loss": 0.2764, "step": 1673 }, { "epoch": 2.5522501906941266, "grad_norm": 0.22461499459138665, "learning_rate": 8.229638009049773e-06, "loss": 0.286, "step": 1674 }, { "epoch": 2.553775743707094, "grad_norm": 0.21460945792715486, "learning_rate": 8.201357466063348e-06, "loss": 0.2858, "step": 1675 }, { "epoch": 2.555301296720061, "grad_norm": 0.20290621967862554, "learning_rate": 8.173076923076923e-06, "loss": 0.2789, "step": 1676 }, { "epoch": 2.556826849733028, "grad_norm": 0.22463123324000495, "learning_rate": 8.144796380090498e-06, "loss": 0.2669, "step": 1677 }, { "epoch": 2.5583524027459954, "grad_norm": 0.2524613402910334, "learning_rate": 8.116515837104073e-06, "loss": 0.282, "step": 1678 }, { "epoch": 2.559877955758963, "grad_norm": 0.22144031366187167, "learning_rate": 8.088235294117648e-06, "loss": 0.283, "step": 1679 }, { "epoch": 2.56140350877193, "grad_norm": 0.23576477110034597, "learning_rate": 8.059954751131223e-06, "loss": 0.2989, "step": 1680 }, { "epoch": 2.5629290617848968, "grad_norm": 0.2061294229474491, "learning_rate": 8.031674208144798e-06, "loss": 0.2703, "step": 1681 }, { "epoch": 2.564454614797864, "grad_norm": 0.21331153011443713, "learning_rate": 8.003393665158371e-06, "loss": 0.2884, "step": 1682 }, { "epoch": 2.5659801678108316, "grad_norm": 0.22341955682156556, "learning_rate": 7.975113122171946e-06, "loss": 0.2891, "step": 1683 }, { "epoch": 2.5675057208237986, "grad_norm": 0.20969484549760975, "learning_rate": 7.946832579185521e-06, "loss": 0.2931, "step": 1684 }, { "epoch": 2.569031273836766, "grad_norm": 0.20904178558815312, "learning_rate": 7.918552036199094e-06, "loss": 0.2846, "step": 1685 }, { "epoch": 2.570556826849733, "grad_norm": 0.20849443045669935, "learning_rate": 7.890271493212669e-06, "loss": 0.2683, "step": 1686 }, { "epoch": 2.5720823798627004, "grad_norm": 0.206437277283606, "learning_rate": 7.861990950226244e-06, "loss": 0.2823, "step": 1687 }, { "epoch": 2.5736079328756674, "grad_norm": 0.21375535243276647, "learning_rate": 7.833710407239819e-06, "loss": 0.2659, "step": 1688 }, { "epoch": 2.5751334858886348, "grad_norm": 0.19695856697420835, "learning_rate": 7.805429864253394e-06, "loss": 0.2652, "step": 1689 }, { "epoch": 2.5766590389016018, "grad_norm": 0.2137647597436359, "learning_rate": 7.777149321266969e-06, "loss": 0.2809, "step": 1690 }, { "epoch": 2.578184591914569, "grad_norm": 0.21115519767081306, "learning_rate": 7.748868778280544e-06, "loss": 0.2882, "step": 1691 }, { "epoch": 2.579710144927536, "grad_norm": 0.2276606740858827, "learning_rate": 7.720588235294119e-06, "loss": 0.2978, "step": 1692 }, { "epoch": 2.5812356979405036, "grad_norm": 0.20445555673247845, "learning_rate": 7.692307692307694e-06, "loss": 0.283, "step": 1693 }, { "epoch": 2.5827612509534705, "grad_norm": 0.22587904888562277, "learning_rate": 7.664027149321269e-06, "loss": 0.274, "step": 1694 }, { "epoch": 2.584286803966438, "grad_norm": 0.2113243182759978, "learning_rate": 7.635746606334843e-06, "loss": 0.2832, "step": 1695 }, { "epoch": 2.585812356979405, "grad_norm": 0.2063796058202304, "learning_rate": 7.607466063348416e-06, "loss": 0.2676, "step": 1696 }, { "epoch": 2.5873379099923723, "grad_norm": 0.20447609170466327, "learning_rate": 7.579185520361991e-06, "loss": 0.2704, "step": 1697 }, { "epoch": 2.5888634630053393, "grad_norm": 0.24012102317457498, "learning_rate": 7.550904977375566e-06, "loss": 0.2935, "step": 1698 }, { "epoch": 2.5903890160183067, "grad_norm": 0.19906035784035253, "learning_rate": 7.522624434389141e-06, "loss": 0.2836, "step": 1699 }, { "epoch": 2.5919145690312737, "grad_norm": 0.20687048470744934, "learning_rate": 7.494343891402715e-06, "loss": 0.2865, "step": 1700 }, { "epoch": 2.593440122044241, "grad_norm": 0.2219547564364429, "learning_rate": 7.46606334841629e-06, "loss": 0.284, "step": 1701 }, { "epoch": 2.594965675057208, "grad_norm": 0.2246060481954018, "learning_rate": 7.4377828054298645e-06, "loss": 0.2784, "step": 1702 }, { "epoch": 2.5964912280701755, "grad_norm": 0.2396925336780812, "learning_rate": 7.4095022624434394e-06, "loss": 0.3026, "step": 1703 }, { "epoch": 2.5980167810831425, "grad_norm": 0.22288474719934917, "learning_rate": 7.381221719457014e-06, "loss": 0.2836, "step": 1704 }, { "epoch": 2.59954233409611, "grad_norm": 0.20799162681162964, "learning_rate": 7.3529411764705884e-06, "loss": 0.2881, "step": 1705 }, { "epoch": 2.601067887109077, "grad_norm": 0.20474660439741385, "learning_rate": 7.324660633484163e-06, "loss": 0.2668, "step": 1706 }, { "epoch": 2.6025934401220443, "grad_norm": 0.19704940878121494, "learning_rate": 7.296380090497738e-06, "loss": 0.2778, "step": 1707 }, { "epoch": 2.6041189931350113, "grad_norm": 0.19313428912060626, "learning_rate": 7.268099547511313e-06, "loss": 0.26, "step": 1708 }, { "epoch": 2.6056445461479787, "grad_norm": 0.21479438064214473, "learning_rate": 7.239819004524888e-06, "loss": 0.2773, "step": 1709 }, { "epoch": 2.607170099160946, "grad_norm": 0.21823269385482522, "learning_rate": 7.211538461538461e-06, "loss": 0.2692, "step": 1710 }, { "epoch": 2.608695652173913, "grad_norm": 0.23323027816574532, "learning_rate": 7.183257918552036e-06, "loss": 0.2782, "step": 1711 }, { "epoch": 2.61022120518688, "grad_norm": 0.22355068492947913, "learning_rate": 7.15497737556561e-06, "loss": 0.2759, "step": 1712 }, { "epoch": 2.6117467581998475, "grad_norm": 0.21092141529081365, "learning_rate": 7.126696832579185e-06, "loss": 0.2853, "step": 1713 }, { "epoch": 2.613272311212815, "grad_norm": 0.20673471727275003, "learning_rate": 7.09841628959276e-06, "loss": 0.2926, "step": 1714 }, { "epoch": 2.614797864225782, "grad_norm": 0.20099040577035818, "learning_rate": 7.070135746606335e-06, "loss": 0.2792, "step": 1715 }, { "epoch": 2.616323417238749, "grad_norm": 0.21016267419532775, "learning_rate": 7.04185520361991e-06, "loss": 0.2914, "step": 1716 }, { "epoch": 2.6178489702517163, "grad_norm": 0.23143100905120167, "learning_rate": 7.013574660633485e-06, "loss": 0.2769, "step": 1717 }, { "epoch": 2.6193745232646837, "grad_norm": 0.21786360833534885, "learning_rate": 6.985294117647059e-06, "loss": 0.2779, "step": 1718 }, { "epoch": 2.6209000762776506, "grad_norm": 0.2229679157915102, "learning_rate": 6.957013574660634e-06, "loss": 0.2814, "step": 1719 }, { "epoch": 2.6224256292906176, "grad_norm": 0.20390343390812546, "learning_rate": 6.928733031674209e-06, "loss": 0.2792, "step": 1720 }, { "epoch": 2.623951182303585, "grad_norm": 0.20967149496184742, "learning_rate": 6.900452488687784e-06, "loss": 0.2909, "step": 1721 }, { "epoch": 2.6254767353165525, "grad_norm": 0.19628481827264382, "learning_rate": 6.872171945701359e-06, "loss": 0.2725, "step": 1722 }, { "epoch": 2.6270022883295194, "grad_norm": 0.21661036525095748, "learning_rate": 6.843891402714933e-06, "loss": 0.2805, "step": 1723 }, { "epoch": 2.6285278413424864, "grad_norm": 0.21521153343143212, "learning_rate": 6.815610859728507e-06, "loss": 0.2846, "step": 1724 }, { "epoch": 2.630053394355454, "grad_norm": 0.2139668026602555, "learning_rate": 6.787330316742081e-06, "loss": 0.2757, "step": 1725 }, { "epoch": 2.6315789473684212, "grad_norm": 0.2327458221252486, "learning_rate": 6.759049773755656e-06, "loss": 0.2825, "step": 1726 }, { "epoch": 2.633104500381388, "grad_norm": 0.20890484373765802, "learning_rate": 6.730769230769231e-06, "loss": 0.2751, "step": 1727 }, { "epoch": 2.634630053394355, "grad_norm": 0.1994683222919695, "learning_rate": 6.702488687782806e-06, "loss": 0.2718, "step": 1728 }, { "epoch": 2.6361556064073226, "grad_norm": 0.2034034191979968, "learning_rate": 6.674208144796381e-06, "loss": 0.2637, "step": 1729 }, { "epoch": 2.63768115942029, "grad_norm": 0.2050317125741684, "learning_rate": 6.645927601809955e-06, "loss": 0.281, "step": 1730 }, { "epoch": 2.639206712433257, "grad_norm": 0.1999782976399038, "learning_rate": 6.61764705882353e-06, "loss": 0.2631, "step": 1731 }, { "epoch": 2.6407322654462244, "grad_norm": 0.21164158862169224, "learning_rate": 6.589366515837105e-06, "loss": 0.2804, "step": 1732 }, { "epoch": 2.6422578184591914, "grad_norm": 0.24852157871733738, "learning_rate": 6.5610859728506795e-06, "loss": 0.2897, "step": 1733 }, { "epoch": 2.643783371472159, "grad_norm": 0.21897965835871702, "learning_rate": 6.5328054298642545e-06, "loss": 0.287, "step": 1734 }, { "epoch": 2.645308924485126, "grad_norm": 0.21736820126057554, "learning_rate": 6.5045248868778285e-06, "loss": 0.2707, "step": 1735 }, { "epoch": 2.646834477498093, "grad_norm": 0.1912025493643382, "learning_rate": 6.4762443438914035e-06, "loss": 0.2786, "step": 1736 }, { "epoch": 2.64836003051106, "grad_norm": 0.19901860534600127, "learning_rate": 6.447963800904978e-06, "loss": 0.2744, "step": 1737 }, { "epoch": 2.6498855835240276, "grad_norm": 0.20176455217872563, "learning_rate": 6.419683257918552e-06, "loss": 0.2711, "step": 1738 }, { "epoch": 2.6514111365369946, "grad_norm": 0.24054707315385543, "learning_rate": 6.3914027149321265e-06, "loss": 0.299, "step": 1739 }, { "epoch": 2.652936689549962, "grad_norm": 0.21986705445548135, "learning_rate": 6.3631221719457015e-06, "loss": 0.2656, "step": 1740 }, { "epoch": 2.654462242562929, "grad_norm": 0.19892771899607123, "learning_rate": 6.334841628959276e-06, "loss": 0.28, "step": 1741 }, { "epoch": 2.6559877955758964, "grad_norm": 0.21121445656433221, "learning_rate": 6.3065610859728505e-06, "loss": 0.277, "step": 1742 }, { "epoch": 2.6575133485888633, "grad_norm": 0.20031364058790926, "learning_rate": 6.278280542986425e-06, "loss": 0.2824, "step": 1743 }, { "epoch": 2.6590389016018308, "grad_norm": 0.20244472580715342, "learning_rate": 6.25e-06, "loss": 0.2586, "step": 1744 }, { "epoch": 2.6605644546147977, "grad_norm": 0.21396928891335462, "learning_rate": 6.221719457013575e-06, "loss": 0.2787, "step": 1745 }, { "epoch": 2.662090007627765, "grad_norm": 0.21284201375319423, "learning_rate": 6.19343891402715e-06, "loss": 0.277, "step": 1746 }, { "epoch": 2.663615560640732, "grad_norm": 0.21779542671976118, "learning_rate": 6.165158371040724e-06, "loss": 0.2795, "step": 1747 }, { "epoch": 2.6651411136536995, "grad_norm": 0.21124000260068296, "learning_rate": 6.136877828054298e-06, "loss": 0.276, "step": 1748 }, { "epoch": 2.6666666666666665, "grad_norm": 0.20105485954585156, "learning_rate": 6.108597285067873e-06, "loss": 0.273, "step": 1749 }, { "epoch": 2.668192219679634, "grad_norm": 0.20621530394473295, "learning_rate": 6.080316742081448e-06, "loss": 0.284, "step": 1750 }, { "epoch": 2.669717772692601, "grad_norm": 0.22051830402981962, "learning_rate": 6.052036199095023e-06, "loss": 0.2814, "step": 1751 }, { "epoch": 2.6712433257055683, "grad_norm": 0.2086144331319767, "learning_rate": 6.023755656108598e-06, "loss": 0.2909, "step": 1752 }, { "epoch": 2.6727688787185357, "grad_norm": 0.20605675034035678, "learning_rate": 5.995475113122172e-06, "loss": 0.2752, "step": 1753 }, { "epoch": 2.6742944317315027, "grad_norm": 0.1977731783537534, "learning_rate": 5.967194570135747e-06, "loss": 0.2783, "step": 1754 }, { "epoch": 2.6758199847444697, "grad_norm": 0.20352103449515424, "learning_rate": 5.938914027149321e-06, "loss": 0.2718, "step": 1755 }, { "epoch": 2.677345537757437, "grad_norm": 0.20208025640281874, "learning_rate": 5.910633484162896e-06, "loss": 0.2795, "step": 1756 }, { "epoch": 2.6788710907704045, "grad_norm": 0.20272404929779814, "learning_rate": 5.882352941176471e-06, "loss": 0.2693, "step": 1757 }, { "epoch": 2.6803966437833715, "grad_norm": 0.19413850642655334, "learning_rate": 5.854072398190046e-06, "loss": 0.2733, "step": 1758 }, { "epoch": 2.6819221967963385, "grad_norm": 0.19295789771841734, "learning_rate": 5.82579185520362e-06, "loss": 0.2687, "step": 1759 }, { "epoch": 2.683447749809306, "grad_norm": 0.23676347712607398, "learning_rate": 5.797511312217195e-06, "loss": 0.2802, "step": 1760 }, { "epoch": 2.6849733028222733, "grad_norm": 0.19770484918887604, "learning_rate": 5.76923076923077e-06, "loss": 0.2774, "step": 1761 }, { "epoch": 2.6864988558352403, "grad_norm": 0.21377705771609457, "learning_rate": 5.740950226244344e-06, "loss": 0.2748, "step": 1762 }, { "epoch": 2.6880244088482073, "grad_norm": 0.19794500014407843, "learning_rate": 5.712669683257919e-06, "loss": 0.283, "step": 1763 }, { "epoch": 2.6895499618611747, "grad_norm": 0.20219605961960982, "learning_rate": 5.684389140271494e-06, "loss": 0.2755, "step": 1764 }, { "epoch": 2.691075514874142, "grad_norm": 0.20666656531508049, "learning_rate": 5.656108597285068e-06, "loss": 0.29, "step": 1765 }, { "epoch": 2.692601067887109, "grad_norm": 0.21127499701209673, "learning_rate": 5.627828054298643e-06, "loss": 0.2927, "step": 1766 }, { "epoch": 2.694126620900076, "grad_norm": 0.2441426560600548, "learning_rate": 5.599547511312218e-06, "loss": 0.2868, "step": 1767 }, { "epoch": 2.6956521739130435, "grad_norm": 0.19631125560528798, "learning_rate": 5.5712669683257925e-06, "loss": 0.2804, "step": 1768 }, { "epoch": 2.697177726926011, "grad_norm": 0.19081090057503455, "learning_rate": 5.542986425339367e-06, "loss": 0.266, "step": 1769 }, { "epoch": 2.698703279938978, "grad_norm": 0.2067040957087181, "learning_rate": 5.5147058823529415e-06, "loss": 0.2731, "step": 1770 }, { "epoch": 2.700228832951945, "grad_norm": 0.20534860880961198, "learning_rate": 5.486425339366516e-06, "loss": 0.2753, "step": 1771 }, { "epoch": 2.7017543859649122, "grad_norm": 0.20954705600567422, "learning_rate": 5.4581447963800905e-06, "loss": 0.3026, "step": 1772 }, { "epoch": 2.7032799389778797, "grad_norm": 0.19711572913576397, "learning_rate": 5.4298642533936655e-06, "loss": 0.2706, "step": 1773 }, { "epoch": 2.7048054919908466, "grad_norm": 0.2036168269402528, "learning_rate": 5.40158371040724e-06, "loss": 0.2838, "step": 1774 }, { "epoch": 2.7063310450038136, "grad_norm": 0.19954787788547765, "learning_rate": 5.373303167420815e-06, "loss": 0.2841, "step": 1775 }, { "epoch": 2.707856598016781, "grad_norm": 0.1949116029736815, "learning_rate": 5.345022624434389e-06, "loss": 0.284, "step": 1776 }, { "epoch": 2.7093821510297484, "grad_norm": 0.19920957831348507, "learning_rate": 5.316742081447964e-06, "loss": 0.2673, "step": 1777 }, { "epoch": 2.7109077040427154, "grad_norm": 0.20565607931703614, "learning_rate": 5.288461538461538e-06, "loss": 0.282, "step": 1778 }, { "epoch": 2.712433257055683, "grad_norm": 0.21848622705535342, "learning_rate": 5.260180995475113e-06, "loss": 0.272, "step": 1779 }, { "epoch": 2.71395881006865, "grad_norm": 0.20498999223792772, "learning_rate": 5.231900452488688e-06, "loss": 0.2901, "step": 1780 }, { "epoch": 2.7154843630816172, "grad_norm": 0.21343915036091268, "learning_rate": 5.203619909502263e-06, "loss": 0.2859, "step": 1781 }, { "epoch": 2.717009916094584, "grad_norm": 0.20032593652978248, "learning_rate": 5.175339366515837e-06, "loss": 0.2823, "step": 1782 }, { "epoch": 2.7185354691075516, "grad_norm": 0.18776500628622367, "learning_rate": 5.147058823529412e-06, "loss": 0.265, "step": 1783 }, { "epoch": 2.7200610221205186, "grad_norm": 0.19527669609462595, "learning_rate": 5.118778280542986e-06, "loss": 0.2661, "step": 1784 }, { "epoch": 2.721586575133486, "grad_norm": 0.21890095995853462, "learning_rate": 5.090497737556561e-06, "loss": 0.2904, "step": 1785 }, { "epoch": 2.723112128146453, "grad_norm": 0.2106203942255612, "learning_rate": 5.062217194570136e-06, "loss": 0.2843, "step": 1786 }, { "epoch": 2.7246376811594204, "grad_norm": 0.2017432888003113, "learning_rate": 5.033936651583711e-06, "loss": 0.2775, "step": 1787 }, { "epoch": 2.7261632341723874, "grad_norm": 0.20885887222065194, "learning_rate": 5.005656108597286e-06, "loss": 0.27, "step": 1788 }, { "epoch": 2.727688787185355, "grad_norm": 0.19196065604079432, "learning_rate": 4.97737556561086e-06, "loss": 0.2698, "step": 1789 }, { "epoch": 2.7292143401983218, "grad_norm": 0.20277434922419085, "learning_rate": 4.949095022624434e-06, "loss": 0.2857, "step": 1790 }, { "epoch": 2.730739893211289, "grad_norm": 0.2151816697142376, "learning_rate": 4.920814479638009e-06, "loss": 0.2812, "step": 1791 }, { "epoch": 2.732265446224256, "grad_norm": 0.21466632086032003, "learning_rate": 4.892533936651584e-06, "loss": 0.2714, "step": 1792 }, { "epoch": 2.7337909992372236, "grad_norm": 0.1943615103819248, "learning_rate": 4.864253393665159e-06, "loss": 0.2815, "step": 1793 }, { "epoch": 2.7353165522501905, "grad_norm": 0.20893471290047624, "learning_rate": 4.835972850678734e-06, "loss": 0.2893, "step": 1794 }, { "epoch": 2.736842105263158, "grad_norm": 0.20036964077176836, "learning_rate": 4.807692307692308e-06, "loss": 0.2665, "step": 1795 }, { "epoch": 2.738367658276125, "grad_norm": 0.20625643555087492, "learning_rate": 4.779411764705882e-06, "loss": 0.2761, "step": 1796 }, { "epoch": 2.7398932112890924, "grad_norm": 0.21184530090069853, "learning_rate": 4.751131221719457e-06, "loss": 0.2761, "step": 1797 }, { "epoch": 2.7414187643020593, "grad_norm": 0.1955398354366089, "learning_rate": 4.722850678733032e-06, "loss": 0.2775, "step": 1798 }, { "epoch": 2.7429443173150267, "grad_norm": 0.21508383344401844, "learning_rate": 4.694570135746607e-06, "loss": 0.289, "step": 1799 }, { "epoch": 2.744469870327994, "grad_norm": 0.20689497983985197, "learning_rate": 4.666289592760182e-06, "loss": 0.2787, "step": 1800 }, { "epoch": 2.745995423340961, "grad_norm": 0.20785169814667376, "learning_rate": 4.638009049773756e-06, "loss": 0.2819, "step": 1801 }, { "epoch": 2.747520976353928, "grad_norm": 0.1979271282874828, "learning_rate": 4.609728506787331e-06, "loss": 0.2744, "step": 1802 }, { "epoch": 2.7490465293668955, "grad_norm": 0.19882276266039367, "learning_rate": 4.581447963800905e-06, "loss": 0.2789, "step": 1803 }, { "epoch": 2.750572082379863, "grad_norm": 0.20548302680642835, "learning_rate": 4.55316742081448e-06, "loss": 0.2902, "step": 1804 }, { "epoch": 2.75209763539283, "grad_norm": 0.19985130867303336, "learning_rate": 4.5248868778280546e-06, "loss": 0.2771, "step": 1805 }, { "epoch": 2.753623188405797, "grad_norm": 0.20822754426365878, "learning_rate": 4.4966063348416295e-06, "loss": 0.2733, "step": 1806 }, { "epoch": 2.7551487414187643, "grad_norm": 0.19616732071791923, "learning_rate": 4.4683257918552036e-06, "loss": 0.2851, "step": 1807 }, { "epoch": 2.7566742944317317, "grad_norm": 0.22306048646972088, "learning_rate": 4.4400452488687785e-06, "loss": 0.2803, "step": 1808 }, { "epoch": 2.7581998474446987, "grad_norm": 0.19641782709321357, "learning_rate": 4.411764705882353e-06, "loss": 0.2897, "step": 1809 }, { "epoch": 2.7597254004576657, "grad_norm": 0.19312742071710312, "learning_rate": 4.3834841628959275e-06, "loss": 0.2766, "step": 1810 }, { "epoch": 2.761250953470633, "grad_norm": 0.2321259470246117, "learning_rate": 4.355203619909502e-06, "loss": 0.2805, "step": 1811 }, { "epoch": 2.7627765064836005, "grad_norm": 0.20317270322417355, "learning_rate": 4.326923076923077e-06, "loss": 0.2735, "step": 1812 }, { "epoch": 2.7643020594965675, "grad_norm": 0.20024205534952347, "learning_rate": 4.298642533936651e-06, "loss": 0.2781, "step": 1813 }, { "epoch": 2.7658276125095345, "grad_norm": 0.20522565800125614, "learning_rate": 4.270361990950226e-06, "loss": 0.2869, "step": 1814 }, { "epoch": 2.767353165522502, "grad_norm": 0.20536297366322828, "learning_rate": 4.242081447963801e-06, "loss": 0.2814, "step": 1815 }, { "epoch": 2.7688787185354693, "grad_norm": 0.21131541574505874, "learning_rate": 4.213800904977376e-06, "loss": 0.2794, "step": 1816 }, { "epoch": 2.7704042715484363, "grad_norm": 0.22411841940183003, "learning_rate": 4.18552036199095e-06, "loss": 0.2806, "step": 1817 }, { "epoch": 2.7719298245614032, "grad_norm": 0.20861357244680745, "learning_rate": 4.157239819004525e-06, "loss": 0.2683, "step": 1818 }, { "epoch": 2.7734553775743707, "grad_norm": 0.20545064592099158, "learning_rate": 4.128959276018099e-06, "loss": 0.281, "step": 1819 }, { "epoch": 2.774980930587338, "grad_norm": 0.21735057418603912, "learning_rate": 4.100678733031674e-06, "loss": 0.278, "step": 1820 }, { "epoch": 2.776506483600305, "grad_norm": 0.21360434657677974, "learning_rate": 4.072398190045249e-06, "loss": 0.2896, "step": 1821 }, { "epoch": 2.7780320366132725, "grad_norm": 0.19737497694054937, "learning_rate": 4.044117647058824e-06, "loss": 0.2929, "step": 1822 }, { "epoch": 2.7795575896262394, "grad_norm": 0.19957913179273273, "learning_rate": 4.015837104072399e-06, "loss": 0.2798, "step": 1823 }, { "epoch": 2.781083142639207, "grad_norm": 0.2186092489833436, "learning_rate": 3.987556561085973e-06, "loss": 0.2861, "step": 1824 }, { "epoch": 2.782608695652174, "grad_norm": 0.19333714166469484, "learning_rate": 3.959276018099547e-06, "loss": 0.2833, "step": 1825 }, { "epoch": 2.7841342486651413, "grad_norm": 0.19827554892347982, "learning_rate": 3.930995475113122e-06, "loss": 0.2754, "step": 1826 }, { "epoch": 2.7856598016781082, "grad_norm": 0.1948186909919549, "learning_rate": 3.902714932126697e-06, "loss": 0.2778, "step": 1827 }, { "epoch": 2.7871853546910756, "grad_norm": 0.1963431742920227, "learning_rate": 3.874434389140272e-06, "loss": 0.2875, "step": 1828 }, { "epoch": 2.7887109077040426, "grad_norm": 0.20044823557973415, "learning_rate": 3.846153846153847e-06, "loss": 0.2727, "step": 1829 }, { "epoch": 2.79023646071701, "grad_norm": 0.20460548184192576, "learning_rate": 3.817873303167422e-06, "loss": 0.2764, "step": 1830 }, { "epoch": 2.791762013729977, "grad_norm": 0.20235174380669996, "learning_rate": 3.7895927601809954e-06, "loss": 0.2774, "step": 1831 }, { "epoch": 2.7932875667429444, "grad_norm": 0.20310593222619544, "learning_rate": 3.7613122171945703e-06, "loss": 0.2813, "step": 1832 }, { "epoch": 2.7948131197559114, "grad_norm": 0.22541172087472924, "learning_rate": 3.733031674208145e-06, "loss": 0.2669, "step": 1833 }, { "epoch": 2.796338672768879, "grad_norm": 0.2000614354387634, "learning_rate": 3.7047511312217197e-06, "loss": 0.2795, "step": 1834 }, { "epoch": 2.797864225781846, "grad_norm": 0.1992544511740729, "learning_rate": 3.6764705882352942e-06, "loss": 0.2773, "step": 1835 }, { "epoch": 2.799389778794813, "grad_norm": 0.19803639254881675, "learning_rate": 3.648190045248869e-06, "loss": 0.279, "step": 1836 }, { "epoch": 2.80091533180778, "grad_norm": 0.19636753157558323, "learning_rate": 3.619909502262444e-06, "loss": 0.2807, "step": 1837 }, { "epoch": 2.8024408848207476, "grad_norm": 0.19498506519840914, "learning_rate": 3.591628959276018e-06, "loss": 0.2764, "step": 1838 }, { "epoch": 2.8039664378337146, "grad_norm": 0.20654624641305525, "learning_rate": 3.5633484162895926e-06, "loss": 0.2949, "step": 1839 }, { "epoch": 2.805491990846682, "grad_norm": 0.1971461849214778, "learning_rate": 3.5350678733031676e-06, "loss": 0.2625, "step": 1840 }, { "epoch": 2.807017543859649, "grad_norm": 0.21146377469214334, "learning_rate": 3.5067873303167425e-06, "loss": 0.2995, "step": 1841 }, { "epoch": 2.8085430968726164, "grad_norm": 0.20659701037676945, "learning_rate": 3.478506787330317e-06, "loss": 0.2823, "step": 1842 }, { "epoch": 2.8100686498855834, "grad_norm": 0.21214570264708388, "learning_rate": 3.450226244343892e-06, "loss": 0.2749, "step": 1843 }, { "epoch": 2.8115942028985508, "grad_norm": 0.19359310004236657, "learning_rate": 3.4219457013574664e-06, "loss": 0.281, "step": 1844 }, { "epoch": 2.8131197559115177, "grad_norm": 0.18519796076266437, "learning_rate": 3.3936651583710405e-06, "loss": 0.2689, "step": 1845 }, { "epoch": 2.814645308924485, "grad_norm": 0.19858870771962905, "learning_rate": 3.3653846153846154e-06, "loss": 0.2752, "step": 1846 }, { "epoch": 2.8161708619374526, "grad_norm": 0.21008264917074906, "learning_rate": 3.3371040723981903e-06, "loss": 0.2804, "step": 1847 }, { "epoch": 2.8176964149504196, "grad_norm": 0.20451856233954666, "learning_rate": 3.308823529411765e-06, "loss": 0.2786, "step": 1848 }, { "epoch": 2.8192219679633865, "grad_norm": 0.20616549209637472, "learning_rate": 3.2805429864253398e-06, "loss": 0.2932, "step": 1849 }, { "epoch": 2.820747520976354, "grad_norm": 0.2116980566350686, "learning_rate": 3.2522624434389143e-06, "loss": 0.2821, "step": 1850 }, { "epoch": 2.8222730739893214, "grad_norm": 0.20816702198103063, "learning_rate": 3.223981900452489e-06, "loss": 0.2893, "step": 1851 }, { "epoch": 2.8237986270022883, "grad_norm": 0.19065738578122451, "learning_rate": 3.1957013574660633e-06, "loss": 0.2788, "step": 1852 }, { "epoch": 2.8253241800152553, "grad_norm": 0.18949811867696587, "learning_rate": 3.167420814479638e-06, "loss": 0.2697, "step": 1853 }, { "epoch": 2.8268497330282227, "grad_norm": 0.18989299263981352, "learning_rate": 3.1391402714932127e-06, "loss": 0.2836, "step": 1854 }, { "epoch": 2.82837528604119, "grad_norm": 0.1964528239410692, "learning_rate": 3.1108597285067876e-06, "loss": 0.2806, "step": 1855 }, { "epoch": 2.829900839054157, "grad_norm": 0.19932100013150664, "learning_rate": 3.082579185520362e-06, "loss": 0.2737, "step": 1856 }, { "epoch": 2.831426392067124, "grad_norm": 0.2093626042896617, "learning_rate": 3.0542986425339366e-06, "loss": 0.2775, "step": 1857 }, { "epoch": 2.8329519450800915, "grad_norm": 0.20233779882901465, "learning_rate": 3.0260180995475115e-06, "loss": 0.2896, "step": 1858 }, { "epoch": 2.834477498093059, "grad_norm": 0.1951692291835088, "learning_rate": 2.997737556561086e-06, "loss": 0.292, "step": 1859 }, { "epoch": 2.836003051106026, "grad_norm": 0.21068939044919083, "learning_rate": 2.9694570135746605e-06, "loss": 0.2814, "step": 1860 }, { "epoch": 2.837528604118993, "grad_norm": 0.2018118664623138, "learning_rate": 2.9411764705882355e-06, "loss": 0.2726, "step": 1861 }, { "epoch": 2.8390541571319603, "grad_norm": 0.19945355818568114, "learning_rate": 2.91289592760181e-06, "loss": 0.2707, "step": 1862 }, { "epoch": 2.8405797101449277, "grad_norm": 0.20234504640790094, "learning_rate": 2.884615384615385e-06, "loss": 0.2741, "step": 1863 }, { "epoch": 2.8421052631578947, "grad_norm": 0.19023388771246863, "learning_rate": 2.8563348416289594e-06, "loss": 0.2673, "step": 1864 }, { "epoch": 2.8436308161708617, "grad_norm": 0.19598839189926553, "learning_rate": 2.828054298642534e-06, "loss": 0.2599, "step": 1865 }, { "epoch": 2.845156369183829, "grad_norm": 0.18837179337512258, "learning_rate": 2.799773755656109e-06, "loss": 0.2741, "step": 1866 }, { "epoch": 2.8466819221967965, "grad_norm": 0.2036395493918796, "learning_rate": 2.7714932126696833e-06, "loss": 0.2829, "step": 1867 }, { "epoch": 2.8482074752097635, "grad_norm": 0.19628394887848483, "learning_rate": 2.743212669683258e-06, "loss": 0.2855, "step": 1868 }, { "epoch": 2.849733028222731, "grad_norm": 0.18913545161713, "learning_rate": 2.7149321266968327e-06, "loss": 0.282, "step": 1869 }, { "epoch": 2.851258581235698, "grad_norm": 0.1906042921602759, "learning_rate": 2.6866515837104077e-06, "loss": 0.2685, "step": 1870 }, { "epoch": 2.8527841342486653, "grad_norm": 0.19252935494538206, "learning_rate": 2.658371040723982e-06, "loss": 0.2785, "step": 1871 }, { "epoch": 2.8543096872616323, "grad_norm": 0.19166974258408342, "learning_rate": 2.6300904977375567e-06, "loss": 0.2814, "step": 1872 }, { "epoch": 2.8558352402745997, "grad_norm": 0.22759252072978411, "learning_rate": 2.6018099547511316e-06, "loss": 0.2997, "step": 1873 }, { "epoch": 2.8573607932875666, "grad_norm": 0.1945736972351939, "learning_rate": 2.573529411764706e-06, "loss": 0.2804, "step": 1874 }, { "epoch": 2.858886346300534, "grad_norm": 0.2018793461724642, "learning_rate": 2.5452488687782806e-06, "loss": 0.29, "step": 1875 }, { "epoch": 2.860411899313501, "grad_norm": 0.20095273489353882, "learning_rate": 2.5169683257918555e-06, "loss": 0.2664, "step": 1876 }, { "epoch": 2.8619374523264685, "grad_norm": 0.1994651067699733, "learning_rate": 2.48868778280543e-06, "loss": 0.2803, "step": 1877 }, { "epoch": 2.8634630053394354, "grad_norm": 0.18984310892997858, "learning_rate": 2.4604072398190045e-06, "loss": 0.2871, "step": 1878 }, { "epoch": 2.864988558352403, "grad_norm": 0.19331812252640546, "learning_rate": 2.4321266968325794e-06, "loss": 0.2895, "step": 1879 }, { "epoch": 2.86651411136537, "grad_norm": 0.18293576564746528, "learning_rate": 2.403846153846154e-06, "loss": 0.2644, "step": 1880 }, { "epoch": 2.8680396643783372, "grad_norm": 0.19630191679697984, "learning_rate": 2.3755656108597284e-06, "loss": 0.2832, "step": 1881 }, { "epoch": 2.869565217391304, "grad_norm": 0.18900795766685907, "learning_rate": 2.3472850678733034e-06, "loss": 0.2803, "step": 1882 }, { "epoch": 2.8710907704042716, "grad_norm": 0.20624229024549387, "learning_rate": 2.319004524886878e-06, "loss": 0.2855, "step": 1883 }, { "epoch": 2.8726163234172386, "grad_norm": 0.2019449530274578, "learning_rate": 2.2907239819004524e-06, "loss": 0.279, "step": 1884 }, { "epoch": 2.874141876430206, "grad_norm": 0.19364216376667498, "learning_rate": 2.2624434389140273e-06, "loss": 0.2801, "step": 1885 }, { "epoch": 2.875667429443173, "grad_norm": 0.223807311508233, "learning_rate": 2.2341628959276018e-06, "loss": 0.2922, "step": 1886 }, { "epoch": 2.8771929824561404, "grad_norm": 0.2122401107328281, "learning_rate": 2.2058823529411767e-06, "loss": 0.2631, "step": 1887 }, { "epoch": 2.8787185354691074, "grad_norm": 0.20773118167770527, "learning_rate": 2.177601809954751e-06, "loss": 0.262, "step": 1888 }, { "epoch": 2.880244088482075, "grad_norm": 0.18705052436659148, "learning_rate": 2.1493212669683257e-06, "loss": 0.2649, "step": 1889 }, { "epoch": 2.8817696414950422, "grad_norm": 0.21044509678817772, "learning_rate": 2.1210407239819006e-06, "loss": 0.2819, "step": 1890 }, { "epoch": 2.883295194508009, "grad_norm": 0.18948494412919942, "learning_rate": 2.092760180995475e-06, "loss": 0.271, "step": 1891 }, { "epoch": 2.884820747520976, "grad_norm": 0.1949727541230729, "learning_rate": 2.0644796380090496e-06, "loss": 0.2768, "step": 1892 }, { "epoch": 2.8863463005339436, "grad_norm": 0.1927693044091142, "learning_rate": 2.0361990950226245e-06, "loss": 0.2828, "step": 1893 }, { "epoch": 2.887871853546911, "grad_norm": 0.20765184811621995, "learning_rate": 2.0079185520361995e-06, "loss": 0.2793, "step": 1894 }, { "epoch": 2.889397406559878, "grad_norm": 0.18265763689678813, "learning_rate": 1.9796380090497735e-06, "loss": 0.2658, "step": 1895 }, { "epoch": 2.890922959572845, "grad_norm": 0.18116491253389014, "learning_rate": 1.9513574660633485e-06, "loss": 0.2814, "step": 1896 }, { "epoch": 2.8924485125858124, "grad_norm": 0.21116579147448233, "learning_rate": 1.9230769230769234e-06, "loss": 0.2742, "step": 1897 }, { "epoch": 2.89397406559878, "grad_norm": 0.21521899434020572, "learning_rate": 1.8947963800904977e-06, "loss": 0.2832, "step": 1898 }, { "epoch": 2.8954996186117468, "grad_norm": 0.1943135663213224, "learning_rate": 1.8665158371040724e-06, "loss": 0.2823, "step": 1899 }, { "epoch": 2.8970251716247137, "grad_norm": 0.19340213696014497, "learning_rate": 1.8382352941176471e-06, "loss": 0.2749, "step": 1900 }, { "epoch": 2.898550724637681, "grad_norm": 0.18659921583401218, "learning_rate": 1.809954751131222e-06, "loss": 0.2807, "step": 1901 }, { "epoch": 2.9000762776506486, "grad_norm": 0.2182342259522023, "learning_rate": 1.7816742081447963e-06, "loss": 0.3005, "step": 1902 }, { "epoch": 2.9016018306636155, "grad_norm": 0.19990728761258314, "learning_rate": 1.7533936651583712e-06, "loss": 0.2937, "step": 1903 }, { "epoch": 2.9031273836765825, "grad_norm": 0.19883155333190392, "learning_rate": 1.725113122171946e-06, "loss": 0.275, "step": 1904 }, { "epoch": 2.90465293668955, "grad_norm": 0.20135356259013146, "learning_rate": 1.6968325791855202e-06, "loss": 0.2959, "step": 1905 }, { "epoch": 2.9061784897025174, "grad_norm": 0.22161599309483826, "learning_rate": 1.6685520361990952e-06, "loss": 0.2859, "step": 1906 }, { "epoch": 2.9077040427154843, "grad_norm": 0.20903408145215524, "learning_rate": 1.6402714932126699e-06, "loss": 0.2838, "step": 1907 }, { "epoch": 2.9092295957284513, "grad_norm": 0.19109527492114803, "learning_rate": 1.6119909502262446e-06, "loss": 0.2797, "step": 1908 }, { "epoch": 2.9107551487414187, "grad_norm": 0.19522382385657336, "learning_rate": 1.583710407239819e-06, "loss": 0.2882, "step": 1909 }, { "epoch": 2.912280701754386, "grad_norm": 0.20696188551210895, "learning_rate": 1.5554298642533938e-06, "loss": 0.288, "step": 1910 }, { "epoch": 2.913806254767353, "grad_norm": 0.20087144799240744, "learning_rate": 1.5271493212669683e-06, "loss": 0.2722, "step": 1911 }, { "epoch": 2.9153318077803205, "grad_norm": 0.18227605440503353, "learning_rate": 1.498868778280543e-06, "loss": 0.27, "step": 1912 }, { "epoch": 2.9168573607932875, "grad_norm": 0.1961463211728297, "learning_rate": 1.4705882352941177e-06, "loss": 0.2824, "step": 1913 }, { "epoch": 2.918382913806255, "grad_norm": 0.19094363608689296, "learning_rate": 1.4423076923076924e-06, "loss": 0.2694, "step": 1914 }, { "epoch": 2.919908466819222, "grad_norm": 0.18952327325238438, "learning_rate": 1.414027149321267e-06, "loss": 0.2746, "step": 1915 }, { "epoch": 2.9214340198321893, "grad_norm": 0.20808705461866878, "learning_rate": 1.3857466063348417e-06, "loss": 0.305, "step": 1916 }, { "epoch": 2.9229595728451563, "grad_norm": 0.22205218137643415, "learning_rate": 1.3574660633484164e-06, "loss": 0.2894, "step": 1917 }, { "epoch": 2.9244851258581237, "grad_norm": 0.19224349475036417, "learning_rate": 1.329185520361991e-06, "loss": 0.2604, "step": 1918 }, { "epoch": 2.9260106788710907, "grad_norm": 0.18978044048329157, "learning_rate": 1.3009049773755658e-06, "loss": 0.2711, "step": 1919 }, { "epoch": 2.927536231884058, "grad_norm": 0.2080704679502206, "learning_rate": 1.2726244343891403e-06, "loss": 0.2989, "step": 1920 }, { "epoch": 2.929061784897025, "grad_norm": 0.2038606670520458, "learning_rate": 1.244343891402715e-06, "loss": 0.3001, "step": 1921 }, { "epoch": 2.9305873379099925, "grad_norm": 0.18078769321816562, "learning_rate": 1.2160633484162897e-06, "loss": 0.2759, "step": 1922 }, { "epoch": 2.9321128909229595, "grad_norm": 0.18032720403977873, "learning_rate": 1.1877828054298642e-06, "loss": 0.2673, "step": 1923 }, { "epoch": 2.933638443935927, "grad_norm": 0.18681485169569836, "learning_rate": 1.159502262443439e-06, "loss": 0.2588, "step": 1924 }, { "epoch": 2.935163996948894, "grad_norm": 0.19958849118895866, "learning_rate": 1.1312217194570136e-06, "loss": 0.2924, "step": 1925 }, { "epoch": 2.9366895499618613, "grad_norm": 0.17873409968545653, "learning_rate": 1.1029411764705884e-06, "loss": 0.2713, "step": 1926 }, { "epoch": 2.9382151029748282, "grad_norm": 0.19570694964107407, "learning_rate": 1.0746606334841629e-06, "loss": 0.2641, "step": 1927 }, { "epoch": 2.9397406559877957, "grad_norm": 0.20170096527048184, "learning_rate": 1.0463800904977376e-06, "loss": 0.2684, "step": 1928 }, { "epoch": 2.9412662090007626, "grad_norm": 0.1974087436606526, "learning_rate": 1.0180995475113123e-06, "loss": 0.2705, "step": 1929 }, { "epoch": 2.94279176201373, "grad_norm": 0.19713720381330507, "learning_rate": 9.898190045248868e-07, "loss": 0.269, "step": 1930 }, { "epoch": 2.944317315026697, "grad_norm": 0.18582001552980776, "learning_rate": 9.615384615384617e-07, "loss": 0.284, "step": 1931 }, { "epoch": 2.9458428680396644, "grad_norm": 0.18789847352784972, "learning_rate": 9.332579185520362e-07, "loss": 0.2687, "step": 1932 }, { "epoch": 2.9473684210526314, "grad_norm": 0.1884601749091908, "learning_rate": 9.04977375565611e-07, "loss": 0.2691, "step": 1933 }, { "epoch": 2.948893974065599, "grad_norm": 0.18916379904445393, "learning_rate": 8.766968325791856e-07, "loss": 0.2636, "step": 1934 }, { "epoch": 2.950419527078566, "grad_norm": 0.1897947432322617, "learning_rate": 8.484162895927601e-07, "loss": 0.2773, "step": 1935 }, { "epoch": 2.9519450800915332, "grad_norm": 0.19297181251691972, "learning_rate": 8.201357466063349e-07, "loss": 0.2783, "step": 1936 }, { "epoch": 2.9534706331045006, "grad_norm": 0.186108626952009, "learning_rate": 7.918552036199095e-07, "loss": 0.2677, "step": 1937 }, { "epoch": 2.9549961861174676, "grad_norm": 0.19229898186314018, "learning_rate": 7.635746606334842e-07, "loss": 0.2779, "step": 1938 }, { "epoch": 2.9565217391304346, "grad_norm": 0.19130038483414738, "learning_rate": 7.352941176470589e-07, "loss": 0.2775, "step": 1939 }, { "epoch": 2.958047292143402, "grad_norm": 0.1884186597348547, "learning_rate": 7.070135746606335e-07, "loss": 0.2738, "step": 1940 }, { "epoch": 2.9595728451563694, "grad_norm": 0.2028281024366674, "learning_rate": 6.787330316742082e-07, "loss": 0.2817, "step": 1941 }, { "epoch": 2.9610983981693364, "grad_norm": 0.19973700574613962, "learning_rate": 6.504524886877829e-07, "loss": 0.2905, "step": 1942 }, { "epoch": 2.9626239511823034, "grad_norm": 0.1833797630025949, "learning_rate": 6.221719457013575e-07, "loss": 0.273, "step": 1943 }, { "epoch": 2.964149504195271, "grad_norm": 0.1912783145981763, "learning_rate": 5.938914027149321e-07, "loss": 0.2903, "step": 1944 }, { "epoch": 2.965675057208238, "grad_norm": 0.17985116607145232, "learning_rate": 5.656108597285068e-07, "loss": 0.2575, "step": 1945 }, { "epoch": 2.967200610221205, "grad_norm": 0.1815625133149068, "learning_rate": 5.373303167420814e-07, "loss": 0.2795, "step": 1946 }, { "epoch": 2.968726163234172, "grad_norm": 0.18371116817903616, "learning_rate": 5.090497737556561e-07, "loss": 0.2699, "step": 1947 }, { "epoch": 2.9702517162471396, "grad_norm": 0.1907887758378023, "learning_rate": 4.807692307692308e-07, "loss": 0.2649, "step": 1948 }, { "epoch": 2.971777269260107, "grad_norm": 0.20336867493564428, "learning_rate": 4.524886877828055e-07, "loss": 0.277, "step": 1949 }, { "epoch": 2.973302822273074, "grad_norm": 0.1883546053382782, "learning_rate": 4.2420814479638006e-07, "loss": 0.2703, "step": 1950 }, { "epoch": 2.974828375286041, "grad_norm": 0.1869503181312374, "learning_rate": 3.9592760180995477e-07, "loss": 0.2733, "step": 1951 }, { "epoch": 2.9763539282990084, "grad_norm": 0.19019072092772865, "learning_rate": 3.6764705882352943e-07, "loss": 0.2701, "step": 1952 }, { "epoch": 2.9778794813119758, "grad_norm": 0.18738242994591303, "learning_rate": 3.393665158371041e-07, "loss": 0.2831, "step": 1953 }, { "epoch": 2.9794050343249427, "grad_norm": 0.20575169197370763, "learning_rate": 3.1108597285067875e-07, "loss": 0.2786, "step": 1954 }, { "epoch": 2.9809305873379097, "grad_norm": 0.19031615531559876, "learning_rate": 2.828054298642534e-07, "loss": 0.274, "step": 1955 }, { "epoch": 2.982456140350877, "grad_norm": 0.21603455966727772, "learning_rate": 2.5452488687782807e-07, "loss": 0.2883, "step": 1956 }, { "epoch": 2.9839816933638446, "grad_norm": 0.20135972245746994, "learning_rate": 2.2624434389140275e-07, "loss": 0.2711, "step": 1957 }, { "epoch": 2.9855072463768115, "grad_norm": 0.1952807212804495, "learning_rate": 1.9796380090497739e-07, "loss": 0.2821, "step": 1958 }, { "epoch": 2.987032799389779, "grad_norm": 0.1940081598237213, "learning_rate": 1.6968325791855205e-07, "loss": 0.2933, "step": 1959 }, { "epoch": 2.988558352402746, "grad_norm": 0.18405338453755415, "learning_rate": 1.414027149321267e-07, "loss": 0.2643, "step": 1960 }, { "epoch": 2.9900839054157133, "grad_norm": 0.18714699572961324, "learning_rate": 1.1312217194570138e-07, "loss": 0.291, "step": 1961 }, { "epoch": 2.9916094584286803, "grad_norm": 0.18954699349831053, "learning_rate": 8.484162895927602e-08, "loss": 0.2647, "step": 1962 }, { "epoch": 2.9931350114416477, "grad_norm": 0.2010780900259728, "learning_rate": 5.656108597285069e-08, "loss": 0.2705, "step": 1963 }, { "epoch": 2.9946605644546147, "grad_norm": 0.19383533843403955, "learning_rate": 2.8280542986425344e-08, "loss": 0.2679, "step": 1964 }, { "epoch": 2.996186117467582, "grad_norm": 0.1866714901699595, "learning_rate": 0.0, "loss": 0.2768, "step": 1965 }, { "epoch": 2.996186117467582, "step": 1965, "total_flos": 1.6782329569272136e+18, "train_loss": 0.4397198457905961, "train_runtime": 113727.7387, "train_samples_per_second": 0.277, "train_steps_per_second": 0.017 } ], "logging_steps": 1, "max_steps": 1965, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.6782329569272136e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }