{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9993453355155482, "eval_steps": 382, "global_step": 3054, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 5.687138080596924, "learning_rate": 2.9999999999999997e-05, "loss": 3.5097, "step": 1 }, { "epoch": 0.0, "eval_loss": 3.6327099800109863, "eval_runtime": 39.1673, "eval_samples_per_second": 32.859, "eval_steps_per_second": 8.221, "step": 1 }, { "epoch": 0.0, "grad_norm": 5.729796886444092, "learning_rate": 5.9999999999999995e-05, "loss": 3.6634, "step": 2 }, { "epoch": 0.0, "grad_norm": 5.679180145263672, "learning_rate": 8.999999999999999e-05, "loss": 3.5559, "step": 3 }, { "epoch": 0.0, "grad_norm": 4.81653356552124, "learning_rate": 0.00011999999999999999, "loss": 3.1536, "step": 4 }, { "epoch": 0.0, "grad_norm": 4.388213634490967, "learning_rate": 0.00015, "loss": 2.3092, "step": 5 }, { "epoch": 0.0, "grad_norm": 2.6662285327911377, "learning_rate": 0.00017999999999999998, "loss": 1.2283, "step": 6 }, { "epoch": 0.0, "grad_norm": 1.9162248373031616, "learning_rate": 0.00020999999999999998, "loss": 0.6207, "step": 7 }, { "epoch": 0.01, "grad_norm": 1.3946017026901245, "learning_rate": 0.00023999999999999998, "loss": 0.2942, "step": 8 }, { "epoch": 0.01, "grad_norm": 0.3801995813846588, "learning_rate": 0.00027, "loss": 0.1143, "step": 9 }, { "epoch": 0.01, "grad_norm": 0.2290647178888321, "learning_rate": 0.0003, "loss": 0.1152, "step": 10 }, { "epoch": 0.01, "grad_norm": 0.2698324918746948, "learning_rate": 0.00029999996457265966, "loss": 0.0984, "step": 11 }, { "epoch": 0.01, "grad_norm": 0.15049245953559875, "learning_rate": 0.00029999985829065547, "loss": 0.0925, "step": 12 }, { "epoch": 0.01, "grad_norm": 0.7001833319664001, "learning_rate": 0.0002999996811540376, "loss": 0.1215, "step": 13 }, { "epoch": 0.01, "grad_norm": 0.22832374274730682, "learning_rate": 0.00029999943316288974, "loss": 0.0997, "step": 14 }, { "epoch": 0.01, "grad_norm": 0.1290595531463623, "learning_rate": 0.00029999911431732894, "loss": 0.0973, "step": 15 }, { "epoch": 0.01, "grad_norm": 0.3555549383163452, "learning_rate": 0.00029999872461750597, "loss": 0.1108, "step": 16 }, { "epoch": 0.01, "grad_norm": 0.04830395057797432, "learning_rate": 0.0002999982640636048, "loss": 0.0994, "step": 17 }, { "epoch": 0.01, "grad_norm": 0.2727436125278473, "learning_rate": 0.00029999773265584304, "loss": 0.1144, "step": 18 }, { "epoch": 0.01, "grad_norm": 0.03478335589170456, "learning_rate": 0.0002999971303944716, "loss": 0.0945, "step": 19 }, { "epoch": 0.01, "grad_norm": 0.133951798081398, "learning_rate": 0.00029999645727977505, "loss": 0.0928, "step": 20 }, { "epoch": 0.01, "grad_norm": 0.20885471999645233, "learning_rate": 0.0002999957133120714, "loss": 0.1056, "step": 21 }, { "epoch": 0.01, "grad_norm": 0.030896561220288277, "learning_rate": 0.00029999489849171195, "loss": 0.0985, "step": 22 }, { "epoch": 0.02, "grad_norm": 0.0476481132209301, "learning_rate": 0.0002999940128190817, "loss": 0.0993, "step": 23 }, { "epoch": 0.02, "grad_norm": 0.2006714642047882, "learning_rate": 0.00029999305629459895, "loss": 0.0971, "step": 24 }, { "epoch": 0.02, "grad_norm": 0.150727316737175, "learning_rate": 0.0002999920289187155, "loss": 0.1016, "step": 25 }, { "epoch": 0.02, "grad_norm": 0.03271281719207764, "learning_rate": 0.0002999909306919168, "loss": 0.1002, "step": 26 }, { "epoch": 0.02, "grad_norm": 0.08288753032684326, "learning_rate": 0.0002999897616147214, "loss": 0.1009, "step": 27 }, { "epoch": 0.02, "grad_norm": 0.2443581521511078, "learning_rate": 0.0002999885216876816, "loss": 0.1036, "step": 28 }, { "epoch": 0.02, "grad_norm": 0.16865722835063934, "learning_rate": 0.00029998721091138323, "loss": 0.0965, "step": 29 }, { "epoch": 0.02, "grad_norm": 0.19362947344779968, "learning_rate": 0.0002999858292864453, "loss": 0.0952, "step": 30 }, { "epoch": 0.02, "grad_norm": 0.039490532130002975, "learning_rate": 0.0002999843768135205, "loss": 0.0967, "step": 31 }, { "epoch": 0.02, "grad_norm": 0.15848855674266815, "learning_rate": 0.0002999828534932949, "loss": 0.093, "step": 32 }, { "epoch": 0.02, "grad_norm": 0.2813495695590973, "learning_rate": 0.0002999812593264881, "loss": 0.1052, "step": 33 }, { "epoch": 0.02, "grad_norm": 0.03380066901445389, "learning_rate": 0.00029997959431385314, "loss": 0.0974, "step": 34 }, { "epoch": 0.02, "grad_norm": 0.050066880881786346, "learning_rate": 0.0002999778584561764, "loss": 0.0972, "step": 35 }, { "epoch": 0.02, "grad_norm": 0.2120673805475235, "learning_rate": 0.00029997605175427803, "loss": 0.0965, "step": 36 }, { "epoch": 0.02, "grad_norm": 0.11290993541479111, "learning_rate": 0.0002999741742090113, "loss": 0.099, "step": 37 }, { "epoch": 0.02, "grad_norm": 0.2454652190208435, "learning_rate": 0.00029997222582126313, "loss": 0.0898, "step": 38 }, { "epoch": 0.03, "grad_norm": 0.10817914456129074, "learning_rate": 0.0002999702065919539, "loss": 0.0887, "step": 39 }, { "epoch": 0.03, "grad_norm": 0.3510904014110565, "learning_rate": 0.00029996811652203737, "loss": 0.1107, "step": 40 }, { "epoch": 0.03, "grad_norm": 0.3444919288158417, "learning_rate": 0.0002999659556125009, "loss": 0.1113, "step": 41 }, { "epoch": 0.03, "grad_norm": 0.21621473133563995, "learning_rate": 0.0002999637238643651, "loss": 0.0991, "step": 42 }, { "epoch": 0.03, "grad_norm": 0.0429786741733551, "learning_rate": 0.00029996142127868426, "loss": 0.0976, "step": 43 }, { "epoch": 0.03, "grad_norm": 0.04371911287307739, "learning_rate": 0.000299959047856546, "loss": 0.0969, "step": 44 }, { "epoch": 0.03, "grad_norm": 0.17956386506557465, "learning_rate": 0.00029995660359907154, "loss": 0.1027, "step": 45 }, { "epoch": 0.03, "grad_norm": 0.05985981971025467, "learning_rate": 0.0002999540885074153, "loss": 0.0911, "step": 46 }, { "epoch": 0.03, "grad_norm": 0.057165782898664474, "learning_rate": 0.00029995150258276546, "loss": 0.0944, "step": 47 }, { "epoch": 0.03, "grad_norm": 0.06133668124675751, "learning_rate": 0.00029994884582634345, "loss": 0.0936, "step": 48 }, { "epoch": 0.03, "grad_norm": 0.13429470360279083, "learning_rate": 0.0002999461182394042, "loss": 0.0932, "step": 49 }, { "epoch": 0.03, "grad_norm": 0.08454808592796326, "learning_rate": 0.00029994331982323625, "loss": 0.0849, "step": 50 }, { "epoch": 0.03, "grad_norm": 0.152529776096344, "learning_rate": 0.0002999404505791613, "loss": 0.0742, "step": 51 }, { "epoch": 0.03, "grad_norm": 0.9239559173583984, "learning_rate": 0.0002999375105085348, "loss": 0.1266, "step": 52 }, { "epoch": 0.03, "grad_norm": 0.15827079117298126, "learning_rate": 0.0002999344996127455, "loss": 0.0685, "step": 53 }, { "epoch": 0.04, "grad_norm": 0.11372745782136917, "learning_rate": 0.0002999314178932156, "loss": 0.0853, "step": 54 }, { "epoch": 0.04, "grad_norm": 0.11947692930698395, "learning_rate": 0.00029992826535140093, "loss": 0.0871, "step": 55 }, { "epoch": 0.04, "grad_norm": 0.09731484949588776, "learning_rate": 0.00029992504198879047, "loss": 0.0799, "step": 56 }, { "epoch": 0.04, "grad_norm": 0.40479809045791626, "learning_rate": 0.0002999217478069069, "loss": 0.1119, "step": 57 }, { "epoch": 0.04, "grad_norm": 0.10651114583015442, "learning_rate": 0.00029991838280730635, "loss": 0.0741, "step": 58 }, { "epoch": 0.04, "grad_norm": 0.13227766752243042, "learning_rate": 0.0002999149469915782, "loss": 0.067, "step": 59 }, { "epoch": 0.04, "grad_norm": 0.2328774333000183, "learning_rate": 0.0002999114403613454, "loss": 0.0872, "step": 60 }, { "epoch": 0.04, "grad_norm": 0.15303733944892883, "learning_rate": 0.0002999078629182645, "loss": 0.077, "step": 61 }, { "epoch": 0.04, "grad_norm": 0.3285676836967468, "learning_rate": 0.0002999042146640252, "loss": 0.087, "step": 62 }, { "epoch": 0.04, "grad_norm": 0.1548561453819275, "learning_rate": 0.00029990049560035093, "loss": 0.0521, "step": 63 }, { "epoch": 0.04, "grad_norm": 0.1792415827512741, "learning_rate": 0.0002998967057289983, "loss": 0.0591, "step": 64 }, { "epoch": 0.04, "grad_norm": 0.29741746187210083, "learning_rate": 0.0002998928450517577, "loss": 0.0955, "step": 65 }, { "epoch": 0.04, "grad_norm": 0.2590031325817108, "learning_rate": 0.0002998889135704527, "loss": 0.0443, "step": 66 }, { "epoch": 0.04, "grad_norm": 0.2152624875307083, "learning_rate": 0.0002998849112869403, "loss": 0.0656, "step": 67 }, { "epoch": 0.04, "grad_norm": 0.1976858377456665, "learning_rate": 0.0002998808382031111, "loss": 0.0256, "step": 68 }, { "epoch": 0.05, "grad_norm": 0.4642391502857208, "learning_rate": 0.00029987669432088917, "loss": 0.074, "step": 69 }, { "epoch": 0.05, "grad_norm": 0.43813541531562805, "learning_rate": 0.0002998724796422318, "loss": 0.0344, "step": 70 }, { "epoch": 0.05, "grad_norm": 0.8069552183151245, "learning_rate": 0.0002998681941691299, "loss": 0.1559, "step": 71 }, { "epoch": 0.05, "grad_norm": 0.3986961841583252, "learning_rate": 0.00029986383790360776, "loss": 0.0504, "step": 72 }, { "epoch": 0.05, "grad_norm": 0.19154639542102814, "learning_rate": 0.00029985941084772317, "loss": 0.0638, "step": 73 }, { "epoch": 0.05, "grad_norm": 0.2110302895307541, "learning_rate": 0.0002998549130035673, "loss": 0.071, "step": 74 }, { "epoch": 0.05, "grad_norm": 0.17988017201423645, "learning_rate": 0.00029985034437326477, "loss": 0.0798, "step": 75 }, { "epoch": 0.05, "grad_norm": 0.15195637941360474, "learning_rate": 0.0002998457049589736, "loss": 0.0575, "step": 76 }, { "epoch": 0.05, "grad_norm": 0.2465752810239792, "learning_rate": 0.0002998409947628854, "loss": 0.0669, "step": 77 }, { "epoch": 0.05, "grad_norm": 0.10329095274209976, "learning_rate": 0.0002998362137872249, "loss": 0.0483, "step": 78 }, { "epoch": 0.05, "grad_norm": 0.21354705095291138, "learning_rate": 0.00029983136203425064, "loss": 0.0522, "step": 79 }, { "epoch": 0.05, "grad_norm": 0.1916392743587494, "learning_rate": 0.00029982643950625436, "loss": 0.0797, "step": 80 }, { "epoch": 0.05, "grad_norm": 0.12721975147724152, "learning_rate": 0.0002998214462055613, "loss": 0.0368, "step": 81 }, { "epoch": 0.05, "grad_norm": 0.29551053047180176, "learning_rate": 0.0002998163821345301, "loss": 0.094, "step": 82 }, { "epoch": 0.05, "grad_norm": 0.3058943748474121, "learning_rate": 0.00029981124729555283, "loss": 0.0358, "step": 83 }, { "epoch": 0.05, "grad_norm": 0.7026583552360535, "learning_rate": 0.00029980604169105497, "loss": 0.1386, "step": 84 }, { "epoch": 0.06, "grad_norm": 0.37371405959129333, "learning_rate": 0.00029980076532349557, "loss": 0.0748, "step": 85 }, { "epoch": 0.06, "grad_norm": 0.28942474722862244, "learning_rate": 0.00029979541819536695, "loss": 0.1037, "step": 86 }, { "epoch": 0.06, "grad_norm": 0.1699017435312271, "learning_rate": 0.0002997900003091949, "loss": 0.0631, "step": 87 }, { "epoch": 0.06, "grad_norm": 0.1061767190694809, "learning_rate": 0.0002997845116675386, "loss": 0.0557, "step": 88 }, { "epoch": 0.06, "grad_norm": 0.13656219840049744, "learning_rate": 0.0002997789522729908, "loss": 0.0637, "step": 89 }, { "epoch": 0.06, "grad_norm": 0.09874790161848068, "learning_rate": 0.00029977332212817746, "loss": 0.0495, "step": 90 }, { "epoch": 0.06, "grad_norm": 0.24591276049613953, "learning_rate": 0.0002997676212357581, "loss": 0.0559, "step": 91 }, { "epoch": 0.06, "grad_norm": 0.14663195610046387, "learning_rate": 0.0002997618495984256, "loss": 0.0804, "step": 92 }, { "epoch": 0.06, "grad_norm": 0.08905334770679474, "learning_rate": 0.0002997560072189062, "loss": 0.0498, "step": 93 }, { "epoch": 0.06, "grad_norm": 0.12921252846717834, "learning_rate": 0.00029975009409995986, "loss": 0.0365, "step": 94 }, { "epoch": 0.06, "grad_norm": 0.08008511364459991, "learning_rate": 0.0002997441102443795, "loss": 0.03, "step": 95 }, { "epoch": 0.06, "grad_norm": 0.2947149872779846, "learning_rate": 0.0002997380556549918, "loss": 0.0698, "step": 96 }, { "epoch": 0.06, "grad_norm": 0.3243441581726074, "learning_rate": 0.0002997319303346567, "loss": 0.0564, "step": 97 }, { "epoch": 0.06, "grad_norm": 0.28058576583862305, "learning_rate": 0.00029972573428626757, "loss": 0.1262, "step": 98 }, { "epoch": 0.06, "grad_norm": 0.40957021713256836, "learning_rate": 0.0002997194675127512, "loss": 0.0471, "step": 99 }, { "epoch": 0.07, "grad_norm": 0.12092690169811249, "learning_rate": 0.00029971313001706787, "loss": 0.0574, "step": 100 }, { "epoch": 0.07, "grad_norm": 0.380398154258728, "learning_rate": 0.0002997067218022111, "loss": 0.1148, "step": 101 }, { "epoch": 0.07, "grad_norm": 0.13584262132644653, "learning_rate": 0.0002997002428712079, "loss": 0.0299, "step": 102 }, { "epoch": 0.07, "grad_norm": 0.13165581226348877, "learning_rate": 0.00029969369322711874, "loss": 0.0602, "step": 103 }, { "epoch": 0.07, "grad_norm": 0.1055503860116005, "learning_rate": 0.00029968707287303744, "loss": 0.0404, "step": 104 }, { "epoch": 0.07, "grad_norm": 0.09600503742694855, "learning_rate": 0.00029968038181209114, "loss": 0.0497, "step": 105 }, { "epoch": 0.07, "grad_norm": 0.05941639468073845, "learning_rate": 0.0002996736200474406, "loss": 0.0456, "step": 106 }, { "epoch": 0.07, "grad_norm": 0.1557297259569168, "learning_rate": 0.0002996667875822797, "loss": 0.077, "step": 107 }, { "epoch": 0.07, "grad_norm": 0.14879021048545837, "learning_rate": 0.00029965988441983595, "loss": 0.0554, "step": 108 }, { "epoch": 0.07, "grad_norm": 0.13067294657230377, "learning_rate": 0.00029965291056337006, "loss": 0.0357, "step": 109 }, { "epoch": 0.07, "grad_norm": 0.15178795158863068, "learning_rate": 0.00029964586601617633, "loss": 0.0433, "step": 110 }, { "epoch": 0.07, "grad_norm": 0.1176379844546318, "learning_rate": 0.0002996387507815823, "loss": 0.0432, "step": 111 }, { "epoch": 0.07, "grad_norm": 0.048378992825746536, "learning_rate": 0.000299631564862949, "loss": 0.0338, "step": 112 }, { "epoch": 0.07, "grad_norm": 0.09883740544319153, "learning_rate": 0.0002996243082636708, "loss": 0.0475, "step": 113 }, { "epoch": 0.07, "grad_norm": 0.16062304377555847, "learning_rate": 0.0002996169809871754, "loss": 0.0595, "step": 114 }, { "epoch": 0.08, "grad_norm": 0.06556422263383865, "learning_rate": 0.00029960958303692397, "loss": 0.0326, "step": 115 }, { "epoch": 0.08, "grad_norm": 0.7436458468437195, "learning_rate": 0.000299602114416411, "loss": 0.0512, "step": 116 }, { "epoch": 0.08, "grad_norm": 0.12153153866529465, "learning_rate": 0.00029959457512916454, "loss": 0.0448, "step": 117 }, { "epoch": 0.08, "grad_norm": 0.21684418618679047, "learning_rate": 0.0002995869651787458, "loss": 0.0754, "step": 118 }, { "epoch": 0.08, "grad_norm": 0.13978178799152374, "learning_rate": 0.0002995792845687494, "loss": 0.03, "step": 119 }, { "epoch": 0.08, "grad_norm": 0.08695519715547562, "learning_rate": 0.0002995715333028034, "loss": 0.0156, "step": 120 }, { "epoch": 0.08, "grad_norm": 0.2607383131980896, "learning_rate": 0.0002995637113845693, "loss": 0.0933, "step": 121 }, { "epoch": 0.08, "grad_norm": 0.08398541808128357, "learning_rate": 0.0002995558188177418, "loss": 0.0368, "step": 122 }, { "epoch": 0.08, "grad_norm": 0.14658145606517792, "learning_rate": 0.0002995478556060492, "loss": 0.0593, "step": 123 }, { "epoch": 0.08, "grad_norm": 0.09054147452116013, "learning_rate": 0.00029953982175325293, "loss": 0.042, "step": 124 }, { "epoch": 0.08, "grad_norm": 0.17315314710140228, "learning_rate": 0.0002995317172631479, "loss": 0.0754, "step": 125 }, { "epoch": 0.08, "grad_norm": 0.20856395363807678, "learning_rate": 0.0002995235421395624, "loss": 0.0537, "step": 126 }, { "epoch": 0.08, "grad_norm": 0.17539943754673004, "learning_rate": 0.0002995152963863581, "loss": 0.045, "step": 127 }, { "epoch": 0.08, "grad_norm": 0.1361098289489746, "learning_rate": 0.00029950698000743, "loss": 0.0622, "step": 128 }, { "epoch": 0.08, "grad_norm": 0.05299444869160652, "learning_rate": 0.00029949859300670644, "loss": 0.0548, "step": 129 }, { "epoch": 0.09, "grad_norm": 0.19711115956306458, "learning_rate": 0.0002994901353881491, "loss": 0.0721, "step": 130 }, { "epoch": 0.09, "grad_norm": 0.1288406252861023, "learning_rate": 0.0002994816071557532, "loss": 0.0408, "step": 131 }, { "epoch": 0.09, "grad_norm": 0.08221332728862762, "learning_rate": 0.000299473008313547, "loss": 0.0526, "step": 132 }, { "epoch": 0.09, "grad_norm": 0.1506081223487854, "learning_rate": 0.00029946433886559237, "loss": 0.0542, "step": 133 }, { "epoch": 0.09, "grad_norm": 0.293639600276947, "learning_rate": 0.00029945559881598444, "loss": 0.0769, "step": 134 }, { "epoch": 0.09, "grad_norm": 0.06451396644115448, "learning_rate": 0.0002994467881688517, "loss": 0.0417, "step": 135 }, { "epoch": 0.09, "grad_norm": 0.2765437662601471, "learning_rate": 0.00029943790692835604, "loss": 0.0617, "step": 136 }, { "epoch": 0.09, "grad_norm": 0.12035606801509857, "learning_rate": 0.00029942895509869254, "loss": 0.0429, "step": 137 }, { "epoch": 0.09, "grad_norm": 0.09559385478496552, "learning_rate": 0.0002994199326840898, "loss": 0.044, "step": 138 }, { "epoch": 0.09, "grad_norm": 0.13433387875556946, "learning_rate": 0.00029941083968880965, "loss": 0.036, "step": 139 }, { "epoch": 0.09, "grad_norm": 0.1325090080499649, "learning_rate": 0.0002994016761171474, "loss": 0.0762, "step": 140 }, { "epoch": 0.09, "grad_norm": 0.19197365641593933, "learning_rate": 0.00029939244197343143, "loss": 0.0587, "step": 141 }, { "epoch": 0.09, "grad_norm": 0.09238675236701965, "learning_rate": 0.00029938313726202376, "loss": 0.0262, "step": 142 }, { "epoch": 0.09, "grad_norm": 0.2584728002548218, "learning_rate": 0.0002993737619873195, "loss": 0.0382, "step": 143 }, { "epoch": 0.09, "grad_norm": 0.30280745029449463, "learning_rate": 0.00029936431615374727, "loss": 0.0448, "step": 144 }, { "epoch": 0.09, "grad_norm": 0.41464564204216003, "learning_rate": 0.00029935479976576896, "loss": 0.0676, "step": 145 }, { "epoch": 0.1, "grad_norm": 0.4580010175704956, "learning_rate": 0.00029934521282787974, "loss": 0.1366, "step": 146 }, { "epoch": 0.1, "grad_norm": 0.1701657474040985, "learning_rate": 0.0002993355553446081, "loss": 0.0844, "step": 147 }, { "epoch": 0.1, "grad_norm": 0.10784261673688889, "learning_rate": 0.000299325827320516, "loss": 0.0211, "step": 148 }, { "epoch": 0.1, "grad_norm": 0.08266110718250275, "learning_rate": 0.0002993160287601984, "loss": 0.0181, "step": 149 }, { "epoch": 0.1, "grad_norm": 0.20068615674972534, "learning_rate": 0.00029930615966828407, "loss": 0.0582, "step": 150 }, { "epoch": 0.1, "grad_norm": 0.14237689971923828, "learning_rate": 0.0002992962200494347, "loss": 0.0549, "step": 151 }, { "epoch": 0.1, "grad_norm": 0.09671233594417572, "learning_rate": 0.0002992862099083453, "loss": 0.0368, "step": 152 }, { "epoch": 0.1, "grad_norm": 0.11356969177722931, "learning_rate": 0.00029927612924974455, "loss": 0.0851, "step": 153 }, { "epoch": 0.1, "grad_norm": 0.17435969412326813, "learning_rate": 0.00029926597807839394, "loss": 0.0869, "step": 154 }, { "epoch": 0.1, "grad_norm": 0.09785137325525284, "learning_rate": 0.00029925575639908866, "loss": 0.0463, "step": 155 }, { "epoch": 0.1, "grad_norm": 0.143271341919899, "learning_rate": 0.0002992454642166571, "loss": 0.0532, "step": 156 }, { "epoch": 0.1, "grad_norm": 0.1381101906299591, "learning_rate": 0.0002992351015359608, "loss": 0.0512, "step": 157 }, { "epoch": 0.1, "grad_norm": 0.0688018947839737, "learning_rate": 0.0002992246683618948, "loss": 0.0188, "step": 158 }, { "epoch": 0.1, "grad_norm": 0.18138591945171356, "learning_rate": 0.0002992141646993874, "loss": 0.0737, "step": 159 }, { "epoch": 0.1, "grad_norm": 0.0729256346821785, "learning_rate": 0.0002992035905534001, "loss": 0.0194, "step": 160 }, { "epoch": 0.11, "grad_norm": 0.15414761006832123, "learning_rate": 0.0002991929459289277, "loss": 0.0412, "step": 161 }, { "epoch": 0.11, "grad_norm": 0.2506199777126312, "learning_rate": 0.00029918223083099846, "loss": 0.0789, "step": 162 }, { "epoch": 0.11, "grad_norm": 0.16611520946025848, "learning_rate": 0.00029917144526467375, "loss": 0.046, "step": 163 }, { "epoch": 0.11, "grad_norm": 0.1828208565711975, "learning_rate": 0.00029916058923504826, "loss": 0.0324, "step": 164 }, { "epoch": 0.11, "grad_norm": 0.08737993985414505, "learning_rate": 0.00029914966274725006, "loss": 0.0177, "step": 165 }, { "epoch": 0.11, "grad_norm": 0.20271027088165283, "learning_rate": 0.00029913866580644037, "loss": 0.0455, "step": 166 }, { "epoch": 0.11, "grad_norm": 0.04210209473967552, "learning_rate": 0.00029912759841781383, "loss": 0.0063, "step": 167 }, { "epoch": 0.11, "grad_norm": 0.09085400402545929, "learning_rate": 0.00029911646058659825, "loss": 0.0174, "step": 168 }, { "epoch": 0.11, "grad_norm": 0.18242572247982025, "learning_rate": 0.00029910525231805466, "loss": 0.053, "step": 169 }, { "epoch": 0.11, "grad_norm": 0.2796941101551056, "learning_rate": 0.0002990939736174776, "loss": 0.0348, "step": 170 }, { "epoch": 0.11, "grad_norm": 0.18838226795196533, "learning_rate": 0.00029908262449019463, "loss": 0.0583, "step": 171 }, { "epoch": 0.11, "grad_norm": 0.03574841469526291, "learning_rate": 0.00029907120494156674, "loss": 0.0058, "step": 172 }, { "epoch": 0.11, "grad_norm": 0.18582922220230103, "learning_rate": 0.00029905971497698805, "loss": 0.0571, "step": 173 }, { "epoch": 0.11, "grad_norm": 0.12871672213077545, "learning_rate": 0.00029904815460188604, "loss": 0.0618, "step": 174 }, { "epoch": 0.11, "grad_norm": 0.0590621717274189, "learning_rate": 0.00029903652382172143, "loss": 0.0107, "step": 175 }, { "epoch": 0.12, "grad_norm": 0.07922167330980301, "learning_rate": 0.00029902482264198817, "loss": 0.035, "step": 176 }, { "epoch": 0.12, "grad_norm": 0.3096056878566742, "learning_rate": 0.0002990130510682135, "loss": 0.0782, "step": 177 }, { "epoch": 0.12, "grad_norm": 0.1896304190158844, "learning_rate": 0.00029900120910595783, "loss": 0.036, "step": 178 }, { "epoch": 0.12, "grad_norm": 0.11776513606309891, "learning_rate": 0.000298989296760815, "loss": 0.0521, "step": 179 }, { "epoch": 0.12, "grad_norm": 0.11616750061511993, "learning_rate": 0.00029897731403841194, "loss": 0.0275, "step": 180 }, { "epoch": 0.12, "grad_norm": 0.20179390907287598, "learning_rate": 0.0002989652609444088, "loss": 0.0514, "step": 181 }, { "epoch": 0.12, "grad_norm": 0.14983738958835602, "learning_rate": 0.00029895313748449907, "loss": 0.077, "step": 182 }, { "epoch": 0.12, "grad_norm": 0.12123002856969833, "learning_rate": 0.0002989409436644095, "loss": 0.0485, "step": 183 }, { "epoch": 0.12, "grad_norm": 0.314486026763916, "learning_rate": 0.0002989286794898999, "loss": 0.0931, "step": 184 }, { "epoch": 0.12, "grad_norm": 0.132719025015831, "learning_rate": 0.0002989163449667636, "loss": 0.047, "step": 185 }, { "epoch": 0.12, "grad_norm": 0.07938767969608307, "learning_rate": 0.00029890394010082677, "loss": 0.0364, "step": 186 }, { "epoch": 0.12, "grad_norm": 0.08216488361358643, "learning_rate": 0.00029889146489794926, "loss": 0.0299, "step": 187 }, { "epoch": 0.12, "grad_norm": 0.19339217245578766, "learning_rate": 0.00029887891936402375, "loss": 0.0408, "step": 188 }, { "epoch": 0.12, "grad_norm": 0.30395349860191345, "learning_rate": 0.0002988663035049763, "loss": 0.0865, "step": 189 }, { "epoch": 0.12, "grad_norm": 0.21264804899692535, "learning_rate": 0.0002988536173267663, "loss": 0.0584, "step": 190 }, { "epoch": 0.13, "grad_norm": 0.1590937227010727, "learning_rate": 0.0002988408608353862, "loss": 0.0442, "step": 191 }, { "epoch": 0.13, "grad_norm": 0.13069725036621094, "learning_rate": 0.00029882803403686177, "loss": 0.0416, "step": 192 }, { "epoch": 0.13, "grad_norm": 0.1968701034784317, "learning_rate": 0.0002988151369372518, "loss": 0.0586, "step": 193 }, { "epoch": 0.13, "grad_norm": 0.1478463113307953, "learning_rate": 0.00029880216954264856, "loss": 0.0595, "step": 194 }, { "epoch": 0.13, "grad_norm": 0.06919383257627487, "learning_rate": 0.0002987891318591773, "loss": 0.0239, "step": 195 }, { "epoch": 0.13, "grad_norm": 0.11905679851770401, "learning_rate": 0.0002987760238929966, "loss": 0.0345, "step": 196 }, { "epoch": 0.13, "grad_norm": 0.14240068197250366, "learning_rate": 0.00029876284565029816, "loss": 0.0467, "step": 197 }, { "epoch": 0.13, "grad_norm": 0.16097158193588257, "learning_rate": 0.000298749597137307, "loss": 0.0554, "step": 198 }, { "epoch": 0.13, "grad_norm": 0.15597470104694366, "learning_rate": 0.0002987362783602812, "loss": 0.054, "step": 199 }, { "epoch": 0.13, "grad_norm": 0.10321896523237228, "learning_rate": 0.000298722889325512, "loss": 0.0432, "step": 200 }, { "epoch": 0.13, "grad_norm": 0.128427192568779, "learning_rate": 0.000298709430039324, "loss": 0.0315, "step": 201 }, { "epoch": 0.13, "grad_norm": 0.11706223338842392, "learning_rate": 0.00029869590050807487, "loss": 0.0359, "step": 202 }, { "epoch": 0.13, "grad_norm": 0.15359801054000854, "learning_rate": 0.0002986823007381555, "loss": 0.034, "step": 203 }, { "epoch": 0.13, "grad_norm": 0.10363847017288208, "learning_rate": 0.0002986686307359899, "loss": 0.0261, "step": 204 }, { "epoch": 0.13, "grad_norm": 0.12338493019342422, "learning_rate": 0.0002986548905080353, "loss": 0.0287, "step": 205 }, { "epoch": 0.13, "grad_norm": 0.16201013326644897, "learning_rate": 0.00029864108006078205, "loss": 0.0173, "step": 206 }, { "epoch": 0.14, "grad_norm": 0.04950540140271187, "learning_rate": 0.00029862719940075387, "loss": 0.0098, "step": 207 }, { "epoch": 0.14, "grad_norm": 0.20930823683738708, "learning_rate": 0.0002986132485345073, "loss": 0.0652, "step": 208 }, { "epoch": 0.14, "grad_norm": 0.12760238349437714, "learning_rate": 0.0002985992274686324, "loss": 0.0342, "step": 209 }, { "epoch": 0.14, "grad_norm": 0.2107914686203003, "learning_rate": 0.00029858513620975216, "loss": 0.015, "step": 210 }, { "epoch": 0.14, "grad_norm": 0.21169154345989227, "learning_rate": 0.0002985709747645227, "loss": 0.072, "step": 211 }, { "epoch": 0.14, "grad_norm": 0.18555670976638794, "learning_rate": 0.00029855674313963355, "loss": 0.0359, "step": 212 }, { "epoch": 0.14, "grad_norm": 0.1801125705242157, "learning_rate": 0.00029854244134180707, "loss": 0.038, "step": 213 }, { "epoch": 0.14, "grad_norm": 0.10735122859477997, "learning_rate": 0.000298528069377799, "loss": 0.037, "step": 214 }, { "epoch": 0.14, "grad_norm": 0.20155467092990875, "learning_rate": 0.0002985136272543982, "loss": 0.0505, "step": 215 }, { "epoch": 0.14, "grad_norm": 0.1130833774805069, "learning_rate": 0.0002984991149784265, "loss": 0.0202, "step": 216 }, { "epoch": 0.14, "grad_norm": 0.1932414174079895, "learning_rate": 0.00029848453255673906, "loss": 0.0803, "step": 217 }, { "epoch": 0.14, "grad_norm": 0.18907181918621063, "learning_rate": 0.0002984698799962241, "loss": 0.0562, "step": 218 }, { "epoch": 0.14, "grad_norm": 0.11439274251461029, "learning_rate": 0.0002984551573038029, "loss": 0.0474, "step": 219 }, { "epoch": 0.14, "grad_norm": 0.19350704550743103, "learning_rate": 0.00029844036448643, "loss": 0.0335, "step": 220 }, { "epoch": 0.14, "grad_norm": 0.19873294234275818, "learning_rate": 0.000298425501551093, "loss": 0.0616, "step": 221 }, { "epoch": 0.15, "grad_norm": 0.2024085968732834, "learning_rate": 0.00029841056850481265, "loss": 0.0567, "step": 222 }, { "epoch": 0.15, "grad_norm": 0.09004423022270203, "learning_rate": 0.0002983955653546427, "loss": 0.0291, "step": 223 }, { "epoch": 0.15, "grad_norm": 0.19469811022281647, "learning_rate": 0.00029838049210767015, "loss": 0.0487, "step": 224 }, { "epoch": 0.15, "grad_norm": 0.2525189518928528, "learning_rate": 0.00029836534877101514, "loss": 0.0629, "step": 225 }, { "epoch": 0.15, "grad_norm": 0.12139023840427399, "learning_rate": 0.0002983501353518307, "loss": 0.0457, "step": 226 }, { "epoch": 0.15, "grad_norm": 0.06411401927471161, "learning_rate": 0.00029833485185730326, "loss": 0.0186, "step": 227 }, { "epoch": 0.15, "grad_norm": 0.024475542828440666, "learning_rate": 0.00029831949829465214, "loss": 0.004, "step": 228 }, { "epoch": 0.15, "grad_norm": 0.15951114892959595, "learning_rate": 0.0002983040746711298, "loss": 0.0297, "step": 229 }, { "epoch": 0.15, "grad_norm": 0.03694155812263489, "learning_rate": 0.0002982885809940218, "loss": 0.0073, "step": 230 }, { "epoch": 0.15, "grad_norm": 0.13100893795490265, "learning_rate": 0.0002982730172706468, "loss": 0.0272, "step": 231 }, { "epoch": 0.15, "grad_norm": 0.08929093927145004, "learning_rate": 0.00029825738350835665, "loss": 0.0146, "step": 232 }, { "epoch": 0.15, "grad_norm": 0.1474764049053192, "learning_rate": 0.0002982416797145361, "loss": 0.0422, "step": 233 }, { "epoch": 0.15, "grad_norm": 0.13874994218349457, "learning_rate": 0.00029822590589660306, "loss": 0.0353, "step": 234 }, { "epoch": 0.15, "grad_norm": 0.048271678388118744, "learning_rate": 0.00029821006206200856, "loss": 0.0072, "step": 235 }, { "epoch": 0.15, "grad_norm": 0.29017898440361023, "learning_rate": 0.0002981941482182366, "loss": 0.0607, "step": 236 }, { "epoch": 0.16, "grad_norm": 0.3267674446105957, "learning_rate": 0.0002981781643728044, "loss": 0.101, "step": 237 }, { "epoch": 0.16, "grad_norm": 0.17602747678756714, "learning_rate": 0.00029816211053326216, "loss": 0.0236, "step": 238 }, { "epoch": 0.16, "grad_norm": 0.08361077308654785, "learning_rate": 0.00029814598670719304, "loss": 0.0277, "step": 239 }, { "epoch": 0.16, "grad_norm": 0.08593238145112991, "learning_rate": 0.00029812979290221346, "loss": 0.0291, "step": 240 }, { "epoch": 0.16, "grad_norm": 0.08858275413513184, "learning_rate": 0.00029811352912597277, "loss": 0.0329, "step": 241 }, { "epoch": 0.16, "grad_norm": 0.08017202466726303, "learning_rate": 0.0002980971953861534, "loss": 0.0287, "step": 242 }, { "epoch": 0.16, "grad_norm": 0.06615002453327179, "learning_rate": 0.0002980807916904709, "loss": 0.0269, "step": 243 }, { "epoch": 0.16, "grad_norm": 0.12813499569892883, "learning_rate": 0.00029806431804667364, "loss": 0.0321, "step": 244 }, { "epoch": 0.16, "grad_norm": 0.05528206750750542, "learning_rate": 0.0002980477744625433, "loss": 0.0089, "step": 245 }, { "epoch": 0.16, "grad_norm": 0.10161186009645462, "learning_rate": 0.00029803116094589445, "loss": 0.0294, "step": 246 }, { "epoch": 0.16, "grad_norm": 0.09885023534297943, "learning_rate": 0.00029801447750457476, "loss": 0.0232, "step": 247 }, { "epoch": 0.16, "grad_norm": 0.20870375633239746, "learning_rate": 0.00029799772414646484, "loss": 0.0478, "step": 248 }, { "epoch": 0.16, "grad_norm": 0.2730790674686432, "learning_rate": 0.00029798090087947843, "loss": 0.042, "step": 249 }, { "epoch": 0.16, "grad_norm": 0.20371069014072418, "learning_rate": 0.0002979640077115622, "loss": 0.0634, "step": 250 }, { "epoch": 0.16, "grad_norm": 0.14660406112670898, "learning_rate": 0.0002979470446506959, "loss": 0.0201, "step": 251 }, { "epoch": 0.16, "grad_norm": 0.19971100986003876, "learning_rate": 0.0002979300117048923, "loss": 0.0431, "step": 252 }, { "epoch": 0.17, "grad_norm": 0.14965400099754333, "learning_rate": 0.0002979129088821971, "loss": 0.041, "step": 253 }, { "epoch": 0.17, "grad_norm": 0.2110958695411682, "learning_rate": 0.0002978957361906892, "loss": 0.028, "step": 254 }, { "epoch": 0.17, "grad_norm": 0.13050246238708496, "learning_rate": 0.0002978784936384802, "loss": 0.0258, "step": 255 }, { "epoch": 0.17, "grad_norm": 0.0885690301656723, "learning_rate": 0.000297861181233715, "loss": 0.0337, "step": 256 }, { "epoch": 0.17, "grad_norm": 0.26541608572006226, "learning_rate": 0.0002978437989845713, "loss": 0.1142, "step": 257 }, { "epoch": 0.17, "grad_norm": 0.14441104233264923, "learning_rate": 0.0002978263468992599, "loss": 0.0368, "step": 258 }, { "epoch": 0.17, "grad_norm": 0.11450188606977463, "learning_rate": 0.0002978088249860245, "loss": 0.0243, "step": 259 }, { "epoch": 0.17, "grad_norm": 0.3472074568271637, "learning_rate": 0.00029779123325314184, "loss": 0.0786, "step": 260 }, { "epoch": 0.17, "grad_norm": 0.07867071032524109, "learning_rate": 0.0002977735717089217, "loss": 0.0356, "step": 261 }, { "epoch": 0.17, "grad_norm": 0.1661967933177948, "learning_rate": 0.0002977558403617067, "loss": 0.047, "step": 262 }, { "epoch": 0.17, "grad_norm": 0.17638400197029114, "learning_rate": 0.00029773803921987244, "loss": 0.0527, "step": 263 }, { "epoch": 0.17, "grad_norm": 0.05885611101984978, "learning_rate": 0.0002977201682918277, "loss": 0.0156, "step": 264 }, { "epoch": 0.17, "grad_norm": 0.07076411694288254, "learning_rate": 0.00029770222758601395, "loss": 0.0418, "step": 265 }, { "epoch": 0.17, "grad_norm": 0.06245988979935646, "learning_rate": 0.0002976842171109058, "loss": 0.0199, "step": 266 }, { "epoch": 0.17, "grad_norm": 0.08311894536018372, "learning_rate": 0.0002976661368750107, "loss": 0.028, "step": 267 }, { "epoch": 0.18, "grad_norm": 0.11093831807374954, "learning_rate": 0.0002976479868868692, "loss": 0.0298, "step": 268 }, { "epoch": 0.18, "grad_norm": 0.17683441936969757, "learning_rate": 0.00029762976715505464, "loss": 0.0539, "step": 269 }, { "epoch": 0.18, "grad_norm": 0.13351142406463623, "learning_rate": 0.00029761147768817345, "loss": 0.0593, "step": 270 }, { "epoch": 0.18, "grad_norm": 0.07717160880565643, "learning_rate": 0.0002975931184948648, "loss": 0.0227, "step": 271 }, { "epoch": 0.18, "grad_norm": 0.11211559176445007, "learning_rate": 0.0002975746895838011, "loss": 0.0385, "step": 272 }, { "epoch": 0.18, "grad_norm": 0.09209641814231873, "learning_rate": 0.00029755619096368734, "loss": 0.0086, "step": 273 }, { "epoch": 0.18, "grad_norm": 0.0850004106760025, "learning_rate": 0.0002975376226432617, "loss": 0.0343, "step": 274 }, { "epoch": 0.18, "grad_norm": 0.17711663246154785, "learning_rate": 0.0002975189846312952, "loss": 0.0665, "step": 275 }, { "epoch": 0.18, "grad_norm": 0.13066548109054565, "learning_rate": 0.0002975002769365918, "loss": 0.0551, "step": 276 }, { "epoch": 0.18, "grad_norm": 0.07509409636259079, "learning_rate": 0.00029748149956798826, "loss": 0.0087, "step": 277 }, { "epoch": 0.18, "grad_norm": 0.3725223243236542, "learning_rate": 0.0002974626525343544, "loss": 0.026, "step": 278 }, { "epoch": 0.18, "grad_norm": 0.20973052084445953, "learning_rate": 0.0002974437358445929, "loss": 0.015, "step": 279 }, { "epoch": 0.18, "grad_norm": 0.25902581214904785, "learning_rate": 0.0002974247495076393, "loss": 0.0617, "step": 280 }, { "epoch": 0.18, "grad_norm": 0.22490067780017853, "learning_rate": 0.000297405693532462, "loss": 0.0456, "step": 281 }, { "epoch": 0.18, "grad_norm": 0.2885708510875702, "learning_rate": 0.0002973865679280626, "loss": 0.1066, "step": 282 }, { "epoch": 0.19, "grad_norm": 0.2658590078353882, "learning_rate": 0.00029736737270347517, "loss": 0.0931, "step": 283 }, { "epoch": 0.19, "grad_norm": 0.11531944572925568, "learning_rate": 0.00029734810786776687, "loss": 0.0238, "step": 284 }, { "epoch": 0.19, "grad_norm": 0.0557803250849247, "learning_rate": 0.00029732877343003776, "loss": 0.0257, "step": 285 }, { "epoch": 0.19, "grad_norm": 0.10880523920059204, "learning_rate": 0.00029730936939942077, "loss": 0.0387, "step": 286 }, { "epoch": 0.19, "grad_norm": 0.09500639885663986, "learning_rate": 0.0002972898957850816, "loss": 0.0308, "step": 287 }, { "epoch": 0.19, "grad_norm": 0.11504241824150085, "learning_rate": 0.0002972703525962189, "loss": 0.0292, "step": 288 }, { "epoch": 0.19, "grad_norm": 0.10513140261173248, "learning_rate": 0.0002972507398420643, "loss": 0.0245, "step": 289 }, { "epoch": 0.19, "grad_norm": 0.20218555629253387, "learning_rate": 0.000297231057531882, "loss": 0.0394, "step": 290 }, { "epoch": 0.19, "grad_norm": 0.053536418825387955, "learning_rate": 0.00029721130567496936, "loss": 0.0129, "step": 291 }, { "epoch": 0.19, "grad_norm": 0.15879443287849426, "learning_rate": 0.0002971914842806564, "loss": 0.054, "step": 292 }, { "epoch": 0.19, "grad_norm": 0.11933678388595581, "learning_rate": 0.00029717159335830606, "loss": 0.0206, "step": 293 }, { "epoch": 0.19, "grad_norm": 0.14436180889606476, "learning_rate": 0.0002971516329173141, "loss": 0.024, "step": 294 }, { "epoch": 0.19, "grad_norm": 0.01978749968111515, "learning_rate": 0.0002971316029671091, "loss": 0.0047, "step": 295 }, { "epoch": 0.19, "grad_norm": 0.1731237769126892, "learning_rate": 0.00029711150351715253, "loss": 0.0605, "step": 296 }, { "epoch": 0.19, "grad_norm": 0.059307076036930084, "learning_rate": 0.00029709133457693867, "loss": 0.0308, "step": 297 }, { "epoch": 0.2, "grad_norm": 0.3645476996898651, "learning_rate": 0.00029707109615599456, "loss": 0.0566, "step": 298 }, { "epoch": 0.2, "grad_norm": 0.10670791566371918, "learning_rate": 0.0002970507882638801, "loss": 0.0234, "step": 299 }, { "epoch": 0.2, "grad_norm": 0.10919758677482605, "learning_rate": 0.0002970304109101881, "loss": 0.0157, "step": 300 }, { "epoch": 0.2, "grad_norm": 0.08173630386590958, "learning_rate": 0.00029700996410454407, "loss": 0.0371, "step": 301 }, { "epoch": 0.2, "grad_norm": 0.13943839073181152, "learning_rate": 0.00029698944785660635, "loss": 0.0781, "step": 302 }, { "epoch": 0.2, "grad_norm": 0.342821329832077, "learning_rate": 0.00029696886217606605, "loss": 0.0476, "step": 303 }, { "epoch": 0.2, "grad_norm": 0.048615969717502594, "learning_rate": 0.0002969482070726472, "loss": 0.0083, "step": 304 }, { "epoch": 0.2, "grad_norm": 0.1213599145412445, "learning_rate": 0.0002969274825561064, "loss": 0.0258, "step": 305 }, { "epoch": 0.2, "grad_norm": 0.1914874166250229, "learning_rate": 0.0002969066886362333, "loss": 0.034, "step": 306 }, { "epoch": 0.2, "grad_norm": 0.14067624509334564, "learning_rate": 0.0002968858253228502, "loss": 0.0395, "step": 307 }, { "epoch": 0.2, "grad_norm": 0.08359983563423157, "learning_rate": 0.00029686489262581217, "loss": 0.0315, "step": 308 }, { "epoch": 0.2, "grad_norm": 0.11551601439714432, "learning_rate": 0.000296843890555007, "loss": 0.058, "step": 309 }, { "epoch": 0.2, "grad_norm": 0.12968787550926208, "learning_rate": 0.00029682281912035545, "loss": 0.0347, "step": 310 }, { "epoch": 0.2, "grad_norm": 0.10182147473096848, "learning_rate": 0.0002968016783318109, "loss": 0.0165, "step": 311 }, { "epoch": 0.2, "grad_norm": 0.06534916907548904, "learning_rate": 0.00029678046819935934, "loss": 0.0218, "step": 312 }, { "epoch": 0.2, "grad_norm": 0.12587250769138336, "learning_rate": 0.0002967591887330199, "loss": 0.0498, "step": 313 }, { "epoch": 0.21, "grad_norm": 0.06701786816120148, "learning_rate": 0.0002967378399428441, "loss": 0.0484, "step": 314 }, { "epoch": 0.21, "grad_norm": 0.10836692154407501, "learning_rate": 0.00029671642183891643, "loss": 0.0412, "step": 315 }, { "epoch": 0.21, "grad_norm": 0.061415113508701324, "learning_rate": 0.00029669493443135403, "loss": 0.0172, "step": 316 }, { "epoch": 0.21, "grad_norm": 0.20760087668895721, "learning_rate": 0.0002966733777303068, "loss": 0.0494, "step": 317 }, { "epoch": 0.21, "grad_norm": 0.11503862589597702, "learning_rate": 0.00029665175174595736, "loss": 0.0385, "step": 318 }, { "epoch": 0.21, "grad_norm": 0.07366505265235901, "learning_rate": 0.000296630056488521, "loss": 0.0403, "step": 319 }, { "epoch": 0.21, "grad_norm": 0.036951594054698944, "learning_rate": 0.00029660829196824577, "loss": 0.0092, "step": 320 }, { "epoch": 0.21, "grad_norm": 0.08457314223051071, "learning_rate": 0.0002965864581954126, "loss": 0.0445, "step": 321 }, { "epoch": 0.21, "grad_norm": 0.24513787031173706, "learning_rate": 0.0002965645551803349, "loss": 0.0716, "step": 322 }, { "epoch": 0.21, "grad_norm": 0.08235831558704376, "learning_rate": 0.00029654258293335887, "loss": 0.029, "step": 323 }, { "epoch": 0.21, "grad_norm": 0.08004003018140793, "learning_rate": 0.00029652054146486344, "loss": 0.0365, "step": 324 }, { "epoch": 0.21, "grad_norm": 0.14928393065929413, "learning_rate": 0.0002964984307852602, "loss": 0.039, "step": 325 }, { "epoch": 0.21, "grad_norm": 0.1802273988723755, "learning_rate": 0.00029647625090499345, "loss": 0.0324, "step": 326 }, { "epoch": 0.21, "grad_norm": 0.18169750273227692, "learning_rate": 0.00029645400183454026, "loss": 0.0427, "step": 327 }, { "epoch": 0.21, "grad_norm": 0.13121691346168518, "learning_rate": 0.0002964316835844102, "loss": 0.0274, "step": 328 }, { "epoch": 0.22, "grad_norm": 0.27358877658843994, "learning_rate": 0.0002964092961651456, "loss": 0.0537, "step": 329 }, { "epoch": 0.22, "grad_norm": 0.16992299258708954, "learning_rate": 0.0002963868395873216, "loss": 0.0797, "step": 330 }, { "epoch": 0.22, "grad_norm": 0.2110740691423416, "learning_rate": 0.0002963643138615458, "loss": 0.0835, "step": 331 }, { "epoch": 0.22, "grad_norm": 0.17114487290382385, "learning_rate": 0.0002963417189984586, "loss": 0.0619, "step": 332 }, { "epoch": 0.22, "grad_norm": 0.09492560476064682, "learning_rate": 0.000296319055008733, "loss": 0.0212, "step": 333 }, { "epoch": 0.22, "grad_norm": 0.19000209867954254, "learning_rate": 0.0002962963219030746, "loss": 0.0802, "step": 334 }, { "epoch": 0.22, "grad_norm": 0.11632812023162842, "learning_rate": 0.0002962735196922219, "loss": 0.0426, "step": 335 }, { "epoch": 0.22, "grad_norm": 0.15153561532497406, "learning_rate": 0.0002962506483869456, "loss": 0.07, "step": 336 }, { "epoch": 0.22, "grad_norm": 0.0691797137260437, "learning_rate": 0.00029622770799804944, "loss": 0.0246, "step": 337 }, { "epoch": 0.22, "grad_norm": 0.0731196403503418, "learning_rate": 0.0002962046985363697, "loss": 0.0413, "step": 338 }, { "epoch": 0.22, "grad_norm": 0.1449161171913147, "learning_rate": 0.00029618162001277513, "loss": 0.023, "step": 339 }, { "epoch": 0.22, "grad_norm": 0.13844870030879974, "learning_rate": 0.0002961584724381672, "loss": 0.055, "step": 340 }, { "epoch": 0.22, "grad_norm": 0.08192728459835052, "learning_rate": 0.00029613525582348007, "loss": 0.0274, "step": 341 }, { "epoch": 0.22, "grad_norm": 0.030294157564640045, "learning_rate": 0.0002961119701796804, "loss": 0.0332, "step": 342 }, { "epoch": 0.22, "grad_norm": 0.12008962035179138, "learning_rate": 0.0002960886155177675, "loss": 0.0293, "step": 343 }, { "epoch": 0.23, "grad_norm": 0.22829335927963257, "learning_rate": 0.0002960651918487734, "loss": 0.049, "step": 344 }, { "epoch": 0.23, "grad_norm": 0.09662315249443054, "learning_rate": 0.00029604169918376246, "loss": 0.019, "step": 345 }, { "epoch": 0.23, "grad_norm": 0.056000061333179474, "learning_rate": 0.0002960181375338318, "loss": 0.0077, "step": 346 }, { "epoch": 0.23, "grad_norm": 0.04742419347167015, "learning_rate": 0.00029599450691011116, "loss": 0.0216, "step": 347 }, { "epoch": 0.23, "grad_norm": 0.17151907086372375, "learning_rate": 0.0002959708073237628, "loss": 0.0364, "step": 348 }, { "epoch": 0.23, "grad_norm": 0.3108668923377991, "learning_rate": 0.00029594703878598155, "loss": 0.0288, "step": 349 }, { "epoch": 0.23, "grad_norm": 0.05538111925125122, "learning_rate": 0.00029592320130799487, "loss": 0.0048, "step": 350 }, { "epoch": 0.23, "grad_norm": 0.2907853126525879, "learning_rate": 0.00029589929490106263, "loss": 0.0443, "step": 351 }, { "epoch": 0.23, "grad_norm": 0.19189013540744781, "learning_rate": 0.0002958753195764775, "loss": 0.0688, "step": 352 }, { "epoch": 0.23, "grad_norm": 0.3744778037071228, "learning_rate": 0.00029585127534556446, "loss": 0.0726, "step": 353 }, { "epoch": 0.23, "grad_norm": 0.02139083668589592, "learning_rate": 0.00029582716221968124, "loss": 0.003, "step": 354 }, { "epoch": 0.23, "grad_norm": 0.3209889531135559, "learning_rate": 0.00029580298021021796, "loss": 0.068, "step": 355 }, { "epoch": 0.23, "grad_norm": 0.13530127704143524, "learning_rate": 0.0002957787293285974, "loss": 0.0229, "step": 356 }, { "epoch": 0.23, "grad_norm": 0.04955355450510979, "learning_rate": 0.00029575440958627485, "loss": 0.007, "step": 357 }, { "epoch": 0.23, "grad_norm": 0.05992133542895317, "learning_rate": 0.0002957300209947379, "loss": 0.014, "step": 358 }, { "epoch": 0.24, "grad_norm": 0.08975626528263092, "learning_rate": 0.0002957055635655071, "loss": 0.0419, "step": 359 }, { "epoch": 0.24, "grad_norm": 0.3397723436355591, "learning_rate": 0.00029568103731013513, "loss": 0.093, "step": 360 }, { "epoch": 0.24, "grad_norm": 0.05291612446308136, "learning_rate": 0.00029565644224020733, "loss": 0.0137, "step": 361 }, { "epoch": 0.24, "grad_norm": 0.16154609620571136, "learning_rate": 0.0002956317783673416, "loss": 0.0414, "step": 362 }, { "epoch": 0.24, "grad_norm": 0.12861596047878265, "learning_rate": 0.0002956070457031882, "loss": 0.0372, "step": 363 }, { "epoch": 0.24, "grad_norm": 0.09462448954582214, "learning_rate": 0.00029558224425943003, "loss": 0.0292, "step": 364 }, { "epoch": 0.24, "grad_norm": 0.14290063083171844, "learning_rate": 0.00029555737404778233, "loss": 0.0572, "step": 365 }, { "epoch": 0.24, "grad_norm": 0.11055822670459747, "learning_rate": 0.00029553243507999307, "loss": 0.0372, "step": 366 }, { "epoch": 0.24, "grad_norm": 0.10231087356805801, "learning_rate": 0.00029550742736784237, "loss": 0.0368, "step": 367 }, { "epoch": 0.24, "grad_norm": 0.09969429671764374, "learning_rate": 0.00029548235092314304, "loss": 0.0416, "step": 368 }, { "epoch": 0.24, "grad_norm": 0.1207612007856369, "learning_rate": 0.00029545720575774033, "loss": 0.0307, "step": 369 }, { "epoch": 0.24, "grad_norm": 0.11535090953111649, "learning_rate": 0.0002954319918835119, "loss": 0.0296, "step": 370 }, { "epoch": 0.24, "grad_norm": 0.1460224986076355, "learning_rate": 0.00029540670931236786, "loss": 0.0587, "step": 371 }, { "epoch": 0.24, "grad_norm": 0.10432720184326172, "learning_rate": 0.0002953813580562509, "loss": 0.0397, "step": 372 }, { "epoch": 0.24, "grad_norm": 0.2140846997499466, "learning_rate": 0.0002953559381271359, "loss": 0.0538, "step": 373 }, { "epoch": 0.24, "grad_norm": 0.12050808221101761, "learning_rate": 0.00029533044953703044, "loss": 0.0439, "step": 374 }, { "epoch": 0.25, "grad_norm": 0.07928888499736786, "learning_rate": 0.0002953048922979744, "loss": 0.0163, "step": 375 }, { "epoch": 0.25, "grad_norm": 0.08733994513750076, "learning_rate": 0.0002952792664220402, "loss": 0.0219, "step": 376 }, { "epoch": 0.25, "grad_norm": 0.18080447614192963, "learning_rate": 0.0002952535719213325, "loss": 0.0469, "step": 377 }, { "epoch": 0.25, "grad_norm": 0.08348793536424637, "learning_rate": 0.0002952278088079884, "loss": 0.035, "step": 378 }, { "epoch": 0.25, "grad_norm": 0.1347195953130722, "learning_rate": 0.00029520197709417763, "loss": 0.029, "step": 379 }, { "epoch": 0.25, "grad_norm": 0.11075679957866669, "learning_rate": 0.0002951760767921021, "loss": 0.0257, "step": 380 }, { "epoch": 0.25, "grad_norm": 0.13172994554042816, "learning_rate": 0.0002951501079139962, "loss": 0.0302, "step": 381 }, { "epoch": 0.25, "grad_norm": 0.114262655377388, "learning_rate": 0.0002951240704721267, "loss": 0.0492, "step": 382 }, { "epoch": 0.25, "eval_loss": 0.034534960985183716, "eval_runtime": 39.6959, "eval_samples_per_second": 32.421, "eval_steps_per_second": 8.112, "step": 382 }, { "epoch": 0.25, "grad_norm": 0.08364730328321457, "learning_rate": 0.0002950979644787928, "loss": 0.0185, "step": 383 }, { "epoch": 0.25, "grad_norm": 0.16603770852088928, "learning_rate": 0.000295071789946326, "loss": 0.0443, "step": 384 }, { "epoch": 0.25, "grad_norm": 0.1269228458404541, "learning_rate": 0.00029504554688709027, "loss": 0.0217, "step": 385 }, { "epoch": 0.25, "grad_norm": 0.15612861514091492, "learning_rate": 0.0002950192353134819, "loss": 0.0377, "step": 386 }, { "epoch": 0.25, "grad_norm": 0.056646961718797684, "learning_rate": 0.00029499285523792946, "loss": 0.0133, "step": 387 }, { "epoch": 0.25, "grad_norm": 0.23394975066184998, "learning_rate": 0.000294966406672894, "loss": 0.0767, "step": 388 }, { "epoch": 0.25, "grad_norm": 0.21382953226566315, "learning_rate": 0.00029493988963086895, "loss": 0.0729, "step": 389 }, { "epoch": 0.26, "grad_norm": 0.27641353011131287, "learning_rate": 0.00029491330412438, "loss": 0.1022, "step": 390 }, { "epoch": 0.26, "grad_norm": 0.0760459303855896, "learning_rate": 0.0002948866501659852, "loss": 0.0269, "step": 391 }, { "epoch": 0.26, "grad_norm": 0.5418729186058044, "learning_rate": 0.0002948599277682748, "loss": 0.1523, "step": 392 }, { "epoch": 0.26, "grad_norm": 0.13234178721904755, "learning_rate": 0.00029483313694387165, "loss": 0.0292, "step": 393 }, { "epoch": 0.26, "grad_norm": 0.07174021750688553, "learning_rate": 0.00029480627770543086, "loss": 0.0395, "step": 394 }, { "epoch": 0.26, "grad_norm": 0.09958759695291519, "learning_rate": 0.00029477935006563957, "loss": 0.0559, "step": 395 }, { "epoch": 0.26, "grad_norm": 0.07592346519231796, "learning_rate": 0.00029475235403721763, "loss": 0.0488, "step": 396 }, { "epoch": 0.26, "grad_norm": 0.10129998624324799, "learning_rate": 0.00029472528963291685, "loss": 0.0287, "step": 397 }, { "epoch": 0.26, "grad_norm": 0.08051212131977081, "learning_rate": 0.00029469815686552163, "loss": 0.0386, "step": 398 }, { "epoch": 0.26, "grad_norm": 0.0695783942937851, "learning_rate": 0.0002946709557478485, "loss": 0.0201, "step": 399 }, { "epoch": 0.26, "grad_norm": 0.1511554718017578, "learning_rate": 0.00029464368629274624, "loss": 0.0464, "step": 400 }, { "epoch": 0.26, "grad_norm": 0.075484499335289, "learning_rate": 0.00029461634851309597, "loss": 0.031, "step": 401 }, { "epoch": 0.26, "grad_norm": 0.08108027279376984, "learning_rate": 0.00029458894242181114, "loss": 0.0271, "step": 402 }, { "epoch": 0.26, "grad_norm": 0.07254958897829056, "learning_rate": 0.00029456146803183745, "loss": 0.0187, "step": 403 }, { "epoch": 0.26, "grad_norm": 0.215089812874794, "learning_rate": 0.00029453392535615274, "loss": 0.0463, "step": 404 }, { "epoch": 0.27, "grad_norm": 0.034637995064258575, "learning_rate": 0.0002945063144077672, "loss": 0.0084, "step": 405 }, { "epoch": 0.27, "grad_norm": 0.12073606252670288, "learning_rate": 0.00029447863519972337, "loss": 0.0401, "step": 406 }, { "epoch": 0.27, "grad_norm": 0.13762198388576508, "learning_rate": 0.00029445088774509583, "loss": 0.0244, "step": 407 }, { "epoch": 0.27, "grad_norm": 0.2537041902542114, "learning_rate": 0.00029442307205699154, "loss": 0.0574, "step": 408 }, { "epoch": 0.27, "grad_norm": 0.1401953399181366, "learning_rate": 0.00029439518814854956, "loss": 0.0202, "step": 409 }, { "epoch": 0.27, "grad_norm": 0.13872119784355164, "learning_rate": 0.0002943672360329413, "loss": 0.0373, "step": 410 }, { "epoch": 0.27, "grad_norm": 0.3436320126056671, "learning_rate": 0.00029433921572337044, "loss": 0.0944, "step": 411 }, { "epoch": 0.27, "grad_norm": 0.20004349946975708, "learning_rate": 0.00029431112723307266, "loss": 0.0625, "step": 412 }, { "epoch": 0.27, "grad_norm": 0.10176026076078415, "learning_rate": 0.00029428297057531607, "loss": 0.023, "step": 413 }, { "epoch": 0.27, "grad_norm": 0.08603208512067795, "learning_rate": 0.0002942547457634008, "loss": 0.0141, "step": 414 }, { "epoch": 0.27, "grad_norm": 0.03601311519742012, "learning_rate": 0.0002942264528106592, "loss": 0.0071, "step": 415 }, { "epoch": 0.27, "grad_norm": 0.1434870958328247, "learning_rate": 0.000294198091730456, "loss": 0.0362, "step": 416 }, { "epoch": 0.27, "grad_norm": 0.1505521684885025, "learning_rate": 0.0002941696625361879, "loss": 0.0211, "step": 417 }, { "epoch": 0.27, "grad_norm": 0.14390698075294495, "learning_rate": 0.0002941411652412838, "loss": 0.054, "step": 418 }, { "epoch": 0.27, "grad_norm": 0.21683859825134277, "learning_rate": 0.00029411259985920486, "loss": 0.0482, "step": 419 }, { "epoch": 0.27, "grad_norm": 0.12036791443824768, "learning_rate": 0.0002940839664034444, "loss": 0.0444, "step": 420 }, { "epoch": 0.28, "grad_norm": 0.09479566663503647, "learning_rate": 0.00029405526488752775, "loss": 0.035, "step": 421 }, { "epoch": 0.28, "grad_norm": 0.14229558408260345, "learning_rate": 0.0002940264953250125, "loss": 0.0573, "step": 422 }, { "epoch": 0.28, "grad_norm": 0.22773970663547516, "learning_rate": 0.00029399765772948844, "loss": 0.061, "step": 423 }, { "epoch": 0.28, "grad_norm": 0.11387961357831955, "learning_rate": 0.0002939687521145774, "loss": 0.057, "step": 424 }, { "epoch": 0.28, "grad_norm": 0.1798745095729828, "learning_rate": 0.00029393977849393333, "loss": 0.0392, "step": 425 }, { "epoch": 0.28, "grad_norm": 0.07203508168458939, "learning_rate": 0.0002939107368812424, "loss": 0.0152, "step": 426 }, { "epoch": 0.28, "grad_norm": 0.04569177329540253, "learning_rate": 0.0002938816272902228, "loss": 0.0113, "step": 427 }, { "epoch": 0.28, "grad_norm": 0.0927419438958168, "learning_rate": 0.0002938524497346249, "loss": 0.0246, "step": 428 }, { "epoch": 0.28, "grad_norm": 0.16807597875595093, "learning_rate": 0.0002938232042282311, "loss": 0.0364, "step": 429 }, { "epoch": 0.28, "grad_norm": 0.12006795406341553, "learning_rate": 0.00029379389078485596, "loss": 0.0118, "step": 430 }, { "epoch": 0.28, "grad_norm": 0.0377679318189621, "learning_rate": 0.0002937645094183461, "loss": 0.0063, "step": 431 }, { "epoch": 0.28, "grad_norm": 0.27051666378974915, "learning_rate": 0.00029373506014258025, "loss": 0.0682, "step": 432 }, { "epoch": 0.28, "grad_norm": 0.228448748588562, "learning_rate": 0.0002937055429714692, "loss": 0.0733, "step": 433 }, { "epoch": 0.28, "grad_norm": 0.18427824974060059, "learning_rate": 0.00029367595791895577, "loss": 0.0338, "step": 434 }, { "epoch": 0.28, "grad_norm": 0.25813257694244385, "learning_rate": 0.00029364630499901503, "loss": 0.0323, "step": 435 }, { "epoch": 0.29, "grad_norm": 0.17406705021858215, "learning_rate": 0.0002936165842256538, "loss": 0.0398, "step": 436 }, { "epoch": 0.29, "grad_norm": 0.5199068188667297, "learning_rate": 0.0002935867956129112, "loss": 0.0486, "step": 437 }, { "epoch": 0.29, "grad_norm": 0.3251938223838806, "learning_rate": 0.0002935569391748583, "loss": 0.049, "step": 438 }, { "epoch": 0.29, "grad_norm": 0.057003892958164215, "learning_rate": 0.00029352701492559827, "loss": 0.0114, "step": 439 }, { "epoch": 0.29, "grad_norm": 0.15188859403133392, "learning_rate": 0.00029349702287926623, "loss": 0.0323, "step": 440 }, { "epoch": 0.29, "grad_norm": 0.17942048609256744, "learning_rate": 0.0002934669630500293, "loss": 0.0437, "step": 441 }, { "epoch": 0.29, "grad_norm": 0.06396406143903732, "learning_rate": 0.0002934368354520867, "loss": 0.0097, "step": 442 }, { "epoch": 0.29, "grad_norm": 0.1496248096227646, "learning_rate": 0.00029340664009966974, "loss": 0.0316, "step": 443 }, { "epoch": 0.29, "grad_norm": 0.0654374286532402, "learning_rate": 0.00029337637700704156, "loss": 0.0083, "step": 444 }, { "epoch": 0.29, "grad_norm": 0.04386695846915245, "learning_rate": 0.0002933460461884973, "loss": 0.0094, "step": 445 }, { "epoch": 0.29, "grad_norm": 0.14928901195526123, "learning_rate": 0.0002933156476583643, "loss": 0.0484, "step": 446 }, { "epoch": 0.29, "grad_norm": 0.12666364014148712, "learning_rate": 0.0002932851814310017, "loss": 0.0148, "step": 447 }, { "epoch": 0.29, "grad_norm": 0.023791933432221413, "learning_rate": 0.0002932546475208006, "loss": 0.003, "step": 448 }, { "epoch": 0.29, "grad_norm": 0.022256718948483467, "learning_rate": 0.0002932240459421842, "loss": 0.0044, "step": 449 }, { "epoch": 0.29, "grad_norm": 0.12194914370775223, "learning_rate": 0.0002931933767096076, "loss": 0.009, "step": 450 }, { "epoch": 0.3, "grad_norm": 0.29687178134918213, "learning_rate": 0.0002931626398375578, "loss": 0.0691, "step": 451 }, { "epoch": 0.3, "grad_norm": 0.24758018553256989, "learning_rate": 0.00029313183534055386, "loss": 0.0589, "step": 452 }, { "epoch": 0.3, "grad_norm": 0.10298270732164383, "learning_rate": 0.0002931009632331468, "loss": 0.0187, "step": 453 }, { "epoch": 0.3, "grad_norm": 0.1447860449552536, "learning_rate": 0.00029307002352991937, "loss": 0.0297, "step": 454 }, { "epoch": 0.3, "grad_norm": 0.2590334117412567, "learning_rate": 0.00029303901624548644, "loss": 0.0892, "step": 455 }, { "epoch": 0.3, "grad_norm": 0.07339983433485031, "learning_rate": 0.00029300794139449477, "loss": 0.0249, "step": 456 }, { "epoch": 0.3, "grad_norm": 0.16213186085224152, "learning_rate": 0.000292976798991623, "loss": 0.0493, "step": 457 }, { "epoch": 0.3, "grad_norm": 0.03418932110071182, "learning_rate": 0.0002929455890515818, "loss": 0.0066, "step": 458 }, { "epoch": 0.3, "grad_norm": 0.18771564960479736, "learning_rate": 0.0002929143115891134, "loss": 0.03, "step": 459 }, { "epoch": 0.3, "grad_norm": 0.13976161181926727, "learning_rate": 0.00029288296661899243, "loss": 0.0451, "step": 460 }, { "epoch": 0.3, "grad_norm": 0.07075387239456177, "learning_rate": 0.00029285155415602495, "loss": 0.0201, "step": 461 }, { "epoch": 0.3, "grad_norm": 0.1304980367422104, "learning_rate": 0.0002928200742150492, "loss": 0.0286, "step": 462 }, { "epoch": 0.3, "grad_norm": 0.06026493385434151, "learning_rate": 0.00029278852681093514, "loss": 0.0159, "step": 463 }, { "epoch": 0.3, "grad_norm": 0.08018484711647034, "learning_rate": 0.0002927569119585847, "loss": 0.0333, "step": 464 }, { "epoch": 0.3, "grad_norm": 0.21171532571315765, "learning_rate": 0.0002927252296729315, "loss": 0.034, "step": 465 }, { "epoch": 0.31, "grad_norm": 0.14055241644382477, "learning_rate": 0.0002926934799689413, "loss": 0.0504, "step": 466 }, { "epoch": 0.31, "grad_norm": 0.17434647679328918, "learning_rate": 0.0002926616628616113, "loss": 0.0519, "step": 467 }, { "epoch": 0.31, "grad_norm": 0.12710362672805786, "learning_rate": 0.00029262977836597105, "loss": 0.0154, "step": 468 }, { "epoch": 0.31, "grad_norm": 0.16046389937400818, "learning_rate": 0.0002925978264970814, "loss": 0.0398, "step": 469 }, { "epoch": 0.31, "grad_norm": 0.23207533359527588, "learning_rate": 0.00029256580727003543, "loss": 0.0562, "step": 470 }, { "epoch": 0.31, "grad_norm": 0.29609429836273193, "learning_rate": 0.0002925337206999579, "loss": 0.137, "step": 471 }, { "epoch": 0.31, "grad_norm": 0.15176476538181305, "learning_rate": 0.00029250156680200526, "loss": 0.025, "step": 472 }, { "epoch": 0.31, "grad_norm": 0.14394959807395935, "learning_rate": 0.00029246934559136597, "loss": 0.0519, "step": 473 }, { "epoch": 0.31, "grad_norm": 0.08391053229570389, "learning_rate": 0.00029243705708326015, "loss": 0.0184, "step": 474 }, { "epoch": 0.31, "grad_norm": 0.09384860098361969, "learning_rate": 0.00029240470129293975, "loss": 0.0229, "step": 475 }, { "epoch": 0.31, "grad_norm": 0.12083159387111664, "learning_rate": 0.00029237227823568845, "loss": 0.0219, "step": 476 }, { "epoch": 0.31, "grad_norm": 0.19567762315273285, "learning_rate": 0.0002923397879268218, "loss": 0.0728, "step": 477 }, { "epoch": 0.31, "grad_norm": 0.07342015206813812, "learning_rate": 0.0002923072303816871, "loss": 0.0412, "step": 478 }, { "epoch": 0.31, "grad_norm": 0.06717100739479065, "learning_rate": 0.00029227460561566333, "loss": 0.0309, "step": 479 }, { "epoch": 0.31, "grad_norm": 0.09244221448898315, "learning_rate": 0.0002922419136441613, "loss": 0.0508, "step": 480 }, { "epoch": 0.31, "grad_norm": 0.052494604140520096, "learning_rate": 0.0002922091544826235, "loss": 0.0319, "step": 481 }, { "epoch": 0.32, "grad_norm": 0.14286155998706818, "learning_rate": 0.00029217632814652417, "loss": 0.0654, "step": 482 }, { "epoch": 0.32, "grad_norm": 0.06442811340093613, "learning_rate": 0.00029214343465136945, "loss": 0.0132, "step": 483 }, { "epoch": 0.32, "grad_norm": 0.05420248210430145, "learning_rate": 0.0002921104740126969, "loss": 0.0115, "step": 484 }, { "epoch": 0.32, "grad_norm": 0.04951406642794609, "learning_rate": 0.0002920774462460761, "loss": 0.0086, "step": 485 }, { "epoch": 0.32, "grad_norm": 0.08321358263492584, "learning_rate": 0.00029204435136710803, "loss": 0.0445, "step": 486 }, { "epoch": 0.32, "grad_norm": 0.11665898561477661, "learning_rate": 0.0002920111893914257, "loss": 0.0262, "step": 487 }, { "epoch": 0.32, "grad_norm": 0.1829105019569397, "learning_rate": 0.00029197796033469356, "loss": 0.0308, "step": 488 }, { "epoch": 0.32, "grad_norm": 0.20940159261226654, "learning_rate": 0.00029194466421260786, "loss": 0.0299, "step": 489 }, { "epoch": 0.32, "grad_norm": 0.20697347819805145, "learning_rate": 0.0002919113010408965, "loss": 0.0405, "step": 490 }, { "epoch": 0.32, "grad_norm": 0.051994968205690384, "learning_rate": 0.000291877870835319, "loss": 0.01, "step": 491 }, { "epoch": 0.32, "grad_norm": 0.1463523805141449, "learning_rate": 0.00029184437361166676, "loss": 0.0555, "step": 492 }, { "epoch": 0.32, "grad_norm": 0.09110219031572342, "learning_rate": 0.00029181080938576255, "loss": 0.0371, "step": 493 }, { "epoch": 0.32, "grad_norm": 0.04076121374964714, "learning_rate": 0.00029177717817346097, "loss": 0.0065, "step": 494 }, { "epoch": 0.32, "grad_norm": 0.11555450409650803, "learning_rate": 0.0002917434799906482, "loss": 0.0115, "step": 495 }, { "epoch": 0.32, "grad_norm": 0.15579824149608612, "learning_rate": 0.0002917097148532421, "loss": 0.0332, "step": 496 }, { "epoch": 0.33, "grad_norm": 0.41938668489456177, "learning_rate": 0.000291675882777192, "loss": 0.0678, "step": 497 }, { "epoch": 0.33, "grad_norm": 0.16764874756336212, "learning_rate": 0.0002916419837784791, "loss": 0.0683, "step": 498 }, { "epoch": 0.33, "grad_norm": 0.1291145384311676, "learning_rate": 0.00029160801787311613, "loss": 0.0376, "step": 499 }, { "epoch": 0.33, "grad_norm": 0.06120933219790459, "learning_rate": 0.0002915739850771472, "loss": 0.0307, "step": 500 }, { "epoch": 0.33, "grad_norm": 0.09218423068523407, "learning_rate": 0.0002915398854066483, "loss": 0.0545, "step": 501 }, { "epoch": 0.33, "grad_norm": 0.12664952874183655, "learning_rate": 0.00029150571887772694, "loss": 0.0274, "step": 502 }, { "epoch": 0.33, "grad_norm": 0.0705379918217659, "learning_rate": 0.0002914714855065221, "loss": 0.0198, "step": 503 }, { "epoch": 0.33, "grad_norm": 0.03559693694114685, "learning_rate": 0.00029143718530920447, "loss": 0.0114, "step": 504 }, { "epoch": 0.33, "grad_norm": 0.051283448934555054, "learning_rate": 0.0002914028183019762, "loss": 0.0327, "step": 505 }, { "epoch": 0.33, "grad_norm": 0.12527117133140564, "learning_rate": 0.0002913683845010711, "loss": 0.0316, "step": 506 }, { "epoch": 0.33, "grad_norm": 0.0627032071352005, "learning_rate": 0.0002913338839227544, "loss": 0.0185, "step": 507 }, { "epoch": 0.33, "grad_norm": 0.07235468178987503, "learning_rate": 0.000291299316583323, "loss": 0.0605, "step": 508 }, { "epoch": 0.33, "grad_norm": 0.07697612792253494, "learning_rate": 0.0002912646824991053, "loss": 0.031, "step": 509 }, { "epoch": 0.33, "grad_norm": 0.08240342885255814, "learning_rate": 0.0002912299816864612, "loss": 0.0211, "step": 510 }, { "epoch": 0.33, "grad_norm": 0.07725581526756287, "learning_rate": 0.0002911952141617821, "loss": 0.0311, "step": 511 }, { "epoch": 0.34, "grad_norm": 0.14777988195419312, "learning_rate": 0.000291160379941491, "loss": 0.038, "step": 512 }, { "epoch": 0.34, "grad_norm": 0.11423151195049286, "learning_rate": 0.0002911254790420423, "loss": 0.0594, "step": 513 }, { "epoch": 0.34, "grad_norm": 0.07308260351419449, "learning_rate": 0.000291090511479922, "loss": 0.0416, "step": 514 }, { "epoch": 0.34, "grad_norm": 0.11171098798513412, "learning_rate": 0.00029105547727164747, "loss": 0.0509, "step": 515 }, { "epoch": 0.34, "grad_norm": 0.29647496342658997, "learning_rate": 0.00029102037643376764, "loss": 0.0421, "step": 516 }, { "epoch": 0.34, "grad_norm": 0.08812320232391357, "learning_rate": 0.00029098520898286303, "loss": 0.0559, "step": 517 }, { "epoch": 0.34, "grad_norm": 0.13493718206882477, "learning_rate": 0.00029094997493554525, "loss": 0.0257, "step": 518 }, { "epoch": 0.34, "grad_norm": 0.1292780339717865, "learning_rate": 0.0002909146743084579, "loss": 0.0699, "step": 519 }, { "epoch": 0.34, "grad_norm": 0.03736162185668945, "learning_rate": 0.0002908793071182755, "loss": 0.0113, "step": 520 }, { "epoch": 0.34, "grad_norm": 0.20628990232944489, "learning_rate": 0.00029084387338170435, "loss": 0.1039, "step": 521 }, { "epoch": 0.34, "grad_norm": 0.13702163100242615, "learning_rate": 0.0002908083731154821, "loss": 0.0715, "step": 522 }, { "epoch": 0.34, "grad_norm": 0.10376426577568054, "learning_rate": 0.0002907728063363779, "loss": 0.0566, "step": 523 }, { "epoch": 0.34, "grad_norm": 0.03796597197651863, "learning_rate": 0.00029073717306119206, "loss": 0.0131, "step": 524 }, { "epoch": 0.34, "grad_norm": 0.12588168680667877, "learning_rate": 0.0002907014733067566, "loss": 0.0754, "step": 525 }, { "epoch": 0.34, "grad_norm": 0.18614119291305542, "learning_rate": 0.00029066570708993474, "loss": 0.0839, "step": 526 }, { "epoch": 0.35, "grad_norm": 0.08624828606843948, "learning_rate": 0.0002906298744276212, "loss": 0.0519, "step": 527 }, { "epoch": 0.35, "grad_norm": 0.09907104074954987, "learning_rate": 0.00029059397533674216, "loss": 0.0554, "step": 528 }, { "epoch": 0.35, "grad_norm": 0.05135316029191017, "learning_rate": 0.00029055800983425494, "loss": 0.0374, "step": 529 }, { "epoch": 0.35, "grad_norm": 0.10954371839761734, "learning_rate": 0.00029052197793714844, "loss": 0.03, "step": 530 }, { "epoch": 0.35, "grad_norm": 0.13733310997486115, "learning_rate": 0.0002904858796624428, "loss": 0.0345, "step": 531 }, { "epoch": 0.35, "grad_norm": 0.09171781688928604, "learning_rate": 0.00029044971502718966, "loss": 0.0285, "step": 532 }, { "epoch": 0.35, "grad_norm": 0.08643066138029099, "learning_rate": 0.00029041348404847177, "loss": 0.0225, "step": 533 }, { "epoch": 0.35, "grad_norm": 0.3179713487625122, "learning_rate": 0.00029037718674340343, "loss": 0.1167, "step": 534 }, { "epoch": 0.35, "grad_norm": 0.09737833589315414, "learning_rate": 0.0002903408231291303, "loss": 0.047, "step": 535 }, { "epoch": 0.35, "grad_norm": 0.15587852895259857, "learning_rate": 0.00029030439322282904, "loss": 0.0406, "step": 536 }, { "epoch": 0.35, "grad_norm": 0.07560009509325027, "learning_rate": 0.0002902678970417081, "loss": 0.0387, "step": 537 }, { "epoch": 0.35, "grad_norm": 0.12732967734336853, "learning_rate": 0.00029023133460300677, "loss": 0.0434, "step": 538 }, { "epoch": 0.35, "grad_norm": 0.06021510064601898, "learning_rate": 0.00029019470592399593, "loss": 0.0149, "step": 539 }, { "epoch": 0.35, "grad_norm": 0.09609080851078033, "learning_rate": 0.0002901580110219777, "loss": 0.0203, "step": 540 }, { "epoch": 0.35, "grad_norm": 0.1442640721797943, "learning_rate": 0.0002901212499142854, "loss": 0.0345, "step": 541 }, { "epoch": 0.35, "grad_norm": 0.15236537158489227, "learning_rate": 0.0002900844226182837, "loss": 0.041, "step": 542 }, { "epoch": 0.36, "grad_norm": 0.14138057827949524, "learning_rate": 0.00029004752915136854, "loss": 0.0413, "step": 543 }, { "epoch": 0.36, "grad_norm": 0.16659876704216003, "learning_rate": 0.000290010569530967, "loss": 0.0202, "step": 544 }, { "epoch": 0.36, "grad_norm": 0.16970619559288025, "learning_rate": 0.0002899735437745376, "loss": 0.0373, "step": 545 }, { "epoch": 0.36, "grad_norm": 0.044596217572689056, "learning_rate": 0.00028993645189956987, "loss": 0.0202, "step": 546 }, { "epoch": 0.36, "grad_norm": 0.07182051986455917, "learning_rate": 0.00028989929392358484, "loss": 0.0137, "step": 547 }, { "epoch": 0.36, "grad_norm": 0.2593410313129425, "learning_rate": 0.0002898620698641345, "loss": 0.0373, "step": 548 }, { "epoch": 0.36, "grad_norm": 0.17339394986629486, "learning_rate": 0.0002898247797388023, "loss": 0.0217, "step": 549 }, { "epoch": 0.36, "grad_norm": 0.13247337937355042, "learning_rate": 0.00028978742356520256, "loss": 0.0621, "step": 550 }, { "epoch": 0.36, "grad_norm": 0.04582560807466507, "learning_rate": 0.00028975000136098123, "loss": 0.0051, "step": 551 }, { "epoch": 0.36, "grad_norm": 0.04409830644726753, "learning_rate": 0.0002897125131438151, "loss": 0.0042, "step": 552 }, { "epoch": 0.36, "grad_norm": 0.11188169568777084, "learning_rate": 0.0002896749589314123, "loss": 0.0307, "step": 553 }, { "epoch": 0.36, "grad_norm": 0.10103113949298859, "learning_rate": 0.00028963733874151225, "loss": 0.0132, "step": 554 }, { "epoch": 0.36, "grad_norm": 0.13099652528762817, "learning_rate": 0.0002895996525918852, "loss": 0.0348, "step": 555 }, { "epoch": 0.36, "grad_norm": 0.07826762646436691, "learning_rate": 0.0002895619005003328, "loss": 0.0232, "step": 556 }, { "epoch": 0.36, "grad_norm": 0.053435299545526505, "learning_rate": 0.00028952408248468785, "loss": 0.0113, "step": 557 }, { "epoch": 0.37, "grad_norm": 0.07408218830823898, "learning_rate": 0.00028948619856281423, "loss": 0.0099, "step": 558 }, { "epoch": 0.37, "grad_norm": 0.08491642028093338, "learning_rate": 0.00028944824875260693, "loss": 0.0122, "step": 559 }, { "epoch": 0.37, "grad_norm": 0.0294903963804245, "learning_rate": 0.00028941023307199214, "loss": 0.0044, "step": 560 }, { "epoch": 0.37, "grad_norm": 0.16142538189888, "learning_rate": 0.000289372151538927, "loss": 0.0721, "step": 561 }, { "epoch": 0.37, "grad_norm": 0.11368390917778015, "learning_rate": 0.0002893340041714, "loss": 0.0109, "step": 562 }, { "epoch": 0.37, "grad_norm": 0.1799473911523819, "learning_rate": 0.0002892957909874306, "loss": 0.0487, "step": 563 }, { "epoch": 0.37, "grad_norm": 0.1448475420475006, "learning_rate": 0.0002892575120050693, "loss": 0.0601, "step": 564 }, { "epoch": 0.37, "grad_norm": 0.07079991698265076, "learning_rate": 0.00028921916724239773, "loss": 0.0089, "step": 565 }, { "epoch": 0.37, "grad_norm": 0.13462460041046143, "learning_rate": 0.0002891807567175287, "loss": 0.0361, "step": 566 }, { "epoch": 0.37, "grad_norm": 0.08166678249835968, "learning_rate": 0.00028914228044860584, "loss": 0.0412, "step": 567 }, { "epoch": 0.37, "grad_norm": 0.09470119327306747, "learning_rate": 0.00028910373845380405, "loss": 0.036, "step": 568 }, { "epoch": 0.37, "grad_norm": 0.0957297682762146, "learning_rate": 0.00028906513075132917, "loss": 0.0302, "step": 569 }, { "epoch": 0.37, "grad_norm": 0.17004123330116272, "learning_rate": 0.00028902645735941814, "loss": 0.0559, "step": 570 }, { "epoch": 0.37, "grad_norm": 0.10910087823867798, "learning_rate": 0.0002889877182963389, "loss": 0.0765, "step": 571 }, { "epoch": 0.37, "grad_norm": 0.1027827113866806, "learning_rate": 0.0002889489135803904, "loss": 0.0261, "step": 572 }, { "epoch": 0.38, "grad_norm": 0.1182394027709961, "learning_rate": 0.00028891004322990254, "loss": 0.0413, "step": 573 }, { "epoch": 0.38, "grad_norm": 0.08422794938087463, "learning_rate": 0.00028887110726323644, "loss": 0.048, "step": 574 }, { "epoch": 0.38, "grad_norm": 0.10699556767940521, "learning_rate": 0.00028883210569878397, "loss": 0.0193, "step": 575 }, { "epoch": 0.38, "grad_norm": 0.06325127184391022, "learning_rate": 0.00028879303855496805, "loss": 0.0248, "step": 576 }, { "epoch": 0.38, "grad_norm": 0.10081582516431808, "learning_rate": 0.00028875390585024274, "loss": 0.0211, "step": 577 }, { "epoch": 0.38, "grad_norm": 0.062216054648160934, "learning_rate": 0.00028871470760309285, "loss": 0.0185, "step": 578 }, { "epoch": 0.38, "grad_norm": 0.086198590695858, "learning_rate": 0.00028867544383203423, "loss": 0.0544, "step": 579 }, { "epoch": 0.38, "grad_norm": 0.11464603990316391, "learning_rate": 0.00028863611455561374, "loss": 0.0482, "step": 580 }, { "epoch": 0.38, "grad_norm": 0.1089998185634613, "learning_rate": 0.0002885967197924092, "loss": 0.0496, "step": 581 }, { "epoch": 0.38, "grad_norm": 0.1297656148672104, "learning_rate": 0.00028855725956102913, "loss": 0.0286, "step": 582 }, { "epoch": 0.38, "grad_norm": 0.12966851890087128, "learning_rate": 0.0002885177338801133, "loss": 0.0271, "step": 583 }, { "epoch": 0.38, "grad_norm": 0.1413564682006836, "learning_rate": 0.00028847814276833215, "loss": 0.0334, "step": 584 }, { "epoch": 0.38, "grad_norm": 0.08366623520851135, "learning_rate": 0.0002884384862443871, "loss": 0.0252, "step": 585 }, { "epoch": 0.38, "grad_norm": 0.11143944412469864, "learning_rate": 0.0002883987643270106, "loss": 0.0347, "step": 586 }, { "epoch": 0.38, "grad_norm": 0.018316002562642097, "learning_rate": 0.0002883589770349658, "loss": 0.0041, "step": 587 }, { "epoch": 0.38, "grad_norm": 0.02275553159415722, "learning_rate": 0.0002883191243870467, "loss": 0.0049, "step": 588 }, { "epoch": 0.39, "grad_norm": 0.14462235569953918, "learning_rate": 0.0002882792064020785, "loss": 0.0745, "step": 589 }, { "epoch": 0.39, "grad_norm": 0.10231613367795944, "learning_rate": 0.0002882392230989169, "loss": 0.0211, "step": 590 }, { "epoch": 0.39, "grad_norm": 0.013464580290019512, "learning_rate": 0.00028819917449644865, "loss": 0.0027, "step": 591 }, { "epoch": 0.39, "grad_norm": 0.1707848161458969, "learning_rate": 0.0002881590606135912, "loss": 0.0292, "step": 592 }, { "epoch": 0.39, "grad_norm": 0.021210182458162308, "learning_rate": 0.00028811888146929303, "loss": 0.0034, "step": 593 }, { "epoch": 0.39, "grad_norm": 0.09697694331407547, "learning_rate": 0.00028807863708253326, "loss": 0.0134, "step": 594 }, { "epoch": 0.39, "grad_norm": 0.014497664757072926, "learning_rate": 0.000288038327472322, "loss": 0.0033, "step": 595 }, { "epoch": 0.39, "grad_norm": 0.25384795665740967, "learning_rate": 0.00028799795265770003, "loss": 0.0258, "step": 596 }, { "epoch": 0.39, "grad_norm": 0.0065186647698283195, "learning_rate": 0.00028795751265773894, "loss": 0.0012, "step": 597 }, { "epoch": 0.39, "grad_norm": 0.03637157753109932, "learning_rate": 0.00028791700749154124, "loss": 0.004, "step": 598 }, { "epoch": 0.39, "grad_norm": 0.039990831166505814, "learning_rate": 0.00028787643717824007, "loss": 0.0067, "step": 599 }, { "epoch": 0.39, "grad_norm": 0.18821458518505096, "learning_rate": 0.0002878358017369994, "loss": 0.0233, "step": 600 }, { "epoch": 0.39, "grad_norm": 0.12891234457492828, "learning_rate": 0.00028779510118701404, "loss": 0.0121, "step": 601 }, { "epoch": 0.39, "grad_norm": 0.1731066256761551, "learning_rate": 0.0002877543355475094, "loss": 0.0535, "step": 602 }, { "epoch": 0.39, "grad_norm": 0.5192031264305115, "learning_rate": 0.0002877135048377418, "loss": 0.1073, "step": 603 }, { "epoch": 0.4, "grad_norm": 0.13350637257099152, "learning_rate": 0.0002876726090769982, "loss": 0.0157, "step": 604 }, { "epoch": 0.4, "grad_norm": 0.12136203050613403, "learning_rate": 0.0002876316482845963, "loss": 0.0132, "step": 605 }, { "epoch": 0.4, "grad_norm": 0.5036077499389648, "learning_rate": 0.0002875906224798844, "loss": 0.1366, "step": 606 }, { "epoch": 0.4, "grad_norm": 0.22896146774291992, "learning_rate": 0.0002875495316822419, "loss": 0.08, "step": 607 }, { "epoch": 0.4, "grad_norm": 0.15327180922031403, "learning_rate": 0.0002875083759110785, "loss": 0.0322, "step": 608 }, { "epoch": 0.4, "grad_norm": 0.0520663745701313, "learning_rate": 0.0002874671551858346, "loss": 0.0202, "step": 609 }, { "epoch": 0.4, "grad_norm": 0.08731318265199661, "learning_rate": 0.00028742586952598155, "loss": 0.0414, "step": 610 }, { "epoch": 0.4, "grad_norm": 0.11570514738559723, "learning_rate": 0.0002873845189510213, "loss": 0.0625, "step": 611 }, { "epoch": 0.4, "grad_norm": 0.1604083925485611, "learning_rate": 0.0002873431034804862, "loss": 0.0644, "step": 612 }, { "epoch": 0.4, "grad_norm": 0.06147552654147148, "learning_rate": 0.0002873016231339396, "loss": 0.0168, "step": 613 }, { "epoch": 0.4, "grad_norm": 0.12419867515563965, "learning_rate": 0.00028726007793097527, "loss": 0.0438, "step": 614 }, { "epoch": 0.4, "grad_norm": 0.06133590638637543, "learning_rate": 0.0002872184678912177, "loss": 0.024, "step": 615 }, { "epoch": 0.4, "grad_norm": 0.10245617479085922, "learning_rate": 0.00028717679303432207, "loss": 0.0468, "step": 616 }, { "epoch": 0.4, "grad_norm": 0.11957762390375137, "learning_rate": 0.000287135053379974, "loss": 0.0442, "step": 617 }, { "epoch": 0.4, "grad_norm": 0.12896914780139923, "learning_rate": 0.0002870932489478899, "loss": 0.019, "step": 618 }, { "epoch": 0.41, "grad_norm": 0.1816866546869278, "learning_rate": 0.0002870513797578167, "loss": 0.0465, "step": 619 }, { "epoch": 0.41, "grad_norm": 0.3061673045158386, "learning_rate": 0.00028700944582953184, "loss": 0.0356, "step": 620 }, { "epoch": 0.41, "grad_norm": 0.12940478324890137, "learning_rate": 0.0002869674471828435, "loss": 0.0447, "step": 621 }, { "epoch": 0.41, "grad_norm": 0.2569711208343506, "learning_rate": 0.0002869253838375903, "loss": 0.0383, "step": 622 }, { "epoch": 0.41, "grad_norm": 0.17063623666763306, "learning_rate": 0.0002868832558136415, "loss": 0.0394, "step": 623 }, { "epoch": 0.41, "grad_norm": 0.16775226593017578, "learning_rate": 0.00028684106313089686, "loss": 0.0314, "step": 624 }, { "epoch": 0.41, "grad_norm": 0.12676480412483215, "learning_rate": 0.00028679880580928676, "loss": 0.0397, "step": 625 }, { "epoch": 0.41, "grad_norm": 0.19791187345981598, "learning_rate": 0.0002867564838687721, "loss": 0.0668, "step": 626 }, { "epoch": 0.41, "grad_norm": 0.18982940912246704, "learning_rate": 0.0002867140973293441, "loss": 0.0472, "step": 627 }, { "epoch": 0.41, "grad_norm": 0.06308908015489578, "learning_rate": 0.00028667164621102475, "loss": 0.0166, "step": 628 }, { "epoch": 0.41, "grad_norm": 0.09570673853158951, "learning_rate": 0.0002866291305338665, "loss": 0.0156, "step": 629 }, { "epoch": 0.41, "grad_norm": 0.12950573861598969, "learning_rate": 0.00028658655031795215, "loss": 0.0381, "step": 630 }, { "epoch": 0.41, "grad_norm": 0.30905017256736755, "learning_rate": 0.00028654390558339516, "loss": 0.0386, "step": 631 }, { "epoch": 0.41, "grad_norm": 0.2680380940437317, "learning_rate": 0.0002865011963503394, "loss": 0.0307, "step": 632 }, { "epoch": 0.41, "grad_norm": 0.15153923630714417, "learning_rate": 0.00028645842263895916, "loss": 0.0448, "step": 633 }, { "epoch": 0.42, "grad_norm": 0.06900045275688171, "learning_rate": 0.0002864155844694592, "loss": 0.0134, "step": 634 }, { "epoch": 0.42, "grad_norm": 0.39054739475250244, "learning_rate": 0.00028637268186207474, "loss": 0.0562, "step": 635 }, { "epoch": 0.42, "grad_norm": 0.06766320765018463, "learning_rate": 0.0002863297148370716, "loss": 0.0135, "step": 636 }, { "epoch": 0.42, "grad_norm": 0.12230436503887177, "learning_rate": 0.0002862866834147457, "loss": 0.0189, "step": 637 }, { "epoch": 0.42, "grad_norm": 0.10021094232797623, "learning_rate": 0.00028624358761542365, "loss": 0.021, "step": 638 }, { "epoch": 0.42, "grad_norm": 0.1645062267780304, "learning_rate": 0.0002862004274594623, "loss": 0.0284, "step": 639 }, { "epoch": 0.42, "grad_norm": 0.3108697831630707, "learning_rate": 0.00028615720296724906, "loss": 0.0792, "step": 640 }, { "epoch": 0.42, "grad_norm": 0.12834666669368744, "learning_rate": 0.0002861139141592017, "loss": 0.0162, "step": 641 }, { "epoch": 0.42, "grad_norm": 0.11455690860748291, "learning_rate": 0.00028607056105576806, "loss": 0.0374, "step": 642 }, { "epoch": 0.42, "grad_norm": 0.14810198545455933, "learning_rate": 0.0002860271436774269, "loss": 0.0132, "step": 643 }, { "epoch": 0.42, "grad_norm": 0.1764562875032425, "learning_rate": 0.00028598366204468694, "loss": 0.0641, "step": 644 }, { "epoch": 0.42, "grad_norm": 0.10819990932941437, "learning_rate": 0.0002859401161780873, "loss": 0.036, "step": 645 }, { "epoch": 0.42, "grad_norm": 0.10301560163497925, "learning_rate": 0.00028589650609819764, "loss": 0.0272, "step": 646 }, { "epoch": 0.42, "grad_norm": 0.13949047029018402, "learning_rate": 0.00028585283182561773, "loss": 0.0396, "step": 647 }, { "epoch": 0.42, "grad_norm": 0.20076854526996613, "learning_rate": 0.0002858090933809777, "loss": 0.0304, "step": 648 }, { "epoch": 0.42, "grad_norm": 0.12382891029119492, "learning_rate": 0.0002857652907849381, "loss": 0.0317, "step": 649 }, { "epoch": 0.43, "grad_norm": 0.03410351276397705, "learning_rate": 0.0002857214240581897, "loss": 0.0075, "step": 650 }, { "epoch": 0.43, "grad_norm": 0.10016089677810669, "learning_rate": 0.00028567749322145367, "loss": 0.0179, "step": 651 }, { "epoch": 0.43, "grad_norm": 0.24712401628494263, "learning_rate": 0.00028563349829548125, "loss": 0.0857, "step": 652 }, { "epoch": 0.43, "grad_norm": 0.10354748368263245, "learning_rate": 0.00028558943930105413, "loss": 0.0276, "step": 653 }, { "epoch": 0.43, "grad_norm": 0.13952110707759857, "learning_rate": 0.00028554531625898434, "loss": 0.0352, "step": 654 }, { "epoch": 0.43, "grad_norm": 0.25892096757888794, "learning_rate": 0.0002855011291901138, "loss": 0.0635, "step": 655 }, { "epoch": 0.43, "grad_norm": 0.1324494630098343, "learning_rate": 0.0002854568781153151, "loss": 0.0404, "step": 656 }, { "epoch": 0.43, "grad_norm": 0.2835068702697754, "learning_rate": 0.0002854125630554908, "loss": 0.0913, "step": 657 }, { "epoch": 0.43, "grad_norm": 0.06329616159200668, "learning_rate": 0.00028536818403157387, "loss": 0.0146, "step": 658 }, { "epoch": 0.43, "grad_norm": 0.07758588343858719, "learning_rate": 0.0002853237410645272, "loss": 0.022, "step": 659 }, { "epoch": 0.43, "grad_norm": 0.0839746966958046, "learning_rate": 0.00028527923417534425, "loss": 0.0175, "step": 660 }, { "epoch": 0.43, "grad_norm": 0.06847698986530304, "learning_rate": 0.0002852346633850484, "loss": 0.0257, "step": 661 }, { "epoch": 0.43, "grad_norm": 0.05117741599678993, "learning_rate": 0.0002851900287146933, "loss": 0.0136, "step": 662 }, { "epoch": 0.43, "grad_norm": 0.12874063849449158, "learning_rate": 0.0002851453301853628, "loss": 0.0525, "step": 663 }, { "epoch": 0.43, "grad_norm": 0.1822301298379898, "learning_rate": 0.000285100567818171, "loss": 0.0889, "step": 664 }, { "epoch": 0.44, "grad_norm": 0.11295532435178757, "learning_rate": 0.0002850557416342619, "loss": 0.0242, "step": 665 }, { "epoch": 0.44, "grad_norm": 0.13836829364299774, "learning_rate": 0.0002850108516548099, "loss": 0.0441, "step": 666 }, { "epoch": 0.44, "grad_norm": 0.10722105205059052, "learning_rate": 0.0002849658979010194, "loss": 0.0401, "step": 667 }, { "epoch": 0.44, "grad_norm": 0.1335124373435974, "learning_rate": 0.000284920880394125, "loss": 0.0355, "step": 668 }, { "epoch": 0.44, "grad_norm": 0.0793779119849205, "learning_rate": 0.00028487579915539136, "loss": 0.0653, "step": 669 }, { "epoch": 0.44, "grad_norm": 0.06469617038965225, "learning_rate": 0.00028483065420611313, "loss": 0.0212, "step": 670 }, { "epoch": 0.44, "grad_norm": 0.11224417388439178, "learning_rate": 0.0002847854455676154, "loss": 0.0689, "step": 671 }, { "epoch": 0.44, "grad_norm": 0.08650530874729156, "learning_rate": 0.00028474017326125296, "loss": 0.0301, "step": 672 }, { "epoch": 0.44, "grad_norm": 0.0688636302947998, "learning_rate": 0.0002846948373084109, "loss": 0.0161, "step": 673 }, { "epoch": 0.44, "grad_norm": 0.13195598125457764, "learning_rate": 0.0002846494377305043, "loss": 0.0529, "step": 674 }, { "epoch": 0.44, "grad_norm": 0.1887226700782776, "learning_rate": 0.0002846039745489783, "loss": 0.0615, "step": 675 }, { "epoch": 0.44, "grad_norm": 0.06736018508672714, "learning_rate": 0.0002845584477853082, "loss": 0.0246, "step": 676 }, { "epoch": 0.44, "grad_norm": 0.1488025039434433, "learning_rate": 0.0002845128574609992, "loss": 0.0361, "step": 677 }, { "epoch": 0.44, "grad_norm": 0.09811149537563324, "learning_rate": 0.0002844672035975864, "loss": 0.0228, "step": 678 }, { "epoch": 0.44, "grad_norm": 0.06320784986019135, "learning_rate": 0.0002844214862166352, "loss": 0.0182, "step": 679 }, { "epoch": 0.45, "grad_norm": 0.0695585086941719, "learning_rate": 0.00028437570533974084, "loss": 0.0393, "step": 680 }, { "epoch": 0.45, "grad_norm": 0.08886481821537018, "learning_rate": 0.00028432986098852857, "loss": 0.0293, "step": 681 }, { "epoch": 0.45, "grad_norm": 0.09019115567207336, "learning_rate": 0.0002842839531846537, "loss": 0.0436, "step": 682 }, { "epoch": 0.45, "grad_norm": 0.1718403697013855, "learning_rate": 0.0002842379819498013, "loss": 0.0512, "step": 683 }, { "epoch": 0.45, "grad_norm": 0.1692350208759308, "learning_rate": 0.0002841919473056867, "loss": 0.0637, "step": 684 }, { "epoch": 0.45, "grad_norm": 0.15840108692646027, "learning_rate": 0.00028414584927405497, "loss": 0.0224, "step": 685 }, { "epoch": 0.45, "grad_norm": 0.12710994482040405, "learning_rate": 0.0002840996878766812, "loss": 0.042, "step": 686 }, { "epoch": 0.45, "grad_norm": 0.07157866656780243, "learning_rate": 0.0002840534631353704, "loss": 0.0279, "step": 687 }, { "epoch": 0.45, "grad_norm": 0.046079106628894806, "learning_rate": 0.0002840071750719575, "loss": 0.0093, "step": 688 }, { "epoch": 0.45, "grad_norm": 0.09579090774059296, "learning_rate": 0.00028396082370830733, "loss": 0.027, "step": 689 }, { "epoch": 0.45, "grad_norm": 0.201382577419281, "learning_rate": 0.0002839144090663146, "loss": 0.072, "step": 690 }, { "epoch": 0.45, "grad_norm": 0.1724497377872467, "learning_rate": 0.000283867931167904, "loss": 0.052, "step": 691 }, { "epoch": 0.45, "grad_norm": 0.06867794692516327, "learning_rate": 0.00028382139003503006, "loss": 0.0168, "step": 692 }, { "epoch": 0.45, "grad_norm": 0.06690815091133118, "learning_rate": 0.00028377478568967704, "loss": 0.0137, "step": 693 }, { "epoch": 0.45, "grad_norm": 0.17312408983707428, "learning_rate": 0.0002837281181538593, "loss": 0.0226, "step": 694 }, { "epoch": 0.45, "grad_norm": 0.08470922708511353, "learning_rate": 0.0002836813874496208, "loss": 0.0126, "step": 695 }, { "epoch": 0.46, "grad_norm": 0.0778060331940651, "learning_rate": 0.00028363459359903565, "loss": 0.0235, "step": 696 }, { "epoch": 0.46, "grad_norm": 0.05282146856188774, "learning_rate": 0.00028358773662420745, "loss": 0.0086, "step": 697 }, { "epoch": 0.46, "grad_norm": 0.09441710263490677, "learning_rate": 0.00028354081654726984, "loss": 0.049, "step": 698 }, { "epoch": 0.46, "grad_norm": 0.050999149680137634, "learning_rate": 0.00028349383339038617, "loss": 0.0086, "step": 699 }, { "epoch": 0.46, "grad_norm": 0.10146886110305786, "learning_rate": 0.0002834467871757497, "loss": 0.0246, "step": 700 }, { "epoch": 0.46, "grad_norm": 0.03706188499927521, "learning_rate": 0.0002833996779255833, "loss": 0.0056, "step": 701 }, { "epoch": 0.46, "grad_norm": 0.10421967506408691, "learning_rate": 0.0002833525056621397, "loss": 0.0241, "step": 702 }, { "epoch": 0.46, "grad_norm": 0.18308381736278534, "learning_rate": 0.00028330527040770146, "loss": 0.042, "step": 703 }, { "epoch": 0.46, "grad_norm": 0.2132684737443924, "learning_rate": 0.0002832579721845809, "loss": 0.0226, "step": 704 }, { "epoch": 0.46, "grad_norm": 0.5411369204521179, "learning_rate": 0.00028321061101511984, "loss": 0.0702, "step": 705 }, { "epoch": 0.46, "grad_norm": 0.3440389335155487, "learning_rate": 0.0002831631869216902, "loss": 0.0225, "step": 706 }, { "epoch": 0.46, "grad_norm": 0.16572096943855286, "learning_rate": 0.00028311569992669333, "loss": 0.0352, "step": 707 }, { "epoch": 0.46, "grad_norm": 0.15913799405097961, "learning_rate": 0.0002830681500525604, "loss": 0.0266, "step": 708 }, { "epoch": 0.46, "grad_norm": 0.1818440854549408, "learning_rate": 0.0002830205373217524, "loss": 0.0688, "step": 709 }, { "epoch": 0.46, "grad_norm": 0.17044726014137268, "learning_rate": 0.0002829728617567598, "loss": 0.0515, "step": 710 }, { "epoch": 0.47, "grad_norm": 0.15003225207328796, "learning_rate": 0.0002829251233801028, "loss": 0.0757, "step": 711 }, { "epoch": 0.47, "grad_norm": 0.11599011719226837, "learning_rate": 0.00028287732221433145, "loss": 0.0402, "step": 712 }, { "epoch": 0.47, "grad_norm": 0.11937666684389114, "learning_rate": 0.0002828294582820252, "loss": 0.0391, "step": 713 }, { "epoch": 0.47, "grad_norm": 0.2134632170200348, "learning_rate": 0.0002827815316057933, "loss": 0.0748, "step": 714 }, { "epoch": 0.47, "grad_norm": 0.29407811164855957, "learning_rate": 0.00028273354220827477, "loss": 0.0679, "step": 715 }, { "epoch": 0.47, "grad_norm": 0.059820059686899185, "learning_rate": 0.00028268549011213785, "loss": 0.0372, "step": 716 }, { "epoch": 0.47, "grad_norm": 0.06392081081867218, "learning_rate": 0.0002826373753400808, "loss": 0.0212, "step": 717 }, { "epoch": 0.47, "grad_norm": 0.15369698405265808, "learning_rate": 0.0002825891979148313, "loss": 0.0258, "step": 718 }, { "epoch": 0.47, "grad_norm": 0.11246192455291748, "learning_rate": 0.00028254095785914667, "loss": 0.0357, "step": 719 }, { "epoch": 0.47, "grad_norm": 0.09747990220785141, "learning_rate": 0.0002824926551958138, "loss": 0.0393, "step": 720 }, { "epoch": 0.47, "grad_norm": 0.06651636213064194, "learning_rate": 0.0002824442899476491, "loss": 0.0222, "step": 721 }, { "epoch": 0.47, "grad_norm": 0.2205415815114975, "learning_rate": 0.00028239586213749866, "loss": 0.0391, "step": 722 }, { "epoch": 0.47, "grad_norm": 0.15302903950214386, "learning_rate": 0.000282347371788238, "loss": 0.019, "step": 723 }, { "epoch": 0.47, "grad_norm": 0.0960957333445549, "learning_rate": 0.00028229881892277237, "loss": 0.0142, "step": 724 }, { "epoch": 0.47, "grad_norm": 0.13723668456077576, "learning_rate": 0.00028225020356403624, "loss": 0.0595, "step": 725 }, { "epoch": 0.48, "grad_norm": 0.11208293586969376, "learning_rate": 0.00028220152573499394, "loss": 0.0283, "step": 726 }, { "epoch": 0.48, "grad_norm": 0.11818848550319672, "learning_rate": 0.000282152785458639, "loss": 0.0337, "step": 727 }, { "epoch": 0.48, "grad_norm": 0.1239568442106247, "learning_rate": 0.0002821039827579948, "loss": 0.0406, "step": 728 }, { "epoch": 0.48, "grad_norm": 0.2423425167798996, "learning_rate": 0.0002820551176561138, "loss": 0.0502, "step": 729 }, { "epoch": 0.48, "grad_norm": 0.11348683387041092, "learning_rate": 0.0002820061901760783, "loss": 0.0356, "step": 730 }, { "epoch": 0.48, "grad_norm": 0.1281062662601471, "learning_rate": 0.00028195720034099976, "loss": 0.0172, "step": 731 }, { "epoch": 0.48, "grad_norm": 0.017542295157909393, "learning_rate": 0.0002819081481740193, "loss": 0.0035, "step": 732 }, { "epoch": 0.48, "grad_norm": 0.13793808221817017, "learning_rate": 0.00028185903369830757, "loss": 0.0142, "step": 733 }, { "epoch": 0.48, "grad_norm": 0.0807032361626625, "learning_rate": 0.0002818098569370643, "loss": 0.0093, "step": 734 }, { "epoch": 0.48, "grad_norm": 0.13335250318050385, "learning_rate": 0.0002817606179135189, "loss": 0.0249, "step": 735 }, { "epoch": 0.48, "grad_norm": 0.18836469948291779, "learning_rate": 0.0002817113166509302, "loss": 0.0702, "step": 736 }, { "epoch": 0.48, "grad_norm": 0.08416979014873505, "learning_rate": 0.0002816619531725863, "loss": 0.0115, "step": 737 }, { "epoch": 0.48, "grad_norm": 0.054892800748348236, "learning_rate": 0.00028161252750180486, "loss": 0.0051, "step": 738 }, { "epoch": 0.48, "grad_norm": 0.11004303395748138, "learning_rate": 0.0002815630396619327, "loss": 0.025, "step": 739 }, { "epoch": 0.48, "grad_norm": 0.08595911413431168, "learning_rate": 0.00028151348967634613, "loss": 0.0247, "step": 740 }, { "epoch": 0.49, "grad_norm": 0.19074852764606476, "learning_rate": 0.0002814638775684509, "loss": 0.0548, "step": 741 }, { "epoch": 0.49, "grad_norm": 0.15080730617046356, "learning_rate": 0.0002814142033616819, "loss": 0.0865, "step": 742 }, { "epoch": 0.49, "grad_norm": 0.1833682358264923, "learning_rate": 0.00028136446707950353, "loss": 0.0697, "step": 743 }, { "epoch": 0.49, "grad_norm": 0.049776408821344376, "learning_rate": 0.00028131466874540943, "loss": 0.0078, "step": 744 }, { "epoch": 0.49, "grad_norm": 0.28232109546661377, "learning_rate": 0.00028126480838292254, "loss": 0.0283, "step": 745 }, { "epoch": 0.49, "grad_norm": 0.11048179864883423, "learning_rate": 0.0002812148860155952, "loss": 0.0333, "step": 746 }, { "epoch": 0.49, "grad_norm": 0.3188829720020294, "learning_rate": 0.0002811649016670089, "loss": 0.0524, "step": 747 }, { "epoch": 0.49, "grad_norm": 0.17602626979351044, "learning_rate": 0.0002811148553607745, "loss": 0.0559, "step": 748 }, { "epoch": 0.49, "grad_norm": 0.1038702130317688, "learning_rate": 0.0002810647471205321, "loss": 0.0409, "step": 749 }, { "epoch": 0.49, "grad_norm": 0.13291211426258087, "learning_rate": 0.00028101457696995104, "loss": 0.0343, "step": 750 }, { "epoch": 0.49, "grad_norm": 0.3924441933631897, "learning_rate": 0.0002809643449327299, "loss": 0.1051, "step": 751 }, { "epoch": 0.49, "grad_norm": 0.4292432367801666, "learning_rate": 0.0002809140510325966, "loss": 0.0985, "step": 752 }, { "epoch": 0.49, "grad_norm": 0.2376273274421692, "learning_rate": 0.0002808636952933081, "loss": 0.0372, "step": 753 }, { "epoch": 0.49, "grad_norm": 0.18019931018352509, "learning_rate": 0.0002808132777386507, "loss": 0.0449, "step": 754 }, { "epoch": 0.49, "grad_norm": 0.09754154831171036, "learning_rate": 0.0002807627983924399, "loss": 0.0399, "step": 755 }, { "epoch": 0.49, "grad_norm": 0.15885044634342194, "learning_rate": 0.0002807122572785203, "loss": 0.0231, "step": 756 }, { "epoch": 0.5, "grad_norm": 0.15031558275222778, "learning_rate": 0.0002806616544207657, "loss": 0.0562, "step": 757 }, { "epoch": 0.5, "grad_norm": 0.03422224149107933, "learning_rate": 0.00028061098984307923, "loss": 0.0055, "step": 758 }, { "epoch": 0.5, "grad_norm": 0.11442912369966507, "learning_rate": 0.0002805602635693929, "loss": 0.0318, "step": 759 }, { "epoch": 0.5, "grad_norm": 0.10043658316135406, "learning_rate": 0.0002805094756236681, "loss": 0.0293, "step": 760 }, { "epoch": 0.5, "grad_norm": 0.06093163788318634, "learning_rate": 0.00028045862602989516, "loss": 0.0062, "step": 761 }, { "epoch": 0.5, "grad_norm": 0.17897385358810425, "learning_rate": 0.0002804077148120937, "loss": 0.0387, "step": 762 }, { "epoch": 0.5, "grad_norm": 0.14174053072929382, "learning_rate": 0.0002803567419943124, "loss": 0.0742, "step": 763 }, { "epoch": 0.5, "grad_norm": 0.12193652242422104, "learning_rate": 0.0002803057076006289, "loss": 0.0237, "step": 764 }, { "epoch": 0.5, "eval_loss": 0.03274312615394592, "eval_runtime": 39.9218, "eval_samples_per_second": 32.238, "eval_steps_per_second": 8.066, "step": 764 }, { "epoch": 0.5, "grad_norm": 0.17127107083797455, "learning_rate": 0.00028025461165515016, "loss": 0.0269, "step": 765 }, { "epoch": 0.5, "grad_norm": 0.03232162818312645, "learning_rate": 0.00028020345418201196, "loss": 0.0053, "step": 766 }, { "epoch": 0.5, "grad_norm": 0.29065465927124023, "learning_rate": 0.0002801522352053794, "loss": 0.0621, "step": 767 }, { "epoch": 0.5, "grad_norm": 0.14432761073112488, "learning_rate": 0.00028010095474944647, "loss": 0.0556, "step": 768 }, { "epoch": 0.5, "grad_norm": 0.11056532710790634, "learning_rate": 0.00028004961283843624, "loss": 0.0111, "step": 769 }, { "epoch": 0.5, "grad_norm": 0.18930545449256897, "learning_rate": 0.0002799982094966007, "loss": 0.0548, "step": 770 }, { "epoch": 0.5, "grad_norm": 0.14607198536396027, "learning_rate": 0.00027994674474822115, "loss": 0.0296, "step": 771 }, { "epoch": 0.51, "grad_norm": 0.1470440924167633, "learning_rate": 0.0002798952186176076, "loss": 0.0366, "step": 772 }, { "epoch": 0.51, "grad_norm": 0.0858917385339737, "learning_rate": 0.0002798436311290992, "loss": 0.0149, "step": 773 }, { "epoch": 0.51, "grad_norm": 0.23497411608695984, "learning_rate": 0.000279791982307064, "loss": 0.031, "step": 774 }, { "epoch": 0.51, "grad_norm": 0.05533986538648605, "learning_rate": 0.00027974027217589917, "loss": 0.0149, "step": 775 }, { "epoch": 0.51, "grad_norm": 0.18971019983291626, "learning_rate": 0.00027968850076003066, "loss": 0.0339, "step": 776 }, { "epoch": 0.51, "grad_norm": 0.18975158035755157, "learning_rate": 0.00027963666808391343, "loss": 0.0192, "step": 777 }, { "epoch": 0.51, "grad_norm": 0.04416336864233017, "learning_rate": 0.0002795847741720315, "loss": 0.0073, "step": 778 }, { "epoch": 0.51, "grad_norm": 0.20576409995555878, "learning_rate": 0.00027953281904889764, "loss": 0.0418, "step": 779 }, { "epoch": 0.51, "grad_norm": 0.1331322193145752, "learning_rate": 0.0002794808027390536, "loss": 0.011, "step": 780 }, { "epoch": 0.51, "grad_norm": 0.3062935769557953, "learning_rate": 0.0002794287252670701, "loss": 0.07, "step": 781 }, { "epoch": 0.51, "grad_norm": 0.1513393074274063, "learning_rate": 0.0002793765866575466, "loss": 0.0384, "step": 782 }, { "epoch": 0.51, "grad_norm": 0.08953273296356201, "learning_rate": 0.0002793243869351116, "loss": 0.0342, "step": 783 }, { "epoch": 0.51, "grad_norm": 0.16210728883743286, "learning_rate": 0.00027927212612442243, "loss": 0.0403, "step": 784 }, { "epoch": 0.51, "grad_norm": 0.10174558311700821, "learning_rate": 0.0002792198042501652, "loss": 0.0304, "step": 785 }, { "epoch": 0.51, "grad_norm": 0.15236282348632812, "learning_rate": 0.0002791674213370549, "loss": 0.0378, "step": 786 }, { "epoch": 0.52, "grad_norm": 0.06242475286126137, "learning_rate": 0.0002791149774098353, "loss": 0.0092, "step": 787 }, { "epoch": 0.52, "grad_norm": 0.20970316231250763, "learning_rate": 0.0002790624724932792, "loss": 0.0479, "step": 788 }, { "epoch": 0.52, "grad_norm": 0.15189824998378754, "learning_rate": 0.0002790099066121879, "loss": 0.0118, "step": 789 }, { "epoch": 0.52, "grad_norm": 0.061395786702632904, "learning_rate": 0.0002789572797913918, "loss": 0.0151, "step": 790 }, { "epoch": 0.52, "grad_norm": 0.5034061074256897, "learning_rate": 0.00027890459205574987, "loss": 0.0864, "step": 791 }, { "epoch": 0.52, "grad_norm": 0.07147826254367828, "learning_rate": 0.0002788518434301499, "loss": 0.0191, "step": 792 }, { "epoch": 0.52, "grad_norm": 0.1329374462366104, "learning_rate": 0.0002787990339395085, "loss": 0.0331, "step": 793 }, { "epoch": 0.52, "grad_norm": 0.15126173198223114, "learning_rate": 0.0002787461636087711, "loss": 0.0143, "step": 794 }, { "epoch": 0.52, "grad_norm": 0.09816433489322662, "learning_rate": 0.0002786932324629116, "loss": 0.0155, "step": 795 }, { "epoch": 0.52, "grad_norm": 0.1379247009754181, "learning_rate": 0.0002786402405269329, "loss": 0.0315, "step": 796 }, { "epoch": 0.52, "grad_norm": 0.23714596033096313, "learning_rate": 0.00027858718782586647, "loss": 0.0465, "step": 797 }, { "epoch": 0.52, "grad_norm": 0.12600766122341156, "learning_rate": 0.0002785340743847725, "loss": 0.0359, "step": 798 }, { "epoch": 0.52, "grad_norm": 0.13546015322208405, "learning_rate": 0.00027848090022874, "loss": 0.0175, "step": 799 }, { "epoch": 0.52, "grad_norm": 0.07381202280521393, "learning_rate": 0.00027842766538288647, "loss": 0.0302, "step": 800 }, { "epoch": 0.52, "grad_norm": 0.14797933399677277, "learning_rate": 0.0002783743698723582, "loss": 0.0818, "step": 801 }, { "epoch": 0.53, "grad_norm": 0.02270502597093582, "learning_rate": 0.00027832101372233007, "loss": 0.0049, "step": 802 }, { "epoch": 0.53, "grad_norm": 0.322780042886734, "learning_rate": 0.00027826759695800566, "loss": 0.0694, "step": 803 }, { "epoch": 0.53, "grad_norm": 0.2222844511270523, "learning_rate": 0.0002782141196046171, "loss": 0.0261, "step": 804 }, { "epoch": 0.53, "grad_norm": 0.3285076320171356, "learning_rate": 0.0002781605816874253, "loss": 0.0872, "step": 805 }, { "epoch": 0.53, "grad_norm": 0.08804619312286377, "learning_rate": 0.0002781069832317196, "loss": 0.0578, "step": 806 }, { "epoch": 0.53, "grad_norm": 0.17540492117404938, "learning_rate": 0.00027805332426281793, "loss": 0.0384, "step": 807 }, { "epoch": 0.53, "grad_norm": 0.06208420172333717, "learning_rate": 0.00027799960480606706, "loss": 0.0136, "step": 808 }, { "epoch": 0.53, "grad_norm": 0.07048339396715164, "learning_rate": 0.0002779458248868421, "loss": 0.026, "step": 809 }, { "epoch": 0.53, "grad_norm": 0.0901297777891159, "learning_rate": 0.00027789198453054666, "loss": 0.0277, "step": 810 }, { "epoch": 0.53, "grad_norm": 0.07435130327939987, "learning_rate": 0.0002778380837626132, "loss": 0.0197, "step": 811 }, { "epoch": 0.53, "grad_norm": 0.14209306240081787, "learning_rate": 0.00027778412260850234, "loss": 0.0407, "step": 812 }, { "epoch": 0.53, "grad_norm": 0.16662320494651794, "learning_rate": 0.00027773010109370357, "loss": 0.0667, "step": 813 }, { "epoch": 0.53, "grad_norm": 0.031582899391651154, "learning_rate": 0.0002776760192437346, "loss": 0.0104, "step": 814 }, { "epoch": 0.53, "grad_norm": 0.0818646028637886, "learning_rate": 0.00027762187708414195, "loss": 0.0258, "step": 815 }, { "epoch": 0.53, "grad_norm": 0.0822024941444397, "learning_rate": 0.0002775676746405003, "loss": 0.0406, "step": 816 }, { "epoch": 0.53, "grad_norm": 0.11700989305973053, "learning_rate": 0.0002775134119384131, "loss": 0.0335, "step": 817 }, { "epoch": 0.54, "grad_norm": 0.082905612885952, "learning_rate": 0.00027745908900351195, "loss": 0.0161, "step": 818 }, { "epoch": 0.54, "grad_norm": 0.11846023797988892, "learning_rate": 0.00027740470586145726, "loss": 0.0502, "step": 819 }, { "epoch": 0.54, "grad_norm": 0.17150120437145233, "learning_rate": 0.00027735026253793756, "loss": 0.0345, "step": 820 }, { "epoch": 0.54, "grad_norm": 0.12443263083696365, "learning_rate": 0.00027729575905867, "loss": 0.0158, "step": 821 }, { "epoch": 0.54, "grad_norm": 0.1657358705997467, "learning_rate": 0.0002772411954494001, "loss": 0.0226, "step": 822 }, { "epoch": 0.54, "grad_norm": 0.08840323239564896, "learning_rate": 0.0002771865717359018, "loss": 0.0152, "step": 823 }, { "epoch": 0.54, "grad_norm": 0.08032941073179245, "learning_rate": 0.00027713188794397737, "loss": 0.0129, "step": 824 }, { "epoch": 0.54, "grad_norm": 0.1835167407989502, "learning_rate": 0.00027707714409945744, "loss": 0.0569, "step": 825 }, { "epoch": 0.54, "grad_norm": 0.12917861342430115, "learning_rate": 0.0002770223402282012, "loss": 0.0309, "step": 826 }, { "epoch": 0.54, "grad_norm": 0.22104112803936005, "learning_rate": 0.0002769674763560959, "loss": 0.0432, "step": 827 }, { "epoch": 0.54, "grad_norm": 0.13979768753051758, "learning_rate": 0.00027691255250905737, "loss": 0.0174, "step": 828 }, { "epoch": 0.54, "grad_norm": 0.17627565562725067, "learning_rate": 0.0002768575687130297, "loss": 0.0915, "step": 829 }, { "epoch": 0.54, "grad_norm": 0.486728310585022, "learning_rate": 0.0002768025249939853, "loss": 0.0583, "step": 830 }, { "epoch": 0.54, "grad_norm": 0.1259876936674118, "learning_rate": 0.0002767474213779247, "loss": 0.0254, "step": 831 }, { "epoch": 0.54, "grad_norm": 0.17353613674640656, "learning_rate": 0.00027669225789087715, "loss": 0.0238, "step": 832 }, { "epoch": 0.55, "grad_norm": 0.011490284465253353, "learning_rate": 0.00027663703455889973, "loss": 0.0025, "step": 833 }, { "epoch": 0.55, "grad_norm": 0.054609689861536026, "learning_rate": 0.00027658175140807815, "loss": 0.0098, "step": 834 }, { "epoch": 0.55, "grad_norm": 0.1213490441441536, "learning_rate": 0.000276526408464526, "loss": 0.0128, "step": 835 }, { "epoch": 0.55, "grad_norm": 0.09482322633266449, "learning_rate": 0.0002764710057543855, "loss": 0.0126, "step": 836 }, { "epoch": 0.55, "grad_norm": 0.057049017399549484, "learning_rate": 0.00027641554330382686, "loss": 0.015, "step": 837 }, { "epoch": 0.55, "grad_norm": 0.18572884798049927, "learning_rate": 0.0002763600211390486, "loss": 0.034, "step": 838 }, { "epoch": 0.55, "grad_norm": 0.09493198245763779, "learning_rate": 0.0002763044392862774, "loss": 0.0408, "step": 839 }, { "epoch": 0.55, "grad_norm": 0.2182336002588272, "learning_rate": 0.00027624879777176807, "loss": 0.055, "step": 840 }, { "epoch": 0.55, "grad_norm": 0.08872721344232559, "learning_rate": 0.00027619309662180386, "loss": 0.0383, "step": 841 }, { "epoch": 0.55, "grad_norm": 0.11956200748682022, "learning_rate": 0.0002761373358626959, "loss": 0.0287, "step": 842 }, { "epoch": 0.55, "grad_norm": 0.1644572764635086, "learning_rate": 0.0002760815155207837, "loss": 0.0286, "step": 843 }, { "epoch": 0.55, "grad_norm": 0.16476300358772278, "learning_rate": 0.0002760256356224347, "loss": 0.0392, "step": 844 }, { "epoch": 0.55, "grad_norm": 0.1026122123003006, "learning_rate": 0.00027596969619404457, "loss": 0.0403, "step": 845 }, { "epoch": 0.55, "grad_norm": 0.17450834810733795, "learning_rate": 0.00027591369726203725, "loss": 0.0586, "step": 846 }, { "epoch": 0.55, "grad_norm": 0.10373177379369736, "learning_rate": 0.0002758576388528645, "loss": 0.0214, "step": 847 }, { "epoch": 0.56, "grad_norm": 0.08164018392562866, "learning_rate": 0.0002758015209930064, "loss": 0.0229, "step": 848 }, { "epoch": 0.56, "grad_norm": 0.07375165820121765, "learning_rate": 0.000275745343708971, "loss": 0.0333, "step": 849 }, { "epoch": 0.56, "grad_norm": 0.09719602763652802, "learning_rate": 0.0002756891070272945, "loss": 0.0214, "step": 850 }, { "epoch": 0.56, "grad_norm": 0.5595388412475586, "learning_rate": 0.00027563281097454115, "loss": 0.0657, "step": 851 }, { "epoch": 0.56, "grad_norm": 0.10981204360723495, "learning_rate": 0.0002755764555773031, "loss": 0.0308, "step": 852 }, { "epoch": 0.56, "grad_norm": 0.10418907552957535, "learning_rate": 0.0002755200408622007, "loss": 0.0238, "step": 853 }, { "epoch": 0.56, "grad_norm": 0.0636146143078804, "learning_rate": 0.0002754635668558822, "loss": 0.0143, "step": 854 }, { "epoch": 0.56, "grad_norm": 0.12179470807313919, "learning_rate": 0.00027540703358502406, "loss": 0.0393, "step": 855 }, { "epoch": 0.56, "grad_norm": 0.07303999364376068, "learning_rate": 0.00027535044107633046, "loss": 0.0118, "step": 856 }, { "epoch": 0.56, "grad_norm": 0.11226726323366165, "learning_rate": 0.00027529378935653377, "loss": 0.0356, "step": 857 }, { "epoch": 0.56, "grad_norm": 0.16357053816318512, "learning_rate": 0.0002752370784523942, "loss": 0.0378, "step": 858 }, { "epoch": 0.56, "grad_norm": 0.10425914824008942, "learning_rate": 0.0002751803083907, "loss": 0.0423, "step": 859 }, { "epoch": 0.56, "grad_norm": 0.11986647546291351, "learning_rate": 0.0002751234791982674, "loss": 0.054, "step": 860 }, { "epoch": 0.56, "grad_norm": 0.14440590143203735, "learning_rate": 0.00027506659090194036, "loss": 0.0418, "step": 861 }, { "epoch": 0.56, "grad_norm": 0.21995751559734344, "learning_rate": 0.0002750096435285909, "loss": 0.0303, "step": 862 }, { "epoch": 0.56, "grad_norm": 0.03415970876812935, "learning_rate": 0.00027495263710511906, "loss": 0.0084, "step": 863 }, { "epoch": 0.57, "grad_norm": 0.052127208560705185, "learning_rate": 0.0002748955716584526, "loss": 0.0124, "step": 864 }, { "epoch": 0.57, "grad_norm": 0.23270320892333984, "learning_rate": 0.0002748384472155472, "loss": 0.0501, "step": 865 }, { "epoch": 0.57, "grad_norm": 0.05627870559692383, "learning_rate": 0.00027478126380338645, "loss": 0.0081, "step": 866 }, { "epoch": 0.57, "grad_norm": 0.1844397783279419, "learning_rate": 0.0002747240214489817, "loss": 0.04, "step": 867 }, { "epoch": 0.57, "grad_norm": 0.06833455711603165, "learning_rate": 0.0002746667201793722, "loss": 0.0136, "step": 868 }, { "epoch": 0.57, "grad_norm": 0.03551473841071129, "learning_rate": 0.00027460936002162513, "loss": 0.0057, "step": 869 }, { "epoch": 0.57, "grad_norm": 0.0920785516500473, "learning_rate": 0.0002745519410028354, "loss": 0.0103, "step": 870 }, { "epoch": 0.57, "grad_norm": 0.1218150407075882, "learning_rate": 0.0002744944631501256, "loss": 0.0427, "step": 871 }, { "epoch": 0.57, "grad_norm": 0.3496924042701721, "learning_rate": 0.00027443692649064633, "loss": 0.0686, "step": 872 }, { "epoch": 0.57, "grad_norm": 0.3225466310977936, "learning_rate": 0.00027437933105157585, "loss": 0.0518, "step": 873 }, { "epoch": 0.57, "grad_norm": 0.230736643075943, "learning_rate": 0.00027432167686012015, "loss": 0.0468, "step": 874 }, { "epoch": 0.57, "grad_norm": 0.20991326868534088, "learning_rate": 0.00027426396394351313, "loss": 0.0595, "step": 875 }, { "epoch": 0.57, "grad_norm": 0.10641276091337204, "learning_rate": 0.0002742061923290162, "loss": 0.0353, "step": 876 }, { "epoch": 0.57, "grad_norm": 0.06472618877887726, "learning_rate": 0.00027414836204391865, "loss": 0.012, "step": 877 }, { "epoch": 0.57, "grad_norm": 0.2291422188282013, "learning_rate": 0.0002740904731155375, "loss": 0.0431, "step": 878 }, { "epoch": 0.58, "grad_norm": 0.26647308468818665, "learning_rate": 0.0002740325255712175, "loss": 0.1054, "step": 879 }, { "epoch": 0.58, "grad_norm": 0.08363434672355652, "learning_rate": 0.0002739745194383309, "loss": 0.011, "step": 880 }, { "epoch": 0.58, "grad_norm": 0.10943964123725891, "learning_rate": 0.00027391645474427774, "loss": 0.0331, "step": 881 }, { "epoch": 0.58, "grad_norm": 0.2208610624074936, "learning_rate": 0.0002738583315164857, "loss": 0.0499, "step": 882 }, { "epoch": 0.58, "grad_norm": 0.09434379637241364, "learning_rate": 0.00027380014978241026, "loss": 0.0268, "step": 883 }, { "epoch": 0.58, "grad_norm": 0.13045388460159302, "learning_rate": 0.0002737419095695343, "loss": 0.0367, "step": 884 }, { "epoch": 0.58, "grad_norm": 0.1460418850183487, "learning_rate": 0.00027368361090536844, "loss": 0.0662, "step": 885 }, { "epoch": 0.58, "grad_norm": 0.08823563903570175, "learning_rate": 0.000273625253817451, "loss": 0.0387, "step": 886 }, { "epoch": 0.58, "grad_norm": 0.08193490654230118, "learning_rate": 0.00027356683833334766, "loss": 0.0357, "step": 887 }, { "epoch": 0.58, "grad_norm": 0.1274595856666565, "learning_rate": 0.00027350836448065193, "loss": 0.0346, "step": 888 }, { "epoch": 0.58, "grad_norm": 0.061123717576265335, "learning_rate": 0.0002734498322869847, "loss": 0.0388, "step": 889 }, { "epoch": 0.58, "grad_norm": 0.12708084285259247, "learning_rate": 0.0002733912417799945, "loss": 0.0276, "step": 890 }, { "epoch": 0.58, "grad_norm": 0.055733684450387955, "learning_rate": 0.00027333259298735756, "loss": 0.0139, "step": 891 }, { "epoch": 0.58, "grad_norm": 0.049776624888181686, "learning_rate": 0.00027327388593677727, "loss": 0.0141, "step": 892 }, { "epoch": 0.58, "grad_norm": 0.1466546654701233, "learning_rate": 0.000273215120655985, "loss": 0.0424, "step": 893 }, { "epoch": 0.59, "grad_norm": 0.04927024617791176, "learning_rate": 0.00027315629717273915, "loss": 0.0121, "step": 894 }, { "epoch": 0.59, "grad_norm": 0.14217214286327362, "learning_rate": 0.0002730974155148259, "loss": 0.0365, "step": 895 }, { "epoch": 0.59, "grad_norm": 0.06162632629275322, "learning_rate": 0.00027303847571005904, "loss": 0.0185, "step": 896 }, { "epoch": 0.59, "grad_norm": 0.09187627583742142, "learning_rate": 0.00027297947778627947, "loss": 0.024, "step": 897 }, { "epoch": 0.59, "grad_norm": 0.08694395422935486, "learning_rate": 0.00027292042177135575, "loss": 0.016, "step": 898 }, { "epoch": 0.59, "grad_norm": 0.2407931238412857, "learning_rate": 0.0002728613076931838, "loss": 0.0895, "step": 899 }, { "epoch": 0.59, "grad_norm": 0.11447851359844208, "learning_rate": 0.0002728021355796871, "loss": 0.0156, "step": 900 }, { "epoch": 0.59, "grad_norm": 0.17052114009857178, "learning_rate": 0.0002727429054588165, "loss": 0.0686, "step": 901 }, { "epoch": 0.59, "grad_norm": 0.11735350638628006, "learning_rate": 0.0002726836173585501, "loss": 0.0458, "step": 902 }, { "epoch": 0.59, "grad_norm": 0.1015033945441246, "learning_rate": 0.0002726242713068935, "loss": 0.0396, "step": 903 }, { "epoch": 0.59, "grad_norm": 0.09442136436700821, "learning_rate": 0.00027256486733187975, "loss": 0.0354, "step": 904 }, { "epoch": 0.59, "grad_norm": 0.051811713725328445, "learning_rate": 0.0002725054054615691, "loss": 0.0103, "step": 905 }, { "epoch": 0.59, "grad_norm": 0.09581268578767776, "learning_rate": 0.00027244588572404924, "loss": 0.0346, "step": 906 }, { "epoch": 0.59, "grad_norm": 0.1265789121389389, "learning_rate": 0.00027238630814743525, "loss": 0.0296, "step": 907 }, { "epoch": 0.59, "grad_norm": 0.11578807979822159, "learning_rate": 0.0002723266727598694, "loss": 0.0374, "step": 908 }, { "epoch": 0.6, "grad_norm": 0.0634288564324379, "learning_rate": 0.0002722669795895214, "loss": 0.0211, "step": 909 }, { "epoch": 0.6, "grad_norm": 0.10002614557743073, "learning_rate": 0.0002722072286645881, "loss": 0.0217, "step": 910 }, { "epoch": 0.6, "grad_norm": 0.10582344233989716, "learning_rate": 0.0002721474200132937, "loss": 0.0262, "step": 911 }, { "epoch": 0.6, "grad_norm": 0.20417608320713043, "learning_rate": 0.0002720875536638898, "loss": 0.0303, "step": 912 }, { "epoch": 0.6, "grad_norm": 0.06233491376042366, "learning_rate": 0.00027202762964465514, "loss": 0.0179, "step": 913 }, { "epoch": 0.6, "grad_norm": 0.10917846113443375, "learning_rate": 0.00027196764798389557, "loss": 0.0238, "step": 914 }, { "epoch": 0.6, "grad_norm": 0.20902927219867706, "learning_rate": 0.0002719076087099444, "loss": 0.0744, "step": 915 }, { "epoch": 0.6, "grad_norm": 0.07525712251663208, "learning_rate": 0.000271847511851162, "loss": 0.0145, "step": 916 }, { "epoch": 0.6, "grad_norm": 0.13625741004943848, "learning_rate": 0.0002717873574359361, "loss": 0.0557, "step": 917 }, { "epoch": 0.6, "grad_norm": 0.10275349766016006, "learning_rate": 0.00027172714549268136, "loss": 0.0156, "step": 918 }, { "epoch": 0.6, "grad_norm": 0.07689966261386871, "learning_rate": 0.0002716668760498399, "loss": 0.0285, "step": 919 }, { "epoch": 0.6, "grad_norm": 0.051624033600091934, "learning_rate": 0.00027160654913588073, "loss": 0.0109, "step": 920 }, { "epoch": 0.6, "grad_norm": 0.1263073831796646, "learning_rate": 0.0002715461647793003, "loss": 0.03, "step": 921 }, { "epoch": 0.6, "grad_norm": 0.03605236858129501, "learning_rate": 0.0002714857230086219, "loss": 0.008, "step": 922 }, { "epoch": 0.6, "grad_norm": 0.09554066509008408, "learning_rate": 0.0002714252238523962, "loss": 0.0276, "step": 923 }, { "epoch": 0.6, "grad_norm": 0.12727093696594238, "learning_rate": 0.0002713646673392008, "loss": 0.0365, "step": 924 }, { "epoch": 0.61, "grad_norm": 0.21029303967952728, "learning_rate": 0.00027130405349764044, "loss": 0.0554, "step": 925 }, { "epoch": 0.61, "grad_norm": 0.10958801954984665, "learning_rate": 0.00027124338235634695, "loss": 0.032, "step": 926 }, { "epoch": 0.61, "grad_norm": 0.06557829678058624, "learning_rate": 0.0002711826539439792, "loss": 0.0145, "step": 927 }, { "epoch": 0.61, "grad_norm": 0.0530441552400589, "learning_rate": 0.0002711218682892232, "loss": 0.014, "step": 928 }, { "epoch": 0.61, "grad_norm": 0.11874904483556747, "learning_rate": 0.00027106102542079195, "loss": 0.0144, "step": 929 }, { "epoch": 0.61, "grad_norm": 0.07747121155261993, "learning_rate": 0.0002710001253674254, "loss": 0.0136, "step": 930 }, { "epoch": 0.61, "grad_norm": 0.055583804845809937, "learning_rate": 0.0002709391681578906, "loss": 0.013, "step": 931 }, { "epoch": 0.61, "grad_norm": 0.06069410964846611, "learning_rate": 0.0002708781538209815, "loss": 0.0076, "step": 932 }, { "epoch": 0.61, "grad_norm": 0.019891362637281418, "learning_rate": 0.00027081708238551927, "loss": 0.0038, "step": 933 }, { "epoch": 0.61, "grad_norm": 0.1343265175819397, "learning_rate": 0.00027075595388035173, "loss": 0.0307, "step": 934 }, { "epoch": 0.61, "grad_norm": 0.04620016738772392, "learning_rate": 0.00027069476833435397, "loss": 0.0048, "step": 935 }, { "epoch": 0.61, "grad_norm": 0.1706463247537613, "learning_rate": 0.00027063352577642776, "loss": 0.0643, "step": 936 }, { "epoch": 0.61, "grad_norm": 0.058014389127492905, "learning_rate": 0.0002705722262355019, "loss": 0.0081, "step": 937 }, { "epoch": 0.61, "grad_norm": 0.11744493991136551, "learning_rate": 0.0002705108697405322, "loss": 0.0308, "step": 938 }, { "epoch": 0.61, "grad_norm": 0.08099761605262756, "learning_rate": 0.00027044945632050127, "loss": 0.0052, "step": 939 }, { "epoch": 0.62, "grad_norm": 0.29563236236572266, "learning_rate": 0.00027038798600441865, "loss": 0.0529, "step": 940 }, { "epoch": 0.62, "grad_norm": 0.043802157044410706, "learning_rate": 0.0002703264588213206, "loss": 0.0071, "step": 941 }, { "epoch": 0.62, "grad_norm": 0.12684734165668488, "learning_rate": 0.00027026487480027057, "loss": 0.0433, "step": 942 }, { "epoch": 0.62, "grad_norm": 0.21014286577701569, "learning_rate": 0.00027020323397035855, "loss": 0.028, "step": 943 }, { "epoch": 0.62, "grad_norm": 0.11645261198282242, "learning_rate": 0.00027014153636070157, "loss": 0.0178, "step": 944 }, { "epoch": 0.62, "grad_norm": 0.16726157069206238, "learning_rate": 0.00027007978200044324, "loss": 0.0508, "step": 945 }, { "epoch": 0.62, "grad_norm": 0.10064594447612762, "learning_rate": 0.0002700179709187543, "loss": 0.0239, "step": 946 }, { "epoch": 0.62, "grad_norm": 0.060703571885824203, "learning_rate": 0.00026995610314483205, "loss": 0.0103, "step": 947 }, { "epoch": 0.62, "grad_norm": 0.0527808852493763, "learning_rate": 0.0002698941787079006, "loss": 0.0178, "step": 948 }, { "epoch": 0.62, "grad_norm": 0.08081556856632233, "learning_rate": 0.00026983219763721086, "loss": 0.0157, "step": 949 }, { "epoch": 0.62, "grad_norm": 0.12985916435718536, "learning_rate": 0.00026977015996204054, "loss": 0.0575, "step": 950 }, { "epoch": 0.62, "grad_norm": 0.15043164789676666, "learning_rate": 0.00026970806571169397, "loss": 0.0302, "step": 951 }, { "epoch": 0.62, "grad_norm": 0.024910060688853264, "learning_rate": 0.00026964591491550235, "loss": 0.0045, "step": 952 }, { "epoch": 0.62, "grad_norm": 0.10944465547800064, "learning_rate": 0.00026958370760282345, "loss": 0.0574, "step": 953 }, { "epoch": 0.62, "grad_norm": 0.114822618663311, "learning_rate": 0.0002695214438030418, "loss": 0.0262, "step": 954 }, { "epoch": 0.63, "grad_norm": 0.15373332798480988, "learning_rate": 0.0002694591235455687, "loss": 0.0206, "step": 955 }, { "epoch": 0.63, "grad_norm": 0.14427144825458527, "learning_rate": 0.0002693967468598419, "loss": 0.0508, "step": 956 }, { "epoch": 0.63, "grad_norm": 0.0668393075466156, "learning_rate": 0.000269334313775326, "loss": 0.0195, "step": 957 }, { "epoch": 0.63, "grad_norm": 0.06797386705875397, "learning_rate": 0.00026927182432151216, "loss": 0.0081, "step": 958 }, { "epoch": 0.63, "grad_norm": 0.21059945225715637, "learning_rate": 0.00026920927852791825, "loss": 0.1075, "step": 959 }, { "epoch": 0.63, "grad_norm": 0.10499881953001022, "learning_rate": 0.0002691466764240886, "loss": 0.0111, "step": 960 }, { "epoch": 0.63, "grad_norm": 0.033115822821855545, "learning_rate": 0.00026908401803959423, "loss": 0.0054, "step": 961 }, { "epoch": 0.63, "grad_norm": 0.2655697464942932, "learning_rate": 0.0002690213034040328, "loss": 0.0455, "step": 962 }, { "epoch": 0.63, "grad_norm": 0.1976163387298584, "learning_rate": 0.0002689585325470284, "loss": 0.0454, "step": 963 }, { "epoch": 0.63, "grad_norm": 0.05260282754898071, "learning_rate": 0.00026889570549823184, "loss": 0.0275, "step": 964 }, { "epoch": 0.63, "grad_norm": 0.1485443115234375, "learning_rate": 0.0002688328222873203, "loss": 0.0191, "step": 965 }, { "epoch": 0.63, "grad_norm": 0.0436883270740509, "learning_rate": 0.0002687698829439977, "loss": 0.0099, "step": 966 }, { "epoch": 0.63, "grad_norm": 0.12818527221679688, "learning_rate": 0.00026870688749799416, "loss": 0.0323, "step": 967 }, { "epoch": 0.63, "grad_norm": 0.14603693783283234, "learning_rate": 0.0002686438359790667, "loss": 0.0541, "step": 968 }, { "epoch": 0.63, "grad_norm": 0.09324526786804199, "learning_rate": 0.00026858072841699847, "loss": 0.0272, "step": 969 }, { "epoch": 0.64, "grad_norm": 0.26789504289627075, "learning_rate": 0.0002685175648415994, "loss": 0.0503, "step": 970 }, { "epoch": 0.64, "grad_norm": 0.059855278581380844, "learning_rate": 0.0002684543452827056, "loss": 0.0136, "step": 971 }, { "epoch": 0.64, "grad_norm": 0.08910810202360153, "learning_rate": 0.00026839106977017974, "loss": 0.016, "step": 972 }, { "epoch": 0.64, "grad_norm": 0.09903378039598465, "learning_rate": 0.000268327738333911, "loss": 0.0307, "step": 973 }, { "epoch": 0.64, "grad_norm": 0.16080208122730255, "learning_rate": 0.00026826435100381487, "loss": 0.0318, "step": 974 }, { "epoch": 0.64, "grad_norm": 0.09495270997285843, "learning_rate": 0.0002682009078098333, "loss": 0.0591, "step": 975 }, { "epoch": 0.64, "grad_norm": 0.11322695016860962, "learning_rate": 0.00026813740878193457, "loss": 0.047, "step": 976 }, { "epoch": 0.64, "grad_norm": 0.06805938482284546, "learning_rate": 0.0002680738539501134, "loss": 0.0337, "step": 977 }, { "epoch": 0.64, "grad_norm": 0.18398675322532654, "learning_rate": 0.00026801024334439076, "loss": 0.0653, "step": 978 }, { "epoch": 0.64, "grad_norm": 0.09730216860771179, "learning_rate": 0.00026794657699481415, "loss": 0.0463, "step": 979 }, { "epoch": 0.64, "grad_norm": 0.0954691618680954, "learning_rate": 0.0002678828549314573, "loss": 0.0199, "step": 980 }, { "epoch": 0.64, "grad_norm": 0.15214982628822327, "learning_rate": 0.00026781907718442013, "loss": 0.0606, "step": 981 }, { "epoch": 0.64, "grad_norm": 0.07308922708034515, "learning_rate": 0.00026775524378382906, "loss": 0.0229, "step": 982 }, { "epoch": 0.64, "grad_norm": 0.1865328997373581, "learning_rate": 0.00026769135475983676, "loss": 0.0617, "step": 983 }, { "epoch": 0.64, "grad_norm": 0.0670800730586052, "learning_rate": 0.0002676274101426221, "loss": 0.0213, "step": 984 }, { "epoch": 0.64, "grad_norm": 0.09108185768127441, "learning_rate": 0.0002675634099623903, "loss": 0.0163, "step": 985 }, { "epoch": 0.65, "grad_norm": 0.09892558306455612, "learning_rate": 0.0002674993542493727, "loss": 0.0398, "step": 986 }, { "epoch": 0.65, "grad_norm": 0.18465696275234222, "learning_rate": 0.00026743524303382695, "loss": 0.0456, "step": 987 }, { "epoch": 0.65, "grad_norm": 0.14701491594314575, "learning_rate": 0.000267371076346037, "loss": 0.0217, "step": 988 }, { "epoch": 0.65, "grad_norm": 0.22119949758052826, "learning_rate": 0.0002673068542163128, "loss": 0.0337, "step": 989 }, { "epoch": 0.65, "grad_norm": 0.07329166680574417, "learning_rate": 0.0002672425766749907, "loss": 0.0077, "step": 990 }, { "epoch": 0.65, "grad_norm": 0.08214308321475983, "learning_rate": 0.0002671782437524331, "loss": 0.0086, "step": 991 }, { "epoch": 0.65, "grad_norm": 0.16395068168640137, "learning_rate": 0.0002671138554790286, "loss": 0.0511, "step": 992 }, { "epoch": 0.65, "grad_norm": 0.07903768122196198, "learning_rate": 0.0002670494118851919, "loss": 0.0227, "step": 993 }, { "epoch": 0.65, "grad_norm": 0.044391512870788574, "learning_rate": 0.0002669849130013639, "loss": 0.0062, "step": 994 }, { "epoch": 0.65, "grad_norm": 0.11790774017572403, "learning_rate": 0.0002669203588580116, "loss": 0.0586, "step": 995 }, { "epoch": 0.65, "grad_norm": 0.023213036358356476, "learning_rate": 0.000266855749485628, "loss": 0.004, "step": 996 }, { "epoch": 0.65, "grad_norm": 0.1801631897687912, "learning_rate": 0.0002667910849147324, "loss": 0.0273, "step": 997 }, { "epoch": 0.65, "grad_norm": 0.3998229205608368, "learning_rate": 0.00026672636517587, "loss": 0.0479, "step": 998 }, { "epoch": 0.65, "grad_norm": 0.08344905078411102, "learning_rate": 0.0002666615902996121, "loss": 0.0066, "step": 999 }, { "epoch": 0.65, "grad_norm": 0.4904734194278717, "learning_rate": 0.00026659676031655605, "loss": 0.107, "step": 1000 }, { "epoch": 0.66, "grad_norm": 0.14752142131328583, "learning_rate": 0.00026653187525732525, "loss": 0.0567, "step": 1001 }, { "epoch": 0.66, "grad_norm": 0.09572061896324158, "learning_rate": 0.0002664669351525691, "loss": 0.045, "step": 1002 }, { "epoch": 0.66, "grad_norm": 0.1489264965057373, "learning_rate": 0.00026640194003296297, "loss": 0.0181, "step": 1003 }, { "epoch": 0.66, "grad_norm": 0.06828869134187698, "learning_rate": 0.00026633688992920833, "loss": 0.0204, "step": 1004 }, { "epoch": 0.66, "grad_norm": 0.08580945432186127, "learning_rate": 0.00026627178487203244, "loss": 0.0275, "step": 1005 }, { "epoch": 0.66, "grad_norm": 0.2796219289302826, "learning_rate": 0.00026620662489218867, "loss": 0.06, "step": 1006 }, { "epoch": 0.66, "grad_norm": 0.19413504004478455, "learning_rate": 0.0002661414100204563, "loss": 0.048, "step": 1007 }, { "epoch": 0.66, "grad_norm": 0.0517372228205204, "learning_rate": 0.0002660761402876405, "loss": 0.0192, "step": 1008 }, { "epoch": 0.66, "grad_norm": 0.12586665153503418, "learning_rate": 0.0002660108157245724, "loss": 0.064, "step": 1009 }, { "epoch": 0.66, "grad_norm": 0.055950991809368134, "learning_rate": 0.000265945436362109, "loss": 0.0128, "step": 1010 }, { "epoch": 0.66, "grad_norm": 0.03763822093605995, "learning_rate": 0.00026588000223113316, "loss": 0.0107, "step": 1011 }, { "epoch": 0.66, "grad_norm": 0.20842203497886658, "learning_rate": 0.00026581451336255365, "loss": 0.0668, "step": 1012 }, { "epoch": 0.66, "grad_norm": 0.077543243765831, "learning_rate": 0.00026574896978730515, "loss": 0.0218, "step": 1013 }, { "epoch": 0.66, "grad_norm": 0.13783104717731476, "learning_rate": 0.0002656833715363481, "loss": 0.0431, "step": 1014 }, { "epoch": 0.66, "grad_norm": 0.049275536090135574, "learning_rate": 0.0002656177186406687, "loss": 0.012, "step": 1015 }, { "epoch": 0.67, "grad_norm": 0.10721635073423386, "learning_rate": 0.00026555201113127907, "loss": 0.0392, "step": 1016 }, { "epoch": 0.67, "grad_norm": 0.1177641823887825, "learning_rate": 0.0002654862490392172, "loss": 0.0416, "step": 1017 }, { "epoch": 0.67, "grad_norm": 0.1034293919801712, "learning_rate": 0.00026542043239554677, "loss": 0.0262, "step": 1018 }, { "epoch": 0.67, "grad_norm": 0.05769471079111099, "learning_rate": 0.0002653545612313571, "loss": 0.0088, "step": 1019 }, { "epoch": 0.67, "grad_norm": 0.2152629792690277, "learning_rate": 0.0002652886355777635, "loss": 0.0709, "step": 1020 }, { "epoch": 0.67, "grad_norm": 0.0717998817563057, "learning_rate": 0.0002652226554659069, "loss": 0.0135, "step": 1021 }, { "epoch": 0.67, "grad_norm": 0.24547475576400757, "learning_rate": 0.0002651566209269539, "loss": 0.0627, "step": 1022 }, { "epoch": 0.67, "grad_norm": 0.17455288767814636, "learning_rate": 0.00026509053199209697, "loss": 0.0466, "step": 1023 }, { "epoch": 0.67, "grad_norm": 0.08559072017669678, "learning_rate": 0.0002650243886925541, "loss": 0.0306, "step": 1024 }, { "epoch": 0.67, "grad_norm": 0.16568362712860107, "learning_rate": 0.0002649581910595691, "loss": 0.0272, "step": 1025 }, { "epoch": 0.67, "grad_norm": 0.14109772443771362, "learning_rate": 0.00026489193912441133, "loss": 0.0241, "step": 1026 }, { "epoch": 0.67, "grad_norm": 0.12116571515798569, "learning_rate": 0.00026482563291837586, "loss": 0.0216, "step": 1027 }, { "epoch": 0.67, "grad_norm": 0.1847831755876541, "learning_rate": 0.0002647592724727835, "loss": 0.046, "step": 1028 }, { "epoch": 0.67, "grad_norm": 0.1964387595653534, "learning_rate": 0.0002646928578189803, "loss": 0.0223, "step": 1029 }, { "epoch": 0.67, "grad_norm": 0.17670650780200958, "learning_rate": 0.0002646263889883385, "loss": 0.0392, "step": 1030 }, { "epoch": 0.67, "grad_norm": 0.3018537759780884, "learning_rate": 0.00026455986601225544, "loss": 0.0601, "step": 1031 }, { "epoch": 0.68, "grad_norm": 0.16954761743545532, "learning_rate": 0.0002644932889221543, "loss": 0.0568, "step": 1032 }, { "epoch": 0.68, "grad_norm": 0.07362630218267441, "learning_rate": 0.0002644266577494837, "loss": 0.0173, "step": 1033 }, { "epoch": 0.68, "grad_norm": 0.22263336181640625, "learning_rate": 0.0002643599725257178, "loss": 0.0528, "step": 1034 }, { "epoch": 0.68, "grad_norm": 0.1654106080532074, "learning_rate": 0.00026429323328235635, "loss": 0.0264, "step": 1035 }, { "epoch": 0.68, "grad_norm": 0.16433385014533997, "learning_rate": 0.0002642264400509247, "loss": 0.0403, "step": 1036 }, { "epoch": 0.68, "grad_norm": 0.16119657456874847, "learning_rate": 0.0002641595928629735, "loss": 0.0517, "step": 1037 }, { "epoch": 0.68, "grad_norm": 0.06720812618732452, "learning_rate": 0.00026409269175007904, "loss": 0.0275, "step": 1038 }, { "epoch": 0.68, "grad_norm": 0.08320458233356476, "learning_rate": 0.000264025736743843, "loss": 0.0254, "step": 1039 }, { "epoch": 0.68, "grad_norm": 0.10702455043792725, "learning_rate": 0.00026395872787589254, "loss": 0.0173, "step": 1040 }, { "epoch": 0.68, "grad_norm": 0.1805281639099121, "learning_rate": 0.0002638916651778803, "loss": 0.0526, "step": 1041 }, { "epoch": 0.68, "grad_norm": 0.1021476462483406, "learning_rate": 0.0002638245486814843, "loss": 0.0206, "step": 1042 }, { "epoch": 0.68, "grad_norm": 0.0951414480805397, "learning_rate": 0.00026375737841840803, "loss": 0.0165, "step": 1043 }, { "epoch": 0.68, "grad_norm": 0.07957201451063156, "learning_rate": 0.0002636901544203804, "loss": 0.0205, "step": 1044 }, { "epoch": 0.68, "grad_norm": 0.1612643599510193, "learning_rate": 0.0002636228767191555, "loss": 0.0426, "step": 1045 }, { "epoch": 0.68, "grad_norm": 0.06410415470600128, "learning_rate": 0.00026355554534651296, "loss": 0.0138, "step": 1046 }, { "epoch": 0.69, "grad_norm": 0.06664423644542694, "learning_rate": 0.0002634881603342578, "loss": 0.0158, "step": 1047 }, { "epoch": 0.69, "grad_norm": 0.07890690118074417, "learning_rate": 0.0002634207217142203, "loss": 0.0582, "step": 1048 }, { "epoch": 0.69, "grad_norm": 0.14806897938251495, "learning_rate": 0.000263353229518256, "loss": 0.059, "step": 1049 }, { "epoch": 0.69, "grad_norm": 0.08011514693498611, "learning_rate": 0.00026328568377824587, "loss": 0.0114, "step": 1050 }, { "epoch": 0.69, "grad_norm": 0.2250976264476776, "learning_rate": 0.00026321808452609615, "loss": 0.0563, "step": 1051 }, { "epoch": 0.69, "grad_norm": 0.12238743901252747, "learning_rate": 0.0002631504317937383, "loss": 0.027, "step": 1052 }, { "epoch": 0.69, "grad_norm": 0.21183420717716217, "learning_rate": 0.00026308272561312903, "loss": 0.0975, "step": 1053 }, { "epoch": 0.69, "grad_norm": 0.03879234194755554, "learning_rate": 0.0002630149660162505, "loss": 0.0079, "step": 1054 }, { "epoch": 0.69, "grad_norm": 0.0885310247540474, "learning_rate": 0.0002629471530351097, "loss": 0.0345, "step": 1055 }, { "epoch": 0.69, "grad_norm": 0.15572543442249298, "learning_rate": 0.0002628792867017392, "loss": 0.0418, "step": 1056 }, { "epoch": 0.69, "grad_norm": 0.05571586638689041, "learning_rate": 0.00026281136704819674, "loss": 0.0148, "step": 1057 }, { "epoch": 0.69, "grad_norm": 0.15888281166553497, "learning_rate": 0.000262743394106565, "loss": 0.0423, "step": 1058 }, { "epoch": 0.69, "grad_norm": 0.06525658816099167, "learning_rate": 0.0002626753679089521, "loss": 0.0179, "step": 1059 }, { "epoch": 0.69, "grad_norm": 0.12204741686582565, "learning_rate": 0.0002626072884874911, "loss": 0.025, "step": 1060 }, { "epoch": 0.69, "grad_norm": 0.08302486687898636, "learning_rate": 0.00026253915587434035, "loss": 0.0346, "step": 1061 }, { "epoch": 0.7, "grad_norm": 0.1839776635169983, "learning_rate": 0.0002624709701016833, "loss": 0.0328, "step": 1062 }, { "epoch": 0.7, "grad_norm": 0.07696244865655899, "learning_rate": 0.0002624027312017285, "loss": 0.0133, "step": 1063 }, { "epoch": 0.7, "grad_norm": 0.13141821324825287, "learning_rate": 0.0002623344392067096, "loss": 0.0776, "step": 1064 }, { "epoch": 0.7, "grad_norm": 0.10801652073860168, "learning_rate": 0.00026226609414888523, "loss": 0.0308, "step": 1065 }, { "epoch": 0.7, "grad_norm": 0.11759611964225769, "learning_rate": 0.00026219769606053927, "loss": 0.0555, "step": 1066 }, { "epoch": 0.7, "grad_norm": 0.27231448888778687, "learning_rate": 0.00026212924497398044, "loss": 0.1241, "step": 1067 }, { "epoch": 0.7, "grad_norm": 0.05362692102789879, "learning_rate": 0.00026206074092154276, "loss": 0.0345, "step": 1068 }, { "epoch": 0.7, "grad_norm": 0.09251823276281357, "learning_rate": 0.0002619921839355849, "loss": 0.0423, "step": 1069 }, { "epoch": 0.7, "grad_norm": 0.09250043332576752, "learning_rate": 0.000261923574048491, "loss": 0.0433, "step": 1070 }, { "epoch": 0.7, "grad_norm": 0.04056562855839729, "learning_rate": 0.0002618549112926698, "loss": 0.0134, "step": 1071 }, { "epoch": 0.7, "grad_norm": 0.04624701663851738, "learning_rate": 0.0002617861957005551, "loss": 0.0204, "step": 1072 }, { "epoch": 0.7, "grad_norm": 0.09492779523134232, "learning_rate": 0.00026171742730460583, "loss": 0.0252, "step": 1073 }, { "epoch": 0.7, "grad_norm": 0.07661382853984833, "learning_rate": 0.00026164860613730567, "loss": 0.0164, "step": 1074 }, { "epoch": 0.7, "grad_norm": 0.2870001196861267, "learning_rate": 0.0002615797322311633, "loss": 0.0362, "step": 1075 }, { "epoch": 0.7, "grad_norm": 0.0941600501537323, "learning_rate": 0.0002615108056187123, "loss": 0.0277, "step": 1076 }, { "epoch": 0.71, "grad_norm": 0.09941410273313522, "learning_rate": 0.00026144182633251127, "loss": 0.0271, "step": 1077 }, { "epoch": 0.71, "grad_norm": 0.06893230229616165, "learning_rate": 0.0002613727944051434, "loss": 0.0264, "step": 1078 }, { "epoch": 0.71, "grad_norm": 0.09225239604711533, "learning_rate": 0.00026130370986921707, "loss": 0.0124, "step": 1079 }, { "epoch": 0.71, "grad_norm": 0.13335295021533966, "learning_rate": 0.0002612345727573653, "loss": 0.0658, "step": 1080 }, { "epoch": 0.71, "grad_norm": 0.08353302627801895, "learning_rate": 0.000261165383102246, "loss": 0.0168, "step": 1081 }, { "epoch": 0.71, "grad_norm": 0.16986088454723358, "learning_rate": 0.00026109614093654195, "loss": 0.0857, "step": 1082 }, { "epoch": 0.71, "grad_norm": 0.07607953995466232, "learning_rate": 0.00026102684629296065, "loss": 0.01, "step": 1083 }, { "epoch": 0.71, "grad_norm": 0.1080528199672699, "learning_rate": 0.00026095749920423446, "loss": 0.0605, "step": 1084 }, { "epoch": 0.71, "grad_norm": 0.14226533472537994, "learning_rate": 0.0002608880997031205, "loss": 0.0323, "step": 1085 }, { "epoch": 0.71, "grad_norm": 0.0267617329955101, "learning_rate": 0.0002608186478224006, "loss": 0.0046, "step": 1086 }, { "epoch": 0.71, "grad_norm": 0.05727904289960861, "learning_rate": 0.00026074914359488143, "loss": 0.0111, "step": 1087 }, { "epoch": 0.71, "grad_norm": 0.08158308267593384, "learning_rate": 0.0002606795870533942, "loss": 0.0227, "step": 1088 }, { "epoch": 0.71, "grad_norm": 0.17422080039978027, "learning_rate": 0.00026060997823079506, "loss": 0.0583, "step": 1089 }, { "epoch": 0.71, "grad_norm": 0.19084464013576508, "learning_rate": 0.0002605403171599647, "loss": 0.0736, "step": 1090 }, { "epoch": 0.71, "grad_norm": 0.10208334028720856, "learning_rate": 0.00026047060387380855, "loss": 0.021, "step": 1091 }, { "epoch": 0.71, "grad_norm": 0.13515685498714447, "learning_rate": 0.0002604008384052568, "loss": 0.0319, "step": 1092 }, { "epoch": 0.72, "grad_norm": 0.13729439675807953, "learning_rate": 0.00026033102078726393, "loss": 0.0292, "step": 1093 }, { "epoch": 0.72, "grad_norm": 0.10295616090297699, "learning_rate": 0.0002602611510528095, "loss": 0.0133, "step": 1094 }, { "epoch": 0.72, "grad_norm": 0.14003846049308777, "learning_rate": 0.0002601912292348975, "loss": 0.0413, "step": 1095 }, { "epoch": 0.72, "grad_norm": 0.22413985431194305, "learning_rate": 0.0002601212553665564, "loss": 0.0242, "step": 1096 }, { "epoch": 0.72, "grad_norm": 0.13832725584506989, "learning_rate": 0.0002600512294808395, "loss": 0.0353, "step": 1097 }, { "epoch": 0.72, "grad_norm": 0.29502683877944946, "learning_rate": 0.0002599811516108245, "loss": 0.0362, "step": 1098 }, { "epoch": 0.72, "grad_norm": 0.09490124136209488, "learning_rate": 0.00025991102178961366, "loss": 0.014, "step": 1099 }, { "epoch": 0.72, "grad_norm": 0.1145247370004654, "learning_rate": 0.0002598408400503339, "loss": 0.0294, "step": 1100 }, { "epoch": 0.72, "grad_norm": 0.38977229595184326, "learning_rate": 0.00025977060642613645, "loss": 0.0827, "step": 1101 }, { "epoch": 0.72, "grad_norm": 0.10398557782173157, "learning_rate": 0.0002597003209501973, "loss": 0.0176, "step": 1102 }, { "epoch": 0.72, "grad_norm": 0.13759955763816833, "learning_rate": 0.0002596299836557168, "loss": 0.0428, "step": 1103 }, { "epoch": 0.72, "grad_norm": 0.05294102802872658, "learning_rate": 0.0002595595945759198, "loss": 0.013, "step": 1104 }, { "epoch": 0.72, "grad_norm": 0.2116420418024063, "learning_rate": 0.0002594891537440556, "loss": 0.0416, "step": 1105 }, { "epoch": 0.72, "grad_norm": 0.0850871354341507, "learning_rate": 0.00025941866119339786, "loss": 0.0264, "step": 1106 }, { "epoch": 0.72, "grad_norm": 0.04429350420832634, "learning_rate": 0.00025934811695724484, "loss": 0.0088, "step": 1107 }, { "epoch": 0.73, "grad_norm": 0.0578470379114151, "learning_rate": 0.0002592775210689192, "loss": 0.0295, "step": 1108 }, { "epoch": 0.73, "grad_norm": 0.1103309616446495, "learning_rate": 0.00025920687356176784, "loss": 0.0154, "step": 1109 }, { "epoch": 0.73, "grad_norm": 0.09454017877578735, "learning_rate": 0.0002591361744691622, "loss": 0.025, "step": 1110 }, { "epoch": 0.73, "grad_norm": 0.19059227406978607, "learning_rate": 0.0002590654238244979, "loss": 0.0599, "step": 1111 }, { "epoch": 0.73, "grad_norm": 0.08629673719406128, "learning_rate": 0.0002589946216611952, "loss": 0.0151, "step": 1112 }, { "epoch": 0.73, "grad_norm": 0.18637306988239288, "learning_rate": 0.0002589237680126984, "loss": 0.0496, "step": 1113 }, { "epoch": 0.73, "grad_norm": 0.12386718392372131, "learning_rate": 0.00025885286291247634, "loss": 0.0269, "step": 1114 }, { "epoch": 0.73, "grad_norm": 0.18383803963661194, "learning_rate": 0.00025878190639402204, "loss": 0.0408, "step": 1115 }, { "epoch": 0.73, "grad_norm": 0.24928437173366547, "learning_rate": 0.0002587108984908528, "loss": 0.0254, "step": 1116 }, { "epoch": 0.73, "grad_norm": 0.023719167336821556, "learning_rate": 0.00025863983923651027, "loss": 0.0037, "step": 1117 }, { "epoch": 0.73, "grad_norm": 0.16337376832962036, "learning_rate": 0.00025856872866456037, "loss": 0.0529, "step": 1118 }, { "epoch": 0.73, "grad_norm": 0.11658964306116104, "learning_rate": 0.00025849756680859317, "loss": 0.063, "step": 1119 }, { "epoch": 0.73, "grad_norm": 0.20387554168701172, "learning_rate": 0.000258426353702223, "loss": 0.0605, "step": 1120 }, { "epoch": 0.73, "grad_norm": 0.2778151035308838, "learning_rate": 0.0002583550893790885, "loss": 0.0476, "step": 1121 }, { "epoch": 0.73, "grad_norm": 0.11449744552373886, "learning_rate": 0.0002582837738728522, "loss": 0.0315, "step": 1122 }, { "epoch": 0.74, "grad_norm": 0.10286298394203186, "learning_rate": 0.00025821240721720116, "loss": 0.041, "step": 1123 }, { "epoch": 0.74, "grad_norm": 0.11522707343101501, "learning_rate": 0.00025814098944584645, "loss": 0.0414, "step": 1124 }, { "epoch": 0.74, "grad_norm": 0.06536536663770676, "learning_rate": 0.0002580695205925233, "loss": 0.0216, "step": 1125 }, { "epoch": 0.74, "grad_norm": 0.0686458870768547, "learning_rate": 0.00025799800069099105, "loss": 0.0667, "step": 1126 }, { "epoch": 0.74, "grad_norm": 0.07378174364566803, "learning_rate": 0.0002579264297750331, "loss": 0.018, "step": 1127 }, { "epoch": 0.74, "grad_norm": 0.05744575336575508, "learning_rate": 0.0002578548078784571, "loss": 0.0328, "step": 1128 }, { "epoch": 0.74, "grad_norm": 0.1781056821346283, "learning_rate": 0.0002577831350350947, "loss": 0.056, "step": 1129 }, { "epoch": 0.74, "grad_norm": 0.11974502354860306, "learning_rate": 0.0002577114112788016, "loss": 0.0411, "step": 1130 }, { "epoch": 0.74, "grad_norm": 0.07625679671764374, "learning_rate": 0.00025763963664345745, "loss": 0.0332, "step": 1131 }, { "epoch": 0.74, "grad_norm": 0.07967997342348099, "learning_rate": 0.00025756781116296617, "loss": 0.0431, "step": 1132 }, { "epoch": 0.74, "grad_norm": 0.14101997017860413, "learning_rate": 0.0002574959348712555, "loss": 0.0322, "step": 1133 }, { "epoch": 0.74, "grad_norm": 0.12365719676017761, "learning_rate": 0.00025742400780227724, "loss": 0.0205, "step": 1134 }, { "epoch": 0.74, "grad_norm": 0.14429523050785065, "learning_rate": 0.0002573520299900073, "loss": 0.069, "step": 1135 }, { "epoch": 0.74, "grad_norm": 0.021441614255309105, "learning_rate": 0.0002572800014684453, "loss": 0.0054, "step": 1136 }, { "epoch": 0.74, "grad_norm": 0.08611132204532623, "learning_rate": 0.0002572079222716151, "loss": 0.0442, "step": 1137 }, { "epoch": 0.75, "grad_norm": 0.09402936697006226, "learning_rate": 0.0002571357924335642, "loss": 0.0352, "step": 1138 }, { "epoch": 0.75, "grad_norm": 0.08581096678972244, "learning_rate": 0.00025706361198836437, "loss": 0.0149, "step": 1139 }, { "epoch": 0.75, "grad_norm": 0.0612567737698555, "learning_rate": 0.0002569913809701109, "loss": 0.014, "step": 1140 }, { "epoch": 0.75, "grad_norm": 0.10282464325428009, "learning_rate": 0.0002569190994129233, "loss": 0.0254, "step": 1141 }, { "epoch": 0.75, "grad_norm": 0.07298202067613602, "learning_rate": 0.00025684676735094475, "loss": 0.033, "step": 1142 }, { "epoch": 0.75, "grad_norm": 0.06616336852312088, "learning_rate": 0.0002567743848183423, "loss": 0.0127, "step": 1143 }, { "epoch": 0.75, "grad_norm": 0.09016578644514084, "learning_rate": 0.000256701951849307, "loss": 0.0248, "step": 1144 }, { "epoch": 0.75, "grad_norm": 0.09605623781681061, "learning_rate": 0.0002566294684780536, "loss": 0.0554, "step": 1145 }, { "epoch": 0.75, "grad_norm": 0.13209934532642365, "learning_rate": 0.0002565569347388206, "loss": 0.0437, "step": 1146 }, { "epoch": 0.75, "eval_loss": 0.030348777770996094, "eval_runtime": 39.9058, "eval_samples_per_second": 32.251, "eval_steps_per_second": 8.069, "step": 1146 }, { "epoch": 0.75, "grad_norm": 0.13489413261413574, "learning_rate": 0.0002564843506658704, "loss": 0.0214, "step": 1147 }, { "epoch": 0.75, "grad_norm": 0.036875851452350616, "learning_rate": 0.00025641171629348916, "loss": 0.0075, "step": 1148 }, { "epoch": 0.75, "grad_norm": 0.04911373555660248, "learning_rate": 0.0002563390316559868, "loss": 0.0331, "step": 1149 }, { "epoch": 0.75, "grad_norm": 0.02945212461054325, "learning_rate": 0.0002562662967876969, "loss": 0.0044, "step": 1150 }, { "epoch": 0.75, "grad_norm": 0.09545271843671799, "learning_rate": 0.00025619351172297686, "loss": 0.0342, "step": 1151 }, { "epoch": 0.75, "grad_norm": 0.034161727875471115, "learning_rate": 0.0002561206764962079, "loss": 0.0064, "step": 1152 }, { "epoch": 0.75, "grad_norm": 0.17162153124809265, "learning_rate": 0.00025604779114179457, "loss": 0.0305, "step": 1153 }, { "epoch": 0.76, "grad_norm": 0.10241468250751495, "learning_rate": 0.0002559748556941654, "loss": 0.0143, "step": 1154 }, { "epoch": 0.76, "grad_norm": 0.19089680910110474, "learning_rate": 0.0002559018701877726, "loss": 0.0192, "step": 1155 }, { "epoch": 0.76, "grad_norm": 0.19189144670963287, "learning_rate": 0.0002558288346570918, "loss": 0.0385, "step": 1156 }, { "epoch": 0.76, "grad_norm": 0.023649632930755615, "learning_rate": 0.00025575574913662256, "loss": 0.0043, "step": 1157 }, { "epoch": 0.76, "grad_norm": 0.20011720061302185, "learning_rate": 0.0002556826136608877, "loss": 0.0361, "step": 1158 }, { "epoch": 0.76, "grad_norm": 0.3903810679912567, "learning_rate": 0.00025560942826443396, "loss": 0.1086, "step": 1159 }, { "epoch": 0.76, "grad_norm": 0.0918634682893753, "learning_rate": 0.0002555361929818315, "loss": 0.0237, "step": 1160 }, { "epoch": 0.76, "grad_norm": 0.11210468411445618, "learning_rate": 0.00025546290784767407, "loss": 0.0432, "step": 1161 }, { "epoch": 0.76, "grad_norm": 0.10598167777061462, "learning_rate": 0.000255389572896579, "loss": 0.0304, "step": 1162 }, { "epoch": 0.76, "grad_norm": 0.03547512739896774, "learning_rate": 0.00025531618816318697, "loss": 0.014, "step": 1163 }, { "epoch": 0.76, "grad_norm": 0.08146083354949951, "learning_rate": 0.00025524275368216245, "loss": 0.0122, "step": 1164 }, { "epoch": 0.76, "grad_norm": 0.046655625104904175, "learning_rate": 0.00025516926948819334, "loss": 0.0151, "step": 1165 }, { "epoch": 0.76, "grad_norm": 0.09417696297168732, "learning_rate": 0.0002550957356159908, "loss": 0.047, "step": 1166 }, { "epoch": 0.76, "grad_norm": 0.08695515990257263, "learning_rate": 0.00025502215210028976, "loss": 0.0363, "step": 1167 }, { "epoch": 0.76, "grad_norm": 0.05286262556910515, "learning_rate": 0.0002549485189758485, "loss": 0.0331, "step": 1168 }, { "epoch": 0.77, "grad_norm": 0.1305568516254425, "learning_rate": 0.0002548748362774485, "loss": 0.0552, "step": 1169 }, { "epoch": 0.77, "grad_norm": 0.15096144378185272, "learning_rate": 0.000254801104039895, "loss": 0.0341, "step": 1170 }, { "epoch": 0.77, "grad_norm": 0.07643090933561325, "learning_rate": 0.0002547273222980165, "loss": 0.0234, "step": 1171 }, { "epoch": 0.77, "grad_norm": 0.052111852914094925, "learning_rate": 0.0002546534910866648, "loss": 0.0278, "step": 1172 }, { "epoch": 0.77, "grad_norm": 0.15109075605869293, "learning_rate": 0.00025457961044071523, "loss": 0.039, "step": 1173 }, { "epoch": 0.77, "grad_norm": 0.05562788248062134, "learning_rate": 0.00025450568039506633, "loss": 0.0214, "step": 1174 }, { "epoch": 0.77, "grad_norm": 0.1751837581396103, "learning_rate": 0.00025443170098464, "loss": 0.0401, "step": 1175 }, { "epoch": 0.77, "grad_norm": 0.19507139921188354, "learning_rate": 0.0002543576722443816, "loss": 0.0331, "step": 1176 }, { "epoch": 0.77, "grad_norm": 0.10975005477666855, "learning_rate": 0.00025428359420925966, "loss": 0.0155, "step": 1177 }, { "epoch": 0.77, "grad_norm": 0.1416396051645279, "learning_rate": 0.00025420946691426586, "loss": 0.0473, "step": 1178 }, { "epoch": 0.77, "grad_norm": 0.03987191617488861, "learning_rate": 0.0002541352903944155, "loss": 0.0069, "step": 1179 }, { "epoch": 0.77, "grad_norm": 0.34085920453071594, "learning_rate": 0.00025406106468474685, "loss": 0.0919, "step": 1180 }, { "epoch": 0.77, "grad_norm": 0.06129152700304985, "learning_rate": 0.0002539867898203215, "loss": 0.0129, "step": 1181 }, { "epoch": 0.77, "grad_norm": 0.08059722930192947, "learning_rate": 0.00025391246583622427, "loss": 0.0172, "step": 1182 }, { "epoch": 0.77, "grad_norm": 0.12509244680404663, "learning_rate": 0.0002538380927675632, "loss": 0.0881, "step": 1183 }, { "epoch": 0.78, "grad_norm": 0.21917979419231415, "learning_rate": 0.00025376367064946945, "loss": 0.0438, "step": 1184 }, { "epoch": 0.78, "grad_norm": 0.05029948800802231, "learning_rate": 0.0002536891995170974, "loss": 0.0102, "step": 1185 }, { "epoch": 0.78, "grad_norm": 0.027424413710832596, "learning_rate": 0.00025361467940562463, "loss": 0.0053, "step": 1186 }, { "epoch": 0.78, "grad_norm": 0.0775713250041008, "learning_rate": 0.0002535401103502517, "loss": 0.0329, "step": 1187 }, { "epoch": 0.78, "grad_norm": 0.12953567504882812, "learning_rate": 0.0002534654923862025, "loss": 0.0371, "step": 1188 }, { "epoch": 0.78, "grad_norm": 0.07097966223955154, "learning_rate": 0.00025339082554872377, "loss": 0.0165, "step": 1189 }, { "epoch": 0.78, "grad_norm": 0.1304195523262024, "learning_rate": 0.0002533161098730856, "loss": 0.0386, "step": 1190 }, { "epoch": 0.78, "grad_norm": 0.06887423247098923, "learning_rate": 0.00025324134539458096, "loss": 0.0221, "step": 1191 }, { "epoch": 0.78, "grad_norm": 0.08637112379074097, "learning_rate": 0.00025316653214852596, "loss": 0.0341, "step": 1192 }, { "epoch": 0.78, "grad_norm": 0.04632532596588135, "learning_rate": 0.0002530916701702597, "loss": 0.0094, "step": 1193 }, { "epoch": 0.78, "grad_norm": 0.11397617310285568, "learning_rate": 0.00025301675949514435, "loss": 0.0167, "step": 1194 }, { "epoch": 0.78, "grad_norm": 0.04785558953881264, "learning_rate": 0.000252941800158565, "loss": 0.0189, "step": 1195 }, { "epoch": 0.78, "grad_norm": 0.24082554876804352, "learning_rate": 0.00025286679219593, "loss": 0.0472, "step": 1196 }, { "epoch": 0.78, "grad_norm": 0.14454412460327148, "learning_rate": 0.00025279173564267014, "loss": 0.0521, "step": 1197 }, { "epoch": 0.78, "grad_norm": 0.16198396682739258, "learning_rate": 0.00025271663053423967, "loss": 0.0606, "step": 1198 }, { "epoch": 0.78, "grad_norm": 0.114061638712883, "learning_rate": 0.0002526414769061155, "loss": 0.012, "step": 1199 }, { "epoch": 0.79, "grad_norm": 0.1736219972372055, "learning_rate": 0.00025256627479379755, "loss": 0.0516, "step": 1200 }, { "epoch": 0.79, "grad_norm": 0.04280832037329674, "learning_rate": 0.0002524910242328087, "loss": 0.0073, "step": 1201 }, { "epoch": 0.79, "grad_norm": 0.13054266571998596, "learning_rate": 0.0002524157252586946, "loss": 0.0295, "step": 1202 }, { "epoch": 0.79, "grad_norm": 0.24452893435955048, "learning_rate": 0.00025234037790702375, "loss": 0.0856, "step": 1203 }, { "epoch": 0.79, "grad_norm": 0.05776005983352661, "learning_rate": 0.0002522649822133877, "loss": 0.0152, "step": 1204 }, { "epoch": 0.79, "grad_norm": 0.27971917390823364, "learning_rate": 0.0002521895382134006, "loss": 0.1183, "step": 1205 }, { "epoch": 0.79, "grad_norm": 0.04905636981129646, "learning_rate": 0.0002521140459426995, "loss": 0.0126, "step": 1206 }, { "epoch": 0.79, "grad_norm": 0.15006506443023682, "learning_rate": 0.0002520385054369444, "loss": 0.0811, "step": 1207 }, { "epoch": 0.79, "grad_norm": 0.15131042897701263, "learning_rate": 0.00025196291673181784, "loss": 0.0401, "step": 1208 }, { "epoch": 0.79, "grad_norm": 0.1603415459394455, "learning_rate": 0.0002518872798630253, "loss": 0.0448, "step": 1209 }, { "epoch": 0.79, "grad_norm": 0.07513672858476639, "learning_rate": 0.0002518115948662949, "loss": 0.0401, "step": 1210 }, { "epoch": 0.79, "grad_norm": 0.11225542426109314, "learning_rate": 0.0002517358617773776, "loss": 0.039, "step": 1211 }, { "epoch": 0.79, "grad_norm": 0.0876198261976242, "learning_rate": 0.000251660080632047, "loss": 0.0239, "step": 1212 }, { "epoch": 0.79, "grad_norm": 0.1050589308142662, "learning_rate": 0.0002515842514660994, "loss": 0.0258, "step": 1213 }, { "epoch": 0.79, "grad_norm": 0.0426226444542408, "learning_rate": 0.0002515083743153539, "loss": 0.0111, "step": 1214 }, { "epoch": 0.8, "grad_norm": 0.09025552123785019, "learning_rate": 0.00025143244921565214, "loss": 0.0185, "step": 1215 }, { "epoch": 0.8, "grad_norm": 0.12371645122766495, "learning_rate": 0.00025135647620285834, "loss": 0.0326, "step": 1216 }, { "epoch": 0.8, "grad_norm": 0.07417233288288116, "learning_rate": 0.0002512804553128596, "loss": 0.0238, "step": 1217 }, { "epoch": 0.8, "grad_norm": 0.10499947518110275, "learning_rate": 0.0002512043865815654, "loss": 0.0464, "step": 1218 }, { "epoch": 0.8, "grad_norm": 0.16344919800758362, "learning_rate": 0.00025112827004490797, "loss": 0.0373, "step": 1219 }, { "epoch": 0.8, "grad_norm": 0.0862027183175087, "learning_rate": 0.00025105210573884203, "loss": 0.0178, "step": 1220 }, { "epoch": 0.8, "grad_norm": 0.10541030019521713, "learning_rate": 0.0002509758936993449, "loss": 0.0377, "step": 1221 }, { "epoch": 0.8, "grad_norm": 0.05190376564860344, "learning_rate": 0.00025089963396241643, "loss": 0.0099, "step": 1222 }, { "epoch": 0.8, "grad_norm": 0.09249959141016006, "learning_rate": 0.00025082332656407906, "loss": 0.0157, "step": 1223 }, { "epoch": 0.8, "grad_norm": 0.02348952367901802, "learning_rate": 0.00025074697154037765, "loss": 0.0041, "step": 1224 }, { "epoch": 0.8, "grad_norm": 0.12875327467918396, "learning_rate": 0.0002506705689273797, "loss": 0.0173, "step": 1225 }, { "epoch": 0.8, "grad_norm": 0.13971397280693054, "learning_rate": 0.0002505941187611749, "loss": 0.0381, "step": 1226 }, { "epoch": 0.8, "grad_norm": 0.21139316260814667, "learning_rate": 0.00025051762107787583, "loss": 0.0399, "step": 1227 }, { "epoch": 0.8, "grad_norm": 0.10346369445323944, "learning_rate": 0.0002504410759136171, "loss": 0.031, "step": 1228 }, { "epoch": 0.8, "grad_norm": 0.021524077281355858, "learning_rate": 0.00025036448330455603, "loss": 0.0041, "step": 1229 }, { "epoch": 0.81, "grad_norm": 0.21078258752822876, "learning_rate": 0.0002502878432868722, "loss": 0.0291, "step": 1230 }, { "epoch": 0.81, "grad_norm": 0.28720253705978394, "learning_rate": 0.00025021115589676774, "loss": 0.0318, "step": 1231 }, { "epoch": 0.81, "grad_norm": 0.2182384580373764, "learning_rate": 0.00025013442117046694, "loss": 0.0407, "step": 1232 }, { "epoch": 0.81, "grad_norm": 0.1223733052611351, "learning_rate": 0.0002500576391442166, "loss": 0.0189, "step": 1233 }, { "epoch": 0.81, "grad_norm": 0.1699313372373581, "learning_rate": 0.0002499808098542858, "loss": 0.1081, "step": 1234 }, { "epoch": 0.81, "grad_norm": 0.21604309976100922, "learning_rate": 0.00024990393333696603, "loss": 0.0406, "step": 1235 }, { "epoch": 0.81, "grad_norm": 0.11065655201673508, "learning_rate": 0.00024982700962857094, "loss": 0.0274, "step": 1236 }, { "epoch": 0.81, "grad_norm": 0.10013590008020401, "learning_rate": 0.0002497500387654367, "loss": 0.0138, "step": 1237 }, { "epoch": 0.81, "grad_norm": 0.03474019467830658, "learning_rate": 0.0002496730207839215, "loss": 0.0067, "step": 1238 }, { "epoch": 0.81, "grad_norm": 0.1373460739850998, "learning_rate": 0.00024959595572040594, "loss": 0.0382, "step": 1239 }, { "epoch": 0.81, "grad_norm": 0.1674460619688034, "learning_rate": 0.0002495188436112928, "loss": 0.0187, "step": 1240 }, { "epoch": 0.81, "grad_norm": 0.056852634996175766, "learning_rate": 0.0002494416844930072, "loss": 0.02, "step": 1241 }, { "epoch": 0.81, "grad_norm": 0.1567879319190979, "learning_rate": 0.00024936447840199626, "loss": 0.0488, "step": 1242 }, { "epoch": 0.81, "grad_norm": 0.19893474876880646, "learning_rate": 0.0002492872253747294, "loss": 0.0382, "step": 1243 }, { "epoch": 0.81, "grad_norm": 0.07066723704338074, "learning_rate": 0.0002492099254476983, "loss": 0.0194, "step": 1244 }, { "epoch": 0.82, "grad_norm": 0.11466959118843079, "learning_rate": 0.00024913257865741663, "loss": 0.0367, "step": 1245 }, { "epoch": 0.82, "grad_norm": 0.08930857479572296, "learning_rate": 0.0002490551850404203, "loss": 0.0186, "step": 1246 }, { "epoch": 0.82, "grad_norm": 0.0905904471874237, "learning_rate": 0.0002489777446332673, "loss": 0.0349, "step": 1247 }, { "epoch": 0.82, "grad_norm": 0.225018709897995, "learning_rate": 0.0002489002574725378, "loss": 0.0579, "step": 1248 }, { "epoch": 0.82, "grad_norm": 0.15631456673145294, "learning_rate": 0.0002488227235948339, "loss": 0.0361, "step": 1249 }, { "epoch": 0.82, "grad_norm": 0.06862124055624008, "learning_rate": 0.0002487451430367798, "loss": 0.0351, "step": 1250 }, { "epoch": 0.82, "grad_norm": 0.10271900147199631, "learning_rate": 0.00024866751583502194, "loss": 0.0393, "step": 1251 }, { "epoch": 0.82, "grad_norm": 0.12624254822731018, "learning_rate": 0.0002485898420262286, "loss": 0.0309, "step": 1252 }, { "epoch": 0.82, "grad_norm": 0.116575688123703, "learning_rate": 0.00024851212164709013, "loss": 0.058, "step": 1253 }, { "epoch": 0.82, "grad_norm": 0.06756250560283661, "learning_rate": 0.00024843435473431886, "loss": 0.0335, "step": 1254 }, { "epoch": 0.82, "grad_norm": 0.20835717022418976, "learning_rate": 0.0002483565413246492, "loss": 0.0389, "step": 1255 }, { "epoch": 0.82, "grad_norm": 0.04360177740454674, "learning_rate": 0.0002482786814548374, "loss": 0.008, "step": 1256 }, { "epoch": 0.82, "grad_norm": 0.1068229153752327, "learning_rate": 0.0002482007751616616, "loss": 0.0304, "step": 1257 }, { "epoch": 0.82, "grad_norm": 0.04819338023662567, "learning_rate": 0.0002481228224819221, "loss": 0.0098, "step": 1258 }, { "epoch": 0.82, "grad_norm": 0.48405715823173523, "learning_rate": 0.00024804482345244105, "loss": 0.0348, "step": 1259 }, { "epoch": 0.82, "grad_norm": 0.09796518087387085, "learning_rate": 0.0002479667781100622, "loss": 0.0153, "step": 1260 }, { "epoch": 0.83, "grad_norm": 0.13171538710594177, "learning_rate": 0.0002478886864916516, "loss": 0.0316, "step": 1261 }, { "epoch": 0.83, "grad_norm": 0.0907411128282547, "learning_rate": 0.00024781054863409676, "loss": 0.0169, "step": 1262 }, { "epoch": 0.83, "grad_norm": 0.10159718245267868, "learning_rate": 0.00024773236457430745, "loss": 0.013, "step": 1263 }, { "epoch": 0.83, "grad_norm": 0.10823512077331543, "learning_rate": 0.00024765413434921495, "loss": 0.0252, "step": 1264 }, { "epoch": 0.83, "grad_norm": 0.07199376821517944, "learning_rate": 0.0002475758579957724, "loss": 0.0105, "step": 1265 }, { "epoch": 0.83, "grad_norm": 0.11216728389263153, "learning_rate": 0.0002474975355509549, "loss": 0.0339, "step": 1266 }, { "epoch": 0.83, "grad_norm": 0.16655175387859344, "learning_rate": 0.00024741916705175906, "loss": 0.0306, "step": 1267 }, { "epoch": 0.83, "grad_norm": 0.08566506952047348, "learning_rate": 0.00024734075253520345, "loss": 0.0329, "step": 1268 }, { "epoch": 0.83, "grad_norm": 0.1542367786169052, "learning_rate": 0.00024726229203832824, "loss": 0.0284, "step": 1269 }, { "epoch": 0.83, "grad_norm": 0.1685347855091095, "learning_rate": 0.00024718378559819554, "loss": 0.0385, "step": 1270 }, { "epoch": 0.83, "grad_norm": 0.1904221624135971, "learning_rate": 0.00024710523325188885, "loss": 0.0435, "step": 1271 }, { "epoch": 0.83, "grad_norm": 0.10915929824113846, "learning_rate": 0.00024702663503651357, "loss": 0.0129, "step": 1272 }, { "epoch": 0.83, "grad_norm": 0.04411763325333595, "learning_rate": 0.0002469479909891967, "loss": 0.0038, "step": 1273 }, { "epoch": 0.83, "grad_norm": 0.22485259175300598, "learning_rate": 0.0002468693011470869, "loss": 0.0456, "step": 1274 }, { "epoch": 0.83, "grad_norm": 0.10708510875701904, "learning_rate": 0.00024679056554735454, "loss": 0.0192, "step": 1275 }, { "epoch": 0.84, "grad_norm": 0.15084552764892578, "learning_rate": 0.00024671178422719137, "loss": 0.0293, "step": 1276 }, { "epoch": 0.84, "grad_norm": 0.14543551206588745, "learning_rate": 0.000246632957223811, "loss": 0.0666, "step": 1277 }, { "epoch": 0.84, "grad_norm": 0.1648811399936676, "learning_rate": 0.00024655408457444853, "loss": 0.0321, "step": 1278 }, { "epoch": 0.84, "grad_norm": 0.16748228669166565, "learning_rate": 0.00024647516631636055, "loss": 0.0373, "step": 1279 }, { "epoch": 0.84, "grad_norm": 0.04038754105567932, "learning_rate": 0.00024639620248682523, "loss": 0.0049, "step": 1280 }, { "epoch": 0.84, "grad_norm": 0.1675775945186615, "learning_rate": 0.00024631719312314234, "loss": 0.0517, "step": 1281 }, { "epoch": 0.84, "grad_norm": 0.227004274725914, "learning_rate": 0.00024623813826263303, "loss": 0.0445, "step": 1282 }, { "epoch": 0.84, "grad_norm": 0.05555510148406029, "learning_rate": 0.00024615903794264005, "loss": 0.0096, "step": 1283 }, { "epoch": 0.84, "grad_norm": 0.16279524564743042, "learning_rate": 0.00024607989220052766, "loss": 0.0452, "step": 1284 }, { "epoch": 0.84, "grad_norm": 0.22099511325359344, "learning_rate": 0.0002460007010736814, "loss": 0.0484, "step": 1285 }, { "epoch": 0.84, "grad_norm": 0.3313157558441162, "learning_rate": 0.00024592146459950835, "loss": 0.0798, "step": 1286 }, { "epoch": 0.84, "grad_norm": 0.1560799926519394, "learning_rate": 0.0002458421828154371, "loss": 0.0523, "step": 1287 }, { "epoch": 0.84, "grad_norm": 0.0924949198961258, "learning_rate": 0.0002457628557589174, "loss": 0.0416, "step": 1288 }, { "epoch": 0.84, "grad_norm": 0.061663124710321426, "learning_rate": 0.0002456834834674207, "loss": 0.0187, "step": 1289 }, { "epoch": 0.84, "grad_norm": 0.04804534092545509, "learning_rate": 0.0002456040659784396, "loss": 0.0236, "step": 1290 }, { "epoch": 0.85, "grad_norm": 0.09753583371639252, "learning_rate": 0.00024552460332948804, "loss": 0.0447, "step": 1291 }, { "epoch": 0.85, "grad_norm": 0.03994222357869148, "learning_rate": 0.0002454450955581015, "loss": 0.0098, "step": 1292 }, { "epoch": 0.85, "grad_norm": 0.12844492495059967, "learning_rate": 0.0002453655427018364, "loss": 0.0234, "step": 1293 }, { "epoch": 0.85, "grad_norm": 0.12967482209205627, "learning_rate": 0.000245285944798271, "loss": 0.0435, "step": 1294 }, { "epoch": 0.85, "grad_norm": 0.25114384293556213, "learning_rate": 0.00024520630188500423, "loss": 0.0539, "step": 1295 }, { "epoch": 0.85, "grad_norm": 0.25040391087532043, "learning_rate": 0.0002451266139996568, "loss": 0.037, "step": 1296 }, { "epoch": 0.85, "grad_norm": 0.21144863963127136, "learning_rate": 0.0002450468811798703, "loss": 0.0371, "step": 1297 }, { "epoch": 0.85, "grad_norm": 0.2176048457622528, "learning_rate": 0.00024496710346330776, "loss": 0.0311, "step": 1298 }, { "epoch": 0.85, "grad_norm": 0.06802531331777573, "learning_rate": 0.0002448872808876533, "loss": 0.0095, "step": 1299 }, { "epoch": 0.85, "grad_norm": 0.11026319861412048, "learning_rate": 0.0002448074134906123, "loss": 0.0132, "step": 1300 }, { "epoch": 0.85, "grad_norm": 0.2511361539363861, "learning_rate": 0.00024472750130991126, "loss": 0.0091, "step": 1301 }, { "epoch": 0.85, "grad_norm": 0.42377692461013794, "learning_rate": 0.0002446475443832979, "loss": 0.0669, "step": 1302 }, { "epoch": 0.85, "grad_norm": 0.587988555431366, "learning_rate": 0.000244567542748541, "loss": 0.0737, "step": 1303 }, { "epoch": 0.85, "grad_norm": 0.1163543239235878, "learning_rate": 0.0002444874964434305, "loss": 0.0151, "step": 1304 }, { "epoch": 0.85, "grad_norm": 0.036374811083078384, "learning_rate": 0.00024440740550577754, "loss": 0.0067, "step": 1305 }, { "epoch": 0.85, "grad_norm": 0.07870301604270935, "learning_rate": 0.00024432726997341403, "loss": 0.0191, "step": 1306 }, { "epoch": 0.86, "grad_norm": 0.09554275870323181, "learning_rate": 0.0002442470898841933, "loss": 0.0169, "step": 1307 }, { "epoch": 0.86, "grad_norm": 0.20255301892757416, "learning_rate": 0.0002441668652759896, "loss": 0.0404, "step": 1308 }, { "epoch": 0.86, "grad_norm": 0.015268395654857159, "learning_rate": 0.0002440865961866981, "loss": 0.002, "step": 1309 }, { "epoch": 0.86, "grad_norm": 0.08300946652889252, "learning_rate": 0.0002440062826542351, "loss": 0.0281, "step": 1310 }, { "epoch": 0.86, "grad_norm": 0.2025083601474762, "learning_rate": 0.00024392592471653786, "loss": 0.0407, "step": 1311 }, { "epoch": 0.86, "grad_norm": 0.01875820755958557, "learning_rate": 0.0002438455224115647, "loss": 0.0024, "step": 1312 }, { "epoch": 0.86, "grad_norm": 0.16822032630443573, "learning_rate": 0.0002437650757772947, "loss": 0.0356, "step": 1313 }, { "epoch": 0.86, "grad_norm": 0.307230681180954, "learning_rate": 0.0002436845848517281, "loss": 0.0277, "step": 1314 }, { "epoch": 0.86, "grad_norm": 0.21822179853916168, "learning_rate": 0.00024360404967288586, "loss": 0.0153, "step": 1315 }, { "epoch": 0.86, "grad_norm": 0.18913914263248444, "learning_rate": 0.00024352347027881003, "loss": 0.0664, "step": 1316 }, { "epoch": 0.86, "grad_norm": 0.26664435863494873, "learning_rate": 0.0002434428467075634, "loss": 0.0821, "step": 1317 }, { "epoch": 0.86, "grad_norm": 0.15764296054840088, "learning_rate": 0.00024336217899722967, "loss": 0.0663, "step": 1318 }, { "epoch": 0.86, "grad_norm": 0.09952249377965927, "learning_rate": 0.00024328146718591352, "loss": 0.0497, "step": 1319 }, { "epoch": 0.86, "grad_norm": 0.20798932015895844, "learning_rate": 0.00024320071131174022, "loss": 0.0448, "step": 1320 }, { "epoch": 0.86, "grad_norm": 0.2187434434890747, "learning_rate": 0.00024311991141285602, "loss": 0.0547, "step": 1321 }, { "epoch": 0.87, "grad_norm": 0.08128710836172104, "learning_rate": 0.00024303906752742797, "loss": 0.0232, "step": 1322 }, { "epoch": 0.87, "grad_norm": 0.1579497754573822, "learning_rate": 0.00024295817969364382, "loss": 0.0368, "step": 1323 }, { "epoch": 0.87, "grad_norm": 0.3530323803424835, "learning_rate": 0.00024287724794971207, "loss": 0.0543, "step": 1324 }, { "epoch": 0.87, "grad_norm": 0.13028964400291443, "learning_rate": 0.00024279627233386212, "loss": 0.0562, "step": 1325 }, { "epoch": 0.87, "grad_norm": 0.17670606076717377, "learning_rate": 0.00024271525288434385, "loss": 0.033, "step": 1326 }, { "epoch": 0.87, "grad_norm": 0.14736194908618927, "learning_rate": 0.00024263418963942808, "loss": 0.0403, "step": 1327 }, { "epoch": 0.87, "grad_norm": 0.2033924162387848, "learning_rate": 0.00024255308263740618, "loss": 0.0584, "step": 1328 }, { "epoch": 0.87, "grad_norm": 0.08926638215780258, "learning_rate": 0.00024247193191659016, "loss": 0.0368, "step": 1329 }, { "epoch": 0.87, "grad_norm": 0.09010445326566696, "learning_rate": 0.0002423907375153128, "loss": 0.0313, "step": 1330 }, { "epoch": 0.87, "grad_norm": 0.07403320074081421, "learning_rate": 0.00024230949947192748, "loss": 0.0146, "step": 1331 }, { "epoch": 0.87, "grad_norm": 0.11623091250658035, "learning_rate": 0.00024222821782480812, "loss": 0.0308, "step": 1332 }, { "epoch": 0.87, "grad_norm": 0.20798785984516144, "learning_rate": 0.0002421468926123493, "loss": 0.0447, "step": 1333 }, { "epoch": 0.87, "grad_norm": 0.12543538212776184, "learning_rate": 0.00024206552387296621, "loss": 0.0438, "step": 1334 }, { "epoch": 0.87, "grad_norm": 0.12966863811016083, "learning_rate": 0.00024198411164509447, "loss": 0.0453, "step": 1335 }, { "epoch": 0.87, "grad_norm": 0.05985172837972641, "learning_rate": 0.00024190265596719043, "loss": 0.0102, "step": 1336 }, { "epoch": 0.88, "grad_norm": 0.14263281226158142, "learning_rate": 0.00024182115687773075, "loss": 0.0544, "step": 1337 }, { "epoch": 0.88, "grad_norm": 0.190725639462471, "learning_rate": 0.00024173961441521284, "loss": 0.0265, "step": 1338 }, { "epoch": 0.88, "grad_norm": 0.29231366515159607, "learning_rate": 0.00024165802861815435, "loss": 0.0684, "step": 1339 }, { "epoch": 0.88, "grad_norm": 0.13645826280117035, "learning_rate": 0.00024157639952509356, "loss": 0.0577, "step": 1340 }, { "epoch": 0.88, "grad_norm": 0.15891732275485992, "learning_rate": 0.0002414947271745892, "loss": 0.0455, "step": 1341 }, { "epoch": 0.88, "grad_norm": 0.2538587152957916, "learning_rate": 0.00024141301160522037, "loss": 0.0566, "step": 1342 }, { "epoch": 0.88, "grad_norm": 0.08588481694459915, "learning_rate": 0.00024133125285558658, "loss": 0.0265, "step": 1343 }, { "epoch": 0.88, "grad_norm": 0.1366318315267563, "learning_rate": 0.00024124945096430775, "loss": 0.0209, "step": 1344 }, { "epoch": 0.88, "grad_norm": 0.12919899821281433, "learning_rate": 0.00024116760597002427, "loss": 0.0358, "step": 1345 }, { "epoch": 0.88, "grad_norm": 0.1527070701122284, "learning_rate": 0.0002410857179113967, "loss": 0.0584, "step": 1346 }, { "epoch": 0.88, "grad_norm": 0.1441652625799179, "learning_rate": 0.00024100378682710618, "loss": 0.026, "step": 1347 }, { "epoch": 0.88, "grad_norm": 0.0560770146548748, "learning_rate": 0.00024092181275585397, "loss": 0.0126, "step": 1348 }, { "epoch": 0.88, "grad_norm": 0.23829127848148346, "learning_rate": 0.00024083979573636172, "loss": 0.0492, "step": 1349 }, { "epoch": 0.88, "grad_norm": 0.22331084311008453, "learning_rate": 0.00024075773580737138, "loss": 0.0374, "step": 1350 }, { "epoch": 0.88, "grad_norm": 0.16740433871746063, "learning_rate": 0.0002406756330076452, "loss": 0.033, "step": 1351 }, { "epoch": 0.89, "grad_norm": 0.12624043226242065, "learning_rate": 0.0002405934873759655, "loss": 0.0254, "step": 1352 }, { "epoch": 0.89, "grad_norm": 0.2925248444080353, "learning_rate": 0.00024051129895113506, "loss": 0.0966, "step": 1353 }, { "epoch": 0.89, "grad_norm": 0.050702452659606934, "learning_rate": 0.00024042906777197676, "loss": 0.0058, "step": 1354 }, { "epoch": 0.89, "grad_norm": 0.11182265728712082, "learning_rate": 0.00024034679387733367, "loss": 0.0209, "step": 1355 }, { "epoch": 0.89, "grad_norm": 0.07762409001588821, "learning_rate": 0.00024026447730606911, "loss": 0.0117, "step": 1356 }, { "epoch": 0.89, "grad_norm": 0.05566919595003128, "learning_rate": 0.00024018211809706652, "loss": 0.012, "step": 1357 }, { "epoch": 0.89, "grad_norm": 0.026816535741090775, "learning_rate": 0.00024009971628922937, "loss": 0.0058, "step": 1358 }, { "epoch": 0.89, "grad_norm": 0.14158879220485687, "learning_rate": 0.0002400172719214814, "loss": 0.0242, "step": 1359 }, { "epoch": 0.89, "grad_norm": 0.10178912431001663, "learning_rate": 0.0002399347850327664, "loss": 0.0144, "step": 1360 }, { "epoch": 0.89, "grad_norm": 0.2671686112880707, "learning_rate": 0.00023985225566204834, "loss": 0.1116, "step": 1361 }, { "epoch": 0.89, "grad_norm": 0.2026364952325821, "learning_rate": 0.00023976968384831107, "loss": 0.0511, "step": 1362 }, { "epoch": 0.89, "grad_norm": 0.046000707894563675, "learning_rate": 0.0002396870696305586, "loss": 0.0089, "step": 1363 }, { "epoch": 0.89, "grad_norm": 0.243350088596344, "learning_rate": 0.00023960441304781495, "loss": 0.0376, "step": 1364 }, { "epoch": 0.89, "grad_norm": 0.053250979632139206, "learning_rate": 0.0002395217141391242, "loss": 0.008, "step": 1365 }, { "epoch": 0.89, "grad_norm": 0.08489834517240524, "learning_rate": 0.0002394389729435503, "loss": 0.0216, "step": 1366 }, { "epoch": 0.89, "grad_norm": 0.09859486669301987, "learning_rate": 0.00023935618950017738, "loss": 0.0253, "step": 1367 }, { "epoch": 0.9, "grad_norm": 0.11453449726104736, "learning_rate": 0.00023927336384810933, "loss": 0.0414, "step": 1368 }, { "epoch": 0.9, "grad_norm": 0.1473090499639511, "learning_rate": 0.00023919049602647005, "loss": 0.0365, "step": 1369 }, { "epoch": 0.9, "grad_norm": 0.12153466045856476, "learning_rate": 0.00023910758607440335, "loss": 0.0314, "step": 1370 }, { "epoch": 0.9, "grad_norm": 0.17143134772777557, "learning_rate": 0.000239024634031073, "loss": 0.0928, "step": 1371 }, { "epoch": 0.9, "grad_norm": 0.11081311106681824, "learning_rate": 0.00023894163993566257, "loss": 0.0535, "step": 1372 }, { "epoch": 0.9, "grad_norm": 0.13488808274269104, "learning_rate": 0.0002388586038273755, "loss": 0.0321, "step": 1373 }, { "epoch": 0.9, "grad_norm": 0.0592711940407753, "learning_rate": 0.0002387755257454352, "loss": 0.01, "step": 1374 }, { "epoch": 0.9, "grad_norm": 0.09835012257099152, "learning_rate": 0.00023869240572908467, "loss": 0.0295, "step": 1375 }, { "epoch": 0.9, "grad_norm": 0.071134053170681, "learning_rate": 0.000238609243817587, "loss": 0.0243, "step": 1376 }, { "epoch": 0.9, "grad_norm": 0.14431652426719666, "learning_rate": 0.0002385260400502248, "loss": 0.0344, "step": 1377 }, { "epoch": 0.9, "grad_norm": 0.10391832143068314, "learning_rate": 0.00023844279446630067, "loss": 0.0231, "step": 1378 }, { "epoch": 0.9, "grad_norm": 0.07357161492109299, "learning_rate": 0.00023835950710513677, "loss": 0.0163, "step": 1379 }, { "epoch": 0.9, "grad_norm": 0.16738182306289673, "learning_rate": 0.00023827617800607523, "loss": 0.0423, "step": 1380 }, { "epoch": 0.9, "grad_norm": 0.07547144591808319, "learning_rate": 0.00023819280720847774, "loss": 0.0273, "step": 1381 }, { "epoch": 0.9, "grad_norm": 0.10503777116537094, "learning_rate": 0.0002381093947517256, "loss": 0.0192, "step": 1382 }, { "epoch": 0.91, "grad_norm": 0.0630551353096962, "learning_rate": 0.00023802594067521998, "loss": 0.0115, "step": 1383 }, { "epoch": 0.91, "grad_norm": 0.02077486738562584, "learning_rate": 0.00023794244501838162, "loss": 0.0045, "step": 1384 }, { "epoch": 0.91, "grad_norm": 0.09841371327638626, "learning_rate": 0.00023785890782065087, "loss": 0.0242, "step": 1385 }, { "epoch": 0.91, "grad_norm": 0.21591459214687347, "learning_rate": 0.00023777532912148781, "loss": 0.0237, "step": 1386 }, { "epoch": 0.91, "grad_norm": 0.03989405184984207, "learning_rate": 0.000237691708960372, "loss": 0.0051, "step": 1387 }, { "epoch": 0.91, "grad_norm": 0.12305942177772522, "learning_rate": 0.0002376080473768026, "loss": 0.0264, "step": 1388 }, { "epoch": 0.91, "grad_norm": 0.14408881962299347, "learning_rate": 0.00023752434441029848, "loss": 0.0322, "step": 1389 }, { "epoch": 0.91, "grad_norm": 0.04419580101966858, "learning_rate": 0.00023744060010039784, "loss": 0.0073, "step": 1390 }, { "epoch": 0.91, "grad_norm": 0.18515107035636902, "learning_rate": 0.0002373568144866586, "loss": 0.0465, "step": 1391 }, { "epoch": 0.91, "grad_norm": 0.048167865723371506, "learning_rate": 0.00023727298760865812, "loss": 0.0138, "step": 1392 }, { "epoch": 0.91, "grad_norm": 0.08519299328327179, "learning_rate": 0.0002371891195059932, "loss": 0.0095, "step": 1393 }, { "epoch": 0.91, "grad_norm": 0.21691879630088806, "learning_rate": 0.00023710521021828016, "loss": 0.0381, "step": 1394 }, { "epoch": 0.91, "grad_norm": 0.09678614884614944, "learning_rate": 0.00023702125978515478, "loss": 0.0099, "step": 1395 }, { "epoch": 0.91, "grad_norm": 0.08847987651824951, "learning_rate": 0.0002369372682462723, "loss": 0.0165, "step": 1396 }, { "epoch": 0.91, "grad_norm": 0.03246233984827995, "learning_rate": 0.0002368532356413073, "loss": 0.0058, "step": 1397 }, { "epoch": 0.92, "grad_norm": 0.07045282423496246, "learning_rate": 0.00023676916200995386, "loss": 0.0164, "step": 1398 }, { "epoch": 0.92, "grad_norm": 0.05581701174378395, "learning_rate": 0.00023668504739192528, "loss": 0.0152, "step": 1399 }, { "epoch": 0.92, "grad_norm": 0.15774132311344147, "learning_rate": 0.0002366008918269544, "loss": 0.0243, "step": 1400 }, { "epoch": 0.92, "grad_norm": 0.172657772898674, "learning_rate": 0.00023651669535479334, "loss": 0.0184, "step": 1401 }, { "epoch": 0.92, "grad_norm": 0.08128032833337784, "learning_rate": 0.0002364324580152135, "loss": 0.0186, "step": 1402 }, { "epoch": 0.92, "grad_norm": 0.06358969956636429, "learning_rate": 0.00023634817984800554, "loss": 0.0102, "step": 1403 }, { "epoch": 0.92, "grad_norm": 0.31414860486984253, "learning_rate": 0.00023626386089297958, "loss": 0.0514, "step": 1404 }, { "epoch": 0.92, "grad_norm": 0.22831489145755768, "learning_rate": 0.00023617950118996487, "loss": 0.0323, "step": 1405 }, { "epoch": 0.92, "grad_norm": 0.048902370035648346, "learning_rate": 0.00023609510077880996, "loss": 0.0033, "step": 1406 }, { "epoch": 0.92, "grad_norm": 0.03278432413935661, "learning_rate": 0.00023601065969938262, "loss": 0.0031, "step": 1407 }, { "epoch": 0.92, "grad_norm": 0.09343546628952026, "learning_rate": 0.00023592617799156977, "loss": 0.0199, "step": 1408 }, { "epoch": 0.92, "grad_norm": 0.0755714625120163, "learning_rate": 0.00023584165569527757, "loss": 0.0086, "step": 1409 }, { "epoch": 0.92, "grad_norm": 0.28567177057266235, "learning_rate": 0.00023575709285043138, "loss": 0.0256, "step": 1410 }, { "epoch": 0.92, "grad_norm": 0.1896996796131134, "learning_rate": 0.0002356724894969757, "loss": 0.0291, "step": 1411 }, { "epoch": 0.92, "grad_norm": 0.1428869366645813, "learning_rate": 0.0002355878456748742, "loss": 0.0574, "step": 1412 }, { "epoch": 0.93, "grad_norm": 0.25432294607162476, "learning_rate": 0.0002355031614241095, "loss": 0.0433, "step": 1413 }, { "epoch": 0.93, "grad_norm": 0.2577909231185913, "learning_rate": 0.00023541843678468355, "loss": 0.0376, "step": 1414 }, { "epoch": 0.93, "grad_norm": 0.1479143500328064, "learning_rate": 0.0002353336717966172, "loss": 0.0248, "step": 1415 }, { "epoch": 0.93, "grad_norm": 0.058144185692071915, "learning_rate": 0.00023524886649995043, "loss": 0.0102, "step": 1416 }, { "epoch": 0.93, "grad_norm": 0.18476702272891998, "learning_rate": 0.00023516402093474225, "loss": 0.0658, "step": 1417 }, { "epoch": 0.93, "grad_norm": 0.1367078274488449, "learning_rate": 0.00023507913514107074, "loss": 0.0228, "step": 1418 }, { "epoch": 0.93, "grad_norm": 0.05217135697603226, "learning_rate": 0.00023499420915903293, "loss": 0.0117, "step": 1419 }, { "epoch": 0.93, "grad_norm": 0.311260461807251, "learning_rate": 0.00023490924302874478, "loss": 0.0945, "step": 1420 }, { "epoch": 0.93, "grad_norm": 0.06179346889257431, "learning_rate": 0.00023482423679034134, "loss": 0.0102, "step": 1421 }, { "epoch": 0.93, "grad_norm": 0.0694802924990654, "learning_rate": 0.00023473919048397652, "loss": 0.0187, "step": 1422 }, { "epoch": 0.93, "grad_norm": 0.09105714410543442, "learning_rate": 0.00023465410414982317, "loss": 0.0245, "step": 1423 }, { "epoch": 0.93, "grad_norm": 0.10562916845083237, "learning_rate": 0.0002345689778280731, "loss": 0.0296, "step": 1424 }, { "epoch": 0.93, "grad_norm": 0.07471620291471481, "learning_rate": 0.00023448381155893695, "loss": 0.0288, "step": 1425 }, { "epoch": 0.93, "grad_norm": 0.11771635711193085, "learning_rate": 0.0002343986053826442, "loss": 0.0165, "step": 1426 }, { "epoch": 0.93, "grad_norm": 0.056794993579387665, "learning_rate": 0.00023431335933944323, "loss": 0.02, "step": 1427 }, { "epoch": 0.93, "grad_norm": 0.10688856244087219, "learning_rate": 0.00023422807346960131, "loss": 0.037, "step": 1428 }, { "epoch": 0.94, "grad_norm": 0.10420051217079163, "learning_rate": 0.00023414274781340442, "loss": 0.0211, "step": 1429 }, { "epoch": 0.94, "grad_norm": 0.09319007396697998, "learning_rate": 0.00023405738241115737, "loss": 0.0324, "step": 1430 }, { "epoch": 0.94, "grad_norm": 0.11446485668420792, "learning_rate": 0.00023397197730318377, "loss": 0.0381, "step": 1431 }, { "epoch": 0.94, "grad_norm": 0.10845956206321716, "learning_rate": 0.00023388653252982594, "loss": 0.0171, "step": 1432 }, { "epoch": 0.94, "grad_norm": 0.08544383198022842, "learning_rate": 0.000233801048131445, "loss": 0.0239, "step": 1433 }, { "epoch": 0.94, "grad_norm": 0.10372909903526306, "learning_rate": 0.0002337155241484207, "loss": 0.0429, "step": 1434 }, { "epoch": 0.94, "grad_norm": 0.24167174100875854, "learning_rate": 0.00023362996062115154, "loss": 0.1291, "step": 1435 }, { "epoch": 0.94, "grad_norm": 0.10461205989122391, "learning_rate": 0.00023354435759005473, "loss": 0.0385, "step": 1436 }, { "epoch": 0.94, "grad_norm": 0.14408838748931885, "learning_rate": 0.0002334587150955661, "loss": 0.0377, "step": 1437 }, { "epoch": 0.94, "grad_norm": 0.08705660700798035, "learning_rate": 0.0002333730331781401, "loss": 0.0169, "step": 1438 }, { "epoch": 0.94, "grad_norm": 0.10698029398918152, "learning_rate": 0.00023328731187824986, "loss": 0.0383, "step": 1439 }, { "epoch": 0.94, "grad_norm": 0.18005134165287018, "learning_rate": 0.0002332015512363871, "loss": 0.0408, "step": 1440 }, { "epoch": 0.94, "grad_norm": 0.11144935339689255, "learning_rate": 0.00023311575129306202, "loss": 0.0434, "step": 1441 }, { "epoch": 0.94, "grad_norm": 0.09303693473339081, "learning_rate": 0.0002330299120888035, "loss": 0.0259, "step": 1442 }, { "epoch": 0.94, "grad_norm": 0.09196025878190994, "learning_rate": 0.00023294403366415904, "loss": 0.0256, "step": 1443 }, { "epoch": 0.95, "grad_norm": 0.09944535046815872, "learning_rate": 0.00023285811605969442, "loss": 0.0691, "step": 1444 }, { "epoch": 0.95, "grad_norm": 0.0766916275024414, "learning_rate": 0.00023277215931599417, "loss": 0.0162, "step": 1445 }, { "epoch": 0.95, "grad_norm": 0.0471719354391098, "learning_rate": 0.00023268616347366114, "loss": 0.0157, "step": 1446 }, { "epoch": 0.95, "grad_norm": 0.0748835876584053, "learning_rate": 0.0002326001285733168, "loss": 0.0162, "step": 1447 }, { "epoch": 0.95, "grad_norm": 0.2493734508752823, "learning_rate": 0.0002325140546556009, "loss": 0.0908, "step": 1448 }, { "epoch": 0.95, "grad_norm": 0.1908605992794037, "learning_rate": 0.0002324279417611717, "loss": 0.0352, "step": 1449 }, { "epoch": 0.95, "grad_norm": 0.16963645815849304, "learning_rate": 0.00023234178993070595, "loss": 0.0597, "step": 1450 }, { "epoch": 0.95, "grad_norm": 0.1448785662651062, "learning_rate": 0.0002322555992048987, "loss": 0.0341, "step": 1451 }, { "epoch": 0.95, "grad_norm": 0.11966606229543686, "learning_rate": 0.00023216936962446334, "loss": 0.0447, "step": 1452 }, { "epoch": 0.95, "grad_norm": 0.06863813102245331, "learning_rate": 0.00023208310123013176, "loss": 0.0184, "step": 1453 }, { "epoch": 0.95, "grad_norm": 0.08081576228141785, "learning_rate": 0.000231996794062654, "loss": 0.0183, "step": 1454 }, { "epoch": 0.95, "grad_norm": 0.04790128767490387, "learning_rate": 0.00023191044816279856, "loss": 0.0159, "step": 1455 }, { "epoch": 0.95, "grad_norm": 0.11623428761959076, "learning_rate": 0.00023182406357135217, "loss": 0.036, "step": 1456 }, { "epoch": 0.95, "grad_norm": 0.19882117211818695, "learning_rate": 0.0002317376403291198, "loss": 0.0356, "step": 1457 }, { "epoch": 0.95, "grad_norm": 0.06410811841487885, "learning_rate": 0.0002316511784769248, "loss": 0.0153, "step": 1458 }, { "epoch": 0.96, "grad_norm": 0.1210549846291542, "learning_rate": 0.00023156467805560862, "loss": 0.0254, "step": 1459 }, { "epoch": 0.96, "grad_norm": 0.09589160978794098, "learning_rate": 0.00023147813910603102, "loss": 0.0231, "step": 1460 }, { "epoch": 0.96, "grad_norm": 0.05613451451063156, "learning_rate": 0.00023139156166906993, "loss": 0.008, "step": 1461 }, { "epoch": 0.96, "grad_norm": 0.12222158908843994, "learning_rate": 0.00023130494578562147, "loss": 0.0236, "step": 1462 }, { "epoch": 0.96, "grad_norm": 0.12595443427562714, "learning_rate": 0.00023121829149659988, "loss": 0.0284, "step": 1463 }, { "epoch": 0.96, "grad_norm": 0.05631411075592041, "learning_rate": 0.00023113159884293762, "loss": 0.0083, "step": 1464 }, { "epoch": 0.96, "grad_norm": 0.15821842849254608, "learning_rate": 0.00023104486786558516, "loss": 0.0281, "step": 1465 }, { "epoch": 0.96, "grad_norm": 0.28132763504981995, "learning_rate": 0.0002309580986055112, "loss": 0.0744, "step": 1466 }, { "epoch": 0.96, "grad_norm": 0.08583173155784607, "learning_rate": 0.00023087129110370243, "loss": 0.0163, "step": 1467 }, { "epoch": 0.96, "grad_norm": 0.1472005695104599, "learning_rate": 0.00023078444540116364, "loss": 0.0342, "step": 1468 }, { "epoch": 0.96, "grad_norm": 0.15789683163166046, "learning_rate": 0.0002306975615389177, "loss": 0.0321, "step": 1469 }, { "epoch": 0.96, "grad_norm": 0.0862409770488739, "learning_rate": 0.00023061063955800542, "loss": 0.0337, "step": 1470 }, { "epoch": 0.96, "grad_norm": 0.09513189643621445, "learning_rate": 0.00023052367949948562, "loss": 0.0156, "step": 1471 }, { "epoch": 0.96, "grad_norm": 0.16023319959640503, "learning_rate": 0.00023043668140443522, "loss": 0.0437, "step": 1472 }, { "epoch": 0.96, "grad_norm": 0.28757092356681824, "learning_rate": 0.0002303496453139491, "loss": 0.0526, "step": 1473 }, { "epoch": 0.96, "grad_norm": 0.09820155799388885, "learning_rate": 0.00023026257126913986, "loss": 0.0087, "step": 1474 }, { "epoch": 0.97, "grad_norm": 0.23134587705135345, "learning_rate": 0.00023017545931113822, "loss": 0.0613, "step": 1475 }, { "epoch": 0.97, "grad_norm": 0.06153428182005882, "learning_rate": 0.0002300883094810929, "loss": 0.0086, "step": 1476 }, { "epoch": 0.97, "grad_norm": 0.17993584275245667, "learning_rate": 0.00023000112182017032, "loss": 0.0339, "step": 1477 }, { "epoch": 0.97, "grad_norm": 0.23367144167423248, "learning_rate": 0.00022991389636955483, "loss": 0.0785, "step": 1478 }, { "epoch": 0.97, "grad_norm": 0.057765256613492966, "learning_rate": 0.00022982663317044864, "loss": 0.0077, "step": 1479 }, { "epoch": 0.97, "grad_norm": 0.17549645900726318, "learning_rate": 0.00022973933226407174, "loss": 0.0578, "step": 1480 }, { "epoch": 0.97, "grad_norm": 0.13486583530902863, "learning_rate": 0.0002296519936916621, "loss": 0.0381, "step": 1481 }, { "epoch": 0.97, "grad_norm": 0.11634548753499985, "learning_rate": 0.00022956461749447528, "loss": 0.0356, "step": 1482 }, { "epoch": 0.97, "grad_norm": 0.03911494463682175, "learning_rate": 0.0002294772037137847, "loss": 0.0082, "step": 1483 }, { "epoch": 0.97, "grad_norm": 0.13597272336483002, "learning_rate": 0.0002293897523908816, "loss": 0.037, "step": 1484 }, { "epoch": 0.97, "grad_norm": 0.03297096863389015, "learning_rate": 0.0002293022635670748, "loss": 0.0101, "step": 1485 }, { "epoch": 0.97, "grad_norm": 0.1217992827296257, "learning_rate": 0.00022921473728369099, "loss": 0.0488, "step": 1486 }, { "epoch": 0.97, "grad_norm": 0.08392113447189331, "learning_rate": 0.0002291271735820744, "loss": 0.0213, "step": 1487 }, { "epoch": 0.97, "grad_norm": 0.0728277638554573, "learning_rate": 0.00022903957250358707, "loss": 0.0323, "step": 1488 }, { "epoch": 0.97, "grad_norm": 0.19564445316791534, "learning_rate": 0.0002289519340896086, "loss": 0.0362, "step": 1489 }, { "epoch": 0.98, "grad_norm": 0.09455154836177826, "learning_rate": 0.00022886425838153634, "loss": 0.0305, "step": 1490 }, { "epoch": 0.98, "grad_norm": 0.02463528886437416, "learning_rate": 0.00022877654542078515, "loss": 0.0055, "step": 1491 }, { "epoch": 0.98, "grad_norm": 0.10636550933122635, "learning_rate": 0.0002286887952487875, "loss": 0.0254, "step": 1492 }, { "epoch": 0.98, "grad_norm": 0.08179948478937149, "learning_rate": 0.00022860100790699352, "loss": 0.0341, "step": 1493 }, { "epoch": 0.98, "grad_norm": 0.04053513705730438, "learning_rate": 0.00022851318343687074, "loss": 0.0059, "step": 1494 }, { "epoch": 0.98, "grad_norm": 0.06254950165748596, "learning_rate": 0.00022842532187990444, "loss": 0.016, "step": 1495 }, { "epoch": 0.98, "grad_norm": 0.11671124398708344, "learning_rate": 0.00022833742327759722, "loss": 0.0316, "step": 1496 }, { "epoch": 0.98, "grad_norm": 0.05388714000582695, "learning_rate": 0.00022824948767146926, "loss": 0.0114, "step": 1497 }, { "epoch": 0.98, "grad_norm": 0.07483407109975815, "learning_rate": 0.00022816151510305824, "loss": 0.0121, "step": 1498 }, { "epoch": 0.98, "grad_norm": 0.08650153130292892, "learning_rate": 0.00022807350561391938, "loss": 0.0518, "step": 1499 }, { "epoch": 0.98, "grad_norm": 0.1296052485704422, "learning_rate": 0.00022798545924562508, "loss": 0.0666, "step": 1500 }, { "epoch": 0.98, "grad_norm": 0.15292461216449738, "learning_rate": 0.00022789737603976542, "loss": 0.0314, "step": 1501 }, { "epoch": 0.98, "grad_norm": 0.2241302728652954, "learning_rate": 0.00022780925603794775, "loss": 0.13, "step": 1502 }, { "epoch": 0.98, "grad_norm": 0.07691671699285507, "learning_rate": 0.00022772109928179688, "loss": 0.0303, "step": 1503 }, { "epoch": 0.98, "grad_norm": 0.07967071235179901, "learning_rate": 0.0002276329058129548, "loss": 0.0104, "step": 1504 }, { "epoch": 0.99, "grad_norm": 0.15211229026317596, "learning_rate": 0.00022754467567308114, "loss": 0.0463, "step": 1505 }, { "epoch": 0.99, "grad_norm": 0.1364462524652481, "learning_rate": 0.00022745640890385263, "loss": 0.0333, "step": 1506 }, { "epoch": 0.99, "grad_norm": 0.08477602154016495, "learning_rate": 0.00022736810554696335, "loss": 0.0144, "step": 1507 }, { "epoch": 0.99, "grad_norm": 0.030945677310228348, "learning_rate": 0.0002272797656441247, "loss": 0.0082, "step": 1508 }, { "epoch": 0.99, "grad_norm": 0.0667153000831604, "learning_rate": 0.00022719138923706525, "loss": 0.0285, "step": 1509 }, { "epoch": 0.99, "grad_norm": 0.15130023658275604, "learning_rate": 0.00022710297636753096, "loss": 0.0493, "step": 1510 }, { "epoch": 0.99, "grad_norm": 0.07945651561021805, "learning_rate": 0.00022701452707728486, "loss": 0.0181, "step": 1511 }, { "epoch": 0.99, "grad_norm": 0.1147598847746849, "learning_rate": 0.00022692604140810735, "loss": 0.0377, "step": 1512 }, { "epoch": 0.99, "grad_norm": 0.04304948449134827, "learning_rate": 0.00022683751940179588, "loss": 0.0128, "step": 1513 }, { "epoch": 0.99, "grad_norm": 0.08819684386253357, "learning_rate": 0.00022674896110016503, "loss": 0.0296, "step": 1514 }, { "epoch": 0.99, "grad_norm": 0.06335631757974625, "learning_rate": 0.0002266603665450467, "loss": 0.0188, "step": 1515 }, { "epoch": 0.99, "grad_norm": 0.08433008193969727, "learning_rate": 0.00022657173577828979, "loss": 0.0251, "step": 1516 }, { "epoch": 0.99, "grad_norm": 0.06014099717140198, "learning_rate": 0.00022648306884176034, "loss": 0.0193, "step": 1517 }, { "epoch": 0.99, "grad_norm": 0.05266990885138512, "learning_rate": 0.00022639436577734143, "loss": 0.0112, "step": 1518 }, { "epoch": 0.99, "grad_norm": 0.10652010887861252, "learning_rate": 0.00022630562662693328, "loss": 0.0312, "step": 1519 }, { "epoch": 1.0, "grad_norm": 0.043453726917505264, "learning_rate": 0.00022621685143245308, "loss": 0.009, "step": 1520 }, { "epoch": 1.0, "grad_norm": 0.0685136690735817, "learning_rate": 0.00022612804023583515, "loss": 0.0189, "step": 1521 }, { "epoch": 1.0, "grad_norm": 0.14430442452430725, "learning_rate": 0.0002260391930790307, "loss": 0.066, "step": 1522 }, { "epoch": 1.0, "grad_norm": 0.0724061131477356, "learning_rate": 0.00022595031000400794, "loss": 0.0129, "step": 1523 }, { "epoch": 1.0, "grad_norm": 0.18257959187030792, "learning_rate": 0.00022586139105275214, "loss": 0.0434, "step": 1524 }, { "epoch": 1.0, "grad_norm": 0.06716416776180267, "learning_rate": 0.00022577243626726548, "loss": 0.0102, "step": 1525 }, { "epoch": 1.0, "grad_norm": 0.05102796107530594, "learning_rate": 0.00022568344568956697, "loss": 0.0094, "step": 1526 }, { "epoch": 1.0, "grad_norm": 0.0711396113038063, "learning_rate": 0.0002255944193616927, "loss": 0.0138, "step": 1527 }, { "epoch": 1.0, "grad_norm": 0.09844920784235, "learning_rate": 0.00022550535732569543, "loss": 0.0144, "step": 1528 }, { "epoch": 1.0, "eval_loss": 0.028799179941415787, "eval_runtime": 39.8961, "eval_samples_per_second": 32.259, "eval_steps_per_second": 8.071, "step": 1528 }, { "epoch": 1.0, "grad_norm": 0.03633524477481842, "learning_rate": 0.00022541625962364497, "loss": 0.0054, "step": 1529 }, { "epoch": 1.0, "grad_norm": 0.0410737618803978, "learning_rate": 0.00022532712629762795, "loss": 0.0069, "step": 1530 }, { "epoch": 1.0, "grad_norm": 0.04491305723786354, "learning_rate": 0.00022523795738974776, "loss": 0.0049, "step": 1531 }, { "epoch": 1.0, "grad_norm": 0.058488693088293076, "learning_rate": 0.0002251487529421246, "loss": 0.0045, "step": 1532 }, { "epoch": 1.0, "grad_norm": 0.008812353946268559, "learning_rate": 0.00022505951299689553, "loss": 0.0017, "step": 1533 }, { "epoch": 1.0, "grad_norm": 0.01968853361904621, "learning_rate": 0.00022497023759621433, "loss": 0.0037, "step": 1534 }, { "epoch": 1.0, "grad_norm": 0.05288131162524223, "learning_rate": 0.00022488092678225153, "loss": 0.0055, "step": 1535 }, { "epoch": 1.01, "grad_norm": 0.23432080447673798, "learning_rate": 0.0002247915805971944, "loss": 0.0302, "step": 1536 }, { "epoch": 1.01, "grad_norm": 0.09490291774272919, "learning_rate": 0.00022470219908324684, "loss": 0.0327, "step": 1537 }, { "epoch": 1.01, "grad_norm": 0.13473515212535858, "learning_rate": 0.00022461278228262958, "loss": 0.0182, "step": 1538 }, { "epoch": 1.01, "grad_norm": 0.06751556694507599, "learning_rate": 0.00022452333023757998, "loss": 0.0048, "step": 1539 }, { "epoch": 1.01, "grad_norm": 0.22719435393810272, "learning_rate": 0.00022443384299035193, "loss": 0.0115, "step": 1540 }, { "epoch": 1.01, "grad_norm": 0.006171741988509893, "learning_rate": 0.00022434432058321605, "loss": 0.0008, "step": 1541 }, { "epoch": 1.01, "grad_norm": 0.005011085420846939, "learning_rate": 0.00022425476305845958, "loss": 0.0008, "step": 1542 }, { "epoch": 1.01, "grad_norm": 0.01569819450378418, "learning_rate": 0.00022416517045838628, "loss": 0.0014, "step": 1543 }, { "epoch": 1.01, "grad_norm": 0.37864693999290466, "learning_rate": 0.00022407554282531658, "loss": 0.0313, "step": 1544 }, { "epoch": 1.01, "grad_norm": 0.121845543384552, "learning_rate": 0.00022398588020158735, "loss": 0.0252, "step": 1545 }, { "epoch": 1.01, "grad_norm": 0.16927878558635712, "learning_rate": 0.00022389618262955198, "loss": 0.0413, "step": 1546 }, { "epoch": 1.01, "grad_norm": 0.03560361638665199, "learning_rate": 0.00022380645015158054, "loss": 0.0038, "step": 1547 }, { "epoch": 1.01, "grad_norm": 0.17061570286750793, "learning_rate": 0.0002237166828100594, "loss": 0.0238, "step": 1548 }, { "epoch": 1.01, "grad_norm": 0.0610225647687912, "learning_rate": 0.0002236268806473915, "loss": 0.0077, "step": 1549 }, { "epoch": 1.01, "grad_norm": 0.1876288652420044, "learning_rate": 0.00022353704370599615, "loss": 0.0293, "step": 1550 }, { "epoch": 1.02, "grad_norm": 0.12210645526647568, "learning_rate": 0.00022344717202830915, "loss": 0.014, "step": 1551 }, { "epoch": 1.02, "grad_norm": 0.09035097062587738, "learning_rate": 0.00022335726565678277, "loss": 0.0178, "step": 1552 }, { "epoch": 1.02, "grad_norm": 0.016949700191617012, "learning_rate": 0.0002232673246338855, "loss": 0.0023, "step": 1553 }, { "epoch": 1.02, "grad_norm": 0.22814500331878662, "learning_rate": 0.0002231773490021023, "loss": 0.0187, "step": 1554 }, { "epoch": 1.02, "grad_norm": 0.06230514496564865, "learning_rate": 0.00022308733880393447, "loss": 0.0045, "step": 1555 }, { "epoch": 1.02, "grad_norm": 0.037863511592149734, "learning_rate": 0.00022299729408189968, "loss": 0.0079, "step": 1556 }, { "epoch": 1.02, "grad_norm": 0.10934063047170639, "learning_rate": 0.00022290721487853185, "loss": 0.037, "step": 1557 }, { "epoch": 1.02, "grad_norm": 0.17740324139595032, "learning_rate": 0.00022281710123638117, "loss": 0.025, "step": 1558 }, { "epoch": 1.02, "grad_norm": 0.09901938587427139, "learning_rate": 0.00022272695319801417, "loss": 0.0235, "step": 1559 }, { "epoch": 1.02, "grad_norm": 0.09340560436248779, "learning_rate": 0.00022263677080601354, "loss": 0.0189, "step": 1560 }, { "epoch": 1.02, "grad_norm": 0.1699395328760147, "learning_rate": 0.00022254655410297827, "loss": 0.0344, "step": 1561 }, { "epoch": 1.02, "grad_norm": 0.1495848298072815, "learning_rate": 0.00022245630313152352, "loss": 0.0337, "step": 1562 }, { "epoch": 1.02, "grad_norm": 0.2952522039413452, "learning_rate": 0.00022236601793428063, "loss": 0.0604, "step": 1563 }, { "epoch": 1.02, "grad_norm": 0.1865517646074295, "learning_rate": 0.0002222756985538972, "loss": 0.063, "step": 1564 }, { "epoch": 1.02, "grad_norm": 0.1253533810377121, "learning_rate": 0.00022218534503303682, "loss": 0.0317, "step": 1565 }, { "epoch": 1.03, "grad_norm": 0.06356123089790344, "learning_rate": 0.00022209495741437938, "loss": 0.0125, "step": 1566 }, { "epoch": 1.03, "grad_norm": 0.09303832054138184, "learning_rate": 0.00022200453574062063, "loss": 0.0212, "step": 1567 }, { "epoch": 1.03, "grad_norm": 0.041782230138778687, "learning_rate": 0.00022191408005447274, "loss": 0.0081, "step": 1568 }, { "epoch": 1.03, "grad_norm": 0.10091729462146759, "learning_rate": 0.00022182359039866364, "loss": 0.024, "step": 1569 }, { "epoch": 1.03, "grad_norm": 0.03203802555799484, "learning_rate": 0.00022173306681593747, "loss": 0.007, "step": 1570 }, { "epoch": 1.03, "grad_norm": 0.09047690778970718, "learning_rate": 0.00022164250934905442, "loss": 0.0253, "step": 1571 }, { "epoch": 1.03, "grad_norm": 0.058186113834381104, "learning_rate": 0.00022155191804079058, "loss": 0.0107, "step": 1572 }, { "epoch": 1.03, "grad_norm": 0.14713934063911438, "learning_rate": 0.00022146129293393804, "loss": 0.0268, "step": 1573 }, { "epoch": 1.03, "grad_norm": 0.0747760757803917, "learning_rate": 0.00022137063407130493, "loss": 0.016, "step": 1574 }, { "epoch": 1.03, "grad_norm": 0.05679846182465553, "learning_rate": 0.0002212799414957153, "loss": 0.0122, "step": 1575 }, { "epoch": 1.03, "grad_norm": 0.040479984134435654, "learning_rate": 0.00022118921525000903, "loss": 0.0044, "step": 1576 }, { "epoch": 1.03, "grad_norm": 0.06615650653839111, "learning_rate": 0.00022109845537704204, "loss": 0.0051, "step": 1577 }, { "epoch": 1.03, "grad_norm": 0.0890481099486351, "learning_rate": 0.00022100766191968606, "loss": 0.0209, "step": 1578 }, { "epoch": 1.03, "grad_norm": 0.24288196861743927, "learning_rate": 0.00022091683492082875, "loss": 0.0726, "step": 1579 }, { "epoch": 1.03, "grad_norm": 0.25049716234207153, "learning_rate": 0.00022082597442337344, "loss": 0.0329, "step": 1580 }, { "epoch": 1.04, "grad_norm": 0.028392024338245392, "learning_rate": 0.0002207350804702395, "loss": 0.0031, "step": 1581 }, { "epoch": 1.04, "grad_norm": 0.023121071979403496, "learning_rate": 0.00022064415310436202, "loss": 0.0041, "step": 1582 }, { "epoch": 1.04, "grad_norm": 0.022821614518761635, "learning_rate": 0.0002205531923686918, "loss": 0.0025, "step": 1583 }, { "epoch": 1.04, "grad_norm": 0.17920270562171936, "learning_rate": 0.00022046219830619554, "loss": 0.017, "step": 1584 }, { "epoch": 1.04, "grad_norm": 0.08992599695920944, "learning_rate": 0.00022037117095985553, "loss": 0.0306, "step": 1585 }, { "epoch": 1.04, "grad_norm": 0.30955061316490173, "learning_rate": 0.0002202801103726699, "loss": 0.0687, "step": 1586 }, { "epoch": 1.04, "grad_norm": 0.1383177936077118, "learning_rate": 0.00022018901658765245, "loss": 0.0236, "step": 1587 }, { "epoch": 1.04, "grad_norm": 0.13899581134319305, "learning_rate": 0.00022009788964783271, "loss": 0.02, "step": 1588 }, { "epoch": 1.04, "grad_norm": 0.3127962350845337, "learning_rate": 0.00022000672959625564, "loss": 0.0785, "step": 1589 }, { "epoch": 1.04, "grad_norm": 0.1163870096206665, "learning_rate": 0.00021991553647598218, "loss": 0.0239, "step": 1590 }, { "epoch": 1.04, "grad_norm": 0.08966390788555145, "learning_rate": 0.00021982431033008867, "loss": 0.0321, "step": 1591 }, { "epoch": 1.04, "grad_norm": 0.1911877542734146, "learning_rate": 0.00021973305120166712, "loss": 0.0347, "step": 1592 }, { "epoch": 1.04, "grad_norm": 0.1138681024312973, "learning_rate": 0.00021964175913382508, "loss": 0.0401, "step": 1593 }, { "epoch": 1.04, "grad_norm": 0.05526265874505043, "learning_rate": 0.00021955043416968571, "loss": 0.0086, "step": 1594 }, { "epoch": 1.04, "grad_norm": 0.08750049024820328, "learning_rate": 0.00021945907635238766, "loss": 0.0316, "step": 1595 }, { "epoch": 1.04, "grad_norm": 0.11475680023431778, "learning_rate": 0.00021936768572508513, "loss": 0.0258, "step": 1596 }, { "epoch": 1.05, "grad_norm": 0.10367580503225327, "learning_rate": 0.00021927626233094784, "loss": 0.0332, "step": 1597 }, { "epoch": 1.05, "grad_norm": 0.09429827332496643, "learning_rate": 0.0002191848062131609, "loss": 0.0188, "step": 1598 }, { "epoch": 1.05, "grad_norm": 0.131412535905838, "learning_rate": 0.000219093317414925, "loss": 0.0419, "step": 1599 }, { "epoch": 1.05, "grad_norm": 0.2314550131559372, "learning_rate": 0.0002190017959794562, "loss": 0.0442, "step": 1600 }, { "epoch": 1.05, "grad_norm": 0.12669596076011658, "learning_rate": 0.00021891024194998593, "loss": 0.0137, "step": 1601 }, { "epoch": 1.05, "grad_norm": 0.09830581396818161, "learning_rate": 0.0002188186553697611, "loss": 0.022, "step": 1602 }, { "epoch": 1.05, "grad_norm": 0.10588390380144119, "learning_rate": 0.00021872703628204396, "loss": 0.0573, "step": 1603 }, { "epoch": 1.05, "grad_norm": 0.09916213899850845, "learning_rate": 0.0002186353847301121, "loss": 0.031, "step": 1604 }, { "epoch": 1.05, "grad_norm": 0.09390533715486526, "learning_rate": 0.00021854370075725848, "loss": 0.0181, "step": 1605 }, { "epoch": 1.05, "grad_norm": 0.09290188550949097, "learning_rate": 0.0002184519844067914, "loss": 0.0146, "step": 1606 }, { "epoch": 1.05, "grad_norm": 0.1350594460964203, "learning_rate": 0.00021836023572203433, "loss": 0.0536, "step": 1607 }, { "epoch": 1.05, "grad_norm": 0.04494404420256615, "learning_rate": 0.0002182684547463261, "loss": 0.006, "step": 1608 }, { "epoch": 1.05, "grad_norm": 0.1277172863483429, "learning_rate": 0.00021817664152302087, "loss": 0.0201, "step": 1609 }, { "epoch": 1.05, "grad_norm": 0.09169845283031464, "learning_rate": 0.0002180847960954879, "loss": 0.0051, "step": 1610 }, { "epoch": 1.05, "grad_norm": 0.11080282181501389, "learning_rate": 0.00021799291850711173, "loss": 0.02, "step": 1611 }, { "epoch": 1.06, "grad_norm": 0.06069633364677429, "learning_rate": 0.00021790100880129208, "loss": 0.0169, "step": 1612 }, { "epoch": 1.06, "grad_norm": 0.11857344210147858, "learning_rate": 0.00021780906702144372, "loss": 0.0361, "step": 1613 }, { "epoch": 1.06, "grad_norm": 0.10318568348884583, "learning_rate": 0.0002177170932109968, "loss": 0.0091, "step": 1614 }, { "epoch": 1.06, "grad_norm": 0.08703909814357758, "learning_rate": 0.00021762508741339655, "loss": 0.0095, "step": 1615 }, { "epoch": 1.06, "grad_norm": 0.14628368616104126, "learning_rate": 0.00021753304967210313, "loss": 0.0135, "step": 1616 }, { "epoch": 1.06, "grad_norm": 0.09634681791067123, "learning_rate": 0.0002174409800305919, "loss": 0.0098, "step": 1617 }, { "epoch": 1.06, "grad_norm": 3.208465576171875, "learning_rate": 0.00021734887853235333, "loss": 0.0302, "step": 1618 }, { "epoch": 1.06, "grad_norm": 0.2852117121219635, "learning_rate": 0.00021725674522089292, "loss": 0.0442, "step": 1619 }, { "epoch": 1.06, "grad_norm": 0.016181744635105133, "learning_rate": 0.0002171645801397312, "loss": 0.0021, "step": 1620 }, { "epoch": 1.06, "grad_norm": 0.020854290574789047, "learning_rate": 0.00021707238333240362, "loss": 0.0027, "step": 1621 }, { "epoch": 1.06, "grad_norm": 0.16986317932605743, "learning_rate": 0.00021698015484246068, "loss": 0.0422, "step": 1622 }, { "epoch": 1.06, "grad_norm": 0.15938438475131989, "learning_rate": 0.0002168878947134679, "loss": 0.0224, "step": 1623 }, { "epoch": 1.06, "grad_norm": 0.07981102913618088, "learning_rate": 0.00021679560298900572, "loss": 0.01, "step": 1624 }, { "epoch": 1.06, "grad_norm": 0.08571261167526245, "learning_rate": 0.00021670327971266937, "loss": 0.0137, "step": 1625 }, { "epoch": 1.06, "grad_norm": 0.09093351662158966, "learning_rate": 0.00021661092492806917, "loss": 0.0089, "step": 1626 }, { "epoch": 1.07, "grad_norm": 0.12952205538749695, "learning_rate": 0.0002165185386788302, "loss": 0.0257, "step": 1627 }, { "epoch": 1.07, "grad_norm": 0.24057811498641968, "learning_rate": 0.00021642612100859256, "loss": 0.041, "step": 1628 }, { "epoch": 1.07, "grad_norm": 0.6021707653999329, "learning_rate": 0.00021633367196101093, "loss": 0.0363, "step": 1629 }, { "epoch": 1.07, "grad_norm": 0.09282581508159637, "learning_rate": 0.000216241191579755, "loss": 0.013, "step": 1630 }, { "epoch": 1.07, "grad_norm": 0.027487829327583313, "learning_rate": 0.0002161486799085093, "loss": 0.0043, "step": 1631 }, { "epoch": 1.07, "grad_norm": 0.0671788677573204, "learning_rate": 0.00021605613699097296, "loss": 0.0148, "step": 1632 }, { "epoch": 1.07, "grad_norm": 0.11706430464982986, "learning_rate": 0.0002159635628708601, "loss": 0.0235, "step": 1633 }, { "epoch": 1.07, "grad_norm": 0.026642918586730957, "learning_rate": 0.00021587095759189934, "loss": 0.0037, "step": 1634 }, { "epoch": 1.07, "grad_norm": 0.011636834591627121, "learning_rate": 0.0002157783211978341, "loss": 0.0018, "step": 1635 }, { "epoch": 1.07, "grad_norm": 0.0854690819978714, "learning_rate": 0.00021568565373242268, "loss": 0.0049, "step": 1636 }, { "epoch": 1.07, "grad_norm": 0.3163520395755768, "learning_rate": 0.0002155929552394378, "loss": 0.0333, "step": 1637 }, { "epoch": 1.07, "grad_norm": 0.1685972809791565, "learning_rate": 0.00021550022576266695, "loss": 0.0175, "step": 1638 }, { "epoch": 1.07, "grad_norm": 0.10485909879207611, "learning_rate": 0.00021540746534591223, "loss": 0.0351, "step": 1639 }, { "epoch": 1.07, "grad_norm": 0.21983854472637177, "learning_rate": 0.00021531467403299042, "loss": 0.0586, "step": 1640 }, { "epoch": 1.07, "grad_norm": 0.057190775871276855, "learning_rate": 0.00021522185186773283, "loss": 0.0047, "step": 1641 }, { "epoch": 1.07, "grad_norm": 0.03798317164182663, "learning_rate": 0.00021512899889398535, "loss": 0.0028, "step": 1642 }, { "epoch": 1.08, "grad_norm": 0.24047723412513733, "learning_rate": 0.0002150361151556084, "loss": 0.0539, "step": 1643 }, { "epoch": 1.08, "grad_norm": 0.05976404622197151, "learning_rate": 0.000214943200696477, "loss": 0.0102, "step": 1644 }, { "epoch": 1.08, "grad_norm": 0.028973877429962158, "learning_rate": 0.00021485025556048067, "loss": 0.0048, "step": 1645 }, { "epoch": 1.08, "grad_norm": 0.19304604828357697, "learning_rate": 0.00021475727979152338, "loss": 0.0188, "step": 1646 }, { "epoch": 1.08, "grad_norm": 0.04543152078986168, "learning_rate": 0.00021466427343352353, "loss": 0.0074, "step": 1647 }, { "epoch": 1.08, "grad_norm": 0.12159468978643417, "learning_rate": 0.00021457123653041409, "loss": 0.0209, "step": 1648 }, { "epoch": 1.08, "grad_norm": 0.28786468505859375, "learning_rate": 0.00021447816912614236, "loss": 0.022, "step": 1649 }, { "epoch": 1.08, "grad_norm": 0.169004887342453, "learning_rate": 0.00021438507126467015, "loss": 0.0577, "step": 1650 }, { "epoch": 1.08, "grad_norm": 0.07666225731372833, "learning_rate": 0.00021429194298997349, "loss": 0.0115, "step": 1651 }, { "epoch": 1.08, "grad_norm": 0.02752969041466713, "learning_rate": 0.00021419878434604287, "loss": 0.0026, "step": 1652 }, { "epoch": 1.08, "grad_norm": 0.03154395520687103, "learning_rate": 0.00021410559537688324, "loss": 0.0022, "step": 1653 }, { "epoch": 1.08, "grad_norm": 0.02959531545639038, "learning_rate": 0.00021401237612651372, "loss": 0.0043, "step": 1654 }, { "epoch": 1.08, "grad_norm": 0.13195501267910004, "learning_rate": 0.0002139191266389677, "loss": 0.0178, "step": 1655 }, { "epoch": 1.08, "grad_norm": 0.03975927457213402, "learning_rate": 0.000213825846958293, "loss": 0.0055, "step": 1656 }, { "epoch": 1.08, "grad_norm": 0.06535745412111282, "learning_rate": 0.00021373253712855168, "loss": 0.0223, "step": 1657 }, { "epoch": 1.09, "grad_norm": 0.09343099594116211, "learning_rate": 0.00021363919719381987, "loss": 0.0123, "step": 1658 }, { "epoch": 1.09, "grad_norm": 0.1524379998445511, "learning_rate": 0.00021354582719818816, "loss": 0.0421, "step": 1659 }, { "epoch": 1.09, "grad_norm": 0.18298211693763733, "learning_rate": 0.00021345242718576117, "loss": 0.0444, "step": 1660 }, { "epoch": 1.09, "grad_norm": 0.088227778673172, "learning_rate": 0.00021335899720065777, "loss": 0.0116, "step": 1661 }, { "epoch": 1.09, "grad_norm": 0.1901037096977234, "learning_rate": 0.00021326553728701091, "loss": 0.0211, "step": 1662 }, { "epoch": 1.09, "grad_norm": 0.22244389355182648, "learning_rate": 0.00021317204748896786, "loss": 0.0567, "step": 1663 }, { "epoch": 1.09, "grad_norm": 0.24408915638923645, "learning_rate": 0.00021307852785068976, "loss": 0.0472, "step": 1664 }, { "epoch": 1.09, "grad_norm": 0.13890671730041504, "learning_rate": 0.00021298497841635208, "loss": 0.0326, "step": 1665 }, { "epoch": 1.09, "grad_norm": 0.10192529112100601, "learning_rate": 0.00021289139923014416, "loss": 0.0142, "step": 1666 }, { "epoch": 1.09, "grad_norm": 0.02748352289199829, "learning_rate": 0.00021279779033626955, "loss": 0.0065, "step": 1667 }, { "epoch": 1.09, "grad_norm": 0.029419776052236557, "learning_rate": 0.00021270415177894578, "loss": 0.0062, "step": 1668 }, { "epoch": 1.09, "grad_norm": 0.13964907824993134, "learning_rate": 0.00021261048360240434, "loss": 0.0442, "step": 1669 }, { "epoch": 1.09, "grad_norm": 0.22910548746585846, "learning_rate": 0.00021251678585089076, "loss": 0.0735, "step": 1670 }, { "epoch": 1.09, "grad_norm": 0.10947010666131973, "learning_rate": 0.0002124230585686645, "loss": 0.0225, "step": 1671 }, { "epoch": 1.09, "grad_norm": 0.04233964532613754, "learning_rate": 0.00021232930179999914, "loss": 0.0121, "step": 1672 }, { "epoch": 1.1, "grad_norm": 0.11553874611854553, "learning_rate": 0.00021223551558918193, "loss": 0.0206, "step": 1673 }, { "epoch": 1.1, "grad_norm": 0.30024242401123047, "learning_rate": 0.0002121416999805142, "loss": 0.0369, "step": 1674 }, { "epoch": 1.1, "grad_norm": 0.04763152822852135, "learning_rate": 0.00021204785501831107, "loss": 0.0094, "step": 1675 }, { "epoch": 1.1, "grad_norm": 0.12608014047145844, "learning_rate": 0.00021195398074690163, "loss": 0.0529, "step": 1676 }, { "epoch": 1.1, "grad_norm": 0.08367837965488434, "learning_rate": 0.00021186007721062873, "loss": 0.0172, "step": 1677 }, { "epoch": 1.1, "grad_norm": 0.12868303060531616, "learning_rate": 0.00021176614445384906, "loss": 0.0268, "step": 1678 }, { "epoch": 1.1, "grad_norm": 0.06957541406154633, "learning_rate": 0.00021167218252093314, "loss": 0.0099, "step": 1679 }, { "epoch": 1.1, "grad_norm": 0.17989858984947205, "learning_rate": 0.00021157819145626523, "loss": 0.0316, "step": 1680 }, { "epoch": 1.1, "grad_norm": 0.06561832875013351, "learning_rate": 0.00021148417130424345, "loss": 0.0118, "step": 1681 }, { "epoch": 1.1, "grad_norm": 0.07524342089891434, "learning_rate": 0.0002113901221092795, "loss": 0.0246, "step": 1682 }, { "epoch": 1.1, "grad_norm": 0.09207272529602051, "learning_rate": 0.0002112960439157989, "loss": 0.0371, "step": 1683 }, { "epoch": 1.1, "grad_norm": 0.11619143187999725, "learning_rate": 0.00021120193676824086, "loss": 0.0196, "step": 1684 }, { "epoch": 1.1, "grad_norm": 0.09404407441616058, "learning_rate": 0.00021110780071105829, "loss": 0.0198, "step": 1685 }, { "epoch": 1.1, "grad_norm": 0.025721121579408646, "learning_rate": 0.00021101363578871773, "loss": 0.0033, "step": 1686 }, { "epoch": 1.1, "grad_norm": 0.17948229610919952, "learning_rate": 0.00021091944204569928, "loss": 0.0225, "step": 1687 }, { "epoch": 1.11, "grad_norm": 0.299441933631897, "learning_rate": 0.00021082521952649677, "loss": 0.0314, "step": 1688 }, { "epoch": 1.11, "grad_norm": 0.18134386837482452, "learning_rate": 0.00021073096827561755, "loss": 0.0077, "step": 1689 }, { "epoch": 1.11, "grad_norm": 0.030258700251579285, "learning_rate": 0.00021063668833758265, "loss": 0.005, "step": 1690 }, { "epoch": 1.11, "grad_norm": 0.44221439957618713, "learning_rate": 0.00021054237975692646, "loss": 0.0267, "step": 1691 }, { "epoch": 1.11, "grad_norm": 0.19150428473949432, "learning_rate": 0.0002104480425781971, "loss": 0.0843, "step": 1692 }, { "epoch": 1.11, "grad_norm": 0.16386204957962036, "learning_rate": 0.00021035367684595603, "loss": 0.0408, "step": 1693 }, { "epoch": 1.11, "grad_norm": 0.06396955996751785, "learning_rate": 0.0002102592826047783, "loss": 0.0051, "step": 1694 }, { "epoch": 1.11, "grad_norm": 0.096857450902462, "learning_rate": 0.0002101648598992525, "loss": 0.0104, "step": 1695 }, { "epoch": 1.11, "grad_norm": 0.08629319071769714, "learning_rate": 0.0002100704087739804, "loss": 0.0066, "step": 1696 }, { "epoch": 1.11, "grad_norm": 0.06080562621355057, "learning_rate": 0.00020997592927357746, "loss": 0.0083, "step": 1697 }, { "epoch": 1.11, "grad_norm": 0.026974063366651535, "learning_rate": 0.00020988142144267246, "loss": 0.0043, "step": 1698 }, { "epoch": 1.11, "grad_norm": 0.01949911192059517, "learning_rate": 0.00020978688532590747, "loss": 0.0034, "step": 1699 }, { "epoch": 1.11, "grad_norm": 0.0293037761002779, "learning_rate": 0.0002096923209679381, "loss": 0.005, "step": 1700 }, { "epoch": 1.11, "grad_norm": 0.03902202844619751, "learning_rate": 0.0002095977284134331, "loss": 0.005, "step": 1701 }, { "epoch": 1.11, "grad_norm": 0.02799048461019993, "learning_rate": 0.0002095031077070747, "loss": 0.0044, "step": 1702 }, { "epoch": 1.11, "grad_norm": 0.07804699242115021, "learning_rate": 0.00020940845889355842, "loss": 0.0053, "step": 1703 }, { "epoch": 1.12, "grad_norm": 0.27684271335601807, "learning_rate": 0.00020931378201759283, "loss": 0.0272, "step": 1704 }, { "epoch": 1.12, "grad_norm": 0.10727167129516602, "learning_rate": 0.00020921907712390008, "loss": 0.008, "step": 1705 }, { "epoch": 1.12, "grad_norm": 0.1409468650817871, "learning_rate": 0.00020912434425721536, "loss": 0.0078, "step": 1706 }, { "epoch": 1.12, "grad_norm": 0.06090042367577553, "learning_rate": 0.0002090295834622871, "loss": 0.0022, "step": 1707 }, { "epoch": 1.12, "grad_norm": 0.3309457004070282, "learning_rate": 0.00020893479478387695, "loss": 0.0202, "step": 1708 }, { "epoch": 1.12, "grad_norm": 0.014704009518027306, "learning_rate": 0.00020883997826675972, "loss": 0.0018, "step": 1709 }, { "epoch": 1.12, "grad_norm": 0.036989156156778336, "learning_rate": 0.0002087451339557234, "loss": 0.0023, "step": 1710 }, { "epoch": 1.12, "grad_norm": 0.0498960018157959, "learning_rate": 0.00020865026189556898, "loss": 0.0022, "step": 1711 }, { "epoch": 1.12, "grad_norm": 0.3943072259426117, "learning_rate": 0.0002085553621311108, "loss": 0.0478, "step": 1712 }, { "epoch": 1.12, "grad_norm": 0.011776590719819069, "learning_rate": 0.00020846043470717606, "loss": 0.0015, "step": 1713 }, { "epoch": 1.12, "grad_norm": 0.01262225303798914, "learning_rate": 0.00020836547966860512, "loss": 0.0012, "step": 1714 }, { "epoch": 1.12, "grad_norm": 0.44584786891937256, "learning_rate": 0.00020827049706025134, "loss": 0.0376, "step": 1715 }, { "epoch": 1.12, "grad_norm": 0.1026252806186676, "learning_rate": 0.00020817548692698122, "loss": 0.0057, "step": 1716 }, { "epoch": 1.12, "grad_norm": 0.5443242788314819, "learning_rate": 0.0002080804493136741, "loss": 0.0355, "step": 1717 }, { "epoch": 1.12, "grad_norm": 0.2901538610458374, "learning_rate": 0.0002079853842652224, "loss": 0.0331, "step": 1718 }, { "epoch": 1.13, "grad_norm": 0.13325665891170502, "learning_rate": 0.00020789029182653146, "loss": 0.0314, "step": 1719 }, { "epoch": 1.13, "grad_norm": 0.3474900722503662, "learning_rate": 0.00020779517204251962, "loss": 0.0052, "step": 1720 }, { "epoch": 1.13, "grad_norm": 0.19612590968608856, "learning_rate": 0.00020770002495811807, "loss": 0.0159, "step": 1721 }, { "epoch": 1.13, "grad_norm": 0.12384012341499329, "learning_rate": 0.00020760485061827096, "loss": 0.0463, "step": 1722 }, { "epoch": 1.13, "grad_norm": 0.2543465495109558, "learning_rate": 0.00020750964906793518, "loss": 0.1287, "step": 1723 }, { "epoch": 1.13, "grad_norm": 0.037789445370435715, "learning_rate": 0.00020741442035208062, "loss": 0.0039, "step": 1724 }, { "epoch": 1.13, "grad_norm": 0.026412533596158028, "learning_rate": 0.00020731916451568991, "loss": 0.004, "step": 1725 }, { "epoch": 1.13, "grad_norm": 0.30076315999031067, "learning_rate": 0.00020722388160375867, "loss": 0.0585, "step": 1726 }, { "epoch": 1.13, "grad_norm": 0.052092649042606354, "learning_rate": 0.00020712857166129502, "loss": 0.0059, "step": 1727 }, { "epoch": 1.13, "grad_norm": 0.23526246845722198, "learning_rate": 0.00020703323473332, "loss": 0.0618, "step": 1728 }, { "epoch": 1.13, "grad_norm": 0.17606157064437866, "learning_rate": 0.00020693787086486747, "loss": 0.0382, "step": 1729 }, { "epoch": 1.13, "grad_norm": 0.04785061255097389, "learning_rate": 0.0002068424801009839, "loss": 0.009, "step": 1730 }, { "epoch": 1.13, "grad_norm": 0.1910519003868103, "learning_rate": 0.0002067470624867285, "loss": 0.0268, "step": 1731 }, { "epoch": 1.13, "grad_norm": 0.029830094426870346, "learning_rate": 0.00020665161806717318, "loss": 0.0058, "step": 1732 }, { "epoch": 1.13, "grad_norm": 0.22875742614269257, "learning_rate": 0.0002065561468874025, "loss": 0.042, "step": 1733 }, { "epoch": 1.14, "grad_norm": 0.1160743236541748, "learning_rate": 0.00020646064899251365, "loss": 0.0123, "step": 1734 }, { "epoch": 1.14, "grad_norm": 0.09632806479930878, "learning_rate": 0.0002063651244276165, "loss": 0.028, "step": 1735 }, { "epoch": 1.14, "grad_norm": 0.045413848012685776, "learning_rate": 0.00020626957323783337, "loss": 0.0103, "step": 1736 }, { "epoch": 1.14, "grad_norm": 0.06504768133163452, "learning_rate": 0.00020617399546829932, "loss": 0.0103, "step": 1737 }, { "epoch": 1.14, "grad_norm": 0.11606152355670929, "learning_rate": 0.00020607839116416188, "loss": 0.0219, "step": 1738 }, { "epoch": 1.14, "grad_norm": 0.15662230551242828, "learning_rate": 0.00020598276037058115, "loss": 0.0075, "step": 1739 }, { "epoch": 1.14, "grad_norm": 0.060219258069992065, "learning_rate": 0.00020588710313272968, "loss": 0.0051, "step": 1740 }, { "epoch": 1.14, "grad_norm": 0.033545345067977905, "learning_rate": 0.0002057914194957926, "loss": 0.006, "step": 1741 }, { "epoch": 1.14, "grad_norm": 0.1281086504459381, "learning_rate": 0.00020569570950496746, "loss": 0.0148, "step": 1742 }, { "epoch": 1.14, "grad_norm": 0.28984948992729187, "learning_rate": 0.0002055999732054643, "loss": 0.0788, "step": 1743 }, { "epoch": 1.14, "grad_norm": 0.022472795099020004, "learning_rate": 0.00020550421064250546, "loss": 0.0032, "step": 1744 }, { "epoch": 1.14, "grad_norm": 0.21518099308013916, "learning_rate": 0.00020540842186132587, "loss": 0.0624, "step": 1745 }, { "epoch": 1.14, "grad_norm": 0.11233403533697128, "learning_rate": 0.00020531260690717269, "loss": 0.018, "step": 1746 }, { "epoch": 1.14, "grad_norm": 0.0663604736328125, "learning_rate": 0.0002052167658253055, "loss": 0.0247, "step": 1747 }, { "epoch": 1.14, "grad_norm": 0.019785290583968163, "learning_rate": 0.00020512089866099635, "loss": 0.004, "step": 1748 }, { "epoch": 1.15, "grad_norm": 0.18856649100780487, "learning_rate": 0.00020502500545952935, "loss": 0.0246, "step": 1749 }, { "epoch": 1.15, "grad_norm": 0.06530511379241943, "learning_rate": 0.0002049290862662011, "loss": 0.0047, "step": 1750 }, { "epoch": 1.15, "grad_norm": 0.11900179833173752, "learning_rate": 0.0002048331411263204, "loss": 0.0305, "step": 1751 }, { "epoch": 1.15, "grad_norm": 0.08668252825737, "learning_rate": 0.00020473717008520842, "loss": 0.0342, "step": 1752 }, { "epoch": 1.15, "grad_norm": 0.033169977366924286, "learning_rate": 0.00020464117318819836, "loss": 0.0051, "step": 1753 }, { "epoch": 1.15, "grad_norm": 0.11904280632734299, "learning_rate": 0.00020454515048063578, "loss": 0.0148, "step": 1754 }, { "epoch": 1.15, "grad_norm": 0.03916119784116745, "learning_rate": 0.00020444910200787846, "loss": 0.0064, "step": 1755 }, { "epoch": 1.15, "grad_norm": 0.23415330052375793, "learning_rate": 0.0002043530278152963, "loss": 0.0252, "step": 1756 }, { "epoch": 1.15, "grad_norm": 0.2871975302696228, "learning_rate": 0.0002042569279482712, "loss": 0.0425, "step": 1757 }, { "epoch": 1.15, "grad_norm": 0.09590361267328262, "learning_rate": 0.00020416080245219743, "loss": 0.0079, "step": 1758 }, { "epoch": 1.15, "grad_norm": 0.037294209003448486, "learning_rate": 0.00020406465137248135, "loss": 0.0027, "step": 1759 }, { "epoch": 1.15, "grad_norm": 0.14344163239002228, "learning_rate": 0.00020396847475454114, "loss": 0.0109, "step": 1760 }, { "epoch": 1.15, "grad_norm": 0.01968855783343315, "learning_rate": 0.0002038722726438074, "loss": 0.0024, "step": 1761 }, { "epoch": 1.15, "grad_norm": 0.26668640971183777, "learning_rate": 0.00020377604508572245, "loss": 0.0166, "step": 1762 }, { "epoch": 1.15, "grad_norm": 0.0922434851527214, "learning_rate": 0.00020367979212574085, "loss": 0.0128, "step": 1763 }, { "epoch": 1.15, "grad_norm": 0.012287971563637257, "learning_rate": 0.0002035835138093291, "loss": 0.0018, "step": 1764 }, { "epoch": 1.16, "grad_norm": 0.14998489618301392, "learning_rate": 0.0002034872101819656, "loss": 0.0383, "step": 1765 }, { "epoch": 1.16, "grad_norm": 0.03521181270480156, "learning_rate": 0.00020339088128914083, "loss": 0.0042, "step": 1766 }, { "epoch": 1.16, "grad_norm": 0.1004776582121849, "learning_rate": 0.00020329452717635712, "loss": 0.0217, "step": 1767 }, { "epoch": 1.16, "grad_norm": 0.10840025544166565, "learning_rate": 0.00020319814788912868, "loss": 0.0238, "step": 1768 }, { "epoch": 1.16, "grad_norm": 0.08637768775224686, "learning_rate": 0.00020310174347298174, "loss": 0.0244, "step": 1769 }, { "epoch": 1.16, "grad_norm": 0.12513408064842224, "learning_rate": 0.00020300531397345433, "loss": 0.0402, "step": 1770 }, { "epoch": 1.16, "grad_norm": 0.1716419905424118, "learning_rate": 0.00020290885943609628, "loss": 0.0473, "step": 1771 }, { "epoch": 1.16, "grad_norm": 0.1464458853006363, "learning_rate": 0.00020281237990646932, "loss": 0.0263, "step": 1772 }, { "epoch": 1.16, "grad_norm": 0.2202548384666443, "learning_rate": 0.00020271587543014695, "loss": 0.014, "step": 1773 }, { "epoch": 1.16, "grad_norm": 0.20607982575893402, "learning_rate": 0.00020261934605271447, "loss": 0.0112, "step": 1774 }, { "epoch": 1.16, "grad_norm": 0.04717608913779259, "learning_rate": 0.00020252279181976897, "loss": 0.0072, "step": 1775 }, { "epoch": 1.16, "grad_norm": 0.1588015854358673, "learning_rate": 0.00020242621277691912, "loss": 0.0203, "step": 1776 }, { "epoch": 1.16, "grad_norm": 0.28549695014953613, "learning_rate": 0.00020232960896978558, "loss": 0.0256, "step": 1777 }, { "epoch": 1.16, "grad_norm": 0.05648793280124664, "learning_rate": 0.00020223298044400048, "loss": 0.0172, "step": 1778 }, { "epoch": 1.16, "grad_norm": 0.08711002767086029, "learning_rate": 0.00020213632724520777, "loss": 0.0091, "step": 1779 }, { "epoch": 1.17, "grad_norm": 0.1041957437992096, "learning_rate": 0.00020203964941906293, "loss": 0.0391, "step": 1780 }, { "epoch": 1.17, "grad_norm": 0.23113363981246948, "learning_rate": 0.00020194294701123317, "loss": 0.0202, "step": 1781 }, { "epoch": 1.17, "grad_norm": 0.05238531902432442, "learning_rate": 0.00020184622006739724, "loss": 0.0133, "step": 1782 }, { "epoch": 1.17, "grad_norm": 0.16166527569293976, "learning_rate": 0.00020174946863324555, "loss": 0.0162, "step": 1783 }, { "epoch": 1.17, "grad_norm": 0.09691984206438065, "learning_rate": 0.0002016526927544801, "loss": 0.0163, "step": 1784 }, { "epoch": 1.17, "grad_norm": 0.09754455834627151, "learning_rate": 0.0002015558924768143, "loss": 0.0053, "step": 1785 }, { "epoch": 1.17, "grad_norm": 0.13531388342380524, "learning_rate": 0.00020145906784597317, "loss": 0.0243, "step": 1786 }, { "epoch": 1.17, "grad_norm": 0.06766755878925323, "learning_rate": 0.0002013622189076933, "loss": 0.0121, "step": 1787 }, { "epoch": 1.17, "grad_norm": 0.00836429838091135, "learning_rate": 0.00020126534570772265, "loss": 0.0012, "step": 1788 }, { "epoch": 1.17, "grad_norm": 0.009742120280861855, "learning_rate": 0.00020116844829182065, "loss": 0.0013, "step": 1789 }, { "epoch": 1.17, "grad_norm": 0.593370258808136, "learning_rate": 0.00020107152670575826, "loss": 0.0362, "step": 1790 }, { "epoch": 1.17, "grad_norm": 0.09712370485067368, "learning_rate": 0.00020097458099531778, "loss": 0.0055, "step": 1791 }, { "epoch": 1.17, "grad_norm": 0.5568703413009644, "learning_rate": 0.00020087761120629296, "loss": 0.0747, "step": 1792 }, { "epoch": 1.17, "grad_norm": 0.073786161839962, "learning_rate": 0.00020078061738448881, "loss": 0.0113, "step": 1793 }, { "epoch": 1.17, "grad_norm": 0.011098073795437813, "learning_rate": 0.0002006835995757218, "loss": 0.001, "step": 1794 }, { "epoch": 1.18, "grad_norm": 0.009852485731244087, "learning_rate": 0.0002005865578258198, "loss": 0.0013, "step": 1795 }, { "epoch": 1.18, "grad_norm": 0.2939152717590332, "learning_rate": 0.00020048949218062174, "loss": 0.0346, "step": 1796 }, { "epoch": 1.18, "grad_norm": 0.2433510720729828, "learning_rate": 0.0002003924026859781, "loss": 0.02, "step": 1797 }, { "epoch": 1.18, "grad_norm": 0.02912173792719841, "learning_rate": 0.00020029528938775046, "loss": 0.0024, "step": 1798 }, { "epoch": 1.18, "grad_norm": 0.13098259270191193, "learning_rate": 0.0002001981523318117, "loss": 0.0236, "step": 1799 }, { "epoch": 1.18, "grad_norm": 0.15209761261940002, "learning_rate": 0.00020010099156404594, "loss": 0.0305, "step": 1800 }, { "epoch": 1.18, "grad_norm": 0.23245444893836975, "learning_rate": 0.00020000380713034848, "loss": 0.0488, "step": 1801 }, { "epoch": 1.18, "grad_norm": 0.048978038132190704, "learning_rate": 0.00019990659907662578, "loss": 0.0072, "step": 1802 }, { "epoch": 1.18, "grad_norm": 0.01486815232783556, "learning_rate": 0.00019980936744879552, "loss": 0.0021, "step": 1803 }, { "epoch": 1.18, "grad_norm": 0.07630421221256256, "learning_rate": 0.0001997121122927864, "loss": 0.0129, "step": 1804 }, { "epoch": 1.18, "grad_norm": 0.1630118042230606, "learning_rate": 0.00019961483365453842, "loss": 0.0247, "step": 1805 }, { "epoch": 1.18, "grad_norm": 0.20434342324733734, "learning_rate": 0.00019951753158000242, "loss": 0.0275, "step": 1806 }, { "epoch": 1.18, "grad_norm": 0.17411430180072784, "learning_rate": 0.00019942020611514056, "loss": 0.0485, "step": 1807 }, { "epoch": 1.18, "grad_norm": 0.2546854317188263, "learning_rate": 0.00019932285730592583, "loss": 0.0231, "step": 1808 }, { "epoch": 1.18, "grad_norm": 0.10782810300588608, "learning_rate": 0.0001992254851983425, "loss": 0.0299, "step": 1809 }, { "epoch": 1.18, "grad_norm": 0.12854281067848206, "learning_rate": 0.0001991280898383856, "loss": 0.0167, "step": 1810 }, { "epoch": 1.19, "grad_norm": 0.017508767545223236, "learning_rate": 0.00019903067127206124, "loss": 0.0025, "step": 1811 }, { "epoch": 1.19, "grad_norm": 0.07165428251028061, "learning_rate": 0.00019893322954538657, "loss": 0.0113, "step": 1812 }, { "epoch": 1.19, "grad_norm": 0.10676853358745575, "learning_rate": 0.0001988357647043895, "loss": 0.0035, "step": 1813 }, { "epoch": 1.19, "grad_norm": 0.2053932100534439, "learning_rate": 0.00019873827679510908, "loss": 0.0215, "step": 1814 }, { "epoch": 1.19, "grad_norm": 0.03081035614013672, "learning_rate": 0.00019864076586359513, "loss": 0.0047, "step": 1815 }, { "epoch": 1.19, "grad_norm": 0.0858052521944046, "learning_rate": 0.00019854323195590823, "loss": 0.0113, "step": 1816 }, { "epoch": 1.19, "grad_norm": 0.16202549636363983, "learning_rate": 0.00019844567511812002, "loss": 0.0088, "step": 1817 }, { "epoch": 1.19, "grad_norm": 0.09111412614583969, "learning_rate": 0.0001983480953963129, "loss": 0.0078, "step": 1818 }, { "epoch": 1.19, "grad_norm": 0.05848320201039314, "learning_rate": 0.0001982504928365801, "loss": 0.0035, "step": 1819 }, { "epoch": 1.19, "grad_norm": 0.1509551852941513, "learning_rate": 0.00019815286748502554, "loss": 0.0109, "step": 1820 }, { "epoch": 1.19, "grad_norm": 0.06057953089475632, "learning_rate": 0.00019805521938776402, "loss": 0.0043, "step": 1821 }, { "epoch": 1.19, "grad_norm": 0.11573519557714462, "learning_rate": 0.00019795754859092097, "loss": 0.0343, "step": 1822 }, { "epoch": 1.19, "grad_norm": 0.0522802509367466, "learning_rate": 0.0001978598551406327, "loss": 0.0054, "step": 1823 }, { "epoch": 1.19, "grad_norm": 0.08309350162744522, "learning_rate": 0.00019776213908304611, "loss": 0.0041, "step": 1824 }, { "epoch": 1.19, "grad_norm": 0.506611704826355, "learning_rate": 0.00019766440046431875, "loss": 0.062, "step": 1825 }, { "epoch": 1.2, "grad_norm": 0.05043810233473778, "learning_rate": 0.00019756663933061892, "loss": 0.0036, "step": 1826 }, { "epoch": 1.2, "grad_norm": 0.008669359609484673, "learning_rate": 0.0001974688557281255, "loss": 0.0008, "step": 1827 }, { "epoch": 1.2, "grad_norm": 0.24424995481967926, "learning_rate": 0.00019737104970302802, "loss": 0.0312, "step": 1828 }, { "epoch": 1.2, "grad_norm": 0.08624009788036346, "learning_rate": 0.00019727322130152656, "loss": 0.0471, "step": 1829 }, { "epoch": 1.2, "grad_norm": 0.0553121417760849, "learning_rate": 0.00019717537056983177, "loss": 0.0047, "step": 1830 }, { "epoch": 1.2, "grad_norm": 0.20766226947307587, "learning_rate": 0.00019707749755416487, "loss": 0.0187, "step": 1831 }, { "epoch": 1.2, "grad_norm": 0.03902578726410866, "learning_rate": 0.00019697960230075768, "loss": 0.0022, "step": 1832 }, { "epoch": 1.2, "grad_norm": 0.05113459378480911, "learning_rate": 0.00019688168485585233, "loss": 0.0059, "step": 1833 }, { "epoch": 1.2, "grad_norm": 0.07002965360879898, "learning_rate": 0.00019678374526570157, "loss": 0.0054, "step": 1834 }, { "epoch": 1.2, "grad_norm": 0.11523545533418655, "learning_rate": 0.00019668578357656864, "loss": 0.006, "step": 1835 }, { "epoch": 1.2, "grad_norm": 0.10022434592247009, "learning_rate": 0.00019658779983472714, "loss": 0.0086, "step": 1836 }, { "epoch": 1.2, "grad_norm": 0.3261854350566864, "learning_rate": 0.00019648979408646113, "loss": 0.0639, "step": 1837 }, { "epoch": 1.2, "grad_norm": 0.0735621452331543, "learning_rate": 0.000196391766378065, "loss": 0.004, "step": 1838 }, { "epoch": 1.2, "grad_norm": 0.02739633060991764, "learning_rate": 0.00019629371675584367, "loss": 0.0022, "step": 1839 }, { "epoch": 1.2, "grad_norm": 0.15281106531620026, "learning_rate": 0.0001961956452661122, "loss": 0.0086, "step": 1840 }, { "epoch": 1.21, "grad_norm": 0.04371574893593788, "learning_rate": 0.00019609755195519615, "loss": 0.0034, "step": 1841 }, { "epoch": 1.21, "grad_norm": 0.10713805258274078, "learning_rate": 0.00019599943686943126, "loss": 0.0065, "step": 1842 }, { "epoch": 1.21, "grad_norm": 0.059120483696460724, "learning_rate": 0.00019590130005516364, "loss": 0.006, "step": 1843 }, { "epoch": 1.21, "grad_norm": 0.1300182342529297, "learning_rate": 0.00019580314155874968, "loss": 0.0476, "step": 1844 }, { "epoch": 1.21, "grad_norm": 0.22915533185005188, "learning_rate": 0.00019570496142655598, "loss": 0.0257, "step": 1845 }, { "epoch": 1.21, "grad_norm": 0.4989432692527771, "learning_rate": 0.00019560675970495926, "loss": 0.0554, "step": 1846 }, { "epoch": 1.21, "grad_norm": 0.051195550709962845, "learning_rate": 0.0001955085364403466, "loss": 0.0044, "step": 1847 }, { "epoch": 1.21, "grad_norm": 0.36124640703201294, "learning_rate": 0.00019541029167911513, "loss": 0.0726, "step": 1848 }, { "epoch": 1.21, "grad_norm": 0.12776778638362885, "learning_rate": 0.0001953120254676723, "loss": 0.012, "step": 1849 }, { "epoch": 1.21, "grad_norm": 0.329092800617218, "learning_rate": 0.0001952137378524355, "loss": 0.0174, "step": 1850 }, { "epoch": 1.21, "grad_norm": 0.21430779993534088, "learning_rate": 0.00019511542887983233, "loss": 0.061, "step": 1851 }, { "epoch": 1.21, "grad_norm": 0.031340569257736206, "learning_rate": 0.00019501709859630047, "loss": 0.0051, "step": 1852 }, { "epoch": 1.21, "grad_norm": 0.23494398593902588, "learning_rate": 0.00019491874704828766, "loss": 0.0568, "step": 1853 }, { "epoch": 1.21, "grad_norm": 0.19756872951984406, "learning_rate": 0.00019482037428225166, "loss": 0.027, "step": 1854 }, { "epoch": 1.21, "grad_norm": 0.1831345558166504, "learning_rate": 0.00019472198034466032, "loss": 0.0213, "step": 1855 }, { "epoch": 1.22, "grad_norm": 0.0714825913310051, "learning_rate": 0.00019462356528199138, "loss": 0.0264, "step": 1856 }, { "epoch": 1.22, "grad_norm": 0.09632866829633713, "learning_rate": 0.0001945251291407327, "loss": 0.0123, "step": 1857 }, { "epoch": 1.22, "grad_norm": 0.28308427333831787, "learning_rate": 0.00019442667196738192, "loss": 0.0718, "step": 1858 }, { "epoch": 1.22, "grad_norm": 0.1654062271118164, "learning_rate": 0.00019432819380844687, "loss": 0.046, "step": 1859 }, { "epoch": 1.22, "grad_norm": 0.044118259102106094, "learning_rate": 0.00019422969471044501, "loss": 0.0055, "step": 1860 }, { "epoch": 1.22, "grad_norm": 0.06968650221824646, "learning_rate": 0.00019413117471990386, "loss": 0.0137, "step": 1861 }, { "epoch": 1.22, "grad_norm": 0.03375468775629997, "learning_rate": 0.0001940326338833608, "loss": 0.0066, "step": 1862 }, { "epoch": 1.22, "grad_norm": 0.15991543233394623, "learning_rate": 0.00019393407224736306, "loss": 0.0316, "step": 1863 }, { "epoch": 1.22, "grad_norm": 0.09309843927621841, "learning_rate": 0.00019383548985846754, "loss": 0.0178, "step": 1864 }, { "epoch": 1.22, "grad_norm": 0.14062856137752533, "learning_rate": 0.00019373688676324114, "loss": 0.0315, "step": 1865 }, { "epoch": 1.22, "grad_norm": 0.09427135437726974, "learning_rate": 0.00019363826300826043, "loss": 0.0516, "step": 1866 }, { "epoch": 1.22, "grad_norm": 0.12931552529335022, "learning_rate": 0.00019353961864011183, "loss": 0.0208, "step": 1867 }, { "epoch": 1.22, "grad_norm": 0.06832294166088104, "learning_rate": 0.0001934409537053914, "loss": 0.0249, "step": 1868 }, { "epoch": 1.22, "grad_norm": 0.1043616384267807, "learning_rate": 0.00019334226825070493, "loss": 0.0132, "step": 1869 }, { "epoch": 1.22, "grad_norm": 0.04455610364675522, "learning_rate": 0.000193243562322668, "loss": 0.0068, "step": 1870 }, { "epoch": 1.22, "grad_norm": 0.09259182214736938, "learning_rate": 0.00019314483596790576, "loss": 0.0084, "step": 1871 }, { "epoch": 1.23, "grad_norm": 0.15175136923789978, "learning_rate": 0.00019304608923305302, "loss": 0.0299, "step": 1872 }, { "epoch": 1.23, "grad_norm": 0.3181014657020569, "learning_rate": 0.00019294732216475427, "loss": 0.05, "step": 1873 }, { "epoch": 1.23, "grad_norm": 0.029958350583910942, "learning_rate": 0.00019284853480966354, "loss": 0.0048, "step": 1874 }, { "epoch": 1.23, "grad_norm": 0.1685924082994461, "learning_rate": 0.00019274972721444446, "loss": 0.0271, "step": 1875 }, { "epoch": 1.23, "grad_norm": 0.08830788731575012, "learning_rate": 0.00019265089942577027, "loss": 0.0099, "step": 1876 }, { "epoch": 1.23, "grad_norm": 0.20782901346683502, "learning_rate": 0.00019255205149032375, "loss": 0.0558, "step": 1877 }, { "epoch": 1.23, "grad_norm": 0.12139201164245605, "learning_rate": 0.00019245318345479707, "loss": 0.0161, "step": 1878 }, { "epoch": 1.23, "grad_norm": 0.14500971138477325, "learning_rate": 0.00019235429536589203, "loss": 0.0129, "step": 1879 }, { "epoch": 1.23, "grad_norm": 0.18900787830352783, "learning_rate": 0.0001922553872703198, "loss": 0.0161, "step": 1880 }, { "epoch": 1.23, "grad_norm": 0.04411626234650612, "learning_rate": 0.0001921564592148012, "loss": 0.0038, "step": 1881 }, { "epoch": 1.23, "grad_norm": 0.08428865671157837, "learning_rate": 0.0001920575112460662, "loss": 0.0048, "step": 1882 }, { "epoch": 1.23, "grad_norm": 0.12923188507556915, "learning_rate": 0.0001919585434108543, "loss": 0.0111, "step": 1883 }, { "epoch": 1.23, "grad_norm": 0.02636256441473961, "learning_rate": 0.00019185955575591452, "loss": 0.0032, "step": 1884 }, { "epoch": 1.23, "grad_norm": 0.297539621591568, "learning_rate": 0.00019176054832800498, "loss": 0.0659, "step": 1885 }, { "epoch": 1.23, "grad_norm": 0.1499989926815033, "learning_rate": 0.00019166152117389344, "loss": 0.0129, "step": 1886 }, { "epoch": 1.24, "grad_norm": 0.4077751040458679, "learning_rate": 0.00019156247434035665, "loss": 0.0447, "step": 1887 }, { "epoch": 1.24, "grad_norm": 0.04810630530118942, "learning_rate": 0.0001914634078741809, "loss": 0.004, "step": 1888 }, { "epoch": 1.24, "grad_norm": 0.07159969210624695, "learning_rate": 0.00019136432182216166, "loss": 0.0044, "step": 1889 }, { "epoch": 1.24, "grad_norm": 0.09493908286094666, "learning_rate": 0.00019126521623110375, "loss": 0.0071, "step": 1890 }, { "epoch": 1.24, "grad_norm": 0.04515552520751953, "learning_rate": 0.00019116609114782097, "loss": 0.0039, "step": 1891 }, { "epoch": 1.24, "grad_norm": 0.01067087147384882, "learning_rate": 0.00019106694661913664, "loss": 0.0017, "step": 1892 }, { "epoch": 1.24, "grad_norm": 0.15949301421642303, "learning_rate": 0.00019096778269188302, "loss": 0.0077, "step": 1893 }, { "epoch": 1.24, "grad_norm": 0.01298619620501995, "learning_rate": 0.00019086859941290174, "loss": 0.0019, "step": 1894 }, { "epoch": 1.24, "grad_norm": 0.009331258945167065, "learning_rate": 0.00019076939682904337, "loss": 0.001, "step": 1895 }, { "epoch": 1.24, "grad_norm": 0.23378711938858032, "learning_rate": 0.00019067017498716773, "loss": 0.029, "step": 1896 }, { "epoch": 1.24, "grad_norm": 0.2589947581291199, "learning_rate": 0.00019057093393414366, "loss": 0.0386, "step": 1897 }, { "epoch": 1.24, "grad_norm": 0.4780135750770569, "learning_rate": 0.00019047167371684918, "loss": 0.0401, "step": 1898 }, { "epoch": 1.24, "grad_norm": 0.31437739729881287, "learning_rate": 0.00019037239438217127, "loss": 0.0684, "step": 1899 }, { "epoch": 1.24, "grad_norm": 0.03697904944419861, "learning_rate": 0.00019027309597700594, "loss": 0.0036, "step": 1900 }, { "epoch": 1.24, "grad_norm": 0.1111644059419632, "learning_rate": 0.00019017377854825828, "loss": 0.006, "step": 1901 }, { "epoch": 1.25, "grad_norm": 0.1257040649652481, "learning_rate": 0.00019007444214284226, "loss": 0.0168, "step": 1902 }, { "epoch": 1.25, "grad_norm": 0.2534196376800537, "learning_rate": 0.00018997508680768097, "loss": 0.0263, "step": 1903 }, { "epoch": 1.25, "grad_norm": 0.053403884172439575, "learning_rate": 0.00018987571258970626, "loss": 0.003, "step": 1904 }, { "epoch": 1.25, "grad_norm": 0.22898997366428375, "learning_rate": 0.00018977631953585902, "loss": 0.0212, "step": 1905 }, { "epoch": 1.25, "grad_norm": 0.13535194098949432, "learning_rate": 0.00018967690769308894, "loss": 0.0087, "step": 1906 }, { "epoch": 1.25, "grad_norm": 0.16851268708705902, "learning_rate": 0.00018957747710835482, "loss": 0.0388, "step": 1907 }, { "epoch": 1.25, "grad_norm": 0.21347947418689728, "learning_rate": 0.00018947802782862396, "loss": 0.0332, "step": 1908 }, { "epoch": 1.25, "grad_norm": 0.0564715713262558, "learning_rate": 0.00018937855990087276, "loss": 0.0036, "step": 1909 }, { "epoch": 1.25, "grad_norm": 0.06032940000295639, "learning_rate": 0.0001892790733720863, "loss": 0.0067, "step": 1910 }, { "epoch": 1.25, "eval_loss": 0.032177336513996124, "eval_runtime": 39.9387, "eval_samples_per_second": 32.224, "eval_steps_per_second": 8.062, "step": 1910 }, { "epoch": 1.25, "grad_norm": 0.11383625119924545, "learning_rate": 0.00018917956828925857, "loss": 0.0152, "step": 1911 }, { "epoch": 1.25, "grad_norm": 0.15107430517673492, "learning_rate": 0.00018908004469939216, "loss": 0.0511, "step": 1912 }, { "epoch": 1.25, "grad_norm": 0.403848260641098, "learning_rate": 0.00018898050264949852, "loss": 0.0206, "step": 1913 }, { "epoch": 1.25, "grad_norm": 0.09853165596723557, "learning_rate": 0.00018888094218659778, "loss": 0.0186, "step": 1914 }, { "epoch": 1.25, "grad_norm": 0.1771824061870575, "learning_rate": 0.00018878136335771876, "loss": 0.0221, "step": 1915 }, { "epoch": 1.25, "grad_norm": 0.0342799611389637, "learning_rate": 0.000188681766209899, "loss": 0.004, "step": 1916 }, { "epoch": 1.25, "grad_norm": 0.14599063992500305, "learning_rate": 0.0001885821507901846, "loss": 0.0616, "step": 1917 }, { "epoch": 1.26, "grad_norm": 0.09153589606285095, "learning_rate": 0.0001884825171456304, "loss": 0.0093, "step": 1918 }, { "epoch": 1.26, "grad_norm": 0.040928881615400314, "learning_rate": 0.0001883828653232998, "loss": 0.0055, "step": 1919 }, { "epoch": 1.26, "grad_norm": 0.0882481262087822, "learning_rate": 0.00018828319537026475, "loss": 0.0115, "step": 1920 }, { "epoch": 1.26, "grad_norm": 0.22515325248241425, "learning_rate": 0.00018818350733360584, "loss": 0.0484, "step": 1921 }, { "epoch": 1.26, "grad_norm": 0.2775671184062958, "learning_rate": 0.00018808380126041215, "loss": 0.0412, "step": 1922 }, { "epoch": 1.26, "grad_norm": 0.20490968227386475, "learning_rate": 0.00018798407719778127, "loss": 0.0146, "step": 1923 }, { "epoch": 1.26, "grad_norm": 0.017601322382688522, "learning_rate": 0.00018788433519281933, "loss": 0.0032, "step": 1924 }, { "epoch": 1.26, "grad_norm": 0.061190057545900345, "learning_rate": 0.00018778457529264098, "loss": 0.0062, "step": 1925 }, { "epoch": 1.26, "grad_norm": 0.011243225075304508, "learning_rate": 0.00018768479754436917, "loss": 0.0019, "step": 1926 }, { "epoch": 1.26, "grad_norm": 0.21566064655780792, "learning_rate": 0.0001875850019951354, "loss": 0.0197, "step": 1927 }, { "epoch": 1.26, "grad_norm": 0.14111027121543884, "learning_rate": 0.00018748518869207952, "loss": 0.0164, "step": 1928 }, { "epoch": 1.26, "grad_norm": 0.17709197103977203, "learning_rate": 0.00018738535768234984, "loss": 0.0173, "step": 1929 }, { "epoch": 1.26, "grad_norm": 0.11170878261327744, "learning_rate": 0.00018728550901310297, "loss": 0.0103, "step": 1930 }, { "epoch": 1.26, "grad_norm": 0.10775300115346909, "learning_rate": 0.00018718564273150387, "loss": 0.0061, "step": 1931 }, { "epoch": 1.26, "grad_norm": 0.08073946833610535, "learning_rate": 0.00018708575888472587, "loss": 0.0076, "step": 1932 }, { "epoch": 1.27, "grad_norm": 0.2539899945259094, "learning_rate": 0.0001869858575199505, "loss": 0.0326, "step": 1933 }, { "epoch": 1.27, "grad_norm": 0.046343106776475906, "learning_rate": 0.0001868859386843677, "loss": 0.0057, "step": 1934 }, { "epoch": 1.27, "grad_norm": 0.09674936532974243, "learning_rate": 0.00018678600242517547, "loss": 0.0081, "step": 1935 }, { "epoch": 1.27, "grad_norm": 0.006561139598488808, "learning_rate": 0.00018668604878958027, "loss": 0.0011, "step": 1936 }, { "epoch": 1.27, "grad_norm": 0.003308130893856287, "learning_rate": 0.00018658607782479653, "loss": 0.0006, "step": 1937 }, { "epoch": 1.27, "grad_norm": 0.1847364902496338, "learning_rate": 0.0001864860895780471, "loss": 0.0128, "step": 1938 }, { "epoch": 1.27, "grad_norm": 0.4606344699859619, "learning_rate": 0.00018638608409656288, "loss": 0.1213, "step": 1939 }, { "epoch": 1.27, "grad_norm": 0.005895258858799934, "learning_rate": 0.00018628606142758285, "loss": 0.0009, "step": 1940 }, { "epoch": 1.27, "grad_norm": 0.37326616048812866, "learning_rate": 0.0001861860216183542, "loss": 0.0808, "step": 1941 }, { "epoch": 1.27, "grad_norm": 0.04593970999121666, "learning_rate": 0.00018608596471613215, "loss": 0.0024, "step": 1942 }, { "epoch": 1.27, "grad_norm": 0.16561304032802582, "learning_rate": 0.00018598589076818014, "loss": 0.0396, "step": 1943 }, { "epoch": 1.27, "grad_norm": 0.2503207325935364, "learning_rate": 0.00018588579982176944, "loss": 0.0538, "step": 1944 }, { "epoch": 1.27, "grad_norm": 0.09036950767040253, "learning_rate": 0.0001857856919241795, "loss": 0.0102, "step": 1945 }, { "epoch": 1.27, "grad_norm": 0.1274523138999939, "learning_rate": 0.00018568556712269776, "loss": 0.0291, "step": 1946 }, { "epoch": 1.27, "grad_norm": 0.028810936957597733, "learning_rate": 0.00018558542546461964, "loss": 0.0053, "step": 1947 }, { "epoch": 1.28, "grad_norm": 0.15604527294635773, "learning_rate": 0.0001854852669972484, "loss": 0.0171, "step": 1948 }, { "epoch": 1.28, "grad_norm": 0.06874702125787735, "learning_rate": 0.00018538509176789546, "loss": 0.0089, "step": 1949 }, { "epoch": 1.28, "grad_norm": 0.09649529308080673, "learning_rate": 0.00018528489982388006, "loss": 0.0164, "step": 1950 }, { "epoch": 1.28, "grad_norm": 0.041638266295194626, "learning_rate": 0.0001851846912125292, "loss": 0.008, "step": 1951 }, { "epoch": 1.28, "grad_norm": 0.08413052558898926, "learning_rate": 0.00018508446598117806, "loss": 0.0178, "step": 1952 }, { "epoch": 1.28, "grad_norm": 0.09511756896972656, "learning_rate": 0.00018498422417716928, "loss": 0.0171, "step": 1953 }, { "epoch": 1.28, "grad_norm": 0.13475637137889862, "learning_rate": 0.00018488396584785365, "loss": 0.023, "step": 1954 }, { "epoch": 1.28, "grad_norm": 0.04502255469560623, "learning_rate": 0.00018478369104058963, "loss": 0.0093, "step": 1955 }, { "epoch": 1.28, "grad_norm": 0.030024804174900055, "learning_rate": 0.00018468339980274353, "loss": 0.0042, "step": 1956 }, { "epoch": 1.28, "grad_norm": 0.10409369319677353, "learning_rate": 0.00018458309218168925, "loss": 0.0224, "step": 1957 }, { "epoch": 1.28, "grad_norm": 0.2133146971464157, "learning_rate": 0.00018448276822480866, "loss": 0.0747, "step": 1958 }, { "epoch": 1.28, "grad_norm": 0.17796148359775543, "learning_rate": 0.0001843824279794912, "loss": 0.0135, "step": 1959 }, { "epoch": 1.28, "grad_norm": 0.07230894267559052, "learning_rate": 0.00018428207149313403, "loss": 0.0096, "step": 1960 }, { "epoch": 1.28, "grad_norm": 0.15250401198863983, "learning_rate": 0.00018418169881314207, "loss": 0.016, "step": 1961 }, { "epoch": 1.28, "grad_norm": 0.0743720754981041, "learning_rate": 0.00018408130998692773, "loss": 0.0062, "step": 1962 }, { "epoch": 1.29, "grad_norm": 0.14948879182338715, "learning_rate": 0.00018398090506191114, "loss": 0.0237, "step": 1963 }, { "epoch": 1.29, "grad_norm": 0.12644903361797333, "learning_rate": 0.00018388048408552008, "loss": 0.0223, "step": 1964 }, { "epoch": 1.29, "grad_norm": 0.05275022238492966, "learning_rate": 0.00018378004710518984, "loss": 0.0051, "step": 1965 }, { "epoch": 1.29, "grad_norm": 0.18393655121326447, "learning_rate": 0.00018367959416836332, "loss": 0.0119, "step": 1966 }, { "epoch": 1.29, "grad_norm": 0.2449781447649002, "learning_rate": 0.00018357912532249076, "loss": 0.0262, "step": 1967 }, { "epoch": 1.29, "grad_norm": 0.05197291448712349, "learning_rate": 0.00018347864061503028, "loss": 0.0072, "step": 1968 }, { "epoch": 1.29, "grad_norm": 0.07551299780607224, "learning_rate": 0.00018337814009344714, "loss": 0.0081, "step": 1969 }, { "epoch": 1.29, "grad_norm": 0.05459301918745041, "learning_rate": 0.00018327762380521438, "loss": 0.0047, "step": 1970 }, { "epoch": 1.29, "grad_norm": 0.0383986234664917, "learning_rate": 0.0001831770917978122, "loss": 0.0034, "step": 1971 }, { "epoch": 1.29, "grad_norm": 0.0700681209564209, "learning_rate": 0.00018307654411872838, "loss": 0.0068, "step": 1972 }, { "epoch": 1.29, "grad_norm": 0.28609392046928406, "learning_rate": 0.0001829759808154581, "loss": 0.0608, "step": 1973 }, { "epoch": 1.29, "grad_norm": 0.21615540981292725, "learning_rate": 0.0001828754019355039, "loss": 0.0088, "step": 1974 }, { "epoch": 1.29, "grad_norm": 0.15507575869560242, "learning_rate": 0.0001827748075263757, "loss": 0.0081, "step": 1975 }, { "epoch": 1.29, "grad_norm": 0.2063005119562149, "learning_rate": 0.0001826741976355907, "loss": 0.0072, "step": 1976 }, { "epoch": 1.29, "grad_norm": 0.23691704869270325, "learning_rate": 0.0001825735723106734, "loss": 0.037, "step": 1977 }, { "epoch": 1.29, "grad_norm": 0.05298379436135292, "learning_rate": 0.0001824729315991557, "loss": 0.0041, "step": 1978 }, { "epoch": 1.3, "grad_norm": 0.22102193534374237, "learning_rate": 0.00018237227554857672, "loss": 0.0117, "step": 1979 }, { "epoch": 1.3, "grad_norm": 0.1501353681087494, "learning_rate": 0.00018227160420648274, "loss": 0.0397, "step": 1980 }, { "epoch": 1.3, "grad_norm": 0.402899831533432, "learning_rate": 0.00018217091762042737, "loss": 0.0821, "step": 1981 }, { "epoch": 1.3, "grad_norm": 0.20166534185409546, "learning_rate": 0.0001820702158379714, "loss": 0.0285, "step": 1982 }, { "epoch": 1.3, "grad_norm": 0.13613693416118622, "learning_rate": 0.00018196949890668276, "loss": 0.006, "step": 1983 }, { "epoch": 1.3, "grad_norm": 0.19995389878749847, "learning_rate": 0.00018186876687413655, "loss": 0.0152, "step": 1984 }, { "epoch": 1.3, "grad_norm": 0.2411389946937561, "learning_rate": 0.00018176801978791497, "loss": 0.0546, "step": 1985 }, { "epoch": 1.3, "grad_norm": 0.022495824843645096, "learning_rate": 0.00018166725769560747, "loss": 0.0022, "step": 1986 }, { "epoch": 1.3, "grad_norm": 0.07184285670518875, "learning_rate": 0.00018156648064481044, "loss": 0.0058, "step": 1987 }, { "epoch": 1.3, "grad_norm": 0.13286525011062622, "learning_rate": 0.00018146568868312733, "loss": 0.0254, "step": 1988 }, { "epoch": 1.3, "grad_norm": 0.2439001351594925, "learning_rate": 0.00018136488185816878, "loss": 0.0457, "step": 1989 }, { "epoch": 1.3, "grad_norm": 0.1679784655570984, "learning_rate": 0.00018126406021755232, "loss": 0.0274, "step": 1990 }, { "epoch": 1.3, "grad_norm": 0.1093001514673233, "learning_rate": 0.00018116322380890248, "loss": 0.0107, "step": 1991 }, { "epoch": 1.3, "grad_norm": 0.08656150102615356, "learning_rate": 0.0001810623726798509, "loss": 0.0075, "step": 1992 }, { "epoch": 1.3, "grad_norm": 0.06291065365076065, "learning_rate": 0.00018096150687803598, "loss": 0.0058, "step": 1993 }, { "epoch": 1.31, "grad_norm": 0.06546434015035629, "learning_rate": 0.00018086062645110318, "loss": 0.0082, "step": 1994 }, { "epoch": 1.31, "grad_norm": 0.09988091886043549, "learning_rate": 0.00018075973144670486, "loss": 0.0239, "step": 1995 }, { "epoch": 1.31, "grad_norm": 0.30109840631484985, "learning_rate": 0.0001806588219125002, "loss": 0.0297, "step": 1996 }, { "epoch": 1.31, "grad_norm": 0.05507725849747658, "learning_rate": 0.00018055789789615532, "loss": 0.004, "step": 1997 }, { "epoch": 1.31, "grad_norm": 0.09339085221290588, "learning_rate": 0.00018045695944534314, "loss": 0.0109, "step": 1998 }, { "epoch": 1.31, "grad_norm": 0.059944991022348404, "learning_rate": 0.00018035600660774336, "loss": 0.0085, "step": 1999 }, { "epoch": 1.31, "grad_norm": 0.09726294130086899, "learning_rate": 0.00018025503943104262, "loss": 0.0093, "step": 2000 }, { "epoch": 1.31, "grad_norm": 0.07707206904888153, "learning_rate": 0.00018015405796293417, "loss": 0.0067, "step": 2001 }, { "epoch": 1.31, "grad_norm": 0.027958236634731293, "learning_rate": 0.00018005306225111803, "loss": 0.0035, "step": 2002 }, { "epoch": 1.31, "grad_norm": 0.2533860206604004, "learning_rate": 0.00017995205234330107, "loss": 0.0626, "step": 2003 }, { "epoch": 1.31, "grad_norm": 0.041864681988954544, "learning_rate": 0.00017985102828719675, "loss": 0.0067, "step": 2004 }, { "epoch": 1.31, "grad_norm": 0.08300785720348358, "learning_rate": 0.00017974999013052527, "loss": 0.0311, "step": 2005 }, { "epoch": 1.31, "grad_norm": 0.054343078285455704, "learning_rate": 0.00017964893792101345, "loss": 0.0052, "step": 2006 }, { "epoch": 1.31, "grad_norm": 0.007766898721456528, "learning_rate": 0.00017954787170639476, "loss": 0.0014, "step": 2007 }, { "epoch": 1.31, "grad_norm": 0.18020589649677277, "learning_rate": 0.00017944679153440935, "loss": 0.0424, "step": 2008 }, { "epoch": 1.32, "grad_norm": 0.07235695421695709, "learning_rate": 0.00017934569745280392, "loss": 0.009, "step": 2009 }, { "epoch": 1.32, "grad_norm": 0.04374720901250839, "learning_rate": 0.00017924458950933163, "loss": 0.0036, "step": 2010 }, { "epoch": 1.32, "grad_norm": 0.12976546585559845, "learning_rate": 0.00017914346775175236, "loss": 0.0083, "step": 2011 }, { "epoch": 1.32, "grad_norm": 0.13522937893867493, "learning_rate": 0.0001790423322278324, "loss": 0.0142, "step": 2012 }, { "epoch": 1.32, "grad_norm": 0.17489002645015717, "learning_rate": 0.0001789411829853446, "loss": 0.0192, "step": 2013 }, { "epoch": 1.32, "grad_norm": 0.07798654586076736, "learning_rate": 0.00017884002007206837, "loss": 0.0071, "step": 2014 }, { "epoch": 1.32, "grad_norm": 0.03286200389266014, "learning_rate": 0.00017873884353578935, "loss": 0.0018, "step": 2015 }, { "epoch": 1.32, "grad_norm": 0.10676159709692001, "learning_rate": 0.00017863765342429977, "loss": 0.0046, "step": 2016 }, { "epoch": 1.32, "grad_norm": 0.18042497336864471, "learning_rate": 0.00017853644978539835, "loss": 0.0199, "step": 2017 }, { "epoch": 1.32, "grad_norm": 0.21726305782794952, "learning_rate": 0.00017843523266688994, "loss": 0.0173, "step": 2018 }, { "epoch": 1.32, "grad_norm": 0.03823615238070488, "learning_rate": 0.00017833400211658606, "loss": 0.003, "step": 2019 }, { "epoch": 1.32, "grad_norm": 0.0042721726931631565, "learning_rate": 0.00017823275818230436, "loss": 0.0005, "step": 2020 }, { "epoch": 1.32, "grad_norm": 0.21143902838230133, "learning_rate": 0.00017813150091186886, "loss": 0.0095, "step": 2021 }, { "epoch": 1.32, "grad_norm": 0.14743442833423615, "learning_rate": 0.00017803023035311, "loss": 0.0505, "step": 2022 }, { "epoch": 1.32, "grad_norm": 0.02131885476410389, "learning_rate": 0.0001779289465538643, "loss": 0.0021, "step": 2023 }, { "epoch": 1.33, "grad_norm": 0.15741504728794098, "learning_rate": 0.00017782764956197474, "loss": 0.0061, "step": 2024 }, { "epoch": 1.33, "grad_norm": 0.016739048063755035, "learning_rate": 0.00017772633942529032, "loss": 0.002, "step": 2025 }, { "epoch": 1.33, "grad_norm": 0.14672209322452545, "learning_rate": 0.00017762501619166638, "loss": 0.039, "step": 2026 }, { "epoch": 1.33, "grad_norm": 0.1252099722623825, "learning_rate": 0.00017752367990896446, "loss": 0.0064, "step": 2027 }, { "epoch": 1.33, "grad_norm": 0.322226881980896, "learning_rate": 0.0001774223306250523, "loss": 0.0363, "step": 2028 }, { "epoch": 1.33, "grad_norm": 0.4759753346443176, "learning_rate": 0.00017732096838780353, "loss": 0.1034, "step": 2029 }, { "epoch": 1.33, "grad_norm": 0.01636957749724388, "learning_rate": 0.00017721959324509815, "loss": 0.0023, "step": 2030 }, { "epoch": 1.33, "grad_norm": 0.09087517112493515, "learning_rate": 0.00017711820524482223, "loss": 0.0034, "step": 2031 }, { "epoch": 1.33, "grad_norm": 0.10265547037124634, "learning_rate": 0.00017701680443486784, "loss": 0.0195, "step": 2032 }, { "epoch": 1.33, "grad_norm": 0.018688971176743507, "learning_rate": 0.00017691539086313307, "loss": 0.0025, "step": 2033 }, { "epoch": 1.33, "grad_norm": 0.04474179074168205, "learning_rate": 0.00017681396457752221, "loss": 0.0046, "step": 2034 }, { "epoch": 1.33, "grad_norm": 0.11852456629276276, "learning_rate": 0.00017671252562594531, "loss": 0.0229, "step": 2035 }, { "epoch": 1.33, "grad_norm": 0.20545479655265808, "learning_rate": 0.00017661107405631866, "loss": 0.0247, "step": 2036 }, { "epoch": 1.33, "grad_norm": 0.030622253194451332, "learning_rate": 0.00017650960991656432, "loss": 0.0021, "step": 2037 }, { "epoch": 1.33, "grad_norm": 0.0707719549536705, "learning_rate": 0.0001764081332546103, "loss": 0.0067, "step": 2038 }, { "epoch": 1.33, "grad_norm": 0.07734823226928711, "learning_rate": 0.00017630664411839064, "loss": 0.0174, "step": 2039 }, { "epoch": 1.34, "grad_norm": 0.15264584124088287, "learning_rate": 0.00017620514255584522, "loss": 0.0143, "step": 2040 }, { "epoch": 1.34, "grad_norm": 0.016753699630498886, "learning_rate": 0.00017610362861491977, "loss": 0.0024, "step": 2041 }, { "epoch": 1.34, "grad_norm": 0.4822998642921448, "learning_rate": 0.00017600210234356586, "loss": 0.0165, "step": 2042 }, { "epoch": 1.34, "grad_norm": 0.04116729274392128, "learning_rate": 0.00017590056378974088, "loss": 0.0053, "step": 2043 }, { "epoch": 1.34, "grad_norm": 0.024770237505435944, "learning_rate": 0.00017579901300140808, "loss": 0.0039, "step": 2044 }, { "epoch": 1.34, "grad_norm": 0.03564632683992386, "learning_rate": 0.00017569745002653646, "loss": 0.0042, "step": 2045 }, { "epoch": 1.34, "grad_norm": 0.20899443328380585, "learning_rate": 0.0001755958749131007, "loss": 0.0275, "step": 2046 }, { "epoch": 1.34, "grad_norm": 0.01576872169971466, "learning_rate": 0.00017549428770908136, "loss": 0.0015, "step": 2047 }, { "epoch": 1.34, "grad_norm": 0.0382314994931221, "learning_rate": 0.00017539268846246457, "loss": 0.0041, "step": 2048 }, { "epoch": 1.34, "grad_norm": 0.0657842680811882, "learning_rate": 0.00017529107722124223, "loss": 0.0074, "step": 2049 }, { "epoch": 1.34, "grad_norm": 0.11808799207210541, "learning_rate": 0.00017518945403341196, "loss": 0.0127, "step": 2050 }, { "epoch": 1.34, "grad_norm": 0.12941673398017883, "learning_rate": 0.00017508781894697684, "loss": 0.0157, "step": 2051 }, { "epoch": 1.34, "grad_norm": 0.2390335202217102, "learning_rate": 0.00017498617200994572, "loss": 0.0137, "step": 2052 }, { "epoch": 1.34, "grad_norm": 0.06445050239562988, "learning_rate": 0.00017488451327033304, "loss": 0.0028, "step": 2053 }, { "epoch": 1.34, "grad_norm": 0.26266834139823914, "learning_rate": 0.00017478284277615876, "loss": 0.0093, "step": 2054 }, { "epoch": 1.35, "grad_norm": 0.2604714035987854, "learning_rate": 0.0001746811605754484, "loss": 0.0519, "step": 2055 }, { "epoch": 1.35, "grad_norm": 0.23091405630111694, "learning_rate": 0.00017457946671623305, "loss": 0.0376, "step": 2056 }, { "epoch": 1.35, "grad_norm": 0.630933403968811, "learning_rate": 0.00017447776124654925, "loss": 0.0726, "step": 2057 }, { "epoch": 1.35, "grad_norm": 0.15399132668972015, "learning_rate": 0.00017437604421443914, "loss": 0.0186, "step": 2058 }, { "epoch": 1.35, "grad_norm": 0.3804969787597656, "learning_rate": 0.00017427431566795012, "loss": 0.0533, "step": 2059 }, { "epoch": 1.35, "grad_norm": 0.26310884952545166, "learning_rate": 0.00017417257565513524, "loss": 0.0431, "step": 2060 }, { "epoch": 1.35, "grad_norm": 0.10924821346998215, "learning_rate": 0.0001740708242240528, "loss": 0.0557, "step": 2061 }, { "epoch": 1.35, "grad_norm": 0.021753892302513123, "learning_rate": 0.00017396906142276664, "loss": 0.0025, "step": 2062 }, { "epoch": 1.35, "grad_norm": 0.05231242626905441, "learning_rate": 0.00017386728729934587, "loss": 0.0039, "step": 2063 }, { "epoch": 1.35, "grad_norm": 0.03753997012972832, "learning_rate": 0.000173765501901865, "loss": 0.0039, "step": 2064 }, { "epoch": 1.35, "grad_norm": 0.022054225206375122, "learning_rate": 0.00017366370527840377, "loss": 0.003, "step": 2065 }, { "epoch": 1.35, "grad_norm": 0.07640133053064346, "learning_rate": 0.00017356189747704735, "loss": 0.0082, "step": 2066 }, { "epoch": 1.35, "grad_norm": 0.1012224480509758, "learning_rate": 0.00017346007854588617, "loss": 0.0191, "step": 2067 }, { "epoch": 1.35, "grad_norm": 0.33267688751220703, "learning_rate": 0.00017335824853301584, "loss": 0.0347, "step": 2068 }, { "epoch": 1.35, "grad_norm": 0.061131980270147324, "learning_rate": 0.00017325640748653718, "loss": 0.0096, "step": 2069 }, { "epoch": 1.36, "grad_norm": 0.16046808660030365, "learning_rate": 0.00017315455545455636, "loss": 0.0202, "step": 2070 }, { "epoch": 1.36, "grad_norm": 0.048775166273117065, "learning_rate": 0.00017305269248518468, "loss": 0.0098, "step": 2071 }, { "epoch": 1.36, "grad_norm": 0.17278356850147247, "learning_rate": 0.0001729508186265386, "loss": 0.0222, "step": 2072 }, { "epoch": 1.36, "grad_norm": 0.0673917904496193, "learning_rate": 0.0001728489339267397, "loss": 0.0071, "step": 2073 }, { "epoch": 1.36, "grad_norm": 0.0350324921309948, "learning_rate": 0.00017274703843391467, "loss": 0.0041, "step": 2074 }, { "epoch": 1.36, "grad_norm": 0.07471290230751038, "learning_rate": 0.00017264513219619534, "loss": 0.0083, "step": 2075 }, { "epoch": 1.36, "grad_norm": 0.10071226209402084, "learning_rate": 0.00017254321526171862, "loss": 0.0406, "step": 2076 }, { "epoch": 1.36, "grad_norm": 0.11408775299787521, "learning_rate": 0.0001724412876786265, "loss": 0.0095, "step": 2077 }, { "epoch": 1.36, "grad_norm": 0.030934764072299004, "learning_rate": 0.00017233934949506584, "loss": 0.0035, "step": 2078 }, { "epoch": 1.36, "grad_norm": 0.05950973555445671, "learning_rate": 0.00017223740075918872, "loss": 0.0065, "step": 2079 }, { "epoch": 1.36, "grad_norm": 0.014024289324879646, "learning_rate": 0.00017213544151915204, "loss": 0.0016, "step": 2080 }, { "epoch": 1.36, "grad_norm": 0.14681045711040497, "learning_rate": 0.00017203347182311783, "loss": 0.0097, "step": 2081 }, { "epoch": 1.36, "grad_norm": 0.04823039472103119, "learning_rate": 0.00017193149171925286, "loss": 0.0052, "step": 2082 }, { "epoch": 1.36, "grad_norm": 0.10540402680635452, "learning_rate": 0.00017182950125572892, "loss": 0.011, "step": 2083 }, { "epoch": 1.36, "grad_norm": 0.20969213545322418, "learning_rate": 0.00017172750048072277, "loss": 0.0138, "step": 2084 }, { "epoch": 1.36, "grad_norm": 0.31061890721321106, "learning_rate": 0.0001716254894424159, "loss": 0.0532, "step": 2085 }, { "epoch": 1.37, "grad_norm": 0.20670346915721893, "learning_rate": 0.00017152346818899468, "loss": 0.0326, "step": 2086 }, { "epoch": 1.37, "grad_norm": 0.00548940384760499, "learning_rate": 0.00017142143676865038, "loss": 0.0009, "step": 2087 }, { "epoch": 1.37, "grad_norm": 0.1297319084405899, "learning_rate": 0.00017131939522957898, "loss": 0.0074, "step": 2088 }, { "epoch": 1.37, "grad_norm": 0.2983582019805908, "learning_rate": 0.00017121734361998133, "loss": 0.0182, "step": 2089 }, { "epoch": 1.37, "grad_norm": 0.25771012902259827, "learning_rate": 0.00017111528198806303, "loss": 0.0576, "step": 2090 }, { "epoch": 1.37, "grad_norm": 0.04253006353974342, "learning_rate": 0.00017101321038203425, "loss": 0.0038, "step": 2091 }, { "epoch": 1.37, "grad_norm": 0.2725884020328522, "learning_rate": 0.00017091112885011007, "loss": 0.0283, "step": 2092 }, { "epoch": 1.37, "grad_norm": 0.2715790569782257, "learning_rate": 0.0001708090374405102, "loss": 0.0542, "step": 2093 }, { "epoch": 1.37, "grad_norm": 0.038607288151979446, "learning_rate": 0.00017070693620145904, "loss": 0.0027, "step": 2094 }, { "epoch": 1.37, "grad_norm": 0.39629149436950684, "learning_rate": 0.00017060482518118546, "loss": 0.0461, "step": 2095 }, { "epoch": 1.37, "grad_norm": 0.014728988520801067, "learning_rate": 0.0001705027044279232, "loss": 0.0022, "step": 2096 }, { "epoch": 1.37, "grad_norm": 0.11767303943634033, "learning_rate": 0.0001704005739899104, "loss": 0.0163, "step": 2097 }, { "epoch": 1.37, "grad_norm": 0.143372043967247, "learning_rate": 0.00017029843391539, "loss": 0.0163, "step": 2098 }, { "epoch": 1.37, "grad_norm": 0.11955475062131882, "learning_rate": 0.00017019628425260917, "loss": 0.0115, "step": 2099 }, { "epoch": 1.37, "grad_norm": 0.2151668518781662, "learning_rate": 0.0001700941250498199, "loss": 0.0372, "step": 2100 }, { "epoch": 1.38, "grad_norm": 0.08933127671480179, "learning_rate": 0.00016999195635527853, "loss": 0.0094, "step": 2101 }, { "epoch": 1.38, "grad_norm": 0.043822623789310455, "learning_rate": 0.00016988977821724593, "loss": 0.0053, "step": 2102 }, { "epoch": 1.38, "grad_norm": 0.21337710320949554, "learning_rate": 0.0001697875906839875, "loss": 0.0272, "step": 2103 }, { "epoch": 1.38, "grad_norm": 0.2069789320230484, "learning_rate": 0.00016968539380377292, "loss": 0.0318, "step": 2104 }, { "epoch": 1.38, "grad_norm": 0.2900010049343109, "learning_rate": 0.0001695831876248764, "loss": 0.0465, "step": 2105 }, { "epoch": 1.38, "grad_norm": 0.038315340876579285, "learning_rate": 0.00016948097219557647, "loss": 0.0042, "step": 2106 }, { "epoch": 1.38, "grad_norm": 0.10780084133148193, "learning_rate": 0.00016937874756415623, "loss": 0.0365, "step": 2107 }, { "epoch": 1.38, "grad_norm": 0.01659630611538887, "learning_rate": 0.00016927651377890275, "loss": 0.0017, "step": 2108 }, { "epoch": 1.38, "grad_norm": 0.037653081119060516, "learning_rate": 0.00016917427088810778, "loss": 0.0044, "step": 2109 }, { "epoch": 1.38, "grad_norm": 0.20532900094985962, "learning_rate": 0.00016907201894006724, "loss": 0.0514, "step": 2110 }, { "epoch": 1.38, "grad_norm": 0.025467032566666603, "learning_rate": 0.0001689697579830813, "loss": 0.0037, "step": 2111 }, { "epoch": 1.38, "grad_norm": 0.161657452583313, "learning_rate": 0.00016886748806545438, "loss": 0.0236, "step": 2112 }, { "epoch": 1.38, "grad_norm": 0.0939616858959198, "learning_rate": 0.00016876520923549517, "loss": 0.0048, "step": 2113 }, { "epoch": 1.38, "grad_norm": 0.24862095713615417, "learning_rate": 0.0001686629215415166, "loss": 0.0125, "step": 2114 }, { "epoch": 1.38, "grad_norm": 0.13289472460746765, "learning_rate": 0.00016856062503183572, "loss": 0.0221, "step": 2115 }, { "epoch": 1.39, "grad_norm": 0.023004405200481415, "learning_rate": 0.00016845831975477384, "loss": 0.0036, "step": 2116 }, { "epoch": 1.39, "grad_norm": 0.2153320163488388, "learning_rate": 0.00016835600575865623, "loss": 0.0517, "step": 2117 }, { "epoch": 1.39, "grad_norm": 0.18337225914001465, "learning_rate": 0.0001682536830918125, "loss": 0.0199, "step": 2118 }, { "epoch": 1.39, "grad_norm": 0.06616278737783432, "learning_rate": 0.00016815135180257612, "loss": 0.008, "step": 2119 }, { "epoch": 1.39, "grad_norm": 0.19889144599437714, "learning_rate": 0.00016804901193928488, "loss": 0.0103, "step": 2120 }, { "epoch": 1.39, "grad_norm": 0.15102970600128174, "learning_rate": 0.0001679466635502805, "loss": 0.0216, "step": 2121 }, { "epoch": 1.39, "grad_norm": 0.10404885560274124, "learning_rate": 0.00016784430668390866, "loss": 0.0116, "step": 2122 }, { "epoch": 1.39, "grad_norm": 0.11957724392414093, "learning_rate": 0.00016774194138851915, "loss": 0.0128, "step": 2123 }, { "epoch": 1.39, "grad_norm": 0.054242778569459915, "learning_rate": 0.00016763956771246566, "loss": 0.0059, "step": 2124 }, { "epoch": 1.39, "grad_norm": 0.04711861535906792, "learning_rate": 0.000167537185704106, "loss": 0.0039, "step": 2125 }, { "epoch": 1.39, "grad_norm": 0.17932789027690887, "learning_rate": 0.0001674347954118017, "loss": 0.0451, "step": 2126 }, { "epoch": 1.39, "grad_norm": 0.12226320058107376, "learning_rate": 0.0001673323968839183, "loss": 0.0372, "step": 2127 }, { "epoch": 1.39, "grad_norm": 0.36971887946128845, "learning_rate": 0.0001672299901688253, "loss": 0.0173, "step": 2128 }, { "epoch": 1.39, "grad_norm": 0.17018848657608032, "learning_rate": 0.0001671275753148959, "loss": 0.018, "step": 2129 }, { "epoch": 1.39, "grad_norm": 0.1189635694026947, "learning_rate": 0.0001670251523705074, "loss": 0.0163, "step": 2130 }, { "epoch": 1.4, "grad_norm": 0.31365808844566345, "learning_rate": 0.00016692272138404065, "loss": 0.0412, "step": 2131 }, { "epoch": 1.4, "grad_norm": 0.20389385521411896, "learning_rate": 0.00016682028240388037, "loss": 0.0122, "step": 2132 }, { "epoch": 1.4, "grad_norm": 0.16484835743904114, "learning_rate": 0.00016671783547841525, "loss": 0.0134, "step": 2133 }, { "epoch": 1.4, "grad_norm": 0.14718228578567505, "learning_rate": 0.00016661538065603748, "loss": 0.0218, "step": 2134 }, { "epoch": 1.4, "grad_norm": 0.12503278255462646, "learning_rate": 0.00016651291798514312, "loss": 0.013, "step": 2135 }, { "epoch": 1.4, "grad_norm": 0.2573976516723633, "learning_rate": 0.00016641044751413187, "loss": 0.0302, "step": 2136 }, { "epoch": 1.4, "grad_norm": 0.11231845617294312, "learning_rate": 0.00016630796929140718, "loss": 0.0091, "step": 2137 }, { "epoch": 1.4, "grad_norm": 0.1934831589460373, "learning_rate": 0.00016620548336537613, "loss": 0.0167, "step": 2138 }, { "epoch": 1.4, "grad_norm": 0.13918915390968323, "learning_rate": 0.00016610298978444942, "loss": 0.0133, "step": 2139 }, { "epoch": 1.4, "grad_norm": 0.06197899207472801, "learning_rate": 0.0001660004885970414, "loss": 0.0053, "step": 2140 }, { "epoch": 1.4, "grad_norm": 0.047342702746391296, "learning_rate": 0.00016589797985156997, "loss": 0.005, "step": 2141 }, { "epoch": 1.4, "grad_norm": 0.01349773071706295, "learning_rate": 0.00016579546359645663, "loss": 0.0018, "step": 2142 }, { "epoch": 1.4, "grad_norm": 0.018491871654987335, "learning_rate": 0.0001656929398801265, "loss": 0.0022, "step": 2143 }, { "epoch": 1.4, "grad_norm": 0.07871642708778381, "learning_rate": 0.000165590408751008, "loss": 0.0425, "step": 2144 }, { "epoch": 1.4, "grad_norm": 0.13570870459079742, "learning_rate": 0.00016548787025753332, "loss": 0.0379, "step": 2145 }, { "epoch": 1.4, "grad_norm": 0.07806690782308578, "learning_rate": 0.00016538532444813794, "loss": 0.0083, "step": 2146 }, { "epoch": 1.41, "grad_norm": 0.2510945796966553, "learning_rate": 0.00016528277137126094, "loss": 0.057, "step": 2147 }, { "epoch": 1.41, "grad_norm": 0.009228669106960297, "learning_rate": 0.00016518021107534472, "loss": 0.0015, "step": 2148 }, { "epoch": 1.41, "grad_norm": 0.01314024068415165, "learning_rate": 0.00016507764360883506, "loss": 0.0016, "step": 2149 }, { "epoch": 1.41, "grad_norm": 0.008662903681397438, "learning_rate": 0.00016497506902018127, "loss": 0.0011, "step": 2150 }, { "epoch": 1.41, "grad_norm": 0.2847445011138916, "learning_rate": 0.0001648724873578359, "loss": 0.0381, "step": 2151 }, { "epoch": 1.41, "grad_norm": 0.29297375679016113, "learning_rate": 0.00016476989867025499, "loss": 0.0163, "step": 2152 }, { "epoch": 1.41, "grad_norm": 0.020055145025253296, "learning_rate": 0.00016466730300589768, "loss": 0.0022, "step": 2153 }, { "epoch": 1.41, "grad_norm": 0.2110797017812729, "learning_rate": 0.0001645647004132266, "loss": 0.0339, "step": 2154 }, { "epoch": 1.41, "grad_norm": 0.04020393267273903, "learning_rate": 0.0001644620909407075, "loss": 0.0058, "step": 2155 }, { "epoch": 1.41, "grad_norm": 0.016536343842744827, "learning_rate": 0.00016435947463680955, "loss": 0.0025, "step": 2156 }, { "epoch": 1.41, "grad_norm": 0.02839847095310688, "learning_rate": 0.00016425685155000496, "loss": 0.0023, "step": 2157 }, { "epoch": 1.41, "grad_norm": 0.3273443281650543, "learning_rate": 0.00016415422172876934, "loss": 0.0595, "step": 2158 }, { "epoch": 1.41, "grad_norm": 0.011165251024067402, "learning_rate": 0.00016405158522158123, "loss": 0.0019, "step": 2159 }, { "epoch": 1.41, "grad_norm": 0.01584658771753311, "learning_rate": 0.00016394894207692268, "loss": 0.0027, "step": 2160 }, { "epoch": 1.41, "grad_norm": 0.1392921805381775, "learning_rate": 0.00016384629234327848, "loss": 0.0392, "step": 2161 }, { "epoch": 1.42, "grad_norm": 0.09018007665872574, "learning_rate": 0.0001637436360691368, "loss": 0.0241, "step": 2162 }, { "epoch": 1.42, "grad_norm": 0.0933082103729248, "learning_rate": 0.00016364097330298885, "loss": 0.0314, "step": 2163 }, { "epoch": 1.42, "grad_norm": 0.260314404964447, "learning_rate": 0.00016353830409332882, "loss": 0.0236, "step": 2164 }, { "epoch": 1.42, "grad_norm": 0.22859854996204376, "learning_rate": 0.00016343562848865413, "loss": 0.0334, "step": 2165 }, { "epoch": 1.42, "grad_norm": 0.2570708692073822, "learning_rate": 0.00016333294653746494, "loss": 0.07, "step": 2166 }, { "epoch": 1.42, "grad_norm": 0.10685363411903381, "learning_rate": 0.0001632302582882646, "loss": 0.0169, "step": 2167 }, { "epoch": 1.42, "grad_norm": 0.41462910175323486, "learning_rate": 0.00016312756378955947, "loss": 0.02, "step": 2168 }, { "epoch": 1.42, "grad_norm": 0.15622973442077637, "learning_rate": 0.00016302486308985873, "loss": 0.0252, "step": 2169 }, { "epoch": 1.42, "grad_norm": 0.14134229719638824, "learning_rate": 0.00016292215623767457, "loss": 0.0192, "step": 2170 }, { "epoch": 1.42, "grad_norm": 0.4416516423225403, "learning_rate": 0.00016281944328152206, "loss": 0.0608, "step": 2171 }, { "epoch": 1.42, "grad_norm": 0.05815531313419342, "learning_rate": 0.0001627167242699191, "loss": 0.0132, "step": 2172 }, { "epoch": 1.42, "grad_norm": 0.10270994156599045, "learning_rate": 0.0001626139992513866, "loss": 0.028, "step": 2173 }, { "epoch": 1.42, "grad_norm": 0.09582596272230148, "learning_rate": 0.0001625112682744482, "loss": 0.0152, "step": 2174 }, { "epoch": 1.42, "grad_norm": 0.09254796802997589, "learning_rate": 0.0001624085313876303, "loss": 0.0184, "step": 2175 }, { "epoch": 1.42, "grad_norm": 0.04410834610462189, "learning_rate": 0.00016230578863946223, "loss": 0.0167, "step": 2176 }, { "epoch": 1.43, "grad_norm": 0.07682263851165771, "learning_rate": 0.000162203040078476, "loss": 0.0107, "step": 2177 }, { "epoch": 1.43, "grad_norm": 0.0957961305975914, "learning_rate": 0.00016210028575320643, "loss": 0.01, "step": 2178 }, { "epoch": 1.43, "grad_norm": 0.11102692037820816, "learning_rate": 0.00016199752571219092, "loss": 0.0175, "step": 2179 }, { "epoch": 1.43, "grad_norm": 0.04554305970668793, "learning_rate": 0.00016189476000396977, "loss": 0.0058, "step": 2180 }, { "epoch": 1.43, "grad_norm": 0.10862531512975693, "learning_rate": 0.00016179198867708575, "loss": 0.0059, "step": 2181 }, { "epoch": 1.43, "grad_norm": 0.18247346580028534, "learning_rate": 0.00016168921178008448, "loss": 0.0248, "step": 2182 }, { "epoch": 1.43, "grad_norm": 0.11431893706321716, "learning_rate": 0.00016158642936151416, "loss": 0.0142, "step": 2183 }, { "epoch": 1.43, "grad_norm": 0.12229776382446289, "learning_rate": 0.0001614836414699254, "loss": 0.0203, "step": 2184 }, { "epoch": 1.43, "grad_norm": 0.09890652447938919, "learning_rate": 0.0001613808481538717, "loss": 0.0162, "step": 2185 }, { "epoch": 1.43, "grad_norm": 0.021438946947455406, "learning_rate": 0.00016127804946190893, "loss": 0.0035, "step": 2186 }, { "epoch": 1.43, "grad_norm": 0.026309477165341377, "learning_rate": 0.00016117524544259553, "loss": 0.0027, "step": 2187 }, { "epoch": 1.43, "grad_norm": 0.1389365792274475, "learning_rate": 0.0001610724361444925, "loss": 0.0463, "step": 2188 }, { "epoch": 1.43, "grad_norm": 0.2812053859233856, "learning_rate": 0.00016096962161616326, "loss": 0.0695, "step": 2189 }, { "epoch": 1.43, "grad_norm": 0.009222770109772682, "learning_rate": 0.0001608668019061738, "loss": 0.0013, "step": 2190 }, { "epoch": 1.43, "grad_norm": 0.02184683084487915, "learning_rate": 0.00016076397706309245, "loss": 0.0041, "step": 2191 }, { "epoch": 1.44, "grad_norm": 0.028864651918411255, "learning_rate": 0.0001606611471354901, "loss": 0.004, "step": 2192 }, { "epoch": 1.44, "grad_norm": 0.1239912286400795, "learning_rate": 0.0001605583121719399, "loss": 0.017, "step": 2193 }, { "epoch": 1.44, "grad_norm": 0.04970764368772507, "learning_rate": 0.00016045547222101746, "loss": 0.0058, "step": 2194 }, { "epoch": 1.44, "grad_norm": 0.046907342970371246, "learning_rate": 0.0001603526273313007, "loss": 0.0059, "step": 2195 }, { "epoch": 1.44, "grad_norm": 0.11429935693740845, "learning_rate": 0.00016024977755136995, "loss": 0.0413, "step": 2196 }, { "epoch": 1.44, "grad_norm": 0.015206074342131615, "learning_rate": 0.00016014692292980775, "loss": 0.0013, "step": 2197 }, { "epoch": 1.44, "grad_norm": 0.07148374617099762, "learning_rate": 0.00016004406351519896, "loss": 0.0111, "step": 2198 }, { "epoch": 1.44, "grad_norm": 0.07193495333194733, "learning_rate": 0.0001599411993561308, "loss": 0.008, "step": 2199 }, { "epoch": 1.44, "grad_norm": 0.07575807720422745, "learning_rate": 0.0001598383305011926, "loss": 0.0102, "step": 2200 }, { "epoch": 1.44, "grad_norm": 0.019444789737462997, "learning_rate": 0.00015973545699897595, "loss": 0.002, "step": 2201 }, { "epoch": 1.44, "grad_norm": 0.007753497920930386, "learning_rate": 0.00015963257889807465, "loss": 0.001, "step": 2202 }, { "epoch": 1.44, "grad_norm": 0.13980121910572052, "learning_rate": 0.0001595296962470847, "loss": 0.0113, "step": 2203 }, { "epoch": 1.44, "grad_norm": 0.0628625676035881, "learning_rate": 0.00015942680909460417, "loss": 0.006, "step": 2204 }, { "epoch": 1.44, "grad_norm": 0.3631378710269928, "learning_rate": 0.00015932391748923333, "loss": 0.0221, "step": 2205 }, { "epoch": 1.44, "grad_norm": 0.08289500325918198, "learning_rate": 0.00015922102147957452, "loss": 0.0033, "step": 2206 }, { "epoch": 1.44, "grad_norm": 0.016553470864892006, "learning_rate": 0.00015911812111423215, "loss": 0.0015, "step": 2207 }, { "epoch": 1.45, "grad_norm": 0.14857268333435059, "learning_rate": 0.00015901521644181272, "loss": 0.008, "step": 2208 }, { "epoch": 1.45, "grad_norm": 0.11462613940238953, "learning_rate": 0.00015891230751092478, "loss": 0.0076, "step": 2209 }, { "epoch": 1.45, "grad_norm": 0.10147285461425781, "learning_rate": 0.00015880939437017878, "loss": 0.0028, "step": 2210 }, { "epoch": 1.45, "grad_norm": 0.07668115198612213, "learning_rate": 0.00015870647706818728, "loss": 0.0019, "step": 2211 }, { "epoch": 1.45, "grad_norm": 0.25671836733818054, "learning_rate": 0.00015860355565356483, "loss": 0.0293, "step": 2212 }, { "epoch": 1.45, "grad_norm": 0.09710162878036499, "learning_rate": 0.00015850063017492773, "loss": 0.0056, "step": 2213 }, { "epoch": 1.45, "grad_norm": 0.02251294068992138, "learning_rate": 0.00015839770068089442, "loss": 0.0019, "step": 2214 }, { "epoch": 1.45, "grad_norm": 0.2710408866405487, "learning_rate": 0.00015829476722008508, "loss": 0.0328, "step": 2215 }, { "epoch": 1.45, "grad_norm": 0.05798187106847763, "learning_rate": 0.0001581918298411219, "loss": 0.0032, "step": 2216 }, { "epoch": 1.45, "grad_norm": 0.15880708396434784, "learning_rate": 0.00015808888859262875, "loss": 0.0052, "step": 2217 }, { "epoch": 1.45, "grad_norm": 0.0126173235476017, "learning_rate": 0.00015798594352323155, "loss": 0.0014, "step": 2218 }, { "epoch": 1.45, "grad_norm": 0.3693072497844696, "learning_rate": 0.00015788299468155783, "loss": 0.035, "step": 2219 }, { "epoch": 1.45, "grad_norm": 0.008310376666486263, "learning_rate": 0.00015778004211623695, "loss": 0.0007, "step": 2220 }, { "epoch": 1.45, "grad_norm": 0.1609114110469818, "learning_rate": 0.00015767708587590003, "loss": 0.0159, "step": 2221 }, { "epoch": 1.45, "grad_norm": 0.18963384628295898, "learning_rate": 0.00015757412600918004, "loss": 0.0149, "step": 2222 }, { "epoch": 1.46, "grad_norm": 0.10448487848043442, "learning_rate": 0.00015747116256471154, "loss": 0.0047, "step": 2223 }, { "epoch": 1.46, "grad_norm": 0.0033179214224219322, "learning_rate": 0.00015736819559113076, "loss": 0.0005, "step": 2224 }, { "epoch": 1.46, "grad_norm": 0.03964143246412277, "learning_rate": 0.00015726522513707567, "loss": 0.0014, "step": 2225 }, { "epoch": 1.46, "grad_norm": 0.2769809663295746, "learning_rate": 0.00015716225125118587, "loss": 0.0654, "step": 2226 }, { "epoch": 1.46, "grad_norm": 0.03909273445606232, "learning_rate": 0.00015705927398210258, "loss": 0.0025, "step": 2227 }, { "epoch": 1.46, "grad_norm": 0.11096679419279099, "learning_rate": 0.0001569562933784686, "loss": 0.0446, "step": 2228 }, { "epoch": 1.46, "grad_norm": 0.14691616594791412, "learning_rate": 0.00015685330948892834, "loss": 0.0471, "step": 2229 }, { "epoch": 1.46, "grad_norm": 0.10699688643217087, "learning_rate": 0.00015675032236212768, "loss": 0.0076, "step": 2230 }, { "epoch": 1.46, "grad_norm": 0.32596132159233093, "learning_rate": 0.0001566473320467141, "loss": 0.022, "step": 2231 }, { "epoch": 1.46, "grad_norm": 0.5243645310401917, "learning_rate": 0.00015654433859133666, "loss": 0.0461, "step": 2232 }, { "epoch": 1.46, "grad_norm": 0.08140390366315842, "learning_rate": 0.0001564413420446457, "loss": 0.0052, "step": 2233 }, { "epoch": 1.46, "grad_norm": 0.009572282433509827, "learning_rate": 0.00015633834245529316, "loss": 0.0011, "step": 2234 }, { "epoch": 1.46, "grad_norm": 0.1445630043745041, "learning_rate": 0.00015623533987193247, "loss": 0.0075, "step": 2235 }, { "epoch": 1.46, "grad_norm": 0.19702015817165375, "learning_rate": 0.00015613233434321833, "loss": 0.0164, "step": 2236 }, { "epoch": 1.46, "grad_norm": 0.05097610130906105, "learning_rate": 0.00015602932591780692, "loss": 0.0053, "step": 2237 }, { "epoch": 1.47, "grad_norm": 0.1141589879989624, "learning_rate": 0.00015592631464435573, "loss": 0.0085, "step": 2238 }, { "epoch": 1.47, "grad_norm": 0.17674514651298523, "learning_rate": 0.00015582330057152367, "loss": 0.0115, "step": 2239 }, { "epoch": 1.47, "grad_norm": 0.43345922231674194, "learning_rate": 0.00015572028374797095, "loss": 0.0494, "step": 2240 }, { "epoch": 1.47, "grad_norm": 0.24077734351158142, "learning_rate": 0.00015561726422235906, "loss": 0.0407, "step": 2241 }, { "epoch": 1.47, "grad_norm": 0.2242443561553955, "learning_rate": 0.00015551424204335074, "loss": 0.0464, "step": 2242 }, { "epoch": 1.47, "grad_norm": 0.02641609124839306, "learning_rate": 0.00015541121725961, "loss": 0.0036, "step": 2243 }, { "epoch": 1.47, "grad_norm": 0.17494410276412964, "learning_rate": 0.00015530818991980213, "loss": 0.0728, "step": 2244 }, { "epoch": 1.47, "grad_norm": 0.14498472213745117, "learning_rate": 0.00015520516007259364, "loss": 0.0141, "step": 2245 }, { "epoch": 1.47, "grad_norm": 0.07311736047267914, "learning_rate": 0.00015510212776665206, "loss": 0.0075, "step": 2246 }, { "epoch": 1.47, "grad_norm": 0.18270064890384674, "learning_rate": 0.00015499909305064625, "loss": 0.0335, "step": 2247 }, { "epoch": 1.47, "grad_norm": 0.11468853056430817, "learning_rate": 0.00015489605597324618, "loss": 0.0126, "step": 2248 }, { "epoch": 1.47, "grad_norm": 0.24907346069812775, "learning_rate": 0.00015479301658312294, "loss": 0.0547, "step": 2249 }, { "epoch": 1.47, "grad_norm": 0.054052844643592834, "learning_rate": 0.0001546899749289486, "loss": 0.0061, "step": 2250 }, { "epoch": 1.47, "grad_norm": 0.10278685390949249, "learning_rate": 0.0001545869310593964, "loss": 0.0148, "step": 2251 }, { "epoch": 1.47, "grad_norm": 0.039586808532476425, "learning_rate": 0.00015448388502314065, "loss": 0.0044, "step": 2252 }, { "epoch": 1.47, "grad_norm": 0.1996268779039383, "learning_rate": 0.00015438083686885663, "loss": 0.0238, "step": 2253 }, { "epoch": 1.48, "grad_norm": 0.07462750375270844, "learning_rate": 0.00015427778664522067, "loss": 0.0077, "step": 2254 }, { "epoch": 1.48, "grad_norm": 0.022693922743201256, "learning_rate": 0.00015417473440090994, "loss": 0.0031, "step": 2255 }, { "epoch": 1.48, "grad_norm": 0.17342881858348846, "learning_rate": 0.00015407168018460272, "loss": 0.0334, "step": 2256 }, { "epoch": 1.48, "grad_norm": 0.1654786616563797, "learning_rate": 0.0001539686240449782, "loss": 0.059, "step": 2257 }, { "epoch": 1.48, "grad_norm": 0.10321973264217377, "learning_rate": 0.00015386556603071643, "loss": 0.0204, "step": 2258 }, { "epoch": 1.48, "grad_norm": 0.10149465501308441, "learning_rate": 0.00015376250619049834, "loss": 0.0122, "step": 2259 }, { "epoch": 1.48, "grad_norm": 0.04105484485626221, "learning_rate": 0.00015365944457300572, "loss": 0.0056, "step": 2260 }, { "epoch": 1.48, "grad_norm": 0.10165435820817947, "learning_rate": 0.0001535563812269213, "loss": 0.0137, "step": 2261 }, { "epoch": 1.48, "grad_norm": 0.11750262975692749, "learning_rate": 0.0001534533162009285, "loss": 0.009, "step": 2262 }, { "epoch": 1.48, "grad_norm": 0.30132028460502625, "learning_rate": 0.00015335024954371158, "loss": 0.0188, "step": 2263 }, { "epoch": 1.48, "grad_norm": 0.19077062606811523, "learning_rate": 0.0001532471813039556, "loss": 0.0249, "step": 2264 }, { "epoch": 1.48, "grad_norm": 0.1327200084924698, "learning_rate": 0.0001531441115303463, "loss": 0.0139, "step": 2265 }, { "epoch": 1.48, "grad_norm": 0.09262137115001678, "learning_rate": 0.0001530410402715702, "loss": 0.0128, "step": 2266 }, { "epoch": 1.48, "grad_norm": 0.1415720283985138, "learning_rate": 0.00015293796757631458, "loss": 0.0255, "step": 2267 }, { "epoch": 1.48, "grad_norm": 0.28981319069862366, "learning_rate": 0.00015283489349326721, "loss": 0.0365, "step": 2268 }, { "epoch": 1.49, "grad_norm": 0.024845613166689873, "learning_rate": 0.0001527318180711167, "loss": 0.0034, "step": 2269 }, { "epoch": 1.49, "grad_norm": 0.0768069252371788, "learning_rate": 0.0001526287413585522, "loss": 0.0066, "step": 2270 }, { "epoch": 1.49, "grad_norm": 0.1130814403295517, "learning_rate": 0.00015252566340426352, "loss": 0.0138, "step": 2271 }, { "epoch": 1.49, "grad_norm": 0.15060044825077057, "learning_rate": 0.00015242258425694107, "loss": 0.047, "step": 2272 }, { "epoch": 1.49, "grad_norm": 0.16956418752670288, "learning_rate": 0.00015231950396527564, "loss": 0.0468, "step": 2273 }, { "epoch": 1.49, "grad_norm": 0.027678130194544792, "learning_rate": 0.0001522164225779588, "loss": 0.0035, "step": 2274 }, { "epoch": 1.49, "grad_norm": 0.011556626297533512, "learning_rate": 0.00015211334014368256, "loss": 0.0017, "step": 2275 }, { "epoch": 1.49, "grad_norm": 0.17217235267162323, "learning_rate": 0.0001520102567111394, "loss": 0.0299, "step": 2276 }, { "epoch": 1.49, "grad_norm": 0.02171311527490616, "learning_rate": 0.00015190717232902224, "loss": 0.0032, "step": 2277 }, { "epoch": 1.49, "grad_norm": 0.08322153985500336, "learning_rate": 0.0001518040870460245, "loss": 0.0148, "step": 2278 }, { "epoch": 1.49, "grad_norm": 0.10915898531675339, "learning_rate": 0.00015170100091084, "loss": 0.0122, "step": 2279 }, { "epoch": 1.49, "grad_norm": 0.06174374371767044, "learning_rate": 0.000151597913972163, "loss": 0.011, "step": 2280 }, { "epoch": 1.49, "grad_norm": 0.0850352942943573, "learning_rate": 0.00015149482627868814, "loss": 0.0175, "step": 2281 }, { "epoch": 1.49, "grad_norm": 0.8030893206596375, "learning_rate": 0.0001513917378791103, "loss": 0.0689, "step": 2282 }, { "epoch": 1.49, "grad_norm": 0.08825898915529251, "learning_rate": 0.0001512886488221249, "loss": 0.0109, "step": 2283 }, { "epoch": 1.5, "grad_norm": 0.20646128058433533, "learning_rate": 0.00015118555915642746, "loss": 0.0717, "step": 2284 }, { "epoch": 1.5, "grad_norm": 0.09915432333946228, "learning_rate": 0.00015108246893071395, "loss": 0.0117, "step": 2285 }, { "epoch": 1.5, "grad_norm": 0.06762687116861343, "learning_rate": 0.00015097937819368045, "loss": 0.0095, "step": 2286 }, { "epoch": 1.5, "grad_norm": 0.181647390127182, "learning_rate": 0.00015087628699402345, "loss": 0.0205, "step": 2287 }, { "epoch": 1.5, "grad_norm": 0.25607171654701233, "learning_rate": 0.00015077319538043954, "loss": 0.0161, "step": 2288 }, { "epoch": 1.5, "grad_norm": 0.10751291364431381, "learning_rate": 0.00015067010340162558, "loss": 0.0151, "step": 2289 }, { "epoch": 1.5, "grad_norm": 0.15035240352153778, "learning_rate": 0.00015056701110627855, "loss": 0.0232, "step": 2290 }, { "epoch": 1.5, "grad_norm": 0.09425321966409683, "learning_rate": 0.00015046391854309552, "loss": 0.0109, "step": 2291 }, { "epoch": 1.5, "grad_norm": 0.05676641687750816, "learning_rate": 0.00015036082576077385, "loss": 0.0091, "step": 2292 }, { "epoch": 1.5, "eval_loss": 0.029079807922244072, "eval_runtime": 39.9888, "eval_samples_per_second": 32.184, "eval_steps_per_second": 8.052, "step": 2292 }, { "epoch": 1.5, "grad_norm": 0.18537257611751556, "learning_rate": 0.00015025773280801088, "loss": 0.0312, "step": 2293 }, { "epoch": 1.5, "grad_norm": 0.17844584584236145, "learning_rate": 0.0001501546397335041, "loss": 0.0275, "step": 2294 }, { "epoch": 1.5, "grad_norm": 0.12108953297138214, "learning_rate": 0.00015005154658595096, "loss": 0.0173, "step": 2295 }, { "epoch": 1.5, "grad_norm": 0.06204582378268242, "learning_rate": 0.00014994845341404907, "loss": 0.0089, "step": 2296 }, { "epoch": 1.5, "grad_norm": 0.23866377770900726, "learning_rate": 0.00014984536026649593, "loss": 0.0258, "step": 2297 }, { "epoch": 1.5, "grad_norm": 0.013257946819067001, "learning_rate": 0.00014974226719198912, "loss": 0.0026, "step": 2298 }, { "epoch": 1.51, "grad_norm": 0.14589637517929077, "learning_rate": 0.00014963917423922618, "loss": 0.0159, "step": 2299 }, { "epoch": 1.51, "grad_norm": 0.2827925682067871, "learning_rate": 0.0001495360814569045, "loss": 0.0182, "step": 2300 }, { "epoch": 1.51, "grad_norm": 0.042063966393470764, "learning_rate": 0.0001494329888937215, "loss": 0.0063, "step": 2301 }, { "epoch": 1.51, "grad_norm": 0.01539128739386797, "learning_rate": 0.00014932989659837442, "loss": 0.0025, "step": 2302 }, { "epoch": 1.51, "grad_norm": 0.07236410677433014, "learning_rate": 0.00014922680461956048, "loss": 0.0087, "step": 2303 }, { "epoch": 1.51, "grad_norm": 0.03618314489722252, "learning_rate": 0.00014912371300597658, "loss": 0.0049, "step": 2304 }, { "epoch": 1.51, "grad_norm": 0.11485552042722702, "learning_rate": 0.00014902062180631958, "loss": 0.008, "step": 2305 }, { "epoch": 1.51, "grad_norm": 0.11101217567920685, "learning_rate": 0.00014891753106928608, "loss": 0.0232, "step": 2306 }, { "epoch": 1.51, "grad_norm": 0.017525073140859604, "learning_rate": 0.00014881444084357255, "loss": 0.0025, "step": 2307 }, { "epoch": 1.51, "grad_norm": 0.20452751219272614, "learning_rate": 0.00014871135117787513, "loss": 0.0438, "step": 2308 }, { "epoch": 1.51, "grad_norm": 0.2409246861934662, "learning_rate": 0.00014860826212088972, "loss": 0.0299, "step": 2309 }, { "epoch": 1.51, "grad_norm": 0.3612133860588074, "learning_rate": 0.0001485051737213119, "loss": 0.0299, "step": 2310 }, { "epoch": 1.51, "grad_norm": 0.1734815537929535, "learning_rate": 0.000148402086027837, "loss": 0.0164, "step": 2311 }, { "epoch": 1.51, "grad_norm": 0.2504834234714508, "learning_rate": 0.00014829899908916003, "loss": 0.0233, "step": 2312 }, { "epoch": 1.51, "grad_norm": 0.29188862442970276, "learning_rate": 0.00014819591295397555, "loss": 0.0202, "step": 2313 }, { "epoch": 1.51, "grad_norm": 0.044157739728689194, "learning_rate": 0.0001480928276709778, "loss": 0.0057, "step": 2314 }, { "epoch": 1.52, "grad_norm": 0.08563201874494553, "learning_rate": 0.00014798974328886062, "loss": 0.0118, "step": 2315 }, { "epoch": 1.52, "grad_norm": 0.06240120530128479, "learning_rate": 0.00014788665985631741, "loss": 0.0041, "step": 2316 }, { "epoch": 1.52, "grad_norm": 0.1662430614233017, "learning_rate": 0.0001477835774220412, "loss": 0.0129, "step": 2317 }, { "epoch": 1.52, "grad_norm": 0.020060239359736443, "learning_rate": 0.00014768049603472436, "loss": 0.0016, "step": 2318 }, { "epoch": 1.52, "grad_norm": 0.11197911947965622, "learning_rate": 0.00014757741574305896, "loss": 0.0059, "step": 2319 }, { "epoch": 1.52, "grad_norm": 0.25439372658729553, "learning_rate": 0.00014747433659573645, "loss": 0.0597, "step": 2320 }, { "epoch": 1.52, "grad_norm": 0.12229887396097183, "learning_rate": 0.00014737125864144779, "loss": 0.0061, "step": 2321 }, { "epoch": 1.52, "grad_norm": 0.3244328498840332, "learning_rate": 0.0001472681819288833, "loss": 0.0897, "step": 2322 }, { "epoch": 1.52, "grad_norm": 0.10589354485273361, "learning_rate": 0.00014716510650673279, "loss": 0.0057, "step": 2323 }, { "epoch": 1.52, "grad_norm": 0.020396653562784195, "learning_rate": 0.00014706203242368542, "loss": 0.0025, "step": 2324 }, { "epoch": 1.52, "grad_norm": 0.3133372366428375, "learning_rate": 0.0001469589597284298, "loss": 0.0432, "step": 2325 }, { "epoch": 1.52, "grad_norm": 0.09910666942596436, "learning_rate": 0.0001468558884696537, "loss": 0.0089, "step": 2326 }, { "epoch": 1.52, "grad_norm": 0.06794673204421997, "learning_rate": 0.0001467528186960444, "loss": 0.0036, "step": 2327 }, { "epoch": 1.52, "grad_norm": 0.5119796395301819, "learning_rate": 0.00014664975045628842, "loss": 0.0347, "step": 2328 }, { "epoch": 1.52, "grad_norm": 0.13082320988178253, "learning_rate": 0.00014654668379907149, "loss": 0.0181, "step": 2329 }, { "epoch": 1.53, "grad_norm": 0.043470270931720734, "learning_rate": 0.0001464436187730787, "loss": 0.0048, "step": 2330 }, { "epoch": 1.53, "grad_norm": 0.1573643833398819, "learning_rate": 0.00014634055542699426, "loss": 0.0161, "step": 2331 }, { "epoch": 1.53, "grad_norm": 0.2287653237581253, "learning_rate": 0.00014623749380950166, "loss": 0.0384, "step": 2332 }, { "epoch": 1.53, "grad_norm": 0.35878098011016846, "learning_rate": 0.00014613443396928357, "loss": 0.047, "step": 2333 }, { "epoch": 1.53, "grad_norm": 0.14366035163402557, "learning_rate": 0.0001460313759550218, "loss": 0.0258, "step": 2334 }, { "epoch": 1.53, "grad_norm": 0.027050506323575974, "learning_rate": 0.00014592831981539726, "loss": 0.0023, "step": 2335 }, { "epoch": 1.53, "grad_norm": 0.1347045749425888, "learning_rate": 0.00014582526559909006, "loss": 0.0568, "step": 2336 }, { "epoch": 1.53, "grad_norm": 0.1418723464012146, "learning_rate": 0.00014572221335477936, "loss": 0.0116, "step": 2337 }, { "epoch": 1.53, "grad_norm": 0.057905472815036774, "learning_rate": 0.00014561916313114338, "loss": 0.008, "step": 2338 }, { "epoch": 1.53, "grad_norm": 0.12035661935806274, "learning_rate": 0.00014551611497685933, "loss": 0.0243, "step": 2339 }, { "epoch": 1.53, "grad_norm": 0.0975029394030571, "learning_rate": 0.00014541306894060358, "loss": 0.0113, "step": 2340 }, { "epoch": 1.53, "grad_norm": 0.17044967412948608, "learning_rate": 0.0001453100250710514, "loss": 0.0244, "step": 2341 }, { "epoch": 1.53, "grad_norm": 0.04020816087722778, "learning_rate": 0.00014520698341687706, "loss": 0.0053, "step": 2342 }, { "epoch": 1.53, "grad_norm": 0.18097876012325287, "learning_rate": 0.0001451039440267538, "loss": 0.0253, "step": 2343 }, { "epoch": 1.53, "grad_norm": 0.012999899685382843, "learning_rate": 0.00014500090694935373, "loss": 0.0021, "step": 2344 }, { "epoch": 1.54, "grad_norm": 0.17676088213920593, "learning_rate": 0.00014489787223334795, "loss": 0.0283, "step": 2345 }, { "epoch": 1.54, "grad_norm": 0.2022247314453125, "learning_rate": 0.00014479483992740636, "loss": 0.0144, "step": 2346 }, { "epoch": 1.54, "grad_norm": 0.059292592108249664, "learning_rate": 0.00014469181008019784, "loss": 0.0055, "step": 2347 }, { "epoch": 1.54, "grad_norm": 0.07103478163480759, "learning_rate": 0.00014458878274039, "loss": 0.0082, "step": 2348 }, { "epoch": 1.54, "grad_norm": 0.24378490447998047, "learning_rate": 0.00014448575795664926, "loss": 0.0345, "step": 2349 }, { "epoch": 1.54, "grad_norm": 0.04015032947063446, "learning_rate": 0.00014438273577764094, "loss": 0.0045, "step": 2350 }, { "epoch": 1.54, "grad_norm": 0.22856755554676056, "learning_rate": 0.00014427971625202905, "loss": 0.0152, "step": 2351 }, { "epoch": 1.54, "grad_norm": 0.08063790947198868, "learning_rate": 0.0001441766994284763, "loss": 0.0085, "step": 2352 }, { "epoch": 1.54, "grad_norm": 0.30245184898376465, "learning_rate": 0.00014407368535564427, "loss": 0.0841, "step": 2353 }, { "epoch": 1.54, "grad_norm": 0.2929746210575104, "learning_rate": 0.00014397067408219308, "loss": 0.028, "step": 2354 }, { "epoch": 1.54, "grad_norm": 0.23219022154808044, "learning_rate": 0.00014386766565678165, "loss": 0.0337, "step": 2355 }, { "epoch": 1.54, "grad_norm": 0.05898268148303032, "learning_rate": 0.00014376466012806755, "loss": 0.0076, "step": 2356 }, { "epoch": 1.54, "grad_norm": 0.15910372138023376, "learning_rate": 0.0001436616575447068, "loss": 0.0318, "step": 2357 }, { "epoch": 1.54, "grad_norm": 0.12564696371555328, "learning_rate": 0.0001435586579553543, "loss": 0.0139, "step": 2358 }, { "epoch": 1.54, "grad_norm": 0.08463872969150543, "learning_rate": 0.00014345566140866334, "loss": 0.0081, "step": 2359 }, { "epoch": 1.55, "grad_norm": 0.10430733859539032, "learning_rate": 0.0001433526679532859, "loss": 0.0131, "step": 2360 }, { "epoch": 1.55, "grad_norm": 0.1021500900387764, "learning_rate": 0.00014324967763787235, "loss": 0.0131, "step": 2361 }, { "epoch": 1.55, "grad_norm": 0.04552861675620079, "learning_rate": 0.00014314669051107166, "loss": 0.0069, "step": 2362 }, { "epoch": 1.55, "grad_norm": 0.18448792397975922, "learning_rate": 0.00014304370662153137, "loss": 0.0241, "step": 2363 }, { "epoch": 1.55, "grad_norm": 0.14002187550067902, "learning_rate": 0.00014294072601789742, "loss": 0.0205, "step": 2364 }, { "epoch": 1.55, "grad_norm": 0.025072062388062477, "learning_rate": 0.00014283774874881413, "loss": 0.0026, "step": 2365 }, { "epoch": 1.55, "grad_norm": 0.12297520786523819, "learning_rate": 0.00014273477486292433, "loss": 0.0106, "step": 2366 }, { "epoch": 1.55, "grad_norm": 0.0502072237432003, "learning_rate": 0.00014263180440886924, "loss": 0.006, "step": 2367 }, { "epoch": 1.55, "grad_norm": 0.3273145854473114, "learning_rate": 0.00014252883743528843, "loss": 0.0501, "step": 2368 }, { "epoch": 1.55, "grad_norm": 0.1304243505001068, "learning_rate": 0.00014242587399081993, "loss": 0.0152, "step": 2369 }, { "epoch": 1.55, "grad_norm": 0.29160431027412415, "learning_rate": 0.00014232291412409994, "loss": 0.0516, "step": 2370 }, { "epoch": 1.55, "grad_norm": 0.10514498502016068, "learning_rate": 0.00014221995788376305, "loss": 0.0355, "step": 2371 }, { "epoch": 1.55, "grad_norm": 0.0701146274805069, "learning_rate": 0.00014211700531844215, "loss": 0.0062, "step": 2372 }, { "epoch": 1.55, "grad_norm": 0.0321023166179657, "learning_rate": 0.00014201405647676842, "loss": 0.0028, "step": 2373 }, { "epoch": 1.55, "grad_norm": 0.19740337133407593, "learning_rate": 0.0001419111114073712, "loss": 0.0394, "step": 2374 }, { "epoch": 1.55, "grad_norm": 0.016404815018177032, "learning_rate": 0.00014180817015887806, "loss": 0.0015, "step": 2375 }, { "epoch": 1.56, "grad_norm": 0.16899968683719635, "learning_rate": 0.00014170523277991486, "loss": 0.0178, "step": 2376 }, { "epoch": 1.56, "grad_norm": 0.052522968500852585, "learning_rate": 0.00014160229931910556, "loss": 0.0038, "step": 2377 }, { "epoch": 1.56, "grad_norm": 0.2541167438030243, "learning_rate": 0.00014149936982507224, "loss": 0.0218, "step": 2378 }, { "epoch": 1.56, "grad_norm": 0.09515637159347534, "learning_rate": 0.00014139644434643515, "loss": 0.0074, "step": 2379 }, { "epoch": 1.56, "grad_norm": 0.17533494532108307, "learning_rate": 0.00014129352293181264, "loss": 0.0184, "step": 2380 }, { "epoch": 1.56, "grad_norm": 0.03894618898630142, "learning_rate": 0.00014119060562982116, "loss": 0.0035, "step": 2381 }, { "epoch": 1.56, "grad_norm": 0.1853707730770111, "learning_rate": 0.00014108769248907522, "loss": 0.0965, "step": 2382 }, { "epoch": 1.56, "grad_norm": 0.1522863656282425, "learning_rate": 0.00014098478355818725, "loss": 0.0083, "step": 2383 }, { "epoch": 1.56, "grad_norm": 0.04990135878324509, "learning_rate": 0.0001408818788857678, "loss": 0.0043, "step": 2384 }, { "epoch": 1.56, "grad_norm": 0.3627833127975464, "learning_rate": 0.00014077897852042545, "loss": 0.0825, "step": 2385 }, { "epoch": 1.56, "grad_norm": 0.07105204463005066, "learning_rate": 0.00014067608251076664, "loss": 0.0069, "step": 2386 }, { "epoch": 1.56, "grad_norm": 0.11619725823402405, "learning_rate": 0.0001405731909053958, "loss": 0.0201, "step": 2387 }, { "epoch": 1.56, "grad_norm": 0.17097829282283783, "learning_rate": 0.00014047030375291528, "loss": 0.0413, "step": 2388 }, { "epoch": 1.56, "grad_norm": 0.05021402984857559, "learning_rate": 0.0001403674211019253, "loss": 0.0035, "step": 2389 }, { "epoch": 1.56, "grad_norm": 0.1278286725282669, "learning_rate": 0.000140264543001024, "loss": 0.0255, "step": 2390 }, { "epoch": 1.57, "grad_norm": 0.02548815682530403, "learning_rate": 0.0001401616694988074, "loss": 0.0043, "step": 2391 }, { "epoch": 1.57, "grad_norm": 0.08917523175477982, "learning_rate": 0.00014005880064386916, "loss": 0.0121, "step": 2392 }, { "epoch": 1.57, "grad_norm": 0.11778222769498825, "learning_rate": 0.00013995593648480099, "loss": 0.0064, "step": 2393 }, { "epoch": 1.57, "grad_norm": 0.10804206132888794, "learning_rate": 0.00013985307707019222, "loss": 0.0081, "step": 2394 }, { "epoch": 1.57, "grad_norm": 0.03458308428525925, "learning_rate": 0.00013975022244863005, "loss": 0.0046, "step": 2395 }, { "epoch": 1.57, "grad_norm": 0.21902398765087128, "learning_rate": 0.00013964737266869927, "loss": 0.0564, "step": 2396 }, { "epoch": 1.57, "grad_norm": 0.08411278575658798, "learning_rate": 0.0001395445277789825, "loss": 0.0151, "step": 2397 }, { "epoch": 1.57, "grad_norm": 0.25032511353492737, "learning_rate": 0.00013944168782806013, "loss": 0.0301, "step": 2398 }, { "epoch": 1.57, "grad_norm": 0.042354997247457504, "learning_rate": 0.00013933885286450992, "loss": 0.0061, "step": 2399 }, { "epoch": 1.57, "grad_norm": 0.12194197624921799, "learning_rate": 0.00013923602293690755, "loss": 0.0138, "step": 2400 }, { "epoch": 1.57, "grad_norm": 0.1070348247885704, "learning_rate": 0.00013913319809382625, "loss": 0.0312, "step": 2401 }, { "epoch": 1.57, "grad_norm": 0.10185689479112625, "learning_rate": 0.00013903037838383677, "loss": 0.0389, "step": 2402 }, { "epoch": 1.57, "grad_norm": 0.08572839200496674, "learning_rate": 0.00013892756385550754, "loss": 0.0101, "step": 2403 }, { "epoch": 1.57, "grad_norm": 0.11028925329446793, "learning_rate": 0.00013882475455740447, "loss": 0.0129, "step": 2404 }, { "epoch": 1.57, "grad_norm": 0.11871008574962616, "learning_rate": 0.00013872195053809107, "loss": 0.0114, "step": 2405 }, { "epoch": 1.58, "grad_norm": 0.03218941390514374, "learning_rate": 0.00013861915184612832, "loss": 0.0043, "step": 2406 }, { "epoch": 1.58, "grad_norm": 0.02190096117556095, "learning_rate": 0.0001385163585300746, "loss": 0.003, "step": 2407 }, { "epoch": 1.58, "grad_norm": 0.08264799416065216, "learning_rate": 0.00013841357063848586, "loss": 0.0097, "step": 2408 }, { "epoch": 1.58, "grad_norm": 0.06024307757616043, "learning_rate": 0.0001383107882199155, "loss": 0.006, "step": 2409 }, { "epoch": 1.58, "grad_norm": 0.2193279266357422, "learning_rate": 0.00013820801132291425, "loss": 0.0193, "step": 2410 }, { "epoch": 1.58, "grad_norm": 0.01249188743531704, "learning_rate": 0.00013810523999603026, "loss": 0.002, "step": 2411 }, { "epoch": 1.58, "grad_norm": 0.11805452406406403, "learning_rate": 0.00013800247428780908, "loss": 0.0133, "step": 2412 }, { "epoch": 1.58, "grad_norm": 0.058763302862644196, "learning_rate": 0.0001378997142467936, "loss": 0.0078, "step": 2413 }, { "epoch": 1.58, "grad_norm": 0.0065534659661352634, "learning_rate": 0.000137796959921524, "loss": 0.0012, "step": 2414 }, { "epoch": 1.58, "grad_norm": 0.13317126035690308, "learning_rate": 0.00013769421136053777, "loss": 0.006, "step": 2415 }, { "epoch": 1.58, "grad_norm": 0.01397017389535904, "learning_rate": 0.0001375914686123697, "loss": 0.0022, "step": 2416 }, { "epoch": 1.58, "grad_norm": 0.1307278871536255, "learning_rate": 0.00013748873172555182, "loss": 0.0138, "step": 2417 }, { "epoch": 1.58, "grad_norm": 0.005166274029761553, "learning_rate": 0.00013738600074861339, "loss": 0.0009, "step": 2418 }, { "epoch": 1.58, "grad_norm": 0.01865355297923088, "learning_rate": 0.00013728327573008092, "loss": 0.0018, "step": 2419 }, { "epoch": 1.58, "grad_norm": 0.0630246177315712, "learning_rate": 0.000137180556718478, "loss": 0.0042, "step": 2420 }, { "epoch": 1.58, "grad_norm": 0.02148517221212387, "learning_rate": 0.00013707784376232546, "loss": 0.0026, "step": 2421 }, { "epoch": 1.59, "grad_norm": 0.15195278823375702, "learning_rate": 0.00013697513691014127, "loss": 0.0309, "step": 2422 }, { "epoch": 1.59, "grad_norm": 0.026208559051156044, "learning_rate": 0.00013687243621044056, "loss": 0.0021, "step": 2423 }, { "epoch": 1.59, "grad_norm": 0.04777579382061958, "learning_rate": 0.0001367697417117354, "loss": 0.0049, "step": 2424 }, { "epoch": 1.59, "grad_norm": 0.034857913851737976, "learning_rate": 0.00013666705346253508, "loss": 0.0035, "step": 2425 }, { "epoch": 1.59, "grad_norm": 0.14037910103797913, "learning_rate": 0.00013656437151134587, "loss": 0.0223, "step": 2426 }, { "epoch": 1.59, "grad_norm": 0.019575120881199837, "learning_rate": 0.00013646169590667115, "loss": 0.0024, "step": 2427 }, { "epoch": 1.59, "grad_norm": 0.18282446265220642, "learning_rate": 0.00013635902669701115, "loss": 0.0365, "step": 2428 }, { "epoch": 1.59, "grad_norm": 0.10844237357378006, "learning_rate": 0.0001362563639308632, "loss": 0.0349, "step": 2429 }, { "epoch": 1.59, "grad_norm": 0.015698986127972603, "learning_rate": 0.00013615370765672152, "loss": 0.0021, "step": 2430 }, { "epoch": 1.59, "grad_norm": 0.009203084744513035, "learning_rate": 0.00013605105792307732, "loss": 0.0013, "step": 2431 }, { "epoch": 1.59, "grad_norm": 0.20686601102352142, "learning_rate": 0.00013594841477841874, "loss": 0.0482, "step": 2432 }, { "epoch": 1.59, "grad_norm": 0.2730505168437958, "learning_rate": 0.0001358457782712307, "loss": 0.0469, "step": 2433 }, { "epoch": 1.59, "grad_norm": 0.1980660855770111, "learning_rate": 0.00013574314844999502, "loss": 0.0216, "step": 2434 }, { "epoch": 1.59, "grad_norm": 0.1146879717707634, "learning_rate": 0.00013564052536319045, "loss": 0.0404, "step": 2435 }, { "epoch": 1.59, "grad_norm": 0.13500703871250153, "learning_rate": 0.0001355379090592925, "loss": 0.0688, "step": 2436 }, { "epoch": 1.6, "grad_norm": 0.1410558968782425, "learning_rate": 0.0001354352995867734, "loss": 0.011, "step": 2437 }, { "epoch": 1.6, "grad_norm": 0.039574917405843735, "learning_rate": 0.0001353326969941023, "loss": 0.0059, "step": 2438 }, { "epoch": 1.6, "grad_norm": 0.22806905210018158, "learning_rate": 0.000135230101329745, "loss": 0.0196, "step": 2439 }, { "epoch": 1.6, "grad_norm": 0.06916101276874542, "learning_rate": 0.00013512751264216407, "loss": 0.0076, "step": 2440 }, { "epoch": 1.6, "grad_norm": 0.2902592420578003, "learning_rate": 0.00013502493097981874, "loss": 0.0285, "step": 2441 }, { "epoch": 1.6, "grad_norm": 0.16353599727153778, "learning_rate": 0.00013492235639116495, "loss": 0.0165, "step": 2442 }, { "epoch": 1.6, "grad_norm": 0.04453244060277939, "learning_rate": 0.00013481978892465528, "loss": 0.008, "step": 2443 }, { "epoch": 1.6, "grad_norm": 0.06091325357556343, "learning_rate": 0.00013471722862873903, "loss": 0.0098, "step": 2444 }, { "epoch": 1.6, "grad_norm": 0.05872859060764313, "learning_rate": 0.00013461467555186203, "loss": 0.0077, "step": 2445 }, { "epoch": 1.6, "grad_norm": 0.2814798355102539, "learning_rate": 0.00013451212974246668, "loss": 0.0367, "step": 2446 }, { "epoch": 1.6, "grad_norm": 0.09219411015510559, "learning_rate": 0.00013440959124899198, "loss": 0.0454, "step": 2447 }, { "epoch": 1.6, "grad_norm": 0.1083960011601448, "learning_rate": 0.0001343070601198735, "loss": 0.0319, "step": 2448 }, { "epoch": 1.6, "grad_norm": 0.0786982923746109, "learning_rate": 0.00013420453640354335, "loss": 0.0128, "step": 2449 }, { "epoch": 1.6, "grad_norm": 0.020560231059789658, "learning_rate": 0.00013410202014843, "loss": 0.0043, "step": 2450 }, { "epoch": 1.6, "grad_norm": 0.14801499247550964, "learning_rate": 0.0001339995114029586, "loss": 0.0191, "step": 2451 }, { "epoch": 1.61, "grad_norm": 0.13155722618103027, "learning_rate": 0.00013389701021555056, "loss": 0.0216, "step": 2452 }, { "epoch": 1.61, "grad_norm": 0.1539149135351181, "learning_rate": 0.00013379451663462388, "loss": 0.0283, "step": 2453 }, { "epoch": 1.61, "grad_norm": 0.04807708412408829, "learning_rate": 0.0001336920307085928, "loss": 0.004, "step": 2454 }, { "epoch": 1.61, "grad_norm": 0.08267413824796677, "learning_rate": 0.0001335895524858681, "loss": 0.009, "step": 2455 }, { "epoch": 1.61, "grad_norm": 0.1063155010342598, "learning_rate": 0.00013348708201485688, "loss": 0.0482, "step": 2456 }, { "epoch": 1.61, "grad_norm": 0.1579791158437729, "learning_rate": 0.0001333846193439625, "loss": 0.0231, "step": 2457 }, { "epoch": 1.61, "grad_norm": 0.14827631413936615, "learning_rate": 0.00013328216452158478, "loss": 0.0184, "step": 2458 }, { "epoch": 1.61, "grad_norm": 0.09377805888652802, "learning_rate": 0.0001331797175961196, "loss": 0.0242, "step": 2459 }, { "epoch": 1.61, "grad_norm": 0.13285934925079346, "learning_rate": 0.00013307727861595938, "loss": 0.0322, "step": 2460 }, { "epoch": 1.61, "grad_norm": 0.07158241420984268, "learning_rate": 0.0001329748476294926, "loss": 0.0152, "step": 2461 }, { "epoch": 1.61, "grad_norm": 0.03591454401612282, "learning_rate": 0.00013287242468510408, "loss": 0.0052, "step": 2462 }, { "epoch": 1.61, "grad_norm": 0.04562580958008766, "learning_rate": 0.0001327700098311747, "loss": 0.0078, "step": 2463 }, { "epoch": 1.61, "grad_norm": 0.22357186675071716, "learning_rate": 0.00013266760311608168, "loss": 0.0426, "step": 2464 }, { "epoch": 1.61, "grad_norm": 0.019299369305372238, "learning_rate": 0.0001325652045881983, "loss": 0.0027, "step": 2465 }, { "epoch": 1.61, "grad_norm": 0.1803794503211975, "learning_rate": 0.00013246281429589397, "loss": 0.0299, "step": 2466 }, { "epoch": 1.62, "grad_norm": 0.08838968724012375, "learning_rate": 0.00013236043228753431, "loss": 0.0078, "step": 2467 }, { "epoch": 1.62, "grad_norm": 0.07040340453386307, "learning_rate": 0.00013225805861148086, "loss": 0.0167, "step": 2468 }, { "epoch": 1.62, "grad_norm": 0.07686702907085419, "learning_rate": 0.00013215569331609134, "loss": 0.0068, "step": 2469 }, { "epoch": 1.62, "grad_norm": 0.10085508972406387, "learning_rate": 0.0001320533364497195, "loss": 0.0228, "step": 2470 }, { "epoch": 1.62, "grad_norm": 0.060376573354005814, "learning_rate": 0.0001319509880607151, "loss": 0.0095, "step": 2471 }, { "epoch": 1.62, "grad_norm": 0.028011616319417953, "learning_rate": 0.00013184864819742385, "loss": 0.0051, "step": 2472 }, { "epoch": 1.62, "grad_norm": 0.04475580155849457, "learning_rate": 0.00013174631690818749, "loss": 0.0059, "step": 2473 }, { "epoch": 1.62, "grad_norm": 0.031112445518374443, "learning_rate": 0.00013164399424134374, "loss": 0.0043, "step": 2474 }, { "epoch": 1.62, "grad_norm": 0.38572391867637634, "learning_rate": 0.00013154168024522616, "loss": 0.0355, "step": 2475 }, { "epoch": 1.62, "grad_norm": 0.2470254898071289, "learning_rate": 0.00013143937496816422, "loss": 0.0146, "step": 2476 }, { "epoch": 1.62, "grad_norm": 0.08601324260234833, "learning_rate": 0.00013133707845848334, "loss": 0.0132, "step": 2477 }, { "epoch": 1.62, "grad_norm": 0.09904181212186813, "learning_rate": 0.00013123479076450478, "loss": 0.0249, "step": 2478 }, { "epoch": 1.62, "grad_norm": 0.013944767415523529, "learning_rate": 0.00013113251193454557, "loss": 0.0016, "step": 2479 }, { "epoch": 1.62, "grad_norm": 0.05794617161154747, "learning_rate": 0.00013103024201691868, "loss": 0.0054, "step": 2480 }, { "epoch": 1.62, "grad_norm": 0.25573623180389404, "learning_rate": 0.00013092798105993273, "loss": 0.0472, "step": 2481 }, { "epoch": 1.62, "grad_norm": 0.14380040764808655, "learning_rate": 0.00013082572911189217, "loss": 0.0255, "step": 2482 }, { "epoch": 1.63, "grad_norm": 0.0981423556804657, "learning_rate": 0.0001307234862210972, "loss": 0.0067, "step": 2483 }, { "epoch": 1.63, "grad_norm": 0.19898656010627747, "learning_rate": 0.0001306212524358438, "loss": 0.0079, "step": 2484 }, { "epoch": 1.63, "grad_norm": 0.1341608613729477, "learning_rate": 0.00013051902780442348, "loss": 0.0093, "step": 2485 }, { "epoch": 1.63, "grad_norm": 0.294636070728302, "learning_rate": 0.00013041681237512358, "loss": 0.0619, "step": 2486 }, { "epoch": 1.63, "grad_norm": 0.03222107142210007, "learning_rate": 0.00013031460619622706, "loss": 0.0044, "step": 2487 }, { "epoch": 1.63, "grad_norm": 0.11569062620401382, "learning_rate": 0.00013021240931601247, "loss": 0.0113, "step": 2488 }, { "epoch": 1.63, "grad_norm": 0.25324419140815735, "learning_rate": 0.000130110221782754, "loss": 0.0169, "step": 2489 }, { "epoch": 1.63, "grad_norm": 0.3488900065422058, "learning_rate": 0.00013000804364472144, "loss": 0.0328, "step": 2490 }, { "epoch": 1.63, "grad_norm": 0.053114596754312515, "learning_rate": 0.00012990587495018005, "loss": 0.0059, "step": 2491 }, { "epoch": 1.63, "grad_norm": 0.028155844658613205, "learning_rate": 0.0001298037157473908, "loss": 0.0036, "step": 2492 }, { "epoch": 1.63, "grad_norm": 0.019256332889199257, "learning_rate": 0.00012970156608461, "loss": 0.0024, "step": 2493 }, { "epoch": 1.63, "grad_norm": 0.2722720205783844, "learning_rate": 0.00012959942601008953, "loss": 0.017, "step": 2494 }, { "epoch": 1.63, "grad_norm": 0.02251126803457737, "learning_rate": 0.00012949729557207678, "loss": 0.0018, "step": 2495 }, { "epoch": 1.63, "grad_norm": 0.11708138138055801, "learning_rate": 0.00012939517481881448, "loss": 0.0076, "step": 2496 }, { "epoch": 1.63, "grad_norm": 0.07980841398239136, "learning_rate": 0.00012929306379854096, "loss": 0.0118, "step": 2497 }, { "epoch": 1.64, "grad_norm": 0.04632432758808136, "learning_rate": 0.00012919096255948974, "loss": 0.0044, "step": 2498 }, { "epoch": 1.64, "grad_norm": 0.09860672801733017, "learning_rate": 0.00012908887114988993, "loss": 0.004, "step": 2499 }, { "epoch": 1.64, "grad_norm": 0.10034430027008057, "learning_rate": 0.00012898678961796578, "loss": 0.0076, "step": 2500 }, { "epoch": 1.64, "grad_norm": 0.3471710979938507, "learning_rate": 0.00012888471801193702, "loss": 0.0284, "step": 2501 }, { "epoch": 1.64, "grad_norm": 0.219834566116333, "learning_rate": 0.00012878265638001867, "loss": 0.0278, "step": 2502 }, { "epoch": 1.64, "grad_norm": 0.1285816878080368, "learning_rate": 0.00012868060477042105, "loss": 0.0115, "step": 2503 }, { "epoch": 1.64, "grad_norm": 0.09379381686449051, "learning_rate": 0.00012857856323134965, "loss": 0.0052, "step": 2504 }, { "epoch": 1.64, "grad_norm": 0.1482100635766983, "learning_rate": 0.00012847653181100534, "loss": 0.0259, "step": 2505 }, { "epoch": 1.64, "grad_norm": 0.062140848487615585, "learning_rate": 0.00012837451055758414, "loss": 0.0051, "step": 2506 }, { "epoch": 1.64, "grad_norm": 0.12353700399398804, "learning_rate": 0.00012827249951927723, "loss": 0.0227, "step": 2507 }, { "epoch": 1.64, "grad_norm": 0.013850602321326733, "learning_rate": 0.00012817049874427108, "loss": 0.0008, "step": 2508 }, { "epoch": 1.64, "grad_norm": 0.006505441851913929, "learning_rate": 0.00012806850828074717, "loss": 0.0009, "step": 2509 }, { "epoch": 1.64, "grad_norm": 0.25963035225868225, "learning_rate": 0.0001279665281768822, "loss": 0.014, "step": 2510 }, { "epoch": 1.64, "grad_norm": 0.16497601568698883, "learning_rate": 0.00012786455848084793, "loss": 0.0103, "step": 2511 }, { "epoch": 1.64, "grad_norm": 0.036567408591508865, "learning_rate": 0.0001277625992408113, "loss": 0.0033, "step": 2512 }, { "epoch": 1.65, "grad_norm": 0.1094021275639534, "learning_rate": 0.00012766065050493416, "loss": 0.0035, "step": 2513 }, { "epoch": 1.65, "grad_norm": 0.029884997755289078, "learning_rate": 0.00012755871232137354, "loss": 0.0018, "step": 2514 }, { "epoch": 1.65, "grad_norm": 0.19183427095413208, "learning_rate": 0.00012745678473828138, "loss": 0.0354, "step": 2515 }, { "epoch": 1.65, "grad_norm": 0.00821281410753727, "learning_rate": 0.0001273548678038047, "loss": 0.0011, "step": 2516 }, { "epoch": 1.65, "grad_norm": 0.11743585020303726, "learning_rate": 0.00012725296156608536, "loss": 0.012, "step": 2517 }, { "epoch": 1.65, "grad_norm": 0.4067364037036896, "learning_rate": 0.00012715106607326032, "loss": 0.0251, "step": 2518 }, { "epoch": 1.65, "grad_norm": 0.13303601741790771, "learning_rate": 0.0001270491813734614, "loss": 0.022, "step": 2519 }, { "epoch": 1.65, "grad_norm": 0.46618887782096863, "learning_rate": 0.00012694730751481532, "loss": 0.0312, "step": 2520 }, { "epoch": 1.65, "grad_norm": 0.261059433221817, "learning_rate": 0.00012684544454544364, "loss": 0.0177, "step": 2521 }, { "epoch": 1.65, "grad_norm": 0.3947398364543915, "learning_rate": 0.00012674359251346284, "loss": 0.0138, "step": 2522 }, { "epoch": 1.65, "grad_norm": 0.02748054452240467, "learning_rate": 0.00012664175146698422, "loss": 0.0018, "step": 2523 }, { "epoch": 1.65, "grad_norm": 0.159784734249115, "learning_rate": 0.00012653992145411383, "loss": 0.0245, "step": 2524 }, { "epoch": 1.65, "grad_norm": 0.265787810087204, "learning_rate": 0.00012643810252295265, "loss": 0.0133, "step": 2525 }, { "epoch": 1.65, "grad_norm": 0.08137747645378113, "learning_rate": 0.00012633629472159623, "loss": 0.0061, "step": 2526 }, { "epoch": 1.65, "grad_norm": 0.19212502241134644, "learning_rate": 0.000126234498098135, "loss": 0.0457, "step": 2527 }, { "epoch": 1.65, "grad_norm": 0.232799232006073, "learning_rate": 0.0001261327127006541, "loss": 0.0251, "step": 2528 }, { "epoch": 1.66, "grad_norm": 0.2269105315208435, "learning_rate": 0.00012603093857723336, "loss": 0.0424, "step": 2529 }, { "epoch": 1.66, "grad_norm": 0.044430967420339584, "learning_rate": 0.00012592917577594718, "loss": 0.0041, "step": 2530 }, { "epoch": 1.66, "grad_norm": 0.1506963074207306, "learning_rate": 0.00012582742434486476, "loss": 0.0048, "step": 2531 }, { "epoch": 1.66, "grad_norm": 0.005820784717798233, "learning_rate": 0.00012572568433204986, "loss": 0.0006, "step": 2532 }, { "epoch": 1.66, "grad_norm": 0.035203348845243454, "learning_rate": 0.00012562395578556086, "loss": 0.0035, "step": 2533 }, { "epoch": 1.66, "grad_norm": 0.27628517150878906, "learning_rate": 0.00012552223875345072, "loss": 0.012, "step": 2534 }, { "epoch": 1.66, "grad_norm": 0.1673869639635086, "learning_rate": 0.00012542053328376695, "loss": 0.0201, "step": 2535 }, { "epoch": 1.66, "grad_norm": 0.013920975849032402, "learning_rate": 0.0001253188394245516, "loss": 0.0011, "step": 2536 }, { "epoch": 1.66, "grad_norm": 0.11940686404705048, "learning_rate": 0.0001252171572238412, "loss": 0.0078, "step": 2537 }, { "epoch": 1.66, "grad_norm": 0.1563238948583603, "learning_rate": 0.00012511548672966696, "loss": 0.0158, "step": 2538 }, { "epoch": 1.66, "grad_norm": 0.4310009777545929, "learning_rate": 0.00012501382799005425, "loss": 0.0919, "step": 2539 }, { "epoch": 1.66, "grad_norm": 0.10280542075634003, "learning_rate": 0.00012491218105302313, "loss": 0.0325, "step": 2540 }, { "epoch": 1.66, "grad_norm": 0.005980873014777899, "learning_rate": 0.000124810545966588, "loss": 0.0007, "step": 2541 }, { "epoch": 1.66, "grad_norm": 0.17933742702007294, "learning_rate": 0.00012470892277875774, "loss": 0.0362, "step": 2542 }, { "epoch": 1.66, "grad_norm": 0.14791390299797058, "learning_rate": 0.00012460731153753543, "loss": 0.052, "step": 2543 }, { "epoch": 1.67, "grad_norm": 0.09524931013584137, "learning_rate": 0.00012450571229091865, "loss": 0.0138, "step": 2544 }, { "epoch": 1.67, "grad_norm": 0.14938437938690186, "learning_rate": 0.00012440412508689928, "loss": 0.0098, "step": 2545 }, { "epoch": 1.67, "grad_norm": 0.22291189432144165, "learning_rate": 0.00012430254997346354, "loss": 0.0275, "step": 2546 }, { "epoch": 1.67, "grad_norm": 0.09883278608322144, "learning_rate": 0.00012420098699859192, "loss": 0.0043, "step": 2547 }, { "epoch": 1.67, "grad_norm": 0.09738563001155853, "learning_rate": 0.0001240994362102591, "loss": 0.009, "step": 2548 }, { "epoch": 1.67, "grad_norm": 0.11794974654912949, "learning_rate": 0.00012399789765643411, "loss": 0.0264, "step": 2549 }, { "epoch": 1.67, "grad_norm": 0.13492193818092346, "learning_rate": 0.0001238963713850802, "loss": 0.0309, "step": 2550 }, { "epoch": 1.67, "grad_norm": 0.19299760460853577, "learning_rate": 0.00012379485744415476, "loss": 0.0204, "step": 2551 }, { "epoch": 1.67, "grad_norm": 0.0748491957783699, "learning_rate": 0.00012369335588160933, "loss": 0.011, "step": 2552 }, { "epoch": 1.67, "grad_norm": 0.11878203600645065, "learning_rate": 0.0001235918667453897, "loss": 0.0218, "step": 2553 }, { "epoch": 1.67, "grad_norm": 0.06896767765283585, "learning_rate": 0.00012349039008343568, "loss": 0.0074, "step": 2554 }, { "epoch": 1.67, "grad_norm": 0.0966789722442627, "learning_rate": 0.0001233889259436813, "loss": 0.0162, "step": 2555 }, { "epoch": 1.67, "grad_norm": 0.13032063841819763, "learning_rate": 0.00012328747437405466, "loss": 0.0299, "step": 2556 }, { "epoch": 1.67, "grad_norm": 0.056150924414396286, "learning_rate": 0.0001231860354224778, "loss": 0.0147, "step": 2557 }, { "epoch": 1.67, "grad_norm": 0.18075281381607056, "learning_rate": 0.0001230846091368669, "loss": 0.0511, "step": 2558 }, { "epoch": 1.68, "grad_norm": 0.02879105694591999, "learning_rate": 0.00012298319556513216, "loss": 0.003, "step": 2559 }, { "epoch": 1.68, "grad_norm": 0.0305055920034647, "learning_rate": 0.0001228817947551778, "loss": 0.0038, "step": 2560 }, { "epoch": 1.68, "grad_norm": 0.08900358527898788, "learning_rate": 0.00012278040675490186, "loss": 0.011, "step": 2561 }, { "epoch": 1.68, "grad_norm": 0.22078153491020203, "learning_rate": 0.0001226790316121965, "loss": 0.0269, "step": 2562 }, { "epoch": 1.68, "grad_norm": 0.11649607867002487, "learning_rate": 0.00012257766937494774, "loss": 0.0144, "step": 2563 }, { "epoch": 1.68, "grad_norm": 0.08650204539299011, "learning_rate": 0.00012247632009103552, "loss": 0.0122, "step": 2564 }, { "epoch": 1.68, "grad_norm": 0.1292939931154251, "learning_rate": 0.0001223749838083336, "loss": 0.0151, "step": 2565 }, { "epoch": 1.68, "grad_norm": 0.05495935305953026, "learning_rate": 0.00012227366057470968, "loss": 0.007, "step": 2566 }, { "epoch": 1.68, "grad_norm": 0.10544559359550476, "learning_rate": 0.00012217235043802526, "loss": 0.0161, "step": 2567 }, { "epoch": 1.68, "grad_norm": 0.17187099158763885, "learning_rate": 0.00012207105344613566, "loss": 0.0419, "step": 2568 }, { "epoch": 1.68, "grad_norm": 0.09299924224615097, "learning_rate": 0.00012196976964689001, "loss": 0.0047, "step": 2569 }, { "epoch": 1.68, "grad_norm": 0.14098414778709412, "learning_rate": 0.00012186849908813111, "loss": 0.0273, "step": 2570 }, { "epoch": 1.68, "grad_norm": 0.18325307965278625, "learning_rate": 0.00012176724181769564, "loss": 0.0292, "step": 2571 }, { "epoch": 1.68, "grad_norm": 0.23543445765972137, "learning_rate": 0.00012166599788341393, "loss": 0.061, "step": 2572 }, { "epoch": 1.68, "grad_norm": 0.23078452050685883, "learning_rate": 0.00012156476733311005, "loss": 0.0406, "step": 2573 }, { "epoch": 1.69, "grad_norm": 0.07733777910470963, "learning_rate": 0.00012146355021460166, "loss": 0.0074, "step": 2574 }, { "epoch": 1.69, "grad_norm": 0.17538850009441376, "learning_rate": 0.00012136234657570018, "loss": 0.023, "step": 2575 }, { "epoch": 1.69, "grad_norm": 0.1247786208987236, "learning_rate": 0.00012126115646421062, "loss": 0.0061, "step": 2576 }, { "epoch": 1.69, "grad_norm": 0.011379680596292019, "learning_rate": 0.00012115997992793163, "loss": 0.0014, "step": 2577 }, { "epoch": 1.69, "grad_norm": 0.029602685943245888, "learning_rate": 0.00012105881701465533, "loss": 0.0037, "step": 2578 }, { "epoch": 1.69, "grad_norm": 0.23150865733623505, "learning_rate": 0.00012095766777216755, "loss": 0.027, "step": 2579 }, { "epoch": 1.69, "grad_norm": 0.2272222638130188, "learning_rate": 0.00012085653224824761, "loss": 0.0493, "step": 2580 }, { "epoch": 1.69, "grad_norm": 0.2526804208755493, "learning_rate": 0.00012075541049066832, "loss": 0.036, "step": 2581 }, { "epoch": 1.69, "grad_norm": 0.29018113017082214, "learning_rate": 0.00012065430254719608, "loss": 0.0353, "step": 2582 }, { "epoch": 1.69, "grad_norm": 0.062286440283060074, "learning_rate": 0.0001205532084655906, "loss": 0.0081, "step": 2583 }, { "epoch": 1.69, "grad_norm": 0.07457780838012695, "learning_rate": 0.00012045212829360517, "loss": 0.0109, "step": 2584 }, { "epoch": 1.69, "grad_norm": 0.34257975220680237, "learning_rate": 0.0001203510620789865, "loss": 0.032, "step": 2585 }, { "epoch": 1.69, "grad_norm": 0.2395414412021637, "learning_rate": 0.00012025000986947471, "loss": 0.0255, "step": 2586 }, { "epoch": 1.69, "grad_norm": 0.07272249460220337, "learning_rate": 0.00012014897171280323, "loss": 0.0086, "step": 2587 }, { "epoch": 1.69, "grad_norm": 0.14450271427631378, "learning_rate": 0.0001200479476566989, "loss": 0.0217, "step": 2588 }, { "epoch": 1.69, "grad_norm": 0.24903085827827454, "learning_rate": 0.00011994693774888192, "loss": 0.0251, "step": 2589 }, { "epoch": 1.7, "grad_norm": 0.13333503901958466, "learning_rate": 0.00011984594203706583, "loss": 0.0176, "step": 2590 }, { "epoch": 1.7, "grad_norm": 0.061313144862651825, "learning_rate": 0.00011974496056895735, "loss": 0.0043, "step": 2591 }, { "epoch": 1.7, "grad_norm": 0.21405766904354095, "learning_rate": 0.00011964399339225658, "loss": 0.0227, "step": 2592 }, { "epoch": 1.7, "grad_norm": 0.11095394939184189, "learning_rate": 0.00011954304055465683, "loss": 0.0208, "step": 2593 }, { "epoch": 1.7, "grad_norm": 0.11780749261379242, "learning_rate": 0.00011944210210384464, "loss": 0.0139, "step": 2594 }, { "epoch": 1.7, "grad_norm": 0.21892563998699188, "learning_rate": 0.00011934117808749978, "loss": 0.0234, "step": 2595 }, { "epoch": 1.7, "grad_norm": 0.08663798868656158, "learning_rate": 0.00011924026855329511, "loss": 0.0085, "step": 2596 }, { "epoch": 1.7, "grad_norm": 0.18243402242660522, "learning_rate": 0.00011913937354889678, "loss": 0.035, "step": 2597 }, { "epoch": 1.7, "grad_norm": 0.023229578509926796, "learning_rate": 0.00011903849312196398, "loss": 0.0015, "step": 2598 }, { "epoch": 1.7, "grad_norm": 0.14799842238426208, "learning_rate": 0.00011893762732014909, "loss": 0.0214, "step": 2599 }, { "epoch": 1.7, "grad_norm": 0.29807209968566895, "learning_rate": 0.00011883677619109746, "loss": 0.024, "step": 2600 }, { "epoch": 1.7, "grad_norm": 0.1408165544271469, "learning_rate": 0.00011873593978244771, "loss": 0.0106, "step": 2601 }, { "epoch": 1.7, "grad_norm": 0.11649095267057419, "learning_rate": 0.00011863511814183123, "loss": 0.0208, "step": 2602 }, { "epoch": 1.7, "grad_norm": 0.1464689075946808, "learning_rate": 0.00011853431131687267, "loss": 0.0253, "step": 2603 }, { "epoch": 1.7, "grad_norm": 0.3470141589641571, "learning_rate": 0.00011843351935518957, "loss": 0.0329, "step": 2604 }, { "epoch": 1.71, "grad_norm": 0.2767955958843231, "learning_rate": 0.00011833274230439255, "loss": 0.0307, "step": 2605 }, { "epoch": 1.71, "grad_norm": 0.047959037125110626, "learning_rate": 0.00011823198021208503, "loss": 0.002, "step": 2606 }, { "epoch": 1.71, "grad_norm": 0.05150588974356651, "learning_rate": 0.00011813123312586349, "loss": 0.0031, "step": 2607 }, { "epoch": 1.71, "grad_norm": 0.21748711168766022, "learning_rate": 0.00011803050109331725, "loss": 0.0434, "step": 2608 }, { "epoch": 1.71, "grad_norm": 0.26486659049987793, "learning_rate": 0.0001179297841620286, "loss": 0.0183, "step": 2609 }, { "epoch": 1.71, "grad_norm": 0.04066809266805649, "learning_rate": 0.00011782908237957265, "loss": 0.0033, "step": 2610 }, { "epoch": 1.71, "grad_norm": 0.13910254836082458, "learning_rate": 0.00011772839579351726, "loss": 0.0109, "step": 2611 }, { "epoch": 1.71, "grad_norm": 0.24764953553676605, "learning_rate": 0.00011762772445142329, "loss": 0.0182, "step": 2612 }, { "epoch": 1.71, "grad_norm": 0.0986236035823822, "learning_rate": 0.00011752706840084428, "loss": 0.0089, "step": 2613 }, { "epoch": 1.71, "grad_norm": 0.322979211807251, "learning_rate": 0.0001174264276893266, "loss": 0.0356, "step": 2614 }, { "epoch": 1.71, "grad_norm": 0.3145226538181305, "learning_rate": 0.00011732580236440934, "loss": 0.0271, "step": 2615 }, { "epoch": 1.71, "grad_norm": 0.10612889379262924, "learning_rate": 0.00011722519247362431, "loss": 0.0093, "step": 2616 }, { "epoch": 1.71, "grad_norm": 0.2893246114253998, "learning_rate": 0.00011712459806449608, "loss": 0.0193, "step": 2617 }, { "epoch": 1.71, "grad_norm": 0.16755311191082, "learning_rate": 0.00011702401918454192, "loss": 0.0129, "step": 2618 }, { "epoch": 1.71, "grad_norm": 0.029482614248991013, "learning_rate": 0.00011692345588127165, "loss": 0.0031, "step": 2619 }, { "epoch": 1.72, "grad_norm": 0.13832448422908783, "learning_rate": 0.00011682290820218785, "loss": 0.0061, "step": 2620 }, { "epoch": 1.72, "grad_norm": 0.19813022017478943, "learning_rate": 0.00011672237619478566, "loss": 0.0332, "step": 2621 }, { "epoch": 1.72, "grad_norm": 0.350599080324173, "learning_rate": 0.00011662185990655284, "loss": 0.0375, "step": 2622 }, { "epoch": 1.72, "grad_norm": 0.29366403818130493, "learning_rate": 0.00011652135938496977, "loss": 0.0523, "step": 2623 }, { "epoch": 1.72, "grad_norm": 0.17575830221176147, "learning_rate": 0.00011642087467750924, "loss": 0.0345, "step": 2624 }, { "epoch": 1.72, "grad_norm": 0.14889006316661835, "learning_rate": 0.00011632040583163673, "loss": 0.0164, "step": 2625 }, { "epoch": 1.72, "grad_norm": 0.09447371959686279, "learning_rate": 0.00011621995289481013, "loss": 0.0103, "step": 2626 }, { "epoch": 1.72, "grad_norm": 0.054458245635032654, "learning_rate": 0.00011611951591447991, "loss": 0.0036, "step": 2627 }, { "epoch": 1.72, "grad_norm": 0.19059543311595917, "learning_rate": 0.00011601909493808882, "loss": 0.0166, "step": 2628 }, { "epoch": 1.72, "grad_norm": 0.2544896900653839, "learning_rate": 0.00011591869001307226, "loss": 0.0297, "step": 2629 }, { "epoch": 1.72, "grad_norm": 0.07179494202136993, "learning_rate": 0.00011581830118685792, "loss": 0.0104, "step": 2630 }, { "epoch": 1.72, "grad_norm": 0.16107530891895294, "learning_rate": 0.00011571792850686595, "loss": 0.0107, "step": 2631 }, { "epoch": 1.72, "grad_norm": 0.14669694006443024, "learning_rate": 0.0001156175720205088, "loss": 0.0268, "step": 2632 }, { "epoch": 1.72, "grad_norm": 0.19625726342201233, "learning_rate": 0.00011551723177519134, "loss": 0.0296, "step": 2633 }, { "epoch": 1.72, "grad_norm": 0.09380532801151276, "learning_rate": 0.00011541690781831074, "loss": 0.0082, "step": 2634 }, { "epoch": 1.73, "grad_norm": 0.03426641598343849, "learning_rate": 0.00011531660019725648, "loss": 0.0042, "step": 2635 }, { "epoch": 1.73, "grad_norm": 0.10879172384738922, "learning_rate": 0.00011521630895941036, "loss": 0.0259, "step": 2636 }, { "epoch": 1.73, "grad_norm": 0.2619246542453766, "learning_rate": 0.00011511603415214633, "loss": 0.0394, "step": 2637 }, { "epoch": 1.73, "grad_norm": 0.06384444236755371, "learning_rate": 0.00011501577582283071, "loss": 0.0063, "step": 2638 }, { "epoch": 1.73, "grad_norm": 0.11376836895942688, "learning_rate": 0.00011491553401882195, "loss": 0.0164, "step": 2639 }, { "epoch": 1.73, "grad_norm": 0.011989972554147243, "learning_rate": 0.00011481530878747076, "loss": 0.0013, "step": 2640 }, { "epoch": 1.73, "grad_norm": 0.12901929020881653, "learning_rate": 0.00011471510017611995, "loss": 0.0059, "step": 2641 }, { "epoch": 1.73, "grad_norm": 0.036112286150455475, "learning_rate": 0.00011461490823210451, "loss": 0.0034, "step": 2642 }, { "epoch": 1.73, "grad_norm": 0.13186568021774292, "learning_rate": 0.00011451473300275158, "loss": 0.0125, "step": 2643 }, { "epoch": 1.73, "grad_norm": 0.08135777711868286, "learning_rate": 0.00011441457453538038, "loss": 0.0076, "step": 2644 }, { "epoch": 1.73, "grad_norm": 0.31037700176239014, "learning_rate": 0.00011431443287730226, "loss": 0.0438, "step": 2645 }, { "epoch": 1.73, "grad_norm": 0.02185073494911194, "learning_rate": 0.0001142143080758205, "loss": 0.0025, "step": 2646 }, { "epoch": 1.73, "grad_norm": 0.09959240257740021, "learning_rate": 0.00011411420017823056, "loss": 0.0079, "step": 2647 }, { "epoch": 1.73, "grad_norm": 0.06049061939120293, "learning_rate": 0.00011401410923181986, "loss": 0.0053, "step": 2648 }, { "epoch": 1.73, "grad_norm": 0.03902551159262657, "learning_rate": 0.00011391403528386782, "loss": 0.0029, "step": 2649 }, { "epoch": 1.73, "grad_norm": 0.17166626453399658, "learning_rate": 0.0001138139783816458, "loss": 0.0218, "step": 2650 }, { "epoch": 1.74, "grad_norm": 0.026734329760074615, "learning_rate": 0.00011371393857241713, "loss": 0.0027, "step": 2651 }, { "epoch": 1.74, "grad_norm": 0.007330378983169794, "learning_rate": 0.0001136139159034371, "loss": 0.0007, "step": 2652 }, { "epoch": 1.74, "grad_norm": 0.030042244121432304, "learning_rate": 0.00011351391042195287, "loss": 0.0023, "step": 2653 }, { "epoch": 1.74, "grad_norm": 0.1411576271057129, "learning_rate": 0.00011341392217520345, "loss": 0.0126, "step": 2654 }, { "epoch": 1.74, "grad_norm": 0.2431839257478714, "learning_rate": 0.00011331395121041975, "loss": 0.0162, "step": 2655 }, { "epoch": 1.74, "grad_norm": 0.013232244178652763, "learning_rate": 0.0001132139975748245, "loss": 0.0011, "step": 2656 }, { "epoch": 1.74, "grad_norm": 0.2504200041294098, "learning_rate": 0.0001131140613156323, "loss": 0.0161, "step": 2657 }, { "epoch": 1.74, "grad_norm": 0.09972897917032242, "learning_rate": 0.00011301414248004949, "loss": 0.0057, "step": 2658 }, { "epoch": 1.74, "grad_norm": 0.22492682933807373, "learning_rate": 0.00011291424111527412, "loss": 0.0339, "step": 2659 }, { "epoch": 1.74, "grad_norm": 0.1436653435230255, "learning_rate": 0.0001128143572684961, "loss": 0.007, "step": 2660 }, { "epoch": 1.74, "grad_norm": 0.0187530554831028, "learning_rate": 0.000112714490986897, "loss": 0.0019, "step": 2661 }, { "epoch": 1.74, "grad_norm": 0.21609365940093994, "learning_rate": 0.00011261464231765017, "loss": 0.0084, "step": 2662 }, { "epoch": 1.74, "grad_norm": 0.02223445661365986, "learning_rate": 0.00011251481130792048, "loss": 0.0011, "step": 2663 }, { "epoch": 1.74, "grad_norm": 0.14209489524364471, "learning_rate": 0.0001124149980048646, "loss": 0.0057, "step": 2664 }, { "epoch": 1.74, "grad_norm": 0.27440565824508667, "learning_rate": 0.00011231520245563082, "loss": 0.0169, "step": 2665 }, { "epoch": 1.75, "grad_norm": 0.25544804334640503, "learning_rate": 0.00011221542470735904, "loss": 0.0085, "step": 2666 }, { "epoch": 1.75, "grad_norm": 0.10816871374845505, "learning_rate": 0.00011211566480718064, "loss": 0.0083, "step": 2667 }, { "epoch": 1.75, "grad_norm": 0.0025934341829270124, "learning_rate": 0.00011201592280221872, "loss": 0.0003, "step": 2668 }, { "epoch": 1.75, "grad_norm": 0.11705806106328964, "learning_rate": 0.00011191619873958785, "loss": 0.0066, "step": 2669 }, { "epoch": 1.75, "grad_norm": 0.15215551853179932, "learning_rate": 0.00011181649266639416, "loss": 0.0297, "step": 2670 }, { "epoch": 1.75, "grad_norm": 0.29455116391181946, "learning_rate": 0.00011171680462973526, "loss": 0.0228, "step": 2671 }, { "epoch": 1.75, "grad_norm": 0.38029757142066956, "learning_rate": 0.00011161713467670022, "loss": 0.0303, "step": 2672 }, { "epoch": 1.75, "grad_norm": 0.3070727586746216, "learning_rate": 0.0001115174828543696, "loss": 0.0516, "step": 2673 }, { "epoch": 1.75, "grad_norm": 0.1334857940673828, "learning_rate": 0.00011141784920981539, "loss": 0.021, "step": 2674 }, { "epoch": 1.75, "eval_loss": 0.03871524706482887, "eval_runtime": 39.9789, "eval_samples_per_second": 32.192, "eval_steps_per_second": 8.054, "step": 2674 }, { "epoch": 1.75, "grad_norm": 0.4947819709777832, "learning_rate": 0.00011131823379010101, "loss": 0.0455, "step": 2675 }, { "epoch": 1.75, "grad_norm": 0.3073294758796692, "learning_rate": 0.00011121863664228123, "loss": 0.0581, "step": 2676 }, { "epoch": 1.75, "grad_norm": 0.17356103658676147, "learning_rate": 0.0001111190578134022, "loss": 0.0228, "step": 2677 }, { "epoch": 1.75, "grad_norm": 0.10131116956472397, "learning_rate": 0.00011101949735050143, "loss": 0.0063, "step": 2678 }, { "epoch": 1.75, "grad_norm": 0.08166798949241638, "learning_rate": 0.00011091995530060781, "loss": 0.0091, "step": 2679 }, { "epoch": 1.75, "grad_norm": 0.13784568011760712, "learning_rate": 0.0001108204317107414, "loss": 0.0136, "step": 2680 }, { "epoch": 1.76, "grad_norm": 0.16461403667926788, "learning_rate": 0.00011072092662791364, "loss": 0.0338, "step": 2681 }, { "epoch": 1.76, "grad_norm": 0.1617322415113449, "learning_rate": 0.00011062144009912721, "loss": 0.0205, "step": 2682 }, { "epoch": 1.76, "grad_norm": 0.17539392411708832, "learning_rate": 0.000110521972171376, "loss": 0.0044, "step": 2683 }, { "epoch": 1.76, "grad_norm": 0.0066742608323693275, "learning_rate": 0.00011042252289164518, "loss": 0.0009, "step": 2684 }, { "epoch": 1.76, "grad_norm": 0.09298042207956314, "learning_rate": 0.000110323092306911, "loss": 0.0209, "step": 2685 }, { "epoch": 1.76, "grad_norm": 0.05764083191752434, "learning_rate": 0.00011022368046414096, "loss": 0.0038, "step": 2686 }, { "epoch": 1.76, "grad_norm": 0.0654100552201271, "learning_rate": 0.00011012428741029372, "loss": 0.0095, "step": 2687 }, { "epoch": 1.76, "grad_norm": 0.008322247304022312, "learning_rate": 0.00011002491319231902, "loss": 0.0006, "step": 2688 }, { "epoch": 1.76, "grad_norm": 0.33108824491500854, "learning_rate": 0.00010992555785715771, "loss": 0.0333, "step": 2689 }, { "epoch": 1.76, "grad_norm": 0.16499800980091095, "learning_rate": 0.0001098262214517417, "loss": 0.0243, "step": 2690 }, { "epoch": 1.76, "grad_norm": 0.10082323104143143, "learning_rate": 0.00010972690402299402, "loss": 0.0077, "step": 2691 }, { "epoch": 1.76, "grad_norm": 0.3231547772884369, "learning_rate": 0.00010962760561782873, "loss": 0.0376, "step": 2692 }, { "epoch": 1.76, "grad_norm": 0.04128749296069145, "learning_rate": 0.0001095283262831508, "loss": 0.0052, "step": 2693 }, { "epoch": 1.76, "grad_norm": 0.2695901691913605, "learning_rate": 0.0001094290660658563, "loss": 0.0303, "step": 2694 }, { "epoch": 1.76, "grad_norm": 0.015289656817913055, "learning_rate": 0.00010932982501283224, "loss": 0.002, "step": 2695 }, { "epoch": 1.76, "grad_norm": 0.10868901014328003, "learning_rate": 0.0001092306031709566, "loss": 0.0261, "step": 2696 }, { "epoch": 1.77, "grad_norm": 0.029903091490268707, "learning_rate": 0.00010913140058709824, "loss": 0.0032, "step": 2697 }, { "epoch": 1.77, "grad_norm": 0.1284814178943634, "learning_rate": 0.00010903221730811692, "loss": 0.0192, "step": 2698 }, { "epoch": 1.77, "grad_norm": 0.09511958807706833, "learning_rate": 0.00010893305338086334, "loss": 0.0291, "step": 2699 }, { "epoch": 1.77, "grad_norm": 0.14972400665283203, "learning_rate": 0.00010883390885217896, "loss": 0.0114, "step": 2700 }, { "epoch": 1.77, "grad_norm": 0.14529098570346832, "learning_rate": 0.00010873478376889625, "loss": 0.0153, "step": 2701 }, { "epoch": 1.77, "grad_norm": 0.12022048979997635, "learning_rate": 0.0001086356781778383, "loss": 0.0129, "step": 2702 }, { "epoch": 1.77, "grad_norm": 0.3261503279209137, "learning_rate": 0.00010853659212581911, "loss": 0.0255, "step": 2703 }, { "epoch": 1.77, "grad_norm": 0.026748869568109512, "learning_rate": 0.00010843752565964337, "loss": 0.0036, "step": 2704 }, { "epoch": 1.77, "grad_norm": 0.13744834065437317, "learning_rate": 0.0001083384788261066, "loss": 0.0323, "step": 2705 }, { "epoch": 1.77, "grad_norm": 0.06895776093006134, "learning_rate": 0.00010823945167199499, "loss": 0.0097, "step": 2706 }, { "epoch": 1.77, "grad_norm": 0.03191859647631645, "learning_rate": 0.00010814044424408552, "loss": 0.0034, "step": 2707 }, { "epoch": 1.77, "grad_norm": 0.0162139143794775, "learning_rate": 0.00010804145658914571, "loss": 0.0022, "step": 2708 }, { "epoch": 1.77, "grad_norm": 0.611072301864624, "learning_rate": 0.00010794248875393385, "loss": 0.0402, "step": 2709 }, { "epoch": 1.77, "grad_norm": 0.008421842940151691, "learning_rate": 0.00010784354078519884, "loss": 0.0007, "step": 2710 }, { "epoch": 1.77, "grad_norm": 0.04360457509756088, "learning_rate": 0.00010774461272968016, "loss": 0.0057, "step": 2711 }, { "epoch": 1.78, "grad_norm": 0.027826182544231415, "learning_rate": 0.00010764570463410802, "loss": 0.0038, "step": 2712 }, { "epoch": 1.78, "grad_norm": 0.08522525429725647, "learning_rate": 0.00010754681654520296, "loss": 0.0105, "step": 2713 }, { "epoch": 1.78, "grad_norm": 0.11638516932725906, "learning_rate": 0.00010744794850967627, "loss": 0.0149, "step": 2714 }, { "epoch": 1.78, "grad_norm": 0.14275485277175903, "learning_rate": 0.0001073491005742297, "loss": 0.0108, "step": 2715 }, { "epoch": 1.78, "grad_norm": 0.22823548316955566, "learning_rate": 0.00010725027278555554, "loss": 0.0384, "step": 2716 }, { "epoch": 1.78, "grad_norm": 0.07148795574903488, "learning_rate": 0.00010715146519033647, "loss": 0.0063, "step": 2717 }, { "epoch": 1.78, "grad_norm": 0.08536515384912491, "learning_rate": 0.00010705267783524574, "loss": 0.013, "step": 2718 }, { "epoch": 1.78, "grad_norm": 0.06670022010803223, "learning_rate": 0.00010695391076694698, "loss": 0.0062, "step": 2719 }, { "epoch": 1.78, "grad_norm": 0.07848575711250305, "learning_rate": 0.00010685516403209426, "loss": 0.0074, "step": 2720 }, { "epoch": 1.78, "grad_norm": 0.07577262818813324, "learning_rate": 0.000106756437677332, "loss": 0.0049, "step": 2721 }, { "epoch": 1.78, "grad_norm": 0.04875582456588745, "learning_rate": 0.00010665773174929507, "loss": 0.0071, "step": 2722 }, { "epoch": 1.78, "grad_norm": 0.021466953679919243, "learning_rate": 0.00010655904629460862, "loss": 0.0028, "step": 2723 }, { "epoch": 1.78, "grad_norm": 0.0066236392594873905, "learning_rate": 0.00010646038135988819, "loss": 0.0008, "step": 2724 }, { "epoch": 1.78, "grad_norm": 0.07818492501974106, "learning_rate": 0.00010636173699173959, "loss": 0.0084, "step": 2725 }, { "epoch": 1.78, "grad_norm": 0.06917870789766312, "learning_rate": 0.0001062631132367589, "loss": 0.0074, "step": 2726 }, { "epoch": 1.79, "grad_norm": 0.20321956276893616, "learning_rate": 0.00010616451014153246, "loss": 0.0191, "step": 2727 }, { "epoch": 1.79, "grad_norm": 0.05821048095822334, "learning_rate": 0.00010606592775263694, "loss": 0.004, "step": 2728 }, { "epoch": 1.79, "grad_norm": 0.07762417942285538, "learning_rate": 0.00010596736611663916, "loss": 0.0028, "step": 2729 }, { "epoch": 1.79, "grad_norm": 0.06247600540518761, "learning_rate": 0.0001058688252800961, "loss": 0.0032, "step": 2730 }, { "epoch": 1.79, "grad_norm": 0.35450640320777893, "learning_rate": 0.00010577030528955497, "loss": 0.0227, "step": 2731 }, { "epoch": 1.79, "grad_norm": 0.1101314052939415, "learning_rate": 0.00010567180619155312, "loss": 0.0064, "step": 2732 }, { "epoch": 1.79, "grad_norm": 0.1981196254491806, "learning_rate": 0.00010557332803261806, "loss": 0.0467, "step": 2733 }, { "epoch": 1.79, "grad_norm": 0.030132168903946877, "learning_rate": 0.00010547487085926732, "loss": 0.0024, "step": 2734 }, { "epoch": 1.79, "grad_norm": 0.14475418627262115, "learning_rate": 0.00010537643471800862, "loss": 0.0316, "step": 2735 }, { "epoch": 1.79, "grad_norm": 0.22851037979125977, "learning_rate": 0.0001052780196553397, "loss": 0.0124, "step": 2736 }, { "epoch": 1.79, "grad_norm": 0.15478867292404175, "learning_rate": 0.00010517962571774832, "loss": 0.0209, "step": 2737 }, { "epoch": 1.79, "grad_norm": 0.1821509748697281, "learning_rate": 0.00010508125295171236, "loss": 0.0229, "step": 2738 }, { "epoch": 1.79, "grad_norm": 0.24094289541244507, "learning_rate": 0.00010498290140369953, "loss": 0.0679, "step": 2739 }, { "epoch": 1.79, "grad_norm": 0.07481525093317032, "learning_rate": 0.00010488457112016765, "loss": 0.0076, "step": 2740 }, { "epoch": 1.79, "grad_norm": 0.8672902584075928, "learning_rate": 0.00010478626214756448, "loss": 0.0177, "step": 2741 }, { "epoch": 1.8, "grad_norm": 0.04198053479194641, "learning_rate": 0.0001046879745323277, "loss": 0.0032, "step": 2742 }, { "epoch": 1.8, "grad_norm": 0.3100494146347046, "learning_rate": 0.00010458970832088484, "loss": 0.0334, "step": 2743 }, { "epoch": 1.8, "grad_norm": 0.12208042293787003, "learning_rate": 0.0001044914635596534, "loss": 0.0125, "step": 2744 }, { "epoch": 1.8, "grad_norm": 0.04906391724944115, "learning_rate": 0.00010439324029504073, "loss": 0.0023, "step": 2745 }, { "epoch": 1.8, "grad_norm": 0.18216626346111298, "learning_rate": 0.00010429503857344403, "loss": 0.0179, "step": 2746 }, { "epoch": 1.8, "grad_norm": 0.04328668490052223, "learning_rate": 0.0001041968584412503, "loss": 0.0042, "step": 2747 }, { "epoch": 1.8, "grad_norm": 0.17808032035827637, "learning_rate": 0.00010409869994483632, "loss": 0.0058, "step": 2748 }, { "epoch": 1.8, "grad_norm": 0.040204476565122604, "learning_rate": 0.00010400056313056873, "loss": 0.0035, "step": 2749 }, { "epoch": 1.8, "grad_norm": 0.08717933297157288, "learning_rate": 0.00010390244804480385, "loss": 0.0096, "step": 2750 }, { "epoch": 1.8, "grad_norm": 0.019179692491889, "learning_rate": 0.0001038043547338878, "loss": 0.001, "step": 2751 }, { "epoch": 1.8, "grad_norm": 0.41493868827819824, "learning_rate": 0.00010370628324415633, "loss": 0.0285, "step": 2752 }, { "epoch": 1.8, "grad_norm": 0.2256990224123001, "learning_rate": 0.00010360823362193495, "loss": 0.0211, "step": 2753 }, { "epoch": 1.8, "grad_norm": 0.05887773633003235, "learning_rate": 0.00010351020591353885, "loss": 0.0064, "step": 2754 }, { "epoch": 1.8, "grad_norm": 0.05249325558543205, "learning_rate": 0.00010341220016527286, "loss": 0.0039, "step": 2755 }, { "epoch": 1.8, "grad_norm": 0.08469627052545547, "learning_rate": 0.00010331421642343138, "loss": 0.0145, "step": 2756 }, { "epoch": 1.8, "grad_norm": 0.6204782128334045, "learning_rate": 0.00010321625473429844, "loss": 0.0185, "step": 2757 }, { "epoch": 1.81, "grad_norm": 0.244975745677948, "learning_rate": 0.00010311831514414769, "loss": 0.05, "step": 2758 }, { "epoch": 1.81, "grad_norm": 0.13598302006721497, "learning_rate": 0.00010302039769924234, "loss": 0.0384, "step": 2759 }, { "epoch": 1.81, "grad_norm": 0.18722222745418549, "learning_rate": 0.00010292250244583512, "loss": 0.0054, "step": 2760 }, { "epoch": 1.81, "grad_norm": 0.43910056352615356, "learning_rate": 0.00010282462943016821, "loss": 0.0335, "step": 2761 }, { "epoch": 1.81, "grad_norm": 0.2426317185163498, "learning_rate": 0.00010272677869847342, "loss": 0.0236, "step": 2762 }, { "epoch": 1.81, "grad_norm": 0.02529718354344368, "learning_rate": 0.00010262895029697194, "loss": 0.0019, "step": 2763 }, { "epoch": 1.81, "grad_norm": 0.14480143785476685, "learning_rate": 0.00010253114427187447, "loss": 0.0158, "step": 2764 }, { "epoch": 1.81, "grad_norm": 0.23900645971298218, "learning_rate": 0.00010243336066938107, "loss": 0.0326, "step": 2765 }, { "epoch": 1.81, "grad_norm": 0.170863538980484, "learning_rate": 0.00010233559953568125, "loss": 0.011, "step": 2766 }, { "epoch": 1.81, "grad_norm": 0.0535564124584198, "learning_rate": 0.00010223786091695387, "loss": 0.005, "step": 2767 }, { "epoch": 1.81, "grad_norm": 0.049874015152454376, "learning_rate": 0.00010214014485936731, "loss": 0.0038, "step": 2768 }, { "epoch": 1.81, "grad_norm": 0.13458359241485596, "learning_rate": 0.000102042451409079, "loss": 0.0272, "step": 2769 }, { "epoch": 1.81, "grad_norm": 0.04044146463274956, "learning_rate": 0.000101944780612236, "loss": 0.0043, "step": 2770 }, { "epoch": 1.81, "grad_norm": 0.0071864319033920765, "learning_rate": 0.00010184713251497443, "loss": 0.0009, "step": 2771 }, { "epoch": 1.81, "grad_norm": 0.2800210416316986, "learning_rate": 0.00010174950716341988, "loss": 0.023, "step": 2772 }, { "epoch": 1.82, "grad_norm": 0.0530213862657547, "learning_rate": 0.00010165190460368709, "loss": 0.0054, "step": 2773 }, { "epoch": 1.82, "grad_norm": 0.12830093502998352, "learning_rate": 0.00010155432488187995, "loss": 0.0063, "step": 2774 }, { "epoch": 1.82, "grad_norm": 0.09907712042331696, "learning_rate": 0.00010145676804409176, "loss": 0.0156, "step": 2775 }, { "epoch": 1.82, "grad_norm": 0.0601109117269516, "learning_rate": 0.00010135923413640487, "loss": 0.007, "step": 2776 }, { "epoch": 1.82, "grad_norm": 0.08233000338077545, "learning_rate": 0.00010126172320489088, "loss": 0.0072, "step": 2777 }, { "epoch": 1.82, "grad_norm": 0.09767752140760422, "learning_rate": 0.00010116423529561042, "loss": 0.0191, "step": 2778 }, { "epoch": 1.82, "grad_norm": 0.15519340336322784, "learning_rate": 0.0001010667704546134, "loss": 0.0431, "step": 2779 }, { "epoch": 1.82, "grad_norm": 0.17334707081317902, "learning_rate": 0.0001009693287279387, "loss": 0.0134, "step": 2780 }, { "epoch": 1.82, "grad_norm": 0.281533420085907, "learning_rate": 0.00010087191016161439, "loss": 0.0136, "step": 2781 }, { "epoch": 1.82, "grad_norm": 0.03143606334924698, "learning_rate": 0.00010077451480165747, "loss": 0.0032, "step": 2782 }, { "epoch": 1.82, "grad_norm": 0.16341941058635712, "learning_rate": 0.0001006771426940741, "loss": 0.0121, "step": 2783 }, { "epoch": 1.82, "grad_norm": 0.14227576553821564, "learning_rate": 0.00010057979388485942, "loss": 0.0096, "step": 2784 }, { "epoch": 1.82, "grad_norm": 0.03395267575979233, "learning_rate": 0.00010048246841999754, "loss": 0.0039, "step": 2785 }, { "epoch": 1.82, "grad_norm": 0.12993645668029785, "learning_rate": 0.0001003851663454616, "loss": 0.0057, "step": 2786 }, { "epoch": 1.82, "grad_norm": 0.21049480140209198, "learning_rate": 0.00010028788770721356, "loss": 0.0176, "step": 2787 }, { "epoch": 1.83, "grad_norm": 0.1201065331697464, "learning_rate": 0.00010019063255120446, "loss": 0.0609, "step": 2788 }, { "epoch": 1.83, "grad_norm": 0.08682208508253098, "learning_rate": 0.00010009340092337416, "loss": 0.0055, "step": 2789 }, { "epoch": 1.83, "grad_norm": 0.07561865448951721, "learning_rate": 9.999619286965149e-05, "loss": 0.0048, "step": 2790 }, { "epoch": 1.83, "grad_norm": 0.10334252566099167, "learning_rate": 9.989900843595403e-05, "loss": 0.005, "step": 2791 }, { "epoch": 1.83, "grad_norm": 0.0631050243973732, "learning_rate": 9.980184766818828e-05, "loss": 0.0058, "step": 2792 }, { "epoch": 1.83, "grad_norm": 0.43190333247184753, "learning_rate": 9.970471061224951e-05, "loss": 0.0583, "step": 2793 }, { "epoch": 1.83, "grad_norm": 0.03992627188563347, "learning_rate": 9.960759731402189e-05, "loss": 0.0027, "step": 2794 }, { "epoch": 1.83, "grad_norm": 0.08371511101722717, "learning_rate": 9.951050781937822e-05, "loss": 0.003, "step": 2795 }, { "epoch": 1.83, "grad_norm": 0.19122813642024994, "learning_rate": 9.941344217418017e-05, "loss": 0.0112, "step": 2796 }, { "epoch": 1.83, "grad_norm": 0.019885288551449776, "learning_rate": 9.931640042427812e-05, "loss": 0.0014, "step": 2797 }, { "epoch": 1.83, "grad_norm": 0.025179168209433556, "learning_rate": 9.921938261551113e-05, "loss": 0.0028, "step": 2798 }, { "epoch": 1.83, "grad_norm": 0.3792370855808258, "learning_rate": 9.912238879370703e-05, "loss": 0.032, "step": 2799 }, { "epoch": 1.83, "grad_norm": 0.37182220816612244, "learning_rate": 9.902541900468216e-05, "loss": 0.0227, "step": 2800 }, { "epoch": 1.83, "grad_norm": 0.17105183005332947, "learning_rate": 9.892847329424169e-05, "loss": 0.0373, "step": 2801 }, { "epoch": 1.83, "grad_norm": 0.09459386765956879, "learning_rate": 9.88315517081793e-05, "loss": 0.0098, "step": 2802 }, { "epoch": 1.84, "grad_norm": 0.21665576100349426, "learning_rate": 9.873465429227735e-05, "loss": 0.0139, "step": 2803 }, { "epoch": 1.84, "grad_norm": 0.10254119336605072, "learning_rate": 9.86377810923067e-05, "loss": 0.0019, "step": 2804 }, { "epoch": 1.84, "grad_norm": 0.25252997875213623, "learning_rate": 9.854093215402683e-05, "loss": 0.0316, "step": 2805 }, { "epoch": 1.84, "grad_norm": 0.21150769293308258, "learning_rate": 9.844410752318572e-05, "loss": 0.0033, "step": 2806 }, { "epoch": 1.84, "grad_norm": 0.2727220356464386, "learning_rate": 9.834730724551992e-05, "loss": 0.0261, "step": 2807 }, { "epoch": 1.84, "grad_norm": 0.08808046579360962, "learning_rate": 9.825053136675442e-05, "loss": 0.0083, "step": 2808 }, { "epoch": 1.84, "grad_norm": 0.11724215745925903, "learning_rate": 9.815377993260279e-05, "loss": 0.0079, "step": 2809 }, { "epoch": 1.84, "grad_norm": 0.03124573826789856, "learning_rate": 9.805705298876687e-05, "loss": 0.0034, "step": 2810 }, { "epoch": 1.84, "grad_norm": 0.32333555817604065, "learning_rate": 9.796035058093711e-05, "loss": 0.0185, "step": 2811 }, { "epoch": 1.84, "grad_norm": 0.03145265206694603, "learning_rate": 9.786367275479224e-05, "loss": 0.0028, "step": 2812 }, { "epoch": 1.84, "grad_norm": 0.18941310048103333, "learning_rate": 9.776701955599952e-05, "loss": 0.0182, "step": 2813 }, { "epoch": 1.84, "grad_norm": 0.0899311974644661, "learning_rate": 9.767039103021444e-05, "loss": 0.0083, "step": 2814 }, { "epoch": 1.84, "grad_norm": 0.1362254023551941, "learning_rate": 9.757378722308088e-05, "loss": 0.0144, "step": 2815 }, { "epoch": 1.84, "grad_norm": 0.026598049327731133, "learning_rate": 9.747720818023109e-05, "loss": 0.0017, "step": 2816 }, { "epoch": 1.84, "grad_norm": 0.15683645009994507, "learning_rate": 9.738065394728553e-05, "loss": 0.0044, "step": 2817 }, { "epoch": 1.84, "grad_norm": 0.17370571196079254, "learning_rate": 9.728412456985308e-05, "loss": 0.0143, "step": 2818 }, { "epoch": 1.85, "grad_norm": 0.2154899388551712, "learning_rate": 9.71876200935307e-05, "loss": 0.023, "step": 2819 }, { "epoch": 1.85, "grad_norm": 0.3872066140174866, "learning_rate": 9.709114056390375e-05, "loss": 0.0626, "step": 2820 }, { "epoch": 1.85, "grad_norm": 0.09413719922304153, "learning_rate": 9.69946860265457e-05, "loss": 0.0137, "step": 2821 }, { "epoch": 1.85, "grad_norm": 0.115718774497509, "learning_rate": 9.689825652701829e-05, "loss": 0.01, "step": 2822 }, { "epoch": 1.85, "grad_norm": 0.02686137706041336, "learning_rate": 9.680185211087136e-05, "loss": 0.0015, "step": 2823 }, { "epoch": 1.85, "grad_norm": 0.08938975632190704, "learning_rate": 9.670547282364294e-05, "loss": 0.0533, "step": 2824 }, { "epoch": 1.85, "grad_norm": 0.18357262015342712, "learning_rate": 9.660911871085917e-05, "loss": 0.043, "step": 2825 }, { "epoch": 1.85, "grad_norm": 0.23003168404102325, "learning_rate": 9.651278981803441e-05, "loss": 0.0273, "step": 2826 }, { "epoch": 1.85, "grad_norm": 0.035942334681749344, "learning_rate": 9.641648619067093e-05, "loss": 0.0043, "step": 2827 }, { "epoch": 1.85, "grad_norm": 0.19699084758758545, "learning_rate": 9.632020787425915e-05, "loss": 0.0378, "step": 2828 }, { "epoch": 1.85, "grad_norm": 0.06653069704771042, "learning_rate": 9.622395491427755e-05, "loss": 0.0044, "step": 2829 }, { "epoch": 1.85, "grad_norm": 0.004854390397667885, "learning_rate": 9.612772735619262e-05, "loss": 0.0006, "step": 2830 }, { "epoch": 1.85, "grad_norm": 0.012486539781093597, "learning_rate": 9.603152524545884e-05, "loss": 0.0018, "step": 2831 }, { "epoch": 1.85, "grad_norm": 0.022599024698138237, "learning_rate": 9.593534862751867e-05, "loss": 0.0027, "step": 2832 }, { "epoch": 1.85, "grad_norm": 0.051252953708171844, "learning_rate": 9.583919754780254e-05, "loss": 0.0064, "step": 2833 }, { "epoch": 1.86, "grad_norm": 0.22247406840324402, "learning_rate": 9.574307205172881e-05, "loss": 0.0255, "step": 2834 }, { "epoch": 1.86, "grad_norm": 0.07303927838802338, "learning_rate": 9.564697218470372e-05, "loss": 0.0053, "step": 2835 }, { "epoch": 1.86, "grad_norm": 0.00530305877327919, "learning_rate": 9.555089799212156e-05, "loss": 0.0004, "step": 2836 }, { "epoch": 1.86, "grad_norm": 0.14532607793807983, "learning_rate": 9.545484951936422e-05, "loss": 0.0083, "step": 2837 }, { "epoch": 1.86, "grad_norm": 0.02097056619822979, "learning_rate": 9.535882681180166e-05, "loss": 0.0024, "step": 2838 }, { "epoch": 1.86, "grad_norm": 0.11118460446596146, "learning_rate": 9.526282991479159e-05, "loss": 0.0083, "step": 2839 }, { "epoch": 1.86, "grad_norm": 0.24298857152462006, "learning_rate": 9.516685887367959e-05, "loss": 0.0391, "step": 2840 }, { "epoch": 1.86, "grad_norm": 0.13576066493988037, "learning_rate": 9.50709137337989e-05, "loss": 0.0078, "step": 2841 }, { "epoch": 1.86, "grad_norm": 0.3387295603752136, "learning_rate": 9.497499454047065e-05, "loss": 0.0191, "step": 2842 }, { "epoch": 1.86, "grad_norm": 0.10902759432792664, "learning_rate": 9.487910133900365e-05, "loss": 0.0077, "step": 2843 }, { "epoch": 1.86, "grad_norm": 0.22465559840202332, "learning_rate": 9.478323417469446e-05, "loss": 0.0149, "step": 2844 }, { "epoch": 1.86, "grad_norm": 0.18790677189826965, "learning_rate": 9.468739309282733e-05, "loss": 0.01, "step": 2845 }, { "epoch": 1.86, "grad_norm": 0.36567965149879456, "learning_rate": 9.459157813867414e-05, "loss": 0.0513, "step": 2846 }, { "epoch": 1.86, "grad_norm": 0.03895680233836174, "learning_rate": 9.449578935749451e-05, "loss": 0.004, "step": 2847 }, { "epoch": 1.86, "grad_norm": 0.11043170839548111, "learning_rate": 9.44000267945357e-05, "loss": 0.0085, "step": 2848 }, { "epoch": 1.87, "grad_norm": 0.39684051275253296, "learning_rate": 9.430429049503253e-05, "loss": 0.0441, "step": 2849 }, { "epoch": 1.87, "grad_norm": 0.11447808891534805, "learning_rate": 9.420858050420737e-05, "loss": 0.0201, "step": 2850 }, { "epoch": 1.87, "grad_norm": 0.10845249891281128, "learning_rate": 9.411289686727029e-05, "loss": 0.0307, "step": 2851 }, { "epoch": 1.87, "grad_norm": 0.1653800904750824, "learning_rate": 9.401723962941885e-05, "loss": 0.0361, "step": 2852 }, { "epoch": 1.87, "grad_norm": 0.18911443650722504, "learning_rate": 9.392160883583812e-05, "loss": 0.0076, "step": 2853 }, { "epoch": 1.87, "grad_norm": 0.026294823735952377, "learning_rate": 9.382600453170068e-05, "loss": 0.0017, "step": 2854 }, { "epoch": 1.87, "grad_norm": 0.17699147760868073, "learning_rate": 9.373042676216662e-05, "loss": 0.0195, "step": 2855 }, { "epoch": 1.87, "grad_norm": 0.1367698609828949, "learning_rate": 9.36348755723835e-05, "loss": 0.0111, "step": 2856 }, { "epoch": 1.87, "grad_norm": 0.5007506608963013, "learning_rate": 9.353935100748631e-05, "loss": 0.0202, "step": 2857 }, { "epoch": 1.87, "grad_norm": 0.0346270427107811, "learning_rate": 9.344385311259747e-05, "loss": 0.0025, "step": 2858 }, { "epoch": 1.87, "grad_norm": 0.1084810346364975, "learning_rate": 9.334838193282678e-05, "loss": 0.0094, "step": 2859 }, { "epoch": 1.87, "grad_norm": 0.04106857255101204, "learning_rate": 9.325293751327148e-05, "loss": 0.0025, "step": 2860 }, { "epoch": 1.87, "grad_norm": 0.11952603608369827, "learning_rate": 9.315751989901608e-05, "loss": 0.0082, "step": 2861 }, { "epoch": 1.87, "grad_norm": 0.1006799265742302, "learning_rate": 9.306212913513253e-05, "loss": 0.0173, "step": 2862 }, { "epoch": 1.87, "grad_norm": 0.036590754985809326, "learning_rate": 9.296676526668e-05, "loss": 0.0037, "step": 2863 }, { "epoch": 1.87, "grad_norm": 0.06457981467247009, "learning_rate": 9.2871428338705e-05, "loss": 0.0081, "step": 2864 }, { "epoch": 1.88, "grad_norm": 0.21057863533496857, "learning_rate": 9.277611839624132e-05, "loss": 0.0569, "step": 2865 }, { "epoch": 1.88, "grad_norm": 0.1334536373615265, "learning_rate": 9.268083548431005e-05, "loss": 0.0139, "step": 2866 }, { "epoch": 1.88, "grad_norm": 0.07306934148073196, "learning_rate": 9.258557964791938e-05, "loss": 0.0121, "step": 2867 }, { "epoch": 1.88, "grad_norm": 0.010792912915349007, "learning_rate": 9.249035093206484e-05, "loss": 0.0015, "step": 2868 }, { "epoch": 1.88, "grad_norm": 0.05428679287433624, "learning_rate": 9.239514938172906e-05, "loss": 0.0135, "step": 2869 }, { "epoch": 1.88, "grad_norm": 0.05001017451286316, "learning_rate": 9.229997504188193e-05, "loss": 0.0034, "step": 2870 }, { "epoch": 1.88, "grad_norm": 0.06393374502658844, "learning_rate": 9.220482795748037e-05, "loss": 0.0049, "step": 2871 }, { "epoch": 1.88, "grad_norm": 0.026336563751101494, "learning_rate": 9.210970817346854e-05, "loss": 0.0032, "step": 2872 }, { "epoch": 1.88, "grad_norm": 0.19261330366134644, "learning_rate": 9.201461573477761e-05, "loss": 0.0205, "step": 2873 }, { "epoch": 1.88, "grad_norm": 0.037117138504981995, "learning_rate": 9.19195506863259e-05, "loss": 0.0029, "step": 2874 }, { "epoch": 1.88, "grad_norm": 0.14101198315620422, "learning_rate": 9.18245130730188e-05, "loss": 0.0143, "step": 2875 }, { "epoch": 1.88, "grad_norm": 0.060865968465805054, "learning_rate": 9.172950293974863e-05, "loss": 0.0032, "step": 2876 }, { "epoch": 1.88, "grad_norm": 0.05794965475797653, "learning_rate": 9.163452033139487e-05, "loss": 0.0045, "step": 2877 }, { "epoch": 1.88, "grad_norm": 0.040618691593408585, "learning_rate": 9.153956529282391e-05, "loss": 0.0046, "step": 2878 }, { "epoch": 1.88, "grad_norm": 0.14414070546627045, "learning_rate": 9.144463786888918e-05, "loss": 0.0118, "step": 2879 }, { "epoch": 1.89, "grad_norm": 0.26047050952911377, "learning_rate": 9.134973810443096e-05, "loss": 0.0403, "step": 2880 }, { "epoch": 1.89, "grad_norm": 0.45467132329940796, "learning_rate": 9.125486604427658e-05, "loss": 0.0289, "step": 2881 }, { "epoch": 1.89, "grad_norm": 0.11333033442497253, "learning_rate": 9.116002173324025e-05, "loss": 0.0413, "step": 2882 }, { "epoch": 1.89, "grad_norm": 0.033728063106536865, "learning_rate": 9.106520521612305e-05, "loss": 0.002, "step": 2883 }, { "epoch": 1.89, "grad_norm": 0.04400965943932533, "learning_rate": 9.097041653771288e-05, "loss": 0.0048, "step": 2884 }, { "epoch": 1.89, "grad_norm": 0.40565457940101624, "learning_rate": 9.087565574278462e-05, "loss": 0.0335, "step": 2885 }, { "epoch": 1.89, "grad_norm": 0.012257483787834644, "learning_rate": 9.078092287609989e-05, "loss": 0.0016, "step": 2886 }, { "epoch": 1.89, "grad_norm": 0.17588816583156586, "learning_rate": 9.068621798240713e-05, "loss": 0.012, "step": 2887 }, { "epoch": 1.89, "grad_norm": 0.04336775094270706, "learning_rate": 9.05915411064416e-05, "loss": 0.0031, "step": 2888 }, { "epoch": 1.89, "grad_norm": 0.18482789397239685, "learning_rate": 9.049689229292524e-05, "loss": 0.0076, "step": 2889 }, { "epoch": 1.89, "grad_norm": 0.30984288454055786, "learning_rate": 9.040227158656684e-05, "loss": 0.025, "step": 2890 }, { "epoch": 1.89, "grad_norm": 0.0787012130022049, "learning_rate": 9.030767903206186e-05, "loss": 0.0085, "step": 2891 }, { "epoch": 1.89, "grad_norm": 0.1350172907114029, "learning_rate": 9.021311467409249e-05, "loss": 0.0283, "step": 2892 }, { "epoch": 1.89, "grad_norm": 0.25070253014564514, "learning_rate": 9.011857855732753e-05, "loss": 0.0307, "step": 2893 }, { "epoch": 1.89, "grad_norm": 0.06961613148450851, "learning_rate": 9.00240707264225e-05, "loss": 0.0046, "step": 2894 }, { "epoch": 1.9, "grad_norm": 0.6530323624610901, "learning_rate": 8.992959122601957e-05, "loss": 0.0236, "step": 2895 }, { "epoch": 1.9, "grad_norm": 0.06180819123983383, "learning_rate": 8.983514010074749e-05, "loss": 0.0042, "step": 2896 }, { "epoch": 1.9, "grad_norm": 0.05041724815964699, "learning_rate": 8.974071739522164e-05, "loss": 0.0071, "step": 2897 }, { "epoch": 1.9, "grad_norm": 0.12667830288410187, "learning_rate": 8.964632315404394e-05, "loss": 0.0079, "step": 2898 }, { "epoch": 1.9, "grad_norm": 0.01682797633111477, "learning_rate": 8.955195742180289e-05, "loss": 0.0015, "step": 2899 }, { "epoch": 1.9, "grad_norm": 0.06474439054727554, "learning_rate": 8.94576202430735e-05, "loss": 0.0038, "step": 2900 }, { "epoch": 1.9, "grad_norm": 0.20585475862026215, "learning_rate": 8.936331166241734e-05, "loss": 0.0347, "step": 2901 }, { "epoch": 1.9, "grad_norm": 0.10581755638122559, "learning_rate": 8.92690317243824e-05, "loss": 0.0058, "step": 2902 }, { "epoch": 1.9, "grad_norm": 0.10786069929599762, "learning_rate": 8.917478047350322e-05, "loss": 0.0047, "step": 2903 }, { "epoch": 1.9, "grad_norm": 0.19840127229690552, "learning_rate": 8.90805579543007e-05, "loss": 0.0225, "step": 2904 }, { "epoch": 1.9, "grad_norm": 0.11379020661115646, "learning_rate": 8.898636421128231e-05, "loss": 0.0165, "step": 2905 }, { "epoch": 1.9, "grad_norm": 0.12862448394298553, "learning_rate": 8.889219928894173e-05, "loss": 0.0115, "step": 2906 }, { "epoch": 1.9, "grad_norm": 0.011147045530378819, "learning_rate": 8.879806323175916e-05, "loss": 0.0012, "step": 2907 }, { "epoch": 1.9, "grad_norm": 0.043882615864276886, "learning_rate": 8.870395608420113e-05, "loss": 0.0055, "step": 2908 }, { "epoch": 1.9, "grad_norm": 0.24039480090141296, "learning_rate": 8.860987789072053e-05, "loss": 0.0272, "step": 2909 }, { "epoch": 1.91, "grad_norm": 0.21488645672798157, "learning_rate": 8.851582869575659e-05, "loss": 0.0092, "step": 2910 }, { "epoch": 1.91, "grad_norm": 0.009159781038761139, "learning_rate": 8.842180854373479e-05, "loss": 0.0008, "step": 2911 }, { "epoch": 1.91, "grad_norm": 0.19630037248134613, "learning_rate": 8.832781747906687e-05, "loss": 0.0262, "step": 2912 }, { "epoch": 1.91, "grad_norm": 0.1669941395521164, "learning_rate": 8.823385554615094e-05, "loss": 0.009, "step": 2913 }, { "epoch": 1.91, "grad_norm": 0.057654477655887604, "learning_rate": 8.813992278937129e-05, "loss": 0.0028, "step": 2914 }, { "epoch": 1.91, "grad_norm": 0.019775306805968285, "learning_rate": 8.804601925309837e-05, "loss": 0.0021, "step": 2915 }, { "epoch": 1.91, "grad_norm": 0.2010907083749771, "learning_rate": 8.795214498168895e-05, "loss": 0.0347, "step": 2916 }, { "epoch": 1.91, "grad_norm": 0.3659219741821289, "learning_rate": 8.785830001948583e-05, "loss": 0.0353, "step": 2917 }, { "epoch": 1.91, "grad_norm": 0.02452949434518814, "learning_rate": 8.776448441081807e-05, "loss": 0.001, "step": 2918 }, { "epoch": 1.91, "grad_norm": 0.038888439536094666, "learning_rate": 8.767069820000086e-05, "loss": 0.0025, "step": 2919 }, { "epoch": 1.91, "grad_norm": 0.08751122653484344, "learning_rate": 8.75769414313355e-05, "loss": 0.0062, "step": 2920 }, { "epoch": 1.91, "grad_norm": 0.012029001489281654, "learning_rate": 8.748321414910928e-05, "loss": 0.0009, "step": 2921 }, { "epoch": 1.91, "grad_norm": 0.036629196256399155, "learning_rate": 8.73895163975957e-05, "loss": 0.0024, "step": 2922 }, { "epoch": 1.91, "grad_norm": 0.027340730652213097, "learning_rate": 8.729584822105425e-05, "loss": 0.0011, "step": 2923 }, { "epoch": 1.91, "grad_norm": 0.08335358649492264, "learning_rate": 8.720220966373044e-05, "loss": 0.0326, "step": 2924 }, { "epoch": 1.91, "grad_norm": 0.32940107583999634, "learning_rate": 8.710860076985583e-05, "loss": 0.0256, "step": 2925 }, { "epoch": 1.92, "grad_norm": 0.054443515837192535, "learning_rate": 8.701502158364792e-05, "loss": 0.0041, "step": 2926 }, { "epoch": 1.92, "grad_norm": 0.0999118834733963, "learning_rate": 8.692147214931027e-05, "loss": 0.0111, "step": 2927 }, { "epoch": 1.92, "grad_norm": 0.3169289827346802, "learning_rate": 8.682795251103218e-05, "loss": 0.0284, "step": 2928 }, { "epoch": 1.92, "grad_norm": 0.009980392642319202, "learning_rate": 8.673446271298909e-05, "loss": 0.001, "step": 2929 }, { "epoch": 1.92, "grad_norm": 0.3076903223991394, "learning_rate": 8.664100279934227e-05, "loss": 0.0184, "step": 2930 }, { "epoch": 1.92, "grad_norm": 0.015774663537740707, "learning_rate": 8.654757281423884e-05, "loss": 0.0014, "step": 2931 }, { "epoch": 1.92, "grad_norm": 0.37929320335388184, "learning_rate": 8.645417280181184e-05, "loss": 0.0304, "step": 2932 }, { "epoch": 1.92, "grad_norm": 0.08329842984676361, "learning_rate": 8.63608028061801e-05, "loss": 0.0047, "step": 2933 }, { "epoch": 1.92, "grad_norm": 0.21046321094036102, "learning_rate": 8.62674628714483e-05, "loss": 0.0171, "step": 2934 }, { "epoch": 1.92, "grad_norm": 0.01876913383603096, "learning_rate": 8.6174153041707e-05, "loss": 0.0014, "step": 2935 }, { "epoch": 1.92, "grad_norm": 0.016124719753861427, "learning_rate": 8.60808733610323e-05, "loss": 0.0015, "step": 2936 }, { "epoch": 1.92, "grad_norm": 0.011233742348849773, "learning_rate": 8.59876238734863e-05, "loss": 0.0009, "step": 2937 }, { "epoch": 1.92, "grad_norm": 0.2208932489156723, "learning_rate": 8.589440462311675e-05, "loss": 0.0703, "step": 2938 }, { "epoch": 1.92, "grad_norm": 0.38058269023895264, "learning_rate": 8.58012156539571e-05, "loss": 0.0206, "step": 2939 }, { "epoch": 1.92, "grad_norm": 0.34829381108283997, "learning_rate": 8.570805701002651e-05, "loss": 0.0258, "step": 2940 }, { "epoch": 1.93, "grad_norm": 0.27227723598480225, "learning_rate": 8.561492873532986e-05, "loss": 0.0065, "step": 2941 }, { "epoch": 1.93, "grad_norm": 0.1077399030327797, "learning_rate": 8.552183087385759e-05, "loss": 0.0214, "step": 2942 }, { "epoch": 1.93, "grad_norm": 0.06788244843482971, "learning_rate": 8.542876346958589e-05, "loss": 0.0059, "step": 2943 }, { "epoch": 1.93, "grad_norm": 0.16576650738716125, "learning_rate": 8.533572656647648e-05, "loss": 0.0492, "step": 2944 }, { "epoch": 1.93, "grad_norm": 0.03128691017627716, "learning_rate": 8.524272020847665e-05, "loss": 0.0032, "step": 2945 }, { "epoch": 1.93, "grad_norm": 0.2140830159187317, "learning_rate": 8.514974443951933e-05, "loss": 0.0577, "step": 2946 }, { "epoch": 1.93, "grad_norm": 0.09814707189798355, "learning_rate": 8.505679930352298e-05, "loss": 0.0115, "step": 2947 }, { "epoch": 1.93, "grad_norm": 0.2515917420387268, "learning_rate": 8.496388484439158e-05, "loss": 0.0142, "step": 2948 }, { "epoch": 1.93, "grad_norm": 0.3558675944805145, "learning_rate": 8.487100110601466e-05, "loss": 0.0135, "step": 2949 }, { "epoch": 1.93, "grad_norm": 0.05048364773392677, "learning_rate": 8.477814813226715e-05, "loss": 0.0031, "step": 2950 }, { "epoch": 1.93, "grad_norm": 0.09671253710985184, "learning_rate": 8.468532596700955e-05, "loss": 0.0097, "step": 2951 }, { "epoch": 1.93, "grad_norm": 0.10951755940914154, "learning_rate": 8.459253465408772e-05, "loss": 0.0419, "step": 2952 }, { "epoch": 1.93, "grad_norm": 0.1641848385334015, "learning_rate": 8.449977423733308e-05, "loss": 0.0148, "step": 2953 }, { "epoch": 1.93, "grad_norm": 0.1038142666220665, "learning_rate": 8.440704476056221e-05, "loss": 0.0071, "step": 2954 }, { "epoch": 1.93, "grad_norm": 0.08310937136411667, "learning_rate": 8.431434626757731e-05, "loss": 0.0112, "step": 2955 }, { "epoch": 1.94, "grad_norm": 0.05308748781681061, "learning_rate": 8.422167880216586e-05, "loss": 0.0077, "step": 2956 }, { "epoch": 1.94, "grad_norm": 0.15207041800022125, "learning_rate": 8.412904240810068e-05, "loss": 0.0143, "step": 2957 }, { "epoch": 1.94, "grad_norm": 0.03646821156144142, "learning_rate": 8.403643712913989e-05, "loss": 0.0052, "step": 2958 }, { "epoch": 1.94, "grad_norm": 0.06434578448534012, "learning_rate": 8.394386300902699e-05, "loss": 0.008, "step": 2959 }, { "epoch": 1.94, "grad_norm": 0.0646260604262352, "learning_rate": 8.385132009149067e-05, "loss": 0.0072, "step": 2960 }, { "epoch": 1.94, "grad_norm": 0.13893379271030426, "learning_rate": 8.375880842024494e-05, "loss": 0.0105, "step": 2961 }, { "epoch": 1.94, "grad_norm": 0.1638610064983368, "learning_rate": 8.36663280389891e-05, "loss": 0.0162, "step": 2962 }, { "epoch": 1.94, "grad_norm": 0.018266642466187477, "learning_rate": 8.357387899140747e-05, "loss": 0.003, "step": 2963 }, { "epoch": 1.94, "grad_norm": 0.11912977695465088, "learning_rate": 8.348146132116976e-05, "loss": 0.0104, "step": 2964 }, { "epoch": 1.94, "grad_norm": 0.04382968321442604, "learning_rate": 8.338907507193083e-05, "loss": 0.0033, "step": 2965 }, { "epoch": 1.94, "grad_norm": 0.17198912799358368, "learning_rate": 8.329672028733062e-05, "loss": 0.0093, "step": 2966 }, { "epoch": 1.94, "grad_norm": 0.022629285231232643, "learning_rate": 8.320439701099428e-05, "loss": 0.002, "step": 2967 }, { "epoch": 1.94, "grad_norm": 0.21153992414474487, "learning_rate": 8.311210528653204e-05, "loss": 0.0378, "step": 2968 }, { "epoch": 1.94, "grad_norm": 0.029377898201346397, "learning_rate": 8.301984515753928e-05, "loss": 0.0031, "step": 2969 }, { "epoch": 1.94, "grad_norm": 0.11475684493780136, "learning_rate": 8.292761666759642e-05, "loss": 0.0052, "step": 2970 }, { "epoch": 1.95, "grad_norm": 0.3934403657913208, "learning_rate": 8.283541986026881e-05, "loss": 0.0528, "step": 2971 }, { "epoch": 1.95, "grad_norm": 0.07621371001005173, "learning_rate": 8.274325477910708e-05, "loss": 0.0059, "step": 2972 }, { "epoch": 1.95, "grad_norm": 0.035385921597480774, "learning_rate": 8.265112146764667e-05, "loss": 0.0026, "step": 2973 }, { "epoch": 1.95, "grad_norm": 0.0529436431825161, "learning_rate": 8.255901996940809e-05, "loss": 0.008, "step": 2974 }, { "epoch": 1.95, "grad_norm": 0.02031162567436695, "learning_rate": 8.246695032789688e-05, "loss": 0.0021, "step": 2975 }, { "epoch": 1.95, "grad_norm": 0.10501090437173843, "learning_rate": 8.237491258660342e-05, "loss": 0.0088, "step": 2976 }, { "epoch": 1.95, "grad_norm": 0.18462517857551575, "learning_rate": 8.228290678900312e-05, "loss": 0.018, "step": 2977 }, { "epoch": 1.95, "grad_norm": 0.18451613187789917, "learning_rate": 8.219093297855623e-05, "loss": 0.0488, "step": 2978 }, { "epoch": 1.95, "grad_norm": 0.41322609782218933, "learning_rate": 8.209899119870798e-05, "loss": 0.0349, "step": 2979 }, { "epoch": 1.95, "grad_norm": 0.034096457064151764, "learning_rate": 8.200708149288827e-05, "loss": 0.0036, "step": 2980 }, { "epoch": 1.95, "grad_norm": 0.10774432122707367, "learning_rate": 8.191520390451207e-05, "loss": 0.0373, "step": 2981 }, { "epoch": 1.95, "grad_norm": 0.0551498681306839, "learning_rate": 8.182335847697909e-05, "loss": 0.0025, "step": 2982 }, { "epoch": 1.95, "grad_norm": 0.09337715804576874, "learning_rate": 8.173154525367383e-05, "loss": 0.0059, "step": 2983 }, { "epoch": 1.95, "grad_norm": 0.16637249290943146, "learning_rate": 8.163976427796563e-05, "loss": 0.0079, "step": 2984 }, { "epoch": 1.95, "grad_norm": 0.035759832710027695, "learning_rate": 8.154801559320857e-05, "loss": 0.0045, "step": 2985 }, { "epoch": 1.95, "grad_norm": 0.0748455747961998, "learning_rate": 8.145629924274144e-05, "loss": 0.0036, "step": 2986 }, { "epoch": 1.96, "grad_norm": 0.378963828086853, "learning_rate": 8.136461526988783e-05, "loss": 0.0885, "step": 2987 }, { "epoch": 1.96, "grad_norm": 0.01577618159353733, "learning_rate": 8.127296371795605e-05, "loss": 0.0023, "step": 2988 }, { "epoch": 1.96, "grad_norm": 0.02859189733862877, "learning_rate": 8.118134463023889e-05, "loss": 0.0031, "step": 2989 }, { "epoch": 1.96, "grad_norm": 0.16388894617557526, "learning_rate": 8.108975805001406e-05, "loss": 0.0342, "step": 2990 }, { "epoch": 1.96, "grad_norm": 0.07618826627731323, "learning_rate": 8.099820402054377e-05, "loss": 0.0068, "step": 2991 }, { "epoch": 1.96, "grad_norm": 0.08716662973165512, "learning_rate": 8.090668258507494e-05, "loss": 0.0073, "step": 2992 }, { "epoch": 1.96, "grad_norm": 0.08639045059680939, "learning_rate": 8.081519378683904e-05, "loss": 0.0294, "step": 2993 }, { "epoch": 1.96, "grad_norm": 0.2602868974208832, "learning_rate": 8.072373766905212e-05, "loss": 0.0282, "step": 2994 }, { "epoch": 1.96, "grad_norm": 0.08201611787080765, "learning_rate": 8.06323142749148e-05, "loss": 0.0344, "step": 2995 }, { "epoch": 1.96, "grad_norm": 0.1925457864999771, "learning_rate": 8.054092364761234e-05, "loss": 0.0085, "step": 2996 }, { "epoch": 1.96, "grad_norm": 0.6533692479133606, "learning_rate": 8.044956583031429e-05, "loss": 0.1054, "step": 2997 }, { "epoch": 1.96, "grad_norm": 0.048638634383678436, "learning_rate": 8.03582408661749e-05, "loss": 0.0066, "step": 2998 }, { "epoch": 1.96, "grad_norm": 0.11080126464366913, "learning_rate": 8.026694879833285e-05, "loss": 0.0096, "step": 2999 }, { "epoch": 1.96, "grad_norm": 0.07887112349271774, "learning_rate": 8.017568966991129e-05, "loss": 0.0216, "step": 3000 }, { "epoch": 1.96, "grad_norm": 0.04135850816965103, "learning_rate": 8.008446352401777e-05, "loss": 0.0031, "step": 3001 }, { "epoch": 1.97, "grad_norm": 0.11523578315973282, "learning_rate": 7.99932704037443e-05, "loss": 0.0102, "step": 3002 }, { "epoch": 1.97, "grad_norm": 0.12281786650419235, "learning_rate": 7.990211035216727e-05, "loss": 0.0338, "step": 3003 }, { "epoch": 1.97, "grad_norm": 0.07289214432239532, "learning_rate": 7.981098341234747e-05, "loss": 0.0048, "step": 3004 }, { "epoch": 1.97, "grad_norm": 0.13399043679237366, "learning_rate": 7.971988962733007e-05, "loss": 0.0147, "step": 3005 }, { "epoch": 1.97, "grad_norm": 0.10570705682039261, "learning_rate": 7.962882904014447e-05, "loss": 0.0104, "step": 3006 }, { "epoch": 1.97, "grad_norm": 0.02668238803744316, "learning_rate": 7.953780169380452e-05, "loss": 0.0035, "step": 3007 }, { "epoch": 1.97, "grad_norm": 0.23536016047000885, "learning_rate": 7.944680763130824e-05, "loss": 0.0222, "step": 3008 }, { "epoch": 1.97, "grad_norm": 0.17513103783130646, "learning_rate": 7.935584689563802e-05, "loss": 0.0198, "step": 3009 }, { "epoch": 1.97, "grad_norm": 0.022857805714011192, "learning_rate": 7.926491952976051e-05, "loss": 0.0035, "step": 3010 }, { "epoch": 1.97, "grad_norm": 0.14224904775619507, "learning_rate": 7.917402557662658e-05, "loss": 0.0137, "step": 3011 }, { "epoch": 1.97, "grad_norm": 0.2415999174118042, "learning_rate": 7.90831650791713e-05, "loss": 0.0421, "step": 3012 }, { "epoch": 1.97, "grad_norm": 0.05061252415180206, "learning_rate": 7.899233808031394e-05, "loss": 0.0069, "step": 3013 }, { "epoch": 1.97, "grad_norm": 0.12675368785858154, "learning_rate": 7.890154462295795e-05, "loss": 0.0124, "step": 3014 }, { "epoch": 1.97, "grad_norm": 0.16706958413124084, "learning_rate": 7.881078474999097e-05, "loss": 0.0369, "step": 3015 }, { "epoch": 1.97, "grad_norm": 0.049328841269016266, "learning_rate": 7.872005850428476e-05, "loss": 0.0065, "step": 3016 }, { "epoch": 1.98, "grad_norm": 0.11827641725540161, "learning_rate": 7.862936592869508e-05, "loss": 0.0161, "step": 3017 }, { "epoch": 1.98, "grad_norm": 0.17668034136295319, "learning_rate": 7.853870706606198e-05, "loss": 0.0334, "step": 3018 }, { "epoch": 1.98, "grad_norm": 0.1600051075220108, "learning_rate": 7.844808195920943e-05, "loss": 0.0106, "step": 3019 }, { "epoch": 1.98, "grad_norm": 0.23005841672420502, "learning_rate": 7.835749065094558e-05, "loss": 0.0187, "step": 3020 }, { "epoch": 1.98, "grad_norm": 0.021763009950518608, "learning_rate": 7.82669331840625e-05, "loss": 0.0025, "step": 3021 }, { "epoch": 1.98, "grad_norm": 0.0882042795419693, "learning_rate": 7.817640960133636e-05, "loss": 0.0101, "step": 3022 }, { "epoch": 1.98, "grad_norm": 0.16614805161952972, "learning_rate": 7.808591994552728e-05, "loss": 0.021, "step": 3023 }, { "epoch": 1.98, "grad_norm": 1.298937201499939, "learning_rate": 7.799546425937941e-05, "loss": 0.0137, "step": 3024 }, { "epoch": 1.98, "grad_norm": 0.30948105454444885, "learning_rate": 7.79050425856207e-05, "loss": 0.0114, "step": 3025 }, { "epoch": 1.98, "grad_norm": 0.036360953003168106, "learning_rate": 7.78146549669632e-05, "loss": 0.0028, "step": 3026 }, { "epoch": 1.98, "grad_norm": 0.19619564712047577, "learning_rate": 7.772430144610284e-05, "loss": 0.014, "step": 3027 }, { "epoch": 1.98, "grad_norm": 0.22606731951236725, "learning_rate": 7.763398206571938e-05, "loss": 0.0618, "step": 3028 }, { "epoch": 1.98, "grad_norm": 0.045581962913274765, "learning_rate": 7.754369686847648e-05, "loss": 0.0039, "step": 3029 }, { "epoch": 1.98, "grad_norm": 0.0919916108250618, "learning_rate": 7.745344589702173e-05, "loss": 0.0243, "step": 3030 }, { "epoch": 1.98, "grad_norm": 0.14731182157993317, "learning_rate": 7.736322919398645e-05, "loss": 0.0182, "step": 3031 }, { "epoch": 1.98, "grad_norm": 0.2550598680973053, "learning_rate": 7.727304680198582e-05, "loss": 0.0238, "step": 3032 }, { "epoch": 1.99, "grad_norm": 0.12032425403594971, "learning_rate": 7.718289876361885e-05, "loss": 0.0153, "step": 3033 }, { "epoch": 1.99, "grad_norm": 0.038090333342552185, "learning_rate": 7.709278512146815e-05, "loss": 0.0049, "step": 3034 }, { "epoch": 1.99, "grad_norm": 0.19079631567001343, "learning_rate": 7.700270591810029e-05, "loss": 0.0188, "step": 3035 }, { "epoch": 1.99, "grad_norm": 0.21086086332798004, "learning_rate": 7.69126611960655e-05, "loss": 0.0389, "step": 3036 }, { "epoch": 1.99, "grad_norm": 0.11210260540246964, "learning_rate": 7.68226509978977e-05, "loss": 0.0443, "step": 3037 }, { "epoch": 1.99, "grad_norm": 0.018384624272584915, "learning_rate": 7.67326753661145e-05, "loss": 0.0024, "step": 3038 }, { "epoch": 1.99, "grad_norm": 0.015754880383610725, "learning_rate": 7.66427343432172e-05, "loss": 0.0019, "step": 3039 }, { "epoch": 1.99, "grad_norm": 0.20221295952796936, "learning_rate": 7.655282797169078e-05, "loss": 0.0111, "step": 3040 }, { "epoch": 1.99, "grad_norm": 0.12582607567310333, "learning_rate": 7.64629562940038e-05, "loss": 0.0095, "step": 3041 }, { "epoch": 1.99, "grad_norm": 0.09076832234859467, "learning_rate": 7.637311935260852e-05, "loss": 0.0082, "step": 3042 }, { "epoch": 1.99, "grad_norm": 0.025840701535344124, "learning_rate": 7.628331718994059e-05, "loss": 0.0027, "step": 3043 }, { "epoch": 1.99, "grad_norm": 0.24747799336910248, "learning_rate": 7.619354984841945e-05, "loss": 0.0227, "step": 3044 }, { "epoch": 1.99, "grad_norm": 0.10816159844398499, "learning_rate": 7.610381737044798e-05, "loss": 0.0285, "step": 3045 }, { "epoch": 1.99, "grad_norm": 0.2089771330356598, "learning_rate": 7.601411979841267e-05, "loss": 0.0278, "step": 3046 }, { "epoch": 1.99, "grad_norm": 0.10027289390563965, "learning_rate": 7.59244571746834e-05, "loss": 0.0083, "step": 3047 }, { "epoch": 2.0, "grad_norm": 0.1805410087108612, "learning_rate": 7.58348295416137e-05, "loss": 0.0141, "step": 3048 }, { "epoch": 2.0, "grad_norm": 0.05324965715408325, "learning_rate": 7.57452369415404e-05, "loss": 0.0025, "step": 3049 }, { "epoch": 2.0, "grad_norm": 0.2869288921356201, "learning_rate": 7.565567941678392e-05, "loss": 0.0217, "step": 3050 }, { "epoch": 2.0, "grad_norm": 0.3843397796154022, "learning_rate": 7.556615700964808e-05, "loss": 0.0599, "step": 3051 }, { "epoch": 2.0, "grad_norm": 0.03356291353702545, "learning_rate": 7.547666976242004e-05, "loss": 0.0044, "step": 3052 }, { "epoch": 2.0, "grad_norm": 0.13007520139217377, "learning_rate": 7.538721771737039e-05, "loss": 0.0436, "step": 3053 }, { "epoch": 2.0, "grad_norm": 0.04935429245233536, "learning_rate": 7.529780091675315e-05, "loss": 0.0058, "step": 3054 } ], "logging_steps": 1, "max_steps": 4581, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 1527, "total_flos": 2.8512716171771904e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }