diff --git "a/checkpoint-4581/trainer_state.json" "b/checkpoint-4581/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-4581/trainer_state.json" @@ -0,0 +1,32184 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.9990180032733225, + "eval_steps": 382, + "global_step": 4581, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 5.687138080596924, + "learning_rate": 2.9999999999999997e-05, + "loss": 3.5097, + "step": 1 + }, + { + "epoch": 0.0, + "eval_loss": 3.6327099800109863, + "eval_runtime": 39.1673, + "eval_samples_per_second": 32.859, + "eval_steps_per_second": 8.221, + "step": 1 + }, + { + "epoch": 0.0, + "grad_norm": 5.729796886444092, + "learning_rate": 5.9999999999999995e-05, + "loss": 3.6634, + "step": 2 + }, + { + "epoch": 0.0, + "grad_norm": 5.679180145263672, + "learning_rate": 8.999999999999999e-05, + "loss": 3.5559, + "step": 3 + }, + { + "epoch": 0.0, + "grad_norm": 4.81653356552124, + "learning_rate": 0.00011999999999999999, + "loss": 3.1536, + "step": 4 + }, + { + "epoch": 0.0, + "grad_norm": 4.388213634490967, + "learning_rate": 0.00015, + "loss": 2.3092, + "step": 5 + }, + { + "epoch": 0.0, + "grad_norm": 2.6662285327911377, + "learning_rate": 0.00017999999999999998, + "loss": 1.2283, + "step": 6 + }, + { + "epoch": 0.0, + "grad_norm": 1.9162248373031616, + "learning_rate": 0.00020999999999999998, + "loss": 0.6207, + "step": 7 + }, + { + "epoch": 0.01, + "grad_norm": 1.3946017026901245, + "learning_rate": 0.00023999999999999998, + "loss": 0.2942, + "step": 8 + }, + { + "epoch": 0.01, + "grad_norm": 0.3801995813846588, + "learning_rate": 0.00027, + "loss": 0.1143, + "step": 9 + }, + { + "epoch": 0.01, + "grad_norm": 0.2290647178888321, + "learning_rate": 0.0003, + "loss": 0.1152, + "step": 10 + }, + { + "epoch": 0.01, + "grad_norm": 0.2698324918746948, + "learning_rate": 0.00029999996457265966, + "loss": 0.0984, + "step": 11 + }, + { + "epoch": 0.01, + "grad_norm": 0.15049245953559875, + "learning_rate": 0.00029999985829065547, + "loss": 0.0925, + "step": 12 + }, + { + "epoch": 0.01, + "grad_norm": 0.7001833319664001, + "learning_rate": 0.0002999996811540376, + "loss": 0.1215, + "step": 13 + }, + { + "epoch": 0.01, + "grad_norm": 0.22832374274730682, + "learning_rate": 0.00029999943316288974, + "loss": 0.0997, + "step": 14 + }, + { + "epoch": 0.01, + "grad_norm": 0.1290595531463623, + "learning_rate": 0.00029999911431732894, + "loss": 0.0973, + "step": 15 + }, + { + "epoch": 0.01, + "grad_norm": 0.3555549383163452, + "learning_rate": 0.00029999872461750597, + "loss": 0.1108, + "step": 16 + }, + { + "epoch": 0.01, + "grad_norm": 0.04830395057797432, + "learning_rate": 0.0002999982640636048, + "loss": 0.0994, + "step": 17 + }, + { + "epoch": 0.01, + "grad_norm": 0.2727436125278473, + "learning_rate": 0.00029999773265584304, + "loss": 0.1144, + "step": 18 + }, + { + "epoch": 0.01, + "grad_norm": 0.03478335589170456, + "learning_rate": 0.0002999971303944716, + "loss": 0.0945, + "step": 19 + }, + { + "epoch": 0.01, + "grad_norm": 0.133951798081398, + "learning_rate": 0.00029999645727977505, + "loss": 0.0928, + "step": 20 + }, + { + "epoch": 0.01, + "grad_norm": 0.20885471999645233, + "learning_rate": 0.0002999957133120714, + "loss": 0.1056, + "step": 21 + }, + { + "epoch": 0.01, + "grad_norm": 0.030896561220288277, + "learning_rate": 0.00029999489849171195, + "loss": 0.0985, + "step": 22 + }, + { + "epoch": 0.02, + "grad_norm": 0.0476481132209301, + "learning_rate": 0.0002999940128190817, + "loss": 0.0993, + "step": 23 + }, + { + "epoch": 0.02, + "grad_norm": 0.2006714642047882, + "learning_rate": 0.00029999305629459895, + "loss": 0.0971, + "step": 24 + }, + { + "epoch": 0.02, + "grad_norm": 0.150727316737175, + "learning_rate": 0.0002999920289187155, + "loss": 0.1016, + "step": 25 + }, + { + "epoch": 0.02, + "grad_norm": 0.03271281719207764, + "learning_rate": 0.0002999909306919168, + "loss": 0.1002, + "step": 26 + }, + { + "epoch": 0.02, + "grad_norm": 0.08288753032684326, + "learning_rate": 0.0002999897616147214, + "loss": 0.1009, + "step": 27 + }, + { + "epoch": 0.02, + "grad_norm": 0.2443581521511078, + "learning_rate": 0.0002999885216876816, + "loss": 0.1036, + "step": 28 + }, + { + "epoch": 0.02, + "grad_norm": 0.16865722835063934, + "learning_rate": 0.00029998721091138323, + "loss": 0.0965, + "step": 29 + }, + { + "epoch": 0.02, + "grad_norm": 0.19362947344779968, + "learning_rate": 0.0002999858292864453, + "loss": 0.0952, + "step": 30 + }, + { + "epoch": 0.02, + "grad_norm": 0.039490532130002975, + "learning_rate": 0.0002999843768135205, + "loss": 0.0967, + "step": 31 + }, + { + "epoch": 0.02, + "grad_norm": 0.15848855674266815, + "learning_rate": 0.0002999828534932949, + "loss": 0.093, + "step": 32 + }, + { + "epoch": 0.02, + "grad_norm": 0.2813495695590973, + "learning_rate": 0.0002999812593264881, + "loss": 0.1052, + "step": 33 + }, + { + "epoch": 0.02, + "grad_norm": 0.03380066901445389, + "learning_rate": 0.00029997959431385314, + "loss": 0.0974, + "step": 34 + }, + { + "epoch": 0.02, + "grad_norm": 0.050066880881786346, + "learning_rate": 0.0002999778584561764, + "loss": 0.0972, + "step": 35 + }, + { + "epoch": 0.02, + "grad_norm": 0.2120673805475235, + "learning_rate": 0.00029997605175427803, + "loss": 0.0965, + "step": 36 + }, + { + "epoch": 0.02, + "grad_norm": 0.11290993541479111, + "learning_rate": 0.0002999741742090113, + "loss": 0.099, + "step": 37 + }, + { + "epoch": 0.02, + "grad_norm": 0.2454652190208435, + "learning_rate": 0.00029997222582126313, + "loss": 0.0898, + "step": 38 + }, + { + "epoch": 0.03, + "grad_norm": 0.10817914456129074, + "learning_rate": 0.0002999702065919539, + "loss": 0.0887, + "step": 39 + }, + { + "epoch": 0.03, + "grad_norm": 0.3510904014110565, + "learning_rate": 0.00029996811652203737, + "loss": 0.1107, + "step": 40 + }, + { + "epoch": 0.03, + "grad_norm": 0.3444919288158417, + "learning_rate": 0.0002999659556125009, + "loss": 0.1113, + "step": 41 + }, + { + "epoch": 0.03, + "grad_norm": 0.21621473133563995, + "learning_rate": 0.0002999637238643651, + "loss": 0.0991, + "step": 42 + }, + { + "epoch": 0.03, + "grad_norm": 0.0429786741733551, + "learning_rate": 0.00029996142127868426, + "loss": 0.0976, + "step": 43 + }, + { + "epoch": 0.03, + "grad_norm": 0.04371911287307739, + "learning_rate": 0.000299959047856546, + "loss": 0.0969, + "step": 44 + }, + { + "epoch": 0.03, + "grad_norm": 0.17956386506557465, + "learning_rate": 0.00029995660359907154, + "loss": 0.1027, + "step": 45 + }, + { + "epoch": 0.03, + "grad_norm": 0.05985981971025467, + "learning_rate": 0.0002999540885074153, + "loss": 0.0911, + "step": 46 + }, + { + "epoch": 0.03, + "grad_norm": 0.057165782898664474, + "learning_rate": 0.00029995150258276546, + "loss": 0.0944, + "step": 47 + }, + { + "epoch": 0.03, + "grad_norm": 0.06133668124675751, + "learning_rate": 0.00029994884582634345, + "loss": 0.0936, + "step": 48 + }, + { + "epoch": 0.03, + "grad_norm": 0.13429470360279083, + "learning_rate": 0.0002999461182394042, + "loss": 0.0932, + "step": 49 + }, + { + "epoch": 0.03, + "grad_norm": 0.08454808592796326, + "learning_rate": 0.00029994331982323625, + "loss": 0.0849, + "step": 50 + }, + { + "epoch": 0.03, + "grad_norm": 0.152529776096344, + "learning_rate": 0.0002999404505791613, + "loss": 0.0742, + "step": 51 + }, + { + "epoch": 0.03, + "grad_norm": 0.9239559173583984, + "learning_rate": 0.0002999375105085348, + "loss": 0.1266, + "step": 52 + }, + { + "epoch": 0.03, + "grad_norm": 0.15827079117298126, + "learning_rate": 0.0002999344996127455, + "loss": 0.0685, + "step": 53 + }, + { + "epoch": 0.04, + "grad_norm": 0.11372745782136917, + "learning_rate": 0.0002999314178932156, + "loss": 0.0853, + "step": 54 + }, + { + "epoch": 0.04, + "grad_norm": 0.11947692930698395, + "learning_rate": 0.00029992826535140093, + "loss": 0.0871, + "step": 55 + }, + { + "epoch": 0.04, + "grad_norm": 0.09731484949588776, + "learning_rate": 0.00029992504198879047, + "loss": 0.0799, + "step": 56 + }, + { + "epoch": 0.04, + "grad_norm": 0.40479809045791626, + "learning_rate": 0.0002999217478069069, + "loss": 0.1119, + "step": 57 + }, + { + "epoch": 0.04, + "grad_norm": 0.10651114583015442, + "learning_rate": 0.00029991838280730635, + "loss": 0.0741, + "step": 58 + }, + { + "epoch": 0.04, + "grad_norm": 0.13227766752243042, + "learning_rate": 0.0002999149469915782, + "loss": 0.067, + "step": 59 + }, + { + "epoch": 0.04, + "grad_norm": 0.2328774333000183, + "learning_rate": 0.0002999114403613454, + "loss": 0.0872, + "step": 60 + }, + { + "epoch": 0.04, + "grad_norm": 0.15303733944892883, + "learning_rate": 0.0002999078629182645, + "loss": 0.077, + "step": 61 + }, + { + "epoch": 0.04, + "grad_norm": 0.3285676836967468, + "learning_rate": 0.0002999042146640252, + "loss": 0.087, + "step": 62 + }, + { + "epoch": 0.04, + "grad_norm": 0.1548561453819275, + "learning_rate": 0.00029990049560035093, + "loss": 0.0521, + "step": 63 + }, + { + "epoch": 0.04, + "grad_norm": 0.1792415827512741, + "learning_rate": 0.0002998967057289983, + "loss": 0.0591, + "step": 64 + }, + { + "epoch": 0.04, + "grad_norm": 0.29741746187210083, + "learning_rate": 0.0002998928450517577, + "loss": 0.0955, + "step": 65 + }, + { + "epoch": 0.04, + "grad_norm": 0.2590031325817108, + "learning_rate": 0.0002998889135704527, + "loss": 0.0443, + "step": 66 + }, + { + "epoch": 0.04, + "grad_norm": 0.2152624875307083, + "learning_rate": 0.0002998849112869403, + "loss": 0.0656, + "step": 67 + }, + { + "epoch": 0.04, + "grad_norm": 0.1976858377456665, + "learning_rate": 0.0002998808382031111, + "loss": 0.0256, + "step": 68 + }, + { + "epoch": 0.05, + "grad_norm": 0.4642391502857208, + "learning_rate": 0.00029987669432088917, + "loss": 0.074, + "step": 69 + }, + { + "epoch": 0.05, + "grad_norm": 0.43813541531562805, + "learning_rate": 0.0002998724796422318, + "loss": 0.0344, + "step": 70 + }, + { + "epoch": 0.05, + "grad_norm": 0.8069552183151245, + "learning_rate": 0.0002998681941691299, + "loss": 0.1559, + "step": 71 + }, + { + "epoch": 0.05, + "grad_norm": 0.3986961841583252, + "learning_rate": 0.00029986383790360776, + "loss": 0.0504, + "step": 72 + }, + { + "epoch": 0.05, + "grad_norm": 0.19154639542102814, + "learning_rate": 0.00029985941084772317, + "loss": 0.0638, + "step": 73 + }, + { + "epoch": 0.05, + "grad_norm": 0.2110302895307541, + "learning_rate": 0.0002998549130035673, + "loss": 0.071, + "step": 74 + }, + { + "epoch": 0.05, + "grad_norm": 0.17988017201423645, + "learning_rate": 0.00029985034437326477, + "loss": 0.0798, + "step": 75 + }, + { + "epoch": 0.05, + "grad_norm": 0.15195637941360474, + "learning_rate": 0.0002998457049589736, + "loss": 0.0575, + "step": 76 + }, + { + "epoch": 0.05, + "grad_norm": 0.2465752810239792, + "learning_rate": 0.0002998409947628854, + "loss": 0.0669, + "step": 77 + }, + { + "epoch": 0.05, + "grad_norm": 0.10329095274209976, + "learning_rate": 0.0002998362137872249, + "loss": 0.0483, + "step": 78 + }, + { + "epoch": 0.05, + "grad_norm": 0.21354705095291138, + "learning_rate": 0.00029983136203425064, + "loss": 0.0522, + "step": 79 + }, + { + "epoch": 0.05, + "grad_norm": 0.1916392743587494, + "learning_rate": 0.00029982643950625436, + "loss": 0.0797, + "step": 80 + }, + { + "epoch": 0.05, + "grad_norm": 0.12721975147724152, + "learning_rate": 0.0002998214462055613, + "loss": 0.0368, + "step": 81 + }, + { + "epoch": 0.05, + "grad_norm": 0.29551053047180176, + "learning_rate": 0.0002998163821345301, + "loss": 0.094, + "step": 82 + }, + { + "epoch": 0.05, + "grad_norm": 0.3058943748474121, + "learning_rate": 0.00029981124729555283, + "loss": 0.0358, + "step": 83 + }, + { + "epoch": 0.05, + "grad_norm": 0.7026583552360535, + "learning_rate": 0.00029980604169105497, + "loss": 0.1386, + "step": 84 + }, + { + "epoch": 0.06, + "grad_norm": 0.37371405959129333, + "learning_rate": 0.00029980076532349557, + "loss": 0.0748, + "step": 85 + }, + { + "epoch": 0.06, + "grad_norm": 0.28942474722862244, + "learning_rate": 0.00029979541819536695, + "loss": 0.1037, + "step": 86 + }, + { + "epoch": 0.06, + "grad_norm": 0.1699017435312271, + "learning_rate": 0.0002997900003091949, + "loss": 0.0631, + "step": 87 + }, + { + "epoch": 0.06, + "grad_norm": 0.1061767190694809, + "learning_rate": 0.0002997845116675386, + "loss": 0.0557, + "step": 88 + }, + { + "epoch": 0.06, + "grad_norm": 0.13656219840049744, + "learning_rate": 0.0002997789522729908, + "loss": 0.0637, + "step": 89 + }, + { + "epoch": 0.06, + "grad_norm": 0.09874790161848068, + "learning_rate": 0.00029977332212817746, + "loss": 0.0495, + "step": 90 + }, + { + "epoch": 0.06, + "grad_norm": 0.24591276049613953, + "learning_rate": 0.0002997676212357581, + "loss": 0.0559, + "step": 91 + }, + { + "epoch": 0.06, + "grad_norm": 0.14663195610046387, + "learning_rate": 0.0002997618495984256, + "loss": 0.0804, + "step": 92 + }, + { + "epoch": 0.06, + "grad_norm": 0.08905334770679474, + "learning_rate": 0.0002997560072189062, + "loss": 0.0498, + "step": 93 + }, + { + "epoch": 0.06, + "grad_norm": 0.12921252846717834, + "learning_rate": 0.00029975009409995986, + "loss": 0.0365, + "step": 94 + }, + { + "epoch": 0.06, + "grad_norm": 0.08008511364459991, + "learning_rate": 0.0002997441102443795, + "loss": 0.03, + "step": 95 + }, + { + "epoch": 0.06, + "grad_norm": 0.2947149872779846, + "learning_rate": 0.0002997380556549918, + "loss": 0.0698, + "step": 96 + }, + { + "epoch": 0.06, + "grad_norm": 0.3243441581726074, + "learning_rate": 0.0002997319303346567, + "loss": 0.0564, + "step": 97 + }, + { + "epoch": 0.06, + "grad_norm": 0.28058576583862305, + "learning_rate": 0.00029972573428626757, + "loss": 0.1262, + "step": 98 + }, + { + "epoch": 0.06, + "grad_norm": 0.40957021713256836, + "learning_rate": 0.0002997194675127512, + "loss": 0.0471, + "step": 99 + }, + { + "epoch": 0.07, + "grad_norm": 0.12092690169811249, + "learning_rate": 0.00029971313001706787, + "loss": 0.0574, + "step": 100 + }, + { + "epoch": 0.07, + "grad_norm": 0.380398154258728, + "learning_rate": 0.0002997067218022111, + "loss": 0.1148, + "step": 101 + }, + { + "epoch": 0.07, + "grad_norm": 0.13584262132644653, + "learning_rate": 0.0002997002428712079, + "loss": 0.0299, + "step": 102 + }, + { + "epoch": 0.07, + "grad_norm": 0.13165581226348877, + "learning_rate": 0.00029969369322711874, + "loss": 0.0602, + "step": 103 + }, + { + "epoch": 0.07, + "grad_norm": 0.1055503860116005, + "learning_rate": 0.00029968707287303744, + "loss": 0.0404, + "step": 104 + }, + { + "epoch": 0.07, + "grad_norm": 0.09600503742694855, + "learning_rate": 0.00029968038181209114, + "loss": 0.0497, + "step": 105 + }, + { + "epoch": 0.07, + "grad_norm": 0.05941639468073845, + "learning_rate": 0.0002996736200474406, + "loss": 0.0456, + "step": 106 + }, + { + "epoch": 0.07, + "grad_norm": 0.1557297259569168, + "learning_rate": 0.0002996667875822797, + "loss": 0.077, + "step": 107 + }, + { + "epoch": 0.07, + "grad_norm": 0.14879021048545837, + "learning_rate": 0.00029965988441983595, + "loss": 0.0554, + "step": 108 + }, + { + "epoch": 0.07, + "grad_norm": 0.13067294657230377, + "learning_rate": 0.00029965291056337006, + "loss": 0.0357, + "step": 109 + }, + { + "epoch": 0.07, + "grad_norm": 0.15178795158863068, + "learning_rate": 0.00029964586601617633, + "loss": 0.0433, + "step": 110 + }, + { + "epoch": 0.07, + "grad_norm": 0.1176379844546318, + "learning_rate": 0.0002996387507815823, + "loss": 0.0432, + "step": 111 + }, + { + "epoch": 0.07, + "grad_norm": 0.048378992825746536, + "learning_rate": 0.000299631564862949, + "loss": 0.0338, + "step": 112 + }, + { + "epoch": 0.07, + "grad_norm": 0.09883740544319153, + "learning_rate": 0.0002996243082636708, + "loss": 0.0475, + "step": 113 + }, + { + "epoch": 0.07, + "grad_norm": 0.16062304377555847, + "learning_rate": 0.0002996169809871754, + "loss": 0.0595, + "step": 114 + }, + { + "epoch": 0.08, + "grad_norm": 0.06556422263383865, + "learning_rate": 0.00029960958303692397, + "loss": 0.0326, + "step": 115 + }, + { + "epoch": 0.08, + "grad_norm": 0.7436458468437195, + "learning_rate": 0.000299602114416411, + "loss": 0.0512, + "step": 116 + }, + { + "epoch": 0.08, + "grad_norm": 0.12153153866529465, + "learning_rate": 0.00029959457512916454, + "loss": 0.0448, + "step": 117 + }, + { + "epoch": 0.08, + "grad_norm": 0.21684418618679047, + "learning_rate": 0.0002995869651787458, + "loss": 0.0754, + "step": 118 + }, + { + "epoch": 0.08, + "grad_norm": 0.13978178799152374, + "learning_rate": 0.0002995792845687494, + "loss": 0.03, + "step": 119 + }, + { + "epoch": 0.08, + "grad_norm": 0.08695519715547562, + "learning_rate": 0.0002995715333028034, + "loss": 0.0156, + "step": 120 + }, + { + "epoch": 0.08, + "grad_norm": 0.2607383131980896, + "learning_rate": 0.0002995637113845693, + "loss": 0.0933, + "step": 121 + }, + { + "epoch": 0.08, + "grad_norm": 0.08398541808128357, + "learning_rate": 0.0002995558188177418, + "loss": 0.0368, + "step": 122 + }, + { + "epoch": 0.08, + "grad_norm": 0.14658145606517792, + "learning_rate": 0.0002995478556060492, + "loss": 0.0593, + "step": 123 + }, + { + "epoch": 0.08, + "grad_norm": 0.09054147452116013, + "learning_rate": 0.00029953982175325293, + "loss": 0.042, + "step": 124 + }, + { + "epoch": 0.08, + "grad_norm": 0.17315314710140228, + "learning_rate": 0.0002995317172631479, + "loss": 0.0754, + "step": 125 + }, + { + "epoch": 0.08, + "grad_norm": 0.20856395363807678, + "learning_rate": 0.0002995235421395624, + "loss": 0.0537, + "step": 126 + }, + { + "epoch": 0.08, + "grad_norm": 0.17539943754673004, + "learning_rate": 0.0002995152963863581, + "loss": 0.045, + "step": 127 + }, + { + "epoch": 0.08, + "grad_norm": 0.1361098289489746, + "learning_rate": 0.00029950698000743, + "loss": 0.0622, + "step": 128 + }, + { + "epoch": 0.08, + "grad_norm": 0.05299444869160652, + "learning_rate": 0.00029949859300670644, + "loss": 0.0548, + "step": 129 + }, + { + "epoch": 0.09, + "grad_norm": 0.19711115956306458, + "learning_rate": 0.0002994901353881491, + "loss": 0.0721, + "step": 130 + }, + { + "epoch": 0.09, + "grad_norm": 0.1288406252861023, + "learning_rate": 0.0002994816071557532, + "loss": 0.0408, + "step": 131 + }, + { + "epoch": 0.09, + "grad_norm": 0.08221332728862762, + "learning_rate": 0.000299473008313547, + "loss": 0.0526, + "step": 132 + }, + { + "epoch": 0.09, + "grad_norm": 0.1506081223487854, + "learning_rate": 0.00029946433886559237, + "loss": 0.0542, + "step": 133 + }, + { + "epoch": 0.09, + "grad_norm": 0.293639600276947, + "learning_rate": 0.00029945559881598444, + "loss": 0.0769, + "step": 134 + }, + { + "epoch": 0.09, + "grad_norm": 0.06451396644115448, + "learning_rate": 0.0002994467881688517, + "loss": 0.0417, + "step": 135 + }, + { + "epoch": 0.09, + "grad_norm": 0.2765437662601471, + "learning_rate": 0.00029943790692835604, + "loss": 0.0617, + "step": 136 + }, + { + "epoch": 0.09, + "grad_norm": 0.12035606801509857, + "learning_rate": 0.00029942895509869254, + "loss": 0.0429, + "step": 137 + }, + { + "epoch": 0.09, + "grad_norm": 0.09559385478496552, + "learning_rate": 0.0002994199326840898, + "loss": 0.044, + "step": 138 + }, + { + "epoch": 0.09, + "grad_norm": 0.13433387875556946, + "learning_rate": 0.00029941083968880965, + "loss": 0.036, + "step": 139 + }, + { + "epoch": 0.09, + "grad_norm": 0.1325090080499649, + "learning_rate": 0.0002994016761171474, + "loss": 0.0762, + "step": 140 + }, + { + "epoch": 0.09, + "grad_norm": 0.19197365641593933, + "learning_rate": 0.00029939244197343143, + "loss": 0.0587, + "step": 141 + }, + { + "epoch": 0.09, + "grad_norm": 0.09238675236701965, + "learning_rate": 0.00029938313726202376, + "loss": 0.0262, + "step": 142 + }, + { + "epoch": 0.09, + "grad_norm": 0.2584728002548218, + "learning_rate": 0.0002993737619873195, + "loss": 0.0382, + "step": 143 + }, + { + "epoch": 0.09, + "grad_norm": 0.30280745029449463, + "learning_rate": 0.00029936431615374727, + "loss": 0.0448, + "step": 144 + }, + { + "epoch": 0.09, + "grad_norm": 0.41464564204216003, + "learning_rate": 0.00029935479976576896, + "loss": 0.0676, + "step": 145 + }, + { + "epoch": 0.1, + "grad_norm": 0.4580010175704956, + "learning_rate": 0.00029934521282787974, + "loss": 0.1366, + "step": 146 + }, + { + "epoch": 0.1, + "grad_norm": 0.1701657474040985, + "learning_rate": 0.0002993355553446081, + "loss": 0.0844, + "step": 147 + }, + { + "epoch": 0.1, + "grad_norm": 0.10784261673688889, + "learning_rate": 0.000299325827320516, + "loss": 0.0211, + "step": 148 + }, + { + "epoch": 0.1, + "grad_norm": 0.08266110718250275, + "learning_rate": 0.0002993160287601984, + "loss": 0.0181, + "step": 149 + }, + { + "epoch": 0.1, + "grad_norm": 0.20068615674972534, + "learning_rate": 0.00029930615966828407, + "loss": 0.0582, + "step": 150 + }, + { + "epoch": 0.1, + "grad_norm": 0.14237689971923828, + "learning_rate": 0.0002992962200494347, + "loss": 0.0549, + "step": 151 + }, + { + "epoch": 0.1, + "grad_norm": 0.09671233594417572, + "learning_rate": 0.0002992862099083453, + "loss": 0.0368, + "step": 152 + }, + { + "epoch": 0.1, + "grad_norm": 0.11356969177722931, + "learning_rate": 0.00029927612924974455, + "loss": 0.0851, + "step": 153 + }, + { + "epoch": 0.1, + "grad_norm": 0.17435969412326813, + "learning_rate": 0.00029926597807839394, + "loss": 0.0869, + "step": 154 + }, + { + "epoch": 0.1, + "grad_norm": 0.09785137325525284, + "learning_rate": 0.00029925575639908866, + "loss": 0.0463, + "step": 155 + }, + { + "epoch": 0.1, + "grad_norm": 0.143271341919899, + "learning_rate": 0.0002992454642166571, + "loss": 0.0532, + "step": 156 + }, + { + "epoch": 0.1, + "grad_norm": 0.1381101906299591, + "learning_rate": 0.0002992351015359608, + "loss": 0.0512, + "step": 157 + }, + { + "epoch": 0.1, + "grad_norm": 0.0688018947839737, + "learning_rate": 0.0002992246683618948, + "loss": 0.0188, + "step": 158 + }, + { + "epoch": 0.1, + "grad_norm": 0.18138591945171356, + "learning_rate": 0.0002992141646993874, + "loss": 0.0737, + "step": 159 + }, + { + "epoch": 0.1, + "grad_norm": 0.0729256346821785, + "learning_rate": 0.0002992035905534001, + "loss": 0.0194, + "step": 160 + }, + { + "epoch": 0.11, + "grad_norm": 0.15414761006832123, + "learning_rate": 0.0002991929459289277, + "loss": 0.0412, + "step": 161 + }, + { + "epoch": 0.11, + "grad_norm": 0.2506199777126312, + "learning_rate": 0.00029918223083099846, + "loss": 0.0789, + "step": 162 + }, + { + "epoch": 0.11, + "grad_norm": 0.16611520946025848, + "learning_rate": 0.00029917144526467375, + "loss": 0.046, + "step": 163 + }, + { + "epoch": 0.11, + "grad_norm": 0.1828208565711975, + "learning_rate": 0.00029916058923504826, + "loss": 0.0324, + "step": 164 + }, + { + "epoch": 0.11, + "grad_norm": 0.08737993985414505, + "learning_rate": 0.00029914966274725006, + "loss": 0.0177, + "step": 165 + }, + { + "epoch": 0.11, + "grad_norm": 0.20271027088165283, + "learning_rate": 0.00029913866580644037, + "loss": 0.0455, + "step": 166 + }, + { + "epoch": 0.11, + "grad_norm": 0.04210209473967552, + "learning_rate": 0.00029912759841781383, + "loss": 0.0063, + "step": 167 + }, + { + "epoch": 0.11, + "grad_norm": 0.09085400402545929, + "learning_rate": 0.00029911646058659825, + "loss": 0.0174, + "step": 168 + }, + { + "epoch": 0.11, + "grad_norm": 0.18242572247982025, + "learning_rate": 0.00029910525231805466, + "loss": 0.053, + "step": 169 + }, + { + "epoch": 0.11, + "grad_norm": 0.2796941101551056, + "learning_rate": 0.0002990939736174776, + "loss": 0.0348, + "step": 170 + }, + { + "epoch": 0.11, + "grad_norm": 0.18838226795196533, + "learning_rate": 0.00029908262449019463, + "loss": 0.0583, + "step": 171 + }, + { + "epoch": 0.11, + "grad_norm": 0.03574841469526291, + "learning_rate": 0.00029907120494156674, + "loss": 0.0058, + "step": 172 + }, + { + "epoch": 0.11, + "grad_norm": 0.18582922220230103, + "learning_rate": 0.00029905971497698805, + "loss": 0.0571, + "step": 173 + }, + { + "epoch": 0.11, + "grad_norm": 0.12871672213077545, + "learning_rate": 0.00029904815460188604, + "loss": 0.0618, + "step": 174 + }, + { + "epoch": 0.11, + "grad_norm": 0.0590621717274189, + "learning_rate": 0.00029903652382172143, + "loss": 0.0107, + "step": 175 + }, + { + "epoch": 0.12, + "grad_norm": 0.07922167330980301, + "learning_rate": 0.00029902482264198817, + "loss": 0.035, + "step": 176 + }, + { + "epoch": 0.12, + "grad_norm": 0.3096056878566742, + "learning_rate": 0.0002990130510682135, + "loss": 0.0782, + "step": 177 + }, + { + "epoch": 0.12, + "grad_norm": 0.1896304190158844, + "learning_rate": 0.00029900120910595783, + "loss": 0.036, + "step": 178 + }, + { + "epoch": 0.12, + "grad_norm": 0.11776513606309891, + "learning_rate": 0.000298989296760815, + "loss": 0.0521, + "step": 179 + }, + { + "epoch": 0.12, + "grad_norm": 0.11616750061511993, + "learning_rate": 0.00029897731403841194, + "loss": 0.0275, + "step": 180 + }, + { + "epoch": 0.12, + "grad_norm": 0.20179390907287598, + "learning_rate": 0.0002989652609444088, + "loss": 0.0514, + "step": 181 + }, + { + "epoch": 0.12, + "grad_norm": 0.14983738958835602, + "learning_rate": 0.00029895313748449907, + "loss": 0.077, + "step": 182 + }, + { + "epoch": 0.12, + "grad_norm": 0.12123002856969833, + "learning_rate": 0.0002989409436644095, + "loss": 0.0485, + "step": 183 + }, + { + "epoch": 0.12, + "grad_norm": 0.314486026763916, + "learning_rate": 0.0002989286794898999, + "loss": 0.0931, + "step": 184 + }, + { + "epoch": 0.12, + "grad_norm": 0.132719025015831, + "learning_rate": 0.0002989163449667636, + "loss": 0.047, + "step": 185 + }, + { + "epoch": 0.12, + "grad_norm": 0.07938767969608307, + "learning_rate": 0.00029890394010082677, + "loss": 0.0364, + "step": 186 + }, + { + "epoch": 0.12, + "grad_norm": 0.08216488361358643, + "learning_rate": 0.00029889146489794926, + "loss": 0.0299, + "step": 187 + }, + { + "epoch": 0.12, + "grad_norm": 0.19339217245578766, + "learning_rate": 0.00029887891936402375, + "loss": 0.0408, + "step": 188 + }, + { + "epoch": 0.12, + "grad_norm": 0.30395349860191345, + "learning_rate": 0.0002988663035049763, + "loss": 0.0865, + "step": 189 + }, + { + "epoch": 0.12, + "grad_norm": 0.21264804899692535, + "learning_rate": 0.0002988536173267663, + "loss": 0.0584, + "step": 190 + }, + { + "epoch": 0.13, + "grad_norm": 0.1590937227010727, + "learning_rate": 0.0002988408608353862, + "loss": 0.0442, + "step": 191 + }, + { + "epoch": 0.13, + "grad_norm": 0.13069725036621094, + "learning_rate": 0.00029882803403686177, + "loss": 0.0416, + "step": 192 + }, + { + "epoch": 0.13, + "grad_norm": 0.1968701034784317, + "learning_rate": 0.0002988151369372518, + "loss": 0.0586, + "step": 193 + }, + { + "epoch": 0.13, + "grad_norm": 0.1478463113307953, + "learning_rate": 0.00029880216954264856, + "loss": 0.0595, + "step": 194 + }, + { + "epoch": 0.13, + "grad_norm": 0.06919383257627487, + "learning_rate": 0.0002987891318591773, + "loss": 0.0239, + "step": 195 + }, + { + "epoch": 0.13, + "grad_norm": 0.11905679851770401, + "learning_rate": 0.0002987760238929966, + "loss": 0.0345, + "step": 196 + }, + { + "epoch": 0.13, + "grad_norm": 0.14240068197250366, + "learning_rate": 0.00029876284565029816, + "loss": 0.0467, + "step": 197 + }, + { + "epoch": 0.13, + "grad_norm": 0.16097158193588257, + "learning_rate": 0.000298749597137307, + "loss": 0.0554, + "step": 198 + }, + { + "epoch": 0.13, + "grad_norm": 0.15597470104694366, + "learning_rate": 0.0002987362783602812, + "loss": 0.054, + "step": 199 + }, + { + "epoch": 0.13, + "grad_norm": 0.10321896523237228, + "learning_rate": 0.000298722889325512, + "loss": 0.0432, + "step": 200 + }, + { + "epoch": 0.13, + "grad_norm": 0.128427192568779, + "learning_rate": 0.000298709430039324, + "loss": 0.0315, + "step": 201 + }, + { + "epoch": 0.13, + "grad_norm": 0.11706223338842392, + "learning_rate": 0.00029869590050807487, + "loss": 0.0359, + "step": 202 + }, + { + "epoch": 0.13, + "grad_norm": 0.15359801054000854, + "learning_rate": 0.0002986823007381555, + "loss": 0.034, + "step": 203 + }, + { + "epoch": 0.13, + "grad_norm": 0.10363847017288208, + "learning_rate": 0.0002986686307359899, + "loss": 0.0261, + "step": 204 + }, + { + "epoch": 0.13, + "grad_norm": 0.12338493019342422, + "learning_rate": 0.0002986548905080353, + "loss": 0.0287, + "step": 205 + }, + { + "epoch": 0.13, + "grad_norm": 0.16201013326644897, + "learning_rate": 0.00029864108006078205, + "loss": 0.0173, + "step": 206 + }, + { + "epoch": 0.14, + "grad_norm": 0.04950540140271187, + "learning_rate": 0.00029862719940075387, + "loss": 0.0098, + "step": 207 + }, + { + "epoch": 0.14, + "grad_norm": 0.20930823683738708, + "learning_rate": 0.0002986132485345073, + "loss": 0.0652, + "step": 208 + }, + { + "epoch": 0.14, + "grad_norm": 0.12760238349437714, + "learning_rate": 0.0002985992274686324, + "loss": 0.0342, + "step": 209 + }, + { + "epoch": 0.14, + "grad_norm": 0.2107914686203003, + "learning_rate": 0.00029858513620975216, + "loss": 0.015, + "step": 210 + }, + { + "epoch": 0.14, + "grad_norm": 0.21169154345989227, + "learning_rate": 0.0002985709747645227, + "loss": 0.072, + "step": 211 + }, + { + "epoch": 0.14, + "grad_norm": 0.18555670976638794, + "learning_rate": 0.00029855674313963355, + "loss": 0.0359, + "step": 212 + }, + { + "epoch": 0.14, + "grad_norm": 0.1801125705242157, + "learning_rate": 0.00029854244134180707, + "loss": 0.038, + "step": 213 + }, + { + "epoch": 0.14, + "grad_norm": 0.10735122859477997, + "learning_rate": 0.000298528069377799, + "loss": 0.037, + "step": 214 + }, + { + "epoch": 0.14, + "grad_norm": 0.20155467092990875, + "learning_rate": 0.0002985136272543982, + "loss": 0.0505, + "step": 215 + }, + { + "epoch": 0.14, + "grad_norm": 0.1130833774805069, + "learning_rate": 0.0002984991149784265, + "loss": 0.0202, + "step": 216 + }, + { + "epoch": 0.14, + "grad_norm": 0.1932414174079895, + "learning_rate": 0.00029848453255673906, + "loss": 0.0803, + "step": 217 + }, + { + "epoch": 0.14, + "grad_norm": 0.18907181918621063, + "learning_rate": 0.0002984698799962241, + "loss": 0.0562, + "step": 218 + }, + { + "epoch": 0.14, + "grad_norm": 0.11439274251461029, + "learning_rate": 0.0002984551573038029, + "loss": 0.0474, + "step": 219 + }, + { + "epoch": 0.14, + "grad_norm": 0.19350704550743103, + "learning_rate": 0.00029844036448643, + "loss": 0.0335, + "step": 220 + }, + { + "epoch": 0.14, + "grad_norm": 0.19873294234275818, + "learning_rate": 0.000298425501551093, + "loss": 0.0616, + "step": 221 + }, + { + "epoch": 0.15, + "grad_norm": 0.2024085968732834, + "learning_rate": 0.00029841056850481265, + "loss": 0.0567, + "step": 222 + }, + { + "epoch": 0.15, + "grad_norm": 0.09004423022270203, + "learning_rate": 0.0002983955653546427, + "loss": 0.0291, + "step": 223 + }, + { + "epoch": 0.15, + "grad_norm": 0.19469811022281647, + "learning_rate": 0.00029838049210767015, + "loss": 0.0487, + "step": 224 + }, + { + "epoch": 0.15, + "grad_norm": 0.2525189518928528, + "learning_rate": 0.00029836534877101514, + "loss": 0.0629, + "step": 225 + }, + { + "epoch": 0.15, + "grad_norm": 0.12139023840427399, + "learning_rate": 0.0002983501353518307, + "loss": 0.0457, + "step": 226 + }, + { + "epoch": 0.15, + "grad_norm": 0.06411401927471161, + "learning_rate": 0.00029833485185730326, + "loss": 0.0186, + "step": 227 + }, + { + "epoch": 0.15, + "grad_norm": 0.024475542828440666, + "learning_rate": 0.00029831949829465214, + "loss": 0.004, + "step": 228 + }, + { + "epoch": 0.15, + "grad_norm": 0.15951114892959595, + "learning_rate": 0.0002983040746711298, + "loss": 0.0297, + "step": 229 + }, + { + "epoch": 0.15, + "grad_norm": 0.03694155812263489, + "learning_rate": 0.0002982885809940218, + "loss": 0.0073, + "step": 230 + }, + { + "epoch": 0.15, + "grad_norm": 0.13100893795490265, + "learning_rate": 0.0002982730172706468, + "loss": 0.0272, + "step": 231 + }, + { + "epoch": 0.15, + "grad_norm": 0.08929093927145004, + "learning_rate": 0.00029825738350835665, + "loss": 0.0146, + "step": 232 + }, + { + "epoch": 0.15, + "grad_norm": 0.1474764049053192, + "learning_rate": 0.0002982416797145361, + "loss": 0.0422, + "step": 233 + }, + { + "epoch": 0.15, + "grad_norm": 0.13874994218349457, + "learning_rate": 0.00029822590589660306, + "loss": 0.0353, + "step": 234 + }, + { + "epoch": 0.15, + "grad_norm": 0.048271678388118744, + "learning_rate": 0.00029821006206200856, + "loss": 0.0072, + "step": 235 + }, + { + "epoch": 0.15, + "grad_norm": 0.29017898440361023, + "learning_rate": 0.0002981941482182366, + "loss": 0.0607, + "step": 236 + }, + { + "epoch": 0.16, + "grad_norm": 0.3267674446105957, + "learning_rate": 0.0002981781643728044, + "loss": 0.101, + "step": 237 + }, + { + "epoch": 0.16, + "grad_norm": 0.17602747678756714, + "learning_rate": 0.00029816211053326216, + "loss": 0.0236, + "step": 238 + }, + { + "epoch": 0.16, + "grad_norm": 0.08361077308654785, + "learning_rate": 0.00029814598670719304, + "loss": 0.0277, + "step": 239 + }, + { + "epoch": 0.16, + "grad_norm": 0.08593238145112991, + "learning_rate": 0.00029812979290221346, + "loss": 0.0291, + "step": 240 + }, + { + "epoch": 0.16, + "grad_norm": 0.08858275413513184, + "learning_rate": 0.00029811352912597277, + "loss": 0.0329, + "step": 241 + }, + { + "epoch": 0.16, + "grad_norm": 0.08017202466726303, + "learning_rate": 0.0002980971953861534, + "loss": 0.0287, + "step": 242 + }, + { + "epoch": 0.16, + "grad_norm": 0.06615002453327179, + "learning_rate": 0.0002980807916904709, + "loss": 0.0269, + "step": 243 + }, + { + "epoch": 0.16, + "grad_norm": 0.12813499569892883, + "learning_rate": 0.00029806431804667364, + "loss": 0.0321, + "step": 244 + }, + { + "epoch": 0.16, + "grad_norm": 0.05528206750750542, + "learning_rate": 0.0002980477744625433, + "loss": 0.0089, + "step": 245 + }, + { + "epoch": 0.16, + "grad_norm": 0.10161186009645462, + "learning_rate": 0.00029803116094589445, + "loss": 0.0294, + "step": 246 + }, + { + "epoch": 0.16, + "grad_norm": 0.09885023534297943, + "learning_rate": 0.00029801447750457476, + "loss": 0.0232, + "step": 247 + }, + { + "epoch": 0.16, + "grad_norm": 0.20870375633239746, + "learning_rate": 0.00029799772414646484, + "loss": 0.0478, + "step": 248 + }, + { + "epoch": 0.16, + "grad_norm": 0.2730790674686432, + "learning_rate": 0.00029798090087947843, + "loss": 0.042, + "step": 249 + }, + { + "epoch": 0.16, + "grad_norm": 0.20371069014072418, + "learning_rate": 0.0002979640077115622, + "loss": 0.0634, + "step": 250 + }, + { + "epoch": 0.16, + "grad_norm": 0.14660406112670898, + "learning_rate": 0.0002979470446506959, + "loss": 0.0201, + "step": 251 + }, + { + "epoch": 0.16, + "grad_norm": 0.19971100986003876, + "learning_rate": 0.0002979300117048923, + "loss": 0.0431, + "step": 252 + }, + { + "epoch": 0.17, + "grad_norm": 0.14965400099754333, + "learning_rate": 0.0002979129088821971, + "loss": 0.041, + "step": 253 + }, + { + "epoch": 0.17, + "grad_norm": 0.2110958695411682, + "learning_rate": 0.0002978957361906892, + "loss": 0.028, + "step": 254 + }, + { + "epoch": 0.17, + "grad_norm": 0.13050246238708496, + "learning_rate": 0.0002978784936384802, + "loss": 0.0258, + "step": 255 + }, + { + "epoch": 0.17, + "grad_norm": 0.0885690301656723, + "learning_rate": 0.000297861181233715, + "loss": 0.0337, + "step": 256 + }, + { + "epoch": 0.17, + "grad_norm": 0.26541608572006226, + "learning_rate": 0.0002978437989845713, + "loss": 0.1142, + "step": 257 + }, + { + "epoch": 0.17, + "grad_norm": 0.14441104233264923, + "learning_rate": 0.0002978263468992599, + "loss": 0.0368, + "step": 258 + }, + { + "epoch": 0.17, + "grad_norm": 0.11450188606977463, + "learning_rate": 0.0002978088249860245, + "loss": 0.0243, + "step": 259 + }, + { + "epoch": 0.17, + "grad_norm": 0.3472074568271637, + "learning_rate": 0.00029779123325314184, + "loss": 0.0786, + "step": 260 + }, + { + "epoch": 0.17, + "grad_norm": 0.07867071032524109, + "learning_rate": 0.0002977735717089217, + "loss": 0.0356, + "step": 261 + }, + { + "epoch": 0.17, + "grad_norm": 0.1661967933177948, + "learning_rate": 0.0002977558403617067, + "loss": 0.047, + "step": 262 + }, + { + "epoch": 0.17, + "grad_norm": 0.17638400197029114, + "learning_rate": 0.00029773803921987244, + "loss": 0.0527, + "step": 263 + }, + { + "epoch": 0.17, + "grad_norm": 0.05885611101984978, + "learning_rate": 0.0002977201682918277, + "loss": 0.0156, + "step": 264 + }, + { + "epoch": 0.17, + "grad_norm": 0.07076411694288254, + "learning_rate": 0.00029770222758601395, + "loss": 0.0418, + "step": 265 + }, + { + "epoch": 0.17, + "grad_norm": 0.06245988979935646, + "learning_rate": 0.0002976842171109058, + "loss": 0.0199, + "step": 266 + }, + { + "epoch": 0.17, + "grad_norm": 0.08311894536018372, + "learning_rate": 0.0002976661368750107, + "loss": 0.028, + "step": 267 + }, + { + "epoch": 0.18, + "grad_norm": 0.11093831807374954, + "learning_rate": 0.0002976479868868692, + "loss": 0.0298, + "step": 268 + }, + { + "epoch": 0.18, + "grad_norm": 0.17683441936969757, + "learning_rate": 0.00029762976715505464, + "loss": 0.0539, + "step": 269 + }, + { + "epoch": 0.18, + "grad_norm": 0.13351142406463623, + "learning_rate": 0.00029761147768817345, + "loss": 0.0593, + "step": 270 + }, + { + "epoch": 0.18, + "grad_norm": 0.07717160880565643, + "learning_rate": 0.0002975931184948648, + "loss": 0.0227, + "step": 271 + }, + { + "epoch": 0.18, + "grad_norm": 0.11211559176445007, + "learning_rate": 0.0002975746895838011, + "loss": 0.0385, + "step": 272 + }, + { + "epoch": 0.18, + "grad_norm": 0.09209641814231873, + "learning_rate": 0.00029755619096368734, + "loss": 0.0086, + "step": 273 + }, + { + "epoch": 0.18, + "grad_norm": 0.0850004106760025, + "learning_rate": 0.0002975376226432617, + "loss": 0.0343, + "step": 274 + }, + { + "epoch": 0.18, + "grad_norm": 0.17711663246154785, + "learning_rate": 0.0002975189846312952, + "loss": 0.0665, + "step": 275 + }, + { + "epoch": 0.18, + "grad_norm": 0.13066548109054565, + "learning_rate": 0.0002975002769365918, + "loss": 0.0551, + "step": 276 + }, + { + "epoch": 0.18, + "grad_norm": 0.07509409636259079, + "learning_rate": 0.00029748149956798826, + "loss": 0.0087, + "step": 277 + }, + { + "epoch": 0.18, + "grad_norm": 0.3725223243236542, + "learning_rate": 0.0002974626525343544, + "loss": 0.026, + "step": 278 + }, + { + "epoch": 0.18, + "grad_norm": 0.20973052084445953, + "learning_rate": 0.0002974437358445929, + "loss": 0.015, + "step": 279 + }, + { + "epoch": 0.18, + "grad_norm": 0.25902581214904785, + "learning_rate": 0.0002974247495076393, + "loss": 0.0617, + "step": 280 + }, + { + "epoch": 0.18, + "grad_norm": 0.22490067780017853, + "learning_rate": 0.000297405693532462, + "loss": 0.0456, + "step": 281 + }, + { + "epoch": 0.18, + "grad_norm": 0.2885708510875702, + "learning_rate": 0.0002973865679280626, + "loss": 0.1066, + "step": 282 + }, + { + "epoch": 0.19, + "grad_norm": 0.2658590078353882, + "learning_rate": 0.00029736737270347517, + "loss": 0.0931, + "step": 283 + }, + { + "epoch": 0.19, + "grad_norm": 0.11531944572925568, + "learning_rate": 0.00029734810786776687, + "loss": 0.0238, + "step": 284 + }, + { + "epoch": 0.19, + "grad_norm": 0.0557803250849247, + "learning_rate": 0.00029732877343003776, + "loss": 0.0257, + "step": 285 + }, + { + "epoch": 0.19, + "grad_norm": 0.10880523920059204, + "learning_rate": 0.00029730936939942077, + "loss": 0.0387, + "step": 286 + }, + { + "epoch": 0.19, + "grad_norm": 0.09500639885663986, + "learning_rate": 0.0002972898957850816, + "loss": 0.0308, + "step": 287 + }, + { + "epoch": 0.19, + "grad_norm": 0.11504241824150085, + "learning_rate": 0.0002972703525962189, + "loss": 0.0292, + "step": 288 + }, + { + "epoch": 0.19, + "grad_norm": 0.10513140261173248, + "learning_rate": 0.0002972507398420643, + "loss": 0.0245, + "step": 289 + }, + { + "epoch": 0.19, + "grad_norm": 0.20218555629253387, + "learning_rate": 0.000297231057531882, + "loss": 0.0394, + "step": 290 + }, + { + "epoch": 0.19, + "grad_norm": 0.053536418825387955, + "learning_rate": 0.00029721130567496936, + "loss": 0.0129, + "step": 291 + }, + { + "epoch": 0.19, + "grad_norm": 0.15879443287849426, + "learning_rate": 0.0002971914842806564, + "loss": 0.054, + "step": 292 + }, + { + "epoch": 0.19, + "grad_norm": 0.11933678388595581, + "learning_rate": 0.00029717159335830606, + "loss": 0.0206, + "step": 293 + }, + { + "epoch": 0.19, + "grad_norm": 0.14436180889606476, + "learning_rate": 0.0002971516329173141, + "loss": 0.024, + "step": 294 + }, + { + "epoch": 0.19, + "grad_norm": 0.01978749968111515, + "learning_rate": 0.0002971316029671091, + "loss": 0.0047, + "step": 295 + }, + { + "epoch": 0.19, + "grad_norm": 0.1731237769126892, + "learning_rate": 0.00029711150351715253, + "loss": 0.0605, + "step": 296 + }, + { + "epoch": 0.19, + "grad_norm": 0.059307076036930084, + "learning_rate": 0.00029709133457693867, + "loss": 0.0308, + "step": 297 + }, + { + "epoch": 0.2, + "grad_norm": 0.3645476996898651, + "learning_rate": 0.00029707109615599456, + "loss": 0.0566, + "step": 298 + }, + { + "epoch": 0.2, + "grad_norm": 0.10670791566371918, + "learning_rate": 0.0002970507882638801, + "loss": 0.0234, + "step": 299 + }, + { + "epoch": 0.2, + "grad_norm": 0.10919758677482605, + "learning_rate": 0.0002970304109101881, + "loss": 0.0157, + "step": 300 + }, + { + "epoch": 0.2, + "grad_norm": 0.08173630386590958, + "learning_rate": 0.00029700996410454407, + "loss": 0.0371, + "step": 301 + }, + { + "epoch": 0.2, + "grad_norm": 0.13943839073181152, + "learning_rate": 0.00029698944785660635, + "loss": 0.0781, + "step": 302 + }, + { + "epoch": 0.2, + "grad_norm": 0.342821329832077, + "learning_rate": 0.00029696886217606605, + "loss": 0.0476, + "step": 303 + }, + { + "epoch": 0.2, + "grad_norm": 0.048615969717502594, + "learning_rate": 0.0002969482070726472, + "loss": 0.0083, + "step": 304 + }, + { + "epoch": 0.2, + "grad_norm": 0.1213599145412445, + "learning_rate": 0.0002969274825561064, + "loss": 0.0258, + "step": 305 + }, + { + "epoch": 0.2, + "grad_norm": 0.1914874166250229, + "learning_rate": 0.0002969066886362333, + "loss": 0.034, + "step": 306 + }, + { + "epoch": 0.2, + "grad_norm": 0.14067624509334564, + "learning_rate": 0.0002968858253228502, + "loss": 0.0395, + "step": 307 + }, + { + "epoch": 0.2, + "grad_norm": 0.08359983563423157, + "learning_rate": 0.00029686489262581217, + "loss": 0.0315, + "step": 308 + }, + { + "epoch": 0.2, + "grad_norm": 0.11551601439714432, + "learning_rate": 0.000296843890555007, + "loss": 0.058, + "step": 309 + }, + { + "epoch": 0.2, + "grad_norm": 0.12968787550926208, + "learning_rate": 0.00029682281912035545, + "loss": 0.0347, + "step": 310 + }, + { + "epoch": 0.2, + "grad_norm": 0.10182147473096848, + "learning_rate": 0.0002968016783318109, + "loss": 0.0165, + "step": 311 + }, + { + "epoch": 0.2, + "grad_norm": 0.06534916907548904, + "learning_rate": 0.00029678046819935934, + "loss": 0.0218, + "step": 312 + }, + { + "epoch": 0.2, + "grad_norm": 0.12587250769138336, + "learning_rate": 0.0002967591887330199, + "loss": 0.0498, + "step": 313 + }, + { + "epoch": 0.21, + "grad_norm": 0.06701786816120148, + "learning_rate": 0.0002967378399428441, + "loss": 0.0484, + "step": 314 + }, + { + "epoch": 0.21, + "grad_norm": 0.10836692154407501, + "learning_rate": 0.00029671642183891643, + "loss": 0.0412, + "step": 315 + }, + { + "epoch": 0.21, + "grad_norm": 0.061415113508701324, + "learning_rate": 0.00029669493443135403, + "loss": 0.0172, + "step": 316 + }, + { + "epoch": 0.21, + "grad_norm": 0.20760087668895721, + "learning_rate": 0.0002966733777303068, + "loss": 0.0494, + "step": 317 + }, + { + "epoch": 0.21, + "grad_norm": 0.11503862589597702, + "learning_rate": 0.00029665175174595736, + "loss": 0.0385, + "step": 318 + }, + { + "epoch": 0.21, + "grad_norm": 0.07366505265235901, + "learning_rate": 0.000296630056488521, + "loss": 0.0403, + "step": 319 + }, + { + "epoch": 0.21, + "grad_norm": 0.036951594054698944, + "learning_rate": 0.00029660829196824577, + "loss": 0.0092, + "step": 320 + }, + { + "epoch": 0.21, + "grad_norm": 0.08457314223051071, + "learning_rate": 0.0002965864581954126, + "loss": 0.0445, + "step": 321 + }, + { + "epoch": 0.21, + "grad_norm": 0.24513787031173706, + "learning_rate": 0.0002965645551803349, + "loss": 0.0716, + "step": 322 + }, + { + "epoch": 0.21, + "grad_norm": 0.08235831558704376, + "learning_rate": 0.00029654258293335887, + "loss": 0.029, + "step": 323 + }, + { + "epoch": 0.21, + "grad_norm": 0.08004003018140793, + "learning_rate": 0.00029652054146486344, + "loss": 0.0365, + "step": 324 + }, + { + "epoch": 0.21, + "grad_norm": 0.14928393065929413, + "learning_rate": 0.0002964984307852602, + "loss": 0.039, + "step": 325 + }, + { + "epoch": 0.21, + "grad_norm": 0.1802273988723755, + "learning_rate": 0.00029647625090499345, + "loss": 0.0324, + "step": 326 + }, + { + "epoch": 0.21, + "grad_norm": 0.18169750273227692, + "learning_rate": 0.00029645400183454026, + "loss": 0.0427, + "step": 327 + }, + { + "epoch": 0.21, + "grad_norm": 0.13121691346168518, + "learning_rate": 0.0002964316835844102, + "loss": 0.0274, + "step": 328 + }, + { + "epoch": 0.22, + "grad_norm": 0.27358877658843994, + "learning_rate": 0.0002964092961651456, + "loss": 0.0537, + "step": 329 + }, + { + "epoch": 0.22, + "grad_norm": 0.16992299258708954, + "learning_rate": 0.0002963868395873216, + "loss": 0.0797, + "step": 330 + }, + { + "epoch": 0.22, + "grad_norm": 0.2110740691423416, + "learning_rate": 0.0002963643138615458, + "loss": 0.0835, + "step": 331 + }, + { + "epoch": 0.22, + "grad_norm": 0.17114487290382385, + "learning_rate": 0.0002963417189984586, + "loss": 0.0619, + "step": 332 + }, + { + "epoch": 0.22, + "grad_norm": 0.09492560476064682, + "learning_rate": 0.000296319055008733, + "loss": 0.0212, + "step": 333 + }, + { + "epoch": 0.22, + "grad_norm": 0.19000209867954254, + "learning_rate": 0.0002962963219030746, + "loss": 0.0802, + "step": 334 + }, + { + "epoch": 0.22, + "grad_norm": 0.11632812023162842, + "learning_rate": 0.0002962735196922219, + "loss": 0.0426, + "step": 335 + }, + { + "epoch": 0.22, + "grad_norm": 0.15153561532497406, + "learning_rate": 0.0002962506483869456, + "loss": 0.07, + "step": 336 + }, + { + "epoch": 0.22, + "grad_norm": 0.0691797137260437, + "learning_rate": 0.00029622770799804944, + "loss": 0.0246, + "step": 337 + }, + { + "epoch": 0.22, + "grad_norm": 0.0731196403503418, + "learning_rate": 0.0002962046985363697, + "loss": 0.0413, + "step": 338 + }, + { + "epoch": 0.22, + "grad_norm": 0.1449161171913147, + "learning_rate": 0.00029618162001277513, + "loss": 0.023, + "step": 339 + }, + { + "epoch": 0.22, + "grad_norm": 0.13844870030879974, + "learning_rate": 0.0002961584724381672, + "loss": 0.055, + "step": 340 + }, + { + "epoch": 0.22, + "grad_norm": 0.08192728459835052, + "learning_rate": 0.00029613525582348007, + "loss": 0.0274, + "step": 341 + }, + { + "epoch": 0.22, + "grad_norm": 0.030294157564640045, + "learning_rate": 0.0002961119701796804, + "loss": 0.0332, + "step": 342 + }, + { + "epoch": 0.22, + "grad_norm": 0.12008962035179138, + "learning_rate": 0.0002960886155177675, + "loss": 0.0293, + "step": 343 + }, + { + "epoch": 0.23, + "grad_norm": 0.22829335927963257, + "learning_rate": 0.0002960651918487734, + "loss": 0.049, + "step": 344 + }, + { + "epoch": 0.23, + "grad_norm": 0.09662315249443054, + "learning_rate": 0.00029604169918376246, + "loss": 0.019, + "step": 345 + }, + { + "epoch": 0.23, + "grad_norm": 0.056000061333179474, + "learning_rate": 0.0002960181375338318, + "loss": 0.0077, + "step": 346 + }, + { + "epoch": 0.23, + "grad_norm": 0.04742419347167015, + "learning_rate": 0.00029599450691011116, + "loss": 0.0216, + "step": 347 + }, + { + "epoch": 0.23, + "grad_norm": 0.17151907086372375, + "learning_rate": 0.0002959708073237628, + "loss": 0.0364, + "step": 348 + }, + { + "epoch": 0.23, + "grad_norm": 0.3108668923377991, + "learning_rate": 0.00029594703878598155, + "loss": 0.0288, + "step": 349 + }, + { + "epoch": 0.23, + "grad_norm": 0.05538111925125122, + "learning_rate": 0.00029592320130799487, + "loss": 0.0048, + "step": 350 + }, + { + "epoch": 0.23, + "grad_norm": 0.2907853126525879, + "learning_rate": 0.00029589929490106263, + "loss": 0.0443, + "step": 351 + }, + { + "epoch": 0.23, + "grad_norm": 0.19189013540744781, + "learning_rate": 0.0002958753195764775, + "loss": 0.0688, + "step": 352 + }, + { + "epoch": 0.23, + "grad_norm": 0.3744778037071228, + "learning_rate": 0.00029585127534556446, + "loss": 0.0726, + "step": 353 + }, + { + "epoch": 0.23, + "grad_norm": 0.02139083668589592, + "learning_rate": 0.00029582716221968124, + "loss": 0.003, + "step": 354 + }, + { + "epoch": 0.23, + "grad_norm": 0.3209889531135559, + "learning_rate": 0.00029580298021021796, + "loss": 0.068, + "step": 355 + }, + { + "epoch": 0.23, + "grad_norm": 0.13530127704143524, + "learning_rate": 0.0002957787293285974, + "loss": 0.0229, + "step": 356 + }, + { + "epoch": 0.23, + "grad_norm": 0.04955355450510979, + "learning_rate": 0.00029575440958627485, + "loss": 0.007, + "step": 357 + }, + { + "epoch": 0.23, + "grad_norm": 0.05992133542895317, + "learning_rate": 0.0002957300209947379, + "loss": 0.014, + "step": 358 + }, + { + "epoch": 0.24, + "grad_norm": 0.08975626528263092, + "learning_rate": 0.0002957055635655071, + "loss": 0.0419, + "step": 359 + }, + { + "epoch": 0.24, + "grad_norm": 0.3397723436355591, + "learning_rate": 0.00029568103731013513, + "loss": 0.093, + "step": 360 + }, + { + "epoch": 0.24, + "grad_norm": 0.05291612446308136, + "learning_rate": 0.00029565644224020733, + "loss": 0.0137, + "step": 361 + }, + { + "epoch": 0.24, + "grad_norm": 0.16154609620571136, + "learning_rate": 0.0002956317783673416, + "loss": 0.0414, + "step": 362 + }, + { + "epoch": 0.24, + "grad_norm": 0.12861596047878265, + "learning_rate": 0.0002956070457031882, + "loss": 0.0372, + "step": 363 + }, + { + "epoch": 0.24, + "grad_norm": 0.09462448954582214, + "learning_rate": 0.00029558224425943003, + "loss": 0.0292, + "step": 364 + }, + { + "epoch": 0.24, + "grad_norm": 0.14290063083171844, + "learning_rate": 0.00029555737404778233, + "loss": 0.0572, + "step": 365 + }, + { + "epoch": 0.24, + "grad_norm": 0.11055822670459747, + "learning_rate": 0.00029553243507999307, + "loss": 0.0372, + "step": 366 + }, + { + "epoch": 0.24, + "grad_norm": 0.10231087356805801, + "learning_rate": 0.00029550742736784237, + "loss": 0.0368, + "step": 367 + }, + { + "epoch": 0.24, + "grad_norm": 0.09969429671764374, + "learning_rate": 0.00029548235092314304, + "loss": 0.0416, + "step": 368 + }, + { + "epoch": 0.24, + "grad_norm": 0.1207612007856369, + "learning_rate": 0.00029545720575774033, + "loss": 0.0307, + "step": 369 + }, + { + "epoch": 0.24, + "grad_norm": 0.11535090953111649, + "learning_rate": 0.0002954319918835119, + "loss": 0.0296, + "step": 370 + }, + { + "epoch": 0.24, + "grad_norm": 0.1460224986076355, + "learning_rate": 0.00029540670931236786, + "loss": 0.0587, + "step": 371 + }, + { + "epoch": 0.24, + "grad_norm": 0.10432720184326172, + "learning_rate": 0.0002953813580562509, + "loss": 0.0397, + "step": 372 + }, + { + "epoch": 0.24, + "grad_norm": 0.2140846997499466, + "learning_rate": 0.0002953559381271359, + "loss": 0.0538, + "step": 373 + }, + { + "epoch": 0.24, + "grad_norm": 0.12050808221101761, + "learning_rate": 0.00029533044953703044, + "loss": 0.0439, + "step": 374 + }, + { + "epoch": 0.25, + "grad_norm": 0.07928888499736786, + "learning_rate": 0.0002953048922979744, + "loss": 0.0163, + "step": 375 + }, + { + "epoch": 0.25, + "grad_norm": 0.08733994513750076, + "learning_rate": 0.0002952792664220402, + "loss": 0.0219, + "step": 376 + }, + { + "epoch": 0.25, + "grad_norm": 0.18080447614192963, + "learning_rate": 0.0002952535719213325, + "loss": 0.0469, + "step": 377 + }, + { + "epoch": 0.25, + "grad_norm": 0.08348793536424637, + "learning_rate": 0.0002952278088079884, + "loss": 0.035, + "step": 378 + }, + { + "epoch": 0.25, + "grad_norm": 0.1347195953130722, + "learning_rate": 0.00029520197709417763, + "loss": 0.029, + "step": 379 + }, + { + "epoch": 0.25, + "grad_norm": 0.11075679957866669, + "learning_rate": 0.0002951760767921021, + "loss": 0.0257, + "step": 380 + }, + { + "epoch": 0.25, + "grad_norm": 0.13172994554042816, + "learning_rate": 0.0002951501079139962, + "loss": 0.0302, + "step": 381 + }, + { + "epoch": 0.25, + "grad_norm": 0.114262655377388, + "learning_rate": 0.0002951240704721267, + "loss": 0.0492, + "step": 382 + }, + { + "epoch": 0.25, + "eval_loss": 0.034534960985183716, + "eval_runtime": 39.6959, + "eval_samples_per_second": 32.421, + "eval_steps_per_second": 8.112, + "step": 382 + }, + { + "epoch": 0.25, + "grad_norm": 0.08364730328321457, + "learning_rate": 0.0002950979644787928, + "loss": 0.0185, + "step": 383 + }, + { + "epoch": 0.25, + "grad_norm": 0.16603770852088928, + "learning_rate": 0.000295071789946326, + "loss": 0.0443, + "step": 384 + }, + { + "epoch": 0.25, + "grad_norm": 0.1269228458404541, + "learning_rate": 0.00029504554688709027, + "loss": 0.0217, + "step": 385 + }, + { + "epoch": 0.25, + "grad_norm": 0.15612861514091492, + "learning_rate": 0.0002950192353134819, + "loss": 0.0377, + "step": 386 + }, + { + "epoch": 0.25, + "grad_norm": 0.056646961718797684, + "learning_rate": 0.00029499285523792946, + "loss": 0.0133, + "step": 387 + }, + { + "epoch": 0.25, + "grad_norm": 0.23394975066184998, + "learning_rate": 0.000294966406672894, + "loss": 0.0767, + "step": 388 + }, + { + "epoch": 0.25, + "grad_norm": 0.21382953226566315, + "learning_rate": 0.00029493988963086895, + "loss": 0.0729, + "step": 389 + }, + { + "epoch": 0.26, + "grad_norm": 0.27641353011131287, + "learning_rate": 0.00029491330412438, + "loss": 0.1022, + "step": 390 + }, + { + "epoch": 0.26, + "grad_norm": 0.0760459303855896, + "learning_rate": 0.0002948866501659852, + "loss": 0.0269, + "step": 391 + }, + { + "epoch": 0.26, + "grad_norm": 0.5418729186058044, + "learning_rate": 0.0002948599277682748, + "loss": 0.1523, + "step": 392 + }, + { + "epoch": 0.26, + "grad_norm": 0.13234178721904755, + "learning_rate": 0.00029483313694387165, + "loss": 0.0292, + "step": 393 + }, + { + "epoch": 0.26, + "grad_norm": 0.07174021750688553, + "learning_rate": 0.00029480627770543086, + "loss": 0.0395, + "step": 394 + }, + { + "epoch": 0.26, + "grad_norm": 0.09958759695291519, + "learning_rate": 0.00029477935006563957, + "loss": 0.0559, + "step": 395 + }, + { + "epoch": 0.26, + "grad_norm": 0.07592346519231796, + "learning_rate": 0.00029475235403721763, + "loss": 0.0488, + "step": 396 + }, + { + "epoch": 0.26, + "grad_norm": 0.10129998624324799, + "learning_rate": 0.00029472528963291685, + "loss": 0.0287, + "step": 397 + }, + { + "epoch": 0.26, + "grad_norm": 0.08051212131977081, + "learning_rate": 0.00029469815686552163, + "loss": 0.0386, + "step": 398 + }, + { + "epoch": 0.26, + "grad_norm": 0.0695783942937851, + "learning_rate": 0.0002946709557478485, + "loss": 0.0201, + "step": 399 + }, + { + "epoch": 0.26, + "grad_norm": 0.1511554718017578, + "learning_rate": 0.00029464368629274624, + "loss": 0.0464, + "step": 400 + }, + { + "epoch": 0.26, + "grad_norm": 0.075484499335289, + "learning_rate": 0.00029461634851309597, + "loss": 0.031, + "step": 401 + }, + { + "epoch": 0.26, + "grad_norm": 0.08108027279376984, + "learning_rate": 0.00029458894242181114, + "loss": 0.0271, + "step": 402 + }, + { + "epoch": 0.26, + "grad_norm": 0.07254958897829056, + "learning_rate": 0.00029456146803183745, + "loss": 0.0187, + "step": 403 + }, + { + "epoch": 0.26, + "grad_norm": 0.215089812874794, + "learning_rate": 0.00029453392535615274, + "loss": 0.0463, + "step": 404 + }, + { + "epoch": 0.27, + "grad_norm": 0.034637995064258575, + "learning_rate": 0.0002945063144077672, + "loss": 0.0084, + "step": 405 + }, + { + "epoch": 0.27, + "grad_norm": 0.12073606252670288, + "learning_rate": 0.00029447863519972337, + "loss": 0.0401, + "step": 406 + }, + { + "epoch": 0.27, + "grad_norm": 0.13762198388576508, + "learning_rate": 0.00029445088774509583, + "loss": 0.0244, + "step": 407 + }, + { + "epoch": 0.27, + "grad_norm": 0.2537041902542114, + "learning_rate": 0.00029442307205699154, + "loss": 0.0574, + "step": 408 + }, + { + "epoch": 0.27, + "grad_norm": 0.1401953399181366, + "learning_rate": 0.00029439518814854956, + "loss": 0.0202, + "step": 409 + }, + { + "epoch": 0.27, + "grad_norm": 0.13872119784355164, + "learning_rate": 0.0002943672360329413, + "loss": 0.0373, + "step": 410 + }, + { + "epoch": 0.27, + "grad_norm": 0.3436320126056671, + "learning_rate": 0.00029433921572337044, + "loss": 0.0944, + "step": 411 + }, + { + "epoch": 0.27, + "grad_norm": 0.20004349946975708, + "learning_rate": 0.00029431112723307266, + "loss": 0.0625, + "step": 412 + }, + { + "epoch": 0.27, + "grad_norm": 0.10176026076078415, + "learning_rate": 0.00029428297057531607, + "loss": 0.023, + "step": 413 + }, + { + "epoch": 0.27, + "grad_norm": 0.08603208512067795, + "learning_rate": 0.0002942547457634008, + "loss": 0.0141, + "step": 414 + }, + { + "epoch": 0.27, + "grad_norm": 0.03601311519742012, + "learning_rate": 0.0002942264528106592, + "loss": 0.0071, + "step": 415 + }, + { + "epoch": 0.27, + "grad_norm": 0.1434870958328247, + "learning_rate": 0.000294198091730456, + "loss": 0.0362, + "step": 416 + }, + { + "epoch": 0.27, + "grad_norm": 0.1505521684885025, + "learning_rate": 0.0002941696625361879, + "loss": 0.0211, + "step": 417 + }, + { + "epoch": 0.27, + "grad_norm": 0.14390698075294495, + "learning_rate": 0.0002941411652412838, + "loss": 0.054, + "step": 418 + }, + { + "epoch": 0.27, + "grad_norm": 0.21683859825134277, + "learning_rate": 0.00029411259985920486, + "loss": 0.0482, + "step": 419 + }, + { + "epoch": 0.27, + "grad_norm": 0.12036791443824768, + "learning_rate": 0.0002940839664034444, + "loss": 0.0444, + "step": 420 + }, + { + "epoch": 0.28, + "grad_norm": 0.09479566663503647, + "learning_rate": 0.00029405526488752775, + "loss": 0.035, + "step": 421 + }, + { + "epoch": 0.28, + "grad_norm": 0.14229558408260345, + "learning_rate": 0.0002940264953250125, + "loss": 0.0573, + "step": 422 + }, + { + "epoch": 0.28, + "grad_norm": 0.22773970663547516, + "learning_rate": 0.00029399765772948844, + "loss": 0.061, + "step": 423 + }, + { + "epoch": 0.28, + "grad_norm": 0.11387961357831955, + "learning_rate": 0.0002939687521145774, + "loss": 0.057, + "step": 424 + }, + { + "epoch": 0.28, + "grad_norm": 0.1798745095729828, + "learning_rate": 0.00029393977849393333, + "loss": 0.0392, + "step": 425 + }, + { + "epoch": 0.28, + "grad_norm": 0.07203508168458939, + "learning_rate": 0.0002939107368812424, + "loss": 0.0152, + "step": 426 + }, + { + "epoch": 0.28, + "grad_norm": 0.04569177329540253, + "learning_rate": 0.0002938816272902228, + "loss": 0.0113, + "step": 427 + }, + { + "epoch": 0.28, + "grad_norm": 0.0927419438958168, + "learning_rate": 0.0002938524497346249, + "loss": 0.0246, + "step": 428 + }, + { + "epoch": 0.28, + "grad_norm": 0.16807597875595093, + "learning_rate": 0.0002938232042282311, + "loss": 0.0364, + "step": 429 + }, + { + "epoch": 0.28, + "grad_norm": 0.12006795406341553, + "learning_rate": 0.00029379389078485596, + "loss": 0.0118, + "step": 430 + }, + { + "epoch": 0.28, + "grad_norm": 0.0377679318189621, + "learning_rate": 0.0002937645094183461, + "loss": 0.0063, + "step": 431 + }, + { + "epoch": 0.28, + "grad_norm": 0.27051666378974915, + "learning_rate": 0.00029373506014258025, + "loss": 0.0682, + "step": 432 + }, + { + "epoch": 0.28, + "grad_norm": 0.228448748588562, + "learning_rate": 0.0002937055429714692, + "loss": 0.0733, + "step": 433 + }, + { + "epoch": 0.28, + "grad_norm": 0.18427824974060059, + "learning_rate": 0.00029367595791895577, + "loss": 0.0338, + "step": 434 + }, + { + "epoch": 0.28, + "grad_norm": 0.25813257694244385, + "learning_rate": 0.00029364630499901503, + "loss": 0.0323, + "step": 435 + }, + { + "epoch": 0.29, + "grad_norm": 0.17406705021858215, + "learning_rate": 0.0002936165842256538, + "loss": 0.0398, + "step": 436 + }, + { + "epoch": 0.29, + "grad_norm": 0.5199068188667297, + "learning_rate": 0.0002935867956129112, + "loss": 0.0486, + "step": 437 + }, + { + "epoch": 0.29, + "grad_norm": 0.3251938223838806, + "learning_rate": 0.0002935569391748583, + "loss": 0.049, + "step": 438 + }, + { + "epoch": 0.29, + "grad_norm": 0.057003892958164215, + "learning_rate": 0.00029352701492559827, + "loss": 0.0114, + "step": 439 + }, + { + "epoch": 0.29, + "grad_norm": 0.15188859403133392, + "learning_rate": 0.00029349702287926623, + "loss": 0.0323, + "step": 440 + }, + { + "epoch": 0.29, + "grad_norm": 0.17942048609256744, + "learning_rate": 0.0002934669630500293, + "loss": 0.0437, + "step": 441 + }, + { + "epoch": 0.29, + "grad_norm": 0.06396406143903732, + "learning_rate": 0.0002934368354520867, + "loss": 0.0097, + "step": 442 + }, + { + "epoch": 0.29, + "grad_norm": 0.1496248096227646, + "learning_rate": 0.00029340664009966974, + "loss": 0.0316, + "step": 443 + }, + { + "epoch": 0.29, + "grad_norm": 0.0654374286532402, + "learning_rate": 0.00029337637700704156, + "loss": 0.0083, + "step": 444 + }, + { + "epoch": 0.29, + "grad_norm": 0.04386695846915245, + "learning_rate": 0.0002933460461884973, + "loss": 0.0094, + "step": 445 + }, + { + "epoch": 0.29, + "grad_norm": 0.14928901195526123, + "learning_rate": 0.0002933156476583643, + "loss": 0.0484, + "step": 446 + }, + { + "epoch": 0.29, + "grad_norm": 0.12666364014148712, + "learning_rate": 0.0002932851814310017, + "loss": 0.0148, + "step": 447 + }, + { + "epoch": 0.29, + "grad_norm": 0.023791933432221413, + "learning_rate": 0.0002932546475208006, + "loss": 0.003, + "step": 448 + }, + { + "epoch": 0.29, + "grad_norm": 0.022256718948483467, + "learning_rate": 0.0002932240459421842, + "loss": 0.0044, + "step": 449 + }, + { + "epoch": 0.29, + "grad_norm": 0.12194914370775223, + "learning_rate": 0.0002931933767096076, + "loss": 0.009, + "step": 450 + }, + { + "epoch": 0.3, + "grad_norm": 0.29687178134918213, + "learning_rate": 0.0002931626398375578, + "loss": 0.0691, + "step": 451 + }, + { + "epoch": 0.3, + "grad_norm": 0.24758018553256989, + "learning_rate": 0.00029313183534055386, + "loss": 0.0589, + "step": 452 + }, + { + "epoch": 0.3, + "grad_norm": 0.10298270732164383, + "learning_rate": 0.0002931009632331468, + "loss": 0.0187, + "step": 453 + }, + { + "epoch": 0.3, + "grad_norm": 0.1447860449552536, + "learning_rate": 0.00029307002352991937, + "loss": 0.0297, + "step": 454 + }, + { + "epoch": 0.3, + "grad_norm": 0.2590334117412567, + "learning_rate": 0.00029303901624548644, + "loss": 0.0892, + "step": 455 + }, + { + "epoch": 0.3, + "grad_norm": 0.07339983433485031, + "learning_rate": 0.00029300794139449477, + "loss": 0.0249, + "step": 456 + }, + { + "epoch": 0.3, + "grad_norm": 0.16213186085224152, + "learning_rate": 0.000292976798991623, + "loss": 0.0493, + "step": 457 + }, + { + "epoch": 0.3, + "grad_norm": 0.03418932110071182, + "learning_rate": 0.0002929455890515818, + "loss": 0.0066, + "step": 458 + }, + { + "epoch": 0.3, + "grad_norm": 0.18771564960479736, + "learning_rate": 0.0002929143115891134, + "loss": 0.03, + "step": 459 + }, + { + "epoch": 0.3, + "grad_norm": 0.13976161181926727, + "learning_rate": 0.00029288296661899243, + "loss": 0.0451, + "step": 460 + }, + { + "epoch": 0.3, + "grad_norm": 0.07075387239456177, + "learning_rate": 0.00029285155415602495, + "loss": 0.0201, + "step": 461 + }, + { + "epoch": 0.3, + "grad_norm": 0.1304980367422104, + "learning_rate": 0.0002928200742150492, + "loss": 0.0286, + "step": 462 + }, + { + "epoch": 0.3, + "grad_norm": 0.06026493385434151, + "learning_rate": 0.00029278852681093514, + "loss": 0.0159, + "step": 463 + }, + { + "epoch": 0.3, + "grad_norm": 0.08018484711647034, + "learning_rate": 0.0002927569119585847, + "loss": 0.0333, + "step": 464 + }, + { + "epoch": 0.3, + "grad_norm": 0.21171532571315765, + "learning_rate": 0.0002927252296729315, + "loss": 0.034, + "step": 465 + }, + { + "epoch": 0.31, + "grad_norm": 0.14055241644382477, + "learning_rate": 0.0002926934799689413, + "loss": 0.0504, + "step": 466 + }, + { + "epoch": 0.31, + "grad_norm": 0.17434647679328918, + "learning_rate": 0.0002926616628616113, + "loss": 0.0519, + "step": 467 + }, + { + "epoch": 0.31, + "grad_norm": 0.12710362672805786, + "learning_rate": 0.00029262977836597105, + "loss": 0.0154, + "step": 468 + }, + { + "epoch": 0.31, + "grad_norm": 0.16046389937400818, + "learning_rate": 0.0002925978264970814, + "loss": 0.0398, + "step": 469 + }, + { + "epoch": 0.31, + "grad_norm": 0.23207533359527588, + "learning_rate": 0.00029256580727003543, + "loss": 0.0562, + "step": 470 + }, + { + "epoch": 0.31, + "grad_norm": 0.29609429836273193, + "learning_rate": 0.0002925337206999579, + "loss": 0.137, + "step": 471 + }, + { + "epoch": 0.31, + "grad_norm": 0.15176476538181305, + "learning_rate": 0.00029250156680200526, + "loss": 0.025, + "step": 472 + }, + { + "epoch": 0.31, + "grad_norm": 0.14394959807395935, + "learning_rate": 0.00029246934559136597, + "loss": 0.0519, + "step": 473 + }, + { + "epoch": 0.31, + "grad_norm": 0.08391053229570389, + "learning_rate": 0.00029243705708326015, + "loss": 0.0184, + "step": 474 + }, + { + "epoch": 0.31, + "grad_norm": 0.09384860098361969, + "learning_rate": 0.00029240470129293975, + "loss": 0.0229, + "step": 475 + }, + { + "epoch": 0.31, + "grad_norm": 0.12083159387111664, + "learning_rate": 0.00029237227823568845, + "loss": 0.0219, + "step": 476 + }, + { + "epoch": 0.31, + "grad_norm": 0.19567762315273285, + "learning_rate": 0.0002923397879268218, + "loss": 0.0728, + "step": 477 + }, + { + "epoch": 0.31, + "grad_norm": 0.07342015206813812, + "learning_rate": 0.0002923072303816871, + "loss": 0.0412, + "step": 478 + }, + { + "epoch": 0.31, + "grad_norm": 0.06717100739479065, + "learning_rate": 0.00029227460561566333, + "loss": 0.0309, + "step": 479 + }, + { + "epoch": 0.31, + "grad_norm": 0.09244221448898315, + "learning_rate": 0.0002922419136441613, + "loss": 0.0508, + "step": 480 + }, + { + "epoch": 0.31, + "grad_norm": 0.052494604140520096, + "learning_rate": 0.0002922091544826235, + "loss": 0.0319, + "step": 481 + }, + { + "epoch": 0.32, + "grad_norm": 0.14286155998706818, + "learning_rate": 0.00029217632814652417, + "loss": 0.0654, + "step": 482 + }, + { + "epoch": 0.32, + "grad_norm": 0.06442811340093613, + "learning_rate": 0.00029214343465136945, + "loss": 0.0132, + "step": 483 + }, + { + "epoch": 0.32, + "grad_norm": 0.05420248210430145, + "learning_rate": 0.0002921104740126969, + "loss": 0.0115, + "step": 484 + }, + { + "epoch": 0.32, + "grad_norm": 0.04951406642794609, + "learning_rate": 0.0002920774462460761, + "loss": 0.0086, + "step": 485 + }, + { + "epoch": 0.32, + "grad_norm": 0.08321358263492584, + "learning_rate": 0.00029204435136710803, + "loss": 0.0445, + "step": 486 + }, + { + "epoch": 0.32, + "grad_norm": 0.11665898561477661, + "learning_rate": 0.0002920111893914257, + "loss": 0.0262, + "step": 487 + }, + { + "epoch": 0.32, + "grad_norm": 0.1829105019569397, + "learning_rate": 0.00029197796033469356, + "loss": 0.0308, + "step": 488 + }, + { + "epoch": 0.32, + "grad_norm": 0.20940159261226654, + "learning_rate": 0.00029194466421260786, + "loss": 0.0299, + "step": 489 + }, + { + "epoch": 0.32, + "grad_norm": 0.20697347819805145, + "learning_rate": 0.0002919113010408965, + "loss": 0.0405, + "step": 490 + }, + { + "epoch": 0.32, + "grad_norm": 0.051994968205690384, + "learning_rate": 0.000291877870835319, + "loss": 0.01, + "step": 491 + }, + { + "epoch": 0.32, + "grad_norm": 0.1463523805141449, + "learning_rate": 0.00029184437361166676, + "loss": 0.0555, + "step": 492 + }, + { + "epoch": 0.32, + "grad_norm": 0.09110219031572342, + "learning_rate": 0.00029181080938576255, + "loss": 0.0371, + "step": 493 + }, + { + "epoch": 0.32, + "grad_norm": 0.04076121374964714, + "learning_rate": 0.00029177717817346097, + "loss": 0.0065, + "step": 494 + }, + { + "epoch": 0.32, + "grad_norm": 0.11555450409650803, + "learning_rate": 0.0002917434799906482, + "loss": 0.0115, + "step": 495 + }, + { + "epoch": 0.32, + "grad_norm": 0.15579824149608612, + "learning_rate": 0.0002917097148532421, + "loss": 0.0332, + "step": 496 + }, + { + "epoch": 0.33, + "grad_norm": 0.41938668489456177, + "learning_rate": 0.000291675882777192, + "loss": 0.0678, + "step": 497 + }, + { + "epoch": 0.33, + "grad_norm": 0.16764874756336212, + "learning_rate": 0.0002916419837784791, + "loss": 0.0683, + "step": 498 + }, + { + "epoch": 0.33, + "grad_norm": 0.1291145384311676, + "learning_rate": 0.00029160801787311613, + "loss": 0.0376, + "step": 499 + }, + { + "epoch": 0.33, + "grad_norm": 0.06120933219790459, + "learning_rate": 0.0002915739850771472, + "loss": 0.0307, + "step": 500 + }, + { + "epoch": 0.33, + "grad_norm": 0.09218423068523407, + "learning_rate": 0.0002915398854066483, + "loss": 0.0545, + "step": 501 + }, + { + "epoch": 0.33, + "grad_norm": 0.12664952874183655, + "learning_rate": 0.00029150571887772694, + "loss": 0.0274, + "step": 502 + }, + { + "epoch": 0.33, + "grad_norm": 0.0705379918217659, + "learning_rate": 0.0002914714855065221, + "loss": 0.0198, + "step": 503 + }, + { + "epoch": 0.33, + "grad_norm": 0.03559693694114685, + "learning_rate": 0.00029143718530920447, + "loss": 0.0114, + "step": 504 + }, + { + "epoch": 0.33, + "grad_norm": 0.051283448934555054, + "learning_rate": 0.0002914028183019762, + "loss": 0.0327, + "step": 505 + }, + { + "epoch": 0.33, + "grad_norm": 0.12527117133140564, + "learning_rate": 0.0002913683845010711, + "loss": 0.0316, + "step": 506 + }, + { + "epoch": 0.33, + "grad_norm": 0.0627032071352005, + "learning_rate": 0.0002913338839227544, + "loss": 0.0185, + "step": 507 + }, + { + "epoch": 0.33, + "grad_norm": 0.07235468178987503, + "learning_rate": 0.000291299316583323, + "loss": 0.0605, + "step": 508 + }, + { + "epoch": 0.33, + "grad_norm": 0.07697612792253494, + "learning_rate": 0.0002912646824991053, + "loss": 0.031, + "step": 509 + }, + { + "epoch": 0.33, + "grad_norm": 0.08240342885255814, + "learning_rate": 0.0002912299816864612, + "loss": 0.0211, + "step": 510 + }, + { + "epoch": 0.33, + "grad_norm": 0.07725581526756287, + "learning_rate": 0.0002911952141617821, + "loss": 0.0311, + "step": 511 + }, + { + "epoch": 0.34, + "grad_norm": 0.14777988195419312, + "learning_rate": 0.000291160379941491, + "loss": 0.038, + "step": 512 + }, + { + "epoch": 0.34, + "grad_norm": 0.11423151195049286, + "learning_rate": 0.0002911254790420423, + "loss": 0.0594, + "step": 513 + }, + { + "epoch": 0.34, + "grad_norm": 0.07308260351419449, + "learning_rate": 0.000291090511479922, + "loss": 0.0416, + "step": 514 + }, + { + "epoch": 0.34, + "grad_norm": 0.11171098798513412, + "learning_rate": 0.00029105547727164747, + "loss": 0.0509, + "step": 515 + }, + { + "epoch": 0.34, + "grad_norm": 0.29647496342658997, + "learning_rate": 0.00029102037643376764, + "loss": 0.0421, + "step": 516 + }, + { + "epoch": 0.34, + "grad_norm": 0.08812320232391357, + "learning_rate": 0.00029098520898286303, + "loss": 0.0559, + "step": 517 + }, + { + "epoch": 0.34, + "grad_norm": 0.13493718206882477, + "learning_rate": 0.00029094997493554525, + "loss": 0.0257, + "step": 518 + }, + { + "epoch": 0.34, + "grad_norm": 0.1292780339717865, + "learning_rate": 0.0002909146743084579, + "loss": 0.0699, + "step": 519 + }, + { + "epoch": 0.34, + "grad_norm": 0.03736162185668945, + "learning_rate": 0.0002908793071182755, + "loss": 0.0113, + "step": 520 + }, + { + "epoch": 0.34, + "grad_norm": 0.20628990232944489, + "learning_rate": 0.00029084387338170435, + "loss": 0.1039, + "step": 521 + }, + { + "epoch": 0.34, + "grad_norm": 0.13702163100242615, + "learning_rate": 0.0002908083731154821, + "loss": 0.0715, + "step": 522 + }, + { + "epoch": 0.34, + "grad_norm": 0.10376426577568054, + "learning_rate": 0.0002907728063363779, + "loss": 0.0566, + "step": 523 + }, + { + "epoch": 0.34, + "grad_norm": 0.03796597197651863, + "learning_rate": 0.00029073717306119206, + "loss": 0.0131, + "step": 524 + }, + { + "epoch": 0.34, + "grad_norm": 0.12588168680667877, + "learning_rate": 0.0002907014733067566, + "loss": 0.0754, + "step": 525 + }, + { + "epoch": 0.34, + "grad_norm": 0.18614119291305542, + "learning_rate": 0.00029066570708993474, + "loss": 0.0839, + "step": 526 + }, + { + "epoch": 0.35, + "grad_norm": 0.08624828606843948, + "learning_rate": 0.0002906298744276212, + "loss": 0.0519, + "step": 527 + }, + { + "epoch": 0.35, + "grad_norm": 0.09907104074954987, + "learning_rate": 0.00029059397533674216, + "loss": 0.0554, + "step": 528 + }, + { + "epoch": 0.35, + "grad_norm": 0.05135316029191017, + "learning_rate": 0.00029055800983425494, + "loss": 0.0374, + "step": 529 + }, + { + "epoch": 0.35, + "grad_norm": 0.10954371839761734, + "learning_rate": 0.00029052197793714844, + "loss": 0.03, + "step": 530 + }, + { + "epoch": 0.35, + "grad_norm": 0.13733310997486115, + "learning_rate": 0.0002904858796624428, + "loss": 0.0345, + "step": 531 + }, + { + "epoch": 0.35, + "grad_norm": 0.09171781688928604, + "learning_rate": 0.00029044971502718966, + "loss": 0.0285, + "step": 532 + }, + { + "epoch": 0.35, + "grad_norm": 0.08643066138029099, + "learning_rate": 0.00029041348404847177, + "loss": 0.0225, + "step": 533 + }, + { + "epoch": 0.35, + "grad_norm": 0.3179713487625122, + "learning_rate": 0.00029037718674340343, + "loss": 0.1167, + "step": 534 + }, + { + "epoch": 0.35, + "grad_norm": 0.09737833589315414, + "learning_rate": 0.0002903408231291303, + "loss": 0.047, + "step": 535 + }, + { + "epoch": 0.35, + "grad_norm": 0.15587852895259857, + "learning_rate": 0.00029030439322282904, + "loss": 0.0406, + "step": 536 + }, + { + "epoch": 0.35, + "grad_norm": 0.07560009509325027, + "learning_rate": 0.0002902678970417081, + "loss": 0.0387, + "step": 537 + }, + { + "epoch": 0.35, + "grad_norm": 0.12732967734336853, + "learning_rate": 0.00029023133460300677, + "loss": 0.0434, + "step": 538 + }, + { + "epoch": 0.35, + "grad_norm": 0.06021510064601898, + "learning_rate": 0.00029019470592399593, + "loss": 0.0149, + "step": 539 + }, + { + "epoch": 0.35, + "grad_norm": 0.09609080851078033, + "learning_rate": 0.0002901580110219777, + "loss": 0.0203, + "step": 540 + }, + { + "epoch": 0.35, + "grad_norm": 0.1442640721797943, + "learning_rate": 0.0002901212499142854, + "loss": 0.0345, + "step": 541 + }, + { + "epoch": 0.35, + "grad_norm": 0.15236537158489227, + "learning_rate": 0.0002900844226182837, + "loss": 0.041, + "step": 542 + }, + { + "epoch": 0.36, + "grad_norm": 0.14138057827949524, + "learning_rate": 0.00029004752915136854, + "loss": 0.0413, + "step": 543 + }, + { + "epoch": 0.36, + "grad_norm": 0.16659876704216003, + "learning_rate": 0.000290010569530967, + "loss": 0.0202, + "step": 544 + }, + { + "epoch": 0.36, + "grad_norm": 0.16970619559288025, + "learning_rate": 0.0002899735437745376, + "loss": 0.0373, + "step": 545 + }, + { + "epoch": 0.36, + "grad_norm": 0.044596217572689056, + "learning_rate": 0.00028993645189956987, + "loss": 0.0202, + "step": 546 + }, + { + "epoch": 0.36, + "grad_norm": 0.07182051986455917, + "learning_rate": 0.00028989929392358484, + "loss": 0.0137, + "step": 547 + }, + { + "epoch": 0.36, + "grad_norm": 0.2593410313129425, + "learning_rate": 0.0002898620698641345, + "loss": 0.0373, + "step": 548 + }, + { + "epoch": 0.36, + "grad_norm": 0.17339394986629486, + "learning_rate": 0.0002898247797388023, + "loss": 0.0217, + "step": 549 + }, + { + "epoch": 0.36, + "grad_norm": 0.13247337937355042, + "learning_rate": 0.00028978742356520256, + "loss": 0.0621, + "step": 550 + }, + { + "epoch": 0.36, + "grad_norm": 0.04582560807466507, + "learning_rate": 0.00028975000136098123, + "loss": 0.0051, + "step": 551 + }, + { + "epoch": 0.36, + "grad_norm": 0.04409830644726753, + "learning_rate": 0.0002897125131438151, + "loss": 0.0042, + "step": 552 + }, + { + "epoch": 0.36, + "grad_norm": 0.11188169568777084, + "learning_rate": 0.0002896749589314123, + "loss": 0.0307, + "step": 553 + }, + { + "epoch": 0.36, + "grad_norm": 0.10103113949298859, + "learning_rate": 0.00028963733874151225, + "loss": 0.0132, + "step": 554 + }, + { + "epoch": 0.36, + "grad_norm": 0.13099652528762817, + "learning_rate": 0.0002895996525918852, + "loss": 0.0348, + "step": 555 + }, + { + "epoch": 0.36, + "grad_norm": 0.07826762646436691, + "learning_rate": 0.0002895619005003328, + "loss": 0.0232, + "step": 556 + }, + { + "epoch": 0.36, + "grad_norm": 0.053435299545526505, + "learning_rate": 0.00028952408248468785, + "loss": 0.0113, + "step": 557 + }, + { + "epoch": 0.37, + "grad_norm": 0.07408218830823898, + "learning_rate": 0.00028948619856281423, + "loss": 0.0099, + "step": 558 + }, + { + "epoch": 0.37, + "grad_norm": 0.08491642028093338, + "learning_rate": 0.00028944824875260693, + "loss": 0.0122, + "step": 559 + }, + { + "epoch": 0.37, + "grad_norm": 0.0294903963804245, + "learning_rate": 0.00028941023307199214, + "loss": 0.0044, + "step": 560 + }, + { + "epoch": 0.37, + "grad_norm": 0.16142538189888, + "learning_rate": 0.000289372151538927, + "loss": 0.0721, + "step": 561 + }, + { + "epoch": 0.37, + "grad_norm": 0.11368390917778015, + "learning_rate": 0.0002893340041714, + "loss": 0.0109, + "step": 562 + }, + { + "epoch": 0.37, + "grad_norm": 0.1799473911523819, + "learning_rate": 0.0002892957909874306, + "loss": 0.0487, + "step": 563 + }, + { + "epoch": 0.37, + "grad_norm": 0.1448475420475006, + "learning_rate": 0.0002892575120050693, + "loss": 0.0601, + "step": 564 + }, + { + "epoch": 0.37, + "grad_norm": 0.07079991698265076, + "learning_rate": 0.00028921916724239773, + "loss": 0.0089, + "step": 565 + }, + { + "epoch": 0.37, + "grad_norm": 0.13462460041046143, + "learning_rate": 0.0002891807567175287, + "loss": 0.0361, + "step": 566 + }, + { + "epoch": 0.37, + "grad_norm": 0.08166678249835968, + "learning_rate": 0.00028914228044860584, + "loss": 0.0412, + "step": 567 + }, + { + "epoch": 0.37, + "grad_norm": 0.09470119327306747, + "learning_rate": 0.00028910373845380405, + "loss": 0.036, + "step": 568 + }, + { + "epoch": 0.37, + "grad_norm": 0.0957297682762146, + "learning_rate": 0.00028906513075132917, + "loss": 0.0302, + "step": 569 + }, + { + "epoch": 0.37, + "grad_norm": 0.17004123330116272, + "learning_rate": 0.00028902645735941814, + "loss": 0.0559, + "step": 570 + }, + { + "epoch": 0.37, + "grad_norm": 0.10910087823867798, + "learning_rate": 0.0002889877182963389, + "loss": 0.0765, + "step": 571 + }, + { + "epoch": 0.37, + "grad_norm": 0.1027827113866806, + "learning_rate": 0.0002889489135803904, + "loss": 0.0261, + "step": 572 + }, + { + "epoch": 0.38, + "grad_norm": 0.1182394027709961, + "learning_rate": 0.00028891004322990254, + "loss": 0.0413, + "step": 573 + }, + { + "epoch": 0.38, + "grad_norm": 0.08422794938087463, + "learning_rate": 0.00028887110726323644, + "loss": 0.048, + "step": 574 + }, + { + "epoch": 0.38, + "grad_norm": 0.10699556767940521, + "learning_rate": 0.00028883210569878397, + "loss": 0.0193, + "step": 575 + }, + { + "epoch": 0.38, + "grad_norm": 0.06325127184391022, + "learning_rate": 0.00028879303855496805, + "loss": 0.0248, + "step": 576 + }, + { + "epoch": 0.38, + "grad_norm": 0.10081582516431808, + "learning_rate": 0.00028875390585024274, + "loss": 0.0211, + "step": 577 + }, + { + "epoch": 0.38, + "grad_norm": 0.062216054648160934, + "learning_rate": 0.00028871470760309285, + "loss": 0.0185, + "step": 578 + }, + { + "epoch": 0.38, + "grad_norm": 0.086198590695858, + "learning_rate": 0.00028867544383203423, + "loss": 0.0544, + "step": 579 + }, + { + "epoch": 0.38, + "grad_norm": 0.11464603990316391, + "learning_rate": 0.00028863611455561374, + "loss": 0.0482, + "step": 580 + }, + { + "epoch": 0.38, + "grad_norm": 0.1089998185634613, + "learning_rate": 0.0002885967197924092, + "loss": 0.0496, + "step": 581 + }, + { + "epoch": 0.38, + "grad_norm": 0.1297656148672104, + "learning_rate": 0.00028855725956102913, + "loss": 0.0286, + "step": 582 + }, + { + "epoch": 0.38, + "grad_norm": 0.12966851890087128, + "learning_rate": 0.0002885177338801133, + "loss": 0.0271, + "step": 583 + }, + { + "epoch": 0.38, + "grad_norm": 0.1413564682006836, + "learning_rate": 0.00028847814276833215, + "loss": 0.0334, + "step": 584 + }, + { + "epoch": 0.38, + "grad_norm": 0.08366623520851135, + "learning_rate": 0.0002884384862443871, + "loss": 0.0252, + "step": 585 + }, + { + "epoch": 0.38, + "grad_norm": 0.11143944412469864, + "learning_rate": 0.0002883987643270106, + "loss": 0.0347, + "step": 586 + }, + { + "epoch": 0.38, + "grad_norm": 0.018316002562642097, + "learning_rate": 0.0002883589770349658, + "loss": 0.0041, + "step": 587 + }, + { + "epoch": 0.38, + "grad_norm": 0.02275553159415722, + "learning_rate": 0.0002883191243870467, + "loss": 0.0049, + "step": 588 + }, + { + "epoch": 0.39, + "grad_norm": 0.14462235569953918, + "learning_rate": 0.0002882792064020785, + "loss": 0.0745, + "step": 589 + }, + { + "epoch": 0.39, + "grad_norm": 0.10231613367795944, + "learning_rate": 0.0002882392230989169, + "loss": 0.0211, + "step": 590 + }, + { + "epoch": 0.39, + "grad_norm": 0.013464580290019512, + "learning_rate": 0.00028819917449644865, + "loss": 0.0027, + "step": 591 + }, + { + "epoch": 0.39, + "grad_norm": 0.1707848161458969, + "learning_rate": 0.0002881590606135912, + "loss": 0.0292, + "step": 592 + }, + { + "epoch": 0.39, + "grad_norm": 0.021210182458162308, + "learning_rate": 0.00028811888146929303, + "loss": 0.0034, + "step": 593 + }, + { + "epoch": 0.39, + "grad_norm": 0.09697694331407547, + "learning_rate": 0.00028807863708253326, + "loss": 0.0134, + "step": 594 + }, + { + "epoch": 0.39, + "grad_norm": 0.014497664757072926, + "learning_rate": 0.000288038327472322, + "loss": 0.0033, + "step": 595 + }, + { + "epoch": 0.39, + "grad_norm": 0.25384795665740967, + "learning_rate": 0.00028799795265770003, + "loss": 0.0258, + "step": 596 + }, + { + "epoch": 0.39, + "grad_norm": 0.0065186647698283195, + "learning_rate": 0.00028795751265773894, + "loss": 0.0012, + "step": 597 + }, + { + "epoch": 0.39, + "grad_norm": 0.03637157753109932, + "learning_rate": 0.00028791700749154124, + "loss": 0.004, + "step": 598 + }, + { + "epoch": 0.39, + "grad_norm": 0.039990831166505814, + "learning_rate": 0.00028787643717824007, + "loss": 0.0067, + "step": 599 + }, + { + "epoch": 0.39, + "grad_norm": 0.18821458518505096, + "learning_rate": 0.0002878358017369994, + "loss": 0.0233, + "step": 600 + }, + { + "epoch": 0.39, + "grad_norm": 0.12891234457492828, + "learning_rate": 0.00028779510118701404, + "loss": 0.0121, + "step": 601 + }, + { + "epoch": 0.39, + "grad_norm": 0.1731066256761551, + "learning_rate": 0.0002877543355475094, + "loss": 0.0535, + "step": 602 + }, + { + "epoch": 0.39, + "grad_norm": 0.5192031264305115, + "learning_rate": 0.0002877135048377418, + "loss": 0.1073, + "step": 603 + }, + { + "epoch": 0.4, + "grad_norm": 0.13350637257099152, + "learning_rate": 0.0002876726090769982, + "loss": 0.0157, + "step": 604 + }, + { + "epoch": 0.4, + "grad_norm": 0.12136203050613403, + "learning_rate": 0.0002876316482845963, + "loss": 0.0132, + "step": 605 + }, + { + "epoch": 0.4, + "grad_norm": 0.5036077499389648, + "learning_rate": 0.0002875906224798844, + "loss": 0.1366, + "step": 606 + }, + { + "epoch": 0.4, + "grad_norm": 0.22896146774291992, + "learning_rate": 0.0002875495316822419, + "loss": 0.08, + "step": 607 + }, + { + "epoch": 0.4, + "grad_norm": 0.15327180922031403, + "learning_rate": 0.0002875083759110785, + "loss": 0.0322, + "step": 608 + }, + { + "epoch": 0.4, + "grad_norm": 0.0520663745701313, + "learning_rate": 0.0002874671551858346, + "loss": 0.0202, + "step": 609 + }, + { + "epoch": 0.4, + "grad_norm": 0.08731318265199661, + "learning_rate": 0.00028742586952598155, + "loss": 0.0414, + "step": 610 + }, + { + "epoch": 0.4, + "grad_norm": 0.11570514738559723, + "learning_rate": 0.0002873845189510213, + "loss": 0.0625, + "step": 611 + }, + { + "epoch": 0.4, + "grad_norm": 0.1604083925485611, + "learning_rate": 0.0002873431034804862, + "loss": 0.0644, + "step": 612 + }, + { + "epoch": 0.4, + "grad_norm": 0.06147552654147148, + "learning_rate": 0.0002873016231339396, + "loss": 0.0168, + "step": 613 + }, + { + "epoch": 0.4, + "grad_norm": 0.12419867515563965, + "learning_rate": 0.00028726007793097527, + "loss": 0.0438, + "step": 614 + }, + { + "epoch": 0.4, + "grad_norm": 0.06133590638637543, + "learning_rate": 0.0002872184678912177, + "loss": 0.024, + "step": 615 + }, + { + "epoch": 0.4, + "grad_norm": 0.10245617479085922, + "learning_rate": 0.00028717679303432207, + "loss": 0.0468, + "step": 616 + }, + { + "epoch": 0.4, + "grad_norm": 0.11957762390375137, + "learning_rate": 0.000287135053379974, + "loss": 0.0442, + "step": 617 + }, + { + "epoch": 0.4, + "grad_norm": 0.12896914780139923, + "learning_rate": 0.0002870932489478899, + "loss": 0.019, + "step": 618 + }, + { + "epoch": 0.41, + "grad_norm": 0.1816866546869278, + "learning_rate": 0.0002870513797578167, + "loss": 0.0465, + "step": 619 + }, + { + "epoch": 0.41, + "grad_norm": 0.3061673045158386, + "learning_rate": 0.00028700944582953184, + "loss": 0.0356, + "step": 620 + }, + { + "epoch": 0.41, + "grad_norm": 0.12940478324890137, + "learning_rate": 0.0002869674471828435, + "loss": 0.0447, + "step": 621 + }, + { + "epoch": 0.41, + "grad_norm": 0.2569711208343506, + "learning_rate": 0.0002869253838375903, + "loss": 0.0383, + "step": 622 + }, + { + "epoch": 0.41, + "grad_norm": 0.17063623666763306, + "learning_rate": 0.0002868832558136415, + "loss": 0.0394, + "step": 623 + }, + { + "epoch": 0.41, + "grad_norm": 0.16775226593017578, + "learning_rate": 0.00028684106313089686, + "loss": 0.0314, + "step": 624 + }, + { + "epoch": 0.41, + "grad_norm": 0.12676480412483215, + "learning_rate": 0.00028679880580928676, + "loss": 0.0397, + "step": 625 + }, + { + "epoch": 0.41, + "grad_norm": 0.19791187345981598, + "learning_rate": 0.0002867564838687721, + "loss": 0.0668, + "step": 626 + }, + { + "epoch": 0.41, + "grad_norm": 0.18982940912246704, + "learning_rate": 0.0002867140973293441, + "loss": 0.0472, + "step": 627 + }, + { + "epoch": 0.41, + "grad_norm": 0.06308908015489578, + "learning_rate": 0.00028667164621102475, + "loss": 0.0166, + "step": 628 + }, + { + "epoch": 0.41, + "grad_norm": 0.09570673853158951, + "learning_rate": 0.0002866291305338665, + "loss": 0.0156, + "step": 629 + }, + { + "epoch": 0.41, + "grad_norm": 0.12950573861598969, + "learning_rate": 0.00028658655031795215, + "loss": 0.0381, + "step": 630 + }, + { + "epoch": 0.41, + "grad_norm": 0.30905017256736755, + "learning_rate": 0.00028654390558339516, + "loss": 0.0386, + "step": 631 + }, + { + "epoch": 0.41, + "grad_norm": 0.2680380940437317, + "learning_rate": 0.0002865011963503394, + "loss": 0.0307, + "step": 632 + }, + { + "epoch": 0.41, + "grad_norm": 0.15153923630714417, + "learning_rate": 0.00028645842263895916, + "loss": 0.0448, + "step": 633 + }, + { + "epoch": 0.42, + "grad_norm": 0.06900045275688171, + "learning_rate": 0.0002864155844694592, + "loss": 0.0134, + "step": 634 + }, + { + "epoch": 0.42, + "grad_norm": 0.39054739475250244, + "learning_rate": 0.00028637268186207474, + "loss": 0.0562, + "step": 635 + }, + { + "epoch": 0.42, + "grad_norm": 0.06766320765018463, + "learning_rate": 0.0002863297148370716, + "loss": 0.0135, + "step": 636 + }, + { + "epoch": 0.42, + "grad_norm": 0.12230436503887177, + "learning_rate": 0.0002862866834147457, + "loss": 0.0189, + "step": 637 + }, + { + "epoch": 0.42, + "grad_norm": 0.10021094232797623, + "learning_rate": 0.00028624358761542365, + "loss": 0.021, + "step": 638 + }, + { + "epoch": 0.42, + "grad_norm": 0.1645062267780304, + "learning_rate": 0.0002862004274594623, + "loss": 0.0284, + "step": 639 + }, + { + "epoch": 0.42, + "grad_norm": 0.3108697831630707, + "learning_rate": 0.00028615720296724906, + "loss": 0.0792, + "step": 640 + }, + { + "epoch": 0.42, + "grad_norm": 0.12834666669368744, + "learning_rate": 0.0002861139141592017, + "loss": 0.0162, + "step": 641 + }, + { + "epoch": 0.42, + "grad_norm": 0.11455690860748291, + "learning_rate": 0.00028607056105576806, + "loss": 0.0374, + "step": 642 + }, + { + "epoch": 0.42, + "grad_norm": 0.14810198545455933, + "learning_rate": 0.0002860271436774269, + "loss": 0.0132, + "step": 643 + }, + { + "epoch": 0.42, + "grad_norm": 0.1764562875032425, + "learning_rate": 0.00028598366204468694, + "loss": 0.0641, + "step": 644 + }, + { + "epoch": 0.42, + "grad_norm": 0.10819990932941437, + "learning_rate": 0.0002859401161780873, + "loss": 0.036, + "step": 645 + }, + { + "epoch": 0.42, + "grad_norm": 0.10301560163497925, + "learning_rate": 0.00028589650609819764, + "loss": 0.0272, + "step": 646 + }, + { + "epoch": 0.42, + "grad_norm": 0.13949047029018402, + "learning_rate": 0.00028585283182561773, + "loss": 0.0396, + "step": 647 + }, + { + "epoch": 0.42, + "grad_norm": 0.20076854526996613, + "learning_rate": 0.0002858090933809777, + "loss": 0.0304, + "step": 648 + }, + { + "epoch": 0.42, + "grad_norm": 0.12382891029119492, + "learning_rate": 0.0002857652907849381, + "loss": 0.0317, + "step": 649 + }, + { + "epoch": 0.43, + "grad_norm": 0.03410351276397705, + "learning_rate": 0.0002857214240581897, + "loss": 0.0075, + "step": 650 + }, + { + "epoch": 0.43, + "grad_norm": 0.10016089677810669, + "learning_rate": 0.00028567749322145367, + "loss": 0.0179, + "step": 651 + }, + { + "epoch": 0.43, + "grad_norm": 0.24712401628494263, + "learning_rate": 0.00028563349829548125, + "loss": 0.0857, + "step": 652 + }, + { + "epoch": 0.43, + "grad_norm": 0.10354748368263245, + "learning_rate": 0.00028558943930105413, + "loss": 0.0276, + "step": 653 + }, + { + "epoch": 0.43, + "grad_norm": 0.13952110707759857, + "learning_rate": 0.00028554531625898434, + "loss": 0.0352, + "step": 654 + }, + { + "epoch": 0.43, + "grad_norm": 0.25892096757888794, + "learning_rate": 0.0002855011291901138, + "loss": 0.0635, + "step": 655 + }, + { + "epoch": 0.43, + "grad_norm": 0.1324494630098343, + "learning_rate": 0.0002854568781153151, + "loss": 0.0404, + "step": 656 + }, + { + "epoch": 0.43, + "grad_norm": 0.2835068702697754, + "learning_rate": 0.0002854125630554908, + "loss": 0.0913, + "step": 657 + }, + { + "epoch": 0.43, + "grad_norm": 0.06329616159200668, + "learning_rate": 0.00028536818403157387, + "loss": 0.0146, + "step": 658 + }, + { + "epoch": 0.43, + "grad_norm": 0.07758588343858719, + "learning_rate": 0.0002853237410645272, + "loss": 0.022, + "step": 659 + }, + { + "epoch": 0.43, + "grad_norm": 0.0839746966958046, + "learning_rate": 0.00028527923417534425, + "loss": 0.0175, + "step": 660 + }, + { + "epoch": 0.43, + "grad_norm": 0.06847698986530304, + "learning_rate": 0.0002852346633850484, + "loss": 0.0257, + "step": 661 + }, + { + "epoch": 0.43, + "grad_norm": 0.05117741599678993, + "learning_rate": 0.0002851900287146933, + "loss": 0.0136, + "step": 662 + }, + { + "epoch": 0.43, + "grad_norm": 0.12874063849449158, + "learning_rate": 0.0002851453301853628, + "loss": 0.0525, + "step": 663 + }, + { + "epoch": 0.43, + "grad_norm": 0.1822301298379898, + "learning_rate": 0.000285100567818171, + "loss": 0.0889, + "step": 664 + }, + { + "epoch": 0.44, + "grad_norm": 0.11295532435178757, + "learning_rate": 0.0002850557416342619, + "loss": 0.0242, + "step": 665 + }, + { + "epoch": 0.44, + "grad_norm": 0.13836829364299774, + "learning_rate": 0.0002850108516548099, + "loss": 0.0441, + "step": 666 + }, + { + "epoch": 0.44, + "grad_norm": 0.10722105205059052, + "learning_rate": 0.0002849658979010194, + "loss": 0.0401, + "step": 667 + }, + { + "epoch": 0.44, + "grad_norm": 0.1335124373435974, + "learning_rate": 0.000284920880394125, + "loss": 0.0355, + "step": 668 + }, + { + "epoch": 0.44, + "grad_norm": 0.0793779119849205, + "learning_rate": 0.00028487579915539136, + "loss": 0.0653, + "step": 669 + }, + { + "epoch": 0.44, + "grad_norm": 0.06469617038965225, + "learning_rate": 0.00028483065420611313, + "loss": 0.0212, + "step": 670 + }, + { + "epoch": 0.44, + "grad_norm": 0.11224417388439178, + "learning_rate": 0.0002847854455676154, + "loss": 0.0689, + "step": 671 + }, + { + "epoch": 0.44, + "grad_norm": 0.08650530874729156, + "learning_rate": 0.00028474017326125296, + "loss": 0.0301, + "step": 672 + }, + { + "epoch": 0.44, + "grad_norm": 0.0688636302947998, + "learning_rate": 0.0002846948373084109, + "loss": 0.0161, + "step": 673 + }, + { + "epoch": 0.44, + "grad_norm": 0.13195598125457764, + "learning_rate": 0.0002846494377305043, + "loss": 0.0529, + "step": 674 + }, + { + "epoch": 0.44, + "grad_norm": 0.1887226700782776, + "learning_rate": 0.0002846039745489783, + "loss": 0.0615, + "step": 675 + }, + { + "epoch": 0.44, + "grad_norm": 0.06736018508672714, + "learning_rate": 0.0002845584477853082, + "loss": 0.0246, + "step": 676 + }, + { + "epoch": 0.44, + "grad_norm": 0.1488025039434433, + "learning_rate": 0.0002845128574609992, + "loss": 0.0361, + "step": 677 + }, + { + "epoch": 0.44, + "grad_norm": 0.09811149537563324, + "learning_rate": 0.0002844672035975864, + "loss": 0.0228, + "step": 678 + }, + { + "epoch": 0.44, + "grad_norm": 0.06320784986019135, + "learning_rate": 0.0002844214862166352, + "loss": 0.0182, + "step": 679 + }, + { + "epoch": 0.45, + "grad_norm": 0.0695585086941719, + "learning_rate": 0.00028437570533974084, + "loss": 0.0393, + "step": 680 + }, + { + "epoch": 0.45, + "grad_norm": 0.08886481821537018, + "learning_rate": 0.00028432986098852857, + "loss": 0.0293, + "step": 681 + }, + { + "epoch": 0.45, + "grad_norm": 0.09019115567207336, + "learning_rate": 0.0002842839531846537, + "loss": 0.0436, + "step": 682 + }, + { + "epoch": 0.45, + "grad_norm": 0.1718403697013855, + "learning_rate": 0.0002842379819498013, + "loss": 0.0512, + "step": 683 + }, + { + "epoch": 0.45, + "grad_norm": 0.1692350208759308, + "learning_rate": 0.0002841919473056867, + "loss": 0.0637, + "step": 684 + }, + { + "epoch": 0.45, + "grad_norm": 0.15840108692646027, + "learning_rate": 0.00028414584927405497, + "loss": 0.0224, + "step": 685 + }, + { + "epoch": 0.45, + "grad_norm": 0.12710994482040405, + "learning_rate": 0.0002840996878766812, + "loss": 0.042, + "step": 686 + }, + { + "epoch": 0.45, + "grad_norm": 0.07157866656780243, + "learning_rate": 0.0002840534631353704, + "loss": 0.0279, + "step": 687 + }, + { + "epoch": 0.45, + "grad_norm": 0.046079106628894806, + "learning_rate": 0.0002840071750719575, + "loss": 0.0093, + "step": 688 + }, + { + "epoch": 0.45, + "grad_norm": 0.09579090774059296, + "learning_rate": 0.00028396082370830733, + "loss": 0.027, + "step": 689 + }, + { + "epoch": 0.45, + "grad_norm": 0.201382577419281, + "learning_rate": 0.0002839144090663146, + "loss": 0.072, + "step": 690 + }, + { + "epoch": 0.45, + "grad_norm": 0.1724497377872467, + "learning_rate": 0.000283867931167904, + "loss": 0.052, + "step": 691 + }, + { + "epoch": 0.45, + "grad_norm": 0.06867794692516327, + "learning_rate": 0.00028382139003503006, + "loss": 0.0168, + "step": 692 + }, + { + "epoch": 0.45, + "grad_norm": 0.06690815091133118, + "learning_rate": 0.00028377478568967704, + "loss": 0.0137, + "step": 693 + }, + { + "epoch": 0.45, + "grad_norm": 0.17312408983707428, + "learning_rate": 0.0002837281181538593, + "loss": 0.0226, + "step": 694 + }, + { + "epoch": 0.45, + "grad_norm": 0.08470922708511353, + "learning_rate": 0.0002836813874496208, + "loss": 0.0126, + "step": 695 + }, + { + "epoch": 0.46, + "grad_norm": 0.0778060331940651, + "learning_rate": 0.00028363459359903565, + "loss": 0.0235, + "step": 696 + }, + { + "epoch": 0.46, + "grad_norm": 0.05282146856188774, + "learning_rate": 0.00028358773662420745, + "loss": 0.0086, + "step": 697 + }, + { + "epoch": 0.46, + "grad_norm": 0.09441710263490677, + "learning_rate": 0.00028354081654726984, + "loss": 0.049, + "step": 698 + }, + { + "epoch": 0.46, + "grad_norm": 0.050999149680137634, + "learning_rate": 0.00028349383339038617, + "loss": 0.0086, + "step": 699 + }, + { + "epoch": 0.46, + "grad_norm": 0.10146886110305786, + "learning_rate": 0.0002834467871757497, + "loss": 0.0246, + "step": 700 + }, + { + "epoch": 0.46, + "grad_norm": 0.03706188499927521, + "learning_rate": 0.0002833996779255833, + "loss": 0.0056, + "step": 701 + }, + { + "epoch": 0.46, + "grad_norm": 0.10421967506408691, + "learning_rate": 0.0002833525056621397, + "loss": 0.0241, + "step": 702 + }, + { + "epoch": 0.46, + "grad_norm": 0.18308381736278534, + "learning_rate": 0.00028330527040770146, + "loss": 0.042, + "step": 703 + }, + { + "epoch": 0.46, + "grad_norm": 0.2132684737443924, + "learning_rate": 0.0002832579721845809, + "loss": 0.0226, + "step": 704 + }, + { + "epoch": 0.46, + "grad_norm": 0.5411369204521179, + "learning_rate": 0.00028321061101511984, + "loss": 0.0702, + "step": 705 + }, + { + "epoch": 0.46, + "grad_norm": 0.3440389335155487, + "learning_rate": 0.0002831631869216902, + "loss": 0.0225, + "step": 706 + }, + { + "epoch": 0.46, + "grad_norm": 0.16572096943855286, + "learning_rate": 0.00028311569992669333, + "loss": 0.0352, + "step": 707 + }, + { + "epoch": 0.46, + "grad_norm": 0.15913799405097961, + "learning_rate": 0.0002830681500525604, + "loss": 0.0266, + "step": 708 + }, + { + "epoch": 0.46, + "grad_norm": 0.1818440854549408, + "learning_rate": 0.0002830205373217524, + "loss": 0.0688, + "step": 709 + }, + { + "epoch": 0.46, + "grad_norm": 0.17044726014137268, + "learning_rate": 0.0002829728617567598, + "loss": 0.0515, + "step": 710 + }, + { + "epoch": 0.47, + "grad_norm": 0.15003225207328796, + "learning_rate": 0.0002829251233801028, + "loss": 0.0757, + "step": 711 + }, + { + "epoch": 0.47, + "grad_norm": 0.11599011719226837, + "learning_rate": 0.00028287732221433145, + "loss": 0.0402, + "step": 712 + }, + { + "epoch": 0.47, + "grad_norm": 0.11937666684389114, + "learning_rate": 0.0002828294582820252, + "loss": 0.0391, + "step": 713 + }, + { + "epoch": 0.47, + "grad_norm": 0.2134632170200348, + "learning_rate": 0.0002827815316057933, + "loss": 0.0748, + "step": 714 + }, + { + "epoch": 0.47, + "grad_norm": 0.29407811164855957, + "learning_rate": 0.00028273354220827477, + "loss": 0.0679, + "step": 715 + }, + { + "epoch": 0.47, + "grad_norm": 0.059820059686899185, + "learning_rate": 0.00028268549011213785, + "loss": 0.0372, + "step": 716 + }, + { + "epoch": 0.47, + "grad_norm": 0.06392081081867218, + "learning_rate": 0.0002826373753400808, + "loss": 0.0212, + "step": 717 + }, + { + "epoch": 0.47, + "grad_norm": 0.15369698405265808, + "learning_rate": 0.0002825891979148313, + "loss": 0.0258, + "step": 718 + }, + { + "epoch": 0.47, + "grad_norm": 0.11246192455291748, + "learning_rate": 0.00028254095785914667, + "loss": 0.0357, + "step": 719 + }, + { + "epoch": 0.47, + "grad_norm": 0.09747990220785141, + "learning_rate": 0.0002824926551958138, + "loss": 0.0393, + "step": 720 + }, + { + "epoch": 0.47, + "grad_norm": 0.06651636213064194, + "learning_rate": 0.0002824442899476491, + "loss": 0.0222, + "step": 721 + }, + { + "epoch": 0.47, + "grad_norm": 0.2205415815114975, + "learning_rate": 0.00028239586213749866, + "loss": 0.0391, + "step": 722 + }, + { + "epoch": 0.47, + "grad_norm": 0.15302903950214386, + "learning_rate": 0.000282347371788238, + "loss": 0.019, + "step": 723 + }, + { + "epoch": 0.47, + "grad_norm": 0.0960957333445549, + "learning_rate": 0.00028229881892277237, + "loss": 0.0142, + "step": 724 + }, + { + "epoch": 0.47, + "grad_norm": 0.13723668456077576, + "learning_rate": 0.00028225020356403624, + "loss": 0.0595, + "step": 725 + }, + { + "epoch": 0.48, + "grad_norm": 0.11208293586969376, + "learning_rate": 0.00028220152573499394, + "loss": 0.0283, + "step": 726 + }, + { + "epoch": 0.48, + "grad_norm": 0.11818848550319672, + "learning_rate": 0.000282152785458639, + "loss": 0.0337, + "step": 727 + }, + { + "epoch": 0.48, + "grad_norm": 0.1239568442106247, + "learning_rate": 0.0002821039827579948, + "loss": 0.0406, + "step": 728 + }, + { + "epoch": 0.48, + "grad_norm": 0.2423425167798996, + "learning_rate": 0.0002820551176561138, + "loss": 0.0502, + "step": 729 + }, + { + "epoch": 0.48, + "grad_norm": 0.11348683387041092, + "learning_rate": 0.0002820061901760783, + "loss": 0.0356, + "step": 730 + }, + { + "epoch": 0.48, + "grad_norm": 0.1281062662601471, + "learning_rate": 0.00028195720034099976, + "loss": 0.0172, + "step": 731 + }, + { + "epoch": 0.48, + "grad_norm": 0.017542295157909393, + "learning_rate": 0.0002819081481740193, + "loss": 0.0035, + "step": 732 + }, + { + "epoch": 0.48, + "grad_norm": 0.13793808221817017, + "learning_rate": 0.00028185903369830757, + "loss": 0.0142, + "step": 733 + }, + { + "epoch": 0.48, + "grad_norm": 0.0807032361626625, + "learning_rate": 0.0002818098569370643, + "loss": 0.0093, + "step": 734 + }, + { + "epoch": 0.48, + "grad_norm": 0.13335250318050385, + "learning_rate": 0.0002817606179135189, + "loss": 0.0249, + "step": 735 + }, + { + "epoch": 0.48, + "grad_norm": 0.18836469948291779, + "learning_rate": 0.0002817113166509302, + "loss": 0.0702, + "step": 736 + }, + { + "epoch": 0.48, + "grad_norm": 0.08416979014873505, + "learning_rate": 0.0002816619531725863, + "loss": 0.0115, + "step": 737 + }, + { + "epoch": 0.48, + "grad_norm": 0.054892800748348236, + "learning_rate": 0.00028161252750180486, + "loss": 0.0051, + "step": 738 + }, + { + "epoch": 0.48, + "grad_norm": 0.11004303395748138, + "learning_rate": 0.0002815630396619327, + "loss": 0.025, + "step": 739 + }, + { + "epoch": 0.48, + "grad_norm": 0.08595911413431168, + "learning_rate": 0.00028151348967634613, + "loss": 0.0247, + "step": 740 + }, + { + "epoch": 0.49, + "grad_norm": 0.19074852764606476, + "learning_rate": 0.0002814638775684509, + "loss": 0.0548, + "step": 741 + }, + { + "epoch": 0.49, + "grad_norm": 0.15080730617046356, + "learning_rate": 0.0002814142033616819, + "loss": 0.0865, + "step": 742 + }, + { + "epoch": 0.49, + "grad_norm": 0.1833682358264923, + "learning_rate": 0.00028136446707950353, + "loss": 0.0697, + "step": 743 + }, + { + "epoch": 0.49, + "grad_norm": 0.049776408821344376, + "learning_rate": 0.00028131466874540943, + "loss": 0.0078, + "step": 744 + }, + { + "epoch": 0.49, + "grad_norm": 0.28232109546661377, + "learning_rate": 0.00028126480838292254, + "loss": 0.0283, + "step": 745 + }, + { + "epoch": 0.49, + "grad_norm": 0.11048179864883423, + "learning_rate": 0.0002812148860155952, + "loss": 0.0333, + "step": 746 + }, + { + "epoch": 0.49, + "grad_norm": 0.3188829720020294, + "learning_rate": 0.0002811649016670089, + "loss": 0.0524, + "step": 747 + }, + { + "epoch": 0.49, + "grad_norm": 0.17602626979351044, + "learning_rate": 0.0002811148553607745, + "loss": 0.0559, + "step": 748 + }, + { + "epoch": 0.49, + "grad_norm": 0.1038702130317688, + "learning_rate": 0.0002810647471205321, + "loss": 0.0409, + "step": 749 + }, + { + "epoch": 0.49, + "grad_norm": 0.13291211426258087, + "learning_rate": 0.00028101457696995104, + "loss": 0.0343, + "step": 750 + }, + { + "epoch": 0.49, + "grad_norm": 0.3924441933631897, + "learning_rate": 0.0002809643449327299, + "loss": 0.1051, + "step": 751 + }, + { + "epoch": 0.49, + "grad_norm": 0.4292432367801666, + "learning_rate": 0.0002809140510325966, + "loss": 0.0985, + "step": 752 + }, + { + "epoch": 0.49, + "grad_norm": 0.2376273274421692, + "learning_rate": 0.0002808636952933081, + "loss": 0.0372, + "step": 753 + }, + { + "epoch": 0.49, + "grad_norm": 0.18019931018352509, + "learning_rate": 0.0002808132777386507, + "loss": 0.0449, + "step": 754 + }, + { + "epoch": 0.49, + "grad_norm": 0.09754154831171036, + "learning_rate": 0.0002807627983924399, + "loss": 0.0399, + "step": 755 + }, + { + "epoch": 0.49, + "grad_norm": 0.15885044634342194, + "learning_rate": 0.0002807122572785203, + "loss": 0.0231, + "step": 756 + }, + { + "epoch": 0.5, + "grad_norm": 0.15031558275222778, + "learning_rate": 0.0002806616544207657, + "loss": 0.0562, + "step": 757 + }, + { + "epoch": 0.5, + "grad_norm": 0.03422224149107933, + "learning_rate": 0.00028061098984307923, + "loss": 0.0055, + "step": 758 + }, + { + "epoch": 0.5, + "grad_norm": 0.11442912369966507, + "learning_rate": 0.0002805602635693929, + "loss": 0.0318, + "step": 759 + }, + { + "epoch": 0.5, + "grad_norm": 0.10043658316135406, + "learning_rate": 0.0002805094756236681, + "loss": 0.0293, + "step": 760 + }, + { + "epoch": 0.5, + "grad_norm": 0.06093163788318634, + "learning_rate": 0.00028045862602989516, + "loss": 0.0062, + "step": 761 + }, + { + "epoch": 0.5, + "grad_norm": 0.17897385358810425, + "learning_rate": 0.0002804077148120937, + "loss": 0.0387, + "step": 762 + }, + { + "epoch": 0.5, + "grad_norm": 0.14174053072929382, + "learning_rate": 0.0002803567419943124, + "loss": 0.0742, + "step": 763 + }, + { + "epoch": 0.5, + "grad_norm": 0.12193652242422104, + "learning_rate": 0.0002803057076006289, + "loss": 0.0237, + "step": 764 + }, + { + "epoch": 0.5, + "eval_loss": 0.03274312615394592, + "eval_runtime": 39.9218, + "eval_samples_per_second": 32.238, + "eval_steps_per_second": 8.066, + "step": 764 + }, + { + "epoch": 0.5, + "grad_norm": 0.17127107083797455, + "learning_rate": 0.00028025461165515016, + "loss": 0.0269, + "step": 765 + }, + { + "epoch": 0.5, + "grad_norm": 0.03232162818312645, + "learning_rate": 0.00028020345418201196, + "loss": 0.0053, + "step": 766 + }, + { + "epoch": 0.5, + "grad_norm": 0.29065465927124023, + "learning_rate": 0.0002801522352053794, + "loss": 0.0621, + "step": 767 + }, + { + "epoch": 0.5, + "grad_norm": 0.14432761073112488, + "learning_rate": 0.00028010095474944647, + "loss": 0.0556, + "step": 768 + }, + { + "epoch": 0.5, + "grad_norm": 0.11056532710790634, + "learning_rate": 0.00028004961283843624, + "loss": 0.0111, + "step": 769 + }, + { + "epoch": 0.5, + "grad_norm": 0.18930545449256897, + "learning_rate": 0.0002799982094966007, + "loss": 0.0548, + "step": 770 + }, + { + "epoch": 0.5, + "grad_norm": 0.14607198536396027, + "learning_rate": 0.00027994674474822115, + "loss": 0.0296, + "step": 771 + }, + { + "epoch": 0.51, + "grad_norm": 0.1470440924167633, + "learning_rate": 0.0002798952186176076, + "loss": 0.0366, + "step": 772 + }, + { + "epoch": 0.51, + "grad_norm": 0.0858917385339737, + "learning_rate": 0.0002798436311290992, + "loss": 0.0149, + "step": 773 + }, + { + "epoch": 0.51, + "grad_norm": 0.23497411608695984, + "learning_rate": 0.000279791982307064, + "loss": 0.031, + "step": 774 + }, + { + "epoch": 0.51, + "grad_norm": 0.05533986538648605, + "learning_rate": 0.00027974027217589917, + "loss": 0.0149, + "step": 775 + }, + { + "epoch": 0.51, + "grad_norm": 0.18971019983291626, + "learning_rate": 0.00027968850076003066, + "loss": 0.0339, + "step": 776 + }, + { + "epoch": 0.51, + "grad_norm": 0.18975158035755157, + "learning_rate": 0.00027963666808391343, + "loss": 0.0192, + "step": 777 + }, + { + "epoch": 0.51, + "grad_norm": 0.04416336864233017, + "learning_rate": 0.0002795847741720315, + "loss": 0.0073, + "step": 778 + }, + { + "epoch": 0.51, + "grad_norm": 0.20576409995555878, + "learning_rate": 0.00027953281904889764, + "loss": 0.0418, + "step": 779 + }, + { + "epoch": 0.51, + "grad_norm": 0.1331322193145752, + "learning_rate": 0.0002794808027390536, + "loss": 0.011, + "step": 780 + }, + { + "epoch": 0.51, + "grad_norm": 0.3062935769557953, + "learning_rate": 0.0002794287252670701, + "loss": 0.07, + "step": 781 + }, + { + "epoch": 0.51, + "grad_norm": 0.1513393074274063, + "learning_rate": 0.0002793765866575466, + "loss": 0.0384, + "step": 782 + }, + { + "epoch": 0.51, + "grad_norm": 0.08953273296356201, + "learning_rate": 0.0002793243869351116, + "loss": 0.0342, + "step": 783 + }, + { + "epoch": 0.51, + "grad_norm": 0.16210728883743286, + "learning_rate": 0.00027927212612442243, + "loss": 0.0403, + "step": 784 + }, + { + "epoch": 0.51, + "grad_norm": 0.10174558311700821, + "learning_rate": 0.0002792198042501652, + "loss": 0.0304, + "step": 785 + }, + { + "epoch": 0.51, + "grad_norm": 0.15236282348632812, + "learning_rate": 0.0002791674213370549, + "loss": 0.0378, + "step": 786 + }, + { + "epoch": 0.52, + "grad_norm": 0.06242475286126137, + "learning_rate": 0.0002791149774098353, + "loss": 0.0092, + "step": 787 + }, + { + "epoch": 0.52, + "grad_norm": 0.20970316231250763, + "learning_rate": 0.0002790624724932792, + "loss": 0.0479, + "step": 788 + }, + { + "epoch": 0.52, + "grad_norm": 0.15189824998378754, + "learning_rate": 0.0002790099066121879, + "loss": 0.0118, + "step": 789 + }, + { + "epoch": 0.52, + "grad_norm": 0.061395786702632904, + "learning_rate": 0.0002789572797913918, + "loss": 0.0151, + "step": 790 + }, + { + "epoch": 0.52, + "grad_norm": 0.5034061074256897, + "learning_rate": 0.00027890459205574987, + "loss": 0.0864, + "step": 791 + }, + { + "epoch": 0.52, + "grad_norm": 0.07147826254367828, + "learning_rate": 0.0002788518434301499, + "loss": 0.0191, + "step": 792 + }, + { + "epoch": 0.52, + "grad_norm": 0.1329374462366104, + "learning_rate": 0.0002787990339395085, + "loss": 0.0331, + "step": 793 + }, + { + "epoch": 0.52, + "grad_norm": 0.15126173198223114, + "learning_rate": 0.0002787461636087711, + "loss": 0.0143, + "step": 794 + }, + { + "epoch": 0.52, + "grad_norm": 0.09816433489322662, + "learning_rate": 0.0002786932324629116, + "loss": 0.0155, + "step": 795 + }, + { + "epoch": 0.52, + "grad_norm": 0.1379247009754181, + "learning_rate": 0.0002786402405269329, + "loss": 0.0315, + "step": 796 + }, + { + "epoch": 0.52, + "grad_norm": 0.23714596033096313, + "learning_rate": 0.00027858718782586647, + "loss": 0.0465, + "step": 797 + }, + { + "epoch": 0.52, + "grad_norm": 0.12600766122341156, + "learning_rate": 0.0002785340743847725, + "loss": 0.0359, + "step": 798 + }, + { + "epoch": 0.52, + "grad_norm": 0.13546015322208405, + "learning_rate": 0.00027848090022874, + "loss": 0.0175, + "step": 799 + }, + { + "epoch": 0.52, + "grad_norm": 0.07381202280521393, + "learning_rate": 0.00027842766538288647, + "loss": 0.0302, + "step": 800 + }, + { + "epoch": 0.52, + "grad_norm": 0.14797933399677277, + "learning_rate": 0.0002783743698723582, + "loss": 0.0818, + "step": 801 + }, + { + "epoch": 0.53, + "grad_norm": 0.02270502597093582, + "learning_rate": 0.00027832101372233007, + "loss": 0.0049, + "step": 802 + }, + { + "epoch": 0.53, + "grad_norm": 0.322780042886734, + "learning_rate": 0.00027826759695800566, + "loss": 0.0694, + "step": 803 + }, + { + "epoch": 0.53, + "grad_norm": 0.2222844511270523, + "learning_rate": 0.0002782141196046171, + "loss": 0.0261, + "step": 804 + }, + { + "epoch": 0.53, + "grad_norm": 0.3285076320171356, + "learning_rate": 0.0002781605816874253, + "loss": 0.0872, + "step": 805 + }, + { + "epoch": 0.53, + "grad_norm": 0.08804619312286377, + "learning_rate": 0.0002781069832317196, + "loss": 0.0578, + "step": 806 + }, + { + "epoch": 0.53, + "grad_norm": 0.17540492117404938, + "learning_rate": 0.00027805332426281793, + "loss": 0.0384, + "step": 807 + }, + { + "epoch": 0.53, + "grad_norm": 0.06208420172333717, + "learning_rate": 0.00027799960480606706, + "loss": 0.0136, + "step": 808 + }, + { + "epoch": 0.53, + "grad_norm": 0.07048339396715164, + "learning_rate": 0.0002779458248868421, + "loss": 0.026, + "step": 809 + }, + { + "epoch": 0.53, + "grad_norm": 0.0901297777891159, + "learning_rate": 0.00027789198453054666, + "loss": 0.0277, + "step": 810 + }, + { + "epoch": 0.53, + "grad_norm": 0.07435130327939987, + "learning_rate": 0.0002778380837626132, + "loss": 0.0197, + "step": 811 + }, + { + "epoch": 0.53, + "grad_norm": 0.14209306240081787, + "learning_rate": 0.00027778412260850234, + "loss": 0.0407, + "step": 812 + }, + { + "epoch": 0.53, + "grad_norm": 0.16662320494651794, + "learning_rate": 0.00027773010109370357, + "loss": 0.0667, + "step": 813 + }, + { + "epoch": 0.53, + "grad_norm": 0.031582899391651154, + "learning_rate": 0.0002776760192437346, + "loss": 0.0104, + "step": 814 + }, + { + "epoch": 0.53, + "grad_norm": 0.0818646028637886, + "learning_rate": 0.00027762187708414195, + "loss": 0.0258, + "step": 815 + }, + { + "epoch": 0.53, + "grad_norm": 0.0822024941444397, + "learning_rate": 0.0002775676746405003, + "loss": 0.0406, + "step": 816 + }, + { + "epoch": 0.53, + "grad_norm": 0.11700989305973053, + "learning_rate": 0.0002775134119384131, + "loss": 0.0335, + "step": 817 + }, + { + "epoch": 0.54, + "grad_norm": 0.082905612885952, + "learning_rate": 0.00027745908900351195, + "loss": 0.0161, + "step": 818 + }, + { + "epoch": 0.54, + "grad_norm": 0.11846023797988892, + "learning_rate": 0.00027740470586145726, + "loss": 0.0502, + "step": 819 + }, + { + "epoch": 0.54, + "grad_norm": 0.17150120437145233, + "learning_rate": 0.00027735026253793756, + "loss": 0.0345, + "step": 820 + }, + { + "epoch": 0.54, + "grad_norm": 0.12443263083696365, + "learning_rate": 0.00027729575905867, + "loss": 0.0158, + "step": 821 + }, + { + "epoch": 0.54, + "grad_norm": 0.1657358705997467, + "learning_rate": 0.0002772411954494001, + "loss": 0.0226, + "step": 822 + }, + { + "epoch": 0.54, + "grad_norm": 0.08840323239564896, + "learning_rate": 0.0002771865717359018, + "loss": 0.0152, + "step": 823 + }, + { + "epoch": 0.54, + "grad_norm": 0.08032941073179245, + "learning_rate": 0.00027713188794397737, + "loss": 0.0129, + "step": 824 + }, + { + "epoch": 0.54, + "grad_norm": 0.1835167407989502, + "learning_rate": 0.00027707714409945744, + "loss": 0.0569, + "step": 825 + }, + { + "epoch": 0.54, + "grad_norm": 0.12917861342430115, + "learning_rate": 0.0002770223402282012, + "loss": 0.0309, + "step": 826 + }, + { + "epoch": 0.54, + "grad_norm": 0.22104112803936005, + "learning_rate": 0.0002769674763560959, + "loss": 0.0432, + "step": 827 + }, + { + "epoch": 0.54, + "grad_norm": 0.13979768753051758, + "learning_rate": 0.00027691255250905737, + "loss": 0.0174, + "step": 828 + }, + { + "epoch": 0.54, + "grad_norm": 0.17627565562725067, + "learning_rate": 0.0002768575687130297, + "loss": 0.0915, + "step": 829 + }, + { + "epoch": 0.54, + "grad_norm": 0.486728310585022, + "learning_rate": 0.0002768025249939853, + "loss": 0.0583, + "step": 830 + }, + { + "epoch": 0.54, + "grad_norm": 0.1259876936674118, + "learning_rate": 0.0002767474213779247, + "loss": 0.0254, + "step": 831 + }, + { + "epoch": 0.54, + "grad_norm": 0.17353613674640656, + "learning_rate": 0.00027669225789087715, + "loss": 0.0238, + "step": 832 + }, + { + "epoch": 0.55, + "grad_norm": 0.011490284465253353, + "learning_rate": 0.00027663703455889973, + "loss": 0.0025, + "step": 833 + }, + { + "epoch": 0.55, + "grad_norm": 0.054609689861536026, + "learning_rate": 0.00027658175140807815, + "loss": 0.0098, + "step": 834 + }, + { + "epoch": 0.55, + "grad_norm": 0.1213490441441536, + "learning_rate": 0.000276526408464526, + "loss": 0.0128, + "step": 835 + }, + { + "epoch": 0.55, + "grad_norm": 0.09482322633266449, + "learning_rate": 0.0002764710057543855, + "loss": 0.0126, + "step": 836 + }, + { + "epoch": 0.55, + "grad_norm": 0.057049017399549484, + "learning_rate": 0.00027641554330382686, + "loss": 0.015, + "step": 837 + }, + { + "epoch": 0.55, + "grad_norm": 0.18572884798049927, + "learning_rate": 0.0002763600211390486, + "loss": 0.034, + "step": 838 + }, + { + "epoch": 0.55, + "grad_norm": 0.09493198245763779, + "learning_rate": 0.0002763044392862774, + "loss": 0.0408, + "step": 839 + }, + { + "epoch": 0.55, + "grad_norm": 0.2182336002588272, + "learning_rate": 0.00027624879777176807, + "loss": 0.055, + "step": 840 + }, + { + "epoch": 0.55, + "grad_norm": 0.08872721344232559, + "learning_rate": 0.00027619309662180386, + "loss": 0.0383, + "step": 841 + }, + { + "epoch": 0.55, + "grad_norm": 0.11956200748682022, + "learning_rate": 0.0002761373358626959, + "loss": 0.0287, + "step": 842 + }, + { + "epoch": 0.55, + "grad_norm": 0.1644572764635086, + "learning_rate": 0.0002760815155207837, + "loss": 0.0286, + "step": 843 + }, + { + "epoch": 0.55, + "grad_norm": 0.16476300358772278, + "learning_rate": 0.0002760256356224347, + "loss": 0.0392, + "step": 844 + }, + { + "epoch": 0.55, + "grad_norm": 0.1026122123003006, + "learning_rate": 0.00027596969619404457, + "loss": 0.0403, + "step": 845 + }, + { + "epoch": 0.55, + "grad_norm": 0.17450834810733795, + "learning_rate": 0.00027591369726203725, + "loss": 0.0586, + "step": 846 + }, + { + "epoch": 0.55, + "grad_norm": 0.10373177379369736, + "learning_rate": 0.0002758576388528645, + "loss": 0.0214, + "step": 847 + }, + { + "epoch": 0.56, + "grad_norm": 0.08164018392562866, + "learning_rate": 0.0002758015209930064, + "loss": 0.0229, + "step": 848 + }, + { + "epoch": 0.56, + "grad_norm": 0.07375165820121765, + "learning_rate": 0.000275745343708971, + "loss": 0.0333, + "step": 849 + }, + { + "epoch": 0.56, + "grad_norm": 0.09719602763652802, + "learning_rate": 0.0002756891070272945, + "loss": 0.0214, + "step": 850 + }, + { + "epoch": 0.56, + "grad_norm": 0.5595388412475586, + "learning_rate": 0.00027563281097454115, + "loss": 0.0657, + "step": 851 + }, + { + "epoch": 0.56, + "grad_norm": 0.10981204360723495, + "learning_rate": 0.0002755764555773031, + "loss": 0.0308, + "step": 852 + }, + { + "epoch": 0.56, + "grad_norm": 0.10418907552957535, + "learning_rate": 0.0002755200408622007, + "loss": 0.0238, + "step": 853 + }, + { + "epoch": 0.56, + "grad_norm": 0.0636146143078804, + "learning_rate": 0.0002754635668558822, + "loss": 0.0143, + "step": 854 + }, + { + "epoch": 0.56, + "grad_norm": 0.12179470807313919, + "learning_rate": 0.00027540703358502406, + "loss": 0.0393, + "step": 855 + }, + { + "epoch": 0.56, + "grad_norm": 0.07303999364376068, + "learning_rate": 0.00027535044107633046, + "loss": 0.0118, + "step": 856 + }, + { + "epoch": 0.56, + "grad_norm": 0.11226726323366165, + "learning_rate": 0.00027529378935653377, + "loss": 0.0356, + "step": 857 + }, + { + "epoch": 0.56, + "grad_norm": 0.16357053816318512, + "learning_rate": 0.0002752370784523942, + "loss": 0.0378, + "step": 858 + }, + { + "epoch": 0.56, + "grad_norm": 0.10425914824008942, + "learning_rate": 0.0002751803083907, + "loss": 0.0423, + "step": 859 + }, + { + "epoch": 0.56, + "grad_norm": 0.11986647546291351, + "learning_rate": 0.0002751234791982674, + "loss": 0.054, + "step": 860 + }, + { + "epoch": 0.56, + "grad_norm": 0.14440590143203735, + "learning_rate": 0.00027506659090194036, + "loss": 0.0418, + "step": 861 + }, + { + "epoch": 0.56, + "grad_norm": 0.21995751559734344, + "learning_rate": 0.0002750096435285909, + "loss": 0.0303, + "step": 862 + }, + { + "epoch": 0.56, + "grad_norm": 0.03415970876812935, + "learning_rate": 0.00027495263710511906, + "loss": 0.0084, + "step": 863 + }, + { + "epoch": 0.57, + "grad_norm": 0.052127208560705185, + "learning_rate": 0.0002748955716584526, + "loss": 0.0124, + "step": 864 + }, + { + "epoch": 0.57, + "grad_norm": 0.23270320892333984, + "learning_rate": 0.0002748384472155472, + "loss": 0.0501, + "step": 865 + }, + { + "epoch": 0.57, + "grad_norm": 0.05627870559692383, + "learning_rate": 0.00027478126380338645, + "loss": 0.0081, + "step": 866 + }, + { + "epoch": 0.57, + "grad_norm": 0.1844397783279419, + "learning_rate": 0.0002747240214489817, + "loss": 0.04, + "step": 867 + }, + { + "epoch": 0.57, + "grad_norm": 0.06833455711603165, + "learning_rate": 0.0002746667201793722, + "loss": 0.0136, + "step": 868 + }, + { + "epoch": 0.57, + "grad_norm": 0.03551473841071129, + "learning_rate": 0.00027460936002162513, + "loss": 0.0057, + "step": 869 + }, + { + "epoch": 0.57, + "grad_norm": 0.0920785516500473, + "learning_rate": 0.0002745519410028354, + "loss": 0.0103, + "step": 870 + }, + { + "epoch": 0.57, + "grad_norm": 0.1218150407075882, + "learning_rate": 0.0002744944631501256, + "loss": 0.0427, + "step": 871 + }, + { + "epoch": 0.57, + "grad_norm": 0.3496924042701721, + "learning_rate": 0.00027443692649064633, + "loss": 0.0686, + "step": 872 + }, + { + "epoch": 0.57, + "grad_norm": 0.3225466310977936, + "learning_rate": 0.00027437933105157585, + "loss": 0.0518, + "step": 873 + }, + { + "epoch": 0.57, + "grad_norm": 0.230736643075943, + "learning_rate": 0.00027432167686012015, + "loss": 0.0468, + "step": 874 + }, + { + "epoch": 0.57, + "grad_norm": 0.20991326868534088, + "learning_rate": 0.00027426396394351313, + "loss": 0.0595, + "step": 875 + }, + { + "epoch": 0.57, + "grad_norm": 0.10641276091337204, + "learning_rate": 0.0002742061923290162, + "loss": 0.0353, + "step": 876 + }, + { + "epoch": 0.57, + "grad_norm": 0.06472618877887726, + "learning_rate": 0.00027414836204391865, + "loss": 0.012, + "step": 877 + }, + { + "epoch": 0.57, + "grad_norm": 0.2291422188282013, + "learning_rate": 0.0002740904731155375, + "loss": 0.0431, + "step": 878 + }, + { + "epoch": 0.58, + "grad_norm": 0.26647308468818665, + "learning_rate": 0.0002740325255712175, + "loss": 0.1054, + "step": 879 + }, + { + "epoch": 0.58, + "grad_norm": 0.08363434672355652, + "learning_rate": 0.0002739745194383309, + "loss": 0.011, + "step": 880 + }, + { + "epoch": 0.58, + "grad_norm": 0.10943964123725891, + "learning_rate": 0.00027391645474427774, + "loss": 0.0331, + "step": 881 + }, + { + "epoch": 0.58, + "grad_norm": 0.2208610624074936, + "learning_rate": 0.0002738583315164857, + "loss": 0.0499, + "step": 882 + }, + { + "epoch": 0.58, + "grad_norm": 0.09434379637241364, + "learning_rate": 0.00027380014978241026, + "loss": 0.0268, + "step": 883 + }, + { + "epoch": 0.58, + "grad_norm": 0.13045388460159302, + "learning_rate": 0.0002737419095695343, + "loss": 0.0367, + "step": 884 + }, + { + "epoch": 0.58, + "grad_norm": 0.1460418850183487, + "learning_rate": 0.00027368361090536844, + "loss": 0.0662, + "step": 885 + }, + { + "epoch": 0.58, + "grad_norm": 0.08823563903570175, + "learning_rate": 0.000273625253817451, + "loss": 0.0387, + "step": 886 + }, + { + "epoch": 0.58, + "grad_norm": 0.08193490654230118, + "learning_rate": 0.00027356683833334766, + "loss": 0.0357, + "step": 887 + }, + { + "epoch": 0.58, + "grad_norm": 0.1274595856666565, + "learning_rate": 0.00027350836448065193, + "loss": 0.0346, + "step": 888 + }, + { + "epoch": 0.58, + "grad_norm": 0.061123717576265335, + "learning_rate": 0.0002734498322869847, + "loss": 0.0388, + "step": 889 + }, + { + "epoch": 0.58, + "grad_norm": 0.12708084285259247, + "learning_rate": 0.0002733912417799945, + "loss": 0.0276, + "step": 890 + }, + { + "epoch": 0.58, + "grad_norm": 0.055733684450387955, + "learning_rate": 0.00027333259298735756, + "loss": 0.0139, + "step": 891 + }, + { + "epoch": 0.58, + "grad_norm": 0.049776624888181686, + "learning_rate": 0.00027327388593677727, + "loss": 0.0141, + "step": 892 + }, + { + "epoch": 0.58, + "grad_norm": 0.1466546654701233, + "learning_rate": 0.000273215120655985, + "loss": 0.0424, + "step": 893 + }, + { + "epoch": 0.59, + "grad_norm": 0.04927024617791176, + "learning_rate": 0.00027315629717273915, + "loss": 0.0121, + "step": 894 + }, + { + "epoch": 0.59, + "grad_norm": 0.14217214286327362, + "learning_rate": 0.0002730974155148259, + "loss": 0.0365, + "step": 895 + }, + { + "epoch": 0.59, + "grad_norm": 0.06162632629275322, + "learning_rate": 0.00027303847571005904, + "loss": 0.0185, + "step": 896 + }, + { + "epoch": 0.59, + "grad_norm": 0.09187627583742142, + "learning_rate": 0.00027297947778627947, + "loss": 0.024, + "step": 897 + }, + { + "epoch": 0.59, + "grad_norm": 0.08694395422935486, + "learning_rate": 0.00027292042177135575, + "loss": 0.016, + "step": 898 + }, + { + "epoch": 0.59, + "grad_norm": 0.2407931238412857, + "learning_rate": 0.0002728613076931838, + "loss": 0.0895, + "step": 899 + }, + { + "epoch": 0.59, + "grad_norm": 0.11447851359844208, + "learning_rate": 0.0002728021355796871, + "loss": 0.0156, + "step": 900 + }, + { + "epoch": 0.59, + "grad_norm": 0.17052114009857178, + "learning_rate": 0.0002727429054588165, + "loss": 0.0686, + "step": 901 + }, + { + "epoch": 0.59, + "grad_norm": 0.11735350638628006, + "learning_rate": 0.0002726836173585501, + "loss": 0.0458, + "step": 902 + }, + { + "epoch": 0.59, + "grad_norm": 0.1015033945441246, + "learning_rate": 0.0002726242713068935, + "loss": 0.0396, + "step": 903 + }, + { + "epoch": 0.59, + "grad_norm": 0.09442136436700821, + "learning_rate": 0.00027256486733187975, + "loss": 0.0354, + "step": 904 + }, + { + "epoch": 0.59, + "grad_norm": 0.051811713725328445, + "learning_rate": 0.0002725054054615691, + "loss": 0.0103, + "step": 905 + }, + { + "epoch": 0.59, + "grad_norm": 0.09581268578767776, + "learning_rate": 0.00027244588572404924, + "loss": 0.0346, + "step": 906 + }, + { + "epoch": 0.59, + "grad_norm": 0.1265789121389389, + "learning_rate": 0.00027238630814743525, + "loss": 0.0296, + "step": 907 + }, + { + "epoch": 0.59, + "grad_norm": 0.11578807979822159, + "learning_rate": 0.0002723266727598694, + "loss": 0.0374, + "step": 908 + }, + { + "epoch": 0.6, + "grad_norm": 0.0634288564324379, + "learning_rate": 0.0002722669795895214, + "loss": 0.0211, + "step": 909 + }, + { + "epoch": 0.6, + "grad_norm": 0.10002614557743073, + "learning_rate": 0.0002722072286645881, + "loss": 0.0217, + "step": 910 + }, + { + "epoch": 0.6, + "grad_norm": 0.10582344233989716, + "learning_rate": 0.0002721474200132937, + "loss": 0.0262, + "step": 911 + }, + { + "epoch": 0.6, + "grad_norm": 0.20417608320713043, + "learning_rate": 0.0002720875536638898, + "loss": 0.0303, + "step": 912 + }, + { + "epoch": 0.6, + "grad_norm": 0.06233491376042366, + "learning_rate": 0.00027202762964465514, + "loss": 0.0179, + "step": 913 + }, + { + "epoch": 0.6, + "grad_norm": 0.10917846113443375, + "learning_rate": 0.00027196764798389557, + "loss": 0.0238, + "step": 914 + }, + { + "epoch": 0.6, + "grad_norm": 0.20902927219867706, + "learning_rate": 0.0002719076087099444, + "loss": 0.0744, + "step": 915 + }, + { + "epoch": 0.6, + "grad_norm": 0.07525712251663208, + "learning_rate": 0.000271847511851162, + "loss": 0.0145, + "step": 916 + }, + { + "epoch": 0.6, + "grad_norm": 0.13625741004943848, + "learning_rate": 0.0002717873574359361, + "loss": 0.0557, + "step": 917 + }, + { + "epoch": 0.6, + "grad_norm": 0.10275349766016006, + "learning_rate": 0.00027172714549268136, + "loss": 0.0156, + "step": 918 + }, + { + "epoch": 0.6, + "grad_norm": 0.07689966261386871, + "learning_rate": 0.0002716668760498399, + "loss": 0.0285, + "step": 919 + }, + { + "epoch": 0.6, + "grad_norm": 0.051624033600091934, + "learning_rate": 0.00027160654913588073, + "loss": 0.0109, + "step": 920 + }, + { + "epoch": 0.6, + "grad_norm": 0.1263073831796646, + "learning_rate": 0.0002715461647793003, + "loss": 0.03, + "step": 921 + }, + { + "epoch": 0.6, + "grad_norm": 0.03605236858129501, + "learning_rate": 0.0002714857230086219, + "loss": 0.008, + "step": 922 + }, + { + "epoch": 0.6, + "grad_norm": 0.09554066509008408, + "learning_rate": 0.0002714252238523962, + "loss": 0.0276, + "step": 923 + }, + { + "epoch": 0.6, + "grad_norm": 0.12727093696594238, + "learning_rate": 0.0002713646673392008, + "loss": 0.0365, + "step": 924 + }, + { + "epoch": 0.61, + "grad_norm": 0.21029303967952728, + "learning_rate": 0.00027130405349764044, + "loss": 0.0554, + "step": 925 + }, + { + "epoch": 0.61, + "grad_norm": 0.10958801954984665, + "learning_rate": 0.00027124338235634695, + "loss": 0.032, + "step": 926 + }, + { + "epoch": 0.61, + "grad_norm": 0.06557829678058624, + "learning_rate": 0.0002711826539439792, + "loss": 0.0145, + "step": 927 + }, + { + "epoch": 0.61, + "grad_norm": 0.0530441552400589, + "learning_rate": 0.0002711218682892232, + "loss": 0.014, + "step": 928 + }, + { + "epoch": 0.61, + "grad_norm": 0.11874904483556747, + "learning_rate": 0.00027106102542079195, + "loss": 0.0144, + "step": 929 + }, + { + "epoch": 0.61, + "grad_norm": 0.07747121155261993, + "learning_rate": 0.0002710001253674254, + "loss": 0.0136, + "step": 930 + }, + { + "epoch": 0.61, + "grad_norm": 0.055583804845809937, + "learning_rate": 0.0002709391681578906, + "loss": 0.013, + "step": 931 + }, + { + "epoch": 0.61, + "grad_norm": 0.06069410964846611, + "learning_rate": 0.0002708781538209815, + "loss": 0.0076, + "step": 932 + }, + { + "epoch": 0.61, + "grad_norm": 0.019891362637281418, + "learning_rate": 0.00027081708238551927, + "loss": 0.0038, + "step": 933 + }, + { + "epoch": 0.61, + "grad_norm": 0.1343265175819397, + "learning_rate": 0.00027075595388035173, + "loss": 0.0307, + "step": 934 + }, + { + "epoch": 0.61, + "grad_norm": 0.04620016738772392, + "learning_rate": 0.00027069476833435397, + "loss": 0.0048, + "step": 935 + }, + { + "epoch": 0.61, + "grad_norm": 0.1706463247537613, + "learning_rate": 0.00027063352577642776, + "loss": 0.0643, + "step": 936 + }, + { + "epoch": 0.61, + "grad_norm": 0.058014389127492905, + "learning_rate": 0.0002705722262355019, + "loss": 0.0081, + "step": 937 + }, + { + "epoch": 0.61, + "grad_norm": 0.11744493991136551, + "learning_rate": 0.0002705108697405322, + "loss": 0.0308, + "step": 938 + }, + { + "epoch": 0.61, + "grad_norm": 0.08099761605262756, + "learning_rate": 0.00027044945632050127, + "loss": 0.0052, + "step": 939 + }, + { + "epoch": 0.62, + "grad_norm": 0.29563236236572266, + "learning_rate": 0.00027038798600441865, + "loss": 0.0529, + "step": 940 + }, + { + "epoch": 0.62, + "grad_norm": 0.043802157044410706, + "learning_rate": 0.0002703264588213206, + "loss": 0.0071, + "step": 941 + }, + { + "epoch": 0.62, + "grad_norm": 0.12684734165668488, + "learning_rate": 0.00027026487480027057, + "loss": 0.0433, + "step": 942 + }, + { + "epoch": 0.62, + "grad_norm": 0.21014286577701569, + "learning_rate": 0.00027020323397035855, + "loss": 0.028, + "step": 943 + }, + { + "epoch": 0.62, + "grad_norm": 0.11645261198282242, + "learning_rate": 0.00027014153636070157, + "loss": 0.0178, + "step": 944 + }, + { + "epoch": 0.62, + "grad_norm": 0.16726157069206238, + "learning_rate": 0.00027007978200044324, + "loss": 0.0508, + "step": 945 + }, + { + "epoch": 0.62, + "grad_norm": 0.10064594447612762, + "learning_rate": 0.0002700179709187543, + "loss": 0.0239, + "step": 946 + }, + { + "epoch": 0.62, + "grad_norm": 0.060703571885824203, + "learning_rate": 0.00026995610314483205, + "loss": 0.0103, + "step": 947 + }, + { + "epoch": 0.62, + "grad_norm": 0.0527808852493763, + "learning_rate": 0.0002698941787079006, + "loss": 0.0178, + "step": 948 + }, + { + "epoch": 0.62, + "grad_norm": 0.08081556856632233, + "learning_rate": 0.00026983219763721086, + "loss": 0.0157, + "step": 949 + }, + { + "epoch": 0.62, + "grad_norm": 0.12985916435718536, + "learning_rate": 0.00026977015996204054, + "loss": 0.0575, + "step": 950 + }, + { + "epoch": 0.62, + "grad_norm": 0.15043164789676666, + "learning_rate": 0.00026970806571169397, + "loss": 0.0302, + "step": 951 + }, + { + "epoch": 0.62, + "grad_norm": 0.024910060688853264, + "learning_rate": 0.00026964591491550235, + "loss": 0.0045, + "step": 952 + }, + { + "epoch": 0.62, + "grad_norm": 0.10944465547800064, + "learning_rate": 0.00026958370760282345, + "loss": 0.0574, + "step": 953 + }, + { + "epoch": 0.62, + "grad_norm": 0.114822618663311, + "learning_rate": 0.0002695214438030418, + "loss": 0.0262, + "step": 954 + }, + { + "epoch": 0.63, + "grad_norm": 0.15373332798480988, + "learning_rate": 0.0002694591235455687, + "loss": 0.0206, + "step": 955 + }, + { + "epoch": 0.63, + "grad_norm": 0.14427144825458527, + "learning_rate": 0.0002693967468598419, + "loss": 0.0508, + "step": 956 + }, + { + "epoch": 0.63, + "grad_norm": 0.0668393075466156, + "learning_rate": 0.000269334313775326, + "loss": 0.0195, + "step": 957 + }, + { + "epoch": 0.63, + "grad_norm": 0.06797386705875397, + "learning_rate": 0.00026927182432151216, + "loss": 0.0081, + "step": 958 + }, + { + "epoch": 0.63, + "grad_norm": 0.21059945225715637, + "learning_rate": 0.00026920927852791825, + "loss": 0.1075, + "step": 959 + }, + { + "epoch": 0.63, + "grad_norm": 0.10499881953001022, + "learning_rate": 0.0002691466764240886, + "loss": 0.0111, + "step": 960 + }, + { + "epoch": 0.63, + "grad_norm": 0.033115822821855545, + "learning_rate": 0.00026908401803959423, + "loss": 0.0054, + "step": 961 + }, + { + "epoch": 0.63, + "grad_norm": 0.2655697464942932, + "learning_rate": 0.0002690213034040328, + "loss": 0.0455, + "step": 962 + }, + { + "epoch": 0.63, + "grad_norm": 0.1976163387298584, + "learning_rate": 0.0002689585325470284, + "loss": 0.0454, + "step": 963 + }, + { + "epoch": 0.63, + "grad_norm": 0.05260282754898071, + "learning_rate": 0.00026889570549823184, + "loss": 0.0275, + "step": 964 + }, + { + "epoch": 0.63, + "grad_norm": 0.1485443115234375, + "learning_rate": 0.0002688328222873203, + "loss": 0.0191, + "step": 965 + }, + { + "epoch": 0.63, + "grad_norm": 0.0436883270740509, + "learning_rate": 0.0002687698829439977, + "loss": 0.0099, + "step": 966 + }, + { + "epoch": 0.63, + "grad_norm": 0.12818527221679688, + "learning_rate": 0.00026870688749799416, + "loss": 0.0323, + "step": 967 + }, + { + "epoch": 0.63, + "grad_norm": 0.14603693783283234, + "learning_rate": 0.0002686438359790667, + "loss": 0.0541, + "step": 968 + }, + { + "epoch": 0.63, + "grad_norm": 0.09324526786804199, + "learning_rate": 0.00026858072841699847, + "loss": 0.0272, + "step": 969 + }, + { + "epoch": 0.64, + "grad_norm": 0.26789504289627075, + "learning_rate": 0.0002685175648415994, + "loss": 0.0503, + "step": 970 + }, + { + "epoch": 0.64, + "grad_norm": 0.059855278581380844, + "learning_rate": 0.0002684543452827056, + "loss": 0.0136, + "step": 971 + }, + { + "epoch": 0.64, + "grad_norm": 0.08910810202360153, + "learning_rate": 0.00026839106977017974, + "loss": 0.016, + "step": 972 + }, + { + "epoch": 0.64, + "grad_norm": 0.09903378039598465, + "learning_rate": 0.000268327738333911, + "loss": 0.0307, + "step": 973 + }, + { + "epoch": 0.64, + "grad_norm": 0.16080208122730255, + "learning_rate": 0.00026826435100381487, + "loss": 0.0318, + "step": 974 + }, + { + "epoch": 0.64, + "grad_norm": 0.09495270997285843, + "learning_rate": 0.0002682009078098333, + "loss": 0.0591, + "step": 975 + }, + { + "epoch": 0.64, + "grad_norm": 0.11322695016860962, + "learning_rate": 0.00026813740878193457, + "loss": 0.047, + "step": 976 + }, + { + "epoch": 0.64, + "grad_norm": 0.06805938482284546, + "learning_rate": 0.0002680738539501134, + "loss": 0.0337, + "step": 977 + }, + { + "epoch": 0.64, + "grad_norm": 0.18398675322532654, + "learning_rate": 0.00026801024334439076, + "loss": 0.0653, + "step": 978 + }, + { + "epoch": 0.64, + "grad_norm": 0.09730216860771179, + "learning_rate": 0.00026794657699481415, + "loss": 0.0463, + "step": 979 + }, + { + "epoch": 0.64, + "grad_norm": 0.0954691618680954, + "learning_rate": 0.0002678828549314573, + "loss": 0.0199, + "step": 980 + }, + { + "epoch": 0.64, + "grad_norm": 0.15214982628822327, + "learning_rate": 0.00026781907718442013, + "loss": 0.0606, + "step": 981 + }, + { + "epoch": 0.64, + "grad_norm": 0.07308922708034515, + "learning_rate": 0.00026775524378382906, + "loss": 0.0229, + "step": 982 + }, + { + "epoch": 0.64, + "grad_norm": 0.1865328997373581, + "learning_rate": 0.00026769135475983676, + "loss": 0.0617, + "step": 983 + }, + { + "epoch": 0.64, + "grad_norm": 0.0670800730586052, + "learning_rate": 0.0002676274101426221, + "loss": 0.0213, + "step": 984 + }, + { + "epoch": 0.64, + "grad_norm": 0.09108185768127441, + "learning_rate": 0.0002675634099623903, + "loss": 0.0163, + "step": 985 + }, + { + "epoch": 0.65, + "grad_norm": 0.09892558306455612, + "learning_rate": 0.0002674993542493727, + "loss": 0.0398, + "step": 986 + }, + { + "epoch": 0.65, + "grad_norm": 0.18465696275234222, + "learning_rate": 0.00026743524303382695, + "loss": 0.0456, + "step": 987 + }, + { + "epoch": 0.65, + "grad_norm": 0.14701491594314575, + "learning_rate": 0.000267371076346037, + "loss": 0.0217, + "step": 988 + }, + { + "epoch": 0.65, + "grad_norm": 0.22119949758052826, + "learning_rate": 0.0002673068542163128, + "loss": 0.0337, + "step": 989 + }, + { + "epoch": 0.65, + "grad_norm": 0.07329166680574417, + "learning_rate": 0.0002672425766749907, + "loss": 0.0077, + "step": 990 + }, + { + "epoch": 0.65, + "grad_norm": 0.08214308321475983, + "learning_rate": 0.0002671782437524331, + "loss": 0.0086, + "step": 991 + }, + { + "epoch": 0.65, + "grad_norm": 0.16395068168640137, + "learning_rate": 0.0002671138554790286, + "loss": 0.0511, + "step": 992 + }, + { + "epoch": 0.65, + "grad_norm": 0.07903768122196198, + "learning_rate": 0.0002670494118851919, + "loss": 0.0227, + "step": 993 + }, + { + "epoch": 0.65, + "grad_norm": 0.044391512870788574, + "learning_rate": 0.0002669849130013639, + "loss": 0.0062, + "step": 994 + }, + { + "epoch": 0.65, + "grad_norm": 0.11790774017572403, + "learning_rate": 0.0002669203588580116, + "loss": 0.0586, + "step": 995 + }, + { + "epoch": 0.65, + "grad_norm": 0.023213036358356476, + "learning_rate": 0.000266855749485628, + "loss": 0.004, + "step": 996 + }, + { + "epoch": 0.65, + "grad_norm": 0.1801631897687912, + "learning_rate": 0.0002667910849147324, + "loss": 0.0273, + "step": 997 + }, + { + "epoch": 0.65, + "grad_norm": 0.3998229205608368, + "learning_rate": 0.00026672636517587, + "loss": 0.0479, + "step": 998 + }, + { + "epoch": 0.65, + "grad_norm": 0.08344905078411102, + "learning_rate": 0.0002666615902996121, + "loss": 0.0066, + "step": 999 + }, + { + "epoch": 0.65, + "grad_norm": 0.4904734194278717, + "learning_rate": 0.00026659676031655605, + "loss": 0.107, + "step": 1000 + }, + { + "epoch": 0.66, + "grad_norm": 0.14752142131328583, + "learning_rate": 0.00026653187525732525, + "loss": 0.0567, + "step": 1001 + }, + { + "epoch": 0.66, + "grad_norm": 0.09572061896324158, + "learning_rate": 0.0002664669351525691, + "loss": 0.045, + "step": 1002 + }, + { + "epoch": 0.66, + "grad_norm": 0.1489264965057373, + "learning_rate": 0.00026640194003296297, + "loss": 0.0181, + "step": 1003 + }, + { + "epoch": 0.66, + "grad_norm": 0.06828869134187698, + "learning_rate": 0.00026633688992920833, + "loss": 0.0204, + "step": 1004 + }, + { + "epoch": 0.66, + "grad_norm": 0.08580945432186127, + "learning_rate": 0.00026627178487203244, + "loss": 0.0275, + "step": 1005 + }, + { + "epoch": 0.66, + "grad_norm": 0.2796219289302826, + "learning_rate": 0.00026620662489218867, + "loss": 0.06, + "step": 1006 + }, + { + "epoch": 0.66, + "grad_norm": 0.19413504004478455, + "learning_rate": 0.0002661414100204563, + "loss": 0.048, + "step": 1007 + }, + { + "epoch": 0.66, + "grad_norm": 0.0517372228205204, + "learning_rate": 0.0002660761402876405, + "loss": 0.0192, + "step": 1008 + }, + { + "epoch": 0.66, + "grad_norm": 0.12586665153503418, + "learning_rate": 0.0002660108157245724, + "loss": 0.064, + "step": 1009 + }, + { + "epoch": 0.66, + "grad_norm": 0.055950991809368134, + "learning_rate": 0.000265945436362109, + "loss": 0.0128, + "step": 1010 + }, + { + "epoch": 0.66, + "grad_norm": 0.03763822093605995, + "learning_rate": 0.00026588000223113316, + "loss": 0.0107, + "step": 1011 + }, + { + "epoch": 0.66, + "grad_norm": 0.20842203497886658, + "learning_rate": 0.00026581451336255365, + "loss": 0.0668, + "step": 1012 + }, + { + "epoch": 0.66, + "grad_norm": 0.077543243765831, + "learning_rate": 0.00026574896978730515, + "loss": 0.0218, + "step": 1013 + }, + { + "epoch": 0.66, + "grad_norm": 0.13783104717731476, + "learning_rate": 0.0002656833715363481, + "loss": 0.0431, + "step": 1014 + }, + { + "epoch": 0.66, + "grad_norm": 0.049275536090135574, + "learning_rate": 0.0002656177186406687, + "loss": 0.012, + "step": 1015 + }, + { + "epoch": 0.67, + "grad_norm": 0.10721635073423386, + "learning_rate": 0.00026555201113127907, + "loss": 0.0392, + "step": 1016 + }, + { + "epoch": 0.67, + "grad_norm": 0.1177641823887825, + "learning_rate": 0.0002654862490392172, + "loss": 0.0416, + "step": 1017 + }, + { + "epoch": 0.67, + "grad_norm": 0.1034293919801712, + "learning_rate": 0.00026542043239554677, + "loss": 0.0262, + "step": 1018 + }, + { + "epoch": 0.67, + "grad_norm": 0.05769471079111099, + "learning_rate": 0.0002653545612313571, + "loss": 0.0088, + "step": 1019 + }, + { + "epoch": 0.67, + "grad_norm": 0.2152629792690277, + "learning_rate": 0.0002652886355777635, + "loss": 0.0709, + "step": 1020 + }, + { + "epoch": 0.67, + "grad_norm": 0.0717998817563057, + "learning_rate": 0.0002652226554659069, + "loss": 0.0135, + "step": 1021 + }, + { + "epoch": 0.67, + "grad_norm": 0.24547475576400757, + "learning_rate": 0.0002651566209269539, + "loss": 0.0627, + "step": 1022 + }, + { + "epoch": 0.67, + "grad_norm": 0.17455288767814636, + "learning_rate": 0.00026509053199209697, + "loss": 0.0466, + "step": 1023 + }, + { + "epoch": 0.67, + "grad_norm": 0.08559072017669678, + "learning_rate": 0.0002650243886925541, + "loss": 0.0306, + "step": 1024 + }, + { + "epoch": 0.67, + "grad_norm": 0.16568362712860107, + "learning_rate": 0.0002649581910595691, + "loss": 0.0272, + "step": 1025 + }, + { + "epoch": 0.67, + "grad_norm": 0.14109772443771362, + "learning_rate": 0.00026489193912441133, + "loss": 0.0241, + "step": 1026 + }, + { + "epoch": 0.67, + "grad_norm": 0.12116571515798569, + "learning_rate": 0.00026482563291837586, + "loss": 0.0216, + "step": 1027 + }, + { + "epoch": 0.67, + "grad_norm": 0.1847831755876541, + "learning_rate": 0.0002647592724727835, + "loss": 0.046, + "step": 1028 + }, + { + "epoch": 0.67, + "grad_norm": 0.1964387595653534, + "learning_rate": 0.0002646928578189803, + "loss": 0.0223, + "step": 1029 + }, + { + "epoch": 0.67, + "grad_norm": 0.17670650780200958, + "learning_rate": 0.0002646263889883385, + "loss": 0.0392, + "step": 1030 + }, + { + "epoch": 0.67, + "grad_norm": 0.3018537759780884, + "learning_rate": 0.00026455986601225544, + "loss": 0.0601, + "step": 1031 + }, + { + "epoch": 0.68, + "grad_norm": 0.16954761743545532, + "learning_rate": 0.0002644932889221543, + "loss": 0.0568, + "step": 1032 + }, + { + "epoch": 0.68, + "grad_norm": 0.07362630218267441, + "learning_rate": 0.0002644266577494837, + "loss": 0.0173, + "step": 1033 + }, + { + "epoch": 0.68, + "grad_norm": 0.22263336181640625, + "learning_rate": 0.0002643599725257178, + "loss": 0.0528, + "step": 1034 + }, + { + "epoch": 0.68, + "grad_norm": 0.1654106080532074, + "learning_rate": 0.00026429323328235635, + "loss": 0.0264, + "step": 1035 + }, + { + "epoch": 0.68, + "grad_norm": 0.16433385014533997, + "learning_rate": 0.0002642264400509247, + "loss": 0.0403, + "step": 1036 + }, + { + "epoch": 0.68, + "grad_norm": 0.16119657456874847, + "learning_rate": 0.0002641595928629735, + "loss": 0.0517, + "step": 1037 + }, + { + "epoch": 0.68, + "grad_norm": 0.06720812618732452, + "learning_rate": 0.00026409269175007904, + "loss": 0.0275, + "step": 1038 + }, + { + "epoch": 0.68, + "grad_norm": 0.08320458233356476, + "learning_rate": 0.000264025736743843, + "loss": 0.0254, + "step": 1039 + }, + { + "epoch": 0.68, + "grad_norm": 0.10702455043792725, + "learning_rate": 0.00026395872787589254, + "loss": 0.0173, + "step": 1040 + }, + { + "epoch": 0.68, + "grad_norm": 0.1805281639099121, + "learning_rate": 0.0002638916651778803, + "loss": 0.0526, + "step": 1041 + }, + { + "epoch": 0.68, + "grad_norm": 0.1021476462483406, + "learning_rate": 0.0002638245486814843, + "loss": 0.0206, + "step": 1042 + }, + { + "epoch": 0.68, + "grad_norm": 0.0951414480805397, + "learning_rate": 0.00026375737841840803, + "loss": 0.0165, + "step": 1043 + }, + { + "epoch": 0.68, + "grad_norm": 0.07957201451063156, + "learning_rate": 0.0002636901544203804, + "loss": 0.0205, + "step": 1044 + }, + { + "epoch": 0.68, + "grad_norm": 0.1612643599510193, + "learning_rate": 0.0002636228767191555, + "loss": 0.0426, + "step": 1045 + }, + { + "epoch": 0.68, + "grad_norm": 0.06410415470600128, + "learning_rate": 0.00026355554534651296, + "loss": 0.0138, + "step": 1046 + }, + { + "epoch": 0.69, + "grad_norm": 0.06664423644542694, + "learning_rate": 0.0002634881603342578, + "loss": 0.0158, + "step": 1047 + }, + { + "epoch": 0.69, + "grad_norm": 0.07890690118074417, + "learning_rate": 0.0002634207217142203, + "loss": 0.0582, + "step": 1048 + }, + { + "epoch": 0.69, + "grad_norm": 0.14806897938251495, + "learning_rate": 0.000263353229518256, + "loss": 0.059, + "step": 1049 + }, + { + "epoch": 0.69, + "grad_norm": 0.08011514693498611, + "learning_rate": 0.00026328568377824587, + "loss": 0.0114, + "step": 1050 + }, + { + "epoch": 0.69, + "grad_norm": 0.2250976264476776, + "learning_rate": 0.00026321808452609615, + "loss": 0.0563, + "step": 1051 + }, + { + "epoch": 0.69, + "grad_norm": 0.12238743901252747, + "learning_rate": 0.0002631504317937383, + "loss": 0.027, + "step": 1052 + }, + { + "epoch": 0.69, + "grad_norm": 0.21183420717716217, + "learning_rate": 0.00026308272561312903, + "loss": 0.0975, + "step": 1053 + }, + { + "epoch": 0.69, + "grad_norm": 0.03879234194755554, + "learning_rate": 0.0002630149660162505, + "loss": 0.0079, + "step": 1054 + }, + { + "epoch": 0.69, + "grad_norm": 0.0885310247540474, + "learning_rate": 0.0002629471530351097, + "loss": 0.0345, + "step": 1055 + }, + { + "epoch": 0.69, + "grad_norm": 0.15572543442249298, + "learning_rate": 0.0002628792867017392, + "loss": 0.0418, + "step": 1056 + }, + { + "epoch": 0.69, + "grad_norm": 0.05571586638689041, + "learning_rate": 0.00026281136704819674, + "loss": 0.0148, + "step": 1057 + }, + { + "epoch": 0.69, + "grad_norm": 0.15888281166553497, + "learning_rate": 0.000262743394106565, + "loss": 0.0423, + "step": 1058 + }, + { + "epoch": 0.69, + "grad_norm": 0.06525658816099167, + "learning_rate": 0.0002626753679089521, + "loss": 0.0179, + "step": 1059 + }, + { + "epoch": 0.69, + "grad_norm": 0.12204741686582565, + "learning_rate": 0.0002626072884874911, + "loss": 0.025, + "step": 1060 + }, + { + "epoch": 0.69, + "grad_norm": 0.08302486687898636, + "learning_rate": 0.00026253915587434035, + "loss": 0.0346, + "step": 1061 + }, + { + "epoch": 0.7, + "grad_norm": 0.1839776635169983, + "learning_rate": 0.0002624709701016833, + "loss": 0.0328, + "step": 1062 + }, + { + "epoch": 0.7, + "grad_norm": 0.07696244865655899, + "learning_rate": 0.0002624027312017285, + "loss": 0.0133, + "step": 1063 + }, + { + "epoch": 0.7, + "grad_norm": 0.13141821324825287, + "learning_rate": 0.0002623344392067096, + "loss": 0.0776, + "step": 1064 + }, + { + "epoch": 0.7, + "grad_norm": 0.10801652073860168, + "learning_rate": 0.00026226609414888523, + "loss": 0.0308, + "step": 1065 + }, + { + "epoch": 0.7, + "grad_norm": 0.11759611964225769, + "learning_rate": 0.00026219769606053927, + "loss": 0.0555, + "step": 1066 + }, + { + "epoch": 0.7, + "grad_norm": 0.27231448888778687, + "learning_rate": 0.00026212924497398044, + "loss": 0.1241, + "step": 1067 + }, + { + "epoch": 0.7, + "grad_norm": 0.05362692102789879, + "learning_rate": 0.00026206074092154276, + "loss": 0.0345, + "step": 1068 + }, + { + "epoch": 0.7, + "grad_norm": 0.09251823276281357, + "learning_rate": 0.0002619921839355849, + "loss": 0.0423, + "step": 1069 + }, + { + "epoch": 0.7, + "grad_norm": 0.09250043332576752, + "learning_rate": 0.000261923574048491, + "loss": 0.0433, + "step": 1070 + }, + { + "epoch": 0.7, + "grad_norm": 0.04056562855839729, + "learning_rate": 0.0002618549112926698, + "loss": 0.0134, + "step": 1071 + }, + { + "epoch": 0.7, + "grad_norm": 0.04624701663851738, + "learning_rate": 0.0002617861957005551, + "loss": 0.0204, + "step": 1072 + }, + { + "epoch": 0.7, + "grad_norm": 0.09492779523134232, + "learning_rate": 0.00026171742730460583, + "loss": 0.0252, + "step": 1073 + }, + { + "epoch": 0.7, + "grad_norm": 0.07661382853984833, + "learning_rate": 0.00026164860613730567, + "loss": 0.0164, + "step": 1074 + }, + { + "epoch": 0.7, + "grad_norm": 0.2870001196861267, + "learning_rate": 0.0002615797322311633, + "loss": 0.0362, + "step": 1075 + }, + { + "epoch": 0.7, + "grad_norm": 0.0941600501537323, + "learning_rate": 0.0002615108056187123, + "loss": 0.0277, + "step": 1076 + }, + { + "epoch": 0.71, + "grad_norm": 0.09941410273313522, + "learning_rate": 0.00026144182633251127, + "loss": 0.0271, + "step": 1077 + }, + { + "epoch": 0.71, + "grad_norm": 0.06893230229616165, + "learning_rate": 0.0002613727944051434, + "loss": 0.0264, + "step": 1078 + }, + { + "epoch": 0.71, + "grad_norm": 0.09225239604711533, + "learning_rate": 0.00026130370986921707, + "loss": 0.0124, + "step": 1079 + }, + { + "epoch": 0.71, + "grad_norm": 0.13335295021533966, + "learning_rate": 0.0002612345727573653, + "loss": 0.0658, + "step": 1080 + }, + { + "epoch": 0.71, + "grad_norm": 0.08353302627801895, + "learning_rate": 0.000261165383102246, + "loss": 0.0168, + "step": 1081 + }, + { + "epoch": 0.71, + "grad_norm": 0.16986088454723358, + "learning_rate": 0.00026109614093654195, + "loss": 0.0857, + "step": 1082 + }, + { + "epoch": 0.71, + "grad_norm": 0.07607953995466232, + "learning_rate": 0.00026102684629296065, + "loss": 0.01, + "step": 1083 + }, + { + "epoch": 0.71, + "grad_norm": 0.1080528199672699, + "learning_rate": 0.00026095749920423446, + "loss": 0.0605, + "step": 1084 + }, + { + "epoch": 0.71, + "grad_norm": 0.14226533472537994, + "learning_rate": 0.0002608880997031205, + "loss": 0.0323, + "step": 1085 + }, + { + "epoch": 0.71, + "grad_norm": 0.0267617329955101, + "learning_rate": 0.0002608186478224006, + "loss": 0.0046, + "step": 1086 + }, + { + "epoch": 0.71, + "grad_norm": 0.05727904289960861, + "learning_rate": 0.00026074914359488143, + "loss": 0.0111, + "step": 1087 + }, + { + "epoch": 0.71, + "grad_norm": 0.08158308267593384, + "learning_rate": 0.0002606795870533942, + "loss": 0.0227, + "step": 1088 + }, + { + "epoch": 0.71, + "grad_norm": 0.17422080039978027, + "learning_rate": 0.00026060997823079506, + "loss": 0.0583, + "step": 1089 + }, + { + "epoch": 0.71, + "grad_norm": 0.19084464013576508, + "learning_rate": 0.0002605403171599647, + "loss": 0.0736, + "step": 1090 + }, + { + "epoch": 0.71, + "grad_norm": 0.10208334028720856, + "learning_rate": 0.00026047060387380855, + "loss": 0.021, + "step": 1091 + }, + { + "epoch": 0.71, + "grad_norm": 0.13515685498714447, + "learning_rate": 0.0002604008384052568, + "loss": 0.0319, + "step": 1092 + }, + { + "epoch": 0.72, + "grad_norm": 0.13729439675807953, + "learning_rate": 0.00026033102078726393, + "loss": 0.0292, + "step": 1093 + }, + { + "epoch": 0.72, + "grad_norm": 0.10295616090297699, + "learning_rate": 0.0002602611510528095, + "loss": 0.0133, + "step": 1094 + }, + { + "epoch": 0.72, + "grad_norm": 0.14003846049308777, + "learning_rate": 0.0002601912292348975, + "loss": 0.0413, + "step": 1095 + }, + { + "epoch": 0.72, + "grad_norm": 0.22413985431194305, + "learning_rate": 0.0002601212553665564, + "loss": 0.0242, + "step": 1096 + }, + { + "epoch": 0.72, + "grad_norm": 0.13832725584506989, + "learning_rate": 0.0002600512294808395, + "loss": 0.0353, + "step": 1097 + }, + { + "epoch": 0.72, + "grad_norm": 0.29502683877944946, + "learning_rate": 0.0002599811516108245, + "loss": 0.0362, + "step": 1098 + }, + { + "epoch": 0.72, + "grad_norm": 0.09490124136209488, + "learning_rate": 0.00025991102178961366, + "loss": 0.014, + "step": 1099 + }, + { + "epoch": 0.72, + "grad_norm": 0.1145247370004654, + "learning_rate": 0.0002598408400503339, + "loss": 0.0294, + "step": 1100 + }, + { + "epoch": 0.72, + "grad_norm": 0.38977229595184326, + "learning_rate": 0.00025977060642613645, + "loss": 0.0827, + "step": 1101 + }, + { + "epoch": 0.72, + "grad_norm": 0.10398557782173157, + "learning_rate": 0.0002597003209501973, + "loss": 0.0176, + "step": 1102 + }, + { + "epoch": 0.72, + "grad_norm": 0.13759955763816833, + "learning_rate": 0.0002596299836557168, + "loss": 0.0428, + "step": 1103 + }, + { + "epoch": 0.72, + "grad_norm": 0.05294102802872658, + "learning_rate": 0.0002595595945759198, + "loss": 0.013, + "step": 1104 + }, + { + "epoch": 0.72, + "grad_norm": 0.2116420418024063, + "learning_rate": 0.0002594891537440556, + "loss": 0.0416, + "step": 1105 + }, + { + "epoch": 0.72, + "grad_norm": 0.0850871354341507, + "learning_rate": 0.00025941866119339786, + "loss": 0.0264, + "step": 1106 + }, + { + "epoch": 0.72, + "grad_norm": 0.04429350420832634, + "learning_rate": 0.00025934811695724484, + "loss": 0.0088, + "step": 1107 + }, + { + "epoch": 0.73, + "grad_norm": 0.0578470379114151, + "learning_rate": 0.0002592775210689192, + "loss": 0.0295, + "step": 1108 + }, + { + "epoch": 0.73, + "grad_norm": 0.1103309616446495, + "learning_rate": 0.00025920687356176784, + "loss": 0.0154, + "step": 1109 + }, + { + "epoch": 0.73, + "grad_norm": 0.09454017877578735, + "learning_rate": 0.0002591361744691622, + "loss": 0.025, + "step": 1110 + }, + { + "epoch": 0.73, + "grad_norm": 0.19059227406978607, + "learning_rate": 0.0002590654238244979, + "loss": 0.0599, + "step": 1111 + }, + { + "epoch": 0.73, + "grad_norm": 0.08629673719406128, + "learning_rate": 0.0002589946216611952, + "loss": 0.0151, + "step": 1112 + }, + { + "epoch": 0.73, + "grad_norm": 0.18637306988239288, + "learning_rate": 0.0002589237680126984, + "loss": 0.0496, + "step": 1113 + }, + { + "epoch": 0.73, + "grad_norm": 0.12386718392372131, + "learning_rate": 0.00025885286291247634, + "loss": 0.0269, + "step": 1114 + }, + { + "epoch": 0.73, + "grad_norm": 0.18383803963661194, + "learning_rate": 0.00025878190639402204, + "loss": 0.0408, + "step": 1115 + }, + { + "epoch": 0.73, + "grad_norm": 0.24928437173366547, + "learning_rate": 0.0002587108984908528, + "loss": 0.0254, + "step": 1116 + }, + { + "epoch": 0.73, + "grad_norm": 0.023719167336821556, + "learning_rate": 0.00025863983923651027, + "loss": 0.0037, + "step": 1117 + }, + { + "epoch": 0.73, + "grad_norm": 0.16337376832962036, + "learning_rate": 0.00025856872866456037, + "loss": 0.0529, + "step": 1118 + }, + { + "epoch": 0.73, + "grad_norm": 0.11658964306116104, + "learning_rate": 0.00025849756680859317, + "loss": 0.063, + "step": 1119 + }, + { + "epoch": 0.73, + "grad_norm": 0.20387554168701172, + "learning_rate": 0.000258426353702223, + "loss": 0.0605, + "step": 1120 + }, + { + "epoch": 0.73, + "grad_norm": 0.2778151035308838, + "learning_rate": 0.0002583550893790885, + "loss": 0.0476, + "step": 1121 + }, + { + "epoch": 0.73, + "grad_norm": 0.11449744552373886, + "learning_rate": 0.0002582837738728522, + "loss": 0.0315, + "step": 1122 + }, + { + "epoch": 0.74, + "grad_norm": 0.10286298394203186, + "learning_rate": 0.00025821240721720116, + "loss": 0.041, + "step": 1123 + }, + { + "epoch": 0.74, + "grad_norm": 0.11522707343101501, + "learning_rate": 0.00025814098944584645, + "loss": 0.0414, + "step": 1124 + }, + { + "epoch": 0.74, + "grad_norm": 0.06536536663770676, + "learning_rate": 0.0002580695205925233, + "loss": 0.0216, + "step": 1125 + }, + { + "epoch": 0.74, + "grad_norm": 0.0686458870768547, + "learning_rate": 0.00025799800069099105, + "loss": 0.0667, + "step": 1126 + }, + { + "epoch": 0.74, + "grad_norm": 0.07378174364566803, + "learning_rate": 0.0002579264297750331, + "loss": 0.018, + "step": 1127 + }, + { + "epoch": 0.74, + "grad_norm": 0.05744575336575508, + "learning_rate": 0.0002578548078784571, + "loss": 0.0328, + "step": 1128 + }, + { + "epoch": 0.74, + "grad_norm": 0.1781056821346283, + "learning_rate": 0.0002577831350350947, + "loss": 0.056, + "step": 1129 + }, + { + "epoch": 0.74, + "grad_norm": 0.11974502354860306, + "learning_rate": 0.0002577114112788016, + "loss": 0.0411, + "step": 1130 + }, + { + "epoch": 0.74, + "grad_norm": 0.07625679671764374, + "learning_rate": 0.00025763963664345745, + "loss": 0.0332, + "step": 1131 + }, + { + "epoch": 0.74, + "grad_norm": 0.07967997342348099, + "learning_rate": 0.00025756781116296617, + "loss": 0.0431, + "step": 1132 + }, + { + "epoch": 0.74, + "grad_norm": 0.14101997017860413, + "learning_rate": 0.0002574959348712555, + "loss": 0.0322, + "step": 1133 + }, + { + "epoch": 0.74, + "grad_norm": 0.12365719676017761, + "learning_rate": 0.00025742400780227724, + "loss": 0.0205, + "step": 1134 + }, + { + "epoch": 0.74, + "grad_norm": 0.14429523050785065, + "learning_rate": 0.0002573520299900073, + "loss": 0.069, + "step": 1135 + }, + { + "epoch": 0.74, + "grad_norm": 0.021441614255309105, + "learning_rate": 0.0002572800014684453, + "loss": 0.0054, + "step": 1136 + }, + { + "epoch": 0.74, + "grad_norm": 0.08611132204532623, + "learning_rate": 0.0002572079222716151, + "loss": 0.0442, + "step": 1137 + }, + { + "epoch": 0.75, + "grad_norm": 0.09402936697006226, + "learning_rate": 0.0002571357924335642, + "loss": 0.0352, + "step": 1138 + }, + { + "epoch": 0.75, + "grad_norm": 0.08581096678972244, + "learning_rate": 0.00025706361198836437, + "loss": 0.0149, + "step": 1139 + }, + { + "epoch": 0.75, + "grad_norm": 0.0612567737698555, + "learning_rate": 0.0002569913809701109, + "loss": 0.014, + "step": 1140 + }, + { + "epoch": 0.75, + "grad_norm": 0.10282464325428009, + "learning_rate": 0.0002569190994129233, + "loss": 0.0254, + "step": 1141 + }, + { + "epoch": 0.75, + "grad_norm": 0.07298202067613602, + "learning_rate": 0.00025684676735094475, + "loss": 0.033, + "step": 1142 + }, + { + "epoch": 0.75, + "grad_norm": 0.06616336852312088, + "learning_rate": 0.0002567743848183423, + "loss": 0.0127, + "step": 1143 + }, + { + "epoch": 0.75, + "grad_norm": 0.09016578644514084, + "learning_rate": 0.000256701951849307, + "loss": 0.0248, + "step": 1144 + }, + { + "epoch": 0.75, + "grad_norm": 0.09605623781681061, + "learning_rate": 0.0002566294684780536, + "loss": 0.0554, + "step": 1145 + }, + { + "epoch": 0.75, + "grad_norm": 0.13209934532642365, + "learning_rate": 0.0002565569347388206, + "loss": 0.0437, + "step": 1146 + }, + { + "epoch": 0.75, + "eval_loss": 0.030348777770996094, + "eval_runtime": 39.9058, + "eval_samples_per_second": 32.251, + "eval_steps_per_second": 8.069, + "step": 1146 + }, + { + "epoch": 0.75, + "grad_norm": 0.13489413261413574, + "learning_rate": 0.0002564843506658704, + "loss": 0.0214, + "step": 1147 + }, + { + "epoch": 0.75, + "grad_norm": 0.036875851452350616, + "learning_rate": 0.00025641171629348916, + "loss": 0.0075, + "step": 1148 + }, + { + "epoch": 0.75, + "grad_norm": 0.04911373555660248, + "learning_rate": 0.0002563390316559868, + "loss": 0.0331, + "step": 1149 + }, + { + "epoch": 0.75, + "grad_norm": 0.02945212461054325, + "learning_rate": 0.0002562662967876969, + "loss": 0.0044, + "step": 1150 + }, + { + "epoch": 0.75, + "grad_norm": 0.09545271843671799, + "learning_rate": 0.00025619351172297686, + "loss": 0.0342, + "step": 1151 + }, + { + "epoch": 0.75, + "grad_norm": 0.034161727875471115, + "learning_rate": 0.0002561206764962079, + "loss": 0.0064, + "step": 1152 + }, + { + "epoch": 0.75, + "grad_norm": 0.17162153124809265, + "learning_rate": 0.00025604779114179457, + "loss": 0.0305, + "step": 1153 + }, + { + "epoch": 0.76, + "grad_norm": 0.10241468250751495, + "learning_rate": 0.0002559748556941654, + "loss": 0.0143, + "step": 1154 + }, + { + "epoch": 0.76, + "grad_norm": 0.19089680910110474, + "learning_rate": 0.0002559018701877726, + "loss": 0.0192, + "step": 1155 + }, + { + "epoch": 0.76, + "grad_norm": 0.19189144670963287, + "learning_rate": 0.0002558288346570918, + "loss": 0.0385, + "step": 1156 + }, + { + "epoch": 0.76, + "grad_norm": 0.023649632930755615, + "learning_rate": 0.00025575574913662256, + "loss": 0.0043, + "step": 1157 + }, + { + "epoch": 0.76, + "grad_norm": 0.20011720061302185, + "learning_rate": 0.0002556826136608877, + "loss": 0.0361, + "step": 1158 + }, + { + "epoch": 0.76, + "grad_norm": 0.3903810679912567, + "learning_rate": 0.00025560942826443396, + "loss": 0.1086, + "step": 1159 + }, + { + "epoch": 0.76, + "grad_norm": 0.0918634682893753, + "learning_rate": 0.0002555361929818315, + "loss": 0.0237, + "step": 1160 + }, + { + "epoch": 0.76, + "grad_norm": 0.11210468411445618, + "learning_rate": 0.00025546290784767407, + "loss": 0.0432, + "step": 1161 + }, + { + "epoch": 0.76, + "grad_norm": 0.10598167777061462, + "learning_rate": 0.000255389572896579, + "loss": 0.0304, + "step": 1162 + }, + { + "epoch": 0.76, + "grad_norm": 0.03547512739896774, + "learning_rate": 0.00025531618816318697, + "loss": 0.014, + "step": 1163 + }, + { + "epoch": 0.76, + "grad_norm": 0.08146083354949951, + "learning_rate": 0.00025524275368216245, + "loss": 0.0122, + "step": 1164 + }, + { + "epoch": 0.76, + "grad_norm": 0.046655625104904175, + "learning_rate": 0.00025516926948819334, + "loss": 0.0151, + "step": 1165 + }, + { + "epoch": 0.76, + "grad_norm": 0.09417696297168732, + "learning_rate": 0.0002550957356159908, + "loss": 0.047, + "step": 1166 + }, + { + "epoch": 0.76, + "grad_norm": 0.08695515990257263, + "learning_rate": 0.00025502215210028976, + "loss": 0.0363, + "step": 1167 + }, + { + "epoch": 0.76, + "grad_norm": 0.05286262556910515, + "learning_rate": 0.0002549485189758485, + "loss": 0.0331, + "step": 1168 + }, + { + "epoch": 0.77, + "grad_norm": 0.1305568516254425, + "learning_rate": 0.0002548748362774485, + "loss": 0.0552, + "step": 1169 + }, + { + "epoch": 0.77, + "grad_norm": 0.15096144378185272, + "learning_rate": 0.000254801104039895, + "loss": 0.0341, + "step": 1170 + }, + { + "epoch": 0.77, + "grad_norm": 0.07643090933561325, + "learning_rate": 0.0002547273222980165, + "loss": 0.0234, + "step": 1171 + }, + { + "epoch": 0.77, + "grad_norm": 0.052111852914094925, + "learning_rate": 0.0002546534910866648, + "loss": 0.0278, + "step": 1172 + }, + { + "epoch": 0.77, + "grad_norm": 0.15109075605869293, + "learning_rate": 0.00025457961044071523, + "loss": 0.039, + "step": 1173 + }, + { + "epoch": 0.77, + "grad_norm": 0.05562788248062134, + "learning_rate": 0.00025450568039506633, + "loss": 0.0214, + "step": 1174 + }, + { + "epoch": 0.77, + "grad_norm": 0.1751837581396103, + "learning_rate": 0.00025443170098464, + "loss": 0.0401, + "step": 1175 + }, + { + "epoch": 0.77, + "grad_norm": 0.19507139921188354, + "learning_rate": 0.0002543576722443816, + "loss": 0.0331, + "step": 1176 + }, + { + "epoch": 0.77, + "grad_norm": 0.10975005477666855, + "learning_rate": 0.00025428359420925966, + "loss": 0.0155, + "step": 1177 + }, + { + "epoch": 0.77, + "grad_norm": 0.1416396051645279, + "learning_rate": 0.00025420946691426586, + "loss": 0.0473, + "step": 1178 + }, + { + "epoch": 0.77, + "grad_norm": 0.03987191617488861, + "learning_rate": 0.0002541352903944155, + "loss": 0.0069, + "step": 1179 + }, + { + "epoch": 0.77, + "grad_norm": 0.34085920453071594, + "learning_rate": 0.00025406106468474685, + "loss": 0.0919, + "step": 1180 + }, + { + "epoch": 0.77, + "grad_norm": 0.06129152700304985, + "learning_rate": 0.0002539867898203215, + "loss": 0.0129, + "step": 1181 + }, + { + "epoch": 0.77, + "grad_norm": 0.08059722930192947, + "learning_rate": 0.00025391246583622427, + "loss": 0.0172, + "step": 1182 + }, + { + "epoch": 0.77, + "grad_norm": 0.12509244680404663, + "learning_rate": 0.0002538380927675632, + "loss": 0.0881, + "step": 1183 + }, + { + "epoch": 0.78, + "grad_norm": 0.21917979419231415, + "learning_rate": 0.00025376367064946945, + "loss": 0.0438, + "step": 1184 + }, + { + "epoch": 0.78, + "grad_norm": 0.05029948800802231, + "learning_rate": 0.0002536891995170974, + "loss": 0.0102, + "step": 1185 + }, + { + "epoch": 0.78, + "grad_norm": 0.027424413710832596, + "learning_rate": 0.00025361467940562463, + "loss": 0.0053, + "step": 1186 + }, + { + "epoch": 0.78, + "grad_norm": 0.0775713250041008, + "learning_rate": 0.0002535401103502517, + "loss": 0.0329, + "step": 1187 + }, + { + "epoch": 0.78, + "grad_norm": 0.12953567504882812, + "learning_rate": 0.0002534654923862025, + "loss": 0.0371, + "step": 1188 + }, + { + "epoch": 0.78, + "grad_norm": 0.07097966223955154, + "learning_rate": 0.00025339082554872377, + "loss": 0.0165, + "step": 1189 + }, + { + "epoch": 0.78, + "grad_norm": 0.1304195523262024, + "learning_rate": 0.0002533161098730856, + "loss": 0.0386, + "step": 1190 + }, + { + "epoch": 0.78, + "grad_norm": 0.06887423247098923, + "learning_rate": 0.00025324134539458096, + "loss": 0.0221, + "step": 1191 + }, + { + "epoch": 0.78, + "grad_norm": 0.08637112379074097, + "learning_rate": 0.00025316653214852596, + "loss": 0.0341, + "step": 1192 + }, + { + "epoch": 0.78, + "grad_norm": 0.04632532596588135, + "learning_rate": 0.0002530916701702597, + "loss": 0.0094, + "step": 1193 + }, + { + "epoch": 0.78, + "grad_norm": 0.11397617310285568, + "learning_rate": 0.00025301675949514435, + "loss": 0.0167, + "step": 1194 + }, + { + "epoch": 0.78, + "grad_norm": 0.04785558953881264, + "learning_rate": 0.000252941800158565, + "loss": 0.0189, + "step": 1195 + }, + { + "epoch": 0.78, + "grad_norm": 0.24082554876804352, + "learning_rate": 0.00025286679219593, + "loss": 0.0472, + "step": 1196 + }, + { + "epoch": 0.78, + "grad_norm": 0.14454412460327148, + "learning_rate": 0.00025279173564267014, + "loss": 0.0521, + "step": 1197 + }, + { + "epoch": 0.78, + "grad_norm": 0.16198396682739258, + "learning_rate": 0.00025271663053423967, + "loss": 0.0606, + "step": 1198 + }, + { + "epoch": 0.78, + "grad_norm": 0.114061638712883, + "learning_rate": 0.0002526414769061155, + "loss": 0.012, + "step": 1199 + }, + { + "epoch": 0.79, + "grad_norm": 0.1736219972372055, + "learning_rate": 0.00025256627479379755, + "loss": 0.0516, + "step": 1200 + }, + { + "epoch": 0.79, + "grad_norm": 0.04280832037329674, + "learning_rate": 0.0002524910242328087, + "loss": 0.0073, + "step": 1201 + }, + { + "epoch": 0.79, + "grad_norm": 0.13054266571998596, + "learning_rate": 0.0002524157252586946, + "loss": 0.0295, + "step": 1202 + }, + { + "epoch": 0.79, + "grad_norm": 0.24452893435955048, + "learning_rate": 0.00025234037790702375, + "loss": 0.0856, + "step": 1203 + }, + { + "epoch": 0.79, + "grad_norm": 0.05776005983352661, + "learning_rate": 0.0002522649822133877, + "loss": 0.0152, + "step": 1204 + }, + { + "epoch": 0.79, + "grad_norm": 0.27971917390823364, + "learning_rate": 0.0002521895382134006, + "loss": 0.1183, + "step": 1205 + }, + { + "epoch": 0.79, + "grad_norm": 0.04905636981129646, + "learning_rate": 0.0002521140459426995, + "loss": 0.0126, + "step": 1206 + }, + { + "epoch": 0.79, + "grad_norm": 0.15006506443023682, + "learning_rate": 0.0002520385054369444, + "loss": 0.0811, + "step": 1207 + }, + { + "epoch": 0.79, + "grad_norm": 0.15131042897701263, + "learning_rate": 0.00025196291673181784, + "loss": 0.0401, + "step": 1208 + }, + { + "epoch": 0.79, + "grad_norm": 0.1603415459394455, + "learning_rate": 0.0002518872798630253, + "loss": 0.0448, + "step": 1209 + }, + { + "epoch": 0.79, + "grad_norm": 0.07513672858476639, + "learning_rate": 0.0002518115948662949, + "loss": 0.0401, + "step": 1210 + }, + { + "epoch": 0.79, + "grad_norm": 0.11225542426109314, + "learning_rate": 0.0002517358617773776, + "loss": 0.039, + "step": 1211 + }, + { + "epoch": 0.79, + "grad_norm": 0.0876198261976242, + "learning_rate": 0.000251660080632047, + "loss": 0.0239, + "step": 1212 + }, + { + "epoch": 0.79, + "grad_norm": 0.1050589308142662, + "learning_rate": 0.0002515842514660994, + "loss": 0.0258, + "step": 1213 + }, + { + "epoch": 0.79, + "grad_norm": 0.0426226444542408, + "learning_rate": 0.0002515083743153539, + "loss": 0.0111, + "step": 1214 + }, + { + "epoch": 0.8, + "grad_norm": 0.09025552123785019, + "learning_rate": 0.00025143244921565214, + "loss": 0.0185, + "step": 1215 + }, + { + "epoch": 0.8, + "grad_norm": 0.12371645122766495, + "learning_rate": 0.00025135647620285834, + "loss": 0.0326, + "step": 1216 + }, + { + "epoch": 0.8, + "grad_norm": 0.07417233288288116, + "learning_rate": 0.0002512804553128596, + "loss": 0.0238, + "step": 1217 + }, + { + "epoch": 0.8, + "grad_norm": 0.10499947518110275, + "learning_rate": 0.0002512043865815654, + "loss": 0.0464, + "step": 1218 + }, + { + "epoch": 0.8, + "grad_norm": 0.16344919800758362, + "learning_rate": 0.00025112827004490797, + "loss": 0.0373, + "step": 1219 + }, + { + "epoch": 0.8, + "grad_norm": 0.0862027183175087, + "learning_rate": 0.00025105210573884203, + "loss": 0.0178, + "step": 1220 + }, + { + "epoch": 0.8, + "grad_norm": 0.10541030019521713, + "learning_rate": 0.0002509758936993449, + "loss": 0.0377, + "step": 1221 + }, + { + "epoch": 0.8, + "grad_norm": 0.05190376564860344, + "learning_rate": 0.00025089963396241643, + "loss": 0.0099, + "step": 1222 + }, + { + "epoch": 0.8, + "grad_norm": 0.09249959141016006, + "learning_rate": 0.00025082332656407906, + "loss": 0.0157, + "step": 1223 + }, + { + "epoch": 0.8, + "grad_norm": 0.02348952367901802, + "learning_rate": 0.00025074697154037765, + "loss": 0.0041, + "step": 1224 + }, + { + "epoch": 0.8, + "grad_norm": 0.12875327467918396, + "learning_rate": 0.0002506705689273797, + "loss": 0.0173, + "step": 1225 + }, + { + "epoch": 0.8, + "grad_norm": 0.13971397280693054, + "learning_rate": 0.0002505941187611749, + "loss": 0.0381, + "step": 1226 + }, + { + "epoch": 0.8, + "grad_norm": 0.21139316260814667, + "learning_rate": 0.00025051762107787583, + "loss": 0.0399, + "step": 1227 + }, + { + "epoch": 0.8, + "grad_norm": 0.10346369445323944, + "learning_rate": 0.0002504410759136171, + "loss": 0.031, + "step": 1228 + }, + { + "epoch": 0.8, + "grad_norm": 0.021524077281355858, + "learning_rate": 0.00025036448330455603, + "loss": 0.0041, + "step": 1229 + }, + { + "epoch": 0.81, + "grad_norm": 0.21078258752822876, + "learning_rate": 0.0002502878432868722, + "loss": 0.0291, + "step": 1230 + }, + { + "epoch": 0.81, + "grad_norm": 0.28720253705978394, + "learning_rate": 0.00025021115589676774, + "loss": 0.0318, + "step": 1231 + }, + { + "epoch": 0.81, + "grad_norm": 0.2182384580373764, + "learning_rate": 0.00025013442117046694, + "loss": 0.0407, + "step": 1232 + }, + { + "epoch": 0.81, + "grad_norm": 0.1223733052611351, + "learning_rate": 0.0002500576391442166, + "loss": 0.0189, + "step": 1233 + }, + { + "epoch": 0.81, + "grad_norm": 0.1699313372373581, + "learning_rate": 0.0002499808098542858, + "loss": 0.1081, + "step": 1234 + }, + { + "epoch": 0.81, + "grad_norm": 0.21604309976100922, + "learning_rate": 0.00024990393333696603, + "loss": 0.0406, + "step": 1235 + }, + { + "epoch": 0.81, + "grad_norm": 0.11065655201673508, + "learning_rate": 0.00024982700962857094, + "loss": 0.0274, + "step": 1236 + }, + { + "epoch": 0.81, + "grad_norm": 0.10013590008020401, + "learning_rate": 0.0002497500387654367, + "loss": 0.0138, + "step": 1237 + }, + { + "epoch": 0.81, + "grad_norm": 0.03474019467830658, + "learning_rate": 0.0002496730207839215, + "loss": 0.0067, + "step": 1238 + }, + { + "epoch": 0.81, + "grad_norm": 0.1373460739850998, + "learning_rate": 0.00024959595572040594, + "loss": 0.0382, + "step": 1239 + }, + { + "epoch": 0.81, + "grad_norm": 0.1674460619688034, + "learning_rate": 0.0002495188436112928, + "loss": 0.0187, + "step": 1240 + }, + { + "epoch": 0.81, + "grad_norm": 0.056852634996175766, + "learning_rate": 0.0002494416844930072, + "loss": 0.02, + "step": 1241 + }, + { + "epoch": 0.81, + "grad_norm": 0.1567879319190979, + "learning_rate": 0.00024936447840199626, + "loss": 0.0488, + "step": 1242 + }, + { + "epoch": 0.81, + "grad_norm": 0.19893474876880646, + "learning_rate": 0.0002492872253747294, + "loss": 0.0382, + "step": 1243 + }, + { + "epoch": 0.81, + "grad_norm": 0.07066723704338074, + "learning_rate": 0.0002492099254476983, + "loss": 0.0194, + "step": 1244 + }, + { + "epoch": 0.82, + "grad_norm": 0.11466959118843079, + "learning_rate": 0.00024913257865741663, + "loss": 0.0367, + "step": 1245 + }, + { + "epoch": 0.82, + "grad_norm": 0.08930857479572296, + "learning_rate": 0.0002490551850404203, + "loss": 0.0186, + "step": 1246 + }, + { + "epoch": 0.82, + "grad_norm": 0.0905904471874237, + "learning_rate": 0.0002489777446332673, + "loss": 0.0349, + "step": 1247 + }, + { + "epoch": 0.82, + "grad_norm": 0.225018709897995, + "learning_rate": 0.0002489002574725378, + "loss": 0.0579, + "step": 1248 + }, + { + "epoch": 0.82, + "grad_norm": 0.15631456673145294, + "learning_rate": 0.0002488227235948339, + "loss": 0.0361, + "step": 1249 + }, + { + "epoch": 0.82, + "grad_norm": 0.06862124055624008, + "learning_rate": 0.0002487451430367798, + "loss": 0.0351, + "step": 1250 + }, + { + "epoch": 0.82, + "grad_norm": 0.10271900147199631, + "learning_rate": 0.00024866751583502194, + "loss": 0.0393, + "step": 1251 + }, + { + "epoch": 0.82, + "grad_norm": 0.12624254822731018, + "learning_rate": 0.0002485898420262286, + "loss": 0.0309, + "step": 1252 + }, + { + "epoch": 0.82, + "grad_norm": 0.116575688123703, + "learning_rate": 0.00024851212164709013, + "loss": 0.058, + "step": 1253 + }, + { + "epoch": 0.82, + "grad_norm": 0.06756250560283661, + "learning_rate": 0.00024843435473431886, + "loss": 0.0335, + "step": 1254 + }, + { + "epoch": 0.82, + "grad_norm": 0.20835717022418976, + "learning_rate": 0.0002483565413246492, + "loss": 0.0389, + "step": 1255 + }, + { + "epoch": 0.82, + "grad_norm": 0.04360177740454674, + "learning_rate": 0.0002482786814548374, + "loss": 0.008, + "step": 1256 + }, + { + "epoch": 0.82, + "grad_norm": 0.1068229153752327, + "learning_rate": 0.0002482007751616616, + "loss": 0.0304, + "step": 1257 + }, + { + "epoch": 0.82, + "grad_norm": 0.04819338023662567, + "learning_rate": 0.0002481228224819221, + "loss": 0.0098, + "step": 1258 + }, + { + "epoch": 0.82, + "grad_norm": 0.48405715823173523, + "learning_rate": 0.00024804482345244105, + "loss": 0.0348, + "step": 1259 + }, + { + "epoch": 0.82, + "grad_norm": 0.09796518087387085, + "learning_rate": 0.0002479667781100622, + "loss": 0.0153, + "step": 1260 + }, + { + "epoch": 0.83, + "grad_norm": 0.13171538710594177, + "learning_rate": 0.0002478886864916516, + "loss": 0.0316, + "step": 1261 + }, + { + "epoch": 0.83, + "grad_norm": 0.0907411128282547, + "learning_rate": 0.00024781054863409676, + "loss": 0.0169, + "step": 1262 + }, + { + "epoch": 0.83, + "grad_norm": 0.10159718245267868, + "learning_rate": 0.00024773236457430745, + "loss": 0.013, + "step": 1263 + }, + { + "epoch": 0.83, + "grad_norm": 0.10823512077331543, + "learning_rate": 0.00024765413434921495, + "loss": 0.0252, + "step": 1264 + }, + { + "epoch": 0.83, + "grad_norm": 0.07199376821517944, + "learning_rate": 0.0002475758579957724, + "loss": 0.0105, + "step": 1265 + }, + { + "epoch": 0.83, + "grad_norm": 0.11216728389263153, + "learning_rate": 0.0002474975355509549, + "loss": 0.0339, + "step": 1266 + }, + { + "epoch": 0.83, + "grad_norm": 0.16655175387859344, + "learning_rate": 0.00024741916705175906, + "loss": 0.0306, + "step": 1267 + }, + { + "epoch": 0.83, + "grad_norm": 0.08566506952047348, + "learning_rate": 0.00024734075253520345, + "loss": 0.0329, + "step": 1268 + }, + { + "epoch": 0.83, + "grad_norm": 0.1542367786169052, + "learning_rate": 0.00024726229203832824, + "loss": 0.0284, + "step": 1269 + }, + { + "epoch": 0.83, + "grad_norm": 0.1685347855091095, + "learning_rate": 0.00024718378559819554, + "loss": 0.0385, + "step": 1270 + }, + { + "epoch": 0.83, + "grad_norm": 0.1904221624135971, + "learning_rate": 0.00024710523325188885, + "loss": 0.0435, + "step": 1271 + }, + { + "epoch": 0.83, + "grad_norm": 0.10915929824113846, + "learning_rate": 0.00024702663503651357, + "loss": 0.0129, + "step": 1272 + }, + { + "epoch": 0.83, + "grad_norm": 0.04411763325333595, + "learning_rate": 0.0002469479909891967, + "loss": 0.0038, + "step": 1273 + }, + { + "epoch": 0.83, + "grad_norm": 0.22485259175300598, + "learning_rate": 0.0002468693011470869, + "loss": 0.0456, + "step": 1274 + }, + { + "epoch": 0.83, + "grad_norm": 0.10708510875701904, + "learning_rate": 0.00024679056554735454, + "loss": 0.0192, + "step": 1275 + }, + { + "epoch": 0.84, + "grad_norm": 0.15084552764892578, + "learning_rate": 0.00024671178422719137, + "loss": 0.0293, + "step": 1276 + }, + { + "epoch": 0.84, + "grad_norm": 0.14543551206588745, + "learning_rate": 0.000246632957223811, + "loss": 0.0666, + "step": 1277 + }, + { + "epoch": 0.84, + "grad_norm": 0.1648811399936676, + "learning_rate": 0.00024655408457444853, + "loss": 0.0321, + "step": 1278 + }, + { + "epoch": 0.84, + "grad_norm": 0.16748228669166565, + "learning_rate": 0.00024647516631636055, + "loss": 0.0373, + "step": 1279 + }, + { + "epoch": 0.84, + "grad_norm": 0.04038754105567932, + "learning_rate": 0.00024639620248682523, + "loss": 0.0049, + "step": 1280 + }, + { + "epoch": 0.84, + "grad_norm": 0.1675775945186615, + "learning_rate": 0.00024631719312314234, + "loss": 0.0517, + "step": 1281 + }, + { + "epoch": 0.84, + "grad_norm": 0.227004274725914, + "learning_rate": 0.00024623813826263303, + "loss": 0.0445, + "step": 1282 + }, + { + "epoch": 0.84, + "grad_norm": 0.05555510148406029, + "learning_rate": 0.00024615903794264005, + "loss": 0.0096, + "step": 1283 + }, + { + "epoch": 0.84, + "grad_norm": 0.16279524564743042, + "learning_rate": 0.00024607989220052766, + "loss": 0.0452, + "step": 1284 + }, + { + "epoch": 0.84, + "grad_norm": 0.22099511325359344, + "learning_rate": 0.0002460007010736814, + "loss": 0.0484, + "step": 1285 + }, + { + "epoch": 0.84, + "grad_norm": 0.3313157558441162, + "learning_rate": 0.00024592146459950835, + "loss": 0.0798, + "step": 1286 + }, + { + "epoch": 0.84, + "grad_norm": 0.1560799926519394, + "learning_rate": 0.0002458421828154371, + "loss": 0.0523, + "step": 1287 + }, + { + "epoch": 0.84, + "grad_norm": 0.0924949198961258, + "learning_rate": 0.0002457628557589174, + "loss": 0.0416, + "step": 1288 + }, + { + "epoch": 0.84, + "grad_norm": 0.061663124710321426, + "learning_rate": 0.0002456834834674207, + "loss": 0.0187, + "step": 1289 + }, + { + "epoch": 0.84, + "grad_norm": 0.04804534092545509, + "learning_rate": 0.0002456040659784396, + "loss": 0.0236, + "step": 1290 + }, + { + "epoch": 0.85, + "grad_norm": 0.09753583371639252, + "learning_rate": 0.00024552460332948804, + "loss": 0.0447, + "step": 1291 + }, + { + "epoch": 0.85, + "grad_norm": 0.03994222357869148, + "learning_rate": 0.0002454450955581015, + "loss": 0.0098, + "step": 1292 + }, + { + "epoch": 0.85, + "grad_norm": 0.12844492495059967, + "learning_rate": 0.0002453655427018364, + "loss": 0.0234, + "step": 1293 + }, + { + "epoch": 0.85, + "grad_norm": 0.12967482209205627, + "learning_rate": 0.000245285944798271, + "loss": 0.0435, + "step": 1294 + }, + { + "epoch": 0.85, + "grad_norm": 0.25114384293556213, + "learning_rate": 0.00024520630188500423, + "loss": 0.0539, + "step": 1295 + }, + { + "epoch": 0.85, + "grad_norm": 0.25040391087532043, + "learning_rate": 0.0002451266139996568, + "loss": 0.037, + "step": 1296 + }, + { + "epoch": 0.85, + "grad_norm": 0.21144863963127136, + "learning_rate": 0.0002450468811798703, + "loss": 0.0371, + "step": 1297 + }, + { + "epoch": 0.85, + "grad_norm": 0.2176048457622528, + "learning_rate": 0.00024496710346330776, + "loss": 0.0311, + "step": 1298 + }, + { + "epoch": 0.85, + "grad_norm": 0.06802531331777573, + "learning_rate": 0.0002448872808876533, + "loss": 0.0095, + "step": 1299 + }, + { + "epoch": 0.85, + "grad_norm": 0.11026319861412048, + "learning_rate": 0.0002448074134906123, + "loss": 0.0132, + "step": 1300 + }, + { + "epoch": 0.85, + "grad_norm": 0.2511361539363861, + "learning_rate": 0.00024472750130991126, + "loss": 0.0091, + "step": 1301 + }, + { + "epoch": 0.85, + "grad_norm": 0.42377692461013794, + "learning_rate": 0.0002446475443832979, + "loss": 0.0669, + "step": 1302 + }, + { + "epoch": 0.85, + "grad_norm": 0.587988555431366, + "learning_rate": 0.000244567542748541, + "loss": 0.0737, + "step": 1303 + }, + { + "epoch": 0.85, + "grad_norm": 0.1163543239235878, + "learning_rate": 0.0002444874964434305, + "loss": 0.0151, + "step": 1304 + }, + { + "epoch": 0.85, + "grad_norm": 0.036374811083078384, + "learning_rate": 0.00024440740550577754, + "loss": 0.0067, + "step": 1305 + }, + { + "epoch": 0.85, + "grad_norm": 0.07870301604270935, + "learning_rate": 0.00024432726997341403, + "loss": 0.0191, + "step": 1306 + }, + { + "epoch": 0.86, + "grad_norm": 0.09554275870323181, + "learning_rate": 0.0002442470898841933, + "loss": 0.0169, + "step": 1307 + }, + { + "epoch": 0.86, + "grad_norm": 0.20255301892757416, + "learning_rate": 0.0002441668652759896, + "loss": 0.0404, + "step": 1308 + }, + { + "epoch": 0.86, + "grad_norm": 0.015268395654857159, + "learning_rate": 0.0002440865961866981, + "loss": 0.002, + "step": 1309 + }, + { + "epoch": 0.86, + "grad_norm": 0.08300946652889252, + "learning_rate": 0.0002440062826542351, + "loss": 0.0281, + "step": 1310 + }, + { + "epoch": 0.86, + "grad_norm": 0.2025083601474762, + "learning_rate": 0.00024392592471653786, + "loss": 0.0407, + "step": 1311 + }, + { + "epoch": 0.86, + "grad_norm": 0.01875820755958557, + "learning_rate": 0.0002438455224115647, + "loss": 0.0024, + "step": 1312 + }, + { + "epoch": 0.86, + "grad_norm": 0.16822032630443573, + "learning_rate": 0.0002437650757772947, + "loss": 0.0356, + "step": 1313 + }, + { + "epoch": 0.86, + "grad_norm": 0.307230681180954, + "learning_rate": 0.0002436845848517281, + "loss": 0.0277, + "step": 1314 + }, + { + "epoch": 0.86, + "grad_norm": 0.21822179853916168, + "learning_rate": 0.00024360404967288586, + "loss": 0.0153, + "step": 1315 + }, + { + "epoch": 0.86, + "grad_norm": 0.18913914263248444, + "learning_rate": 0.00024352347027881003, + "loss": 0.0664, + "step": 1316 + }, + { + "epoch": 0.86, + "grad_norm": 0.26664435863494873, + "learning_rate": 0.0002434428467075634, + "loss": 0.0821, + "step": 1317 + }, + { + "epoch": 0.86, + "grad_norm": 0.15764296054840088, + "learning_rate": 0.00024336217899722967, + "loss": 0.0663, + "step": 1318 + }, + { + "epoch": 0.86, + "grad_norm": 0.09952249377965927, + "learning_rate": 0.00024328146718591352, + "loss": 0.0497, + "step": 1319 + }, + { + "epoch": 0.86, + "grad_norm": 0.20798932015895844, + "learning_rate": 0.00024320071131174022, + "loss": 0.0448, + "step": 1320 + }, + { + "epoch": 0.86, + "grad_norm": 0.2187434434890747, + "learning_rate": 0.00024311991141285602, + "loss": 0.0547, + "step": 1321 + }, + { + "epoch": 0.87, + "grad_norm": 0.08128710836172104, + "learning_rate": 0.00024303906752742797, + "loss": 0.0232, + "step": 1322 + }, + { + "epoch": 0.87, + "grad_norm": 0.1579497754573822, + "learning_rate": 0.00024295817969364382, + "loss": 0.0368, + "step": 1323 + }, + { + "epoch": 0.87, + "grad_norm": 0.3530323803424835, + "learning_rate": 0.00024287724794971207, + "loss": 0.0543, + "step": 1324 + }, + { + "epoch": 0.87, + "grad_norm": 0.13028964400291443, + "learning_rate": 0.00024279627233386212, + "loss": 0.0562, + "step": 1325 + }, + { + "epoch": 0.87, + "grad_norm": 0.17670606076717377, + "learning_rate": 0.00024271525288434385, + "loss": 0.033, + "step": 1326 + }, + { + "epoch": 0.87, + "grad_norm": 0.14736194908618927, + "learning_rate": 0.00024263418963942808, + "loss": 0.0403, + "step": 1327 + }, + { + "epoch": 0.87, + "grad_norm": 0.2033924162387848, + "learning_rate": 0.00024255308263740618, + "loss": 0.0584, + "step": 1328 + }, + { + "epoch": 0.87, + "grad_norm": 0.08926638215780258, + "learning_rate": 0.00024247193191659016, + "loss": 0.0368, + "step": 1329 + }, + { + "epoch": 0.87, + "grad_norm": 0.09010445326566696, + "learning_rate": 0.0002423907375153128, + "loss": 0.0313, + "step": 1330 + }, + { + "epoch": 0.87, + "grad_norm": 0.07403320074081421, + "learning_rate": 0.00024230949947192748, + "loss": 0.0146, + "step": 1331 + }, + { + "epoch": 0.87, + "grad_norm": 0.11623091250658035, + "learning_rate": 0.00024222821782480812, + "loss": 0.0308, + "step": 1332 + }, + { + "epoch": 0.87, + "grad_norm": 0.20798785984516144, + "learning_rate": 0.0002421468926123493, + "loss": 0.0447, + "step": 1333 + }, + { + "epoch": 0.87, + "grad_norm": 0.12543538212776184, + "learning_rate": 0.00024206552387296621, + "loss": 0.0438, + "step": 1334 + }, + { + "epoch": 0.87, + "grad_norm": 0.12966863811016083, + "learning_rate": 0.00024198411164509447, + "loss": 0.0453, + "step": 1335 + }, + { + "epoch": 0.87, + "grad_norm": 0.05985172837972641, + "learning_rate": 0.00024190265596719043, + "loss": 0.0102, + "step": 1336 + }, + { + "epoch": 0.88, + "grad_norm": 0.14263281226158142, + "learning_rate": 0.00024182115687773075, + "loss": 0.0544, + "step": 1337 + }, + { + "epoch": 0.88, + "grad_norm": 0.190725639462471, + "learning_rate": 0.00024173961441521284, + "loss": 0.0265, + "step": 1338 + }, + { + "epoch": 0.88, + "grad_norm": 0.29231366515159607, + "learning_rate": 0.00024165802861815435, + "loss": 0.0684, + "step": 1339 + }, + { + "epoch": 0.88, + "grad_norm": 0.13645826280117035, + "learning_rate": 0.00024157639952509356, + "loss": 0.0577, + "step": 1340 + }, + { + "epoch": 0.88, + "grad_norm": 0.15891732275485992, + "learning_rate": 0.0002414947271745892, + "loss": 0.0455, + "step": 1341 + }, + { + "epoch": 0.88, + "grad_norm": 0.2538587152957916, + "learning_rate": 0.00024141301160522037, + "loss": 0.0566, + "step": 1342 + }, + { + "epoch": 0.88, + "grad_norm": 0.08588481694459915, + "learning_rate": 0.00024133125285558658, + "loss": 0.0265, + "step": 1343 + }, + { + "epoch": 0.88, + "grad_norm": 0.1366318315267563, + "learning_rate": 0.00024124945096430775, + "loss": 0.0209, + "step": 1344 + }, + { + "epoch": 0.88, + "grad_norm": 0.12919899821281433, + "learning_rate": 0.00024116760597002427, + "loss": 0.0358, + "step": 1345 + }, + { + "epoch": 0.88, + "grad_norm": 0.1527070701122284, + "learning_rate": 0.0002410857179113967, + "loss": 0.0584, + "step": 1346 + }, + { + "epoch": 0.88, + "grad_norm": 0.1441652625799179, + "learning_rate": 0.00024100378682710618, + "loss": 0.026, + "step": 1347 + }, + { + "epoch": 0.88, + "grad_norm": 0.0560770146548748, + "learning_rate": 0.00024092181275585397, + "loss": 0.0126, + "step": 1348 + }, + { + "epoch": 0.88, + "grad_norm": 0.23829127848148346, + "learning_rate": 0.00024083979573636172, + "loss": 0.0492, + "step": 1349 + }, + { + "epoch": 0.88, + "grad_norm": 0.22331084311008453, + "learning_rate": 0.00024075773580737138, + "loss": 0.0374, + "step": 1350 + }, + { + "epoch": 0.88, + "grad_norm": 0.16740433871746063, + "learning_rate": 0.0002406756330076452, + "loss": 0.033, + "step": 1351 + }, + { + "epoch": 0.89, + "grad_norm": 0.12624043226242065, + "learning_rate": 0.0002405934873759655, + "loss": 0.0254, + "step": 1352 + }, + { + "epoch": 0.89, + "grad_norm": 0.2925248444080353, + "learning_rate": 0.00024051129895113506, + "loss": 0.0966, + "step": 1353 + }, + { + "epoch": 0.89, + "grad_norm": 0.050702452659606934, + "learning_rate": 0.00024042906777197676, + "loss": 0.0058, + "step": 1354 + }, + { + "epoch": 0.89, + "grad_norm": 0.11182265728712082, + "learning_rate": 0.00024034679387733367, + "loss": 0.0209, + "step": 1355 + }, + { + "epoch": 0.89, + "grad_norm": 0.07762409001588821, + "learning_rate": 0.00024026447730606911, + "loss": 0.0117, + "step": 1356 + }, + { + "epoch": 0.89, + "grad_norm": 0.05566919595003128, + "learning_rate": 0.00024018211809706652, + "loss": 0.012, + "step": 1357 + }, + { + "epoch": 0.89, + "grad_norm": 0.026816535741090775, + "learning_rate": 0.00024009971628922937, + "loss": 0.0058, + "step": 1358 + }, + { + "epoch": 0.89, + "grad_norm": 0.14158879220485687, + "learning_rate": 0.0002400172719214814, + "loss": 0.0242, + "step": 1359 + }, + { + "epoch": 0.89, + "grad_norm": 0.10178912431001663, + "learning_rate": 0.0002399347850327664, + "loss": 0.0144, + "step": 1360 + }, + { + "epoch": 0.89, + "grad_norm": 0.2671686112880707, + "learning_rate": 0.00023985225566204834, + "loss": 0.1116, + "step": 1361 + }, + { + "epoch": 0.89, + "grad_norm": 0.2026364952325821, + "learning_rate": 0.00023976968384831107, + "loss": 0.0511, + "step": 1362 + }, + { + "epoch": 0.89, + "grad_norm": 0.046000707894563675, + "learning_rate": 0.0002396870696305586, + "loss": 0.0089, + "step": 1363 + }, + { + "epoch": 0.89, + "grad_norm": 0.243350088596344, + "learning_rate": 0.00023960441304781495, + "loss": 0.0376, + "step": 1364 + }, + { + "epoch": 0.89, + "grad_norm": 0.053250979632139206, + "learning_rate": 0.0002395217141391242, + "loss": 0.008, + "step": 1365 + }, + { + "epoch": 0.89, + "grad_norm": 0.08489834517240524, + "learning_rate": 0.0002394389729435503, + "loss": 0.0216, + "step": 1366 + }, + { + "epoch": 0.89, + "grad_norm": 0.09859486669301987, + "learning_rate": 0.00023935618950017738, + "loss": 0.0253, + "step": 1367 + }, + { + "epoch": 0.9, + "grad_norm": 0.11453449726104736, + "learning_rate": 0.00023927336384810933, + "loss": 0.0414, + "step": 1368 + }, + { + "epoch": 0.9, + "grad_norm": 0.1473090499639511, + "learning_rate": 0.00023919049602647005, + "loss": 0.0365, + "step": 1369 + }, + { + "epoch": 0.9, + "grad_norm": 0.12153466045856476, + "learning_rate": 0.00023910758607440335, + "loss": 0.0314, + "step": 1370 + }, + { + "epoch": 0.9, + "grad_norm": 0.17143134772777557, + "learning_rate": 0.000239024634031073, + "loss": 0.0928, + "step": 1371 + }, + { + "epoch": 0.9, + "grad_norm": 0.11081311106681824, + "learning_rate": 0.00023894163993566257, + "loss": 0.0535, + "step": 1372 + }, + { + "epoch": 0.9, + "grad_norm": 0.13488808274269104, + "learning_rate": 0.0002388586038273755, + "loss": 0.0321, + "step": 1373 + }, + { + "epoch": 0.9, + "grad_norm": 0.0592711940407753, + "learning_rate": 0.0002387755257454352, + "loss": 0.01, + "step": 1374 + }, + { + "epoch": 0.9, + "grad_norm": 0.09835012257099152, + "learning_rate": 0.00023869240572908467, + "loss": 0.0295, + "step": 1375 + }, + { + "epoch": 0.9, + "grad_norm": 0.071134053170681, + "learning_rate": 0.000238609243817587, + "loss": 0.0243, + "step": 1376 + }, + { + "epoch": 0.9, + "grad_norm": 0.14431652426719666, + "learning_rate": 0.0002385260400502248, + "loss": 0.0344, + "step": 1377 + }, + { + "epoch": 0.9, + "grad_norm": 0.10391832143068314, + "learning_rate": 0.00023844279446630067, + "loss": 0.0231, + "step": 1378 + }, + { + "epoch": 0.9, + "grad_norm": 0.07357161492109299, + "learning_rate": 0.00023835950710513677, + "loss": 0.0163, + "step": 1379 + }, + { + "epoch": 0.9, + "grad_norm": 0.16738182306289673, + "learning_rate": 0.00023827617800607523, + "loss": 0.0423, + "step": 1380 + }, + { + "epoch": 0.9, + "grad_norm": 0.07547144591808319, + "learning_rate": 0.00023819280720847774, + "loss": 0.0273, + "step": 1381 + }, + { + "epoch": 0.9, + "grad_norm": 0.10503777116537094, + "learning_rate": 0.0002381093947517256, + "loss": 0.0192, + "step": 1382 + }, + { + "epoch": 0.91, + "grad_norm": 0.0630551353096962, + "learning_rate": 0.00023802594067521998, + "loss": 0.0115, + "step": 1383 + }, + { + "epoch": 0.91, + "grad_norm": 0.02077486738562584, + "learning_rate": 0.00023794244501838162, + "loss": 0.0045, + "step": 1384 + }, + { + "epoch": 0.91, + "grad_norm": 0.09841371327638626, + "learning_rate": 0.00023785890782065087, + "loss": 0.0242, + "step": 1385 + }, + { + "epoch": 0.91, + "grad_norm": 0.21591459214687347, + "learning_rate": 0.00023777532912148781, + "loss": 0.0237, + "step": 1386 + }, + { + "epoch": 0.91, + "grad_norm": 0.03989405184984207, + "learning_rate": 0.000237691708960372, + "loss": 0.0051, + "step": 1387 + }, + { + "epoch": 0.91, + "grad_norm": 0.12305942177772522, + "learning_rate": 0.0002376080473768026, + "loss": 0.0264, + "step": 1388 + }, + { + "epoch": 0.91, + "grad_norm": 0.14408881962299347, + "learning_rate": 0.00023752434441029848, + "loss": 0.0322, + "step": 1389 + }, + { + "epoch": 0.91, + "grad_norm": 0.04419580101966858, + "learning_rate": 0.00023744060010039784, + "loss": 0.0073, + "step": 1390 + }, + { + "epoch": 0.91, + "grad_norm": 0.18515107035636902, + "learning_rate": 0.0002373568144866586, + "loss": 0.0465, + "step": 1391 + }, + { + "epoch": 0.91, + "grad_norm": 0.048167865723371506, + "learning_rate": 0.00023727298760865812, + "loss": 0.0138, + "step": 1392 + }, + { + "epoch": 0.91, + "grad_norm": 0.08519299328327179, + "learning_rate": 0.0002371891195059932, + "loss": 0.0095, + "step": 1393 + }, + { + "epoch": 0.91, + "grad_norm": 0.21691879630088806, + "learning_rate": 0.00023710521021828016, + "loss": 0.0381, + "step": 1394 + }, + { + "epoch": 0.91, + "grad_norm": 0.09678614884614944, + "learning_rate": 0.00023702125978515478, + "loss": 0.0099, + "step": 1395 + }, + { + "epoch": 0.91, + "grad_norm": 0.08847987651824951, + "learning_rate": 0.0002369372682462723, + "loss": 0.0165, + "step": 1396 + }, + { + "epoch": 0.91, + "grad_norm": 0.03246233984827995, + "learning_rate": 0.0002368532356413073, + "loss": 0.0058, + "step": 1397 + }, + { + "epoch": 0.92, + "grad_norm": 0.07045282423496246, + "learning_rate": 0.00023676916200995386, + "loss": 0.0164, + "step": 1398 + }, + { + "epoch": 0.92, + "grad_norm": 0.05581701174378395, + "learning_rate": 0.00023668504739192528, + "loss": 0.0152, + "step": 1399 + }, + { + "epoch": 0.92, + "grad_norm": 0.15774132311344147, + "learning_rate": 0.0002366008918269544, + "loss": 0.0243, + "step": 1400 + }, + { + "epoch": 0.92, + "grad_norm": 0.172657772898674, + "learning_rate": 0.00023651669535479334, + "loss": 0.0184, + "step": 1401 + }, + { + "epoch": 0.92, + "grad_norm": 0.08128032833337784, + "learning_rate": 0.0002364324580152135, + "loss": 0.0186, + "step": 1402 + }, + { + "epoch": 0.92, + "grad_norm": 0.06358969956636429, + "learning_rate": 0.00023634817984800554, + "loss": 0.0102, + "step": 1403 + }, + { + "epoch": 0.92, + "grad_norm": 0.31414860486984253, + "learning_rate": 0.00023626386089297958, + "loss": 0.0514, + "step": 1404 + }, + { + "epoch": 0.92, + "grad_norm": 0.22831489145755768, + "learning_rate": 0.00023617950118996487, + "loss": 0.0323, + "step": 1405 + }, + { + "epoch": 0.92, + "grad_norm": 0.048902370035648346, + "learning_rate": 0.00023609510077880996, + "loss": 0.0033, + "step": 1406 + }, + { + "epoch": 0.92, + "grad_norm": 0.03278432413935661, + "learning_rate": 0.00023601065969938262, + "loss": 0.0031, + "step": 1407 + }, + { + "epoch": 0.92, + "grad_norm": 0.09343546628952026, + "learning_rate": 0.00023592617799156977, + "loss": 0.0199, + "step": 1408 + }, + { + "epoch": 0.92, + "grad_norm": 0.0755714625120163, + "learning_rate": 0.00023584165569527757, + "loss": 0.0086, + "step": 1409 + }, + { + "epoch": 0.92, + "grad_norm": 0.28567177057266235, + "learning_rate": 0.00023575709285043138, + "loss": 0.0256, + "step": 1410 + }, + { + "epoch": 0.92, + "grad_norm": 0.1896996796131134, + "learning_rate": 0.0002356724894969757, + "loss": 0.0291, + "step": 1411 + }, + { + "epoch": 0.92, + "grad_norm": 0.1428869366645813, + "learning_rate": 0.0002355878456748742, + "loss": 0.0574, + "step": 1412 + }, + { + "epoch": 0.93, + "grad_norm": 0.25432294607162476, + "learning_rate": 0.0002355031614241095, + "loss": 0.0433, + "step": 1413 + }, + { + "epoch": 0.93, + "grad_norm": 0.2577909231185913, + "learning_rate": 0.00023541843678468355, + "loss": 0.0376, + "step": 1414 + }, + { + "epoch": 0.93, + "grad_norm": 0.1479143500328064, + "learning_rate": 0.0002353336717966172, + "loss": 0.0248, + "step": 1415 + }, + { + "epoch": 0.93, + "grad_norm": 0.058144185692071915, + "learning_rate": 0.00023524886649995043, + "loss": 0.0102, + "step": 1416 + }, + { + "epoch": 0.93, + "grad_norm": 0.18476702272891998, + "learning_rate": 0.00023516402093474225, + "loss": 0.0658, + "step": 1417 + }, + { + "epoch": 0.93, + "grad_norm": 0.1367078274488449, + "learning_rate": 0.00023507913514107074, + "loss": 0.0228, + "step": 1418 + }, + { + "epoch": 0.93, + "grad_norm": 0.05217135697603226, + "learning_rate": 0.00023499420915903293, + "loss": 0.0117, + "step": 1419 + }, + { + "epoch": 0.93, + "grad_norm": 0.311260461807251, + "learning_rate": 0.00023490924302874478, + "loss": 0.0945, + "step": 1420 + }, + { + "epoch": 0.93, + "grad_norm": 0.06179346889257431, + "learning_rate": 0.00023482423679034134, + "loss": 0.0102, + "step": 1421 + }, + { + "epoch": 0.93, + "grad_norm": 0.0694802924990654, + "learning_rate": 0.00023473919048397652, + "loss": 0.0187, + "step": 1422 + }, + { + "epoch": 0.93, + "grad_norm": 0.09105714410543442, + "learning_rate": 0.00023465410414982317, + "loss": 0.0245, + "step": 1423 + }, + { + "epoch": 0.93, + "grad_norm": 0.10562916845083237, + "learning_rate": 0.0002345689778280731, + "loss": 0.0296, + "step": 1424 + }, + { + "epoch": 0.93, + "grad_norm": 0.07471620291471481, + "learning_rate": 0.00023448381155893695, + "loss": 0.0288, + "step": 1425 + }, + { + "epoch": 0.93, + "grad_norm": 0.11771635711193085, + "learning_rate": 0.0002343986053826442, + "loss": 0.0165, + "step": 1426 + }, + { + "epoch": 0.93, + "grad_norm": 0.056794993579387665, + "learning_rate": 0.00023431335933944323, + "loss": 0.02, + "step": 1427 + }, + { + "epoch": 0.93, + "grad_norm": 0.10688856244087219, + "learning_rate": 0.00023422807346960131, + "loss": 0.037, + "step": 1428 + }, + { + "epoch": 0.94, + "grad_norm": 0.10420051217079163, + "learning_rate": 0.00023414274781340442, + "loss": 0.0211, + "step": 1429 + }, + { + "epoch": 0.94, + "grad_norm": 0.09319007396697998, + "learning_rate": 0.00023405738241115737, + "loss": 0.0324, + "step": 1430 + }, + { + "epoch": 0.94, + "grad_norm": 0.11446485668420792, + "learning_rate": 0.00023397197730318377, + "loss": 0.0381, + "step": 1431 + }, + { + "epoch": 0.94, + "grad_norm": 0.10845956206321716, + "learning_rate": 0.00023388653252982594, + "loss": 0.0171, + "step": 1432 + }, + { + "epoch": 0.94, + "grad_norm": 0.08544383198022842, + "learning_rate": 0.000233801048131445, + "loss": 0.0239, + "step": 1433 + }, + { + "epoch": 0.94, + "grad_norm": 0.10372909903526306, + "learning_rate": 0.0002337155241484207, + "loss": 0.0429, + "step": 1434 + }, + { + "epoch": 0.94, + "grad_norm": 0.24167174100875854, + "learning_rate": 0.00023362996062115154, + "loss": 0.1291, + "step": 1435 + }, + { + "epoch": 0.94, + "grad_norm": 0.10461205989122391, + "learning_rate": 0.00023354435759005473, + "loss": 0.0385, + "step": 1436 + }, + { + "epoch": 0.94, + "grad_norm": 0.14408838748931885, + "learning_rate": 0.0002334587150955661, + "loss": 0.0377, + "step": 1437 + }, + { + "epoch": 0.94, + "grad_norm": 0.08705660700798035, + "learning_rate": 0.0002333730331781401, + "loss": 0.0169, + "step": 1438 + }, + { + "epoch": 0.94, + "grad_norm": 0.10698029398918152, + "learning_rate": 0.00023328731187824986, + "loss": 0.0383, + "step": 1439 + }, + { + "epoch": 0.94, + "grad_norm": 0.18005134165287018, + "learning_rate": 0.0002332015512363871, + "loss": 0.0408, + "step": 1440 + }, + { + "epoch": 0.94, + "grad_norm": 0.11144935339689255, + "learning_rate": 0.00023311575129306202, + "loss": 0.0434, + "step": 1441 + }, + { + "epoch": 0.94, + "grad_norm": 0.09303693473339081, + "learning_rate": 0.0002330299120888035, + "loss": 0.0259, + "step": 1442 + }, + { + "epoch": 0.94, + "grad_norm": 0.09196025878190994, + "learning_rate": 0.00023294403366415904, + "loss": 0.0256, + "step": 1443 + }, + { + "epoch": 0.95, + "grad_norm": 0.09944535046815872, + "learning_rate": 0.00023285811605969442, + "loss": 0.0691, + "step": 1444 + }, + { + "epoch": 0.95, + "grad_norm": 0.0766916275024414, + "learning_rate": 0.00023277215931599417, + "loss": 0.0162, + "step": 1445 + }, + { + "epoch": 0.95, + "grad_norm": 0.0471719354391098, + "learning_rate": 0.00023268616347366114, + "loss": 0.0157, + "step": 1446 + }, + { + "epoch": 0.95, + "grad_norm": 0.0748835876584053, + "learning_rate": 0.0002326001285733168, + "loss": 0.0162, + "step": 1447 + }, + { + "epoch": 0.95, + "grad_norm": 0.2493734508752823, + "learning_rate": 0.0002325140546556009, + "loss": 0.0908, + "step": 1448 + }, + { + "epoch": 0.95, + "grad_norm": 0.1908605992794037, + "learning_rate": 0.0002324279417611717, + "loss": 0.0352, + "step": 1449 + }, + { + "epoch": 0.95, + "grad_norm": 0.16963645815849304, + "learning_rate": 0.00023234178993070595, + "loss": 0.0597, + "step": 1450 + }, + { + "epoch": 0.95, + "grad_norm": 0.1448785662651062, + "learning_rate": 0.0002322555992048987, + "loss": 0.0341, + "step": 1451 + }, + { + "epoch": 0.95, + "grad_norm": 0.11966606229543686, + "learning_rate": 0.00023216936962446334, + "loss": 0.0447, + "step": 1452 + }, + { + "epoch": 0.95, + "grad_norm": 0.06863813102245331, + "learning_rate": 0.00023208310123013176, + "loss": 0.0184, + "step": 1453 + }, + { + "epoch": 0.95, + "grad_norm": 0.08081576228141785, + "learning_rate": 0.000231996794062654, + "loss": 0.0183, + "step": 1454 + }, + { + "epoch": 0.95, + "grad_norm": 0.04790128767490387, + "learning_rate": 0.00023191044816279856, + "loss": 0.0159, + "step": 1455 + }, + { + "epoch": 0.95, + "grad_norm": 0.11623428761959076, + "learning_rate": 0.00023182406357135217, + "loss": 0.036, + "step": 1456 + }, + { + "epoch": 0.95, + "grad_norm": 0.19882117211818695, + "learning_rate": 0.0002317376403291198, + "loss": 0.0356, + "step": 1457 + }, + { + "epoch": 0.95, + "grad_norm": 0.06410811841487885, + "learning_rate": 0.0002316511784769248, + "loss": 0.0153, + "step": 1458 + }, + { + "epoch": 0.96, + "grad_norm": 0.1210549846291542, + "learning_rate": 0.00023156467805560862, + "loss": 0.0254, + "step": 1459 + }, + { + "epoch": 0.96, + "grad_norm": 0.09589160978794098, + "learning_rate": 0.00023147813910603102, + "loss": 0.0231, + "step": 1460 + }, + { + "epoch": 0.96, + "grad_norm": 0.05613451451063156, + "learning_rate": 0.00023139156166906993, + "loss": 0.008, + "step": 1461 + }, + { + "epoch": 0.96, + "grad_norm": 0.12222158908843994, + "learning_rate": 0.00023130494578562147, + "loss": 0.0236, + "step": 1462 + }, + { + "epoch": 0.96, + "grad_norm": 0.12595443427562714, + "learning_rate": 0.00023121829149659988, + "loss": 0.0284, + "step": 1463 + }, + { + "epoch": 0.96, + "grad_norm": 0.05631411075592041, + "learning_rate": 0.00023113159884293762, + "loss": 0.0083, + "step": 1464 + }, + { + "epoch": 0.96, + "grad_norm": 0.15821842849254608, + "learning_rate": 0.00023104486786558516, + "loss": 0.0281, + "step": 1465 + }, + { + "epoch": 0.96, + "grad_norm": 0.28132763504981995, + "learning_rate": 0.0002309580986055112, + "loss": 0.0744, + "step": 1466 + }, + { + "epoch": 0.96, + "grad_norm": 0.08583173155784607, + "learning_rate": 0.00023087129110370243, + "loss": 0.0163, + "step": 1467 + }, + { + "epoch": 0.96, + "grad_norm": 0.1472005695104599, + "learning_rate": 0.00023078444540116364, + "loss": 0.0342, + "step": 1468 + }, + { + "epoch": 0.96, + "grad_norm": 0.15789683163166046, + "learning_rate": 0.0002306975615389177, + "loss": 0.0321, + "step": 1469 + }, + { + "epoch": 0.96, + "grad_norm": 0.0862409770488739, + "learning_rate": 0.00023061063955800542, + "loss": 0.0337, + "step": 1470 + }, + { + "epoch": 0.96, + "grad_norm": 0.09513189643621445, + "learning_rate": 0.00023052367949948562, + "loss": 0.0156, + "step": 1471 + }, + { + "epoch": 0.96, + "grad_norm": 0.16023319959640503, + "learning_rate": 0.00023043668140443522, + "loss": 0.0437, + "step": 1472 + }, + { + "epoch": 0.96, + "grad_norm": 0.28757092356681824, + "learning_rate": 0.0002303496453139491, + "loss": 0.0526, + "step": 1473 + }, + { + "epoch": 0.96, + "grad_norm": 0.09820155799388885, + "learning_rate": 0.00023026257126913986, + "loss": 0.0087, + "step": 1474 + }, + { + "epoch": 0.97, + "grad_norm": 0.23134587705135345, + "learning_rate": 0.00023017545931113822, + "loss": 0.0613, + "step": 1475 + }, + { + "epoch": 0.97, + "grad_norm": 0.06153428182005882, + "learning_rate": 0.0002300883094810929, + "loss": 0.0086, + "step": 1476 + }, + { + "epoch": 0.97, + "grad_norm": 0.17993584275245667, + "learning_rate": 0.00023000112182017032, + "loss": 0.0339, + "step": 1477 + }, + { + "epoch": 0.97, + "grad_norm": 0.23367144167423248, + "learning_rate": 0.00022991389636955483, + "loss": 0.0785, + "step": 1478 + }, + { + "epoch": 0.97, + "grad_norm": 0.057765256613492966, + "learning_rate": 0.00022982663317044864, + "loss": 0.0077, + "step": 1479 + }, + { + "epoch": 0.97, + "grad_norm": 0.17549645900726318, + "learning_rate": 0.00022973933226407174, + "loss": 0.0578, + "step": 1480 + }, + { + "epoch": 0.97, + "grad_norm": 0.13486583530902863, + "learning_rate": 0.0002296519936916621, + "loss": 0.0381, + "step": 1481 + }, + { + "epoch": 0.97, + "grad_norm": 0.11634548753499985, + "learning_rate": 0.00022956461749447528, + "loss": 0.0356, + "step": 1482 + }, + { + "epoch": 0.97, + "grad_norm": 0.03911494463682175, + "learning_rate": 0.0002294772037137847, + "loss": 0.0082, + "step": 1483 + }, + { + "epoch": 0.97, + "grad_norm": 0.13597272336483002, + "learning_rate": 0.0002293897523908816, + "loss": 0.037, + "step": 1484 + }, + { + "epoch": 0.97, + "grad_norm": 0.03297096863389015, + "learning_rate": 0.0002293022635670748, + "loss": 0.0101, + "step": 1485 + }, + { + "epoch": 0.97, + "grad_norm": 0.1217992827296257, + "learning_rate": 0.00022921473728369099, + "loss": 0.0488, + "step": 1486 + }, + { + "epoch": 0.97, + "grad_norm": 0.08392113447189331, + "learning_rate": 0.0002291271735820744, + "loss": 0.0213, + "step": 1487 + }, + { + "epoch": 0.97, + "grad_norm": 0.0728277638554573, + "learning_rate": 0.00022903957250358707, + "loss": 0.0323, + "step": 1488 + }, + { + "epoch": 0.97, + "grad_norm": 0.19564445316791534, + "learning_rate": 0.0002289519340896086, + "loss": 0.0362, + "step": 1489 + }, + { + "epoch": 0.98, + "grad_norm": 0.09455154836177826, + "learning_rate": 0.00022886425838153634, + "loss": 0.0305, + "step": 1490 + }, + { + "epoch": 0.98, + "grad_norm": 0.02463528886437416, + "learning_rate": 0.00022877654542078515, + "loss": 0.0055, + "step": 1491 + }, + { + "epoch": 0.98, + "grad_norm": 0.10636550933122635, + "learning_rate": 0.0002286887952487875, + "loss": 0.0254, + "step": 1492 + }, + { + "epoch": 0.98, + "grad_norm": 0.08179948478937149, + "learning_rate": 0.00022860100790699352, + "loss": 0.0341, + "step": 1493 + }, + { + "epoch": 0.98, + "grad_norm": 0.04053513705730438, + "learning_rate": 0.00022851318343687074, + "loss": 0.0059, + "step": 1494 + }, + { + "epoch": 0.98, + "grad_norm": 0.06254950165748596, + "learning_rate": 0.00022842532187990444, + "loss": 0.016, + "step": 1495 + }, + { + "epoch": 0.98, + "grad_norm": 0.11671124398708344, + "learning_rate": 0.00022833742327759722, + "loss": 0.0316, + "step": 1496 + }, + { + "epoch": 0.98, + "grad_norm": 0.05388714000582695, + "learning_rate": 0.00022824948767146926, + "loss": 0.0114, + "step": 1497 + }, + { + "epoch": 0.98, + "grad_norm": 0.07483407109975815, + "learning_rate": 0.00022816151510305824, + "loss": 0.0121, + "step": 1498 + }, + { + "epoch": 0.98, + "grad_norm": 0.08650153130292892, + "learning_rate": 0.00022807350561391938, + "loss": 0.0518, + "step": 1499 + }, + { + "epoch": 0.98, + "grad_norm": 0.1296052485704422, + "learning_rate": 0.00022798545924562508, + "loss": 0.0666, + "step": 1500 + }, + { + "epoch": 0.98, + "grad_norm": 0.15292461216449738, + "learning_rate": 0.00022789737603976542, + "loss": 0.0314, + "step": 1501 + }, + { + "epoch": 0.98, + "grad_norm": 0.2241302728652954, + "learning_rate": 0.00022780925603794775, + "loss": 0.13, + "step": 1502 + }, + { + "epoch": 0.98, + "grad_norm": 0.07691671699285507, + "learning_rate": 0.00022772109928179688, + "loss": 0.0303, + "step": 1503 + }, + { + "epoch": 0.98, + "grad_norm": 0.07967071235179901, + "learning_rate": 0.0002276329058129548, + "loss": 0.0104, + "step": 1504 + }, + { + "epoch": 0.99, + "grad_norm": 0.15211229026317596, + "learning_rate": 0.00022754467567308114, + "loss": 0.0463, + "step": 1505 + }, + { + "epoch": 0.99, + "grad_norm": 0.1364462524652481, + "learning_rate": 0.00022745640890385263, + "loss": 0.0333, + "step": 1506 + }, + { + "epoch": 0.99, + "grad_norm": 0.08477602154016495, + "learning_rate": 0.00022736810554696335, + "loss": 0.0144, + "step": 1507 + }, + { + "epoch": 0.99, + "grad_norm": 0.030945677310228348, + "learning_rate": 0.0002272797656441247, + "loss": 0.0082, + "step": 1508 + }, + { + "epoch": 0.99, + "grad_norm": 0.0667153000831604, + "learning_rate": 0.00022719138923706525, + "loss": 0.0285, + "step": 1509 + }, + { + "epoch": 0.99, + "grad_norm": 0.15130023658275604, + "learning_rate": 0.00022710297636753096, + "loss": 0.0493, + "step": 1510 + }, + { + "epoch": 0.99, + "grad_norm": 0.07945651561021805, + "learning_rate": 0.00022701452707728486, + "loss": 0.0181, + "step": 1511 + }, + { + "epoch": 0.99, + "grad_norm": 0.1147598847746849, + "learning_rate": 0.00022692604140810735, + "loss": 0.0377, + "step": 1512 + }, + { + "epoch": 0.99, + "grad_norm": 0.04304948449134827, + "learning_rate": 0.00022683751940179588, + "loss": 0.0128, + "step": 1513 + }, + { + "epoch": 0.99, + "grad_norm": 0.08819684386253357, + "learning_rate": 0.00022674896110016503, + "loss": 0.0296, + "step": 1514 + }, + { + "epoch": 0.99, + "grad_norm": 0.06335631757974625, + "learning_rate": 0.0002266603665450467, + "loss": 0.0188, + "step": 1515 + }, + { + "epoch": 0.99, + "grad_norm": 0.08433008193969727, + "learning_rate": 0.00022657173577828979, + "loss": 0.0251, + "step": 1516 + }, + { + "epoch": 0.99, + "grad_norm": 0.06014099717140198, + "learning_rate": 0.00022648306884176034, + "loss": 0.0193, + "step": 1517 + }, + { + "epoch": 0.99, + "grad_norm": 0.05266990885138512, + "learning_rate": 0.00022639436577734143, + "loss": 0.0112, + "step": 1518 + }, + { + "epoch": 0.99, + "grad_norm": 0.10652010887861252, + "learning_rate": 0.00022630562662693328, + "loss": 0.0312, + "step": 1519 + }, + { + "epoch": 1.0, + "grad_norm": 0.043453726917505264, + "learning_rate": 0.00022621685143245308, + "loss": 0.009, + "step": 1520 + }, + { + "epoch": 1.0, + "grad_norm": 0.0685136690735817, + "learning_rate": 0.00022612804023583515, + "loss": 0.0189, + "step": 1521 + }, + { + "epoch": 1.0, + "grad_norm": 0.14430442452430725, + "learning_rate": 0.0002260391930790307, + "loss": 0.066, + "step": 1522 + }, + { + "epoch": 1.0, + "grad_norm": 0.0724061131477356, + "learning_rate": 0.00022595031000400794, + "loss": 0.0129, + "step": 1523 + }, + { + "epoch": 1.0, + "grad_norm": 0.18257959187030792, + "learning_rate": 0.00022586139105275214, + "loss": 0.0434, + "step": 1524 + }, + { + "epoch": 1.0, + "grad_norm": 0.06716416776180267, + "learning_rate": 0.00022577243626726548, + "loss": 0.0102, + "step": 1525 + }, + { + "epoch": 1.0, + "grad_norm": 0.05102796107530594, + "learning_rate": 0.00022568344568956697, + "loss": 0.0094, + "step": 1526 + }, + { + "epoch": 1.0, + "grad_norm": 0.0711396113038063, + "learning_rate": 0.0002255944193616927, + "loss": 0.0138, + "step": 1527 + }, + { + "epoch": 1.0, + "grad_norm": 0.09844920784235, + "learning_rate": 0.00022550535732569543, + "loss": 0.0144, + "step": 1528 + }, + { + "epoch": 1.0, + "eval_loss": 0.028799179941415787, + "eval_runtime": 39.8961, + "eval_samples_per_second": 32.259, + "eval_steps_per_second": 8.071, + "step": 1528 + }, + { + "epoch": 1.0, + "grad_norm": 0.03633524477481842, + "learning_rate": 0.00022541625962364497, + "loss": 0.0054, + "step": 1529 + }, + { + "epoch": 1.0, + "grad_norm": 0.0410737618803978, + "learning_rate": 0.00022532712629762795, + "loss": 0.0069, + "step": 1530 + }, + { + "epoch": 1.0, + "grad_norm": 0.04491305723786354, + "learning_rate": 0.00022523795738974776, + "loss": 0.0049, + "step": 1531 + }, + { + "epoch": 1.0, + "grad_norm": 0.058488693088293076, + "learning_rate": 0.0002251487529421246, + "loss": 0.0045, + "step": 1532 + }, + { + "epoch": 1.0, + "grad_norm": 0.008812353946268559, + "learning_rate": 0.00022505951299689553, + "loss": 0.0017, + "step": 1533 + }, + { + "epoch": 1.0, + "grad_norm": 0.01968853361904621, + "learning_rate": 0.00022497023759621433, + "loss": 0.0037, + "step": 1534 + }, + { + "epoch": 1.0, + "grad_norm": 0.05288131162524223, + "learning_rate": 0.00022488092678225153, + "loss": 0.0055, + "step": 1535 + }, + { + "epoch": 1.01, + "grad_norm": 0.23432080447673798, + "learning_rate": 0.0002247915805971944, + "loss": 0.0302, + "step": 1536 + }, + { + "epoch": 1.01, + "grad_norm": 0.09490291774272919, + "learning_rate": 0.00022470219908324684, + "loss": 0.0327, + "step": 1537 + }, + { + "epoch": 1.01, + "grad_norm": 0.13473515212535858, + "learning_rate": 0.00022461278228262958, + "loss": 0.0182, + "step": 1538 + }, + { + "epoch": 1.01, + "grad_norm": 0.06751556694507599, + "learning_rate": 0.00022452333023757998, + "loss": 0.0048, + "step": 1539 + }, + { + "epoch": 1.01, + "grad_norm": 0.22719435393810272, + "learning_rate": 0.00022443384299035193, + "loss": 0.0115, + "step": 1540 + }, + { + "epoch": 1.01, + "grad_norm": 0.006171741988509893, + "learning_rate": 0.00022434432058321605, + "loss": 0.0008, + "step": 1541 + }, + { + "epoch": 1.01, + "grad_norm": 0.005011085420846939, + "learning_rate": 0.00022425476305845958, + "loss": 0.0008, + "step": 1542 + }, + { + "epoch": 1.01, + "grad_norm": 0.01569819450378418, + "learning_rate": 0.00022416517045838628, + "loss": 0.0014, + "step": 1543 + }, + { + "epoch": 1.01, + "grad_norm": 0.37864693999290466, + "learning_rate": 0.00022407554282531658, + "loss": 0.0313, + "step": 1544 + }, + { + "epoch": 1.01, + "grad_norm": 0.121845543384552, + "learning_rate": 0.00022398588020158735, + "loss": 0.0252, + "step": 1545 + }, + { + "epoch": 1.01, + "grad_norm": 0.16927878558635712, + "learning_rate": 0.00022389618262955198, + "loss": 0.0413, + "step": 1546 + }, + { + "epoch": 1.01, + "grad_norm": 0.03560361638665199, + "learning_rate": 0.00022380645015158054, + "loss": 0.0038, + "step": 1547 + }, + { + "epoch": 1.01, + "grad_norm": 0.17061570286750793, + "learning_rate": 0.0002237166828100594, + "loss": 0.0238, + "step": 1548 + }, + { + "epoch": 1.01, + "grad_norm": 0.0610225647687912, + "learning_rate": 0.0002236268806473915, + "loss": 0.0077, + "step": 1549 + }, + { + "epoch": 1.01, + "grad_norm": 0.1876288652420044, + "learning_rate": 0.00022353704370599615, + "loss": 0.0293, + "step": 1550 + }, + { + "epoch": 1.02, + "grad_norm": 0.12210645526647568, + "learning_rate": 0.00022344717202830915, + "loss": 0.014, + "step": 1551 + }, + { + "epoch": 1.02, + "grad_norm": 0.09035097062587738, + "learning_rate": 0.00022335726565678277, + "loss": 0.0178, + "step": 1552 + }, + { + "epoch": 1.02, + "grad_norm": 0.016949700191617012, + "learning_rate": 0.0002232673246338855, + "loss": 0.0023, + "step": 1553 + }, + { + "epoch": 1.02, + "grad_norm": 0.22814500331878662, + "learning_rate": 0.0002231773490021023, + "loss": 0.0187, + "step": 1554 + }, + { + "epoch": 1.02, + "grad_norm": 0.06230514496564865, + "learning_rate": 0.00022308733880393447, + "loss": 0.0045, + "step": 1555 + }, + { + "epoch": 1.02, + "grad_norm": 0.037863511592149734, + "learning_rate": 0.00022299729408189968, + "loss": 0.0079, + "step": 1556 + }, + { + "epoch": 1.02, + "grad_norm": 0.10934063047170639, + "learning_rate": 0.00022290721487853185, + "loss": 0.037, + "step": 1557 + }, + { + "epoch": 1.02, + "grad_norm": 0.17740324139595032, + "learning_rate": 0.00022281710123638117, + "loss": 0.025, + "step": 1558 + }, + { + "epoch": 1.02, + "grad_norm": 0.09901938587427139, + "learning_rate": 0.00022272695319801417, + "loss": 0.0235, + "step": 1559 + }, + { + "epoch": 1.02, + "grad_norm": 0.09340560436248779, + "learning_rate": 0.00022263677080601354, + "loss": 0.0189, + "step": 1560 + }, + { + "epoch": 1.02, + "grad_norm": 0.1699395328760147, + "learning_rate": 0.00022254655410297827, + "loss": 0.0344, + "step": 1561 + }, + { + "epoch": 1.02, + "grad_norm": 0.1495848298072815, + "learning_rate": 0.00022245630313152352, + "loss": 0.0337, + "step": 1562 + }, + { + "epoch": 1.02, + "grad_norm": 0.2952522039413452, + "learning_rate": 0.00022236601793428063, + "loss": 0.0604, + "step": 1563 + }, + { + "epoch": 1.02, + "grad_norm": 0.1865517646074295, + "learning_rate": 0.0002222756985538972, + "loss": 0.063, + "step": 1564 + }, + { + "epoch": 1.02, + "grad_norm": 0.1253533810377121, + "learning_rate": 0.00022218534503303682, + "loss": 0.0317, + "step": 1565 + }, + { + "epoch": 1.03, + "grad_norm": 0.06356123089790344, + "learning_rate": 0.00022209495741437938, + "loss": 0.0125, + "step": 1566 + }, + { + "epoch": 1.03, + "grad_norm": 0.09303832054138184, + "learning_rate": 0.00022200453574062063, + "loss": 0.0212, + "step": 1567 + }, + { + "epoch": 1.03, + "grad_norm": 0.041782230138778687, + "learning_rate": 0.00022191408005447274, + "loss": 0.0081, + "step": 1568 + }, + { + "epoch": 1.03, + "grad_norm": 0.10091729462146759, + "learning_rate": 0.00022182359039866364, + "loss": 0.024, + "step": 1569 + }, + { + "epoch": 1.03, + "grad_norm": 0.03203802555799484, + "learning_rate": 0.00022173306681593747, + "loss": 0.007, + "step": 1570 + }, + { + "epoch": 1.03, + "grad_norm": 0.09047690778970718, + "learning_rate": 0.00022164250934905442, + "loss": 0.0253, + "step": 1571 + }, + { + "epoch": 1.03, + "grad_norm": 0.058186113834381104, + "learning_rate": 0.00022155191804079058, + "loss": 0.0107, + "step": 1572 + }, + { + "epoch": 1.03, + "grad_norm": 0.14713934063911438, + "learning_rate": 0.00022146129293393804, + "loss": 0.0268, + "step": 1573 + }, + { + "epoch": 1.03, + "grad_norm": 0.0747760757803917, + "learning_rate": 0.00022137063407130493, + "loss": 0.016, + "step": 1574 + }, + { + "epoch": 1.03, + "grad_norm": 0.05679846182465553, + "learning_rate": 0.0002212799414957153, + "loss": 0.0122, + "step": 1575 + }, + { + "epoch": 1.03, + "grad_norm": 0.040479984134435654, + "learning_rate": 0.00022118921525000903, + "loss": 0.0044, + "step": 1576 + }, + { + "epoch": 1.03, + "grad_norm": 0.06615650653839111, + "learning_rate": 0.00022109845537704204, + "loss": 0.0051, + "step": 1577 + }, + { + "epoch": 1.03, + "grad_norm": 0.0890481099486351, + "learning_rate": 0.00022100766191968606, + "loss": 0.0209, + "step": 1578 + }, + { + "epoch": 1.03, + "grad_norm": 0.24288196861743927, + "learning_rate": 0.00022091683492082875, + "loss": 0.0726, + "step": 1579 + }, + { + "epoch": 1.03, + "grad_norm": 0.25049716234207153, + "learning_rate": 0.00022082597442337344, + "loss": 0.0329, + "step": 1580 + }, + { + "epoch": 1.04, + "grad_norm": 0.028392024338245392, + "learning_rate": 0.0002207350804702395, + "loss": 0.0031, + "step": 1581 + }, + { + "epoch": 1.04, + "grad_norm": 0.023121071979403496, + "learning_rate": 0.00022064415310436202, + "loss": 0.0041, + "step": 1582 + }, + { + "epoch": 1.04, + "grad_norm": 0.022821614518761635, + "learning_rate": 0.0002205531923686918, + "loss": 0.0025, + "step": 1583 + }, + { + "epoch": 1.04, + "grad_norm": 0.17920270562171936, + "learning_rate": 0.00022046219830619554, + "loss": 0.017, + "step": 1584 + }, + { + "epoch": 1.04, + "grad_norm": 0.08992599695920944, + "learning_rate": 0.00022037117095985553, + "loss": 0.0306, + "step": 1585 + }, + { + "epoch": 1.04, + "grad_norm": 0.30955061316490173, + "learning_rate": 0.0002202801103726699, + "loss": 0.0687, + "step": 1586 + }, + { + "epoch": 1.04, + "grad_norm": 0.1383177936077118, + "learning_rate": 0.00022018901658765245, + "loss": 0.0236, + "step": 1587 + }, + { + "epoch": 1.04, + "grad_norm": 0.13899581134319305, + "learning_rate": 0.00022009788964783271, + "loss": 0.02, + "step": 1588 + }, + { + "epoch": 1.04, + "grad_norm": 0.3127962350845337, + "learning_rate": 0.00022000672959625564, + "loss": 0.0785, + "step": 1589 + }, + { + "epoch": 1.04, + "grad_norm": 0.1163870096206665, + "learning_rate": 0.00021991553647598218, + "loss": 0.0239, + "step": 1590 + }, + { + "epoch": 1.04, + "grad_norm": 0.08966390788555145, + "learning_rate": 0.00021982431033008867, + "loss": 0.0321, + "step": 1591 + }, + { + "epoch": 1.04, + "grad_norm": 0.1911877542734146, + "learning_rate": 0.00021973305120166712, + "loss": 0.0347, + "step": 1592 + }, + { + "epoch": 1.04, + "grad_norm": 0.1138681024312973, + "learning_rate": 0.00021964175913382508, + "loss": 0.0401, + "step": 1593 + }, + { + "epoch": 1.04, + "grad_norm": 0.05526265874505043, + "learning_rate": 0.00021955043416968571, + "loss": 0.0086, + "step": 1594 + }, + { + "epoch": 1.04, + "grad_norm": 0.08750049024820328, + "learning_rate": 0.00021945907635238766, + "loss": 0.0316, + "step": 1595 + }, + { + "epoch": 1.04, + "grad_norm": 0.11475680023431778, + "learning_rate": 0.00021936768572508513, + "loss": 0.0258, + "step": 1596 + }, + { + "epoch": 1.05, + "grad_norm": 0.10367580503225327, + "learning_rate": 0.00021927626233094784, + "loss": 0.0332, + "step": 1597 + }, + { + "epoch": 1.05, + "grad_norm": 0.09429827332496643, + "learning_rate": 0.0002191848062131609, + "loss": 0.0188, + "step": 1598 + }, + { + "epoch": 1.05, + "grad_norm": 0.131412535905838, + "learning_rate": 0.000219093317414925, + "loss": 0.0419, + "step": 1599 + }, + { + "epoch": 1.05, + "grad_norm": 0.2314550131559372, + "learning_rate": 0.0002190017959794562, + "loss": 0.0442, + "step": 1600 + }, + { + "epoch": 1.05, + "grad_norm": 0.12669596076011658, + "learning_rate": 0.00021891024194998593, + "loss": 0.0137, + "step": 1601 + }, + { + "epoch": 1.05, + "grad_norm": 0.09830581396818161, + "learning_rate": 0.0002188186553697611, + "loss": 0.022, + "step": 1602 + }, + { + "epoch": 1.05, + "grad_norm": 0.10588390380144119, + "learning_rate": 0.00021872703628204396, + "loss": 0.0573, + "step": 1603 + }, + { + "epoch": 1.05, + "grad_norm": 0.09916213899850845, + "learning_rate": 0.0002186353847301121, + "loss": 0.031, + "step": 1604 + }, + { + "epoch": 1.05, + "grad_norm": 0.09390533715486526, + "learning_rate": 0.00021854370075725848, + "loss": 0.0181, + "step": 1605 + }, + { + "epoch": 1.05, + "grad_norm": 0.09290188550949097, + "learning_rate": 0.0002184519844067914, + "loss": 0.0146, + "step": 1606 + }, + { + "epoch": 1.05, + "grad_norm": 0.1350594460964203, + "learning_rate": 0.00021836023572203433, + "loss": 0.0536, + "step": 1607 + }, + { + "epoch": 1.05, + "grad_norm": 0.04494404420256615, + "learning_rate": 0.0002182684547463261, + "loss": 0.006, + "step": 1608 + }, + { + "epoch": 1.05, + "grad_norm": 0.1277172863483429, + "learning_rate": 0.00021817664152302087, + "loss": 0.0201, + "step": 1609 + }, + { + "epoch": 1.05, + "grad_norm": 0.09169845283031464, + "learning_rate": 0.0002180847960954879, + "loss": 0.0051, + "step": 1610 + }, + { + "epoch": 1.05, + "grad_norm": 0.11080282181501389, + "learning_rate": 0.00021799291850711173, + "loss": 0.02, + "step": 1611 + }, + { + "epoch": 1.06, + "grad_norm": 0.06069633364677429, + "learning_rate": 0.00021790100880129208, + "loss": 0.0169, + "step": 1612 + }, + { + "epoch": 1.06, + "grad_norm": 0.11857344210147858, + "learning_rate": 0.00021780906702144372, + "loss": 0.0361, + "step": 1613 + }, + { + "epoch": 1.06, + "grad_norm": 0.10318568348884583, + "learning_rate": 0.0002177170932109968, + "loss": 0.0091, + "step": 1614 + }, + { + "epoch": 1.06, + "grad_norm": 0.08703909814357758, + "learning_rate": 0.00021762508741339655, + "loss": 0.0095, + "step": 1615 + }, + { + "epoch": 1.06, + "grad_norm": 0.14628368616104126, + "learning_rate": 0.00021753304967210313, + "loss": 0.0135, + "step": 1616 + }, + { + "epoch": 1.06, + "grad_norm": 0.09634681791067123, + "learning_rate": 0.0002174409800305919, + "loss": 0.0098, + "step": 1617 + }, + { + "epoch": 1.06, + "grad_norm": 3.208465576171875, + "learning_rate": 0.00021734887853235333, + "loss": 0.0302, + "step": 1618 + }, + { + "epoch": 1.06, + "grad_norm": 0.2852117121219635, + "learning_rate": 0.00021725674522089292, + "loss": 0.0442, + "step": 1619 + }, + { + "epoch": 1.06, + "grad_norm": 0.016181744635105133, + "learning_rate": 0.0002171645801397312, + "loss": 0.0021, + "step": 1620 + }, + { + "epoch": 1.06, + "grad_norm": 0.020854290574789047, + "learning_rate": 0.00021707238333240362, + "loss": 0.0027, + "step": 1621 + }, + { + "epoch": 1.06, + "grad_norm": 0.16986317932605743, + "learning_rate": 0.00021698015484246068, + "loss": 0.0422, + "step": 1622 + }, + { + "epoch": 1.06, + "grad_norm": 0.15938438475131989, + "learning_rate": 0.0002168878947134679, + "loss": 0.0224, + "step": 1623 + }, + { + "epoch": 1.06, + "grad_norm": 0.07981102913618088, + "learning_rate": 0.00021679560298900572, + "loss": 0.01, + "step": 1624 + }, + { + "epoch": 1.06, + "grad_norm": 0.08571261167526245, + "learning_rate": 0.00021670327971266937, + "loss": 0.0137, + "step": 1625 + }, + { + "epoch": 1.06, + "grad_norm": 0.09093351662158966, + "learning_rate": 0.00021661092492806917, + "loss": 0.0089, + "step": 1626 + }, + { + "epoch": 1.07, + "grad_norm": 0.12952205538749695, + "learning_rate": 0.0002165185386788302, + "loss": 0.0257, + "step": 1627 + }, + { + "epoch": 1.07, + "grad_norm": 0.24057811498641968, + "learning_rate": 0.00021642612100859256, + "loss": 0.041, + "step": 1628 + }, + { + "epoch": 1.07, + "grad_norm": 0.6021707653999329, + "learning_rate": 0.00021633367196101093, + "loss": 0.0363, + "step": 1629 + }, + { + "epoch": 1.07, + "grad_norm": 0.09282581508159637, + "learning_rate": 0.000216241191579755, + "loss": 0.013, + "step": 1630 + }, + { + "epoch": 1.07, + "grad_norm": 0.027487829327583313, + "learning_rate": 0.0002161486799085093, + "loss": 0.0043, + "step": 1631 + }, + { + "epoch": 1.07, + "grad_norm": 0.0671788677573204, + "learning_rate": 0.00021605613699097296, + "loss": 0.0148, + "step": 1632 + }, + { + "epoch": 1.07, + "grad_norm": 0.11706430464982986, + "learning_rate": 0.0002159635628708601, + "loss": 0.0235, + "step": 1633 + }, + { + "epoch": 1.07, + "grad_norm": 0.026642918586730957, + "learning_rate": 0.00021587095759189934, + "loss": 0.0037, + "step": 1634 + }, + { + "epoch": 1.07, + "grad_norm": 0.011636834591627121, + "learning_rate": 0.0002157783211978341, + "loss": 0.0018, + "step": 1635 + }, + { + "epoch": 1.07, + "grad_norm": 0.0854690819978714, + "learning_rate": 0.00021568565373242268, + "loss": 0.0049, + "step": 1636 + }, + { + "epoch": 1.07, + "grad_norm": 0.3163520395755768, + "learning_rate": 0.0002155929552394378, + "loss": 0.0333, + "step": 1637 + }, + { + "epoch": 1.07, + "grad_norm": 0.1685972809791565, + "learning_rate": 0.00021550022576266695, + "loss": 0.0175, + "step": 1638 + }, + { + "epoch": 1.07, + "grad_norm": 0.10485909879207611, + "learning_rate": 0.00021540746534591223, + "loss": 0.0351, + "step": 1639 + }, + { + "epoch": 1.07, + "grad_norm": 0.21983854472637177, + "learning_rate": 0.00021531467403299042, + "loss": 0.0586, + "step": 1640 + }, + { + "epoch": 1.07, + "grad_norm": 0.057190775871276855, + "learning_rate": 0.00021522185186773283, + "loss": 0.0047, + "step": 1641 + }, + { + "epoch": 1.07, + "grad_norm": 0.03798317164182663, + "learning_rate": 0.00021512899889398535, + "loss": 0.0028, + "step": 1642 + }, + { + "epoch": 1.08, + "grad_norm": 0.24047723412513733, + "learning_rate": 0.0002150361151556084, + "loss": 0.0539, + "step": 1643 + }, + { + "epoch": 1.08, + "grad_norm": 0.05976404622197151, + "learning_rate": 0.000214943200696477, + "loss": 0.0102, + "step": 1644 + }, + { + "epoch": 1.08, + "grad_norm": 0.028973877429962158, + "learning_rate": 0.00021485025556048067, + "loss": 0.0048, + "step": 1645 + }, + { + "epoch": 1.08, + "grad_norm": 0.19304604828357697, + "learning_rate": 0.00021475727979152338, + "loss": 0.0188, + "step": 1646 + }, + { + "epoch": 1.08, + "grad_norm": 0.04543152078986168, + "learning_rate": 0.00021466427343352353, + "loss": 0.0074, + "step": 1647 + }, + { + "epoch": 1.08, + "grad_norm": 0.12159468978643417, + "learning_rate": 0.00021457123653041409, + "loss": 0.0209, + "step": 1648 + }, + { + "epoch": 1.08, + "grad_norm": 0.28786468505859375, + "learning_rate": 0.00021447816912614236, + "loss": 0.022, + "step": 1649 + }, + { + "epoch": 1.08, + "grad_norm": 0.169004887342453, + "learning_rate": 0.00021438507126467015, + "loss": 0.0577, + "step": 1650 + }, + { + "epoch": 1.08, + "grad_norm": 0.07666225731372833, + "learning_rate": 0.00021429194298997349, + "loss": 0.0115, + "step": 1651 + }, + { + "epoch": 1.08, + "grad_norm": 0.02752969041466713, + "learning_rate": 0.00021419878434604287, + "loss": 0.0026, + "step": 1652 + }, + { + "epoch": 1.08, + "grad_norm": 0.03154395520687103, + "learning_rate": 0.00021410559537688324, + "loss": 0.0022, + "step": 1653 + }, + { + "epoch": 1.08, + "grad_norm": 0.02959531545639038, + "learning_rate": 0.00021401237612651372, + "loss": 0.0043, + "step": 1654 + }, + { + "epoch": 1.08, + "grad_norm": 0.13195501267910004, + "learning_rate": 0.0002139191266389677, + "loss": 0.0178, + "step": 1655 + }, + { + "epoch": 1.08, + "grad_norm": 0.03975927457213402, + "learning_rate": 0.000213825846958293, + "loss": 0.0055, + "step": 1656 + }, + { + "epoch": 1.08, + "grad_norm": 0.06535745412111282, + "learning_rate": 0.00021373253712855168, + "loss": 0.0223, + "step": 1657 + }, + { + "epoch": 1.09, + "grad_norm": 0.09343099594116211, + "learning_rate": 0.00021363919719381987, + "loss": 0.0123, + "step": 1658 + }, + { + "epoch": 1.09, + "grad_norm": 0.1524379998445511, + "learning_rate": 0.00021354582719818816, + "loss": 0.0421, + "step": 1659 + }, + { + "epoch": 1.09, + "grad_norm": 0.18298211693763733, + "learning_rate": 0.00021345242718576117, + "loss": 0.0444, + "step": 1660 + }, + { + "epoch": 1.09, + "grad_norm": 0.088227778673172, + "learning_rate": 0.00021335899720065777, + "loss": 0.0116, + "step": 1661 + }, + { + "epoch": 1.09, + "grad_norm": 0.1901037096977234, + "learning_rate": 0.00021326553728701091, + "loss": 0.0211, + "step": 1662 + }, + { + "epoch": 1.09, + "grad_norm": 0.22244389355182648, + "learning_rate": 0.00021317204748896786, + "loss": 0.0567, + "step": 1663 + }, + { + "epoch": 1.09, + "grad_norm": 0.24408915638923645, + "learning_rate": 0.00021307852785068976, + "loss": 0.0472, + "step": 1664 + }, + { + "epoch": 1.09, + "grad_norm": 0.13890671730041504, + "learning_rate": 0.00021298497841635208, + "loss": 0.0326, + "step": 1665 + }, + { + "epoch": 1.09, + "grad_norm": 0.10192529112100601, + "learning_rate": 0.00021289139923014416, + "loss": 0.0142, + "step": 1666 + }, + { + "epoch": 1.09, + "grad_norm": 0.02748352289199829, + "learning_rate": 0.00021279779033626955, + "loss": 0.0065, + "step": 1667 + }, + { + "epoch": 1.09, + "grad_norm": 0.029419776052236557, + "learning_rate": 0.00021270415177894578, + "loss": 0.0062, + "step": 1668 + }, + { + "epoch": 1.09, + "grad_norm": 0.13964907824993134, + "learning_rate": 0.00021261048360240434, + "loss": 0.0442, + "step": 1669 + }, + { + "epoch": 1.09, + "grad_norm": 0.22910548746585846, + "learning_rate": 0.00021251678585089076, + "loss": 0.0735, + "step": 1670 + }, + { + "epoch": 1.09, + "grad_norm": 0.10947010666131973, + "learning_rate": 0.0002124230585686645, + "loss": 0.0225, + "step": 1671 + }, + { + "epoch": 1.09, + "grad_norm": 0.04233964532613754, + "learning_rate": 0.00021232930179999914, + "loss": 0.0121, + "step": 1672 + }, + { + "epoch": 1.1, + "grad_norm": 0.11553874611854553, + "learning_rate": 0.00021223551558918193, + "loss": 0.0206, + "step": 1673 + }, + { + "epoch": 1.1, + "grad_norm": 0.30024242401123047, + "learning_rate": 0.0002121416999805142, + "loss": 0.0369, + "step": 1674 + }, + { + "epoch": 1.1, + "grad_norm": 0.04763152822852135, + "learning_rate": 0.00021204785501831107, + "loss": 0.0094, + "step": 1675 + }, + { + "epoch": 1.1, + "grad_norm": 0.12608014047145844, + "learning_rate": 0.00021195398074690163, + "loss": 0.0529, + "step": 1676 + }, + { + "epoch": 1.1, + "grad_norm": 0.08367837965488434, + "learning_rate": 0.00021186007721062873, + "loss": 0.0172, + "step": 1677 + }, + { + "epoch": 1.1, + "grad_norm": 0.12868303060531616, + "learning_rate": 0.00021176614445384906, + "loss": 0.0268, + "step": 1678 + }, + { + "epoch": 1.1, + "grad_norm": 0.06957541406154633, + "learning_rate": 0.00021167218252093314, + "loss": 0.0099, + "step": 1679 + }, + { + "epoch": 1.1, + "grad_norm": 0.17989858984947205, + "learning_rate": 0.00021157819145626523, + "loss": 0.0316, + "step": 1680 + }, + { + "epoch": 1.1, + "grad_norm": 0.06561832875013351, + "learning_rate": 0.00021148417130424345, + "loss": 0.0118, + "step": 1681 + }, + { + "epoch": 1.1, + "grad_norm": 0.07524342089891434, + "learning_rate": 0.0002113901221092795, + "loss": 0.0246, + "step": 1682 + }, + { + "epoch": 1.1, + "grad_norm": 0.09207272529602051, + "learning_rate": 0.0002112960439157989, + "loss": 0.0371, + "step": 1683 + }, + { + "epoch": 1.1, + "grad_norm": 0.11619143187999725, + "learning_rate": 0.00021120193676824086, + "loss": 0.0196, + "step": 1684 + }, + { + "epoch": 1.1, + "grad_norm": 0.09404407441616058, + "learning_rate": 0.00021110780071105829, + "loss": 0.0198, + "step": 1685 + }, + { + "epoch": 1.1, + "grad_norm": 0.025721121579408646, + "learning_rate": 0.00021101363578871773, + "loss": 0.0033, + "step": 1686 + }, + { + "epoch": 1.1, + "grad_norm": 0.17948229610919952, + "learning_rate": 0.00021091944204569928, + "loss": 0.0225, + "step": 1687 + }, + { + "epoch": 1.11, + "grad_norm": 0.299441933631897, + "learning_rate": 0.00021082521952649677, + "loss": 0.0314, + "step": 1688 + }, + { + "epoch": 1.11, + "grad_norm": 0.18134386837482452, + "learning_rate": 0.00021073096827561755, + "loss": 0.0077, + "step": 1689 + }, + { + "epoch": 1.11, + "grad_norm": 0.030258700251579285, + "learning_rate": 0.00021063668833758265, + "loss": 0.005, + "step": 1690 + }, + { + "epoch": 1.11, + "grad_norm": 0.44221439957618713, + "learning_rate": 0.00021054237975692646, + "loss": 0.0267, + "step": 1691 + }, + { + "epoch": 1.11, + "grad_norm": 0.19150428473949432, + "learning_rate": 0.0002104480425781971, + "loss": 0.0843, + "step": 1692 + }, + { + "epoch": 1.11, + "grad_norm": 0.16386204957962036, + "learning_rate": 0.00021035367684595603, + "loss": 0.0408, + "step": 1693 + }, + { + "epoch": 1.11, + "grad_norm": 0.06396955996751785, + "learning_rate": 0.0002102592826047783, + "loss": 0.0051, + "step": 1694 + }, + { + "epoch": 1.11, + "grad_norm": 0.096857450902462, + "learning_rate": 0.0002101648598992525, + "loss": 0.0104, + "step": 1695 + }, + { + "epoch": 1.11, + "grad_norm": 0.08629319071769714, + "learning_rate": 0.0002100704087739804, + "loss": 0.0066, + "step": 1696 + }, + { + "epoch": 1.11, + "grad_norm": 0.06080562621355057, + "learning_rate": 0.00020997592927357746, + "loss": 0.0083, + "step": 1697 + }, + { + "epoch": 1.11, + "grad_norm": 0.026974063366651535, + "learning_rate": 0.00020988142144267246, + "loss": 0.0043, + "step": 1698 + }, + { + "epoch": 1.11, + "grad_norm": 0.01949911192059517, + "learning_rate": 0.00020978688532590747, + "loss": 0.0034, + "step": 1699 + }, + { + "epoch": 1.11, + "grad_norm": 0.0293037761002779, + "learning_rate": 0.0002096923209679381, + "loss": 0.005, + "step": 1700 + }, + { + "epoch": 1.11, + "grad_norm": 0.03902202844619751, + "learning_rate": 0.0002095977284134331, + "loss": 0.005, + "step": 1701 + }, + { + "epoch": 1.11, + "grad_norm": 0.02799048461019993, + "learning_rate": 0.0002095031077070747, + "loss": 0.0044, + "step": 1702 + }, + { + "epoch": 1.11, + "grad_norm": 0.07804699242115021, + "learning_rate": 0.00020940845889355842, + "loss": 0.0053, + "step": 1703 + }, + { + "epoch": 1.12, + "grad_norm": 0.27684271335601807, + "learning_rate": 0.00020931378201759283, + "loss": 0.0272, + "step": 1704 + }, + { + "epoch": 1.12, + "grad_norm": 0.10727167129516602, + "learning_rate": 0.00020921907712390008, + "loss": 0.008, + "step": 1705 + }, + { + "epoch": 1.12, + "grad_norm": 0.1409468650817871, + "learning_rate": 0.00020912434425721536, + "loss": 0.0078, + "step": 1706 + }, + { + "epoch": 1.12, + "grad_norm": 0.06090042367577553, + "learning_rate": 0.0002090295834622871, + "loss": 0.0022, + "step": 1707 + }, + { + "epoch": 1.12, + "grad_norm": 0.3309457004070282, + "learning_rate": 0.00020893479478387695, + "loss": 0.0202, + "step": 1708 + }, + { + "epoch": 1.12, + "grad_norm": 0.014704009518027306, + "learning_rate": 0.00020883997826675972, + "loss": 0.0018, + "step": 1709 + }, + { + "epoch": 1.12, + "grad_norm": 0.036989156156778336, + "learning_rate": 0.0002087451339557234, + "loss": 0.0023, + "step": 1710 + }, + { + "epoch": 1.12, + "grad_norm": 0.0498960018157959, + "learning_rate": 0.00020865026189556898, + "loss": 0.0022, + "step": 1711 + }, + { + "epoch": 1.12, + "grad_norm": 0.3943072259426117, + "learning_rate": 0.0002085553621311108, + "loss": 0.0478, + "step": 1712 + }, + { + "epoch": 1.12, + "grad_norm": 0.011776590719819069, + "learning_rate": 0.00020846043470717606, + "loss": 0.0015, + "step": 1713 + }, + { + "epoch": 1.12, + "grad_norm": 0.01262225303798914, + "learning_rate": 0.00020836547966860512, + "loss": 0.0012, + "step": 1714 + }, + { + "epoch": 1.12, + "grad_norm": 0.44584786891937256, + "learning_rate": 0.00020827049706025134, + "loss": 0.0376, + "step": 1715 + }, + { + "epoch": 1.12, + "grad_norm": 0.1026252806186676, + "learning_rate": 0.00020817548692698122, + "loss": 0.0057, + "step": 1716 + }, + { + "epoch": 1.12, + "grad_norm": 0.5443242788314819, + "learning_rate": 0.0002080804493136741, + "loss": 0.0355, + "step": 1717 + }, + { + "epoch": 1.12, + "grad_norm": 0.2901538610458374, + "learning_rate": 0.0002079853842652224, + "loss": 0.0331, + "step": 1718 + }, + { + "epoch": 1.13, + "grad_norm": 0.13325665891170502, + "learning_rate": 0.00020789029182653146, + "loss": 0.0314, + "step": 1719 + }, + { + "epoch": 1.13, + "grad_norm": 0.3474900722503662, + "learning_rate": 0.00020779517204251962, + "loss": 0.0052, + "step": 1720 + }, + { + "epoch": 1.13, + "grad_norm": 0.19612590968608856, + "learning_rate": 0.00020770002495811807, + "loss": 0.0159, + "step": 1721 + }, + { + "epoch": 1.13, + "grad_norm": 0.12384012341499329, + "learning_rate": 0.00020760485061827096, + "loss": 0.0463, + "step": 1722 + }, + { + "epoch": 1.13, + "grad_norm": 0.2543465495109558, + "learning_rate": 0.00020750964906793518, + "loss": 0.1287, + "step": 1723 + }, + { + "epoch": 1.13, + "grad_norm": 0.037789445370435715, + "learning_rate": 0.00020741442035208062, + "loss": 0.0039, + "step": 1724 + }, + { + "epoch": 1.13, + "grad_norm": 0.026412533596158028, + "learning_rate": 0.00020731916451568991, + "loss": 0.004, + "step": 1725 + }, + { + "epoch": 1.13, + "grad_norm": 0.30076315999031067, + "learning_rate": 0.00020722388160375867, + "loss": 0.0585, + "step": 1726 + }, + { + "epoch": 1.13, + "grad_norm": 0.052092649042606354, + "learning_rate": 0.00020712857166129502, + "loss": 0.0059, + "step": 1727 + }, + { + "epoch": 1.13, + "grad_norm": 0.23526246845722198, + "learning_rate": 0.00020703323473332, + "loss": 0.0618, + "step": 1728 + }, + { + "epoch": 1.13, + "grad_norm": 0.17606157064437866, + "learning_rate": 0.00020693787086486747, + "loss": 0.0382, + "step": 1729 + }, + { + "epoch": 1.13, + "grad_norm": 0.04785061255097389, + "learning_rate": 0.0002068424801009839, + "loss": 0.009, + "step": 1730 + }, + { + "epoch": 1.13, + "grad_norm": 0.1910519003868103, + "learning_rate": 0.0002067470624867285, + "loss": 0.0268, + "step": 1731 + }, + { + "epoch": 1.13, + "grad_norm": 0.029830094426870346, + "learning_rate": 0.00020665161806717318, + "loss": 0.0058, + "step": 1732 + }, + { + "epoch": 1.13, + "grad_norm": 0.22875742614269257, + "learning_rate": 0.0002065561468874025, + "loss": 0.042, + "step": 1733 + }, + { + "epoch": 1.14, + "grad_norm": 0.1160743236541748, + "learning_rate": 0.00020646064899251365, + "loss": 0.0123, + "step": 1734 + }, + { + "epoch": 1.14, + "grad_norm": 0.09632806479930878, + "learning_rate": 0.0002063651244276165, + "loss": 0.028, + "step": 1735 + }, + { + "epoch": 1.14, + "grad_norm": 0.045413848012685776, + "learning_rate": 0.00020626957323783337, + "loss": 0.0103, + "step": 1736 + }, + { + "epoch": 1.14, + "grad_norm": 0.06504768133163452, + "learning_rate": 0.00020617399546829932, + "loss": 0.0103, + "step": 1737 + }, + { + "epoch": 1.14, + "grad_norm": 0.11606152355670929, + "learning_rate": 0.00020607839116416188, + "loss": 0.0219, + "step": 1738 + }, + { + "epoch": 1.14, + "grad_norm": 0.15662230551242828, + "learning_rate": 0.00020598276037058115, + "loss": 0.0075, + "step": 1739 + }, + { + "epoch": 1.14, + "grad_norm": 0.060219258069992065, + "learning_rate": 0.00020588710313272968, + "loss": 0.0051, + "step": 1740 + }, + { + "epoch": 1.14, + "grad_norm": 0.033545345067977905, + "learning_rate": 0.0002057914194957926, + "loss": 0.006, + "step": 1741 + }, + { + "epoch": 1.14, + "grad_norm": 0.1281086504459381, + "learning_rate": 0.00020569570950496746, + "loss": 0.0148, + "step": 1742 + }, + { + "epoch": 1.14, + "grad_norm": 0.28984948992729187, + "learning_rate": 0.0002055999732054643, + "loss": 0.0788, + "step": 1743 + }, + { + "epoch": 1.14, + "grad_norm": 0.022472795099020004, + "learning_rate": 0.00020550421064250546, + "loss": 0.0032, + "step": 1744 + }, + { + "epoch": 1.14, + "grad_norm": 0.21518099308013916, + "learning_rate": 0.00020540842186132587, + "loss": 0.0624, + "step": 1745 + }, + { + "epoch": 1.14, + "grad_norm": 0.11233403533697128, + "learning_rate": 0.00020531260690717269, + "loss": 0.018, + "step": 1746 + }, + { + "epoch": 1.14, + "grad_norm": 0.0663604736328125, + "learning_rate": 0.0002052167658253055, + "loss": 0.0247, + "step": 1747 + }, + { + "epoch": 1.14, + "grad_norm": 0.019785290583968163, + "learning_rate": 0.00020512089866099635, + "loss": 0.004, + "step": 1748 + }, + { + "epoch": 1.15, + "grad_norm": 0.18856649100780487, + "learning_rate": 0.00020502500545952935, + "loss": 0.0246, + "step": 1749 + }, + { + "epoch": 1.15, + "grad_norm": 0.06530511379241943, + "learning_rate": 0.0002049290862662011, + "loss": 0.0047, + "step": 1750 + }, + { + "epoch": 1.15, + "grad_norm": 0.11900179833173752, + "learning_rate": 0.0002048331411263204, + "loss": 0.0305, + "step": 1751 + }, + { + "epoch": 1.15, + "grad_norm": 0.08668252825737, + "learning_rate": 0.00020473717008520842, + "loss": 0.0342, + "step": 1752 + }, + { + "epoch": 1.15, + "grad_norm": 0.033169977366924286, + "learning_rate": 0.00020464117318819836, + "loss": 0.0051, + "step": 1753 + }, + { + "epoch": 1.15, + "grad_norm": 0.11904280632734299, + "learning_rate": 0.00020454515048063578, + "loss": 0.0148, + "step": 1754 + }, + { + "epoch": 1.15, + "grad_norm": 0.03916119784116745, + "learning_rate": 0.00020444910200787846, + "loss": 0.0064, + "step": 1755 + }, + { + "epoch": 1.15, + "grad_norm": 0.23415330052375793, + "learning_rate": 0.0002043530278152963, + "loss": 0.0252, + "step": 1756 + }, + { + "epoch": 1.15, + "grad_norm": 0.2871975302696228, + "learning_rate": 0.0002042569279482712, + "loss": 0.0425, + "step": 1757 + }, + { + "epoch": 1.15, + "grad_norm": 0.09590361267328262, + "learning_rate": 0.00020416080245219743, + "loss": 0.0079, + "step": 1758 + }, + { + "epoch": 1.15, + "grad_norm": 0.037294209003448486, + "learning_rate": 0.00020406465137248135, + "loss": 0.0027, + "step": 1759 + }, + { + "epoch": 1.15, + "grad_norm": 0.14344163239002228, + "learning_rate": 0.00020396847475454114, + "loss": 0.0109, + "step": 1760 + }, + { + "epoch": 1.15, + "grad_norm": 0.01968855783343315, + "learning_rate": 0.0002038722726438074, + "loss": 0.0024, + "step": 1761 + }, + { + "epoch": 1.15, + "grad_norm": 0.26668640971183777, + "learning_rate": 0.00020377604508572245, + "loss": 0.0166, + "step": 1762 + }, + { + "epoch": 1.15, + "grad_norm": 0.0922434851527214, + "learning_rate": 0.00020367979212574085, + "loss": 0.0128, + "step": 1763 + }, + { + "epoch": 1.15, + "grad_norm": 0.012287971563637257, + "learning_rate": 0.0002035835138093291, + "loss": 0.0018, + "step": 1764 + }, + { + "epoch": 1.16, + "grad_norm": 0.14998489618301392, + "learning_rate": 0.0002034872101819656, + "loss": 0.0383, + "step": 1765 + }, + { + "epoch": 1.16, + "grad_norm": 0.03521181270480156, + "learning_rate": 0.00020339088128914083, + "loss": 0.0042, + "step": 1766 + }, + { + "epoch": 1.16, + "grad_norm": 0.1004776582121849, + "learning_rate": 0.00020329452717635712, + "loss": 0.0217, + "step": 1767 + }, + { + "epoch": 1.16, + "grad_norm": 0.10840025544166565, + "learning_rate": 0.00020319814788912868, + "loss": 0.0238, + "step": 1768 + }, + { + "epoch": 1.16, + "grad_norm": 0.08637768775224686, + "learning_rate": 0.00020310174347298174, + "loss": 0.0244, + "step": 1769 + }, + { + "epoch": 1.16, + "grad_norm": 0.12513408064842224, + "learning_rate": 0.00020300531397345433, + "loss": 0.0402, + "step": 1770 + }, + { + "epoch": 1.16, + "grad_norm": 0.1716419905424118, + "learning_rate": 0.00020290885943609628, + "loss": 0.0473, + "step": 1771 + }, + { + "epoch": 1.16, + "grad_norm": 0.1464458853006363, + "learning_rate": 0.00020281237990646932, + "loss": 0.0263, + "step": 1772 + }, + { + "epoch": 1.16, + "grad_norm": 0.2202548384666443, + "learning_rate": 0.00020271587543014695, + "loss": 0.014, + "step": 1773 + }, + { + "epoch": 1.16, + "grad_norm": 0.20607982575893402, + "learning_rate": 0.00020261934605271447, + "loss": 0.0112, + "step": 1774 + }, + { + "epoch": 1.16, + "grad_norm": 0.04717608913779259, + "learning_rate": 0.00020252279181976897, + "loss": 0.0072, + "step": 1775 + }, + { + "epoch": 1.16, + "grad_norm": 0.1588015854358673, + "learning_rate": 0.00020242621277691912, + "loss": 0.0203, + "step": 1776 + }, + { + "epoch": 1.16, + "grad_norm": 0.28549695014953613, + "learning_rate": 0.00020232960896978558, + "loss": 0.0256, + "step": 1777 + }, + { + "epoch": 1.16, + "grad_norm": 0.05648793280124664, + "learning_rate": 0.00020223298044400048, + "loss": 0.0172, + "step": 1778 + }, + { + "epoch": 1.16, + "grad_norm": 0.08711002767086029, + "learning_rate": 0.00020213632724520777, + "loss": 0.0091, + "step": 1779 + }, + { + "epoch": 1.17, + "grad_norm": 0.1041957437992096, + "learning_rate": 0.00020203964941906293, + "loss": 0.0391, + "step": 1780 + }, + { + "epoch": 1.17, + "grad_norm": 0.23113363981246948, + "learning_rate": 0.00020194294701123317, + "loss": 0.0202, + "step": 1781 + }, + { + "epoch": 1.17, + "grad_norm": 0.05238531902432442, + "learning_rate": 0.00020184622006739724, + "loss": 0.0133, + "step": 1782 + }, + { + "epoch": 1.17, + "grad_norm": 0.16166527569293976, + "learning_rate": 0.00020174946863324555, + "loss": 0.0162, + "step": 1783 + }, + { + "epoch": 1.17, + "grad_norm": 0.09691984206438065, + "learning_rate": 0.0002016526927544801, + "loss": 0.0163, + "step": 1784 + }, + { + "epoch": 1.17, + "grad_norm": 0.09754455834627151, + "learning_rate": 0.0002015558924768143, + "loss": 0.0053, + "step": 1785 + }, + { + "epoch": 1.17, + "grad_norm": 0.13531388342380524, + "learning_rate": 0.00020145906784597317, + "loss": 0.0243, + "step": 1786 + }, + { + "epoch": 1.17, + "grad_norm": 0.06766755878925323, + "learning_rate": 0.0002013622189076933, + "loss": 0.0121, + "step": 1787 + }, + { + "epoch": 1.17, + "grad_norm": 0.00836429838091135, + "learning_rate": 0.00020126534570772265, + "loss": 0.0012, + "step": 1788 + }, + { + "epoch": 1.17, + "grad_norm": 0.009742120280861855, + "learning_rate": 0.00020116844829182065, + "loss": 0.0013, + "step": 1789 + }, + { + "epoch": 1.17, + "grad_norm": 0.593370258808136, + "learning_rate": 0.00020107152670575826, + "loss": 0.0362, + "step": 1790 + }, + { + "epoch": 1.17, + "grad_norm": 0.09712370485067368, + "learning_rate": 0.00020097458099531778, + "loss": 0.0055, + "step": 1791 + }, + { + "epoch": 1.17, + "grad_norm": 0.5568703413009644, + "learning_rate": 0.00020087761120629296, + "loss": 0.0747, + "step": 1792 + }, + { + "epoch": 1.17, + "grad_norm": 0.073786161839962, + "learning_rate": 0.00020078061738448881, + "loss": 0.0113, + "step": 1793 + }, + { + "epoch": 1.17, + "grad_norm": 0.011098073795437813, + "learning_rate": 0.0002006835995757218, + "loss": 0.001, + "step": 1794 + }, + { + "epoch": 1.18, + "grad_norm": 0.009852485731244087, + "learning_rate": 0.0002005865578258198, + "loss": 0.0013, + "step": 1795 + }, + { + "epoch": 1.18, + "grad_norm": 0.2939152717590332, + "learning_rate": 0.00020048949218062174, + "loss": 0.0346, + "step": 1796 + }, + { + "epoch": 1.18, + "grad_norm": 0.2433510720729828, + "learning_rate": 0.0002003924026859781, + "loss": 0.02, + "step": 1797 + }, + { + "epoch": 1.18, + "grad_norm": 0.02912173792719841, + "learning_rate": 0.00020029528938775046, + "loss": 0.0024, + "step": 1798 + }, + { + "epoch": 1.18, + "grad_norm": 0.13098259270191193, + "learning_rate": 0.0002001981523318117, + "loss": 0.0236, + "step": 1799 + }, + { + "epoch": 1.18, + "grad_norm": 0.15209761261940002, + "learning_rate": 0.00020010099156404594, + "loss": 0.0305, + "step": 1800 + }, + { + "epoch": 1.18, + "grad_norm": 0.23245444893836975, + "learning_rate": 0.00020000380713034848, + "loss": 0.0488, + "step": 1801 + }, + { + "epoch": 1.18, + "grad_norm": 0.048978038132190704, + "learning_rate": 0.00019990659907662578, + "loss": 0.0072, + "step": 1802 + }, + { + "epoch": 1.18, + "grad_norm": 0.01486815232783556, + "learning_rate": 0.00019980936744879552, + "loss": 0.0021, + "step": 1803 + }, + { + "epoch": 1.18, + "grad_norm": 0.07630421221256256, + "learning_rate": 0.0001997121122927864, + "loss": 0.0129, + "step": 1804 + }, + { + "epoch": 1.18, + "grad_norm": 0.1630118042230606, + "learning_rate": 0.00019961483365453842, + "loss": 0.0247, + "step": 1805 + }, + { + "epoch": 1.18, + "grad_norm": 0.20434342324733734, + "learning_rate": 0.00019951753158000242, + "loss": 0.0275, + "step": 1806 + }, + { + "epoch": 1.18, + "grad_norm": 0.17411430180072784, + "learning_rate": 0.00019942020611514056, + "loss": 0.0485, + "step": 1807 + }, + { + "epoch": 1.18, + "grad_norm": 0.2546854317188263, + "learning_rate": 0.00019932285730592583, + "loss": 0.0231, + "step": 1808 + }, + { + "epoch": 1.18, + "grad_norm": 0.10782810300588608, + "learning_rate": 0.0001992254851983425, + "loss": 0.0299, + "step": 1809 + }, + { + "epoch": 1.18, + "grad_norm": 0.12854281067848206, + "learning_rate": 0.0001991280898383856, + "loss": 0.0167, + "step": 1810 + }, + { + "epoch": 1.19, + "grad_norm": 0.017508767545223236, + "learning_rate": 0.00019903067127206124, + "loss": 0.0025, + "step": 1811 + }, + { + "epoch": 1.19, + "grad_norm": 0.07165428251028061, + "learning_rate": 0.00019893322954538657, + "loss": 0.0113, + "step": 1812 + }, + { + "epoch": 1.19, + "grad_norm": 0.10676853358745575, + "learning_rate": 0.0001988357647043895, + "loss": 0.0035, + "step": 1813 + }, + { + "epoch": 1.19, + "grad_norm": 0.2053932100534439, + "learning_rate": 0.00019873827679510908, + "loss": 0.0215, + "step": 1814 + }, + { + "epoch": 1.19, + "grad_norm": 0.03081035614013672, + "learning_rate": 0.00019864076586359513, + "loss": 0.0047, + "step": 1815 + }, + { + "epoch": 1.19, + "grad_norm": 0.0858052521944046, + "learning_rate": 0.00019854323195590823, + "loss": 0.0113, + "step": 1816 + }, + { + "epoch": 1.19, + "grad_norm": 0.16202549636363983, + "learning_rate": 0.00019844567511812002, + "loss": 0.0088, + "step": 1817 + }, + { + "epoch": 1.19, + "grad_norm": 0.09111412614583969, + "learning_rate": 0.0001983480953963129, + "loss": 0.0078, + "step": 1818 + }, + { + "epoch": 1.19, + "grad_norm": 0.05848320201039314, + "learning_rate": 0.0001982504928365801, + "loss": 0.0035, + "step": 1819 + }, + { + "epoch": 1.19, + "grad_norm": 0.1509551852941513, + "learning_rate": 0.00019815286748502554, + "loss": 0.0109, + "step": 1820 + }, + { + "epoch": 1.19, + "grad_norm": 0.06057953089475632, + "learning_rate": 0.00019805521938776402, + "loss": 0.0043, + "step": 1821 + }, + { + "epoch": 1.19, + "grad_norm": 0.11573519557714462, + "learning_rate": 0.00019795754859092097, + "loss": 0.0343, + "step": 1822 + }, + { + "epoch": 1.19, + "grad_norm": 0.0522802509367466, + "learning_rate": 0.0001978598551406327, + "loss": 0.0054, + "step": 1823 + }, + { + "epoch": 1.19, + "grad_norm": 0.08309350162744522, + "learning_rate": 0.00019776213908304611, + "loss": 0.0041, + "step": 1824 + }, + { + "epoch": 1.19, + "grad_norm": 0.506611704826355, + "learning_rate": 0.00019766440046431875, + "loss": 0.062, + "step": 1825 + }, + { + "epoch": 1.2, + "grad_norm": 0.05043810233473778, + "learning_rate": 0.00019756663933061892, + "loss": 0.0036, + "step": 1826 + }, + { + "epoch": 1.2, + "grad_norm": 0.008669359609484673, + "learning_rate": 0.0001974688557281255, + "loss": 0.0008, + "step": 1827 + }, + { + "epoch": 1.2, + "grad_norm": 0.24424995481967926, + "learning_rate": 0.00019737104970302802, + "loss": 0.0312, + "step": 1828 + }, + { + "epoch": 1.2, + "grad_norm": 0.08624009788036346, + "learning_rate": 0.00019727322130152656, + "loss": 0.0471, + "step": 1829 + }, + { + "epoch": 1.2, + "grad_norm": 0.0553121417760849, + "learning_rate": 0.00019717537056983177, + "loss": 0.0047, + "step": 1830 + }, + { + "epoch": 1.2, + "grad_norm": 0.20766226947307587, + "learning_rate": 0.00019707749755416487, + "loss": 0.0187, + "step": 1831 + }, + { + "epoch": 1.2, + "grad_norm": 0.03902578726410866, + "learning_rate": 0.00019697960230075768, + "loss": 0.0022, + "step": 1832 + }, + { + "epoch": 1.2, + "grad_norm": 0.05113459378480911, + "learning_rate": 0.00019688168485585233, + "loss": 0.0059, + "step": 1833 + }, + { + "epoch": 1.2, + "grad_norm": 0.07002965360879898, + "learning_rate": 0.00019678374526570157, + "loss": 0.0054, + "step": 1834 + }, + { + "epoch": 1.2, + "grad_norm": 0.11523545533418655, + "learning_rate": 0.00019668578357656864, + "loss": 0.006, + "step": 1835 + }, + { + "epoch": 1.2, + "grad_norm": 0.10022434592247009, + "learning_rate": 0.00019658779983472714, + "loss": 0.0086, + "step": 1836 + }, + { + "epoch": 1.2, + "grad_norm": 0.3261854350566864, + "learning_rate": 0.00019648979408646113, + "loss": 0.0639, + "step": 1837 + }, + { + "epoch": 1.2, + "grad_norm": 0.0735621452331543, + "learning_rate": 0.000196391766378065, + "loss": 0.004, + "step": 1838 + }, + { + "epoch": 1.2, + "grad_norm": 0.02739633060991764, + "learning_rate": 0.00019629371675584367, + "loss": 0.0022, + "step": 1839 + }, + { + "epoch": 1.2, + "grad_norm": 0.15281106531620026, + "learning_rate": 0.0001961956452661122, + "loss": 0.0086, + "step": 1840 + }, + { + "epoch": 1.21, + "grad_norm": 0.04371574893593788, + "learning_rate": 0.00019609755195519615, + "loss": 0.0034, + "step": 1841 + }, + { + "epoch": 1.21, + "grad_norm": 0.10713805258274078, + "learning_rate": 0.00019599943686943126, + "loss": 0.0065, + "step": 1842 + }, + { + "epoch": 1.21, + "grad_norm": 0.059120483696460724, + "learning_rate": 0.00019590130005516364, + "loss": 0.006, + "step": 1843 + }, + { + "epoch": 1.21, + "grad_norm": 0.1300182342529297, + "learning_rate": 0.00019580314155874968, + "loss": 0.0476, + "step": 1844 + }, + { + "epoch": 1.21, + "grad_norm": 0.22915533185005188, + "learning_rate": 0.00019570496142655598, + "loss": 0.0257, + "step": 1845 + }, + { + "epoch": 1.21, + "grad_norm": 0.4989432692527771, + "learning_rate": 0.00019560675970495926, + "loss": 0.0554, + "step": 1846 + }, + { + "epoch": 1.21, + "grad_norm": 0.051195550709962845, + "learning_rate": 0.0001955085364403466, + "loss": 0.0044, + "step": 1847 + }, + { + "epoch": 1.21, + "grad_norm": 0.36124640703201294, + "learning_rate": 0.00019541029167911513, + "loss": 0.0726, + "step": 1848 + }, + { + "epoch": 1.21, + "grad_norm": 0.12776778638362885, + "learning_rate": 0.0001953120254676723, + "loss": 0.012, + "step": 1849 + }, + { + "epoch": 1.21, + "grad_norm": 0.329092800617218, + "learning_rate": 0.0001952137378524355, + "loss": 0.0174, + "step": 1850 + }, + { + "epoch": 1.21, + "grad_norm": 0.21430779993534088, + "learning_rate": 0.00019511542887983233, + "loss": 0.061, + "step": 1851 + }, + { + "epoch": 1.21, + "grad_norm": 0.031340569257736206, + "learning_rate": 0.00019501709859630047, + "loss": 0.0051, + "step": 1852 + }, + { + "epoch": 1.21, + "grad_norm": 0.23494398593902588, + "learning_rate": 0.00019491874704828766, + "loss": 0.0568, + "step": 1853 + }, + { + "epoch": 1.21, + "grad_norm": 0.19756872951984406, + "learning_rate": 0.00019482037428225166, + "loss": 0.027, + "step": 1854 + }, + { + "epoch": 1.21, + "grad_norm": 0.1831345558166504, + "learning_rate": 0.00019472198034466032, + "loss": 0.0213, + "step": 1855 + }, + { + "epoch": 1.22, + "grad_norm": 0.0714825913310051, + "learning_rate": 0.00019462356528199138, + "loss": 0.0264, + "step": 1856 + }, + { + "epoch": 1.22, + "grad_norm": 0.09632866829633713, + "learning_rate": 0.0001945251291407327, + "loss": 0.0123, + "step": 1857 + }, + { + "epoch": 1.22, + "grad_norm": 0.28308427333831787, + "learning_rate": 0.00019442667196738192, + "loss": 0.0718, + "step": 1858 + }, + { + "epoch": 1.22, + "grad_norm": 0.1654062271118164, + "learning_rate": 0.00019432819380844687, + "loss": 0.046, + "step": 1859 + }, + { + "epoch": 1.22, + "grad_norm": 0.044118259102106094, + "learning_rate": 0.00019422969471044501, + "loss": 0.0055, + "step": 1860 + }, + { + "epoch": 1.22, + "grad_norm": 0.06968650221824646, + "learning_rate": 0.00019413117471990386, + "loss": 0.0137, + "step": 1861 + }, + { + "epoch": 1.22, + "grad_norm": 0.03375468775629997, + "learning_rate": 0.0001940326338833608, + "loss": 0.0066, + "step": 1862 + }, + { + "epoch": 1.22, + "grad_norm": 0.15991543233394623, + "learning_rate": 0.00019393407224736306, + "loss": 0.0316, + "step": 1863 + }, + { + "epoch": 1.22, + "grad_norm": 0.09309843927621841, + "learning_rate": 0.00019383548985846754, + "loss": 0.0178, + "step": 1864 + }, + { + "epoch": 1.22, + "grad_norm": 0.14062856137752533, + "learning_rate": 0.00019373688676324114, + "loss": 0.0315, + "step": 1865 + }, + { + "epoch": 1.22, + "grad_norm": 0.09427135437726974, + "learning_rate": 0.00019363826300826043, + "loss": 0.0516, + "step": 1866 + }, + { + "epoch": 1.22, + "grad_norm": 0.12931552529335022, + "learning_rate": 0.00019353961864011183, + "loss": 0.0208, + "step": 1867 + }, + { + "epoch": 1.22, + "grad_norm": 0.06832294166088104, + "learning_rate": 0.0001934409537053914, + "loss": 0.0249, + "step": 1868 + }, + { + "epoch": 1.22, + "grad_norm": 0.1043616384267807, + "learning_rate": 0.00019334226825070493, + "loss": 0.0132, + "step": 1869 + }, + { + "epoch": 1.22, + "grad_norm": 0.04455610364675522, + "learning_rate": 0.000193243562322668, + "loss": 0.0068, + "step": 1870 + }, + { + "epoch": 1.22, + "grad_norm": 0.09259182214736938, + "learning_rate": 0.00019314483596790576, + "loss": 0.0084, + "step": 1871 + }, + { + "epoch": 1.23, + "grad_norm": 0.15175136923789978, + "learning_rate": 0.00019304608923305302, + "loss": 0.0299, + "step": 1872 + }, + { + "epoch": 1.23, + "grad_norm": 0.3181014657020569, + "learning_rate": 0.00019294732216475427, + "loss": 0.05, + "step": 1873 + }, + { + "epoch": 1.23, + "grad_norm": 0.029958350583910942, + "learning_rate": 0.00019284853480966354, + "loss": 0.0048, + "step": 1874 + }, + { + "epoch": 1.23, + "grad_norm": 0.1685924082994461, + "learning_rate": 0.00019274972721444446, + "loss": 0.0271, + "step": 1875 + }, + { + "epoch": 1.23, + "grad_norm": 0.08830788731575012, + "learning_rate": 0.00019265089942577027, + "loss": 0.0099, + "step": 1876 + }, + { + "epoch": 1.23, + "grad_norm": 0.20782901346683502, + "learning_rate": 0.00019255205149032375, + "loss": 0.0558, + "step": 1877 + }, + { + "epoch": 1.23, + "grad_norm": 0.12139201164245605, + "learning_rate": 0.00019245318345479707, + "loss": 0.0161, + "step": 1878 + }, + { + "epoch": 1.23, + "grad_norm": 0.14500971138477325, + "learning_rate": 0.00019235429536589203, + "loss": 0.0129, + "step": 1879 + }, + { + "epoch": 1.23, + "grad_norm": 0.18900787830352783, + "learning_rate": 0.0001922553872703198, + "loss": 0.0161, + "step": 1880 + }, + { + "epoch": 1.23, + "grad_norm": 0.04411626234650612, + "learning_rate": 0.0001921564592148012, + "loss": 0.0038, + "step": 1881 + }, + { + "epoch": 1.23, + "grad_norm": 0.08428865671157837, + "learning_rate": 0.0001920575112460662, + "loss": 0.0048, + "step": 1882 + }, + { + "epoch": 1.23, + "grad_norm": 0.12923188507556915, + "learning_rate": 0.0001919585434108543, + "loss": 0.0111, + "step": 1883 + }, + { + "epoch": 1.23, + "grad_norm": 0.02636256441473961, + "learning_rate": 0.00019185955575591452, + "loss": 0.0032, + "step": 1884 + }, + { + "epoch": 1.23, + "grad_norm": 0.297539621591568, + "learning_rate": 0.00019176054832800498, + "loss": 0.0659, + "step": 1885 + }, + { + "epoch": 1.23, + "grad_norm": 0.1499989926815033, + "learning_rate": 0.00019166152117389344, + "loss": 0.0129, + "step": 1886 + }, + { + "epoch": 1.24, + "grad_norm": 0.4077751040458679, + "learning_rate": 0.00019156247434035665, + "loss": 0.0447, + "step": 1887 + }, + { + "epoch": 1.24, + "grad_norm": 0.04810630530118942, + "learning_rate": 0.0001914634078741809, + "loss": 0.004, + "step": 1888 + }, + { + "epoch": 1.24, + "grad_norm": 0.07159969210624695, + "learning_rate": 0.00019136432182216166, + "loss": 0.0044, + "step": 1889 + }, + { + "epoch": 1.24, + "grad_norm": 0.09493908286094666, + "learning_rate": 0.00019126521623110375, + "loss": 0.0071, + "step": 1890 + }, + { + "epoch": 1.24, + "grad_norm": 0.04515552520751953, + "learning_rate": 0.00019116609114782097, + "loss": 0.0039, + "step": 1891 + }, + { + "epoch": 1.24, + "grad_norm": 0.01067087147384882, + "learning_rate": 0.00019106694661913664, + "loss": 0.0017, + "step": 1892 + }, + { + "epoch": 1.24, + "grad_norm": 0.15949301421642303, + "learning_rate": 0.00019096778269188302, + "loss": 0.0077, + "step": 1893 + }, + { + "epoch": 1.24, + "grad_norm": 0.01298619620501995, + "learning_rate": 0.00019086859941290174, + "loss": 0.0019, + "step": 1894 + }, + { + "epoch": 1.24, + "grad_norm": 0.009331258945167065, + "learning_rate": 0.00019076939682904337, + "loss": 0.001, + "step": 1895 + }, + { + "epoch": 1.24, + "grad_norm": 0.23378711938858032, + "learning_rate": 0.00019067017498716773, + "loss": 0.029, + "step": 1896 + }, + { + "epoch": 1.24, + "grad_norm": 0.2589947581291199, + "learning_rate": 0.00019057093393414366, + "loss": 0.0386, + "step": 1897 + }, + { + "epoch": 1.24, + "grad_norm": 0.4780135750770569, + "learning_rate": 0.00019047167371684918, + "loss": 0.0401, + "step": 1898 + }, + { + "epoch": 1.24, + "grad_norm": 0.31437739729881287, + "learning_rate": 0.00019037239438217127, + "loss": 0.0684, + "step": 1899 + }, + { + "epoch": 1.24, + "grad_norm": 0.03697904944419861, + "learning_rate": 0.00019027309597700594, + "loss": 0.0036, + "step": 1900 + }, + { + "epoch": 1.24, + "grad_norm": 0.1111644059419632, + "learning_rate": 0.00019017377854825828, + "loss": 0.006, + "step": 1901 + }, + { + "epoch": 1.25, + "grad_norm": 0.1257040649652481, + "learning_rate": 0.00019007444214284226, + "loss": 0.0168, + "step": 1902 + }, + { + "epoch": 1.25, + "grad_norm": 0.2534196376800537, + "learning_rate": 0.00018997508680768097, + "loss": 0.0263, + "step": 1903 + }, + { + "epoch": 1.25, + "grad_norm": 0.053403884172439575, + "learning_rate": 0.00018987571258970626, + "loss": 0.003, + "step": 1904 + }, + { + "epoch": 1.25, + "grad_norm": 0.22898997366428375, + "learning_rate": 0.00018977631953585902, + "loss": 0.0212, + "step": 1905 + }, + { + "epoch": 1.25, + "grad_norm": 0.13535194098949432, + "learning_rate": 0.00018967690769308894, + "loss": 0.0087, + "step": 1906 + }, + { + "epoch": 1.25, + "grad_norm": 0.16851268708705902, + "learning_rate": 0.00018957747710835482, + "loss": 0.0388, + "step": 1907 + }, + { + "epoch": 1.25, + "grad_norm": 0.21347947418689728, + "learning_rate": 0.00018947802782862396, + "loss": 0.0332, + "step": 1908 + }, + { + "epoch": 1.25, + "grad_norm": 0.0564715713262558, + "learning_rate": 0.00018937855990087276, + "loss": 0.0036, + "step": 1909 + }, + { + "epoch": 1.25, + "grad_norm": 0.06032940000295639, + "learning_rate": 0.0001892790733720863, + "loss": 0.0067, + "step": 1910 + }, + { + "epoch": 1.25, + "eval_loss": 0.032177336513996124, + "eval_runtime": 39.9387, + "eval_samples_per_second": 32.224, + "eval_steps_per_second": 8.062, + "step": 1910 + }, + { + "epoch": 1.25, + "grad_norm": 0.11383625119924545, + "learning_rate": 0.00018917956828925857, + "loss": 0.0152, + "step": 1911 + }, + { + "epoch": 1.25, + "grad_norm": 0.15107430517673492, + "learning_rate": 0.00018908004469939216, + "loss": 0.0511, + "step": 1912 + }, + { + "epoch": 1.25, + "grad_norm": 0.403848260641098, + "learning_rate": 0.00018898050264949852, + "loss": 0.0206, + "step": 1913 + }, + { + "epoch": 1.25, + "grad_norm": 0.09853165596723557, + "learning_rate": 0.00018888094218659778, + "loss": 0.0186, + "step": 1914 + }, + { + "epoch": 1.25, + "grad_norm": 0.1771824061870575, + "learning_rate": 0.00018878136335771876, + "loss": 0.0221, + "step": 1915 + }, + { + "epoch": 1.25, + "grad_norm": 0.0342799611389637, + "learning_rate": 0.000188681766209899, + "loss": 0.004, + "step": 1916 + }, + { + "epoch": 1.25, + "grad_norm": 0.14599063992500305, + "learning_rate": 0.0001885821507901846, + "loss": 0.0616, + "step": 1917 + }, + { + "epoch": 1.26, + "grad_norm": 0.09153589606285095, + "learning_rate": 0.0001884825171456304, + "loss": 0.0093, + "step": 1918 + }, + { + "epoch": 1.26, + "grad_norm": 0.040928881615400314, + "learning_rate": 0.0001883828653232998, + "loss": 0.0055, + "step": 1919 + }, + { + "epoch": 1.26, + "grad_norm": 0.0882481262087822, + "learning_rate": 0.00018828319537026475, + "loss": 0.0115, + "step": 1920 + }, + { + "epoch": 1.26, + "grad_norm": 0.22515325248241425, + "learning_rate": 0.00018818350733360584, + "loss": 0.0484, + "step": 1921 + }, + { + "epoch": 1.26, + "grad_norm": 0.2775671184062958, + "learning_rate": 0.00018808380126041215, + "loss": 0.0412, + "step": 1922 + }, + { + "epoch": 1.26, + "grad_norm": 0.20490968227386475, + "learning_rate": 0.00018798407719778127, + "loss": 0.0146, + "step": 1923 + }, + { + "epoch": 1.26, + "grad_norm": 0.017601322382688522, + "learning_rate": 0.00018788433519281933, + "loss": 0.0032, + "step": 1924 + }, + { + "epoch": 1.26, + "grad_norm": 0.061190057545900345, + "learning_rate": 0.00018778457529264098, + "loss": 0.0062, + "step": 1925 + }, + { + "epoch": 1.26, + "grad_norm": 0.011243225075304508, + "learning_rate": 0.00018768479754436917, + "loss": 0.0019, + "step": 1926 + }, + { + "epoch": 1.26, + "grad_norm": 0.21566064655780792, + "learning_rate": 0.0001875850019951354, + "loss": 0.0197, + "step": 1927 + }, + { + "epoch": 1.26, + "grad_norm": 0.14111027121543884, + "learning_rate": 0.00018748518869207952, + "loss": 0.0164, + "step": 1928 + }, + { + "epoch": 1.26, + "grad_norm": 0.17709197103977203, + "learning_rate": 0.00018738535768234984, + "loss": 0.0173, + "step": 1929 + }, + { + "epoch": 1.26, + "grad_norm": 0.11170878261327744, + "learning_rate": 0.00018728550901310297, + "loss": 0.0103, + "step": 1930 + }, + { + "epoch": 1.26, + "grad_norm": 0.10775300115346909, + "learning_rate": 0.00018718564273150387, + "loss": 0.0061, + "step": 1931 + }, + { + "epoch": 1.26, + "grad_norm": 0.08073946833610535, + "learning_rate": 0.00018708575888472587, + "loss": 0.0076, + "step": 1932 + }, + { + "epoch": 1.27, + "grad_norm": 0.2539899945259094, + "learning_rate": 0.0001869858575199505, + "loss": 0.0326, + "step": 1933 + }, + { + "epoch": 1.27, + "grad_norm": 0.046343106776475906, + "learning_rate": 0.0001868859386843677, + "loss": 0.0057, + "step": 1934 + }, + { + "epoch": 1.27, + "grad_norm": 0.09674936532974243, + "learning_rate": 0.00018678600242517547, + "loss": 0.0081, + "step": 1935 + }, + { + "epoch": 1.27, + "grad_norm": 0.006561139598488808, + "learning_rate": 0.00018668604878958027, + "loss": 0.0011, + "step": 1936 + }, + { + "epoch": 1.27, + "grad_norm": 0.003308130893856287, + "learning_rate": 0.00018658607782479653, + "loss": 0.0006, + "step": 1937 + }, + { + "epoch": 1.27, + "grad_norm": 0.1847364902496338, + "learning_rate": 0.0001864860895780471, + "loss": 0.0128, + "step": 1938 + }, + { + "epoch": 1.27, + "grad_norm": 0.4606344699859619, + "learning_rate": 0.00018638608409656288, + "loss": 0.1213, + "step": 1939 + }, + { + "epoch": 1.27, + "grad_norm": 0.005895258858799934, + "learning_rate": 0.00018628606142758285, + "loss": 0.0009, + "step": 1940 + }, + { + "epoch": 1.27, + "grad_norm": 0.37326616048812866, + "learning_rate": 0.0001861860216183542, + "loss": 0.0808, + "step": 1941 + }, + { + "epoch": 1.27, + "grad_norm": 0.04593970999121666, + "learning_rate": 0.00018608596471613215, + "loss": 0.0024, + "step": 1942 + }, + { + "epoch": 1.27, + "grad_norm": 0.16561304032802582, + "learning_rate": 0.00018598589076818014, + "loss": 0.0396, + "step": 1943 + }, + { + "epoch": 1.27, + "grad_norm": 0.2503207325935364, + "learning_rate": 0.00018588579982176944, + "loss": 0.0538, + "step": 1944 + }, + { + "epoch": 1.27, + "grad_norm": 0.09036950767040253, + "learning_rate": 0.0001857856919241795, + "loss": 0.0102, + "step": 1945 + }, + { + "epoch": 1.27, + "grad_norm": 0.1274523138999939, + "learning_rate": 0.00018568556712269776, + "loss": 0.0291, + "step": 1946 + }, + { + "epoch": 1.27, + "grad_norm": 0.028810936957597733, + "learning_rate": 0.00018558542546461964, + "loss": 0.0053, + "step": 1947 + }, + { + "epoch": 1.28, + "grad_norm": 0.15604527294635773, + "learning_rate": 0.0001854852669972484, + "loss": 0.0171, + "step": 1948 + }, + { + "epoch": 1.28, + "grad_norm": 0.06874702125787735, + "learning_rate": 0.00018538509176789546, + "loss": 0.0089, + "step": 1949 + }, + { + "epoch": 1.28, + "grad_norm": 0.09649529308080673, + "learning_rate": 0.00018528489982388006, + "loss": 0.0164, + "step": 1950 + }, + { + "epoch": 1.28, + "grad_norm": 0.041638266295194626, + "learning_rate": 0.0001851846912125292, + "loss": 0.008, + "step": 1951 + }, + { + "epoch": 1.28, + "grad_norm": 0.08413052558898926, + "learning_rate": 0.00018508446598117806, + "loss": 0.0178, + "step": 1952 + }, + { + "epoch": 1.28, + "grad_norm": 0.09511756896972656, + "learning_rate": 0.00018498422417716928, + "loss": 0.0171, + "step": 1953 + }, + { + "epoch": 1.28, + "grad_norm": 0.13475637137889862, + "learning_rate": 0.00018488396584785365, + "loss": 0.023, + "step": 1954 + }, + { + "epoch": 1.28, + "grad_norm": 0.04502255469560623, + "learning_rate": 0.00018478369104058963, + "loss": 0.0093, + "step": 1955 + }, + { + "epoch": 1.28, + "grad_norm": 0.030024804174900055, + "learning_rate": 0.00018468339980274353, + "loss": 0.0042, + "step": 1956 + }, + { + "epoch": 1.28, + "grad_norm": 0.10409369319677353, + "learning_rate": 0.00018458309218168925, + "loss": 0.0224, + "step": 1957 + }, + { + "epoch": 1.28, + "grad_norm": 0.2133146971464157, + "learning_rate": 0.00018448276822480866, + "loss": 0.0747, + "step": 1958 + }, + { + "epoch": 1.28, + "grad_norm": 0.17796148359775543, + "learning_rate": 0.0001843824279794912, + "loss": 0.0135, + "step": 1959 + }, + { + "epoch": 1.28, + "grad_norm": 0.07230894267559052, + "learning_rate": 0.00018428207149313403, + "loss": 0.0096, + "step": 1960 + }, + { + "epoch": 1.28, + "grad_norm": 0.15250401198863983, + "learning_rate": 0.00018418169881314207, + "loss": 0.016, + "step": 1961 + }, + { + "epoch": 1.28, + "grad_norm": 0.0743720754981041, + "learning_rate": 0.00018408130998692773, + "loss": 0.0062, + "step": 1962 + }, + { + "epoch": 1.29, + "grad_norm": 0.14948879182338715, + "learning_rate": 0.00018398090506191114, + "loss": 0.0237, + "step": 1963 + }, + { + "epoch": 1.29, + "grad_norm": 0.12644903361797333, + "learning_rate": 0.00018388048408552008, + "loss": 0.0223, + "step": 1964 + }, + { + "epoch": 1.29, + "grad_norm": 0.05275022238492966, + "learning_rate": 0.00018378004710518984, + "loss": 0.0051, + "step": 1965 + }, + { + "epoch": 1.29, + "grad_norm": 0.18393655121326447, + "learning_rate": 0.00018367959416836332, + "loss": 0.0119, + "step": 1966 + }, + { + "epoch": 1.29, + "grad_norm": 0.2449781447649002, + "learning_rate": 0.00018357912532249076, + "loss": 0.0262, + "step": 1967 + }, + { + "epoch": 1.29, + "grad_norm": 0.05197291448712349, + "learning_rate": 0.00018347864061503028, + "loss": 0.0072, + "step": 1968 + }, + { + "epoch": 1.29, + "grad_norm": 0.07551299780607224, + "learning_rate": 0.00018337814009344714, + "loss": 0.0081, + "step": 1969 + }, + { + "epoch": 1.29, + "grad_norm": 0.05459301918745041, + "learning_rate": 0.00018327762380521438, + "loss": 0.0047, + "step": 1970 + }, + { + "epoch": 1.29, + "grad_norm": 0.0383986234664917, + "learning_rate": 0.0001831770917978122, + "loss": 0.0034, + "step": 1971 + }, + { + "epoch": 1.29, + "grad_norm": 0.0700681209564209, + "learning_rate": 0.00018307654411872838, + "loss": 0.0068, + "step": 1972 + }, + { + "epoch": 1.29, + "grad_norm": 0.28609392046928406, + "learning_rate": 0.0001829759808154581, + "loss": 0.0608, + "step": 1973 + }, + { + "epoch": 1.29, + "grad_norm": 0.21615540981292725, + "learning_rate": 0.0001828754019355039, + "loss": 0.0088, + "step": 1974 + }, + { + "epoch": 1.29, + "grad_norm": 0.15507575869560242, + "learning_rate": 0.0001827748075263757, + "loss": 0.0081, + "step": 1975 + }, + { + "epoch": 1.29, + "grad_norm": 0.2063005119562149, + "learning_rate": 0.0001826741976355907, + "loss": 0.0072, + "step": 1976 + }, + { + "epoch": 1.29, + "grad_norm": 0.23691704869270325, + "learning_rate": 0.0001825735723106734, + "loss": 0.037, + "step": 1977 + }, + { + "epoch": 1.29, + "grad_norm": 0.05298379436135292, + "learning_rate": 0.0001824729315991557, + "loss": 0.0041, + "step": 1978 + }, + { + "epoch": 1.3, + "grad_norm": 0.22102193534374237, + "learning_rate": 0.00018237227554857672, + "loss": 0.0117, + "step": 1979 + }, + { + "epoch": 1.3, + "grad_norm": 0.1501353681087494, + "learning_rate": 0.00018227160420648274, + "loss": 0.0397, + "step": 1980 + }, + { + "epoch": 1.3, + "grad_norm": 0.402899831533432, + "learning_rate": 0.00018217091762042737, + "loss": 0.0821, + "step": 1981 + }, + { + "epoch": 1.3, + "grad_norm": 0.20166534185409546, + "learning_rate": 0.0001820702158379714, + "loss": 0.0285, + "step": 1982 + }, + { + "epoch": 1.3, + "grad_norm": 0.13613693416118622, + "learning_rate": 0.00018196949890668276, + "loss": 0.006, + "step": 1983 + }, + { + "epoch": 1.3, + "grad_norm": 0.19995389878749847, + "learning_rate": 0.00018186876687413655, + "loss": 0.0152, + "step": 1984 + }, + { + "epoch": 1.3, + "grad_norm": 0.2411389946937561, + "learning_rate": 0.00018176801978791497, + "loss": 0.0546, + "step": 1985 + }, + { + "epoch": 1.3, + "grad_norm": 0.022495824843645096, + "learning_rate": 0.00018166725769560747, + "loss": 0.0022, + "step": 1986 + }, + { + "epoch": 1.3, + "grad_norm": 0.07184285670518875, + "learning_rate": 0.00018156648064481044, + "loss": 0.0058, + "step": 1987 + }, + { + "epoch": 1.3, + "grad_norm": 0.13286525011062622, + "learning_rate": 0.00018146568868312733, + "loss": 0.0254, + "step": 1988 + }, + { + "epoch": 1.3, + "grad_norm": 0.2439001351594925, + "learning_rate": 0.00018136488185816878, + "loss": 0.0457, + "step": 1989 + }, + { + "epoch": 1.3, + "grad_norm": 0.1679784655570984, + "learning_rate": 0.00018126406021755232, + "loss": 0.0274, + "step": 1990 + }, + { + "epoch": 1.3, + "grad_norm": 0.1093001514673233, + "learning_rate": 0.00018116322380890248, + "loss": 0.0107, + "step": 1991 + }, + { + "epoch": 1.3, + "grad_norm": 0.08656150102615356, + "learning_rate": 0.0001810623726798509, + "loss": 0.0075, + "step": 1992 + }, + { + "epoch": 1.3, + "grad_norm": 0.06291065365076065, + "learning_rate": 0.00018096150687803598, + "loss": 0.0058, + "step": 1993 + }, + { + "epoch": 1.31, + "grad_norm": 0.06546434015035629, + "learning_rate": 0.00018086062645110318, + "loss": 0.0082, + "step": 1994 + }, + { + "epoch": 1.31, + "grad_norm": 0.09988091886043549, + "learning_rate": 0.00018075973144670486, + "loss": 0.0239, + "step": 1995 + }, + { + "epoch": 1.31, + "grad_norm": 0.30109840631484985, + "learning_rate": 0.0001806588219125002, + "loss": 0.0297, + "step": 1996 + }, + { + "epoch": 1.31, + "grad_norm": 0.05507725849747658, + "learning_rate": 0.00018055789789615532, + "loss": 0.004, + "step": 1997 + }, + { + "epoch": 1.31, + "grad_norm": 0.09339085221290588, + "learning_rate": 0.00018045695944534314, + "loss": 0.0109, + "step": 1998 + }, + { + "epoch": 1.31, + "grad_norm": 0.059944991022348404, + "learning_rate": 0.00018035600660774336, + "loss": 0.0085, + "step": 1999 + }, + { + "epoch": 1.31, + "grad_norm": 0.09726294130086899, + "learning_rate": 0.00018025503943104262, + "loss": 0.0093, + "step": 2000 + }, + { + "epoch": 1.31, + "grad_norm": 0.07707206904888153, + "learning_rate": 0.00018015405796293417, + "loss": 0.0067, + "step": 2001 + }, + { + "epoch": 1.31, + "grad_norm": 0.027958236634731293, + "learning_rate": 0.00018005306225111803, + "loss": 0.0035, + "step": 2002 + }, + { + "epoch": 1.31, + "grad_norm": 0.2533860206604004, + "learning_rate": 0.00017995205234330107, + "loss": 0.0626, + "step": 2003 + }, + { + "epoch": 1.31, + "grad_norm": 0.041864681988954544, + "learning_rate": 0.00017985102828719675, + "loss": 0.0067, + "step": 2004 + }, + { + "epoch": 1.31, + "grad_norm": 0.08300785720348358, + "learning_rate": 0.00017974999013052527, + "loss": 0.0311, + "step": 2005 + }, + { + "epoch": 1.31, + "grad_norm": 0.054343078285455704, + "learning_rate": 0.00017964893792101345, + "loss": 0.0052, + "step": 2006 + }, + { + "epoch": 1.31, + "grad_norm": 0.007766898721456528, + "learning_rate": 0.00017954787170639476, + "loss": 0.0014, + "step": 2007 + }, + { + "epoch": 1.31, + "grad_norm": 0.18020589649677277, + "learning_rate": 0.00017944679153440935, + "loss": 0.0424, + "step": 2008 + }, + { + "epoch": 1.32, + "grad_norm": 0.07235695421695709, + "learning_rate": 0.00017934569745280392, + "loss": 0.009, + "step": 2009 + }, + { + "epoch": 1.32, + "grad_norm": 0.04374720901250839, + "learning_rate": 0.00017924458950933163, + "loss": 0.0036, + "step": 2010 + }, + { + "epoch": 1.32, + "grad_norm": 0.12976546585559845, + "learning_rate": 0.00017914346775175236, + "loss": 0.0083, + "step": 2011 + }, + { + "epoch": 1.32, + "grad_norm": 0.13522937893867493, + "learning_rate": 0.0001790423322278324, + "loss": 0.0142, + "step": 2012 + }, + { + "epoch": 1.32, + "grad_norm": 0.17489002645015717, + "learning_rate": 0.0001789411829853446, + "loss": 0.0192, + "step": 2013 + }, + { + "epoch": 1.32, + "grad_norm": 0.07798654586076736, + "learning_rate": 0.00017884002007206837, + "loss": 0.0071, + "step": 2014 + }, + { + "epoch": 1.32, + "grad_norm": 0.03286200389266014, + "learning_rate": 0.00017873884353578935, + "loss": 0.0018, + "step": 2015 + }, + { + "epoch": 1.32, + "grad_norm": 0.10676159709692001, + "learning_rate": 0.00017863765342429977, + "loss": 0.0046, + "step": 2016 + }, + { + "epoch": 1.32, + "grad_norm": 0.18042497336864471, + "learning_rate": 0.00017853644978539835, + "loss": 0.0199, + "step": 2017 + }, + { + "epoch": 1.32, + "grad_norm": 0.21726305782794952, + "learning_rate": 0.00017843523266688994, + "loss": 0.0173, + "step": 2018 + }, + { + "epoch": 1.32, + "grad_norm": 0.03823615238070488, + "learning_rate": 0.00017833400211658606, + "loss": 0.003, + "step": 2019 + }, + { + "epoch": 1.32, + "grad_norm": 0.0042721726931631565, + "learning_rate": 0.00017823275818230436, + "loss": 0.0005, + "step": 2020 + }, + { + "epoch": 1.32, + "grad_norm": 0.21143902838230133, + "learning_rate": 0.00017813150091186886, + "loss": 0.0095, + "step": 2021 + }, + { + "epoch": 1.32, + "grad_norm": 0.14743442833423615, + "learning_rate": 0.00017803023035311, + "loss": 0.0505, + "step": 2022 + }, + { + "epoch": 1.32, + "grad_norm": 0.02131885476410389, + "learning_rate": 0.0001779289465538643, + "loss": 0.0021, + "step": 2023 + }, + { + "epoch": 1.33, + "grad_norm": 0.15741504728794098, + "learning_rate": 0.00017782764956197474, + "loss": 0.0061, + "step": 2024 + }, + { + "epoch": 1.33, + "grad_norm": 0.016739048063755035, + "learning_rate": 0.00017772633942529032, + "loss": 0.002, + "step": 2025 + }, + { + "epoch": 1.33, + "grad_norm": 0.14672209322452545, + "learning_rate": 0.00017762501619166638, + "loss": 0.039, + "step": 2026 + }, + { + "epoch": 1.33, + "grad_norm": 0.1252099722623825, + "learning_rate": 0.00017752367990896446, + "loss": 0.0064, + "step": 2027 + }, + { + "epoch": 1.33, + "grad_norm": 0.322226881980896, + "learning_rate": 0.0001774223306250523, + "loss": 0.0363, + "step": 2028 + }, + { + "epoch": 1.33, + "grad_norm": 0.4759753346443176, + "learning_rate": 0.00017732096838780353, + "loss": 0.1034, + "step": 2029 + }, + { + "epoch": 1.33, + "grad_norm": 0.01636957749724388, + "learning_rate": 0.00017721959324509815, + "loss": 0.0023, + "step": 2030 + }, + { + "epoch": 1.33, + "grad_norm": 0.09087517112493515, + "learning_rate": 0.00017711820524482223, + "loss": 0.0034, + "step": 2031 + }, + { + "epoch": 1.33, + "grad_norm": 0.10265547037124634, + "learning_rate": 0.00017701680443486784, + "loss": 0.0195, + "step": 2032 + }, + { + "epoch": 1.33, + "grad_norm": 0.018688971176743507, + "learning_rate": 0.00017691539086313307, + "loss": 0.0025, + "step": 2033 + }, + { + "epoch": 1.33, + "grad_norm": 0.04474179074168205, + "learning_rate": 0.00017681396457752221, + "loss": 0.0046, + "step": 2034 + }, + { + "epoch": 1.33, + "grad_norm": 0.11852456629276276, + "learning_rate": 0.00017671252562594531, + "loss": 0.0229, + "step": 2035 + }, + { + "epoch": 1.33, + "grad_norm": 0.20545479655265808, + "learning_rate": 0.00017661107405631866, + "loss": 0.0247, + "step": 2036 + }, + { + "epoch": 1.33, + "grad_norm": 0.030622253194451332, + "learning_rate": 0.00017650960991656432, + "loss": 0.0021, + "step": 2037 + }, + { + "epoch": 1.33, + "grad_norm": 0.0707719549536705, + "learning_rate": 0.0001764081332546103, + "loss": 0.0067, + "step": 2038 + }, + { + "epoch": 1.33, + "grad_norm": 0.07734823226928711, + "learning_rate": 0.00017630664411839064, + "loss": 0.0174, + "step": 2039 + }, + { + "epoch": 1.34, + "grad_norm": 0.15264584124088287, + "learning_rate": 0.00017620514255584522, + "loss": 0.0143, + "step": 2040 + }, + { + "epoch": 1.34, + "grad_norm": 0.016753699630498886, + "learning_rate": 0.00017610362861491977, + "loss": 0.0024, + "step": 2041 + }, + { + "epoch": 1.34, + "grad_norm": 0.4822998642921448, + "learning_rate": 0.00017600210234356586, + "loss": 0.0165, + "step": 2042 + }, + { + "epoch": 1.34, + "grad_norm": 0.04116729274392128, + "learning_rate": 0.00017590056378974088, + "loss": 0.0053, + "step": 2043 + }, + { + "epoch": 1.34, + "grad_norm": 0.024770237505435944, + "learning_rate": 0.00017579901300140808, + "loss": 0.0039, + "step": 2044 + }, + { + "epoch": 1.34, + "grad_norm": 0.03564632683992386, + "learning_rate": 0.00017569745002653646, + "loss": 0.0042, + "step": 2045 + }, + { + "epoch": 1.34, + "grad_norm": 0.20899443328380585, + "learning_rate": 0.0001755958749131007, + "loss": 0.0275, + "step": 2046 + }, + { + "epoch": 1.34, + "grad_norm": 0.01576872169971466, + "learning_rate": 0.00017549428770908136, + "loss": 0.0015, + "step": 2047 + }, + { + "epoch": 1.34, + "grad_norm": 0.0382314994931221, + "learning_rate": 0.00017539268846246457, + "loss": 0.0041, + "step": 2048 + }, + { + "epoch": 1.34, + "grad_norm": 0.0657842680811882, + "learning_rate": 0.00017529107722124223, + "loss": 0.0074, + "step": 2049 + }, + { + "epoch": 1.34, + "grad_norm": 0.11808799207210541, + "learning_rate": 0.00017518945403341196, + "loss": 0.0127, + "step": 2050 + }, + { + "epoch": 1.34, + "grad_norm": 0.12941673398017883, + "learning_rate": 0.00017508781894697684, + "loss": 0.0157, + "step": 2051 + }, + { + "epoch": 1.34, + "grad_norm": 0.2390335202217102, + "learning_rate": 0.00017498617200994572, + "loss": 0.0137, + "step": 2052 + }, + { + "epoch": 1.34, + "grad_norm": 0.06445050239562988, + "learning_rate": 0.00017488451327033304, + "loss": 0.0028, + "step": 2053 + }, + { + "epoch": 1.34, + "grad_norm": 0.26266834139823914, + "learning_rate": 0.00017478284277615876, + "loss": 0.0093, + "step": 2054 + }, + { + "epoch": 1.35, + "grad_norm": 0.2604714035987854, + "learning_rate": 0.0001746811605754484, + "loss": 0.0519, + "step": 2055 + }, + { + "epoch": 1.35, + "grad_norm": 0.23091405630111694, + "learning_rate": 0.00017457946671623305, + "loss": 0.0376, + "step": 2056 + }, + { + "epoch": 1.35, + "grad_norm": 0.630933403968811, + "learning_rate": 0.00017447776124654925, + "loss": 0.0726, + "step": 2057 + }, + { + "epoch": 1.35, + "grad_norm": 0.15399132668972015, + "learning_rate": 0.00017437604421443914, + "loss": 0.0186, + "step": 2058 + }, + { + "epoch": 1.35, + "grad_norm": 0.3804969787597656, + "learning_rate": 0.00017427431566795012, + "loss": 0.0533, + "step": 2059 + }, + { + "epoch": 1.35, + "grad_norm": 0.26310884952545166, + "learning_rate": 0.00017417257565513524, + "loss": 0.0431, + "step": 2060 + }, + { + "epoch": 1.35, + "grad_norm": 0.10924821346998215, + "learning_rate": 0.0001740708242240528, + "loss": 0.0557, + "step": 2061 + }, + { + "epoch": 1.35, + "grad_norm": 0.021753892302513123, + "learning_rate": 0.00017396906142276664, + "loss": 0.0025, + "step": 2062 + }, + { + "epoch": 1.35, + "grad_norm": 0.05231242626905441, + "learning_rate": 0.00017386728729934587, + "loss": 0.0039, + "step": 2063 + }, + { + "epoch": 1.35, + "grad_norm": 0.03753997012972832, + "learning_rate": 0.000173765501901865, + "loss": 0.0039, + "step": 2064 + }, + { + "epoch": 1.35, + "grad_norm": 0.022054225206375122, + "learning_rate": 0.00017366370527840377, + "loss": 0.003, + "step": 2065 + }, + { + "epoch": 1.35, + "grad_norm": 0.07640133053064346, + "learning_rate": 0.00017356189747704735, + "loss": 0.0082, + "step": 2066 + }, + { + "epoch": 1.35, + "grad_norm": 0.1012224480509758, + "learning_rate": 0.00017346007854588617, + "loss": 0.0191, + "step": 2067 + }, + { + "epoch": 1.35, + "grad_norm": 0.33267688751220703, + "learning_rate": 0.00017335824853301584, + "loss": 0.0347, + "step": 2068 + }, + { + "epoch": 1.35, + "grad_norm": 0.061131980270147324, + "learning_rate": 0.00017325640748653718, + "loss": 0.0096, + "step": 2069 + }, + { + "epoch": 1.36, + "grad_norm": 0.16046808660030365, + "learning_rate": 0.00017315455545455636, + "loss": 0.0202, + "step": 2070 + }, + { + "epoch": 1.36, + "grad_norm": 0.048775166273117065, + "learning_rate": 0.00017305269248518468, + "loss": 0.0098, + "step": 2071 + }, + { + "epoch": 1.36, + "grad_norm": 0.17278356850147247, + "learning_rate": 0.0001729508186265386, + "loss": 0.0222, + "step": 2072 + }, + { + "epoch": 1.36, + "grad_norm": 0.0673917904496193, + "learning_rate": 0.0001728489339267397, + "loss": 0.0071, + "step": 2073 + }, + { + "epoch": 1.36, + "grad_norm": 0.0350324921309948, + "learning_rate": 0.00017274703843391467, + "loss": 0.0041, + "step": 2074 + }, + { + "epoch": 1.36, + "grad_norm": 0.07471290230751038, + "learning_rate": 0.00017264513219619534, + "loss": 0.0083, + "step": 2075 + }, + { + "epoch": 1.36, + "grad_norm": 0.10071226209402084, + "learning_rate": 0.00017254321526171862, + "loss": 0.0406, + "step": 2076 + }, + { + "epoch": 1.36, + "grad_norm": 0.11408775299787521, + "learning_rate": 0.0001724412876786265, + "loss": 0.0095, + "step": 2077 + }, + { + "epoch": 1.36, + "grad_norm": 0.030934764072299004, + "learning_rate": 0.00017233934949506584, + "loss": 0.0035, + "step": 2078 + }, + { + "epoch": 1.36, + "grad_norm": 0.05950973555445671, + "learning_rate": 0.00017223740075918872, + "loss": 0.0065, + "step": 2079 + }, + { + "epoch": 1.36, + "grad_norm": 0.014024289324879646, + "learning_rate": 0.00017213544151915204, + "loss": 0.0016, + "step": 2080 + }, + { + "epoch": 1.36, + "grad_norm": 0.14681045711040497, + "learning_rate": 0.00017203347182311783, + "loss": 0.0097, + "step": 2081 + }, + { + "epoch": 1.36, + "grad_norm": 0.04823039472103119, + "learning_rate": 0.00017193149171925286, + "loss": 0.0052, + "step": 2082 + }, + { + "epoch": 1.36, + "grad_norm": 0.10540402680635452, + "learning_rate": 0.00017182950125572892, + "loss": 0.011, + "step": 2083 + }, + { + "epoch": 1.36, + "grad_norm": 0.20969213545322418, + "learning_rate": 0.00017172750048072277, + "loss": 0.0138, + "step": 2084 + }, + { + "epoch": 1.36, + "grad_norm": 0.31061890721321106, + "learning_rate": 0.0001716254894424159, + "loss": 0.0532, + "step": 2085 + }, + { + "epoch": 1.37, + "grad_norm": 0.20670346915721893, + "learning_rate": 0.00017152346818899468, + "loss": 0.0326, + "step": 2086 + }, + { + "epoch": 1.37, + "grad_norm": 0.00548940384760499, + "learning_rate": 0.00017142143676865038, + "loss": 0.0009, + "step": 2087 + }, + { + "epoch": 1.37, + "grad_norm": 0.1297319084405899, + "learning_rate": 0.00017131939522957898, + "loss": 0.0074, + "step": 2088 + }, + { + "epoch": 1.37, + "grad_norm": 0.2983582019805908, + "learning_rate": 0.00017121734361998133, + "loss": 0.0182, + "step": 2089 + }, + { + "epoch": 1.37, + "grad_norm": 0.25771012902259827, + "learning_rate": 0.00017111528198806303, + "loss": 0.0576, + "step": 2090 + }, + { + "epoch": 1.37, + "grad_norm": 0.04253006353974342, + "learning_rate": 0.00017101321038203425, + "loss": 0.0038, + "step": 2091 + }, + { + "epoch": 1.37, + "grad_norm": 0.2725884020328522, + "learning_rate": 0.00017091112885011007, + "loss": 0.0283, + "step": 2092 + }, + { + "epoch": 1.37, + "grad_norm": 0.2715790569782257, + "learning_rate": 0.0001708090374405102, + "loss": 0.0542, + "step": 2093 + }, + { + "epoch": 1.37, + "grad_norm": 0.038607288151979446, + "learning_rate": 0.00017070693620145904, + "loss": 0.0027, + "step": 2094 + }, + { + "epoch": 1.37, + "grad_norm": 0.39629149436950684, + "learning_rate": 0.00017060482518118546, + "loss": 0.0461, + "step": 2095 + }, + { + "epoch": 1.37, + "grad_norm": 0.014728988520801067, + "learning_rate": 0.0001705027044279232, + "loss": 0.0022, + "step": 2096 + }, + { + "epoch": 1.37, + "grad_norm": 0.11767303943634033, + "learning_rate": 0.0001704005739899104, + "loss": 0.0163, + "step": 2097 + }, + { + "epoch": 1.37, + "grad_norm": 0.143372043967247, + "learning_rate": 0.00017029843391539, + "loss": 0.0163, + "step": 2098 + }, + { + "epoch": 1.37, + "grad_norm": 0.11955475062131882, + "learning_rate": 0.00017019628425260917, + "loss": 0.0115, + "step": 2099 + }, + { + "epoch": 1.37, + "grad_norm": 0.2151668518781662, + "learning_rate": 0.0001700941250498199, + "loss": 0.0372, + "step": 2100 + }, + { + "epoch": 1.38, + "grad_norm": 0.08933127671480179, + "learning_rate": 0.00016999195635527853, + "loss": 0.0094, + "step": 2101 + }, + { + "epoch": 1.38, + "grad_norm": 0.043822623789310455, + "learning_rate": 0.00016988977821724593, + "loss": 0.0053, + "step": 2102 + }, + { + "epoch": 1.38, + "grad_norm": 0.21337710320949554, + "learning_rate": 0.0001697875906839875, + "loss": 0.0272, + "step": 2103 + }, + { + "epoch": 1.38, + "grad_norm": 0.2069789320230484, + "learning_rate": 0.00016968539380377292, + "loss": 0.0318, + "step": 2104 + }, + { + "epoch": 1.38, + "grad_norm": 0.2900010049343109, + "learning_rate": 0.0001695831876248764, + "loss": 0.0465, + "step": 2105 + }, + { + "epoch": 1.38, + "grad_norm": 0.038315340876579285, + "learning_rate": 0.00016948097219557647, + "loss": 0.0042, + "step": 2106 + }, + { + "epoch": 1.38, + "grad_norm": 0.10780084133148193, + "learning_rate": 0.00016937874756415623, + "loss": 0.0365, + "step": 2107 + }, + { + "epoch": 1.38, + "grad_norm": 0.01659630611538887, + "learning_rate": 0.00016927651377890275, + "loss": 0.0017, + "step": 2108 + }, + { + "epoch": 1.38, + "grad_norm": 0.037653081119060516, + "learning_rate": 0.00016917427088810778, + "loss": 0.0044, + "step": 2109 + }, + { + "epoch": 1.38, + "grad_norm": 0.20532900094985962, + "learning_rate": 0.00016907201894006724, + "loss": 0.0514, + "step": 2110 + }, + { + "epoch": 1.38, + "grad_norm": 0.025467032566666603, + "learning_rate": 0.0001689697579830813, + "loss": 0.0037, + "step": 2111 + }, + { + "epoch": 1.38, + "grad_norm": 0.161657452583313, + "learning_rate": 0.00016886748806545438, + "loss": 0.0236, + "step": 2112 + }, + { + "epoch": 1.38, + "grad_norm": 0.0939616858959198, + "learning_rate": 0.00016876520923549517, + "loss": 0.0048, + "step": 2113 + }, + { + "epoch": 1.38, + "grad_norm": 0.24862095713615417, + "learning_rate": 0.0001686629215415166, + "loss": 0.0125, + "step": 2114 + }, + { + "epoch": 1.38, + "grad_norm": 0.13289472460746765, + "learning_rate": 0.00016856062503183572, + "loss": 0.0221, + "step": 2115 + }, + { + "epoch": 1.39, + "grad_norm": 0.023004405200481415, + "learning_rate": 0.00016845831975477384, + "loss": 0.0036, + "step": 2116 + }, + { + "epoch": 1.39, + "grad_norm": 0.2153320163488388, + "learning_rate": 0.00016835600575865623, + "loss": 0.0517, + "step": 2117 + }, + { + "epoch": 1.39, + "grad_norm": 0.18337225914001465, + "learning_rate": 0.0001682536830918125, + "loss": 0.0199, + "step": 2118 + }, + { + "epoch": 1.39, + "grad_norm": 0.06616278737783432, + "learning_rate": 0.00016815135180257612, + "loss": 0.008, + "step": 2119 + }, + { + "epoch": 1.39, + "grad_norm": 0.19889144599437714, + "learning_rate": 0.00016804901193928488, + "loss": 0.0103, + "step": 2120 + }, + { + "epoch": 1.39, + "grad_norm": 0.15102970600128174, + "learning_rate": 0.0001679466635502805, + "loss": 0.0216, + "step": 2121 + }, + { + "epoch": 1.39, + "grad_norm": 0.10404885560274124, + "learning_rate": 0.00016784430668390866, + "loss": 0.0116, + "step": 2122 + }, + { + "epoch": 1.39, + "grad_norm": 0.11957724392414093, + "learning_rate": 0.00016774194138851915, + "loss": 0.0128, + "step": 2123 + }, + { + "epoch": 1.39, + "grad_norm": 0.054242778569459915, + "learning_rate": 0.00016763956771246566, + "loss": 0.0059, + "step": 2124 + }, + { + "epoch": 1.39, + "grad_norm": 0.04711861535906792, + "learning_rate": 0.000167537185704106, + "loss": 0.0039, + "step": 2125 + }, + { + "epoch": 1.39, + "grad_norm": 0.17932789027690887, + "learning_rate": 0.0001674347954118017, + "loss": 0.0451, + "step": 2126 + }, + { + "epoch": 1.39, + "grad_norm": 0.12226320058107376, + "learning_rate": 0.0001673323968839183, + "loss": 0.0372, + "step": 2127 + }, + { + "epoch": 1.39, + "grad_norm": 0.36971887946128845, + "learning_rate": 0.0001672299901688253, + "loss": 0.0173, + "step": 2128 + }, + { + "epoch": 1.39, + "grad_norm": 0.17018848657608032, + "learning_rate": 0.0001671275753148959, + "loss": 0.018, + "step": 2129 + }, + { + "epoch": 1.39, + "grad_norm": 0.1189635694026947, + "learning_rate": 0.0001670251523705074, + "loss": 0.0163, + "step": 2130 + }, + { + "epoch": 1.4, + "grad_norm": 0.31365808844566345, + "learning_rate": 0.00016692272138404065, + "loss": 0.0412, + "step": 2131 + }, + { + "epoch": 1.4, + "grad_norm": 0.20389385521411896, + "learning_rate": 0.00016682028240388037, + "loss": 0.0122, + "step": 2132 + }, + { + "epoch": 1.4, + "grad_norm": 0.16484835743904114, + "learning_rate": 0.00016671783547841525, + "loss": 0.0134, + "step": 2133 + }, + { + "epoch": 1.4, + "grad_norm": 0.14718228578567505, + "learning_rate": 0.00016661538065603748, + "loss": 0.0218, + "step": 2134 + }, + { + "epoch": 1.4, + "grad_norm": 0.12503278255462646, + "learning_rate": 0.00016651291798514312, + "loss": 0.013, + "step": 2135 + }, + { + "epoch": 1.4, + "grad_norm": 0.2573976516723633, + "learning_rate": 0.00016641044751413187, + "loss": 0.0302, + "step": 2136 + }, + { + "epoch": 1.4, + "grad_norm": 0.11231845617294312, + "learning_rate": 0.00016630796929140718, + "loss": 0.0091, + "step": 2137 + }, + { + "epoch": 1.4, + "grad_norm": 0.1934831589460373, + "learning_rate": 0.00016620548336537613, + "loss": 0.0167, + "step": 2138 + }, + { + "epoch": 1.4, + "grad_norm": 0.13918915390968323, + "learning_rate": 0.00016610298978444942, + "loss": 0.0133, + "step": 2139 + }, + { + "epoch": 1.4, + "grad_norm": 0.06197899207472801, + "learning_rate": 0.0001660004885970414, + "loss": 0.0053, + "step": 2140 + }, + { + "epoch": 1.4, + "grad_norm": 0.047342702746391296, + "learning_rate": 0.00016589797985156997, + "loss": 0.005, + "step": 2141 + }, + { + "epoch": 1.4, + "grad_norm": 0.01349773071706295, + "learning_rate": 0.00016579546359645663, + "loss": 0.0018, + "step": 2142 + }, + { + "epoch": 1.4, + "grad_norm": 0.018491871654987335, + "learning_rate": 0.0001656929398801265, + "loss": 0.0022, + "step": 2143 + }, + { + "epoch": 1.4, + "grad_norm": 0.07871642708778381, + "learning_rate": 0.000165590408751008, + "loss": 0.0425, + "step": 2144 + }, + { + "epoch": 1.4, + "grad_norm": 0.13570870459079742, + "learning_rate": 0.00016548787025753332, + "loss": 0.0379, + "step": 2145 + }, + { + "epoch": 1.4, + "grad_norm": 0.07806690782308578, + "learning_rate": 0.00016538532444813794, + "loss": 0.0083, + "step": 2146 + }, + { + "epoch": 1.41, + "grad_norm": 0.2510945796966553, + "learning_rate": 0.00016528277137126094, + "loss": 0.057, + "step": 2147 + }, + { + "epoch": 1.41, + "grad_norm": 0.009228669106960297, + "learning_rate": 0.00016518021107534472, + "loss": 0.0015, + "step": 2148 + }, + { + "epoch": 1.41, + "grad_norm": 0.01314024068415165, + "learning_rate": 0.00016507764360883506, + "loss": 0.0016, + "step": 2149 + }, + { + "epoch": 1.41, + "grad_norm": 0.008662903681397438, + "learning_rate": 0.00016497506902018127, + "loss": 0.0011, + "step": 2150 + }, + { + "epoch": 1.41, + "grad_norm": 0.2847445011138916, + "learning_rate": 0.0001648724873578359, + "loss": 0.0381, + "step": 2151 + }, + { + "epoch": 1.41, + "grad_norm": 0.29297375679016113, + "learning_rate": 0.00016476989867025499, + "loss": 0.0163, + "step": 2152 + }, + { + "epoch": 1.41, + "grad_norm": 0.020055145025253296, + "learning_rate": 0.00016466730300589768, + "loss": 0.0022, + "step": 2153 + }, + { + "epoch": 1.41, + "grad_norm": 0.2110797017812729, + "learning_rate": 0.0001645647004132266, + "loss": 0.0339, + "step": 2154 + }, + { + "epoch": 1.41, + "grad_norm": 0.04020393267273903, + "learning_rate": 0.0001644620909407075, + "loss": 0.0058, + "step": 2155 + }, + { + "epoch": 1.41, + "grad_norm": 0.016536343842744827, + "learning_rate": 0.00016435947463680955, + "loss": 0.0025, + "step": 2156 + }, + { + "epoch": 1.41, + "grad_norm": 0.02839847095310688, + "learning_rate": 0.00016425685155000496, + "loss": 0.0023, + "step": 2157 + }, + { + "epoch": 1.41, + "grad_norm": 0.3273443281650543, + "learning_rate": 0.00016415422172876934, + "loss": 0.0595, + "step": 2158 + }, + { + "epoch": 1.41, + "grad_norm": 0.011165251024067402, + "learning_rate": 0.00016405158522158123, + "loss": 0.0019, + "step": 2159 + }, + { + "epoch": 1.41, + "grad_norm": 0.01584658771753311, + "learning_rate": 0.00016394894207692268, + "loss": 0.0027, + "step": 2160 + }, + { + "epoch": 1.41, + "grad_norm": 0.1392921805381775, + "learning_rate": 0.00016384629234327848, + "loss": 0.0392, + "step": 2161 + }, + { + "epoch": 1.42, + "grad_norm": 0.09018007665872574, + "learning_rate": 0.0001637436360691368, + "loss": 0.0241, + "step": 2162 + }, + { + "epoch": 1.42, + "grad_norm": 0.0933082103729248, + "learning_rate": 0.00016364097330298885, + "loss": 0.0314, + "step": 2163 + }, + { + "epoch": 1.42, + "grad_norm": 0.260314404964447, + "learning_rate": 0.00016353830409332882, + "loss": 0.0236, + "step": 2164 + }, + { + "epoch": 1.42, + "grad_norm": 0.22859854996204376, + "learning_rate": 0.00016343562848865413, + "loss": 0.0334, + "step": 2165 + }, + { + "epoch": 1.42, + "grad_norm": 0.2570708692073822, + "learning_rate": 0.00016333294653746494, + "loss": 0.07, + "step": 2166 + }, + { + "epoch": 1.42, + "grad_norm": 0.10685363411903381, + "learning_rate": 0.0001632302582882646, + "loss": 0.0169, + "step": 2167 + }, + { + "epoch": 1.42, + "grad_norm": 0.41462910175323486, + "learning_rate": 0.00016312756378955947, + "loss": 0.02, + "step": 2168 + }, + { + "epoch": 1.42, + "grad_norm": 0.15622973442077637, + "learning_rate": 0.00016302486308985873, + "loss": 0.0252, + "step": 2169 + }, + { + "epoch": 1.42, + "grad_norm": 0.14134229719638824, + "learning_rate": 0.00016292215623767457, + "loss": 0.0192, + "step": 2170 + }, + { + "epoch": 1.42, + "grad_norm": 0.4416516423225403, + "learning_rate": 0.00016281944328152206, + "loss": 0.0608, + "step": 2171 + }, + { + "epoch": 1.42, + "grad_norm": 0.05815531313419342, + "learning_rate": 0.0001627167242699191, + "loss": 0.0132, + "step": 2172 + }, + { + "epoch": 1.42, + "grad_norm": 0.10270994156599045, + "learning_rate": 0.0001626139992513866, + "loss": 0.028, + "step": 2173 + }, + { + "epoch": 1.42, + "grad_norm": 0.09582596272230148, + "learning_rate": 0.0001625112682744482, + "loss": 0.0152, + "step": 2174 + }, + { + "epoch": 1.42, + "grad_norm": 0.09254796802997589, + "learning_rate": 0.0001624085313876303, + "loss": 0.0184, + "step": 2175 + }, + { + "epoch": 1.42, + "grad_norm": 0.04410834610462189, + "learning_rate": 0.00016230578863946223, + "loss": 0.0167, + "step": 2176 + }, + { + "epoch": 1.43, + "grad_norm": 0.07682263851165771, + "learning_rate": 0.000162203040078476, + "loss": 0.0107, + "step": 2177 + }, + { + "epoch": 1.43, + "grad_norm": 0.0957961305975914, + "learning_rate": 0.00016210028575320643, + "loss": 0.01, + "step": 2178 + }, + { + "epoch": 1.43, + "grad_norm": 0.11102692037820816, + "learning_rate": 0.00016199752571219092, + "loss": 0.0175, + "step": 2179 + }, + { + "epoch": 1.43, + "grad_norm": 0.04554305970668793, + "learning_rate": 0.00016189476000396977, + "loss": 0.0058, + "step": 2180 + }, + { + "epoch": 1.43, + "grad_norm": 0.10862531512975693, + "learning_rate": 0.00016179198867708575, + "loss": 0.0059, + "step": 2181 + }, + { + "epoch": 1.43, + "grad_norm": 0.18247346580028534, + "learning_rate": 0.00016168921178008448, + "loss": 0.0248, + "step": 2182 + }, + { + "epoch": 1.43, + "grad_norm": 0.11431893706321716, + "learning_rate": 0.00016158642936151416, + "loss": 0.0142, + "step": 2183 + }, + { + "epoch": 1.43, + "grad_norm": 0.12229776382446289, + "learning_rate": 0.0001614836414699254, + "loss": 0.0203, + "step": 2184 + }, + { + "epoch": 1.43, + "grad_norm": 0.09890652447938919, + "learning_rate": 0.0001613808481538717, + "loss": 0.0162, + "step": 2185 + }, + { + "epoch": 1.43, + "grad_norm": 0.021438946947455406, + "learning_rate": 0.00016127804946190893, + "loss": 0.0035, + "step": 2186 + }, + { + "epoch": 1.43, + "grad_norm": 0.026309477165341377, + "learning_rate": 0.00016117524544259553, + "loss": 0.0027, + "step": 2187 + }, + { + "epoch": 1.43, + "grad_norm": 0.1389365792274475, + "learning_rate": 0.0001610724361444925, + "loss": 0.0463, + "step": 2188 + }, + { + "epoch": 1.43, + "grad_norm": 0.2812053859233856, + "learning_rate": 0.00016096962161616326, + "loss": 0.0695, + "step": 2189 + }, + { + "epoch": 1.43, + "grad_norm": 0.009222770109772682, + "learning_rate": 0.0001608668019061738, + "loss": 0.0013, + "step": 2190 + }, + { + "epoch": 1.43, + "grad_norm": 0.02184683084487915, + "learning_rate": 0.00016076397706309245, + "loss": 0.0041, + "step": 2191 + }, + { + "epoch": 1.44, + "grad_norm": 0.028864651918411255, + "learning_rate": 0.0001606611471354901, + "loss": 0.004, + "step": 2192 + }, + { + "epoch": 1.44, + "grad_norm": 0.1239912286400795, + "learning_rate": 0.0001605583121719399, + "loss": 0.017, + "step": 2193 + }, + { + "epoch": 1.44, + "grad_norm": 0.04970764368772507, + "learning_rate": 0.00016045547222101746, + "loss": 0.0058, + "step": 2194 + }, + { + "epoch": 1.44, + "grad_norm": 0.046907342970371246, + "learning_rate": 0.0001603526273313007, + "loss": 0.0059, + "step": 2195 + }, + { + "epoch": 1.44, + "grad_norm": 0.11429935693740845, + "learning_rate": 0.00016024977755136995, + "loss": 0.0413, + "step": 2196 + }, + { + "epoch": 1.44, + "grad_norm": 0.015206074342131615, + "learning_rate": 0.00016014692292980775, + "loss": 0.0013, + "step": 2197 + }, + { + "epoch": 1.44, + "grad_norm": 0.07148374617099762, + "learning_rate": 0.00016004406351519896, + "loss": 0.0111, + "step": 2198 + }, + { + "epoch": 1.44, + "grad_norm": 0.07193495333194733, + "learning_rate": 0.0001599411993561308, + "loss": 0.008, + "step": 2199 + }, + { + "epoch": 1.44, + "grad_norm": 0.07575807720422745, + "learning_rate": 0.0001598383305011926, + "loss": 0.0102, + "step": 2200 + }, + { + "epoch": 1.44, + "grad_norm": 0.019444789737462997, + "learning_rate": 0.00015973545699897595, + "loss": 0.002, + "step": 2201 + }, + { + "epoch": 1.44, + "grad_norm": 0.007753497920930386, + "learning_rate": 0.00015963257889807465, + "loss": 0.001, + "step": 2202 + }, + { + "epoch": 1.44, + "grad_norm": 0.13980121910572052, + "learning_rate": 0.0001595296962470847, + "loss": 0.0113, + "step": 2203 + }, + { + "epoch": 1.44, + "grad_norm": 0.0628625676035881, + "learning_rate": 0.00015942680909460417, + "loss": 0.006, + "step": 2204 + }, + { + "epoch": 1.44, + "grad_norm": 0.3631378710269928, + "learning_rate": 0.00015932391748923333, + "loss": 0.0221, + "step": 2205 + }, + { + "epoch": 1.44, + "grad_norm": 0.08289500325918198, + "learning_rate": 0.00015922102147957452, + "loss": 0.0033, + "step": 2206 + }, + { + "epoch": 1.44, + "grad_norm": 0.016553470864892006, + "learning_rate": 0.00015911812111423215, + "loss": 0.0015, + "step": 2207 + }, + { + "epoch": 1.45, + "grad_norm": 0.14857268333435059, + "learning_rate": 0.00015901521644181272, + "loss": 0.008, + "step": 2208 + }, + { + "epoch": 1.45, + "grad_norm": 0.11462613940238953, + "learning_rate": 0.00015891230751092478, + "loss": 0.0076, + "step": 2209 + }, + { + "epoch": 1.45, + "grad_norm": 0.10147285461425781, + "learning_rate": 0.00015880939437017878, + "loss": 0.0028, + "step": 2210 + }, + { + "epoch": 1.45, + "grad_norm": 0.07668115198612213, + "learning_rate": 0.00015870647706818728, + "loss": 0.0019, + "step": 2211 + }, + { + "epoch": 1.45, + "grad_norm": 0.25671836733818054, + "learning_rate": 0.00015860355565356483, + "loss": 0.0293, + "step": 2212 + }, + { + "epoch": 1.45, + "grad_norm": 0.09710162878036499, + "learning_rate": 0.00015850063017492773, + "loss": 0.0056, + "step": 2213 + }, + { + "epoch": 1.45, + "grad_norm": 0.02251294068992138, + "learning_rate": 0.00015839770068089442, + "loss": 0.0019, + "step": 2214 + }, + { + "epoch": 1.45, + "grad_norm": 0.2710408866405487, + "learning_rate": 0.00015829476722008508, + "loss": 0.0328, + "step": 2215 + }, + { + "epoch": 1.45, + "grad_norm": 0.05798187106847763, + "learning_rate": 0.0001581918298411219, + "loss": 0.0032, + "step": 2216 + }, + { + "epoch": 1.45, + "grad_norm": 0.15880708396434784, + "learning_rate": 0.00015808888859262875, + "loss": 0.0052, + "step": 2217 + }, + { + "epoch": 1.45, + "grad_norm": 0.0126173235476017, + "learning_rate": 0.00015798594352323155, + "loss": 0.0014, + "step": 2218 + }, + { + "epoch": 1.45, + "grad_norm": 0.3693072497844696, + "learning_rate": 0.00015788299468155783, + "loss": 0.035, + "step": 2219 + }, + { + "epoch": 1.45, + "grad_norm": 0.008310376666486263, + "learning_rate": 0.00015778004211623695, + "loss": 0.0007, + "step": 2220 + }, + { + "epoch": 1.45, + "grad_norm": 0.1609114110469818, + "learning_rate": 0.00015767708587590003, + "loss": 0.0159, + "step": 2221 + }, + { + "epoch": 1.45, + "grad_norm": 0.18963384628295898, + "learning_rate": 0.00015757412600918004, + "loss": 0.0149, + "step": 2222 + }, + { + "epoch": 1.46, + "grad_norm": 0.10448487848043442, + "learning_rate": 0.00015747116256471154, + "loss": 0.0047, + "step": 2223 + }, + { + "epoch": 1.46, + "grad_norm": 0.0033179214224219322, + "learning_rate": 0.00015736819559113076, + "loss": 0.0005, + "step": 2224 + }, + { + "epoch": 1.46, + "grad_norm": 0.03964143246412277, + "learning_rate": 0.00015726522513707567, + "loss": 0.0014, + "step": 2225 + }, + { + "epoch": 1.46, + "grad_norm": 0.2769809663295746, + "learning_rate": 0.00015716225125118587, + "loss": 0.0654, + "step": 2226 + }, + { + "epoch": 1.46, + "grad_norm": 0.03909273445606232, + "learning_rate": 0.00015705927398210258, + "loss": 0.0025, + "step": 2227 + }, + { + "epoch": 1.46, + "grad_norm": 0.11096679419279099, + "learning_rate": 0.0001569562933784686, + "loss": 0.0446, + "step": 2228 + }, + { + "epoch": 1.46, + "grad_norm": 0.14691616594791412, + "learning_rate": 0.00015685330948892834, + "loss": 0.0471, + "step": 2229 + }, + { + "epoch": 1.46, + "grad_norm": 0.10699688643217087, + "learning_rate": 0.00015675032236212768, + "loss": 0.0076, + "step": 2230 + }, + { + "epoch": 1.46, + "grad_norm": 0.32596132159233093, + "learning_rate": 0.0001566473320467141, + "loss": 0.022, + "step": 2231 + }, + { + "epoch": 1.46, + "grad_norm": 0.5243645310401917, + "learning_rate": 0.00015654433859133666, + "loss": 0.0461, + "step": 2232 + }, + { + "epoch": 1.46, + "grad_norm": 0.08140390366315842, + "learning_rate": 0.0001564413420446457, + "loss": 0.0052, + "step": 2233 + }, + { + "epoch": 1.46, + "grad_norm": 0.009572282433509827, + "learning_rate": 0.00015633834245529316, + "loss": 0.0011, + "step": 2234 + }, + { + "epoch": 1.46, + "grad_norm": 0.1445630043745041, + "learning_rate": 0.00015623533987193247, + "loss": 0.0075, + "step": 2235 + }, + { + "epoch": 1.46, + "grad_norm": 0.19702015817165375, + "learning_rate": 0.00015613233434321833, + "loss": 0.0164, + "step": 2236 + }, + { + "epoch": 1.46, + "grad_norm": 0.05097610130906105, + "learning_rate": 0.00015602932591780692, + "loss": 0.0053, + "step": 2237 + }, + { + "epoch": 1.47, + "grad_norm": 0.1141589879989624, + "learning_rate": 0.00015592631464435573, + "loss": 0.0085, + "step": 2238 + }, + { + "epoch": 1.47, + "grad_norm": 0.17674514651298523, + "learning_rate": 0.00015582330057152367, + "loss": 0.0115, + "step": 2239 + }, + { + "epoch": 1.47, + "grad_norm": 0.43345922231674194, + "learning_rate": 0.00015572028374797095, + "loss": 0.0494, + "step": 2240 + }, + { + "epoch": 1.47, + "grad_norm": 0.24077734351158142, + "learning_rate": 0.00015561726422235906, + "loss": 0.0407, + "step": 2241 + }, + { + "epoch": 1.47, + "grad_norm": 0.2242443561553955, + "learning_rate": 0.00015551424204335074, + "loss": 0.0464, + "step": 2242 + }, + { + "epoch": 1.47, + "grad_norm": 0.02641609124839306, + "learning_rate": 0.00015541121725961, + "loss": 0.0036, + "step": 2243 + }, + { + "epoch": 1.47, + "grad_norm": 0.17494410276412964, + "learning_rate": 0.00015530818991980213, + "loss": 0.0728, + "step": 2244 + }, + { + "epoch": 1.47, + "grad_norm": 0.14498472213745117, + "learning_rate": 0.00015520516007259364, + "loss": 0.0141, + "step": 2245 + }, + { + "epoch": 1.47, + "grad_norm": 0.07311736047267914, + "learning_rate": 0.00015510212776665206, + "loss": 0.0075, + "step": 2246 + }, + { + "epoch": 1.47, + "grad_norm": 0.18270064890384674, + "learning_rate": 0.00015499909305064625, + "loss": 0.0335, + "step": 2247 + }, + { + "epoch": 1.47, + "grad_norm": 0.11468853056430817, + "learning_rate": 0.00015489605597324618, + "loss": 0.0126, + "step": 2248 + }, + { + "epoch": 1.47, + "grad_norm": 0.24907346069812775, + "learning_rate": 0.00015479301658312294, + "loss": 0.0547, + "step": 2249 + }, + { + "epoch": 1.47, + "grad_norm": 0.054052844643592834, + "learning_rate": 0.0001546899749289486, + "loss": 0.0061, + "step": 2250 + }, + { + "epoch": 1.47, + "grad_norm": 0.10278685390949249, + "learning_rate": 0.0001545869310593964, + "loss": 0.0148, + "step": 2251 + }, + { + "epoch": 1.47, + "grad_norm": 0.039586808532476425, + "learning_rate": 0.00015448388502314065, + "loss": 0.0044, + "step": 2252 + }, + { + "epoch": 1.47, + "grad_norm": 0.1996268779039383, + "learning_rate": 0.00015438083686885663, + "loss": 0.0238, + "step": 2253 + }, + { + "epoch": 1.48, + "grad_norm": 0.07462750375270844, + "learning_rate": 0.00015427778664522067, + "loss": 0.0077, + "step": 2254 + }, + { + "epoch": 1.48, + "grad_norm": 0.022693922743201256, + "learning_rate": 0.00015417473440090994, + "loss": 0.0031, + "step": 2255 + }, + { + "epoch": 1.48, + "grad_norm": 0.17342881858348846, + "learning_rate": 0.00015407168018460272, + "loss": 0.0334, + "step": 2256 + }, + { + "epoch": 1.48, + "grad_norm": 0.1654786616563797, + "learning_rate": 0.0001539686240449782, + "loss": 0.059, + "step": 2257 + }, + { + "epoch": 1.48, + "grad_norm": 0.10321973264217377, + "learning_rate": 0.00015386556603071643, + "loss": 0.0204, + "step": 2258 + }, + { + "epoch": 1.48, + "grad_norm": 0.10149465501308441, + "learning_rate": 0.00015376250619049834, + "loss": 0.0122, + "step": 2259 + }, + { + "epoch": 1.48, + "grad_norm": 0.04105484485626221, + "learning_rate": 0.00015365944457300572, + "loss": 0.0056, + "step": 2260 + }, + { + "epoch": 1.48, + "grad_norm": 0.10165435820817947, + "learning_rate": 0.0001535563812269213, + "loss": 0.0137, + "step": 2261 + }, + { + "epoch": 1.48, + "grad_norm": 0.11750262975692749, + "learning_rate": 0.0001534533162009285, + "loss": 0.009, + "step": 2262 + }, + { + "epoch": 1.48, + "grad_norm": 0.30132028460502625, + "learning_rate": 0.00015335024954371158, + "loss": 0.0188, + "step": 2263 + }, + { + "epoch": 1.48, + "grad_norm": 0.19077062606811523, + "learning_rate": 0.0001532471813039556, + "loss": 0.0249, + "step": 2264 + }, + { + "epoch": 1.48, + "grad_norm": 0.1327200084924698, + "learning_rate": 0.0001531441115303463, + "loss": 0.0139, + "step": 2265 + }, + { + "epoch": 1.48, + "grad_norm": 0.09262137115001678, + "learning_rate": 0.0001530410402715702, + "loss": 0.0128, + "step": 2266 + }, + { + "epoch": 1.48, + "grad_norm": 0.1415720283985138, + "learning_rate": 0.00015293796757631458, + "loss": 0.0255, + "step": 2267 + }, + { + "epoch": 1.48, + "grad_norm": 0.28981319069862366, + "learning_rate": 0.00015283489349326721, + "loss": 0.0365, + "step": 2268 + }, + { + "epoch": 1.49, + "grad_norm": 0.024845613166689873, + "learning_rate": 0.0001527318180711167, + "loss": 0.0034, + "step": 2269 + }, + { + "epoch": 1.49, + "grad_norm": 0.0768069252371788, + "learning_rate": 0.0001526287413585522, + "loss": 0.0066, + "step": 2270 + }, + { + "epoch": 1.49, + "grad_norm": 0.1130814403295517, + "learning_rate": 0.00015252566340426352, + "loss": 0.0138, + "step": 2271 + }, + { + "epoch": 1.49, + "grad_norm": 0.15060044825077057, + "learning_rate": 0.00015242258425694107, + "loss": 0.047, + "step": 2272 + }, + { + "epoch": 1.49, + "grad_norm": 0.16956418752670288, + "learning_rate": 0.00015231950396527564, + "loss": 0.0468, + "step": 2273 + }, + { + "epoch": 1.49, + "grad_norm": 0.027678130194544792, + "learning_rate": 0.0001522164225779588, + "loss": 0.0035, + "step": 2274 + }, + { + "epoch": 1.49, + "grad_norm": 0.011556626297533512, + "learning_rate": 0.00015211334014368256, + "loss": 0.0017, + "step": 2275 + }, + { + "epoch": 1.49, + "grad_norm": 0.17217235267162323, + "learning_rate": 0.0001520102567111394, + "loss": 0.0299, + "step": 2276 + }, + { + "epoch": 1.49, + "grad_norm": 0.02171311527490616, + "learning_rate": 0.00015190717232902224, + "loss": 0.0032, + "step": 2277 + }, + { + "epoch": 1.49, + "grad_norm": 0.08322153985500336, + "learning_rate": 0.0001518040870460245, + "loss": 0.0148, + "step": 2278 + }, + { + "epoch": 1.49, + "grad_norm": 0.10915898531675339, + "learning_rate": 0.00015170100091084, + "loss": 0.0122, + "step": 2279 + }, + { + "epoch": 1.49, + "grad_norm": 0.06174374371767044, + "learning_rate": 0.000151597913972163, + "loss": 0.011, + "step": 2280 + }, + { + "epoch": 1.49, + "grad_norm": 0.0850352942943573, + "learning_rate": 0.00015149482627868814, + "loss": 0.0175, + "step": 2281 + }, + { + "epoch": 1.49, + "grad_norm": 0.8030893206596375, + "learning_rate": 0.0001513917378791103, + "loss": 0.0689, + "step": 2282 + }, + { + "epoch": 1.49, + "grad_norm": 0.08825898915529251, + "learning_rate": 0.0001512886488221249, + "loss": 0.0109, + "step": 2283 + }, + { + "epoch": 1.5, + "grad_norm": 0.20646128058433533, + "learning_rate": 0.00015118555915642746, + "loss": 0.0717, + "step": 2284 + }, + { + "epoch": 1.5, + "grad_norm": 0.09915432333946228, + "learning_rate": 0.00015108246893071395, + "loss": 0.0117, + "step": 2285 + }, + { + "epoch": 1.5, + "grad_norm": 0.06762687116861343, + "learning_rate": 0.00015097937819368045, + "loss": 0.0095, + "step": 2286 + }, + { + "epoch": 1.5, + "grad_norm": 0.181647390127182, + "learning_rate": 0.00015087628699402345, + "loss": 0.0205, + "step": 2287 + }, + { + "epoch": 1.5, + "grad_norm": 0.25607171654701233, + "learning_rate": 0.00015077319538043954, + "loss": 0.0161, + "step": 2288 + }, + { + "epoch": 1.5, + "grad_norm": 0.10751291364431381, + "learning_rate": 0.00015067010340162558, + "loss": 0.0151, + "step": 2289 + }, + { + "epoch": 1.5, + "grad_norm": 0.15035240352153778, + "learning_rate": 0.00015056701110627855, + "loss": 0.0232, + "step": 2290 + }, + { + "epoch": 1.5, + "grad_norm": 0.09425321966409683, + "learning_rate": 0.00015046391854309552, + "loss": 0.0109, + "step": 2291 + }, + { + "epoch": 1.5, + "grad_norm": 0.05676641687750816, + "learning_rate": 0.00015036082576077385, + "loss": 0.0091, + "step": 2292 + }, + { + "epoch": 1.5, + "eval_loss": 0.029079807922244072, + "eval_runtime": 39.9888, + "eval_samples_per_second": 32.184, + "eval_steps_per_second": 8.052, + "step": 2292 + }, + { + "epoch": 1.5, + "grad_norm": 0.18537257611751556, + "learning_rate": 0.00015025773280801088, + "loss": 0.0312, + "step": 2293 + }, + { + "epoch": 1.5, + "grad_norm": 0.17844584584236145, + "learning_rate": 0.0001501546397335041, + "loss": 0.0275, + "step": 2294 + }, + { + "epoch": 1.5, + "grad_norm": 0.12108953297138214, + "learning_rate": 0.00015005154658595096, + "loss": 0.0173, + "step": 2295 + }, + { + "epoch": 1.5, + "grad_norm": 0.06204582378268242, + "learning_rate": 0.00014994845341404907, + "loss": 0.0089, + "step": 2296 + }, + { + "epoch": 1.5, + "grad_norm": 0.23866377770900726, + "learning_rate": 0.00014984536026649593, + "loss": 0.0258, + "step": 2297 + }, + { + "epoch": 1.5, + "grad_norm": 0.013257946819067001, + "learning_rate": 0.00014974226719198912, + "loss": 0.0026, + "step": 2298 + }, + { + "epoch": 1.51, + "grad_norm": 0.14589637517929077, + "learning_rate": 0.00014963917423922618, + "loss": 0.0159, + "step": 2299 + }, + { + "epoch": 1.51, + "grad_norm": 0.2827925682067871, + "learning_rate": 0.0001495360814569045, + "loss": 0.0182, + "step": 2300 + }, + { + "epoch": 1.51, + "grad_norm": 0.042063966393470764, + "learning_rate": 0.0001494329888937215, + "loss": 0.0063, + "step": 2301 + }, + { + "epoch": 1.51, + "grad_norm": 0.01539128739386797, + "learning_rate": 0.00014932989659837442, + "loss": 0.0025, + "step": 2302 + }, + { + "epoch": 1.51, + "grad_norm": 0.07236410677433014, + "learning_rate": 0.00014922680461956048, + "loss": 0.0087, + "step": 2303 + }, + { + "epoch": 1.51, + "grad_norm": 0.03618314489722252, + "learning_rate": 0.00014912371300597658, + "loss": 0.0049, + "step": 2304 + }, + { + "epoch": 1.51, + "grad_norm": 0.11485552042722702, + "learning_rate": 0.00014902062180631958, + "loss": 0.008, + "step": 2305 + }, + { + "epoch": 1.51, + "grad_norm": 0.11101217567920685, + "learning_rate": 0.00014891753106928608, + "loss": 0.0232, + "step": 2306 + }, + { + "epoch": 1.51, + "grad_norm": 0.017525073140859604, + "learning_rate": 0.00014881444084357255, + "loss": 0.0025, + "step": 2307 + }, + { + "epoch": 1.51, + "grad_norm": 0.20452751219272614, + "learning_rate": 0.00014871135117787513, + "loss": 0.0438, + "step": 2308 + }, + { + "epoch": 1.51, + "grad_norm": 0.2409246861934662, + "learning_rate": 0.00014860826212088972, + "loss": 0.0299, + "step": 2309 + }, + { + "epoch": 1.51, + "grad_norm": 0.3612133860588074, + "learning_rate": 0.0001485051737213119, + "loss": 0.0299, + "step": 2310 + }, + { + "epoch": 1.51, + "grad_norm": 0.1734815537929535, + "learning_rate": 0.000148402086027837, + "loss": 0.0164, + "step": 2311 + }, + { + "epoch": 1.51, + "grad_norm": 0.2504834234714508, + "learning_rate": 0.00014829899908916003, + "loss": 0.0233, + "step": 2312 + }, + { + "epoch": 1.51, + "grad_norm": 0.29188862442970276, + "learning_rate": 0.00014819591295397555, + "loss": 0.0202, + "step": 2313 + }, + { + "epoch": 1.51, + "grad_norm": 0.044157739728689194, + "learning_rate": 0.0001480928276709778, + "loss": 0.0057, + "step": 2314 + }, + { + "epoch": 1.52, + "grad_norm": 0.08563201874494553, + "learning_rate": 0.00014798974328886062, + "loss": 0.0118, + "step": 2315 + }, + { + "epoch": 1.52, + "grad_norm": 0.06240120530128479, + "learning_rate": 0.00014788665985631741, + "loss": 0.0041, + "step": 2316 + }, + { + "epoch": 1.52, + "grad_norm": 0.1662430614233017, + "learning_rate": 0.0001477835774220412, + "loss": 0.0129, + "step": 2317 + }, + { + "epoch": 1.52, + "grad_norm": 0.020060239359736443, + "learning_rate": 0.00014768049603472436, + "loss": 0.0016, + "step": 2318 + }, + { + "epoch": 1.52, + "grad_norm": 0.11197911947965622, + "learning_rate": 0.00014757741574305896, + "loss": 0.0059, + "step": 2319 + }, + { + "epoch": 1.52, + "grad_norm": 0.25439372658729553, + "learning_rate": 0.00014747433659573645, + "loss": 0.0597, + "step": 2320 + }, + { + "epoch": 1.52, + "grad_norm": 0.12229887396097183, + "learning_rate": 0.00014737125864144779, + "loss": 0.0061, + "step": 2321 + }, + { + "epoch": 1.52, + "grad_norm": 0.3244328498840332, + "learning_rate": 0.0001472681819288833, + "loss": 0.0897, + "step": 2322 + }, + { + "epoch": 1.52, + "grad_norm": 0.10589354485273361, + "learning_rate": 0.00014716510650673279, + "loss": 0.0057, + "step": 2323 + }, + { + "epoch": 1.52, + "grad_norm": 0.020396653562784195, + "learning_rate": 0.00014706203242368542, + "loss": 0.0025, + "step": 2324 + }, + { + "epoch": 1.52, + "grad_norm": 0.3133372366428375, + "learning_rate": 0.0001469589597284298, + "loss": 0.0432, + "step": 2325 + }, + { + "epoch": 1.52, + "grad_norm": 0.09910666942596436, + "learning_rate": 0.0001468558884696537, + "loss": 0.0089, + "step": 2326 + }, + { + "epoch": 1.52, + "grad_norm": 0.06794673204421997, + "learning_rate": 0.0001467528186960444, + "loss": 0.0036, + "step": 2327 + }, + { + "epoch": 1.52, + "grad_norm": 0.5119796395301819, + "learning_rate": 0.00014664975045628842, + "loss": 0.0347, + "step": 2328 + }, + { + "epoch": 1.52, + "grad_norm": 0.13082320988178253, + "learning_rate": 0.00014654668379907149, + "loss": 0.0181, + "step": 2329 + }, + { + "epoch": 1.53, + "grad_norm": 0.043470270931720734, + "learning_rate": 0.0001464436187730787, + "loss": 0.0048, + "step": 2330 + }, + { + "epoch": 1.53, + "grad_norm": 0.1573643833398819, + "learning_rate": 0.00014634055542699426, + "loss": 0.0161, + "step": 2331 + }, + { + "epoch": 1.53, + "grad_norm": 0.2287653237581253, + "learning_rate": 0.00014623749380950166, + "loss": 0.0384, + "step": 2332 + }, + { + "epoch": 1.53, + "grad_norm": 0.35878098011016846, + "learning_rate": 0.00014613443396928357, + "loss": 0.047, + "step": 2333 + }, + { + "epoch": 1.53, + "grad_norm": 0.14366035163402557, + "learning_rate": 0.0001460313759550218, + "loss": 0.0258, + "step": 2334 + }, + { + "epoch": 1.53, + "grad_norm": 0.027050506323575974, + "learning_rate": 0.00014592831981539726, + "loss": 0.0023, + "step": 2335 + }, + { + "epoch": 1.53, + "grad_norm": 0.1347045749425888, + "learning_rate": 0.00014582526559909006, + "loss": 0.0568, + "step": 2336 + }, + { + "epoch": 1.53, + "grad_norm": 0.1418723464012146, + "learning_rate": 0.00014572221335477936, + "loss": 0.0116, + "step": 2337 + }, + { + "epoch": 1.53, + "grad_norm": 0.057905472815036774, + "learning_rate": 0.00014561916313114338, + "loss": 0.008, + "step": 2338 + }, + { + "epoch": 1.53, + "grad_norm": 0.12035661935806274, + "learning_rate": 0.00014551611497685933, + "loss": 0.0243, + "step": 2339 + }, + { + "epoch": 1.53, + "grad_norm": 0.0975029394030571, + "learning_rate": 0.00014541306894060358, + "loss": 0.0113, + "step": 2340 + }, + { + "epoch": 1.53, + "grad_norm": 0.17044967412948608, + "learning_rate": 0.0001453100250710514, + "loss": 0.0244, + "step": 2341 + }, + { + "epoch": 1.53, + "grad_norm": 0.04020816087722778, + "learning_rate": 0.00014520698341687706, + "loss": 0.0053, + "step": 2342 + }, + { + "epoch": 1.53, + "grad_norm": 0.18097876012325287, + "learning_rate": 0.0001451039440267538, + "loss": 0.0253, + "step": 2343 + }, + { + "epoch": 1.53, + "grad_norm": 0.012999899685382843, + "learning_rate": 0.00014500090694935373, + "loss": 0.0021, + "step": 2344 + }, + { + "epoch": 1.54, + "grad_norm": 0.17676088213920593, + "learning_rate": 0.00014489787223334795, + "loss": 0.0283, + "step": 2345 + }, + { + "epoch": 1.54, + "grad_norm": 0.2022247314453125, + "learning_rate": 0.00014479483992740636, + "loss": 0.0144, + "step": 2346 + }, + { + "epoch": 1.54, + "grad_norm": 0.059292592108249664, + "learning_rate": 0.00014469181008019784, + "loss": 0.0055, + "step": 2347 + }, + { + "epoch": 1.54, + "grad_norm": 0.07103478163480759, + "learning_rate": 0.00014458878274039, + "loss": 0.0082, + "step": 2348 + }, + { + "epoch": 1.54, + "grad_norm": 0.24378490447998047, + "learning_rate": 0.00014448575795664926, + "loss": 0.0345, + "step": 2349 + }, + { + "epoch": 1.54, + "grad_norm": 0.04015032947063446, + "learning_rate": 0.00014438273577764094, + "loss": 0.0045, + "step": 2350 + }, + { + "epoch": 1.54, + "grad_norm": 0.22856755554676056, + "learning_rate": 0.00014427971625202905, + "loss": 0.0152, + "step": 2351 + }, + { + "epoch": 1.54, + "grad_norm": 0.08063790947198868, + "learning_rate": 0.0001441766994284763, + "loss": 0.0085, + "step": 2352 + }, + { + "epoch": 1.54, + "grad_norm": 0.30245184898376465, + "learning_rate": 0.00014407368535564427, + "loss": 0.0841, + "step": 2353 + }, + { + "epoch": 1.54, + "grad_norm": 0.2929746210575104, + "learning_rate": 0.00014397067408219308, + "loss": 0.028, + "step": 2354 + }, + { + "epoch": 1.54, + "grad_norm": 0.23219022154808044, + "learning_rate": 0.00014386766565678165, + "loss": 0.0337, + "step": 2355 + }, + { + "epoch": 1.54, + "grad_norm": 0.05898268148303032, + "learning_rate": 0.00014376466012806755, + "loss": 0.0076, + "step": 2356 + }, + { + "epoch": 1.54, + "grad_norm": 0.15910372138023376, + "learning_rate": 0.0001436616575447068, + "loss": 0.0318, + "step": 2357 + }, + { + "epoch": 1.54, + "grad_norm": 0.12564696371555328, + "learning_rate": 0.0001435586579553543, + "loss": 0.0139, + "step": 2358 + }, + { + "epoch": 1.54, + "grad_norm": 0.08463872969150543, + "learning_rate": 0.00014345566140866334, + "loss": 0.0081, + "step": 2359 + }, + { + "epoch": 1.55, + "grad_norm": 0.10430733859539032, + "learning_rate": 0.0001433526679532859, + "loss": 0.0131, + "step": 2360 + }, + { + "epoch": 1.55, + "grad_norm": 0.1021500900387764, + "learning_rate": 0.00014324967763787235, + "loss": 0.0131, + "step": 2361 + }, + { + "epoch": 1.55, + "grad_norm": 0.04552861675620079, + "learning_rate": 0.00014314669051107166, + "loss": 0.0069, + "step": 2362 + }, + { + "epoch": 1.55, + "grad_norm": 0.18448792397975922, + "learning_rate": 0.00014304370662153137, + "loss": 0.0241, + "step": 2363 + }, + { + "epoch": 1.55, + "grad_norm": 0.14002187550067902, + "learning_rate": 0.00014294072601789742, + "loss": 0.0205, + "step": 2364 + }, + { + "epoch": 1.55, + "grad_norm": 0.025072062388062477, + "learning_rate": 0.00014283774874881413, + "loss": 0.0026, + "step": 2365 + }, + { + "epoch": 1.55, + "grad_norm": 0.12297520786523819, + "learning_rate": 0.00014273477486292433, + "loss": 0.0106, + "step": 2366 + }, + { + "epoch": 1.55, + "grad_norm": 0.0502072237432003, + "learning_rate": 0.00014263180440886924, + "loss": 0.006, + "step": 2367 + }, + { + "epoch": 1.55, + "grad_norm": 0.3273145854473114, + "learning_rate": 0.00014252883743528843, + "loss": 0.0501, + "step": 2368 + }, + { + "epoch": 1.55, + "grad_norm": 0.1304243505001068, + "learning_rate": 0.00014242587399081993, + "loss": 0.0152, + "step": 2369 + }, + { + "epoch": 1.55, + "grad_norm": 0.29160431027412415, + "learning_rate": 0.00014232291412409994, + "loss": 0.0516, + "step": 2370 + }, + { + "epoch": 1.55, + "grad_norm": 0.10514498502016068, + "learning_rate": 0.00014221995788376305, + "loss": 0.0355, + "step": 2371 + }, + { + "epoch": 1.55, + "grad_norm": 0.0701146274805069, + "learning_rate": 0.00014211700531844215, + "loss": 0.0062, + "step": 2372 + }, + { + "epoch": 1.55, + "grad_norm": 0.0321023166179657, + "learning_rate": 0.00014201405647676842, + "loss": 0.0028, + "step": 2373 + }, + { + "epoch": 1.55, + "grad_norm": 0.19740337133407593, + "learning_rate": 0.0001419111114073712, + "loss": 0.0394, + "step": 2374 + }, + { + "epoch": 1.55, + "grad_norm": 0.016404815018177032, + "learning_rate": 0.00014180817015887806, + "loss": 0.0015, + "step": 2375 + }, + { + "epoch": 1.56, + "grad_norm": 0.16899968683719635, + "learning_rate": 0.00014170523277991486, + "loss": 0.0178, + "step": 2376 + }, + { + "epoch": 1.56, + "grad_norm": 0.052522968500852585, + "learning_rate": 0.00014160229931910556, + "loss": 0.0038, + "step": 2377 + }, + { + "epoch": 1.56, + "grad_norm": 0.2541167438030243, + "learning_rate": 0.00014149936982507224, + "loss": 0.0218, + "step": 2378 + }, + { + "epoch": 1.56, + "grad_norm": 0.09515637159347534, + "learning_rate": 0.00014139644434643515, + "loss": 0.0074, + "step": 2379 + }, + { + "epoch": 1.56, + "grad_norm": 0.17533494532108307, + "learning_rate": 0.00014129352293181264, + "loss": 0.0184, + "step": 2380 + }, + { + "epoch": 1.56, + "grad_norm": 0.03894618898630142, + "learning_rate": 0.00014119060562982116, + "loss": 0.0035, + "step": 2381 + }, + { + "epoch": 1.56, + "grad_norm": 0.1853707730770111, + "learning_rate": 0.00014108769248907522, + "loss": 0.0965, + "step": 2382 + }, + { + "epoch": 1.56, + "grad_norm": 0.1522863656282425, + "learning_rate": 0.00014098478355818725, + "loss": 0.0083, + "step": 2383 + }, + { + "epoch": 1.56, + "grad_norm": 0.04990135878324509, + "learning_rate": 0.0001408818788857678, + "loss": 0.0043, + "step": 2384 + }, + { + "epoch": 1.56, + "grad_norm": 0.3627833127975464, + "learning_rate": 0.00014077897852042545, + "loss": 0.0825, + "step": 2385 + }, + { + "epoch": 1.56, + "grad_norm": 0.07105204463005066, + "learning_rate": 0.00014067608251076664, + "loss": 0.0069, + "step": 2386 + }, + { + "epoch": 1.56, + "grad_norm": 0.11619725823402405, + "learning_rate": 0.0001405731909053958, + "loss": 0.0201, + "step": 2387 + }, + { + "epoch": 1.56, + "grad_norm": 0.17097829282283783, + "learning_rate": 0.00014047030375291528, + "loss": 0.0413, + "step": 2388 + }, + { + "epoch": 1.56, + "grad_norm": 0.05021402984857559, + "learning_rate": 0.0001403674211019253, + "loss": 0.0035, + "step": 2389 + }, + { + "epoch": 1.56, + "grad_norm": 0.1278286725282669, + "learning_rate": 0.000140264543001024, + "loss": 0.0255, + "step": 2390 + }, + { + "epoch": 1.57, + "grad_norm": 0.02548815682530403, + "learning_rate": 0.0001401616694988074, + "loss": 0.0043, + "step": 2391 + }, + { + "epoch": 1.57, + "grad_norm": 0.08917523175477982, + "learning_rate": 0.00014005880064386916, + "loss": 0.0121, + "step": 2392 + }, + { + "epoch": 1.57, + "grad_norm": 0.11778222769498825, + "learning_rate": 0.00013995593648480099, + "loss": 0.0064, + "step": 2393 + }, + { + "epoch": 1.57, + "grad_norm": 0.10804206132888794, + "learning_rate": 0.00013985307707019222, + "loss": 0.0081, + "step": 2394 + }, + { + "epoch": 1.57, + "grad_norm": 0.03458308428525925, + "learning_rate": 0.00013975022244863005, + "loss": 0.0046, + "step": 2395 + }, + { + "epoch": 1.57, + "grad_norm": 0.21902398765087128, + "learning_rate": 0.00013964737266869927, + "loss": 0.0564, + "step": 2396 + }, + { + "epoch": 1.57, + "grad_norm": 0.08411278575658798, + "learning_rate": 0.0001395445277789825, + "loss": 0.0151, + "step": 2397 + }, + { + "epoch": 1.57, + "grad_norm": 0.25032511353492737, + "learning_rate": 0.00013944168782806013, + "loss": 0.0301, + "step": 2398 + }, + { + "epoch": 1.57, + "grad_norm": 0.042354997247457504, + "learning_rate": 0.00013933885286450992, + "loss": 0.0061, + "step": 2399 + }, + { + "epoch": 1.57, + "grad_norm": 0.12194197624921799, + "learning_rate": 0.00013923602293690755, + "loss": 0.0138, + "step": 2400 + }, + { + "epoch": 1.57, + "grad_norm": 0.1070348247885704, + "learning_rate": 0.00013913319809382625, + "loss": 0.0312, + "step": 2401 + }, + { + "epoch": 1.57, + "grad_norm": 0.10185689479112625, + "learning_rate": 0.00013903037838383677, + "loss": 0.0389, + "step": 2402 + }, + { + "epoch": 1.57, + "grad_norm": 0.08572839200496674, + "learning_rate": 0.00013892756385550754, + "loss": 0.0101, + "step": 2403 + }, + { + "epoch": 1.57, + "grad_norm": 0.11028925329446793, + "learning_rate": 0.00013882475455740447, + "loss": 0.0129, + "step": 2404 + }, + { + "epoch": 1.57, + "grad_norm": 0.11871008574962616, + "learning_rate": 0.00013872195053809107, + "loss": 0.0114, + "step": 2405 + }, + { + "epoch": 1.58, + "grad_norm": 0.03218941390514374, + "learning_rate": 0.00013861915184612832, + "loss": 0.0043, + "step": 2406 + }, + { + "epoch": 1.58, + "grad_norm": 0.02190096117556095, + "learning_rate": 0.0001385163585300746, + "loss": 0.003, + "step": 2407 + }, + { + "epoch": 1.58, + "grad_norm": 0.08264799416065216, + "learning_rate": 0.00013841357063848586, + "loss": 0.0097, + "step": 2408 + }, + { + "epoch": 1.58, + "grad_norm": 0.06024307757616043, + "learning_rate": 0.0001383107882199155, + "loss": 0.006, + "step": 2409 + }, + { + "epoch": 1.58, + "grad_norm": 0.2193279266357422, + "learning_rate": 0.00013820801132291425, + "loss": 0.0193, + "step": 2410 + }, + { + "epoch": 1.58, + "grad_norm": 0.01249188743531704, + "learning_rate": 0.00013810523999603026, + "loss": 0.002, + "step": 2411 + }, + { + "epoch": 1.58, + "grad_norm": 0.11805452406406403, + "learning_rate": 0.00013800247428780908, + "loss": 0.0133, + "step": 2412 + }, + { + "epoch": 1.58, + "grad_norm": 0.058763302862644196, + "learning_rate": 0.0001378997142467936, + "loss": 0.0078, + "step": 2413 + }, + { + "epoch": 1.58, + "grad_norm": 0.0065534659661352634, + "learning_rate": 0.000137796959921524, + "loss": 0.0012, + "step": 2414 + }, + { + "epoch": 1.58, + "grad_norm": 0.13317126035690308, + "learning_rate": 0.00013769421136053777, + "loss": 0.006, + "step": 2415 + }, + { + "epoch": 1.58, + "grad_norm": 0.01397017389535904, + "learning_rate": 0.0001375914686123697, + "loss": 0.0022, + "step": 2416 + }, + { + "epoch": 1.58, + "grad_norm": 0.1307278871536255, + "learning_rate": 0.00013748873172555182, + "loss": 0.0138, + "step": 2417 + }, + { + "epoch": 1.58, + "grad_norm": 0.005166274029761553, + "learning_rate": 0.00013738600074861339, + "loss": 0.0009, + "step": 2418 + }, + { + "epoch": 1.58, + "grad_norm": 0.01865355297923088, + "learning_rate": 0.00013728327573008092, + "loss": 0.0018, + "step": 2419 + }, + { + "epoch": 1.58, + "grad_norm": 0.0630246177315712, + "learning_rate": 0.000137180556718478, + "loss": 0.0042, + "step": 2420 + }, + { + "epoch": 1.58, + "grad_norm": 0.02148517221212387, + "learning_rate": 0.00013707784376232546, + "loss": 0.0026, + "step": 2421 + }, + { + "epoch": 1.59, + "grad_norm": 0.15195278823375702, + "learning_rate": 0.00013697513691014127, + "loss": 0.0309, + "step": 2422 + }, + { + "epoch": 1.59, + "grad_norm": 0.026208559051156044, + "learning_rate": 0.00013687243621044056, + "loss": 0.0021, + "step": 2423 + }, + { + "epoch": 1.59, + "grad_norm": 0.04777579382061958, + "learning_rate": 0.0001367697417117354, + "loss": 0.0049, + "step": 2424 + }, + { + "epoch": 1.59, + "grad_norm": 0.034857913851737976, + "learning_rate": 0.00013666705346253508, + "loss": 0.0035, + "step": 2425 + }, + { + "epoch": 1.59, + "grad_norm": 0.14037910103797913, + "learning_rate": 0.00013656437151134587, + "loss": 0.0223, + "step": 2426 + }, + { + "epoch": 1.59, + "grad_norm": 0.019575120881199837, + "learning_rate": 0.00013646169590667115, + "loss": 0.0024, + "step": 2427 + }, + { + "epoch": 1.59, + "grad_norm": 0.18282446265220642, + "learning_rate": 0.00013635902669701115, + "loss": 0.0365, + "step": 2428 + }, + { + "epoch": 1.59, + "grad_norm": 0.10844237357378006, + "learning_rate": 0.0001362563639308632, + "loss": 0.0349, + "step": 2429 + }, + { + "epoch": 1.59, + "grad_norm": 0.015698986127972603, + "learning_rate": 0.00013615370765672152, + "loss": 0.0021, + "step": 2430 + }, + { + "epoch": 1.59, + "grad_norm": 0.009203084744513035, + "learning_rate": 0.00013605105792307732, + "loss": 0.0013, + "step": 2431 + }, + { + "epoch": 1.59, + "grad_norm": 0.20686601102352142, + "learning_rate": 0.00013594841477841874, + "loss": 0.0482, + "step": 2432 + }, + { + "epoch": 1.59, + "grad_norm": 0.2730505168437958, + "learning_rate": 0.0001358457782712307, + "loss": 0.0469, + "step": 2433 + }, + { + "epoch": 1.59, + "grad_norm": 0.1980660855770111, + "learning_rate": 0.00013574314844999502, + "loss": 0.0216, + "step": 2434 + }, + { + "epoch": 1.59, + "grad_norm": 0.1146879717707634, + "learning_rate": 0.00013564052536319045, + "loss": 0.0404, + "step": 2435 + }, + { + "epoch": 1.59, + "grad_norm": 0.13500703871250153, + "learning_rate": 0.0001355379090592925, + "loss": 0.0688, + "step": 2436 + }, + { + "epoch": 1.6, + "grad_norm": 0.1410558968782425, + "learning_rate": 0.0001354352995867734, + "loss": 0.011, + "step": 2437 + }, + { + "epoch": 1.6, + "grad_norm": 0.039574917405843735, + "learning_rate": 0.0001353326969941023, + "loss": 0.0059, + "step": 2438 + }, + { + "epoch": 1.6, + "grad_norm": 0.22806905210018158, + "learning_rate": 0.000135230101329745, + "loss": 0.0196, + "step": 2439 + }, + { + "epoch": 1.6, + "grad_norm": 0.06916101276874542, + "learning_rate": 0.00013512751264216407, + "loss": 0.0076, + "step": 2440 + }, + { + "epoch": 1.6, + "grad_norm": 0.2902592420578003, + "learning_rate": 0.00013502493097981874, + "loss": 0.0285, + "step": 2441 + }, + { + "epoch": 1.6, + "grad_norm": 0.16353599727153778, + "learning_rate": 0.00013492235639116495, + "loss": 0.0165, + "step": 2442 + }, + { + "epoch": 1.6, + "grad_norm": 0.04453244060277939, + "learning_rate": 0.00013481978892465528, + "loss": 0.008, + "step": 2443 + }, + { + "epoch": 1.6, + "grad_norm": 0.06091325357556343, + "learning_rate": 0.00013471722862873903, + "loss": 0.0098, + "step": 2444 + }, + { + "epoch": 1.6, + "grad_norm": 0.05872859060764313, + "learning_rate": 0.00013461467555186203, + "loss": 0.0077, + "step": 2445 + }, + { + "epoch": 1.6, + "grad_norm": 0.2814798355102539, + "learning_rate": 0.00013451212974246668, + "loss": 0.0367, + "step": 2446 + }, + { + "epoch": 1.6, + "grad_norm": 0.09219411015510559, + "learning_rate": 0.00013440959124899198, + "loss": 0.0454, + "step": 2447 + }, + { + "epoch": 1.6, + "grad_norm": 0.1083960011601448, + "learning_rate": 0.0001343070601198735, + "loss": 0.0319, + "step": 2448 + }, + { + "epoch": 1.6, + "grad_norm": 0.0786982923746109, + "learning_rate": 0.00013420453640354335, + "loss": 0.0128, + "step": 2449 + }, + { + "epoch": 1.6, + "grad_norm": 0.020560231059789658, + "learning_rate": 0.00013410202014843, + "loss": 0.0043, + "step": 2450 + }, + { + "epoch": 1.6, + "grad_norm": 0.14801499247550964, + "learning_rate": 0.0001339995114029586, + "loss": 0.0191, + "step": 2451 + }, + { + "epoch": 1.61, + "grad_norm": 0.13155722618103027, + "learning_rate": 0.00013389701021555056, + "loss": 0.0216, + "step": 2452 + }, + { + "epoch": 1.61, + "grad_norm": 0.1539149135351181, + "learning_rate": 0.00013379451663462388, + "loss": 0.0283, + "step": 2453 + }, + { + "epoch": 1.61, + "grad_norm": 0.04807708412408829, + "learning_rate": 0.0001336920307085928, + "loss": 0.004, + "step": 2454 + }, + { + "epoch": 1.61, + "grad_norm": 0.08267413824796677, + "learning_rate": 0.0001335895524858681, + "loss": 0.009, + "step": 2455 + }, + { + "epoch": 1.61, + "grad_norm": 0.1063155010342598, + "learning_rate": 0.00013348708201485688, + "loss": 0.0482, + "step": 2456 + }, + { + "epoch": 1.61, + "grad_norm": 0.1579791158437729, + "learning_rate": 0.0001333846193439625, + "loss": 0.0231, + "step": 2457 + }, + { + "epoch": 1.61, + "grad_norm": 0.14827631413936615, + "learning_rate": 0.00013328216452158478, + "loss": 0.0184, + "step": 2458 + }, + { + "epoch": 1.61, + "grad_norm": 0.09377805888652802, + "learning_rate": 0.0001331797175961196, + "loss": 0.0242, + "step": 2459 + }, + { + "epoch": 1.61, + "grad_norm": 0.13285934925079346, + "learning_rate": 0.00013307727861595938, + "loss": 0.0322, + "step": 2460 + }, + { + "epoch": 1.61, + "grad_norm": 0.07158241420984268, + "learning_rate": 0.0001329748476294926, + "loss": 0.0152, + "step": 2461 + }, + { + "epoch": 1.61, + "grad_norm": 0.03591454401612282, + "learning_rate": 0.00013287242468510408, + "loss": 0.0052, + "step": 2462 + }, + { + "epoch": 1.61, + "grad_norm": 0.04562580958008766, + "learning_rate": 0.0001327700098311747, + "loss": 0.0078, + "step": 2463 + }, + { + "epoch": 1.61, + "grad_norm": 0.22357186675071716, + "learning_rate": 0.00013266760311608168, + "loss": 0.0426, + "step": 2464 + }, + { + "epoch": 1.61, + "grad_norm": 0.019299369305372238, + "learning_rate": 0.0001325652045881983, + "loss": 0.0027, + "step": 2465 + }, + { + "epoch": 1.61, + "grad_norm": 0.1803794503211975, + "learning_rate": 0.00013246281429589397, + "loss": 0.0299, + "step": 2466 + }, + { + "epoch": 1.62, + "grad_norm": 0.08838968724012375, + "learning_rate": 0.00013236043228753431, + "loss": 0.0078, + "step": 2467 + }, + { + "epoch": 1.62, + "grad_norm": 0.07040340453386307, + "learning_rate": 0.00013225805861148086, + "loss": 0.0167, + "step": 2468 + }, + { + "epoch": 1.62, + "grad_norm": 0.07686702907085419, + "learning_rate": 0.00013215569331609134, + "loss": 0.0068, + "step": 2469 + }, + { + "epoch": 1.62, + "grad_norm": 0.10085508972406387, + "learning_rate": 0.0001320533364497195, + "loss": 0.0228, + "step": 2470 + }, + { + "epoch": 1.62, + "grad_norm": 0.060376573354005814, + "learning_rate": 0.0001319509880607151, + "loss": 0.0095, + "step": 2471 + }, + { + "epoch": 1.62, + "grad_norm": 0.028011616319417953, + "learning_rate": 0.00013184864819742385, + "loss": 0.0051, + "step": 2472 + }, + { + "epoch": 1.62, + "grad_norm": 0.04475580155849457, + "learning_rate": 0.00013174631690818749, + "loss": 0.0059, + "step": 2473 + }, + { + "epoch": 1.62, + "grad_norm": 0.031112445518374443, + "learning_rate": 0.00013164399424134374, + "loss": 0.0043, + "step": 2474 + }, + { + "epoch": 1.62, + "grad_norm": 0.38572391867637634, + "learning_rate": 0.00013154168024522616, + "loss": 0.0355, + "step": 2475 + }, + { + "epoch": 1.62, + "grad_norm": 0.2470254898071289, + "learning_rate": 0.00013143937496816422, + "loss": 0.0146, + "step": 2476 + }, + { + "epoch": 1.62, + "grad_norm": 0.08601324260234833, + "learning_rate": 0.00013133707845848334, + "loss": 0.0132, + "step": 2477 + }, + { + "epoch": 1.62, + "grad_norm": 0.09904181212186813, + "learning_rate": 0.00013123479076450478, + "loss": 0.0249, + "step": 2478 + }, + { + "epoch": 1.62, + "grad_norm": 0.013944767415523529, + "learning_rate": 0.00013113251193454557, + "loss": 0.0016, + "step": 2479 + }, + { + "epoch": 1.62, + "grad_norm": 0.05794617161154747, + "learning_rate": 0.00013103024201691868, + "loss": 0.0054, + "step": 2480 + }, + { + "epoch": 1.62, + "grad_norm": 0.25573623180389404, + "learning_rate": 0.00013092798105993273, + "loss": 0.0472, + "step": 2481 + }, + { + "epoch": 1.62, + "grad_norm": 0.14380040764808655, + "learning_rate": 0.00013082572911189217, + "loss": 0.0255, + "step": 2482 + }, + { + "epoch": 1.63, + "grad_norm": 0.0981423556804657, + "learning_rate": 0.0001307234862210972, + "loss": 0.0067, + "step": 2483 + }, + { + "epoch": 1.63, + "grad_norm": 0.19898656010627747, + "learning_rate": 0.0001306212524358438, + "loss": 0.0079, + "step": 2484 + }, + { + "epoch": 1.63, + "grad_norm": 0.1341608613729477, + "learning_rate": 0.00013051902780442348, + "loss": 0.0093, + "step": 2485 + }, + { + "epoch": 1.63, + "grad_norm": 0.294636070728302, + "learning_rate": 0.00013041681237512358, + "loss": 0.0619, + "step": 2486 + }, + { + "epoch": 1.63, + "grad_norm": 0.03222107142210007, + "learning_rate": 0.00013031460619622706, + "loss": 0.0044, + "step": 2487 + }, + { + "epoch": 1.63, + "grad_norm": 0.11569062620401382, + "learning_rate": 0.00013021240931601247, + "loss": 0.0113, + "step": 2488 + }, + { + "epoch": 1.63, + "grad_norm": 0.25324419140815735, + "learning_rate": 0.000130110221782754, + "loss": 0.0169, + "step": 2489 + }, + { + "epoch": 1.63, + "grad_norm": 0.3488900065422058, + "learning_rate": 0.00013000804364472144, + "loss": 0.0328, + "step": 2490 + }, + { + "epoch": 1.63, + "grad_norm": 0.053114596754312515, + "learning_rate": 0.00012990587495018005, + "loss": 0.0059, + "step": 2491 + }, + { + "epoch": 1.63, + "grad_norm": 0.028155844658613205, + "learning_rate": 0.0001298037157473908, + "loss": 0.0036, + "step": 2492 + }, + { + "epoch": 1.63, + "grad_norm": 0.019256332889199257, + "learning_rate": 0.00012970156608461, + "loss": 0.0024, + "step": 2493 + }, + { + "epoch": 1.63, + "grad_norm": 0.2722720205783844, + "learning_rate": 0.00012959942601008953, + "loss": 0.017, + "step": 2494 + }, + { + "epoch": 1.63, + "grad_norm": 0.02251126803457737, + "learning_rate": 0.00012949729557207678, + "loss": 0.0018, + "step": 2495 + }, + { + "epoch": 1.63, + "grad_norm": 0.11708138138055801, + "learning_rate": 0.00012939517481881448, + "loss": 0.0076, + "step": 2496 + }, + { + "epoch": 1.63, + "grad_norm": 0.07980841398239136, + "learning_rate": 0.00012929306379854096, + "loss": 0.0118, + "step": 2497 + }, + { + "epoch": 1.64, + "grad_norm": 0.04632432758808136, + "learning_rate": 0.00012919096255948974, + "loss": 0.0044, + "step": 2498 + }, + { + "epoch": 1.64, + "grad_norm": 0.09860672801733017, + "learning_rate": 0.00012908887114988993, + "loss": 0.004, + "step": 2499 + }, + { + "epoch": 1.64, + "grad_norm": 0.10034430027008057, + "learning_rate": 0.00012898678961796578, + "loss": 0.0076, + "step": 2500 + }, + { + "epoch": 1.64, + "grad_norm": 0.3471710979938507, + "learning_rate": 0.00012888471801193702, + "loss": 0.0284, + "step": 2501 + }, + { + "epoch": 1.64, + "grad_norm": 0.219834566116333, + "learning_rate": 0.00012878265638001867, + "loss": 0.0278, + "step": 2502 + }, + { + "epoch": 1.64, + "grad_norm": 0.1285816878080368, + "learning_rate": 0.00012868060477042105, + "loss": 0.0115, + "step": 2503 + }, + { + "epoch": 1.64, + "grad_norm": 0.09379381686449051, + "learning_rate": 0.00012857856323134965, + "loss": 0.0052, + "step": 2504 + }, + { + "epoch": 1.64, + "grad_norm": 0.1482100635766983, + "learning_rate": 0.00012847653181100534, + "loss": 0.0259, + "step": 2505 + }, + { + "epoch": 1.64, + "grad_norm": 0.062140848487615585, + "learning_rate": 0.00012837451055758414, + "loss": 0.0051, + "step": 2506 + }, + { + "epoch": 1.64, + "grad_norm": 0.12353700399398804, + "learning_rate": 0.00012827249951927723, + "loss": 0.0227, + "step": 2507 + }, + { + "epoch": 1.64, + "grad_norm": 0.013850602321326733, + "learning_rate": 0.00012817049874427108, + "loss": 0.0008, + "step": 2508 + }, + { + "epoch": 1.64, + "grad_norm": 0.006505441851913929, + "learning_rate": 0.00012806850828074717, + "loss": 0.0009, + "step": 2509 + }, + { + "epoch": 1.64, + "grad_norm": 0.25963035225868225, + "learning_rate": 0.0001279665281768822, + "loss": 0.014, + "step": 2510 + }, + { + "epoch": 1.64, + "grad_norm": 0.16497601568698883, + "learning_rate": 0.00012786455848084793, + "loss": 0.0103, + "step": 2511 + }, + { + "epoch": 1.64, + "grad_norm": 0.036567408591508865, + "learning_rate": 0.0001277625992408113, + "loss": 0.0033, + "step": 2512 + }, + { + "epoch": 1.65, + "grad_norm": 0.1094021275639534, + "learning_rate": 0.00012766065050493416, + "loss": 0.0035, + "step": 2513 + }, + { + "epoch": 1.65, + "grad_norm": 0.029884997755289078, + "learning_rate": 0.00012755871232137354, + "loss": 0.0018, + "step": 2514 + }, + { + "epoch": 1.65, + "grad_norm": 0.19183427095413208, + "learning_rate": 0.00012745678473828138, + "loss": 0.0354, + "step": 2515 + }, + { + "epoch": 1.65, + "grad_norm": 0.00821281410753727, + "learning_rate": 0.0001273548678038047, + "loss": 0.0011, + "step": 2516 + }, + { + "epoch": 1.65, + "grad_norm": 0.11743585020303726, + "learning_rate": 0.00012725296156608536, + "loss": 0.012, + "step": 2517 + }, + { + "epoch": 1.65, + "grad_norm": 0.4067364037036896, + "learning_rate": 0.00012715106607326032, + "loss": 0.0251, + "step": 2518 + }, + { + "epoch": 1.65, + "grad_norm": 0.13303601741790771, + "learning_rate": 0.0001270491813734614, + "loss": 0.022, + "step": 2519 + }, + { + "epoch": 1.65, + "grad_norm": 0.46618887782096863, + "learning_rate": 0.00012694730751481532, + "loss": 0.0312, + "step": 2520 + }, + { + "epoch": 1.65, + "grad_norm": 0.261059433221817, + "learning_rate": 0.00012684544454544364, + "loss": 0.0177, + "step": 2521 + }, + { + "epoch": 1.65, + "grad_norm": 0.3947398364543915, + "learning_rate": 0.00012674359251346284, + "loss": 0.0138, + "step": 2522 + }, + { + "epoch": 1.65, + "grad_norm": 0.02748054452240467, + "learning_rate": 0.00012664175146698422, + "loss": 0.0018, + "step": 2523 + }, + { + "epoch": 1.65, + "grad_norm": 0.159784734249115, + "learning_rate": 0.00012653992145411383, + "loss": 0.0245, + "step": 2524 + }, + { + "epoch": 1.65, + "grad_norm": 0.265787810087204, + "learning_rate": 0.00012643810252295265, + "loss": 0.0133, + "step": 2525 + }, + { + "epoch": 1.65, + "grad_norm": 0.08137747645378113, + "learning_rate": 0.00012633629472159623, + "loss": 0.0061, + "step": 2526 + }, + { + "epoch": 1.65, + "grad_norm": 0.19212502241134644, + "learning_rate": 0.000126234498098135, + "loss": 0.0457, + "step": 2527 + }, + { + "epoch": 1.65, + "grad_norm": 0.232799232006073, + "learning_rate": 0.0001261327127006541, + "loss": 0.0251, + "step": 2528 + }, + { + "epoch": 1.66, + "grad_norm": 0.2269105315208435, + "learning_rate": 0.00012603093857723336, + "loss": 0.0424, + "step": 2529 + }, + { + "epoch": 1.66, + "grad_norm": 0.044430967420339584, + "learning_rate": 0.00012592917577594718, + "loss": 0.0041, + "step": 2530 + }, + { + "epoch": 1.66, + "grad_norm": 0.1506963074207306, + "learning_rate": 0.00012582742434486476, + "loss": 0.0048, + "step": 2531 + }, + { + "epoch": 1.66, + "grad_norm": 0.005820784717798233, + "learning_rate": 0.00012572568433204986, + "loss": 0.0006, + "step": 2532 + }, + { + "epoch": 1.66, + "grad_norm": 0.035203348845243454, + "learning_rate": 0.00012562395578556086, + "loss": 0.0035, + "step": 2533 + }, + { + "epoch": 1.66, + "grad_norm": 0.27628517150878906, + "learning_rate": 0.00012552223875345072, + "loss": 0.012, + "step": 2534 + }, + { + "epoch": 1.66, + "grad_norm": 0.1673869639635086, + "learning_rate": 0.00012542053328376695, + "loss": 0.0201, + "step": 2535 + }, + { + "epoch": 1.66, + "grad_norm": 0.013920975849032402, + "learning_rate": 0.0001253188394245516, + "loss": 0.0011, + "step": 2536 + }, + { + "epoch": 1.66, + "grad_norm": 0.11940686404705048, + "learning_rate": 0.0001252171572238412, + "loss": 0.0078, + "step": 2537 + }, + { + "epoch": 1.66, + "grad_norm": 0.1563238948583603, + "learning_rate": 0.00012511548672966696, + "loss": 0.0158, + "step": 2538 + }, + { + "epoch": 1.66, + "grad_norm": 0.4310009777545929, + "learning_rate": 0.00012501382799005425, + "loss": 0.0919, + "step": 2539 + }, + { + "epoch": 1.66, + "grad_norm": 0.10280542075634003, + "learning_rate": 0.00012491218105302313, + "loss": 0.0325, + "step": 2540 + }, + { + "epoch": 1.66, + "grad_norm": 0.005980873014777899, + "learning_rate": 0.000124810545966588, + "loss": 0.0007, + "step": 2541 + }, + { + "epoch": 1.66, + "grad_norm": 0.17933742702007294, + "learning_rate": 0.00012470892277875774, + "loss": 0.0362, + "step": 2542 + }, + { + "epoch": 1.66, + "grad_norm": 0.14791390299797058, + "learning_rate": 0.00012460731153753543, + "loss": 0.052, + "step": 2543 + }, + { + "epoch": 1.67, + "grad_norm": 0.09524931013584137, + "learning_rate": 0.00012450571229091865, + "loss": 0.0138, + "step": 2544 + }, + { + "epoch": 1.67, + "grad_norm": 0.14938437938690186, + "learning_rate": 0.00012440412508689928, + "loss": 0.0098, + "step": 2545 + }, + { + "epoch": 1.67, + "grad_norm": 0.22291189432144165, + "learning_rate": 0.00012430254997346354, + "loss": 0.0275, + "step": 2546 + }, + { + "epoch": 1.67, + "grad_norm": 0.09883278608322144, + "learning_rate": 0.00012420098699859192, + "loss": 0.0043, + "step": 2547 + }, + { + "epoch": 1.67, + "grad_norm": 0.09738563001155853, + "learning_rate": 0.0001240994362102591, + "loss": 0.009, + "step": 2548 + }, + { + "epoch": 1.67, + "grad_norm": 0.11794974654912949, + "learning_rate": 0.00012399789765643411, + "loss": 0.0264, + "step": 2549 + }, + { + "epoch": 1.67, + "grad_norm": 0.13492193818092346, + "learning_rate": 0.0001238963713850802, + "loss": 0.0309, + "step": 2550 + }, + { + "epoch": 1.67, + "grad_norm": 0.19299760460853577, + "learning_rate": 0.00012379485744415476, + "loss": 0.0204, + "step": 2551 + }, + { + "epoch": 1.67, + "grad_norm": 0.0748491957783699, + "learning_rate": 0.00012369335588160933, + "loss": 0.011, + "step": 2552 + }, + { + "epoch": 1.67, + "grad_norm": 0.11878203600645065, + "learning_rate": 0.0001235918667453897, + "loss": 0.0218, + "step": 2553 + }, + { + "epoch": 1.67, + "grad_norm": 0.06896767765283585, + "learning_rate": 0.00012349039008343568, + "loss": 0.0074, + "step": 2554 + }, + { + "epoch": 1.67, + "grad_norm": 0.0966789722442627, + "learning_rate": 0.0001233889259436813, + "loss": 0.0162, + "step": 2555 + }, + { + "epoch": 1.67, + "grad_norm": 0.13032063841819763, + "learning_rate": 0.00012328747437405466, + "loss": 0.0299, + "step": 2556 + }, + { + "epoch": 1.67, + "grad_norm": 0.056150924414396286, + "learning_rate": 0.0001231860354224778, + "loss": 0.0147, + "step": 2557 + }, + { + "epoch": 1.67, + "grad_norm": 0.18075281381607056, + "learning_rate": 0.0001230846091368669, + "loss": 0.0511, + "step": 2558 + }, + { + "epoch": 1.68, + "grad_norm": 0.02879105694591999, + "learning_rate": 0.00012298319556513216, + "loss": 0.003, + "step": 2559 + }, + { + "epoch": 1.68, + "grad_norm": 0.0305055920034647, + "learning_rate": 0.0001228817947551778, + "loss": 0.0038, + "step": 2560 + }, + { + "epoch": 1.68, + "grad_norm": 0.08900358527898788, + "learning_rate": 0.00012278040675490186, + "loss": 0.011, + "step": 2561 + }, + { + "epoch": 1.68, + "grad_norm": 0.22078153491020203, + "learning_rate": 0.0001226790316121965, + "loss": 0.0269, + "step": 2562 + }, + { + "epoch": 1.68, + "grad_norm": 0.11649607867002487, + "learning_rate": 0.00012257766937494774, + "loss": 0.0144, + "step": 2563 + }, + { + "epoch": 1.68, + "grad_norm": 0.08650204539299011, + "learning_rate": 0.00012247632009103552, + "loss": 0.0122, + "step": 2564 + }, + { + "epoch": 1.68, + "grad_norm": 0.1292939931154251, + "learning_rate": 0.0001223749838083336, + "loss": 0.0151, + "step": 2565 + }, + { + "epoch": 1.68, + "grad_norm": 0.05495935305953026, + "learning_rate": 0.00012227366057470968, + "loss": 0.007, + "step": 2566 + }, + { + "epoch": 1.68, + "grad_norm": 0.10544559359550476, + "learning_rate": 0.00012217235043802526, + "loss": 0.0161, + "step": 2567 + }, + { + "epoch": 1.68, + "grad_norm": 0.17187099158763885, + "learning_rate": 0.00012207105344613566, + "loss": 0.0419, + "step": 2568 + }, + { + "epoch": 1.68, + "grad_norm": 0.09299924224615097, + "learning_rate": 0.00012196976964689001, + "loss": 0.0047, + "step": 2569 + }, + { + "epoch": 1.68, + "grad_norm": 0.14098414778709412, + "learning_rate": 0.00012186849908813111, + "loss": 0.0273, + "step": 2570 + }, + { + "epoch": 1.68, + "grad_norm": 0.18325307965278625, + "learning_rate": 0.00012176724181769564, + "loss": 0.0292, + "step": 2571 + }, + { + "epoch": 1.68, + "grad_norm": 0.23543445765972137, + "learning_rate": 0.00012166599788341393, + "loss": 0.061, + "step": 2572 + }, + { + "epoch": 1.68, + "grad_norm": 0.23078452050685883, + "learning_rate": 0.00012156476733311005, + "loss": 0.0406, + "step": 2573 + }, + { + "epoch": 1.69, + "grad_norm": 0.07733777910470963, + "learning_rate": 0.00012146355021460166, + "loss": 0.0074, + "step": 2574 + }, + { + "epoch": 1.69, + "grad_norm": 0.17538850009441376, + "learning_rate": 0.00012136234657570018, + "loss": 0.023, + "step": 2575 + }, + { + "epoch": 1.69, + "grad_norm": 0.1247786208987236, + "learning_rate": 0.00012126115646421062, + "loss": 0.0061, + "step": 2576 + }, + { + "epoch": 1.69, + "grad_norm": 0.011379680596292019, + "learning_rate": 0.00012115997992793163, + "loss": 0.0014, + "step": 2577 + }, + { + "epoch": 1.69, + "grad_norm": 0.029602685943245888, + "learning_rate": 0.00012105881701465533, + "loss": 0.0037, + "step": 2578 + }, + { + "epoch": 1.69, + "grad_norm": 0.23150865733623505, + "learning_rate": 0.00012095766777216755, + "loss": 0.027, + "step": 2579 + }, + { + "epoch": 1.69, + "grad_norm": 0.2272222638130188, + "learning_rate": 0.00012085653224824761, + "loss": 0.0493, + "step": 2580 + }, + { + "epoch": 1.69, + "grad_norm": 0.2526804208755493, + "learning_rate": 0.00012075541049066832, + "loss": 0.036, + "step": 2581 + }, + { + "epoch": 1.69, + "grad_norm": 0.29018113017082214, + "learning_rate": 0.00012065430254719608, + "loss": 0.0353, + "step": 2582 + }, + { + "epoch": 1.69, + "grad_norm": 0.062286440283060074, + "learning_rate": 0.0001205532084655906, + "loss": 0.0081, + "step": 2583 + }, + { + "epoch": 1.69, + "grad_norm": 0.07457780838012695, + "learning_rate": 0.00012045212829360517, + "loss": 0.0109, + "step": 2584 + }, + { + "epoch": 1.69, + "grad_norm": 0.34257975220680237, + "learning_rate": 0.0001203510620789865, + "loss": 0.032, + "step": 2585 + }, + { + "epoch": 1.69, + "grad_norm": 0.2395414412021637, + "learning_rate": 0.00012025000986947471, + "loss": 0.0255, + "step": 2586 + }, + { + "epoch": 1.69, + "grad_norm": 0.07272249460220337, + "learning_rate": 0.00012014897171280323, + "loss": 0.0086, + "step": 2587 + }, + { + "epoch": 1.69, + "grad_norm": 0.14450271427631378, + "learning_rate": 0.0001200479476566989, + "loss": 0.0217, + "step": 2588 + }, + { + "epoch": 1.69, + "grad_norm": 0.24903085827827454, + "learning_rate": 0.00011994693774888192, + "loss": 0.0251, + "step": 2589 + }, + { + "epoch": 1.7, + "grad_norm": 0.13333503901958466, + "learning_rate": 0.00011984594203706583, + "loss": 0.0176, + "step": 2590 + }, + { + "epoch": 1.7, + "grad_norm": 0.061313144862651825, + "learning_rate": 0.00011974496056895735, + "loss": 0.0043, + "step": 2591 + }, + { + "epoch": 1.7, + "grad_norm": 0.21405766904354095, + "learning_rate": 0.00011964399339225658, + "loss": 0.0227, + "step": 2592 + }, + { + "epoch": 1.7, + "grad_norm": 0.11095394939184189, + "learning_rate": 0.00011954304055465683, + "loss": 0.0208, + "step": 2593 + }, + { + "epoch": 1.7, + "grad_norm": 0.11780749261379242, + "learning_rate": 0.00011944210210384464, + "loss": 0.0139, + "step": 2594 + }, + { + "epoch": 1.7, + "grad_norm": 0.21892563998699188, + "learning_rate": 0.00011934117808749978, + "loss": 0.0234, + "step": 2595 + }, + { + "epoch": 1.7, + "grad_norm": 0.08663798868656158, + "learning_rate": 0.00011924026855329511, + "loss": 0.0085, + "step": 2596 + }, + { + "epoch": 1.7, + "grad_norm": 0.18243402242660522, + "learning_rate": 0.00011913937354889678, + "loss": 0.035, + "step": 2597 + }, + { + "epoch": 1.7, + "grad_norm": 0.023229578509926796, + "learning_rate": 0.00011903849312196398, + "loss": 0.0015, + "step": 2598 + }, + { + "epoch": 1.7, + "grad_norm": 0.14799842238426208, + "learning_rate": 0.00011893762732014909, + "loss": 0.0214, + "step": 2599 + }, + { + "epoch": 1.7, + "grad_norm": 0.29807209968566895, + "learning_rate": 0.00011883677619109746, + "loss": 0.024, + "step": 2600 + }, + { + "epoch": 1.7, + "grad_norm": 0.1408165544271469, + "learning_rate": 0.00011873593978244771, + "loss": 0.0106, + "step": 2601 + }, + { + "epoch": 1.7, + "grad_norm": 0.11649095267057419, + "learning_rate": 0.00011863511814183123, + "loss": 0.0208, + "step": 2602 + }, + { + "epoch": 1.7, + "grad_norm": 0.1464689075946808, + "learning_rate": 0.00011853431131687267, + "loss": 0.0253, + "step": 2603 + }, + { + "epoch": 1.7, + "grad_norm": 0.3470141589641571, + "learning_rate": 0.00011843351935518957, + "loss": 0.0329, + "step": 2604 + }, + { + "epoch": 1.71, + "grad_norm": 0.2767955958843231, + "learning_rate": 0.00011833274230439255, + "loss": 0.0307, + "step": 2605 + }, + { + "epoch": 1.71, + "grad_norm": 0.047959037125110626, + "learning_rate": 0.00011823198021208503, + "loss": 0.002, + "step": 2606 + }, + { + "epoch": 1.71, + "grad_norm": 0.05150588974356651, + "learning_rate": 0.00011813123312586349, + "loss": 0.0031, + "step": 2607 + }, + { + "epoch": 1.71, + "grad_norm": 0.21748711168766022, + "learning_rate": 0.00011803050109331725, + "loss": 0.0434, + "step": 2608 + }, + { + "epoch": 1.71, + "grad_norm": 0.26486659049987793, + "learning_rate": 0.0001179297841620286, + "loss": 0.0183, + "step": 2609 + }, + { + "epoch": 1.71, + "grad_norm": 0.04066809266805649, + "learning_rate": 0.00011782908237957265, + "loss": 0.0033, + "step": 2610 + }, + { + "epoch": 1.71, + "grad_norm": 0.13910254836082458, + "learning_rate": 0.00011772839579351726, + "loss": 0.0109, + "step": 2611 + }, + { + "epoch": 1.71, + "grad_norm": 0.24764953553676605, + "learning_rate": 0.00011762772445142329, + "loss": 0.0182, + "step": 2612 + }, + { + "epoch": 1.71, + "grad_norm": 0.0986236035823822, + "learning_rate": 0.00011752706840084428, + "loss": 0.0089, + "step": 2613 + }, + { + "epoch": 1.71, + "grad_norm": 0.322979211807251, + "learning_rate": 0.0001174264276893266, + "loss": 0.0356, + "step": 2614 + }, + { + "epoch": 1.71, + "grad_norm": 0.3145226538181305, + "learning_rate": 0.00011732580236440934, + "loss": 0.0271, + "step": 2615 + }, + { + "epoch": 1.71, + "grad_norm": 0.10612889379262924, + "learning_rate": 0.00011722519247362431, + "loss": 0.0093, + "step": 2616 + }, + { + "epoch": 1.71, + "grad_norm": 0.2893246114253998, + "learning_rate": 0.00011712459806449608, + "loss": 0.0193, + "step": 2617 + }, + { + "epoch": 1.71, + "grad_norm": 0.16755311191082, + "learning_rate": 0.00011702401918454192, + "loss": 0.0129, + "step": 2618 + }, + { + "epoch": 1.71, + "grad_norm": 0.029482614248991013, + "learning_rate": 0.00011692345588127165, + "loss": 0.0031, + "step": 2619 + }, + { + "epoch": 1.72, + "grad_norm": 0.13832448422908783, + "learning_rate": 0.00011682290820218785, + "loss": 0.0061, + "step": 2620 + }, + { + "epoch": 1.72, + "grad_norm": 0.19813022017478943, + "learning_rate": 0.00011672237619478566, + "loss": 0.0332, + "step": 2621 + }, + { + "epoch": 1.72, + "grad_norm": 0.350599080324173, + "learning_rate": 0.00011662185990655284, + "loss": 0.0375, + "step": 2622 + }, + { + "epoch": 1.72, + "grad_norm": 0.29366403818130493, + "learning_rate": 0.00011652135938496977, + "loss": 0.0523, + "step": 2623 + }, + { + "epoch": 1.72, + "grad_norm": 0.17575830221176147, + "learning_rate": 0.00011642087467750924, + "loss": 0.0345, + "step": 2624 + }, + { + "epoch": 1.72, + "grad_norm": 0.14889006316661835, + "learning_rate": 0.00011632040583163673, + "loss": 0.0164, + "step": 2625 + }, + { + "epoch": 1.72, + "grad_norm": 0.09447371959686279, + "learning_rate": 0.00011621995289481013, + "loss": 0.0103, + "step": 2626 + }, + { + "epoch": 1.72, + "grad_norm": 0.054458245635032654, + "learning_rate": 0.00011611951591447991, + "loss": 0.0036, + "step": 2627 + }, + { + "epoch": 1.72, + "grad_norm": 0.19059543311595917, + "learning_rate": 0.00011601909493808882, + "loss": 0.0166, + "step": 2628 + }, + { + "epoch": 1.72, + "grad_norm": 0.2544896900653839, + "learning_rate": 0.00011591869001307226, + "loss": 0.0297, + "step": 2629 + }, + { + "epoch": 1.72, + "grad_norm": 0.07179494202136993, + "learning_rate": 0.00011581830118685792, + "loss": 0.0104, + "step": 2630 + }, + { + "epoch": 1.72, + "grad_norm": 0.16107530891895294, + "learning_rate": 0.00011571792850686595, + "loss": 0.0107, + "step": 2631 + }, + { + "epoch": 1.72, + "grad_norm": 0.14669694006443024, + "learning_rate": 0.0001156175720205088, + "loss": 0.0268, + "step": 2632 + }, + { + "epoch": 1.72, + "grad_norm": 0.19625726342201233, + "learning_rate": 0.00011551723177519134, + "loss": 0.0296, + "step": 2633 + }, + { + "epoch": 1.72, + "grad_norm": 0.09380532801151276, + "learning_rate": 0.00011541690781831074, + "loss": 0.0082, + "step": 2634 + }, + { + "epoch": 1.73, + "grad_norm": 0.03426641598343849, + "learning_rate": 0.00011531660019725648, + "loss": 0.0042, + "step": 2635 + }, + { + "epoch": 1.73, + "grad_norm": 0.10879172384738922, + "learning_rate": 0.00011521630895941036, + "loss": 0.0259, + "step": 2636 + }, + { + "epoch": 1.73, + "grad_norm": 0.2619246542453766, + "learning_rate": 0.00011511603415214633, + "loss": 0.0394, + "step": 2637 + }, + { + "epoch": 1.73, + "grad_norm": 0.06384444236755371, + "learning_rate": 0.00011501577582283071, + "loss": 0.0063, + "step": 2638 + }, + { + "epoch": 1.73, + "grad_norm": 0.11376836895942688, + "learning_rate": 0.00011491553401882195, + "loss": 0.0164, + "step": 2639 + }, + { + "epoch": 1.73, + "grad_norm": 0.011989972554147243, + "learning_rate": 0.00011481530878747076, + "loss": 0.0013, + "step": 2640 + }, + { + "epoch": 1.73, + "grad_norm": 0.12901929020881653, + "learning_rate": 0.00011471510017611995, + "loss": 0.0059, + "step": 2641 + }, + { + "epoch": 1.73, + "grad_norm": 0.036112286150455475, + "learning_rate": 0.00011461490823210451, + "loss": 0.0034, + "step": 2642 + }, + { + "epoch": 1.73, + "grad_norm": 0.13186568021774292, + "learning_rate": 0.00011451473300275158, + "loss": 0.0125, + "step": 2643 + }, + { + "epoch": 1.73, + "grad_norm": 0.08135777711868286, + "learning_rate": 0.00011441457453538038, + "loss": 0.0076, + "step": 2644 + }, + { + "epoch": 1.73, + "grad_norm": 0.31037700176239014, + "learning_rate": 0.00011431443287730226, + "loss": 0.0438, + "step": 2645 + }, + { + "epoch": 1.73, + "grad_norm": 0.02185073494911194, + "learning_rate": 0.0001142143080758205, + "loss": 0.0025, + "step": 2646 + }, + { + "epoch": 1.73, + "grad_norm": 0.09959240257740021, + "learning_rate": 0.00011411420017823056, + "loss": 0.0079, + "step": 2647 + }, + { + "epoch": 1.73, + "grad_norm": 0.06049061939120293, + "learning_rate": 0.00011401410923181986, + "loss": 0.0053, + "step": 2648 + }, + { + "epoch": 1.73, + "grad_norm": 0.03902551159262657, + "learning_rate": 0.00011391403528386782, + "loss": 0.0029, + "step": 2649 + }, + { + "epoch": 1.73, + "grad_norm": 0.17166626453399658, + "learning_rate": 0.0001138139783816458, + "loss": 0.0218, + "step": 2650 + }, + { + "epoch": 1.74, + "grad_norm": 0.026734329760074615, + "learning_rate": 0.00011371393857241713, + "loss": 0.0027, + "step": 2651 + }, + { + "epoch": 1.74, + "grad_norm": 0.007330378983169794, + "learning_rate": 0.0001136139159034371, + "loss": 0.0007, + "step": 2652 + }, + { + "epoch": 1.74, + "grad_norm": 0.030042244121432304, + "learning_rate": 0.00011351391042195287, + "loss": 0.0023, + "step": 2653 + }, + { + "epoch": 1.74, + "grad_norm": 0.1411576271057129, + "learning_rate": 0.00011341392217520345, + "loss": 0.0126, + "step": 2654 + }, + { + "epoch": 1.74, + "grad_norm": 0.2431839257478714, + "learning_rate": 0.00011331395121041975, + "loss": 0.0162, + "step": 2655 + }, + { + "epoch": 1.74, + "grad_norm": 0.013232244178652763, + "learning_rate": 0.0001132139975748245, + "loss": 0.0011, + "step": 2656 + }, + { + "epoch": 1.74, + "grad_norm": 0.2504200041294098, + "learning_rate": 0.0001131140613156323, + "loss": 0.0161, + "step": 2657 + }, + { + "epoch": 1.74, + "grad_norm": 0.09972897917032242, + "learning_rate": 0.00011301414248004949, + "loss": 0.0057, + "step": 2658 + }, + { + "epoch": 1.74, + "grad_norm": 0.22492682933807373, + "learning_rate": 0.00011291424111527412, + "loss": 0.0339, + "step": 2659 + }, + { + "epoch": 1.74, + "grad_norm": 0.1436653435230255, + "learning_rate": 0.0001128143572684961, + "loss": 0.007, + "step": 2660 + }, + { + "epoch": 1.74, + "grad_norm": 0.0187530554831028, + "learning_rate": 0.000112714490986897, + "loss": 0.0019, + "step": 2661 + }, + { + "epoch": 1.74, + "grad_norm": 0.21609365940093994, + "learning_rate": 0.00011261464231765017, + "loss": 0.0084, + "step": 2662 + }, + { + "epoch": 1.74, + "grad_norm": 0.02223445661365986, + "learning_rate": 0.00011251481130792048, + "loss": 0.0011, + "step": 2663 + }, + { + "epoch": 1.74, + "grad_norm": 0.14209489524364471, + "learning_rate": 0.0001124149980048646, + "loss": 0.0057, + "step": 2664 + }, + { + "epoch": 1.74, + "grad_norm": 0.27440565824508667, + "learning_rate": 0.00011231520245563082, + "loss": 0.0169, + "step": 2665 + }, + { + "epoch": 1.75, + "grad_norm": 0.25544804334640503, + "learning_rate": 0.00011221542470735904, + "loss": 0.0085, + "step": 2666 + }, + { + "epoch": 1.75, + "grad_norm": 0.10816871374845505, + "learning_rate": 0.00011211566480718064, + "loss": 0.0083, + "step": 2667 + }, + { + "epoch": 1.75, + "grad_norm": 0.0025934341829270124, + "learning_rate": 0.00011201592280221872, + "loss": 0.0003, + "step": 2668 + }, + { + "epoch": 1.75, + "grad_norm": 0.11705806106328964, + "learning_rate": 0.00011191619873958785, + "loss": 0.0066, + "step": 2669 + }, + { + "epoch": 1.75, + "grad_norm": 0.15215551853179932, + "learning_rate": 0.00011181649266639416, + "loss": 0.0297, + "step": 2670 + }, + { + "epoch": 1.75, + "grad_norm": 0.29455116391181946, + "learning_rate": 0.00011171680462973526, + "loss": 0.0228, + "step": 2671 + }, + { + "epoch": 1.75, + "grad_norm": 0.38029757142066956, + "learning_rate": 0.00011161713467670022, + "loss": 0.0303, + "step": 2672 + }, + { + "epoch": 1.75, + "grad_norm": 0.3070727586746216, + "learning_rate": 0.0001115174828543696, + "loss": 0.0516, + "step": 2673 + }, + { + "epoch": 1.75, + "grad_norm": 0.1334857940673828, + "learning_rate": 0.00011141784920981539, + "loss": 0.021, + "step": 2674 + }, + { + "epoch": 1.75, + "eval_loss": 0.03871524706482887, + "eval_runtime": 39.9789, + "eval_samples_per_second": 32.192, + "eval_steps_per_second": 8.054, + "step": 2674 + }, + { + "epoch": 1.75, + "grad_norm": 0.4947819709777832, + "learning_rate": 0.00011131823379010101, + "loss": 0.0455, + "step": 2675 + }, + { + "epoch": 1.75, + "grad_norm": 0.3073294758796692, + "learning_rate": 0.00011121863664228123, + "loss": 0.0581, + "step": 2676 + }, + { + "epoch": 1.75, + "grad_norm": 0.17356103658676147, + "learning_rate": 0.0001111190578134022, + "loss": 0.0228, + "step": 2677 + }, + { + "epoch": 1.75, + "grad_norm": 0.10131116956472397, + "learning_rate": 0.00011101949735050143, + "loss": 0.0063, + "step": 2678 + }, + { + "epoch": 1.75, + "grad_norm": 0.08166798949241638, + "learning_rate": 0.00011091995530060781, + "loss": 0.0091, + "step": 2679 + }, + { + "epoch": 1.75, + "grad_norm": 0.13784568011760712, + "learning_rate": 0.0001108204317107414, + "loss": 0.0136, + "step": 2680 + }, + { + "epoch": 1.76, + "grad_norm": 0.16461403667926788, + "learning_rate": 0.00011072092662791364, + "loss": 0.0338, + "step": 2681 + }, + { + "epoch": 1.76, + "grad_norm": 0.1617322415113449, + "learning_rate": 0.00011062144009912721, + "loss": 0.0205, + "step": 2682 + }, + { + "epoch": 1.76, + "grad_norm": 0.17539392411708832, + "learning_rate": 0.000110521972171376, + "loss": 0.0044, + "step": 2683 + }, + { + "epoch": 1.76, + "grad_norm": 0.0066742608323693275, + "learning_rate": 0.00011042252289164518, + "loss": 0.0009, + "step": 2684 + }, + { + "epoch": 1.76, + "grad_norm": 0.09298042207956314, + "learning_rate": 0.000110323092306911, + "loss": 0.0209, + "step": 2685 + }, + { + "epoch": 1.76, + "grad_norm": 0.05764083191752434, + "learning_rate": 0.00011022368046414096, + "loss": 0.0038, + "step": 2686 + }, + { + "epoch": 1.76, + "grad_norm": 0.0654100552201271, + "learning_rate": 0.00011012428741029372, + "loss": 0.0095, + "step": 2687 + }, + { + "epoch": 1.76, + "grad_norm": 0.008322247304022312, + "learning_rate": 0.00011002491319231902, + "loss": 0.0006, + "step": 2688 + }, + { + "epoch": 1.76, + "grad_norm": 0.33108824491500854, + "learning_rate": 0.00010992555785715771, + "loss": 0.0333, + "step": 2689 + }, + { + "epoch": 1.76, + "grad_norm": 0.16499800980091095, + "learning_rate": 0.0001098262214517417, + "loss": 0.0243, + "step": 2690 + }, + { + "epoch": 1.76, + "grad_norm": 0.10082323104143143, + "learning_rate": 0.00010972690402299402, + "loss": 0.0077, + "step": 2691 + }, + { + "epoch": 1.76, + "grad_norm": 0.3231547772884369, + "learning_rate": 0.00010962760561782873, + "loss": 0.0376, + "step": 2692 + }, + { + "epoch": 1.76, + "grad_norm": 0.04128749296069145, + "learning_rate": 0.0001095283262831508, + "loss": 0.0052, + "step": 2693 + }, + { + "epoch": 1.76, + "grad_norm": 0.2695901691913605, + "learning_rate": 0.0001094290660658563, + "loss": 0.0303, + "step": 2694 + }, + { + "epoch": 1.76, + "grad_norm": 0.015289656817913055, + "learning_rate": 0.00010932982501283224, + "loss": 0.002, + "step": 2695 + }, + { + "epoch": 1.76, + "grad_norm": 0.10868901014328003, + "learning_rate": 0.0001092306031709566, + "loss": 0.0261, + "step": 2696 + }, + { + "epoch": 1.77, + "grad_norm": 0.029903091490268707, + "learning_rate": 0.00010913140058709824, + "loss": 0.0032, + "step": 2697 + }, + { + "epoch": 1.77, + "grad_norm": 0.1284814178943634, + "learning_rate": 0.00010903221730811692, + "loss": 0.0192, + "step": 2698 + }, + { + "epoch": 1.77, + "grad_norm": 0.09511958807706833, + "learning_rate": 0.00010893305338086334, + "loss": 0.0291, + "step": 2699 + }, + { + "epoch": 1.77, + "grad_norm": 0.14972400665283203, + "learning_rate": 0.00010883390885217896, + "loss": 0.0114, + "step": 2700 + }, + { + "epoch": 1.77, + "grad_norm": 0.14529098570346832, + "learning_rate": 0.00010873478376889625, + "loss": 0.0153, + "step": 2701 + }, + { + "epoch": 1.77, + "grad_norm": 0.12022048979997635, + "learning_rate": 0.0001086356781778383, + "loss": 0.0129, + "step": 2702 + }, + { + "epoch": 1.77, + "grad_norm": 0.3261503279209137, + "learning_rate": 0.00010853659212581911, + "loss": 0.0255, + "step": 2703 + }, + { + "epoch": 1.77, + "grad_norm": 0.026748869568109512, + "learning_rate": 0.00010843752565964337, + "loss": 0.0036, + "step": 2704 + }, + { + "epoch": 1.77, + "grad_norm": 0.13744834065437317, + "learning_rate": 0.0001083384788261066, + "loss": 0.0323, + "step": 2705 + }, + { + "epoch": 1.77, + "grad_norm": 0.06895776093006134, + "learning_rate": 0.00010823945167199499, + "loss": 0.0097, + "step": 2706 + }, + { + "epoch": 1.77, + "grad_norm": 0.03191859647631645, + "learning_rate": 0.00010814044424408552, + "loss": 0.0034, + "step": 2707 + }, + { + "epoch": 1.77, + "grad_norm": 0.0162139143794775, + "learning_rate": 0.00010804145658914571, + "loss": 0.0022, + "step": 2708 + }, + { + "epoch": 1.77, + "grad_norm": 0.611072301864624, + "learning_rate": 0.00010794248875393385, + "loss": 0.0402, + "step": 2709 + }, + { + "epoch": 1.77, + "grad_norm": 0.008421842940151691, + "learning_rate": 0.00010784354078519884, + "loss": 0.0007, + "step": 2710 + }, + { + "epoch": 1.77, + "grad_norm": 0.04360457509756088, + "learning_rate": 0.00010774461272968016, + "loss": 0.0057, + "step": 2711 + }, + { + "epoch": 1.78, + "grad_norm": 0.027826182544231415, + "learning_rate": 0.00010764570463410802, + "loss": 0.0038, + "step": 2712 + }, + { + "epoch": 1.78, + "grad_norm": 0.08522525429725647, + "learning_rate": 0.00010754681654520296, + "loss": 0.0105, + "step": 2713 + }, + { + "epoch": 1.78, + "grad_norm": 0.11638516932725906, + "learning_rate": 0.00010744794850967627, + "loss": 0.0149, + "step": 2714 + }, + { + "epoch": 1.78, + "grad_norm": 0.14275485277175903, + "learning_rate": 0.0001073491005742297, + "loss": 0.0108, + "step": 2715 + }, + { + "epoch": 1.78, + "grad_norm": 0.22823548316955566, + "learning_rate": 0.00010725027278555554, + "loss": 0.0384, + "step": 2716 + }, + { + "epoch": 1.78, + "grad_norm": 0.07148795574903488, + "learning_rate": 0.00010715146519033647, + "loss": 0.0063, + "step": 2717 + }, + { + "epoch": 1.78, + "grad_norm": 0.08536515384912491, + "learning_rate": 0.00010705267783524574, + "loss": 0.013, + "step": 2718 + }, + { + "epoch": 1.78, + "grad_norm": 0.06670022010803223, + "learning_rate": 0.00010695391076694698, + "loss": 0.0062, + "step": 2719 + }, + { + "epoch": 1.78, + "grad_norm": 0.07848575711250305, + "learning_rate": 0.00010685516403209426, + "loss": 0.0074, + "step": 2720 + }, + { + "epoch": 1.78, + "grad_norm": 0.07577262818813324, + "learning_rate": 0.000106756437677332, + "loss": 0.0049, + "step": 2721 + }, + { + "epoch": 1.78, + "grad_norm": 0.04875582456588745, + "learning_rate": 0.00010665773174929507, + "loss": 0.0071, + "step": 2722 + }, + { + "epoch": 1.78, + "grad_norm": 0.021466953679919243, + "learning_rate": 0.00010655904629460862, + "loss": 0.0028, + "step": 2723 + }, + { + "epoch": 1.78, + "grad_norm": 0.0066236392594873905, + "learning_rate": 0.00010646038135988819, + "loss": 0.0008, + "step": 2724 + }, + { + "epoch": 1.78, + "grad_norm": 0.07818492501974106, + "learning_rate": 0.00010636173699173959, + "loss": 0.0084, + "step": 2725 + }, + { + "epoch": 1.78, + "grad_norm": 0.06917870789766312, + "learning_rate": 0.0001062631132367589, + "loss": 0.0074, + "step": 2726 + }, + { + "epoch": 1.79, + "grad_norm": 0.20321956276893616, + "learning_rate": 0.00010616451014153246, + "loss": 0.0191, + "step": 2727 + }, + { + "epoch": 1.79, + "grad_norm": 0.05821048095822334, + "learning_rate": 0.00010606592775263694, + "loss": 0.004, + "step": 2728 + }, + { + "epoch": 1.79, + "grad_norm": 0.07762417942285538, + "learning_rate": 0.00010596736611663916, + "loss": 0.0028, + "step": 2729 + }, + { + "epoch": 1.79, + "grad_norm": 0.06247600540518761, + "learning_rate": 0.0001058688252800961, + "loss": 0.0032, + "step": 2730 + }, + { + "epoch": 1.79, + "grad_norm": 0.35450640320777893, + "learning_rate": 0.00010577030528955497, + "loss": 0.0227, + "step": 2731 + }, + { + "epoch": 1.79, + "grad_norm": 0.1101314052939415, + "learning_rate": 0.00010567180619155312, + "loss": 0.0064, + "step": 2732 + }, + { + "epoch": 1.79, + "grad_norm": 0.1981196254491806, + "learning_rate": 0.00010557332803261806, + "loss": 0.0467, + "step": 2733 + }, + { + "epoch": 1.79, + "grad_norm": 0.030132168903946877, + "learning_rate": 0.00010547487085926732, + "loss": 0.0024, + "step": 2734 + }, + { + "epoch": 1.79, + "grad_norm": 0.14475418627262115, + "learning_rate": 0.00010537643471800862, + "loss": 0.0316, + "step": 2735 + }, + { + "epoch": 1.79, + "grad_norm": 0.22851037979125977, + "learning_rate": 0.0001052780196553397, + "loss": 0.0124, + "step": 2736 + }, + { + "epoch": 1.79, + "grad_norm": 0.15478867292404175, + "learning_rate": 0.00010517962571774832, + "loss": 0.0209, + "step": 2737 + }, + { + "epoch": 1.79, + "grad_norm": 0.1821509748697281, + "learning_rate": 0.00010508125295171236, + "loss": 0.0229, + "step": 2738 + }, + { + "epoch": 1.79, + "grad_norm": 0.24094289541244507, + "learning_rate": 0.00010498290140369953, + "loss": 0.0679, + "step": 2739 + }, + { + "epoch": 1.79, + "grad_norm": 0.07481525093317032, + "learning_rate": 0.00010488457112016765, + "loss": 0.0076, + "step": 2740 + }, + { + "epoch": 1.79, + "grad_norm": 0.8672902584075928, + "learning_rate": 0.00010478626214756448, + "loss": 0.0177, + "step": 2741 + }, + { + "epoch": 1.8, + "grad_norm": 0.04198053479194641, + "learning_rate": 0.0001046879745323277, + "loss": 0.0032, + "step": 2742 + }, + { + "epoch": 1.8, + "grad_norm": 0.3100494146347046, + "learning_rate": 0.00010458970832088484, + "loss": 0.0334, + "step": 2743 + }, + { + "epoch": 1.8, + "grad_norm": 0.12208042293787003, + "learning_rate": 0.0001044914635596534, + "loss": 0.0125, + "step": 2744 + }, + { + "epoch": 1.8, + "grad_norm": 0.04906391724944115, + "learning_rate": 0.00010439324029504073, + "loss": 0.0023, + "step": 2745 + }, + { + "epoch": 1.8, + "grad_norm": 0.18216626346111298, + "learning_rate": 0.00010429503857344403, + "loss": 0.0179, + "step": 2746 + }, + { + "epoch": 1.8, + "grad_norm": 0.04328668490052223, + "learning_rate": 0.0001041968584412503, + "loss": 0.0042, + "step": 2747 + }, + { + "epoch": 1.8, + "grad_norm": 0.17808032035827637, + "learning_rate": 0.00010409869994483632, + "loss": 0.0058, + "step": 2748 + }, + { + "epoch": 1.8, + "grad_norm": 0.040204476565122604, + "learning_rate": 0.00010400056313056873, + "loss": 0.0035, + "step": 2749 + }, + { + "epoch": 1.8, + "grad_norm": 0.08717933297157288, + "learning_rate": 0.00010390244804480385, + "loss": 0.0096, + "step": 2750 + }, + { + "epoch": 1.8, + "grad_norm": 0.019179692491889, + "learning_rate": 0.0001038043547338878, + "loss": 0.001, + "step": 2751 + }, + { + "epoch": 1.8, + "grad_norm": 0.41493868827819824, + "learning_rate": 0.00010370628324415633, + "loss": 0.0285, + "step": 2752 + }, + { + "epoch": 1.8, + "grad_norm": 0.2256990224123001, + "learning_rate": 0.00010360823362193495, + "loss": 0.0211, + "step": 2753 + }, + { + "epoch": 1.8, + "grad_norm": 0.05887773633003235, + "learning_rate": 0.00010351020591353885, + "loss": 0.0064, + "step": 2754 + }, + { + "epoch": 1.8, + "grad_norm": 0.05249325558543205, + "learning_rate": 0.00010341220016527286, + "loss": 0.0039, + "step": 2755 + }, + { + "epoch": 1.8, + "grad_norm": 0.08469627052545547, + "learning_rate": 0.00010331421642343138, + "loss": 0.0145, + "step": 2756 + }, + { + "epoch": 1.8, + "grad_norm": 0.6204782128334045, + "learning_rate": 0.00010321625473429844, + "loss": 0.0185, + "step": 2757 + }, + { + "epoch": 1.81, + "grad_norm": 0.244975745677948, + "learning_rate": 0.00010311831514414769, + "loss": 0.05, + "step": 2758 + }, + { + "epoch": 1.81, + "grad_norm": 0.13598302006721497, + "learning_rate": 0.00010302039769924234, + "loss": 0.0384, + "step": 2759 + }, + { + "epoch": 1.81, + "grad_norm": 0.18722222745418549, + "learning_rate": 0.00010292250244583512, + "loss": 0.0054, + "step": 2760 + }, + { + "epoch": 1.81, + "grad_norm": 0.43910056352615356, + "learning_rate": 0.00010282462943016821, + "loss": 0.0335, + "step": 2761 + }, + { + "epoch": 1.81, + "grad_norm": 0.2426317185163498, + "learning_rate": 0.00010272677869847342, + "loss": 0.0236, + "step": 2762 + }, + { + "epoch": 1.81, + "grad_norm": 0.02529718354344368, + "learning_rate": 0.00010262895029697194, + "loss": 0.0019, + "step": 2763 + }, + { + "epoch": 1.81, + "grad_norm": 0.14480143785476685, + "learning_rate": 0.00010253114427187447, + "loss": 0.0158, + "step": 2764 + }, + { + "epoch": 1.81, + "grad_norm": 0.23900645971298218, + "learning_rate": 0.00010243336066938107, + "loss": 0.0326, + "step": 2765 + }, + { + "epoch": 1.81, + "grad_norm": 0.170863538980484, + "learning_rate": 0.00010233559953568125, + "loss": 0.011, + "step": 2766 + }, + { + "epoch": 1.81, + "grad_norm": 0.0535564124584198, + "learning_rate": 0.00010223786091695387, + "loss": 0.005, + "step": 2767 + }, + { + "epoch": 1.81, + "grad_norm": 0.049874015152454376, + "learning_rate": 0.00010214014485936731, + "loss": 0.0038, + "step": 2768 + }, + { + "epoch": 1.81, + "grad_norm": 0.13458359241485596, + "learning_rate": 0.000102042451409079, + "loss": 0.0272, + "step": 2769 + }, + { + "epoch": 1.81, + "grad_norm": 0.04044146463274956, + "learning_rate": 0.000101944780612236, + "loss": 0.0043, + "step": 2770 + }, + { + "epoch": 1.81, + "grad_norm": 0.0071864319033920765, + "learning_rate": 0.00010184713251497443, + "loss": 0.0009, + "step": 2771 + }, + { + "epoch": 1.81, + "grad_norm": 0.2800210416316986, + "learning_rate": 0.00010174950716341988, + "loss": 0.023, + "step": 2772 + }, + { + "epoch": 1.82, + "grad_norm": 0.0530213862657547, + "learning_rate": 0.00010165190460368709, + "loss": 0.0054, + "step": 2773 + }, + { + "epoch": 1.82, + "grad_norm": 0.12830093502998352, + "learning_rate": 0.00010155432488187995, + "loss": 0.0063, + "step": 2774 + }, + { + "epoch": 1.82, + "grad_norm": 0.09907712042331696, + "learning_rate": 0.00010145676804409176, + "loss": 0.0156, + "step": 2775 + }, + { + "epoch": 1.82, + "grad_norm": 0.0601109117269516, + "learning_rate": 0.00010135923413640487, + "loss": 0.007, + "step": 2776 + }, + { + "epoch": 1.82, + "grad_norm": 0.08233000338077545, + "learning_rate": 0.00010126172320489088, + "loss": 0.0072, + "step": 2777 + }, + { + "epoch": 1.82, + "grad_norm": 0.09767752140760422, + "learning_rate": 0.00010116423529561042, + "loss": 0.0191, + "step": 2778 + }, + { + "epoch": 1.82, + "grad_norm": 0.15519340336322784, + "learning_rate": 0.0001010667704546134, + "loss": 0.0431, + "step": 2779 + }, + { + "epoch": 1.82, + "grad_norm": 0.17334707081317902, + "learning_rate": 0.0001009693287279387, + "loss": 0.0134, + "step": 2780 + }, + { + "epoch": 1.82, + "grad_norm": 0.281533420085907, + "learning_rate": 0.00010087191016161439, + "loss": 0.0136, + "step": 2781 + }, + { + "epoch": 1.82, + "grad_norm": 0.03143606334924698, + "learning_rate": 0.00010077451480165747, + "loss": 0.0032, + "step": 2782 + }, + { + "epoch": 1.82, + "grad_norm": 0.16341941058635712, + "learning_rate": 0.0001006771426940741, + "loss": 0.0121, + "step": 2783 + }, + { + "epoch": 1.82, + "grad_norm": 0.14227576553821564, + "learning_rate": 0.00010057979388485942, + "loss": 0.0096, + "step": 2784 + }, + { + "epoch": 1.82, + "grad_norm": 0.03395267575979233, + "learning_rate": 0.00010048246841999754, + "loss": 0.0039, + "step": 2785 + }, + { + "epoch": 1.82, + "grad_norm": 0.12993645668029785, + "learning_rate": 0.0001003851663454616, + "loss": 0.0057, + "step": 2786 + }, + { + "epoch": 1.82, + "grad_norm": 0.21049480140209198, + "learning_rate": 0.00010028788770721356, + "loss": 0.0176, + "step": 2787 + }, + { + "epoch": 1.83, + "grad_norm": 0.1201065331697464, + "learning_rate": 0.00010019063255120446, + "loss": 0.0609, + "step": 2788 + }, + { + "epoch": 1.83, + "grad_norm": 0.08682208508253098, + "learning_rate": 0.00010009340092337416, + "loss": 0.0055, + "step": 2789 + }, + { + "epoch": 1.83, + "grad_norm": 0.07561865448951721, + "learning_rate": 9.999619286965149e-05, + "loss": 0.0048, + "step": 2790 + }, + { + "epoch": 1.83, + "grad_norm": 0.10334252566099167, + "learning_rate": 9.989900843595403e-05, + "loss": 0.005, + "step": 2791 + }, + { + "epoch": 1.83, + "grad_norm": 0.0631050243973732, + "learning_rate": 9.980184766818828e-05, + "loss": 0.0058, + "step": 2792 + }, + { + "epoch": 1.83, + "grad_norm": 0.43190333247184753, + "learning_rate": 9.970471061224951e-05, + "loss": 0.0583, + "step": 2793 + }, + { + "epoch": 1.83, + "grad_norm": 0.03992627188563347, + "learning_rate": 9.960759731402189e-05, + "loss": 0.0027, + "step": 2794 + }, + { + "epoch": 1.83, + "grad_norm": 0.08371511101722717, + "learning_rate": 9.951050781937822e-05, + "loss": 0.003, + "step": 2795 + }, + { + "epoch": 1.83, + "grad_norm": 0.19122813642024994, + "learning_rate": 9.941344217418017e-05, + "loss": 0.0112, + "step": 2796 + }, + { + "epoch": 1.83, + "grad_norm": 0.019885288551449776, + "learning_rate": 9.931640042427812e-05, + "loss": 0.0014, + "step": 2797 + }, + { + "epoch": 1.83, + "grad_norm": 0.025179168209433556, + "learning_rate": 9.921938261551113e-05, + "loss": 0.0028, + "step": 2798 + }, + { + "epoch": 1.83, + "grad_norm": 0.3792370855808258, + "learning_rate": 9.912238879370703e-05, + "loss": 0.032, + "step": 2799 + }, + { + "epoch": 1.83, + "grad_norm": 0.37182220816612244, + "learning_rate": 9.902541900468216e-05, + "loss": 0.0227, + "step": 2800 + }, + { + "epoch": 1.83, + "grad_norm": 0.17105183005332947, + "learning_rate": 9.892847329424169e-05, + "loss": 0.0373, + "step": 2801 + }, + { + "epoch": 1.83, + "grad_norm": 0.09459386765956879, + "learning_rate": 9.88315517081793e-05, + "loss": 0.0098, + "step": 2802 + }, + { + "epoch": 1.84, + "grad_norm": 0.21665576100349426, + "learning_rate": 9.873465429227735e-05, + "loss": 0.0139, + "step": 2803 + }, + { + "epoch": 1.84, + "grad_norm": 0.10254119336605072, + "learning_rate": 9.86377810923067e-05, + "loss": 0.0019, + "step": 2804 + }, + { + "epoch": 1.84, + "grad_norm": 0.25252997875213623, + "learning_rate": 9.854093215402683e-05, + "loss": 0.0316, + "step": 2805 + }, + { + "epoch": 1.84, + "grad_norm": 0.21150769293308258, + "learning_rate": 9.844410752318572e-05, + "loss": 0.0033, + "step": 2806 + }, + { + "epoch": 1.84, + "grad_norm": 0.2727220356464386, + "learning_rate": 9.834730724551992e-05, + "loss": 0.0261, + "step": 2807 + }, + { + "epoch": 1.84, + "grad_norm": 0.08808046579360962, + "learning_rate": 9.825053136675442e-05, + "loss": 0.0083, + "step": 2808 + }, + { + "epoch": 1.84, + "grad_norm": 0.11724215745925903, + "learning_rate": 9.815377993260279e-05, + "loss": 0.0079, + "step": 2809 + }, + { + "epoch": 1.84, + "grad_norm": 0.03124573826789856, + "learning_rate": 9.805705298876687e-05, + "loss": 0.0034, + "step": 2810 + }, + { + "epoch": 1.84, + "grad_norm": 0.32333555817604065, + "learning_rate": 9.796035058093711e-05, + "loss": 0.0185, + "step": 2811 + }, + { + "epoch": 1.84, + "grad_norm": 0.03145265206694603, + "learning_rate": 9.786367275479224e-05, + "loss": 0.0028, + "step": 2812 + }, + { + "epoch": 1.84, + "grad_norm": 0.18941310048103333, + "learning_rate": 9.776701955599952e-05, + "loss": 0.0182, + "step": 2813 + }, + { + "epoch": 1.84, + "grad_norm": 0.0899311974644661, + "learning_rate": 9.767039103021444e-05, + "loss": 0.0083, + "step": 2814 + }, + { + "epoch": 1.84, + "grad_norm": 0.1362254023551941, + "learning_rate": 9.757378722308088e-05, + "loss": 0.0144, + "step": 2815 + }, + { + "epoch": 1.84, + "grad_norm": 0.026598049327731133, + "learning_rate": 9.747720818023109e-05, + "loss": 0.0017, + "step": 2816 + }, + { + "epoch": 1.84, + "grad_norm": 0.15683645009994507, + "learning_rate": 9.738065394728553e-05, + "loss": 0.0044, + "step": 2817 + }, + { + "epoch": 1.84, + "grad_norm": 0.17370571196079254, + "learning_rate": 9.728412456985308e-05, + "loss": 0.0143, + "step": 2818 + }, + { + "epoch": 1.85, + "grad_norm": 0.2154899388551712, + "learning_rate": 9.71876200935307e-05, + "loss": 0.023, + "step": 2819 + }, + { + "epoch": 1.85, + "grad_norm": 0.3872066140174866, + "learning_rate": 9.709114056390375e-05, + "loss": 0.0626, + "step": 2820 + }, + { + "epoch": 1.85, + "grad_norm": 0.09413719922304153, + "learning_rate": 9.69946860265457e-05, + "loss": 0.0137, + "step": 2821 + }, + { + "epoch": 1.85, + "grad_norm": 0.115718774497509, + "learning_rate": 9.689825652701829e-05, + "loss": 0.01, + "step": 2822 + }, + { + "epoch": 1.85, + "grad_norm": 0.02686137706041336, + "learning_rate": 9.680185211087136e-05, + "loss": 0.0015, + "step": 2823 + }, + { + "epoch": 1.85, + "grad_norm": 0.08938975632190704, + "learning_rate": 9.670547282364294e-05, + "loss": 0.0533, + "step": 2824 + }, + { + "epoch": 1.85, + "grad_norm": 0.18357262015342712, + "learning_rate": 9.660911871085917e-05, + "loss": 0.043, + "step": 2825 + }, + { + "epoch": 1.85, + "grad_norm": 0.23003168404102325, + "learning_rate": 9.651278981803441e-05, + "loss": 0.0273, + "step": 2826 + }, + { + "epoch": 1.85, + "grad_norm": 0.035942334681749344, + "learning_rate": 9.641648619067093e-05, + "loss": 0.0043, + "step": 2827 + }, + { + "epoch": 1.85, + "grad_norm": 0.19699084758758545, + "learning_rate": 9.632020787425915e-05, + "loss": 0.0378, + "step": 2828 + }, + { + "epoch": 1.85, + "grad_norm": 0.06653069704771042, + "learning_rate": 9.622395491427755e-05, + "loss": 0.0044, + "step": 2829 + }, + { + "epoch": 1.85, + "grad_norm": 0.004854390397667885, + "learning_rate": 9.612772735619262e-05, + "loss": 0.0006, + "step": 2830 + }, + { + "epoch": 1.85, + "grad_norm": 0.012486539781093597, + "learning_rate": 9.603152524545884e-05, + "loss": 0.0018, + "step": 2831 + }, + { + "epoch": 1.85, + "grad_norm": 0.022599024698138237, + "learning_rate": 9.593534862751867e-05, + "loss": 0.0027, + "step": 2832 + }, + { + "epoch": 1.85, + "grad_norm": 0.051252953708171844, + "learning_rate": 9.583919754780254e-05, + "loss": 0.0064, + "step": 2833 + }, + { + "epoch": 1.86, + "grad_norm": 0.22247406840324402, + "learning_rate": 9.574307205172881e-05, + "loss": 0.0255, + "step": 2834 + }, + { + "epoch": 1.86, + "grad_norm": 0.07303927838802338, + "learning_rate": 9.564697218470372e-05, + "loss": 0.0053, + "step": 2835 + }, + { + "epoch": 1.86, + "grad_norm": 0.00530305877327919, + "learning_rate": 9.555089799212156e-05, + "loss": 0.0004, + "step": 2836 + }, + { + "epoch": 1.86, + "grad_norm": 0.14532607793807983, + "learning_rate": 9.545484951936422e-05, + "loss": 0.0083, + "step": 2837 + }, + { + "epoch": 1.86, + "grad_norm": 0.02097056619822979, + "learning_rate": 9.535882681180166e-05, + "loss": 0.0024, + "step": 2838 + }, + { + "epoch": 1.86, + "grad_norm": 0.11118460446596146, + "learning_rate": 9.526282991479159e-05, + "loss": 0.0083, + "step": 2839 + }, + { + "epoch": 1.86, + "grad_norm": 0.24298857152462006, + "learning_rate": 9.516685887367959e-05, + "loss": 0.0391, + "step": 2840 + }, + { + "epoch": 1.86, + "grad_norm": 0.13576066493988037, + "learning_rate": 9.50709137337989e-05, + "loss": 0.0078, + "step": 2841 + }, + { + "epoch": 1.86, + "grad_norm": 0.3387295603752136, + "learning_rate": 9.497499454047065e-05, + "loss": 0.0191, + "step": 2842 + }, + { + "epoch": 1.86, + "grad_norm": 0.10902759432792664, + "learning_rate": 9.487910133900365e-05, + "loss": 0.0077, + "step": 2843 + }, + { + "epoch": 1.86, + "grad_norm": 0.22465559840202332, + "learning_rate": 9.478323417469446e-05, + "loss": 0.0149, + "step": 2844 + }, + { + "epoch": 1.86, + "grad_norm": 0.18790677189826965, + "learning_rate": 9.468739309282733e-05, + "loss": 0.01, + "step": 2845 + }, + { + "epoch": 1.86, + "grad_norm": 0.36567965149879456, + "learning_rate": 9.459157813867414e-05, + "loss": 0.0513, + "step": 2846 + }, + { + "epoch": 1.86, + "grad_norm": 0.03895680233836174, + "learning_rate": 9.449578935749451e-05, + "loss": 0.004, + "step": 2847 + }, + { + "epoch": 1.86, + "grad_norm": 0.11043170839548111, + "learning_rate": 9.44000267945357e-05, + "loss": 0.0085, + "step": 2848 + }, + { + "epoch": 1.87, + "grad_norm": 0.39684051275253296, + "learning_rate": 9.430429049503253e-05, + "loss": 0.0441, + "step": 2849 + }, + { + "epoch": 1.87, + "grad_norm": 0.11447808891534805, + "learning_rate": 9.420858050420737e-05, + "loss": 0.0201, + "step": 2850 + }, + { + "epoch": 1.87, + "grad_norm": 0.10845249891281128, + "learning_rate": 9.411289686727029e-05, + "loss": 0.0307, + "step": 2851 + }, + { + "epoch": 1.87, + "grad_norm": 0.1653800904750824, + "learning_rate": 9.401723962941885e-05, + "loss": 0.0361, + "step": 2852 + }, + { + "epoch": 1.87, + "grad_norm": 0.18911443650722504, + "learning_rate": 9.392160883583812e-05, + "loss": 0.0076, + "step": 2853 + }, + { + "epoch": 1.87, + "grad_norm": 0.026294823735952377, + "learning_rate": 9.382600453170068e-05, + "loss": 0.0017, + "step": 2854 + }, + { + "epoch": 1.87, + "grad_norm": 0.17699147760868073, + "learning_rate": 9.373042676216662e-05, + "loss": 0.0195, + "step": 2855 + }, + { + "epoch": 1.87, + "grad_norm": 0.1367698609828949, + "learning_rate": 9.36348755723835e-05, + "loss": 0.0111, + "step": 2856 + }, + { + "epoch": 1.87, + "grad_norm": 0.5007506608963013, + "learning_rate": 9.353935100748631e-05, + "loss": 0.0202, + "step": 2857 + }, + { + "epoch": 1.87, + "grad_norm": 0.0346270427107811, + "learning_rate": 9.344385311259747e-05, + "loss": 0.0025, + "step": 2858 + }, + { + "epoch": 1.87, + "grad_norm": 0.1084810346364975, + "learning_rate": 9.334838193282678e-05, + "loss": 0.0094, + "step": 2859 + }, + { + "epoch": 1.87, + "grad_norm": 0.04106857255101204, + "learning_rate": 9.325293751327148e-05, + "loss": 0.0025, + "step": 2860 + }, + { + "epoch": 1.87, + "grad_norm": 0.11952603608369827, + "learning_rate": 9.315751989901608e-05, + "loss": 0.0082, + "step": 2861 + }, + { + "epoch": 1.87, + "grad_norm": 0.1006799265742302, + "learning_rate": 9.306212913513253e-05, + "loss": 0.0173, + "step": 2862 + }, + { + "epoch": 1.87, + "grad_norm": 0.036590754985809326, + "learning_rate": 9.296676526668e-05, + "loss": 0.0037, + "step": 2863 + }, + { + "epoch": 1.87, + "grad_norm": 0.06457981467247009, + "learning_rate": 9.2871428338705e-05, + "loss": 0.0081, + "step": 2864 + }, + { + "epoch": 1.88, + "grad_norm": 0.21057863533496857, + "learning_rate": 9.277611839624132e-05, + "loss": 0.0569, + "step": 2865 + }, + { + "epoch": 1.88, + "grad_norm": 0.1334536373615265, + "learning_rate": 9.268083548431005e-05, + "loss": 0.0139, + "step": 2866 + }, + { + "epoch": 1.88, + "grad_norm": 0.07306934148073196, + "learning_rate": 9.258557964791938e-05, + "loss": 0.0121, + "step": 2867 + }, + { + "epoch": 1.88, + "grad_norm": 0.010792912915349007, + "learning_rate": 9.249035093206484e-05, + "loss": 0.0015, + "step": 2868 + }, + { + "epoch": 1.88, + "grad_norm": 0.05428679287433624, + "learning_rate": 9.239514938172906e-05, + "loss": 0.0135, + "step": 2869 + }, + { + "epoch": 1.88, + "grad_norm": 0.05001017451286316, + "learning_rate": 9.229997504188193e-05, + "loss": 0.0034, + "step": 2870 + }, + { + "epoch": 1.88, + "grad_norm": 0.06393374502658844, + "learning_rate": 9.220482795748037e-05, + "loss": 0.0049, + "step": 2871 + }, + { + "epoch": 1.88, + "grad_norm": 0.026336563751101494, + "learning_rate": 9.210970817346854e-05, + "loss": 0.0032, + "step": 2872 + }, + { + "epoch": 1.88, + "grad_norm": 0.19261330366134644, + "learning_rate": 9.201461573477761e-05, + "loss": 0.0205, + "step": 2873 + }, + { + "epoch": 1.88, + "grad_norm": 0.037117138504981995, + "learning_rate": 9.19195506863259e-05, + "loss": 0.0029, + "step": 2874 + }, + { + "epoch": 1.88, + "grad_norm": 0.14101198315620422, + "learning_rate": 9.18245130730188e-05, + "loss": 0.0143, + "step": 2875 + }, + { + "epoch": 1.88, + "grad_norm": 0.060865968465805054, + "learning_rate": 9.172950293974863e-05, + "loss": 0.0032, + "step": 2876 + }, + { + "epoch": 1.88, + "grad_norm": 0.05794965475797653, + "learning_rate": 9.163452033139487e-05, + "loss": 0.0045, + "step": 2877 + }, + { + "epoch": 1.88, + "grad_norm": 0.040618691593408585, + "learning_rate": 9.153956529282391e-05, + "loss": 0.0046, + "step": 2878 + }, + { + "epoch": 1.88, + "grad_norm": 0.14414070546627045, + "learning_rate": 9.144463786888918e-05, + "loss": 0.0118, + "step": 2879 + }, + { + "epoch": 1.89, + "grad_norm": 0.26047050952911377, + "learning_rate": 9.134973810443096e-05, + "loss": 0.0403, + "step": 2880 + }, + { + "epoch": 1.89, + "grad_norm": 0.45467132329940796, + "learning_rate": 9.125486604427658e-05, + "loss": 0.0289, + "step": 2881 + }, + { + "epoch": 1.89, + "grad_norm": 0.11333033442497253, + "learning_rate": 9.116002173324025e-05, + "loss": 0.0413, + "step": 2882 + }, + { + "epoch": 1.89, + "grad_norm": 0.033728063106536865, + "learning_rate": 9.106520521612305e-05, + "loss": 0.002, + "step": 2883 + }, + { + "epoch": 1.89, + "grad_norm": 0.04400965943932533, + "learning_rate": 9.097041653771288e-05, + "loss": 0.0048, + "step": 2884 + }, + { + "epoch": 1.89, + "grad_norm": 0.40565457940101624, + "learning_rate": 9.087565574278462e-05, + "loss": 0.0335, + "step": 2885 + }, + { + "epoch": 1.89, + "grad_norm": 0.012257483787834644, + "learning_rate": 9.078092287609989e-05, + "loss": 0.0016, + "step": 2886 + }, + { + "epoch": 1.89, + "grad_norm": 0.17588816583156586, + "learning_rate": 9.068621798240713e-05, + "loss": 0.012, + "step": 2887 + }, + { + "epoch": 1.89, + "grad_norm": 0.04336775094270706, + "learning_rate": 9.05915411064416e-05, + "loss": 0.0031, + "step": 2888 + }, + { + "epoch": 1.89, + "grad_norm": 0.18482789397239685, + "learning_rate": 9.049689229292524e-05, + "loss": 0.0076, + "step": 2889 + }, + { + "epoch": 1.89, + "grad_norm": 0.30984288454055786, + "learning_rate": 9.040227158656684e-05, + "loss": 0.025, + "step": 2890 + }, + { + "epoch": 1.89, + "grad_norm": 0.0787012130022049, + "learning_rate": 9.030767903206186e-05, + "loss": 0.0085, + "step": 2891 + }, + { + "epoch": 1.89, + "grad_norm": 0.1350172907114029, + "learning_rate": 9.021311467409249e-05, + "loss": 0.0283, + "step": 2892 + }, + { + "epoch": 1.89, + "grad_norm": 0.25070253014564514, + "learning_rate": 9.011857855732753e-05, + "loss": 0.0307, + "step": 2893 + }, + { + "epoch": 1.89, + "grad_norm": 0.06961613148450851, + "learning_rate": 9.00240707264225e-05, + "loss": 0.0046, + "step": 2894 + }, + { + "epoch": 1.9, + "grad_norm": 0.6530323624610901, + "learning_rate": 8.992959122601957e-05, + "loss": 0.0236, + "step": 2895 + }, + { + "epoch": 1.9, + "grad_norm": 0.06180819123983383, + "learning_rate": 8.983514010074749e-05, + "loss": 0.0042, + "step": 2896 + }, + { + "epoch": 1.9, + "grad_norm": 0.05041724815964699, + "learning_rate": 8.974071739522164e-05, + "loss": 0.0071, + "step": 2897 + }, + { + "epoch": 1.9, + "grad_norm": 0.12667830288410187, + "learning_rate": 8.964632315404394e-05, + "loss": 0.0079, + "step": 2898 + }, + { + "epoch": 1.9, + "grad_norm": 0.01682797633111477, + "learning_rate": 8.955195742180289e-05, + "loss": 0.0015, + "step": 2899 + }, + { + "epoch": 1.9, + "grad_norm": 0.06474439054727554, + "learning_rate": 8.94576202430735e-05, + "loss": 0.0038, + "step": 2900 + }, + { + "epoch": 1.9, + "grad_norm": 0.20585475862026215, + "learning_rate": 8.936331166241734e-05, + "loss": 0.0347, + "step": 2901 + }, + { + "epoch": 1.9, + "grad_norm": 0.10581755638122559, + "learning_rate": 8.92690317243824e-05, + "loss": 0.0058, + "step": 2902 + }, + { + "epoch": 1.9, + "grad_norm": 0.10786069929599762, + "learning_rate": 8.917478047350322e-05, + "loss": 0.0047, + "step": 2903 + }, + { + "epoch": 1.9, + "grad_norm": 0.19840127229690552, + "learning_rate": 8.90805579543007e-05, + "loss": 0.0225, + "step": 2904 + }, + { + "epoch": 1.9, + "grad_norm": 0.11379020661115646, + "learning_rate": 8.898636421128231e-05, + "loss": 0.0165, + "step": 2905 + }, + { + "epoch": 1.9, + "grad_norm": 0.12862448394298553, + "learning_rate": 8.889219928894173e-05, + "loss": 0.0115, + "step": 2906 + }, + { + "epoch": 1.9, + "grad_norm": 0.011147045530378819, + "learning_rate": 8.879806323175916e-05, + "loss": 0.0012, + "step": 2907 + }, + { + "epoch": 1.9, + "grad_norm": 0.043882615864276886, + "learning_rate": 8.870395608420113e-05, + "loss": 0.0055, + "step": 2908 + }, + { + "epoch": 1.9, + "grad_norm": 0.24039480090141296, + "learning_rate": 8.860987789072053e-05, + "loss": 0.0272, + "step": 2909 + }, + { + "epoch": 1.91, + "grad_norm": 0.21488645672798157, + "learning_rate": 8.851582869575659e-05, + "loss": 0.0092, + "step": 2910 + }, + { + "epoch": 1.91, + "grad_norm": 0.009159781038761139, + "learning_rate": 8.842180854373479e-05, + "loss": 0.0008, + "step": 2911 + }, + { + "epoch": 1.91, + "grad_norm": 0.19630037248134613, + "learning_rate": 8.832781747906687e-05, + "loss": 0.0262, + "step": 2912 + }, + { + "epoch": 1.91, + "grad_norm": 0.1669941395521164, + "learning_rate": 8.823385554615094e-05, + "loss": 0.009, + "step": 2913 + }, + { + "epoch": 1.91, + "grad_norm": 0.057654477655887604, + "learning_rate": 8.813992278937129e-05, + "loss": 0.0028, + "step": 2914 + }, + { + "epoch": 1.91, + "grad_norm": 0.019775306805968285, + "learning_rate": 8.804601925309837e-05, + "loss": 0.0021, + "step": 2915 + }, + { + "epoch": 1.91, + "grad_norm": 0.2010907083749771, + "learning_rate": 8.795214498168895e-05, + "loss": 0.0347, + "step": 2916 + }, + { + "epoch": 1.91, + "grad_norm": 0.3659219741821289, + "learning_rate": 8.785830001948583e-05, + "loss": 0.0353, + "step": 2917 + }, + { + "epoch": 1.91, + "grad_norm": 0.02452949434518814, + "learning_rate": 8.776448441081807e-05, + "loss": 0.001, + "step": 2918 + }, + { + "epoch": 1.91, + "grad_norm": 0.038888439536094666, + "learning_rate": 8.767069820000086e-05, + "loss": 0.0025, + "step": 2919 + }, + { + "epoch": 1.91, + "grad_norm": 0.08751122653484344, + "learning_rate": 8.75769414313355e-05, + "loss": 0.0062, + "step": 2920 + }, + { + "epoch": 1.91, + "grad_norm": 0.012029001489281654, + "learning_rate": 8.748321414910928e-05, + "loss": 0.0009, + "step": 2921 + }, + { + "epoch": 1.91, + "grad_norm": 0.036629196256399155, + "learning_rate": 8.73895163975957e-05, + "loss": 0.0024, + "step": 2922 + }, + { + "epoch": 1.91, + "grad_norm": 0.027340730652213097, + "learning_rate": 8.729584822105425e-05, + "loss": 0.0011, + "step": 2923 + }, + { + "epoch": 1.91, + "grad_norm": 0.08335358649492264, + "learning_rate": 8.720220966373044e-05, + "loss": 0.0326, + "step": 2924 + }, + { + "epoch": 1.91, + "grad_norm": 0.32940107583999634, + "learning_rate": 8.710860076985583e-05, + "loss": 0.0256, + "step": 2925 + }, + { + "epoch": 1.92, + "grad_norm": 0.054443515837192535, + "learning_rate": 8.701502158364792e-05, + "loss": 0.0041, + "step": 2926 + }, + { + "epoch": 1.92, + "grad_norm": 0.0999118834733963, + "learning_rate": 8.692147214931027e-05, + "loss": 0.0111, + "step": 2927 + }, + { + "epoch": 1.92, + "grad_norm": 0.3169289827346802, + "learning_rate": 8.682795251103218e-05, + "loss": 0.0284, + "step": 2928 + }, + { + "epoch": 1.92, + "grad_norm": 0.009980392642319202, + "learning_rate": 8.673446271298909e-05, + "loss": 0.001, + "step": 2929 + }, + { + "epoch": 1.92, + "grad_norm": 0.3076903223991394, + "learning_rate": 8.664100279934227e-05, + "loss": 0.0184, + "step": 2930 + }, + { + "epoch": 1.92, + "grad_norm": 0.015774663537740707, + "learning_rate": 8.654757281423884e-05, + "loss": 0.0014, + "step": 2931 + }, + { + "epoch": 1.92, + "grad_norm": 0.37929320335388184, + "learning_rate": 8.645417280181184e-05, + "loss": 0.0304, + "step": 2932 + }, + { + "epoch": 1.92, + "grad_norm": 0.08329842984676361, + "learning_rate": 8.63608028061801e-05, + "loss": 0.0047, + "step": 2933 + }, + { + "epoch": 1.92, + "grad_norm": 0.21046321094036102, + "learning_rate": 8.62674628714483e-05, + "loss": 0.0171, + "step": 2934 + }, + { + "epoch": 1.92, + "grad_norm": 0.01876913383603096, + "learning_rate": 8.6174153041707e-05, + "loss": 0.0014, + "step": 2935 + }, + { + "epoch": 1.92, + "grad_norm": 0.016124719753861427, + "learning_rate": 8.60808733610323e-05, + "loss": 0.0015, + "step": 2936 + }, + { + "epoch": 1.92, + "grad_norm": 0.011233742348849773, + "learning_rate": 8.59876238734863e-05, + "loss": 0.0009, + "step": 2937 + }, + { + "epoch": 1.92, + "grad_norm": 0.2208932489156723, + "learning_rate": 8.589440462311675e-05, + "loss": 0.0703, + "step": 2938 + }, + { + "epoch": 1.92, + "grad_norm": 0.38058269023895264, + "learning_rate": 8.58012156539571e-05, + "loss": 0.0206, + "step": 2939 + }, + { + "epoch": 1.92, + "grad_norm": 0.34829381108283997, + "learning_rate": 8.570805701002651e-05, + "loss": 0.0258, + "step": 2940 + }, + { + "epoch": 1.93, + "grad_norm": 0.27227723598480225, + "learning_rate": 8.561492873532986e-05, + "loss": 0.0065, + "step": 2941 + }, + { + "epoch": 1.93, + "grad_norm": 0.1077399030327797, + "learning_rate": 8.552183087385759e-05, + "loss": 0.0214, + "step": 2942 + }, + { + "epoch": 1.93, + "grad_norm": 0.06788244843482971, + "learning_rate": 8.542876346958589e-05, + "loss": 0.0059, + "step": 2943 + }, + { + "epoch": 1.93, + "grad_norm": 0.16576650738716125, + "learning_rate": 8.533572656647648e-05, + "loss": 0.0492, + "step": 2944 + }, + { + "epoch": 1.93, + "grad_norm": 0.03128691017627716, + "learning_rate": 8.524272020847665e-05, + "loss": 0.0032, + "step": 2945 + }, + { + "epoch": 1.93, + "grad_norm": 0.2140830159187317, + "learning_rate": 8.514974443951933e-05, + "loss": 0.0577, + "step": 2946 + }, + { + "epoch": 1.93, + "grad_norm": 0.09814707189798355, + "learning_rate": 8.505679930352298e-05, + "loss": 0.0115, + "step": 2947 + }, + { + "epoch": 1.93, + "grad_norm": 0.2515917420387268, + "learning_rate": 8.496388484439158e-05, + "loss": 0.0142, + "step": 2948 + }, + { + "epoch": 1.93, + "grad_norm": 0.3558675944805145, + "learning_rate": 8.487100110601466e-05, + "loss": 0.0135, + "step": 2949 + }, + { + "epoch": 1.93, + "grad_norm": 0.05048364773392677, + "learning_rate": 8.477814813226715e-05, + "loss": 0.0031, + "step": 2950 + }, + { + "epoch": 1.93, + "grad_norm": 0.09671253710985184, + "learning_rate": 8.468532596700955e-05, + "loss": 0.0097, + "step": 2951 + }, + { + "epoch": 1.93, + "grad_norm": 0.10951755940914154, + "learning_rate": 8.459253465408772e-05, + "loss": 0.0419, + "step": 2952 + }, + { + "epoch": 1.93, + "grad_norm": 0.1641848385334015, + "learning_rate": 8.449977423733308e-05, + "loss": 0.0148, + "step": 2953 + }, + { + "epoch": 1.93, + "grad_norm": 0.1038142666220665, + "learning_rate": 8.440704476056221e-05, + "loss": 0.0071, + "step": 2954 + }, + { + "epoch": 1.93, + "grad_norm": 0.08310937136411667, + "learning_rate": 8.431434626757731e-05, + "loss": 0.0112, + "step": 2955 + }, + { + "epoch": 1.94, + "grad_norm": 0.05308748781681061, + "learning_rate": 8.422167880216586e-05, + "loss": 0.0077, + "step": 2956 + }, + { + "epoch": 1.94, + "grad_norm": 0.15207041800022125, + "learning_rate": 8.412904240810068e-05, + "loss": 0.0143, + "step": 2957 + }, + { + "epoch": 1.94, + "grad_norm": 0.03646821156144142, + "learning_rate": 8.403643712913989e-05, + "loss": 0.0052, + "step": 2958 + }, + { + "epoch": 1.94, + "grad_norm": 0.06434578448534012, + "learning_rate": 8.394386300902699e-05, + "loss": 0.008, + "step": 2959 + }, + { + "epoch": 1.94, + "grad_norm": 0.0646260604262352, + "learning_rate": 8.385132009149067e-05, + "loss": 0.0072, + "step": 2960 + }, + { + "epoch": 1.94, + "grad_norm": 0.13893379271030426, + "learning_rate": 8.375880842024494e-05, + "loss": 0.0105, + "step": 2961 + }, + { + "epoch": 1.94, + "grad_norm": 0.1638610064983368, + "learning_rate": 8.36663280389891e-05, + "loss": 0.0162, + "step": 2962 + }, + { + "epoch": 1.94, + "grad_norm": 0.018266642466187477, + "learning_rate": 8.357387899140747e-05, + "loss": 0.003, + "step": 2963 + }, + { + "epoch": 1.94, + "grad_norm": 0.11912977695465088, + "learning_rate": 8.348146132116976e-05, + "loss": 0.0104, + "step": 2964 + }, + { + "epoch": 1.94, + "grad_norm": 0.04382968321442604, + "learning_rate": 8.338907507193083e-05, + "loss": 0.0033, + "step": 2965 + }, + { + "epoch": 1.94, + "grad_norm": 0.17198912799358368, + "learning_rate": 8.329672028733062e-05, + "loss": 0.0093, + "step": 2966 + }, + { + "epoch": 1.94, + "grad_norm": 0.022629285231232643, + "learning_rate": 8.320439701099428e-05, + "loss": 0.002, + "step": 2967 + }, + { + "epoch": 1.94, + "grad_norm": 0.21153992414474487, + "learning_rate": 8.311210528653204e-05, + "loss": 0.0378, + "step": 2968 + }, + { + "epoch": 1.94, + "grad_norm": 0.029377898201346397, + "learning_rate": 8.301984515753928e-05, + "loss": 0.0031, + "step": 2969 + }, + { + "epoch": 1.94, + "grad_norm": 0.11475684493780136, + "learning_rate": 8.292761666759642e-05, + "loss": 0.0052, + "step": 2970 + }, + { + "epoch": 1.95, + "grad_norm": 0.3934403657913208, + "learning_rate": 8.283541986026881e-05, + "loss": 0.0528, + "step": 2971 + }, + { + "epoch": 1.95, + "grad_norm": 0.07621371001005173, + "learning_rate": 8.274325477910708e-05, + "loss": 0.0059, + "step": 2972 + }, + { + "epoch": 1.95, + "grad_norm": 0.035385921597480774, + "learning_rate": 8.265112146764667e-05, + "loss": 0.0026, + "step": 2973 + }, + { + "epoch": 1.95, + "grad_norm": 0.0529436431825161, + "learning_rate": 8.255901996940809e-05, + "loss": 0.008, + "step": 2974 + }, + { + "epoch": 1.95, + "grad_norm": 0.02031162567436695, + "learning_rate": 8.246695032789688e-05, + "loss": 0.0021, + "step": 2975 + }, + { + "epoch": 1.95, + "grad_norm": 0.10501090437173843, + "learning_rate": 8.237491258660342e-05, + "loss": 0.0088, + "step": 2976 + }, + { + "epoch": 1.95, + "grad_norm": 0.18462517857551575, + "learning_rate": 8.228290678900312e-05, + "loss": 0.018, + "step": 2977 + }, + { + "epoch": 1.95, + "grad_norm": 0.18451613187789917, + "learning_rate": 8.219093297855623e-05, + "loss": 0.0488, + "step": 2978 + }, + { + "epoch": 1.95, + "grad_norm": 0.41322609782218933, + "learning_rate": 8.209899119870798e-05, + "loss": 0.0349, + "step": 2979 + }, + { + "epoch": 1.95, + "grad_norm": 0.034096457064151764, + "learning_rate": 8.200708149288827e-05, + "loss": 0.0036, + "step": 2980 + }, + { + "epoch": 1.95, + "grad_norm": 0.10774432122707367, + "learning_rate": 8.191520390451207e-05, + "loss": 0.0373, + "step": 2981 + }, + { + "epoch": 1.95, + "grad_norm": 0.0551498681306839, + "learning_rate": 8.182335847697909e-05, + "loss": 0.0025, + "step": 2982 + }, + { + "epoch": 1.95, + "grad_norm": 0.09337715804576874, + "learning_rate": 8.173154525367383e-05, + "loss": 0.0059, + "step": 2983 + }, + { + "epoch": 1.95, + "grad_norm": 0.16637249290943146, + "learning_rate": 8.163976427796563e-05, + "loss": 0.0079, + "step": 2984 + }, + { + "epoch": 1.95, + "grad_norm": 0.035759832710027695, + "learning_rate": 8.154801559320857e-05, + "loss": 0.0045, + "step": 2985 + }, + { + "epoch": 1.95, + "grad_norm": 0.0748455747961998, + "learning_rate": 8.145629924274144e-05, + "loss": 0.0036, + "step": 2986 + }, + { + "epoch": 1.96, + "grad_norm": 0.378963828086853, + "learning_rate": 8.136461526988783e-05, + "loss": 0.0885, + "step": 2987 + }, + { + "epoch": 1.96, + "grad_norm": 0.01577618159353733, + "learning_rate": 8.127296371795605e-05, + "loss": 0.0023, + "step": 2988 + }, + { + "epoch": 1.96, + "grad_norm": 0.02859189733862877, + "learning_rate": 8.118134463023889e-05, + "loss": 0.0031, + "step": 2989 + }, + { + "epoch": 1.96, + "grad_norm": 0.16388894617557526, + "learning_rate": 8.108975805001406e-05, + "loss": 0.0342, + "step": 2990 + }, + { + "epoch": 1.96, + "grad_norm": 0.07618826627731323, + "learning_rate": 8.099820402054377e-05, + "loss": 0.0068, + "step": 2991 + }, + { + "epoch": 1.96, + "grad_norm": 0.08716662973165512, + "learning_rate": 8.090668258507494e-05, + "loss": 0.0073, + "step": 2992 + }, + { + "epoch": 1.96, + "grad_norm": 0.08639045059680939, + "learning_rate": 8.081519378683904e-05, + "loss": 0.0294, + "step": 2993 + }, + { + "epoch": 1.96, + "grad_norm": 0.2602868974208832, + "learning_rate": 8.072373766905212e-05, + "loss": 0.0282, + "step": 2994 + }, + { + "epoch": 1.96, + "grad_norm": 0.08201611787080765, + "learning_rate": 8.06323142749148e-05, + "loss": 0.0344, + "step": 2995 + }, + { + "epoch": 1.96, + "grad_norm": 0.1925457864999771, + "learning_rate": 8.054092364761234e-05, + "loss": 0.0085, + "step": 2996 + }, + { + "epoch": 1.96, + "grad_norm": 0.6533692479133606, + "learning_rate": 8.044956583031429e-05, + "loss": 0.1054, + "step": 2997 + }, + { + "epoch": 1.96, + "grad_norm": 0.048638634383678436, + "learning_rate": 8.03582408661749e-05, + "loss": 0.0066, + "step": 2998 + }, + { + "epoch": 1.96, + "grad_norm": 0.11080126464366913, + "learning_rate": 8.026694879833285e-05, + "loss": 0.0096, + "step": 2999 + }, + { + "epoch": 1.96, + "grad_norm": 0.07887112349271774, + "learning_rate": 8.017568966991129e-05, + "loss": 0.0216, + "step": 3000 + }, + { + "epoch": 1.96, + "grad_norm": 0.04135850816965103, + "learning_rate": 8.008446352401777e-05, + "loss": 0.0031, + "step": 3001 + }, + { + "epoch": 1.97, + "grad_norm": 0.11523578315973282, + "learning_rate": 7.99932704037443e-05, + "loss": 0.0102, + "step": 3002 + }, + { + "epoch": 1.97, + "grad_norm": 0.12281786650419235, + "learning_rate": 7.990211035216727e-05, + "loss": 0.0338, + "step": 3003 + }, + { + "epoch": 1.97, + "grad_norm": 0.07289214432239532, + "learning_rate": 7.981098341234747e-05, + "loss": 0.0048, + "step": 3004 + }, + { + "epoch": 1.97, + "grad_norm": 0.13399043679237366, + "learning_rate": 7.971988962733007e-05, + "loss": 0.0147, + "step": 3005 + }, + { + "epoch": 1.97, + "grad_norm": 0.10570705682039261, + "learning_rate": 7.962882904014447e-05, + "loss": 0.0104, + "step": 3006 + }, + { + "epoch": 1.97, + "grad_norm": 0.02668238803744316, + "learning_rate": 7.953780169380452e-05, + "loss": 0.0035, + "step": 3007 + }, + { + "epoch": 1.97, + "grad_norm": 0.23536016047000885, + "learning_rate": 7.944680763130824e-05, + "loss": 0.0222, + "step": 3008 + }, + { + "epoch": 1.97, + "grad_norm": 0.17513103783130646, + "learning_rate": 7.935584689563802e-05, + "loss": 0.0198, + "step": 3009 + }, + { + "epoch": 1.97, + "grad_norm": 0.022857805714011192, + "learning_rate": 7.926491952976051e-05, + "loss": 0.0035, + "step": 3010 + }, + { + "epoch": 1.97, + "grad_norm": 0.14224904775619507, + "learning_rate": 7.917402557662658e-05, + "loss": 0.0137, + "step": 3011 + }, + { + "epoch": 1.97, + "grad_norm": 0.2415999174118042, + "learning_rate": 7.90831650791713e-05, + "loss": 0.0421, + "step": 3012 + }, + { + "epoch": 1.97, + "grad_norm": 0.05061252415180206, + "learning_rate": 7.899233808031394e-05, + "loss": 0.0069, + "step": 3013 + }, + { + "epoch": 1.97, + "grad_norm": 0.12675368785858154, + "learning_rate": 7.890154462295795e-05, + "loss": 0.0124, + "step": 3014 + }, + { + "epoch": 1.97, + "grad_norm": 0.16706958413124084, + "learning_rate": 7.881078474999097e-05, + "loss": 0.0369, + "step": 3015 + }, + { + "epoch": 1.97, + "grad_norm": 0.049328841269016266, + "learning_rate": 7.872005850428476e-05, + "loss": 0.0065, + "step": 3016 + }, + { + "epoch": 1.98, + "grad_norm": 0.11827641725540161, + "learning_rate": 7.862936592869508e-05, + "loss": 0.0161, + "step": 3017 + }, + { + "epoch": 1.98, + "grad_norm": 0.17668034136295319, + "learning_rate": 7.853870706606198e-05, + "loss": 0.0334, + "step": 3018 + }, + { + "epoch": 1.98, + "grad_norm": 0.1600051075220108, + "learning_rate": 7.844808195920943e-05, + "loss": 0.0106, + "step": 3019 + }, + { + "epoch": 1.98, + "grad_norm": 0.23005841672420502, + "learning_rate": 7.835749065094558e-05, + "loss": 0.0187, + "step": 3020 + }, + { + "epoch": 1.98, + "grad_norm": 0.021763009950518608, + "learning_rate": 7.82669331840625e-05, + "loss": 0.0025, + "step": 3021 + }, + { + "epoch": 1.98, + "grad_norm": 0.0882042795419693, + "learning_rate": 7.817640960133636e-05, + "loss": 0.0101, + "step": 3022 + }, + { + "epoch": 1.98, + "grad_norm": 0.16614805161952972, + "learning_rate": 7.808591994552728e-05, + "loss": 0.021, + "step": 3023 + }, + { + "epoch": 1.98, + "grad_norm": 1.298937201499939, + "learning_rate": 7.799546425937941e-05, + "loss": 0.0137, + "step": 3024 + }, + { + "epoch": 1.98, + "grad_norm": 0.30948105454444885, + "learning_rate": 7.79050425856207e-05, + "loss": 0.0114, + "step": 3025 + }, + { + "epoch": 1.98, + "grad_norm": 0.036360953003168106, + "learning_rate": 7.78146549669632e-05, + "loss": 0.0028, + "step": 3026 + }, + { + "epoch": 1.98, + "grad_norm": 0.19619564712047577, + "learning_rate": 7.772430144610284e-05, + "loss": 0.014, + "step": 3027 + }, + { + "epoch": 1.98, + "grad_norm": 0.22606731951236725, + "learning_rate": 7.763398206571938e-05, + "loss": 0.0618, + "step": 3028 + }, + { + "epoch": 1.98, + "grad_norm": 0.045581962913274765, + "learning_rate": 7.754369686847648e-05, + "loss": 0.0039, + "step": 3029 + }, + { + "epoch": 1.98, + "grad_norm": 0.0919916108250618, + "learning_rate": 7.745344589702173e-05, + "loss": 0.0243, + "step": 3030 + }, + { + "epoch": 1.98, + "grad_norm": 0.14731182157993317, + "learning_rate": 7.736322919398645e-05, + "loss": 0.0182, + "step": 3031 + }, + { + "epoch": 1.98, + "grad_norm": 0.2550598680973053, + "learning_rate": 7.727304680198582e-05, + "loss": 0.0238, + "step": 3032 + }, + { + "epoch": 1.99, + "grad_norm": 0.12032425403594971, + "learning_rate": 7.718289876361885e-05, + "loss": 0.0153, + "step": 3033 + }, + { + "epoch": 1.99, + "grad_norm": 0.038090333342552185, + "learning_rate": 7.709278512146815e-05, + "loss": 0.0049, + "step": 3034 + }, + { + "epoch": 1.99, + "grad_norm": 0.19079631567001343, + "learning_rate": 7.700270591810029e-05, + "loss": 0.0188, + "step": 3035 + }, + { + "epoch": 1.99, + "grad_norm": 0.21086086332798004, + "learning_rate": 7.69126611960655e-05, + "loss": 0.0389, + "step": 3036 + }, + { + "epoch": 1.99, + "grad_norm": 0.11210260540246964, + "learning_rate": 7.68226509978977e-05, + "loss": 0.0443, + "step": 3037 + }, + { + "epoch": 1.99, + "grad_norm": 0.018384624272584915, + "learning_rate": 7.67326753661145e-05, + "loss": 0.0024, + "step": 3038 + }, + { + "epoch": 1.99, + "grad_norm": 0.015754880383610725, + "learning_rate": 7.66427343432172e-05, + "loss": 0.0019, + "step": 3039 + }, + { + "epoch": 1.99, + "grad_norm": 0.20221295952796936, + "learning_rate": 7.655282797169078e-05, + "loss": 0.0111, + "step": 3040 + }, + { + "epoch": 1.99, + "grad_norm": 0.12582607567310333, + "learning_rate": 7.64629562940038e-05, + "loss": 0.0095, + "step": 3041 + }, + { + "epoch": 1.99, + "grad_norm": 0.09076832234859467, + "learning_rate": 7.637311935260852e-05, + "loss": 0.0082, + "step": 3042 + }, + { + "epoch": 1.99, + "grad_norm": 0.025840701535344124, + "learning_rate": 7.628331718994059e-05, + "loss": 0.0027, + "step": 3043 + }, + { + "epoch": 1.99, + "grad_norm": 0.24747799336910248, + "learning_rate": 7.619354984841945e-05, + "loss": 0.0227, + "step": 3044 + }, + { + "epoch": 1.99, + "grad_norm": 0.10816159844398499, + "learning_rate": 7.610381737044798e-05, + "loss": 0.0285, + "step": 3045 + }, + { + "epoch": 1.99, + "grad_norm": 0.2089771330356598, + "learning_rate": 7.601411979841267e-05, + "loss": 0.0278, + "step": 3046 + }, + { + "epoch": 1.99, + "grad_norm": 0.10027289390563965, + "learning_rate": 7.59244571746834e-05, + "loss": 0.0083, + "step": 3047 + }, + { + "epoch": 2.0, + "grad_norm": 0.1805410087108612, + "learning_rate": 7.58348295416137e-05, + "loss": 0.0141, + "step": 3048 + }, + { + "epoch": 2.0, + "grad_norm": 0.05324965715408325, + "learning_rate": 7.57452369415404e-05, + "loss": 0.0025, + "step": 3049 + }, + { + "epoch": 2.0, + "grad_norm": 0.2869288921356201, + "learning_rate": 7.565567941678392e-05, + "loss": 0.0217, + "step": 3050 + }, + { + "epoch": 2.0, + "grad_norm": 0.3843397796154022, + "learning_rate": 7.556615700964808e-05, + "loss": 0.0599, + "step": 3051 + }, + { + "epoch": 2.0, + "grad_norm": 0.03356291353702545, + "learning_rate": 7.547666976242004e-05, + "loss": 0.0044, + "step": 3052 + }, + { + "epoch": 2.0, + "grad_norm": 0.13007520139217377, + "learning_rate": 7.538721771737039e-05, + "loss": 0.0436, + "step": 3053 + }, + { + "epoch": 2.0, + "grad_norm": 0.04935429245233536, + "learning_rate": 7.529780091675315e-05, + "loss": 0.0058, + "step": 3054 + }, + { + "epoch": 2.0, + "grad_norm": 0.0071046738885343075, + "learning_rate": 7.52084194028056e-05, + "loss": 0.0011, + "step": 3055 + }, + { + "epoch": 2.0, + "grad_norm": 0.010049968957901001, + "learning_rate": 7.511907321774844e-05, + "loss": 0.0017, + "step": 3056 + }, + { + "epoch": 2.0, + "eval_loss": 0.0324973538517952, + "eval_runtime": 39.9542, + "eval_samples_per_second": 32.212, + "eval_steps_per_second": 8.059, + "step": 3056 + }, + { + "epoch": 2.0, + "grad_norm": 0.006857311353087425, + "learning_rate": 7.502976240378561e-05, + "loss": 0.0008, + "step": 3057 + }, + { + "epoch": 2.0, + "grad_norm": 0.05016213655471802, + "learning_rate": 7.494048700310441e-05, + "loss": 0.0055, + "step": 3058 + }, + { + "epoch": 2.0, + "grad_norm": 0.017897196114063263, + "learning_rate": 7.485124705787541e-05, + "loss": 0.0025, + "step": 3059 + }, + { + "epoch": 2.0, + "grad_norm": 0.011696245521306992, + "learning_rate": 7.476204261025225e-05, + "loss": 0.0018, + "step": 3060 + }, + { + "epoch": 2.0, + "grad_norm": 0.08584386110305786, + "learning_rate": 7.467287370237204e-05, + "loss": 0.0076, + "step": 3061 + }, + { + "epoch": 2.0, + "grad_norm": 0.02880135364830494, + "learning_rate": 7.458374037635502e-05, + "loss": 0.0039, + "step": 3062 + }, + { + "epoch": 2.01, + "grad_norm": 0.11909928172826767, + "learning_rate": 7.449464267430457e-05, + "loss": 0.0119, + "step": 3063 + }, + { + "epoch": 2.01, + "grad_norm": 0.04658658057451248, + "learning_rate": 7.440558063830731e-05, + "loss": 0.0042, + "step": 3064 + }, + { + "epoch": 2.01, + "grad_norm": 0.04415608569979668, + "learning_rate": 7.4316554310433e-05, + "loss": 0.0047, + "step": 3065 + }, + { + "epoch": 2.01, + "grad_norm": 0.01758030243217945, + "learning_rate": 7.42275637327345e-05, + "loss": 0.0019, + "step": 3066 + }, + { + "epoch": 2.01, + "grad_norm": 0.0055918144062161446, + "learning_rate": 7.41386089472478e-05, + "loss": 0.0008, + "step": 3067 + }, + { + "epoch": 2.01, + "grad_norm": 0.044865932315588, + "learning_rate": 7.404968999599207e-05, + "loss": 0.0055, + "step": 3068 + }, + { + "epoch": 2.01, + "grad_norm": 0.013705574907362461, + "learning_rate": 7.396080692096934e-05, + "loss": 0.0016, + "step": 3069 + }, + { + "epoch": 2.01, + "grad_norm": 0.01940620131790638, + "learning_rate": 7.387195976416486e-05, + "loss": 0.0024, + "step": 3070 + }, + { + "epoch": 2.01, + "grad_norm": 0.11089968681335449, + "learning_rate": 7.378314856754689e-05, + "loss": 0.005, + "step": 3071 + }, + { + "epoch": 2.01, + "grad_norm": 0.16515721380710602, + "learning_rate": 7.36943733730667e-05, + "loss": 0.0414, + "step": 3072 + }, + { + "epoch": 2.01, + "grad_norm": 0.316139817237854, + "learning_rate": 7.360563422265856e-05, + "loss": 0.0125, + "step": 3073 + }, + { + "epoch": 2.01, + "grad_norm": 0.019969282671809196, + "learning_rate": 7.351693115823964e-05, + "loss": 0.0021, + "step": 3074 + }, + { + "epoch": 2.01, + "grad_norm": 0.00985031109303236, + "learning_rate": 7.342826422171019e-05, + "loss": 0.0015, + "step": 3075 + }, + { + "epoch": 2.01, + "grad_norm": 0.033626820892095566, + "learning_rate": 7.333963345495326e-05, + "loss": 0.0037, + "step": 3076 + }, + { + "epoch": 2.01, + "grad_norm": 0.008928509429097176, + "learning_rate": 7.325103889983498e-05, + "loss": 0.0011, + "step": 3077 + }, + { + "epoch": 2.02, + "grad_norm": 0.08676271885633469, + "learning_rate": 7.316248059820417e-05, + "loss": 0.0198, + "step": 3078 + }, + { + "epoch": 2.02, + "grad_norm": 0.010934005491435528, + "learning_rate": 7.307395859189265e-05, + "loss": 0.0014, + "step": 3079 + }, + { + "epoch": 2.02, + "grad_norm": 0.03228992596268654, + "learning_rate": 7.298547292271512e-05, + "loss": 0.0027, + "step": 3080 + }, + { + "epoch": 2.02, + "grad_norm": 0.004572506994009018, + "learning_rate": 7.289702363246903e-05, + "loss": 0.0004, + "step": 3081 + }, + { + "epoch": 2.02, + "grad_norm": 0.09076468646526337, + "learning_rate": 7.280861076293473e-05, + "loss": 0.005, + "step": 3082 + }, + { + "epoch": 2.02, + "grad_norm": 0.023035453632473946, + "learning_rate": 7.272023435587529e-05, + "loss": 0.0018, + "step": 3083 + }, + { + "epoch": 2.02, + "grad_norm": 0.04920250549912453, + "learning_rate": 7.26318944530366e-05, + "loss": 0.0024, + "step": 3084 + }, + { + "epoch": 2.02, + "grad_norm": 0.007420959882438183, + "learning_rate": 7.254359109614736e-05, + "loss": 0.0009, + "step": 3085 + }, + { + "epoch": 2.02, + "grad_norm": 0.013646521605551243, + "learning_rate": 7.245532432691883e-05, + "loss": 0.0013, + "step": 3086 + }, + { + "epoch": 2.02, + "grad_norm": 0.005216498393565416, + "learning_rate": 7.236709418704516e-05, + "loss": 0.0007, + "step": 3087 + }, + { + "epoch": 2.02, + "grad_norm": 0.0383211225271225, + "learning_rate": 7.227890071820314e-05, + "loss": 0.0034, + "step": 3088 + }, + { + "epoch": 2.02, + "grad_norm": 0.06844444572925568, + "learning_rate": 7.219074396205221e-05, + "loss": 0.0044, + "step": 3089 + }, + { + "epoch": 2.02, + "grad_norm": 0.008179310709238052, + "learning_rate": 7.210262396023454e-05, + "loss": 0.0008, + "step": 3090 + }, + { + "epoch": 2.02, + "grad_norm": 0.009947978891432285, + "learning_rate": 7.201454075437488e-05, + "loss": 0.001, + "step": 3091 + }, + { + "epoch": 2.02, + "grad_norm": 0.12099776417016983, + "learning_rate": 7.192649438608058e-05, + "loss": 0.0064, + "step": 3092 + }, + { + "epoch": 2.02, + "grad_norm": 0.1767294555902481, + "learning_rate": 7.183848489694166e-05, + "loss": 0.0115, + "step": 3093 + }, + { + "epoch": 2.03, + "grad_norm": 0.19339025020599365, + "learning_rate": 7.175051232853072e-05, + "loss": 0.0056, + "step": 3094 + }, + { + "epoch": 2.03, + "grad_norm": 0.006366460584104061, + "learning_rate": 7.166257672240278e-05, + "loss": 0.0006, + "step": 3095 + }, + { + "epoch": 2.03, + "grad_norm": 0.015857741236686707, + "learning_rate": 7.157467812009556e-05, + "loss": 0.0013, + "step": 3096 + }, + { + "epoch": 2.03, + "grad_norm": 0.012925447896122932, + "learning_rate": 7.148681656312922e-05, + "loss": 0.0013, + "step": 3097 + }, + { + "epoch": 2.03, + "grad_norm": 0.1308155208826065, + "learning_rate": 7.139899209300646e-05, + "loss": 0.0157, + "step": 3098 + }, + { + "epoch": 2.03, + "grad_norm": 0.0184427909553051, + "learning_rate": 7.131120475121244e-05, + "loss": 0.001, + "step": 3099 + }, + { + "epoch": 2.03, + "grad_norm": 0.011915124021470547, + "learning_rate": 7.12234545792148e-05, + "loss": 0.0011, + "step": 3100 + }, + { + "epoch": 2.03, + "grad_norm": 0.002806570613756776, + "learning_rate": 7.11357416184636e-05, + "loss": 0.0003, + "step": 3101 + }, + { + "epoch": 2.03, + "grad_norm": 0.006056719459593296, + "learning_rate": 7.104806591039132e-05, + "loss": 0.0005, + "step": 3102 + }, + { + "epoch": 2.03, + "grad_norm": 0.008895138278603554, + "learning_rate": 7.096042749641294e-05, + "loss": 0.0008, + "step": 3103 + }, + { + "epoch": 2.03, + "grad_norm": 0.057021476328372955, + "learning_rate": 7.087282641792561e-05, + "loss": 0.0021, + "step": 3104 + }, + { + "epoch": 2.03, + "grad_norm": 0.24201320111751556, + "learning_rate": 7.078526271630901e-05, + "loss": 0.0148, + "step": 3105 + }, + { + "epoch": 2.03, + "grad_norm": 0.03161720559000969, + "learning_rate": 7.069773643292517e-05, + "loss": 0.002, + "step": 3106 + }, + { + "epoch": 2.03, + "grad_norm": 0.02011238783597946, + "learning_rate": 7.061024760911837e-05, + "loss": 0.0006, + "step": 3107 + }, + { + "epoch": 2.03, + "grad_norm": 0.3532894253730774, + "learning_rate": 7.052279628621523e-05, + "loss": 0.0053, + "step": 3108 + }, + { + "epoch": 2.04, + "grad_norm": 0.008078276179730892, + "learning_rate": 7.043538250552473e-05, + "loss": 0.0007, + "step": 3109 + }, + { + "epoch": 2.04, + "grad_norm": 0.07133232802152634, + "learning_rate": 7.034800630833791e-05, + "loss": 0.0042, + "step": 3110 + }, + { + "epoch": 2.04, + "grad_norm": 0.01626005955040455, + "learning_rate": 7.026066773592823e-05, + "loss": 0.0019, + "step": 3111 + }, + { + "epoch": 2.04, + "grad_norm": 0.003941709641367197, + "learning_rate": 7.017336682955137e-05, + "loss": 0.0004, + "step": 3112 + }, + { + "epoch": 2.04, + "grad_norm": 0.007748854346573353, + "learning_rate": 7.008610363044523e-05, + "loss": 0.0006, + "step": 3113 + }, + { + "epoch": 2.04, + "grad_norm": 0.011195815168321133, + "learning_rate": 6.999887817982972e-05, + "loss": 0.0008, + "step": 3114 + }, + { + "epoch": 2.04, + "grad_norm": 0.044785212725400925, + "learning_rate": 6.99116905189071e-05, + "loss": 0.0021, + "step": 3115 + }, + { + "epoch": 2.04, + "grad_norm": 0.0566740557551384, + "learning_rate": 6.982454068886175e-05, + "loss": 0.0031, + "step": 3116 + }, + { + "epoch": 2.04, + "grad_norm": 0.009785477072000504, + "learning_rate": 6.973742873086017e-05, + "loss": 0.0005, + "step": 3117 + }, + { + "epoch": 2.04, + "grad_norm": 0.021957093849778175, + "learning_rate": 6.965035468605093e-05, + "loss": 0.0012, + "step": 3118 + }, + { + "epoch": 2.04, + "grad_norm": 0.003774002194404602, + "learning_rate": 6.956331859556472e-05, + "loss": 0.0003, + "step": 3119 + }, + { + "epoch": 2.04, + "grad_norm": 0.01460148487240076, + "learning_rate": 6.947632050051434e-05, + "loss": 0.0013, + "step": 3120 + }, + { + "epoch": 2.04, + "grad_norm": 0.1362820416688919, + "learning_rate": 6.938936044199458e-05, + "loss": 0.0042, + "step": 3121 + }, + { + "epoch": 2.04, + "grad_norm": 0.028854064643383026, + "learning_rate": 6.930243846108232e-05, + "loss": 0.0018, + "step": 3122 + }, + { + "epoch": 2.04, + "grad_norm": 0.00219525839202106, + "learning_rate": 6.921555459883637e-05, + "loss": 0.0002, + "step": 3123 + }, + { + "epoch": 2.05, + "grad_norm": 0.07864437252283096, + "learning_rate": 6.912870889629759e-05, + "loss": 0.0048, + "step": 3124 + }, + { + "epoch": 2.05, + "grad_norm": 0.0015528085641562939, + "learning_rate": 6.904190139448881e-05, + "loss": 0.0002, + "step": 3125 + }, + { + "epoch": 2.05, + "grad_norm": 0.11591034382581711, + "learning_rate": 6.895513213441485e-05, + "loss": 0.0372, + "step": 3126 + }, + { + "epoch": 2.05, + "grad_norm": 0.0910307765007019, + "learning_rate": 6.886840115706241e-05, + "loss": 0.0013, + "step": 3127 + }, + { + "epoch": 2.05, + "grad_norm": 0.006387822329998016, + "learning_rate": 6.878170850340012e-05, + "loss": 0.0005, + "step": 3128 + }, + { + "epoch": 2.05, + "grad_norm": 0.04994610324501991, + "learning_rate": 6.869505421437854e-05, + "loss": 0.0023, + "step": 3129 + }, + { + "epoch": 2.05, + "grad_norm": 0.15009135007858276, + "learning_rate": 6.860843833093006e-05, + "loss": 0.0126, + "step": 3130 + }, + { + "epoch": 2.05, + "grad_norm": 0.00306086172349751, + "learning_rate": 6.8521860893969e-05, + "loss": 0.0003, + "step": 3131 + }, + { + "epoch": 2.05, + "grad_norm": 0.6200054883956909, + "learning_rate": 6.843532194439141e-05, + "loss": 0.0137, + "step": 3132 + }, + { + "epoch": 2.05, + "grad_norm": 0.03659270331263542, + "learning_rate": 6.834882152307522e-05, + "loss": 0.0006, + "step": 3133 + }, + { + "epoch": 2.05, + "grad_norm": 0.006774500943720341, + "learning_rate": 6.82623596708802e-05, + "loss": 0.0006, + "step": 3134 + }, + { + "epoch": 2.05, + "grad_norm": 0.02588949166238308, + "learning_rate": 6.817593642864783e-05, + "loss": 0.0009, + "step": 3135 + }, + { + "epoch": 2.05, + "grad_norm": 0.012879827991127968, + "learning_rate": 6.808955183720141e-05, + "loss": 0.0007, + "step": 3136 + }, + { + "epoch": 2.05, + "grad_norm": 0.004731791093945503, + "learning_rate": 6.800320593734596e-05, + "loss": 0.0003, + "step": 3137 + }, + { + "epoch": 2.05, + "grad_norm": 0.015829216688871384, + "learning_rate": 6.79168987698682e-05, + "loss": 0.001, + "step": 3138 + }, + { + "epoch": 2.05, + "grad_norm": 0.17940203845500946, + "learning_rate": 6.78306303755366e-05, + "loss": 0.0297, + "step": 3139 + }, + { + "epoch": 2.06, + "grad_norm": 0.0027716802433133125, + "learning_rate": 6.77444007951013e-05, + "loss": 0.0002, + "step": 3140 + }, + { + "epoch": 2.06, + "grad_norm": 0.015646016225218773, + "learning_rate": 6.765821006929403e-05, + "loss": 0.0005, + "step": 3141 + }, + { + "epoch": 2.06, + "grad_norm": 0.012443234212696552, + "learning_rate": 6.757205823882828e-05, + "loss": 0.0007, + "step": 3142 + }, + { + "epoch": 2.06, + "grad_norm": 0.07186063379049301, + "learning_rate": 6.748594534439911e-05, + "loss": 0.0026, + "step": 3143 + }, + { + "epoch": 2.06, + "grad_norm": 0.20627544820308685, + "learning_rate": 6.739987142668321e-05, + "loss": 0.0055, + "step": 3144 + }, + { + "epoch": 2.06, + "grad_norm": 0.002659998834133148, + "learning_rate": 6.731383652633882e-05, + "loss": 0.0003, + "step": 3145 + }, + { + "epoch": 2.06, + "grad_norm": 0.008209268562495708, + "learning_rate": 6.72278406840058e-05, + "loss": 0.0004, + "step": 3146 + }, + { + "epoch": 2.06, + "grad_norm": 0.013124651275575161, + "learning_rate": 6.714188394030554e-05, + "loss": 0.001, + "step": 3147 + }, + { + "epoch": 2.06, + "grad_norm": 0.008720333687961102, + "learning_rate": 6.7055966335841e-05, + "loss": 0.0004, + "step": 3148 + }, + { + "epoch": 2.06, + "grad_norm": 0.006276703905314207, + "learning_rate": 6.697008791119649e-05, + "loss": 0.0005, + "step": 3149 + }, + { + "epoch": 2.06, + "grad_norm": 0.051605816930532455, + "learning_rate": 6.688424870693801e-05, + "loss": 0.0037, + "step": 3150 + }, + { + "epoch": 2.06, + "grad_norm": 0.049970634281635284, + "learning_rate": 6.679844876361293e-05, + "loss": 0.0028, + "step": 3151 + }, + { + "epoch": 2.06, + "grad_norm": 0.004212846979498863, + "learning_rate": 6.671268812175014e-05, + "loss": 0.0004, + "step": 3152 + }, + { + "epoch": 2.06, + "grad_norm": 0.058990731835365295, + "learning_rate": 6.662696682185988e-05, + "loss": 0.003, + "step": 3153 + }, + { + "epoch": 2.06, + "grad_norm": 0.010446381755173206, + "learning_rate": 6.654128490443388e-05, + "loss": 0.0007, + "step": 3154 + }, + { + "epoch": 2.07, + "grad_norm": 0.003605799749493599, + "learning_rate": 6.645564240994524e-05, + "loss": 0.0003, + "step": 3155 + }, + { + "epoch": 2.07, + "grad_norm": 0.02312384359538555, + "learning_rate": 6.637003937884842e-05, + "loss": 0.001, + "step": 3156 + }, + { + "epoch": 2.07, + "grad_norm": 0.01894088089466095, + "learning_rate": 6.628447585157932e-05, + "loss": 0.0011, + "step": 3157 + }, + { + "epoch": 2.07, + "grad_norm": 0.0029064773116260767, + "learning_rate": 6.619895186855501e-05, + "loss": 0.0002, + "step": 3158 + }, + { + "epoch": 2.07, + "grad_norm": 0.0694064125418663, + "learning_rate": 6.611346747017404e-05, + "loss": 0.0027, + "step": 3159 + }, + { + "epoch": 2.07, + "grad_norm": 0.006106645800173283, + "learning_rate": 6.602802269681621e-05, + "loss": 0.0004, + "step": 3160 + }, + { + "epoch": 2.07, + "grad_norm": 0.005483656190335751, + "learning_rate": 6.59426175888426e-05, + "loss": 0.0004, + "step": 3161 + }, + { + "epoch": 2.07, + "grad_norm": 0.005027757957577705, + "learning_rate": 6.585725218659556e-05, + "loss": 0.0003, + "step": 3162 + }, + { + "epoch": 2.07, + "grad_norm": 0.0007672994979657233, + "learning_rate": 6.577192653039866e-05, + "loss": 0.0001, + "step": 3163 + }, + { + "epoch": 2.07, + "grad_norm": 0.0002690361870918423, + "learning_rate": 6.568664066055673e-05, + "loss": 0.0, + "step": 3164 + }, + { + "epoch": 2.07, + "grad_norm": 0.021148182451725006, + "learning_rate": 6.560139461735578e-05, + "loss": 0.0008, + "step": 3165 + }, + { + "epoch": 2.07, + "grad_norm": 0.004046349320560694, + "learning_rate": 6.551618844106309e-05, + "loss": 0.0001, + "step": 3166 + }, + { + "epoch": 2.07, + "grad_norm": 0.0011810839641839266, + "learning_rate": 6.54310221719269e-05, + "loss": 0.0001, + "step": 3167 + }, + { + "epoch": 2.07, + "grad_norm": 0.0018804300343617797, + "learning_rate": 6.53458958501768e-05, + "loss": 0.0002, + "step": 3168 + }, + { + "epoch": 2.07, + "grad_norm": 0.3152017593383789, + "learning_rate": 6.526080951602346e-05, + "loss": 0.0075, + "step": 3169 + }, + { + "epoch": 2.08, + "grad_norm": 0.005373880732804537, + "learning_rate": 6.517576320965865e-05, + "loss": 0.0003, + "step": 3170 + }, + { + "epoch": 2.08, + "grad_norm": 0.004283849615603685, + "learning_rate": 6.50907569712552e-05, + "loss": 0.0003, + "step": 3171 + }, + { + "epoch": 2.08, + "grad_norm": 0.011751600541174412, + "learning_rate": 6.500579084096707e-05, + "loss": 0.0005, + "step": 3172 + }, + { + "epoch": 2.08, + "grad_norm": 0.007891875691711903, + "learning_rate": 6.492086485892923e-05, + "loss": 0.0005, + "step": 3173 + }, + { + "epoch": 2.08, + "grad_norm": 0.0010966439731419086, + "learning_rate": 6.483597906525777e-05, + "loss": 0.0001, + "step": 3174 + }, + { + "epoch": 2.08, + "grad_norm": 0.09881124645471573, + "learning_rate": 6.47511335000496e-05, + "loss": 0.0057, + "step": 3175 + }, + { + "epoch": 2.08, + "grad_norm": 0.035056356340646744, + "learning_rate": 6.466632820338283e-05, + "loss": 0.0009, + "step": 3176 + }, + { + "epoch": 2.08, + "grad_norm": 0.009287077002227306, + "learning_rate": 6.458156321531646e-05, + "loss": 0.0006, + "step": 3177 + }, + { + "epoch": 2.08, + "grad_norm": 0.001837094547227025, + "learning_rate": 6.449683857589049e-05, + "loss": 0.0002, + "step": 3178 + }, + { + "epoch": 2.08, + "grad_norm": 0.0009134543361142278, + "learning_rate": 6.44121543251258e-05, + "loss": 0.0001, + "step": 3179 + }, + { + "epoch": 2.08, + "grad_norm": 0.003934761509299278, + "learning_rate": 6.432751050302425e-05, + "loss": 0.0002, + "step": 3180 + }, + { + "epoch": 2.08, + "grad_norm": 0.04439563676714897, + "learning_rate": 6.424290714956857e-05, + "loss": 0.0019, + "step": 3181 + }, + { + "epoch": 2.08, + "grad_norm": 0.009751858189702034, + "learning_rate": 6.415834430472239e-05, + "loss": 0.0004, + "step": 3182 + }, + { + "epoch": 2.08, + "grad_norm": 0.15930423140525818, + "learning_rate": 6.407382200843026e-05, + "loss": 0.0058, + "step": 3183 + }, + { + "epoch": 2.08, + "grad_norm": 0.0022712252102792263, + "learning_rate": 6.398934030061738e-05, + "loss": 0.0002, + "step": 3184 + }, + { + "epoch": 2.09, + "grad_norm": 0.0385560542345047, + "learning_rate": 6.390489922119e-05, + "loss": 0.0011, + "step": 3185 + }, + { + "epoch": 2.09, + "grad_norm": 0.011401673778891563, + "learning_rate": 6.382049881003509e-05, + "loss": 0.0004, + "step": 3186 + }, + { + "epoch": 2.09, + "grad_norm": 0.0359170064330101, + "learning_rate": 6.373613910702038e-05, + "loss": 0.0013, + "step": 3187 + }, + { + "epoch": 2.09, + "grad_norm": 0.24599847197532654, + "learning_rate": 6.365182015199442e-05, + "loss": 0.0149, + "step": 3188 + }, + { + "epoch": 2.09, + "grad_norm": 0.010207513347268105, + "learning_rate": 6.35675419847865e-05, + "loss": 0.0004, + "step": 3189 + }, + { + "epoch": 2.09, + "grad_norm": 0.003619662718847394, + "learning_rate": 6.348330464520663e-05, + "loss": 0.0002, + "step": 3190 + }, + { + "epoch": 2.09, + "grad_norm": 0.08463262021541595, + "learning_rate": 6.339910817304553e-05, + "loss": 0.0019, + "step": 3191 + }, + { + "epoch": 2.09, + "grad_norm": 0.001637775101698935, + "learning_rate": 6.331495260807471e-05, + "loss": 0.0001, + "step": 3192 + }, + { + "epoch": 2.09, + "grad_norm": 0.009566172026097775, + "learning_rate": 6.323083799004614e-05, + "loss": 0.0004, + "step": 3193 + }, + { + "epoch": 2.09, + "grad_norm": 0.00526285357773304, + "learning_rate": 6.314676435869267e-05, + "loss": 0.0003, + "step": 3194 + }, + { + "epoch": 2.09, + "grad_norm": 0.0015120654134079814, + "learning_rate": 6.306273175372767e-05, + "loss": 0.0001, + "step": 3195 + }, + { + "epoch": 2.09, + "grad_norm": 0.008848036639392376, + "learning_rate": 6.297874021484518e-05, + "loss": 0.0005, + "step": 3196 + }, + { + "epoch": 2.09, + "grad_norm": 0.4212523102760315, + "learning_rate": 6.28947897817198e-05, + "loss": 0.0358, + "step": 3197 + }, + { + "epoch": 2.09, + "grad_norm": 0.007119151297956705, + "learning_rate": 6.281088049400676e-05, + "loss": 0.0004, + "step": 3198 + }, + { + "epoch": 2.09, + "grad_norm": 0.256693035364151, + "learning_rate": 6.272701239134183e-05, + "loss": 0.0115, + "step": 3199 + }, + { + "epoch": 2.09, + "grad_norm": 0.12740834057331085, + "learning_rate": 6.264318551334132e-05, + "loss": 0.0031, + "step": 3200 + }, + { + "epoch": 2.1, + "grad_norm": 0.0034774895757436752, + "learning_rate": 6.255939989960214e-05, + "loss": 0.0002, + "step": 3201 + }, + { + "epoch": 2.1, + "grad_norm": 0.09209202229976654, + "learning_rate": 6.247565558970152e-05, + "loss": 0.0017, + "step": 3202 + }, + { + "epoch": 2.1, + "grad_norm": 0.017498863860964775, + "learning_rate": 6.239195262319737e-05, + "loss": 0.0008, + "step": 3203 + }, + { + "epoch": 2.1, + "grad_norm": 0.010991438291966915, + "learning_rate": 6.2308291039628e-05, + "loss": 0.0005, + "step": 3204 + }, + { + "epoch": 2.1, + "grad_norm": 0.0019388212822377682, + "learning_rate": 6.222467087851216e-05, + "loss": 0.0002, + "step": 3205 + }, + { + "epoch": 2.1, + "grad_norm": 0.007399784401059151, + "learning_rate": 6.214109217934907e-05, + "loss": 0.0002, + "step": 3206 + }, + { + "epoch": 2.1, + "grad_norm": 0.005549309309571981, + "learning_rate": 6.205755498161833e-05, + "loss": 0.0003, + "step": 3207 + }, + { + "epoch": 2.1, + "grad_norm": 0.4334615170955658, + "learning_rate": 6.197405932477997e-05, + "loss": 0.009, + "step": 3208 + }, + { + "epoch": 2.1, + "grad_norm": 0.004956515040248632, + "learning_rate": 6.189060524827438e-05, + "loss": 0.0003, + "step": 3209 + }, + { + "epoch": 2.1, + "grad_norm": 0.00660488149151206, + "learning_rate": 6.180719279152226e-05, + "loss": 0.0004, + "step": 3210 + }, + { + "epoch": 2.1, + "grad_norm": 0.033332448452711105, + "learning_rate": 6.172382199392477e-05, + "loss": 0.001, + "step": 3211 + }, + { + "epoch": 2.1, + "grad_norm": 0.03414176031947136, + "learning_rate": 6.164049289486323e-05, + "loss": 0.0017, + "step": 3212 + }, + { + "epoch": 2.1, + "grad_norm": 0.006915814708918333, + "learning_rate": 6.155720553369939e-05, + "loss": 0.0005, + "step": 3213 + }, + { + "epoch": 2.1, + "grad_norm": 0.021319087594747543, + "learning_rate": 6.147395994977523e-05, + "loss": 0.0009, + "step": 3214 + }, + { + "epoch": 2.1, + "grad_norm": 0.26830780506134033, + "learning_rate": 6.139075618241305e-05, + "loss": 0.0382, + "step": 3215 + }, + { + "epoch": 2.11, + "grad_norm": 0.002733568660914898, + "learning_rate": 6.130759427091533e-05, + "loss": 0.0002, + "step": 3216 + }, + { + "epoch": 2.11, + "grad_norm": 0.025117401033639908, + "learning_rate": 6.122447425456483e-05, + "loss": 0.0007, + "step": 3217 + }, + { + "epoch": 2.11, + "grad_norm": 0.004128337372094393, + "learning_rate": 6.114139617262447e-05, + "loss": 0.0003, + "step": 3218 + }, + { + "epoch": 2.11, + "grad_norm": 0.047063447535037994, + "learning_rate": 6.105836006433743e-05, + "loss": 0.0015, + "step": 3219 + }, + { + "epoch": 2.11, + "grad_norm": 0.009646626189351082, + "learning_rate": 6.0975365968927036e-05, + "loss": 0.0009, + "step": 3220 + }, + { + "epoch": 2.11, + "grad_norm": 0.053330641239881516, + "learning_rate": 6.0892413925596665e-05, + "loss": 0.0021, + "step": 3221 + }, + { + "epoch": 2.11, + "grad_norm": 0.005851525813341141, + "learning_rate": 6.0809503973529975e-05, + "loss": 0.0003, + "step": 3222 + }, + { + "epoch": 2.11, + "grad_norm": 0.0005984354065731168, + "learning_rate": 6.072663615189069e-05, + "loss": 0.0, + "step": 3223 + }, + { + "epoch": 2.11, + "grad_norm": 0.01633174903690815, + "learning_rate": 6.064381049982262e-05, + "loss": 0.0007, + "step": 3224 + }, + { + "epoch": 2.11, + "grad_norm": 0.055244430899620056, + "learning_rate": 6.0561027056449676e-05, + "loss": 0.0017, + "step": 3225 + }, + { + "epoch": 2.11, + "grad_norm": 0.28780126571655273, + "learning_rate": 6.0478285860875816e-05, + "loss": 0.0105, + "step": 3226 + }, + { + "epoch": 2.11, + "grad_norm": 0.034277625381946564, + "learning_rate": 6.039558695218506e-05, + "loss": 0.001, + "step": 3227 + }, + { + "epoch": 2.11, + "grad_norm": 0.5036815404891968, + "learning_rate": 6.0312930369441414e-05, + "loss": 0.0084, + "step": 3228 + }, + { + "epoch": 2.11, + "grad_norm": 0.024652687832713127, + "learning_rate": 6.0230316151688987e-05, + "loss": 0.0008, + "step": 3229 + }, + { + "epoch": 2.11, + "grad_norm": 0.013688890263438225, + "learning_rate": 6.0147744337951686e-05, + "loss": 0.0005, + "step": 3230 + }, + { + "epoch": 2.12, + "grad_norm": 0.0009207276743836701, + "learning_rate": 6.006521496723359e-05, + "loss": 0.0001, + "step": 3231 + }, + { + "epoch": 2.12, + "grad_norm": 0.007560947444289923, + "learning_rate": 5.9982728078518607e-05, + "loss": 0.0003, + "step": 3232 + }, + { + "epoch": 2.12, + "grad_norm": 0.0022380175068974495, + "learning_rate": 5.9900283710770655e-05, + "loss": 0.0001, + "step": 3233 + }, + { + "epoch": 2.12, + "grad_norm": 0.023539673537015915, + "learning_rate": 5.981788190293349e-05, + "loss": 0.001, + "step": 3234 + }, + { + "epoch": 2.12, + "grad_norm": 0.004317943472415209, + "learning_rate": 5.9735522693930845e-05, + "loss": 0.0001, + "step": 3235 + }, + { + "epoch": 2.12, + "grad_norm": 0.0026261620223522186, + "learning_rate": 5.965320612266628e-05, + "loss": 0.0002, + "step": 3236 + }, + { + "epoch": 2.12, + "grad_norm": 0.004484075587242842, + "learning_rate": 5.957093222802325e-05, + "loss": 0.0003, + "step": 3237 + }, + { + "epoch": 2.12, + "grad_norm": 0.017440563067793846, + "learning_rate": 5.948870104886495e-05, + "loss": 0.0007, + "step": 3238 + }, + { + "epoch": 2.12, + "grad_norm": 0.0010862457565963268, + "learning_rate": 5.940651262403451e-05, + "loss": 0.0001, + "step": 3239 + }, + { + "epoch": 2.12, + "grad_norm": 0.20822399854660034, + "learning_rate": 5.932436699235482e-05, + "loss": 0.0361, + "step": 3240 + }, + { + "epoch": 2.12, + "grad_norm": 0.09969034790992737, + "learning_rate": 5.924226419262859e-05, + "loss": 0.0026, + "step": 3241 + }, + { + "epoch": 2.12, + "grad_norm": 0.0049345120787620544, + "learning_rate": 5.916020426363825e-05, + "loss": 0.0003, + "step": 3242 + }, + { + "epoch": 2.12, + "grad_norm": 0.0024491448421031237, + "learning_rate": 5.907818724414601e-05, + "loss": 0.0002, + "step": 3243 + }, + { + "epoch": 2.12, + "grad_norm": 0.00141709775198251, + "learning_rate": 5.899621317289379e-05, + "loss": 0.0001, + "step": 3244 + }, + { + "epoch": 2.12, + "grad_norm": 0.011614524759352207, + "learning_rate": 5.8914282088603234e-05, + "loss": 0.0007, + "step": 3245 + }, + { + "epoch": 2.13, + "grad_norm": 0.003174105891957879, + "learning_rate": 5.883239402997576e-05, + "loss": 0.0003, + "step": 3246 + }, + { + "epoch": 2.13, + "grad_norm": 0.019315171986818314, + "learning_rate": 5.875054903569225e-05, + "loss": 0.0005, + "step": 3247 + }, + { + "epoch": 2.13, + "grad_norm": 0.018212556838989258, + "learning_rate": 5.866874714441344e-05, + "loss": 0.0006, + "step": 3248 + }, + { + "epoch": 2.13, + "grad_norm": 0.13858914375305176, + "learning_rate": 5.8586988394779635e-05, + "loss": 0.0016, + "step": 3249 + }, + { + "epoch": 2.13, + "grad_norm": 0.0014750031987205148, + "learning_rate": 5.850527282541078e-05, + "loss": 0.0001, + "step": 3250 + }, + { + "epoch": 2.13, + "grad_norm": 0.0020258587319403887, + "learning_rate": 5.8423600474906404e-05, + "loss": 0.0001, + "step": 3251 + }, + { + "epoch": 2.13, + "grad_norm": 0.004569962155073881, + "learning_rate": 5.834197138184563e-05, + "loss": 0.0003, + "step": 3252 + }, + { + "epoch": 2.13, + "grad_norm": 0.48737093806266785, + "learning_rate": 5.826038558478716e-05, + "loss": 0.0686, + "step": 3253 + }, + { + "epoch": 2.13, + "grad_norm": 0.012425919063389301, + "learning_rate": 5.81788431222692e-05, + "loss": 0.0005, + "step": 3254 + }, + { + "epoch": 2.13, + "grad_norm": 0.024924419820308685, + "learning_rate": 5.8097344032809615e-05, + "loss": 0.0005, + "step": 3255 + }, + { + "epoch": 2.13, + "grad_norm": 0.002496515167877078, + "learning_rate": 5.801588835490552e-05, + "loss": 0.0001, + "step": 3256 + }, + { + "epoch": 2.13, + "grad_norm": 0.0027548614889383316, + "learning_rate": 5.79344761270338e-05, + "loss": 0.0002, + "step": 3257 + }, + { + "epoch": 2.13, + "grad_norm": 0.01802532747387886, + "learning_rate": 5.7853107387650675e-05, + "loss": 0.0009, + "step": 3258 + }, + { + "epoch": 2.13, + "grad_norm": 0.015366035513579845, + "learning_rate": 5.7771782175191864e-05, + "loss": 0.0004, + "step": 3259 + }, + { + "epoch": 2.13, + "grad_norm": 0.03580791503190994, + "learning_rate": 5.769050052807249e-05, + "loss": 0.0018, + "step": 3260 + }, + { + "epoch": 2.13, + "grad_norm": 0.11138907074928284, + "learning_rate": 5.760926248468716e-05, + "loss": 0.0019, + "step": 3261 + }, + { + "epoch": 2.14, + "grad_norm": 0.009690427221357822, + "learning_rate": 5.75280680834098e-05, + "loss": 0.0008, + "step": 3262 + }, + { + "epoch": 2.14, + "grad_norm": 0.004451524466276169, + "learning_rate": 5.744691736259386e-05, + "loss": 0.0003, + "step": 3263 + }, + { + "epoch": 2.14, + "grad_norm": 0.007070077117532492, + "learning_rate": 5.736581036057192e-05, + "loss": 0.0005, + "step": 3264 + }, + { + "epoch": 2.14, + "grad_norm": 0.011497410014271736, + "learning_rate": 5.7284747115656134e-05, + "loss": 0.0005, + "step": 3265 + }, + { + "epoch": 2.14, + "grad_norm": 0.3138227164745331, + "learning_rate": 5.720372766613787e-05, + "loss": 0.046, + "step": 3266 + }, + { + "epoch": 2.14, + "grad_norm": 0.12561139464378357, + "learning_rate": 5.712275205028789e-05, + "loss": 0.0281, + "step": 3267 + }, + { + "epoch": 2.14, + "grad_norm": 0.1856832355260849, + "learning_rate": 5.704182030635617e-05, + "loss": 0.0071, + "step": 3268 + }, + { + "epoch": 2.14, + "grad_norm": 0.007879674434661865, + "learning_rate": 5.696093247257201e-05, + "loss": 0.0005, + "step": 3269 + }, + { + "epoch": 2.14, + "grad_norm": 0.02565966546535492, + "learning_rate": 5.688008858714393e-05, + "loss": 0.0013, + "step": 3270 + }, + { + "epoch": 2.14, + "grad_norm": 0.363846093416214, + "learning_rate": 5.679928868825974e-05, + "loss": 0.0104, + "step": 3271 + }, + { + "epoch": 2.14, + "grad_norm": 0.6184598803520203, + "learning_rate": 5.67185328140865e-05, + "loss": 0.0094, + "step": 3272 + }, + { + "epoch": 2.14, + "grad_norm": 0.05976525694131851, + "learning_rate": 5.66378210027703e-05, + "loss": 0.0038, + "step": 3273 + }, + { + "epoch": 2.14, + "grad_norm": 0.01400262676179409, + "learning_rate": 5.65571532924366e-05, + "loss": 0.0007, + "step": 3274 + }, + { + "epoch": 2.14, + "grad_norm": 0.05980502441525459, + "learning_rate": 5.6476529721189974e-05, + "loss": 0.0024, + "step": 3275 + }, + { + "epoch": 2.14, + "grad_norm": 0.01195542048662901, + "learning_rate": 5.639595032711411e-05, + "loss": 0.001, + "step": 3276 + }, + { + "epoch": 2.15, + "grad_norm": 0.012757666409015656, + "learning_rate": 5.63154151482719e-05, + "loss": 0.0012, + "step": 3277 + }, + { + "epoch": 2.15, + "grad_norm": 0.010214698500931263, + "learning_rate": 5.6234924222705255e-05, + "loss": 0.0011, + "step": 3278 + }, + { + "epoch": 2.15, + "grad_norm": 0.5173309445381165, + "learning_rate": 5.615447758843526e-05, + "loss": 0.0079, + "step": 3279 + }, + { + "epoch": 2.15, + "grad_norm": 0.0016734300879761577, + "learning_rate": 5.6074075283462074e-05, + "loss": 0.0002, + "step": 3280 + }, + { + "epoch": 2.15, + "grad_norm": 0.018250368535518646, + "learning_rate": 5.59937173457649e-05, + "loss": 0.001, + "step": 3281 + }, + { + "epoch": 2.15, + "grad_norm": 0.04990352690219879, + "learning_rate": 5.5913403813301914e-05, + "loss": 0.0017, + "step": 3282 + }, + { + "epoch": 2.15, + "grad_norm": 0.011470275931060314, + "learning_rate": 5.583313472401041e-05, + "loss": 0.001, + "step": 3283 + }, + { + "epoch": 2.15, + "grad_norm": 0.029932580888271332, + "learning_rate": 5.575291011580666e-05, + "loss": 0.0019, + "step": 3284 + }, + { + "epoch": 2.15, + "grad_norm": 0.007193171884864569, + "learning_rate": 5.567273002658594e-05, + "loss": 0.0006, + "step": 3285 + }, + { + "epoch": 2.15, + "grad_norm": 0.06624811887741089, + "learning_rate": 5.5592594494222465e-05, + "loss": 0.003, + "step": 3286 + }, + { + "epoch": 2.15, + "grad_norm": 0.005450272001326084, + "learning_rate": 5.5512503556569435e-05, + "loss": 0.0005, + "step": 3287 + }, + { + "epoch": 2.15, + "grad_norm": 0.0019293631194159389, + "learning_rate": 5.5432457251458946e-05, + "loss": 0.0002, + "step": 3288 + }, + { + "epoch": 2.15, + "grad_norm": 0.007713042665272951, + "learning_rate": 5.535245561670204e-05, + "loss": 0.0006, + "step": 3289 + }, + { + "epoch": 2.15, + "grad_norm": 0.0062887719832360744, + "learning_rate": 5.5272498690088724e-05, + "loss": 0.0005, + "step": 3290 + }, + { + "epoch": 2.15, + "grad_norm": 0.004071138799190521, + "learning_rate": 5.51925865093877e-05, + "loss": 0.0004, + "step": 3291 + }, + { + "epoch": 2.16, + "grad_norm": 0.006920125335454941, + "learning_rate": 5.5112719112346686e-05, + "loss": 0.0005, + "step": 3292 + }, + { + "epoch": 2.16, + "grad_norm": 0.04592970013618469, + "learning_rate": 5.5032896536692214e-05, + "loss": 0.0025, + "step": 3293 + }, + { + "epoch": 2.16, + "grad_norm": 0.05991462245583534, + "learning_rate": 5.495311882012966e-05, + "loss": 0.0019, + "step": 3294 + }, + { + "epoch": 2.16, + "grad_norm": 0.004764164332300425, + "learning_rate": 5.4873386000343154e-05, + "loss": 0.0004, + "step": 3295 + }, + { + "epoch": 2.16, + "grad_norm": 0.02527697943150997, + "learning_rate": 5.4793698114995685e-05, + "loss": 0.0016, + "step": 3296 + }, + { + "epoch": 2.16, + "grad_norm": 0.0032181846909224987, + "learning_rate": 5.471405520172896e-05, + "loss": 0.0003, + "step": 3297 + }, + { + "epoch": 2.16, + "grad_norm": 0.0038431212306022644, + "learning_rate": 5.463445729816352e-05, + "loss": 0.0004, + "step": 3298 + }, + { + "epoch": 2.16, + "grad_norm": 0.0034844218753278255, + "learning_rate": 5.455490444189852e-05, + "loss": 0.0002, + "step": 3299 + }, + { + "epoch": 2.16, + "grad_norm": 0.0034144744277000427, + "learning_rate": 5.447539667051191e-05, + "loss": 0.0002, + "step": 3300 + }, + { + "epoch": 2.16, + "grad_norm": 0.006270056590437889, + "learning_rate": 5.4395934021560375e-05, + "loss": 0.0004, + "step": 3301 + }, + { + "epoch": 2.16, + "grad_norm": 0.019354308024048805, + "learning_rate": 5.4316516532579255e-05, + "loss": 0.0011, + "step": 3302 + }, + { + "epoch": 2.16, + "grad_norm": 0.0037658188957720995, + "learning_rate": 5.423714424108254e-05, + "loss": 0.0003, + "step": 3303 + }, + { + "epoch": 2.16, + "grad_norm": 0.006382027640938759, + "learning_rate": 5.4157817184562894e-05, + "loss": 0.0005, + "step": 3304 + }, + { + "epoch": 2.16, + "grad_norm": 0.0015297923237085342, + "learning_rate": 5.4078535400491594e-05, + "loss": 0.0001, + "step": 3305 + }, + { + "epoch": 2.16, + "grad_norm": 0.020708352327346802, + "learning_rate": 5.399929892631857e-05, + "loss": 0.0013, + "step": 3306 + }, + { + "epoch": 2.16, + "grad_norm": 0.008437351323664188, + "learning_rate": 5.392010779947234e-05, + "loss": 0.0004, + "step": 3307 + }, + { + "epoch": 2.17, + "grad_norm": 0.0008229397935792804, + "learning_rate": 5.384096205735989e-05, + "loss": 0.0001, + "step": 3308 + }, + { + "epoch": 2.17, + "grad_norm": 0.0017296182923018932, + "learning_rate": 5.376186173736694e-05, + "loss": 0.0001, + "step": 3309 + }, + { + "epoch": 2.17, + "grad_norm": 0.0024343221448361874, + "learning_rate": 5.368280687685764e-05, + "loss": 0.0001, + "step": 3310 + }, + { + "epoch": 2.17, + "grad_norm": 0.0007007645908743143, + "learning_rate": 5.360379751317472e-05, + "loss": 0.0001, + "step": 3311 + }, + { + "epoch": 2.17, + "grad_norm": 0.006644636858254671, + "learning_rate": 5.352483368363946e-05, + "loss": 0.0004, + "step": 3312 + }, + { + "epoch": 2.17, + "grad_norm": 0.015100304037332535, + "learning_rate": 5.3445915425551464e-05, + "loss": 0.0005, + "step": 3313 + }, + { + "epoch": 2.17, + "grad_norm": 0.00509124668315053, + "learning_rate": 5.336704277618897e-05, + "loss": 0.0004, + "step": 3314 + }, + { + "epoch": 2.17, + "grad_norm": 0.30099958181381226, + "learning_rate": 5.32882157728086e-05, + "loss": 0.0207, + "step": 3315 + }, + { + "epoch": 2.17, + "grad_norm": 0.0017139858100563288, + "learning_rate": 5.320943445264547e-05, + "loss": 0.0001, + "step": 3316 + }, + { + "epoch": 2.17, + "grad_norm": 0.0012273280881345272, + "learning_rate": 5.313069885291305e-05, + "loss": 0.0001, + "step": 3317 + }, + { + "epoch": 2.17, + "grad_norm": 0.09653129428625107, + "learning_rate": 5.305200901080331e-05, + "loss": 0.0029, + "step": 3318 + }, + { + "epoch": 2.17, + "grad_norm": 0.18940600752830505, + "learning_rate": 5.297336496348646e-05, + "loss": 0.0164, + "step": 3319 + }, + { + "epoch": 2.17, + "grad_norm": 0.21358366310596466, + "learning_rate": 5.2894766748111175e-05, + "loss": 0.0105, + "step": 3320 + }, + { + "epoch": 2.17, + "grad_norm": 0.0016733923694118857, + "learning_rate": 5.281621440180449e-05, + "loss": 0.0001, + "step": 3321 + }, + { + "epoch": 2.17, + "grad_norm": 0.005540360696613789, + "learning_rate": 5.2737707961671736e-05, + "loss": 0.0002, + "step": 3322 + }, + { + "epoch": 2.18, + "grad_norm": 0.007181198336184025, + "learning_rate": 5.265924746479657e-05, + "loss": 0.0004, + "step": 3323 + }, + { + "epoch": 2.18, + "grad_norm": 0.004487216006964445, + "learning_rate": 5.258083294824095e-05, + "loss": 0.0003, + "step": 3324 + }, + { + "epoch": 2.18, + "grad_norm": 0.015010706149041653, + "learning_rate": 5.2502464449045114e-05, + "loss": 0.0009, + "step": 3325 + }, + { + "epoch": 2.18, + "grad_norm": 0.0026959397364407778, + "learning_rate": 5.24241420042276e-05, + "loss": 0.0002, + "step": 3326 + }, + { + "epoch": 2.18, + "grad_norm": 0.004269886761903763, + "learning_rate": 5.234586565078508e-05, + "loss": 0.0002, + "step": 3327 + }, + { + "epoch": 2.18, + "grad_norm": 0.012507877312600613, + "learning_rate": 5.226763542569256e-05, + "loss": 0.0004, + "step": 3328 + }, + { + "epoch": 2.18, + "grad_norm": 0.03117496334016323, + "learning_rate": 5.218945136590322e-05, + "loss": 0.0016, + "step": 3329 + }, + { + "epoch": 2.18, + "grad_norm": 0.00443754717707634, + "learning_rate": 5.2111313508348456e-05, + "loss": 0.0003, + "step": 3330 + }, + { + "epoch": 2.18, + "grad_norm": 0.022561147809028625, + "learning_rate": 5.20332218899378e-05, + "loss": 0.0013, + "step": 3331 + }, + { + "epoch": 2.18, + "grad_norm": 0.01195398811250925, + "learning_rate": 5.195517654755899e-05, + "loss": 0.0004, + "step": 3332 + }, + { + "epoch": 2.18, + "grad_norm": 0.00613982742652297, + "learning_rate": 5.1877177518077845e-05, + "loss": 0.0001, + "step": 3333 + }, + { + "epoch": 2.18, + "grad_norm": 0.011768726631999016, + "learning_rate": 5.1799224838338346e-05, + "loss": 0.0004, + "step": 3334 + }, + { + "epoch": 2.18, + "grad_norm": 0.019761614501476288, + "learning_rate": 5.172131854516265e-05, + "loss": 0.0007, + "step": 3335 + }, + { + "epoch": 2.18, + "grad_norm": 0.0037259513046592474, + "learning_rate": 5.164345867535081e-05, + "loss": 0.0003, + "step": 3336 + }, + { + "epoch": 2.18, + "grad_norm": 0.00216940906830132, + "learning_rate": 5.1565645265681116e-05, + "loss": 0.0001, + "step": 3337 + }, + { + "epoch": 2.19, + "grad_norm": 0.0981471836566925, + "learning_rate": 5.148787835290986e-05, + "loss": 0.0043, + "step": 3338 + }, + { + "epoch": 2.19, + "grad_norm": 0.012019110843539238, + "learning_rate": 5.141015797377138e-05, + "loss": 0.0002, + "step": 3339 + }, + { + "epoch": 2.19, + "grad_norm": 0.005617059301584959, + "learning_rate": 5.133248416497803e-05, + "loss": 0.0002, + "step": 3340 + }, + { + "epoch": 2.19, + "grad_norm": 0.008156144991517067, + "learning_rate": 5.125485696322016e-05, + "loss": 0.0004, + "step": 3341 + }, + { + "epoch": 2.19, + "grad_norm": 0.24338524043560028, + "learning_rate": 5.1177276405166104e-05, + "loss": 0.0372, + "step": 3342 + }, + { + "epoch": 2.19, + "grad_norm": 0.005476477090269327, + "learning_rate": 5.109974252746219e-05, + "loss": 0.0003, + "step": 3343 + }, + { + "epoch": 2.19, + "grad_norm": 0.005420641973614693, + "learning_rate": 5.102225536673268e-05, + "loss": 0.0004, + "step": 3344 + }, + { + "epoch": 2.19, + "grad_norm": 0.007410378195345402, + "learning_rate": 5.094481495957968e-05, + "loss": 0.0005, + "step": 3345 + }, + { + "epoch": 2.19, + "grad_norm": 0.021108785644173622, + "learning_rate": 5.086742134258336e-05, + "loss": 0.0013, + "step": 3346 + }, + { + "epoch": 2.19, + "grad_norm": 0.014878311194479465, + "learning_rate": 5.0790074552301696e-05, + "loss": 0.0006, + "step": 3347 + }, + { + "epoch": 2.19, + "grad_norm": 0.14904099702835083, + "learning_rate": 5.071277462527056e-05, + "loss": 0.0026, + "step": 3348 + }, + { + "epoch": 2.19, + "grad_norm": 0.010886380448937416, + "learning_rate": 5.0635521598003733e-05, + "loss": 0.0005, + "step": 3349 + }, + { + "epoch": 2.19, + "grad_norm": 0.013503280468285084, + "learning_rate": 5.055831550699279e-05, + "loss": 0.0008, + "step": 3350 + }, + { + "epoch": 2.19, + "grad_norm": 0.00360480067320168, + "learning_rate": 5.048115638870714e-05, + "loss": 0.0002, + "step": 3351 + }, + { + "epoch": 2.19, + "grad_norm": 0.03083737939596176, + "learning_rate": 5.040404427959408e-05, + "loss": 0.0008, + "step": 3352 + }, + { + "epoch": 2.2, + "grad_norm": 0.03238225355744362, + "learning_rate": 5.032697921607851e-05, + "loss": 0.001, + "step": 3353 + }, + { + "epoch": 2.2, + "grad_norm": 0.171664297580719, + "learning_rate": 5.024996123456331e-05, + "loss": 0.004, + "step": 3354 + }, + { + "epoch": 2.2, + "grad_norm": 0.0031998453196138144, + "learning_rate": 5.017299037142903e-05, + "loss": 0.0002, + "step": 3355 + }, + { + "epoch": 2.2, + "grad_norm": 0.0019488211255520582, + "learning_rate": 5.0096066663033976e-05, + "loss": 0.0001, + "step": 3356 + }, + { + "epoch": 2.2, + "grad_norm": 0.010270226746797562, + "learning_rate": 5.001919014571418e-05, + "loss": 0.0006, + "step": 3357 + }, + { + "epoch": 2.2, + "grad_norm": 0.06024489179253578, + "learning_rate": 4.994236085578339e-05, + "loss": 0.0015, + "step": 3358 + }, + { + "epoch": 2.2, + "grad_norm": 0.05829927697777748, + "learning_rate": 4.9865578829533035e-05, + "loss": 0.0012, + "step": 3359 + }, + { + "epoch": 2.2, + "grad_norm": 0.1129850447177887, + "learning_rate": 4.978884410323222e-05, + "loss": 0.0095, + "step": 3360 + }, + { + "epoch": 2.2, + "grad_norm": 0.018833568319678307, + "learning_rate": 4.971215671312775e-05, + "loss": 0.0005, + "step": 3361 + }, + { + "epoch": 2.2, + "grad_norm": 0.011341112665832043, + "learning_rate": 4.963551669544395e-05, + "loss": 0.0003, + "step": 3362 + }, + { + "epoch": 2.2, + "grad_norm": 0.040290217846632004, + "learning_rate": 4.955892408638288e-05, + "loss": 0.0013, + "step": 3363 + }, + { + "epoch": 2.2, + "grad_norm": 0.0034086050000041723, + "learning_rate": 4.9482378922124165e-05, + "loss": 0.0001, + "step": 3364 + }, + { + "epoch": 2.2, + "grad_norm": 0.013514001853764057, + "learning_rate": 4.940588123882506e-05, + "loss": 0.0005, + "step": 3365 + }, + { + "epoch": 2.2, + "grad_norm": 0.12084479629993439, + "learning_rate": 4.9329431072620316e-05, + "loss": 0.0026, + "step": 3366 + }, + { + "epoch": 2.2, + "grad_norm": 0.003915491513907909, + "learning_rate": 4.92530284596223e-05, + "loss": 0.0002, + "step": 3367 + }, + { + "epoch": 2.2, + "grad_norm": 0.33545753359794617, + "learning_rate": 4.917667343592089e-05, + "loss": 0.0066, + "step": 3368 + }, + { + "epoch": 2.21, + "grad_norm": 0.01727052591741085, + "learning_rate": 4.910036603758351e-05, + "loss": 0.0004, + "step": 3369 + }, + { + "epoch": 2.21, + "grad_norm": 0.03260132670402527, + "learning_rate": 4.902410630065511e-05, + "loss": 0.0007, + "step": 3370 + }, + { + "epoch": 2.21, + "grad_norm": 0.028285512700676918, + "learning_rate": 4.8947894261157974e-05, + "loss": 0.0007, + "step": 3371 + }, + { + "epoch": 2.21, + "grad_norm": 0.005791544448584318, + "learning_rate": 4.887172995509202e-05, + "loss": 0.0003, + "step": 3372 + }, + { + "epoch": 2.21, + "grad_norm": 0.005334790796041489, + "learning_rate": 4.879561341843458e-05, + "loss": 0.0002, + "step": 3373 + }, + { + "epoch": 2.21, + "grad_norm": 0.00440826965495944, + "learning_rate": 4.8719544687140395e-05, + "loss": 0.0003, + "step": 3374 + }, + { + "epoch": 2.21, + "grad_norm": 0.0013330039801076055, + "learning_rate": 4.864352379714163e-05, + "loss": 0.0001, + "step": 3375 + }, + { + "epoch": 2.21, + "grad_norm": 0.019174210727214813, + "learning_rate": 4.8567550784347856e-05, + "loss": 0.0006, + "step": 3376 + }, + { + "epoch": 2.21, + "grad_norm": 0.0014926824951544404, + "learning_rate": 4.849162568464605e-05, + "loss": 0.0001, + "step": 3377 + }, + { + "epoch": 2.21, + "grad_norm": 0.0045569590292871, + "learning_rate": 4.8415748533900536e-05, + "loss": 0.0003, + "step": 3378 + }, + { + "epoch": 2.21, + "grad_norm": 0.003635370172560215, + "learning_rate": 4.833991936795301e-05, + "loss": 0.0002, + "step": 3379 + }, + { + "epoch": 2.21, + "grad_norm": 0.014120995998382568, + "learning_rate": 4.826413822262242e-05, + "loss": 0.0007, + "step": 3380 + }, + { + "epoch": 2.21, + "grad_norm": 0.0014756955206394196, + "learning_rate": 4.818840513370511e-05, + "loss": 0.0001, + "step": 3381 + }, + { + "epoch": 2.21, + "grad_norm": 0.15659062564373016, + "learning_rate": 4.8112720136974707e-05, + "loss": 0.0119, + "step": 3382 + }, + { + "epoch": 2.21, + "grad_norm": 0.01393632311373949, + "learning_rate": 4.8037083268182145e-05, + "loss": 0.0003, + "step": 3383 + }, + { + "epoch": 2.22, + "grad_norm": 0.003948344849050045, + "learning_rate": 4.796149456305557e-05, + "loss": 0.0002, + "step": 3384 + }, + { + "epoch": 2.22, + "grad_norm": 0.014117347076535225, + "learning_rate": 4.7885954057300426e-05, + "loss": 0.0004, + "step": 3385 + }, + { + "epoch": 2.22, + "grad_norm": 0.6760991811752319, + "learning_rate": 4.781046178659937e-05, + "loss": 0.0252, + "step": 3386 + }, + { + "epoch": 2.22, + "grad_norm": 0.0023096229415386915, + "learning_rate": 4.77350177866123e-05, + "loss": 0.0001, + "step": 3387 + }, + { + "epoch": 2.22, + "grad_norm": 0.017435627058148384, + "learning_rate": 4.7659622092976205e-05, + "loss": 0.0008, + "step": 3388 + }, + { + "epoch": 2.22, + "grad_norm": 0.045440226793289185, + "learning_rate": 4.758427474130539e-05, + "loss": 0.0013, + "step": 3389 + }, + { + "epoch": 2.22, + "grad_norm": 0.46780478954315186, + "learning_rate": 4.750897576719126e-05, + "loss": 0.0152, + "step": 3390 + }, + { + "epoch": 2.22, + "grad_norm": 0.022039229050278664, + "learning_rate": 4.743372520620238e-05, + "loss": 0.0009, + "step": 3391 + }, + { + "epoch": 2.22, + "grad_norm": 0.0053681363351643085, + "learning_rate": 4.7358523093884454e-05, + "loss": 0.0003, + "step": 3392 + }, + { + "epoch": 2.22, + "grad_norm": 0.005050495266914368, + "learning_rate": 4.728336946576031e-05, + "loss": 0.0003, + "step": 3393 + }, + { + "epoch": 2.22, + "grad_norm": 0.009374149143695831, + "learning_rate": 4.720826435732982e-05, + "loss": 0.0004, + "step": 3394 + }, + { + "epoch": 2.22, + "grad_norm": 0.03830377757549286, + "learning_rate": 4.713320780406999e-05, + "loss": 0.0017, + "step": 3395 + }, + { + "epoch": 2.22, + "grad_norm": 0.001017848146148026, + "learning_rate": 4.705819984143493e-05, + "loss": 0.0001, + "step": 3396 + }, + { + "epoch": 2.22, + "grad_norm": 0.005420726258307695, + "learning_rate": 4.6983240504855635e-05, + "loss": 0.0003, + "step": 3397 + }, + { + "epoch": 2.22, + "grad_norm": 0.0031959994230419397, + "learning_rate": 4.690832982974028e-05, + "loss": 0.0002, + "step": 3398 + }, + { + "epoch": 2.23, + "grad_norm": 0.0015769600868225098, + "learning_rate": 4.683346785147403e-05, + "loss": 0.0001, + "step": 3399 + }, + { + "epoch": 2.23, + "grad_norm": 0.004441719967871904, + "learning_rate": 4.675865460541903e-05, + "loss": 0.0002, + "step": 3400 + }, + { + "epoch": 2.23, + "grad_norm": 0.001989895710721612, + "learning_rate": 4.6683890126914383e-05, + "loss": 0.0001, + "step": 3401 + }, + { + "epoch": 2.23, + "grad_norm": 0.037345364689826965, + "learning_rate": 4.660917445127619e-05, + "loss": 0.0018, + "step": 3402 + }, + { + "epoch": 2.23, + "grad_norm": 0.07446533441543579, + "learning_rate": 4.653450761379749e-05, + "loss": 0.0026, + "step": 3403 + }, + { + "epoch": 2.23, + "grad_norm": 0.0011752662248909473, + "learning_rate": 4.6459889649748236e-05, + "loss": 0.0001, + "step": 3404 + }, + { + "epoch": 2.23, + "grad_norm": 0.07857818901538849, + "learning_rate": 4.6385320594375365e-05, + "loss": 0.003, + "step": 3405 + }, + { + "epoch": 2.23, + "grad_norm": 0.008967617526650429, + "learning_rate": 4.6310800482902554e-05, + "loss": 0.0006, + "step": 3406 + }, + { + "epoch": 2.23, + "grad_norm": 0.0074556064791977406, + "learning_rate": 4.623632935053052e-05, + "loss": 0.0003, + "step": 3407 + }, + { + "epoch": 2.23, + "grad_norm": 0.0027007856406271458, + "learning_rate": 4.616190723243677e-05, + "loss": 0.0001, + "step": 3408 + }, + { + "epoch": 2.23, + "grad_norm": 0.04171226918697357, + "learning_rate": 4.608753416377569e-05, + "loss": 0.0013, + "step": 3409 + }, + { + "epoch": 2.23, + "grad_norm": 0.042179174721241, + "learning_rate": 4.601321017967846e-05, + "loss": 0.0013, + "step": 3410 + }, + { + "epoch": 2.23, + "grad_norm": 0.011021027341485023, + "learning_rate": 4.593893531525312e-05, + "loss": 0.0006, + "step": 3411 + }, + { + "epoch": 2.23, + "grad_norm": 0.0014699314488098025, + "learning_rate": 4.586470960558444e-05, + "loss": 0.0001, + "step": 3412 + }, + { + "epoch": 2.23, + "grad_norm": 0.45374175906181335, + "learning_rate": 4.579053308573412e-05, + "loss": 0.0066, + "step": 3413 + }, + { + "epoch": 2.24, + "grad_norm": 0.47986674308776855, + "learning_rate": 4.571640579074037e-05, + "loss": 0.0349, + "step": 3414 + }, + { + "epoch": 2.24, + "grad_norm": 0.006492975167930126, + "learning_rate": 4.564232775561841e-05, + "loss": 0.0002, + "step": 3415 + }, + { + "epoch": 2.24, + "grad_norm": 0.0005235617863945663, + "learning_rate": 4.556829901536e-05, + "loss": 0.0, + "step": 3416 + }, + { + "epoch": 2.24, + "grad_norm": 0.0010916460305452347, + "learning_rate": 4.549431960493371e-05, + "loss": 0.0, + "step": 3417 + }, + { + "epoch": 2.24, + "grad_norm": 0.09111440181732178, + "learning_rate": 4.542038955928479e-05, + "loss": 0.0035, + "step": 3418 + }, + { + "epoch": 2.24, + "grad_norm": 0.022540340200066566, + "learning_rate": 4.5346508913335195e-05, + "loss": 0.0005, + "step": 3419 + }, + { + "epoch": 2.24, + "grad_norm": 0.0051037706434726715, + "learning_rate": 4.527267770198352e-05, + "loss": 0.0002, + "step": 3420 + }, + { + "epoch": 2.24, + "grad_norm": 0.04126209765672684, + "learning_rate": 4.519889596010499e-05, + "loss": 0.0011, + "step": 3421 + }, + { + "epoch": 2.24, + "grad_norm": 0.007015510927885771, + "learning_rate": 4.5125163722551486e-05, + "loss": 0.0002, + "step": 3422 + }, + { + "epoch": 2.24, + "grad_norm": 0.13553068041801453, + "learning_rate": 4.5051481024151534e-05, + "loss": 0.004, + "step": 3423 + }, + { + "epoch": 2.24, + "grad_norm": 0.6862461566925049, + "learning_rate": 4.497784789971023e-05, + "loss": 0.0244, + "step": 3424 + }, + { + "epoch": 2.24, + "grad_norm": 0.00551761407405138, + "learning_rate": 4.4904264384009195e-05, + "loss": 0.0003, + "step": 3425 + }, + { + "epoch": 2.24, + "grad_norm": 0.0017294659046456218, + "learning_rate": 4.483073051180668e-05, + "loss": 0.0001, + "step": 3426 + }, + { + "epoch": 2.24, + "grad_norm": 0.009084143675863743, + "learning_rate": 4.475724631783754e-05, + "loss": 0.0004, + "step": 3427 + }, + { + "epoch": 2.24, + "grad_norm": 0.09426633268594742, + "learning_rate": 4.468381183681303e-05, + "loss": 0.0021, + "step": 3428 + }, + { + "epoch": 2.24, + "grad_norm": 0.005611674394458532, + "learning_rate": 4.4610427103421045e-05, + "loss": 0.0003, + "step": 3429 + }, + { + "epoch": 2.25, + "grad_norm": 0.052825383841991425, + "learning_rate": 4.45370921523259e-05, + "loss": 0.0012, + "step": 3430 + }, + { + "epoch": 2.25, + "grad_norm": 0.012692754156887531, + "learning_rate": 4.4463807018168455e-05, + "loss": 0.0003, + "step": 3431 + }, + { + "epoch": 2.25, + "grad_norm": 0.0013258426915854216, + "learning_rate": 4.4390571735565975e-05, + "loss": 0.0001, + "step": 3432 + }, + { + "epoch": 2.25, + "grad_norm": 0.0030224043875932693, + "learning_rate": 4.4317386339112295e-05, + "loss": 0.0002, + "step": 3433 + }, + { + "epoch": 2.25, + "grad_norm": 0.001615703571587801, + "learning_rate": 4.424425086337749e-05, + "loss": 0.0001, + "step": 3434 + }, + { + "epoch": 2.25, + "grad_norm": 0.005948258098214865, + "learning_rate": 4.417116534290818e-05, + "loss": 0.0002, + "step": 3435 + }, + { + "epoch": 2.25, + "grad_norm": 0.0012952801771461964, + "learning_rate": 4.4098129812227425e-05, + "loss": 0.0001, + "step": 3436 + }, + { + "epoch": 2.25, + "grad_norm": 0.0017517295200377703, + "learning_rate": 4.4025144305834595e-05, + "loss": 0.0001, + "step": 3437 + }, + { + "epoch": 2.25, + "grad_norm": 0.03244396671652794, + "learning_rate": 4.3952208858205465e-05, + "loss": 0.001, + "step": 3438 + }, + { + "epoch": 2.25, + "eval_loss": 0.048608824610710144, + "eval_runtime": 40.0231, + "eval_samples_per_second": 32.156, + "eval_steps_per_second": 8.045, + "step": 3438 + }, + { + "epoch": 2.25, + "grad_norm": 0.009011611342430115, + "learning_rate": 4.3879323503792125e-05, + "loss": 0.0004, + "step": 3439 + }, + { + "epoch": 2.25, + "grad_norm": 0.0014130481285974383, + "learning_rate": 4.380648827702307e-05, + "loss": 0.0001, + "step": 3440 + }, + { + "epoch": 2.25, + "grad_norm": 0.014316780492663383, + "learning_rate": 4.37337032123031e-05, + "loss": 0.0004, + "step": 3441 + }, + { + "epoch": 2.25, + "grad_norm": 0.005868827924132347, + "learning_rate": 4.366096834401321e-05, + "loss": 0.0003, + "step": 3442 + }, + { + "epoch": 2.25, + "grad_norm": 0.04225154593586922, + "learning_rate": 4.358828370651083e-05, + "loss": 0.0008, + "step": 3443 + }, + { + "epoch": 2.25, + "grad_norm": 0.011469284072518349, + "learning_rate": 4.3515649334129596e-05, + "loss": 0.0004, + "step": 3444 + }, + { + "epoch": 2.26, + "grad_norm": 0.1543962061405182, + "learning_rate": 4.3443065261179406e-05, + "loss": 0.0064, + "step": 3445 + }, + { + "epoch": 2.26, + "grad_norm": 0.0014764603693038225, + "learning_rate": 4.3370531521946404e-05, + "loss": 0.0001, + "step": 3446 + }, + { + "epoch": 2.26, + "grad_norm": 0.0014661421300843358, + "learning_rate": 4.329804815069298e-05, + "loss": 0.0001, + "step": 3447 + }, + { + "epoch": 2.26, + "grad_norm": 0.004160303622484207, + "learning_rate": 4.322561518165766e-05, + "loss": 0.0002, + "step": 3448 + }, + { + "epoch": 2.26, + "grad_norm": 0.002379069570451975, + "learning_rate": 4.3153232649055245e-05, + "loss": 0.0001, + "step": 3449 + }, + { + "epoch": 2.26, + "grad_norm": 0.038400594145059586, + "learning_rate": 4.308090058707673e-05, + "loss": 0.0008, + "step": 3450 + }, + { + "epoch": 2.26, + "grad_norm": 0.004147912375628948, + "learning_rate": 4.300861902988909e-05, + "loss": 0.0002, + "step": 3451 + }, + { + "epoch": 2.26, + "grad_norm": 0.32823702692985535, + "learning_rate": 4.293638801163564e-05, + "loss": 0.0094, + "step": 3452 + }, + { + "epoch": 2.26, + "grad_norm": 0.005656527355313301, + "learning_rate": 4.286420756643574e-05, + "loss": 0.0002, + "step": 3453 + }, + { + "epoch": 2.26, + "grad_norm": 0.0018892742227762938, + "learning_rate": 4.2792077728384885e-05, + "loss": 0.0001, + "step": 3454 + }, + { + "epoch": 2.26, + "grad_norm": 0.07962115854024887, + "learning_rate": 4.271999853155464e-05, + "loss": 0.0006, + "step": 3455 + }, + { + "epoch": 2.26, + "grad_norm": 0.0006132858688943088, + "learning_rate": 4.264797000999267e-05, + "loss": 0.0, + "step": 3456 + }, + { + "epoch": 2.26, + "grad_norm": 0.005851376336067915, + "learning_rate": 4.25759921977227e-05, + "loss": 0.0002, + "step": 3457 + }, + { + "epoch": 2.26, + "grad_norm": 0.0033673509024083614, + "learning_rate": 4.2504065128744484e-05, + "loss": 0.0002, + "step": 3458 + }, + { + "epoch": 2.26, + "grad_norm": 0.006688028573989868, + "learning_rate": 4.2432188837033856e-05, + "loss": 0.0002, + "step": 3459 + }, + { + "epoch": 2.27, + "grad_norm": 0.0007831354159861803, + "learning_rate": 4.236036335654256e-05, + "loss": 0.0001, + "step": 3460 + }, + { + "epoch": 2.27, + "grad_norm": 0.004538694396615028, + "learning_rate": 4.228858872119843e-05, + "loss": 0.0002, + "step": 3461 + }, + { + "epoch": 2.27, + "grad_norm": 0.02458208240568638, + "learning_rate": 4.221686496490529e-05, + "loss": 0.0006, + "step": 3462 + }, + { + "epoch": 2.27, + "grad_norm": 0.0024234687443822622, + "learning_rate": 4.214519212154284e-05, + "loss": 0.0001, + "step": 3463 + }, + { + "epoch": 2.27, + "grad_norm": 0.0023766474332660437, + "learning_rate": 4.2073570224966856e-05, + "loss": 0.0001, + "step": 3464 + }, + { + "epoch": 2.27, + "grad_norm": 0.21312859654426575, + "learning_rate": 4.2001999309008935e-05, + "loss": 0.0041, + "step": 3465 + }, + { + "epoch": 2.27, + "grad_norm": 0.012594955042004585, + "learning_rate": 4.1930479407476655e-05, + "loss": 0.0004, + "step": 3466 + }, + { + "epoch": 2.27, + "grad_norm": 0.00916473288089037, + "learning_rate": 4.185901055415349e-05, + "loss": 0.0002, + "step": 3467 + }, + { + "epoch": 2.27, + "grad_norm": 0.006125182844698429, + "learning_rate": 4.178759278279883e-05, + "loss": 0.0003, + "step": 3468 + }, + { + "epoch": 2.27, + "grad_norm": 0.0005331359570845962, + "learning_rate": 4.171622612714783e-05, + "loss": 0.0, + "step": 3469 + }, + { + "epoch": 2.27, + "grad_norm": 0.005559367593377829, + "learning_rate": 4.164491062091156e-05, + "loss": 0.0002, + "step": 3470 + }, + { + "epoch": 2.27, + "grad_norm": 0.04052725061774254, + "learning_rate": 4.1573646297776964e-05, + "loss": 0.0009, + "step": 3471 + }, + { + "epoch": 2.27, + "grad_norm": 0.017616767436265945, + "learning_rate": 4.1502433191406794e-05, + "loss": 0.0003, + "step": 3472 + }, + { + "epoch": 2.27, + "grad_norm": 0.002986462553963065, + "learning_rate": 4.143127133543959e-05, + "loss": 0.0001, + "step": 3473 + }, + { + "epoch": 2.27, + "grad_norm": 0.001120403059758246, + "learning_rate": 4.1360160763489676e-05, + "loss": 0.0001, + "step": 3474 + }, + { + "epoch": 2.27, + "grad_norm": 0.0014515554066747427, + "learning_rate": 4.1289101509147175e-05, + "loss": 0.0001, + "step": 3475 + }, + { + "epoch": 2.28, + "grad_norm": 0.4214232861995697, + "learning_rate": 4.1218093605977994e-05, + "loss": 0.0272, + "step": 3476 + }, + { + "epoch": 2.28, + "grad_norm": 0.20730961859226227, + "learning_rate": 4.1147137087523676e-05, + "loss": 0.0118, + "step": 3477 + }, + { + "epoch": 2.28, + "grad_norm": 0.013800989836454391, + "learning_rate": 4.107623198730159e-05, + "loss": 0.0006, + "step": 3478 + }, + { + "epoch": 2.28, + "grad_norm": 0.0016861413605511189, + "learning_rate": 4.100537833880481e-05, + "loss": 0.0001, + "step": 3479 + }, + { + "epoch": 2.28, + "grad_norm": 0.0031050050165504217, + "learning_rate": 4.093457617550207e-05, + "loss": 0.0001, + "step": 3480 + }, + { + "epoch": 2.28, + "grad_norm": 0.004541043192148209, + "learning_rate": 4.08638255308378e-05, + "loss": 0.0001, + "step": 3481 + }, + { + "epoch": 2.28, + "grad_norm": 0.004323361441493034, + "learning_rate": 4.0793126438232104e-05, + "loss": 0.0003, + "step": 3482 + }, + { + "epoch": 2.28, + "grad_norm": 0.2170928418636322, + "learning_rate": 4.0722478931080735e-05, + "loss": 0.0182, + "step": 3483 + }, + { + "epoch": 2.28, + "grad_norm": 0.0005836985656060278, + "learning_rate": 4.0651883042755055e-05, + "loss": 0.0, + "step": 3484 + }, + { + "epoch": 2.28, + "grad_norm": 0.015174277126789093, + "learning_rate": 4.058133880660212e-05, + "loss": 0.0006, + "step": 3485 + }, + { + "epoch": 2.28, + "grad_norm": 0.2508314549922943, + "learning_rate": 4.05108462559444e-05, + "loss": 0.0117, + "step": 3486 + }, + { + "epoch": 2.28, + "grad_norm": 0.11943554878234863, + "learning_rate": 4.0440405424080164e-05, + "loss": 0.0015, + "step": 3487 + }, + { + "epoch": 2.28, + "grad_norm": 0.005124766379594803, + "learning_rate": 4.037001634428314e-05, + "loss": 0.0003, + "step": 3488 + }, + { + "epoch": 2.28, + "grad_norm": 0.2578151524066925, + "learning_rate": 4.0299679049802636e-05, + "loss": 0.0195, + "step": 3489 + }, + { + "epoch": 2.28, + "grad_norm": 0.060233354568481445, + "learning_rate": 4.0229393573863506e-05, + "loss": 0.0027, + "step": 3490 + }, + { + "epoch": 2.29, + "grad_norm": 0.20788171887397766, + "learning_rate": 4.0159159949666094e-05, + "loss": 0.0071, + "step": 3491 + }, + { + "epoch": 2.29, + "grad_norm": 0.004099798854440451, + "learning_rate": 4.008897821038629e-05, + "loss": 0.0002, + "step": 3492 + }, + { + "epoch": 2.29, + "grad_norm": 0.10326941311359406, + "learning_rate": 4.001884838917545e-05, + "loss": 0.004, + "step": 3493 + }, + { + "epoch": 2.29, + "grad_norm": 0.03992806747555733, + "learning_rate": 3.994877051916047e-05, + "loss": 0.0013, + "step": 3494 + }, + { + "epoch": 2.29, + "grad_norm": 0.012305202893912792, + "learning_rate": 3.987874463344356e-05, + "loss": 0.0004, + "step": 3495 + }, + { + "epoch": 2.29, + "grad_norm": 0.0007241340936161578, + "learning_rate": 3.980877076510249e-05, + "loss": 0.0, + "step": 3496 + }, + { + "epoch": 2.29, + "grad_norm": 0.6612746715545654, + "learning_rate": 3.9738848947190464e-05, + "loss": 0.0175, + "step": 3497 + }, + { + "epoch": 2.29, + "grad_norm": 0.004933580290526152, + "learning_rate": 3.966897921273606e-05, + "loss": 0.0002, + "step": 3498 + }, + { + "epoch": 2.29, + "grad_norm": 0.08219257742166519, + "learning_rate": 3.959916159474325e-05, + "loss": 0.0045, + "step": 3499 + }, + { + "epoch": 2.29, + "grad_norm": 0.005811081733554602, + "learning_rate": 3.95293961261914e-05, + "loss": 0.0002, + "step": 3500 + }, + { + "epoch": 2.29, + "grad_norm": 0.006904906593263149, + "learning_rate": 3.945968284003526e-05, + "loss": 0.0003, + "step": 3501 + }, + { + "epoch": 2.29, + "grad_norm": 0.2566499710083008, + "learning_rate": 3.939002176920494e-05, + "loss": 0.0096, + "step": 3502 + }, + { + "epoch": 2.29, + "grad_norm": 0.02261658012866974, + "learning_rate": 3.932041294660579e-05, + "loss": 0.0011, + "step": 3503 + }, + { + "epoch": 2.29, + "grad_norm": 0.6159523129463196, + "learning_rate": 3.925085640511857e-05, + "loss": 0.0249, + "step": 3504 + }, + { + "epoch": 2.29, + "grad_norm": 0.02275974303483963, + "learning_rate": 3.918135217759935e-05, + "loss": 0.0009, + "step": 3505 + }, + { + "epoch": 2.3, + "grad_norm": 0.0028290385380387306, + "learning_rate": 3.911190029687946e-05, + "loss": 0.0002, + "step": 3506 + }, + { + "epoch": 2.3, + "grad_norm": 0.2951640486717224, + "learning_rate": 3.904250079576548e-05, + "loss": 0.0058, + "step": 3507 + }, + { + "epoch": 2.3, + "grad_norm": 0.0186203271150589, + "learning_rate": 3.89731537070393e-05, + "loss": 0.0005, + "step": 3508 + }, + { + "epoch": 2.3, + "grad_norm": 0.00922568142414093, + "learning_rate": 3.8903859063458014e-05, + "loss": 0.0003, + "step": 3509 + }, + { + "epoch": 2.3, + "grad_norm": 0.007727789226919413, + "learning_rate": 3.883461689775396e-05, + "loss": 0.0003, + "step": 3510 + }, + { + "epoch": 2.3, + "grad_norm": 0.4675931930541992, + "learning_rate": 3.8765427242634696e-05, + "loss": 0.0103, + "step": 3511 + }, + { + "epoch": 2.3, + "grad_norm": 0.00173068733420223, + "learning_rate": 3.869629013078292e-05, + "loss": 0.0001, + "step": 3512 + }, + { + "epoch": 2.3, + "grad_norm": 0.13637962937355042, + "learning_rate": 3.862720559485658e-05, + "loss": 0.0053, + "step": 3513 + }, + { + "epoch": 2.3, + "grad_norm": 0.2722899317741394, + "learning_rate": 3.855817366748872e-05, + "loss": 0.0538, + "step": 3514 + }, + { + "epoch": 2.3, + "grad_norm": 0.006585948634892702, + "learning_rate": 3.848919438128768e-05, + "loss": 0.0002, + "step": 3515 + }, + { + "epoch": 2.3, + "grad_norm": 1.2582694292068481, + "learning_rate": 3.8420267768836714e-05, + "loss": 0.0084, + "step": 3516 + }, + { + "epoch": 2.3, + "grad_norm": 0.006589618511497974, + "learning_rate": 3.835139386269435e-05, + "loss": 0.0003, + "step": 3517 + }, + { + "epoch": 2.3, + "grad_norm": 0.0024254857562482357, + "learning_rate": 3.8282572695394183e-05, + "loss": 0.0002, + "step": 3518 + }, + { + "epoch": 2.3, + "grad_norm": 0.07830658555030823, + "learning_rate": 3.8213804299444884e-05, + "loss": 0.0007, + "step": 3519 + }, + { + "epoch": 2.3, + "grad_norm": 0.09882328659296036, + "learning_rate": 3.8145088707330206e-05, + "loss": 0.0064, + "step": 3520 + }, + { + "epoch": 2.31, + "grad_norm": 0.09679798036813736, + "learning_rate": 3.807642595150897e-05, + "loss": 0.0013, + "step": 3521 + }, + { + "epoch": 2.31, + "grad_norm": 0.0025053706485778093, + "learning_rate": 3.800781606441506e-05, + "loss": 0.0001, + "step": 3522 + }, + { + "epoch": 2.31, + "grad_norm": 0.002644827589392662, + "learning_rate": 3.793925907845728e-05, + "loss": 0.0001, + "step": 3523 + }, + { + "epoch": 2.31, + "grad_norm": 0.016901424154639244, + "learning_rate": 3.7870755026019545e-05, + "loss": 0.0007, + "step": 3524 + }, + { + "epoch": 2.31, + "grad_norm": 0.037165265530347824, + "learning_rate": 3.780230393946076e-05, + "loss": 0.0009, + "step": 3525 + }, + { + "epoch": 2.31, + "grad_norm": 0.0009690229198895395, + "learning_rate": 3.773390585111476e-05, + "loss": 0.0, + "step": 3526 + }, + { + "epoch": 2.31, + "grad_norm": 0.021075667813420296, + "learning_rate": 3.76655607932904e-05, + "loss": 0.0011, + "step": 3527 + }, + { + "epoch": 2.31, + "grad_norm": 0.010508038103580475, + "learning_rate": 3.7597268798271475e-05, + "loss": 0.0003, + "step": 3528 + }, + { + "epoch": 2.31, + "grad_norm": 0.003440289059653878, + "learning_rate": 3.752902989831666e-05, + "loss": 0.0003, + "step": 3529 + }, + { + "epoch": 2.31, + "grad_norm": 0.008409742265939713, + "learning_rate": 3.7460844125659675e-05, + "loss": 0.0003, + "step": 3530 + }, + { + "epoch": 2.31, + "grad_norm": 0.017797105014324188, + "learning_rate": 3.7392711512508935e-05, + "loss": 0.0003, + "step": 3531 + }, + { + "epoch": 2.31, + "grad_norm": 0.0017873361939564347, + "learning_rate": 3.7324632091047943e-05, + "loss": 0.0001, + "step": 3532 + }, + { + "epoch": 2.31, + "grad_norm": 0.0432756207883358, + "learning_rate": 3.7256605893435e-05, + "loss": 0.0023, + "step": 3533 + }, + { + "epoch": 2.31, + "grad_norm": 0.012234622612595558, + "learning_rate": 3.718863295180327e-05, + "loss": 0.0008, + "step": 3534 + }, + { + "epoch": 2.31, + "grad_norm": 0.17988906800746918, + "learning_rate": 3.7120713298260766e-05, + "loss": 0.0182, + "step": 3535 + }, + { + "epoch": 2.31, + "grad_norm": 0.007093383464962244, + "learning_rate": 3.7052846964890295e-05, + "loss": 0.0002, + "step": 3536 + }, + { + "epoch": 2.32, + "grad_norm": 0.015323545783758163, + "learning_rate": 3.6985033983749536e-05, + "loss": 0.0004, + "step": 3537 + }, + { + "epoch": 2.32, + "grad_norm": 0.0036853235214948654, + "learning_rate": 3.6917274386870917e-05, + "loss": 0.0002, + "step": 3538 + }, + { + "epoch": 2.32, + "grad_norm": 0.18443378806114197, + "learning_rate": 3.684956820626172e-05, + "loss": 0.0391, + "step": 3539 + }, + { + "epoch": 2.32, + "grad_norm": 0.0005609511281363666, + "learning_rate": 3.6781915473903864e-05, + "loss": 0.0, + "step": 3540 + }, + { + "epoch": 2.32, + "grad_norm": 0.12224981188774109, + "learning_rate": 3.6714316221754126e-05, + "loss": 0.0049, + "step": 3541 + }, + { + "epoch": 2.32, + "grad_norm": 0.010082008317112923, + "learning_rate": 3.664677048174402e-05, + "loss": 0.0003, + "step": 3542 + }, + { + "epoch": 2.32, + "grad_norm": 0.13210326433181763, + "learning_rate": 3.657927828577973e-05, + "loss": 0.0026, + "step": 3543 + }, + { + "epoch": 2.32, + "grad_norm": 0.08009618520736694, + "learning_rate": 3.65118396657422e-05, + "loss": 0.001, + "step": 3544 + }, + { + "epoch": 2.32, + "grad_norm": 0.01351571548730135, + "learning_rate": 3.644445465348703e-05, + "loss": 0.0005, + "step": 3545 + }, + { + "epoch": 2.32, + "grad_norm": 0.00593583332374692, + "learning_rate": 3.637712328084452e-05, + "loss": 0.0002, + "step": 3546 + }, + { + "epoch": 2.32, + "grad_norm": 0.014639221131801605, + "learning_rate": 3.630984557961961e-05, + "loss": 0.0003, + "step": 3547 + }, + { + "epoch": 2.32, + "grad_norm": 0.02833763137459755, + "learning_rate": 3.6242621581591946e-05, + "loss": 0.0011, + "step": 3548 + }, + { + "epoch": 2.32, + "grad_norm": 0.04245695099234581, + "learning_rate": 3.6175451318515686e-05, + "loss": 0.0014, + "step": 3549 + }, + { + "epoch": 2.32, + "grad_norm": 0.026733852922916412, + "learning_rate": 3.6108334822119697e-05, + "loss": 0.0017, + "step": 3550 + }, + { + "epoch": 2.32, + "grad_norm": 0.16322949528694153, + "learning_rate": 3.604127212410748e-05, + "loss": 0.046, + "step": 3551 + }, + { + "epoch": 2.33, + "grad_norm": 0.031389035284519196, + "learning_rate": 3.597426325615702e-05, + "loss": 0.0016, + "step": 3552 + }, + { + "epoch": 2.33, + "grad_norm": 0.0583653524518013, + "learning_rate": 3.590730824992098e-05, + "loss": 0.0021, + "step": 3553 + }, + { + "epoch": 2.33, + "grad_norm": 0.001976636005565524, + "learning_rate": 3.5840407137026474e-05, + "loss": 0.0001, + "step": 3554 + }, + { + "epoch": 2.33, + "grad_norm": 0.20203042030334473, + "learning_rate": 3.5773559949075264e-05, + "loss": 0.035, + "step": 3555 + }, + { + "epoch": 2.33, + "grad_norm": 0.3199298679828644, + "learning_rate": 3.570676671764358e-05, + "loss": 0.0051, + "step": 3556 + }, + { + "epoch": 2.33, + "grad_norm": 0.07311911135911942, + "learning_rate": 3.56400274742822e-05, + "loss": 0.005, + "step": 3557 + }, + { + "epoch": 2.33, + "grad_norm": 0.004137672018259764, + "learning_rate": 3.5573342250516305e-05, + "loss": 0.0003, + "step": 3558 + }, + { + "epoch": 2.33, + "grad_norm": 0.014066854491829872, + "learning_rate": 3.5506711077845675e-05, + "loss": 0.0006, + "step": 3559 + }, + { + "epoch": 2.33, + "grad_norm": 0.03339890018105507, + "learning_rate": 3.5440133987744524e-05, + "loss": 0.0017, + "step": 3560 + }, + { + "epoch": 2.33, + "grad_norm": 0.003349835518747568, + "learning_rate": 3.537361101166147e-05, + "loss": 0.0003, + "step": 3561 + }, + { + "epoch": 2.33, + "grad_norm": 0.01604393683373928, + "learning_rate": 3.530714218101964e-05, + "loss": 0.0012, + "step": 3562 + }, + { + "epoch": 2.33, + "grad_norm": 0.010565409436821938, + "learning_rate": 3.524072752721653e-05, + "loss": 0.0008, + "step": 3563 + }, + { + "epoch": 2.33, + "grad_norm": 0.009340840391814709, + "learning_rate": 3.517436708162411e-05, + "loss": 0.0007, + "step": 3564 + }, + { + "epoch": 2.33, + "grad_norm": 0.017390653491020203, + "learning_rate": 3.5108060875588685e-05, + "loss": 0.0011, + "step": 3565 + }, + { + "epoch": 2.33, + "grad_norm": 0.15557774901390076, + "learning_rate": 3.5041808940430916e-05, + "loss": 0.0227, + "step": 3566 + }, + { + "epoch": 2.34, + "grad_norm": 0.010659299790859222, + "learning_rate": 3.497561130744589e-05, + "loss": 0.0005, + "step": 3567 + }, + { + "epoch": 2.34, + "grad_norm": 0.05340409278869629, + "learning_rate": 3.490946800790302e-05, + "loss": 0.0012, + "step": 3568 + }, + { + "epoch": 2.34, + "grad_norm": 0.006075078155845404, + "learning_rate": 3.484337907304606e-05, + "loss": 0.0005, + "step": 3569 + }, + { + "epoch": 2.34, + "grad_norm": 0.020916730165481567, + "learning_rate": 3.47773445340931e-05, + "loss": 0.0011, + "step": 3570 + }, + { + "epoch": 2.34, + "grad_norm": 0.026928747072815895, + "learning_rate": 3.471136442223647e-05, + "loss": 0.0015, + "step": 3571 + }, + { + "epoch": 2.34, + "grad_norm": 0.002961538266390562, + "learning_rate": 3.464543876864286e-05, + "loss": 0.0002, + "step": 3572 + }, + { + "epoch": 2.34, + "grad_norm": 0.039079513400793076, + "learning_rate": 3.457956760445322e-05, + "loss": 0.0012, + "step": 3573 + }, + { + "epoch": 2.34, + "grad_norm": 0.1257244348526001, + "learning_rate": 3.451375096078279e-05, + "loss": 0.0062, + "step": 3574 + }, + { + "epoch": 2.34, + "grad_norm": 0.21111145615577698, + "learning_rate": 3.444798886872092e-05, + "loss": 0.0322, + "step": 3575 + }, + { + "epoch": 2.34, + "grad_norm": 0.051378391683101654, + "learning_rate": 3.438228135933134e-05, + "loss": 0.0028, + "step": 3576 + }, + { + "epoch": 2.34, + "grad_norm": 0.028272032737731934, + "learning_rate": 3.431662846365194e-05, + "loss": 0.001, + "step": 3577 + }, + { + "epoch": 2.34, + "grad_norm": 0.00982116162776947, + "learning_rate": 3.425103021269482e-05, + "loss": 0.0009, + "step": 3578 + }, + { + "epoch": 2.34, + "grad_norm": 0.012771161273121834, + "learning_rate": 3.41854866374463e-05, + "loss": 0.0009, + "step": 3579 + }, + { + "epoch": 2.34, + "grad_norm": 0.043192602694034576, + "learning_rate": 3.4119997768866806e-05, + "loss": 0.0024, + "step": 3580 + }, + { + "epoch": 2.34, + "grad_norm": 0.08039472997188568, + "learning_rate": 3.405456363789096e-05, + "loss": 0.0051, + "step": 3581 + }, + { + "epoch": 2.35, + "grad_norm": 0.06035568192601204, + "learning_rate": 3.398918427542754e-05, + "loss": 0.0024, + "step": 3582 + }, + { + "epoch": 2.35, + "grad_norm": 0.009197032079100609, + "learning_rate": 3.392385971235946e-05, + "loss": 0.0005, + "step": 3583 + }, + { + "epoch": 2.35, + "grad_norm": 0.036739982664585114, + "learning_rate": 3.3858589979543674e-05, + "loss": 0.0007, + "step": 3584 + }, + { + "epoch": 2.35, + "grad_norm": 0.019024716690182686, + "learning_rate": 3.379337510781129e-05, + "loss": 0.0009, + "step": 3585 + }, + { + "epoch": 2.35, + "grad_norm": 0.101305291056633, + "learning_rate": 3.3728215127967536e-05, + "loss": 0.0058, + "step": 3586 + }, + { + "epoch": 2.35, + "grad_norm": 0.010754559189081192, + "learning_rate": 3.366311007079165e-05, + "loss": 0.0008, + "step": 3587 + }, + { + "epoch": 2.35, + "grad_norm": 0.11008837819099426, + "learning_rate": 3.3598059967036984e-05, + "loss": 0.008, + "step": 3588 + }, + { + "epoch": 2.35, + "grad_norm": 0.0031225886195898056, + "learning_rate": 3.353306484743088e-05, + "loss": 0.0002, + "step": 3589 + }, + { + "epoch": 2.35, + "grad_norm": 0.0102327736094594, + "learning_rate": 3.346812474267472e-05, + "loss": 0.0009, + "step": 3590 + }, + { + "epoch": 2.35, + "grad_norm": 0.037046417593955994, + "learning_rate": 3.340323968344394e-05, + "loss": 0.002, + "step": 3591 + }, + { + "epoch": 2.35, + "grad_norm": 0.10339081287384033, + "learning_rate": 3.333840970038789e-05, + "loss": 0.0062, + "step": 3592 + }, + { + "epoch": 2.35, + "grad_norm": 0.021102124825119972, + "learning_rate": 3.3273634824129995e-05, + "loss": 0.0013, + "step": 3593 + }, + { + "epoch": 2.35, + "grad_norm": 0.0216564629226923, + "learning_rate": 3.320891508526757e-05, + "loss": 0.0006, + "step": 3594 + }, + { + "epoch": 2.35, + "grad_norm": 0.0013801176100969315, + "learning_rate": 3.314425051437197e-05, + "loss": 0.0001, + "step": 3595 + }, + { + "epoch": 2.35, + "grad_norm": 0.04089084267616272, + "learning_rate": 3.307964114198841e-05, + "loss": 0.0017, + "step": 3596 + }, + { + "epoch": 2.35, + "grad_norm": 0.006383365485817194, + "learning_rate": 3.301508699863609e-05, + "loss": 0.0005, + "step": 3597 + }, + { + "epoch": 2.36, + "grad_norm": 0.009137725457549095, + "learning_rate": 3.295058811480808e-05, + "loss": 0.0006, + "step": 3598 + }, + { + "epoch": 2.36, + "grad_norm": 0.011332091875374317, + "learning_rate": 3.2886144520971386e-05, + "loss": 0.0006, + "step": 3599 + }, + { + "epoch": 2.36, + "grad_norm": 0.022123467177152634, + "learning_rate": 3.2821756247566905e-05, + "loss": 0.0013, + "step": 3600 + }, + { + "epoch": 2.36, + "grad_norm": 0.11483735591173172, + "learning_rate": 3.2757423325009295e-05, + "loss": 0.0063, + "step": 3601 + }, + { + "epoch": 2.36, + "grad_norm": 0.00934157520532608, + "learning_rate": 3.2693145783687185e-05, + "loss": 0.0003, + "step": 3602 + }, + { + "epoch": 2.36, + "grad_norm": 0.13960252702236176, + "learning_rate": 3.2628923653963e-05, + "loss": 0.0046, + "step": 3603 + }, + { + "epoch": 2.36, + "grad_norm": 0.046137794852256775, + "learning_rate": 3.2564756966173014e-05, + "loss": 0.0017, + "step": 3604 + }, + { + "epoch": 2.36, + "grad_norm": 0.0008678428130224347, + "learning_rate": 3.250064575062727e-05, + "loss": 0.0001, + "step": 3605 + }, + { + "epoch": 2.36, + "grad_norm": 0.0017144728917628527, + "learning_rate": 3.2436590037609665e-05, + "loss": 0.0001, + "step": 3606 + }, + { + "epoch": 2.36, + "grad_norm": 0.1634555608034134, + "learning_rate": 3.237258985737782e-05, + "loss": 0.0428, + "step": 3607 + }, + { + "epoch": 2.36, + "grad_norm": 0.02324790135025978, + "learning_rate": 3.2308645240163155e-05, + "loss": 0.0007, + "step": 3608 + }, + { + "epoch": 2.36, + "grad_norm": 0.9691700339317322, + "learning_rate": 3.2244756216170905e-05, + "loss": 0.0386, + "step": 3609 + }, + { + "epoch": 2.36, + "grad_norm": 0.016212478280067444, + "learning_rate": 3.218092281557985e-05, + "loss": 0.0007, + "step": 3610 + }, + { + "epoch": 2.36, + "grad_norm": 0.007437903434038162, + "learning_rate": 3.2117145068542713e-05, + "loss": 0.0006, + "step": 3611 + }, + { + "epoch": 2.36, + "grad_norm": 0.014965659938752651, + "learning_rate": 3.205342300518581e-05, + "loss": 0.0005, + "step": 3612 + }, + { + "epoch": 2.37, + "grad_norm": 0.16550156474113464, + "learning_rate": 3.19897566556092e-05, + "loss": 0.0252, + "step": 3613 + }, + { + "epoch": 2.37, + "grad_norm": 0.06881934404373169, + "learning_rate": 3.1926146049886586e-05, + "loss": 0.0031, + "step": 3614 + }, + { + "epoch": 2.37, + "grad_norm": 0.54667729139328, + "learning_rate": 3.1862591218065404e-05, + "loss": 0.0093, + "step": 3615 + }, + { + "epoch": 2.37, + "grad_norm": 0.0035510477609932423, + "learning_rate": 3.179909219016665e-05, + "loss": 0.0002, + "step": 3616 + }, + { + "epoch": 2.37, + "grad_norm": 0.07340116053819656, + "learning_rate": 3.173564899618511e-05, + "loss": 0.0016, + "step": 3617 + }, + { + "epoch": 2.37, + "grad_norm": 0.0016917032189667225, + "learning_rate": 3.167226166608897e-05, + "loss": 0.0001, + "step": 3618 + }, + { + "epoch": 2.37, + "grad_norm": 0.010103181004524231, + "learning_rate": 3.1608930229820276e-05, + "loss": 0.0004, + "step": 3619 + }, + { + "epoch": 2.37, + "grad_norm": 0.01665312983095646, + "learning_rate": 3.1545654717294435e-05, + "loss": 0.0006, + "step": 3620 + }, + { + "epoch": 2.37, + "grad_norm": 0.0030040668789297342, + "learning_rate": 3.148243515840061e-05, + "loss": 0.0002, + "step": 3621 + }, + { + "epoch": 2.37, + "grad_norm": 0.13333337008953094, + "learning_rate": 3.14192715830015e-05, + "loss": 0.0054, + "step": 3622 + }, + { + "epoch": 2.37, + "grad_norm": 0.0021791195031255484, + "learning_rate": 3.13561640209333e-05, + "loss": 0.0002, + "step": 3623 + }, + { + "epoch": 2.37, + "grad_norm": 0.012684706598520279, + "learning_rate": 3.129311250200581e-05, + "loss": 0.0005, + "step": 3624 + }, + { + "epoch": 2.37, + "grad_norm": 0.014608604833483696, + "learning_rate": 3.1230117056002326e-05, + "loss": 0.0008, + "step": 3625 + }, + { + "epoch": 2.37, + "grad_norm": 0.10935472697019577, + "learning_rate": 3.1167177712679684e-05, + "loss": 0.0038, + "step": 3626 + }, + { + "epoch": 2.37, + "grad_norm": 0.02855844236910343, + "learning_rate": 3.110429450176815e-05, + "loss": 0.0008, + "step": 3627 + }, + { + "epoch": 2.38, + "grad_norm": 0.010416747070848942, + "learning_rate": 3.10414674529716e-05, + "loss": 0.0005, + "step": 3628 + }, + { + "epoch": 2.38, + "grad_norm": 0.04126406088471413, + "learning_rate": 3.097869659596721e-05, + "loss": 0.001, + "step": 3629 + }, + { + "epoch": 2.38, + "grad_norm": 0.13651755452156067, + "learning_rate": 3.091598196040576e-05, + "loss": 0.0025, + "step": 3630 + }, + { + "epoch": 2.38, + "grad_norm": 0.0006649987190030515, + "learning_rate": 3.08533235759114e-05, + "loss": 0.0001, + "step": 3631 + }, + { + "epoch": 2.38, + "grad_norm": 0.008126436732709408, + "learning_rate": 3.079072147208173e-05, + "loss": 0.0002, + "step": 3632 + }, + { + "epoch": 2.38, + "grad_norm": 0.07602657377719879, + "learning_rate": 3.072817567848779e-05, + "loss": 0.0036, + "step": 3633 + }, + { + "epoch": 2.38, + "grad_norm": 0.16345715522766113, + "learning_rate": 3.0665686224673966e-05, + "loss": 0.0329, + "step": 3634 + }, + { + "epoch": 2.38, + "grad_norm": 0.002797997323796153, + "learning_rate": 3.060325314015808e-05, + "loss": 0.0002, + "step": 3635 + }, + { + "epoch": 2.38, + "grad_norm": 0.0018658045446500182, + "learning_rate": 3.05408764544313e-05, + "loss": 0.0002, + "step": 3636 + }, + { + "epoch": 2.38, + "grad_norm": 0.0281591285020113, + "learning_rate": 3.0478556196958182e-05, + "loss": 0.0012, + "step": 3637 + }, + { + "epoch": 2.38, + "grad_norm": 0.0049908580258488655, + "learning_rate": 3.0416292397176555e-05, + "loss": 0.0004, + "step": 3638 + }, + { + "epoch": 2.38, + "grad_norm": 0.05512424558401108, + "learning_rate": 3.035408508449766e-05, + "loss": 0.0017, + "step": 3639 + }, + { + "epoch": 2.38, + "grad_norm": 0.009397076442837715, + "learning_rate": 3.029193428830602e-05, + "loss": 0.0006, + "step": 3640 + }, + { + "epoch": 2.38, + "grad_norm": 0.002295385580509901, + "learning_rate": 3.022984003795947e-05, + "loss": 0.0001, + "step": 3641 + }, + { + "epoch": 2.38, + "grad_norm": 0.005197838414460421, + "learning_rate": 3.016780236278913e-05, + "loss": 0.0004, + "step": 3642 + }, + { + "epoch": 2.38, + "grad_norm": 0.46695300936698914, + "learning_rate": 3.0105821292099393e-05, + "loss": 0.011, + "step": 3643 + }, + { + "epoch": 2.39, + "grad_norm": 0.018466414883732796, + "learning_rate": 3.0043896855167938e-05, + "loss": 0.0007, + "step": 3644 + }, + { + "epoch": 2.39, + "grad_norm": 0.08499304205179214, + "learning_rate": 2.998202908124565e-05, + "loss": 0.0032, + "step": 3645 + }, + { + "epoch": 2.39, + "grad_norm": 0.19181577861309052, + "learning_rate": 2.9920217999556722e-05, + "loss": 0.0466, + "step": 3646 + }, + { + "epoch": 2.39, + "grad_norm": 0.020145069807767868, + "learning_rate": 2.9858463639298447e-05, + "loss": 0.0005, + "step": 3647 + }, + { + "epoch": 2.39, + "grad_norm": 0.001984576229006052, + "learning_rate": 2.9796766029641423e-05, + "loss": 0.0001, + "step": 3648 + }, + { + "epoch": 2.39, + "grad_norm": 0.014706733636558056, + "learning_rate": 2.9735125199729404e-05, + "loss": 0.0004, + "step": 3649 + }, + { + "epoch": 2.39, + "grad_norm": 0.00998898595571518, + "learning_rate": 2.967354117867935e-05, + "loss": 0.0006, + "step": 3650 + }, + { + "epoch": 2.39, + "grad_norm": 0.317994624376297, + "learning_rate": 2.9612013995581356e-05, + "loss": 0.011, + "step": 3651 + }, + { + "epoch": 2.39, + "grad_norm": 0.1973196268081665, + "learning_rate": 2.955054367949868e-05, + "loss": 0.0089, + "step": 3652 + }, + { + "epoch": 2.39, + "grad_norm": 0.023477502167224884, + "learning_rate": 2.9489130259467738e-05, + "loss": 0.0012, + "step": 3653 + }, + { + "epoch": 2.39, + "grad_norm": 0.005300854332745075, + "learning_rate": 2.9427773764498076e-05, + "loss": 0.0004, + "step": 3654 + }, + { + "epoch": 2.39, + "grad_norm": 0.002143553690984845, + "learning_rate": 2.9366474223572245e-05, + "loss": 0.0001, + "step": 3655 + }, + { + "epoch": 2.39, + "grad_norm": 0.12469197064638138, + "learning_rate": 2.9305231665646036e-05, + "loss": 0.0127, + "step": 3656 + }, + { + "epoch": 2.39, + "grad_norm": 0.002115819603204727, + "learning_rate": 2.9244046119648234e-05, + "loss": 0.0002, + "step": 3657 + }, + { + "epoch": 2.39, + "grad_norm": 0.0012355220969766378, + "learning_rate": 2.9182917614480727e-05, + "loss": 0.0001, + "step": 3658 + }, + { + "epoch": 2.4, + "grad_norm": 0.022472752258181572, + "learning_rate": 2.9121846179018464e-05, + "loss": 0.0012, + "step": 3659 + }, + { + "epoch": 2.4, + "grad_norm": 0.0011190982768312097, + "learning_rate": 2.9060831842109405e-05, + "loss": 0.0001, + "step": 3660 + }, + { + "epoch": 2.4, + "grad_norm": 0.026770759373903275, + "learning_rate": 2.8999874632574592e-05, + "loss": 0.0012, + "step": 3661 + }, + { + "epoch": 2.4, + "grad_norm": 0.012939787469804287, + "learning_rate": 2.893897457920803e-05, + "loss": 0.0007, + "step": 3662 + }, + { + "epoch": 2.4, + "grad_norm": 0.0015015322715044022, + "learning_rate": 2.887813171077677e-05, + "loss": 0.0001, + "step": 3663 + }, + { + "epoch": 2.4, + "grad_norm": 0.0026245703920722008, + "learning_rate": 2.8817346056020772e-05, + "loss": 0.0002, + "step": 3664 + }, + { + "epoch": 2.4, + "grad_norm": 0.0010097991907969117, + "learning_rate": 2.8756617643653057e-05, + "loss": 0.0001, + "step": 3665 + }, + { + "epoch": 2.4, + "grad_norm": 0.0626211166381836, + "learning_rate": 2.8695946502359557e-05, + "loss": 0.0018, + "step": 3666 + }, + { + "epoch": 2.4, + "grad_norm": 0.004791324492543936, + "learning_rate": 2.8635332660799182e-05, + "loss": 0.0003, + "step": 3667 + }, + { + "epoch": 2.4, + "grad_norm": 0.002726014005020261, + "learning_rate": 2.8574776147603762e-05, + "loss": 0.0002, + "step": 3668 + }, + { + "epoch": 2.4, + "grad_norm": 0.003951300401240587, + "learning_rate": 2.8514276991378048e-05, + "loss": 0.0003, + "step": 3669 + }, + { + "epoch": 2.4, + "grad_norm": 0.021291621029376984, + "learning_rate": 2.8453835220699684e-05, + "loss": 0.0011, + "step": 3670 + }, + { + "epoch": 2.4, + "grad_norm": 0.028817644342780113, + "learning_rate": 2.8393450864119226e-05, + "loss": 0.0017, + "step": 3671 + }, + { + "epoch": 2.4, + "grad_norm": 0.014525814913213253, + "learning_rate": 2.833312395016013e-05, + "loss": 0.0008, + "step": 3672 + }, + { + "epoch": 2.4, + "grad_norm": 0.04240436479449272, + "learning_rate": 2.8272854507318633e-05, + "loss": 0.0006, + "step": 3673 + }, + { + "epoch": 2.41, + "grad_norm": 0.004409548826515675, + "learning_rate": 2.8212642564063925e-05, + "loss": 0.0004, + "step": 3674 + }, + { + "epoch": 2.41, + "grad_norm": 0.007837683893740177, + "learning_rate": 2.815248814883796e-05, + "loss": 0.0005, + "step": 3675 + }, + { + "epoch": 2.41, + "grad_norm": 0.06929178535938263, + "learning_rate": 2.809239129005559e-05, + "loss": 0.0039, + "step": 3676 + }, + { + "epoch": 2.41, + "grad_norm": 0.013775107450783253, + "learning_rate": 2.8032352016104405e-05, + "loss": 0.0009, + "step": 3677 + }, + { + "epoch": 2.41, + "grad_norm": 0.0021300448570400476, + "learning_rate": 2.7972370355344854e-05, + "loss": 0.0002, + "step": 3678 + }, + { + "epoch": 2.41, + "grad_norm": 0.16798287630081177, + "learning_rate": 2.791244633611014e-05, + "loss": 0.0058, + "step": 3679 + }, + { + "epoch": 2.41, + "grad_norm": 0.009342607110738754, + "learning_rate": 2.785257998670627e-05, + "loss": 0.0008, + "step": 3680 + }, + { + "epoch": 2.41, + "grad_norm": 0.03553188964724541, + "learning_rate": 2.779277133541192e-05, + "loss": 0.0022, + "step": 3681 + }, + { + "epoch": 2.41, + "grad_norm": 0.0327470526099205, + "learning_rate": 2.773302041047862e-05, + "loss": 0.0011, + "step": 3682 + }, + { + "epoch": 2.41, + "grad_norm": 0.0034934538416564465, + "learning_rate": 2.7673327240130576e-05, + "loss": 0.0003, + "step": 3683 + }, + { + "epoch": 2.41, + "grad_norm": 0.11681769788265228, + "learning_rate": 2.7613691852564728e-05, + "loss": 0.0071, + "step": 3684 + }, + { + "epoch": 2.41, + "grad_norm": 0.005053054075688124, + "learning_rate": 2.7554114275950723e-05, + "loss": 0.0002, + "step": 3685 + }, + { + "epoch": 2.41, + "grad_norm": 0.0011449737939983606, + "learning_rate": 2.7494594538430882e-05, + "loss": 0.0001, + "step": 3686 + }, + { + "epoch": 2.41, + "grad_norm": 0.010535813868045807, + "learning_rate": 2.743513266812023e-05, + "loss": 0.0006, + "step": 3687 + }, + { + "epoch": 2.41, + "grad_norm": 0.008704773150384426, + "learning_rate": 2.7375728693106454e-05, + "loss": 0.0006, + "step": 3688 + }, + { + "epoch": 2.42, + "grad_norm": 0.021657146513462067, + "learning_rate": 2.73163826414499e-05, + "loss": 0.0013, + "step": 3689 + }, + { + "epoch": 2.42, + "grad_norm": 0.0023986981250345707, + "learning_rate": 2.725709454118349e-05, + "loss": 0.0002, + "step": 3690 + }, + { + "epoch": 2.42, + "grad_norm": 0.007590134162455797, + "learning_rate": 2.7197864420312826e-05, + "loss": 0.0005, + "step": 3691 + }, + { + "epoch": 2.42, + "grad_norm": 0.3991285562515259, + "learning_rate": 2.713869230681615e-05, + "loss": 0.0112, + "step": 3692 + }, + { + "epoch": 2.42, + "grad_norm": 0.01693413034081459, + "learning_rate": 2.707957822864424e-05, + "loss": 0.0006, + "step": 3693 + }, + { + "epoch": 2.42, + "grad_norm": 0.002101296093314886, + "learning_rate": 2.7020522213720517e-05, + "loss": 0.0001, + "step": 3694 + }, + { + "epoch": 2.42, + "grad_norm": 0.014015717431902885, + "learning_rate": 2.6961524289940928e-05, + "loss": 0.0003, + "step": 3695 + }, + { + "epoch": 2.42, + "grad_norm": 0.003958274144679308, + "learning_rate": 2.6902584485174006e-05, + "loss": 0.0001, + "step": 3696 + }, + { + "epoch": 2.42, + "grad_norm": 0.005625806748867035, + "learning_rate": 2.6843702827260834e-05, + "loss": 0.0001, + "step": 3697 + }, + { + "epoch": 2.42, + "grad_norm": 0.0037148420233279467, + "learning_rate": 2.6784879344015043e-05, + "loss": 0.0002, + "step": 3698 + }, + { + "epoch": 2.42, + "grad_norm": 0.0663096159696579, + "learning_rate": 2.672611406322269e-05, + "loss": 0.0029, + "step": 3699 + }, + { + "epoch": 2.42, + "grad_norm": 0.008969821967184544, + "learning_rate": 2.6667407012642445e-05, + "loss": 0.0005, + "step": 3700 + }, + { + "epoch": 2.42, + "grad_norm": 0.004598591011017561, + "learning_rate": 2.6608758220005448e-05, + "loss": 0.0003, + "step": 3701 + }, + { + "epoch": 2.42, + "grad_norm": 0.0017274868441745639, + "learning_rate": 2.6550167713015298e-05, + "loss": 0.0001, + "step": 3702 + }, + { + "epoch": 2.42, + "grad_norm": 0.0020384856034070253, + "learning_rate": 2.6491635519348065e-05, + "loss": 0.0001, + "step": 3703 + }, + { + "epoch": 2.42, + "grad_norm": 0.006550739984959364, + "learning_rate": 2.6433161666652304e-05, + "loss": 0.0003, + "step": 3704 + }, + { + "epoch": 2.43, + "grad_norm": 0.009589463472366333, + "learning_rate": 2.6374746182548966e-05, + "loss": 0.0005, + "step": 3705 + }, + { + "epoch": 2.43, + "grad_norm": 0.08297881484031677, + "learning_rate": 2.6316389094631484e-05, + "loss": 0.0015, + "step": 3706 + }, + { + "epoch": 2.43, + "grad_norm": 0.0033429779578000307, + "learning_rate": 2.625809043046569e-05, + "loss": 0.0001, + "step": 3707 + }, + { + "epoch": 2.43, + "grad_norm": 0.0014815748436376452, + "learning_rate": 2.619985021758972e-05, + "loss": 0.0001, + "step": 3708 + }, + { + "epoch": 2.43, + "grad_norm": 0.00698409229516983, + "learning_rate": 2.614166848351425e-05, + "loss": 0.0004, + "step": 3709 + }, + { + "epoch": 2.43, + "grad_norm": 0.2019488364458084, + "learning_rate": 2.6083545255722267e-05, + "loss": 0.0036, + "step": 3710 + }, + { + "epoch": 2.43, + "grad_norm": 0.014977452345192432, + "learning_rate": 2.6025480561669105e-05, + "loss": 0.0006, + "step": 3711 + }, + { + "epoch": 2.43, + "grad_norm": 0.17552299797534943, + "learning_rate": 2.5967474428782475e-05, + "loss": 0.002, + "step": 3712 + }, + { + "epoch": 2.43, + "grad_norm": 0.005717449821531773, + "learning_rate": 2.5909526884462413e-05, + "loss": 0.0004, + "step": 3713 + }, + { + "epoch": 2.43, + "grad_norm": 0.024677833542227745, + "learning_rate": 2.5851637956081283e-05, + "loss": 0.0008, + "step": 3714 + }, + { + "epoch": 2.43, + "grad_norm": 0.010601447895169258, + "learning_rate": 2.579380767098382e-05, + "loss": 0.0004, + "step": 3715 + }, + { + "epoch": 2.43, + "grad_norm": 0.0032953820191323757, + "learning_rate": 2.5736036056486897e-05, + "loss": 0.0002, + "step": 3716 + }, + { + "epoch": 2.43, + "grad_norm": 0.009412416256964207, + "learning_rate": 2.5678323139879825e-05, + "loss": 0.0005, + "step": 3717 + }, + { + "epoch": 2.43, + "grad_norm": 0.009052753448486328, + "learning_rate": 2.5620668948424163e-05, + "loss": 0.0004, + "step": 3718 + }, + { + "epoch": 2.43, + "grad_norm": 0.0017453498439863324, + "learning_rate": 2.556307350935367e-05, + "loss": 0.0001, + "step": 3719 + }, + { + "epoch": 2.44, + "grad_norm": 0.0002961267891805619, + "learning_rate": 2.550553684987439e-05, + "loss": 0.0, + "step": 3720 + }, + { + "epoch": 2.44, + "grad_norm": 0.02042260393500328, + "learning_rate": 2.54480589971646e-05, + "loss": 0.0013, + "step": 3721 + }, + { + "epoch": 2.44, + "grad_norm": 0.44513195753097534, + "learning_rate": 2.539063997837483e-05, + "loss": 0.0615, + "step": 3722 + }, + { + "epoch": 2.44, + "grad_norm": 0.043515317142009735, + "learning_rate": 2.5333279820627762e-05, + "loss": 0.0013, + "step": 3723 + }, + { + "epoch": 2.44, + "grad_norm": 0.006589180789887905, + "learning_rate": 2.527597855101831e-05, + "loss": 0.0003, + "step": 3724 + }, + { + "epoch": 2.44, + "grad_norm": 0.0023443615064024925, + "learning_rate": 2.521873619661356e-05, + "loss": 0.0002, + "step": 3725 + }, + { + "epoch": 2.44, + "grad_norm": 0.004776511341333389, + "learning_rate": 2.516155278445281e-05, + "loss": 0.0003, + "step": 3726 + }, + { + "epoch": 2.44, + "grad_norm": 0.24286900460720062, + "learning_rate": 2.5104428341547387e-05, + "loss": 0.0322, + "step": 3727 + }, + { + "epoch": 2.44, + "grad_norm": 0.1394210159778595, + "learning_rate": 2.5047362894880913e-05, + "loss": 0.0028, + "step": 3728 + }, + { + "epoch": 2.44, + "grad_norm": 0.23588547110557556, + "learning_rate": 2.499035647140907e-05, + "loss": 0.0453, + "step": 3729 + }, + { + "epoch": 2.44, + "grad_norm": 0.006382886786013842, + "learning_rate": 2.4933409098059648e-05, + "loss": 0.0003, + "step": 3730 + }, + { + "epoch": 2.44, + "grad_norm": 0.03138108551502228, + "learning_rate": 2.487652080173262e-05, + "loss": 0.0019, + "step": 3731 + }, + { + "epoch": 2.44, + "grad_norm": 0.1199861615896225, + "learning_rate": 2.481969160929995e-05, + "loss": 0.0353, + "step": 3732 + }, + { + "epoch": 2.44, + "grad_norm": 0.2854119837284088, + "learning_rate": 2.4762921547605757e-05, + "loss": 0.0098, + "step": 3733 + }, + { + "epoch": 2.44, + "grad_norm": 0.04618098959326744, + "learning_rate": 2.4706210643466185e-05, + "loss": 0.0011, + "step": 3734 + }, + { + "epoch": 2.45, + "grad_norm": 0.0043100458569824696, + "learning_rate": 2.464955892366952e-05, + "loss": 0.0003, + "step": 3735 + }, + { + "epoch": 2.45, + "grad_norm": 0.004088552203029394, + "learning_rate": 2.4592966414975946e-05, + "loss": 0.0002, + "step": 3736 + }, + { + "epoch": 2.45, + "grad_norm": 0.07775861024856567, + "learning_rate": 2.453643314411777e-05, + "loss": 0.0017, + "step": 3737 + }, + { + "epoch": 2.45, + "grad_norm": 0.0030159540474414825, + "learning_rate": 2.4479959137799325e-05, + "loss": 0.0001, + "step": 3738 + }, + { + "epoch": 2.45, + "grad_norm": 0.0009428430930711329, + "learning_rate": 2.4423544422696916e-05, + "loss": 0.0001, + "step": 3739 + }, + { + "epoch": 2.45, + "grad_norm": 0.0019232219783589244, + "learning_rate": 2.436718902545888e-05, + "loss": 0.0001, + "step": 3740 + }, + { + "epoch": 2.45, + "grad_norm": 0.012869434431195259, + "learning_rate": 2.431089297270548e-05, + "loss": 0.0005, + "step": 3741 + }, + { + "epoch": 2.45, + "grad_norm": 0.01162060908973217, + "learning_rate": 2.4254656291028974e-05, + "loss": 0.0006, + "step": 3742 + }, + { + "epoch": 2.45, + "grad_norm": 0.01218508742749691, + "learning_rate": 2.4198479006993637e-05, + "loss": 0.0006, + "step": 3743 + }, + { + "epoch": 2.45, + "grad_norm": 0.00766991451382637, + "learning_rate": 2.414236114713553e-05, + "loss": 0.0002, + "step": 3744 + }, + { + "epoch": 2.45, + "grad_norm": 0.009898004122078419, + "learning_rate": 2.4086302737962797e-05, + "loss": 0.0004, + "step": 3745 + }, + { + "epoch": 2.45, + "grad_norm": 0.01080253440886736, + "learning_rate": 2.4030303805955425e-05, + "loss": 0.0008, + "step": 3746 + }, + { + "epoch": 2.45, + "grad_norm": 0.1856086254119873, + "learning_rate": 2.397436437756532e-05, + "loss": 0.0029, + "step": 3747 + }, + { + "epoch": 2.45, + "grad_norm": 0.005443239118903875, + "learning_rate": 2.3918484479216294e-05, + "loss": 0.0004, + "step": 3748 + }, + { + "epoch": 2.45, + "grad_norm": 0.009528876282274723, + "learning_rate": 2.386266413730405e-05, + "loss": 0.0008, + "step": 3749 + }, + { + "epoch": 2.45, + "grad_norm": 0.02124428004026413, + "learning_rate": 2.3806903378196097e-05, + "loss": 0.0016, + "step": 3750 + }, + { + "epoch": 2.46, + "grad_norm": 0.012310376390814781, + "learning_rate": 2.3751202228231865e-05, + "loss": 0.0004, + "step": 3751 + }, + { + "epoch": 2.46, + "grad_norm": 0.0018997933948412538, + "learning_rate": 2.3695560713722638e-05, + "loss": 0.0002, + "step": 3752 + }, + { + "epoch": 2.46, + "grad_norm": 0.002142946468666196, + "learning_rate": 2.3639978860951413e-05, + "loss": 0.0001, + "step": 3753 + }, + { + "epoch": 2.46, + "grad_norm": 0.003471843432635069, + "learning_rate": 2.358445669617312e-05, + "loss": 0.0002, + "step": 3754 + }, + { + "epoch": 2.46, + "grad_norm": 0.04264701157808304, + "learning_rate": 2.352899424561448e-05, + "loss": 0.0016, + "step": 3755 + }, + { + "epoch": 2.46, + "grad_norm": 0.14676567912101746, + "learning_rate": 2.347359153547397e-05, + "loss": 0.0175, + "step": 3756 + }, + { + "epoch": 2.46, + "grad_norm": 0.020780233666300774, + "learning_rate": 2.3418248591921867e-05, + "loss": 0.0014, + "step": 3757 + }, + { + "epoch": 2.46, + "grad_norm": 0.0027702637016773224, + "learning_rate": 2.3362965441100218e-05, + "loss": 0.0002, + "step": 3758 + }, + { + "epoch": 2.46, + "grad_norm": 0.0017572494689375162, + "learning_rate": 2.330774210912283e-05, + "loss": 0.0001, + "step": 3759 + }, + { + "epoch": 2.46, + "grad_norm": 0.016448311507701874, + "learning_rate": 2.3252578622075235e-05, + "loss": 0.0008, + "step": 3760 + }, + { + "epoch": 2.46, + "grad_norm": 0.014893441461026669, + "learning_rate": 2.319747500601474e-05, + "loss": 0.0008, + "step": 3761 + }, + { + "epoch": 2.46, + "grad_norm": 0.03140440955758095, + "learning_rate": 2.31424312869703e-05, + "loss": 0.0027, + "step": 3762 + }, + { + "epoch": 2.46, + "grad_norm": 0.022161591798067093, + "learning_rate": 2.30874474909426e-05, + "loss": 0.0014, + "step": 3763 + }, + { + "epoch": 2.46, + "grad_norm": 0.004086275119334459, + "learning_rate": 2.303252364390408e-05, + "loss": 0.0003, + "step": 3764 + }, + { + "epoch": 2.46, + "grad_norm": 0.01266722846776247, + "learning_rate": 2.2977659771798806e-05, + "loss": 0.0005, + "step": 3765 + }, + { + "epoch": 2.47, + "grad_norm": 0.0033756562042981386, + "learning_rate": 2.292285590054251e-05, + "loss": 0.0002, + "step": 3766 + }, + { + "epoch": 2.47, + "grad_norm": 0.008223162963986397, + "learning_rate": 2.286811205602261e-05, + "loss": 0.0006, + "step": 3767 + }, + { + "epoch": 2.47, + "grad_norm": 0.05937148630619049, + "learning_rate": 2.2813428264098155e-05, + "loss": 0.0014, + "step": 3768 + }, + { + "epoch": 2.47, + "grad_norm": 0.0038700110744684935, + "learning_rate": 2.275880455059987e-05, + "loss": 0.0002, + "step": 3769 + }, + { + "epoch": 2.47, + "grad_norm": 0.050642628222703934, + "learning_rate": 2.2704240941329976e-05, + "loss": 0.0012, + "step": 3770 + }, + { + "epoch": 2.47, + "grad_norm": 0.0014657049905508757, + "learning_rate": 2.2649737462062445e-05, + "loss": 0.0001, + "step": 3771 + }, + { + "epoch": 2.47, + "grad_norm": 0.009033857844769955, + "learning_rate": 2.2595294138542746e-05, + "loss": 0.0005, + "step": 3772 + }, + { + "epoch": 2.47, + "grad_norm": 0.004064772743731737, + "learning_rate": 2.2540910996488025e-05, + "loss": 0.0002, + "step": 3773 + }, + { + "epoch": 2.47, + "grad_norm": 0.005530566442757845, + "learning_rate": 2.2486588061586918e-05, + "loss": 0.0002, + "step": 3774 + }, + { + "epoch": 2.47, + "grad_norm": 0.07258548587560654, + "learning_rate": 2.2432325359499665e-05, + "loss": 0.0032, + "step": 3775 + }, + { + "epoch": 2.47, + "grad_norm": 0.008993509225547314, + "learning_rate": 2.2378122915858025e-05, + "loss": 0.0004, + "step": 3776 + }, + { + "epoch": 2.47, + "grad_norm": 0.0150374760851264, + "learning_rate": 2.232398075626534e-05, + "loss": 0.0005, + "step": 3777 + }, + { + "epoch": 2.47, + "grad_norm": 0.04592491313815117, + "learning_rate": 2.226989890629645e-05, + "loss": 0.0014, + "step": 3778 + }, + { + "epoch": 2.47, + "grad_norm": 0.0022773267701268196, + "learning_rate": 2.2215877391497655e-05, + "loss": 0.0002, + "step": 3779 + }, + { + "epoch": 2.47, + "grad_norm": 0.38683828711509705, + "learning_rate": 2.2161916237386824e-05, + "loss": 0.0112, + "step": 3780 + }, + { + "epoch": 2.48, + "grad_norm": 0.005129650235176086, + "learning_rate": 2.2108015469453317e-05, + "loss": 0.0003, + "step": 3781 + }, + { + "epoch": 2.48, + "grad_norm": 0.0032208289485424757, + "learning_rate": 2.20541751131579e-05, + "loss": 0.0002, + "step": 3782 + }, + { + "epoch": 2.48, + "grad_norm": 0.010406946763396263, + "learning_rate": 2.2000395193932903e-05, + "loss": 0.0006, + "step": 3783 + }, + { + "epoch": 2.48, + "grad_norm": 0.008497284725308418, + "learning_rate": 2.1946675737182013e-05, + "loss": 0.0003, + "step": 3784 + }, + { + "epoch": 2.48, + "grad_norm": 0.21622425317764282, + "learning_rate": 2.1893016768280404e-05, + "loss": 0.0029, + "step": 3785 + }, + { + "epoch": 2.48, + "grad_norm": 0.13285638391971588, + "learning_rate": 2.183941831257468e-05, + "loss": 0.0029, + "step": 3786 + }, + { + "epoch": 2.48, + "grad_norm": 0.0017450024606660008, + "learning_rate": 2.1785880395382877e-05, + "loss": 0.0001, + "step": 3787 + }, + { + "epoch": 2.48, + "grad_norm": 0.0034125216770917177, + "learning_rate": 2.1732403041994346e-05, + "loss": 0.0003, + "step": 3788 + }, + { + "epoch": 2.48, + "grad_norm": 0.006293709855526686, + "learning_rate": 2.1678986277669915e-05, + "loss": 0.0005, + "step": 3789 + }, + { + "epoch": 2.48, + "grad_norm": 0.005125212948769331, + "learning_rate": 2.162563012764178e-05, + "loss": 0.0002, + "step": 3790 + }, + { + "epoch": 2.48, + "grad_norm": 0.0014759672340005636, + "learning_rate": 2.1572334617113484e-05, + "loss": 0.0001, + "step": 3791 + }, + { + "epoch": 2.48, + "grad_norm": 0.0010085315443575382, + "learning_rate": 2.1519099771259957e-05, + "loss": 0.0001, + "step": 3792 + }, + { + "epoch": 2.48, + "grad_norm": 0.0015687530394643545, + "learning_rate": 2.1465925615227432e-05, + "loss": 0.0001, + "step": 3793 + }, + { + "epoch": 2.48, + "grad_norm": 0.010366515256464481, + "learning_rate": 2.1412812174133496e-05, + "loss": 0.0003, + "step": 3794 + }, + { + "epoch": 2.48, + "grad_norm": 0.003558119060471654, + "learning_rate": 2.1359759473067108e-05, + "loss": 0.0002, + "step": 3795 + }, + { + "epoch": 2.49, + "grad_norm": 0.003396461019292474, + "learning_rate": 2.1306767537088393e-05, + "loss": 0.0002, + "step": 3796 + }, + { + "epoch": 2.49, + "grad_norm": 0.3438689708709717, + "learning_rate": 2.12538363912289e-05, + "loss": 0.0102, + "step": 3797 + }, + { + "epoch": 2.49, + "grad_norm": 0.0034374843817204237, + "learning_rate": 2.1200966060491447e-05, + "loss": 0.0002, + "step": 3798 + }, + { + "epoch": 2.49, + "grad_norm": 0.0038176493253558874, + "learning_rate": 2.1148156569850073e-05, + "loss": 0.0002, + "step": 3799 + }, + { + "epoch": 2.49, + "grad_norm": 0.0036796291824430227, + "learning_rate": 2.109540794425012e-05, + "loss": 0.0003, + "step": 3800 + }, + { + "epoch": 2.49, + "grad_norm": 0.0019943516235798597, + "learning_rate": 2.1042720208608178e-05, + "loss": 0.0001, + "step": 3801 + }, + { + "epoch": 2.49, + "grad_norm": 0.0070703173987567425, + "learning_rate": 2.0990093387812034e-05, + "loss": 0.0005, + "step": 3802 + }, + { + "epoch": 2.49, + "grad_norm": 0.00025776855181902647, + "learning_rate": 2.0937527506720775e-05, + "loss": 0.0, + "step": 3803 + }, + { + "epoch": 2.49, + "grad_norm": 0.002906102454289794, + "learning_rate": 2.0885022590164667e-05, + "loss": 0.0001, + "step": 3804 + }, + { + "epoch": 2.49, + "grad_norm": 0.02106166072189808, + "learning_rate": 2.083257866294511e-05, + "loss": 0.0008, + "step": 3805 + }, + { + "epoch": 2.49, + "grad_norm": 0.012767515145242214, + "learning_rate": 2.0780195749834783e-05, + "loss": 0.0002, + "step": 3806 + }, + { + "epoch": 2.49, + "grad_norm": 0.12511076033115387, + "learning_rate": 2.072787387557753e-05, + "loss": 0.013, + "step": 3807 + }, + { + "epoch": 2.49, + "grad_norm": 0.014380592852830887, + "learning_rate": 2.0675613064888343e-05, + "loss": 0.0006, + "step": 3808 + }, + { + "epoch": 2.49, + "grad_norm": 0.01294582150876522, + "learning_rate": 2.062341334245336e-05, + "loss": 0.0008, + "step": 3809 + }, + { + "epoch": 2.49, + "grad_norm": 0.0015137314330786467, + "learning_rate": 2.0571274732929894e-05, + "loss": 0.0001, + "step": 3810 + }, + { + "epoch": 2.49, + "grad_norm": 0.02780129760503769, + "learning_rate": 2.0519197260946375e-05, + "loss": 0.0009, + "step": 3811 + }, + { + "epoch": 2.5, + "grad_norm": 0.008371416479349136, + "learning_rate": 2.046718095110234e-05, + "loss": 0.0002, + "step": 3812 + }, + { + "epoch": 2.5, + "grad_norm": 0.0009381690178997815, + "learning_rate": 2.0415225827968508e-05, + "loss": 0.0001, + "step": 3813 + }, + { + "epoch": 2.5, + "grad_norm": 0.004177944269031286, + "learning_rate": 2.0363331916086556e-05, + "loss": 0.0002, + "step": 3814 + }, + { + "epoch": 2.5, + "grad_norm": 0.006999014411121607, + "learning_rate": 2.0311499239969365e-05, + "loss": 0.0002, + "step": 3815 + }, + { + "epoch": 2.5, + "grad_norm": 0.002490478102117777, + "learning_rate": 2.025972782410083e-05, + "loss": 0.0002, + "step": 3816 + }, + { + "epoch": 2.5, + "grad_norm": 0.18457277119159698, + "learning_rate": 2.0208017692935957e-05, + "loss": 0.036, + "step": 3817 + }, + { + "epoch": 2.5, + "grad_norm": 0.013369838707149029, + "learning_rate": 2.0156368870900792e-05, + "loss": 0.0003, + "step": 3818 + }, + { + "epoch": 2.5, + "grad_norm": 0.17626619338989258, + "learning_rate": 2.0104781382392366e-05, + "loss": 0.0351, + "step": 3819 + }, + { + "epoch": 2.5, + "grad_norm": 0.003646490629762411, + "learning_rate": 2.005325525177884e-05, + "loss": 0.0003, + "step": 3820 + }, + { + "epoch": 2.5, + "eval_loss": 0.045124031603336334, + "eval_runtime": 40.059, + "eval_samples_per_second": 32.128, + "eval_steps_per_second": 8.038, + "step": 3820 + }, + { + "epoch": 2.5, + "grad_norm": 0.1512085497379303, + "learning_rate": 2.0001790503399258e-05, + "loss": 0.0201, + "step": 3821 + }, + { + "epoch": 2.5, + "grad_norm": 0.04014553129673004, + "learning_rate": 1.9950387161563775e-05, + "loss": 0.0013, + "step": 3822 + }, + { + "epoch": 2.5, + "grad_norm": 0.20916107296943665, + "learning_rate": 1.98990452505535e-05, + "loss": 0.0231, + "step": 3823 + }, + { + "epoch": 2.5, + "grad_norm": 0.008310503326356411, + "learning_rate": 1.984776479462059e-05, + "loss": 0.0004, + "step": 3824 + }, + { + "epoch": 2.5, + "grad_norm": 0.004644991364330053, + "learning_rate": 1.9796545817988014e-05, + "loss": 0.0003, + "step": 3825 + }, + { + "epoch": 2.5, + "grad_norm": 0.0064013684168457985, + "learning_rate": 1.974538834484985e-05, + "loss": 0.0002, + "step": 3826 + }, + { + "epoch": 2.51, + "grad_norm": 0.00458954693749547, + "learning_rate": 1.969429239937107e-05, + "loss": 0.0003, + "step": 3827 + }, + { + "epoch": 2.51, + "grad_norm": 0.013306370005011559, + "learning_rate": 1.96432580056876e-05, + "loss": 0.0009, + "step": 3828 + }, + { + "epoch": 2.51, + "grad_norm": 0.008030070923268795, + "learning_rate": 1.9592285187906258e-05, + "loss": 0.0003, + "step": 3829 + }, + { + "epoch": 2.51, + "grad_norm": 0.003760328982025385, + "learning_rate": 1.95413739701048e-05, + "loss": 0.0003, + "step": 3830 + }, + { + "epoch": 2.51, + "grad_norm": 0.026043307036161423, + "learning_rate": 1.9490524376331888e-05, + "loss": 0.0008, + "step": 3831 + }, + { + "epoch": 2.51, + "grad_norm": 0.14236873388290405, + "learning_rate": 1.9439736430607096e-05, + "loss": 0.0065, + "step": 3832 + }, + { + "epoch": 2.51, + "grad_norm": 0.012036359868943691, + "learning_rate": 1.9389010156920793e-05, + "loss": 0.0007, + "step": 3833 + }, + { + "epoch": 2.51, + "grad_norm": 0.003528281580656767, + "learning_rate": 1.9338345579234283e-05, + "loss": 0.0003, + "step": 3834 + }, + { + "epoch": 2.51, + "grad_norm": 0.008096953853964806, + "learning_rate": 1.928774272147972e-05, + "loss": 0.0004, + "step": 3835 + }, + { + "epoch": 2.51, + "grad_norm": 0.01460680365562439, + "learning_rate": 1.923720160756012e-05, + "loss": 0.001, + "step": 3836 + }, + { + "epoch": 2.51, + "grad_norm": 0.004466039128601551, + "learning_rate": 1.9186722261349286e-05, + "loss": 0.0002, + "step": 3837 + }, + { + "epoch": 2.51, + "grad_norm": 0.20724274218082428, + "learning_rate": 1.913630470669189e-05, + "loss": 0.012, + "step": 3838 + }, + { + "epoch": 2.51, + "grad_norm": 0.01058044284582138, + "learning_rate": 1.90859489674034e-05, + "loss": 0.0004, + "step": 3839 + }, + { + "epoch": 2.51, + "grad_norm": 0.03503754362463951, + "learning_rate": 1.903565506727006e-05, + "loss": 0.0017, + "step": 3840 + }, + { + "epoch": 2.51, + "grad_norm": 0.007808753289282322, + "learning_rate": 1.898542303004898e-05, + "loss": 0.0003, + "step": 3841 + }, + { + "epoch": 2.52, + "grad_norm": 0.44717031717300415, + "learning_rate": 1.893525287946791e-05, + "loss": 0.0273, + "step": 3842 + }, + { + "epoch": 2.52, + "grad_norm": 0.010218746028840542, + "learning_rate": 1.8885144639225496e-05, + "loss": 0.0004, + "step": 3843 + }, + { + "epoch": 2.52, + "grad_norm": 0.10018881410360336, + "learning_rate": 1.8835098332991088e-05, + "loss": 0.0031, + "step": 3844 + }, + { + "epoch": 2.52, + "grad_norm": 0.05226203054189682, + "learning_rate": 1.878511398440479e-05, + "loss": 0.0014, + "step": 3845 + }, + { + "epoch": 2.52, + "grad_norm": 0.07296320050954819, + "learning_rate": 1.8735191617077422e-05, + "loss": 0.0041, + "step": 3846 + }, + { + "epoch": 2.52, + "grad_norm": 0.040401432663202286, + "learning_rate": 1.8685331254590562e-05, + "loss": 0.0018, + "step": 3847 + }, + { + "epoch": 2.52, + "grad_norm": 0.03135022893548012, + "learning_rate": 1.863553292049646e-05, + "loss": 0.0013, + "step": 3848 + }, + { + "epoch": 2.52, + "grad_norm": 0.10556218028068542, + "learning_rate": 1.858579663831808e-05, + "loss": 0.0063, + "step": 3849 + }, + { + "epoch": 2.52, + "grad_norm": 0.004249983001500368, + "learning_rate": 1.8536122431549128e-05, + "loss": 0.0003, + "step": 3850 + }, + { + "epoch": 2.52, + "grad_norm": 0.017284538596868515, + "learning_rate": 1.8486510323653868e-05, + "loss": 0.001, + "step": 3851 + }, + { + "epoch": 2.52, + "grad_norm": 0.025452695786952972, + "learning_rate": 1.8436960338067326e-05, + "loss": 0.0015, + "step": 3852 + }, + { + "epoch": 2.52, + "grad_norm": 0.03908282145857811, + "learning_rate": 1.8387472498195154e-05, + "loss": 0.0022, + "step": 3853 + }, + { + "epoch": 2.52, + "grad_norm": 0.009711049497127533, + "learning_rate": 1.833804682741366e-05, + "loss": 0.0004, + "step": 3854 + }, + { + "epoch": 2.52, + "grad_norm": 0.001228800043463707, + "learning_rate": 1.8288683349069782e-05, + "loss": 0.0001, + "step": 3855 + }, + { + "epoch": 2.52, + "grad_norm": 0.14710290729999542, + "learning_rate": 1.823938208648107e-05, + "loss": 0.028, + "step": 3856 + }, + { + "epoch": 2.53, + "grad_norm": 0.004383179359138012, + "learning_rate": 1.819014306293569e-05, + "loss": 0.0003, + "step": 3857 + }, + { + "epoch": 2.53, + "grad_norm": 0.0014246645150706172, + "learning_rate": 1.8140966301692446e-05, + "loss": 0.0001, + "step": 3858 + }, + { + "epoch": 2.53, + "grad_norm": 0.08542287349700928, + "learning_rate": 1.809185182598063e-05, + "loss": 0.0023, + "step": 3859 + }, + { + "epoch": 2.53, + "grad_norm": 0.0031465787906199694, + "learning_rate": 1.8042799659000222e-05, + "loss": 0.0002, + "step": 3860 + }, + { + "epoch": 2.53, + "grad_norm": 0.003702137153595686, + "learning_rate": 1.7993809823921723e-05, + "loss": 0.0003, + "step": 3861 + }, + { + "epoch": 2.53, + "grad_norm": 0.0014129002811387181, + "learning_rate": 1.794488234388617e-05, + "loss": 0.0001, + "step": 3862 + }, + { + "epoch": 2.53, + "grad_norm": 0.010685211047530174, + "learning_rate": 1.7896017242005207e-05, + "loss": 0.0006, + "step": 3863 + }, + { + "epoch": 2.53, + "grad_norm": 0.0036190771497786045, + "learning_rate": 1.7847214541360938e-05, + "loss": 0.0002, + "step": 3864 + }, + { + "epoch": 2.53, + "grad_norm": 0.03200782090425491, + "learning_rate": 1.7798474265006056e-05, + "loss": 0.0004, + "step": 3865 + }, + { + "epoch": 2.53, + "grad_norm": 0.01492803730070591, + "learning_rate": 1.774979643596373e-05, + "loss": 0.001, + "step": 3866 + }, + { + "epoch": 2.53, + "grad_norm": 0.012737604789435863, + "learning_rate": 1.7701181077227667e-05, + "loss": 0.0009, + "step": 3867 + }, + { + "epoch": 2.53, + "grad_norm": 0.03760932385921478, + "learning_rate": 1.7652628211761978e-05, + "loss": 0.0021, + "step": 3868 + }, + { + "epoch": 2.53, + "grad_norm": 0.008631984703242779, + "learning_rate": 1.760413786250135e-05, + "loss": 0.0007, + "step": 3869 + }, + { + "epoch": 2.53, + "grad_norm": 0.04473002254962921, + "learning_rate": 1.7555710052350898e-05, + "loss": 0.0011, + "step": 3870 + }, + { + "epoch": 2.53, + "grad_norm": 0.0198511965572834, + "learning_rate": 1.75073448041862e-05, + "loss": 0.0008, + "step": 3871 + }, + { + "epoch": 2.53, + "grad_norm": 0.0022420890163630247, + "learning_rate": 1.7459042140853307e-05, + "loss": 0.0002, + "step": 3872 + }, + { + "epoch": 2.54, + "grad_norm": 0.0032802545465528965, + "learning_rate": 1.7410802085168663e-05, + "loss": 0.0002, + "step": 3873 + }, + { + "epoch": 2.54, + "grad_norm": 0.008525917306542397, + "learning_rate": 1.7362624659919155e-05, + "loss": 0.0006, + "step": 3874 + }, + { + "epoch": 2.54, + "grad_norm": 0.008164350874722004, + "learning_rate": 1.73145098878621e-05, + "loss": 0.0004, + "step": 3875 + }, + { + "epoch": 2.54, + "grad_norm": 0.00233345665037632, + "learning_rate": 1.7266457791725247e-05, + "loss": 0.0001, + "step": 3876 + }, + { + "epoch": 2.54, + "grad_norm": 0.005806733388453722, + "learning_rate": 1.721846839420664e-05, + "loss": 0.0004, + "step": 3877 + }, + { + "epoch": 2.54, + "grad_norm": 0.03104785829782486, + "learning_rate": 1.7170541717974786e-05, + "loss": 0.0008, + "step": 3878 + }, + { + "epoch": 2.54, + "grad_norm": 0.01018536277115345, + "learning_rate": 1.712267778566855e-05, + "loss": 0.0009, + "step": 3879 + }, + { + "epoch": 2.54, + "grad_norm": 0.3993581235408783, + "learning_rate": 1.7074876619897148e-05, + "loss": 0.0297, + "step": 3880 + }, + { + "epoch": 2.54, + "grad_norm": 0.009855585172772408, + "learning_rate": 1.702713824324019e-05, + "loss": 0.0005, + "step": 3881 + }, + { + "epoch": 2.54, + "grad_norm": 0.007799225859344006, + "learning_rate": 1.697946267824757e-05, + "loss": 0.0005, + "step": 3882 + }, + { + "epoch": 2.54, + "grad_norm": 0.01226276159286499, + "learning_rate": 1.6931849947439518e-05, + "loss": 0.0006, + "step": 3883 + }, + { + "epoch": 2.54, + "grad_norm": 0.0028570923022925854, + "learning_rate": 1.688430007330665e-05, + "loss": 0.0002, + "step": 3884 + }, + { + "epoch": 2.54, + "grad_norm": 0.24111603200435638, + "learning_rate": 1.6836813078309778e-05, + "loss": 0.0035, + "step": 3885 + }, + { + "epoch": 2.54, + "grad_norm": 0.12557904422283173, + "learning_rate": 1.6789388984880115e-05, + "loss": 0.0075, + "step": 3886 + }, + { + "epoch": 2.54, + "grad_norm": 0.0037830721121281385, + "learning_rate": 1.67420278154191e-05, + "loss": 0.0001, + "step": 3887 + }, + { + "epoch": 2.55, + "grad_norm": 0.013570702634751797, + "learning_rate": 1.669472959229848e-05, + "loss": 0.0005, + "step": 3888 + }, + { + "epoch": 2.55, + "grad_norm": 0.0016447704983875155, + "learning_rate": 1.6647494337860256e-05, + "loss": 0.0001, + "step": 3889 + }, + { + "epoch": 2.55, + "grad_norm": 0.0552806481719017, + "learning_rate": 1.6600322074416674e-05, + "loss": 0.0018, + "step": 3890 + }, + { + "epoch": 2.55, + "grad_norm": 0.22801519930362701, + "learning_rate": 1.6553212824250267e-05, + "loss": 0.0155, + "step": 3891 + }, + { + "epoch": 2.55, + "grad_norm": 0.000800129899289459, + "learning_rate": 1.6506166609613757e-05, + "loss": 0.0, + "step": 3892 + }, + { + "epoch": 2.55, + "grad_norm": 0.0166360754519701, + "learning_rate": 1.6459183452730152e-05, + "loss": 0.001, + "step": 3893 + }, + { + "epoch": 2.55, + "grad_norm": 0.002638260368257761, + "learning_rate": 1.6412263375792544e-05, + "loss": 0.0002, + "step": 3894 + }, + { + "epoch": 2.55, + "grad_norm": 0.022630006074905396, + "learning_rate": 1.6365406400964344e-05, + "loss": 0.002, + "step": 3895 + }, + { + "epoch": 2.55, + "grad_norm": 0.6403875350952148, + "learning_rate": 1.6318612550379156e-05, + "loss": 0.0588, + "step": 3896 + }, + { + "epoch": 2.55, + "grad_norm": 0.016941143199801445, + "learning_rate": 1.627188184614072e-05, + "loss": 0.0004, + "step": 3897 + }, + { + "epoch": 2.55, + "grad_norm": 0.006208622362464666, + "learning_rate": 1.6225214310322948e-05, + "loss": 0.0004, + "step": 3898 + }, + { + "epoch": 2.55, + "grad_norm": 0.004772585816681385, + "learning_rate": 1.617860996496995e-05, + "loss": 0.0002, + "step": 3899 + }, + { + "epoch": 2.55, + "grad_norm": 0.03386545926332474, + "learning_rate": 1.613206883209596e-05, + "loss": 0.0017, + "step": 3900 + }, + { + "epoch": 2.55, + "grad_norm": 0.015483944676816463, + "learning_rate": 1.6085590933685343e-05, + "loss": 0.0012, + "step": 3901 + }, + { + "epoch": 2.55, + "grad_norm": 0.015971994027495384, + "learning_rate": 1.6039176291692666e-05, + "loss": 0.001, + "step": 3902 + }, + { + "epoch": 2.56, + "grad_norm": 0.010225643403828144, + "learning_rate": 1.5992824928042476e-05, + "loss": 0.0004, + "step": 3903 + }, + { + "epoch": 2.56, + "grad_norm": 0.016231779009103775, + "learning_rate": 1.5946536864629568e-05, + "loss": 0.001, + "step": 3904 + }, + { + "epoch": 2.56, + "grad_norm": 0.02168508991599083, + "learning_rate": 1.5900312123318755e-05, + "loss": 0.0004, + "step": 3905 + }, + { + "epoch": 2.56, + "grad_norm": 0.01738566905260086, + "learning_rate": 1.5854150725944988e-05, + "loss": 0.0005, + "step": 3906 + }, + { + "epoch": 2.56, + "grad_norm": 0.16182847321033478, + "learning_rate": 1.5808052694313273e-05, + "loss": 0.0203, + "step": 3907 + }, + { + "epoch": 2.56, + "grad_norm": 0.0051698386669158936, + "learning_rate": 1.576201805019866e-05, + "loss": 0.0004, + "step": 3908 + }, + { + "epoch": 2.56, + "grad_norm": 0.0230996236205101, + "learning_rate": 1.5716046815346304e-05, + "loss": 0.0013, + "step": 3909 + }, + { + "epoch": 2.56, + "grad_norm": 0.008752661757171154, + "learning_rate": 1.567013901147139e-05, + "loss": 0.0004, + "step": 3910 + }, + { + "epoch": 2.56, + "grad_norm": 0.0031037696171551943, + "learning_rate": 1.5624294660259163e-05, + "loss": 0.0002, + "step": 3911 + }, + { + "epoch": 2.56, + "grad_norm": 0.00405475590378046, + "learning_rate": 1.5578513783364786e-05, + "loss": 0.0003, + "step": 3912 + }, + { + "epoch": 2.56, + "grad_norm": 0.006054164841771126, + "learning_rate": 1.5532796402413583e-05, + "loss": 0.0004, + "step": 3913 + }, + { + "epoch": 2.56, + "grad_norm": 0.00064209004631266, + "learning_rate": 1.5487142539000808e-05, + "loss": 0.0, + "step": 3914 + }, + { + "epoch": 2.56, + "grad_norm": 0.06945687532424927, + "learning_rate": 1.5441552214691742e-05, + "loss": 0.0031, + "step": 3915 + }, + { + "epoch": 2.56, + "grad_norm": 0.005002451594918966, + "learning_rate": 1.5396025451021617e-05, + "loss": 0.0004, + "step": 3916 + }, + { + "epoch": 2.56, + "grad_norm": 0.01373702846467495, + "learning_rate": 1.5350562269495653e-05, + "loss": 0.001, + "step": 3917 + }, + { + "epoch": 2.56, + "grad_norm": 0.08181232959032059, + "learning_rate": 1.530516269158908e-05, + "loss": 0.0009, + "step": 3918 + }, + { + "epoch": 2.57, + "grad_norm": 0.01954641006886959, + "learning_rate": 1.5259826738747056e-05, + "loss": 0.0008, + "step": 3919 + }, + { + "epoch": 2.57, + "grad_norm": 0.0027879527769982815, + "learning_rate": 1.5214554432384618e-05, + "loss": 0.0002, + "step": 3920 + }, + { + "epoch": 2.57, + "grad_norm": 0.3608317971229553, + "learning_rate": 1.5169345793886861e-05, + "loss": 0.0191, + "step": 3921 + }, + { + "epoch": 2.57, + "grad_norm": 0.011447795666754246, + "learning_rate": 1.5124200844608686e-05, + "loss": 0.0008, + "step": 3922 + }, + { + "epoch": 2.57, + "grad_norm": 0.098612479865551, + "learning_rate": 1.5079119605875001e-05, + "loss": 0.0053, + "step": 3923 + }, + { + "epoch": 2.57, + "grad_norm": 0.004878662060946226, + "learning_rate": 1.5034102098980555e-05, + "loss": 0.0002, + "step": 3924 + }, + { + "epoch": 2.57, + "grad_norm": 0.002009717281907797, + "learning_rate": 1.4989148345190071e-05, + "loss": 0.0001, + "step": 3925 + }, + { + "epoch": 2.57, + "grad_norm": 0.039765290915966034, + "learning_rate": 1.4944258365738066e-05, + "loss": 0.0007, + "step": 3926 + }, + { + "epoch": 2.57, + "grad_norm": 0.04524314031004906, + "learning_rate": 1.4899432181828991e-05, + "loss": 0.0008, + "step": 3927 + }, + { + "epoch": 2.57, + "grad_norm": 0.0025847493670880795, + "learning_rate": 1.4854669814637143e-05, + "loss": 0.0002, + "step": 3928 + }, + { + "epoch": 2.57, + "grad_norm": 0.01720985397696495, + "learning_rate": 1.4809971285306676e-05, + "loss": 0.0004, + "step": 3929 + }, + { + "epoch": 2.57, + "grad_norm": 0.0033838737290352583, + "learning_rate": 1.4765336614951618e-05, + "loss": 0.0002, + "step": 3930 + }, + { + "epoch": 2.57, + "grad_norm": 0.007368985563516617, + "learning_rate": 1.472076582465575e-05, + "loss": 0.0004, + "step": 3931 + }, + { + "epoch": 2.57, + "grad_norm": 0.012488423846662045, + "learning_rate": 1.4676258935472767e-05, + "loss": 0.0008, + "step": 3932 + }, + { + "epoch": 2.57, + "grad_norm": 0.0020392360165715218, + "learning_rate": 1.4631815968426135e-05, + "loss": 0.0001, + "step": 3933 + }, + { + "epoch": 2.58, + "grad_norm": 0.010550234466791153, + "learning_rate": 1.4587436944509145e-05, + "loss": 0.0006, + "step": 3934 + }, + { + "epoch": 2.58, + "grad_norm": 0.1065152958035469, + "learning_rate": 1.454312188468486e-05, + "loss": 0.0049, + "step": 3935 + }, + { + "epoch": 2.58, + "grad_norm": 0.0025994908064603806, + "learning_rate": 1.4498870809886154e-05, + "loss": 0.0002, + "step": 3936 + }, + { + "epoch": 2.58, + "grad_norm": 0.0024751867167651653, + "learning_rate": 1.4454683741015672e-05, + "loss": 0.0002, + "step": 3937 + }, + { + "epoch": 2.58, + "grad_norm": 0.0099541787058115, + "learning_rate": 1.44105606989458e-05, + "loss": 0.0007, + "step": 3938 + }, + { + "epoch": 2.58, + "grad_norm": 0.037908878177404404, + "learning_rate": 1.4366501704518736e-05, + "loss": 0.001, + "step": 3939 + }, + { + "epoch": 2.58, + "grad_norm": 0.0022392801474779844, + "learning_rate": 1.4322506778546327e-05, + "loss": 0.0001, + "step": 3940 + }, + { + "epoch": 2.58, + "grad_norm": 0.04893236979842186, + "learning_rate": 1.4278575941810255e-05, + "loss": 0.0017, + "step": 3941 + }, + { + "epoch": 2.58, + "grad_norm": 0.009400781244039536, + "learning_rate": 1.4234709215061885e-05, + "loss": 0.0004, + "step": 3942 + }, + { + "epoch": 2.58, + "grad_norm": 0.01746106892824173, + "learning_rate": 1.4190906619022291e-05, + "loss": 0.0007, + "step": 3943 + }, + { + "epoch": 2.58, + "grad_norm": 0.015727614983916283, + "learning_rate": 1.4147168174382273e-05, + "loss": 0.0006, + "step": 3944 + }, + { + "epoch": 2.58, + "grad_norm": 0.032940227538347244, + "learning_rate": 1.410349390180232e-05, + "loss": 0.001, + "step": 3945 + }, + { + "epoch": 2.58, + "grad_norm": 0.15950481593608856, + "learning_rate": 1.4059883821912616e-05, + "loss": 0.002, + "step": 3946 + }, + { + "epoch": 2.58, + "grad_norm": 0.0032736181747168303, + "learning_rate": 1.4016337955313044e-05, + "loss": 0.0002, + "step": 3947 + }, + { + "epoch": 2.58, + "grad_norm": 0.043217290192842484, + "learning_rate": 1.3972856322573073e-05, + "loss": 0.004, + "step": 3948 + }, + { + "epoch": 2.59, + "grad_norm": 0.0019488498801365495, + "learning_rate": 1.3929438944231885e-05, + "loss": 0.0001, + "step": 3949 + }, + { + "epoch": 2.59, + "grad_norm": 0.003278010990470648, + "learning_rate": 1.3886085840798361e-05, + "loss": 0.0001, + "step": 3950 + }, + { + "epoch": 2.59, + "grad_norm": 0.1828782707452774, + "learning_rate": 1.384279703275092e-05, + "loss": 0.0046, + "step": 3951 + }, + { + "epoch": 2.59, + "grad_norm": 0.15590015053749084, + "learning_rate": 1.3799572540537685e-05, + "loss": 0.0067, + "step": 3952 + }, + { + "epoch": 2.59, + "grad_norm": 0.001411370001733303, + "learning_rate": 1.375641238457636e-05, + "loss": 0.0001, + "step": 3953 + }, + { + "epoch": 2.59, + "grad_norm": 0.004240601323544979, + "learning_rate": 1.3713316585254303e-05, + "loss": 0.0003, + "step": 3954 + }, + { + "epoch": 2.59, + "grad_norm": 0.4073038697242737, + "learning_rate": 1.3670285162928396e-05, + "loss": 0.0073, + "step": 3955 + }, + { + "epoch": 2.59, + "grad_norm": 0.29189032316207886, + "learning_rate": 1.3627318137925214e-05, + "loss": 0.0082, + "step": 3956 + }, + { + "epoch": 2.59, + "grad_norm": 0.047309450805187225, + "learning_rate": 1.3584415530540804e-05, + "loss": 0.0053, + "step": 3957 + }, + { + "epoch": 2.59, + "grad_norm": 0.0019443761557340622, + "learning_rate": 1.354157736104084e-05, + "loss": 0.0001, + "step": 3958 + }, + { + "epoch": 2.59, + "grad_norm": 0.0027246689423918724, + "learning_rate": 1.3498803649660583e-05, + "loss": 0.0002, + "step": 3959 + }, + { + "epoch": 2.59, + "grad_norm": 0.011064418591558933, + "learning_rate": 1.345609441660479e-05, + "loss": 0.0005, + "step": 3960 + }, + { + "epoch": 2.59, + "grad_norm": 0.005419931374490261, + "learning_rate": 1.3413449682047805e-05, + "loss": 0.0003, + "step": 3961 + }, + { + "epoch": 2.59, + "grad_norm": 0.00879404041916132, + "learning_rate": 1.3370869466133482e-05, + "loss": 0.0002, + "step": 3962 + }, + { + "epoch": 2.59, + "grad_norm": 0.07874434441328049, + "learning_rate": 1.3328353788975216e-05, + "loss": 0.0036, + "step": 3963 + }, + { + "epoch": 2.6, + "grad_norm": 0.023440929129719734, + "learning_rate": 1.328590267065589e-05, + "loss": 0.0007, + "step": 3964 + }, + { + "epoch": 2.6, + "grad_norm": 0.0015399332623928785, + "learning_rate": 1.3243516131227933e-05, + "loss": 0.0001, + "step": 3965 + }, + { + "epoch": 2.6, + "grad_norm": 0.0010855916189029813, + "learning_rate": 1.3201194190713193e-05, + "loss": 0.0001, + "step": 3966 + }, + { + "epoch": 2.6, + "grad_norm": 0.003763427259400487, + "learning_rate": 1.3158936869103098e-05, + "loss": 0.0003, + "step": 3967 + }, + { + "epoch": 2.6, + "grad_norm": 0.0011584527092054486, + "learning_rate": 1.3116744186358497e-05, + "loss": 0.0001, + "step": 3968 + }, + { + "epoch": 2.6, + "grad_norm": 0.0007404198404401541, + "learning_rate": 1.3074616162409696e-05, + "loss": 0.0, + "step": 3969 + }, + { + "epoch": 2.6, + "grad_norm": 0.005640385672450066, + "learning_rate": 1.3032552817156511e-05, + "loss": 0.0002, + "step": 3970 + }, + { + "epoch": 2.6, + "grad_norm": 0.016451558098196983, + "learning_rate": 1.2990554170468149e-05, + "loss": 0.0004, + "step": 3971 + }, + { + "epoch": 2.6, + "grad_norm": 0.003958103246986866, + "learning_rate": 1.2948620242183306e-05, + "loss": 0.0002, + "step": 3972 + }, + { + "epoch": 2.6, + "grad_norm": 0.0012922111200168729, + "learning_rate": 1.2906751052110103e-05, + "loss": 0.0001, + "step": 3973 + }, + { + "epoch": 2.6, + "grad_norm": 0.030663957819342613, + "learning_rate": 1.286494662002599e-05, + "loss": 0.0012, + "step": 3974 + }, + { + "epoch": 2.6, + "grad_norm": 0.08389660716056824, + "learning_rate": 1.2823206965677934e-05, + "loss": 0.0037, + "step": 3975 + }, + { + "epoch": 2.6, + "grad_norm": 0.08301394432783127, + "learning_rate": 1.2781532108782267e-05, + "loss": 0.0027, + "step": 3976 + }, + { + "epoch": 2.6, + "grad_norm": 0.0019153612665832043, + "learning_rate": 1.2739922069024722e-05, + "loss": 0.0001, + "step": 3977 + }, + { + "epoch": 2.6, + "grad_norm": 0.002557477680966258, + "learning_rate": 1.2698376866060395e-05, + "loss": 0.0002, + "step": 3978 + }, + { + "epoch": 2.6, + "grad_norm": 0.0019160362426191568, + "learning_rate": 1.2656896519513787e-05, + "loss": 0.0001, + "step": 3979 + }, + { + "epoch": 2.61, + "grad_norm": 0.09930053353309631, + "learning_rate": 1.2615481048978709e-05, + "loss": 0.005, + "step": 3980 + }, + { + "epoch": 2.61, + "grad_norm": 0.002215327462181449, + "learning_rate": 1.2574130474018397e-05, + "loss": 0.0001, + "step": 3981 + }, + { + "epoch": 2.61, + "grad_norm": 0.0018856306560337543, + "learning_rate": 1.2532844814165389e-05, + "loss": 0.0001, + "step": 3982 + }, + { + "epoch": 2.61, + "grad_norm": 0.0030563059262931347, + "learning_rate": 1.2491624088921537e-05, + "loss": 0.0001, + "step": 3983 + }, + { + "epoch": 2.61, + "grad_norm": 0.04339168593287468, + "learning_rate": 1.2450468317758067e-05, + "loss": 0.0013, + "step": 3984 + }, + { + "epoch": 2.61, + "grad_norm": 0.0006221532239578664, + "learning_rate": 1.240937752011551e-05, + "loss": 0.0, + "step": 3985 + }, + { + "epoch": 2.61, + "grad_norm": 0.0016415376449003816, + "learning_rate": 1.2368351715403691e-05, + "loss": 0.0001, + "step": 3986 + }, + { + "epoch": 2.61, + "grad_norm": 0.1464734822511673, + "learning_rate": 1.2327390923001773e-05, + "loss": 0.0237, + "step": 3987 + }, + { + "epoch": 2.61, + "grad_norm": 0.002812093123793602, + "learning_rate": 1.2286495162258148e-05, + "loss": 0.0002, + "step": 3988 + }, + { + "epoch": 2.61, + "grad_norm": 0.00033319511567242444, + "learning_rate": 1.2245664452490528e-05, + "loss": 0.0, + "step": 3989 + }, + { + "epoch": 2.61, + "grad_norm": 0.002769660437479615, + "learning_rate": 1.220489881298592e-05, + "loss": 0.0002, + "step": 3990 + }, + { + "epoch": 2.61, + "grad_norm": 0.1347421109676361, + "learning_rate": 1.2164198263000568e-05, + "loss": 0.0027, + "step": 3991 + }, + { + "epoch": 2.61, + "grad_norm": 0.0063906447030603886, + "learning_rate": 1.2123562821759913e-05, + "loss": 0.0002, + "step": 3992 + }, + { + "epoch": 2.61, + "grad_norm": 0.0006798821850679815, + "learning_rate": 1.208299250845875e-05, + "loss": 0.0001, + "step": 3993 + }, + { + "epoch": 2.61, + "grad_norm": 0.020430460572242737, + "learning_rate": 1.2042487342261014e-05, + "loss": 0.0003, + "step": 3994 + }, + { + "epoch": 2.62, + "grad_norm": 0.01602082885801792, + "learning_rate": 1.2002047342299953e-05, + "loss": 0.0009, + "step": 3995 + }, + { + "epoch": 2.62, + "grad_norm": 0.003840226447209716, + "learning_rate": 1.1961672527677968e-05, + "loss": 0.0003, + "step": 3996 + }, + { + "epoch": 2.62, + "grad_norm": 0.017650924623012543, + "learning_rate": 1.1921362917466686e-05, + "loss": 0.0007, + "step": 3997 + }, + { + "epoch": 2.62, + "grad_norm": 0.021648382768034935, + "learning_rate": 1.1881118530706946e-05, + "loss": 0.0004, + "step": 3998 + }, + { + "epoch": 2.62, + "grad_norm": 0.00461529241874814, + "learning_rate": 1.1840939386408753e-05, + "loss": 0.0002, + "step": 3999 + }, + { + "epoch": 2.62, + "grad_norm": 0.0014877563808113337, + "learning_rate": 1.1800825503551364e-05, + "loss": 0.0001, + "step": 4000 + }, + { + "epoch": 2.62, + "grad_norm": 0.0007646268350072205, + "learning_rate": 1.1760776901083086e-05, + "loss": 0.0, + "step": 4001 + }, + { + "epoch": 2.62, + "grad_norm": 0.04728006199002266, + "learning_rate": 1.1720793597921468e-05, + "loss": 0.0026, + "step": 4002 + }, + { + "epoch": 2.62, + "grad_norm": 0.00066333485301584, + "learning_rate": 1.168087561295324e-05, + "loss": 0.0, + "step": 4003 + }, + { + "epoch": 2.62, + "grad_norm": 0.0015067929634824395, + "learning_rate": 1.1641022965034213e-05, + "loss": 0.0001, + "step": 4004 + }, + { + "epoch": 2.62, + "grad_norm": 0.188484326004982, + "learning_rate": 1.1601235672989383e-05, + "loss": 0.007, + "step": 4005 + }, + { + "epoch": 2.62, + "grad_norm": 0.25473806262016296, + "learning_rate": 1.156151375561285e-05, + "loss": 0.0465, + "step": 4006 + }, + { + "epoch": 2.62, + "grad_norm": 0.002568146213889122, + "learning_rate": 1.1521857231667836e-05, + "loss": 0.0002, + "step": 4007 + }, + { + "epoch": 2.62, + "grad_norm": 0.006496311631053686, + "learning_rate": 1.1482266119886708e-05, + "loss": 0.0003, + "step": 4008 + }, + { + "epoch": 2.62, + "grad_norm": 0.07505198568105698, + "learning_rate": 1.1442740438970855e-05, + "loss": 0.0032, + "step": 4009 + }, + { + "epoch": 2.63, + "grad_norm": 0.004610721487551928, + "learning_rate": 1.140328020759081e-05, + "loss": 0.0004, + "step": 4010 + }, + { + "epoch": 2.63, + "grad_norm": 0.5489254593849182, + "learning_rate": 1.1363885444386212e-05, + "loss": 0.0091, + "step": 4011 + }, + { + "epoch": 2.63, + "grad_norm": 0.03889350965619087, + "learning_rate": 1.1324556167965742e-05, + "loss": 0.0007, + "step": 4012 + }, + { + "epoch": 2.63, + "grad_norm": 0.08343320339918137, + "learning_rate": 1.1285292396907142e-05, + "loss": 0.0029, + "step": 4013 + }, + { + "epoch": 2.63, + "grad_norm": 0.0036601496394723654, + "learning_rate": 1.124609414975724e-05, + "loss": 0.0002, + "step": 4014 + }, + { + "epoch": 2.63, + "grad_norm": 0.0038177622482180595, + "learning_rate": 1.120696144503191e-05, + "loss": 0.0001, + "step": 4015 + }, + { + "epoch": 2.63, + "grad_norm": 0.018228301778435707, + "learning_rate": 1.116789430121603e-05, + "loss": 0.0007, + "step": 4016 + }, + { + "epoch": 2.63, + "grad_norm": 0.00024546097847633064, + "learning_rate": 1.1128892736763573e-05, + "loss": 0.0, + "step": 4017 + }, + { + "epoch": 2.63, + "grad_norm": 0.00047648849431425333, + "learning_rate": 1.1089956770097436e-05, + "loss": 0.0, + "step": 4018 + }, + { + "epoch": 2.63, + "grad_norm": 0.5794171690940857, + "learning_rate": 1.1051086419609605e-05, + "loss": 0.0263, + "step": 4019 + }, + { + "epoch": 2.63, + "grad_norm": 0.0021978423465043306, + "learning_rate": 1.101228170366108e-05, + "loss": 0.0001, + "step": 4020 + }, + { + "epoch": 2.63, + "grad_norm": 0.009366245940327644, + "learning_rate": 1.0973542640581828e-05, + "loss": 0.0003, + "step": 4021 + }, + { + "epoch": 2.63, + "grad_norm": 0.2813292443752289, + "learning_rate": 1.0934869248670797e-05, + "loss": 0.0233, + "step": 4022 + }, + { + "epoch": 2.63, + "grad_norm": 0.007486666552722454, + "learning_rate": 1.089626154619594e-05, + "loss": 0.0003, + "step": 4023 + }, + { + "epoch": 2.63, + "grad_norm": 0.0034093467984348536, + "learning_rate": 1.085771955139415e-05, + "loss": 0.0002, + "step": 4024 + }, + { + "epoch": 2.64, + "grad_norm": 0.005050848238170147, + "learning_rate": 1.0819243282471286e-05, + "loss": 0.0003, + "step": 4025 + }, + { + "epoch": 2.64, + "grad_norm": 0.05544862896203995, + "learning_rate": 1.0780832757602203e-05, + "loss": 0.0017, + "step": 4026 + }, + { + "epoch": 2.64, + "grad_norm": 0.1471461057662964, + "learning_rate": 1.074248799493066e-05, + "loss": 0.0126, + "step": 4027 + }, + { + "epoch": 2.64, + "grad_norm": 0.09326312690973282, + "learning_rate": 1.0704209012569398e-05, + "loss": 0.0033, + "step": 4028 + }, + { + "epoch": 2.64, + "grad_norm": 0.0051653687842190266, + "learning_rate": 1.0665995828599971e-05, + "loss": 0.0002, + "step": 4029 + }, + { + "epoch": 2.64, + "grad_norm": 0.11208591610193253, + "learning_rate": 1.0627848461072991e-05, + "loss": 0.0026, + "step": 4030 + }, + { + "epoch": 2.64, + "grad_norm": 0.0023857189808040857, + "learning_rate": 1.0589766928007893e-05, + "loss": 0.0001, + "step": 4031 + }, + { + "epoch": 2.64, + "grad_norm": 0.003841506550088525, + "learning_rate": 1.055175124739307e-05, + "loss": 0.0003, + "step": 4032 + }, + { + "epoch": 2.64, + "grad_norm": 0.015454866923391819, + "learning_rate": 1.051380143718576e-05, + "loss": 0.0011, + "step": 4033 + }, + { + "epoch": 2.64, + "grad_norm": 0.011828050948679447, + "learning_rate": 1.0475917515312122e-05, + "loss": 0.0003, + "step": 4034 + }, + { + "epoch": 2.64, + "grad_norm": 0.00919797271490097, + "learning_rate": 1.0438099499667175e-05, + "loss": 0.0004, + "step": 4035 + }, + { + "epoch": 2.64, + "grad_norm": 0.0012486804043874145, + "learning_rate": 1.0400347408114812e-05, + "loss": 0.0001, + "step": 4036 + }, + { + "epoch": 2.64, + "grad_norm": 0.048705220222473145, + "learning_rate": 1.036266125848777e-05, + "loss": 0.0023, + "step": 4037 + }, + { + "epoch": 2.64, + "grad_norm": 0.03676815703511238, + "learning_rate": 1.0325041068587642e-05, + "loss": 0.0017, + "step": 4038 + }, + { + "epoch": 2.64, + "grad_norm": 0.005671919789165258, + "learning_rate": 1.0287486856184878e-05, + "loss": 0.0003, + "step": 4039 + }, + { + "epoch": 2.64, + "grad_norm": 0.0018038189737126231, + "learning_rate": 1.0249998639018775e-05, + "loss": 0.0001, + "step": 4040 + }, + { + "epoch": 2.65, + "grad_norm": 0.010594921186566353, + "learning_rate": 1.0212576434797432e-05, + "loss": 0.0006, + "step": 4041 + }, + { + "epoch": 2.65, + "grad_norm": 0.002736148191615939, + "learning_rate": 1.0175220261197743e-05, + "loss": 0.0002, + "step": 4042 + }, + { + "epoch": 2.65, + "grad_norm": 0.008918588049709797, + "learning_rate": 1.0137930135865474e-05, + "loss": 0.0007, + "step": 4043 + }, + { + "epoch": 2.65, + "grad_norm": 0.10005183517932892, + "learning_rate": 1.0100706076415138e-05, + "loss": 0.0036, + "step": 4044 + }, + { + "epoch": 2.65, + "grad_norm": 0.14248734712600708, + "learning_rate": 1.0063548100430102e-05, + "loss": 0.0099, + "step": 4045 + }, + { + "epoch": 2.65, + "grad_norm": 0.018299754709005356, + "learning_rate": 1.002645622546241e-05, + "loss": 0.0007, + "step": 4046 + }, + { + "epoch": 2.65, + "grad_norm": 0.0032806459348648787, + "learning_rate": 9.989430469032977e-06, + "loss": 0.0002, + "step": 4047 + }, + { + "epoch": 2.65, + "grad_norm": 0.020569033920764923, + "learning_rate": 9.95247084863145e-06, + "loss": 0.0012, + "step": 4048 + }, + { + "epoch": 2.65, + "grad_norm": 0.004910091403871775, + "learning_rate": 9.91557738171626e-06, + "loss": 0.0001, + "step": 4049 + }, + { + "epoch": 2.65, + "grad_norm": 0.003237323137000203, + "learning_rate": 9.87875008571457e-06, + "loss": 0.0002, + "step": 4050 + }, + { + "epoch": 2.65, + "grad_norm": 0.03983073681592941, + "learning_rate": 9.84198897802228e-06, + "loss": 0.0014, + "step": 4051 + }, + { + "epoch": 2.65, + "grad_norm": 0.03903353214263916, + "learning_rate": 9.80529407600405e-06, + "loss": 0.0014, + "step": 4052 + }, + { + "epoch": 2.65, + "grad_norm": 0.005356269888579845, + "learning_rate": 9.76866539699323e-06, + "loss": 0.0003, + "step": 4053 + }, + { + "epoch": 2.65, + "grad_norm": 0.2587814927101135, + "learning_rate": 9.732102958291931e-06, + "loss": 0.0063, + "step": 4054 + }, + { + "epoch": 2.65, + "grad_norm": 0.34192776679992676, + "learning_rate": 9.695606777170922e-06, + "loss": 0.0224, + "step": 4055 + }, + { + "epoch": 2.66, + "grad_norm": 0.05687537416815758, + "learning_rate": 9.659176870869728e-06, + "loss": 0.0027, + "step": 4056 + }, + { + "epoch": 2.66, + "grad_norm": 0.002049859380349517, + "learning_rate": 9.622813256596518e-06, + "loss": 0.0001, + "step": 4057 + }, + { + "epoch": 2.66, + "grad_norm": 0.02976830117404461, + "learning_rate": 9.586515951528217e-06, + "loss": 0.0016, + "step": 4058 + }, + { + "epoch": 2.66, + "grad_norm": 0.002136975759640336, + "learning_rate": 9.550284972810345e-06, + "loss": 0.0002, + "step": 4059 + }, + { + "epoch": 2.66, + "grad_norm": 0.0014137733960524201, + "learning_rate": 9.514120337557147e-06, + "loss": 0.0001, + "step": 4060 + }, + { + "epoch": 2.66, + "grad_norm": 0.003990999888628721, + "learning_rate": 9.47802206285152e-06, + "loss": 0.0002, + "step": 4061 + }, + { + "epoch": 2.66, + "grad_norm": 0.03214813768863678, + "learning_rate": 9.441990165745028e-06, + "loss": 0.0011, + "step": 4062 + }, + { + "epoch": 2.66, + "grad_norm": 0.029579009860754013, + "learning_rate": 9.406024663257821e-06, + "loss": 0.0011, + "step": 4063 + }, + { + "epoch": 2.66, + "grad_norm": 0.013909817673265934, + "learning_rate": 9.370125572378728e-06, + "loss": 0.0003, + "step": 4064 + }, + { + "epoch": 2.66, + "grad_norm": 0.0007777772261761129, + "learning_rate": 9.334292910065234e-06, + "loss": 0.0, + "step": 4065 + }, + { + "epoch": 2.66, + "grad_norm": 0.0012692922027781606, + "learning_rate": 9.298526693243396e-06, + "loss": 0.0001, + "step": 4066 + }, + { + "epoch": 2.66, + "grad_norm": 0.008403139188885689, + "learning_rate": 9.262826938807939e-06, + "loss": 0.0003, + "step": 4067 + }, + { + "epoch": 2.66, + "grad_norm": 0.0036738510243594646, + "learning_rate": 9.227193663622118e-06, + "loss": 0.0002, + "step": 4068 + }, + { + "epoch": 2.66, + "grad_norm": 0.003529123729094863, + "learning_rate": 9.191626884517855e-06, + "loss": 0.0002, + "step": 4069 + }, + { + "epoch": 2.66, + "grad_norm": 0.006939716637134552, + "learning_rate": 9.156126618295611e-06, + "loss": 0.0002, + "step": 4070 + }, + { + "epoch": 2.67, + "grad_norm": 0.0020183094311505556, + "learning_rate": 9.120692881724522e-06, + "loss": 0.0001, + "step": 4071 + }, + { + "epoch": 2.67, + "grad_norm": 0.0027899667620658875, + "learning_rate": 9.085325691542134e-06, + "loss": 0.0002, + "step": 4072 + }, + { + "epoch": 2.67, + "grad_norm": 0.037185538560152054, + "learning_rate": 9.050025064454697e-06, + "loss": 0.0009, + "step": 4073 + }, + { + "epoch": 2.67, + "grad_norm": 0.3907364010810852, + "learning_rate": 9.01479101713699e-06, + "loss": 0.0131, + "step": 4074 + }, + { + "epoch": 2.67, + "grad_norm": 0.00502181239426136, + "learning_rate": 8.979623566232302e-06, + "loss": 0.0002, + "step": 4075 + }, + { + "epoch": 2.67, + "grad_norm": 0.008093554526567459, + "learning_rate": 8.94452272835251e-06, + "loss": 0.0003, + "step": 4076 + }, + { + "epoch": 2.67, + "grad_norm": 0.004010841716080904, + "learning_rate": 8.909488520077984e-06, + "loss": 0.0002, + "step": 4077 + }, + { + "epoch": 2.67, + "grad_norm": 0.03718475252389908, + "learning_rate": 8.874520957957654e-06, + "loss": 0.0012, + "step": 4078 + }, + { + "epoch": 2.67, + "grad_norm": 0.1648079752922058, + "learning_rate": 8.839620058508956e-06, + "loss": 0.0076, + "step": 4079 + }, + { + "epoch": 2.67, + "grad_norm": 0.0014669696101918817, + "learning_rate": 8.804785838217853e-06, + "loss": 0.0001, + "step": 4080 + }, + { + "epoch": 2.67, + "grad_norm": 0.08992872387170792, + "learning_rate": 8.770018313538768e-06, + "loss": 0.0021, + "step": 4081 + }, + { + "epoch": 2.67, + "grad_norm": 0.0030081751756370068, + "learning_rate": 8.735317500894662e-06, + "loss": 0.0002, + "step": 4082 + }, + { + "epoch": 2.67, + "grad_norm": 0.0023803289514034986, + "learning_rate": 8.700683416676957e-06, + "loss": 0.0001, + "step": 4083 + }, + { + "epoch": 2.67, + "grad_norm": 0.2307877540588379, + "learning_rate": 8.666116077245566e-06, + "loss": 0.0062, + "step": 4084 + }, + { + "epoch": 2.67, + "grad_norm": 0.006816699169576168, + "learning_rate": 8.631615498928879e-06, + "loss": 0.0004, + "step": 4085 + }, + { + "epoch": 2.67, + "grad_norm": 0.01713281124830246, + "learning_rate": 8.59718169802376e-06, + "loss": 0.0006, + "step": 4086 + }, + { + "epoch": 2.68, + "grad_norm": 0.05028926581144333, + "learning_rate": 8.562814690795495e-06, + "loss": 0.0014, + "step": 4087 + }, + { + "epoch": 2.68, + "grad_norm": 0.0028437310829758644, + "learning_rate": 8.52851449347785e-06, + "loss": 0.0001, + "step": 4088 + }, + { + "epoch": 2.68, + "grad_norm": 0.00861066672950983, + "learning_rate": 8.49428112227305e-06, + "loss": 0.0005, + "step": 4089 + }, + { + "epoch": 2.68, + "grad_norm": 0.011029339395463467, + "learning_rate": 8.460114593351674e-06, + "loss": 0.0007, + "step": 4090 + }, + { + "epoch": 2.68, + "grad_norm": 0.010739943943917751, + "learning_rate": 8.426014922852781e-06, + "loss": 0.0003, + "step": 4091 + }, + { + "epoch": 2.68, + "grad_norm": 0.11427808552980423, + "learning_rate": 8.391982126883883e-06, + "loss": 0.0064, + "step": 4092 + }, + { + "epoch": 2.68, + "grad_norm": 0.0007959986687637866, + "learning_rate": 8.358016221520841e-06, + "loss": 0.0, + "step": 4093 + }, + { + "epoch": 2.68, + "grad_norm": 0.017499376088380814, + "learning_rate": 8.324117222807953e-06, + "loss": 0.0006, + "step": 4094 + }, + { + "epoch": 2.68, + "grad_norm": 0.06074637174606323, + "learning_rate": 8.29028514675789e-06, + "loss": 0.0044, + "step": 4095 + }, + { + "epoch": 2.68, + "grad_norm": 0.01843813993036747, + "learning_rate": 8.256520009351758e-06, + "loss": 0.0006, + "step": 4096 + }, + { + "epoch": 2.68, + "grad_norm": 0.0039335633628070354, + "learning_rate": 8.222821826538995e-06, + "loss": 0.0002, + "step": 4097 + }, + { + "epoch": 2.68, + "grad_norm": 0.005784610286355019, + "learning_rate": 8.189190614237418e-06, + "loss": 0.0003, + "step": 4098 + }, + { + "epoch": 2.68, + "grad_norm": 0.0008105946471914649, + "learning_rate": 8.155626388333203e-06, + "loss": 0.0, + "step": 4099 + }, + { + "epoch": 2.68, + "grad_norm": 0.023521745577454567, + "learning_rate": 8.122129164680936e-06, + "loss": 0.0011, + "step": 4100 + }, + { + "epoch": 2.68, + "grad_norm": 0.0019911215640604496, + "learning_rate": 8.08869895910349e-06, + "loss": 0.0001, + "step": 4101 + }, + { + "epoch": 2.69, + "grad_norm": 0.00166597543284297, + "learning_rate": 8.05533578739212e-06, + "loss": 0.0001, + "step": 4102 + }, + { + "epoch": 2.69, + "grad_norm": 0.0048183235339820385, + "learning_rate": 8.022039665306401e-06, + "loss": 0.0002, + "step": 4103 + }, + { + "epoch": 2.69, + "grad_norm": 0.11916442960500717, + "learning_rate": 7.988810608574253e-06, + "loss": 0.0025, + "step": 4104 + }, + { + "epoch": 2.69, + "grad_norm": 0.0017527136951684952, + "learning_rate": 7.955648632891904e-06, + "loss": 0.0001, + "step": 4105 + }, + { + "epoch": 2.69, + "grad_norm": 0.0030303006060421467, + "learning_rate": 7.922553753923905e-06, + "loss": 0.0002, + "step": 4106 + }, + { + "epoch": 2.69, + "grad_norm": 0.00476842699572444, + "learning_rate": 7.889525987303053e-06, + "loss": 0.0002, + "step": 4107 + }, + { + "epoch": 2.69, + "grad_norm": 0.038915570825338364, + "learning_rate": 7.856565348630534e-06, + "loss": 0.0015, + "step": 4108 + }, + { + "epoch": 2.69, + "grad_norm": 0.2311791330575943, + "learning_rate": 7.823671853475776e-06, + "loss": 0.0238, + "step": 4109 + }, + { + "epoch": 2.69, + "grad_norm": 0.0026899471413344145, + "learning_rate": 7.790845517376487e-06, + "loss": 0.0001, + "step": 4110 + }, + { + "epoch": 2.69, + "grad_norm": 0.009794102981686592, + "learning_rate": 7.758086355838695e-06, + "loss": 0.0004, + "step": 4111 + }, + { + "epoch": 2.69, + "grad_norm": 0.15422171354293823, + "learning_rate": 7.725394384336637e-06, + "loss": 0.004, + "step": 4112 + }, + { + "epoch": 2.69, + "grad_norm": 0.0044794646091759205, + "learning_rate": 7.692769618312861e-06, + "loss": 0.0002, + "step": 4113 + }, + { + "epoch": 2.69, + "grad_norm": 0.10780015587806702, + "learning_rate": 7.66021207317814e-06, + "loss": 0.0078, + "step": 4114 + }, + { + "epoch": 2.69, + "grad_norm": 0.004076341167092323, + "learning_rate": 7.627721764311523e-06, + "loss": 0.0003, + "step": 4115 + }, + { + "epoch": 2.69, + "grad_norm": 0.04642152413725853, + "learning_rate": 7.595298707060249e-06, + "loss": 0.0009, + "step": 4116 + }, + { + "epoch": 2.7, + "grad_norm": 0.0005127813201397657, + "learning_rate": 7.562942916739817e-06, + "loss": 0.0, + "step": 4117 + }, + { + "epoch": 2.7, + "grad_norm": 0.0011135004460811615, + "learning_rate": 7.530654408633985e-06, + "loss": 0.0001, + "step": 4118 + }, + { + "epoch": 2.7, + "grad_norm": 0.001014087232761085, + "learning_rate": 7.498433197994685e-06, + "loss": 0.0001, + "step": 4119 + }, + { + "epoch": 2.7, + "grad_norm": 0.00496373837813735, + "learning_rate": 7.466279300042061e-06, + "loss": 0.0002, + "step": 4120 + }, + { + "epoch": 2.7, + "grad_norm": 0.01596042700111866, + "learning_rate": 7.4341927299644945e-06, + "loss": 0.0007, + "step": 4121 + }, + { + "epoch": 2.7, + "grad_norm": 0.019729454070329666, + "learning_rate": 7.402173502918546e-06, + "loss": 0.0004, + "step": 4122 + }, + { + "epoch": 2.7, + "grad_norm": 0.009947029873728752, + "learning_rate": 7.3702216340289665e-06, + "loss": 0.0003, + "step": 4123 + }, + { + "epoch": 2.7, + "grad_norm": 0.003047047182917595, + "learning_rate": 7.33833713838865e-06, + "loss": 0.0001, + "step": 4124 + }, + { + "epoch": 2.7, + "grad_norm": 0.008738638833165169, + "learning_rate": 7.306520031058749e-06, + "loss": 0.0002, + "step": 4125 + }, + { + "epoch": 2.7, + "grad_norm": 0.1427345871925354, + "learning_rate": 7.274770327068474e-06, + "loss": 0.0361, + "step": 4126 + }, + { + "epoch": 2.7, + "grad_norm": 0.047303181141614914, + "learning_rate": 7.243088041415312e-06, + "loss": 0.0015, + "step": 4127 + }, + { + "epoch": 2.7, + "grad_norm": 0.0006034310208633542, + "learning_rate": 7.2114731890648424e-06, + "loss": 0.0, + "step": 4128 + }, + { + "epoch": 2.7, + "grad_norm": 0.0025167923886328936, + "learning_rate": 7.179925784950785e-06, + "loss": 0.0001, + "step": 4129 + }, + { + "epoch": 2.7, + "grad_norm": 0.022558756172657013, + "learning_rate": 7.14844584397502e-06, + "loss": 0.0006, + "step": 4130 + }, + { + "epoch": 2.7, + "grad_norm": 0.016648834571242332, + "learning_rate": 7.11703338100757e-06, + "loss": 0.0006, + "step": 4131 + }, + { + "epoch": 2.71, + "grad_norm": 0.09894277900457382, + "learning_rate": 7.085688410886548e-06, + "loss": 0.0032, + "step": 4132 + }, + { + "epoch": 2.71, + "grad_norm": 0.20772482454776764, + "learning_rate": 7.054410948418227e-06, + "loss": 0.0426, + "step": 4133 + }, + { + "epoch": 2.71, + "grad_norm": 0.10547536611557007, + "learning_rate": 7.023201008376972e-06, + "loss": 0.0014, + "step": 4134 + }, + { + "epoch": 2.71, + "grad_norm": 0.07568886876106262, + "learning_rate": 6.992058605505224e-06, + "loss": 0.0017, + "step": 4135 + }, + { + "epoch": 2.71, + "grad_norm": 0.005178254097700119, + "learning_rate": 6.960983754513566e-06, + "loss": 0.0002, + "step": 4136 + }, + { + "epoch": 2.71, + "grad_norm": 0.009225263260304928, + "learning_rate": 6.929976470080639e-06, + "loss": 0.0004, + "step": 4137 + }, + { + "epoch": 2.71, + "grad_norm": 0.011907052248716354, + "learning_rate": 6.899036766853211e-06, + "loss": 0.0005, + "step": 4138 + }, + { + "epoch": 2.71, + "grad_norm": 0.04745527729392052, + "learning_rate": 6.868164659446107e-06, + "loss": 0.0009, + "step": 4139 + }, + { + "epoch": 2.71, + "grad_norm": 0.002321612322703004, + "learning_rate": 6.837360162442179e-06, + "loss": 0.0001, + "step": 4140 + }, + { + "epoch": 2.71, + "grad_norm": 0.0042698998004198074, + "learning_rate": 6.806623290392405e-06, + "loss": 0.0002, + "step": 4141 + }, + { + "epoch": 2.71, + "grad_norm": 0.024780068546533585, + "learning_rate": 6.775954057815786e-06, + "loss": 0.0008, + "step": 4142 + }, + { + "epoch": 2.71, + "grad_norm": 0.009199246764183044, + "learning_rate": 6.745352479199401e-06, + "loss": 0.0005, + "step": 4143 + }, + { + "epoch": 2.71, + "grad_norm": 0.009111308492720127, + "learning_rate": 6.71481856899832e-06, + "loss": 0.0005, + "step": 4144 + }, + { + "epoch": 2.71, + "grad_norm": 0.016595518216490746, + "learning_rate": 6.684352341635673e-06, + "loss": 0.0007, + "step": 4145 + }, + { + "epoch": 2.71, + "grad_norm": 0.002586304908618331, + "learning_rate": 6.653953811502649e-06, + "loss": 0.0002, + "step": 4146 + }, + { + "epoch": 2.71, + "grad_norm": 0.0032956686336547136, + "learning_rate": 6.623622992958444e-06, + "loss": 0.0001, + "step": 4147 + }, + { + "epoch": 2.72, + "grad_norm": 0.0011785841779783368, + "learning_rate": 6.59335990033023e-06, + "loss": 0.0001, + "step": 4148 + }, + { + "epoch": 2.72, + "grad_norm": 0.0005113594233989716, + "learning_rate": 6.563164547913241e-06, + "loss": 0.0, + "step": 4149 + }, + { + "epoch": 2.72, + "grad_norm": 0.2723368704319, + "learning_rate": 6.533036949970683e-06, + "loss": 0.0573, + "step": 4150 + }, + { + "epoch": 2.72, + "grad_norm": 0.019609831273555756, + "learning_rate": 6.5029771207337874e-06, + "loss": 0.0007, + "step": 4151 + }, + { + "epoch": 2.72, + "grad_norm": 0.06340056657791138, + "learning_rate": 6.472985074401715e-06, + "loss": 0.0009, + "step": 4152 + }, + { + "epoch": 2.72, + "grad_norm": 0.08457580208778381, + "learning_rate": 6.443060825141649e-06, + "loss": 0.0016, + "step": 4153 + }, + { + "epoch": 2.72, + "grad_norm": 0.06118530035018921, + "learning_rate": 6.413204387088766e-06, + "loss": 0.0031, + "step": 4154 + }, + { + "epoch": 2.72, + "grad_norm": 0.15247440338134766, + "learning_rate": 6.3834157743461675e-06, + "loss": 0.0015, + "step": 4155 + }, + { + "epoch": 2.72, + "grad_norm": 0.15113098919391632, + "learning_rate": 6.353695000984965e-06, + "loss": 0.0053, + "step": 4156 + }, + { + "epoch": 2.72, + "grad_norm": 0.008273385465145111, + "learning_rate": 6.324042081044161e-06, + "loss": 0.0003, + "step": 4157 + }, + { + "epoch": 2.72, + "grad_norm": 0.0017637746641412377, + "learning_rate": 6.2944570285307695e-06, + "loss": 0.0001, + "step": 4158 + }, + { + "epoch": 2.72, + "grad_norm": 0.5722165107727051, + "learning_rate": 6.264939857419726e-06, + "loss": 0.035, + "step": 4159 + }, + { + "epoch": 2.72, + "grad_norm": 0.0032465998083353043, + "learning_rate": 6.235490581653896e-06, + "loss": 0.0002, + "step": 4160 + }, + { + "epoch": 2.72, + "grad_norm": 0.0011040384415537119, + "learning_rate": 6.2061092151440335e-06, + "loss": 0.0001, + "step": 4161 + }, + { + "epoch": 2.72, + "grad_norm": 0.004833061248064041, + "learning_rate": 6.176795771768889e-06, + "loss": 0.0003, + "step": 4162 + }, + { + "epoch": 2.73, + "grad_norm": 0.05971897020936012, + "learning_rate": 6.1475502653750845e-06, + "loss": 0.002, + "step": 4163 + }, + { + "epoch": 2.73, + "grad_norm": 0.0014663523761555552, + "learning_rate": 6.118372709777153e-06, + "loss": 0.0001, + "step": 4164 + }, + { + "epoch": 2.73, + "grad_norm": 0.10879574716091156, + "learning_rate": 6.089263118757554e-06, + "loss": 0.0042, + "step": 4165 + }, + { + "epoch": 2.73, + "grad_norm": 0.030240798369050026, + "learning_rate": 6.060221506066604e-06, + "loss": 0.0004, + "step": 4166 + }, + { + "epoch": 2.73, + "grad_norm": 0.01000500563532114, + "learning_rate": 6.0312478854225635e-06, + "loss": 0.0005, + "step": 4167 + }, + { + "epoch": 2.73, + "grad_norm": 0.0015593727584928274, + "learning_rate": 6.002342270511518e-06, + "loss": 0.0001, + "step": 4168 + }, + { + "epoch": 2.73, + "grad_norm": 0.0114108482375741, + "learning_rate": 5.973504674987461e-06, + "loss": 0.0005, + "step": 4169 + }, + { + "epoch": 2.73, + "grad_norm": 0.002675750060006976, + "learning_rate": 5.944735112472248e-06, + "loss": 0.0002, + "step": 4170 + }, + { + "epoch": 2.73, + "grad_norm": 0.0036464605946093798, + "learning_rate": 5.916033596555608e-06, + "loss": 0.0002, + "step": 4171 + }, + { + "epoch": 2.73, + "grad_norm": 0.009755423292517662, + "learning_rate": 5.887400140795095e-06, + "loss": 0.0004, + "step": 4172 + }, + { + "epoch": 2.73, + "grad_norm": 0.010763165540993214, + "learning_rate": 5.858834758716175e-06, + "loss": 0.0005, + "step": 4173 + }, + { + "epoch": 2.73, + "grad_norm": 0.002421615645289421, + "learning_rate": 5.830337463812085e-06, + "loss": 0.0002, + "step": 4174 + }, + { + "epoch": 2.73, + "grad_norm": 0.005541190970689058, + "learning_rate": 5.801908269543975e-06, + "loss": 0.0003, + "step": 4175 + }, + { + "epoch": 2.73, + "grad_norm": 0.0029589326586574316, + "learning_rate": 5.773547189340754e-06, + "loss": 0.0001, + "step": 4176 + }, + { + "epoch": 2.73, + "grad_norm": 0.0014601044822484255, + "learning_rate": 5.745254236599206e-06, + "loss": 0.0001, + "step": 4177 + }, + { + "epoch": 2.74, + "grad_norm": 0.005274084396660328, + "learning_rate": 5.717029424683939e-06, + "loss": 0.0002, + "step": 4178 + }, + { + "epoch": 2.74, + "grad_norm": 0.026927761733531952, + "learning_rate": 5.688872766927305e-06, + "loss": 0.0011, + "step": 4179 + }, + { + "epoch": 2.74, + "grad_norm": 0.23306012153625488, + "learning_rate": 5.660784276629532e-06, + "loss": 0.0304, + "step": 4180 + }, + { + "epoch": 2.74, + "grad_norm": 0.26498934626579285, + "learning_rate": 5.63276396705864e-06, + "loss": 0.0039, + "step": 4181 + }, + { + "epoch": 2.74, + "grad_norm": 0.007049161940813065, + "learning_rate": 5.604811851450425e-06, + "loss": 0.0002, + "step": 4182 + }, + { + "epoch": 2.74, + "grad_norm": 0.06826946884393692, + "learning_rate": 5.576927943008458e-06, + "loss": 0.0022, + "step": 4183 + }, + { + "epoch": 2.74, + "grad_norm": 0.0578896664083004, + "learning_rate": 5.549112254904137e-06, + "loss": 0.0029, + "step": 4184 + }, + { + "epoch": 2.74, + "grad_norm": 0.0023675072006881237, + "learning_rate": 5.521364800276585e-06, + "loss": 0.0001, + "step": 4185 + }, + { + "epoch": 2.74, + "grad_norm": 0.00033406232250854373, + "learning_rate": 5.493685592232733e-06, + "loss": 0.0, + "step": 4186 + }, + { + "epoch": 2.74, + "grad_norm": 0.01783887669444084, + "learning_rate": 5.46607464384724e-06, + "loss": 0.0006, + "step": 4187 + }, + { + "epoch": 2.74, + "grad_norm": 0.016289999708533287, + "learning_rate": 5.43853196816254e-06, + "loss": 0.0005, + "step": 4188 + }, + { + "epoch": 2.74, + "grad_norm": 0.04762318357825279, + "learning_rate": 5.41105757818881e-06, + "loss": 0.0015, + "step": 4189 + }, + { + "epoch": 2.74, + "grad_norm": 0.0011672631371766329, + "learning_rate": 5.383651486904e-06, + "loss": 0.0, + "step": 4190 + }, + { + "epoch": 2.74, + "grad_norm": 0.0036867919843643904, + "learning_rate": 5.356313707253756e-06, + "loss": 0.0002, + "step": 4191 + }, + { + "epoch": 2.74, + "grad_norm": 0.007769985590130091, + "learning_rate": 5.329044252151499e-06, + "loss": 0.0003, + "step": 4192 + }, + { + "epoch": 2.75, + "grad_norm": 0.0023366445675492287, + "learning_rate": 5.301843134478323e-06, + "loss": 0.0001, + "step": 4193 + }, + { + "epoch": 2.75, + "grad_norm": 0.0020640569273382425, + "learning_rate": 5.274710367083085e-06, + "loss": 0.0001, + "step": 4194 + }, + { + "epoch": 2.75, + "grad_norm": 0.014843268319964409, + "learning_rate": 5.247645962782365e-06, + "loss": 0.0007, + "step": 4195 + }, + { + "epoch": 2.75, + "grad_norm": 0.25799453258514404, + "learning_rate": 5.220649934360388e-06, + "loss": 0.0117, + "step": 4196 + }, + { + "epoch": 2.75, + "grad_norm": 0.0038819615729153156, + "learning_rate": 5.1937222945691525e-06, + "loss": 0.0003, + "step": 4197 + }, + { + "epoch": 2.75, + "grad_norm": 0.0014182263985276222, + "learning_rate": 5.166863056128284e-06, + "loss": 0.0001, + "step": 4198 + }, + { + "epoch": 2.75, + "grad_norm": 0.022825760766863823, + "learning_rate": 5.140072231725168e-06, + "loss": 0.0011, + "step": 4199 + }, + { + "epoch": 2.75, + "grad_norm": 0.0015431575011461973, + "learning_rate": 5.113349834014829e-06, + "loss": 0.0001, + "step": 4200 + }, + { + "epoch": 2.75, + "grad_norm": 0.001278590178117156, + "learning_rate": 5.0866958756199725e-06, + "loss": 0.0001, + "step": 4201 + }, + { + "epoch": 2.75, + "grad_norm": 0.7672184109687805, + "learning_rate": 5.060110369130993e-06, + "loss": 0.0078, + "step": 4202 + }, + { + "epoch": 2.75, + "eval_loss": 0.04604343697428703, + "eval_runtime": 40.0954, + "eval_samples_per_second": 32.098, + "eval_steps_per_second": 8.031, + "step": 4202 + }, + { + "epoch": 2.75, + "grad_norm": 0.001736398204229772, + "learning_rate": 5.033593327105945e-06, + "loss": 0.0001, + "step": 4203 + }, + { + "epoch": 2.75, + "grad_norm": 0.18435488641262054, + "learning_rate": 5.007144762070542e-06, + "loss": 0.0049, + "step": 4204 + }, + { + "epoch": 2.75, + "grad_norm": 0.0006806718884035945, + "learning_rate": 4.980764686518124e-06, + "loss": 0.0, + "step": 4205 + }, + { + "epoch": 2.75, + "grad_norm": 0.04827133193612099, + "learning_rate": 4.9544531129097065e-06, + "loss": 0.0009, + "step": 4206 + }, + { + "epoch": 2.75, + "grad_norm": 0.005201430991292, + "learning_rate": 4.928210053673964e-06, + "loss": 0.0003, + "step": 4207 + }, + { + "epoch": 2.75, + "grad_norm": 0.008607663214206696, + "learning_rate": 4.902035521207182e-06, + "loss": 0.0003, + "step": 4208 + }, + { + "epoch": 2.76, + "grad_norm": 0.39723023772239685, + "learning_rate": 4.875929527873268e-06, + "loss": 0.0168, + "step": 4209 + }, + { + "epoch": 2.76, + "grad_norm": 0.0499483086168766, + "learning_rate": 4.849892086003776e-06, + "loss": 0.0018, + "step": 4210 + }, + { + "epoch": 2.76, + "grad_norm": 0.009101290255784988, + "learning_rate": 4.823923207897884e-06, + "loss": 0.0002, + "step": 4211 + }, + { + "epoch": 2.76, + "grad_norm": 0.009669370949268341, + "learning_rate": 4.798022905822363e-06, + "loss": 0.0005, + "step": 4212 + }, + { + "epoch": 2.76, + "grad_norm": 0.09165952354669571, + "learning_rate": 4.7721911920115764e-06, + "loss": 0.0014, + "step": 4213 + }, + { + "epoch": 2.76, + "grad_norm": 0.0015672995941713452, + "learning_rate": 4.746428078667513e-06, + "loss": 0.0001, + "step": 4214 + }, + { + "epoch": 2.76, + "grad_norm": 0.004250067751854658, + "learning_rate": 4.7207335779597736e-06, + "loss": 0.0002, + "step": 4215 + }, + { + "epoch": 2.76, + "grad_norm": 0.0006820221897214651, + "learning_rate": 4.69510770202553e-06, + "loss": 0.0, + "step": 4216 + }, + { + "epoch": 2.76, + "grad_norm": 0.014065735973417759, + "learning_rate": 4.669550462969518e-06, + "loss": 0.0002, + "step": 4217 + }, + { + "epoch": 2.76, + "grad_norm": 0.004080608021467924, + "learning_rate": 4.644061872864063e-06, + "loss": 0.0002, + "step": 4218 + }, + { + "epoch": 2.76, + "grad_norm": 0.004687106236815453, + "learning_rate": 4.618641943749119e-06, + "loss": 0.0001, + "step": 4219 + }, + { + "epoch": 2.76, + "grad_norm": 0.005547702312469482, + "learning_rate": 4.593290687632112e-06, + "loss": 0.0003, + "step": 4220 + }, + { + "epoch": 2.76, + "grad_norm": 0.32303404808044434, + "learning_rate": 4.568008116488098e-06, + "loss": 0.0127, + "step": 4221 + }, + { + "epoch": 2.76, + "grad_norm": 0.009234381839632988, + "learning_rate": 4.542794242259656e-06, + "loss": 0.0004, + "step": 4222 + }, + { + "epoch": 2.76, + "grad_norm": 0.017688224092125893, + "learning_rate": 4.517649076856944e-06, + "loss": 0.0005, + "step": 4223 + }, + { + "epoch": 2.77, + "grad_norm": 0.0029386677779257298, + "learning_rate": 4.49257263215761e-06, + "loss": 0.0001, + "step": 4224 + }, + { + "epoch": 2.77, + "grad_norm": 0.21983179450035095, + "learning_rate": 4.467564920006927e-06, + "loss": 0.0126, + "step": 4225 + }, + { + "epoch": 2.77, + "grad_norm": 0.006616625003516674, + "learning_rate": 4.442625952217615e-06, + "loss": 0.0004, + "step": 4226 + }, + { + "epoch": 2.77, + "grad_norm": 0.004712941590696573, + "learning_rate": 4.417755740569967e-06, + "loss": 0.0003, + "step": 4227 + }, + { + "epoch": 2.77, + "grad_norm": 0.0033072135411202908, + "learning_rate": 4.392954296811802e-06, + "loss": 0.0001, + "step": 4228 + }, + { + "epoch": 2.77, + "grad_norm": 0.000957198441028595, + "learning_rate": 4.3682216326584145e-06, + "loss": 0.0001, + "step": 4229 + }, + { + "epoch": 2.77, + "grad_norm": 0.0008674561977386475, + "learning_rate": 4.343557759792659e-06, + "loss": 0.0001, + "step": 4230 + }, + { + "epoch": 2.77, + "grad_norm": 0.03294390067458153, + "learning_rate": 4.318962689864869e-06, + "loss": 0.0012, + "step": 4231 + }, + { + "epoch": 2.77, + "grad_norm": 0.006492152344435453, + "learning_rate": 4.294436434492898e-06, + "loss": 0.0005, + "step": 4232 + }, + { + "epoch": 2.77, + "grad_norm": 0.040411900728940964, + "learning_rate": 4.269979005262047e-06, + "loss": 0.0016, + "step": 4233 + }, + { + "epoch": 2.77, + "grad_norm": 0.0003391464124433696, + "learning_rate": 4.245590413725175e-06, + "loss": 0.0, + "step": 4234 + }, + { + "epoch": 2.77, + "grad_norm": 0.0020683116745203733, + "learning_rate": 4.221270671402549e-06, + "loss": 0.0001, + "step": 4235 + }, + { + "epoch": 2.77, + "grad_norm": 0.02836175262928009, + "learning_rate": 4.19701978978198e-06, + "loss": 0.001, + "step": 4236 + }, + { + "epoch": 2.77, + "grad_norm": 0.008879311382770538, + "learning_rate": 4.172837780318722e-06, + "loss": 0.0003, + "step": 4237 + }, + { + "epoch": 2.77, + "grad_norm": 0.0010850438848137856, + "learning_rate": 4.148724654435487e-06, + "loss": 0.0001, + "step": 4238 + }, + { + "epoch": 2.78, + "grad_norm": 0.0029992880299687386, + "learning_rate": 4.124680423522481e-06, + "loss": 0.0001, + "step": 4239 + }, + { + "epoch": 2.78, + "grad_norm": 0.536130428314209, + "learning_rate": 4.100705098937334e-06, + "loss": 0.0097, + "step": 4240 + }, + { + "epoch": 2.78, + "grad_norm": 0.003063391661271453, + "learning_rate": 4.07679869200514e-06, + "loss": 0.0001, + "step": 4241 + }, + { + "epoch": 2.78, + "grad_norm": 0.0070351241156458855, + "learning_rate": 4.05296121401843e-06, + "loss": 0.0002, + "step": 4242 + }, + { + "epoch": 2.78, + "grad_norm": 0.0879850685596466, + "learning_rate": 4.029192676237181e-06, + "loss": 0.0024, + "step": 4243 + }, + { + "epoch": 2.78, + "grad_norm": 0.0065876576118171215, + "learning_rate": 4.005493089888812e-06, + "loss": 0.0003, + "step": 4244 + }, + { + "epoch": 2.78, + "grad_norm": 0.012009180150926113, + "learning_rate": 3.981862466168184e-06, + "loss": 0.0003, + "step": 4245 + }, + { + "epoch": 2.78, + "grad_norm": 0.002083989093080163, + "learning_rate": 3.958300816237553e-06, + "loss": 0.0001, + "step": 4246 + }, + { + "epoch": 2.78, + "grad_norm": 1.1517120599746704, + "learning_rate": 3.934808151226599e-06, + "loss": 0.0054, + "step": 4247 + }, + { + "epoch": 2.78, + "grad_norm": 0.00456461263820529, + "learning_rate": 3.911384482232427e-06, + "loss": 0.0002, + "step": 4248 + }, + { + "epoch": 2.78, + "grad_norm": 0.006039094179868698, + "learning_rate": 3.888029820319571e-06, + "loss": 0.0003, + "step": 4249 + }, + { + "epoch": 2.78, + "grad_norm": 0.008198510855436325, + "learning_rate": 3.864744176519924e-06, + "loss": 0.0003, + "step": 4250 + }, + { + "epoch": 2.78, + "grad_norm": 0.18985766172409058, + "learning_rate": 3.841527561832786e-06, + "loss": 0.0052, + "step": 4251 + }, + { + "epoch": 2.78, + "grad_norm": 0.007833573967218399, + "learning_rate": 3.818379987224884e-06, + "loss": 0.0002, + "step": 4252 + }, + { + "epoch": 2.78, + "grad_norm": 0.006897474639117718, + "learning_rate": 3.795301463630307e-06, + "loss": 0.0002, + "step": 4253 + }, + { + "epoch": 2.78, + "grad_norm": 0.0020764705259352922, + "learning_rate": 3.7722920019505166e-06, + "loss": 0.0001, + "step": 4254 + }, + { + "epoch": 2.79, + "grad_norm": 0.010014859028160572, + "learning_rate": 3.7493516130543856e-06, + "loss": 0.0005, + "step": 4255 + }, + { + "epoch": 2.79, + "grad_norm": 0.00040339658153243363, + "learning_rate": 3.726480307778129e-06, + "loss": 0.0, + "step": 4256 + }, + { + "epoch": 2.79, + "grad_norm": 0.009897599928081036, + "learning_rate": 3.703678096925339e-06, + "loss": 0.0004, + "step": 4257 + }, + { + "epoch": 2.79, + "grad_norm": 0.0173860602080822, + "learning_rate": 3.680944991266999e-06, + "loss": 0.0007, + "step": 4258 + }, + { + "epoch": 2.79, + "grad_norm": 0.004080170765519142, + "learning_rate": 3.6582810015413855e-06, + "loss": 0.0002, + "step": 4259 + }, + { + "epoch": 2.79, + "grad_norm": 0.15552128851413727, + "learning_rate": 3.635686138454186e-06, + "loss": 0.0297, + "step": 4260 + }, + { + "epoch": 2.79, + "grad_norm": 0.0021587831433862448, + "learning_rate": 3.6131604126783785e-06, + "loss": 0.0001, + "step": 4261 + }, + { + "epoch": 2.79, + "grad_norm": 0.014981748536229134, + "learning_rate": 3.5907038348543694e-06, + "loss": 0.0005, + "step": 4262 + }, + { + "epoch": 2.79, + "grad_norm": 0.01747909002006054, + "learning_rate": 3.5683164155898057e-06, + "loss": 0.0008, + "step": 4263 + }, + { + "epoch": 2.79, + "grad_norm": 0.0035776684526354074, + "learning_rate": 3.5459981654597293e-06, + "loss": 0.0001, + "step": 4264 + }, + { + "epoch": 2.79, + "grad_norm": 0.01363343931734562, + "learning_rate": 3.523749095006506e-06, + "loss": 0.0006, + "step": 4265 + }, + { + "epoch": 2.79, + "grad_norm": 0.0280233733355999, + "learning_rate": 3.5015692147397634e-06, + "loss": 0.0008, + "step": 4266 + }, + { + "epoch": 2.79, + "grad_norm": 0.0029149064794182777, + "learning_rate": 3.4794585351365535e-06, + "loss": 0.0001, + "step": 4267 + }, + { + "epoch": 2.79, + "grad_norm": 0.12076763808727264, + "learning_rate": 3.457417066641105e-06, + "loss": 0.0033, + "step": 4268 + }, + { + "epoch": 2.79, + "grad_norm": 0.007720254827290773, + "learning_rate": 3.4354448196650897e-06, + "loss": 0.0003, + "step": 4269 + }, + { + "epoch": 2.8, + "grad_norm": 0.03006923198699951, + "learning_rate": 3.413541804587372e-06, + "loss": 0.0009, + "step": 4270 + }, + { + "epoch": 2.8, + "grad_norm": 0.11592748761177063, + "learning_rate": 3.3917080317541758e-06, + "loss": 0.0043, + "step": 4271 + }, + { + "epoch": 2.8, + "grad_norm": 0.009647532366216183, + "learning_rate": 3.3699435114790006e-06, + "loss": 0.0006, + "step": 4272 + }, + { + "epoch": 2.8, + "grad_norm": 0.00810681190341711, + "learning_rate": 3.3482482540426404e-06, + "loss": 0.0004, + "step": 4273 + }, + { + "epoch": 2.8, + "grad_norm": 0.14248058199882507, + "learning_rate": 3.3266222696931633e-06, + "loss": 0.0039, + "step": 4274 + }, + { + "epoch": 2.8, + "grad_norm": 0.03840990737080574, + "learning_rate": 3.3050655686459316e-06, + "loss": 0.0016, + "step": 4275 + }, + { + "epoch": 2.8, + "grad_norm": 0.009877459146082401, + "learning_rate": 3.283578161083533e-06, + "loss": 0.0002, + "step": 4276 + }, + { + "epoch": 2.8, + "grad_norm": 0.004985439591109753, + "learning_rate": 3.262160057155866e-06, + "loss": 0.0002, + "step": 4277 + }, + { + "epoch": 2.8, + "grad_norm": 0.004251273348927498, + "learning_rate": 3.2408112669801035e-06, + "loss": 0.0003, + "step": 4278 + }, + { + "epoch": 2.8, + "grad_norm": 0.0019498238107189536, + "learning_rate": 3.2195318006406457e-06, + "loss": 0.0001, + "step": 4279 + }, + { + "epoch": 2.8, + "grad_norm": 0.6594255566596985, + "learning_rate": 3.1983216681891354e-06, + "loss": 0.0299, + "step": 4280 + }, + { + "epoch": 2.8, + "grad_norm": 0.0782998725771904, + "learning_rate": 3.177180879644525e-06, + "loss": 0.0043, + "step": 4281 + }, + { + "epoch": 2.8, + "grad_norm": 0.1305646002292633, + "learning_rate": 3.1561094449929603e-06, + "loss": 0.0029, + "step": 4282 + }, + { + "epoch": 2.8, + "grad_norm": 0.04577456787228584, + "learning_rate": 3.1351073741878284e-06, + "loss": 0.0023, + "step": 4283 + }, + { + "epoch": 2.8, + "grad_norm": 0.0004647353489417583, + "learning_rate": 3.1141746771497945e-06, + "loss": 0.0, + "step": 4284 + }, + { + "epoch": 2.81, + "grad_norm": 0.0009252427262254059, + "learning_rate": 3.093311363766665e-06, + "loss": 0.0001, + "step": 4285 + }, + { + "epoch": 2.81, + "grad_norm": 0.2674119174480438, + "learning_rate": 3.072517443893574e-06, + "loss": 0.0019, + "step": 4286 + }, + { + "epoch": 2.81, + "grad_norm": 0.004165771882981062, + "learning_rate": 3.0517929273528307e-06, + "loss": 0.0002, + "step": 4287 + }, + { + "epoch": 2.81, + "grad_norm": 0.010230110958218575, + "learning_rate": 3.031137823933938e-06, + "loss": 0.0004, + "step": 4288 + }, + { + "epoch": 2.81, + "grad_norm": 0.022651301696896553, + "learning_rate": 3.010552143393641e-06, + "loss": 0.0008, + "step": 4289 + }, + { + "epoch": 2.81, + "grad_norm": 0.018684033304452896, + "learning_rate": 2.9900358954559113e-06, + "loss": 0.0011, + "step": 4290 + }, + { + "epoch": 2.81, + "grad_norm": 0.10174325853586197, + "learning_rate": 2.9695890898118633e-06, + "loss": 0.0044, + "step": 4291 + }, + { + "epoch": 2.81, + "grad_norm": 0.007036436349153519, + "learning_rate": 2.9492117361198555e-06, + "loss": 0.0004, + "step": 4292 + }, + { + "epoch": 2.81, + "grad_norm": 0.026126496493816376, + "learning_rate": 2.9289038440054536e-06, + "loss": 0.0007, + "step": 4293 + }, + { + "epoch": 2.81, + "grad_norm": 0.024051504209637642, + "learning_rate": 2.9086654230613517e-06, + "loss": 0.0012, + "step": 4294 + }, + { + "epoch": 2.81, + "grad_norm": 1.2785769701004028, + "learning_rate": 2.8884964828474523e-06, + "loss": 0.0225, + "step": 4295 + }, + { + "epoch": 2.81, + "grad_norm": 0.0018845779122784734, + "learning_rate": 2.8683970328908844e-06, + "loss": 0.0001, + "step": 4296 + }, + { + "epoch": 2.81, + "grad_norm": 0.004176693968474865, + "learning_rate": 2.8483670826858874e-06, + "loss": 0.0001, + "step": 4297 + }, + { + "epoch": 2.81, + "grad_norm": 0.013477800413966179, + "learning_rate": 2.828406641693909e-06, + "loss": 0.0007, + "step": 4298 + }, + { + "epoch": 2.81, + "grad_norm": 0.0012212670408189297, + "learning_rate": 2.808515719343557e-06, + "loss": 0.0001, + "step": 4299 + }, + { + "epoch": 2.82, + "grad_norm": 0.02466653287410736, + "learning_rate": 2.788694325030599e-06, + "loss": 0.0008, + "step": 4300 + }, + { + "epoch": 2.82, + "grad_norm": 0.000914603762794286, + "learning_rate": 2.7689424681179626e-06, + "loss": 0.0, + "step": 4301 + }, + { + "epoch": 2.82, + "grad_norm": 0.009139444679021835, + "learning_rate": 2.749260157935701e-06, + "loss": 0.0003, + "step": 4302 + }, + { + "epoch": 2.82, + "grad_norm": 0.0032205942552536726, + "learning_rate": 2.72964740378106e-06, + "loss": 0.0002, + "step": 4303 + }, + { + "epoch": 2.82, + "grad_norm": 0.0010038187028840184, + "learning_rate": 2.710104214918396e-06, + "loss": 0.0001, + "step": 4304 + }, + { + "epoch": 2.82, + "grad_norm": 0.003465186106041074, + "learning_rate": 2.690630600579241e-06, + "loss": 0.0002, + "step": 4305 + }, + { + "epoch": 2.82, + "grad_norm": 0.005702082999050617, + "learning_rate": 2.6712265699622037e-06, + "loss": 0.0003, + "step": 4306 + }, + { + "epoch": 2.82, + "grad_norm": 0.18057338893413544, + "learning_rate": 2.651892132233102e-06, + "loss": 0.0047, + "step": 4307 + }, + { + "epoch": 2.82, + "grad_norm": 0.0012343705166131258, + "learning_rate": 2.632627296524814e-06, + "loss": 0.0001, + "step": 4308 + }, + { + "epoch": 2.82, + "grad_norm": 0.007943877018988132, + "learning_rate": 2.6134320719373603e-06, + "loss": 0.0003, + "step": 4309 + }, + { + "epoch": 2.82, + "grad_norm": 0.0020603055600076914, + "learning_rate": 2.594306467537921e-06, + "loss": 0.0001, + "step": 4310 + }, + { + "epoch": 2.82, + "grad_norm": 0.002424058737233281, + "learning_rate": 2.575250492360703e-06, + "loss": 0.0001, + "step": 4311 + }, + { + "epoch": 2.82, + "grad_norm": 0.005900349002331495, + "learning_rate": 2.556264155407106e-06, + "loss": 0.0002, + "step": 4312 + }, + { + "epoch": 2.82, + "grad_norm": 0.0038717566058039665, + "learning_rate": 2.537347465645573e-06, + "loss": 0.0002, + "step": 4313 + }, + { + "epoch": 2.82, + "grad_norm": 0.008307039737701416, + "learning_rate": 2.518500432011722e-06, + "loss": 0.0004, + "step": 4314 + }, + { + "epoch": 2.82, + "grad_norm": 0.0015151945408433676, + "learning_rate": 2.499723063408182e-06, + "loss": 0.0001, + "step": 4315 + }, + { + "epoch": 2.83, + "grad_norm": 0.014889131300151348, + "learning_rate": 2.4810153687047254e-06, + "loss": 0.0005, + "step": 4316 + }, + { + "epoch": 2.83, + "grad_norm": 0.00043308467138558626, + "learning_rate": 2.462377356738232e-06, + "loss": 0.0, + "step": 4317 + }, + { + "epoch": 2.83, + "grad_norm": 0.006906237918883562, + "learning_rate": 2.443809036312594e-06, + "loss": 0.0002, + "step": 4318 + }, + { + "epoch": 2.83, + "grad_norm": 0.04262949153780937, + "learning_rate": 2.425310416198878e-06, + "loss": 0.001, + "step": 4319 + }, + { + "epoch": 2.83, + "grad_norm": 0.04821141064167023, + "learning_rate": 2.4068815051351275e-06, + "loss": 0.0004, + "step": 4320 + }, + { + "epoch": 2.83, + "grad_norm": 0.003085468662902713, + "learning_rate": 2.3885223118265295e-06, + "loss": 0.0002, + "step": 4321 + }, + { + "epoch": 2.83, + "grad_norm": 0.0031349805649369955, + "learning_rate": 2.3702328449453132e-06, + "loss": 0.0002, + "step": 4322 + }, + { + "epoch": 2.83, + "grad_norm": 0.0023524181451648474, + "learning_rate": 2.3520131131307685e-06, + "loss": 0.0001, + "step": 4323 + }, + { + "epoch": 2.83, + "grad_norm": 0.000546684896107763, + "learning_rate": 2.3338631249892602e-06, + "loss": 0.0, + "step": 4324 + }, + { + "epoch": 2.83, + "grad_norm": 0.008636610582470894, + "learning_rate": 2.3157828890941977e-06, + "loss": 0.0004, + "step": 4325 + }, + { + "epoch": 2.83, + "grad_norm": 0.0006601106724701822, + "learning_rate": 2.297772413986032e-06, + "loss": 0.0, + "step": 4326 + }, + { + "epoch": 2.83, + "grad_norm": 0.003608833299949765, + "learning_rate": 2.2798317081722916e-06, + "loss": 0.0001, + "step": 4327 + }, + { + "epoch": 2.83, + "grad_norm": 0.0037489463575184345, + "learning_rate": 2.2619607801275307e-06, + "loss": 0.0003, + "step": 4328 + }, + { + "epoch": 2.83, + "grad_norm": 0.011426348239183426, + "learning_rate": 2.2441596382933304e-06, + "loss": 0.0006, + "step": 4329 + }, + { + "epoch": 2.83, + "grad_norm": 0.0013299249112606049, + "learning_rate": 2.226428291078297e-06, + "loss": 0.0001, + "step": 4330 + }, + { + "epoch": 2.84, + "grad_norm": 0.001219778903760016, + "learning_rate": 2.208766746858115e-06, + "loss": 0.0, + "step": 4331 + }, + { + "epoch": 2.84, + "grad_norm": 0.045300133526325226, + "learning_rate": 2.1911750139754768e-06, + "loss": 0.0029, + "step": 4332 + }, + { + "epoch": 2.84, + "grad_norm": 0.20339518785476685, + "learning_rate": 2.173653100740086e-06, + "loss": 0.0071, + "step": 4333 + }, + { + "epoch": 2.84, + "grad_norm": 0.0014095986261963844, + "learning_rate": 2.1562010154286713e-06, + "loss": 0.0001, + "step": 4334 + }, + { + "epoch": 2.84, + "grad_norm": 0.00032133294735103846, + "learning_rate": 2.1388187662849722e-06, + "loss": 0.0, + "step": 4335 + }, + { + "epoch": 2.84, + "grad_norm": 0.0066511370241642, + "learning_rate": 2.1215063615197534e-06, + "loss": 0.0003, + "step": 4336 + }, + { + "epoch": 2.84, + "grad_norm": 0.0016755980905145407, + "learning_rate": 2.104263809310791e-06, + "loss": 0.0001, + "step": 4337 + }, + { + "epoch": 2.84, + "grad_norm": 0.0054818810895085335, + "learning_rate": 2.0870911178028527e-06, + "loss": 0.0002, + "step": 4338 + }, + { + "epoch": 2.84, + "grad_norm": 0.00192430114839226, + "learning_rate": 2.0699882951076995e-06, + "loss": 0.0001, + "step": 4339 + }, + { + "epoch": 2.84, + "grad_norm": 0.004567565396428108, + "learning_rate": 2.052955349304103e-06, + "loss": 0.0002, + "step": 4340 + }, + { + "epoch": 2.84, + "grad_norm": 0.019123055040836334, + "learning_rate": 2.0359922884378098e-06, + "loss": 0.0005, + "step": 4341 + }, + { + "epoch": 2.84, + "grad_norm": 0.0012097922153770924, + "learning_rate": 2.019099120521578e-06, + "loss": 0.0001, + "step": 4342 + }, + { + "epoch": 2.84, + "grad_norm": 0.0012429042253643274, + "learning_rate": 2.002275853535157e-06, + "loss": 0.0001, + "step": 4343 + }, + { + "epoch": 2.84, + "grad_norm": 0.0019197979709133506, + "learning_rate": 1.9855224954252235e-06, + "loss": 0.0001, + "step": 4344 + }, + { + "epoch": 2.84, + "grad_norm": 0.2563212215900421, + "learning_rate": 1.968839054105514e-06, + "loss": 0.0382, + "step": 4345 + }, + { + "epoch": 2.85, + "grad_norm": 0.04079863429069519, + "learning_rate": 1.952225537456675e-06, + "loss": 0.0008, + "step": 4346 + }, + { + "epoch": 2.85, + "grad_norm": 0.0014803425874561071, + "learning_rate": 1.9356819533263457e-06, + "loss": 0.0001, + "step": 4347 + }, + { + "epoch": 2.85, + "grad_norm": 0.0017056971555575728, + "learning_rate": 1.9192083095291078e-06, + "loss": 0.0001, + "step": 4348 + }, + { + "epoch": 2.85, + "grad_norm": 0.5389607548713684, + "learning_rate": 1.9028046138465537e-06, + "loss": 0.0127, + "step": 4349 + }, + { + "epoch": 2.85, + "grad_norm": 0.007144905161112547, + "learning_rate": 1.886470874027185e-06, + "loss": 0.0002, + "step": 4350 + }, + { + "epoch": 2.85, + "grad_norm": 0.06887990236282349, + "learning_rate": 1.870207097786497e-06, + "loss": 0.0018, + "step": 4351 + }, + { + "epoch": 2.85, + "grad_norm": 0.007601974532008171, + "learning_rate": 1.8540132928069273e-06, + "loss": 0.0003, + "step": 4352 + }, + { + "epoch": 2.85, + "grad_norm": 0.009388328529894352, + "learning_rate": 1.8378894667378408e-06, + "loss": 0.0002, + "step": 4353 + }, + { + "epoch": 2.85, + "grad_norm": 0.03917848318815231, + "learning_rate": 1.8218356271955615e-06, + "loss": 0.0016, + "step": 4354 + }, + { + "epoch": 2.85, + "grad_norm": 0.02846098691225052, + "learning_rate": 1.8058517817633567e-06, + "loss": 0.0005, + "step": 4355 + }, + { + "epoch": 2.85, + "grad_norm": 0.0003685025731101632, + "learning_rate": 1.789937937991437e-06, + "loss": 0.0, + "step": 4356 + }, + { + "epoch": 2.85, + "grad_norm": 0.08682235330343246, + "learning_rate": 1.7740941033969226e-06, + "loss": 0.005, + "step": 4357 + }, + { + "epoch": 2.85, + "grad_norm": 0.0005288930260576308, + "learning_rate": 1.7583202854638934e-06, + "loss": 0.0, + "step": 4358 + }, + { + "epoch": 2.85, + "grad_norm": 0.01967015117406845, + "learning_rate": 1.7426164916433226e-06, + "loss": 0.0009, + "step": 4359 + }, + { + "epoch": 2.85, + "grad_norm": 0.01656365394592285, + "learning_rate": 1.7269827293531436e-06, + "loss": 0.0007, + "step": 4360 + }, + { + "epoch": 2.85, + "grad_norm": 0.0016807609936222434, + "learning_rate": 1.7114190059781819e-06, + "loss": 0.0001, + "step": 4361 + }, + { + "epoch": 2.86, + "grad_norm": 0.0018747443100437522, + "learning_rate": 1.69592532887019e-06, + "loss": 0.0001, + "step": 4362 + }, + { + "epoch": 2.86, + "grad_norm": 0.0018126006470993161, + "learning_rate": 1.6805017053478309e-06, + "loss": 0.0001, + "step": 4363 + }, + { + "epoch": 2.86, + "grad_norm": 0.016401158645749092, + "learning_rate": 1.6651481426967095e-06, + "loss": 0.0006, + "step": 4364 + }, + { + "epoch": 2.86, + "grad_norm": 0.007723231799900532, + "learning_rate": 1.6498646481692412e-06, + "loss": 0.0004, + "step": 4365 + }, + { + "epoch": 2.86, + "grad_norm": 0.027889663353562355, + "learning_rate": 1.6346512289848512e-06, + "loss": 0.0006, + "step": 4366 + }, + { + "epoch": 2.86, + "grad_norm": 0.09827005863189697, + "learning_rate": 1.6195078923298077e-06, + "loss": 0.004, + "step": 4367 + }, + { + "epoch": 2.86, + "grad_norm": 0.12982110679149628, + "learning_rate": 1.6044346453572887e-06, + "loss": 0.0118, + "step": 4368 + }, + { + "epoch": 2.86, + "grad_norm": 0.000986046507023275, + "learning_rate": 1.5894314951873488e-06, + "loss": 0.0, + "step": 4369 + }, + { + "epoch": 2.86, + "grad_norm": 0.015398401767015457, + "learning_rate": 1.574498448906969e-06, + "loss": 0.0007, + "step": 4370 + }, + { + "epoch": 2.86, + "grad_norm": 0.011814353056252003, + "learning_rate": 1.5596355135699734e-06, + "loss": 0.0003, + "step": 4371 + }, + { + "epoch": 2.86, + "grad_norm": 0.12488653510808945, + "learning_rate": 1.5448426961970795e-06, + "loss": 0.0012, + "step": 4372 + }, + { + "epoch": 2.86, + "grad_norm": 0.002912584226578474, + "learning_rate": 1.5301200037759142e-06, + "loss": 0.0001, + "step": 4373 + }, + { + "epoch": 2.86, + "grad_norm": 0.001886709127575159, + "learning_rate": 1.5154674432609316e-06, + "loss": 0.0001, + "step": 4374 + }, + { + "epoch": 2.86, + "grad_norm": 0.008764170110225677, + "learning_rate": 1.500885021573478e-06, + "loss": 0.0004, + "step": 4375 + }, + { + "epoch": 2.86, + "grad_norm": 0.06006164103746414, + "learning_rate": 1.4863727456017938e-06, + "loss": 0.0012, + "step": 4376 + }, + { + "epoch": 2.87, + "grad_norm": 0.01325245015323162, + "learning_rate": 1.471930622200962e-06, + "loss": 0.0004, + "step": 4377 + }, + { + "epoch": 2.87, + "grad_norm": 0.003653835505247116, + "learning_rate": 1.4575586581929088e-06, + "loss": 0.0002, + "step": 4378 + }, + { + "epoch": 2.87, + "grad_norm": 0.0023002636153250933, + "learning_rate": 1.443256860366454e-06, + "loss": 0.0001, + "step": 4379 + }, + { + "epoch": 2.87, + "grad_norm": 0.009768893010914326, + "learning_rate": 1.4290252354772602e-06, + "loss": 0.0006, + "step": 4380 + }, + { + "epoch": 2.87, + "grad_norm": 0.0016170182498171926, + "learning_rate": 1.4148637902478333e-06, + "loss": 0.0001, + "step": 4381 + }, + { + "epoch": 2.87, + "grad_norm": 0.001299695810303092, + "learning_rate": 1.4007725313675723e-06, + "loss": 0.0001, + "step": 4382 + }, + { + "epoch": 2.87, + "grad_norm": 0.0015163730131462216, + "learning_rate": 1.3867514654926359e-06, + "loss": 0.0001, + "step": 4383 + }, + { + "epoch": 2.87, + "grad_norm": 0.029150087386369705, + "learning_rate": 1.37280059924611e-06, + "loss": 0.0011, + "step": 4384 + }, + { + "epoch": 2.87, + "grad_norm": 0.0034407186321914196, + "learning_rate": 1.3589199392178895e-06, + "loss": 0.0001, + "step": 4385 + }, + { + "epoch": 2.87, + "grad_norm": 0.007222942542284727, + "learning_rate": 1.34510949196468e-06, + "loss": 0.0003, + "step": 4386 + }, + { + "epoch": 2.87, + "grad_norm": 0.005788599606603384, + "learning_rate": 1.3313692640100792e-06, + "loss": 0.0003, + "step": 4387 + }, + { + "epoch": 2.87, + "grad_norm": 0.021365903317928314, + "learning_rate": 1.3176992618444792e-06, + "loss": 0.0003, + "step": 4388 + }, + { + "epoch": 2.87, + "grad_norm": 0.0006491582607850432, + "learning_rate": 1.3040994919250814e-06, + "loss": 0.0, + "step": 4389 + }, + { + "epoch": 2.87, + "grad_norm": 0.13622768223285675, + "learning_rate": 1.2905699606759635e-06, + "loss": 0.0034, + "step": 4390 + }, + { + "epoch": 2.87, + "grad_norm": 0.0067710913717746735, + "learning_rate": 1.2771106744879634e-06, + "loss": 0.0002, + "step": 4391 + }, + { + "epoch": 2.88, + "grad_norm": 0.0005859547527506948, + "learning_rate": 1.2637216397187954e-06, + "loss": 0.0, + "step": 4392 + }, + { + "epoch": 2.88, + "grad_norm": 0.0004070180293638259, + "learning_rate": 1.2504028626929673e-06, + "loss": 0.0, + "step": 4393 + }, + { + "epoch": 2.88, + "grad_norm": 0.152267724275589, + "learning_rate": 1.23715434970178e-06, + "loss": 0.0036, + "step": 4394 + }, + { + "epoch": 2.88, + "grad_norm": 0.02890065312385559, + "learning_rate": 1.2239761070033772e-06, + "loss": 0.0004, + "step": 4395 + }, + { + "epoch": 2.88, + "grad_norm": 0.008390465751290321, + "learning_rate": 1.2108681408226627e-06, + "loss": 0.0003, + "step": 4396 + }, + { + "epoch": 2.88, + "grad_norm": 0.018741142004728317, + "learning_rate": 1.1978304573514175e-06, + "loss": 0.0003, + "step": 4397 + }, + { + "epoch": 2.88, + "grad_norm": 0.0027410881593823433, + "learning_rate": 1.1848630627481649e-06, + "loss": 0.0001, + "step": 4398 + }, + { + "epoch": 2.88, + "grad_norm": 0.0012412865180522203, + "learning_rate": 1.1719659631382384e-06, + "loss": 0.0001, + "step": 4399 + }, + { + "epoch": 2.88, + "grad_norm": 0.021122923120856285, + "learning_rate": 1.1591391646137482e-06, + "loss": 0.0006, + "step": 4400 + }, + { + "epoch": 2.88, + "grad_norm": 0.0018549376400187612, + "learning_rate": 1.1463826732336645e-06, + "loss": 0.0001, + "step": 4401 + }, + { + "epoch": 2.88, + "grad_norm": 0.0029543214477598667, + "learning_rate": 1.133696495023667e-06, + "loss": 0.0001, + "step": 4402 + }, + { + "epoch": 2.88, + "grad_norm": 0.20410063862800598, + "learning_rate": 1.1210806359762625e-06, + "loss": 0.0039, + "step": 4403 + }, + { + "epoch": 2.88, + "grad_norm": 0.002825433388352394, + "learning_rate": 1.1085351020507505e-06, + "loss": 0.0001, + "step": 4404 + }, + { + "epoch": 2.88, + "grad_norm": 0.018182463943958282, + "learning_rate": 1.0960598991731906e-06, + "loss": 0.0006, + "step": 4405 + }, + { + "epoch": 2.88, + "grad_norm": 0.002152892993763089, + "learning_rate": 1.0836550332364024e-06, + "loss": 0.0001, + "step": 4406 + }, + { + "epoch": 2.89, + "grad_norm": 0.009902440011501312, + "learning_rate": 1.0713205101000489e-06, + "loss": 0.0003, + "step": 4407 + }, + { + "epoch": 2.89, + "grad_norm": 0.0007533484022133052, + "learning_rate": 1.0590563355904858e-06, + "loss": 0.0001, + "step": 4408 + }, + { + "epoch": 2.89, + "grad_norm": 0.7009775042533875, + "learning_rate": 1.0468625155008791e-06, + "loss": 0.0052, + "step": 4409 + }, + { + "epoch": 2.89, + "grad_norm": 0.050083860754966736, + "learning_rate": 1.0347390555911717e-06, + "loss": 0.0021, + "step": 4410 + }, + { + "epoch": 2.89, + "grad_norm": 0.0006537585286423564, + "learning_rate": 1.022685961588049e-06, + "loss": 0.0, + "step": 4411 + }, + { + "epoch": 2.89, + "grad_norm": 0.004227178171277046, + "learning_rate": 1.0107032391849568e-06, + "loss": 0.0003, + "step": 4412 + }, + { + "epoch": 2.89, + "grad_norm": 0.007454452570527792, + "learning_rate": 9.987908940421175e-07, + "loss": 0.0002, + "step": 4413 + }, + { + "epoch": 2.89, + "grad_norm": 0.047428593039512634, + "learning_rate": 9.869489317864965e-07, + "loss": 0.0005, + "step": 4414 + }, + { + "epoch": 2.89, + "grad_norm": 0.002882574684917927, + "learning_rate": 9.751773580118193e-07, + "loss": 0.0001, + "step": 4415 + }, + { + "epoch": 2.89, + "grad_norm": 0.005308135412633419, + "learning_rate": 9.63476178278555e-07, + "loss": 0.0002, + "step": 4416 + }, + { + "epoch": 2.89, + "grad_norm": 0.4509996771812439, + "learning_rate": 9.518453981139485e-07, + "loss": 0.0081, + "step": 4417 + }, + { + "epoch": 2.89, + "grad_norm": 0.023147013038396835, + "learning_rate": 9.402850230119385e-07, + "loss": 0.0006, + "step": 4418 + }, + { + "epoch": 2.89, + "grad_norm": 0.003967209253460169, + "learning_rate": 9.287950584332404e-07, + "loss": 0.0003, + "step": 4419 + }, + { + "epoch": 2.89, + "grad_norm": 0.004324222914874554, + "learning_rate": 9.173755098053126e-07, + "loss": 0.0002, + "step": 4420 + }, + { + "epoch": 2.89, + "grad_norm": 0.0015240806387737393, + "learning_rate": 9.060263825223568e-07, + "loss": 0.0001, + "step": 4421 + }, + { + "epoch": 2.89, + "grad_norm": 0.09920462220907211, + "learning_rate": 8.947476819452682e-07, + "loss": 0.0025, + "step": 4422 + }, + { + "epoch": 2.9, + "grad_norm": 0.02376033365726471, + "learning_rate": 8.835394134017348e-07, + "loss": 0.0008, + "step": 4423 + }, + { + "epoch": 2.9, + "grad_norm": 0.0027050350327044725, + "learning_rate": 8.724015821861386e-07, + "loss": 0.0001, + "step": 4424 + }, + { + "epoch": 2.9, + "grad_norm": 0.030055051669478416, + "learning_rate": 8.613341935595874e-07, + "loss": 0.0012, + "step": 4425 + }, + { + "epoch": 2.9, + "grad_norm": 0.001134609105065465, + "learning_rate": 8.503372527499331e-07, + "loss": 0.0001, + "step": 4426 + }, + { + "epoch": 2.9, + "grad_norm": 0.08735169470310211, + "learning_rate": 8.394107649517201e-07, + "loss": 0.0016, + "step": 4427 + }, + { + "epoch": 2.9, + "grad_norm": 0.025910815224051476, + "learning_rate": 8.285547353262534e-07, + "loss": 0.0004, + "step": 4428 + }, + { + "epoch": 2.9, + "grad_norm": 0.004134078044444323, + "learning_rate": 8.177691690015309e-07, + "loss": 0.0002, + "step": 4429 + }, + { + "epoch": 2.9, + "grad_norm": 0.009418493136763573, + "learning_rate": 8.070540710722772e-07, + "loss": 0.0004, + "step": 4430 + }, + { + "epoch": 2.9, + "grad_norm": 0.02795690856873989, + "learning_rate": 7.964094465999104e-07, + "loss": 0.0005, + "step": 4431 + }, + { + "epoch": 2.9, + "grad_norm": 0.014434738084673882, + "learning_rate": 7.858353006125917e-07, + "loss": 0.0006, + "step": 4432 + }, + { + "epoch": 2.9, + "grad_norm": 0.0053445142693817616, + "learning_rate": 7.753316381051588e-07, + "loss": 0.0002, + "step": 4433 + }, + { + "epoch": 2.9, + "grad_norm": 0.33343926072120667, + "learning_rate": 7.648984640391765e-07, + "loss": 0.0231, + "step": 4434 + }, + { + "epoch": 2.9, + "grad_norm": 0.0053176539950072765, + "learning_rate": 7.545357833429022e-07, + "loss": 0.0003, + "step": 4435 + }, + { + "epoch": 2.9, + "grad_norm": 0.024762781336903572, + "learning_rate": 7.44243600911304e-07, + "loss": 0.001, + "step": 4436 + }, + { + "epoch": 2.9, + "grad_norm": 0.08466077595949173, + "learning_rate": 7.340219216060261e-07, + "loss": 0.0022, + "step": 4437 + }, + { + "epoch": 2.91, + "grad_norm": 0.00036638948949985206, + "learning_rate": 7.238707502554564e-07, + "loss": 0.0, + "step": 4438 + }, + { + "epoch": 2.91, + "grad_norm": 0.22942852973937988, + "learning_rate": 7.137900916546257e-07, + "loss": 0.0508, + "step": 4439 + }, + { + "epoch": 2.91, + "grad_norm": 0.062393829226493835, + "learning_rate": 7.037799505652919e-07, + "loss": 0.0016, + "step": 4440 + }, + { + "epoch": 2.91, + "grad_norm": 0.19803699851036072, + "learning_rate": 6.938403317158725e-07, + "loss": 0.0156, + "step": 4441 + }, + { + "epoch": 2.91, + "grad_norm": 0.005899924784898758, + "learning_rate": 6.839712398015118e-07, + "loss": 0.0004, + "step": 4442 + }, + { + "epoch": 2.91, + "grad_norm": 0.0024845225270837545, + "learning_rate": 6.741726794840141e-07, + "loss": 0.0002, + "step": 4443 + }, + { + "epoch": 2.91, + "grad_norm": 0.0049166688695549965, + "learning_rate": 6.64444655391877e-07, + "loss": 0.0002, + "step": 4444 + }, + { + "epoch": 2.91, + "grad_norm": 0.002268972573801875, + "learning_rate": 6.54787172120258e-07, + "loss": 0.0001, + "step": 4445 + }, + { + "epoch": 2.91, + "grad_norm": 0.49916523694992065, + "learning_rate": 6.452002342310247e-07, + "loss": 0.0319, + "step": 4446 + }, + { + "epoch": 2.91, + "grad_norm": 0.008629180490970612, + "learning_rate": 6.356838462526881e-07, + "loss": 0.0003, + "step": 4447 + }, + { + "epoch": 2.91, + "grad_norm": 0.2954511344432831, + "learning_rate": 6.26238012680469e-07, + "loss": 0.035, + "step": 4448 + }, + { + "epoch": 2.91, + "grad_norm": 0.0007742204470559955, + "learning_rate": 6.168627379762314e-07, + "loss": 0.0, + "step": 4449 + }, + { + "epoch": 2.91, + "grad_norm": 0.0043347920291125774, + "learning_rate": 6.075580265685498e-07, + "loss": 0.0002, + "step": 4450 + }, + { + "epoch": 2.91, + "grad_norm": 0.053381215780973434, + "learning_rate": 5.983238828526082e-07, + "loss": 0.003, + "step": 4451 + }, + { + "epoch": 2.91, + "grad_norm": 0.0016654481878504157, + "learning_rate": 5.891603111903009e-07, + "loss": 0.0001, + "step": 4452 + }, + { + "epoch": 2.92, + "grad_norm": 0.014030089601874352, + "learning_rate": 5.800673159101821e-07, + "loss": 0.0007, + "step": 4453 + }, + { + "epoch": 2.92, + "grad_norm": 0.2903052866458893, + "learning_rate": 5.710449013074492e-07, + "loss": 0.0056, + "step": 4454 + }, + { + "epoch": 2.92, + "grad_norm": 0.0033237270545214415, + "learning_rate": 5.620930716439598e-07, + "loss": 0.0001, + "step": 4455 + }, + { + "epoch": 2.92, + "grad_norm": 0.0404021292924881, + "learning_rate": 5.532118311482647e-07, + "loss": 0.0011, + "step": 4456 + }, + { + "epoch": 2.92, + "grad_norm": 0.0039369394071400166, + "learning_rate": 5.444011840155416e-07, + "loss": 0.0002, + "step": 4457 + }, + { + "epoch": 2.92, + "grad_norm": 0.0037958049215376377, + "learning_rate": 5.356611344076278e-07, + "loss": 0.0001, + "step": 4458 + }, + { + "epoch": 2.92, + "grad_norm": 0.15406718850135803, + "learning_rate": 5.269916864530043e-07, + "loss": 0.0195, + "step": 4459 + }, + { + "epoch": 2.92, + "grad_norm": 0.000996951712295413, + "learning_rate": 5.183928442468121e-07, + "loss": 0.0001, + "step": 4460 + }, + { + "epoch": 2.92, + "grad_norm": 0.0013642380945384502, + "learning_rate": 5.098646118508354e-07, + "loss": 0.0001, + "step": 4461 + }, + { + "epoch": 2.92, + "grad_norm": 0.02301752381026745, + "learning_rate": 5.01406993293535e-07, + "loss": 0.0007, + "step": 4462 + }, + { + "epoch": 2.92, + "grad_norm": 0.0013779596192762256, + "learning_rate": 4.930199925699652e-07, + "loss": 0.0001, + "step": 4463 + }, + { + "epoch": 2.92, + "grad_norm": 0.014278000220656395, + "learning_rate": 4.847036136418402e-07, + "loss": 0.0008, + "step": 4464 + }, + { + "epoch": 2.92, + "grad_norm": 0.09595952183008194, + "learning_rate": 4.764578604375513e-07, + "loss": 0.0019, + "step": 4465 + }, + { + "epoch": 2.92, + "grad_norm": 0.0059007806703448296, + "learning_rate": 4.6828273685206584e-07, + "loss": 0.0004, + "step": 4466 + }, + { + "epoch": 2.92, + "grad_norm": 0.15556591749191284, + "learning_rate": 4.601782467470616e-07, + "loss": 0.0062, + "step": 4467 + }, + { + "epoch": 2.93, + "grad_norm": 0.35102561116218567, + "learning_rate": 4.521443939507763e-07, + "loss": 0.0206, + "step": 4468 + }, + { + "epoch": 2.93, + "grad_norm": 0.006505624856799841, + "learning_rate": 4.441811822581409e-07, + "loss": 0.0002, + "step": 4469 + }, + { + "epoch": 2.93, + "grad_norm": 0.47805875539779663, + "learning_rate": 4.3628861543067994e-07, + "loss": 0.0066, + "step": 4470 + }, + { + "epoch": 2.93, + "grad_norm": 0.020300021395087242, + "learning_rate": 4.2846669719657777e-07, + "loss": 0.0006, + "step": 4471 + }, + { + "epoch": 2.93, + "grad_norm": 0.010489795356988907, + "learning_rate": 4.2071543125061224e-07, + "loss": 0.0004, + "step": 4472 + }, + { + "epoch": 2.93, + "grad_norm": 0.0012385237496346235, + "learning_rate": 4.130348212542045e-07, + "loss": 0.0001, + "step": 4473 + }, + { + "epoch": 2.93, + "grad_norm": 0.007655164692550898, + "learning_rate": 4.054248708354191e-07, + "loss": 0.0003, + "step": 4474 + }, + { + "epoch": 2.93, + "grad_norm": 0.046717606484889984, + "learning_rate": 3.978855835889305e-07, + "loss": 0.0009, + "step": 4475 + }, + { + "epoch": 2.93, + "grad_norm": 0.18390505015850067, + "learning_rate": 3.9041696307602345e-07, + "loss": 0.0063, + "step": 4476 + }, + { + "epoch": 2.93, + "grad_norm": 0.002308554481714964, + "learning_rate": 3.8301901282459246e-07, + "loss": 0.0001, + "step": 4477 + }, + { + "epoch": 2.93, + "grad_norm": 0.01422666385769844, + "learning_rate": 3.7569173632919226e-07, + "loss": 0.0005, + "step": 4478 + }, + { + "epoch": 2.93, + "grad_norm": 0.004292371217161417, + "learning_rate": 3.684351370509542e-07, + "loss": 0.0003, + "step": 4479 + }, + { + "epoch": 2.93, + "grad_norm": 0.0008829891448840499, + "learning_rate": 3.612492184176363e-07, + "loss": 0.0, + "step": 4480 + }, + { + "epoch": 2.93, + "grad_norm": 0.004852804355323315, + "learning_rate": 3.5413398382362345e-07, + "loss": 0.0002, + "step": 4481 + }, + { + "epoch": 2.93, + "grad_norm": 0.07125513255596161, + "learning_rate": 3.4708943662989376e-07, + "loss": 0.0024, + "step": 4482 + }, + { + "epoch": 2.93, + "grad_norm": 0.0027690506540238857, + "learning_rate": 3.401155801640354e-07, + "loss": 0.0001, + "step": 4483 + }, + { + "epoch": 2.94, + "grad_norm": 0.0010180575773119926, + "learning_rate": 3.332124177202633e-07, + "loss": 0.0001, + "step": 4484 + }, + { + "epoch": 2.94, + "grad_norm": 0.29328832030296326, + "learning_rate": 3.2637995255938577e-07, + "loss": 0.0063, + "step": 4485 + }, + { + "epoch": 2.94, + "grad_norm": 0.0008733943686820567, + "learning_rate": 3.1961818790880445e-07, + "loss": 0.0, + "step": 4486 + }, + { + "epoch": 2.94, + "grad_norm": 0.12663213908672333, + "learning_rate": 3.1292712696253107e-07, + "loss": 0.0039, + "step": 4487 + }, + { + "epoch": 2.94, + "grad_norm": 0.006642747204750776, + "learning_rate": 3.063067728812207e-07, + "loss": 0.0003, + "step": 4488 + }, + { + "epoch": 2.94, + "grad_norm": 0.009152544662356377, + "learning_rate": 2.9975712879205526e-07, + "loss": 0.0003, + "step": 4489 + }, + { + "epoch": 2.94, + "grad_norm": 0.007460552733391523, + "learning_rate": 2.9327819778889315e-07, + "loss": 0.0002, + "step": 4490 + }, + { + "epoch": 2.94, + "grad_norm": 0.09431986510753632, + "learning_rate": 2.868699829321031e-07, + "loss": 0.0036, + "step": 4491 + }, + { + "epoch": 2.94, + "grad_norm": 0.13941361010074615, + "learning_rate": 2.805324872487469e-07, + "loss": 0.0031, + "step": 4492 + }, + { + "epoch": 2.94, + "grad_norm": 0.026968982070684433, + "learning_rate": 2.742657137323967e-07, + "loss": 0.0007, + "step": 4493 + }, + { + "epoch": 2.94, + "grad_norm": 0.0012274475302547216, + "learning_rate": 2.680696653432679e-07, + "loss": 0.0001, + "step": 4494 + }, + { + "epoch": 2.94, + "grad_norm": 0.1442461609840393, + "learning_rate": 2.6194434500815265e-07, + "loss": 0.0044, + "step": 4495 + }, + { + "epoch": 2.94, + "grad_norm": 0.0060506644658744335, + "learning_rate": 2.558897556204531e-07, + "loss": 0.0002, + "step": 4496 + }, + { + "epoch": 2.94, + "grad_norm": 0.084715835750103, + "learning_rate": 2.499059000401149e-07, + "loss": 0.0023, + "step": 4497 + }, + { + "epoch": 2.94, + "grad_norm": 0.02763253077864647, + "learning_rate": 2.4399278109371036e-07, + "loss": 0.0013, + "step": 4498 + }, + { + "epoch": 2.95, + "grad_norm": 0.01986292377114296, + "learning_rate": 2.3815040157438847e-07, + "loss": 0.0007, + "step": 4499 + }, + { + "epoch": 2.95, + "grad_norm": 0.016204355284571648, + "learning_rate": 2.3237876424187506e-07, + "loss": 0.0004, + "step": 4500 + }, + { + "epoch": 2.95, + "grad_norm": 0.07691261172294617, + "learning_rate": 2.2667787182250597e-07, + "loss": 0.0033, + "step": 4501 + }, + { + "epoch": 2.95, + "grad_norm": 0.004736583214253187, + "learning_rate": 2.210477270091604e-07, + "loss": 0.0002, + "step": 4502 + }, + { + "epoch": 2.95, + "grad_norm": 0.14228679239749908, + "learning_rate": 2.1548833246131102e-07, + "loss": 0.0119, + "step": 4503 + }, + { + "epoch": 2.95, + "grad_norm": 0.0018677035113796592, + "learning_rate": 2.0999969080505719e-07, + "loss": 0.0001, + "step": 4504 + }, + { + "epoch": 2.95, + "grad_norm": 0.012720284052193165, + "learning_rate": 2.0458180463300832e-07, + "loss": 0.0002, + "step": 4505 + }, + { + "epoch": 2.95, + "grad_norm": 0.013799606822431087, + "learning_rate": 1.9923467650438397e-07, + "loss": 0.0003, + "step": 4506 + }, + { + "epoch": 2.95, + "grad_norm": 0.0077991848811507225, + "learning_rate": 1.9395830894498032e-07, + "loss": 0.0003, + "step": 4507 + }, + { + "epoch": 2.95, + "grad_norm": 0.003002564422786236, + "learning_rate": 1.8875270444717038e-07, + "loss": 0.0002, + "step": 4508 + }, + { + "epoch": 2.95, + "grad_norm": 0.006304152309894562, + "learning_rate": 1.8361786546990387e-07, + "loss": 0.0003, + "step": 4509 + }, + { + "epoch": 2.95, + "grad_norm": 0.001204495201818645, + "learning_rate": 1.7855379443869056e-07, + "loss": 0.0001, + "step": 4510 + }, + { + "epoch": 2.95, + "grad_norm": 0.0013043899089097977, + "learning_rate": 1.7356049374560032e-07, + "loss": 0.0001, + "step": 4511 + }, + { + "epoch": 2.95, + "grad_norm": 0.003483373438939452, + "learning_rate": 1.686379657493131e-07, + "loss": 0.0002, + "step": 4512 + }, + { + "epoch": 2.95, + "grad_norm": 0.004764943849295378, + "learning_rate": 1.6378621277505223e-07, + "loss": 0.0002, + "step": 4513 + }, + { + "epoch": 2.96, + "grad_norm": 0.016086289659142494, + "learning_rate": 1.5900523711460112e-07, + "loss": 0.0008, + "step": 4514 + }, + { + "epoch": 2.96, + "grad_norm": 0.014063529670238495, + "learning_rate": 1.5429504102633662e-07, + "loss": 0.0005, + "step": 4515 + }, + { + "epoch": 2.96, + "grad_norm": 0.019461099058389664, + "learning_rate": 1.496556267351956e-07, + "loss": 0.0005, + "step": 4516 + }, + { + "epoch": 2.96, + "grad_norm": 0.01967359147965908, + "learning_rate": 1.4508699643265841e-07, + "loss": 0.0006, + "step": 4517 + }, + { + "epoch": 2.96, + "grad_norm": 0.003815049771219492, + "learning_rate": 1.4058915227678214e-07, + "loss": 0.0002, + "step": 4518 + }, + { + "epoch": 2.96, + "grad_norm": 0.122381791472435, + "learning_rate": 1.3616209639220056e-07, + "loss": 0.0035, + "step": 4519 + }, + { + "epoch": 2.96, + "grad_norm": 0.02738620713353157, + "learning_rate": 1.3180583087009088e-07, + "loss": 0.0013, + "step": 4520 + }, + { + "epoch": 2.96, + "grad_norm": 0.0066810492426157, + "learning_rate": 1.2752035776819048e-07, + "loss": 0.0003, + "step": 4521 + }, + { + "epoch": 2.96, + "grad_norm": 0.002404808299615979, + "learning_rate": 1.2330567911083e-07, + "loss": 0.0002, + "step": 4522 + }, + { + "epoch": 2.96, + "grad_norm": 0.006481163669377565, + "learning_rate": 1.1916179688885031e-07, + "loss": 0.0001, + "step": 4523 + }, + { + "epoch": 2.96, + "grad_norm": 0.004577795043587685, + "learning_rate": 1.1508871305966894e-07, + "loss": 0.0002, + "step": 4524 + }, + { + "epoch": 2.96, + "grad_norm": 0.015365195460617542, + "learning_rate": 1.1108642954729685e-07, + "loss": 0.0005, + "step": 4525 + }, + { + "epoch": 2.96, + "grad_norm": 0.7103176712989807, + "learning_rate": 1.0715494824225512e-07, + "loss": 0.0328, + "step": 4526 + }, + { + "epoch": 2.96, + "grad_norm": 0.002796258544549346, + "learning_rate": 1.032942710016249e-07, + "loss": 0.0001, + "step": 4527 + }, + { + "epoch": 2.96, + "grad_norm": 0.00843851175159216, + "learning_rate": 9.95043996490641e-08, + "loss": 0.0003, + "step": 4528 + }, + { + "epoch": 2.96, + "grad_norm": 0.025280553847551346, + "learning_rate": 9.5785335974774e-08, + "loss": 0.0008, + "step": 4529 + }, + { + "epoch": 2.97, + "grad_norm": 0.0012324524577707052, + "learning_rate": 9.213708173549938e-08, + "loss": 0.0001, + "step": 4530 + }, + { + "epoch": 2.97, + "grad_norm": 0.008781511336565018, + "learning_rate": 8.855963865456172e-08, + "loss": 0.0003, + "step": 4531 + }, + { + "epoch": 2.97, + "grad_norm": 0.001310911844484508, + "learning_rate": 8.505300842180928e-08, + "loss": 0.0001, + "step": 4532 + }, + { + "epoch": 2.97, + "grad_norm": 0.0026375914458185434, + "learning_rate": 8.161719269365041e-08, + "loss": 0.0001, + "step": 4533 + }, + { + "epoch": 2.97, + "grad_norm": 0.06082810088992119, + "learning_rate": 7.825219309305353e-08, + "loss": 0.0015, + "step": 4534 + }, + { + "epoch": 2.97, + "grad_norm": 0.0017568677430972457, + "learning_rate": 7.495801120949718e-08, + "loss": 0.0, + "step": 4535 + }, + { + "epoch": 2.97, + "grad_norm": 0.0035616776440292597, + "learning_rate": 7.173464859905331e-08, + "loss": 0.0002, + "step": 4536 + }, + { + "epoch": 2.97, + "grad_norm": 0.0005090326303616166, + "learning_rate": 6.858210678433729e-08, + "loss": 0.0, + "step": 4537 + }, + { + "epoch": 2.97, + "grad_norm": 0.011574827134609222, + "learning_rate": 6.55003872544746e-08, + "loss": 0.0002, + "step": 4538 + }, + { + "epoch": 2.97, + "grad_norm": 0.005368970800191164, + "learning_rate": 6.248949146516746e-08, + "loss": 0.0002, + "step": 4539 + }, + { + "epoch": 2.97, + "grad_norm": 0.0024410225450992584, + "learning_rate": 5.95494208386782e-08, + "loss": 0.0001, + "step": 4540 + }, + { + "epoch": 2.97, + "grad_norm": 0.011060687713325024, + "learning_rate": 5.668017676374592e-08, + "loss": 0.0005, + "step": 4541 + }, + { + "epoch": 2.97, + "grad_norm": 0.08735539019107819, + "learning_rate": 5.388176059575311e-08, + "loss": 0.0029, + "step": 4542 + }, + { + "epoch": 2.97, + "grad_norm": 0.02066926844418049, + "learning_rate": 5.115417365652574e-08, + "loss": 0.0009, + "step": 4543 + }, + { + "epoch": 2.97, + "grad_norm": 0.003974274266511202, + "learning_rate": 4.84974172345165e-08, + "loss": 0.0001, + "step": 4544 + }, + { + "epoch": 2.98, + "grad_norm": 0.004568861797451973, + "learning_rate": 4.5911492584654875e-08, + "loss": 0.0002, + "step": 4545 + }, + { + "epoch": 2.98, + "grad_norm": 0.008668004535138607, + "learning_rate": 4.3396400928447096e-08, + "loss": 0.0006, + "step": 4546 + }, + { + "epoch": 2.98, + "grad_norm": 0.009674041531980038, + "learning_rate": 4.095214345394282e-08, + "loss": 0.0004, + "step": 4547 + }, + { + "epoch": 2.98, + "grad_norm": 0.0033420005347579718, + "learning_rate": 3.8578721315718486e-08, + "loss": 0.0002, + "step": 4548 + }, + { + "epoch": 2.98, + "grad_norm": 0.004970838315784931, + "learning_rate": 3.6276135634893956e-08, + "loss": 0.0002, + "step": 4549 + }, + { + "epoch": 2.98, + "grad_norm": 0.009196409024298191, + "learning_rate": 3.404438749911586e-08, + "loss": 0.0004, + "step": 4550 + }, + { + "epoch": 2.98, + "grad_norm": 0.006792422849684954, + "learning_rate": 3.1883477962607593e-08, + "loss": 0.0003, + "step": 4551 + }, + { + "epoch": 2.98, + "grad_norm": 0.0009444206370972097, + "learning_rate": 2.9793408046085986e-08, + "loss": 0.0001, + "step": 4552 + }, + { + "epoch": 2.98, + "grad_norm": 0.004028064664453268, + "learning_rate": 2.777417873684462e-08, + "loss": 0.0002, + "step": 4553 + }, + { + "epoch": 2.98, + "grad_norm": 0.0025268937461078167, + "learning_rate": 2.5825790988670546e-08, + "loss": 0.0001, + "step": 4554 + }, + { + "epoch": 2.98, + "grad_norm": 0.0061641717329621315, + "learning_rate": 2.3948245721944203e-08, + "loss": 0.0002, + "step": 4555 + }, + { + "epoch": 2.98, + "grad_norm": 0.05026659369468689, + "learning_rate": 2.2141543823522844e-08, + "loss": 0.0012, + "step": 4556 + }, + { + "epoch": 2.98, + "grad_norm": 0.10264033079147339, + "learning_rate": 2.040568614684046e-08, + "loss": 0.0029, + "step": 4557 + }, + { + "epoch": 2.98, + "grad_norm": 0.0027267495170235634, + "learning_rate": 1.874067351185782e-08, + "loss": 0.0001, + "step": 4558 + }, + { + "epoch": 2.98, + "grad_norm": 0.006125300191342831, + "learning_rate": 1.7146506705062456e-08, + "loss": 0.0002, + "step": 4559 + }, + { + "epoch": 2.99, + "grad_norm": 0.035302937030792236, + "learning_rate": 1.562318647948535e-08, + "loss": 0.0009, + "step": 4560 + }, + { + "epoch": 2.99, + "grad_norm": 0.005918944254517555, + "learning_rate": 1.4170713554684243e-08, + "loss": 0.0003, + "step": 4561 + }, + { + "epoch": 2.99, + "grad_norm": 0.0013190334429964423, + "learning_rate": 1.2789088616760312e-08, + "loss": 0.0001, + "step": 4562 + }, + { + "epoch": 2.99, + "grad_norm": 0.0021986577194184065, + "learning_rate": 1.147831231834151e-08, + "loss": 0.0001, + "step": 4563 + }, + { + "epoch": 2.99, + "grad_norm": 0.006591108627617359, + "learning_rate": 1.0238385278599215e-08, + "loss": 0.0004, + "step": 4564 + }, + { + "epoch": 2.99, + "grad_norm": 0.022041432559490204, + "learning_rate": 9.069308083214933e-09, + "loss": 0.0008, + "step": 4565 + }, + { + "epoch": 2.99, + "grad_norm": 0.0006390580092556775, + "learning_rate": 7.9710812844469e-09, + "loss": 0.0, + "step": 4566 + }, + { + "epoch": 2.99, + "grad_norm": 0.237385094165802, + "learning_rate": 6.943705401030175e-09, + "loss": 0.0081, + "step": 4567 + }, + { + "epoch": 2.99, + "grad_norm": 0.22776009142398834, + "learning_rate": 5.987180918276546e-09, + "loss": 0.0157, + "step": 4568 + }, + { + "epoch": 2.99, + "grad_norm": 0.0027707633562386036, + "learning_rate": 5.1015082879912735e-09, + "loss": 0.0001, + "step": 4569 + }, + { + "epoch": 2.99, + "grad_norm": 0.4915206730365753, + "learning_rate": 4.2866879285730075e-09, + "loss": 0.0096, + "step": 4570 + }, + { + "epoch": 2.99, + "grad_norm": 0.008182759396731853, + "learning_rate": 3.542720224897211e-09, + "loss": 0.0005, + "step": 4571 + }, + { + "epoch": 2.99, + "grad_norm": 0.029654890298843384, + "learning_rate": 2.8696055283661257e-09, + "loss": 0.0005, + "step": 4572 + }, + { + "epoch": 2.99, + "grad_norm": 0.0009402847499586642, + "learning_rate": 2.2673441569753815e-09, + "loss": 0.0001, + "step": 4573 + }, + { + "epoch": 2.99, + "grad_norm": 0.02566376142203808, + "learning_rate": 1.7359363951807703e-09, + "loss": 0.0013, + "step": 4574 + }, + { + "epoch": 3.0, + "grad_norm": 0.007051780819892883, + "learning_rate": 1.275382493998167e-09, + "loss": 0.0004, + "step": 4575 + }, + { + "epoch": 3.0, + "grad_norm": 0.010759466327726841, + "learning_rate": 8.856826710035292e-10, + "loss": 0.0006, + "step": 4576 + }, + { + "epoch": 3.0, + "grad_norm": 0.05404192954301834, + "learning_rate": 5.668371102496294e-10, + "loss": 0.0016, + "step": 4577 + }, + { + "epoch": 3.0, + "grad_norm": 0.019586021080613136, + "learning_rate": 3.1884596238263005e-10, + "loss": 0.0008, + "step": 4578 + }, + { + "epoch": 3.0, + "grad_norm": 0.00387565023265779, + "learning_rate": 1.4170934450885574e-10, + "loss": 0.0003, + "step": 4579 + }, + { + "epoch": 3.0, + "grad_norm": 0.0254792682826519, + "learning_rate": 3.542734031136696e-11, + "loss": 0.0009, + "step": 4580 + }, + { + "epoch": 3.0, + "grad_norm": 0.0024456833489239216, + "learning_rate": 0.0, + "loss": 0.0001, + "step": 4581 + } + ], + "logging_steps": 1, + "max_steps": 4581, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 1527, + "total_flos": 4.277210665808036e+17, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}