diff --git "a/checkpoint-1527/trainer_state.json" "b/checkpoint-1527/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-1527/trainer_state.json" @@ -0,0 +1,10742 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9996726677577742, + "eval_steps": 382, + "global_step": 1527, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 5.687138080596924, + "learning_rate": 2.9999999999999997e-05, + "loss": 3.5097, + "step": 1 + }, + { + "epoch": 0.0, + "eval_loss": 3.6327099800109863, + "eval_runtime": 39.1673, + "eval_samples_per_second": 32.859, + "eval_steps_per_second": 8.221, + "step": 1 + }, + { + "epoch": 0.0, + "grad_norm": 5.729796886444092, + "learning_rate": 5.9999999999999995e-05, + "loss": 3.6634, + "step": 2 + }, + { + "epoch": 0.0, + "grad_norm": 5.679180145263672, + "learning_rate": 8.999999999999999e-05, + "loss": 3.5559, + "step": 3 + }, + { + "epoch": 0.0, + "grad_norm": 4.81653356552124, + "learning_rate": 0.00011999999999999999, + "loss": 3.1536, + "step": 4 + }, + { + "epoch": 0.0, + "grad_norm": 4.388213634490967, + "learning_rate": 0.00015, + "loss": 2.3092, + "step": 5 + }, + { + "epoch": 0.0, + "grad_norm": 2.6662285327911377, + "learning_rate": 0.00017999999999999998, + "loss": 1.2283, + "step": 6 + }, + { + "epoch": 0.0, + "grad_norm": 1.9162248373031616, + "learning_rate": 0.00020999999999999998, + "loss": 0.6207, + "step": 7 + }, + { + "epoch": 0.01, + "grad_norm": 1.3946017026901245, + "learning_rate": 0.00023999999999999998, + "loss": 0.2942, + "step": 8 + }, + { + "epoch": 0.01, + "grad_norm": 0.3801995813846588, + "learning_rate": 0.00027, + "loss": 0.1143, + "step": 9 + }, + { + "epoch": 0.01, + "grad_norm": 0.2290647178888321, + "learning_rate": 0.0003, + "loss": 0.1152, + "step": 10 + }, + { + "epoch": 0.01, + "grad_norm": 0.2698324918746948, + "learning_rate": 0.00029999996457265966, + "loss": 0.0984, + "step": 11 + }, + { + "epoch": 0.01, + "grad_norm": 0.15049245953559875, + "learning_rate": 0.00029999985829065547, + "loss": 0.0925, + "step": 12 + }, + { + "epoch": 0.01, + "grad_norm": 0.7001833319664001, + "learning_rate": 0.0002999996811540376, + "loss": 0.1215, + "step": 13 + }, + { + "epoch": 0.01, + "grad_norm": 0.22832374274730682, + "learning_rate": 0.00029999943316288974, + "loss": 0.0997, + "step": 14 + }, + { + "epoch": 0.01, + "grad_norm": 0.1290595531463623, + "learning_rate": 0.00029999911431732894, + "loss": 0.0973, + "step": 15 + }, + { + "epoch": 0.01, + "grad_norm": 0.3555549383163452, + "learning_rate": 0.00029999872461750597, + "loss": 0.1108, + "step": 16 + }, + { + "epoch": 0.01, + "grad_norm": 0.04830395057797432, + "learning_rate": 0.0002999982640636048, + "loss": 0.0994, + "step": 17 + }, + { + "epoch": 0.01, + "grad_norm": 0.2727436125278473, + "learning_rate": 0.00029999773265584304, + "loss": 0.1144, + "step": 18 + }, + { + "epoch": 0.01, + "grad_norm": 0.03478335589170456, + "learning_rate": 0.0002999971303944716, + "loss": 0.0945, + "step": 19 + }, + { + "epoch": 0.01, + "grad_norm": 0.133951798081398, + "learning_rate": 0.00029999645727977505, + "loss": 0.0928, + "step": 20 + }, + { + "epoch": 0.01, + "grad_norm": 0.20885471999645233, + "learning_rate": 0.0002999957133120714, + "loss": 0.1056, + "step": 21 + }, + { + "epoch": 0.01, + "grad_norm": 0.030896561220288277, + "learning_rate": 0.00029999489849171195, + "loss": 0.0985, + "step": 22 + }, + { + "epoch": 0.02, + "grad_norm": 0.0476481132209301, + "learning_rate": 0.0002999940128190817, + "loss": 0.0993, + "step": 23 + }, + { + "epoch": 0.02, + "grad_norm": 0.2006714642047882, + "learning_rate": 0.00029999305629459895, + "loss": 0.0971, + "step": 24 + }, + { + "epoch": 0.02, + "grad_norm": 0.150727316737175, + "learning_rate": 0.0002999920289187155, + "loss": 0.1016, + "step": 25 + }, + { + "epoch": 0.02, + "grad_norm": 0.03271281719207764, + "learning_rate": 0.0002999909306919168, + "loss": 0.1002, + "step": 26 + }, + { + "epoch": 0.02, + "grad_norm": 0.08288753032684326, + "learning_rate": 0.0002999897616147214, + "loss": 0.1009, + "step": 27 + }, + { + "epoch": 0.02, + "grad_norm": 0.2443581521511078, + "learning_rate": 0.0002999885216876816, + "loss": 0.1036, + "step": 28 + }, + { + "epoch": 0.02, + "grad_norm": 0.16865722835063934, + "learning_rate": 0.00029998721091138323, + "loss": 0.0965, + "step": 29 + }, + { + "epoch": 0.02, + "grad_norm": 0.19362947344779968, + "learning_rate": 0.0002999858292864453, + "loss": 0.0952, + "step": 30 + }, + { + "epoch": 0.02, + "grad_norm": 0.039490532130002975, + "learning_rate": 0.0002999843768135205, + "loss": 0.0967, + "step": 31 + }, + { + "epoch": 0.02, + "grad_norm": 0.15848855674266815, + "learning_rate": 0.0002999828534932949, + "loss": 0.093, + "step": 32 + }, + { + "epoch": 0.02, + "grad_norm": 0.2813495695590973, + "learning_rate": 0.0002999812593264881, + "loss": 0.1052, + "step": 33 + }, + { + "epoch": 0.02, + "grad_norm": 0.03380066901445389, + "learning_rate": 0.00029997959431385314, + "loss": 0.0974, + "step": 34 + }, + { + "epoch": 0.02, + "grad_norm": 0.050066880881786346, + "learning_rate": 0.0002999778584561764, + "loss": 0.0972, + "step": 35 + }, + { + "epoch": 0.02, + "grad_norm": 0.2120673805475235, + "learning_rate": 0.00029997605175427803, + "loss": 0.0965, + "step": 36 + }, + { + "epoch": 0.02, + "grad_norm": 0.11290993541479111, + "learning_rate": 0.0002999741742090113, + "loss": 0.099, + "step": 37 + }, + { + "epoch": 0.02, + "grad_norm": 0.2454652190208435, + "learning_rate": 0.00029997222582126313, + "loss": 0.0898, + "step": 38 + }, + { + "epoch": 0.03, + "grad_norm": 0.10817914456129074, + "learning_rate": 0.0002999702065919539, + "loss": 0.0887, + "step": 39 + }, + { + "epoch": 0.03, + "grad_norm": 0.3510904014110565, + "learning_rate": 0.00029996811652203737, + "loss": 0.1107, + "step": 40 + }, + { + "epoch": 0.03, + "grad_norm": 0.3444919288158417, + "learning_rate": 0.0002999659556125009, + "loss": 0.1113, + "step": 41 + }, + { + "epoch": 0.03, + "grad_norm": 0.21621473133563995, + "learning_rate": 0.0002999637238643651, + "loss": 0.0991, + "step": 42 + }, + { + "epoch": 0.03, + "grad_norm": 0.0429786741733551, + "learning_rate": 0.00029996142127868426, + "loss": 0.0976, + "step": 43 + }, + { + "epoch": 0.03, + "grad_norm": 0.04371911287307739, + "learning_rate": 0.000299959047856546, + "loss": 0.0969, + "step": 44 + }, + { + "epoch": 0.03, + "grad_norm": 0.17956386506557465, + "learning_rate": 0.00029995660359907154, + "loss": 0.1027, + "step": 45 + }, + { + "epoch": 0.03, + "grad_norm": 0.05985981971025467, + "learning_rate": 0.0002999540885074153, + "loss": 0.0911, + "step": 46 + }, + { + "epoch": 0.03, + "grad_norm": 0.057165782898664474, + "learning_rate": 0.00029995150258276546, + "loss": 0.0944, + "step": 47 + }, + { + "epoch": 0.03, + "grad_norm": 0.06133668124675751, + "learning_rate": 0.00029994884582634345, + "loss": 0.0936, + "step": 48 + }, + { + "epoch": 0.03, + "grad_norm": 0.13429470360279083, + "learning_rate": 0.0002999461182394042, + "loss": 0.0932, + "step": 49 + }, + { + "epoch": 0.03, + "grad_norm": 0.08454808592796326, + "learning_rate": 0.00029994331982323625, + "loss": 0.0849, + "step": 50 + }, + { + "epoch": 0.03, + "grad_norm": 0.152529776096344, + "learning_rate": 0.0002999404505791613, + "loss": 0.0742, + "step": 51 + }, + { + "epoch": 0.03, + "grad_norm": 0.9239559173583984, + "learning_rate": 0.0002999375105085348, + "loss": 0.1266, + "step": 52 + }, + { + "epoch": 0.03, + "grad_norm": 0.15827079117298126, + "learning_rate": 0.0002999344996127455, + "loss": 0.0685, + "step": 53 + }, + { + "epoch": 0.04, + "grad_norm": 0.11372745782136917, + "learning_rate": 0.0002999314178932156, + "loss": 0.0853, + "step": 54 + }, + { + "epoch": 0.04, + "grad_norm": 0.11947692930698395, + "learning_rate": 0.00029992826535140093, + "loss": 0.0871, + "step": 55 + }, + { + "epoch": 0.04, + "grad_norm": 0.09731484949588776, + "learning_rate": 0.00029992504198879047, + "loss": 0.0799, + "step": 56 + }, + { + "epoch": 0.04, + "grad_norm": 0.40479809045791626, + "learning_rate": 0.0002999217478069069, + "loss": 0.1119, + "step": 57 + }, + { + "epoch": 0.04, + "grad_norm": 0.10651114583015442, + "learning_rate": 0.00029991838280730635, + "loss": 0.0741, + "step": 58 + }, + { + "epoch": 0.04, + "grad_norm": 0.13227766752243042, + "learning_rate": 0.0002999149469915782, + "loss": 0.067, + "step": 59 + }, + { + "epoch": 0.04, + "grad_norm": 0.2328774333000183, + "learning_rate": 0.0002999114403613454, + "loss": 0.0872, + "step": 60 + }, + { + "epoch": 0.04, + "grad_norm": 0.15303733944892883, + "learning_rate": 0.0002999078629182645, + "loss": 0.077, + "step": 61 + }, + { + "epoch": 0.04, + "grad_norm": 0.3285676836967468, + "learning_rate": 0.0002999042146640252, + "loss": 0.087, + "step": 62 + }, + { + "epoch": 0.04, + "grad_norm": 0.1548561453819275, + "learning_rate": 0.00029990049560035093, + "loss": 0.0521, + "step": 63 + }, + { + "epoch": 0.04, + "grad_norm": 0.1792415827512741, + "learning_rate": 0.0002998967057289983, + "loss": 0.0591, + "step": 64 + }, + { + "epoch": 0.04, + "grad_norm": 0.29741746187210083, + "learning_rate": 0.0002998928450517577, + "loss": 0.0955, + "step": 65 + }, + { + "epoch": 0.04, + "grad_norm": 0.2590031325817108, + "learning_rate": 0.0002998889135704527, + "loss": 0.0443, + "step": 66 + }, + { + "epoch": 0.04, + "grad_norm": 0.2152624875307083, + "learning_rate": 0.0002998849112869403, + "loss": 0.0656, + "step": 67 + }, + { + "epoch": 0.04, + "grad_norm": 0.1976858377456665, + "learning_rate": 0.0002998808382031111, + "loss": 0.0256, + "step": 68 + }, + { + "epoch": 0.05, + "grad_norm": 0.4642391502857208, + "learning_rate": 0.00029987669432088917, + "loss": 0.074, + "step": 69 + }, + { + "epoch": 0.05, + "grad_norm": 0.43813541531562805, + "learning_rate": 0.0002998724796422318, + "loss": 0.0344, + "step": 70 + }, + { + "epoch": 0.05, + "grad_norm": 0.8069552183151245, + "learning_rate": 0.0002998681941691299, + "loss": 0.1559, + "step": 71 + }, + { + "epoch": 0.05, + "grad_norm": 0.3986961841583252, + "learning_rate": 0.00029986383790360776, + "loss": 0.0504, + "step": 72 + }, + { + "epoch": 0.05, + "grad_norm": 0.19154639542102814, + "learning_rate": 0.00029985941084772317, + "loss": 0.0638, + "step": 73 + }, + { + "epoch": 0.05, + "grad_norm": 0.2110302895307541, + "learning_rate": 0.0002998549130035673, + "loss": 0.071, + "step": 74 + }, + { + "epoch": 0.05, + "grad_norm": 0.17988017201423645, + "learning_rate": 0.00029985034437326477, + "loss": 0.0798, + "step": 75 + }, + { + "epoch": 0.05, + "grad_norm": 0.15195637941360474, + "learning_rate": 0.0002998457049589736, + "loss": 0.0575, + "step": 76 + }, + { + "epoch": 0.05, + "grad_norm": 0.2465752810239792, + "learning_rate": 0.0002998409947628854, + "loss": 0.0669, + "step": 77 + }, + { + "epoch": 0.05, + "grad_norm": 0.10329095274209976, + "learning_rate": 0.0002998362137872249, + "loss": 0.0483, + "step": 78 + }, + { + "epoch": 0.05, + "grad_norm": 0.21354705095291138, + "learning_rate": 0.00029983136203425064, + "loss": 0.0522, + "step": 79 + }, + { + "epoch": 0.05, + "grad_norm": 0.1916392743587494, + "learning_rate": 0.00029982643950625436, + "loss": 0.0797, + "step": 80 + }, + { + "epoch": 0.05, + "grad_norm": 0.12721975147724152, + "learning_rate": 0.0002998214462055613, + "loss": 0.0368, + "step": 81 + }, + { + "epoch": 0.05, + "grad_norm": 0.29551053047180176, + "learning_rate": 0.0002998163821345301, + "loss": 0.094, + "step": 82 + }, + { + "epoch": 0.05, + "grad_norm": 0.3058943748474121, + "learning_rate": 0.00029981124729555283, + "loss": 0.0358, + "step": 83 + }, + { + "epoch": 0.05, + "grad_norm": 0.7026583552360535, + "learning_rate": 0.00029980604169105497, + "loss": 0.1386, + "step": 84 + }, + { + "epoch": 0.06, + "grad_norm": 0.37371405959129333, + "learning_rate": 0.00029980076532349557, + "loss": 0.0748, + "step": 85 + }, + { + "epoch": 0.06, + "grad_norm": 0.28942474722862244, + "learning_rate": 0.00029979541819536695, + "loss": 0.1037, + "step": 86 + }, + { + "epoch": 0.06, + "grad_norm": 0.1699017435312271, + "learning_rate": 0.0002997900003091949, + "loss": 0.0631, + "step": 87 + }, + { + "epoch": 0.06, + "grad_norm": 0.1061767190694809, + "learning_rate": 0.0002997845116675386, + "loss": 0.0557, + "step": 88 + }, + { + "epoch": 0.06, + "grad_norm": 0.13656219840049744, + "learning_rate": 0.0002997789522729908, + "loss": 0.0637, + "step": 89 + }, + { + "epoch": 0.06, + "grad_norm": 0.09874790161848068, + "learning_rate": 0.00029977332212817746, + "loss": 0.0495, + "step": 90 + }, + { + "epoch": 0.06, + "grad_norm": 0.24591276049613953, + "learning_rate": 0.0002997676212357581, + "loss": 0.0559, + "step": 91 + }, + { + "epoch": 0.06, + "grad_norm": 0.14663195610046387, + "learning_rate": 0.0002997618495984256, + "loss": 0.0804, + "step": 92 + }, + { + "epoch": 0.06, + "grad_norm": 0.08905334770679474, + "learning_rate": 0.0002997560072189062, + "loss": 0.0498, + "step": 93 + }, + { + "epoch": 0.06, + "grad_norm": 0.12921252846717834, + "learning_rate": 0.00029975009409995986, + "loss": 0.0365, + "step": 94 + }, + { + "epoch": 0.06, + "grad_norm": 0.08008511364459991, + "learning_rate": 0.0002997441102443795, + "loss": 0.03, + "step": 95 + }, + { + "epoch": 0.06, + "grad_norm": 0.2947149872779846, + "learning_rate": 0.0002997380556549918, + "loss": 0.0698, + "step": 96 + }, + { + "epoch": 0.06, + "grad_norm": 0.3243441581726074, + "learning_rate": 0.0002997319303346567, + "loss": 0.0564, + "step": 97 + }, + { + "epoch": 0.06, + "grad_norm": 0.28058576583862305, + "learning_rate": 0.00029972573428626757, + "loss": 0.1262, + "step": 98 + }, + { + "epoch": 0.06, + "grad_norm": 0.40957021713256836, + "learning_rate": 0.0002997194675127512, + "loss": 0.0471, + "step": 99 + }, + { + "epoch": 0.07, + "grad_norm": 0.12092690169811249, + "learning_rate": 0.00029971313001706787, + "loss": 0.0574, + "step": 100 + }, + { + "epoch": 0.07, + "grad_norm": 0.380398154258728, + "learning_rate": 0.0002997067218022111, + "loss": 0.1148, + "step": 101 + }, + { + "epoch": 0.07, + "grad_norm": 0.13584262132644653, + "learning_rate": 0.0002997002428712079, + "loss": 0.0299, + "step": 102 + }, + { + "epoch": 0.07, + "grad_norm": 0.13165581226348877, + "learning_rate": 0.00029969369322711874, + "loss": 0.0602, + "step": 103 + }, + { + "epoch": 0.07, + "grad_norm": 0.1055503860116005, + "learning_rate": 0.00029968707287303744, + "loss": 0.0404, + "step": 104 + }, + { + "epoch": 0.07, + "grad_norm": 0.09600503742694855, + "learning_rate": 0.00029968038181209114, + "loss": 0.0497, + "step": 105 + }, + { + "epoch": 0.07, + "grad_norm": 0.05941639468073845, + "learning_rate": 0.0002996736200474406, + "loss": 0.0456, + "step": 106 + }, + { + "epoch": 0.07, + "grad_norm": 0.1557297259569168, + "learning_rate": 0.0002996667875822797, + "loss": 0.077, + "step": 107 + }, + { + "epoch": 0.07, + "grad_norm": 0.14879021048545837, + "learning_rate": 0.00029965988441983595, + "loss": 0.0554, + "step": 108 + }, + { + "epoch": 0.07, + "grad_norm": 0.13067294657230377, + "learning_rate": 0.00029965291056337006, + "loss": 0.0357, + "step": 109 + }, + { + "epoch": 0.07, + "grad_norm": 0.15178795158863068, + "learning_rate": 0.00029964586601617633, + "loss": 0.0433, + "step": 110 + }, + { + "epoch": 0.07, + "grad_norm": 0.1176379844546318, + "learning_rate": 0.0002996387507815823, + "loss": 0.0432, + "step": 111 + }, + { + "epoch": 0.07, + "grad_norm": 0.048378992825746536, + "learning_rate": 0.000299631564862949, + "loss": 0.0338, + "step": 112 + }, + { + "epoch": 0.07, + "grad_norm": 0.09883740544319153, + "learning_rate": 0.0002996243082636708, + "loss": 0.0475, + "step": 113 + }, + { + "epoch": 0.07, + "grad_norm": 0.16062304377555847, + "learning_rate": 0.0002996169809871754, + "loss": 0.0595, + "step": 114 + }, + { + "epoch": 0.08, + "grad_norm": 0.06556422263383865, + "learning_rate": 0.00029960958303692397, + "loss": 0.0326, + "step": 115 + }, + { + "epoch": 0.08, + "grad_norm": 0.7436458468437195, + "learning_rate": 0.000299602114416411, + "loss": 0.0512, + "step": 116 + }, + { + "epoch": 0.08, + "grad_norm": 0.12153153866529465, + "learning_rate": 0.00029959457512916454, + "loss": 0.0448, + "step": 117 + }, + { + "epoch": 0.08, + "grad_norm": 0.21684418618679047, + "learning_rate": 0.0002995869651787458, + "loss": 0.0754, + "step": 118 + }, + { + "epoch": 0.08, + "grad_norm": 0.13978178799152374, + "learning_rate": 0.0002995792845687494, + "loss": 0.03, + "step": 119 + }, + { + "epoch": 0.08, + "grad_norm": 0.08695519715547562, + "learning_rate": 0.0002995715333028034, + "loss": 0.0156, + "step": 120 + }, + { + "epoch": 0.08, + "grad_norm": 0.2607383131980896, + "learning_rate": 0.0002995637113845693, + "loss": 0.0933, + "step": 121 + }, + { + "epoch": 0.08, + "grad_norm": 0.08398541808128357, + "learning_rate": 0.0002995558188177418, + "loss": 0.0368, + "step": 122 + }, + { + "epoch": 0.08, + "grad_norm": 0.14658145606517792, + "learning_rate": 0.0002995478556060492, + "loss": 0.0593, + "step": 123 + }, + { + "epoch": 0.08, + "grad_norm": 0.09054147452116013, + "learning_rate": 0.00029953982175325293, + "loss": 0.042, + "step": 124 + }, + { + "epoch": 0.08, + "grad_norm": 0.17315314710140228, + "learning_rate": 0.0002995317172631479, + "loss": 0.0754, + "step": 125 + }, + { + "epoch": 0.08, + "grad_norm": 0.20856395363807678, + "learning_rate": 0.0002995235421395624, + "loss": 0.0537, + "step": 126 + }, + { + "epoch": 0.08, + "grad_norm": 0.17539943754673004, + "learning_rate": 0.0002995152963863581, + "loss": 0.045, + "step": 127 + }, + { + "epoch": 0.08, + "grad_norm": 0.1361098289489746, + "learning_rate": 0.00029950698000743, + "loss": 0.0622, + "step": 128 + }, + { + "epoch": 0.08, + "grad_norm": 0.05299444869160652, + "learning_rate": 0.00029949859300670644, + "loss": 0.0548, + "step": 129 + }, + { + "epoch": 0.09, + "grad_norm": 0.19711115956306458, + "learning_rate": 0.0002994901353881491, + "loss": 0.0721, + "step": 130 + }, + { + "epoch": 0.09, + "grad_norm": 0.1288406252861023, + "learning_rate": 0.0002994816071557532, + "loss": 0.0408, + "step": 131 + }, + { + "epoch": 0.09, + "grad_norm": 0.08221332728862762, + "learning_rate": 0.000299473008313547, + "loss": 0.0526, + "step": 132 + }, + { + "epoch": 0.09, + "grad_norm": 0.1506081223487854, + "learning_rate": 0.00029946433886559237, + "loss": 0.0542, + "step": 133 + }, + { + "epoch": 0.09, + "grad_norm": 0.293639600276947, + "learning_rate": 0.00029945559881598444, + "loss": 0.0769, + "step": 134 + }, + { + "epoch": 0.09, + "grad_norm": 0.06451396644115448, + "learning_rate": 0.0002994467881688517, + "loss": 0.0417, + "step": 135 + }, + { + "epoch": 0.09, + "grad_norm": 0.2765437662601471, + "learning_rate": 0.00029943790692835604, + "loss": 0.0617, + "step": 136 + }, + { + "epoch": 0.09, + "grad_norm": 0.12035606801509857, + "learning_rate": 0.00029942895509869254, + "loss": 0.0429, + "step": 137 + }, + { + "epoch": 0.09, + "grad_norm": 0.09559385478496552, + "learning_rate": 0.0002994199326840898, + "loss": 0.044, + "step": 138 + }, + { + "epoch": 0.09, + "grad_norm": 0.13433387875556946, + "learning_rate": 0.00029941083968880965, + "loss": 0.036, + "step": 139 + }, + { + "epoch": 0.09, + "grad_norm": 0.1325090080499649, + "learning_rate": 0.0002994016761171474, + "loss": 0.0762, + "step": 140 + }, + { + "epoch": 0.09, + "grad_norm": 0.19197365641593933, + "learning_rate": 0.00029939244197343143, + "loss": 0.0587, + "step": 141 + }, + { + "epoch": 0.09, + "grad_norm": 0.09238675236701965, + "learning_rate": 0.00029938313726202376, + "loss": 0.0262, + "step": 142 + }, + { + "epoch": 0.09, + "grad_norm": 0.2584728002548218, + "learning_rate": 0.0002993737619873195, + "loss": 0.0382, + "step": 143 + }, + { + "epoch": 0.09, + "grad_norm": 0.30280745029449463, + "learning_rate": 0.00029936431615374727, + "loss": 0.0448, + "step": 144 + }, + { + "epoch": 0.09, + "grad_norm": 0.41464564204216003, + "learning_rate": 0.00029935479976576896, + "loss": 0.0676, + "step": 145 + }, + { + "epoch": 0.1, + "grad_norm": 0.4580010175704956, + "learning_rate": 0.00029934521282787974, + "loss": 0.1366, + "step": 146 + }, + { + "epoch": 0.1, + "grad_norm": 0.1701657474040985, + "learning_rate": 0.0002993355553446081, + "loss": 0.0844, + "step": 147 + }, + { + "epoch": 0.1, + "grad_norm": 0.10784261673688889, + "learning_rate": 0.000299325827320516, + "loss": 0.0211, + "step": 148 + }, + { + "epoch": 0.1, + "grad_norm": 0.08266110718250275, + "learning_rate": 0.0002993160287601984, + "loss": 0.0181, + "step": 149 + }, + { + "epoch": 0.1, + "grad_norm": 0.20068615674972534, + "learning_rate": 0.00029930615966828407, + "loss": 0.0582, + "step": 150 + }, + { + "epoch": 0.1, + "grad_norm": 0.14237689971923828, + "learning_rate": 0.0002992962200494347, + "loss": 0.0549, + "step": 151 + }, + { + "epoch": 0.1, + "grad_norm": 0.09671233594417572, + "learning_rate": 0.0002992862099083453, + "loss": 0.0368, + "step": 152 + }, + { + "epoch": 0.1, + "grad_norm": 0.11356969177722931, + "learning_rate": 0.00029927612924974455, + "loss": 0.0851, + "step": 153 + }, + { + "epoch": 0.1, + "grad_norm": 0.17435969412326813, + "learning_rate": 0.00029926597807839394, + "loss": 0.0869, + "step": 154 + }, + { + "epoch": 0.1, + "grad_norm": 0.09785137325525284, + "learning_rate": 0.00029925575639908866, + "loss": 0.0463, + "step": 155 + }, + { + "epoch": 0.1, + "grad_norm": 0.143271341919899, + "learning_rate": 0.0002992454642166571, + "loss": 0.0532, + "step": 156 + }, + { + "epoch": 0.1, + "grad_norm": 0.1381101906299591, + "learning_rate": 0.0002992351015359608, + "loss": 0.0512, + "step": 157 + }, + { + "epoch": 0.1, + "grad_norm": 0.0688018947839737, + "learning_rate": 0.0002992246683618948, + "loss": 0.0188, + "step": 158 + }, + { + "epoch": 0.1, + "grad_norm": 0.18138591945171356, + "learning_rate": 0.0002992141646993874, + "loss": 0.0737, + "step": 159 + }, + { + "epoch": 0.1, + "grad_norm": 0.0729256346821785, + "learning_rate": 0.0002992035905534001, + "loss": 0.0194, + "step": 160 + }, + { + "epoch": 0.11, + "grad_norm": 0.15414761006832123, + "learning_rate": 0.0002991929459289277, + "loss": 0.0412, + "step": 161 + }, + { + "epoch": 0.11, + "grad_norm": 0.2506199777126312, + "learning_rate": 0.00029918223083099846, + "loss": 0.0789, + "step": 162 + }, + { + "epoch": 0.11, + "grad_norm": 0.16611520946025848, + "learning_rate": 0.00029917144526467375, + "loss": 0.046, + "step": 163 + }, + { + "epoch": 0.11, + "grad_norm": 0.1828208565711975, + "learning_rate": 0.00029916058923504826, + "loss": 0.0324, + "step": 164 + }, + { + "epoch": 0.11, + "grad_norm": 0.08737993985414505, + "learning_rate": 0.00029914966274725006, + "loss": 0.0177, + "step": 165 + }, + { + "epoch": 0.11, + "grad_norm": 0.20271027088165283, + "learning_rate": 0.00029913866580644037, + "loss": 0.0455, + "step": 166 + }, + { + "epoch": 0.11, + "grad_norm": 0.04210209473967552, + "learning_rate": 0.00029912759841781383, + "loss": 0.0063, + "step": 167 + }, + { + "epoch": 0.11, + "grad_norm": 0.09085400402545929, + "learning_rate": 0.00029911646058659825, + "loss": 0.0174, + "step": 168 + }, + { + "epoch": 0.11, + "grad_norm": 0.18242572247982025, + "learning_rate": 0.00029910525231805466, + "loss": 0.053, + "step": 169 + }, + { + "epoch": 0.11, + "grad_norm": 0.2796941101551056, + "learning_rate": 0.0002990939736174776, + "loss": 0.0348, + "step": 170 + }, + { + "epoch": 0.11, + "grad_norm": 0.18838226795196533, + "learning_rate": 0.00029908262449019463, + "loss": 0.0583, + "step": 171 + }, + { + "epoch": 0.11, + "grad_norm": 0.03574841469526291, + "learning_rate": 0.00029907120494156674, + "loss": 0.0058, + "step": 172 + }, + { + "epoch": 0.11, + "grad_norm": 0.18582922220230103, + "learning_rate": 0.00029905971497698805, + "loss": 0.0571, + "step": 173 + }, + { + "epoch": 0.11, + "grad_norm": 0.12871672213077545, + "learning_rate": 0.00029904815460188604, + "loss": 0.0618, + "step": 174 + }, + { + "epoch": 0.11, + "grad_norm": 0.0590621717274189, + "learning_rate": 0.00029903652382172143, + "loss": 0.0107, + "step": 175 + }, + { + "epoch": 0.12, + "grad_norm": 0.07922167330980301, + "learning_rate": 0.00029902482264198817, + "loss": 0.035, + "step": 176 + }, + { + "epoch": 0.12, + "grad_norm": 0.3096056878566742, + "learning_rate": 0.0002990130510682135, + "loss": 0.0782, + "step": 177 + }, + { + "epoch": 0.12, + "grad_norm": 0.1896304190158844, + "learning_rate": 0.00029900120910595783, + "loss": 0.036, + "step": 178 + }, + { + "epoch": 0.12, + "grad_norm": 0.11776513606309891, + "learning_rate": 0.000298989296760815, + "loss": 0.0521, + "step": 179 + }, + { + "epoch": 0.12, + "grad_norm": 0.11616750061511993, + "learning_rate": 0.00029897731403841194, + "loss": 0.0275, + "step": 180 + }, + { + "epoch": 0.12, + "grad_norm": 0.20179390907287598, + "learning_rate": 0.0002989652609444088, + "loss": 0.0514, + "step": 181 + }, + { + "epoch": 0.12, + "grad_norm": 0.14983738958835602, + "learning_rate": 0.00029895313748449907, + "loss": 0.077, + "step": 182 + }, + { + "epoch": 0.12, + "grad_norm": 0.12123002856969833, + "learning_rate": 0.0002989409436644095, + "loss": 0.0485, + "step": 183 + }, + { + "epoch": 0.12, + "grad_norm": 0.314486026763916, + "learning_rate": 0.0002989286794898999, + "loss": 0.0931, + "step": 184 + }, + { + "epoch": 0.12, + "grad_norm": 0.132719025015831, + "learning_rate": 0.0002989163449667636, + "loss": 0.047, + "step": 185 + }, + { + "epoch": 0.12, + "grad_norm": 0.07938767969608307, + "learning_rate": 0.00029890394010082677, + "loss": 0.0364, + "step": 186 + }, + { + "epoch": 0.12, + "grad_norm": 0.08216488361358643, + "learning_rate": 0.00029889146489794926, + "loss": 0.0299, + "step": 187 + }, + { + "epoch": 0.12, + "grad_norm": 0.19339217245578766, + "learning_rate": 0.00029887891936402375, + "loss": 0.0408, + "step": 188 + }, + { + "epoch": 0.12, + "grad_norm": 0.30395349860191345, + "learning_rate": 0.0002988663035049763, + "loss": 0.0865, + "step": 189 + }, + { + "epoch": 0.12, + "grad_norm": 0.21264804899692535, + "learning_rate": 0.0002988536173267663, + "loss": 0.0584, + "step": 190 + }, + { + "epoch": 0.13, + "grad_norm": 0.1590937227010727, + "learning_rate": 0.0002988408608353862, + "loss": 0.0442, + "step": 191 + }, + { + "epoch": 0.13, + "grad_norm": 0.13069725036621094, + "learning_rate": 0.00029882803403686177, + "loss": 0.0416, + "step": 192 + }, + { + "epoch": 0.13, + "grad_norm": 0.1968701034784317, + "learning_rate": 0.0002988151369372518, + "loss": 0.0586, + "step": 193 + }, + { + "epoch": 0.13, + "grad_norm": 0.1478463113307953, + "learning_rate": 0.00029880216954264856, + "loss": 0.0595, + "step": 194 + }, + { + "epoch": 0.13, + "grad_norm": 0.06919383257627487, + "learning_rate": 0.0002987891318591773, + "loss": 0.0239, + "step": 195 + }, + { + "epoch": 0.13, + "grad_norm": 0.11905679851770401, + "learning_rate": 0.0002987760238929966, + "loss": 0.0345, + "step": 196 + }, + { + "epoch": 0.13, + "grad_norm": 0.14240068197250366, + "learning_rate": 0.00029876284565029816, + "loss": 0.0467, + "step": 197 + }, + { + "epoch": 0.13, + "grad_norm": 0.16097158193588257, + "learning_rate": 0.000298749597137307, + "loss": 0.0554, + "step": 198 + }, + { + "epoch": 0.13, + "grad_norm": 0.15597470104694366, + "learning_rate": 0.0002987362783602812, + "loss": 0.054, + "step": 199 + }, + { + "epoch": 0.13, + "grad_norm": 0.10321896523237228, + "learning_rate": 0.000298722889325512, + "loss": 0.0432, + "step": 200 + }, + { + "epoch": 0.13, + "grad_norm": 0.128427192568779, + "learning_rate": 0.000298709430039324, + "loss": 0.0315, + "step": 201 + }, + { + "epoch": 0.13, + "grad_norm": 0.11706223338842392, + "learning_rate": 0.00029869590050807487, + "loss": 0.0359, + "step": 202 + }, + { + "epoch": 0.13, + "grad_norm": 0.15359801054000854, + "learning_rate": 0.0002986823007381555, + "loss": 0.034, + "step": 203 + }, + { + "epoch": 0.13, + "grad_norm": 0.10363847017288208, + "learning_rate": 0.0002986686307359899, + "loss": 0.0261, + "step": 204 + }, + { + "epoch": 0.13, + "grad_norm": 0.12338493019342422, + "learning_rate": 0.0002986548905080353, + "loss": 0.0287, + "step": 205 + }, + { + "epoch": 0.13, + "grad_norm": 0.16201013326644897, + "learning_rate": 0.00029864108006078205, + "loss": 0.0173, + "step": 206 + }, + { + "epoch": 0.14, + "grad_norm": 0.04950540140271187, + "learning_rate": 0.00029862719940075387, + "loss": 0.0098, + "step": 207 + }, + { + "epoch": 0.14, + "grad_norm": 0.20930823683738708, + "learning_rate": 0.0002986132485345073, + "loss": 0.0652, + "step": 208 + }, + { + "epoch": 0.14, + "grad_norm": 0.12760238349437714, + "learning_rate": 0.0002985992274686324, + "loss": 0.0342, + "step": 209 + }, + { + "epoch": 0.14, + "grad_norm": 0.2107914686203003, + "learning_rate": 0.00029858513620975216, + "loss": 0.015, + "step": 210 + }, + { + "epoch": 0.14, + "grad_norm": 0.21169154345989227, + "learning_rate": 0.0002985709747645227, + "loss": 0.072, + "step": 211 + }, + { + "epoch": 0.14, + "grad_norm": 0.18555670976638794, + "learning_rate": 0.00029855674313963355, + "loss": 0.0359, + "step": 212 + }, + { + "epoch": 0.14, + "grad_norm": 0.1801125705242157, + "learning_rate": 0.00029854244134180707, + "loss": 0.038, + "step": 213 + }, + { + "epoch": 0.14, + "grad_norm": 0.10735122859477997, + "learning_rate": 0.000298528069377799, + "loss": 0.037, + "step": 214 + }, + { + "epoch": 0.14, + "grad_norm": 0.20155467092990875, + "learning_rate": 0.0002985136272543982, + "loss": 0.0505, + "step": 215 + }, + { + "epoch": 0.14, + "grad_norm": 0.1130833774805069, + "learning_rate": 0.0002984991149784265, + "loss": 0.0202, + "step": 216 + }, + { + "epoch": 0.14, + "grad_norm": 0.1932414174079895, + "learning_rate": 0.00029848453255673906, + "loss": 0.0803, + "step": 217 + }, + { + "epoch": 0.14, + "grad_norm": 0.18907181918621063, + "learning_rate": 0.0002984698799962241, + "loss": 0.0562, + "step": 218 + }, + { + "epoch": 0.14, + "grad_norm": 0.11439274251461029, + "learning_rate": 0.0002984551573038029, + "loss": 0.0474, + "step": 219 + }, + { + "epoch": 0.14, + "grad_norm": 0.19350704550743103, + "learning_rate": 0.00029844036448643, + "loss": 0.0335, + "step": 220 + }, + { + "epoch": 0.14, + "grad_norm": 0.19873294234275818, + "learning_rate": 0.000298425501551093, + "loss": 0.0616, + "step": 221 + }, + { + "epoch": 0.15, + "grad_norm": 0.2024085968732834, + "learning_rate": 0.00029841056850481265, + "loss": 0.0567, + "step": 222 + }, + { + "epoch": 0.15, + "grad_norm": 0.09004423022270203, + "learning_rate": 0.0002983955653546427, + "loss": 0.0291, + "step": 223 + }, + { + "epoch": 0.15, + "grad_norm": 0.19469811022281647, + "learning_rate": 0.00029838049210767015, + "loss": 0.0487, + "step": 224 + }, + { + "epoch": 0.15, + "grad_norm": 0.2525189518928528, + "learning_rate": 0.00029836534877101514, + "loss": 0.0629, + "step": 225 + }, + { + "epoch": 0.15, + "grad_norm": 0.12139023840427399, + "learning_rate": 0.0002983501353518307, + "loss": 0.0457, + "step": 226 + }, + { + "epoch": 0.15, + "grad_norm": 0.06411401927471161, + "learning_rate": 0.00029833485185730326, + "loss": 0.0186, + "step": 227 + }, + { + "epoch": 0.15, + "grad_norm": 0.024475542828440666, + "learning_rate": 0.00029831949829465214, + "loss": 0.004, + "step": 228 + }, + { + "epoch": 0.15, + "grad_norm": 0.15951114892959595, + "learning_rate": 0.0002983040746711298, + "loss": 0.0297, + "step": 229 + }, + { + "epoch": 0.15, + "grad_norm": 0.03694155812263489, + "learning_rate": 0.0002982885809940218, + "loss": 0.0073, + "step": 230 + }, + { + "epoch": 0.15, + "grad_norm": 0.13100893795490265, + "learning_rate": 0.0002982730172706468, + "loss": 0.0272, + "step": 231 + }, + { + "epoch": 0.15, + "grad_norm": 0.08929093927145004, + "learning_rate": 0.00029825738350835665, + "loss": 0.0146, + "step": 232 + }, + { + "epoch": 0.15, + "grad_norm": 0.1474764049053192, + "learning_rate": 0.0002982416797145361, + "loss": 0.0422, + "step": 233 + }, + { + "epoch": 0.15, + "grad_norm": 0.13874994218349457, + "learning_rate": 0.00029822590589660306, + "loss": 0.0353, + "step": 234 + }, + { + "epoch": 0.15, + "grad_norm": 0.048271678388118744, + "learning_rate": 0.00029821006206200856, + "loss": 0.0072, + "step": 235 + }, + { + "epoch": 0.15, + "grad_norm": 0.29017898440361023, + "learning_rate": 0.0002981941482182366, + "loss": 0.0607, + "step": 236 + }, + { + "epoch": 0.16, + "grad_norm": 0.3267674446105957, + "learning_rate": 0.0002981781643728044, + "loss": 0.101, + "step": 237 + }, + { + "epoch": 0.16, + "grad_norm": 0.17602747678756714, + "learning_rate": 0.00029816211053326216, + "loss": 0.0236, + "step": 238 + }, + { + "epoch": 0.16, + "grad_norm": 0.08361077308654785, + "learning_rate": 0.00029814598670719304, + "loss": 0.0277, + "step": 239 + }, + { + "epoch": 0.16, + "grad_norm": 0.08593238145112991, + "learning_rate": 0.00029812979290221346, + "loss": 0.0291, + "step": 240 + }, + { + "epoch": 0.16, + "grad_norm": 0.08858275413513184, + "learning_rate": 0.00029811352912597277, + "loss": 0.0329, + "step": 241 + }, + { + "epoch": 0.16, + "grad_norm": 0.08017202466726303, + "learning_rate": 0.0002980971953861534, + "loss": 0.0287, + "step": 242 + }, + { + "epoch": 0.16, + "grad_norm": 0.06615002453327179, + "learning_rate": 0.0002980807916904709, + "loss": 0.0269, + "step": 243 + }, + { + "epoch": 0.16, + "grad_norm": 0.12813499569892883, + "learning_rate": 0.00029806431804667364, + "loss": 0.0321, + "step": 244 + }, + { + "epoch": 0.16, + "grad_norm": 0.05528206750750542, + "learning_rate": 0.0002980477744625433, + "loss": 0.0089, + "step": 245 + }, + { + "epoch": 0.16, + "grad_norm": 0.10161186009645462, + "learning_rate": 0.00029803116094589445, + "loss": 0.0294, + "step": 246 + }, + { + "epoch": 0.16, + "grad_norm": 0.09885023534297943, + "learning_rate": 0.00029801447750457476, + "loss": 0.0232, + "step": 247 + }, + { + "epoch": 0.16, + "grad_norm": 0.20870375633239746, + "learning_rate": 0.00029799772414646484, + "loss": 0.0478, + "step": 248 + }, + { + "epoch": 0.16, + "grad_norm": 0.2730790674686432, + "learning_rate": 0.00029798090087947843, + "loss": 0.042, + "step": 249 + }, + { + "epoch": 0.16, + "grad_norm": 0.20371069014072418, + "learning_rate": 0.0002979640077115622, + "loss": 0.0634, + "step": 250 + }, + { + "epoch": 0.16, + "grad_norm": 0.14660406112670898, + "learning_rate": 0.0002979470446506959, + "loss": 0.0201, + "step": 251 + }, + { + "epoch": 0.16, + "grad_norm": 0.19971100986003876, + "learning_rate": 0.0002979300117048923, + "loss": 0.0431, + "step": 252 + }, + { + "epoch": 0.17, + "grad_norm": 0.14965400099754333, + "learning_rate": 0.0002979129088821971, + "loss": 0.041, + "step": 253 + }, + { + "epoch": 0.17, + "grad_norm": 0.2110958695411682, + "learning_rate": 0.0002978957361906892, + "loss": 0.028, + "step": 254 + }, + { + "epoch": 0.17, + "grad_norm": 0.13050246238708496, + "learning_rate": 0.0002978784936384802, + "loss": 0.0258, + "step": 255 + }, + { + "epoch": 0.17, + "grad_norm": 0.0885690301656723, + "learning_rate": 0.000297861181233715, + "loss": 0.0337, + "step": 256 + }, + { + "epoch": 0.17, + "grad_norm": 0.26541608572006226, + "learning_rate": 0.0002978437989845713, + "loss": 0.1142, + "step": 257 + }, + { + "epoch": 0.17, + "grad_norm": 0.14441104233264923, + "learning_rate": 0.0002978263468992599, + "loss": 0.0368, + "step": 258 + }, + { + "epoch": 0.17, + "grad_norm": 0.11450188606977463, + "learning_rate": 0.0002978088249860245, + "loss": 0.0243, + "step": 259 + }, + { + "epoch": 0.17, + "grad_norm": 0.3472074568271637, + "learning_rate": 0.00029779123325314184, + "loss": 0.0786, + "step": 260 + }, + { + "epoch": 0.17, + "grad_norm": 0.07867071032524109, + "learning_rate": 0.0002977735717089217, + "loss": 0.0356, + "step": 261 + }, + { + "epoch": 0.17, + "grad_norm": 0.1661967933177948, + "learning_rate": 0.0002977558403617067, + "loss": 0.047, + "step": 262 + }, + { + "epoch": 0.17, + "grad_norm": 0.17638400197029114, + "learning_rate": 0.00029773803921987244, + "loss": 0.0527, + "step": 263 + }, + { + "epoch": 0.17, + "grad_norm": 0.05885611101984978, + "learning_rate": 0.0002977201682918277, + "loss": 0.0156, + "step": 264 + }, + { + "epoch": 0.17, + "grad_norm": 0.07076411694288254, + "learning_rate": 0.00029770222758601395, + "loss": 0.0418, + "step": 265 + }, + { + "epoch": 0.17, + "grad_norm": 0.06245988979935646, + "learning_rate": 0.0002976842171109058, + "loss": 0.0199, + "step": 266 + }, + { + "epoch": 0.17, + "grad_norm": 0.08311894536018372, + "learning_rate": 0.0002976661368750107, + "loss": 0.028, + "step": 267 + }, + { + "epoch": 0.18, + "grad_norm": 0.11093831807374954, + "learning_rate": 0.0002976479868868692, + "loss": 0.0298, + "step": 268 + }, + { + "epoch": 0.18, + "grad_norm": 0.17683441936969757, + "learning_rate": 0.00029762976715505464, + "loss": 0.0539, + "step": 269 + }, + { + "epoch": 0.18, + "grad_norm": 0.13351142406463623, + "learning_rate": 0.00029761147768817345, + "loss": 0.0593, + "step": 270 + }, + { + "epoch": 0.18, + "grad_norm": 0.07717160880565643, + "learning_rate": 0.0002975931184948648, + "loss": 0.0227, + "step": 271 + }, + { + "epoch": 0.18, + "grad_norm": 0.11211559176445007, + "learning_rate": 0.0002975746895838011, + "loss": 0.0385, + "step": 272 + }, + { + "epoch": 0.18, + "grad_norm": 0.09209641814231873, + "learning_rate": 0.00029755619096368734, + "loss": 0.0086, + "step": 273 + }, + { + "epoch": 0.18, + "grad_norm": 0.0850004106760025, + "learning_rate": 0.0002975376226432617, + "loss": 0.0343, + "step": 274 + }, + { + "epoch": 0.18, + "grad_norm": 0.17711663246154785, + "learning_rate": 0.0002975189846312952, + "loss": 0.0665, + "step": 275 + }, + { + "epoch": 0.18, + "grad_norm": 0.13066548109054565, + "learning_rate": 0.0002975002769365918, + "loss": 0.0551, + "step": 276 + }, + { + "epoch": 0.18, + "grad_norm": 0.07509409636259079, + "learning_rate": 0.00029748149956798826, + "loss": 0.0087, + "step": 277 + }, + { + "epoch": 0.18, + "grad_norm": 0.3725223243236542, + "learning_rate": 0.0002974626525343544, + "loss": 0.026, + "step": 278 + }, + { + "epoch": 0.18, + "grad_norm": 0.20973052084445953, + "learning_rate": 0.0002974437358445929, + "loss": 0.015, + "step": 279 + }, + { + "epoch": 0.18, + "grad_norm": 0.25902581214904785, + "learning_rate": 0.0002974247495076393, + "loss": 0.0617, + "step": 280 + }, + { + "epoch": 0.18, + "grad_norm": 0.22490067780017853, + "learning_rate": 0.000297405693532462, + "loss": 0.0456, + "step": 281 + }, + { + "epoch": 0.18, + "grad_norm": 0.2885708510875702, + "learning_rate": 0.0002973865679280626, + "loss": 0.1066, + "step": 282 + }, + { + "epoch": 0.19, + "grad_norm": 0.2658590078353882, + "learning_rate": 0.00029736737270347517, + "loss": 0.0931, + "step": 283 + }, + { + "epoch": 0.19, + "grad_norm": 0.11531944572925568, + "learning_rate": 0.00029734810786776687, + "loss": 0.0238, + "step": 284 + }, + { + "epoch": 0.19, + "grad_norm": 0.0557803250849247, + "learning_rate": 0.00029732877343003776, + "loss": 0.0257, + "step": 285 + }, + { + "epoch": 0.19, + "grad_norm": 0.10880523920059204, + "learning_rate": 0.00029730936939942077, + "loss": 0.0387, + "step": 286 + }, + { + "epoch": 0.19, + "grad_norm": 0.09500639885663986, + "learning_rate": 0.0002972898957850816, + "loss": 0.0308, + "step": 287 + }, + { + "epoch": 0.19, + "grad_norm": 0.11504241824150085, + "learning_rate": 0.0002972703525962189, + "loss": 0.0292, + "step": 288 + }, + { + "epoch": 0.19, + "grad_norm": 0.10513140261173248, + "learning_rate": 0.0002972507398420643, + "loss": 0.0245, + "step": 289 + }, + { + "epoch": 0.19, + "grad_norm": 0.20218555629253387, + "learning_rate": 0.000297231057531882, + "loss": 0.0394, + "step": 290 + }, + { + "epoch": 0.19, + "grad_norm": 0.053536418825387955, + "learning_rate": 0.00029721130567496936, + "loss": 0.0129, + "step": 291 + }, + { + "epoch": 0.19, + "grad_norm": 0.15879443287849426, + "learning_rate": 0.0002971914842806564, + "loss": 0.054, + "step": 292 + }, + { + "epoch": 0.19, + "grad_norm": 0.11933678388595581, + "learning_rate": 0.00029717159335830606, + "loss": 0.0206, + "step": 293 + }, + { + "epoch": 0.19, + "grad_norm": 0.14436180889606476, + "learning_rate": 0.0002971516329173141, + "loss": 0.024, + "step": 294 + }, + { + "epoch": 0.19, + "grad_norm": 0.01978749968111515, + "learning_rate": 0.0002971316029671091, + "loss": 0.0047, + "step": 295 + }, + { + "epoch": 0.19, + "grad_norm": 0.1731237769126892, + "learning_rate": 0.00029711150351715253, + "loss": 0.0605, + "step": 296 + }, + { + "epoch": 0.19, + "grad_norm": 0.059307076036930084, + "learning_rate": 0.00029709133457693867, + "loss": 0.0308, + "step": 297 + }, + { + "epoch": 0.2, + "grad_norm": 0.3645476996898651, + "learning_rate": 0.00029707109615599456, + "loss": 0.0566, + "step": 298 + }, + { + "epoch": 0.2, + "grad_norm": 0.10670791566371918, + "learning_rate": 0.0002970507882638801, + "loss": 0.0234, + "step": 299 + }, + { + "epoch": 0.2, + "grad_norm": 0.10919758677482605, + "learning_rate": 0.0002970304109101881, + "loss": 0.0157, + "step": 300 + }, + { + "epoch": 0.2, + "grad_norm": 0.08173630386590958, + "learning_rate": 0.00029700996410454407, + "loss": 0.0371, + "step": 301 + }, + { + "epoch": 0.2, + "grad_norm": 0.13943839073181152, + "learning_rate": 0.00029698944785660635, + "loss": 0.0781, + "step": 302 + }, + { + "epoch": 0.2, + "grad_norm": 0.342821329832077, + "learning_rate": 0.00029696886217606605, + "loss": 0.0476, + "step": 303 + }, + { + "epoch": 0.2, + "grad_norm": 0.048615969717502594, + "learning_rate": 0.0002969482070726472, + "loss": 0.0083, + "step": 304 + }, + { + "epoch": 0.2, + "grad_norm": 0.1213599145412445, + "learning_rate": 0.0002969274825561064, + "loss": 0.0258, + "step": 305 + }, + { + "epoch": 0.2, + "grad_norm": 0.1914874166250229, + "learning_rate": 0.0002969066886362333, + "loss": 0.034, + "step": 306 + }, + { + "epoch": 0.2, + "grad_norm": 0.14067624509334564, + "learning_rate": 0.0002968858253228502, + "loss": 0.0395, + "step": 307 + }, + { + "epoch": 0.2, + "grad_norm": 0.08359983563423157, + "learning_rate": 0.00029686489262581217, + "loss": 0.0315, + "step": 308 + }, + { + "epoch": 0.2, + "grad_norm": 0.11551601439714432, + "learning_rate": 0.000296843890555007, + "loss": 0.058, + "step": 309 + }, + { + "epoch": 0.2, + "grad_norm": 0.12968787550926208, + "learning_rate": 0.00029682281912035545, + "loss": 0.0347, + "step": 310 + }, + { + "epoch": 0.2, + "grad_norm": 0.10182147473096848, + "learning_rate": 0.0002968016783318109, + "loss": 0.0165, + "step": 311 + }, + { + "epoch": 0.2, + "grad_norm": 0.06534916907548904, + "learning_rate": 0.00029678046819935934, + "loss": 0.0218, + "step": 312 + }, + { + "epoch": 0.2, + "grad_norm": 0.12587250769138336, + "learning_rate": 0.0002967591887330199, + "loss": 0.0498, + "step": 313 + }, + { + "epoch": 0.21, + "grad_norm": 0.06701786816120148, + "learning_rate": 0.0002967378399428441, + "loss": 0.0484, + "step": 314 + }, + { + "epoch": 0.21, + "grad_norm": 0.10836692154407501, + "learning_rate": 0.00029671642183891643, + "loss": 0.0412, + "step": 315 + }, + { + "epoch": 0.21, + "grad_norm": 0.061415113508701324, + "learning_rate": 0.00029669493443135403, + "loss": 0.0172, + "step": 316 + }, + { + "epoch": 0.21, + "grad_norm": 0.20760087668895721, + "learning_rate": 0.0002966733777303068, + "loss": 0.0494, + "step": 317 + }, + { + "epoch": 0.21, + "grad_norm": 0.11503862589597702, + "learning_rate": 0.00029665175174595736, + "loss": 0.0385, + "step": 318 + }, + { + "epoch": 0.21, + "grad_norm": 0.07366505265235901, + "learning_rate": 0.000296630056488521, + "loss": 0.0403, + "step": 319 + }, + { + "epoch": 0.21, + "grad_norm": 0.036951594054698944, + "learning_rate": 0.00029660829196824577, + "loss": 0.0092, + "step": 320 + }, + { + "epoch": 0.21, + "grad_norm": 0.08457314223051071, + "learning_rate": 0.0002965864581954126, + "loss": 0.0445, + "step": 321 + }, + { + "epoch": 0.21, + "grad_norm": 0.24513787031173706, + "learning_rate": 0.0002965645551803349, + "loss": 0.0716, + "step": 322 + }, + { + "epoch": 0.21, + "grad_norm": 0.08235831558704376, + "learning_rate": 0.00029654258293335887, + "loss": 0.029, + "step": 323 + }, + { + "epoch": 0.21, + "grad_norm": 0.08004003018140793, + "learning_rate": 0.00029652054146486344, + "loss": 0.0365, + "step": 324 + }, + { + "epoch": 0.21, + "grad_norm": 0.14928393065929413, + "learning_rate": 0.0002964984307852602, + "loss": 0.039, + "step": 325 + }, + { + "epoch": 0.21, + "grad_norm": 0.1802273988723755, + "learning_rate": 0.00029647625090499345, + "loss": 0.0324, + "step": 326 + }, + { + "epoch": 0.21, + "grad_norm": 0.18169750273227692, + "learning_rate": 0.00029645400183454026, + "loss": 0.0427, + "step": 327 + }, + { + "epoch": 0.21, + "grad_norm": 0.13121691346168518, + "learning_rate": 0.0002964316835844102, + "loss": 0.0274, + "step": 328 + }, + { + "epoch": 0.22, + "grad_norm": 0.27358877658843994, + "learning_rate": 0.0002964092961651456, + "loss": 0.0537, + "step": 329 + }, + { + "epoch": 0.22, + "grad_norm": 0.16992299258708954, + "learning_rate": 0.0002963868395873216, + "loss": 0.0797, + "step": 330 + }, + { + "epoch": 0.22, + "grad_norm": 0.2110740691423416, + "learning_rate": 0.0002963643138615458, + "loss": 0.0835, + "step": 331 + }, + { + "epoch": 0.22, + "grad_norm": 0.17114487290382385, + "learning_rate": 0.0002963417189984586, + "loss": 0.0619, + "step": 332 + }, + { + "epoch": 0.22, + "grad_norm": 0.09492560476064682, + "learning_rate": 0.000296319055008733, + "loss": 0.0212, + "step": 333 + }, + { + "epoch": 0.22, + "grad_norm": 0.19000209867954254, + "learning_rate": 0.0002962963219030746, + "loss": 0.0802, + "step": 334 + }, + { + "epoch": 0.22, + "grad_norm": 0.11632812023162842, + "learning_rate": 0.0002962735196922219, + "loss": 0.0426, + "step": 335 + }, + { + "epoch": 0.22, + "grad_norm": 0.15153561532497406, + "learning_rate": 0.0002962506483869456, + "loss": 0.07, + "step": 336 + }, + { + "epoch": 0.22, + "grad_norm": 0.0691797137260437, + "learning_rate": 0.00029622770799804944, + "loss": 0.0246, + "step": 337 + }, + { + "epoch": 0.22, + "grad_norm": 0.0731196403503418, + "learning_rate": 0.0002962046985363697, + "loss": 0.0413, + "step": 338 + }, + { + "epoch": 0.22, + "grad_norm": 0.1449161171913147, + "learning_rate": 0.00029618162001277513, + "loss": 0.023, + "step": 339 + }, + { + "epoch": 0.22, + "grad_norm": 0.13844870030879974, + "learning_rate": 0.0002961584724381672, + "loss": 0.055, + "step": 340 + }, + { + "epoch": 0.22, + "grad_norm": 0.08192728459835052, + "learning_rate": 0.00029613525582348007, + "loss": 0.0274, + "step": 341 + }, + { + "epoch": 0.22, + "grad_norm": 0.030294157564640045, + "learning_rate": 0.0002961119701796804, + "loss": 0.0332, + "step": 342 + }, + { + "epoch": 0.22, + "grad_norm": 0.12008962035179138, + "learning_rate": 0.0002960886155177675, + "loss": 0.0293, + "step": 343 + }, + { + "epoch": 0.23, + "grad_norm": 0.22829335927963257, + "learning_rate": 0.0002960651918487734, + "loss": 0.049, + "step": 344 + }, + { + "epoch": 0.23, + "grad_norm": 0.09662315249443054, + "learning_rate": 0.00029604169918376246, + "loss": 0.019, + "step": 345 + }, + { + "epoch": 0.23, + "grad_norm": 0.056000061333179474, + "learning_rate": 0.0002960181375338318, + "loss": 0.0077, + "step": 346 + }, + { + "epoch": 0.23, + "grad_norm": 0.04742419347167015, + "learning_rate": 0.00029599450691011116, + "loss": 0.0216, + "step": 347 + }, + { + "epoch": 0.23, + "grad_norm": 0.17151907086372375, + "learning_rate": 0.0002959708073237628, + "loss": 0.0364, + "step": 348 + }, + { + "epoch": 0.23, + "grad_norm": 0.3108668923377991, + "learning_rate": 0.00029594703878598155, + "loss": 0.0288, + "step": 349 + }, + { + "epoch": 0.23, + "grad_norm": 0.05538111925125122, + "learning_rate": 0.00029592320130799487, + "loss": 0.0048, + "step": 350 + }, + { + "epoch": 0.23, + "grad_norm": 0.2907853126525879, + "learning_rate": 0.00029589929490106263, + "loss": 0.0443, + "step": 351 + }, + { + "epoch": 0.23, + "grad_norm": 0.19189013540744781, + "learning_rate": 0.0002958753195764775, + "loss": 0.0688, + "step": 352 + }, + { + "epoch": 0.23, + "grad_norm": 0.3744778037071228, + "learning_rate": 0.00029585127534556446, + "loss": 0.0726, + "step": 353 + }, + { + "epoch": 0.23, + "grad_norm": 0.02139083668589592, + "learning_rate": 0.00029582716221968124, + "loss": 0.003, + "step": 354 + }, + { + "epoch": 0.23, + "grad_norm": 0.3209889531135559, + "learning_rate": 0.00029580298021021796, + "loss": 0.068, + "step": 355 + }, + { + "epoch": 0.23, + "grad_norm": 0.13530127704143524, + "learning_rate": 0.0002957787293285974, + "loss": 0.0229, + "step": 356 + }, + { + "epoch": 0.23, + "grad_norm": 0.04955355450510979, + "learning_rate": 0.00029575440958627485, + "loss": 0.007, + "step": 357 + }, + { + "epoch": 0.23, + "grad_norm": 0.05992133542895317, + "learning_rate": 0.0002957300209947379, + "loss": 0.014, + "step": 358 + }, + { + "epoch": 0.24, + "grad_norm": 0.08975626528263092, + "learning_rate": 0.0002957055635655071, + "loss": 0.0419, + "step": 359 + }, + { + "epoch": 0.24, + "grad_norm": 0.3397723436355591, + "learning_rate": 0.00029568103731013513, + "loss": 0.093, + "step": 360 + }, + { + "epoch": 0.24, + "grad_norm": 0.05291612446308136, + "learning_rate": 0.00029565644224020733, + "loss": 0.0137, + "step": 361 + }, + { + "epoch": 0.24, + "grad_norm": 0.16154609620571136, + "learning_rate": 0.0002956317783673416, + "loss": 0.0414, + "step": 362 + }, + { + "epoch": 0.24, + "grad_norm": 0.12861596047878265, + "learning_rate": 0.0002956070457031882, + "loss": 0.0372, + "step": 363 + }, + { + "epoch": 0.24, + "grad_norm": 0.09462448954582214, + "learning_rate": 0.00029558224425943003, + "loss": 0.0292, + "step": 364 + }, + { + "epoch": 0.24, + "grad_norm": 0.14290063083171844, + "learning_rate": 0.00029555737404778233, + "loss": 0.0572, + "step": 365 + }, + { + "epoch": 0.24, + "grad_norm": 0.11055822670459747, + "learning_rate": 0.00029553243507999307, + "loss": 0.0372, + "step": 366 + }, + { + "epoch": 0.24, + "grad_norm": 0.10231087356805801, + "learning_rate": 0.00029550742736784237, + "loss": 0.0368, + "step": 367 + }, + { + "epoch": 0.24, + "grad_norm": 0.09969429671764374, + "learning_rate": 0.00029548235092314304, + "loss": 0.0416, + "step": 368 + }, + { + "epoch": 0.24, + "grad_norm": 0.1207612007856369, + "learning_rate": 0.00029545720575774033, + "loss": 0.0307, + "step": 369 + }, + { + "epoch": 0.24, + "grad_norm": 0.11535090953111649, + "learning_rate": 0.0002954319918835119, + "loss": 0.0296, + "step": 370 + }, + { + "epoch": 0.24, + "grad_norm": 0.1460224986076355, + "learning_rate": 0.00029540670931236786, + "loss": 0.0587, + "step": 371 + }, + { + "epoch": 0.24, + "grad_norm": 0.10432720184326172, + "learning_rate": 0.0002953813580562509, + "loss": 0.0397, + "step": 372 + }, + { + "epoch": 0.24, + "grad_norm": 0.2140846997499466, + "learning_rate": 0.0002953559381271359, + "loss": 0.0538, + "step": 373 + }, + { + "epoch": 0.24, + "grad_norm": 0.12050808221101761, + "learning_rate": 0.00029533044953703044, + "loss": 0.0439, + "step": 374 + }, + { + "epoch": 0.25, + "grad_norm": 0.07928888499736786, + "learning_rate": 0.0002953048922979744, + "loss": 0.0163, + "step": 375 + }, + { + "epoch": 0.25, + "grad_norm": 0.08733994513750076, + "learning_rate": 0.0002952792664220402, + "loss": 0.0219, + "step": 376 + }, + { + "epoch": 0.25, + "grad_norm": 0.18080447614192963, + "learning_rate": 0.0002952535719213325, + "loss": 0.0469, + "step": 377 + }, + { + "epoch": 0.25, + "grad_norm": 0.08348793536424637, + "learning_rate": 0.0002952278088079884, + "loss": 0.035, + "step": 378 + }, + { + "epoch": 0.25, + "grad_norm": 0.1347195953130722, + "learning_rate": 0.00029520197709417763, + "loss": 0.029, + "step": 379 + }, + { + "epoch": 0.25, + "grad_norm": 0.11075679957866669, + "learning_rate": 0.0002951760767921021, + "loss": 0.0257, + "step": 380 + }, + { + "epoch": 0.25, + "grad_norm": 0.13172994554042816, + "learning_rate": 0.0002951501079139962, + "loss": 0.0302, + "step": 381 + }, + { + "epoch": 0.25, + "grad_norm": 0.114262655377388, + "learning_rate": 0.0002951240704721267, + "loss": 0.0492, + "step": 382 + }, + { + "epoch": 0.25, + "eval_loss": 0.034534960985183716, + "eval_runtime": 39.6959, + "eval_samples_per_second": 32.421, + "eval_steps_per_second": 8.112, + "step": 382 + }, + { + "epoch": 0.25, + "grad_norm": 0.08364730328321457, + "learning_rate": 0.0002950979644787928, + "loss": 0.0185, + "step": 383 + }, + { + "epoch": 0.25, + "grad_norm": 0.16603770852088928, + "learning_rate": 0.000295071789946326, + "loss": 0.0443, + "step": 384 + }, + { + "epoch": 0.25, + "grad_norm": 0.1269228458404541, + "learning_rate": 0.00029504554688709027, + "loss": 0.0217, + "step": 385 + }, + { + "epoch": 0.25, + "grad_norm": 0.15612861514091492, + "learning_rate": 0.0002950192353134819, + "loss": 0.0377, + "step": 386 + }, + { + "epoch": 0.25, + "grad_norm": 0.056646961718797684, + "learning_rate": 0.00029499285523792946, + "loss": 0.0133, + "step": 387 + }, + { + "epoch": 0.25, + "grad_norm": 0.23394975066184998, + "learning_rate": 0.000294966406672894, + "loss": 0.0767, + "step": 388 + }, + { + "epoch": 0.25, + "grad_norm": 0.21382953226566315, + "learning_rate": 0.00029493988963086895, + "loss": 0.0729, + "step": 389 + }, + { + "epoch": 0.26, + "grad_norm": 0.27641353011131287, + "learning_rate": 0.00029491330412438, + "loss": 0.1022, + "step": 390 + }, + { + "epoch": 0.26, + "grad_norm": 0.0760459303855896, + "learning_rate": 0.0002948866501659852, + "loss": 0.0269, + "step": 391 + }, + { + "epoch": 0.26, + "grad_norm": 0.5418729186058044, + "learning_rate": 0.0002948599277682748, + "loss": 0.1523, + "step": 392 + }, + { + "epoch": 0.26, + "grad_norm": 0.13234178721904755, + "learning_rate": 0.00029483313694387165, + "loss": 0.0292, + "step": 393 + }, + { + "epoch": 0.26, + "grad_norm": 0.07174021750688553, + "learning_rate": 0.00029480627770543086, + "loss": 0.0395, + "step": 394 + }, + { + "epoch": 0.26, + "grad_norm": 0.09958759695291519, + "learning_rate": 0.00029477935006563957, + "loss": 0.0559, + "step": 395 + }, + { + "epoch": 0.26, + "grad_norm": 0.07592346519231796, + "learning_rate": 0.00029475235403721763, + "loss": 0.0488, + "step": 396 + }, + { + "epoch": 0.26, + "grad_norm": 0.10129998624324799, + "learning_rate": 0.00029472528963291685, + "loss": 0.0287, + "step": 397 + }, + { + "epoch": 0.26, + "grad_norm": 0.08051212131977081, + "learning_rate": 0.00029469815686552163, + "loss": 0.0386, + "step": 398 + }, + { + "epoch": 0.26, + "grad_norm": 0.0695783942937851, + "learning_rate": 0.0002946709557478485, + "loss": 0.0201, + "step": 399 + }, + { + "epoch": 0.26, + "grad_norm": 0.1511554718017578, + "learning_rate": 0.00029464368629274624, + "loss": 0.0464, + "step": 400 + }, + { + "epoch": 0.26, + "grad_norm": 0.075484499335289, + "learning_rate": 0.00029461634851309597, + "loss": 0.031, + "step": 401 + }, + { + "epoch": 0.26, + "grad_norm": 0.08108027279376984, + "learning_rate": 0.00029458894242181114, + "loss": 0.0271, + "step": 402 + }, + { + "epoch": 0.26, + "grad_norm": 0.07254958897829056, + "learning_rate": 0.00029456146803183745, + "loss": 0.0187, + "step": 403 + }, + { + "epoch": 0.26, + "grad_norm": 0.215089812874794, + "learning_rate": 0.00029453392535615274, + "loss": 0.0463, + "step": 404 + }, + { + "epoch": 0.27, + "grad_norm": 0.034637995064258575, + "learning_rate": 0.0002945063144077672, + "loss": 0.0084, + "step": 405 + }, + { + "epoch": 0.27, + "grad_norm": 0.12073606252670288, + "learning_rate": 0.00029447863519972337, + "loss": 0.0401, + "step": 406 + }, + { + "epoch": 0.27, + "grad_norm": 0.13762198388576508, + "learning_rate": 0.00029445088774509583, + "loss": 0.0244, + "step": 407 + }, + { + "epoch": 0.27, + "grad_norm": 0.2537041902542114, + "learning_rate": 0.00029442307205699154, + "loss": 0.0574, + "step": 408 + }, + { + "epoch": 0.27, + "grad_norm": 0.1401953399181366, + "learning_rate": 0.00029439518814854956, + "loss": 0.0202, + "step": 409 + }, + { + "epoch": 0.27, + "grad_norm": 0.13872119784355164, + "learning_rate": 0.0002943672360329413, + "loss": 0.0373, + "step": 410 + }, + { + "epoch": 0.27, + "grad_norm": 0.3436320126056671, + "learning_rate": 0.00029433921572337044, + "loss": 0.0944, + "step": 411 + }, + { + "epoch": 0.27, + "grad_norm": 0.20004349946975708, + "learning_rate": 0.00029431112723307266, + "loss": 0.0625, + "step": 412 + }, + { + "epoch": 0.27, + "grad_norm": 0.10176026076078415, + "learning_rate": 0.00029428297057531607, + "loss": 0.023, + "step": 413 + }, + { + "epoch": 0.27, + "grad_norm": 0.08603208512067795, + "learning_rate": 0.0002942547457634008, + "loss": 0.0141, + "step": 414 + }, + { + "epoch": 0.27, + "grad_norm": 0.03601311519742012, + "learning_rate": 0.0002942264528106592, + "loss": 0.0071, + "step": 415 + }, + { + "epoch": 0.27, + "grad_norm": 0.1434870958328247, + "learning_rate": 0.000294198091730456, + "loss": 0.0362, + "step": 416 + }, + { + "epoch": 0.27, + "grad_norm": 0.1505521684885025, + "learning_rate": 0.0002941696625361879, + "loss": 0.0211, + "step": 417 + }, + { + "epoch": 0.27, + "grad_norm": 0.14390698075294495, + "learning_rate": 0.0002941411652412838, + "loss": 0.054, + "step": 418 + }, + { + "epoch": 0.27, + "grad_norm": 0.21683859825134277, + "learning_rate": 0.00029411259985920486, + "loss": 0.0482, + "step": 419 + }, + { + "epoch": 0.27, + "grad_norm": 0.12036791443824768, + "learning_rate": 0.0002940839664034444, + "loss": 0.0444, + "step": 420 + }, + { + "epoch": 0.28, + "grad_norm": 0.09479566663503647, + "learning_rate": 0.00029405526488752775, + "loss": 0.035, + "step": 421 + }, + { + "epoch": 0.28, + "grad_norm": 0.14229558408260345, + "learning_rate": 0.0002940264953250125, + "loss": 0.0573, + "step": 422 + }, + { + "epoch": 0.28, + "grad_norm": 0.22773970663547516, + "learning_rate": 0.00029399765772948844, + "loss": 0.061, + "step": 423 + }, + { + "epoch": 0.28, + "grad_norm": 0.11387961357831955, + "learning_rate": 0.0002939687521145774, + "loss": 0.057, + "step": 424 + }, + { + "epoch": 0.28, + "grad_norm": 0.1798745095729828, + "learning_rate": 0.00029393977849393333, + "loss": 0.0392, + "step": 425 + }, + { + "epoch": 0.28, + "grad_norm": 0.07203508168458939, + "learning_rate": 0.0002939107368812424, + "loss": 0.0152, + "step": 426 + }, + { + "epoch": 0.28, + "grad_norm": 0.04569177329540253, + "learning_rate": 0.0002938816272902228, + "loss": 0.0113, + "step": 427 + }, + { + "epoch": 0.28, + "grad_norm": 0.0927419438958168, + "learning_rate": 0.0002938524497346249, + "loss": 0.0246, + "step": 428 + }, + { + "epoch": 0.28, + "grad_norm": 0.16807597875595093, + "learning_rate": 0.0002938232042282311, + "loss": 0.0364, + "step": 429 + }, + { + "epoch": 0.28, + "grad_norm": 0.12006795406341553, + "learning_rate": 0.00029379389078485596, + "loss": 0.0118, + "step": 430 + }, + { + "epoch": 0.28, + "grad_norm": 0.0377679318189621, + "learning_rate": 0.0002937645094183461, + "loss": 0.0063, + "step": 431 + }, + { + "epoch": 0.28, + "grad_norm": 0.27051666378974915, + "learning_rate": 0.00029373506014258025, + "loss": 0.0682, + "step": 432 + }, + { + "epoch": 0.28, + "grad_norm": 0.228448748588562, + "learning_rate": 0.0002937055429714692, + "loss": 0.0733, + "step": 433 + }, + { + "epoch": 0.28, + "grad_norm": 0.18427824974060059, + "learning_rate": 0.00029367595791895577, + "loss": 0.0338, + "step": 434 + }, + { + "epoch": 0.28, + "grad_norm": 0.25813257694244385, + "learning_rate": 0.00029364630499901503, + "loss": 0.0323, + "step": 435 + }, + { + "epoch": 0.29, + "grad_norm": 0.17406705021858215, + "learning_rate": 0.0002936165842256538, + "loss": 0.0398, + "step": 436 + }, + { + "epoch": 0.29, + "grad_norm": 0.5199068188667297, + "learning_rate": 0.0002935867956129112, + "loss": 0.0486, + "step": 437 + }, + { + "epoch": 0.29, + "grad_norm": 0.3251938223838806, + "learning_rate": 0.0002935569391748583, + "loss": 0.049, + "step": 438 + }, + { + "epoch": 0.29, + "grad_norm": 0.057003892958164215, + "learning_rate": 0.00029352701492559827, + "loss": 0.0114, + "step": 439 + }, + { + "epoch": 0.29, + "grad_norm": 0.15188859403133392, + "learning_rate": 0.00029349702287926623, + "loss": 0.0323, + "step": 440 + }, + { + "epoch": 0.29, + "grad_norm": 0.17942048609256744, + "learning_rate": 0.0002934669630500293, + "loss": 0.0437, + "step": 441 + }, + { + "epoch": 0.29, + "grad_norm": 0.06396406143903732, + "learning_rate": 0.0002934368354520867, + "loss": 0.0097, + "step": 442 + }, + { + "epoch": 0.29, + "grad_norm": 0.1496248096227646, + "learning_rate": 0.00029340664009966974, + "loss": 0.0316, + "step": 443 + }, + { + "epoch": 0.29, + "grad_norm": 0.0654374286532402, + "learning_rate": 0.00029337637700704156, + "loss": 0.0083, + "step": 444 + }, + { + "epoch": 0.29, + "grad_norm": 0.04386695846915245, + "learning_rate": 0.0002933460461884973, + "loss": 0.0094, + "step": 445 + }, + { + "epoch": 0.29, + "grad_norm": 0.14928901195526123, + "learning_rate": 0.0002933156476583643, + "loss": 0.0484, + "step": 446 + }, + { + "epoch": 0.29, + "grad_norm": 0.12666364014148712, + "learning_rate": 0.0002932851814310017, + "loss": 0.0148, + "step": 447 + }, + { + "epoch": 0.29, + "grad_norm": 0.023791933432221413, + "learning_rate": 0.0002932546475208006, + "loss": 0.003, + "step": 448 + }, + { + "epoch": 0.29, + "grad_norm": 0.022256718948483467, + "learning_rate": 0.0002932240459421842, + "loss": 0.0044, + "step": 449 + }, + { + "epoch": 0.29, + "grad_norm": 0.12194914370775223, + "learning_rate": 0.0002931933767096076, + "loss": 0.009, + "step": 450 + }, + { + "epoch": 0.3, + "grad_norm": 0.29687178134918213, + "learning_rate": 0.0002931626398375578, + "loss": 0.0691, + "step": 451 + }, + { + "epoch": 0.3, + "grad_norm": 0.24758018553256989, + "learning_rate": 0.00029313183534055386, + "loss": 0.0589, + "step": 452 + }, + { + "epoch": 0.3, + "grad_norm": 0.10298270732164383, + "learning_rate": 0.0002931009632331468, + "loss": 0.0187, + "step": 453 + }, + { + "epoch": 0.3, + "grad_norm": 0.1447860449552536, + "learning_rate": 0.00029307002352991937, + "loss": 0.0297, + "step": 454 + }, + { + "epoch": 0.3, + "grad_norm": 0.2590334117412567, + "learning_rate": 0.00029303901624548644, + "loss": 0.0892, + "step": 455 + }, + { + "epoch": 0.3, + "grad_norm": 0.07339983433485031, + "learning_rate": 0.00029300794139449477, + "loss": 0.0249, + "step": 456 + }, + { + "epoch": 0.3, + "grad_norm": 0.16213186085224152, + "learning_rate": 0.000292976798991623, + "loss": 0.0493, + "step": 457 + }, + { + "epoch": 0.3, + "grad_norm": 0.03418932110071182, + "learning_rate": 0.0002929455890515818, + "loss": 0.0066, + "step": 458 + }, + { + "epoch": 0.3, + "grad_norm": 0.18771564960479736, + "learning_rate": 0.0002929143115891134, + "loss": 0.03, + "step": 459 + }, + { + "epoch": 0.3, + "grad_norm": 0.13976161181926727, + "learning_rate": 0.00029288296661899243, + "loss": 0.0451, + "step": 460 + }, + { + "epoch": 0.3, + "grad_norm": 0.07075387239456177, + "learning_rate": 0.00029285155415602495, + "loss": 0.0201, + "step": 461 + }, + { + "epoch": 0.3, + "grad_norm": 0.1304980367422104, + "learning_rate": 0.0002928200742150492, + "loss": 0.0286, + "step": 462 + }, + { + "epoch": 0.3, + "grad_norm": 0.06026493385434151, + "learning_rate": 0.00029278852681093514, + "loss": 0.0159, + "step": 463 + }, + { + "epoch": 0.3, + "grad_norm": 0.08018484711647034, + "learning_rate": 0.0002927569119585847, + "loss": 0.0333, + "step": 464 + }, + { + "epoch": 0.3, + "grad_norm": 0.21171532571315765, + "learning_rate": 0.0002927252296729315, + "loss": 0.034, + "step": 465 + }, + { + "epoch": 0.31, + "grad_norm": 0.14055241644382477, + "learning_rate": 0.0002926934799689413, + "loss": 0.0504, + "step": 466 + }, + { + "epoch": 0.31, + "grad_norm": 0.17434647679328918, + "learning_rate": 0.0002926616628616113, + "loss": 0.0519, + "step": 467 + }, + { + "epoch": 0.31, + "grad_norm": 0.12710362672805786, + "learning_rate": 0.00029262977836597105, + "loss": 0.0154, + "step": 468 + }, + { + "epoch": 0.31, + "grad_norm": 0.16046389937400818, + "learning_rate": 0.0002925978264970814, + "loss": 0.0398, + "step": 469 + }, + { + "epoch": 0.31, + "grad_norm": 0.23207533359527588, + "learning_rate": 0.00029256580727003543, + "loss": 0.0562, + "step": 470 + }, + { + "epoch": 0.31, + "grad_norm": 0.29609429836273193, + "learning_rate": 0.0002925337206999579, + "loss": 0.137, + "step": 471 + }, + { + "epoch": 0.31, + "grad_norm": 0.15176476538181305, + "learning_rate": 0.00029250156680200526, + "loss": 0.025, + "step": 472 + }, + { + "epoch": 0.31, + "grad_norm": 0.14394959807395935, + "learning_rate": 0.00029246934559136597, + "loss": 0.0519, + "step": 473 + }, + { + "epoch": 0.31, + "grad_norm": 0.08391053229570389, + "learning_rate": 0.00029243705708326015, + "loss": 0.0184, + "step": 474 + }, + { + "epoch": 0.31, + "grad_norm": 0.09384860098361969, + "learning_rate": 0.00029240470129293975, + "loss": 0.0229, + "step": 475 + }, + { + "epoch": 0.31, + "grad_norm": 0.12083159387111664, + "learning_rate": 0.00029237227823568845, + "loss": 0.0219, + "step": 476 + }, + { + "epoch": 0.31, + "grad_norm": 0.19567762315273285, + "learning_rate": 0.0002923397879268218, + "loss": 0.0728, + "step": 477 + }, + { + "epoch": 0.31, + "grad_norm": 0.07342015206813812, + "learning_rate": 0.0002923072303816871, + "loss": 0.0412, + "step": 478 + }, + { + "epoch": 0.31, + "grad_norm": 0.06717100739479065, + "learning_rate": 0.00029227460561566333, + "loss": 0.0309, + "step": 479 + }, + { + "epoch": 0.31, + "grad_norm": 0.09244221448898315, + "learning_rate": 0.0002922419136441613, + "loss": 0.0508, + "step": 480 + }, + { + "epoch": 0.31, + "grad_norm": 0.052494604140520096, + "learning_rate": 0.0002922091544826235, + "loss": 0.0319, + "step": 481 + }, + { + "epoch": 0.32, + "grad_norm": 0.14286155998706818, + "learning_rate": 0.00029217632814652417, + "loss": 0.0654, + "step": 482 + }, + { + "epoch": 0.32, + "grad_norm": 0.06442811340093613, + "learning_rate": 0.00029214343465136945, + "loss": 0.0132, + "step": 483 + }, + { + "epoch": 0.32, + "grad_norm": 0.05420248210430145, + "learning_rate": 0.0002921104740126969, + "loss": 0.0115, + "step": 484 + }, + { + "epoch": 0.32, + "grad_norm": 0.04951406642794609, + "learning_rate": 0.0002920774462460761, + "loss": 0.0086, + "step": 485 + }, + { + "epoch": 0.32, + "grad_norm": 0.08321358263492584, + "learning_rate": 0.00029204435136710803, + "loss": 0.0445, + "step": 486 + }, + { + "epoch": 0.32, + "grad_norm": 0.11665898561477661, + "learning_rate": 0.0002920111893914257, + "loss": 0.0262, + "step": 487 + }, + { + "epoch": 0.32, + "grad_norm": 0.1829105019569397, + "learning_rate": 0.00029197796033469356, + "loss": 0.0308, + "step": 488 + }, + { + "epoch": 0.32, + "grad_norm": 0.20940159261226654, + "learning_rate": 0.00029194466421260786, + "loss": 0.0299, + "step": 489 + }, + { + "epoch": 0.32, + "grad_norm": 0.20697347819805145, + "learning_rate": 0.0002919113010408965, + "loss": 0.0405, + "step": 490 + }, + { + "epoch": 0.32, + "grad_norm": 0.051994968205690384, + "learning_rate": 0.000291877870835319, + "loss": 0.01, + "step": 491 + }, + { + "epoch": 0.32, + "grad_norm": 0.1463523805141449, + "learning_rate": 0.00029184437361166676, + "loss": 0.0555, + "step": 492 + }, + { + "epoch": 0.32, + "grad_norm": 0.09110219031572342, + "learning_rate": 0.00029181080938576255, + "loss": 0.0371, + "step": 493 + }, + { + "epoch": 0.32, + "grad_norm": 0.04076121374964714, + "learning_rate": 0.00029177717817346097, + "loss": 0.0065, + "step": 494 + }, + { + "epoch": 0.32, + "grad_norm": 0.11555450409650803, + "learning_rate": 0.0002917434799906482, + "loss": 0.0115, + "step": 495 + }, + { + "epoch": 0.32, + "grad_norm": 0.15579824149608612, + "learning_rate": 0.0002917097148532421, + "loss": 0.0332, + "step": 496 + }, + { + "epoch": 0.33, + "grad_norm": 0.41938668489456177, + "learning_rate": 0.000291675882777192, + "loss": 0.0678, + "step": 497 + }, + { + "epoch": 0.33, + "grad_norm": 0.16764874756336212, + "learning_rate": 0.0002916419837784791, + "loss": 0.0683, + "step": 498 + }, + { + "epoch": 0.33, + "grad_norm": 0.1291145384311676, + "learning_rate": 0.00029160801787311613, + "loss": 0.0376, + "step": 499 + }, + { + "epoch": 0.33, + "grad_norm": 0.06120933219790459, + "learning_rate": 0.0002915739850771472, + "loss": 0.0307, + "step": 500 + }, + { + "epoch": 0.33, + "grad_norm": 0.09218423068523407, + "learning_rate": 0.0002915398854066483, + "loss": 0.0545, + "step": 501 + }, + { + "epoch": 0.33, + "grad_norm": 0.12664952874183655, + "learning_rate": 0.00029150571887772694, + "loss": 0.0274, + "step": 502 + }, + { + "epoch": 0.33, + "grad_norm": 0.0705379918217659, + "learning_rate": 0.0002914714855065221, + "loss": 0.0198, + "step": 503 + }, + { + "epoch": 0.33, + "grad_norm": 0.03559693694114685, + "learning_rate": 0.00029143718530920447, + "loss": 0.0114, + "step": 504 + }, + { + "epoch": 0.33, + "grad_norm": 0.051283448934555054, + "learning_rate": 0.0002914028183019762, + "loss": 0.0327, + "step": 505 + }, + { + "epoch": 0.33, + "grad_norm": 0.12527117133140564, + "learning_rate": 0.0002913683845010711, + "loss": 0.0316, + "step": 506 + }, + { + "epoch": 0.33, + "grad_norm": 0.0627032071352005, + "learning_rate": 0.0002913338839227544, + "loss": 0.0185, + "step": 507 + }, + { + "epoch": 0.33, + "grad_norm": 0.07235468178987503, + "learning_rate": 0.000291299316583323, + "loss": 0.0605, + "step": 508 + }, + { + "epoch": 0.33, + "grad_norm": 0.07697612792253494, + "learning_rate": 0.0002912646824991053, + "loss": 0.031, + "step": 509 + }, + { + "epoch": 0.33, + "grad_norm": 0.08240342885255814, + "learning_rate": 0.0002912299816864612, + "loss": 0.0211, + "step": 510 + }, + { + "epoch": 0.33, + "grad_norm": 0.07725581526756287, + "learning_rate": 0.0002911952141617821, + "loss": 0.0311, + "step": 511 + }, + { + "epoch": 0.34, + "grad_norm": 0.14777988195419312, + "learning_rate": 0.000291160379941491, + "loss": 0.038, + "step": 512 + }, + { + "epoch": 0.34, + "grad_norm": 0.11423151195049286, + "learning_rate": 0.0002911254790420423, + "loss": 0.0594, + "step": 513 + }, + { + "epoch": 0.34, + "grad_norm": 0.07308260351419449, + "learning_rate": 0.000291090511479922, + "loss": 0.0416, + "step": 514 + }, + { + "epoch": 0.34, + "grad_norm": 0.11171098798513412, + "learning_rate": 0.00029105547727164747, + "loss": 0.0509, + "step": 515 + }, + { + "epoch": 0.34, + "grad_norm": 0.29647496342658997, + "learning_rate": 0.00029102037643376764, + "loss": 0.0421, + "step": 516 + }, + { + "epoch": 0.34, + "grad_norm": 0.08812320232391357, + "learning_rate": 0.00029098520898286303, + "loss": 0.0559, + "step": 517 + }, + { + "epoch": 0.34, + "grad_norm": 0.13493718206882477, + "learning_rate": 0.00029094997493554525, + "loss": 0.0257, + "step": 518 + }, + { + "epoch": 0.34, + "grad_norm": 0.1292780339717865, + "learning_rate": 0.0002909146743084579, + "loss": 0.0699, + "step": 519 + }, + { + "epoch": 0.34, + "grad_norm": 0.03736162185668945, + "learning_rate": 0.0002908793071182755, + "loss": 0.0113, + "step": 520 + }, + { + "epoch": 0.34, + "grad_norm": 0.20628990232944489, + "learning_rate": 0.00029084387338170435, + "loss": 0.1039, + "step": 521 + }, + { + "epoch": 0.34, + "grad_norm": 0.13702163100242615, + "learning_rate": 0.0002908083731154821, + "loss": 0.0715, + "step": 522 + }, + { + "epoch": 0.34, + "grad_norm": 0.10376426577568054, + "learning_rate": 0.0002907728063363779, + "loss": 0.0566, + "step": 523 + }, + { + "epoch": 0.34, + "grad_norm": 0.03796597197651863, + "learning_rate": 0.00029073717306119206, + "loss": 0.0131, + "step": 524 + }, + { + "epoch": 0.34, + "grad_norm": 0.12588168680667877, + "learning_rate": 0.0002907014733067566, + "loss": 0.0754, + "step": 525 + }, + { + "epoch": 0.34, + "grad_norm": 0.18614119291305542, + "learning_rate": 0.00029066570708993474, + "loss": 0.0839, + "step": 526 + }, + { + "epoch": 0.35, + "grad_norm": 0.08624828606843948, + "learning_rate": 0.0002906298744276212, + "loss": 0.0519, + "step": 527 + }, + { + "epoch": 0.35, + "grad_norm": 0.09907104074954987, + "learning_rate": 0.00029059397533674216, + "loss": 0.0554, + "step": 528 + }, + { + "epoch": 0.35, + "grad_norm": 0.05135316029191017, + "learning_rate": 0.00029055800983425494, + "loss": 0.0374, + "step": 529 + }, + { + "epoch": 0.35, + "grad_norm": 0.10954371839761734, + "learning_rate": 0.00029052197793714844, + "loss": 0.03, + "step": 530 + }, + { + "epoch": 0.35, + "grad_norm": 0.13733310997486115, + "learning_rate": 0.0002904858796624428, + "loss": 0.0345, + "step": 531 + }, + { + "epoch": 0.35, + "grad_norm": 0.09171781688928604, + "learning_rate": 0.00029044971502718966, + "loss": 0.0285, + "step": 532 + }, + { + "epoch": 0.35, + "grad_norm": 0.08643066138029099, + "learning_rate": 0.00029041348404847177, + "loss": 0.0225, + "step": 533 + }, + { + "epoch": 0.35, + "grad_norm": 0.3179713487625122, + "learning_rate": 0.00029037718674340343, + "loss": 0.1167, + "step": 534 + }, + { + "epoch": 0.35, + "grad_norm": 0.09737833589315414, + "learning_rate": 0.0002903408231291303, + "loss": 0.047, + "step": 535 + }, + { + "epoch": 0.35, + "grad_norm": 0.15587852895259857, + "learning_rate": 0.00029030439322282904, + "loss": 0.0406, + "step": 536 + }, + { + "epoch": 0.35, + "grad_norm": 0.07560009509325027, + "learning_rate": 0.0002902678970417081, + "loss": 0.0387, + "step": 537 + }, + { + "epoch": 0.35, + "grad_norm": 0.12732967734336853, + "learning_rate": 0.00029023133460300677, + "loss": 0.0434, + "step": 538 + }, + { + "epoch": 0.35, + "grad_norm": 0.06021510064601898, + "learning_rate": 0.00029019470592399593, + "loss": 0.0149, + "step": 539 + }, + { + "epoch": 0.35, + "grad_norm": 0.09609080851078033, + "learning_rate": 0.0002901580110219777, + "loss": 0.0203, + "step": 540 + }, + { + "epoch": 0.35, + "grad_norm": 0.1442640721797943, + "learning_rate": 0.0002901212499142854, + "loss": 0.0345, + "step": 541 + }, + { + "epoch": 0.35, + "grad_norm": 0.15236537158489227, + "learning_rate": 0.0002900844226182837, + "loss": 0.041, + "step": 542 + }, + { + "epoch": 0.36, + "grad_norm": 0.14138057827949524, + "learning_rate": 0.00029004752915136854, + "loss": 0.0413, + "step": 543 + }, + { + "epoch": 0.36, + "grad_norm": 0.16659876704216003, + "learning_rate": 0.000290010569530967, + "loss": 0.0202, + "step": 544 + }, + { + "epoch": 0.36, + "grad_norm": 0.16970619559288025, + "learning_rate": 0.0002899735437745376, + "loss": 0.0373, + "step": 545 + }, + { + "epoch": 0.36, + "grad_norm": 0.044596217572689056, + "learning_rate": 0.00028993645189956987, + "loss": 0.0202, + "step": 546 + }, + { + "epoch": 0.36, + "grad_norm": 0.07182051986455917, + "learning_rate": 0.00028989929392358484, + "loss": 0.0137, + "step": 547 + }, + { + "epoch": 0.36, + "grad_norm": 0.2593410313129425, + "learning_rate": 0.0002898620698641345, + "loss": 0.0373, + "step": 548 + }, + { + "epoch": 0.36, + "grad_norm": 0.17339394986629486, + "learning_rate": 0.0002898247797388023, + "loss": 0.0217, + "step": 549 + }, + { + "epoch": 0.36, + "grad_norm": 0.13247337937355042, + "learning_rate": 0.00028978742356520256, + "loss": 0.0621, + "step": 550 + }, + { + "epoch": 0.36, + "grad_norm": 0.04582560807466507, + "learning_rate": 0.00028975000136098123, + "loss": 0.0051, + "step": 551 + }, + { + "epoch": 0.36, + "grad_norm": 0.04409830644726753, + "learning_rate": 0.0002897125131438151, + "loss": 0.0042, + "step": 552 + }, + { + "epoch": 0.36, + "grad_norm": 0.11188169568777084, + "learning_rate": 0.0002896749589314123, + "loss": 0.0307, + "step": 553 + }, + { + "epoch": 0.36, + "grad_norm": 0.10103113949298859, + "learning_rate": 0.00028963733874151225, + "loss": 0.0132, + "step": 554 + }, + { + "epoch": 0.36, + "grad_norm": 0.13099652528762817, + "learning_rate": 0.0002895996525918852, + "loss": 0.0348, + "step": 555 + }, + { + "epoch": 0.36, + "grad_norm": 0.07826762646436691, + "learning_rate": 0.0002895619005003328, + "loss": 0.0232, + "step": 556 + }, + { + "epoch": 0.36, + "grad_norm": 0.053435299545526505, + "learning_rate": 0.00028952408248468785, + "loss": 0.0113, + "step": 557 + }, + { + "epoch": 0.37, + "grad_norm": 0.07408218830823898, + "learning_rate": 0.00028948619856281423, + "loss": 0.0099, + "step": 558 + }, + { + "epoch": 0.37, + "grad_norm": 0.08491642028093338, + "learning_rate": 0.00028944824875260693, + "loss": 0.0122, + "step": 559 + }, + { + "epoch": 0.37, + "grad_norm": 0.0294903963804245, + "learning_rate": 0.00028941023307199214, + "loss": 0.0044, + "step": 560 + }, + { + "epoch": 0.37, + "grad_norm": 0.16142538189888, + "learning_rate": 0.000289372151538927, + "loss": 0.0721, + "step": 561 + }, + { + "epoch": 0.37, + "grad_norm": 0.11368390917778015, + "learning_rate": 0.0002893340041714, + "loss": 0.0109, + "step": 562 + }, + { + "epoch": 0.37, + "grad_norm": 0.1799473911523819, + "learning_rate": 0.0002892957909874306, + "loss": 0.0487, + "step": 563 + }, + { + "epoch": 0.37, + "grad_norm": 0.1448475420475006, + "learning_rate": 0.0002892575120050693, + "loss": 0.0601, + "step": 564 + }, + { + "epoch": 0.37, + "grad_norm": 0.07079991698265076, + "learning_rate": 0.00028921916724239773, + "loss": 0.0089, + "step": 565 + }, + { + "epoch": 0.37, + "grad_norm": 0.13462460041046143, + "learning_rate": 0.0002891807567175287, + "loss": 0.0361, + "step": 566 + }, + { + "epoch": 0.37, + "grad_norm": 0.08166678249835968, + "learning_rate": 0.00028914228044860584, + "loss": 0.0412, + "step": 567 + }, + { + "epoch": 0.37, + "grad_norm": 0.09470119327306747, + "learning_rate": 0.00028910373845380405, + "loss": 0.036, + "step": 568 + }, + { + "epoch": 0.37, + "grad_norm": 0.0957297682762146, + "learning_rate": 0.00028906513075132917, + "loss": 0.0302, + "step": 569 + }, + { + "epoch": 0.37, + "grad_norm": 0.17004123330116272, + "learning_rate": 0.00028902645735941814, + "loss": 0.0559, + "step": 570 + }, + { + "epoch": 0.37, + "grad_norm": 0.10910087823867798, + "learning_rate": 0.0002889877182963389, + "loss": 0.0765, + "step": 571 + }, + { + "epoch": 0.37, + "grad_norm": 0.1027827113866806, + "learning_rate": 0.0002889489135803904, + "loss": 0.0261, + "step": 572 + }, + { + "epoch": 0.38, + "grad_norm": 0.1182394027709961, + "learning_rate": 0.00028891004322990254, + "loss": 0.0413, + "step": 573 + }, + { + "epoch": 0.38, + "grad_norm": 0.08422794938087463, + "learning_rate": 0.00028887110726323644, + "loss": 0.048, + "step": 574 + }, + { + "epoch": 0.38, + "grad_norm": 0.10699556767940521, + "learning_rate": 0.00028883210569878397, + "loss": 0.0193, + "step": 575 + }, + { + "epoch": 0.38, + "grad_norm": 0.06325127184391022, + "learning_rate": 0.00028879303855496805, + "loss": 0.0248, + "step": 576 + }, + { + "epoch": 0.38, + "grad_norm": 0.10081582516431808, + "learning_rate": 0.00028875390585024274, + "loss": 0.0211, + "step": 577 + }, + { + "epoch": 0.38, + "grad_norm": 0.062216054648160934, + "learning_rate": 0.00028871470760309285, + "loss": 0.0185, + "step": 578 + }, + { + "epoch": 0.38, + "grad_norm": 0.086198590695858, + "learning_rate": 0.00028867544383203423, + "loss": 0.0544, + "step": 579 + }, + { + "epoch": 0.38, + "grad_norm": 0.11464603990316391, + "learning_rate": 0.00028863611455561374, + "loss": 0.0482, + "step": 580 + }, + { + "epoch": 0.38, + "grad_norm": 0.1089998185634613, + "learning_rate": 0.0002885967197924092, + "loss": 0.0496, + "step": 581 + }, + { + "epoch": 0.38, + "grad_norm": 0.1297656148672104, + "learning_rate": 0.00028855725956102913, + "loss": 0.0286, + "step": 582 + }, + { + "epoch": 0.38, + "grad_norm": 0.12966851890087128, + "learning_rate": 0.0002885177338801133, + "loss": 0.0271, + "step": 583 + }, + { + "epoch": 0.38, + "grad_norm": 0.1413564682006836, + "learning_rate": 0.00028847814276833215, + "loss": 0.0334, + "step": 584 + }, + { + "epoch": 0.38, + "grad_norm": 0.08366623520851135, + "learning_rate": 0.0002884384862443871, + "loss": 0.0252, + "step": 585 + }, + { + "epoch": 0.38, + "grad_norm": 0.11143944412469864, + "learning_rate": 0.0002883987643270106, + "loss": 0.0347, + "step": 586 + }, + { + "epoch": 0.38, + "grad_norm": 0.018316002562642097, + "learning_rate": 0.0002883589770349658, + "loss": 0.0041, + "step": 587 + }, + { + "epoch": 0.38, + "grad_norm": 0.02275553159415722, + "learning_rate": 0.0002883191243870467, + "loss": 0.0049, + "step": 588 + }, + { + "epoch": 0.39, + "grad_norm": 0.14462235569953918, + "learning_rate": 0.0002882792064020785, + "loss": 0.0745, + "step": 589 + }, + { + "epoch": 0.39, + "grad_norm": 0.10231613367795944, + "learning_rate": 0.0002882392230989169, + "loss": 0.0211, + "step": 590 + }, + { + "epoch": 0.39, + "grad_norm": 0.013464580290019512, + "learning_rate": 0.00028819917449644865, + "loss": 0.0027, + "step": 591 + }, + { + "epoch": 0.39, + "grad_norm": 0.1707848161458969, + "learning_rate": 0.0002881590606135912, + "loss": 0.0292, + "step": 592 + }, + { + "epoch": 0.39, + "grad_norm": 0.021210182458162308, + "learning_rate": 0.00028811888146929303, + "loss": 0.0034, + "step": 593 + }, + { + "epoch": 0.39, + "grad_norm": 0.09697694331407547, + "learning_rate": 0.00028807863708253326, + "loss": 0.0134, + "step": 594 + }, + { + "epoch": 0.39, + "grad_norm": 0.014497664757072926, + "learning_rate": 0.000288038327472322, + "loss": 0.0033, + "step": 595 + }, + { + "epoch": 0.39, + "grad_norm": 0.25384795665740967, + "learning_rate": 0.00028799795265770003, + "loss": 0.0258, + "step": 596 + }, + { + "epoch": 0.39, + "grad_norm": 0.0065186647698283195, + "learning_rate": 0.00028795751265773894, + "loss": 0.0012, + "step": 597 + }, + { + "epoch": 0.39, + "grad_norm": 0.03637157753109932, + "learning_rate": 0.00028791700749154124, + "loss": 0.004, + "step": 598 + }, + { + "epoch": 0.39, + "grad_norm": 0.039990831166505814, + "learning_rate": 0.00028787643717824007, + "loss": 0.0067, + "step": 599 + }, + { + "epoch": 0.39, + "grad_norm": 0.18821458518505096, + "learning_rate": 0.0002878358017369994, + "loss": 0.0233, + "step": 600 + }, + { + "epoch": 0.39, + "grad_norm": 0.12891234457492828, + "learning_rate": 0.00028779510118701404, + "loss": 0.0121, + "step": 601 + }, + { + "epoch": 0.39, + "grad_norm": 0.1731066256761551, + "learning_rate": 0.0002877543355475094, + "loss": 0.0535, + "step": 602 + }, + { + "epoch": 0.39, + "grad_norm": 0.5192031264305115, + "learning_rate": 0.0002877135048377418, + "loss": 0.1073, + "step": 603 + }, + { + "epoch": 0.4, + "grad_norm": 0.13350637257099152, + "learning_rate": 0.0002876726090769982, + "loss": 0.0157, + "step": 604 + }, + { + "epoch": 0.4, + "grad_norm": 0.12136203050613403, + "learning_rate": 0.0002876316482845963, + "loss": 0.0132, + "step": 605 + }, + { + "epoch": 0.4, + "grad_norm": 0.5036077499389648, + "learning_rate": 0.0002875906224798844, + "loss": 0.1366, + "step": 606 + }, + { + "epoch": 0.4, + "grad_norm": 0.22896146774291992, + "learning_rate": 0.0002875495316822419, + "loss": 0.08, + "step": 607 + }, + { + "epoch": 0.4, + "grad_norm": 0.15327180922031403, + "learning_rate": 0.0002875083759110785, + "loss": 0.0322, + "step": 608 + }, + { + "epoch": 0.4, + "grad_norm": 0.0520663745701313, + "learning_rate": 0.0002874671551858346, + "loss": 0.0202, + "step": 609 + }, + { + "epoch": 0.4, + "grad_norm": 0.08731318265199661, + "learning_rate": 0.00028742586952598155, + "loss": 0.0414, + "step": 610 + }, + { + "epoch": 0.4, + "grad_norm": 0.11570514738559723, + "learning_rate": 0.0002873845189510213, + "loss": 0.0625, + "step": 611 + }, + { + "epoch": 0.4, + "grad_norm": 0.1604083925485611, + "learning_rate": 0.0002873431034804862, + "loss": 0.0644, + "step": 612 + }, + { + "epoch": 0.4, + "grad_norm": 0.06147552654147148, + "learning_rate": 0.0002873016231339396, + "loss": 0.0168, + "step": 613 + }, + { + "epoch": 0.4, + "grad_norm": 0.12419867515563965, + "learning_rate": 0.00028726007793097527, + "loss": 0.0438, + "step": 614 + }, + { + "epoch": 0.4, + "grad_norm": 0.06133590638637543, + "learning_rate": 0.0002872184678912177, + "loss": 0.024, + "step": 615 + }, + { + "epoch": 0.4, + "grad_norm": 0.10245617479085922, + "learning_rate": 0.00028717679303432207, + "loss": 0.0468, + "step": 616 + }, + { + "epoch": 0.4, + "grad_norm": 0.11957762390375137, + "learning_rate": 0.000287135053379974, + "loss": 0.0442, + "step": 617 + }, + { + "epoch": 0.4, + "grad_norm": 0.12896914780139923, + "learning_rate": 0.0002870932489478899, + "loss": 0.019, + "step": 618 + }, + { + "epoch": 0.41, + "grad_norm": 0.1816866546869278, + "learning_rate": 0.0002870513797578167, + "loss": 0.0465, + "step": 619 + }, + { + "epoch": 0.41, + "grad_norm": 0.3061673045158386, + "learning_rate": 0.00028700944582953184, + "loss": 0.0356, + "step": 620 + }, + { + "epoch": 0.41, + "grad_norm": 0.12940478324890137, + "learning_rate": 0.0002869674471828435, + "loss": 0.0447, + "step": 621 + }, + { + "epoch": 0.41, + "grad_norm": 0.2569711208343506, + "learning_rate": 0.0002869253838375903, + "loss": 0.0383, + "step": 622 + }, + { + "epoch": 0.41, + "grad_norm": 0.17063623666763306, + "learning_rate": 0.0002868832558136415, + "loss": 0.0394, + "step": 623 + }, + { + "epoch": 0.41, + "grad_norm": 0.16775226593017578, + "learning_rate": 0.00028684106313089686, + "loss": 0.0314, + "step": 624 + }, + { + "epoch": 0.41, + "grad_norm": 0.12676480412483215, + "learning_rate": 0.00028679880580928676, + "loss": 0.0397, + "step": 625 + }, + { + "epoch": 0.41, + "grad_norm": 0.19791187345981598, + "learning_rate": 0.0002867564838687721, + "loss": 0.0668, + "step": 626 + }, + { + "epoch": 0.41, + "grad_norm": 0.18982940912246704, + "learning_rate": 0.0002867140973293441, + "loss": 0.0472, + "step": 627 + }, + { + "epoch": 0.41, + "grad_norm": 0.06308908015489578, + "learning_rate": 0.00028667164621102475, + "loss": 0.0166, + "step": 628 + }, + { + "epoch": 0.41, + "grad_norm": 0.09570673853158951, + "learning_rate": 0.0002866291305338665, + "loss": 0.0156, + "step": 629 + }, + { + "epoch": 0.41, + "grad_norm": 0.12950573861598969, + "learning_rate": 0.00028658655031795215, + "loss": 0.0381, + "step": 630 + }, + { + "epoch": 0.41, + "grad_norm": 0.30905017256736755, + "learning_rate": 0.00028654390558339516, + "loss": 0.0386, + "step": 631 + }, + { + "epoch": 0.41, + "grad_norm": 0.2680380940437317, + "learning_rate": 0.0002865011963503394, + "loss": 0.0307, + "step": 632 + }, + { + "epoch": 0.41, + "grad_norm": 0.15153923630714417, + "learning_rate": 0.00028645842263895916, + "loss": 0.0448, + "step": 633 + }, + { + "epoch": 0.42, + "grad_norm": 0.06900045275688171, + "learning_rate": 0.0002864155844694592, + "loss": 0.0134, + "step": 634 + }, + { + "epoch": 0.42, + "grad_norm": 0.39054739475250244, + "learning_rate": 0.00028637268186207474, + "loss": 0.0562, + "step": 635 + }, + { + "epoch": 0.42, + "grad_norm": 0.06766320765018463, + "learning_rate": 0.0002863297148370716, + "loss": 0.0135, + "step": 636 + }, + { + "epoch": 0.42, + "grad_norm": 0.12230436503887177, + "learning_rate": 0.0002862866834147457, + "loss": 0.0189, + "step": 637 + }, + { + "epoch": 0.42, + "grad_norm": 0.10021094232797623, + "learning_rate": 0.00028624358761542365, + "loss": 0.021, + "step": 638 + }, + { + "epoch": 0.42, + "grad_norm": 0.1645062267780304, + "learning_rate": 0.0002862004274594623, + "loss": 0.0284, + "step": 639 + }, + { + "epoch": 0.42, + "grad_norm": 0.3108697831630707, + "learning_rate": 0.00028615720296724906, + "loss": 0.0792, + "step": 640 + }, + { + "epoch": 0.42, + "grad_norm": 0.12834666669368744, + "learning_rate": 0.0002861139141592017, + "loss": 0.0162, + "step": 641 + }, + { + "epoch": 0.42, + "grad_norm": 0.11455690860748291, + "learning_rate": 0.00028607056105576806, + "loss": 0.0374, + "step": 642 + }, + { + "epoch": 0.42, + "grad_norm": 0.14810198545455933, + "learning_rate": 0.0002860271436774269, + "loss": 0.0132, + "step": 643 + }, + { + "epoch": 0.42, + "grad_norm": 0.1764562875032425, + "learning_rate": 0.00028598366204468694, + "loss": 0.0641, + "step": 644 + }, + { + "epoch": 0.42, + "grad_norm": 0.10819990932941437, + "learning_rate": 0.0002859401161780873, + "loss": 0.036, + "step": 645 + }, + { + "epoch": 0.42, + "grad_norm": 0.10301560163497925, + "learning_rate": 0.00028589650609819764, + "loss": 0.0272, + "step": 646 + }, + { + "epoch": 0.42, + "grad_norm": 0.13949047029018402, + "learning_rate": 0.00028585283182561773, + "loss": 0.0396, + "step": 647 + }, + { + "epoch": 0.42, + "grad_norm": 0.20076854526996613, + "learning_rate": 0.0002858090933809777, + "loss": 0.0304, + "step": 648 + }, + { + "epoch": 0.42, + "grad_norm": 0.12382891029119492, + "learning_rate": 0.0002857652907849381, + "loss": 0.0317, + "step": 649 + }, + { + "epoch": 0.43, + "grad_norm": 0.03410351276397705, + "learning_rate": 0.0002857214240581897, + "loss": 0.0075, + "step": 650 + }, + { + "epoch": 0.43, + "grad_norm": 0.10016089677810669, + "learning_rate": 0.00028567749322145367, + "loss": 0.0179, + "step": 651 + }, + { + "epoch": 0.43, + "grad_norm": 0.24712401628494263, + "learning_rate": 0.00028563349829548125, + "loss": 0.0857, + "step": 652 + }, + { + "epoch": 0.43, + "grad_norm": 0.10354748368263245, + "learning_rate": 0.00028558943930105413, + "loss": 0.0276, + "step": 653 + }, + { + "epoch": 0.43, + "grad_norm": 0.13952110707759857, + "learning_rate": 0.00028554531625898434, + "loss": 0.0352, + "step": 654 + }, + { + "epoch": 0.43, + "grad_norm": 0.25892096757888794, + "learning_rate": 0.0002855011291901138, + "loss": 0.0635, + "step": 655 + }, + { + "epoch": 0.43, + "grad_norm": 0.1324494630098343, + "learning_rate": 0.0002854568781153151, + "loss": 0.0404, + "step": 656 + }, + { + "epoch": 0.43, + "grad_norm": 0.2835068702697754, + "learning_rate": 0.0002854125630554908, + "loss": 0.0913, + "step": 657 + }, + { + "epoch": 0.43, + "grad_norm": 0.06329616159200668, + "learning_rate": 0.00028536818403157387, + "loss": 0.0146, + "step": 658 + }, + { + "epoch": 0.43, + "grad_norm": 0.07758588343858719, + "learning_rate": 0.0002853237410645272, + "loss": 0.022, + "step": 659 + }, + { + "epoch": 0.43, + "grad_norm": 0.0839746966958046, + "learning_rate": 0.00028527923417534425, + "loss": 0.0175, + "step": 660 + }, + { + "epoch": 0.43, + "grad_norm": 0.06847698986530304, + "learning_rate": 0.0002852346633850484, + "loss": 0.0257, + "step": 661 + }, + { + "epoch": 0.43, + "grad_norm": 0.05117741599678993, + "learning_rate": 0.0002851900287146933, + "loss": 0.0136, + "step": 662 + }, + { + "epoch": 0.43, + "grad_norm": 0.12874063849449158, + "learning_rate": 0.0002851453301853628, + "loss": 0.0525, + "step": 663 + }, + { + "epoch": 0.43, + "grad_norm": 0.1822301298379898, + "learning_rate": 0.000285100567818171, + "loss": 0.0889, + "step": 664 + }, + { + "epoch": 0.44, + "grad_norm": 0.11295532435178757, + "learning_rate": 0.0002850557416342619, + "loss": 0.0242, + "step": 665 + }, + { + "epoch": 0.44, + "grad_norm": 0.13836829364299774, + "learning_rate": 0.0002850108516548099, + "loss": 0.0441, + "step": 666 + }, + { + "epoch": 0.44, + "grad_norm": 0.10722105205059052, + "learning_rate": 0.0002849658979010194, + "loss": 0.0401, + "step": 667 + }, + { + "epoch": 0.44, + "grad_norm": 0.1335124373435974, + "learning_rate": 0.000284920880394125, + "loss": 0.0355, + "step": 668 + }, + { + "epoch": 0.44, + "grad_norm": 0.0793779119849205, + "learning_rate": 0.00028487579915539136, + "loss": 0.0653, + "step": 669 + }, + { + "epoch": 0.44, + "grad_norm": 0.06469617038965225, + "learning_rate": 0.00028483065420611313, + "loss": 0.0212, + "step": 670 + }, + { + "epoch": 0.44, + "grad_norm": 0.11224417388439178, + "learning_rate": 0.0002847854455676154, + "loss": 0.0689, + "step": 671 + }, + { + "epoch": 0.44, + "grad_norm": 0.08650530874729156, + "learning_rate": 0.00028474017326125296, + "loss": 0.0301, + "step": 672 + }, + { + "epoch": 0.44, + "grad_norm": 0.0688636302947998, + "learning_rate": 0.0002846948373084109, + "loss": 0.0161, + "step": 673 + }, + { + "epoch": 0.44, + "grad_norm": 0.13195598125457764, + "learning_rate": 0.0002846494377305043, + "loss": 0.0529, + "step": 674 + }, + { + "epoch": 0.44, + "grad_norm": 0.1887226700782776, + "learning_rate": 0.0002846039745489783, + "loss": 0.0615, + "step": 675 + }, + { + "epoch": 0.44, + "grad_norm": 0.06736018508672714, + "learning_rate": 0.0002845584477853082, + "loss": 0.0246, + "step": 676 + }, + { + "epoch": 0.44, + "grad_norm": 0.1488025039434433, + "learning_rate": 0.0002845128574609992, + "loss": 0.0361, + "step": 677 + }, + { + "epoch": 0.44, + "grad_norm": 0.09811149537563324, + "learning_rate": 0.0002844672035975864, + "loss": 0.0228, + "step": 678 + }, + { + "epoch": 0.44, + "grad_norm": 0.06320784986019135, + "learning_rate": 0.0002844214862166352, + "loss": 0.0182, + "step": 679 + }, + { + "epoch": 0.45, + "grad_norm": 0.0695585086941719, + "learning_rate": 0.00028437570533974084, + "loss": 0.0393, + "step": 680 + }, + { + "epoch": 0.45, + "grad_norm": 0.08886481821537018, + "learning_rate": 0.00028432986098852857, + "loss": 0.0293, + "step": 681 + }, + { + "epoch": 0.45, + "grad_norm": 0.09019115567207336, + "learning_rate": 0.0002842839531846537, + "loss": 0.0436, + "step": 682 + }, + { + "epoch": 0.45, + "grad_norm": 0.1718403697013855, + "learning_rate": 0.0002842379819498013, + "loss": 0.0512, + "step": 683 + }, + { + "epoch": 0.45, + "grad_norm": 0.1692350208759308, + "learning_rate": 0.0002841919473056867, + "loss": 0.0637, + "step": 684 + }, + { + "epoch": 0.45, + "grad_norm": 0.15840108692646027, + "learning_rate": 0.00028414584927405497, + "loss": 0.0224, + "step": 685 + }, + { + "epoch": 0.45, + "grad_norm": 0.12710994482040405, + "learning_rate": 0.0002840996878766812, + "loss": 0.042, + "step": 686 + }, + { + "epoch": 0.45, + "grad_norm": 0.07157866656780243, + "learning_rate": 0.0002840534631353704, + "loss": 0.0279, + "step": 687 + }, + { + "epoch": 0.45, + "grad_norm": 0.046079106628894806, + "learning_rate": 0.0002840071750719575, + "loss": 0.0093, + "step": 688 + }, + { + "epoch": 0.45, + "grad_norm": 0.09579090774059296, + "learning_rate": 0.00028396082370830733, + "loss": 0.027, + "step": 689 + }, + { + "epoch": 0.45, + "grad_norm": 0.201382577419281, + "learning_rate": 0.0002839144090663146, + "loss": 0.072, + "step": 690 + }, + { + "epoch": 0.45, + "grad_norm": 0.1724497377872467, + "learning_rate": 0.000283867931167904, + "loss": 0.052, + "step": 691 + }, + { + "epoch": 0.45, + "grad_norm": 0.06867794692516327, + "learning_rate": 0.00028382139003503006, + "loss": 0.0168, + "step": 692 + }, + { + "epoch": 0.45, + "grad_norm": 0.06690815091133118, + "learning_rate": 0.00028377478568967704, + "loss": 0.0137, + "step": 693 + }, + { + "epoch": 0.45, + "grad_norm": 0.17312408983707428, + "learning_rate": 0.0002837281181538593, + "loss": 0.0226, + "step": 694 + }, + { + "epoch": 0.45, + "grad_norm": 0.08470922708511353, + "learning_rate": 0.0002836813874496208, + "loss": 0.0126, + "step": 695 + }, + { + "epoch": 0.46, + "grad_norm": 0.0778060331940651, + "learning_rate": 0.00028363459359903565, + "loss": 0.0235, + "step": 696 + }, + { + "epoch": 0.46, + "grad_norm": 0.05282146856188774, + "learning_rate": 0.00028358773662420745, + "loss": 0.0086, + "step": 697 + }, + { + "epoch": 0.46, + "grad_norm": 0.09441710263490677, + "learning_rate": 0.00028354081654726984, + "loss": 0.049, + "step": 698 + }, + { + "epoch": 0.46, + "grad_norm": 0.050999149680137634, + "learning_rate": 0.00028349383339038617, + "loss": 0.0086, + "step": 699 + }, + { + "epoch": 0.46, + "grad_norm": 0.10146886110305786, + "learning_rate": 0.0002834467871757497, + "loss": 0.0246, + "step": 700 + }, + { + "epoch": 0.46, + "grad_norm": 0.03706188499927521, + "learning_rate": 0.0002833996779255833, + "loss": 0.0056, + "step": 701 + }, + { + "epoch": 0.46, + "grad_norm": 0.10421967506408691, + "learning_rate": 0.0002833525056621397, + "loss": 0.0241, + "step": 702 + }, + { + "epoch": 0.46, + "grad_norm": 0.18308381736278534, + "learning_rate": 0.00028330527040770146, + "loss": 0.042, + "step": 703 + }, + { + "epoch": 0.46, + "grad_norm": 0.2132684737443924, + "learning_rate": 0.0002832579721845809, + "loss": 0.0226, + "step": 704 + }, + { + "epoch": 0.46, + "grad_norm": 0.5411369204521179, + "learning_rate": 0.00028321061101511984, + "loss": 0.0702, + "step": 705 + }, + { + "epoch": 0.46, + "grad_norm": 0.3440389335155487, + "learning_rate": 0.0002831631869216902, + "loss": 0.0225, + "step": 706 + }, + { + "epoch": 0.46, + "grad_norm": 0.16572096943855286, + "learning_rate": 0.00028311569992669333, + "loss": 0.0352, + "step": 707 + }, + { + "epoch": 0.46, + "grad_norm": 0.15913799405097961, + "learning_rate": 0.0002830681500525604, + "loss": 0.0266, + "step": 708 + }, + { + "epoch": 0.46, + "grad_norm": 0.1818440854549408, + "learning_rate": 0.0002830205373217524, + "loss": 0.0688, + "step": 709 + }, + { + "epoch": 0.46, + "grad_norm": 0.17044726014137268, + "learning_rate": 0.0002829728617567598, + "loss": 0.0515, + "step": 710 + }, + { + "epoch": 0.47, + "grad_norm": 0.15003225207328796, + "learning_rate": 0.0002829251233801028, + "loss": 0.0757, + "step": 711 + }, + { + "epoch": 0.47, + "grad_norm": 0.11599011719226837, + "learning_rate": 0.00028287732221433145, + "loss": 0.0402, + "step": 712 + }, + { + "epoch": 0.47, + "grad_norm": 0.11937666684389114, + "learning_rate": 0.0002828294582820252, + "loss": 0.0391, + "step": 713 + }, + { + "epoch": 0.47, + "grad_norm": 0.2134632170200348, + "learning_rate": 0.0002827815316057933, + "loss": 0.0748, + "step": 714 + }, + { + "epoch": 0.47, + "grad_norm": 0.29407811164855957, + "learning_rate": 0.00028273354220827477, + "loss": 0.0679, + "step": 715 + }, + { + "epoch": 0.47, + "grad_norm": 0.059820059686899185, + "learning_rate": 0.00028268549011213785, + "loss": 0.0372, + "step": 716 + }, + { + "epoch": 0.47, + "grad_norm": 0.06392081081867218, + "learning_rate": 0.0002826373753400808, + "loss": 0.0212, + "step": 717 + }, + { + "epoch": 0.47, + "grad_norm": 0.15369698405265808, + "learning_rate": 0.0002825891979148313, + "loss": 0.0258, + "step": 718 + }, + { + "epoch": 0.47, + "grad_norm": 0.11246192455291748, + "learning_rate": 0.00028254095785914667, + "loss": 0.0357, + "step": 719 + }, + { + "epoch": 0.47, + "grad_norm": 0.09747990220785141, + "learning_rate": 0.0002824926551958138, + "loss": 0.0393, + "step": 720 + }, + { + "epoch": 0.47, + "grad_norm": 0.06651636213064194, + "learning_rate": 0.0002824442899476491, + "loss": 0.0222, + "step": 721 + }, + { + "epoch": 0.47, + "grad_norm": 0.2205415815114975, + "learning_rate": 0.00028239586213749866, + "loss": 0.0391, + "step": 722 + }, + { + "epoch": 0.47, + "grad_norm": 0.15302903950214386, + "learning_rate": 0.000282347371788238, + "loss": 0.019, + "step": 723 + }, + { + "epoch": 0.47, + "grad_norm": 0.0960957333445549, + "learning_rate": 0.00028229881892277237, + "loss": 0.0142, + "step": 724 + }, + { + "epoch": 0.47, + "grad_norm": 0.13723668456077576, + "learning_rate": 0.00028225020356403624, + "loss": 0.0595, + "step": 725 + }, + { + "epoch": 0.48, + "grad_norm": 0.11208293586969376, + "learning_rate": 0.00028220152573499394, + "loss": 0.0283, + "step": 726 + }, + { + "epoch": 0.48, + "grad_norm": 0.11818848550319672, + "learning_rate": 0.000282152785458639, + "loss": 0.0337, + "step": 727 + }, + { + "epoch": 0.48, + "grad_norm": 0.1239568442106247, + "learning_rate": 0.0002821039827579948, + "loss": 0.0406, + "step": 728 + }, + { + "epoch": 0.48, + "grad_norm": 0.2423425167798996, + "learning_rate": 0.0002820551176561138, + "loss": 0.0502, + "step": 729 + }, + { + "epoch": 0.48, + "grad_norm": 0.11348683387041092, + "learning_rate": 0.0002820061901760783, + "loss": 0.0356, + "step": 730 + }, + { + "epoch": 0.48, + "grad_norm": 0.1281062662601471, + "learning_rate": 0.00028195720034099976, + "loss": 0.0172, + "step": 731 + }, + { + "epoch": 0.48, + "grad_norm": 0.017542295157909393, + "learning_rate": 0.0002819081481740193, + "loss": 0.0035, + "step": 732 + }, + { + "epoch": 0.48, + "grad_norm": 0.13793808221817017, + "learning_rate": 0.00028185903369830757, + "loss": 0.0142, + "step": 733 + }, + { + "epoch": 0.48, + "grad_norm": 0.0807032361626625, + "learning_rate": 0.0002818098569370643, + "loss": 0.0093, + "step": 734 + }, + { + "epoch": 0.48, + "grad_norm": 0.13335250318050385, + "learning_rate": 0.0002817606179135189, + "loss": 0.0249, + "step": 735 + }, + { + "epoch": 0.48, + "grad_norm": 0.18836469948291779, + "learning_rate": 0.0002817113166509302, + "loss": 0.0702, + "step": 736 + }, + { + "epoch": 0.48, + "grad_norm": 0.08416979014873505, + "learning_rate": 0.0002816619531725863, + "loss": 0.0115, + "step": 737 + }, + { + "epoch": 0.48, + "grad_norm": 0.054892800748348236, + "learning_rate": 0.00028161252750180486, + "loss": 0.0051, + "step": 738 + }, + { + "epoch": 0.48, + "grad_norm": 0.11004303395748138, + "learning_rate": 0.0002815630396619327, + "loss": 0.025, + "step": 739 + }, + { + "epoch": 0.48, + "grad_norm": 0.08595911413431168, + "learning_rate": 0.00028151348967634613, + "loss": 0.0247, + "step": 740 + }, + { + "epoch": 0.49, + "grad_norm": 0.19074852764606476, + "learning_rate": 0.0002814638775684509, + "loss": 0.0548, + "step": 741 + }, + { + "epoch": 0.49, + "grad_norm": 0.15080730617046356, + "learning_rate": 0.0002814142033616819, + "loss": 0.0865, + "step": 742 + }, + { + "epoch": 0.49, + "grad_norm": 0.1833682358264923, + "learning_rate": 0.00028136446707950353, + "loss": 0.0697, + "step": 743 + }, + { + "epoch": 0.49, + "grad_norm": 0.049776408821344376, + "learning_rate": 0.00028131466874540943, + "loss": 0.0078, + "step": 744 + }, + { + "epoch": 0.49, + "grad_norm": 0.28232109546661377, + "learning_rate": 0.00028126480838292254, + "loss": 0.0283, + "step": 745 + }, + { + "epoch": 0.49, + "grad_norm": 0.11048179864883423, + "learning_rate": 0.0002812148860155952, + "loss": 0.0333, + "step": 746 + }, + { + "epoch": 0.49, + "grad_norm": 0.3188829720020294, + "learning_rate": 0.0002811649016670089, + "loss": 0.0524, + "step": 747 + }, + { + "epoch": 0.49, + "grad_norm": 0.17602626979351044, + "learning_rate": 0.0002811148553607745, + "loss": 0.0559, + "step": 748 + }, + { + "epoch": 0.49, + "grad_norm": 0.1038702130317688, + "learning_rate": 0.0002810647471205321, + "loss": 0.0409, + "step": 749 + }, + { + "epoch": 0.49, + "grad_norm": 0.13291211426258087, + "learning_rate": 0.00028101457696995104, + "loss": 0.0343, + "step": 750 + }, + { + "epoch": 0.49, + "grad_norm": 0.3924441933631897, + "learning_rate": 0.0002809643449327299, + "loss": 0.1051, + "step": 751 + }, + { + "epoch": 0.49, + "grad_norm": 0.4292432367801666, + "learning_rate": 0.0002809140510325966, + "loss": 0.0985, + "step": 752 + }, + { + "epoch": 0.49, + "grad_norm": 0.2376273274421692, + "learning_rate": 0.0002808636952933081, + "loss": 0.0372, + "step": 753 + }, + { + "epoch": 0.49, + "grad_norm": 0.18019931018352509, + "learning_rate": 0.0002808132777386507, + "loss": 0.0449, + "step": 754 + }, + { + "epoch": 0.49, + "grad_norm": 0.09754154831171036, + "learning_rate": 0.0002807627983924399, + "loss": 0.0399, + "step": 755 + }, + { + "epoch": 0.49, + "grad_norm": 0.15885044634342194, + "learning_rate": 0.0002807122572785203, + "loss": 0.0231, + "step": 756 + }, + { + "epoch": 0.5, + "grad_norm": 0.15031558275222778, + "learning_rate": 0.0002806616544207657, + "loss": 0.0562, + "step": 757 + }, + { + "epoch": 0.5, + "grad_norm": 0.03422224149107933, + "learning_rate": 0.00028061098984307923, + "loss": 0.0055, + "step": 758 + }, + { + "epoch": 0.5, + "grad_norm": 0.11442912369966507, + "learning_rate": 0.0002805602635693929, + "loss": 0.0318, + "step": 759 + }, + { + "epoch": 0.5, + "grad_norm": 0.10043658316135406, + "learning_rate": 0.0002805094756236681, + "loss": 0.0293, + "step": 760 + }, + { + "epoch": 0.5, + "grad_norm": 0.06093163788318634, + "learning_rate": 0.00028045862602989516, + "loss": 0.0062, + "step": 761 + }, + { + "epoch": 0.5, + "grad_norm": 0.17897385358810425, + "learning_rate": 0.0002804077148120937, + "loss": 0.0387, + "step": 762 + }, + { + "epoch": 0.5, + "grad_norm": 0.14174053072929382, + "learning_rate": 0.0002803567419943124, + "loss": 0.0742, + "step": 763 + }, + { + "epoch": 0.5, + "grad_norm": 0.12193652242422104, + "learning_rate": 0.0002803057076006289, + "loss": 0.0237, + "step": 764 + }, + { + "epoch": 0.5, + "eval_loss": 0.03274312615394592, + "eval_runtime": 39.9218, + "eval_samples_per_second": 32.238, + "eval_steps_per_second": 8.066, + "step": 764 + }, + { + "epoch": 0.5, + "grad_norm": 0.17127107083797455, + "learning_rate": 0.00028025461165515016, + "loss": 0.0269, + "step": 765 + }, + { + "epoch": 0.5, + "grad_norm": 0.03232162818312645, + "learning_rate": 0.00028020345418201196, + "loss": 0.0053, + "step": 766 + }, + { + "epoch": 0.5, + "grad_norm": 0.29065465927124023, + "learning_rate": 0.0002801522352053794, + "loss": 0.0621, + "step": 767 + }, + { + "epoch": 0.5, + "grad_norm": 0.14432761073112488, + "learning_rate": 0.00028010095474944647, + "loss": 0.0556, + "step": 768 + }, + { + "epoch": 0.5, + "grad_norm": 0.11056532710790634, + "learning_rate": 0.00028004961283843624, + "loss": 0.0111, + "step": 769 + }, + { + "epoch": 0.5, + "grad_norm": 0.18930545449256897, + "learning_rate": 0.0002799982094966007, + "loss": 0.0548, + "step": 770 + }, + { + "epoch": 0.5, + "grad_norm": 0.14607198536396027, + "learning_rate": 0.00027994674474822115, + "loss": 0.0296, + "step": 771 + }, + { + "epoch": 0.51, + "grad_norm": 0.1470440924167633, + "learning_rate": 0.0002798952186176076, + "loss": 0.0366, + "step": 772 + }, + { + "epoch": 0.51, + "grad_norm": 0.0858917385339737, + "learning_rate": 0.0002798436311290992, + "loss": 0.0149, + "step": 773 + }, + { + "epoch": 0.51, + "grad_norm": 0.23497411608695984, + "learning_rate": 0.000279791982307064, + "loss": 0.031, + "step": 774 + }, + { + "epoch": 0.51, + "grad_norm": 0.05533986538648605, + "learning_rate": 0.00027974027217589917, + "loss": 0.0149, + "step": 775 + }, + { + "epoch": 0.51, + "grad_norm": 0.18971019983291626, + "learning_rate": 0.00027968850076003066, + "loss": 0.0339, + "step": 776 + }, + { + "epoch": 0.51, + "grad_norm": 0.18975158035755157, + "learning_rate": 0.00027963666808391343, + "loss": 0.0192, + "step": 777 + }, + { + "epoch": 0.51, + "grad_norm": 0.04416336864233017, + "learning_rate": 0.0002795847741720315, + "loss": 0.0073, + "step": 778 + }, + { + "epoch": 0.51, + "grad_norm": 0.20576409995555878, + "learning_rate": 0.00027953281904889764, + "loss": 0.0418, + "step": 779 + }, + { + "epoch": 0.51, + "grad_norm": 0.1331322193145752, + "learning_rate": 0.0002794808027390536, + "loss": 0.011, + "step": 780 + }, + { + "epoch": 0.51, + "grad_norm": 0.3062935769557953, + "learning_rate": 0.0002794287252670701, + "loss": 0.07, + "step": 781 + }, + { + "epoch": 0.51, + "grad_norm": 0.1513393074274063, + "learning_rate": 0.0002793765866575466, + "loss": 0.0384, + "step": 782 + }, + { + "epoch": 0.51, + "grad_norm": 0.08953273296356201, + "learning_rate": 0.0002793243869351116, + "loss": 0.0342, + "step": 783 + }, + { + "epoch": 0.51, + "grad_norm": 0.16210728883743286, + "learning_rate": 0.00027927212612442243, + "loss": 0.0403, + "step": 784 + }, + { + "epoch": 0.51, + "grad_norm": 0.10174558311700821, + "learning_rate": 0.0002792198042501652, + "loss": 0.0304, + "step": 785 + }, + { + "epoch": 0.51, + "grad_norm": 0.15236282348632812, + "learning_rate": 0.0002791674213370549, + "loss": 0.0378, + "step": 786 + }, + { + "epoch": 0.52, + "grad_norm": 0.06242475286126137, + "learning_rate": 0.0002791149774098353, + "loss": 0.0092, + "step": 787 + }, + { + "epoch": 0.52, + "grad_norm": 0.20970316231250763, + "learning_rate": 0.0002790624724932792, + "loss": 0.0479, + "step": 788 + }, + { + "epoch": 0.52, + "grad_norm": 0.15189824998378754, + "learning_rate": 0.0002790099066121879, + "loss": 0.0118, + "step": 789 + }, + { + "epoch": 0.52, + "grad_norm": 0.061395786702632904, + "learning_rate": 0.0002789572797913918, + "loss": 0.0151, + "step": 790 + }, + { + "epoch": 0.52, + "grad_norm": 0.5034061074256897, + "learning_rate": 0.00027890459205574987, + "loss": 0.0864, + "step": 791 + }, + { + "epoch": 0.52, + "grad_norm": 0.07147826254367828, + "learning_rate": 0.0002788518434301499, + "loss": 0.0191, + "step": 792 + }, + { + "epoch": 0.52, + "grad_norm": 0.1329374462366104, + "learning_rate": 0.0002787990339395085, + "loss": 0.0331, + "step": 793 + }, + { + "epoch": 0.52, + "grad_norm": 0.15126173198223114, + "learning_rate": 0.0002787461636087711, + "loss": 0.0143, + "step": 794 + }, + { + "epoch": 0.52, + "grad_norm": 0.09816433489322662, + "learning_rate": 0.0002786932324629116, + "loss": 0.0155, + "step": 795 + }, + { + "epoch": 0.52, + "grad_norm": 0.1379247009754181, + "learning_rate": 0.0002786402405269329, + "loss": 0.0315, + "step": 796 + }, + { + "epoch": 0.52, + "grad_norm": 0.23714596033096313, + "learning_rate": 0.00027858718782586647, + "loss": 0.0465, + "step": 797 + }, + { + "epoch": 0.52, + "grad_norm": 0.12600766122341156, + "learning_rate": 0.0002785340743847725, + "loss": 0.0359, + "step": 798 + }, + { + "epoch": 0.52, + "grad_norm": 0.13546015322208405, + "learning_rate": 0.00027848090022874, + "loss": 0.0175, + "step": 799 + }, + { + "epoch": 0.52, + "grad_norm": 0.07381202280521393, + "learning_rate": 0.00027842766538288647, + "loss": 0.0302, + "step": 800 + }, + { + "epoch": 0.52, + "grad_norm": 0.14797933399677277, + "learning_rate": 0.0002783743698723582, + "loss": 0.0818, + "step": 801 + }, + { + "epoch": 0.53, + "grad_norm": 0.02270502597093582, + "learning_rate": 0.00027832101372233007, + "loss": 0.0049, + "step": 802 + }, + { + "epoch": 0.53, + "grad_norm": 0.322780042886734, + "learning_rate": 0.00027826759695800566, + "loss": 0.0694, + "step": 803 + }, + { + "epoch": 0.53, + "grad_norm": 0.2222844511270523, + "learning_rate": 0.0002782141196046171, + "loss": 0.0261, + "step": 804 + }, + { + "epoch": 0.53, + "grad_norm": 0.3285076320171356, + "learning_rate": 0.0002781605816874253, + "loss": 0.0872, + "step": 805 + }, + { + "epoch": 0.53, + "grad_norm": 0.08804619312286377, + "learning_rate": 0.0002781069832317196, + "loss": 0.0578, + "step": 806 + }, + { + "epoch": 0.53, + "grad_norm": 0.17540492117404938, + "learning_rate": 0.00027805332426281793, + "loss": 0.0384, + "step": 807 + }, + { + "epoch": 0.53, + "grad_norm": 0.06208420172333717, + "learning_rate": 0.00027799960480606706, + "loss": 0.0136, + "step": 808 + }, + { + "epoch": 0.53, + "grad_norm": 0.07048339396715164, + "learning_rate": 0.0002779458248868421, + "loss": 0.026, + "step": 809 + }, + { + "epoch": 0.53, + "grad_norm": 0.0901297777891159, + "learning_rate": 0.00027789198453054666, + "loss": 0.0277, + "step": 810 + }, + { + "epoch": 0.53, + "grad_norm": 0.07435130327939987, + "learning_rate": 0.0002778380837626132, + "loss": 0.0197, + "step": 811 + }, + { + "epoch": 0.53, + "grad_norm": 0.14209306240081787, + "learning_rate": 0.00027778412260850234, + "loss": 0.0407, + "step": 812 + }, + { + "epoch": 0.53, + "grad_norm": 0.16662320494651794, + "learning_rate": 0.00027773010109370357, + "loss": 0.0667, + "step": 813 + }, + { + "epoch": 0.53, + "grad_norm": 0.031582899391651154, + "learning_rate": 0.0002776760192437346, + "loss": 0.0104, + "step": 814 + }, + { + "epoch": 0.53, + "grad_norm": 0.0818646028637886, + "learning_rate": 0.00027762187708414195, + "loss": 0.0258, + "step": 815 + }, + { + "epoch": 0.53, + "grad_norm": 0.0822024941444397, + "learning_rate": 0.0002775676746405003, + "loss": 0.0406, + "step": 816 + }, + { + "epoch": 0.53, + "grad_norm": 0.11700989305973053, + "learning_rate": 0.0002775134119384131, + "loss": 0.0335, + "step": 817 + }, + { + "epoch": 0.54, + "grad_norm": 0.082905612885952, + "learning_rate": 0.00027745908900351195, + "loss": 0.0161, + "step": 818 + }, + { + "epoch": 0.54, + "grad_norm": 0.11846023797988892, + "learning_rate": 0.00027740470586145726, + "loss": 0.0502, + "step": 819 + }, + { + "epoch": 0.54, + "grad_norm": 0.17150120437145233, + "learning_rate": 0.00027735026253793756, + "loss": 0.0345, + "step": 820 + }, + { + "epoch": 0.54, + "grad_norm": 0.12443263083696365, + "learning_rate": 0.00027729575905867, + "loss": 0.0158, + "step": 821 + }, + { + "epoch": 0.54, + "grad_norm": 0.1657358705997467, + "learning_rate": 0.0002772411954494001, + "loss": 0.0226, + "step": 822 + }, + { + "epoch": 0.54, + "grad_norm": 0.08840323239564896, + "learning_rate": 0.0002771865717359018, + "loss": 0.0152, + "step": 823 + }, + { + "epoch": 0.54, + "grad_norm": 0.08032941073179245, + "learning_rate": 0.00027713188794397737, + "loss": 0.0129, + "step": 824 + }, + { + "epoch": 0.54, + "grad_norm": 0.1835167407989502, + "learning_rate": 0.00027707714409945744, + "loss": 0.0569, + "step": 825 + }, + { + "epoch": 0.54, + "grad_norm": 0.12917861342430115, + "learning_rate": 0.0002770223402282012, + "loss": 0.0309, + "step": 826 + }, + { + "epoch": 0.54, + "grad_norm": 0.22104112803936005, + "learning_rate": 0.0002769674763560959, + "loss": 0.0432, + "step": 827 + }, + { + "epoch": 0.54, + "grad_norm": 0.13979768753051758, + "learning_rate": 0.00027691255250905737, + "loss": 0.0174, + "step": 828 + }, + { + "epoch": 0.54, + "grad_norm": 0.17627565562725067, + "learning_rate": 0.0002768575687130297, + "loss": 0.0915, + "step": 829 + }, + { + "epoch": 0.54, + "grad_norm": 0.486728310585022, + "learning_rate": 0.0002768025249939853, + "loss": 0.0583, + "step": 830 + }, + { + "epoch": 0.54, + "grad_norm": 0.1259876936674118, + "learning_rate": 0.0002767474213779247, + "loss": 0.0254, + "step": 831 + }, + { + "epoch": 0.54, + "grad_norm": 0.17353613674640656, + "learning_rate": 0.00027669225789087715, + "loss": 0.0238, + "step": 832 + }, + { + "epoch": 0.55, + "grad_norm": 0.011490284465253353, + "learning_rate": 0.00027663703455889973, + "loss": 0.0025, + "step": 833 + }, + { + "epoch": 0.55, + "grad_norm": 0.054609689861536026, + "learning_rate": 0.00027658175140807815, + "loss": 0.0098, + "step": 834 + }, + { + "epoch": 0.55, + "grad_norm": 0.1213490441441536, + "learning_rate": 0.000276526408464526, + "loss": 0.0128, + "step": 835 + }, + { + "epoch": 0.55, + "grad_norm": 0.09482322633266449, + "learning_rate": 0.0002764710057543855, + "loss": 0.0126, + "step": 836 + }, + { + "epoch": 0.55, + "grad_norm": 0.057049017399549484, + "learning_rate": 0.00027641554330382686, + "loss": 0.015, + "step": 837 + }, + { + "epoch": 0.55, + "grad_norm": 0.18572884798049927, + "learning_rate": 0.0002763600211390486, + "loss": 0.034, + "step": 838 + }, + { + "epoch": 0.55, + "grad_norm": 0.09493198245763779, + "learning_rate": 0.0002763044392862774, + "loss": 0.0408, + "step": 839 + }, + { + "epoch": 0.55, + "grad_norm": 0.2182336002588272, + "learning_rate": 0.00027624879777176807, + "loss": 0.055, + "step": 840 + }, + { + "epoch": 0.55, + "grad_norm": 0.08872721344232559, + "learning_rate": 0.00027619309662180386, + "loss": 0.0383, + "step": 841 + }, + { + "epoch": 0.55, + "grad_norm": 0.11956200748682022, + "learning_rate": 0.0002761373358626959, + "loss": 0.0287, + "step": 842 + }, + { + "epoch": 0.55, + "grad_norm": 0.1644572764635086, + "learning_rate": 0.0002760815155207837, + "loss": 0.0286, + "step": 843 + }, + { + "epoch": 0.55, + "grad_norm": 0.16476300358772278, + "learning_rate": 0.0002760256356224347, + "loss": 0.0392, + "step": 844 + }, + { + "epoch": 0.55, + "grad_norm": 0.1026122123003006, + "learning_rate": 0.00027596969619404457, + "loss": 0.0403, + "step": 845 + }, + { + "epoch": 0.55, + "grad_norm": 0.17450834810733795, + "learning_rate": 0.00027591369726203725, + "loss": 0.0586, + "step": 846 + }, + { + "epoch": 0.55, + "grad_norm": 0.10373177379369736, + "learning_rate": 0.0002758576388528645, + "loss": 0.0214, + "step": 847 + }, + { + "epoch": 0.56, + "grad_norm": 0.08164018392562866, + "learning_rate": 0.0002758015209930064, + "loss": 0.0229, + "step": 848 + }, + { + "epoch": 0.56, + "grad_norm": 0.07375165820121765, + "learning_rate": 0.000275745343708971, + "loss": 0.0333, + "step": 849 + }, + { + "epoch": 0.56, + "grad_norm": 0.09719602763652802, + "learning_rate": 0.0002756891070272945, + "loss": 0.0214, + "step": 850 + }, + { + "epoch": 0.56, + "grad_norm": 0.5595388412475586, + "learning_rate": 0.00027563281097454115, + "loss": 0.0657, + "step": 851 + }, + { + "epoch": 0.56, + "grad_norm": 0.10981204360723495, + "learning_rate": 0.0002755764555773031, + "loss": 0.0308, + "step": 852 + }, + { + "epoch": 0.56, + "grad_norm": 0.10418907552957535, + "learning_rate": 0.0002755200408622007, + "loss": 0.0238, + "step": 853 + }, + { + "epoch": 0.56, + "grad_norm": 0.0636146143078804, + "learning_rate": 0.0002754635668558822, + "loss": 0.0143, + "step": 854 + }, + { + "epoch": 0.56, + "grad_norm": 0.12179470807313919, + "learning_rate": 0.00027540703358502406, + "loss": 0.0393, + "step": 855 + }, + { + "epoch": 0.56, + "grad_norm": 0.07303999364376068, + "learning_rate": 0.00027535044107633046, + "loss": 0.0118, + "step": 856 + }, + { + "epoch": 0.56, + "grad_norm": 0.11226726323366165, + "learning_rate": 0.00027529378935653377, + "loss": 0.0356, + "step": 857 + }, + { + "epoch": 0.56, + "grad_norm": 0.16357053816318512, + "learning_rate": 0.0002752370784523942, + "loss": 0.0378, + "step": 858 + }, + { + "epoch": 0.56, + "grad_norm": 0.10425914824008942, + "learning_rate": 0.0002751803083907, + "loss": 0.0423, + "step": 859 + }, + { + "epoch": 0.56, + "grad_norm": 0.11986647546291351, + "learning_rate": 0.0002751234791982674, + "loss": 0.054, + "step": 860 + }, + { + "epoch": 0.56, + "grad_norm": 0.14440590143203735, + "learning_rate": 0.00027506659090194036, + "loss": 0.0418, + "step": 861 + }, + { + "epoch": 0.56, + "grad_norm": 0.21995751559734344, + "learning_rate": 0.0002750096435285909, + "loss": 0.0303, + "step": 862 + }, + { + "epoch": 0.56, + "grad_norm": 0.03415970876812935, + "learning_rate": 0.00027495263710511906, + "loss": 0.0084, + "step": 863 + }, + { + "epoch": 0.57, + "grad_norm": 0.052127208560705185, + "learning_rate": 0.0002748955716584526, + "loss": 0.0124, + "step": 864 + }, + { + "epoch": 0.57, + "grad_norm": 0.23270320892333984, + "learning_rate": 0.0002748384472155472, + "loss": 0.0501, + "step": 865 + }, + { + "epoch": 0.57, + "grad_norm": 0.05627870559692383, + "learning_rate": 0.00027478126380338645, + "loss": 0.0081, + "step": 866 + }, + { + "epoch": 0.57, + "grad_norm": 0.1844397783279419, + "learning_rate": 0.0002747240214489817, + "loss": 0.04, + "step": 867 + }, + { + "epoch": 0.57, + "grad_norm": 0.06833455711603165, + "learning_rate": 0.0002746667201793722, + "loss": 0.0136, + "step": 868 + }, + { + "epoch": 0.57, + "grad_norm": 0.03551473841071129, + "learning_rate": 0.00027460936002162513, + "loss": 0.0057, + "step": 869 + }, + { + "epoch": 0.57, + "grad_norm": 0.0920785516500473, + "learning_rate": 0.0002745519410028354, + "loss": 0.0103, + "step": 870 + }, + { + "epoch": 0.57, + "grad_norm": 0.1218150407075882, + "learning_rate": 0.0002744944631501256, + "loss": 0.0427, + "step": 871 + }, + { + "epoch": 0.57, + "grad_norm": 0.3496924042701721, + "learning_rate": 0.00027443692649064633, + "loss": 0.0686, + "step": 872 + }, + { + "epoch": 0.57, + "grad_norm": 0.3225466310977936, + "learning_rate": 0.00027437933105157585, + "loss": 0.0518, + "step": 873 + }, + { + "epoch": 0.57, + "grad_norm": 0.230736643075943, + "learning_rate": 0.00027432167686012015, + "loss": 0.0468, + "step": 874 + }, + { + "epoch": 0.57, + "grad_norm": 0.20991326868534088, + "learning_rate": 0.00027426396394351313, + "loss": 0.0595, + "step": 875 + }, + { + "epoch": 0.57, + "grad_norm": 0.10641276091337204, + "learning_rate": 0.0002742061923290162, + "loss": 0.0353, + "step": 876 + }, + { + "epoch": 0.57, + "grad_norm": 0.06472618877887726, + "learning_rate": 0.00027414836204391865, + "loss": 0.012, + "step": 877 + }, + { + "epoch": 0.57, + "grad_norm": 0.2291422188282013, + "learning_rate": 0.0002740904731155375, + "loss": 0.0431, + "step": 878 + }, + { + "epoch": 0.58, + "grad_norm": 0.26647308468818665, + "learning_rate": 0.0002740325255712175, + "loss": 0.1054, + "step": 879 + }, + { + "epoch": 0.58, + "grad_norm": 0.08363434672355652, + "learning_rate": 0.0002739745194383309, + "loss": 0.011, + "step": 880 + }, + { + "epoch": 0.58, + "grad_norm": 0.10943964123725891, + "learning_rate": 0.00027391645474427774, + "loss": 0.0331, + "step": 881 + }, + { + "epoch": 0.58, + "grad_norm": 0.2208610624074936, + "learning_rate": 0.0002738583315164857, + "loss": 0.0499, + "step": 882 + }, + { + "epoch": 0.58, + "grad_norm": 0.09434379637241364, + "learning_rate": 0.00027380014978241026, + "loss": 0.0268, + "step": 883 + }, + { + "epoch": 0.58, + "grad_norm": 0.13045388460159302, + "learning_rate": 0.0002737419095695343, + "loss": 0.0367, + "step": 884 + }, + { + "epoch": 0.58, + "grad_norm": 0.1460418850183487, + "learning_rate": 0.00027368361090536844, + "loss": 0.0662, + "step": 885 + }, + { + "epoch": 0.58, + "grad_norm": 0.08823563903570175, + "learning_rate": 0.000273625253817451, + "loss": 0.0387, + "step": 886 + }, + { + "epoch": 0.58, + "grad_norm": 0.08193490654230118, + "learning_rate": 0.00027356683833334766, + "loss": 0.0357, + "step": 887 + }, + { + "epoch": 0.58, + "grad_norm": 0.1274595856666565, + "learning_rate": 0.00027350836448065193, + "loss": 0.0346, + "step": 888 + }, + { + "epoch": 0.58, + "grad_norm": 0.061123717576265335, + "learning_rate": 0.0002734498322869847, + "loss": 0.0388, + "step": 889 + }, + { + "epoch": 0.58, + "grad_norm": 0.12708084285259247, + "learning_rate": 0.0002733912417799945, + "loss": 0.0276, + "step": 890 + }, + { + "epoch": 0.58, + "grad_norm": 0.055733684450387955, + "learning_rate": 0.00027333259298735756, + "loss": 0.0139, + "step": 891 + }, + { + "epoch": 0.58, + "grad_norm": 0.049776624888181686, + "learning_rate": 0.00027327388593677727, + "loss": 0.0141, + "step": 892 + }, + { + "epoch": 0.58, + "grad_norm": 0.1466546654701233, + "learning_rate": 0.000273215120655985, + "loss": 0.0424, + "step": 893 + }, + { + "epoch": 0.59, + "grad_norm": 0.04927024617791176, + "learning_rate": 0.00027315629717273915, + "loss": 0.0121, + "step": 894 + }, + { + "epoch": 0.59, + "grad_norm": 0.14217214286327362, + "learning_rate": 0.0002730974155148259, + "loss": 0.0365, + "step": 895 + }, + { + "epoch": 0.59, + "grad_norm": 0.06162632629275322, + "learning_rate": 0.00027303847571005904, + "loss": 0.0185, + "step": 896 + }, + { + "epoch": 0.59, + "grad_norm": 0.09187627583742142, + "learning_rate": 0.00027297947778627947, + "loss": 0.024, + "step": 897 + }, + { + "epoch": 0.59, + "grad_norm": 0.08694395422935486, + "learning_rate": 0.00027292042177135575, + "loss": 0.016, + "step": 898 + }, + { + "epoch": 0.59, + "grad_norm": 0.2407931238412857, + "learning_rate": 0.0002728613076931838, + "loss": 0.0895, + "step": 899 + }, + { + "epoch": 0.59, + "grad_norm": 0.11447851359844208, + "learning_rate": 0.0002728021355796871, + "loss": 0.0156, + "step": 900 + }, + { + "epoch": 0.59, + "grad_norm": 0.17052114009857178, + "learning_rate": 0.0002727429054588165, + "loss": 0.0686, + "step": 901 + }, + { + "epoch": 0.59, + "grad_norm": 0.11735350638628006, + "learning_rate": 0.0002726836173585501, + "loss": 0.0458, + "step": 902 + }, + { + "epoch": 0.59, + "grad_norm": 0.1015033945441246, + "learning_rate": 0.0002726242713068935, + "loss": 0.0396, + "step": 903 + }, + { + "epoch": 0.59, + "grad_norm": 0.09442136436700821, + "learning_rate": 0.00027256486733187975, + "loss": 0.0354, + "step": 904 + }, + { + "epoch": 0.59, + "grad_norm": 0.051811713725328445, + "learning_rate": 0.0002725054054615691, + "loss": 0.0103, + "step": 905 + }, + { + "epoch": 0.59, + "grad_norm": 0.09581268578767776, + "learning_rate": 0.00027244588572404924, + "loss": 0.0346, + "step": 906 + }, + { + "epoch": 0.59, + "grad_norm": 0.1265789121389389, + "learning_rate": 0.00027238630814743525, + "loss": 0.0296, + "step": 907 + }, + { + "epoch": 0.59, + "grad_norm": 0.11578807979822159, + "learning_rate": 0.0002723266727598694, + "loss": 0.0374, + "step": 908 + }, + { + "epoch": 0.6, + "grad_norm": 0.0634288564324379, + "learning_rate": 0.0002722669795895214, + "loss": 0.0211, + "step": 909 + }, + { + "epoch": 0.6, + "grad_norm": 0.10002614557743073, + "learning_rate": 0.0002722072286645881, + "loss": 0.0217, + "step": 910 + }, + { + "epoch": 0.6, + "grad_norm": 0.10582344233989716, + "learning_rate": 0.0002721474200132937, + "loss": 0.0262, + "step": 911 + }, + { + "epoch": 0.6, + "grad_norm": 0.20417608320713043, + "learning_rate": 0.0002720875536638898, + "loss": 0.0303, + "step": 912 + }, + { + "epoch": 0.6, + "grad_norm": 0.06233491376042366, + "learning_rate": 0.00027202762964465514, + "loss": 0.0179, + "step": 913 + }, + { + "epoch": 0.6, + "grad_norm": 0.10917846113443375, + "learning_rate": 0.00027196764798389557, + "loss": 0.0238, + "step": 914 + }, + { + "epoch": 0.6, + "grad_norm": 0.20902927219867706, + "learning_rate": 0.0002719076087099444, + "loss": 0.0744, + "step": 915 + }, + { + "epoch": 0.6, + "grad_norm": 0.07525712251663208, + "learning_rate": 0.000271847511851162, + "loss": 0.0145, + "step": 916 + }, + { + "epoch": 0.6, + "grad_norm": 0.13625741004943848, + "learning_rate": 0.0002717873574359361, + "loss": 0.0557, + "step": 917 + }, + { + "epoch": 0.6, + "grad_norm": 0.10275349766016006, + "learning_rate": 0.00027172714549268136, + "loss": 0.0156, + "step": 918 + }, + { + "epoch": 0.6, + "grad_norm": 0.07689966261386871, + "learning_rate": 0.0002716668760498399, + "loss": 0.0285, + "step": 919 + }, + { + "epoch": 0.6, + "grad_norm": 0.051624033600091934, + "learning_rate": 0.00027160654913588073, + "loss": 0.0109, + "step": 920 + }, + { + "epoch": 0.6, + "grad_norm": 0.1263073831796646, + "learning_rate": 0.0002715461647793003, + "loss": 0.03, + "step": 921 + }, + { + "epoch": 0.6, + "grad_norm": 0.03605236858129501, + "learning_rate": 0.0002714857230086219, + "loss": 0.008, + "step": 922 + }, + { + "epoch": 0.6, + "grad_norm": 0.09554066509008408, + "learning_rate": 0.0002714252238523962, + "loss": 0.0276, + "step": 923 + }, + { + "epoch": 0.6, + "grad_norm": 0.12727093696594238, + "learning_rate": 0.0002713646673392008, + "loss": 0.0365, + "step": 924 + }, + { + "epoch": 0.61, + "grad_norm": 0.21029303967952728, + "learning_rate": 0.00027130405349764044, + "loss": 0.0554, + "step": 925 + }, + { + "epoch": 0.61, + "grad_norm": 0.10958801954984665, + "learning_rate": 0.00027124338235634695, + "loss": 0.032, + "step": 926 + }, + { + "epoch": 0.61, + "grad_norm": 0.06557829678058624, + "learning_rate": 0.0002711826539439792, + "loss": 0.0145, + "step": 927 + }, + { + "epoch": 0.61, + "grad_norm": 0.0530441552400589, + "learning_rate": 0.0002711218682892232, + "loss": 0.014, + "step": 928 + }, + { + "epoch": 0.61, + "grad_norm": 0.11874904483556747, + "learning_rate": 0.00027106102542079195, + "loss": 0.0144, + "step": 929 + }, + { + "epoch": 0.61, + "grad_norm": 0.07747121155261993, + "learning_rate": 0.0002710001253674254, + "loss": 0.0136, + "step": 930 + }, + { + "epoch": 0.61, + "grad_norm": 0.055583804845809937, + "learning_rate": 0.0002709391681578906, + "loss": 0.013, + "step": 931 + }, + { + "epoch": 0.61, + "grad_norm": 0.06069410964846611, + "learning_rate": 0.0002708781538209815, + "loss": 0.0076, + "step": 932 + }, + { + "epoch": 0.61, + "grad_norm": 0.019891362637281418, + "learning_rate": 0.00027081708238551927, + "loss": 0.0038, + "step": 933 + }, + { + "epoch": 0.61, + "grad_norm": 0.1343265175819397, + "learning_rate": 0.00027075595388035173, + "loss": 0.0307, + "step": 934 + }, + { + "epoch": 0.61, + "grad_norm": 0.04620016738772392, + "learning_rate": 0.00027069476833435397, + "loss": 0.0048, + "step": 935 + }, + { + "epoch": 0.61, + "grad_norm": 0.1706463247537613, + "learning_rate": 0.00027063352577642776, + "loss": 0.0643, + "step": 936 + }, + { + "epoch": 0.61, + "grad_norm": 0.058014389127492905, + "learning_rate": 0.0002705722262355019, + "loss": 0.0081, + "step": 937 + }, + { + "epoch": 0.61, + "grad_norm": 0.11744493991136551, + "learning_rate": 0.0002705108697405322, + "loss": 0.0308, + "step": 938 + }, + { + "epoch": 0.61, + "grad_norm": 0.08099761605262756, + "learning_rate": 0.00027044945632050127, + "loss": 0.0052, + "step": 939 + }, + { + "epoch": 0.62, + "grad_norm": 0.29563236236572266, + "learning_rate": 0.00027038798600441865, + "loss": 0.0529, + "step": 940 + }, + { + "epoch": 0.62, + "grad_norm": 0.043802157044410706, + "learning_rate": 0.0002703264588213206, + "loss": 0.0071, + "step": 941 + }, + { + "epoch": 0.62, + "grad_norm": 0.12684734165668488, + "learning_rate": 0.00027026487480027057, + "loss": 0.0433, + "step": 942 + }, + { + "epoch": 0.62, + "grad_norm": 0.21014286577701569, + "learning_rate": 0.00027020323397035855, + "loss": 0.028, + "step": 943 + }, + { + "epoch": 0.62, + "grad_norm": 0.11645261198282242, + "learning_rate": 0.00027014153636070157, + "loss": 0.0178, + "step": 944 + }, + { + "epoch": 0.62, + "grad_norm": 0.16726157069206238, + "learning_rate": 0.00027007978200044324, + "loss": 0.0508, + "step": 945 + }, + { + "epoch": 0.62, + "grad_norm": 0.10064594447612762, + "learning_rate": 0.0002700179709187543, + "loss": 0.0239, + "step": 946 + }, + { + "epoch": 0.62, + "grad_norm": 0.060703571885824203, + "learning_rate": 0.00026995610314483205, + "loss": 0.0103, + "step": 947 + }, + { + "epoch": 0.62, + "grad_norm": 0.0527808852493763, + "learning_rate": 0.0002698941787079006, + "loss": 0.0178, + "step": 948 + }, + { + "epoch": 0.62, + "grad_norm": 0.08081556856632233, + "learning_rate": 0.00026983219763721086, + "loss": 0.0157, + "step": 949 + }, + { + "epoch": 0.62, + "grad_norm": 0.12985916435718536, + "learning_rate": 0.00026977015996204054, + "loss": 0.0575, + "step": 950 + }, + { + "epoch": 0.62, + "grad_norm": 0.15043164789676666, + "learning_rate": 0.00026970806571169397, + "loss": 0.0302, + "step": 951 + }, + { + "epoch": 0.62, + "grad_norm": 0.024910060688853264, + "learning_rate": 0.00026964591491550235, + "loss": 0.0045, + "step": 952 + }, + { + "epoch": 0.62, + "grad_norm": 0.10944465547800064, + "learning_rate": 0.00026958370760282345, + "loss": 0.0574, + "step": 953 + }, + { + "epoch": 0.62, + "grad_norm": 0.114822618663311, + "learning_rate": 0.0002695214438030418, + "loss": 0.0262, + "step": 954 + }, + { + "epoch": 0.63, + "grad_norm": 0.15373332798480988, + "learning_rate": 0.0002694591235455687, + "loss": 0.0206, + "step": 955 + }, + { + "epoch": 0.63, + "grad_norm": 0.14427144825458527, + "learning_rate": 0.0002693967468598419, + "loss": 0.0508, + "step": 956 + }, + { + "epoch": 0.63, + "grad_norm": 0.0668393075466156, + "learning_rate": 0.000269334313775326, + "loss": 0.0195, + "step": 957 + }, + { + "epoch": 0.63, + "grad_norm": 0.06797386705875397, + "learning_rate": 0.00026927182432151216, + "loss": 0.0081, + "step": 958 + }, + { + "epoch": 0.63, + "grad_norm": 0.21059945225715637, + "learning_rate": 0.00026920927852791825, + "loss": 0.1075, + "step": 959 + }, + { + "epoch": 0.63, + "grad_norm": 0.10499881953001022, + "learning_rate": 0.0002691466764240886, + "loss": 0.0111, + "step": 960 + }, + { + "epoch": 0.63, + "grad_norm": 0.033115822821855545, + "learning_rate": 0.00026908401803959423, + "loss": 0.0054, + "step": 961 + }, + { + "epoch": 0.63, + "grad_norm": 0.2655697464942932, + "learning_rate": 0.0002690213034040328, + "loss": 0.0455, + "step": 962 + }, + { + "epoch": 0.63, + "grad_norm": 0.1976163387298584, + "learning_rate": 0.0002689585325470284, + "loss": 0.0454, + "step": 963 + }, + { + "epoch": 0.63, + "grad_norm": 0.05260282754898071, + "learning_rate": 0.00026889570549823184, + "loss": 0.0275, + "step": 964 + }, + { + "epoch": 0.63, + "grad_norm": 0.1485443115234375, + "learning_rate": 0.0002688328222873203, + "loss": 0.0191, + "step": 965 + }, + { + "epoch": 0.63, + "grad_norm": 0.0436883270740509, + "learning_rate": 0.0002687698829439977, + "loss": 0.0099, + "step": 966 + }, + { + "epoch": 0.63, + "grad_norm": 0.12818527221679688, + "learning_rate": 0.00026870688749799416, + "loss": 0.0323, + "step": 967 + }, + { + "epoch": 0.63, + "grad_norm": 0.14603693783283234, + "learning_rate": 0.0002686438359790667, + "loss": 0.0541, + "step": 968 + }, + { + "epoch": 0.63, + "grad_norm": 0.09324526786804199, + "learning_rate": 0.00026858072841699847, + "loss": 0.0272, + "step": 969 + }, + { + "epoch": 0.64, + "grad_norm": 0.26789504289627075, + "learning_rate": 0.0002685175648415994, + "loss": 0.0503, + "step": 970 + }, + { + "epoch": 0.64, + "grad_norm": 0.059855278581380844, + "learning_rate": 0.0002684543452827056, + "loss": 0.0136, + "step": 971 + }, + { + "epoch": 0.64, + "grad_norm": 0.08910810202360153, + "learning_rate": 0.00026839106977017974, + "loss": 0.016, + "step": 972 + }, + { + "epoch": 0.64, + "grad_norm": 0.09903378039598465, + "learning_rate": 0.000268327738333911, + "loss": 0.0307, + "step": 973 + }, + { + "epoch": 0.64, + "grad_norm": 0.16080208122730255, + "learning_rate": 0.00026826435100381487, + "loss": 0.0318, + "step": 974 + }, + { + "epoch": 0.64, + "grad_norm": 0.09495270997285843, + "learning_rate": 0.0002682009078098333, + "loss": 0.0591, + "step": 975 + }, + { + "epoch": 0.64, + "grad_norm": 0.11322695016860962, + "learning_rate": 0.00026813740878193457, + "loss": 0.047, + "step": 976 + }, + { + "epoch": 0.64, + "grad_norm": 0.06805938482284546, + "learning_rate": 0.0002680738539501134, + "loss": 0.0337, + "step": 977 + }, + { + "epoch": 0.64, + "grad_norm": 0.18398675322532654, + "learning_rate": 0.00026801024334439076, + "loss": 0.0653, + "step": 978 + }, + { + "epoch": 0.64, + "grad_norm": 0.09730216860771179, + "learning_rate": 0.00026794657699481415, + "loss": 0.0463, + "step": 979 + }, + { + "epoch": 0.64, + "grad_norm": 0.0954691618680954, + "learning_rate": 0.0002678828549314573, + "loss": 0.0199, + "step": 980 + }, + { + "epoch": 0.64, + "grad_norm": 0.15214982628822327, + "learning_rate": 0.00026781907718442013, + "loss": 0.0606, + "step": 981 + }, + { + "epoch": 0.64, + "grad_norm": 0.07308922708034515, + "learning_rate": 0.00026775524378382906, + "loss": 0.0229, + "step": 982 + }, + { + "epoch": 0.64, + "grad_norm": 0.1865328997373581, + "learning_rate": 0.00026769135475983676, + "loss": 0.0617, + "step": 983 + }, + { + "epoch": 0.64, + "grad_norm": 0.0670800730586052, + "learning_rate": 0.0002676274101426221, + "loss": 0.0213, + "step": 984 + }, + { + "epoch": 0.64, + "grad_norm": 0.09108185768127441, + "learning_rate": 0.0002675634099623903, + "loss": 0.0163, + "step": 985 + }, + { + "epoch": 0.65, + "grad_norm": 0.09892558306455612, + "learning_rate": 0.0002674993542493727, + "loss": 0.0398, + "step": 986 + }, + { + "epoch": 0.65, + "grad_norm": 0.18465696275234222, + "learning_rate": 0.00026743524303382695, + "loss": 0.0456, + "step": 987 + }, + { + "epoch": 0.65, + "grad_norm": 0.14701491594314575, + "learning_rate": 0.000267371076346037, + "loss": 0.0217, + "step": 988 + }, + { + "epoch": 0.65, + "grad_norm": 0.22119949758052826, + "learning_rate": 0.0002673068542163128, + "loss": 0.0337, + "step": 989 + }, + { + "epoch": 0.65, + "grad_norm": 0.07329166680574417, + "learning_rate": 0.0002672425766749907, + "loss": 0.0077, + "step": 990 + }, + { + "epoch": 0.65, + "grad_norm": 0.08214308321475983, + "learning_rate": 0.0002671782437524331, + "loss": 0.0086, + "step": 991 + }, + { + "epoch": 0.65, + "grad_norm": 0.16395068168640137, + "learning_rate": 0.0002671138554790286, + "loss": 0.0511, + "step": 992 + }, + { + "epoch": 0.65, + "grad_norm": 0.07903768122196198, + "learning_rate": 0.0002670494118851919, + "loss": 0.0227, + "step": 993 + }, + { + "epoch": 0.65, + "grad_norm": 0.044391512870788574, + "learning_rate": 0.0002669849130013639, + "loss": 0.0062, + "step": 994 + }, + { + "epoch": 0.65, + "grad_norm": 0.11790774017572403, + "learning_rate": 0.0002669203588580116, + "loss": 0.0586, + "step": 995 + }, + { + "epoch": 0.65, + "grad_norm": 0.023213036358356476, + "learning_rate": 0.000266855749485628, + "loss": 0.004, + "step": 996 + }, + { + "epoch": 0.65, + "grad_norm": 0.1801631897687912, + "learning_rate": 0.0002667910849147324, + "loss": 0.0273, + "step": 997 + }, + { + "epoch": 0.65, + "grad_norm": 0.3998229205608368, + "learning_rate": 0.00026672636517587, + "loss": 0.0479, + "step": 998 + }, + { + "epoch": 0.65, + "grad_norm": 0.08344905078411102, + "learning_rate": 0.0002666615902996121, + "loss": 0.0066, + "step": 999 + }, + { + "epoch": 0.65, + "grad_norm": 0.4904734194278717, + "learning_rate": 0.00026659676031655605, + "loss": 0.107, + "step": 1000 + }, + { + "epoch": 0.66, + "grad_norm": 0.14752142131328583, + "learning_rate": 0.00026653187525732525, + "loss": 0.0567, + "step": 1001 + }, + { + "epoch": 0.66, + "grad_norm": 0.09572061896324158, + "learning_rate": 0.0002664669351525691, + "loss": 0.045, + "step": 1002 + }, + { + "epoch": 0.66, + "grad_norm": 0.1489264965057373, + "learning_rate": 0.00026640194003296297, + "loss": 0.0181, + "step": 1003 + }, + { + "epoch": 0.66, + "grad_norm": 0.06828869134187698, + "learning_rate": 0.00026633688992920833, + "loss": 0.0204, + "step": 1004 + }, + { + "epoch": 0.66, + "grad_norm": 0.08580945432186127, + "learning_rate": 0.00026627178487203244, + "loss": 0.0275, + "step": 1005 + }, + { + "epoch": 0.66, + "grad_norm": 0.2796219289302826, + "learning_rate": 0.00026620662489218867, + "loss": 0.06, + "step": 1006 + }, + { + "epoch": 0.66, + "grad_norm": 0.19413504004478455, + "learning_rate": 0.0002661414100204563, + "loss": 0.048, + "step": 1007 + }, + { + "epoch": 0.66, + "grad_norm": 0.0517372228205204, + "learning_rate": 0.0002660761402876405, + "loss": 0.0192, + "step": 1008 + }, + { + "epoch": 0.66, + "grad_norm": 0.12586665153503418, + "learning_rate": 0.0002660108157245724, + "loss": 0.064, + "step": 1009 + }, + { + "epoch": 0.66, + "grad_norm": 0.055950991809368134, + "learning_rate": 0.000265945436362109, + "loss": 0.0128, + "step": 1010 + }, + { + "epoch": 0.66, + "grad_norm": 0.03763822093605995, + "learning_rate": 0.00026588000223113316, + "loss": 0.0107, + "step": 1011 + }, + { + "epoch": 0.66, + "grad_norm": 0.20842203497886658, + "learning_rate": 0.00026581451336255365, + "loss": 0.0668, + "step": 1012 + }, + { + "epoch": 0.66, + "grad_norm": 0.077543243765831, + "learning_rate": 0.00026574896978730515, + "loss": 0.0218, + "step": 1013 + }, + { + "epoch": 0.66, + "grad_norm": 0.13783104717731476, + "learning_rate": 0.0002656833715363481, + "loss": 0.0431, + "step": 1014 + }, + { + "epoch": 0.66, + "grad_norm": 0.049275536090135574, + "learning_rate": 0.0002656177186406687, + "loss": 0.012, + "step": 1015 + }, + { + "epoch": 0.67, + "grad_norm": 0.10721635073423386, + "learning_rate": 0.00026555201113127907, + "loss": 0.0392, + "step": 1016 + }, + { + "epoch": 0.67, + "grad_norm": 0.1177641823887825, + "learning_rate": 0.0002654862490392172, + "loss": 0.0416, + "step": 1017 + }, + { + "epoch": 0.67, + "grad_norm": 0.1034293919801712, + "learning_rate": 0.00026542043239554677, + "loss": 0.0262, + "step": 1018 + }, + { + "epoch": 0.67, + "grad_norm": 0.05769471079111099, + "learning_rate": 0.0002653545612313571, + "loss": 0.0088, + "step": 1019 + }, + { + "epoch": 0.67, + "grad_norm": 0.2152629792690277, + "learning_rate": 0.0002652886355777635, + "loss": 0.0709, + "step": 1020 + }, + { + "epoch": 0.67, + "grad_norm": 0.0717998817563057, + "learning_rate": 0.0002652226554659069, + "loss": 0.0135, + "step": 1021 + }, + { + "epoch": 0.67, + "grad_norm": 0.24547475576400757, + "learning_rate": 0.0002651566209269539, + "loss": 0.0627, + "step": 1022 + }, + { + "epoch": 0.67, + "grad_norm": 0.17455288767814636, + "learning_rate": 0.00026509053199209697, + "loss": 0.0466, + "step": 1023 + }, + { + "epoch": 0.67, + "grad_norm": 0.08559072017669678, + "learning_rate": 0.0002650243886925541, + "loss": 0.0306, + "step": 1024 + }, + { + "epoch": 0.67, + "grad_norm": 0.16568362712860107, + "learning_rate": 0.0002649581910595691, + "loss": 0.0272, + "step": 1025 + }, + { + "epoch": 0.67, + "grad_norm": 0.14109772443771362, + "learning_rate": 0.00026489193912441133, + "loss": 0.0241, + "step": 1026 + }, + { + "epoch": 0.67, + "grad_norm": 0.12116571515798569, + "learning_rate": 0.00026482563291837586, + "loss": 0.0216, + "step": 1027 + }, + { + "epoch": 0.67, + "grad_norm": 0.1847831755876541, + "learning_rate": 0.0002647592724727835, + "loss": 0.046, + "step": 1028 + }, + { + "epoch": 0.67, + "grad_norm": 0.1964387595653534, + "learning_rate": 0.0002646928578189803, + "loss": 0.0223, + "step": 1029 + }, + { + "epoch": 0.67, + "grad_norm": 0.17670650780200958, + "learning_rate": 0.0002646263889883385, + "loss": 0.0392, + "step": 1030 + }, + { + "epoch": 0.67, + "grad_norm": 0.3018537759780884, + "learning_rate": 0.00026455986601225544, + "loss": 0.0601, + "step": 1031 + }, + { + "epoch": 0.68, + "grad_norm": 0.16954761743545532, + "learning_rate": 0.0002644932889221543, + "loss": 0.0568, + "step": 1032 + }, + { + "epoch": 0.68, + "grad_norm": 0.07362630218267441, + "learning_rate": 0.0002644266577494837, + "loss": 0.0173, + "step": 1033 + }, + { + "epoch": 0.68, + "grad_norm": 0.22263336181640625, + "learning_rate": 0.0002643599725257178, + "loss": 0.0528, + "step": 1034 + }, + { + "epoch": 0.68, + "grad_norm": 0.1654106080532074, + "learning_rate": 0.00026429323328235635, + "loss": 0.0264, + "step": 1035 + }, + { + "epoch": 0.68, + "grad_norm": 0.16433385014533997, + "learning_rate": 0.0002642264400509247, + "loss": 0.0403, + "step": 1036 + }, + { + "epoch": 0.68, + "grad_norm": 0.16119657456874847, + "learning_rate": 0.0002641595928629735, + "loss": 0.0517, + "step": 1037 + }, + { + "epoch": 0.68, + "grad_norm": 0.06720812618732452, + "learning_rate": 0.00026409269175007904, + "loss": 0.0275, + "step": 1038 + }, + { + "epoch": 0.68, + "grad_norm": 0.08320458233356476, + "learning_rate": 0.000264025736743843, + "loss": 0.0254, + "step": 1039 + }, + { + "epoch": 0.68, + "grad_norm": 0.10702455043792725, + "learning_rate": 0.00026395872787589254, + "loss": 0.0173, + "step": 1040 + }, + { + "epoch": 0.68, + "grad_norm": 0.1805281639099121, + "learning_rate": 0.0002638916651778803, + "loss": 0.0526, + "step": 1041 + }, + { + "epoch": 0.68, + "grad_norm": 0.1021476462483406, + "learning_rate": 0.0002638245486814843, + "loss": 0.0206, + "step": 1042 + }, + { + "epoch": 0.68, + "grad_norm": 0.0951414480805397, + "learning_rate": 0.00026375737841840803, + "loss": 0.0165, + "step": 1043 + }, + { + "epoch": 0.68, + "grad_norm": 0.07957201451063156, + "learning_rate": 0.0002636901544203804, + "loss": 0.0205, + "step": 1044 + }, + { + "epoch": 0.68, + "grad_norm": 0.1612643599510193, + "learning_rate": 0.0002636228767191555, + "loss": 0.0426, + "step": 1045 + }, + { + "epoch": 0.68, + "grad_norm": 0.06410415470600128, + "learning_rate": 0.00026355554534651296, + "loss": 0.0138, + "step": 1046 + }, + { + "epoch": 0.69, + "grad_norm": 0.06664423644542694, + "learning_rate": 0.0002634881603342578, + "loss": 0.0158, + "step": 1047 + }, + { + "epoch": 0.69, + "grad_norm": 0.07890690118074417, + "learning_rate": 0.0002634207217142203, + "loss": 0.0582, + "step": 1048 + }, + { + "epoch": 0.69, + "grad_norm": 0.14806897938251495, + "learning_rate": 0.000263353229518256, + "loss": 0.059, + "step": 1049 + }, + { + "epoch": 0.69, + "grad_norm": 0.08011514693498611, + "learning_rate": 0.00026328568377824587, + "loss": 0.0114, + "step": 1050 + }, + { + "epoch": 0.69, + "grad_norm": 0.2250976264476776, + "learning_rate": 0.00026321808452609615, + "loss": 0.0563, + "step": 1051 + }, + { + "epoch": 0.69, + "grad_norm": 0.12238743901252747, + "learning_rate": 0.0002631504317937383, + "loss": 0.027, + "step": 1052 + }, + { + "epoch": 0.69, + "grad_norm": 0.21183420717716217, + "learning_rate": 0.00026308272561312903, + "loss": 0.0975, + "step": 1053 + }, + { + "epoch": 0.69, + "grad_norm": 0.03879234194755554, + "learning_rate": 0.0002630149660162505, + "loss": 0.0079, + "step": 1054 + }, + { + "epoch": 0.69, + "grad_norm": 0.0885310247540474, + "learning_rate": 0.0002629471530351097, + "loss": 0.0345, + "step": 1055 + }, + { + "epoch": 0.69, + "grad_norm": 0.15572543442249298, + "learning_rate": 0.0002628792867017392, + "loss": 0.0418, + "step": 1056 + }, + { + "epoch": 0.69, + "grad_norm": 0.05571586638689041, + "learning_rate": 0.00026281136704819674, + "loss": 0.0148, + "step": 1057 + }, + { + "epoch": 0.69, + "grad_norm": 0.15888281166553497, + "learning_rate": 0.000262743394106565, + "loss": 0.0423, + "step": 1058 + }, + { + "epoch": 0.69, + "grad_norm": 0.06525658816099167, + "learning_rate": 0.0002626753679089521, + "loss": 0.0179, + "step": 1059 + }, + { + "epoch": 0.69, + "grad_norm": 0.12204741686582565, + "learning_rate": 0.0002626072884874911, + "loss": 0.025, + "step": 1060 + }, + { + "epoch": 0.69, + "grad_norm": 0.08302486687898636, + "learning_rate": 0.00026253915587434035, + "loss": 0.0346, + "step": 1061 + }, + { + "epoch": 0.7, + "grad_norm": 0.1839776635169983, + "learning_rate": 0.0002624709701016833, + "loss": 0.0328, + "step": 1062 + }, + { + "epoch": 0.7, + "grad_norm": 0.07696244865655899, + "learning_rate": 0.0002624027312017285, + "loss": 0.0133, + "step": 1063 + }, + { + "epoch": 0.7, + "grad_norm": 0.13141821324825287, + "learning_rate": 0.0002623344392067096, + "loss": 0.0776, + "step": 1064 + }, + { + "epoch": 0.7, + "grad_norm": 0.10801652073860168, + "learning_rate": 0.00026226609414888523, + "loss": 0.0308, + "step": 1065 + }, + { + "epoch": 0.7, + "grad_norm": 0.11759611964225769, + "learning_rate": 0.00026219769606053927, + "loss": 0.0555, + "step": 1066 + }, + { + "epoch": 0.7, + "grad_norm": 0.27231448888778687, + "learning_rate": 0.00026212924497398044, + "loss": 0.1241, + "step": 1067 + }, + { + "epoch": 0.7, + "grad_norm": 0.05362692102789879, + "learning_rate": 0.00026206074092154276, + "loss": 0.0345, + "step": 1068 + }, + { + "epoch": 0.7, + "grad_norm": 0.09251823276281357, + "learning_rate": 0.0002619921839355849, + "loss": 0.0423, + "step": 1069 + }, + { + "epoch": 0.7, + "grad_norm": 0.09250043332576752, + "learning_rate": 0.000261923574048491, + "loss": 0.0433, + "step": 1070 + }, + { + "epoch": 0.7, + "grad_norm": 0.04056562855839729, + "learning_rate": 0.0002618549112926698, + "loss": 0.0134, + "step": 1071 + }, + { + "epoch": 0.7, + "grad_norm": 0.04624701663851738, + "learning_rate": 0.0002617861957005551, + "loss": 0.0204, + "step": 1072 + }, + { + "epoch": 0.7, + "grad_norm": 0.09492779523134232, + "learning_rate": 0.00026171742730460583, + "loss": 0.0252, + "step": 1073 + }, + { + "epoch": 0.7, + "grad_norm": 0.07661382853984833, + "learning_rate": 0.00026164860613730567, + "loss": 0.0164, + "step": 1074 + }, + { + "epoch": 0.7, + "grad_norm": 0.2870001196861267, + "learning_rate": 0.0002615797322311633, + "loss": 0.0362, + "step": 1075 + }, + { + "epoch": 0.7, + "grad_norm": 0.0941600501537323, + "learning_rate": 0.0002615108056187123, + "loss": 0.0277, + "step": 1076 + }, + { + "epoch": 0.71, + "grad_norm": 0.09941410273313522, + "learning_rate": 0.00026144182633251127, + "loss": 0.0271, + "step": 1077 + }, + { + "epoch": 0.71, + "grad_norm": 0.06893230229616165, + "learning_rate": 0.0002613727944051434, + "loss": 0.0264, + "step": 1078 + }, + { + "epoch": 0.71, + "grad_norm": 0.09225239604711533, + "learning_rate": 0.00026130370986921707, + "loss": 0.0124, + "step": 1079 + }, + { + "epoch": 0.71, + "grad_norm": 0.13335295021533966, + "learning_rate": 0.0002612345727573653, + "loss": 0.0658, + "step": 1080 + }, + { + "epoch": 0.71, + "grad_norm": 0.08353302627801895, + "learning_rate": 0.000261165383102246, + "loss": 0.0168, + "step": 1081 + }, + { + "epoch": 0.71, + "grad_norm": 0.16986088454723358, + "learning_rate": 0.00026109614093654195, + "loss": 0.0857, + "step": 1082 + }, + { + "epoch": 0.71, + "grad_norm": 0.07607953995466232, + "learning_rate": 0.00026102684629296065, + "loss": 0.01, + "step": 1083 + }, + { + "epoch": 0.71, + "grad_norm": 0.1080528199672699, + "learning_rate": 0.00026095749920423446, + "loss": 0.0605, + "step": 1084 + }, + { + "epoch": 0.71, + "grad_norm": 0.14226533472537994, + "learning_rate": 0.0002608880997031205, + "loss": 0.0323, + "step": 1085 + }, + { + "epoch": 0.71, + "grad_norm": 0.0267617329955101, + "learning_rate": 0.0002608186478224006, + "loss": 0.0046, + "step": 1086 + }, + { + "epoch": 0.71, + "grad_norm": 0.05727904289960861, + "learning_rate": 0.00026074914359488143, + "loss": 0.0111, + "step": 1087 + }, + { + "epoch": 0.71, + "grad_norm": 0.08158308267593384, + "learning_rate": 0.0002606795870533942, + "loss": 0.0227, + "step": 1088 + }, + { + "epoch": 0.71, + "grad_norm": 0.17422080039978027, + "learning_rate": 0.00026060997823079506, + "loss": 0.0583, + "step": 1089 + }, + { + "epoch": 0.71, + "grad_norm": 0.19084464013576508, + "learning_rate": 0.0002605403171599647, + "loss": 0.0736, + "step": 1090 + }, + { + "epoch": 0.71, + "grad_norm": 0.10208334028720856, + "learning_rate": 0.00026047060387380855, + "loss": 0.021, + "step": 1091 + }, + { + "epoch": 0.71, + "grad_norm": 0.13515685498714447, + "learning_rate": 0.0002604008384052568, + "loss": 0.0319, + "step": 1092 + }, + { + "epoch": 0.72, + "grad_norm": 0.13729439675807953, + "learning_rate": 0.00026033102078726393, + "loss": 0.0292, + "step": 1093 + }, + { + "epoch": 0.72, + "grad_norm": 0.10295616090297699, + "learning_rate": 0.0002602611510528095, + "loss": 0.0133, + "step": 1094 + }, + { + "epoch": 0.72, + "grad_norm": 0.14003846049308777, + "learning_rate": 0.0002601912292348975, + "loss": 0.0413, + "step": 1095 + }, + { + "epoch": 0.72, + "grad_norm": 0.22413985431194305, + "learning_rate": 0.0002601212553665564, + "loss": 0.0242, + "step": 1096 + }, + { + "epoch": 0.72, + "grad_norm": 0.13832725584506989, + "learning_rate": 0.0002600512294808395, + "loss": 0.0353, + "step": 1097 + }, + { + "epoch": 0.72, + "grad_norm": 0.29502683877944946, + "learning_rate": 0.0002599811516108245, + "loss": 0.0362, + "step": 1098 + }, + { + "epoch": 0.72, + "grad_norm": 0.09490124136209488, + "learning_rate": 0.00025991102178961366, + "loss": 0.014, + "step": 1099 + }, + { + "epoch": 0.72, + "grad_norm": 0.1145247370004654, + "learning_rate": 0.0002598408400503339, + "loss": 0.0294, + "step": 1100 + }, + { + "epoch": 0.72, + "grad_norm": 0.38977229595184326, + "learning_rate": 0.00025977060642613645, + "loss": 0.0827, + "step": 1101 + }, + { + "epoch": 0.72, + "grad_norm": 0.10398557782173157, + "learning_rate": 0.0002597003209501973, + "loss": 0.0176, + "step": 1102 + }, + { + "epoch": 0.72, + "grad_norm": 0.13759955763816833, + "learning_rate": 0.0002596299836557168, + "loss": 0.0428, + "step": 1103 + }, + { + "epoch": 0.72, + "grad_norm": 0.05294102802872658, + "learning_rate": 0.0002595595945759198, + "loss": 0.013, + "step": 1104 + }, + { + "epoch": 0.72, + "grad_norm": 0.2116420418024063, + "learning_rate": 0.0002594891537440556, + "loss": 0.0416, + "step": 1105 + }, + { + "epoch": 0.72, + "grad_norm": 0.0850871354341507, + "learning_rate": 0.00025941866119339786, + "loss": 0.0264, + "step": 1106 + }, + { + "epoch": 0.72, + "grad_norm": 0.04429350420832634, + "learning_rate": 0.00025934811695724484, + "loss": 0.0088, + "step": 1107 + }, + { + "epoch": 0.73, + "grad_norm": 0.0578470379114151, + "learning_rate": 0.0002592775210689192, + "loss": 0.0295, + "step": 1108 + }, + { + "epoch": 0.73, + "grad_norm": 0.1103309616446495, + "learning_rate": 0.00025920687356176784, + "loss": 0.0154, + "step": 1109 + }, + { + "epoch": 0.73, + "grad_norm": 0.09454017877578735, + "learning_rate": 0.0002591361744691622, + "loss": 0.025, + "step": 1110 + }, + { + "epoch": 0.73, + "grad_norm": 0.19059227406978607, + "learning_rate": 0.0002590654238244979, + "loss": 0.0599, + "step": 1111 + }, + { + "epoch": 0.73, + "grad_norm": 0.08629673719406128, + "learning_rate": 0.0002589946216611952, + "loss": 0.0151, + "step": 1112 + }, + { + "epoch": 0.73, + "grad_norm": 0.18637306988239288, + "learning_rate": 0.0002589237680126984, + "loss": 0.0496, + "step": 1113 + }, + { + "epoch": 0.73, + "grad_norm": 0.12386718392372131, + "learning_rate": 0.00025885286291247634, + "loss": 0.0269, + "step": 1114 + }, + { + "epoch": 0.73, + "grad_norm": 0.18383803963661194, + "learning_rate": 0.00025878190639402204, + "loss": 0.0408, + "step": 1115 + }, + { + "epoch": 0.73, + "grad_norm": 0.24928437173366547, + "learning_rate": 0.0002587108984908528, + "loss": 0.0254, + "step": 1116 + }, + { + "epoch": 0.73, + "grad_norm": 0.023719167336821556, + "learning_rate": 0.00025863983923651027, + "loss": 0.0037, + "step": 1117 + }, + { + "epoch": 0.73, + "grad_norm": 0.16337376832962036, + "learning_rate": 0.00025856872866456037, + "loss": 0.0529, + "step": 1118 + }, + { + "epoch": 0.73, + "grad_norm": 0.11658964306116104, + "learning_rate": 0.00025849756680859317, + "loss": 0.063, + "step": 1119 + }, + { + "epoch": 0.73, + "grad_norm": 0.20387554168701172, + "learning_rate": 0.000258426353702223, + "loss": 0.0605, + "step": 1120 + }, + { + "epoch": 0.73, + "grad_norm": 0.2778151035308838, + "learning_rate": 0.0002583550893790885, + "loss": 0.0476, + "step": 1121 + }, + { + "epoch": 0.73, + "grad_norm": 0.11449744552373886, + "learning_rate": 0.0002582837738728522, + "loss": 0.0315, + "step": 1122 + }, + { + "epoch": 0.74, + "grad_norm": 0.10286298394203186, + "learning_rate": 0.00025821240721720116, + "loss": 0.041, + "step": 1123 + }, + { + "epoch": 0.74, + "grad_norm": 0.11522707343101501, + "learning_rate": 0.00025814098944584645, + "loss": 0.0414, + "step": 1124 + }, + { + "epoch": 0.74, + "grad_norm": 0.06536536663770676, + "learning_rate": 0.0002580695205925233, + "loss": 0.0216, + "step": 1125 + }, + { + "epoch": 0.74, + "grad_norm": 0.0686458870768547, + "learning_rate": 0.00025799800069099105, + "loss": 0.0667, + "step": 1126 + }, + { + "epoch": 0.74, + "grad_norm": 0.07378174364566803, + "learning_rate": 0.0002579264297750331, + "loss": 0.018, + "step": 1127 + }, + { + "epoch": 0.74, + "grad_norm": 0.05744575336575508, + "learning_rate": 0.0002578548078784571, + "loss": 0.0328, + "step": 1128 + }, + { + "epoch": 0.74, + "grad_norm": 0.1781056821346283, + "learning_rate": 0.0002577831350350947, + "loss": 0.056, + "step": 1129 + }, + { + "epoch": 0.74, + "grad_norm": 0.11974502354860306, + "learning_rate": 0.0002577114112788016, + "loss": 0.0411, + "step": 1130 + }, + { + "epoch": 0.74, + "grad_norm": 0.07625679671764374, + "learning_rate": 0.00025763963664345745, + "loss": 0.0332, + "step": 1131 + }, + { + "epoch": 0.74, + "grad_norm": 0.07967997342348099, + "learning_rate": 0.00025756781116296617, + "loss": 0.0431, + "step": 1132 + }, + { + "epoch": 0.74, + "grad_norm": 0.14101997017860413, + "learning_rate": 0.0002574959348712555, + "loss": 0.0322, + "step": 1133 + }, + { + "epoch": 0.74, + "grad_norm": 0.12365719676017761, + "learning_rate": 0.00025742400780227724, + "loss": 0.0205, + "step": 1134 + }, + { + "epoch": 0.74, + "grad_norm": 0.14429523050785065, + "learning_rate": 0.0002573520299900073, + "loss": 0.069, + "step": 1135 + }, + { + "epoch": 0.74, + "grad_norm": 0.021441614255309105, + "learning_rate": 0.0002572800014684453, + "loss": 0.0054, + "step": 1136 + }, + { + "epoch": 0.74, + "grad_norm": 0.08611132204532623, + "learning_rate": 0.0002572079222716151, + "loss": 0.0442, + "step": 1137 + }, + { + "epoch": 0.75, + "grad_norm": 0.09402936697006226, + "learning_rate": 0.0002571357924335642, + "loss": 0.0352, + "step": 1138 + }, + { + "epoch": 0.75, + "grad_norm": 0.08581096678972244, + "learning_rate": 0.00025706361198836437, + "loss": 0.0149, + "step": 1139 + }, + { + "epoch": 0.75, + "grad_norm": 0.0612567737698555, + "learning_rate": 0.0002569913809701109, + "loss": 0.014, + "step": 1140 + }, + { + "epoch": 0.75, + "grad_norm": 0.10282464325428009, + "learning_rate": 0.0002569190994129233, + "loss": 0.0254, + "step": 1141 + }, + { + "epoch": 0.75, + "grad_norm": 0.07298202067613602, + "learning_rate": 0.00025684676735094475, + "loss": 0.033, + "step": 1142 + }, + { + "epoch": 0.75, + "grad_norm": 0.06616336852312088, + "learning_rate": 0.0002567743848183423, + "loss": 0.0127, + "step": 1143 + }, + { + "epoch": 0.75, + "grad_norm": 0.09016578644514084, + "learning_rate": 0.000256701951849307, + "loss": 0.0248, + "step": 1144 + }, + { + "epoch": 0.75, + "grad_norm": 0.09605623781681061, + "learning_rate": 0.0002566294684780536, + "loss": 0.0554, + "step": 1145 + }, + { + "epoch": 0.75, + "grad_norm": 0.13209934532642365, + "learning_rate": 0.0002565569347388206, + "loss": 0.0437, + "step": 1146 + }, + { + "epoch": 0.75, + "eval_loss": 0.030348777770996094, + "eval_runtime": 39.9058, + "eval_samples_per_second": 32.251, + "eval_steps_per_second": 8.069, + "step": 1146 + }, + { + "epoch": 0.75, + "grad_norm": 0.13489413261413574, + "learning_rate": 0.0002564843506658704, + "loss": 0.0214, + "step": 1147 + }, + { + "epoch": 0.75, + "grad_norm": 0.036875851452350616, + "learning_rate": 0.00025641171629348916, + "loss": 0.0075, + "step": 1148 + }, + { + "epoch": 0.75, + "grad_norm": 0.04911373555660248, + "learning_rate": 0.0002563390316559868, + "loss": 0.0331, + "step": 1149 + }, + { + "epoch": 0.75, + "grad_norm": 0.02945212461054325, + "learning_rate": 0.0002562662967876969, + "loss": 0.0044, + "step": 1150 + }, + { + "epoch": 0.75, + "grad_norm": 0.09545271843671799, + "learning_rate": 0.00025619351172297686, + "loss": 0.0342, + "step": 1151 + }, + { + "epoch": 0.75, + "grad_norm": 0.034161727875471115, + "learning_rate": 0.0002561206764962079, + "loss": 0.0064, + "step": 1152 + }, + { + "epoch": 0.75, + "grad_norm": 0.17162153124809265, + "learning_rate": 0.00025604779114179457, + "loss": 0.0305, + "step": 1153 + }, + { + "epoch": 0.76, + "grad_norm": 0.10241468250751495, + "learning_rate": 0.0002559748556941654, + "loss": 0.0143, + "step": 1154 + }, + { + "epoch": 0.76, + "grad_norm": 0.19089680910110474, + "learning_rate": 0.0002559018701877726, + "loss": 0.0192, + "step": 1155 + }, + { + "epoch": 0.76, + "grad_norm": 0.19189144670963287, + "learning_rate": 0.0002558288346570918, + "loss": 0.0385, + "step": 1156 + }, + { + "epoch": 0.76, + "grad_norm": 0.023649632930755615, + "learning_rate": 0.00025575574913662256, + "loss": 0.0043, + "step": 1157 + }, + { + "epoch": 0.76, + "grad_norm": 0.20011720061302185, + "learning_rate": 0.0002556826136608877, + "loss": 0.0361, + "step": 1158 + }, + { + "epoch": 0.76, + "grad_norm": 0.3903810679912567, + "learning_rate": 0.00025560942826443396, + "loss": 0.1086, + "step": 1159 + }, + { + "epoch": 0.76, + "grad_norm": 0.0918634682893753, + "learning_rate": 0.0002555361929818315, + "loss": 0.0237, + "step": 1160 + }, + { + "epoch": 0.76, + "grad_norm": 0.11210468411445618, + "learning_rate": 0.00025546290784767407, + "loss": 0.0432, + "step": 1161 + }, + { + "epoch": 0.76, + "grad_norm": 0.10598167777061462, + "learning_rate": 0.000255389572896579, + "loss": 0.0304, + "step": 1162 + }, + { + "epoch": 0.76, + "grad_norm": 0.03547512739896774, + "learning_rate": 0.00025531618816318697, + "loss": 0.014, + "step": 1163 + }, + { + "epoch": 0.76, + "grad_norm": 0.08146083354949951, + "learning_rate": 0.00025524275368216245, + "loss": 0.0122, + "step": 1164 + }, + { + "epoch": 0.76, + "grad_norm": 0.046655625104904175, + "learning_rate": 0.00025516926948819334, + "loss": 0.0151, + "step": 1165 + }, + { + "epoch": 0.76, + "grad_norm": 0.09417696297168732, + "learning_rate": 0.0002550957356159908, + "loss": 0.047, + "step": 1166 + }, + { + "epoch": 0.76, + "grad_norm": 0.08695515990257263, + "learning_rate": 0.00025502215210028976, + "loss": 0.0363, + "step": 1167 + }, + { + "epoch": 0.76, + "grad_norm": 0.05286262556910515, + "learning_rate": 0.0002549485189758485, + "loss": 0.0331, + "step": 1168 + }, + { + "epoch": 0.77, + "grad_norm": 0.1305568516254425, + "learning_rate": 0.0002548748362774485, + "loss": 0.0552, + "step": 1169 + }, + { + "epoch": 0.77, + "grad_norm": 0.15096144378185272, + "learning_rate": 0.000254801104039895, + "loss": 0.0341, + "step": 1170 + }, + { + "epoch": 0.77, + "grad_norm": 0.07643090933561325, + "learning_rate": 0.0002547273222980165, + "loss": 0.0234, + "step": 1171 + }, + { + "epoch": 0.77, + "grad_norm": 0.052111852914094925, + "learning_rate": 0.0002546534910866648, + "loss": 0.0278, + "step": 1172 + }, + { + "epoch": 0.77, + "grad_norm": 0.15109075605869293, + "learning_rate": 0.00025457961044071523, + "loss": 0.039, + "step": 1173 + }, + { + "epoch": 0.77, + "grad_norm": 0.05562788248062134, + "learning_rate": 0.00025450568039506633, + "loss": 0.0214, + "step": 1174 + }, + { + "epoch": 0.77, + "grad_norm": 0.1751837581396103, + "learning_rate": 0.00025443170098464, + "loss": 0.0401, + "step": 1175 + }, + { + "epoch": 0.77, + "grad_norm": 0.19507139921188354, + "learning_rate": 0.0002543576722443816, + "loss": 0.0331, + "step": 1176 + }, + { + "epoch": 0.77, + "grad_norm": 0.10975005477666855, + "learning_rate": 0.00025428359420925966, + "loss": 0.0155, + "step": 1177 + }, + { + "epoch": 0.77, + "grad_norm": 0.1416396051645279, + "learning_rate": 0.00025420946691426586, + "loss": 0.0473, + "step": 1178 + }, + { + "epoch": 0.77, + "grad_norm": 0.03987191617488861, + "learning_rate": 0.0002541352903944155, + "loss": 0.0069, + "step": 1179 + }, + { + "epoch": 0.77, + "grad_norm": 0.34085920453071594, + "learning_rate": 0.00025406106468474685, + "loss": 0.0919, + "step": 1180 + }, + { + "epoch": 0.77, + "grad_norm": 0.06129152700304985, + "learning_rate": 0.0002539867898203215, + "loss": 0.0129, + "step": 1181 + }, + { + "epoch": 0.77, + "grad_norm": 0.08059722930192947, + "learning_rate": 0.00025391246583622427, + "loss": 0.0172, + "step": 1182 + }, + { + "epoch": 0.77, + "grad_norm": 0.12509244680404663, + "learning_rate": 0.0002538380927675632, + "loss": 0.0881, + "step": 1183 + }, + { + "epoch": 0.78, + "grad_norm": 0.21917979419231415, + "learning_rate": 0.00025376367064946945, + "loss": 0.0438, + "step": 1184 + }, + { + "epoch": 0.78, + "grad_norm": 0.05029948800802231, + "learning_rate": 0.0002536891995170974, + "loss": 0.0102, + "step": 1185 + }, + { + "epoch": 0.78, + "grad_norm": 0.027424413710832596, + "learning_rate": 0.00025361467940562463, + "loss": 0.0053, + "step": 1186 + }, + { + "epoch": 0.78, + "grad_norm": 0.0775713250041008, + "learning_rate": 0.0002535401103502517, + "loss": 0.0329, + "step": 1187 + }, + { + "epoch": 0.78, + "grad_norm": 0.12953567504882812, + "learning_rate": 0.0002534654923862025, + "loss": 0.0371, + "step": 1188 + }, + { + "epoch": 0.78, + "grad_norm": 0.07097966223955154, + "learning_rate": 0.00025339082554872377, + "loss": 0.0165, + "step": 1189 + }, + { + "epoch": 0.78, + "grad_norm": 0.1304195523262024, + "learning_rate": 0.0002533161098730856, + "loss": 0.0386, + "step": 1190 + }, + { + "epoch": 0.78, + "grad_norm": 0.06887423247098923, + "learning_rate": 0.00025324134539458096, + "loss": 0.0221, + "step": 1191 + }, + { + "epoch": 0.78, + "grad_norm": 0.08637112379074097, + "learning_rate": 0.00025316653214852596, + "loss": 0.0341, + "step": 1192 + }, + { + "epoch": 0.78, + "grad_norm": 0.04632532596588135, + "learning_rate": 0.0002530916701702597, + "loss": 0.0094, + "step": 1193 + }, + { + "epoch": 0.78, + "grad_norm": 0.11397617310285568, + "learning_rate": 0.00025301675949514435, + "loss": 0.0167, + "step": 1194 + }, + { + "epoch": 0.78, + "grad_norm": 0.04785558953881264, + "learning_rate": 0.000252941800158565, + "loss": 0.0189, + "step": 1195 + }, + { + "epoch": 0.78, + "grad_norm": 0.24082554876804352, + "learning_rate": 0.00025286679219593, + "loss": 0.0472, + "step": 1196 + }, + { + "epoch": 0.78, + "grad_norm": 0.14454412460327148, + "learning_rate": 0.00025279173564267014, + "loss": 0.0521, + "step": 1197 + }, + { + "epoch": 0.78, + "grad_norm": 0.16198396682739258, + "learning_rate": 0.00025271663053423967, + "loss": 0.0606, + "step": 1198 + }, + { + "epoch": 0.78, + "grad_norm": 0.114061638712883, + "learning_rate": 0.0002526414769061155, + "loss": 0.012, + "step": 1199 + }, + { + "epoch": 0.79, + "grad_norm": 0.1736219972372055, + "learning_rate": 0.00025256627479379755, + "loss": 0.0516, + "step": 1200 + }, + { + "epoch": 0.79, + "grad_norm": 0.04280832037329674, + "learning_rate": 0.0002524910242328087, + "loss": 0.0073, + "step": 1201 + }, + { + "epoch": 0.79, + "grad_norm": 0.13054266571998596, + "learning_rate": 0.0002524157252586946, + "loss": 0.0295, + "step": 1202 + }, + { + "epoch": 0.79, + "grad_norm": 0.24452893435955048, + "learning_rate": 0.00025234037790702375, + "loss": 0.0856, + "step": 1203 + }, + { + "epoch": 0.79, + "grad_norm": 0.05776005983352661, + "learning_rate": 0.0002522649822133877, + "loss": 0.0152, + "step": 1204 + }, + { + "epoch": 0.79, + "grad_norm": 0.27971917390823364, + "learning_rate": 0.0002521895382134006, + "loss": 0.1183, + "step": 1205 + }, + { + "epoch": 0.79, + "grad_norm": 0.04905636981129646, + "learning_rate": 0.0002521140459426995, + "loss": 0.0126, + "step": 1206 + }, + { + "epoch": 0.79, + "grad_norm": 0.15006506443023682, + "learning_rate": 0.0002520385054369444, + "loss": 0.0811, + "step": 1207 + }, + { + "epoch": 0.79, + "grad_norm": 0.15131042897701263, + "learning_rate": 0.00025196291673181784, + "loss": 0.0401, + "step": 1208 + }, + { + "epoch": 0.79, + "grad_norm": 0.1603415459394455, + "learning_rate": 0.0002518872798630253, + "loss": 0.0448, + "step": 1209 + }, + { + "epoch": 0.79, + "grad_norm": 0.07513672858476639, + "learning_rate": 0.0002518115948662949, + "loss": 0.0401, + "step": 1210 + }, + { + "epoch": 0.79, + "grad_norm": 0.11225542426109314, + "learning_rate": 0.0002517358617773776, + "loss": 0.039, + "step": 1211 + }, + { + "epoch": 0.79, + "grad_norm": 0.0876198261976242, + "learning_rate": 0.000251660080632047, + "loss": 0.0239, + "step": 1212 + }, + { + "epoch": 0.79, + "grad_norm": 0.1050589308142662, + "learning_rate": 0.0002515842514660994, + "loss": 0.0258, + "step": 1213 + }, + { + "epoch": 0.79, + "grad_norm": 0.0426226444542408, + "learning_rate": 0.0002515083743153539, + "loss": 0.0111, + "step": 1214 + }, + { + "epoch": 0.8, + "grad_norm": 0.09025552123785019, + "learning_rate": 0.00025143244921565214, + "loss": 0.0185, + "step": 1215 + }, + { + "epoch": 0.8, + "grad_norm": 0.12371645122766495, + "learning_rate": 0.00025135647620285834, + "loss": 0.0326, + "step": 1216 + }, + { + "epoch": 0.8, + "grad_norm": 0.07417233288288116, + "learning_rate": 0.0002512804553128596, + "loss": 0.0238, + "step": 1217 + }, + { + "epoch": 0.8, + "grad_norm": 0.10499947518110275, + "learning_rate": 0.0002512043865815654, + "loss": 0.0464, + "step": 1218 + }, + { + "epoch": 0.8, + "grad_norm": 0.16344919800758362, + "learning_rate": 0.00025112827004490797, + "loss": 0.0373, + "step": 1219 + }, + { + "epoch": 0.8, + "grad_norm": 0.0862027183175087, + "learning_rate": 0.00025105210573884203, + "loss": 0.0178, + "step": 1220 + }, + { + "epoch": 0.8, + "grad_norm": 0.10541030019521713, + "learning_rate": 0.0002509758936993449, + "loss": 0.0377, + "step": 1221 + }, + { + "epoch": 0.8, + "grad_norm": 0.05190376564860344, + "learning_rate": 0.00025089963396241643, + "loss": 0.0099, + "step": 1222 + }, + { + "epoch": 0.8, + "grad_norm": 0.09249959141016006, + "learning_rate": 0.00025082332656407906, + "loss": 0.0157, + "step": 1223 + }, + { + "epoch": 0.8, + "grad_norm": 0.02348952367901802, + "learning_rate": 0.00025074697154037765, + "loss": 0.0041, + "step": 1224 + }, + { + "epoch": 0.8, + "grad_norm": 0.12875327467918396, + "learning_rate": 0.0002506705689273797, + "loss": 0.0173, + "step": 1225 + }, + { + "epoch": 0.8, + "grad_norm": 0.13971397280693054, + "learning_rate": 0.0002505941187611749, + "loss": 0.0381, + "step": 1226 + }, + { + "epoch": 0.8, + "grad_norm": 0.21139316260814667, + "learning_rate": 0.00025051762107787583, + "loss": 0.0399, + "step": 1227 + }, + { + "epoch": 0.8, + "grad_norm": 0.10346369445323944, + "learning_rate": 0.0002504410759136171, + "loss": 0.031, + "step": 1228 + }, + { + "epoch": 0.8, + "grad_norm": 0.021524077281355858, + "learning_rate": 0.00025036448330455603, + "loss": 0.0041, + "step": 1229 + }, + { + "epoch": 0.81, + "grad_norm": 0.21078258752822876, + "learning_rate": 0.0002502878432868722, + "loss": 0.0291, + "step": 1230 + }, + { + "epoch": 0.81, + "grad_norm": 0.28720253705978394, + "learning_rate": 0.00025021115589676774, + "loss": 0.0318, + "step": 1231 + }, + { + "epoch": 0.81, + "grad_norm": 0.2182384580373764, + "learning_rate": 0.00025013442117046694, + "loss": 0.0407, + "step": 1232 + }, + { + "epoch": 0.81, + "grad_norm": 0.1223733052611351, + "learning_rate": 0.0002500576391442166, + "loss": 0.0189, + "step": 1233 + }, + { + "epoch": 0.81, + "grad_norm": 0.1699313372373581, + "learning_rate": 0.0002499808098542858, + "loss": 0.1081, + "step": 1234 + }, + { + "epoch": 0.81, + "grad_norm": 0.21604309976100922, + "learning_rate": 0.00024990393333696603, + "loss": 0.0406, + "step": 1235 + }, + { + "epoch": 0.81, + "grad_norm": 0.11065655201673508, + "learning_rate": 0.00024982700962857094, + "loss": 0.0274, + "step": 1236 + }, + { + "epoch": 0.81, + "grad_norm": 0.10013590008020401, + "learning_rate": 0.0002497500387654367, + "loss": 0.0138, + "step": 1237 + }, + { + "epoch": 0.81, + "grad_norm": 0.03474019467830658, + "learning_rate": 0.0002496730207839215, + "loss": 0.0067, + "step": 1238 + }, + { + "epoch": 0.81, + "grad_norm": 0.1373460739850998, + "learning_rate": 0.00024959595572040594, + "loss": 0.0382, + "step": 1239 + }, + { + "epoch": 0.81, + "grad_norm": 0.1674460619688034, + "learning_rate": 0.0002495188436112928, + "loss": 0.0187, + "step": 1240 + }, + { + "epoch": 0.81, + "grad_norm": 0.056852634996175766, + "learning_rate": 0.0002494416844930072, + "loss": 0.02, + "step": 1241 + }, + { + "epoch": 0.81, + "grad_norm": 0.1567879319190979, + "learning_rate": 0.00024936447840199626, + "loss": 0.0488, + "step": 1242 + }, + { + "epoch": 0.81, + "grad_norm": 0.19893474876880646, + "learning_rate": 0.0002492872253747294, + "loss": 0.0382, + "step": 1243 + }, + { + "epoch": 0.81, + "grad_norm": 0.07066723704338074, + "learning_rate": 0.0002492099254476983, + "loss": 0.0194, + "step": 1244 + }, + { + "epoch": 0.82, + "grad_norm": 0.11466959118843079, + "learning_rate": 0.00024913257865741663, + "loss": 0.0367, + "step": 1245 + }, + { + "epoch": 0.82, + "grad_norm": 0.08930857479572296, + "learning_rate": 0.0002490551850404203, + "loss": 0.0186, + "step": 1246 + }, + { + "epoch": 0.82, + "grad_norm": 0.0905904471874237, + "learning_rate": 0.0002489777446332673, + "loss": 0.0349, + "step": 1247 + }, + { + "epoch": 0.82, + "grad_norm": 0.225018709897995, + "learning_rate": 0.0002489002574725378, + "loss": 0.0579, + "step": 1248 + }, + { + "epoch": 0.82, + "grad_norm": 0.15631456673145294, + "learning_rate": 0.0002488227235948339, + "loss": 0.0361, + "step": 1249 + }, + { + "epoch": 0.82, + "grad_norm": 0.06862124055624008, + "learning_rate": 0.0002487451430367798, + "loss": 0.0351, + "step": 1250 + }, + { + "epoch": 0.82, + "grad_norm": 0.10271900147199631, + "learning_rate": 0.00024866751583502194, + "loss": 0.0393, + "step": 1251 + }, + { + "epoch": 0.82, + "grad_norm": 0.12624254822731018, + "learning_rate": 0.0002485898420262286, + "loss": 0.0309, + "step": 1252 + }, + { + "epoch": 0.82, + "grad_norm": 0.116575688123703, + "learning_rate": 0.00024851212164709013, + "loss": 0.058, + "step": 1253 + }, + { + "epoch": 0.82, + "grad_norm": 0.06756250560283661, + "learning_rate": 0.00024843435473431886, + "loss": 0.0335, + "step": 1254 + }, + { + "epoch": 0.82, + "grad_norm": 0.20835717022418976, + "learning_rate": 0.0002483565413246492, + "loss": 0.0389, + "step": 1255 + }, + { + "epoch": 0.82, + "grad_norm": 0.04360177740454674, + "learning_rate": 0.0002482786814548374, + "loss": 0.008, + "step": 1256 + }, + { + "epoch": 0.82, + "grad_norm": 0.1068229153752327, + "learning_rate": 0.0002482007751616616, + "loss": 0.0304, + "step": 1257 + }, + { + "epoch": 0.82, + "grad_norm": 0.04819338023662567, + "learning_rate": 0.0002481228224819221, + "loss": 0.0098, + "step": 1258 + }, + { + "epoch": 0.82, + "grad_norm": 0.48405715823173523, + "learning_rate": 0.00024804482345244105, + "loss": 0.0348, + "step": 1259 + }, + { + "epoch": 0.82, + "grad_norm": 0.09796518087387085, + "learning_rate": 0.0002479667781100622, + "loss": 0.0153, + "step": 1260 + }, + { + "epoch": 0.83, + "grad_norm": 0.13171538710594177, + "learning_rate": 0.0002478886864916516, + "loss": 0.0316, + "step": 1261 + }, + { + "epoch": 0.83, + "grad_norm": 0.0907411128282547, + "learning_rate": 0.00024781054863409676, + "loss": 0.0169, + "step": 1262 + }, + { + "epoch": 0.83, + "grad_norm": 0.10159718245267868, + "learning_rate": 0.00024773236457430745, + "loss": 0.013, + "step": 1263 + }, + { + "epoch": 0.83, + "grad_norm": 0.10823512077331543, + "learning_rate": 0.00024765413434921495, + "loss": 0.0252, + "step": 1264 + }, + { + "epoch": 0.83, + "grad_norm": 0.07199376821517944, + "learning_rate": 0.0002475758579957724, + "loss": 0.0105, + "step": 1265 + }, + { + "epoch": 0.83, + "grad_norm": 0.11216728389263153, + "learning_rate": 0.0002474975355509549, + "loss": 0.0339, + "step": 1266 + }, + { + "epoch": 0.83, + "grad_norm": 0.16655175387859344, + "learning_rate": 0.00024741916705175906, + "loss": 0.0306, + "step": 1267 + }, + { + "epoch": 0.83, + "grad_norm": 0.08566506952047348, + "learning_rate": 0.00024734075253520345, + "loss": 0.0329, + "step": 1268 + }, + { + "epoch": 0.83, + "grad_norm": 0.1542367786169052, + "learning_rate": 0.00024726229203832824, + "loss": 0.0284, + "step": 1269 + }, + { + "epoch": 0.83, + "grad_norm": 0.1685347855091095, + "learning_rate": 0.00024718378559819554, + "loss": 0.0385, + "step": 1270 + }, + { + "epoch": 0.83, + "grad_norm": 0.1904221624135971, + "learning_rate": 0.00024710523325188885, + "loss": 0.0435, + "step": 1271 + }, + { + "epoch": 0.83, + "grad_norm": 0.10915929824113846, + "learning_rate": 0.00024702663503651357, + "loss": 0.0129, + "step": 1272 + }, + { + "epoch": 0.83, + "grad_norm": 0.04411763325333595, + "learning_rate": 0.0002469479909891967, + "loss": 0.0038, + "step": 1273 + }, + { + "epoch": 0.83, + "grad_norm": 0.22485259175300598, + "learning_rate": 0.0002468693011470869, + "loss": 0.0456, + "step": 1274 + }, + { + "epoch": 0.83, + "grad_norm": 0.10708510875701904, + "learning_rate": 0.00024679056554735454, + "loss": 0.0192, + "step": 1275 + }, + { + "epoch": 0.84, + "grad_norm": 0.15084552764892578, + "learning_rate": 0.00024671178422719137, + "loss": 0.0293, + "step": 1276 + }, + { + "epoch": 0.84, + "grad_norm": 0.14543551206588745, + "learning_rate": 0.000246632957223811, + "loss": 0.0666, + "step": 1277 + }, + { + "epoch": 0.84, + "grad_norm": 0.1648811399936676, + "learning_rate": 0.00024655408457444853, + "loss": 0.0321, + "step": 1278 + }, + { + "epoch": 0.84, + "grad_norm": 0.16748228669166565, + "learning_rate": 0.00024647516631636055, + "loss": 0.0373, + "step": 1279 + }, + { + "epoch": 0.84, + "grad_norm": 0.04038754105567932, + "learning_rate": 0.00024639620248682523, + "loss": 0.0049, + "step": 1280 + }, + { + "epoch": 0.84, + "grad_norm": 0.1675775945186615, + "learning_rate": 0.00024631719312314234, + "loss": 0.0517, + "step": 1281 + }, + { + "epoch": 0.84, + "grad_norm": 0.227004274725914, + "learning_rate": 0.00024623813826263303, + "loss": 0.0445, + "step": 1282 + }, + { + "epoch": 0.84, + "grad_norm": 0.05555510148406029, + "learning_rate": 0.00024615903794264005, + "loss": 0.0096, + "step": 1283 + }, + { + "epoch": 0.84, + "grad_norm": 0.16279524564743042, + "learning_rate": 0.00024607989220052766, + "loss": 0.0452, + "step": 1284 + }, + { + "epoch": 0.84, + "grad_norm": 0.22099511325359344, + "learning_rate": 0.0002460007010736814, + "loss": 0.0484, + "step": 1285 + }, + { + "epoch": 0.84, + "grad_norm": 0.3313157558441162, + "learning_rate": 0.00024592146459950835, + "loss": 0.0798, + "step": 1286 + }, + { + "epoch": 0.84, + "grad_norm": 0.1560799926519394, + "learning_rate": 0.0002458421828154371, + "loss": 0.0523, + "step": 1287 + }, + { + "epoch": 0.84, + "grad_norm": 0.0924949198961258, + "learning_rate": 0.0002457628557589174, + "loss": 0.0416, + "step": 1288 + }, + { + "epoch": 0.84, + "grad_norm": 0.061663124710321426, + "learning_rate": 0.0002456834834674207, + "loss": 0.0187, + "step": 1289 + }, + { + "epoch": 0.84, + "grad_norm": 0.04804534092545509, + "learning_rate": 0.0002456040659784396, + "loss": 0.0236, + "step": 1290 + }, + { + "epoch": 0.85, + "grad_norm": 0.09753583371639252, + "learning_rate": 0.00024552460332948804, + "loss": 0.0447, + "step": 1291 + }, + { + "epoch": 0.85, + "grad_norm": 0.03994222357869148, + "learning_rate": 0.0002454450955581015, + "loss": 0.0098, + "step": 1292 + }, + { + "epoch": 0.85, + "grad_norm": 0.12844492495059967, + "learning_rate": 0.0002453655427018364, + "loss": 0.0234, + "step": 1293 + }, + { + "epoch": 0.85, + "grad_norm": 0.12967482209205627, + "learning_rate": 0.000245285944798271, + "loss": 0.0435, + "step": 1294 + }, + { + "epoch": 0.85, + "grad_norm": 0.25114384293556213, + "learning_rate": 0.00024520630188500423, + "loss": 0.0539, + "step": 1295 + }, + { + "epoch": 0.85, + "grad_norm": 0.25040391087532043, + "learning_rate": 0.0002451266139996568, + "loss": 0.037, + "step": 1296 + }, + { + "epoch": 0.85, + "grad_norm": 0.21144863963127136, + "learning_rate": 0.0002450468811798703, + "loss": 0.0371, + "step": 1297 + }, + { + "epoch": 0.85, + "grad_norm": 0.2176048457622528, + "learning_rate": 0.00024496710346330776, + "loss": 0.0311, + "step": 1298 + }, + { + "epoch": 0.85, + "grad_norm": 0.06802531331777573, + "learning_rate": 0.0002448872808876533, + "loss": 0.0095, + "step": 1299 + }, + { + "epoch": 0.85, + "grad_norm": 0.11026319861412048, + "learning_rate": 0.0002448074134906123, + "loss": 0.0132, + "step": 1300 + }, + { + "epoch": 0.85, + "grad_norm": 0.2511361539363861, + "learning_rate": 0.00024472750130991126, + "loss": 0.0091, + "step": 1301 + }, + { + "epoch": 0.85, + "grad_norm": 0.42377692461013794, + "learning_rate": 0.0002446475443832979, + "loss": 0.0669, + "step": 1302 + }, + { + "epoch": 0.85, + "grad_norm": 0.587988555431366, + "learning_rate": 0.000244567542748541, + "loss": 0.0737, + "step": 1303 + }, + { + "epoch": 0.85, + "grad_norm": 0.1163543239235878, + "learning_rate": 0.0002444874964434305, + "loss": 0.0151, + "step": 1304 + }, + { + "epoch": 0.85, + "grad_norm": 0.036374811083078384, + "learning_rate": 0.00024440740550577754, + "loss": 0.0067, + "step": 1305 + }, + { + "epoch": 0.85, + "grad_norm": 0.07870301604270935, + "learning_rate": 0.00024432726997341403, + "loss": 0.0191, + "step": 1306 + }, + { + "epoch": 0.86, + "grad_norm": 0.09554275870323181, + "learning_rate": 0.0002442470898841933, + "loss": 0.0169, + "step": 1307 + }, + { + "epoch": 0.86, + "grad_norm": 0.20255301892757416, + "learning_rate": 0.0002441668652759896, + "loss": 0.0404, + "step": 1308 + }, + { + "epoch": 0.86, + "grad_norm": 0.015268395654857159, + "learning_rate": 0.0002440865961866981, + "loss": 0.002, + "step": 1309 + }, + { + "epoch": 0.86, + "grad_norm": 0.08300946652889252, + "learning_rate": 0.0002440062826542351, + "loss": 0.0281, + "step": 1310 + }, + { + "epoch": 0.86, + "grad_norm": 0.2025083601474762, + "learning_rate": 0.00024392592471653786, + "loss": 0.0407, + "step": 1311 + }, + { + "epoch": 0.86, + "grad_norm": 0.01875820755958557, + "learning_rate": 0.0002438455224115647, + "loss": 0.0024, + "step": 1312 + }, + { + "epoch": 0.86, + "grad_norm": 0.16822032630443573, + "learning_rate": 0.0002437650757772947, + "loss": 0.0356, + "step": 1313 + }, + { + "epoch": 0.86, + "grad_norm": 0.307230681180954, + "learning_rate": 0.0002436845848517281, + "loss": 0.0277, + "step": 1314 + }, + { + "epoch": 0.86, + "grad_norm": 0.21822179853916168, + "learning_rate": 0.00024360404967288586, + "loss": 0.0153, + "step": 1315 + }, + { + "epoch": 0.86, + "grad_norm": 0.18913914263248444, + "learning_rate": 0.00024352347027881003, + "loss": 0.0664, + "step": 1316 + }, + { + "epoch": 0.86, + "grad_norm": 0.26664435863494873, + "learning_rate": 0.0002434428467075634, + "loss": 0.0821, + "step": 1317 + }, + { + "epoch": 0.86, + "grad_norm": 0.15764296054840088, + "learning_rate": 0.00024336217899722967, + "loss": 0.0663, + "step": 1318 + }, + { + "epoch": 0.86, + "grad_norm": 0.09952249377965927, + "learning_rate": 0.00024328146718591352, + "loss": 0.0497, + "step": 1319 + }, + { + "epoch": 0.86, + "grad_norm": 0.20798932015895844, + "learning_rate": 0.00024320071131174022, + "loss": 0.0448, + "step": 1320 + }, + { + "epoch": 0.86, + "grad_norm": 0.2187434434890747, + "learning_rate": 0.00024311991141285602, + "loss": 0.0547, + "step": 1321 + }, + { + "epoch": 0.87, + "grad_norm": 0.08128710836172104, + "learning_rate": 0.00024303906752742797, + "loss": 0.0232, + "step": 1322 + }, + { + "epoch": 0.87, + "grad_norm": 0.1579497754573822, + "learning_rate": 0.00024295817969364382, + "loss": 0.0368, + "step": 1323 + }, + { + "epoch": 0.87, + "grad_norm": 0.3530323803424835, + "learning_rate": 0.00024287724794971207, + "loss": 0.0543, + "step": 1324 + }, + { + "epoch": 0.87, + "grad_norm": 0.13028964400291443, + "learning_rate": 0.00024279627233386212, + "loss": 0.0562, + "step": 1325 + }, + { + "epoch": 0.87, + "grad_norm": 0.17670606076717377, + "learning_rate": 0.00024271525288434385, + "loss": 0.033, + "step": 1326 + }, + { + "epoch": 0.87, + "grad_norm": 0.14736194908618927, + "learning_rate": 0.00024263418963942808, + "loss": 0.0403, + "step": 1327 + }, + { + "epoch": 0.87, + "grad_norm": 0.2033924162387848, + "learning_rate": 0.00024255308263740618, + "loss": 0.0584, + "step": 1328 + }, + { + "epoch": 0.87, + "grad_norm": 0.08926638215780258, + "learning_rate": 0.00024247193191659016, + "loss": 0.0368, + "step": 1329 + }, + { + "epoch": 0.87, + "grad_norm": 0.09010445326566696, + "learning_rate": 0.0002423907375153128, + "loss": 0.0313, + "step": 1330 + }, + { + "epoch": 0.87, + "grad_norm": 0.07403320074081421, + "learning_rate": 0.00024230949947192748, + "loss": 0.0146, + "step": 1331 + }, + { + "epoch": 0.87, + "grad_norm": 0.11623091250658035, + "learning_rate": 0.00024222821782480812, + "loss": 0.0308, + "step": 1332 + }, + { + "epoch": 0.87, + "grad_norm": 0.20798785984516144, + "learning_rate": 0.0002421468926123493, + "loss": 0.0447, + "step": 1333 + }, + { + "epoch": 0.87, + "grad_norm": 0.12543538212776184, + "learning_rate": 0.00024206552387296621, + "loss": 0.0438, + "step": 1334 + }, + { + "epoch": 0.87, + "grad_norm": 0.12966863811016083, + "learning_rate": 0.00024198411164509447, + "loss": 0.0453, + "step": 1335 + }, + { + "epoch": 0.87, + "grad_norm": 0.05985172837972641, + "learning_rate": 0.00024190265596719043, + "loss": 0.0102, + "step": 1336 + }, + { + "epoch": 0.88, + "grad_norm": 0.14263281226158142, + "learning_rate": 0.00024182115687773075, + "loss": 0.0544, + "step": 1337 + }, + { + "epoch": 0.88, + "grad_norm": 0.190725639462471, + "learning_rate": 0.00024173961441521284, + "loss": 0.0265, + "step": 1338 + }, + { + "epoch": 0.88, + "grad_norm": 0.29231366515159607, + "learning_rate": 0.00024165802861815435, + "loss": 0.0684, + "step": 1339 + }, + { + "epoch": 0.88, + "grad_norm": 0.13645826280117035, + "learning_rate": 0.00024157639952509356, + "loss": 0.0577, + "step": 1340 + }, + { + "epoch": 0.88, + "grad_norm": 0.15891732275485992, + "learning_rate": 0.0002414947271745892, + "loss": 0.0455, + "step": 1341 + }, + { + "epoch": 0.88, + "grad_norm": 0.2538587152957916, + "learning_rate": 0.00024141301160522037, + "loss": 0.0566, + "step": 1342 + }, + { + "epoch": 0.88, + "grad_norm": 0.08588481694459915, + "learning_rate": 0.00024133125285558658, + "loss": 0.0265, + "step": 1343 + }, + { + "epoch": 0.88, + "grad_norm": 0.1366318315267563, + "learning_rate": 0.00024124945096430775, + "loss": 0.0209, + "step": 1344 + }, + { + "epoch": 0.88, + "grad_norm": 0.12919899821281433, + "learning_rate": 0.00024116760597002427, + "loss": 0.0358, + "step": 1345 + }, + { + "epoch": 0.88, + "grad_norm": 0.1527070701122284, + "learning_rate": 0.0002410857179113967, + "loss": 0.0584, + "step": 1346 + }, + { + "epoch": 0.88, + "grad_norm": 0.1441652625799179, + "learning_rate": 0.00024100378682710618, + "loss": 0.026, + "step": 1347 + }, + { + "epoch": 0.88, + "grad_norm": 0.0560770146548748, + "learning_rate": 0.00024092181275585397, + "loss": 0.0126, + "step": 1348 + }, + { + "epoch": 0.88, + "grad_norm": 0.23829127848148346, + "learning_rate": 0.00024083979573636172, + "loss": 0.0492, + "step": 1349 + }, + { + "epoch": 0.88, + "grad_norm": 0.22331084311008453, + "learning_rate": 0.00024075773580737138, + "loss": 0.0374, + "step": 1350 + }, + { + "epoch": 0.88, + "grad_norm": 0.16740433871746063, + "learning_rate": 0.0002406756330076452, + "loss": 0.033, + "step": 1351 + }, + { + "epoch": 0.89, + "grad_norm": 0.12624043226242065, + "learning_rate": 0.0002405934873759655, + "loss": 0.0254, + "step": 1352 + }, + { + "epoch": 0.89, + "grad_norm": 0.2925248444080353, + "learning_rate": 0.00024051129895113506, + "loss": 0.0966, + "step": 1353 + }, + { + "epoch": 0.89, + "grad_norm": 0.050702452659606934, + "learning_rate": 0.00024042906777197676, + "loss": 0.0058, + "step": 1354 + }, + { + "epoch": 0.89, + "grad_norm": 0.11182265728712082, + "learning_rate": 0.00024034679387733367, + "loss": 0.0209, + "step": 1355 + }, + { + "epoch": 0.89, + "grad_norm": 0.07762409001588821, + "learning_rate": 0.00024026447730606911, + "loss": 0.0117, + "step": 1356 + }, + { + "epoch": 0.89, + "grad_norm": 0.05566919595003128, + "learning_rate": 0.00024018211809706652, + "loss": 0.012, + "step": 1357 + }, + { + "epoch": 0.89, + "grad_norm": 0.026816535741090775, + "learning_rate": 0.00024009971628922937, + "loss": 0.0058, + "step": 1358 + }, + { + "epoch": 0.89, + "grad_norm": 0.14158879220485687, + "learning_rate": 0.0002400172719214814, + "loss": 0.0242, + "step": 1359 + }, + { + "epoch": 0.89, + "grad_norm": 0.10178912431001663, + "learning_rate": 0.0002399347850327664, + "loss": 0.0144, + "step": 1360 + }, + { + "epoch": 0.89, + "grad_norm": 0.2671686112880707, + "learning_rate": 0.00023985225566204834, + "loss": 0.1116, + "step": 1361 + }, + { + "epoch": 0.89, + "grad_norm": 0.2026364952325821, + "learning_rate": 0.00023976968384831107, + "loss": 0.0511, + "step": 1362 + }, + { + "epoch": 0.89, + "grad_norm": 0.046000707894563675, + "learning_rate": 0.0002396870696305586, + "loss": 0.0089, + "step": 1363 + }, + { + "epoch": 0.89, + "grad_norm": 0.243350088596344, + "learning_rate": 0.00023960441304781495, + "loss": 0.0376, + "step": 1364 + }, + { + "epoch": 0.89, + "grad_norm": 0.053250979632139206, + "learning_rate": 0.0002395217141391242, + "loss": 0.008, + "step": 1365 + }, + { + "epoch": 0.89, + "grad_norm": 0.08489834517240524, + "learning_rate": 0.0002394389729435503, + "loss": 0.0216, + "step": 1366 + }, + { + "epoch": 0.89, + "grad_norm": 0.09859486669301987, + "learning_rate": 0.00023935618950017738, + "loss": 0.0253, + "step": 1367 + }, + { + "epoch": 0.9, + "grad_norm": 0.11453449726104736, + "learning_rate": 0.00023927336384810933, + "loss": 0.0414, + "step": 1368 + }, + { + "epoch": 0.9, + "grad_norm": 0.1473090499639511, + "learning_rate": 0.00023919049602647005, + "loss": 0.0365, + "step": 1369 + }, + { + "epoch": 0.9, + "grad_norm": 0.12153466045856476, + "learning_rate": 0.00023910758607440335, + "loss": 0.0314, + "step": 1370 + }, + { + "epoch": 0.9, + "grad_norm": 0.17143134772777557, + "learning_rate": 0.000239024634031073, + "loss": 0.0928, + "step": 1371 + }, + { + "epoch": 0.9, + "grad_norm": 0.11081311106681824, + "learning_rate": 0.00023894163993566257, + "loss": 0.0535, + "step": 1372 + }, + { + "epoch": 0.9, + "grad_norm": 0.13488808274269104, + "learning_rate": 0.0002388586038273755, + "loss": 0.0321, + "step": 1373 + }, + { + "epoch": 0.9, + "grad_norm": 0.0592711940407753, + "learning_rate": 0.0002387755257454352, + "loss": 0.01, + "step": 1374 + }, + { + "epoch": 0.9, + "grad_norm": 0.09835012257099152, + "learning_rate": 0.00023869240572908467, + "loss": 0.0295, + "step": 1375 + }, + { + "epoch": 0.9, + "grad_norm": 0.071134053170681, + "learning_rate": 0.000238609243817587, + "loss": 0.0243, + "step": 1376 + }, + { + "epoch": 0.9, + "grad_norm": 0.14431652426719666, + "learning_rate": 0.0002385260400502248, + "loss": 0.0344, + "step": 1377 + }, + { + "epoch": 0.9, + "grad_norm": 0.10391832143068314, + "learning_rate": 0.00023844279446630067, + "loss": 0.0231, + "step": 1378 + }, + { + "epoch": 0.9, + "grad_norm": 0.07357161492109299, + "learning_rate": 0.00023835950710513677, + "loss": 0.0163, + "step": 1379 + }, + { + "epoch": 0.9, + "grad_norm": 0.16738182306289673, + "learning_rate": 0.00023827617800607523, + "loss": 0.0423, + "step": 1380 + }, + { + "epoch": 0.9, + "grad_norm": 0.07547144591808319, + "learning_rate": 0.00023819280720847774, + "loss": 0.0273, + "step": 1381 + }, + { + "epoch": 0.9, + "grad_norm": 0.10503777116537094, + "learning_rate": 0.0002381093947517256, + "loss": 0.0192, + "step": 1382 + }, + { + "epoch": 0.91, + "grad_norm": 0.0630551353096962, + "learning_rate": 0.00023802594067521998, + "loss": 0.0115, + "step": 1383 + }, + { + "epoch": 0.91, + "grad_norm": 0.02077486738562584, + "learning_rate": 0.00023794244501838162, + "loss": 0.0045, + "step": 1384 + }, + { + "epoch": 0.91, + "grad_norm": 0.09841371327638626, + "learning_rate": 0.00023785890782065087, + "loss": 0.0242, + "step": 1385 + }, + { + "epoch": 0.91, + "grad_norm": 0.21591459214687347, + "learning_rate": 0.00023777532912148781, + "loss": 0.0237, + "step": 1386 + }, + { + "epoch": 0.91, + "grad_norm": 0.03989405184984207, + "learning_rate": 0.000237691708960372, + "loss": 0.0051, + "step": 1387 + }, + { + "epoch": 0.91, + "grad_norm": 0.12305942177772522, + "learning_rate": 0.0002376080473768026, + "loss": 0.0264, + "step": 1388 + }, + { + "epoch": 0.91, + "grad_norm": 0.14408881962299347, + "learning_rate": 0.00023752434441029848, + "loss": 0.0322, + "step": 1389 + }, + { + "epoch": 0.91, + "grad_norm": 0.04419580101966858, + "learning_rate": 0.00023744060010039784, + "loss": 0.0073, + "step": 1390 + }, + { + "epoch": 0.91, + "grad_norm": 0.18515107035636902, + "learning_rate": 0.0002373568144866586, + "loss": 0.0465, + "step": 1391 + }, + { + "epoch": 0.91, + "grad_norm": 0.048167865723371506, + "learning_rate": 0.00023727298760865812, + "loss": 0.0138, + "step": 1392 + }, + { + "epoch": 0.91, + "grad_norm": 0.08519299328327179, + "learning_rate": 0.0002371891195059932, + "loss": 0.0095, + "step": 1393 + }, + { + "epoch": 0.91, + "grad_norm": 0.21691879630088806, + "learning_rate": 0.00023710521021828016, + "loss": 0.0381, + "step": 1394 + }, + { + "epoch": 0.91, + "grad_norm": 0.09678614884614944, + "learning_rate": 0.00023702125978515478, + "loss": 0.0099, + "step": 1395 + }, + { + "epoch": 0.91, + "grad_norm": 0.08847987651824951, + "learning_rate": 0.0002369372682462723, + "loss": 0.0165, + "step": 1396 + }, + { + "epoch": 0.91, + "grad_norm": 0.03246233984827995, + "learning_rate": 0.0002368532356413073, + "loss": 0.0058, + "step": 1397 + }, + { + "epoch": 0.92, + "grad_norm": 0.07045282423496246, + "learning_rate": 0.00023676916200995386, + "loss": 0.0164, + "step": 1398 + }, + { + "epoch": 0.92, + "grad_norm": 0.05581701174378395, + "learning_rate": 0.00023668504739192528, + "loss": 0.0152, + "step": 1399 + }, + { + "epoch": 0.92, + "grad_norm": 0.15774132311344147, + "learning_rate": 0.0002366008918269544, + "loss": 0.0243, + "step": 1400 + }, + { + "epoch": 0.92, + "grad_norm": 0.172657772898674, + "learning_rate": 0.00023651669535479334, + "loss": 0.0184, + "step": 1401 + }, + { + "epoch": 0.92, + "grad_norm": 0.08128032833337784, + "learning_rate": 0.0002364324580152135, + "loss": 0.0186, + "step": 1402 + }, + { + "epoch": 0.92, + "grad_norm": 0.06358969956636429, + "learning_rate": 0.00023634817984800554, + "loss": 0.0102, + "step": 1403 + }, + { + "epoch": 0.92, + "grad_norm": 0.31414860486984253, + "learning_rate": 0.00023626386089297958, + "loss": 0.0514, + "step": 1404 + }, + { + "epoch": 0.92, + "grad_norm": 0.22831489145755768, + "learning_rate": 0.00023617950118996487, + "loss": 0.0323, + "step": 1405 + }, + { + "epoch": 0.92, + "grad_norm": 0.048902370035648346, + "learning_rate": 0.00023609510077880996, + "loss": 0.0033, + "step": 1406 + }, + { + "epoch": 0.92, + "grad_norm": 0.03278432413935661, + "learning_rate": 0.00023601065969938262, + "loss": 0.0031, + "step": 1407 + }, + { + "epoch": 0.92, + "grad_norm": 0.09343546628952026, + "learning_rate": 0.00023592617799156977, + "loss": 0.0199, + "step": 1408 + }, + { + "epoch": 0.92, + "grad_norm": 0.0755714625120163, + "learning_rate": 0.00023584165569527757, + "loss": 0.0086, + "step": 1409 + }, + { + "epoch": 0.92, + "grad_norm": 0.28567177057266235, + "learning_rate": 0.00023575709285043138, + "loss": 0.0256, + "step": 1410 + }, + { + "epoch": 0.92, + "grad_norm": 0.1896996796131134, + "learning_rate": 0.0002356724894969757, + "loss": 0.0291, + "step": 1411 + }, + { + "epoch": 0.92, + "grad_norm": 0.1428869366645813, + "learning_rate": 0.0002355878456748742, + "loss": 0.0574, + "step": 1412 + }, + { + "epoch": 0.93, + "grad_norm": 0.25432294607162476, + "learning_rate": 0.0002355031614241095, + "loss": 0.0433, + "step": 1413 + }, + { + "epoch": 0.93, + "grad_norm": 0.2577909231185913, + "learning_rate": 0.00023541843678468355, + "loss": 0.0376, + "step": 1414 + }, + { + "epoch": 0.93, + "grad_norm": 0.1479143500328064, + "learning_rate": 0.0002353336717966172, + "loss": 0.0248, + "step": 1415 + }, + { + "epoch": 0.93, + "grad_norm": 0.058144185692071915, + "learning_rate": 0.00023524886649995043, + "loss": 0.0102, + "step": 1416 + }, + { + "epoch": 0.93, + "grad_norm": 0.18476702272891998, + "learning_rate": 0.00023516402093474225, + "loss": 0.0658, + "step": 1417 + }, + { + "epoch": 0.93, + "grad_norm": 0.1367078274488449, + "learning_rate": 0.00023507913514107074, + "loss": 0.0228, + "step": 1418 + }, + { + "epoch": 0.93, + "grad_norm": 0.05217135697603226, + "learning_rate": 0.00023499420915903293, + "loss": 0.0117, + "step": 1419 + }, + { + "epoch": 0.93, + "grad_norm": 0.311260461807251, + "learning_rate": 0.00023490924302874478, + "loss": 0.0945, + "step": 1420 + }, + { + "epoch": 0.93, + "grad_norm": 0.06179346889257431, + "learning_rate": 0.00023482423679034134, + "loss": 0.0102, + "step": 1421 + }, + { + "epoch": 0.93, + "grad_norm": 0.0694802924990654, + "learning_rate": 0.00023473919048397652, + "loss": 0.0187, + "step": 1422 + }, + { + "epoch": 0.93, + "grad_norm": 0.09105714410543442, + "learning_rate": 0.00023465410414982317, + "loss": 0.0245, + "step": 1423 + }, + { + "epoch": 0.93, + "grad_norm": 0.10562916845083237, + "learning_rate": 0.0002345689778280731, + "loss": 0.0296, + "step": 1424 + }, + { + "epoch": 0.93, + "grad_norm": 0.07471620291471481, + "learning_rate": 0.00023448381155893695, + "loss": 0.0288, + "step": 1425 + }, + { + "epoch": 0.93, + "grad_norm": 0.11771635711193085, + "learning_rate": 0.0002343986053826442, + "loss": 0.0165, + "step": 1426 + }, + { + "epoch": 0.93, + "grad_norm": 0.056794993579387665, + "learning_rate": 0.00023431335933944323, + "loss": 0.02, + "step": 1427 + }, + { + "epoch": 0.93, + "grad_norm": 0.10688856244087219, + "learning_rate": 0.00023422807346960131, + "loss": 0.037, + "step": 1428 + }, + { + "epoch": 0.94, + "grad_norm": 0.10420051217079163, + "learning_rate": 0.00023414274781340442, + "loss": 0.0211, + "step": 1429 + }, + { + "epoch": 0.94, + "grad_norm": 0.09319007396697998, + "learning_rate": 0.00023405738241115737, + "loss": 0.0324, + "step": 1430 + }, + { + "epoch": 0.94, + "grad_norm": 0.11446485668420792, + "learning_rate": 0.00023397197730318377, + "loss": 0.0381, + "step": 1431 + }, + { + "epoch": 0.94, + "grad_norm": 0.10845956206321716, + "learning_rate": 0.00023388653252982594, + "loss": 0.0171, + "step": 1432 + }, + { + "epoch": 0.94, + "grad_norm": 0.08544383198022842, + "learning_rate": 0.000233801048131445, + "loss": 0.0239, + "step": 1433 + }, + { + "epoch": 0.94, + "grad_norm": 0.10372909903526306, + "learning_rate": 0.0002337155241484207, + "loss": 0.0429, + "step": 1434 + }, + { + "epoch": 0.94, + "grad_norm": 0.24167174100875854, + "learning_rate": 0.00023362996062115154, + "loss": 0.1291, + "step": 1435 + }, + { + "epoch": 0.94, + "grad_norm": 0.10461205989122391, + "learning_rate": 0.00023354435759005473, + "loss": 0.0385, + "step": 1436 + }, + { + "epoch": 0.94, + "grad_norm": 0.14408838748931885, + "learning_rate": 0.0002334587150955661, + "loss": 0.0377, + "step": 1437 + }, + { + "epoch": 0.94, + "grad_norm": 0.08705660700798035, + "learning_rate": 0.0002333730331781401, + "loss": 0.0169, + "step": 1438 + }, + { + "epoch": 0.94, + "grad_norm": 0.10698029398918152, + "learning_rate": 0.00023328731187824986, + "loss": 0.0383, + "step": 1439 + }, + { + "epoch": 0.94, + "grad_norm": 0.18005134165287018, + "learning_rate": 0.0002332015512363871, + "loss": 0.0408, + "step": 1440 + }, + { + "epoch": 0.94, + "grad_norm": 0.11144935339689255, + "learning_rate": 0.00023311575129306202, + "loss": 0.0434, + "step": 1441 + }, + { + "epoch": 0.94, + "grad_norm": 0.09303693473339081, + "learning_rate": 0.0002330299120888035, + "loss": 0.0259, + "step": 1442 + }, + { + "epoch": 0.94, + "grad_norm": 0.09196025878190994, + "learning_rate": 0.00023294403366415904, + "loss": 0.0256, + "step": 1443 + }, + { + "epoch": 0.95, + "grad_norm": 0.09944535046815872, + "learning_rate": 0.00023285811605969442, + "loss": 0.0691, + "step": 1444 + }, + { + "epoch": 0.95, + "grad_norm": 0.0766916275024414, + "learning_rate": 0.00023277215931599417, + "loss": 0.0162, + "step": 1445 + }, + { + "epoch": 0.95, + "grad_norm": 0.0471719354391098, + "learning_rate": 0.00023268616347366114, + "loss": 0.0157, + "step": 1446 + }, + { + "epoch": 0.95, + "grad_norm": 0.0748835876584053, + "learning_rate": 0.0002326001285733168, + "loss": 0.0162, + "step": 1447 + }, + { + "epoch": 0.95, + "grad_norm": 0.2493734508752823, + "learning_rate": 0.0002325140546556009, + "loss": 0.0908, + "step": 1448 + }, + { + "epoch": 0.95, + "grad_norm": 0.1908605992794037, + "learning_rate": 0.0002324279417611717, + "loss": 0.0352, + "step": 1449 + }, + { + "epoch": 0.95, + "grad_norm": 0.16963645815849304, + "learning_rate": 0.00023234178993070595, + "loss": 0.0597, + "step": 1450 + }, + { + "epoch": 0.95, + "grad_norm": 0.1448785662651062, + "learning_rate": 0.0002322555992048987, + "loss": 0.0341, + "step": 1451 + }, + { + "epoch": 0.95, + "grad_norm": 0.11966606229543686, + "learning_rate": 0.00023216936962446334, + "loss": 0.0447, + "step": 1452 + }, + { + "epoch": 0.95, + "grad_norm": 0.06863813102245331, + "learning_rate": 0.00023208310123013176, + "loss": 0.0184, + "step": 1453 + }, + { + "epoch": 0.95, + "grad_norm": 0.08081576228141785, + "learning_rate": 0.000231996794062654, + "loss": 0.0183, + "step": 1454 + }, + { + "epoch": 0.95, + "grad_norm": 0.04790128767490387, + "learning_rate": 0.00023191044816279856, + "loss": 0.0159, + "step": 1455 + }, + { + "epoch": 0.95, + "grad_norm": 0.11623428761959076, + "learning_rate": 0.00023182406357135217, + "loss": 0.036, + "step": 1456 + }, + { + "epoch": 0.95, + "grad_norm": 0.19882117211818695, + "learning_rate": 0.0002317376403291198, + "loss": 0.0356, + "step": 1457 + }, + { + "epoch": 0.95, + "grad_norm": 0.06410811841487885, + "learning_rate": 0.0002316511784769248, + "loss": 0.0153, + "step": 1458 + }, + { + "epoch": 0.96, + "grad_norm": 0.1210549846291542, + "learning_rate": 0.00023156467805560862, + "loss": 0.0254, + "step": 1459 + }, + { + "epoch": 0.96, + "grad_norm": 0.09589160978794098, + "learning_rate": 0.00023147813910603102, + "loss": 0.0231, + "step": 1460 + }, + { + "epoch": 0.96, + "grad_norm": 0.05613451451063156, + "learning_rate": 0.00023139156166906993, + "loss": 0.008, + "step": 1461 + }, + { + "epoch": 0.96, + "grad_norm": 0.12222158908843994, + "learning_rate": 0.00023130494578562147, + "loss": 0.0236, + "step": 1462 + }, + { + "epoch": 0.96, + "grad_norm": 0.12595443427562714, + "learning_rate": 0.00023121829149659988, + "loss": 0.0284, + "step": 1463 + }, + { + "epoch": 0.96, + "grad_norm": 0.05631411075592041, + "learning_rate": 0.00023113159884293762, + "loss": 0.0083, + "step": 1464 + }, + { + "epoch": 0.96, + "grad_norm": 0.15821842849254608, + "learning_rate": 0.00023104486786558516, + "loss": 0.0281, + "step": 1465 + }, + { + "epoch": 0.96, + "grad_norm": 0.28132763504981995, + "learning_rate": 0.0002309580986055112, + "loss": 0.0744, + "step": 1466 + }, + { + "epoch": 0.96, + "grad_norm": 0.08583173155784607, + "learning_rate": 0.00023087129110370243, + "loss": 0.0163, + "step": 1467 + }, + { + "epoch": 0.96, + "grad_norm": 0.1472005695104599, + "learning_rate": 0.00023078444540116364, + "loss": 0.0342, + "step": 1468 + }, + { + "epoch": 0.96, + "grad_norm": 0.15789683163166046, + "learning_rate": 0.0002306975615389177, + "loss": 0.0321, + "step": 1469 + }, + { + "epoch": 0.96, + "grad_norm": 0.0862409770488739, + "learning_rate": 0.00023061063955800542, + "loss": 0.0337, + "step": 1470 + }, + { + "epoch": 0.96, + "grad_norm": 0.09513189643621445, + "learning_rate": 0.00023052367949948562, + "loss": 0.0156, + "step": 1471 + }, + { + "epoch": 0.96, + "grad_norm": 0.16023319959640503, + "learning_rate": 0.00023043668140443522, + "loss": 0.0437, + "step": 1472 + }, + { + "epoch": 0.96, + "grad_norm": 0.28757092356681824, + "learning_rate": 0.0002303496453139491, + "loss": 0.0526, + "step": 1473 + }, + { + "epoch": 0.96, + "grad_norm": 0.09820155799388885, + "learning_rate": 0.00023026257126913986, + "loss": 0.0087, + "step": 1474 + }, + { + "epoch": 0.97, + "grad_norm": 0.23134587705135345, + "learning_rate": 0.00023017545931113822, + "loss": 0.0613, + "step": 1475 + }, + { + "epoch": 0.97, + "grad_norm": 0.06153428182005882, + "learning_rate": 0.0002300883094810929, + "loss": 0.0086, + "step": 1476 + }, + { + "epoch": 0.97, + "grad_norm": 0.17993584275245667, + "learning_rate": 0.00023000112182017032, + "loss": 0.0339, + "step": 1477 + }, + { + "epoch": 0.97, + "grad_norm": 0.23367144167423248, + "learning_rate": 0.00022991389636955483, + "loss": 0.0785, + "step": 1478 + }, + { + "epoch": 0.97, + "grad_norm": 0.057765256613492966, + "learning_rate": 0.00022982663317044864, + "loss": 0.0077, + "step": 1479 + }, + { + "epoch": 0.97, + "grad_norm": 0.17549645900726318, + "learning_rate": 0.00022973933226407174, + "loss": 0.0578, + "step": 1480 + }, + { + "epoch": 0.97, + "grad_norm": 0.13486583530902863, + "learning_rate": 0.0002296519936916621, + "loss": 0.0381, + "step": 1481 + }, + { + "epoch": 0.97, + "grad_norm": 0.11634548753499985, + "learning_rate": 0.00022956461749447528, + "loss": 0.0356, + "step": 1482 + }, + { + "epoch": 0.97, + "grad_norm": 0.03911494463682175, + "learning_rate": 0.0002294772037137847, + "loss": 0.0082, + "step": 1483 + }, + { + "epoch": 0.97, + "grad_norm": 0.13597272336483002, + "learning_rate": 0.0002293897523908816, + "loss": 0.037, + "step": 1484 + }, + { + "epoch": 0.97, + "grad_norm": 0.03297096863389015, + "learning_rate": 0.0002293022635670748, + "loss": 0.0101, + "step": 1485 + }, + { + "epoch": 0.97, + "grad_norm": 0.1217992827296257, + "learning_rate": 0.00022921473728369099, + "loss": 0.0488, + "step": 1486 + }, + { + "epoch": 0.97, + "grad_norm": 0.08392113447189331, + "learning_rate": 0.0002291271735820744, + "loss": 0.0213, + "step": 1487 + }, + { + "epoch": 0.97, + "grad_norm": 0.0728277638554573, + "learning_rate": 0.00022903957250358707, + "loss": 0.0323, + "step": 1488 + }, + { + "epoch": 0.97, + "grad_norm": 0.19564445316791534, + "learning_rate": 0.0002289519340896086, + "loss": 0.0362, + "step": 1489 + }, + { + "epoch": 0.98, + "grad_norm": 0.09455154836177826, + "learning_rate": 0.00022886425838153634, + "loss": 0.0305, + "step": 1490 + }, + { + "epoch": 0.98, + "grad_norm": 0.02463528886437416, + "learning_rate": 0.00022877654542078515, + "loss": 0.0055, + "step": 1491 + }, + { + "epoch": 0.98, + "grad_norm": 0.10636550933122635, + "learning_rate": 0.0002286887952487875, + "loss": 0.0254, + "step": 1492 + }, + { + "epoch": 0.98, + "grad_norm": 0.08179948478937149, + "learning_rate": 0.00022860100790699352, + "loss": 0.0341, + "step": 1493 + }, + { + "epoch": 0.98, + "grad_norm": 0.04053513705730438, + "learning_rate": 0.00022851318343687074, + "loss": 0.0059, + "step": 1494 + }, + { + "epoch": 0.98, + "grad_norm": 0.06254950165748596, + "learning_rate": 0.00022842532187990444, + "loss": 0.016, + "step": 1495 + }, + { + "epoch": 0.98, + "grad_norm": 0.11671124398708344, + "learning_rate": 0.00022833742327759722, + "loss": 0.0316, + "step": 1496 + }, + { + "epoch": 0.98, + "grad_norm": 0.05388714000582695, + "learning_rate": 0.00022824948767146926, + "loss": 0.0114, + "step": 1497 + }, + { + "epoch": 0.98, + "grad_norm": 0.07483407109975815, + "learning_rate": 0.00022816151510305824, + "loss": 0.0121, + "step": 1498 + }, + { + "epoch": 0.98, + "grad_norm": 0.08650153130292892, + "learning_rate": 0.00022807350561391938, + "loss": 0.0518, + "step": 1499 + }, + { + "epoch": 0.98, + "grad_norm": 0.1296052485704422, + "learning_rate": 0.00022798545924562508, + "loss": 0.0666, + "step": 1500 + }, + { + "epoch": 0.98, + "grad_norm": 0.15292461216449738, + "learning_rate": 0.00022789737603976542, + "loss": 0.0314, + "step": 1501 + }, + { + "epoch": 0.98, + "grad_norm": 0.2241302728652954, + "learning_rate": 0.00022780925603794775, + "loss": 0.13, + "step": 1502 + }, + { + "epoch": 0.98, + "grad_norm": 0.07691671699285507, + "learning_rate": 0.00022772109928179688, + "loss": 0.0303, + "step": 1503 + }, + { + "epoch": 0.98, + "grad_norm": 0.07967071235179901, + "learning_rate": 0.0002276329058129548, + "loss": 0.0104, + "step": 1504 + }, + { + "epoch": 0.99, + "grad_norm": 0.15211229026317596, + "learning_rate": 0.00022754467567308114, + "loss": 0.0463, + "step": 1505 + }, + { + "epoch": 0.99, + "grad_norm": 0.1364462524652481, + "learning_rate": 0.00022745640890385263, + "loss": 0.0333, + "step": 1506 + }, + { + "epoch": 0.99, + "grad_norm": 0.08477602154016495, + "learning_rate": 0.00022736810554696335, + "loss": 0.0144, + "step": 1507 + }, + { + "epoch": 0.99, + "grad_norm": 0.030945677310228348, + "learning_rate": 0.0002272797656441247, + "loss": 0.0082, + "step": 1508 + }, + { + "epoch": 0.99, + "grad_norm": 0.0667153000831604, + "learning_rate": 0.00022719138923706525, + "loss": 0.0285, + "step": 1509 + }, + { + "epoch": 0.99, + "grad_norm": 0.15130023658275604, + "learning_rate": 0.00022710297636753096, + "loss": 0.0493, + "step": 1510 + }, + { + "epoch": 0.99, + "grad_norm": 0.07945651561021805, + "learning_rate": 0.00022701452707728486, + "loss": 0.0181, + "step": 1511 + }, + { + "epoch": 0.99, + "grad_norm": 0.1147598847746849, + "learning_rate": 0.00022692604140810735, + "loss": 0.0377, + "step": 1512 + }, + { + "epoch": 0.99, + "grad_norm": 0.04304948449134827, + "learning_rate": 0.00022683751940179588, + "loss": 0.0128, + "step": 1513 + }, + { + "epoch": 0.99, + "grad_norm": 0.08819684386253357, + "learning_rate": 0.00022674896110016503, + "loss": 0.0296, + "step": 1514 + }, + { + "epoch": 0.99, + "grad_norm": 0.06335631757974625, + "learning_rate": 0.0002266603665450467, + "loss": 0.0188, + "step": 1515 + }, + { + "epoch": 0.99, + "grad_norm": 0.08433008193969727, + "learning_rate": 0.00022657173577828979, + "loss": 0.0251, + "step": 1516 + }, + { + "epoch": 0.99, + "grad_norm": 0.06014099717140198, + "learning_rate": 0.00022648306884176034, + "loss": 0.0193, + "step": 1517 + }, + { + "epoch": 0.99, + "grad_norm": 0.05266990885138512, + "learning_rate": 0.00022639436577734143, + "loss": 0.0112, + "step": 1518 + }, + { + "epoch": 0.99, + "grad_norm": 0.10652010887861252, + "learning_rate": 0.00022630562662693328, + "loss": 0.0312, + "step": 1519 + }, + { + "epoch": 1.0, + "grad_norm": 0.043453726917505264, + "learning_rate": 0.00022621685143245308, + "loss": 0.009, + "step": 1520 + }, + { + "epoch": 1.0, + "grad_norm": 0.0685136690735817, + "learning_rate": 0.00022612804023583515, + "loss": 0.0189, + "step": 1521 + }, + { + "epoch": 1.0, + "grad_norm": 0.14430442452430725, + "learning_rate": 0.0002260391930790307, + "loss": 0.066, + "step": 1522 + }, + { + "epoch": 1.0, + "grad_norm": 0.0724061131477356, + "learning_rate": 0.00022595031000400794, + "loss": 0.0129, + "step": 1523 + }, + { + "epoch": 1.0, + "grad_norm": 0.18257959187030792, + "learning_rate": 0.00022586139105275214, + "loss": 0.0434, + "step": 1524 + }, + { + "epoch": 1.0, + "grad_norm": 0.06716416776180267, + "learning_rate": 0.00022577243626726548, + "loss": 0.0102, + "step": 1525 + }, + { + "epoch": 1.0, + "grad_norm": 0.05102796107530594, + "learning_rate": 0.00022568344568956697, + "loss": 0.0094, + "step": 1526 + }, + { + "epoch": 1.0, + "grad_norm": 0.0711396113038063, + "learning_rate": 0.0002255944193616927, + "loss": 0.0138, + "step": 1527 + } + ], + "logging_steps": 1, + "max_steps": 4581, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 1527, + "total_flos": 1.4257736449943142e+17, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}