diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,5280 @@ +{ + "best_global_step": 474, + "best_metric": 0.14478395879268646, + "best_model_checkpoint": "saves_multiple/p-tuning/llama-3-8b-instruct/train_svamp_101112_1760638001/checkpoint-474", + "epoch": 20.0, + "eval_steps": 158, + "global_step": 3160, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.03164556962025317, + "grad_norm": 27.501157760620117, + "learning_rate": 1.2658227848101267e-05, + "loss": 3.9131, + "num_input_tokens_seen": 2272, + "step": 5 + }, + { + "epoch": 0.06329113924050633, + "grad_norm": 33.62315368652344, + "learning_rate": 2.8481012658227846e-05, + "loss": 3.0431, + "num_input_tokens_seen": 4576, + "step": 10 + }, + { + "epoch": 0.0949367088607595, + "grad_norm": 25.54289436340332, + "learning_rate": 4.430379746835443e-05, + "loss": 2.1245, + "num_input_tokens_seen": 6784, + "step": 15 + }, + { + "epoch": 0.12658227848101267, + "grad_norm": 62.78031921386719, + "learning_rate": 6.012658227848101e-05, + "loss": 1.3902, + "num_input_tokens_seen": 9056, + "step": 20 + }, + { + "epoch": 0.15822784810126583, + "grad_norm": 19.887304306030273, + "learning_rate": 7.59493670886076e-05, + "loss": 0.9863, + "num_input_tokens_seen": 11360, + "step": 25 + }, + { + "epoch": 0.189873417721519, + "grad_norm": 11.19080924987793, + "learning_rate": 9.177215189873418e-05, + "loss": 1.1008, + "num_input_tokens_seen": 13504, + "step": 30 + }, + { + "epoch": 0.22151898734177214, + "grad_norm": 26.57444953918457, + "learning_rate": 0.00010759493670886077, + "loss": 0.8906, + "num_input_tokens_seen": 15680, + "step": 35 + }, + { + "epoch": 0.25316455696202533, + "grad_norm": 20.62451171875, + "learning_rate": 0.00012341772151898734, + "loss": 0.9773, + "num_input_tokens_seen": 17952, + "step": 40 + }, + { + "epoch": 0.2848101265822785, + "grad_norm": 12.63956069946289, + "learning_rate": 0.00013924050632911392, + "loss": 1.0213, + "num_input_tokens_seen": 20224, + "step": 45 + }, + { + "epoch": 0.31645569620253167, + "grad_norm": 37.14901351928711, + "learning_rate": 0.0001550632911392405, + "loss": 0.8393, + "num_input_tokens_seen": 22528, + "step": 50 + }, + { + "epoch": 0.34810126582278483, + "grad_norm": 9.230304718017578, + "learning_rate": 0.00017088607594936708, + "loss": 0.8312, + "num_input_tokens_seen": 24896, + "step": 55 + }, + { + "epoch": 0.379746835443038, + "grad_norm": 8.194303512573242, + "learning_rate": 0.00018670886075949366, + "loss": 0.869, + "num_input_tokens_seen": 27136, + "step": 60 + }, + { + "epoch": 0.41139240506329117, + "grad_norm": 18.06790542602539, + "learning_rate": 0.00020253164556962027, + "loss": 0.8962, + "num_input_tokens_seen": 29472, + "step": 65 + }, + { + "epoch": 0.4430379746835443, + "grad_norm": 10.237406730651855, + "learning_rate": 0.00021835443037974685, + "loss": 0.7859, + "num_input_tokens_seen": 31744, + "step": 70 + }, + { + "epoch": 0.47468354430379744, + "grad_norm": 16.869037628173828, + "learning_rate": 0.00023417721518987343, + "loss": 0.604, + "num_input_tokens_seen": 34112, + "step": 75 + }, + { + "epoch": 0.5063291139240507, + "grad_norm": 9.97231388092041, + "learning_rate": 0.00025, + "loss": 0.9123, + "num_input_tokens_seen": 36320, + "step": 80 + }, + { + "epoch": 0.5379746835443038, + "grad_norm": 13.212492942810059, + "learning_rate": 0.0002658227848101266, + "loss": 1.0078, + "num_input_tokens_seen": 38656, + "step": 85 + }, + { + "epoch": 0.569620253164557, + "grad_norm": 8.071773529052734, + "learning_rate": 0.00028164556962025316, + "loss": 0.5841, + "num_input_tokens_seen": 40928, + "step": 90 + }, + { + "epoch": 0.6012658227848101, + "grad_norm": 20.964324951171875, + "learning_rate": 0.00029746835443037974, + "loss": 0.5911, + "num_input_tokens_seen": 43296, + "step": 95 + }, + { + "epoch": 0.6329113924050633, + "grad_norm": 5.60086727142334, + "learning_rate": 0.0003132911392405063, + "loss": 0.4616, + "num_input_tokens_seen": 45600, + "step": 100 + }, + { + "epoch": 0.6645569620253164, + "grad_norm": 3.0063695907592773, + "learning_rate": 0.0003291139240506329, + "loss": 0.3682, + "num_input_tokens_seen": 47968, + "step": 105 + }, + { + "epoch": 0.6962025316455697, + "grad_norm": 4.657817840576172, + "learning_rate": 0.0003449367088607595, + "loss": 0.2551, + "num_input_tokens_seen": 50208, + "step": 110 + }, + { + "epoch": 0.7278481012658228, + "grad_norm": 3.184723377227783, + "learning_rate": 0.00036075949367088606, + "loss": 0.3104, + "num_input_tokens_seen": 52448, + "step": 115 + }, + { + "epoch": 0.759493670886076, + "grad_norm": 3.6723666191101074, + "learning_rate": 0.0003765822784810127, + "loss": 0.2939, + "num_input_tokens_seen": 54752, + "step": 120 + }, + { + "epoch": 0.7911392405063291, + "grad_norm": 6.200613021850586, + "learning_rate": 0.0003924050632911392, + "loss": 0.3141, + "num_input_tokens_seen": 56992, + "step": 125 + }, + { + "epoch": 0.8227848101265823, + "grad_norm": 2.319878339767456, + "learning_rate": 0.00040822784810126586, + "loss": 0.4616, + "num_input_tokens_seen": 59232, + "step": 130 + }, + { + "epoch": 0.8544303797468354, + "grad_norm": 5.923131465911865, + "learning_rate": 0.0004240506329113924, + "loss": 0.5166, + "num_input_tokens_seen": 61472, + "step": 135 + }, + { + "epoch": 0.8860759493670886, + "grad_norm": 4.088566303253174, + "learning_rate": 0.000439873417721519, + "loss": 0.2912, + "num_input_tokens_seen": 63584, + "step": 140 + }, + { + "epoch": 0.9177215189873418, + "grad_norm": 16.23299217224121, + "learning_rate": 0.00045569620253164554, + "loss": 0.5313, + "num_input_tokens_seen": 65824, + "step": 145 + }, + { + "epoch": 0.9493670886075949, + "grad_norm": 3.2035605907440186, + "learning_rate": 0.0004715189873417722, + "loss": 0.6694, + "num_input_tokens_seen": 68128, + "step": 150 + }, + { + "epoch": 0.9810126582278481, + "grad_norm": 2.2636947631835938, + "learning_rate": 0.00048734177215189876, + "loss": 0.2239, + "num_input_tokens_seen": 70400, + "step": 155 + }, + { + "epoch": 1.0, + "eval_loss": 0.29432404041290283, + "eval_runtime": 1.629, + "eval_samples_per_second": 42.971, + "eval_steps_per_second": 11.05, + "num_input_tokens_seen": 71552, + "step": 158 + }, + { + "epoch": 1.0126582278481013, + "grad_norm": 1.4942150115966797, + "learning_rate": 0.0005031645569620254, + "loss": 0.276, + "num_input_tokens_seen": 72480, + "step": 160 + }, + { + "epoch": 1.0443037974683544, + "grad_norm": 1.4856122732162476, + "learning_rate": 0.0005189873417721519, + "loss": 0.2778, + "num_input_tokens_seen": 74784, + "step": 165 + }, + { + "epoch": 1.0759493670886076, + "grad_norm": 2.80867075920105, + "learning_rate": 0.0005348101265822784, + "loss": 0.1551, + "num_input_tokens_seen": 76928, + "step": 170 + }, + { + "epoch": 1.1075949367088607, + "grad_norm": 3.7323567867279053, + "learning_rate": 0.0005506329113924051, + "loss": 0.1562, + "num_input_tokens_seen": 79296, + "step": 175 + }, + { + "epoch": 1.139240506329114, + "grad_norm": 1.0157109498977661, + "learning_rate": 0.0005664556962025317, + "loss": 0.1654, + "num_input_tokens_seen": 81632, + "step": 180 + }, + { + "epoch": 1.1708860759493671, + "grad_norm": 1.2619264125823975, + "learning_rate": 0.0005822784810126582, + "loss": 0.128, + "num_input_tokens_seen": 83872, + "step": 185 + }, + { + "epoch": 1.2025316455696202, + "grad_norm": 1.1448897123336792, + "learning_rate": 0.0005981012658227848, + "loss": 0.1271, + "num_input_tokens_seen": 86240, + "step": 190 + }, + { + "epoch": 1.2341772151898733, + "grad_norm": 0.7986900210380554, + "learning_rate": 0.0006139240506329115, + "loss": 0.1042, + "num_input_tokens_seen": 88544, + "step": 195 + }, + { + "epoch": 1.2658227848101267, + "grad_norm": 1.0439692735671997, + "learning_rate": 0.000629746835443038, + "loss": 0.1528, + "num_input_tokens_seen": 90880, + "step": 200 + }, + { + "epoch": 1.2974683544303798, + "grad_norm": 0.26329317688941956, + "learning_rate": 0.0006455696202531646, + "loss": 0.0549, + "num_input_tokens_seen": 93056, + "step": 205 + }, + { + "epoch": 1.3291139240506329, + "grad_norm": 0.4752528965473175, + "learning_rate": 0.0006613924050632911, + "loss": 0.1217, + "num_input_tokens_seen": 95392, + "step": 210 + }, + { + "epoch": 1.360759493670886, + "grad_norm": 1.224108338356018, + "learning_rate": 0.0006772151898734177, + "loss": 0.0983, + "num_input_tokens_seen": 97568, + "step": 215 + }, + { + "epoch": 1.3924050632911391, + "grad_norm": 0.5636555552482605, + "learning_rate": 0.0006930379746835443, + "loss": 0.0474, + "num_input_tokens_seen": 99776, + "step": 220 + }, + { + "epoch": 1.4240506329113924, + "grad_norm": 0.8229871392250061, + "learning_rate": 0.0007088607594936709, + "loss": 0.1094, + "num_input_tokens_seen": 102176, + "step": 225 + }, + { + "epoch": 1.4556962025316456, + "grad_norm": 0.6343967318534851, + "learning_rate": 0.0007246835443037975, + "loss": 0.1177, + "num_input_tokens_seen": 104480, + "step": 230 + }, + { + "epoch": 1.4873417721518987, + "grad_norm": 1.0993245840072632, + "learning_rate": 0.000740506329113924, + "loss": 0.1631, + "num_input_tokens_seen": 106816, + "step": 235 + }, + { + "epoch": 1.518987341772152, + "grad_norm": 0.4712866544723511, + "learning_rate": 0.0007563291139240507, + "loss": 0.1521, + "num_input_tokens_seen": 109024, + "step": 240 + }, + { + "epoch": 1.5506329113924051, + "grad_norm": 0.653823733329773, + "learning_rate": 0.0007721518987341772, + "loss": 0.077, + "num_input_tokens_seen": 111200, + "step": 245 + }, + { + "epoch": 1.5822784810126582, + "grad_norm": 1.1556552648544312, + "learning_rate": 0.0007879746835443038, + "loss": 0.1036, + "num_input_tokens_seen": 113408, + "step": 250 + }, + { + "epoch": 1.6139240506329116, + "grad_norm": 0.3208511471748352, + "learning_rate": 0.0008037974683544303, + "loss": 0.0699, + "num_input_tokens_seen": 115616, + "step": 255 + }, + { + "epoch": 1.6455696202531644, + "grad_norm": 0.4209478497505188, + "learning_rate": 0.000819620253164557, + "loss": 0.0546, + "num_input_tokens_seen": 117728, + "step": 260 + }, + { + "epoch": 1.6772151898734178, + "grad_norm": 0.20039498805999756, + "learning_rate": 0.0008354430379746836, + "loss": 0.0819, + "num_input_tokens_seen": 120032, + "step": 265 + }, + { + "epoch": 1.7088607594936709, + "grad_norm": 1.267027735710144, + "learning_rate": 0.0008512658227848101, + "loss": 0.0862, + "num_input_tokens_seen": 122144, + "step": 270 + }, + { + "epoch": 1.740506329113924, + "grad_norm": 0.6629573106765747, + "learning_rate": 0.0008670886075949367, + "loss": 0.0851, + "num_input_tokens_seen": 124384, + "step": 275 + }, + { + "epoch": 1.7721518987341773, + "grad_norm": 1.6403863430023193, + "learning_rate": 0.0008829113924050633, + "loss": 0.1814, + "num_input_tokens_seen": 126560, + "step": 280 + }, + { + "epoch": 1.8037974683544302, + "grad_norm": 0.33124402165412903, + "learning_rate": 0.0008987341772151899, + "loss": 0.1439, + "num_input_tokens_seen": 128928, + "step": 285 + }, + { + "epoch": 1.8354430379746836, + "grad_norm": 0.13936619460582733, + "learning_rate": 0.0009145569620253165, + "loss": 0.1391, + "num_input_tokens_seen": 131264, + "step": 290 + }, + { + "epoch": 1.8670886075949367, + "grad_norm": 0.6116445660591125, + "learning_rate": 0.000930379746835443, + "loss": 0.1173, + "num_input_tokens_seen": 133568, + "step": 295 + }, + { + "epoch": 1.8987341772151898, + "grad_norm": 0.3796460032463074, + "learning_rate": 0.0009462025316455697, + "loss": 0.1156, + "num_input_tokens_seen": 135936, + "step": 300 + }, + { + "epoch": 1.9303797468354431, + "grad_norm": 0.06998901814222336, + "learning_rate": 0.0009620253164556962, + "loss": 0.0465, + "num_input_tokens_seen": 138080, + "step": 305 + }, + { + "epoch": 1.9620253164556962, + "grad_norm": 0.37441620230674744, + "learning_rate": 0.000977848101265823, + "loss": 0.1008, + "num_input_tokens_seen": 140384, + "step": 310 + }, + { + "epoch": 1.9936708860759493, + "grad_norm": 0.8112348318099976, + "learning_rate": 0.0009936708860759493, + "loss": 0.246, + "num_input_tokens_seen": 142752, + "step": 315 + }, + { + "epoch": 2.0, + "eval_loss": 0.2061605006456375, + "eval_runtime": 1.6331, + "eval_samples_per_second": 42.864, + "eval_steps_per_second": 11.022, + "num_input_tokens_seen": 142960, + "step": 316 + }, + { + "epoch": 2.0253164556962027, + "grad_norm": 0.2416524887084961, + "learning_rate": 0.0009999972544921997, + "loss": 0.109, + "num_input_tokens_seen": 144816, + "step": 320 + }, + { + "epoch": 2.0569620253164556, + "grad_norm": 0.4405277371406555, + "learning_rate": 0.000999980476498165, + "loss": 0.0737, + "num_input_tokens_seen": 147120, + "step": 325 + }, + { + "epoch": 2.088607594936709, + "grad_norm": 0.3238157033920288, + "learning_rate": 0.0009999484463034094, + "loss": 0.0364, + "num_input_tokens_seen": 149392, + "step": 330 + }, + { + "epoch": 2.1202531645569622, + "grad_norm": 0.35550785064697266, + "learning_rate": 0.0009999011648850328, + "loss": 0.1014, + "num_input_tokens_seen": 151632, + "step": 335 + }, + { + "epoch": 2.151898734177215, + "grad_norm": 0.23157905042171478, + "learning_rate": 0.0009998386336853829, + "loss": 0.0725, + "num_input_tokens_seen": 153904, + "step": 340 + }, + { + "epoch": 2.1835443037974684, + "grad_norm": 0.1866108477115631, + "learning_rate": 0.0009997608546120109, + "loss": 0.05, + "num_input_tokens_seen": 156176, + "step": 345 + }, + { + "epoch": 2.2151898734177213, + "grad_norm": 0.2203022837638855, + "learning_rate": 0.0009996678300376138, + "loss": 0.079, + "num_input_tokens_seen": 158480, + "step": 350 + }, + { + "epoch": 2.2468354430379747, + "grad_norm": 0.14036527276039124, + "learning_rate": 0.000999559562799961, + "loss": 0.0485, + "num_input_tokens_seen": 160688, + "step": 355 + }, + { + "epoch": 2.278481012658228, + "grad_norm": 0.011383436620235443, + "learning_rate": 0.000999436056201809, + "loss": 0.0262, + "num_input_tokens_seen": 162896, + "step": 360 + }, + { + "epoch": 2.310126582278481, + "grad_norm": 0.23037639260292053, + "learning_rate": 0.0009992973140107997, + "loss": 0.1019, + "num_input_tokens_seen": 165200, + "step": 365 + }, + { + "epoch": 2.3417721518987342, + "grad_norm": 0.38020703196525574, + "learning_rate": 0.000999143340459346, + "loss": 0.0668, + "num_input_tokens_seen": 167376, + "step": 370 + }, + { + "epoch": 2.3734177215189876, + "grad_norm": 0.1685383915901184, + "learning_rate": 0.0009989741402445021, + "loss": 0.0472, + "num_input_tokens_seen": 169520, + "step": 375 + }, + { + "epoch": 2.4050632911392404, + "grad_norm": 0.4711648225784302, + "learning_rate": 0.000998789718527821, + "loss": 0.1526, + "num_input_tokens_seen": 171920, + "step": 380 + }, + { + "epoch": 2.4367088607594938, + "grad_norm": 0.41455963253974915, + "learning_rate": 0.0009985900809351962, + "loss": 0.0876, + "num_input_tokens_seen": 174288, + "step": 385 + }, + { + "epoch": 2.4683544303797467, + "grad_norm": 0.22466030716896057, + "learning_rate": 0.0009983752335566908, + "loss": 0.1276, + "num_input_tokens_seen": 176592, + "step": 390 + }, + { + "epoch": 2.5, + "grad_norm": 0.08923062682151794, + "learning_rate": 0.0009981451829463518, + "loss": 0.084, + "num_input_tokens_seen": 178896, + "step": 395 + }, + { + "epoch": 2.5316455696202533, + "grad_norm": 0.2366461306810379, + "learning_rate": 0.0009978999361220091, + "loss": 0.0511, + "num_input_tokens_seen": 181168, + "step": 400 + }, + { + "epoch": 2.5632911392405062, + "grad_norm": 0.217311829328537, + "learning_rate": 0.0009976395005650623, + "loss": 0.1116, + "num_input_tokens_seen": 183376, + "step": 405 + }, + { + "epoch": 2.5949367088607596, + "grad_norm": 0.13488544523715973, + "learning_rate": 0.0009973638842202526, + "loss": 0.0561, + "num_input_tokens_seen": 185744, + "step": 410 + }, + { + "epoch": 2.6265822784810124, + "grad_norm": 0.2280215471982956, + "learning_rate": 0.00099707309549542, + "loss": 0.0838, + "num_input_tokens_seen": 187984, + "step": 415 + }, + { + "epoch": 2.6582278481012658, + "grad_norm": 0.1973273754119873, + "learning_rate": 0.0009967671432612466, + "loss": 0.0605, + "num_input_tokens_seen": 190128, + "step": 420 + }, + { + "epoch": 2.689873417721519, + "grad_norm": 0.2225196659564972, + "learning_rate": 0.0009964460368509867, + "loss": 0.0865, + "num_input_tokens_seen": 192336, + "step": 425 + }, + { + "epoch": 2.721518987341772, + "grad_norm": 0.5620536804199219, + "learning_rate": 0.0009961097860601818, + "loss": 0.0716, + "num_input_tokens_seen": 194640, + "step": 430 + }, + { + "epoch": 2.7531645569620253, + "grad_norm": 0.19732214510440826, + "learning_rate": 0.0009957584011463612, + "loss": 0.0469, + "num_input_tokens_seen": 196848, + "step": 435 + }, + { + "epoch": 2.7848101265822782, + "grad_norm": 0.12589260935783386, + "learning_rate": 0.0009953918928287304, + "loss": 0.0958, + "num_input_tokens_seen": 199088, + "step": 440 + }, + { + "epoch": 2.8164556962025316, + "grad_norm": 0.2692630887031555, + "learning_rate": 0.0009950102722878422, + "loss": 0.0527, + "num_input_tokens_seen": 201392, + "step": 445 + }, + { + "epoch": 2.848101265822785, + "grad_norm": 0.07784935086965561, + "learning_rate": 0.000994613551165258, + "loss": 0.0687, + "num_input_tokens_seen": 203664, + "step": 450 + }, + { + "epoch": 2.879746835443038, + "grad_norm": 0.18568935990333557, + "learning_rate": 0.0009942017415631903, + "loss": 0.0394, + "num_input_tokens_seen": 205968, + "step": 455 + }, + { + "epoch": 2.911392405063291, + "grad_norm": 0.23891539871692657, + "learning_rate": 0.000993774856044135, + "loss": 0.0556, + "num_input_tokens_seen": 208208, + "step": 460 + }, + { + "epoch": 2.9430379746835444, + "grad_norm": 0.20381800830364227, + "learning_rate": 0.0009933329076304885, + "loss": 0.0872, + "num_input_tokens_seen": 210544, + "step": 465 + }, + { + "epoch": 2.9746835443037973, + "grad_norm": 0.21808888018131256, + "learning_rate": 0.0009928759098041483, + "loss": 0.0528, + "num_input_tokens_seen": 212848, + "step": 470 + }, + { + "epoch": 3.0, + "eval_loss": 0.14478395879268646, + "eval_runtime": 1.6311, + "eval_samples_per_second": 42.917, + "eval_steps_per_second": 11.036, + "num_input_tokens_seen": 214432, + "step": 474 + }, + { + "epoch": 3.0063291139240507, + "grad_norm": 0.09254854172468185, + "learning_rate": 0.000992403876506104, + "loss": 0.0956, + "num_input_tokens_seen": 214944, + "step": 475 + }, + { + "epoch": 3.037974683544304, + "grad_norm": 0.19286249577999115, + "learning_rate": 0.0009919168221360114, + "loss": 0.0524, + "num_input_tokens_seen": 217280, + "step": 480 + }, + { + "epoch": 3.069620253164557, + "grad_norm": 0.21595709025859833, + "learning_rate": 0.0009914147615517526, + "loss": 0.0353, + "num_input_tokens_seen": 219456, + "step": 485 + }, + { + "epoch": 3.1012658227848102, + "grad_norm": 0.23821516335010529, + "learning_rate": 0.0009908977100689831, + "loss": 0.0849, + "num_input_tokens_seen": 221664, + "step": 490 + }, + { + "epoch": 3.132911392405063, + "grad_norm": 0.2962323725223541, + "learning_rate": 0.000990365683460665, + "loss": 0.0367, + "num_input_tokens_seen": 224064, + "step": 495 + }, + { + "epoch": 3.1645569620253164, + "grad_norm": 0.06096614524722099, + "learning_rate": 0.0009898186979565848, + "loss": 0.0144, + "num_input_tokens_seen": 226336, + "step": 500 + }, + { + "epoch": 3.1962025316455698, + "grad_norm": 0.30687931180000305, + "learning_rate": 0.00098925677024286, + "loss": 0.1235, + "num_input_tokens_seen": 228672, + "step": 505 + }, + { + "epoch": 3.2278481012658227, + "grad_norm": 0.424150675535202, + "learning_rate": 0.0009886799174614283, + "loss": 0.055, + "num_input_tokens_seen": 230880, + "step": 510 + }, + { + "epoch": 3.259493670886076, + "grad_norm": 0.18431183695793152, + "learning_rate": 0.0009880881572095256, + "loss": 0.0383, + "num_input_tokens_seen": 233056, + "step": 515 + }, + { + "epoch": 3.291139240506329, + "grad_norm": 0.11716429144144058, + "learning_rate": 0.0009874815075391489, + "loss": 0.0485, + "num_input_tokens_seen": 235232, + "step": 520 + }, + { + "epoch": 3.3227848101265822, + "grad_norm": 1.2910373210906982, + "learning_rate": 0.000986859986956506, + "loss": 0.0545, + "num_input_tokens_seen": 237472, + "step": 525 + }, + { + "epoch": 3.3544303797468356, + "grad_norm": 0.30946189165115356, + "learning_rate": 0.0009862236144214508, + "loss": 0.0374, + "num_input_tokens_seen": 239680, + "step": 530 + }, + { + "epoch": 3.3860759493670884, + "grad_norm": 0.24773317575454712, + "learning_rate": 0.0009855724093469046, + "loss": 0.0477, + "num_input_tokens_seen": 241888, + "step": 535 + }, + { + "epoch": 3.4177215189873418, + "grad_norm": 0.11383341252803802, + "learning_rate": 0.0009849063915982636, + "loss": 0.0418, + "num_input_tokens_seen": 244160, + "step": 540 + }, + { + "epoch": 3.449367088607595, + "grad_norm": 0.16509628295898438, + "learning_rate": 0.0009842255814927945, + "loss": 0.0598, + "num_input_tokens_seen": 246432, + "step": 545 + }, + { + "epoch": 3.481012658227848, + "grad_norm": 0.22512073814868927, + "learning_rate": 0.0009835299997990124, + "loss": 0.0719, + "num_input_tokens_seen": 248704, + "step": 550 + }, + { + "epoch": 3.5126582278481013, + "grad_norm": 0.1970265954732895, + "learning_rate": 0.0009828196677360496, + "loss": 0.0765, + "num_input_tokens_seen": 250944, + "step": 555 + }, + { + "epoch": 3.5443037974683547, + "grad_norm": 0.05213271081447601, + "learning_rate": 0.0009820946069730066, + "loss": 0.0157, + "num_input_tokens_seen": 253248, + "step": 560 + }, + { + "epoch": 3.5759493670886076, + "grad_norm": 0.2490898072719574, + "learning_rate": 0.0009813548396282912, + "loss": 0.0886, + "num_input_tokens_seen": 255648, + "step": 565 + }, + { + "epoch": 3.607594936708861, + "grad_norm": 0.07541127502918243, + "learning_rate": 0.000980600388268945, + "loss": 0.0192, + "num_input_tokens_seen": 257952, + "step": 570 + }, + { + "epoch": 3.6392405063291138, + "grad_norm": 0.019982341676950455, + "learning_rate": 0.0009798312759099538, + "loss": 0.0891, + "num_input_tokens_seen": 260256, + "step": 575 + }, + { + "epoch": 3.670886075949367, + "grad_norm": 0.31538912653923035, + "learning_rate": 0.0009790475260135457, + "loss": 0.0465, + "num_input_tokens_seen": 262688, + "step": 580 + }, + { + "epoch": 3.7025316455696204, + "grad_norm": 0.11736049503087997, + "learning_rate": 0.0009782491624884758, + "loss": 0.0317, + "num_input_tokens_seen": 264960, + "step": 585 + }, + { + "epoch": 3.7341772151898733, + "grad_norm": 0.1613597720861435, + "learning_rate": 0.0009774362096892967, + "loss": 0.0243, + "num_input_tokens_seen": 267136, + "step": 590 + }, + { + "epoch": 3.7658227848101267, + "grad_norm": 0.010684450156986713, + "learning_rate": 0.000976608692415615, + "loss": 0.1092, + "num_input_tokens_seen": 269408, + "step": 595 + }, + { + "epoch": 3.7974683544303796, + "grad_norm": 0.1734110414981842, + "learning_rate": 0.0009757666359113356, + "loss": 0.0523, + "num_input_tokens_seen": 271712, + "step": 600 + }, + { + "epoch": 3.829113924050633, + "grad_norm": 0.19304603338241577, + "learning_rate": 0.0009749100658638914, + "loss": 0.0791, + "num_input_tokens_seen": 274016, + "step": 605 + }, + { + "epoch": 3.8607594936708862, + "grad_norm": 0.052665892988443375, + "learning_rate": 0.0009740390084034589, + "loss": 0.0362, + "num_input_tokens_seen": 276288, + "step": 610 + }, + { + "epoch": 3.892405063291139, + "grad_norm": 0.3322295546531677, + "learning_rate": 0.0009731534901021626, + "loss": 0.0538, + "num_input_tokens_seen": 278560, + "step": 615 + }, + { + "epoch": 3.9240506329113924, + "grad_norm": 0.2676999270915985, + "learning_rate": 0.0009722535379732627, + "loss": 0.0655, + "num_input_tokens_seen": 280864, + "step": 620 + }, + { + "epoch": 3.9556962025316453, + "grad_norm": 0.13080698251724243, + "learning_rate": 0.0009713391794703321, + "loss": 0.0144, + "num_input_tokens_seen": 283072, + "step": 625 + }, + { + "epoch": 3.9873417721518987, + "grad_norm": 0.09336388111114502, + "learning_rate": 0.000970410442486419, + "loss": 0.0997, + "num_input_tokens_seen": 285376, + "step": 630 + }, + { + "epoch": 4.0, + "eval_loss": 0.1533374935388565, + "eval_runtime": 1.6381, + "eval_samples_per_second": 42.732, + "eval_steps_per_second": 10.988, + "num_input_tokens_seen": 286000, + "step": 632 + }, + { + "epoch": 4.018987341772152, + "grad_norm": 0.17443722486495972, + "learning_rate": 0.0009694673553531956, + "loss": 0.0405, + "num_input_tokens_seen": 287280, + "step": 635 + }, + { + "epoch": 4.050632911392405, + "grad_norm": 0.17030157148838043, + "learning_rate": 0.0009685099468400933, + "loss": 0.0569, + "num_input_tokens_seen": 289552, + "step": 640 + }, + { + "epoch": 4.082278481012658, + "grad_norm": 0.2395126223564148, + "learning_rate": 0.0009675382461534265, + "loss": 0.0356, + "num_input_tokens_seen": 291824, + "step": 645 + }, + { + "epoch": 4.113924050632911, + "grad_norm": 0.0889824628829956, + "learning_rate": 0.0009665522829355004, + "loss": 0.0346, + "num_input_tokens_seen": 294224, + "step": 650 + }, + { + "epoch": 4.1455696202531644, + "grad_norm": 0.4414255917072296, + "learning_rate": 0.0009655520872637074, + "loss": 0.0435, + "num_input_tokens_seen": 296496, + "step": 655 + }, + { + "epoch": 4.177215189873418, + "grad_norm": 0.23893271386623383, + "learning_rate": 0.0009645376896496087, + "loss": 0.0602, + "num_input_tokens_seen": 298896, + "step": 660 + }, + { + "epoch": 4.208860759493671, + "grad_norm": 0.11268594115972519, + "learning_rate": 0.0009635091210380051, + "loss": 0.0586, + "num_input_tokens_seen": 301200, + "step": 665 + }, + { + "epoch": 4.2405063291139244, + "grad_norm": 0.0319950170814991, + "learning_rate": 0.0009624664128059915, + "loss": 0.0346, + "num_input_tokens_seen": 303376, + "step": 670 + }, + { + "epoch": 4.272151898734177, + "grad_norm": 0.34500107169151306, + "learning_rate": 0.0009614095967620004, + "loss": 0.0285, + "num_input_tokens_seen": 305744, + "step": 675 + }, + { + "epoch": 4.30379746835443, + "grad_norm": 0.1444796770811081, + "learning_rate": 0.0009603387051448313, + "loss": 0.1045, + "num_input_tokens_seen": 307920, + "step": 680 + }, + { + "epoch": 4.3354430379746836, + "grad_norm": 0.44996634125709534, + "learning_rate": 0.000959253770622668, + "loss": 0.0623, + "num_input_tokens_seen": 310224, + "step": 685 + }, + { + "epoch": 4.367088607594937, + "grad_norm": 0.09627924114465714, + "learning_rate": 0.0009581548262920805, + "loss": 0.0271, + "num_input_tokens_seen": 312464, + "step": 690 + }, + { + "epoch": 4.39873417721519, + "grad_norm": 0.32667797803878784, + "learning_rate": 0.0009570419056770173, + "loss": 0.0453, + "num_input_tokens_seen": 314704, + "step": 695 + }, + { + "epoch": 4.430379746835443, + "grad_norm": 0.3699743151664734, + "learning_rate": 0.0009559150427277812, + "loss": 0.0549, + "num_input_tokens_seen": 317008, + "step": 700 + }, + { + "epoch": 4.462025316455696, + "grad_norm": 0.019202928990125656, + "learning_rate": 0.0009547742718199938, + "loss": 0.0257, + "num_input_tokens_seen": 319248, + "step": 705 + }, + { + "epoch": 4.493670886075949, + "grad_norm": 0.1922365128993988, + "learning_rate": 0.0009536196277535483, + "loss": 0.1067, + "num_input_tokens_seen": 321552, + "step": 710 + }, + { + "epoch": 4.525316455696203, + "grad_norm": 0.14416015148162842, + "learning_rate": 0.0009524511457515457, + "loss": 0.06, + "num_input_tokens_seen": 323728, + "step": 715 + }, + { + "epoch": 4.556962025316456, + "grad_norm": 0.4896336495876312, + "learning_rate": 0.000951268861459222, + "loss": 0.0325, + "num_input_tokens_seen": 326000, + "step": 720 + }, + { + "epoch": 4.588607594936709, + "grad_norm": 0.15833806991577148, + "learning_rate": 0.0009500728109428603, + "loss": 0.0738, + "num_input_tokens_seen": 328368, + "step": 725 + }, + { + "epoch": 4.620253164556962, + "grad_norm": 0.04314582794904709, + "learning_rate": 0.0009488630306886904, + "loss": 0.0085, + "num_input_tokens_seen": 330576, + "step": 730 + }, + { + "epoch": 4.651898734177215, + "grad_norm": 0.07410436868667603, + "learning_rate": 0.0009476395576017756, + "loss": 0.024, + "num_input_tokens_seen": 332784, + "step": 735 + }, + { + "epoch": 4.6835443037974684, + "grad_norm": 0.23897461593151093, + "learning_rate": 0.0009464024290048879, + "loss": 0.0318, + "num_input_tokens_seen": 335152, + "step": 740 + }, + { + "epoch": 4.715189873417722, + "grad_norm": 0.055379047989845276, + "learning_rate": 0.0009451516826373676, + "loss": 0.0238, + "num_input_tokens_seen": 337456, + "step": 745 + }, + { + "epoch": 4.746835443037975, + "grad_norm": 0.2577999532222748, + "learning_rate": 0.0009438873566539743, + "loss": 0.072, + "num_input_tokens_seen": 339792, + "step": 750 + }, + { + "epoch": 4.7784810126582276, + "grad_norm": 0.11338081955909729, + "learning_rate": 0.0009426094896237213, + "loss": 0.0215, + "num_input_tokens_seen": 342032, + "step": 755 + }, + { + "epoch": 4.810126582278481, + "grad_norm": 0.030526921153068542, + "learning_rate": 0.0009413181205286995, + "loss": 0.0278, + "num_input_tokens_seen": 344304, + "step": 760 + }, + { + "epoch": 4.841772151898734, + "grad_norm": 0.07986577600240707, + "learning_rate": 0.0009400132887628885, + "loss": 0.0585, + "num_input_tokens_seen": 346640, + "step": 765 + }, + { + "epoch": 4.8734177215189876, + "grad_norm": 0.12039704620838165, + "learning_rate": 0.0009386950341309545, + "loss": 0.0516, + "num_input_tokens_seen": 348880, + "step": 770 + }, + { + "epoch": 4.905063291139241, + "grad_norm": 0.11591454595327377, + "learning_rate": 0.0009373633968470361, + "loss": 0.0503, + "num_input_tokens_seen": 351248, + "step": 775 + }, + { + "epoch": 4.936708860759493, + "grad_norm": 0.23400233685970306, + "learning_rate": 0.0009360184175335181, + "loss": 0.0874, + "num_input_tokens_seen": 353552, + "step": 780 + }, + { + "epoch": 4.968354430379747, + "grad_norm": 0.22627611458301544, + "learning_rate": 0.0009346601372197913, + "loss": 0.0494, + "num_input_tokens_seen": 355888, + "step": 785 + }, + { + "epoch": 5.0, + "grad_norm": 0.09335144609212875, + "learning_rate": 0.0009332885973410014, + "loss": 0.0227, + "num_input_tokens_seen": 357888, + "step": 790 + }, + { + "epoch": 5.0, + "eval_loss": 0.15590490400791168, + "eval_runtime": 1.6296, + "eval_samples_per_second": 42.955, + "eval_steps_per_second": 11.046, + "num_input_tokens_seen": 357888, + "step": 790 + }, + { + "epoch": 5.031645569620253, + "grad_norm": 0.04937349632382393, + "learning_rate": 0.0009319038397367856, + "loss": 0.0124, + "num_input_tokens_seen": 360096, + "step": 795 + }, + { + "epoch": 5.063291139240507, + "grad_norm": 0.020451189950108528, + "learning_rate": 0.0009305059066499948, + "loss": 0.03, + "num_input_tokens_seen": 362432, + "step": 800 + }, + { + "epoch": 5.094936708860759, + "grad_norm": 0.13021968305110931, + "learning_rate": 0.0009290948407254065, + "loss": 0.0469, + "num_input_tokens_seen": 364768, + "step": 805 + }, + { + "epoch": 5.1265822784810124, + "grad_norm": 0.0669485479593277, + "learning_rate": 0.0009276706850084226, + "loss": 0.015, + "num_input_tokens_seen": 366976, + "step": 810 + }, + { + "epoch": 5.158227848101266, + "grad_norm": 0.006794926710426807, + "learning_rate": 0.0009262334829437575, + "loss": 0.0183, + "num_input_tokens_seen": 369184, + "step": 815 + }, + { + "epoch": 5.189873417721519, + "grad_norm": 0.11718007922172546, + "learning_rate": 0.0009247832783741119, + "loss": 0.0278, + "num_input_tokens_seen": 371424, + "step": 820 + }, + { + "epoch": 5.2215189873417724, + "grad_norm": 0.09471049159765244, + "learning_rate": 0.0009233201155388354, + "loss": 0.0145, + "num_input_tokens_seen": 373600, + "step": 825 + }, + { + "epoch": 5.253164556962025, + "grad_norm": 0.12314964830875397, + "learning_rate": 0.0009218440390725772, + "loss": 0.0197, + "num_input_tokens_seen": 375904, + "step": 830 + }, + { + "epoch": 5.284810126582278, + "grad_norm": 0.19620075821876526, + "learning_rate": 0.000920355094003925, + "loss": 0.0325, + "num_input_tokens_seen": 378336, + "step": 835 + }, + { + "epoch": 5.3164556962025316, + "grad_norm": 0.36962851881980896, + "learning_rate": 0.0009188533257540302, + "loss": 0.0597, + "num_input_tokens_seen": 380672, + "step": 840 + }, + { + "epoch": 5.348101265822785, + "grad_norm": 0.19437819719314575, + "learning_rate": 0.0009173387801352231, + "loss": 0.0407, + "num_input_tokens_seen": 382976, + "step": 845 + }, + { + "epoch": 5.379746835443038, + "grad_norm": 0.015408460050821304, + "learning_rate": 0.0009158115033496156, + "loss": 0.0627, + "num_input_tokens_seen": 385152, + "step": 850 + }, + { + "epoch": 5.4113924050632916, + "grad_norm": 0.02119683474302292, + "learning_rate": 0.0009142715419876909, + "loss": 0.0325, + "num_input_tokens_seen": 387488, + "step": 855 + }, + { + "epoch": 5.443037974683544, + "grad_norm": 0.028640177100896835, + "learning_rate": 0.0009127189430268832, + "loss": 0.0164, + "num_input_tokens_seen": 389696, + "step": 860 + }, + { + "epoch": 5.474683544303797, + "grad_norm": 0.17513598501682281, + "learning_rate": 0.0009111537538301435, + "loss": 0.0355, + "num_input_tokens_seen": 392000, + "step": 865 + }, + { + "epoch": 5.506329113924051, + "grad_norm": 0.07438070327043533, + "learning_rate": 0.0009095760221444959, + "loss": 0.0221, + "num_input_tokens_seen": 394240, + "step": 870 + }, + { + "epoch": 5.537974683544304, + "grad_norm": 0.43085575103759766, + "learning_rate": 0.0009079857960995805, + "loss": 0.0376, + "num_input_tokens_seen": 396512, + "step": 875 + }, + { + "epoch": 5.569620253164557, + "grad_norm": 0.06739749014377594, + "learning_rate": 0.000906383124206185, + "loss": 0.0333, + "num_input_tokens_seen": 398880, + "step": 880 + }, + { + "epoch": 5.60126582278481, + "grad_norm": 0.14802050590515137, + "learning_rate": 0.0009047680553547656, + "loss": 0.0306, + "num_input_tokens_seen": 401184, + "step": 885 + }, + { + "epoch": 5.632911392405063, + "grad_norm": 0.15422411262989044, + "learning_rate": 0.0009031406388139543, + "loss": 0.0125, + "num_input_tokens_seen": 403360, + "step": 890 + }, + { + "epoch": 5.6645569620253164, + "grad_norm": 0.04390065744519234, + "learning_rate": 0.0009015009242290573, + "loss": 0.0636, + "num_input_tokens_seen": 405760, + "step": 895 + }, + { + "epoch": 5.69620253164557, + "grad_norm": 0.1609637290239334, + "learning_rate": 0.0008998489616205395, + "loss": 0.0916, + "num_input_tokens_seen": 408096, + "step": 900 + }, + { + "epoch": 5.727848101265823, + "grad_norm": 0.07584994286298752, + "learning_rate": 0.0008981848013824993, + "loss": 0.0526, + "num_input_tokens_seen": 410432, + "step": 905 + }, + { + "epoch": 5.759493670886076, + "grad_norm": 0.11594012379646301, + "learning_rate": 0.0008965084942811311, + "loss": 0.0439, + "num_input_tokens_seen": 412608, + "step": 910 + }, + { + "epoch": 5.791139240506329, + "grad_norm": 0.16797393560409546, + "learning_rate": 0.0008948200914531761, + "loss": 0.0733, + "num_input_tokens_seen": 414784, + "step": 915 + }, + { + "epoch": 5.822784810126582, + "grad_norm": 0.18480846285820007, + "learning_rate": 0.0008931196444043635, + "loss": 0.0321, + "num_input_tokens_seen": 417024, + "step": 920 + }, + { + "epoch": 5.8544303797468356, + "grad_norm": 0.03197441250085831, + "learning_rate": 0.0008914072050078376, + "loss": 0.0602, + "num_input_tokens_seen": 419296, + "step": 925 + }, + { + "epoch": 5.886075949367089, + "grad_norm": 0.21479590237140656, + "learning_rate": 0.0008896828255025777, + "loss": 0.0796, + "num_input_tokens_seen": 421504, + "step": 930 + }, + { + "epoch": 5.917721518987342, + "grad_norm": 0.061540644615888596, + "learning_rate": 0.000887946558491802, + "loss": 0.0227, + "num_input_tokens_seen": 423712, + "step": 935 + }, + { + "epoch": 5.949367088607595, + "grad_norm": 0.1809530258178711, + "learning_rate": 0.0008861984569413646, + "loss": 0.014, + "num_input_tokens_seen": 426016, + "step": 940 + }, + { + "epoch": 5.981012658227848, + "grad_norm": 0.22476357221603394, + "learning_rate": 0.0008844385741781394, + "loss": 0.0387, + "num_input_tokens_seen": 428352, + "step": 945 + }, + { + "epoch": 6.0, + "eval_loss": 0.1550048142671585, + "eval_runtime": 1.6336, + "eval_samples_per_second": 42.851, + "eval_steps_per_second": 11.019, + "num_input_tokens_seen": 429456, + "step": 948 + }, + { + "epoch": 6.012658227848101, + "grad_norm": 0.243870347738266, + "learning_rate": 0.0008826669638883927, + "loss": 0.0337, + "num_input_tokens_seen": 430416, + "step": 950 + }, + { + "epoch": 6.044303797468355, + "grad_norm": 0.3352782428264618, + "learning_rate": 0.0008808836801161464, + "loss": 0.027, + "num_input_tokens_seen": 432624, + "step": 955 + }, + { + "epoch": 6.075949367088608, + "grad_norm": 0.19765187799930573, + "learning_rate": 0.0008790887772615288, + "loss": 0.0342, + "num_input_tokens_seen": 434960, + "step": 960 + }, + { + "epoch": 6.1075949367088604, + "grad_norm": 0.3231484293937683, + "learning_rate": 0.0008772823100791151, + "loss": 0.0166, + "num_input_tokens_seen": 437200, + "step": 965 + }, + { + "epoch": 6.139240506329114, + "grad_norm": 0.02029154635965824, + "learning_rate": 0.0008754643336762571, + "loss": 0.0098, + "num_input_tokens_seen": 439376, + "step": 970 + }, + { + "epoch": 6.170886075949367, + "grad_norm": 0.4215114414691925, + "learning_rate": 0.0008736349035114024, + "loss": 0.0428, + "num_input_tokens_seen": 441744, + "step": 975 + }, + { + "epoch": 6.2025316455696204, + "grad_norm": 0.16070908308029175, + "learning_rate": 0.0008717940753924023, + "loss": 0.0274, + "num_input_tokens_seen": 443952, + "step": 980 + }, + { + "epoch": 6.234177215189874, + "grad_norm": 0.2765108048915863, + "learning_rate": 0.0008699419054748092, + "loss": 0.0172, + "num_input_tokens_seen": 446256, + "step": 985 + }, + { + "epoch": 6.265822784810126, + "grad_norm": 0.6181844472885132, + "learning_rate": 0.0008680784502601644, + "loss": 0.0447, + "num_input_tokens_seen": 448560, + "step": 990 + }, + { + "epoch": 6.2974683544303796, + "grad_norm": 0.02360740303993225, + "learning_rate": 0.0008662037665942733, + "loss": 0.0648, + "num_input_tokens_seen": 450640, + "step": 995 + }, + { + "epoch": 6.329113924050633, + "grad_norm": 0.042995549738407135, + "learning_rate": 0.0008643179116654719, + "loss": 0.0392, + "num_input_tokens_seen": 452912, + "step": 1000 + }, + { + "epoch": 6.360759493670886, + "grad_norm": 0.17374254763126373, + "learning_rate": 0.0008624209430028826, + "loss": 0.0394, + "num_input_tokens_seen": 455280, + "step": 1005 + }, + { + "epoch": 6.3924050632911396, + "grad_norm": 0.3477534055709839, + "learning_rate": 0.0008605129184746585, + "loss": 0.035, + "num_input_tokens_seen": 457584, + "step": 1010 + }, + { + "epoch": 6.424050632911392, + "grad_norm": 0.1447627693414688, + "learning_rate": 0.0008585938962862184, + "loss": 0.06, + "num_input_tokens_seen": 459792, + "step": 1015 + }, + { + "epoch": 6.455696202531645, + "grad_norm": 0.11052645742893219, + "learning_rate": 0.0008566639349784715, + "loss": 0.0287, + "num_input_tokens_seen": 462064, + "step": 1020 + }, + { + "epoch": 6.487341772151899, + "grad_norm": 0.299258291721344, + "learning_rate": 0.0008547230934260312, + "loss": 0.0474, + "num_input_tokens_seen": 464272, + "step": 1025 + }, + { + "epoch": 6.518987341772152, + "grad_norm": 0.03278442844748497, + "learning_rate": 0.0008527714308354191, + "loss": 0.0214, + "num_input_tokens_seen": 466640, + "step": 1030 + }, + { + "epoch": 6.550632911392405, + "grad_norm": 0.10663223266601562, + "learning_rate": 0.0008508090067432591, + "loss": 0.0205, + "num_input_tokens_seen": 468848, + "step": 1035 + }, + { + "epoch": 6.582278481012658, + "grad_norm": 0.09133275598287582, + "learning_rate": 0.000848835881014461, + "loss": 0.0179, + "num_input_tokens_seen": 471216, + "step": 1040 + }, + { + "epoch": 6.613924050632911, + "grad_norm": 0.21493889391422272, + "learning_rate": 0.0008468521138403945, + "loss": 0.0185, + "num_input_tokens_seen": 473552, + "step": 1045 + }, + { + "epoch": 6.6455696202531644, + "grad_norm": 0.16246415674686432, + "learning_rate": 0.0008448577657370528, + "loss": 0.0362, + "num_input_tokens_seen": 475792, + "step": 1050 + }, + { + "epoch": 6.677215189873418, + "grad_norm": 0.12449932098388672, + "learning_rate": 0.0008428528975432066, + "loss": 0.0297, + "num_input_tokens_seen": 478096, + "step": 1055 + }, + { + "epoch": 6.708860759493671, + "grad_norm": 0.21884576976299286, + "learning_rate": 0.0008408375704185482, + "loss": 0.0233, + "num_input_tokens_seen": 480400, + "step": 1060 + }, + { + "epoch": 6.740506329113924, + "grad_norm": 0.27845168113708496, + "learning_rate": 0.0008388118458418259, + "loss": 0.0372, + "num_input_tokens_seen": 482704, + "step": 1065 + }, + { + "epoch": 6.772151898734177, + "grad_norm": 0.2817884683609009, + "learning_rate": 0.0008367757856089684, + "loss": 0.0393, + "num_input_tokens_seen": 484848, + "step": 1070 + }, + { + "epoch": 6.80379746835443, + "grad_norm": 0.2569531202316284, + "learning_rate": 0.0008347294518311994, + "loss": 0.031, + "num_input_tokens_seen": 487152, + "step": 1075 + }, + { + "epoch": 6.8354430379746836, + "grad_norm": 0.06019921600818634, + "learning_rate": 0.0008326729069331436, + "loss": 0.0113, + "num_input_tokens_seen": 489360, + "step": 1080 + }, + { + "epoch": 6.867088607594937, + "grad_norm": 0.042034849524497986, + "learning_rate": 0.0008306062136509219, + "loss": 0.0312, + "num_input_tokens_seen": 491664, + "step": 1085 + }, + { + "epoch": 6.89873417721519, + "grad_norm": 0.41757842898368835, + "learning_rate": 0.0008285294350302375, + "loss": 0.0686, + "num_input_tokens_seen": 493968, + "step": 1090 + }, + { + "epoch": 6.930379746835443, + "grad_norm": 0.3947131931781769, + "learning_rate": 0.0008264426344244527, + "loss": 0.0443, + "num_input_tokens_seen": 496272, + "step": 1095 + }, + { + "epoch": 6.962025316455696, + "grad_norm": 0.14148084819316864, + "learning_rate": 0.000824345875492657, + "loss": 0.0376, + "num_input_tokens_seen": 498576, + "step": 1100 + }, + { + "epoch": 6.993670886075949, + "grad_norm": 0.24408359825611115, + "learning_rate": 0.000822239222197724, + "loss": 0.0222, + "num_input_tokens_seen": 500912, + "step": 1105 + }, + { + "epoch": 7.0, + "eval_loss": 0.15802980959415436, + "eval_runtime": 1.6291, + "eval_samples_per_second": 42.968, + "eval_steps_per_second": 11.049, + "num_input_tokens_seen": 501136, + "step": 1106 + }, + { + "epoch": 7.025316455696203, + "grad_norm": 0.11984951049089432, + "learning_rate": 0.0008201227388043606, + "loss": 0.0145, + "num_input_tokens_seen": 502928, + "step": 1110 + }, + { + "epoch": 7.056962025316456, + "grad_norm": 0.20624016225337982, + "learning_rate": 0.0008179964898771472, + "loss": 0.0199, + "num_input_tokens_seen": 505232, + "step": 1115 + }, + { + "epoch": 7.0886075949367084, + "grad_norm": 0.020836766809225082, + "learning_rate": 0.0008158605402785673, + "loss": 0.0394, + "num_input_tokens_seen": 507408, + "step": 1120 + }, + { + "epoch": 7.120253164556962, + "grad_norm": 0.1678433120250702, + "learning_rate": 0.000813714955167029, + "loss": 0.048, + "num_input_tokens_seen": 509680, + "step": 1125 + }, + { + "epoch": 7.151898734177215, + "grad_norm": 0.26841288805007935, + "learning_rate": 0.0008115597999948779, + "loss": 0.0122, + "num_input_tokens_seen": 511920, + "step": 1130 + }, + { + "epoch": 7.1835443037974684, + "grad_norm": 0.20223906636238098, + "learning_rate": 0.000809395140506399, + "loss": 0.0181, + "num_input_tokens_seen": 514256, + "step": 1135 + }, + { + "epoch": 7.215189873417722, + "grad_norm": 0.13459138572216034, + "learning_rate": 0.0008072210427358138, + "loss": 0.0263, + "num_input_tokens_seen": 516560, + "step": 1140 + }, + { + "epoch": 7.246835443037975, + "grad_norm": 0.017935167998075485, + "learning_rate": 0.0008050375730052621, + "loss": 0.0184, + "num_input_tokens_seen": 518864, + "step": 1145 + }, + { + "epoch": 7.2784810126582276, + "grad_norm": 0.005919218994677067, + "learning_rate": 0.0008028447979227828, + "loss": 0.0106, + "num_input_tokens_seen": 521232, + "step": 1150 + }, + { + "epoch": 7.310126582278481, + "grad_norm": 0.04079112410545349, + "learning_rate": 0.0008006427843802786, + "loss": 0.0263, + "num_input_tokens_seen": 523536, + "step": 1155 + }, + { + "epoch": 7.341772151898734, + "grad_norm": 0.09390467405319214, + "learning_rate": 0.0007984315995514777, + "loss": 0.0157, + "num_input_tokens_seen": 525840, + "step": 1160 + }, + { + "epoch": 7.3734177215189876, + "grad_norm": 0.04086320102214813, + "learning_rate": 0.0007962113108898838, + "loss": 0.0123, + "num_input_tokens_seen": 528144, + "step": 1165 + }, + { + "epoch": 7.405063291139241, + "grad_norm": 0.027376219630241394, + "learning_rate": 0.0007939819861267182, + "loss": 0.0146, + "num_input_tokens_seen": 530320, + "step": 1170 + }, + { + "epoch": 7.436708860759493, + "grad_norm": 0.058822840452194214, + "learning_rate": 0.0007917436932688538, + "loss": 0.0371, + "num_input_tokens_seen": 532592, + "step": 1175 + }, + { + "epoch": 7.468354430379747, + "grad_norm": 0.20625318586826324, + "learning_rate": 0.0007894965005967404, + "loss": 0.0172, + "num_input_tokens_seen": 534960, + "step": 1180 + }, + { + "epoch": 7.5, + "grad_norm": 0.11094500869512558, + "learning_rate": 0.0007872404766623225, + "loss": 0.034, + "num_input_tokens_seen": 537232, + "step": 1185 + }, + { + "epoch": 7.531645569620253, + "grad_norm": 0.1838381588459015, + "learning_rate": 0.000784975690286947, + "loss": 0.0342, + "num_input_tokens_seen": 539504, + "step": 1190 + }, + { + "epoch": 7.563291139240507, + "grad_norm": 0.037260618060827255, + "learning_rate": 0.0007827022105592645, + "loss": 0.009, + "num_input_tokens_seen": 541680, + "step": 1195 + }, + { + "epoch": 7.594936708860759, + "grad_norm": 0.0016006848309189081, + "learning_rate": 0.0007804201068331211, + "loss": 0.0166, + "num_input_tokens_seen": 543984, + "step": 1200 + }, + { + "epoch": 7.6265822784810124, + "grad_norm": 0.1560075581073761, + "learning_rate": 0.0007781294487254435, + "loss": 0.0368, + "num_input_tokens_seen": 546224, + "step": 1205 + }, + { + "epoch": 7.658227848101266, + "grad_norm": 0.030724182724952698, + "learning_rate": 0.0007758303061141148, + "loss": 0.0279, + "num_input_tokens_seen": 548496, + "step": 1210 + }, + { + "epoch": 7.689873417721519, + "grad_norm": 0.06297805160284042, + "learning_rate": 0.0007735227491358426, + "loss": 0.0188, + "num_input_tokens_seen": 550704, + "step": 1215 + }, + { + "epoch": 7.7215189873417724, + "grad_norm": 0.20682714879512787, + "learning_rate": 0.0007712068481840198, + "loss": 0.0415, + "num_input_tokens_seen": 552976, + "step": 1220 + }, + { + "epoch": 7.753164556962025, + "grad_norm": 0.08757719397544861, + "learning_rate": 0.0007688826739065777, + "loss": 0.047, + "num_input_tokens_seen": 555312, + "step": 1225 + }, + { + "epoch": 7.784810126582278, + "grad_norm": 0.05496051907539368, + "learning_rate": 0.00076655029720383, + "loss": 0.0235, + "num_input_tokens_seen": 557680, + "step": 1230 + }, + { + "epoch": 7.8164556962025316, + "grad_norm": 0.1725744754076004, + "learning_rate": 0.0007642097892263098, + "loss": 0.0196, + "num_input_tokens_seen": 559984, + "step": 1235 + }, + { + "epoch": 7.848101265822785, + "grad_norm": 0.19660170376300812, + "learning_rate": 0.0007618612213726, + "loss": 0.0259, + "num_input_tokens_seen": 562256, + "step": 1240 + }, + { + "epoch": 7.879746835443038, + "grad_norm": 0.0013731749495491385, + "learning_rate": 0.0007595046652871552, + "loss": 0.0349, + "num_input_tokens_seen": 564560, + "step": 1245 + }, + { + "epoch": 7.911392405063291, + "grad_norm": 0.029319079592823982, + "learning_rate": 0.0007571401928581145, + "loss": 0.0289, + "num_input_tokens_seen": 566896, + "step": 1250 + }, + { + "epoch": 7.943037974683544, + "grad_norm": 0.009290185756981373, + "learning_rate": 0.0007547678762151109, + "loss": 0.0146, + "num_input_tokens_seen": 569200, + "step": 1255 + }, + { + "epoch": 7.974683544303797, + "grad_norm": 0.059927552938461304, + "learning_rate": 0.0007523877877270695, + "loss": 0.0233, + "num_input_tokens_seen": 571536, + "step": 1260 + }, + { + "epoch": 8.0, + "eval_loss": 0.19430989027023315, + "eval_runtime": 1.6296, + "eval_samples_per_second": 42.954, + "eval_steps_per_second": 11.045, + "num_input_tokens_seen": 573104, + "step": 1264 + }, + { + "epoch": 8.00632911392405, + "grad_norm": 0.0899854376912117, + "learning_rate": 0.00075, + "loss": 0.0227, + "num_input_tokens_seen": 573616, + "step": 1265 + }, + { + "epoch": 8.037974683544304, + "grad_norm": 0.048420343548059464, + "learning_rate": 0.000747604585874782, + "loss": 0.008, + "num_input_tokens_seen": 575824, + "step": 1270 + }, + { + "epoch": 8.069620253164556, + "grad_norm": 0.03089860826730728, + "learning_rate": 0.0007452016184249428, + "loss": 0.0146, + "num_input_tokens_seen": 578032, + "step": 1275 + }, + { + "epoch": 8.10126582278481, + "grad_norm": 0.1087641566991806, + "learning_rate": 0.0007427911709544287, + "loss": 0.0107, + "num_input_tokens_seen": 580464, + "step": 1280 + }, + { + "epoch": 8.132911392405063, + "grad_norm": 0.00169737811665982, + "learning_rate": 0.0007403733169953678, + "loss": 0.0169, + "num_input_tokens_seen": 582640, + "step": 1285 + }, + { + "epoch": 8.164556962025316, + "grad_norm": 0.18757465481758118, + "learning_rate": 0.0007379481303058282, + "loss": 0.0252, + "num_input_tokens_seen": 584976, + "step": 1290 + }, + { + "epoch": 8.19620253164557, + "grad_norm": 0.2876146733760834, + "learning_rate": 0.000735515684867567, + "loss": 0.0095, + "num_input_tokens_seen": 587120, + "step": 1295 + }, + { + "epoch": 8.227848101265822, + "grad_norm": 0.11902011930942535, + "learning_rate": 0.0007330760548837738, + "loss": 0.0242, + "num_input_tokens_seen": 589392, + "step": 1300 + }, + { + "epoch": 8.259493670886076, + "grad_norm": 0.001837995252572, + "learning_rate": 0.0007306293147768067, + "loss": 0.0257, + "num_input_tokens_seen": 591632, + "step": 1305 + }, + { + "epoch": 8.291139240506329, + "grad_norm": 0.22673650085926056, + "learning_rate": 0.0007281755391859229, + "loss": 0.021, + "num_input_tokens_seen": 594000, + "step": 1310 + }, + { + "epoch": 8.322784810126583, + "grad_norm": 0.007771142292767763, + "learning_rate": 0.0007257148029650007, + "loss": 0.0117, + "num_input_tokens_seen": 596368, + "step": 1315 + }, + { + "epoch": 8.354430379746836, + "grad_norm": 0.021376727148890495, + "learning_rate": 0.0007232471811802568, + "loss": 0.0103, + "num_input_tokens_seen": 598672, + "step": 1320 + }, + { + "epoch": 8.386075949367088, + "grad_norm": 0.40462520718574524, + "learning_rate": 0.000720772749107956, + "loss": 0.0123, + "num_input_tokens_seen": 601040, + "step": 1325 + }, + { + "epoch": 8.417721518987342, + "grad_norm": 0.27043724060058594, + "learning_rate": 0.000718291582232115, + "loss": 0.0234, + "num_input_tokens_seen": 603344, + "step": 1330 + }, + { + "epoch": 8.449367088607595, + "grad_norm": 0.09709592163562775, + "learning_rate": 0.0007158037562421997, + "loss": 0.0188, + "num_input_tokens_seen": 605552, + "step": 1335 + }, + { + "epoch": 8.481012658227849, + "grad_norm": 0.012457730248570442, + "learning_rate": 0.0007133093470308164, + "loss": 0.0198, + "num_input_tokens_seen": 607760, + "step": 1340 + }, + { + "epoch": 8.512658227848101, + "grad_norm": 0.0654660239815712, + "learning_rate": 0.0007108084306913959, + "loss": 0.0119, + "num_input_tokens_seen": 610192, + "step": 1345 + }, + { + "epoch": 8.544303797468354, + "grad_norm": 0.2206096202135086, + "learning_rate": 0.0007083010835158732, + "loss": 0.0216, + "num_input_tokens_seen": 612528, + "step": 1350 + }, + { + "epoch": 8.575949367088608, + "grad_norm": 0.14902903139591217, + "learning_rate": 0.0007057873819923602, + "loss": 0.0113, + "num_input_tokens_seen": 614768, + "step": 1355 + }, + { + "epoch": 8.60759493670886, + "grad_norm": 0.016851400956511497, + "learning_rate": 0.0007032674028028109, + "loss": 0.0331, + "num_input_tokens_seen": 616912, + "step": 1360 + }, + { + "epoch": 8.639240506329115, + "grad_norm": 0.013078989461064339, + "learning_rate": 0.000700741222820684, + "loss": 0.0029, + "num_input_tokens_seen": 619088, + "step": 1365 + }, + { + "epoch": 8.670886075949367, + "grad_norm": 0.238576740026474, + "learning_rate": 0.000698208919108597, + "loss": 0.0327, + "num_input_tokens_seen": 621328, + "step": 1370 + }, + { + "epoch": 8.70253164556962, + "grad_norm": 0.17277413606643677, + "learning_rate": 0.0006956705689159751, + "loss": 0.0105, + "num_input_tokens_seen": 623536, + "step": 1375 + }, + { + "epoch": 8.734177215189874, + "grad_norm": 0.02624700218439102, + "learning_rate": 0.0006931262496766954, + "loss": 0.0212, + "num_input_tokens_seen": 625744, + "step": 1380 + }, + { + "epoch": 8.765822784810126, + "grad_norm": 0.08424220234155655, + "learning_rate": 0.0006905760390067235, + "loss": 0.0174, + "num_input_tokens_seen": 628144, + "step": 1385 + }, + { + "epoch": 8.79746835443038, + "grad_norm": 0.05029447004199028, + "learning_rate": 0.0006880200147017476, + "loss": 0.0109, + "num_input_tokens_seen": 630448, + "step": 1390 + }, + { + "epoch": 8.829113924050633, + "grad_norm": 0.01783582754433155, + "learning_rate": 0.0006854582547348037, + "loss": 0.0095, + "num_input_tokens_seen": 632720, + "step": 1395 + }, + { + "epoch": 8.860759493670885, + "grad_norm": 0.11064671725034714, + "learning_rate": 0.0006828908372538977, + "loss": 0.041, + "num_input_tokens_seen": 634960, + "step": 1400 + }, + { + "epoch": 8.89240506329114, + "grad_norm": 0.14175012707710266, + "learning_rate": 0.0006803178405796214, + "loss": 0.0247, + "num_input_tokens_seen": 637328, + "step": 1405 + }, + { + "epoch": 8.924050632911392, + "grad_norm": 0.039730366319417953, + "learning_rate": 0.0006777393432027626, + "loss": 0.011, + "num_input_tokens_seen": 639600, + "step": 1410 + }, + { + "epoch": 8.955696202531646, + "grad_norm": 0.31237632036209106, + "learning_rate": 0.0006751554237819122, + "loss": 0.0294, + "num_input_tokens_seen": 641808, + "step": 1415 + }, + { + "epoch": 8.987341772151899, + "grad_norm": 0.5145399570465088, + "learning_rate": 0.0006725661611410633, + "loss": 0.0281, + "num_input_tokens_seen": 644080, + "step": 1420 + }, + { + "epoch": 9.0, + "eval_loss": 0.2004173845052719, + "eval_runtime": 1.6339, + "eval_samples_per_second": 42.843, + "eval_steps_per_second": 11.017, + "num_input_tokens_seen": 644752, + "step": 1422 + }, + { + "epoch": 9.018987341772151, + "grad_norm": 0.14643707871437073, + "learning_rate": 0.0006699716342672071, + "loss": 0.0135, + "num_input_tokens_seen": 646128, + "step": 1425 + }, + { + "epoch": 9.050632911392405, + "grad_norm": 0.09504041075706482, + "learning_rate": 0.0006673719223079233, + "loss": 0.0054, + "num_input_tokens_seen": 648336, + "step": 1430 + }, + { + "epoch": 9.082278481012658, + "grad_norm": 0.12280137836933136, + "learning_rate": 0.000664767104568966, + "loss": 0.0229, + "num_input_tokens_seen": 650640, + "step": 1435 + }, + { + "epoch": 9.113924050632912, + "grad_norm": 0.017096519470214844, + "learning_rate": 0.0006621572605118438, + "loss": 0.0075, + "num_input_tokens_seen": 652880, + "step": 1440 + }, + { + "epoch": 9.145569620253164, + "grad_norm": 0.004320152103900909, + "learning_rate": 0.0006595424697513964, + "loss": 0.0067, + "num_input_tokens_seen": 655056, + "step": 1445 + }, + { + "epoch": 9.177215189873417, + "grad_norm": 0.0978095531463623, + "learning_rate": 0.000656922812053365, + "loss": 0.0147, + "num_input_tokens_seen": 657328, + "step": 1450 + }, + { + "epoch": 9.208860759493671, + "grad_norm": 0.00433026347309351, + "learning_rate": 0.0006542983673319606, + "loss": 0.0103, + "num_input_tokens_seen": 659632, + "step": 1455 + }, + { + "epoch": 9.240506329113924, + "grad_norm": 0.16300617158412933, + "learning_rate": 0.0006516692156474243, + "loss": 0.0056, + "num_input_tokens_seen": 662000, + "step": 1460 + }, + { + "epoch": 9.272151898734178, + "grad_norm": 1.0091853141784668, + "learning_rate": 0.0006490354372035864, + "loss": 0.0252, + "num_input_tokens_seen": 664208, + "step": 1465 + }, + { + "epoch": 9.30379746835443, + "grad_norm": 0.1810048520565033, + "learning_rate": 0.000646397112345419, + "loss": 0.0103, + "num_input_tokens_seen": 666672, + "step": 1470 + }, + { + "epoch": 9.335443037974684, + "grad_norm": 0.003192617790773511, + "learning_rate": 0.0006437543215565858, + "loss": 0.0121, + "num_input_tokens_seen": 668944, + "step": 1475 + }, + { + "epoch": 9.367088607594937, + "grad_norm": 0.20482294261455536, + "learning_rate": 0.0006411071454569854, + "loss": 0.0235, + "num_input_tokens_seen": 671216, + "step": 1480 + }, + { + "epoch": 9.39873417721519, + "grad_norm": 0.029740244150161743, + "learning_rate": 0.0006384556648002934, + "loss": 0.0274, + "num_input_tokens_seen": 673424, + "step": 1485 + }, + { + "epoch": 9.430379746835444, + "grad_norm": 0.039655689150094986, + "learning_rate": 0.0006357999604714987, + "loss": 0.0049, + "num_input_tokens_seen": 675600, + "step": 1490 + }, + { + "epoch": 9.462025316455696, + "grad_norm": 0.14757280051708221, + "learning_rate": 0.0006331401134844356, + "loss": 0.0133, + "num_input_tokens_seen": 677872, + "step": 1495 + }, + { + "epoch": 9.49367088607595, + "grad_norm": 0.2317933589220047, + "learning_rate": 0.0006304762049793126, + "loss": 0.025, + "num_input_tokens_seen": 680080, + "step": 1500 + }, + { + "epoch": 9.525316455696203, + "grad_norm": 0.1407846361398697, + "learning_rate": 0.0006278083162202374, + "loss": 0.0117, + "num_input_tokens_seen": 682288, + "step": 1505 + }, + { + "epoch": 9.556962025316455, + "grad_norm": 0.12098275870084763, + "learning_rate": 0.0006251365285927373, + "loss": 0.0113, + "num_input_tokens_seen": 684656, + "step": 1510 + }, + { + "epoch": 9.58860759493671, + "grad_norm": 0.2326350212097168, + "learning_rate": 0.0006224609236012774, + "loss": 0.0107, + "num_input_tokens_seen": 686896, + "step": 1515 + }, + { + "epoch": 9.620253164556962, + "grad_norm": 0.21439184248447418, + "learning_rate": 0.0006197815828667734, + "loss": 0.0081, + "num_input_tokens_seen": 689104, + "step": 1520 + }, + { + "epoch": 9.651898734177216, + "grad_norm": 0.0005985921015962958, + "learning_rate": 0.0006170985881241021, + "loss": 0.0217, + "num_input_tokens_seen": 691376, + "step": 1525 + }, + { + "epoch": 9.683544303797468, + "grad_norm": 0.20147515833377838, + "learning_rate": 0.0006144120212196084, + "loss": 0.0131, + "num_input_tokens_seen": 693744, + "step": 1530 + }, + { + "epoch": 9.715189873417721, + "grad_norm": 0.0021810184698551893, + "learning_rate": 0.0006117219641086072, + "loss": 0.0036, + "num_input_tokens_seen": 695984, + "step": 1535 + }, + { + "epoch": 9.746835443037975, + "grad_norm": 0.2354806363582611, + "learning_rate": 0.0006090284988528853, + "loss": 0.0134, + "num_input_tokens_seen": 698160, + "step": 1540 + }, + { + "epoch": 9.778481012658228, + "grad_norm": 0.13318821787834167, + "learning_rate": 0.0006063317076181961, + "loss": 0.0128, + "num_input_tokens_seen": 700496, + "step": 1545 + }, + { + "epoch": 9.810126582278482, + "grad_norm": 0.09554003924131393, + "learning_rate": 0.0006036316726717545, + "loss": 0.0209, + "num_input_tokens_seen": 702736, + "step": 1550 + }, + { + "epoch": 9.841772151898734, + "grad_norm": 0.004904511384665966, + "learning_rate": 0.0006009284763797266, + "loss": 0.0279, + "num_input_tokens_seen": 705104, + "step": 1555 + }, + { + "epoch": 9.873417721518987, + "grad_norm": 0.43692004680633545, + "learning_rate": 0.0005982222012047172, + "loss": 0.0292, + "num_input_tokens_seen": 707376, + "step": 1560 + }, + { + "epoch": 9.905063291139241, + "grad_norm": 0.14077496528625488, + "learning_rate": 0.0005955129297032539, + "loss": 0.0073, + "num_input_tokens_seen": 709616, + "step": 1565 + }, + { + "epoch": 9.936708860759493, + "grad_norm": 0.0007441575289703906, + "learning_rate": 0.0005928007445232698, + "loss": 0.018, + "num_input_tokens_seen": 711888, + "step": 1570 + }, + { + "epoch": 9.968354430379748, + "grad_norm": 0.03075648471713066, + "learning_rate": 0.0005900857284015807, + "loss": 0.0441, + "num_input_tokens_seen": 714192, + "step": 1575 + }, + { + "epoch": 10.0, + "grad_norm": 0.11141970753669739, + "learning_rate": 0.0005873679641613625, + "loss": 0.036, + "num_input_tokens_seen": 716192, + "step": 1580 + }, + { + "epoch": 10.0, + "eval_loss": 0.19241616129875183, + "eval_runtime": 1.6364, + "eval_samples_per_second": 42.778, + "eval_steps_per_second": 11.0, + "num_input_tokens_seen": 716192, + "step": 1580 + }, + { + "epoch": 10.031645569620252, + "grad_norm": 0.2818821966648102, + "learning_rate": 0.000584647534709624, + "loss": 0.0156, + "num_input_tokens_seen": 718496, + "step": 1585 + }, + { + "epoch": 10.063291139240507, + "grad_norm": 0.1254635751247406, + "learning_rate": 0.0005819245230346772, + "loss": 0.0169, + "num_input_tokens_seen": 720800, + "step": 1590 + }, + { + "epoch": 10.094936708860759, + "grad_norm": 0.04304712638258934, + "learning_rate": 0.0005791990122036075, + "loss": 0.003, + "num_input_tokens_seen": 722944, + "step": 1595 + }, + { + "epoch": 10.126582278481013, + "grad_norm": 0.03468741476535797, + "learning_rate": 0.0005764710853597376, + "loss": 0.0033, + "num_input_tokens_seen": 725088, + "step": 1600 + }, + { + "epoch": 10.158227848101266, + "grad_norm": 0.1091819480061531, + "learning_rate": 0.0005737408257200926, + "loss": 0.0143, + "num_input_tokens_seen": 727232, + "step": 1605 + }, + { + "epoch": 10.189873417721518, + "grad_norm": 0.09041888266801834, + "learning_rate": 0.0005710083165728604, + "loss": 0.0091, + "num_input_tokens_seen": 729344, + "step": 1610 + }, + { + "epoch": 10.221518987341772, + "grad_norm": 0.004345647990703583, + "learning_rate": 0.000568273641274852, + "loss": 0.0096, + "num_input_tokens_seen": 731616, + "step": 1615 + }, + { + "epoch": 10.253164556962025, + "grad_norm": 0.13551408052444458, + "learning_rate": 0.0005655368832489584, + "loss": 0.0078, + "num_input_tokens_seen": 734016, + "step": 1620 + }, + { + "epoch": 10.284810126582279, + "grad_norm": 0.049766648560762405, + "learning_rate": 0.000562798125981604, + "loss": 0.0191, + "num_input_tokens_seen": 736320, + "step": 1625 + }, + { + "epoch": 10.316455696202532, + "grad_norm": 0.01648896187543869, + "learning_rate": 0.0005600574530202029, + "loss": 0.0043, + "num_input_tokens_seen": 738528, + "step": 1630 + }, + { + "epoch": 10.348101265822784, + "grad_norm": 0.27137458324432373, + "learning_rate": 0.0005573149479706079, + "loss": 0.0122, + "num_input_tokens_seen": 740768, + "step": 1635 + }, + { + "epoch": 10.379746835443038, + "grad_norm": 0.2284439355134964, + "learning_rate": 0.0005545706944945606, + "loss": 0.0218, + "num_input_tokens_seen": 743072, + "step": 1640 + }, + { + "epoch": 10.41139240506329, + "grad_norm": 0.3823016583919525, + "learning_rate": 0.0005518247763071391, + "loss": 0.0181, + "num_input_tokens_seen": 745312, + "step": 1645 + }, + { + "epoch": 10.443037974683545, + "grad_norm": 0.24628500640392303, + "learning_rate": 0.0005490772771742055, + "loss": 0.0163, + "num_input_tokens_seen": 747488, + "step": 1650 + }, + { + "epoch": 10.474683544303797, + "grad_norm": 0.06299833953380585, + "learning_rate": 0.0005463282809098489, + "loss": 0.0114, + "num_input_tokens_seen": 749760, + "step": 1655 + }, + { + "epoch": 10.50632911392405, + "grad_norm": 0.10333729535341263, + "learning_rate": 0.0005435778713738292, + "loss": 0.0117, + "num_input_tokens_seen": 752032, + "step": 1660 + }, + { + "epoch": 10.537974683544304, + "grad_norm": 0.10650386661291122, + "learning_rate": 0.0005408261324690192, + "loss": 0.0083, + "num_input_tokens_seen": 754336, + "step": 1665 + }, + { + "epoch": 10.569620253164556, + "grad_norm": 0.01503800880163908, + "learning_rate": 0.0005380731481388447, + "loss": 0.0063, + "num_input_tokens_seen": 756576, + "step": 1670 + }, + { + "epoch": 10.60126582278481, + "grad_norm": 0.009724380448460579, + "learning_rate": 0.0005353190023647248, + "loss": 0.0156, + "num_input_tokens_seen": 758816, + "step": 1675 + }, + { + "epoch": 10.632911392405063, + "grad_norm": 0.32821592688560486, + "learning_rate": 0.0005325637791635076, + "loss": 0.0122, + "num_input_tokens_seen": 761056, + "step": 1680 + }, + { + "epoch": 10.664556962025316, + "grad_norm": 0.04828578606247902, + "learning_rate": 0.00052980756258491, + "loss": 0.0037, + "num_input_tokens_seen": 763200, + "step": 1685 + }, + { + "epoch": 10.69620253164557, + "grad_norm": 0.015509342774748802, + "learning_rate": 0.0005270504367089519, + "loss": 0.0178, + "num_input_tokens_seen": 765344, + "step": 1690 + }, + { + "epoch": 10.727848101265822, + "grad_norm": 0.0019919374026358128, + "learning_rate": 0.0005242924856433921, + "loss": 0.0036, + "num_input_tokens_seen": 767552, + "step": 1695 + }, + { + "epoch": 10.759493670886076, + "grad_norm": 0.1297902762889862, + "learning_rate": 0.0005215337935211622, + "loss": 0.0052, + "num_input_tokens_seen": 769920, + "step": 1700 + }, + { + "epoch": 10.791139240506329, + "grad_norm": 0.024680687114596367, + "learning_rate": 0.0005187744444978001, + "loss": 0.0033, + "num_input_tokens_seen": 772224, + "step": 1705 + }, + { + "epoch": 10.822784810126583, + "grad_norm": 0.026743615046143532, + "learning_rate": 0.0005160145227488831, + "loss": 0.0032, + "num_input_tokens_seen": 774592, + "step": 1710 + }, + { + "epoch": 10.854430379746836, + "grad_norm": 0.04600111022591591, + "learning_rate": 0.0005132541124674594, + "loss": 0.002, + "num_input_tokens_seen": 776864, + "step": 1715 + }, + { + "epoch": 10.886075949367088, + "grad_norm": 0.02488797903060913, + "learning_rate": 0.0005104932978614806, + "loss": 0.0019, + "num_input_tokens_seen": 779168, + "step": 1720 + }, + { + "epoch": 10.917721518987342, + "grad_norm": 0.0002597300917841494, + "learning_rate": 0.0005077321631512322, + "loss": 0.0076, + "num_input_tokens_seen": 781376, + "step": 1725 + }, + { + "epoch": 10.949367088607595, + "grad_norm": 0.17480707168579102, + "learning_rate": 0.0005049707925667649, + "loss": 0.0042, + "num_input_tokens_seen": 783744, + "step": 1730 + }, + { + "epoch": 10.981012658227849, + "grad_norm": 0.0045512220822274685, + "learning_rate": 0.0005022092703453246, + "loss": 0.0096, + "num_input_tokens_seen": 786016, + "step": 1735 + }, + { + "epoch": 11.0, + "eval_loss": 0.23822355270385742, + "eval_runtime": 1.63, + "eval_samples_per_second": 42.945, + "eval_steps_per_second": 11.043, + "num_input_tokens_seen": 787200, + "step": 1738 + }, + { + "epoch": 11.012658227848101, + "grad_norm": 0.008017591200768948, + "learning_rate": 0.0004994476807287834, + "loss": 0.0193, + "num_input_tokens_seen": 788128, + "step": 1740 + }, + { + "epoch": 11.044303797468354, + "grad_norm": 0.07010766863822937, + "learning_rate": 0.0004966861079610688, + "loss": 0.0093, + "num_input_tokens_seen": 790432, + "step": 1745 + }, + { + "epoch": 11.075949367088608, + "grad_norm": 0.13294902443885803, + "learning_rate": 0.0004939246362855944, + "loss": 0.0018, + "num_input_tokens_seen": 792672, + "step": 1750 + }, + { + "epoch": 11.10759493670886, + "grad_norm": 0.2742575407028198, + "learning_rate": 0.0004911633499426907, + "loss": 0.0105, + "num_input_tokens_seen": 795008, + "step": 1755 + }, + { + "epoch": 11.139240506329115, + "grad_norm": 0.020827749744057655, + "learning_rate": 0.0004884023331670334, + "loss": 0.0087, + "num_input_tokens_seen": 797344, + "step": 1760 + }, + { + "epoch": 11.170886075949367, + "grad_norm": 0.27521681785583496, + "learning_rate": 0.00048564167018507544, + "loss": 0.0048, + "num_input_tokens_seen": 799776, + "step": 1765 + }, + { + "epoch": 11.20253164556962, + "grad_norm": 0.3762897551059723, + "learning_rate": 0.0004828814452124773, + "loss": 0.0109, + "num_input_tokens_seen": 801952, + "step": 1770 + }, + { + "epoch": 11.234177215189874, + "grad_norm": 0.016188887879252434, + "learning_rate": 0.0004801217424515373, + "loss": 0.0033, + "num_input_tokens_seen": 804224, + "step": 1775 + }, + { + "epoch": 11.265822784810126, + "grad_norm": 0.4677315056324005, + "learning_rate": 0.0004773626460886234, + "loss": 0.0183, + "num_input_tokens_seen": 806496, + "step": 1780 + }, + { + "epoch": 11.29746835443038, + "grad_norm": 0.0292410496622324, + "learning_rate": 0.00047460424029160546, + "loss": 0.0069, + "num_input_tokens_seen": 808672, + "step": 1785 + }, + { + "epoch": 11.329113924050633, + "grad_norm": 0.0002641333267092705, + "learning_rate": 0.0004718466092072868, + "loss": 0.0005, + "num_input_tokens_seen": 811008, + "step": 1790 + }, + { + "epoch": 11.360759493670885, + "grad_norm": 0.1780717968940735, + "learning_rate": 0.0004690898369588377, + "loss": 0.0203, + "num_input_tokens_seen": 813344, + "step": 1795 + }, + { + "epoch": 11.39240506329114, + "grad_norm": 0.017060041427612305, + "learning_rate": 0.0004663340076432295, + "loss": 0.0078, + "num_input_tokens_seen": 815648, + "step": 1800 + }, + { + "epoch": 11.424050632911392, + "grad_norm": 0.2558180093765259, + "learning_rate": 0.0004635792053286682, + "loss": 0.0119, + "num_input_tokens_seen": 817888, + "step": 1805 + }, + { + "epoch": 11.455696202531646, + "grad_norm": 0.009733879007399082, + "learning_rate": 0.00046082551405203135, + "loss": 0.0004, + "num_input_tokens_seen": 820128, + "step": 1810 + }, + { + "epoch": 11.487341772151899, + "grad_norm": 0.007143679074943066, + "learning_rate": 0.0004580730178163028, + "loss": 0.002, + "num_input_tokens_seen": 822368, + "step": 1815 + }, + { + "epoch": 11.518987341772151, + "grad_norm": 0.0018589214887470007, + "learning_rate": 0.0004553218005880114, + "loss": 0.0085, + "num_input_tokens_seen": 824576, + "step": 1820 + }, + { + "epoch": 11.550632911392405, + "grad_norm": 0.17903174459934235, + "learning_rate": 0.00045257194629466917, + "loss": 0.0048, + "num_input_tokens_seen": 826880, + "step": 1825 + }, + { + "epoch": 11.582278481012658, + "grad_norm": 0.2393081933259964, + "learning_rate": 0.00044982353882221083, + "loss": 0.0126, + "num_input_tokens_seen": 829120, + "step": 1830 + }, + { + "epoch": 11.613924050632912, + "grad_norm": 0.005285333376377821, + "learning_rate": 0.0004470766620124349, + "loss": 0.0082, + "num_input_tokens_seen": 831424, + "step": 1835 + }, + { + "epoch": 11.645569620253164, + "grad_norm": 0.00949358195066452, + "learning_rate": 0.00044433139966044646, + "loss": 0.0108, + "num_input_tokens_seen": 833728, + "step": 1840 + }, + { + "epoch": 11.677215189873417, + "grad_norm": 0.032899949699640274, + "learning_rate": 0.0004415878355121002, + "loss": 0.0018, + "num_input_tokens_seen": 835936, + "step": 1845 + }, + { + "epoch": 11.708860759493671, + "grad_norm": 0.26240622997283936, + "learning_rate": 0.00043884605326144615, + "loss": 0.0188, + "num_input_tokens_seen": 838176, + "step": 1850 + }, + { + "epoch": 11.740506329113924, + "grad_norm": 0.41593241691589355, + "learning_rate": 0.0004361061365481771, + "loss": 0.0045, + "num_input_tokens_seen": 840480, + "step": 1855 + }, + { + "epoch": 11.772151898734178, + "grad_norm": 0.30750226974487305, + "learning_rate": 0.0004333681689550756, + "loss": 0.0153, + "num_input_tokens_seen": 842848, + "step": 1860 + }, + { + "epoch": 11.80379746835443, + "grad_norm": 0.03316473588347435, + "learning_rate": 0.0004306322340054659, + "loss": 0.0011, + "num_input_tokens_seen": 844992, + "step": 1865 + }, + { + "epoch": 11.835443037974684, + "grad_norm": 0.0011924748541787267, + "learning_rate": 0.0004278984151606648, + "loss": 0.0059, + "num_input_tokens_seen": 847328, + "step": 1870 + }, + { + "epoch": 11.867088607594937, + "grad_norm": 0.11064691841602325, + "learning_rate": 0.0004251667958174361, + "loss": 0.0037, + "num_input_tokens_seen": 849536, + "step": 1875 + }, + { + "epoch": 11.89873417721519, + "grad_norm": 0.0013144888216629624, + "learning_rate": 0.00042243745930544677, + "loss": 0.0042, + "num_input_tokens_seen": 851712, + "step": 1880 + }, + { + "epoch": 11.930379746835444, + "grad_norm": 0.0031672825571149588, + "learning_rate": 0.00041971048888472443, + "loss": 0.0292, + "num_input_tokens_seen": 853984, + "step": 1885 + }, + { + "epoch": 11.962025316455696, + "grad_norm": 0.21116450428962708, + "learning_rate": 0.00041698596774311755, + "loss": 0.0107, + "num_input_tokens_seen": 856224, + "step": 1890 + }, + { + "epoch": 11.99367088607595, + "grad_norm": 0.08107837289571762, + "learning_rate": 0.0004142639789937585, + "loss": 0.0017, + "num_input_tokens_seen": 858528, + "step": 1895 + }, + { + "epoch": 12.0, + "eval_loss": 0.22486339509487152, + "eval_runtime": 1.6346, + "eval_samples_per_second": 42.824, + "eval_steps_per_second": 11.012, + "num_input_tokens_seen": 858736, + "step": 1896 + }, + { + "epoch": 12.025316455696203, + "grad_norm": 0.22082574665546417, + "learning_rate": 0.00041154460567252696, + "loss": 0.0024, + "num_input_tokens_seen": 860592, + "step": 1900 + }, + { + "epoch": 12.056962025316455, + "grad_norm": 0.028646036982536316, + "learning_rate": 0.0004088279307355173, + "loss": 0.0006, + "num_input_tokens_seen": 862800, + "step": 1905 + }, + { + "epoch": 12.08860759493671, + "grad_norm": 0.0015675805043429136, + "learning_rate": 0.0004061140370565088, + "loss": 0.0003, + "num_input_tokens_seen": 865104, + "step": 1910 + }, + { + "epoch": 12.120253164556962, + "grad_norm": 0.009807409718632698, + "learning_rate": 0.0004034030074244361, + "loss": 0.0012, + "num_input_tokens_seen": 867408, + "step": 1915 + }, + { + "epoch": 12.151898734177216, + "grad_norm": 0.8419751524925232, + "learning_rate": 0.00040069492454086465, + "loss": 0.016, + "num_input_tokens_seen": 869680, + "step": 1920 + }, + { + "epoch": 12.183544303797468, + "grad_norm": 0.17428022623062134, + "learning_rate": 0.00039798987101746775, + "loss": 0.015, + "num_input_tokens_seen": 871856, + "step": 1925 + }, + { + "epoch": 12.215189873417721, + "grad_norm": 0.14627766609191895, + "learning_rate": 0.00039528792937350586, + "loss": 0.0176, + "num_input_tokens_seen": 874192, + "step": 1930 + }, + { + "epoch": 12.246835443037975, + "grad_norm": 0.014510978944599628, + "learning_rate": 0.0003925891820333104, + "loss": 0.0007, + "num_input_tokens_seen": 876432, + "step": 1935 + }, + { + "epoch": 12.278481012658228, + "grad_norm": 0.49219152331352234, + "learning_rate": 0.00038989371132376805, + "loss": 0.0058, + "num_input_tokens_seen": 878672, + "step": 1940 + }, + { + "epoch": 12.310126582278482, + "grad_norm": 0.241677388548851, + "learning_rate": 0.00038720159947180997, + "loss": 0.0064, + "num_input_tokens_seen": 880976, + "step": 1945 + }, + { + "epoch": 12.341772151898734, + "grad_norm": 0.0009249493596144021, + "learning_rate": 0.00038451292860190386, + "loss": 0.0092, + "num_input_tokens_seen": 883280, + "step": 1950 + }, + { + "epoch": 12.373417721518987, + "grad_norm": 0.0020899928640574217, + "learning_rate": 0.00038182778073354764, + "loss": 0.0129, + "num_input_tokens_seen": 885488, + "step": 1955 + }, + { + "epoch": 12.405063291139241, + "grad_norm": 0.0769369974732399, + "learning_rate": 0.0003791462377787682, + "loss": 0.0248, + "num_input_tokens_seen": 887760, + "step": 1960 + }, + { + "epoch": 12.436708860759493, + "grad_norm": 0.020182520151138306, + "learning_rate": 0.0003764683815396226, + "loss": 0.0017, + "num_input_tokens_seen": 889968, + "step": 1965 + }, + { + "epoch": 12.468354430379748, + "grad_norm": 0.07283952087163925, + "learning_rate": 0.00037379429370570233, + "loss": 0.0068, + "num_input_tokens_seen": 892336, + "step": 1970 + }, + { + "epoch": 12.5, + "grad_norm": 0.11385155469179153, + "learning_rate": 0.0003711240558516411, + "loss": 0.0025, + "num_input_tokens_seen": 894608, + "step": 1975 + }, + { + "epoch": 12.531645569620252, + "grad_norm": 0.12106140702962875, + "learning_rate": 0.0003684577494346275, + "loss": 0.0075, + "num_input_tokens_seen": 896944, + "step": 1980 + }, + { + "epoch": 12.563291139240507, + "grad_norm": 0.19506441056728363, + "learning_rate": 0.0003657954557919183, + "loss": 0.0106, + "num_input_tokens_seen": 899120, + "step": 1985 + }, + { + "epoch": 12.594936708860759, + "grad_norm": 0.006248318590223789, + "learning_rate": 0.00036313725613835914, + "loss": 0.0007, + "num_input_tokens_seen": 901456, + "step": 1990 + }, + { + "epoch": 12.626582278481013, + "grad_norm": 0.0065716831013560295, + "learning_rate": 0.0003604832315639056, + "loss": 0.0015, + "num_input_tokens_seen": 903696, + "step": 1995 + }, + { + "epoch": 12.658227848101266, + "grad_norm": 0.2696636915206909, + "learning_rate": 0.00035783346303114983, + "loss": 0.0084, + "num_input_tokens_seen": 906000, + "step": 2000 + }, + { + "epoch": 12.689873417721518, + "grad_norm": 0.004048376809805632, + "learning_rate": 0.0003551880313728515, + "loss": 0.0042, + "num_input_tokens_seen": 908304, + "step": 2005 + }, + { + "epoch": 12.721518987341772, + "grad_norm": 0.029864784330129623, + "learning_rate": 0.0003525470172894709, + "loss": 0.0018, + "num_input_tokens_seen": 910448, + "step": 2010 + }, + { + "epoch": 12.753164556962025, + "grad_norm": 0.00526443449780345, + "learning_rate": 0.00034991050134670736, + "loss": 0.0028, + "num_input_tokens_seen": 912816, + "step": 2015 + }, + { + "epoch": 12.784810126582279, + "grad_norm": 0.05799572914838791, + "learning_rate": 0.0003472785639730428, + "loss": 0.0022, + "num_input_tokens_seen": 915056, + "step": 2020 + }, + { + "epoch": 12.816455696202532, + "grad_norm": 0.05574914440512657, + "learning_rate": 0.00034465128545728586, + "loss": 0.0028, + "num_input_tokens_seen": 917264, + "step": 2025 + }, + { + "epoch": 12.848101265822784, + "grad_norm": 0.008765865117311478, + "learning_rate": 0.00034202874594612467, + "loss": 0.004, + "num_input_tokens_seen": 919504, + "step": 2030 + }, + { + "epoch": 12.879746835443038, + "grad_norm": 0.04581623524427414, + "learning_rate": 0.00033941102544168133, + "loss": 0.0008, + "num_input_tokens_seen": 921712, + "step": 2035 + }, + { + "epoch": 12.91139240506329, + "grad_norm": 0.0002680871111806482, + "learning_rate": 0.000336798203799071, + "loss": 0.0035, + "num_input_tokens_seen": 924112, + "step": 2040 + }, + { + "epoch": 12.943037974683545, + "grad_norm": 0.01643231138586998, + "learning_rate": 0.00033419036072396616, + "loss": 0.0015, + "num_input_tokens_seen": 926320, + "step": 2045 + }, + { + "epoch": 12.974683544303797, + "grad_norm": 0.034664545208215714, + "learning_rate": 0.0003315875757701653, + "loss": 0.001, + "num_input_tokens_seen": 928624, + "step": 2050 + }, + { + "epoch": 13.0, + "eval_loss": 0.22669243812561035, + "eval_runtime": 1.6293, + "eval_samples_per_second": 42.963, + "eval_steps_per_second": 11.048, + "num_input_tokens_seen": 930160, + "step": 2054 + }, + { + "epoch": 13.00632911392405, + "grad_norm": 0.0022140443325042725, + "learning_rate": 0.0003289899283371657, + "loss": 0.0054, + "num_input_tokens_seen": 930576, + "step": 2055 + }, + { + "epoch": 13.037974683544304, + "grad_norm": 0.07002821564674377, + "learning_rate": 0.00032639749766774173, + "loss": 0.0034, + "num_input_tokens_seen": 932944, + "step": 2060 + }, + { + "epoch": 13.069620253164556, + "grad_norm": 0.003276290837675333, + "learning_rate": 0.00032381036284552734, + "loss": 0.0005, + "num_input_tokens_seen": 935152, + "step": 2065 + }, + { + "epoch": 13.10126582278481, + "grad_norm": 0.004695270210504532, + "learning_rate": 0.00032122860279260306, + "loss": 0.002, + "num_input_tokens_seen": 937456, + "step": 2070 + }, + { + "epoch": 13.132911392405063, + "grad_norm": 0.0033169821836054325, + "learning_rate": 0.0003186522962670897, + "loss": 0.0006, + "num_input_tokens_seen": 939760, + "step": 2075 + }, + { + "epoch": 13.164556962025316, + "grad_norm": 0.004817298613488674, + "learning_rate": 0.00031608152186074425, + "loss": 0.0002, + "num_input_tokens_seen": 942128, + "step": 2080 + }, + { + "epoch": 13.19620253164557, + "grad_norm": 0.0003279719385318458, + "learning_rate": 0.0003135163579965633, + "loss": 0.002, + "num_input_tokens_seen": 944368, + "step": 2085 + }, + { + "epoch": 13.227848101265822, + "grad_norm": 0.0006703323451802135, + "learning_rate": 0.0003109568829263909, + "loss": 0.0056, + "num_input_tokens_seen": 946672, + "step": 2090 + }, + { + "epoch": 13.259493670886076, + "grad_norm": 0.08480872213840485, + "learning_rate": 0.00030840317472853075, + "loss": 0.0016, + "num_input_tokens_seen": 949008, + "step": 2095 + }, + { + "epoch": 13.291139240506329, + "grad_norm": 2.3488152027130127, + "learning_rate": 0.00030585531130536447, + "loss": 0.007, + "num_input_tokens_seen": 951312, + "step": 2100 + }, + { + "epoch": 13.322784810126583, + "grad_norm": 0.007569640409201384, + "learning_rate": 0.00030331337038097593, + "loss": 0.0008, + "num_input_tokens_seen": 953552, + "step": 2105 + }, + { + "epoch": 13.354430379746836, + "grad_norm": 0.02700829692184925, + "learning_rate": 0.00030077742949877905, + "loss": 0.0004, + "num_input_tokens_seen": 955792, + "step": 2110 + }, + { + "epoch": 13.386075949367088, + "grad_norm": 0.01100717019289732, + "learning_rate": 0.0002982475660191536, + "loss": 0.0017, + "num_input_tokens_seen": 958000, + "step": 2115 + }, + { + "epoch": 13.417721518987342, + "grad_norm": 0.051008135080337524, + "learning_rate": 0.0002957238571170841, + "loss": 0.0019, + "num_input_tokens_seen": 960304, + "step": 2120 + }, + { + "epoch": 13.449367088607595, + "grad_norm": 0.010950152762234211, + "learning_rate": 0.0002932063797798059, + "loss": 0.0008, + "num_input_tokens_seen": 962448, + "step": 2125 + }, + { + "epoch": 13.481012658227849, + "grad_norm": 0.005450937431305647, + "learning_rate": 0.0002906952108044574, + "loss": 0.0017, + "num_input_tokens_seen": 964688, + "step": 2130 + }, + { + "epoch": 13.512658227848101, + "grad_norm": 0.007405989803373814, + "learning_rate": 0.00028819042679573614, + "loss": 0.0053, + "num_input_tokens_seen": 966960, + "step": 2135 + }, + { + "epoch": 13.544303797468354, + "grad_norm": 0.0005706689553335309, + "learning_rate": 0.00028569210416356215, + "loss": 0.0012, + "num_input_tokens_seen": 969296, + "step": 2140 + }, + { + "epoch": 13.575949367088608, + "grad_norm": 0.0037800113204866648, + "learning_rate": 0.0002832003191207487, + "loss": 0.0012, + "num_input_tokens_seen": 971600, + "step": 2145 + }, + { + "epoch": 13.60759493670886, + "grad_norm": 0.006751175969839096, + "learning_rate": 0.00028071514768067445, + "loss": 0.0055, + "num_input_tokens_seen": 973904, + "step": 2150 + }, + { + "epoch": 13.639240506329115, + "grad_norm": 0.022593379020690918, + "learning_rate": 0.00027823666565496714, + "loss": 0.0013, + "num_input_tokens_seen": 976176, + "step": 2155 + }, + { + "epoch": 13.670886075949367, + "grad_norm": 0.00210123835131526, + "learning_rate": 0.00027576494865118984, + "loss": 0.0041, + "num_input_tokens_seen": 978480, + "step": 2160 + }, + { + "epoch": 13.70253164556962, + "grad_norm": 0.007202755194157362, + "learning_rate": 0.0002733000720705341, + "loss": 0.0053, + "num_input_tokens_seen": 980752, + "step": 2165 + }, + { + "epoch": 13.734177215189874, + "grad_norm": 0.01162320002913475, + "learning_rate": 0.0002708421111055209, + "loss": 0.0004, + "num_input_tokens_seen": 983056, + "step": 2170 + }, + { + "epoch": 13.765822784810126, + "grad_norm": 0.0022520236670970917, + "learning_rate": 0.0002683911407377062, + "loss": 0.001, + "num_input_tokens_seen": 985232, + "step": 2175 + }, + { + "epoch": 13.79746835443038, + "grad_norm": 0.045108262449502945, + "learning_rate": 0.00026594723573539306, + "loss": 0.0006, + "num_input_tokens_seen": 987536, + "step": 2180 + }, + { + "epoch": 13.829113924050633, + "grad_norm": 0.008652398362755775, + "learning_rate": 0.00026351047065135237, + "loss": 0.0022, + "num_input_tokens_seen": 989744, + "step": 2185 + }, + { + "epoch": 13.860759493670885, + "grad_norm": 0.12414415925741196, + "learning_rate": 0.0002610809198205466, + "loss": 0.0021, + "num_input_tokens_seen": 991952, + "step": 2190 + }, + { + "epoch": 13.89240506329114, + "grad_norm": 0.06359447538852692, + "learning_rate": 0.00025865865735786377, + "loss": 0.0027, + "num_input_tokens_seen": 994224, + "step": 2195 + }, + { + "epoch": 13.924050632911392, + "grad_norm": 0.038647353649139404, + "learning_rate": 0.0002562437571558558, + "loss": 0.0022, + "num_input_tokens_seen": 996528, + "step": 2200 + }, + { + "epoch": 13.955696202531646, + "grad_norm": 0.03827156871557236, + "learning_rate": 0.00025383629288248423, + "loss": 0.0005, + "num_input_tokens_seen": 998800, + "step": 2205 + }, + { + "epoch": 13.987341772151899, + "grad_norm": 0.002246325137093663, + "learning_rate": 0.0002514363379788733, + "loss": 0.0012, + "num_input_tokens_seen": 1001104, + "step": 2210 + }, + { + "epoch": 14.0, + "eval_loss": 0.24711216986179352, + "eval_runtime": 1.6279, + "eval_samples_per_second": 43.001, + "eval_steps_per_second": 11.057, + "num_input_tokens_seen": 1001792, + "step": 2212 + }, + { + "epoch": 14.018987341772151, + "grad_norm": 0.00921409297734499, + "learning_rate": 0.00024904396565707, + "loss": 0.0008, + "num_input_tokens_seen": 1003168, + "step": 2215 + }, + { + "epoch": 14.050632911392405, + "grad_norm": 0.00028061773627996445, + "learning_rate": 0.00024665924889780914, + "loss": 0.0005, + "num_input_tokens_seen": 1005472, + "step": 2220 + }, + { + "epoch": 14.082278481012658, + "grad_norm": 0.00020452811440918595, + "learning_rate": 0.00024428226044828893, + "loss": 0.0045, + "num_input_tokens_seen": 1007776, + "step": 2225 + }, + { + "epoch": 14.113924050632912, + "grad_norm": 0.01197787281125784, + "learning_rate": 0.00024191307281995058, + "loss": 0.0007, + "num_input_tokens_seen": 1010048, + "step": 2230 + }, + { + "epoch": 14.145569620253164, + "grad_norm": 0.00035051812301389873, + "learning_rate": 0.00023955175828626658, + "loss": 0.0005, + "num_input_tokens_seen": 1012256, + "step": 2235 + }, + { + "epoch": 14.177215189873417, + "grad_norm": 0.04289071634411812, + "learning_rate": 0.00023719838888053635, + "loss": 0.0003, + "num_input_tokens_seen": 1014592, + "step": 2240 + }, + { + "epoch": 14.208860759493671, + "grad_norm": 0.0009035664843395352, + "learning_rate": 0.00023485303639368782, + "loss": 0.0033, + "num_input_tokens_seen": 1016928, + "step": 2245 + }, + { + "epoch": 14.240506329113924, + "grad_norm": 0.0009375275694765151, + "learning_rate": 0.00023251577237208866, + "loss": 0.0004, + "num_input_tokens_seen": 1019232, + "step": 2250 + }, + { + "epoch": 14.272151898734178, + "grad_norm": 0.054324060678482056, + "learning_rate": 0.00023018666811536304, + "loss": 0.0009, + "num_input_tokens_seen": 1021696, + "step": 2255 + }, + { + "epoch": 14.30379746835443, + "grad_norm": 0.0071850186213850975, + "learning_rate": 0.00022786579467421614, + "loss": 0.0017, + "num_input_tokens_seen": 1023808, + "step": 2260 + }, + { + "epoch": 14.335443037974684, + "grad_norm": 0.006104636006057262, + "learning_rate": 0.00022555322284826783, + "loss": 0.0009, + "num_input_tokens_seen": 1026080, + "step": 2265 + }, + { + "epoch": 14.367088607594937, + "grad_norm": 0.01361861266195774, + "learning_rate": 0.0002232490231838923, + "loss": 0.0008, + "num_input_tokens_seen": 1028256, + "step": 2270 + }, + { + "epoch": 14.39873417721519, + "grad_norm": 0.0193213801831007, + "learning_rate": 0.00022095326597206528, + "loss": 0.0004, + "num_input_tokens_seen": 1030560, + "step": 2275 + }, + { + "epoch": 14.430379746835444, + "grad_norm": 0.00014660239685326815, + "learning_rate": 0.00021866602124622132, + "loss": 0.0005, + "num_input_tokens_seen": 1032832, + "step": 2280 + }, + { + "epoch": 14.462025316455696, + "grad_norm": 0.020347679033875465, + "learning_rate": 0.000216387358780116, + "loss": 0.0004, + "num_input_tokens_seen": 1035104, + "step": 2285 + }, + { + "epoch": 14.49367088607595, + "grad_norm": 0.0009191425633616745, + "learning_rate": 0.00021411734808569855, + "loss": 0.0005, + "num_input_tokens_seen": 1037280, + "step": 2290 + }, + { + "epoch": 14.525316455696203, + "grad_norm": 0.0035052187740802765, + "learning_rate": 0.00021185605841098987, + "loss": 0.0005, + "num_input_tokens_seen": 1039488, + "step": 2295 + }, + { + "epoch": 14.556962025316455, + "grad_norm": 0.0022822876926511526, + "learning_rate": 0.00020960355873797176, + "loss": 0.0001, + "num_input_tokens_seen": 1041792, + "step": 2300 + }, + { + "epoch": 14.58860759493671, + "grad_norm": 0.015031488612294197, + "learning_rate": 0.00020735991778048158, + "loss": 0.0008, + "num_input_tokens_seen": 1044064, + "step": 2305 + }, + { + "epoch": 14.620253164556962, + "grad_norm": 0.0009929704247042537, + "learning_rate": 0.00020512520398211647, + "loss": 0.0005, + "num_input_tokens_seen": 1046432, + "step": 2310 + }, + { + "epoch": 14.651898734177216, + "grad_norm": 0.03135538101196289, + "learning_rate": 0.00020289948551414482, + "loss": 0.0004, + "num_input_tokens_seen": 1048736, + "step": 2315 + }, + { + "epoch": 14.683544303797468, + "grad_norm": 0.011596915312111378, + "learning_rate": 0.00020068283027342755, + "loss": 0.0004, + "num_input_tokens_seen": 1051040, + "step": 2320 + }, + { + "epoch": 14.715189873417721, + "grad_norm": 0.013896669261157513, + "learning_rate": 0.00019847530588034635, + "loss": 0.001, + "num_input_tokens_seen": 1053376, + "step": 2325 + }, + { + "epoch": 14.746835443037975, + "grad_norm": 0.00045874802162870765, + "learning_rate": 0.00019627697967674118, + "loss": 0.0009, + "num_input_tokens_seen": 1055584, + "step": 2330 + }, + { + "epoch": 14.778481012658228, + "grad_norm": 0.05592343583703041, + "learning_rate": 0.00019408791872385528, + "loss": 0.0011, + "num_input_tokens_seen": 1057920, + "step": 2335 + }, + { + "epoch": 14.810126582278482, + "grad_norm": 0.00030699645867571235, + "learning_rate": 0.00019190818980029047, + "loss": 0.0004, + "num_input_tokens_seen": 1060128, + "step": 2340 + }, + { + "epoch": 14.841772151898734, + "grad_norm": 0.0046336534433066845, + "learning_rate": 0.00018973785939996928, + "loss": 0.0002, + "num_input_tokens_seen": 1062400, + "step": 2345 + }, + { + "epoch": 14.873417721518987, + "grad_norm": 0.000329044705722481, + "learning_rate": 0.00018757699373010646, + "loss": 0.0007, + "num_input_tokens_seen": 1064640, + "step": 2350 + }, + { + "epoch": 14.905063291139241, + "grad_norm": 0.001433463767170906, + "learning_rate": 0.00018542565870918992, + "loss": 0.0029, + "num_input_tokens_seen": 1066784, + "step": 2355 + }, + { + "epoch": 14.936708860759493, + "grad_norm": 0.0028429583180695772, + "learning_rate": 0.0001832839199649694, + "loss": 0.0007, + "num_input_tokens_seen": 1068992, + "step": 2360 + }, + { + "epoch": 14.968354430379748, + "grad_norm": 0.009809029288589954, + "learning_rate": 0.00018115184283245438, + "loss": 0.0019, + "num_input_tokens_seen": 1071168, + "step": 2365 + }, + { + "epoch": 15.0, + "grad_norm": 0.05353161692619324, + "learning_rate": 0.0001790294923519215, + "loss": 0.0017, + "num_input_tokens_seen": 1073248, + "step": 2370 + }, + { + "epoch": 15.0, + "eval_loss": 0.2629498839378357, + "eval_runtime": 1.6293, + "eval_samples_per_second": 42.963, + "eval_steps_per_second": 11.048, + "num_input_tokens_seen": 1073248, + "step": 2370 + }, + { + "epoch": 15.031645569620252, + "grad_norm": 0.0013401020551100373, + "learning_rate": 0.00017691693326692942, + "loss": 0.0002, + "num_input_tokens_seen": 1075392, + "step": 2375 + }, + { + "epoch": 15.063291139240507, + "grad_norm": 0.0022331036161631346, + "learning_rate": 0.0001748142300223452, + "loss": 0.0001, + "num_input_tokens_seen": 1077632, + "step": 2380 + }, + { + "epoch": 15.094936708860759, + "grad_norm": 0.04992636665701866, + "learning_rate": 0.0001727214467623775, + "loss": 0.0013, + "num_input_tokens_seen": 1079808, + "step": 2385 + }, + { + "epoch": 15.126582278481013, + "grad_norm": 0.003744828049093485, + "learning_rate": 0.00017063864732861945, + "loss": 0.001, + "num_input_tokens_seen": 1082048, + "step": 2390 + }, + { + "epoch": 15.158227848101266, + "grad_norm": 0.009123779833316803, + "learning_rate": 0.0001685658952581023, + "loss": 0.0003, + "num_input_tokens_seen": 1084288, + "step": 2395 + }, + { + "epoch": 15.189873417721518, + "grad_norm": 0.00019574619363993406, + "learning_rate": 0.00016650325378135628, + "loss": 0.0002, + "num_input_tokens_seen": 1086624, + "step": 2400 + }, + { + "epoch": 15.221518987341772, + "grad_norm": 0.0076944525353610516, + "learning_rate": 0.00016445078582048156, + "loss": 0.0003, + "num_input_tokens_seen": 1088960, + "step": 2405 + }, + { + "epoch": 15.253164556962025, + "grad_norm": 0.007140711881220341, + "learning_rate": 0.0001624085539872302, + "loss": 0.0012, + "num_input_tokens_seen": 1091328, + "step": 2410 + }, + { + "epoch": 15.284810126582279, + "grad_norm": 0.001084710587747395, + "learning_rate": 0.00016037662058109413, + "loss": 0.0019, + "num_input_tokens_seen": 1093568, + "step": 2415 + }, + { + "epoch": 15.316455696202532, + "grad_norm": 0.007704330142587423, + "learning_rate": 0.00015835504758740577, + "loss": 0.0006, + "num_input_tokens_seen": 1095808, + "step": 2420 + }, + { + "epoch": 15.348101265822784, + "grad_norm": 0.0009865473257377744, + "learning_rate": 0.00015634389667544784, + "loss": 0.0006, + "num_input_tokens_seen": 1098176, + "step": 2425 + }, + { + "epoch": 15.379746835443038, + "grad_norm": 0.015731172636151314, + "learning_rate": 0.00015434322919657023, + "loss": 0.0006, + "num_input_tokens_seen": 1100544, + "step": 2430 + }, + { + "epoch": 15.41139240506329, + "grad_norm": 0.008956117555499077, + "learning_rate": 0.0001523531061823202, + "loss": 0.0003, + "num_input_tokens_seen": 1102880, + "step": 2435 + }, + { + "epoch": 15.443037974683545, + "grad_norm": 0.0002976666437461972, + "learning_rate": 0.00015037358834257963, + "loss": 0.0002, + "num_input_tokens_seen": 1105056, + "step": 2440 + }, + { + "epoch": 15.474683544303797, + "grad_norm": 0.0002089941845042631, + "learning_rate": 0.00014840473606371298, + "loss": 0.0007, + "num_input_tokens_seen": 1107296, + "step": 2445 + }, + { + "epoch": 15.50632911392405, + "grad_norm": 0.002252733102068305, + "learning_rate": 0.00014644660940672628, + "loss": 0.0004, + "num_input_tokens_seen": 1109536, + "step": 2450 + }, + { + "epoch": 15.537974683544304, + "grad_norm": 0.0030139400623738766, + "learning_rate": 0.00014449926810543328, + "loss": 0.0004, + "num_input_tokens_seen": 1111744, + "step": 2455 + }, + { + "epoch": 15.569620253164556, + "grad_norm": 0.0004334330733399838, + "learning_rate": 0.00014256277156463404, + "loss": 0.0004, + "num_input_tokens_seen": 1114208, + "step": 2460 + }, + { + "epoch": 15.60126582278481, + "grad_norm": 0.019599957391619682, + "learning_rate": 0.00014063717885830373, + "loss": 0.0005, + "num_input_tokens_seen": 1116448, + "step": 2465 + }, + { + "epoch": 15.632911392405063, + "grad_norm": 0.003064208198338747, + "learning_rate": 0.00013872254872778845, + "loss": 0.0001, + "num_input_tokens_seen": 1118624, + "step": 2470 + }, + { + "epoch": 15.664556962025316, + "grad_norm": 0.00022443223861046135, + "learning_rate": 0.00013681893958001517, + "loss": 0.0005, + "num_input_tokens_seen": 1120896, + "step": 2475 + }, + { + "epoch": 15.69620253164557, + "grad_norm": 0.00029300598544068635, + "learning_rate": 0.00013492640948570907, + "loss": 0.0002, + "num_input_tokens_seen": 1123264, + "step": 2480 + }, + { + "epoch": 15.727848101265822, + "grad_norm": 0.01933925226330757, + "learning_rate": 0.00013304501617762178, + "loss": 0.0005, + "num_input_tokens_seen": 1125504, + "step": 2485 + }, + { + "epoch": 15.759493670886076, + "grad_norm": 0.0003681584494188428, + "learning_rate": 0.00013117481704877165, + "loss": 0.0003, + "num_input_tokens_seen": 1127744, + "step": 2490 + }, + { + "epoch": 15.791139240506329, + "grad_norm": 0.00017387945263180882, + "learning_rate": 0.00012931586915069105, + "loss": 0.0003, + "num_input_tokens_seen": 1129920, + "step": 2495 + }, + { + "epoch": 15.822784810126583, + "grad_norm": 0.00025900316541083157, + "learning_rate": 0.00012746822919168694, + "loss": 0.0013, + "num_input_tokens_seen": 1132128, + "step": 2500 + }, + { + "epoch": 15.854430379746836, + "grad_norm": 0.0006923755281604826, + "learning_rate": 0.00012563195353511175, + "loss": 0.0002, + "num_input_tokens_seen": 1134400, + "step": 2505 + }, + { + "epoch": 15.886075949367088, + "grad_norm": 0.0005279296892695129, + "learning_rate": 0.00012380709819764219, + "loss": 0.0011, + "num_input_tokens_seen": 1136640, + "step": 2510 + }, + { + "epoch": 15.917721518987342, + "grad_norm": 0.0021485064644366503, + "learning_rate": 0.00012199371884757171, + "loss": 0.0007, + "num_input_tokens_seen": 1139008, + "step": 2515 + }, + { + "epoch": 15.949367088607595, + "grad_norm": 0.0023130758199840784, + "learning_rate": 0.00012019187080311228, + "loss": 0.0008, + "num_input_tokens_seen": 1141280, + "step": 2520 + }, + { + "epoch": 15.981012658227849, + "grad_norm": 0.0036742729134857655, + "learning_rate": 0.0001184016090307059, + "loss": 0.0002, + "num_input_tokens_seen": 1143552, + "step": 2525 + }, + { + "epoch": 16.0, + "eval_loss": 0.2718905806541443, + "eval_runtime": 1.6315, + "eval_samples_per_second": 42.905, + "eval_steps_per_second": 11.033, + "num_input_tokens_seen": 1144672, + "step": 2528 + }, + { + "epoch": 16.0126582278481, + "grad_norm": 0.004688389599323273, + "learning_rate": 0.00011662298814334899, + "loss": 0.0004, + "num_input_tokens_seen": 1145536, + "step": 2530 + }, + { + "epoch": 16.044303797468356, + "grad_norm": 0.0036812222097069025, + "learning_rate": 0.00011485606239892587, + "loss": 0.001, + "num_input_tokens_seen": 1147840, + "step": 2535 + }, + { + "epoch": 16.075949367088608, + "grad_norm": 0.007777748629450798, + "learning_rate": 0.00011310088569855315, + "loss": 0.0003, + "num_input_tokens_seen": 1149984, + "step": 2540 + }, + { + "epoch": 16.10759493670886, + "grad_norm": 0.00014890628517605364, + "learning_rate": 0.00011135751158493634, + "loss": 0.0001, + "num_input_tokens_seen": 1152192, + "step": 2545 + }, + { + "epoch": 16.139240506329113, + "grad_norm": 0.009418509900569916, + "learning_rate": 0.00010962599324073586, + "loss": 0.0005, + "num_input_tokens_seen": 1154592, + "step": 2550 + }, + { + "epoch": 16.170886075949365, + "grad_norm": 0.005742751061916351, + "learning_rate": 0.00010790638348694487, + "loss": 0.0012, + "num_input_tokens_seen": 1156800, + "step": 2555 + }, + { + "epoch": 16.20253164556962, + "grad_norm": 0.006581949070096016, + "learning_rate": 0.00010619873478127817, + "loss": 0.0002, + "num_input_tokens_seen": 1159104, + "step": 2560 + }, + { + "epoch": 16.234177215189874, + "grad_norm": 0.005423244088888168, + "learning_rate": 0.00010450309921657114, + "loss": 0.0005, + "num_input_tokens_seen": 1161312, + "step": 2565 + }, + { + "epoch": 16.265822784810126, + "grad_norm": 0.0019696063827723265, + "learning_rate": 0.00010281952851919158, + "loss": 0.0004, + "num_input_tokens_seen": 1163520, + "step": 2570 + }, + { + "epoch": 16.29746835443038, + "grad_norm": 0.00014235895650926977, + "learning_rate": 0.00010114807404746151, + "loss": 0.0003, + "num_input_tokens_seen": 1165792, + "step": 2575 + }, + { + "epoch": 16.32911392405063, + "grad_norm": 0.00781866442412138, + "learning_rate": 9.948878679008977e-05, + "loss": 0.0016, + "num_input_tokens_seen": 1168000, + "step": 2580 + }, + { + "epoch": 16.360759493670887, + "grad_norm": 0.0008007868891581893, + "learning_rate": 9.78417173646176e-05, + "loss": 0.0002, + "num_input_tokens_seen": 1170208, + "step": 2585 + }, + { + "epoch": 16.39240506329114, + "grad_norm": 0.00132509705144912, + "learning_rate": 9.620691601587384e-05, + "loss": 0.0003, + "num_input_tokens_seen": 1172544, + "step": 2590 + }, + { + "epoch": 16.424050632911392, + "grad_norm": 0.009305765852332115, + "learning_rate": 9.458443261444255e-05, + "loss": 0.0002, + "num_input_tokens_seen": 1174816, + "step": 2595 + }, + { + "epoch": 16.455696202531644, + "grad_norm": 0.012984943576157093, + "learning_rate": 9.297431665514111e-05, + "loss": 0.0005, + "num_input_tokens_seen": 1177056, + "step": 2600 + }, + { + "epoch": 16.4873417721519, + "grad_norm": 0.005006816238164902, + "learning_rate": 9.137661725551111e-05, + "loss": 0.0003, + "num_input_tokens_seen": 1179392, + "step": 2605 + }, + { + "epoch": 16.518987341772153, + "grad_norm": 0.0007168608717620373, + "learning_rate": 8.97913831543195e-05, + "loss": 0.0002, + "num_input_tokens_seen": 1181664, + "step": 2610 + }, + { + "epoch": 16.550632911392405, + "grad_norm": 0.0033773721661418676, + "learning_rate": 8.821866271007218e-05, + "loss": 0.0005, + "num_input_tokens_seen": 1183808, + "step": 2615 + }, + { + "epoch": 16.582278481012658, + "grad_norm": 0.0013766802148893476, + "learning_rate": 8.665850389953788e-05, + "loss": 0.0004, + "num_input_tokens_seen": 1186240, + "step": 2620 + }, + { + "epoch": 16.61392405063291, + "grad_norm": 0.002955741947516799, + "learning_rate": 8.511095431628591e-05, + "loss": 0.0004, + "num_input_tokens_seen": 1188704, + "step": 2625 + }, + { + "epoch": 16.645569620253166, + "grad_norm": 0.00041950310696847737, + "learning_rate": 8.357606116923328e-05, + "loss": 0.0004, + "num_input_tokens_seen": 1190944, + "step": 2630 + }, + { + "epoch": 16.67721518987342, + "grad_norm": 0.01038124319165945, + "learning_rate": 8.205387128120518e-05, + "loss": 0.0007, + "num_input_tokens_seen": 1193216, + "step": 2635 + }, + { + "epoch": 16.70886075949367, + "grad_norm": 0.0003012821252923459, + "learning_rate": 8.054443108750592e-05, + "loss": 0.0004, + "num_input_tokens_seen": 1195584, + "step": 2640 + }, + { + "epoch": 16.740506329113924, + "grad_norm": 0.019842565059661865, + "learning_rate": 7.904778663450324e-05, + "loss": 0.0007, + "num_input_tokens_seen": 1197824, + "step": 2645 + }, + { + "epoch": 16.772151898734176, + "grad_norm": 0.0002593526733107865, + "learning_rate": 7.75639835782232e-05, + "loss": 0.0, + "num_input_tokens_seen": 1200032, + "step": 2650 + }, + { + "epoch": 16.803797468354432, + "grad_norm": 0.00022097289911471307, + "learning_rate": 7.60930671829571e-05, + "loss": 0.0005, + "num_input_tokens_seen": 1202240, + "step": 2655 + }, + { + "epoch": 16.835443037974684, + "grad_norm": 0.011344253085553646, + "learning_rate": 7.46350823198813e-05, + "loss": 0.0005, + "num_input_tokens_seen": 1204576, + "step": 2660 + }, + { + "epoch": 16.867088607594937, + "grad_norm": 0.012124335393309593, + "learning_rate": 7.319007346568817e-05, + "loss": 0.0004, + "num_input_tokens_seen": 1206816, + "step": 2665 + }, + { + "epoch": 16.89873417721519, + "grad_norm": 0.0009705662960186601, + "learning_rate": 7.175808470122897e-05, + "loss": 0.0005, + "num_input_tokens_seen": 1209120, + "step": 2670 + }, + { + "epoch": 16.930379746835442, + "grad_norm": 0.0013107015984132886, + "learning_rate": 7.033915971016952e-05, + "loss": 0.0001, + "num_input_tokens_seen": 1211552, + "step": 2675 + }, + { + "epoch": 16.962025316455698, + "grad_norm": 0.0003129469114355743, + "learning_rate": 6.893334177765759e-05, + "loss": 0.0002, + "num_input_tokens_seen": 1213632, + "step": 2680 + }, + { + "epoch": 16.99367088607595, + "grad_norm": 0.010664188303053379, + "learning_rate": 6.75406737890023e-05, + "loss": 0.0003, + "num_input_tokens_seen": 1215968, + "step": 2685 + }, + { + "epoch": 17.0, + "eval_loss": 0.2750537693500519, + "eval_runtime": 1.6312, + "eval_samples_per_second": 42.913, + "eval_steps_per_second": 11.035, + "num_input_tokens_seen": 1216160, + "step": 2686 + }, + { + "epoch": 17.025316455696203, + "grad_norm": 0.008590532466769218, + "learning_rate": 6.616119822836609e-05, + "loss": 0.0006, + "num_input_tokens_seen": 1217888, + "step": 2690 + }, + { + "epoch": 17.056962025316455, + "grad_norm": 0.0066283028572797775, + "learning_rate": 6.479495717746808e-05, + "loss": 0.001, + "num_input_tokens_seen": 1220128, + "step": 2695 + }, + { + "epoch": 17.088607594936708, + "grad_norm": 0.0048147221095860004, + "learning_rate": 6.344199231430132e-05, + "loss": 0.0003, + "num_input_tokens_seen": 1222432, + "step": 2700 + }, + { + "epoch": 17.120253164556964, + "grad_norm": 0.012622714973986149, + "learning_rate": 6.210234491186079e-05, + "loss": 0.0005, + "num_input_tokens_seen": 1224768, + "step": 2705 + }, + { + "epoch": 17.151898734177216, + "grad_norm": 0.05339493229985237, + "learning_rate": 6.077605583688417e-05, + "loss": 0.001, + "num_input_tokens_seen": 1227136, + "step": 2710 + }, + { + "epoch": 17.18354430379747, + "grad_norm": 0.0019386393250897527, + "learning_rate": 5.946316554860581e-05, + "loss": 0.0002, + "num_input_tokens_seen": 1229344, + "step": 2715 + }, + { + "epoch": 17.21518987341772, + "grad_norm": 0.00074522610520944, + "learning_rate": 5.8163714097522025e-05, + "loss": 0.0002, + "num_input_tokens_seen": 1231552, + "step": 2720 + }, + { + "epoch": 17.246835443037973, + "grad_norm": 0.002929766895249486, + "learning_rate": 5.6877741124169045e-05, + "loss": 0.001, + "num_input_tokens_seen": 1233824, + "step": 2725 + }, + { + "epoch": 17.27848101265823, + "grad_norm": 0.003988339100033045, + "learning_rate": 5.560528585791491e-05, + "loss": 0.0002, + "num_input_tokens_seen": 1236160, + "step": 2730 + }, + { + "epoch": 17.310126582278482, + "grad_norm": 0.008571026846766472, + "learning_rate": 5.434638711576123e-05, + "loss": 0.0004, + "num_input_tokens_seen": 1238464, + "step": 2735 + }, + { + "epoch": 17.341772151898734, + "grad_norm": 0.009090566076338291, + "learning_rate": 5.310108330116037e-05, + "loss": 0.0008, + "num_input_tokens_seen": 1240704, + "step": 2740 + }, + { + "epoch": 17.373417721518987, + "grad_norm": 0.00025374346296302974, + "learning_rate": 5.18694124028431e-05, + "loss": 0.0002, + "num_input_tokens_seen": 1243008, + "step": 2745 + }, + { + "epoch": 17.40506329113924, + "grad_norm": 0.0058571649715304375, + "learning_rate": 5.065141199365991e-05, + "loss": 0.0002, + "num_input_tokens_seen": 1245184, + "step": 2750 + }, + { + "epoch": 17.436708860759495, + "grad_norm": 0.001075509935617447, + "learning_rate": 4.944711922943523e-05, + "loss": 0.0002, + "num_input_tokens_seen": 1247456, + "step": 2755 + }, + { + "epoch": 17.468354430379748, + "grad_norm": 0.00021690309222321957, + "learning_rate": 4.825657084783347e-05, + "loss": 0.0002, + "num_input_tokens_seen": 1249760, + "step": 2760 + }, + { + "epoch": 17.5, + "grad_norm": 0.01787727139890194, + "learning_rate": 4.7079803167238366e-05, + "loss": 0.0005, + "num_input_tokens_seen": 1252032, + "step": 2765 + }, + { + "epoch": 17.531645569620252, + "grad_norm": 0.0052017997950315475, + "learning_rate": 4.591685208564561e-05, + "loss": 0.0002, + "num_input_tokens_seen": 1254240, + "step": 2770 + }, + { + "epoch": 17.563291139240505, + "grad_norm": 0.01735135167837143, + "learning_rate": 4.476775307956699e-05, + "loss": 0.0006, + "num_input_tokens_seen": 1256512, + "step": 2775 + }, + { + "epoch": 17.59493670886076, + "grad_norm": 0.00021527201170101762, + "learning_rate": 4.363254120294846e-05, + "loss": 0.0, + "num_input_tokens_seen": 1258784, + "step": 2780 + }, + { + "epoch": 17.626582278481013, + "grad_norm": 0.0014001114759594202, + "learning_rate": 4.2511251086101373e-05, + "loss": 0.0002, + "num_input_tokens_seen": 1261024, + "step": 2785 + }, + { + "epoch": 17.658227848101266, + "grad_norm": 0.0006366492016240954, + "learning_rate": 4.14039169346449e-05, + "loss": 0.0004, + "num_input_tokens_seen": 1263392, + "step": 2790 + }, + { + "epoch": 17.689873417721518, + "grad_norm": 0.0036711168941110373, + "learning_rate": 4.031057252846371e-05, + "loss": 0.0005, + "num_input_tokens_seen": 1265600, + "step": 2795 + }, + { + "epoch": 17.72151898734177, + "grad_norm": 0.00026541907573118806, + "learning_rate": 3.923125122067689e-05, + "loss": 0.0001, + "num_input_tokens_seen": 1267808, + "step": 2800 + }, + { + "epoch": 17.753164556962027, + "grad_norm": 0.0001458050828659907, + "learning_rate": 3.816598593662024e-05, + "loss": 0.0004, + "num_input_tokens_seen": 1270016, + "step": 2805 + }, + { + "epoch": 17.78481012658228, + "grad_norm": 0.0001481924409745261, + "learning_rate": 3.711480917284282e-05, + "loss": 0.0004, + "num_input_tokens_seen": 1272448, + "step": 2810 + }, + { + "epoch": 17.81645569620253, + "grad_norm": 0.003592695342376828, + "learning_rate": 3.607775299611465e-05, + "loss": 0.0004, + "num_input_tokens_seen": 1274848, + "step": 2815 + }, + { + "epoch": 17.848101265822784, + "grad_norm": 0.005215463228523731, + "learning_rate": 3.505484904244877e-05, + "loss": 0.0003, + "num_input_tokens_seen": 1277152, + "step": 2820 + }, + { + "epoch": 17.879746835443036, + "grad_norm": 0.00018693594029173255, + "learning_rate": 3.404612851613675e-05, + "loss": 0.0002, + "num_input_tokens_seen": 1279488, + "step": 2825 + }, + { + "epoch": 17.911392405063292, + "grad_norm": 0.01072632148861885, + "learning_rate": 3.305162218879576e-05, + "loss": 0.0006, + "num_input_tokens_seen": 1281728, + "step": 2830 + }, + { + "epoch": 17.943037974683545, + "grad_norm": 0.002433460671454668, + "learning_rate": 3.207136039843078e-05, + "loss": 0.0001, + "num_input_tokens_seen": 1284000, + "step": 2835 + }, + { + "epoch": 17.974683544303797, + "grad_norm": 0.008887792937457561, + "learning_rate": 3.110537304850869e-05, + "loss": 0.0003, + "num_input_tokens_seen": 1286176, + "step": 2840 + }, + { + "epoch": 18.0, + "eval_loss": 0.276852011680603, + "eval_runtime": 1.6464, + "eval_samples_per_second": 42.516, + "eval_steps_per_second": 10.933, + "num_input_tokens_seen": 1287728, + "step": 2844 + }, + { + "epoch": 18.00632911392405, + "grad_norm": 0.006616608239710331, + "learning_rate": 3.0153689607045842e-05, + "loss": 0.0003, + "num_input_tokens_seen": 1288176, + "step": 2845 + }, + { + "epoch": 18.037974683544302, + "grad_norm": 0.009547991678118706, + "learning_rate": 2.92163391057097e-05, + "loss": 0.0002, + "num_input_tokens_seen": 1290384, + "step": 2850 + }, + { + "epoch": 18.069620253164558, + "grad_norm": 0.00018490054935682565, + "learning_rate": 2.8293350138932806e-05, + "loss": 0.0002, + "num_input_tokens_seen": 1292592, + "step": 2855 + }, + { + "epoch": 18.10126582278481, + "grad_norm": 0.0009518595761619508, + "learning_rate": 2.738475086304032e-05, + "loss": 0.0001, + "num_input_tokens_seen": 1294864, + "step": 2860 + }, + { + "epoch": 18.132911392405063, + "grad_norm": 0.006473258137702942, + "learning_rate": 2.6490568995391984e-05, + "loss": 0.0004, + "num_input_tokens_seen": 1297168, + "step": 2865 + }, + { + "epoch": 18.164556962025316, + "grad_norm": 0.004321450833231211, + "learning_rate": 2.561083181353524e-05, + "loss": 0.0003, + "num_input_tokens_seen": 1299600, + "step": 2870 + }, + { + "epoch": 18.196202531645568, + "grad_norm": 0.00034929916728287935, + "learning_rate": 2.4745566154374234e-05, + "loss": 0.0001, + "num_input_tokens_seen": 1301808, + "step": 2875 + }, + { + "epoch": 18.227848101265824, + "grad_norm": 0.001334361033514142, + "learning_rate": 2.3894798413350738e-05, + "loss": 0.0002, + "num_input_tokens_seen": 1304144, + "step": 2880 + }, + { + "epoch": 18.259493670886076, + "grad_norm": 0.009019610472023487, + "learning_rate": 2.30585545436387e-05, + "loss": 0.0004, + "num_input_tokens_seen": 1306352, + "step": 2885 + }, + { + "epoch": 18.29113924050633, + "grad_norm": 0.008182347752153873, + "learning_rate": 2.223686005535297e-05, + "loss": 0.0005, + "num_input_tokens_seen": 1308528, + "step": 2890 + }, + { + "epoch": 18.32278481012658, + "grad_norm": 0.01867622882127762, + "learning_rate": 2.1429740014770993e-05, + "loss": 0.0007, + "num_input_tokens_seen": 1310704, + "step": 2895 + }, + { + "epoch": 18.354430379746834, + "grad_norm": 0.01742101088166237, + "learning_rate": 2.0637219043567636e-05, + "loss": 0.0013, + "num_input_tokens_seen": 1313008, + "step": 2900 + }, + { + "epoch": 18.38607594936709, + "grad_norm": 0.02185041271150112, + "learning_rate": 1.985932131806495e-05, + "loss": 0.0012, + "num_input_tokens_seen": 1315280, + "step": 2905 + }, + { + "epoch": 18.417721518987342, + "grad_norm": 0.006965738721191883, + "learning_rate": 1.9096070568493996e-05, + "loss": 0.0003, + "num_input_tokens_seen": 1317488, + "step": 2910 + }, + { + "epoch": 18.449367088607595, + "grad_norm": 0.009952598251402378, + "learning_rate": 1.8347490078271244e-05, + "loss": 0.0002, + "num_input_tokens_seen": 1319696, + "step": 2915 + }, + { + "epoch": 18.481012658227847, + "grad_norm": 0.001204541651532054, + "learning_rate": 1.7613602683288044e-05, + "loss": 0.0003, + "num_input_tokens_seen": 1321840, + "step": 2920 + }, + { + "epoch": 18.5126582278481, + "grad_norm": 0.00531289866194129, + "learning_rate": 1.6894430771214277e-05, + "loss": 0.0001, + "num_input_tokens_seen": 1324048, + "step": 2925 + }, + { + "epoch": 18.544303797468356, + "grad_norm": 0.0035829441621899605, + "learning_rate": 1.6189996280815354e-05, + "loss": 0.0007, + "num_input_tokens_seen": 1326416, + "step": 2930 + }, + { + "epoch": 18.575949367088608, + "grad_norm": 0.0015941396122798324, + "learning_rate": 1.5500320701282934e-05, + "loss": 0.0003, + "num_input_tokens_seen": 1328624, + "step": 2935 + }, + { + "epoch": 18.60759493670886, + "grad_norm": 0.008270417340099812, + "learning_rate": 1.4825425071579079e-05, + "loss": 0.0002, + "num_input_tokens_seen": 1330992, + "step": 2940 + }, + { + "epoch": 18.639240506329113, + "grad_norm": 0.011578932404518127, + "learning_rate": 1.4165329979794973e-05, + "loss": 0.0002, + "num_input_tokens_seen": 1333360, + "step": 2945 + }, + { + "epoch": 18.67088607594937, + "grad_norm": 0.00026211809017695487, + "learning_rate": 1.3520055562522671e-05, + "loss": 0.0001, + "num_input_tokens_seen": 1335536, + "step": 2950 + }, + { + "epoch": 18.70253164556962, + "grad_norm": 0.00020787143148481846, + "learning_rate": 1.2889621504240557e-05, + "loss": 0.0002, + "num_input_tokens_seen": 1337872, + "step": 2955 + }, + { + "epoch": 18.734177215189874, + "grad_norm": 0.00027836396475322545, + "learning_rate": 1.2274047036713198e-05, + "loss": 0.0003, + "num_input_tokens_seen": 1340208, + "step": 2960 + }, + { + "epoch": 18.765822784810126, + "grad_norm": 0.001785777392797172, + "learning_rate": 1.1673350938404493e-05, + "loss": 0.0004, + "num_input_tokens_seen": 1342448, + "step": 2965 + }, + { + "epoch": 18.79746835443038, + "grad_norm": 0.0030327762942761183, + "learning_rate": 1.1087551533904894e-05, + "loss": 0.0004, + "num_input_tokens_seen": 1344720, + "step": 2970 + }, + { + "epoch": 18.82911392405063, + "grad_norm": 0.002383754588663578, + "learning_rate": 1.0516666693372423e-05, + "loss": 0.0002, + "num_input_tokens_seen": 1347088, + "step": 2975 + }, + { + "epoch": 18.860759493670887, + "grad_norm": 0.000466236931970343, + "learning_rate": 9.960713831987323e-06, + "loss": 0.0001, + "num_input_tokens_seen": 1349360, + "step": 2980 + }, + { + "epoch": 18.89240506329114, + "grad_norm": 0.000480673392303288, + "learning_rate": 9.419709909421203e-06, + "loss": 0.0009, + "num_input_tokens_seen": 1351600, + "step": 2985 + }, + { + "epoch": 18.924050632911392, + "grad_norm": 0.00112088059540838, + "learning_rate": 8.893671429319295e-06, + "loss": 0.0003, + "num_input_tokens_seen": 1353776, + "step": 2990 + }, + { + "epoch": 18.955696202531644, + "grad_norm": 0.002725357422605157, + "learning_rate": 8.382614438797177e-06, + "loss": 0.0004, + "num_input_tokens_seen": 1356080, + "step": 2995 + }, + { + "epoch": 18.9873417721519, + "grad_norm": 0.007981766015291214, + "learning_rate": 7.886554527951117e-06, + "loss": 0.0005, + "num_input_tokens_seen": 1358448, + "step": 3000 + }, + { + "epoch": 19.0, + "eval_loss": 0.28090646862983704, + "eval_runtime": 1.6382, + "eval_samples_per_second": 42.73, + "eval_steps_per_second": 10.988, + "num_input_tokens_seen": 1359120, + "step": 3002 + }, + { + "epoch": 19.018987341772153, + "grad_norm": 0.01756378635764122, + "learning_rate": 7.405506829382736e-06, + "loss": 0.0005, + "num_input_tokens_seen": 1360464, + "step": 3005 + }, + { + "epoch": 19.050632911392405, + "grad_norm": 0.00034704216523095965, + "learning_rate": 6.9394860177370845e-06, + "loss": 0.0007, + "num_input_tokens_seen": 1362832, + "step": 3010 + }, + { + "epoch": 19.082278481012658, + "grad_norm": 0.009799733757972717, + "learning_rate": 6.488506309255238e-06, + "loss": 0.0001, + "num_input_tokens_seen": 1365072, + "step": 3015 + }, + { + "epoch": 19.11392405063291, + "grad_norm": 0.0008434038609266281, + "learning_rate": 6.052581461340411e-06, + "loss": 0.0002, + "num_input_tokens_seen": 1367440, + "step": 3020 + }, + { + "epoch": 19.145569620253166, + "grad_norm": 0.007226018700748682, + "learning_rate": 5.631724772138469e-06, + "loss": 0.0002, + "num_input_tokens_seen": 1369744, + "step": 3025 + }, + { + "epoch": 19.17721518987342, + "grad_norm": 0.006701529957354069, + "learning_rate": 5.225949080131964e-06, + "loss": 0.0009, + "num_input_tokens_seen": 1371984, + "step": 3030 + }, + { + "epoch": 19.20886075949367, + "grad_norm": 0.000355382013367489, + "learning_rate": 4.8352667637490684e-06, + "loss": 0.0004, + "num_input_tokens_seen": 1374288, + "step": 3035 + }, + { + "epoch": 19.240506329113924, + "grad_norm": 0.014170898124575615, + "learning_rate": 4.459689740985206e-06, + "loss": 0.0002, + "num_input_tokens_seen": 1376496, + "step": 3040 + }, + { + "epoch": 19.272151898734176, + "grad_norm": 0.00471507478505373, + "learning_rate": 4.099229469040011e-06, + "loss": 0.0003, + "num_input_tokens_seen": 1378736, + "step": 3045 + }, + { + "epoch": 19.303797468354432, + "grad_norm": 0.002437709365040064, + "learning_rate": 3.7538969439678272e-06, + "loss": 0.0003, + "num_input_tokens_seen": 1380944, + "step": 3050 + }, + { + "epoch": 19.335443037974684, + "grad_norm": 0.00025732358335517347, + "learning_rate": 3.423702700341813e-06, + "loss": 0.0001, + "num_input_tokens_seen": 1383248, + "step": 3055 + }, + { + "epoch": 19.367088607594937, + "grad_norm": 0.0001917013869388029, + "learning_rate": 3.1086568109331413e-06, + "loss": 0.0003, + "num_input_tokens_seen": 1385488, + "step": 3060 + }, + { + "epoch": 19.39873417721519, + "grad_norm": 0.0010126049164682627, + "learning_rate": 2.8087688864033013e-06, + "loss": 0.0005, + "num_input_tokens_seen": 1387760, + "step": 3065 + }, + { + "epoch": 19.430379746835442, + "grad_norm": 0.0010305697796866298, + "learning_rate": 2.524048075011165e-06, + "loss": 0.0002, + "num_input_tokens_seen": 1390192, + "step": 3070 + }, + { + "epoch": 19.462025316455698, + "grad_norm": 0.0010394472628831863, + "learning_rate": 2.254503062333824e-06, + "loss": 0.0006, + "num_input_tokens_seen": 1392464, + "step": 3075 + }, + { + "epoch": 19.49367088607595, + "grad_norm": 0.002385683823376894, + "learning_rate": 2.0001420710016318e-06, + "loss": 0.0001, + "num_input_tokens_seen": 1394768, + "step": 3080 + }, + { + "epoch": 19.525316455696203, + "grad_norm": 0.00017207856581080705, + "learning_rate": 1.7609728604472963e-06, + "loss": 0.0005, + "num_input_tokens_seen": 1397072, + "step": 3085 + }, + { + "epoch": 19.556962025316455, + "grad_norm": 0.006562650669366121, + "learning_rate": 1.5370027266694008e-06, + "loss": 0.0003, + "num_input_tokens_seen": 1399344, + "step": 3090 + }, + { + "epoch": 19.588607594936708, + "grad_norm": 0.002432658337056637, + "learning_rate": 1.3282385020095267e-06, + "loss": 0.0002, + "num_input_tokens_seen": 1401552, + "step": 3095 + }, + { + "epoch": 19.620253164556964, + "grad_norm": 0.0025194003246724606, + "learning_rate": 1.1346865549440867e-06, + "loss": 0.0003, + "num_input_tokens_seen": 1403760, + "step": 3100 + }, + { + "epoch": 19.651898734177216, + "grad_norm": 0.007078900001943111, + "learning_rate": 9.563527898899805e-07, + "loss": 0.0001, + "num_input_tokens_seen": 1406000, + "step": 3105 + }, + { + "epoch": 19.68354430379747, + "grad_norm": 0.00788037572056055, + "learning_rate": 7.932426470242948e-07, + "loss": 0.0009, + "num_input_tokens_seen": 1408208, + "step": 3110 + }, + { + "epoch": 19.71518987341772, + "grad_norm": 0.00032059315708465874, + "learning_rate": 6.453611021186578e-07, + "loss": 0.0001, + "num_input_tokens_seen": 1410416, + "step": 3115 + }, + { + "epoch": 19.746835443037973, + "grad_norm": 0.0016587411519140005, + "learning_rate": 5.12712666387194e-07, + "loss": 0.0001, + "num_input_tokens_seen": 1412592, + "step": 3120 + }, + { + "epoch": 19.77848101265823, + "grad_norm": 0.010737399570643902, + "learning_rate": 3.953013863490784e-07, + "loss": 0.0002, + "num_input_tokens_seen": 1414960, + "step": 3125 + }, + { + "epoch": 19.810126582278482, + "grad_norm": 0.000507625169120729, + "learning_rate": 2.9313084370496955e-07, + "loss": 0.0002, + "num_input_tokens_seen": 1417232, + "step": 3130 + }, + { + "epoch": 19.841772151898734, + "grad_norm": 0.0002272734127473086, + "learning_rate": 2.062041552277627e-07, + "loss": 0.0002, + "num_input_tokens_seen": 1419568, + "step": 3135 + }, + { + "epoch": 19.873417721518987, + "grad_norm": 0.00031070318073034286, + "learning_rate": 1.3452397266772166e-07, + "loss": 0.0003, + "num_input_tokens_seen": 1421776, + "step": 3140 + }, + { + "epoch": 19.90506329113924, + "grad_norm": 0.0002692148555070162, + "learning_rate": 7.809248267121038e-08, + "loss": 0.0008, + "num_input_tokens_seen": 1424016, + "step": 3145 + }, + { + "epoch": 19.936708860759495, + "grad_norm": 0.00034570536809042096, + "learning_rate": 3.6911406714246195e-08, + "loss": 0.0003, + "num_input_tokens_seen": 1426288, + "step": 3150 + }, + { + "epoch": 19.968354430379748, + "grad_norm": 0.004976261407136917, + "learning_rate": 1.0982001050041657e-08, + "loss": 0.001, + "num_input_tokens_seen": 1428592, + "step": 3155 + }, + { + "epoch": 20.0, + "grad_norm": 0.00042913382640108466, + "learning_rate": 3.0505667042435294e-10, + "loss": 0.0001, + "num_input_tokens_seen": 1430592, + "step": 3160 + }, + { + "epoch": 20.0, + "eval_loss": 0.27992814779281616, + "eval_runtime": 1.6357, + "eval_samples_per_second": 42.795, + "eval_steps_per_second": 11.004, + "num_input_tokens_seen": 1430592, + "step": 3160 + }, + { + "epoch": 20.0, + "num_input_tokens_seen": 1430592, + "step": 3160, + "total_flos": 6.441891117819494e+16, + "train_loss": 0.06765125470246627, + "train_runtime": 660.6029, + "train_samples_per_second": 19.073, + "train_steps_per_second": 4.784 + } + ], + "logging_steps": 5, + "max_steps": 3160, + "num_input_tokens_seen": 1430592, + "num_train_epochs": 20, + "save_steps": 158, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 6.441891117819494e+16, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}