{ "best_global_step": 3160, "best_metric": 0.214238703250885, "best_model_checkpoint": "saves_multiple/ia3/llama-3-8b-instruct/train_svamp_101112_1760638004/checkpoint-3160", "epoch": 20.0, "eval_steps": 158, "global_step": 3160, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03164556962025317, "grad_norm": 1.6946802139282227, "learning_rate": 6.329113924050633e-07, "loss": 2.399, "num_input_tokens_seen": 2272, "step": 5 }, { "epoch": 0.06329113924050633, "grad_norm": 1.4426089525222778, "learning_rate": 1.4240506329113925e-06, "loss": 2.3473, "num_input_tokens_seen": 4576, "step": 10 }, { "epoch": 0.0949367088607595, "grad_norm": 1.4806976318359375, "learning_rate": 2.2151898734177215e-06, "loss": 2.3323, "num_input_tokens_seen": 6784, "step": 15 }, { "epoch": 0.12658227848101267, "grad_norm": 1.760801076889038, "learning_rate": 3.0063291139240506e-06, "loss": 2.2433, "num_input_tokens_seen": 9056, "step": 20 }, { "epoch": 0.15822784810126583, "grad_norm": 2.1461338996887207, "learning_rate": 3.7974683544303802e-06, "loss": 2.4959, "num_input_tokens_seen": 11360, "step": 25 }, { "epoch": 0.189873417721519, "grad_norm": 1.7740542888641357, "learning_rate": 4.588607594936709e-06, "loss": 2.4692, "num_input_tokens_seen": 13504, "step": 30 }, { "epoch": 0.22151898734177214, "grad_norm": 1.6903886795043945, "learning_rate": 5.379746835443038e-06, "loss": 2.4533, "num_input_tokens_seen": 15680, "step": 35 }, { "epoch": 0.25316455696202533, "grad_norm": 1.689306616783142, "learning_rate": 6.170886075949367e-06, "loss": 2.455, "num_input_tokens_seen": 17952, "step": 40 }, { "epoch": 0.2848101265822785, "grad_norm": 1.7290929555892944, "learning_rate": 6.9620253164556965e-06, "loss": 2.5512, "num_input_tokens_seen": 20224, "step": 45 }, { "epoch": 0.31645569620253167, "grad_norm": 1.5842483043670654, "learning_rate": 7.753164556962026e-06, "loss": 2.2403, "num_input_tokens_seen": 22528, "step": 50 }, { "epoch": 0.34810126582278483, "grad_norm": 1.5157476663589478, "learning_rate": 8.544303797468354e-06, "loss": 2.2837, "num_input_tokens_seen": 24896, "step": 55 }, { "epoch": 0.379746835443038, "grad_norm": 1.783208966255188, "learning_rate": 9.335443037974684e-06, "loss": 2.3636, "num_input_tokens_seen": 27136, "step": 60 }, { "epoch": 0.41139240506329117, "grad_norm": 1.7329400777816772, "learning_rate": 1.0126582278481012e-05, "loss": 2.198, "num_input_tokens_seen": 29472, "step": 65 }, { "epoch": 0.4430379746835443, "grad_norm": 1.5789158344268799, "learning_rate": 1.0917721518987342e-05, "loss": 2.3521, "num_input_tokens_seen": 31744, "step": 70 }, { "epoch": 0.47468354430379744, "grad_norm": 1.6448791027069092, "learning_rate": 1.170886075949367e-05, "loss": 2.4413, "num_input_tokens_seen": 34112, "step": 75 }, { "epoch": 0.5063291139240507, "grad_norm": 1.556320309638977, "learning_rate": 1.25e-05, "loss": 2.375, "num_input_tokens_seen": 36320, "step": 80 }, { "epoch": 0.5379746835443038, "grad_norm": 1.615371584892273, "learning_rate": 1.3291139240506329e-05, "loss": 2.4612, "num_input_tokens_seen": 38656, "step": 85 }, { "epoch": 0.569620253164557, "grad_norm": 1.8173415660858154, "learning_rate": 1.4082278481012659e-05, "loss": 2.5126, "num_input_tokens_seen": 40928, "step": 90 }, { "epoch": 0.6012658227848101, "grad_norm": 1.584877610206604, "learning_rate": 1.4873417721518987e-05, "loss": 2.2944, "num_input_tokens_seen": 43296, "step": 95 }, { "epoch": 0.6329113924050633, "grad_norm": 1.4178416728973389, "learning_rate": 1.566455696202532e-05, "loss": 2.3253, "num_input_tokens_seen": 45600, "step": 100 }, { "epoch": 0.6645569620253164, "grad_norm": 1.6961376667022705, "learning_rate": 1.6455696202531644e-05, "loss": 2.3822, "num_input_tokens_seen": 47968, "step": 105 }, { "epoch": 0.6962025316455697, "grad_norm": 1.636988639831543, "learning_rate": 1.7246835443037976e-05, "loss": 2.6532, "num_input_tokens_seen": 50208, "step": 110 }, { "epoch": 0.7278481012658228, "grad_norm": 1.7380754947662354, "learning_rate": 1.8037974683544304e-05, "loss": 2.3374, "num_input_tokens_seen": 52448, "step": 115 }, { "epoch": 0.759493670886076, "grad_norm": 1.7528373003005981, "learning_rate": 1.8829113924050636e-05, "loss": 2.3868, "num_input_tokens_seen": 54752, "step": 120 }, { "epoch": 0.7911392405063291, "grad_norm": 1.979233980178833, "learning_rate": 1.962025316455696e-05, "loss": 2.3867, "num_input_tokens_seen": 56992, "step": 125 }, { "epoch": 0.8227848101265823, "grad_norm": 2.0350630283355713, "learning_rate": 2.0411392405063292e-05, "loss": 2.373, "num_input_tokens_seen": 59232, "step": 130 }, { "epoch": 0.8544303797468354, "grad_norm": 1.7272742986679077, "learning_rate": 2.120253164556962e-05, "loss": 2.3042, "num_input_tokens_seen": 61472, "step": 135 }, { "epoch": 0.8860759493670886, "grad_norm": 1.4649332761764526, "learning_rate": 2.1993670886075952e-05, "loss": 2.4077, "num_input_tokens_seen": 63584, "step": 140 }, { "epoch": 0.9177215189873418, "grad_norm": 1.6773498058319092, "learning_rate": 2.278481012658228e-05, "loss": 2.3607, "num_input_tokens_seen": 65824, "step": 145 }, { "epoch": 0.9493670886075949, "grad_norm": 1.7161122560501099, "learning_rate": 2.357594936708861e-05, "loss": 2.3587, "num_input_tokens_seen": 68128, "step": 150 }, { "epoch": 0.9810126582278481, "grad_norm": 1.5581388473510742, "learning_rate": 2.4367088607594937e-05, "loss": 2.4702, "num_input_tokens_seen": 70400, "step": 155 }, { "epoch": 1.0, "eval_loss": 2.441667318344116, "eval_runtime": 1.0491, "eval_samples_per_second": 66.722, "eval_steps_per_second": 17.157, "num_input_tokens_seen": 71552, "step": 158 }, { "epoch": 1.0126582278481013, "grad_norm": 1.5443732738494873, "learning_rate": 2.515822784810127e-05, "loss": 2.2796, "num_input_tokens_seen": 72480, "step": 160 }, { "epoch": 1.0443037974683544, "grad_norm": 1.775388479232788, "learning_rate": 2.5949367088607597e-05, "loss": 2.2701, "num_input_tokens_seen": 74784, "step": 165 }, { "epoch": 1.0759493670886076, "grad_norm": 2.137613296508789, "learning_rate": 2.6740506329113922e-05, "loss": 2.4756, "num_input_tokens_seen": 76928, "step": 170 }, { "epoch": 1.1075949367088607, "grad_norm": 2.4731106758117676, "learning_rate": 2.7531645569620257e-05, "loss": 2.3573, "num_input_tokens_seen": 79296, "step": 175 }, { "epoch": 1.139240506329114, "grad_norm": 1.5658464431762695, "learning_rate": 2.8322784810126586e-05, "loss": 2.3019, "num_input_tokens_seen": 81632, "step": 180 }, { "epoch": 1.1708860759493671, "grad_norm": 1.4423842430114746, "learning_rate": 2.9113924050632914e-05, "loss": 2.2589, "num_input_tokens_seen": 83872, "step": 185 }, { "epoch": 1.2025316455696202, "grad_norm": 1.408566951751709, "learning_rate": 2.990506329113924e-05, "loss": 2.1265, "num_input_tokens_seen": 86240, "step": 190 }, { "epoch": 1.2341772151898733, "grad_norm": 1.4578754901885986, "learning_rate": 3.0696202531645574e-05, "loss": 2.2293, "num_input_tokens_seen": 88544, "step": 195 }, { "epoch": 1.2658227848101267, "grad_norm": 1.57094144821167, "learning_rate": 3.14873417721519e-05, "loss": 1.9952, "num_input_tokens_seen": 90880, "step": 200 }, { "epoch": 1.2974683544303798, "grad_norm": 1.5864253044128418, "learning_rate": 3.227848101265823e-05, "loss": 2.1111, "num_input_tokens_seen": 93056, "step": 205 }, { "epoch": 1.3291139240506329, "grad_norm": 1.4571633338928223, "learning_rate": 3.3069620253164555e-05, "loss": 2.0919, "num_input_tokens_seen": 95392, "step": 210 }, { "epoch": 1.360759493670886, "grad_norm": 1.5949128866195679, "learning_rate": 3.386075949367089e-05, "loss": 2.0189, "num_input_tokens_seen": 97568, "step": 215 }, { "epoch": 1.3924050632911391, "grad_norm": 1.2293975353240967, "learning_rate": 3.465189873417722e-05, "loss": 1.9678, "num_input_tokens_seen": 99776, "step": 220 }, { "epoch": 1.4240506329113924, "grad_norm": 1.6140550374984741, "learning_rate": 3.5443037974683544e-05, "loss": 1.8783, "num_input_tokens_seen": 102176, "step": 225 }, { "epoch": 1.4556962025316456, "grad_norm": 1.6481391191482544, "learning_rate": 3.6234177215189875e-05, "loss": 1.8979, "num_input_tokens_seen": 104480, "step": 230 }, { "epoch": 1.4873417721518987, "grad_norm": 1.6942737102508545, "learning_rate": 3.70253164556962e-05, "loss": 1.9046, "num_input_tokens_seen": 106816, "step": 235 }, { "epoch": 1.518987341772152, "grad_norm": 1.3236186504364014, "learning_rate": 3.781645569620253e-05, "loss": 1.6367, "num_input_tokens_seen": 109024, "step": 240 }, { "epoch": 1.5506329113924051, "grad_norm": 1.318651556968689, "learning_rate": 3.8607594936708864e-05, "loss": 1.6789, "num_input_tokens_seen": 111200, "step": 245 }, { "epoch": 1.5822784810126582, "grad_norm": 1.3366973400115967, "learning_rate": 3.939873417721519e-05, "loss": 1.7654, "num_input_tokens_seen": 113408, "step": 250 }, { "epoch": 1.6139240506329116, "grad_norm": 1.1693854331970215, "learning_rate": 4.018987341772152e-05, "loss": 1.7488, "num_input_tokens_seen": 115616, "step": 255 }, { "epoch": 1.6455696202531644, "grad_norm": 1.393717885017395, "learning_rate": 4.098101265822785e-05, "loss": 1.7352, "num_input_tokens_seen": 117728, "step": 260 }, { "epoch": 1.6772151898734178, "grad_norm": 1.1501590013504028, "learning_rate": 4.177215189873418e-05, "loss": 1.6651, "num_input_tokens_seen": 120032, "step": 265 }, { "epoch": 1.7088607594936709, "grad_norm": 1.3241972923278809, "learning_rate": 4.256329113924051e-05, "loss": 1.7203, "num_input_tokens_seen": 122144, "step": 270 }, { "epoch": 1.740506329113924, "grad_norm": 1.362923264503479, "learning_rate": 4.3354430379746834e-05, "loss": 1.6829, "num_input_tokens_seen": 124384, "step": 275 }, { "epoch": 1.7721518987341773, "grad_norm": 1.3987077474594116, "learning_rate": 4.4145569620253165e-05, "loss": 1.7782, "num_input_tokens_seen": 126560, "step": 280 }, { "epoch": 1.8037974683544302, "grad_norm": 1.3914768695831299, "learning_rate": 4.49367088607595e-05, "loss": 1.6258, "num_input_tokens_seen": 128928, "step": 285 }, { "epoch": 1.8354430379746836, "grad_norm": 1.319828987121582, "learning_rate": 4.572784810126582e-05, "loss": 1.6883, "num_input_tokens_seen": 131264, "step": 290 }, { "epoch": 1.8670886075949367, "grad_norm": 1.2665724754333496, "learning_rate": 4.6518987341772154e-05, "loss": 1.6287, "num_input_tokens_seen": 133568, "step": 295 }, { "epoch": 1.8987341772151898, "grad_norm": 1.3669281005859375, "learning_rate": 4.7310126582278485e-05, "loss": 1.664, "num_input_tokens_seen": 135936, "step": 300 }, { "epoch": 1.9303797468354431, "grad_norm": 1.4922566413879395, "learning_rate": 4.810126582278481e-05, "loss": 1.5485, "num_input_tokens_seen": 138080, "step": 305 }, { "epoch": 1.9620253164556962, "grad_norm": 1.3618347644805908, "learning_rate": 4.889240506329114e-05, "loss": 1.5031, "num_input_tokens_seen": 140384, "step": 310 }, { "epoch": 1.9936708860759493, "grad_norm": 1.3220609426498413, "learning_rate": 4.968354430379747e-05, "loss": 1.5492, "num_input_tokens_seen": 142752, "step": 315 }, { "epoch": 2.0, "eval_loss": 1.5887207984924316, "eval_runtime": 1.0484, "eval_samples_per_second": 66.767, "eval_steps_per_second": 17.169, "num_input_tokens_seen": 142960, "step": 316 }, { "epoch": 2.0253164556962027, "grad_norm": 1.2385551929473877, "learning_rate": 4.9999862724609986e-05, "loss": 1.523, "num_input_tokens_seen": 144816, "step": 320 }, { "epoch": 2.0569620253164556, "grad_norm": 1.2174158096313477, "learning_rate": 4.999902382490825e-05, "loss": 1.5465, "num_input_tokens_seen": 147120, "step": 325 }, { "epoch": 2.088607594936709, "grad_norm": 1.2971528768539429, "learning_rate": 4.9997422315170474e-05, "loss": 1.4823, "num_input_tokens_seen": 149392, "step": 330 }, { "epoch": 2.1202531645569622, "grad_norm": 1.260852336883545, "learning_rate": 4.9995058244251644e-05, "loss": 1.4796, "num_input_tokens_seen": 151632, "step": 335 }, { "epoch": 2.151898734177215, "grad_norm": 1.114691138267517, "learning_rate": 4.9991931684269146e-05, "loss": 1.3452, "num_input_tokens_seen": 153904, "step": 340 }, { "epoch": 2.1835443037974684, "grad_norm": 1.3311914205551147, "learning_rate": 4.998804273060055e-05, "loss": 1.3615, "num_input_tokens_seen": 156176, "step": 345 }, { "epoch": 2.2151898734177213, "grad_norm": 1.0716017484664917, "learning_rate": 4.998339150188069e-05, "loss": 1.273, "num_input_tokens_seen": 158480, "step": 350 }, { "epoch": 2.2468354430379747, "grad_norm": 1.2003897428512573, "learning_rate": 4.997797813999805e-05, "loss": 1.3779, "num_input_tokens_seen": 160688, "step": 355 }, { "epoch": 2.278481012658228, "grad_norm": 1.2310749292373657, "learning_rate": 4.997180281009045e-05, "loss": 1.3583, "num_input_tokens_seen": 162896, "step": 360 }, { "epoch": 2.310126582278481, "grad_norm": 1.3503273725509644, "learning_rate": 4.9964865700539986e-05, "loss": 1.2353, "num_input_tokens_seen": 165200, "step": 365 }, { "epoch": 2.3417721518987342, "grad_norm": 1.3061615228652954, "learning_rate": 4.99571670229673e-05, "loss": 1.2784, "num_input_tokens_seen": 167376, "step": 370 }, { "epoch": 2.3734177215189876, "grad_norm": 1.1640284061431885, "learning_rate": 4.99487070122251e-05, "loss": 1.2327, "num_input_tokens_seen": 169520, "step": 375 }, { "epoch": 2.4050632911392404, "grad_norm": 1.2148544788360596, "learning_rate": 4.993948592639104e-05, "loss": 1.3369, "num_input_tokens_seen": 171920, "step": 380 }, { "epoch": 2.4367088607594938, "grad_norm": 1.2271808385849, "learning_rate": 4.9929504046759805e-05, "loss": 1.2571, "num_input_tokens_seen": 174288, "step": 385 }, { "epoch": 2.4683544303797467, "grad_norm": 1.1879481077194214, "learning_rate": 4.9918761677834545e-05, "loss": 1.189, "num_input_tokens_seen": 176592, "step": 390 }, { "epoch": 2.5, "grad_norm": 0.997016429901123, "learning_rate": 4.990725914731759e-05, "loss": 1.1029, "num_input_tokens_seen": 178896, "step": 395 }, { "epoch": 2.5316455696202533, "grad_norm": 1.308315634727478, "learning_rate": 4.989499680610046e-05, "loss": 1.1933, "num_input_tokens_seen": 181168, "step": 400 }, { "epoch": 2.5632911392405062, "grad_norm": 1.0051201581954956, "learning_rate": 4.988197502825312e-05, "loss": 1.0757, "num_input_tokens_seen": 183376, "step": 405 }, { "epoch": 2.5949367088607596, "grad_norm": 1.1688600778579712, "learning_rate": 4.986819421101264e-05, "loss": 1.0739, "num_input_tokens_seen": 185744, "step": 410 }, { "epoch": 2.6265822784810124, "grad_norm": 1.299314022064209, "learning_rate": 4.9853654774770995e-05, "loss": 1.0986, "num_input_tokens_seen": 187984, "step": 415 }, { "epoch": 2.6582278481012658, "grad_norm": 1.096156120300293, "learning_rate": 4.9838357163062336e-05, "loss": 1.0604, "num_input_tokens_seen": 190128, "step": 420 }, { "epoch": 2.689873417721519, "grad_norm": 0.9744728803634644, "learning_rate": 4.982230184254933e-05, "loss": 1.0079, "num_input_tokens_seen": 192336, "step": 425 }, { "epoch": 2.721518987341772, "grad_norm": 0.9797954559326172, "learning_rate": 4.980548930300909e-05, "loss": 1.0385, "num_input_tokens_seen": 194640, "step": 430 }, { "epoch": 2.7531645569620253, "grad_norm": 0.9810925126075745, "learning_rate": 4.978792005731806e-05, "loss": 0.9731, "num_input_tokens_seen": 196848, "step": 435 }, { "epoch": 2.7848101265822782, "grad_norm": 0.9190636277198792, "learning_rate": 4.976959464143652e-05, "loss": 0.9789, "num_input_tokens_seen": 199088, "step": 440 }, { "epoch": 2.8164556962025316, "grad_norm": 0.7974667549133301, "learning_rate": 4.9750513614392116e-05, "loss": 0.8175, "num_input_tokens_seen": 201392, "step": 445 }, { "epoch": 2.848101265822785, "grad_norm": 0.908745527267456, "learning_rate": 4.97306775582629e-05, "loss": 0.8823, "num_input_tokens_seen": 203664, "step": 450 }, { "epoch": 2.879746835443038, "grad_norm": 1.1987823247909546, "learning_rate": 4.971008707815952e-05, "loss": 0.8747, "num_input_tokens_seen": 205968, "step": 455 }, { "epoch": 2.911392405063291, "grad_norm": 0.7311404347419739, "learning_rate": 4.968874280220676e-05, "loss": 0.7876, "num_input_tokens_seen": 208208, "step": 460 }, { "epoch": 2.9430379746835444, "grad_norm": 0.9768964052200317, "learning_rate": 4.966664538152442e-05, "loss": 0.8566, "num_input_tokens_seen": 210544, "step": 465 }, { "epoch": 2.9746835443037973, "grad_norm": 0.8635650277137756, "learning_rate": 4.964379549020741e-05, "loss": 0.8332, "num_input_tokens_seen": 212848, "step": 470 }, { "epoch": 3.0, "eval_loss": 0.9196476340293884, "eval_runtime": 1.0475, "eval_samples_per_second": 66.824, "eval_steps_per_second": 17.183, "num_input_tokens_seen": 214432, "step": 474 }, { "epoch": 3.0063291139240507, "grad_norm": 0.7106447815895081, "learning_rate": 4.962019382530521e-05, "loss": 0.7997, "num_input_tokens_seen": 214944, "step": 475 }, { "epoch": 3.037974683544304, "grad_norm": 0.7042839527130127, "learning_rate": 4.959584110680057e-05, "loss": 0.7494, "num_input_tokens_seen": 217280, "step": 480 }, { "epoch": 3.069620253164557, "grad_norm": 0.9107934832572937, "learning_rate": 4.957073807758763e-05, "loss": 0.7534, "num_input_tokens_seen": 219456, "step": 485 }, { "epoch": 3.1012658227848102, "grad_norm": 1.5178989171981812, "learning_rate": 4.9544885503449156e-05, "loss": 0.8619, "num_input_tokens_seen": 221664, "step": 490 }, { "epoch": 3.132911392405063, "grad_norm": 0.7359448075294495, "learning_rate": 4.9518284173033245e-05, "loss": 0.7201, "num_input_tokens_seen": 224064, "step": 495 }, { "epoch": 3.1645569620253164, "grad_norm": 1.1088182926177979, "learning_rate": 4.9490934897829246e-05, "loss": 0.7456, "num_input_tokens_seen": 226336, "step": 500 }, { "epoch": 3.1962025316455698, "grad_norm": 1.2289255857467651, "learning_rate": 4.9462838512143e-05, "loss": 0.8, "num_input_tokens_seen": 228672, "step": 505 }, { "epoch": 3.2278481012658227, "grad_norm": 1.094308853149414, "learning_rate": 4.943399587307141e-05, "loss": 0.7275, "num_input_tokens_seen": 230880, "step": 510 }, { "epoch": 3.259493670886076, "grad_norm": 0.7639238834381104, "learning_rate": 4.940440786047628e-05, "loss": 0.7012, "num_input_tokens_seen": 233056, "step": 515 }, { "epoch": 3.291139240506329, "grad_norm": 1.1105587482452393, "learning_rate": 4.937407537695744e-05, "loss": 0.6919, "num_input_tokens_seen": 235232, "step": 520 }, { "epoch": 3.3227848101265822, "grad_norm": 0.9030940532684326, "learning_rate": 4.93429993478253e-05, "loss": 0.5918, "num_input_tokens_seen": 237472, "step": 525 }, { "epoch": 3.3544303797468356, "grad_norm": 0.8279197216033936, "learning_rate": 4.931118072107255e-05, "loss": 0.6016, "num_input_tokens_seen": 239680, "step": 530 }, { "epoch": 3.3860759493670884, "grad_norm": 0.8506993651390076, "learning_rate": 4.9278620467345225e-05, "loss": 0.6172, "num_input_tokens_seen": 241888, "step": 535 }, { "epoch": 3.4177215189873418, "grad_norm": 0.6411468982696533, "learning_rate": 4.9245319579913185e-05, "loss": 0.5579, "num_input_tokens_seen": 244160, "step": 540 }, { "epoch": 3.449367088607595, "grad_norm": 0.8177703022956848, "learning_rate": 4.921127907463972e-05, "loss": 0.605, "num_input_tokens_seen": 246432, "step": 545 }, { "epoch": 3.481012658227848, "grad_norm": 0.6882438659667969, "learning_rate": 4.9176499989950624e-05, "loss": 0.5855, "num_input_tokens_seen": 248704, "step": 550 }, { "epoch": 3.5126582278481013, "grad_norm": 0.7634233832359314, "learning_rate": 4.9140983386802484e-05, "loss": 0.6133, "num_input_tokens_seen": 250944, "step": 555 }, { "epoch": 3.5443037974683547, "grad_norm": 1.1774110794067383, "learning_rate": 4.910473034865033e-05, "loss": 0.5079, "num_input_tokens_seen": 253248, "step": 560 }, { "epoch": 3.5759493670886076, "grad_norm": 0.7217822074890137, "learning_rate": 4.906774198141456e-05, "loss": 0.5536, "num_input_tokens_seen": 255648, "step": 565 }, { "epoch": 3.607594936708861, "grad_norm": 0.6471545100212097, "learning_rate": 4.903001941344726e-05, "loss": 0.4687, "num_input_tokens_seen": 257952, "step": 570 }, { "epoch": 3.6392405063291138, "grad_norm": 0.8361453413963318, "learning_rate": 4.899156379549769e-05, "loss": 0.5721, "num_input_tokens_seen": 260256, "step": 575 }, { "epoch": 3.670886075949367, "grad_norm": 0.723422646522522, "learning_rate": 4.8952376300677286e-05, "loss": 0.4713, "num_input_tokens_seen": 262688, "step": 580 }, { "epoch": 3.7025316455696204, "grad_norm": 0.9789317846298218, "learning_rate": 4.8912458124423795e-05, "loss": 0.4738, "num_input_tokens_seen": 264960, "step": 585 }, { "epoch": 3.7341772151898733, "grad_norm": 0.8792746067047119, "learning_rate": 4.8871810484464834e-05, "loss": 0.5112, "num_input_tokens_seen": 267136, "step": 590 }, { "epoch": 3.7658227848101267, "grad_norm": 0.7401533722877502, "learning_rate": 4.883043462078075e-05, "loss": 0.5123, "num_input_tokens_seen": 269408, "step": 595 }, { "epoch": 3.7974683544303796, "grad_norm": 0.4692544639110565, "learning_rate": 4.8788331795566785e-05, "loss": 0.429, "num_input_tokens_seen": 271712, "step": 600 }, { "epoch": 3.829113924050633, "grad_norm": 0.6941015124320984, "learning_rate": 4.874550329319457e-05, "loss": 0.4745, "num_input_tokens_seen": 274016, "step": 605 }, { "epoch": 3.8607594936708862, "grad_norm": 0.5532127618789673, "learning_rate": 4.870195042017295e-05, "loss": 0.4464, "num_input_tokens_seen": 276288, "step": 610 }, { "epoch": 3.892405063291139, "grad_norm": 1.0190658569335938, "learning_rate": 4.865767450510813e-05, "loss": 0.444, "num_input_tokens_seen": 278560, "step": 615 }, { "epoch": 3.9240506329113924, "grad_norm": 0.6436709761619568, "learning_rate": 4.861267689866313e-05, "loss": 0.4765, "num_input_tokens_seen": 280864, "step": 620 }, { "epoch": 3.9556962025316453, "grad_norm": 0.8717548251152039, "learning_rate": 4.8566958973516606e-05, "loss": 0.4072, "num_input_tokens_seen": 283072, "step": 625 }, { "epoch": 3.9873417721518987, "grad_norm": 0.5599835515022278, "learning_rate": 4.8520522124320954e-05, "loss": 0.4267, "num_input_tokens_seen": 285376, "step": 630 }, { "epoch": 4.0, "eval_loss": 0.5329218506813049, "eval_runtime": 1.0529, "eval_samples_per_second": 66.48, "eval_steps_per_second": 17.095, "num_input_tokens_seen": 286000, "step": 632 }, { "epoch": 4.018987341772152, "grad_norm": 0.5241405367851257, "learning_rate": 4.847336776765978e-05, "loss": 0.3502, "num_input_tokens_seen": 287280, "step": 635 }, { "epoch": 4.050632911392405, "grad_norm": 0.7361217737197876, "learning_rate": 4.842549734200467e-05, "loss": 0.4278, "num_input_tokens_seen": 289552, "step": 640 }, { "epoch": 4.082278481012658, "grad_norm": 0.4765454828739166, "learning_rate": 4.837691230767133e-05, "loss": 0.3713, "num_input_tokens_seen": 291824, "step": 645 }, { "epoch": 4.113924050632911, "grad_norm": 0.4159042537212372, "learning_rate": 4.832761414677503e-05, "loss": 0.3156, "num_input_tokens_seen": 294224, "step": 650 }, { "epoch": 4.1455696202531644, "grad_norm": 0.8802501559257507, "learning_rate": 4.827760436318537e-05, "loss": 0.3747, "num_input_tokens_seen": 296496, "step": 655 }, { "epoch": 4.177215189873418, "grad_norm": 0.6730992197990417, "learning_rate": 4.8226884482480436e-05, "loss": 0.4042, "num_input_tokens_seen": 298896, "step": 660 }, { "epoch": 4.208860759493671, "grad_norm": 0.5022607445716858, "learning_rate": 4.8175456051900256e-05, "loss": 0.3704, "num_input_tokens_seen": 301200, "step": 665 }, { "epoch": 4.2405063291139244, "grad_norm": 0.45622891187667847, "learning_rate": 4.8123320640299576e-05, "loss": 0.3836, "num_input_tokens_seen": 303376, "step": 670 }, { "epoch": 4.272151898734177, "grad_norm": 0.4927991032600403, "learning_rate": 4.807047983810002e-05, "loss": 0.2699, "num_input_tokens_seen": 305744, "step": 675 }, { "epoch": 4.30379746835443, "grad_norm": 0.965599000453949, "learning_rate": 4.801693525724157e-05, "loss": 0.3518, "num_input_tokens_seen": 307920, "step": 680 }, { "epoch": 4.3354430379746836, "grad_norm": 0.7399435043334961, "learning_rate": 4.79626885311334e-05, "loss": 0.3502, "num_input_tokens_seen": 310224, "step": 685 }, { "epoch": 4.367088607594937, "grad_norm": 0.4212355315685272, "learning_rate": 4.790774131460403e-05, "loss": 0.2208, "num_input_tokens_seen": 312464, "step": 690 }, { "epoch": 4.39873417721519, "grad_norm": 0.6757424473762512, "learning_rate": 4.7852095283850866e-05, "loss": 0.349, "num_input_tokens_seen": 314704, "step": 695 }, { "epoch": 4.430379746835443, "grad_norm": 0.47920024394989014, "learning_rate": 4.779575213638906e-05, "loss": 0.2892, "num_input_tokens_seen": 317008, "step": 700 }, { "epoch": 4.462025316455696, "grad_norm": 0.3310740888118744, "learning_rate": 4.7738713590999695e-05, "loss": 0.2641, "num_input_tokens_seen": 319248, "step": 705 }, { "epoch": 4.493670886075949, "grad_norm": 0.5297502279281616, "learning_rate": 4.768098138767741e-05, "loss": 0.4152, "num_input_tokens_seen": 321552, "step": 710 }, { "epoch": 4.525316455696203, "grad_norm": 0.9140908122062683, "learning_rate": 4.7622557287577284e-05, "loss": 0.4117, "num_input_tokens_seen": 323728, "step": 715 }, { "epoch": 4.556962025316456, "grad_norm": 0.5034212470054626, "learning_rate": 4.75634430729611e-05, "loss": 0.256, "num_input_tokens_seen": 326000, "step": 720 }, { "epoch": 4.588607594936709, "grad_norm": 0.43623366951942444, "learning_rate": 4.750364054714302e-05, "loss": 0.2728, "num_input_tokens_seen": 328368, "step": 725 }, { "epoch": 4.620253164556962, "grad_norm": 0.3411039710044861, "learning_rate": 4.744315153443452e-05, "loss": 0.1897, "num_input_tokens_seen": 330576, "step": 730 }, { "epoch": 4.651898734177215, "grad_norm": 0.3897060453891754, "learning_rate": 4.7381977880088786e-05, "loss": 0.228, "num_input_tokens_seen": 332784, "step": 735 }, { "epoch": 4.6835443037974684, "grad_norm": 0.48439887166023254, "learning_rate": 4.7320121450244394e-05, "loss": 0.2449, "num_input_tokens_seen": 335152, "step": 740 }, { "epoch": 4.715189873417722, "grad_norm": 0.37089109420776367, "learning_rate": 4.725758413186838e-05, "loss": 0.243, "num_input_tokens_seen": 337456, "step": 745 }, { "epoch": 4.746835443037975, "grad_norm": 0.5262792110443115, "learning_rate": 4.719436783269872e-05, "loss": 0.295, "num_input_tokens_seen": 339792, "step": 750 }, { "epoch": 4.7784810126582276, "grad_norm": 0.34344956278800964, "learning_rate": 4.713047448118606e-05, "loss": 0.2026, "num_input_tokens_seen": 342032, "step": 755 }, { "epoch": 4.810126582278481, "grad_norm": 0.3643595278263092, "learning_rate": 4.706590602643497e-05, "loss": 0.2585, "num_input_tokens_seen": 344304, "step": 760 }, { "epoch": 4.841772151898734, "grad_norm": 0.7422813773155212, "learning_rate": 4.700066443814443e-05, "loss": 0.3176, "num_input_tokens_seen": 346640, "step": 765 }, { "epoch": 4.8734177215189876, "grad_norm": 0.5333808660507202, "learning_rate": 4.6934751706547724e-05, "loss": 0.2783, "num_input_tokens_seen": 348880, "step": 770 }, { "epoch": 4.905063291139241, "grad_norm": 0.3037487268447876, "learning_rate": 4.6868169842351804e-05, "loss": 0.2517, "num_input_tokens_seen": 351248, "step": 775 }, { "epoch": 4.936708860759493, "grad_norm": 0.7458473443984985, "learning_rate": 4.6800920876675905e-05, "loss": 0.3091, "num_input_tokens_seen": 353552, "step": 780 }, { "epoch": 4.968354430379747, "grad_norm": 0.7682437300682068, "learning_rate": 4.673300686098957e-05, "loss": 0.2402, "num_input_tokens_seen": 355888, "step": 785 }, { "epoch": 5.0, "grad_norm": 0.9256905913352966, "learning_rate": 4.6664429867050077e-05, "loss": 0.2682, "num_input_tokens_seen": 357888, "step": 790 }, { "epoch": 5.0, "eval_loss": 0.37361642718315125, "eval_runtime": 1.0557, "eval_samples_per_second": 66.31, "eval_steps_per_second": 17.051, "num_input_tokens_seen": 357888, "step": 790 }, { "epoch": 5.031645569620253, "grad_norm": 0.346300333738327, "learning_rate": 4.659519198683928e-05, "loss": 0.1705, "num_input_tokens_seen": 360096, "step": 795 }, { "epoch": 5.063291139240507, "grad_norm": 0.659578800201416, "learning_rate": 4.652529533249974e-05, "loss": 0.2372, "num_input_tokens_seen": 362432, "step": 800 }, { "epoch": 5.094936708860759, "grad_norm": 0.7640724778175354, "learning_rate": 4.6454742036270326e-05, "loss": 0.2801, "num_input_tokens_seen": 364768, "step": 805 }, { "epoch": 5.1265822784810124, "grad_norm": 0.2766515612602234, "learning_rate": 4.638353425042114e-05, "loss": 0.1861, "num_input_tokens_seen": 366976, "step": 810 }, { "epoch": 5.158227848101266, "grad_norm": 0.2714124321937561, "learning_rate": 4.631167414718788e-05, "loss": 0.1818, "num_input_tokens_seen": 369184, "step": 815 }, { "epoch": 5.189873417721519, "grad_norm": 1.5225164890289307, "learning_rate": 4.62391639187056e-05, "loss": 0.2337, "num_input_tokens_seen": 371424, "step": 820 }, { "epoch": 5.2215189873417724, "grad_norm": 0.3433351218700409, "learning_rate": 4.616600577694177e-05, "loss": 0.1512, "num_input_tokens_seen": 373600, "step": 825 }, { "epoch": 5.253164556962025, "grad_norm": 0.2986677587032318, "learning_rate": 4.609220195362886e-05, "loss": 0.2053, "num_input_tokens_seen": 375904, "step": 830 }, { "epoch": 5.284810126582278, "grad_norm": 0.5398145318031311, "learning_rate": 4.601775470019625e-05, "loss": 0.2221, "num_input_tokens_seen": 378336, "step": 835 }, { "epoch": 5.3164556962025316, "grad_norm": 0.5913758873939514, "learning_rate": 4.594266628770151e-05, "loss": 0.2418, "num_input_tokens_seen": 380672, "step": 840 }, { "epoch": 5.348101265822785, "grad_norm": 0.34356844425201416, "learning_rate": 4.586693900676116e-05, "loss": 0.1909, "num_input_tokens_seen": 382976, "step": 845 }, { "epoch": 5.379746835443038, "grad_norm": 0.2539352774620056, "learning_rate": 4.579057516748078e-05, "loss": 0.2271, "num_input_tokens_seen": 385152, "step": 850 }, { "epoch": 5.4113924050632916, "grad_norm": 0.26296737790107727, "learning_rate": 4.571357709938455e-05, "loss": 0.2083, "num_input_tokens_seen": 387488, "step": 855 }, { "epoch": 5.443037974683544, "grad_norm": 0.42699864506721497, "learning_rate": 4.5635947151344164e-05, "loss": 0.1787, "num_input_tokens_seen": 389696, "step": 860 }, { "epoch": 5.474683544303797, "grad_norm": 0.4503214359283447, "learning_rate": 4.555768769150718e-05, "loss": 0.2314, "num_input_tokens_seen": 392000, "step": 865 }, { "epoch": 5.506329113924051, "grad_norm": 0.3542090058326721, "learning_rate": 4.54788011072248e-05, "loss": 0.1565, "num_input_tokens_seen": 394240, "step": 870 }, { "epoch": 5.537974683544304, "grad_norm": 0.4513137638568878, "learning_rate": 4.539928980497903e-05, "loss": 0.1935, "num_input_tokens_seen": 396512, "step": 875 }, { "epoch": 5.569620253164557, "grad_norm": 0.5062257051467896, "learning_rate": 4.531915621030925e-05, "loss": 0.2155, "num_input_tokens_seen": 398880, "step": 880 }, { "epoch": 5.60126582278481, "grad_norm": 0.28146153688430786, "learning_rate": 4.523840276773828e-05, "loss": 0.1534, "num_input_tokens_seen": 401184, "step": 885 }, { "epoch": 5.632911392405063, "grad_norm": 0.7047305703163147, "learning_rate": 4.5157031940697715e-05, "loss": 0.1973, "num_input_tokens_seen": 403360, "step": 890 }, { "epoch": 5.6645569620253164, "grad_norm": 0.23154175281524658, "learning_rate": 4.507504621145286e-05, "loss": 0.2274, "num_input_tokens_seen": 405760, "step": 895 }, { "epoch": 5.69620253164557, "grad_norm": 0.8780015707015991, "learning_rate": 4.499244808102697e-05, "loss": 0.2635, "num_input_tokens_seen": 408096, "step": 900 }, { "epoch": 5.727848101265823, "grad_norm": 0.23884202539920807, "learning_rate": 4.4909240069124966e-05, "loss": 0.1505, "num_input_tokens_seen": 410432, "step": 905 }, { "epoch": 5.759493670886076, "grad_norm": 0.7365670800209045, "learning_rate": 4.482542471405655e-05, "loss": 0.1754, "num_input_tokens_seen": 412608, "step": 910 }, { "epoch": 5.791139240506329, "grad_norm": 0.3029552400112152, "learning_rate": 4.4741004572658804e-05, "loss": 0.1886, "num_input_tokens_seen": 414784, "step": 915 }, { "epoch": 5.822784810126582, "grad_norm": 0.5446221232414246, "learning_rate": 4.4655982220218176e-05, "loss": 0.1609, "num_input_tokens_seen": 417024, "step": 920 }, { "epoch": 5.8544303797468356, "grad_norm": 1.0820600986480713, "learning_rate": 4.4570360250391884e-05, "loss": 0.1698, "num_input_tokens_seen": 419296, "step": 925 }, { "epoch": 5.886075949367089, "grad_norm": 1.1907758712768555, "learning_rate": 4.448414127512889e-05, "loss": 0.28, "num_input_tokens_seen": 421504, "step": 930 }, { "epoch": 5.917721518987342, "grad_norm": 1.0096385478973389, "learning_rate": 4.43973279245901e-05, "loss": 0.1373, "num_input_tokens_seen": 423712, "step": 935 }, { "epoch": 5.949367088607595, "grad_norm": 0.45004627108573914, "learning_rate": 4.430992284706823e-05, "loss": 0.187, "num_input_tokens_seen": 426016, "step": 940 }, { "epoch": 5.981012658227848, "grad_norm": 0.6072269678115845, "learning_rate": 4.422192870890697e-05, "loss": 0.1618, "num_input_tokens_seen": 428352, "step": 945 }, { "epoch": 6.0, "eval_loss": 0.3068278133869171, "eval_runtime": 1.0473, "eval_samples_per_second": 66.84, "eval_steps_per_second": 17.187, "num_input_tokens_seen": 429456, "step": 948 }, { "epoch": 6.012658227848101, "grad_norm": 0.7842978239059448, "learning_rate": 4.413334819441963e-05, "loss": 0.1625, "num_input_tokens_seen": 430416, "step": 950 }, { "epoch": 6.044303797468355, "grad_norm": 0.5223548412322998, "learning_rate": 4.404418400580732e-05, "loss": 0.1524, "num_input_tokens_seen": 432624, "step": 955 }, { "epoch": 6.075949367088608, "grad_norm": 0.36074554920196533, "learning_rate": 4.395443886307644e-05, "loss": 0.2179, "num_input_tokens_seen": 434960, "step": 960 }, { "epoch": 6.1075949367088604, "grad_norm": 0.8336349725723267, "learning_rate": 4.386411550395576e-05, "loss": 0.1414, "num_input_tokens_seen": 437200, "step": 965 }, { "epoch": 6.139240506329114, "grad_norm": 0.27555224299430847, "learning_rate": 4.3773216683812856e-05, "loss": 0.1192, "num_input_tokens_seen": 439376, "step": 970 }, { "epoch": 6.170886075949367, "grad_norm": 0.3605653643608093, "learning_rate": 4.368174517557012e-05, "loss": 0.1751, "num_input_tokens_seen": 441744, "step": 975 }, { "epoch": 6.2025316455696204, "grad_norm": 0.4451218843460083, "learning_rate": 4.3589703769620115e-05, "loss": 0.122, "num_input_tokens_seen": 443952, "step": 980 }, { "epoch": 6.234177215189874, "grad_norm": 0.46900850534439087, "learning_rate": 4.349709527374046e-05, "loss": 0.1232, "num_input_tokens_seen": 446256, "step": 985 }, { "epoch": 6.265822784810126, "grad_norm": 0.6415440440177917, "learning_rate": 4.340392251300822e-05, "loss": 0.2038, "num_input_tokens_seen": 448560, "step": 990 }, { "epoch": 6.2974683544303796, "grad_norm": 0.8457127213478088, "learning_rate": 4.3310188329713665e-05, "loss": 0.2031, "num_input_tokens_seen": 450640, "step": 995 }, { "epoch": 6.329113924050633, "grad_norm": 0.18252256512641907, "learning_rate": 4.3215895583273596e-05, "loss": 0.1741, "num_input_tokens_seen": 452912, "step": 1000 }, { "epoch": 6.360759493670886, "grad_norm": 0.5281898975372314, "learning_rate": 4.3121047150144137e-05, "loss": 0.1652, "num_input_tokens_seen": 455280, "step": 1005 }, { "epoch": 6.3924050632911396, "grad_norm": 1.0520316362380981, "learning_rate": 4.3025645923732926e-05, "loss": 0.176, "num_input_tokens_seen": 457584, "step": 1010 }, { "epoch": 6.424050632911392, "grad_norm": 0.27545469999313354, "learning_rate": 4.292969481431092e-05, "loss": 0.2194, "num_input_tokens_seen": 459792, "step": 1015 }, { "epoch": 6.455696202531645, "grad_norm": 0.2468576431274414, "learning_rate": 4.283319674892358e-05, "loss": 0.1489, "num_input_tokens_seen": 462064, "step": 1020 }, { "epoch": 6.487341772151899, "grad_norm": 0.7657990455627441, "learning_rate": 4.2736154671301556e-05, "loss": 0.1609, "num_input_tokens_seen": 464272, "step": 1025 }, { "epoch": 6.518987341772152, "grad_norm": 0.6345427632331848, "learning_rate": 4.2638571541770956e-05, "loss": 0.1173, "num_input_tokens_seen": 466640, "step": 1030 }, { "epoch": 6.550632911392405, "grad_norm": 1.2554429769515991, "learning_rate": 4.254045033716295e-05, "loss": 0.2128, "num_input_tokens_seen": 468848, "step": 1035 }, { "epoch": 6.582278481012658, "grad_norm": 0.1956392228603363, "learning_rate": 4.244179405072305e-05, "loss": 0.1617, "num_input_tokens_seen": 471216, "step": 1040 }, { "epoch": 6.613924050632911, "grad_norm": 0.4352739453315735, "learning_rate": 4.234260569201973e-05, "loss": 0.1051, "num_input_tokens_seen": 473552, "step": 1045 }, { "epoch": 6.6455696202531644, "grad_norm": 0.5126502513885498, "learning_rate": 4.224288828685264e-05, "loss": 0.1378, "num_input_tokens_seen": 475792, "step": 1050 }, { "epoch": 6.677215189873418, "grad_norm": 0.43032675981521606, "learning_rate": 4.214264487716033e-05, "loss": 0.1491, "num_input_tokens_seen": 478096, "step": 1055 }, { "epoch": 6.708860759493671, "grad_norm": 0.18930715322494507, "learning_rate": 4.204187852092741e-05, "loss": 0.1367, "num_input_tokens_seen": 480400, "step": 1060 }, { "epoch": 6.740506329113924, "grad_norm": 0.646236777305603, "learning_rate": 4.1940592292091297e-05, "loss": 0.2204, "num_input_tokens_seen": 482704, "step": 1065 }, { "epoch": 6.772151898734177, "grad_norm": 0.7857571244239807, "learning_rate": 4.183878928044842e-05, "loss": 0.0956, "num_input_tokens_seen": 484848, "step": 1070 }, { "epoch": 6.80379746835443, "grad_norm": 0.8375144600868225, "learning_rate": 4.173647259155997e-05, "loss": 0.1252, "num_input_tokens_seen": 487152, "step": 1075 }, { "epoch": 6.8354430379746836, "grad_norm": 0.2958216965198517, "learning_rate": 4.163364534665718e-05, "loss": 0.098, "num_input_tokens_seen": 489360, "step": 1080 }, { "epoch": 6.867088607594937, "grad_norm": 0.17427486181259155, "learning_rate": 4.15303106825461e-05, "loss": 0.1376, "num_input_tokens_seen": 491664, "step": 1085 }, { "epoch": 6.89873417721519, "grad_norm": 0.5177532434463501, "learning_rate": 4.142647175151188e-05, "loss": 0.2165, "num_input_tokens_seen": 493968, "step": 1090 }, { "epoch": 6.930379746835443, "grad_norm": 1.2495945692062378, "learning_rate": 4.1322131721222635e-05, "loss": 0.1885, "num_input_tokens_seen": 496272, "step": 1095 }, { "epoch": 6.962025316455696, "grad_norm": 0.21747252345085144, "learning_rate": 4.121729377463285e-05, "loss": 0.1042, "num_input_tokens_seen": 498576, "step": 1100 }, { "epoch": 6.993670886075949, "grad_norm": 0.5200768113136292, "learning_rate": 4.1111961109886196e-05, "loss": 0.1109, "num_input_tokens_seen": 500912, "step": 1105 }, { "epoch": 7.0, "eval_loss": 0.27513089776039124, "eval_runtime": 1.0553, "eval_samples_per_second": 66.332, "eval_steps_per_second": 17.057, "num_input_tokens_seen": 501136, "step": 1106 }, { "epoch": 7.025316455696203, "grad_norm": 0.4884664714336395, "learning_rate": 4.100613694021803e-05, "loss": 0.0915, "num_input_tokens_seen": 502928, "step": 1110 }, { "epoch": 7.056962025316456, "grad_norm": 0.5775274038314819, "learning_rate": 4.089982449385736e-05, "loss": 0.1525, "num_input_tokens_seen": 505232, "step": 1115 }, { "epoch": 7.0886075949367084, "grad_norm": 0.24733032286167145, "learning_rate": 4.079302701392837e-05, "loss": 0.2347, "num_input_tokens_seen": 507408, "step": 1120 }, { "epoch": 7.120253164556962, "grad_norm": 0.7950642108917236, "learning_rate": 4.068574775835145e-05, "loss": 0.2623, "num_input_tokens_seen": 509680, "step": 1125 }, { "epoch": 7.151898734177215, "grad_norm": 0.4400986433029175, "learning_rate": 4.0577989999743894e-05, "loss": 0.1067, "num_input_tokens_seen": 511920, "step": 1130 }, { "epoch": 7.1835443037974684, "grad_norm": 0.6501129269599915, "learning_rate": 4.0469757025319955e-05, "loss": 0.117, "num_input_tokens_seen": 514256, "step": 1135 }, { "epoch": 7.215189873417722, "grad_norm": 0.43046820163726807, "learning_rate": 4.036105213679069e-05, "loss": 0.1245, "num_input_tokens_seen": 516560, "step": 1140 }, { "epoch": 7.246835443037975, "grad_norm": 0.3614645004272461, "learning_rate": 4.025187865026311e-05, "loss": 0.1377, "num_input_tokens_seen": 518864, "step": 1145 }, { "epoch": 7.2784810126582276, "grad_norm": 0.700169026851654, "learning_rate": 4.014223989613914e-05, "loss": 0.1243, "num_input_tokens_seen": 521232, "step": 1150 }, { "epoch": 7.310126582278481, "grad_norm": 0.17842534184455872, "learning_rate": 4.0032139219013934e-05, "loss": 0.1197, "num_input_tokens_seen": 523536, "step": 1155 }, { "epoch": 7.341772151898734, "grad_norm": 0.2855094373226166, "learning_rate": 3.992157997757389e-05, "loss": 0.1015, "num_input_tokens_seen": 525840, "step": 1160 }, { "epoch": 7.3734177215189876, "grad_norm": 0.2117883712053299, "learning_rate": 3.98105655444942e-05, "loss": 0.0884, "num_input_tokens_seen": 528144, "step": 1165 }, { "epoch": 7.405063291139241, "grad_norm": 0.4648926556110382, "learning_rate": 3.969909930633591e-05, "loss": 0.1118, "num_input_tokens_seen": 530320, "step": 1170 }, { "epoch": 7.436708860759493, "grad_norm": 0.45844343304634094, "learning_rate": 3.958718466344269e-05, "loss": 0.1863, "num_input_tokens_seen": 532592, "step": 1175 }, { "epoch": 7.468354430379747, "grad_norm": 0.7561975121498108, "learning_rate": 3.947482502983702e-05, "loss": 0.1319, "num_input_tokens_seen": 534960, "step": 1180 }, { "epoch": 7.5, "grad_norm": 0.40282535552978516, "learning_rate": 3.9362023833116125e-05, "loss": 0.1286, "num_input_tokens_seen": 537232, "step": 1185 }, { "epoch": 7.531645569620253, "grad_norm": 0.3494255244731903, "learning_rate": 3.924878451434735e-05, "loss": 0.1285, "num_input_tokens_seen": 539504, "step": 1190 }, { "epoch": 7.563291139240507, "grad_norm": 0.27715620398521423, "learning_rate": 3.9135110527963224e-05, "loss": 0.1077, "num_input_tokens_seen": 541680, "step": 1195 }, { "epoch": 7.594936708860759, "grad_norm": 0.32365524768829346, "learning_rate": 3.902100534165606e-05, "loss": 0.1347, "num_input_tokens_seen": 543984, "step": 1200 }, { "epoch": 7.6265822784810124, "grad_norm": 0.39298486709594727, "learning_rate": 3.890647243627218e-05, "loss": 0.1948, "num_input_tokens_seen": 546224, "step": 1205 }, { "epoch": 7.658227848101266, "grad_norm": 0.35612449049949646, "learning_rate": 3.879151530570574e-05, "loss": 0.1253, "num_input_tokens_seen": 548496, "step": 1210 }, { "epoch": 7.689873417721519, "grad_norm": 0.2790282964706421, "learning_rate": 3.867613745679213e-05, "loss": 0.1034, "num_input_tokens_seen": 550704, "step": 1215 }, { "epoch": 7.7215189873417724, "grad_norm": 0.5812414288520813, "learning_rate": 3.856034240920099e-05, "loss": 0.1576, "num_input_tokens_seen": 552976, "step": 1220 }, { "epoch": 7.753164556962025, "grad_norm": 0.15643061697483063, "learning_rate": 3.844413369532889e-05, "loss": 0.1788, "num_input_tokens_seen": 555312, "step": 1225 }, { "epoch": 7.784810126582278, "grad_norm": 0.2115858644247055, "learning_rate": 3.8327514860191496e-05, "loss": 0.1025, "num_input_tokens_seen": 557680, "step": 1230 }, { "epoch": 7.8164556962025316, "grad_norm": 1.532456398010254, "learning_rate": 3.821048946131549e-05, "loss": 0.1279, "num_input_tokens_seen": 559984, "step": 1235 }, { "epoch": 7.848101265822785, "grad_norm": 0.20506373047828674, "learning_rate": 3.809306106863e-05, "loss": 0.112, "num_input_tokens_seen": 562256, "step": 1240 }, { "epoch": 7.879746835443038, "grad_norm": 0.10853352397680283, "learning_rate": 3.7975233264357755e-05, "loss": 0.1182, "num_input_tokens_seen": 564560, "step": 1245 }, { "epoch": 7.911392405063291, "grad_norm": 0.18702203035354614, "learning_rate": 3.785700964290572e-05, "loss": 0.0843, "num_input_tokens_seen": 566896, "step": 1250 }, { "epoch": 7.943037974683544, "grad_norm": 0.36072391271591187, "learning_rate": 3.773839381075555e-05, "loss": 0.1175, "num_input_tokens_seen": 569200, "step": 1255 }, { "epoch": 7.974683544303797, "grad_norm": 0.21678870916366577, "learning_rate": 3.7619389386353477e-05, "loss": 0.1032, "num_input_tokens_seen": 571536, "step": 1260 }, { "epoch": 8.0, "eval_loss": 0.25815248489379883, "eval_runtime": 1.0528, "eval_samples_per_second": 66.487, "eval_steps_per_second": 17.097, "num_input_tokens_seen": 573104, "step": 1264 }, { "epoch": 8.00632911392405, "grad_norm": 0.3502642810344696, "learning_rate": 3.7500000000000003e-05, "loss": 0.144, "num_input_tokens_seen": 573616, "step": 1265 }, { "epoch": 8.037974683544304, "grad_norm": 0.37402281165122986, "learning_rate": 3.73802292937391e-05, "loss": 0.0827, "num_input_tokens_seen": 575824, "step": 1270 }, { "epoch": 8.069620253164556, "grad_norm": 0.4159397482872009, "learning_rate": 3.726008092124714e-05, "loss": 0.0999, "num_input_tokens_seen": 578032, "step": 1275 }, { "epoch": 8.10126582278481, "grad_norm": 0.4707047641277313, "learning_rate": 3.713955854772144e-05, "loss": 0.1416, "num_input_tokens_seen": 580464, "step": 1280 }, { "epoch": 8.132911392405063, "grad_norm": 0.24735240638256073, "learning_rate": 3.701866584976839e-05, "loss": 0.1626, "num_input_tokens_seen": 582640, "step": 1285 }, { "epoch": 8.164556962025316, "grad_norm": 0.37268760800361633, "learning_rate": 3.689740651529141e-05, "loss": 0.111, "num_input_tokens_seen": 584976, "step": 1290 }, { "epoch": 8.19620253164557, "grad_norm": 0.2798592746257782, "learning_rate": 3.6775784243378354e-05, "loss": 0.0814, "num_input_tokens_seen": 587120, "step": 1295 }, { "epoch": 8.227848101265822, "grad_norm": 0.8058192729949951, "learning_rate": 3.665380274418869e-05, "loss": 0.1267, "num_input_tokens_seen": 589392, "step": 1300 }, { "epoch": 8.259493670886076, "grad_norm": 0.1984112709760666, "learning_rate": 3.6531465738840336e-05, "loss": 0.1096, "num_input_tokens_seen": 591632, "step": 1305 }, { "epoch": 8.291139240506329, "grad_norm": 0.5250291228294373, "learning_rate": 3.640877695929614e-05, "loss": 0.1742, "num_input_tokens_seen": 594000, "step": 1310 }, { "epoch": 8.322784810126583, "grad_norm": 0.6921360492706299, "learning_rate": 3.628574014825004e-05, "loss": 0.1246, "num_input_tokens_seen": 596368, "step": 1315 }, { "epoch": 8.354430379746836, "grad_norm": 0.6403974890708923, "learning_rate": 3.616235905901284e-05, "loss": 0.143, "num_input_tokens_seen": 598672, "step": 1320 }, { "epoch": 8.386075949367088, "grad_norm": 0.4208763539791107, "learning_rate": 3.60386374553978e-05, "loss": 0.0957, "num_input_tokens_seen": 601040, "step": 1325 }, { "epoch": 8.417721518987342, "grad_norm": 0.3937915563583374, "learning_rate": 3.591457911160575e-05, "loss": 0.1077, "num_input_tokens_seen": 603344, "step": 1330 }, { "epoch": 8.449367088607595, "grad_norm": 0.38302183151245117, "learning_rate": 3.579018781210999e-05, "loss": 0.1789, "num_input_tokens_seen": 605552, "step": 1335 }, { "epoch": 8.481012658227849, "grad_norm": 0.22398924827575684, "learning_rate": 3.566546735154082e-05, "loss": 0.1375, "num_input_tokens_seen": 607760, "step": 1340 }, { "epoch": 8.512658227848101, "grad_norm": 0.2035192847251892, "learning_rate": 3.55404215345698e-05, "loss": 0.0941, "num_input_tokens_seen": 610192, "step": 1345 }, { "epoch": 8.544303797468354, "grad_norm": 0.42479661107063293, "learning_rate": 3.541505417579366e-05, "loss": 0.1073, "num_input_tokens_seen": 612528, "step": 1350 }, { "epoch": 8.575949367088608, "grad_norm": 0.38617411255836487, "learning_rate": 3.528936909961801e-05, "loss": 0.1322, "num_input_tokens_seen": 614768, "step": 1355 }, { "epoch": 8.60759493670886, "grad_norm": 0.9132348895072937, "learning_rate": 3.5163370140140545e-05, "loss": 0.1665, "num_input_tokens_seen": 616912, "step": 1360 }, { "epoch": 8.639240506329115, "grad_norm": 0.6936427354812622, "learning_rate": 3.50370611410342e-05, "loss": 0.0737, "num_input_tokens_seen": 619088, "step": 1365 }, { "epoch": 8.670886075949367, "grad_norm": 0.5526509881019592, "learning_rate": 3.4910445955429854e-05, "loss": 0.1254, "num_input_tokens_seen": 621328, "step": 1370 }, { "epoch": 8.70253164556962, "grad_norm": 0.37854841351509094, "learning_rate": 3.478352844579876e-05, "loss": 0.0789, "num_input_tokens_seen": 623536, "step": 1375 }, { "epoch": 8.734177215189874, "grad_norm": 0.15246200561523438, "learning_rate": 3.465631248383477e-05, "loss": 0.2085, "num_input_tokens_seen": 625744, "step": 1380 }, { "epoch": 8.765822784810126, "grad_norm": 0.42943328619003296, "learning_rate": 3.4528801950336174e-05, "loss": 0.1199, "num_input_tokens_seen": 628144, "step": 1385 }, { "epoch": 8.79746835443038, "grad_norm": 0.28464996814727783, "learning_rate": 3.4401000735087384e-05, "loss": 0.1366, "num_input_tokens_seen": 630448, "step": 1390 }, { "epoch": 8.829113924050633, "grad_norm": 0.22061419486999512, "learning_rate": 3.4272912736740185e-05, "loss": 0.0714, "num_input_tokens_seen": 632720, "step": 1395 }, { "epoch": 8.860759493670885, "grad_norm": 0.22898775339126587, "learning_rate": 3.414454186269489e-05, "loss": 0.1177, "num_input_tokens_seen": 634960, "step": 1400 }, { "epoch": 8.89240506329114, "grad_norm": 0.1717626303434372, "learning_rate": 3.401589202898107e-05, "loss": 0.072, "num_input_tokens_seen": 637328, "step": 1405 }, { "epoch": 8.924050632911392, "grad_norm": 0.25118422508239746, "learning_rate": 3.388696716013813e-05, "loss": 0.0858, "num_input_tokens_seen": 639600, "step": 1410 }, { "epoch": 8.955696202531646, "grad_norm": 0.4251887798309326, "learning_rate": 3.375777118909561e-05, "loss": 0.0979, "num_input_tokens_seen": 641808, "step": 1415 }, { "epoch": 8.987341772151899, "grad_norm": 0.443633496761322, "learning_rate": 3.3628308057053164e-05, "loss": 0.1209, "num_input_tokens_seen": 644080, "step": 1420 }, { "epoch": 9.0, "eval_loss": 0.2467295378446579, "eval_runtime": 1.0499, "eval_samples_per_second": 66.675, "eval_steps_per_second": 17.145, "num_input_tokens_seen": 644752, "step": 1422 }, { "epoch": 9.018987341772151, "grad_norm": 1.6179637908935547, "learning_rate": 3.349858171336035e-05, "loss": 0.2166, "num_input_tokens_seen": 646128, "step": 1425 }, { "epoch": 9.050632911392405, "grad_norm": 0.309814453125, "learning_rate": 3.3368596115396164e-05, "loss": 0.0861, "num_input_tokens_seen": 648336, "step": 1430 }, { "epoch": 9.082278481012658, "grad_norm": 0.428047239780426, "learning_rate": 3.32383552284483e-05, "loss": 0.0808, "num_input_tokens_seen": 650640, "step": 1435 }, { "epoch": 9.113924050632912, "grad_norm": 0.11092729866504669, "learning_rate": 3.3107863025592186e-05, "loss": 0.1114, "num_input_tokens_seen": 652880, "step": 1440 }, { "epoch": 9.145569620253164, "grad_norm": 0.13635869324207306, "learning_rate": 3.297712348756982e-05, "loss": 0.0745, "num_input_tokens_seen": 655056, "step": 1445 }, { "epoch": 9.177215189873417, "grad_norm": 0.800934374332428, "learning_rate": 3.284614060266825e-05, "loss": 0.1154, "num_input_tokens_seen": 657328, "step": 1450 }, { "epoch": 9.208860759493671, "grad_norm": 0.09133181720972061, "learning_rate": 3.271491836659803e-05, "loss": 0.1015, "num_input_tokens_seen": 659632, "step": 1455 }, { "epoch": 9.240506329113924, "grad_norm": 0.40369704365730286, "learning_rate": 3.258346078237122e-05, "loss": 0.0937, "num_input_tokens_seen": 662000, "step": 1460 }, { "epoch": 9.272151898734178, "grad_norm": 0.6892872452735901, "learning_rate": 3.2451771860179326e-05, "loss": 0.08, "num_input_tokens_seen": 664208, "step": 1465 }, { "epoch": 9.30379746835443, "grad_norm": 0.49290552735328674, "learning_rate": 3.2319855617270956e-05, "loss": 0.1161, "num_input_tokens_seen": 666672, "step": 1470 }, { "epoch": 9.335443037974684, "grad_norm": 0.2779461443424225, "learning_rate": 3.218771607782929e-05, "loss": 0.0767, "num_input_tokens_seen": 668944, "step": 1475 }, { "epoch": 9.367088607594937, "grad_norm": 0.40174365043640137, "learning_rate": 3.205535727284927e-05, "loss": 0.1097, "num_input_tokens_seen": 671216, "step": 1480 }, { "epoch": 9.39873417721519, "grad_norm": 0.6387947797775269, "learning_rate": 3.192278324001467e-05, "loss": 0.1139, "num_input_tokens_seen": 673424, "step": 1485 }, { "epoch": 9.430379746835444, "grad_norm": 0.2569015920162201, "learning_rate": 3.178999802357493e-05, "loss": 0.0626, "num_input_tokens_seen": 675600, "step": 1490 }, { "epoch": 9.462025316455696, "grad_norm": 0.7649990320205688, "learning_rate": 3.1657005674221786e-05, "loss": 0.1451, "num_input_tokens_seen": 677872, "step": 1495 }, { "epoch": 9.49367088607595, "grad_norm": 0.2705703377723694, "learning_rate": 3.1523810248965635e-05, "loss": 0.0856, "num_input_tokens_seen": 680080, "step": 1500 }, { "epoch": 9.525316455696203, "grad_norm": 0.21918971836566925, "learning_rate": 3.139041581101187e-05, "loss": 0.0574, "num_input_tokens_seen": 682288, "step": 1505 }, { "epoch": 9.556962025316455, "grad_norm": 0.6609599590301514, "learning_rate": 3.125682642963686e-05, "loss": 0.1585, "num_input_tokens_seen": 684656, "step": 1510 }, { "epoch": 9.58860759493671, "grad_norm": 0.8386855125427246, "learning_rate": 3.112304618006387e-05, "loss": 0.0919, "num_input_tokens_seen": 686896, "step": 1515 }, { "epoch": 9.620253164556962, "grad_norm": 0.33244284987449646, "learning_rate": 3.098907914333867e-05, "loss": 0.0666, "num_input_tokens_seen": 689104, "step": 1520 }, { "epoch": 9.651898734177216, "grad_norm": 0.1733875572681427, "learning_rate": 3.085492940620511e-05, "loss": 0.0767, "num_input_tokens_seen": 691376, "step": 1525 }, { "epoch": 9.683544303797468, "grad_norm": 0.4196367859840393, "learning_rate": 3.072060106098042e-05, "loss": 0.1023, "num_input_tokens_seen": 693744, "step": 1530 }, { "epoch": 9.715189873417721, "grad_norm": 0.36132556200027466, "learning_rate": 3.0586098205430355e-05, "loss": 0.1268, "num_input_tokens_seen": 695984, "step": 1535 }, { "epoch": 9.746835443037975, "grad_norm": 0.670192301273346, "learning_rate": 3.0451424942644265e-05, "loss": 0.1191, "num_input_tokens_seen": 698160, "step": 1540 }, { "epoch": 9.778481012658228, "grad_norm": 0.6079818606376648, "learning_rate": 3.0316585380909808e-05, "loss": 0.1629, "num_input_tokens_seen": 700496, "step": 1545 }, { "epoch": 9.810126582278482, "grad_norm": 0.2806653678417206, "learning_rate": 3.018158363358773e-05, "loss": 0.1555, "num_input_tokens_seen": 702736, "step": 1550 }, { "epoch": 9.841772151898734, "grad_norm": 0.17211346328258514, "learning_rate": 3.004642381898633e-05, "loss": 0.104, "num_input_tokens_seen": 705104, "step": 1555 }, { "epoch": 9.873417721518987, "grad_norm": 0.949230968952179, "learning_rate": 2.991111006023586e-05, "loss": 0.1382, "num_input_tokens_seen": 707376, "step": 1560 }, { "epoch": 9.905063291139241, "grad_norm": 0.26634857058525085, "learning_rate": 2.9775646485162694e-05, "loss": 0.1208, "num_input_tokens_seen": 709616, "step": 1565 }, { "epoch": 9.936708860759493, "grad_norm": 0.23626114428043365, "learning_rate": 2.964003722616349e-05, "loss": 0.11, "num_input_tokens_seen": 711888, "step": 1570 }, { "epoch": 9.968354430379748, "grad_norm": 0.35244104266166687, "learning_rate": 2.9504286420079038e-05, "loss": 0.141, "num_input_tokens_seen": 714192, "step": 1575 }, { "epoch": 10.0, "grad_norm": 0.3549441695213318, "learning_rate": 2.9368398208068127e-05, "loss": 0.1224, "num_input_tokens_seen": 716192, "step": 1580 }, { "epoch": 10.0, "eval_loss": 0.2368132472038269, "eval_runtime": 1.0544, "eval_samples_per_second": 66.391, "eval_steps_per_second": 17.072, "num_input_tokens_seen": 716192, "step": 1580 }, { "epoch": 10.031645569620252, "grad_norm": 0.493324875831604, "learning_rate": 2.9232376735481198e-05, "loss": 0.1193, "num_input_tokens_seen": 718496, "step": 1585 }, { "epoch": 10.063291139240507, "grad_norm": 0.4959459900856018, "learning_rate": 2.9096226151733862e-05, "loss": 0.1674, "num_input_tokens_seen": 720800, "step": 1590 }, { "epoch": 10.094936708860759, "grad_norm": 0.5235639810562134, "learning_rate": 2.8959950610180374e-05, "loss": 0.0656, "num_input_tokens_seen": 722944, "step": 1595 }, { "epoch": 10.126582278481013, "grad_norm": 0.4561421871185303, "learning_rate": 2.882355426798688e-05, "loss": 0.106, "num_input_tokens_seen": 725088, "step": 1600 }, { "epoch": 10.158227848101266, "grad_norm": 0.2645431160926819, "learning_rate": 2.868704128600463e-05, "loss": 0.1065, "num_input_tokens_seen": 727232, "step": 1605 }, { "epoch": 10.189873417721518, "grad_norm": 0.4032263457775116, "learning_rate": 2.8550415828643016e-05, "loss": 0.0754, "num_input_tokens_seen": 729344, "step": 1610 }, { "epoch": 10.221518987341772, "grad_norm": 0.1715727597475052, "learning_rate": 2.8413682063742603e-05, "loss": 0.0815, "num_input_tokens_seen": 731616, "step": 1615 }, { "epoch": 10.253164556962025, "grad_norm": 0.38201895356178284, "learning_rate": 2.827684416244792e-05, "loss": 0.0866, "num_input_tokens_seen": 734016, "step": 1620 }, { "epoch": 10.284810126582279, "grad_norm": 0.3131459057331085, "learning_rate": 2.8139906299080203e-05, "loss": 0.1697, "num_input_tokens_seen": 736320, "step": 1625 }, { "epoch": 10.316455696202532, "grad_norm": 0.12523798644542694, "learning_rate": 2.800287265101015e-05, "loss": 0.0902, "num_input_tokens_seen": 738528, "step": 1630 }, { "epoch": 10.348101265822784, "grad_norm": 0.33886829018592834, "learning_rate": 2.7865747398530396e-05, "loss": 0.0957, "num_input_tokens_seen": 740768, "step": 1635 }, { "epoch": 10.379746835443038, "grad_norm": 0.6492605209350586, "learning_rate": 2.7728534724728027e-05, "loss": 0.1586, "num_input_tokens_seen": 743072, "step": 1640 }, { "epoch": 10.41139240506329, "grad_norm": 0.7702073454856873, "learning_rate": 2.7591238815356956e-05, "loss": 0.0923, "num_input_tokens_seen": 745312, "step": 1645 }, { "epoch": 10.443037974683545, "grad_norm": 0.6533333659172058, "learning_rate": 2.7453863858710276e-05, "loss": 0.0999, "num_input_tokens_seen": 747488, "step": 1650 }, { "epoch": 10.474683544303797, "grad_norm": 0.41343164443969727, "learning_rate": 2.7316414045492445e-05, "loss": 0.1414, "num_input_tokens_seen": 749760, "step": 1655 }, { "epoch": 10.50632911392405, "grad_norm": 0.22790366411209106, "learning_rate": 2.717889356869146e-05, "loss": 0.0598, "num_input_tokens_seen": 752032, "step": 1660 }, { "epoch": 10.537974683544304, "grad_norm": 0.3924558162689209, "learning_rate": 2.704130662345096e-05, "loss": 0.1178, "num_input_tokens_seen": 754336, "step": 1665 }, { "epoch": 10.569620253164556, "grad_norm": 0.5162143111228943, "learning_rate": 2.690365740694224e-05, "loss": 0.0807, "num_input_tokens_seen": 756576, "step": 1670 }, { "epoch": 10.60126582278481, "grad_norm": 0.24316571652889252, "learning_rate": 2.676595011823624e-05, "loss": 0.1437, "num_input_tokens_seen": 758816, "step": 1675 }, { "epoch": 10.632911392405063, "grad_norm": 0.37156954407691956, "learning_rate": 2.6628188958175384e-05, "loss": 0.1214, "num_input_tokens_seen": 761056, "step": 1680 }, { "epoch": 10.664556962025316, "grad_norm": 0.284976989030838, "learning_rate": 2.6490378129245498e-05, "loss": 0.0904, "num_input_tokens_seen": 763200, "step": 1685 }, { "epoch": 10.69620253164557, "grad_norm": 0.3619583547115326, "learning_rate": 2.6352521835447596e-05, "loss": 0.1663, "num_input_tokens_seen": 765344, "step": 1690 }, { "epoch": 10.727848101265822, "grad_norm": 0.08084660023450851, "learning_rate": 2.621462428216961e-05, "loss": 0.1036, "num_input_tokens_seen": 767552, "step": 1695 }, { "epoch": 10.759493670886076, "grad_norm": 0.4137870669364929, "learning_rate": 2.6076689676058114e-05, "loss": 0.0785, "num_input_tokens_seen": 769920, "step": 1700 }, { "epoch": 10.791139240506329, "grad_norm": 0.32944831252098083, "learning_rate": 2.5938722224890005e-05, "loss": 0.0805, "num_input_tokens_seen": 772224, "step": 1705 }, { "epoch": 10.822784810126583, "grad_norm": 0.34037455916404724, "learning_rate": 2.5800726137444153e-05, "loss": 0.1002, "num_input_tokens_seen": 774592, "step": 1710 }, { "epoch": 10.854430379746836, "grad_norm": 0.4142782390117645, "learning_rate": 2.5662705623372967e-05, "loss": 0.0662, "num_input_tokens_seen": 776864, "step": 1715 }, { "epoch": 10.886075949367088, "grad_norm": 0.6172336339950562, "learning_rate": 2.552466489307403e-05, "loss": 0.1274, "num_input_tokens_seen": 779168, "step": 1720 }, { "epoch": 10.917721518987342, "grad_norm": 0.14528794586658478, "learning_rate": 2.538660815756161e-05, "loss": 0.1488, "num_input_tokens_seen": 781376, "step": 1725 }, { "epoch": 10.949367088607595, "grad_norm": 0.5204742550849915, "learning_rate": 2.5248539628338246e-05, "loss": 0.0774, "num_input_tokens_seen": 783744, "step": 1730 }, { "epoch": 10.981012658227849, "grad_norm": 0.20708367228507996, "learning_rate": 2.511046351726623e-05, "loss": 0.0463, "num_input_tokens_seen": 786016, "step": 1735 }, { "epoch": 11.0, "eval_loss": 0.22990846633911133, "eval_runtime": 1.0589, "eval_samples_per_second": 66.107, "eval_steps_per_second": 16.999, "num_input_tokens_seen": 787200, "step": 1738 }, { "epoch": 11.012658227848101, "grad_norm": 0.2000531703233719, "learning_rate": 2.497238403643917e-05, "loss": 0.0568, "num_input_tokens_seen": 788128, "step": 1740 }, { "epoch": 11.044303797468354, "grad_norm": 0.12597692012786865, "learning_rate": 2.483430539805344e-05, "loss": 0.1282, "num_input_tokens_seen": 790432, "step": 1745 }, { "epoch": 11.075949367088608, "grad_norm": 0.1441134363412857, "learning_rate": 2.4696231814279722e-05, "loss": 0.0434, "num_input_tokens_seen": 792672, "step": 1750 }, { "epoch": 11.10759493670886, "grad_norm": 0.4239365756511688, "learning_rate": 2.455816749713453e-05, "loss": 0.09, "num_input_tokens_seen": 795008, "step": 1755 }, { "epoch": 11.139240506329115, "grad_norm": 0.3774973452091217, "learning_rate": 2.4420116658351673e-05, "loss": 0.1105, "num_input_tokens_seen": 797344, "step": 1760 }, { "epoch": 11.170886075949367, "grad_norm": 0.17944270372390747, "learning_rate": 2.428208350925377e-05, "loss": 0.0744, "num_input_tokens_seen": 799776, "step": 1765 }, { "epoch": 11.20253164556962, "grad_norm": 0.6264793276786804, "learning_rate": 2.4144072260623864e-05, "loss": 0.0764, "num_input_tokens_seen": 801952, "step": 1770 }, { "epoch": 11.234177215189874, "grad_norm": 0.6170404553413391, "learning_rate": 2.4006087122576863e-05, "loss": 0.0822, "num_input_tokens_seen": 804224, "step": 1775 }, { "epoch": 11.265822784810126, "grad_norm": 0.3619542121887207, "learning_rate": 2.386813230443117e-05, "loss": 0.0746, "num_input_tokens_seen": 806496, "step": 1780 }, { "epoch": 11.29746835443038, "grad_norm": 0.18494242429733276, "learning_rate": 2.3730212014580274e-05, "loss": 0.078, "num_input_tokens_seen": 808672, "step": 1785 }, { "epoch": 11.329113924050633, "grad_norm": 0.09566467255353928, "learning_rate": 2.359233046036434e-05, "loss": 0.0427, "num_input_tokens_seen": 811008, "step": 1790 }, { "epoch": 11.360759493670885, "grad_norm": 0.6648082137107849, "learning_rate": 2.3454491847941884e-05, "loss": 0.118, "num_input_tokens_seen": 813344, "step": 1795 }, { "epoch": 11.39240506329114, "grad_norm": 0.25523728132247925, "learning_rate": 2.3316700382161476e-05, "loss": 0.1754, "num_input_tokens_seen": 815648, "step": 1800 }, { "epoch": 11.424050632911392, "grad_norm": 1.1086726188659668, "learning_rate": 2.317896026643341e-05, "loss": 0.1187, "num_input_tokens_seen": 817888, "step": 1805 }, { "epoch": 11.455696202531646, "grad_norm": 0.43428266048431396, "learning_rate": 2.3041275702601565e-05, "loss": 0.0762, "num_input_tokens_seen": 820128, "step": 1810 }, { "epoch": 11.487341772151899, "grad_norm": 0.15228426456451416, "learning_rate": 2.2903650890815144e-05, "loss": 0.0557, "num_input_tokens_seen": 822368, "step": 1815 }, { "epoch": 11.518987341772151, "grad_norm": 0.4841778576374054, "learning_rate": 2.2766090029400573e-05, "loss": 0.0954, "num_input_tokens_seen": 824576, "step": 1820 }, { "epoch": 11.550632911392405, "grad_norm": 0.21318280696868896, "learning_rate": 2.262859731473346e-05, "loss": 0.0537, "num_input_tokens_seen": 826880, "step": 1825 }, { "epoch": 11.582278481012658, "grad_norm": 0.7512830495834351, "learning_rate": 2.2491176941110542e-05, "loss": 0.1178, "num_input_tokens_seen": 829120, "step": 1830 }, { "epoch": 11.613924050632912, "grad_norm": 0.31742382049560547, "learning_rate": 2.2353833100621747e-05, "loss": 0.0971, "num_input_tokens_seen": 831424, "step": 1835 }, { "epoch": 11.645569620253164, "grad_norm": 0.10350695252418518, "learning_rate": 2.2216569983022324e-05, "loss": 0.1202, "num_input_tokens_seen": 833728, "step": 1840 }, { "epoch": 11.677215189873417, "grad_norm": 0.6000233888626099, "learning_rate": 2.2079391775605013e-05, "loss": 0.0919, "num_input_tokens_seen": 835936, "step": 1845 }, { "epoch": 11.708860759493671, "grad_norm": 0.6373884677886963, "learning_rate": 2.194230266307231e-05, "loss": 0.1133, "num_input_tokens_seen": 838176, "step": 1850 }, { "epoch": 11.740506329113924, "grad_norm": 0.12952275574207306, "learning_rate": 2.1805306827408857e-05, "loss": 0.0757, "num_input_tokens_seen": 840480, "step": 1855 }, { "epoch": 11.772151898734178, "grad_norm": 0.6111325025558472, "learning_rate": 2.1668408447753782e-05, "loss": 0.1369, "num_input_tokens_seen": 842848, "step": 1860 }, { "epoch": 11.80379746835443, "grad_norm": 0.2619107663631439, "learning_rate": 2.1531611700273297e-05, "loss": 0.0592, "num_input_tokens_seen": 844992, "step": 1865 }, { "epoch": 11.835443037974684, "grad_norm": 0.18811312317848206, "learning_rate": 2.139492075803324e-05, "loss": 0.1061, "num_input_tokens_seen": 847328, "step": 1870 }, { "epoch": 11.867088607594937, "grad_norm": 0.6877657175064087, "learning_rate": 2.1258339790871803e-05, "loss": 0.1791, "num_input_tokens_seen": 849536, "step": 1875 }, { "epoch": 11.89873417721519, "grad_norm": 0.46471917629241943, "learning_rate": 2.1121872965272338e-05, "loss": 0.0524, "num_input_tokens_seen": 851712, "step": 1880 }, { "epoch": 11.930379746835444, "grad_norm": 0.2743806540966034, "learning_rate": 2.098552444423622e-05, "loss": 0.2058, "num_input_tokens_seen": 853984, "step": 1885 }, { "epoch": 11.962025316455696, "grad_norm": 0.37350520491600037, "learning_rate": 2.084929838715588e-05, "loss": 0.1089, "num_input_tokens_seen": 856224, "step": 1890 }, { "epoch": 11.99367088607595, "grad_norm": 0.7323923707008362, "learning_rate": 2.0713198949687924e-05, "loss": 0.1375, "num_input_tokens_seen": 858528, "step": 1895 }, { "epoch": 12.0, "eval_loss": 0.22599400579929352, "eval_runtime": 1.1671, "eval_samples_per_second": 59.976, "eval_steps_per_second": 15.422, "num_input_tokens_seen": 858736, "step": 1896 }, { "epoch": 12.025316455696203, "grad_norm": 0.5037997961044312, "learning_rate": 2.057723028362635e-05, "loss": 0.0939, "num_input_tokens_seen": 860592, "step": 1900 }, { "epoch": 12.056962025316455, "grad_norm": 0.25296589732170105, "learning_rate": 2.0441396536775868e-05, "loss": 0.0673, "num_input_tokens_seen": 862800, "step": 1905 }, { "epoch": 12.08860759493671, "grad_norm": 0.22964730858802795, "learning_rate": 2.030570185282544e-05, "loss": 0.0612, "num_input_tokens_seen": 865104, "step": 1910 }, { "epoch": 12.120253164556962, "grad_norm": 0.21586164832115173, "learning_rate": 2.0170150371221803e-05, "loss": 0.1323, "num_input_tokens_seen": 867408, "step": 1915 }, { "epoch": 12.151898734177216, "grad_norm": 0.7638270854949951, "learning_rate": 2.0034746227043233e-05, "loss": 0.0997, "num_input_tokens_seen": 869680, "step": 1920 }, { "epoch": 12.183544303797468, "grad_norm": 0.4471627473831177, "learning_rate": 1.989949355087339e-05, "loss": 0.0864, "num_input_tokens_seen": 871856, "step": 1925 }, { "epoch": 12.215189873417721, "grad_norm": 0.5558658838272095, "learning_rate": 1.9764396468675296e-05, "loss": 0.1157, "num_input_tokens_seen": 874192, "step": 1930 }, { "epoch": 12.246835443037975, "grad_norm": 0.1573903113603592, "learning_rate": 1.962945910166552e-05, "loss": 0.0667, "num_input_tokens_seen": 876432, "step": 1935 }, { "epoch": 12.278481012658228, "grad_norm": 0.17011329531669617, "learning_rate": 1.9494685566188403e-05, "loss": 0.1005, "num_input_tokens_seen": 878672, "step": 1940 }, { "epoch": 12.310126582278482, "grad_norm": 0.24080784618854523, "learning_rate": 1.9360079973590502e-05, "loss": 0.086, "num_input_tokens_seen": 880976, "step": 1945 }, { "epoch": 12.341772151898734, "grad_norm": 0.10938843339681625, "learning_rate": 1.9225646430095192e-05, "loss": 0.1118, "num_input_tokens_seen": 883280, "step": 1950 }, { "epoch": 12.373417721518987, "grad_norm": 0.1863957941532135, "learning_rate": 1.9091389036677382e-05, "loss": 0.0714, "num_input_tokens_seen": 885488, "step": 1955 }, { "epoch": 12.405063291139241, "grad_norm": 0.5461353063583374, "learning_rate": 1.895731188893841e-05, "loss": 0.1402, "num_input_tokens_seen": 887760, "step": 1960 }, { "epoch": 12.436708860759493, "grad_norm": 0.27624064683914185, "learning_rate": 1.8823419076981135e-05, "loss": 0.0606, "num_input_tokens_seen": 889968, "step": 1965 }, { "epoch": 12.468354430379748, "grad_norm": 0.6548911929130554, "learning_rate": 1.8689714685285118e-05, "loss": 0.0744, "num_input_tokens_seen": 892336, "step": 1970 }, { "epoch": 12.5, "grad_norm": 0.6202312111854553, "learning_rate": 1.8556202792582057e-05, "loss": 0.1477, "num_input_tokens_seen": 894608, "step": 1975 }, { "epoch": 12.531645569620252, "grad_norm": 0.9487397074699402, "learning_rate": 1.8422887471731375e-05, "loss": 0.1526, "num_input_tokens_seen": 896944, "step": 1980 }, { "epoch": 12.563291139240507, "grad_norm": 0.1772773265838623, "learning_rate": 1.8289772789595917e-05, "loss": 0.0431, "num_input_tokens_seen": 899120, "step": 1985 }, { "epoch": 12.594936708860759, "grad_norm": 0.60627681016922, "learning_rate": 1.8156862806917956e-05, "loss": 0.0869, "num_input_tokens_seen": 901456, "step": 1990 }, { "epoch": 12.626582278481013, "grad_norm": 0.6594812870025635, "learning_rate": 1.802416157819528e-05, "loss": 0.1041, "num_input_tokens_seen": 903696, "step": 1995 }, { "epoch": 12.658227848101266, "grad_norm": 0.3567030429840088, "learning_rate": 1.789167315155749e-05, "loss": 0.0705, "num_input_tokens_seen": 906000, "step": 2000 }, { "epoch": 12.689873417721518, "grad_norm": 0.14158804714679718, "learning_rate": 1.7759401568642576e-05, "loss": 0.1084, "num_input_tokens_seen": 908304, "step": 2005 }, { "epoch": 12.721518987341772, "grad_norm": 0.15054543316364288, "learning_rate": 1.7627350864473545e-05, "loss": 0.0749, "num_input_tokens_seen": 910448, "step": 2010 }, { "epoch": 12.753164556962025, "grad_norm": 0.32681116461753845, "learning_rate": 1.749552506733537e-05, "loss": 0.0771, "num_input_tokens_seen": 912816, "step": 2015 }, { "epoch": 12.784810126582279, "grad_norm": 1.1441223621368408, "learning_rate": 1.736392819865214e-05, "loss": 0.0958, "num_input_tokens_seen": 915056, "step": 2020 }, { "epoch": 12.816455696202532, "grad_norm": 0.686684250831604, "learning_rate": 1.7232564272864295e-05, "loss": 0.06, "num_input_tokens_seen": 917264, "step": 2025 }, { "epoch": 12.848101265822784, "grad_norm": 0.3770323395729065, "learning_rate": 1.7101437297306233e-05, "loss": 0.1313, "num_input_tokens_seen": 919504, "step": 2030 }, { "epoch": 12.879746835443038, "grad_norm": 0.7681977152824402, "learning_rate": 1.6970551272084068e-05, "loss": 0.0722, "num_input_tokens_seen": 921712, "step": 2035 }, { "epoch": 12.91139240506329, "grad_norm": 0.22253715991973877, "learning_rate": 1.683991018995355e-05, "loss": 0.0766, "num_input_tokens_seen": 924112, "step": 2040 }, { "epoch": 12.943037974683545, "grad_norm": 0.34088701009750366, "learning_rate": 1.6709518036198308e-05, "loss": 0.0911, "num_input_tokens_seen": 926320, "step": 2045 }, { "epoch": 12.974683544303797, "grad_norm": 0.2326185405254364, "learning_rate": 1.6579378788508265e-05, "loss": 0.1195, "num_input_tokens_seen": 928624, "step": 2050 }, { "epoch": 13.0, "eval_loss": 0.22308097779750824, "eval_runtime": 1.0528, "eval_samples_per_second": 66.487, "eval_steps_per_second": 17.097, "num_input_tokens_seen": 930160, "step": 2054 }, { "epoch": 13.00632911392405, "grad_norm": 0.2550225555896759, "learning_rate": 1.6449496416858284e-05, "loss": 0.1393, "num_input_tokens_seen": 930576, "step": 2055 }, { "epoch": 13.037974683544304, "grad_norm": 0.29662492871284485, "learning_rate": 1.6319874883387088e-05, "loss": 0.0977, "num_input_tokens_seen": 932944, "step": 2060 }, { "epoch": 13.069620253164556, "grad_norm": 0.27638939023017883, "learning_rate": 1.6190518142276368e-05, "loss": 0.0835, "num_input_tokens_seen": 935152, "step": 2065 }, { "epoch": 13.10126582278481, "grad_norm": 0.3081075847148895, "learning_rate": 1.6061430139630153e-05, "loss": 0.1471, "num_input_tokens_seen": 937456, "step": 2070 }, { "epoch": 13.132911392405063, "grad_norm": 0.35588836669921875, "learning_rate": 1.5932614813354486e-05, "loss": 0.1473, "num_input_tokens_seen": 939760, "step": 2075 }, { "epoch": 13.164556962025316, "grad_norm": 0.1294773370027542, "learning_rate": 1.5804076093037212e-05, "loss": 0.0677, "num_input_tokens_seen": 942128, "step": 2080 }, { "epoch": 13.19620253164557, "grad_norm": 0.08237936347723007, "learning_rate": 1.5675817899828165e-05, "loss": 0.0767, "num_input_tokens_seen": 944368, "step": 2085 }, { "epoch": 13.227848101265822, "grad_norm": 0.17590849101543427, "learning_rate": 1.5547844146319545e-05, "loss": 0.073, "num_input_tokens_seen": 946672, "step": 2090 }, { "epoch": 13.259493670886076, "grad_norm": 0.30852624773979187, "learning_rate": 1.5420158736426538e-05, "loss": 0.1391, "num_input_tokens_seen": 949008, "step": 2095 }, { "epoch": 13.291139240506329, "grad_norm": 0.1229742243885994, "learning_rate": 1.5292765565268225e-05, "loss": 0.0575, "num_input_tokens_seen": 951312, "step": 2100 }, { "epoch": 13.322784810126583, "grad_norm": 0.32235658168792725, "learning_rate": 1.5165668519048799e-05, "loss": 0.0707, "num_input_tokens_seen": 953552, "step": 2105 }, { "epoch": 13.354430379746836, "grad_norm": 0.9266071915626526, "learning_rate": 1.5038871474938954e-05, "loss": 0.058, "num_input_tokens_seen": 955792, "step": 2110 }, { "epoch": 13.386075949367088, "grad_norm": 0.7249566912651062, "learning_rate": 1.491237830095768e-05, "loss": 0.0852, "num_input_tokens_seen": 958000, "step": 2115 }, { "epoch": 13.417721518987342, "grad_norm": 0.25600212812423706, "learning_rate": 1.4786192855854206e-05, "loss": 0.0679, "num_input_tokens_seen": 960304, "step": 2120 }, { "epoch": 13.449367088607595, "grad_norm": 0.5201130509376526, "learning_rate": 1.4660318988990296e-05, "loss": 0.0754, "num_input_tokens_seen": 962448, "step": 2125 }, { "epoch": 13.481012658227849, "grad_norm": 0.8170276284217834, "learning_rate": 1.453476054022287e-05, "loss": 0.1261, "num_input_tokens_seen": 964688, "step": 2130 }, { "epoch": 13.512658227848101, "grad_norm": 0.6764806509017944, "learning_rate": 1.4409521339786808e-05, "loss": 0.0995, "num_input_tokens_seen": 966960, "step": 2135 }, { "epoch": 13.544303797468354, "grad_norm": 0.06347092986106873, "learning_rate": 1.4284605208178109e-05, "loss": 0.1336, "num_input_tokens_seen": 969296, "step": 2140 }, { "epoch": 13.575949367088608, "grad_norm": 0.38479360938072205, "learning_rate": 1.4160015956037437e-05, "loss": 0.0736, "num_input_tokens_seen": 971600, "step": 2145 }, { "epoch": 13.60759493670886, "grad_norm": 0.16858382523059845, "learning_rate": 1.4035757384033723e-05, "loss": 0.0916, "num_input_tokens_seen": 973904, "step": 2150 }, { "epoch": 13.639240506329115, "grad_norm": 0.21626552939414978, "learning_rate": 1.3911833282748358e-05, "loss": 0.0588, "num_input_tokens_seen": 976176, "step": 2155 }, { "epoch": 13.670886075949367, "grad_norm": 0.3646763563156128, "learning_rate": 1.3788247432559492e-05, "loss": 0.1438, "num_input_tokens_seen": 978480, "step": 2160 }, { "epoch": 13.70253164556962, "grad_norm": 0.7966201305389404, "learning_rate": 1.3665003603526705e-05, "loss": 0.1077, "num_input_tokens_seen": 980752, "step": 2165 }, { "epoch": 13.734177215189874, "grad_norm": 0.5051817893981934, "learning_rate": 1.3542105555276047e-05, "loss": 0.0597, "num_input_tokens_seen": 983056, "step": 2170 }, { "epoch": 13.765822784810126, "grad_norm": 0.21413205564022064, "learning_rate": 1.341955703688531e-05, "loss": 0.0578, "num_input_tokens_seen": 985232, "step": 2175 }, { "epoch": 13.79746835443038, "grad_norm": 0.6278512477874756, "learning_rate": 1.3297361786769652e-05, "loss": 0.0553, "num_input_tokens_seen": 987536, "step": 2180 }, { "epoch": 13.829113924050633, "grad_norm": 0.2530674636363983, "learning_rate": 1.317552353256762e-05, "loss": 0.1728, "num_input_tokens_seen": 989744, "step": 2185 }, { "epoch": 13.860759493670885, "grad_norm": 1.9078822135925293, "learning_rate": 1.305404599102733e-05, "loss": 0.0604, "num_input_tokens_seen": 991952, "step": 2190 }, { "epoch": 13.89240506329114, "grad_norm": 0.5475679636001587, "learning_rate": 1.2932932867893189e-05, "loss": 0.1165, "num_input_tokens_seen": 994224, "step": 2195 }, { "epoch": 13.924050632911392, "grad_norm": 0.6509348750114441, "learning_rate": 1.281218785779279e-05, "loss": 0.1493, "num_input_tokens_seen": 996528, "step": 2200 }, { "epoch": 13.955696202531646, "grad_norm": 0.21807987987995148, "learning_rate": 1.2691814644124212e-05, "loss": 0.0489, "num_input_tokens_seen": 998800, "step": 2205 }, { "epoch": 13.987341772151899, "grad_norm": 0.11147414892911911, "learning_rate": 1.2571816898943667e-05, "loss": 0.0919, "num_input_tokens_seen": 1001104, "step": 2210 }, { "epoch": 14.0, "eval_loss": 0.2209039181470871, "eval_runtime": 1.0524, "eval_samples_per_second": 66.512, "eval_steps_per_second": 17.103, "num_input_tokens_seen": 1001792, "step": 2212 }, { "epoch": 14.018987341772151, "grad_norm": 0.6045026183128357, "learning_rate": 1.24521982828535e-05, "loss": 0.0897, "num_input_tokens_seen": 1003168, "step": 2215 }, { "epoch": 14.050632911392405, "grad_norm": 0.1305987387895584, "learning_rate": 1.2332962444890459e-05, "loss": 0.0655, "num_input_tokens_seen": 1005472, "step": 2220 }, { "epoch": 14.082278481012658, "grad_norm": 0.14753109216690063, "learning_rate": 1.2214113022414448e-05, "loss": 0.0657, "num_input_tokens_seen": 1007776, "step": 2225 }, { "epoch": 14.113924050632912, "grad_norm": 0.6496151089668274, "learning_rate": 1.2095653640997529e-05, "loss": 0.1108, "num_input_tokens_seen": 1010048, "step": 2230 }, { "epoch": 14.145569620253164, "grad_norm": 0.20171499252319336, "learning_rate": 1.197758791431333e-05, "loss": 0.0491, "num_input_tokens_seen": 1012256, "step": 2235 }, { "epoch": 14.177215189873417, "grad_norm": 0.15159562230110168, "learning_rate": 1.1859919444026817e-05, "loss": 0.0529, "num_input_tokens_seen": 1014592, "step": 2240 }, { "epoch": 14.208860759493671, "grad_norm": 0.5315350294113159, "learning_rate": 1.174265181968439e-05, "loss": 0.1376, "num_input_tokens_seen": 1016928, "step": 2245 }, { "epoch": 14.240506329113924, "grad_norm": 0.4214925169944763, "learning_rate": 1.1625788618604433e-05, "loss": 0.0839, "num_input_tokens_seen": 1019232, "step": 2250 }, { "epoch": 14.272151898734178, "grad_norm": 0.8212571740150452, "learning_rate": 1.1509333405768152e-05, "loss": 0.0924, "num_input_tokens_seen": 1021696, "step": 2255 }, { "epoch": 14.30379746835443, "grad_norm": 0.32492563128471375, "learning_rate": 1.1393289733710808e-05, "loss": 0.1284, "num_input_tokens_seen": 1023808, "step": 2260 }, { "epoch": 14.335443037974684, "grad_norm": 0.30455082654953003, "learning_rate": 1.1277661142413393e-05, "loss": 0.1088, "num_input_tokens_seen": 1026080, "step": 2265 }, { "epoch": 14.367088607594937, "grad_norm": 0.3383852243423462, "learning_rate": 1.1162451159194614e-05, "loss": 0.0772, "num_input_tokens_seen": 1028256, "step": 2270 }, { "epoch": 14.39873417721519, "grad_norm": 0.2520776391029358, "learning_rate": 1.1047663298603264e-05, "loss": 0.0968, "num_input_tokens_seen": 1030560, "step": 2275 }, { "epoch": 14.430379746835444, "grad_norm": 0.30788475275039673, "learning_rate": 1.0933301062311066e-05, "loss": 0.1086, "num_input_tokens_seen": 1032832, "step": 2280 }, { "epoch": 14.462025316455696, "grad_norm": 0.43799611926078796, "learning_rate": 1.08193679390058e-05, "loss": 0.0563, "num_input_tokens_seen": 1035104, "step": 2285 }, { "epoch": 14.49367088607595, "grad_norm": 0.7597590088844299, "learning_rate": 1.0705867404284928e-05, "loss": 0.0992, "num_input_tokens_seen": 1037280, "step": 2290 }, { "epoch": 14.525316455696203, "grad_norm": 0.4234831631183624, "learning_rate": 1.0592802920549493e-05, "loss": 0.0808, "num_input_tokens_seen": 1039488, "step": 2295 }, { "epoch": 14.556962025316455, "grad_norm": 0.18929152190685272, "learning_rate": 1.0480177936898588e-05, "loss": 0.0614, "num_input_tokens_seen": 1041792, "step": 2300 }, { "epoch": 14.58860759493671, "grad_norm": 0.4219571053981781, "learning_rate": 1.036799588902408e-05, "loss": 0.1065, "num_input_tokens_seen": 1044064, "step": 2305 }, { "epoch": 14.620253164556962, "grad_norm": 0.19593296945095062, "learning_rate": 1.0256260199105824e-05, "loss": 0.086, "num_input_tokens_seen": 1046432, "step": 2310 }, { "epoch": 14.651898734177216, "grad_norm": 0.23182716965675354, "learning_rate": 1.0144974275707241e-05, "loss": 0.069, "num_input_tokens_seen": 1048736, "step": 2315 }, { "epoch": 14.683544303797468, "grad_norm": 0.4068852663040161, "learning_rate": 1.0034141513671377e-05, "loss": 0.089, "num_input_tokens_seen": 1051040, "step": 2320 }, { "epoch": 14.715189873417721, "grad_norm": 0.20161470770835876, "learning_rate": 9.923765294017317e-06, "loss": 0.1296, "num_input_tokens_seen": 1053376, "step": 2325 }, { "epoch": 14.746835443037975, "grad_norm": 0.12184353172779083, "learning_rate": 9.81384898383706e-06, "loss": 0.0826, "num_input_tokens_seen": 1055584, "step": 2330 }, { "epoch": 14.778481012658228, "grad_norm": 0.3952997624874115, "learning_rate": 9.704395936192765e-06, "loss": 0.1022, "num_input_tokens_seen": 1057920, "step": 2335 }, { "epoch": 14.810126582278482, "grad_norm": 0.08857820183038712, "learning_rate": 9.595409490014522e-06, "loss": 0.0593, "num_input_tokens_seen": 1060128, "step": 2340 }, { "epoch": 14.841772151898734, "grad_norm": 0.2308875322341919, "learning_rate": 9.486892969998465e-06, "loss": 0.0608, "num_input_tokens_seen": 1062400, "step": 2345 }, { "epoch": 14.873417721518987, "grad_norm": 0.08134555071592331, "learning_rate": 9.378849686505323e-06, "loss": 0.0717, "num_input_tokens_seen": 1064640, "step": 2350 }, { "epoch": 14.905063291139241, "grad_norm": 0.23280799388885498, "learning_rate": 9.271282935459497e-06, "loss": 0.1271, "num_input_tokens_seen": 1066784, "step": 2355 }, { "epoch": 14.936708860759493, "grad_norm": 0.2921713888645172, "learning_rate": 9.16419599824847e-06, "loss": 0.156, "num_input_tokens_seen": 1068992, "step": 2360 }, { "epoch": 14.968354430379748, "grad_norm": 0.32854291796684265, "learning_rate": 9.05759214162272e-06, "loss": 0.1159, "num_input_tokens_seen": 1071168, "step": 2365 }, { "epoch": 15.0, "grad_norm": 0.7001221776008606, "learning_rate": 8.951474617596075e-06, "loss": 0.0831, "num_input_tokens_seen": 1073248, "step": 2370 }, { "epoch": 15.0, "eval_loss": 0.2171768844127655, "eval_runtime": 1.0544, "eval_samples_per_second": 66.385, "eval_steps_per_second": 17.071, "num_input_tokens_seen": 1073248, "step": 2370 }, { "epoch": 15.031645569620252, "grad_norm": 0.10970105975866318, "learning_rate": 8.845846663346472e-06, "loss": 0.0834, "num_input_tokens_seen": 1075392, "step": 2375 }, { "epoch": 15.063291139240507, "grad_norm": 0.2830284833908081, "learning_rate": 8.74071150111726e-06, "loss": 0.0489, "num_input_tokens_seen": 1077632, "step": 2380 }, { "epoch": 15.094936708860759, "grad_norm": 0.383652001619339, "learning_rate": 8.636072338118875e-06, "loss": 0.0798, "num_input_tokens_seen": 1079808, "step": 2385 }, { "epoch": 15.126582278481013, "grad_norm": 0.6168588399887085, "learning_rate": 8.531932366430972e-06, "loss": 0.1202, "num_input_tokens_seen": 1082048, "step": 2390 }, { "epoch": 15.158227848101266, "grad_norm": 0.7165786027908325, "learning_rate": 8.428294762905115e-06, "loss": 0.0665, "num_input_tokens_seen": 1084288, "step": 2395 }, { "epoch": 15.189873417721518, "grad_norm": 0.096212238073349, "learning_rate": 8.325162689067813e-06, "loss": 0.0568, "num_input_tokens_seen": 1086624, "step": 2400 }, { "epoch": 15.221518987341772, "grad_norm": 0.3153831362724304, "learning_rate": 8.222539291024078e-06, "loss": 0.0783, "num_input_tokens_seen": 1088960, "step": 2405 }, { "epoch": 15.253164556962025, "grad_norm": 0.6100369691848755, "learning_rate": 8.12042769936151e-06, "loss": 0.1605, "num_input_tokens_seen": 1091328, "step": 2410 }, { "epoch": 15.284810126582279, "grad_norm": 0.36699631810188293, "learning_rate": 8.018831029054707e-06, "loss": 0.1371, "num_input_tokens_seen": 1093568, "step": 2415 }, { "epoch": 15.316455696202532, "grad_norm": 0.36354267597198486, "learning_rate": 7.917752379370288e-06, "loss": 0.0894, "num_input_tokens_seen": 1095808, "step": 2420 }, { "epoch": 15.348101265822784, "grad_norm": 0.4293253421783447, "learning_rate": 7.817194833772393e-06, "loss": 0.069, "num_input_tokens_seen": 1098176, "step": 2425 }, { "epoch": 15.379746835443038, "grad_norm": 0.4813206195831299, "learning_rate": 7.717161459828511e-06, "loss": 0.0812, "num_input_tokens_seen": 1100544, "step": 2430 }, { "epoch": 15.41139240506329, "grad_norm": 0.30571675300598145, "learning_rate": 7.617655309116009e-06, "loss": 0.0821, "num_input_tokens_seen": 1102880, "step": 2435 }, { "epoch": 15.443037974683545, "grad_norm": 0.39746037125587463, "learning_rate": 7.518679417128982e-06, "loss": 0.0652, "num_input_tokens_seen": 1105056, "step": 2440 }, { "epoch": 15.474683544303797, "grad_norm": 0.3243711292743683, "learning_rate": 7.420236803185649e-06, "loss": 0.1191, "num_input_tokens_seen": 1107296, "step": 2445 }, { "epoch": 15.50632911392405, "grad_norm": 0.3102463185787201, "learning_rate": 7.3223304703363135e-06, "loss": 0.0518, "num_input_tokens_seen": 1109536, "step": 2450 }, { "epoch": 15.537974683544304, "grad_norm": 0.2161189466714859, "learning_rate": 7.224963405271665e-06, "loss": 0.0925, "num_input_tokens_seen": 1111744, "step": 2455 }, { "epoch": 15.569620253164556, "grad_norm": 0.18209107220172882, "learning_rate": 7.128138578231702e-06, "loss": 0.0682, "num_input_tokens_seen": 1114208, "step": 2460 }, { "epoch": 15.60126582278481, "grad_norm": 0.31828245520591736, "learning_rate": 7.031858942915187e-06, "loss": 0.068, "num_input_tokens_seen": 1116448, "step": 2465 }, { "epoch": 15.632911392405063, "grad_norm": 0.23197156190872192, "learning_rate": 6.936127436389422e-06, "loss": 0.0578, "num_input_tokens_seen": 1118624, "step": 2470 }, { "epoch": 15.664556962025316, "grad_norm": 0.5017430186271667, "learning_rate": 6.840946979000759e-06, "loss": 0.0931, "num_input_tokens_seen": 1120896, "step": 2475 }, { "epoch": 15.69620253164557, "grad_norm": 0.2737450897693634, "learning_rate": 6.746320474285453e-06, "loss": 0.0913, "num_input_tokens_seen": 1123264, "step": 2480 }, { "epoch": 15.727848101265822, "grad_norm": 0.2858702838420868, "learning_rate": 6.652250808881089e-06, "loss": 0.118, "num_input_tokens_seen": 1125504, "step": 2485 }, { "epoch": 15.759493670886076, "grad_norm": 0.26174217462539673, "learning_rate": 6.558740852438583e-06, "loss": 0.0527, "num_input_tokens_seen": 1127744, "step": 2490 }, { "epoch": 15.791139240506329, "grad_norm": 0.1320747286081314, "learning_rate": 6.465793457534553e-06, "loss": 0.0497, "num_input_tokens_seen": 1129920, "step": 2495 }, { "epoch": 15.822784810126583, "grad_norm": 0.08994608372449875, "learning_rate": 6.373411459584347e-06, "loss": 0.0474, "num_input_tokens_seen": 1132128, "step": 2500 }, { "epoch": 15.854430379746836, "grad_norm": 0.2467956244945526, "learning_rate": 6.281597676755588e-06, "loss": 0.0482, "num_input_tokens_seen": 1134400, "step": 2505 }, { "epoch": 15.886075949367088, "grad_norm": 0.4874517321586609, "learning_rate": 6.190354909882109e-06, "loss": 0.1845, "num_input_tokens_seen": 1136640, "step": 2510 }, { "epoch": 15.917721518987342, "grad_norm": 0.18561723828315735, "learning_rate": 6.099685942378586e-06, "loss": 0.1613, "num_input_tokens_seen": 1139008, "step": 2515 }, { "epoch": 15.949367088607595, "grad_norm": 0.6990167498588562, "learning_rate": 6.009593540155614e-06, "loss": 0.1756, "num_input_tokens_seen": 1141280, "step": 2520 }, { "epoch": 15.981012658227849, "grad_norm": 0.5182476043701172, "learning_rate": 5.920080451535295e-06, "loss": 0.0659, "num_input_tokens_seen": 1143552, "step": 2525 }, { "epoch": 16.0, "eval_loss": 0.2179746925830841, "eval_runtime": 1.0487, "eval_samples_per_second": 66.751, "eval_steps_per_second": 17.165, "num_input_tokens_seen": 1144672, "step": 2528 }, { "epoch": 16.0126582278481, "grad_norm": 0.22019904851913452, "learning_rate": 5.831149407167449e-06, "loss": 0.0607, "num_input_tokens_seen": 1145536, "step": 2530 }, { "epoch": 16.044303797468356, "grad_norm": 0.12874069809913635, "learning_rate": 5.742803119946294e-06, "loss": 0.167, "num_input_tokens_seen": 1147840, "step": 2535 }, { "epoch": 16.075949367088608, "grad_norm": 0.3488861918449402, "learning_rate": 5.655044284927657e-06, "loss": 0.1161, "num_input_tokens_seen": 1149984, "step": 2540 }, { "epoch": 16.10759493670886, "grad_norm": 0.07585413753986359, "learning_rate": 5.567875579246817e-06, "loss": 0.0489, "num_input_tokens_seen": 1152192, "step": 2545 }, { "epoch": 16.139240506329113, "grad_norm": 0.528355062007904, "learning_rate": 5.481299662036793e-06, "loss": 0.0859, "num_input_tokens_seen": 1154592, "step": 2550 }, { "epoch": 16.170886075949365, "grad_norm": 0.8395475149154663, "learning_rate": 5.395319174347244e-06, "loss": 0.113, "num_input_tokens_seen": 1156800, "step": 2555 }, { "epoch": 16.20253164556962, "grad_norm": 0.4059028625488281, "learning_rate": 5.309936739063909e-06, "loss": 0.0657, "num_input_tokens_seen": 1159104, "step": 2560 }, { "epoch": 16.234177215189874, "grad_norm": 0.3494884669780731, "learning_rate": 5.225154960828557e-06, "loss": 0.0707, "num_input_tokens_seen": 1161312, "step": 2565 }, { "epoch": 16.265822784810126, "grad_norm": 0.3024592101573944, "learning_rate": 5.140976425959579e-06, "loss": 0.0969, "num_input_tokens_seen": 1163520, "step": 2570 }, { "epoch": 16.29746835443038, "grad_norm": 0.15737831592559814, "learning_rate": 5.057403702373076e-06, "loss": 0.047, "num_input_tokens_seen": 1165792, "step": 2575 }, { "epoch": 16.32911392405063, "grad_norm": 0.12250042706727982, "learning_rate": 4.9744393395044884e-06, "loss": 0.1036, "num_input_tokens_seen": 1168000, "step": 2580 }, { "epoch": 16.360759493670887, "grad_norm": 0.11752212047576904, "learning_rate": 4.892085868230881e-06, "loss": 0.0438, "num_input_tokens_seen": 1170208, "step": 2585 }, { "epoch": 16.39240506329114, "grad_norm": 0.320800244808197, "learning_rate": 4.8103458007936915e-06, "loss": 0.0658, "num_input_tokens_seen": 1172544, "step": 2590 }, { "epoch": 16.424050632911392, "grad_norm": 0.35643938183784485, "learning_rate": 4.729221630722128e-06, "loss": 0.0559, "num_input_tokens_seen": 1174816, "step": 2595 }, { "epoch": 16.455696202531644, "grad_norm": 0.6458439230918884, "learning_rate": 4.648715832757056e-06, "loss": 0.0967, "num_input_tokens_seen": 1177056, "step": 2600 }, { "epoch": 16.4873417721519, "grad_norm": 0.563255250453949, "learning_rate": 4.568830862775556e-06, "loss": 0.0829, "num_input_tokens_seen": 1179392, "step": 2605 }, { "epoch": 16.518987341772153, "grad_norm": 0.37135928869247437, "learning_rate": 4.489569157715976e-06, "loss": 0.1337, "num_input_tokens_seen": 1181664, "step": 2610 }, { "epoch": 16.550632911392405, "grad_norm": 0.28977271914482117, "learning_rate": 4.4109331355036085e-06, "loss": 0.0897, "num_input_tokens_seen": 1183808, "step": 2615 }, { "epoch": 16.582278481012658, "grad_norm": 0.17604626715183258, "learning_rate": 4.332925194976894e-06, "loss": 0.0869, "num_input_tokens_seen": 1186240, "step": 2620 }, { "epoch": 16.61392405063291, "grad_norm": 0.5408340096473694, "learning_rate": 4.255547715814296e-06, "loss": 0.1057, "num_input_tokens_seen": 1188704, "step": 2625 }, { "epoch": 16.645569620253166, "grad_norm": 0.5435397624969482, "learning_rate": 4.178803058461664e-06, "loss": 0.1033, "num_input_tokens_seen": 1190944, "step": 2630 }, { "epoch": 16.67721518987342, "grad_norm": 0.538170337677002, "learning_rate": 4.102693564060258e-06, "loss": 0.1631, "num_input_tokens_seen": 1193216, "step": 2635 }, { "epoch": 16.70886075949367, "grad_norm": 0.2762531042098999, "learning_rate": 4.027221554375296e-06, "loss": 0.0924, "num_input_tokens_seen": 1195584, "step": 2640 }, { "epoch": 16.740506329113924, "grad_norm": 0.5973560214042664, "learning_rate": 3.952389331725162e-06, "loss": 0.0989, "num_input_tokens_seen": 1197824, "step": 2645 }, { "epoch": 16.772151898734176, "grad_norm": 0.06301240622997284, "learning_rate": 3.87819917891116e-06, "loss": 0.0296, "num_input_tokens_seen": 1200032, "step": 2650 }, { "epoch": 16.803797468354432, "grad_norm": 0.18006472289562225, "learning_rate": 3.8046533591478556e-06, "loss": 0.0656, "num_input_tokens_seen": 1202240, "step": 2655 }, { "epoch": 16.835443037974684, "grad_norm": 0.4730052053928375, "learning_rate": 3.7317541159940657e-06, "loss": 0.0615, "num_input_tokens_seen": 1204576, "step": 2660 }, { "epoch": 16.867088607594937, "grad_norm": 0.30968526005744934, "learning_rate": 3.659503673284409e-06, "loss": 0.099, "num_input_tokens_seen": 1206816, "step": 2665 }, { "epoch": 16.89873417721519, "grad_norm": 0.4650189280509949, "learning_rate": 3.5879042350614482e-06, "loss": 0.1582, "num_input_tokens_seen": 1209120, "step": 2670 }, { "epoch": 16.930379746835442, "grad_norm": 0.4333466589450836, "learning_rate": 3.516957985508476e-06, "loss": 0.0654, "num_input_tokens_seen": 1211552, "step": 2675 }, { "epoch": 16.962025316455698, "grad_norm": 0.05511218681931496, "learning_rate": 3.4466670888828794e-06, "loss": 0.0446, "num_input_tokens_seen": 1213632, "step": 2680 }, { "epoch": 16.99367088607595, "grad_norm": 0.5889723896980286, "learning_rate": 3.377033689450115e-06, "loss": 0.0959, "num_input_tokens_seen": 1215968, "step": 2685 }, { "epoch": 17.0, "eval_loss": 0.2168864607810974, "eval_runtime": 1.0486, "eval_samples_per_second": 66.758, "eval_steps_per_second": 17.166, "num_input_tokens_seen": 1216160, "step": 2686 }, { "epoch": 17.025316455696203, "grad_norm": 0.27847614884376526, "learning_rate": 3.3080599114183043e-06, "loss": 0.1202, "num_input_tokens_seen": 1217888, "step": 2690 }, { "epoch": 17.056962025316455, "grad_norm": 0.24617423117160797, "learning_rate": 3.2397478588734043e-06, "loss": 0.1354, "num_input_tokens_seen": 1220128, "step": 2695 }, { "epoch": 17.088607594936708, "grad_norm": 0.3356018364429474, "learning_rate": 3.1720996157150657e-06, "loss": 0.077, "num_input_tokens_seen": 1222432, "step": 2700 }, { "epoch": 17.120253164556964, "grad_norm": 0.2386729121208191, "learning_rate": 3.1051172455930394e-06, "loss": 0.0669, "num_input_tokens_seen": 1224768, "step": 2705 }, { "epoch": 17.151898734177216, "grad_norm": 0.9189938306808472, "learning_rate": 3.0388027918442086e-06, "loss": 0.1163, "num_input_tokens_seen": 1227136, "step": 2710 }, { "epoch": 17.18354430379747, "grad_norm": 0.24651862680912018, "learning_rate": 2.9731582774302905e-06, "loss": 0.0665, "num_input_tokens_seen": 1229344, "step": 2715 }, { "epoch": 17.21518987341772, "grad_norm": 0.1498272866010666, "learning_rate": 2.908185704876101e-06, "loss": 0.0772, "num_input_tokens_seen": 1231552, "step": 2720 }, { "epoch": 17.246835443037973, "grad_norm": 0.5082818269729614, "learning_rate": 2.8438870562084523e-06, "loss": 0.0804, "num_input_tokens_seen": 1233824, "step": 2725 }, { "epoch": 17.27848101265823, "grad_norm": 0.6192799806594849, "learning_rate": 2.7802642928957458e-06, "loss": 0.0775, "num_input_tokens_seen": 1236160, "step": 2730 }, { "epoch": 17.310126582278482, "grad_norm": 0.47422462701797485, "learning_rate": 2.7173193557880615e-06, "loss": 0.0984, "num_input_tokens_seen": 1238464, "step": 2735 }, { "epoch": 17.341772151898734, "grad_norm": 0.37722766399383545, "learning_rate": 2.6550541650580186e-06, "loss": 0.1213, "num_input_tokens_seen": 1240704, "step": 2740 }, { "epoch": 17.373417721518987, "grad_norm": 0.18606220185756683, "learning_rate": 2.593470620142155e-06, "loss": 0.0808, "num_input_tokens_seen": 1243008, "step": 2745 }, { "epoch": 17.40506329113924, "grad_norm": 0.4717780351638794, "learning_rate": 2.5325705996829956e-06, "loss": 0.0544, "num_input_tokens_seen": 1245184, "step": 2750 }, { "epoch": 17.436708860759495, "grad_norm": 0.4897431433200836, "learning_rate": 2.472355961471762e-06, "loss": 0.0512, "num_input_tokens_seen": 1247456, "step": 2755 }, { "epoch": 17.468354430379748, "grad_norm": 0.18225190043449402, "learning_rate": 2.4128285423916735e-06, "loss": 0.0646, "num_input_tokens_seen": 1249760, "step": 2760 }, { "epoch": 17.5, "grad_norm": 0.4073273241519928, "learning_rate": 2.3539901583619185e-06, "loss": 0.0874, "num_input_tokens_seen": 1252032, "step": 2765 }, { "epoch": 17.531645569620252, "grad_norm": 0.41607582569122314, "learning_rate": 2.2958426042822806e-06, "loss": 0.1236, "num_input_tokens_seen": 1254240, "step": 2770 }, { "epoch": 17.563291139240505, "grad_norm": 0.6229532361030579, "learning_rate": 2.2383876539783493e-06, "loss": 0.1414, "num_input_tokens_seen": 1256512, "step": 2775 }, { "epoch": 17.59493670886076, "grad_norm": 0.12924204766750336, "learning_rate": 2.181627060147423e-06, "loss": 0.0315, "num_input_tokens_seen": 1258784, "step": 2780 }, { "epoch": 17.626582278481013, "grad_norm": 0.5748106241226196, "learning_rate": 2.125562554305069e-06, "loss": 0.0901, "num_input_tokens_seen": 1261024, "step": 2785 }, { "epoch": 17.658227848101266, "grad_norm": 0.24264433979988098, "learning_rate": 2.0701958467322452e-06, "loss": 0.0821, "num_input_tokens_seen": 1263392, "step": 2790 }, { "epoch": 17.689873417721518, "grad_norm": 0.28921446204185486, "learning_rate": 2.0155286264231856e-06, "loss": 0.1439, "num_input_tokens_seen": 1265600, "step": 2795 }, { "epoch": 17.72151898734177, "grad_norm": 0.06597453355789185, "learning_rate": 1.9615625610338445e-06, "loss": 0.0505, "num_input_tokens_seen": 1267808, "step": 2800 }, { "epoch": 17.753164556962027, "grad_norm": 0.04219668358564377, "learning_rate": 1.908299296831012e-06, "loss": 0.0594, "num_input_tokens_seen": 1270016, "step": 2805 }, { "epoch": 17.78481012658228, "grad_norm": 0.10402850806713104, "learning_rate": 1.8557404586421413e-06, "loss": 0.0716, "num_input_tokens_seen": 1272448, "step": 2810 }, { "epoch": 17.81645569620253, "grad_norm": 0.3619997203350067, "learning_rate": 1.8038876498057329e-06, "loss": 0.1263, "num_input_tokens_seen": 1274848, "step": 2815 }, { "epoch": 17.848101265822784, "grad_norm": 0.26856812834739685, "learning_rate": 1.7527424521224384e-06, "loss": 0.074, "num_input_tokens_seen": 1277152, "step": 2820 }, { "epoch": 17.879746835443036, "grad_norm": 0.13303466141223907, "learning_rate": 1.7023064258068377e-06, "loss": 0.0599, "num_input_tokens_seen": 1279488, "step": 2825 }, { "epoch": 17.911392405063292, "grad_norm": 0.4508163034915924, "learning_rate": 1.652581109439788e-06, "loss": 0.1143, "num_input_tokens_seen": 1281728, "step": 2830 }, { "epoch": 17.943037974683545, "grad_norm": 0.35757380723953247, "learning_rate": 1.6035680199215391e-06, "loss": 0.1011, "num_input_tokens_seen": 1284000, "step": 2835 }, { "epoch": 17.974683544303797, "grad_norm": 0.2496608942747116, "learning_rate": 1.5552686524254345e-06, "loss": 0.0386, "num_input_tokens_seen": 1286176, "step": 2840 }, { "epoch": 18.0, "eval_loss": 0.2155109941959381, "eval_runtime": 1.0547, "eval_samples_per_second": 66.37, "eval_steps_per_second": 17.066, "num_input_tokens_seen": 1287728, "step": 2844 }, { "epoch": 18.00632911392405, "grad_norm": 0.2967507243156433, "learning_rate": 1.5076844803522922e-06, "loss": 0.1161, "num_input_tokens_seen": 1288176, "step": 2845 }, { "epoch": 18.037974683544302, "grad_norm": 0.3860829472541809, "learning_rate": 1.460816955285485e-06, "loss": 0.0729, "num_input_tokens_seen": 1290384, "step": 2850 }, { "epoch": 18.069620253164558, "grad_norm": 0.4198305606842041, "learning_rate": 1.4146675069466403e-06, "loss": 0.1043, "num_input_tokens_seen": 1292592, "step": 2855 }, { "epoch": 18.10126582278481, "grad_norm": 0.3928205072879791, "learning_rate": 1.369237543152016e-06, "loss": 0.0525, "num_input_tokens_seen": 1294864, "step": 2860 }, { "epoch": 18.132911392405063, "grad_norm": 0.5683562159538269, "learning_rate": 1.3245284497695993e-06, "loss": 0.0617, "num_input_tokens_seen": 1297168, "step": 2865 }, { "epoch": 18.164556962025316, "grad_norm": 0.4196165204048157, "learning_rate": 1.2805415906767621e-06, "loss": 0.0774, "num_input_tokens_seen": 1299600, "step": 2870 }, { "epoch": 18.196202531645568, "grad_norm": 0.1231977716088295, "learning_rate": 1.2372783077187117e-06, "loss": 0.0218, "num_input_tokens_seen": 1301808, "step": 2875 }, { "epoch": 18.227848101265824, "grad_norm": 0.30443814396858215, "learning_rate": 1.1947399206675369e-06, "loss": 0.0666, "num_input_tokens_seen": 1304144, "step": 2880 }, { "epoch": 18.259493670886076, "grad_norm": 0.21611914038658142, "learning_rate": 1.152927727181935e-06, "loss": 0.1095, "num_input_tokens_seen": 1306352, "step": 2885 }, { "epoch": 18.29113924050633, "grad_norm": 0.64686518907547, "learning_rate": 1.1118430027676486e-06, "loss": 0.1, "num_input_tokens_seen": 1308528, "step": 2890 }, { "epoch": 18.32278481012658, "grad_norm": 0.35639044642448425, "learning_rate": 1.0714870007385497e-06, "loss": 0.0659, "num_input_tokens_seen": 1310704, "step": 2895 }, { "epoch": 18.354430379746834, "grad_norm": 0.5319870710372925, "learning_rate": 1.0318609521783818e-06, "loss": 0.1447, "num_input_tokens_seen": 1313008, "step": 2900 }, { "epoch": 18.38607594936709, "grad_norm": 0.3170202374458313, "learning_rate": 9.929660659032475e-07, "loss": 0.1658, "num_input_tokens_seen": 1315280, "step": 2905 }, { "epoch": 18.417721518987342, "grad_norm": 0.13067427277565002, "learning_rate": 9.548035284246998e-07, "loss": 0.1333, "num_input_tokens_seen": 1317488, "step": 2910 }, { "epoch": 18.449367088607595, "grad_norm": 0.691867470741272, "learning_rate": 9.173745039135622e-07, "loss": 0.0966, "num_input_tokens_seen": 1319696, "step": 2915 }, { "epoch": 18.481012658227847, "grad_norm": 0.15655472874641418, "learning_rate": 8.806801341644022e-07, "loss": 0.0564, "num_input_tokens_seen": 1321840, "step": 2920 }, { "epoch": 18.5126582278481, "grad_norm": 0.18627077341079712, "learning_rate": 8.447215385607138e-07, "loss": 0.0356, "num_input_tokens_seen": 1324048, "step": 2925 }, { "epoch": 18.544303797468356, "grad_norm": 0.8122642636299133, "learning_rate": 8.094998140407678e-07, "loss": 0.1526, "num_input_tokens_seen": 1326416, "step": 2930 }, { "epoch": 18.575949367088608, "grad_norm": 0.3567385971546173, "learning_rate": 7.750160350641467e-07, "loss": 0.0635, "num_input_tokens_seen": 1328624, "step": 2935 }, { "epoch": 18.60759493670886, "grad_norm": 0.27645525336265564, "learning_rate": 7.41271253578954e-07, "loss": 0.0786, "num_input_tokens_seen": 1330992, "step": 2940 }, { "epoch": 18.639240506329113, "grad_norm": 0.40808364748954773, "learning_rate": 7.082664989897487e-07, "loss": 0.1048, "num_input_tokens_seen": 1333360, "step": 2945 }, { "epoch": 18.67088607594937, "grad_norm": 0.21505287289619446, "learning_rate": 6.760027781261336e-07, "loss": 0.0412, "num_input_tokens_seen": 1335536, "step": 2950 }, { "epoch": 18.70253164556962, "grad_norm": 0.3021751046180725, "learning_rate": 6.444810752120278e-07, "loss": 0.1036, "num_input_tokens_seen": 1337872, "step": 2955 }, { "epoch": 18.734177215189874, "grad_norm": 0.29500246047973633, "learning_rate": 6.137023518356599e-07, "loss": 0.0896, "num_input_tokens_seen": 1340208, "step": 2960 }, { "epoch": 18.765822784810126, "grad_norm": 0.747681200504303, "learning_rate": 5.836675469202247e-07, "loss": 0.0836, "num_input_tokens_seen": 1342448, "step": 2965 }, { "epoch": 18.79746835443038, "grad_norm": 0.3954211473464966, "learning_rate": 5.543775766952447e-07, "loss": 0.0857, "num_input_tokens_seen": 1344720, "step": 2970 }, { "epoch": 18.82911392405063, "grad_norm": 0.3156197667121887, "learning_rate": 5.258333346686211e-07, "loss": 0.0607, "num_input_tokens_seen": 1347088, "step": 2975 }, { "epoch": 18.860759493670887, "grad_norm": 0.18224351108074188, "learning_rate": 4.980356915993661e-07, "loss": 0.0819, "num_input_tokens_seen": 1349360, "step": 2980 }, { "epoch": 18.89240506329114, "grad_norm": 0.4493334889411926, "learning_rate": 4.709854954710602e-07, "loss": 0.0847, "num_input_tokens_seen": 1351600, "step": 2985 }, { "epoch": 18.924050632911392, "grad_norm": 0.3458345830440521, "learning_rate": 4.4468357146596475e-07, "loss": 0.0962, "num_input_tokens_seen": 1353776, "step": 2990 }, { "epoch": 18.955696202531644, "grad_norm": 0.4429856240749359, "learning_rate": 4.191307219398588e-07, "loss": 0.1347, "num_input_tokens_seen": 1356080, "step": 2995 }, { "epoch": 18.9873417721519, "grad_norm": 0.6942119598388672, "learning_rate": 3.943277263975559e-07, "loss": 0.0871, "num_input_tokens_seen": 1358448, "step": 3000 }, { "epoch": 19.0, "eval_loss": 0.21432405710220337, "eval_runtime": 1.0554, "eval_samples_per_second": 66.325, "eval_steps_per_second": 17.055, "num_input_tokens_seen": 1359120, "step": 3002 }, { "epoch": 19.018987341772153, "grad_norm": 0.4526669681072235, "learning_rate": 3.7027534146913676e-07, "loss": 0.1245, "num_input_tokens_seen": 1360464, "step": 3005 }, { "epoch": 19.050632911392405, "grad_norm": 0.05428961664438248, "learning_rate": 3.469743008868542e-07, "loss": 0.0609, "num_input_tokens_seen": 1362832, "step": 3010 }, { "epoch": 19.082278481012658, "grad_norm": 0.5243335366249084, "learning_rate": 3.244253154627619e-07, "loss": 0.058, "num_input_tokens_seen": 1365072, "step": 3015 }, { "epoch": 19.11392405063291, "grad_norm": 0.1941998451948166, "learning_rate": 3.026290730670206e-07, "loss": 0.0696, "num_input_tokens_seen": 1367440, "step": 3020 }, { "epoch": 19.145569620253166, "grad_norm": 0.3650839626789093, "learning_rate": 2.8158623860692344e-07, "loss": 0.0703, "num_input_tokens_seen": 1369744, "step": 3025 }, { "epoch": 19.17721518987342, "grad_norm": 0.3534727394580841, "learning_rate": 2.612974540065982e-07, "loss": 0.1613, "num_input_tokens_seen": 1371984, "step": 3030 }, { "epoch": 19.20886075949367, "grad_norm": 0.21053653955459595, "learning_rate": 2.417633381874534e-07, "loss": 0.0604, "num_input_tokens_seen": 1374288, "step": 3035 }, { "epoch": 19.240506329113924, "grad_norm": 0.3834356665611267, "learning_rate": 2.2298448704926034e-07, "loss": 0.0791, "num_input_tokens_seen": 1376496, "step": 3040 }, { "epoch": 19.272151898734176, "grad_norm": 0.39977359771728516, "learning_rate": 2.0496147345200055e-07, "loss": 0.1412, "num_input_tokens_seen": 1378736, "step": 3045 }, { "epoch": 19.303797468354432, "grad_norm": 0.35632604360580444, "learning_rate": 1.8769484719839136e-07, "loss": 0.0683, "num_input_tokens_seen": 1380944, "step": 3050 }, { "epoch": 19.335443037974684, "grad_norm": 0.11820244044065475, "learning_rate": 1.7118513501709066e-07, "loss": 0.0494, "num_input_tokens_seen": 1383248, "step": 3055 }, { "epoch": 19.367088607594937, "grad_norm": 0.19507254660129547, "learning_rate": 1.5543284054665708e-07, "loss": 0.0989, "num_input_tokens_seen": 1385488, "step": 3060 }, { "epoch": 19.39873417721519, "grad_norm": 0.07385202497243881, "learning_rate": 1.4043844432016506e-07, "loss": 0.1038, "num_input_tokens_seen": 1387760, "step": 3065 }, { "epoch": 19.430379746835442, "grad_norm": 0.1789783090353012, "learning_rate": 1.2620240375055826e-07, "loss": 0.0772, "num_input_tokens_seen": 1390192, "step": 3070 }, { "epoch": 19.462025316455698, "grad_norm": 0.15113328397274017, "learning_rate": 1.1272515311669119e-07, "loss": 0.0799, "num_input_tokens_seen": 1392464, "step": 3075 }, { "epoch": 19.49367088607595, "grad_norm": 0.29063722491264343, "learning_rate": 1.0000710355008159e-07, "loss": 0.0707, "num_input_tokens_seen": 1394768, "step": 3080 }, { "epoch": 19.525316455696203, "grad_norm": 0.1928526759147644, "learning_rate": 8.804864302236482e-08, "loss": 0.1053, "num_input_tokens_seen": 1397072, "step": 3085 }, { "epoch": 19.556962025316455, "grad_norm": 0.4062618315219879, "learning_rate": 7.685013633347005e-08, "loss": 0.0658, "num_input_tokens_seen": 1399344, "step": 3090 }, { "epoch": 19.588607594936708, "grad_norm": 0.11600256711244583, "learning_rate": 6.641192510047634e-08, "loss": 0.1311, "num_input_tokens_seen": 1401552, "step": 3095 }, { "epoch": 19.620253164556964, "grad_norm": 0.14116211235523224, "learning_rate": 5.673432774720433e-08, "loss": 0.0816, "num_input_tokens_seen": 1403760, "step": 3100 }, { "epoch": 19.651898734177216, "grad_norm": 0.31183066964149475, "learning_rate": 4.7817639494499025e-08, "loss": 0.0421, "num_input_tokens_seen": 1406000, "step": 3105 }, { "epoch": 19.68354430379747, "grad_norm": 0.48970139026641846, "learning_rate": 3.9662132351214744e-08, "loss": 0.1217, "num_input_tokens_seen": 1408208, "step": 3110 }, { "epoch": 19.71518987341772, "grad_norm": 0.15410216152668, "learning_rate": 3.2268055105932894e-08, "loss": 0.0513, "num_input_tokens_seen": 1410416, "step": 3115 }, { "epoch": 19.746835443037973, "grad_norm": 0.3174501955509186, "learning_rate": 2.5635633319359698e-08, "loss": 0.033, "num_input_tokens_seen": 1412592, "step": 3120 }, { "epoch": 19.77848101265823, "grad_norm": 0.3811391294002533, "learning_rate": 1.976506931745392e-08, "loss": 0.0805, "num_input_tokens_seen": 1414960, "step": 3125 }, { "epoch": 19.810126582278482, "grad_norm": 0.37022286653518677, "learning_rate": 1.465654218524848e-08, "loss": 0.1069, "num_input_tokens_seen": 1417232, "step": 3130 }, { "epoch": 19.841772151898734, "grad_norm": 0.2080419808626175, "learning_rate": 1.0310207761388135e-08, "loss": 0.0578, "num_input_tokens_seen": 1419568, "step": 3135 }, { "epoch": 19.873417721518987, "grad_norm": 0.470682829618454, "learning_rate": 6.726198633386083e-09, "loss": 0.1281, "num_input_tokens_seen": 1421776, "step": 3140 }, { "epoch": 19.90506329113924, "grad_norm": 0.3464868664741516, "learning_rate": 3.904624133560519e-09, "loss": 0.0797, "num_input_tokens_seen": 1424016, "step": 3145 }, { "epoch": 19.936708860759495, "grad_norm": 0.17444895207881927, "learning_rate": 1.84557033571231e-09, "loss": 0.0434, "num_input_tokens_seen": 1426288, "step": 3150 }, { "epoch": 19.968354430379748, "grad_norm": 0.4138510525226593, "learning_rate": 5.491000525020828e-10, "loss": 0.1777, "num_input_tokens_seen": 1428592, "step": 3155 }, { "epoch": 20.0, "grad_norm": 0.13195763528347015, "learning_rate": 1.5252833521217648e-11, "loss": 0.0834, "num_input_tokens_seen": 1430592, "step": 3160 }, { "epoch": 20.0, "eval_loss": 0.214238703250885, "eval_runtime": 1.0589, "eval_samples_per_second": 66.108, "eval_steps_per_second": 16.999, "num_input_tokens_seen": 1430592, "step": 3160 }, { "epoch": 20.0, "num_input_tokens_seen": 1430592, "step": 3160, "total_flos": 6.442059877318656e+16, "train_loss": 0.39732415456136194, "train_runtime": 576.3251, "train_samples_per_second": 21.863, "train_steps_per_second": 5.483 } ], "logging_steps": 5, "max_steps": 3160, "num_input_tokens_seen": 1430592, "num_train_epochs": 20, "save_steps": 158, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6.442059877318656e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }