{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.928, "eval_steps": 500, "global_step": 310, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.08, "grad_norm": 1.009501338005066, "learning_rate": 9.993582535855265e-06, "loss": 0.4352, "num_input_tokens_seen": 904912, "step": 5 }, { "epoch": 0.16, "grad_norm": 0.7794860601425171, "learning_rate": 9.974346616959476e-06, "loss": 0.388, "num_input_tokens_seen": 1808376, "step": 10 }, { "epoch": 0.24, "grad_norm": 0.5388593673706055, "learning_rate": 9.942341621640558e-06, "loss": 0.3875, "num_input_tokens_seen": 2684696, "step": 15 }, { "epoch": 0.32, "grad_norm": 0.46523410081863403, "learning_rate": 9.897649706262474e-06, "loss": 0.3959, "num_input_tokens_seen": 3596136, "step": 20 }, { "epoch": 0.4, "grad_norm": 0.5142658352851868, "learning_rate": 9.840385594331022e-06, "loss": 0.3718, "num_input_tokens_seen": 4549488, "step": 25 }, { "epoch": 0.48, "grad_norm": 0.3676227927207947, "learning_rate": 9.770696282000245e-06, "loss": 0.3597, "num_input_tokens_seen": 5432232, "step": 30 }, { "epoch": 0.56, "grad_norm": 0.3568161725997925, "learning_rate": 9.688760660735403e-06, "loss": 0.3683, "num_input_tokens_seen": 6289320, "step": 35 }, { "epoch": 0.64, "grad_norm": 0.2971496284008026, "learning_rate": 9.594789058101154e-06, "loss": 0.3701, "num_input_tokens_seen": 7178328, "step": 40 }, { "epoch": 0.72, "grad_norm": 0.3190518021583557, "learning_rate": 9.48902269785371e-06, "loss": 0.3796, "num_input_tokens_seen": 8074600, "step": 45 }, { "epoch": 0.8, "grad_norm": 0.32561618089675903, "learning_rate": 9.371733080722911e-06, "loss": 0.3451, "num_input_tokens_seen": 9044168, "step": 50 }, { "epoch": 0.88, "grad_norm": 0.3187696635723114, "learning_rate": 9.243221287473755e-06, "loss": 0.3526, "num_input_tokens_seen": 9969944, "step": 55 }, { "epoch": 0.96, "grad_norm": 0.30837559700012207, "learning_rate": 9.103817206036383e-06, "loss": 0.3386, "num_input_tokens_seen": 10909008, "step": 60 }, { "epoch": 1.032, "grad_norm": 0.4271972179412842, "learning_rate": 8.953878684688492e-06, "loss": 0.3002, "num_input_tokens_seen": 11663840, "step": 65 }, { "epoch": 1.112, "grad_norm": 0.4927215874195099, "learning_rate": 8.793790613463956e-06, "loss": 0.2818, "num_input_tokens_seen": 12553064, "step": 70 }, { "epoch": 1.192, "grad_norm": 0.4385733902454376, "learning_rate": 8.6239639361456e-06, "loss": 0.2778, "num_input_tokens_seen": 13469144, "step": 75 }, { "epoch": 1.272, "grad_norm": 0.43125155568122864, "learning_rate": 8.444834595378434e-06, "loss": 0.2691, "num_input_tokens_seen": 14406256, "step": 80 }, { "epoch": 1.3519999999999999, "grad_norm": 0.3859022557735443, "learning_rate": 8.256862413611113e-06, "loss": 0.262, "num_input_tokens_seen": 15305328, "step": 85 }, { "epoch": 1.432, "grad_norm": 0.46962714195251465, "learning_rate": 8.060529912738316e-06, "loss": 0.2805, "num_input_tokens_seen": 16259688, "step": 90 }, { "epoch": 1.512, "grad_norm": 0.3477609157562256, "learning_rate": 7.856341075473963e-06, "loss": 0.2775, "num_input_tokens_seen": 17167192, "step": 95 }, { "epoch": 1.592, "grad_norm": 0.41423365473747253, "learning_rate": 7.644820051634813e-06, "loss": 0.2645, "num_input_tokens_seen": 18059304, "step": 100 }, { "epoch": 1.6720000000000002, "grad_norm": 0.32248997688293457, "learning_rate": 7.4265098126554065e-06, "loss": 0.2773, "num_input_tokens_seen": 18973864, "step": 105 }, { "epoch": 1.752, "grad_norm": 0.3673882782459259, "learning_rate": 7.201970757788172e-06, "loss": 0.2867, "num_input_tokens_seen": 19877832, "step": 110 }, { "epoch": 1.8319999999999999, "grad_norm": 0.2969827353954315, "learning_rate": 6.971779275566593e-06, "loss": 0.2697, "num_input_tokens_seen": 20793160, "step": 115 }, { "epoch": 1.912, "grad_norm": 0.3408501148223877, "learning_rate": 6.736526264224101e-06, "loss": 0.264, "num_input_tokens_seen": 21662200, "step": 120 }, { "epoch": 1.992, "grad_norm": 0.3277493715286255, "learning_rate": 6.496815614866792e-06, "loss": 0.2734, "num_input_tokens_seen": 22579968, "step": 125 }, { "epoch": 2.064, "grad_norm": 0.39144793152809143, "learning_rate": 6.2532626612936035e-06, "loss": 0.2438, "num_input_tokens_seen": 23409800, "step": 130 }, { "epoch": 2.144, "grad_norm": 0.3468864858150482, "learning_rate": 6.006492600443301e-06, "loss": 0.2017, "num_input_tokens_seen": 24285480, "step": 135 }, { "epoch": 2.224, "grad_norm": 0.36446893215179443, "learning_rate": 5.757138887522884e-06, "loss": 0.2014, "num_input_tokens_seen": 25155560, "step": 140 }, { "epoch": 2.304, "grad_norm": 0.2912499010562897, "learning_rate": 5.505841609937162e-06, "loss": 0.193, "num_input_tokens_seen": 26045360, "step": 145 }, { "epoch": 2.384, "grad_norm": 0.32886043190956116, "learning_rate": 5.253245844193564e-06, "loss": 0.204, "num_input_tokens_seen": 26953512, "step": 150 }, { "epoch": 2.464, "grad_norm": 0.28212445974349976, "learning_rate": 5e-06, "loss": 0.2224, "num_input_tokens_seen": 27875320, "step": 155 }, { "epoch": 2.544, "grad_norm": 0.35469871759414673, "learning_rate": 4.746754155806437e-06, "loss": 0.2122, "num_input_tokens_seen": 28786896, "step": 160 }, { "epoch": 2.624, "grad_norm": 0.285258024930954, "learning_rate": 4.49415839006284e-06, "loss": 0.1976, "num_input_tokens_seen": 29630128, "step": 165 }, { "epoch": 2.7039999999999997, "grad_norm": 0.3201941251754761, "learning_rate": 4.2428611124771184e-06, "loss": 0.1935, "num_input_tokens_seen": 30503816, "step": 170 }, { "epoch": 2.784, "grad_norm": 0.3066469430923462, "learning_rate": 3.993507399556699e-06, "loss": 0.2237, "num_input_tokens_seen": 31433032, "step": 175 }, { "epoch": 2.864, "grad_norm": 0.2676903009414673, "learning_rate": 3.7467373387063973e-06, "loss": 0.2002, "num_input_tokens_seen": 32347864, "step": 180 }, { "epoch": 2.944, "grad_norm": 0.30449098348617554, "learning_rate": 3.5031843851332105e-06, "loss": 0.2138, "num_input_tokens_seen": 33346576, "step": 185 }, { "epoch": 3.016, "grad_norm": 0.478014200925827, "learning_rate": 3.2634737357758994e-06, "loss": 0.1904, "num_input_tokens_seen": 34219176, "step": 190 }, { "epoch": 3.096, "grad_norm": 0.2607128620147705, "learning_rate": 3.0282207244334084e-06, "loss": 0.1643, "num_input_tokens_seen": 35158432, "step": 195 }, { "epoch": 3.176, "grad_norm": 0.29989877343177795, "learning_rate": 2.7980292422118282e-06, "loss": 0.16, "num_input_tokens_seen": 36055520, "step": 200 }, { "epoch": 3.2560000000000002, "grad_norm": 0.2684940695762634, "learning_rate": 2.573490187344596e-06, "loss": 0.1493, "num_input_tokens_seen": 36983672, "step": 205 }, { "epoch": 3.336, "grad_norm": 0.2773168087005615, "learning_rate": 2.3551799483651894e-06, "loss": 0.1436, "num_input_tokens_seen": 37837680, "step": 210 }, { "epoch": 3.416, "grad_norm": 0.33160483837127686, "learning_rate": 2.1436589245260375e-06, "loss": 0.154, "num_input_tokens_seen": 38766568, "step": 215 }, { "epoch": 3.496, "grad_norm": 0.3284499943256378, "learning_rate": 1.9394700872616856e-06, "loss": 0.1658, "num_input_tokens_seen": 39667328, "step": 220 }, { "epoch": 3.576, "grad_norm": 0.27978459000587463, "learning_rate": 1.74313758638889e-06, "loss": 0.1472, "num_input_tokens_seen": 40532840, "step": 225 }, { "epoch": 3.656, "grad_norm": 0.2714602053165436, "learning_rate": 1.555165404621567e-06, "loss": 0.1654, "num_input_tokens_seen": 41498224, "step": 230 }, { "epoch": 3.7359999999999998, "grad_norm": 0.2441283017396927, "learning_rate": 1.3760360638544012e-06, "loss": 0.1583, "num_input_tokens_seen": 42449096, "step": 235 }, { "epoch": 3.816, "grad_norm": 0.32902443408966064, "learning_rate": 1.2062093865360458e-06, "loss": 0.1463, "num_input_tokens_seen": 43293600, "step": 240 }, { "epoch": 3.896, "grad_norm": 0.3130110204219818, "learning_rate": 1.046121315311508e-06, "loss": 0.1599, "num_input_tokens_seen": 44190712, "step": 245 }, { "epoch": 3.976, "grad_norm": 0.2552832067012787, "learning_rate": 8.961827939636198e-07, "loss": 0.1609, "num_input_tokens_seen": 45096112, "step": 250 }, { "epoch": 4.048, "grad_norm": 0.2593518793582916, "learning_rate": 7.567787125262449e-07, "loss": 0.1428, "num_input_tokens_seen": 45916648, "step": 255 }, { "epoch": 4.128, "grad_norm": 0.20947769284248352, "learning_rate": 6.282669192770896e-07, "loss": 0.1216, "num_input_tokens_seen": 46801776, "step": 260 }, { "epoch": 4.208, "grad_norm": 0.22543282806873322, "learning_rate": 5.109773021462921e-07, "loss": 0.1309, "num_input_tokens_seen": 47685704, "step": 265 }, { "epoch": 4.288, "grad_norm": 0.24011245369911194, "learning_rate": 4.05210941898847e-07, "loss": 0.1346, "num_input_tokens_seen": 48567032, "step": 270 }, { "epoch": 4.368, "grad_norm": 0.22010523080825806, "learning_rate": 3.112393392645985e-07, "loss": 0.1318, "num_input_tokens_seen": 49491568, "step": 275 }, { "epoch": 4.448, "grad_norm": 0.1899755746126175, "learning_rate": 2.2930371799975593e-07, "loss": 0.1355, "num_input_tokens_seen": 50442584, "step": 280 }, { "epoch": 4.5280000000000005, "grad_norm": 0.20433053374290466, "learning_rate": 1.5961440566897913e-07, "loss": 0.1418, "num_input_tokens_seen": 51413272, "step": 285 }, { "epoch": 4.608, "grad_norm": 0.23340986669063568, "learning_rate": 1.0235029373752758e-07, "loss": 0.1358, "num_input_tokens_seen": 52276024, "step": 290 }, { "epoch": 4.688, "grad_norm": 0.20058894157409668, "learning_rate": 5.7658378359443104e-08, "loss": 0.1319, "num_input_tokens_seen": 53176640, "step": 295 }, { "epoch": 4.768, "grad_norm": 0.21048085391521454, "learning_rate": 2.5653383040524228e-08, "loss": 0.1401, "num_input_tokens_seen": 54130624, "step": 300 }, { "epoch": 4.848, "grad_norm": 0.21626824140548706, "learning_rate": 6.417464144736208e-09, "loss": 0.1395, "num_input_tokens_seen": 55018008, "step": 305 }, { "epoch": 4.928, "grad_norm": 0.22036929428577423, "learning_rate": 0.0, "loss": 0.1385, "num_input_tokens_seen": 55950608, "step": 310 }, { "epoch": 4.928, "num_input_tokens_seen": 55950608, "step": 310, "total_flos": 375241330655232.0, "train_loss": 0.23023235970927822, "train_runtime": 24795.9426, "train_samples_per_second": 0.202, "train_steps_per_second": 0.013 } ], "logging_steps": 5, "max_steps": 310, "num_input_tokens_seen": 55950608, "num_train_epochs": 5, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 375241330655232.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }