{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 4.928,
  "eval_steps": 500,
  "global_step": 310,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.08,
      "grad_norm": 1.009501338005066,
      "learning_rate": 9.993582535855265e-06,
      "loss": 0.4352,
      "num_input_tokens_seen": 904912,
      "step": 5
    },
    {
      "epoch": 0.16,
      "grad_norm": 0.7794860601425171,
      "learning_rate": 9.974346616959476e-06,
      "loss": 0.388,
      "num_input_tokens_seen": 1808376,
      "step": 10
    },
    {
      "epoch": 0.24,
      "grad_norm": 0.5388593673706055,
      "learning_rate": 9.942341621640558e-06,
      "loss": 0.3875,
      "num_input_tokens_seen": 2684696,
      "step": 15
    },
    {
      "epoch": 0.32,
      "grad_norm": 0.46523410081863403,
      "learning_rate": 9.897649706262474e-06,
      "loss": 0.3959,
      "num_input_tokens_seen": 3596136,
      "step": 20
    },
    {
      "epoch": 0.4,
      "grad_norm": 0.5142658352851868,
      "learning_rate": 9.840385594331022e-06,
      "loss": 0.3718,
      "num_input_tokens_seen": 4549488,
      "step": 25
    },
    {
      "epoch": 0.48,
      "grad_norm": 0.3676227927207947,
      "learning_rate": 9.770696282000245e-06,
      "loss": 0.3597,
      "num_input_tokens_seen": 5432232,
      "step": 30
    },
    {
      "epoch": 0.56,
      "grad_norm": 0.3568161725997925,
      "learning_rate": 9.688760660735403e-06,
      "loss": 0.3683,
      "num_input_tokens_seen": 6289320,
      "step": 35
    },
    {
      "epoch": 0.64,
      "grad_norm": 0.2971496284008026,
      "learning_rate": 9.594789058101154e-06,
      "loss": 0.3701,
      "num_input_tokens_seen": 7178328,
      "step": 40
    },
    {
      "epoch": 0.72,
      "grad_norm": 0.3190518021583557,
      "learning_rate": 9.48902269785371e-06,
      "loss": 0.3796,
      "num_input_tokens_seen": 8074600,
      "step": 45
    },
    {
      "epoch": 0.8,
      "grad_norm": 0.32561618089675903,
      "learning_rate": 9.371733080722911e-06,
      "loss": 0.3451,
      "num_input_tokens_seen": 9044168,
      "step": 50
    },
    {
      "epoch": 0.88,
      "grad_norm": 0.3187696635723114,
      "learning_rate": 9.243221287473755e-06,
      "loss": 0.3526,
      "num_input_tokens_seen": 9969944,
      "step": 55
    },
    {
      "epoch": 0.96,
      "grad_norm": 0.30837559700012207,
      "learning_rate": 9.103817206036383e-06,
      "loss": 0.3386,
      "num_input_tokens_seen": 10909008,
      "step": 60
    },
    {
      "epoch": 1.032,
      "grad_norm": 0.4271972179412842,
      "learning_rate": 8.953878684688492e-06,
      "loss": 0.3002,
      "num_input_tokens_seen": 11663840,
      "step": 65
    },
    {
      "epoch": 1.112,
      "grad_norm": 0.4927215874195099,
      "learning_rate": 8.793790613463956e-06,
      "loss": 0.2818,
      "num_input_tokens_seen": 12553064,
      "step": 70
    },
    {
      "epoch": 1.192,
      "grad_norm": 0.4385733902454376,
      "learning_rate": 8.6239639361456e-06,
      "loss": 0.2778,
      "num_input_tokens_seen": 13469144,
      "step": 75
    },
    {
      "epoch": 1.272,
      "grad_norm": 0.43125155568122864,
      "learning_rate": 8.444834595378434e-06,
      "loss": 0.2691,
      "num_input_tokens_seen": 14406256,
      "step": 80
    },
    {
      "epoch": 1.352,
      "grad_norm": 0.3859022557735443,
      "learning_rate": 8.256862413611113e-06,
      "loss": 0.262,
      "num_input_tokens_seen": 15305328,
      "step": 85
    },
    {
      "epoch": 1.432,
      "grad_norm": 0.46962714195251465,
      "learning_rate": 8.060529912738316e-06,
      "loss": 0.2805,
      "num_input_tokens_seen": 16259688,
      "step": 90
    },
    {
      "epoch": 1.512,
      "grad_norm": 0.3477609157562256,
      "learning_rate": 7.856341075473963e-06,
      "loss": 0.2775,
      "num_input_tokens_seen": 17167192,
      "step": 95
    },
    {
      "epoch": 1.592,
      "grad_norm": 0.41423365473747253,
      "learning_rate": 7.644820051634813e-06,
      "loss": 0.2645,
      "num_input_tokens_seen": 18059304,
      "step": 100
    },
    {
      "epoch": 1.672,
      "grad_norm": 0.32248997688293457,
      "learning_rate": 7.4265098126554065e-06,
      "loss": 0.2773,
      "num_input_tokens_seen": 18973864,
      "step": 105
    },
    {
      "epoch": 1.752,
      "grad_norm": 0.3673882782459259,
      "learning_rate": 7.201970757788172e-06,
      "loss": 0.2867,
      "num_input_tokens_seen": 19877832,
      "step": 110
    },
    {
      "epoch": 1.832,
      "grad_norm": 0.2969827353954315,
      "learning_rate": 6.971779275566593e-06,
      "loss": 0.2697,
      "num_input_tokens_seen": 20793160,
      "step": 115
    },
    {
      "epoch": 1.912,
      "grad_norm": 0.3408501148223877,
      "learning_rate": 6.736526264224101e-06,
      "loss": 0.264,
      "num_input_tokens_seen": 21662200,
      "step": 120
    },
    {
      "epoch": 1.992,
      "grad_norm": 0.3277493715286255,
      "learning_rate": 6.496815614866792e-06,
      "loss": 0.2734,
      "num_input_tokens_seen": 22579968,
      "step": 125
    },
    {
      "epoch": 2.064,
      "grad_norm": 0.39144793152809143,
      "learning_rate": 6.2532626612936035e-06,
      "loss": 0.2438,
      "num_input_tokens_seen": 23409800,
      "step": 130
    },
    {
      "epoch": 2.144,
      "grad_norm": 0.3468864858150482,
      "learning_rate": 6.006492600443301e-06,
      "loss": 0.2017,
      "num_input_tokens_seen": 24285480,
      "step": 135
    },
    {
      "epoch": 2.224,
      "grad_norm": 0.36446893215179443,
      "learning_rate": 5.757138887522884e-06,
      "loss": 0.2014,
      "num_input_tokens_seen": 25155560,
      "step": 140
    },
    {
      "epoch": 2.304,
      "grad_norm": 0.2912499010562897,
      "learning_rate": 5.505841609937162e-06,
      "loss": 0.193,
      "num_input_tokens_seen": 26045360,
      "step": 145
    },
    {
      "epoch": 2.384,
      "grad_norm": 0.32886043190956116,
      "learning_rate": 5.253245844193564e-06,
      "loss": 0.204,
      "num_input_tokens_seen": 26953512,
      "step": 150
    },
    {
      "epoch": 2.464,
      "grad_norm": 0.28212445974349976,
      "learning_rate": 5e-06,
      "loss": 0.2224,
      "num_input_tokens_seen": 27875320,
      "step": 155
    },
    {
      "epoch": 2.544,
      "grad_norm": 0.35469871759414673,
      "learning_rate": 4.746754155806437e-06,
      "loss": 0.2122,
      "num_input_tokens_seen": 28786896,
      "step": 160
    },
    {
      "epoch": 2.624,
      "grad_norm": 0.285258024930954,
      "learning_rate": 4.49415839006284e-06,
      "loss": 0.1976,
      "num_input_tokens_seen": 29630128,
      "step": 165
    },
    {
      "epoch": 2.704,
      "grad_norm": 0.3201941251754761,
      "learning_rate": 4.2428611124771184e-06,
      "loss": 0.1935,
      "num_input_tokens_seen": 30503816,
      "step": 170
    },
    {
      "epoch": 2.784,
      "grad_norm": 0.3066469430923462,
      "learning_rate": 3.993507399556699e-06,
      "loss": 0.2237,
      "num_input_tokens_seen": 31433032,
      "step": 175
    },
    {
      "epoch": 2.864,
      "grad_norm": 0.2676903009414673,
      "learning_rate": 3.7467373387063973e-06,
      "loss": 0.2002,
      "num_input_tokens_seen": 32347864,
      "step": 180
    },
    {
      "epoch": 2.944,
      "grad_norm": 0.30449098348617554,
      "learning_rate": 3.5031843851332105e-06,
      "loss": 0.2138,
      "num_input_tokens_seen": 33346576,
      "step": 185
    },
    {
      "epoch": 3.016,
      "grad_norm": 0.478014200925827,
      "learning_rate": 3.2634737357758994e-06,
      "loss": 0.1904,
      "num_input_tokens_seen": 34219176,
      "step": 190
    },
    {
      "epoch": 3.096,
      "grad_norm": 0.2607128620147705,
      "learning_rate": 3.0282207244334084e-06,
      "loss": 0.1643,
      "num_input_tokens_seen": 35158432,
      "step": 195
    },
    {
      "epoch": 3.176,
      "grad_norm": 0.29989877343177795,
      "learning_rate": 2.7980292422118282e-06,
      "loss": 0.16,
      "num_input_tokens_seen": 36055520,
      "step": 200
    },
    {
      "epoch": 3.256,
      "grad_norm": 0.2684940695762634,
      "learning_rate": 2.573490187344596e-06,
      "loss": 0.1493,
      "num_input_tokens_seen": 36983672,
      "step": 205
    },
    {
      "epoch": 3.336,
      "grad_norm": 0.2773168087005615,
      "learning_rate": 2.3551799483651894e-06,
      "loss": 0.1436,
      "num_input_tokens_seen": 37837680,
      "step": 210
    },
    {
      "epoch": 3.416,
      "grad_norm": 0.33160483837127686,
      "learning_rate": 2.1436589245260375e-06,
      "loss": 0.154,
      "num_input_tokens_seen": 38766568,
      "step": 215
    },
    {
      "epoch": 3.496,
      "grad_norm": 0.3284499943256378,
      "learning_rate": 1.9394700872616856e-06,
      "loss": 0.1658,
      "num_input_tokens_seen": 39667328,
      "step": 220
    },
    {
      "epoch": 3.576,
      "grad_norm": 0.27978459000587463,
      "learning_rate": 1.74313758638889e-06,
      "loss": 0.1472,
      "num_input_tokens_seen": 40532840,
      "step": 225
    },
    {
      "epoch": 3.656,
      "grad_norm": 0.2714602053165436,
      "learning_rate": 1.555165404621567e-06,
      "loss": 0.1654,
      "num_input_tokens_seen": 41498224,
      "step": 230
    },
    {
      "epoch": 3.736,
      "grad_norm": 0.2441283017396927,
      "learning_rate": 1.3760360638544012e-06,
      "loss": 0.1583,
      "num_input_tokens_seen": 42449096,
      "step": 235
    },
    {
      "epoch": 3.816,
      "grad_norm": 0.32902443408966064,
      "learning_rate": 1.2062093865360458e-06,
      "loss": 0.1463,
      "num_input_tokens_seen": 43293600,
      "step": 240
    },
    {
      "epoch": 3.896,
      "grad_norm": 0.3130110204219818,
      "learning_rate": 1.046121315311508e-06,
      "loss": 0.1599,
      "num_input_tokens_seen": 44190712,
      "step": 245
    },
    {
      "epoch": 3.976,
      "grad_norm": 0.2552832067012787,
      "learning_rate": 8.961827939636198e-07,
      "loss": 0.1609,
      "num_input_tokens_seen": 45096112,
      "step": 250
    },
    {
      "epoch": 4.048,
      "grad_norm": 0.2593518793582916,
      "learning_rate": 7.567787125262449e-07,
      "loss": 0.1428,
      "num_input_tokens_seen": 45916648,
      "step": 255
    },
    {
      "epoch": 4.128,
      "grad_norm": 0.20947769284248352,
      "learning_rate": 6.282669192770896e-07,
      "loss": 0.1216,
      "num_input_tokens_seen": 46801776,
      "step": 260
    },
    {
      "epoch": 4.208,
      "grad_norm": 0.22543282806873322,
      "learning_rate": 5.109773021462921e-07,
      "loss": 0.1309,
      "num_input_tokens_seen": 47685704,
      "step": 265
    },
    {
      "epoch": 4.288,
      "grad_norm": 0.24011245369911194,
      "learning_rate": 4.05210941898847e-07,
      "loss": 0.1346,
      "num_input_tokens_seen": 48567032,
      "step": 270
    },
    {
      "epoch": 4.368,
      "grad_norm": 0.22010523080825806,
      "learning_rate": 3.112393392645985e-07,
      "loss": 0.1318,
      "num_input_tokens_seen": 49491568,
      "step": 275
    },
    {
      "epoch": 4.448,
      "grad_norm": 0.1899755746126175,
      "learning_rate": 2.2930371799975593e-07,
      "loss": 0.1355,
      "num_input_tokens_seen": 50442584,
      "step": 280
    },
    {
      "epoch": 4.528,
      "grad_norm": 0.20433053374290466,
      "learning_rate": 1.5961440566897913e-07,
      "loss": 0.1418,
      "num_input_tokens_seen": 51413272,
      "step": 285
    },
    {
      "epoch": 4.608,
      "grad_norm": 0.23340986669063568,
      "learning_rate": 1.0235029373752758e-07,
      "loss": 0.1358,
      "num_input_tokens_seen": 52276024,
      "step": 290
    },
    {
      "epoch": 4.688,
      "grad_norm": 0.20058894157409668,
      "learning_rate": 5.7658378359443104e-08,
      "loss": 0.1319,
      "num_input_tokens_seen": 53176640,
      "step": 295
    },
    {
      "epoch": 4.768,
      "grad_norm": 0.21048085391521454,
      "learning_rate": 2.5653383040524228e-08,
      "loss": 0.1401,
      "num_input_tokens_seen": 54130624,
      "step": 300
    },
    {
      "epoch": 4.848,
      "grad_norm": 0.21626824140548706,
      "learning_rate": 6.417464144736208e-09,
      "loss": 0.1395,
      "num_input_tokens_seen": 55018008,
      "step": 305
    },
    {
      "epoch": 4.928,
      "grad_norm": 0.22036929428577423,
      "learning_rate": 0.0,
      "loss": 0.1385,
      "num_input_tokens_seen": 55950608,
      "step": 310
    },
    {
      "epoch": 4.928,
      "num_input_tokens_seen": 55950608,
      "step": 310,
      "total_flos": 375241330655232.0,
      "train_loss": 0.23023235970927822,
      "train_runtime": 24795.9426,
      "train_samples_per_second": 0.202,
      "train_steps_per_second": 0.013
    }
  ],
  "logging_steps": 5,
  "max_steps": 310,
  "num_input_tokens_seen": 55950608,
  "num_train_epochs": 5,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 375241330655232.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}