S1.1-QwQ-DS / trainer_state.json
BitStarWalkin's picture
Upload folder using huggingface_hub
4cdf0df verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 4.928,
"eval_steps": 500,
"global_step": 310,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.08,
"grad_norm": 1.009501338005066,
"learning_rate": 9.993582535855265e-06,
"loss": 0.4352,
"num_input_tokens_seen": 904912,
"step": 5
},
{
"epoch": 0.16,
"grad_norm": 0.7794860601425171,
"learning_rate": 9.974346616959476e-06,
"loss": 0.388,
"num_input_tokens_seen": 1808376,
"step": 10
},
{
"epoch": 0.24,
"grad_norm": 0.5388593673706055,
"learning_rate": 9.942341621640558e-06,
"loss": 0.3875,
"num_input_tokens_seen": 2684696,
"step": 15
},
{
"epoch": 0.32,
"grad_norm": 0.46523410081863403,
"learning_rate": 9.897649706262474e-06,
"loss": 0.3959,
"num_input_tokens_seen": 3596136,
"step": 20
},
{
"epoch": 0.4,
"grad_norm": 0.5142658352851868,
"learning_rate": 9.840385594331022e-06,
"loss": 0.3718,
"num_input_tokens_seen": 4549488,
"step": 25
},
{
"epoch": 0.48,
"grad_norm": 0.3676227927207947,
"learning_rate": 9.770696282000245e-06,
"loss": 0.3597,
"num_input_tokens_seen": 5432232,
"step": 30
},
{
"epoch": 0.56,
"grad_norm": 0.3568161725997925,
"learning_rate": 9.688760660735403e-06,
"loss": 0.3683,
"num_input_tokens_seen": 6289320,
"step": 35
},
{
"epoch": 0.64,
"grad_norm": 0.2971496284008026,
"learning_rate": 9.594789058101154e-06,
"loss": 0.3701,
"num_input_tokens_seen": 7178328,
"step": 40
},
{
"epoch": 0.72,
"grad_norm": 0.3190518021583557,
"learning_rate": 9.48902269785371e-06,
"loss": 0.3796,
"num_input_tokens_seen": 8074600,
"step": 45
},
{
"epoch": 0.8,
"grad_norm": 0.32561618089675903,
"learning_rate": 9.371733080722911e-06,
"loss": 0.3451,
"num_input_tokens_seen": 9044168,
"step": 50
},
{
"epoch": 0.88,
"grad_norm": 0.3187696635723114,
"learning_rate": 9.243221287473755e-06,
"loss": 0.3526,
"num_input_tokens_seen": 9969944,
"step": 55
},
{
"epoch": 0.96,
"grad_norm": 0.30837559700012207,
"learning_rate": 9.103817206036383e-06,
"loss": 0.3386,
"num_input_tokens_seen": 10909008,
"step": 60
},
{
"epoch": 1.032,
"grad_norm": 0.4271972179412842,
"learning_rate": 8.953878684688492e-06,
"loss": 0.3002,
"num_input_tokens_seen": 11663840,
"step": 65
},
{
"epoch": 1.112,
"grad_norm": 0.4927215874195099,
"learning_rate": 8.793790613463956e-06,
"loss": 0.2818,
"num_input_tokens_seen": 12553064,
"step": 70
},
{
"epoch": 1.192,
"grad_norm": 0.4385733902454376,
"learning_rate": 8.6239639361456e-06,
"loss": 0.2778,
"num_input_tokens_seen": 13469144,
"step": 75
},
{
"epoch": 1.272,
"grad_norm": 0.43125155568122864,
"learning_rate": 8.444834595378434e-06,
"loss": 0.2691,
"num_input_tokens_seen": 14406256,
"step": 80
},
{
"epoch": 1.3519999999999999,
"grad_norm": 0.3859022557735443,
"learning_rate": 8.256862413611113e-06,
"loss": 0.262,
"num_input_tokens_seen": 15305328,
"step": 85
},
{
"epoch": 1.432,
"grad_norm": 0.46962714195251465,
"learning_rate": 8.060529912738316e-06,
"loss": 0.2805,
"num_input_tokens_seen": 16259688,
"step": 90
},
{
"epoch": 1.512,
"grad_norm": 0.3477609157562256,
"learning_rate": 7.856341075473963e-06,
"loss": 0.2775,
"num_input_tokens_seen": 17167192,
"step": 95
},
{
"epoch": 1.592,
"grad_norm": 0.41423365473747253,
"learning_rate": 7.644820051634813e-06,
"loss": 0.2645,
"num_input_tokens_seen": 18059304,
"step": 100
},
{
"epoch": 1.6720000000000002,
"grad_norm": 0.32248997688293457,
"learning_rate": 7.4265098126554065e-06,
"loss": 0.2773,
"num_input_tokens_seen": 18973864,
"step": 105
},
{
"epoch": 1.752,
"grad_norm": 0.3673882782459259,
"learning_rate": 7.201970757788172e-06,
"loss": 0.2867,
"num_input_tokens_seen": 19877832,
"step": 110
},
{
"epoch": 1.8319999999999999,
"grad_norm": 0.2969827353954315,
"learning_rate": 6.971779275566593e-06,
"loss": 0.2697,
"num_input_tokens_seen": 20793160,
"step": 115
},
{
"epoch": 1.912,
"grad_norm": 0.3408501148223877,
"learning_rate": 6.736526264224101e-06,
"loss": 0.264,
"num_input_tokens_seen": 21662200,
"step": 120
},
{
"epoch": 1.992,
"grad_norm": 0.3277493715286255,
"learning_rate": 6.496815614866792e-06,
"loss": 0.2734,
"num_input_tokens_seen": 22579968,
"step": 125
},
{
"epoch": 2.064,
"grad_norm": 0.39144793152809143,
"learning_rate": 6.2532626612936035e-06,
"loss": 0.2438,
"num_input_tokens_seen": 23409800,
"step": 130
},
{
"epoch": 2.144,
"grad_norm": 0.3468864858150482,
"learning_rate": 6.006492600443301e-06,
"loss": 0.2017,
"num_input_tokens_seen": 24285480,
"step": 135
},
{
"epoch": 2.224,
"grad_norm": 0.36446893215179443,
"learning_rate": 5.757138887522884e-06,
"loss": 0.2014,
"num_input_tokens_seen": 25155560,
"step": 140
},
{
"epoch": 2.304,
"grad_norm": 0.2912499010562897,
"learning_rate": 5.505841609937162e-06,
"loss": 0.193,
"num_input_tokens_seen": 26045360,
"step": 145
},
{
"epoch": 2.384,
"grad_norm": 0.32886043190956116,
"learning_rate": 5.253245844193564e-06,
"loss": 0.204,
"num_input_tokens_seen": 26953512,
"step": 150
},
{
"epoch": 2.464,
"grad_norm": 0.28212445974349976,
"learning_rate": 5e-06,
"loss": 0.2224,
"num_input_tokens_seen": 27875320,
"step": 155
},
{
"epoch": 2.544,
"grad_norm": 0.35469871759414673,
"learning_rate": 4.746754155806437e-06,
"loss": 0.2122,
"num_input_tokens_seen": 28786896,
"step": 160
},
{
"epoch": 2.624,
"grad_norm": 0.285258024930954,
"learning_rate": 4.49415839006284e-06,
"loss": 0.1976,
"num_input_tokens_seen": 29630128,
"step": 165
},
{
"epoch": 2.7039999999999997,
"grad_norm": 0.3201941251754761,
"learning_rate": 4.2428611124771184e-06,
"loss": 0.1935,
"num_input_tokens_seen": 30503816,
"step": 170
},
{
"epoch": 2.784,
"grad_norm": 0.3066469430923462,
"learning_rate": 3.993507399556699e-06,
"loss": 0.2237,
"num_input_tokens_seen": 31433032,
"step": 175
},
{
"epoch": 2.864,
"grad_norm": 0.2676903009414673,
"learning_rate": 3.7467373387063973e-06,
"loss": 0.2002,
"num_input_tokens_seen": 32347864,
"step": 180
},
{
"epoch": 2.944,
"grad_norm": 0.30449098348617554,
"learning_rate": 3.5031843851332105e-06,
"loss": 0.2138,
"num_input_tokens_seen": 33346576,
"step": 185
},
{
"epoch": 3.016,
"grad_norm": 0.478014200925827,
"learning_rate": 3.2634737357758994e-06,
"loss": 0.1904,
"num_input_tokens_seen": 34219176,
"step": 190
},
{
"epoch": 3.096,
"grad_norm": 0.2607128620147705,
"learning_rate": 3.0282207244334084e-06,
"loss": 0.1643,
"num_input_tokens_seen": 35158432,
"step": 195
},
{
"epoch": 3.176,
"grad_norm": 0.29989877343177795,
"learning_rate": 2.7980292422118282e-06,
"loss": 0.16,
"num_input_tokens_seen": 36055520,
"step": 200
},
{
"epoch": 3.2560000000000002,
"grad_norm": 0.2684940695762634,
"learning_rate": 2.573490187344596e-06,
"loss": 0.1493,
"num_input_tokens_seen": 36983672,
"step": 205
},
{
"epoch": 3.336,
"grad_norm": 0.2773168087005615,
"learning_rate": 2.3551799483651894e-06,
"loss": 0.1436,
"num_input_tokens_seen": 37837680,
"step": 210
},
{
"epoch": 3.416,
"grad_norm": 0.33160483837127686,
"learning_rate": 2.1436589245260375e-06,
"loss": 0.154,
"num_input_tokens_seen": 38766568,
"step": 215
},
{
"epoch": 3.496,
"grad_norm": 0.3284499943256378,
"learning_rate": 1.9394700872616856e-06,
"loss": 0.1658,
"num_input_tokens_seen": 39667328,
"step": 220
},
{
"epoch": 3.576,
"grad_norm": 0.27978459000587463,
"learning_rate": 1.74313758638889e-06,
"loss": 0.1472,
"num_input_tokens_seen": 40532840,
"step": 225
},
{
"epoch": 3.656,
"grad_norm": 0.2714602053165436,
"learning_rate": 1.555165404621567e-06,
"loss": 0.1654,
"num_input_tokens_seen": 41498224,
"step": 230
},
{
"epoch": 3.7359999999999998,
"grad_norm": 0.2441283017396927,
"learning_rate": 1.3760360638544012e-06,
"loss": 0.1583,
"num_input_tokens_seen": 42449096,
"step": 235
},
{
"epoch": 3.816,
"grad_norm": 0.32902443408966064,
"learning_rate": 1.2062093865360458e-06,
"loss": 0.1463,
"num_input_tokens_seen": 43293600,
"step": 240
},
{
"epoch": 3.896,
"grad_norm": 0.3130110204219818,
"learning_rate": 1.046121315311508e-06,
"loss": 0.1599,
"num_input_tokens_seen": 44190712,
"step": 245
},
{
"epoch": 3.976,
"grad_norm": 0.2552832067012787,
"learning_rate": 8.961827939636198e-07,
"loss": 0.1609,
"num_input_tokens_seen": 45096112,
"step": 250
},
{
"epoch": 4.048,
"grad_norm": 0.2593518793582916,
"learning_rate": 7.567787125262449e-07,
"loss": 0.1428,
"num_input_tokens_seen": 45916648,
"step": 255
},
{
"epoch": 4.128,
"grad_norm": 0.20947769284248352,
"learning_rate": 6.282669192770896e-07,
"loss": 0.1216,
"num_input_tokens_seen": 46801776,
"step": 260
},
{
"epoch": 4.208,
"grad_norm": 0.22543282806873322,
"learning_rate": 5.109773021462921e-07,
"loss": 0.1309,
"num_input_tokens_seen": 47685704,
"step": 265
},
{
"epoch": 4.288,
"grad_norm": 0.24011245369911194,
"learning_rate": 4.05210941898847e-07,
"loss": 0.1346,
"num_input_tokens_seen": 48567032,
"step": 270
},
{
"epoch": 4.368,
"grad_norm": 0.22010523080825806,
"learning_rate": 3.112393392645985e-07,
"loss": 0.1318,
"num_input_tokens_seen": 49491568,
"step": 275
},
{
"epoch": 4.448,
"grad_norm": 0.1899755746126175,
"learning_rate": 2.2930371799975593e-07,
"loss": 0.1355,
"num_input_tokens_seen": 50442584,
"step": 280
},
{
"epoch": 4.5280000000000005,
"grad_norm": 0.20433053374290466,
"learning_rate": 1.5961440566897913e-07,
"loss": 0.1418,
"num_input_tokens_seen": 51413272,
"step": 285
},
{
"epoch": 4.608,
"grad_norm": 0.23340986669063568,
"learning_rate": 1.0235029373752758e-07,
"loss": 0.1358,
"num_input_tokens_seen": 52276024,
"step": 290
},
{
"epoch": 4.688,
"grad_norm": 0.20058894157409668,
"learning_rate": 5.7658378359443104e-08,
"loss": 0.1319,
"num_input_tokens_seen": 53176640,
"step": 295
},
{
"epoch": 4.768,
"grad_norm": 0.21048085391521454,
"learning_rate": 2.5653383040524228e-08,
"loss": 0.1401,
"num_input_tokens_seen": 54130624,
"step": 300
},
{
"epoch": 4.848,
"grad_norm": 0.21626824140548706,
"learning_rate": 6.417464144736208e-09,
"loss": 0.1395,
"num_input_tokens_seen": 55018008,
"step": 305
},
{
"epoch": 4.928,
"grad_norm": 0.22036929428577423,
"learning_rate": 0.0,
"loss": 0.1385,
"num_input_tokens_seen": 55950608,
"step": 310
},
{
"epoch": 4.928,
"num_input_tokens_seen": 55950608,
"step": 310,
"total_flos": 375241330655232.0,
"train_loss": 0.23023235970927822,
"train_runtime": 24795.9426,
"train_samples_per_second": 0.202,
"train_steps_per_second": 0.013
}
],
"logging_steps": 5,
"max_steps": 310,
"num_input_tokens_seen": 55950608,
"num_train_epochs": 5,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 375241330655232.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}