mistral7b-pissa-coding-11-v1 / trainer_state.json
chansung's picture
Model save
0c9858a verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 216,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.004629629629629629,
"grad_norm": 4.558699607849121,
"learning_rate": 9.090909090909091e-06,
"loss": 1.4478,
"step": 1
},
{
"epoch": 0.023148148148148147,
"grad_norm": 3.5731630325317383,
"learning_rate": 4.545454545454546e-05,
"loss": 1.4229,
"step": 5
},
{
"epoch": 0.046296296296296294,
"grad_norm": 2.504847288131714,
"learning_rate": 9.090909090909092e-05,
"loss": 1.346,
"step": 10
},
{
"epoch": 0.06944444444444445,
"grad_norm": 1.5195527076721191,
"learning_rate": 0.00013636363636363637,
"loss": 1.2411,
"step": 15
},
{
"epoch": 0.09259259259259259,
"grad_norm": 1.1458969116210938,
"learning_rate": 0.00018181818181818183,
"loss": 1.1481,
"step": 20
},
{
"epoch": 0.11574074074074074,
"grad_norm": 1.0300581455230713,
"learning_rate": 0.0001998820159279591,
"loss": 1.0637,
"step": 25
},
{
"epoch": 0.1388888888888889,
"grad_norm": 0.5588775873184204,
"learning_rate": 0.00019916201012264254,
"loss": 1.0107,
"step": 30
},
{
"epoch": 0.16203703703703703,
"grad_norm": 0.6917332410812378,
"learning_rate": 0.00019779225723955707,
"loss": 0.9795,
"step": 35
},
{
"epoch": 0.18518518518518517,
"grad_norm": 0.4281361997127533,
"learning_rate": 0.00019578173241879872,
"loss": 0.9527,
"step": 40
},
{
"epoch": 0.20833333333333334,
"grad_norm": 0.42266684770584106,
"learning_rate": 0.00019314360938108425,
"loss": 0.9431,
"step": 45
},
{
"epoch": 0.23148148148148148,
"grad_norm": 0.4428061246871948,
"learning_rate": 0.00018989517410853955,
"loss": 0.9236,
"step": 50
},
{
"epoch": 0.25462962962962965,
"grad_norm": 0.41992688179016113,
"learning_rate": 0.00018605771158039253,
"loss": 0.9225,
"step": 55
},
{
"epoch": 0.2777777777777778,
"grad_norm": 0.3743288815021515,
"learning_rate": 0.0001816563663057211,
"loss": 0.9176,
"step": 60
},
{
"epoch": 0.30092592592592593,
"grad_norm": 0.4716743528842926,
"learning_rate": 0.00017671997756709863,
"loss": 0.9112,
"step": 65
},
{
"epoch": 0.32407407407407407,
"grad_norm": 0.3897479176521301,
"learning_rate": 0.00017128089045468294,
"loss": 0.9079,
"step": 70
},
{
"epoch": 0.3472222222222222,
"grad_norm": 0.40086159110069275,
"learning_rate": 0.00016537474392892528,
"loss": 0.9065,
"step": 75
},
{
"epoch": 0.37037037037037035,
"grad_norm": 0.3658371865749359,
"learning_rate": 0.00015904023730059228,
"loss": 0.8987,
"step": 80
},
{
"epoch": 0.39351851851851855,
"grad_norm": 0.392740935087204,
"learning_rate": 0.000152318876658213,
"loss": 0.9045,
"step": 85
},
{
"epoch": 0.4166666666666667,
"grad_norm": 0.37253740429878235,
"learning_rate": 0.00014525470290445392,
"loss": 0.9038,
"step": 90
},
{
"epoch": 0.4398148148148148,
"grad_norm": 0.3818472623825073,
"learning_rate": 0.00013789400318343068,
"loss": 0.897,
"step": 95
},
{
"epoch": 0.46296296296296297,
"grad_norm": 1.4701107740402222,
"learning_rate": 0.00013028500758979506,
"loss": 0.8999,
"step": 100
},
{
"epoch": 0.4861111111111111,
"grad_norm": 0.3863574266433716,
"learning_rate": 0.00012247757314687297,
"loss": 0.8896,
"step": 105
},
{
"epoch": 0.5092592592592593,
"grad_norm": 0.3857921063899994,
"learning_rate": 0.00011452285712454904,
"loss": 0.8936,
"step": 110
},
{
"epoch": 0.5324074074074074,
"grad_norm": 0.4537260830402374,
"learning_rate": 0.00010647298183744359,
"loss": 0.8841,
"step": 115
},
{
"epoch": 0.5555555555555556,
"grad_norm": 0.41398969292640686,
"learning_rate": 9.838069311974986e-05,
"loss": 0.8853,
"step": 120
},
{
"epoch": 0.5787037037037037,
"grad_norm": 0.5760473608970642,
"learning_rate": 9.02990147145352e-05,
"loss": 0.8869,
"step": 125
},
{
"epoch": 0.6018518518518519,
"grad_norm": 0.3519139289855957,
"learning_rate": 8.228090084207774e-05,
"loss": 0.8862,
"step": 130
},
{
"epoch": 0.625,
"grad_norm": 0.3705141842365265,
"learning_rate": 7.437888922374276e-05,
"loss": 0.8942,
"step": 135
},
{
"epoch": 0.6481481481481481,
"grad_norm": 0.3546064794063568,
"learning_rate": 6.664475683491796e-05,
"loss": 0.8858,
"step": 140
},
{
"epoch": 0.6712962962962963,
"grad_norm": 0.4081316292285919,
"learning_rate": 5.9129180642644414e-05,
"loss": 0.8873,
"step": 145
},
{
"epoch": 0.6944444444444444,
"grad_norm": 0.3876301646232605,
"learning_rate": 5.1881405550919493e-05,
"loss": 0.8873,
"step": 150
},
{
"epoch": 0.7175925925925926,
"grad_norm": 0.37546050548553467,
"learning_rate": 4.494892172941965e-05,
"loss": 0.8877,
"step": 155
},
{
"epoch": 0.7407407407407407,
"grad_norm": 0.3889768123626709,
"learning_rate": 3.8377153439907266e-05,
"loss": 0.8835,
"step": 160
},
{
"epoch": 0.7638888888888888,
"grad_norm": 0.37897348403930664,
"learning_rate": 3.2209161399249674e-05,
"loss": 0.8714,
"step": 165
},
{
"epoch": 0.7870370370370371,
"grad_norm": 0.3686477839946747,
"learning_rate": 2.6485360629279987e-05,
"loss": 0.8789,
"step": 170
},
{
"epoch": 0.8101851851851852,
"grad_norm": 0.33174994587898254,
"learning_rate": 2.1243255642254578e-05,
"loss": 0.8797,
"step": 175
},
{
"epoch": 0.8333333333333334,
"grad_norm": 0.35853928327560425,
"learning_rate": 1.65171946970729e-05,
"loss": 0.8841,
"step": 180
},
{
"epoch": 0.8564814814814815,
"grad_norm": 0.3387424051761627,
"learning_rate": 1.233814473646524e-05,
"loss": 0.8852,
"step": 185
},
{
"epoch": 0.8796296296296297,
"grad_norm": 0.40066128969192505,
"learning_rate": 8.733488479845997e-06,
"loss": 0.8811,
"step": 190
},
{
"epoch": 0.9027777777777778,
"grad_norm": 0.341008722782135,
"learning_rate": 5.726845001356573e-06,
"loss": 0.8743,
"step": 195
},
{
"epoch": 0.9259259259259259,
"grad_norm": 0.3468351364135742,
"learning_rate": 3.3379149687388867e-06,
"loss": 0.8731,
"step": 200
},
{
"epoch": 0.9490740740740741,
"grad_norm": 0.37516307830810547,
"learning_rate": 1.5823515570925763e-06,
"loss": 0.8711,
"step": 205
},
{
"epoch": 0.9722222222222222,
"grad_norm": 0.34528055787086487,
"learning_rate": 4.7165788333860536e-07,
"loss": 0.8752,
"step": 210
},
{
"epoch": 0.9953703703703703,
"grad_norm": 0.35141706466674805,
"learning_rate": 1.3111633436779791e-08,
"loss": 0.8727,
"step": 215
},
{
"epoch": 1.0,
"eval_loss": 1.372731328010559,
"eval_runtime": 0.567,
"eval_samples_per_second": 19.4,
"eval_steps_per_second": 1.764,
"step": 216
},
{
"epoch": 1.0,
"step": 216,
"total_flos": 9.06748200373715e+17,
"train_loss": 0.9395653671688504,
"train_runtime": 729.0361,
"train_samples_per_second": 56.83,
"train_steps_per_second": 0.296
}
],
"logging_steps": 5,
"max_steps": 216,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 9.06748200373715e+17,
"train_batch_size": 12,
"trial_name": null,
"trial_params": null
}