teapotllm/checkpoint-8160/trainer_state.json
{
"best_metric": 0.07254856824874878,
"best_model_checkpoint": "./teapotllm/checkpoint-1224",
"epoch": 20.0,
"eval_steps": 500,
"global_step": 8160,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 1.0,
"grad_norm": 0.07505040615797043,
"learning_rate": 4.75e-05,
"loss": 0.0953,
"step": 408
},
{
"epoch": 1.0,
"eval_loss": 0.07464946806430817,
"eval_runtime": 4.9963,
"eval_samples_per_second": 45.034,
"eval_steps_per_second": 5.804,
"step": 408
},
{
"epoch": 2.0,
"grad_norm": 0.3880476951599121,
"learning_rate": 4.5e-05,
"loss": 0.0706,
"step": 816
},
{
"epoch": 2.0,
"eval_loss": 0.0732811763882637,
"eval_runtime": 4.9711,
"eval_samples_per_second": 45.262,
"eval_steps_per_second": 5.834,
"step": 816
},
{
"epoch": 3.0,
"grad_norm": 0.514928936958313,
"learning_rate": 4.25e-05,
"loss": 0.0579,
"step": 1224
},
{
"epoch": 3.0,
"eval_loss": 0.07254856824874878,
"eval_runtime": 4.9773,
"eval_samples_per_second": 45.205,
"eval_steps_per_second": 5.826,
"step": 1224
},
{
"epoch": 4.0,
"grad_norm": 0.4975501000881195,
"learning_rate": 4e-05,
"loss": 0.0493,
"step": 1632
},
{
"epoch": 4.0,
"eval_loss": 0.07370911538600922,
"eval_runtime": 4.9687,
"eval_samples_per_second": 45.284,
"eval_steps_per_second": 5.837,
"step": 1632
},
{
"epoch": 5.0,
"grad_norm": 0.588830292224884,
"learning_rate": 3.7500000000000003e-05,
"loss": 0.0427,
"step": 2040
},
{
"epoch": 5.0,
"eval_loss": 0.07484618574380875,
"eval_runtime": 4.9726,
"eval_samples_per_second": 45.248,
"eval_steps_per_second": 5.832,
"step": 2040
},
{
"epoch": 6.0,
"grad_norm": 0.07073836028575897,
"learning_rate": 3.5e-05,
"loss": 0.0376,
"step": 2448
},
{
"epoch": 6.0,
"eval_loss": 0.07779362797737122,
"eval_runtime": 4.9773,
"eval_samples_per_second": 45.205,
"eval_steps_per_second": 5.826,
"step": 2448
},
{
"epoch": 7.0,
"grad_norm": 0.638589084148407,
"learning_rate": 3.2500000000000004e-05,
"loss": 0.0324,
"step": 2856
},
{
"epoch": 7.0,
"eval_loss": 0.07828149944543839,
"eval_runtime": 4.984,
"eval_samples_per_second": 45.145,
"eval_steps_per_second": 5.819,
"step": 2856
},
{
"epoch": 8.0,
"grad_norm": 0.11211636662483215,
"learning_rate": 3e-05,
"loss": 0.0291,
"step": 3264
},
{
"epoch": 8.0,
"eval_loss": 0.08292075246572495,
"eval_runtime": 4.9953,
"eval_samples_per_second": 45.042,
"eval_steps_per_second": 5.805,
"step": 3264
},
{
"epoch": 9.0,
"grad_norm": 0.2681402266025543,
"learning_rate": 2.7500000000000004e-05,
"loss": 0.0264,
"step": 3672
},
{
"epoch": 9.0,
"eval_loss": 0.08268510550260544,
"eval_runtime": 4.9786,
"eval_samples_per_second": 45.194,
"eval_steps_per_second": 5.825,
"step": 3672
},
{
"epoch": 10.0,
"grad_norm": 0.26614490151405334,
"learning_rate": 2.5e-05,
"loss": 0.023,
"step": 4080
},
{
"epoch": 10.0,
"eval_loss": 0.08475232124328613,
"eval_runtime": 4.9942,
"eval_samples_per_second": 45.052,
"eval_steps_per_second": 5.807,
"step": 4080
},
{
"epoch": 11.0,
"grad_norm": 0.05123787373304367,
"learning_rate": 2.25e-05,
"loss": 0.0217,
"step": 4488
},
{
"epoch": 11.0,
"eval_loss": 0.08677990734577179,
"eval_runtime": 5.0004,
"eval_samples_per_second": 44.996,
"eval_steps_per_second": 5.8,
"step": 4488
},
{
"epoch": 12.0,
"grad_norm": 0.8446316719055176,
"learning_rate": 2e-05,
"loss": 0.0196,
"step": 4896
},
{
"epoch": 12.0,
"eval_loss": 0.08961891382932663,
"eval_runtime": 4.9782,
"eval_samples_per_second": 45.197,
"eval_steps_per_second": 5.825,
"step": 4896
},
{
"epoch": 13.0,
"grad_norm": 0.32860517501831055,
"learning_rate": 1.75e-05,
"loss": 0.0178,
"step": 5304
},
{
"epoch": 13.0,
"eval_loss": 0.09325850754976273,
"eval_runtime": 4.9928,
"eval_samples_per_second": 45.065,
"eval_steps_per_second": 5.808,
"step": 5304
},
{
"epoch": 14.0,
"grad_norm": 0.9496984481811523,
"learning_rate": 1.5e-05,
"loss": 0.0167,
"step": 5712
},
{
"epoch": 14.0,
"eval_loss": 0.09426513314247131,
"eval_runtime": 4.9809,
"eval_samples_per_second": 45.173,
"eval_steps_per_second": 5.822,
"step": 5712
},
{
"epoch": 15.0,
"grad_norm": 0.056426361203193665,
"learning_rate": 1.25e-05,
"loss": 0.016,
"step": 6120
},
{
"epoch": 15.0,
"eval_loss": 0.09544987976551056,
"eval_runtime": 4.9829,
"eval_samples_per_second": 45.155,
"eval_steps_per_second": 5.82,
"step": 6120
},
{
"epoch": 16.0,
"grad_norm": 0.05803034454584122,
"learning_rate": 1e-05,
"loss": 0.0147,
"step": 6528
},
{
"epoch": 16.0,
"eval_loss": 0.09645407646894455,
"eval_runtime": 4.9673,
"eval_samples_per_second": 45.296,
"eval_steps_per_second": 5.838,
"step": 6528
},
{
"epoch": 17.0,
"grad_norm": 0.1461056023836136,
"learning_rate": 7.5e-06,
"loss": 0.0143,
"step": 6936
},
{
"epoch": 17.0,
"eval_loss": 0.09935282170772552,
"eval_runtime": 4.9798,
"eval_samples_per_second": 45.182,
"eval_steps_per_second": 5.824,
"step": 6936
},
{
"epoch": 18.0,
"grad_norm": 0.007102633360773325,
"learning_rate": 5e-06,
"loss": 0.0136,
"step": 7344
},
{
"epoch": 18.0,
"eval_loss": 0.1016102060675621,
"eval_runtime": 4.9718,
"eval_samples_per_second": 45.255,
"eval_steps_per_second": 5.833,
"step": 7344
},
{
"epoch": 19.0,
"grad_norm": 0.011653387919068336,
"learning_rate": 2.5e-06,
"loss": 0.0135,
"step": 7752
},
{
"epoch": 19.0,
"eval_loss": 0.10058598965406418,
"eval_runtime": 4.9846,
"eval_samples_per_second": 45.139,
"eval_steps_per_second": 5.818,
"step": 7752
},
{
"epoch": 20.0,
"grad_norm": 0.7057294249534607,
"learning_rate": 0.0,
"loss": 0.013,
"step": 8160
},
{
"epoch": 20.0,
"eval_loss": 0.10097935050725937,
"eval_runtime": 4.9778,
"eval_samples_per_second": 45.201,
"eval_steps_per_second": 5.826,
"step": 8160
}
],
"logging_steps": 500,
"max_steps": 8160,
"num_input_tokens_seen": 0,
"num_train_epochs": 20,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.11615448449024e+16,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}