{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.2197634490652423, "eval_steps": 100, "global_step": 800, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.07630675314765356, "grad_norm": 1.023113063134267, "learning_rate": 9.8e-05, "loss": 1.7986, "step": 50 }, { "epoch": 0.15261350629530712, "grad_norm": 0.6499131057323768, "learning_rate": 9.998611902101788e-05, "loss": 1.6798, "step": 100 }, { "epoch": 0.15261350629530712, "eval_loss": 1.6440359354019165, "eval_runtime": 14.1831, "eval_samples_per_second": 70.506, "eval_steps_per_second": 2.256, "step": 100 }, { "epoch": 0.2289202594429607, "grad_norm": 0.8768674937154102, "learning_rate": 9.99433453236782e-05, "loss": 1.6172, "step": 150 }, { "epoch": 0.30522701259061424, "grad_norm": 0.7622537034016803, "learning_rate": 9.987169810501313e-05, "loss": 1.5567, "step": 200 }, { "epoch": 0.30522701259061424, "eval_loss": 1.5789680480957031, "eval_runtime": 14.0913, "eval_samples_per_second": 70.966, "eval_steps_per_second": 2.271, "step": 200 }, { "epoch": 0.38153376573826786, "grad_norm": 0.644103797442569, "learning_rate": 9.977121920498083e-05, "loss": 1.5, "step": 250 }, { "epoch": 0.4578405188859214, "grad_norm": 0.4613543442112991, "learning_rate": 9.964196730042939e-05, "loss": 1.4557, "step": 300 }, { "epoch": 0.4578405188859214, "eval_loss": 1.548462152481079, "eval_runtime": 14.0367, "eval_samples_per_second": 71.242, "eval_steps_per_second": 2.28, "step": 300 }, { "epoch": 0.5341472720335749, "grad_norm": 0.7375950318857653, "learning_rate": 9.948401787083122e-05, "loss": 1.4018, "step": 350 }, { "epoch": 0.6104540251812285, "grad_norm": 0.48711275390307385, "learning_rate": 9.929746315420536e-05, "loss": 1.3662, "step": 400 }, { "epoch": 0.6104540251812285, "eval_loss": 1.5432072877883911, "eval_runtime": 14.0842, "eval_samples_per_second": 71.002, "eval_steps_per_second": 2.272, "step": 400 }, { "epoch": 0.6867607783288822, "grad_norm": 0.5560635325361898, "learning_rate": 9.908241209325283e-05, "loss": 1.3261, "step": 450 }, { "epoch": 0.7630675314765357, "grad_norm": 0.9108823533128092, "learning_rate": 9.883899027173732e-05, "loss": 1.2771, "step": 500 }, { "epoch": 0.7630675314765357, "eval_loss": 1.5354772806167603, "eval_runtime": 14.0372, "eval_samples_per_second": 71.239, "eval_steps_per_second": 2.28, "step": 500 }, { "epoch": 0.8393742846241893, "grad_norm": 0.8339599604457497, "learning_rate": 9.856733984114773e-05, "loss": 1.2362, "step": 550 }, { "epoch": 0.9156810377718428, "grad_norm": 0.5108679619066585, "learning_rate": 9.826761943768576e-05, "loss": 1.2166, "step": 600 }, { "epoch": 0.9156810377718428, "eval_loss": 1.571360468864441, "eval_runtime": 14.0728, "eval_samples_per_second": 71.059, "eval_steps_per_second": 2.274, "step": 600 }, { "epoch": 0.9919877909194964, "grad_norm": 0.7051167229545275, "learning_rate": 9.794000408962676e-05, "loss": 1.1748, "step": 650 }, { "epoch": 1.0671499427699351, "grad_norm": 0.917674294403115, "learning_rate": 9.758468511510826e-05, "loss": 0.9771, "step": 700 }, { "epoch": 1.0671499427699351, "eval_loss": 1.6417957544326782, "eval_runtime": 14.0627, "eval_samples_per_second": 71.11, "eval_steps_per_second": 2.276, "step": 700 }, { "epoch": 1.1434566959175887, "grad_norm": 0.9078688914191193, "learning_rate": 9.720187001040543e-05, "loss": 0.9599, "step": 750 }, { "epoch": 1.2197634490652423, "grad_norm": 0.6577179345572577, "learning_rate": 9.679178232875922e-05, "loss": 0.9498, "step": 800 }, { "epoch": 1.2197634490652423, "eval_loss": 1.688981533050537, "eval_runtime": 14.0233, "eval_samples_per_second": 71.31, "eval_steps_per_second": 2.282, "step": 800 } ], "logging_steps": 50, "max_steps": 6550, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 800, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 759728326049792.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }