{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 14, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.16, "grad_norm": 17.805028915405273, "learning_rate": 0.0, "loss": 4.49, "mean_token_accuracy": 0.33135150372982025, "num_tokens": 6495.0, "step": 1 }, { "epoch": 0.32, "grad_norm": 17.48756980895996, "learning_rate": 0.0002, "loss": 4.3372, "mean_token_accuracy": 0.3364496976137161, "num_tokens": 12652.0, "step": 2 }, { "epoch": 0.48, "grad_norm": 14.583043098449707, "learning_rate": 0.0001973847635683447, "loss": 3.9199, "mean_token_accuracy": 0.397413432598114, "num_tokens": 19254.0, "step": 3 }, { "epoch": 0.64, "grad_norm": 10.18574333190918, "learning_rate": 0.0001896910423087889, "loss": 3.3676, "mean_token_accuracy": 0.45736685395240784, "num_tokens": 25477.0, "step": 4 }, { "epoch": 0.8, "grad_norm": 6.001947402954102, "learning_rate": 0.00017736596733539909, "loss": 2.7383, "mean_token_accuracy": 0.5451276451349258, "num_tokens": 31819.0, "step": 5 }, { "epoch": 0.96, "grad_norm": 5.421485900878906, "learning_rate": 0.00016112582720580402, "loss": 2.3826, "mean_token_accuracy": 0.5848924219608307, "num_tokens": 38750.0, "step": 6 }, { "epoch": 1.0, "grad_norm": 4.082874298095703, "learning_rate": 0.00014191443983382822, "loss": 2.337, "mean_token_accuracy": 0.5862069129943848, "num_tokens": 40233.0, "step": 7 }, { "epoch": 1.16, "grad_norm": 2.713186502456665, "learning_rate": 0.00012084830122297907, "loss": 1.8353, "mean_token_accuracy": 0.6436551362276077, "num_tokens": 47108.0, "step": 8 }, { "epoch": 1.32, "grad_norm": 2.771085262298584, "learning_rate": 9.915169877702095e-05, "loss": 1.9092, "mean_token_accuracy": 0.6217025518417358, "num_tokens": 53107.0, "step": 9 }, { "epoch": 1.48, "grad_norm": 2.3573851585388184, "learning_rate": 7.808556016617178e-05, "loss": 1.5703, "mean_token_accuracy": 0.6634047329425812, "num_tokens": 60275.0, "step": 10 }, { "epoch": 1.6400000000000001, "grad_norm": 2.2969491481781006, "learning_rate": 5.887417279419599e-05, "loss": 1.7129, "mean_token_accuracy": 0.6377729177474976, "num_tokens": 66974.0, "step": 11 }, { "epoch": 1.8, "grad_norm": 2.3216073513031006, "learning_rate": 4.2634032664600895e-05, "loss": 1.7239, "mean_token_accuracy": 0.6286583840847015, "num_tokens": 73024.0, "step": 12 }, { "epoch": 1.96, "grad_norm": 2.114013195037842, "learning_rate": 3.030895769121112e-05, "loss": 1.7101, "mean_token_accuracy": 0.6209566742181778, "num_tokens": 78927.0, "step": 13 }, { "epoch": 2.0, "grad_norm": 2.2533507347106934, "learning_rate": 2.261523643165532e-05, "loss": 1.6181, "mean_token_accuracy": 0.6488599181175232, "num_tokens": 80466.0, "step": 14 } ], "logging_steps": 1, "max_steps": 14, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.5187782136909824e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }