{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.4153686396677051, "eval_steps": 50, "global_step": 400, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0, "eval_entropy": 1.202019446364073, "eval_loss": 1.0483194589614868, "eval_mean_token_accuracy": 0.7420558578500124, "eval_num_tokens": 0.0, "eval_runtime": 129.0837, "eval_samples_per_second": 0.829, "eval_steps_per_second": 0.829, "step": 0 }, { "entropy": 0.8833672893047333, "epoch": 0.05192107995846314, "grad_norm": 10.9375, "learning_rate": 9.785867237687366e-06, "loss": 0.7974, "mean_token_accuracy": 0.7731659519672394, "num_tokens": 117579.0, "step": 50 }, { "epoch": 0.05192107995846314, "eval_entropy": 0.6945631228317725, "eval_loss": 0.661672055721283, "eval_mean_token_accuracy": 0.7917334231260781, "eval_num_tokens": 117579.0, "eval_runtime": 3670.0079, "eval_samples_per_second": 0.029, "eval_steps_per_second": 0.029, "step": 50 }, { "entropy": 0.6754305803775787, "epoch": 0.10384215991692627, "grad_norm": 10.375, "learning_rate": 9.250535331905782e-06, "loss": 0.6539, "mean_token_accuracy": 0.7942240250110626, "num_tokens": 234762.0, "step": 100 }, { "epoch": 0.10384215991692627, "eval_entropy": 0.6633974996125587, "eval_loss": 0.6503159999847412, "eval_mean_token_accuracy": 0.7945662413802103, "eval_num_tokens": 234762.0, "eval_runtime": 3639.4734, "eval_samples_per_second": 0.029, "eval_steps_per_second": 0.029, "step": 100 }, { "entropy": 0.673185322880745, "epoch": 0.1557632398753894, "grad_norm": 10.75, "learning_rate": 8.715203426124197e-06, "loss": 0.654, "mean_token_accuracy": 0.7920358991622924, "num_tokens": 340474.0, "step": 150 }, { "epoch": 0.1557632398753894, "eval_entropy": 0.6714856841853846, "eval_loss": 0.6473907232284546, "eval_mean_token_accuracy": 0.7945645585238377, "eval_num_tokens": 340474.0, "eval_runtime": 3648.7713, "eval_samples_per_second": 0.029, "eval_steps_per_second": 0.029, "step": 150 }, { "entropy": 0.6565037113428116, "epoch": 0.20768431983385255, "grad_norm": 15.875, "learning_rate": 8.179871520342612e-06, "loss": 0.6425, "mean_token_accuracy": 0.7962384045124054, "num_tokens": 457598.0, "step": 200 }, { "epoch": 0.20768431983385255, "eval_entropy": 0.6561593619462486, "eval_loss": 0.6449708342552185, "eval_mean_token_accuracy": 0.795283340405081, "eval_num_tokens": 457598.0, "eval_runtime": 3483.4231, "eval_samples_per_second": 0.031, "eval_steps_per_second": 0.031, "step": 200 }, { "entropy": 0.6419710898399353, "epoch": 0.25960539979231567, "grad_norm": 12.0, "learning_rate": 7.644539614561029e-06, "loss": 0.6278, "mean_token_accuracy": 0.7990661442279816, "num_tokens": 575217.0, "step": 250 }, { "epoch": 0.25960539979231567, "eval_entropy": 0.6565199853660905, "eval_loss": 0.6442670226097107, "eval_mean_token_accuracy": 0.7952453271250859, "eval_num_tokens": 575217.0, "eval_runtime": 3485.0746, "eval_samples_per_second": 0.031, "eval_steps_per_second": 0.031, "step": 250 }, { "entropy": 0.6925940608978272, "epoch": 0.3115264797507788, "grad_norm": 14.125, "learning_rate": 7.109207708779444e-06, "loss": 0.6782, "mean_token_accuracy": 0.7854274523258209, "num_tokens": 690541.0, "step": 300 }, { "epoch": 0.3115264797507788, "eval_entropy": 0.64524694051698, "eval_loss": 0.6422281265258789, "eval_mean_token_accuracy": 0.7960561396919679, "eval_num_tokens": 690541.0, "eval_runtime": 3645.789, "eval_samples_per_second": 0.029, "eval_steps_per_second": 0.029, "step": 300 }, { "entropy": 0.6548466223478318, "epoch": 0.363447559709242, "grad_norm": 10.375, "learning_rate": 6.573875802997859e-06, "loss": 0.6461, "mean_token_accuracy": 0.7949914515018464, "num_tokens": 811130.0, "step": 350 }, { "epoch": 0.363447559709242, "eval_entropy": 0.6484908422577047, "eval_loss": 0.6409919261932373, "eval_mean_token_accuracy": 0.7962207621503099, "eval_num_tokens": 811130.0, "eval_runtime": 3645.1814, "eval_samples_per_second": 0.029, "eval_steps_per_second": 0.029, "step": 350 }, { "entropy": 0.6180817884206772, "epoch": 0.4153686396677051, "grad_norm": 10.9375, "learning_rate": 6.038543897216275e-06, "loss": 0.6089, "mean_token_accuracy": 0.8053069579601287, "num_tokens": 923200.0, "step": 400 }, { "epoch": 0.4153686396677051, "eval_entropy": 0.6479496045090328, "eval_loss": 0.6389999985694885, "eval_mean_token_accuracy": 0.7966728973611493, "eval_num_tokens": 923200.0, "eval_runtime": 3638.2153, "eval_samples_per_second": 0.029, "eval_steps_per_second": 0.029, "step": 400 } ], "logging_steps": 50, "max_steps": 963, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.126430870427648e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }