{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 185, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.005405405405405406, "grad_norm": 1.7116371393203735, "learning_rate": 1.0526315789473684e-05, "loss": 1.4457, "step": 1 }, { "epoch": 0.02702702702702703, "grad_norm": 1.6178371906280518, "learning_rate": 5.2631578947368424e-05, "loss": 1.4317, "step": 5 }, { "epoch": 0.05405405405405406, "grad_norm": 1.0490155220031738, "learning_rate": 0.00010526315789473685, "loss": 1.4017, "step": 10 }, { "epoch": 0.08108108108108109, "grad_norm": 1.015197992324829, "learning_rate": 0.00015789473684210527, "loss": 1.3362, "step": 15 }, { "epoch": 0.10810810810810811, "grad_norm": 0.8447503447532654, "learning_rate": 0.00019998209226697376, "loss": 1.2393, "step": 20 }, { "epoch": 0.13513513513513514, "grad_norm": 0.7622084617614746, "learning_rate": 0.0001993559947963185, "loss": 1.1269, "step": 25 }, { "epoch": 0.16216216216216217, "grad_norm": 0.4874676764011383, "learning_rate": 0.00019784091409455728, "loss": 1.0469, "step": 30 }, { "epoch": 0.1891891891891892, "grad_norm": 0.4265550374984741, "learning_rate": 0.0001954504062771555, "loss": 0.9981, "step": 35 }, { "epoch": 0.21621621621621623, "grad_norm": 0.2609158754348755, "learning_rate": 0.00019220586030376134, "loss": 0.9822, "step": 40 }, { "epoch": 0.24324324324324326, "grad_norm": 0.21494942903518677, "learning_rate": 0.00018813630660146488, "loss": 0.9561, "step": 45 }, { "epoch": 0.2702702702702703, "grad_norm": 0.22001613676548004, "learning_rate": 0.00018327815731637612, "loss": 0.9534, "step": 50 }, { "epoch": 0.2972972972972973, "grad_norm": 0.17728982865810394, "learning_rate": 0.00017767488051760857, "loss": 0.9442, "step": 55 }, { "epoch": 0.32432432432432434, "grad_norm": 0.24004819989204407, "learning_rate": 0.0001713766112687139, "loss": 0.9395, "step": 60 }, { "epoch": 0.35135135135135137, "grad_norm": 0.18783989548683167, "learning_rate": 0.0001644397030464877, "loss": 0.9387, "step": 65 }, { "epoch": 0.3783783783783784, "grad_norm": 0.2297225445508957, "learning_rate": 0.00015692622352080662, "loss": 0.9212, "step": 70 }, { "epoch": 0.40540540540540543, "grad_norm": 0.19587929546833038, "learning_rate": 0.00014890339920698334, "loss": 0.9379, "step": 75 }, { "epoch": 0.43243243243243246, "grad_norm": 0.17770881950855255, "learning_rate": 0.0001404430139595877, "loss": 0.9233, "step": 80 }, { "epoch": 0.4594594594594595, "grad_norm": 0.1968083679676056, "learning_rate": 0.0001316207666896824, "loss": 0.9277, "step": 85 }, { "epoch": 0.4864864864864865, "grad_norm": 0.2078552544116974, "learning_rate": 0.00012251559405226941, "loss": 0.9141, "step": 90 }, { "epoch": 0.5135135135135135, "grad_norm": 0.217142254114151, "learning_rate": 0.00011320896416417026, "loss": 0.9178, "step": 95 }, { "epoch": 0.5405405405405406, "grad_norm": 0.21711134910583496, "learning_rate": 0.00010378414767176705, "loss": 0.9074, "step": 100 }, { "epoch": 0.5675675675675675, "grad_norm": 0.2110918015241623, "learning_rate": 9.432547269069261e-05, "loss": 0.9109, "step": 105 }, { "epoch": 0.5945945945945946, "grad_norm": 0.19920781254768372, "learning_rate": 8.491757028386263e-05, "loss": 0.9087, "step": 110 }, { "epoch": 0.6216216216216216, "grad_norm": 0.19524553418159485, "learning_rate": 7.564461722890081e-05, "loss": 0.916, "step": 115 }, { "epoch": 0.6486486486486487, "grad_norm": 0.1919243484735489, "learning_rate": 
6.658958285026102e-05, "loss": 0.9086, "step": 120 }, { "epoch": 0.6756756756756757, "grad_norm": 0.1887766569852829, "learning_rate": 5.7833486654981606e-05, "loss": 0.9115, "step": 125 }, { "epoch": 0.7027027027027027, "grad_norm": 0.19857022166252136, "learning_rate": 4.945467341434195e-05, "loss": 0.9091, "step": 130 }, { "epoch": 0.7297297297297297, "grad_norm": 0.18941636383533478, "learning_rate": 4.152811217759529e-05, "loss": 0.9086, "step": 135 }, { "epoch": 0.7567567567567568, "grad_norm": 0.1900327503681183, "learning_rate": 3.4124725489820645e-05, "loss": 0.8949, "step": 140 }, { "epoch": 0.7837837837837838, "grad_norm": 0.2118791937828064, "learning_rate": 2.7310754815685624e-05, "loss": 0.9032, "step": 145 }, { "epoch": 0.8108108108108109, "grad_norm": 0.1908016800880432, "learning_rate": 2.1147167846963422e-05, "loss": 0.9004, "step": 150 }, { "epoch": 0.8378378378378378, "grad_norm": 0.18138864636421204, "learning_rate": 1.5689112996891576e-05, "loss": 0.9078, "step": 155 }, { "epoch": 0.8648648648648649, "grad_norm": 0.17937543988227844, "learning_rate": 1.0985425962260343e-05, "loss": 0.9048, "step": 160 }, { "epoch": 0.8918918918918919, "grad_norm": 0.17949175834655762, "learning_rate": 7.078192768243486e-06, "loss": 0.8973, "step": 165 }, { "epoch": 0.918918918918919, "grad_norm": 0.18442046642303467, "learning_rate": 4.002373205607723e-06, "loss": 0.9025, "step": 170 }, { "epoch": 0.9459459459459459, "grad_norm": 0.1833404004573822, "learning_rate": 1.7854880295797405e-06, "loss": 0.888, "step": 175 }, { "epoch": 0.972972972972973, "grad_norm": 0.18067924678325653, "learning_rate": 4.4737271914411236e-07, "loss": 0.8983, "step": 180 }, { "epoch": 1.0, "grad_norm": 0.20572535693645477, "learning_rate": 0.0, "loss": 0.8994, "step": 185 }, { "epoch": 1.0, "eval_loss": 1.3997735977172852, "eval_runtime": 0.6709, "eval_samples_per_second": 16.396, "eval_steps_per_second": 1.491, "step": 185 }, { "epoch": 1.0, "step": 185, "total_flos": 9.060485625492275e+17, "train_loss": 0.9761296278721577, "train_runtime": 715.1184, "train_samples_per_second": 57.936, "train_steps_per_second": 0.259 } ], "logging_steps": 5, "max_steps": 185, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 9.060485625492275e+17, "train_batch_size": 14, "trial_name": null, "trial_params": null }