{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 4.0, "eval_steps": 500, "global_step": 832, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.12062726176115803, "grad_norm": 0.21568018198013306, "learning_rate": 0.00027872727272727267, "loss": 1.5868, "mean_token_accuracy": 0.6624862551689148, "num_tokens": 281912.0, "step": 25 }, { "epoch": 0.24125452352231605, "grad_norm": 0.18077188730239868, "learning_rate": 0.0005109841901849095, "loss": 0.6885, "mean_token_accuracy": 0.814376802444458, "num_tokens": 567502.0, "step": 50 }, { "epoch": 0.3618817852834741, "grad_norm": 0.21275386214256287, "learning_rate": 0.0005104310520681908, "loss": 0.4572, "mean_token_accuracy": 0.8698280012607574, "num_tokens": 851707.0, "step": 75 }, { "epoch": 0.4825090470446321, "grad_norm": 0.21209321916103363, "learning_rate": 0.0005090893786665853, "loss": 0.2886, "mean_token_accuracy": 0.9166975712776184, "num_tokens": 1136836.0, "step": 100 }, { "epoch": 0.6031363088057901, "grad_norm": 0.21541441977024078, "learning_rate": 0.0005069633199524458, "loss": 0.2088, "mean_token_accuracy": 0.9403706526756287, "num_tokens": 1421665.0, "step": 125 }, { "epoch": 0.7237635705669482, "grad_norm": 0.2024364322423935, "learning_rate": 0.000504059452105097, "loss": 0.1676, "mean_token_accuracy": 0.9520682036876679, "num_tokens": 1707215.0, "step": 150 }, { "epoch": 0.8443908323281062, "grad_norm": 0.15191498398780823, "learning_rate": 0.0005003867571698477, "loss": 0.1272, "mean_token_accuracy": 0.9645591104030609, "num_tokens": 1990889.0, "step": 175 }, { "epoch": 0.9650180940892642, "grad_norm": 0.16032542288303375, "learning_rate": 0.0004959565952753427, "loss": 0.1091, "mean_token_accuracy": 0.9695405942201615, "num_tokens": 2274523.0, "step": 200 }, { "epoch": 1.0, "eval_loss": 0.09917861223220825, "eval_mean_token_accuracy": 0.9725541690344451, "eval_num_tokens": 2355152.0, "eval_runtime": 14.4426, "eval_samples_per_second": 25.549, "eval_steps_per_second": 6.439, "step": 208 }, { "epoch": 1.0820265379975875, "grad_norm": 0.08378228545188904, "learning_rate": 0.0004907826694951906, "loss": 0.0898, "mean_token_accuracy": 0.9755665328084808, "num_tokens": 2562109.0, "step": 225 }, { "epoch": 1.2026537997587454, "grad_norm": 0.10287031531333923, "learning_rate": 0.0004848809834625538, "loss": 0.0713, "mean_token_accuracy": 0.9797296553850174, "num_tokens": 2845938.0, "step": 250 }, { "epoch": 1.3232810615199035, "grad_norm": 0.06548517942428589, "learning_rate": 0.00047826979186880607, "loss": 0.0673, "mean_token_accuracy": 0.9813395547866821, "num_tokens": 3129801.0, "step": 275 }, { "epoch": 1.4439083232810614, "grad_norm": 0.10134833306074142, "learning_rate": 0.00047096954399936976, "loss": 0.0597, "mean_token_accuracy": 0.9829099756479264, "num_tokens": 3412051.0, "step": 300 }, { "epoch": 1.5645355850422196, "grad_norm": 0.09496472030878067, "learning_rate": 0.00046300282048138635, "loss": 0.0608, "mean_token_accuracy": 0.9829603230953217, "num_tokens": 3697161.0, "step": 325 }, { "epoch": 1.6851628468033777, "grad_norm": 0.07210717350244522, "learning_rate": 0.0004543942634388648, "loss": 0.0574, "mean_token_accuracy": 0.9837546187639237, "num_tokens": 3980794.0, "step": 350 }, { "epoch": 1.8057901085645356, "grad_norm": 0.07639794051647186, "learning_rate": 0.00044517050027135087, "loss": 0.055, "mean_token_accuracy": 0.9842740440368652, "num_tokens": 4265160.0, "step": 375 }, { "epoch": 1.9264173703256935, "grad_norm": 0.06361191719770432, "learning_rate": 0.00043536006129187623, "loss": 0.0483, "mean_token_accuracy": 0.9864062821865082, "num_tokens": 4549538.0, "step": 400 }, { "epoch": 2.0, "eval_loss": 0.054612528532743454, "eval_mean_token_accuracy": 0.9850417369155473, "eval_num_tokens": 4710428.0, "eval_runtime": 14.4072, "eval_samples_per_second": 25.612, "eval_steps_per_second": 6.455, "step": 416 }, { "epoch": 2.0434258142340167, "grad_norm": 0.05373181030154228, "learning_rate": 0.00042499329147894713, "loss": 0.0502, "mean_token_accuracy": 0.986190964266197, "num_tokens": 4827513.0, "step": 425 }, { "epoch": 2.164053075995175, "grad_norm": 0.06150342524051666, "learning_rate": 0.0004141022566155328, "loss": 0.0439, "mean_token_accuracy": 0.9872288095951081, "num_tokens": 5110985.0, "step": 450 }, { "epoch": 2.284680337756333, "grad_norm": 0.06713089346885681, "learning_rate": 0.00040272064410538054, "loss": 0.0449, "mean_token_accuracy": 0.9868355304002762, "num_tokens": 5396931.0, "step": 475 }, { "epoch": 2.405307599517491, "grad_norm": 0.04453020170331001, "learning_rate": 0.0003908836587734436, "loss": 0.0406, "mean_token_accuracy": 0.9880980604887009, "num_tokens": 5682413.0, "step": 500 }, { "epoch": 2.5259348612786487, "grad_norm": 0.07445702701807022, "learning_rate": 0.00037862791397272533, "loss": 0.0365, "mean_token_accuracy": 0.988963897228241, "num_tokens": 5965283.0, "step": 525 }, { "epoch": 2.646562123039807, "grad_norm": 0.046956002712249756, "learning_rate": 0.00036599131833436156, "loss": 0.0373, "mean_token_accuracy": 0.9889561772346497, "num_tokens": 6249629.0, "step": 550 }, { "epoch": 2.767189384800965, "grad_norm": 0.05231843888759613, "learning_rate": 0.000353012958511237, "loss": 0.0357, "mean_token_accuracy": 0.9893056184053421, "num_tokens": 6535592.0, "step": 575 }, { "epoch": 2.887816646562123, "grad_norm": 0.046165380626916885, "learning_rate": 0.000339732978277825, "loss": 0.0331, "mean_token_accuracy": 0.9900722551345825, "num_tokens": 6818437.0, "step": 600 }, { "epoch": 3.0, "eval_loss": 0.04674848914146423, "eval_mean_token_accuracy": 0.9872116202949196, "eval_num_tokens": 7065581.0, "eval_runtime": 14.4011, "eval_samples_per_second": 25.623, "eval_steps_per_second": 6.458, "step": 624 }, { "epoch": 3.004825090470446, "grad_norm": 0.07378800958395004, "learning_rate": 0.0003261924543602117, "loss": 0.0346, "mean_token_accuracy": 0.990131803394593, "num_tokens": 7081259.0, "step": 625 }, { "epoch": 3.1254523522316044, "grad_norm": 0.06171029433608055, "learning_rate": 0.00031243326938037763, "loss": 0.0295, "mean_token_accuracy": 0.9908573353290557, "num_tokens": 7365656.0, "step": 650 }, { "epoch": 3.2460796139927623, "grad_norm": 0.058736372739076614, "learning_rate": 0.00029849798230773775, "loss": 0.0282, "mean_token_accuracy": 0.9913074177503586, "num_tokens": 7649008.0, "step": 675 }, { "epoch": 3.3667068757539202, "grad_norm": 0.05701276659965515, "learning_rate": 0.0002844296968186495, "loss": 0.0281, "mean_token_accuracy": 0.9912878751754761, "num_tokens": 7933268.0, "step": 700 }, { "epoch": 3.4873341375150786, "grad_norm": 0.05498967319726944, "learning_rate": 0.00027027192797107074, "loss": 0.0294, "mean_token_accuracy": 0.9906794387102127, "num_tokens": 8215897.0, "step": 725 }, { "epoch": 3.6079613992762365, "grad_norm": 0.06823839247226715, "learning_rate": 0.00025606846760676025, "loss": 0.0277, "mean_token_accuracy": 0.9914608508348465, "num_tokens": 8501329.0, "step": 750 }, { "epoch": 3.7285886610373944, "grad_norm": 0.0680052787065506, "learning_rate": 0.00024186324889734987, "loss": 0.0274, "mean_token_accuracy": 0.9915587675571441, "num_tokens": 8785631.0, "step": 775 }, { "epoch": 3.8492159227985523, "grad_norm": 0.05490550398826599, "learning_rate": 0.00022770021045326374, "loss": 0.0258, "mean_token_accuracy": 0.9919438254833222, "num_tokens": 9069077.0, "step": 800 }, { "epoch": 3.9698431845597106, "grad_norm": 0.03772555664181709, "learning_rate": 0.00021362316041581623, "loss": 0.0265, "mean_token_accuracy": 0.9916725689172745, "num_tokens": 9353723.0, "step": 825 }, { "epoch": 4.0, "eval_loss": 0.04437188059091568, "eval_mean_token_accuracy": 0.9882329240922005, "eval_num_tokens": 9420685.0, "eval_runtime": 14.4213, "eval_samples_per_second": 25.587, "eval_steps_per_second": 6.449, "step": 832 } ], "logging_steps": 25, "max_steps": 1456, "num_input_tokens_seen": 0, "num_train_epochs": 7, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.097329674898637e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }