| { | |
| "best_global_step": 896, | |
| "best_metric": -0.0009789273608475924, | |
| "best_model_checkpoint": "/home/datta0/mnt/dattafs/train/llama_r1math_grpo_acconly/checkpoint-896", | |
| "epoch": 1.0, | |
| "eval_steps": 64, | |
| "global_step": 1250, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "advantages": 0.0, | |
| "clip_ratio": 0.0, | |
| "coeff1": 0.0, | |
| "coeff2": 0.0, | |
| "completion_length": 1027.7083549499512, | |
| "epoch": 0.0008, | |
| "grad_norm": 0.01466101370737968, | |
| "kl": 0.0004107952117919922, | |
| "learning_rate": 2.4e-08, | |
| "loss": 0.0, | |
| "num_tokens": 87864.0, | |
| "reward": 0.0, | |
| "reward_std": 0.0, | |
| "rewards/accuracy_reward": 0.0, | |
| "step": 1 | |
| }, | |
| { | |
| "advantages": -1.6556845894299955e-10, | |
| "clip_ratio": 0.0, | |
| "coeff1": 0.0, | |
| "coeff2": 0.0, | |
| "completion_length": 1264.6620464324951, | |
| "epoch": 0.0128, | |
| "grad_norm": 0.025934769703142593, | |
| "kl": 0.00044070879618326824, | |
| "learning_rate": 3.84e-07, | |
| "loss": 0.0023, | |
| "num_tokens": 1674685.0, | |
| "reward": 0.012037037312984467, | |
| "reward_std": 0.026614275574684144, | |
| "rewards/accuracy_reward": 0.012037037312984467, | |
| "step": 16 | |
| }, | |
| { | |
| "advantages": -2.587007240373307e-10, | |
| "clip_ratio": 0.0, | |
| "coeff1": 0.0, | |
| "coeff2": 0.0, | |
| "completion_length": 1219.9435840845108, | |
| "epoch": 0.0256, | |
| "grad_norm": 0.04061673893266753, | |
| "kl": 0.0007980614900588989, | |
| "learning_rate": 7.68e-07, | |
| "loss": -0.0021, | |
| "num_tokens": 3304853.0, | |
| "reward": 0.021701389166992158, | |
| "reward_std": 0.03553581167943776, | |
| "rewards/accuracy_reward": 0.021701389166992158, | |
| "step": 32 | |
| }, | |
| { | |
| "advantages": -1.810905054383527e-10, | |
| "clip_ratio": 0.0, | |
| "coeff1": 0.0, | |
| "coeff2": 0.0, | |
| "completion_length": 1160.891501903534, | |
| "epoch": 0.0384, | |
| "grad_norm": 0.04088811488095649, | |
| "kl": 0.0014685392379760742, | |
| "learning_rate": 1.1520000000000002e-06, | |
| "loss": 0.0038, | |
| "num_tokens": 4870287.0, | |
| "reward": 0.01736111141508445, | |
| "reward_std": 0.032931644935160875, | |
| "rewards/accuracy_reward": 0.01736111141508445, | |
| "step": 48 | |
| }, | |
| { | |
| "epoch": 0.0512, | |
| "grad_norm": 0.12319314674263099, | |
| "learning_rate": 1.536e-06, | |
| "loss": 0.0039, | |
| "step": 64 | |
| }, | |
| { | |
| "epoch": 0.0512, | |
| "eval_advantages": -2.5086130142878716e-10, | |
| "eval_clip_ratio": 0.0, | |
| "eval_coeff1": 0.0, | |
| "eval_coeff2": 0.0, | |
| "eval_completion_length": 1093.752197265625, | |
| "eval_kl": 0.005735917524857955, | |
| "eval_loss": 0.0002934922813437879, | |
| "eval_num_tokens": 6538255.0, | |
| "eval_reward": 0.033670033920894966, | |
| "eval_reward_std": 0.06973752108487216, | |
| "eval_rewards/accuracy_reward": 0.033670033920894966, | |
| "eval_runtime": 551.8022, | |
| "eval_samples_per_second": 0.058, | |
| "eval_steps_per_second": 0.004, | |
| "step": 64 | |
| }, | |
| { | |
| "advantages": -4.009861184414709e-10, | |
| "clip_ratio": 0.0, | |
| "coeff1": 0.0, | |
| "coeff2": 0.0, | |
| "completion_length": 1243.5607726573944, | |
| "epoch": 0.064, | |
| "grad_norm": 0.27812226841587845, | |
| "kl": 0.008373796939849854, | |
| "learning_rate": 1.9200000000000003e-06, | |
| "loss": 0.0007, | |
| "num_tokens": 8191544.0, | |
| "reward": 0.034288194990949705, | |
| "reward_std": 0.05555227259173989, | |
| "rewards/accuracy_reward": 0.034288194990949705, | |
| "step": 80 | |
| }, | |
| { | |
| "advantages": -4.915313728953707e-10, | |
| "clip_ratio": 0.0, | |
| "coeff1": 0.0, | |
| "coeff2": 0.0, | |
| "completion_length": 1286.811640381813, | |
| "epoch": 0.0768, | |
| "grad_norm": 0.08377069137091007, | |
| "kl": 0.011660099029541016, | |
| "learning_rate": 2.3040000000000003e-06, | |
| "loss": 0.0102, | |
| "num_tokens": 9907465.0, | |
| "reward": 0.03819444525288418, | |
| "reward_std": 0.06641548709012568, | |
| "rewards/accuracy_reward": 0.03819444525288418, | |
| "step": 96 | |
| }, | |
| { | |
| "advantages": -6.726218748642765e-10, | |
| "clip_ratio": 0.0, | |
| "coeff1": 0.0, | |
| "coeff2": 0.0, | |
| "completion_length": 1128.9600811004639, | |
| "epoch": 0.0896, | |
| "grad_norm": 0.16347202160725127, | |
| "kl": 0.09761714935302734, | |
| "learning_rate": 2.688e-06, | |
| "loss": 0.0047, | |
| "num_tokens": 11442612.0, | |
| "reward": 0.04079861199716106, | |
| "reward_std": 0.06649718736298382, | |
| "rewards/accuracy_reward": 0.04079861199716106, | |
| "step": 112 | |
| }, | |
| { | |
| "epoch": 0.1024, | |
| "grad_norm": 0.27678886955004334, | |
| "learning_rate": 2.999947362417721e-06, | |
| "loss": 0.0042, | |
| "step": 128 | |
| }, | |
| { | |
| "epoch": 0.1024, | |
| "eval_advantages": -2.0068904114302975e-10, | |
| "eval_clip_ratio": 0.0, | |
| "eval_coeff1": 0.0, | |
| "eval_coeff2": 0.0, | |
| "eval_completion_length": 1419.1784390536222, | |
| "eval_kl": 0.029097123579545456, | |
| "eval_loss": 0.010437126271426678, | |
| "eval_num_tokens": 12942930.0, | |
| "eval_reward": 0.04713804748925296, | |
| "eval_reward_std": 0.07804939692670648, | |
| "eval_rewards/accuracy_reward": 0.04713804748925296, | |
| "eval_runtime": 728.7234, | |
| "eval_samples_per_second": 0.044, | |
| "eval_steps_per_second": 0.003, | |
| "step": 128 | |
| }, | |
| { | |
| "advantages": -6.72621876599e-10, | |
| "clip_ratio": 0.0, | |
| "coeff1": 0.0, | |
| "coeff2": 0.0, | |
| "completion_length": 1180.0976678729057, | |
| "epoch": 0.1152, | |
| "grad_norm": 0.1291924427524718, | |
| "kl": 0.03859281539916992, | |
| "learning_rate": 2.997889131011168e-06, | |
| "loss": -0.0005, | |
| "num_tokens": 14623194.0, | |
| "reward": 0.04861111196805723, | |
| "reward_std": 0.09081879071891308, | |
| "rewards/accuracy_reward": 0.04861111196805723, | |
| "step": 144 | |
| }, | |
| { | |
| "advantages": 2.328306471233166e-10, | |
| "clip_ratio": 0.0, | |
| "coeff1": 0.0, | |
| "coeff2": 0.0, | |
| "completion_length": 808.3142422437668, | |
| "epoch": 0.128, | |
| "grad_norm": 0.2312672301481172, | |
| "kl": 0.05692291259765625, | |
| "learning_rate": 2.9928410999727467e-06, | |
| "loss": 0.0056, | |
| "num_tokens": 15791756.0, | |
| "reward": 0.030381944845430553, | |
| "reward_std": 0.06712671066634357, | |
| "rewards/accuracy_reward": 0.030381944845430553, | |
| "step": 160 | |
| }, | |
| { | |
| "advantages": -2.32830652327487e-10, | |
| "clip_ratio": 0.0, | |
| "coeff1": 0.0, | |
| "coeff2": 0.0, | |
| "completion_length": 1044.7361204624176, | |
| "epoch": 0.1408, | |
| "grad_norm": 0.2204521818711921, | |
| "kl": 0.055908203125, | |
| "learning_rate": 2.9848133452159737e-06, | |
| "loss": 0.0019, | |
| "num_tokens": 17229706.0, | |
| "reward": 0.043402778333984315, | |
| "reward_std": 0.056710043689236045, | |
| "rewards/accuracy_reward": 0.043402778333984315, | |
| "step": 176 | |
| }, | |
| { | |
| "epoch": 0.1536, | |
| "grad_norm": 0.2784129015892714, | |
| "learning_rate": 2.97382189020862e-06, | |
| "loss": 0.0035, | |
| "step": 192 | |
| }, | |
| { | |
| "epoch": 0.1536, | |
| "eval_advantages": 4.264642124289382e-10, | |
| "eval_clip_ratio": 0.0, | |
| "eval_coeff1": 0.0, | |
| "eval_coeff2": 0.0, | |
| "eval_completion_length": 1347.6511951793325, | |
| "eval_kl": 0.04640891335227273, | |
| "eval_loss": 0.009176909923553467, | |
| "eval_num_tokens": 18805959.0, | |
| "eval_reward": 0.033670033920894966, | |
| "eval_reward_std": 0.08733082156289708, | |
| "eval_rewards/accuracy_reward": 0.033670033920894966, | |
| "eval_runtime": 743.3844, | |
| "eval_samples_per_second": 0.043, | |
| "eval_steps_per_second": 0.003, | |
| "step": 192 | |
| }, | |
| { | |
| "advantages": -1.0348028857409819e-10, | |
| "clip_ratio": 0.0, | |
| "coeff1": 0.0, | |
| "coeff2": 0.0, | |
| "completion_length": 1291.665376842022, | |
| "epoch": 0.1664, | |
| "grad_norm": 0.3752515234781844, | |
| "kl": 0.053093910217285156, | |
| "learning_rate": 2.959888673989734e-06, | |
| "loss": 0.0065, | |
| "num_tokens": 20658358.0, | |
| "reward": 0.023871528188465163, | |
| "reward_std": 0.0477204411290586, | |
| "rewards/accuracy_reward": 0.023871528188465163, | |
| "step": 208 | |
| }, | |
| { | |
| "advantages": -1.810905054383527e-10, | |
| "clip_ratio": 0.0, | |
| "coeff1": 0.0, | |
| "coeff2": 0.0, | |
| "completion_length": 1179.581609249115, | |
| "epoch": 0.1792, | |
| "grad_norm": 0.1424187976880637, | |
| "kl": 0.05293464660644531, | |
| "learning_rate": 2.943041507379129e-06, | |
| "loss": 0.0057, | |
| "num_tokens": 22251551.0, | |
| "reward": 0.026041667151730508, | |
| "reward_std": 0.04704763786867261, | |
| "rewards/accuracy_reward": 0.026041667151730508, | |
| "step": 224 | |
| }, | |
| { | |
| "advantages": -1.810905054383527e-10, | |
| "clip_ratio": 0.0, | |
| "coeff1": 0.0, | |
| "coeff2": 0.0, | |
| "completion_length": 1221.903657913208, | |
| "epoch": 0.192, | |
| "grad_norm": 0.06793437710568745, | |
| "kl": 0.0833587646484375, | |
| "learning_rate": 2.9233140174667447e-06, | |
| "loss": 0.0039, | |
| "num_tokens": 23897243.0, | |
| "reward": 0.02170138928340748, | |
| "reward_std": 0.04070548270829022, | |
| "rewards/accuracy_reward": 0.02170138928340748, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.2048, | |
| "grad_norm": 0.0687227576536205, | |
| "learning_rate": 2.90074558049269e-06, | |
| "loss": 0.0087, | |
| "step": 256 | |
| }, | |
| { | |
| "epoch": 0.2048, | |
| "eval_advantages": -4.013780822860595e-10, | |
| "eval_clip_ratio": 0.0, | |
| "eval_coeff1": 0.0, | |
| "eval_coeff2": 0.0, | |
| "eval_completion_length": 1184.7589333274148, | |
| "eval_kl": 0.05091441761363636, | |
| "eval_loss": 0.019146718084812164, | |
| "eval_num_tokens": 25328098.0, | |
| "eval_reward": 0.09090909158641641, | |
| "eval_reward_std": 0.12024257670749318, | |
| "eval_rewards/accuracy_reward": 0.09090909158641641, | |
| "eval_runtime": 605.2476, | |
| "eval_samples_per_second": 0.053, | |
| "eval_steps_per_second": 0.003, | |
| "step": 256 | |
| }, | |
| { | |
| "advantages": -3.3631094176894694e-10, | |
| "clip_ratio": 0.0, | |
| "coeff1": 0.0, | |
| "coeff2": 0.0, | |
| "completion_length": 1135.4952353835106, | |
| "epoch": 0.2176, | |
| "grad_norm": 0.13313117965541169, | |
| "kl": 0.05780601501464844, | |
| "learning_rate": 2.875381243251925e-06, | |
| "loss": 0.0062, | |
| "num_tokens": 26977725.0, | |
| "reward": 0.03515625064028427, | |
| "reward_std": 0.06366020266432315, | |
| "rewards/accuracy_reward": 0.03515625064028427, | |
| "step": 272 | |
| }, | |
| { | |
| "advantages": -4.3979123294513034e-10, | |
| "clip_ratio": 0.0, | |
| "coeff1": 0.0, | |
| "coeff2": 0.0, | |
| "completion_length": 1205.5920230150223, | |
| "epoch": 0.2304, | |
| "grad_norm": 0.12756211705162357, | |
| "kl": 0.06094169616699219, | |
| "learning_rate": 2.8472716331804677e-06, | |
| "loss": 0.0077, | |
| "num_tokens": 28604140.0, | |
| "reward": 0.03125000046566129, | |
| "reward_std": 0.07459831284359097, | |
| "rewards/accuracy_reward": 0.03125000046566129, | |
| "step": 288 | |
| }, | |
| { | |
| "advantages": -4.656613011855271e-10, | |
| "clip_ratio": 0.0, | |
| "coeff1": 0.0, | |
| "coeff2": 0.0, | |
| "completion_length": 992.4383769035339, | |
| "epoch": 0.2432, | |
| "grad_norm": 0.3503219113511989, | |
| "kl": 0.06413078308105469, | |
| "learning_rate": 2.8164728573026006e-06, | |
| "loss": 0.0034, | |
| "num_tokens": 29971997.0, | |
| "reward": 0.03211805614409968, | |
| "reward_std": 0.048306683311238885, | |
| "rewards/accuracy_reward": 0.03211805614409968, | |
| "step": 304 | |
| }, | |
| { | |
| "epoch": 0.256, | |
| "grad_norm": 0.0744612699055891, | |
| "learning_rate": 2.78304639024076e-06, | |
| "loss": 0.0067, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.256, | |
| "eval_advantages": 3.0103356171454464e-10, | |
| "eval_clip_ratio": 0.0, | |
| "eval_coeff1": 0.0, | |
| "eval_coeff2": 0.0, | |
| "eval_completion_length": 894.4821555397727, | |
| "eval_kl": 0.055109197443181816, | |
| "eval_loss": 0.008422672748565674, | |
| "eval_num_tokens": 31693821.0, | |
| "eval_reward": 0.047138048843903976, | |
| "eval_reward_std": 0.10330192202871497, | |
| "eval_rewards/accuracy_reward": 0.04713804613460194, | |
| "eval_runtime": 320.0579, | |
| "eval_samples_per_second": 0.1, | |
| "eval_steps_per_second": 0.006, | |
| "step": 320 | |
| }, | |
| { | |
| "advantages": 7.761021339480756e-11, | |
| "clip_ratio": 0.0, | |
| "coeff1": 0.0, | |
| "coeff2": 0.0, | |
| "completion_length": 991.0763994455338, | |
| "epoch": 0.2688, | |
| "grad_norm": 0.06535007437884081, | |
| "kl": 0.059384822845458984, | |
| "learning_rate": 2.7470589515116627e-06, | |
| "loss": 0.004, | |
| "num_tokens": 1372957.0, | |
| "reward": 0.046006944903638214, | |
| "reward_std": 0.06322046089917421, | |
| "rewards/accuracy_reward": 0.046006944903638214, | |
| "step": 336 | |
| }, | |
| { | |
| "advantages": 2.5870069975120202e-11, | |
| "clip_ratio": 0.0, | |
| "coeff1": 0.0, | |
| "coeff2": 0.0, | |
| "completion_length": 1039.1059153676033, | |
| "epoch": 0.2816, | |
| "grad_norm": 0.45362050511921775, | |
| "kl": 0.08066940307617188, | |
| "learning_rate": 2.7085823723535738e-06, | |
| "loss": 0.0086, | |
| "num_tokens": 2797554.0, | |
| "reward": 0.0460069450782612, | |
| "reward_std": 0.07455965038388968, | |
| "rewards/accuracy_reward": 0.0460069450782612, | |
| "step": 352 | |
| }, | |
| { | |
| "advantages": -8.537123698942883e-10, | |
| "clip_ratio": 0.0, | |
| "coeff1": 0.0, | |
| "coeff2": 0.0, | |
| "completion_length": 1101.8316051959991, | |
| "epoch": 0.2944, | |
| "grad_norm": 0.11366606297326208, | |
| "kl": 0.06418609619140625, | |
| "learning_rate": 2.6676934523505355e-06, | |
| "loss": 0.0083, | |
| "num_tokens": 4299874.0, | |
| "reward": 0.04079861211357638, | |
| "reward_std": 0.06628588796593249, | |
| "rewards/accuracy_reward": 0.04079861211357638, | |
| "step": 368 | |
| }, | |
| { | |
| "epoch": 0.3072, | |
| "grad_norm": 0.10560527731603102, | |
| "learning_rate": 2.6244738061397326e-06, | |
| "loss": 0.0066, | |
| "step": 384 | |
| }, | |
| { | |
| "epoch": 0.3072, | |
| "eval_advantages": 6.020671234290893e-10, | |
| "eval_clip_ratio": 0.0, | |
| "eval_coeff1": 0.0, | |
| "eval_coeff2": 0.0, | |
| "eval_completion_length": 949.9097789417614, | |
| "eval_kl": 0.05579723011363636, | |
| "eval_loss": 0.011235292069613934, | |
| "eval_num_tokens": 5860471.0, | |
| "eval_reward": 0.04040404070507397, | |
| "eval_reward_std": 0.09238132766701958, | |
| "eval_rewards/accuracy_reward": 0.04040404070507397, | |
| "eval_runtime": 373.4119, | |
| "eval_samples_per_second": 0.086, | |
| "eval_steps_per_second": 0.005, | |
| "step": 384 | |
| }, | |
| { | |
| "advantages": -4.2685619362076155e-10, | |
| "clip_ratio": 0.0, | |
| "coeff1": 0.0, | |
| "coeff2": 0.0, | |
| "completion_length": 1075.3763124346733, | |
| "epoch": 0.32, | |
| "grad_norm": 0.14605916839741878, | |
| "kl": 0.08471107482910156, | |
| "learning_rate": 2.5790097005079765e-06, | |
| "loss": 0.0022, | |
| "num_tokens": 7238953.0, | |
| "reward": 0.026041667180834338, | |
| "reward_std": 0.045010624220594764, | |
| "rewards/accuracy_reward": 0.026041667180834338, | |
| "step": 400 | |
| }, | |
| { | |
| "advantages": -1.8109050890779965e-10, | |
| "clip_ratio": 0.0, | |
| "coeff1": 0.0, | |
| "coeff2": 0.0, | |
| "completion_length": 1245.3810901641846, | |
| "epoch": 0.3328, | |
| "grad_norm": 0.1183172246791569, | |
| "kl": 0.05459880828857422, | |
| "learning_rate": 2.531391882202451e-06, | |
| "loss": 0.0028, | |
| "num_tokens": 8902421.0, | |
| "reward": 0.0295138893998228, | |
| "reward_std": 0.04784542182460427, | |
| "rewards/accuracy_reward": 0.0295138893998228, | |
| "step": 416 | |
| }, | |
| { | |
| "advantages": -6.467517979502624e-10, | |
| "clip_ratio": 0.0, | |
| "coeff1": 0.0, | |
| "coeff2": 0.0, | |
| "completion_length": 1229.9618154764175, | |
| "epoch": 0.3456, | |
| "grad_norm": 0.3066664475065796, | |
| "kl": 0.05420970916748047, | |
| "learning_rate": 2.4817153967994223e-06, | |
| "loss": 0.0029, | |
| "num_tokens": 10560447.0, | |
| "reward": 0.029513889574445784, | |
| "reward_std": 0.0411667434964329, | |
| "rewards/accuracy_reward": 0.029513889574445784, | |
| "step": 432 | |
| }, | |
| { | |
| "epoch": 0.3584, | |
| "grad_norm": 0.1268279921380881, | |
| "learning_rate": 2.430079398992449e-06, | |
| "loss": 0.0055, | |
| "step": 448 | |
| }, | |
| { | |
| "epoch": 0.3584, | |
| "eval_advantages": -1.153961986572421e-09, | |
| "eval_clip_ratio": 0.0, | |
| "eval_coeff1": 0.0, | |
| "eval_coeff2": 0.0, | |
| "eval_completion_length": 906.0922629616477, | |
| "eval_kl": 0.054398970170454544, | |
| "eval_loss": 0.009388250298798084, | |
| "eval_num_tokens": 12040457.0, | |
| "eval_reward": 0.04040404070507397, | |
| "eval_reward_std": 0.07722981409593062, | |
| "eval_rewards/accuracy_reward": 0.04040404070507397, | |
| "eval_runtime": 424.1842, | |
| "eval_samples_per_second": 0.075, | |
| "eval_steps_per_second": 0.005, | |
| "step": 448 | |
| }, | |
| { | |
| "advantages": -4.91531372028009e-10, | |
| "clip_ratio": 0.0, | |
| "coeff1": 0.0, | |
| "coeff2": 0.0, | |
| "completion_length": 1094.4409800171852, | |
| "epoch": 0.3712, | |
| "grad_norm": 0.06191021180344248, | |
| "kl": 0.060648441314697266, | |
| "learning_rate": 2.376586954678758e-06, | |
| "loss": 0.0065, | |
| "num_tokens": 13547906.0, | |
| "reward": 0.034288195049157366, | |
| "reward_std": 0.061476447503082454, | |
| "rewards/accuracy_reward": 0.034288195049157366, | |
| "step": 464 | |
| }, | |
| { | |
| "advantages": -7.761022553787189e-11, | |
| "clip_ratio": 0.0, | |
| "coeff1": 0.0, | |
| "coeff2": 0.0, | |
| "completion_length": 961.0772646665573, | |
| "epoch": 0.384, | |
| "grad_norm": 1.7554215023891546, | |
| "kl": 0.08006000518798828, | |
| "learning_rate": 2.3213448352388254e-06, | |
| "loss": 0.0062, | |
| "num_tokens": 14885431.0, | |
| "reward": 0.03385416715173051, | |
| "reward_std": 0.0651520665269345, | |
| "rewards/accuracy_reward": 0.03385416715173051, | |
| "step": 480 | |
| }, | |
| { | |
| "advantages": -2.587007517929063e-11, | |
| "clip_ratio": 0.0, | |
| "coeff1": 0.0, | |
| "coeff2": 0.0, | |
| "completion_length": 1175.2361198663712, | |
| "epoch": 0.3968, | |
| "grad_norm": 0.08822820636304002, | |
| "kl": 0.060199737548828125, | |
| "learning_rate": 2.2644633044197768e-06, | |
| "loss": 0.0032, | |
| "num_tokens": 16476912.0, | |
| "reward": 0.012152777926530689, | |
| "reward_std": 0.02902539470233023, | |
| "rewards/accuracy_reward": 0.012152777926530689, | |
| "step": 496 | |
| }, | |
| { | |
| "epoch": 0.4096, | |
| "grad_norm": 0.25608207256166327, | |
| "learning_rate": 2.2060558982479992e-06, | |
| "loss": 0.0046, | |
| "step": 512 | |
| }, | |
| { | |
| "epoch": 0.4096, | |
| "eval_advantages": -1.2543065071439358e-10, | |
| "eval_clip_ratio": 0.0, | |
| "eval_coeff1": 0.0, | |
| "eval_coeff2": 0.0, | |
| "eval_completion_length": 946.973066850142, | |
| "eval_kl": 0.054021661931818184, | |
| "eval_loss": 0.0011832525487989187, | |
| "eval_num_tokens": 18159572.0, | |
| "eval_reward": 0.04377104409716346, | |
| "eval_reward_std": 0.08733082630417564, | |
| "eval_rewards/accuracy_reward": 0.04377104409716346, | |
| "eval_runtime": 393.0584, | |
| "eval_samples_per_second": 0.081, | |
| "eval_steps_per_second": 0.005, | |
| "step": 512 | |
| }, | |
| { | |
| "advantages": -2.5870072056788374e-10, | |
| "clip_ratio": 0.0, | |
| "coeff1": 0.0, | |
| "coeff2": 0.0, | |
| "completion_length": 1322.1076508760452, | |
| "epoch": 0.4224, | |
| "grad_norm": 5.019169577841116, | |
| "kl": 0.06106758117675781, | |
| "learning_rate": 2.1462391984102506e-06, | |
| "loss": 0.0056, | |
| "num_tokens": 19980464.0, | |
| "reward": 0.03645833395421505, | |
| "reward_std": 0.052383382339030504, | |
| "rewards/accuracy_reward": 0.03645833395421505, | |
| "step": 528 | |
| }, | |
| { | |
| "advantages": -1.2935036028394187e-10, | |
| "clip_ratio": 0.0, | |
| "coeff1": 0.0, | |
| "coeff2": 0.0, | |
| "completion_length": 1045.0260478258133, | |
| "epoch": 0.4352, | |
| "grad_norm": 0.12391478097100359, | |
| "kl": 0.05616188049316406, | |
| "learning_rate": 2.0851325995556095e-06, | |
| "loss": 0.0048, | |
| "num_tokens": 21416561.0, | |
| "reward": 0.026041667093522847, | |
| "reward_std": 0.05095388810150325, | |
| "rewards/accuracy_reward": 0.026041667093522847, | |
| "step": 544 | |
| }, | |
| { | |
| "advantages": -8.537123855067996e-10, | |
| "clip_ratio": 0.0, | |
| "coeff1": 0.0, | |
| "coeff2": 0.0, | |
| "completion_length": 1117.5321254730225, | |
| "epoch": 0.448, | |
| "grad_norm": 0.052893524659718465, | |
| "kl": 0.05390167236328125, | |
| "learning_rate": 2.022858070982723e-06, | |
| "loss": 0.0052, | |
| "num_tokens": 22935690.0, | |
| "reward": 0.06076388992369175, | |
| "reward_std": 0.07754337647929788, | |
| "rewards/accuracy_reward": 0.06076388992369175, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.4608, | |
| "grad_norm": 0.33061073442663064, | |
| "learning_rate": 1.9595399131880336e-06, | |
| "loss": 0.0109, | |
| "step": 576 | |
| }, | |
| { | |
| "epoch": 0.4608, | |
| "eval_advantages": 5.268087330004531e-10, | |
| "eval_clip_ratio": 0.0, | |
| "eval_coeff1": 0.0, | |
| "eval_coeff2": 0.0, | |
| "eval_completion_length": 1592.6956287730825, | |
| "eval_kl": 6.670787464488637, | |
| "eval_loss": 0.2936657965183258, | |
| "eval_num_tokens": 24698629.0, | |
| "eval_reward": 0.06397306377237494, | |
| "eval_reward_std": 0.09890406985174525, | |
| "eval_rewards/accuracy_reward": 0.06397306377237494, | |
| "eval_runtime": 777.425, | |
| "eval_samples_per_second": 0.041, | |
| "eval_steps_per_second": 0.003, | |
| "step": 576 | |
| }, | |
| { | |
| "advantages": -2.4576568384560016e-10, | |
| "clip_ratio": 0.0, | |
| "coeff1": 0.0, | |
| "coeff2": 0.0, | |
| "completion_length": 1218.7899428009987, | |
| "epoch": 0.4736, | |
| "grad_norm": 0.1739154737585171, | |
| "kl": 0.06676912307739258, | |
| "learning_rate": 1.8953045097609192e-06, | |
| "loss": 0.0054, | |
| "num_tokens": 26199776.0, | |
| "reward": 0.039930556318722665, | |
| "reward_std": 0.067484509665519, | |
| "rewards/accuracy_reward": 0.039930556318722665, | |
| "step": 592 | |
| }, | |
| { | |
| "advantages": -4.915313624870299e-10, | |
| "clip_ratio": 0.0, | |
| "coeff1": 0.0, | |
| "coeff2": 0.0, | |
| "completion_length": 972.5477449893951, | |
| "epoch": 0.4864, | |
| "grad_norm": 0.08699168333257727, | |
| "kl": 0.12245941162109375, | |
| "learning_rate": 1.8302800751209524e-06, | |
| "loss": 0.0078, | |
| "num_tokens": 27551361.0, | |
| "reward": 0.037326389574445784, | |
| "reward_std": 0.061457116389647126, | |
| "rewards/accuracy_reward": 0.037326389574445784, | |
| "step": 608 | |
| }, | |
| { | |
| "advantages": -1.0348028683937471e-10, | |
| "clip_ratio": 0.0, | |
| "coeff1": 0.0, | |
| "coeff2": 0.0, | |
| "completion_length": 1149.3463646173477, | |
| "epoch": 0.4992, | |
| "grad_norm": 0.11664913948177272, | |
| "kl": 0.05477714538574219, | |
| "learning_rate": 1.7645963986008188e-06, | |
| "loss": 0.0034, | |
| "num_tokens": 29104197.0, | |
| "reward": 0.0381944450782612, | |
| "reward_std": 0.0675879716873169, | |
| "rewards/accuracy_reward": 0.0381944450782612, | |
| "step": 624 | |
| }, | |
| { | |
| "epoch": 0.512, | |
| "grad_norm": 0.05178259833648462, | |
| "learning_rate": 1.698384585385684e-06, | |
| "loss": 0.0025, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.512, | |
| "eval_advantages": 7.776700344292402e-10, | |
| "eval_clip_ratio": 0.0, | |
| "eval_coeff1": 0.0, | |
| "eval_coeff2": 0.0, | |
| "eval_completion_length": 852.7952991832386, | |
| "eval_kl": 0.051957563920454544, | |
| "eval_loss": 0.0046281577087938786, | |
| "eval_num_tokens": 30476142.0, | |
| "eval_reward": 0.04713804816657847, | |
| "eval_reward_std": 0.08570862019603903, | |
| "eval_rewards/accuracy_reward": 0.04713804681192745, | |
| "eval_runtime": 339.9404, | |
| "eval_samples_per_second": 0.094, | |
| "eval_steps_per_second": 0.006, | |
| "step": 640 | |
| }, | |
| { | |
| "advantages": -1.1641532442902003e-10, | |
| "clip_ratio": 0.0, | |
| "coeff1": 0.0, | |
| "coeff2": 0.0, | |
| "completion_length": 1037.3619900345802, | |
| "epoch": 0.5248, | |
| "grad_norm": 0.1715480919627143, | |
| "kl": 0.05947065353393555, | |
| "learning_rate": 1.6317767948261151e-06, | |
| "loss": 0.0068, | |
| "num_tokens": 31962531.0, | |
| "reward": 0.036892361589707434, | |
| "reward_std": 0.056289632339030504, | |
| "rewards/accuracy_reward": 0.036892361589707434, | |
| "step": 656 | |
| }, | |
| { | |
| "advantages": -3.469446951953614e-18, | |
| "clip_ratio": 0.0, | |
| "coeff1": 0.0, | |
| "coeff2": 0.0, | |
| "completion_length": 1162.2994906902313, | |
| "epoch": 0.5376, | |
| "grad_norm": 0.12907042606523647, | |
| "kl": 0.058727264404296875, | |
| "learning_rate": 1.5649059766468756e-06, | |
| "loss": 0.0033, | |
| "num_tokens": 33531576.0, | |
| "reward": 0.023437500232830644, | |
| "reward_std": 0.03835127782076597, | |
| "rewards/accuracy_reward": 0.023437500232830644, | |
| "step": 672 | |
| }, | |
| { | |
| "advantages": -7.761021634383747e-10, | |
| "clip_ratio": 0.0, | |
| "coeff1": 0.0, | |
| "coeff2": 0.0, | |
| "completion_length": 1093.435774922371, | |
| "epoch": 0.5504, | |
| "grad_norm": 0.13341389554678587, | |
| "kl": 0.07756614685058594, | |
| "learning_rate": 1.4979056055781284e-06, | |
| "loss": 0.0117, | |
| "num_tokens": 35020210.0, | |
| "reward": 0.05642361217178404, | |
| "reward_std": 0.08594673778861761, | |
| "rewards/accuracy_reward": 0.05642361217178404, | |
| "step": 688 | |
| }, | |
| { | |
| "epoch": 0.5632, | |
| "grad_norm": 0.05130106262462248, | |
| "learning_rate": 1.4309094149387215e-06, | |
| "loss": 0.0065, | |
| "step": 704 | |
| }, | |
| { | |
| "epoch": 0.5632, | |
| "eval_advantages": -8.529284248578764e-10, | |
| "eval_clip_ratio": 0.0, | |
| "eval_coeff1": 0.0, | |
| "eval_coeff2": 0.0, | |
| "eval_completion_length": 1165.2128184925425, | |
| "eval_kl": 0.05832741477272727, | |
| "eval_loss": 0.009196331724524498, | |
| "eval_num_tokens": 36812916.0, | |
| "eval_reward": 0.05387205495075746, | |
| "eval_reward_std": 0.08570862019603903, | |
| "eval_rewards/accuracy_reward": 0.053872053596106445, | |
| "eval_runtime": 592.7192, | |
| "eval_samples_per_second": 0.054, | |
| "eval_steps_per_second": 0.003, | |
| "step": 704 | |
| }, | |
| { | |
| "advantages": -6.079466955896673e-10, | |
| "clip_ratio": 0.0, | |
| "coeff1": 0.0, | |
| "coeff2": 0.0, | |
| "completion_length": 1193.20140260458, | |
| "epoch": 0.576, | |
| "grad_norm": 0.07760359685028215, | |
| "kl": 0.0630340576171875, | |
| "learning_rate": 1.36405112970333e-06, | |
| "loss": 0.0075, | |
| "num_tokens": 38224305.0, | |
| "reward": 0.04036458412883803, | |
| "reward_std": 0.06313632964156568, | |
| "rewards/accuracy_reward": 0.04036458412883803, | |
| "step": 720 | |
| }, | |
| { | |
| "advantages": -2.5870080383461058e-11, | |
| "clip_ratio": 0.0, | |
| "coeff1": 0.0, | |
| "coeff2": 0.0, | |
| "completion_length": 1223.709213733673, | |
| "epoch": 0.5888, | |
| "grad_norm": 0.15725359939619768, | |
| "kl": 0.07776260375976562, | |
| "learning_rate": 1.2974641995862513e-06, | |
| "loss": 0.006, | |
| "num_tokens": 39864679.0, | |
| "reward": 0.02430555591126904, | |
| "reward_std": 0.0411667434964329, | |
| "rewards/accuracy_reward": 0.02430555591126904, | |
| "step": 736 | |
| }, | |
| { | |
| "advantages": -3.6218100740725845e-10, | |
| "clip_ratio": 0.0, | |
| "coeff1": 0.0, | |
| "coeff2": 0.0, | |
| "completion_length": 1188.0920258760452, | |
| "epoch": 0.6016, | |
| "grad_norm": 0.2281893872440948, | |
| "kl": 0.05897712707519531, | |
| "learning_rate": 1.2312815326746265e-06, | |
| "loss": 0.0053, | |
| "num_tokens": 41460323.0, | |
| "reward": 0.026041667151730508, | |
| "reward_std": 0.03751045558601618, | |
| "rewards/accuracy_reward": 0.026041667151730508, | |
| "step": 752 | |
| }, | |
| { | |
| "epoch": 0.6144, | |
| "grad_norm": 0.09938089965885895, | |
| "learning_rate": 1.1656352301427494e-06, | |
| "loss": 0.0071, | |
| "step": 768 | |
| }, | |
| { | |
| "epoch": 0.6144, | |
| "eval_advantages": -1.8312875408018929e-09, | |
| "eval_clip_ratio": 0.0, | |
| "eval_coeff1": 0.0, | |
| "eval_coeff2": 0.0, | |
| "eval_completion_length": 1155.5676713423295, | |
| "eval_kl": 0.06094637784090909, | |
| "eval_loss": 0.01705244928598404, | |
| "eval_num_tokens": 43104138.0, | |
| "eval_reward": 0.060606061734936455, | |
| "eval_reward_std": 0.11014155772599307, | |
| "eval_rewards/accuracy_reward": 0.06060606038028544, | |
| "eval_runtime": 560.21, | |
| "eval_samples_per_second": 0.057, | |
| "eval_steps_per_second": 0.004, | |
| "step": 768 | |
| }, | |
| { | |
| "advantages": -5.950116606021072e-10, | |
| "clip_ratio": 0.0, | |
| "coeff1": 0.0, | |
| "coeff2": 0.0, | |
| "completion_length": 1141.80166092515, | |
| "epoch": 0.6272, | |
| "grad_norm": 0.09820866890356648, | |
| "kl": 0.05560588836669922, | |
| "learning_rate": 1.1006563225769834e-06, | |
| "loss": 0.0077, | |
| "num_tokens": 44553976.0, | |
| "reward": 0.05251736202626489, | |
| "reward_std": 0.07094883010722697, | |
| "rewards/accuracy_reward": 0.05251736202626489, | |
| "step": 784 | |
| }, | |
| { | |
| "advantages": -3.3631094610575563e-10, | |
| "clip_ratio": 0.0, | |
| "coeff1": 0.0, | |
| "coeff2": 0.0, | |
| "completion_length": 1058.8411583900452, | |
| "epoch": 0.64, | |
| "grad_norm": 2.8454182835451376, | |
| "kl": 0.0807657241821289, | |
| "learning_rate": 1.036474508437579e-06, | |
| "loss": 0.01, | |
| "num_tokens": 45998554.0, | |
| "reward": 0.0616319453692995, | |
| "reward_std": 0.08384686964564025, | |
| "rewards/accuracy_reward": 0.0616319453692995, | |
| "step": 800 | |
| }, | |
| { | |
| "advantages": -3.1044086572229457e-10, | |
| "clip_ratio": 0.0, | |
| "coeff1": 0.0, | |
| "coeff2": 0.0, | |
| "completion_length": 1077.1380355358124, | |
| "epoch": 0.6528, | |
| "grad_norm": 0.14199326993500677, | |
| "kl": 0.07318687438964844, | |
| "learning_rate": 9.7321789517943e-07, | |
| "loss": 0.0096, | |
| "num_tokens": 47470276.0, | |
| "reward": 0.06250000064028427, | |
| "reward_std": 0.0695626160595566, | |
| "rewards/accuracy_reward": 0.06250000064028427, | |
| "step": 816 | |
| }, | |
| { | |
| "epoch": 0.6656, | |
| "grad_norm": 0.0490038499435179, | |
| "learning_rate": 9.110127435484876e-07, | |
| "loss": 0.0047, | |
| "step": 832 | |
| }, | |
| { | |
| "epoch": 0.6656, | |
| "eval_advantages": 4.013780822860595e-10, | |
| "eval_clip_ratio": 0.0, | |
| "eval_coeff1": 0.0, | |
| "eval_coeff2": 0.0, | |
| "eval_completion_length": 1023.4006847034801, | |
| "eval_kl": 0.08331853693181818, | |
| "eval_loss": 0.011626713909208775, | |
| "eval_num_tokens": 49096390.0, | |
| "eval_reward": 0.03703703731298447, | |
| "eval_reward_std": 0.08911995657465675, | |
| "eval_rewards/accuracy_reward": 0.03703703731298447, | |
| "eval_runtime": 489.5183, | |
| "eval_samples_per_second": 0.065, | |
| "eval_steps_per_second": 0.004, | |
| "step": 832 | |
| }, | |
| { | |
| "advantages": -4.656612959813566e-10, | |
| "clip_ratio": 0.0, | |
| "coeff1": 0.0, | |
| "coeff2": 0.0, | |
| "completion_length": 1183.9275279045105, | |
| "epoch": 0.6784, | |
| "grad_norm": 0.16220979708988933, | |
| "kl": 0.0560002326965332, | |
| "learning_rate": 8.499832155642192e-07, | |
| "loss": 0.0045, | |
| "num_tokens": 50664137.0, | |
| "reward": 0.02821180599858053, | |
| "reward_std": 0.05475473066326231, | |
| "rewards/accuracy_reward": 0.02821180599858053, | |
| "step": 848 | |
| }, | |
| { | |
| "advantages": -2.0696057367874943e-10, | |
| "clip_ratio": 0.0, | |
| "coeff1": 0.0, | |
| "coeff2": 0.0, | |
| "completion_length": 1126.2864711284637, | |
| "epoch": 0.6912, | |
| "grad_norm": 0.18714923830540647, | |
| "kl": 0.057494163513183594, | |
| "learning_rate": 7.902511266911504e-07, | |
| "loss": 0.0091, | |
| "num_tokens": 52188707.0, | |
| "reward": 0.040798611706122756, | |
| "reward_std": 0.06322046043351293, | |
| "rewards/accuracy_reward": 0.040798611706122756, | |
| "step": 864 | |
| }, | |
| { | |
| "advantages": -5.174014446052144e-10, | |
| "clip_ratio": 0.0, | |
| "coeff1": 0.0, | |
| "coeff2": 0.0, | |
| "completion_length": 1009.0295271873474, | |
| "epoch": 0.704, | |
| "grad_norm": 0.053802748980809056, | |
| "kl": 0.0721750259399414, | |
| "learning_rate": 7.319357026941429e-07, | |
| "loss": 0.0064, | |
| "num_tokens": 53585793.0, | |
| "reward": 0.032986111647915095, | |
| "reward_std": 0.041123706148937345, | |
| "rewards/accuracy_reward": 0.032986111647915095, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 0.7168, | |
| "grad_norm": 0.043086794360717814, | |
| "learning_rate": 6.751533416627402e-07, | |
| "loss": 0.0041, | |
| "step": 896 | |
| }, | |
| { | |
| "epoch": 0.7168, | |
| "eval_advantages": -1.2543065071439358e-10, | |
| "eval_clip_ratio": 0.0, | |
| "eval_coeff1": 0.0, | |
| "eval_coeff2": 0.0, | |
| "eval_completion_length": 1056.4094349254262, | |
| "eval_kl": 0.047407670454545456, | |
| "eval_loss": -0.0009789273608475924, | |
| "eval_num_tokens": 55246005.0, | |
| "eval_reward": 0.05050505088134245, | |
| "eval_reward_std": 0.08993954008275812, | |
| "eval_rewards/accuracy_reward": 0.05050505088134245, | |
| "eval_runtime": 530.6591, | |
| "eval_samples_per_second": 0.06, | |
| "eval_steps_per_second": 0.004, | |
| "step": 896 | |
| }, | |
| { | |
| "advantages": -3.363109365647765e-10, | |
| "clip_ratio": 0.0, | |
| "coeff1": 0.0, | |
| "coeff2": 0.0, | |
| "completion_length": 1178.87544465065, | |
| "epoch": 0.7296, | |
| "grad_norm": 0.5075980892313485, | |
| "kl": 0.05362844467163086, | |
| "learning_rate": 6.200173816795782e-07, | |
| "loss": 0.0058, | |
| "num_tokens": 56780614.0, | |
| "reward": 0.04730902868323028, | |
| "reward_std": 0.06855815008748323, | |
| "rewards/accuracy_reward": 0.04730902868323028, | |
| "step": 912 | |
| }, | |
| { | |
| "advantages": -4.915313676912003e-10, | |
| "clip_ratio": 0.0, | |
| "coeff1": 0.0, | |
| "coeff2": 0.0, | |
| "completion_length": 1000.2708378434181, | |
| "epoch": 0.7424, | |
| "grad_norm": 0.2521305121709945, | |
| "kl": 0.05377674102783203, | |
| "learning_rate": 5.666378745965906e-07, | |
| "loss": 0.0039, | |
| "num_tokens": 58167790.0, | |
| "reward": 0.03819444519467652, | |
| "reward_std": 0.06435428117401898, | |
| "rewards/accuracy_reward": 0.03819444519467652, | |
| "step": 928 | |
| }, | |
| { | |
| "advantages": -9.830627388518476e-10, | |
| "clip_ratio": 0.0, | |
| "coeff1": 0.0, | |
| "coeff2": 0.0, | |
| "completion_length": 1074.833347082138, | |
| "epoch": 0.7552, | |
| "grad_norm": 0.2128395774905888, | |
| "kl": 0.05224609375, | |
| "learning_rate": 5.151213663705655e-07, | |
| "loss": 0.0052, | |
| "num_tokens": 59637865.0, | |
| "reward": 0.03906250081490725, | |
| "reward_std": 0.05284245638176799, | |
| "rewards/accuracy_reward": 0.03906250081490725, | |
| "step": 944 | |
| }, | |
| { | |
| "epoch": 0.768, | |
| "grad_norm": 0.34929628186787115, | |
| "learning_rate": 4.6557068439649533e-07, | |
| "loss": 0.0063, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 0.768, | |
| "eval_advantages": -3.0103356171454464e-10, | |
| "eval_clip_ratio": 0.0, | |
| "eval_coeff1": 0.0, | |
| "eval_coeff2": 0.0, | |
| "eval_completion_length": 962.1421064897017, | |
| "eval_kl": 0.06276633522727272, | |
| "eval_loss": 0.004617475904524326, | |
| "eval_num_tokens": 61063526.0, | |
| "eval_reward": 0.033670033920894966, | |
| "eval_reward_std": 0.06386743079532277, | |
| "eval_rewards/accuracy_reward": 0.033670033920894966, | |
| "eval_runtime": 378.3809, | |
| "eval_samples_per_second": 0.085, | |
| "eval_steps_per_second": 0.005, | |
| "step": 960 | |
| }, | |
| { | |
| "advantages": 6.467517753988572e-11, | |
| "clip_ratio": 0.0, | |
| "coeff1": 0.0, | |
| "coeff2": 0.0, | |
| "completion_length": 1014.7638971209526, | |
| "epoch": 0.7808, | |
| "grad_norm": 0.25605661324993567, | |
| "kl": 0.06560897827148438, | |
| "learning_rate": 4.1808473226320925e-07, | |
| "loss": 0.0059, | |
| "num_tokens": 62437923.0, | |
| "reward": 0.03211805599858053, | |
| "reward_std": 0.05740193568635732, | |
| "rewards/accuracy_reward": 0.03211805599858053, | |
| "step": 976 | |
| }, | |
| { | |
| "advantages": -6.726218696601061e-10, | |
| "clip_ratio": 0.0, | |
| "coeff1": 0.0, | |
| "coeff2": 0.0, | |
| "completion_length": 958.8003573417664, | |
| "epoch": 0.7936, | |
| "grad_norm": 0.12555370381277758, | |
| "kl": 0.06540393829345703, | |
| "learning_rate": 3.7275829234095597e-07, | |
| "loss": 0.0044, | |
| "num_tokens": 63773140.0, | |
| "reward": 0.034722223062999547, | |
| "reward_std": 0.05171300959773362, | |
| "rewards/accuracy_reward": 0.034722223062999547, | |
| "step": 992 | |
| }, | |
| { | |
| "advantages": -1.0089328036227974e-09, | |
| "clip_ratio": 0.0, | |
| "coeff1": 0.0, | |
| "coeff2": 0.0, | |
| "completion_length": 1007.450531244278, | |
| "epoch": 0.8064, | |
| "grad_norm": 0.2615829366145835, | |
| "kl": 0.06541252136230469, | |
| "learning_rate": 3.2968183659497496e-07, | |
| "loss": 0.0075, | |
| "num_tokens": 65169955.0, | |
| "reward": 0.05381944548571482, | |
| "reward_std": 0.07846590015105903, | |
| "rewards/accuracy_reward": 0.05381944548571482, | |
| "step": 1008 | |
| }, | |
| { | |
| "epoch": 0.8192, | |
| "grad_norm": 0.2355642427029959, | |
| "learning_rate": 2.889413460026724e-07, | |
| "loss": 0.0039, | |
| "step": 1024 | |
| }, | |
| { | |
| "epoch": 0.8192, | |
| "eval_advantages": -9.031006851436339e-10, | |
| "eval_clip_ratio": 0.0, | |
| "eval_coeff1": 0.0, | |
| "eval_coeff2": 0.0, | |
| "eval_completion_length": 1017.2666736949574, | |
| "eval_kl": 0.06349875710227272, | |
| "eval_loss": 0.008554265834391117, | |
| "eval_num_tokens": 66669954.0, | |
| "eval_reward": 0.04713804816657847, | |
| "eval_reward_std": 0.08911995792930777, | |
| "eval_rewards/accuracy_reward": 0.04713804681192745, | |
| "eval_runtime": 434.6785, | |
| "eval_samples_per_second": 0.074, | |
| "eval_steps_per_second": 0.005, | |
| "step": 1024 | |
| }, | |
| { | |
| "advantages": -5.562065521699799e-10, | |
| "clip_ratio": 0.0, | |
| "coeff1": 0.0, | |
| "coeff2": 0.0, | |
| "completion_length": 1069.8537440896034, | |
| "epoch": 0.832, | |
| "grad_norm": 0.3632287215575524, | |
| "kl": 0.06230878829956055, | |
| "learning_rate": 2.5061813893485086e-07, | |
| "loss": 0.0059, | |
| "num_tokens": 68107083.0, | |
| "reward": 0.03125000061118044, | |
| "reward_std": 0.04950311663560569, | |
| "rewards/accuracy_reward": 0.03125000061118044, | |
| "step": 1040 | |
| }, | |
| { | |
| "advantages": -7.761021564994808e-10, | |
| "clip_ratio": 0.0, | |
| "coeff1": 0.0, | |
| "coeff2": 0.0, | |
| "completion_length": 1005.6675432920456, | |
| "epoch": 0.8448, | |
| "grad_norm": 0.10428444418157863, | |
| "kl": 0.04985618591308594, | |
| "learning_rate": 2.1478870884353569e-07, | |
| "loss": 0.003, | |
| "num_tokens": 69492196.0, | |
| "reward": 0.036458334245253354, | |
| "reward_std": 0.04957010387443006, | |
| "rewards/accuracy_reward": 0.036458334245253354, | |
| "step": 1056 | |
| }, | |
| { | |
| "advantages": -3.8805107738237865e-10, | |
| "clip_ratio": 0.0, | |
| "coeff1": 0.0, | |
| "coeff2": 0.0, | |
| "completion_length": 1020.9513982534409, | |
| "epoch": 0.8576, | |
| "grad_norm": 0.1509133862122977, | |
| "kl": 0.05609416961669922, | |
| "learning_rate": 1.8152457158038754e-07, | |
| "loss": 0.0039, | |
| "num_tokens": 70900217.0, | |
| "reward": 0.04427083412883803, | |
| "reward_std": 0.06196141499094665, | |
| "rewards/accuracy_reward": 0.04427083412883803, | |
| "step": 1072 | |
| }, | |
| { | |
| "epoch": 0.8704, | |
| "grad_norm": 0.22985562058251116, | |
| "learning_rate": 1.508921226504434e-07, | |
| "loss": 0.007, | |
| "step": 1088 | |
| }, | |
| { | |
| "epoch": 0.8704, | |
| "eval_advantages": 3.512058220003021e-10, | |
| "eval_clip_ratio": 0.0, | |
| "eval_coeff1": 0.0, | |
| "eval_coeff2": 0.0, | |
| "eval_completion_length": 930.0915971235795, | |
| "eval_kl": 0.05968128551136364, | |
| "eval_loss": 0.001855027861893177, | |
| "eval_num_tokens": 72531411.0, | |
| "eval_reward": 0.060606061057610947, | |
| "eval_reward_std": 0.0933678461746736, | |
| "eval_rewards/accuracy_reward": 0.060606061057610947, | |
| "eval_runtime": 468.1576, | |
| "eval_samples_per_second": 0.068, | |
| "eval_steps_per_second": 0.004, | |
| "step": 1088 | |
| }, | |
| { | |
| "advantages": -4.78596330101555e-10, | |
| "clip_ratio": 0.0, | |
| "coeff1": 0.0, | |
| "coeff2": 0.0, | |
| "completion_length": 1140.6423711180687, | |
| "epoch": 0.8832, | |
| "grad_norm": 0.3042206749317757, | |
| "kl": 0.06157279014587402, | |
| "learning_rate": 1.229525046861178e-07, | |
| "loss": 0.0037, | |
| "num_tokens": 74003268.0, | |
| "reward": 0.04079861190984957, | |
| "reward_std": 0.058684688061475754, | |
| "rewards/accuracy_reward": 0.04079861190984957, | |
| "step": 1104 | |
| }, | |
| { | |
| "advantages": -2.069605754134729e-10, | |
| "clip_ratio": 0.0, | |
| "coeff1": 0.0, | |
| "coeff2": 0.0, | |
| "completion_length": 1093.5312616825104, | |
| "epoch": 0.896, | |
| "grad_norm": 0.18150421284527432, | |
| "kl": 0.04981040954589844, | |
| "learning_rate": 9.776148540597835e-08, | |
| "loss": 0.004, | |
| "num_tokens": 75487530.0, | |
| "reward": 0.029513889516238123, | |
| "reward_std": 0.04994966462254524, | |
| "rewards/accuracy_reward": 0.029513889516238123, | |
| "step": 1120 | |
| }, | |
| { | |
| "advantages": 1.5522042678961512e-10, | |
| "clip_ratio": 0.0, | |
| "coeff1": 0.0, | |
| "coeff2": 0.0, | |
| "completion_length": 1031.1415041685104, | |
| "epoch": 0.9088, | |
| "grad_norm": 0.10760453701580872, | |
| "kl": 0.048736572265625, | |
| "learning_rate": 7.536934630189984e-08, | |
| "loss": 0.003, | |
| "num_tokens": 76909621.0, | |
| "reward": 0.034722222422715276, | |
| "reward_std": 0.040032922057434916, | |
| "rewards/accuracy_reward": 0.034722222422715276, | |
| "step": 1136 | |
| }, | |
| { | |
| "epoch": 0.9216, | |
| "grad_norm": 0.034815976141797315, | |
| "learning_rate": 5.5820782276771844e-08, | |
| "loss": 0.0048, | |
| "step": 1152 | |
| }, | |
| { | |
| "epoch": 0.9216, | |
| "eval_advantages": 5.518948631433318e-10, | |
| "eval_clip_ratio": 0.0, | |
| "eval_coeff1": 0.0, | |
| "eval_coeff2": 0.0, | |
| "eval_completion_length": 973.9892245205966, | |
| "eval_kl": 0.047429865056818184, | |
| "eval_loss": 0.005054465960711241, | |
| "eval_num_tokens": 78362666.0, | |
| "eval_reward": 0.037037037990309975, | |
| "eval_reward_std": 0.06468701430342415, | |
| "eval_rewards/accuracy_reward": 0.03703703663565896, | |
| "eval_runtime": 518.4926, | |
| "eval_samples_per_second": 0.062, | |
| "eval_steps_per_second": 0.004, | |
| "step": 1152 | |
| }, | |
| { | |
| "advantages": 1.1641532095957308e-10, | |
| "clip_ratio": 0.0, | |
| "coeff1": 0.0, | |
| "coeff2": 0.0, | |
| "completion_length": 1048.4752708673477, | |
| "epoch": 0.9344, | |
| "grad_norm": 0.21242493691362754, | |
| "kl": 0.044564247131347656, | |
| "learning_rate": 3.915481243308722e-08, | |
| "loss": 0.0037, | |
| "num_tokens": 79790743.0, | |
| "reward": 0.03645833375048824, | |
| "reward_std": 0.058012127061374485, | |
| "rewards/accuracy_reward": 0.03645833375048824, | |
| "step": 1168 | |
| }, | |
| { | |
| "advantages": -6.984919483088436e-10, | |
| "clip_ratio": 0.0, | |
| "coeff1": 0.0, | |
| "coeff2": 0.0, | |
| "completion_length": 1015.795152425766, | |
| "epoch": 0.9472, | |
| "grad_norm": 0.2173139620490386, | |
| "kl": 0.05732440948486328, | |
| "learning_rate": 2.5404702190476857e-08, | |
| "loss": 0.0073, | |
| "num_tokens": 81192986.0, | |
| "reward": 0.05295138974906877, | |
| "reward_std": 0.07493921066634357, | |
| "rewards/accuracy_reward": 0.05295138974906877, | |
| "step": 1184 | |
| }, | |
| { | |
| "advantages": -4.1392115429639276e-10, | |
| "clip_ratio": 0.0, | |
| "coeff1": 0.0, | |
| "coeff2": 0.0, | |
| "completion_length": 1034.2934141159058, | |
| "epoch": 0.96, | |
| "grad_norm": 0.2699272899444651, | |
| "kl": 0.05102252960205078, | |
| "learning_rate": 1.4597896887644457e-08, | |
| "loss": 0.0068, | |
| "num_tokens": 82614892.0, | |
| "reward": 0.029513889516238123, | |
| "reward_std": 0.05481710028834641, | |
| "rewards/accuracy_reward": 0.029513889516238123, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.9728, | |
| "grad_norm": 0.6748474802026545, | |
| "learning_rate": 6.755967001234542e-09, | |
| "loss": 0.0056, | |
| "step": 1216 | |
| }, | |
| { | |
| "epoch": 0.9728, | |
| "eval_advantages": 9.031006851436339e-10, | |
| "eval_clip_ratio": 0.0, | |
| "eval_coeff1": 0.0, | |
| "eval_coeff2": 0.0, | |
| "eval_completion_length": 920.020224831321, | |
| "eval_kl": 0.052800958806818184, | |
| "eval_loss": 0.007503577042371035, | |
| "eval_num_tokens": 84106311.0, | |
| "eval_reward": 0.057239057665521446, | |
| "eval_reward_std": 0.08718085559931668, | |
| "eval_rewards/accuracy_reward": 0.057239057665521446, | |
| "eval_runtime": 350.3027, | |
| "eval_samples_per_second": 0.091, | |
| "eval_steps_per_second": 0.006, | |
| "step": 1216 | |
| }, | |
| { | |
| "advantages": -3.1044086572229457e-10, | |
| "clip_ratio": 0.0, | |
| "coeff1": 0.0, | |
| "coeff2": 0.0, | |
| "completion_length": 1054.8498351573944, | |
| "epoch": 0.9856, | |
| "grad_norm": 0.05988202916363173, | |
| "kl": 0.06485462188720703, | |
| "learning_rate": 1.8945650909737986e-09, | |
| "loss": 0.0047, | |
| "num_tokens": 85506813.0, | |
| "reward": 0.03211805602768436, | |
| "reward_std": 0.04946226696483791, | |
| "rewards/accuracy_reward": 0.03211805602768436, | |
| "step": 1232 | |
| }, | |
| { | |
| "advantages": -1.5522043719795597e-10, | |
| "clip_ratio": 0.0, | |
| "coeff1": 0.0, | |
| "coeff2": 0.0, | |
| "completion_length": 1088.2647668123245, | |
| "epoch": 0.9984, | |
| "grad_norm": 0.30744410756834145, | |
| "kl": 0.04868507385253906, | |
| "learning_rate": 2.3394557027600626e-11, | |
| "loss": 0.0027, | |
| "num_tokens": 86991596.0, | |
| "reward": 0.020833333604969084, | |
| "reward_std": 0.04309834982268512, | |
| "rewards/accuracy_reward": 0.020833333604969084, | |
| "step": 1248 | |
| }, | |
| { | |
| "advantages": 0.0, | |
| "clip_ratio": 0.0, | |
| "coeff1": 0.0, | |
| "coeff2": 0.0, | |
| "completion_length": 1299.5555686950684, | |
| "epoch": 1.0, | |
| "kl": 0.0355224609375, | |
| "num_tokens": 87209782.0, | |
| "reward": 0.013888888992369175, | |
| "reward_std": 0.0416666679084301, | |
| "rewards/accuracy_reward": 0.013888888992369175, | |
| "step": 1250, | |
| "total_flos": 0.0, | |
| "train_loss": 0.004173475314304232, | |
| "train_runtime": 206115.5028, | |
| "train_samples_per_second": 0.049, | |
| "train_steps_per_second": 0.006 | |
| } | |
| ], | |
| "logging_steps": 16, | |
| "max_steps": 1250, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 64, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 3, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |