llama_r1math_grpo_acconly / trainer_state.json
imdatta0's picture
Model save
386da46 verified
{
"best_global_step": 896,
"best_metric": -0.0009789273608475924,
"best_model_checkpoint": "/home/datta0/mnt/dattafs/train/llama_r1math_grpo_acconly/checkpoint-896",
"epoch": 1.0,
"eval_steps": 64,
"global_step": 1250,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"advantages": 0.0,
"clip_ratio": 0.0,
"coeff1": 0.0,
"coeff2": 0.0,
"completion_length": 1027.7083549499512,
"epoch": 0.0008,
"grad_norm": 0.01466101370737968,
"kl": 0.0004107952117919922,
"learning_rate": 2.4e-08,
"loss": 0.0,
"num_tokens": 87864.0,
"reward": 0.0,
"reward_std": 0.0,
"rewards/accuracy_reward": 0.0,
"step": 1
},
{
"advantages": -1.6556845894299955e-10,
"clip_ratio": 0.0,
"coeff1": 0.0,
"coeff2": 0.0,
"completion_length": 1264.6620464324951,
"epoch": 0.0128,
"grad_norm": 0.025934769703142593,
"kl": 0.00044070879618326824,
"learning_rate": 3.84e-07,
"loss": 0.0023,
"num_tokens": 1674685.0,
"reward": 0.012037037312984467,
"reward_std": 0.026614275574684144,
"rewards/accuracy_reward": 0.012037037312984467,
"step": 16
},
{
"advantages": -2.587007240373307e-10,
"clip_ratio": 0.0,
"coeff1": 0.0,
"coeff2": 0.0,
"completion_length": 1219.9435840845108,
"epoch": 0.0256,
"grad_norm": 0.04061673893266753,
"kl": 0.0007980614900588989,
"learning_rate": 7.68e-07,
"loss": -0.0021,
"num_tokens": 3304853.0,
"reward": 0.021701389166992158,
"reward_std": 0.03553581167943776,
"rewards/accuracy_reward": 0.021701389166992158,
"step": 32
},
{
"advantages": -1.810905054383527e-10,
"clip_ratio": 0.0,
"coeff1": 0.0,
"coeff2": 0.0,
"completion_length": 1160.891501903534,
"epoch": 0.0384,
"grad_norm": 0.04088811488095649,
"kl": 0.0014685392379760742,
"learning_rate": 1.1520000000000002e-06,
"loss": 0.0038,
"num_tokens": 4870287.0,
"reward": 0.01736111141508445,
"reward_std": 0.032931644935160875,
"rewards/accuracy_reward": 0.01736111141508445,
"step": 48
},
{
"epoch": 0.0512,
"grad_norm": 0.12319314674263099,
"learning_rate": 1.536e-06,
"loss": 0.0039,
"step": 64
},
{
"epoch": 0.0512,
"eval_advantages": -2.5086130142878716e-10,
"eval_clip_ratio": 0.0,
"eval_coeff1": 0.0,
"eval_coeff2": 0.0,
"eval_completion_length": 1093.752197265625,
"eval_kl": 0.005735917524857955,
"eval_loss": 0.0002934922813437879,
"eval_num_tokens": 6538255.0,
"eval_reward": 0.033670033920894966,
"eval_reward_std": 0.06973752108487216,
"eval_rewards/accuracy_reward": 0.033670033920894966,
"eval_runtime": 551.8022,
"eval_samples_per_second": 0.058,
"eval_steps_per_second": 0.004,
"step": 64
},
{
"advantages": -4.009861184414709e-10,
"clip_ratio": 0.0,
"coeff1": 0.0,
"coeff2": 0.0,
"completion_length": 1243.5607726573944,
"epoch": 0.064,
"grad_norm": 0.27812226841587845,
"kl": 0.008373796939849854,
"learning_rate": 1.9200000000000003e-06,
"loss": 0.0007,
"num_tokens": 8191544.0,
"reward": 0.034288194990949705,
"reward_std": 0.05555227259173989,
"rewards/accuracy_reward": 0.034288194990949705,
"step": 80
},
{
"advantages": -4.915313728953707e-10,
"clip_ratio": 0.0,
"coeff1": 0.0,
"coeff2": 0.0,
"completion_length": 1286.811640381813,
"epoch": 0.0768,
"grad_norm": 0.08377069137091007,
"kl": 0.011660099029541016,
"learning_rate": 2.3040000000000003e-06,
"loss": 0.0102,
"num_tokens": 9907465.0,
"reward": 0.03819444525288418,
"reward_std": 0.06641548709012568,
"rewards/accuracy_reward": 0.03819444525288418,
"step": 96
},
{
"advantages": -6.726218748642765e-10,
"clip_ratio": 0.0,
"coeff1": 0.0,
"coeff2": 0.0,
"completion_length": 1128.9600811004639,
"epoch": 0.0896,
"grad_norm": 0.16347202160725127,
"kl": 0.09761714935302734,
"learning_rate": 2.688e-06,
"loss": 0.0047,
"num_tokens": 11442612.0,
"reward": 0.04079861199716106,
"reward_std": 0.06649718736298382,
"rewards/accuracy_reward": 0.04079861199716106,
"step": 112
},
{
"epoch": 0.1024,
"grad_norm": 0.27678886955004334,
"learning_rate": 2.999947362417721e-06,
"loss": 0.0042,
"step": 128
},
{
"epoch": 0.1024,
"eval_advantages": -2.0068904114302975e-10,
"eval_clip_ratio": 0.0,
"eval_coeff1": 0.0,
"eval_coeff2": 0.0,
"eval_completion_length": 1419.1784390536222,
"eval_kl": 0.029097123579545456,
"eval_loss": 0.010437126271426678,
"eval_num_tokens": 12942930.0,
"eval_reward": 0.04713804748925296,
"eval_reward_std": 0.07804939692670648,
"eval_rewards/accuracy_reward": 0.04713804748925296,
"eval_runtime": 728.7234,
"eval_samples_per_second": 0.044,
"eval_steps_per_second": 0.003,
"step": 128
},
{
"advantages": -6.72621876599e-10,
"clip_ratio": 0.0,
"coeff1": 0.0,
"coeff2": 0.0,
"completion_length": 1180.0976678729057,
"epoch": 0.1152,
"grad_norm": 0.1291924427524718,
"kl": 0.03859281539916992,
"learning_rate": 2.997889131011168e-06,
"loss": -0.0005,
"num_tokens": 14623194.0,
"reward": 0.04861111196805723,
"reward_std": 0.09081879071891308,
"rewards/accuracy_reward": 0.04861111196805723,
"step": 144
},
{
"advantages": 2.328306471233166e-10,
"clip_ratio": 0.0,
"coeff1": 0.0,
"coeff2": 0.0,
"completion_length": 808.3142422437668,
"epoch": 0.128,
"grad_norm": 0.2312672301481172,
"kl": 0.05692291259765625,
"learning_rate": 2.9928410999727467e-06,
"loss": 0.0056,
"num_tokens": 15791756.0,
"reward": 0.030381944845430553,
"reward_std": 0.06712671066634357,
"rewards/accuracy_reward": 0.030381944845430553,
"step": 160
},
{
"advantages": -2.32830652327487e-10,
"clip_ratio": 0.0,
"coeff1": 0.0,
"coeff2": 0.0,
"completion_length": 1044.7361204624176,
"epoch": 0.1408,
"grad_norm": 0.2204521818711921,
"kl": 0.055908203125,
"learning_rate": 2.9848133452159737e-06,
"loss": 0.0019,
"num_tokens": 17229706.0,
"reward": 0.043402778333984315,
"reward_std": 0.056710043689236045,
"rewards/accuracy_reward": 0.043402778333984315,
"step": 176
},
{
"epoch": 0.1536,
"grad_norm": 0.2784129015892714,
"learning_rate": 2.97382189020862e-06,
"loss": 0.0035,
"step": 192
},
{
"epoch": 0.1536,
"eval_advantages": 4.264642124289382e-10,
"eval_clip_ratio": 0.0,
"eval_coeff1": 0.0,
"eval_coeff2": 0.0,
"eval_completion_length": 1347.6511951793325,
"eval_kl": 0.04640891335227273,
"eval_loss": 0.009176909923553467,
"eval_num_tokens": 18805959.0,
"eval_reward": 0.033670033920894966,
"eval_reward_std": 0.08733082156289708,
"eval_rewards/accuracy_reward": 0.033670033920894966,
"eval_runtime": 743.3844,
"eval_samples_per_second": 0.043,
"eval_steps_per_second": 0.003,
"step": 192
},
{
"advantages": -1.0348028857409819e-10,
"clip_ratio": 0.0,
"coeff1": 0.0,
"coeff2": 0.0,
"completion_length": 1291.665376842022,
"epoch": 0.1664,
"grad_norm": 0.3752515234781844,
"kl": 0.053093910217285156,
"learning_rate": 2.959888673989734e-06,
"loss": 0.0065,
"num_tokens": 20658358.0,
"reward": 0.023871528188465163,
"reward_std": 0.0477204411290586,
"rewards/accuracy_reward": 0.023871528188465163,
"step": 208
},
{
"advantages": -1.810905054383527e-10,
"clip_ratio": 0.0,
"coeff1": 0.0,
"coeff2": 0.0,
"completion_length": 1179.581609249115,
"epoch": 0.1792,
"grad_norm": 0.1424187976880637,
"kl": 0.05293464660644531,
"learning_rate": 2.943041507379129e-06,
"loss": 0.0057,
"num_tokens": 22251551.0,
"reward": 0.026041667151730508,
"reward_std": 0.04704763786867261,
"rewards/accuracy_reward": 0.026041667151730508,
"step": 224
},
{
"advantages": -1.810905054383527e-10,
"clip_ratio": 0.0,
"coeff1": 0.0,
"coeff2": 0.0,
"completion_length": 1221.903657913208,
"epoch": 0.192,
"grad_norm": 0.06793437710568745,
"kl": 0.0833587646484375,
"learning_rate": 2.9233140174667447e-06,
"loss": 0.0039,
"num_tokens": 23897243.0,
"reward": 0.02170138928340748,
"reward_std": 0.04070548270829022,
"rewards/accuracy_reward": 0.02170138928340748,
"step": 240
},
{
"epoch": 0.2048,
"grad_norm": 0.0687227576536205,
"learning_rate": 2.90074558049269e-06,
"loss": 0.0087,
"step": 256
},
{
"epoch": 0.2048,
"eval_advantages": -4.013780822860595e-10,
"eval_clip_ratio": 0.0,
"eval_coeff1": 0.0,
"eval_coeff2": 0.0,
"eval_completion_length": 1184.7589333274148,
"eval_kl": 0.05091441761363636,
"eval_loss": 0.019146718084812164,
"eval_num_tokens": 25328098.0,
"eval_reward": 0.09090909158641641,
"eval_reward_std": 0.12024257670749318,
"eval_rewards/accuracy_reward": 0.09090909158641641,
"eval_runtime": 605.2476,
"eval_samples_per_second": 0.053,
"eval_steps_per_second": 0.003,
"step": 256
},
{
"advantages": -3.3631094176894694e-10,
"clip_ratio": 0.0,
"coeff1": 0.0,
"coeff2": 0.0,
"completion_length": 1135.4952353835106,
"epoch": 0.2176,
"grad_norm": 0.13313117965541169,
"kl": 0.05780601501464844,
"learning_rate": 2.875381243251925e-06,
"loss": 0.0062,
"num_tokens": 26977725.0,
"reward": 0.03515625064028427,
"reward_std": 0.06366020266432315,
"rewards/accuracy_reward": 0.03515625064028427,
"step": 272
},
{
"advantages": -4.3979123294513034e-10,
"clip_ratio": 0.0,
"coeff1": 0.0,
"coeff2": 0.0,
"completion_length": 1205.5920230150223,
"epoch": 0.2304,
"grad_norm": 0.12756211705162357,
"kl": 0.06094169616699219,
"learning_rate": 2.8472716331804677e-06,
"loss": 0.0077,
"num_tokens": 28604140.0,
"reward": 0.03125000046566129,
"reward_std": 0.07459831284359097,
"rewards/accuracy_reward": 0.03125000046566129,
"step": 288
},
{
"advantages": -4.656613011855271e-10,
"clip_ratio": 0.0,
"coeff1": 0.0,
"coeff2": 0.0,
"completion_length": 992.4383769035339,
"epoch": 0.2432,
"grad_norm": 0.3503219113511989,
"kl": 0.06413078308105469,
"learning_rate": 2.8164728573026006e-06,
"loss": 0.0034,
"num_tokens": 29971997.0,
"reward": 0.03211805614409968,
"reward_std": 0.048306683311238885,
"rewards/accuracy_reward": 0.03211805614409968,
"step": 304
},
{
"epoch": 0.256,
"grad_norm": 0.0744612699055891,
"learning_rate": 2.78304639024076e-06,
"loss": 0.0067,
"step": 320
},
{
"epoch": 0.256,
"eval_advantages": 3.0103356171454464e-10,
"eval_clip_ratio": 0.0,
"eval_coeff1": 0.0,
"eval_coeff2": 0.0,
"eval_completion_length": 894.4821555397727,
"eval_kl": 0.055109197443181816,
"eval_loss": 0.008422672748565674,
"eval_num_tokens": 31693821.0,
"eval_reward": 0.047138048843903976,
"eval_reward_std": 0.10330192202871497,
"eval_rewards/accuracy_reward": 0.04713804613460194,
"eval_runtime": 320.0579,
"eval_samples_per_second": 0.1,
"eval_steps_per_second": 0.006,
"step": 320
},
{
"advantages": 7.761021339480756e-11,
"clip_ratio": 0.0,
"coeff1": 0.0,
"coeff2": 0.0,
"completion_length": 991.0763994455338,
"epoch": 0.2688,
"grad_norm": 0.06535007437884081,
"kl": 0.059384822845458984,
"learning_rate": 2.7470589515116627e-06,
"loss": 0.004,
"num_tokens": 1372957.0,
"reward": 0.046006944903638214,
"reward_std": 0.06322046089917421,
"rewards/accuracy_reward": 0.046006944903638214,
"step": 336
},
{
"advantages": 2.5870069975120202e-11,
"clip_ratio": 0.0,
"coeff1": 0.0,
"coeff2": 0.0,
"completion_length": 1039.1059153676033,
"epoch": 0.2816,
"grad_norm": 0.45362050511921775,
"kl": 0.08066940307617188,
"learning_rate": 2.7085823723535738e-06,
"loss": 0.0086,
"num_tokens": 2797554.0,
"reward": 0.0460069450782612,
"reward_std": 0.07455965038388968,
"rewards/accuracy_reward": 0.0460069450782612,
"step": 352
},
{
"advantages": -8.537123698942883e-10,
"clip_ratio": 0.0,
"coeff1": 0.0,
"coeff2": 0.0,
"completion_length": 1101.8316051959991,
"epoch": 0.2944,
"grad_norm": 0.11366606297326208,
"kl": 0.06418609619140625,
"learning_rate": 2.6676934523505355e-06,
"loss": 0.0083,
"num_tokens": 4299874.0,
"reward": 0.04079861211357638,
"reward_std": 0.06628588796593249,
"rewards/accuracy_reward": 0.04079861211357638,
"step": 368
},
{
"epoch": 0.3072,
"grad_norm": 0.10560527731603102,
"learning_rate": 2.6244738061397326e-06,
"loss": 0.0066,
"step": 384
},
{
"epoch": 0.3072,
"eval_advantages": 6.020671234290893e-10,
"eval_clip_ratio": 0.0,
"eval_coeff1": 0.0,
"eval_coeff2": 0.0,
"eval_completion_length": 949.9097789417614,
"eval_kl": 0.05579723011363636,
"eval_loss": 0.011235292069613934,
"eval_num_tokens": 5860471.0,
"eval_reward": 0.04040404070507397,
"eval_reward_std": 0.09238132766701958,
"eval_rewards/accuracy_reward": 0.04040404070507397,
"eval_runtime": 373.4119,
"eval_samples_per_second": 0.086,
"eval_steps_per_second": 0.005,
"step": 384
},
{
"advantages": -4.2685619362076155e-10,
"clip_ratio": 0.0,
"coeff1": 0.0,
"coeff2": 0.0,
"completion_length": 1075.3763124346733,
"epoch": 0.32,
"grad_norm": 0.14605916839741878,
"kl": 0.08471107482910156,
"learning_rate": 2.5790097005079765e-06,
"loss": 0.0022,
"num_tokens": 7238953.0,
"reward": 0.026041667180834338,
"reward_std": 0.045010624220594764,
"rewards/accuracy_reward": 0.026041667180834338,
"step": 400
},
{
"advantages": -1.8109050890779965e-10,
"clip_ratio": 0.0,
"coeff1": 0.0,
"coeff2": 0.0,
"completion_length": 1245.3810901641846,
"epoch": 0.3328,
"grad_norm": 0.1183172246791569,
"kl": 0.05459880828857422,
"learning_rate": 2.531391882202451e-06,
"loss": 0.0028,
"num_tokens": 8902421.0,
"reward": 0.0295138893998228,
"reward_std": 0.04784542182460427,
"rewards/accuracy_reward": 0.0295138893998228,
"step": 416
},
{
"advantages": -6.467517979502624e-10,
"clip_ratio": 0.0,
"coeff1": 0.0,
"coeff2": 0.0,
"completion_length": 1229.9618154764175,
"epoch": 0.3456,
"grad_norm": 0.3066664475065796,
"kl": 0.05420970916748047,
"learning_rate": 2.4817153967994223e-06,
"loss": 0.0029,
"num_tokens": 10560447.0,
"reward": 0.029513889574445784,
"reward_std": 0.0411667434964329,
"rewards/accuracy_reward": 0.029513889574445784,
"step": 432
},
{
"epoch": 0.3584,
"grad_norm": 0.1268279921380881,
"learning_rate": 2.430079398992449e-06,
"loss": 0.0055,
"step": 448
},
{
"epoch": 0.3584,
"eval_advantages": -1.153961986572421e-09,
"eval_clip_ratio": 0.0,
"eval_coeff1": 0.0,
"eval_coeff2": 0.0,
"eval_completion_length": 906.0922629616477,
"eval_kl": 0.054398970170454544,
"eval_loss": 0.009388250298798084,
"eval_num_tokens": 12040457.0,
"eval_reward": 0.04040404070507397,
"eval_reward_std": 0.07722981409593062,
"eval_rewards/accuracy_reward": 0.04040404070507397,
"eval_runtime": 424.1842,
"eval_samples_per_second": 0.075,
"eval_steps_per_second": 0.005,
"step": 448
},
{
"advantages": -4.91531372028009e-10,
"clip_ratio": 0.0,
"coeff1": 0.0,
"coeff2": 0.0,
"completion_length": 1094.4409800171852,
"epoch": 0.3712,
"grad_norm": 0.06191021180344248,
"kl": 0.060648441314697266,
"learning_rate": 2.376586954678758e-06,
"loss": 0.0065,
"num_tokens": 13547906.0,
"reward": 0.034288195049157366,
"reward_std": 0.061476447503082454,
"rewards/accuracy_reward": 0.034288195049157366,
"step": 464
},
{
"advantages": -7.761022553787189e-11,
"clip_ratio": 0.0,
"coeff1": 0.0,
"coeff2": 0.0,
"completion_length": 961.0772646665573,
"epoch": 0.384,
"grad_norm": 1.7554215023891546,
"kl": 0.08006000518798828,
"learning_rate": 2.3213448352388254e-06,
"loss": 0.0062,
"num_tokens": 14885431.0,
"reward": 0.03385416715173051,
"reward_std": 0.0651520665269345,
"rewards/accuracy_reward": 0.03385416715173051,
"step": 480
},
{
"advantages": -2.587007517929063e-11,
"clip_ratio": 0.0,
"coeff1": 0.0,
"coeff2": 0.0,
"completion_length": 1175.2361198663712,
"epoch": 0.3968,
"grad_norm": 0.08822820636304002,
"kl": 0.060199737548828125,
"learning_rate": 2.2644633044197768e-06,
"loss": 0.0032,
"num_tokens": 16476912.0,
"reward": 0.012152777926530689,
"reward_std": 0.02902539470233023,
"rewards/accuracy_reward": 0.012152777926530689,
"step": 496
},
{
"epoch": 0.4096,
"grad_norm": 0.25608207256166327,
"learning_rate": 2.2060558982479992e-06,
"loss": 0.0046,
"step": 512
},
{
"epoch": 0.4096,
"eval_advantages": -1.2543065071439358e-10,
"eval_clip_ratio": 0.0,
"eval_coeff1": 0.0,
"eval_coeff2": 0.0,
"eval_completion_length": 946.973066850142,
"eval_kl": 0.054021661931818184,
"eval_loss": 0.0011832525487989187,
"eval_num_tokens": 18159572.0,
"eval_reward": 0.04377104409716346,
"eval_reward_std": 0.08733082630417564,
"eval_rewards/accuracy_reward": 0.04377104409716346,
"eval_runtime": 393.0584,
"eval_samples_per_second": 0.081,
"eval_steps_per_second": 0.005,
"step": 512
},
{
"advantages": -2.5870072056788374e-10,
"clip_ratio": 0.0,
"coeff1": 0.0,
"coeff2": 0.0,
"completion_length": 1322.1076508760452,
"epoch": 0.4224,
"grad_norm": 5.019169577841116,
"kl": 0.06106758117675781,
"learning_rate": 2.1462391984102506e-06,
"loss": 0.0056,
"num_tokens": 19980464.0,
"reward": 0.03645833395421505,
"reward_std": 0.052383382339030504,
"rewards/accuracy_reward": 0.03645833395421505,
"step": 528
},
{
"advantages": -1.2935036028394187e-10,
"clip_ratio": 0.0,
"coeff1": 0.0,
"coeff2": 0.0,
"completion_length": 1045.0260478258133,
"epoch": 0.4352,
"grad_norm": 0.12391478097100359,
"kl": 0.05616188049316406,
"learning_rate": 2.0851325995556095e-06,
"loss": 0.0048,
"num_tokens": 21416561.0,
"reward": 0.026041667093522847,
"reward_std": 0.05095388810150325,
"rewards/accuracy_reward": 0.026041667093522847,
"step": 544
},
{
"advantages": -8.537123855067996e-10,
"clip_ratio": 0.0,
"coeff1": 0.0,
"coeff2": 0.0,
"completion_length": 1117.5321254730225,
"epoch": 0.448,
"grad_norm": 0.052893524659718465,
"kl": 0.05390167236328125,
"learning_rate": 2.022858070982723e-06,
"loss": 0.0052,
"num_tokens": 22935690.0,
"reward": 0.06076388992369175,
"reward_std": 0.07754337647929788,
"rewards/accuracy_reward": 0.06076388992369175,
"step": 560
},
{
"epoch": 0.4608,
"grad_norm": 0.33061073442663064,
"learning_rate": 1.9595399131880336e-06,
"loss": 0.0109,
"step": 576
},
{
"epoch": 0.4608,
"eval_advantages": 5.268087330004531e-10,
"eval_clip_ratio": 0.0,
"eval_coeff1": 0.0,
"eval_coeff2": 0.0,
"eval_completion_length": 1592.6956287730825,
"eval_kl": 6.670787464488637,
"eval_loss": 0.2936657965183258,
"eval_num_tokens": 24698629.0,
"eval_reward": 0.06397306377237494,
"eval_reward_std": 0.09890406985174525,
"eval_rewards/accuracy_reward": 0.06397306377237494,
"eval_runtime": 777.425,
"eval_samples_per_second": 0.041,
"eval_steps_per_second": 0.003,
"step": 576
},
{
"advantages": -2.4576568384560016e-10,
"clip_ratio": 0.0,
"coeff1": 0.0,
"coeff2": 0.0,
"completion_length": 1218.7899428009987,
"epoch": 0.4736,
"grad_norm": 0.1739154737585171,
"kl": 0.06676912307739258,
"learning_rate": 1.8953045097609192e-06,
"loss": 0.0054,
"num_tokens": 26199776.0,
"reward": 0.039930556318722665,
"reward_std": 0.067484509665519,
"rewards/accuracy_reward": 0.039930556318722665,
"step": 592
},
{
"advantages": -4.915313624870299e-10,
"clip_ratio": 0.0,
"coeff1": 0.0,
"coeff2": 0.0,
"completion_length": 972.5477449893951,
"epoch": 0.4864,
"grad_norm": 0.08699168333257727,
"kl": 0.12245941162109375,
"learning_rate": 1.8302800751209524e-06,
"loss": 0.0078,
"num_tokens": 27551361.0,
"reward": 0.037326389574445784,
"reward_std": 0.061457116389647126,
"rewards/accuracy_reward": 0.037326389574445784,
"step": 608
},
{
"advantages": -1.0348028683937471e-10,
"clip_ratio": 0.0,
"coeff1": 0.0,
"coeff2": 0.0,
"completion_length": 1149.3463646173477,
"epoch": 0.4992,
"grad_norm": 0.11664913948177272,
"kl": 0.05477714538574219,
"learning_rate": 1.7645963986008188e-06,
"loss": 0.0034,
"num_tokens": 29104197.0,
"reward": 0.0381944450782612,
"reward_std": 0.0675879716873169,
"rewards/accuracy_reward": 0.0381944450782612,
"step": 624
},
{
"epoch": 0.512,
"grad_norm": 0.05178259833648462,
"learning_rate": 1.698384585385684e-06,
"loss": 0.0025,
"step": 640
},
{
"epoch": 0.512,
"eval_advantages": 7.776700344292402e-10,
"eval_clip_ratio": 0.0,
"eval_coeff1": 0.0,
"eval_coeff2": 0.0,
"eval_completion_length": 852.7952991832386,
"eval_kl": 0.051957563920454544,
"eval_loss": 0.0046281577087938786,
"eval_num_tokens": 30476142.0,
"eval_reward": 0.04713804816657847,
"eval_reward_std": 0.08570862019603903,
"eval_rewards/accuracy_reward": 0.04713804681192745,
"eval_runtime": 339.9404,
"eval_samples_per_second": 0.094,
"eval_steps_per_second": 0.006,
"step": 640
},
{
"advantages": -1.1641532442902003e-10,
"clip_ratio": 0.0,
"coeff1": 0.0,
"coeff2": 0.0,
"completion_length": 1037.3619900345802,
"epoch": 0.5248,
"grad_norm": 0.1715480919627143,
"kl": 0.05947065353393555,
"learning_rate": 1.6317767948261151e-06,
"loss": 0.0068,
"num_tokens": 31962531.0,
"reward": 0.036892361589707434,
"reward_std": 0.056289632339030504,
"rewards/accuracy_reward": 0.036892361589707434,
"step": 656
},
{
"advantages": -3.469446951953614e-18,
"clip_ratio": 0.0,
"coeff1": 0.0,
"coeff2": 0.0,
"completion_length": 1162.2994906902313,
"epoch": 0.5376,
"grad_norm": 0.12907042606523647,
"kl": 0.058727264404296875,
"learning_rate": 1.5649059766468756e-06,
"loss": 0.0033,
"num_tokens": 33531576.0,
"reward": 0.023437500232830644,
"reward_std": 0.03835127782076597,
"rewards/accuracy_reward": 0.023437500232830644,
"step": 672
},
{
"advantages": -7.761021634383747e-10,
"clip_ratio": 0.0,
"coeff1": 0.0,
"coeff2": 0.0,
"completion_length": 1093.435774922371,
"epoch": 0.5504,
"grad_norm": 0.13341389554678587,
"kl": 0.07756614685058594,
"learning_rate": 1.4979056055781284e-06,
"loss": 0.0117,
"num_tokens": 35020210.0,
"reward": 0.05642361217178404,
"reward_std": 0.08594673778861761,
"rewards/accuracy_reward": 0.05642361217178404,
"step": 688
},
{
"epoch": 0.5632,
"grad_norm": 0.05130106262462248,
"learning_rate": 1.4309094149387215e-06,
"loss": 0.0065,
"step": 704
},
{
"epoch": 0.5632,
"eval_advantages": -8.529284248578764e-10,
"eval_clip_ratio": 0.0,
"eval_coeff1": 0.0,
"eval_coeff2": 0.0,
"eval_completion_length": 1165.2128184925425,
"eval_kl": 0.05832741477272727,
"eval_loss": 0.009196331724524498,
"eval_num_tokens": 36812916.0,
"eval_reward": 0.05387205495075746,
"eval_reward_std": 0.08570862019603903,
"eval_rewards/accuracy_reward": 0.053872053596106445,
"eval_runtime": 592.7192,
"eval_samples_per_second": 0.054,
"eval_steps_per_second": 0.003,
"step": 704
},
{
"advantages": -6.079466955896673e-10,
"clip_ratio": 0.0,
"coeff1": 0.0,
"coeff2": 0.0,
"completion_length": 1193.20140260458,
"epoch": 0.576,
"grad_norm": 0.07760359685028215,
"kl": 0.0630340576171875,
"learning_rate": 1.36405112970333e-06,
"loss": 0.0075,
"num_tokens": 38224305.0,
"reward": 0.04036458412883803,
"reward_std": 0.06313632964156568,
"rewards/accuracy_reward": 0.04036458412883803,
"step": 720
},
{
"advantages": -2.5870080383461058e-11,
"clip_ratio": 0.0,
"coeff1": 0.0,
"coeff2": 0.0,
"completion_length": 1223.709213733673,
"epoch": 0.5888,
"grad_norm": 0.15725359939619768,
"kl": 0.07776260375976562,
"learning_rate": 1.2974641995862513e-06,
"loss": 0.006,
"num_tokens": 39864679.0,
"reward": 0.02430555591126904,
"reward_std": 0.0411667434964329,
"rewards/accuracy_reward": 0.02430555591126904,
"step": 736
},
{
"advantages": -3.6218100740725845e-10,
"clip_ratio": 0.0,
"coeff1": 0.0,
"coeff2": 0.0,
"completion_length": 1188.0920258760452,
"epoch": 0.6016,
"grad_norm": 0.2281893872440948,
"kl": 0.05897712707519531,
"learning_rate": 1.2312815326746265e-06,
"loss": 0.0053,
"num_tokens": 41460323.0,
"reward": 0.026041667151730508,
"reward_std": 0.03751045558601618,
"rewards/accuracy_reward": 0.026041667151730508,
"step": 752
},
{
"epoch": 0.6144,
"grad_norm": 0.09938089965885895,
"learning_rate": 1.1656352301427494e-06,
"loss": 0.0071,
"step": 768
},
{
"epoch": 0.6144,
"eval_advantages": -1.8312875408018929e-09,
"eval_clip_ratio": 0.0,
"eval_coeff1": 0.0,
"eval_coeff2": 0.0,
"eval_completion_length": 1155.5676713423295,
"eval_kl": 0.06094637784090909,
"eval_loss": 0.01705244928598404,
"eval_num_tokens": 43104138.0,
"eval_reward": 0.060606061734936455,
"eval_reward_std": 0.11014155772599307,
"eval_rewards/accuracy_reward": 0.06060606038028544,
"eval_runtime": 560.21,
"eval_samples_per_second": 0.057,
"eval_steps_per_second": 0.004,
"step": 768
},
{
"advantages": -5.950116606021072e-10,
"clip_ratio": 0.0,
"coeff1": 0.0,
"coeff2": 0.0,
"completion_length": 1141.80166092515,
"epoch": 0.6272,
"grad_norm": 0.09820866890356648,
"kl": 0.05560588836669922,
"learning_rate": 1.1006563225769834e-06,
"loss": 0.0077,
"num_tokens": 44553976.0,
"reward": 0.05251736202626489,
"reward_std": 0.07094883010722697,
"rewards/accuracy_reward": 0.05251736202626489,
"step": 784
},
{
"advantages": -3.3631094610575563e-10,
"clip_ratio": 0.0,
"coeff1": 0.0,
"coeff2": 0.0,
"completion_length": 1058.8411583900452,
"epoch": 0.64,
"grad_norm": 2.8454182835451376,
"kl": 0.0807657241821289,
"learning_rate": 1.036474508437579e-06,
"loss": 0.01,
"num_tokens": 45998554.0,
"reward": 0.0616319453692995,
"reward_std": 0.08384686964564025,
"rewards/accuracy_reward": 0.0616319453692995,
"step": 800
},
{
"advantages": -3.1044086572229457e-10,
"clip_ratio": 0.0,
"coeff1": 0.0,
"coeff2": 0.0,
"completion_length": 1077.1380355358124,
"epoch": 0.6528,
"grad_norm": 0.14199326993500677,
"kl": 0.07318687438964844,
"learning_rate": 9.7321789517943e-07,
"loss": 0.0096,
"num_tokens": 47470276.0,
"reward": 0.06250000064028427,
"reward_std": 0.0695626160595566,
"rewards/accuracy_reward": 0.06250000064028427,
"step": 816
},
{
"epoch": 0.6656,
"grad_norm": 0.0490038499435179,
"learning_rate": 9.110127435484876e-07,
"loss": 0.0047,
"step": 832
},
{
"epoch": 0.6656,
"eval_advantages": 4.013780822860595e-10,
"eval_clip_ratio": 0.0,
"eval_coeff1": 0.0,
"eval_coeff2": 0.0,
"eval_completion_length": 1023.4006847034801,
"eval_kl": 0.08331853693181818,
"eval_loss": 0.011626713909208775,
"eval_num_tokens": 49096390.0,
"eval_reward": 0.03703703731298447,
"eval_reward_std": 0.08911995657465675,
"eval_rewards/accuracy_reward": 0.03703703731298447,
"eval_runtime": 489.5183,
"eval_samples_per_second": 0.065,
"eval_steps_per_second": 0.004,
"step": 832
},
{
"advantages": -4.656612959813566e-10,
"clip_ratio": 0.0,
"coeff1": 0.0,
"coeff2": 0.0,
"completion_length": 1183.9275279045105,
"epoch": 0.6784,
"grad_norm": 0.16220979708988933,
"kl": 0.0560002326965332,
"learning_rate": 8.499832155642192e-07,
"loss": 0.0045,
"num_tokens": 50664137.0,
"reward": 0.02821180599858053,
"reward_std": 0.05475473066326231,
"rewards/accuracy_reward": 0.02821180599858053,
"step": 848
},
{
"advantages": -2.0696057367874943e-10,
"clip_ratio": 0.0,
"coeff1": 0.0,
"coeff2": 0.0,
"completion_length": 1126.2864711284637,
"epoch": 0.6912,
"grad_norm": 0.18714923830540647,
"kl": 0.057494163513183594,
"learning_rate": 7.902511266911504e-07,
"loss": 0.0091,
"num_tokens": 52188707.0,
"reward": 0.040798611706122756,
"reward_std": 0.06322046043351293,
"rewards/accuracy_reward": 0.040798611706122756,
"step": 864
},
{
"advantages": -5.174014446052144e-10,
"clip_ratio": 0.0,
"coeff1": 0.0,
"coeff2": 0.0,
"completion_length": 1009.0295271873474,
"epoch": 0.704,
"grad_norm": 0.053802748980809056,
"kl": 0.0721750259399414,
"learning_rate": 7.319357026941429e-07,
"loss": 0.0064,
"num_tokens": 53585793.0,
"reward": 0.032986111647915095,
"reward_std": 0.041123706148937345,
"rewards/accuracy_reward": 0.032986111647915095,
"step": 880
},
{
"epoch": 0.7168,
"grad_norm": 0.043086794360717814,
"learning_rate": 6.751533416627402e-07,
"loss": 0.0041,
"step": 896
},
{
"epoch": 0.7168,
"eval_advantages": -1.2543065071439358e-10,
"eval_clip_ratio": 0.0,
"eval_coeff1": 0.0,
"eval_coeff2": 0.0,
"eval_completion_length": 1056.4094349254262,
"eval_kl": 0.047407670454545456,
"eval_loss": -0.0009789273608475924,
"eval_num_tokens": 55246005.0,
"eval_reward": 0.05050505088134245,
"eval_reward_std": 0.08993954008275812,
"eval_rewards/accuracy_reward": 0.05050505088134245,
"eval_runtime": 530.6591,
"eval_samples_per_second": 0.06,
"eval_steps_per_second": 0.004,
"step": 896
},
{
"advantages": -3.363109365647765e-10,
"clip_ratio": 0.0,
"coeff1": 0.0,
"coeff2": 0.0,
"completion_length": 1178.87544465065,
"epoch": 0.7296,
"grad_norm": 0.5075980892313485,
"kl": 0.05362844467163086,
"learning_rate": 6.200173816795782e-07,
"loss": 0.0058,
"num_tokens": 56780614.0,
"reward": 0.04730902868323028,
"reward_std": 0.06855815008748323,
"rewards/accuracy_reward": 0.04730902868323028,
"step": 912
},
{
"advantages": -4.915313676912003e-10,
"clip_ratio": 0.0,
"coeff1": 0.0,
"coeff2": 0.0,
"completion_length": 1000.2708378434181,
"epoch": 0.7424,
"grad_norm": 0.2521305121709945,
"kl": 0.05377674102783203,
"learning_rate": 5.666378745965906e-07,
"loss": 0.0039,
"num_tokens": 58167790.0,
"reward": 0.03819444519467652,
"reward_std": 0.06435428117401898,
"rewards/accuracy_reward": 0.03819444519467652,
"step": 928
},
{
"advantages": -9.830627388518476e-10,
"clip_ratio": 0.0,
"coeff1": 0.0,
"coeff2": 0.0,
"completion_length": 1074.833347082138,
"epoch": 0.7552,
"grad_norm": 0.2128395774905888,
"kl": 0.05224609375,
"learning_rate": 5.151213663705655e-07,
"loss": 0.0052,
"num_tokens": 59637865.0,
"reward": 0.03906250081490725,
"reward_std": 0.05284245638176799,
"rewards/accuracy_reward": 0.03906250081490725,
"step": 944
},
{
"epoch": 0.768,
"grad_norm": 0.34929628186787115,
"learning_rate": 4.6557068439649533e-07,
"loss": 0.0063,
"step": 960
},
{
"epoch": 0.768,
"eval_advantages": -3.0103356171454464e-10,
"eval_clip_ratio": 0.0,
"eval_coeff1": 0.0,
"eval_coeff2": 0.0,
"eval_completion_length": 962.1421064897017,
"eval_kl": 0.06276633522727272,
"eval_loss": 0.004617475904524326,
"eval_num_tokens": 61063526.0,
"eval_reward": 0.033670033920894966,
"eval_reward_std": 0.06386743079532277,
"eval_rewards/accuracy_reward": 0.033670033920894966,
"eval_runtime": 378.3809,
"eval_samples_per_second": 0.085,
"eval_steps_per_second": 0.005,
"step": 960
},
{
"advantages": 6.467517753988572e-11,
"clip_ratio": 0.0,
"coeff1": 0.0,
"coeff2": 0.0,
"completion_length": 1014.7638971209526,
"epoch": 0.7808,
"grad_norm": 0.25605661324993567,
"kl": 0.06560897827148438,
"learning_rate": 4.1808473226320925e-07,
"loss": 0.0059,
"num_tokens": 62437923.0,
"reward": 0.03211805599858053,
"reward_std": 0.05740193568635732,
"rewards/accuracy_reward": 0.03211805599858053,
"step": 976
},
{
"advantages": -6.726218696601061e-10,
"clip_ratio": 0.0,
"coeff1": 0.0,
"coeff2": 0.0,
"completion_length": 958.8003573417664,
"epoch": 0.7936,
"grad_norm": 0.12555370381277758,
"kl": 0.06540393829345703,
"learning_rate": 3.7275829234095597e-07,
"loss": 0.0044,
"num_tokens": 63773140.0,
"reward": 0.034722223062999547,
"reward_std": 0.05171300959773362,
"rewards/accuracy_reward": 0.034722223062999547,
"step": 992
},
{
"advantages": -1.0089328036227974e-09,
"clip_ratio": 0.0,
"coeff1": 0.0,
"coeff2": 0.0,
"completion_length": 1007.450531244278,
"epoch": 0.8064,
"grad_norm": 0.2615829366145835,
"kl": 0.06541252136230469,
"learning_rate": 3.2968183659497496e-07,
"loss": 0.0075,
"num_tokens": 65169955.0,
"reward": 0.05381944548571482,
"reward_std": 0.07846590015105903,
"rewards/accuracy_reward": 0.05381944548571482,
"step": 1008
},
{
"epoch": 0.8192,
"grad_norm": 0.2355642427029959,
"learning_rate": 2.889413460026724e-07,
"loss": 0.0039,
"step": 1024
},
{
"epoch": 0.8192,
"eval_advantages": -9.031006851436339e-10,
"eval_clip_ratio": 0.0,
"eval_coeff1": 0.0,
"eval_coeff2": 0.0,
"eval_completion_length": 1017.2666736949574,
"eval_kl": 0.06349875710227272,
"eval_loss": 0.008554265834391117,
"eval_num_tokens": 66669954.0,
"eval_reward": 0.04713804816657847,
"eval_reward_std": 0.08911995792930777,
"eval_rewards/accuracy_reward": 0.04713804681192745,
"eval_runtime": 434.6785,
"eval_samples_per_second": 0.074,
"eval_steps_per_second": 0.005,
"step": 1024
},
{
"advantages": -5.562065521699799e-10,
"clip_ratio": 0.0,
"coeff1": 0.0,
"coeff2": 0.0,
"completion_length": 1069.8537440896034,
"epoch": 0.832,
"grad_norm": 0.3632287215575524,
"kl": 0.06230878829956055,
"learning_rate": 2.5061813893485086e-07,
"loss": 0.0059,
"num_tokens": 68107083.0,
"reward": 0.03125000061118044,
"reward_std": 0.04950311663560569,
"rewards/accuracy_reward": 0.03125000061118044,
"step": 1040
},
{
"advantages": -7.761021564994808e-10,
"clip_ratio": 0.0,
"coeff1": 0.0,
"coeff2": 0.0,
"completion_length": 1005.6675432920456,
"epoch": 0.8448,
"grad_norm": 0.10428444418157863,
"kl": 0.04985618591308594,
"learning_rate": 2.1478870884353569e-07,
"loss": 0.003,
"num_tokens": 69492196.0,
"reward": 0.036458334245253354,
"reward_std": 0.04957010387443006,
"rewards/accuracy_reward": 0.036458334245253354,
"step": 1056
},
{
"advantages": -3.8805107738237865e-10,
"clip_ratio": 0.0,
"coeff1": 0.0,
"coeff2": 0.0,
"completion_length": 1020.9513982534409,
"epoch": 0.8576,
"grad_norm": 0.1509133862122977,
"kl": 0.05609416961669922,
"learning_rate": 1.8152457158038754e-07,
"loss": 0.0039,
"num_tokens": 70900217.0,
"reward": 0.04427083412883803,
"reward_std": 0.06196141499094665,
"rewards/accuracy_reward": 0.04427083412883803,
"step": 1072
},
{
"epoch": 0.8704,
"grad_norm": 0.22985562058251116,
"learning_rate": 1.508921226504434e-07,
"loss": 0.007,
"step": 1088
},
{
"epoch": 0.8704,
"eval_advantages": 3.512058220003021e-10,
"eval_clip_ratio": 0.0,
"eval_coeff1": 0.0,
"eval_coeff2": 0.0,
"eval_completion_length": 930.0915971235795,
"eval_kl": 0.05968128551136364,
"eval_loss": 0.001855027861893177,
"eval_num_tokens": 72531411.0,
"eval_reward": 0.060606061057610947,
"eval_reward_std": 0.0933678461746736,
"eval_rewards/accuracy_reward": 0.060606061057610947,
"eval_runtime": 468.1576,
"eval_samples_per_second": 0.068,
"eval_steps_per_second": 0.004,
"step": 1088
},
{
"advantages": -4.78596330101555e-10,
"clip_ratio": 0.0,
"coeff1": 0.0,
"coeff2": 0.0,
"completion_length": 1140.6423711180687,
"epoch": 0.8832,
"grad_norm": 0.3042206749317757,
"kl": 0.06157279014587402,
"learning_rate": 1.229525046861178e-07,
"loss": 0.0037,
"num_tokens": 74003268.0,
"reward": 0.04079861190984957,
"reward_std": 0.058684688061475754,
"rewards/accuracy_reward": 0.04079861190984957,
"step": 1104
},
{
"advantages": -2.069605754134729e-10,
"clip_ratio": 0.0,
"coeff1": 0.0,
"coeff2": 0.0,
"completion_length": 1093.5312616825104,
"epoch": 0.896,
"grad_norm": 0.18150421284527432,
"kl": 0.04981040954589844,
"learning_rate": 9.776148540597835e-08,
"loss": 0.004,
"num_tokens": 75487530.0,
"reward": 0.029513889516238123,
"reward_std": 0.04994966462254524,
"rewards/accuracy_reward": 0.029513889516238123,
"step": 1120
},
{
"advantages": 1.5522042678961512e-10,
"clip_ratio": 0.0,
"coeff1": 0.0,
"coeff2": 0.0,
"completion_length": 1031.1415041685104,
"epoch": 0.9088,
"grad_norm": 0.10760453701580872,
"kl": 0.048736572265625,
"learning_rate": 7.536934630189984e-08,
"loss": 0.003,
"num_tokens": 76909621.0,
"reward": 0.034722222422715276,
"reward_std": 0.040032922057434916,
"rewards/accuracy_reward": 0.034722222422715276,
"step": 1136
},
{
"epoch": 0.9216,
"grad_norm": 0.034815976141797315,
"learning_rate": 5.5820782276771844e-08,
"loss": 0.0048,
"step": 1152
},
{
"epoch": 0.9216,
"eval_advantages": 5.518948631433318e-10,
"eval_clip_ratio": 0.0,
"eval_coeff1": 0.0,
"eval_coeff2": 0.0,
"eval_completion_length": 973.9892245205966,
"eval_kl": 0.047429865056818184,
"eval_loss": 0.005054465960711241,
"eval_num_tokens": 78362666.0,
"eval_reward": 0.037037037990309975,
"eval_reward_std": 0.06468701430342415,
"eval_rewards/accuracy_reward": 0.03703703663565896,
"eval_runtime": 518.4926,
"eval_samples_per_second": 0.062,
"eval_steps_per_second": 0.004,
"step": 1152
},
{
"advantages": 1.1641532095957308e-10,
"clip_ratio": 0.0,
"coeff1": 0.0,
"coeff2": 0.0,
"completion_length": 1048.4752708673477,
"epoch": 0.9344,
"grad_norm": 0.21242493691362754,
"kl": 0.044564247131347656,
"learning_rate": 3.915481243308722e-08,
"loss": 0.0037,
"num_tokens": 79790743.0,
"reward": 0.03645833375048824,
"reward_std": 0.058012127061374485,
"rewards/accuracy_reward": 0.03645833375048824,
"step": 1168
},
{
"advantages": -6.984919483088436e-10,
"clip_ratio": 0.0,
"coeff1": 0.0,
"coeff2": 0.0,
"completion_length": 1015.795152425766,
"epoch": 0.9472,
"grad_norm": 0.2173139620490386,
"kl": 0.05732440948486328,
"learning_rate": 2.5404702190476857e-08,
"loss": 0.0073,
"num_tokens": 81192986.0,
"reward": 0.05295138974906877,
"reward_std": 0.07493921066634357,
"rewards/accuracy_reward": 0.05295138974906877,
"step": 1184
},
{
"advantages": -4.1392115429639276e-10,
"clip_ratio": 0.0,
"coeff1": 0.0,
"coeff2": 0.0,
"completion_length": 1034.2934141159058,
"epoch": 0.96,
"grad_norm": 0.2699272899444651,
"kl": 0.05102252960205078,
"learning_rate": 1.4597896887644457e-08,
"loss": 0.0068,
"num_tokens": 82614892.0,
"reward": 0.029513889516238123,
"reward_std": 0.05481710028834641,
"rewards/accuracy_reward": 0.029513889516238123,
"step": 1200
},
{
"epoch": 0.9728,
"grad_norm": 0.6748474802026545,
"learning_rate": 6.755967001234542e-09,
"loss": 0.0056,
"step": 1216
},
{
"epoch": 0.9728,
"eval_advantages": 9.031006851436339e-10,
"eval_clip_ratio": 0.0,
"eval_coeff1": 0.0,
"eval_coeff2": 0.0,
"eval_completion_length": 920.020224831321,
"eval_kl": 0.052800958806818184,
"eval_loss": 0.007503577042371035,
"eval_num_tokens": 84106311.0,
"eval_reward": 0.057239057665521446,
"eval_reward_std": 0.08718085559931668,
"eval_rewards/accuracy_reward": 0.057239057665521446,
"eval_runtime": 350.3027,
"eval_samples_per_second": 0.091,
"eval_steps_per_second": 0.006,
"step": 1216
},
{
"advantages": -3.1044086572229457e-10,
"clip_ratio": 0.0,
"coeff1": 0.0,
"coeff2": 0.0,
"completion_length": 1054.8498351573944,
"epoch": 0.9856,
"grad_norm": 0.05988202916363173,
"kl": 0.06485462188720703,
"learning_rate": 1.8945650909737986e-09,
"loss": 0.0047,
"num_tokens": 85506813.0,
"reward": 0.03211805602768436,
"reward_std": 0.04946226696483791,
"rewards/accuracy_reward": 0.03211805602768436,
"step": 1232
},
{
"advantages": -1.5522043719795597e-10,
"clip_ratio": 0.0,
"coeff1": 0.0,
"coeff2": 0.0,
"completion_length": 1088.2647668123245,
"epoch": 0.9984,
"grad_norm": 0.30744410756834145,
"kl": 0.04868507385253906,
"learning_rate": 2.3394557027600626e-11,
"loss": 0.0027,
"num_tokens": 86991596.0,
"reward": 0.020833333604969084,
"reward_std": 0.04309834982268512,
"rewards/accuracy_reward": 0.020833333604969084,
"step": 1248
},
{
"advantages": 0.0,
"clip_ratio": 0.0,
"coeff1": 0.0,
"coeff2": 0.0,
"completion_length": 1299.5555686950684,
"epoch": 1.0,
"kl": 0.0355224609375,
"num_tokens": 87209782.0,
"reward": 0.013888888992369175,
"reward_std": 0.0416666679084301,
"rewards/accuracy_reward": 0.013888888992369175,
"step": 1250,
"total_flos": 0.0,
"train_loss": 0.004173475314304232,
"train_runtime": 206115.5028,
"train_samples_per_second": 0.049,
"train_steps_per_second": 0.006
}
],
"logging_steps": 16,
"max_steps": 1250,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 64,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 3,
"trial_name": null,
"trial_params": null
}